Added Unicode support to the dot metacharacter - it now matches _any_ Unicode character (almost)

This commit is contained in:
2024-11-18 16:44:43 -05:00
parent 8a1f1dc621
commit c56d81a335
4 changed files with 35 additions and 5 deletions

13
main.go
View File

@@ -141,7 +141,7 @@ func shuntingYard(re string) []postfixNode {
}
if c == '.' { // Dot metacharacter - represents 'any' character, but I am only adding Unicode 0020-007E
outQueue = append(outQueue, newPostfixNode(dotChars()...))
outQueue = append(outQueue, newPostfixDotNode())
continue
}
if c == '^' { // Start-of-string assertion
@@ -282,6 +282,9 @@ func thompson(re []postfixNode) *State {
if c.nodetype == CHARACTER || c.nodetype == ASSERTION {
state := State{}
state.transitions = make(map[int][]*State)
if c.isDot {
state.isDot = true
}
state.content = rune2Contents(c.contents)
state.output = make([]*State, 0)
state.output = append(state.output, &state)
@@ -397,6 +400,7 @@ func main() {
var re string
re = flag.Args()[0]
var test_str string
var test_runes []rune // Rune-slice representation of test_str
var err error
// Create reader for stdin and writer for stdout // End index is one more than last index of match
reader := bufio.NewReader(os.Stdin)
@@ -406,7 +410,8 @@ func main() {
startState := thompson(re_postfix)
// Read every string from stdin until we encounter an error. If the error isn't EOF, panic.
for test_str, err = reader.ReadString('\n'); err == nil; test_str, err = reader.ReadString('\n') {
matchIndices := findAllMatches(startState, []rune(test_str))
test_runes = []rune(test_str)
matchIndices := findAllMatches(startState, []rune(test_runes))
// Decompose the array of matchIndex structs into a flat unique array of ints - if matchIndex is {4,7}, flat array will contain 4,5,6
// This should make checking O(1) instead of O(n)
indicesToPrint := new_uniq_arr[int]()
@@ -422,7 +427,7 @@ func main() {
// Find all numbers from 0 to len(test_runes) that are NOT in oldIndices.
// These are the values we want to print, now that we have inverted the match.
// Re-initialize indicesToPrint and add all of these values to it.
indicesToPrint.add(setDifference(genRange(0, len(test_str)), oldIndices)...)
indicesToPrint.add(setDifference(genRange(0, len(test_runes)), oldIndices)...)
}
// If lineFlag is enabled, we should only print something if:
@@ -435,7 +440,7 @@ func main() {
continue
}
}
for i, c := range test_str {
for i, c := range test_runes {
if indicesToPrint.contains(i) {
color.New(color.FgRed).Fprintf(out, "%c", c)
// Newline after every match - only if -o is enabled and -v is disabled.