diff --git a/main.go b/main.go index ea47319..f9538ab 100644 --- a/main.go +++ b/main.go @@ -141,7 +141,7 @@ func shuntingYard(re string) []postfixNode { } if c == '.' { // Dot metacharacter - represents 'any' character, but I am only adding Unicode 0020-007E - outQueue = append(outQueue, newPostfixNode(dotChars()...)) + outQueue = append(outQueue, newPostfixDotNode()) continue } if c == '^' { // Start-of-string assertion @@ -282,6 +282,9 @@ func thompson(re []postfixNode) *State { if c.nodetype == CHARACTER || c.nodetype == ASSERTION { state := State{} state.transitions = make(map[int][]*State) + if c.isDot { + state.isDot = true + } state.content = rune2Contents(c.contents) state.output = make([]*State, 0) state.output = append(state.output, &state) @@ -397,6 +400,7 @@ func main() { var re string re = flag.Args()[0] var test_str string + var test_runes []rune // Rune-slice representation of test_str var err error // Create reader for stdin and writer for stdout // End index is one more than last index of match reader := bufio.NewReader(os.Stdin) @@ -406,7 +410,8 @@ func main() { startState := thompson(re_postfix) // Read every string from stdin until we encounter an error. If the error isn't EOF, panic.' for test_str, err = reader.ReadString('\n'); err == nil; test_str, err = reader.ReadString('\n') { - matchIndices := findAllMatches(startState, []rune(test_str)) + test_runes = []rune(test_str) + matchIndices := findAllMatches(startState, []rune(test_runes)) // Decompose the array of matchIndex structs into a flat unique array of ints - if matchIndex is {4,7}, flat array will contain 4,5,6 // This should make checking O(1) instead of O(n) indicesToPrint := new_uniq_arr[int]() @@ -422,7 +427,7 @@ func main() { // Find all numbers from 0 to len(test_str) that are NOT in oldIndices. // These are the values we want to print, now that we have inverted the match. // Re-initialize indicesToPrint and add all of these values to it. - indicesToPrint.add(setDifference(genRange(0, len(test_str)), oldIndices)...) + indicesToPrint.add(setDifference(genRange(0, len(test_runes)), oldIndices)...) } // If lineFlag is enabled, we should only print something if: @@ -435,7 +440,7 @@ func main() { continue } } - for i, c := range test_str { + for i, c := range test_runes { if indicesToPrint.contains(i) { color.New(color.FgRed).Fprintf(out, "%c", c) // Newline after every match - only if -o is enabled and -v is disabled. diff --git a/misc.go b/misc.go index f3c8fd5..20443a7 100644 --- a/misc.go +++ b/misc.go @@ -8,8 +8,10 @@ import ( var whitespaceChars = []rune{' ', '\t', '\n'} var digitChars = []rune{'0', '1', '2', '3', '4', '5', '6', '7', '8', '9'} var wordChars = []rune("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_") +var notDotChars = []rune{'\n'} var LBRACKET rune = 0xF0000 var RBRACKET rune = 0xF0001 +var ANY_CHAR rune = 0xF0002 // Represents any character - used for 'dot' metacharacter func dotChars() []rune { // Returns all possible characters represented by the dot metacharacter - this is too tedious to define as a variable, which is why it is a function start := 0x0020 diff --git a/nfa.go b/nfa.go index b472020..69b27d5 100644 --- a/nfa.go +++ b/nfa.go @@ -23,6 +23,7 @@ type State struct { isKleene bool // Identifies whether current node is a 0-state representing Kleene star assert assertType // Type of assertion of current node - NONE means that the node doesn't assert anything zeroMatchFound bool // Whether or not the state has been used for a zero-length match - only relevant for zero states + isDot bool // Whether or not the state represents a 'dot' metacharacter. A 'dot' node doesn't store any contents directly, as it would take up too much space } // Clones the NFA starting from the given state. @@ -94,6 +95,9 @@ func (s State) contentContains(str []rune, idx int) bool { if s.assert != NONE { return s.checkAssertion(str, idx) } + if s.isDot { + return !slices.Contains(notDotChars, str[idx]) + } // Default - s.assert must be NONE return slices.Contains(s.content, int(str[idx])) } @@ -116,7 +120,12 @@ func (s State) matchesFor(str []rune, idx int) ([]*State, int) { if s.assert == NONWBOUND && isWordBoundary(str, idx) { return make([]*State, 0), -1 } - return s.transitions[int(str[idx])], len(s.transitions[int(str[idx])]) + listTransitions := s.transitions[int(str[idx])] + for _, dest := range s.transitions[int(ANY_CHAR)] { + listTransitions = append(listTransitions, dest) + } + numTransitions := len(listTransitions) + return listTransitions, numTransitions } type NFA struct { diff --git a/postfixNode.go b/postfixNode.go index 24645bb..c6c1808 100644 --- a/postfixNode.go +++ b/postfixNode.go @@ -26,6 +26,7 @@ type postfixNode struct { contents []rune // Contents of the node - the length of this would only be >1 if the node represents a character class startReps int // Minimum number of times the node should be repeated - used with numeric specifiers endReps int // Maximum number of times the node should be repeated - used with numeric specifiers + isDot bool // Whether or not the current node represents a 'dot' metacharacter } // Creates a new escaped node - the given character is assumed to have been preceded by a backslash @@ -105,9 +106,22 @@ func newPostfixNode(contents ...rune) postfixNode { return to_return } +// Creates and returns a postfixNode representing the 'dot' metacharacter. +func newPostfixDotNode() postfixNode { + toReturn := postfixNode{} + toReturn.startReps = 1 + toReturn.endReps = 1 + toReturn.nodetype = CHARACTER + toReturn.isDot = true + toReturn.contents = []rune{ANY_CHAR} + return toReturn +} + // Creates a character node, regardless of the contents func newPostfixCharNode(contents ...rune) postfixNode { toReturn := postfixNode{} + toReturn.startReps = 1 + toReturn.endReps = 1 toReturn.nodetype = CHARACTER toReturn.contents = append(toReturn.contents, contents...) return toReturn