Added Unicode support to the dot metacharacter - it now matches _any_ Unicode character (almost: characters listed in notDotChars are still excluded)

This commit is contained in:
2024-11-18 16:44:43 -05:00
parent 8a1f1dc621
commit c56d81a335
4 changed files with 35 additions and 5 deletions

11
nfa.go
View File

@@ -23,6 +23,7 @@ type State struct {
isKleene bool // Identifies whether current node is a 0-state representing Kleene star
assert assertType // Type of assertion of current node - NONE means that the node doesn't assert anything
zeroMatchFound bool // Whether or not the state has been used for a zero-length match - only relevant for zero states
isDot bool // Whether or not the state represents a 'dot' metacharacter. A 'dot' node doesn't store any contents directly, as it would take up too much space
}
// Clones the NFA starting from the given state.
@@ -94,6 +95,9 @@ func (s State) contentContains(str []rune, idx int) bool {
if s.assert != NONE {
return s.checkAssertion(str, idx)
}
if s.isDot {
return !slices.Contains(notDotChars, str[idx])
}
// Default - s.assert must be NONE
return slices.Contains(s.content, int(str[idx]))
}
@@ -116,7 +120,12 @@ func (s State) matchesFor(str []rune, idx int) ([]*State, int) {
if s.assert == NONWBOUND && isWordBoundary(str, idx) {
return make([]*State, 0), -1
}
return s.transitions[int(str[idx])], len(s.transitions[int(str[idx])])
listTransitions := s.transitions[int(str[idx])]
for _, dest := range s.transitions[int(ANY_CHAR)] {
listTransitions = append(listTransitions, dest)
}
numTransitions := len(listTransitions)
return listTransitions, numTransitions
}
type NFA struct {