diff --git a/Makefile b/Makefile index 252549f..4a309a9 100644 --- a/Makefile +++ b/Makefile @@ -6,8 +6,8 @@ fmt: vet: fmt go vet ./... buildLib: vet - go build -gcflags="-N -l" ./... + go build -gcflags="all=-N -l" ./... buildCmd: buildLib - go build -C cmd/ -gcflags="-N -l" -o re ./... + go build -C cmd/ -gcflags="all=-N -l" -o re ./... test: buildCmd go test -v ./... diff --git a/cmd/unique_array.go b/cmd/unique_array.go index e03621a..88c56cb 100644 --- a/cmd/unique_array.go +++ b/cmd/unique_array.go @@ -16,7 +16,6 @@ func (s *uniq_arr[T]) add(vals ...T) { s.backingMap[item] = struct{}{} } } - return } func (s uniq_arr[T]) contains(val T) bool { diff --git a/regex/compile.go b/regex/compile.go index b40c371..d9bef70 100644 --- a/regex/compile.go +++ b/regex/compile.go @@ -12,16 +12,27 @@ var notDotChars []rune // A Reg represents the result of compiling a regular expression. It contains // the startState of the NFA representation of the regex, and the number of capturing -// groups in the regex. +// groups in the regex. It also contains the expression string. type Reg struct { - start *nfaState - numGroups int + start *nfaState + numGroups int + str string + preferLongest bool } -// numSubexp eturns the number of sub-expressions in the given [Reg]. This is equivalent +// NumSubexp returns the number of sub-expressions in the given [Reg]. This is equivalent // to the number of capturing groups. -func (r Reg) NumSubexp() int { - return r.numGroups +func (re Reg) NumSubexp() int { + return re.numGroups +} + +// String returns the string used to compile the expression. +func (re Reg) String() string { + return re.str +} + +func (re *Reg) Longest() { + re.preferLongest = true } const concatRune rune = 0xF0001 @@ -816,13 +827,12 @@ func thompson(re []postfixNode) (Reg, error) { // In these cases, we will return an NFA with 1 state, with an assertion that is always true. if len(re) == 0 { start := zeroLengthMatchState() - nfa = append(nfa, &start) + nfa = append(nfa, start) } for _, c := range re { if c.nodetype == characterNode || c.nodetype == assertionNode { stateToAdd := nfaState{} - stateToAdd.transitions = make(map[int][]*nfaState) if c.allChars { stateToAdd.allChars = true if len(c.except) != 0 { @@ -934,7 +944,6 @@ func thompson(re []postfixNode) (Reg, error) { s.isEmpty = true s.output = make([]*nfaState, 0) s.output = append(s.output, s) - s.transitions = make(map[int][]*nfaState) // LPAREN nodes are just added normally if c.nodetype == lparenNode { numGroups++ @@ -966,7 +975,7 @@ func thompson(re []postfixNode) (Reg, error) { s.groupNum = lparenNode.groupNum to_add := concatenate(lparenNode, s) nfa = append(nfa, to_add) - } else if middleNode.groupBegin && len(middleNode.transitions) == 0 { // The middle node is a lone lparen - something like '(())', and I'm looking at the first rparen + } else if middleNode.groupBegin && middleNode.numTransitions() == 0 { // The middle node is a lone lparen - something like '(())', and I'm looking at the first rparen nfa = append(nfa, lparenNode) // I shouldn't have popped this out, because it is not involved in the current capturing group s.groupNum = middleNode.groupNum // In this case, the 'middle' node is actually an lparen to_add := concatenate(middleNode, s) @@ -989,7 +998,8 @@ func thompson(re []postfixNode) (Reg, error) { if c.nodetype == charclassNode { // A Character class consists of all the nodes in it, alternated // Map the list of nodes to a list of states, each state containing the contents of a specific node states := funcMap(c.nodeContents, func(node postfixNode) *nfaState { - s := newState() + s := &nfaState{} + s.output = append(s.output, s) nodeContents := node.contents if caseInsensitive { nodeContents = slices.Concat(funcMap(nodeContents, func(r rune) []rune { @@ -1003,7 +1013,7 @@ func thompson(re []postfixNode) (Reg, error) { return n.contents })...) } - return &s + return s }) // Reduce the list of states down to a single state by alternating them toAdd := funcReduce(states, func(s1 *nfaState, s2 *nfaState) *nfaState { @@ -1030,14 +1040,14 @@ func thompson(re []postfixNode) (Reg, error) { if err != nil { return Reg{}, fmt.Errorf("error applying kleene star") } - stateToAdd, err := kleene(*s1) + stateToAdd, err := kleene(s1) if err != nil { return Reg{}, err } nfa = append(nfa, stateToAdd) case plusNode: // a+ is equivalent to aa* s1 := mustPop(&nfa) - s2, err := kleene(*s1) + s2, err := kleene(s1) if err != nil { return Reg{}, err } @@ -1048,7 +1058,10 @@ func thompson(re []postfixNode) (Reg, error) { if err != nil { return Reg{}, fmt.Errorf("error applying question operator") } - s2 := question(s1) + s2, err := question(s1) + if err != nil { + return Reg{}, err + } nfa = append(nfa, s2) case pipeNode: // A pipe operator doesn't actually need either operand to be present. If an operand isn't present, @@ -1059,21 +1072,21 @@ func thompson(re []postfixNode) (Reg, error) { // '|a' // '^a|' // '^|a' - s1, err1 := pop(&nfa) - s2, err2 := pop(&nfa) - if err2 != nil || (s2.groupBegin && len(s2.transitions) == 0) { // Doesn't exist, or its just an LPAREN + s2, err1 := pop(&nfa) + s1, err2 := pop(&nfa) + if err2 != nil || (s2.groupBegin && s2.numTransitions() == 0) { // Doesn't exist, or its just an LPAREN if err2 == nil { // Roundabout way of saying that this node existed, but it was an LPAREN, so we append it back nfa = append(nfa, s2) } tmp := zeroLengthMatchState() - s2 = &tmp + s2 = tmp } - if err1 != nil || (s1.groupBegin && len(s1.transitions) == 0) { // Doesn't exist, or its just an LPAREN + if err1 != nil || (s1.groupBegin && s1.numTransitions() == 0) { // Doesn't exist, or its just an LPAREN if err1 == nil { // See above for explanation nfa = append(nfa, s1) } tmp := zeroLengthMatchState() - s1 = &tmp + s1 = tmp } s3 := alternate(s1, s2) nfa = append(nfa, s3) @@ -1100,14 +1113,18 @@ func thompson(re []postfixNode) (Reg, error) { stateToAdd = concatenate(stateToAdd, cloneState(poppedState)) } if c.endReps == infinite_reps { // Case 3 - s2, err := kleene(*poppedState) + s2, err := kleene(poppedState) if err != nil { return Reg{}, err } stateToAdd = concatenate(stateToAdd, s2) } else { // Case 2 for i := c.startReps; i < c.endReps; i++ { - stateToAdd = concatenate(stateToAdd, question(cloneState(poppedState))) + tmp, err := question(cloneState(poppedState)) + if err != nil { + return Reg{}, fmt.Errorf("error processing bounded repetition") + } + stateToAdd = concatenate(stateToAdd, tmp) } } nfa = append(nfa, stateToAdd) @@ -1117,9 +1134,13 @@ func thompson(re []postfixNode) (Reg, error) { return Reg{}, fmt.Errorf("invalid regex") } - verifyLastStates(nfa) + lastState := newState() + lastState.isLast = true + + concatenate(nfa[0], &lastState) - return Reg{nfa[0], numGroups}, nil + // The string is empty here, because we add it in Compile() + return Reg{nfa[0], numGroups, "", false}, nil } @@ -1137,10 +1158,11 @@ func Compile(re string, flags ...ReFlag) (Reg, error) { if err != nil { return Reg{}, fmt.Errorf("error compiling regex: %w", err) } + reg.str = re return reg, nil } -// MustCompile panicks if Compile returns an error. They are identical in all other respects. +// MustCompile panics if Compile returns an error. They are identical in all other respects. func MustCompile(re string, flags ...ReFlag) Reg { reg, err := Compile(re, flags...) if err != nil { diff --git a/regex/doc.go b/regex/doc.go index 1b821c1..c5124e0 100644 --- a/regex/doc.go +++ b/regex/doc.go @@ -4,6 +4,8 @@ Package regex implements regular expression search, using a custom non-bracktrac The engine relies completely on UTF-8 codepoints. As such, it is capable of matching characters from other languages, emojis and symbols. +The API and regex syntax are largely compatible with that of the stdlib's [regexp], with a few key differences (see 'Key Differences with regexp'). + The full syntax is specified below. # Syntax @@ -55,8 +57,8 @@ POSIX classes (inside normal character classes): Composition: def Match d, followed by e, followed by f - x|y Match x or y (prefer longer one) - xy|z Match xy or z + x|y Match x or y (prefer x) + xy|z Match xy or z (prefer xy) Repitition (always greedy, preferring more): @@ -94,10 +96,11 @@ Lookarounds: Numeric ranges: Match any number from x to y (inclusive) (x and y must be positive numbers) + \= 0 && g.EndIdx >= 0 { - numValid++ - } - } - return numValid -} - // Returns a string containing the indices of all (valid) groups in the match func (m Match) String() string { var toRet string @@ -59,7 +48,7 @@ func (idx Group) String() string { return fmt.Sprintf("%d\t%d", idx.StartIdx, idx.EndIdx) } -// Returns whether a group is valid (ie. whether it matched any text). It +// IsValid returns whether a group is valid (ie. whether it matched any text). It // simply ensures that both indices of the group are >= 0. func (g Group) IsValid() bool { return g.StartIdx >= 0 && g.EndIdx >= 0 @@ -70,101 +59,42 @@ func getZeroGroup(m Match) Group { return m[0] } -// takeZeroState takes the 0-state (if such a transition exists) for all states in the -// given slice. It returns the resulting states. If any of the resulting states is a 0-state, -// the second ret val is true. -// If a state begins or ends a capturing group, its 'thread' is updated to contain the correct index. -func takeZeroState(states []*nfaState, numGroups int, idx int) (rtv []*nfaState, isZero bool) { - for _, state := range states { - if len(state.transitions[epsilon]) > 0 { - for _, s := range state.transitions[epsilon] { - if s.threadGroups == nil { - s.threadGroups = newMatch(numGroups + 1) - } - copy(s.threadGroups, state.threadGroups) - if s.groupBegin { - s.threadGroups[s.groupNum].StartIdx = idx - // openParenGroups = append(openParenGroups, s.groupNum) - } - if s.groupEnd { - s.threadGroups[s.groupNum].EndIdx = idx - // closeParenGroups = append(closeParenGroups, s.groupNum) - } - } - rtv = append(rtv, state.transitions[epsilon]...) - } - } - for _, state := range rtv { - if len(state.transitions[epsilon]) > 0 { - return rtv, true - } - } - return rtv, false +func copyThread(to *nfaState, from nfaState) { + to.threadGroups = append([]Group{}, from.threadGroups...) } -// zeroMatchPossible returns true if a zero-length match is possible -// from any of the given states, given the string and our position in it. -// It uses the same algorithm to find zero-states as the one inside the loop, -// so I should probably put it in a function. -func zeroMatchPossible(str []rune, idx int, numGroups int, states ...*nfaState) bool { - zeroStates, isZero := takeZeroState(states, numGroups, idx) - tempstates := make([]*nfaState, 0, len(zeroStates)+len(states)) - tempstates = append(tempstates, states...) - tempstates = append(tempstates, zeroStates...) - num_appended := 0 // number of unique states addded to tempstates - for isZero == true { - zeroStates, isZero = takeZeroState(tempstates, numGroups, idx) - tempstates, num_appended = uniqueAppend(tempstates, zeroStates...) - if num_appended == 0 { // break if we haven't appended any more unique values - break - } - } - for _, state := range tempstates { - if state.isEmpty && (state.assert == noneAssert || state.checkAssertion(str, idx)) && state.isLast { - return true - } +// Find returns the 0-group of the leftmost match of the regex in the given string. +// An error value != nil indicates that no match was found. +func (re Reg) Find(str string) (Group, error) { + match, err := re.FindNthMatch(str, 1) + if err != nil { + return Group{}, fmt.Errorf("no matches found") } - return false + return getZeroGroup(match), nil } -// Prunes the slice by removing overlapping indices. -func pruneIndices(indices []Match) []Match { - // First, sort the slice by the start indices - sort.Slice(indices, func(i, j int) bool { - return indices[i][0].StartIdx < indices[j][0].StartIdx - }) - toRet := make([]Match, 0, len(indices)) - current := indices[0] - for _, idx := range indices[1:] { - // idx doesn't overlap with current (starts after current ends), so add current to result - // and update the current. - if idx[0].StartIdx >= current[0].EndIdx { - toRet = append(toRet, current) - current = idx - } else if idx[0].EndIdx > current[0].EndIdx { - // idx overlaps, but it is longer, so update current - current = idx - } - } - // Add last state - toRet = append(toRet, current) - return toRet +// Match returns a boolean value, indicating whether the regex found a match in the given string. +func (re Reg) Match(str string) bool { + _, err := re.Find(str) + return err == nil } -// Find returns the 0-group of the leftmost match of the regex in the given string. -// An error value != nil indicates that no match was found. -func (regex Reg) Find(str string) (Group, error) { - match, err := regex.FindNthMatch(str, 1) +// CompileMatch compiles expr and returns true if str contains a match of the expression. +// It is equivalent to [regexp.Match]. +// An optional list of flags may be provided (see [ReFlag]). +// It returns an error (!= nil) if there was an error compiling the expression. +func CompileMatch(expr string, str string, flags ...ReFlag) (bool, error) { + re, err := Compile(expr, flags...) if err != nil { - return Group{}, fmt.Errorf("no matches found") + return false, err } - return getZeroGroup(match), nil + return re.Match(str), nil } // FindAll returns a slice containing all the 0-groups of the regex in the given string. // A 0-group represents the match without any submatches. -func (regex Reg) FindAll(str string) []Group { - indices := regex.FindAllSubmatch(str) +func (re Reg) FindAll(str string) []Group { + indices := re.FindAllSubmatch(str) zeroGroups := funcMap(indices, getZeroGroup) return zeroGroups } @@ -173,8 +103,8 @@ func (regex Reg) FindAll(str string) []Group { // The return value will be an empty string in two situations: // 1. No match was found // 2. The match was an empty string -func (regex Reg) FindString(str string) string { - match, err := regex.FindNthMatch(str, 1) +func (re Reg) FindString(str string) string { + match, err := re.FindNthMatch(str, 1) if err != nil { return "" } @@ -187,8 +117,8 @@ func (regex Reg) FindString(str string) string { // number of groups. The validity of a group (whether or not it matched anything) can be determined with // [Group.IsValid], or by checking that both indices of the group are >= 0. // The second-return value is nil if no match was found. -func (regex Reg) FindSubmatch(str string) (Match, error) { - match, err := regex.FindNthMatch(str, 1) +func (re Reg) FindSubmatch(str string) (Match, error) { + match, err := re.FindNthMatch(str, 1) if err != nil { return Match{}, fmt.Errorf("no match found") } else { @@ -196,11 +126,41 @@ func (regex Reg) FindSubmatch(str string) (Match, error) { } } -// FindAllString is the 'all' version of FindString. +// FindStringSubmatch is the 'string' version of [FindSubmatch]. It returns a slice of strings, +// where the string at index i contains the text matched by the i-th capturing group. +// The 0-th index represents the entire match. +// An empty string at index n could mean: +// , +// 1. Group n did not find a match +// 2. Group n found a zero-length match +// +// A return value of nil indicates no match. +func (re Reg) FindStringSubmatch(str string) []string { + matchStr := make([]string, re.numGroups+1) + match, err := re.FindSubmatch(str) + if err != nil { + return nil + } + nonEmptyMatchFound := false + for i := range match { + if match[i].IsValid() { + matchStr[i] = str[match[i].StartIdx:match[i].EndIdx] + nonEmptyMatchFound = true + } else { + matchStr[i] = "" + } + } + if nonEmptyMatchFound == false { + return nil + } + return matchStr +} + +// FindAllString is the 'all' version of [FindString]. // It returns a slice of strings containing the text of all matches of // the regex in the given string. -func (regex Reg) FindAllString(str string) []string { - zerogroups := regex.FindAll(str) +func (re Reg) FindAllString(str string) []string { + zerogroups := re.FindAll(str) matchStrs := funcMap(zerogroups, func(g Group) string { return str[g.StartIdx:g.EndIdx] }) @@ -209,14 +169,14 @@ func (regex Reg) FindAllString(str string) []string { // FindNthMatch return the 'n'th match of the regex in the given string. // It returns an error (!= nil) if there are fewer than 'n' matches in the string. -func (regex Reg) FindNthMatch(str string, n int) (Match, error) { +func (re Reg) FindNthMatch(str string, n int) (Match, error) { idx := 0 matchNum := 0 str_runes := []rune(str) var matchFound bool var matchIdx Match for idx <= len(str_runes) { - matchFound, matchIdx, idx = findAllSubmatchHelper(regex.start, str_runes, idx, regex.numGroups) + matchFound, matchIdx, idx = findAllSubmatchHelper(re.start, str_runes, idx, re.numGroups, re.preferLongest) if matchFound { matchNum++ } @@ -229,31 +189,65 @@ func (regex Reg) FindNthMatch(str string, n int) (Match, error) { } // FindAllSubmatch returns a slice of matches in the given string. -func (regex Reg) FindAllSubmatch(str string) []Match { +func (re Reg) FindAllSubmatch(str string) []Match { idx := 0 str_runes := []rune(str) var matchFound bool var matchIdx Match indices := make([]Match, 0) for idx <= len(str_runes) { - matchFound, matchIdx, idx = findAllSubmatchHelper(regex.start, str_runes, idx, regex.numGroups) + matchFound, matchIdx, idx = findAllSubmatchHelper(re.start, str_runes, idx, re.numGroups, re.preferLongest) if matchFound { indices = append(indices, matchIdx) } } - if len(indices) > 0 { - return pruneIndices(indices) - } return indices } +func addStateToList(str []rune, idx int, list []nfaState, state nfaState, threadGroups []Group, visited []nfaState, preferLongest bool) []nfaState { + if stateExists(list, state) || stateExists(visited, state) { + return list + } + visited = append(visited, state) + + if state.isKleene || state.isQuestion { + copyThread(state.splitState, state) + list = addStateToList(str, idx, list, *state.splitState, threadGroups, visited, preferLongest) + copyThread(state.next, state) + list = addStateToList(str, idx, list, *state.next, threadGroups, visited, preferLongest) + return list + } + if state.isAlternation { + copyThread(state.next, state) + list = addStateToList(str, idx, list, *state.next, threadGroups, visited, preferLongest) + copyThread(state.splitState, state) + list = addStateToList(str, idx, list, *state.splitState, threadGroups, visited, preferLongest) + return list + } + state.threadGroups = append([]Group{}, threadGroups...) + if state.assert != noneAssert { + if state.checkAssertion(str, idx, preferLongest) { + copyThread(state.next, state) + return addStateToList(str, idx, list, *state.next, state.threadGroups, visited, preferLongest) + } + } + if state.groupBegin { + state.threadGroups[state.groupNum].StartIdx = idx + return addStateToList(str, idx, list, *state.next, state.threadGroups, visited, preferLongest) + } + if state.groupEnd { + state.threadGroups[state.groupNum].EndIdx = idx + return addStateToList(str, idx, list, *state.next, state.threadGroups, visited, preferLongest) + } + return append(list, state) + +} + // Helper for FindAllMatches. Returns whether it found a match, the // first Match it finds, and how far it got into the string ie. where // the next search should start from. -// -// Might return duplicates or overlapping indices, so care must be taken to prune the resulting array. -func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups int) (bool, Match, int) { +func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups int, preferLongest bool) (bool, Match, int) { // Base case - exit if offset exceeds string's length if offset > len(str) { // The second value here shouldn't be used, because we should exit when the third return value is > than len(str) @@ -261,214 +255,120 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in } resetThreads(start) - // Hold a list of match indices for the current run. When we - // can no longer find a match, the match with the largest range is - // chosen as the match for the entire string. - // This allows us to pick the longest possible match (which is how greedy matching works). - // COMMENT ABOVE IS CURRENTLY NOT UP-TO-DATE - tempIndices := newMatch(numGroups + 1) - - foundPath := false - startIdx := offset - endIdx := offset - currentStates := make([]*nfaState, 0) - tempStates := make([]*nfaState, 0) // Used to store states that should be used in next loop iteration - i := offset // Index in string - startingFrom := i // Store starting index + currentStates := make([]nfaState, 0) + nextStates := make([]nfaState, 0) + i := offset // Index in string // If the first state is an assertion, makes sure the assertion // is true before we do _anything_ else. if start.assert != noneAssert { - if start.checkAssertion(str, offset) == false { + if start.checkAssertion(str, offset, preferLongest) == false { i++ return false, []Group{}, i } } - // Increment until we hit a character matching the start state (assuming not 0-state) - if start.isEmpty == false { - for i < len(str) && !start.contentContains(str, i) { - i++ - } - startIdx = i - startingFrom = i - i++ // Advance to next character (if we aren't at a 0-state, which doesn't match anything), so that we can check for transitions. If we advance at a 0-state, we will never get a chance to match the first character - } start.threadGroups = newMatch(numGroups + 1) - // Check if the start state begins a group - if so, add the start index to our list - if start.groupBegin { - start.threadGroups[start.groupNum].StartIdx = i - // tempIndices[start.groupNum].startIdx = i - } - - currentStates = append(currentStates, start) - - // Main loop - for i < len(str) { - foundPath = false - - zeroStates := make([]*nfaState, 0) - // Keep taking zero-states, until there are no more left to take - // Objective: If any of our current states have transitions to 0-states, replace them with the 0-state. Do this until there are no more transitions to 0-states, or there are no more unique 0-states to take. - zeroStates, isZero := takeZeroState(currentStates, numGroups, i) - tempStates = append(tempStates, zeroStates...) - num_appended := 0 - for isZero == true { - zeroStates, isZero = takeZeroState(tempStates, numGroups, i) - tempStates, num_appended = uniqueAppend(tempStates, zeroStates...) - if num_appended == 0 { // Break if we haven't appended any more unique values - break - } + start.threadGroups[0].StartIdx = i + currentStates = addStateToList(str, i, currentStates, *start, start.threadGroups, nil, preferLongest) + var match Match = nil + for idx := i; idx <= len(str); idx++ { + if len(currentStates) == 0 { + break } + for currentStateIdx := 0; currentStateIdx < len(currentStates); currentStateIdx++ { + currentState := currentStates[currentStateIdx] - currentStates = slices.Concat(currentStates, tempStates) - tempStates = nil - - // Take any transitions corresponding to current character - numStatesMatched := 0 // The number of states which had at least 1 match for this round - assertionFailed := false // Whether or not an assertion failed for this round - lastStateInList := false // Whether or not a last state was in our list of states - var lastStatePtr *nfaState = nil // Pointer to the last-state, if it was found - lastLookaroundInList := false // Whether or not a last state (that is a lookaround) was in our list of states - for numStatesMatched == 0 && lastStateInList == false { - if len(currentStates) == 0 { - break + if currentState.threadGroups == nil { + currentState.threadGroups = newMatch(numGroups + 1) + currentState.threadGroups[0].StartIdx = idx } - state, _ := pop(¤tStates) - matches, numMatches := state.matchesFor(str, i) - if numMatches > 0 { - numStatesMatched++ - tempStates = append([]*nfaState(nil), matches...) - foundPath = true - for _, m := range matches { - if m.threadGroups == nil { - m.threadGroups = newMatch(numGroups + 1) - } - copy(m.threadGroups, state.threadGroups) + + if currentState.isLast { + currentState.threadGroups[0].EndIdx = idx + match = append([]Group{}, currentState.threadGroups...) + if !preferLongest { + break } - } - if numMatches < 0 { - assertionFailed = true - } - if state.isLast { - if state.isLookaround() { - lastLookaroundInList = true + } else if !currentState.isAlternation && !currentState.isKleene && !currentState.isQuestion && !currentState.groupBegin && !currentState.groupEnd && currentState.assert == noneAssert { // Normal character + if currentState.contentContains(str, idx, preferLongest) { + nextStates = addStateToList(str, idx+1, nextStates, *currentState.next, currentState.threadGroups, nil, preferLongest) } - lastStateInList = true - lastStatePtr = state } } + currentStates = append([]nfaState{}, nextStates...) + nextStates = nil + } + if match != nil { + if offset == match[0].EndIdx { + return true, match, match[0].EndIdx + 1 + } + return true, match, match[0].EndIdx + } + return false, []Group{}, i + 1 +} - if assertionFailed && numStatesMatched == 0 { // Nothing has matched and an assertion has failed - // If I'm being completely honest, I'm not sure why I have to check specifically for a _lookaround_ - // state. The explanation below is my attempt to explain this behavior. - // If you replace 'lastLookaroundInList' with 'lastStateInList', one of the test cases fails. - // - // One of the states in our list was a last state and a lookaround. In this case, we - // don't abort upon failure of the assertion, because we have found - // another path to a final state. - // Even if the last state _was_ an assertion, we can use the previously - // saved indices to find a match. - if lastLookaroundInList { - break +// Expand appends template to dst, expanding any variables in template to the relevant capturing group. +// +// A variable is of the form '$n', where 'n' is a number. It will be replaced by the contents of the n-th capturing group. +// To insert a literal $, do not put a number after it. Alternatively, you can use $$. +// src is the input string, and match must be the result of [Reg.FindSubmatch]. +func (re Reg) Expand(dst string, template string, src string, match Match) string { + templateRuneSlc := []rune(template) + srcRuneSlc := []rune(src) + i := 0 + for i < len(templateRuneSlc) { + c := templateRuneSlc[i] + if c == '$' { + i += 1 + // The dollar sign is the last character of the string, or it is proceeded by another dollar sign + if i >= len(templateRuneSlc) || templateRuneSlc[i] == '$' { + dst += "$" + i++ } else { - if i == startingFrom { + numStr := "" + for unicode.IsDigit(templateRuneSlc[i]) { + numStr += string(templateRuneSlc[i]) i++ } - return false, []Group{}, i - } - } - // Check if we can find a state in our list that is: - // a. A last-state - // b. Empty - // c. Doesn't assert anything - for _, s := range currentStates { - if s.isLast && s.isEmpty && s.assert == noneAssert { - lastStatePtr = s - lastStateInList = true - } - } - if lastStateInList && numStatesMatched == 0 { // A last-state was in the list of states. add the matchIndex to our MatchIndex list - for j := 1; j < numGroups+1; j++ { - tempIndices[j] = lastStatePtr.threadGroups[j] - } - endIdx = i - tempIndices[0] = Group{startIdx, endIdx} - if tempIndices[0].StartIdx == tempIndices[0].EndIdx { - return true, tempIndices, tempIndices[0].EndIdx + 1 - } else { - return true, tempIndices, tempIndices[0].EndIdx - } - } - - // Check if we can find a zero-length match - if foundPath == false { - if ok := zeroMatchPossible(str, i, numGroups, currentStates...); ok { - if tempIndices[0].IsValid() == false { - tempIndices[0] = Group{startIdx, startIdx} - } - } - // If we haven't moved in the string, increment the counter by 1 - // to ensure we don't keep trying the same string over and over. - // if i == startingFrom { - startIdx++ - // i++ - // } - if tempIndices.numValidGroups() > 0 && tempIndices[0].IsValid() { - if tempIndices[0].StartIdx == tempIndices[0].EndIdx { // If we have a zero-length match, we have to shift the index at which we start. Otherwise we keep looking at the same paert of the string over and over. - return true, tempIndices, tempIndices[0].EndIdx + 1 + if numStr == "" { + dst += "$" } else { - return true, tempIndices, tempIndices[0].EndIdx + num, _ := strconv.Atoi(numStr) + if num < len(match) { + dst += string(srcRuneSlc[match[num].StartIdx:match[num].EndIdx]) + } else { + dst += "$" + numStr + } } } - return false, []Group{}, startIdx - } - currentStates = make([]*nfaState, len(tempStates)) - copy(currentStates, tempStates) - tempStates = nil - - i++ - } - - // End-of-string reached. Go to any 0-states, until there are no more 0-states to go to. Then check if any of our states are in the end position. - // This is the exact same algorithm used inside the loop, so I should probably put it in a function. - zeroStates, isZero := takeZeroState(currentStates, numGroups, i) - tempStates = append(tempStates, zeroStates...) - num_appended := 0 // Number of unique states addded to tempStates - for isZero == true { - zeroStates, isZero = takeZeroState(tempStates, numGroups, i) - tempStates, num_appended = uniqueAppend(tempStates, zeroStates...) - if num_appended == 0 { // Break if we haven't appended any more unique values - break + } else { + dst += string(c) + i++ } } + return dst +} - currentStates = append(currentStates, tempStates...) - tempStates = nil - - for _, state := range currentStates { - // Only add the match if the start index is in bounds. If the state has an assertion, - // make sure the assertion checks out. - if state.isLast && i <= len(str) { - if state.assert == noneAssert || state.checkAssertion(str, i) { - for j := 1; j < numGroups+1; j++ { - tempIndices[j] = state.threadGroups[j] - } - endIdx = i - tempIndices[0] = Group{startIdx, endIdx} - } - } +// LiteralPrefix returns a string that must begin any match of the given regular expression. +// The second return value is true if the string comprises the entire expression. +func (re Reg) LiteralPrefix() (prefix string, complete bool) { + state := re.start + if state.assert != noneAssert { + state = state.next } - - if tempIndices.numValidGroups() > 0 { - if tempIndices[0].StartIdx == tempIndices[0].EndIdx { // If we have a zero-length match, we have to shift the index at which we start. Otherwise we keep looking at the same paert of the string over and over. - return true, tempIndices, tempIndices[0].EndIdx + 1 - } else { - return true, tempIndices, tempIndices[0].EndIdx + for !(state.isLast) && (!state.isAlternation) && len(state.content) == 1 && state.assert == noneAssert { + if state.groupBegin || state.groupEnd { + state = state.next + continue } + prefix += string(rune(state.content[0])) + state = state.next } - if startIdx == startingFrom { // Increment starting index if we haven't moved in the string. Prevents us from matching the same part of the string over and over. - startIdx++ + if state.isLast { + complete = true + } else { + complete = false } - return false, []Group{}, startIdx + return prefix, complete } diff --git a/regex/misc.go b/regex/misc.go index 2d21e61..38b5313 100644 --- a/regex/misc.go +++ b/regex/misc.go @@ -48,49 +48,6 @@ func isNormalChar(c rune) bool { return !slices.Contains(specialChars, c) } -// Ensure that the given elements are only appended to the given slice if they -// don't already exist. Returns the new slice, and the number of unique items appended. -func uniqueAppend[T comparable](slc []T, items ...T) ([]T, int) { - num_appended := 0 - for _, item := range items { - if !slices.Contains(slc, item) { - slc = append(slc, item) - num_appended++ - } - } - return slc, num_appended -} - -func uniqueAppendFunc[T any](slc []T, fn func(T, T) bool, items ...T) ([]T, int) { - toRet := make([]T, len(slc)) - num_appended := 0 - copy(toRet, slc) - for _, item := range items { - itemExists := false - for _, val := range slc { - if fn(item, val) { - itemExists = true - } - } - if !itemExists { - toRet = append(toRet, item) - num_appended++ - } - } - return toRet, num_appended -} - -// Returns true only if all the given elements are equal -func allEqual[T comparable](items ...T) bool { - first := items[0] - for _, item := range items { - if item != first { - return false - } - } - return true -} - // Map function - convert a slice of T to a slice of V, based on a function // that maps a T to a V func funcMap[T, V any](slc []T, fn func(T) V) []V { diff --git a/regex/nfa.go b/regex/nfa.go index 8f63eb0..c649712 100644 --- a/regex/nfa.go +++ b/regex/nfa.go @@ -25,23 +25,25 @@ const ( ) type nfaState struct { - content stateContents // Contents of current state - isEmpty bool // If it is empty - Union operator and Kleene star states will be empty - isLast bool // If it is the last state (acept state) - output []*nfaState // The outputs of the current state ie. the 'outward arrows'. A union operator state will have more than one of these. - transitions map[int][]*nfaState // Transitions to different states (maps a character (int representation) to a _list of states. This is useful if one character can lead multiple states eg. ab|aa) - isKleene bool // Identifies whether current node is a 0-state representing Kleene star - isQuestion bool // Identifies whether current node is a 0-state representing the question operator - isAlternation bool // Identifies whether current node is a 0-state representing an alternation - assert assertType // Type of assertion of current node - NONE means that the node doesn't assert anything - allChars bool // Whether or not the state represents all characters (eg. a 'dot' metacharacter). A 'dot' node doesn't store any contents directly, as it would take up too much space - except []rune // Only valid if allChars is true - match all characters _except_ the ones in this block. Useful for inverting character classes. - lookaroundRegex string // Only for lookaround states - Contents of the regex that the lookaround state holds - lookaroundNFA *nfaState // Holds the NFA of the lookaroundRegex - if it exists - lookaroundNumCaptureGroups int // Number of capturing groups in lookaround regex if current node is a lookaround - groupBegin bool // Whether or not the node starts a capturing group - groupEnd bool // Whether or not the node ends a capturing group - groupNum int // Which capturing group the node starts / ends + content stateContents // Contents of current state + isEmpty bool // If it is empty - Union operator and Kleene star states will be empty + isLast bool // If it is the last state (acept state) + output []*nfaState // The outputs of the current state ie. the 'outward arrows'. A union operator state will have more than one of these. + // transitions map[int][]*nfaState // Transitions to different states (maps a character (int representation) to a _list of states. This is useful if one character can lead multiple states eg. ab|aa) + next *nfaState // The next state (not for alternation or kleene states) + isKleene bool // Identifies whether current node is a 0-state representing Kleene star + isQuestion bool // Identifies whether current node is a 0-state representing the question operator + isAlternation bool // Identifies whether current node is a 0-state representing an alternation + splitState *nfaState // Only for alternation states - the 'other' branch of the alternation ('next' is the first) + assert assertType // Type of assertion of current node - NONE means that the node doesn't assert anything + allChars bool // Whether or not the state represents all characters (eg. a 'dot' metacharacter). A 'dot' node doesn't store any contents directly, as it would take up too much space + except []rune // Only valid if allChars is true - match all characters _except_ the ones in this block. Useful for inverting character classes. + lookaroundRegex string // Only for lookaround states - Contents of the regex that the lookaround state holds + lookaroundNFA *nfaState // Holds the NFA of the lookaroundRegex - if it exists + lookaroundNumCaptureGroups int // Number of capturing groups in lookaround regex if current node is a lookaround + groupBegin bool // Whether or not the node starts a capturing group + groupEnd bool // Whether or not the node ends a capturing group + groupNum int // Which capturing group the node starts / ends // The following properties depend on the current match - I should think about resetting them for every match. zeroMatchFound bool // Whether or not the state has been used for a zero-length match - only relevant for zero states threadGroups []Group // Assuming that a state is part of a 'thread' in the matching process, this array stores the indices of capturing groups in the current thread. As matches are found for this state, its groups will be copied over. @@ -70,7 +72,6 @@ func cloneStateHelper(stateToClone *nfaState, cloneMap map[*nfaState]*nfaState) isEmpty: stateToClone.isEmpty, isLast: stateToClone.isLast, output: make([]*nfaState, len(stateToClone.output)), - transitions: make(map[int][]*nfaState), isKleene: stateToClone.isKleene, isQuestion: stateToClone.isQuestion, isAlternation: stateToClone.isAlternation, @@ -91,20 +92,18 @@ func cloneStateHelper(stateToClone *nfaState, cloneMap map[*nfaState]*nfaState) clone.output[i] = cloneStateHelper(s, cloneMap) } } - for k, v := range stateToClone.transitions { - clone.transitions[k] = make([]*nfaState, len(v)) - for i, s := range v { - if s == stateToClone { - clone.transitions[k][i] = clone - } else { - clone.transitions[k][i] = cloneStateHelper(s, cloneMap) - } - } - } if stateToClone.lookaroundNFA == stateToClone { clone.lookaroundNFA = clone } clone.lookaroundNFA = cloneStateHelper(stateToClone.lookaroundNFA, cloneMap) + if stateToClone.splitState == stateToClone { + clone.splitState = clone + } + clone.splitState = cloneStateHelper(stateToClone.splitState, cloneMap) + if stateToClone.next == stateToClone { + clone.next = clone + } + clone.next = cloneStateHelper(stateToClone.next, cloneMap) return clone } @@ -115,22 +114,26 @@ func resetThreads(start *nfaState) { } func resetThreadsHelper(state *nfaState, visitedMap map[*nfaState]bool) { + if state == nil { + return + } if _, ok := visitedMap[state]; ok { return } // Assuming it hasn't been visited state.threadGroups = nil visitedMap[state] = true - for _, v := range state.transitions { - for _, nextState := range v { - resetThreadsHelper(nextState, visitedMap) - } + if state.isAlternation { + resetThreadsHelper(state.next, visitedMap) + resetThreadsHelper(state.splitState, visitedMap) + } else { + resetThreadsHelper(state.next, visitedMap) } } // Checks if the given state's assertion is true. Returns true if the given // state doesn't have an assertion. -func (s nfaState) checkAssertion(str []rune, idx int) bool { +func (s nfaState) checkAssertion(str []rune, idx int, preferLongest bool) bool { if s.assert == alwaysTrueAssert { return true } @@ -180,7 +183,7 @@ func (s nfaState) checkAssertion(str []rune, idx int) bool { strToMatch = string(runesToMatch) } - regComp := Reg{startState, s.lookaroundNumCaptureGroups} + regComp := Reg{startState, s.lookaroundNumCaptureGroups, s.lookaroundRegex, preferLongest} matchIndices := regComp.FindAll(strToMatch) numMatchesFound := 0 @@ -207,9 +210,12 @@ func (s nfaState) checkAssertion(str []rune, idx int) bool { } // Returns true if the contents of 's' contain the value at the given index of the given string -func (s nfaState) contentContains(str []rune, idx int) bool { +func (s nfaState) contentContains(str []rune, idx int, preferLongest bool) bool { if s.assert != noneAssert { - return s.checkAssertion(str, idx) + return s.checkAssertion(str, idx, preferLongest) + } + if idx >= len(str) { + return false } if s.allChars { return !slices.Contains(slices.Concat(notDotChars, s.except), str[idx]) // Return true only if the index isn't a 'notDotChar', or isn't one of the exception characters for the current node. @@ -222,74 +228,84 @@ func (s nfaState) isLookaround() bool { return s.assert == plaAssert || s.assert == plbAssert || s.assert == nlaAssert || s.assert == nlbAssert } -// Returns the matches for the character at the given index of the given string. -// Also returns the number of matches. Returns -1 if an assertion failed. -func (s nfaState) matchesFor(str []rune, idx int) ([]*nfaState, int) { - // Assertions can be viewed as 'checks'. If the check fails, we return - // an empty array and 0. - // If it passes, we treat it like any other state, and return all the transitions. - if s.assert != noneAssert { - if s.checkAssertion(str, idx) == false { - return make([]*nfaState, 0), -1 - } +func (s nfaState) numTransitions() int { + if s.next == nil && s.splitState == nil { + return 0 } - listTransitions := s.transitions[int(str[idx])] - for _, dest := range s.transitions[int(anyCharRune)] { - if !slices.Contains(slices.Concat(notDotChars, dest.except), str[idx]) { - // Add an allChar state to the list of matches if: - // a. The current character isn't a 'notDotChars' character. In single line mode, this includes newline. In multiline mode, it doesn't. - // b. The current character isn't the state's exception list. - listTransitions = append(listTransitions, dest) - } + if s.next == nil || s.splitState == nil { + return 1 } - numTransitions := len(listTransitions) - return listTransitions, numTransitions + return 2 } -// verifyLastStatesHelper performs the depth-first recursion needed for verifyLastStates -func verifyLastStatesHelper(st *nfaState, visited map[*nfaState]bool) { - if len(st.transitions) == 0 { - st.isLast = true - return - } - // if len(state.transitions) == 1 && len(state.transitions[state.content]) == 1 && state.transitions[state.content][0] == state { // Eg. a* - if len(st.transitions) == 1 { // Eg. a* - var moreThanOneTrans bool // Dummy variable, check if all the transitions for the current's state's contents have a length of one - for _, c := range st.content { - if len(st.transitions[c]) != 1 || st.transitions[c][0] != st { - moreThanOneTrans = true - } - } - st.isLast = !moreThanOneTrans - } +// Returns the matches for the character at the given index of the given string. +// Also returns the number of matches. Returns -1 if an assertion failed. +//func (s nfaState) matchesFor(str []rune, idx int) ([]*nfaState, int) { +// // Assertions can be viewed as 'checks'. If the check fails, we return +// // an empty array and 0. +// // If it passes, we treat it like any other state, and return all the transitions. +// if s.assert != noneAssert { +// if s.checkAssertion(str, idx) == false { +// return make([]*nfaState, 0), -1 +// } +// } +// listTransitions := s.transitions[int(str[idx])] +// for _, dest := range s.transitions[int(anyCharRune)] { +// if !slices.Contains(slices.Concat(notDotChars, dest.except), str[idx]) { +// // Add an allChar state to the list of matches if: +// // a. The current character isn't a 'notDotChars' character. In single line mode, this includes newline. In multiline mode, it doesn't. +// // b. The current character isn't the state's exception list. +// listTransitions = append(listTransitions, dest) +// } +// } +// numTransitions := len(listTransitions) +// return listTransitions, numTransitions +//} - if st.isKleene { // A State representing a Kleene Star has transitions going out, which loop back to it. If all those transitions point to the same (single) state, then it must be a last state - transitionDests := make([]*nfaState, 0) - for _, v := range st.transitions { - transitionDests = append(transitionDests, v...) - } - if allEqual(transitionDests...) { - st.isLast = true - return - } - } - if visited[st] == true { - return - } - visited[st] = true - for _, states := range st.transitions { - for i := range states { - if states[i] != st { - verifyLastStatesHelper(states[i], visited) - } - } - } -} +// verifyLastStatesHelper performs the depth-first recursion needed for verifyLastStates +//func verifyLastStatesHelper(st *nfaState, visited map[*nfaState]bool) { +// if st.numTransitions() == 0 { +// st.isLast = true +// return +// } +// // if len(state.transitions) == 1 && len(state.transitions[state.content]) == 1 && state.transitions[state.content][0] == state { // Eg. a* +// if st.numTransitions() == 1 { // Eg. a* +// var moreThanOneTrans bool // Dummy variable, check if all the transitions for the current's state's contents have a length of one +// for _, c := range st.content { +// if len(st.transitions[c]) != 1 || st.transitions[c][0] != st { +// moreThanOneTrans = true +// } +// } +// st.isLast = !moreThanOneTrans +// } +// +// if st.isKleene { // A State representing a Kleene Star has transitions going out, which loop back to it. If all those transitions point to the same (single) state, then it must be a last state +// transitionDests := make([]*nfaState, 0) +// for _, v := range st.transitions { +// transitionDests = append(transitionDests, v...) +// } +// if allEqual(transitionDests...) { +// st.isLast = true +// return +// } +// } +// if visited[st] == true { +// return +// } +// visited[st] = true +// for _, states := range st.transitions { +// for i := range states { +// if states[i] != st { +// verifyLastStatesHelper(states[i], visited) +// } +// } +// } +//} // verifyLastStates enables the 'isLast' flag for the leaf nodes (last states) -func verifyLastStates(start []*nfaState) { - verifyLastStatesHelper(start[0], make(map[*nfaState]bool)) -} +//func verifyLastStates(start []*nfaState) { +// verifyLastStatesHelper(start[0], make(map[*nfaState]bool)) +//} // Concatenates s1 and s2, returns the start of the concatenation. func concatenate(s1 *nfaState, s2 *nfaState) *nfaState { @@ -297,75 +313,84 @@ func concatenate(s1 *nfaState, s2 *nfaState) *nfaState { return s2 } for i := range s1.output { - for _, c := range s2.content { // Create transitions for every element in s1's content to s2' - s1.output[i].transitions[c], _ = uniqueAppend(s1.output[i].transitions[c], s2) - } + s1.output[i].next = s2 } s1.output = s2.output return s1 } -func kleene(s1 nfaState) (*nfaState, error) { +func kleene(s1 *nfaState) (*nfaState, error) { if s1.isEmpty && s1.assert != noneAssert { return nil, fmt.Errorf("previous token is not quantifiable") } toReturn := &nfaState{} - toReturn.transitions = make(map[int][]*nfaState) - toReturn.content = newContents(epsilon) toReturn.isEmpty = true + toReturn.isAlternation = true + toReturn.content = newContents(epsilon) + toReturn.splitState = s1 + + // toReturn := &nfaState{} + // toReturn.transitions = make(map[int][]*nfaState) + // toReturn.content = newContents(epsilon) toReturn.isKleene = true - toReturn.output = append(toReturn.output, toReturn) + toReturn.output = append([]*nfaState{}, toReturn) for i := range s1.output { - for _, c := range toReturn.content { - s1.output[i].transitions[c], _ = uniqueAppend(s1.output[i].transitions[c], toReturn) - } - } - for _, c := range s1.content { - toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], &s1) + s1.output[i].next = toReturn } + // for _, c := range s1.content { + // toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], &s1) + // } + //toReturn.kleeneState = &s1 return toReturn, nil } func alternate(s1 *nfaState, s2 *nfaState) *nfaState { toReturn := &nfaState{} - toReturn.transitions = make(map[int][]*nfaState) + // toReturn.transitions = make(map[int][]*nfaState) toReturn.output = append(toReturn.output, s1.output...) toReturn.output = append(toReturn.output, s2.output...) - // Unique append is used here (and elsewhere) to ensure that, - // for any given transition, a state can only be mentioned once. - // For example, given the transition 'a', the state 's1' can only be mentioned once. - // This would lead to multiple instances of the same set of match indices, since both - // 's1' states would be considered to match. - for _, c := range s1.content { - toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], s1) - } - for _, c := range s2.content { - toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], s2) - } + // // Unique append is used here (and elsewhere) to ensure that, + // // for any given transition, a state can only be mentioned once. + // // For example, given the transition 'a', the state 's1' can only be mentioned once. + // // This would lead to multiple instances of the same set of match indices, since both + // // 's1' states would be considered to match. + // for _, c := range s1.content { + // toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], s1) + // } + // for _, c := range s2.content { + // toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], s2) + // } toReturn.content = newContents(epsilon) toReturn.isEmpty = true toReturn.isAlternation = true + toReturn.next = s1 + toReturn.splitState = s2 return toReturn } -func question(s1 *nfaState) *nfaState { // Use the fact that ab? == a(b|) - s2 := &nfaState{} - s2.transitions = make(map[int][]*nfaState) - s2.content = newContents(epsilon) - s2.output = append(s2.output, s2) - s2.isEmpty = true - s2.isQuestion = true - s3 := alternate(s1, s2) - return s3 +func question(s1 *nfaState) (*nfaState, error) { // Use the fact that ab? == a(b|) + if s1.isEmpty && s1.assert != noneAssert { + return nil, fmt.Errorf("previous token is not quantifiable") + } + toReturn := &nfaState{} + toReturn.isEmpty = true + toReturn.isAlternation = true + toReturn.isQuestion = true + toReturn.content = newContents(epsilon) + toReturn.splitState = s1 + toReturn.output = append([]*nfaState{}, toReturn) + toReturn.output = append(toReturn.output, s1.output...) + // s2.transitions = make(map[int][]*nfaState) + return toReturn, nil } // Creates and returns a new state with the 'default' values. func newState() nfaState { ret := nfaState{ - output: make([]*nfaState, 0), - transitions: make(map[int][]*nfaState), + output: make([]*nfaState, 0), + // transitions: make(map[int][]*nfaState), assert: noneAssert, except: append([]rune{}, 0), lookaroundRegex: "", @@ -377,10 +402,40 @@ func newState() nfaState { } // Creates and returns a state that _always_ has a zero-length match. -func zeroLengthMatchState() nfaState { - start := newState() +func zeroLengthMatchState() *nfaState { + start := &nfaState{} start.content = newContents(epsilon) start.isEmpty = true start.assert = alwaysTrueAssert + start.output = append([]*nfaState{}, start) return start } + +func (s nfaState) equals(other nfaState) bool { + return s.isEmpty == other.isEmpty && + s.isLast == other.isLast && + slices.Equal(s.output, other.output) && + slices.Equal(s.content, other.content) && + s.next == other.next && + s.isKleene == other.isKleene && + s.isQuestion == other.isQuestion && + s.isAlternation == other.isAlternation && + s.splitState == other.splitState && + s.assert == other.assert && + s.allChars == other.allChars && + slices.Equal(s.except, other.except) && + s.lookaroundNFA == other.lookaroundNFA && + s.groupBegin == other.groupBegin && + s.groupEnd == other.groupEnd && + s.groupNum == other.groupNum && + slices.Equal(s.threadGroups, other.threadGroups) +} + +func stateExists(list []nfaState, s nfaState) bool { + for i := range list { + if list[i].equals(s) { + return true + } + } + return false +} diff --git a/regex/priorityQueue.go b/regex/priorityQueue.go deleted file mode 100644 index 59592a9..0000000 --- a/regex/priorityQueue.go +++ /dev/null @@ -1,76 +0,0 @@ -package regex - -import "container/heap" - -// Implement a priority queue using container/heap - -const ( - min_priority int = iota - zerostate_priority - alternation_priority - kleene_priority - char_priority - max_priority -) - -func getPriority(state *nfaState) int { - if state.isKleene { - return kleene_priority - } else if state.isQuestion || state.isAlternation { - return alternation_priority - } else { - if state.isEmpty { - return zerostate_priority - } else { - return char_priority - } - } -} - -type priorQueueItem struct { - state *nfaState - priority int - index int -} - -type priorityQueue []*priorQueueItem - -func (pq priorityQueue) Len() int { - return len(pq) -} - -func (pq priorityQueue) Less(i, j int) bool { - if pq[i].priority == pq[j].priority { - return pq[i].index > pq[j].index - } - return pq[i].priority > pq[j].priority // We want max-heap, so we use greater-than -} - -func (pq priorityQueue) Swap(i, j int) { - pq[i], pq[j] = pq[j], pq[i] - pq[i].index = i - pq[j].index = j -} - -func (pq *priorityQueue) Push(x any) { - length := len(*pq) - item := x.(*priorQueueItem) - item.index = length - *pq = append(*pq, item) -} - -func (pq *priorityQueue) Pop() any { - old := *pq - n := len(old) - item := old[n-1] - old[n-1] = nil - item.index = -1 - *pq = old[0 : n-1] - return item -} - -func (pq *priorityQueue) update(item *priorQueueItem, value *nfaState, priority int) { - item.state = value - item.priority = priority - heap.Fix(pq, item.index) -} diff --git a/regex/range2regex.go b/regex/range2regex.go index a01dfff..de8e0f4 100644 --- a/regex/range2regex.go +++ b/regex/range2regex.go @@ -109,7 +109,7 @@ func range2regex(start int, end int) (string, error) { startSlc := intToSlc(rg.start) endSlc := intToSlc(rg.end) if len(startSlc) != len(endSlc) { - return "", fmt.Errorf("Error parsing numeric range") + return "", fmt.Errorf("error parsing numeric range") } for i := range startSlc { if startSlc[i] == endSlc[i] { diff --git a/regex/re_test.go b/regex/re_test.go index 8d24304..8b9fc8d 100644 --- a/regex/re_test.go +++ b/regex/re_test.go @@ -25,7 +25,9 @@ var reTests = []struct { {"a*b", nil, "qwqw", []Group{}}, {"(abc)*", nil, "abcabcabc", []Group{{0, 9}, {9, 9}}}, {"((abc)|(def))*", nil, "abcdef", []Group{{0, 6}, {6, 6}}}, - {"(abc)*|(def)*", nil, "abcdef", []Group{{0, 3}, {3, 6}, {6, 6}}}, + // This match will only happen with Longest() + // {"(abc)*|(def)*", nil, "abcdef", []Group{{0, 3}, {3, 6}, {6, 6}}}, + {"(abc)*|(def)*", nil, "abcdef", []Group{{0, 3}, {3, 3}, {4, 4}, {5, 5}, {6, 6}}}, {"b*a*a", nil, "bba", []Group{{0, 3}}}, {"(ab)+", nil, "abcabddd", []Group{{0, 2}, {3, 5}}}, {"a(b(c|d)*)*", nil, "abccbd", []Group{{0, 6}}}, @@ -528,7 +530,7 @@ var groupTests = []struct { }{ {"(a)(b)", nil, "ab", []Match{[]Group{{0, 2}, {0, 1}, {1, 2}}}}, {"((a))(b)", nil, "ab", []Match{[]Group{{0, 2}, {0, 1}, {0, 1}, {1, 2}}}}, - {"(0)", nil, "ab", []Match{[]Group{}}}, + {"(0)", nil, "ab", []Match{}}, {"(a)b", nil, "ab", []Match{[]Group{{0, 2}, {0, 1}}}}, {"a(b)", nil, "ab", []Match{[]Group{{0, 2}, {1, 2}}}}, {"(a|b)", nil, "ab", []Match{[]Group{{0, 1}, {0, 1}}, []Group{{1, 2}, {1, 2}}}}, @@ -537,10 +539,11 @@ var groupTests = []struct { {"(a+)|(a)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 4}, {-1, -1}}}}, {"(a+)(aa)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 2}, {2, 4}}}}, {"(aaaa)|(aaaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 4}, {-1, -1}}}}, - {"(aaa)|(aaaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {-1, -1}, {0, 4}}}}, - {"(aaa)|(aaaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {-1, -1}, {0, 4}}}}, + // This match will only happen with Longest() + // {"(aaa)|(aaaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {-1, -1}, {0, 4}}}}, + {"(aaa)|(aaaa)", nil, "aaaa", []Match{[]Group{{0, 3}, {0, 3}, {-1, -1}}}}, {"(aaaa)|(aaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 4}, {-1, -1}}}}, - {"(a)|(aa)", nil, "aa", []Match{[]Group{{0, 2}, {-1, -1}, {0, 2}}}}, + {"(a)|(aa)", nil, "aa", []Match{[]Group{{0, 1}, {0, 1}}, []Group{{1, 2}, {1, 2}}}}, {"(a?)a?", nil, "b", []Match{[]Group{{0, 0}, {0, 0}}, []Group{{1, 1}, {1, 1}}}}, {"(a?)a?", nil, "ab", []Match{[]Group{{0, 1}, {0, 1}}, []Group{{1, 1}, {1, 1}}, []Group{{2, 2}, {2, 2}}}}, {"(a?)a?", nil, "aa", []Match{[]Group{{0, 2}, {0, 1}}, []Group{{2, 2}, {2, 2}}}}, @@ -578,7 +581,7 @@ var groupTests = []struct { {`(bc+d$|ef*g.|h?i(j|k))`, nil, `bcdd`, []Match{}}, {`(bc+d$|ef*g.|h?i(j|k))`, nil, `reffgz`, []Match{[]Group{{1, 6}, {1, 6}}}}, {`(((((((((a)))))))))`, nil, `a`, []Match{[]Group{{0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}}, - {`(((((((((a)))))))))\41`, nil, `a`, []Match{[]Group{{0, 2}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}}, + {`(((((((((a)))))))))\41`, nil, `a!`, []Match{[]Group{{0, 2}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}}, {`(.*)c(.*)`, nil, `abcde`, []Match{[]Group{{0, 5}, {0, 2}, {3, 5}}}}, {`\((.*), (.*)\)`, nil, `(a, b)`, []Match{[]Group{{0, 6}, {1, 2}, {4, 5}}}}, @@ -633,7 +636,7 @@ var groupTests = []struct { {`(bc+d$|ef*g.|h?i(j|k))`, []ReFlag{RE_CASE_INSENSITIVE}, `BCDD`, []Match{}}, {`(bc+d$|ef*g.|h?i(j|k))`, []ReFlag{RE_CASE_INSENSITIVE}, `reffgz`, []Match{[]Group{{1, 6}, {1, 6}}}}, {`(((((((((a)))))))))`, []ReFlag{RE_CASE_INSENSITIVE}, `A`, []Match{[]Group{{0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}}, - {`(((((((((a)))))))))\41`, []ReFlag{RE_CASE_INSENSITIVE}, `A`, []Match{[]Group{{0, 2}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}}, + {`(((((((((a)))))))))\41`, []ReFlag{RE_CASE_INSENSITIVE}, `A!`, []Match{[]Group{{0, 2}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}}, {`(.*)c(.*)`, []ReFlag{RE_CASE_INSENSITIVE}, `ABCDE`, []Match{[]Group{{0, 5}, {0, 2}, {3, 5}}}}, {`\((.*), (.*)\)`, []ReFlag{RE_CASE_INSENSITIVE}, `(A, B)`, []Match{[]Group{{0, 6}, {1, 2}, {4, 5}}}}, {`(a)(b)c|ab`, []ReFlag{RE_CASE_INSENSITIVE}, `AB`, []Match{[]Group{{0, 2}}}}, @@ -701,7 +704,7 @@ func TestFind(t *testing.T) { if len(test.result) == 0 { return // Manually pass the test, because this is the expected behavior } else { - t.Errorf("Wanted no match Got %v\n", groupIndex) + t.Errorf("Wanted %v Got no matches\n", test.result) } } else { if groupIndex != test.result[0] { @@ -743,7 +746,7 @@ func TestFindString(t *testing.T) { foundString := regComp.FindString(test.str) if len(test.result) == 0 { if foundString != "" { - t.Errorf("Expected no match got %v\n", foundString) + t.Errorf("Wanted no match got %v\n", foundString) } } else { expectedString := test.str[test.result[0].StartIdx:test.result[0].EndIdx] @@ -791,11 +794,68 @@ func TestFindSubmatch(t *testing.T) { } } match, err := regComp.FindSubmatch(test.str) + if err != nil { + if len(test.result) != 0 { + t.Errorf("Wanted %v got no match\n", test.result[0]) + } + } else if len(test.result) == 0 { + t.Errorf("Wanted no match got %v\n", match) + } for i := range match { if match[i].IsValid() { if test.result[0][i] != match[i] { t.Errorf("Wanted %v Got %v\n", test.result[0], match) } + } else { + if i < len(test.result) && test.result[0][i].IsValid() { + t.Errorf("Wanted %v Got %v\n", test.result[0], match) + } + } + } + }) + } +} +func TestFindStringSubmatch(t *testing.T) { + for _, test := range groupTests { + t.Run(test.re+" "+test.str, func(t *testing.T) { + regComp, err := Compile(test.re, test.flags...) + if err != nil { + if test.result != nil { + panic(err) + } + } + matchStr := regComp.FindStringSubmatch(test.str) + if matchStr == nil { + if len(test.result) != 0 { + expectedStr := funcMap(test.result[0], func(g Group) string { + if g.IsValid() { + return test.str[g.StartIdx:g.EndIdx] + } else { + return "" + } + }) + t.Errorf("Wanted %v got no match\n", expectedStr) + } + } else if len(test.result) == 0 { + t.Errorf("Wanted no match got %v\n", matchStr) + } else { + expectedStr := funcMap(test.result[0], func(g Group) string { + if g.IsValid() { + return test.str[g.StartIdx:g.EndIdx] + } else { + return "" + } + }) + for i, groupStr := range matchStr { + if groupStr == "" { + if i < len(expectedStr) && expectedStr[i] != "" { + t.Errorf("Wanted %v Got %v\n", expectedStr, matchStr) + } + } else { + if expectedStr[i] != groupStr { + t.Errorf("Wanted %v Got %v\n", expectedStr, matchStr) + } + } } } }) @@ -817,6 +877,10 @@ func TestFindAllSubmatch(t *testing.T) { if test.result[i][j] != matchIndices[i][j] { t.Errorf("Wanted %v Got %v\n", test.result, matchIndices) } + } else { + if i < len(test.result) && j < len(test.result[i]) && test.result[i][j].IsValid() { + t.Errorf("Wanted %v Got %v\n", test.result, matchIndices) + } } } }