16 Commits

Author SHA1 Message Date
858e535fba Continued implementing Thompson's algorithm 2025-02-05 18:01:36 -05:00
7c62ba6bfd Started implementing Thompson's algorithm for matching, because the old one was completely backtracking (so it would enter infinite loops on something like '(a*)*' )
The git diff claims that a ton of code was changed, but most of it was just indentation changes.
2025-02-05 12:21:12 -05:00
d4e8cb74fd Replaced pointer to nfaState with nfaState 2025-02-05 11:32:20 -05:00
3ce611d121 More work towards implementing PCRE matching 2025-02-04 14:09:24 -05:00
e0253dfaf3 Change kleene() to an alternation-style construct 2025-02-04 14:09:04 -05:00
753e973d82 Started rewrite of matching algorithm, got concatenation and alternation done, kleene and zero-state stuff is next 2025-02-03 22:01:52 -05:00
5563a70568 Reverse the order in which I pop states for alternation, because this messes with the left branch-right branch thing 2025-02-03 21:59:41 -05:00
de0d7345a8 Store left and right branches of alternation separately 2025-02-03 21:59:05 -05:00
ad273b0c68 Trying to emulate backtracking by using string pointers within threads (something similar to rsc's 2nd regexp article) 2025-02-03 16:50:11 -05:00
e167cdb2cb Fixed mistake in test output 2025-02-03 16:49:30 -05:00
1fd48ae614 Store the current string pointer as a 'thread variable' (allows us to simulate backtracking) 2025-02-03 16:49:10 -05:00
09812956ac Disable all optimizations 2025-02-03 16:48:09 -05:00
fbc9dfcc95 Trying something out; we'll see if it works 2025-02-03 16:47:53 -05:00
bc32e0cb76 Started working on converting to PCRE matching rules (prefer left branch of alternation) 2025-02-03 14:06:14 -05:00
ad0f7d0178 Added new state fields to tell if a state is a question or alternation 2025-02-03 14:05:53 -05:00
4e597f8eb1 Implemented a priority-queue to use while matching 2025-02-03 14:05:30 -05:00
6 changed files with 467 additions and 177 deletions

View File

@@ -6,8 +6,8 @@ fmt:
vet: fmt
go vet ./...
buildLib: vet
go build -gcflags="-N -l" ./...
go build -gcflags="all=-N -l" ./...
buildCmd: buildLib
go build -C cmd/ -gcflags="-N -l" -o re ./...
go build -C cmd/ -gcflags="all=-N -l" -o re ./...
test: buildCmd
go test -v ./...

View File

@@ -1059,8 +1059,8 @@ func thompson(re []postfixNode) (Reg, error) {
// '|a'
// '^a|'
// '^|a'
s1, err1 := pop(&nfa)
s2, err2 := pop(&nfa)
s2, err1 := pop(&nfa)
s1, err2 := pop(&nfa)
if err2 != nil || (s2.groupBegin && len(s2.transitions) == 0) { // Doesn't exist, or its just an LPAREN
if err2 == nil { // Roundabout way of saying that this node existed, but it was an LPAREN, so we append it back
nfa = append(nfa, s2)

View File

@@ -2,6 +2,7 @@ package regex
import (
"fmt"
"slices"
"sort"
)
@@ -150,6 +151,11 @@ func pruneIndices(indices []Match) []Match {
return toRet
}
// copyThread transfers the per-thread matching state from one NFA state to
// another: the string pointer and the capture-group indices.
//
// The groups are duplicated into freshly allocated storage, so later writes
// through to.threadGroups do not show up in from.threadGroups. The destination
// slice is always non-nil afterwards, even when from.threadGroups is nil.
func copyThread(to *nfaState, from nfaState) {
	// make always yields a non-nil slice, including for length 0.
	groups := make([]Group, len(from.threadGroups))
	copy(groups, from.threadGroups)

	to.threadGroups = groups
	to.threadSP = from.threadSP
}
// Find returns the 0-group of the leftmost match of the regex in the given string.
// An error value != nil indicates that no match was found.
func (regex Reg) Find(str string) (Group, error) {
@@ -265,15 +271,16 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
// chosen as the match for the entire string.
// This allows us to pick the longest possible match (which is how greedy matching works).
// COMMENT ABOVE IS CURRENTLY NOT UP-TO-DATE
tempIndices := newMatch(numGroups + 1)
// tempIndices := newMatch(numGroups + 1)
foundPath := false
startIdx := offset
endIdx := offset
currentStates := make([]*nfaState, 0)
tempStates := make([]*nfaState, 0) // Used to store states that should be used in next loop iteration
i := offset // Index in string
startingFrom := i // Store starting index
// foundPath := false
//startIdx := offset
//endIdx := offset
currentStates := make([]nfaState, 0)
nextStates := make([]nfaState, 0)
// tempStates := make([]*nfaState, 0) // Used to store states that should be used in next loop iteration
i := offset // Index in string
//startingFrom := i // Store starting index
// If the first state is an assertion, makes sure the assertion
// is true before we do _anything_ else.
@@ -284,181 +291,348 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
}
}
// Increment until we hit a character matching the start state (assuming not 0-state)
if start.isEmpty == false {
for i < len(str) && !start.contentContains(str, i) {
i++
}
startIdx = i
startingFrom = i
i++ // Advance to next character (if we aren't at a 0-state, which doesn't match anything), so that we can check for transitions. If we advance at a 0-state, we will never get a chance to match the first character
}
// if start.isEmpty == false {
// for i < len(str) && !start.contentContains(str, i) {
// i++
// }
// startIdx = i
// startingFrom = i
// i++ // Advance to next character (if we aren't at a 0-state, which doesn't match anything), so that we can check for transitions. If we advance at a 0-state, we will never get a chance to match the first character
// }
start.threadGroups = newMatch(numGroups + 1)
// start.threadGroups = newMatch(numGroups + 1)
// Check if the start state begins a group - if so, add the start index to our list
if start.groupBegin {
start.threadGroups[start.groupNum].StartIdx = i
// tempIndices[start.groupNum].startIdx = i
}
currentStates = append(currentStates, start)
//if start.groupBegin {
// start.threadGroups[start.groupNum].StartIdx = i
// tempIndices[start.groupNum].startIdx = i
//}
start.threadSP = i
currentStates = append(currentStates, *start)
var foundMatch bool
var isEmptyAndNoAssertion bool
// Main loop
for i < len(str) {
foundPath = false
for idx := i; idx <= len(str); idx++ {
for currentStateIdx := 0; currentStateIdx < len(currentStates); currentStateIdx++ {
currentState := currentStates[currentStateIdx]
foundMatch = false
isEmptyAndNoAssertion = false
zeroStates := make([]*nfaState, 0)
// Keep taking zero-states, until there are no more left to take
// Objective: If any of our current states have transitions to 0-states, replace them with the 0-state. Do this until there are no more transitions to 0-states, or there are no more unique 0-states to take.
zeroStates, isZero := takeZeroState(currentStates, numGroups, i)
tempStates = append(tempStates, zeroStates...)
num_appended := 0
for isZero == true {
zeroStates, isZero = takeZeroState(tempStates, numGroups, i)
tempStates, num_appended = uniqueAppend(tempStates, zeroStates...)
if num_appended == 0 { // Break if we haven't appended any more unique values
break
if currentState.threadGroups == nil {
currentState.threadGroups = newMatch(numGroups + 1)
currentState.threadGroups[0].StartIdx = idx
}
}
currentStates, _ = uniqueAppend(currentStates, tempStates...)
tempStates = nil
if currentState.groupBegin {
currentState.threadGroups[currentState.groupNum].StartIdx = idx
// allMatches := make([]nfaState, 0)
// for _, v := range currentState.transitions {
// dereferenced := funcMap(v, func(s *nfaState) nfaState {
// return *s
// })
// allMatches = append(allMatches, dereferenced...)
// }
// slices.Reverse(allMatches)
// for i := range allMatches {
// copyThread(&allMatches[i], currentState)
// }
// currentStates = append(currentStates, allMatches...)
}
if currentState.groupEnd {
currentState.threadGroups[currentState.groupNum].EndIdx = idx
// allMatches := make([]nfaState, 0)
// for _, v := range currentState.transitions {
// dereferenced := funcMap(v, func(s *nfaState) nfaState {
// return *s
// })
// allMatches = append(allMatches, dereferenced...)
// }
// slices.Reverse(allMatches)
// for i := range allMatches {
// copyThread(&allMatches[i], currentState)
// }
// currentStates = append(currentStates, allMatches...)
}
// Take any transitions corresponding to current character
numStatesMatched := 0 // The number of states which had at least 1 match for this round
assertionFailed := false // Whether or not an assertion failed for this round
lastStateInList := false // Whether or not a last state was in our list of states
var lastStatePtr *nfaState = nil // Pointer to the last-state, if it was found
lastLookaroundInList := false // Whether or not a last state (that is a lookaround) was in our list of states
for _, state := range currentStates {
matches, numMatches := state.matchesFor(str, i)
if numMatches > 0 {
numStatesMatched++
tempStates = append(tempStates, matches...)
foundPath = true
for _, m := range matches {
if m.threadGroups == nil {
m.threadGroups = newMatch(numGroups + 1)
}
copy(m.threadGroups, state.threadGroups)
}
}
if numMatches < 0 {
assertionFailed = true
}
if state.isLast {
if state.isLookaround() {
lastLookaroundInList = true
}
lastStateInList = true
lastStatePtr = state
}
}
if assertionFailed && numStatesMatched == 0 { // Nothing has matched and an assertion has failed
// If I'm being completely honest, I'm not sure why I have to check specifically for a _lookaround_
// state. The explanation below is my attempt to explain this behavior.
// If you replace 'lastLookaroundInList' with 'lastStateInList', one of the test cases fails.
//
// One of the states in our list was a last state and a lookaround. In this case, we
// don't abort upon failure of the assertion, because we have found
// another path to a final state.
// Even if the last state _was_ an assertion, we can use the previously
// saved indices to find a match.
if lastLookaroundInList {
break
} else {
if i == startingFrom {
i++
}
return false, []Group{}, i
}
}
// Check if we can find a state in our list that is:
// a. A last-state
// b. Empty
// c. Doesn't assert anything
for _, s := range currentStates {
if s.isLast && s.isEmpty && s.assert == noneAssert {
lastStatePtr = s
lastStateInList = true
}
}
if lastStateInList { // A last-state was in the list of states. add the matchIndex to our MatchIndex list
for j := 1; j < numGroups+1; j++ {
tempIndices[j] = lastStatePtr.threadGroups[j]
}
endIdx = i
tempIndices[0] = Group{startIdx, endIdx}
}
// Check if we can find a zero-length match
if foundPath == false {
if ok := zeroMatchPossible(str, i, numGroups, currentStates...); ok {
if tempIndices[0].IsValid() == false {
tempIndices[0] = Group{startIdx, startIdx}
}
}
// If we haven't moved in the string, increment the counter by 1
// to ensure we don't keep trying the same string over and over.
// if i == startingFrom {
startIdx++
// i++
// if currentState.isKleene {
// // Append the next-state (after the kleene), then append the kleene state
// allMatches := make([]*nfaState, 0)
// for _, v := range currentState.transitions {
// allMatches = append(allMatches, v...)
// }
if tempIndices.numValidGroups() > 0 && tempIndices[0].IsValid() {
if tempIndices[0].StartIdx == tempIndices[0].EndIdx { // If we have a zero-length match, we have to shift the index at which we start. Otherwise we keep looking at the same paert of the string over and over.
return true, tempIndices, tempIndices[0].EndIdx + 1
// slices.Reverse(allMatches)
// for _, m := range allMatches {
// m.threadGroups = currentState.threadGroups
// m.threadSP = idx
// }
// currentStates = append(currentStates, allMatches...)
//
// // kleeneState := currentState.kleeneState
// // kleeneState.threadGroups = currentState.threadGroups
// // kleeneState.threadSP = currentState.threadSP
// // currentStates = append(currentStates, kleeneState)
// continue
// }
// Alternation - enqueue left then right state, and continue
if currentState.isAlternation {
leftState := currentState.leftState
copyThread(leftState, currentState)
currentStates = append(currentStates, *currentState.leftState)
rightState := currentState.rightState
copyThread(rightState, currentState)
currentStates = append(currentStates, *currentState.rightState)
continue
}
// Empty state - enqueue next state, do _not_ increment the SP
if currentState.isEmpty && currentState.assert == noneAssert { //&& currentState.groupBegin == false && currentState.groupEnd == false {
isEmptyAndNoAssertion = true
}
if currentState.contentContains(str, idx) {
foundMatch = true
}
if isEmptyAndNoAssertion || foundMatch {
allMatches := make([]nfaState, 0)
for _, v := range currentState.transitions {
dereferenced := funcMap(v, func(s *nfaState) nfaState {
return *s
})
allMatches = append(allMatches, dereferenced...)
}
slices.Reverse(allMatches)
for i := range allMatches {
copyThread(&allMatches[i], currentState)
if foundMatch && currentState.assert == noneAssert {
allMatches[i].threadSP += 1
}
}
if currentState.groupBegin {
currentStates = slices.Insert(currentStates, currentStateIdx+1, allMatches...)
} else if currentState.groupEnd {
currentStates = append(currentStates, allMatches...)
} else {
return true, tempIndices, tempIndices[0].EndIdx
nextStates = append(nextStates, allMatches...)
}
}
return false, []Group{}, startIdx
}
currentStates = make([]*nfaState, len(tempStates))
copy(currentStates, tempStates)
tempStates = nil
i++
}
// End-of-string reached. Go to any 0-states, until there are no more 0-states to go to. Then check if any of our states are in the end position.
// This is the exact same algorithm used inside the loop, so I should probably put it in a function.
zeroStates, isZero := takeZeroState(currentStates, numGroups, i)
tempStates = append(tempStates, zeroStates...)
num_appended := 0 // Number of unique states addded to tempStates
for isZero == true {
zeroStates, isZero = takeZeroState(tempStates, numGroups, i)
tempStates, num_appended = uniqueAppend(tempStates, zeroStates...)
if num_appended == 0 { // Break if we haven't appended any more unique values
break
}
}
currentStates = append(currentStates, tempStates...)
tempStates = nil
for _, state := range currentStates {
// Only add the match if the start index is in bounds. If the state has an assertion,
// make sure the assertion checks out.
if state.isLast && i <= len(str) {
if state.assert == noneAssert || state.checkAssertion(str, i) {
for j := 1; j < numGroups+1; j++ {
tempIndices[j] = state.threadGroups[j]
if currentState.isLast && len(nextStates) == 0 { // Last state reached
if foundMatch {
if currentState.assert != noneAssert {
currentState.threadGroups[0].EndIdx = idx
} else {
currentState.threadGroups[0].EndIdx = idx + 1
}
if idx == currentState.threadGroups[0].StartIdx {
idx += 1
}
return true, currentState.threadGroups, idx
} else if isEmptyAndNoAssertion {
currentState.threadGroups[0].EndIdx = idx
if idx == currentState.threadGroups[0].StartIdx {
idx++
}
return true, currentState.threadGroups, idx
}
endIdx = i
tempIndices[0] = Group{startIdx, endIdx}
}
}
currentStates = append([]nfaState{}, nextStates...)
nextStates = nil
}
if tempIndices.numValidGroups() > 0 {
if tempIndices[0].StartIdx == tempIndices[0].EndIdx { // If we have a zero-length match, we have to shift the index at which we start. Otherwise we keep looking at the same paert of the string over and over.
return true, tempIndices, tempIndices[0].EndIdx + 1
} else {
return true, tempIndices, tempIndices[0].EndIdx
}
}
if startIdx == startingFrom { // Increment starting index if we haven't moved in the string. Prevents us from matching the same part of the string over and over.
startIdx++
}
return false, []Group{}, startIdx
return false, []Group{}, i + 1
// zeroStates := make([]*nfaState, 0)
// // Keep taking zero-states, until there are no more left to take
// // Objective: If any of our current states have transitions to 0-states, replace them with the 0-state. Do this until there are no more transitions to 0-states, or there are no more unique 0-states to take.
// topStateItem := currentStates.peek()
// topState := topStateItem.(*priorQueueItem).state
// zeroStates, isZero := takeZeroState([]*nfaState{topState}, numGroups, i)
// tempStates = append(tempStates, zeroStates...)
// num_appended := 0
// for isZero == true {
// zeroStates, isZero = takeZeroState(tempStates, numGroups, i)
// tempStates, num_appended = uniqueAppend(tempStates, zeroStates...)
// if num_appended == 0 { // Break if we haven't appended any more unique values
// break
// }
// }
// if isZero == true {
// currentStates.Pop()
// }
//
// for _, state := range tempStates {
// heap.Push(currentStates, newPriorQueueItem(state))
// }
// tempStates = nil
//
// // Take any transitions corresponding to current character
// numStatesMatched := 0 // The number of states which had at least 1 match for this round
// assertionFailed := false // Whether or not an assertion failed for this round
// lastStateInList := false // Whether or not a last state was in our list of states
// var lastStatePtr *nfaState = nil // Pointer to the last-state, if it was found
// lastLookaroundInList := false // Whether or not a last state (that is a lookaround) was in our list of states
// for numStatesMatched == 0 && lastStateInList == false {
// if currentStates.Len() == 0 {
// break
// }
// stateItem := heap.Pop(currentStates)
// state := stateItem.(*priorQueueItem).state
// matches, numMatches := state.matchesFor(str, i)
// if numMatches > 0 {
// numStatesMatched++
// tempStates = append([]*nfaState(nil), matches...)
// foundPath = true
// for _, m := range matches {
// if m.threadGroups == nil {
// m.threadGroups = newMatch(numGroups + 1)
// }
// m.threadSP = state.threadSP + 1
// copy(m.threadGroups, state.threadGroups)
// }
// }
// if numMatches < 0 {
// assertionFailed = true
// }
// if state.isLast {
// if state.isLookaround() {
// lastLookaroundInList = true
// }
// lastStateInList = true
// lastStatePtr = state
// }
// }
//
// if assertionFailed && numStatesMatched == 0 { // Nothing has matched and an assertion has failed
// // If I'm being completely honest, I'm not sure why I have to check specifically for a _lookaround_
// // state. The explanation below is my attempt to explain this behavior.
// // If you replace 'lastLookaroundInList' with 'lastStateInList', one of the test cases fails.
// //
// // One of the states in our list was a last state and a lookaround. In this case, we
// // don't abort upon failure of the assertion, because we have found
// // another path to a final state.
// // Even if the last state _was_ an assertion, we can use the previously
// // saved indices to find a match.
// if lastLookaroundInList {
// break
// } else {
// if i == startingFrom {
// i++
// }
// return false, []Group{}, i
// }
// }
// // Check if we can find a state in our list that is:
// // a. A last-state
// // b. Empty
// // c. Doesn't assert anything
// for _, stateItem := range *currentStates {
// s := stateItem.state
// if s.isLast && s.isEmpty && s.assert == noneAssert {
// lastStatePtr = s
// lastStateInList = true
// }
// }
// if lastStateInList && numStatesMatched == 0 { // A last-state was in the list of states. add the matchIndex to our MatchIndex list
// for j := 1; j < numGroups+1; j++ {
// tempIndices[j] = lastStatePtr.threadGroups[j]
// }
// endIdx = i
// tempIndices[0] = Group{startIdx, endIdx}
// if tempIndices[0].StartIdx == tempIndices[0].EndIdx {
// return true, tempIndices, tempIndices[0].EndIdx + 1
// } else {
// return true, tempIndices, tempIndices[0].EndIdx
// }
// }
//
// // Check if we can find a zero-length match
// if foundPath == false {
// currentStatesList := funcMap(*currentStates, func(item *priorQueueItem) *nfaState {
// return item.state
// })
// if ok := zeroMatchPossible(str, i, numGroups, currentStatesList...); ok {
// if tempIndices[0].IsValid() == false {
// tempIndices[0] = Group{startIdx, startIdx}
// }
// }
// // If we haven't moved in the string, increment the counter by 1
// // to ensure we don't keep trying the same string over and over.
// // if i == startingFrom {
// startIdx++
// // i++
// // }
// if tempIndices.numValidGroups() > 0 && tempIndices[0].IsValid() {
// if tempIndices[0].StartIdx == tempIndices[0].EndIdx { // If we have a zero-length match, we have to shift the index at which we start. Otherwise we keep looking at the same part of the string over and over.
// return true, tempIndices, tempIndices[0].EndIdx + 1
// } else {
// return true, tempIndices, tempIndices[0].EndIdx
// }
// }
// return false, []Group{}, startIdx
// }
// currentStates = &priorityQueue{}
// slices.Reverse(tempStates)
// for _, state := range tempStates {
// heap.Push(currentStates, newPriorQueueItem(state))
// }
// tempStates = nil
//
// i++
// }
//
// // End-of-string reached. Go to any 0-states, until there are no more 0-states to go to. Then check if any of our states are in the end position.
// // This is the exact same algorithm used inside the loop, so I should probably put it in a function.
//
// if currentStates.Len() > 0 {
// topStateItem := currentStates.peek()
// topState := topStateItem.(*priorQueueItem).state
// zeroStates, isZero := takeZeroState([]*nfaState{topState}, numGroups, i)
// tempStates = append(tempStates, zeroStates...)
// num_appended := 0 // Number of unique states added to tempStates
// for isZero == true {
// zeroStates, isZero = takeZeroState(tempStates, numGroups, i)
// tempStates, num_appended = uniqueAppend(tempStates, zeroStates...)
// if num_appended == 0 { // Break if we haven't appended any more unique values
// break
// }
// }
// }
//
// for _, state := range tempStates {
// heap.Push(currentStates, newPriorQueueItem(state))
// }
//
// tempStates = nil
//
// for _, stateItem := range *currentStates {
// state := stateItem.state
// // Only add the match if the start index is in bounds. If the state has an assertion,
// // make sure the assertion checks out.
// if state.isLast && i <= len(str) {
// if state.assert == noneAssert || state.checkAssertion(str, i) {
// for j := 1; j < numGroups+1; j++ {
// tempIndices[j] = state.threadGroups[j]
// }
// endIdx = i
// tempIndices[0] = Group{startIdx, endIdx}
// }
// }
// }
//
// if tempIndices.numValidGroups() > 0 {
// if tempIndices[0].StartIdx == tempIndices[0].EndIdx { // If we have a zero-length match, we have to shift the index at which we start. Otherwise we keep looking at the same part of the string over and over.
// return true, tempIndices, tempIndices[0].EndIdx + 1
// } else {
// return true, tempIndices, tempIndices[0].EndIdx
// }
// }
//
// if startIdx == startingFrom { // Increment starting index if we haven't moved in the string. Prevents us from matching the same part of the string over and over.
//
// startIdx++
// }
//
// return false, []Group{}, startIdx
}

View File

@@ -31,6 +31,10 @@ type nfaState struct {
output []*nfaState // The outputs of the current state ie. the 'outward arrows'. A union operator state will have more than one of these.
transitions map[int][]*nfaState // Transitions to different states (maps a character (int representation) to a _list of states. This is useful if one character can lead multiple states eg. ab|aa)
isKleene bool // Identifies whether current node is a 0-state representing Kleene star
isQuestion bool // Identifies whether current node is a 0-state representing the question operator
isAlternation bool // Identifies whether current node is a 0-state representing an alternation
leftState *nfaState // Only for alternation states - the 'left' branch of the alternation
rightState *nfaState // Only for alternation states - the 'right' branch of the alternation
assert assertType // Type of assertion of current node - NONE means that the node doesn't assert anything
allChars bool // Whether or not the state represents all characters (eg. a 'dot' metacharacter). A 'dot' node doesn't store any contents directly, as it would take up too much space
except []rune // Only valid if allChars is true - match all characters _except_ the ones in this block. Useful for inverting character classes.
@@ -43,6 +47,7 @@ type nfaState struct {
// The following properties depend on the current match - I should think about resetting them for every match.
zeroMatchFound bool // Whether or not the state has been used for a zero-length match - only relevant for zero states
threadGroups []Group // Assuming that a state is part of a 'thread' in the matching process, this array stores the indices of capturing groups in the current thread. As matches are found for this state, its groups will be copied over.
threadSP int // The string pointer of the thread - where it is in the input string
}
// Clones the NFA starting from the given state.
@@ -70,6 +75,8 @@ func cloneStateHelper(stateToClone *nfaState, cloneMap map[*nfaState]*nfaState)
output: make([]*nfaState, len(stateToClone.output)),
transitions: make(map[int][]*nfaState),
isKleene: stateToClone.isKleene,
isQuestion: stateToClone.isQuestion,
isAlternation: stateToClone.isAlternation,
assert: stateToClone.assert,
zeroMatchFound: stateToClone.zeroMatchFound,
allChars: stateToClone.allChars,
@@ -101,6 +108,14 @@ func cloneStateHelper(stateToClone *nfaState, cloneMap map[*nfaState]*nfaState)
clone.lookaroundNFA = clone
}
clone.lookaroundNFA = cloneStateHelper(stateToClone.lookaroundNFA, cloneMap)
if stateToClone.leftState == stateToClone {
clone.leftState = clone
}
clone.leftState = cloneStateHelper(stateToClone.leftState, cloneMap)
if stateToClone.rightState == stateToClone {
clone.rightState = clone
}
clone.rightState = cloneStateHelper(stateToClone.rightState, cloneMap)
return clone
}
@@ -116,6 +131,7 @@ func resetThreadsHelper(state *nfaState, visitedMap map[*nfaState]bool) {
}
// Assuming it hasn't been visited
state.threadGroups = nil
state.threadSP = 0
visitedMap[state] = true
for _, v := range state.transitions {
for _, nextState := range v {
@@ -207,6 +223,9 @@ func (s nfaState) contentContains(str []rune, idx int) bool {
if s.assert != noneAssert {
return s.checkAssertion(str, idx)
}
if idx >= len(str) {
return false
}
if s.allChars {
return !slices.Contains(slices.Concat(notDotChars, s.except), str[idx]) // Return true only if the index isn't a 'notDotChar', or isn't one of the exception characters for the current node.
}
@@ -306,12 +325,16 @@ func kleene(s1 nfaState) (*nfaState, error) {
return nil, fmt.Errorf("previous token is not quantifiable")
}
toReturn := &nfaState{}
toReturn.transitions = make(map[int][]*nfaState)
toReturn.content = newContents(epsilon)
emptyState := zeroLengthMatchState()
emptyState.assert = noneAssert
toReturn := alternate(&s1, &emptyState)
// toReturn := &nfaState{}
// toReturn.transitions = make(map[int][]*nfaState)
// toReturn.content = newContents(epsilon)
toReturn.isEmpty = true
toReturn.isKleene = true
toReturn.output = append(toReturn.output, toReturn)
toReturn.output = []*nfaState{&emptyState}
for i := range s1.output {
for _, c := range toReturn.content {
s1.output[i].transitions[c], _ = uniqueAppend(s1.output[i].transitions[c], toReturn)
@@ -320,6 +343,7 @@ func kleene(s1 nfaState) (*nfaState, error) {
for _, c := range s1.content {
toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], &s1)
}
//toReturn.kleeneState = &s1
return toReturn, nil
}
@@ -341,6 +365,9 @@ func alternate(s1 *nfaState, s2 *nfaState) *nfaState {
}
toReturn.content = newContents(epsilon)
toReturn.isEmpty = true
toReturn.isAlternation = true
toReturn.leftState = s1
toReturn.rightState = s2
return toReturn
}

89
regex/priorityQueue.go Normal file
View File

@@ -0,0 +1,89 @@
package regex
import "container/heap"
// Implement a priority queue using container/heap
// Priority levels assigned to queue items. The values increase by one from
// min_priority (0) to max_priority (5); priorityQueue.Less treats a larger
// value as sorting first.
const (
// min_priority is the smallest value in the range.
min_priority int = iota
// zerostate_priority is returned by getPriority for Kleene states and for
// other empty (zero-length) states.
zerostate_priority
// alternation_priority is returned by getPriority for alternation states.
alternation_priority
// kleene_priority is not returned by getPriority in the visible code.
// NOTE(review): confirm whether Kleene states were meant to use it.
kleene_priority
// char_priority is returned by getPriority for states that are neither
// Kleene, alternation nor empty.
char_priority
// max_priority is the largest value in the range.
max_priority
)
// getPriority classifies an NFA state into one of the queue priority levels.
// The checks are ordered, so the first matching case wins:
//
//   - Kleene state      -> zerostate_priority
//   - alternation state -> alternation_priority
//   - other empty state -> zerostate_priority
//   - anything else     -> char_priority
//
// NOTE(review): Kleene states map to zerostate_priority rather than
// kleene_priority; confirm that this is intentional.
func getPriority(state *nfaState) int {
	switch {
	case state.isKleene:
		return zerostate_priority
	case state.isAlternation:
		return alternation_priority
	case state.isEmpty:
		return zerostate_priority
	default:
		return char_priority
	}
}
// priorQueueItem is a single entry of the priority queue.
type priorQueueItem struct {
	state    *nfaState // The NFA state carried by this entry
	priority int       // Priority level of the entry (see getPriority)
	index    int       // Position of the entry in the queue's slice; -1 while it is not in a queue
}

// newPriorQueueItem wraps state in a queue entry. The priority is derived
// from the state via getPriority, and the index starts at -1 because the
// entry has not been pushed onto a queue yet.
func newPriorQueueItem(state *nfaState) *priorQueueItem {
	item := new(priorQueueItem)
	item.state = state
	item.priority = getPriority(state)
	item.index = -1
	return item
}
// priorityQueue is a slice of queue entries providing the Len, Less, Swap,
// Push and Pop methods that container/heap operates on.
type priorityQueue []*priorQueueItem

// Len reports the number of entries in the queue.
func (pq priorityQueue) Len() int {
	return len(pq)
}

// Less reports whether entry i sorts before entry j. An entry with a larger
// priority sorts first (max-heap behaviour); entries of equal priority are
// ordered by their smaller index.
func (pq priorityQueue) Less(i, j int) bool {
	a, b := pq[i], pq[j]
	if a.priority != b.priority {
		return a.priority > b.priority // We want max-heap, so we use greater-than
	}
	return a.index < b.index
}

// Swap exchanges entries i and j and records their new positions in the
// entries' index fields.
func (pq priorityQueue) Swap(i, j int) {
	pq[i], pq[j] = pq[j], pq[i]
	pq[i].index, pq[j].index = i, j
}

// Push appends x to the end of the queue and stores its position in the
// entry's index field. x must be a *priorQueueItem; any other type panics.
func (pq *priorityQueue) Push(x any) {
	end := len(*pq)
	entry := x.(*priorQueueItem)
	entry.index = end
	*pq = append(*pq, entry)
}

// Pop removes the last entry of the underlying slice and returns it. The
// entry's index is reset to -1 to mark that it is no longer in the queue.
func (pq *priorityQueue) Pop() any {
	items := *pq
	last := len(items) - 1
	entry := items[last]
	items[last] = nil // Clear the slot so the slice no longer references the entry
	entry.index = -1
	*pq = items[:last]
	return entry
}
// peek returns the entry that would be removed next by heap.Pop, without
// removing it. The result is a *priorQueueItem wrapped in any, or nil if the
// queue is empty, so callers must check for nil before type-asserting.
//
// container/heap keeps the element that sorts first under Less at index 0 of
// the slice. The last slot is just an arbitrary leaf, so the top of the queue
// has to be read from the front rather than the back.
func (pq *priorityQueue) peek() any {
	queue := *pq
	if len(queue) == 0 {
		return nil
	}
	return queue[0]
}
// update replaces the state and priority stored in item, then calls heap.Fix
// at the item's recorded index so the queue is re-ordered for the new
// priority. item is expected to currently be in pq (its index must be valid).
func (pq *priorityQueue) update(item *priorQueueItem, value *nfaState, priority int) {
	item.state, item.priority = value, priority
	heap.Fix(pq, item.index)
}

View File

@@ -701,7 +701,7 @@ func TestFind(t *testing.T) {
if len(test.result) == 0 {
return // Manually pass the test, because this is the expected behavior
} else {
t.Errorf("Wanted no match Got %v\n", groupIndex)
t.Errorf("Wanted %v Got no matches\n", test.result)
}
} else {
if groupIndex != test.result[0] {