From ad273b0c682c63d080da3b30da504b67bd53d482 Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Mon, 3 Feb 2025 16:50:11 -0500 Subject: [PATCH] Trying to emulate backtracking by using string pointers within threads (something similar to rsc's 2nd regexp article) --- regex/matching.go | 70 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 48 insertions(+), 22 deletions(-) diff --git a/regex/matching.go b/regex/matching.go index 17c2bcb..af2ede3 100644 --- a/regex/matching.go +++ b/regex/matching.go @@ -1,6 +1,7 @@ package regex import ( + "container/heap" "fmt" "slices" "sort" @@ -271,7 +272,8 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in foundPath := false startIdx := offset endIdx := offset - currentStates := make([]*nfaState, 0) + currentStates := &priorityQueue{} + heap.Init(currentStates) tempStates := make([]*nfaState, 0) // Used to store states that should be used in next loop iteration i := offset // Index in string startingFrom := i // Store starting index @@ -301,16 +303,19 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in // tempIndices[start.groupNum].startIdx = i } - currentStates = append(currentStates, start) - + start.threadSP = i + heap.Push(currentStates, newPriorQueueItem(start)) // Main loop - for i < len(str) { + for currentStates.Len() > 0 { + currentState := heap.Pop(currentStates) foundPath = false zeroStates := make([]*nfaState, 0) // Keep taking zero-states, until there are no more left to take // Objective: If any of our current states have transitions to 0-states, replace them with the 0-state. Do this until there are no more transitions to 0-states, or there are no more unique 0-states to take. 
- zeroStates, isZero := takeZeroState(currentStates, numGroups, i) + topStateItem := currentStates.peek() + topState := topStateItem.(*priorQueueItem).state + zeroStates, isZero := takeZeroState([]*nfaState{topState}, numGroups, i) tempStates = append(tempStates, zeroStates...) num_appended := 0 for isZero == true { @@ -320,8 +325,13 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in break } } + if isZero == true { + currentStates.Pop() + } - currentStates = slices.Concat(currentStates, tempStates) + for _, state := range tempStates { + heap.Push(currentStates, newPriorQueueItem(state)) + } tempStates = nil // Take any transitions corresponding to current character @@ -331,10 +341,11 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in var lastStatePtr *nfaState = nil // Pointer to the last-state, if it was found lastLookaroundInList := false // Whether or not a last state (that is a lookaround) was in our list of states for numStatesMatched == 0 && lastStateInList == false { - if len(currentStates) == 0 { + if currentStates.Len() == 0 { break } - state, _ := pop(&currentStates) + stateItem := heap.Pop(currentStates) + state := stateItem.(*priorQueueItem).state matches, numMatches := state.matchesFor(str, i) if numMatches > 0 { numStatesMatched++ @@ -344,6 +355,7 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in if m.threadGroups == nil { m.threadGroups = newMatch(numGroups + 1) } + m.threadSP = state.threadSP + 1 copy(m.threadGroups, state.threadGroups) } } @@ -382,7 +394,8 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in // a. A last-state // b. Empty // c. 
Doesn't assert anything - for _, s := range currentStates { + for _, stateItem := range *currentStates { + s := stateItem.state if s.isLast && s.isEmpty && s.assert == noneAssert { lastStatePtr = s lastStateInList = true @@ -403,7 +416,10 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in // Check if we can find a zero-length match if foundPath == false { - if ok := zeroMatchPossible(str, i, numGroups, currentStates...); ok { + currentStatesList := funcMap(*currentStates, func(item *priorQueueItem) *nfaState { + return item.state + }) + if ok := zeroMatchPossible(str, i, numGroups, currentStatesList...); ok { if tempIndices[0].IsValid() == false { tempIndices[0] = Group{startIdx, startIdx} } @@ -423,8 +439,11 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in } return false, []Group{}, startIdx } - currentStates = make([]*nfaState, len(tempStates)) - copy(currentStates, tempStates) + currentStates = &priorityQueue{} + slices.Reverse(tempStates) + for _, state := range tempStates { + heap.Push(currentStates, newPriorQueueItem(state)) + } tempStates = nil i++ @@ -432,21 +451,28 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in // End-of-string reached. Go to any 0-states, until there are no more 0-states to go to. Then check if any of our states are in the end position. // This is the exact same algorithm used inside the loop, so I should probably put it in a function. - zeroStates, isZero := takeZeroState(currentStates, numGroups, i) - tempStates = append(tempStates, zeroStates...) - num_appended := 0 // Number of unique states addded to tempStates - for isZero == true { - zeroStates, isZero = takeZeroState(tempStates, numGroups, i) - tempStates, num_appended = uniqueAppend(tempStates, zeroStates...) 
- if num_appended == 0 { // Break if we haven't appended any more unique values - break + if currentStates.Len() > 0 { + topStateItem := currentStates.peek() + topState := topStateItem.(*priorQueueItem).state + zeroStates, isZero := takeZeroState([]*nfaState{topState}, numGroups, i) + tempStates = append(tempStates, zeroStates...) + num_appended := 0 // Number of unique states addded to tempStates + for isZero == true { + zeroStates, isZero = takeZeroState(tempStates, numGroups, i) + tempStates, num_appended = uniqueAppend(tempStates, zeroStates...) + if num_appended == 0 { // Break if we haven't appended any more unique values + break + } } } - currentStates = append(currentStates, tempStates...) + for _, state := range tempStates { + heap.Push(currentStates, newPriorQueueItem(state)) + } tempStates = nil - for _, state := range currentStates { + for _, stateItem := range *currentStates { + state := stateItem.state // Only add the match if the start index is in bounds. If the state has an assertion, // make sure the assertion checks out. if state.isLast && i <= len(str) {