Trying to emulate backtracking by using string pointers within threads (something similar to rsc's 2nd regexp article)

remotes/origin/implementPCREMatchingRules
Aadhavan Srinivasan 1 month ago
parent e167cdb2cb
commit ad273b0c68

@ -1,6 +1,7 @@
package regex package regex
import ( import (
"container/heap"
"fmt" "fmt"
"slices" "slices"
"sort" "sort"
@ -271,7 +272,8 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
foundPath := false foundPath := false
startIdx := offset startIdx := offset
endIdx := offset endIdx := offset
currentStates := make([]*nfaState, 0) currentStates := &priorityQueue{}
heap.Init(currentStates)
tempStates := make([]*nfaState, 0) // Used to store states that should be used in next loop iteration tempStates := make([]*nfaState, 0) // Used to store states that should be used in next loop iteration
i := offset // Index in string i := offset // Index in string
startingFrom := i // Store starting index startingFrom := i // Store starting index
@ -301,16 +303,19 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
// tempIndices[start.groupNum].startIdx = i // tempIndices[start.groupNum].startIdx = i
} }
currentStates = append(currentStates, start) start.threadSP = i
heap.Push(currentStates, newPriorQueueItem(start))
// Main loop // Main loop
for i < len(str) { for currentStates.Len() > 0 {
currentState := heap.Pop(currentStates)
foundPath = false foundPath = false
zeroStates := make([]*nfaState, 0) zeroStates := make([]*nfaState, 0)
// Keep taking zero-states, until there are no more left to take // Keep taking zero-states, until there are no more left to take
// Objective: If any of our current states have transitions to 0-states, replace them with the 0-state. Do this until there are no more transitions to 0-states, or there are no more unique 0-states to take. // Objective: If any of our current states have transitions to 0-states, replace them with the 0-state. Do this until there are no more transitions to 0-states, or there are no more unique 0-states to take.
zeroStates, isZero := takeZeroState(currentStates, numGroups, i) topStateItem := currentStates.peek()
topState := topStateItem.(*priorQueueItem).state
zeroStates, isZero := takeZeroState([]*nfaState{topState}, numGroups, i)
tempStates = append(tempStates, zeroStates...) tempStates = append(tempStates, zeroStates...)
num_appended := 0 num_appended := 0
for isZero == true { for isZero == true {
@ -320,8 +325,13 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
break break
} }
} }
if isZero == true {
currentStates.Pop()
}
currentStates = slices.Concat(currentStates, tempStates) for _, state := range tempStates {
heap.Push(currentStates, newPriorQueueItem(state))
}
tempStates = nil tempStates = nil
// Take any transitions corresponding to current character // Take any transitions corresponding to current character
@ -331,10 +341,11 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
var lastStatePtr *nfaState = nil // Pointer to the last-state, if it was found var lastStatePtr *nfaState = nil // Pointer to the last-state, if it was found
lastLookaroundInList := false // Whether or not a last state (that is a lookaround) was in our list of states lastLookaroundInList := false // Whether or not a last state (that is a lookaround) was in our list of states
for numStatesMatched == 0 && lastStateInList == false { for numStatesMatched == 0 && lastStateInList == false {
if len(currentStates) == 0 { if currentStates.Len() == 0 {
break break
} }
state, _ := pop(&currentStates) stateItem := heap.Pop(currentStates)
state := stateItem.(*priorQueueItem).state
matches, numMatches := state.matchesFor(str, i) matches, numMatches := state.matchesFor(str, i)
if numMatches > 0 { if numMatches > 0 {
numStatesMatched++ numStatesMatched++
@ -344,6 +355,7 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
if m.threadGroups == nil { if m.threadGroups == nil {
m.threadGroups = newMatch(numGroups + 1) m.threadGroups = newMatch(numGroups + 1)
} }
m.threadSP = state.threadSP + 1
copy(m.threadGroups, state.threadGroups) copy(m.threadGroups, state.threadGroups)
} }
} }
@ -382,7 +394,8 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
// a. A last-state // a. A last-state
// b. Empty // b. Empty
// c. Doesn't assert anything // c. Doesn't assert anything
for _, s := range currentStates { for _, stateItem := range *currentStates {
s := stateItem.state
if s.isLast && s.isEmpty && s.assert == noneAssert { if s.isLast && s.isEmpty && s.assert == noneAssert {
lastStatePtr = s lastStatePtr = s
lastStateInList = true lastStateInList = true
@ -403,7 +416,10 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
// Check if we can find a zero-length match // Check if we can find a zero-length match
if foundPath == false { if foundPath == false {
if ok := zeroMatchPossible(str, i, numGroups, currentStates...); ok { currentStatesList := funcMap(*currentStates, func(item *priorQueueItem) *nfaState {
return item.state
})
if ok := zeroMatchPossible(str, i, numGroups, currentStatesList...); ok {
if tempIndices[0].IsValid() == false { if tempIndices[0].IsValid() == false {
tempIndices[0] = Group{startIdx, startIdx} tempIndices[0] = Group{startIdx, startIdx}
} }
@ -423,8 +439,11 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
} }
return false, []Group{}, startIdx return false, []Group{}, startIdx
} }
currentStates = make([]*nfaState, len(tempStates)) currentStates = &priorityQueue{}
copy(currentStates, tempStates) slices.Reverse(tempStates)
for _, state := range tempStates {
heap.Push(currentStates, newPriorQueueItem(state))
}
tempStates = nil tempStates = nil
i++ i++
@ -432,21 +451,28 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
// End-of-string reached. Go to any 0-states, until there are no more 0-states to go to. Then check if any of our states are in the end position. // End-of-string reached. Go to any 0-states, until there are no more 0-states to go to. Then check if any of our states are in the end position.
// This is the exact same algorithm used inside the loop, so I should probably put it in a function. // This is the exact same algorithm used inside the loop, so I should probably put it in a function.
zeroStates, isZero := takeZeroState(currentStates, numGroups, i) if currentStates.Len() > 0 {
tempStates = append(tempStates, zeroStates...) topStateItem := currentStates.peek()
num_appended := 0 // Number of unique states added to tempStates topState := topStateItem.(*priorQueueItem).state
for isZero == true { zeroStates, isZero := takeZeroState([]*nfaState{topState}, numGroups, i)
zeroStates, isZero = takeZeroState(tempStates, numGroups, i) tempStates = append(tempStates, zeroStates...)
tempStates, num_appended = uniqueAppend(tempStates, zeroStates...) num_appended := 0 // Number of unique states added to tempStates
if num_appended == 0 { // Break if we haven't appended any more unique values for isZero == true {
break zeroStates, isZero = takeZeroState(tempStates, numGroups, i)
tempStates, num_appended = uniqueAppend(tempStates, zeroStates...)
if num_appended == 0 { // Break if we haven't appended any more unique values
break
}
} }
} }
currentStates = append(currentStates, tempStates...) for _, state := range tempStates {
heap.Push(currentStates, newPriorQueueItem(state))
}
tempStates = nil tempStates = nil
for _, state := range currentStates { for _, stateItem := range *currentStates {
state := stateItem.state
// Only add the match if the start index is in bounds. If the state has an assertion, // Only add the match if the start index is in bounds. If the state has an assertion,
// make sure the assertion checks out. // make sure the assertion checks out.
if state.isLast && i <= len(str) { if state.isLast && i <= len(str) {

Loading…
Cancel
Save