8 Commits

5 changed files with 159 additions and 26 deletions

View File

@@ -6,8 +6,8 @@ fmt:
vet: fmt vet: fmt
go vet ./... go vet ./...
buildLib: vet buildLib: vet
go build -gcflags="-N -l" ./... go build -gcflags="all=-N -l" ./...
buildCmd: buildLib buildCmd: buildLib
go build -C cmd/ -gcflags="-N -l" -o re ./... go build -C cmd/ -gcflags="all=-N -l" -o re ./...
test: buildCmd test: buildCmd
go test -v ./... go test -v ./...

View File

@@ -1,7 +1,9 @@
package regex package regex
import ( import (
"container/heap"
"fmt" "fmt"
"slices"
"sort" "sort"
) )
@@ -270,7 +272,8 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
foundPath := false foundPath := false
startIdx := offset startIdx := offset
endIdx := offset endIdx := offset
currentStates := make([]*nfaState, 0) currentStates := &priorityQueue{}
heap.Init(currentStates)
tempStates := make([]*nfaState, 0) // Used to store states that should be used in next loop iteration tempStates := make([]*nfaState, 0) // Used to store states that should be used in next loop iteration
i := offset // Index in string i := offset // Index in string
startingFrom := i // Store starting index startingFrom := i // Store starting index
@@ -300,16 +303,19 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
// tempIndices[start.groupNum].startIdx = i // tempIndices[start.groupNum].startIdx = i
} }
currentStates = append(currentStates, start) start.threadSP = i
heap.Push(currentStates, newPriorQueueItem(start))
// Main loop // Main loop
for i < len(str) { for currentStates.Len() > 0 {
currentState := heap.Pop(currentStates)
foundPath = false foundPath = false
zeroStates := make([]*nfaState, 0) zeroStates := make([]*nfaState, 0)
// Keep taking zero-states, until there are no more left to take // Keep taking zero-states, until there are no more left to take
// Objective: If any of our current states have transitions to 0-states, replace them with the 0-state. Do this until there are no more transitions to 0-states, or there are no more unique 0-states to take. // Objective: If any of our current states have transitions to 0-states, replace them with the 0-state. Do this until there are no more transitions to 0-states, or there are no more unique 0-states to take.
zeroStates, isZero := takeZeroState(currentStates, numGroups, i) topStateItem := currentStates.peek()
topState := topStateItem.(*priorQueueItem).state
zeroStates, isZero := takeZeroState([]*nfaState{topState}, numGroups, i)
tempStates = append(tempStates, zeroStates...) tempStates = append(tempStates, zeroStates...)
num_appended := 0 num_appended := 0
for isZero == true { for isZero == true {
@@ -319,8 +325,13 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
break break
} }
} }
if isZero == true {
currentStates.Pop()
}
currentStates, _ = uniqueAppend(currentStates, tempStates...) for _, state := range tempStates {
heap.Push(currentStates, newPriorQueueItem(state))
}
tempStates = nil tempStates = nil
// Take any transitions corresponding to current character // Take any transitions corresponding to current character
@@ -329,16 +340,22 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
lastStateInList := false // Whether or not a last state was in our list of states lastStateInList := false // Whether or not a last state was in our list of states
var lastStatePtr *nfaState = nil // Pointer to the last-state, if it was found var lastStatePtr *nfaState = nil // Pointer to the last-state, if it was found
lastLookaroundInList := false // Whether or not a last state (that is a lookaround) was in our list of states lastLookaroundInList := false // Whether or not a last state (that is a lookaround) was in our list of states
for _, state := range currentStates { for numStatesMatched == 0 && lastStateInList == false {
if currentStates.Len() == 0 {
break
}
stateItem := heap.Pop(currentStates)
state := stateItem.(*priorQueueItem).state
matches, numMatches := state.matchesFor(str, i) matches, numMatches := state.matchesFor(str, i)
if numMatches > 0 { if numMatches > 0 {
numStatesMatched++ numStatesMatched++
tempStates = append(tempStates, matches...) tempStates = append([]*nfaState(nil), matches...)
foundPath = true foundPath = true
for _, m := range matches { for _, m := range matches {
if m.threadGroups == nil { if m.threadGroups == nil {
m.threadGroups = newMatch(numGroups + 1) m.threadGroups = newMatch(numGroups + 1)
} }
m.threadSP = state.threadSP + 1
copy(m.threadGroups, state.threadGroups) copy(m.threadGroups, state.threadGroups)
} }
} }
@@ -377,23 +394,32 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
// a. A last-state // a. A last-state
// b. Empty // b. Empty
// c. Doesn't assert anything // c. Doesn't assert anything
for _, s := range currentStates { for _, stateItem := range *currentStates {
s := stateItem.state
if s.isLast && s.isEmpty && s.assert == noneAssert { if s.isLast && s.isEmpty && s.assert == noneAssert {
lastStatePtr = s lastStatePtr = s
lastStateInList = true lastStateInList = true
} }
} }
if lastStateInList { // A last-state was in the list of states. add the matchIndex to our MatchIndex list if lastStateInList && numStatesMatched == 0 { // A last-state was in the list of states. add the matchIndex to our MatchIndex list
for j := 1; j < numGroups+1; j++ { for j := 1; j < numGroups+1; j++ {
tempIndices[j] = lastStatePtr.threadGroups[j] tempIndices[j] = lastStatePtr.threadGroups[j]
} }
endIdx = i endIdx = i
tempIndices[0] = Group{startIdx, endIdx} tempIndices[0] = Group{startIdx, endIdx}
if tempIndices[0].StartIdx == tempIndices[0].EndIdx {
return true, tempIndices, tempIndices[0].EndIdx + 1
} else {
return true, tempIndices, tempIndices[0].EndIdx
}
} }
// Check if we can find a zero-length match // Check if we can find a zero-length match
if foundPath == false { if foundPath == false {
if ok := zeroMatchPossible(str, i, numGroups, currentStates...); ok { currentStatesList := funcMap(*currentStates, func(item *priorQueueItem) *nfaState {
return item.state
})
if ok := zeroMatchPossible(str, i, numGroups, currentStatesList...); ok {
if tempIndices[0].IsValid() == false { if tempIndices[0].IsValid() == false {
tempIndices[0] = Group{startIdx, startIdx} tempIndices[0] = Group{startIdx, startIdx}
} }
@@ -413,8 +439,11 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
} }
return false, []Group{}, startIdx return false, []Group{}, startIdx
} }
currentStates = make([]*nfaState, len(tempStates)) currentStates = &priorityQueue{}
copy(currentStates, tempStates) slices.Reverse(tempStates)
for _, state := range tempStates {
heap.Push(currentStates, newPriorQueueItem(state))
}
tempStates = nil tempStates = nil
i++ i++
@@ -422,21 +451,28 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
// End-of-string reached. Go to any 0-states, until there are no more 0-states to go to. Then check if any of our states are in the end position. // End-of-string reached. Go to any 0-states, until there are no more 0-states to go to. Then check if any of our states are in the end position.
// This is the exact same algorithm used inside the loop, so I should probably put it in a function. // This is the exact same algorithm used inside the loop, so I should probably put it in a function.
zeroStates, isZero := takeZeroState(currentStates, numGroups, i) if currentStates.Len() > 0 {
tempStates = append(tempStates, zeroStates...) topStateItem := currentStates.peek()
num_appended := 0 // Number of unique states addded to tempStates topState := topStateItem.(*priorQueueItem).state
for isZero == true { zeroStates, isZero := takeZeroState([]*nfaState{topState}, numGroups, i)
zeroStates, isZero = takeZeroState(tempStates, numGroups, i) tempStates = append(tempStates, zeroStates...)
tempStates, num_appended = uniqueAppend(tempStates, zeroStates...) num_appended := 0 // Number of unique states addded to tempStates
if num_appended == 0 { // Break if we haven't appended any more unique values for isZero == true {
break zeroStates, isZero = takeZeroState(tempStates, numGroups, i)
tempStates, num_appended = uniqueAppend(tempStates, zeroStates...)
if num_appended == 0 { // Break if we haven't appended any more unique values
break
}
} }
} }
currentStates = append(currentStates, tempStates...) for _, state := range tempStates {
heap.Push(currentStates, newPriorQueueItem(state))
}
tempStates = nil tempStates = nil
for _, state := range currentStates { for _, stateItem := range *currentStates {
state := stateItem.state
// Only add the match if the start index is in bounds. If the state has an assertion, // Only add the match if the start index is in bounds. If the state has an assertion,
// make sure the assertion checks out. // make sure the assertion checks out.
if state.isLast && i <= len(str) { if state.isLast && i <= len(str) {

View File

@@ -31,6 +31,8 @@ type nfaState struct {
output []*nfaState // The outputs of the current state ie. the 'outward arrows'. A union operator state will have more than one of these. output []*nfaState // The outputs of the current state ie. the 'outward arrows'. A union operator state will have more than one of these.
transitions map[int][]*nfaState // Transitions to different states (maps a character (int representation) to a _list of states. This is useful if one character can lead multiple states eg. ab|aa) transitions map[int][]*nfaState // Transitions to different states (maps a character (int representation) to a _list of states. This is useful if one character can lead multiple states eg. ab|aa)
isKleene bool // Identifies whether current node is a 0-state representing Kleene star isKleene bool // Identifies whether current node is a 0-state representing Kleene star
isQuestion bool // Identifies whether current node is a 0-state representing the question operator
isAlternation bool // Identifies whether current node is a 0-state representing an alternation
assert assertType // Type of assertion of current node - NONE means that the node doesn't assert anything assert assertType // Type of assertion of current node - NONE means that the node doesn't assert anything
allChars bool // Whether or not the state represents all characters (eg. a 'dot' metacharacter). A 'dot' node doesn't store any contents directly, as it would take up too much space allChars bool // Whether or not the state represents all characters (eg. a 'dot' metacharacter). A 'dot' node doesn't store any contents directly, as it would take up too much space
except []rune // Only valid if allChars is true - match all characters _except_ the ones in this block. Useful for inverting character classes. except []rune // Only valid if allChars is true - match all characters _except_ the ones in this block. Useful for inverting character classes.
@@ -43,6 +45,7 @@ type nfaState struct {
// The following properties depend on the current match - I should think about resetting them for every match. // The following properties depend on the current match - I should think about resetting them for every match.
zeroMatchFound bool // Whether or not the state has been used for a zero-length match - only relevant for zero states zeroMatchFound bool // Whether or not the state has been used for a zero-length match - only relevant for zero states
threadGroups []Group // Assuming that a state is part of a 'thread' in the matching process, this array stores the indices of capturing groups in the current thread. As matches are found for this state, its groups will be copied over. threadGroups []Group // Assuming that a state is part of a 'thread' in the matching process, this array stores the indices of capturing groups in the current thread. As matches are found for this state, its groups will be copied over.
threadSP int // The string pointer of the thread - where it is in the input string
} }
// Clones the NFA starting from the given state. // Clones the NFA starting from the given state.
@@ -70,6 +73,8 @@ func cloneStateHelper(stateToClone *nfaState, cloneMap map[*nfaState]*nfaState)
output: make([]*nfaState, len(stateToClone.output)), output: make([]*nfaState, len(stateToClone.output)),
transitions: make(map[int][]*nfaState), transitions: make(map[int][]*nfaState),
isKleene: stateToClone.isKleene, isKleene: stateToClone.isKleene,
isQuestion: stateToClone.isQuestion,
isAlternation: stateToClone.isAlternation,
assert: stateToClone.assert, assert: stateToClone.assert,
zeroMatchFound: stateToClone.zeroMatchFound, zeroMatchFound: stateToClone.zeroMatchFound,
allChars: stateToClone.allChars, allChars: stateToClone.allChars,
@@ -116,6 +121,7 @@ func resetThreadsHelper(state *nfaState, visitedMap map[*nfaState]bool) {
} }
// Assuming it hasn't been visited // Assuming it hasn't been visited
state.threadGroups = nil state.threadGroups = nil
state.threadSP = 0
visitedMap[state] = true visitedMap[state] = true
for _, v := range state.transitions { for _, v := range state.transitions {
for _, nextState := range v { for _, nextState := range v {
@@ -341,6 +347,7 @@ func alternate(s1 *nfaState, s2 *nfaState) *nfaState {
} }
toReturn.content = newContents(epsilon) toReturn.content = newContents(epsilon)
toReturn.isEmpty = true toReturn.isEmpty = true
toReturn.isAlternation = true
return toReturn return toReturn
} }
@@ -351,6 +358,7 @@ func question(s1 *nfaState) *nfaState { // Use the fact that ab? == a(b|)
s2.content = newContents(epsilon) s2.content = newContents(epsilon)
s2.output = append(s2.output, s2) s2.output = append(s2.output, s2)
s2.isEmpty = true s2.isEmpty = true
s2.isQuestion = true
s3 := alternate(s1, s2) s3 := alternate(s1, s2)
return s3 return s3
} }

89
regex/priorityQueue.go Normal file
View File

@@ -0,0 +1,89 @@
package regex
import "container/heap"
// Implement a priority queue using container/heap
// Scheduling priorities assigned to NFA states stored in the priority queue.
// The values are consecutive integers produced by iota, starting at 0;
// priorityQueue.Less ranks an item with a larger value ahead of one with a
// smaller value.
const (
	// min_priority is the first value of the range (0). It is not referenced
	// elsewhere in this file.
	min_priority int = iota

	// zerostate_priority is the value getPriority returns for Kleene states
	// and for any other empty state.
	zerostate_priority

	// alternation_priority is the value getPriority returns for alternation
	// states.
	alternation_priority

	// kleene_priority is declared but not returned by getPriority in this
	// file.
	kleene_priority

	// char_priority is the value getPriority returns for non-empty states.
	char_priority

	// max_priority is the last value of the range (5). It is not referenced
	// elsewhere in this file.
	max_priority
)
// getPriority maps an NFA state to the scheduling priority used for its
// priority-queue item. The checks are applied in order and the first match
// wins:
//
//   - isKleene       -> zerostate_priority
//   - isAlternation  -> alternation_priority
//   - isEmpty        -> zerostate_priority
//   - anything else  -> char_priority
//
// NOTE(review): Kleene states receive zerostate_priority even though a
// kleene_priority constant exists; confirm that this is intentional.
func getPriority(state *nfaState) int {
	switch {
	case state.isKleene:
		return zerostate_priority
	case state.isAlternation:
		return alternation_priority
	case state.isEmpty:
		return zerostate_priority
	default:
		return char_priority
	}
}
// priorQueueItem is one entry of a priorityQueue: an NFA state together with
// the bookkeeping the queue needs to order and locate it.
type priorQueueItem struct {
	// state is the NFA state carried by this entry.
	state *nfaState

	// priority is the ordering key; priorityQueue.Less ranks larger values
	// first.
	priority int

	// index is the entry's position in the queue's slice. It is maintained by
	// the queue's Push, Swap and Pop methods and is -1 while the entry is not
	// in a queue.
	index int
}
// newPriorQueueItem wraps state in a queue item. The item's priority is
// computed with getPriority and its index starts at -1, marking it as not yet
// placed in a queue.
func newPriorQueueItem(state *nfaState) *priorQueueItem {
	item := new(priorQueueItem)
	item.state = state
	item.index = -1
	item.priority = getPriority(state)
	return item
}
// priorityQueue is a slice of queue items. Its Len, Less, Swap, Push and Pop
// methods provide the method set that container/heap operates on.
type priorityQueue []*priorQueueItem
// Len reports how many items are currently stored in the queue.
func (pq priorityQueue) Len() int {
	return len(pq)
}
// Less reports whether the item at position i should be ordered before the
// item at position j. The item with the larger priority comes first (the
// comparison is inverted so that the queue behaves as a max-heap); items with
// equal priority are ordered by their stored index, smaller first.
//
// NOTE(review): index is rewritten by Swap, so the tie-break reflects the
// items' current slice positions rather than the order in which they were
// pushed.
func (pq priorityQueue) Less(i, j int) bool {
	left, right := pq[i], pq[j]
	if left.priority != right.priority {
		return left.priority > right.priority
	}
	return left.index < right.index
}
// Swap exchanges the items at positions i and j and rewrites each item's
// index field so that it matches the item's new position.
func (pq priorityQueue) Swap(i, j int) {
	first, second := pq[i], pq[j]
	pq[i], pq[j] = second, first
	second.index = i
	first.index = j
}
// Push appends x, which must be a *priorQueueItem, to the end of the queue's
// slice and records that position in the item's index field. It does not
// restore heap order by itself; it is the raw append that container/heap's
// Push builds on.
func (pq *priorityQueue) Push(x any) {
	slot := len(*pq)
	entry := x.(*priorQueueItem)
	entry.index = slot
	*pq = append(*pq, entry)
}
// Pop removes and returns the last item of the queue's slice. The vacated
// slot is set to nil so the backing array no longer references the item, and
// the item's index is reset to -1 to mark it as being outside the queue.
//
// This is the raw removal that container/heap's Pop builds on; called
// directly it takes whatever item is stored last.
func (pq *priorityQueue) Pop() any {
	items := *pq
	last := len(items) - 1
	popped := items[last]
	items[last] = nil
	popped.index = -1
	*pq = items[:last]
	return popped
}
// peek returns, without removing it, the item that heap.Pop would remove
// next: the highest-ranked item according to Less. container/heap keeps that
// item at position 0 of the slice, so that is the element returned. The
// dynamic type of the result is *priorQueueItem.
//
// peek returns nil when the queue is empty.
//
// To remove the peeked item, callers must use heap.Pop; the raw Pop method
// removes the last slice element, which is generally a different item.
func (pq *priorityQueue) peek() any {
	queue := *pq
	if len(queue) == 0 {
		return nil
	}
	return queue[0]
}
// update replaces the state and priority stored in item and then calls
// heap.Fix at item.index so the queue is re-ordered for the new priority.
// item is expected to be in the queue already, with index holding its current
// position.
func (pq *priorityQueue) update(item *priorQueueItem, value *nfaState, priority int) {
	item.state, item.priority = value, priority
	heap.Fix(pq, item.index)
}

View File

@@ -701,7 +701,7 @@ func TestFind(t *testing.T) {
if len(test.result) == 0 { if len(test.result) == 0 {
return // Manually pass the test, because this is the expected behavior return // Manually pass the test, because this is the expected behavior
} else { } else {
t.Errorf("Wanted no match Got %v\n", groupIndex) t.Errorf("Wanted %v Got no matches\n", test.result)
} }
} else { } else {
if groupIndex != test.result[0] { if groupIndex != test.result[0] {