7 Commits

3 changed files with 176 additions and 124 deletions

View File

@@ -987,7 +987,8 @@ func thompson(re []postfixNode) (Reg, error) {
if c.nodetype == charclassNode { // A Character class consists of all the nodes in it, alternated if c.nodetype == charclassNode { // A Character class consists of all the nodes in it, alternated
// Map the list of nodes to a list of states, each state containing the contents of a specific node // Map the list of nodes to a list of states, each state containing the contents of a specific node
states := funcMap(c.nodeContents, func(node postfixNode) *nfaState { states := funcMap(c.nodeContents, func(node postfixNode) *nfaState {
s := newState() s := &nfaState{}
s.output = append(s.output, s)
nodeContents := node.contents nodeContents := node.contents
if caseInsensitive { if caseInsensitive {
nodeContents = slices.Concat(funcMap(nodeContents, func(r rune) []rune { nodeContents = slices.Concat(funcMap(nodeContents, func(r rune) []rune {
@@ -1001,7 +1002,7 @@ func thompson(re []postfixNode) (Reg, error) {
return n.contents return n.contents
})...) })...)
} }
return &s return s
}) })
// Reduce the list of states down to a single state by alternating them // Reduce the list of states down to a single state by alternating them
toAdd := funcReduce(states, func(s1 *nfaState, s2 *nfaState) *nfaState { toAdd := funcReduce(states, func(s1 *nfaState, s2 *nfaState) *nfaState {
@@ -1046,7 +1047,10 @@ func thompson(re []postfixNode) (Reg, error) {
if err != nil { if err != nil {
return Reg{}, fmt.Errorf("error applying question operator") return Reg{}, fmt.Errorf("error applying question operator")
} }
s2 := question(s1) s2, err := question(s1)
if err != nil {
return Reg{}, err
}
nfa = append(nfa, s2) nfa = append(nfa, s2)
case pipeNode: case pipeNode:
// A pipe operator doesn't actually need either operand to be present. If an operand isn't present, // A pipe operator doesn't actually need either operand to be present. If an operand isn't present,
@@ -1105,7 +1109,11 @@ func thompson(re []postfixNode) (Reg, error) {
stateToAdd = concatenate(stateToAdd, s2) stateToAdd = concatenate(stateToAdd, s2)
} else { // Case 2 } else { // Case 2
for i := c.startReps; i < c.endReps; i++ { for i := c.startReps; i < c.endReps; i++ {
stateToAdd = concatenate(stateToAdd, question(cloneState(poppedState))) tmp, err := question(cloneState(poppedState))
if err != nil {
return Reg{}, fmt.Errorf("error processing bounded repetition")
}
stateToAdd = concatenate(stateToAdd, tmp)
} }
} }
nfa = append(nfa, stateToAdd) nfa = append(nfa, stateToAdd)

View File

@@ -2,7 +2,6 @@ package regex
import ( import (
"fmt" "fmt"
"slices"
"sort" "sort"
) )
@@ -152,7 +151,6 @@ func pruneIndices(indices []Match) []Match {
} }
func copyThread(to *nfaState, from nfaState) { func copyThread(to *nfaState, from nfaState) {
to.threadSP = from.threadSP
to.threadGroups = append([]Group{}, from.threadGroups...) to.threadGroups = append([]Group{}, from.threadGroups...)
} }
@@ -253,6 +251,43 @@ func (regex Reg) FindAllSubmatch(str string) []Match {
return indices return indices
} }
// addStateToList adds state, together with the states reached from it by
// following split, assertion and group-boundary states (none of these
// branches advances idx), to list and returns the extended list.
//
// str is the input being matched and idx the position in it at which state is
// being entered. threadGroups holds the capture-group indices of the thread
// that reached state. state is received by value, so assignments to its
// fields here only affect the local copy (and whatever copy is appended to
// list); copyThread, by contrast, writes through the next/splitState pointers.
//
// A state for which stateExists(list, state) is true is not added again.
func addStateToList(str []rune, idx int, list []nfaState, state nfaState, threadGroups []Group) []nfaState {
	if stateExists(list, state) {
		return list
	}

	// Kleene / question: follow splitState first, then next.
	if state.isKleene || state.isQuestion {
		copyThread(state.splitState, state)
		list = addStateToList(str, idx, list, *state.splitState, threadGroups)
		copyThread(state.next, state)
		return addStateToList(str, idx, list, *state.next, threadGroups)
	}

	// Plain alternation: follow next first, then splitState.
	if state.isAlternation {
		copyThread(state.next, state)
		list = addStateToList(str, idx, list, *state.next, threadGroups)
		copyThread(state.splitState, state)
		return addStateToList(str, idx, list, *state.splitState, threadGroups)
	}

	// Give this copy of the state its own snapshot of the thread's groups, so
	// the group-index writes below do not touch the caller's slice.
	state.threadGroups = append([]Group{}, threadGroups...)

	// The recursive calls below already return list plus any new states, so
	// their result is returned as-is. Appending it onto list again would
	// duplicate every entry that was already present.
	if state.assert != noneAssert {
		if state.checkAssertion(str, idx) {
			copyThread(state.next, state)
			return addStateToList(str, idx, list, *state.next, state.threadGroups)
		}
		// An assertion that does not hold falls through to the checks below.
	}
	if state.groupBegin {
		state.threadGroups[state.groupNum].StartIdx = idx
		return addStateToList(str, idx, list, *state.next, state.threadGroups)
	}
	if state.groupEnd {
		state.threadGroups[state.groupNum].EndIdx = idx
		return addStateToList(str, idx, list, *state.next, state.threadGroups)
	}

	// None of the branches above returned: record the state itself.
	return append(list, state)
}
// Helper for FindAllMatches. Returns whether it found a match, the // Helper for FindAllMatches. Returns whether it found a match, the
// first Match it finds, and how far it got into the string ie. where // first Match it finds, and how far it got into the string ie. where
// the next search should start from. // the next search should start from.
@@ -307,134 +342,113 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
// tempIndices[start.groupNum].startIdx = i // tempIndices[start.groupNum].startIdx = i
//} //}
start.threadSP = i start.threadGroups = newMatch(numGroups + 1)
currentStates = append(currentStates, *start) start.threadGroups[0].StartIdx = i
var foundMatch bool currentStates = addStateToList(str, i, currentStates, *start, start.threadGroups)
var isEmptyAndNoAssertion bool var match Match = nil
// var isEmptyAndNoAssertion bool
// Main loop // Main loop
for idx := i; idx <= len(str); idx++ { for idx := i; idx <= len(str); idx++ {
if len(currentStates) == 0 {
break
}
for currentStateIdx := 0; currentStateIdx < len(currentStates); currentStateIdx++ { for currentStateIdx := 0; currentStateIdx < len(currentStates); currentStateIdx++ {
currentState := currentStates[currentStateIdx] currentState := currentStates[currentStateIdx]
foundMatch = false
isEmptyAndNoAssertion = false
if currentState.threadGroups == nil { if currentState.threadGroups == nil {
currentState.threadGroups = newMatch(numGroups + 1) currentState.threadGroups = newMatch(numGroups + 1)
currentState.threadGroups[0].StartIdx = idx currentState.threadGroups[0].StartIdx = idx
} }
if currentState.groupBegin { if currentState.isLast {
currentState.threadGroups[currentState.groupNum].StartIdx = idx currentState.threadGroups[0].EndIdx = idx
// allMatches := make([]nfaState, 0) match = append([]Group{}, currentState.threadGroups...)
// for _, v := range currentState.transitions { break
// dereferenced := funcMap(v, func(s *nfaState) nfaState { } else if !currentState.isAlternation && !currentState.isKleene && !currentState.isQuestion && !currentState.groupBegin && !currentState.groupEnd { // Normal character or assertion
// return *s if currentState.contentContains(str, idx) {
// }) nextStates = addStateToList(str, idx+1, nextStates, *currentState.next, currentState.threadGroups)
// allMatches = append(allMatches, dereferenced...) }
// }
// slices.Reverse(allMatches)
// for i := range allMatches {
// copyThread(&allMatches[i], currentState)
// }
// currentStates = append(currentStates, allMatches...)
}
if currentState.groupEnd {
currentState.threadGroups[currentState.groupNum].EndIdx = idx
// allMatches := make([]nfaState, 0)
// for _, v := range currentState.transitions {
// dereferenced := funcMap(v, func(s *nfaState) nfaState {
// return *s
// })
// allMatches = append(allMatches, dereferenced...)
// }
// slices.Reverse(allMatches)
// for i := range allMatches {
// copyThread(&allMatches[i], currentState)
// }
// currentStates = append(currentStates, allMatches...)
} }
// if currentState.isKleene { // if currentState.groupBegin {
// // Append the next-state (after the kleene), then append the kleene state // currentState.threadGroups[currentState.groupNum].StartIdx = idx
// allMatches := make([]*nfaState, 0)
// for _, v := range currentState.transitions {
// allMatches = append(allMatches, v...)
// } // }
// slices.Reverse(allMatches) // if currentState.groupEnd {
// for _, m := range allMatches { // currentState.threadGroups[currentState.groupNum].EndIdx = idx
// m.threadGroups = currentState.threadGroups
// m.threadSP = idx
// } // }
// currentStates = append(currentStates, allMatches...)
//
// // kleeneState := currentState.kleeneState
// // kleeneState.threadGroups = currentState.threadGroups
// // kleeneState.threadSP = currentState.threadSP
// // currentStates = append(currentStates, kleeneState)
// continue
// }
// Alternation - enqueue left then right state, and continue // Alternation - enqueue left then right state, and continue
if currentState.isAlternation { // if currentState.isAlternation {
if currentState.isKleene { // Reverse order of adding things // if currentState.isKleene { // Reverse order of adding things
rightState := currentState.splitState // rightState := currentState.splitState
copyThread(rightState, currentState) // copyThread(rightState, currentState)
currentStates = slices.Insert(currentStates, currentStateIdx+1, *rightState) // currentStates = slices.Insert(currentStates, currentStateIdx+1, *rightState)
leftState := currentState.next // leftState := currentState.next
copyThread(leftState, currentState) // copyThread(leftState, currentState)
currentStates = slices.Insert(currentStates, currentStateIdx+2, *leftState) // currentStates = slices.Insert(currentStates, currentStateIdx+2, *leftState)
} else { // } else {
leftState := currentState.next // leftState := currentState.next
copyThread(leftState, currentState) // copyThread(leftState, currentState)
currentStates = slices.Insert(currentStates, currentStateIdx+1, *leftState) // currentStates = slices.Insert(currentStates, currentStateIdx+1, *leftState)
rightState := currentState.splitState // rightState := currentState.splitState
copyThread(rightState, currentState) // copyThread(rightState, currentState)
currentStates = slices.Insert(currentStates, currentStateIdx+2, *rightState) // currentStates = slices.Insert(currentStates, currentStateIdx+2, *rightState)
} // }
continue // continue
} // }
// Empty state - enqueue next state, do _not_ increment the SP // Empty state - enqueue next state, do _not_ increment the SP
if !currentState.isAlternation && currentState.isEmpty && currentState.assert == noneAssert { //&& currentState.groupBegin == false && currentState.groupEnd == false { // if !currentState.isAlternation && currentState.isEmpty && currentState.assert == noneAssert { //&& currentState.groupBegin == false && currentState.groupEnd == false {
isEmptyAndNoAssertion = true // isEmptyAndNoAssertion = true
} // }
//
if currentState.contentContains(str, idx) { // if currentState.contentContains(str, idx) {
foundMatch = true // foundMatch = true
} // }
//
if isEmptyAndNoAssertion || foundMatch { // if isEmptyAndNoAssertion || foundMatch {
allMatches := make([]nfaState, 0) // nextMatch := *(currentState.next)
allMatches = append(allMatches, *(currentState.next)) // copyThread(&nextMatch, currentState)
slices.Reverse(allMatches) // if currentState.groupBegin {
for i := range allMatches { // // if !stateExists(currentStates, nextMatch) {
copyThread(&allMatches[i], currentState) // currentStates = slices.Insert(currentStates, currentStateIdx+1, nextMatch)
if foundMatch && currentState.assert == noneAssert { // //}
allMatches[i].threadSP += 1 // } else if currentState.groupEnd {
} // if !stateExists(currentStates, nextMatch) {
} // currentStates = slices.Insert(currentStates, currentStateIdx+1, nextMatch) // append(currentStates, nextMatch)
if currentState.groupBegin { // }
currentStates = slices.Insert(currentStates, currentStateIdx+1, allMatches...) // } else if currentState.assert != noneAssert {
} else if currentState.groupEnd { // if !stateExists(currentStates, nextMatch) {
currentStates = append(currentStates, allMatches...) // currentStates = append(currentStates, nextMatch)
} else if currentState.assert != noneAssert { // }
currentStates = append(currentStates, allMatches...) // } else if currentState.isEmpty && !currentState.groupBegin && !currentState.groupEnd {
} else { // if !stateExists(currentStates, nextMatch) {
nextStates = append(nextStates, allMatches...) // currentStates = append(currentStates, nextMatch)
} // }
} // } else {
// if !stateExists(nextStates, nextMatch) {
if currentState.isLast && len(nextStates) == 0 { // Last state reached // nextStates = append(nextStates, nextMatch)
currentState.threadGroups[0].EndIdx = idx // }
if idx == currentState.threadGroups[0].StartIdx { // }
idx += 1 // }
} //
return true, currentState.threadGroups, idx // if currentState.isLast && len(nextStates) == 0 { // Last state reached
} // currentState.threadGroups[0].EndIdx = idx
// if idx == currentState.threadGroups[0].StartIdx {
// idx += 1
// }
// return true, currentState.threadGroups, idx
// }
} }
currentStates = append([]nfaState{}, nextStates...) currentStates = append([]nfaState{}, nextStates...)
nextStates = nil nextStates = nil
} }
if match != nil {
if offset == match[0].EndIdx {
return true, match, match[0].EndIdx + 1
}
return true, match, match[0].EndIdx
}
return false, []Group{}, i + 1 return false, []Group{}, i + 1
// zeroStates := make([]*nfaState, 0) // zeroStates := make([]*nfaState, 0)
// // Keep taking zero-states, until there are no more left to take // // Keep taking zero-states, until there are no more left to take

View File

@@ -47,7 +47,6 @@ type nfaState struct {
// The following properties depend on the current match - I should think about resetting them for every match. // The following properties depend on the current match - I should think about resetting them for every match.
zeroMatchFound bool // Whether or not the state has been used for a zero-length match - only relevant for zero states zeroMatchFound bool // Whether or not the state has been used for a zero-length match - only relevant for zero states
threadGroups []Group // Assuming that a state is part of a 'thread' in the matching process, this array stores the indices of capturing groups in the current thread. As matches are found for this state, its groups will be copied over. threadGroups []Group // Assuming that a state is part of a 'thread' in the matching process, this array stores the indices of capturing groups in the current thread. As matches are found for this state, its groups will be copied over.
threadSP int // The string pointer of the thread - where it is in the input string
} }
// Clones the NFA starting from the given state. // Clones the NFA starting from the given state.
@@ -123,7 +122,6 @@ func resetThreadsHelper(state *nfaState, visitedMap map[*nfaState]bool) {
} }
// Assuming it hasn't been visited // Assuming it hasn't been visited
state.threadGroups = nil state.threadGroups = nil
state.threadSP = 0
visitedMap[state] = true visitedMap[state] = true
if state.isAlternation { if state.isAlternation {
resetThreadsHelper(state.next, visitedMap) resetThreadsHelper(state.next, visitedMap)
@@ -331,9 +329,6 @@ func kleene(s1 *nfaState) (*nfaState, error) {
toReturn.isAlternation = true toReturn.isAlternation = true
toReturn.content = newContents(epsilon) toReturn.content = newContents(epsilon)
toReturn.splitState = s1 toReturn.splitState = s1
for i := range s1.output {
s1.output[i].next = toReturn
}
// toReturn := &nfaState{} // toReturn := &nfaState{}
// toReturn.transitions = make(map[int][]*nfaState) // toReturn.transitions = make(map[int][]*nfaState)
@@ -375,14 +370,20 @@ func alternate(s1 *nfaState, s2 *nfaState) *nfaState {
return toReturn return toReturn
} }
func question(s1 *nfaState) *nfaState { // Use the fact that ab? == a(b|) func question(s1 *nfaState) (*nfaState, error) { // Use the fact that ab? == a(b|)
s2 := &nfaState{} if s1.isEmpty && s1.assert != noneAssert {
return nil, fmt.Errorf("previous token is not quantifiable")
}
toReturn := &nfaState{}
toReturn.isEmpty = true
toReturn.isAlternation = true
toReturn.isQuestion = true
toReturn.content = newContents(epsilon)
toReturn.splitState = s1
toReturn.output = append([]*nfaState{}, toReturn)
toReturn.output = append(toReturn.output, s1.output...)
// s2.transitions = make(map[int][]*nfaState) // s2.transitions = make(map[int][]*nfaState)
s2.content = newContents(epsilon) return toReturn, nil
s2.output = append(s2.output, s2)
s2.isEmpty = true
s3 := alternate(s1, s2)
return s3
} }
// Creates and returns a new state with the 'default' values. // Creates and returns a new state with the 'default' values.
@@ -408,3 +409,32 @@ func zeroLengthMatchState() nfaState {
start.assert = alwaysTrueAssert start.assert = alwaysTrueAssert
return start return start
} }
// equals reports whether s and other agree on every field compared below.
// Slice fields (content, output, except, threadGroups) are compared
// element-wise with slices.Equal; the remaining fields are compared with ==,
// which for the next and splitState pointers means identity, not deep equality.
// The comparison stops at the first field that differs.
func (s nfaState) equals(other nfaState) bool {
	switch {
	case !slices.Equal(s.content, other.content):
		return false
	case s.isEmpty != other.isEmpty, s.isLast != other.isLast:
		return false
	case !slices.Equal(s.output, other.output):
		return false
	case s.next != other.next:
		return false
	case s.isKleene != other.isKleene,
		s.isQuestion != other.isQuestion,
		s.isAlternation != other.isAlternation:
		return false
	case s.splitState != other.splitState:
		return false
	case s.assert != other.assert, s.allChars != other.allChars:
		return false
	case !slices.Equal(s.except, other.except):
		return false
	case s.lookaroundNFA != other.lookaroundNFA:
		return false
	case s.groupBegin != other.groupBegin,
		s.groupEnd != other.groupEnd,
		s.groupNum != other.groupNum:
		return false
	}
	// Everything else matched; the per-thread capture groups decide.
	return slices.Equal(s.threadGroups, other.threadGroups)
}
// stateExists reports whether list contains an element for which
// equals(s) is true. An empty or nil list yields false.
func stateExists(list []nfaState, s nfaState) bool {
	for _, candidate := range list {
		if !candidate.equals(s) {
			continue
		}
		return true
	}
	return false
}