Got rid of transitions parameter, changed how kleene state is processed

I replaced the 'transitions' map field of nfaState with a single
nfaState pointer. This is because any non-alternation state will
only have one next state, so the map was just added complexity.

I changed alternation processing - instead of having their own dedicated
left/right fields, alternation states now use the new 'next' field for
one branch, and another field called 'splitState' for the other.

I also changed the kleene state processing to remove the unnecessary
empty state in the right-side alternation (it actually messed up my
matching).
implementPCREMatchingRules
Aadhavan Srinivasan 1 month ago
parent 858e535fba
commit cca8c7cda2

@ -822,7 +822,6 @@ func thompson(re []postfixNode) (Reg, error) {
for _, c := range re { for _, c := range re {
if c.nodetype == characterNode || c.nodetype == assertionNode { if c.nodetype == characterNode || c.nodetype == assertionNode {
stateToAdd := nfaState{} stateToAdd := nfaState{}
stateToAdd.transitions = make(map[int][]*nfaState)
if c.allChars { if c.allChars {
stateToAdd.allChars = true stateToAdd.allChars = true
if len(c.except) != 0 { if len(c.except) != 0 {
@ -934,7 +933,6 @@ func thompson(re []postfixNode) (Reg, error) {
s.isEmpty = true s.isEmpty = true
s.output = make([]*nfaState, 0) s.output = make([]*nfaState, 0)
s.output = append(s.output, s) s.output = append(s.output, s)
s.transitions = make(map[int][]*nfaState)
// LPAREN nodes are just added normally // LPAREN nodes are just added normally
if c.nodetype == lparenNode { if c.nodetype == lparenNode {
numGroups++ numGroups++
@ -966,7 +964,7 @@ func thompson(re []postfixNode) (Reg, error) {
s.groupNum = lparenNode.groupNum s.groupNum = lparenNode.groupNum
to_add := concatenate(lparenNode, s) to_add := concatenate(lparenNode, s)
nfa = append(nfa, to_add) nfa = append(nfa, to_add)
} else if middleNode.groupBegin && len(middleNode.transitions) == 0 { // The middle node is a lone lparen - something like '(())', and I'm looking at the first rparen } else if middleNode.groupBegin && middleNode.numTransitions() == 0 { // The middle node is a lone lparen - something like '(())', and I'm looking at the first rparen
nfa = append(nfa, lparenNode) // I shouldn't have popped this out, because it is not involved in the current capturing group nfa = append(nfa, lparenNode) // I shouldn't have popped this out, because it is not involved in the current capturing group
s.groupNum = middleNode.groupNum // In this case, the 'middle' node is actually an lparen s.groupNum = middleNode.groupNum // In this case, the 'middle' node is actually an lparen
to_add := concatenate(middleNode, s) to_add := concatenate(middleNode, s)
@ -1030,14 +1028,14 @@ func thompson(re []postfixNode) (Reg, error) {
if err != nil { if err != nil {
return Reg{}, fmt.Errorf("error applying kleene star") return Reg{}, fmt.Errorf("error applying kleene star")
} }
stateToAdd, err := kleene(*s1) stateToAdd, err := kleene(s1)
if err != nil { if err != nil {
return Reg{}, err return Reg{}, err
} }
nfa = append(nfa, stateToAdd) nfa = append(nfa, stateToAdd)
case plusNode: // a+ is equivalent to aa* case plusNode: // a+ is equivalent to aa*
s1 := mustPop(&nfa) s1 := mustPop(&nfa)
s2, err := kleene(*s1) s2, err := kleene(s1)
if err != nil { if err != nil {
return Reg{}, err return Reg{}, err
} }
@ -1061,14 +1059,14 @@ func thompson(re []postfixNode) (Reg, error) {
// '^|a' // '^|a'
s2, err1 := pop(&nfa) s2, err1 := pop(&nfa)
s1, err2 := pop(&nfa) s1, err2 := pop(&nfa)
if err2 != nil || (s2.groupBegin && len(s2.transitions) == 0) { // Doesn't exist, or its just an LPAREN if err2 != nil || (s2.groupBegin && s2.numTransitions() == 0) { // Doesn't exist, or its just an LPAREN
if err2 == nil { // Roundabout way of saying that this node existed, but it was an LPAREN, so we append it back if err2 == nil { // Roundabout way of saying that this node existed, but it was an LPAREN, so we append it back
nfa = append(nfa, s2) nfa = append(nfa, s2)
} }
tmp := zeroLengthMatchState() tmp := zeroLengthMatchState()
s2 = &tmp s2 = &tmp
} }
if err1 != nil || (s1.groupBegin && len(s1.transitions) == 0) { // Doesn't exist, or its just an LPAREN if err1 != nil || (s1.groupBegin && s1.numTransitions() == 0) { // Doesn't exist, or its just an LPAREN
if err1 == nil { // See above for explanation if err1 == nil { // See above for explanation
nfa = append(nfa, s1) nfa = append(nfa, s1)
} }
@ -1100,7 +1098,7 @@ func thompson(re []postfixNode) (Reg, error) {
stateToAdd = concatenate(stateToAdd, cloneState(poppedState)) stateToAdd = concatenate(stateToAdd, cloneState(poppedState))
} }
if c.endReps == infinite_reps { // Case 3 if c.endReps == infinite_reps { // Case 3
s2, err := kleene(*poppedState) s2, err := kleene(poppedState)
if err != nil { if err != nil {
return Reg{}, err return Reg{}, err
} }
@ -1117,7 +1115,10 @@ func thompson(re []postfixNode) (Reg, error) {
return Reg{}, fmt.Errorf("invalid regex") return Reg{}, fmt.Errorf("invalid regex")
} }
verifyLastStates(nfa) lastState := newState()
lastState.isLast = true
concatenate(nfa[0], &lastState)
return Reg{nfa[0], numGroups}, nil return Reg{nfa[0], numGroups}, nil

@ -29,12 +29,12 @@ type nfaState struct {
isEmpty bool // If it is empty - Union operator and Kleene star states will be empty isEmpty bool // If it is empty - Union operator and Kleene star states will be empty
isLast bool // If it is the last state (acept state) isLast bool // If it is the last state (acept state)
output []*nfaState // The outputs of the current state ie. the 'outward arrows'. A union operator state will have more than one of these. output []*nfaState // The outputs of the current state ie. the 'outward arrows'. A union operator state will have more than one of these.
transitions map[int][]*nfaState // Transitions to different states (maps a character (int representation) to a _list of states. This is useful if one character can lead multiple states eg. ab|aa) // transitions map[int][]*nfaState // Transitions to different states (maps a character (int representation) to a _list of states. This is useful if one character can lead multiple states eg. ab|aa)
next *nfaState // The next state (not for alternation or kleene states)
isKleene bool // Identifies whether current node is a 0-state representing Kleene star isKleene bool // Identifies whether current node is a 0-state representing Kleene star
isQuestion bool // Identifies whether current node is a 0-state representing the question operator isQuestion bool // Identifies whether current node is a 0-state representing the question operator
isAlternation bool // Identifies whether current node is a 0-state representing an alternation isAlternation bool // Identifies whether current node is a 0-state representing an alternation
leftState *nfaState // Only for alternation states - the 'left' branch of the alternation splitState *nfaState // Only for alternation states - the 'other' branch of the alternation ('next' is the first)
rightState *nfaState // Only for alternation states - the 'right' branch of the alternation
assert assertType // Type of assertion of current node - NONE means that the node doesn't assert anything assert assertType // Type of assertion of current node - NONE means that the node doesn't assert anything
allChars bool // Whether or not the state represents all characters (eg. a 'dot' metacharacter). A 'dot' node doesn't store any contents directly, as it would take up too much space allChars bool // Whether or not the state represents all characters (eg. a 'dot' metacharacter). A 'dot' node doesn't store any contents directly, as it would take up too much space
except []rune // Only valid if allChars is true - match all characters _except_ the ones in this block. Useful for inverting character classes. except []rune // Only valid if allChars is true - match all characters _except_ the ones in this block. Useful for inverting character classes.
@ -73,7 +73,6 @@ func cloneStateHelper(stateToClone *nfaState, cloneMap map[*nfaState]*nfaState)
isEmpty: stateToClone.isEmpty, isEmpty: stateToClone.isEmpty,
isLast: stateToClone.isLast, isLast: stateToClone.isLast,
output: make([]*nfaState, len(stateToClone.output)), output: make([]*nfaState, len(stateToClone.output)),
transitions: make(map[int][]*nfaState),
isKleene: stateToClone.isKleene, isKleene: stateToClone.isKleene,
isQuestion: stateToClone.isQuestion, isQuestion: stateToClone.isQuestion,
isAlternation: stateToClone.isAlternation, isAlternation: stateToClone.isAlternation,
@ -94,28 +93,18 @@ func cloneStateHelper(stateToClone *nfaState, cloneMap map[*nfaState]*nfaState)
clone.output[i] = cloneStateHelper(s, cloneMap) clone.output[i] = cloneStateHelper(s, cloneMap)
} }
} }
for k, v := range stateToClone.transitions {
clone.transitions[k] = make([]*nfaState, len(v))
for i, s := range v {
if s == stateToClone {
clone.transitions[k][i] = clone
} else {
clone.transitions[k][i] = cloneStateHelper(s, cloneMap)
}
}
}
if stateToClone.lookaroundNFA == stateToClone { if stateToClone.lookaroundNFA == stateToClone {
clone.lookaroundNFA = clone clone.lookaroundNFA = clone
} }
clone.lookaroundNFA = cloneStateHelper(stateToClone.lookaroundNFA, cloneMap) clone.lookaroundNFA = cloneStateHelper(stateToClone.lookaroundNFA, cloneMap)
if stateToClone.leftState == stateToClone { if stateToClone.splitState == stateToClone {
clone.leftState = clone clone.splitState = clone
} }
clone.leftState = cloneStateHelper(stateToClone.leftState, cloneMap) clone.splitState = cloneStateHelper(stateToClone.splitState, cloneMap)
if stateToClone.rightState == stateToClone { if stateToClone.next == stateToClone {
clone.rightState = clone clone.next = clone
} }
clone.rightState = cloneStateHelper(stateToClone.rightState, cloneMap) clone.next = cloneStateHelper(stateToClone.next, cloneMap)
return clone return clone
} }
@ -126,6 +115,9 @@ func resetThreads(start *nfaState) {
} }
func resetThreadsHelper(state *nfaState, visitedMap map[*nfaState]bool) { func resetThreadsHelper(state *nfaState, visitedMap map[*nfaState]bool) {
if state == nil {
return
}
if _, ok := visitedMap[state]; ok { if _, ok := visitedMap[state]; ok {
return return
} }
@ -133,10 +125,11 @@ func resetThreadsHelper(state *nfaState, visitedMap map[*nfaState]bool) {
state.threadGroups = nil state.threadGroups = nil
state.threadSP = 0 state.threadSP = 0
visitedMap[state] = true visitedMap[state] = true
for _, v := range state.transitions { if state.isAlternation {
for _, nextState := range v { resetThreadsHelper(state.next, visitedMap)
resetThreadsHelper(nextState, visitedMap) resetThreadsHelper(state.splitState, visitedMap)
} } else {
resetThreadsHelper(state.next, visitedMap)
} }
} }
@ -237,74 +230,84 @@ func (s nfaState) isLookaround() bool {
return s.assert == plaAssert || s.assert == plbAssert || s.assert == nlaAssert || s.assert == nlbAssert return s.assert == plaAssert || s.assert == plbAssert || s.assert == nlaAssert || s.assert == nlbAssert
} }
// Returns the matches for the character at the given index of the given string. func (s nfaState) numTransitions() int {
// Also returns the number of matches. Returns -1 if an assertion failed. if s.next == nil && s.splitState == nil {
func (s nfaState) matchesFor(str []rune, idx int) ([]*nfaState, int) { return 0
// Assertions can be viewed as 'checks'. If the check fails, we return
// an empty array and 0.
// If it passes, we treat it like any other state, and return all the transitions.
if s.assert != noneAssert {
if s.checkAssertion(str, idx) == false {
return make([]*nfaState, 0), -1
} }
if s.next == nil || s.splitState == nil {
return 1
} }
listTransitions := s.transitions[int(str[idx])] return 2
for _, dest := range s.transitions[int(anyCharRune)] {
if !slices.Contains(slices.Concat(notDotChars, dest.except), str[idx]) {
// Add an allChar state to the list of matches if:
// a. The current character isn't a 'notDotChars' character. In single line mode, this includes newline. In multiline mode, it doesn't.
// b. The current character isn't the state's exception list.
listTransitions = append(listTransitions, dest)
}
}
numTransitions := len(listTransitions)
return listTransitions, numTransitions
} }
// verifyLastStatesHelper performs the depth-first recursion needed for verifyLastStates // Returns the matches for the character at the given index of the given string.
func verifyLastStatesHelper(st *nfaState, visited map[*nfaState]bool) { // Also returns the number of matches. Returns -1 if an assertion failed.
if len(st.transitions) == 0 { //func (s nfaState) matchesFor(str []rune, idx int) ([]*nfaState, int) {
st.isLast = true // // Assertions can be viewed as 'checks'. If the check fails, we return
return // // an empty array and 0.
} // // If it passes, we treat it like any other state, and return all the transitions.
// if len(state.transitions) == 1 && len(state.transitions[state.content]) == 1 && state.transitions[state.content][0] == state { // Eg. a* // if s.assert != noneAssert {
if len(st.transitions) == 1 { // Eg. a* // if s.checkAssertion(str, idx) == false {
var moreThanOneTrans bool // Dummy variable, check if all the transitions for the current's state's contents have a length of one // return make([]*nfaState, 0), -1
for _, c := range st.content { // }
if len(st.transitions[c]) != 1 || st.transitions[c][0] != st { // }
moreThanOneTrans = true // listTransitions := s.transitions[int(str[idx])]
} // for _, dest := range s.transitions[int(anyCharRune)] {
} // if !slices.Contains(slices.Concat(notDotChars, dest.except), str[idx]) {
st.isLast = !moreThanOneTrans // // Add an allChar state to the list of matches if:
} // // a. The current character isn't a 'notDotChars' character. In single line mode, this includes newline. In multiline mode, it doesn't.
// // b. The current character isn't the state's exception list.
// listTransitions = append(listTransitions, dest)
// }
// }
// numTransitions := len(listTransitions)
// return listTransitions, numTransitions
//}
if st.isKleene { // A State representing a Kleene Star has transitions going out, which loop back to it. If all those transitions point to the same (single) state, then it must be a last state // verifyLastStatesHelper performs the depth-first recursion needed for verifyLastStates
transitionDests := make([]*nfaState, 0) //func verifyLastStatesHelper(st *nfaState, visited map[*nfaState]bool) {
for _, v := range st.transitions { // if st.numTransitions() == 0 {
transitionDests = append(transitionDests, v...) // st.isLast = true
} // return
if allEqual(transitionDests...) { // }
st.isLast = true // // if len(state.transitions) == 1 && len(state.transitions[state.content]) == 1 && state.transitions[state.content][0] == state { // Eg. a*
return // if st.numTransitions() == 1 { // Eg. a*
} // var moreThanOneTrans bool // Dummy variable, check if all the transitions for the current's state's contents have a length of one
} // for _, c := range st.content {
if visited[st] == true { // if len(st.transitions[c]) != 1 || st.transitions[c][0] != st {
return // moreThanOneTrans = true
} // }
visited[st] = true // }
for _, states := range st.transitions { // st.isLast = !moreThanOneTrans
for i := range states { // }
if states[i] != st { //
verifyLastStatesHelper(states[i], visited) // if st.isKleene { // A State representing a Kleene Star has transitions going out, which loop back to it. If all those transitions point to the same (single) state, then it must be a last state
} // transitionDests := make([]*nfaState, 0)
} // for _, v := range st.transitions {
} // transitionDests = append(transitionDests, v...)
} // }
// if allEqual(transitionDests...) {
// st.isLast = true
// return
// }
// }
// if visited[st] == true {
// return
// }
// visited[st] = true
// for _, states := range st.transitions {
// for i := range states {
// if states[i] != st {
// verifyLastStatesHelper(states[i], visited)
// }
// }
// }
//}
// verifyLastStates enables the 'isLast' flag for the leaf nodes (last states) // verifyLastStates enables the 'isLast' flag for the leaf nodes (last states)
func verifyLastStates(start []*nfaState) { //func verifyLastStates(start []*nfaState) {
verifyLastStatesHelper(start[0], make(map[*nfaState]bool)) // verifyLastStatesHelper(start[0], make(map[*nfaState]bool))
} //}
// Concatenates s1 and s2, returns the start of the concatenation. // Concatenates s1 and s2, returns the start of the concatenation.
func concatenate(s1 *nfaState, s2 *nfaState) *nfaState { func concatenate(s1 *nfaState, s2 *nfaState) *nfaState {
@ -312,69 +315,69 @@ func concatenate(s1 *nfaState, s2 *nfaState) *nfaState {
return s2 return s2
} }
for i := range s1.output { for i := range s1.output {
for _, c := range s2.content { // Create transitions for every element in s1's content to s2' s1.output[i].next = s2
s1.output[i].transitions[c], _ = uniqueAppend(s1.output[i].transitions[c], s2)
}
} }
s1.output = s2.output s1.output = s2.output
return s1 return s1
} }
func kleene(s1 nfaState) (*nfaState, error) { func kleene(s1 *nfaState) (*nfaState, error) {
if s1.isEmpty && s1.assert != noneAssert { if s1.isEmpty && s1.assert != noneAssert {
return nil, fmt.Errorf("previous token is not quantifiable") return nil, fmt.Errorf("previous token is not quantifiable")
} }
emptyState := zeroLengthMatchState() toReturn := &nfaState{}
emptyState.assert = noneAssert toReturn.isEmpty = true
toReturn := alternate(&s1, &emptyState) toReturn.isAlternation = true
toReturn.content = newContents(epsilon)
toReturn.splitState = s1
for i := range s1.output {
s1.output[i].next = toReturn
}
// toReturn := &nfaState{} // toReturn := &nfaState{}
// toReturn.transitions = make(map[int][]*nfaState) // toReturn.transitions = make(map[int][]*nfaState)
// toReturn.content = newContents(epsilon) // toReturn.content = newContents(epsilon)
toReturn.isEmpty = true
toReturn.isKleene = true toReturn.isKleene = true
toReturn.output = []*nfaState{&emptyState} toReturn.output = append([]*nfaState{}, toReturn)
for i := range s1.output { for i := range s1.output {
for _, c := range toReturn.content { s1.output[i].next = toReturn
s1.output[i].transitions[c], _ = uniqueAppend(s1.output[i].transitions[c], toReturn)
}
}
for _, c := range s1.content {
toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], &s1)
} }
// for _, c := range s1.content {
// toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], &s1)
// }
//toReturn.kleeneState = &s1 //toReturn.kleeneState = &s1
return toReturn, nil return toReturn, nil
} }
func alternate(s1 *nfaState, s2 *nfaState) *nfaState { func alternate(s1 *nfaState, s2 *nfaState) *nfaState {
toReturn := &nfaState{} toReturn := &nfaState{}
toReturn.transitions = make(map[int][]*nfaState) // toReturn.transitions = make(map[int][]*nfaState)
toReturn.output = append(toReturn.output, s1.output...) toReturn.output = append(toReturn.output, s1.output...)
toReturn.output = append(toReturn.output, s2.output...) toReturn.output = append(toReturn.output, s2.output...)
// Unique append is used here (and elsewhere) to ensure that, // // Unique append is used here (and elsewhere) to ensure that,
// for any given transition, a state can only be mentioned once. // // for any given transition, a state can only be mentioned once.
// For example, given the transition 'a', the state 's1' can only be mentioned once. // // For example, given the transition 'a', the state 's1' can only be mentioned once.
// This would lead to multiple instances of the same set of match indices, since both // // This would lead to multiple instances of the same set of match indices, since both
// 's1' states would be considered to match. // // 's1' states would be considered to match.
for _, c := range s1.content { // for _, c := range s1.content {
toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], s1) // toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], s1)
} // }
for _, c := range s2.content { // for _, c := range s2.content {
toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], s2) // toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], s2)
} // }
toReturn.content = newContents(epsilon) toReturn.content = newContents(epsilon)
toReturn.isEmpty = true toReturn.isEmpty = true
toReturn.isAlternation = true toReturn.isAlternation = true
toReturn.leftState = s1 toReturn.next = s1
toReturn.rightState = s2 toReturn.splitState = s2
return toReturn return toReturn
} }
func question(s1 *nfaState) *nfaState { // Use the fact that ab? == a(b|) func question(s1 *nfaState) *nfaState { // Use the fact that ab? == a(b|)
s2 := &nfaState{} s2 := &nfaState{}
s2.transitions = make(map[int][]*nfaState) // s2.transitions = make(map[int][]*nfaState)
s2.content = newContents(epsilon) s2.content = newContents(epsilon)
s2.output = append(s2.output, s2) s2.output = append(s2.output, s2)
s2.isEmpty = true s2.isEmpty = true
@ -386,7 +389,7 @@ func question(s1 *nfaState) *nfaState { // Use the fact that ab? == a(b|)
func newState() nfaState { func newState() nfaState {
ret := nfaState{ ret := nfaState{
output: make([]*nfaState, 0), output: make([]*nfaState, 0),
transitions: make(map[int][]*nfaState), // transitions: make(map[int][]*nfaState),
assert: noneAssert, assert: noneAssert,
except: append([]rune{}, 0), except: append([]rune{}, 0),
lookaroundRegex: "", lookaroundRegex: "",

Loading…
Cancel
Save