diff --git a/regex/compile.go b/regex/compile.go index 1068966..0429c37 100644 --- a/regex/compile.go +++ b/regex/compile.go @@ -822,7 +822,6 @@ func thompson(re []postfixNode) (Reg, error) { for _, c := range re { if c.nodetype == characterNode || c.nodetype == assertionNode { stateToAdd := nfaState{} - stateToAdd.transitions = make(map[int][]*nfaState) if c.allChars { stateToAdd.allChars = true if len(c.except) != 0 { @@ -934,7 +933,6 @@ func thompson(re []postfixNode) (Reg, error) { s.isEmpty = true s.output = make([]*nfaState, 0) s.output = append(s.output, s) - s.transitions = make(map[int][]*nfaState) // LPAREN nodes are just added normally if c.nodetype == lparenNode { numGroups++ @@ -966,7 +964,7 @@ func thompson(re []postfixNode) (Reg, error) { s.groupNum = lparenNode.groupNum to_add := concatenate(lparenNode, s) nfa = append(nfa, to_add) - } else if middleNode.groupBegin && len(middleNode.transitions) == 0 { // The middle node is a lone lparen - something like '(())', and I'm looking at the first rparen + } else if middleNode.groupBegin && middleNode.numTransitions() == 0 { // The middle node is a lone lparen - something like '(())', and I'm looking at the first rparen nfa = append(nfa, lparenNode) // I shouldn't have popped this out, because it is not involved in the current capturing group s.groupNum = middleNode.groupNum // In this case, the 'middle' node is actually an lparen to_add := concatenate(middleNode, s) @@ -1030,14 +1028,14 @@ func thompson(re []postfixNode) (Reg, error) { if err != nil { return Reg{}, fmt.Errorf("error applying kleene star") } - stateToAdd, err := kleene(*s1) + stateToAdd, err := kleene(s1) if err != nil { return Reg{}, err } nfa = append(nfa, stateToAdd) case plusNode: // a+ is equivalent to aa* s1 := mustPop(&nfa) - s2, err := kleene(*s1) + s2, err := kleene(s1) if err != nil { return Reg{}, err } @@ -1061,14 +1059,14 @@ func thompson(re []postfixNode) (Reg, error) { // '^|a' s2, err1 := pop(&nfa) s1, err2 := pop(&nfa) - if err2 != nil || (s2.groupBegin && len(s2.transitions) == 0) { // Doesn't exist, or its just an LPAREN + if err2 != nil || (s2.groupBegin && s2.numTransitions() == 0) { // Doesn't exist, or its just an LPAREN if err2 == nil { // Roundabout way of saying that this node existed, but it was an LPAREN, so we append it back nfa = append(nfa, s2) } tmp := zeroLengthMatchState() s2 = &tmp } - if err1 != nil || (s1.groupBegin && len(s1.transitions) == 0) { // Doesn't exist, or its just an LPAREN + if err1 != nil || (s1.groupBegin && s1.numTransitions() == 0) { // Doesn't exist, or its just an LPAREN if err1 == nil { // See above for explanation nfa = append(nfa, s1) } @@ -1100,7 +1098,7 @@ func thompson(re []postfixNode) (Reg, error) { stateToAdd = concatenate(stateToAdd, cloneState(poppedState)) } if c.endReps == infinite_reps { // Case 3 - s2, err := kleene(*poppedState) + s2, err := kleene(poppedState) if err != nil { return Reg{}, err } @@ -1117,7 +1115,10 @@ func thompson(re []postfixNode) (Reg, error) { return Reg{}, fmt.Errorf("invalid regex") } - verifyLastStates(nfa) + lastState := newState() + lastState.isLast = true + + concatenate(nfa[0], &lastState) return Reg{nfa[0], numGroups}, nil diff --git a/regex/nfa.go b/regex/nfa.go index a9c1ec6..79daaf6 100644 --- a/regex/nfa.go +++ b/regex/nfa.go @@ -25,25 +25,25 @@ const ( ) type nfaState struct { - content stateContents // Contents of current state - isEmpty bool // If it is empty - Union operator and Kleene star states will be empty - isLast bool // If it is the last state (acept state) - output []*nfaState // The outputs of the current state ie. the 'outward arrows'. A union operator state will have more than one of these. - transitions map[int][]*nfaState // Transitions to different states (maps a character (int representation) to a _list of states. This is useful if one character can lead multiple states eg. ab|aa) - isKleene bool // Identifies whether current node is a 0-state representing Kleene star - isQuestion bool // Identifies whether current node is a 0-state representing the question operator - isAlternation bool // Identifies whether current node is a 0-state representing an alternation - leftState *nfaState // Only for alternation states - the 'left' branch of the alternation - rightState *nfaState // Only for alternation states - the 'right' branch of the alternation - assert assertType // Type of assertion of current node - NONE means that the node doesn't assert anything - allChars bool // Whether or not the state represents all characters (eg. a 'dot' metacharacter). A 'dot' node doesn't store any contents directly, as it would take up too much space - except []rune // Only valid if allChars is true - match all characters _except_ the ones in this block. Useful for inverting character classes. - lookaroundRegex string // Only for lookaround states - Contents of the regex that the lookaround state holds - lookaroundNFA *nfaState // Holds the NFA of the lookaroundRegex - if it exists - lookaroundNumCaptureGroups int // Number of capturing groups in lookaround regex if current node is a lookaround - groupBegin bool // Whether or not the node starts a capturing group - groupEnd bool // Whether or not the node ends a capturing group - groupNum int // Which capturing group the node starts / ends + content stateContents // Contents of current state + isEmpty bool // If it is empty - Union operator and Kleene star states will be empty + isLast bool // If it is the last state (acept state) + output []*nfaState // The outputs of the current state ie. the 'outward arrows'. A union operator state will have more than one of these. + // transitions map[int][]*nfaState // Transitions to different states (maps a character (int representation) to a _list of states. This is useful if one character can lead multiple states eg. ab|aa) + next *nfaState // The next state (not for alternation or kleene states) + isKleene bool // Identifies whether current node is a 0-state representing Kleene star + isQuestion bool // Identifies whether current node is a 0-state representing the question operator + isAlternation bool // Identifies whether current node is a 0-state representing an alternation + splitState *nfaState // Only for alternation states - the 'other' branch of the alternation ('next' is the first) + assert assertType // Type of assertion of current node - NONE means that the node doesn't assert anything + allChars bool // Whether or not the state represents all characters (eg. a 'dot' metacharacter). A 'dot' node doesn't store any contents directly, as it would take up too much space + except []rune // Only valid if allChars is true - match all characters _except_ the ones in this block. Useful for inverting character classes. + lookaroundRegex string // Only for lookaround states - Contents of the regex that the lookaround state holds + lookaroundNFA *nfaState // Holds the NFA of the lookaroundRegex - if it exists + lookaroundNumCaptureGroups int // Number of capturing groups in lookaround regex if current node is a lookaround + groupBegin bool // Whether or not the node starts a capturing group + groupEnd bool // Whether or not the node ends a capturing group + groupNum int // Which capturing group the node starts / ends // The following properties depend on the current match - I should think about resetting them for every match. zeroMatchFound bool // Whether or not the state has been used for a zero-length match - only relevant for zero states threadGroups []Group // Assuming that a state is part of a 'thread' in the matching process, this array stores the indices of capturing groups in the current thread. As matches are found for this state, its groups will be copied over. @@ -73,7 +73,6 @@ func cloneStateHelper(stateToClone *nfaState, cloneMap map[*nfaState]*nfaState) isEmpty: stateToClone.isEmpty, isLast: stateToClone.isLast, output: make([]*nfaState, len(stateToClone.output)), - transitions: make(map[int][]*nfaState), isKleene: stateToClone.isKleene, isQuestion: stateToClone.isQuestion, isAlternation: stateToClone.isAlternation, @@ -94,28 +93,18 @@ func cloneStateHelper(stateToClone *nfaState, cloneMap map[*nfaState]*nfaState) clone.output[i] = cloneStateHelper(s, cloneMap) } } - for k, v := range stateToClone.transitions { - clone.transitions[k] = make([]*nfaState, len(v)) - for i, s := range v { - if s == stateToClone { - clone.transitions[k][i] = clone - } else { - clone.transitions[k][i] = cloneStateHelper(s, cloneMap) - } - } - } if stateToClone.lookaroundNFA == stateToClone { clone.lookaroundNFA = clone } clone.lookaroundNFA = cloneStateHelper(stateToClone.lookaroundNFA, cloneMap) - if stateToClone.leftState == stateToClone { - clone.leftState = clone + if stateToClone.splitState == stateToClone { + clone.splitState = clone } - clone.leftState = cloneStateHelper(stateToClone.leftState, cloneMap) - if stateToClone.rightState == stateToClone { - clone.rightState = clone + clone.splitState = cloneStateHelper(stateToClone.splitState, cloneMap) + if stateToClone.next == stateToClone { + clone.next = clone } - clone.rightState = cloneStateHelper(stateToClone.rightState, cloneMap) + clone.next = cloneStateHelper(stateToClone.next, cloneMap) return clone } @@ -126,6 +115,9 @@ func resetThreads(start *nfaState) { } func resetThreadsHelper(state *nfaState, visitedMap map[*nfaState]bool) { + if state == nil { + return + } if _, ok := visitedMap[state]; ok { return } @@ -133,10 +125,11 @@ func resetThreadsHelper(state *nfaState, visitedMap map[*nfaState]bool) { state.threadGroups = nil state.threadSP = 0 visitedMap[state] = true - for _, v := range state.transitions { - for _, nextState := range v { - resetThreadsHelper(nextState, visitedMap) - } + if state.isAlternation { + resetThreadsHelper(state.next, visitedMap) + resetThreadsHelper(state.splitState, visitedMap) + } else { + resetThreadsHelper(state.next, visitedMap) } } @@ -237,74 +230,84 @@ func (s nfaState) isLookaround() bool { return s.assert == plaAssert || s.assert == plbAssert || s.assert == nlaAssert || s.assert == nlbAssert } -// Returns the matches for the character at the given index of the given string. -// Also returns the number of matches. Returns -1 if an assertion failed. -func (s nfaState) matchesFor(str []rune, idx int) ([]*nfaState, int) { - // Assertions can be viewed as 'checks'. If the check fails, we return - // an empty array and 0. - // If it passes, we treat it like any other state, and return all the transitions. - if s.assert != noneAssert { - if s.checkAssertion(str, idx) == false { - return make([]*nfaState, 0), -1 - } +func (s nfaState) numTransitions() int { + if s.next == nil && s.splitState == nil { + return 0 } - listTransitions := s.transitions[int(str[idx])] - for _, dest := range s.transitions[int(anyCharRune)] { - if !slices.Contains(slices.Concat(notDotChars, dest.except), str[idx]) { - // Add an allChar state to the list of matches if: - // a. The current character isn't a 'notDotChars' character. In single line mode, this includes newline. In multiline mode, it doesn't. - // b. The current character isn't the state's exception list. - listTransitions = append(listTransitions, dest) - } + if s.next == nil || s.splitState == nil { + return 1 } - numTransitions := len(listTransitions) - return listTransitions, numTransitions + return 2 } -// verifyLastStatesHelper performs the depth-first recursion needed for verifyLastStates -func verifyLastStatesHelper(st *nfaState, visited map[*nfaState]bool) { - if len(st.transitions) == 0 { - st.isLast = true - return - } - // if len(state.transitions) == 1 && len(state.transitions[state.content]) == 1 && state.transitions[state.content][0] == state { // Eg. a* - if len(st.transitions) == 1 { // Eg. a* - var moreThanOneTrans bool // Dummy variable, check if all the transitions for the current's state's contents have a length of one - for _, c := range st.content { - if len(st.transitions[c]) != 1 || st.transitions[c][0] != st { - moreThanOneTrans = true - } - } - st.isLast = !moreThanOneTrans - } +// Returns the matches for the character at the given index of the given string. +// Also returns the number of matches. Returns -1 if an assertion failed. +//func (s nfaState) matchesFor(str []rune, idx int) ([]*nfaState, int) { +// // Assertions can be viewed as 'checks'. If the check fails, we return +// // an empty array and 0. +// // If it passes, we treat it like any other state, and return all the transitions. +// if s.assert != noneAssert { +// if s.checkAssertion(str, idx) == false { +// return make([]*nfaState, 0), -1 +// } +// } +// listTransitions := s.transitions[int(str[idx])] +// for _, dest := range s.transitions[int(anyCharRune)] { +// if !slices.Contains(slices.Concat(notDotChars, dest.except), str[idx]) { +// // Add an allChar state to the list of matches if: +// // a. The current character isn't a 'notDotChars' character. In single line mode, this includes newline. In multiline mode, it doesn't. +// // b. The current character isn't the state's exception list. +// listTransitions = append(listTransitions, dest) +// } +// } +// numTransitions := len(listTransitions) +// return listTransitions, numTransitions +//} - if st.isKleene { // A State representing a Kleene Star has transitions going out, which loop back to it. If all those transitions point to the same (single) state, then it must be a last state - transitionDests := make([]*nfaState, 0) - for _, v := range st.transitions { - transitionDests = append(transitionDests, v...) - } - if allEqual(transitionDests...) { - st.isLast = true - return - } - } - if visited[st] == true { - return - } - visited[st] = true - for _, states := range st.transitions { - for i := range states { - if states[i] != st { - verifyLastStatesHelper(states[i], visited) - } - } - } -} +// verifyLastStatesHelper performs the depth-first recursion needed for verifyLastStates +//func verifyLastStatesHelper(st *nfaState, visited map[*nfaState]bool) { +// if st.numTransitions() == 0 { +// st.isLast = true +// return +// } +// // if len(state.transitions) == 1 && len(state.transitions[state.content]) == 1 && state.transitions[state.content][0] == state { // Eg. a* +// if st.numTransitions() == 1 { // Eg. a* +// var moreThanOneTrans bool // Dummy variable, check if all the transitions for the current's state's contents have a length of one +// for _, c := range st.content { +// if len(st.transitions[c]) != 1 || st.transitions[c][0] != st { +// moreThanOneTrans = true +// } +// } +// st.isLast = !moreThanOneTrans +// } +// +// if st.isKleene { // A State representing a Kleene Star has transitions going out, which loop back to it. If all those transitions point to the same (single) state, then it must be a last state +// transitionDests := make([]*nfaState, 0) +// for _, v := range st.transitions { +// transitionDests = append(transitionDests, v...) +// } +// if allEqual(transitionDests...) { +// st.isLast = true +// return +// } +// } +// if visited[st] == true { +// return +// } +// visited[st] = true +// for _, states := range st.transitions { +// for i := range states { +// if states[i] != st { +// verifyLastStatesHelper(states[i], visited) +// } +// } +// } +//} // verifyLastStates enables the 'isLast' flag for the leaf nodes (last states) -func verifyLastStates(start []*nfaState) { - verifyLastStatesHelper(start[0], make(map[*nfaState]bool)) -} +//func verifyLastStates(start []*nfaState) { +// verifyLastStatesHelper(start[0], make(map[*nfaState]bool)) +//} // Concatenates s1 and s2, returns the start of the concatenation. func concatenate(s1 *nfaState, s2 *nfaState) *nfaState { @@ -312,69 +315,69 @@ func concatenate(s1 *nfaState, s2 *nfaState) *nfaState { return s2 } for i := range s1.output { - for _, c := range s2.content { // Create transitions for every element in s1's content to s2' - s1.output[i].transitions[c], _ = uniqueAppend(s1.output[i].transitions[c], s2) - } + s1.output[i].next = s2 } s1.output = s2.output return s1 } -func kleene(s1 nfaState) (*nfaState, error) { +func kleene(s1 *nfaState) (*nfaState, error) { if s1.isEmpty && s1.assert != noneAssert { return nil, fmt.Errorf("previous token is not quantifiable") } - emptyState := zeroLengthMatchState() - emptyState.assert = noneAssert - toReturn := alternate(&s1, &emptyState) + toReturn := &nfaState{} + toReturn.isEmpty = true + toReturn.isAlternation = true + toReturn.content = newContents(epsilon) + toReturn.splitState = s1 + for i := range s1.output { + s1.output[i].next = toReturn + } // toReturn := &nfaState{} // toReturn.transitions = make(map[int][]*nfaState) // toReturn.content = newContents(epsilon) - toReturn.isEmpty = true toReturn.isKleene = true - toReturn.output = []*nfaState{&emptyState} + toReturn.output = append([]*nfaState{}, toReturn) for i := range s1.output { - for _, c := range toReturn.content { - s1.output[i].transitions[c], _ = uniqueAppend(s1.output[i].transitions[c], toReturn) - } - } - for _, c := range s1.content { - toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], &s1) + s1.output[i].next = toReturn } + // for _, c := range s1.content { + // toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], &s1) + // } //toReturn.kleeneState = &s1 return toReturn, nil } func alternate(s1 *nfaState, s2 *nfaState) *nfaState { toReturn := &nfaState{} - toReturn.transitions = make(map[int][]*nfaState) + // toReturn.transitions = make(map[int][]*nfaState) toReturn.output = append(toReturn.output, s1.output...) toReturn.output = append(toReturn.output, s2.output...) - // Unique append is used here (and elsewhere) to ensure that, - // for any given transition, a state can only be mentioned once. - // For example, given the transition 'a', the state 's1' can only be mentioned once. - // This would lead to multiple instances of the same set of match indices, since both - // 's1' states would be considered to match. - for _, c := range s1.content { - toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], s1) - } - for _, c := range s2.content { - toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], s2) - } + // // Unique append is used here (and elsewhere) to ensure that, + // // for any given transition, a state can only be mentioned once. + // // For example, given the transition 'a', the state 's1' can only be mentioned once. + // // This would lead to multiple instances of the same set of match indices, since both + // // 's1' states would be considered to match. + // for _, c := range s1.content { + // toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], s1) + // } + // for _, c := range s2.content { + // toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], s2) + // } toReturn.content = newContents(epsilon) toReturn.isEmpty = true toReturn.isAlternation = true - toReturn.leftState = s1 - toReturn.rightState = s2 + toReturn.next = s1 + toReturn.splitState = s2 return toReturn } func question(s1 *nfaState) *nfaState { // Use the fact that ab? == a(b|) s2 := &nfaState{} - s2.transitions = make(map[int][]*nfaState) + // s2.transitions = make(map[int][]*nfaState) s2.content = newContents(epsilon) s2.output = append(s2.output, s2) s2.isEmpty = true @@ -385,8 +388,8 @@ func question(s1 *nfaState) *nfaState { // Use the fact that ab? == a(b|) // Creates and returns a new state with the 'default' values. func newState() nfaState { ret := nfaState{ - output: make([]*nfaState, 0), - transitions: make(map[int][]*nfaState), + output: make([]*nfaState, 0), + // transitions: make(map[int][]*nfaState), assert: noneAssert, except: append([]rune{}, 0), lookaroundRegex: "",