Got rid of transitions parameter, changed how kleene state is processed

I replaced the 'transitions' map field of nfaState with a single nfaState pointer called 'next'. Any non-alternation state only ever has one next state, so the map was just added complexity. I also changed alternation processing: instead of having their own dedicated fields ('leftState' and 'rightState'), alternation states now use the new 'next' field for one branch and another new field called 'splitState' for the other. Finally, I changed the kleene state processing to remove the unnecessary empty state in the right-side alternation (it actually messed up my matching).
2025-02-05 22:20:28 -05:00
parent 858e535fba
commit cca8c7cda2
2 changed files with 147 additions and 143 deletions
--- a/regex/compile.go
+++ b/regex/compile.go
@@ -822,7 +822,6 @@ func thompson(re []postfixNode) (Reg, error) {
 	for _, c := range re {
 		if c.nodetype == characterNode || c.nodetype == assertionNode {
 			stateToAdd := nfaState{}
-			stateToAdd.transitions = make(map[int][]*nfaState)
 			if c.allChars {
 				stateToAdd.allChars = true
 				if len(c.except) != 0 {
@@ -934,7 +933,6 @@ func thompson(re []postfixNode) (Reg, error) {
 			s.isEmpty = true
 			s.output = make([]*nfaState, 0)
 			s.output = append(s.output, s)
-			s.transitions = make(map[int][]*nfaState)
 			// LPAREN nodes are just added normally
 			if c.nodetype == lparenNode {
 				numGroups++
@@ -966,7 +964,7 @@ func thompson(re []postfixNode) (Reg, error) {
 					s.groupNum = lparenNode.groupNum
 					to_add := concatenate(lparenNode, s)
 					nfa = append(nfa, to_add)
-				} else if middleNode.groupBegin && len(middleNode.transitions) == 0 { // The middle node is a lone lparen - something like '(())', and I'm looking at the first rparen
+				} else if middleNode.groupBegin && middleNode.numTransitions() == 0 { // The middle node is a lone lparen - something like '(())', and I'm looking at the first rparen
 					nfa = append(nfa, lparenNode)    // I shouldn't have popped this out, because it is not involved in the current capturing group
 					s.groupNum = middleNode.groupNum // In this case, the 'middle' node is actually an lparen
 					to_add := concatenate(middleNode, s)
@@ -1030,14 +1028,14 @@ func thompson(re []postfixNode) (Reg, error) {
 			if err != nil {
 				return Reg{}, fmt.Errorf("error applying kleene star")
 			}
-			stateToAdd, err := kleene(*s1)
+			stateToAdd, err := kleene(s1)
 			if err != nil {
 				return Reg{}, err
 			}
 			nfa = append(nfa, stateToAdd)
 		case plusNode: // a+ is equivalent to aa*
 			s1 := mustPop(&nfa)
-			s2, err := kleene(*s1)
+			s2, err := kleene(s1)
 			if err != nil {
 				return Reg{}, err
 			}
@@ -1061,14 +1059,14 @@ func thompson(re []postfixNode) (Reg, error) {
 			// 	'^|a'
 			s2, err1 := pop(&nfa)
 			s1, err2 := pop(&nfa)
-			if err2 != nil || (s2.groupBegin && len(s2.transitions) == 0) { // Doesn't exist, or its just an LPAREN
+			if err2 != nil || (s2.groupBegin && s2.numTransitions() == 0) { // Doesn't exist, or its just an LPAREN
 				if err2 == nil { // Roundabout way of saying that this node existed, but it was an LPAREN, so we append it back
 					nfa = append(nfa, s2)
 				}
 				tmp := zeroLengthMatchState()
 				s2 = &tmp
 			}
-			if err1 != nil || (s1.groupBegin && len(s1.transitions) == 0) { // Doesn't exist, or its just an LPAREN
+			if err1 != nil || (s1.groupBegin && s1.numTransitions() == 0) { // Doesn't exist, or its just an LPAREN
 				if err1 == nil { // See above for explanation
 					nfa = append(nfa, s1)
 				}
@@ -1100,7 +1098,7 @@ func thompson(re []postfixNode) (Reg, error) {
 				stateToAdd = concatenate(stateToAdd, cloneState(poppedState))
 			}
 			if c.endReps == infinite_reps { // Case 3
-				s2, err := kleene(*poppedState)
+				s2, err := kleene(poppedState)
 				if err != nil {
 					return Reg{}, err
 				}
@@ -1117,7 +1115,10 @@ func thompson(re []postfixNode) (Reg, error) {
 		return Reg{}, fmt.Errorf("invalid regex")
 	}

-	verifyLastStates(nfa)
+	lastState := newState()
+	lastState.isLast = true
+
+	concatenate(nfa[0], &lastState)

 	return Reg{nfa[0], numGroups}, nil

--- a/regex/nfa.go
+++ b/regex/nfa.go
@@ -25,25 +25,25 @@ const (
 )

 type nfaState struct {
-	content                    stateContents       // Contents of current state
-	isEmpty                    bool                // If it is empty - Union operator and Kleene star states will be empty
-	isLast                     bool                // If it is the last state (acept state)
-	output                     []*nfaState         // The outputs of the current state ie. the 'outward arrows'. A union operator state will have more than one of these.
-	transitions                map[int][]*nfaState // Transitions to different states (maps a character (int representation) to a _list of states. This is useful if one character can lead multiple states eg. ab|aa)
-	isKleene                   bool                // Identifies whether current node is a 0-state representing Kleene star
-	isQuestion                 bool                // Identifies whether current node is a 0-state representing the question operator
-	isAlternation              bool                // Identifies whether current node is a 0-state representing an alternation
-	leftState                  *nfaState           // Only for alternation states - the 'left' branch of the alternation
-	rightState                 *nfaState           // Only for alternation states - the 'right' branch of the alternation
-	assert                     assertType          // Type of assertion of current node - NONE means that the node doesn't assert anything
-	allChars                   bool                // Whether or not the state represents all characters (eg. a 'dot' metacharacter). A 'dot' node doesn't store any contents directly, as it would take up too much space
-	except                     []rune              // Only valid if allChars is true - match all characters _except_ the ones in this block. Useful for inverting character classes.
-	lookaroundRegex            string              // Only for lookaround states - Contents of the regex that the lookaround state holds
-	lookaroundNFA              *nfaState           // Holds the NFA of the lookaroundRegex - if it exists
-	lookaroundNumCaptureGroups int                 // Number of capturing groups in lookaround regex if current node is a lookaround
-	groupBegin                 bool                // Whether or not the node starts a capturing group
-	groupEnd                   bool                // Whether or not the node ends a capturing group
-	groupNum                   int                 // Which capturing group the node starts / ends
+	content stateContents // Contents of current state
+	isEmpty bool          // If it is empty - Union operator and Kleene star states will be empty
+	isLast  bool          // If it is the last state (acept state)
+	output  []*nfaState   // The outputs of the current state ie. the 'outward arrows'. A union operator state will have more than one of these.
+	//	transitions                map[int][]*nfaState // Transitions to different states (maps a character (int representation) to a _list of states. This is useful if one character can lead multiple states eg. ab|aa)
+	next                       *nfaState  // The next state (not for alternation or kleene states)
+	isKleene                   bool       // Identifies whether current node is a 0-state representing Kleene star
+	isQuestion                 bool       // Identifies whether current node is a 0-state representing the question operator
+	isAlternation              bool       // Identifies whether current node is a 0-state representing an alternation
+	splitState                 *nfaState  // Only for alternation states - the 'other' branch of the alternation ('next' is the first)
+	assert                     assertType // Type of assertion of current node - NONE means that the node doesn't assert anything
+	allChars                   bool       // Whether or not the state represents all characters (eg. a 'dot' metacharacter). A 'dot' node doesn't store any contents directly, as it would take up too much space
+	except                     []rune     // Only valid if allChars is true - match all characters _except_ the ones in this block. Useful for inverting character classes.
+	lookaroundRegex            string     // Only for lookaround states - Contents of the regex that the lookaround state holds
+	lookaroundNFA              *nfaState  // Holds the NFA of the lookaroundRegex - if it exists
+	lookaroundNumCaptureGroups int        // Number of capturing groups in lookaround regex if current node is a lookaround
+	groupBegin                 bool       // Whether or not the node starts a capturing group
+	groupEnd                   bool       // Whether or not the node ends a capturing group
+	groupNum                   int        // Which capturing group the node starts / ends
 	// The following properties depend on the current match - I should think about resetting them for every match.
 	zeroMatchFound bool    // Whether or not the state has been used for a zero-length match - only relevant for zero states
 	threadGroups   []Group // Assuming that a state is part of a 'thread' in the matching process, this array stores the indices of capturing groups in the current thread. As matches are found for this state, its groups will be copied over.
@@ -73,7 +73,6 @@ func cloneStateHelper(stateToClone *nfaState, cloneMap map[*nfaState]*nfaState)
 		isEmpty:         stateToClone.isEmpty,
 		isLast:          stateToClone.isLast,
 		output:          make([]*nfaState, len(stateToClone.output)),
-		transitions:     make(map[int][]*nfaState),
 		isKleene:        stateToClone.isKleene,
 		isQuestion:      stateToClone.isQuestion,
 		isAlternation:   stateToClone.isAlternation,
@@ -94,28 +93,18 @@ func cloneStateHelper(stateToClone *nfaState, cloneMap map[*nfaState]*nfaState)
 			clone.output[i] = cloneStateHelper(s, cloneMap)
 		}
 	}
-	for k, v := range stateToClone.transitions {
-		clone.transitions[k] = make([]*nfaState, len(v))
-		for i, s := range v {
-			if s == stateToClone {
-				clone.transitions[k][i] = clone
-			} else {
-				clone.transitions[k][i] = cloneStateHelper(s, cloneMap)
-			}
-		}
-	}
 	if stateToClone.lookaroundNFA == stateToClone {
 		clone.lookaroundNFA = clone
 	}
 	clone.lookaroundNFA = cloneStateHelper(stateToClone.lookaroundNFA, cloneMap)
-	if stateToClone.leftState == stateToClone {
-		clone.leftState = clone
+	if stateToClone.splitState == stateToClone {
+		clone.splitState = clone
 	}
-	clone.leftState = cloneStateHelper(stateToClone.leftState, cloneMap)
-	if stateToClone.rightState == stateToClone {
-		clone.rightState = clone
+	clone.splitState = cloneStateHelper(stateToClone.splitState, cloneMap)
+	if stateToClone.next == stateToClone {
+		clone.next = clone
 	}
-	clone.rightState = cloneStateHelper(stateToClone.rightState, cloneMap)
+	clone.next = cloneStateHelper(stateToClone.next, cloneMap)
 	return clone
 }

@@ -126,6 +115,9 @@ func resetThreads(start *nfaState) {
 }

 func resetThreadsHelper(state *nfaState, visitedMap map[*nfaState]bool) {
+	if state == nil {
+		return
+	}
 	if _, ok := visitedMap[state]; ok {
 		return
 	}
@@ -133,10 +125,11 @@ func resetThreadsHelper(state *nfaState, visitedMap map[*nfaState]bool) {
 	state.threadGroups = nil
 	state.threadSP = 0
 	visitedMap[state] = true
-	for _, v := range state.transitions {
-		for _, nextState := range v {
-			resetThreadsHelper(nextState, visitedMap)
-		}
+	if state.isAlternation {
+		resetThreadsHelper(state.next, visitedMap)
+		resetThreadsHelper(state.splitState, visitedMap)
+	} else {
+		resetThreadsHelper(state.next, visitedMap)
 	}
 }

@@ -237,74 +230,84 @@ func (s nfaState) isLookaround() bool {
 	return s.assert == plaAssert || s.assert == plbAssert || s.assert == nlaAssert || s.assert == nlbAssert
 }

+func (s nfaState) numTransitions() int {
+	if s.next == nil && s.splitState == nil {
+		return 0
+	}
+	if s.next == nil || s.splitState == nil {
+		return 1
+	}
+	return 2
+}
+
 // Returns the matches for the character at the given index of the given string.
 // Also returns the number of matches. Returns -1 if an assertion failed.
-func (s nfaState) matchesFor(str []rune, idx int) ([]*nfaState, int) {
-	// Assertions can be viewed as 'checks'. If the check fails, we return
-	// an empty array and 0.
-	// If it passes, we treat it like any other state, and return all the transitions.
-	if s.assert != noneAssert {
-		if s.checkAssertion(str, idx) == false {
-			return make([]*nfaState, 0), -1
-		}
-	}
-	listTransitions := s.transitions[int(str[idx])]
-	for _, dest := range s.transitions[int(anyCharRune)] {
-		if !slices.Contains(slices.Concat(notDotChars, dest.except), str[idx]) {
-			// Add an allChar state to the list of matches if:
-			// 		a. The current character isn't a 'notDotChars' character. In single line mode, this includes newline. In multiline mode, it doesn't.
-			// 		b. The current character isn't the state's exception list.
-			listTransitions = append(listTransitions, dest)
-		}
-	}
-	numTransitions := len(listTransitions)
-	return listTransitions, numTransitions
-}
+//func (s nfaState) matchesFor(str []rune, idx int) ([]*nfaState, int) {
+//	// Assertions can be viewed as 'checks'. If the check fails, we return
+//	// an empty array and 0.
+//	// If it passes, we treat it like any other state, and return all the transitions.
+//	if s.assert != noneAssert {
+//		if s.checkAssertion(str, idx) == false {
+//			return make([]*nfaState, 0), -1
+//		}
+//	}
+//	listTransitions := s.transitions[int(str[idx])]
+//	for _, dest := range s.transitions[int(anyCharRune)] {
+//		if !slices.Contains(slices.Concat(notDotChars, dest.except), str[idx]) {
+//			// Add an allChar state to the list of matches if:
+//			// 		a. The current character isn't a 'notDotChars' character. In single line mode, this includes newline. In multiline mode, it doesn't.
+//			// 		b. The current character isn't the state's exception list.
+//			listTransitions = append(listTransitions, dest)
+//		}
+//	}
+//	numTransitions := len(listTransitions)
+//	return listTransitions, numTransitions
+//}

 // verifyLastStatesHelper performs the depth-first recursion needed for verifyLastStates
-func verifyLastStatesHelper(st *nfaState, visited map[*nfaState]bool) {
-	if len(st.transitions) == 0 {
-		st.isLast = true
-		return
-	}
-	//	if len(state.transitions) == 1 && len(state.transitions[state.content]) == 1 && state.transitions[state.content][0] == state { // Eg. a*
-	if len(st.transitions) == 1 { // Eg. a*
-		var moreThanOneTrans bool // Dummy variable, check if all the transitions for the current's state's contents have a length of one
-		for _, c := range st.content {
-			if len(st.transitions[c]) != 1 || st.transitions[c][0] != st {
-				moreThanOneTrans = true
-			}
-		}
-		st.isLast = !moreThanOneTrans
-	}
-
-	if st.isKleene { // A State representing a Kleene Star has transitions going out, which loop back to it. If all those transitions point to the same (single) state, then it must be a last state
-		transitionDests := make([]*nfaState, 0)
-		for _, v := range st.transitions {
-			transitionDests = append(transitionDests, v...)
-		}
-		if allEqual(transitionDests...) {
-			st.isLast = true
-			return
-		}
-	}
-	if visited[st] == true {
-		return
-	}
-	visited[st] = true
-	for _, states := range st.transitions {
-		for i := range states {
-			if states[i] != st {
-				verifyLastStatesHelper(states[i], visited)
-			}
-		}
-	}
-}
+//func verifyLastStatesHelper(st *nfaState, visited map[*nfaState]bool) {
+//	if st.numTransitions() == 0 {
+//		st.isLast = true
+//		return
+//	}
+//	//	if len(state.transitions) == 1 && len(state.transitions[state.content]) == 1 && state.transitions[state.content][0] == state { // Eg. a*
+//	if st.numTransitions() == 1 { // Eg. a*
+//		var moreThanOneTrans bool // Dummy variable, check if all the transitions for the current's state's contents have a length of one
+//		for _, c := range st.content {
+//			if len(st.transitions[c]) != 1 || st.transitions[c][0] != st {
+//				moreThanOneTrans = true
+//			}
+//		}
+//		st.isLast = !moreThanOneTrans
+//	}
+//
+//	if st.isKleene { // A State representing a Kleene Star has transitions going out, which loop back to it. If all those transitions point to the same (single) state, then it must be a last state
+//		transitionDests := make([]*nfaState, 0)
+//		for _, v := range st.transitions {
+//			transitionDests = append(transitionDests, v...)
+//		}
+//		if allEqual(transitionDests...) {
+//			st.isLast = true
+//			return
+//		}
+//	}
+//	if visited[st] == true {
+//		return
+//	}
+//	visited[st] = true
+//	for _, states := range st.transitions {
+//		for i := range states {
+//			if states[i] != st {
+//				verifyLastStatesHelper(states[i], visited)
+//			}
+//		}
+//	}
+//}

 // verifyLastStates enables the 'isLast' flag for the leaf nodes (last states)
-func verifyLastStates(start []*nfaState) {
-	verifyLastStatesHelper(start[0], make(map[*nfaState]bool))
-}
+//func verifyLastStates(start []*nfaState) {
+//	verifyLastStatesHelper(start[0], make(map[*nfaState]bool))
+//}

 // Concatenates s1 and s2, returns the start of the concatenation.
 func concatenate(s1 *nfaState, s2 *nfaState) *nfaState {
@@ -312,69 +315,69 @@ func concatenate(s1 *nfaState, s2 *nfaState) *nfaState {
 		return s2
 	}
 	for i := range s1.output {
-		for _, c := range s2.content { // Create transitions for every element in s1's content to s2'
-			s1.output[i].transitions[c], _ = uniqueAppend(s1.output[i].transitions[c], s2)
-		}
+		s1.output[i].next = s2
 	}
 	s1.output = s2.output
 	return s1
 }

-func kleene(s1 nfaState) (*nfaState, error) {
+func kleene(s1 *nfaState) (*nfaState, error) {
 	if s1.isEmpty && s1.assert != noneAssert {
 		return nil, fmt.Errorf("previous token is not quantifiable")
 	}

-	emptyState := zeroLengthMatchState()
-	emptyState.assert = noneAssert
-	toReturn := alternate(&s1, &emptyState)
+	toReturn := &nfaState{}
+	toReturn.isEmpty = true
+	toReturn.isAlternation = true
+	toReturn.content = newContents(epsilon)
+	toReturn.splitState = s1
+	for i := range s1.output {
+		s1.output[i].next = toReturn
+	}

 	//	toReturn := &nfaState{}
 	//	toReturn.transitions = make(map[int][]*nfaState)
 	//	toReturn.content = newContents(epsilon)
-	toReturn.isEmpty = true
 	toReturn.isKleene = true
-	toReturn.output = []*nfaState{&emptyState}
+	toReturn.output = append([]*nfaState{}, toReturn)
 	for i := range s1.output {
-		for _, c := range toReturn.content {
-			s1.output[i].transitions[c], _ = uniqueAppend(s1.output[i].transitions[c], toReturn)
-		}
-	}
-	for _, c := range s1.content {
-		toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], &s1)
+		s1.output[i].next = toReturn
 	}
+	//	for _, c := range s1.content {
+	//		toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], &s1)
+	//	}
 	//toReturn.kleeneState = &s1
 	return toReturn, nil
 }

 func alternate(s1 *nfaState, s2 *nfaState) *nfaState {
 	toReturn := &nfaState{}
-	toReturn.transitions = make(map[int][]*nfaState)
+	//	toReturn.transitions = make(map[int][]*nfaState)
 	toReturn.output = append(toReturn.output, s1.output...)
 	toReturn.output = append(toReturn.output, s2.output...)
-	// Unique append is used here (and elsewhere) to ensure that,
-	// for any given transition, a state can only be mentioned once.
-	// For example, given the transition 'a', the state 's1' can only be mentioned once.
-	// This would lead to multiple instances of the same set of match indices, since both
-	// 's1' states would be considered to match.
-	for _, c := range s1.content {
-		toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], s1)
-	}
-	for _, c := range s2.content {
-		toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], s2)
-	}
+	//	// Unique append is used here (and elsewhere) to ensure that,
+	//	// for any given transition, a state can only be mentioned once.
+	//	// For example, given the transition 'a', the state 's1' can only be mentioned once.
+	//	// This would lead to multiple instances of the same set of match indices, since both
+	//	// 's1' states would be considered to match.
+	//	for _, c := range s1.content {
+	//		toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], s1)
+	//	}
+	//	for _, c := range s2.content {
+	//		toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], s2)
+	//	}
 	toReturn.content = newContents(epsilon)
 	toReturn.isEmpty = true
 	toReturn.isAlternation = true
-	toReturn.leftState = s1
-	toReturn.rightState = s2
+	toReturn.next = s1
+	toReturn.splitState = s2

 	return toReturn
 }

 func question(s1 *nfaState) *nfaState { // Use the fact that ab? == a(b|)
 	s2 := &nfaState{}
-	s2.transitions = make(map[int][]*nfaState)
+	//	s2.transitions = make(map[int][]*nfaState)
 	s2.content = newContents(epsilon)
 	s2.output = append(s2.output, s2)
 	s2.isEmpty = true
@@ -385,8 +388,8 @@ func question(s1 *nfaState) *nfaState { // Use the fact that ab? == a(b|)
 // Creates and returns a new state with the 'default' values.
 func newState() nfaState {
 	ret := nfaState{
-		output:          make([]*nfaState, 0),
-		transitions:     make(map[int][]*nfaState),
+		output: make([]*nfaState, 0),
+		//		transitions:     make(map[int][]*nfaState),
 		assert:          noneAssert,
 		except:          append([]rune{}, 0),
 		lookaroundRegex: "",