Continued implementing Thompson's algorithm

Started implementing Thompson's algorithm for matching, because the old one was completely backtracking (so it would enter infinite loops on something like '(a*)*' )
The git diff claims that a ton of code was changed, but most of it was just indentation changes.
2025-02-05 18:01:36 -05:00 · 2025-02-05 12:21:12 -05:00 · 2025-02-05 11:32:20 -05:00 · 2025-02-04 14:09:24 -05:00 · 2025-02-04 14:09:04 -05:00 · 2025-02-03 22:01:52 -05:00
6 changed files with 467 additions and 177 deletions
--- a/4
+++ b/4
@@ -6,8 +6,8 @@ fmt:
 vet: fmt
 	go vet ./...
 buildLib: vet
-	go build -gcflags="-N -l" ./...
+	go build -gcflags="all=-N -l" ./...
 buildCmd: buildLib
-	go build -C cmd/ -gcflags="-N -l" -o re ./...
+	go build -C cmd/ -gcflags="all=-N -l" -o re ./...
 test: buildCmd
 	go test -v ./...
--- a/regex/compile.go
+++ b/regex/compile.go
@@ -1059,8 +1059,8 @@ func thompson(re []postfixNode) (Reg, error) {
 			// 	'|a'
 			// 	'^a|'
 			// 	'^|a'
-			s1, err1 := pop(&nfa)
-			s2, err2 := pop(&nfa)
+			s2, err1 := pop(&nfa)
+			s1, err2 := pop(&nfa)
 			if err2 != nil || (s2.groupBegin && len(s2.transitions) == 0) { // Doesn't exist, or its just an LPAREN
 				if err2 == nil { // Roundabout way of saying that this node existed, but it was an LPAREN, so we append it back
 					nfa = append(nfa, s2)
--- a/regex/matching.go
+++ b/regex/matching.go
@@ -2,6 +2,7 @@ package regex

 import (
 	"fmt"
+	"slices"
 	"sort"
 )

@@ -150,6 +151,11 @@ func pruneIndices(indices []Match) []Match {
 	return toRet
 }

+func copyThread(to *nfaState, from nfaState) {
+	to.threadSP = from.threadSP
+	to.threadGroups = append([]Group{}, from.threadGroups...)
+}
+
 // Find returns the 0-group of the leftmost match of the regex in the given string.
 // An error value != nil indicates that no match was found.
 func (regex Reg) Find(str string) (Group, error) {
@@ -265,15 +271,16 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
 	// chosen as the match for the entire string.
 	// This allows us to pick the longest possible match (which is how greedy matching works).
 	// COMMENT ABOVE IS CURRENTLY NOT UP-TO-DATE
-	tempIndices := newMatch(numGroups + 1)
+	//	tempIndices := newMatch(numGroups + 1)

-	foundPath := false
-	startIdx := offset
-	endIdx := offset
-	currentStates := make([]*nfaState, 0)
-	tempStates := make([]*nfaState, 0) // Used to store states that should be used in next loop iteration
-	i := offset                        // Index in string
-	startingFrom := i                  // Store starting index
+	//	foundPath := false
+	//startIdx := offset
+	//endIdx := offset
+	currentStates := make([]nfaState, 0)
+	nextStates := make([]nfaState, 0)
+	//	tempStates := make([]*nfaState, 0) // Used to store states that should be used in next loop iteration
+	i := offset // Index in string
+	//startingFrom := i                  // Store starting index

 	// If the first state is an assertion, makes sure the assertion
 	// is true before we do _anything_ else.
@@ -284,181 +291,348 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
 		}
 	}
 	// Increment until we hit a character matching the start state (assuming not 0-state)
-	if start.isEmpty == false {
-		for i < len(str) && !start.contentContains(str, i) {
-			i++
-		}
-		startIdx = i
-		startingFrom = i
-		i++ // Advance to next character (if we aren't at a 0-state, which doesn't match anything), so that we can check for transitions. If we advance at a 0-state, we will never get a chance to match the first character
-	}
+	//	if start.isEmpty == false {
+	//		for i < len(str) && !start.contentContains(str, i) {
+	//			i++
+	//		}
+	//		startIdx = i
+	//		startingFrom = i
+	//		i++ // Advance to next character (if we aren't at a 0-state, which doesn't match anything), so that we can check for transitions. If we advance at a 0-state, we will never get a chance to match the first character
+	//	}

-	start.threadGroups = newMatch(numGroups + 1)
+	//	start.threadGroups = newMatch(numGroups + 1)
 	// Check if the start state begins a group - if so, add the start index to our list
-	if start.groupBegin {
-		start.threadGroups[start.groupNum].StartIdx = i
-		//		tempIndices[start.groupNum].startIdx = i
-	}
-
-	currentStates = append(currentStates, start)
+	//if start.groupBegin {
+	//		start.threadGroups[start.groupNum].StartIdx = i
+	//		tempIndices[start.groupNum].startIdx = i
+	//}

+	start.threadSP = i
+	currentStates = append(currentStates, *start)
+	var foundMatch bool
+	var isEmptyAndNoAssertion bool
 	// Main loop
-	for i < len(str) {
-		foundPath = false
+	for idx := i; idx <= len(str); idx++ {
+		for currentStateIdx := 0; currentStateIdx < len(currentStates); currentStateIdx++ {
+			currentState := currentStates[currentStateIdx]
+			foundMatch = false
+			isEmptyAndNoAssertion = false

-		zeroStates := make([]*nfaState, 0)
-		// Keep taking zero-states, until there are no more left to take
-		// Objective: If any of our current states have transitions to 0-states, replace them with the 0-state. Do this until there are no more transitions to 0-states, or there are no more unique 0-states to take.
-		zeroStates, isZero := takeZeroState(currentStates, numGroups, i)
-		tempStates = append(tempStates, zeroStates...)
-		num_appended := 0
-		for isZero == true {
-			zeroStates, isZero = takeZeroState(tempStates, numGroups, i)
-			tempStates, num_appended = uniqueAppend(tempStates, zeroStates...)
-			if num_appended == 0 { // Break if we haven't appended any more unique values
-				break
+			if currentState.threadGroups == nil {
+				currentState.threadGroups = newMatch(numGroups + 1)
+				currentState.threadGroups[0].StartIdx = idx
 			}
-		}

-		currentStates, _ = uniqueAppend(currentStates, tempStates...)
-		tempStates = nil
+			if currentState.groupBegin {
+				currentState.threadGroups[currentState.groupNum].StartIdx = idx
+				//		allMatches := make([]nfaState, 0)
+				//		for _, v := range currentState.transitions {
+				//			dereferenced := funcMap(v, func(s *nfaState) nfaState {
+				//				return *s
+				//			})
+				//			allMatches = append(allMatches, dereferenced...)
+				//		}
+				//		slices.Reverse(allMatches)
+				//		for i := range allMatches {
+				//			copyThread(&allMatches[i], currentState)
+				//		}
+				//		currentStates = append(currentStates, allMatches...)
+			}
+			if currentState.groupEnd {
+				currentState.threadGroups[currentState.groupNum].EndIdx = idx
+				//			allMatches := make([]nfaState, 0)
+				//			for _, v := range currentState.transitions {
+				//				dereferenced := funcMap(v, func(s *nfaState) nfaState {
+				//					return *s
+				//				})
+				//				allMatches = append(allMatches, dereferenced...)
+				//			}
+				//			slices.Reverse(allMatches)
+				//			for i := range allMatches {
+				//				copyThread(&allMatches[i], currentState)
+				//			}
+				//			currentStates = append(currentStates, allMatches...)
+			}

-		// Take any transitions corresponding to current character
-		numStatesMatched := 0            // The number of states which had at least 1 match for this round
-		assertionFailed := false         // Whether or not an assertion failed for this round
-		lastStateInList := false         // Whether or not a last state was in our list of states
-		var lastStatePtr *nfaState = nil // Pointer to the last-state, if it was found
-		lastLookaroundInList := false    // Whether or not a last state (that is a lookaround) was in our list of states
-		for _, state := range currentStates {
-			matches, numMatches := state.matchesFor(str, i)
-			if numMatches > 0 {
-				numStatesMatched++
-				tempStates = append(tempStates, matches...)
-				foundPath = true
-				for _, m := range matches {
-					if m.threadGroups == nil {
-						m.threadGroups = newMatch(numGroups + 1)
-					}
-					copy(m.threadGroups, state.threadGroups)
-				}
-			}
-			if numMatches < 0 {
-				assertionFailed = true
-			}
-			if state.isLast {
-				if state.isLookaround() {
-					lastLookaroundInList = true
-				}
-				lastStateInList = true
-				lastStatePtr = state
-			}
-		}
-
-		if assertionFailed && numStatesMatched == 0 { // Nothing has matched and an assertion has failed
-			// If I'm being completely honest, I'm not sure why I have to check specifically for a _lookaround_
-			// state. The explanation below is my attempt to explain this behavior.
-			// If you replace 'lastLookaroundInList' with 'lastStateInList', one of the test cases fails.
-			//
-			// One of the states in our list was a last state and a lookaround. In this case, we
-			// don't abort upon failure of the assertion, because we have found
-			// another path to a final state.
-			// Even if the last state _was_ an assertion, we can use the previously
-			// saved indices to find a match.
-			if lastLookaroundInList {
-				break
-			} else {
-				if i == startingFrom {
-					i++
-				}
-				return false, []Group{}, i
-			}
-		}
-		// Check if we can find a state in our list that is:
-		// 	a. A last-state
-		// 	b. Empty
-		// 	c. Doesn't assert anything
-		for _, s := range currentStates {
-			if s.isLast && s.isEmpty && s.assert == noneAssert {
-				lastStatePtr = s
-				lastStateInList = true
-			}
-		}
-		if lastStateInList { // A last-state was in the list of states. add the matchIndex to our MatchIndex list
-			for j := 1; j < numGroups+1; j++ {
-				tempIndices[j] = lastStatePtr.threadGroups[j]
-			}
-			endIdx = i
-			tempIndices[0] = Group{startIdx, endIdx}
-		}
-
-		// Check if we can find a zero-length match
-		if foundPath == false {
-			if ok := zeroMatchPossible(str, i, numGroups, currentStates...); ok {
-				if tempIndices[0].IsValid() == false {
-					tempIndices[0] = Group{startIdx, startIdx}
-				}
-			}
-			// If we haven't moved in the string, increment the counter by 1
-			// to ensure we don't keep trying the same string over and over.
-			//			if i == startingFrom {
-			startIdx++
-			//	i++
+			//		if currentState.isKleene {
+			//			// Append the next-state (after the kleene), then append the kleene state
+			//			allMatches := make([]*nfaState, 0)
+			//			for _, v := range currentState.transitions {
+			//				allMatches = append(allMatches, v...)
 			//			}
-			if tempIndices.numValidGroups() > 0 && tempIndices[0].IsValid() {
-				if tempIndices[0].StartIdx == tempIndices[0].EndIdx { // If we have a zero-length match, we have to shift the index at which we start. Otherwise we keep looking at the same paert of the string over and over.
-					return true, tempIndices, tempIndices[0].EndIdx + 1
+			//			slices.Reverse(allMatches)
+			//			for _, m := range allMatches {
+			//				m.threadGroups = currentState.threadGroups
+			//				m.threadSP = idx
+			//			}
+			//			currentStates = append(currentStates, allMatches...)
+			//
+			//			//	kleeneState := currentState.kleeneState
+			//			//	kleeneState.threadGroups = currentState.threadGroups
+			//			//	kleeneState.threadSP = currentState.threadSP
+			//			//	currentStates = append(currentStates, kleeneState)
+			//			continue
+			//		}
+
+			// Alternation - enqueue left then right state, and continue
+			if currentState.isAlternation {
+				leftState := currentState.leftState
+				copyThread(leftState, currentState)
+				currentStates = append(currentStates, *currentState.leftState)
+				rightState := currentState.rightState
+				copyThread(rightState, currentState)
+				currentStates = append(currentStates, *currentState.rightState)
+				continue
+			}
+
+			// Empty state - enqueue next state, do _not_ increment the SP
+			if currentState.isEmpty && currentState.assert == noneAssert { //&& currentState.groupBegin == false && currentState.groupEnd == false {
+				isEmptyAndNoAssertion = true
+			}
+
+			if currentState.contentContains(str, idx) {
+				foundMatch = true
+			}
+
+			if isEmptyAndNoAssertion || foundMatch {
+				allMatches := make([]nfaState, 0)
+				for _, v := range currentState.transitions {
+					dereferenced := funcMap(v, func(s *nfaState) nfaState {
+						return *s
+					})
+					allMatches = append(allMatches, dereferenced...)
+				}
+				slices.Reverse(allMatches)
+				for i := range allMatches {
+					copyThread(&allMatches[i], currentState)
+					if foundMatch && currentState.assert == noneAssert {
+						allMatches[i].threadSP += 1
+					}
+				}
+				if currentState.groupBegin {
+					currentStates = slices.Insert(currentStates, currentStateIdx+1, allMatches...)
+				} else if currentState.groupEnd {
+					currentStates = append(currentStates, allMatches...)
 				} else {
-					return true, tempIndices, tempIndices[0].EndIdx
+					nextStates = append(nextStates, allMatches...)
 				}
 			}
-			return false, []Group{}, startIdx
-		}
-		currentStates = make([]*nfaState, len(tempStates))
-		copy(currentStates, tempStates)
-		tempStates = nil

-		i++
-	}
-
-	// End-of-string reached. Go to any 0-states, until there are no more 0-states to go to. Then check if any of our states are in the end position.
-	// This is the exact same algorithm used inside the loop, so I should probably put it in a function.
-	zeroStates, isZero := takeZeroState(currentStates, numGroups, i)
-	tempStates = append(tempStates, zeroStates...)
-	num_appended := 0 // Number of unique states addded to tempStates
-	for isZero == true {
-		zeroStates, isZero = takeZeroState(tempStates, numGroups, i)
-		tempStates, num_appended = uniqueAppend(tempStates, zeroStates...)
-		if num_appended == 0 { // Break if we haven't appended any more unique values
-			break
-		}
-	}
-
-	currentStates = append(currentStates, tempStates...)
-	tempStates = nil
-
-	for _, state := range currentStates {
-		// Only add the match if the start index is in bounds. If the state has an assertion,
-		// make sure the assertion checks out.
-		if state.isLast && i <= len(str) {
-			if state.assert == noneAssert || state.checkAssertion(str, i) {
-				for j := 1; j < numGroups+1; j++ {
-					tempIndices[j] = state.threadGroups[j]
+			if currentState.isLast && len(nextStates) == 0 { // Last state reached
+				if foundMatch {
+					if currentState.assert != noneAssert {
+						currentState.threadGroups[0].EndIdx = idx
+					} else {
+						currentState.threadGroups[0].EndIdx = idx + 1
+					}
+					if idx == currentState.threadGroups[0].StartIdx {
+						idx += 1
+					}
+					return true, currentState.threadGroups, idx
+				} else if isEmptyAndNoAssertion {
+					currentState.threadGroups[0].EndIdx = idx
+					if idx == currentState.threadGroups[0].StartIdx {
+						idx++
+					}
+					return true, currentState.threadGroups, idx
 				}
-				endIdx = i
-				tempIndices[0] = Group{startIdx, endIdx}
+
 			}
 		}
+		currentStates = append([]nfaState{}, nextStates...)
+		nextStates = nil
 	}
-
-	if tempIndices.numValidGroups() > 0 {
-		if tempIndices[0].StartIdx == tempIndices[0].EndIdx { // If we have a zero-length match, we have to shift the index at which we start. Otherwise we keep looking at the same paert of the string over and over.
-			return true, tempIndices, tempIndices[0].EndIdx + 1
-		} else {
-			return true, tempIndices, tempIndices[0].EndIdx
-		}
-	}
-	if startIdx == startingFrom { // Increment starting index if we haven't moved in the string. Prevents us from matching the same part of the string over and over.
-		startIdx++
-	}
-	return false, []Group{}, startIdx
+	return false, []Group{}, i + 1
+	//		zeroStates := make([]*nfaState, 0)
+	//		// Keep taking zero-states, until there are no more left to take
+	//		// Objective: If any of our current states have transitions to 0-states, replace them with the 0-state. Do this until there are no more transitions to 0-states, or there are no more unique 0-states to take.
+	//		topStateItem := currentStates.peek()
+	//		topState := topStateItem.(*priorQueueItem).state
+	//		zeroStates, isZero := takeZeroState([]*nfaState{topState}, numGroups, i)
+	//		tempStates = append(tempStates, zeroStates...)
+	//		num_appended := 0
+	//		for isZero == true {
+	//			zeroStates, isZero = takeZeroState(tempStates, numGroups, i)
+	//			tempStates, num_appended = uniqueAppend(tempStates, zeroStates...)
+	//			if num_appended == 0 { // Break if we haven't appended any more unique values
+	//				break
+	//			}
+	//		}
+	//		if isZero == true {
+	//			currentStates.Pop()
+	//		}
+	//
+	//		for _, state := range tempStates {
+	//			heap.Push(currentStates, newPriorQueueItem(state))
+	//		}
+	//		tempStates = nil
+	//
+	//		// Take any transitions corresponding to current character
+	//		numStatesMatched := 0            // The number of states which had at least 1 match for this round
+	//		assertionFailed := false         // Whether or not an assertion failed for this round
+	//		lastStateInList := false         // Whether or not a last state was in our list of states
+	//		var lastStatePtr *nfaState = nil // Pointer to the last-state, if it was found
+	//		lastLookaroundInList := false    // Whether or not a last state (that is a lookaround) was in our list of states
+	//		for numStatesMatched == 0 && lastStateInList == false {
+	//			if currentStates.Len() == 0 {
+	//				break
+	//			}
+	//			stateItem := heap.Pop(currentStates)
+	//			state := stateItem.(*priorQueueItem).state
+	//			matches, numMatches := state.matchesFor(str, i)
+	//			if numMatches > 0 {
+	//				numStatesMatched++
+	//				tempStates = append([]*nfaState(nil), matches...)
+	//				foundPath = true
+	//				for _, m := range matches {
+	//					if m.threadGroups == nil {
+	//						m.threadGroups = newMatch(numGroups + 1)
+	//					}
+	//					m.threadSP = state.threadSP + 1
+	//					copy(m.threadGroups, state.threadGroups)
+	//				}
+	//			}
+	//			if numMatches < 0 {
+	//				assertionFailed = true
+	//			}
+	//			if state.isLast {
+	//				if state.isLookaround() {
+	//					lastLookaroundInList = true
+	//				}
+	//				lastStateInList = true
+	//				lastStatePtr = state
+	//			}
+	//		}
+	//
+	//		if assertionFailed && numStatesMatched == 0 { // Nothing has matched and an assertion has failed
+	//			// If I'm being completely honest, I'm not sure why I have to check specifically for a _lookaround_
+	//			// state. The explanation below is my attempt to explain this behavior.
+	//			// If you replace 'lastLookaroundInList' with 'lastStateInList', one of the test cases fails.
+	//			//
+	//			// One of the states in our list was a last state and a lookaround. In this case, we
+	//			// don't abort upon failure of the assertion, because we have found
+	//			// another path to a final state.
+	//			// Even if the last state _was_ an assertion, we can use the previously
+	//			// saved indices to find a match.
+	//			if lastLookaroundInList {
+	//				break
+	//			} else {
+	//				if i == startingFrom {
+	//					i++
+	//				}
+	//				return false, []Group{}, i
+	//			}
+	//		}
+	//		// Check if we can find a state in our list that is:
+	//		// 	a. A last-state
+	//		// 	b. Empty
+	//		// 	c. Doesn't assert anything
+	//		for _, stateItem := range *currentStates {
+	//			s := stateItem.state
+	//			if s.isLast && s.isEmpty && s.assert == noneAssert {
+	//				lastStatePtr = s
+	//				lastStateInList = true
+	//			}
+	//		}
+	//		if lastStateInList && numStatesMatched == 0 { // A last-state was in the list of states. add the matchIndex to our MatchIndex list
+	//			for j := 1; j < numGroups+1; j++ {
+	//				tempIndices[j] = lastStatePtr.threadGroups[j]
+	//			}
+	//			endIdx = i
+	//			tempIndices[0] = Group{startIdx, endIdx}
+	//			if tempIndices[0].StartIdx == tempIndices[0].EndIdx {
+	//				return true, tempIndices, tempIndices[0].EndIdx + 1
+	//			} else {
+	//				return true, tempIndices, tempIndices[0].EndIdx
+	//			}
+	//		}
+	//
+	//		// Check if we can find a zero-length match
+	//		if foundPath == false {
+	//			currentStatesList := funcMap(*currentStates, func(item *priorQueueItem) *nfaState {
+	//				return item.state
+	//			})
+	//			if ok := zeroMatchPossible(str, i, numGroups, currentStatesList...); ok {
+	//				if tempIndices[0].IsValid() == false {
+	//					tempIndices[0] = Group{startIdx, startIdx}
+	//				}
+	//			}
+	//			// If we haven't moved in the string, increment the counter by 1
+	//			// to ensure we don't keep trying the same string over and over.
+	//			//			if i == startingFrom {
+	//			startIdx++
+	//			//	i++
+	//			//			}
+	//			if tempIndices.numValidGroups() > 0 && tempIndices[0].IsValid() {
+	//				if tempIndices[0].StartIdx == tempIndices[0].EndIdx { // If we have a zero-length match, we have to shift the index at which we start. Otherwise we keep looking at the same paert of the string over and over.
+	//					return true, tempIndices, tempIndices[0].EndIdx + 1
+	//				} else {
+	//					return true, tempIndices, tempIndices[0].EndIdx
+	//				}
+	//			}
+	//			return false, []Group{}, startIdx
+	//		}
+	//		currentStates = &priorityQueue{}
+	//		slices.Reverse(tempStates)
+	//		for _, state := range tempStates {
+	//			heap.Push(currentStates, newPriorQueueItem(state))
+	//		}
+	//		tempStates = nil
+	//
+	//		i++
+	//	}
+	//
+	// // End-of-string reached. Go to any 0-states, until there are no more 0-states to go to. Then check if any of our states are in the end position.
+	// // This is the exact same algorithm used inside the loop, so I should probably put it in a function.
+	//
+	//	if currentStates.Len() > 0 {
+	//		topStateItem := currentStates.peek()
+	//		topState := topStateItem.(*priorQueueItem).state
+	//		zeroStates, isZero := takeZeroState([]*nfaState{topState}, numGroups, i)
+	//		tempStates = append(tempStates, zeroStates...)
+	//		num_appended := 0 // Number of unique states addded to tempStates
+	//		for isZero == true {
+	//			zeroStates, isZero = takeZeroState(tempStates, numGroups, i)
+	//			tempStates, num_appended = uniqueAppend(tempStates, zeroStates...)
+	//			if num_appended == 0 { // Break if we haven't appended any more unique values
+	//				break
+	//			}
+	//		}
+	//	}
+	//
+	//	for _, state := range tempStates {
+	//		heap.Push(currentStates, newPriorQueueItem(state))
+	//	}
+	//
+	// tempStates = nil
+	//
+	//	for _, stateItem := range *currentStates {
+	//		state := stateItem.state
+	//		// Only add the match if the start index is in bounds. If the state has an assertion,
+	//		// make sure the assertion checks out.
+	//		if state.isLast && i <= len(str) {
+	//			if state.assert == noneAssert || state.checkAssertion(str, i) {
+	//				for j := 1; j < numGroups+1; j++ {
+	//					tempIndices[j] = state.threadGroups[j]
+	//				}
+	//				endIdx = i
+	//				tempIndices[0] = Group{startIdx, endIdx}
+	//			}
+	//		}
+	//	}
+	//
+	//	if tempIndices.numValidGroups() > 0 {
+	//		if tempIndices[0].StartIdx == tempIndices[0].EndIdx { // If we have a zero-length match, we have to shift the index at which we start. Otherwise we keep looking at the same paert of the string over and over.
+	//			return true, tempIndices, tempIndices[0].EndIdx + 1
+	//		} else {
+	//			return true, tempIndices, tempIndices[0].EndIdx
+	//		}
+	//	}
+	//
+	// if startIdx == startingFrom { // Increment starting index if we haven't moved in the string. Prevents us from matching the same part of the string over and over.
+	//
+	//		startIdx++
+	//	}
+	//
+	// return false, []Group{}, startIdx
 }
--- a/regex/nfa.go
+++ b/regex/nfa.go
@@ -31,6 +31,10 @@ type nfaState struct {
 	output                     []*nfaState         // The outputs of the current state ie. the 'outward arrows'. A union operator state will have more than one of these.
 	transitions                map[int][]*nfaState // Transitions to different states (maps a character (int representation) to a _list of states. This is useful if one character can lead multiple states eg. ab|aa)
 	isKleene                   bool                // Identifies whether current node is a 0-state representing Kleene star
+	isQuestion                 bool                // Identifies whether current node is a 0-state representing the question operator
+	isAlternation              bool                // Identifies whether current node is a 0-state representing an alternation
+	leftState                  *nfaState           // Only for alternation states - the 'left' branch of the alternation
+	rightState                 *nfaState           // Only for alternation states - the 'right' branch of the alternation
 	assert                     assertType          // Type of assertion of current node - NONE means that the node doesn't assert anything
 	allChars                   bool                // Whether or not the state represents all characters (eg. a 'dot' metacharacter). A 'dot' node doesn't store any contents directly, as it would take up too much space
 	except                     []rune              // Only valid if allChars is true - match all characters _except_ the ones in this block. Useful for inverting character classes.
@@ -43,6 +47,7 @@ type nfaState struct {
 	// The following properties depend on the current match - I should think about resetting them for every match.
 	zeroMatchFound bool    // Whether or not the state has been used for a zero-length match - only relevant for zero states
 	threadGroups   []Group // Assuming that a state is part of a 'thread' in the matching process, this array stores the indices of capturing groups in the current thread. As matches are found for this state, its groups will be copied over.
+	threadSP       int     // The string pointer of the thread - where it is in the input string
 }

 // Clones the NFA starting from the given state.
@@ -70,6 +75,8 @@ func cloneStateHelper(stateToClone *nfaState, cloneMap map[*nfaState]*nfaState)
 		output:          make([]*nfaState, len(stateToClone.output)),
 		transitions:     make(map[int][]*nfaState),
 		isKleene:        stateToClone.isKleene,
+		isQuestion:      stateToClone.isQuestion,
+		isAlternation:   stateToClone.isAlternation,
 		assert:          stateToClone.assert,
 		zeroMatchFound:  stateToClone.zeroMatchFound,
 		allChars:        stateToClone.allChars,
@@ -101,6 +108,14 @@ func cloneStateHelper(stateToClone *nfaState, cloneMap map[*nfaState]*nfaState)
 		clone.lookaroundNFA = clone
 	}
 	clone.lookaroundNFA = cloneStateHelper(stateToClone.lookaroundNFA, cloneMap)
+	if stateToClone.leftState == stateToClone {
+		clone.leftState = clone
+	}
+	clone.leftState = cloneStateHelper(stateToClone.leftState, cloneMap)
+	if stateToClone.rightState == stateToClone {
+		clone.rightState = clone
+	}
+	clone.rightState = cloneStateHelper(stateToClone.rightState, cloneMap)
 	return clone
 }

@@ -116,6 +131,7 @@ func resetThreadsHelper(state *nfaState, visitedMap map[*nfaState]bool) {
 	}
 	// Assuming it hasn't been visited
 	state.threadGroups = nil
+	state.threadSP = 0
 	visitedMap[state] = true
 	for _, v := range state.transitions {
 		for _, nextState := range v {
@@ -207,6 +223,9 @@ func (s nfaState) contentContains(str []rune, idx int) bool {
 	if s.assert != noneAssert {
 		return s.checkAssertion(str, idx)
 	}
+	if idx >= len(str) {
+		return false
+	}
 	if s.allChars {
 		return !slices.Contains(slices.Concat(notDotChars, s.except), str[idx]) // Return true only if the index isn't a 'notDotChar', or isn't one of the exception characters for the current node.
 	}
@@ -306,12 +325,16 @@ func kleene(s1 nfaState) (*nfaState, error) {
 		return nil, fmt.Errorf("previous token is not quantifiable")
 	}

-	toReturn := &nfaState{}
-	toReturn.transitions = make(map[int][]*nfaState)
-	toReturn.content = newContents(epsilon)
+	emptyState := zeroLengthMatchState()
+	emptyState.assert = noneAssert
+	toReturn := alternate(&s1, &emptyState)
+
+	//	toReturn := &nfaState{}
+	//	toReturn.transitions = make(map[int][]*nfaState)
+	//	toReturn.content = newContents(epsilon)
 	toReturn.isEmpty = true
 	toReturn.isKleene = true
-	toReturn.output = append(toReturn.output, toReturn)
+	toReturn.output = []*nfaState{&emptyState}
 	for i := range s1.output {
 		for _, c := range toReturn.content {
 			s1.output[i].transitions[c], _ = uniqueAppend(s1.output[i].transitions[c], toReturn)
@@ -320,6 +343,7 @@ func kleene(s1 nfaState) (*nfaState, error) {
 	for _, c := range s1.content {
 		toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], &s1)
 	}
+	//toReturn.kleeneState = &s1
 	return toReturn, nil
 }

@@ -341,6 +365,9 @@ func alternate(s1 *nfaState, s2 *nfaState) *nfaState {
 	}
 	toReturn.content = newContents(epsilon)
 	toReturn.isEmpty = true
+	toReturn.isAlternation = true
+	toReturn.leftState = s1
+	toReturn.rightState = s2

 	return toReturn
 }
--- a/regex/priorityQueue.go
+++ b/regex/priorityQueue.go
@@ -0,0 +1,89 @@
+package regex
+
+import "container/heap"
+
+// Implement a priority queue using container/heap
+
+const (
+	min_priority int = iota
+	zerostate_priority
+	alternation_priority
+	kleene_priority
+	char_priority
+	max_priority
+)
+
+func getPriority(state *nfaState) int {
+	if state.isKleene {
+		return zerostate_priority
+	} else if state.isAlternation {
+		return alternation_priority
+	} else {
+		if state.isEmpty {
+			return zerostate_priority
+		} else {
+			return char_priority
+		}
+	}
+}
+
+type priorQueueItem struct {
+	state    *nfaState
+	priority int
+	index    int
+}
+
+func newPriorQueueItem(state *nfaState) *priorQueueItem {
+	return &priorQueueItem{
+		state:    state,
+		index:    -1,
+		priority: getPriority(state),
+	}
+}
+
+type priorityQueue []*priorQueueItem
+
+func (pq priorityQueue) Len() int {
+	return len(pq)
+}
+
+func (pq priorityQueue) Less(i, j int) bool {
+	if pq[i].priority == pq[j].priority {
+		return pq[i].index < pq[j].index
+	}
+	return pq[i].priority > pq[j].priority // We want max-heap, so we use greater-than
+}
+
+func (pq priorityQueue) Swap(i, j int) {
+	pq[i], pq[j] = pq[j], pq[i]
+	pq[i].index = i
+	pq[j].index = j
+}
+
+func (pq *priorityQueue) Push(x any) {
+	length := len(*pq)
+	item := x.(*priorQueueItem)
+	item.index = length
+	*pq = append(*pq, item)
+}
+
+func (pq *priorityQueue) Pop() any {
+	old := *pq
+	n := len(old)
+	item := old[n-1]
+	old[n-1] = nil
+	item.index = -1
+	*pq = old[0 : n-1]
+	return item
+}
+func (pq *priorityQueue) peek() any {
+	queue := *pq
+	n := len(queue)
+	return queue[n-1]
+}
+
+func (pq *priorityQueue) update(item *priorQueueItem, value *nfaState, priority int) {
+	item.state = value
+	item.priority = priority
+	heap.Fix(pq, item.index)
+}
--- a/regex/re_test.go
+++ b/regex/re_test.go
@@ -701,7 +701,7 @@ func TestFind(t *testing.T) {
 					if len(test.result) == 0 {
 						return // Manually pass the test, because this is the expected behavior
 					} else {
-						t.Errorf("Wanted no match Got %v\n", groupIndex)
+						t.Errorf("Wanted %v Got no matches\n", test.result)
 					}
 				} else {
 					if groupIndex != test.result[0] {
Author	SHA1	Message	Date
Aadhavan Srinivasan	858e535fba	Continued implementing Thompson's algorithm	2025-02-05 18:01:36 -05:00
Aadhavan Srinivasan	7c62ba6bfd	Started implementing Thompson's algorithm for matching, because the old one was completely backtracking (so it would enter infinite loops on something like '(a*)*' ) The git diff claims that a ton of code was changed, but most of it was just indentation changes.	2025-02-05 12:21:12 -05:00
Aadhavan Srinivasan	d4e8cb74fd	Replaced pointer to nfaState with nfaState	2025-02-05 11:32:20 -05:00
Aadhavan Srinivasan	3ce611d121	More work towards implementing PCRE matching	2025-02-04 14:09:24 -05:00
Aadhavan Srinivasan	e0253dfaf3	Change kleene() to an alternation-style construct	2025-02-04 14:09:04 -05:00
Aadhavan Srinivasan	753e973d82	Started rewrite of matching algorithm, got concatenation and alternation done, kleene and zero-state stuff is next	2025-02-03 22:01:52 -05:00
Aadhavan Srinivasan	5563a70568	Reverse the order in which I pop states for alternation, because this messes with the left branch-right branch thing	2025-02-03 21:59:41 -05:00
Aadhavan Srinivasan	de0d7345a8	Store left and right branches of alternation separately	2025-02-03 21:59:05 -05:00
Aadhavan Srinivasan	ad273b0c68	Trying to emulate backtracking by using string pointers within threads (something similar to rsc's 2nd regexp article)	2025-02-03 16:50:11 -05:00
Aadhavan Srinivasan	e167cdb2cb	Fixed mistake in test output	2025-02-03 16:49:30 -05:00
Aadhavan Srinivasan	1fd48ae614	Store the current string pointer as a 'thread variable' (allows us to simulate backtracking)	2025-02-03 16:49:10 -05:00
Aadhavan Srinivasan	09812956ac	Disable all optimizations	2025-02-03 16:48:09 -05:00
Aadhavan Srinivasan	fbc9dfcc95	Trying something out; we'll see if it works	2025-02-03 16:47:53 -05:00
Aadhavan Srinivasan	bc32e0cb76	Started working on converting to PCRE matching rules (prefer left branch of alternation)	2025-02-03 14:06:14 -05:00
Aadhavan Srinivasan	ad0f7d0178	Added new state fields to tell if a state is a question or alternation	2025-02-03 14:05:53 -05:00
Aadhavan Srinivasan	4e597f8eb1	Implemented a priority-queue to use while matching	2025-02-03 14:05:30 -05:00