From 360bdc8e112558ec352059e9ae135a993d8fcbc7 Mon Sep 17 00:00:00 2001
From: Aadhavan Srinivasan <aadhavan@twomorecents.org>
Date: Thu, 31 Oct 2024 17:02:13 -0400
Subject: [PATCH] Big rewrite - assertion handling, zero-match fixes, change in
 recursive calls

I added support for assertions. I wrote a function to determine whether
a given state has transitions for the character at a given position in
the string. This lets me check whether the current state has an
assertion, and act accordingly.

I also fixed zero-length matching (almost, see todo.txt). It works for
nearly all cases I could think of, although I still need to write more
tests. I wrote a function to check if zero-length matches are possible
with a given state.

I also changed the way recursive calls work. Rather than passing a
modified string, the function stores the location in the input string.
This location is updated with each call to the function.

Finally, the function now increments the offset by 1 instead of
incrementing by the length of the longest match. This adds a bit of
overhead, e.g. if a regex matches indices 1-5, then 1-5, 2-5, 3-5 and
4-5 would all be stored. To fix this, I wrote (and used) a function that
checks whether a match overlaps with any match already in a slice.
---
 matching.go | 116 ++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 95 insertions(+), 21 deletions(-)

diff --git a/matching.go b/matching.go
index fcc972f..94a6ea0 100644
--- a/matching.go
+++ b/matching.go
@@ -6,6 +6,21 @@ type matchIndex struct {
 	endIdx   int
 }
 
+// overlaps reports whether idx lies within (or is equal to) the range of any matchIndex in idxes.
+// It is used before adding a match, to make sure an equal or larger match isn't already present.
+func overlaps(idx matchIndex, idxes []matchIndex) bool {
+	for _, val := range idxes {
+		if idx.startIdx >= val.startIdx && idx.endIdx <= val.endIdx {
+			// Exception: a zero-length idx sitting exactly on the start or end of val
+			// is not counted as an overlap.
+			if !(idx.startIdx == idx.endIdx && (idx.startIdx == val.startIdx || idx.startIdx == val.endIdx)) {
+				return true
+			}
+		}
+	}
+	return false
+}
+
 // takeZeroState takes the 0-state (if such a transition exists) for all states in the
 // given slice. It returns the resulting states. If any of the resulting states is a 0-state,
 // the second parameter is true.
@@ -23,31 +38,66 @@ func takeZeroState(states []*State) (rtv []*State, isZero bool) {
 	return rtv, false
 }
 
+// zeroMatchPossible reports whether a zero-length match is possible from any of the
+// given states. It repeatedly takes 0-state transitions (via takeZeroState) and returns true
+// if a given or reached state is empty, carries no assertion (assert == NONE) and is a last state.
+// TODO: this duplicates the zero-state expansion done inside the matching loop; share a helper.
+func zeroMatchPossible(states ...*State) bool {
+	zerostates, iszero := takeZeroState(states)
+	tempstates := make([]*State, 0)
+	tempstates = append(tempstates, states...)
+	tempstates = append(tempstates, zerostates...)
+	num_appended := 0 // number of unique states added to tempstates
+	for iszero == true {
+		zerostates, iszero = takeZeroState(tempstates)
+		tempstates, num_appended = unique_append(tempstates, zerostates...)
+		if num_appended == 0 { // stop once an expansion round adds no new unique states
+			break
+		}
+	}
+	for _, state := range tempstates {
+		if state.isEmpty && state.assert == NONE && state.isLast {
+			return true
+		}
+	}
+	return false
+}
+
 // findAllMatches tries to findAllMatches the regex represented by given start-state, with
 // the given string
 func findAllMatches(start *State, str string) (indices []matchIndex) {
 	return findAllMatchesHelper(start, str, make([]matchIndex, 0), 0)
 }
 func findAllMatchesHelper(start *State, str string, indices []matchIndex, offset int) []matchIndex {
-	// 'Base case' - exit if string is empty.
-	if len(str) == 0 {
-		// If the start is a Kleene star, then it should also match an empty string.
-		if start.isKleene && start.isLast {
-			indices, _ = unique_append(indices, matchIndex{offset, offset})
+	// Base case - exit if offset exceeds string's length
+	if offset > len(str) {
+		return indices
+	}
+	// 'Base case' - if we are at the end of the string, check if we can add a zero-length match
+	if offset == len(str) {
+		// Get all zero-state matches. If we can get to a zero-state without matching anything, we
+		// can add a zero-length match. This is all true only if the start state itself matches nothing.
+		if start.isEmpty && start.assert == NONE {
+			if zeroMatchPossible(start) {
+				if !overlaps(matchIndex{offset, offset}, indices) {
+					indices, _ = unique_append(indices, matchIndex{offset, offset})
+				}
+			}
 		}
 		return indices
 	}
 
 	foundPath := false
-	startIdx := 0
-	endIdx := 0
+	startIdx := offset
+	endIdx := offset
 	currentStates := make([]*State, 0)
 	tempStates := make([]*State, 0) // Used to store states that should be used in next loop iteration
-	i := 0                          // Index in string
+	i := offset                     // Index in string
 	startingFrom := i               // Store starting index
+
 	// Increment until we hit a character matching the start state (assuming not 0-state)
 	if start.isEmpty == false {
-		for i < len(str) && !start.content.contains(int(str[i])) {
+		for i < len(str) && !start.contentContains([]rune(str), i) {
 			i++
 		}
 		startIdx = i
@@ -83,21 +133,41 @@ func findAllMatchesHelper(start *State, str string, indices []matchIndex, offset
 		tempStates = nil
 
 		// Take any transitions corresponding to current character
+		numStatesMatched := 0    // The number of states which had at least 1 match for this round
+		assertionFailed := false // Whether or not an assertion failed for this round
 		for _, state := range currentStates {
-			if len(state.transitions[int(str[i])]) > 0 {
-				tempStates = append(tempStates, state.transitions[int(str[i])]...)
+			matches, numMatches := state.matchesFor([]rune(str), i)
+			if numMatches > 0 {
+				numStatesMatched++
+				tempStates = append(tempStates, matches...)
 				foundPath = true
 			}
+			if numMatches < 0 {
+				assertionFailed = true
+			}
 			if state.isLast {
 				endIdx = i
-				tempIndices, _ = unique_append(tempIndices, matchIndex{startIdx + offset, endIdx + offset})
+				tempIndices, _ = unique_append(tempIndices, matchIndex{startIdx, endIdx})
 			}
 		}
-		// Recursion - match with rest of string if we have nowhere to go. If we haven't moved in the string, increment the counter by 1 to ensure we don't keep trying the same string over and over
-		if foundPath == false {
+		if assertionFailed && numStatesMatched == 0 { // Nothing has matched and an assertion has failed - abort
 			if i == startingFrom {
 				i++
 			}
+			return findAllMatchesHelper(start, str, indices, i)
+		}
+		// Recursion - match with rest of string if we have nowhere to go.
+		// First check if we can find a zero-length match
+		if foundPath == false {
+			if zeroMatchPossible(currentStates...) {
+				tempIndices, _ = unique_append(tempIndices, matchIndex{startIdx, startIdx})
+			}
+			// If we haven't moved in the string, increment the counter by 1
+			// to ensure we don't keep trying the same string over and over.
+			//			if i == startingFrom {
+			startIdx++
+			//	i++
+			//			}
 			// Get the maximum index-range from the list
 			if len(tempIndices) > 0 {
 				indexToAdd := Reduce(tempIndices, func(i1 matchIndex, i2 matchIndex) matchIndex {
@@ -108,9 +178,11 @@ func findAllMatchesHelper(start *State, str string, indices []matchIndex, offset
 					}
 					return i2
 				})
-				indices, _ = unique_append(indices, indexToAdd)
+				if !overlaps(indexToAdd, indices) {
+					indices, _ = unique_append(indices, indexToAdd)
+				}
 			}
-			return findAllMatchesHelper(start, str[i:], indices, offset+i)
+			return findAllMatchesHelper(start, str, indices, startIdx)
 		}
 		currentStates = make([]*State, len(tempStates))
 		copy(currentStates, tempStates)
@@ -137,9 +209,9 @@ func findAllMatchesHelper(start *State, str string, indices []matchIndex, offset
 
 	for _, state := range currentStates {
 		// Only add the match if the start index is in bounds
-		if state.isLast && startIdx+offset < len(str)+offset {
+		if state.isLast && startIdx < len(str) {
 			endIdx = i
-			tempIndices, _ = unique_append(tempIndices, matchIndex{startIdx + offset, endIdx + offset})
+			tempIndices, _ = unique_append(tempIndices, matchIndex{startIdx, endIdx})
 		}
 	}
 	// Get the maximum index-range from the list
@@ -152,9 +224,11 @@ func findAllMatchesHelper(start *State, str string, indices []matchIndex, offset
 			}
 			return i2
 		})
-		indices, _ = unique_append(indices, indexToAdd)
+		if !overlaps(indexToAdd, indices) {
+			indices, _ = unique_append(indices, indexToAdd)
+		}
 	}
 
-	// Default
-	return indices
+	// Default - recurse from the offset after startIdx to pick up any remaining matches (e.g. trailing zero-length ones)
+	return findAllMatchesHelper(start, str, indices, startIdx+1)
 }