From 360bdc8e112558ec352059e9ae135a993d8fcbc7 Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Thu, 31 Oct 2024 17:02:13 -0400 Subject: [PATCH] Big rewrite - assertion handling, zero-match fixes, change in recursive calls I added support for transitions. I wrote a function to determine if a given state has transitions for a character at a given point in the string. This helps me check if the current state has an assertion, and take action based on that. I also fixed zero-length matching (almost, see todo.txt). It works for nearly all cases I could think of, although I still need to write more tests. I wrote a function to check if zero-length matches are possible with a given state. I also changed the way recursive calls work. Rather than passing a modified string, the function stores the location in the input string. This location is updated with each call to the function. Finally, the function now increments the offset by 1 instead of incrementing by the length of the longest match. This leads to a bit of overhead: e.g., if a regex matches indices 1-5, then 1-5, 2-5, 3-5 and 4-5 are all stored. To fix this, I wrote (and used) a function to check if a match overlaps with any matches in a slice. --- matching.go | 116 ++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 95 insertions(+), 21 deletions(-) diff --git a/matching.go b/matching.go index fcc972f..94a6ea0 100644 --- a/matching.go +++ b/matching.go @@ -6,6 +6,21 @@ type matchIndex struct { endIdx int } +// Returns true if the given matchIndex is an improper subset of any of the indices in the slice. +// When we add an index to our slice, we want to make sure a larger match isn't already present. 
+func overlaps(idx matchIndex, idxes []matchIndex) bool { + for _, val := range idxes { + if idx.startIdx >= val.startIdx && idx.endIdx <= val.endIdx { + // A zero-length match doesn't overlap if it is located at the start or end + // of the other match + if !(idx.startIdx == idx.endIdx && (idx.startIdx == val.startIdx || idx.startIdx == val.endIdx)) { + return true + } + } + } + return false +} + // takeZeroState takes the 0-state (if such a transition exists) for all states in the // given slice. It returns the resulting states. If any of the resulting states is a 0-state, // the second parameter is true. @@ -23,31 +38,66 @@ func takeZeroState(states []*State) (rtv []*State, isZero bool) { return rtv, false } +// zeroMatchPossible returns true if a zero-length match is possible +// from any of the given states. +// It uses the same algorithm to find zero-states as the one inside the loop, +// so I should probably put it in a function. +func zeroMatchPossible(states ...*State) bool { + zerostates, iszero := takeZeroState(states) + tempstates := make([]*State, 0) + tempstates = append(tempstates, states...) + tempstates = append(tempstates, zerostates...) + num_appended := 0 // number of unique states addded to tempstates + for iszero == true { + zerostates, iszero = takeZeroState(tempstates) + tempstates, num_appended = unique_append(tempstates, zerostates...) 
+ if num_appended == 0 { // break if we haven't appended any more unique values + break + } + } + for _, state := range tempstates { + if state.isEmpty && state.assert == NONE && state.isLast { + return true + } + } + return false +} + // findAllMatches tries to findAllMatches the regex represented by given start-state, with // the given string func findAllMatches(start *State, str string) (indices []matchIndex) { return findAllMatchesHelper(start, str, make([]matchIndex, 0), 0) } func findAllMatchesHelper(start *State, str string, indices []matchIndex, offset int) []matchIndex { - // 'Base case' - exit if string is empty. - if len(str) == 0 { - // If the start is a Kleene star, then it should also match an empty string. - if start.isKleene && start.isLast { - indices, _ = unique_append(indices, matchIndex{offset, offset}) + // Base case - exit if offset exceeds string's length + if offset > len(str) { + return indices + } + // 'Base case' - if we are at the end of the string, check if we can add a zero-length match + if offset == len(str) { + // Get all zero-state matches. If we can get to a zero-state without matching anything, we + // can add a zero-length match. This is all true only if the start state itself matches nothing. 
+ if start.isEmpty && start.assert == NONE { + if zeroMatchPossible(start) { + if !overlaps(matchIndex{offset, offset}, indices) { + indices, _ = unique_append(indices, matchIndex{offset, offset}) + } + } } return indices } foundPath := false - startIdx := 0 - endIdx := 0 + startIdx := offset + endIdx := offset currentStates := make([]*State, 0) tempStates := make([]*State, 0) // Used to store states that should be used in next loop iteration - i := 0 // Index in string + i := offset // Index in string startingFrom := i // Store starting index + // Increment until we hit a character matching the start state (assuming not 0-state) if start.isEmpty == false { - for i < len(str) && !start.content.contains(int(str[i])) { + for i < len(str) && !start.contentContains([]rune(str), i) { i++ } startIdx = i @@ -83,21 +133,41 @@ func findAllMatchesHelper(start *State, str string, indices []matchIndex, offset tempStates = nil // Take any transitions corresponding to current character + numStatesMatched := 0 // The number of states which had at least 1 match for this round + assertionFailed := false // Whether or not an assertion failed for this round for _, state := range currentStates { - if len(state.transitions[int(str[i])]) > 0 { - tempStates = append(tempStates, state.transitions[int(str[i])]...) + matches, numMatches := state.matchesFor([]rune(str), i) + if numMatches > 0 { + numStatesMatched++ + tempStates = append(tempStates, matches...) foundPath = true } + if numMatches < 0 { + assertionFailed = true + } if state.isLast { endIdx = i - tempIndices, _ = unique_append(tempIndices, matchIndex{startIdx + offset, endIdx + offset}) + tempIndices, _ = unique_append(tempIndices, matchIndex{startIdx, endIdx}) } } - // Recursion - match with rest of string if we have nowhere to go. 
If we haven't moved in the string, increment the counter by 1 to ensure we don't keep trying the same string over and over - if foundPath == false { + if assertionFailed && numStatesMatched == 0 { // Nothing has matched and an assertion has failed - bort if i == startingFrom { i++ } + return findAllMatchesHelper(start, str, indices, i) + } + // Recursion - match with rest of string if we have nowhere to go. + // First check if we can find a zero-length match + if foundPath == false { + if zeroMatchPossible(currentStates...) { + tempIndices, _ = unique_append(tempIndices, matchIndex{startIdx, startIdx}) + } + // If we haven't moved in the string, increment the counter by 1 + // to ensure we don't keep trying the same string over and over. + // if i == startingFrom { + startIdx++ + // i++ + // } // Get the maximum index-range from the list if len(tempIndices) > 0 { indexToAdd := Reduce(tempIndices, func(i1 matchIndex, i2 matchIndex) matchIndex { @@ -108,9 +178,11 @@ func findAllMatchesHelper(start *State, str string, indices []matchIndex, offset } return i2 }) - indices, _ = unique_append(indices, indexToAdd) + if !overlaps(indexToAdd, indices) { + indices, _ = unique_append(indices, indexToAdd) + } } - return findAllMatchesHelper(start, str[i:], indices, offset+i) + return findAllMatchesHelper(start, str, indices, startIdx) } currentStates = make([]*State, len(tempStates)) copy(currentStates, tempStates) @@ -137,9 +209,9 @@ func findAllMatchesHelper(start *State, str string, indices []matchIndex, offset for _, state := range currentStates { // Only add the match if the start index is in bounds - if state.isLast && startIdx+offset < len(str)+offset { + if state.isLast && startIdx < len(str) { endIdx = i - tempIndices, _ = unique_append(tempIndices, matchIndex{startIdx + offset, endIdx + offset}) + tempIndices, _ = unique_append(tempIndices, matchIndex{startIdx, endIdx}) } } // Get the maximum index-range from the list @@ -152,9 +224,11 @@ func 
findAllMatchesHelper(start *State, str string, indices []matchIndex, offset } return i2 }) - indices, _ = unique_append(indices, indexToAdd) + if !overlaps(indexToAdd, indices) { + indices, _ = unique_append(indices, indexToAdd) + } } - // Default - return indices + // Default - call on empty string to get any trailing zero-length matches + return findAllMatchesHelper(start, str, indices, startIdx+1) }