From 137ea3c746cac224092a6aa9fc89b64564652db3 Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Thu, 7 Nov 2024 16:16:50 -0500 Subject: [PATCH] Made findAllMatchesHelper non-recursive, added pruneIndices (improved performance) and more changes I made findAllMatchesHelper a non-recursive function. It now only returns the first match it finds in the string (so I should probably rename it). These indices are collected by findAllMatches and pruned (to remove overlaps). The overlap function has also been rewritten, to make it (I believe) less than O(n^2). I also used the uniq_arr type to make checking for uniqueness O(1) instead of O(n) (as it was with unique_append()). This has resulted in massive performance gains. There have been a lot of changes here, and I probably haven't documented all of them. --- matching.go | 131 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 82 insertions(+), 49 deletions(-) diff --git a/matching.go b/matching.go index 83f0000..8cbfb60 100644 --- a/matching.go +++ b/matching.go @@ -1,25 +1,17 @@ package main -// a matchIndex represents a match. It contains the start index and end index of the match -type matchIndex struct { +import "sort" + +// a MatchIndex represents a match/group. It contains the start index and end index of the match +type MatchIndex struct { startIdx int endIdx int } -// Returns true if the given matchIndex has ovelrap with any of the indices in the slice. -// When we add an index to our slice, we want to make sure a larger match isn't already present. 
-func overlaps(idx matchIndex, idxes []matchIndex) bool { - for _, val := range idxes { - if (idx.startIdx > val.startIdx && idx.startIdx < val.endIdx) || (idx.endIdx > val.startIdx && idx.endIdx < val.endIdx) { - // A zero-length match doesn't overlap if it is located at the start or end - // of the other match - if !(idx.startIdx == idx.endIdx && (idx.startIdx == val.startIdx || idx.startIdx == val.endIdx)) { - return true - } - } - } - return false -} +// A Match represents multiple matchIndices. Specifically, it maps an integer (representing the capturing group) +// to the matchIndex of that group. +// Group 0 corresponds to the entire match. +type Match map[int]MatchIndex // takeZeroState takes the 0-state (if such a transition exists) for all states in the // given slice. It returns the resulting states. If any of the resulting states is a 0-state, @@ -44,7 +36,7 @@ func takeZeroState(states []*State) (rtv []*State, isZero bool) { // so I should probably put it in a function. func zeroMatchPossible(states ...*State) bool { zerostates, iszero := takeZeroState(states) - tempstates := make([]*State, 0) + tempstates := make([]*State, 0, len(zerostates)+len(states)) tempstates = append(tempstates, states...) tempstates = append(tempstates, zerostates...) num_appended := 0 // number of unique states addded to tempstates @@ -63,15 +55,60 @@ func zeroMatchPossible(states ...*State) bool { return false } -// findAllMatches tries to findAllMatches the regex represented by given start-state, with +// Prunes the slice by removing overlapping indices. 
+func pruneIndices(indices []MatchIndex) []MatchIndex { + // First, sort the slice by the start indices + sort.Slice(indices, func(i, j int) bool { + return indices[i].startIdx < indices[j].startIdx + }) + toRet := make([]MatchIndex, 0, len(indices)) + current := indices[0] + for _, idx := range indices[1:] { + // idx doesn't overlap with current (starts after current ends), so add current to result + // and update the current. + if idx.startIdx >= current.endIdx { + toRet = append(toRet, current) + current = idx + } else if idx.endIdx > current.endIdx { + // idx overlaps, but it is longer, so update current + current = idx + } + } + // Add last state + toRet = append(toRet, current) + return toRet +} + +// findAllMatches tries to find all matches of the regex represented by given start-state, with // the given string -func findAllMatches(start *State, str string) (indices []matchIndex) { - return findAllMatchesHelper(start, str, make([]matchIndex, 0), 0) +func findAllMatches(start *State, str string) []MatchIndex { + idx := 0 + var matchFound bool + var matchIdx MatchIndex + indices := new_uniq_arr[MatchIndex]() + for idx <= len(str) { + matchFound, matchIdx, idx = findAllMatchesHelper(start, str, idx) + if matchFound { + indices.add(matchIdx) + } + } + toReturn := indices.values() + if len(toReturn) > 0 { + return pruneIndices(toReturn) + } + return toReturn } -func findAllMatchesHelper(start *State, str string, indices []matchIndex, offset int) []matchIndex { + +// Helper for findAllMatches. Returns whether it found a match, the +// first matchIndex it finds, and how far it got into the string ie. where +// the next search should start from. +// +// Might return duplicates or overlapping indices, so care must be taken to prune the resulting array. 
+func findAllMatchesHelper(start *State, str string, offset int) (bool, MatchIndex, int) { // Base case - exit if offset exceeds string's length if offset > len(str) { - return indices + // The first value here shouldn't be used, because we should exit when the third return value is greater than len(str) + return false, MatchIndex{}, offset } // 'Base case' - if we are at the end of the string, check if we can add a zero-length match if offset == len(str) { @@ -79,12 +116,10 @@ func findAllMatchesHelper(start *State, str string, indices []matchIndex, offset // can add a zero-length match. This is all true only if the start state itself matches nothing. if start.isEmpty && start.assert == NONE { if zeroMatchPossible(start) { - if !overlaps(matchIndex{offset, offset}, indices) { - indices, _ = unique_append(indices, matchIndex{offset, offset}) - } + return true, MatchIndex{offset, offset}, offset + 1 } } - return indices + return false, MatchIndex{}, offset + 1 } foundPath := false @@ -110,7 +145,7 @@ func findAllMatchesHelper(start *State, str string, indices []matchIndex, offset // can no longer find a match, the match with the largest range is // chosen as the match for the entire string. // This allows us to pick the longest possible match (which is how greedy matching works). 
- tempIndices := make([]matchIndex, 0) + tempIndices := make([]MatchIndex, 0) // Main loop for i < len(str) { foundPath = false @@ -147,20 +182,19 @@ func findAllMatchesHelper(start *State, str string, indices []matchIndex, offset } if state.isLast { endIdx = i - tempIndices, _ = unique_append(tempIndices, matchIndex{startIdx, endIdx}) + tempIndices, _ = unique_append(tempIndices, MatchIndex{startIdx, endIdx}) } } if assertionFailed && numStatesMatched == 0 { // Nothing has matched and an assertion has failed - abort if i == startingFrom { i++ } - return findAllMatchesHelper(start, str, indices, i) + return false, MatchIndex{}, i } - // Recursion - match with rest of string if we have nowhere to go. - // First check if we can find a zero-length match + // Check if we can find a zero-length match if foundPath == false { if zeroMatchPossible(currentStates...) { - tempIndices, _ = unique_append(tempIndices, matchIndex{startIdx, startIdx}) + tempIndices, _ = unique_append(tempIndices, MatchIndex{startIdx, startIdx}) } // If we haven't moved in the string, increment the counter by 1 // to ensure we don't keep trying the same string over and over. @@ -169,9 +203,8 @@ func findAllMatchesHelper(start *State, str string, indices []matchIndex, offset // i++ // } // Get the maximum index-range from the list - end := 0 if len(tempIndices) > 0 { - indexToAdd := Reduce(tempIndices, func(i1 matchIndex, i2 matchIndex) matchIndex { + indexToAdd := Reduce(tempIndices, func(i1 MatchIndex, i2 MatchIndex) MatchIndex { r1 := i1.endIdx - i1.startIdx r2 := i2.endIdx - i2.startIdx if r1 >= r2 { @@ -179,16 +212,13 @@ func findAllMatchesHelper(start *State, str string, indices []matchIndex, offset } return i2 }) - if !overlaps(indexToAdd, indices) { - indices, _ = unique_append(indices, indexToAdd) - end = indexToAdd.endIdx + if indexToAdd.startIdx == indexToAdd.endIdx { // If we have a zero-length match, we have to shift the index at which we start. 
Otherwise we keep looking at the same part of the string over and over. + return true, indexToAdd, indexToAdd.endIdx + 1 + } else { + return true, indexToAdd, indexToAdd.endIdx } } - if end == 0 || end == startIdx-1 { // Since we incremented startIdx earlier, we need to check against the old startIdx - return findAllMatchesHelper(start, str, indices, startIdx) - } else { - return findAllMatchesHelper(start, str, indices, end) - } + return false, MatchIndex{}, startIdx } currentStates = make([]*State, len(tempStates)) copy(currentStates, tempStates) @@ -219,13 +249,13 @@ func findAllMatchesHelper(start *State, str string, indices []matchIndex, offset if state.isLast && startIdx < len(str) { if state.assert == NONE || state.checkAssertion(str, len(str)) { endIdx = i - tempIndices, _ = unique_append(tempIndices, matchIndex{startIdx, endIdx}) + tempIndices, _ = unique_append(tempIndices, MatchIndex{startIdx, endIdx}) } } } // Get the maximum index-range from the list if len(tempIndices) > 0 { - indexToAdd := Reduce(tempIndices, func(i1 matchIndex, i2 matchIndex) matchIndex { + indexToAdd := Reduce(tempIndices, func(i1 MatchIndex, i2 MatchIndex) MatchIndex { r1 := i1.endIdx - i1.startIdx r2 := i2.endIdx - i2.startIdx if r1 >= r2 { @@ -233,11 +263,14 @@ func findAllMatchesHelper(start *State, str string, indices []matchIndex, offset } return i2 }) - if !overlaps(indexToAdd, indices) { - indices, _ = unique_append(indices, indexToAdd) + if indexToAdd.endIdx == indexToAdd.startIdx { // Same statement occurs above, see reasoning there + return true, indexToAdd, indexToAdd.endIdx + 1 + } else { + return true, indexToAdd, indexToAdd.endIdx } } - - // Default - call on empty string to get any trailing zero-length matches - return findAllMatchesHelper(start, str, indices, startIdx+1) + if startIdx == startingFrom { // Increment starting index if we haven't moved in the string. Prevents us from matching the same part of the string over and over. 
+ startIdx++ + } + return false, MatchIndex{}, startIdx }