@ -1,25 +1,17 @@
package main
// a matchIndex represents a match. It contains the start index and end index of the match
type matchIndex struct {
import "sort"
// MatchIndex represents a single match (or capturing group) as a pair of
// positions in the searched string: where it starts and where it ends.
// A zero-length match is represented by startIdx == endIdx.
// NOTE(review): endIdx looks exclusive (half-open range) - confirm against the matcher.
type MatchIndex struct {
// Position at which the match begins. Must stay the first field: values are
// built with unkeyed literals of the form MatchIndex{start, end}.
startIdx int
// Position at which the match ends.
endIdx int
}
// overlaps reports whether idx collides with any entry of idxes, i.e. whether
// either endpoint of idx lies strictly inside one of the existing matches.
// It is meant to be called before adding a match to a result slice, to make
// sure a larger match covering the same region isn't already present.
//
// Endpoints that merely touch the boundary of an existing match do not count.
// In particular, a zero-length match located exactly at the start or end of
// another match is not an overlap: the strict comparisons below can never be
// satisfied by such a match, so no separate check is required for it.
func overlaps(idx matchIndex, idxes []matchIndex) bool {
	// strictlyInside reports whether pos falls in the open interval
	// (m.startIdx, m.endIdx).
	strictlyInside := func(pos int, m matchIndex) bool {
		return m.startIdx < pos && pos < m.endIdx
	}

	for i := range idxes {
		existing := idxes[i]
		if strictlyInside(idx.startIdx, existing) || strictlyInside(idx.endIdx, existing) {
			return true
		}
	}
	return false
}
// A Match groups the MatchIndex values that belong to one match of the regex.
// It maps an integer (the number of the capturing group) to the MatchIndex
// recorded for that group.
// Group 0 corresponds to the entire match.
type Match map [ int ] MatchIndex
// takeZeroState takes the 0-state (if such a transition exists) for all states in the
// given slice. It returns the resulting states. If any of the resulting states is a 0-state,
@ -44,7 +36,7 @@ func takeZeroState(states []*State) (rtv []*State, isZero bool) {
// so I should probably put it in a function.
func zeroMatchPossible ( states ... * State ) bool {
zerostates , iszero := takeZeroState ( states )
tempstates := make ( [ ] * State , 0 )
tempstates := make ( [ ] * State , 0 , len ( zerostates ) + len ( states ) )
tempstates = append ( tempstates , states ... )
tempstates = append ( tempstates , zerostates ... )
num_appended := 0 // number of unique states added to tempstates
@ -63,15 +55,60 @@ func zeroMatchPossible(states ...*State) bool {
return false
}
// findAllMatches tries to findAllMatches the regex represented by given start-state, with
// pruneIndices removes overlapping matches from indices and returns the
// surviving matches ordered by start index.
//
// The input slice is first sorted in place by startIdx, so the caller's slice
// is reordered as a side effect. It is then scanned left to right while one
// candidate match is tracked:
//   - a match that starts at or after the candidate's end does not overlap it,
//     so the candidate is emitted and the new match becomes the candidate;
//   - a match that overlaps the candidate but ends later replaces it;
//   - any other overlapping match is dropped.
//
// An empty (or nil) slice is returned unchanged instead of panicking.
func pruneIndices(indices []MatchIndex) []MatchIndex {
	// Guard: reading indices[0] below would panic on empty input.
	if len(indices) == 0 {
		return indices
	}

	// First, sort the slice by the start indices.
	sort.Slice(indices, func(i, j int) bool {
		return indices[i].startIdx < indices[j].startIdx
	})

	toRet := make([]MatchIndex, 0, len(indices))
	current := indices[0]
	for _, idx := range indices[1:] {
		switch {
		case idx.startIdx >= current.endIdx:
			// idx starts after current ends: current is final, move on to idx.
			toRet = append(toRet, current)
			current = idx
		case idx.endIdx > current.endIdx:
			// idx overlaps current but reaches further, so it takes over.
			current = idx
		}
	}
	// The last candidate has not been emitted yet.
	toRet = append(toRet, current)
	return toRet
}
// findAllMatches tries to find all matches of the regex represented by given start-state, with
// the given string
func findAllMatches ( start * State , str string ) ( indices [ ] matchIndex ) {
return findAllMatchesHelper ( start , str , make ( [ ] matchIndex , 0 ) , 0 )
// findAllMatches scans str from left to right and collects every match of the
// regex whose start state is given. It repeatedly asks findAllMatchesHelper for
// the next match, resuming each search at the offset the helper hands back,
// until that offset moves past the end of the string. Matches are gathered in
// a uniqueness-preserving collection and, if there are any, pruned of overlaps
// before being returned.
func findAllMatches(start *State, str string) []MatchIndex {
	found := new_uniq_arr[MatchIndex]()

	// pos == len(str) is still searched so a trailing zero-length match can be found.
	for pos := 0; pos <= len(str); {
		ok, match, next := findAllMatchesHelper(start, str, pos)
		if ok {
			found.add(match)
		}
		pos = next
	}

	matches := found.values()
	if len(matches) == 0 {
		return matches
	}
	return pruneIndices(matches)
}
func findAllMatchesHelper ( start * State , str string , indices [ ] matchIndex , offset int ) [ ] matchIndex {
// Helper for findAllMatches. Returns whether it found a match, the
// first MatchIndex it finds, and how far it got into the string, i.e. where
// the next search should start from.
//
// Successive calls might return duplicate or overlapping indices, so care must be taken to prune the resulting array.
func findAllMatchesHelper ( start * State , str string , offset int ) ( bool , MatchIndex , int ) {
// Base case - exit if offset exceeds string's length
if offset > len ( str ) {
return indices
// The returned MatchIndex is a placeholder and shouldn't be used: no match was found, and the caller should stop once the returned offset (the third return value) is greater than len(str)
return false , MatchIndex { } , offset
}
// 'Base case' - if we are at the end of the string, check if we can add a zero-length match
if offset == len ( str ) {
@ -79,12 +116,10 @@ func findAllMatchesHelper(start *State, str string, indices []matchIndex, offset
// can add a zero-length match. This is all true only if the start state itself matches nothing.
if start . isEmpty && start . assert == NONE {
if zeroMatchPossible ( start ) {
if ! overlaps ( matchIndex { offset , offset } , indices ) {
indices , _ = unique_append ( indices , matchIndex { offset , offset } )
}
return true , MatchIndex { offset , offset } , offset + 1
}
}
return indices
return false , MatchIndex { } , offset + 1
}
foundPath := false
@ -110,7 +145,7 @@ func findAllMatchesHelper(start *State, str string, indices []matchIndex, offset
// can no longer find a match, the match with the largest range is
// chosen as the match for the entire string.
// This allows us to pick the longest possible match (which is how greedy matching works).
tempIndices := make ( [ ] m atchIndex, 0 )
tempIndices := make ( [ ] M atchIndex, 0 )
// Main loop
for i < len ( str ) {
foundPath = false
@ -147,20 +182,19 @@ func findAllMatchesHelper(start *State, str string, indices []matchIndex, offset
}
if state . isLast {
endIdx = i
tempIndices , _ = unique_append ( tempIndices , m atchIndex{ startIdx , endIdx } )
tempIndices , _ = unique_append ( tempIndices , M atchIndex{ startIdx , endIdx } )
}
}
if assertionFailed && numStatesMatched == 0 { // Nothing has matched and an assertion has failed - abort
if i == startingFrom {
i ++
}
return findAllMatchesHelper ( start , str , indices , i )
return false , MatchIndex { } , i
}
// Recursion - match with rest of string if we have nowhere to go.
// First check if we can find a zero-length match
// Check if we can find a zero-length match
if foundPath == false {
if zeroMatchPossible ( currentStates ... ) {
tempIndices , _ = unique_append ( tempIndices , m atchIndex{ startIdx , startIdx } )
tempIndices , _ = unique_append ( tempIndices , M atchIndex{ startIdx , startIdx } )
}
// If we haven't moved in the string, increment the counter by 1
// to ensure we don't keep trying the same string over and over.
@ -169,9 +203,8 @@ func findAllMatchesHelper(start *State, str string, indices []matchIndex, offset
// i++
// }
// Get the maximum index-range from the list
end := 0
if len ( tempIndices ) > 0 {
indexToAdd := Reduce ( tempIndices , func ( i1 matchIndex, i2 matchIndex ) m atchIndex {
indexToAdd := Reduce ( tempIndices , func ( i1 MatchIndex, i2 MatchIndex ) M atchIndex {
r1 := i1 . endIdx - i1 . startIdx
r2 := i2 . endIdx - i2 . startIdx
if r1 >= r2 {
@ -179,16 +212,13 @@ func findAllMatchesHelper(start *State, str string, indices []matchIndex, offset
}
return i2
} )
if ! overlaps ( indexToAdd , indices ) {
indices , _ = unique_append ( indices , indexToAdd )
end = indexToAdd . endIdx
}
}
if end == 0 || end == startIdx - 1 { // Since we incremented startIdx earlier, we need to check against the old startIdx
return findAllMatchesHelper ( start , str , indices , startIdx )
if indexToAdd . startIdx == indexToAdd . endIdx { // If we have a zero-length match, we have to shift the index at which we start. Otherwise we keep looking at the same part of the string over and over.
return true , indexToAdd , indexToAdd . endIdx + 1
} else {
return findAllMatchesHelper ( start , str , indices , end )
return true , indexToAdd , indexToAdd . endIdx
}
}
return false , MatchIndex { } , startIdx
}
currentStates = make ( [ ] * State , len ( tempStates ) )
copy ( currentStates , tempStates )
@ -219,13 +249,13 @@ func findAllMatchesHelper(start *State, str string, indices []matchIndex, offset
if state . isLast && startIdx < len ( str ) {
if state . assert == NONE || state . checkAssertion ( str , len ( str ) ) {
endIdx = i
tempIndices , _ = unique_append ( tempIndices , m atchIndex{ startIdx , endIdx } )
tempIndices , _ = unique_append ( tempIndices , M atchIndex{ startIdx , endIdx } )
}
}
}
// Get the maximum index-range from the list
if len ( tempIndices ) > 0 {
indexToAdd := Reduce ( tempIndices , func ( i1 matchIndex, i2 matchIndex ) m atchIndex {
indexToAdd := Reduce ( tempIndices , func ( i1 MatchIndex, i2 MatchIndex ) M atchIndex {
r1 := i1 . endIdx - i1 . startIdx
r2 := i2 . endIdx - i2 . startIdx
if r1 >= r2 {
@ -233,11 +263,14 @@ func findAllMatchesHelper(start *State, str string, indices []matchIndex, offset
}
return i2
} )
if ! overlaps ( indexToAdd , indices ) {
indices , _ = unique_append ( indices , indexToAdd )
if indexToAdd . endIdx == indexToAdd . startIdx { // Same statement occurs above, see reasoning there
return true , indexToAdd , indexToAdd . endIdx + 1
} else {
return true , indexToAdd , indexToAdd . endIdx
}
}
// Default - call on empty string to get any trailing zero-length matches
return findAllMatchesHelper ( start , str , indices , startIdx + 1 )
if startIdx == startingFrom { // Increment starting index if we haven't moved in the string. Prevents us from matching the same part of the string over and over.
startIdx ++
}
return false , MatchIndex { } , startIdx
}