Big rewrite - assertion handling, zero-match fixes, change in recursive calls
I added support for transitions. I wrote a function to determine whether a given state has transitions for the character at a given position in the string. This lets me check whether the current state has an assertion and act accordingly. I also fixed zero-length matching (almost; see todo.txt). It works for nearly all the cases I could think of, although I still need to write more tests. I wrote a function to check whether a zero-length match is possible from a given state. I also changed the way recursive calls work: rather than passing a modified (sliced) string, the function now receives the full string together with its current location in it, and that location is updated with each recursive call. Finally, the function now increments the offset by 1 instead of by the length of the longest match. This adds a bit of overhead, e.g., if a regex matches indices 1-5, then 1-5, 2-5, 3-5 and 4-5 would all be stored. To fix this, I wrote (and used) a function that checks whether a match overlaps with any of the matches already in a slice.
This commit is contained in:
116
matching.go
116
matching.go
@@ -6,6 +6,21 @@ type matchIndex struct {
|
||||
endIdx int
|
||||
}
|
||||
|
||||
// Returns true if the given matchIndex is an improper subset of any of the indices in the slice.
|
||||
// When we add an index to our slice, we want to make sure a larger match isn't already present.
|
||||
func overlaps(idx matchIndex, idxes []matchIndex) bool {
|
||||
for _, val := range idxes {
|
||||
if idx.startIdx >= val.startIdx && idx.endIdx <= val.endIdx {
|
||||
// A zero-length match doesn't overlap if it is located at the start or end
|
||||
// of the other match
|
||||
if !(idx.startIdx == idx.endIdx && (idx.startIdx == val.startIdx || idx.startIdx == val.endIdx)) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// takeZeroState takes the 0-state (if such a transition exists) for all states in the
|
||||
// given slice. It returns the resulting states. If any of the resulting states is a 0-state,
|
||||
// the second parameter is true.
|
||||
@@ -23,31 +38,66 @@ func takeZeroState(states []*State) (rtv []*State, isZero bool) {
|
||||
return rtv, false
|
||||
}
|
||||
|
||||
// zeroMatchPossible returns true if a zero-length match is possible
|
||||
// from any of the given states.
|
||||
// It uses the same algorithm to find zero-states as the one inside the loop,
|
||||
// so I should probably put it in a function.
|
||||
func zeroMatchPossible(states ...*State) bool {
|
||||
zerostates, iszero := takeZeroState(states)
|
||||
tempstates := make([]*State, 0)
|
||||
tempstates = append(tempstates, states...)
|
||||
tempstates = append(tempstates, zerostates...)
|
||||
num_appended := 0 // number of unique states addded to tempstates
|
||||
for iszero == true {
|
||||
zerostates, iszero = takeZeroState(tempstates)
|
||||
tempstates, num_appended = unique_append(tempstates, zerostates...)
|
||||
if num_appended == 0 { // break if we haven't appended any more unique values
|
||||
break
|
||||
}
|
||||
}
|
||||
for _, state := range tempstates {
|
||||
if state.isEmpty && state.assert == NONE && state.isLast {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// findAllMatches tries to findAllMatches the regex represented by given start-state, with
|
||||
// the given string
|
||||
func findAllMatches(start *State, str string) (indices []matchIndex) {
|
||||
return findAllMatchesHelper(start, str, make([]matchIndex, 0), 0)
|
||||
}
|
||||
func findAllMatchesHelper(start *State, str string, indices []matchIndex, offset int) []matchIndex {
|
||||
// 'Base case' - exit if string is empty.
|
||||
if len(str) == 0 {
|
||||
// If the start is a Kleene star, then it should also match an empty string.
|
||||
if start.isKleene && start.isLast {
|
||||
indices, _ = unique_append(indices, matchIndex{offset, offset})
|
||||
// Base case - exit if offset exceeds string's length
|
||||
if offset > len(str) {
|
||||
return indices
|
||||
}
|
||||
// 'Base case' - if we are at the end of the string, check if we can add a zero-length match
|
||||
if offset == len(str) {
|
||||
// Get all zero-state matches. If we can get to a zero-state without matching anything, we
|
||||
// can add a zero-length match. This is all true only if the start state itself matches nothing.
|
||||
if start.isEmpty && start.assert == NONE {
|
||||
if zeroMatchPossible(start) {
|
||||
if !overlaps(matchIndex{offset, offset}, indices) {
|
||||
indices, _ = unique_append(indices, matchIndex{offset, offset})
|
||||
}
|
||||
}
|
||||
}
|
||||
return indices
|
||||
}
|
||||
|
||||
foundPath := false
|
||||
startIdx := 0
|
||||
endIdx := 0
|
||||
startIdx := offset
|
||||
endIdx := offset
|
||||
currentStates := make([]*State, 0)
|
||||
tempStates := make([]*State, 0) // Used to store states that should be used in next loop iteration
|
||||
i := 0 // Index in string
|
||||
i := offset // Index in string
|
||||
startingFrom := i // Store starting index
|
||||
|
||||
// Increment until we hit a character matching the start state (assuming not 0-state)
|
||||
if start.isEmpty == false {
|
||||
for i < len(str) && !start.content.contains(int(str[i])) {
|
||||
for i < len(str) && !start.contentContains([]rune(str), i) {
|
||||
i++
|
||||
}
|
||||
startIdx = i
|
||||
@@ -83,21 +133,41 @@ func findAllMatchesHelper(start *State, str string, indices []matchIndex, offset
|
||||
tempStates = nil
|
||||
|
||||
// Take any transitions corresponding to current character
|
||||
numStatesMatched := 0 // The number of states which had at least 1 match for this round
|
||||
assertionFailed := false // Whether or not an assertion failed for this round
|
||||
for _, state := range currentStates {
|
||||
if len(state.transitions[int(str[i])]) > 0 {
|
||||
tempStates = append(tempStates, state.transitions[int(str[i])]...)
|
||||
matches, numMatches := state.matchesFor([]rune(str), i)
|
||||
if numMatches > 0 {
|
||||
numStatesMatched++
|
||||
tempStates = append(tempStates, matches...)
|
||||
foundPath = true
|
||||
}
|
||||
if numMatches < 0 {
|
||||
assertionFailed = true
|
||||
}
|
||||
if state.isLast {
|
||||
endIdx = i
|
||||
tempIndices, _ = unique_append(tempIndices, matchIndex{startIdx + offset, endIdx + offset})
|
||||
tempIndices, _ = unique_append(tempIndices, matchIndex{startIdx, endIdx})
|
||||
}
|
||||
}
|
||||
// Recursion - match with rest of string if we have nowhere to go. If we haven't moved in the string, increment the counter by 1 to ensure we don't keep trying the same string over and over
|
||||
if foundPath == false {
|
||||
if assertionFailed && numStatesMatched == 0 { // Nothing has matched and an assertion has failed - bort
|
||||
if i == startingFrom {
|
||||
i++
|
||||
}
|
||||
return findAllMatchesHelper(start, str, indices, i)
|
||||
}
|
||||
// Recursion - match with rest of string if we have nowhere to go.
|
||||
// First check if we can find a zero-length match
|
||||
if foundPath == false {
|
||||
if zeroMatchPossible(currentStates...) {
|
||||
tempIndices, _ = unique_append(tempIndices, matchIndex{startIdx, startIdx})
|
||||
}
|
||||
// If we haven't moved in the string, increment the counter by 1
|
||||
// to ensure we don't keep trying the same string over and over.
|
||||
// if i == startingFrom {
|
||||
startIdx++
|
||||
// i++
|
||||
// }
|
||||
// Get the maximum index-range from the list
|
||||
if len(tempIndices) > 0 {
|
||||
indexToAdd := Reduce(tempIndices, func(i1 matchIndex, i2 matchIndex) matchIndex {
|
||||
@@ -108,9 +178,11 @@ func findAllMatchesHelper(start *State, str string, indices []matchIndex, offset
|
||||
}
|
||||
return i2
|
||||
})
|
||||
indices, _ = unique_append(indices, indexToAdd)
|
||||
if !overlaps(indexToAdd, indices) {
|
||||
indices, _ = unique_append(indices, indexToAdd)
|
||||
}
|
||||
}
|
||||
return findAllMatchesHelper(start, str[i:], indices, offset+i)
|
||||
return findAllMatchesHelper(start, str, indices, startIdx)
|
||||
}
|
||||
currentStates = make([]*State, len(tempStates))
|
||||
copy(currentStates, tempStates)
|
||||
@@ -137,9 +209,9 @@ func findAllMatchesHelper(start *State, str string, indices []matchIndex, offset
|
||||
|
||||
for _, state := range currentStates {
|
||||
// Only add the match if the start index is in bounds
|
||||
if state.isLast && startIdx+offset < len(str)+offset {
|
||||
if state.isLast && startIdx < len(str) {
|
||||
endIdx = i
|
||||
tempIndices, _ = unique_append(tempIndices, matchIndex{startIdx + offset, endIdx + offset})
|
||||
tempIndices, _ = unique_append(tempIndices, matchIndex{startIdx, endIdx})
|
||||
}
|
||||
}
|
||||
// Get the maximum index-range from the list
|
||||
@@ -152,9 +224,11 @@ func findAllMatchesHelper(start *State, str string, indices []matchIndex, offset
|
||||
}
|
||||
return i2
|
||||
})
|
||||
indices, _ = unique_append(indices, indexToAdd)
|
||||
if !overlaps(indexToAdd, indices) {
|
||||
indices, _ = unique_append(indices, indexToAdd)
|
||||
}
|
||||
}
|
||||
|
||||
// Default
|
||||
return indices
|
||||
// Default - call on empty string to get any trailing zero-length matches
|
||||
return findAllMatchesHelper(start, str, indices, startIdx+1)
|
||||
}
|
||||
|
Reference in New Issue
Block a user