Used Pike's algorithm (an extension to Thompson's algorithm) (see Russ Cox's 2nd article); I think I almost have a working PCRE-style engine

implementPCREMatchingRules
Aadhavan Srinivasan 1 month ago
parent 052de55826
commit 3604486a9b

@ -2,7 +2,6 @@ package regex
import ( import (
"fmt" "fmt"
"slices"
"sort" "sort"
) )
@ -252,32 +251,36 @@ func (regex Reg) FindAllSubmatch(str string) []Match {
return indices return indices
} }
func addStateToList(idx int, list []nfaState, state nfaState) []nfaState { func addStateToList(idx int, list []nfaState, state nfaState, threadGroups []Group) []nfaState {
if stateExists(list, state) { if stateExists(list, state) {
return list return list
} }
if state.isAlternation { if state.isKleene || state.isQuestion {
copyThread(state.next, state)
list = append(list, addStateToList(idx, list, *state.next)...)
copyThread(state.splitState, state) copyThread(state.splitState, state)
list = append(list, addStateToList(idx, list, *state.splitState)...) list = addStateToList(idx, list, *state.splitState, threadGroups)
copyThread(state.next, state)
list = addStateToList(idx, list, *state.next, threadGroups)
return list return list
} }
if state.isKleene { if state.isAlternation {
copyThread(state.splitState, state)
list = append(list, addStateToList(idx, list, *state.splitState)...)
copyThread(state.next, state) copyThread(state.next, state)
list = append(list, addStateToList(idx, list, *state.next)...) list = addStateToList(idx, list, *state.next, threadGroups)
copyThread(state.splitState, state)
list = addStateToList(idx, list, *state.splitState, threadGroups)
return list return list
} }
state.threadGroups = append([]Group{}, threadGroups...)
if state.groupBegin { if state.groupBegin {
state.threadGroups[state.groupNum].StartIdx = idx state.threadGroups[state.groupNum].StartIdx = idx
return append(list, addStateToList(idx, list, *state.next, state.threadGroups)...)
} }
if state.groupEnd { if state.groupEnd {
state.threadGroups[state.groupNum].StartIdx = idx state.threadGroups[state.groupNum].EndIdx = idx
return append(list, addStateToList(idx, list, *state.next, state.threadGroups)...)
} }
copyThread(state.next, state) state.threadGroups = append([]Group{}, threadGroups...)
return append(list, *state.next) return append(list, state)
} }
@ -335,138 +338,113 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
// tempIndices[start.groupNum].startIdx = i // tempIndices[start.groupNum].startIdx = i
//} //}
currentStates = append(currentStates, *start) start.threadGroups = newMatch(numGroups + 1)
var foundMatch bool start.threadGroups[0].StartIdx = i
var isEmptyAndNoAssertion bool currentStates = addStateToList(i, currentStates, *start, start.threadGroups)
var match Match = nil
// var isEmptyAndNoAssertion bool
// Main loop // Main loop
for idx := i; idx <= len(str); idx++ { for idx := i; idx <= len(str); idx++ {
if len(currentStates) == 0 {
break
}
for currentStateIdx := 0; currentStateIdx < len(currentStates); currentStateIdx++ { for currentStateIdx := 0; currentStateIdx < len(currentStates); currentStateIdx++ {
currentState := currentStates[currentStateIdx] currentState := currentStates[currentStateIdx]
foundMatch = false
isEmptyAndNoAssertion = false
if currentState.threadGroups == nil { if currentState.threadGroups == nil {
currentState.threadGroups = newMatch(numGroups + 1) currentState.threadGroups = newMatch(numGroups + 1)
currentState.threadGroups[0].StartIdx = idx currentState.threadGroups[0].StartIdx = idx
} }
if currentState.groupBegin { if currentState.isLast {
currentState.threadGroups[currentState.groupNum].StartIdx = idx currentState.threadGroups[0].EndIdx = idx
// allMatches := make([]nfaState, 0) match = append([]Group{}, currentState.threadGroups...)
// for _, v := range currentState.transitions { break
// dereferenced := funcMap(v, func(s *nfaState) nfaState { } else if !currentState.isAlternation && !currentState.isKleene && !currentState.isQuestion && !currentState.groupBegin && !currentState.groupEnd { // Normal character or assertion
// return *s if currentState.contentContains(str, idx) {
// }) nextStates = addStateToList(idx+1, nextStates, *currentState.next, currentState.threadGroups)
// allMatches = append(allMatches, dereferenced...)
// }
// slices.Reverse(allMatches)
// for i := range allMatches {
// copyThread(&allMatches[i], currentState)
// }
// currentStates = append(currentStates, allMatches...)
} }
if currentState.groupEnd {
currentState.threadGroups[currentState.groupNum].EndIdx = idx
// allMatches := make([]nfaState, 0)
// for _, v := range currentState.transitions {
// dereferenced := funcMap(v, func(s *nfaState) nfaState {
// return *s
// })
// allMatches = append(allMatches, dereferenced...)
// }
// slices.Reverse(allMatches)
// for i := range allMatches {
// copyThread(&allMatches[i], currentState)
// }
// currentStates = append(currentStates, allMatches...)
} }
// if currentState.isKleene { // if currentState.groupBegin {
// // Append the next-state (after the kleene), then append the kleene state // currentState.threadGroups[currentState.groupNum].StartIdx = idx
// allMatches := make([]*nfaState, 0)
// for _, v := range currentState.transitions {
// allMatches = append(allMatches, v...)
// } // }
// slices.Reverse(allMatches) // if currentState.groupEnd {
// for _, m := range allMatches { // currentState.threadGroups[currentState.groupNum].EndIdx = idx
// m.threadGroups = currentState.threadGroups
// m.threadSP = idx
// }
// currentStates = append(currentStates, allMatches...)
//
// // kleeneState := currentState.kleeneState
// // kleeneState.threadGroups = currentState.threadGroups
// // kleeneState.threadSP = currentState.threadSP
// // currentStates = append(currentStates, kleeneState)
// continue
// } // }
// Alternation - enqueue left then right state, and continue // Alternation - enqueue left then right state, and continue
if currentState.isAlternation { // if currentState.isAlternation {
if currentState.isKleene { // Reverse order of adding things // if currentState.isKleene { // Reverse order of adding things
rightState := currentState.splitState // rightState := currentState.splitState
copyThread(rightState, currentState) // copyThread(rightState, currentState)
currentStates = slices.Insert(currentStates, currentStateIdx+1, *rightState) // currentStates = slices.Insert(currentStates, currentStateIdx+1, *rightState)
leftState := currentState.next // leftState := currentState.next
copyThread(leftState, currentState) // copyThread(leftState, currentState)
currentStates = slices.Insert(currentStates, currentStateIdx+2, *leftState) // currentStates = slices.Insert(currentStates, currentStateIdx+2, *leftState)
} else { // } else {
leftState := currentState.next // leftState := currentState.next
copyThread(leftState, currentState) // copyThread(leftState, currentState)
currentStates = slices.Insert(currentStates, currentStateIdx+1, *leftState) // currentStates = slices.Insert(currentStates, currentStateIdx+1, *leftState)
rightState := currentState.splitState // rightState := currentState.splitState
copyThread(rightState, currentState) // copyThread(rightState, currentState)
currentStates = slices.Insert(currentStates, currentStateIdx+2, *rightState) // currentStates = slices.Insert(currentStates, currentStateIdx+2, *rightState)
} // }
continue // continue
} // }
// Empty state - enqueue next state, do _not_ increment the SP // Empty state - enqueue next state, do _not_ increment the SP
if !currentState.isAlternation && currentState.isEmpty && currentState.assert == noneAssert { //&& currentState.groupBegin == false && currentState.groupEnd == false { // if !currentState.isAlternation && currentState.isEmpty && currentState.assert == noneAssert { //&& currentState.groupBegin == false && currentState.groupEnd == false {
isEmptyAndNoAssertion = true // isEmptyAndNoAssertion = true
} // }
//
if currentState.contentContains(str, idx) { // if currentState.contentContains(str, idx) {
foundMatch = true // foundMatch = true
} // }
//
if isEmptyAndNoAssertion || foundMatch { // if isEmptyAndNoAssertion || foundMatch {
nextMatch := *(currentState.next) // nextMatch := *(currentState.next)
copyThread(&nextMatch, currentState) // copyThread(&nextMatch, currentState)
if currentState.groupBegin { // if currentState.groupBegin {
// // if !stateExists(currentStates, nextMatch) {
// currentStates = slices.Insert(currentStates, currentStateIdx+1, nextMatch)
// //}
// } else if currentState.groupEnd {
// if !stateExists(currentStates, nextMatch) { // if !stateExists(currentStates, nextMatch) {
currentStates = slices.Insert(currentStates, currentStateIdx+1, nextMatch) // currentStates = slices.Insert(currentStates, currentStateIdx+1, nextMatch) // append(currentStates, nextMatch)
// }
// } else if currentState.assert != noneAssert {
// if !stateExists(currentStates, nextMatch) {
// currentStates = append(currentStates, nextMatch)
// }
// } else if currentState.isEmpty && !currentState.groupBegin && !currentState.groupEnd {
// if !stateExists(currentStates, nextMatch) {
// currentStates = append(currentStates, nextMatch)
// }
// } else {
// if !stateExists(nextStates, nextMatch) {
// nextStates = append(nextStates, nextMatch)
// }
// }
// }
//
// if currentState.isLast && len(nextStates) == 0 { // Last state reached
// currentState.threadGroups[0].EndIdx = idx
// if idx == currentState.threadGroups[0].StartIdx {
// idx += 1
// }
// return true, currentState.threadGroups, idx
// } // }
} else if currentState.groupEnd {
if !stateExists(currentStates, nextMatch) {
currentStates = slices.Insert(currentStates, currentStateIdx+1, nextMatch) // append(currentStates, nextMatch)
}
} else if currentState.assert != noneAssert {
if !stateExists(currentStates, nextMatch) {
currentStates = append(currentStates, nextMatch)
}
} else if currentState.isEmpty && !currentState.groupBegin && !currentState.groupEnd {
if !stateExists(currentStates, nextMatch) {
currentStates = append(currentStates, nextMatch)
}
} else {
if !stateExists(nextStates, nextMatch) {
nextStates = append(nextStates, nextMatch)
}
}
}
if currentState.isLast && len(nextStates) == 0 { // Last state reached
currentState.threadGroups[0].EndIdx = idx
if idx == currentState.threadGroups[0].StartIdx {
idx += 1
}
return true, currentState.threadGroups, idx
}
} }
currentStates = append([]nfaState{}, nextStates...) currentStates = append([]nfaState{}, nextStates...)
nextStates = nil nextStates = nil
} }
if match != nil {
if offset == match[0].EndIdx {
return true, match, match[0].EndIdx + 1
}
return true, match, match[0].EndIdx
}
return false, []Group{}, i + 1 return false, []Group{}, i + 1
// zeroStates := make([]*nfaState, 0) // zeroStates := make([]*nfaState, 0)
// // Keep taking zero-states, until there are no more left to take // // Keep taking zero-states, until there are no more left to take

Loading…
Cancel
Save