From fbc9dfcc95c50b067e8b1ae71b969418ebcdb4f3 Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Mon, 3 Feb 2025 16:47:53 -0500 Subject: [PATCH 01/48] Trying something out; we'll see if it works --- regex/priorityQueue.go | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/regex/priorityQueue.go b/regex/priorityQueue.go index 59592a9..ae43e86 100644 --- a/regex/priorityQueue.go +++ b/regex/priorityQueue.go @@ -15,8 +15,8 @@ const ( func getPriority(state *nfaState) int { if state.isKleene { - return kleene_priority - } else if state.isQuestion || state.isAlternation { + return zerostate_priority + } else if state.isAlternation { return alternation_priority } else { if state.isEmpty { @@ -33,6 +33,14 @@ type priorQueueItem struct { index int } +func newPriorQueueItem(state *nfaState) *priorQueueItem { + return &priorQueueItem{ + state: state, + index: -1, + priority: getPriority(state), + } +} + type priorityQueue []*priorQueueItem func (pq priorityQueue) Len() int { @@ -41,7 +49,7 @@ func (pq priorityQueue) Len() int { func (pq priorityQueue) Less(i, j int) bool { if pq[i].priority == pq[j].priority { - return pq[i].index > pq[j].index + return pq[i].index < pq[j].index } return pq[i].priority > pq[j].priority // We want max-heap, so we use greater-than } @@ -68,6 +76,11 @@ func (pq *priorityQueue) Pop() any { *pq = old[0 : n-1] return item } +func (pq *priorityQueue) peek() any { + queue := *pq + n := len(queue) + return queue[n-1] +} func (pq *priorityQueue) update(item *priorQueueItem, value *nfaState, priority int) { item.state = value -- 2.30.2 From 09812956ac147f8d5a44959700d5602574e87d8e Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Mon, 3 Feb 2025 16:48:09 -0500 Subject: [PATCH 02/48] Disable all optimizations --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 252549f..4a309a9 100644 --- a/Makefile +++ b/Makefile @@ -6,8 +6,8 @@ fmt: vet: fmt go vet 
./... buildLib: vet - go build -gcflags="-N -l" ./... + go build -gcflags="all=-N -l" ./... buildCmd: buildLib - go build -C cmd/ -gcflags="-N -l" -o re ./... + go build -C cmd/ -gcflags="all=-N -l" -o re ./... test: buildCmd go test -v ./... -- 2.30.2 From 1fd48ae6143e57e829e7cb93608356890c9c8606 Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Mon, 3 Feb 2025 16:49:10 -0500 Subject: [PATCH 03/48] Store the current string pointer as a 'thread variable' (allows us to simulate backtracking) --- regex/nfa.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/regex/nfa.go b/regex/nfa.go index 8f63eb0..0ceea1b 100644 --- a/regex/nfa.go +++ b/regex/nfa.go @@ -45,6 +45,7 @@ type nfaState struct { // The following properties depend on the current match - I should think about resetting them for every match. zeroMatchFound bool // Whether or not the state has been used for a zero-length match - only relevant for zero states threadGroups []Group // Assuming that a state is part of a 'thread' in the matching process, this array stores the indices of capturing groups in the current thread. As matches are found for this state, its groups will be copied over. + threadSP int // The string pointer of the thread - where it is in the input string } // Clones the NFA starting from the given state. 
@@ -120,6 +121,7 @@ func resetThreadsHelper(state *nfaState, visitedMap map[*nfaState]bool) { } // Assuming it hasn't been visited state.threadGroups = nil + state.threadSP = 0 visitedMap[state] = true for _, v := range state.transitions { for _, nextState := range v { -- 2.30.2 From e167cdb2cbac9c48ced4370151ebe848e5196012 Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Mon, 3 Feb 2025 16:49:30 -0500 Subject: [PATCH 04/48] Fixed mistake in test output --- regex/re_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex/re_test.go b/regex/re_test.go index 8d24304..2cccc72 100644 --- a/regex/re_test.go +++ b/regex/re_test.go @@ -701,7 +701,7 @@ func TestFind(t *testing.T) { if len(test.result) == 0 { return // Manually pass the test, because this is the expected behavior } else { - t.Errorf("Wanted no match Got %v\n", groupIndex) + t.Errorf("Wanted %v Got no matches\n", test.result) } } else { if groupIndex != test.result[0] { -- 2.30.2 From ad273b0c682c63d080da3b30da504b67bd53d482 Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Mon, 3 Feb 2025 16:50:11 -0500 Subject: [PATCH 05/48] Trying to emulate backtracking by using string pointers within threads (something similar to rsc's 2nd regexp article) --- regex/matching.go | 70 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 48 insertions(+), 22 deletions(-) diff --git a/regex/matching.go b/regex/matching.go index 17c2bcb..af2ede3 100644 --- a/regex/matching.go +++ b/regex/matching.go @@ -1,6 +1,7 @@ package regex import ( + "container/heap" "fmt" "slices" "sort" @@ -271,7 +272,8 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in foundPath := false startIdx := offset endIdx := offset - currentStates := make([]*nfaState, 0) + currentStates := &priorityQueue{} + heap.Init(currentStates) tempStates := make([]*nfaState, 0) // Used to store states that should be used in next loop iteration i := offset // Index in string startingFrom 
:= i // Store starting index @@ -301,16 +303,19 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in // tempIndices[start.groupNum].startIdx = i } - currentStates = append(currentStates, start) - + start.threadSP = i + heap.Push(currentStates, newPriorQueueItem(start)) // Main loop - for i < len(str) { + for currentStates.Len() > 0 { + currentState := heap.Pop(currentStates) foundPath = false zeroStates := make([]*nfaState, 0) // Keep taking zero-states, until there are no more left to take // Objective: If any of our current states have transitions to 0-states, replace them with the 0-state. Do this until there are no more transitions to 0-states, or there are no more unique 0-states to take. - zeroStates, isZero := takeZeroState(currentStates, numGroups, i) + topStateItem := currentStates.peek() + topState := topStateItem.(*priorQueueItem).state + zeroStates, isZero := takeZeroState([]*nfaState{topState}, numGroups, i) tempStates = append(tempStates, zeroStates...) 
num_appended := 0 for isZero == true { @@ -320,8 +325,13 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in break } } + if isZero == true { + currentStates.Pop() + } - currentStates = slices.Concat(currentStates, tempStates) + for _, state := range tempStates { + heap.Push(currentStates, newPriorQueueItem(state)) + } tempStates = nil // Take any transitions corresponding to current character @@ -331,10 +341,11 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in var lastStatePtr *nfaState = nil // Pointer to the last-state, if it was found lastLookaroundInList := false // Whether or not a last state (that is a lookaround) was in our list of states for numStatesMatched == 0 && lastStateInList == false { - if len(currentStates) == 0 { + if currentStates.Len() == 0 { break } - state, _ := pop(&currentStates) + stateItem := heap.Pop(currentStates) + state := stateItem.(*priorQueueItem).state matches, numMatches := state.matchesFor(str, i) if numMatches > 0 { numStatesMatched++ @@ -344,6 +355,7 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in if m.threadGroups == nil { m.threadGroups = newMatch(numGroups + 1) } + m.threadSP = state.threadSP + 1 copy(m.threadGroups, state.threadGroups) } } @@ -382,7 +394,8 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in // a. A last-state // b. Empty // c. 
Doesn't assert anything - for _, s := range currentStates { + for _, stateItem := range *currentStates { + s := stateItem.state if s.isLast && s.isEmpty && s.assert == noneAssert { lastStatePtr = s lastStateInList = true @@ -403,7 +416,10 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in // Check if we can find a zero-length match if foundPath == false { - if ok := zeroMatchPossible(str, i, numGroups, currentStates...); ok { + currentStatesList := funcMap(*currentStates, func(item *priorQueueItem) *nfaState { + return item.state + }) + if ok := zeroMatchPossible(str, i, numGroups, currentStatesList...); ok { if tempIndices[0].IsValid() == false { tempIndices[0] = Group{startIdx, startIdx} } @@ -423,8 +439,11 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in } return false, []Group{}, startIdx } - currentStates = make([]*nfaState, len(tempStates)) - copy(currentStates, tempStates) + currentStates = &priorityQueue{} + slices.Reverse(tempStates) + for _, state := range tempStates { + heap.Push(currentStates, newPriorQueueItem(state)) + } tempStates = nil i++ @@ -432,21 +451,28 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in // End-of-string reached. Go to any 0-states, until there are no more 0-states to go to. Then check if any of our states are in the end position. // This is the exact same algorithm used inside the loop, so I should probably put it in a function. - zeroStates, isZero := takeZeroState(currentStates, numGroups, i) - tempStates = append(tempStates, zeroStates...) - num_appended := 0 // Number of unique states addded to tempStates - for isZero == true { - zeroStates, isZero = takeZeroState(tempStates, numGroups, i) - tempStates, num_appended = uniqueAppend(tempStates, zeroStates...) 
- if num_appended == 0 { // Break if we haven't appended any more unique values - break + if currentStates.Len() > 0 { + topStateItem := currentStates.peek() + topState := topStateItem.(*priorQueueItem).state + zeroStates, isZero := takeZeroState([]*nfaState{topState}, numGroups, i) + tempStates = append(tempStates, zeroStates...) + num_appended := 0 // Number of unique states addded to tempStates + for isZero == true { + zeroStates, isZero = takeZeroState(tempStates, numGroups, i) + tempStates, num_appended = uniqueAppend(tempStates, zeroStates...) + if num_appended == 0 { // Break if we haven't appended any more unique values + break + } } } - currentStates = append(currentStates, tempStates...) + for _, state := range tempStates { + heap.Push(currentStates, newPriorQueueItem(state)) + } tempStates = nil - for _, state := range currentStates { + for _, stateItem := range *currentStates { + state := stateItem.state // Only add the match if the start index is in bounds. If the state has an assertion, // make sure the assertion checks out. 
if state.isLast && i <= len(str) { -- 2.30.2 From de0d7345a8792180d05823067b92dc7934b927eb Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Mon, 3 Feb 2025 21:59:05 -0500 Subject: [PATCH 06/48] Store left and right branches of alternation separately --- regex/nfa.go | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/regex/nfa.go b/regex/nfa.go index 0ceea1b..f03edab 100644 --- a/regex/nfa.go +++ b/regex/nfa.go @@ -33,6 +33,8 @@ type nfaState struct { isKleene bool // Identifies whether current node is a 0-state representing Kleene star isQuestion bool // Identifies whether current node is a 0-state representing the question operator isAlternation bool // Identifies whether current node is a 0-state representing an alternation + leftState *nfaState // Only for alternation states - the 'left' branch of the alternation + rightState *nfaState // Only for alternation states - the 'right' branch of the alternation assert assertType // Type of assertion of current node - NONE means that the node doesn't assert anything allChars bool // Whether or not the state represents all characters (eg. a 'dot' metacharacter). A 'dot' node doesn't store any contents directly, as it would take up too much space except []rune // Only valid if allChars is true - match all characters _except_ the ones in this block. Useful for inverting character classes. 
@@ -106,6 +108,15 @@ func cloneStateHelper(stateToClone *nfaState, cloneMap map[*nfaState]*nfaState) clone.lookaroundNFA = clone } clone.lookaroundNFA = cloneStateHelper(stateToClone.lookaroundNFA, cloneMap) + if stateToClone.leftState == stateToClone { + clone.leftState = clone + } + clone.leftState = cloneStateHelper(stateToClone.leftState, cloneMap) + if stateToClone.rightState == stateToClone { + clone.rightState = clone + } + clone.rightState = cloneStateHelper(stateToClone.rightState, cloneMap) + return clone } @@ -213,6 +224,9 @@ func (s nfaState) contentContains(str []rune, idx int) bool { if s.assert != noneAssert { return s.checkAssertion(str, idx) } + if idx >= len(str) { + return false + } if s.allChars { return !slices.Contains(slices.Concat(notDotChars, s.except), str[idx]) // Return true only if the index isn't a 'notDotChar', or isn't one of the exception characters for the current node. } @@ -348,6 +362,8 @@ func alternate(s1 *nfaState, s2 *nfaState) *nfaState { toReturn.content = newContents(epsilon) toReturn.isEmpty = true toReturn.isAlternation = true + toReturn.leftState = s1 + toReturn.rightState = s2 return toReturn } @@ -358,7 +374,7 @@ func question(s1 *nfaState) *nfaState { // Use the fact that ab? 
== a(b|) s2.content = newContents(epsilon) s2.output = append(s2.output, s2) s2.isEmpty = true - s2.isQuestion = true + s2.isAlternation = true s3 := alternate(s1, s2) return s3 } -- 2.30.2 From 5563a70568a645c959d798598ea5c3e66c224d16 Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Mon, 3 Feb 2025 21:59:41 -0500 Subject: [PATCH 07/48] Reverse the order in which I pop states for alternation, because this messes with the left branch-right branch thing --- regex/compile.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/regex/compile.go b/regex/compile.go index b40c371..1068966 100644 --- a/regex/compile.go +++ b/regex/compile.go @@ -1059,8 +1059,8 @@ func thompson(re []postfixNode) (Reg, error) { // '|a' // '^a|' // '^|a' - s1, err1 := pop(&nfa) - s2, err2 := pop(&nfa) + s2, err1 := pop(&nfa) + s1, err2 := pop(&nfa) if err2 != nil || (s2.groupBegin && len(s2.transitions) == 0) { // Doesn't exist, or its just an LPAREN if err2 == nil { // Roundabout way of saying that this node existed, but it was an LPAREN, so we append it back nfa = append(nfa, s2) -- 2.30.2 From 753e973d82f512719336a8217f0fff0493eafb84 Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Mon, 3 Feb 2025 22:00:04 -0500 Subject: [PATCH 08/48] Started rewrite of matching algorithm, got concatenation and alternation done, kleene and zero-state stuff is next --- regex/matching.go | 462 +++++++++++++++++++++++++--------------------- 1 file changed, 256 insertions(+), 206 deletions(-) diff --git a/regex/matching.go b/regex/matching.go index af2ede3..ad7d15b 100644 --- a/regex/matching.go +++ b/regex/matching.go @@ -1,7 +1,6 @@ package regex import ( - "container/heap" "fmt" "slices" "sort" @@ -267,16 +266,15 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in // chosen as the match for the entire string. // This allows us to pick the longest possible match (which is how greedy matching works). 
// COMMENT ABOVE IS CURRENTLY NOT UP-TO-DATE - tempIndices := newMatch(numGroups + 1) + // tempIndices := newMatch(numGroups + 1) - foundPath := false - startIdx := offset - endIdx := offset - currentStates := &priorityQueue{} - heap.Init(currentStates) - tempStates := make([]*nfaState, 0) // Used to store states that should be used in next loop iteration - i := offset // Index in string - startingFrom := i // Store starting index + // foundPath := false + //startIdx := offset + //endIdx := offset + currentStates := make([]*nfaState, 0) + // tempStates := make([]*nfaState, 0) // Used to store states that should be used in next loop iteration + i := offset // Index in string + //startingFrom := i // Store starting index // If the first state is an assertion, makes sure the assertion // is true before we do _anything_ else. @@ -287,214 +285,266 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in } } // Increment until we hit a character matching the start state (assuming not 0-state) - if start.isEmpty == false { - for i < len(str) && !start.contentContains(str, i) { - i++ - } - startIdx = i - startingFrom = i - i++ // Advance to next character (if we aren't at a 0-state, which doesn't match anything), so that we can check for transitions. If we advance at a 0-state, we will never get a chance to match the first character - } - - start.threadGroups = newMatch(numGroups + 1) + // if start.isEmpty == false { + // for i < len(str) && !start.contentContains(str, i) { + // i++ + // } + // startIdx = i + // startingFrom = i + // i++ // Advance to next character (if we aren't at a 0-state, which doesn't match anything), so that we can check for transitions. 
If we advance at a 0-state, we will never get a chance to match the first character + // } + + // start.threadGroups = newMatch(numGroups + 1) // Check if the start state begins a group - if so, add the start index to our list - if start.groupBegin { - start.threadGroups[start.groupNum].StartIdx = i - // tempIndices[start.groupNum].startIdx = i - } + //if start.groupBegin { + // start.threadGroups[start.groupNum].StartIdx = i + // tempIndices[start.groupNum].startIdx = i + //} start.threadSP = i - heap.Push(currentStates, newPriorQueueItem(start)) + currentStates = append(currentStates, start) + var foundMatch bool // Main loop - for currentStates.Len() > 0 { - currentState := heap.Pop(currentStates) - foundPath = false - - zeroStates := make([]*nfaState, 0) - // Keep taking zero-states, until there are no more left to take - // Objective: If any of our current states have transitions to 0-states, replace them with the 0-state. Do this until there are no more transitions to 0-states, or there are no more unique 0-states to take. - topStateItem := currentStates.peek() - topState := topStateItem.(*priorQueueItem).state - zeroStates, isZero := takeZeroState([]*nfaState{topState}, numGroups, i) - tempStates = append(tempStates, zeroStates...) - num_appended := 0 - for isZero == true { - zeroStates, isZero = takeZeroState(tempStates, numGroups, i) - tempStates, num_appended = uniqueAppend(tempStates, zeroStates...) 
- if num_appended == 0 { // Break if we haven't appended any more unique values - break - } - } - if isZero == true { - currentStates.Pop() - } - - for _, state := range tempStates { - heap.Push(currentStates, newPriorQueueItem(state)) - } - tempStates = nil - - // Take any transitions corresponding to current character - numStatesMatched := 0 // The number of states which had at least 1 match for this round - assertionFailed := false // Whether or not an assertion failed for this round - lastStateInList := false // Whether or not a last state was in our list of states - var lastStatePtr *nfaState = nil // Pointer to the last-state, if it was found - lastLookaroundInList := false // Whether or not a last state (that is a lookaround) was in our list of states - for numStatesMatched == 0 && lastStateInList == false { - if currentStates.Len() == 0 { - break - } - stateItem := heap.Pop(currentStates) - state := stateItem.(*priorQueueItem).state - matches, numMatches := state.matchesFor(str, i) - if numMatches > 0 { - numStatesMatched++ - tempStates = append([]*nfaState(nil), matches...) - foundPath = true - for _, m := range matches { - if m.threadGroups == nil { - m.threadGroups = newMatch(numGroups + 1) - } - m.threadSP = state.threadSP + 1 - copy(m.threadGroups, state.threadGroups) - } - } - if numMatches < 0 { - assertionFailed = true - } - if state.isLast { - if state.isLookaround() { - lastLookaroundInList = true - } - lastStateInList = true - lastStatePtr = state - } - } - - if assertionFailed && numStatesMatched == 0 { // Nothing has matched and an assertion has failed - // If I'm being completely honest, I'm not sure why I have to check specifically for a _lookaround_ - // state. The explanation below is my attempt to explain this behavior. - // If you replace 'lastLookaroundInList' with 'lastStateInList', one of the test cases fails. - // - // One of the states in our list was a last state and a lookaround. 
In this case, we - // don't abort upon failure of the assertion, because we have found - // another path to a final state. - // Even if the last state _was_ an assertion, we can use the previously - // saved indices to find a match. - if lastLookaroundInList { - break - } else { - if i == startingFrom { - i++ - } - return false, []Group{}, i - } + for len(currentStates) > 0 { + currentState, _ := pop(&currentStates) + idx := currentState.threadSP + foundMatch = false + + if currentState.threadGroups == nil { + currentState.threadGroups = newMatch(numGroups + 1) + currentState.threadGroups[0].StartIdx = idx } - // Check if we can find a state in our list that is: - // a. A last-state - // b. Empty - // c. Doesn't assert anything - for _, stateItem := range *currentStates { - s := stateItem.state - if s.isLast && s.isEmpty && s.assert == noneAssert { - lastStatePtr = s - lastStateInList = true + if currentState.groupBegin { + currentState.threadGroups[currentState.groupNum].StartIdx = idx + } else if currentState.groupEnd { + currentState.threadGroups[currentState.groupNum].EndIdx = idx + } else if currentState.isKleene { + // Append the + } else if currentState.isAlternation { + rightState := currentState.rightState + rightState.threadGroups = currentState.threadGroups + rightState.threadSP = currentState.threadSP + currentStates = append(currentStates, currentState.rightState) + leftState := currentState.leftState + leftState.threadGroups = currentState.threadGroups + leftState.threadSP = currentState.threadSP + currentStates = append(currentStates, currentState.leftState) + continue + } else if currentState.contentContains(str, idx) { + foundMatch = true + allMatches := make([]*nfaState, 0) + for _, v := range currentState.transitions { + allMatches = append(allMatches, v...) } - } - if lastStateInList && numStatesMatched == 0 { // A last-state was in the list of states. 
add the matchIndex to our MatchIndex list - for j := 1; j < numGroups+1; j++ { - tempIndices[j] = lastStatePtr.threadGroups[j] - } - endIdx = i - tempIndices[0] = Group{startIdx, endIdx} - if tempIndices[0].StartIdx == tempIndices[0].EndIdx { - return true, tempIndices, tempIndices[0].EndIdx + 1 - } else { - return true, tempIndices, tempIndices[0].EndIdx - } - } - - // Check if we can find a zero-length match - if foundPath == false { - currentStatesList := funcMap(*currentStates, func(item *priorQueueItem) *nfaState { - return item.state - }) - if ok := zeroMatchPossible(str, i, numGroups, currentStatesList...); ok { - if tempIndices[0].IsValid() == false { - tempIndices[0] = Group{startIdx, startIdx} - } - } - // If we haven't moved in the string, increment the counter by 1 - // to ensure we don't keep trying the same string over and over. - // if i == startingFrom { - startIdx++ - // i++ - // } - if tempIndices.numValidGroups() > 0 && tempIndices[0].IsValid() { - if tempIndices[0].StartIdx == tempIndices[0].EndIdx { // If we have a zero-length match, we have to shift the index at which we start. Otherwise we keep looking at the same paert of the string over and over. - return true, tempIndices, tempIndices[0].EndIdx + 1 + slices.Reverse(allMatches) + for _, m := range allMatches { + m.threadGroups = currentState.threadGroups + if currentState.assert == noneAssert { + m.threadSP = idx + 1 } else { - return true, tempIndices, tempIndices[0].EndIdx + m.threadSP = idx } } - return false, []Group{}, startIdx - } - currentStates = &priorityQueue{} - slices.Reverse(tempStates) - for _, state := range tempStates { - heap.Push(currentStates, newPriorQueueItem(state)) + currentStates = append(currentStates, allMatches...) } - tempStates = nil - i++ - } + if currentState.isLast && foundMatch { // Last state reached + currentState.threadGroups[0].EndIdx = idx + 1 + return true, currentState.threadGroups, idx + 1 - // End-of-string reached. 
Go to any 0-states, until there are no more 0-states to go to. Then check if any of our states are in the end position. - // This is the exact same algorithm used inside the loop, so I should probably put it in a function. - if currentStates.Len() > 0 { - topStateItem := currentStates.peek() - topState := topStateItem.(*priorQueueItem).state - zeroStates, isZero := takeZeroState([]*nfaState{topState}, numGroups, i) - tempStates = append(tempStates, zeroStates...) - num_appended := 0 // Number of unique states addded to tempStates - for isZero == true { - zeroStates, isZero = takeZeroState(tempStates, numGroups, i) - tempStates, num_appended = uniqueAppend(tempStates, zeroStates...) - if num_appended == 0 { // Break if we haven't appended any more unique values - break - } } } - - for _, state := range tempStates { - heap.Push(currentStates, newPriorQueueItem(state)) - } - tempStates = nil - - for _, stateItem := range *currentStates { - state := stateItem.state - // Only add the match if the start index is in bounds. If the state has an assertion, - // make sure the assertion checks out. - if state.isLast && i <= len(str) { - if state.assert == noneAssert || state.checkAssertion(str, i) { - for j := 1; j < numGroups+1; j++ { - tempIndices[j] = state.threadGroups[j] - } - endIdx = i - tempIndices[0] = Group{startIdx, endIdx} - } - } - } - - if tempIndices.numValidGroups() > 0 { - if tempIndices[0].StartIdx == tempIndices[0].EndIdx { // If we have a zero-length match, we have to shift the index at which we start. Otherwise we keep looking at the same paert of the string over and over. - return true, tempIndices, tempIndices[0].EndIdx + 1 - } else { - return true, tempIndices, tempIndices[0].EndIdx - } - } - if startIdx == startingFrom { // Increment starting index if we haven't moved in the string. Prevents us from matching the same part of the string over and over. 
- startIdx++ - } - return false, []Group{}, startIdx + return false, []Group{}, i + 1 + // zeroStates := make([]*nfaState, 0) + // // Keep taking zero-states, until there are no more left to take + // // Objective: If any of our current states have transitions to 0-states, replace them with the 0-state. Do this until there are no more transitions to 0-states, or there are no more unique 0-states to take. + // topStateItem := currentStates.peek() + // topState := topStateItem.(*priorQueueItem).state + // zeroStates, isZero := takeZeroState([]*nfaState{topState}, numGroups, i) + // tempStates = append(tempStates, zeroStates...) + // num_appended := 0 + // for isZero == true { + // zeroStates, isZero = takeZeroState(tempStates, numGroups, i) + // tempStates, num_appended = uniqueAppend(tempStates, zeroStates...) + // if num_appended == 0 { // Break if we haven't appended any more unique values + // break + // } + // } + // if isZero == true { + // currentStates.Pop() + // } + // + // for _, state := range tempStates { + // heap.Push(currentStates, newPriorQueueItem(state)) + // } + // tempStates = nil + // + // // Take any transitions corresponding to current character + // numStatesMatched := 0 // The number of states which had at least 1 match for this round + // assertionFailed := false // Whether or not an assertion failed for this round + // lastStateInList := false // Whether or not a last state was in our list of states + // var lastStatePtr *nfaState = nil // Pointer to the last-state, if it was found + // lastLookaroundInList := false // Whether or not a last state (that is a lookaround) was in our list of states + // for numStatesMatched == 0 && lastStateInList == false { + // if currentStates.Len() == 0 { + // break + // } + // stateItem := heap.Pop(currentStates) + // state := stateItem.(*priorQueueItem).state + // matches, numMatches := state.matchesFor(str, i) + // if numMatches > 0 { + // numStatesMatched++ + // tempStates = append([]*nfaState(nil), 
matches...) + // foundPath = true + // for _, m := range matches { + // if m.threadGroups == nil { + // m.threadGroups = newMatch(numGroups + 1) + // } + // m.threadSP = state.threadSP + 1 + // copy(m.threadGroups, state.threadGroups) + // } + // } + // if numMatches < 0 { + // assertionFailed = true + // } + // if state.isLast { + // if state.isLookaround() { + // lastLookaroundInList = true + // } + // lastStateInList = true + // lastStatePtr = state + // } + // } + // + // if assertionFailed && numStatesMatched == 0 { // Nothing has matched and an assertion has failed + // // If I'm being completely honest, I'm not sure why I have to check specifically for a _lookaround_ + // // state. The explanation below is my attempt to explain this behavior. + // // If you replace 'lastLookaroundInList' with 'lastStateInList', one of the test cases fails. + // // + // // One of the states in our list was a last state and a lookaround. In this case, we + // // don't abort upon failure of the assertion, because we have found + // // another path to a final state. + // // Even if the last state _was_ an assertion, we can use the previously + // // saved indices to find a match. + // if lastLookaroundInList { + // break + // } else { + // if i == startingFrom { + // i++ + // } + // return false, []Group{}, i + // } + // } + // // Check if we can find a state in our list that is: + // // a. A last-state + // // b. Empty + // // c. Doesn't assert anything + // for _, stateItem := range *currentStates { + // s := stateItem.state + // if s.isLast && s.isEmpty && s.assert == noneAssert { + // lastStatePtr = s + // lastStateInList = true + // } + // } + // if lastStateInList && numStatesMatched == 0 { // A last-state was in the list of states. 
add the matchIndex to our MatchIndex list + // for j := 1; j < numGroups+1; j++ { + // tempIndices[j] = lastStatePtr.threadGroups[j] + // } + // endIdx = i + // tempIndices[0] = Group{startIdx, endIdx} + // if tempIndices[0].StartIdx == tempIndices[0].EndIdx { + // return true, tempIndices, tempIndices[0].EndIdx + 1 + // } else { + // return true, tempIndices, tempIndices[0].EndIdx + // } + // } + // + // // Check if we can find a zero-length match + // if foundPath == false { + // currentStatesList := funcMap(*currentStates, func(item *priorQueueItem) *nfaState { + // return item.state + // }) + // if ok := zeroMatchPossible(str, i, numGroups, currentStatesList...); ok { + // if tempIndices[0].IsValid() == false { + // tempIndices[0] = Group{startIdx, startIdx} + // } + // } + // // If we haven't moved in the string, increment the counter by 1 + // // to ensure we don't keep trying the same string over and over. + // // if i == startingFrom { + // startIdx++ + // // i++ + // // } + // if tempIndices.numValidGroups() > 0 && tempIndices[0].IsValid() { + // if tempIndices[0].StartIdx == tempIndices[0].EndIdx { // If we have a zero-length match, we have to shift the index at which we start. Otherwise we keep looking at the same paert of the string over and over. + // return true, tempIndices, tempIndices[0].EndIdx + 1 + // } else { + // return true, tempIndices, tempIndices[0].EndIdx + // } + // } + // return false, []Group{}, startIdx + // } + // currentStates = &priorityQueue{} + // slices.Reverse(tempStates) + // for _, state := range tempStates { + // heap.Push(currentStates, newPriorQueueItem(state)) + // } + // tempStates = nil + // + // i++ + // } + // + // // End-of-string reached. Go to any 0-states, until there are no more 0-states to go to. Then check if any of our states are in the end position. + // // This is the exact same algorithm used inside the loop, so I should probably put it in a function. 
+ // + // if currentStates.Len() > 0 { + // topStateItem := currentStates.peek() + // topState := topStateItem.(*priorQueueItem).state + // zeroStates, isZero := takeZeroState([]*nfaState{topState}, numGroups, i) + // tempStates = append(tempStates, zeroStates...) + // num_appended := 0 // Number of unique states addded to tempStates + // for isZero == true { + // zeroStates, isZero = takeZeroState(tempStates, numGroups, i) + // tempStates, num_appended = uniqueAppend(tempStates, zeroStates...) + // if num_appended == 0 { // Break if we haven't appended any more unique values + // break + // } + // } + // } + // + // for _, state := range tempStates { + // heap.Push(currentStates, newPriorQueueItem(state)) + // } + // + // tempStates = nil + // + // for _, stateItem := range *currentStates { + // state := stateItem.state + // // Only add the match if the start index is in bounds. If the state has an assertion, + // // make sure the assertion checks out. + // if state.isLast && i <= len(str) { + // if state.assert == noneAssert || state.checkAssertion(str, i) { + // for j := 1; j < numGroups+1; j++ { + // tempIndices[j] = state.threadGroups[j] + // } + // endIdx = i + // tempIndices[0] = Group{startIdx, endIdx} + // } + // } + // } + // + // if tempIndices.numValidGroups() > 0 { + // if tempIndices[0].StartIdx == tempIndices[0].EndIdx { // If we have a zero-length match, we have to shift the index at which we start. Otherwise we keep looking at the same paert of the string over and over. + // return true, tempIndices, tempIndices[0].EndIdx + 1 + // } else { + // return true, tempIndices, tempIndices[0].EndIdx + // } + // } + // + // if startIdx == startingFrom { // Increment starting index if we haven't moved in the string. Prevents us from matching the same part of the string over and over. 
+ // + // startIdx++ + // } + // + // return false, []Group{}, startIdx } -- 2.30.2 From e0253dfaf3333d9873c497b5c444d384b4abb183 Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Tue, 4 Feb 2025 14:09:04 -0500 Subject: [PATCH 09/48] Change kleene() to an alternation-style construct --- regex/nfa.go | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/regex/nfa.go b/regex/nfa.go index f03edab..a9c1ec6 100644 --- a/regex/nfa.go +++ b/regex/nfa.go @@ -116,7 +116,6 @@ func cloneStateHelper(stateToClone *nfaState, cloneMap map[*nfaState]*nfaState) clone.rightState = clone } clone.rightState = cloneStateHelper(stateToClone.rightState, cloneMap) - return clone } @@ -326,12 +325,16 @@ func kleene(s1 nfaState) (*nfaState, error) { return nil, fmt.Errorf("previous token is not quantifiable") } - toReturn := &nfaState{} - toReturn.transitions = make(map[int][]*nfaState) - toReturn.content = newContents(epsilon) + emptyState := zeroLengthMatchState() + emptyState.assert = noneAssert + toReturn := alternate(&s1, &emptyState) + + // toReturn := &nfaState{} + // toReturn.transitions = make(map[int][]*nfaState) + // toReturn.content = newContents(epsilon) toReturn.isEmpty = true toReturn.isKleene = true - toReturn.output = append(toReturn.output, toReturn) + toReturn.output = []*nfaState{&emptyState} for i := range s1.output { for _, c := range toReturn.content { s1.output[i].transitions[c], _ = uniqueAppend(s1.output[i].transitions[c], toReturn) @@ -340,6 +343,7 @@ func kleene(s1 nfaState) (*nfaState, error) { for _, c := range s1.content { toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], &s1) } + //toReturn.kleeneState = &s1 return toReturn, nil } @@ -374,7 +378,6 @@ func question(s1 *nfaState) *nfaState { // Use the fact that ab? 
== a(b|) s2.content = newContents(epsilon) s2.output = append(s2.output, s2) s2.isEmpty = true - s2.isAlternation = true s3 := alternate(s1, s2) return s3 } -- 2.30.2 From 3ce611d12185471e4e760d2954bf942ea7a661e7 Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Tue, 4 Feb 2025 14:09:24 -0500 Subject: [PATCH 10/48] More work towards implementing PCRE matching --- regex/matching.go | 62 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 54 insertions(+), 8 deletions(-) diff --git a/regex/matching.go b/regex/matching.go index ad7d15b..1263e37 100644 --- a/regex/matching.go +++ b/regex/matching.go @@ -314,13 +314,36 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in currentState.threadGroups = newMatch(numGroups + 1) currentState.threadGroups[0].StartIdx = idx } + if currentState.groupBegin { currentState.threadGroups[currentState.groupNum].StartIdx = idx - } else if currentState.groupEnd { + } + + if currentState.groupEnd { currentState.threadGroups[currentState.groupNum].EndIdx = idx - } else if currentState.isKleene { - // Append the - } else if currentState.isAlternation { + } + + // if currentState.isKleene { + // // Append the next-state (after the kleene), then append the kleene state + // allMatches := make([]*nfaState, 0) + // for _, v := range currentState.transitions { + // allMatches = append(allMatches, v...) + // } + // slices.Reverse(allMatches) + // for _, m := range allMatches { + // m.threadGroups = currentState.threadGroups + // m.threadSP = idx + // } + // currentStates = append(currentStates, allMatches...) 
+ // + // // kleeneState := currentState.kleeneState + // // kleeneState.threadGroups = currentState.threadGroups + // // kleeneState.threadSP = currentState.threadSP + // // currentStates = append(currentStates, kleeneState) + // continue + // } + + if currentState.isAlternation { rightState := currentState.rightState rightState.threadGroups = currentState.threadGroups rightState.threadSP = currentState.threadSP @@ -330,7 +353,22 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in leftState.threadSP = currentState.threadSP currentStates = append(currentStates, currentState.leftState) continue - } else if currentState.contentContains(str, idx) { + } + + if currentState.isEmpty && currentState.assert == noneAssert { + allMatches := make([]*nfaState, 0) + for _, v := range currentState.transitions { + allMatches = append(allMatches, v...) + } + slices.Reverse(allMatches) + for _, m := range allMatches { + m.threadGroups = currentState.threadGroups + m.threadSP = idx + } + currentStates = append(currentStates, allMatches...) + } + + if currentState.contentContains(str, idx) { foundMatch = true allMatches := make([]*nfaState, 0) for _, v := range currentState.transitions { @@ -348,9 +386,17 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in currentStates = append(currentStates, allMatches...) 
} - if currentState.isLast && foundMatch { // Last state reached - currentState.threadGroups[0].EndIdx = idx + 1 - return true, currentState.threadGroups, idx + 1 + if currentState.isLast { // Last state reached + if foundMatch { + currentState.threadGroups[0].EndIdx = idx + 1 + return true, currentState.threadGroups, idx + 1 + } else if currentState.isEmpty && currentState.assert == noneAssert { + currentState.threadGroups[0].EndIdx = idx + if idx == currentState.threadGroups[0].StartIdx { + idx++ + } + return true, currentState.threadGroups, idx + } } } -- 2.30.2 From d4e8cb74fdc8efe78a92ee3ef72f0ba9b7566893 Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Wed, 5 Feb 2025 11:32:20 -0500 Subject: [PATCH 11/48] Replaced pointer to nfaState with nfaState --- regex/matching.go | 65 +++++++++++++++++++++++++++-------------------- 1 file changed, 37 insertions(+), 28 deletions(-) diff --git a/regex/matching.go b/regex/matching.go index 1263e37..23f8317 100644 --- a/regex/matching.go +++ b/regex/matching.go @@ -151,6 +151,11 @@ func pruneIndices(indices []Match) []Match { return toRet } +func copyThread(to *nfaState, from nfaState) { + to.threadSP = from.threadSP + to.threadGroups = from.threadGroups +} + // Find returns the 0-group of the leftmost match of the regex in the given string. // An error value != nil indicates that no match was found. 
func (regex Reg) Find(str string) (Group, error) { @@ -271,7 +276,7 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in // foundPath := false //startIdx := offset //endIdx := offset - currentStates := make([]*nfaState, 0) + currentStates := make([]nfaState, 0) // tempStates := make([]*nfaState, 0) // Used to store states that should be used in next loop iteration i := offset // Index in string //startingFrom := i // Store starting index @@ -302,13 +307,15 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in //} start.threadSP = i - currentStates = append(currentStates, start) + currentStates = append(currentStates, *start) var foundMatch bool + var isEmptyAndNoAssertion bool // Main loop for len(currentStates) > 0 { currentState, _ := pop(&currentStates) idx := currentState.threadSP foundMatch = false + isEmptyAndNoAssertion = false if currentState.threadGroups == nil { currentState.threadGroups = newMatch(numGroups + 1) @@ -343,44 +350,39 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in // continue // } + // Alternation - enqueue left then right state, and continue if currentState.isAlternation { rightState := currentState.rightState - rightState.threadGroups = currentState.threadGroups - rightState.threadSP = currentState.threadSP - currentStates = append(currentStates, currentState.rightState) + copyThread(rightState, currentState) + currentStates = append(currentStates, *currentState.rightState) leftState := currentState.leftState - leftState.threadGroups = currentState.threadGroups - leftState.threadSP = currentState.threadSP - currentStates = append(currentStates, currentState.leftState) + copyThread(leftState, currentState) + currentStates = append(currentStates, *currentState.leftState) continue } + // Empty state - enqueue next state, do _not_ increment the SP if currentState.isEmpty && currentState.assert == noneAssert { - allMatches := make([]*nfaState, 0) - for
_, v := range currentState.transitions { - allMatches = append(allMatches, v...) - } - slices.Reverse(allMatches) - for _, m := range allMatches { - m.threadGroups = currentState.threadGroups - m.threadSP = idx - } - currentStates = append(currentStates, allMatches...) + isEmptyAndNoAssertion = true } if currentState.contentContains(str, idx) { foundMatch = true - allMatches := make([]*nfaState, 0) + } + + if isEmptyAndNoAssertion || foundMatch { + allMatches := make([]nfaState, 0) for _, v := range currentState.transitions { - allMatches = append(allMatches, v...) + dereferenced := funcMap(v, func(s *nfaState) nfaState { + return *s + }) + allMatches = append(allMatches, dereferenced...) } slices.Reverse(allMatches) - for _, m := range allMatches { - m.threadGroups = currentState.threadGroups - if currentState.assert == noneAssert { - m.threadSP = idx + 1 - } else { - m.threadSP = idx + for i := range allMatches { + copyThread(&allMatches[i], currentState) + if foundMatch && currentState.assert == noneAssert { + allMatches[i].threadSP += 1 } } currentStates = append(currentStates, allMatches...) 
@@ -388,8 +390,15 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in if currentState.isLast { // Last state reached if foundMatch { - currentState.threadGroups[0].EndIdx = idx + 1 - return true, currentState.threadGroups, idx + 1 + if currentState.assert != noneAssert { + currentState.threadGroups[0].EndIdx = idx + } else { + currentState.threadGroups[0].EndIdx = idx + 1 + } + if idx == currentState.threadGroups[0].StartIdx { + idx += 1 + } + return true, currentState.threadGroups, idx } else if currentState.isEmpty && currentState.assert == noneAssert { currentState.threadGroups[0].EndIdx = idx if idx == currentState.threadGroups[0].StartIdx { -- 2.30.2 From 7c62ba6bfdf9db60de31e61d49445fbc3e66e5da Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Wed, 5 Feb 2025 12:21:04 -0500 Subject: [PATCH 12/48] Started implementing Thompson's algorithm for matching, because the old one was completely backtracking (so it would enter infinite loops on something like '(a*)*' ) The git diff claims that a ton of code was changed, but most of it was just indentation changes. 
--- regex/matching.go | 172 ++++++++++++++++++++++++---------------------- 1 file changed, 88 insertions(+), 84 deletions(-) diff --git a/regex/matching.go b/regex/matching.go index 23f8317..760f7c1 100644 --- a/regex/matching.go +++ b/regex/matching.go @@ -277,6 +277,7 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in //startIdx := offset //endIdx := offset currentStates := make([]nfaState, 0) + nextStates := make([]nfaState, 0) // tempStates := make([]*nfaState, 0) // Used to store states that should be used in next loop iteration i := offset // Index in string //startingFrom := i // Store starting index @@ -311,103 +312,106 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in var foundMatch bool var isEmptyAndNoAssertion bool // Main loop - for len(currentStates) > 0 { - currentState, _ := pop(&currentStates) - idx := currentState.threadSP - foundMatch = false - isEmptyAndNoAssertion = false - - if currentState.threadGroups == nil { - currentState.threadGroups = newMatch(numGroups + 1) - currentState.threadGroups[0].StartIdx = idx - } - - if currentState.groupBegin { - currentState.threadGroups[currentState.groupNum].StartIdx = idx - } + for idx := i; idx <= len(str); idx++ { + for currentStateIdx := 0; currentStateIdx < len(currentStates); currentStateIdx++ { + currentState := currentStates[currentStateIdx] + foundMatch = false + isEmptyAndNoAssertion = false + + if currentState.threadGroups == nil { + currentState.threadGroups = newMatch(numGroups + 1) + currentState.threadGroups[0].StartIdx = idx + } - if currentState.groupEnd { - currentState.threadGroups[currentState.groupNum].EndIdx = idx - } + if currentState.groupBegin { + currentState.threadGroups[currentState.groupNum].StartIdx = idx + } - // if currentState.isKleene { - // // Append the next-state (after the kleene), then append the kleene state - // allMatches := make([]*nfaState, 0) - // for _, v := range currentState.transitions { - //
allMatches = append(allMatches, v...) - // } - // slices.Reverse(allMatches) - // for _, m := range allMatches { - // m.threadGroups = currentState.threadGroups - // m.threadSP = idx - // } - // currentStates = append(currentStates, allMatches...) - // - // // kleeneState := currentState.kleeneState - // // kleeneState.threadGroups = currentState.threadGroups - // // kleeneState.threadSP = currentState.threadSP - // // currentStates = append(currentStates, kleeneState) - // continue - // } - - // Alternation - enqueue left then right state, and continue - if currentState.isAlternation { - rightState := currentState.rightState - copyThread(rightState, currentState) - currentStates = append(currentStates, *currentState.rightState) - leftState := currentState.leftState - copyThread(leftState, currentState) - currentStates = append(currentStates, *currentState.leftState) - continue - } + if currentState.groupEnd { + currentState.threadGroups[currentState.groupNum].EndIdx = idx + } - // Empty state - enqueue next state, do _not_ increment the SP - if currentState.isEmpty && currentState.assert == noneAssert { - isEmptyAndNoAssertion = true - } + // if currentState.isKleene { + // // Append the next-state (after the kleene), then append the kleene state + // allMatches := make([]*nfaState, 0) + // for _, v := range currentState.transitions { + // allMatches = append(allMatches, v...) + // } + // slices.Reverse(allMatches) + // for _, m := range allMatches { + // m.threadGroups = currentState.threadGroups + // m.threadSP = idx + // } + // currentStates = append(currentStates, allMatches...) 
+ // + // // kleeneState := currentState.kleeneState + // // kleeneState.threadGroups = currentState.threadGroups + // // kleeneState.threadSP = currentState.threadSP + // // currentStates = append(currentStates, kleeneState) + // continue + // } + + // Alternation - enqueue left then right state, and continue + if currentState.isAlternation { + leftState := currentState.leftState + copyThread(leftState, currentState) + currentStates = append(currentStates, *currentState.leftState) + rightState := currentState.rightState + copyThread(rightState, currentState) + currentStates = append(currentStates, *currentState.rightState) + continue + } - if currentState.contentContains(str, idx) { - foundMatch = true - } + // Empty state - enqueue next state, do _not_ increment the SP + if currentState.isEmpty && currentState.assert == noneAssert { + isEmptyAndNoAssertion = true + } - if isEmptyAndNoAssertion || foundMatch { - allMatches := make([]nfaState, 0) - for _, v := range currentState.transitions { - dereferenced := funcMap(v, func(s *nfaState) nfaState { - return *s - }) - allMatches = append(allMatches, dereferenced...) + if currentState.contentContains(str, idx) { + foundMatch = true } - slices.Reverse(allMatches) - for i := range allMatches { - copyThread(&allMatches[i], currentState) - if foundMatch && currentState.assert == noneAssert { - allMatches[i].threadSP += 1 + + if isEmptyAndNoAssertion || foundMatch { + allMatches := make([]nfaState, 0) + for _, v := range currentState.transitions { + dereferenced := funcMap(v, func(s *nfaState) nfaState { + return *s + }) + allMatches = append(allMatches, dereferenced...) } + slices.Reverse(allMatches) + for i := range allMatches { + copyThread(&allMatches[i], currentState) + if foundMatch && currentState.assert == noneAssert { + allMatches[i].threadSP += 1 + } + } + nextStates = append(nextStates, allMatches...) } - currentStates = append(currentStates, allMatches...) 
- } - if currentState.isLast { // Last state reached - if foundMatch { - if currentState.assert != noneAssert { + if currentState.isLast { // Last state reached + if foundMatch { + if currentState.assert != noneAssert { + currentState.threadGroups[0].EndIdx = idx + } else { + currentState.threadGroups[0].EndIdx = idx + 1 + } + if idx == currentState.threadGroups[0].StartIdx { + idx += 1 + } + return true, currentState.threadGroups, idx + } else if currentState.isEmpty && currentState.assert == noneAssert { currentState.threadGroups[0].EndIdx = idx - } else { - currentState.threadGroups[0].EndIdx = idx + 1 + if idx == currentState.threadGroups[0].StartIdx { + idx++ + } + return true, currentState.threadGroups, idx } - if idx == currentState.threadGroups[0].StartIdx { - idx += 1 - } - return true, currentState.threadGroups, idx - } else if currentState.isEmpty && currentState.assert == noneAssert { - currentState.threadGroups[0].EndIdx = idx - if idx == currentState.threadGroups[0].StartIdx { - idx++ - } - return true, currentState.threadGroups, idx - } + } } + copy(currentStates, nextStates) + nextStates = nil } return false, []Group{}, i + 1 // zeroStates := make([]*nfaState, 0) -- 2.30.2 From 858e535fba88846dc8ecda50010834c27762ce92 Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Wed, 5 Feb 2025 18:01:36 -0500 Subject: [PATCH 13/48] Continued implementing Thompson's algorithm --- regex/matching.go | 43 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 36 insertions(+), 7 deletions(-) diff --git a/regex/matching.go b/regex/matching.go index 760f7c1..3252742 100644 --- a/regex/matching.go +++ b/regex/matching.go @@ -153,7 +153,7 @@ func pruneIndices(indices []Match) []Match { func copyThread(to *nfaState, from nfaState) { to.threadSP = from.threadSP - to.threadGroups = from.threadGroups + to.threadGroups = append([]Group{}, from.threadGroups...) } // Find returns the 0-group of the leftmost match of the regex in the given string. 
@@ -325,10 +325,33 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in if currentState.groupBegin { currentState.threadGroups[currentState.groupNum].StartIdx = idx + // allMatches := make([]nfaState, 0) + // for _, v := range currentState.transitions { + // dereferenced := funcMap(v, func(s *nfaState) nfaState { + // return *s + // }) + // allMatches = append(allMatches, dereferenced...) + // } + // slices.Reverse(allMatches) + // for i := range allMatches { + // copyThread(&allMatches[i], currentState) + // } + // currentStates = append(currentStates, allMatches...) } - if currentState.groupEnd { currentState.threadGroups[currentState.groupNum].EndIdx = idx + // allMatches := make([]nfaState, 0) + // for _, v := range currentState.transitions { + // dereferenced := funcMap(v, func(s *nfaState) nfaState { + // return *s + // }) + // allMatches = append(allMatches, dereferenced...) + // } + // slices.Reverse(allMatches) + // for i := range allMatches { + // copyThread(&allMatches[i], currentState) + // } + // currentStates = append(currentStates, allMatches...) } // if currentState.isKleene { @@ -363,7 +386,7 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in } // Empty state - enqueue next state, do _not_ increment the SP - if currentState.isEmpty && currentState.assert == noneAssert { + if currentState.isEmpty && currentState.assert == noneAssert { //&& currentState.groupBegin == false && currentState.groupEnd == false { isEmptyAndNoAssertion = true } @@ -386,10 +409,16 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in allMatches[i].threadSP += 1 } } - nextStates = append(nextStates, allMatches...) + if currentState.groupBegin { + currentStates = slices.Insert(currentStates, currentStateIdx+1, allMatches...) + } else if currentState.groupEnd { + currentStates = append(currentStates, allMatches...) + } else { + nextStates = append(nextStates, allMatches...) 
+ } } - if currentState.isLast { // Last state reached + if currentState.isLast && len(nextStates) == 0 { // Last state reached if foundMatch { if currentState.assert != noneAssert { currentState.threadGroups[0].EndIdx = idx @@ -400,7 +429,7 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in idx += 1 } return true, currentState.threadGroups, idx - } else if currentState.isEmpty && currentState.assert == noneAssert { + } else if isEmptyAndNoAssertion { currentState.threadGroups[0].EndIdx = idx if idx == currentState.threadGroups[0].StartIdx { idx++ @@ -410,7 +439,7 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in } } - copy(currentStates, nextStates) + currentStates = append([]nfaState{}, nextStates...) nextStates = nil } return false, []Group{}, i + 1 -- 2.30.2 From cca8c7cda2df7f221bc3430ab8bfad2190572b98 Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Wed, 5 Feb 2025 22:20:28 -0500 Subject: [PATCH 14/48] Got rid of transitions parameter, changed how kleene state is processed I replaced the transition parameter for nfaState, replacing it with a single nfaState pointer. This is because any non-alternation state will only have one next state, so the map was just added complexity. I changed alternation processing - instead of having their own dedicated fields, they just use the new 'next' parameter, and another one called 'splitState'. I also changed the kleene state processing to remove the unnecessary empty state in the right-side alternation (it actually messed up my matching).
--- regex/compile.go | 19 ++-- regex/nfa.go | 269 ++++++++++++++++++++++++----------------------- 2 files changed, 146 insertions(+), 142 deletions(-) diff --git a/regex/compile.go b/regex/compile.go index 1068966..0429c37 100644 --- a/regex/compile.go +++ b/regex/compile.go @@ -822,7 +822,6 @@ func thompson(re []postfixNode) (Reg, error) { for _, c := range re { if c.nodetype == characterNode || c.nodetype == assertionNode { stateToAdd := nfaState{} - stateToAdd.transitions = make(map[int][]*nfaState) if c.allChars { stateToAdd.allChars = true if len(c.except) != 0 { @@ -934,7 +933,6 @@ func thompson(re []postfixNode) (Reg, error) { s.isEmpty = true s.output = make([]*nfaState, 0) s.output = append(s.output, s) - s.transitions = make(map[int][]*nfaState) // LPAREN nodes are just added normally if c.nodetype == lparenNode { numGroups++ @@ -966,7 +964,7 @@ func thompson(re []postfixNode) (Reg, error) { s.groupNum = lparenNode.groupNum to_add := concatenate(lparenNode, s) nfa = append(nfa, to_add) - } else if middleNode.groupBegin && len(middleNode.transitions) == 0 { // The middle node is a lone lparen - something like '(())', and I'm looking at the first rparen + } else if middleNode.groupBegin && middleNode.numTransitions() == 0 { // The middle node is a lone lparen - something like '(())', and I'm looking at the first rparen nfa = append(nfa, lparenNode) // I shouldn't have popped this out, because it is not involved in the current capturing group s.groupNum = middleNode.groupNum // In this case, the 'middle' node is actually an lparen to_add := concatenate(middleNode, s) @@ -1030,14 +1028,14 @@ func thompson(re []postfixNode) (Reg, error) { if err != nil { return Reg{}, fmt.Errorf("error applying kleene star") } - stateToAdd, err := kleene(*s1) + stateToAdd, err := kleene(s1) if err != nil { return Reg{}, err } nfa = append(nfa, stateToAdd) case plusNode: // a+ is equivalent to aa* s1 := mustPop(&nfa) - s2, err := kleene(*s1) + s2, err := kleene(s1) if err != 
nil { return Reg{}, err } @@ -1061,14 +1059,14 @@ func thompson(re []postfixNode) (Reg, error) { // '^|a' s2, err1 := pop(&nfa) s1, err2 := pop(&nfa) - if err2 != nil || (s2.groupBegin && len(s2.transitions) == 0) { // Doesn't exist, or its just an LPAREN + if err2 != nil || (s2.groupBegin && s2.numTransitions() == 0) { // Doesn't exist, or its just an LPAREN if err2 == nil { // Roundabout way of saying that this node existed, but it was an LPAREN, so we append it back nfa = append(nfa, s2) } tmp := zeroLengthMatchState() s2 = &tmp } - if err1 != nil || (s1.groupBegin && len(s1.transitions) == 0) { // Doesn't exist, or its just an LPAREN + if err1 != nil || (s1.groupBegin && s1.numTransitions() == 0) { // Doesn't exist, or its just an LPAREN if err1 == nil { // See above for explanation nfa = append(nfa, s1) } @@ -1100,7 +1098,7 @@ func thompson(re []postfixNode) (Reg, error) { stateToAdd = concatenate(stateToAdd, cloneState(poppedState)) } if c.endReps == infinite_reps { // Case 3 - s2, err := kleene(*poppedState) + s2, err := kleene(poppedState) if err != nil { return Reg{}, err } @@ -1117,7 +1115,10 @@ func thompson(re []postfixNode) (Reg, error) { return Reg{}, fmt.Errorf("invalid regex") } - verifyLastStates(nfa) + lastState := newState() + lastState.isLast = true + + concatenate(nfa[0], &lastState) return Reg{nfa[0], numGroups}, nil diff --git a/regex/nfa.go b/regex/nfa.go index a9c1ec6..79daaf6 100644 --- a/regex/nfa.go +++ b/regex/nfa.go @@ -25,25 +25,25 @@ const ( ) type nfaState struct { - content stateContents // Contents of current state - isEmpty bool // If it is empty - Union operator and Kleene star states will be empty - isLast bool // If it is the last state (acept state) - output []*nfaState // The outputs of the current state ie. the 'outward arrows'. A union operator state will have more than one of these. - transitions map[int][]*nfaState // Transitions to different states (maps a character (int representation) to a _list of states. 
This is useful if one character can lead multiple states eg. ab|aa) - isKleene bool // Identifies whether current node is a 0-state representing Kleene star - isQuestion bool // Identifies whether current node is a 0-state representing the question operator - isAlternation bool // Identifies whether current node is a 0-state representing an alternation - leftState *nfaState // Only for alternation states - the 'left' branch of the alternation - rightState *nfaState // Only for alternation states - the 'right' branch of the alternation - assert assertType // Type of assertion of current node - NONE means that the node doesn't assert anything - allChars bool // Whether or not the state represents all characters (eg. a 'dot' metacharacter). A 'dot' node doesn't store any contents directly, as it would take up too much space - except []rune // Only valid if allChars is true - match all characters _except_ the ones in this block. Useful for inverting character classes. - lookaroundRegex string // Only for lookaround states - Contents of the regex that the lookaround state holds - lookaroundNFA *nfaState // Holds the NFA of the lookaroundRegex - if it exists - lookaroundNumCaptureGroups int // Number of capturing groups in lookaround regex if current node is a lookaround - groupBegin bool // Whether or not the node starts a capturing group - groupEnd bool // Whether or not the node ends a capturing group - groupNum int // Which capturing group the node starts / ends + content stateContents // Contents of current state + isEmpty bool // If it is empty - Union operator and Kleene star states will be empty + isLast bool // If it is the last state (acept state) + output []*nfaState // The outputs of the current state ie. the 'outward arrows'. A union operator state will have more than one of these. + // transitions map[int][]*nfaState // Transitions to different states (maps a character (int representation) to a _list of states. 
This is useful if one character can lead multiple states eg. ab|aa) + next *nfaState // The next state (not for alternation or kleene states) + isKleene bool // Identifies whether current node is a 0-state representing Kleene star + isQuestion bool // Identifies whether current node is a 0-state representing the question operator + isAlternation bool // Identifies whether current node is a 0-state representing an alternation + splitState *nfaState // Only for alternation states - the 'other' branch of the alternation ('next' is the first) + assert assertType // Type of assertion of current node - NONE means that the node doesn't assert anything + allChars bool // Whether or not the state represents all characters (eg. a 'dot' metacharacter). A 'dot' node doesn't store any contents directly, as it would take up too much space + except []rune // Only valid if allChars is true - match all characters _except_ the ones in this block. Useful for inverting character classes. + lookaroundRegex string // Only for lookaround states - Contents of the regex that the lookaround state holds + lookaroundNFA *nfaState // Holds the NFA of the lookaroundRegex - if it exists + lookaroundNumCaptureGroups int // Number of capturing groups in lookaround regex if current node is a lookaround + groupBegin bool // Whether or not the node starts a capturing group + groupEnd bool // Whether or not the node ends a capturing group + groupNum int // Which capturing group the node starts / ends // The following properties depend on the current match - I should think about resetting them for every match. zeroMatchFound bool // Whether or not the state has been used for a zero-length match - only relevant for zero states threadGroups []Group // Assuming that a state is part of a 'thread' in the matching process, this array stores the indices of capturing groups in the current thread. As matches are found for this state, its groups will be copied over. 
@@ -73,7 +73,6 @@ func cloneStateHelper(stateToClone *nfaState, cloneMap map[*nfaState]*nfaState) isEmpty: stateToClone.isEmpty, isLast: stateToClone.isLast, output: make([]*nfaState, len(stateToClone.output)), - transitions: make(map[int][]*nfaState), isKleene: stateToClone.isKleene, isQuestion: stateToClone.isQuestion, isAlternation: stateToClone.isAlternation, @@ -94,28 +93,18 @@ func cloneStateHelper(stateToClone *nfaState, cloneMap map[*nfaState]*nfaState) clone.output[i] = cloneStateHelper(s, cloneMap) } } - for k, v := range stateToClone.transitions { - clone.transitions[k] = make([]*nfaState, len(v)) - for i, s := range v { - if s == stateToClone { - clone.transitions[k][i] = clone - } else { - clone.transitions[k][i] = cloneStateHelper(s, cloneMap) - } - } - } if stateToClone.lookaroundNFA == stateToClone { clone.lookaroundNFA = clone } clone.lookaroundNFA = cloneStateHelper(stateToClone.lookaroundNFA, cloneMap) - if stateToClone.leftState == stateToClone { - clone.leftState = clone + if stateToClone.splitState == stateToClone { + clone.splitState = clone } - clone.leftState = cloneStateHelper(stateToClone.leftState, cloneMap) - if stateToClone.rightState == stateToClone { - clone.rightState = clone + clone.splitState = cloneStateHelper(stateToClone.splitState, cloneMap) + if stateToClone.next == stateToClone { + clone.next = clone } - clone.rightState = cloneStateHelper(stateToClone.rightState, cloneMap) + clone.next = cloneStateHelper(stateToClone.next, cloneMap) return clone } @@ -126,6 +115,9 @@ func resetThreads(start *nfaState) { } func resetThreadsHelper(state *nfaState, visitedMap map[*nfaState]bool) { + if state == nil { + return + } if _, ok := visitedMap[state]; ok { return } @@ -133,10 +125,11 @@ func resetThreadsHelper(state *nfaState, visitedMap map[*nfaState]bool) { state.threadGroups = nil state.threadSP = 0 visitedMap[state] = true - for _, v := range state.transitions { - for _, nextState := range v { - resetThreadsHelper(nextState, 
visitedMap) - } + if state.isAlternation { + resetThreadsHelper(state.next, visitedMap) + resetThreadsHelper(state.splitState, visitedMap) + } else { + resetThreadsHelper(state.next, visitedMap) } } @@ -237,74 +230,84 @@ func (s nfaState) isLookaround() bool { return s.assert == plaAssert || s.assert == plbAssert || s.assert == nlaAssert || s.assert == nlbAssert } -// Returns the matches for the character at the given index of the given string. -// Also returns the number of matches. Returns -1 if an assertion failed. -func (s nfaState) matchesFor(str []rune, idx int) ([]*nfaState, int) { - // Assertions can be viewed as 'checks'. If the check fails, we return - // an empty array and 0. - // If it passes, we treat it like any other state, and return all the transitions. - if s.assert != noneAssert { - if s.checkAssertion(str, idx) == false { - return make([]*nfaState, 0), -1 - } +func (s nfaState) numTransitions() int { + if s.next == nil && s.splitState == nil { + return 0 } - listTransitions := s.transitions[int(str[idx])] - for _, dest := range s.transitions[int(anyCharRune)] { - if !slices.Contains(slices.Concat(notDotChars, dest.except), str[idx]) { - // Add an allChar state to the list of matches if: - // a. The current character isn't a 'notDotChars' character. In single line mode, this includes newline. In multiline mode, it doesn't. - // b. The current character isn't the state's exception list. - listTransitions = append(listTransitions, dest) - } + if s.next == nil || s.splitState == nil { + return 1 } - numTransitions := len(listTransitions) - return listTransitions, numTransitions + return 2 } -// verifyLastStatesHelper performs the depth-first recursion needed for verifyLastStates -func verifyLastStatesHelper(st *nfaState, visited map[*nfaState]bool) { - if len(st.transitions) == 0 { - st.isLast = true - return - } - // if len(state.transitions) == 1 && len(state.transitions[state.content]) == 1 && state.transitions[state.content][0] == state { // Eg. 
a* - if len(st.transitions) == 1 { // Eg. a* - var moreThanOneTrans bool // Dummy variable, check if all the transitions for the current's state's contents have a length of one - for _, c := range st.content { - if len(st.transitions[c]) != 1 || st.transitions[c][0] != st { - moreThanOneTrans = true - } - } - st.isLast = !moreThanOneTrans - } +// Returns the matches for the character at the given index of the given string. +// Also returns the number of matches. Returns -1 if an assertion failed. +//func (s nfaState) matchesFor(str []rune, idx int) ([]*nfaState, int) { +// // Assertions can be viewed as 'checks'. If the check fails, we return +// // an empty array and 0. +// // If it passes, we treat it like any other state, and return all the transitions. +// if s.assert != noneAssert { +// if s.checkAssertion(str, idx) == false { +// return make([]*nfaState, 0), -1 +// } +// } +// listTransitions := s.transitions[int(str[idx])] +// for _, dest := range s.transitions[int(anyCharRune)] { +// if !slices.Contains(slices.Concat(notDotChars, dest.except), str[idx]) { +// // Add an allChar state to the list of matches if: +// // a. The current character isn't a 'notDotChars' character. In single line mode, this includes newline. In multiline mode, it doesn't. +// // b. The current character isn't the state's exception list. +// listTransitions = append(listTransitions, dest) +// } +// } +// numTransitions := len(listTransitions) +// return listTransitions, numTransitions +//} - if st.isKleene { // A State representing a Kleene Star has transitions going out, which loop back to it. If all those transitions point to the same (single) state, then it must be a last state - transitionDests := make([]*nfaState, 0) - for _, v := range st.transitions { - transitionDests = append(transitionDests, v...) - } - if allEqual(transitionDests...) 
{ - st.isLast = true - return - } - } - if visited[st] == true { - return - } - visited[st] = true - for _, states := range st.transitions { - for i := range states { - if states[i] != st { - verifyLastStatesHelper(states[i], visited) - } - } - } -} +// verifyLastStatesHelper performs the depth-first recursion needed for verifyLastStates +//func verifyLastStatesHelper(st *nfaState, visited map[*nfaState]bool) { +// if st.numTransitions() == 0 { +// st.isLast = true +// return +// } +// // if len(state.transitions) == 1 && len(state.transitions[state.content]) == 1 && state.transitions[state.content][0] == state { // Eg. a* +// if st.numTransitions() == 1 { // Eg. a* +// var moreThanOneTrans bool // Dummy variable, check if all the transitions for the current's state's contents have a length of one +// for _, c := range st.content { +// if len(st.transitions[c]) != 1 || st.transitions[c][0] != st { +// moreThanOneTrans = true +// } +// } +// st.isLast = !moreThanOneTrans +// } +// +// if st.isKleene { // A State representing a Kleene Star has transitions going out, which loop back to it. If all those transitions point to the same (single) state, then it must be a last state +// transitionDests := make([]*nfaState, 0) +// for _, v := range st.transitions { +// transitionDests = append(transitionDests, v...) +// } +// if allEqual(transitionDests...) 
{ +// st.isLast = true +// return +// } +// } +// if visited[st] == true { +// return +// } +// visited[st] = true +// for _, states := range st.transitions { +// for i := range states { +// if states[i] != st { +// verifyLastStatesHelper(states[i], visited) +// } +// } +// } +//} // verifyLastStates enables the 'isLast' flag for the leaf nodes (last states) -func verifyLastStates(start []*nfaState) { - verifyLastStatesHelper(start[0], make(map[*nfaState]bool)) -} +//func verifyLastStates(start []*nfaState) { +// verifyLastStatesHelper(start[0], make(map[*nfaState]bool)) +//} // Concatenates s1 and s2, returns the start of the concatenation. func concatenate(s1 *nfaState, s2 *nfaState) *nfaState { @@ -312,69 +315,69 @@ func concatenate(s1 *nfaState, s2 *nfaState) *nfaState { return s2 } for i := range s1.output { - for _, c := range s2.content { // Create transitions for every element in s1's content to s2' - s1.output[i].transitions[c], _ = uniqueAppend(s1.output[i].transitions[c], s2) - } + s1.output[i].next = s2 } s1.output = s2.output return s1 } -func kleene(s1 nfaState) (*nfaState, error) { +func kleene(s1 *nfaState) (*nfaState, error) { if s1.isEmpty && s1.assert != noneAssert { return nil, fmt.Errorf("previous token is not quantifiable") } - emptyState := zeroLengthMatchState() - emptyState.assert = noneAssert - toReturn := alternate(&s1, &emptyState) + toReturn := &nfaState{} + toReturn.isEmpty = true + toReturn.isAlternation = true + toReturn.content = newContents(epsilon) + toReturn.splitState = s1 + for i := range s1.output { + s1.output[i].next = toReturn + } // toReturn := &nfaState{} // toReturn.transitions = make(map[int][]*nfaState) // toReturn.content = newContents(epsilon) - toReturn.isEmpty = true toReturn.isKleene = true - toReturn.output = []*nfaState{&emptyState} + toReturn.output = append([]*nfaState{}, toReturn) for i := range s1.output { - for _, c := range toReturn.content { - s1.output[i].transitions[c], _ = 
uniqueAppend(s1.output[i].transitions[c], toReturn) - } - } - for _, c := range s1.content { - toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], &s1) + s1.output[i].next = toReturn } + // for _, c := range s1.content { + // toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], &s1) + // } //toReturn.kleeneState = &s1 return toReturn, nil } func alternate(s1 *nfaState, s2 *nfaState) *nfaState { toReturn := &nfaState{} - toReturn.transitions = make(map[int][]*nfaState) + // toReturn.transitions = make(map[int][]*nfaState) toReturn.output = append(toReturn.output, s1.output...) toReturn.output = append(toReturn.output, s2.output...) - // Unique append is used here (and elsewhere) to ensure that, - // for any given transition, a state can only be mentioned once. - // For example, given the transition 'a', the state 's1' can only be mentioned once. - // This would lead to multiple instances of the same set of match indices, since both - // 's1' states would be considered to match. - for _, c := range s1.content { - toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], s1) - } - for _, c := range s2.content { - toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], s2) - } + // // Unique append is used here (and elsewhere) to ensure that, + // // for any given transition, a state can only be mentioned once. + // // For example, given the transition 'a', the state 's1' can only be mentioned once. + // // This would lead to multiple instances of the same set of match indices, since both + // // 's1' states would be considered to match. 
+ // for _, c := range s1.content { + // toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], s1) + // } + // for _, c := range s2.content { + // toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], s2) + // } toReturn.content = newContents(epsilon) toReturn.isEmpty = true toReturn.isAlternation = true - toReturn.leftState = s1 - toReturn.rightState = s2 + toReturn.next = s1 + toReturn.splitState = s2 return toReturn } func question(s1 *nfaState) *nfaState { // Use the fact that ab? == a(b|) s2 := &nfaState{} - s2.transitions = make(map[int][]*nfaState) + // s2.transitions = make(map[int][]*nfaState) s2.content = newContents(epsilon) s2.output = append(s2.output, s2) s2.isEmpty = true @@ -385,8 +388,8 @@ func question(s1 *nfaState) *nfaState { // Use the fact that ab? == a(b|) // Creates and returns a new state with the 'default' values. func newState() nfaState { ret := nfaState{ - output: make([]*nfaState, 0), - transitions: make(map[int][]*nfaState), + output: make([]*nfaState, 0), + // transitions: make(map[int][]*nfaState), assert: noneAssert, except: append([]rune{}, 0), lookaroundRegex: "", -- 2.30.2 From fbc9bea9fb78beeefb5bf8602e7ed13c0591d10b Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Wed, 5 Feb 2025 22:23:31 -0500 Subject: [PATCH 15/48] Commented out unused functions; use new nfaState parameters --- regex/matching.go | 145 ++++++++++++++++++++++------------------------ 1 file changed, 68 insertions(+), 77 deletions(-) diff --git a/regex/matching.go b/regex/matching.go index 3252742..d504801 100644 --- a/regex/matching.go +++ b/regex/matching.go @@ -74,58 +74,58 @@ func getZeroGroup(m Match) Group { // given slice. It returns the resulting states. If any of the resulting states is a 0-state, // the second ret val is true. // If a state begins or ends a capturing group, its 'thread' is updated to contain the correct index. 
-func takeZeroState(states []*nfaState, numGroups int, idx int) (rtv []*nfaState, isZero bool) { - for _, state := range states { - if len(state.transitions[epsilon]) > 0 { - for _, s := range state.transitions[epsilon] { - if s.threadGroups == nil { - s.threadGroups = newMatch(numGroups + 1) - } - copy(s.threadGroups, state.threadGroups) - if s.groupBegin { - s.threadGroups[s.groupNum].StartIdx = idx - // openParenGroups = append(openParenGroups, s.groupNum) - } - if s.groupEnd { - s.threadGroups[s.groupNum].EndIdx = idx - // closeParenGroups = append(closeParenGroups, s.groupNum) - } - } - rtv = append(rtv, state.transitions[epsilon]...) - } - } - for _, state := range rtv { - if len(state.transitions[epsilon]) > 0 { - return rtv, true - } - } - return rtv, false -} +//func takeZeroState(states []*nfaState, numGroups int, idx int) (rtv []*nfaState, isZero bool) { +// for _, state := range states { +// if len(state.transitions[epsilon]) > 0 { +// for _, s := range state.transitions[epsilon] { +// if s.threadGroups == nil { +// s.threadGroups = newMatch(numGroups + 1) +// } +// copy(s.threadGroups, state.threadGroups) +// if s.groupBegin { +// s.threadGroups[s.groupNum].StartIdx = idx +// // openParenGroups = append(openParenGroups, s.groupNum) +// } +// if s.groupEnd { +// s.threadGroups[s.groupNum].EndIdx = idx +// // closeParenGroups = append(closeParenGroups, s.groupNum) +// } +// } +// rtv = append(rtv, state.transitions[epsilon]...) +// } +// } +// for _, state := range rtv { +// if len(state.transitions[epsilon]) > 0 { +// return rtv, true +// } +// } +// return rtv, false +//} // zeroMatchPossible returns true if a zero-length match is possible // from any of the given states, given the string and our position in it. // It uses the same algorithm to find zero-states as the one inside the loop, // so I should probably put it in a function. 
-func zeroMatchPossible(str []rune, idx int, numGroups int, states ...*nfaState) bool { - zeroStates, isZero := takeZeroState(states, numGroups, idx) - tempstates := make([]*nfaState, 0, len(zeroStates)+len(states)) - tempstates = append(tempstates, states...) - tempstates = append(tempstates, zeroStates...) - num_appended := 0 // number of unique states addded to tempstates - for isZero == true { - zeroStates, isZero = takeZeroState(tempstates, numGroups, idx) - tempstates, num_appended = uniqueAppend(tempstates, zeroStates...) - if num_appended == 0 { // break if we haven't appended any more unique values - break - } - } - for _, state := range tempstates { - if state.isEmpty && (state.assert == noneAssert || state.checkAssertion(str, idx)) && state.isLast { - return true - } - } - return false -} +//func zeroMatchPossible(str []rune, idx int, numGroups int, states ...*nfaState) bool { +// zeroStates, isZero := takeZeroState(states, numGroups, idx) +// tempstates := make([]*nfaState, 0, len(zeroStates)+len(states)) +// tempstates = append(tempstates, states...) +// tempstates = append(tempstates, zeroStates...) +// num_appended := 0 // number of unique states addded to tempstates +// for isZero == true { +// zeroStates, isZero = takeZeroState(tempstates, numGroups, idx) +// tempstates, num_appended = uniqueAppend(tempstates, zeroStates...) +// if num_appended == 0 { // break if we haven't appended any more unique values +// break +// } +// } +// for _, state := range tempstates { +// if state.isEmpty && (state.assert == noneAssert || state.checkAssertion(str, idx)) && state.isLast { +// return true +// } +// } +// return false +//} // Prunes the slice by removing overlapping indices. 
func pruneIndices(indices []Match) []Match { @@ -376,17 +376,26 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in // Alternation - enqueue left then right state, and continue if currentState.isAlternation { - leftState := currentState.leftState - copyThread(leftState, currentState) - currentStates = append(currentStates, *currentState.leftState) - rightState := currentState.rightState - copyThread(rightState, currentState) - currentStates = append(currentStates, *currentState.rightState) + if currentState.isKleene { // Reverse order of adding things + rightState := currentState.splitState + copyThread(rightState, currentState) + currentStates = append(currentStates, *currentState.splitState) + leftState := currentState.next + copyThread(leftState, currentState) + currentStates = append(currentStates, *currentState.next) + } else { + leftState := currentState.next + copyThread(leftState, currentState) + currentStates = append(currentStates, *currentState.next) + rightState := currentState.splitState + copyThread(rightState, currentState) + currentStates = append(currentStates, *currentState.splitState) + } continue } // Empty state - enqueue next state, do _not_ increment the SP - if currentState.isEmpty && currentState.assert == noneAssert { //&& currentState.groupBegin == false && currentState.groupEnd == false { + if !currentState.isAlternation && currentState.isEmpty && currentState.assert == noneAssert { //&& currentState.groupBegin == false && currentState.groupEnd == false { isEmptyAndNoAssertion = true } @@ -396,12 +405,7 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in if isEmptyAndNoAssertion || foundMatch { allMatches := make([]nfaState, 0) - for _, v := range currentState.transitions { - dereferenced := funcMap(v, func(s *nfaState) nfaState { - return *s - }) - allMatches = append(allMatches, dereferenced...) 
- } + allMatches = append(allMatches, *(currentState.next)) slices.Reverse(allMatches) for i := range allMatches { copyThread(&allMatches[i], currentState) @@ -419,24 +423,11 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in } if currentState.isLast && len(nextStates) == 0 { // Last state reached - if foundMatch { - if currentState.assert != noneAssert { - currentState.threadGroups[0].EndIdx = idx - } else { - currentState.threadGroups[0].EndIdx = idx + 1 - } - if idx == currentState.threadGroups[0].StartIdx { - idx += 1 - } - return true, currentState.threadGroups, idx - } else if isEmptyAndNoAssertion { - currentState.threadGroups[0].EndIdx = idx - if idx == currentState.threadGroups[0].StartIdx { - idx++ - } - return true, currentState.threadGroups, idx + currentState.threadGroups[0].EndIdx = idx + if idx == currentState.threadGroups[0].StartIdx { + idx += 1 } - + return true, currentState.threadGroups, idx } } currentStates = append([]nfaState{}, nextStates...) 
-- 2.30.2 From ed4ffde64e57c555d3f3fb3aa887eaeab66597de Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Wed, 5 Feb 2025 22:51:55 -0500 Subject: [PATCH 16/48] REFACTOR NEEDED: Added another special case; insert instead of appending into currentStates --- regex/matching.go | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/regex/matching.go b/regex/matching.go index d504801..95f7a69 100644 --- a/regex/matching.go +++ b/regex/matching.go @@ -379,17 +379,17 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in if currentState.isKleene { // Reverse order of adding things rightState := currentState.splitState copyThread(rightState, currentState) - currentStates = append(currentStates, *currentState.splitState) + currentStates = slices.Insert(currentStates, currentStateIdx+1, *rightState) leftState := currentState.next copyThread(leftState, currentState) - currentStates = append(currentStates, *currentState.next) + currentStates = slices.Insert(currentStates, currentStateIdx+2, *leftState) } else { leftState := currentState.next copyThread(leftState, currentState) - currentStates = append(currentStates, *currentState.next) + currentStates = slices.Insert(currentStates, currentStateIdx+1, *leftState) rightState := currentState.splitState copyThread(rightState, currentState) - currentStates = append(currentStates, *currentState.splitState) + currentStates = slices.Insert(currentStates, currentStateIdx+2, *rightState) } continue } @@ -417,6 +417,8 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in currentStates = slices.Insert(currentStates, currentStateIdx+1, allMatches...) } else if currentState.groupEnd { currentStates = append(currentStates, allMatches...) + } else if currentState.assert != noneAssert { + currentStates = append(currentStates, allMatches...) } else { nextStates = append(nextStates, allMatches...) 
} -- 2.30.2 From 8534174ea1d83d3d8a7ed9e8e837cd075daaa5e0 Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Thu, 6 Feb 2025 22:06:22 -0500 Subject: [PATCH 17/48] Use pointers instead of values --- regex/compile.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/regex/compile.go b/regex/compile.go index 0429c37..fa51e0d 100644 --- a/regex/compile.go +++ b/regex/compile.go @@ -987,7 +987,8 @@ func thompson(re []postfixNode) (Reg, error) { if c.nodetype == charclassNode { // A Character class consists of all the nodes in it, alternated // Map the list of nodes to a list of states, each state containing the contents of a specific node states := funcMap(c.nodeContents, func(node postfixNode) *nfaState { - s := newState() + s := &nfaState{} + s.output = append(s.output, s) nodeContents := node.contents if caseInsensitive { nodeContents = slices.Concat(funcMap(nodeContents, func(r rune) []rune { @@ -1001,7 +1002,7 @@ func thompson(re []postfixNode) (Reg, error) { return n.contents })...) } - return &s + return s }) // Reduce the list of states down to a single state by alternating them toAdd := funcReduce(states, func(s1 *nfaState, s2 *nfaState) *nfaState { -- 2.30.2 From 1d4f695f8f72f113528c28e850f4b4954a233930 Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Thu, 6 Feb 2025 22:06:51 -0500 Subject: [PATCH 18/48] Wrote function to check if a state is in an nfaState, based on the Equals function --- regex/nfa.go | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/regex/nfa.go b/regex/nfa.go index 79daaf6..d051a25 100644 --- a/regex/nfa.go +++ b/regex/nfa.go @@ -47,7 +47,6 @@ type nfaState struct { // The following properties depend on the current match - I should think about resetting them for every match. 
zeroMatchFound bool // Whether or not the state has been used for a zero-length match - only relevant for zero states threadGroups []Group // Assuming that a state is part of a 'thread' in the matching process, this array stores the indices of capturing groups in the current thread. As matches are found for this state, its groups will be copied over. - threadSP int // The string pointer of the thread - where it is in the input string } // Clones the NFA starting from the given state. @@ -123,7 +122,6 @@ func resetThreadsHelper(state *nfaState, visitedMap map[*nfaState]bool) { } // Assuming it hasn't been visited state.threadGroups = nil - state.threadSP = 0 visitedMap[state] = true if state.isAlternation { resetThreadsHelper(state.next, visitedMap) @@ -408,3 +406,32 @@ func zeroLengthMatchState() nfaState { start.assert = alwaysTrueAssert return start } + +func (s nfaState) equals(other nfaState) bool { + return slices.Equal(s.content, other.content) && + s.isEmpty == other.isEmpty && + s.isLast == other.isLast && + slices.Equal(s.output, other.output) && + s.next == other.next && + s.isKleene == other.isKleene && + s.isQuestion == other.isQuestion && + s.isAlternation == other.isAlternation && + s.splitState == other.splitState && + s.assert == other.assert && + s.allChars == other.allChars && + slices.Equal(s.except, other.except) && + s.lookaroundNFA == other.lookaroundNFA && + s.groupBegin == other.groupBegin && + s.groupEnd == other.groupEnd && + s.groupNum == other.groupNum && + slices.Equal(s.threadGroups, other.threadGroups) +} + +func stateExists(list []nfaState, s nfaState) bool { + for i := range list { + if list[i].equals(s) { + return true + } + } + return false +} -- 2.30.2 From ccf3b3b29964b44a5477225b93035bded96ade84 Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Thu, 6 Feb 2025 22:08:56 -0500 Subject: [PATCH 19/48] More progress on implementing PCRE matching --- regex/matching.go | 62 +++++++++++++++++++++++++++++++++++------------ 1 file 
changed, 47 insertions(+), 15 deletions(-) diff --git a/regex/matching.go b/regex/matching.go index 95f7a69..06fd16b 100644 --- a/regex/matching.go +++ b/regex/matching.go @@ -152,7 +152,6 @@ func pruneIndices(indices []Match) []Match { } func copyThread(to *nfaState, from nfaState) { - to.threadSP = from.threadSP to.threadGroups = append([]Group{}, from.threadGroups...) } @@ -253,6 +252,35 @@ func (regex Reg) FindAllSubmatch(str string) []Match { return indices } +func addStateToList(idx int, list []nfaState, state nfaState) []nfaState { + if stateExists(list, state) { + return list + } + if state.isAlternation { + copyThread(state.next, state) + list = append(list, addStateToList(idx, list, *state.next)...) + copyThread(state.splitState, state) + list = append(list, addStateToList(idx, list, *state.splitState)...) + return list + } + if state.isKleene { + copyThread(state.splitState, state) + list = append(list, addStateToList(idx, list, *state.splitState)...) + copyThread(state.next, state) + list = append(list, addStateToList(idx, list, *state.next)...) + return list + } + if state.groupBegin { + state.threadGroups[state.groupNum].StartIdx = idx + } + if state.groupEnd { + state.threadGroups[state.groupNum].StartIdx = idx + } + copyThread(state.next, state) + return append(list, *state.next) + +} + // Helper for FindAllMatches. Returns whether it found a match, the // first Match it finds, and how far it got into the string ie. where // the next search should start from. 
@@ -307,7 +335,6 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in // tempIndices[start.groupNum].startIdx = i //} - start.threadSP = i currentStates = append(currentStates, *start) var foundMatch bool var isEmptyAndNoAssertion bool @@ -404,23 +431,28 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in } if isEmptyAndNoAssertion || foundMatch { - allMatches := make([]nfaState, 0) - allMatches = append(allMatches, *(currentState.next)) - slices.Reverse(allMatches) - for i := range allMatches { - copyThread(&allMatches[i], currentState) - if foundMatch && currentState.assert == noneAssert { - allMatches[i].threadSP += 1 - } - } + nextMatch := *(currentState.next) + copyThread(&nextMatch, currentState) if currentState.groupBegin { - currentStates = slices.Insert(currentStates, currentStateIdx+1, allMatches...) + // if !stateExists(currentStates, nextMatch) { + currentStates = slices.Insert(currentStates, currentStateIdx+1, nextMatch) + //} } else if currentState.groupEnd { - currentStates = append(currentStates, allMatches...) + if !stateExists(currentStates, nextMatch) { + currentStates = slices.Insert(currentStates, currentStateIdx+1, nextMatch) // append(currentStates, nextMatch) + } } else if currentState.assert != noneAssert { - currentStates = append(currentStates, allMatches...) + if !stateExists(currentStates, nextMatch) { + currentStates = append(currentStates, nextMatch) + } + } else if currentState.isEmpty && !currentState.groupBegin && !currentState.groupEnd { + if !stateExists(currentStates, nextMatch) { + currentStates = append(currentStates, nextMatch) + } } else { - nextStates = append(nextStates, allMatches...) 
+ if !stateExists(nextStates, nextMatch) { + nextStates = append(nextStates, nextMatch) + } } } -- 2.30.2 From d2ad0d95a88f53248bf2b2739f59d0e037a12574 Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Fri, 7 Feb 2025 16:04:26 -0500 Subject: [PATCH 20/48] Modified question operator so that it doesn't create an unnecessary zero-state --- regex/nfa.go | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/regex/nfa.go b/regex/nfa.go index d051a25..d7ac1af 100644 --- a/regex/nfa.go +++ b/regex/nfa.go @@ -329,9 +329,6 @@ func kleene(s1 *nfaState) (*nfaState, error) { toReturn.isAlternation = true toReturn.content = newContents(epsilon) toReturn.splitState = s1 - for i := range s1.output { - s1.output[i].next = toReturn - } // toReturn := &nfaState{} // toReturn.transitions = make(map[int][]*nfaState) @@ -373,14 +370,20 @@ func alternate(s1 *nfaState, s2 *nfaState) *nfaState { return toReturn } -func question(s1 *nfaState) *nfaState { // Use the fact that ab? == a(b|) - s2 := &nfaState{} +func question(s1 *nfaState) (*nfaState, error) { // Use the fact that ab? == a(b|) + if s1.isEmpty && s1.assert != noneAssert { + return nil, fmt.Errorf("previous token is not quantifiable") + } + toReturn := &nfaState{} + toReturn.isEmpty = true + toReturn.isAlternation = true + toReturn.isQuestion = true + toReturn.content = newContents(epsilon) + toReturn.splitState = s1 + toReturn.output = append([]*nfaState{}, toReturn) + toReturn.output = append(toReturn.output, s1.output...) // s2.transitions = make(map[int][]*nfaState) - s2.content = newContents(epsilon) - s2.output = append(s2.output, s2) - s2.isEmpty = true - s3 := alternate(s1, s2) - return s3 + return toReturn, nil } // Creates and returns a new state with the 'default' values. 
-- 2.30.2 From 052de558261ce31b4603f43b1a7dde286e8890ce Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Fri, 7 Feb 2025 16:04:46 -0500 Subject: [PATCH 21/48] question() now returns 2 values --- regex/compile.go | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/regex/compile.go b/regex/compile.go index fa51e0d..03e9f54 100644 --- a/regex/compile.go +++ b/regex/compile.go @@ -1047,7 +1047,10 @@ func thompson(re []postfixNode) (Reg, error) { if err != nil { return Reg{}, fmt.Errorf("error applying question operator") } - s2 := question(s1) + s2, err := question(s1) + if err != nil { + return Reg{}, err + } nfa = append(nfa, s2) case pipeNode: // A pipe operator doesn't actually need either operand to be present. If an operand isn't present, @@ -1106,7 +1109,11 @@ func thompson(re []postfixNode) (Reg, error) { stateToAdd = concatenate(stateToAdd, s2) } else { // Case 2 for i := c.startReps; i < c.endReps; i++ { - stateToAdd = concatenate(stateToAdd, question(cloneState(poppedState))) + tmp, err := question(cloneState(poppedState)) + if err != nil { + return Reg{}, fmt.Errorf("error processing bounded repetition") + } + stateToAdd = concatenate(stateToAdd, tmp) } } nfa = append(nfa, stateToAdd) -- 2.30.2 From 3604486a9b8195dc4fa05d496a2943597ad5e3ae Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Fri, 7 Feb 2025 16:06:45 -0500 Subject: [PATCH 22/48] Used Pike's algorithm (an extension to Thompson's algorithm) (see Russ Cox's 2nd article); I think I almost have a working PCRE-style engine --- regex/matching.go | 224 +++++++++++++++++++++------------------------- 1 file changed, 101 insertions(+), 123 deletions(-) diff --git a/regex/matching.go b/regex/matching.go index 06fd16b..dab6446 100644 --- a/regex/matching.go +++ b/regex/matching.go @@ -2,7 +2,6 @@ package regex import ( "fmt" - "slices" "sort" ) @@ -252,32 +251,36 @@ func (regex Reg) FindAllSubmatch(str string) []Match { return indices } -func addStateToList(idx int, 
list []nfaState, state nfaState) []nfaState { +func addStateToList(idx int, list []nfaState, state nfaState, threadGroups []Group) []nfaState { if stateExists(list, state) { return list } - if state.isAlternation { - copyThread(state.next, state) - list = append(list, addStateToList(idx, list, *state.next)...) + if state.isKleene || state.isQuestion { copyThread(state.splitState, state) - list = append(list, addStateToList(idx, list, *state.splitState)...) + list = addStateToList(idx, list, *state.splitState, threadGroups) + copyThread(state.next, state) + list = addStateToList(idx, list, *state.next, threadGroups) return list } - if state.isKleene { - copyThread(state.splitState, state) - list = append(list, addStateToList(idx, list, *state.splitState)...) + if state.isAlternation { copyThread(state.next, state) - list = append(list, addStateToList(idx, list, *state.next)...) + list = addStateToList(idx, list, *state.next, threadGroups) + copyThread(state.splitState, state) + list = addStateToList(idx, list, *state.splitState, threadGroups) return list } + + state.threadGroups = append([]Group{}, threadGroups...) if state.groupBegin { state.threadGroups[state.groupNum].StartIdx = idx + return append(list, addStateToList(idx, list, *state.next, state.threadGroups)...) } if state.groupEnd { - state.threadGroups[state.groupNum].StartIdx = idx + state.threadGroups[state.groupNum].EndIdx = idx + return append(list, addStateToList(idx, list, *state.next, state.threadGroups)...) } - copyThread(state.next, state) - return append(list, *state.next) + state.threadGroups = append([]Group{}, threadGroups...) 
+ return append(list, state) } @@ -335,138 +338,113 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in // tempIndices[start.groupNum].startIdx = i //} - currentStates = append(currentStates, *start) - var foundMatch bool - var isEmptyAndNoAssertion bool + start.threadGroups = newMatch(numGroups + 1) + start.threadGroups[0].StartIdx = i + currentStates = addStateToList(i, currentStates, *start, start.threadGroups) + var match Match = nil + // var isEmptyAndNoAssertion bool // Main loop for idx := i; idx <= len(str); idx++ { + if len(currentStates) == 0 { + break + } for currentStateIdx := 0; currentStateIdx < len(currentStates); currentStateIdx++ { currentState := currentStates[currentStateIdx] - foundMatch = false - isEmptyAndNoAssertion = false if currentState.threadGroups == nil { currentState.threadGroups = newMatch(numGroups + 1) currentState.threadGroups[0].StartIdx = idx } - if currentState.groupBegin { - currentState.threadGroups[currentState.groupNum].StartIdx = idx - // allMatches := make([]nfaState, 0) - // for _, v := range currentState.transitions { - // dereferenced := funcMap(v, func(s *nfaState) nfaState { - // return *s - // }) - // allMatches = append(allMatches, dereferenced...) - // } - // slices.Reverse(allMatches) - // for i := range allMatches { - // copyThread(&allMatches[i], currentState) - // } - // currentStates = append(currentStates, allMatches...) - } - if currentState.groupEnd { - currentState.threadGroups[currentState.groupNum].EndIdx = idx - // allMatches := make([]nfaState, 0) - // for _, v := range currentState.transitions { - // dereferenced := funcMap(v, func(s *nfaState) nfaState { - // return *s - // }) - // allMatches = append(allMatches, dereferenced...) - // } - // slices.Reverse(allMatches) - // for i := range allMatches { - // copyThread(&allMatches[i], currentState) - // } - // currentStates = append(currentStates, allMatches...) 
+ if currentState.isLast { + currentState.threadGroups[0].EndIdx = idx + match = append([]Group{}, currentState.threadGroups...) + break + } else if !currentState.isAlternation && !currentState.isKleene && !currentState.isQuestion && !currentState.groupBegin && !currentState.groupEnd { // Normal character or assertion + if currentState.contentContains(str, idx) { + nextStates = addStateToList(idx+1, nextStates, *currentState.next, currentState.threadGroups) + } } - // if currentState.isKleene { - // // Append the next-state (after the kleene), then append the kleene state - // allMatches := make([]*nfaState, 0) - // for _, v := range currentState.transitions { - // allMatches = append(allMatches, v...) + // if currentState.groupBegin { + // currentState.threadGroups[currentState.groupNum].StartIdx = idx // } - // slices.Reverse(allMatches) - // for _, m := range allMatches { - // m.threadGroups = currentState.threadGroups - // m.threadSP = idx + // if currentState.groupEnd { + // currentState.threadGroups[currentState.groupNum].EndIdx = idx // } - // currentStates = append(currentStates, allMatches...) 
- // - // // kleeneState := currentState.kleeneState - // // kleeneState.threadGroups = currentState.threadGroups - // // kleeneState.threadSP = currentState.threadSP - // // currentStates = append(currentStates, kleeneState) - // continue - // } // Alternation - enqueue left then right state, and continue - if currentState.isAlternation { - if currentState.isKleene { // Reverse order of adding things - rightState := currentState.splitState - copyThread(rightState, currentState) - currentStates = slices.Insert(currentStates, currentStateIdx+1, *rightState) - leftState := currentState.next - copyThread(leftState, currentState) - currentStates = slices.Insert(currentStates, currentStateIdx+2, *leftState) - } else { - leftState := currentState.next - copyThread(leftState, currentState) - currentStates = slices.Insert(currentStates, currentStateIdx+1, *leftState) - rightState := currentState.splitState - copyThread(rightState, currentState) - currentStates = slices.Insert(currentStates, currentStateIdx+2, *rightState) - } - continue - } + // if currentState.isAlternation { + // if currentState.isKleene { // Reverse order of adding things + // rightState := currentState.splitState + // copyThread(rightState, currentState) + // currentStates = slices.Insert(currentStates, currentStateIdx+1, *rightState) + // leftState := currentState.next + // copyThread(leftState, currentState) + // currentStates = slices.Insert(currentStates, currentStateIdx+2, *leftState) + // } else { + // leftState := currentState.next + // copyThread(leftState, currentState) + // currentStates = slices.Insert(currentStates, currentStateIdx+1, *leftState) + // rightState := currentState.splitState + // copyThread(rightState, currentState) + // currentStates = slices.Insert(currentStates, currentStateIdx+2, *rightState) + // } + // continue + // } // Empty state - enqueue next state, do _not_ increment the SP - if !currentState.isAlternation && currentState.isEmpty && currentState.assert == 
noneAssert { //&& currentState.groupBegin == false && currentState.groupEnd == false { - isEmptyAndNoAssertion = true - } - - if currentState.contentContains(str, idx) { - foundMatch = true - } - - if isEmptyAndNoAssertion || foundMatch { - nextMatch := *(currentState.next) - copyThread(&nextMatch, currentState) - if currentState.groupBegin { - // if !stateExists(currentStates, nextMatch) { - currentStates = slices.Insert(currentStates, currentStateIdx+1, nextMatch) - //} - } else if currentState.groupEnd { - if !stateExists(currentStates, nextMatch) { - currentStates = slices.Insert(currentStates, currentStateIdx+1, nextMatch) // append(currentStates, nextMatch) - } - } else if currentState.assert != noneAssert { - if !stateExists(currentStates, nextMatch) { - currentStates = append(currentStates, nextMatch) - } - } else if currentState.isEmpty && !currentState.groupBegin && !currentState.groupEnd { - if !stateExists(currentStates, nextMatch) { - currentStates = append(currentStates, nextMatch) - } - } else { - if !stateExists(nextStates, nextMatch) { - nextStates = append(nextStates, nextMatch) - } - } - } - - if currentState.isLast && len(nextStates) == 0 { // Last state reached - currentState.threadGroups[0].EndIdx = idx - if idx == currentState.threadGroups[0].StartIdx { - idx += 1 - } - return true, currentState.threadGroups, idx - } + // if !currentState.isAlternation && currentState.isEmpty && currentState.assert == noneAssert { //&& currentState.groupBegin == false && currentState.groupEnd == false { + // isEmptyAndNoAssertion = true + // } + // + // if currentState.contentContains(str, idx) { + // foundMatch = true + // } + // + // if isEmptyAndNoAssertion || foundMatch { + // nextMatch := *(currentState.next) + // copyThread(&nextMatch, currentState) + // if currentState.groupBegin { + // // if !stateExists(currentStates, nextMatch) { + // currentStates = slices.Insert(currentStates, currentStateIdx+1, nextMatch) + // //} + // } else if 
currentState.groupEnd { + // if !stateExists(currentStates, nextMatch) { + // currentStates = slices.Insert(currentStates, currentStateIdx+1, nextMatch) // append(currentStates, nextMatch) + // } + // } else if currentState.assert != noneAssert { + // if !stateExists(currentStates, nextMatch) { + // currentStates = append(currentStates, nextMatch) + // } + // } else if currentState.isEmpty && !currentState.groupBegin && !currentState.groupEnd { + // if !stateExists(currentStates, nextMatch) { + // currentStates = append(currentStates, nextMatch) + // } + // } else { + // if !stateExists(nextStates, nextMatch) { + // nextStates = append(nextStates, nextMatch) + // } + // } + // } + // + // if currentState.isLast && len(nextStates) == 0 { // Last state reached + // currentState.threadGroups[0].EndIdx = idx + // if idx == currentState.threadGroups[0].StartIdx { + // idx += 1 + // } + // return true, currentState.threadGroups, idx + // } } currentStates = append([]nfaState{}, nextStates...) nextStates = nil } + if match != nil { + if offset == match[0].EndIdx { + return true, match, match[0].EndIdx + 1 + } + return true, match, match[0].EndIdx + } return false, []Group{}, i + 1 // zeroStates := make([]*nfaState, 0) // // Keep taking zero-states, until there are no more left to take -- 2.30.2 From 22ead83625401c232c175da1566baf5e6652f1e6 Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Fri, 7 Feb 2025 16:19:36 -0500 Subject: [PATCH 23/48] Fixed assertion matching --- regex/matching.go | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/regex/matching.go b/regex/matching.go index dab6446..d2925bd 100644 --- a/regex/matching.go +++ b/regex/matching.go @@ -251,35 +251,39 @@ func (regex Reg) FindAllSubmatch(str string) []Match { return indices } -func addStateToList(idx int, list []nfaState, state nfaState, threadGroups []Group) []nfaState { +func addStateToList(str []rune, idx int, list []nfaState, state nfaState, 
threadGroups []Group) []nfaState { if stateExists(list, state) { return list } if state.isKleene || state.isQuestion { copyThread(state.splitState, state) - list = addStateToList(idx, list, *state.splitState, threadGroups) + list = addStateToList(str, idx, list, *state.splitState, threadGroups) copyThread(state.next, state) - list = addStateToList(idx, list, *state.next, threadGroups) + list = addStateToList(str, idx, list, *state.next, threadGroups) return list } if state.isAlternation { copyThread(state.next, state) - list = addStateToList(idx, list, *state.next, threadGroups) + list = addStateToList(str, idx, list, *state.next, threadGroups) copyThread(state.splitState, state) - list = addStateToList(idx, list, *state.splitState, threadGroups) + list = addStateToList(str, idx, list, *state.splitState, threadGroups) return list } - state.threadGroups = append([]Group{}, threadGroups...) + if state.assert != noneAssert { + if state.checkAssertion(str, idx) { + copyThread(state.next, state) + return append(list, addStateToList(str, idx, list, *state.next, state.threadGroups)...) + } + } if state.groupBegin { state.threadGroups[state.groupNum].StartIdx = idx - return append(list, addStateToList(idx, list, *state.next, state.threadGroups)...) + return append(list, addStateToList(str, idx, list, *state.next, state.threadGroups)...) } if state.groupEnd { state.threadGroups[state.groupNum].EndIdx = idx - return append(list, addStateToList(idx, list, *state.next, state.threadGroups)...) + return append(list, addStateToList(str, idx, list, *state.next, state.threadGroups)...) } - state.threadGroups = append([]Group{}, threadGroups...) 
return append(list, state) } @@ -340,7 +344,7 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in start.threadGroups = newMatch(numGroups + 1) start.threadGroups[0].StartIdx = i - currentStates = addStateToList(i, currentStates, *start, start.threadGroups) + currentStates = addStateToList(str, i, currentStates, *start, start.threadGroups) var match Match = nil // var isEmptyAndNoAssertion bool // Main loop @@ -362,7 +366,7 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in break } else if !currentState.isAlternation && !currentState.isKleene && !currentState.isQuestion && !currentState.groupBegin && !currentState.groupEnd { // Normal character or assertion if currentState.contentContains(str, idx) { - nextStates = addStateToList(idx+1, nextStates, *currentState.next, currentState.threadGroups) + nextStates = addStateToList(str, idx+1, nextStates, *currentState.next, currentState.threadGroups) } } -- 2.30.2 From 99230b49de346feea1e917cb3e3565310da86676 Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Sat, 8 Feb 2025 16:05:35 -0500 Subject: [PATCH 24/48] Use new function signature for zeroLengthMatchState() --- regex/compile.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/regex/compile.go b/regex/compile.go index 03e9f54..8e010dc 100644 --- a/regex/compile.go +++ b/regex/compile.go @@ -816,7 +816,7 @@ func thompson(re []postfixNode) (Reg, error) { // In these cases, we will return an NFA with 1 state, with an assertion that is always true. 
if len(re) == 0 { start := zeroLengthMatchState() - nfa = append(nfa, &start) + nfa = append(nfa, start) } for _, c := range re { @@ -1068,14 +1068,14 @@ func thompson(re []postfixNode) (Reg, error) { nfa = append(nfa, s2) } tmp := zeroLengthMatchState() - s2 = &tmp + s2 = tmp } if err1 != nil || (s1.groupBegin && s1.numTransitions() == 0) { // Doesn't exist, or its just an LPAREN if err1 == nil { // See above for explanation nfa = append(nfa, s1) } tmp := zeroLengthMatchState() - s1 = &tmp + s1 = tmp } s3 := alternate(s1, s2) nfa = append(nfa, s3) -- 2.30.2 From 62ca1a872aa09772cff5a591dee9d2311a25f334 Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Sat, 8 Feb 2025 16:06:14 -0500 Subject: [PATCH 25/48] Made zeroLengthMatchState() return a pointer; reduced the number of comparisons performed by nfaState.equals --- regex/nfa.go | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/regex/nfa.go b/regex/nfa.go index d7ac1af..8bd1d74 100644 --- a/regex/nfa.go +++ b/regex/nfa.go @@ -402,19 +402,20 @@ func newState() nfaState { } // Creates and returns a state that _always_ has a zero-length match. 
-func zeroLengthMatchState() nfaState { - start := newState() +func zeroLengthMatchState() *nfaState { + start := &nfaState{} start.content = newContents(epsilon) start.isEmpty = true start.assert = alwaysTrueAssert + start.output = append([]*nfaState{}, start) return start } func (s nfaState) equals(other nfaState) bool { - return slices.Equal(s.content, other.content) && - s.isEmpty == other.isEmpty && + return s.isEmpty == other.isEmpty && s.isLast == other.isLast && slices.Equal(s.output, other.output) && + slices.Equal(s.content, other.content) && s.next == other.next && s.isKleene == other.isKleene && s.isQuestion == other.isQuestion && -- 2.30.2 From f15a5cae348e19b31f4e50358616629a075d4d79 Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Sat, 8 Feb 2025 16:07:01 -0500 Subject: [PATCH 26/48] Store all states visited in a single run of 'addStateToList()' in a slice --- regex/matching.go | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/regex/matching.go b/regex/matching.go index d2925bd..6a5e0e7 100644 --- a/regex/matching.go +++ b/regex/matching.go @@ -251,38 +251,40 @@ func (regex Reg) FindAllSubmatch(str string) []Match { return indices } -func addStateToList(str []rune, idx int, list []nfaState, state nfaState, threadGroups []Group) []nfaState { - if stateExists(list, state) { +func addStateToList(str []rune, idx int, list []nfaState, state nfaState, threadGroups []Group, visited []nfaState) []nfaState { + if stateExists(list, state) || stateExists(visited, state) { return list } + visited = append(visited, state) + if state.isKleene || state.isQuestion { copyThread(state.splitState, state) - list = addStateToList(str, idx, list, *state.splitState, threadGroups) + list = addStateToList(str, idx, list, *state.splitState, threadGroups, visited) copyThread(state.next, state) - list = addStateToList(str, idx, list, *state.next, threadGroups) + list = addStateToList(str, idx, list, *state.next, threadGroups, 
visited) return list } if state.isAlternation { copyThread(state.next, state) - list = addStateToList(str, idx, list, *state.next, threadGroups) + list = addStateToList(str, idx, list, *state.next, threadGroups, visited) copyThread(state.splitState, state) - list = addStateToList(str, idx, list, *state.splitState, threadGroups) + list = addStateToList(str, idx, list, *state.splitState, threadGroups, visited) return list } state.threadGroups = append([]Group{}, threadGroups...) if state.assert != noneAssert { if state.checkAssertion(str, idx) { copyThread(state.next, state) - return append(list, addStateToList(str, idx, list, *state.next, state.threadGroups)...) + return addStateToList(str, idx, list, *state.next, state.threadGroups, visited) } } if state.groupBegin { state.threadGroups[state.groupNum].StartIdx = idx - return append(list, addStateToList(str, idx, list, *state.next, state.threadGroups)...) + return addStateToList(str, idx, list, *state.next, state.threadGroups, visited) } if state.groupEnd { state.threadGroups[state.groupNum].EndIdx = idx - return append(list, addStateToList(str, idx, list, *state.next, state.threadGroups)...) 
+ return addStateToList(str, idx, list, *state.next, state.threadGroups, visited) } return append(list, state) @@ -344,7 +346,7 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in start.threadGroups = newMatch(numGroups + 1) start.threadGroups[0].StartIdx = i - currentStates = addStateToList(str, i, currentStates, *start, start.threadGroups) + currentStates = addStateToList(str, i, currentStates, *start, start.threadGroups, nil) var match Match = nil // var isEmptyAndNoAssertion bool // Main loop @@ -366,7 +368,7 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in break } else if !currentState.isAlternation && !currentState.isKleene && !currentState.isQuestion && !currentState.groupBegin && !currentState.groupEnd { // Normal character or assertion if currentState.contentContains(str, idx) { - nextStates = addStateToList(str, idx+1, nextStates, *currentState.next, currentState.threadGroups) + nextStates = addStateToList(str, idx+1, nextStates, *currentState.next, currentState.threadGroups, nil) } } -- 2.30.2 From d4e3942d27a0af01620eda97a7d37925873e9487 Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Sun, 9 Feb 2025 08:58:09 -0500 Subject: [PATCH 27/48] Added Match() and FindStringSubmatch(); removed old code; updated comments --- regex/matching.go | 99 +++++++++++++++++++---------------------------- 1 file changed, 39 insertions(+), 60 deletions(-) diff --git a/regex/matching.go b/regex/matching.go index 6a5e0e7..7864084 100644 --- a/regex/matching.go +++ b/regex/matching.go @@ -14,7 +14,7 @@ import ( // See [Reg.FindSubmatch] for an example. type Match []Group -// a Group represents a group. It contains the start index and end index of the match +// A Group represents a capturing group. It contains the start and end index of the group. 
type Group struct { StartIdx int EndIdx int @@ -58,7 +58,7 @@ func (idx Group) String() string { return fmt.Sprintf("%d\t%d", idx.StartIdx, idx.EndIdx) } -// Returns whether a group is valid (ie. whether it matched any text). It +// IsValid returns whether a group is valid (ie. whether it matched any text). It // simply ensures that both indices of the group are >= 0. func (g Group) IsValid() bool { return g.StartIdx >= 0 && g.EndIdx >= 0 @@ -69,63 +69,6 @@ func getZeroGroup(m Match) Group { return m[0] } -// takeZeroState takes the 0-state (if such a transition exists) for all states in the -// given slice. It returns the resulting states. If any of the resulting states is a 0-state, -// the second ret val is true. -// If a state begins or ends a capturing group, its 'thread' is updated to contain the correct index. -//func takeZeroState(states []*nfaState, numGroups int, idx int) (rtv []*nfaState, isZero bool) { -// for _, state := range states { -// if len(state.transitions[epsilon]) > 0 { -// for _, s := range state.transitions[epsilon] { -// if s.threadGroups == nil { -// s.threadGroups = newMatch(numGroups + 1) -// } -// copy(s.threadGroups, state.threadGroups) -// if s.groupBegin { -// s.threadGroups[s.groupNum].StartIdx = idx -// // openParenGroups = append(openParenGroups, s.groupNum) -// } -// if s.groupEnd { -// s.threadGroups[s.groupNum].EndIdx = idx -// // closeParenGroups = append(closeParenGroups, s.groupNum) -// } -// } -// rtv = append(rtv, state.transitions[epsilon]...) -// } -// } -// for _, state := range rtv { -// if len(state.transitions[epsilon]) > 0 { -// return rtv, true -// } -// } -// return rtv, false -//} - -// zeroMatchPossible returns true if a zero-length match is possible -// from any of the given states, given the string and our position in it. -// It uses the same algorithm to find zero-states as the one inside the loop, -// so I should probably put it in a function. 
-//func zeroMatchPossible(str []rune, idx int, numGroups int, states ...*nfaState) bool { -// zeroStates, isZero := takeZeroState(states, numGroups, idx) -// tempstates := make([]*nfaState, 0, len(zeroStates)+len(states)) -// tempstates = append(tempstates, states...) -// tempstates = append(tempstates, zeroStates...) -// num_appended := 0 // number of unique states addded to tempstates -// for isZero == true { -// zeroStates, isZero = takeZeroState(tempstates, numGroups, idx) -// tempstates, num_appended = uniqueAppend(tempstates, zeroStates...) -// if num_appended == 0 { // break if we haven't appended any more unique values -// break -// } -// } -// for _, state := range tempstates { -// if state.isEmpty && (state.assert == noneAssert || state.checkAssertion(str, idx)) && state.isLast { -// return true -// } -// } -// return false -//} - // Prunes the slice by removing overlapping indices. func pruneIndices(indices []Match) []Match { // First, sort the slice by the start indices @@ -164,6 +107,12 @@ func (regex Reg) Find(str string) (Group, error) { return getZeroGroup(match), nil } +// Match returns a boolean value, indicating whether the regex found a match in the given string. +func (regex Reg) Match(str string) bool { + _, err := regex.Find(str) + return err == nil +} + // FindAll returns a slice containing all the 0-groups of the regex in the given string. // A 0-group represents the match without any submatches. func (regex Reg) FindAll(str string) []Group { @@ -199,7 +148,37 @@ func (regex Reg) FindSubmatch(str string) (Match, error) { } } -// FindAllString is the 'all' version of FindString. +// FindStringSubmatch is the 'string' version of [FindSubmatch]. It returns a slice of strings, +// where the string at index i contains the text matched by the i-th capturing group. +// The 0-th index represents the entire match. +// An empty string at index n could mean: +// , +// 1. Group n did not find a match +// 2. 
Group n found a zero-length match +// +// A return value of nil indicates no match. +func (regex Reg) FindStringSubmatch(str string) []string { + matchStr := make([]string, regex.numGroups+1) + match, err := regex.FindSubmatch(str) + if err != nil { + return nil + } + nonEmptyMatchFound := false + for i := range match { + if match[i].IsValid() { + matchStr[i] = str[match[i].StartIdx:match[i].EndIdx] + nonEmptyMatchFound = true + } else { + matchStr[i] = "" + } + } + if nonEmptyMatchFound == false { + return nil + } + return matchStr +} + +// FindAllString is the 'all' version of [FindString]. // It returns a slice of strings containing the text of all matches of // the regex in the given string. func (regex Reg) FindAllString(str string) []string { -- 2.30.2 From c577064977b7921f0db200b17a144343ebf92162 Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Sun, 9 Feb 2025 08:58:46 -0500 Subject: [PATCH 28/48] Added string field to Reg, that contains the expression string; wrote method to return the string --- regex/compile.go | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/regex/compile.go b/regex/compile.go index 8e010dc..8dbcf37 100644 --- a/regex/compile.go +++ b/regex/compile.go @@ -12,18 +12,24 @@ var notDotChars []rune // A Reg represents the result of compiling a regular expression. It contains // the startState of the NFA representation of the regex, and the number of capturing -// groups in the regex. +// groups in the regex. It also contains the expression string. type Reg struct { start *nfaState numGroups int + str string } -// numSubexp eturns the number of sub-expressions in the given [Reg]. This is equivalent +// NumSubexp returns the number of sub-expressions in the given [Reg]. This is equivalent // to the number of capturing groups. func (r Reg) NumSubexp() int { return r.numGroups } +// String returns the string used to compile the expression. 
+func (r Reg) String() string { + return r.str +} + const concatRune rune = 0xF0001 // Flags for shuntingYard - control its behavior @@ -1128,7 +1134,8 @@ func thompson(re []postfixNode) (Reg, error) { concatenate(nfa[0], &lastState) - return Reg{nfa[0], numGroups}, nil + // The string is empty here, because we add it in Compile() + return Reg{nfa[0], numGroups, ""}, nil } @@ -1146,6 +1153,7 @@ func Compile(re string, flags ...ReFlag) (Reg, error) { if err != nil { return Reg{}, fmt.Errorf("error compiling regex: %w", err) } + reg.str = re return reg, nil } -- 2.30.2 From eddd2ae7001b0995a1e895b8ced05f45abee0e73 Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Sun, 9 Feb 2025 08:58:58 -0500 Subject: [PATCH 29/48] Updated documentation --- regex/doc.go | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/regex/doc.go b/regex/doc.go index 1b821c1..9ca1f04 100644 --- a/regex/doc.go +++ b/regex/doc.go @@ -4,6 +4,8 @@ Package regex implements regular expression search, using a custom non-bracktrac The engine relies completely on UTF-8 codepoints. As such, it is capable of matching characters from other languages, emojis and symbols. +The API and regex syntax are largely compatible with that of the stdlib's [regexp], with a few key differences (see 'Key Differences with regexp'). + The full syntax is specified below. 
# Syntax @@ -55,8 +57,8 @@ POSIX classes (inside normal character classes): Composition: def Match d, followed by e, followed by f - x|y Match x or y (prefer longer one) - xy|z Match xy or z + x|y Match x or y (prefer x) + xy|z Match xy or z (prefer xy) Repitition (always greedy, preferring more): @@ -94,10 +96,11 @@ Lookarounds: Numeric ranges: Match any number from x to y (inclusive) (x and y must be positive numbers) + \ Date: Sun, 9 Feb 2025 08:59:16 -0500 Subject: [PATCH 30/48] Use new definition of Reg --- regex/nfa.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex/nfa.go b/regex/nfa.go index 8bd1d74..db53c00 100644 --- a/regex/nfa.go +++ b/regex/nfa.go @@ -183,7 +183,7 @@ func (s nfaState) checkAssertion(str []rune, idx int) bool { strToMatch = string(runesToMatch) } - regComp := Reg{startState, s.lookaroundNumCaptureGroups} + regComp := Reg{startState, s.lookaroundNumCaptureGroups, s.lookaroundRegex} matchIndices := regComp.FindAll(strToMatch) numMatchesFound := 0 -- 2.30.2 From 6334435b83d4963880f2a69fa309022264b1f316 Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Sun, 9 Feb 2025 09:01:42 -0500 Subject: [PATCH 31/48] Updated tests since the engine uses Perl matching instead of POSIX matching; added tests for FindStringSubmatch --- regex/re_test.go | 65 +++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 59 insertions(+), 6 deletions(-) diff --git a/regex/re_test.go b/regex/re_test.go index 2cccc72..b8b1381 100644 --- a/regex/re_test.go +++ b/regex/re_test.go @@ -528,7 +528,7 @@ var groupTests = []struct { }{ {"(a)(b)", nil, "ab", []Match{[]Group{{0, 2}, {0, 1}, {1, 2}}}}, {"((a))(b)", nil, "ab", []Match{[]Group{{0, 2}, {0, 1}, {0, 1}, {1, 2}}}}, - {"(0)", nil, "ab", []Match{[]Group{}}}, + {"(0)", nil, "ab", []Match{}}, {"(a)b", nil, "ab", []Match{[]Group{{0, 2}, {0, 1}}}}, {"a(b)", nil, "ab", []Match{[]Group{{0, 2}, {1, 2}}}}, {"(a|b)", nil, "ab", []Match{[]Group{{0, 1}, {0, 1}}, []Group{{1, 2}, {1, 2}}}}, @@ 
-538,9 +538,8 @@ var groupTests = []struct { {"(a+)(aa)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 2}, {2, 4}}}}, {"(aaaa)|(aaaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 4}, {-1, -1}}}}, {"(aaa)|(aaaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {-1, -1}, {0, 4}}}}, - {"(aaa)|(aaaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {-1, -1}, {0, 4}}}}, {"(aaaa)|(aaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 4}, {-1, -1}}}}, - {"(a)|(aa)", nil, "aa", []Match{[]Group{{0, 2}, {-1, -1}, {0, 2}}}}, + {"(a)|(aa)", nil, "aa", []Match{[]Group{{0, 1}, {0, 1}}, []Group{{1, 2}, {1, 2}}}}, {"(a?)a?", nil, "b", []Match{[]Group{{0, 0}, {0, 0}}, []Group{{1, 1}, {1, 1}}}}, {"(a?)a?", nil, "ab", []Match{[]Group{{0, 1}, {0, 1}}, []Group{{1, 1}, {1, 1}}, []Group{{2, 2}, {2, 2}}}}, {"(a?)a?", nil, "aa", []Match{[]Group{{0, 2}, {0, 1}}, []Group{{2, 2}, {2, 2}}}}, @@ -578,7 +577,7 @@ var groupTests = []struct { {`(bc+d$|ef*g.|h?i(j|k))`, nil, `bcdd`, []Match{}}, {`(bc+d$|ef*g.|h?i(j|k))`, nil, `reffgz`, []Match{[]Group{{1, 6}, {1, 6}}}}, {`(((((((((a)))))))))`, nil, `a`, []Match{[]Group{{0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}}, - {`(((((((((a)))))))))\41`, nil, `a`, []Match{[]Group{{0, 2}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}}, + {`(((((((((a)))))))))\41`, nil, `a!`, []Match{[]Group{{0, 2}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}}, {`(.*)c(.*)`, nil, `abcde`, []Match{[]Group{{0, 5}, {0, 2}, {3, 5}}}}, {`\((.*), (.*)\)`, nil, `(a, b)`, []Match{[]Group{{0, 6}, {1, 2}, {4, 5}}}}, @@ -633,7 +632,7 @@ var groupTests = []struct { {`(bc+d$|ef*g.|h?i(j|k))`, []ReFlag{RE_CASE_INSENSITIVE}, `BCDD`, []Match{}}, {`(bc+d$|ef*g.|h?i(j|k))`, []ReFlag{RE_CASE_INSENSITIVE}, `reffgz`, []Match{[]Group{{1, 6}, {1, 6}}}}, {`(((((((((a)))))))))`, []ReFlag{RE_CASE_INSENSITIVE}, `A`, []Match{[]Group{{0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}}, - {`(((((((((a)))))))))\41`, 
[]ReFlag{RE_CASE_INSENSITIVE}, `A`, []Match{[]Group{{0, 2}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}}, + {`(((((((((a)))))))))\41`, []ReFlag{RE_CASE_INSENSITIVE}, `A!`, []Match{[]Group{{0, 2}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}}, {`(.*)c(.*)`, []ReFlag{RE_CASE_INSENSITIVE}, `ABCDE`, []Match{[]Group{{0, 5}, {0, 2}, {3, 5}}}}, {`\((.*), (.*)\)`, []ReFlag{RE_CASE_INSENSITIVE}, `(A, B)`, []Match{[]Group{{0, 6}, {1, 2}, {4, 5}}}}, {`(a)(b)c|ab`, []ReFlag{RE_CASE_INSENSITIVE}, `AB`, []Match{[]Group{{0, 2}}}}, @@ -743,7 +742,7 @@ func TestFindString(t *testing.T) { foundString := regComp.FindString(test.str) if len(test.result) == 0 { if foundString != "" { - t.Errorf("Expected no match got %v\n", foundString) + t.Errorf("Wanted no match got %v\n", foundString) } } else { expectedString := test.str[test.result[0].StartIdx:test.result[0].EndIdx] @@ -796,6 +795,56 @@ func TestFindSubmatch(t *testing.T) { if test.result[0][i] != match[i] { t.Errorf("Wanted %v Got %v\n", test.result[0], match) } + } else { + if i < len(test.result) && test.result[0][i].IsValid() { + t.Errorf("Wanted %v Got %v\n", test.result[0], match) + } + } + } + }) + } +} +func TestFindStringSubmatch(t *testing.T) { + for _, test := range groupTests { + t.Run(test.re+" "+test.str, func(t *testing.T) { + regComp, err := Compile(test.re, test.flags...) 
+ if err != nil { + if test.result != nil { + panic(err) + } + } + matchStr := regComp.FindStringSubmatch(test.str) + if matchStr == nil { + if len(test.result) != 0 { + expectedStr := funcMap(test.result[0], func(g Group) string { + if g.IsValid() { + return test.str[g.StartIdx:g.EndIdx] + } else { + return "" + } + }) + t.Errorf("Wanted %v got no match\n", expectedStr) + } + } else if len(test.result) == 0 { + t.Errorf("Wanted no match got %v\n", matchStr) + } else { + expectedStr := funcMap(test.result[0], func(g Group) string { + if g.IsValid() { + return test.str[g.StartIdx:g.EndIdx] + } else { + return "" + } + }) + for i, groupStr := range matchStr { + if groupStr == "" { + if i < len(expectedStr) && expectedStr[i] != "" { + t.Errorf("Wanted %v Got %v\n", expectedStr, matchStr) + } + } else { + if expectedStr[i] != groupStr { + t.Errorf("Wanted %v Got %v\n", expectedStr, matchStr) + } + } } } }) @@ -817,6 +866,10 @@ func TestFindAllSubmatch(t *testing.T) { if test.result[i][j] != matchIndices[i][j] { t.Errorf("Wanted %v Got %v\n", test.result, matchIndices) } + } else { + if i < len(test.result) && j < len(test.result[i]) && test.result[i][j].IsValid() { + t.Errorf("Wanted %v Got %v\n", test.result, matchIndices) + } } } } -- 2.30.2 From c6ad4caa0d32f5ca828f70b3e2c1ded8dbedbcea Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Sun, 9 Feb 2025 09:06:40 -0500 Subject: [PATCH 32/48] Removed a bunch of unused code (let's go!!!) --- regex/matching.go | 322 ---------------------------------------------- 1 file changed, 322 deletions(-) diff --git a/regex/matching.go b/regex/matching.go index 7864084..d9500ce 100644 --- a/regex/matching.go +++ b/regex/matching.go @@ -2,7 +2,6 @@ package regex import ( "fmt" - "sort" ) // A Match represents a match found by the regex in a given string. @@ -69,30 +68,6 @@ func getZeroGroup(m Match) Group { return m[0] } -// Prunes the slice by removing overlapping indices. 
-func pruneIndices(indices []Match) []Match { - // First, sort the slice by the start indices - sort.Slice(indices, func(i, j int) bool { - return indices[i][0].StartIdx < indices[j][0].StartIdx - }) - toRet := make([]Match, 0, len(indices)) - current := indices[0] - for _, idx := range indices[1:] { - // idx doesn't overlap with current (starts after current ends), so add current to result - // and update the current. - if idx[0].StartIdx >= current[0].EndIdx { - toRet = append(toRet, current) - current = idx - } else if idx[0].EndIdx > current[0].EndIdx { - // idx overlaps, but it is longer, so update current - current = idx - } - } - // Add last state - toRet = append(toRet, current) - return toRet -} - func copyThread(to *nfaState, from nfaState) { to.threadGroups = append([]Group{}, from.threadGroups...) } @@ -223,9 +198,6 @@ func (regex Reg) FindAllSubmatch(str string) []Match { indices = append(indices, matchIdx) } } - if len(indices) > 0 { - return pruneIndices(indices) - } return indices } @@ -272,8 +244,6 @@ func addStateToList(str []rune, idx int, list []nfaState, state nfaState, thread // Helper for FindAllMatches. Returns whether it found a match, the // first Match it finds, and how far it got into the string ie. where // the next search should start from. -// -// Might return duplicates or overlapping indices, so care must be taken to prune the resulting array. func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups int) (bool, Match, int) { // Base case - exit if offset exceeds string's length if offset > len(str) { @@ -282,21 +252,9 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in } resetThreads(start) - // Hold a list of match indices for the current run. When we - // can no longer find a match, the match with the largest range is - // chosen as the match for the entire string. - // This allows us to pick the longest possible match (which is how greedy matching works). 
- // COMMENT ABOVE IS CURRENTLY NOT UP-TO-DATE - // tempIndices := newMatch(numGroups + 1) - - // foundPath := false - //startIdx := offset - //endIdx := offset currentStates := make([]nfaState, 0) nextStates := make([]nfaState, 0) - // tempStates := make([]*nfaState, 0) // Used to store states that should be used in next loop iteration i := offset // Index in string - //startingFrom := i // Store starting index // If the first state is an assertion, makes sure the assertion // is true before we do _anything_ else. @@ -306,29 +264,11 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in return false, []Group{}, i } } - // Increment until we hit a character matching the start state (assuming not 0-state) - // if start.isEmpty == false { - // for i < len(str) && !start.contentContains(str, i) { - // i++ - // } - // startIdx = i - // startingFrom = i - // i++ // Advance to next character (if we aren't at a 0-state, which doesn't match anything), so that we can check for transitions. 
If we advance at a 0-state, we will never get a chance to match the first character - // } - - // start.threadGroups = newMatch(numGroups + 1) - // Check if the start state begins a group - if so, add the start index to our list - //if start.groupBegin { - // start.threadGroups[start.groupNum].StartIdx = i - // tempIndices[start.groupNum].startIdx = i - //} start.threadGroups = newMatch(numGroups + 1) start.threadGroups[0].StartIdx = i currentStates = addStateToList(str, i, currentStates, *start, start.threadGroups, nil) var match Match = nil - // var isEmptyAndNoAssertion bool - // Main loop for idx := i; idx <= len(str); idx++ { if len(currentStates) == 0 { break @@ -350,76 +290,6 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in nextStates = addStateToList(str, idx+1, nextStates, *currentState.next, currentState.threadGroups, nil) } } - - // if currentState.groupBegin { - // currentState.threadGroups[currentState.groupNum].StartIdx = idx - // } - // if currentState.groupEnd { - // currentState.threadGroups[currentState.groupNum].EndIdx = idx - // } - - // Alternation - enqueue left then right state, and continue - // if currentState.isAlternation { - // if currentState.isKleene { // Reverse order of adding things - // rightState := currentState.splitState - // copyThread(rightState, currentState) - // currentStates = slices.Insert(currentStates, currentStateIdx+1, *rightState) - // leftState := currentState.next - // copyThread(leftState, currentState) - // currentStates = slices.Insert(currentStates, currentStateIdx+2, *leftState) - // } else { - // leftState := currentState.next - // copyThread(leftState, currentState) - // currentStates = slices.Insert(currentStates, currentStateIdx+1, *leftState) - // rightState := currentState.splitState - // copyThread(rightState, currentState) - // currentStates = slices.Insert(currentStates, currentStateIdx+2, *rightState) - // } - // continue - // } - - // Empty state - enqueue next 
state, do _not_ increment the SP - // if !currentState.isAlternation && currentState.isEmpty && currentState.assert == noneAssert { //&& currentState.groupBegin == false && currentState.groupEnd == false { - // isEmptyAndNoAssertion = true - // } - // - // if currentState.contentContains(str, idx) { - // foundMatch = true - // } - // - // if isEmptyAndNoAssertion || foundMatch { - // nextMatch := *(currentState.next) - // copyThread(&nextMatch, currentState) - // if currentState.groupBegin { - // // if !stateExists(currentStates, nextMatch) { - // currentStates = slices.Insert(currentStates, currentStateIdx+1, nextMatch) - // //} - // } else if currentState.groupEnd { - // if !stateExists(currentStates, nextMatch) { - // currentStates = slices.Insert(currentStates, currentStateIdx+1, nextMatch) // append(currentStates, nextMatch) - // } - // } else if currentState.assert != noneAssert { - // if !stateExists(currentStates, nextMatch) { - // currentStates = append(currentStates, nextMatch) - // } - // } else if currentState.isEmpty && !currentState.groupBegin && !currentState.groupEnd { - // if !stateExists(currentStates, nextMatch) { - // currentStates = append(currentStates, nextMatch) - // } - // } else { - // if !stateExists(nextStates, nextMatch) { - // nextStates = append(nextStates, nextMatch) - // } - // } - // } - // - // if currentState.isLast && len(nextStates) == 0 { // Last state reached - // currentState.threadGroups[0].EndIdx = idx - // if idx == currentState.threadGroups[0].StartIdx { - // idx += 1 - // } - // return true, currentState.threadGroups, idx - // } } currentStates = append([]nfaState{}, nextStates...) 
nextStates = nil @@ -431,196 +301,4 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in return true, match, match[0].EndIdx } return false, []Group{}, i + 1 - // zeroStates := make([]*nfaState, 0) - // // Keep taking zero-states, until there are no more left to take - // // Objective: If any of our current states have transitions to 0-states, replace them with the 0-state. Do this until there are no more transitions to 0-states, or there are no more unique 0-states to take. - // topStateItem := currentStates.peek() - // topState := topStateItem.(*priorQueueItem).state - // zeroStates, isZero := takeZeroState([]*nfaState{topState}, numGroups, i) - // tempStates = append(tempStates, zeroStates...) - // num_appended := 0 - // for isZero == true { - // zeroStates, isZero = takeZeroState(tempStates, numGroups, i) - // tempStates, num_appended = uniqueAppend(tempStates, zeroStates...) - // if num_appended == 0 { // Break if we haven't appended any more unique values - // break - // } - // } - // if isZero == true { - // currentStates.Pop() - // } - // - // for _, state := range tempStates { - // heap.Push(currentStates, newPriorQueueItem(state)) - // } - // tempStates = nil - // - // // Take any transitions corresponding to current character - // numStatesMatched := 0 // The number of states which had at least 1 match for this round - // assertionFailed := false // Whether or not an assertion failed for this round - // lastStateInList := false // Whether or not a last state was in our list of states - // var lastStatePtr *nfaState = nil // Pointer to the last-state, if it was found - // lastLookaroundInList := false // Whether or not a last state (that is a lookaround) was in our list of states - // for numStatesMatched == 0 && lastStateInList == false { - // if currentStates.Len() == 0 { - // break - // } - // stateItem := heap.Pop(currentStates) - // state := stateItem.(*priorQueueItem).state - // matches, numMatches := 
state.matchesFor(str, i) - // if numMatches > 0 { - // numStatesMatched++ - // tempStates = append([]*nfaState(nil), matches...) - // foundPath = true - // for _, m := range matches { - // if m.threadGroups == nil { - // m.threadGroups = newMatch(numGroups + 1) - // } - // m.threadSP = state.threadSP + 1 - // copy(m.threadGroups, state.threadGroups) - // } - // } - // if numMatches < 0 { - // assertionFailed = true - // } - // if state.isLast { - // if state.isLookaround() { - // lastLookaroundInList = true - // } - // lastStateInList = true - // lastStatePtr = state - // } - // } - // - // if assertionFailed && numStatesMatched == 0 { // Nothing has matched and an assertion has failed - // // If I'm being completely honest, I'm not sure why I have to check specifically for a _lookaround_ - // // state. The explanation below is my attempt to explain this behavior. - // // If you replace 'lastLookaroundInList' with 'lastStateInList', one of the test cases fails. - // // - // // One of the states in our list was a last state and a lookaround. In this case, we - // // don't abort upon failure of the assertion, because we have found - // // another path to a final state. - // // Even if the last state _was_ an assertion, we can use the previously - // // saved indices to find a match. - // if lastLookaroundInList { - // break - // } else { - // if i == startingFrom { - // i++ - // } - // return false, []Group{}, i - // } - // } - // // Check if we can find a state in our list that is: - // // a. A last-state - // // b. Empty - // // c. Doesn't assert anything - // for _, stateItem := range *currentStates { - // s := stateItem.state - // if s.isLast && s.isEmpty && s.assert == noneAssert { - // lastStatePtr = s - // lastStateInList = true - // } - // } - // if lastStateInList && numStatesMatched == 0 { // A last-state was in the list of states. 
add the matchIndex to our MatchIndex list - // for j := 1; j < numGroups+1; j++ { - // tempIndices[j] = lastStatePtr.threadGroups[j] - // } - // endIdx = i - // tempIndices[0] = Group{startIdx, endIdx} - // if tempIndices[0].StartIdx == tempIndices[0].EndIdx { - // return true, tempIndices, tempIndices[0].EndIdx + 1 - // } else { - // return true, tempIndices, tempIndices[0].EndIdx - // } - // } - // - // // Check if we can find a zero-length match - // if foundPath == false { - // currentStatesList := funcMap(*currentStates, func(item *priorQueueItem) *nfaState { - // return item.state - // }) - // if ok := zeroMatchPossible(str, i, numGroups, currentStatesList...); ok { - // if tempIndices[0].IsValid() == false { - // tempIndices[0] = Group{startIdx, startIdx} - // } - // } - // // If we haven't moved in the string, increment the counter by 1 - // // to ensure we don't keep trying the same string over and over. - // // if i == startingFrom { - // startIdx++ - // // i++ - // // } - // if tempIndices.numValidGroups() > 0 && tempIndices[0].IsValid() { - // if tempIndices[0].StartIdx == tempIndices[0].EndIdx { // If we have a zero-length match, we have to shift the index at which we start. Otherwise we keep looking at the same paert of the string over and over. - // return true, tempIndices, tempIndices[0].EndIdx + 1 - // } else { - // return true, tempIndices, tempIndices[0].EndIdx - // } - // } - // return false, []Group{}, startIdx - // } - // currentStates = &priorityQueue{} - // slices.Reverse(tempStates) - // for _, state := range tempStates { - // heap.Push(currentStates, newPriorQueueItem(state)) - // } - // tempStates = nil - // - // i++ - // } - // - // // End-of-string reached. Go to any 0-states, until there are no more 0-states to go to. Then check if any of our states are in the end position. - // // This is the exact same algorithm used inside the loop, so I should probably put it in a function. 
- // - // if currentStates.Len() > 0 { - // topStateItem := currentStates.peek() - // topState := topStateItem.(*priorQueueItem).state - // zeroStates, isZero := takeZeroState([]*nfaState{topState}, numGroups, i) - // tempStates = append(tempStates, zeroStates...) - // num_appended := 0 // Number of unique states addded to tempStates - // for isZero == true { - // zeroStates, isZero = takeZeroState(tempStates, numGroups, i) - // tempStates, num_appended = uniqueAppend(tempStates, zeroStates...) - // if num_appended == 0 { // Break if we haven't appended any more unique values - // break - // } - // } - // } - // - // for _, state := range tempStates { - // heap.Push(currentStates, newPriorQueueItem(state)) - // } - // - // tempStates = nil - // - // for _, stateItem := range *currentStates { - // state := stateItem.state - // // Only add the match if the start index is in bounds. If the state has an assertion, - // // make sure the assertion checks out. - // if state.isLast && i <= len(str) { - // if state.assert == noneAssert || state.checkAssertion(str, i) { - // for j := 1; j < numGroups+1; j++ { - // tempIndices[j] = state.threadGroups[j] - // } - // endIdx = i - // tempIndices[0] = Group{startIdx, endIdx} - // } - // } - // } - // - // if tempIndices.numValidGroups() > 0 { - // if tempIndices[0].StartIdx == tempIndices[0].EndIdx { // If we have a zero-length match, we have to shift the index at which we start. Otherwise we keep looking at the same paert of the string over and over. - // return true, tempIndices, tempIndices[0].EndIdx + 1 - // } else { - // return true, tempIndices, tempIndices[0].EndIdx - // } - // } - // - // if startIdx == startingFrom { // Increment starting index if we haven't moved in the string. Prevents us from matching the same part of the string over and over. 
- // - // startIdx++ - // } - // - // return false, []Group{}, startIdx } -- 2.30.2 From b7467a00f1b6882121f0191b80aa56d7b6530c28 Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Sun, 9 Feb 2025 09:07:43 -0500 Subject: [PATCH 33/48] Removed priorityQueue (unused) --- regex/priorityQueue.go | 89 ------------------------------------------ 1 file changed, 89 deletions(-) delete mode 100644 regex/priorityQueue.go diff --git a/regex/priorityQueue.go b/regex/priorityQueue.go deleted file mode 100644 index ae43e86..0000000 --- a/regex/priorityQueue.go +++ /dev/null @@ -1,89 +0,0 @@ -package regex - -import "container/heap" - -// Implement a priority queue using container/heap - -const ( - min_priority int = iota - zerostate_priority - alternation_priority - kleene_priority - char_priority - max_priority -) - -func getPriority(state *nfaState) int { - if state.isKleene { - return zerostate_priority - } else if state.isAlternation { - return alternation_priority - } else { - if state.isEmpty { - return zerostate_priority - } else { - return char_priority - } - } -} - -type priorQueueItem struct { - state *nfaState - priority int - index int -} - -func newPriorQueueItem(state *nfaState) *priorQueueItem { - return &priorQueueItem{ - state: state, - index: -1, - priority: getPriority(state), - } -} - -type priorityQueue []*priorQueueItem - -func (pq priorityQueue) Len() int { - return len(pq) -} - -func (pq priorityQueue) Less(i, j int) bool { - if pq[i].priority == pq[j].priority { - return pq[i].index < pq[j].index - } - return pq[i].priority > pq[j].priority // We want max-heap, so we use greater-than -} - -func (pq priorityQueue) Swap(i, j int) { - pq[i], pq[j] = pq[j], pq[i] - pq[i].index = i - pq[j].index = j -} - -func (pq *priorityQueue) Push(x any) { - length := len(*pq) - item := x.(*priorQueueItem) - item.index = length - *pq = append(*pq, item) -} - -func (pq *priorityQueue) Pop() any { - old := *pq - n := len(old) - item := old[n-1] - old[n-1] = nil - 
item.index = -1 - *pq = old[0 : n-1] - return item -} -func (pq *priorityQueue) peek() any { - queue := *pq - n := len(queue) - return queue[n-1] -} - -func (pq *priorityQueue) update(item *priorQueueItem, value *nfaState, priority int) { - item.state = value - item.priority = priority - heap.Fix(pq, item.index) -} -- 2.30.2 From e546f01c208f6db4fa88de812f101e809c5b5b41 Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Sun, 9 Feb 2025 09:12:55 -0500 Subject: [PATCH 34/48] Removed redundant return (staticcheck) --- cmd/unique_array.go | 1 - 1 file changed, 1 deletion(-) diff --git a/cmd/unique_array.go b/cmd/unique_array.go index e03621a..88c56cb 100644 --- a/cmd/unique_array.go +++ b/cmd/unique_array.go @@ -16,7 +16,6 @@ func (s *uniq_arr[T]) add(vals ...T) { s.backingMap[item] = struct{}{} } } - return } func (s uniq_arr[T]) contains(val T) bool { -- 2.30.2 From 7231169270f9e5ba688b913fd179e08250a7f8a9 Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Sun, 9 Feb 2025 09:13:03 -0500 Subject: [PATCH 35/48] Removed unused functions --- regex/misc.go | 43 ------------------------------------------- 1 file changed, 43 deletions(-) diff --git a/regex/misc.go b/regex/misc.go index 2d21e61..38b5313 100644 --- a/regex/misc.go +++ b/regex/misc.go @@ -48,49 +48,6 @@ func isNormalChar(c rune) bool { return !slices.Contains(specialChars, c) } -// Ensure that the given elements are only appended to the given slice if they -// don't already exist. Returns the new slice, and the number of unique items appended. 
-func uniqueAppend[T comparable](slc []T, items ...T) ([]T, int) { - num_appended := 0 - for _, item := range items { - if !slices.Contains(slc, item) { - slc = append(slc, item) - num_appended++ - } - } - return slc, num_appended -} - -func uniqueAppendFunc[T any](slc []T, fn func(T, T) bool, items ...T) ([]T, int) { - toRet := make([]T, len(slc)) - num_appended := 0 - copy(toRet, slc) - for _, item := range items { - itemExists := false - for _, val := range slc { - if fn(item, val) { - itemExists = true - } - } - if !itemExists { - toRet = append(toRet, item) - num_appended++ - } - } - return toRet, num_appended -} - -// Returns true only if all the given elements are equal -func allEqual[T comparable](items ...T) bool { - first := items[0] - for _, item := range items { - if item != first { - return false - } - } - return true -} - // Map function - convert a slice of T to a slice of V, based on a function // that maps a T to a V func funcMap[T, V any](slc []T, fn func(T) V) []V { -- 2.30.2 From d172a58258cc62523a78264e754ff3ddc6e3c42f Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Sun, 9 Feb 2025 09:13:29 -0500 Subject: [PATCH 36/48] Throw error if match isn't found but test.result has >0 elements --- regex/re_test.go | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/regex/re_test.go b/regex/re_test.go index b8b1381..de6aaba 100644 --- a/regex/re_test.go +++ b/regex/re_test.go @@ -790,6 +790,13 @@ func TestFindSubmatch(t *testing.T) { } } match, err := regComp.FindSubmatch(test.str) + if err != nil { + if len(test.result) != 0 { + t.Errorf("Wanted %v got no match\n", test.result[0]) + } + } else if len(test.result) == 0 { + t.Errorf("Wanted no match got %v\n", match) + } for i := range match { if match[i].IsValid() { if test.result[0][i] != match[i] { -- 2.30.2 From 76e0170cb9fd5a19583d5dc7edc6c7c42d806e1c Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Sun, 9 Feb 2025 09:13:52 -0500 Subject: [PATCH 37/48] Removed unused function --- 
regex/matching.go | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/regex/matching.go b/regex/matching.go index d9500ce..1b07ee8 100644 --- a/regex/matching.go +++ b/regex/matching.go @@ -28,17 +28,6 @@ func newMatch(size int) Match { return toRet } -// Returns the number of valid groups in the match -func (m Match) numValidGroups() int { - numValid := 0 - for _, g := range m { - if g.StartIdx >= 0 && g.EndIdx >= 0 { - numValid++ - } - } - return numValid -} - // Returns a string containing the indices of all (valid) groups in the match func (m Match) String() string { var toRet string -- 2.30.2 From 835d495990f03f702f90d19e556216f8964e1db2 Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Sun, 9 Feb 2025 09:14:45 -0500 Subject: [PATCH 38/48] Removed capitalization for error message (staticcheck) --- regex/range2regex.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex/range2regex.go b/regex/range2regex.go index a01dfff..de8e0f4 100644 --- a/regex/range2regex.go +++ b/regex/range2regex.go @@ -109,7 +109,7 @@ func range2regex(start int, end int) (string, error) { startSlc := intToSlc(rg.start) endSlc := intToSlc(rg.end) if len(startSlc) != len(endSlc) { - return "", fmt.Errorf("Error parsing numeric range") + return "", fmt.Errorf("error parsing numeric range") } for i := range startSlc { if startSlc[i] == endSlc[i] { -- 2.30.2 From 47f88c817f562729093b2cd62494b457e3c55061 Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Sun, 9 Feb 2025 15:14:17 -0500 Subject: [PATCH 39/48] Fixed typo --- regex/compile.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex/compile.go b/regex/compile.go index 8dbcf37..9a703b9 100644 --- a/regex/compile.go +++ b/regex/compile.go @@ -1157,7 +1157,7 @@ func Compile(re string, flags ...ReFlag) (Reg, error) { return reg, nil } -// MustCompile panicks if Compile returns an error. They are identical in all other respects. 
+// MustCompile panics if Compile returns an error. They are identical in all other respects. func MustCompile(re string, flags ...ReFlag) Reg { reg, err := Compile(re, flags...) if err != nil { -- 2.30.2 From 9e12f9dcb32199b23efbd0c7e954daa3c962e1ea Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Sun, 9 Feb 2025 15:38:26 -0500 Subject: [PATCH 40/48] Added field to Reg, denoting if we prefer longest match (POSIX style) or not (perl style) --- regex/compile.go | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/regex/compile.go b/regex/compile.go index 9a703b9..da733de 100644 --- a/regex/compile.go +++ b/regex/compile.go @@ -14,9 +14,10 @@ var notDotChars []rune // the startState of the NFA representation of the regex, and the number of capturing // groups in the regex. It also contains the expression string. type Reg struct { - start *nfaState - numGroups int - str string + start *nfaState + numGroups int + str string + preferLongest bool } // NumSubexp returns the number of sub-expressions in the given [Reg]. 
This is equivalent @@ -30,6 +31,10 @@ func (r Reg) String() string { return r.str } +func (r Reg) Longest() { + r.preferLongest = true +} + const concatRune rune = 0xF0001 // Flags for shuntingYard - control its behavior @@ -1135,7 +1140,7 @@ func thompson(re []postfixNode) (Reg, error) { concatenate(nfa[0], &lastState) // The string is empty here, because we add it in Compile() - return Reg{nfa[0], numGroups, ""}, nil + return Reg{nfa[0], numGroups, "", false}, nil } -- 2.30.2 From 1f5a36353934c0b78d2af207fceec720dcb8a8f9 Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Sun, 9 Feb 2025 15:39:09 -0500 Subject: [PATCH 41/48] Use new function signatures (with preferLongest) --- regex/nfa.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/regex/nfa.go b/regex/nfa.go index db53c00..c649712 100644 --- a/regex/nfa.go +++ b/regex/nfa.go @@ -133,7 +133,7 @@ func resetThreadsHelper(state *nfaState, visitedMap map[*nfaState]bool) { // Checks if the given state's assertion is true. Returns true if the given // state doesn't have an assertion. 
-func (s nfaState) checkAssertion(str []rune, idx int) bool { +func (s nfaState) checkAssertion(str []rune, idx int, preferLongest bool) bool { if s.assert == alwaysTrueAssert { return true } @@ -183,7 +183,7 @@ func (s nfaState) checkAssertion(str []rune, idx int) bool { strToMatch = string(runesToMatch) } - regComp := Reg{startState, s.lookaroundNumCaptureGroups, s.lookaroundRegex} + regComp := Reg{startState, s.lookaroundNumCaptureGroups, s.lookaroundRegex, preferLongest} matchIndices := regComp.FindAll(strToMatch) numMatchesFound := 0 @@ -210,9 +210,9 @@ func (s nfaState) checkAssertion(str []rune, idx int) bool { } // Returns true if the contents of 's' contain the value at the given index of the given string -func (s nfaState) contentContains(str []rune, idx int) bool { +func (s nfaState) contentContains(str []rune, idx int, preferLongest bool) bool { if s.assert != noneAssert { - return s.checkAssertion(str, idx) + return s.checkAssertion(str, idx, preferLongest) } if idx >= len(str) { return false -- 2.30.2 From fb47e082eb9950165d2d8c6ca91a1cecb111b0a4 Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Sun, 9 Feb 2025 15:39:35 -0500 Subject: [PATCH 42/48] Wrote new methods Expand() and preferLongest(); Use new function signatures (with preferLongest); only characters should be added to next state list --- regex/matching.go | 113 +++++++++++++++++++++++++++++++++++++++------- 1 file changed, 96 insertions(+), 17 deletions(-) diff --git a/regex/matching.go b/regex/matching.go index 1b07ee8..4d7c600 100644 --- a/regex/matching.go +++ b/regex/matching.go @@ -2,6 +2,8 @@ package regex import ( "fmt" + "strconv" + "unicode" ) // A Match represents a match found by the regex in a given string. @@ -77,6 +79,18 @@ func (regex Reg) Match(str string) bool { return err == nil } +// CompileMatch compiles expr and returns true if str contains a match of the expression. +// It is equivalent to [regexp.Match]. 
+// An optional list of flags may be provided (see [ReFlag]). +// It returns an error (!= nil) if there was an error compiling the expression. +func CompileMatch(expr string, str string, flags ...ReFlag) (bool, error) { + re, err := Compile(expr, flags...) + if err != nil { + return false, err + } + return re.Match(str), nil +} + // FindAll returns a slice containing all the 0-groups of the regex in the given string. // A 0-group represents the match without any submatches. func (regex Reg) FindAll(str string) []Group { @@ -162,7 +176,7 @@ func (regex Reg) FindNthMatch(str string, n int) (Match, error) { var matchFound bool var matchIdx Match for idx <= len(str_runes) { - matchFound, matchIdx, idx = findAllSubmatchHelper(regex.start, str_runes, idx, regex.numGroups) + matchFound, matchIdx, idx = findAllSubmatchHelper(regex.start, str_runes, idx, regex.numGroups, regex.preferLongest) if matchFound { matchNum++ } @@ -182,7 +196,7 @@ func (regex Reg) FindAllSubmatch(str string) []Match { var matchIdx Match indices := make([]Match, 0) for idx <= len(str_runes) { - matchFound, matchIdx, idx = findAllSubmatchHelper(regex.start, str_runes, idx, regex.numGroups) + matchFound, matchIdx, idx = findAllSubmatchHelper(regex.start, str_runes, idx, regex.numGroups, regex.preferLongest) if matchFound { indices = append(indices, matchIdx) } @@ -191,7 +205,7 @@ func (regex Reg) FindAllSubmatch(str string) []Match { return indices } -func addStateToList(str []rune, idx int, list []nfaState, state nfaState, threadGroups []Group, visited []nfaState) []nfaState { +func addStateToList(str []rune, idx int, list []nfaState, state nfaState, threadGroups []Group, visited []nfaState, preferLongest bool) []nfaState { if stateExists(list, state) || stateExists(visited, state) { return list } @@ -199,32 +213,32 @@ func addStateToList(str []rune, idx int, list []nfaState, state nfaState, thread if state.isKleene || state.isQuestion { copyThread(state.splitState, state) - list = 
addStateToList(str, idx, list, *state.splitState, threadGroups, visited) + list = addStateToList(str, idx, list, *state.splitState, threadGroups, visited, preferLongest) copyThread(state.next, state) - list = addStateToList(str, idx, list, *state.next, threadGroups, visited) + list = addStateToList(str, idx, list, *state.next, threadGroups, visited, preferLongest) return list } if state.isAlternation { copyThread(state.next, state) - list = addStateToList(str, idx, list, *state.next, threadGroups, visited) + list = addStateToList(str, idx, list, *state.next, threadGroups, visited, preferLongest) copyThread(state.splitState, state) - list = addStateToList(str, idx, list, *state.splitState, threadGroups, visited) + list = addStateToList(str, idx, list, *state.splitState, threadGroups, visited, preferLongest) return list } state.threadGroups = append([]Group{}, threadGroups...) if state.assert != noneAssert { - if state.checkAssertion(str, idx) { + if state.checkAssertion(str, idx, preferLongest) { copyThread(state.next, state) - return addStateToList(str, idx, list, *state.next, state.threadGroups, visited) + return addStateToList(str, idx, list, *state.next, state.threadGroups, visited, preferLongest) } } if state.groupBegin { state.threadGroups[state.groupNum].StartIdx = idx - return addStateToList(str, idx, list, *state.next, state.threadGroups, visited) + return addStateToList(str, idx, list, *state.next, state.threadGroups, visited, preferLongest) } if state.groupEnd { state.threadGroups[state.groupNum].EndIdx = idx - return addStateToList(str, idx, list, *state.next, state.threadGroups, visited) + return addStateToList(str, idx, list, *state.next, state.threadGroups, visited, preferLongest) } return append(list, state) @@ -233,7 +247,7 @@ func addStateToList(str []rune, idx int, list []nfaState, state nfaState, thread // Helper for FindAllMatches. Returns whether it found a match, the // first Match it finds, and how far it got into the string ie. 
where // the next search should start from. -func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups int) (bool, Match, int) { +func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups int, preferLongest bool) (bool, Match, int) { // Base case - exit if offset exceeds string's length if offset > len(str) { // The second value here shouldn't be used, because we should exit when the third return value is > than len(str) @@ -248,7 +262,7 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in // If the first state is an assertion, makes sure the assertion // is true before we do _anything_ else. if start.assert != noneAssert { - if start.checkAssertion(str, offset) == false { + if start.checkAssertion(str, offset, preferLongest) == false { i++ return false, []Group{}, i } @@ -256,7 +270,7 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in start.threadGroups = newMatch(numGroups + 1) start.threadGroups[0].StartIdx = i - currentStates = addStateToList(str, i, currentStates, *start, start.threadGroups, nil) + currentStates = addStateToList(str, i, currentStates, *start, start.threadGroups, nil, preferLongest) var match Match = nil for idx := i; idx <= len(str); idx++ { if len(currentStates) == 0 { @@ -274,9 +288,9 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in currentState.threadGroups[0].EndIdx = idx match = append([]Group{}, currentState.threadGroups...) 
break - } else if !currentState.isAlternation && !currentState.isKleene && !currentState.isQuestion && !currentState.groupBegin && !currentState.groupEnd { // Normal character or assertion - if currentState.contentContains(str, idx) { - nextStates = addStateToList(str, idx+1, nextStates, *currentState.next, currentState.threadGroups, nil) + } else if !currentState.isAlternation && !currentState.isKleene && !currentState.isQuestion && !currentState.groupBegin && !currentState.groupEnd && currentState.assert == noneAssert { // Normal character + if currentState.contentContains(str, idx, preferLongest) { + nextStates = addStateToList(str, idx+1, nextStates, *currentState.next, currentState.threadGroups, nil, preferLongest) } } } @@ -291,3 +305,68 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in } return false, []Group{}, i + 1 } + +// Expand appends template to dst, expanding any variables in template to the relevant capturing group. +// +// A variable is of the form '$n', where 'n' is a number. It will be replaced by the contents of the n-th capturing group. +// To insert a literal $, do not put a number after it. Alternatively, you can use $$. +// src is the input string, and match must be the result of [Reg.FindSubmatch]. 
+func (regex Reg) Expand(dst string, template string, src string, match Match) string { + templateRuneSlc := []rune(template) + srcRuneSlc := []rune(src) + i := 0 + for i < len(templateRuneSlc) { + c := templateRuneSlc[i] + if c == '$' { + i += 1 + // The dollar sign is the last character of the string, or it is proceeded by another dollar sign + if i >= len(templateRuneSlc) || templateRuneSlc[i] == '$' { + dst += "$" + i++ + } else { + numStr := "" + for unicode.IsDigit(templateRuneSlc[i]) { + numStr += string(templateRuneSlc[i]) + i++ + } + if numStr == "" { + dst += "$" + } else { + num, _ := strconv.Atoi(numStr) + if num < len(match) { + dst += string(srcRuneSlc[match[num].StartIdx:match[num].EndIdx]) + } else { + dst += "$" + numStr + } + } + } + } else { + dst += string(c) + i++ + } + } + return dst +} + +// LiteralPrefix returns a string that must begin any match of the given regular expression. +// The second return value is true if the string comprises the entire expression. +func (regex Reg) LiteralPrefix() (prefix string, complete bool) { + state := regex.start + if state.assert != noneAssert { + state = state.next + } + for !(state.isLast) && (!state.isAlternation) && len(state.content) == 1 && state.assert == noneAssert { + if state.groupBegin || state.groupEnd { + state = state.next + continue + } + prefix += string(rune(state.content[0])) + state = state.next + } + if state.isLast { + complete = true + } else { + complete = false + } + return prefix, complete +} -- 2.30.2 From d522f50b502f77c536a23bea4a259b80a3fe3153 Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Sun, 9 Feb 2025 15:40:59 -0500 Subject: [PATCH 43/48] Wrote new example functions --- regex/example_test.go | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/regex/example_test.go b/regex/example_test.go index f2443a2..60ed033 100644 --- a/regex/example_test.go +++ b/regex/example_test.go @@ -52,3 +52,29 @@ func ExampleReg_FindSubmatch() { // 0 1 // 2 3 
} + +func ExampleReg_Expand() { + inputStr := `option1: value1 + option2: value2` + regexStr := `(\w+): (\w+)` + templateStr := "$1 = $2\n" + regexComp := regex.MustCompile(regexStr, regex.RE_MULTILINE) + result := "" + for _, submatches := range regexComp.FindAllSubmatch(inputStr) { + result = regexComp.Expand(result, templateStr, inputStr, submatches) + } + fmt.Println(result) + // Output: option1 = value1 + // option2 = value2 + +} + +func ExampleReg_LiteralPrefix() { + regexStr := `a(b|c)d*` + regexComp := regex.MustCompile(regexStr) + prefix, complete := regexComp.LiteralPrefix() + fmt.Println(prefix) + fmt.Println(complete) + // Output: a + // false +} -- 2.30.2 From af15904f3bf54da1d8ce9071eeddfc87888957bb Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Sun, 9 Feb 2025 15:41:13 -0500 Subject: [PATCH 44/48] Updated documentation --- regex/doc.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex/doc.go b/regex/doc.go index 9ca1f04..c5124e0 100644 --- a/regex/doc.go +++ b/regex/doc.go @@ -173,6 +173,6 @@ The following features are not available in [regexp], but are supported in my en 1. Lookarounds 2. Numeric ranges -The goal is to shorten the first list, and expand the second. +I hope to shorten the first list, and expand the second. 
*/ package regex -- 2.30.2 From 9fbb99f86c74d6d83f6925fd57b9e4f55472d7cc Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Sun, 9 Feb 2025 15:47:57 -0500 Subject: [PATCH 45/48] Wrote example for Longest() --- regex/example_test.go | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/regex/example_test.go b/regex/example_test.go index 60ed033..8499dfc 100644 --- a/regex/example_test.go +++ b/regex/example_test.go @@ -78,3 +78,14 @@ func ExampleReg_LiteralPrefix() { // Output: a // false } + +func ExampleReg_Longest() { + regexStr := `x|xx` + inputStr := "xx" + regexComp := regex.MustCompile(regexStr) + fmt.Println(regexComp.FindString(inputStr)) + regexComp.Longest() + fmt.Println(regexComp.FindString(inputStr)) + // Output: x + // xx +} -- 2.30.2 From b60ded41366a9e57b3a911a2770c50f11d856f4d Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Sun, 9 Feb 2025 15:48:33 -0500 Subject: [PATCH 46/48] Don't break when a match is found, if we are looking for the longest match --- regex/matching.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/regex/matching.go b/regex/matching.go index 4d7c600..0787572 100644 --- a/regex/matching.go +++ b/regex/matching.go @@ -287,7 +287,9 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in if currentState.isLast { currentState.threadGroups[0].EndIdx = idx match = append([]Group{}, currentState.threadGroups...) 
- break + if !preferLongest { + break + } } else if !currentState.isAlternation && !currentState.isKleene && !currentState.isQuestion && !currentState.groupBegin && !currentState.groupEnd && currentState.assert == noneAssert { // Normal character if currentState.contentContains(str, idx, preferLongest) { nextStates = addStateToList(str, idx+1, nextStates, *currentState.next, currentState.threadGroups, nil, preferLongest) -- 2.30.2 From 15ee49f42eb43bd833b59536c68a99fa0de36166 Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Sun, 9 Feb 2025 15:51:46 -0500 Subject: [PATCH 47/48] Rename method receivers from 'regex' to 're' (it's shorter) --- regex/compile.go | 12 ++++++------ regex/matching.go | 44 ++++++++++++++++++++++---------------------- 2 files changed, 28 insertions(+), 28 deletions(-) diff --git a/regex/compile.go b/regex/compile.go index da733de..d9bef70 100644 --- a/regex/compile.go +++ b/regex/compile.go @@ -22,17 +22,17 @@ type Reg struct { // NumSubexp returns the number of sub-expressions in the given [Reg]. This is equivalent // to the number of capturing groups. -func (r Reg) NumSubexp() int { - return r.numGroups +func (re Reg) NumSubexp() int { + return re.numGroups } // String returns the string used to compile the expression. -func (r Reg) String() string { - return r.str +func (re Reg) String() string { + return re.str } -func (r Reg) Longest() { - r.preferLongest = true +func (re *Reg) Longest() { + re.preferLongest = true } const concatRune rune = 0xF0001 diff --git a/regex/matching.go b/regex/matching.go index 0787572..4d6b4e3 100644 --- a/regex/matching.go +++ b/regex/matching.go @@ -65,8 +65,8 @@ func copyThread(to *nfaState, from nfaState) { // Find returns the 0-group of the leftmost match of the regex in the given string. // An error value != nil indicates that no match was found. 
-func (regex Reg) Find(str string) (Group, error) { - match, err := regex.FindNthMatch(str, 1) +func (re Reg) Find(str string) (Group, error) { + match, err := re.FindNthMatch(str, 1) if err != nil { return Group{}, fmt.Errorf("no matches found") } @@ -74,8 +74,8 @@ func (regex Reg) Find(str string) (Group, error) { } // Match returns a boolean value, indicating whether the regex found a match in the given string. -func (regex Reg) Match(str string) bool { - _, err := regex.Find(str) +func (re Reg) Match(str string) bool { + _, err := re.Find(str) return err == nil } @@ -93,8 +93,8 @@ func CompileMatch(expr string, str string, flags ...ReFlag) (bool, error) { // FindAll returns a slice containing all the 0-groups of the regex in the given string. // A 0-group represents the match without any submatches. -func (regex Reg) FindAll(str string) []Group { - indices := regex.FindAllSubmatch(str) +func (re Reg) FindAll(str string) []Group { + indices := re.FindAllSubmatch(str) zeroGroups := funcMap(indices, getZeroGroup) return zeroGroups } @@ -103,8 +103,8 @@ func (regex Reg) FindAll(str string) []Group { // The return value will be an empty string in two situations: // 1. No match was found // 2. The match was an empty string -func (regex Reg) FindString(str string) string { - match, err := regex.FindNthMatch(str, 1) +func (re Reg) FindString(str string) string { + match, err := re.FindNthMatch(str, 1) if err != nil { return "" } @@ -117,8 +117,8 @@ func (regex Reg) FindString(str string) string { // number of groups. The validity of a group (whether or not it matched anything) can be determined with // [Group.IsValid], or by checking that both indices of the group are >= 0. // The second-return value is nil if no match was found. 
-func (regex Reg) FindSubmatch(str string) (Match, error) { - match, err := regex.FindNthMatch(str, 1) +func (re Reg) FindSubmatch(str string) (Match, error) { + match, err := re.FindNthMatch(str, 1) if err != nil { return Match{}, fmt.Errorf("no match found") } else { @@ -135,9 +135,9 @@ func (regex Reg) FindSubmatch(str string) (Match, error) { // 2. Group n found a zero-length match // // A return value of nil indicates no match. -func (regex Reg) FindStringSubmatch(str string) []string { - matchStr := make([]string, regex.numGroups+1) - match, err := regex.FindSubmatch(str) +func (re Reg) FindStringSubmatch(str string) []string { + matchStr := make([]string, re.numGroups+1) + match, err := re.FindSubmatch(str) if err != nil { return nil } @@ -159,8 +159,8 @@ func (regex Reg) FindStringSubmatch(str string) []string { // FindAllString is the 'all' version of [FindString]. // It returns a slice of strings containing the text of all matches of // the regex in the given string. -func (regex Reg) FindAllString(str string) []string { - zerogroups := regex.FindAll(str) +func (re Reg) FindAllString(str string) []string { + zerogroups := re.FindAll(str) matchStrs := funcMap(zerogroups, func(g Group) string { return str[g.StartIdx:g.EndIdx] }) @@ -169,14 +169,14 @@ func (regex Reg) FindAllString(str string) []string { // FindNthMatch return the 'n'th match of the regex in the given string. // It returns an error (!= nil) if there are fewer than 'n' matches in the string. 
-func (regex Reg) FindNthMatch(str string, n int) (Match, error) { +func (re Reg) FindNthMatch(str string, n int) (Match, error) { idx := 0 matchNum := 0 str_runes := []rune(str) var matchFound bool var matchIdx Match for idx <= len(str_runes) { - matchFound, matchIdx, idx = findAllSubmatchHelper(regex.start, str_runes, idx, regex.numGroups, regex.preferLongest) + matchFound, matchIdx, idx = findAllSubmatchHelper(re.start, str_runes, idx, re.numGroups, re.preferLongest) if matchFound { matchNum++ } @@ -189,14 +189,14 @@ func (regex Reg) FindNthMatch(str string, n int) (Match, error) { } // FindAllSubmatch returns a slice of matches in the given string. -func (regex Reg) FindAllSubmatch(str string) []Match { +func (re Reg) FindAllSubmatch(str string) []Match { idx := 0 str_runes := []rune(str) var matchFound bool var matchIdx Match indices := make([]Match, 0) for idx <= len(str_runes) { - matchFound, matchIdx, idx = findAllSubmatchHelper(regex.start, str_runes, idx, regex.numGroups, regex.preferLongest) + matchFound, matchIdx, idx = findAllSubmatchHelper(re.start, str_runes, idx, re.numGroups, re.preferLongest) if matchFound { indices = append(indices, matchIdx) } @@ -313,7 +313,7 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in // A variable is of the form '$n', where 'n' is a number. It will be replaced by the contents of the n-th capturing group. // To insert a literal $, do not put a number after it. Alternatively, you can use $$. // src is the input string, and match must be the result of [Reg.FindSubmatch]. 
-func (regex Reg) Expand(dst string, template string, src string, match Match) string { +func (re Reg) Expand(dst string, template string, src string, match Match) string { templateRuneSlc := []rune(template) srcRuneSlc := []rune(src) i := 0 @@ -352,8 +352,8 @@ func (regex Reg) Expand(dst string, template string, src string, match Match) st // LiteralPrefix returns a string that must begin any match of the given regular expression. // The second return value is true if the string comprises the entire expression. -func (regex Reg) LiteralPrefix() (prefix string, complete bool) { - state := regex.start +func (re Reg) LiteralPrefix() (prefix string, complete bool) { + state := re.start if state.assert != noneAssert { state = state.next } -- 2.30.2 From d1958f289c72441a148f0005c5ac76d58a0f8357 Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Sun, 9 Feb 2025 16:08:16 -0500 Subject: [PATCH 48/48] Commented out tests that would only pass with Longest() --- regex/re_test.go | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/regex/re_test.go b/regex/re_test.go index de6aaba..8b9fc8d 100644 --- a/regex/re_test.go +++ b/regex/re_test.go @@ -25,7 +25,9 @@ var reTests = []struct { {"a*b", nil, "qwqw", []Group{}}, {"(abc)*", nil, "abcabcabc", []Group{{0, 9}, {9, 9}}}, {"((abc)|(def))*", nil, "abcdef", []Group{{0, 6}, {6, 6}}}, - {"(abc)*|(def)*", nil, "abcdef", []Group{{0, 3}, {3, 6}, {6, 6}}}, + // This match will only happen with Longest() + // {"(abc)*|(def)*", nil, "abcdef", []Group{{0, 3}, {3, 6}, {6, 6}}}, + {"(abc)*|(def)*", nil, "abcdef", []Group{{0, 3}, {3, 3}, {4, 4}, {5, 5}, {6, 6}}}, {"b*a*a", nil, "bba", []Group{{0, 3}}}, {"(ab)+", nil, "abcabddd", []Group{{0, 2}, {3, 5}}}, {"a(b(c|d)*)*", nil, "abccbd", []Group{{0, 6}}}, @@ -537,7 +539,9 @@ var groupTests = []struct { {"(a+)|(a)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 4}, {-1, -1}}}}, {"(a+)(aa)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 2}, {2, 4}}}}, {"(aaaa)|(aaaa)", nil, 
"aaaa", []Match{[]Group{{0, 4}, {0, 4}, {-1, -1}}}}, - {"(aaa)|(aaaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {-1, -1}, {0, 4}}}}, + // This match will only happen with Longest() + // {"(aaa)|(aaaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {-1, -1}, {0, 4}}}}, + {"(aaa)|(aaaa)", nil, "aaaa", []Match{[]Group{{0, 3}, {0, 3}, {-1, -1}}}}, {"(aaaa)|(aaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 4}, {-1, -1}}}}, {"(a)|(aa)", nil, "aa", []Match{[]Group{{0, 1}, {0, 1}}, []Group{{1, 2}, {1, 2}}}}, {"(a?)a?", nil, "b", []Match{[]Group{{0, 0}, {0, 0}}, []Group{{1, 1}, {1, 1}}}}, -- 2.30.2