diff --git a/regex/matching.go b/regex/matching.go index af2ede3..ad7d15b 100644 --- a/regex/matching.go +++ b/regex/matching.go @@ -1,7 +1,6 @@ package regex import ( - "container/heap" "fmt" "slices" "sort" @@ -267,16 +266,15 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in // chosen as the match for the entire string. // This allows us to pick the longest possible match (which is how greedy matching works). // COMMENT ABOVE IS CURRENTLY NOT UP-TO-DATE - tempIndices := newMatch(numGroups + 1) + // tempIndices := newMatch(numGroups + 1) - foundPath := false - startIdx := offset - endIdx := offset - currentStates := &priorityQueue{} - heap.Init(currentStates) - tempStates := make([]*nfaState, 0) // Used to store states that should be used in next loop iteration - i := offset // Index in string - startingFrom := i // Store starting index + // foundPath := false + //startIdx := offset + //endIdx := offset + currentStates := make([]*nfaState, 0) + // tempStates := make([]*nfaState, 0) // Used to store states that should be used in next loop iteration + i := offset // Index in string + //startingFrom := i // Store starting index // If the first state is an assertion, makes sure the assertion // is true before we do _anything_ else. @@ -287,214 +285,266 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in } } // Increment until we hit a character matching the start state (assuming not 0-state) - if start.isEmpty == false { - for i < len(str) && !start.contentContains(str, i) { - i++ - } - startIdx = i - startingFrom = i - i++ // Advance to next character (if we aren't at a 0-state, which doesn't match anything), so that we can check for transitions. 
If we advance at a 0-state, we will never get a chance to match the first character - } - - start.threadGroups = newMatch(numGroups + 1) + // if start.isEmpty == false { + // for i < len(str) && !start.contentContains(str, i) { + // i++ + // } + // startIdx = i + // startingFrom = i + // i++ // Advance to next character (if we aren't at a 0-state, which doesn't match anything), so that we can check for transitions. If we advance at a 0-state, we will never get a chance to match the first character + // } + + // start.threadGroups = newMatch(numGroups + 1) // Check if the start state begins a group - if so, add the start index to our list - if start.groupBegin { - start.threadGroups[start.groupNum].StartIdx = i - // tempIndices[start.groupNum].startIdx = i - } + //if start.groupBegin { + // start.threadGroups[start.groupNum].StartIdx = i + // tempIndices[start.groupNum].startIdx = i + //} start.threadSP = i - heap.Push(currentStates, newPriorQueueItem(start)) + currentStates = append(currentStates, start) + var foundMatch bool // Main loop - for currentStates.Len() > 0 { - currentState := heap.Pop(currentStates) - foundPath = false - - zeroStates := make([]*nfaState, 0) - // Keep taking zero-states, until there are no more left to take - // Objective: If any of our current states have transitions to 0-states, replace them with the 0-state. Do this until there are no more transitions to 0-states, or there are no more unique 0-states to take. - topStateItem := currentStates.peek() - topState := topStateItem.(*priorQueueItem).state - zeroStates, isZero := takeZeroState([]*nfaState{topState}, numGroups, i) - tempStates = append(tempStates, zeroStates...) - num_appended := 0 - for isZero == true { - zeroStates, isZero = takeZeroState(tempStates, numGroups, i) - tempStates, num_appended = uniqueAppend(tempStates, zeroStates...) 
- if num_appended == 0 { // Break if we haven't appended any more unique values - break - } - } - if isZero == true { - currentStates.Pop() - } - - for _, state := range tempStates { - heap.Push(currentStates, newPriorQueueItem(state)) - } - tempStates = nil - - // Take any transitions corresponding to current character - numStatesMatched := 0 // The number of states which had at least 1 match for this round - assertionFailed := false // Whether or not an assertion failed for this round - lastStateInList := false // Whether or not a last state was in our list of states - var lastStatePtr *nfaState = nil // Pointer to the last-state, if it was found - lastLookaroundInList := false // Whether or not a last state (that is a lookaround) was in our list of states - for numStatesMatched == 0 && lastStateInList == false { - if currentStates.Len() == 0 { - break - } - stateItem := heap.Pop(currentStates) - state := stateItem.(*priorQueueItem).state - matches, numMatches := state.matchesFor(str, i) - if numMatches > 0 { - numStatesMatched++ - tempStates = append([]*nfaState(nil), matches...) - foundPath = true - for _, m := range matches { - if m.threadGroups == nil { - m.threadGroups = newMatch(numGroups + 1) - } - m.threadSP = state.threadSP + 1 - copy(m.threadGroups, state.threadGroups) - } - } - if numMatches < 0 { - assertionFailed = true - } - if state.isLast { - if state.isLookaround() { - lastLookaroundInList = true - } - lastStateInList = true - lastStatePtr = state - } - } - - if assertionFailed && numStatesMatched == 0 { // Nothing has matched and an assertion has failed - // If I'm being completely honest, I'm not sure why I have to check specifically for a _lookaround_ - // state. The explanation below is my attempt to explain this behavior. - // If you replace 'lastLookaroundInList' with 'lastStateInList', one of the test cases fails. - // - // One of the states in our list was a last state and a lookaround. 
In this case, we - // don't abort upon failure of the assertion, because we have found - // another path to a final state. - // Even if the last state _was_ an assertion, we can use the previously - // saved indices to find a match. - if lastLookaroundInList { - break - } else { - if i == startingFrom { - i++ - } - return false, []Group{}, i - } + for len(currentStates) > 0 { + currentState, _ := pop(&currentStates) + idx := currentState.threadSP + foundMatch = false + + if currentState.threadGroups == nil { + currentState.threadGroups = newMatch(numGroups + 1) + currentState.threadGroups[0].StartIdx = idx } - // Check if we can find a state in our list that is: - // a. A last-state - // b. Empty - // c. Doesn't assert anything - for _, stateItem := range *currentStates { - s := stateItem.state - if s.isLast && s.isEmpty && s.assert == noneAssert { - lastStatePtr = s - lastStateInList = true + if currentState.groupBegin { + currentState.threadGroups[currentState.groupNum].StartIdx = idx + } else if currentState.groupEnd { + currentState.threadGroups[currentState.groupNum].EndIdx = idx + } else if currentState.isKleene { + // Append the + } else if currentState.isAlternation { + rightState := currentState.rightState + rightState.threadGroups = currentState.threadGroups + rightState.threadSP = currentState.threadSP + currentStates = append(currentStates, currentState.rightState) + leftState := currentState.leftState + leftState.threadGroups = currentState.threadGroups + leftState.threadSP = currentState.threadSP + currentStates = append(currentStates, currentState.leftState) + continue + } else if currentState.contentContains(str, idx) { + foundMatch = true + allMatches := make([]*nfaState, 0) + for _, v := range currentState.transitions { + allMatches = append(allMatches, v...) } - } - if lastStateInList && numStatesMatched == 0 { // A last-state was in the list of states.
add the matchIndex to our MatchIndex list - for j := 1; j < numGroups+1; j++ { - tempIndices[j] = lastStatePtr.threadGroups[j] - } - endIdx = i - tempIndices[0] = Group{startIdx, endIdx} - if tempIndices[0].StartIdx == tempIndices[0].EndIdx { - return true, tempIndices, tempIndices[0].EndIdx + 1 - } else { - return true, tempIndices, tempIndices[0].EndIdx - } - } - - // Check if we can find a zero-length match - if foundPath == false { - currentStatesList := funcMap(*currentStates, func(item *priorQueueItem) *nfaState { - return item.state - }) - if ok := zeroMatchPossible(str, i, numGroups, currentStatesList...); ok { - if tempIndices[0].IsValid() == false { - tempIndices[0] = Group{startIdx, startIdx} - } - } - // If we haven't moved in the string, increment the counter by 1 - // to ensure we don't keep trying the same string over and over. - // if i == startingFrom { - startIdx++ - // i++ - // } - if tempIndices.numValidGroups() > 0 && tempIndices[0].IsValid() { - if tempIndices[0].StartIdx == tempIndices[0].EndIdx { // If we have a zero-length match, we have to shift the index at which we start. Otherwise we keep looking at the same paert of the string over and over. - return true, tempIndices, tempIndices[0].EndIdx + 1 + slices.Reverse(allMatches) + for _, m := range allMatches { + m.threadGroups = currentState.threadGroups + if currentState.assert == noneAssert { + m.threadSP = idx + 1 } else { - return true, tempIndices, tempIndices[0].EndIdx + m.threadSP = idx } } - return false, []Group{}, startIdx - } - currentStates = &priorityQueue{} - slices.Reverse(tempStates) - for _, state := range tempStates { - heap.Push(currentStates, newPriorQueueItem(state)) + currentStates = append(currentStates, allMatches...) } - tempStates = nil - i++ - } + if currentState.isLast && foundMatch { // Last state reached + currentState.threadGroups[0].EndIdx = idx + 1 + return true, currentState.threadGroups, idx + 1 - // End-of-string reached. 
Go to any 0-states, until there are no more 0-states to go to. Then check if any of our states are in the end position. - // This is the exact same algorithm used inside the loop, so I should probably put it in a function. - if currentStates.Len() > 0 { - topStateItem := currentStates.peek() - topState := topStateItem.(*priorQueueItem).state - zeroStates, isZero := takeZeroState([]*nfaState{topState}, numGroups, i) - tempStates = append(tempStates, zeroStates...) - num_appended := 0 // Number of unique states addded to tempStates - for isZero == true { - zeroStates, isZero = takeZeroState(tempStates, numGroups, i) - tempStates, num_appended = uniqueAppend(tempStates, zeroStates...) - if num_appended == 0 { // Break if we haven't appended any more unique values - break - } } } - - for _, state := range tempStates { - heap.Push(currentStates, newPriorQueueItem(state)) - } - tempStates = nil - - for _, stateItem := range *currentStates { - state := stateItem.state - // Only add the match if the start index is in bounds. If the state has an assertion, - // make sure the assertion checks out. - if state.isLast && i <= len(str) { - if state.assert == noneAssert || state.checkAssertion(str, i) { - for j := 1; j < numGroups+1; j++ { - tempIndices[j] = state.threadGroups[j] - } - endIdx = i - tempIndices[0] = Group{startIdx, endIdx} - } - } - } - - if tempIndices.numValidGroups() > 0 { - if tempIndices[0].StartIdx == tempIndices[0].EndIdx { // If we have a zero-length match, we have to shift the index at which we start. Otherwise we keep looking at the same paert of the string over and over. - return true, tempIndices, tempIndices[0].EndIdx + 1 - } else { - return true, tempIndices, tempIndices[0].EndIdx - } - } - if startIdx == startingFrom { // Increment starting index if we haven't moved in the string. Prevents us from matching the same part of the string over and over. 
- startIdx++ - } - return false, []Group{}, startIdx + return false, []Group{}, i + 1 + // zeroStates := make([]*nfaState, 0) + // // Keep taking zero-states, until there are no more left to take + // // Objective: If any of our current states have transitions to 0-states, replace them with the 0-state. Do this until there are no more transitions to 0-states, or there are no more unique 0-states to take. + // topStateItem := currentStates.peek() + // topState := topStateItem.(*priorQueueItem).state + // zeroStates, isZero := takeZeroState([]*nfaState{topState}, numGroups, i) + // tempStates = append(tempStates, zeroStates...) + // num_appended := 0 + // for isZero == true { + // zeroStates, isZero = takeZeroState(tempStates, numGroups, i) + // tempStates, num_appended = uniqueAppend(tempStates, zeroStates...) + // if num_appended == 0 { // Break if we haven't appended any more unique values + // break + // } + // } + // if isZero == true { + // currentStates.Pop() + // } + // + // for _, state := range tempStates { + // heap.Push(currentStates, newPriorQueueItem(state)) + // } + // tempStates = nil + // + // // Take any transitions corresponding to current character + // numStatesMatched := 0 // The number of states which had at least 1 match for this round + // assertionFailed := false // Whether or not an assertion failed for this round + // lastStateInList := false // Whether or not a last state was in our list of states + // var lastStatePtr *nfaState = nil // Pointer to the last-state, if it was found + // lastLookaroundInList := false // Whether or not a last state (that is a lookaround) was in our list of states + // for numStatesMatched == 0 && lastStateInList == false { + // if currentStates.Len() == 0 { + // break + // } + // stateItem := heap.Pop(currentStates) + // state := stateItem.(*priorQueueItem).state + // matches, numMatches := state.matchesFor(str, i) + // if numMatches > 0 { + // numStatesMatched++ + // tempStates = append([]*nfaState(nil), 
matches...) + // foundPath = true + // for _, m := range matches { + // if m.threadGroups == nil { + // m.threadGroups = newMatch(numGroups + 1) + // } + // m.threadSP = state.threadSP + 1 + // copy(m.threadGroups, state.threadGroups) + // } + // } + // if numMatches < 0 { + // assertionFailed = true + // } + // if state.isLast { + // if state.isLookaround() { + // lastLookaroundInList = true + // } + // lastStateInList = true + // lastStatePtr = state + // } + // } + // + // if assertionFailed && numStatesMatched == 0 { // Nothing has matched and an assertion has failed + // // If I'm being completely honest, I'm not sure why I have to check specifically for a _lookaround_ + // // state. The explanation below is my attempt to explain this behavior. + // // If you replace 'lastLookaroundInList' with 'lastStateInList', one of the test cases fails. + // // + // // One of the states in our list was a last state and a lookaround. In this case, we + // // don't abort upon failure of the assertion, because we have found + // // another path to a final state. + // // Even if the last state _was_ an assertion, we can use the previously + // // saved indices to find a match. + // if lastLookaroundInList { + // break + // } else { + // if i == startingFrom { + // i++ + // } + // return false, []Group{}, i + // } + // } + // // Check if we can find a state in our list that is: + // // a. A last-state + // // b. Empty + // // c. Doesn't assert anything + // for _, stateItem := range *currentStates { + // s := stateItem.state + // if s.isLast && s.isEmpty && s.assert == noneAssert { + // lastStatePtr = s + // lastStateInList = true + // } + // } + // if lastStateInList && numStatesMatched == 0 { // A last-state was in the list of states. 
add the matchIndex to our MatchIndex list + // for j := 1; j < numGroups+1; j++ { + // tempIndices[j] = lastStatePtr.threadGroups[j] + // } + // endIdx = i + // tempIndices[0] = Group{startIdx, endIdx} + // if tempIndices[0].StartIdx == tempIndices[0].EndIdx { + // return true, tempIndices, tempIndices[0].EndIdx + 1 + // } else { + // return true, tempIndices, tempIndices[0].EndIdx + // } + // } + // + // // Check if we can find a zero-length match + // if foundPath == false { + // currentStatesList := funcMap(*currentStates, func(item *priorQueueItem) *nfaState { + // return item.state + // }) + // if ok := zeroMatchPossible(str, i, numGroups, currentStatesList...); ok { + // if tempIndices[0].IsValid() == false { + // tempIndices[0] = Group{startIdx, startIdx} + // } + // } + // // If we haven't moved in the string, increment the counter by 1 + // // to ensure we don't keep trying the same string over and over. + // // if i == startingFrom { + // startIdx++ + // // i++ + // // } + // if tempIndices.numValidGroups() > 0 && tempIndices[0].IsValid() { + // if tempIndices[0].StartIdx == tempIndices[0].EndIdx { // If we have a zero-length match, we have to shift the index at which we start. Otherwise we keep looking at the same paert of the string over and over. + // return true, tempIndices, tempIndices[0].EndIdx + 1 + // } else { + // return true, tempIndices, tempIndices[0].EndIdx + // } + // } + // return false, []Group{}, startIdx + // } + // currentStates = &priorityQueue{} + // slices.Reverse(tempStates) + // for _, state := range tempStates { + // heap.Push(currentStates, newPriorQueueItem(state)) + // } + // tempStates = nil + // + // i++ + // } + // + // // End-of-string reached. Go to any 0-states, until there are no more 0-states to go to. Then check if any of our states are in the end position. + // // This is the exact same algorithm used inside the loop, so I should probably put it in a function. 
+ // + // if currentStates.Len() > 0 { + // topStateItem := currentStates.peek() + // topState := topStateItem.(*priorQueueItem).state + // zeroStates, isZero := takeZeroState([]*nfaState{topState}, numGroups, i) + // tempStates = append(tempStates, zeroStates...) + // num_appended := 0 // Number of unique states addded to tempStates + // for isZero == true { + // zeroStates, isZero = takeZeroState(tempStates, numGroups, i) + // tempStates, num_appended = uniqueAppend(tempStates, zeroStates...) + // if num_appended == 0 { // Break if we haven't appended any more unique values + // break + // } + // } + // } + // + // for _, state := range tempStates { + // heap.Push(currentStates, newPriorQueueItem(state)) + // } + // + // tempStates = nil + // + // for _, stateItem := range *currentStates { + // state := stateItem.state + // // Only add the match if the start index is in bounds. If the state has an assertion, + // // make sure the assertion checks out. + // if state.isLast && i <= len(str) { + // if state.assert == noneAssert || state.checkAssertion(str, i) { + // for j := 1; j < numGroups+1; j++ { + // tempIndices[j] = state.threadGroups[j] + // } + // endIdx = i + // tempIndices[0] = Group{startIdx, endIdx} + // } + // } + // } + // + // if tempIndices.numValidGroups() > 0 { + // if tempIndices[0].StartIdx == tempIndices[0].EndIdx { // If we have a zero-length match, we have to shift the index at which we start. Otherwise we keep looking at the same paert of the string over and over. + // return true, tempIndices, tempIndices[0].EndIdx + 1 + // } else { + // return true, tempIndices, tempIndices[0].EndIdx + // } + // } + // + // if startIdx == startingFrom { // Increment starting index if we haven't moved in the string. Prevents us from matching the same part of the string over and over. + // + // startIdx++ + // } + // + // return false, []Group{}, startIdx }