From 77d19cd84eec7ae6979710d4149a428fe1a6e73b Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Fri, 22 Nov 2024 00:11:51 -0500 Subject: [PATCH] Added lookaround-related fields to State struct, added lookaround support to checkAssertion() --- nfa.go | 80 +++++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 59 insertions(+), 21 deletions(-) diff --git a/nfa.go b/nfa.go index 095eeb6..9b2d7c2 100644 --- a/nfa.go +++ b/nfa.go @@ -1,6 +1,8 @@ package main -import "slices" +import ( + "slices" +) const EPSILON int = 0 @@ -12,19 +14,24 @@ const ( EOS WBOUND NONWBOUND + PLA // Positive lookahead + NLA // Negative lookahead + PLB // Positive lookbehind + NLB // Negative lookbehind ) type State struct { - content stateContents // Contents of current state - isEmpty bool // If it is empty - Union operator and Kleene star states will be empty - isLast bool // If it is the last state (acept state) - output []*State // The outputs of the current state ie. the 'outward arrows'. A union operator state will have more than one of these. - transitions map[int][]*State // Transitions to different states (maps a character (int representation) to a _list of states. This is useful if one character can lead multiple states eg. ab|aa) - isKleene bool // Identifies whether current node is a 0-state representing Kleene star - assert assertType // Type of assertion of current node - NONE means that the node doesn't assert anything - zeroMatchFound bool // Whether or not the state has been used for a zero-length match - only relevant for zero states - allChars bool // Whether or not the state represents all characters (eg. a 'dot' metacharacter). A 'dot' node doesn't store any contents directly, as it would take up too much space - except []rune // Only valid if allChars is true - match all characters _except_ the ones in this block. Useful for inverting character classes. 
+ content stateContents // Contents of current state + isEmpty bool // If it is empty - Union operator and Kleene star states will be empty + isLast bool // If it is the last state (accept state) + output []*State // The outputs of the current state ie. the 'outward arrows'. A union operator state will have more than one of these. + transitions map[int][]*State // Transitions to different states (maps a character (int representation) to a _list_ of states. This is useful if one character can lead to multiple states eg. ab|aa) + isKleene bool // Identifies whether current node is a 0-state representing Kleene star + assert assertType // Type of assertion of current node - NONE means that the node doesn't assert anything + zeroMatchFound bool // Whether or not the state has been used for a zero-length match - only relevant for zero states + allChars bool // Whether or not the state represents all characters (eg. a 'dot' metacharacter). A 'dot' node doesn't store any contents directly, as it would take up too much space + except []rune // Only valid if allChars is true - match all characters _except_ the ones in this block. Useful for inverting character classes. + lookaroundRegex string // Only for lookaround states - Contents of the regex that the lookaround state holds } // Clones the NFA starting from the given state. @@ -43,16 +50,17 @@ func cloneStateHelper(state *State, cloneMap map[*State]*State) *State { // Recursive case - if the clone doesn't exist, create it, add it to the map, // and recursively call for each of the transition states. 
clone := &State{ - content: append([]int{}, state.content...), - isEmpty: state.isEmpty, - isLast: state.isLast, - output: make([]*State, len(state.output)), - transitions: make(map[int][]*State), - isKleene: state.isKleene, - assert: state.assert, - zeroMatchFound: state.zeroMatchFound, - allChars: state.allChars, - except: append([]rune{}, state.except...), + content: append([]int{}, state.content...), + isEmpty: state.isEmpty, + isLast: state.isLast, + output: make([]*State, len(state.output)), + transitions: make(map[int][]*State), + isKleene: state.isKleene, + assert: state.assert, + zeroMatchFound: state.zeroMatchFound, + allChars: state.allChars, + except: append([]rune{}, state.except...), + lookaroundRegex: state.lookaroundRegex, } cloneMap[state] = clone for i, s := range state.output { @@ -90,6 +98,36 @@ func (s State) checkAssertion(str []rune, idx int) bool { if s.assert == NONWBOUND { return !isWordBoundary(str, idx) } + if s.assert == PLA || s.assert == PLB || s.assert == NLA || s.assert == NLB { // Lookaround + // The process here is simple: + // 1. Compile the regex stored in the state's lookaroundRegex field. + // 2. Run it on the test string. + // 3. Based on the kind of lookaround (and the indices we get), determine what action to take. + regex := s.lookaroundRegex + re_postfix := shuntingYard(regex) + startState := thompson(re_postfix) + matchIndices := findAllMatches(startState, str) + + numMatchesFound := 0 + for _, matchIdx := range matchIndices { + if s.assert == PLA || s.assert == NLA { // Lookahead - return true (or false) if at least one match starts at the current index + if matchIdx.startIdx == idx { + numMatchesFound++ + } + } + if s.assert == PLB || s.assert == NLB { // Lookbehind - return true (or false) if at least one match _ends_ at the current index. 
+ if matchIdx.endIdx == idx { + numMatchesFound++ + } + } + } + if s.assert == PLA || s.assert == PLB { // Positive assertions want at least one match + return numMatchesFound > 0 + } + if s.assert == NLA || s.assert == NLB { // Negative assertions only want zero matches + return numMatchesFound == 0 + } + } return true }