From 93474c5159715baf05233f1fafb34849409d2dfb Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Thu, 30 Jan 2025 10:31:02 -0500 Subject: [PATCH] Renamed 'state' to 'nfaState' because 'state' by itself means nothing --- regex/compile.go | 24 ++++++------ regex/matching.go | 30 +++++++-------- regex/nfa.go | 94 +++++++++++++++++++++++------------------------ 3 files changed, 74 insertions(+), 74 deletions(-) diff --git a/regex/compile.go b/regex/compile.go index b5e9f43..3816242 100644 --- a/regex/compile.go +++ b/regex/compile.go @@ -14,7 +14,7 @@ var notDotChars []rune // the startState of the NFA representation of the regex, and the number of capturing // groups in the regex. type Reg struct { - start *State + start *nfaState numGroups int } @@ -799,8 +799,8 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) { // Thompson's algorithm. Constructs Finite-State Automaton from given string. // Returns start state and number of groups in regex. func thompson(re []postfixNode) (Reg, error) { - nfa := make([]*State, 0) // Stack of states - numGroups := 0 // Number of capturing groups + nfa := make([]*nfaState, 0) // Stack of states + numGroups := 0 // Number of capturing groups // If thompson() receives an empty regex, then whatever was given to shuntingYard() // was parsed away. This doesn't mean that the regex itself is empty. @@ -815,8 +815,8 @@ func thompson(re []postfixNode) (Reg, error) { for _, c := range re { if c.nodetype == characterNode || c.nodetype == assertionNode { - stateToAdd := State{} - stateToAdd.transitions = make(map[int][]*State) + stateToAdd := nfaState{} + stateToAdd.transitions = make(map[int][]*nfaState) if c.allChars { stateToAdd.allChars = true if len(c.except) != 0 { @@ -862,7 +862,7 @@ func thompson(re []postfixNode) (Reg, error) { })...) } stateToAdd.content = stateContents(append([]int(stateToAdd.content), []int(rune2Contents(runesToAdd))...)) - stateToAdd.output = make([]*State, 0) + stateToAdd.output = make([]*nfaState, 0) stateToAdd.output = append(stateToAdd.output, &stateToAdd) stateToAdd.isEmpty = false if c.nodetype == assertionNode { @@ -918,13 +918,13 @@ func thompson(re []postfixNode) (Reg, error) { nfa = append(nfa, &stateToAdd) } if c.nodetype == lparenNode || c.nodetype == rparenNode { - s := &State{} + s := &nfaState{} s.assert = noneAssert s.content = newContents(EPSILON) s.isEmpty = true - s.output = make([]*State, 0) + s.output = make([]*nfaState, 0) s.output = append(s.output, s) - s.transitions = make(map[int][]*State) + s.transitions = make(map[int][]*nfaState) // LPAREN nodes are just added normally if c.nodetype == lparenNode { numGroups++ @@ -971,7 +971,7 @@ func thompson(re []postfixNode) (Reg, error) { } if c.nodetype == charclassNode { // A Character class consists of all the nodes in it, alternated // Map the list of nodes to a list of states, each state containing the contents of a specific node - states := funcMap(c.nodeContents, func(node postfixNode) *State { + states := funcMap(c.nodeContents, func(node postfixNode) *nfaState { s := newState() nodeContents := node.contents if caseInsensitive { @@ -989,7 +989,7 @@ func thompson(re []postfixNode) (Reg, error) { return &s }) // Reduce the list of states down to a single state by alternating them - toAdd := funcReduce(states, func(s1 *State, s2 *State) *State { + toAdd := funcReduce(states, func(s1 *nfaState, s2 *nfaState) *nfaState { return alternate(s1, s2) }) nfa = append(nfa, toAdd) @@ -1066,7 +1066,7 @@ func thompson(re []postfixNode) (Reg, error) { return Reg{}, fmt.Errorf("numeric specifier - start greater than end") } poppedState := mustPop(&nfa) - var stateToAdd *State = nil + var stateToAdd *nfaState = nil // Take advantage of the following facts: // a{5} == aaaaa // a{3,5} == aaaa?a? diff --git a/regex/matching.go b/regex/matching.go index f7f83ca..61dca15 100644 --- a/regex/matching.go +++ b/regex/matching.go @@ -61,7 +61,7 @@ func (g Group) isValid() bool { // given slice. It returns the resulting states. If any of the resulting states is a 0-state, // the second ret val is true. // If a state begins or ends a capturing group, its 'thread' is updated to contain the correct index. -func takeZeroState(states []*State, numGroups int, idx int) (rtv []*State, isZero bool) { +func takeZeroState(states []*nfaState, numGroups int, idx int) (rtv []*nfaState, isZero bool) { for _, state := range states { if len(state.transitions[EPSILON]) > 0 { for _, s := range state.transitions[EPSILON] { @@ -93,9 +93,9 @@ func takeZeroState(states []*State, numGroups int, idx int) (rtv []*State, isZer // from any of the given states, given the string and our position in it. // It uses the same algorithm to find zero-states as the one inside the loop, // so I should probably put it in a function. -func zeroMatchPossible(str []rune, idx int, numGroups int, states ...*State) bool { +func zeroMatchPossible(str []rune, idx int, numGroups int, states ...*nfaState) bool { zeroStates, isZero := takeZeroState(states, numGroups, idx) - tempstates := make([]*State, 0, len(zeroStates)+len(states)) + tempstates := make([]*nfaState, 0, len(zeroStates)+len(states)) tempstates = append(tempstates, states...) tempstates = append(tempstates, zeroStates...) num_appended := 0 // number of unique states addded to tempstates @@ -204,7 +204,7 @@ func FindAllMatches(regex Reg, str string) []Match { // the next search should start from. // // Might return duplicates or overlapping indices, so care must be taken to prune the resulting array. -func findAllMatchesHelper(start *State, str []rune, offset int, numGroups int) (bool, Match, int) { +func findAllMatchesHelper(start *nfaState, str []rune, offset int, numGroups int) (bool, Match, int) { // Base case - exit if offset exceeds string's length if offset > len(str) { // The second value here shouldn't be used, because we should exit when the third return value is > than len(str) @@ -221,10 +221,10 @@ func findAllMatchesHelper(start *State, str []rune, offset int, numGroups int) ( foundPath := false startIdx := offset endIdx := offset - currentStates := make([]*State, 0) - tempStates := make([]*State, 0) // Used to store states that should be used in next loop iteration - i := offset // Index in string - startingFrom := i // Store starting index + currentStates := make([]*nfaState, 0) + tempStates := make([]*nfaState, 0) // Used to store states that should be used in next loop iteration + i := offset // Index in string + startingFrom := i // Store starting index // If the first state is an assertion, makes sure the assertion // is true before we do _anything_ else. @@ -257,7 +257,7 @@ func findAllMatchesHelper(start *State, str []rune, offset int, numGroups int) ( for i < len(str) { foundPath = false - zeroStates := make([]*State, 0) + zeroStates := make([]*nfaState, 0) // Keep taking zero-states, until there are no more left to take // Objective: If any of our current states have transitions to 0-states, replace them with the 0-state. Do this until there are no more transitions to 0-states, or there are no more unique 0-states to take. zeroStates, isZero := takeZeroState(currentStates, numGroups, i) @@ -275,11 +275,11 @@ func findAllMatchesHelper(start *State, str []rune, offset int, numGroups int) ( tempStates = nil // Take any transitions corresponding to current character - numStatesMatched := 0 // The number of states which had at least 1 match for this round - assertionFailed := false // Whether or not an assertion failed for this round - lastStateInList := false // Whether or not a last state was in our list of states - var lastStatePtr *State = nil // Pointer to the last-state, if it was found - lastLookaroundInList := false // Whether or not a last state (that is a lookaround) was in our list of states + numStatesMatched := 0 // The number of states which had at least 1 match for this round + assertionFailed := false // Whether or not an assertion failed for this round + lastStateInList := false // Whether or not a last state was in our list of states + var lastStatePtr *nfaState = nil // Pointer to the last-state, if it was found + lastLookaroundInList := false // Whether or not a last state (that is a lookaround) was in our list of states for _, state := range currentStates { matches, numMatches := state.matchesFor(str, i) if numMatches > 0 { @@ -364,7 +364,7 @@ func findAllMatchesHelper(start *State, str []rune, offset int, numGroups int) ( } return false, []Group{}, startIdx } - currentStates = make([]*State, len(tempStates)) + currentStates = make([]*nfaState, len(tempStates)) copy(currentStates, tempStates) tempStates = nil diff --git a/regex/nfa.go b/regex/nfa.go index 839af20..cf68341 100644 --- a/regex/nfa.go +++ b/regex/nfa.go @@ -22,36 +22,36 @@ const ( alwaysTrueAssert // An assertion that is always true ) -type state struct { - content stateContents // Contents of current state - isEmpty bool // If it is empty - Union operator and Kleene star states will be empty - isLast bool // If it is the last state (acept state) - output []*state // The outputs of the current state ie. the 'outward arrows'. A union operator state will have more than one of these. - transitions map[int][]*state // Transitions to different states (maps a character (int representation) to a _list of states. This is useful if one character can lead multiple states eg. ab|aa) - isKleene bool // Identifies whether current node is a 0-state representing Kleene star - assert assertType // Type of assertion of current node - NONE means that the node doesn't assert anything - allChars bool // Whether or not the state represents all characters (eg. a 'dot' metacharacter). A 'dot' node doesn't store any contents directly, as it would take up too much space - except []rune // Only valid if allChars is true - match all characters _except_ the ones in this block. Useful for inverting character classes. - lookaroundRegex string // Only for lookaround states - Contents of the regex that the lookaround state holds - lookaroundNFA *state // Holds the NFA of the lookaroundRegex - if it exists - lookaroundNumCaptureGroups int // Number of capturing groups in lookaround regex if current node is a lookaround - groupBegin bool // Whether or not the node starts a capturing group - groupEnd bool // Whether or not the node ends a capturing group - groupNum int // Which capturing group the node starts / ends +type nfaState struct { + content stateContents // Contents of current state + isEmpty bool // If it is empty - Union operator and Kleene star states will be empty + isLast bool // If it is the last state (acept state) + output []*nfaState // The outputs of the current state ie. the 'outward arrows'. A union operator state will have more than one of these. + transitions map[int][]*nfaState // Transitions to different states (maps a character (int representation) to a _list of states. This is useful if one character can lead multiple states eg. ab|aa) + isKleene bool // Identifies whether current node is a 0-state representing Kleene star + assert assertType // Type of assertion of current node - NONE means that the node doesn't assert anything + allChars bool // Whether or not the state represents all characters (eg. a 'dot' metacharacter). A 'dot' node doesn't store any contents directly, as it would take up too much space + except []rune // Only valid if allChars is true - match all characters _except_ the ones in this block. Useful for inverting character classes. + lookaroundRegex string // Only for lookaround states - Contents of the regex that the lookaround state holds + lookaroundNFA *nfaState // Holds the NFA of the lookaroundRegex - if it exists + lookaroundNumCaptureGroups int // Number of capturing groups in lookaround regex if current node is a lookaround + groupBegin bool // Whether or not the node starts a capturing group + groupEnd bool // Whether or not the node ends a capturing group + groupNum int // Which capturing group the node starts / ends // The following properties depend on the current match - I should think about resetting them for every match. zeroMatchFound bool // Whether or not the state has been used for a zero-length match - only relevant for zero states threadGroups []Group // Assuming that a state is part of a 'thread' in the matching process, this array stores the indices of capturing groups in the current thread. As matches are found for this state, its groups will be copied over. } // Clones the NFA starting from the given state. -func cloneState(start *state) *state { - return cloneStateHelper(start, make(map[*state]*state)) +func cloneState(start *nfaState) *nfaState { + return cloneStateHelper(start, make(map[*nfaState]*nfaState)) } // Helper function for clone. The map is used to keep track of which states have // already been copied, and which ones haven't. // This function was created using output from Llama3.1:405B. -func cloneStateHelper(stateToClone *state, cloneMap map[*state]*state) *state { +func cloneStateHelper(stateToClone *nfaState, cloneMap map[*nfaState]*nfaState) *nfaState { // Base case - if the clone exists in our map, return it. if clone, exists := cloneMap[stateToClone]; exists { return clone @@ -61,12 +61,12 @@ func cloneStateHelper(stateToClone *state, cloneMap map[*state]*state) *state { } // Recursive case - if the clone doesn't exist, create it, add it to the map, // and recursively call for each of the transition states. - clone := &state{ + clone := &nfaState{ content: append([]int{}, stateToClone.content...), isEmpty: stateToClone.isEmpty, isLast: stateToClone.isLast, - output: make([]*state, len(stateToClone.output)), - transitions: make(map[int][]*state), + output: make([]*nfaState, len(stateToClone.output)), + transitions: make(map[int][]*nfaState), isKleene: stateToClone.isKleene, assert: stateToClone.assert, zeroMatchFound: stateToClone.zeroMatchFound, @@ -86,7 +86,7 @@ func cloneStateHelper(stateToClone *state, cloneMap map[*state]*state) *state { } } for k, v := range stateToClone.transitions { - clone.transitions[k] = make([]*state, len(v)) + clone.transitions[k] = make([]*nfaState, len(v)) for i, s := range v { if s == stateToClone { clone.transitions[k][i] = clone @@ -104,7 +104,7 @@ func cloneStateHelper(stateToClone *state, cloneMap map[*state]*state) *state { // Checks if the given state's assertion is true. Returns true if the given // state doesn't have an assertion. -func (s state) checkAssertion(str []rune, idx int) bool { +func (s nfaState) checkAssertion(str []rune, idx int) bool { if s.assert == alwaysTrueAssert { return true } @@ -171,7 +171,7 @@ func (s state) checkAssertion(str []rune, idx int) bool { } // Returns true if the contents of 's' contain the value at the given index of the given string -func (s state) contentContains(str []rune, idx int) bool { +func (s nfaState) contentContains(str []rune, idx int) bool { if s.assert != noneAssert { return s.checkAssertion(str, idx) } @@ -182,19 +182,19 @@ func (s state) contentContains(str []rune, idx int) bool { return slices.Contains(s.content, int(str[idx])) } -func (s state) isLookaround() bool { +func (s nfaState) isLookaround() bool { return s.assert == plaAssert || s.assert == plbAssert || s.assert == nlaAssert || s.assert == nlbAssert } // Returns the matches for the character at the given index of the given string. // Also returns the number of matches. Returns -1 if an assertion failed. -func (s state) matchesFor(str []rune, idx int) ([]*state, int) { +func (s nfaState) matchesFor(str []rune, idx int) ([]*nfaState, int) { // Assertions can be viewed as 'checks'. If the check fails, we return // an empty array and 0. // If it passes, we treat it like any other state, and return all the transitions. if s.assert != noneAssert { if s.checkAssertion(str, idx) == false { - return make([]*state, 0), -1 + return make([]*nfaState, 0), -1 } } listTransitions := s.transitions[int(str[idx])] @@ -211,7 +211,7 @@ func (s state) matchesFor(str []rune, idx int) ([]*state, int) { } // verifyLastStatesHelper performs the depth-first recursion needed for verifyLastStates -func verifyLastStatesHelper(st *state, visited map[*state]bool) { +func verifyLastStatesHelper(st *nfaState, visited map[*nfaState]bool) { if len(st.transitions) == 0 { st.isLast = true return @@ -228,7 +228,7 @@ func verifyLastStatesHelper(st *state, visited map[*state]bool) { } if st.isKleene { // A State representing a Kleene Star has transitions going out, which loop back to it. If all those transitions point to the same (single) state, then it must be a last state - transitionDests := make([]*state, 0) + transitionDests := make([]*nfaState, 0) for _, v := range st.transitions { transitionDests = append(transitionDests, v...) } @@ -251,12 +251,12 @@ func verifyLastStatesHelper(st *state, visited map[*state]bool) { } // verifyLastStates enables the 'isLast' flag for the leaf nodes (last states) -func verifyLastStates(start []*state) { - verifyLastStatesHelper(start[0], make(map[*state]bool)) +func verifyLastStates(start []*nfaState) { + verifyLastStatesHelper(start[0], make(map[*nfaState]bool)) } // Concatenates s1 and s2, returns the start of the concatenation. -func concatenate(s1 *state, s2 *state) *state { +func concatenate(s1 *nfaState, s2 *nfaState) *nfaState { if s1 == nil { return s2 } @@ -269,13 +269,13 @@ func concatenate(s1 *state, s2 *state) *state { return s1 } -func kleene(s1 state) (*state, error) { +func kleene(s1 nfaState) (*nfaState, error) { if s1.isEmpty && s1.assert != noneAssert { return nil, fmt.Errorf("previous token is not quantifiable") } - toReturn := &state{} - toReturn.transitions = make(map[int][]*state) + toReturn := &nfaState{} + toReturn.transitions = make(map[int][]*nfaState) toReturn.content = newContents(EPSILON) toReturn.isEmpty = true toReturn.isKleene = true @@ -291,9 +291,9 @@ func kleene(s1 state) (*state, error) { return toReturn, nil } -func alternate(s1 *state, s2 *state) *state { - toReturn := &state{} - toReturn.transitions = make(map[int][]*state) +func alternate(s1 *nfaState, s2 *nfaState) *nfaState { + toReturn := &nfaState{} + toReturn.transitions = make(map[int][]*nfaState) toReturn.output = append(toReturn.output, s1.output...) toReturn.output = append(toReturn.output, s2.output...) // Unique append is used here (and elsewhere) to ensure that, @@ -313,9 +313,9 @@ func alternate(s1 *state, s2 *state) *state { return toReturn } -func question(s1 *state) *state { // Use the fact that ab? == a(b|) - s2 := &state{} - s2.transitions = make(map[int][]*state) +func question(s1 *nfaState) *nfaState { // Use the fact that ab? == a(b|) + s2 := &nfaState{} + s2.transitions = make(map[int][]*nfaState) s2.content = newContents(EPSILON) s2.output = append(s2.output, s2) s2.isEmpty = true @@ -324,10 +324,10 @@ func question(s1 *state) *state { // Use the fact that ab? == a(b|) } // Creates and returns a new state with the 'default' values. -func newState() state { - ret := state{ - output: make([]*state, 0), - transitions: make(map[int][]*state), +func newState() nfaState { + ret := nfaState{ + output: make([]*nfaState, 0), + transitions: make(map[int][]*nfaState), assert: noneAssert, except: append([]rune{}, 0), lookaroundRegex: "", @@ -339,7 +339,7 @@ func newState() state { } // Creates and returns a state that _always_ has a zero-length match. -func zeroLengthMatchState() state { +func zeroLengthMatchState() nfaState { start := newState() start.content = newContents(EPSILON) start.isEmpty = true