}elseifmiddleNode.groupBegin&&len(middleNode.transitions)==0{// The middle node is a lone lparen - something like '(())', and I'm looking at the first rparen
}elseifmiddleNode.groupBegin&&middleNode.numTransitions()==0{// The middle node is a lone lparen - something like '(())', and I'm looking at the first rparen
nfa=append(nfa,lparenNode)// I shouldn't have popped this out, because it is not involved in the current capturing group
s.groupNum=middleNode.groupNum// In this case, the 'middle' node is actually an lparen
// Increment until we hit a character matching the start state (assuming not 0-state)
ifstart.isEmpty==false{
fori<len(str)&&!start.contentContains(str,i){
i++
}
startIdx=i
startingFrom=i
i++// Advance to next character (if we aren't at a 0-state, which doesn't match anything), so that we can check for transitions. If we advance at a 0-state, we will never get a chance to match the first character
}
start.threadGroups=newMatch(numGroups+1)
// Check if the start state begins a group - if so, add the start index to our list
ifstart.groupBegin{
start.threadGroups[start.groupNum].StartIdx=i
// tempIndices[start.groupNum].startIdx = i
}
currentStates=append(currentStates,start)
// Main loop
fori<len(str){
foundPath=false
zeroStates:=make([]*nfaState,0)
// Keep taking zero-states, until there are no more left to take
// Objective: If any of our current states have transitions to 0-states, replace them with the 0-state. Do this until there are no more transitions to 0-states, or there are no more unique 0-states to take.
}elseif!currentState.isAlternation&&!currentState.isKleene&&!currentState.isQuestion&&!currentState.groupBegin&&!currentState.groupEnd&¤tState.assert==noneAssert{// Normal character
iftempIndices[0].StartIdx==tempIndices[0].EndIdx{// If we have a zero-length match, we have to shift the index at which we start. Otherwise we keep looking at the same part of the string over and over.
iftempIndices[0].StartIdx==tempIndices[0].EndIdx{// If we have a zero-length match, we have to shift the index at which we start. Otherwise we keep looking at the same part of the string over and over.
ifstartIdx==startingFrom{// Increment starting index if we haven't moved in the string. Prevents us from matching the same part of the string over and over.
isEmptybool// If it is empty - Union operator and Kleene star states will be empty
isLastbool// If it is the last state (accept state)
output[]*nfaState// The outputs of the current state ie. the 'outward arrows'. A union operator state will have more than one of these.
transitionsmap[int][]*nfaState// Transitions to different states (maps a character (int representation) to a _list_ of states). This is useful if one character can lead to multiple states, e.g. ab|aa
isKleenebool// Identifies whether current node is a 0-state representing Kleene star
isQuestionbool// Identifies whether current node is a 0-state representing the question operator
isAlternationbool// Identifies whether current node is a 0-state representing an alternation
assertassertType// Type of assertion of current node - NONE means that the node doesn't assert anything
allCharsbool// Whether or not the state represents all characters (eg. a 'dot' metacharacter). A 'dot' node doesn't store any contents directly, as it would take up too much space
except[]rune// Only valid if allChars is true - match all characters _except_ the ones in this block. Useful for inverting character classes.
lookaroundRegexstring// Only for lookaround states - Contents of the regex that the lookaround state holds
lookaroundNFA*nfaState// Holds the NFA of the lookaroundRegex - if it exists
lookaroundNumCaptureGroupsint// Number of capturing groups in lookaround regex if current node is a lookaround
groupBeginbool// Whether or not the node starts a capturing group
groupEndbool// Whether or not the node ends a capturing group
groupNumint// Which capturing group the node starts / ends
contentstateContents// Contents of current state
isEmptybool// If it is empty - Union operator and Kleene star states will be empty
isLastbool// If it is the last state (accept state)
output[]*nfaState// The outputs of the current state ie. the 'outward arrows'. A union operator state will have more than one of these.
// transitions map[int][]*nfaState // Transitions to different states (maps a character (int representation) to a _list_ of states). This is useful if one character can lead to multiple states, e.g. ab|aa
next*nfaState// The next state (not for alternation or kleene states)
isKleenebool// Identifies whether current node is a 0-state representing Kleene star
isQuestionbool// Identifies whether current node is a 0-state representing the question operator
isAlternationbool// Identifies whether current node is a 0-state representing an alternation
splitState*nfaState// Only for alternation states - the 'other' branch of the alternation ('next' is the first)
assertassertType// Type of assertion of current node - NONE means that the node doesn't assert anything
allCharsbool// Whether or not the state represents all characters (eg. a 'dot' metacharacter). A 'dot' node doesn't store any contents directly, as it would take up too much space
except[]rune// Only valid if allChars is true - match all characters _except_ the ones in this block. Useful for inverting character classes.
lookaroundRegexstring// Only for lookaround states - Contents of the regex that the lookaround state holds
lookaroundNFA*nfaState// Holds the NFA of the lookaroundRegex - if it exists
lookaroundNumCaptureGroupsint// Number of capturing groups in lookaround regex if current node is a lookaround
groupBeginbool// Whether or not the node starts a capturing group
groupEndbool// Whether or not the node ends a capturing group
groupNumint// Which capturing group the node starts / ends
// The following properties depend on the current match - I should think about resetting them for every match.
zeroMatchFoundbool// Whether or not the state has been used for a zero-length match - only relevant for zero states
threadGroups[]Group// Assuming that a state is part of a 'thread' in the matching process, this array stores the indices of capturing groups in the current thread. As matches are found for this state, its groups will be copied over.
return!slices.Contains(slices.Concat(notDotChars,s.except),str[idx])// Return true only if the index isn't a 'notDotChar', or isn't one of the exception characters for the current node.
@ -222,74 +228,84 @@ func (s nfaState) isLookaround() bool {
ifst.isKleene{// A State representing a Kleene Star has transitions going out, which loop back to it. If all those transitions point to the same (single) state, then it must be a last state
transitionDests:=make([]*nfaState,0)
for_,v:=rangest.transitions{
transitionDests=append(transitionDests,v...)
}
ifallEqual(transitionDests...){
st.isLast=true
return
}
}
ifvisited[st]==true{
return
}
visited[st]=true
for_,states:=rangest.transitions{
fori:=rangestates{
ifstates[i]!=st{
verifyLastStatesHelper(states[i],visited)
}
}
}
}
// verifyLastStatesHelper performs the depth-first recursion needed for verifyLastStates
// // if len(state.transitions) == 1 && len(state.transitions[state.content]) == 1 && state.transitions[state.content][0] == state { // Eg. a*
// if st.numTransitions() == 1 { // Eg. a*
// var moreThanOneTrans bool // Dummy variable, check if all the transitions for the current's state's contents have a length of one
// for _, c := range st.content {
// if len(st.transitions[c]) != 1 || st.transitions[c][0] != st {
// moreThanOneTrans = true
// }
// }
// st.isLast = !moreThanOneTrans
// }
//
// if st.isKleene { // A State representing a Kleene Star has transitions going out, which loop back to it. If all those transitions point to the same (single) state, then it must be a last state