package main
import (
"slices"
)
// EPSILON is the content value given to states that do not stand for a
// character of their own: kleene, alternate and question all build their
// empty states with newContents(EPSILON).
// NOTE(review): 0xF0000 lies in a Unicode private-use plane, presumably so
// that it cannot collide with a character from real input - confirm.
const EPSILON int = 0xF0000
// assertType identifies the kind of assertion a State performs. The checks
// themselves live in State.checkAssertion.
type assertType int
const (
NONE assertType = iota // The state doesn't assert anything
SOS // Start of string - passes only at index 0
EOS // End of string - passes at the end of the string, or on a final newline
WBOUND // Word boundary (decided by isWordBoundary)
NONWBOUND // Not a word boundary
PLA // Positive lookahead
NLA // Negative lookahead
PLB // Positive lookbehind
NLB // Negative lookbehind
)
// State is a single node of the NFA. Besides its outgoing transitions it
// carries the flags that mark it as an empty state, Kleene-star state,
// assertion, lookaround or capturing-group boundary.
type State struct {
content stateContents // Contents of current state - the (int) character values this state matches
isEmpty bool // If it is empty - Union operator and Kleene star states will be empty
isLast bool // If it is the last state (accept state)
output [ ] * State // The outputs of the current state i.e. the 'outward arrows'. A union operator state will have more than one of these.
transitions map [ int ] [ ] * State // Transitions to different states. Maps a character (int representation) to a list of states, which is useful if one character can lead to multiple states, e.g. ab|aa
isKleene bool // Identifies whether current node is a 0-state representing Kleene star
assert assertType // Type of assertion of current node - NONE means that the node doesn't assert anything
allChars bool // Whether or not the state represents all characters (e.g. a 'dot' metacharacter). A 'dot' node doesn't store any contents directly, as it would take up too much space
except [ ] rune // Only valid if allChars is true - match all characters _except_ the ones in this block. Useful for inverting character classes.
lookaroundRegex string // Only for lookaround states - Contents of the regex that the lookaround state holds
lookaroundNFA * State // Holds the NFA of the lookaroundRegex - if it exists
lookaroundNumCaptureGroups int // Number of capturing groups in lookaround regex if current node is a lookaround
groupBegin bool // Whether or not the node starts a capturing group
groupEnd bool // Whether or not the node ends a capturing group
groupNum int // Which capturing group the node starts / ends
// The following properties depend on the current match.
// TODO: consider resetting them for every match.
zeroMatchFound bool // Whether or not the state has been used for a zero-length match - only relevant for zero states
threadGroups [ ] Group // Assuming that a state is part of a 'thread' in the matching process, this array stores the indices of capturing groups in the current thread. As matches are found for this state, its groups will be copied over.
}
// cloneState returns a deep copy of the NFA reachable from start.
// A fresh bookkeeping map is handed to cloneStateHelper so that every state
// is copied exactly once, even when the graph contains cycles.
func cloneState(start *State) *State {
	copied := make(map[*State]*State)
	return cloneStateHelper(start, copied)
}
// cloneStateHelper is the recursive worker behind cloneState.
//
// state is the node to copy; cloneMap maps every original state that has
// already been copied to its clone. It returns the clone of state, or nil if
// state is nil.
//
// The clone is registered in cloneMap before any recursion happens, so cycles
// and self-references (in output, transitions, or lookaroundNFA) resolve to
// the clone itself through the map lookup, with no special-casing.
//
// threadGroups is intentionally not copied: it holds per-match data.
//
// This function was created using output from Llama3.1:405B.
func cloneStateHelper(state *State, cloneMap map[*State]*State) *State {
	if state == nil {
		return nil
	}
	// Base case - if the clone exists in our map, return it.
	if clone, exists := cloneMap[state]; exists {
		return clone
	}

	// Recursive case - create the clone, add it to the map, and then
	// recursively clone every state it points to.
	clone := &State{
		content:         append([]int{}, state.content...),
		isEmpty:         state.isEmpty,
		isLast:          state.isLast,
		output:          make([]*State, len(state.output)),
		transitions:     make(map[int][]*State),
		isKleene:        state.isKleene,
		assert:          state.assert,
		zeroMatchFound:  state.zeroMatchFound,
		allChars:        state.allChars,
		except:          append([]rune{}, state.except...),
		lookaroundRegex: state.lookaroundRegex,
		// checkAssertion reads this count together with lookaroundNFA, so a
		// cloned lookaround must carry it as well.
		lookaroundNumCaptureGroups: state.lookaroundNumCaptureGroups,
		groupEnd:                   state.groupEnd,
		groupBegin:                 state.groupBegin,
		groupNum:                   state.groupNum,
	}
	cloneMap[state] = clone

	for i, s := range state.output {
		clone.output[i] = cloneStateHelper(s, cloneMap)
	}
	for k, v := range state.transitions {
		dests := make([]*State, len(v))
		for i, s := range v {
			dests[i] = cloneStateHelper(s, cloneMap)
		}
		clone.transitions[k] = dests
	}
	clone.lookaroundNFA = cloneStateHelper(state.lookaroundNFA, cloneMap)
	return clone
}
// checkAssertion reports whether the assertion carried by s holds at index
// idx of str. A state without an assertion always passes.
//
// Lookaround states run their pre-built lookaroundNFA over part of the input:
// lookaheads are tested against str[idx:], lookbehinds against str[:idx].
func (s State) checkAssertion(str []rune, idx int) bool {
	switch s.assert {
	case SOS:
		return idx == 0
	case EOS:
		// Either we are past the final character, or we are sitting on a
		// final newline.
		last := len(str) - 1
		return idx == len(str) || (idx == last && str[last] == '\n')
	case WBOUND:
		return isWordBoundary(str, idx)
	case NONWBOUND:
		return !isWordBoundary(str, idx)
	}
	if !s.isLookaround() {
		return true
	}

	lookahead := s.assert == PLA || s.assert == NLA
	positive := s.assert == PLA || s.assert == PLB

	// Pick the part of the input the lookaround is allowed to see.
	var window []rune
	if lookahead {
		window = str[idx:]
	} else {
		window = str[:idx]
	}

	reg := Reg{s.lookaroundNFA, s.lookaroundNumCaptureGroups}
	hits := 0
	for _, m := range FindAllMatches(reg, string(window)) {
		if lookahead {
			// The window _starts_ at idx, so a relevant match starts at 0.
			if m[0].startIdx == 0 {
				hits++
			}
		} else if m[0].endIdx == idx {
			// Lookbehind - a relevant match _ends_ at the current index.
			hits++
		}
	}

	if positive {
		// Positive assertions want at least one match.
		return hits > 0
	}
	// Negative assertions only want zero matches.
	return hits == 0
}
// contentContains reports whether s accepts the character at str[idx].
//   - Assertion states defer to checkAssertion.
//   - allChars states accept everything that is neither in notDotChars nor in
//     the state's own exception list.
//   - Every other state accepts the character only if it is in its content.
func (s State) contentContains(str []rune, idx int) bool {
	switch {
	case s.assert != NONE:
		return s.checkAssertion(str, idx)
	case s.allChars:
		excluded := slices.Concat(notDotChars, s.except)
		return !slices.Contains(excluded, str[idx])
	default:
		return slices.Contains(s.content, int(str[idx]))
	}
}
// isLookaround reports whether s is one of the four lookaround assertions
// (positive/negative lookahead or lookbehind).
func (s State) isLookaround() bool {
	switch s.assert {
	case PLA, PLB, NLA, NLB:
		return true
	default:
		return false
	}
}
// matchesFor returns the states that s can move to on the character at
// str[idx], together with how many there are.
//
// Assertions can be viewed as 'checks'. If s carries an assertion and the
// check fails, an empty slice and -1 are returned. If it passes, s is treated
// like any other state and all matching transitions are returned.
//
// The returned slice never shares spare capacity with the state's transition
// table, so appending to it cannot overwrite data reachable from s.
func (s State) matchesFor(str []rune, idx int) ([]*State, int) {
	if s.assert != NONE && !s.checkAssertion(str, idx) {
		return make([]*State, 0), -1
	}

	ch := str[idx]
	direct := s.transitions[int(ch)]
	// Cap the capacity at the length: the appends below must allocate a new
	// backing array instead of writing into the one owned by s.transitions.
	listTransitions := direct[:len(direct):len(direct)]

	// An allChars destination is added to the list of matches if:
	// a. The current character isn't a 'notDotChars' character. In single line
	//    mode, this includes newline. In multiline mode, it doesn't.
	// b. The current character isn't in the destination's exception list.
	// Condition (a) is the same for every destination, so check it once.
	if !slices.Contains(notDotChars, ch) {
		for _, dest := range s.transitions[int(ANY_CHAR)] {
			if !slices.Contains(dest.except, ch) {
				listTransitions = append(listTransitions, dest)
			}
		}
	}
	return listTransitions, len(listTransitions)
}
// NFA groups a start state with a list of output states.
// NOTE(review): nothing in this part of the file refers to this type - confirm
// whether it is still in use.
type NFA struct {
start State
outputs [ ] State
}
// verifyLastStatesHelper performs the depth-first walk needed by
// verifyLastStates, setting isLast on the states it decides are last states.
// visited records the states whose transitions have already been followed.
func verifyLastStatesHelper(state *State, visited map[*State]bool) {
	// A state with no outgoing transitions is a leaf, and so a last state.
	if len(state.transitions) == 0 {
		state.isLast = true
		return
	}

	// Exactly one transition key (eg. a*): the state is a last state only if,
	// for each of its content values, the sole destination is the state itself.
	if len(state.transitions) == 1 {
		onlySelfLoops := true
		for _, c := range state.content {
			dests := state.transitions[c]
			if len(dests) != 1 || dests[0] != state {
				onlySelfLoops = false
				break
			}
		}
		state.isLast = onlySelfLoops
	}

	// A State representing a Kleene Star has transitions going out, which loop
	// back to it. If all those transitions point to the same (single) state,
	// then it must be a last state.
	if state.isKleene {
		allDests := make([]*State, 0)
		for _, dests := range state.transitions {
			allDests = append(allDests, dests...)
		}
		if allEqual(allDests...) {
			state.isLast = true
			return
		}
	}

	if visited[state] {
		return
	}
	visited[state] = true

	// Recurse into every destination other than the state itself.
	for _, dests := range state.transitions {
		for _, next := range dests {
			if next != state {
				verifyLastStatesHelper(next, visited)
			}
		}
	}
}
// verifyLastStates turns on the 'isLast' flag for the last states (such as
// leaf nodes) reachable from the first element of start. Only start[0] is
// used as the root of the walk.
func verifyLastStates(start []*State) {
	root := start[0]
	seen := make(map[*State]bool)
	verifyLastStatesHelper(root, seen)
}
// concatenate joins s2 onto the end of s1 and returns the start of the result.
// Every output state of s1 gains a transition to s2 for each value in s2's
// content, after which s1 takes over s2's outputs. A nil s1 yields s2 as-is.
func concatenate(s1 *State, s2 *State) *State {
	if s1 == nil {
		return s2
	}
	for _, tail := range s1.output {
		for _, c := range s2.content {
			linked, _ := unique_append(tail.transitions[c], s2)
			tail.transitions[c] = linked
		}
	}
	s1.output = s2.output
	return s1
}
// kleene wraps s1 in a Kleene-star construct and returns the new empty
// 'star' state. The star state is its own output. Every output of s1 is wired
// back to the star state, and the star state gets a transition into s1 for
// each value in s1's content.
//
// s1 is received by value, so the transitions out of the star state point at
// this function's copy of s1.
func kleene(s1 State) *State {
	star := &State{
		transitions: make(map[int][]*State),
		content:     newContents(EPSILON),
		isEmpty:     true,
		isKleene:    true,
	}
	star.output = append(star.output, star)

	// Loop back: outputs of s1 -> star state.
	for _, tail := range s1.output {
		for _, c := range star.content {
			tail.transitions[c], _ = unique_append(tail.transitions[c], star)
		}
	}

	// Enter the loop: star state -> s1.
	inner := &s1
	for _, c := range s1.content {
		star.transitions[c], _ = unique_append(star.transitions[c], inner)
	}
	return star
}
// alternate builds the union (s1|s2) and returns the new empty state that
// fronts it. The union's outputs are the outputs of s1 followed by those of
// s2, and it has a transition to each branch for every value in that branch's
// content.
func alternate(s1 *State, s2 *State) *State {
	union := &State{transitions: make(map[int][]*State)}
	union.output = append(union.output, s1.output...)
	union.output = append(union.output, s2.output...)

	// unique_append is used here (and elsewhere) so that, for any given
	// transition, a state is listed at most once. Listing the same state twice
	// for, say, the transition 'a' would make both entries match and produce
	// duplicate sets of match indices.
	for _, branch := range []*State{s1, s2} {
		for _, c := range branch.content {
			union.transitions[c], _ = unique_append(union.transitions[c], branch)
		}
	}

	union.content = newContents(EPSILON)
	union.isEmpty = true
	return union
}
// question makes s1 optional. It relies on the identity ab? == a(b|): s1 is
// alternated with a fresh empty state whose only output is itself.
func question(s1 *State) *State {
	empty := &State{
		transitions: make(map[int][]*State),
		content:     newContents(EPSILON),
		isEmpty:     true,
	}
	empty.output = append(empty.output, empty)
	return alternate(s1, empty)
}
// newState builds a State with the 'default' values: an empty transition map,
// no assertion, an exception list holding the single rune 0, and one output.
// All remaining fields keep their zero values.
//
// NOTE: the single output is the address of this function's local variable.
// Because the state is returned by value, the caller receives a copy whose
// output[0] points at that original local, not at the copy itself.
func newState() State {
	var ret State
	ret.transitions = make(map[int][]*State)
	ret.assert = NONE
	ret.except = append([]rune{}, 0)
	ret.output = append(make([]*State, 0), &ret)
	return ret
}