11 Commits

5 changed files with 259 additions and 259 deletions

View File

@@ -14,11 +14,11 @@ var notDotChars []rune
// the startState of the NFA representation of the regex, and the number of capturing
// groups in the regex.
type Reg struct {
start *State
start *nfaState
numGroups int
}
const CONCAT rune = '~'
const concatRune rune = 0xF0001
// Flags for shuntingYard - control its behavior
type ReFlag int
@@ -31,7 +31,7 @@ const (
)
func isOperator(c rune) bool {
if c == '+' || c == '?' || c == '*' || c == '|' || c == CONCAT {
if c == '+' || c == '?' || c == '*' || c == '|' || c == concatRune {
return true
}
return false
@@ -39,7 +39,7 @@ func isOperator(c rune) bool {
/* priority returns the priority of the given operator */
func priority(op rune) int {
precedence := []rune{'|', CONCAT, '+', '*', '?'}
precedence := []rune{'|', concatRune, '+', '*', '?'}
return slices.Index(precedence, op)
}
@@ -320,7 +320,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
if i < len(re_runes) && (re_runes[i] != '(' && re_runes[i] != NONCAPLPAREN_CHAR && re_runes[i] != '|' && re_runes[i] != '\\') || (i > 0 && re_runes[i-1] == '\\') { // Every character should be concatenated if it is escaped
if i < len(re_runes)-1 {
if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != '+' && re_runes[i+1] != '?' && re_runes[i+1] != ')' && re_runes[i+1] != '{' {
re_postfix = append(re_postfix, CONCAT)
re_postfix = append(re_postfix, concatRune)
}
}
}
@@ -450,21 +450,21 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
}
// 'regex' should now contain the lookaround regex, plus the characters at the start (which indicate pos/neg, ahead/behind)
// Now we should filter that out.
toAppend := postfixNode{nodetype: ASSERTION, startReps: 1, endReps: 1}
toAppend := postfixNode{nodetype: assertionNode, startReps: 1, endReps: 1}
if regex[0] == '<' { // Lookbehind
toAppend.lookaroundDir = LOOKBEHIND
toAppend.lookaroundDir = lookbehind
regex = regex[1:]
} else if regex[0] == '=' || regex[0] == '!' {
toAppend.lookaroundDir = LOOKAHEAD
toAppend.lookaroundDir = lookahead
} else {
return nil, fmt.Errorf("invalid lookaround")
}
// Positive or negative
if regex[0] == '=' { // Positive
toAppend.lookaroundSign = POSITIVE
toAppend.lookaroundSign = positive
toAppend.contents = []rune(regex[1:])
} else if regex[0] == '!' { // Negative
toAppend.lookaroundSign = NEGATIVE
toAppend.lookaroundSign = negative
toAppend.contents = []rune(regex[1:])
} else {
return nil, fmt.Errorf("invalid lookaround")
@@ -489,7 +489,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
topStack, _ = peek(opStack)
}
outQueueFinalElement, _ := peek(outQueue)
if (c == '*' && outQueueFinalElement.nodetype == KLEENE) || (c == '+' && outQueueFinalElement.nodetype == PLUS) { // You cannot apply a quantifier to a quantifier in this way
if (c == '*' && outQueueFinalElement.nodetype == kleeneNode) || (c == '+' && outQueueFinalElement.nodetype == plusNode) { // You cannot apply a quantifier to a quantifier in this way
return nil, fmt.Errorf("illegal use of token '%c'", c)
}
opStack = append(opStack, c)
@@ -739,7 +739,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
return nil, fmt.Errorf("invalid start range for numeric specifier")
}
if len(endRange) == 0 { // Case 3 above
endRangeNum = INFINITE_REPS
endRangeNum = infinite_reps
} else { // Case 2 above
var err error
endRangeNum, err = strconv.Atoi(string(endRange))
@@ -751,7 +751,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
idx := len(outQueue) - 1
// Get the last added node
if idx < 0 || outQueue[idx].nodetype == LPAREN {
if idx < 0 || outQueue[idx].nodetype == lparenNode {
return nil, fmt.Errorf("numeric specifier with no content")
}
outQueue[idx].startReps = startRangeNum
@@ -799,8 +799,8 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
// Thompson's algorithm. Constructs a finite-state automaton from the given postfix nodes.
// Returns a Reg holding the start state and the number of groups in the regex.
func thompson(re []postfixNode) (Reg, error) {
nfa := make([]*State, 0) // Stack of states
numGroups := 0 // Number of capturing groups
nfa := make([]*nfaState, 0) // Stack of states
numGroups := 0 // Number of capturing groups
// If thompson() receives an empty regex, then whatever was given to shuntingYard()
// was parsed away. This doesn't mean that the regex itself is empty.
@@ -814,11 +814,11 @@ func thompson(re []postfixNode) (Reg, error) {
}
for _, c := range re {
if c.nodetype == CHARACTER || c.nodetype == ASSERTION {
state := State{}
state.transitions = make(map[int][]*State)
if c.nodetype == characterNode || c.nodetype == assertionNode {
stateToAdd := nfaState{}
stateToAdd.transitions = make(map[int][]*nfaState)
if c.allChars {
state.allChars = true
stateToAdd.allChars = true
if len(c.except) != 0 {
// For each node that I am 'excepting' (eg. in an inverted character class):
// - If the node itself has exceptions, then the exceptions cancel out.
@@ -827,7 +827,7 @@ func thompson(re []postfixNode) (Reg, error) {
// - If the node doesn't have exceptions (allChars == false) then the contents of the node are added to the except list.
for _, node := range c.except {
if node.allChars {
state.allChars = false
stateToAdd.allChars = false
// For each postfixNode in node.except, extract the contents of the postfixNode. Concatenate them all,
// and add them to the state's _content_. As mentioned above, if the exception has exceptions, then we can match
// those.
@@ -840,7 +840,7 @@ func thompson(re []postfixNode) (Reg, error) {
}
return nodeContents
})...)
state.content = rune2Contents(nodeExceptChars)
stateToAdd.content = rune2Contents(nodeExceptChars)
} else {
charsToAdd := node.contents
if caseInsensitive {
@@ -848,7 +848,7 @@ func thompson(re []postfixNode) (Reg, error) {
return allCases(r, caseInsensitive)
})...)
}
state.except = append(state.except, charsToAdd...)
stateToAdd.except = append(stateToAdd.except, charsToAdd...)
}
}
}
@@ -861,43 +861,43 @@ func thompson(re []postfixNode) (Reg, error) {
return allCases(r, caseInsensitive)
})...)
}
state.content = stateContents(append([]int(state.content), []int(rune2Contents(runesToAdd))...))
state.output = make([]*State, 0)
state.output = append(state.output, &state)
state.isEmpty = false
if c.nodetype == ASSERTION {
state.isEmpty = true // This is a little weird. A lookaround has the 'isEmpty' flag set, even though it _isn't_ empty (the contents are the regex). But, there's so much error-checking that relies on this flag that it's better to keep it this way.
state.content = newContents(EPSILON) // Ideally, an assertion shouldn't have any content, since it doesn't say anything about the content of string
stateToAdd.content = stateContents(append([]int(stateToAdd.content), []int(rune2Contents(runesToAdd))...))
stateToAdd.output = make([]*nfaState, 0)
stateToAdd.output = append(stateToAdd.output, &stateToAdd)
stateToAdd.isEmpty = false
if c.nodetype == assertionNode {
stateToAdd.isEmpty = true // This is a little weird. A lookaround has the 'isEmpty' flag set, even though it _isn't_ empty (the contents are the regex). But, there's so much error-checking that relies on this flag that it's better to keep it this way.
stateToAdd.content = newContents(EPSILON) // Ideally, an assertion shouldn't have any content, since it doesn't say anything about the content of string
if c.lookaroundDir == 0 || c.lookaroundSign == 0 {
switch c.contents[0] {
case '^':
state.assert = SOS
stateToAdd.assert = sosAssert
case '$':
state.assert = EOS
stateToAdd.assert = eosAssert
case 'b':
state.assert = WBOUND
stateToAdd.assert = wboundAssert
case 'B':
state.assert = NONWBOUND
stateToAdd.assert = nonwboundAssert
}
} else { // Lookaround
state.lookaroundRegex = string(c.contents)
if c.lookaroundDir == LOOKAHEAD {
if c.lookaroundSign == POSITIVE {
state.assert = PLA
stateToAdd.lookaroundRegex = string(c.contents)
if c.lookaroundDir == lookahead {
if c.lookaroundSign == positive {
stateToAdd.assert = plaAssert
}
if c.lookaroundSign == NEGATIVE {
state.assert = NLA
if c.lookaroundSign == negative {
stateToAdd.assert = nlaAssert
}
}
if c.lookaroundDir == LOOKBEHIND {
if c.lookaroundSign == POSITIVE {
state.assert = PLB
if c.lookaroundDir == lookbehind {
if c.lookaroundSign == positive {
stateToAdd.assert = plbAssert
}
if c.lookaroundSign == NEGATIVE {
state.assert = NLB
if c.lookaroundSign == negative {
stateToAdd.assert = nlbAssert
}
}
tmpRe, err := shuntingYard(state.lookaroundRegex)
tmpRe, err := shuntingYard(stateToAdd.lookaroundRegex)
if err != nil {
return Reg{}, fmt.Errorf("error parsing lookaround: %w", err)
}
@@ -905,28 +905,28 @@ func thompson(re []postfixNode) (Reg, error) {
if err != nil {
return Reg{}, fmt.Errorf("error compiling lookaround: %w", err)
}
state.lookaroundNFA = reg.start
state.lookaroundNumCaptureGroups = reg.numGroups
stateToAdd.lookaroundNFA = reg.start
stateToAdd.lookaroundNumCaptureGroups = reg.numGroups
}
}
// Replace ESC_BACKSLASH with actual backslash, so that we can actually check if we encounter it
replaceByValue([]int(state.content), int(ESC_BACKSLASH), '\\')
replaceByValue(state.except, ESC_BACKSLASH, '\\')
replaceByValue([]int(stateToAdd.content), int(ESC_BACKSLASH), '\\')
replaceByValue(stateToAdd.except, ESC_BACKSLASH, '\\')
nfa = append(nfa, &state)
nfa = append(nfa, &stateToAdd)
}
if c.nodetype == LPAREN || c.nodetype == RPAREN {
s := &State{}
s.assert = NONE
if c.nodetype == lparenNode || c.nodetype == rparenNode {
s := &nfaState{}
s.assert = noneAssert
s.content = newContents(EPSILON)
s.isEmpty = true
s.output = make([]*State, 0)
s.output = make([]*nfaState, 0)
s.output = append(s.output, s)
s.transitions = make(map[int][]*State)
s.transitions = make(map[int][]*nfaState)
// LPAREN nodes are just added normally
if c.nodetype == LPAREN {
if c.nodetype == lparenNode {
numGroups++
s.groupBegin = true
s.groupNum = numGroups
@@ -940,7 +940,7 @@ func thompson(re []postfixNode) (Reg, error) {
// If the middle node doesn't exist (ie. something like '()' ), that's fine, I just connect the LPAREN
// and RPAREN nodes.
// If neither node exists, that's a problem so I return an error.
if c.nodetype == RPAREN {
if c.nodetype == rparenNode {
s.groupEnd = true
middleNode, err1 := pop(&nfa)
lparenNode, err2 := pop(&nfa)
@@ -969,9 +969,9 @@ func thompson(re []postfixNode) (Reg, error) {
}
}
}
if c.nodetype == CHARCLASS { // A Character class consists of all the nodes in it, alternated
if c.nodetype == charclassNode { // A Character class consists of all the nodes in it, alternated
// Map the list of nodes to a list of states, each state containing the contents of a specific node
states := funcMap(c.nodeContents, func(node postfixNode) *State {
states := funcMap(c.nodeContents, func(node postfixNode) *nfaState {
s := newState()
nodeContents := node.contents
if caseInsensitive {
@@ -989,14 +989,14 @@ func thompson(re []postfixNode) (Reg, error) {
return &s
})
// Reduce the list of states down to a single state by alternating them
toAdd := funcReduce(states, func(s1 *State, s2 *State) *State {
toAdd := funcReduce(states, func(s1 *nfaState, s2 *nfaState) *nfaState {
return alternate(s1, s2)
})
nfa = append(nfa, toAdd)
}
// Must be an operator if it isn't a character
switch c.nodetype {
case CONCATENATE:
case concatenateNode:
s2 := mustPop(&nfa)
// Relax the requirements for concatenation a little bit - If
// the second element is not found ie. the postfixNodes look
@@ -1008,7 +1008,7 @@ func thompson(re []postfixNode) (Reg, error) {
s1 = concatenate(s1, s2)
nfa = append(nfa, s1)
}
case KLEENE: // Create a 0-state, concat the popped state after it, concat the 0-state after the popped state
case kleeneNode: // Create a 0-state, concat the popped state after it, concat the 0-state after the popped state
s1, err := pop(&nfa)
if err != nil {
return Reg{}, fmt.Errorf("error applying kleene star")
@@ -1018,7 +1018,7 @@ func thompson(re []postfixNode) (Reg, error) {
return Reg{}, err
}
nfa = append(nfa, stateToAdd)
case PLUS: // a+ is equivalent to aa*
case plusNode: // a+ is equivalent to aa*
s1 := mustPop(&nfa)
s2, err := kleene(*s1)
if err != nil {
@@ -1026,14 +1026,14 @@ func thompson(re []postfixNode) (Reg, error) {
}
s1 = concatenate(s1, s2)
nfa = append(nfa, s1)
case QUESTION: // ab? is equivalent to a(b|)
case questionNode: // ab? is equivalent to a(b|)
s1, err := pop(&nfa)
if err != nil {
return Reg{}, fmt.Errorf("error applying question operator")
}
s2 := question(s1)
nfa = append(nfa, s2)
case PIPE:
case pipeNode:
// A pipe operator doesn't actually need either operand to be present. If an operand isn't present,
// it is replaced with an implicit 'matchZeroLength' state (this is the same thing that we add at the top if our
// input has zero postfixNodes).
@@ -1065,8 +1065,8 @@ func thompson(re []postfixNode) (Reg, error) {
if c.endReps != -1 && c.endReps < c.startReps {
return Reg{}, fmt.Errorf("numeric specifier - start greater than end")
}
state := mustPop(&nfa)
var stateToAdd *State = nil
poppedState := mustPop(&nfa)
var stateToAdd *nfaState = nil
// Take advantage of the following facts:
// a{5} == aaaaa
// a{3,5} == aaaa?a?
@@ -1080,17 +1080,17 @@ func thompson(re []postfixNode) (Reg, error) {
// b. Encode the logic while parsing the string (shunting-yard). If I can expand the numeric specifier
// at this point, I can leave thompson untouched.
for i := 0; i < c.startReps; i++ { // Case 1
stateToAdd = concatenate(stateToAdd, cloneState(state))
stateToAdd = concatenate(stateToAdd, cloneState(poppedState))
}
if c.endReps == INFINITE_REPS { // Case 3
s2, err := kleene(*state)
if c.endReps == infinite_reps { // Case 3
s2, err := kleene(*poppedState)
if err != nil {
return Reg{}, err
}
stateToAdd = concatenate(stateToAdd, s2)
} else { // Case 2
for i := c.startReps; i < c.endReps; i++ {
stateToAdd = concatenate(stateToAdd, question(cloneState(state)))
stateToAdd = concatenate(stateToAdd, question(cloneState(poppedState)))
}
}
nfa = append(nfa, stateToAdd)

View File

@@ -61,7 +61,7 @@ func (g Group) isValid() bool {
// given slice. It returns the resulting states. If any of the resulting states is a 0-state,
// the second ret val is true.
// If a state begins or ends a capturing group, its 'thread' is updated to contain the correct index.
func takeZeroState(states []*State, numGroups int, idx int) (rtv []*State, isZero bool) {
func takeZeroState(states []*nfaState, numGroups int, idx int) (rtv []*nfaState, isZero bool) {
for _, state := range states {
if len(state.transitions[EPSILON]) > 0 {
for _, s := range state.transitions[EPSILON] {
@@ -93,9 +93,9 @@ func takeZeroState(states []*State, numGroups int, idx int) (rtv []*State, isZer
// from any of the given states, given the string and our position in it.
// It uses the same algorithm to find zero-states as the one inside the loop,
// so I should probably put it in a function.
func zeroMatchPossible(str []rune, idx int, numGroups int, states ...*State) bool {
func zeroMatchPossible(str []rune, idx int, numGroups int, states ...*nfaState) bool {
zeroStates, isZero := takeZeroState(states, numGroups, idx)
tempstates := make([]*State, 0, len(zeroStates)+len(states))
tempstates := make([]*nfaState, 0, len(zeroStates)+len(states))
tempstates = append(tempstates, states...)
tempstates = append(tempstates, zeroStates...)
num_appended := 0 // number of unique states added to tempstates
@@ -107,7 +107,7 @@ func zeroMatchPossible(str []rune, idx int, numGroups int, states ...*State) boo
}
}
for _, state := range tempstates {
if state.isEmpty && (state.assert == NONE || state.checkAssertion(str, idx)) && state.isLast {
if state.isEmpty && (state.assert == noneAssert || state.checkAssertion(str, idx)) && state.isLast {
return true
}
}
@@ -204,7 +204,7 @@ func FindAllMatches(regex Reg, str string) []Match {
// the next search should start from.
//
// Might return duplicates or overlapping indices, so care must be taken to prune the resulting array.
func findAllMatchesHelper(start *State, str []rune, offset int, numGroups int) (bool, Match, int) {
func findAllMatchesHelper(start *nfaState, str []rune, offset int, numGroups int) (bool, Match, int) {
// Base case - exit if offset exceeds string's length
if offset > len(str) {
// The second value here shouldn't be used, because we should exit when the third return value is greater than len(str)
@@ -221,14 +221,14 @@ func findAllMatchesHelper(start *State, str []rune, offset int, numGroups int) (
foundPath := false
startIdx := offset
endIdx := offset
currentStates := make([]*State, 0)
tempStates := make([]*State, 0) // Used to store states that should be used in next loop iteration
i := offset // Index in string
startingFrom := i // Store starting index
currentStates := make([]*nfaState, 0)
tempStates := make([]*nfaState, 0) // Used to store states that should be used in next loop iteration
i := offset // Index in string
startingFrom := i // Store starting index
// If the first state is an assertion, makes sure the assertion
// is true before we do _anything_ else.
if start.assert != NONE {
if start.assert != noneAssert {
if start.checkAssertion(str, offset) == false {
i++
return false, []Group{}, i
@@ -257,7 +257,7 @@ func findAllMatchesHelper(start *State, str []rune, offset int, numGroups int) (
for i < len(str) {
foundPath = false
zeroStates := make([]*State, 0)
zeroStates := make([]*nfaState, 0)
// Keep taking zero-states, until there are no more left to take
// Objective: If any of our current states have transitions to 0-states, replace them with the 0-state. Do this until there are no more transitions to 0-states, or there are no more unique 0-states to take.
zeroStates, isZero := takeZeroState(currentStates, numGroups, i)
@@ -275,11 +275,11 @@ func findAllMatchesHelper(start *State, str []rune, offset int, numGroups int) (
tempStates = nil
// Take any transitions corresponding to current character
numStatesMatched := 0 // The number of states which had at least 1 match for this round
assertionFailed := false // Whether or not an assertion failed for this round
lastStateInList := false // Whether or not a last state was in our list of states
var lastStatePtr *State = nil // Pointer to the last-state, if it was found
lastLookaroundInList := false // Whether or not a last state (that is a lookaround) was in our list of states
numStatesMatched := 0 // The number of states which had at least 1 match for this round
assertionFailed := false // Whether or not an assertion failed for this round
lastStateInList := false // Whether or not a last state was in our list of states
var lastStatePtr *nfaState = nil // Pointer to the last-state, if it was found
lastLookaroundInList := false // Whether or not a last state (that is a lookaround) was in our list of states
for _, state := range currentStates {
matches, numMatches := state.matchesFor(str, i)
if numMatches > 0 {
@@ -329,7 +329,7 @@ func findAllMatchesHelper(start *State, str []rune, offset int, numGroups int) (
// b. Empty
// c. Doesn't assert anything
for _, s := range currentStates {
if s.isLast && s.isEmpty && s.assert == NONE {
if s.isLast && s.isEmpty && s.assert == noneAssert {
lastStatePtr = s
lastStateInList = true
}
@@ -364,7 +364,7 @@ func findAllMatchesHelper(start *State, str []rune, offset int, numGroups int) (
}
return false, []Group{}, startIdx
}
currentStates = make([]*State, len(tempStates))
currentStates = make([]*nfaState, len(tempStates))
copy(currentStates, tempStates)
tempStates = nil
@@ -391,7 +391,7 @@ func findAllMatchesHelper(start *State, str []rune, offset int, numGroups int) (
// Only add the match if the start index is in bounds. If the state has an assertion,
// make sure the assertion checks out.
if state.isLast && i <= len(str) {
if state.assert == NONE || state.checkAssertion(str, i) {
if state.assert == noneAssert || state.checkAssertion(str, i) {
for j := 1; j < numGroups+1; j++ {
tempIndices[j] = state.threadGroups[j]
}

View File

@@ -8,16 +8,16 @@ import (
var whitespaceChars = []rune{' ', '\t', '\n'}
var digitChars = []rune{'0', '1', '2', '3', '4', '5', '6', '7', '8', '9'}
var wordChars = []rune("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_")
var LBRACKET rune = 0xF0001
var RBRACKET rune = 0xF0002
var ANY_CHAR rune = 0xF0003 // Represents any character - used for states where the allChars flag is on.
var LPAREN_CHAR rune = 0xF0004 // Parentheses in regex are concatenated with this - it acts as a pseudio-parentheses
var RPAREN_CHAR rune = 0xF0005
var NONCAPLPAREN_CHAR rune = 0xF0006 // Represents a non-capturing group's LPAREN
var ESC_BACKSLASH rune = 0xF0007 // Represents an escaped backslash
var CHAR_RANGE rune = 0xF0008 // Represents a character range
var LBRACKET rune = 0xF0002
var RBRACKET rune = 0xF0003
var ANY_CHAR rune = 0xF0004 // Represents any character - used for states where the allChars flag is on.
var LPAREN_CHAR rune = 0xF0005 // Parentheses in regex are concatenated with this - it acts as a pseudo-parenthesis
var RPAREN_CHAR rune = 0xF0006
var NONCAPLPAREN_CHAR rune = 0xF0007 // Represents a non-capturing group's LPAREN
var ESC_BACKSLASH rune = 0xF0008 // Represents an escaped backslash
var CHAR_RANGE rune = 0xF0009 // Represents a character range
var specialChars = []rune{'?', '*', '\\', '^', '$', '{', '}', '(', ')', '[', ']', '+', '|', '.', CONCAT, '<', '>', LBRACKET, RBRACKET, NONCAPLPAREN_CHAR}
var specialChars = []rune{'?', '*', '\\', '^', '$', '{', '}', '(', ')', '[', ']', '+', '|', '.', concatRune, '<', '>', LBRACKET, RBRACKET, NONCAPLPAREN_CHAR}
// An interface for int and rune, which are identical
type character interface {

View File

@@ -5,124 +5,124 @@ import (
"slices"
)
const EPSILON int = 0xF0000
const epsilon int = 0xF0000
type assertType int
const (
NONE assertType = iota
SOS
EOS
WBOUND
NONWBOUND
PLA // Positive lookahead
NLA // Negative lookahead
PLB // Positive lookbehind
NLB // Negative lookbehind
ALWAYS_TRUE // An assertion that is always true
noneAssert assertType = iota
sosAssert
eosAssert
wboundAssert
nonwboundAssert
plaAssert // Positive lookahead
nlaAssert // Negative lookahead
plbAssert // Positive lookbehind
nlbAssert // Negative lookbehind
alwaysTrueAssert // An assertion that is always true
)
type State struct {
content stateContents // Contents of current state
isEmpty bool // If it is empty - Union operator and Kleene star states will be empty
isLast bool // If it is the last state (acept state)
output []*State // The outputs of the current state ie. the 'outward arrows'. A union operator state will have more than one of these.
transitions map[int][]*State // Transitions to different states (maps a character (int representation) to a _list of states. This is useful if one character can lead multiple states eg. ab|aa)
isKleene bool // Identifies whether current node is a 0-state representing Kleene star
assert assertType // Type of assertion of current node - NONE means that the node doesn't assert anything
allChars bool // Whether or not the state represents all characters (eg. a 'dot' metacharacter). A 'dot' node doesn't store any contents directly, as it would take up too much space
except []rune // Only valid if allChars is true - match all characters _except_ the ones in this block. Useful for inverting character classes.
lookaroundRegex string // Only for lookaround states - Contents of the regex that the lookaround state holds
lookaroundNFA *State // Holds the NFA of the lookaroundRegex - if it exists
lookaroundNumCaptureGroups int // Number of capturing groups in lookaround regex if current node is a lookaround
groupBegin bool // Whether or not the node starts a capturing group
groupEnd bool // Whether or not the node ends a capturing group
groupNum int // Which capturing group the node starts / ends
type nfaState struct {
content stateContents // Contents of current state
isEmpty bool // If it is empty - Union operator and Kleene star states will be empty
isLast bool // If it is the last state (accept state)
output []*nfaState // The outputs of the current state ie. the 'outward arrows'. A union operator state will have more than one of these.
transitions map[int][]*nfaState // Transitions to different states (maps a character (int representation) to a _list_ of states. This is useful if one character can lead to multiple states eg. ab|aa)
isKleene bool // Identifies whether current node is a 0-state representing Kleene star
assert assertType // Type of assertion of current node - noneAssert means that the node doesn't assert anything
allChars bool // Whether or not the state represents all characters (eg. a 'dot' metacharacter). A 'dot' node doesn't store any contents directly, as it would take up too much space
except []rune // Only valid if allChars is true - match all characters _except_ the ones in this block. Useful for inverting character classes.
lookaroundRegex string // Only for lookaround states - Contents of the regex that the lookaround state holds
lookaroundNFA *nfaState // Holds the NFA of the lookaroundRegex - if it exists
lookaroundNumCaptureGroups int // Number of capturing groups in lookaround regex if current node is a lookaround
groupBegin bool // Whether or not the node starts a capturing group
groupEnd bool // Whether or not the node ends a capturing group
groupNum int // Which capturing group the node starts / ends
// The following properties depend on the current match - I should think about resetting them for every match.
zeroMatchFound bool // Whether or not the state has been used for a zero-length match - only relevant for zero states
threadGroups []Group // Assuming that a state is part of a 'thread' in the matching process, this array stores the indices of capturing groups in the current thread. As matches are found for this state, its groups will be copied over.
}
// Clones the NFA starting from the given state.
func cloneState(start *State) *State {
return cloneStateHelper(start, make(map[*State]*State))
func cloneState(start *nfaState) *nfaState {
return cloneStateHelper(start, make(map[*nfaState]*nfaState))
}
// Helper function for clone. The map is used to keep track of which states have
// already been copied, and which ones haven't.
// This function was created using output from Llama3.1:405B.
func cloneStateHelper(state *State, cloneMap map[*State]*State) *State {
func cloneStateHelper(stateToClone *nfaState, cloneMap map[*nfaState]*nfaState) *nfaState {
// Base case - if the clone exists in our map, return it.
if clone, exists := cloneMap[state]; exists {
if clone, exists := cloneMap[stateToClone]; exists {
return clone
}
if state == nil {
if stateToClone == nil {
return nil
}
// Recursive case - if the clone doesn't exist, create it, add it to the map,
// and recursively call for each of the transition states.
clone := &State{
content: append([]int{}, state.content...),
isEmpty: state.isEmpty,
isLast: state.isLast,
output: make([]*State, len(state.output)),
transitions: make(map[int][]*State),
isKleene: state.isKleene,
assert: state.assert,
zeroMatchFound: state.zeroMatchFound,
allChars: state.allChars,
except: append([]rune{}, state.except...),
lookaroundRegex: state.lookaroundRegex,
groupEnd: state.groupEnd,
groupBegin: state.groupBegin,
groupNum: state.groupNum,
clone := &nfaState{
content: append([]int{}, stateToClone.content...),
isEmpty: stateToClone.isEmpty,
isLast: stateToClone.isLast,
output: make([]*nfaState, len(stateToClone.output)),
transitions: make(map[int][]*nfaState),
isKleene: stateToClone.isKleene,
assert: stateToClone.assert,
zeroMatchFound: stateToClone.zeroMatchFound,
allChars: stateToClone.allChars,
except: append([]rune{}, stateToClone.except...),
lookaroundRegex: stateToClone.lookaroundRegex,
groupEnd: stateToClone.groupEnd,
groupBegin: stateToClone.groupBegin,
groupNum: stateToClone.groupNum,
}
cloneMap[state] = clone
for i, s := range state.output {
if s == state {
cloneMap[stateToClone] = clone
for i, s := range stateToClone.output {
if s == stateToClone {
clone.output[i] = clone
} else {
clone.output[i] = cloneStateHelper(s, cloneMap)
}
}
for k, v := range state.transitions {
clone.transitions[k] = make([]*State, len(v))
for k, v := range stateToClone.transitions {
clone.transitions[k] = make([]*nfaState, len(v))
for i, s := range v {
if s == state {
if s == stateToClone {
clone.transitions[k][i] = clone
} else {
clone.transitions[k][i] = cloneStateHelper(s, cloneMap)
}
}
}
if state.lookaroundNFA == state {
if stateToClone.lookaroundNFA == stateToClone {
clone.lookaroundNFA = clone
}
clone.lookaroundNFA = cloneStateHelper(state.lookaroundNFA, cloneMap)
clone.lookaroundNFA = cloneStateHelper(stateToClone.lookaroundNFA, cloneMap)
return clone
}
// Checks if the given state's assertion is true. Returns true if the given
// state doesn't have an assertion.
func (s State) checkAssertion(str []rune, idx int) bool {
if s.assert == ALWAYS_TRUE {
func (s nfaState) checkAssertion(str []rune, idx int) bool {
if s.assert == alwaysTrueAssert {
return true
}
if s.assert == SOS {
if s.assert == sosAssert {
// Single-line mode: Beginning of string
// Multi-line mode: Previous character was newline
return idx == 0 || (multilineMode && (idx > 0 && str[idx-1] == '\n'))
}
if s.assert == EOS {
if s.assert == eosAssert {
// Single-line mode: End of string
// Multi-line mode: current character is newline
// Index is at the end of the string, or it points to the last character which is a newline
return idx == len(str) || (multilineMode && str[idx] == '\n')
}
if s.assert == WBOUND {
if s.assert == wboundAssert {
return isWordBoundary(str, idx)
}
if s.assert == NONWBOUND {
if s.assert == nonwboundAssert {
return !isWordBoundary(str, idx)
}
if s.isLookaround() {
@@ -133,7 +133,7 @@ func (s State) checkAssertion(str []rune, idx int) bool {
startState := s.lookaroundNFA
var runesToMatch []rune
var strToMatch string
if s.assert == PLA || s.assert == NLA {
if s.assert == plaAssert || s.assert == nlaAssert {
runesToMatch = str[idx:]
} else {
runesToMatch = str[:idx]
@@ -149,21 +149,21 @@ func (s State) checkAssertion(str []rune, idx int) bool {
numMatchesFound := 0
for _, matchIdx := range matchIndices {
if s.assert == PLA || s.assert == NLA { // Lookahead - return true (or false) if at least one match starts at 0. Zero is used because the test-string _starts_ from idx.
if s.assert == plaAssert || s.assert == nlaAssert { // Lookahead - return true (or false) if at least one match starts at 0. Zero is used because the test-string _starts_ from idx.
if matchIdx[0].StartIdx == 0 {
numMatchesFound++
}
}
if s.assert == PLB || s.assert == NLB { // Lookbehind - return true (or false) if at least one match _ends_ at the current index.
if s.assert == plbAssert || s.assert == nlbAssert { // Lookbehind - return true (or false) if at least one match _ends_ at the current index.
if matchIdx[0].EndIdx == idx {
numMatchesFound++
}
}
}
if s.assert == PLA || s.assert == PLB { // Positive assertions want at least one match
if s.assert == plaAssert || s.assert == plbAssert { // Positive assertions want at least one match
return numMatchesFound > 0
}
if s.assert == NLA || s.assert == NLB { // Negative assertions only want zero matches
if s.assert == nlaAssert || s.assert == nlbAssert { // Negative assertions only want zero matches
return numMatchesFound == 0
}
}
@@ -171,8 +171,8 @@ func (s State) checkAssertion(str []rune, idx int) bool {
}
// Returns true if the contents of 's' contain the value at the given index of the given string
func (s State) contentContains(str []rune, idx int) bool {
if s.assert != NONE {
func (s nfaState) contentContains(str []rune, idx int) bool {
if s.assert != noneAssert {
return s.checkAssertion(str, idx)
}
if s.allChars {
@@ -182,19 +182,19 @@ func (s State) contentContains(str []rune, idx int) bool {
return slices.Contains(s.content, int(str[idx]))
}
func (s State) isLookaround() bool {
return s.assert == PLA || s.assert == PLB || s.assert == NLA || s.assert == NLB
func (s nfaState) isLookaround() bool {
return s.assert == plaAssert || s.assert == plbAssert || s.assert == nlaAssert || s.assert == nlbAssert
}
// Returns the matches for the character at the given index of the given string.
// Also returns the number of matches. Returns -1 if an assertion failed.
func (s State) matchesFor(str []rune, idx int) ([]*State, int) {
func (s nfaState) matchesFor(str []rune, idx int) ([]*nfaState, int) {
// Assertions can be viewed as 'checks'. If the check fails, we return
// an empty slice and -1.
// If it passes, we treat it like any other state, and return all the transitions.
if s.assert != NONE {
if s.assert != noneAssert {
if s.checkAssertion(str, idx) == false {
return make([]*State, 0), -1
return make([]*nfaState, 0), -1
}
}
listTransitions := s.transitions[int(str[idx])]
@@ -211,39 +211,39 @@ func (s State) matchesFor(str []rune, idx int) ([]*State, int) {
}
// verifyLastStatesHelper performs the depth-first recursion needed for verifyLastStates
func verifyLastStatesHelper(state *State, visited map[*State]bool) {
if len(state.transitions) == 0 {
state.isLast = true
func verifyLastStatesHelper(st *nfaState, visited map[*nfaState]bool) {
if len(st.transitions) == 0 {
st.isLast = true
return
}
// if len(state.transitions) == 1 && len(state.transitions[state.content]) == 1 && state.transitions[state.content][0] == state { // Eg. a*
if len(state.transitions) == 1 { // Eg. a*
if len(st.transitions) == 1 { // Eg. a*
var moreThanOneTrans bool // Flag: set if any of the current state's contents has a transition list that is not exactly one self-loop
for _, c := range state.content {
if len(state.transitions[c]) != 1 || state.transitions[c][0] != state {
for _, c := range st.content {
if len(st.transitions[c]) != 1 || st.transitions[c][0] != st {
moreThanOneTrans = true
}
}
state.isLast = !moreThanOneTrans
st.isLast = !moreThanOneTrans
}
if state.isKleene { // A State representing a Kleene Star has transitions going out, which loop back to it. If all those transitions point to the same (single) state, then it must be a last state
transitionDests := make([]*State, 0)
for _, v := range state.transitions {
if st.isKleene { // A State representing a Kleene Star has transitions going out, which loop back to it. If all those transitions point to the same (single) state, then it must be a last state
transitionDests := make([]*nfaState, 0)
for _, v := range st.transitions {
transitionDests = append(transitionDests, v...)
}
if allEqual(transitionDests...) {
state.isLast = true
st.isLast = true
return
}
}
if visited[state] == true {
if visited[st] == true {
return
}
visited[state] = true
for _, states := range state.transitions {
visited[st] = true
for _, states := range st.transitions {
for i := range states {
if states[i] != state {
if states[i] != st {
verifyLastStatesHelper(states[i], visited)
}
}
@@ -251,12 +251,12 @@ func verifyLastStatesHelper(state *State, visited map[*State]bool) {
}
// verifyLastStates enables the 'isLast' flag for the leaf nodes (last states)
func verifyLastStates(start []*State) {
verifyLastStatesHelper(start[0], make(map[*State]bool))
func verifyLastStates(start []*nfaState) {
verifyLastStatesHelper(start[0], make(map[*nfaState]bool))
}
// Concatenates s1 and s2, returns the start of the concatenation.
func concatenate(s1 *State, s2 *State) *State {
func concatenate(s1 *nfaState, s2 *nfaState) *nfaState {
if s1 == nil {
return s2
}
@@ -269,14 +269,14 @@ func concatenate(s1 *State, s2 *State) *State {
return s1
}
func kleene(s1 State) (*State, error) {
if s1.isEmpty && s1.assert != NONE {
func kleene(s1 nfaState) (*nfaState, error) {
if s1.isEmpty && s1.assert != noneAssert {
return nil, fmt.Errorf("previous token is not quantifiable")
}
toReturn := &State{}
toReturn.transitions = make(map[int][]*State)
toReturn.content = newContents(EPSILON)
toReturn := &nfaState{}
toReturn.transitions = make(map[int][]*nfaState)
toReturn.content = newContents(epsilon)
toReturn.isEmpty = true
toReturn.isKleene = true
toReturn.output = append(toReturn.output, toReturn)
@@ -291,9 +291,9 @@ func kleene(s1 State) (*State, error) {
return toReturn, nil
}
func alternate(s1 *State, s2 *State) *State {
toReturn := &State{}
toReturn.transitions = make(map[int][]*State)
func alternate(s1 *nfaState, s2 *nfaState) *nfaState {
toReturn := &nfaState{}
toReturn.transitions = make(map[int][]*nfaState)
toReturn.output = append(toReturn.output, s1.output...)
toReturn.output = append(toReturn.output, s2.output...)
// Unique append is used here (and elsewhere) to ensure that,
@@ -307,16 +307,16 @@ func alternate(s1 *State, s2 *State) *State {
for _, c := range s2.content {
toReturn.transitions[c], _ = unique_append(toReturn.transitions[c], s2)
}
toReturn.content = newContents(EPSILON)
toReturn.content = newContents(epsilon)
toReturn.isEmpty = true
return toReturn
}
func question(s1 *State) *State { // Use the fact that ab? == a(b|)
s2 := &State{}
s2.transitions = make(map[int][]*State)
s2.content = newContents(EPSILON)
func question(s1 *nfaState) *nfaState { // Use the fact that ab? == a(b|)
s2 := &nfaState{}
s2.transitions = make(map[int][]*nfaState)
s2.content = newContents(epsilon)
s2.output = append(s2.output, s2)
s2.isEmpty = true
s3 := alternate(s1, s2)
@@ -324,11 +324,11 @@ func question(s1 *State) *State { // Use the fact that ab? == a(b|)
}
// Creates and returns a new state with the 'default' values.
func newState() State {
ret := State{
output: make([]*State, 0),
transitions: make(map[int][]*State),
assert: NONE,
func newState() nfaState {
ret := nfaState{
output: make([]*nfaState, 0),
transitions: make(map[int][]*nfaState),
assert: noneAssert,
except: append([]rune{}, 0),
lookaroundRegex: "",
groupEnd: false,
@@ -339,10 +339,10 @@ func newState() State {
}
// Creates and returns a state that _always_ has a zero-length match.
func zeroLengthMatchState() State {
func zeroLengthMatchState() nfaState {
start := newState()
start.content = newContents(EPSILON)
start.content = newContents(epsilon)
start.isEmpty = true
start.assert = ALWAYS_TRUE
start.assert = alwaysTrueAssert
return start
}

View File

@@ -2,7 +2,7 @@ package regex
import "fmt"
type NodeType int
type nodeType int
// This is a slice containing all escapable characters that have special meaning.
// Eg. \b is word boundary, \w is word character etc.
@@ -10,28 +10,28 @@ var escapedChars []rune = []rune("wWdDbBnaftrvsS0")
// This is a list of the possible node types
const (
CHARACTER NodeType = iota
CHARCLASS
PIPE
CONCATENATE
KLEENE
QUESTION
PLUS
ASSERTION
LPAREN
RPAREN
characterNode nodeType = iota
charclassNode
pipeNode
concatenateNode
kleeneNode
questionNode
plusNode
assertionNode
lparenNode
rparenNode
)
// Helper constants for lookarounds
const POSITIVE = 1
const NEGATIVE = -1
const LOOKAHEAD = 1
const LOOKBEHIND = -1
const positive = 1
const negative = -1
const lookahead = 1
const lookbehind = -1
var INFINITE_REPS int = -1 // Represents infinite reps eg. the end range in {5,}
var infinite_reps int = -1 // Represents infinite reps eg. the end range in {5,}
// This represents a node in the postfix representation of the expression
type postfixNode struct {
nodetype NodeType
nodetype nodeType
contents []rune // Contents of the node
startReps int // Minimum number of times the node should be repeated - used with numeric specifiers
endReps int // Maximum number of times the node should be repeated - used with numeric specifiers
@@ -49,11 +49,11 @@ type postfixNode struct {
// it will not match.
func newCharClassNode(nodes []postfixNode, negated bool) postfixNode {
rtv := postfixNode{}
rtv.nodetype = CHARCLASS
rtv.nodetype = charclassNode
rtv.startReps = 1
rtv.endReps = 1
if negated {
rtv.nodetype = CHARACTER
rtv.nodetype = characterNode
rtv.contents = []rune{ANY_CHAR}
rtv.allChars = true
rtv.except = nodes
@@ -70,55 +70,55 @@ func newEscapedNode(c rune, inCharClass bool) (postfixNode, error) {
toReturn.endReps = 1
switch c {
case 's': // Whitespace
toReturn.nodetype = CHARACTER
toReturn.nodetype = characterNode
toReturn.contents = append(toReturn.contents, whitespaceChars...)
case 'S': // Non-whitespace
toReturn = newPostfixDotNode()
toReturn.except = append([]postfixNode{}, newPostfixNode(whitespaceChars...))
case 'd': // Digits
toReturn.nodetype = CHARACTER
toReturn.nodetype = characterNode
toReturn.contents = append(toReturn.contents, digitChars...)
case 'D': // Non-digits
toReturn = newPostfixDotNode()
toReturn.except = append([]postfixNode{}, newPostfixNode(digitChars...))
case 'w': // word character
toReturn.nodetype = CHARACTER
toReturn.nodetype = characterNode
toReturn.contents = append(toReturn.contents, wordChars...)
case 'W': // Non-word character
toReturn = newPostfixDotNode()
toReturn.except = append([]postfixNode{}, newPostfixNode(wordChars...))
case 'b', 'B':
if c == 'b' && inCharClass {
toReturn.nodetype = CHARACTER
toReturn.nodetype = characterNode
toReturn.contents = append(toReturn.contents, rune(8))
} else {
toReturn.nodetype = ASSERTION
toReturn.nodetype = assertionNode
toReturn.contents = append(toReturn.contents, c)
}
case 'n': // Newline character
toReturn.nodetype = CHARACTER
toReturn.nodetype = characterNode
toReturn.contents = append(toReturn.contents, '\n')
case '0': // NULL character
toReturn.nodetype = CHARACTER
toReturn.nodetype = characterNode
toReturn.contents = append(toReturn.contents, rune(0))
case 'a': // Bell character
toReturn.nodetype = CHARACTER
toReturn.nodetype = characterNode
toReturn.contents = append(toReturn.contents, rune(7))
case 'f': // Form feed character
toReturn.nodetype = CHARACTER
toReturn.nodetype = characterNode
toReturn.contents = append(toReturn.contents, rune(12))
case 't': // Horizontal tab character
toReturn.nodetype = CHARACTER
toReturn.nodetype = characterNode
toReturn.contents = append(toReturn.contents, rune(9))
case 'r': // Carriage return
toReturn.nodetype = CHARACTER
toReturn.nodetype = characterNode
toReturn.contents = append(toReturn.contents, rune(13))
case 'v': // Vertical tab
toReturn.nodetype = CHARACTER
toReturn.nodetype = characterNode
toReturn.contents = append(toReturn.contents, rune(11))
case '-': // Literal hyphen - only in character class
if inCharClass {
toReturn.nodetype = CHARACTER
toReturn.nodetype = characterNode
toReturn.contents = append(toReturn.contents, '-')
} else {
return postfixNode{}, fmt.Errorf("invalid escape character")
@@ -127,7 +127,7 @@ func newEscapedNode(c rune, inCharClass bool) (postfixNode, error) {
if isNormalChar(c) { // Normal characters cannot be escaped
return postfixNode{}, fmt.Errorf("invalid escape character")
}
toReturn.nodetype = CHARACTER
toReturn.nodetype = characterNode
toReturn.contents = append(toReturn.contents, c)
}
return toReturn, nil
@@ -142,36 +142,36 @@ func newPostfixNode(contents ...rune) postfixNode {
to_return.startReps = 1
to_return.endReps = 1
if len(contents) > 1 { // If the node has more than one element, it must be a character class - the type must be CHARACTER
to_return.nodetype = CHARACTER
to_return.nodetype = characterNode
to_return.contents = contents
} else { // Node has one element, could be anything
switch contents[0] {
case '+':
to_return.nodetype = PLUS
to_return.nodetype = plusNode
case '?':
to_return.nodetype = QUESTION
to_return.nodetype = questionNode
case '*':
to_return.nodetype = KLEENE
to_return.nodetype = kleeneNode
case '|':
to_return.nodetype = PIPE
case CONCAT:
to_return.nodetype = CONCATENATE
to_return.nodetype = pipeNode
case concatRune:
to_return.nodetype = concatenateNode
case '^', '$':
to_return.nodetype = ASSERTION
to_return.nodetype = assertionNode
case '(':
to_return.nodetype = LPAREN
to_return.nodetype = lparenNode
case ')':
to_return.nodetype = RPAREN
to_return.nodetype = rparenNode
default:
to_return.nodetype = CHARACTER
to_return.nodetype = characterNode
}
to_return.contents = append(to_return.contents, contents...)
// Special cases for LPAREN and RPAREN - they have special characters defined for them
if to_return.nodetype == LPAREN {
if to_return.nodetype == lparenNode {
to_return.contents = []rune{LPAREN_CHAR}
}
if to_return.nodetype == RPAREN {
if to_return.nodetype == rparenNode {
to_return.contents = []rune{RPAREN_CHAR}
}
}
@@ -183,7 +183,7 @@ func newPostfixDotNode() postfixNode {
toReturn := postfixNode{}
toReturn.startReps = 1
toReturn.endReps = 1
toReturn.nodetype = CHARACTER
toReturn.nodetype = characterNode
toReturn.allChars = true
toReturn.contents = []rune{ANY_CHAR}
return toReturn
@@ -194,7 +194,7 @@ func newPostfixCharNode(contents ...rune) postfixNode {
toReturn := postfixNode{}
toReturn.startReps = 1
toReturn.endReps = 1
toReturn.nodetype = CHARACTER
toReturn.nodetype = characterNode
toReturn.contents = append(toReturn.contents, contents...)
return toReturn
}