11 Commits

5 changed files with 259 additions and 259 deletions

View File

@@ -14,11 +14,11 @@ var notDotChars []rune
// the startState of the NFA representation of the regex, and the number of capturing // the startState of the NFA representation of the regex, and the number of capturing
// groups in the regex. // groups in the regex.
type Reg struct { type Reg struct {
start *State start *nfaState
numGroups int numGroups int
} }
const CONCAT rune = '~' const concatRune rune = 0xF0001
// Flags for shuntingYard - control its behavior // Flags for shuntingYard - control its behavior
type ReFlag int type ReFlag int
@@ -31,7 +31,7 @@ const (
) )
func isOperator(c rune) bool { func isOperator(c rune) bool {
if c == '+' || c == '?' || c == '*' || c == '|' || c == CONCAT { if c == '+' || c == '?' || c == '*' || c == '|' || c == concatRune {
return true return true
} }
return false return false
@@ -39,7 +39,7 @@ func isOperator(c rune) bool {
/* priority returns the priority of the given operator */ /* priority returns the priority of the given operator */
func priority(op rune) int { func priority(op rune) int {
precedence := []rune{'|', CONCAT, '+', '*', '?'} precedence := []rune{'|', concatRune, '+', '*', '?'}
return slices.Index(precedence, op) return slices.Index(precedence, op)
} }
@@ -320,7 +320,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
if i < len(re_runes) && (re_runes[i] != '(' && re_runes[i] != NONCAPLPAREN_CHAR && re_runes[i] != '|' && re_runes[i] != '\\') || (i > 0 && re_runes[i-1] == '\\') { // Every character should be concatenated if it is escaped if i < len(re_runes) && (re_runes[i] != '(' && re_runes[i] != NONCAPLPAREN_CHAR && re_runes[i] != '|' && re_runes[i] != '\\') || (i > 0 && re_runes[i-1] == '\\') { // Every character should be concatenated if it is escaped
if i < len(re_runes)-1 { if i < len(re_runes)-1 {
if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != '+' && re_runes[i+1] != '?' && re_runes[i+1] != ')' && re_runes[i+1] != '{' { if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != '+' && re_runes[i+1] != '?' && re_runes[i+1] != ')' && re_runes[i+1] != '{' {
re_postfix = append(re_postfix, CONCAT) re_postfix = append(re_postfix, concatRune)
} }
} }
} }
@@ -450,21 +450,21 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
} }
// 'regex' should now contain the lookaround regex, plus the characters at the start (which indicate pos/neg, ahead/behind) // 'regex' should now contain the lookaround regex, plus the characters at the start (which indicate pos/neg, ahead/behind)
// Now we should filter that out. // Now we should filter that out.
toAppend := postfixNode{nodetype: ASSERTION, startReps: 1, endReps: 1} toAppend := postfixNode{nodetype: assertionNode, startReps: 1, endReps: 1}
if regex[0] == '<' { // Lookbehind if regex[0] == '<' { // Lookbehind
toAppend.lookaroundDir = LOOKBEHIND toAppend.lookaroundDir = lookbehind
regex = regex[1:] regex = regex[1:]
} else if regex[0] == '=' || regex[0] == '!' { } else if regex[0] == '=' || regex[0] == '!' {
toAppend.lookaroundDir = LOOKAHEAD toAppend.lookaroundDir = lookahead
} else { } else {
return nil, fmt.Errorf("invalid lookaround") return nil, fmt.Errorf("invalid lookaround")
} }
// Positive or negative // Positive or negative
if regex[0] == '=' { // Positive if regex[0] == '=' { // Positive
toAppend.lookaroundSign = POSITIVE toAppend.lookaroundSign = positive
toAppend.contents = []rune(regex[1:]) toAppend.contents = []rune(regex[1:])
} else if regex[0] == '!' { // Negative } else if regex[0] == '!' { // Negative
toAppend.lookaroundSign = NEGATIVE toAppend.lookaroundSign = negative
toAppend.contents = []rune(regex[1:]) toAppend.contents = []rune(regex[1:])
} else { } else {
return nil, fmt.Errorf("invalid lookaround") return nil, fmt.Errorf("invalid lookaround")
@@ -489,7 +489,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
topStack, _ = peek(opStack) topStack, _ = peek(opStack)
} }
outQueueFinalElement, _ := peek(outQueue) outQueueFinalElement, _ := peek(outQueue)
if (c == '*' && outQueueFinalElement.nodetype == KLEENE) || (c == '+' && outQueueFinalElement.nodetype == PLUS) { // You cannot apply a quantifier to a quantifier in this way if (c == '*' && outQueueFinalElement.nodetype == kleeneNode) || (c == '+' && outQueueFinalElement.nodetype == plusNode) { // You cannot apply a quantifier to a quantifier in this way
return nil, fmt.Errorf("illegal use of token '%c'", c) return nil, fmt.Errorf("illegal use of token '%c'", c)
} }
opStack = append(opStack, c) opStack = append(opStack, c)
@@ -739,7 +739,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
return nil, fmt.Errorf("invalid start range for numeric specifier") return nil, fmt.Errorf("invalid start range for numeric specifier")
} }
if len(endRange) == 0 { // Case 3 above if len(endRange) == 0 { // Case 3 above
endRangeNum = INFINITE_REPS endRangeNum = infinite_reps
} else { // Case 2 above } else { // Case 2 above
var err error var err error
endRangeNum, err = strconv.Atoi(string(endRange)) endRangeNum, err = strconv.Atoi(string(endRange))
@@ -751,7 +751,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
idx := len(outQueue) - 1 idx := len(outQueue) - 1
// Get the last added node // Get the last added node
if idx < 0 || outQueue[idx].nodetype == LPAREN { if idx < 0 || outQueue[idx].nodetype == lparenNode {
return nil, fmt.Errorf("numeric specifier with no content") return nil, fmt.Errorf("numeric specifier with no content")
} }
outQueue[idx].startReps = startRangeNum outQueue[idx].startReps = startRangeNum
@@ -799,7 +799,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
// Thompson's algorithm. Constructs Finite-State Automaton from given string. // Thompson's algorithm. Constructs Finite-State Automaton from given string.
// Returns start state and number of groups in regex. // Returns start state and number of groups in regex.
func thompson(re []postfixNode) (Reg, error) { func thompson(re []postfixNode) (Reg, error) {
nfa := make([]*State, 0) // Stack of states nfa := make([]*nfaState, 0) // Stack of states
numGroups := 0 // Number of capturing groups numGroups := 0 // Number of capturing groups
// If thompson() receives an empty regex, then whatever was given to shuntingYard() // If thompson() receives an empty regex, then whatever was given to shuntingYard()
@@ -814,11 +814,11 @@ func thompson(re []postfixNode) (Reg, error) {
} }
for _, c := range re { for _, c := range re {
if c.nodetype == CHARACTER || c.nodetype == ASSERTION { if c.nodetype == characterNode || c.nodetype == assertionNode {
state := State{} stateToAdd := nfaState{}
state.transitions = make(map[int][]*State) stateToAdd.transitions = make(map[int][]*nfaState)
if c.allChars { if c.allChars {
state.allChars = true stateToAdd.allChars = true
if len(c.except) != 0 { if len(c.except) != 0 {
// For each node that I am 'excepting' (eg. in an inverted character class): // For each node that I am 'excepting' (eg. in an inverted character class):
// - If the node itself has exceptions, then the exceptions cancel out. // - If the node itself has exceptions, then the exceptions cancel out.
@@ -827,7 +827,7 @@ func thompson(re []postfixNode) (Reg, error) {
// - If the node doesn't have exceptions (allChars == false) then the contents of the node are added to the except list. // - If the node doesn't have exceptions (allChars == false) then the contents of the node are added to the except list.
for _, node := range c.except { for _, node := range c.except {
if node.allChars { if node.allChars {
state.allChars = false stateToAdd.allChars = false
// For each postfixNode in node.except, extract the contents of the postfixNode. Concatenate them all, // For each postfixNode in node.except, extract the contents of the postfixNode. Concatenate them all,
// and add them to the state's _content_. As mentioned above, if the exception has exceptions, then we can match // and add them to the state's _content_. As mentioned above, if the exception has exceptions, then we can match
// those. // those.
@@ -840,7 +840,7 @@ func thompson(re []postfixNode) (Reg, error) {
} }
return nodeContents return nodeContents
})...) })...)
state.content = rune2Contents(nodeExceptChars) stateToAdd.content = rune2Contents(nodeExceptChars)
} else { } else {
charsToAdd := node.contents charsToAdd := node.contents
if caseInsensitive { if caseInsensitive {
@@ -848,7 +848,7 @@ func thompson(re []postfixNode) (Reg, error) {
return allCases(r, caseInsensitive) return allCases(r, caseInsensitive)
})...) })...)
} }
state.except = append(state.except, charsToAdd...) stateToAdd.except = append(stateToAdd.except, charsToAdd...)
} }
} }
} }
@@ -861,43 +861,43 @@ func thompson(re []postfixNode) (Reg, error) {
return allCases(r, caseInsensitive) return allCases(r, caseInsensitive)
})...) })...)
} }
state.content = stateContents(append([]int(state.content), []int(rune2Contents(runesToAdd))...)) stateToAdd.content = stateContents(append([]int(stateToAdd.content), []int(rune2Contents(runesToAdd))...))
state.output = make([]*State, 0) stateToAdd.output = make([]*nfaState, 0)
state.output = append(state.output, &state) stateToAdd.output = append(stateToAdd.output, &stateToAdd)
state.isEmpty = false stateToAdd.isEmpty = false
if c.nodetype == ASSERTION { if c.nodetype == assertionNode {
state.isEmpty = true // This is a little weird. A lookaround has the 'isEmpty' flag set, even though it _isn't_ empty (the contents are the regex). But, there's so much error-checking that relies on this flag that it's better to keep it this way. stateToAdd.isEmpty = true // This is a little weird. A lookaround has the 'isEmpty' flag set, even though it _isn't_ empty (the contents are the regex). But, there's so much error-checking that relies on this flag that it's better to keep it this way.
state.content = newContents(EPSILON) // Ideally, an assertion shouldn't have any content, since it doesn't say anything about the content of string stateToAdd.content = newContents(EPSILON) // Ideally, an assertion shouldn't have any content, since it doesn't say anything about the content of string
if c.lookaroundDir == 0 || c.lookaroundSign == 0 { if c.lookaroundDir == 0 || c.lookaroundSign == 0 {
switch c.contents[0] { switch c.contents[0] {
case '^': case '^':
state.assert = SOS stateToAdd.assert = sosAssert
case '$': case '$':
state.assert = EOS stateToAdd.assert = eosAssert
case 'b': case 'b':
state.assert = WBOUND stateToAdd.assert = wboundAssert
case 'B': case 'B':
state.assert = NONWBOUND stateToAdd.assert = nonwboundAssert
} }
} else { // Lookaround } else { // Lookaround
state.lookaroundRegex = string(c.contents) stateToAdd.lookaroundRegex = string(c.contents)
if c.lookaroundDir == LOOKAHEAD { if c.lookaroundDir == lookahead {
if c.lookaroundSign == POSITIVE { if c.lookaroundSign == positive {
state.assert = PLA stateToAdd.assert = plaAssert
} }
if c.lookaroundSign == NEGATIVE { if c.lookaroundSign == negative {
state.assert = NLA stateToAdd.assert = nlaAssert
} }
} }
if c.lookaroundDir == LOOKBEHIND { if c.lookaroundDir == lookbehind {
if c.lookaroundSign == POSITIVE { if c.lookaroundSign == positive {
state.assert = PLB stateToAdd.assert = plbAssert
} }
if c.lookaroundSign == NEGATIVE { if c.lookaroundSign == negative {
state.assert = NLB stateToAdd.assert = nlbAssert
} }
} }
tmpRe, err := shuntingYard(state.lookaroundRegex) tmpRe, err := shuntingYard(stateToAdd.lookaroundRegex)
if err != nil { if err != nil {
return Reg{}, fmt.Errorf("error parsing lookaround: %w", err) return Reg{}, fmt.Errorf("error parsing lookaround: %w", err)
} }
@@ -905,28 +905,28 @@ func thompson(re []postfixNode) (Reg, error) {
if err != nil { if err != nil {
return Reg{}, fmt.Errorf("error compiling lookaround: %w", err) return Reg{}, fmt.Errorf("error compiling lookaround: %w", err)
} }
state.lookaroundNFA = reg.start stateToAdd.lookaroundNFA = reg.start
state.lookaroundNumCaptureGroups = reg.numGroups stateToAdd.lookaroundNumCaptureGroups = reg.numGroups
} }
} }
// Replace ESC_BACKSLASH with actual backslash, so that we can actually check if we encounter it // Replace ESC_BACKSLASH with actual backslash, so that we can actually check if we encounter it
replaceByValue([]int(state.content), int(ESC_BACKSLASH), '\\') replaceByValue([]int(stateToAdd.content), int(ESC_BACKSLASH), '\\')
replaceByValue(state.except, ESC_BACKSLASH, '\\') replaceByValue(stateToAdd.except, ESC_BACKSLASH, '\\')
nfa = append(nfa, &state) nfa = append(nfa, &stateToAdd)
} }
if c.nodetype == LPAREN || c.nodetype == RPAREN { if c.nodetype == lparenNode || c.nodetype == rparenNode {
s := &State{} s := &nfaState{}
s.assert = NONE s.assert = noneAssert
s.content = newContents(EPSILON) s.content = newContents(EPSILON)
s.isEmpty = true s.isEmpty = true
s.output = make([]*State, 0) s.output = make([]*nfaState, 0)
s.output = append(s.output, s) s.output = append(s.output, s)
s.transitions = make(map[int][]*State) s.transitions = make(map[int][]*nfaState)
// LPAREN nodes are just added normally // LPAREN nodes are just added normally
if c.nodetype == LPAREN { if c.nodetype == lparenNode {
numGroups++ numGroups++
s.groupBegin = true s.groupBegin = true
s.groupNum = numGroups s.groupNum = numGroups
@@ -940,7 +940,7 @@ func thompson(re []postfixNode) (Reg, error) {
// If the middle node doesn't exist (ie. something like '()' ), that's fine, I just connect the LPAREN // If the middle node doesn't exist (ie. something like '()' ), that's fine, I just connect the LPAREN
// and RPAREN nodes. // and RPAREN nodes.
// If neither node exists, that's a problem so I return an error. // If neither node exists, that's a problem so I return an error.
if c.nodetype == RPAREN { if c.nodetype == rparenNode {
s.groupEnd = true s.groupEnd = true
middleNode, err1 := pop(&nfa) middleNode, err1 := pop(&nfa)
lparenNode, err2 := pop(&nfa) lparenNode, err2 := pop(&nfa)
@@ -969,9 +969,9 @@ func thompson(re []postfixNode) (Reg, error) {
} }
} }
} }
if c.nodetype == CHARCLASS { // A Character class consists of all the nodes in it, alternated if c.nodetype == charclassNode { // A Character class consists of all the nodes in it, alternated
// Map the list of nodes to a list of states, each state containing the contents of a specific node // Map the list of nodes to a list of states, each state containing the contents of a specific node
states := funcMap(c.nodeContents, func(node postfixNode) *State { states := funcMap(c.nodeContents, func(node postfixNode) *nfaState {
s := newState() s := newState()
nodeContents := node.contents nodeContents := node.contents
if caseInsensitive { if caseInsensitive {
@@ -989,14 +989,14 @@ func thompson(re []postfixNode) (Reg, error) {
return &s return &s
}) })
// Reduce the list of states down to a single state by alternating them // Reduce the list of states down to a single state by alternating them
toAdd := funcReduce(states, func(s1 *State, s2 *State) *State { toAdd := funcReduce(states, func(s1 *nfaState, s2 *nfaState) *nfaState {
return alternate(s1, s2) return alternate(s1, s2)
}) })
nfa = append(nfa, toAdd) nfa = append(nfa, toAdd)
} }
// Must be an operator if it isn't a character // Must be an operator if it isn't a character
switch c.nodetype { switch c.nodetype {
case CONCATENATE: case concatenateNode:
s2 := mustPop(&nfa) s2 := mustPop(&nfa)
// Relax the requirements for concatenation a little bit - If // Relax the requirements for concatenation a little bit - If
// the second element is not found ie. the postfixNodes look // the second element is not found ie. the postfixNodes look
@@ -1008,7 +1008,7 @@ func thompson(re []postfixNode) (Reg, error) {
s1 = concatenate(s1, s2) s1 = concatenate(s1, s2)
nfa = append(nfa, s1) nfa = append(nfa, s1)
} }
case KLEENE: // Create a 0-state, concat the popped state after it, concat the 0-state after the popped state case kleeneNode: // Create a 0-state, concat the popped state after it, concat the 0-state after the popped state
s1, err := pop(&nfa) s1, err := pop(&nfa)
if err != nil { if err != nil {
return Reg{}, fmt.Errorf("error applying kleene star") return Reg{}, fmt.Errorf("error applying kleene star")
@@ -1018,7 +1018,7 @@ func thompson(re []postfixNode) (Reg, error) {
return Reg{}, err return Reg{}, err
} }
nfa = append(nfa, stateToAdd) nfa = append(nfa, stateToAdd)
case PLUS: // a+ is equivalent to aa* case plusNode: // a+ is equivalent to aa*
s1 := mustPop(&nfa) s1 := mustPop(&nfa)
s2, err := kleene(*s1) s2, err := kleene(*s1)
if err != nil { if err != nil {
@@ -1026,14 +1026,14 @@ func thompson(re []postfixNode) (Reg, error) {
} }
s1 = concatenate(s1, s2) s1 = concatenate(s1, s2)
nfa = append(nfa, s1) nfa = append(nfa, s1)
case QUESTION: // ab? is equivalent to a(b|) case questionNode: // ab? is equivalent to a(b|)
s1, err := pop(&nfa) s1, err := pop(&nfa)
if err != nil { if err != nil {
return Reg{}, fmt.Errorf("error applying question operator") return Reg{}, fmt.Errorf("error applying question operator")
} }
s2 := question(s1) s2 := question(s1)
nfa = append(nfa, s2) nfa = append(nfa, s2)
case PIPE: case pipeNode:
// A pipe operator doesn't actually need either operand to be present. If an operand isn't present, // A pipe operator doesn't actually need either operand to be present. If an operand isn't present,
// it is replaced with an implicit 'matchZeroLength' state (this is the same thing that we add at the top if our // it is replaced with an implicit 'matchZeroLength' state (this is the same thing that we add at the top if our
// input has zero postfixNodes). // input has zero postfixNodes).
@@ -1065,8 +1065,8 @@ func thompson(re []postfixNode) (Reg, error) {
if c.endReps != -1 && c.endReps < c.startReps { if c.endReps != -1 && c.endReps < c.startReps {
return Reg{}, fmt.Errorf("numeric specifier - start greater than end") return Reg{}, fmt.Errorf("numeric specifier - start greater than end")
} }
state := mustPop(&nfa) poppedState := mustPop(&nfa)
var stateToAdd *State = nil var stateToAdd *nfaState = nil
// Take advantage of the following facts: // Take advantage of the following facts:
// a{5} == aaaaa // a{5} == aaaaa
// a{3,5} == aaaa?a? // a{3,5} == aaaa?a?
@@ -1080,17 +1080,17 @@ func thompson(re []postfixNode) (Reg, error) {
// b. Encode the logic while parsing the string (shunting-yard). If I can expand the numeric specifier // b. Encode the logic while parsing the string (shunting-yard). If I can expand the numeric specifier
// at this point, I can leave thompson untouched. // at this point, I can leave thompson untouched.
for i := 0; i < c.startReps; i++ { // Case 1 for i := 0; i < c.startReps; i++ { // Case 1
stateToAdd = concatenate(stateToAdd, cloneState(state)) stateToAdd = concatenate(stateToAdd, cloneState(poppedState))
} }
if c.endReps == INFINITE_REPS { // Case 3 if c.endReps == infinite_reps { // Case 3
s2, err := kleene(*state) s2, err := kleene(*poppedState)
if err != nil { if err != nil {
return Reg{}, err return Reg{}, err
} }
stateToAdd = concatenate(stateToAdd, s2) stateToAdd = concatenate(stateToAdd, s2)
} else { // Case 2 } else { // Case 2
for i := c.startReps; i < c.endReps; i++ { for i := c.startReps; i < c.endReps; i++ {
stateToAdd = concatenate(stateToAdd, question(cloneState(state))) stateToAdd = concatenate(stateToAdd, question(cloneState(poppedState)))
} }
} }
nfa = append(nfa, stateToAdd) nfa = append(nfa, stateToAdd)

View File

@@ -61,7 +61,7 @@ func (g Group) isValid() bool {
// given slice. It returns the resulting states. If any of the resulting states is a 0-state, // given slice. It returns the resulting states. If any of the resulting states is a 0-state,
// the second ret val is true. // the second ret val is true.
// If a state begins or ends a capturing group, its 'thread' is updated to contain the correct index. // If a state begins or ends a capturing group, its 'thread' is updated to contain the correct index.
func takeZeroState(states []*State, numGroups int, idx int) (rtv []*State, isZero bool) { func takeZeroState(states []*nfaState, numGroups int, idx int) (rtv []*nfaState, isZero bool) {
for _, state := range states { for _, state := range states {
if len(state.transitions[EPSILON]) > 0 { if len(state.transitions[EPSILON]) > 0 {
for _, s := range state.transitions[EPSILON] { for _, s := range state.transitions[EPSILON] {
@@ -93,9 +93,9 @@ func takeZeroState(states []*State, numGroups int, idx int) (rtv []*State, isZer
// from any of the given states, given the string and our position in it. // from any of the given states, given the string and our position in it.
// It uses the same algorithm to find zero-states as the one inside the loop, // It uses the same algorithm to find zero-states as the one inside the loop,
// so I should probably put it in a function. // so I should probably put it in a function.
func zeroMatchPossible(str []rune, idx int, numGroups int, states ...*State) bool { func zeroMatchPossible(str []rune, idx int, numGroups int, states ...*nfaState) bool {
zeroStates, isZero := takeZeroState(states, numGroups, idx) zeroStates, isZero := takeZeroState(states, numGroups, idx)
tempstates := make([]*State, 0, len(zeroStates)+len(states)) tempstates := make([]*nfaState, 0, len(zeroStates)+len(states))
tempstates = append(tempstates, states...) tempstates = append(tempstates, states...)
tempstates = append(tempstates, zeroStates...) tempstates = append(tempstates, zeroStates...)
num_appended := 0 // number of unique states addded to tempstates num_appended := 0 // number of unique states addded to tempstates
@@ -107,7 +107,7 @@ func zeroMatchPossible(str []rune, idx int, numGroups int, states ...*State) boo
} }
} }
for _, state := range tempstates { for _, state := range tempstates {
if state.isEmpty && (state.assert == NONE || state.checkAssertion(str, idx)) && state.isLast { if state.isEmpty && (state.assert == noneAssert || state.checkAssertion(str, idx)) && state.isLast {
return true return true
} }
} }
@@ -204,7 +204,7 @@ func FindAllMatches(regex Reg, str string) []Match {
// the next search should start from. // the next search should start from.
// //
// Might return duplicates or overlapping indices, so care must be taken to prune the resulting array. // Might return duplicates or overlapping indices, so care must be taken to prune the resulting array.
func findAllMatchesHelper(start *State, str []rune, offset int, numGroups int) (bool, Match, int) { func findAllMatchesHelper(start *nfaState, str []rune, offset int, numGroups int) (bool, Match, int) {
// Base case - exit if offset exceeds string's length // Base case - exit if offset exceeds string's length
if offset > len(str) { if offset > len(str) {
// The second value here shouldn't be used, because we should exit when the third return value is greater than len(str) // The second value here shouldn't be used, because we should exit when the third return value is greater than len(str)
@@ -221,14 +221,14 @@ func findAllMatchesHelper(start *State, str []rune, offset int, numGroups int) (
foundPath := false foundPath := false
startIdx := offset startIdx := offset
endIdx := offset endIdx := offset
currentStates := make([]*State, 0) currentStates := make([]*nfaState, 0)
tempStates := make([]*State, 0) // Used to store states that should be used in next loop iteration tempStates := make([]*nfaState, 0) // Used to store states that should be used in next loop iteration
i := offset // Index in string i := offset // Index in string
startingFrom := i // Store starting index startingFrom := i // Store starting index
// If the first state is an assertion, make sure the assertion // If the first state is an assertion, make sure the assertion
// is true before we do _anything_ else. // is true before we do _anything_ else.
if start.assert != NONE { if start.assert != noneAssert {
if start.checkAssertion(str, offset) == false { if start.checkAssertion(str, offset) == false {
i++ i++
return false, []Group{}, i return false, []Group{}, i
@@ -257,7 +257,7 @@ func findAllMatchesHelper(start *State, str []rune, offset int, numGroups int) (
for i < len(str) { for i < len(str) {
foundPath = false foundPath = false
zeroStates := make([]*State, 0) zeroStates := make([]*nfaState, 0)
// Keep taking zero-states, until there are no more left to take // Keep taking zero-states, until there are no more left to take
// Objective: If any of our current states have transitions to 0-states, replace them with the 0-state. Do this until there are no more transitions to 0-states, or there are no more unique 0-states to take. // Objective: If any of our current states have transitions to 0-states, replace them with the 0-state. Do this until there are no more transitions to 0-states, or there are no more unique 0-states to take.
zeroStates, isZero := takeZeroState(currentStates, numGroups, i) zeroStates, isZero := takeZeroState(currentStates, numGroups, i)
@@ -278,7 +278,7 @@ func findAllMatchesHelper(start *State, str []rune, offset int, numGroups int) (
numStatesMatched := 0 // The number of states which had at least 1 match for this round numStatesMatched := 0 // The number of states which had at least 1 match for this round
assertionFailed := false // Whether or not an assertion failed for this round assertionFailed := false // Whether or not an assertion failed for this round
lastStateInList := false // Whether or not a last state was in our list of states lastStateInList := false // Whether or not a last state was in our list of states
var lastStatePtr *State = nil // Pointer to the last-state, if it was found var lastStatePtr *nfaState = nil // Pointer to the last-state, if it was found
lastLookaroundInList := false // Whether or not a last state (that is a lookaround) was in our list of states lastLookaroundInList := false // Whether or not a last state (that is a lookaround) was in our list of states
for _, state := range currentStates { for _, state := range currentStates {
matches, numMatches := state.matchesFor(str, i) matches, numMatches := state.matchesFor(str, i)
@@ -329,7 +329,7 @@ func findAllMatchesHelper(start *State, str []rune, offset int, numGroups int) (
// b. Empty // b. Empty
// c. Doesn't assert anything // c. Doesn't assert anything
for _, s := range currentStates { for _, s := range currentStates {
if s.isLast && s.isEmpty && s.assert == NONE { if s.isLast && s.isEmpty && s.assert == noneAssert {
lastStatePtr = s lastStatePtr = s
lastStateInList = true lastStateInList = true
} }
@@ -364,7 +364,7 @@ func findAllMatchesHelper(start *State, str []rune, offset int, numGroups int) (
} }
return false, []Group{}, startIdx return false, []Group{}, startIdx
} }
currentStates = make([]*State, len(tempStates)) currentStates = make([]*nfaState, len(tempStates))
copy(currentStates, tempStates) copy(currentStates, tempStates)
tempStates = nil tempStates = nil
@@ -391,7 +391,7 @@ func findAllMatchesHelper(start *State, str []rune, offset int, numGroups int) (
// Only add the match if the start index is in bounds. If the state has an assertion, // Only add the match if the start index is in bounds. If the state has an assertion,
// make sure the assertion checks out. // make sure the assertion checks out.
if state.isLast && i <= len(str) { if state.isLast && i <= len(str) {
if state.assert == NONE || state.checkAssertion(str, i) { if state.assert == noneAssert || state.checkAssertion(str, i) {
for j := 1; j < numGroups+1; j++ { for j := 1; j < numGroups+1; j++ {
tempIndices[j] = state.threadGroups[j] tempIndices[j] = state.threadGroups[j]
} }

View File

@@ -8,16 +8,16 @@ import (
var whitespaceChars = []rune{' ', '\t', '\n'} var whitespaceChars = []rune{' ', '\t', '\n'}
var digitChars = []rune{'0', '1', '2', '3', '4', '5', '6', '7', '8', '9'} var digitChars = []rune{'0', '1', '2', '3', '4', '5', '6', '7', '8', '9'}
var wordChars = []rune("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_") var wordChars = []rune("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_")
var LBRACKET rune = 0xF0001 var LBRACKET rune = 0xF0002
var RBRACKET rune = 0xF0002 var RBRACKET rune = 0xF0003
var ANY_CHAR rune = 0xF0003 // Represents any character - used for states where the allChars flag is on. var ANY_CHAR rune = 0xF0004 // Represents any character - used for states where the allChars flag is on.
var LPAREN_CHAR rune = 0xF0004 // Parentheses in regex are concatenated with this - it acts as a pseudio-parentheses var LPAREN_CHAR rune = 0xF0005 // Parentheses in regex are concatenated with this - it acts as a pseudio-parentheses
var RPAREN_CHAR rune = 0xF0005 var RPAREN_CHAR rune = 0xF0006
var NONCAPLPAREN_CHAR rune = 0xF0006 // Represents a non-capturing group's LPAREN var NONCAPLPAREN_CHAR rune = 0xF0007 // Represents a non-capturing group's LPAREN
var ESC_BACKSLASH rune = 0xF0007 // Represents an escaped backslash var ESC_BACKSLASH rune = 0xF0008 // Represents an escaped backslash
var CHAR_RANGE rune = 0xF0008 // Represents a character range var CHAR_RANGE rune = 0xF0009 // Represents a character range
var specialChars = []rune{'?', '*', '\\', '^', '$', '{', '}', '(', ')', '[', ']', '+', '|', '.', CONCAT, '<', '>', LBRACKET, RBRACKET, NONCAPLPAREN_CHAR} var specialChars = []rune{'?', '*', '\\', '^', '$', '{', '}', '(', ')', '[', ']', '+', '|', '.', concatRune, '<', '>', LBRACKET, RBRACKET, NONCAPLPAREN_CHAR}
// An interface for int and rune, which are identical // An interface for int and rune, which are identical
type character interface { type character interface {

View File

@@ -5,35 +5,35 @@ import (
"slices" "slices"
) )
const EPSILON int = 0xF0000 const epsilon int = 0xF0000
type assertType int type assertType int
const ( const (
NONE assertType = iota noneAssert assertType = iota
SOS sosAssert
EOS eosAssert
WBOUND wboundAssert
NONWBOUND nonwboundAssert
PLA // Positive lookahead plaAssert // Positive lookahead
NLA // Negative lookahead nlaAssert // Negative lookahead
PLB // Positive lookbehind plbAssert // Positive lookbehind
NLB // Negative lookbehind nlbAssert // Negative lookbehind
ALWAYS_TRUE // An assertion that is always true alwaysTrueAssert // An assertion that is always true
) )
type State struct { type nfaState struct {
content stateContents // Contents of current state content stateContents // Contents of current state
isEmpty bool // If it is empty - Union operator and Kleene star states will be empty isEmpty bool // If it is empty - Union operator and Kleene star states will be empty
isLast bool // If it is the last state (acept state) isLast bool // If it is the last state (acept state)
output []*State // The outputs of the current state ie. the 'outward arrows'. A union operator state will have more than one of these. output []*nfaState // The outputs of the current state ie. the 'outward arrows'. A union operator state will have more than one of these.
transitions map[int][]*State // Transitions to different states (maps a character (int representation) to a _list of states. This is useful if one character can lead multiple states eg. ab|aa) transitions map[int][]*nfaState // Transitions to different states (maps a character (int representation) to a _list of states. This is useful if one character can lead multiple states eg. ab|aa)
isKleene bool // Identifies whether current node is a 0-state representing Kleene star isKleene bool // Identifies whether current node is a 0-state representing Kleene star
assert assertType // Type of assertion of current node - NONE means that the node doesn't assert anything assert assertType // Type of assertion of current node - NONE means that the node doesn't assert anything
allChars bool // Whether or not the state represents all characters (eg. a 'dot' metacharacter). A 'dot' node doesn't store any contents directly, as it would take up too much space allChars bool // Whether or not the state represents all characters (eg. a 'dot' metacharacter). A 'dot' node doesn't store any contents directly, as it would take up too much space
except []rune // Only valid if allChars is true - match all characters _except_ the ones in this block. Useful for inverting character classes. except []rune // Only valid if allChars is true - match all characters _except_ the ones in this block. Useful for inverting character classes.
lookaroundRegex string // Only for lookaround states - Contents of the regex that the lookaround state holds lookaroundRegex string // Only for lookaround states - Contents of the regex that the lookaround state holds
lookaroundNFA *State // Holds the NFA of the lookaroundRegex - if it exists lookaroundNFA *nfaState // Holds the NFA of the lookaroundRegex - if it exists
lookaroundNumCaptureGroups int // Number of capturing groups in lookaround regex if current node is a lookaround lookaroundNumCaptureGroups int // Number of capturing groups in lookaround regex if current node is a lookaround
groupBegin bool // Whether or not the node starts a capturing group groupBegin bool // Whether or not the node starts a capturing group
groupEnd bool // Whether or not the node ends a capturing group groupEnd bool // Whether or not the node ends a capturing group
@@ -44,85 +44,85 @@ type State struct {
} }
// Clones the NFA starting from the given state. // Clones the NFA starting from the given state.
func cloneState(start *State) *State { func cloneState(start *nfaState) *nfaState {
return cloneStateHelper(start, make(map[*State]*State)) return cloneStateHelper(start, make(map[*nfaState]*nfaState))
} }
// Helper function for clone. The map is used to keep track of which states have // Helper function for clone. The map is used to keep track of which states have
// already been copied, and which ones haven't. // already been copied, and which ones haven't.
// This function was created using output from Llama3.1:405B. // This function was created using output from Llama3.1:405B.
func cloneStateHelper(state *State, cloneMap map[*State]*State) *State { func cloneStateHelper(stateToClone *nfaState, cloneMap map[*nfaState]*nfaState) *nfaState {
// Base case - if the clone exists in our map, return it. // Base case - if the clone exists in our map, return it.
if clone, exists := cloneMap[state]; exists { if clone, exists := cloneMap[stateToClone]; exists {
return clone return clone
} }
if state == nil { if stateToClone == nil {
return nil return nil
} }
// Recursive case - if the clone doesn't exist, create it, add it to the map, // Recursive case - if the clone doesn't exist, create it, add it to the map,
// and recursively call for each of the transition states. // and recursively call for each of the transition states.
clone := &State{ clone := &nfaState{
content: append([]int{}, state.content...), content: append([]int{}, stateToClone.content...),
isEmpty: state.isEmpty, isEmpty: stateToClone.isEmpty,
isLast: state.isLast, isLast: stateToClone.isLast,
output: make([]*State, len(state.output)), output: make([]*nfaState, len(stateToClone.output)),
transitions: make(map[int][]*State), transitions: make(map[int][]*nfaState),
isKleene: state.isKleene, isKleene: stateToClone.isKleene,
assert: state.assert, assert: stateToClone.assert,
zeroMatchFound: state.zeroMatchFound, zeroMatchFound: stateToClone.zeroMatchFound,
allChars: state.allChars, allChars: stateToClone.allChars,
except: append([]rune{}, state.except...), except: append([]rune{}, stateToClone.except...),
lookaroundRegex: state.lookaroundRegex, lookaroundRegex: stateToClone.lookaroundRegex,
groupEnd: state.groupEnd, groupEnd: stateToClone.groupEnd,
groupBegin: state.groupBegin, groupBegin: stateToClone.groupBegin,
groupNum: state.groupNum, groupNum: stateToClone.groupNum,
} }
cloneMap[state] = clone cloneMap[stateToClone] = clone
for i, s := range state.output { for i, s := range stateToClone.output {
if s == state { if s == stateToClone {
clone.output[i] = clone clone.output[i] = clone
} else { } else {
clone.output[i] = cloneStateHelper(s, cloneMap) clone.output[i] = cloneStateHelper(s, cloneMap)
} }
} }
for k, v := range state.transitions { for k, v := range stateToClone.transitions {
clone.transitions[k] = make([]*State, len(v)) clone.transitions[k] = make([]*nfaState, len(v))
for i, s := range v { for i, s := range v {
if s == state { if s == stateToClone {
clone.transitions[k][i] = clone clone.transitions[k][i] = clone
} else { } else {
clone.transitions[k][i] = cloneStateHelper(s, cloneMap) clone.transitions[k][i] = cloneStateHelper(s, cloneMap)
} }
} }
} }
if state.lookaroundNFA == state { if stateToClone.lookaroundNFA == stateToClone {
clone.lookaroundNFA = clone clone.lookaroundNFA = clone
} }
clone.lookaroundNFA = cloneStateHelper(state.lookaroundNFA, cloneMap) clone.lookaroundNFA = cloneStateHelper(stateToClone.lookaroundNFA, cloneMap)
return clone return clone
} }
// Checks if the given state's assertion is true. Returns true if the given // Checks if the given state's assertion is true. Returns true if the given
// state doesn't have an assertion. // state doesn't have an assertion.
func (s State) checkAssertion(str []rune, idx int) bool { func (s nfaState) checkAssertion(str []rune, idx int) bool {
if s.assert == ALWAYS_TRUE { if s.assert == alwaysTrueAssert {
return true return true
} }
if s.assert == SOS { if s.assert == sosAssert {
// Single-line mode: Beginning of string // Single-line mode: Beginning of string
// Multi-line mode: Previous character was newline // Multi-line mode: Previous character was newline
return idx == 0 || (multilineMode && (idx > 0 && str[idx-1] == '\n')) return idx == 0 || (multilineMode && (idx > 0 && str[idx-1] == '\n'))
} }
if s.assert == EOS { if s.assert == eosAssert {
// Single-line mode: End of string // Single-line mode: End of string
// Multi-line mode: current character is newline // Multi-line mode: current character is newline
// Index is at the end of the string, or it points to the last character which is a newline // Index is at the end of the string, or it points to the last character which is a newline
return idx == len(str) || (multilineMode && str[idx] == '\n') return idx == len(str) || (multilineMode && str[idx] == '\n')
} }
if s.assert == WBOUND { if s.assert == wboundAssert {
return isWordBoundary(str, idx) return isWordBoundary(str, idx)
} }
if s.assert == NONWBOUND { if s.assert == nonwboundAssert {
return !isWordBoundary(str, idx) return !isWordBoundary(str, idx)
} }
if s.isLookaround() { if s.isLookaround() {
@@ -133,7 +133,7 @@ func (s State) checkAssertion(str []rune, idx int) bool {
startState := s.lookaroundNFA startState := s.lookaroundNFA
var runesToMatch []rune var runesToMatch []rune
var strToMatch string var strToMatch string
if s.assert == PLA || s.assert == NLA { if s.assert == plaAssert || s.assert == nlaAssert {
runesToMatch = str[idx:] runesToMatch = str[idx:]
} else { } else {
runesToMatch = str[:idx] runesToMatch = str[:idx]
@@ -149,21 +149,21 @@ func (s State) checkAssertion(str []rune, idx int) bool {
numMatchesFound := 0 numMatchesFound := 0
for _, matchIdx := range matchIndices { for _, matchIdx := range matchIndices {
if s.assert == PLA || s.assert == NLA { // Lookahead - return true (or false) if at least one match starts at 0. Zero is used because the test-string _starts_ from idx. if s.assert == plaAssert || s.assert == nlaAssert { // Lookahead - return true (or false) if at least one match starts at 0. Zero is used because the test-string _starts_ from idx.
if matchIdx[0].StartIdx == 0 { if matchIdx[0].StartIdx == 0 {
numMatchesFound++ numMatchesFound++
} }
} }
if s.assert == PLB || s.assert == NLB { // Lookbehind - return true (or false) if at least one match _ends_ at the current index. if s.assert == plbAssert || s.assert == nlbAssert { // Lookbehind - return true (or false) if at least one match _ends_ at the current index.
if matchIdx[0].EndIdx == idx { if matchIdx[0].EndIdx == idx {
numMatchesFound++ numMatchesFound++
} }
} }
} }
if s.assert == PLA || s.assert == PLB { // Positive assertions want at least one match if s.assert == plaAssert || s.assert == plbAssert { // Positive assertions want at least one match
return numMatchesFound > 0 return numMatchesFound > 0
} }
if s.assert == NLA || s.assert == NLB { // Negative assertions only want zero matches if s.assert == nlaAssert || s.assert == nlbAssert { // Negative assertions only want zero matches
return numMatchesFound == 0 return numMatchesFound == 0
} }
} }
@@ -171,8 +171,8 @@ func (s State) checkAssertion(str []rune, idx int) bool {
} }
// Returns true if the contents of 's' contain the value at the given index of the given string // Returns true if the contents of 's' contain the value at the given index of the given string
func (s State) contentContains(str []rune, idx int) bool { func (s nfaState) contentContains(str []rune, idx int) bool {
if s.assert != NONE { if s.assert != noneAssert {
return s.checkAssertion(str, idx) return s.checkAssertion(str, idx)
} }
if s.allChars { if s.allChars {
@@ -182,19 +182,19 @@ func (s State) contentContains(str []rune, idx int) bool {
return slices.Contains(s.content, int(str[idx])) return slices.Contains(s.content, int(str[idx]))
} }
func (s State) isLookaround() bool { func (s nfaState) isLookaround() bool {
return s.assert == PLA || s.assert == PLB || s.assert == NLA || s.assert == NLB return s.assert == plaAssert || s.assert == plbAssert || s.assert == nlaAssert || s.assert == nlbAssert
} }
// Returns the matches for the character at the given index of the given string. // Returns the matches for the character at the given index of the given string.
// Also returns the number of matches. Returns -1 if an assertion failed. // Also returns the number of matches. Returns -1 if an assertion failed.
func (s State) matchesFor(str []rune, idx int) ([]*State, int) { func (s nfaState) matchesFor(str []rune, idx int) ([]*nfaState, int) {
// Assertions can be viewed as 'checks'. If the check fails, we return // Assertions can be viewed as 'checks'. If the check fails, we return
// an empty array and 0. // an empty array and 0.
// If it passes, we treat it like any other state, and return all the transitions. // If it passes, we treat it like any other state, and return all the transitions.
if s.assert != NONE { if s.assert != noneAssert {
if s.checkAssertion(str, idx) == false { if s.checkAssertion(str, idx) == false {
return make([]*State, 0), -1 return make([]*nfaState, 0), -1
} }
} }
listTransitions := s.transitions[int(str[idx])] listTransitions := s.transitions[int(str[idx])]
@@ -211,39 +211,39 @@ func (s State) matchesFor(str []rune, idx int) ([]*State, int) {
} }
// verifyLastStatesHelper performs the depth-first recursion needed for verifyLastStates // verifyLastStatesHelper performs the depth-first recursion needed for verifyLastStates
func verifyLastStatesHelper(state *State, visited map[*State]bool) { func verifyLastStatesHelper(st *nfaState, visited map[*nfaState]bool) {
if len(state.transitions) == 0 { if len(st.transitions) == 0 {
state.isLast = true st.isLast = true
return return
} }
// if len(state.transitions) == 1 && len(state.transitions[state.content]) == 1 && state.transitions[state.content][0] == state { // Eg. a* // if len(state.transitions) == 1 && len(state.transitions[state.content]) == 1 && state.transitions[state.content][0] == state { // Eg. a*
if len(state.transitions) == 1 { // Eg. a* if len(st.transitions) == 1 { // Eg. a*
var moreThanOneTrans bool // Dummy variable, check if all the transitions for the current's state's contents have a length of one var moreThanOneTrans bool // Dummy variable, check if all the transitions for the current's state's contents have a length of one
for _, c := range state.content { for _, c := range st.content {
if len(state.transitions[c]) != 1 || state.transitions[c][0] != state { if len(st.transitions[c]) != 1 || st.transitions[c][0] != st {
moreThanOneTrans = true moreThanOneTrans = true
} }
} }
state.isLast = !moreThanOneTrans st.isLast = !moreThanOneTrans
} }
if state.isKleene { // A State representing a Kleene Star has transitions going out, which loop back to it. If all those transitions point to the same (single) state, then it must be a last state if st.isKleene { // A State representing a Kleene Star has transitions going out, which loop back to it. If all those transitions point to the same (single) state, then it must be a last state
transitionDests := make([]*State, 0) transitionDests := make([]*nfaState, 0)
for _, v := range state.transitions { for _, v := range st.transitions {
transitionDests = append(transitionDests, v...) transitionDests = append(transitionDests, v...)
} }
if allEqual(transitionDests...) { if allEqual(transitionDests...) {
state.isLast = true st.isLast = true
return return
} }
} }
if visited[state] == true { if visited[st] == true {
return return
} }
visited[state] = true visited[st] = true
for _, states := range state.transitions { for _, states := range st.transitions {
for i := range states { for i := range states {
if states[i] != state { if states[i] != st {
verifyLastStatesHelper(states[i], visited) verifyLastStatesHelper(states[i], visited)
} }
} }
@@ -251,12 +251,12 @@ func verifyLastStatesHelper(state *State, visited map[*State]bool) {
} }
// verifyLastStates enables the 'isLast' flag for the leaf nodes (last states) // verifyLastStates enables the 'isLast' flag for the leaf nodes (last states)
func verifyLastStates(start []*State) { func verifyLastStates(start []*nfaState) {
verifyLastStatesHelper(start[0], make(map[*State]bool)) verifyLastStatesHelper(start[0], make(map[*nfaState]bool))
} }
// Concatenates s1 and s2, returns the start of the concatenation. // Concatenates s1 and s2, returns the start of the concatenation.
func concatenate(s1 *State, s2 *State) *State { func concatenate(s1 *nfaState, s2 *nfaState) *nfaState {
if s1 == nil { if s1 == nil {
return s2 return s2
} }
@@ -269,14 +269,14 @@ func concatenate(s1 *State, s2 *State) *State {
return s1 return s1
} }
func kleene(s1 State) (*State, error) { func kleene(s1 nfaState) (*nfaState, error) {
if s1.isEmpty && s1.assert != NONE { if s1.isEmpty && s1.assert != noneAssert {
return nil, fmt.Errorf("previous token is not quantifiable") return nil, fmt.Errorf("previous token is not quantifiable")
} }
toReturn := &State{} toReturn := &nfaState{}
toReturn.transitions = make(map[int][]*State) toReturn.transitions = make(map[int][]*nfaState)
toReturn.content = newContents(EPSILON) toReturn.content = newContents(epsilon)
toReturn.isEmpty = true toReturn.isEmpty = true
toReturn.isKleene = true toReturn.isKleene = true
toReturn.output = append(toReturn.output, toReturn) toReturn.output = append(toReturn.output, toReturn)
@@ -291,9 +291,9 @@ func kleene(s1 State) (*State, error) {
return toReturn, nil return toReturn, nil
} }
func alternate(s1 *State, s2 *State) *State { func alternate(s1 *nfaState, s2 *nfaState) *nfaState {
toReturn := &State{} toReturn := &nfaState{}
toReturn.transitions = make(map[int][]*State) toReturn.transitions = make(map[int][]*nfaState)
toReturn.output = append(toReturn.output, s1.output...) toReturn.output = append(toReturn.output, s1.output...)
toReturn.output = append(toReturn.output, s2.output...) toReturn.output = append(toReturn.output, s2.output...)
// Unique append is used here (and elsewhere) to ensure that, // Unique append is used here (and elsewhere) to ensure that,
@@ -307,16 +307,16 @@ func alternate(s1 *State, s2 *State) *State {
for _, c := range s2.content { for _, c := range s2.content {
toReturn.transitions[c], _ = unique_append(toReturn.transitions[c], s2) toReturn.transitions[c], _ = unique_append(toReturn.transitions[c], s2)
} }
toReturn.content = newContents(EPSILON) toReturn.content = newContents(epsilon)
toReturn.isEmpty = true toReturn.isEmpty = true
return toReturn return toReturn
} }
func question(s1 *State) *State { // Use the fact that ab? == a(b|) func question(s1 *nfaState) *nfaState { // Use the fact that ab? == a(b|)
s2 := &State{} s2 := &nfaState{}
s2.transitions = make(map[int][]*State) s2.transitions = make(map[int][]*nfaState)
s2.content = newContents(EPSILON) s2.content = newContents(epsilon)
s2.output = append(s2.output, s2) s2.output = append(s2.output, s2)
s2.isEmpty = true s2.isEmpty = true
s3 := alternate(s1, s2) s3 := alternate(s1, s2)
@@ -324,11 +324,11 @@ func question(s1 *State) *State { // Use the fact that ab? == a(b|)
} }
// Creates and returns a new state with the 'default' values. // Creates and returns a new state with the 'default' values.
func newState() State { func newState() nfaState {
ret := State{ ret := nfaState{
output: make([]*State, 0), output: make([]*nfaState, 0),
transitions: make(map[int][]*State), transitions: make(map[int][]*nfaState),
assert: NONE, assert: noneAssert,
except: append([]rune{}, 0), except: append([]rune{}, 0),
lookaroundRegex: "", lookaroundRegex: "",
groupEnd: false, groupEnd: false,
@@ -339,10 +339,10 @@ func newState() State {
} }
// Creates and returns a state that _always_ has a zero-length match. // Creates and returns a state that _always_ has a zero-length match.
func zeroLengthMatchState() State { func zeroLengthMatchState() nfaState {
start := newState() start := newState()
start.content = newContents(EPSILON) start.content = newContents(epsilon)
start.isEmpty = true start.isEmpty = true
start.assert = ALWAYS_TRUE start.assert = alwaysTrueAssert
return start return start
} }

View File

@@ -2,7 +2,7 @@ package regex
import "fmt" import "fmt"
type NodeType int type nodeType int
// This is a slice containing all escapable characters that have special meaning. // This is a slice containing all escapable characters that have special meaning.
// Eg. \b is word boundary, \w is word character etc. // Eg. \b is word boundary, \w is word character etc.
@@ -10,28 +10,28 @@ var escapedChars []rune = []rune("wWdDbBnaftrvsS0")
// This is a list of the possible node types.
// The values are assigned by iota, so the order of the entries is significant
// and must not be changed.
const (
	// Literal character(s). This is the fallback type for any single rune that
	// is not an operator, and is also used for multi-rune nodes, escaped
	// classes such as \d or \w, the dot node and negated character classes.
	characterNode nodeType = iota
	// Non-negated character class, as built by newCharClassNode.
	charclassNode
	// Alternation operator '|'.
	pipeNode
	// Concatenation operator (concatRune).
	concatenateNode
	// Kleene star operator '*'.
	kleeneNode
	// Optional operator '?'.
	questionNode
	// One-or-more operator '+'.
	plusNode
	// Zero-width assertion: '^', '$', and \b / \B (except \b inside a
	// character class, which is a literal backspace).
	assertionNode
	// Opening parenthesis '('.
	lparenNode
	// Closing parenthesis ')'.
	rparenNode
)
// Helper constants for lookarounds
// NOTE(review): presumably positive/negative select the polarity of a
// lookaround and lookahead/lookbehind its direction; the usages are not
// visible in this part of the file - confirm against the callers.
// Note that positive == lookahead and negative == lookbehind numerically, so
// the two pairs cannot be told apart by value alone.
const positive = 1
const negative = -1
const lookahead = 1
const lookbehind = -1

// Sentinel used in place of a repetition count.
var infinite_reps int = -1 // Represents infinite reps eg. the end range in {5,}
// This represents a node in the postfix representation of the expression // This represents a node in the postfix representation of the expression
type postfixNode struct { type postfixNode struct {
nodetype NodeType nodetype nodeType
contents []rune // Contents of the node contents []rune // Contents of the node
startReps int // Minimum number of times the node should be repeated - used with numeric specifiers startReps int // Minimum number of times the node should be repeated - used with numeric specifiers
endReps int // Maximum number of times the node should be repeated - used with numeric specifiers endReps int // Maximum number of times the node should be repeated - used with numeric specifiers
@@ -49,11 +49,11 @@ type postfixNode struct {
// it will not match. // it will not match.
func newCharClassNode(nodes []postfixNode, negated bool) postfixNode { func newCharClassNode(nodes []postfixNode, negated bool) postfixNode {
rtv := postfixNode{} rtv := postfixNode{}
rtv.nodetype = CHARCLASS rtv.nodetype = charclassNode
rtv.startReps = 1 rtv.startReps = 1
rtv.endReps = 1 rtv.endReps = 1
if negated { if negated {
rtv.nodetype = CHARACTER rtv.nodetype = characterNode
rtv.contents = []rune{ANY_CHAR} rtv.contents = []rune{ANY_CHAR}
rtv.allChars = true rtv.allChars = true
rtv.except = nodes rtv.except = nodes
@@ -70,55 +70,55 @@ func newEscapedNode(c rune, inCharClass bool) (postfixNode, error) {
toReturn.endReps = 1 toReturn.endReps = 1
switch c { switch c {
case 's': // Whitespace case 's': // Whitespace
toReturn.nodetype = CHARACTER toReturn.nodetype = characterNode
toReturn.contents = append(toReturn.contents, whitespaceChars...) toReturn.contents = append(toReturn.contents, whitespaceChars...)
case 'S': // Non-whitespace case 'S': // Non-whitespace
toReturn = newPostfixDotNode() toReturn = newPostfixDotNode()
toReturn.except = append([]postfixNode{}, newPostfixNode(whitespaceChars...)) toReturn.except = append([]postfixNode{}, newPostfixNode(whitespaceChars...))
case 'd': // Digits case 'd': // Digits
toReturn.nodetype = CHARACTER toReturn.nodetype = characterNode
toReturn.contents = append(toReturn.contents, digitChars...) toReturn.contents = append(toReturn.contents, digitChars...)
case 'D': // Non-digits case 'D': // Non-digits
toReturn = newPostfixDotNode() toReturn = newPostfixDotNode()
toReturn.except = append([]postfixNode{}, newPostfixNode(digitChars...)) toReturn.except = append([]postfixNode{}, newPostfixNode(digitChars...))
case 'w': // word character case 'w': // word character
toReturn.nodetype = CHARACTER toReturn.nodetype = characterNode
toReturn.contents = append(toReturn.contents, wordChars...) toReturn.contents = append(toReturn.contents, wordChars...)
case 'W': // Non-word character case 'W': // Non-word character
toReturn = newPostfixDotNode() toReturn = newPostfixDotNode()
toReturn.except = append([]postfixNode{}, newPostfixNode(wordChars...)) toReturn.except = append([]postfixNode{}, newPostfixNode(wordChars...))
case 'b', 'B': case 'b', 'B':
if c == 'b' && inCharClass { if c == 'b' && inCharClass {
toReturn.nodetype = CHARACTER toReturn.nodetype = characterNode
toReturn.contents = append(toReturn.contents, rune(8)) toReturn.contents = append(toReturn.contents, rune(8))
} else { } else {
toReturn.nodetype = ASSERTION toReturn.nodetype = assertionNode
toReturn.contents = append(toReturn.contents, c) toReturn.contents = append(toReturn.contents, c)
} }
case 'n': // Newline character case 'n': // Newline character
toReturn.nodetype = CHARACTER toReturn.nodetype = characterNode
toReturn.contents = append(toReturn.contents, '\n') toReturn.contents = append(toReturn.contents, '\n')
case '0': // NULL character case '0': // NULL character
toReturn.nodetype = CHARACTER toReturn.nodetype = characterNode
toReturn.contents = append(toReturn.contents, rune(0)) toReturn.contents = append(toReturn.contents, rune(0))
case 'a': // Bell character case 'a': // Bell character
toReturn.nodetype = CHARACTER toReturn.nodetype = characterNode
toReturn.contents = append(toReturn.contents, rune(7)) toReturn.contents = append(toReturn.contents, rune(7))
case 'f': // Form feed character case 'f': // Form feed character
toReturn.nodetype = CHARACTER toReturn.nodetype = characterNode
toReturn.contents = append(toReturn.contents, rune(12)) toReturn.contents = append(toReturn.contents, rune(12))
case 't': // Horizontal tab character case 't': // Horizontal tab character
toReturn.nodetype = CHARACTER toReturn.nodetype = characterNode
toReturn.contents = append(toReturn.contents, rune(9)) toReturn.contents = append(toReturn.contents, rune(9))
case 'r': // Carriage return case 'r': // Carriage return
toReturn.nodetype = CHARACTER toReturn.nodetype = characterNode
toReturn.contents = append(toReturn.contents, rune(13)) toReturn.contents = append(toReturn.contents, rune(13))
case 'v': // Vertical tab case 'v': // Vertical tab
toReturn.nodetype = CHARACTER toReturn.nodetype = characterNode
toReturn.contents = append(toReturn.contents, rune(11)) toReturn.contents = append(toReturn.contents, rune(11))
case '-': // Literal hyphen - only in character class case '-': // Literal hyphen - only in character class
if inCharClass { if inCharClass {
toReturn.nodetype = CHARACTER toReturn.nodetype = characterNode
toReturn.contents = append(toReturn.contents, '-') toReturn.contents = append(toReturn.contents, '-')
} else { } else {
return postfixNode{}, fmt.Errorf("invalid escape character") return postfixNode{}, fmt.Errorf("invalid escape character")
@@ -127,7 +127,7 @@ func newEscapedNode(c rune, inCharClass bool) (postfixNode, error) {
if isNormalChar(c) { // Normal characters cannot be escaped if isNormalChar(c) { // Normal characters cannot be escaped
return postfixNode{}, fmt.Errorf("invalid escape character") return postfixNode{}, fmt.Errorf("invalid escape character")
} }
toReturn.nodetype = CHARACTER toReturn.nodetype = characterNode
toReturn.contents = append(toReturn.contents, c) toReturn.contents = append(toReturn.contents, c)
} }
return toReturn, nil return toReturn, nil
@@ -142,36 +142,36 @@ func newPostfixNode(contents ...rune) postfixNode {
to_return.startReps = 1 to_return.startReps = 1
to_return.endReps = 1 to_return.endReps = 1
if len(contents) > 1 { // If the node has more than element, it must be a character class - the type must be CHARACTER if len(contents) > 1 { // If the node has more than element, it must be a character class - the type must be CHARACTER
to_return.nodetype = CHARACTER to_return.nodetype = characterNode
to_return.contents = contents to_return.contents = contents
} else { // Node has one element, could be anything } else { // Node has one element, could be anything
switch contents[0] { switch contents[0] {
case '+': case '+':
to_return.nodetype = PLUS to_return.nodetype = plusNode
case '?': case '?':
to_return.nodetype = QUESTION to_return.nodetype = questionNode
case '*': case '*':
to_return.nodetype = KLEENE to_return.nodetype = kleeneNode
case '|': case '|':
to_return.nodetype = PIPE to_return.nodetype = pipeNode
case CONCAT: case concatRune:
to_return.nodetype = CONCATENATE to_return.nodetype = concatenateNode
case '^', '$': case '^', '$':
to_return.nodetype = ASSERTION to_return.nodetype = assertionNode
case '(': case '(':
to_return.nodetype = LPAREN to_return.nodetype = lparenNode
case ')': case ')':
to_return.nodetype = RPAREN to_return.nodetype = rparenNode
default: default:
to_return.nodetype = CHARACTER to_return.nodetype = characterNode
} }
to_return.contents = append(to_return.contents, contents...) to_return.contents = append(to_return.contents, contents...)
// Special cases for LPAREN and RPAREN - they have special characters defined for them // Special cases for LPAREN and RPAREN - they have special characters defined for them
if to_return.nodetype == LPAREN { if to_return.nodetype == lparenNode {
to_return.contents = []rune{LPAREN_CHAR} to_return.contents = []rune{LPAREN_CHAR}
} }
if to_return.nodetype == RPAREN { if to_return.nodetype == rparenNode {
to_return.contents = []rune{RPAREN_CHAR} to_return.contents = []rune{RPAREN_CHAR}
} }
} }
@@ -183,7 +183,7 @@ func newPostfixDotNode() postfixNode {
toReturn := postfixNode{} toReturn := postfixNode{}
toReturn.startReps = 1 toReturn.startReps = 1
toReturn.endReps = 1 toReturn.endReps = 1
toReturn.nodetype = CHARACTER toReturn.nodetype = characterNode
toReturn.allChars = true toReturn.allChars = true
toReturn.contents = []rune{ANY_CHAR} toReturn.contents = []rune{ANY_CHAR}
return toReturn return toReturn
@@ -194,7 +194,7 @@ func newPostfixCharNode(contents ...rune) postfixNode {
toReturn := postfixNode{} toReturn := postfixNode{}
toReturn.startReps = 1 toReturn.startReps = 1
toReturn.endReps = 1 toReturn.endReps = 1
toReturn.nodetype = CHARACTER toReturn.nodetype = characterNode
toReturn.contents = append(toReturn.contents, contents...) toReturn.contents = append(toReturn.contents, contents...)
return toReturn return toReturn
} }