Compare commits
11 Commits
ca8f8e1030
...
7aee4280cc
| Author | SHA1 | Date | |
|---|---|---|---|
| 7aee4280cc | |||
| e01ef48cbc | |||
| 93474c5159 | |||
| d81b2ddaaa | |||
| 429d286439 | |||
| 198a2c12a7 | |||
| 7e88b8a4b0 | |||
| af5b6ebe08 | |||
| 289bba35e2 | |||
| 7e6377a4c4 | |||
| 73c6a442ce |
142
regex/compile.go
142
regex/compile.go
@@ -14,11 +14,11 @@ var notDotChars []rune
|
||||
// the startState of the NFA representation of the regex, and the number of capturing
|
||||
// groups in the regex.
|
||||
type Reg struct {
|
||||
start *State
|
||||
start *nfaState
|
||||
numGroups int
|
||||
}
|
||||
|
||||
const CONCAT rune = '~'
|
||||
const concatRune rune = 0xF0001
|
||||
|
||||
// Flags for shuntingYard - control its behavior
|
||||
type ReFlag int
|
||||
@@ -31,7 +31,7 @@ const (
|
||||
)
|
||||
|
||||
func isOperator(c rune) bool {
|
||||
if c == '+' || c == '?' || c == '*' || c == '|' || c == CONCAT {
|
||||
if c == '+' || c == '?' || c == '*' || c == '|' || c == concatRune {
|
||||
return true
|
||||
}
|
||||
return false
|
||||
@@ -39,7 +39,7 @@ func isOperator(c rune) bool {
|
||||
|
||||
/* priority returns the priority of the given operator */
|
||||
func priority(op rune) int {
|
||||
precedence := []rune{'|', CONCAT, '+', '*', '?'}
|
||||
precedence := []rune{'|', concatRune, '+', '*', '?'}
|
||||
return slices.Index(precedence, op)
|
||||
}
|
||||
|
||||
@@ -320,7 +320,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
if i < len(re_runes) && (re_runes[i] != '(' && re_runes[i] != NONCAPLPAREN_CHAR && re_runes[i] != '|' && re_runes[i] != '\\') || (i > 0 && re_runes[i-1] == '\\') { // Every character should be concatenated if it is escaped
|
||||
if i < len(re_runes)-1 {
|
||||
if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != '+' && re_runes[i+1] != '?' && re_runes[i+1] != ')' && re_runes[i+1] != '{' {
|
||||
re_postfix = append(re_postfix, CONCAT)
|
||||
re_postfix = append(re_postfix, concatRune)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -450,21 +450,21 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
}
|
||||
// 'regex' should now contain the lookaround regex, plus the characters at the start (which indicate pos/neg, ahead/behind)
|
||||
// Now we should filter that out.
|
||||
toAppend := postfixNode{nodetype: ASSERTION, startReps: 1, endReps: 1}
|
||||
toAppend := postfixNode{nodetype: assertionNode, startReps: 1, endReps: 1}
|
||||
if regex[0] == '<' { // Lookbehind
|
||||
toAppend.lookaroundDir = LOOKBEHIND
|
||||
toAppend.lookaroundDir = lookbehind
|
||||
regex = regex[1:]
|
||||
} else if regex[0] == '=' || regex[0] == '!' {
|
||||
toAppend.lookaroundDir = LOOKAHEAD
|
||||
toAppend.lookaroundDir = lookahead
|
||||
} else {
|
||||
return nil, fmt.Errorf("invalid lookaround")
|
||||
}
|
||||
// Positive or negative
|
||||
if regex[0] == '=' { // Positive
|
||||
toAppend.lookaroundSign = POSITIVE
|
||||
toAppend.lookaroundSign = positive
|
||||
toAppend.contents = []rune(regex[1:])
|
||||
} else if regex[0] == '!' { // Negative
|
||||
toAppend.lookaroundSign = NEGATIVE
|
||||
toAppend.lookaroundSign = negative
|
||||
toAppend.contents = []rune(regex[1:])
|
||||
} else {
|
||||
return nil, fmt.Errorf("invalid lookaround")
|
||||
@@ -489,7 +489,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
topStack, _ = peek(opStack)
|
||||
}
|
||||
outQueueFinalElement, _ := peek(outQueue)
|
||||
if (c == '*' && outQueueFinalElement.nodetype == KLEENE) || (c == '+' && outQueueFinalElement.nodetype == PLUS) { // You cannot apply a quantifier to a quantifier in this way
|
||||
if (c == '*' && outQueueFinalElement.nodetype == kleeneNode) || (c == '+' && outQueueFinalElement.nodetype == plusNode) { // You cannot apply a quantifier to a quantifier in this way
|
||||
return nil, fmt.Errorf("illegal use of token '%c'", c)
|
||||
}
|
||||
opStack = append(opStack, c)
|
||||
@@ -739,7 +739,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
return nil, fmt.Errorf("invalid start range for numeric specifier")
|
||||
}
|
||||
if len(endRange) == 0 { // Case 3 above
|
||||
endRangeNum = INFINITE_REPS
|
||||
endRangeNum = infinite_reps
|
||||
} else { // Case 2 above
|
||||
var err error
|
||||
endRangeNum, err = strconv.Atoi(string(endRange))
|
||||
@@ -751,7 +751,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
|
||||
idx := len(outQueue) - 1
|
||||
// Get the last added node
|
||||
if idx < 0 || outQueue[idx].nodetype == LPAREN {
|
||||
if idx < 0 || outQueue[idx].nodetype == lparenNode {
|
||||
return nil, fmt.Errorf("numeric specifier with no content")
|
||||
}
|
||||
outQueue[idx].startReps = startRangeNum
|
||||
@@ -799,8 +799,8 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
// Thompson's algorithm. Constructs Finite-State Automaton from given string.
|
||||
// Returns start state and number of groups in regex.
|
||||
func thompson(re []postfixNode) (Reg, error) {
|
||||
nfa := make([]*State, 0) // Stack of states
|
||||
numGroups := 0 // Number of capturing groups
|
||||
nfa := make([]*nfaState, 0) // Stack of states
|
||||
numGroups := 0 // Number of capturing groups
|
||||
|
||||
// If thompson() receives an empty regex, then whatever was given to shuntingYard()
|
||||
// was parsed away. This doesn't mean that the regex itself is empty.
|
||||
@@ -814,11 +814,11 @@ func thompson(re []postfixNode) (Reg, error) {
|
||||
}
|
||||
|
||||
for _, c := range re {
|
||||
if c.nodetype == CHARACTER || c.nodetype == ASSERTION {
|
||||
state := State{}
|
||||
state.transitions = make(map[int][]*State)
|
||||
if c.nodetype == characterNode || c.nodetype == assertionNode {
|
||||
stateToAdd := nfaState{}
|
||||
stateToAdd.transitions = make(map[int][]*nfaState)
|
||||
if c.allChars {
|
||||
state.allChars = true
|
||||
stateToAdd.allChars = true
|
||||
if len(c.except) != 0 {
|
||||
// For each node that I am 'excepting' (eg. in an inverted character class):
|
||||
// - If the node itself has exceptions, then the exceptions cancel out.
|
||||
@@ -827,7 +827,7 @@ func thompson(re []postfixNode) (Reg, error) {
|
||||
// - If the node doesn't have exceptions (allChars == false) then the contents of the node are added to the except list.
|
||||
for _, node := range c.except {
|
||||
if node.allChars {
|
||||
state.allChars = false
|
||||
stateToAdd.allChars = false
|
||||
// For each postfixNode in node.except, extract the contents of the postfixNode. Concatenate them all,
|
||||
// and add them to the state's _content_. As mentioned above, if the exception has exceptions, then we can match
|
||||
// those.
|
||||
@@ -840,7 +840,7 @@ func thompson(re []postfixNode) (Reg, error) {
|
||||
}
|
||||
return nodeContents
|
||||
})...)
|
||||
state.content = rune2Contents(nodeExceptChars)
|
||||
stateToAdd.content = rune2Contents(nodeExceptChars)
|
||||
} else {
|
||||
charsToAdd := node.contents
|
||||
if caseInsensitive {
|
||||
@@ -848,7 +848,7 @@ func thompson(re []postfixNode) (Reg, error) {
|
||||
return allCases(r, caseInsensitive)
|
||||
})...)
|
||||
}
|
||||
state.except = append(state.except, charsToAdd...)
|
||||
stateToAdd.except = append(stateToAdd.except, charsToAdd...)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -861,43 +861,43 @@ func thompson(re []postfixNode) (Reg, error) {
|
||||
return allCases(r, caseInsensitive)
|
||||
})...)
|
||||
}
|
||||
state.content = stateContents(append([]int(state.content), []int(rune2Contents(runesToAdd))...))
|
||||
state.output = make([]*State, 0)
|
||||
state.output = append(state.output, &state)
|
||||
state.isEmpty = false
|
||||
if c.nodetype == ASSERTION {
|
||||
state.isEmpty = true // This is a little weird. A lookaround has the 'isEmpty' flag set, even though it _isn't_ empty (the contents are the regex). But, there's so much error-checking that relies on this flag that it's better to keep it this way.
|
||||
state.content = newContents(EPSILON) // Ideally, an assertion shouldn't have any content, since it doesn't say anything about the content of string
|
||||
stateToAdd.content = stateContents(append([]int(stateToAdd.content), []int(rune2Contents(runesToAdd))...))
|
||||
stateToAdd.output = make([]*nfaState, 0)
|
||||
stateToAdd.output = append(stateToAdd.output, &stateToAdd)
|
||||
stateToAdd.isEmpty = false
|
||||
if c.nodetype == assertionNode {
|
||||
stateToAdd.isEmpty = true // This is a little weird. A lookaround has the 'isEmpty' flag set, even though it _isn't_ empty (the contents are the regex). But, there's so much error-checking that relies on this flag that it's better to keep it this way.
|
||||
stateToAdd.content = newContents(EPSILON) // Ideally, an assertion shouldn't have any content, since it doesn't say anything about the content of string
|
||||
if c.lookaroundDir == 0 || c.lookaroundSign == 0 {
|
||||
switch c.contents[0] {
|
||||
case '^':
|
||||
state.assert = SOS
|
||||
stateToAdd.assert = sosAssert
|
||||
case '$':
|
||||
state.assert = EOS
|
||||
stateToAdd.assert = eosAssert
|
||||
case 'b':
|
||||
state.assert = WBOUND
|
||||
stateToAdd.assert = wboundAssert
|
||||
case 'B':
|
||||
state.assert = NONWBOUND
|
||||
stateToAdd.assert = nonwboundAssert
|
||||
}
|
||||
} else { // Lookaround
|
||||
state.lookaroundRegex = string(c.contents)
|
||||
if c.lookaroundDir == LOOKAHEAD {
|
||||
if c.lookaroundSign == POSITIVE {
|
||||
state.assert = PLA
|
||||
stateToAdd.lookaroundRegex = string(c.contents)
|
||||
if c.lookaroundDir == lookahead {
|
||||
if c.lookaroundSign == positive {
|
||||
stateToAdd.assert = plaAssert
|
||||
}
|
||||
if c.lookaroundSign == NEGATIVE {
|
||||
state.assert = NLA
|
||||
if c.lookaroundSign == negative {
|
||||
stateToAdd.assert = nlaAssert
|
||||
}
|
||||
}
|
||||
if c.lookaroundDir == LOOKBEHIND {
|
||||
if c.lookaroundSign == POSITIVE {
|
||||
state.assert = PLB
|
||||
if c.lookaroundDir == lookbehind {
|
||||
if c.lookaroundSign == positive {
|
||||
stateToAdd.assert = plbAssert
|
||||
}
|
||||
if c.lookaroundSign == NEGATIVE {
|
||||
state.assert = NLB
|
||||
if c.lookaroundSign == negative {
|
||||
stateToAdd.assert = nlbAssert
|
||||
}
|
||||
}
|
||||
tmpRe, err := shuntingYard(state.lookaroundRegex)
|
||||
tmpRe, err := shuntingYard(stateToAdd.lookaroundRegex)
|
||||
if err != nil {
|
||||
return Reg{}, fmt.Errorf("error parsing lookaround: %w", err)
|
||||
}
|
||||
@@ -905,28 +905,28 @@ func thompson(re []postfixNode) (Reg, error) {
|
||||
if err != nil {
|
||||
return Reg{}, fmt.Errorf("error compiling lookaround: %w", err)
|
||||
}
|
||||
state.lookaroundNFA = reg.start
|
||||
state.lookaroundNumCaptureGroups = reg.numGroups
|
||||
stateToAdd.lookaroundNFA = reg.start
|
||||
stateToAdd.lookaroundNumCaptureGroups = reg.numGroups
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
// Replace ESC_BACKSLASH with actual backslash, so that we can actually check if we encounter it
|
||||
replaceByValue([]int(state.content), int(ESC_BACKSLASH), '\\')
|
||||
replaceByValue(state.except, ESC_BACKSLASH, '\\')
|
||||
replaceByValue([]int(stateToAdd.content), int(ESC_BACKSLASH), '\\')
|
||||
replaceByValue(stateToAdd.except, ESC_BACKSLASH, '\\')
|
||||
|
||||
nfa = append(nfa, &state)
|
||||
nfa = append(nfa, &stateToAdd)
|
||||
}
|
||||
if c.nodetype == LPAREN || c.nodetype == RPAREN {
|
||||
s := &State{}
|
||||
s.assert = NONE
|
||||
if c.nodetype == lparenNode || c.nodetype == rparenNode {
|
||||
s := &nfaState{}
|
||||
s.assert = noneAssert
|
||||
s.content = newContents(EPSILON)
|
||||
s.isEmpty = true
|
||||
s.output = make([]*State, 0)
|
||||
s.output = make([]*nfaState, 0)
|
||||
s.output = append(s.output, s)
|
||||
s.transitions = make(map[int][]*State)
|
||||
s.transitions = make(map[int][]*nfaState)
|
||||
// LPAREN nodes are just added normally
|
||||
if c.nodetype == LPAREN {
|
||||
if c.nodetype == lparenNode {
|
||||
numGroups++
|
||||
s.groupBegin = true
|
||||
s.groupNum = numGroups
|
||||
@@ -940,7 +940,7 @@ func thompson(re []postfixNode) (Reg, error) {
|
||||
// If the middle node doesn't exist (ie. something like '()' ), that's fine, I just connect the LPAREN
|
||||
// and RPAREN nodes.
|
||||
// If neither node exists, that's a problem so I return an error.
|
||||
if c.nodetype == RPAREN {
|
||||
if c.nodetype == rparenNode {
|
||||
s.groupEnd = true
|
||||
middleNode, err1 := pop(&nfa)
|
||||
lparenNode, err2 := pop(&nfa)
|
||||
@@ -969,9 +969,9 @@ func thompson(re []postfixNode) (Reg, error) {
|
||||
}
|
||||
}
|
||||
}
|
||||
if c.nodetype == CHARCLASS { // A Character class consists of all the nodes in it, alternated
|
||||
if c.nodetype == charclassNode { // A Character class consists of all the nodes in it, alternated
|
||||
// Map the list of nodes to a list of states, each state containing the contents of a specific node
|
||||
states := funcMap(c.nodeContents, func(node postfixNode) *State {
|
||||
states := funcMap(c.nodeContents, func(node postfixNode) *nfaState {
|
||||
s := newState()
|
||||
nodeContents := node.contents
|
||||
if caseInsensitive {
|
||||
@@ -989,14 +989,14 @@ func thompson(re []postfixNode) (Reg, error) {
|
||||
return &s
|
||||
})
|
||||
// Reduce the list of states down to a single state by alternating them
|
||||
toAdd := funcReduce(states, func(s1 *State, s2 *State) *State {
|
||||
toAdd := funcReduce(states, func(s1 *nfaState, s2 *nfaState) *nfaState {
|
||||
return alternate(s1, s2)
|
||||
})
|
||||
nfa = append(nfa, toAdd)
|
||||
}
|
||||
// Must be an operator if it isn't a character
|
||||
switch c.nodetype {
|
||||
case CONCATENATE:
|
||||
case concatenateNode:
|
||||
s2 := mustPop(&nfa)
|
||||
// Relax the requirements for concatenation a little bit - If
|
||||
// the second element is not found ie. the postfixNodes look
|
||||
@@ -1008,7 +1008,7 @@ func thompson(re []postfixNode) (Reg, error) {
|
||||
s1 = concatenate(s1, s2)
|
||||
nfa = append(nfa, s1)
|
||||
}
|
||||
case KLEENE: // Create a 0-state, concat the popped state after it, concat the 0-state after the popped state
|
||||
case kleeneNode: // Create a 0-state, concat the popped state after it, concat the 0-state after the popped state
|
||||
s1, err := pop(&nfa)
|
||||
if err != nil {
|
||||
return Reg{}, fmt.Errorf("error applying kleene star")
|
||||
@@ -1018,7 +1018,7 @@ func thompson(re []postfixNode) (Reg, error) {
|
||||
return Reg{}, err
|
||||
}
|
||||
nfa = append(nfa, stateToAdd)
|
||||
case PLUS: // a+ is equivalent to aa*
|
||||
case plusNode: // a+ is equivalent to aa*
|
||||
s1 := mustPop(&nfa)
|
||||
s2, err := kleene(*s1)
|
||||
if err != nil {
|
||||
@@ -1026,14 +1026,14 @@ func thompson(re []postfixNode) (Reg, error) {
|
||||
}
|
||||
s1 = concatenate(s1, s2)
|
||||
nfa = append(nfa, s1)
|
||||
case QUESTION: // ab? is equivalent to a(b|)
|
||||
case questionNode: // ab? is equivalent to a(b|)
|
||||
s1, err := pop(&nfa)
|
||||
if err != nil {
|
||||
return Reg{}, fmt.Errorf("error applying question operator")
|
||||
}
|
||||
s2 := question(s1)
|
||||
nfa = append(nfa, s2)
|
||||
case PIPE:
|
||||
case pipeNode:
|
||||
// A pipe operator doesn't actually need either operand to be present. If an operand isn't present,
|
||||
// it is replaced with an implicit 'matchZeroLength' state (this is the same thing that we add at the top if our
|
||||
// input has zero postfixNodes).
|
||||
@@ -1065,8 +1065,8 @@ func thompson(re []postfixNode) (Reg, error) {
|
||||
if c.endReps != -1 && c.endReps < c.startReps {
|
||||
return Reg{}, fmt.Errorf("numeric specifier - start greater than end")
|
||||
}
|
||||
state := mustPop(&nfa)
|
||||
var stateToAdd *State = nil
|
||||
poppedState := mustPop(&nfa)
|
||||
var stateToAdd *nfaState = nil
|
||||
// Take advantage of the following facts:
|
||||
// a{5} == aaaaa
|
||||
// a{3,5} == aaaa?a?
|
||||
@@ -1080,17 +1080,17 @@ func thompson(re []postfixNode) (Reg, error) {
|
||||
// b. Encode the logic while parsing the string (shunting-yard). If I can expand the numeric specifier
|
||||
// at this point, I can leave thompson untouched.
|
||||
for i := 0; i < c.startReps; i++ { // Case 1
|
||||
stateToAdd = concatenate(stateToAdd, cloneState(state))
|
||||
stateToAdd = concatenate(stateToAdd, cloneState(poppedState))
|
||||
}
|
||||
if c.endReps == INFINITE_REPS { // Case 3
|
||||
s2, err := kleene(*state)
|
||||
if c.endReps == infinite_reps { // Case 3
|
||||
s2, err := kleene(*poppedState)
|
||||
if err != nil {
|
||||
return Reg{}, err
|
||||
}
|
||||
stateToAdd = concatenate(stateToAdd, s2)
|
||||
} else { // Case 2
|
||||
for i := c.startReps; i < c.endReps; i++ {
|
||||
stateToAdd = concatenate(stateToAdd, question(cloneState(state)))
|
||||
stateToAdd = concatenate(stateToAdd, question(cloneState(poppedState)))
|
||||
}
|
||||
}
|
||||
nfa = append(nfa, stateToAdd)
|
||||
|
||||
@@ -61,7 +61,7 @@ func (g Group) isValid() bool {
|
||||
// given slice. It returns the resulting states. If any of the resulting states is a 0-state,
|
||||
// the second ret val is true.
|
||||
// If a state begins or ends a capturing group, its 'thread' is updated to contain the correct index.
|
||||
func takeZeroState(states []*State, numGroups int, idx int) (rtv []*State, isZero bool) {
|
||||
func takeZeroState(states []*nfaState, numGroups int, idx int) (rtv []*nfaState, isZero bool) {
|
||||
for _, state := range states {
|
||||
if len(state.transitions[EPSILON]) > 0 {
|
||||
for _, s := range state.transitions[EPSILON] {
|
||||
@@ -93,9 +93,9 @@ func takeZeroState(states []*State, numGroups int, idx int) (rtv []*State, isZer
|
||||
// from any of the given states, given the string and our position in it.
|
||||
// It uses the same algorithm to find zero-states as the one inside the loop,
|
||||
// so I should probably put it in a function.
|
||||
func zeroMatchPossible(str []rune, idx int, numGroups int, states ...*State) bool {
|
||||
func zeroMatchPossible(str []rune, idx int, numGroups int, states ...*nfaState) bool {
|
||||
zeroStates, isZero := takeZeroState(states, numGroups, idx)
|
||||
tempstates := make([]*State, 0, len(zeroStates)+len(states))
|
||||
tempstates := make([]*nfaState, 0, len(zeroStates)+len(states))
|
||||
tempstates = append(tempstates, states...)
|
||||
tempstates = append(tempstates, zeroStates...)
|
||||
num_appended := 0 // number of unique states added to tempstates
|
||||
@@ -107,7 +107,7 @@ func zeroMatchPossible(str []rune, idx int, numGroups int, states ...*State) boo
|
||||
}
|
||||
}
|
||||
for _, state := range tempstates {
|
||||
if state.isEmpty && (state.assert == NONE || state.checkAssertion(str, idx)) && state.isLast {
|
||||
if state.isEmpty && (state.assert == noneAssert || state.checkAssertion(str, idx)) && state.isLast {
|
||||
return true
|
||||
}
|
||||
}
|
||||
@@ -204,7 +204,7 @@ func FindAllMatches(regex Reg, str string) []Match {
|
||||
// the next search should start from.
|
||||
//
|
||||
// Might return duplicates or overlapping indices, so care must be taken to prune the resulting array.
|
||||
func findAllMatchesHelper(start *State, str []rune, offset int, numGroups int) (bool, Match, int) {
|
||||
func findAllMatchesHelper(start *nfaState, str []rune, offset int, numGroups int) (bool, Match, int) {
|
||||
// Base case - exit if offset exceeds string's length
|
||||
if offset > len(str) {
|
||||
// The second value here shouldn't be used, because we should exit when the third return value is greater than len(str)
|
||||
@@ -221,14 +221,14 @@ func findAllMatchesHelper(start *State, str []rune, offset int, numGroups int) (
|
||||
foundPath := false
|
||||
startIdx := offset
|
||||
endIdx := offset
|
||||
currentStates := make([]*State, 0)
|
||||
tempStates := make([]*State, 0) // Used to store states that should be used in next loop iteration
|
||||
i := offset // Index in string
|
||||
startingFrom := i // Store starting index
|
||||
currentStates := make([]*nfaState, 0)
|
||||
tempStates := make([]*nfaState, 0) // Used to store states that should be used in next loop iteration
|
||||
i := offset // Index in string
|
||||
startingFrom := i // Store starting index
|
||||
|
||||
// If the first state is an assertion, makes sure the assertion
|
||||
// is true before we do _anything_ else.
|
||||
if start.assert != NONE {
|
||||
if start.assert != noneAssert {
|
||||
if start.checkAssertion(str, offset) == false {
|
||||
i++
|
||||
return false, []Group{}, i
|
||||
@@ -257,7 +257,7 @@ func findAllMatchesHelper(start *State, str []rune, offset int, numGroups int) (
|
||||
for i < len(str) {
|
||||
foundPath = false
|
||||
|
||||
zeroStates := make([]*State, 0)
|
||||
zeroStates := make([]*nfaState, 0)
|
||||
// Keep taking zero-states, until there are no more left to take
|
||||
// Objective: If any of our current states have transitions to 0-states, replace them with the 0-state. Do this until there are no more transitions to 0-states, or there are no more unique 0-states to take.
|
||||
zeroStates, isZero := takeZeroState(currentStates, numGroups, i)
|
||||
@@ -275,11 +275,11 @@ func findAllMatchesHelper(start *State, str []rune, offset int, numGroups int) (
|
||||
tempStates = nil
|
||||
|
||||
// Take any transitions corresponding to current character
|
||||
numStatesMatched := 0 // The number of states which had at least 1 match for this round
|
||||
assertionFailed := false // Whether or not an assertion failed for this round
|
||||
lastStateInList := false // Whether or not a last state was in our list of states
|
||||
var lastStatePtr *State = nil // Pointer to the last-state, if it was found
|
||||
lastLookaroundInList := false // Whether or not a last state (that is a lookaround) was in our list of states
|
||||
numStatesMatched := 0 // The number of states which had at least 1 match for this round
|
||||
assertionFailed := false // Whether or not an assertion failed for this round
|
||||
lastStateInList := false // Whether or not a last state was in our list of states
|
||||
var lastStatePtr *nfaState = nil // Pointer to the last-state, if it was found
|
||||
lastLookaroundInList := false // Whether or not a last state (that is a lookaround) was in our list of states
|
||||
for _, state := range currentStates {
|
||||
matches, numMatches := state.matchesFor(str, i)
|
||||
if numMatches > 0 {
|
||||
@@ -329,7 +329,7 @@ func findAllMatchesHelper(start *State, str []rune, offset int, numGroups int) (
|
||||
// b. Empty
|
||||
// c. Doesn't assert anything
|
||||
for _, s := range currentStates {
|
||||
if s.isLast && s.isEmpty && s.assert == NONE {
|
||||
if s.isLast && s.isEmpty && s.assert == noneAssert {
|
||||
lastStatePtr = s
|
||||
lastStateInList = true
|
||||
}
|
||||
@@ -364,7 +364,7 @@ func findAllMatchesHelper(start *State, str []rune, offset int, numGroups int) (
|
||||
}
|
||||
return false, []Group{}, startIdx
|
||||
}
|
||||
currentStates = make([]*State, len(tempStates))
|
||||
currentStates = make([]*nfaState, len(tempStates))
|
||||
copy(currentStates, tempStates)
|
||||
tempStates = nil
|
||||
|
||||
@@ -391,7 +391,7 @@ func findAllMatchesHelper(start *State, str []rune, offset int, numGroups int) (
|
||||
// Only add the match if the start index is in bounds. If the state has an assertion,
|
||||
// make sure the assertion checks out.
|
||||
if state.isLast && i <= len(str) {
|
||||
if state.assert == NONE || state.checkAssertion(str, i) {
|
||||
if state.assert == noneAssert || state.checkAssertion(str, i) {
|
||||
for j := 1; j < numGroups+1; j++ {
|
||||
tempIndices[j] = state.threadGroups[j]
|
||||
}
|
||||
|
||||
@@ -8,16 +8,16 @@ import (
|
||||
var whitespaceChars = []rune{' ', '\t', '\n'}
|
||||
var digitChars = []rune{'0', '1', '2', '3', '4', '5', '6', '7', '8', '9'}
|
||||
var wordChars = []rune("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_")
|
||||
var LBRACKET rune = 0xF0001
|
||||
var RBRACKET rune = 0xF0002
|
||||
var ANY_CHAR rune = 0xF0003 // Represents any character - used for states where the allChars flag is on.
|
||||
var LPAREN_CHAR rune = 0xF0004 // Parentheses in regex are concatenated with this - it acts as a pseudio-parentheses
|
||||
var RPAREN_CHAR rune = 0xF0005
|
||||
var NONCAPLPAREN_CHAR rune = 0xF0006 // Represents a non-capturing group's LPAREN
|
||||
var ESC_BACKSLASH rune = 0xF0007 // Represents an escaped backslash
|
||||
var CHAR_RANGE rune = 0xF0008 // Represents a character range
|
||||
var LBRACKET rune = 0xF0002
|
||||
var RBRACKET rune = 0xF0003
|
||||
var ANY_CHAR rune = 0xF0004 // Represents any character - used for states where the allChars flag is on.
|
||||
var LPAREN_CHAR rune = 0xF0005 // Parentheses in regex are concatenated with this - it acts as a pseudio-parentheses
|
||||
var RPAREN_CHAR rune = 0xF0006
|
||||
var NONCAPLPAREN_CHAR rune = 0xF0007 // Represents a non-capturing group's LPAREN
|
||||
var ESC_BACKSLASH rune = 0xF0008 // Represents an escaped backslash
|
||||
var CHAR_RANGE rune = 0xF0009 // Represents a character range
|
||||
|
||||
var specialChars = []rune{'?', '*', '\\', '^', '$', '{', '}', '(', ')', '[', ']', '+', '|', '.', CONCAT, '<', '>', LBRACKET, RBRACKET, NONCAPLPAREN_CHAR}
|
||||
var specialChars = []rune{'?', '*', '\\', '^', '$', '{', '}', '(', ')', '[', ']', '+', '|', '.', concatRune, '<', '>', LBRACKET, RBRACKET, NONCAPLPAREN_CHAR}
|
||||
|
||||
// An interface for int and rune, which are identical
|
||||
type character interface {
|
||||
|
||||
224
regex/nfa.go
224
regex/nfa.go
@@ -5,124 +5,124 @@ import (
|
||||
"slices"
|
||||
)
|
||||
|
||||
const EPSILON int = 0xF0000
|
||||
const epsilon int = 0xF0000
|
||||
|
||||
type assertType int
|
||||
|
||||
const (
|
||||
NONE assertType = iota
|
||||
SOS
|
||||
EOS
|
||||
WBOUND
|
||||
NONWBOUND
|
||||
PLA // Positive lookahead
|
||||
NLA // Negative lookahead
|
||||
PLB // Positive lookbehind
|
||||
NLB // Negative lookbehind
|
||||
ALWAYS_TRUE // An assertion that is always true
|
||||
noneAssert assertType = iota
|
||||
sosAssert
|
||||
eosAssert
|
||||
wboundAssert
|
||||
nonwboundAssert
|
||||
plaAssert // Positive lookahead
|
||||
nlaAssert // Negative lookahead
|
||||
plbAssert // Positive lookbehind
|
||||
nlbAssert // Negative lookbehind
|
||||
alwaysTrueAssert // An assertion that is always true
|
||||
)
|
||||
|
||||
type State struct {
|
||||
content stateContents // Contents of current state
|
||||
isEmpty bool // If it is empty - Union operator and Kleene star states will be empty
|
||||
isLast bool // If it is the last state (acept state)
|
||||
output []*State // The outputs of the current state ie. the 'outward arrows'. A union operator state will have more than one of these.
|
||||
transitions map[int][]*State // Transitions to different states (maps a character (int representation) to a _list of states. This is useful if one character can lead multiple states eg. ab|aa)
|
||||
isKleene bool // Identifies whether current node is a 0-state representing Kleene star
|
||||
assert assertType // Type of assertion of current node - NONE means that the node doesn't assert anything
|
||||
allChars bool // Whether or not the state represents all characters (eg. a 'dot' metacharacter). A 'dot' node doesn't store any contents directly, as it would take up too much space
|
||||
except []rune // Only valid if allChars is true - match all characters _except_ the ones in this block. Useful for inverting character classes.
|
||||
lookaroundRegex string // Only for lookaround states - Contents of the regex that the lookaround state holds
|
||||
lookaroundNFA *State // Holds the NFA of the lookaroundRegex - if it exists
|
||||
lookaroundNumCaptureGroups int // Number of capturing groups in lookaround regex if current node is a lookaround
|
||||
groupBegin bool // Whether or not the node starts a capturing group
|
||||
groupEnd bool // Whether or not the node ends a capturing group
|
||||
groupNum int // Which capturing group the node starts / ends
|
||||
type nfaState struct {
|
||||
content stateContents // Contents of current state
|
||||
isEmpty bool // If it is empty - Union operator and Kleene star states will be empty
|
||||
isLast bool // If it is the last state (acept state)
|
||||
output []*nfaState // The outputs of the current state ie. the 'outward arrows'. A union operator state will have more than one of these.
|
||||
transitions map[int][]*nfaState // Transitions to different states (maps a character (int representation) to a _list of states. This is useful if one character can lead multiple states eg. ab|aa)
|
||||
isKleene bool // Identifies whether current node is a 0-state representing Kleene star
|
||||
assert assertType // Type of assertion of current node - NONE means that the node doesn't assert anything
|
||||
allChars bool // Whether or not the state represents all characters (eg. a 'dot' metacharacter). A 'dot' node doesn't store any contents directly, as it would take up too much space
|
||||
except []rune // Only valid if allChars is true - match all characters _except_ the ones in this block. Useful for inverting character classes.
|
||||
lookaroundRegex string // Only for lookaround states - Contents of the regex that the lookaround state holds
|
||||
lookaroundNFA *nfaState // Holds the NFA of the lookaroundRegex - if it exists
|
||||
lookaroundNumCaptureGroups int // Number of capturing groups in lookaround regex if current node is a lookaround
|
||||
groupBegin bool // Whether or not the node starts a capturing group
|
||||
groupEnd bool // Whether or not the node ends a capturing group
|
||||
groupNum int // Which capturing group the node starts / ends
|
||||
// The following properties depend on the current match - I should think about resetting them for every match.
|
||||
zeroMatchFound bool // Whether or not the state has been used for a zero-length match - only relevant for zero states
|
||||
threadGroups []Group // Assuming that a state is part of a 'thread' in the matching process, this array stores the indices of capturing groups in the current thread. As matches are found for this state, its groups will be copied over.
|
||||
}
|
||||
|
||||
// Clones the NFA starting from the given state.
|
||||
func cloneState(start *State) *State {
|
||||
return cloneStateHelper(start, make(map[*State]*State))
|
||||
func cloneState(start *nfaState) *nfaState {
|
||||
return cloneStateHelper(start, make(map[*nfaState]*nfaState))
|
||||
}
|
||||
|
||||
// Helper function for clone. The map is used to keep track of which states have
|
||||
// already been copied, and which ones haven't.
|
||||
// This function was created using output from Llama3.1:405B.
|
||||
func cloneStateHelper(state *State, cloneMap map[*State]*State) *State {
|
||||
func cloneStateHelper(stateToClone *nfaState, cloneMap map[*nfaState]*nfaState) *nfaState {
|
||||
// Base case - if the clone exists in our map, return it.
|
||||
if clone, exists := cloneMap[state]; exists {
|
||||
if clone, exists := cloneMap[stateToClone]; exists {
|
||||
return clone
|
||||
}
|
||||
if state == nil {
|
||||
if stateToClone == nil {
|
||||
return nil
|
||||
}
|
||||
// Recursive case - if the clone doesn't exist, create it, add it to the map,
|
||||
// and recursively call for each of the transition states.
|
||||
clone := &State{
|
||||
content: append([]int{}, state.content...),
|
||||
isEmpty: state.isEmpty,
|
||||
isLast: state.isLast,
|
||||
output: make([]*State, len(state.output)),
|
||||
transitions: make(map[int][]*State),
|
||||
isKleene: state.isKleene,
|
||||
assert: state.assert,
|
||||
zeroMatchFound: state.zeroMatchFound,
|
||||
allChars: state.allChars,
|
||||
except: append([]rune{}, state.except...),
|
||||
lookaroundRegex: state.lookaroundRegex,
|
||||
groupEnd: state.groupEnd,
|
||||
groupBegin: state.groupBegin,
|
||||
groupNum: state.groupNum,
|
||||
clone := &nfaState{
|
||||
content: append([]int{}, stateToClone.content...),
|
||||
isEmpty: stateToClone.isEmpty,
|
||||
isLast: stateToClone.isLast,
|
||||
output: make([]*nfaState, len(stateToClone.output)),
|
||||
transitions: make(map[int][]*nfaState),
|
||||
isKleene: stateToClone.isKleene,
|
||||
assert: stateToClone.assert,
|
||||
zeroMatchFound: stateToClone.zeroMatchFound,
|
||||
allChars: stateToClone.allChars,
|
||||
except: append([]rune{}, stateToClone.except...),
|
||||
lookaroundRegex: stateToClone.lookaroundRegex,
|
||||
groupEnd: stateToClone.groupEnd,
|
||||
groupBegin: stateToClone.groupBegin,
|
||||
groupNum: stateToClone.groupNum,
|
||||
}
|
||||
cloneMap[state] = clone
|
||||
for i, s := range state.output {
|
||||
if s == state {
|
||||
cloneMap[stateToClone] = clone
|
||||
for i, s := range stateToClone.output {
|
||||
if s == stateToClone {
|
||||
clone.output[i] = clone
|
||||
} else {
|
||||
clone.output[i] = cloneStateHelper(s, cloneMap)
|
||||
}
|
||||
}
|
||||
for k, v := range state.transitions {
|
||||
clone.transitions[k] = make([]*State, len(v))
|
||||
for k, v := range stateToClone.transitions {
|
||||
clone.transitions[k] = make([]*nfaState, len(v))
|
||||
for i, s := range v {
|
||||
if s == state {
|
||||
if s == stateToClone {
|
||||
clone.transitions[k][i] = clone
|
||||
} else {
|
||||
clone.transitions[k][i] = cloneStateHelper(s, cloneMap)
|
||||
}
|
||||
}
|
||||
}
|
||||
if state.lookaroundNFA == state {
|
||||
if stateToClone.lookaroundNFA == stateToClone {
|
||||
clone.lookaroundNFA = clone
|
||||
}
|
||||
clone.lookaroundNFA = cloneStateHelper(state.lookaroundNFA, cloneMap)
|
||||
clone.lookaroundNFA = cloneStateHelper(stateToClone.lookaroundNFA, cloneMap)
|
||||
return clone
|
||||
}
|
||||
|
||||
// Checks if the given state's assertion is true. Returns true if the given
|
||||
// state doesn't have an assertion.
|
||||
func (s State) checkAssertion(str []rune, idx int) bool {
|
||||
if s.assert == ALWAYS_TRUE {
|
||||
func (s nfaState) checkAssertion(str []rune, idx int) bool {
|
||||
if s.assert == alwaysTrueAssert {
|
||||
return true
|
||||
}
|
||||
if s.assert == SOS {
|
||||
if s.assert == sosAssert {
|
||||
// Single-line mode: Beginning of string
|
||||
// Multi-line mode: Previous character was newline
|
||||
return idx == 0 || (multilineMode && (idx > 0 && str[idx-1] == '\n'))
|
||||
}
|
||||
if s.assert == EOS {
|
||||
if s.assert == eosAssert {
|
||||
// Single-line mode: End of string
|
||||
// Multi-line mode: current character is newline
|
||||
// Index is at the end of the string, or it points to the last character which is a newline
|
||||
return idx == len(str) || (multilineMode && str[idx] == '\n')
|
||||
}
|
||||
if s.assert == WBOUND {
|
||||
if s.assert == wboundAssert {
|
||||
return isWordBoundary(str, idx)
|
||||
}
|
||||
if s.assert == NONWBOUND {
|
||||
if s.assert == nonwboundAssert {
|
||||
return !isWordBoundary(str, idx)
|
||||
}
|
||||
if s.isLookaround() {
|
||||
@@ -133,7 +133,7 @@ func (s State) checkAssertion(str []rune, idx int) bool {
|
||||
startState := s.lookaroundNFA
|
||||
var runesToMatch []rune
|
||||
var strToMatch string
|
||||
if s.assert == PLA || s.assert == NLA {
|
||||
if s.assert == plaAssert || s.assert == nlaAssert {
|
||||
runesToMatch = str[idx:]
|
||||
} else {
|
||||
runesToMatch = str[:idx]
|
||||
@@ -149,21 +149,21 @@ func (s State) checkAssertion(str []rune, idx int) bool {
|
||||
|
||||
numMatchesFound := 0
|
||||
for _, matchIdx := range matchIndices {
|
||||
if s.assert == PLA || s.assert == NLA { // Lookahead - return true (or false) if at least one match starts at 0. Zero is used because the test-string _starts_ from idx.
|
||||
if s.assert == plaAssert || s.assert == nlaAssert { // Lookahead - return true (or false) if at least one match starts at 0. Zero is used because the test-string _starts_ from idx.
|
||||
if matchIdx[0].StartIdx == 0 {
|
||||
numMatchesFound++
|
||||
}
|
||||
}
|
||||
if s.assert == PLB || s.assert == NLB { // Lookbehind - return true (or false) if at least one match _ends_ at the current index.
|
||||
if s.assert == plbAssert || s.assert == nlbAssert { // Lookbehind - return true (or false) if at least one match _ends_ at the current index.
|
||||
if matchIdx[0].EndIdx == idx {
|
||||
numMatchesFound++
|
||||
}
|
||||
}
|
||||
}
|
||||
if s.assert == PLA || s.assert == PLB { // Positive assertions want at least one match
|
||||
if s.assert == plaAssert || s.assert == plbAssert { // Positive assertions want at least one match
|
||||
return numMatchesFound > 0
|
||||
}
|
||||
if s.assert == NLA || s.assert == NLB { // Negative assertions only want zero matches
|
||||
if s.assert == nlaAssert || s.assert == nlbAssert { // Negative assertions only want zero matches
|
||||
return numMatchesFound == 0
|
||||
}
|
||||
}
|
||||
@@ -171,8 +171,8 @@ func (s State) checkAssertion(str []rune, idx int) bool {
|
||||
}
|
||||
|
||||
// Returns true if the contents of 's' contain the value at the given index of the given string
|
||||
func (s State) contentContains(str []rune, idx int) bool {
|
||||
if s.assert != NONE {
|
||||
func (s nfaState) contentContains(str []rune, idx int) bool {
|
||||
if s.assert != noneAssert {
|
||||
return s.checkAssertion(str, idx)
|
||||
}
|
||||
if s.allChars {
|
||||
@@ -182,19 +182,19 @@ func (s State) contentContains(str []rune, idx int) bool {
|
||||
return slices.Contains(s.content, int(str[idx]))
|
||||
}
|
||||
|
||||
func (s State) isLookaround() bool {
|
||||
return s.assert == PLA || s.assert == PLB || s.assert == NLA || s.assert == NLB
|
||||
func (s nfaState) isLookaround() bool {
|
||||
return s.assert == plaAssert || s.assert == plbAssert || s.assert == nlaAssert || s.assert == nlbAssert
|
||||
}
|
||||
|
||||
// Returns the matches for the character at the given index of the given string.
|
||||
// Also returns the number of matches. Returns -1 if an assertion failed.
|
||||
func (s State) matchesFor(str []rune, idx int) ([]*State, int) {
|
||||
func (s nfaState) matchesFor(str []rune, idx int) ([]*nfaState, int) {
|
||||
// Assertions can be viewed as 'checks'. If the check fails, we return
|
||||
// an empty array and -1.
|
||||
// If it passes, we treat it like any other state, and return all the transitions.
|
||||
if s.assert != NONE {
|
||||
if s.assert != noneAssert {
|
||||
if s.checkAssertion(str, idx) == false {
|
||||
return make([]*State, 0), -1
|
||||
return make([]*nfaState, 0), -1
|
||||
}
|
||||
}
|
||||
listTransitions := s.transitions[int(str[idx])]
|
||||
@@ -211,39 +211,39 @@ func (s State) matchesFor(str []rune, idx int) ([]*State, int) {
|
||||
}
|
||||
|
||||
// verifyLastStatesHelper performs the depth-first recursion needed for verifyLastStates
|
||||
func verifyLastStatesHelper(state *State, visited map[*State]bool) {
|
||||
if len(state.transitions) == 0 {
|
||||
state.isLast = true
|
||||
func verifyLastStatesHelper(st *nfaState, visited map[*nfaState]bool) {
|
||||
if len(st.transitions) == 0 {
|
||||
st.isLast = true
|
||||
return
|
||||
}
|
||||
// if len(state.transitions) == 1 && len(state.transitions[state.content]) == 1 && state.transitions[state.content][0] == state { // Eg. a*
|
||||
if len(state.transitions) == 1 { // Eg. a*
|
||||
if len(st.transitions) == 1 { // Eg. a*
|
||||
var moreThanOneTrans bool // Dummy variable, check if all the transitions for the current state's contents have a length of one
|
||||
for _, c := range state.content {
|
||||
if len(state.transitions[c]) != 1 || state.transitions[c][0] != state {
|
||||
for _, c := range st.content {
|
||||
if len(st.transitions[c]) != 1 || st.transitions[c][0] != st {
|
||||
moreThanOneTrans = true
|
||||
}
|
||||
}
|
||||
state.isLast = !moreThanOneTrans
|
||||
st.isLast = !moreThanOneTrans
|
||||
}
|
||||
|
||||
if state.isKleene { // A State representing a Kleene Star has transitions going out, which loop back to it. If all those transitions point to the same (single) state, then it must be a last state
|
||||
transitionDests := make([]*State, 0)
|
||||
for _, v := range state.transitions {
|
||||
if st.isKleene { // A State representing a Kleene Star has transitions going out, which loop back to it. If all those transitions point to the same (single) state, then it must be a last state
|
||||
transitionDests := make([]*nfaState, 0)
|
||||
for _, v := range st.transitions {
|
||||
transitionDests = append(transitionDests, v...)
|
||||
}
|
||||
if allEqual(transitionDests...) {
|
||||
state.isLast = true
|
||||
st.isLast = true
|
||||
return
|
||||
}
|
||||
}
|
||||
if visited[state] == true {
|
||||
if visited[st] == true {
|
||||
return
|
||||
}
|
||||
visited[state] = true
|
||||
for _, states := range state.transitions {
|
||||
visited[st] = true
|
||||
for _, states := range st.transitions {
|
||||
for i := range states {
|
||||
if states[i] != state {
|
||||
if states[i] != st {
|
||||
verifyLastStatesHelper(states[i], visited)
|
||||
}
|
||||
}
|
||||
@@ -251,12 +251,12 @@ func verifyLastStatesHelper(state *State, visited map[*State]bool) {
|
||||
}
|
||||
|
||||
// verifyLastStates enables the 'isLast' flag for the leaf nodes (last states)
|
||||
func verifyLastStates(start []*State) {
|
||||
verifyLastStatesHelper(start[0], make(map[*State]bool))
|
||||
func verifyLastStates(start []*nfaState) {
|
||||
verifyLastStatesHelper(start[0], make(map[*nfaState]bool))
|
||||
}
|
||||
|
||||
// Concatenates s1 and s2, returns the start of the concatenation.
|
||||
func concatenate(s1 *State, s2 *State) *State {
|
||||
func concatenate(s1 *nfaState, s2 *nfaState) *nfaState {
|
||||
if s1 == nil {
|
||||
return s2
|
||||
}
|
||||
@@ -269,14 +269,14 @@ func concatenate(s1 *State, s2 *State) *State {
|
||||
return s1
|
||||
}
|
||||
|
||||
func kleene(s1 State) (*State, error) {
|
||||
if s1.isEmpty && s1.assert != NONE {
|
||||
func kleene(s1 nfaState) (*nfaState, error) {
|
||||
if s1.isEmpty && s1.assert != noneAssert {
|
||||
return nil, fmt.Errorf("previous token is not quantifiable")
|
||||
}
|
||||
|
||||
toReturn := &State{}
|
||||
toReturn.transitions = make(map[int][]*State)
|
||||
toReturn.content = newContents(EPSILON)
|
||||
toReturn := &nfaState{}
|
||||
toReturn.transitions = make(map[int][]*nfaState)
|
||||
toReturn.content = newContents(epsilon)
|
||||
toReturn.isEmpty = true
|
||||
toReturn.isKleene = true
|
||||
toReturn.output = append(toReturn.output, toReturn)
|
||||
@@ -291,9 +291,9 @@ func kleene(s1 State) (*State, error) {
|
||||
return toReturn, nil
|
||||
}
|
||||
|
||||
func alternate(s1 *State, s2 *State) *State {
|
||||
toReturn := &State{}
|
||||
toReturn.transitions = make(map[int][]*State)
|
||||
func alternate(s1 *nfaState, s2 *nfaState) *nfaState {
|
||||
toReturn := &nfaState{}
|
||||
toReturn.transitions = make(map[int][]*nfaState)
|
||||
toReturn.output = append(toReturn.output, s1.output...)
|
||||
toReturn.output = append(toReturn.output, s2.output...)
|
||||
// Unique append is used here (and elsewhere) to ensure that,
|
||||
@@ -307,16 +307,16 @@ func alternate(s1 *State, s2 *State) *State {
|
||||
for _, c := range s2.content {
|
||||
toReturn.transitions[c], _ = unique_append(toReturn.transitions[c], s2)
|
||||
}
|
||||
toReturn.content = newContents(EPSILON)
|
||||
toReturn.content = newContents(epsilon)
|
||||
toReturn.isEmpty = true
|
||||
|
||||
return toReturn
|
||||
}
|
||||
|
||||
func question(s1 *State) *State { // Use the fact that ab? == a(b|)
|
||||
s2 := &State{}
|
||||
s2.transitions = make(map[int][]*State)
|
||||
s2.content = newContents(EPSILON)
|
||||
func question(s1 *nfaState) *nfaState { // Use the fact that ab? == a(b|)
|
||||
s2 := &nfaState{}
|
||||
s2.transitions = make(map[int][]*nfaState)
|
||||
s2.content = newContents(epsilon)
|
||||
s2.output = append(s2.output, s2)
|
||||
s2.isEmpty = true
|
||||
s3 := alternate(s1, s2)
|
||||
@@ -324,11 +324,11 @@ func question(s1 *State) *State { // Use the fact that ab? == a(b|)
|
||||
}
|
||||
|
||||
// Creates and returns a new state with the 'default' values.
|
||||
func newState() State {
|
||||
ret := State{
|
||||
output: make([]*State, 0),
|
||||
transitions: make(map[int][]*State),
|
||||
assert: NONE,
|
||||
func newState() nfaState {
|
||||
ret := nfaState{
|
||||
output: make([]*nfaState, 0),
|
||||
transitions: make(map[int][]*nfaState),
|
||||
assert: noneAssert,
|
||||
except: append([]rune{}, 0),
|
||||
lookaroundRegex: "",
|
||||
groupEnd: false,
|
||||
@@ -339,10 +339,10 @@ func newState() State {
|
||||
}
|
||||
|
||||
// Creates and returns a state that _always_ has a zero-length match.
|
||||
func zeroLengthMatchState() State {
|
||||
func zeroLengthMatchState() nfaState {
|
||||
start := newState()
|
||||
start.content = newContents(EPSILON)
|
||||
start.content = newContents(epsilon)
|
||||
start.isEmpty = true
|
||||
start.assert = ALWAYS_TRUE
|
||||
start.assert = alwaysTrueAssert
|
||||
return start
|
||||
}
|
||||
|
||||
@@ -2,7 +2,7 @@ package regex
|
||||
|
||||
import "fmt"
|
||||
|
||||
type NodeType int
|
||||
type nodeType int
|
||||
|
||||
// This is a slice containing all escapable characters that have special meaning.
|
||||
// Eg. \b is word boundary, \w is word character etc.
|
||||
@@ -10,28 +10,28 @@ var escapedChars []rune = []rune("wWdDbBnaftrvsS0")
|
||||
|
||||
// This is a list of the possible node types
|
||||
const (
|
||||
CHARACTER NodeType = iota
|
||||
CHARCLASS
|
||||
PIPE
|
||||
CONCATENATE
|
||||
KLEENE
|
||||
QUESTION
|
||||
PLUS
|
||||
ASSERTION
|
||||
LPAREN
|
||||
RPAREN
|
||||
characterNode nodeType = iota
|
||||
charclassNode
|
||||
pipeNode
|
||||
concatenateNode
|
||||
kleeneNode
|
||||
questionNode
|
||||
plusNode
|
||||
assertionNode
|
||||
lparenNode
|
||||
rparenNode
|
||||
)
|
||||
|
||||
// Helper constants for lookarounds
|
||||
const POSITIVE = 1
|
||||
const NEGATIVE = -1
|
||||
const LOOKAHEAD = 1
|
||||
const LOOKBEHIND = -1
|
||||
const positive = 1
|
||||
const negative = -1
|
||||
const lookahead = 1
|
||||
const lookbehind = -1
|
||||
|
||||
var INFINITE_REPS int = -1 // Represents infinite reps eg. the end range in {5,}
|
||||
var infinite_reps int = -1 // Represents infinite reps eg. the end range in {5,}
|
||||
// This represents a node in the postfix representation of the expression
|
||||
type postfixNode struct {
|
||||
nodetype NodeType
|
||||
nodetype nodeType
|
||||
contents []rune // Contents of the node
|
||||
startReps int // Minimum number of times the node should be repeated - used with numeric specifiers
|
||||
endReps int // Maximum number of times the node should be repeated - used with numeric specifiers
|
||||
@@ -49,11 +49,11 @@ type postfixNode struct {
|
||||
// it will not match.
|
||||
func newCharClassNode(nodes []postfixNode, negated bool) postfixNode {
|
||||
rtv := postfixNode{}
|
||||
rtv.nodetype = CHARCLASS
|
||||
rtv.nodetype = charclassNode
|
||||
rtv.startReps = 1
|
||||
rtv.endReps = 1
|
||||
if negated {
|
||||
rtv.nodetype = CHARACTER
|
||||
rtv.nodetype = characterNode
|
||||
rtv.contents = []rune{ANY_CHAR}
|
||||
rtv.allChars = true
|
||||
rtv.except = nodes
|
||||
@@ -70,55 +70,55 @@ func newEscapedNode(c rune, inCharClass bool) (postfixNode, error) {
|
||||
toReturn.endReps = 1
|
||||
switch c {
|
||||
case 's': // Whitespace
|
||||
toReturn.nodetype = CHARACTER
|
||||
toReturn.nodetype = characterNode
|
||||
toReturn.contents = append(toReturn.contents, whitespaceChars...)
|
||||
case 'S': // Non-whitespace
|
||||
toReturn = newPostfixDotNode()
|
||||
toReturn.except = append([]postfixNode{}, newPostfixNode(whitespaceChars...))
|
||||
case 'd': // Digits
|
||||
toReturn.nodetype = CHARACTER
|
||||
toReturn.nodetype = characterNode
|
||||
toReturn.contents = append(toReturn.contents, digitChars...)
|
||||
case 'D': // Non-digits
|
||||
toReturn = newPostfixDotNode()
|
||||
toReturn.except = append([]postfixNode{}, newPostfixNode(digitChars...))
|
||||
case 'w': // word character
|
||||
toReturn.nodetype = CHARACTER
|
||||
toReturn.nodetype = characterNode
|
||||
toReturn.contents = append(toReturn.contents, wordChars...)
|
||||
case 'W': // Non-word character
|
||||
toReturn = newPostfixDotNode()
|
||||
toReturn.except = append([]postfixNode{}, newPostfixNode(wordChars...))
|
||||
case 'b', 'B':
|
||||
if c == 'b' && inCharClass {
|
||||
toReturn.nodetype = CHARACTER
|
||||
toReturn.nodetype = characterNode
|
||||
toReturn.contents = append(toReturn.contents, rune(8))
|
||||
} else {
|
||||
toReturn.nodetype = ASSERTION
|
||||
toReturn.nodetype = assertionNode
|
||||
toReturn.contents = append(toReturn.contents, c)
|
||||
}
|
||||
case 'n': // Newline character
|
||||
toReturn.nodetype = CHARACTER
|
||||
toReturn.nodetype = characterNode
|
||||
toReturn.contents = append(toReturn.contents, '\n')
|
||||
case '0': // NULL character
|
||||
toReturn.nodetype = CHARACTER
|
||||
toReturn.nodetype = characterNode
|
||||
toReturn.contents = append(toReturn.contents, rune(0))
|
||||
case 'a': // Bell character
|
||||
toReturn.nodetype = CHARACTER
|
||||
toReturn.nodetype = characterNode
|
||||
toReturn.contents = append(toReturn.contents, rune(7))
|
||||
case 'f': // Form feed character
|
||||
toReturn.nodetype = CHARACTER
|
||||
toReturn.nodetype = characterNode
|
||||
toReturn.contents = append(toReturn.contents, rune(12))
|
||||
case 't': // Horizontal tab character
|
||||
toReturn.nodetype = CHARACTER
|
||||
toReturn.nodetype = characterNode
|
||||
toReturn.contents = append(toReturn.contents, rune(9))
|
||||
case 'r': // Carriage return
|
||||
toReturn.nodetype = CHARACTER
|
||||
toReturn.nodetype = characterNode
|
||||
toReturn.contents = append(toReturn.contents, rune(13))
|
||||
case 'v': // Vertical tab
|
||||
toReturn.nodetype = CHARACTER
|
||||
toReturn.nodetype = characterNode
|
||||
toReturn.contents = append(toReturn.contents, rune(11))
|
||||
case '-': // Literal hyphen - only in character class
|
||||
if inCharClass {
|
||||
toReturn.nodetype = CHARACTER
|
||||
toReturn.nodetype = characterNode
|
||||
toReturn.contents = append(toReturn.contents, '-')
|
||||
} else {
|
||||
return postfixNode{}, fmt.Errorf("invalid escape character")
|
||||
@@ -127,7 +127,7 @@ func newEscapedNode(c rune, inCharClass bool) (postfixNode, error) {
|
||||
if isNormalChar(c) { // Normal characters cannot be escaped
|
||||
return postfixNode{}, fmt.Errorf("invalid escape character")
|
||||
}
|
||||
toReturn.nodetype = CHARACTER
|
||||
toReturn.nodetype = characterNode
|
||||
toReturn.contents = append(toReturn.contents, c)
|
||||
}
|
||||
return toReturn, nil
|
||||
@@ -142,36 +142,36 @@ func newPostfixNode(contents ...rune) postfixNode {
|
||||
to_return.startReps = 1
|
||||
to_return.endReps = 1
|
||||
if len(contents) > 1 { // If the node has more than one element, it must be a character class - the type must be CHARACTER
|
||||
to_return.nodetype = CHARACTER
|
||||
to_return.nodetype = characterNode
|
||||
to_return.contents = contents
|
||||
} else { // Node has one element, could be anything
|
||||
switch contents[0] {
|
||||
case '+':
|
||||
to_return.nodetype = PLUS
|
||||
to_return.nodetype = plusNode
|
||||
case '?':
|
||||
to_return.nodetype = QUESTION
|
||||
to_return.nodetype = questionNode
|
||||
case '*':
|
||||
to_return.nodetype = KLEENE
|
||||
to_return.nodetype = kleeneNode
|
||||
case '|':
|
||||
to_return.nodetype = PIPE
|
||||
case CONCAT:
|
||||
to_return.nodetype = CONCATENATE
|
||||
to_return.nodetype = pipeNode
|
||||
case concatRune:
|
||||
to_return.nodetype = concatenateNode
|
||||
case '^', '$':
|
||||
to_return.nodetype = ASSERTION
|
||||
to_return.nodetype = assertionNode
|
||||
case '(':
|
||||
to_return.nodetype = LPAREN
|
||||
to_return.nodetype = lparenNode
|
||||
case ')':
|
||||
to_return.nodetype = RPAREN
|
||||
to_return.nodetype = rparenNode
|
||||
default:
|
||||
to_return.nodetype = CHARACTER
|
||||
to_return.nodetype = characterNode
|
||||
}
|
||||
to_return.contents = append(to_return.contents, contents...)
|
||||
|
||||
// Special cases for LPAREN and RPAREN - they have special characters defined for them
|
||||
if to_return.nodetype == LPAREN {
|
||||
if to_return.nodetype == lparenNode {
|
||||
to_return.contents = []rune{LPAREN_CHAR}
|
||||
}
|
||||
if to_return.nodetype == RPAREN {
|
||||
if to_return.nodetype == rparenNode {
|
||||
to_return.contents = []rune{RPAREN_CHAR}
|
||||
}
|
||||
}
|
||||
@@ -183,7 +183,7 @@ func newPostfixDotNode() postfixNode {
|
||||
toReturn := postfixNode{}
|
||||
toReturn.startReps = 1
|
||||
toReturn.endReps = 1
|
||||
toReturn.nodetype = CHARACTER
|
||||
toReturn.nodetype = characterNode
|
||||
toReturn.allChars = true
|
||||
toReturn.contents = []rune{ANY_CHAR}
|
||||
return toReturn
|
||||
@@ -194,7 +194,7 @@ func newPostfixCharNode(contents ...rune) postfixNode {
|
||||
toReturn := postfixNode{}
|
||||
toReturn.startReps = 1
|
||||
toReturn.endReps = 1
|
||||
toReturn.nodetype = CHARACTER
|
||||
toReturn.nodetype = characterNode
|
||||
toReturn.contents = append(toReturn.contents, contents...)
|
||||
return toReturn
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user