Added more descriptive comments; throw error if non-greedy operator is used; use new definition for kleene()
This commit is contained in:
63
compile.go
63
compile.go
@@ -112,23 +112,30 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
// Convert the string to a slice of runes to allow iteration through it
|
||||
re_runes_orig := []rune(re) // This is the rune slice before the first parsing loop (which detects and replaces numeric ranges)
|
||||
re_runes := make([]rune, 0)
|
||||
// Check for numeric range. If we are at the start of a numeric range,
|
||||
// skip to end and construct the equivalent regex for the range.
|
||||
// The reason this is outside the loop below, is that it actually modifies
|
||||
// the given regex (we 'cut' the numeric range and 'paste' an equivalent regex).
|
||||
// It also makes the overall parsing easier, since I don't have to worry about the numeric range
|
||||
// anymore.
|
||||
// Eventually, I might be able to add it into the main parsing loop, to reduce the time
|
||||
// complexity.
|
||||
// A numeric range has the syntax: <num1-num2>. It matches all numbers in this range.
|
||||
// The following checks are performed here:
|
||||
// 1. Check for numeric range. If we are at the start of a numeric range,
|
||||
// skip to end and construct the equivalent regex for the range.
|
||||
// The reason this is outside the loop below, is that it actually modifies
|
||||
// the given regex (we 'cut' the numeric range and 'paste' an equivalent regex).
|
||||
// It also makes the overall parsing easier, since I don't have to worry about the numeric range
|
||||
// anymore.
|
||||
// Eventually, I might be able to add it into the main parsing loop, to reduce the time
|
||||
// complexity.
|
||||
// A numeric range has the syntax: <num1-num2>. It matches all numbers in this range.
|
||||
//
|
||||
// Also check for non-capturing groups. The LPAREN of a non-capturing group looks like this: '(?:'
|
||||
// I take this out, and put in a special character - NONCAPLPAREN_CHAR.
|
||||
// 2. Check for non-capturing groups. The LPAREN of a non-capturing group looks like this: '(?:'
|
||||
// I take this out, and put in a special character - NONCAPLPAREN_CHAR.
|
||||
//
|
||||
// Another check is made for unescaped brackets - opening brackets are replaced with LBRACKET and closing brackets are replaced with RBRACKET.
|
||||
// Finally, check for escaped backslashes. Replace these with the BACKSLASH metacharacter. Later, in thompson(),
|
||||
// these will be converted back. This avoids confusion in detecting whether a character is escaped e.g. detecting
|
||||
// whether '\\[a]' has an escaped opening bracket (it doesn't).
|
||||
// 3. Another check is made for unescaped brackets - opening brackets are replaced with
|
||||
// LBRACKET and closing brackets are replaced with RBRACKET.
|
||||
//
|
||||
// 4. Check for escaped backslashes. Replace these with the BACKSLASH
|
||||
// metacharacter. Later, in thompson(), these will be converted back. This avoids
|
||||
// confusion in detecting whether a character is escaped eg. detecting
|
||||
// whether '\\[a]' has an escaped opening bracket (it doesn't).
|
||||
//
|
||||
// 5. Check for non-greedy operators. These are not supported at the moment, so an error
|
||||
// must be thrown if the user attempts to use a non-greedy operator.
|
||||
for i := 0; i < len(re_runes_orig); i++ {
|
||||
c := re_runes_orig[i]
|
||||
if c == '<' && (i == 0 || (re_runes_orig[i-1] != '\\' && re_runes_orig[i-1] != '?')) {
|
||||
@@ -172,6 +179,8 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
} else if c == ']' && (i == 0 || re_runes[len(re_runes)-1] != '\\') {
|
||||
re_runes = append(re_runes, RBRACKET)
|
||||
continue
|
||||
} else if slices.Contains([]rune{'+', '*', '?'}, c) && (i < len(re_runes_orig)-1 && re_runes_orig[i+1] == '?') {
|
||||
return nil, fmt.Errorf("non-greedy operators are not supported")
|
||||
} else {
|
||||
re_runes = append(re_runes, c)
|
||||
}
|
||||
@@ -480,9 +489,6 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
if (c == '*' && outQueueFinalElement.nodetype == KLEENE) || (c == '+' && outQueueFinalElement.nodetype == PLUS) { // You cannot apply a quantifier to a quantifier in this way
|
||||
return nil, fmt.Errorf("illegal use of token '%c'", c)
|
||||
}
|
||||
if c == '?' && slices.Contains([]NodeType{KLEENE, PLUS, QUESTION}, outQueueFinalElement.nodetype) {
|
||||
return nil, fmt.Errorf("non-greedy operators not supported")
|
||||
}
|
||||
opStack = append(opStack, c)
|
||||
}
|
||||
}
|
||||
@@ -1004,18 +1010,24 @@ func thompson(re []postfixNode) (Reg, error) {
|
||||
if err != nil {
|
||||
return Reg{}, fmt.Errorf("error applying kleene star")
|
||||
}
|
||||
if s1.isEmpty && s1.assert != NONE {
|
||||
return Reg{}, fmt.Errorf("previous token is not quantifiable")
|
||||
stateToAdd, err := kleene(*s1)
|
||||
if err != nil {
|
||||
return Reg{}, err
|
||||
}
|
||||
stateToAdd := kleene(*s1)
|
||||
nfa = append(nfa, stateToAdd)
|
||||
case PLUS: // a+ is equivalent to aa*
|
||||
s1 := mustPop(&nfa)
|
||||
s2 := kleene(*s1)
|
||||
s2, err := kleene(*s1)
|
||||
if err != nil {
|
||||
return Reg{}, err
|
||||
}
|
||||
s1 = concatenate(s1, s2)
|
||||
nfa = append(nfa, s1)
|
||||
case QUESTION: // ab? is equivalent to a(b|)
|
||||
s1 := mustPop(&nfa)
|
||||
s1, err := pop(&nfa)
|
||||
if err != nil {
|
||||
return Reg{}, fmt.Errorf("error applying question operator")
|
||||
}
|
||||
s2 := question(s1)
|
||||
nfa = append(nfa, s2)
|
||||
case PIPE:
|
||||
@@ -1068,7 +1080,10 @@ func thompson(re []postfixNode) (Reg, error) {
|
||||
stateToAdd = concatenate(stateToAdd, cloneState(state))
|
||||
}
|
||||
if c.endReps == INFINITE_REPS { // Case 3
|
||||
s2 := kleene(*state)
|
||||
s2, err := kleene(*state)
|
||||
if err != nil {
|
||||
return Reg{}, err
|
||||
}
|
||||
stateToAdd = concatenate(stateToAdd, s2)
|
||||
} else { // Case 2
|
||||
for i := c.startReps; i < c.endReps; i++ {
|
||||
|
Reference in New Issue
Block a user