Added more descriptive comments; throw error if non-greedy operator is used; use new definition for kleene()

master
Aadhavan Srinivasan 4 days ago
parent ecab7cc522
commit 833dd269a8

@ -112,23 +112,30 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
// Convert the string to a slice of runes to allow iteration through it // Convert the string to a slice of runes to allow iteration through it
re_runes_orig := []rune(re) // This is the rune slice before the first parsing loop (which detects and replaces numeric ranges) re_runes_orig := []rune(re) // This is the rune slice before the first parsing loop (which detects and replaces numeric ranges)
re_runes := make([]rune, 0) re_runes := make([]rune, 0)
// Check for numeric range. If we are at the start of a numeric range, // The following checks are performed here:
// skip to end and construct the equivalent regex for the range. // 1. Check for numeric range. If we are at the start of a numeric range,
// The reason this is outside the loop below, is that it actually modifies // skip to end and construct the equivalent regex for the range.
// the given regex (we 'cut' the numeric range and 'paste' an equivalent regex). // The reason this is outside the loop below, is that it actually modifies
// It also makes the overall parsing easier, since I don't have to worry about the numeric range // the given regex (we 'cut' the numeric range and 'paste' an equivalent regex).
// anymore. // It also makes the overall parsing easier, since I don't have to worry about the numeric range
// Eventually, I might be able to add it into the main parsing loop, to reduce the time // anymore.
// complexity. // Eventually, I might be able to add it into the main parsing loop, to reduce the time
// A numeric range has the syntax: <num1-num2>. It matches all numbers in this range. // complexity.
// A numeric range has the syntax: <num1-num2>. It matches all numbers in this range.
// //
// Also check for non-capturing groups. The LPAREN of a non-capturing group looks like this: '(?:' // 2. Check for non-capturing groups. The LPAREN of a non-capturing group looks like this: '(?:'
// I take this out, and put in a special character - NONCAPLPAREN_CHAR. // I take this out, and put in a special character - NONCAPLPAREN_CHAR.
// //
// Another check is made for unescaped brackets - opening brackets are replaced with LBRACKET and closing brackets are replaced with RBRACKET. // 3. Another check is made for unescaped brackets - opening brackets are replaced with
// Finally, check for escaped backslashes. Replace these with the BACKSLASH metacharacter. Later, in thompson(), // LBRACKET and closing brackets are replaced with RBRACKET.
// these will be converted back. This avoids confusiuon in detecting whether a character is escaped eg. detecting //
// whether '\\[a]' has an escaped opening bracket (it doesn't). // 4. Check for escaped backslashes. Replace these with the BACKSLASH
// metacharacter. Later, in thompson(), these will be converted back. This avoids
// confusion in detecting whether a character is escaped eg. detecting
// whether '\\[a]' has an escaped opening bracket (it doesn't).
//
// 5. Check for non-greedy operators. These are not supported at the moment, so an error
// must be thrown if the user attempts to use a non-greedy operator.
for i := 0; i < len(re_runes_orig); i++ { for i := 0; i < len(re_runes_orig); i++ {
c := re_runes_orig[i] c := re_runes_orig[i]
if c == '<' && (i == 0 || (re_runes_orig[i-1] != '\\' && re_runes_orig[i-1] != '?')) { if c == '<' && (i == 0 || (re_runes_orig[i-1] != '\\' && re_runes_orig[i-1] != '?')) {
@ -172,6 +179,8 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
} else if c == ']' && (i == 0 || re_runes[len(re_runes)-1] != '\\') { } else if c == ']' && (i == 0 || re_runes[len(re_runes)-1] != '\\') {
re_runes = append(re_runes, RBRACKET) re_runes = append(re_runes, RBRACKET)
continue continue
} else if slices.Contains([]rune{'+', '*', '?'}, c) && (i < len(re_runes_orig)-1 && re_runes_orig[i+1] == '?') {
return nil, fmt.Errorf("non-greedy operators are not supported")
} else { } else {
re_runes = append(re_runes, c) re_runes = append(re_runes, c)
} }
@ -480,9 +489,6 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
if (c == '*' && outQueueFinalElement.nodetype == KLEENE) || (c == '+' && outQueueFinalElement.nodetype == PLUS) { // You cannot apply a quantifier to a quantifier in this way if (c == '*' && outQueueFinalElement.nodetype == KLEENE) || (c == '+' && outQueueFinalElement.nodetype == PLUS) { // You cannot apply a quantifier to a quantifier in this way
return nil, fmt.Errorf("illegal use of token '%c'", c) return nil, fmt.Errorf("illegal use of token '%c'", c)
} }
if c == '?' && slices.Contains([]NodeType{KLEENE, PLUS, QUESTION}, outQueueFinalElement.nodetype) {
return nil, fmt.Errorf("non-greedy operators not supported")
}
opStack = append(opStack, c) opStack = append(opStack, c)
} }
} }
@ -1004,18 +1010,24 @@ func thompson(re []postfixNode) (Reg, error) {
if err != nil { if err != nil {
return Reg{}, fmt.Errorf("error applying kleene star") return Reg{}, fmt.Errorf("error applying kleene star")
} }
if s1.isEmpty && s1.assert != NONE { stateToAdd, err := kleene(*s1)
return Reg{}, fmt.Errorf("previous token is not quantifiable") if err != nil {
return Reg{}, err
} }
stateToAdd := kleene(*s1)
nfa = append(nfa, stateToAdd) nfa = append(nfa, stateToAdd)
case PLUS: // a+ is equivalent to aa* case PLUS: // a+ is equivalent to aa*
s1 := mustPop(&nfa) s1 := mustPop(&nfa)
s2 := kleene(*s1) s2, err := kleene(*s1)
if err != nil {
return Reg{}, err
}
s1 = concatenate(s1, s2) s1 = concatenate(s1, s2)
nfa = append(nfa, s1) nfa = append(nfa, s1)
case QUESTION: // ab? is equivalent to a(b|) case QUESTION: // ab? is equivalent to a(b|)
s1 := mustPop(&nfa) s1, err := pop(&nfa)
if err != nil {
return Reg{}, fmt.Errorf("error applying question operator")
}
s2 := question(s1) s2 := question(s1)
nfa = append(nfa, s2) nfa = append(nfa, s2)
case PIPE: case PIPE:
@ -1068,7 +1080,10 @@ func thompson(re []postfixNode) (Reg, error) {
stateToAdd = concatenate(stateToAdd, cloneState(state)) stateToAdd = concatenate(stateToAdd, cloneState(state))
} }
if c.endReps == INFINITE_REPS { // Case 3 if c.endReps == INFINITE_REPS { // Case 3
s2 := kleene(*state) s2, err := kleene(*state)
if err != nil {
return Reg{}, err
}
stateToAdd = concatenate(stateToAdd, s2) stateToAdd = concatenate(stateToAdd, s2)
} else { // Case 2 } else { // Case 2
for i := c.startReps; i < c.endReps; i++ { for i := c.startReps; i < c.endReps; i++ {

Loading…
Cancel
Save