From 833dd269a844bec6782482e284271ce976c6f629 Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Wed, 29 Jan 2025 10:28:18 -0500 Subject: [PATCH] Added more descriptive comments; throw error if non-greedy operator is used; use new definition for kleene() --- compile.go | 63 +++++++++++++++++++++++++++++++++--------------------- 1 file changed, 39 insertions(+), 24 deletions(-) diff --git a/compile.go b/compile.go index 65fa864..c9d8c23 100644 --- a/compile.go +++ b/compile.go @@ -112,23 +112,30 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) { // Convert the string to a slice of runes to allow iteration through it re_runes_orig := []rune(re) // This is the rune slice before the first parsing loop (which detects and replaces numeric ranges) re_runes := make([]rune, 0) - // Check for numeric range. If we are at the start of a numeric range, - // skip to end and construct the equivalent regex for the range. - // The reason this is outside the loop below, is that it actually modifies - // the given regex (we 'cut' the numeric range and 'paste' an equivalent regex). - // It also makes the overall parsing easier, since I don't have to worry about the numeric range - // anymore. - // Eventually, I might be able to add it into the main parsing loop, to reduce the time - // complexity. - // A numeric range has the syntax: . Ir matches all numbers in this range. + // The following checks are performed here: + // 1. Check for numeric range. If we are at the start of a numeric range, + // skip to end and construct the equivalent regex for the range. + // The reason this is outside the loop below, is that it actually modifies + // the given regex (we 'cut' the numeric range and 'paste' an equivalent regex). + // It also makes the overall parsing easier, since I don't have to worry about the numeric range + // anymore. + // Eventually, I might be able to add it into the main parsing loop, to reduce the time + // complexity. 
+ // A numeric range has the syntax: <num1-num2>. It matches all numbers in this range. // - // Also check for non-capturing groups. The LPAREN of a non-capturing group looks like this: '(?:' - // I take this out, and put in a special character - NONCAPLPAREN_CHAR. + // 2. Check for non-capturing groups. The LPAREN of a non-capturing group looks like this: '(?:' + // I take this out, and put in a special character - NONCAPLPAREN_CHAR. // - // Another check is made for unescaped brackets - opening brackets are replaced with LBRACKET and closing brackets are replaced with RBRACKET. - // Finally, check for escaped backslashes. Replace these with the BACKSLASH metacharacter. Later, in thompson(), - // these will be converted back. This avoids confusiuon in detecting whether a character is escaped eg. detecting - // whether '\\[a]' has an escaped opening bracket (it doesn't). + // 3. Another check is made for unescaped brackets - opening brackets are replaced with + // LBRACKET and closing brackets are replaced with RBRACKET. + // + // 4. Check for escaped backslashes. Replace these with the BACKSLASH + // metacharacter. Later, in thompson(), these will be converted back. This avoids + // confusion in detecting whether a character is escaped eg. detecting + // whether '\\[a]' has an escaped opening bracket (it doesn't). + // + // 5. Check for non-greedy operators. These are not supported at the moment, so an error + // must be thrown if the user attempts to use a non-greedy operator.
for i := 0; i < len(re_runes_orig); i++ { c := re_runes_orig[i] if c == '<' && (i == 0 || (re_runes_orig[i-1] != '\\' && re_runes_orig[i-1] != '?')) { @@ -172,6 +179,8 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) { } else if c == ']' && (i == 0 || re_runes[len(re_runes)-1] != '\\') { re_runes = append(re_runes, RBRACKET) continue + } else if slices.Contains([]rune{'+', '*', '?'}, c) && (i < len(re_runes_orig)-1 && re_runes_orig[i+1] == '?') { + return nil, fmt.Errorf("non-greedy operators are not supported") } else { re_runes = append(re_runes, c) } @@ -480,9 +489,6 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) { if (c == '*' && outQueueFinalElement.nodetype == KLEENE) || (c == '+' && outQueueFinalElement.nodetype == PLUS) { // You cannot apply a quantifier to a quantifier in this way return nil, fmt.Errorf("illegal use of token '%c'", c) } - if c == '?' && slices.Contains([]NodeType{KLEENE, PLUS, QUESTION}, outQueueFinalElement.nodetype) { - return nil, fmt.Errorf("non-greedy operators not supported") - } opStack = append(opStack, c) } } @@ -1004,18 +1010,24 @@ func thompson(re []postfixNode) (Reg, error) { if err != nil { return Reg{}, fmt.Errorf("error applying kleene star") } - if s1.isEmpty && s1.assert != NONE { - return Reg{}, fmt.Errorf("previous token is not quantifiable") + stateToAdd, err := kleene(*s1) + if err != nil { + return Reg{}, err } - stateToAdd := kleene(*s1) nfa = append(nfa, stateToAdd) case PLUS: // a+ is equivalent to aa* s1 := mustPop(&nfa) - s2 := kleene(*s1) + s2, err := kleene(*s1) + if err != nil { + return Reg{}, err + } s1 = concatenate(s1, s2) nfa = append(nfa, s1) case QUESTION: // ab? 
is equivalent to a(b|) - s1 := mustPop(&nfa) + s1, err := pop(&nfa) + if err != nil { + return Reg{}, fmt.Errorf("error applying question operator") + } s2 := question(s1) nfa = append(nfa, s2) case PIPE: @@ -1068,7 +1080,10 @@ func thompson(re []postfixNode) (Reg, error) { stateToAdd = concatenate(stateToAdd, cloneState(state)) } if c.endReps == INFINITE_REPS { // Case 3 - s2 := kleene(*state) + s2, err := kleene(*state) + if err != nil { + return Reg{}, err + } stateToAdd = concatenate(stateToAdd, s2) } else { // Case 2 for i := c.startReps; i < c.endReps; i++ {