From 98f4c9e418358c1958103055570744bb0c454c6b Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Wed, 18 Dec 2024 15:22:43 -0500 Subject: [PATCH] Added support for non-capturing groups --- main.go | 35 +++++++++++++++++++++++++---------- misc.go | 3 ++- 2 files changed, 27 insertions(+), 11 deletions(-) diff --git a/main.go b/main.go index 602008e..8e99179 100644 --- a/main.go +++ b/main.go @@ -51,6 +51,9 @@ func shuntingYard(re string) []postfixNode { // Eventually, I might be able to add it into the main parsing loop, to reduce the time // complexity. // A numeric range has the syntax: . Ir matches all numbers in this range. + // + // Also check for non-capturing groups. The LPAREN of a non-capturing group looks like this: '(?:' + // I take this out, and put in a special character - NONCAPLPAREN_CHAR. for i := 0; i < len(re_runes_orig); i++ { c := re_runes_orig[i] if c == '<' && (i == 0 || (re_runes_orig[i-1] != '\\' && re_runes_orig[i-1] != '?')) { @@ -82,7 +85,9 @@ func shuntingYard(re string) []postfixNode { fmt.Sscanf(tmpStr, "%d-%d", &rangeStart, &rangeEnd) regex := range2regex(rangeStart, rangeEnd) re_runes = append(re_runes, []rune(regex)...) - + } else if c == '(' && i < len(re_runes_orig)-2 && re_runes_orig[i+1] == '?' && re_runes_orig[i+2] == ':' { + re_runes = append(re_runes, NONCAPLPAREN_CHAR) + i += 2 } else { re_runes = append(re_runes, c) } @@ -148,7 +153,11 @@ func shuntingYard(re string) []postfixNode { } re_postfix = append(re_postfix, re_runes[i]) // Append closing brace } - if i < len(re_runes) && re_runes[i] == '(' && (i == 0 || re_runes[i-1] != '\\') && (i < len(re_runes)-1 && re_runes[i+1] == '?') { // Unescaped open parentheses followed by question mark = lokaround. Don't mess with it. 
+ if i < len(re_runes)-3 && string(re_runes[i+1:i+4]) == "(?:" { // Non-capturing lparen + re_postfix = append(re_postfix, NONCAPLPAREN_CHAR) + i += 3 + } + if i < len(re_runes) && re_runes[i] == '(' && (i == 0 || re_runes[i-1] != '\\') && (i < len(re_runes)-2 && re_runes[i+1] == '?' && slices.Contains([]rune{'=', '!', '<'}, re_runes[i+2])) { // Unescaped open parentheses followed by question mark then '<', '!' or '=' => lokaround. Don't mess with it. i++ // Step inside if i == len(re_runes)-1 || (re_runes[i+1] != '=' && re_runes[i+1] != '!' && re_runes[i+1] != '<') { panic("Invalid regex. Lookaround intended?") @@ -174,7 +183,7 @@ func shuntingYard(re string) []postfixNode { } continue } - if i < len(re_runes) && (re_runes[i] != '(' && re_runes[i] != '|' && re_runes[i] != '\\') || (i > 0 && re_runes[i-1] == '\\') { // Every character should be concatenated if it is escaped + if i < len(re_runes) && (re_runes[i] != '(' && re_runes[i] != NONCAPLPAREN_CHAR && re_runes[i] != '|' && re_runes[i] != '\\') || (i > 0 && re_runes[i-1] == '\\') { // Every character should be concatenated if it is escaped if i < len(re_runes)-1 { if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != '+' && re_runes[i+1] != '?' && re_runes[i+1] != ')' && re_runes[i+1] != '{' { re_postfix = append(re_postfix, CONCAT) @@ -197,7 +206,7 @@ func shuntingYard(re string) []postfixNode { b. If not, keep popping from opStack (and appending to outQueue) until: i. opStack is empty, OR ii. current character has greater priority than top of opStack - 3. If current character is '(', push to opStack + 3. If current character is '(' or NONCAPLPAREN_CHAR, push to opStack 4. If current character is ')', pop from opStack (and append to outQueue) until '(' is found. Discard parantheses. 5. If current character is '[', find all the characters until ']', then create a postfixNode containing all these contents. Add this node to outQueue. 6. 
If current character is '{', find the appropriate numeric specifier (range start, range end). Apply the range to the postfixNode at the end of outQueue. @@ -389,22 +398,28 @@ func shuntingYard(re string) []postfixNode { outQueue[idx].startReps = startRangeNum outQueue[idx].endReps = endRangeNum } - if c == '(' { + if c == '(' || c == NONCAPLPAREN_CHAR { opStack = append(opStack, c) - outQueue = append(outQueue, newPostfixNode(c)) + if c == '(' { // We only push _capturing_ group parentheses to outQueue + outQueue = append(outQueue, newPostfixNode(c)) + } numOpenParens++ } if c == ')' { - // Keep popping from opStack until we encounter an opening parantheses. Panic if we reach the end of the stack. - for val, err := peek(opStack); val != '('; val, err = peek(opStack) { + // Keep popping from opStack until we encounter an opening parenthesis or a NONCAPLPAREN_CHAR. Panic if we reach the end of the stack. + var val rune + var err error + for val, err = peek(opStack); val != '(' && val != NONCAPLPAREN_CHAR; val, err = peek(opStack) { if err != nil { panic("ERROR: Imbalanced parantheses.") } to_append := mustPop(&opStack) outQueue = append(outQueue, newPostfixNode(to_append)) } - _ = mustPop(&opStack) // Get rid of opening parentheses - outQueue = append(outQueue, newPostfixNode(')')) // Add closing parentheses + _ = mustPop(&opStack) // Get rid of opening parentheses + if val == '(' { // Whatever was inside the parentheses was a _capturing_ group, so we append the closing parentheses as well + outQueue = append(outQueue, newPostfixNode(')')) // Add closing parentheses + } numOpenParens-- } } diff --git a/misc.go b/misc.go index 8d920f2..eb5c75c 100644 --- a/misc.go +++ b/misc.go @@ -13,6 +13,7 @@ var RBRACKET rune = 0xF0001 var ANY_CHAR rune = 0xF0002 // Represents any character - used for states where the allChars flag is on.
var LPAREN_CHAR rune = 0xF0003 // Parentheses in regex are concatenated with this - it acts as a pseudio-parentheses var RPAREN_CHAR rune = 0xF0004 +var NONCAPLPAREN_CHAR rune = 0xF0005 // Represents a non-capturing group's LPAREN // Returns true if str[idx] and str[idx-1] are separated by a word boundary. func isWordBoundary(str []rune, idx int) bool { @@ -26,7 +27,7 @@ func isWordBoundary(str []rune, idx int) bool { func isNormalChar(c rune) bool { specialChars := []rune(`?*\^${}()+|[].~<>`) - specialChars = append(specialChars, LBRACKET, RBRACKET) + specialChars = append(specialChars, LBRACKET, RBRACKET, NONCAPLPAREN_CHAR) return !slices.Contains(specialChars, c) }