Added support for non-capturing groups

2024-12-18 15:22:43 -05:00
parent 8d6e1a41a5
commit 98f4c9e418
2 changed files with 27 additions and 11 deletions
--- a/main.go
+++ b/main.go
@@ -51,6 +51,9 @@ func shuntingYard(re string) []postfixNode {
 	// Eventually, I might be able to add it into the main parsing loop, to reduce the time
 	// complexity.
 	// A numeric range has the syntax: <num1-num2>. It matches all numbers in this range.
 	//
 	// Also check for non-capturing groups. The LPAREN of a non-capturing group looks like this: '(?:'
 	// I take this out, and put in a special character - NONCAPLPAREN_CHAR.
 	for i := 0; i < len(re_runes_orig); i++ {
 		c := re_runes_orig[i]
 		if c == '<' && (i == 0 || (re_runes_orig[i-1] != '\\' && re_runes_orig[i-1] != '?')) {
@@ -82,7 +85,9 @@ func shuntingYard(re string) []postfixNode {
 			fmt.Sscanf(tmpStr, "%d-%d", &rangeStart, &rangeEnd)
 			regex := range2regex(rangeStart, rangeEnd)
 			re_runes = append(re_runes, []rune(regex)...)
-
+		} else if c == '(' && i < len(re_runes_orig)-2 && re_runes_orig[i+1] == '?' && re_runes_orig[i+2] == ':' {
 			re_runes = append(re_runes, NONCAPLPAREN_CHAR)
 			i += 2
 		} else {
 			re_runes = append(re_runes, c)
 		}
@@ -148,7 +153,11 @@ func shuntingYard(re string) []postfixNode {
 			}
 			re_postfix = append(re_postfix, re_runes[i]) // Append closing brace
 		}
-		if i < len(re_runes) && re_runes[i] == '(' && (i == 0 || re_runes[i-1] != '\\') && (i < len(re_runes)-1 && re_runes[i+1] == '?') { // Unescaped open parentheses followed by question mark = lokaround. Don't mess with it.
+		if i < len(re_runes)-3 && string(re_runes[i+1:i+4]) == "(?:" { // Non-capturing lparen
 			re_postfix = append(re_postfix, NONCAPLPAREN_CHAR)
 			i += 3
 		}
 		if i < len(re_runes) && re_runes[i] == '(' && (i == 0 || re_runes[i-1] != '\\') && (i < len(re_runes)-2 && re_runes[i+1] == '?' && slices.Contains([]rune{'=', '!', '<'}, re_runes[i+2])) { // Unescaped open parentheses followed by question mark then '<', '!' or '=' => lokaround. Don't mess with it.
 			i++ // Step inside
 			if i == len(re_runes)-1 || (re_runes[i+1] != '=' && re_runes[i+1] != '!' && re_runes[i+1] != '<') {
 				panic("Invalid regex. Lookaround intended?")
@@ -174,7 +183,7 @@ func shuntingYard(re string) []postfixNode {
 			}
 			continue
 		}
-		if i < len(re_runes) && (re_runes[i] != '(' && re_runes[i] != '|' && re_runes[i] != '\\') || (i > 0 && re_runes[i-1] == '\\') { // Every character should be concatenated if it is escaped
+		if i < len(re_runes) && (re_runes[i] != '(' && re_runes[i] != NONCAPLPAREN_CHAR && re_runes[i] != '|' && re_runes[i] != '\\') || (i > 0 && re_runes[i-1] == '\\') { // Every character should be concatenated if it is escaped
 			if i < len(re_runes)-1 {
 				if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != '+' && re_runes[i+1] != '?' && re_runes[i+1] != ')' && re_runes[i+1] != '{' {
 					re_postfix = append(re_postfix, CONCAT)
@@ -197,7 +206,7 @@ func shuntingYard(re string) []postfixNode {
 			b. If not, keep popping from opStack (and appending to outQueue) until:
 				i. opStack is empty, OR
 				ii. current character has greater priority than top of opStack
-		3. If current character is '(', push to opStack
+		3. If current character is '(' or NONCAPLPAREN_CHAR, push to opStack
 		4. If current character is ')', pop from opStack (and append to outQueue) until '(' or NONCAPLPAREN_CHAR is found. Discard parentheses.
 		5. If current character is '[', find all the characters until ']', then create a postfixNode containing all these contents. Add this node to outQueue.
 		6. If current character is '{', find the appropriate numeric specifier (range start, range end). Apply the range to the postfixNode at the end of outQueue.
@@ -389,22 +398,28 @@ func shuntingYard(re string) []postfixNode {
 			outQueue[idx].startReps = startRangeNum
 			outQueue[idx].endReps = endRangeNum
 		}
-		if c == '(' {
+		if c == '(' || c == NONCAPLPAREN_CHAR {
 			opStack = append(opStack, c)
-			outQueue = append(outQueue, newPostfixNode(c))
+			if c == '(' { // We only push _capturing_ group parentheses to outQueue
 				outQueue = append(outQueue, newPostfixNode(c))
 			}
 			numOpenParens++
 		}
 		if c == ')' {
-			// Keep popping from opStack until we encounter an opening parantheses. Panic if we reach the end of the stack.
+			// Keep popping from opStack until we encounter an opening parenthesis or a NONCAPLPAREN_CHAR. Panic if we reach the end of the stack.
-			for val, err := peek(opStack); val != '('; val, err = peek(opStack) {
+			var val rune
 			var err error
 			for val, err = peek(opStack); val != '(' && val != NONCAPLPAREN_CHAR; val, err = peek(opStack) {
 				if err != nil {
 					panic("ERROR: Imbalanced parantheses.")
 				}
 				to_append := mustPop(&opStack)
 				outQueue = append(outQueue, newPostfixNode(to_append))
 			}
-			_ = mustPop(&opStack)                            // Get rid of opening parentheses
+			_ = mustPop(&opStack) // Get rid of opening parentheses
-			outQueue = append(outQueue, newPostfixNode(')')) // Add closing parentheses
+			if val == '(' {       // Whatever was inside the parentheses was a _capturing_ group, so we append the closing parentheses as well
 				outQueue = append(outQueue, newPostfixNode(')')) // Add closing parentheses
 			}
 			numOpenParens--
 		}
 	}
--- a/misc.go
+++ b/misc.go
@@ -13,6 +13,7 @@ var RBRACKET rune = 0xF0001
 var ANY_CHAR rune = 0xF0002    // Represents any character - used for states where the allChars flag is on.
 var LPAREN_CHAR rune = 0xF0003 // Parentheses in regex are concatenated with this - it acts as a pseudo-parenthesis
 var RPAREN_CHAR rune = 0xF0004 // NOTE(review): presumably the closing counterpart of LPAREN_CHAR - confirm
 var NONCAPLPAREN_CHAR rune = 0xF0005 // Represents a non-capturing group's LPAREN; shuntingYard substitutes it for the '(?:' sequence
 // Returns true if str[idx] and str[idx-1] are separated by a word boundary.
 func isWordBoundary(str []rune, idx int) bool {
@@ -26,7 +27,7 @@ func isWordBoundary(str []rune, idx int) bool {
 // isNormalChar reports whether c is NOT a special character. The special
 // set is the characters in the raw-string literal below plus the internal
 // placeholder runes LBRACKET, RBRACKET and (as of this change)
 // NONCAPLPAREN_CHAR.
 func isNormalChar(c rune) bool {
 	specialChars := []rune(`?*\^${}()+|[].~<>`)
-		specialChars = append(specialChars, LBRACKET, RBRACKET)
+		specialChars = append(specialChars, LBRACKET, RBRACKET, NONCAPLPAREN_CHAR)
 	return !slices.Contains(specialChars, c)
 }