Added support for lookarounds; parsing and adding nodes for different lookarounds

2024-11-22 00:10:15 -05:00
parent c807d6664e
commit 11c0a0552f
1 changed files with 119 additions and 11 deletions
--- a/main.go
+++ b/main.go
@@ -45,6 +45,10 @@ func shuntingYard(re string) []postfixNode {
 			a. This makes sense, because these operators can't be _concatenated_ with anything else.
 		2. The second character isn't a 'closing operator' - one that applies to something before it
 			a. Again, these operators can't be concatenated _to_. They can, however, be concatenated _from_.
+	Caveats:
+		1. Don't mess with anything inside brackets - character class
+		2. Don't mess with anything inside braces - numeric repetition
+		3. Don't mess with any lookarounds.
 	*/
 	i := 0
 	for i < len(re_runes) {
@@ -95,6 +99,32 @@ func shuntingYard(re string) []postfixNode {
 			}
 			re_postfix = append(re_postfix, re_runes[i]) // Append closing brace
 		}
+		if i < len(re_runes) && re_runes[i] == '(' && (i == 0 || re_runes[i-1] != '\\') && (i < len(re_runes)-1 && re_runes[i+1] == '?') { // Unescaped open parentheses followed by question mark = lokaround. Don't mess with it.
+			i++ // Step inside
+			if i == len(re_runes)-1 || (re_runes[i+1] != '=' && re_runes[i+1] != '!' && re_runes[i+1] != '<') {
+				panic("Invalid regex. Lookaround intended?")
+			}
+			re_postfix = append(re_postfix, re_runes[i])
+			i++
+			numOpenParens := 1
+			for numOpenParens != 0 {
+				if i >= len(re_runes) {
+					panic("Unclosed lookaround.")
+				}
+				if re_runes[i] == '(' {
+					numOpenParens++
+				}
+				if re_runes[i] == ')' {
+					numOpenParens--
+					if numOpenParens == 0 {
+						break
+					}
+				}
+				re_postfix = append(re_postfix, re_runes[i])
+				i++
+			}
+			continue
+		}
 		if i < len(re_runes) && (re_runes[i] != '(' && re_runes[i] != '|' && re_runes[i] != '\\') || (i > 0 && re_runes[i-1] == '\\') { // Every character should be concatenated if it is escaped
 			if i < len(re_runes)-1 {
 				if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != '+' && re_runes[i+1] != '?' && re_runes[i+1] != ')' && re_runes[i+1] != '{' {
@@ -109,6 +139,7 @@ func shuntingYard(re string) []postfixNode {
 	outQueue := make([]postfixNode, 0) // Output queue

 	// Actual algorithm
+	numOpenParens := 0 // Number of open parentheses
 	for i := 0; i < len(re_postfix); i++ {
 		/* Two cases:
 		1. Current character is alphanumeric - send to output queue
@@ -147,7 +178,57 @@ func shuntingYard(re string) []postfixNode {
 		if c == '$' { // End-of-string assertion
 			outQueue = append(outQueue, newPostfixNode(c))
 		}
-
+		// Check if we're at the start of a lookaround
+		if c == '(' && i < len(re_postfix)-1 && re_postfix[i+1] == '?' {
+			i += 2      // Skip opening paren and question mark
+			regex := "" // Stores lookaround regex
+			numOpenParens := 1
+			for numOpenParens != 0 {
+				if i >= len(re_postfix) {
+					panic("Unclosed lookaround.")
+				}
+				if re_postfix[i] == '(' {
+					numOpenParens++
+				}
+				if re_postfix[i] == ')' {
+					numOpenParens--
+					if numOpenParens == 0 {
+						break
+					}
+				}
+				regex += string(re_postfix[i])
+				i++
+			}
+			if regex[len(regex)-1] == ')' { // The closing paren would have also been added. Let's remove that.
+				regex = regex[:len(regex)-1]
+			}
+			if len(regex) <= 1 { // Nothing in regex - panic
+				panic("Invalid lookaround. (too short?)")
+			}
+			// 'regex' should now contain the lookaround regex, plus the characters at the start (which indicate pos/neg, ahead/behind)
+			// Now we should filter that out.
+			toAppend := postfixNode{nodetype: ASSERTION, startReps: 1, endReps: 1}
+			if regex[0] == '<' { // Lookbehind
+				toAppend.lookaroundDir = LOOKBEHIND
+				regex = regex[1:]
+			} else if regex[0] == '=' || regex[0] == '!' {
+				toAppend.lookaroundDir = LOOKAHEAD
+			} else {
+				panic("Invalid lookaround.")
+			}
+			// Positive or negative
+			if regex[0] == '=' { // Positive
+				toAppend.lookaroundSign = POSITIVE
+				toAppend.contents = []rune(regex[1:])
+			} else if regex[0] == '!' { // Negative
+				toAppend.lookaroundSign = NEGATIVE
+				toAppend.contents = []rune(regex[1:])
+			} else {
+				panic("Invalid lookaround.")
+			}
+			outQueue = append(outQueue, toAppend)
+			continue
+		}
 		if isOperator(c) {
 			if len(opStack) == 0 {
 				opStack = append(opStack, c)
@@ -259,6 +340,7 @@ func shuntingYard(re string) []postfixNode {
 		}
 		if c == '(' {
 			opStack = append(opStack, c)
+			numOpenParens++
 		}
 		if c == ')' {
 			// Keep popping from opStack until we encounter an opening parenthesis. Panic if we reach the end of the stack.
@@ -270,6 +352,7 @@ func shuntingYard(re string) []postfixNode {
 				outQueue = append(outQueue, newPostfixNode(to_append))
 			}
 			_ = mustPop(&opStack) // Get rid of opening parantheses
+			numOpenParens--
 		}
 	}

@@ -279,6 +362,10 @@ func shuntingYard(re string) []postfixNode {
 		outQueue = append(outQueue, newPostfixNode(to_append))
 	}

+	if numOpenParens != 0 {
+		panic("ERROR: Imbalanced parantheses.")
+	}
+
 	return outQueue
 }

@@ -301,17 +388,38 @@ func thompson(re []postfixNode) *State {
 			state.output = append(state.output, &state)
 			state.isEmpty = false
 			if c.nodetype == ASSERTION {
+				state.isEmpty = true                 // This is a little weird. A lookaround has the 'isEmpty' flag set, even though it _isn't_ empty (the contents are the regex). But, there's so much error-checking that relies on this flag that it's better to keep it this way.
 				state.content = newContents(EPSILON) // Ideally, an assertion shouldn't have any content, since it doesn't say anything about the content of string
-				state.isEmpty = true
-				switch c.contents[0] {
-				case '^':
-					state.assert = SOS
-				case '$':
-					state.assert = EOS
-				case 'b':
-					state.assert = WBOUND
-				case 'B':
-					state.assert = NONWBOUND
+				if c.lookaroundDir == 0 || c.lookaroundSign == 0 {
+					switch c.contents[0] {
+					case '^':
+						state.assert = SOS
+					case '$':
+						state.assert = EOS
+					case 'b':
+						state.assert = WBOUND
+					case 'B':
+						state.assert = NONWBOUND
+					}
+				} else { // Lookaround
+					state.lookaroundRegex = string(c.contents)
+					if c.lookaroundDir == LOOKAHEAD {
+						if c.lookaroundSign == POSITIVE {
+							state.assert = PLA
+						}
+						if c.lookaroundSign == NEGATIVE {
+							state.assert = NLA
+						}
+					}
+					if c.lookaroundDir == LOOKBEHIND {
+						if c.lookaroundSign == POSITIVE {
+							state.assert = PLB
+						}
+						if c.lookaroundSign == NEGATIVE {
+							state.assert = NLB
+						}
+					}
+
 				}
 			}
 			nfa = append(nfa, &state)