From 11c0a0552fbdcbad58508cedcd5d10ef45da5792 Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Fri, 22 Nov 2024 00:10:15 -0500 Subject: [PATCH] Added support for lookarounds; parsing and adding nodes for different lookarounds --- main.go | 130 +++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 119 insertions(+), 11 deletions(-) diff --git a/main.go b/main.go index f4aad68..266392b 100644 --- a/main.go +++ b/main.go @@ -45,6 +45,10 @@ func shuntingYard(re string) []postfixNode { a. This makes sense, because these operators can't be _concatenated_ with anything else. 2. The second character isn't a 'closing operator' - one that applies to something before it a. Again, these operators can'be concatenated _to_. They can, however, be concatenated _from_. + Caveats: + 1. Don't mess with anything inside brackets - character class + 2. Don't mess with anything inside braces - numeric repetition + 3. Don't mess with any lookarounds. */ i := 0 for i < len(re_runes) { @@ -95,6 +99,32 @@ func shuntingYard(re string) []postfixNode { } re_postfix = append(re_postfix, re_runes[i]) // Append closing brace } + if i < len(re_runes) && re_runes[i] == '(' && (i == 0 || re_runes[i-1] != '\\') && (i < len(re_runes)-1 && re_runes[i+1] == '?') { // Unescaped open parentheses followed by question mark = lookaround. Don't mess with it. + i++ // Step inside + if i == len(re_runes)-1 || (re_runes[i+1] != '=' && re_runes[i+1] != '!' && re_runes[i+1] != '<') { + panic("Invalid regex. 
Lookaround intended?") + } + re_postfix = append(re_postfix, re_runes[i]) + i++ + numOpenParens := 1 + for numOpenParens != 0 { + if i >= len(re_runes) { + panic("Unclosed lookaround.") + } + if re_runes[i] == '(' { + numOpenParens++ + } + if re_runes[i] == ')' { + numOpenParens-- + if numOpenParens == 0 { + break + } + } + re_postfix = append(re_postfix, re_runes[i]) + i++ + } + continue + } if i < len(re_runes) && (re_runes[i] != '(' && re_runes[i] != '|' && re_runes[i] != '\\') || (i > 0 && re_runes[i-1] == '\\') { // Every character should be concatenated if it is escaped if i < len(re_runes)-1 { if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != '+' && re_runes[i+1] != '?' && re_runes[i+1] != ')' && re_runes[i+1] != '{' { @@ -109,6 +139,7 @@ func shuntingYard(re string) []postfixNode { outQueue := make([]postfixNode, 0) // Output queue // Actual algorithm + numOpenParens := 0 // Number of open parentheses for i := 0; i < len(re_postfix); i++ { /* Two cases: 1. Current character is alphanumeric - send to output queue @@ -147,7 +178,57 @@ func shuntingYard(re string) []postfixNode { if c == '$' { // End-of-string assertion outQueue = append(outQueue, newPostfixNode(c)) } - + // Check if we're at the start of a lookaround + if c == '(' && i < len(re_postfix)-1 && re_postfix[i+1] == '?' { + i += 2 // Skip opening paren and question mark + regex := "" // Stores lookaround regex + numOpenParens := 1 + for numOpenParens != 0 { + if i >= len(re_postfix) { + panic("Unclosed lookaround.") + } + if re_postfix[i] == '(' { + numOpenParens++ + } + if re_postfix[i] == ')' { + numOpenParens-- + if numOpenParens == 0 { + break + } + } + regex += string(re_postfix[i]) + i++ + } + if regex[len(regex)-1] == ')' { // The closing paren would have also been added. Let's remove that. + regex = regex[:len(regex)-1] + } + if len(regex) <= 1 { // Nothing in regex - panic + panic("Invalid lookaround. 
(too short?)") + } + // 'regex' should now contain the lookaround regex, plus the characters at the start (which indicate pos/neg, ahead/behind) + // Now we should filter that out. + toAppend := postfixNode{nodetype: ASSERTION, startReps: 1, endReps: 1} + if regex[0] == '<' { // Lookbehind + toAppend.lookaroundDir = LOOKBEHIND + regex = regex[1:] + } else if regex[0] == '=' || regex[0] == '!' { + toAppend.lookaroundDir = LOOKAHEAD + } else { + panic("Invalid lookaround.") + } + // Positive or negative + if regex[0] == '=' { // Positive + toAppend.lookaroundSign = POSITIVE + toAppend.contents = []rune(regex[1:]) + } else if regex[0] == '!' { // Negative + toAppend.lookaroundSign = NEGATIVE + toAppend.contents = []rune(regex[1:]) + } else { + panic("Invalid lookaround.") + } + outQueue = append(outQueue, toAppend) + continue + } if isOperator(c) { if len(opStack) == 0 { opStack = append(opStack, c) @@ -259,6 +340,7 @@ func shuntingYard(re string) []postfixNode { } if c == '(' { opStack = append(opStack, c) + numOpenParens++ } if c == ')' { // Keep popping from opStack until we encounter an opening parantheses. Panic if we reach the end of the stack. @@ -270,6 +352,7 @@ func shuntingYard(re string) []postfixNode { outQueue = append(outQueue, newPostfixNode(to_append)) } _ = mustPop(&opStack) // Get rid of opening parantheses + numOpenParens-- } } @@ -279,6 +362,10 @@ func shuntingYard(re string) []postfixNode { outQueue = append(outQueue, newPostfixNode(to_append)) } + if numOpenParens != 0 { + panic("ERROR: Imbalanced parantheses.") + } + return outQueue } @@ -301,17 +388,38 @@ func thompson(re []postfixNode) *State { state.output = append(state.output, &state) state.isEmpty = false if c.nodetype == ASSERTION { + state.isEmpty = true // This is a little weird. A lookaround has the 'isEmpty' flag set, even though it _isn't_ empty (the contents are the regex). But, there's so much error-checking that relies on this flag that it's better to keep it this way. 
state.content = newContents(EPSILON) // Ideally, an assertion shouldn't have any content, since it doesn't say anything about the content of string - state.isEmpty = true - switch c.contents[0] { - case '^': - state.assert = SOS - case '$': - state.assert = EOS - case 'b': - state.assert = WBOUND - case 'B': - state.assert = NONWBOUND + if c.lookaroundDir == 0 || c.lookaroundSign == 0 { + switch c.contents[0] { + case '^': + state.assert = SOS + case '$': + state.assert = EOS + case 'b': + state.assert = WBOUND + case 'B': + state.assert = NONWBOUND + } + } else { // Lookaround + state.lookaroundRegex = string(c.contents) + if c.lookaroundDir == LOOKAHEAD { + if c.lookaroundSign == POSITIVE { + state.assert = PLA + } + if c.lookaroundSign == NEGATIVE { + state.assert = NLA + } + } + if c.lookaroundDir == LOOKBEHIND { + if c.lookaroundSign == POSITIVE { + state.assert = PLB + } + if c.lookaroundSign == NEGATIVE { + state.assert = NLB + } + } + } } nfa = append(nfa, &state)