Added support for lookarounds; parsing and adding nodes for different lookarounds

master
Aadhavan Srinivasan 1 month ago
parent c807d6664e
commit 11c0a0552f

@ -45,6 +45,10 @@ func shuntingYard(re string) []postfixNode {
a. This makes sense, because these operators can't be _concatenated_ with anything else. a. This makes sense, because these operators can't be _concatenated_ with anything else.
2. The second character isn't a 'closing operator' - one that applies to something before it 2. The second character isn't a 'closing operator' - one that applies to something before it
a. Again, these operators can't be concatenated _to_. They can, however, be concatenated _from_. a. Again, these operators can't be concatenated _to_. They can, however, be concatenated _from_.
Caveats:
1. Don't mess with anything inside brackets - character class
2. Don't mess with anything inside braces - numeric repetition
3. Don't mess with any lookarounds.
*/ */
i := 0 i := 0
for i < len(re_runes) { for i < len(re_runes) {
@ -95,6 +99,32 @@ func shuntingYard(re string) []postfixNode {
} }
re_postfix = append(re_postfix, re_runes[i]) // Append closing brace re_postfix = append(re_postfix, re_runes[i]) // Append closing brace
} }
if i < len(re_runes) && re_runes[i] == '(' && (i == 0 || re_runes[i-1] != '\\') && (i < len(re_runes)-1 && re_runes[i+1] == '?') { // Unescaped open parentheses followed by question mark = lokaround. Don't mess with it.
i++ // Step inside
if i == len(re_runes)-1 || (re_runes[i+1] != '=' && re_runes[i+1] != '!' && re_runes[i+1] != '<') {
panic("Invalid regex. Lookaround intended?")
}
re_postfix = append(re_postfix, re_runes[i])
i++
numOpenParens := 1
for numOpenParens != 0 {
if i >= len(re_runes) {
panic("Unclosed lookaround.")
}
if re_runes[i] == '(' {
numOpenParens++
}
if re_runes[i] == ')' {
numOpenParens--
if numOpenParens == 0 {
break
}
}
re_postfix = append(re_postfix, re_runes[i])
i++
}
continue
}
if i < len(re_runes) && (re_runes[i] != '(' && re_runes[i] != '|' && re_runes[i] != '\\') || (i > 0 && re_runes[i-1] == '\\') { // Every character should be concatenated if it is escaped if i < len(re_runes) && (re_runes[i] != '(' && re_runes[i] != '|' && re_runes[i] != '\\') || (i > 0 && re_runes[i-1] == '\\') { // Every character should be concatenated if it is escaped
if i < len(re_runes)-1 { if i < len(re_runes)-1 {
if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != '+' && re_runes[i+1] != '?' && re_runes[i+1] != ')' && re_runes[i+1] != '{' { if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != '+' && re_runes[i+1] != '?' && re_runes[i+1] != ')' && re_runes[i+1] != '{' {
@ -109,6 +139,7 @@ func shuntingYard(re string) []postfixNode {
outQueue := make([]postfixNode, 0) // Output queue outQueue := make([]postfixNode, 0) // Output queue
// Actual algorithm // Actual algorithm
numOpenParens := 0 // Number of open parentheses
for i := 0; i < len(re_postfix); i++ { for i := 0; i < len(re_postfix); i++ {
/* Two cases: /* Two cases:
1. Current character is alphanumeric - send to output queue 1. Current character is alphanumeric - send to output queue
@ -147,7 +178,57 @@ func shuntingYard(re string) []postfixNode {
if c == '$' { // End-of-string assertion if c == '$' { // End-of-string assertion
outQueue = append(outQueue, newPostfixNode(c)) outQueue = append(outQueue, newPostfixNode(c))
} }
// Check if we're at the start of a lookaround
if c == '(' && i < len(re_postfix)-1 && re_postfix[i+1] == '?' {
i += 2 // Skip opening paren and question mark
regex := "" // Stores lookaround regex
numOpenParens := 1
for numOpenParens != 0 {
if i >= len(re_postfix) {
panic("Unclosed lookaround.")
}
if re_postfix[i] == '(' {
numOpenParens++
}
if re_postfix[i] == ')' {
numOpenParens--
if numOpenParens == 0 {
break
}
}
regex += string(re_postfix[i])
i++
}
if regex[len(regex)-1] == ')' { // The closing paren would have also been added. Let's remove that.
regex = regex[:len(regex)-1]
}
if len(regex) <= 1 { // Nothing in regex - panic
panic("Invalid lookaround. (too short?)")
}
// 'regex' should now contain the lookaround regex, plus the characters at the start (which indicate pos/neg, ahead/behind)
// Now we should filter that out.
toAppend := postfixNode{nodetype: ASSERTION, startReps: 1, endReps: 1}
if regex[0] == '<' { // Lookbehind
toAppend.lookaroundDir = LOOKBEHIND
regex = regex[1:]
} else if regex[0] == '=' || regex[0] == '!' {
toAppend.lookaroundDir = LOOKAHEAD
} else {
panic("Invalid lookaround.")
}
// Positive or negative
if regex[0] == '=' { // Positive
toAppend.lookaroundSign = POSITIVE
toAppend.contents = []rune(regex[1:])
} else if regex[0] == '!' { // Negative
toAppend.lookaroundSign = NEGATIVE
toAppend.contents = []rune(regex[1:])
} else {
panic("Invalid lookaround.")
}
outQueue = append(outQueue, toAppend)
continue
}
if isOperator(c) { if isOperator(c) {
if len(opStack) == 0 { if len(opStack) == 0 {
opStack = append(opStack, c) opStack = append(opStack, c)
@ -259,6 +340,7 @@ func shuntingYard(re string) []postfixNode {
} }
if c == '(' { if c == '(' {
opStack = append(opStack, c) opStack = append(opStack, c)
numOpenParens++
} }
if c == ')' { if c == ')' {
// Keep popping from opStack until we encounter an opening parantheses. Panic if we reach the end of the stack. // Keep popping from opStack until we encounter an opening parantheses. Panic if we reach the end of the stack.
@ -270,6 +352,7 @@ func shuntingYard(re string) []postfixNode {
outQueue = append(outQueue, newPostfixNode(to_append)) outQueue = append(outQueue, newPostfixNode(to_append))
} }
_ = mustPop(&opStack) // Get rid of opening parantheses _ = mustPop(&opStack) // Get rid of opening parantheses
numOpenParens--
} }
} }
@ -279,6 +362,10 @@ func shuntingYard(re string) []postfixNode {
outQueue = append(outQueue, newPostfixNode(to_append)) outQueue = append(outQueue, newPostfixNode(to_append))
} }
if numOpenParens != 0 {
panic("ERROR: Imbalanced parantheses.")
}
return outQueue return outQueue
} }
@ -301,17 +388,38 @@ func thompson(re []postfixNode) *State {
state.output = append(state.output, &state) state.output = append(state.output, &state)
state.isEmpty = false state.isEmpty = false
if c.nodetype == ASSERTION { if c.nodetype == ASSERTION {
state.isEmpty = true // This is a little weird. A lookaround has the 'isEmpty' flag set, even though it _isn't_ empty (the contents are the regex). But, there's so much error-checking that relies on this flag that it's better to keep it this way.
state.content = newContents(EPSILON) // Ideally, an assertion shouldn't have any content, since it doesn't say anything about the content of string state.content = newContents(EPSILON) // Ideally, an assertion shouldn't have any content, since it doesn't say anything about the content of string
state.isEmpty = true if c.lookaroundDir == 0 || c.lookaroundSign == 0 {
switch c.contents[0] { switch c.contents[0] {
case '^': case '^':
state.assert = SOS state.assert = SOS
case '$': case '$':
state.assert = EOS state.assert = EOS
case 'b': case 'b':
state.assert = WBOUND state.assert = WBOUND
case 'B': case 'B':
state.assert = NONWBOUND state.assert = NONWBOUND
}
} else { // Lookaround
state.lookaroundRegex = string(c.contents)
if c.lookaroundDir == LOOKAHEAD {
if c.lookaroundSign == POSITIVE {
state.assert = PLA
}
if c.lookaroundSign == NEGATIVE {
state.assert = NLA
}
}
if c.lookaroundDir == LOOKBEHIND {
if c.lookaroundSign == POSITIVE {
state.assert = PLB
}
if c.lookaroundSign == NEGATIVE {
state.assert = NLB
}
}
} }
} }
nfa = append(nfa, &state) nfa = append(nfa, &state)

Loading…
Cancel
Save