Added support for lookarounds; parsing and adding nodes for different lookarounds
This commit is contained in:
130
main.go
130
main.go
@@ -45,6 +45,10 @@ func shuntingYard(re string) []postfixNode {
|
||||
a. This makes sense, because these operators can't be _concatenated_ with anything else.
|
||||
2. The second character isn't a 'closing operator' - one that applies to something before it
|
||||
a. Again, these operators can't be concatenated _to_. They can, however, be concatenated _from_.
|
||||
Caveats:
|
||||
1. Don't mess with anything inside brackets - character class
|
||||
2. Don't mess with anything inside braces - numeric repetition
|
||||
3. Don't mess with any lookarounds.
|
||||
*/
|
||||
i := 0
|
||||
for i < len(re_runes) {
|
||||
@@ -95,6 +99,32 @@ func shuntingYard(re string) []postfixNode {
|
||||
}
|
||||
re_postfix = append(re_postfix, re_runes[i]) // Append closing brace
|
||||
}
|
||||
if i < len(re_runes) && re_runes[i] == '(' && (i == 0 || re_runes[i-1] != '\\') && (i < len(re_runes)-1 && re_runes[i+1] == '?') { // Unescaped open parentheses followed by question mark = lokaround. Don't mess with it.
|
||||
i++ // Step inside
|
||||
if i == len(re_runes)-1 || (re_runes[i+1] != '=' && re_runes[i+1] != '!' && re_runes[i+1] != '<') {
|
||||
panic("Invalid regex. Lookaround intended?")
|
||||
}
|
||||
re_postfix = append(re_postfix, re_runes[i])
|
||||
i++
|
||||
numOpenParens := 1
|
||||
for numOpenParens != 0 {
|
||||
if i >= len(re_runes) {
|
||||
panic("Unclosed lookaround.")
|
||||
}
|
||||
if re_runes[i] == '(' {
|
||||
numOpenParens++
|
||||
}
|
||||
if re_runes[i] == ')' {
|
||||
numOpenParens--
|
||||
if numOpenParens == 0 {
|
||||
break
|
||||
}
|
||||
}
|
||||
re_postfix = append(re_postfix, re_runes[i])
|
||||
i++
|
||||
}
|
||||
continue
|
||||
}
|
||||
if i < len(re_runes) && (re_runes[i] != '(' && re_runes[i] != '|' && re_runes[i] != '\\') || (i > 0 && re_runes[i-1] == '\\') { // Every character should be concatenated if it is escaped
|
||||
if i < len(re_runes)-1 {
|
||||
if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != '+' && re_runes[i+1] != '?' && re_runes[i+1] != ')' && re_runes[i+1] != '{' {
|
||||
@@ -109,6 +139,7 @@ func shuntingYard(re string) []postfixNode {
|
||||
outQueue := make([]postfixNode, 0) // Output queue
|
||||
|
||||
// Actual algorithm
|
||||
numOpenParens := 0 // Number of open parentheses
|
||||
for i := 0; i < len(re_postfix); i++ {
|
||||
/* Two cases:
|
||||
1. Current character is alphanumeric - send to output queue
|
||||
@@ -147,7 +178,57 @@ func shuntingYard(re string) []postfixNode {
|
||||
if c == '$' { // End-of-string assertion
|
||||
outQueue = append(outQueue, newPostfixNode(c))
|
||||
}
|
||||
|
||||
// Check if we're at the start of a lookaround
|
||||
if c == '(' && i < len(re_postfix)-1 && re_postfix[i+1] == '?' {
|
||||
i += 2 // Skip opening paren and question mark
|
||||
regex := "" // Stores lookaround regex
|
||||
numOpenParens := 1
|
||||
for numOpenParens != 0 {
|
||||
if i >= len(re_postfix) {
|
||||
panic("Unclosed lookaround.")
|
||||
}
|
||||
if re_postfix[i] == '(' {
|
||||
numOpenParens++
|
||||
}
|
||||
if re_postfix[i] == ')' {
|
||||
numOpenParens--
|
||||
if numOpenParens == 0 {
|
||||
break
|
||||
}
|
||||
}
|
||||
regex += string(re_postfix[i])
|
||||
i++
|
||||
}
|
||||
if regex[len(regex)-1] == ')' { // The closing paren would have also been added. Let's remove that.
|
||||
regex = regex[:len(regex)-1]
|
||||
}
|
||||
if len(regex) <= 1 { // Nothing in regex - panic
|
||||
panic("Invalid lookaround. (too short?)")
|
||||
}
|
||||
// 'regex' should now contain the lookaround regex, plus the characters at the start (which indicate pos/neg, ahead/behind)
|
||||
// Now we should filter that out.
|
||||
toAppend := postfixNode{nodetype: ASSERTION, startReps: 1, endReps: 1}
|
||||
if regex[0] == '<' { // Lookbehind
|
||||
toAppend.lookaroundDir = LOOKBEHIND
|
||||
regex = regex[1:]
|
||||
} else if regex[0] == '=' || regex[0] == '!' {
|
||||
toAppend.lookaroundDir = LOOKAHEAD
|
||||
} else {
|
||||
panic("Invalid lookaround.")
|
||||
}
|
||||
// Positive or negative
|
||||
if regex[0] == '=' { // Positive
|
||||
toAppend.lookaroundSign = POSITIVE
|
||||
toAppend.contents = []rune(regex[1:])
|
||||
} else if regex[0] == '!' { // Negative
|
||||
toAppend.lookaroundSign = NEGATIVE
|
||||
toAppend.contents = []rune(regex[1:])
|
||||
} else {
|
||||
panic("Invalid lookaround.")
|
||||
}
|
||||
outQueue = append(outQueue, toAppend)
|
||||
continue
|
||||
}
|
||||
if isOperator(c) {
|
||||
if len(opStack) == 0 {
|
||||
opStack = append(opStack, c)
|
||||
@@ -259,6 +340,7 @@ func shuntingYard(re string) []postfixNode {
|
||||
}
|
||||
if c == '(' {
|
||||
opStack = append(opStack, c)
|
||||
numOpenParens++
|
||||
}
|
||||
if c == ')' {
|
||||
// Keep popping from opStack until we encounter an opening parenthesis. Panic if we reach the end of the stack.
|
||||
@@ -270,6 +352,7 @@ func shuntingYard(re string) []postfixNode {
|
||||
outQueue = append(outQueue, newPostfixNode(to_append))
|
||||
}
|
||||
_ = mustPop(&opStack) // Get rid of opening parantheses
|
||||
numOpenParens--
|
||||
}
|
||||
}
|
||||
|
||||
@@ -279,6 +362,10 @@ func shuntingYard(re string) []postfixNode {
|
||||
outQueue = append(outQueue, newPostfixNode(to_append))
|
||||
}
|
||||
|
||||
if numOpenParens != 0 {
|
||||
panic("ERROR: Imbalanced parantheses.")
|
||||
}
|
||||
|
||||
return outQueue
|
||||
}
|
||||
|
||||
@@ -301,17 +388,38 @@ func thompson(re []postfixNode) *State {
|
||||
state.output = append(state.output, &state)
|
||||
state.isEmpty = false
|
||||
if c.nodetype == ASSERTION {
|
||||
state.isEmpty = true // This is a little weird. A lookaround has the 'isEmpty' flag set, even though it _isn't_ empty (the contents are the regex). But, there's so much error-checking that relies on this flag that it's better to keep it this way.
|
||||
state.content = newContents(EPSILON) // Ideally, an assertion shouldn't have any content, since it doesn't say anything about the content of string
|
||||
state.isEmpty = true
|
||||
switch c.contents[0] {
|
||||
case '^':
|
||||
state.assert = SOS
|
||||
case '$':
|
||||
state.assert = EOS
|
||||
case 'b':
|
||||
state.assert = WBOUND
|
||||
case 'B':
|
||||
state.assert = NONWBOUND
|
||||
if c.lookaroundDir == 0 || c.lookaroundSign == 0 {
|
||||
switch c.contents[0] {
|
||||
case '^':
|
||||
state.assert = SOS
|
||||
case '$':
|
||||
state.assert = EOS
|
||||
case 'b':
|
||||
state.assert = WBOUND
|
||||
case 'B':
|
||||
state.assert = NONWBOUND
|
||||
}
|
||||
} else { // Lookaround
|
||||
state.lookaroundRegex = string(c.contents)
|
||||
if c.lookaroundDir == LOOKAHEAD {
|
||||
if c.lookaroundSign == POSITIVE {
|
||||
state.assert = PLA
|
||||
}
|
||||
if c.lookaroundSign == NEGATIVE {
|
||||
state.assert = NLA
|
||||
}
|
||||
}
|
||||
if c.lookaroundDir == LOOKBEHIND {
|
||||
if c.lookaroundSign == POSITIVE {
|
||||
state.assert = PLB
|
||||
}
|
||||
if c.lookaroundSign == NEGATIVE {
|
||||
state.assert = NLB
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
nfa = append(nfa, &state)
|
||||
|
Reference in New Issue
Block a user