@ -45,6 +45,10 @@ func shuntingYard(re string) []postfixNode {
a . This makes sense , because these operators can ' t be _concatenated_ with anything else .
2. The second character isn ' t a ' closing operator ' - one that applies to something before it
a . Again , these operators can ' t be concatenated _to_ . They can , however , be concatenated _from_ .
Caveats :
1. Don ' t mess with anything inside brackets - character class
2. Don ' t mess with anything inside braces - numeric repetition
3. Don ' t mess with any lookarounds .
* /
i := 0
for i < len ( re_runes ) {
@ -95,6 +99,32 @@ func shuntingYard(re string) []postfixNode {
}
re_postfix = append ( re_postfix , re_runes [ i ] ) // Append closing brace
}
if i < len ( re_runes ) && re_runes [ i ] == '(' && ( i == 0 || re_runes [ i - 1 ] != '\\' ) && ( i < len ( re_runes ) - 1 && re_runes [ i + 1 ] == '?' ) { // Unescaped open parenthesis followed by a question mark = lookaround. Don't mess with it.
i ++ // Step inside
if i == len ( re_runes ) - 1 || ( re_runes [ i + 1 ] != '=' && re_runes [ i + 1 ] != '!' && re_runes [ i + 1 ] != '<' ) {
panic ( "Invalid regex. Lookaround intended?" )
}
re_postfix = append ( re_postfix , re_runes [ i ] )
i ++
numOpenParens := 1
for numOpenParens != 0 {
if i >= len ( re_runes ) {
panic ( "Unclosed lookaround." )
}
if re_runes [ i ] == '(' {
numOpenParens ++
}
if re_runes [ i ] == ')' {
numOpenParens --
if numOpenParens == 0 {
break
}
}
re_postfix = append ( re_postfix , re_runes [ i ] )
i ++
}
continue
}
if i < len ( re_runes ) && ( re_runes [ i ] != '(' && re_runes [ i ] != '|' && re_runes [ i ] != '\\' ) || ( i > 0 && re_runes [ i - 1 ] == '\\' ) { // Every character should be concatenated if it is escaped
if i < len ( re_runes ) - 1 {
if re_runes [ i + 1 ] != '|' && re_runes [ i + 1 ] != '*' && re_runes [ i + 1 ] != '+' && re_runes [ i + 1 ] != '?' && re_runes [ i + 1 ] != ')' && re_runes [ i + 1 ] != '{' {
@ -109,6 +139,7 @@ func shuntingYard(re string) []postfixNode {
outQueue := make ( [ ] postfixNode , 0 ) // Output queue
// Actual algorithm
numOpenParens := 0 // Number of open parentheses
for i := 0 ; i < len ( re_postfix ) ; i ++ {
/ * Two cases :
1. Current character is alphanumeric - send to output queue
@ -147,7 +178,57 @@ func shuntingYard(re string) []postfixNode {
if c == '$' { // End-of-string assertion
outQueue = append ( outQueue , newPostfixNode ( c ) )
}
// Check if we're at the start of a lookaround
if c == '(' && i < len ( re_postfix ) - 1 && re_postfix [ i + 1 ] == '?' {
i += 2 // Skip opening paren and question mark
regex := "" // Stores lookaround regex
numOpenParens := 1
for numOpenParens != 0 {
if i >= len ( re_postfix ) {
panic ( "Unclosed lookaround." )
}
if re_postfix [ i ] == '(' {
numOpenParens ++
}
if re_postfix [ i ] == ')' {
numOpenParens --
if numOpenParens == 0 {
break
}
}
regex += string ( re_postfix [ i ] )
i ++
}
if regex [ len ( regex ) - 1 ] == ')' { // The closing paren would have also been added. Let's remove that.
regex = regex [ : len ( regex ) - 1 ]
}
if len ( regex ) <= 1 { // Nothing in regex - panic
panic ( "Invalid lookaround. (too short?)" )
}
// 'regex' should now contain the lookaround regex, plus the characters at the start (which indicate pos/neg, ahead/behind)
// Now we should filter that out.
toAppend := postfixNode { nodetype : ASSERTION , startReps : 1 , endReps : 1 }
if regex [ 0 ] == '<' { // Lookbehind
toAppend . lookaroundDir = LOOKBEHIND
regex = regex [ 1 : ]
} else if regex [ 0 ] == '=' || regex [ 0 ] == '!' {
toAppend . lookaroundDir = LOOKAHEAD
} else {
panic ( "Invalid lookaround." )
}
// Positive or negative
if regex [ 0 ] == '=' { // Positive
toAppend . lookaroundSign = POSITIVE
toAppend . contents = [ ] rune ( regex [ 1 : ] )
} else if regex [ 0 ] == '!' { // Negative
toAppend . lookaroundSign = NEGATIVE
toAppend . contents = [ ] rune ( regex [ 1 : ] )
} else {
panic ( "Invalid lookaround." )
}
outQueue = append ( outQueue , toAppend )
continue
}
if isOperator ( c ) {
if len ( opStack ) == 0 {
opStack = append ( opStack , c )
@ -259,6 +340,7 @@ func shuntingYard(re string) []postfixNode {
}
if c == '(' {
opStack = append ( opStack , c )
numOpenParens ++
}
if c == ')' {
// Keep popping from opStack until we encounter an opening parenthesis. Panic if we reach the end of the stack.
@ -270,6 +352,7 @@ func shuntingYard(re string) []postfixNode {
outQueue = append ( outQueue , newPostfixNode ( to_append ) )
}
_ = mustPop ( & opStack ) // Get rid of opening parantheses
numOpenParens --
}
}
@ -279,6 +362,10 @@ func shuntingYard(re string) []postfixNode {
outQueue = append ( outQueue , newPostfixNode ( to_append ) )
}
if numOpenParens != 0 {
panic ( "ERROR: Imbalanced parantheses." )
}
return outQueue
}
@ -301,8 +388,9 @@ func thompson(re []postfixNode) *State {
state . output = append ( state . output , & state )
state . isEmpty = false
if c . nodetype == ASSERTION {
state . isEmpty = true // This is a little weird. A lookaround has the 'isEmpty' flag set, even though it _isn't_ empty (the contents are the regex). But, there's so much error-checking that relies on this flag that it's better to keep it this way.
state . content = newContents ( EPSILON ) // Ideally, an assertion shouldn't have any content, since it doesn't say anything about the content of string
state . isEmpty = true
if c . lookaroundDir == 0 || c . lookaroundSign == 0 {
switch c . contents [ 0 ] {
case '^' :
state . assert = SOS
@ -313,6 +401,26 @@ func thompson(re []postfixNode) *State {
case 'B' :
state . assert = NONWBOUND
}
} else { // Lookaround
state . lookaroundRegex = string ( c . contents )
if c . lookaroundDir == LOOKAHEAD {
if c . lookaroundSign == POSITIVE {
state . assert = PLA
}
if c . lookaroundSign == NEGATIVE {
state . assert = NLA
}
}
if c . lookaroundDir == LOOKBEHIND {
if c . lookaroundSign == POSITIVE {
state . assert = PLB
}
if c . lookaroundSign == NEGATIVE {
state . assert = NLB
}
}
}
}
nfa = append ( nfa , & state )
}