@ -2,6 +2,7 @@ package main
import (
"fmt"
"math"
"slices"
"strconv"
"unicode"
@ -184,6 +185,40 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
re_postfix = append ( re_postfix , NONCAPLPAREN_CHAR )
i += 3
}
if i < len ( re_runes ) && re_runes [ i ] == '\\' { // Something is being escaped (I don't add the backslash to re_postfix, because it was already added earlier)
i ++
if i >= len ( re_runes ) {
return nil , fmt . Errorf ( "Stray backslash in expression." )
}
if re_runes [ i ] == 'x' {
re_postfix = append ( re_postfix , re_runes [ i ] )
i ++
if i >= len ( re_runes ) {
return nil , fmt . Errorf ( "Stray backslash in expression." )
}
if re_runes [ i ] == '{' {
re_postfix = append ( re_postfix , re_runes [ i : i + 8 ] ... )
i += 7
if i >= len ( re_runes ) {
return nil , fmt . Errorf ( "Stray backslash in expression." )
}
} else if isHex ( re_runes [ i ] ) {
re_postfix = append ( re_postfix , re_runes [ i : i + 2 ] ... )
i += 2
} else {
return nil , fmt . Errorf ( "Invalid hex value in expression." )
}
} else if isOctal ( re_runes [ i ] ) {
numDigits := 1
for i + numDigits < len ( re_runes ) && numDigits < 3 && isOctal ( re_runes [ i + numDigits ] ) { // Skip while we see an octal character (max of 3)
numDigits ++
}
re_postfix = append ( re_postfix , re_runes [ i : i + numDigits ] ... )
i += ( numDigits - 1 ) // I have to move back a step, so that I can add a concatenation operator if necessary, and so that the increment at the bottom of the loop works as intended
} else {
re_postfix = append ( re_postfix , re_runes [ i ] )
}
}
if i < len ( re_runes ) && re_runes [ i ] == '(' && ( i == 0 || re_runes [ i - 1 ] != '\\' ) && ( i < len ( re_runes ) - 2 && re_runes [ i + 1 ] == '?' && slices . Contains ( [ ] rune { '=' , '!' , '<' } , re_runes [ i + 2 ] ) ) { // Unescaped open parentheses followed by question mark then '<', '!' or '=' => lokaround. Don't mess with it.
i ++ // Step inside
if i == len ( re_runes ) - 1 || ( re_runes [ i + 1 ] != '=' && re_runes [ i + 1 ] != '!' && re_runes [ i + 1 ] != '<' ) {
@ -253,7 +288,45 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
return nil , fmt . Errorf ( "ERROR: Backslash with no escape character." )
}
i ++
outQueue = append ( outQueue , newEscapedNode ( re_postfix [ i ] ) )
if re_postfix [ i ] == 'x' { // Hex value
i ++
if re_postfix [ i ] == '{' && i < len ( re_postfix ) - 6 { // Expanded hex code
var hexVal int
n , err := fmt . Sscanf ( string ( re_postfix [ i : ] ) , "{%x}" , & hexVal )
if n < 1 || err != nil {
return nil , fmt . Errorf ( "Error parsing expanded hex code in expression." )
}
outQueue = append ( outQueue , newPostfixCharNode ( rune ( hexVal ) ) )
i += 7
} else if i < len ( re_postfix ) - 1 { // Two-digit hex code
hexVal , err := strconv . ParseInt ( string ( [ ] rune { re_postfix [ i ] , re_postfix [ i + 1 ] } ) , 16 , 64 ) // Convert the two hex values into a rune slice, then to a string. Parse the string into an int with strconv.ParseInt()
if err != nil {
return nil , fmt . Errorf ( "Error parsing hex characters in expression." )
}
i += 2
outQueue = append ( outQueue , newPostfixCharNode ( rune ( hexVal ) ) )
} else {
return nil , fmt . Errorf ( "Not enough hex characters found in expression." )
}
} else if isOctal ( re_postfix [ i ] ) { // Octal value
var octVal int
n , err := fmt . Sscanf ( string ( re_postfix [ i : ] ) , "%d" , & octVal )
if n < 1 || err != nil {
return nil , fmt . Errorf ( "Error parsing octal value in expression." )
}
if octVal > 777 {
return nil , fmt . Errorf ( "Invalid octal value in expression." )
}
i += int ( math . Ceil ( math . Log10 ( float64 ( octVal ) ) ) ) // Shift forward by the number of digits that were parsed
i -- // Move back one character, because the loop increment will move us back to the next character automatically
octValBase10 , err := strconv . ParseInt ( strconv . Itoa ( octVal ) , 8 , 0 )
if err != nil {
return nil , fmt . Errorf ( "Error parsing octal value in expression." )
}
outQueue = append ( outQueue , newPostfixCharNode ( rune ( octValBase10 ) ) )
} else {
outQueue = append ( outQueue , newEscapedNode ( re_postfix [ i ] ) )
}
continue // Escaped character will automatically be skipped when loop variable increments
}
@ -342,25 +415,60 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
invertMatch = true
i ++
}
chars := make ( [ ] rune , 0 ) // List of characters - used only for character classes
chars := make ( [ ] postfixNode , 0 ) // List of nodes - used only for character classes
for i < len ( re_postfix ) {
if re_postfix [ i ] == RBRACKET {
break
}
chars = append ( chars , re_postfix [ i ] )
i ++
if re_postfix [ i ] == '\\' { // Backslash indicates a character to be escaped
if i == len ( re_postfix ) - 1 {
return nil , fmt . Errorf ( "Stray backslash in character class." )
}
i ++ // Step past backslash
if re_postfix [ i ] == 'x' { // Hex value
i ++
if re_postfix [ i ] == '{' && i < len ( re_postfix ) - 7 { // Expanded hex code
var hexVal int
n , err := fmt . Sscanf ( string ( re_postfix [ i : ] ) , "{%x}" , & hexVal )
if n < 1 || err != nil {
return nil , fmt . Errorf ( "Error parsing expanded hex code in character class." )
}
chars = append ( chars , newPostfixCharNode ( rune ( hexVal ) ) )
i += 8
} else if i < len ( re_postfix ) - 2 { // Two-digit hex code
hexVal , err := strconv . ParseInt ( string ( [ ] rune { re_postfix [ i ] , re_postfix [ i + 1 ] } ) , 16 , 64 ) // Convert the two hex values into a rune slice, then to a string. Parse the string into an int with strconv.ParseInt()
if err != nil {
return nil , fmt . Errorf ( "Error parsing hex characters in character class." )
}
i += 2
chars = append ( chars , newPostfixCharNode ( rune ( hexVal ) ) )
} else {
return nil , fmt . Errorf ( "Not enough hex characters found in character class." )
}
} else if unicode . IsDigit ( re_postfix [ i ] ) { // Octal value
var octVal int
n , err := fmt . Sscanf ( string ( re_postfix [ i : ] ) , "%d" , & octVal )
if n < 1 || err != nil {
return nil , fmt . Errorf ( "Error parsing octal value in character class." )
}
if octVal > 0777 {
return nil , fmt . Errorf ( "Invalid octal value in character class." )
}
i += int ( math . Ceil ( math . Log10 ( float64 ( octVal ) ) / math . Log10 ( 8 ) ) ) // Shift forward by the number of digits that were parsed
chars = append ( chars , newPostfixCharNode ( rune ( octVal ) ) )
} else {
chars = append ( chars , newEscapedNode ( re_postfix [ i ] ) )
}
} else {
chars = append ( chars , newPostfixCharNode ( re_postfix [ i ] ) )
i ++
}
}
if i == len ( re_postfix ) { // We have reached the end of the string, so we didn't encounter a closing brakcet. Panic.
return nil , fmt . Errorf ( "Opening bracket without closing bracket." )
}
if ! invertMatch {
outQueue = append ( outQueue , newPostfixCharNode ( chars ... ) )
} else {
// Invert match - create an allChars postfixNode, then add the given states to its 'except' list.
toAdd := newPostfixDotNode ( )
toAdd . except = chars
outQueue = append ( outQueue , toAdd )
}
outQueue = append ( outQueue , newCharClassNode ( chars , invertMatch ) )
continue
}
if c == '{' {
@ -476,10 +584,29 @@ func thompson(re []postfixNode) (Reg, error) {
if c . allChars {
state . allChars = true
if len ( c . except ) != 0 {
state . except = append ( [ ] rune { } , c . except ... )
// For each node that I am 'excepting' (eg. in an inverted character class):
// - If the node itself has exceptions, then the exceptions cancel out.
// Eg. [^\w] == [\W]
// - Since an allChars node is the only kind that _can_ have exceptions, that's what I check for.
// - If the node doesn't have exceptions (allChars == false) then the contents of the node are added to the except list.
for _ , node := range c . except {
if node . allChars {
// For each postfixNode in node.except, extract the contents of the postfixNode. Concatenate them all,
// and add them to the state's _content_. As mentioned above, if the exception has exceptions, then we can match
// those.
nodeExceptChars := slices . Concat ( Map ( node . except , func ( node postfixNode ) [ ] rune {
return node . contents
} ) ... )
state . content = rune2Contents ( nodeExceptChars )
} else {
state . except = append ( state . except , node . contents ... )
}
}
}
}
state . content = rune2Contents ( c . contents )
// Convert the current contents to []int, convert the result of rune2Contents to []int, append them, then
// convert the combined slice back to stateContents.
state . content = stateContents ( append ( [ ] int ( state . content ) , [ ] int ( rune2Contents ( c . contents ) ) ... ) )
state . output = make ( [ ] * State , 0 )
state . output = append ( state . output , & state )
state . isEmpty = false
@ -561,6 +688,19 @@ func thompson(re []postfixNode) (Reg, error) {
}
}
if c . nodetype == CHARCLASS { // A Character class consists of all the nodes in it, alternated
// Map the list of nodes to a list of states, each state containing the contents of a specific node
states := Map ( c . nodeContents , func ( node postfixNode ) * State {
s := newState ( )
s . content = rune2Contents ( node . contents )
return & s
} )
// Reduce the list of states down to a single state by alternating them
toAdd := Reduce ( states , func ( s1 * State , s2 * State ) * State {
return alternate ( s1 , s2 )
} )
nfa = append ( nfa , toAdd )
}
// Must be an operator if it isn't a character
switch c . nodetype {
case CONCATENATE :
@ -613,7 +753,7 @@ func thompson(re []postfixNode) (Reg, error) {
stateToAdd = concatenate ( stateToAdd , s2 )
} else { // Case 2
for i := c . startReps ; i < c . endReps ; i ++ {
stateToAdd = concatenate ( stateToAdd , question ( state ) )
stateToAdd = concatenate ( stateToAdd , question ( cloneState ( state ) ) )
}
}
nfa = append ( nfa , stateToAdd )