New features, changed character class behavior
I added support for hex values (e.g. \x0F), octal values (e.g. \012) and extended hex values (e.g. \x{000F2A}). I also expanded the abilities of character classes to include things like escaped characters (e.g. [aefp\)]) and character ranges _inside_ inverted character classes (e.g. [^\w], which is functionally equivalent to [\W]).
This commit is contained in:
166
compile.go
166
compile.go
@@ -2,6 +2,7 @@ package main
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"math"
|
||||||
"slices"
|
"slices"
|
||||||
"strconv"
|
"strconv"
|
||||||
"unicode"
|
"unicode"
|
||||||
@@ -184,6 +185,40 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
|||||||
re_postfix = append(re_postfix, NONCAPLPAREN_CHAR)
|
re_postfix = append(re_postfix, NONCAPLPAREN_CHAR)
|
||||||
i += 3
|
i += 3
|
||||||
}
|
}
|
||||||
|
if i < len(re_runes) && re_runes[i] == '\\' { // Something is being escaped (I don't add the backslash to re_postfix, because it was already added earlier)
|
||||||
|
i++
|
||||||
|
if i >= len(re_runes) {
|
||||||
|
return nil, fmt.Errorf("Stray backslash in expression.")
|
||||||
|
}
|
||||||
|
if re_runes[i] == 'x' {
|
||||||
|
re_postfix = append(re_postfix, re_runes[i])
|
||||||
|
i++
|
||||||
|
if i >= len(re_runes) {
|
||||||
|
return nil, fmt.Errorf("Stray backslash in expression.")
|
||||||
|
}
|
||||||
|
if re_runes[i] == '{' {
|
||||||
|
re_postfix = append(re_postfix, re_runes[i:i+8]...)
|
||||||
|
i += 7
|
||||||
|
if i >= len(re_runes) {
|
||||||
|
return nil, fmt.Errorf("Stray backslash in expression.")
|
||||||
|
}
|
||||||
|
} else if isHex(re_runes[i]) {
|
||||||
|
re_postfix = append(re_postfix, re_runes[i:i+2]...)
|
||||||
|
i += 2
|
||||||
|
} else {
|
||||||
|
return nil, fmt.Errorf("Invalid hex value in expression.")
|
||||||
|
}
|
||||||
|
} else if isOctal(re_runes[i]) {
|
||||||
|
numDigits := 1
|
||||||
|
for i+numDigits < len(re_runes) && numDigits < 3 && isOctal(re_runes[i+numDigits]) { // Skip while we see an octal character (max of 3)
|
||||||
|
numDigits++
|
||||||
|
}
|
||||||
|
re_postfix = append(re_postfix, re_runes[i:i+numDigits]...)
|
||||||
|
i += (numDigits - 1) // I have to move back a step, so that I can add a concatenation operator if necessary, and so that the increment at the bottom of the loop works as intended
|
||||||
|
} else {
|
||||||
|
re_postfix = append(re_postfix, re_runes[i])
|
||||||
|
}
|
||||||
|
}
|
||||||
if i < len(re_runes) && re_runes[i] == '(' && (i == 0 || re_runes[i-1] != '\\') && (i < len(re_runes)-2 && re_runes[i+1] == '?' && slices.Contains([]rune{'=', '!', '<'}, re_runes[i+2])) { // Unescaped open parentheses followed by question mark then '<', '!' or '=' => lokaround. Don't mess with it.
|
if i < len(re_runes) && re_runes[i] == '(' && (i == 0 || re_runes[i-1] != '\\') && (i < len(re_runes)-2 && re_runes[i+1] == '?' && slices.Contains([]rune{'=', '!', '<'}, re_runes[i+2])) { // Unescaped open parentheses followed by question mark then '<', '!' or '=' => lokaround. Don't mess with it.
|
||||||
i++ // Step inside
|
i++ // Step inside
|
||||||
if i == len(re_runes)-1 || (re_runes[i+1] != '=' && re_runes[i+1] != '!' && re_runes[i+1] != '<') {
|
if i == len(re_runes)-1 || (re_runes[i+1] != '=' && re_runes[i+1] != '!' && re_runes[i+1] != '<') {
|
||||||
@@ -253,7 +288,45 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
|||||||
return nil, fmt.Errorf("ERROR: Backslash with no escape character.")
|
return nil, fmt.Errorf("ERROR: Backslash with no escape character.")
|
||||||
}
|
}
|
||||||
i++
|
i++
|
||||||
|
if re_postfix[i] == 'x' { // Hex value
|
||||||
|
i++
|
||||||
|
if re_postfix[i] == '{' && i < len(re_postfix)-6 { // Expanded hex code
|
||||||
|
var hexVal int
|
||||||
|
n, err := fmt.Sscanf(string(re_postfix[i:]), "{%x}", &hexVal)
|
||||||
|
if n < 1 || err != nil {
|
||||||
|
return nil, fmt.Errorf("Error parsing expanded hex code in expression.")
|
||||||
|
}
|
||||||
|
outQueue = append(outQueue, newPostfixCharNode(rune(hexVal)))
|
||||||
|
i += 7
|
||||||
|
} else if i < len(re_postfix)-1 { // Two-digit hex code
|
||||||
|
hexVal, err := strconv.ParseInt(string([]rune{re_postfix[i], re_postfix[i+1]}), 16, 64) // Convert the two hex values into a rune slice, then to a string. Parse the string into an int with strconv.ParseInt()
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("Error parsing hex characters in expression.")
|
||||||
|
}
|
||||||
|
i += 2
|
||||||
|
outQueue = append(outQueue, newPostfixCharNode(rune(hexVal)))
|
||||||
|
} else {
|
||||||
|
return nil, fmt.Errorf("Not enough hex characters found in expression.")
|
||||||
|
}
|
||||||
|
} else if isOctal(re_postfix[i]) { // Octal value
|
||||||
|
var octVal int
|
||||||
|
n, err := fmt.Sscanf(string(re_postfix[i:]), "%d", &octVal)
|
||||||
|
if n < 1 || err != nil {
|
||||||
|
return nil, fmt.Errorf("Error parsing octal value in expression.")
|
||||||
|
}
|
||||||
|
if octVal > 777 {
|
||||||
|
return nil, fmt.Errorf("Invalid octal value in expression.")
|
||||||
|
}
|
||||||
|
i += int(math.Ceil(math.Log10(float64(octVal)))) // Shift forward by the number of digits that were parsed
|
||||||
|
i-- // Move back one character, because the loop increment will move us back to the next character automatically
|
||||||
|
octValBase10, err := strconv.ParseInt(strconv.Itoa(octVal), 8, 0)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("Error parsing octal value in expression.")
|
||||||
|
}
|
||||||
|
outQueue = append(outQueue, newPostfixCharNode(rune(octValBase10)))
|
||||||
|
} else {
|
||||||
outQueue = append(outQueue, newEscapedNode(re_postfix[i]))
|
outQueue = append(outQueue, newEscapedNode(re_postfix[i]))
|
||||||
|
}
|
||||||
continue // Escaped character will automatically be skipped when loop variable increments
|
continue // Escaped character will automatically be skipped when loop variable increments
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -342,25 +415,60 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
|||||||
invertMatch = true
|
invertMatch = true
|
||||||
i++
|
i++
|
||||||
}
|
}
|
||||||
chars := make([]rune, 0) // List of characters - used only for character classes
|
chars := make([]postfixNode, 0) // List of nodes - used only for character classes
|
||||||
for i < len(re_postfix) {
|
for i < len(re_postfix) {
|
||||||
if re_postfix[i] == RBRACKET {
|
if re_postfix[i] == RBRACKET {
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
chars = append(chars, re_postfix[i])
|
if re_postfix[i] == '\\' { // Backslash indicates a character to be escaped
|
||||||
|
if i == len(re_postfix)-1 {
|
||||||
|
return nil, fmt.Errorf("Stray backslash in character class.")
|
||||||
|
}
|
||||||
|
i++ // Step past backslash
|
||||||
|
|
||||||
|
if re_postfix[i] == 'x' { // Hex value
|
||||||
i++
|
i++
|
||||||
|
if re_postfix[i] == '{' && i < len(re_postfix)-7 { // Expanded hex code
|
||||||
|
var hexVal int
|
||||||
|
n, err := fmt.Sscanf(string(re_postfix[i:]), "{%x}", &hexVal)
|
||||||
|
if n < 1 || err != nil {
|
||||||
|
return nil, fmt.Errorf("Error parsing expanded hex code in character class.")
|
||||||
|
}
|
||||||
|
chars = append(chars, newPostfixCharNode(rune(hexVal)))
|
||||||
|
i += 8
|
||||||
|
} else if i < len(re_postfix)-2 { // Two-digit hex code
|
||||||
|
hexVal, err := strconv.ParseInt(string([]rune{re_postfix[i], re_postfix[i+1]}), 16, 64) // Convert the two hex values into a rune slice, then to a string. Parse the string into an int with strconv.ParseInt()
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("Error parsing hex characters in character class.")
|
||||||
|
}
|
||||||
|
i += 2
|
||||||
|
chars = append(chars, newPostfixCharNode(rune(hexVal)))
|
||||||
|
} else {
|
||||||
|
return nil, fmt.Errorf("Not enough hex characters found in character class.")
|
||||||
|
}
|
||||||
|
} else if unicode.IsDigit(re_postfix[i]) { // Octal value
|
||||||
|
var octVal int
|
||||||
|
n, err := fmt.Sscanf(string(re_postfix[i:]), "%d", &octVal)
|
||||||
|
if n < 1 || err != nil {
|
||||||
|
return nil, fmt.Errorf("Error parsing octal value in character class.")
|
||||||
|
}
|
||||||
|
if octVal > 0777 {
|
||||||
|
return nil, fmt.Errorf("Invalid octal value in character class.")
|
||||||
|
}
|
||||||
|
i += int(math.Ceil(math.Log10(float64(octVal)) / math.Log10(8))) // Shift forward by the number of digits that were parsed
|
||||||
|
chars = append(chars, newPostfixCharNode(rune(octVal)))
|
||||||
|
} else {
|
||||||
|
chars = append(chars, newEscapedNode(re_postfix[i]))
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
chars = append(chars, newPostfixCharNode(re_postfix[i]))
|
||||||
|
i++
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if i == len(re_postfix) { // We have reached the end of the string, so we didn't encounter a closing brakcet. Panic.
|
if i == len(re_postfix) { // We have reached the end of the string, so we didn't encounter a closing brakcet. Panic.
|
||||||
return nil, fmt.Errorf("Opening bracket without closing bracket.")
|
return nil, fmt.Errorf("Opening bracket without closing bracket.")
|
||||||
}
|
}
|
||||||
if !invertMatch {
|
outQueue = append(outQueue, newCharClassNode(chars, invertMatch))
|
||||||
outQueue = append(outQueue, newPostfixCharNode(chars...))
|
|
||||||
} else {
|
|
||||||
// Invert match - create an allChars postfixNode, then add the given states to its 'except' list.
|
|
||||||
toAdd := newPostfixDotNode()
|
|
||||||
toAdd.except = chars
|
|
||||||
outQueue = append(outQueue, toAdd)
|
|
||||||
}
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if c == '{' {
|
if c == '{' {
|
||||||
@@ -476,10 +584,29 @@ func thompson(re []postfixNode) (Reg, error) {
|
|||||||
if c.allChars {
|
if c.allChars {
|
||||||
state.allChars = true
|
state.allChars = true
|
||||||
if len(c.except) != 0 {
|
if len(c.except) != 0 {
|
||||||
state.except = append([]rune{}, c.except...)
|
// For each node that I am 'excepting' (eg. in an inverted character class):
|
||||||
|
// - If the node itself has exceptions, then the exceptions cancel out.
|
||||||
|
// Eg. [^\w] == [\W]
|
||||||
|
// - Since an allChars node is the only kind that _can_ have exceptions, that's what I check for.
|
||||||
|
// - If the node doesn't have exceptions (allChars == false) then the contents of the node are added to the except list.
|
||||||
|
for _, node := range c.except {
|
||||||
|
if node.allChars {
|
||||||
|
// For each postfixNode in node.except, extract the contents of the postfixNode. Concatenate them all,
|
||||||
|
// and add them to the state's _content_. As mentioned above, if the exception has exceptions, then we can match
|
||||||
|
// those.
|
||||||
|
nodeExceptChars := slices.Concat(Map(node.except, func(node postfixNode) []rune {
|
||||||
|
return node.contents
|
||||||
|
})...)
|
||||||
|
state.content = rune2Contents(nodeExceptChars)
|
||||||
|
} else {
|
||||||
|
state.except = append(state.except, node.contents...)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
state.content = rune2Contents(c.contents)
|
}
|
||||||
|
}
|
||||||
|
// Convert the current contents to []int, convert the result of rune2contents to []int, append then
|
||||||
|
// convert back to stateContents.
|
||||||
|
state.content = stateContents(append([]int(state.content), []int(rune2Contents(c.contents))...))
|
||||||
state.output = make([]*State, 0)
|
state.output = make([]*State, 0)
|
||||||
state.output = append(state.output, &state)
|
state.output = append(state.output, &state)
|
||||||
state.isEmpty = false
|
state.isEmpty = false
|
||||||
@@ -561,6 +688,19 @@ func thompson(re []postfixNode) (Reg, error) {
|
|||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if c.nodetype == CHARCLASS { // A Character class consists of all the nodes in it, alternated
|
||||||
|
// Map the list of nodes to a list of states, each state containing the contents of a specific node
|
||||||
|
states := Map(c.nodeContents, func(node postfixNode) *State {
|
||||||
|
s := newState()
|
||||||
|
s.content = rune2Contents(node.contents)
|
||||||
|
return &s
|
||||||
|
})
|
||||||
|
// Reduce the list of states down to a single state by alternating them
|
||||||
|
toAdd := Reduce(states, func(s1 *State, s2 *State) *State {
|
||||||
|
return alternate(s1, s2)
|
||||||
|
})
|
||||||
|
nfa = append(nfa, toAdd)
|
||||||
|
}
|
||||||
// Must be an operator if it isn't a character
|
// Must be an operator if it isn't a character
|
||||||
switch c.nodetype {
|
switch c.nodetype {
|
||||||
case CONCATENATE:
|
case CONCATENATE:
|
||||||
@@ -613,7 +753,7 @@ func thompson(re []postfixNode) (Reg, error) {
|
|||||||
stateToAdd = concatenate(stateToAdd, s2)
|
stateToAdd = concatenate(stateToAdd, s2)
|
||||||
} else { // Case 2
|
} else { // Case 2
|
||||||
for i := c.startReps; i < c.endReps; i++ {
|
for i := c.startReps; i < c.endReps; i++ {
|
||||||
stateToAdd = concatenate(stateToAdd, question(state))
|
stateToAdd = concatenate(stateToAdd, question(cloneState(state)))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
nfa = append(nfa, stateToAdd)
|
nfa = append(nfa, stateToAdd)
|
||||||
|
Reference in New Issue
Block a user