7 Commits

Author SHA1 Message Date
d210a85253 Updated handling of '\b' when inside character class, made invalid
escapes an error.

The '\b' value refers to a word boundary normally, but refers to the
backspace ASCII value inside a character class. I updated
newEscapedNode() to deal with this. I also changed the behavior, so that
trying to escape any other value results in an error, instead of just
returning the character as-is.
2025-01-21 22:14:38 -05:00
48cff259b2 Updated tests 2025-01-21 22:13:57 -05:00
25cb79f01b Changed the value of EPSILON, so that we can use the NUL character
(which it used to be) in a regex; Also added code to detect escaped
backslashes

Specifically, I replace an escaped backslash with a metacharacter, then
replace it back later on. This prevents problems, like detecting whether
the opening bracket is escaped in '\\[a]'.
2025-01-21 22:12:29 -05:00
0fb78abf7f Added function to replace an element in a slice given its value 2025-01-21 22:09:41 -05:00
9dc4fd4595 Started adding tests from Python's RE test suite 2025-01-20 18:04:19 -05:00
099612ae7f Bug fixes, changed the way I parse octal values 2025-01-20 18:04:05 -05:00
9115858261 Changed assignment of the unicode values by 1, so that EPSILON can now be 0xF0000 2025-01-20 17:08:07 -05:00
5 changed files with 133 additions and 52 deletions

View File

@@ -2,7 +2,6 @@ package main
import (
"fmt"
"math"
"slices"
"strconv"
"unicode"
@@ -82,6 +81,10 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
//
// Also check for non-capturing groups. The LPAREN of a non-capturing group looks like this: '(?:'
// I take this out, and put in a special character - NONCAPLPAREN_CHAR.
//
// Finally, check for escaped backslashes. Replace these with the BACKSLASH metacharacter. Later, in thompson(),
// these will be converted back. This avoids confusion in detecting whether a character is escaped, e.g. detecting
// whether '\\[a]' has an escaped opening bracket (it doesn't).
for i := 0; i < len(re_runes_orig); i++ {
c := re_runes_orig[i]
if c == '<' && (i == 0 || (re_runes_orig[i-1] != '\\' && re_runes_orig[i-1] != '?')) {
@@ -116,6 +119,9 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
} else if c == '(' && i < len(re_runes_orig)-2 && re_runes_orig[i+1] == '?' && re_runes_orig[i+2] == ':' {
re_runes = append(re_runes, NONCAPLPAREN_CHAR)
i += 2
} else if c == '\\' && i < len(re_runes_orig)-1 && re_runes_orig[i+1] == '\\' { // Escaped backslash
re_runes = append(re_runes, ESC_BACKSLASH)
i++
} else {
re_runes = append(re_runes, c)
}
@@ -145,8 +151,8 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
if i < len(re_runes)-1 && re_runes[i+1] == ']' { // Nothing inside brackets - panic.
return nil, fmt.Errorf("Empty character class.")
}
for re_runes[i] != ']' {
i++ // Skip all characters inside brackets
for re_runes[i] != ']' || i == 0 || re_runes[i-1] == '\\' {
i++ // Skip all characters inside _unescaped_ brackets (we are _not_ at a closing bracket, or if we are, the previous character is a backslash)
// TODO: Check for escaped characters
// Check ahead for character range
@@ -303,29 +309,34 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
if err != nil {
return nil, fmt.Errorf("Error parsing hex characters in expression.")
}
i += 2
i++ // Loop increment will take care of going forward
outQueue = append(outQueue, newPostfixCharNode(rune(hexVal)))
} else {
return nil, fmt.Errorf("Not enough hex characters found in expression.")
}
} else if isOctal(re_postfix[i]) { // Octal value
var octVal int
n, err := fmt.Sscanf(string(re_postfix[i:]), "%d", &octVal)
if n < 1 || err != nil {
return nil, fmt.Errorf("Error parsing octal value in expression.")
var octVal int64
var octValStr string
numDigitsParsed := 0
for (i+numDigitsParsed) < len(re_postfix) && isOctal(re_postfix[i+numDigitsParsed]) && numDigitsParsed <= 3 {
octValStr += string(re_postfix[i+numDigitsParsed])
numDigitsParsed++
}
if octVal > 777 {
return nil, fmt.Errorf("Invalid octal value in expression.")
}
i += int(math.Ceil(math.Log10(float64(octVal)))) // Shift forward by the number of digits that were parsed
i-- // Move back one character, because the loop increment will move us back to the next character automatically
octValBase10, err := strconv.ParseInt(strconv.Itoa(octVal), 8, 0)
octVal, err := strconv.ParseInt(octValStr, 8, 32)
if err != nil {
return nil, fmt.Errorf("Error parsing octal value in expression.")
}
outQueue = append(outQueue, newPostfixCharNode(rune(octValBase10)))
if octVal > 0777 {
return nil, fmt.Errorf("Invalid octal value in expression.")
}
i += numDigitsParsed - 1 // Shift forward by the number of digits that were parsed. Move back one character, because the loop increment will move us back to the next character automatically
outQueue = append(outQueue, newPostfixCharNode(rune(octVal)))
} else {
outQueue = append(outQueue, newEscapedNode(re_postfix[i]))
escapedNode, err := newEscapedNode(re_postfix[i], false)
if err != nil {
return nil, fmt.Errorf("Invalid escape character in expression.")
}
outQueue = append(outQueue, escapedNode)
}
continue // Escaped character will automatically be skipped when loop variable increments
}
@@ -446,19 +457,30 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
} else {
return nil, fmt.Errorf("Not enough hex characters found in character class.")
}
} else if unicode.IsDigit(re_postfix[i]) { // Octal value
var octVal int
n, err := fmt.Sscanf(string(re_postfix[i:]), "%d", &octVal)
if n < 1 || err != nil {
} else if isOctal(re_postfix[i]) { // Octal value
var octVal int64
var octValStr string
numDigitsParsed := 0
for (i+numDigitsParsed) < len(re_postfix)-1 && isOctal(re_postfix[i+numDigitsParsed]) && numDigitsParsed <= 3 { // The '-1' exists, because even in the worst case (the character class extends till the end), the last character must be a closing bracket (and nothing else)
octValStr += string(re_postfix[i+numDigitsParsed])
numDigitsParsed++
}
octVal, err := strconv.ParseInt(octValStr, 8, 32)
if err != nil {
return nil, fmt.Errorf("Error parsing octal value in character class.")
}
if octVal > 0777 {
return nil, fmt.Errorf("Invalid octal value in character class.")
}
i += int(math.Ceil(math.Log10(float64(octVal)) / math.Log10(8))) // Shift forward by the number of digits that were parsed
i += numDigitsParsed // Shift forward by the number of characters parsed
chars = append(chars, newPostfixCharNode(rune(octVal)))
} else {
chars = append(chars, newEscapedNode(re_postfix[i]))
escapedNode, err := newEscapedNode(re_postfix[i], true)
if err != nil {
return nil, fmt.Errorf("Invalid escape character in character class.")
}
chars = append(chars, escapedNode)
i++
}
} else {
chars = append(chars, newPostfixCharNode(re_postfix[i]))
@@ -591,6 +613,7 @@ func thompson(re []postfixNode) (Reg, error) {
// - If the node doesn't have exceptions (allChars == false) then the contents of the node are added to the except list.
for _, node := range c.except {
if node.allChars {
state.allChars = false
// For each postfixNode in node.except, extract the contents of the postfixNode. Concatenate them all,
// and add them to the state's _content_. As mentioned above, if the exception has exceptions, then we can match
// those.
@@ -655,6 +678,11 @@ func thompson(re []postfixNode) (Reg, error) {
}
}
// Replace ESC_BACKSLASH with actual backslash, so that we can actually check if we encounter it
replaceByValue([]int(state.content), int(ESC_BACKSLASH), '\\')
replaceByValue(state.except, ESC_BACKSLASH, '\\')
nfa = append(nfa, &state)
}
if c.nodetype == LPAREN || c.nodetype == RPAREN {

23
misc.go
View File

@@ -8,12 +8,13 @@ import (
var whitespaceChars = []rune{' ', '\t', '\n'}
var digitChars = []rune{'0', '1', '2', '3', '4', '5', '6', '7', '8', '9'}
var wordChars = []rune("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_")
var LBRACKET rune = 0xF0000
var RBRACKET rune = 0xF0001
var ANY_CHAR rune = 0xF0002 // Represents any character - used for states where the allChars flag is on.
var LPAREN_CHAR rune = 0xF0003 // Parentheses in regex are concatenated with this - it acts as a pseudo-parentheses
var RPAREN_CHAR rune = 0xF0004
var NONCAPLPAREN_CHAR rune = 0xF0005 // Represents a non-capturing group's LPAREN
var LBRACKET rune = 0xF0001
var RBRACKET rune = 0xF0002
var ANY_CHAR rune = 0xF0003 // Represents any character - used for states where the allChars flag is on.
var LPAREN_CHAR rune = 0xF0004 // Parentheses in regex are concatenated with this - it acts as a pseudo-parentheses
var RPAREN_CHAR rune = 0xF0005
var NONCAPLPAREN_CHAR rune = 0xF0006 // Represents a non-capturing group's LPAREN
var ESC_BACKSLASH rune = 0xF0007 // Represents an escaped backslash
// Returns true if str[idx] and str[idx-1] are separated by a word boundary.
func isWordBoundary(str []rune, idx int) bool {
@@ -139,3 +140,13 @@ func isHex(c rune) bool {
// isOctal reports whether c is one of the octal digits '0' through '7'.
func isOctal(c rune) bool {
	octalDigits := []rune{'0', '1', '2', '3', '4', '5', '6', '7'}
	return slices.Index(octalDigits, c) >= 0
}
// replaceByValue overwrites, in place, every element of slc that is equal to
// toReplace with replaceWith. The same slice that was passed in is returned.
func replaceByValue[T comparable](slc []T, toReplace T, replaceWith T) []T {
	for idx := range slc {
		if slc[idx] != toReplace {
			continue
		}
		slc[idx] = replaceWith
	}
	return slc
}

2
nfa.go
View File

@@ -4,7 +4,7 @@ import (
"slices"
)
const EPSILON int = 0
const EPSILON int = 0xF0000
type assertType int

View File

@@ -1,5 +1,7 @@
package main
import "fmt"
type NodeType int
// This is a slice containing all escapable characters that have special meaning.
@@ -62,7 +64,7 @@ func newCharClassNode(nodes []postfixNode, negated bool) postfixNode {
}
// Creates a new escaped node - the given character is assumed to have been preceded by a backslash
func newEscapedNode(c rune) postfixNode {
func newEscapedNode(c rune, inCharClass bool) (postfixNode, error) {
toReturn := postfixNode{}
toReturn.startReps = 1
toReturn.endReps = 1
@@ -86,8 +88,13 @@ func newEscapedNode(c rune) postfixNode {
toReturn = newPostfixDotNode()
toReturn.except = append([]postfixNode{}, newPostfixNode(wordChars...))
case 'b', 'B':
toReturn.nodetype = ASSERTION
toReturn.contents = append(toReturn.contents, c)
if c == 'b' && inCharClass {
toReturn.nodetype = CHARACTER
toReturn.contents = append(toReturn.contents, rune(8))
} else {
toReturn.nodetype = ASSERTION
toReturn.contents = append(toReturn.contents, c)
}
case 'n': // Newline character
toReturn.nodetype = CHARACTER
toReturn.contents = append(toReturn.contents, '\n')
@@ -110,10 +117,13 @@ func newEscapedNode(c rune) postfixNode {
toReturn.nodetype = CHARACTER
toReturn.contents = append(toReturn.contents, rune(11))
default: // None of the above - append it as a regular character
if isNormalChar(c) { // Normal characters cannot be escaped
return postfixNode{}, fmt.Errorf("Invalid escape character.")
}
toReturn.nodetype = CHARACTER
toReturn.contents = append(toReturn.contents, c)
}
return toReturn
return toReturn, nil
}
// Creates and returns a postfixNode based on the given contents

View File

@@ -110,6 +110,9 @@ var reTests = []struct {
{`\d{3,4}`, "ababab555", []Group{{6, 9}}},
{`\bpaint\b`, "paints", []Group{}},
{`\b\w{5}\b`, "paint", []Group{{0, 5}}},
{`[^\w]`, "abcdef1230[]qq';;'", []Group{{10, 11}, {11, 12}, {14, 15}, {15, 16}, {16, 17}, {17, 18}}},
{`[^\W]`, "abcdef1230[]qq';;'", []Group{{0, 1}, {1, 2}, {2, 3}, {3, 4}, {4, 5}, {5, 6}, {6, 7}, {7, 8}, {8, 9}, {9, 10}, {12, 13}, {13, 14}}},
{`[\[\]]`, "a[b[l]]", []Group{{1, 2}, {3, 4}, {5, 6}, {6, 7}}},
// Unicode tests
{`.+`, "úïäö´«åæïëòöê»éãçâï«úïòíñ", []Group{{0, 25}}},
@@ -149,6 +152,8 @@ var reTests = []struct {
{"^((3[7-9])|([4-9][0-9])|([1-9][0-9][0-9])|(1000))$", "400", []Group{{0, 3}}},
{"^((3[7-9])|([4-9][0-9])|([1-9][0-9][0-9])|(1000))$", "4000", []Group{}},
{"a{1,3}", "aaaaa", []Group{{0, 3}, {3, 5}}},
{`\\[ab\\]`, "a", []Group{}},
{`\\[ab\\]`, `\a`, []Group{{0, 2}}},
// Lookaround tests
{"(?<=bo)y", "boy", []Group{{2, 3}}},
@@ -156,6 +161,24 @@ var reTests = []struct {
{"(?<=f)f+(?=f)", "fffff", []Group{{1, 4}}},
{"(?<=f)f+(?=f)", "fffffa", []Group{{1, 4}}},
// Test cases from Python's RE test suite
{`[\1]`, "\x01", []Group{{0, 1}}},
{`\0`, "\x00", []Group{{0, 1}}},
{`[\0a]`, "\x00", []Group{{0, 1}}},
{`[\0a]`, "\x00", []Group{{0, 1}}},
{`[a\0]`, "\x00", []Group{{0, 1}}},
{`[^a\0]`, "\x00", []Group{}},
{`\a[\b]\f\n\r\t\v`, "\a\b\f\n\r\t\v", []Group{{0, 7}}},
{`[\a][\b][\f][\n][\r][\t][\v]`, "\a\b\f\n\r\t\v", []Group{{0, 7}}},
{`\u`, "", nil},
{`\xff`, "ÿ", []Group{{0, 1}}},
{`\x00ffffffffffffff`, "\xff", []Group{}},
{`\x00f`, "\x0f", []Group{}},
{`\x00fe`, "\xfe", []Group{}},
{`^\w+=(\\[\000-\277]|[^\n\\])*`, "SRC=eval.c g.c blah blah blah \\\\\n\tapes.c", []Group{{0, 32}}},
// Todo - add numeric range tests
}
@@ -183,6 +206,7 @@ var groupTests = []struct {
{"(a?)a?", "ab", []Match{[]Group{{0, 1}, {0, 1}}, []Group{{1, 1}, {1, 1}}, []Group{{2, 2}, {2, 2}}}},
{"(a?)a?", "aa", []Match{[]Group{{0, 2}, {0, 1}}, []Group{{2, 2}, {2, 2}}}},
{"a((b.d){3})", "abfdbhdbid", []Match{[]Group{{0, 10}, {1, 10}, {7, 10}}}},
{`(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)(l)\071`, `abcdefghijkl9`, []Match{[]Group{{0, 13}, {0, 1}, {1, 2}, {2, 3}, {3, 4}, {4, 5}, {5, 6}, {6, 7}, {7, 8}, {8, 9}, {9, 10}, {10, 11}, {11, 12}}}},
}
func TestFindAllMatches(t *testing.T) {
@@ -190,15 +214,18 @@ func TestFindAllMatches(t *testing.T) {
t.Run(test.re+" "+test.str, func(t *testing.T) {
regComp, err := Compile(test.re)
if err != nil {
panic(err)
}
matchIndices := FindAllMatches(regComp, test.str)
zeroGroups := make([]Group, len(matchIndices))
for i, m := range matchIndices {
zeroGroups[i] = m[0]
}
if !slices.Equal(test.result, zeroGroups) {
t.Errorf("Wanted %v Got %v\n", test.result, zeroGroups)
if test.result != nil {
panic(err)
}
} else {
matchIndices := FindAllMatches(regComp, test.str)
zeroGroups := make([]Group, len(matchIndices))
for i, m := range matchIndices {
zeroGroups[i] = m[0]
}
if !slices.Equal(test.result, zeroGroups) {
t.Errorf("Wanted %v Got %v\n", test.result, zeroGroups)
}
}
})
}
@@ -209,17 +236,20 @@ func TestFindString(t *testing.T) {
t.Run(test.re+" "+test.str, func(t *testing.T) {
regComp, err := Compile(test.re)
if err != nil {
panic(err)
}
foundString := FindString(regComp, test.str)
if len(test.result) == 0 {
if foundString != "" {
t.Errorf("Expected no match got %v\n", foundString)
if test.result != nil {
panic(err)
}
} else {
expectedString := test.str[test.result[0].startIdx:test.result[0].endIdx]
if foundString != foundString {
t.Errorf("Wanted %v Got %v\n", expectedString, foundString)
foundString := FindString(regComp, test.str)
if len(test.result) == 0 {
if foundString != "" {
t.Errorf("Expected no match got %v\n", foundString)
}
} else {
expectedString := test.str[test.result[0].startIdx:test.result[0].endIdx]
if foundString != expectedString {
t.Errorf("Wanted %v Got %v\n", expectedString, foundString)
}
}
}
})
@@ -231,7 +261,9 @@ func TestFindAllGroups(t *testing.T) {
t.Run(test.re+" "+test.str, func(t *testing.T) {
regComp, err := Compile(test.re)
if err != nil {
panic(err)
if test.result != nil {
panic(err)
}
}
matchIndices := FindAllMatches(regComp, test.str)
for i := range matchIndices {