Compare commits
7 Commits
fb46ed62d9
...
d210a85253
Author | SHA1 | Date | |
---|---|---|---|
d210a85253 | |||
48cff259b2 | |||
25cb79f01b | |||
0fb78abf7f | |||
9dc4fd4595 | |||
099612ae7f | |||
9115858261 |
72
compile.go
72
compile.go
@@ -2,7 +2,6 @@ package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"math"
|
||||
"slices"
|
||||
"strconv"
|
||||
"unicode"
|
||||
@@ -82,6 +81,10 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
//
|
||||
// Also check for non-capturing groups. The LPAREN of a non-capturing group looks like this: '(?:'
|
||||
// I take this out, and put in a special character - NONCAPLPAREN_CHAR.
|
||||
//
|
||||
// Finally, check for escaped backslashes. Replace these with the BACKSLASH metacharacter. Later, in thompson(),
|
||||
// these will be converted back. This avoids confusion in detecting whether a character is escaped, e.g. detecting
|
||||
// whether '\\[a]' has an escaped opening bracket (it doesn't).
|
||||
for i := 0; i < len(re_runes_orig); i++ {
|
||||
c := re_runes_orig[i]
|
||||
if c == '<' && (i == 0 || (re_runes_orig[i-1] != '\\' && re_runes_orig[i-1] != '?')) {
|
||||
@@ -116,6 +119,9 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
} else if c == '(' && i < len(re_runes_orig)-2 && re_runes_orig[i+1] == '?' && re_runes_orig[i+2] == ':' {
|
||||
re_runes = append(re_runes, NONCAPLPAREN_CHAR)
|
||||
i += 2
|
||||
} else if c == '\\' && i < len(re_runes_orig)-1 && re_runes_orig[i+1] == '\\' { // Escaped backslash
|
||||
re_runes = append(re_runes, ESC_BACKSLASH)
|
||||
i++
|
||||
} else {
|
||||
re_runes = append(re_runes, c)
|
||||
}
|
||||
@@ -145,8 +151,8 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
if i < len(re_runes)-1 && re_runes[i+1] == ']' { // Nothing inside brackets - panic.
|
||||
return nil, fmt.Errorf("Empty character class.")
|
||||
}
|
||||
for re_runes[i] != ']' {
|
||||
i++ // Skip all characters inside brackets
|
||||
for re_runes[i] != ']' || i == 0 || re_runes[i-1] == '\\' {
|
||||
i++ // Skip all characters inside _unescaped_ brackets (we are _not_ at a closing bracket, or if we are, the previous character is a backslash)
|
||||
// TODO: Check for escaped characters
|
||||
|
||||
// Check ahead for character range
|
||||
@@ -303,29 +309,34 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("Error parsing hex characters in expression.")
|
||||
}
|
||||
i += 2
|
||||
i++ // Loop increment will take care of going forward
|
||||
outQueue = append(outQueue, newPostfixCharNode(rune(hexVal)))
|
||||
} else {
|
||||
return nil, fmt.Errorf("Not enough hex characters found in expression.")
|
||||
}
|
||||
} else if isOctal(re_postfix[i]) { // Octal value
|
||||
var octVal int
|
||||
n, err := fmt.Sscanf(string(re_postfix[i:]), "%d", &octVal)
|
||||
if n < 1 || err != nil {
|
||||
return nil, fmt.Errorf("Error parsing octal value in expression.")
|
||||
var octVal int64
|
||||
var octValStr string
|
||||
numDigitsParsed := 0
|
||||
for (i+numDigitsParsed) < len(re_postfix) && isOctal(re_postfix[i+numDigitsParsed]) && numDigitsParsed <= 3 {
|
||||
octValStr += string(re_postfix[i+numDigitsParsed])
|
||||
numDigitsParsed++
|
||||
}
|
||||
if octVal > 777 {
|
||||
return nil, fmt.Errorf("Invalid octal value in expression.")
|
||||
}
|
||||
i += int(math.Ceil(math.Log10(float64(octVal)))) // Shift forward by the number of digits that were parsed
|
||||
i-- // Move back one character, because the loop increment will move us back to the next character automatically
|
||||
octValBase10, err := strconv.ParseInt(strconv.Itoa(octVal), 8, 0)
|
||||
octVal, err := strconv.ParseInt(octValStr, 8, 32)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("Error parsing octal value in expression.")
|
||||
}
|
||||
outQueue = append(outQueue, newPostfixCharNode(rune(octValBase10)))
|
||||
if octVal > 0777 {
|
||||
return nil, fmt.Errorf("Invalid octal value in expression.")
|
||||
}
|
||||
i += numDigitsParsed - 1 // Shift forward by the number of digits that were parsed. Move back one character, because the loop increment will move us back to the next character automatically
|
||||
outQueue = append(outQueue, newPostfixCharNode(rune(octVal)))
|
||||
} else {
|
||||
outQueue = append(outQueue, newEscapedNode(re_postfix[i]))
|
||||
escapedNode, err := newEscapedNode(re_postfix[i], false)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("Invalid escape character in expression.")
|
||||
}
|
||||
outQueue = append(outQueue, escapedNode)
|
||||
}
|
||||
continue // Escaped character will automatically be skipped when loop variable increments
|
||||
}
|
||||
@@ -446,19 +457,30 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
} else {
|
||||
return nil, fmt.Errorf("Not enough hex characters found in character class.")
|
||||
}
|
||||
} else if unicode.IsDigit(re_postfix[i]) { // Octal value
|
||||
var octVal int
|
||||
n, err := fmt.Sscanf(string(re_postfix[i:]), "%d", &octVal)
|
||||
if n < 1 || err != nil {
|
||||
} else if isOctal(re_postfix[i]) { // Octal value
|
||||
var octVal int64
|
||||
var octValStr string
|
||||
numDigitsParsed := 0
|
||||
for (i+numDigitsParsed) < len(re_postfix)-1 && isOctal(re_postfix[i+numDigitsParsed]) && numDigitsParsed <= 3 { // The '-1' exists, because even in the worst case (the character class extends till the end), the last character must be a closing bracket (and nothing else)
|
||||
octValStr += string(re_postfix[i+numDigitsParsed])
|
||||
numDigitsParsed++
|
||||
}
|
||||
octVal, err := strconv.ParseInt(octValStr, 8, 32)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("Error parsing octal value in character class.")
|
||||
}
|
||||
if octVal > 0777 {
|
||||
return nil, fmt.Errorf("Invalid octal value in character class.")
|
||||
}
|
||||
i += int(math.Ceil(math.Log10(float64(octVal)) / math.Log10(8))) // Shift forward by the number of digits that were parsed
|
||||
i += numDigitsParsed // Shift forward by the number of characters parsed
|
||||
chars = append(chars, newPostfixCharNode(rune(octVal)))
|
||||
} else {
|
||||
chars = append(chars, newEscapedNode(re_postfix[i]))
|
||||
escapedNode, err := newEscapedNode(re_postfix[i], true)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("Invalid escape character in character class.")
|
||||
}
|
||||
chars = append(chars, escapedNode)
|
||||
i++
|
||||
}
|
||||
} else {
|
||||
chars = append(chars, newPostfixCharNode(re_postfix[i]))
|
||||
@@ -591,6 +613,7 @@ func thompson(re []postfixNode) (Reg, error) {
|
||||
// - If the node doesn't have exceptions (allChars == false) then the contents of the node are added to the except list.
|
||||
for _, node := range c.except {
|
||||
if node.allChars {
|
||||
state.allChars = false
|
||||
// For each postfixNode in node.except, extract the contents of the postfixNode. Concatenate them all,
|
||||
// and add them to the state's _content_. As mentioned above, if the exception has exceptions, then we can match
|
||||
// those.
|
||||
@@ -655,6 +678,11 @@ func thompson(re []postfixNode) (Reg, error) {
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
// Replace ESC_BACKSLASH with actual backslash, so that we can actually check if we encounter it
|
||||
replaceByValue([]int(state.content), int(ESC_BACKSLASH), '\\')
|
||||
replaceByValue(state.except, ESC_BACKSLASH, '\\')
|
||||
|
||||
nfa = append(nfa, &state)
|
||||
}
|
||||
if c.nodetype == LPAREN || c.nodetype == RPAREN {
|
||||
|
23
misc.go
23
misc.go
@@ -8,12 +8,13 @@ import (
|
||||
var whitespaceChars = []rune{' ', '\t', '\n'}
|
||||
var digitChars = []rune{'0', '1', '2', '3', '4', '5', '6', '7', '8', '9'}
|
||||
var wordChars = []rune("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_")
|
||||
var LBRACKET rune = 0xF0000
|
||||
var RBRACKET rune = 0xF0001
|
||||
var ANY_CHAR rune = 0xF0002 // Represents any character - used for states where the allChars flag is on.
|
||||
var LPAREN_CHAR rune = 0xF0003 // Parentheses in regex are concatenated with this - it acts as a pseudo-parenthesis
|
||||
var RPAREN_CHAR rune = 0xF0004
|
||||
var NONCAPLPAREN_CHAR rune = 0xF0005 // Represents a non-capturing group's LPAREN
|
||||
var LBRACKET rune = 0xF0001
|
||||
var RBRACKET rune = 0xF0002
|
||||
var ANY_CHAR rune = 0xF0003 // Represents any character - used for states where the allChars flag is on.
|
||||
var LPAREN_CHAR rune = 0xF0004 // Parentheses in regex are concatenated with this - it acts as a pseudo-parenthesis
|
||||
var RPAREN_CHAR rune = 0xF0005
|
||||
var NONCAPLPAREN_CHAR rune = 0xF0006 // Represents a non-capturing group's LPAREN
|
||||
var ESC_BACKSLASH rune = 0xF0007 // Represents an escaped backslash
|
||||
|
||||
// Returns true if str[idx] and str[idx-1] are separated by a word boundary.
|
||||
func isWordBoundary(str []rune, idx int) bool {
|
||||
@@ -139,3 +140,13 @@ func isHex(c rune) bool {
|
||||
// isOctal reports whether c is one of the octal digit characters '0' through '7'.
func isOctal(c rune) bool {
|
||||
return slices.Contains([]rune("01234567"), c)
|
||||
}
|
||||
|
||||
// replaceByValue overwrites every element of slc that equals toReplace with
// replaceWith. The replacement happens in place (the elements of slc itself are
// assigned to), and the same slice is returned for convenience.
|
||||
func replaceByValue[T comparable](slc []T, toReplace T, replaceWith T) []T {
|
||||
for i, val := range slc {
|
||||
if val == toReplace {
|
||||
slc[i] = replaceWith
|
||||
}
|
||||
}
|
||||
return slc
|
||||
}
|
||||
|
2
nfa.go
2
nfa.go
@@ -4,7 +4,7 @@ import (
|
||||
"slices"
|
||||
)
|
||||
|
||||
const EPSILON int = 0
|
||||
const EPSILON int = 0xF0000
|
||||
|
||||
type assertType int
|
||||
|
||||
|
@@ -1,5 +1,7 @@
|
||||
package main
|
||||
|
||||
import "fmt"
|
||||
|
||||
type NodeType int
|
||||
|
||||
// This is a slice containing all escapable characters that have special meaning.
|
||||
@@ -62,7 +64,7 @@ func newCharClassNode(nodes []postfixNode, negated bool) postfixNode {
|
||||
}
|
||||
|
||||
// Creates a new escaped node - the given character is assumed to have been preceded by a backslash
|
||||
func newEscapedNode(c rune) postfixNode {
|
||||
func newEscapedNode(c rune, inCharClass bool) (postfixNode, error) {
|
||||
toReturn := postfixNode{}
|
||||
toReturn.startReps = 1
|
||||
toReturn.endReps = 1
|
||||
@@ -86,8 +88,13 @@ func newEscapedNode(c rune) postfixNode {
|
||||
toReturn = newPostfixDotNode()
|
||||
toReturn.except = append([]postfixNode{}, newPostfixNode(wordChars...))
|
||||
case 'b', 'B':
|
||||
toReturn.nodetype = ASSERTION
|
||||
toReturn.contents = append(toReturn.contents, c)
|
||||
if c == 'b' && inCharClass {
|
||||
toReturn.nodetype = CHARACTER
|
||||
toReturn.contents = append(toReturn.contents, rune(8))
|
||||
} else {
|
||||
toReturn.nodetype = ASSERTION
|
||||
toReturn.contents = append(toReturn.contents, c)
|
||||
}
|
||||
case 'n': // Newline character
|
||||
toReturn.nodetype = CHARACTER
|
||||
toReturn.contents = append(toReturn.contents, '\n')
|
||||
@@ -110,10 +117,13 @@ func newEscapedNode(c rune) postfixNode {
|
||||
toReturn.nodetype = CHARACTER
|
||||
toReturn.contents = append(toReturn.contents, rune(11))
|
||||
default: // None of the above - append it as a regular character
|
||||
if isNormalChar(c) { // Normal characters cannot be escaped
|
||||
return postfixNode{}, fmt.Errorf("Invalid escape character.")
|
||||
}
|
||||
toReturn.nodetype = CHARACTER
|
||||
toReturn.contents = append(toReturn.contents, c)
|
||||
}
|
||||
return toReturn
|
||||
return toReturn, nil
|
||||
}
|
||||
|
||||
// Creates and returns a postfixNode based on the given contents
|
||||
|
70
re_test.go
70
re_test.go
@@ -110,6 +110,9 @@ var reTests = []struct {
|
||||
{`\d{3,4}`, "ababab555", []Group{{6, 9}}},
|
||||
{`\bpaint\b`, "paints", []Group{}},
|
||||
{`\b\w{5}\b`, "paint", []Group{{0, 5}}},
|
||||
{`[^\w]`, "abcdef1230[]qq';;'", []Group{{10, 11}, {11, 12}, {14, 15}, {15, 16}, {16, 17}, {17, 18}}},
|
||||
{`[^\W]`, "abcdef1230[]qq';;'", []Group{{0, 1}, {1, 2}, {2, 3}, {3, 4}, {4, 5}, {5, 6}, {6, 7}, {7, 8}, {8, 9}, {9, 10}, {12, 13}, {13, 14}}},
|
||||
{`[\[\]]`, "a[b[l]]", []Group{{1, 2}, {3, 4}, {5, 6}, {6, 7}}},
|
||||
|
||||
// Unicode tests
|
||||
{`.+`, "úïäö´«åæïëòöê»éãçâï«úïòíñ", []Group{{0, 25}}},
|
||||
@@ -149,6 +152,8 @@ var reTests = []struct {
|
||||
{"^((3[7-9])|([4-9][0-9])|([1-9][0-9][0-9])|(1000))$", "400", []Group{{0, 3}}},
|
||||
{"^((3[7-9])|([4-9][0-9])|([1-9][0-9][0-9])|(1000))$", "4000", []Group{}},
|
||||
{"a{1,3}", "aaaaa", []Group{{0, 3}, {3, 5}}},
|
||||
{`\\[ab\\]`, "a", []Group{}},
|
||||
{`\\[ab\\]`, `\a`, []Group{{0, 2}}},
|
||||
|
||||
// Lookaround tests
|
||||
{"(?<=bo)y", "boy", []Group{{2, 3}}},
|
||||
@@ -156,6 +161,24 @@ var reTests = []struct {
|
||||
{"(?<=f)f+(?=f)", "fffff", []Group{{1, 4}}},
|
||||
{"(?<=f)f+(?=f)", "fffffa", []Group{{1, 4}}},
|
||||
|
||||
// Test cases from Python's RE test suite
|
||||
{`[\1]`, "\x01", []Group{{0, 1}}},
|
||||
|
||||
{`\0`, "\x00", []Group{{0, 1}}},
|
||||
{`[\0a]`, "\x00", []Group{{0, 1}}},
|
||||
{`[\0a]`, "\x00", []Group{{0, 1}}},
|
||||
{`[a\0]`, "\x00", []Group{{0, 1}}},
|
||||
{`[^a\0]`, "\x00", []Group{}},
|
||||
|
||||
{`\a[\b]\f\n\r\t\v`, "\a\b\f\n\r\t\v", []Group{{0, 7}}},
|
||||
{`[\a][\b][\f][\n][\r][\t][\v]`, "\a\b\f\n\r\t\v", []Group{{0, 7}}},
|
||||
{`\u`, "", nil},
|
||||
{`\xff`, "ÿ", []Group{{0, 1}}},
|
||||
{`\x00ffffffffffffff`, "\xff", []Group{}},
|
||||
{`\x00f`, "\x0f", []Group{}},
|
||||
{`\x00fe`, "\xfe", []Group{}},
|
||||
{`^\w+=(\\[\000-\277]|[^\n\\])*`, "SRC=eval.c g.c blah blah blah \\\\\n\tapes.c", []Group{{0, 32}}},
|
||||
|
||||
// Todo - add numeric range tests
|
||||
}
|
||||
|
||||
@@ -183,6 +206,7 @@ var groupTests = []struct {
|
||||
{"(a?)a?", "ab", []Match{[]Group{{0, 1}, {0, 1}}, []Group{{1, 1}, {1, 1}}, []Group{{2, 2}, {2, 2}}}},
|
||||
{"(a?)a?", "aa", []Match{[]Group{{0, 2}, {0, 1}}, []Group{{2, 2}, {2, 2}}}},
|
||||
{"a((b.d){3})", "abfdbhdbid", []Match{[]Group{{0, 10}, {1, 10}, {7, 10}}}},
|
||||
{`(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)(l)\071`, `abcdefghijkl9`, []Match{[]Group{{0, 13}, {0, 1}, {1, 2}, {2, 3}, {3, 4}, {4, 5}, {5, 6}, {6, 7}, {7, 8}, {8, 9}, {9, 10}, {10, 11}, {11, 12}}}},
|
||||
}
|
||||
|
||||
func TestFindAllMatches(t *testing.T) {
|
||||
@@ -190,15 +214,18 @@ func TestFindAllMatches(t *testing.T) {
|
||||
t.Run(test.re+" "+test.str, func(t *testing.T) {
|
||||
regComp, err := Compile(test.re)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
matchIndices := FindAllMatches(regComp, test.str)
|
||||
zeroGroups := make([]Group, len(matchIndices))
|
||||
for i, m := range matchIndices {
|
||||
zeroGroups[i] = m[0]
|
||||
}
|
||||
if !slices.Equal(test.result, zeroGroups) {
|
||||
t.Errorf("Wanted %v Got %v\n", test.result, zeroGroups)
|
||||
if test.result != nil {
|
||||
panic(err)
|
||||
}
|
||||
} else {
|
||||
matchIndices := FindAllMatches(regComp, test.str)
|
||||
zeroGroups := make([]Group, len(matchIndices))
|
||||
for i, m := range matchIndices {
|
||||
zeroGroups[i] = m[0]
|
||||
}
|
||||
if !slices.Equal(test.result, zeroGroups) {
|
||||
t.Errorf("Wanted %v Got %v\n", test.result, zeroGroups)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
@@ -209,17 +236,20 @@ func TestFindString(t *testing.T) {
|
||||
t.Run(test.re+" "+test.str, func(t *testing.T) {
|
||||
regComp, err := Compile(test.re)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
foundString := FindString(regComp, test.str)
|
||||
if len(test.result) == 0 {
|
||||
if foundString != "" {
|
||||
t.Errorf("Expected no match got %v\n", foundString)
|
||||
if test.result != nil {
|
||||
panic(err)
|
||||
}
|
||||
} else {
|
||||
expectedString := test.str[test.result[0].startIdx:test.result[0].endIdx]
|
||||
if foundString != foundString {
|
||||
t.Errorf("Wanted %v Got %v\n", expectedString, foundString)
|
||||
foundString := FindString(regComp, test.str)
|
||||
if len(test.result) == 0 {
|
||||
if foundString != "" {
|
||||
t.Errorf("Expected no match got %v\n", foundString)
|
||||
}
|
||||
} else {
|
||||
expectedString := test.str[test.result[0].startIdx:test.result[0].endIdx]
|
||||
if foundString != expectedString {
|
||||
t.Errorf("Wanted %v Got %v\n", expectedString, foundString)
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
@@ -231,7 +261,9 @@ func TestFindAllGroups(t *testing.T) {
|
||||
t.Run(test.re+" "+test.str, func(t *testing.T) {
|
||||
regComp, err := Compile(test.re)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
if test.result != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
matchIndices := FindAllMatches(regComp, test.str)
|
||||
for i := range matchIndices {
|
||||
|
Reference in New Issue
Block a user