Compare commits
9 Commits
5e3801af7c
...
94c8044eb7
Author | SHA1 | Date | |
---|---|---|---|
94c8044eb7 | |||
4a45d1c95e | |||
861eb6067e | |||
027dfb4d6b | |||
932a20f641 | |||
4547ba74f0 | |||
125590d334 | |||
e3b8eaf5f8 | |||
20142e93c4 |
66
compile.go
66
compile.go
@@ -324,11 +324,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
*/
|
||||
c := re_postfix[i]
|
||||
if isNormalChar(c) || isSpecialCharWithMetacharReplacement(c) {
|
||||
if caseInsensitive {
|
||||
outQueue = append(outQueue, newPostfixNode(allCases(c)...))
|
||||
} else {
|
||||
outQueue = append(outQueue, newPostfixNode(c))
|
||||
}
|
||||
outQueue = append(outQueue, newPostfixNode(allCases(c, caseInsensitive)...))
|
||||
continue
|
||||
}
|
||||
// Since every unescaped bracket is replaced by a LBRACKET / RBRACKET, there may
|
||||
@@ -344,7 +340,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
}
|
||||
|
||||
if c == '\\' { // Escape character - invert special and non-special characters, e.g. \( is treated as a literal parenthesis, \b is treated as a word boundary
|
||||
if i == len(re_postfix)-1 { // End of string - panic, because backslash is an escape character (something needs to come after it)
|
||||
if i == len(re_postfix)-1 { // End of string - throw error, because backslash is an escape character (something needs to come after it)
|
||||
return nil, fmt.Errorf("backslash with no escape character")
|
||||
}
|
||||
i++
|
||||
@@ -356,7 +352,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
if n < 1 || err != nil {
|
||||
return nil, fmt.Errorf("error parsing expanded hex code in expression")
|
||||
}
|
||||
outQueue = append(outQueue, newPostfixCharNode(rune(hexVal)))
|
||||
outQueue = append(outQueue, newPostfixCharNode(allCases(rune(hexVal), caseInsensitive)...))
|
||||
i += 7
|
||||
} else if i < len(re_postfix)-1 { // Two-digit hex code
|
||||
hexVal, err := strconv.ParseInt(string([]rune{re_postfix[i], re_postfix[i+1]}), 16, 64) // Convert the two hex values into a rune slice, then to a string. Parse the string into an int with strconv.ParseInt()
|
||||
@@ -364,7 +360,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
return nil, fmt.Errorf("error parsing hex characters in expression")
|
||||
}
|
||||
i++ // Loop increment will take care of going forward
|
||||
outQueue = append(outQueue, newPostfixCharNode(rune(hexVal)))
|
||||
outQueue = append(outQueue, newPostfixCharNode(allCases(rune(hexVal), caseInsensitive)...))
|
||||
} else {
|
||||
return nil, fmt.Errorf("not enough hex characters found in expression")
|
||||
}
|
||||
@@ -384,7 +380,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
return nil, fmt.Errorf("invalid octal value in expression")
|
||||
}
|
||||
i += numDigitsParsed - 1 // Shift forward by the number of digits that were parsed. Move back one character, because the loop increment will move us back to the next character automatically
|
||||
outQueue = append(outQueue, newPostfixCharNode(rune(octVal)))
|
||||
outQueue = append(outQueue, newPostfixCharNode(allCases(rune(octVal), caseInsensitive)...))
|
||||
} else {
|
||||
escapedNode, err := newEscapedNode(re_postfix[i], false)
|
||||
if err != nil {
|
||||
@@ -426,7 +422,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
regex += string(re_postfix[i])
|
||||
i++
|
||||
}
|
||||
if len(regex) <= 1 { // Nothing in regex - panic
|
||||
if len(regex) <= 1 { // Nothing in regex - throw error
|
||||
return nil, fmt.Errorf("invalid lookaround. (too short?)")
|
||||
}
|
||||
// 'regex' should now contain the lookaround regex, plus the characters at the start (which indicate pos/neg, ahead/behind)
|
||||
@@ -469,6 +465,10 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
outQueue = append(outQueue, newPostfixNode(to_append))
|
||||
topStack, _ = peek(opStack)
|
||||
}
|
||||
outQueueFinalElement, _ := peek(outQueue)
|
||||
if (c == '*' && outQueueFinalElement.nodetype == KLEENE) || (c == '+' && outQueueFinalElement.nodetype == PLUS) { // You cannot apply a quantifier to a quantifier in this way
|
||||
return nil, fmt.Errorf("illegal use of token '%c'", c)
|
||||
}
|
||||
opStack = append(opStack, c)
|
||||
}
|
||||
}
|
||||
@@ -520,7 +520,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
if n < 1 || err != nil {
|
||||
return nil, fmt.Errorf("error parsing expanded hex code in character class")
|
||||
}
|
||||
chars = append(chars, newPostfixCharNode(rune(hexVal)))
|
||||
chars = append(chars, newPostfixCharNode(allCases(rune(hexVal), caseInsensitive)...))
|
||||
i += 8
|
||||
} else if i < len(re_postfix)-2 { // Two-digit hex code
|
||||
hexVal, err := strconv.ParseInt(string([]rune{re_postfix[i], re_postfix[i+1]}), 16, 64) // Convert the two hex values into a rune slice, then to a string. Parse the string into an int with strconv.ParseInt()
|
||||
@@ -528,7 +528,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
return nil, fmt.Errorf("error parsing hex characters in character class")
|
||||
}
|
||||
i += 2
|
||||
chars = append(chars, newPostfixCharNode(rune(hexVal)))
|
||||
chars = append(chars, newPostfixCharNode(allCases(rune(hexVal), caseInsensitive)...))
|
||||
} else {
|
||||
return nil, fmt.Errorf("not enough hex characters found in character class")
|
||||
}
|
||||
@@ -548,7 +548,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
return nil, fmt.Errorf("invalid octal value in character class")
|
||||
}
|
||||
i += numDigitsParsed // Shift forward by the number of characters parsed
|
||||
chars = append(chars, newPostfixCharNode(rune(octVal)))
|
||||
chars = append(chars, newPostfixCharNode(allCases(rune(octVal), caseInsensitive)...))
|
||||
} else {
|
||||
escapedNode, err := newEscapedNode(re_postfix[i], true)
|
||||
if err != nil {
|
||||
@@ -576,9 +576,25 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
case "digit": // Equivalent to '\d'
|
||||
nodeToAdd = newPostfixCharNode(genRangeInclusive('0', '9')...)
|
||||
case "upper": // [A-Z]
|
||||
nodeToAdd = newPostfixCharNode(genRangeInclusive('A', 'Z')...)
|
||||
charsToAdd := genRangeInclusive('A', 'Z')
|
||||
if caseInsensitive {
|
||||
// Convert each rune to a slice of runes using allCases, then flatten the resulting
|
||||
// 2-D slice into a 1-D slice. Assign the result to charsToAdd.
|
||||
charsToAdd = slices.Concat(Map(charsToAdd, func(r rune) []rune {
|
||||
return allCases(r, caseInsensitive)
|
||||
})...)
|
||||
}
|
||||
nodeToAdd = newPostfixCharNode(charsToAdd...)
|
||||
case "lower": // [a-z]
|
||||
nodeToAdd = newPostfixCharNode(genRangeInclusive('a', 'z')...)
|
||||
charsToAdd := genRangeInclusive('a', 'z')
|
||||
if caseInsensitive {
|
||||
// Convert each rune to a slice of runes using allCases, then flatten the resulting
|
||||
// 2-D slice into a 1-D slice. Assign the result to charsToAdd.
|
||||
charsToAdd = slices.Concat(Map(charsToAdd, func(r rune) []rune {
|
||||
return allCases(r, caseInsensitive)
|
||||
})...)
|
||||
}
|
||||
nodeToAdd = newPostfixCharNode(charsToAdd...)
|
||||
case "alpha": //[A-Za-z]
|
||||
nodeToAdd = newPostfixCharNode(slices.Concat(genRangeInclusive('A', 'Z'), genRangeInclusive('a', 'z'))...)
|
||||
case "xdigit": // [0-9A-Fa-f]
|
||||
@@ -626,7 +642,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
return nil, fmt.Errorf("error parsing high-range unicode value in character class")
|
||||
}
|
||||
}
|
||||
chars = append(chars, newPostfixCharNode(re_postfix[i]))
|
||||
chars = append(chars, newPostfixCharNode(allCases(re_postfix[i], caseInsensitive)...))
|
||||
i++
|
||||
}
|
||||
firstCharAdded = true
|
||||
@@ -654,13 +670,16 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
// We have established that they both have a length of 1
|
||||
startRangeRune := startRangePostfixNode.contents[0]
|
||||
endRangeRune := endRangePostfixNode.contents[0]
|
||||
if startRangeRune > endRangeRune {
|
||||
return nil, fmt.Errorf("character range syntax is [a-b], not [b-a]")
|
||||
}
|
||||
chars = append(chars, newPostfixCharNode(genRange(startRangeRune, endRangeRune+1)...))
|
||||
}
|
||||
|
||||
endOfRange = false // Reset the flag
|
||||
}
|
||||
}
|
||||
if i == len(re_postfix) { // We have reached the end of the string, so we didn't encounter a closing brakcet. Panic.
|
||||
if i == len(re_postfix) { // We have reached the end of the string, so we didn't encounter a closing bracket. Throw error.
|
||||
return nil, fmt.Errorf("opening bracket without closing bracket")
|
||||
}
|
||||
|
||||
@@ -690,7 +709,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
|
||||
startRangeNum, err := strconv.Atoi(string(startRange))
|
||||
if err != nil {
|
||||
panic(err)
|
||||
return nil, fmt.Errorf("invalid numeric range")
|
||||
}
|
||||
|
||||
if re_postfix[i] == '}' { // Case 1 above
|
||||
@@ -708,7 +727,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
return nil, fmt.Errorf("brace not closed")
|
||||
}
|
||||
if re_postfix[i] != '}' {
|
||||
return nil, fmt.Errorf("invalid numeric specifier")
|
||||
return nil, fmt.Errorf("invalid start range for numeric specifier")
|
||||
}
|
||||
if len(endRange) == 0 { // Case 3 above
|
||||
endRangeNum = INFINITE_REPS
|
||||
@@ -716,7 +735,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
var err error
|
||||
endRangeNum, err = strconv.Atoi(string(endRange))
|
||||
if err != nil {
|
||||
panic(err)
|
||||
return nil, fmt.Errorf("invalid end range for numeric specifier")
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -737,7 +756,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
numOpenParens++
|
||||
}
|
||||
if c == ')' {
|
||||
// Keep popping from opStack until we encounter an opening parantheses or a NONCAPLPAREN_CHAR. Panic if we reach the end of the stack.
|
||||
// Keep popping from opStack until we encounter an opening parenthesis or a NONCAPLPAREN_CHAR. Throw error if we reach the end of the stack.
|
||||
var val rune
|
||||
var err error
|
||||
for val, err = peek(opStack); val != '(' && val != NONCAPLPAREN_CHAR; val, err = peek(opStack) {
|
||||
@@ -951,7 +970,10 @@ func thompson(re []postfixNode) (Reg, error) {
|
||||
nfa = append(nfa, s1)
|
||||
}
|
||||
case KLEENE: // Create a 0-state, concat the popped state after it, concat the 0-state after the popped state
|
||||
s1 := mustPop(&nfa)
|
||||
s1, err := pop(&nfa)
|
||||
if err != nil {
|
||||
return Reg{}, fmt.Errorf("error applying kleene star")
|
||||
}
|
||||
stateToAdd := kleene(*s1)
|
||||
nfa = append(nfa, stateToAdd)
|
||||
case PLUS: // a+ is equivalent to aa*
|
||||
|
35
misc.go
35
misc.go
@@ -2,6 +2,7 @@ package main
|
||||
|
||||
import (
|
||||
"slices"
|
||||
"strings"
|
||||
"unicode"
|
||||
)
|
||||
|
||||
@@ -48,12 +49,6 @@ func isNormalChar(c rune) bool {
|
||||
return !slices.Contains(specialChars, c)
|
||||
}
|
||||
|
||||
func assert(cond bool) {
|
||||
if cond != true {
|
||||
panic("Assertion Failed")
|
||||
}
|
||||
}
|
||||
|
||||
// Ensure that the given elements are only appended to the given slice if they
|
||||
// don't already exist. Returns the new slice, and the number of unique items appended.
|
||||
func unique_append[T comparable](slc []T, items ...T) ([]T, int) {
|
||||
@@ -130,13 +125,21 @@ func genRangeInclusive[T character](start, end T) []T {
|
||||
return toRet
|
||||
}
|
||||
|
||||
// Returns a rune-slice containing all possible cases of the given rune.
|
||||
// Returns a rune-slice containing all possible cases of the given rune, given the
|
||||
// 'caseInsensitive' boolean variable.
|
||||
// If this variable is false, the rune is returned as-is, without modifications.
|
||||
// If it is true, then we return all possible cases of the
|
||||
// rune.
|
||||
// At the moment, this includes:
|
||||
// 1. Upper case
|
||||
// 2. Lower case
|
||||
// 3. Title case
|
||||
func allCases(r rune) []rune {
|
||||
return []rune{unicode.ToLower(r), unicode.ToUpper(r), unicode.ToTitle(r)}
|
||||
// allCases returns the set of runes that should be treated as equivalent to r.
//
// When caseInsensitive is false the result is a one-element slice holding r
// itself. When it is true the result holds, in this order, the lower-case,
// upper-case and title-case mappings of r as reported by the unicode package.
func allCases(r rune, caseInsensitive bool) []rune {
	// Case-sensitive matching: the rune only ever stands for itself.
	if !caseInsensitive {
		return []rune{r}
	}

	lower := unicode.ToLower(r)
	upper := unicode.ToUpper(r)
	title := unicode.ToTitle(r)
	return []rune{lower, upper, title}
}
|
||||
|
||||
func isHex(c rune) bool {
|
||||
@@ -156,3 +159,17 @@ func replaceByValue[T comparable](slc []T, toReplace T, replaceWith T) []T {
|
||||
}
|
||||
return slc
|
||||
}
|
||||
|
||||
// swapCase returns a copy of str in which every lower-case rune is replaced by
// its upper-case mapping and every upper-case rune by its lower-case mapping.
// Runes that are neither lower- nor upper-case are copied through unchanged.
func swapCase(str string) string {
	// flip inverts the case of a single rune; it is applied to each rune of
	// str by strings.Map below.
	flip := func(ch rune) rune {
		if unicode.IsLower(ch) {
			return unicode.ToUpper(ch)
		}
		if unicode.IsUpper(ch) {
			return unicode.ToLower(ch)
		}
		return ch
	}
	return strings.Map(flip, str)
}
|
||||
|
@@ -115,7 +115,7 @@ func range2regex(start int, end int) string {
|
||||
if startSlc[i] == endSlc[i] {
|
||||
regex += string(rune(startSlc[i] + 48)) // '0' is ascii value 48, 1 is 49 etc. To convert the digit to its character form, we can just add 48.
|
||||
} else {
|
||||
regex += fmt.Sprintf("[%c-%c]", rune(startSlc[i]+48), rune(endSlc[i]+48))
|
||||
regex += fmt.Sprintf("%c%c-%c%c", LBRACKET, rune(startSlc[i]+48), rune(endSlc[i]+48), RBRACKET)
|
||||
}
|
||||
}
|
||||
regex += ")"
|
||||
|
26
re_test.go
26
re_test.go
@@ -212,12 +212,19 @@ var reTests = []struct {
|
||||
{`ab*bc`, nil, `abc`, []Group{{0, 3}}},
|
||||
{`ab*bc`, nil, `abbc`, []Group{{0, 4}}},
|
||||
{`ab*bc`, nil, `abbbbc`, []Group{{0, 6}}},
|
||||
{`ab{0,}c`, nil, `abbbbc`, []Group{{0, 6}}},
|
||||
{`ab+bc`, nil, `abbc`, []Group{{0, 4}}},
|
||||
{`ab+bc`, nil, `abc`, []Group{}},
|
||||
{`ab+bc`, nil, `abq`, []Group{}},
|
||||
{`ab{1,}bc`, nil, `abq`, []Group{}},
|
||||
{`ab+bc`, nil, `abbbbc`, []Group{{0, 6}}},
|
||||
{`ab{1,}bc`, nil, `abbbbc`, []Group{{0, 6}}},
|
||||
{`ab{1,3}bc`, nil, `abbbbc`, []Group{{0, 6}}},
|
||||
{`ab{3,4}bc`, nil, `abbbbc`, []Group{{0, 6}}},
|
||||
{`ab{4,5}bc`, nil, `abbbbc`, []Group{}},
|
||||
{`ab?bc`, nil, `abbc`, []Group{{0, 4}}},
|
||||
{`ab?bc`, nil, `abc`, []Group{{0, 3}}},
|
||||
{`ab{0,1}bc`, nil, `abc`, []Group{{0, 3}}},
|
||||
{`ab?bc`, nil, `abbbbc`, []Group{}},
|
||||
{`ab?c`, nil, `abc`, []Group{{0, 3}}},
|
||||
{`^abc$`, nil, `abc`, []Group{{0, 3}}},
|
||||
@@ -281,6 +288,7 @@ var reTests = []struct {
|
||||
{`a\(*b`, nil, `a((b`, []Group{{0, 4}}},
|
||||
{`a\\b`, nil, `a\b`, []Group{{0, 3}}},
|
||||
{`a+b+c`, nil, `aabbabc`, []Group{{4, 7}}},
|
||||
{`a{1,}b{1,}c`, nil, `aabbabc`, []Group{{4, 7}}},
|
||||
{`)(`, nil, `-`, nil},
|
||||
{`[^ab]*`, nil, `cde`, []Group{{0, 3}, {3, 3}}},
|
||||
{`abc`, nil, ``, []Group{}},
|
||||
@@ -298,11 +306,22 @@ var reTests = []struct {
|
||||
{`[k]`, nil, `ab`, []Group{}},
|
||||
{`a[-]?c`, nil, `ac`, []Group{{0, 2}}},
|
||||
{`^(.+)?B`, nil, `AB`, []Group{{0, 2}}},
|
||||
{`\0009`, nil, "\x009", []Group{{0, 2}}},
|
||||
{`\141`, nil, "a", []Group{{0, 1}}},
|
||||
|
||||
// At this point, the python test suite has a bunch
|
||||
// of backreference tests. Since my engine doesn't
|
||||
// implement backreferences, I've skipped those tests.
|
||||
|
||||
{`*a`, nil, ``, nil},
|
||||
{`(*)b`, nil, ``, nil},
|
||||
{`a**`, nil, ``, nil},
|
||||
|
||||
{`^`, nil, `abc`, []Group{{0, 0}}},
|
||||
{`$`, nil, `abc`, []Group{{3, 3}}},
|
||||
{`a[b-]`, nil, `a-`, []Group{{0, 2}}},
|
||||
{`a[b-a]`, nil, `a-`, nil},
|
||||
|
||||
// Todo - add numeric range tests
|
||||
}
|
||||
|
||||
@@ -340,8 +359,11 @@ var groupTests = []struct {
|
||||
{`((a))`, nil, `abc`, []Match{[]Group{{0, 1}, {0, 1}, {0, 1}}}},
|
||||
{`(a)b(c)`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 1}, {2, 3}}}},
|
||||
{`(a+|b)*`, nil, `ab`, []Match{[]Group{{0, 2}, {1, 2}}, []Group{{2, 2}}}},
|
||||
{`(a+|b){0,}`, nil, `ab`, []Match{[]Group{{0, 2}, {1, 2}}, []Group{{2, 2}}}},
|
||||
{`(a+|b)+`, nil, `ab`, []Match{[]Group{{0, 2}, {1, 2}}}},
|
||||
{`(a+|b){1,}`, nil, `ab`, []Match{[]Group{{0, 2}, {1, 2}}}},
|
||||
{`(a+|b)?`, nil, `ab`, []Match{[]Group{{0, 1}, {0, 1}}, []Group{{1, 2}, {1, 2}}, []Group{{2, 2}}}},
|
||||
{`(a+|b){0,1}`, nil, `ab`, []Match{[]Group{{0, 1}, {0, 1}}, []Group{{1, 2}, {1, 2}}, []Group{{2, 2}}}},
|
||||
{`(a|b|c|d|e)f`, nil, `ef`, []Match{[]Group{{0, 2}, {0, 1}}}},
|
||||
{`(ab|cd)e`, nil, `abcde`, []Match{[]Group{{2, 5}, {2, 4}}}},
|
||||
{`^(ab|cd)e`, nil, `abcde`, []Match{}},
|
||||
@@ -361,6 +383,7 @@ var groupTests = []struct {
|
||||
{`(bc+d$|ef*g.|h?i(j|k))`, nil, `bcdd`, []Match{}},
|
||||
{`(bc+d$|ef*g.|h?i(j|k))`, nil, `reffgz`, []Match{[]Group{{1, 6}, {1, 6}}}},
|
||||
{`(((((((((a)))))))))`, nil, `a`, []Match{[]Group{{0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}},
|
||||
{`(((((((((a)))))))))\41`, nil, `a`, []Match{[]Group{{0, 2}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}},
|
||||
{`(.*)c(.*)`, nil, `abcde`, []Match{[]Group{{0, 5}, {0, 2}, {3, 5}}}},
|
||||
{`\((.*), (.*)\)`, nil, `(a, b)`, []Match{[]Group{{0, 6}, {1, 2}, {4, 5}}}},
|
||||
|
||||
@@ -379,6 +402,9 @@ var groupTests = []struct {
|
||||
{`([abc]*)x`, nil, `abc`, []Match{}},
|
||||
{`([xyz]*)x`, nil, `abcx`, []Match{[]Group{{3, 4}, {3, 3}}}},
|
||||
{`(a)+b|aac`, nil, `aac`, []Match{[]Group{{0, 3}}}},
|
||||
{`([abc])*d`, nil, `abbbcd`, []Match{[]Group{{0, 6}, {4, 5}}}},
|
||||
{`([abc])*bcd`, nil, `abcd`, []Match{[]Group{{0, 4}, {0, 1}}}},
|
||||
{`^(ab|cd)e`, nil, `abcde`, []Match{}},
|
||||
}
|
||||
|
||||
func TestFindAllMatches(t *testing.T) {
|
||||
|
Reference in New Issue
Block a user