9 Commits

4 changed files with 97 additions and 32 deletions

View File

@@ -324,11 +324,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
*/
c := re_postfix[i]
if isNormalChar(c) || isSpecialCharWithMetacharReplacement(c) {
if caseInsensitive {
outQueue = append(outQueue, newPostfixNode(allCases(c)...))
} else {
outQueue = append(outQueue, newPostfixNode(c))
}
outQueue = append(outQueue, newPostfixNode(allCases(c, caseInsensitive)...))
continue
}
// Since every unescaped bracket is replaced by a LBRACKET / RBRACKET, there may
@@ -344,7 +340,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
}
if c == '\\' { // Escape character - invert special and non-special characters eg. \( is treated as a literal parentheses, \b is treated as word boundary
if i == len(re_postfix)-1 { // End of string - panic, because backslash is an escape character (something needs to come after it)
if i == len(re_postfix)-1 { // End of string - throw error, because backslash is an escape character (something needs to come after it)
return nil, fmt.Errorf("backslash with no escape character")
}
i++
@@ -356,7 +352,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
if n < 1 || err != nil {
return nil, fmt.Errorf("error parsing expanded hex code in expression")
}
outQueue = append(outQueue, newPostfixCharNode(rune(hexVal)))
outQueue = append(outQueue, newPostfixCharNode(allCases(rune(hexVal), caseInsensitive)...))
i += 7
} else if i < len(re_postfix)-1 { // Two-digit hex code
hexVal, err := strconv.ParseInt(string([]rune{re_postfix[i], re_postfix[i+1]}), 16, 64) // Convert the two hex values into a rune slice, then to a string. Parse the string into an int with strconv.ParseInt()
@@ -364,7 +360,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
return nil, fmt.Errorf("error parsing hex characters in expression")
}
i++ // Loop increment will take care of going forward
outQueue = append(outQueue, newPostfixCharNode(rune(hexVal)))
outQueue = append(outQueue, newPostfixCharNode(allCases(rune(hexVal), caseInsensitive)...))
} else {
return nil, fmt.Errorf("not enough hex characters found in expression")
}
@@ -384,7 +380,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
return nil, fmt.Errorf("invalid octal value in expression")
}
i += numDigitsParsed - 1 // Shift forward by the number of digits that were parsed. Move back one character, because the loop increment will move us back to the next character automatically
outQueue = append(outQueue, newPostfixCharNode(rune(octVal)))
outQueue = append(outQueue, newPostfixCharNode(allCases(rune(octVal), caseInsensitive)...))
} else {
escapedNode, err := newEscapedNode(re_postfix[i], false)
if err != nil {
@@ -426,7 +422,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
regex += string(re_postfix[i])
i++
}
if len(regex) <= 1 { // Nothing in regex - panic
if len(regex) <= 1 { // Nothing in regex - throw error
return nil, fmt.Errorf("invalid lookaround. (too short?)")
}
// 'regex' should now contain the lookaround regex, plus the characters at the start (which indicate pos/neg, ahead/behind)
@@ -469,6 +465,10 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
outQueue = append(outQueue, newPostfixNode(to_append))
topStack, _ = peek(opStack)
}
outQueueFinalElement, _ := peek(outQueue)
if (c == '*' && outQueueFinalElement.nodetype == KLEENE) || (c == '+' && outQueueFinalElement.nodetype == PLUS) { // You cannot apply a quantifier to a quantifier in this way
return nil, fmt.Errorf("illegal use of token '%c'", c)
}
opStack = append(opStack, c)
}
}
@@ -520,7 +520,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
if n < 1 || err != nil {
return nil, fmt.Errorf("error parsing expanded hex code in character class")
}
chars = append(chars, newPostfixCharNode(rune(hexVal)))
chars = append(chars, newPostfixCharNode(allCases(rune(hexVal), caseInsensitive)...))
i += 8
} else if i < len(re_postfix)-2 { // Two-digit hex code
hexVal, err := strconv.ParseInt(string([]rune{re_postfix[i], re_postfix[i+1]}), 16, 64) // Convert the two hex values into a rune slice, then to a string. Parse the string into an int with strconv.ParseInt()
@@ -528,7 +528,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
return nil, fmt.Errorf("error parsing hex characters in character class")
}
i += 2
chars = append(chars, newPostfixCharNode(rune(hexVal)))
chars = append(chars, newPostfixCharNode(allCases(rune(hexVal), caseInsensitive)...))
} else {
return nil, fmt.Errorf("not enough hex characters found in character class")
}
@@ -548,7 +548,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
return nil, fmt.Errorf("invalid octal value in character class")
}
i += numDigitsParsed // Shift forward by the number of characters parsed
chars = append(chars, newPostfixCharNode(rune(octVal)))
chars = append(chars, newPostfixCharNode(allCases(rune(octVal), caseInsensitive)...))
} else {
escapedNode, err := newEscapedNode(re_postfix[i], true)
if err != nil {
@@ -576,9 +576,25 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
case "digit": // Equivalent to '\d'
nodeToAdd = newPostfixCharNode(genRangeInclusive('0', '9')...)
case "upper": // [A-Z]
nodeToAdd = newPostfixCharNode(genRangeInclusive('A', 'Z')...)
charsToAdd := genRangeInclusive('A', 'Z')
if caseInsensitive {
// Convert each rune to a slice of runes using allCases, then flatten the resulting
// 2-D slice into a 1-D slice. Assign the result to charsToAdd.
charsToAdd = slices.Concat(Map(charsToAdd, func(r rune) []rune {
return allCases(r, caseInsensitive)
})...)
}
nodeToAdd = newPostfixCharNode(charsToAdd...)
case "lower": // [a-z]
nodeToAdd = newPostfixCharNode(genRangeInclusive('a', 'z')...)
charsToAdd := genRangeInclusive('a', 'z')
if caseInsensitive {
// Convert each rune to a slice of runes using allCases, then flatten the resulting
// 2-D slice into a 1-D slice. Assign the result to charsToAdd.
charsToAdd = slices.Concat(Map(charsToAdd, func(r rune) []rune {
return allCases(r, caseInsensitive)
})...)
}
nodeToAdd = newPostfixCharNode(charsToAdd...)
case "alpha": //[A-Za-z]
nodeToAdd = newPostfixCharNode(slices.Concat(genRangeInclusive('A', 'Z'), genRangeInclusive('a', 'z'))...)
case "xdigit": // [0-9A-Fa-f]
@@ -626,7 +642,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
return nil, fmt.Errorf("error parsing high-range unicode value in character class")
}
}
chars = append(chars, newPostfixCharNode(re_postfix[i]))
chars = append(chars, newPostfixCharNode(allCases(re_postfix[i], caseInsensitive)...))
i++
}
firstCharAdded = true
@@ -654,13 +670,16 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
// We have established that they both have a length of 1
startRangeRune := startRangePostfixNode.contents[0]
endRangeRune := endRangePostfixNode.contents[0]
if startRangeRune > endRangeRune {
return nil, fmt.Errorf("character range syntax is [a-b], not [b-a]")
}
chars = append(chars, newPostfixCharNode(genRange(startRangeRune, endRangeRune+1)...))
}
endOfRange = false // Reset the flag
}
}
if i == len(re_postfix) { // We have reached the end of the string, so we didn't encounter a closing brakcet. Panic.
if i == len(re_postfix) { // We have reached the end of the string, so we didn't encounter a closing bracket. Throw error.
return nil, fmt.Errorf("opening bracket without closing bracket")
}
@@ -690,7 +709,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
startRangeNum, err := strconv.Atoi(string(startRange))
if err != nil {
panic(err)
return nil, fmt.Errorf("invalid numeric range")
}
if re_postfix[i] == '}' { // Case 1 above
@@ -708,7 +727,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
return nil, fmt.Errorf("brace not closed")
}
if re_postfix[i] != '}' {
return nil, fmt.Errorf("invalid numeric specifier")
return nil, fmt.Errorf("invalid start range for numeric specifier")
}
if len(endRange) == 0 { // Case 3 above
endRangeNum = INFINITE_REPS
@@ -716,7 +735,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
var err error
endRangeNum, err = strconv.Atoi(string(endRange))
if err != nil {
panic(err)
return nil, fmt.Errorf("invalid end range for numeric specifier")
}
}
}
@@ -737,7 +756,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
numOpenParens++
}
if c == ')' {
// Keep popping from opStack until we encounter an opening parantheses or a NONCAPLPAREN_CHAR. Panic if we reach the end of the stack.
// Keep popping from opStack until we encounter an opening parenthesis or a NONCAPLPAREN_CHAR. Throw error if we reach the end of the stack.
var val rune
var err error
for val, err = peek(opStack); val != '(' && val != NONCAPLPAREN_CHAR; val, err = peek(opStack) {
@@ -951,7 +970,10 @@ func thompson(re []postfixNode) (Reg, error) {
nfa = append(nfa, s1)
}
case KLEENE: // Create a 0-state, concat the popped state after it, concat the 0-state after the popped state
s1 := mustPop(&nfa)
s1, err := pop(&nfa)
if err != nil {
return Reg{}, fmt.Errorf("error applying kleene star")
}
stateToAdd := kleene(*s1)
nfa = append(nfa, stateToAdd)
case PLUS: // a+ is equivalent to aa*

35
misc.go
View File

@@ -2,6 +2,7 @@ package main
import (
"slices"
"strings"
"unicode"
)
@@ -48,12 +49,6 @@ func isNormalChar(c rune) bool {
return !slices.Contains(specialChars, c)
}
// assert panics with the message "Assertion Failed" when cond is false, and
// does nothing when cond is true.
func assert(cond bool) {
	if cond {
		return
	}
	panic("Assertion Failed")
}
// Ensure that the given elements are only appended to the given slice if they
// don't already exist. Returns the new slice, and the number of unique items appended.
func unique_append[T comparable](slc []T, items ...T) ([]T, int) {
@@ -130,13 +125,21 @@ func genRangeInclusive[T character](start, end T) []T {
return toRet
}
// allCases returns a rune-slice containing the runes that the given rune should
// be expanded to, depending on the 'caseInsensitive' boolean variable.
// If this variable is false, the rune is returned as-is, without modifications,
// in a single-element slice.
// If it is true, a three-element slice is returned containing, in this order:
//  1. Lower case
//  2. Upper case
//  3. Title case
//
// No de-duplication is performed, so a rune that has no distinct case variants
// (a digit, for example) yields three copies of itself.
func allCases(r rune, caseInsensitive bool) []rune {
	if !caseInsensitive {
		return []rune{r}
	}
	return []rune{unicode.ToLower(r), unicode.ToUpper(r), unicode.ToTitle(r)}
}
func isHex(c rune) bool {
@@ -156,3 +159,17 @@ func replaceByValue[T comparable](slc []T, toReplace T, replaceWith T) []T {
}
return slc
}
// swapCase returns a copy of str in which every lower-case rune has been
// replaced by its upper-case form and every upper-case rune by its lower-case
// form. Runes that are neither lower nor upper case are copied unchanged.
func swapCase(str string) string {
	flip := func(ch rune) rune {
		if unicode.IsLower(ch) {
			return unicode.ToUpper(ch)
		}
		if unicode.IsUpper(ch) {
			return unicode.ToLower(ch)
		}
		return ch
	}
	return strings.Map(flip, str)
}

View File

@@ -115,7 +115,7 @@ func range2regex(start int, end int) string {
if startSlc[i] == endSlc[i] {
regex += string(rune(startSlc[i] + 48)) // '0' is ascii value 48, 1 is 49 etc. To convert the digit to its character form, we can just add 48.
} else {
regex += fmt.Sprintf("[%c-%c]", rune(startSlc[i]+48), rune(endSlc[i]+48))
regex += fmt.Sprintf("%c%c-%c%c", LBRACKET, rune(startSlc[i]+48), rune(endSlc[i]+48), RBRACKET)
}
}
regex += ")"

View File

@@ -212,12 +212,19 @@ var reTests = []struct {
{`ab*bc`, nil, `abc`, []Group{{0, 3}}},
{`ab*bc`, nil, `abbc`, []Group{{0, 4}}},
{`ab*bc`, nil, `abbbbc`, []Group{{0, 6}}},
{`ab{0,}c`, nil, `abbbbc`, []Group{{0, 6}}},
{`ab+bc`, nil, `abbc`, []Group{{0, 4}}},
{`ab+bc`, nil, `abc`, []Group{}},
{`ab+bc`, nil, `abq`, []Group{}},
{`ab{1,}bc`, nil, `abq`, []Group{}},
{`ab+bc`, nil, `abbbbc`, []Group{{0, 6}}},
{`ab{1,}bc`, nil, `abbbbc`, []Group{{0, 6}}},
{`ab{1,3}bc`, nil, `abbbbc`, []Group{{0, 6}}},
{`ab{3,4}bc`, nil, `abbbbc`, []Group{{0, 6}}},
{`ab{4,5}bc`, nil, `abbbbc`, []Group{}},
{`ab?bc`, nil, `abbc`, []Group{{0, 4}}},
{`ab?bc`, nil, `abc`, []Group{{0, 3}}},
{`ab{0,1}bc`, nil, `abc`, []Group{{0, 3}}},
{`ab?bc`, nil, `abbbbc`, []Group{}},
{`ab?c`, nil, `abc`, []Group{{0, 3}}},
{`^abc$`, nil, `abc`, []Group{{0, 3}}},
@@ -281,6 +288,7 @@ var reTests = []struct {
{`a\(*b`, nil, `a((b`, []Group{{0, 4}}},
{`a\\b`, nil, `a\b`, []Group{{0, 3}}},
{`a+b+c`, nil, `aabbabc`, []Group{{4, 7}}},
{`a{1,}b{1,}c`, nil, `aabbabc`, []Group{{4, 7}}},
{`)(`, nil, `-`, nil},
{`[^ab]*`, nil, `cde`, []Group{{0, 3}, {3, 3}}},
{`abc`, nil, ``, []Group{}},
@@ -298,11 +306,22 @@ var reTests = []struct {
{`[k]`, nil, `ab`, []Group{}},
{`a[-]?c`, nil, `ac`, []Group{{0, 2}}},
{`^(.+)?B`, nil, `AB`, []Group{{0, 2}}},
{`\0009`, nil, "\x009", []Group{{0, 2}}},
{`\141`, nil, "a", []Group{{0, 1}}},
// At this point, the python test suite has a bunch
// of backreference tests. Since my engine doesn't
// implement backreferences, I've skipped those tests.
{`*a`, nil, ``, nil},
{`(*)b`, nil, ``, nil},
{`a**`, nil, ``, nil},
{`^`, nil, `abc`, []Group{{0, 0}}},
{`$`, nil, `abc`, []Group{{3, 3}}},
{`a[b-]`, nil, `a-`, []Group{{0, 2}}},
{`a[b-a]`, nil, `a-`, nil},
// Todo - add numeric range tests
}
@@ -340,8 +359,11 @@ var groupTests = []struct {
{`((a))`, nil, `abc`, []Match{[]Group{{0, 1}, {0, 1}, {0, 1}}}},
{`(a)b(c)`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 1}, {2, 3}}}},
{`(a+|b)*`, nil, `ab`, []Match{[]Group{{0, 2}, {1, 2}}, []Group{{2, 2}}}},
{`(a+|b){0,}`, nil, `ab`, []Match{[]Group{{0, 2}, {1, 2}}, []Group{{2, 2}}}},
{`(a+|b)+`, nil, `ab`, []Match{[]Group{{0, 2}, {1, 2}}}},
{`(a+|b){1,}`, nil, `ab`, []Match{[]Group{{0, 2}, {1, 2}}}},
{`(a+|b)?`, nil, `ab`, []Match{[]Group{{0, 1}, {0, 1}}, []Group{{1, 2}, {1, 2}}, []Group{{2, 2}}}},
{`(a+|b){0,1}`, nil, `ab`, []Match{[]Group{{0, 1}, {0, 1}}, []Group{{1, 2}, {1, 2}}, []Group{{2, 2}}}},
{`(a|b|c|d|e)f`, nil, `ef`, []Match{[]Group{{0, 2}, {0, 1}}}},
{`(ab|cd)e`, nil, `abcde`, []Match{[]Group{{2, 5}, {2, 4}}}},
{`^(ab|cd)e`, nil, `abcde`, []Match{}},
@@ -361,6 +383,7 @@ var groupTests = []struct {
{`(bc+d$|ef*g.|h?i(j|k))`, nil, `bcdd`, []Match{}},
{`(bc+d$|ef*g.|h?i(j|k))`, nil, `reffgz`, []Match{[]Group{{1, 6}, {1, 6}}}},
{`(((((((((a)))))))))`, nil, `a`, []Match{[]Group{{0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}},
{`(((((((((a)))))))))\41`, nil, `a`, []Match{[]Group{{0, 2}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}},
{`(.*)c(.*)`, nil, `abcde`, []Match{[]Group{{0, 5}, {0, 2}, {3, 5}}}},
{`\((.*), (.*)\)`, nil, `(a, b)`, []Match{[]Group{{0, 6}, {1, 2}, {4, 5}}}},
@@ -379,6 +402,9 @@ var groupTests = []struct {
{`([abc]*)x`, nil, `abc`, []Match{}},
{`([xyz]*)x`, nil, `abcx`, []Match{[]Group{{3, 4}, {3, 3}}}},
{`(a)+b|aac`, nil, `aac`, []Match{[]Group{{0, 3}}}},
{`([abc])*d`, nil, `abbbcd`, []Match{[]Group{{0, 6}, {4, 5}}}},
{`([abc])*bcd`, nil, `abcd`, []Match{[]Group{{0, 4}, {0, 1}}}},
{`^(ab|cd)e`, nil, `abcde`, []Match{}},
}
func TestFindAllMatches(t *testing.T) {