11 Commits

Author SHA1 Message Date
Aadhavan Srinivasan
ccb82f781b Enforce the rule that character classes must have at least one character; interpret literal closing brackets as regular characters 2025-01-24 15:50:36 -05:00
Aadhavan Srinivasan
09bbf8d3f1 Refactored isNormalChar(), wrote function to get special characters that have metachar replacements 2025-01-24 15:49:33 -05:00
Aadhavan Srinivasan
d5b4450e50 Added more test cases (1 failing) 2025-01-24 14:58:18 -05:00
Aadhavan Srinivasan
45827b5dd3 Allow hyphen to be escaped inside character class 2025-01-24 14:58:07 -05:00
Aadhavan Srinivasan
c26edcb0c4 Fixed edge cases with character ranges and character classes 2025-01-24 14:57:47 -05:00
Aadhavan Srinivasan
110298b6a6 Added 'flags' field to test struct for all-group tests 2025-01-24 11:11:48 -05:00
Aadhavan Srinivasan
eff4c5a5df Added 'flags' field to test struct for 0-group tests 2025-01-24 11:10:01 -05:00
0bd7a87797 Removed old comment 2025-01-22 20:27:35 -05:00
9cf1c66653 Implemented character range detection later in the code, using a metacharacter 2025-01-22 20:26:58 -05:00
9edc99d73c Modified genRange() so that it can work on ints and runes 2025-01-22 20:25:49 -05:00
Aadhavan Srinivasan
6850396bf9 Removed character range creation from the first part of shuntingYard() (the part that adds concatenation operators), because octal and hex values haven't yet been deciphered at this point in the code 2025-01-22 16:51:00 -05:00
4 changed files with 322 additions and 201 deletions

View File

@@ -153,23 +153,15 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
}
for re_runes[i] != ']' || i == 0 || re_runes[i-1] == '\\' {
i++ // Skip all characters inside _unescaped_ brackets (we are _not_ at a closing bracket, or if we are, the previous character is a backslash)
// TODO: Check for escaped characters
// Check ahead for character range
if i < len(re_runes)-2 && re_runes[i+1] == '-' {
rangeStart := re_runes[i]
rangeEnd := re_runes[i+2]
if int(rangeEnd) < int(rangeStart) {
return nil, fmt.Errorf("Range is out of order.")
}
for i := rangeStart; i <= rangeEnd; i++ {
toAppend = append(toAppend, i)
}
i += 2 // Skip start and hyphen (end will automatically be skipped on next iteration of loop)
continue
// Make sure we haven't exceeded the length of the string. If we did, then the regex doesn't actually have a closing bracket and we should throw an error.
if i >= len(re_runes) {
return nil, fmt.Errorf("Opening bracket without closing bracket.")
}
if re_runes[i] == '-' && (i > 0 && re_runes[i-1] != '\\') && (i < len(re_runes)-1 && re_runes[i+1] != ']') { // Unescaped hyphen, that has some character (not a RBRACKET) after it - This represents a character range, so we replace with CHAR_RANGE. This metacharacter will be used later on to construct the range
re_runes[i] = CHAR_RANGE
}
toAppend = append(toAppend, re_runes[i])
}
// Replace the last character (which should have been ']') with RBRACKET
@@ -280,7 +272,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
6. If current character is '{', find the appropriate numeric specifier (range start, range end). Apply the range to the postfixNode at the end of outQueue.
*/
c := re_postfix[i]
if isNormalChar(c) {
if isNormalChar(c) || isSpecialCharWithMetacharReplacement(c) {
if caseInsensitive {
outQueue = append(outQueue, newPostfixNode(allCases(c)...))
} else {
@@ -288,7 +280,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
}
continue
}
// Escape character
if c == '\\' { // Escape character - invert special and non-special characters eg. \( is treated as a literal parentheses, \b is treated as word boundary
if i == len(re_postfix)-1 { // End of string - panic, because backslash is an escape character (something needs to come after it)
return nil, fmt.Errorf("ERROR: Backslash with no escape character.")
@@ -420,7 +412,9 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
}
}
if c == LBRACKET { // Used for character classes
i++ // Step forward so we can look at the character class
firstCharAdded := false // A character class must have at least 1 character. This flag checks if the first character has been added.
endOfRange := false // Set to 'true' when we encounter a CHAR_RANGE metacharacter
i++ // Step forward so we can look at the character class
var invertMatch bool
if re_postfix[i] == '^' {
invertMatch = true
@@ -428,9 +422,14 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
}
chars := make([]postfixNode, 0) // List of nodes - used only for character classes
for i < len(re_postfix) {
if re_postfix[i] == RBRACKET {
if firstCharAdded && re_postfix[i] == RBRACKET {
break
}
if re_postfix[i] == CHAR_RANGE {
endOfRange = true
i++
continue
}
if re_postfix[i] == '\\' { // Backslash indicates a character to be escaped
if i == len(re_postfix)-1 {
return nil, fmt.Errorf("Stray backslash in character class.")
@@ -483,13 +482,54 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
i++
}
} else {
if !firstCharAdded && re_postfix[i] > 0xF0000 { // It's a metacharacter that I defined, I'll have to convert it back to the regular character before adding it back, because I haven't added any characters yet. For example, '[[]', the second LBRACKET should be treated like a literal bracket.
switch re_postfix[i] {
case LBRACKET:
chars = append(chars, newPostfixCharNode('['))
case RBRACKET:
chars = append(chars, newPostfixCharNode(']'))
default:
return nil, fmt.Errorf("Error parsing high-range unicode value in character class.")
}
}
chars = append(chars, newPostfixCharNode(re_postfix[i]))
i++
}
firstCharAdded = true
if endOfRange { // The previous character was an unescaped hyphen, which (in the context of a character class) means the character that was last appended is the end of a character range
// Things to note:
// 1. In PCRE and Go's regex engine, a letter _can_ be surrounded by hyphens in a character class.
// Eg. [a-b-c]
// While you might think this leads to a syntax error (I thought so), the engine picks 'a-b' as a range,
// then treats the second '-' and 'c' as regular characters in the character class.
// So this regex becomes "Match a character from 'a' to 'b', a literal hyphen, or 'c' ".
// 2. To account for this, the following logic is followed:
// a. If the second-to-last postfixNode ie. the start of the range has only one element, then we are in a range.
// i. If it has more than one element, then we are actually looking at a literal hyphen, and we will treat it as such.
// ii. If either the start or end of the range don't exist in 'chars' ie. something like [-a] or [a-], then too will we treat it as a literal hyphen.
// b. The last postfixNode added to 'chars' _must_ only have one character (because it's the end of the range).
endRangePostfixNode, err1 := pop(&chars)
startRangePostfixNode, err2 := pop(&chars)
if (err1 != nil || err2 != nil) || len(startRangePostfixNode.contents) != 1 { // Treat it as a regular hyphen
chars = append(chars, startRangePostfixNode, newPostfixCharNode('-'), endRangePostfixNode)
} else if len(endRangePostfixNode.contents) != 1 { // I don't even know what this would look like, this is just a sanity check
return nil, fmt.Errorf("Error parsing character range.")
} else {
// We have established that they both have a length of 1
startRangeRune := startRangePostfixNode.contents[0]
endRangeRune := endRangePostfixNode.contents[0]
chars = append(chars, newPostfixCharNode(genRange(startRangeRune, endRangeRune+1)...))
}
endOfRange = false // Reset the flag
}
}
if i == len(re_postfix) { // We have reached the end of the string, so we didn't encounter a closing bracket. Report an error.
return nil, fmt.Errorf("Opening bracket without closing bracket.")
}
outQueue = append(outQueue, newCharClassNode(chars, invertMatch))
continue
}
@@ -681,8 +721,7 @@ func thompson(re []postfixNode) (Reg, error) {
// Replace ESC_BACKSLASH with actual backslash, so that we can actually check if we encounter it
replaceByValue([]int(state.content), int(ESC_BACKSLASH), '\\')
// Uncommenting this seems to make one of the test cases fail. Why?
// replaceByValue(state.except, ESC_BACKSLASH, '\\')
replaceByValue(state.except, ESC_BACKSLASH, '\\')
nfa = append(nfa, &state)
}

24
misc.go
View File

@@ -15,6 +15,14 @@ var LPAREN_CHAR rune = 0xF0004 // Parentheses in regex are concatenated with thi
var RPAREN_CHAR rune = 0xF0005
var NONCAPLPAREN_CHAR rune = 0xF0006 // Represents a non-capturing group's LPAREN
var ESC_BACKSLASH rune = 0xF0007 // Represents an escaped backslash
// CHAR_RANGE stands in for an unescaped hyphen inside a character class.
// shuntingYard substitutes it for the '-' while scanning the brackets and
// builds the actual range later, when it parses the character class.
var CHAR_RANGE rune = 0xF0008 // Represents a character range
// specialChars is the list consulted by isSpecialChar: literal operator
// characters plus the internal placeholder runes LBRACKET, RBRACKET and
// NONCAPLPAREN_CHAR.
var specialChars = []rune{'?', '*', '\\', '^', '$', '{', '}', '(', ')', '[', ']', '+', '|', '.', '~', '<', '>', LBRACKET, RBRACKET, NONCAPLPAREN_CHAR}
// character is a type constraint that admits int and rune.
// Note that these are distinct types in Go (rune is an alias for int32, not
// for int), which is why both are listed in the union.
type character interface {
int | rune
}
// Returns true if str[idx] and str[idx-1] are separated by a word boundary.
func isWordBoundary(str []rune, idx int) bool {
@@ -26,9 +34,17 @@ func isWordBoundary(str []rune, idx int) bool {
return wbounded
}
// isSpecialChar reports whether c appears in the package-level specialChars list.
func isSpecialChar(c rune) bool {
	return slices.Index(specialChars, c) != -1
}
// isSpecialCharWithMetacharReplacement reports whether c is a literal '[' or ']'.
// These special characters have metacharacter replacements, so when one of them
// is encountered in its literal form it can be treated as a regular character.
func isSpecialCharWithMetacharReplacement(c rune) bool {
	bracketLiterals := []rune{'[', ']'}
	return slices.Index(bracketLiterals, c) >= 0
}
// isNormalChar reports whether c is an ordinary character, i.e. it is neither
// one of the literal operator characters `?*\^${}()+|[].~<>` nor one of the
// placeholder runes LBRACKET, RBRACKET or NONCAPLPAREN_CHAR.
func isNormalChar(c rune) bool {
	reserved := append([]rune(`?*\^${}()+|[].~<>`), LBRACKET, RBRACKET, NONCAPLPAREN_CHAR)
	return slices.Index(reserved, c) < 0
}
@@ -109,8 +125,8 @@ func Reduce[T any](slc []T, fn func(T, T) T) T {
}
// Generate numbers in a range - start (inclusive) to end (exclusive)
func genRange(start, end int) []int {
toRet := make([]int, end-start)
func genRange[T character](start, end T) []T {
toRet := make([]T, end-start)
for i := start; i < end; i++ {
toRet[i-start] = i
}

View File

@@ -116,6 +116,13 @@ func newEscapedNode(c rune, inCharClass bool) (postfixNode, error) {
case 'v': // Vertical tab
toReturn.nodetype = CHARACTER
toReturn.contents = append(toReturn.contents, rune(11))
case '-': // Literal hyphen - only in character class
if inCharClass {
toReturn.nodetype = CHARACTER
toReturn.contents = append(toReturn.contents, '-')
} else {
return postfixNode{}, fmt.Errorf("Invalid escape character.")
}
default: // None of the above - append it as a regular character
if isNormalChar(c) { // Normal characters cannot be escaped
return postfixNode{}, fmt.Errorf("Invalid escape character.")

View File

@@ -1,221 +1,280 @@
package main
import (
"fmt"
"slices"
"testing"
)
var reTests = []struct {
re string
flags []ReFlag
str string
result []Group // Stores all zero-groups in the match
}{
{"a", "abc", []Group{{0, 1}}},
{"a", "bca", []Group{{2, 3}}},
{"l", "ggllgg", []Group{{2, 3}, {3, 4}}},
{"(b|c)", "abdceb", []Group{{1, 2}, {3, 4}, {5, 6}}},
{"a+", "brerereraaaaabbbbb", []Group{{8, 13}}},
{"ab+", "qweqweqweaqweqweabbbbbr", []Group{{16, 22}}},
{"(b|c|A)", "ooaoobocA", []Group{{5, 6}, {7, 8}, {8, 9}}},
{"ab*", "a", []Group{{0, 1}}},
{"ab*", "abb", []Group{{0, 3}}},
{"a*b", "aaab", []Group{{0, 4}}},
{"a*b", "qwqw", []Group{}},
{"(abc)*", "abcabcabc", []Group{{0, 9}, {9, 9}}},
{"((abc)|(def))*", "abcdef", []Group{{0, 6}, {6, 6}}},
{"(abc)*|(def)*", "abcdef", []Group{{0, 3}, {3, 6}, {6, 6}}},
{"b*a*a", "bba", []Group{{0, 3}}},
{"(ab)+", "abcabddd", []Group{{0, 2}, {3, 5}}},
{"a(b(c|d)*)*", "abccbd", []Group{{0, 6}}},
{"a(b|c)*d+", "abccdd", []Group{{0, 6}}},
{"a*", "", []Group{{0, 0}}},
{"a|b", "c", []Group{}},
{"(a|b)*c", "aabbc", []Group{{0, 5}}},
{"a(b|b)", "ab", []Group{{0, 2}}},
{"a*", "aaaaaaaa", []Group{{0, 8}, {8, 8}}},
{"a", nil, "abc", []Group{{0, 1}}},
{"a", nil, "bca", []Group{{2, 3}}},
{"l", nil, "ggllgg", []Group{{2, 3}, {3, 4}}},
{"(b|c)", nil, "abdceb", []Group{{1, 2}, {3, 4}, {5, 6}}},
{"a+", nil, "brerereraaaaabbbbb", []Group{{8, 13}}},
{"ab+", nil, "qweqweqweaqweqweabbbbbr", []Group{{16, 22}}},
{"(b|c|A)", nil, "ooaoobocA", []Group{{5, 6}, {7, 8}, {8, 9}}},
{"ab*", nil, "a", []Group{{0, 1}}},
{"ab*", nil, "abb", []Group{{0, 3}}},
{"a*b", nil, "aaab", []Group{{0, 4}}},
{"a*b", nil, "qwqw", []Group{}},
{"(abc)*", nil, "abcabcabc", []Group{{0, 9}, {9, 9}}},
{"((abc)|(def))*", nil, "abcdef", []Group{{0, 6}, {6, 6}}},
{"(abc)*|(def)*", nil, "abcdef", []Group{{0, 3}, {3, 6}, {6, 6}}},
{"b*a*a", nil, "bba", []Group{{0, 3}}},
{"(ab)+", nil, "abcabddd", []Group{{0, 2}, {3, 5}}},
{"a(b(c|d)*)*", nil, "abccbd", []Group{{0, 6}}},
{"a(b|c)*d+", nil, "abccdd", []Group{{0, 6}}},
{"a*", nil, "", []Group{{0, 0}}},
{"a|b", nil, "c", []Group{}},
{"(a|b)*c", nil, "aabbc", []Group{{0, 5}}},
{"a(b|b)", nil, "ab", []Group{{0, 2}}},
{"a*", nil, "aaaaaaaa", []Group{{0, 8}, {8, 8}}},
{"ab?", "ab", []Group{{0, 2}}},
{"a?b", "ab", []Group{{0, 2}}},
{"a?", "", []Group{{0, 0}}},
{"a?b?c", "a", []Group{}},
{"a?b?c?", "ab", []Group{{0, 2}, {2, 2}}},
{"a?b?c?", "ac", []Group{{0, 2}, {2, 2}}},
{"a?b?c", "abc", []Group{{0, 3}}},
{"a?b?c", "acb", []Group{{0, 2}}},
{"ab?", nil, "ab", []Group{{0, 2}}},
{"a?b", nil, "ab", []Group{{0, 2}}},
{"a?", nil, "", []Group{{0, 0}}},
{"a?b?c", nil, "a", []Group{}},
{"a?b?c?", nil, "ab", []Group{{0, 2}, {2, 2}}},
{"a?b?c?", nil, "ac", []Group{{0, 2}, {2, 2}}},
{"a?b?c", nil, "abc", []Group{{0, 3}}},
{"a?b?c", nil, "acb", []Group{{0, 2}}},
{"[abc]", "defadefbdefce", []Group{{3, 4}, {7, 8}, {11, 12}}},
{"[ab]c", "ab", []Group{}},
{"g[ab]c", "gac", []Group{{0, 3}}},
{"g[ab]c", "gbc", []Group{{0, 3}}},
{"g[ab]c", "gc", []Group{}},
{"g[ab]c", "gfc", []Group{}},
{"[ab]*", "aabbbabaababab", []Group{{0, 14}, {14, 14}}},
{"[ab]+", "aabbbablaababab", []Group{{0, 7}, {8, 15}}},
{"[Ff]r[Uu]it", "fruit", []Group{{0, 5}}},
{"[Ff]r[Uu]it", "FrUit", []Group{{0, 5}}},
{"[Ff]r[Uu|]it", "Fr|it", []Group{{0, 5}}},
{"[Ff]r([Uu]|[pP])it", "Frpit", []Group{{0, 5}}},
{"[Ff]r[Uu]|[pP]it", "Frpit", []Group{{2, 5}}},
{"[a-zA-Z]+", "Hello, how is it going?", []Group{{0, 5}, {7, 10}, {11, 13}, {14, 16}, {17, 22}}},
{"[abc]", nil, "defadefbdefce", []Group{{3, 4}, {7, 8}, {11, 12}}},
{"[ab]c", nil, "ab", []Group{}},
{"g[ab]c", nil, "gac", []Group{{0, 3}}},
{"g[ab]c", nil, "gbc", []Group{{0, 3}}},
{"g[ab]c", nil, "gc", []Group{}},
{"g[ab]c", nil, "gfc", []Group{}},
{"[ab]*", nil, "aabbbabaababab", []Group{{0, 14}, {14, 14}}},
{"[ab]+", nil, "aabbbablaababab", []Group{{0, 7}, {8, 15}}},
{"[Ff]r[Uu]it", nil, "fruit", []Group{{0, 5}}},
{"[Ff]r[Uu]it", nil, "FrUit", []Group{{0, 5}}},
{"[Ff]r[Uu|]it", nil, "Fr|it", []Group{{0, 5}}},
{"[Ff]r([Uu]|[pP])it", nil, "Frpit", []Group{{0, 5}}},
{"[Ff]r[Uu]|[pP]it", nil, "Frpit", []Group{{2, 5}}},
{"[a-zA-Z]+", nil, "Hello, how is it going?", []Group{{0, 5}, {7, 10}, {11, 13}, {14, 16}, {17, 22}}},
{".+", "Hello, how is it going?", []Group{{0, 23}}},
{"a.", "a ", []Group{{0, 2}}},
{"a.b", "a/b", []Group{{0, 3}}},
{".", "a ", []Group{{0, 1}, {1, 2}}},
{"a.", "a ", []Group{{0, 2}}},
{".+b", "abc", []Group{{0, 2}}},
{".+", nil, "Hello, how is it going?", []Group{{0, 23}}},
{"a.", nil, "a ", []Group{{0, 2}}},
{"a.b", nil, "a/b", []Group{{0, 3}}},
{".", nil, "a ", []Group{{0, 1}, {1, 2}}},
{"a.", nil, "a ", []Group{{0, 2}}},
{".+b", nil, "abc", []Group{{0, 2}}},
{`\d`, "1a0a3s'''34343s", []Group{{0, 1}, {2, 3}, {4, 5}, {9, 10}, {10, 11}, {11, 12}, {12, 13}, {13, 14}}},
{`\\`, `a\b\c\qwe\`, []Group{{1, 2}, {3, 4}, {5, 6}, {9, 10}}},
{`\W`, `"Hello", he said. How are you doing?`, []Group{{0, 1}, {6, 7}, {7, 8}, {8, 9}, {11, 12}, {16, 17}, {17, 18}, {21, 22}, {25, 26}, {29, 30}, {35, 36}}},
{`\w`, ";';';';';'qwe12", []Group{{10, 11}, {11, 12}, {12, 13}, {13, 14}, {14, 15}}},
{`\s`, "a b c d", []Group{{1, 2}, {3, 4}, {5, 6}, {6, 7}}},
{`\<`, "<HTML><body>", []Group{{0, 1}, {6, 7}}},
{`\(.+\)`, "Not (paranthesized), (so) is (this) not", []Group{{4, 35}}},
{`\d`, nil, "1a0a3s'''34343s", []Group{{0, 1}, {2, 3}, {4, 5}, {9, 10}, {10, 11}, {11, 12}, {12, 13}, {13, 14}}},
{`\\`, nil, `a\b\c\qwe\`, []Group{{1, 2}, {3, 4}, {5, 6}, {9, 10}}},
{`\W`, nil, `"Hello", he said. How are you doing?`, []Group{{0, 1}, {6, 7}, {7, 8}, {8, 9}, {11, 12}, {16, 17}, {17, 18}, {21, 22}, {25, 26}, {29, 30}, {35, 36}}},
{`\w`, nil, ";';';';';'qwe12", []Group{{10, 11}, {11, 12}, {12, 13}, {13, 14}, {14, 15}}},
{`\s`, nil, "a b c d", []Group{{1, 2}, {3, 4}, {5, 6}, {6, 7}}},
{`\<`, nil, "<HTML><body>", []Group{{0, 1}, {6, 7}}},
{`\(.+\)`, nil, "Not (paranthesized), (so) is (this) not", []Group{{4, 35}}},
{"[^abc]+", "qarbtopsaplpclkpasdmb prejip0r,p", []Group{{0, 1}, {2, 3}, {4, 8}, {9, 12}, {13, 16}, {17, 20}, {21, 32}}},
{"[^a]+", "qqqaq", []Group{{0, 3}, {4, 5}}},
{"[^0-9]+", "a1b2c3dd", []Group{{0, 1}, {2, 3}, {4, 5}, {6, 8}}},
{"[^abc]+", "ababababbababaccacacacaca", []Group{}},
{`\[`, "a[b[c[]]]", []Group{{1, 2}, {3, 4}, {5, 6}}},
{`\([^)]+\)`, "Not (paranthesized), (so) is (this) not", []Group{{4, 19}, {21, 25}, {29, 35}}},
{"[^abc]+", nil, "qarbtopsaplpclkpasdmb prejip0r,p", []Group{{0, 1}, {2, 3}, {4, 8}, {9, 12}, {13, 16}, {17, 20}, {21, 32}}},
{"[^a]+", nil, "qqqaq", []Group{{0, 3}, {4, 5}}},
{"[^0-9]+", nil, "a1b2c3dd", []Group{{0, 1}, {2, 3}, {4, 5}, {6, 8}}},
{"[^abc]+", nil, "ababababbababaccacacacaca", []Group{}},
{`\[`, nil, "a[b[c[]]]", []Group{{1, 2}, {3, 4}, {5, 6}}},
{`\([^)]+\)`, nil, "Not (paranthesized), (so) is (this) not", []Group{{4, 19}, {21, 25}, {29, 35}}},
{"^ab", "ab bab", []Group{{0, 2}}},
{"^aaaa^", "aaaaaaaa", []Group{}},
{"^([bB][Gg])", "bG", []Group{{0, 2}}},
{"b$", "ba", []Group{}},
{"(boy|girl)$", "girlf", []Group{}},
{`\bint\b`, "print int integer", []Group{{6, 9}}},
{`int\b`, "ints", []Group{}},
{`int(\b|a)`, "inta", []Group{{0, 4}}},
{`\b\d+\b`, "511 a3 43", []Group{{0, 3}, {7, 9}}},
{`\Bint\B`, "prints int integer print", []Group{{2, 5}}},
{`^`, "5^3^2", []Group{{0, 0}}},
{`\^`, "5^3^2", []Group{{1, 2}, {3, 4}}},
{`pool$`, "pool carpool", []Group{{8, 12}}},
{`^int$`, "print int integer", []Group{}},
{`^int$`, "int", []Group{{0, 3}}},
{`b*`, "aaaaaaaaaaqweqwe", []Group{{0, 0}, {1, 1}, {2, 2}, {3, 3}, {4, 4}, {5, 5}, {6, 6}, {7, 7}, {8, 8}, {9, 9}, {10, 10}, {11, 11}, {12, 12}, {13, 13}, {14, 14}, {15, 15}, {16, 16}}},
{"^ab", nil, "ab bab", []Group{{0, 2}}},
{"^aaaa^", nil, "aaaaaaaa", []Group{}},
{"^([bB][Gg])", nil, "bG", []Group{{0, 2}}},
{"b$", nil, "ba", []Group{}},
{"(boy|girl)$", nil, "girlf", []Group{}},
{`\bint\b`, nil, "print int integer", []Group{{6, 9}}},
{`int\b`, nil, "ints", []Group{}},
{`int(\b|a)`, nil, "inta", []Group{{0, 4}}},
{`\b\d+\b`, nil, "511 a3 43", []Group{{0, 3}, {7, 9}}},
{`\Bint\B`, nil, "prints int integer print", []Group{{2, 5}}},
{`^`, nil, "5^3^2", []Group{{0, 0}}},
{`\^`, nil, "5^3^2", []Group{{1, 2}, {3, 4}}},
{`pool$`, nil, "pool carpool", []Group{{8, 12}}},
{`^int$`, nil, "print int integer", []Group{}},
{`^int$`, nil, "int", []Group{{0, 3}}},
{`b*`, nil, "aaaaaaaaaaqweqwe", []Group{{0, 0}, {1, 1}, {2, 2}, {3, 3}, {4, 4}, {5, 5}, {6, 6}, {7, 7}, {8, 8}, {9, 9}, {10, 10}, {11, 11}, {12, 12}, {13, 13}, {14, 14}, {15, 15}, {16, 16}}},
{"a{4}", "aabaaa", []Group{}},
{"ab{5}", "abbbbbab", []Group{{0, 6}}},
{"(a|b){3,4}", "aba", []Group{{0, 3}}},
{"(a|b){3,4}", "ababaa", []Group{{0, 4}}},
{"(bc){5,}", "bcbcbcbcbcbcbcbc", []Group{{0, 16}}},
{`\d{3,4}`, "1209", []Group{{0, 4}}},
{`\d{3,4}`, "109", []Group{{0, 3}}},
{`\d{3,4}`, "5", []Group{}},
{`\d{3,4}`, "123135", []Group{{0, 4}}},
{`\d{3,4}`, "89a-0", []Group{}},
{`\d{3,4}`, "ababab555", []Group{{6, 9}}},
{`\bpaint\b`, "paints", []Group{}},
{`\b\w{5}\b`, "paint", []Group{{0, 5}}},
{`[^\w]`, "abcdef1230[]qq';;'", []Group{{10, 11}, {11, 12}, {14, 15}, {15, 16}, {16, 17}, {17, 18}}},
{`[^\W]`, "abcdef1230[]qq';;'", []Group{{0, 1}, {1, 2}, {2, 3}, {3, 4}, {4, 5}, {5, 6}, {6, 7}, {7, 8}, {8, 9}, {9, 10}, {12, 13}, {13, 14}}},
{`[\[\]]`, "a[b[l]]", []Group{{1, 2}, {3, 4}, {5, 6}, {6, 7}}},
{"a{4}", nil, "aabaaa", []Group{}},
{"ab{5}", nil, "abbbbbab", []Group{{0, 6}}},
{"(a|b){3,4}", nil, "aba", []Group{{0, 3}}},
{"(a|b){3,4}", nil, "ababaa", []Group{{0, 4}}},
{"(bc){5,}", nil, "bcbcbcbcbcbcbcbc", []Group{{0, 16}}},
{`\d{3,4}`, nil, "1209", []Group{{0, 4}}},
{`\d{3,4}`, nil, "109", []Group{{0, 3}}},
{`\d{3,4}`, nil, "5", []Group{}},
{`\d{3,4}`, nil, "123135", []Group{{0, 4}}},
{`\d{3,4}`, nil, "89a-0", []Group{}},
{`\d{3,4}`, nil, "ababab555", []Group{{6, 9}}},
{`\bpaint\b`, nil, "paints", []Group{}},
{`\b\w{5}\b`, nil, "paint", []Group{{0, 5}}},
{`[^\w]`, nil, "abcdef1230[]qq';;'", []Group{{10, 11}, {11, 12}, {14, 15}, {15, 16}, {16, 17}, {17, 18}}},
{`[^\W]`, nil, "abcdef1230[]qq';;'", []Group{{0, 1}, {1, 2}, {2, 3}, {3, 4}, {4, 5}, {5, 6}, {6, 7}, {7, 8}, {8, 9}, {9, 10}, {12, 13}, {13, 14}}},
{`[\[\]]`, nil, "a[b[l]]", []Group{{1, 2}, {3, 4}, {5, 6}, {6, 7}}},
// Unicode tests
{`.+`, "úïäö´«åæïëòöê»éãçâï«úïòíñ", []Group{{0, 25}}},
{`a.b`, "a²b", []Group{{0, 3}}},
{`[^a]+`, "úïäö´«åæïëòöê»éãçâï«úïòíñ", []Group{{0, 25}}},
{`.+`, nil, "úïäö´«åæïëòöê»éãçâï«úïòíñ", []Group{{0, 25}}},
{`a.b`, nil, "a²b", []Group{{0, 3}}},
{`[^a]+`, nil, "úïäö´«åæïëòöê»éãçâï«úïòíñ", []Group{{0, 25}}},
// Fun experiment - AI-generated tests
{"(abc|def|ghi)", "abcdefg", []Group{{0, 3}, {3, 6}}},
{"a(b|c)d", "abcd", []Group{}},
{"a(b|c)*d", "abcbcd", []Group{{0, 6}}},
{"a(b|c)+d", "abcbcd", []Group{{0, 6}}},
{"a(b|c)?d", "abd", []Group{{0, 3}}},
{".+", "hello world", []Group{{0, 11}}},
{"a.b", "aXb", []Group{{0, 3}}},
{"a.*b", "aXb", []Group{{0, 3}}},
{"a.{2,3}b", "aXXb", []Group{{0, 4}}},
{"a.{2,}b", "aXXXb", []Group{{0, 5}}},
{"a.{0,3}b", "ab", []Group{{0, 2}}},
{"[abc]+", "abcabc", []Group{{0, 6}}},
{"[a-zA-Z]+", "HelloWorld", []Group{{0, 10}}},
{"[^abc]+", "defghi", []Group{{0, 6}}},
{"^hello", "hello world", []Group{{0, 5}}},
{"world$", "hello world", []Group{{6, 11}}},
{`\bhello\b`, "hello world", []Group{{0, 5}}},
{`\Bhello\B`, "hello world", []Group{}},
{"(hello|world)", "hello world", []Group{{0, 5}, {6, 11}}},
{"(hello|world)+", "hello world", []Group{{0, 5}, {6, 11}}},
{"(hello|world)*", "hello world", []Group{{0, 5}, {5, 5}, {6, 11}, {11, 11}}},
{"(hello|world)?", "hello world", []Group{{0, 5}, {5, 5}, {6, 11}, {11, 11}}},
{"ú.+ï", "úïäö´«åæïëòöê»éãçâï«úïòíñ", []Group{{0, 22}}},
{"(?=hello)", "hello world", []Group{{0, 0}}},
{"(?!hello)", "hello world", []Group{{1, 1}, {2, 2}, {3, 3}, {4, 4}, {5, 5}, {6, 6}, {7, 7}, {8, 8}, {9, 9}, {10, 10}, {11, 11}}},
{"(?<=hello)", "hello world", []Group{{5, 5}}},
{"(?<!hello)", "hello world", []Group{{0, 0}, {1, 1}, {2, 2}, {3, 3}, {4, 4}, {6, 6}, {7, 7}, {8, 8}, {9, 9}, {10, 10}, {11, 11}}},
{"^((3[7-9])|([4-9][0-9])|([1-9][0-9][0-9])|(1000))$", "40", []Group{{0, 2}}},
{"^((3[7-9])|([4-9][0-9])|([1-9][0-9][0-9])|(1000))$", "040", []Group{}},
{"^((3[7-9])|([4-9][0-9])|([1-9][0-9][0-9])|(1000))$", "400", []Group{{0, 3}}},
{"^((3[7-9])|([4-9][0-9])|([1-9][0-9][0-9])|(1000))$", "4000", []Group{}},
{"a{1,3}", "aaaaa", []Group{{0, 3}, {3, 5}}},
{`\\[ab\\]`, "a", []Group{}},
{`\\[ab\\]`, `\a`, []Group{{0, 2}}},
{"(abc|def|ghi)", nil, "abcdefg", []Group{{0, 3}, {3, 6}}},
{"a(b|c)d", nil, "abcd", []Group{}},
{"a(b|c)*d", nil, "abcbcd", []Group{{0, 6}}},
{"a(b|c)+d", nil, "abcbcd", []Group{{0, 6}}},
{"a(b|c)?d", nil, "abd", []Group{{0, 3}}},
{".+", nil, "hello world", []Group{{0, 11}}},
{"a.b", nil, "aXb", []Group{{0, 3}}},
{"a.*b", nil, "aXb", []Group{{0, 3}}},
{"a.{2,3}b", nil, "aXXb", []Group{{0, 4}}},
{"a.{2,}b", nil, "aXXXb", []Group{{0, 5}}},
{"a.{0,3}b", nil, "ab", []Group{{0, 2}}},
{"[abc]+", nil, "abcabc", []Group{{0, 6}}},
{"[a-zA-Z]+", nil, "HelloWorld", []Group{{0, 10}}},
{"[^abc]+", nil, "defghi", []Group{{0, 6}}},
{"^hello", nil, "hello world", []Group{{0, 5}}},
{"world$", nil, "hello world", []Group{{6, 11}}},
{`\bhello\b`, nil, "hello world", []Group{{0, 5}}},
{`\Bhello\B`, nil, "hello world", []Group{}},
{"(hello|world)", nil, "hello world", []Group{{0, 5}, {6, 11}}},
{"(hello|world)+", nil, "hello world", []Group{{0, 5}, {6, 11}}},
{"(hello|world)*", nil, "hello world", []Group{{0, 5}, {5, 5}, {6, 11}, {11, 11}}},
{"(hello|world)?", nil, "hello world", []Group{{0, 5}, {5, 5}, {6, 11}, {11, 11}}},
{"ú.+ï", nil, "úïäö´«åæïëòöê»éãçâï«úïòíñ", []Group{{0, 22}}},
{"(?=hello)", nil, "hello world", []Group{{0, 0}}},
{"(?!hello)", nil, "hello world", []Group{{1, 1}, {2, 2}, {3, 3}, {4, 4}, {5, 5}, {6, 6}, {7, 7}, {8, 8}, {9, 9}, {10, 10}, {11, 11}}},
{"(?<=hello)", nil, "hello world", []Group{{5, 5}}},
{"(?<!hello)", nil, "hello world", []Group{{0, 0}, {1, 1}, {2, 2}, {3, 3}, {4, 4}, {6, 6}, {7, 7}, {8, 8}, {9, 9}, {10, 10}, {11, 11}}},
{"^((3[7-9])|([4-9][0-9])|([1-9][0-9][0-9])|(1000))$", nil, "40", []Group{{0, 2}}},
{"^((3[7-9])|([4-9][0-9])|([1-9][0-9][0-9])|(1000))$", nil, "040", []Group{}},
{"^((3[7-9])|([4-9][0-9])|([1-9][0-9][0-9])|(1000))$", nil, "400", []Group{{0, 3}}},
{"^((3[7-9])|([4-9][0-9])|([1-9][0-9][0-9])|(1000))$", nil, "4000", []Group{}},
{"a{1,3}", nil, "aaaaa", []Group{{0, 3}, {3, 5}}},
{`\\[ab\\]`, nil, "a", []Group{}},
{`\\[ab\\]`, nil, `\a`, []Group{{0, 2}}},
// Lookaround tests
{"(?<=bo)y", "boy", []Group{{2, 3}}},
{"bo(?=y)", "boy", []Group{{0, 2}}},
{"(?<=f)f+(?=f)", "fffff", []Group{{1, 4}}},
{"(?<=f)f+(?=f)", "fffffa", []Group{{1, 4}}},
{"(?<=bo)y", nil, "boy", []Group{{2, 3}}},
{"bo(?=y)", nil, "boy", []Group{{0, 2}}},
{"(?<=f)f+(?=f)", nil, "fffff", []Group{{1, 4}}},
{"(?<=f)f+(?=f)", nil, "fffffa", []Group{{1, 4}}},
// Test cases from Python's RE test suite
{`[\1]`, "\x01", []Group{{0, 1}}},
{`[\1]`, nil, "\x01", []Group{{0, 1}}},
{`\0`, "\x00", []Group{{0, 1}}},
{`[\0a]`, "\x00", []Group{{0, 1}}},
{`[\0a]`, "\x00", []Group{{0, 1}}},
{`[a\0]`, "\x00", []Group{{0, 1}}},
{`[^a\0]`, "\x00", []Group{}},
{`\0`, nil, "\x00", []Group{{0, 1}}},
{`[\0a]`, nil, "\x00", []Group{{0, 1}}},
{`[\0a]`, nil, "\x00", []Group{{0, 1}}},
{`[a\0]`, nil, "\x00", []Group{{0, 1}}},
{`[^a\0]`, nil, "\x00", []Group{}},
{`\a[\b]\f\n\r\t\v`, "\a\b\f\n\r\t\v", []Group{{0, 7}}},
{`[\a][\b][\f][\n][\r][\t][\v]`, "\a\b\f\n\r\t\v", []Group{{0, 7}}},
{`\u`, "", nil},
{`\xff`, "ÿ", []Group{{0, 1}}},
{`\x00ffffffffffffff`, "\xff", []Group{}},
{`\x00f`, "\x0f", []Group{}},
{`\x00fe`, "\xfe", []Group{}},
{`^\w+=(\\[\000-\277]|[^\n\\])*`, "SRC=eval.c g.c blah blah blah \\\\\n\tapes.c", []Group{{0, 32}}},
{`\a[\b]\f\n\r\t\v`, nil, "\a\b\f\n\r\t\v", []Group{{0, 7}}},
{`[\a][\b][\f][\n][\r][\t][\v]`, nil, "\a\b\f\n\r\t\v", []Group{{0, 7}}},
{`\u`, nil, "", nil},
{`\xff`, nil, "ÿ", []Group{{0, 1}}},
{`\x00ffffffffffffff`, nil, "\xff", []Group{}},
{`\x00f`, nil, "\x0f", []Group{}},
{`\x00fe`, nil, "\xfe", []Group{}},
{`^\w+=(\\[\000-\277]|[^\n\\])*`, nil, "SRC=eval.c g.c blah blah blah \\\\\n\tapes.c", []Group{{0, 32}}},
{`a.b`, nil, `acb`, []Group{{0, 3}}},
{`a.b`, nil, "a\nb", []Group{}},
{`a.*b`, nil, "acc\nccb", []Group{}},
{`a.{4,5}b`, nil, "acc\nccb", []Group{}},
{`a.b`, nil, "a\rb", []Group{{0, 3}}},
{`a.b`, []ReFlag{RE_MULTILINE}, "a\nb", []Group{{0, 3}}},
{`a.*b`, []ReFlag{RE_MULTILINE}, "acc\nccb", []Group{{0, 7}}},
{`a.{4,5}b`, []ReFlag{RE_MULTILINE}, "acc\nccb", []Group{{0, 7}}},
{`)`, nil, ``, nil},
{`^$`, nil, ``, []Group{{0, 0}}},
{`abc`, nil, `abc`, []Group{{0, 3}}},
{`abc`, nil, `xbc`, []Group{}},
{`abc`, nil, `axc`, []Group{}},
{`abc`, nil, `abx`, []Group{}},
{`abc`, nil, `xabcy`, []Group{{1, 4}}},
{`abc`, nil, `ababc`, []Group{{2, 5}}},
{`ab*c`, nil, `abc`, []Group{{0, 3}}},
{`ab*bc`, nil, `abc`, []Group{{0, 3}}},
{`ab*bc`, nil, `abbc`, []Group{{0, 4}}},
{`ab*bc`, nil, `abbbbc`, []Group{{0, 6}}},
{`ab+bc`, nil, `abbc`, []Group{{0, 4}}},
{`ab+bc`, nil, `abc`, []Group{}},
{`ab+bc`, nil, `abq`, []Group{}},
{`ab+bc`, nil, `abbbbc`, []Group{{0, 6}}},
{`ab?bc`, nil, `abbc`, []Group{{0, 4}}},
{`ab?bc`, nil, `abc`, []Group{{0, 3}}},
{`ab?bc`, nil, `abbbbc`, []Group{}},
{`ab?c`, nil, `abc`, []Group{{0, 3}}},
{`^abc$`, nil, `abc`, []Group{{0, 3}}},
{`^abc$`, nil, `abcc`, []Group{}},
{`^abc`, nil, `abcc`, []Group{{0, 3}}},
{`^abc$`, nil, `aabc`, []Group{}},
{`abc$`, nil, `aabc`, []Group{{1, 4}}},
{`^`, nil, `abc`, []Group{{0, 0}}},
{`$`, nil, `abc`, []Group{{3, 3}}},
{`a.c`, nil, `abc`, []Group{{0, 3}}},
{`a.c`, nil, `axc`, []Group{{0, 3}}},
{`a.*c`, nil, `axyzc`, []Group{{0, 5}}},
{`a.*c`, nil, `axyzd`, []Group{}},
{`a[bc]d`, nil, `abc`, []Group{}},
{`a[bc]d`, nil, `abd`, []Group{{0, 3}}},
{`a[b-d]e`, nil, `abd`, []Group{}},
{`a[b-d]e`, nil, `ace`, []Group{{0, 3}}},
{`a[b-d]`, nil, `aac`, []Group{{1, 3}}},
{`a[-b]`, nil, `a-`, []Group{{0, 2}}}, // If a character class has a hyphen without a start or end character, it is treated as a literal hyphen
{`a[\-b]`, nil, `a-`, []Group{{0, 2}}},
{`a[b-]`, nil, `a-`, []Group{{0, 2}}}, // If a character class has a hyphen without a start or end character, it is treated as a literal hyphen
{`a[]b`, nil, `-`, nil},
{`a[`, nil, `-`, nil},
{`a\`, nil, `-`, nil},
{`abc)`, nil, `-`, nil},
{`(abc`, nil, `-`, nil},
{`a]`, nil, `a]`, []Group{{0, 2}}},
// Todo - add numeric range tests
}
var groupTests = []struct {
re string
flags []ReFlag
str string
result []Match
}{
{"(a)(b)", "ab", []Match{[]Group{{0, 2}, {0, 1}, {1, 2}}}},
{"((a))(b)", "ab", []Match{[]Group{{0, 2}, {0, 1}, {0, 1}, {1, 2}}}},
{"(0)", "ab", []Match{[]Group{}}},
{"(a)b", "ab", []Match{[]Group{{0, 2}, {0, 1}}}},
{"a(b)", "ab", []Match{[]Group{{0, 2}, {1, 2}}}},
{"(a|b)", "ab", []Match{[]Group{{0, 1}, {0, 1}}, []Group{{1, 2}, {1, 2}}}},
{"(a)|(b)", "ab", []Match{[]Group{{0, 1}, {0, 1}, {-1, -1}}, []Group{{1, 2}, {-1, -1}, {1, 2}}}},
{"(a+)(a)", "aaaa", []Match{[]Group{{0, 4}, {0, 3}, {3, 4}}}},
{"(a+)|(a)", "aaaa", []Match{[]Group{{0, 4}, {0, 4}, {-1, -1}}}},
{"(a+)(aa)", "aaaa", []Match{[]Group{{0, 4}, {0, 2}, {2, 4}}}},
{"(aaaa)|(aaaa)", "aaaa", []Match{[]Group{{0, 4}, {0, 4}, {-1, -1}}}},
{"(aaa)|(aaaa)", "aaaa", []Match{[]Group{{0, 4}, {-1, -1}, {0, 4}}}},
{"(aaa)|(aaaa)", "aaaa", []Match{[]Group{{0, 4}, {-1, -1}, {0, 4}}}},
{"(aaaa)|(aaa)", "aaaa", []Match{[]Group{{0, 4}, {0, 4}, {-1, -1}}}},
{"(a)|(aa)", "aa", []Match{[]Group{{0, 2}, {-1, -1}, {0, 2}}}},
{"(a?)a?", "b", []Match{[]Group{{0, 0}, {0, 0}}, []Group{{1, 1}, {1, 1}}}},
{"(a?)a?", "ab", []Match{[]Group{{0, 1}, {0, 1}}, []Group{{1, 1}, {1, 1}}, []Group{{2, 2}, {2, 2}}}},
{"(a?)a?", "aa", []Match{[]Group{{0, 2}, {0, 1}}, []Group{{2, 2}, {2, 2}}}},
{"a((b.d){3})", "abfdbhdbid", []Match{[]Group{{0, 10}, {1, 10}, {7, 10}}}},
{`(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)(l)\071`, `abcdefghijkl9`, []Match{[]Group{{0, 13}, {0, 1}, {1, 2}, {2, 3}, {3, 4}, {4, 5}, {5, 6}, {6, 7}, {7, 8}, {8, 9}, {9, 10}, {10, 11}, {11, 12}}}},
{"(a)(b)", nil, "ab", []Match{[]Group{{0, 2}, {0, 1}, {1, 2}}}},
{"((a))(b)", nil, "ab", []Match{[]Group{{0, 2}, {0, 1}, {0, 1}, {1, 2}}}},
{"(0)", nil, "ab", []Match{[]Group{}}},
{"(a)b", nil, "ab", []Match{[]Group{{0, 2}, {0, 1}}}},
{"a(b)", nil, "ab", []Match{[]Group{{0, 2}, {1, 2}}}},
{"(a|b)", nil, "ab", []Match{[]Group{{0, 1}, {0, 1}}, []Group{{1, 2}, {1, 2}}}},
{"(a)|(b)", nil, "ab", []Match{[]Group{{0, 1}, {0, 1}, {-1, -1}}, []Group{{1, 2}, {-1, -1}, {1, 2}}}},
{"(a+)(a)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 3}, {3, 4}}}},
{"(a+)|(a)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 4}, {-1, -1}}}},
{"(a+)(aa)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 2}, {2, 4}}}},
{"(aaaa)|(aaaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 4}, {-1, -1}}}},
{"(aaa)|(aaaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {-1, -1}, {0, 4}}}},
{"(aaa)|(aaaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {-1, -1}, {0, 4}}}},
{"(aaaa)|(aaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 4}, {-1, -1}}}},
{"(a)|(aa)", nil, "aa", []Match{[]Group{{0, 2}, {-1, -1}, {0, 2}}}},
{"(a?)a?", nil, "b", []Match{[]Group{{0, 0}, {0, 0}}, []Group{{1, 1}, {1, 1}}}},
{"(a?)a?", nil, "ab", []Match{[]Group{{0, 1}, {0, 1}}, []Group{{1, 1}, {1, 1}}, []Group{{2, 2}, {2, 2}}}},
{"(a?)a?", nil, "aa", []Match{[]Group{{0, 2}, {0, 1}}, []Group{{2, 2}, {2, 2}}}},
{"a((b.d){3})", nil, "abfdbhdbid", []Match{[]Group{{0, 10}, {1, 10}, {7, 10}}}},
{`(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)(l)\071`, nil, `abcdefghijkl9`, []Match{[]Group{{0, 13}, {0, 1}, {1, 2}, {2, 3}, {3, 4}, {4, 5}, {5, 6}, {6, 7}, {7, 8}, {8, 9}, {9, 10}, {10, 11}, {11, 12}}}},
}
func TestFindAllMatches(t *testing.T) {
for _, test := range reTests {
t.Run(test.re+" "+test.str, func(t *testing.T) {
regComp, err := Compile(test.re)
regComp, err := Compile(test.re, test.flags...)
if err != nil {
if test.result != nil {
panic(err)
panic(fmt.Errorf("Test Error: %v", err))
}
} else {
matchIndices := FindAllMatches(regComp, test.str)
@@ -234,7 +293,7 @@ func TestFindAllMatches(t *testing.T) {
func TestFindString(t *testing.T) {
for _, test := range reTests {
t.Run(test.re+" "+test.str, func(t *testing.T) {
regComp, err := Compile(test.re)
regComp, err := Compile(test.re, test.flags...)
if err != nil {
if test.result != nil {
panic(err)
@@ -259,7 +318,7 @@ func TestFindString(t *testing.T) {
func TestFindAllGroups(t *testing.T) {
for _, test := range groupTests {
t.Run(test.re+" "+test.str, func(t *testing.T) {
regComp, err := Compile(test.re)
regComp, err := Compile(test.re, test.flags...)
if err != nil {
if test.result != nil {
panic(err)