Compare commits
21 Commits
v0.6.1
...
4c96cfa06c
Author | SHA1 | Date | |
---|---|---|---|
4c96cfa06c | |||
bd56c9c7b5 | |||
6cf523b7ea | |||
ed2671849d | |||
2309d35d30 | |||
5afb7dd04a | |||
d5007a3fd5 | |||
5c4d979d7e | |||
|
435588274c | ||
|
a347ebacc4 | ||
|
ccb82f781b | ||
|
09bbf8d3f1 | ||
|
d5b4450e50 | ||
|
45827b5dd3 | ||
|
c26edcb0c4 | ||
|
110298b6a6 | ||
|
eff4c5a5df | ||
0bd7a87797 | |||
9cf1c66653 | |||
9edc99d73c | |||
|
6850396bf9 |
187
compile.go
187
compile.go
@@ -82,6 +82,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
// Also check for non-capturing groups. The LPAREN of a non-capturing group looks like this: '(?:'
|
||||
// I take this out, and put in a special character - NONCAPLPAREN_CHAR.
|
||||
//
|
||||
// Another check is made for unescaped brackets - opening brackets are replaced with LBRACKET and closing brackets are replaced with RBRACKET.
|
||||
// Finally, check for escaped backslashes. Replace these with the BACKSLASH metacharacter. Later, in thompson(),
|
||||
// these will be converted back. This avoids confusion in detecting whether a character is escaped, e.g. detecting
|
||||
// whether '\\[a]' has an escaped opening bracket (it doesn't).
|
||||
@@ -122,6 +123,12 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
} else if c == '\\' && i < len(re_runes_orig)-1 && re_runes_orig[i+1] == '\\' { // Escaped backslash
|
||||
re_runes = append(re_runes, ESC_BACKSLASH)
|
||||
i++
|
||||
} else if c == '[' && (i == 0 || re_runes[len(re_runes)-1] != '\\') {
|
||||
re_runes = append(re_runes, LBRACKET)
|
||||
continue
|
||||
} else if c == ']' && (i == 0 || re_runes[len(re_runes)-1] != '\\') {
|
||||
re_runes = append(re_runes, RBRACKET)
|
||||
continue
|
||||
} else {
|
||||
re_runes = append(re_runes, c)
|
||||
}
|
||||
@@ -141,39 +148,28 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
i := 0
|
||||
for i < len(re_runes) {
|
||||
re_postfix = append(re_postfix, re_runes[i])
|
||||
if re_runes[i] == '[' && (i == 0 || re_runes[i-1] != '\\') { // We do not touch things inside brackets, unless they are escaped. Inside this block, the only task is to expand character ranges into their constituent characters.
|
||||
re_postfix[len(re_postfix)-1] = LBRACKET // Replace the '[' character with LBRACKET. This allows for easier parsing of all characters (including opening and closing brackets) within the character class
|
||||
toAppend := make([]rune, 0) // Holds all the runes in the current character class
|
||||
if i < len(re_runes)-1 && re_runes[i+1] == '^' { // Inverting class - match everything NOT in brackets
|
||||
re_postfix = append(re_postfix, '^')
|
||||
i++ // Skip opening bracket and caret
|
||||
if re_runes[i] == LBRACKET && (i == 0 || re_runes[i-1] != '\\') { // We do not touch things inside brackets, unless they are escaped.
|
||||
toAppend := make([]rune, 0) // Holds all the runes in the current character class
|
||||
|
||||
i++ // Skip past LBRACKET, because it was already added
|
||||
if i >= len(re_runes) { // Sanity check before we start
|
||||
return nil, fmt.Errorf("Opening bracket without closing bracket.")
|
||||
}
|
||||
if i < len(re_runes)-1 && re_runes[i+1] == ']' { // Nothing inside brackets - panic.
|
||||
return nil, fmt.Errorf("Empty character class.")
|
||||
}
|
||||
for re_runes[i] != ']' || i == 0 || re_runes[i-1] == '\\' {
|
||||
i++ // Skip all characters inside _unescaped_ brackets (we are _not_ at a closing bracket, or if we are, the previous character is a backslash)
|
||||
// TODO: Check for escaped characters
|
||||
|
||||
// Check ahead for character range
|
||||
if i < len(re_runes)-2 && re_runes[i+1] == '-' {
|
||||
rangeStart := re_runes[i]
|
||||
rangeEnd := re_runes[i+2]
|
||||
if int(rangeEnd) < int(rangeStart) {
|
||||
return nil, fmt.Errorf("Range is out of order.")
|
||||
}
|
||||
for re_runes[i] != RBRACKET || i == 0 || re_runes[i-1] == '\\' { // Skip all characters inside _unescaped_ brackets (we are _not_ at a closing bracket, or if we are, the previous character is a backslash)
|
||||
// Make sure we haven't exceeded the length of the string. If we did, then the regex doesn't actually have a closing bracket and we should throw an error.
|
||||
if i >= len(re_runes) {
|
||||
return nil, fmt.Errorf("Opening bracket without closing bracket.")
|
||||
}
|
||||
|
||||
for i := rangeStart; i <= rangeEnd; i++ {
|
||||
toAppend = append(toAppend, i)
|
||||
}
|
||||
|
||||
i += 2 // Skip start and hyphen (end will automatically be skipped on next iteration of loop)
|
||||
continue
|
||||
if re_runes[i] == '-' && (i > 0 && re_runes[i-1] != '\\') && (i < len(re_runes)-1 && re_runes[i+1] != RBRACKET) { // Unescaped hyphen, that has some character (not a RBRACKET) after it - This represents a character range, so we replace with CHAR_RANGE. This metacharacter will be used later on to construct the range
|
||||
re_runes[i] = CHAR_RANGE
|
||||
}
|
||||
toAppend = append(toAppend, re_runes[i])
|
||||
i++
|
||||
}
|
||||
// Replace the last character (which should have been ']', with RBRACKET
|
||||
toAppend[len(toAppend)-1] = RBRACKET
|
||||
// Add in the RBRACKET
|
||||
toAppend = append(toAppend, RBRACKET)
|
||||
re_postfix = append(re_postfix, toAppend...)
|
||||
}
|
||||
if i < len(re_runes) && re_runes[i] == '{' && (i > 0 && re_runes[i-1] != '\\') { // We don't touch things inside braces, either
|
||||
@@ -280,7 +276,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
6. If current character is '{', find the appropriate numeric specifier (range start, range end). Apply the range to the postfixNode at the end of outQueue.
|
||||
*/
|
||||
c := re_postfix[i]
|
||||
if isNormalChar(c) {
|
||||
if isNormalChar(c) || isSpecialCharWithMetacharReplacement(c) {
|
||||
if caseInsensitive {
|
||||
outQueue = append(outQueue, newPostfixNode(allCases(c)...))
|
||||
} else {
|
||||
@@ -288,7 +284,18 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
}
|
||||
continue
|
||||
}
|
||||
// Escape character
|
||||
// Since every unescaped bracket is replaced by a LBRACKET / RBRACKET, there may
|
||||
// have been false positives. For example, the regex ']' has a closing bracket, but it
|
||||
// isn't denoting a character class; it's just a regular character. Since it's not escaped,
|
||||
// though, I would have converted this into an RBRACKET.
|
||||
// To deal with this, I make the following assertion:
|
||||
// If at any point I see an RBRACKET 'in the wild' (not in a character class), then it must be
|
||||
// a regular character, with no special significance.
|
||||
if c == RBRACKET {
|
||||
outQueue = append(outQueue, newPostfixCharNode(']'))
|
||||
continue
|
||||
}
|
||||
|
||||
if c == '\\' { // Escape character - invert special and non-special characters eg. \( is treated as a literal parentheses, \b is treated as word boundary
|
||||
if i == len(re_postfix)-1 { // End of string - panic, because backslash is an escape character (something needs to come after it)
|
||||
return nil, fmt.Errorf("ERROR: Backslash with no escape character.")
|
||||
@@ -420,7 +427,13 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
}
|
||||
}
|
||||
if c == LBRACKET { // Used for character classes
|
||||
i++ // Step forward so we can look at the character class
|
||||
firstCharAdded := false // A character class must have at least 1 character. This flag checks if the first character has been added.
|
||||
endOfRange := false // Set to 'true' when we encounter a CHAR_RANGE metacharacter
|
||||
i++ // Step forward so we can look at the character class
|
||||
// Oops, there's nothing there to look at
|
||||
if i >= len(re_postfix) {
|
||||
return nil, fmt.Errorf("Opening bracket with no closing bracket.")
|
||||
}
|
||||
var invertMatch bool
|
||||
if re_postfix[i] == '^' {
|
||||
invertMatch = true
|
||||
@@ -428,9 +441,14 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
}
|
||||
chars := make([]postfixNode, 0) // List of nodes - used only for character classes
|
||||
for i < len(re_postfix) {
|
||||
if re_postfix[i] == RBRACKET {
|
||||
if firstCharAdded && re_postfix[i] == RBRACKET {
|
||||
break
|
||||
}
|
||||
if re_postfix[i] == CHAR_RANGE {
|
||||
endOfRange = true
|
||||
i++
|
||||
continue
|
||||
}
|
||||
if re_postfix[i] == '\\' { // Backslash indicates a character to be escaped
|
||||
if i == len(re_postfix)-1 {
|
||||
return nil, fmt.Errorf("Stray backslash in character class.")
|
||||
@@ -483,13 +501,54 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
i++
|
||||
}
|
||||
} else {
|
||||
if !firstCharAdded && re_postfix[i] > 0xF0000 { // It's a metacharacter that I defined, I'll have to convert it back to the regular character before adding it back, because I haven't added any characters yet. For example, '[[]', the second LBRACKET should be treated like a literal bracket.
|
||||
switch re_postfix[i] {
|
||||
case LBRACKET:
|
||||
chars = append(chars, newPostfixCharNode('['))
|
||||
case RBRACKET:
|
||||
chars = append(chars, newPostfixCharNode(']'))
|
||||
default:
|
||||
return nil, fmt.Errorf("Error parsing high-range unicode value in character class.")
|
||||
}
|
||||
}
|
||||
chars = append(chars, newPostfixCharNode(re_postfix[i]))
|
||||
i++
|
||||
}
|
||||
firstCharAdded = true
|
||||
|
||||
if endOfRange { // The previous character was an unescaped hyphen, which (in the context of a character class) means the character that was last appended is the end of a character range
|
||||
// Things to note:
|
||||
// 1. In PCRE and Go's regex engine, a letter _can_ be surrounded by hyphens in a character class.
|
||||
// Eg. [a-b-c]
|
||||
// While you might think this leads to a syntax error (I thought so), the engine picks 'a-b' as a range,
|
||||
// then treats the second '-' and 'c' as regular characters in the character class.
|
||||
// So this regex becomes "Match a character from 'a' to 'b', a literal hyphen, or 'c' ".
|
||||
// 2. To account for this, the following logic is followed:
|
||||
// a. If the second-to-last postfixNode ie. the start of the range has only one element, then we are in a range.
|
||||
// i. If it has more than one element, then we are actually looking at a literal hyphen, and we will treat it as such.
|
||||
// ii. If either the start or the end of the range doesn't exist in 'chars', i.e. something like [-a] or [a-], then we will also treat it as a literal hyphen.
|
||||
// b. The last postfixNode added to 'chars' _must_ only have one character (because it's the end of the range).
|
||||
endRangePostfixNode, err1 := pop(&chars)
|
||||
startRangePostfixNode, err2 := pop(&chars)
|
||||
|
||||
if (err1 != nil || err2 != nil) || len(startRangePostfixNode.contents) != 1 { // Treat it as a regular hyphen
|
||||
chars = append(chars, startRangePostfixNode, newPostfixCharNode('-'), endRangePostfixNode)
|
||||
} else if len(endRangePostfixNode.contents) != 1 { // I don't even know what this would look like, this is just a sanity check
|
||||
return nil, fmt.Errorf("Error parsing character range.")
|
||||
} else {
|
||||
// We have established that they both have a length of 1
|
||||
startRangeRune := startRangePostfixNode.contents[0]
|
||||
endRangeRune := endRangePostfixNode.contents[0]
|
||||
chars = append(chars, newPostfixCharNode(genRange(startRangeRune, endRangeRune+1)...))
|
||||
}
|
||||
|
||||
endOfRange = false // Reset the flag
|
||||
}
|
||||
}
|
||||
if i == len(re_postfix) { // We have reached the end of the string, so we didn't encounter a closing bracket. Return an error.
|
||||
return nil, fmt.Errorf("Opening bracket without closing bracket.")
|
||||
}
|
||||
|
||||
outQueue = append(outQueue, newCharClassNode(chars, invertMatch))
|
||||
continue
|
||||
}
|
||||
@@ -599,6 +658,21 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
func thompson(re []postfixNode) (Reg, error) {
|
||||
nfa := make([]*State, 0) // Stack of states
|
||||
numGroups := 0 // Number of capturing groups
|
||||
|
||||
// If thompson() receives an empty regex, then whatever was given to shuntingYard()
|
||||
// was parsed away. This doesn't mean that the regex itself is empty.
|
||||
// For example, it could have been '(?:)'. This is an empty non-capturing group. Since
|
||||
// shuntingYard() doesn't include non-capturing groups in its output (and the group contains
|
||||
// nothing), the output of shuntingYard() (and the input to thompson()) ends up being empty.
|
||||
// In these cases, we will return an NFA with 1 state, with an assertion that is always true.
|
||||
if len(re) == 0 {
|
||||
start := newState()
|
||||
start.content = newContents(EPSILON)
|
||||
start.isEmpty = true
|
||||
start.assert = ALWAYS_TRUE
|
||||
nfa = append(nfa, &start)
|
||||
}
|
||||
|
||||
for _, c := range re {
|
||||
if c.nodetype == CHARACTER || c.nodetype == ASSERTION {
|
||||
state := State{}
|
||||
@@ -681,8 +755,7 @@ func thompson(re []postfixNode) (Reg, error) {
|
||||
|
||||
// Replace ESC_BACKSLASH with actual backslash, so that we can actually check if we encounter it
|
||||
replaceByValue([]int(state.content), int(ESC_BACKSLASH), '\\')
|
||||
// Uncommenting this seems to make one of the test cases fail. Why?
|
||||
// replaceByValue(state.except, ESC_BACKSLASH, '\\')
|
||||
replaceByValue(state.except, ESC_BACKSLASH, '\\')
|
||||
|
||||
nfa = append(nfa, &state)
|
||||
}
|
||||
@@ -706,15 +779,36 @@ func thompson(re []postfixNode) (Reg, error) {
|
||||
// and then some other node.
|
||||
// These three nodes (LPAREN, the middle node and RPAREN) are extracted together, concatenated
|
||||
// and added back in.
|
||||
// If the middle node doesn't exist (ie. something like '()' ), that's fine, I just connect the LPAREN
|
||||
// and RPAREN nodes.
|
||||
// If neither node exists, that's a problem so I return an error.
|
||||
if c.nodetype == RPAREN {
|
||||
s.groupEnd = true
|
||||
middleNode := mustPop(&nfa)
|
||||
lparenNode := mustPop(&nfa)
|
||||
s.groupNum = lparenNode.groupNum
|
||||
tmp := concatenate(lparenNode, middleNode)
|
||||
to_add := concatenate(tmp, s)
|
||||
nfa = append(nfa, to_add)
|
||||
|
||||
middleNode, err1 := pop(&nfa)
|
||||
lparenNode, err2 := pop(&nfa)
|
||||
if err1 != nil && err2 != nil {
|
||||
return Reg{}, fmt.Errorf("Imbalanced parentheses.")
|
||||
} else if err2 != nil { // There was no third node. ie. something like '()'
|
||||
lparenNode = middleNode
|
||||
if lparenNode.groupBegin != true { // There are only two nodes, but the first one isn't an LPAREN.
|
||||
return Reg{}, fmt.Errorf("Imbalanced parentheses.")
|
||||
}
|
||||
s.groupNum = lparenNode.groupNum
|
||||
to_add := concatenate(lparenNode, s)
|
||||
nfa = append(nfa, to_add)
|
||||
} else {
|
||||
// At this point, we assume all three nodes are valid ('lparenNode', 'middleNode' and 's')
|
||||
if lparenNode.groupBegin {
|
||||
s.groupNum = lparenNode.groupNum
|
||||
} else if middleNode.groupBegin { // Something like 'a()'
|
||||
s.groupNum = middleNode.groupNum
|
||||
} else { // A middleNode and lparenNode exist, but neither is actually an LPAREN.
|
||||
return Reg{}, fmt.Errorf("Imbalanced parentheses.")
|
||||
}
|
||||
tmp := concatenate(lparenNode, middleNode)
|
||||
to_add := concatenate(tmp, s)
|
||||
nfa = append(nfa, to_add)
|
||||
}
|
||||
}
|
||||
}
|
||||
if c.nodetype == CHARCLASS { // A Character class consists of all the nodes in it, alternated
|
||||
@@ -734,9 +828,16 @@ func thompson(re []postfixNode) (Reg, error) {
|
||||
switch c.nodetype {
|
||||
case CONCATENATE:
|
||||
s2 := mustPop(&nfa)
|
||||
s1 := mustPop(&nfa)
|
||||
s1 = concatenate(s1, s2)
|
||||
nfa = append(nfa, s1)
|
||||
// Relax the requirements for concatenation a little bit - If
|
||||
// the second element is not found ie. the postfixNodes look
|
||||
// like 'a~', then that's fine, we just skip the concatenation.
|
||||
s1, err := pop(&nfa)
|
||||
if err != nil {
|
||||
nfa = append(nfa, s2)
|
||||
} else {
|
||||
s1 = concatenate(s1, s2)
|
||||
nfa = append(nfa, s1)
|
||||
}
|
||||
case KLEENE: // Create a 0-state, concat the popped state after it, concat the 0-state after the popped state
|
||||
s1 := mustPop(&nfa)
|
||||
stateToAdd := kleene(*s1)
|
||||
|
24
misc.go
24
misc.go
@@ -15,6 +15,14 @@ var LPAREN_CHAR rune = 0xF0004 // Parentheses in regex are concatenated with thi
|
||||
var RPAREN_CHAR rune = 0xF0005
|
||||
var NONCAPLPAREN_CHAR rune = 0xF0006 // Represents a non-capturing group's LPAREN
|
||||
var ESC_BACKSLASH rune = 0xF0007 // Represents an escaped backslash
|
||||
var CHAR_RANGE rune = 0xF0008 // Represents a character range
|
||||
|
||||
var specialChars = []rune{'?', '*', '\\', '^', '$', '{', '}', '(', ')', '[', ']', '+', '|', '.', '~', '<', '>', LBRACKET, RBRACKET, NONCAPLPAREN_CHAR}
|
||||
|
||||
// An interface for int and rune, which are identical
|
||||
type character interface {
|
||||
int | rune
|
||||
}
|
||||
|
||||
// Returns true if str[idx] and str[idx-1] are separated by a word boundary.
|
||||
func isWordBoundary(str []rune, idx int) bool {
|
||||
@@ -26,9 +34,17 @@ func isWordBoundary(str []rune, idx int) bool {
|
||||
return wbounded
|
||||
}
|
||||
|
||||
func isSpecialChar(c rune) bool {
|
||||
return slices.Contains(specialChars, c)
|
||||
|
||||
}
|
||||
|
||||
// Some special characters have metacharacter replacements. These characters, when encountered in their literal form, can be treated as regular characters.
|
||||
func isSpecialCharWithMetacharReplacement(c rune) bool {
|
||||
return slices.Contains([]rune{'[', ']'}, c)
|
||||
}
|
||||
|
||||
func isNormalChar(c rune) bool {
|
||||
specialChars := []rune(`?*\^${}()+|[].~<>`)
|
||||
specialChars = append(specialChars, LBRACKET, RBRACKET, NONCAPLPAREN_CHAR)
|
||||
return !slices.Contains(specialChars, c)
|
||||
}
|
||||
|
||||
@@ -109,8 +125,8 @@ func Reduce[T any](slc []T, fn func(T, T) T) T {
|
||||
}
|
||||
|
||||
// Generate numbers in a range - start (inclusive) to end (exclusive)
|
||||
func genRange(start, end int) []int {
|
||||
toRet := make([]int, end-start)
|
||||
func genRange[T character](start, end T) []T {
|
||||
toRet := make([]T, end-start)
|
||||
for i := start; i < end; i++ {
|
||||
toRet[i-start] = i
|
||||
}
|
||||
|
12
nfa.go
12
nfa.go
@@ -14,10 +14,11 @@ const (
|
||||
EOS
|
||||
WBOUND
|
||||
NONWBOUND
|
||||
PLA // Positive lookahead
|
||||
NLA // Negative lookahead
|
||||
PLB // Positive lookbehind
|
||||
NLB // Negative lookbehind
|
||||
PLA // Positive lookahead
|
||||
NLA // Negative lookahead
|
||||
PLB // Positive lookbehind
|
||||
NLB // Negative lookbehind
|
||||
ALWAYS_TRUE // An assertion that is always true
|
||||
)
|
||||
|
||||
type State struct {
|
||||
@@ -103,6 +104,9 @@ func cloneStateHelper(state *State, cloneMap map[*State]*State) *State {
|
||||
// Checks if the given state's assertion is true. Returns true if the given
|
||||
// state doesn't have an assertion.
|
||||
func (s State) checkAssertion(str []rune, idx int) bool {
|
||||
if s.assert == ALWAYS_TRUE {
|
||||
return true
|
||||
}
|
||||
if s.assert == SOS {
|
||||
return idx == 0
|
||||
}
|
||||
|
@@ -116,6 +116,13 @@ func newEscapedNode(c rune, inCharClass bool) (postfixNode, error) {
|
||||
case 'v': // Vertical tab
|
||||
toReturn.nodetype = CHARACTER
|
||||
toReturn.contents = append(toReturn.contents, rune(11))
|
||||
case '-': // Literal hyphen - only in character class
|
||||
if inCharClass {
|
||||
toReturn.nodetype = CHARACTER
|
||||
toReturn.contents = append(toReturn.contents, '-')
|
||||
} else {
|
||||
return postfixNode{}, fmt.Errorf("Invalid escape character.")
|
||||
}
|
||||
default: // None of the above - append it as a regular character
|
||||
if isNormalChar(c) { // Normal characters cannot be escaped
|
||||
return postfixNode{}, fmt.Errorf("Invalid escape character.")
|
||||
|
441
re_test.go
441
re_test.go
@@ -1,221 +1,312 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"slices"
|
||||
"testing"
|
||||
)
|
||||
|
||||
var reTests = []struct {
|
||||
re string
|
||||
flags []ReFlag
|
||||
str string
|
||||
result []Group // Stores all zero-groups in the match
|
||||
}{
|
||||
{"a", "abc", []Group{{0, 1}}},
|
||||
{"a", "bca", []Group{{2, 3}}},
|
||||
{"l", "ggllgg", []Group{{2, 3}, {3, 4}}},
|
||||
{"(b|c)", "abdceb", []Group{{1, 2}, {3, 4}, {5, 6}}},
|
||||
{"a+", "brerereraaaaabbbbb", []Group{{8, 13}}},
|
||||
{"ab+", "qweqweqweaqweqweabbbbbr", []Group{{16, 22}}},
|
||||
{"(b|c|A)", "ooaoobocA", []Group{{5, 6}, {7, 8}, {8, 9}}},
|
||||
{"ab*", "a", []Group{{0, 1}}},
|
||||
{"ab*", "abb", []Group{{0, 3}}},
|
||||
{"a*b", "aaab", []Group{{0, 4}}},
|
||||
{"a*b", "qwqw", []Group{}},
|
||||
{"(abc)*", "abcabcabc", []Group{{0, 9}, {9, 9}}},
|
||||
{"((abc)|(def))*", "abcdef", []Group{{0, 6}, {6, 6}}},
|
||||
{"(abc)*|(def)*", "abcdef", []Group{{0, 3}, {3, 6}, {6, 6}}},
|
||||
{"b*a*a", "bba", []Group{{0, 3}}},
|
||||
{"(ab)+", "abcabddd", []Group{{0, 2}, {3, 5}}},
|
||||
{"a(b(c|d)*)*", "abccbd", []Group{{0, 6}}},
|
||||
{"a(b|c)*d+", "abccdd", []Group{{0, 6}}},
|
||||
{"a*", "", []Group{{0, 0}}},
|
||||
{"a|b", "c", []Group{}},
|
||||
{"(a|b)*c", "aabbc", []Group{{0, 5}}},
|
||||
{"a(b|b)", "ab", []Group{{0, 2}}},
|
||||
{"a*", "aaaaaaaa", []Group{{0, 8}, {8, 8}}},
|
||||
{"a", nil, "abc", []Group{{0, 1}}},
|
||||
{"a", nil, "bca", []Group{{2, 3}}},
|
||||
{"l", nil, "ggllgg", []Group{{2, 3}, {3, 4}}},
|
||||
{"(b|c)", nil, "abdceb", []Group{{1, 2}, {3, 4}, {5, 6}}},
|
||||
{"a+", nil, "brerereraaaaabbbbb", []Group{{8, 13}}},
|
||||
{"ab+", nil, "qweqweqweaqweqweabbbbbr", []Group{{16, 22}}},
|
||||
{"(b|c|A)", nil, "ooaoobocA", []Group{{5, 6}, {7, 8}, {8, 9}}},
|
||||
{"ab*", nil, "a", []Group{{0, 1}}},
|
||||
{"ab*", nil, "abb", []Group{{0, 3}}},
|
||||
{"a*b", nil, "aaab", []Group{{0, 4}}},
|
||||
{"a*b", nil, "qwqw", []Group{}},
|
||||
{"(abc)*", nil, "abcabcabc", []Group{{0, 9}, {9, 9}}},
|
||||
{"((abc)|(def))*", nil, "abcdef", []Group{{0, 6}, {6, 6}}},
|
||||
{"(abc)*|(def)*", nil, "abcdef", []Group{{0, 3}, {3, 6}, {6, 6}}},
|
||||
{"b*a*a", nil, "bba", []Group{{0, 3}}},
|
||||
{"(ab)+", nil, "abcabddd", []Group{{0, 2}, {3, 5}}},
|
||||
{"a(b(c|d)*)*", nil, "abccbd", []Group{{0, 6}}},
|
||||
{"a(b|c)*d+", nil, "abccdd", []Group{{0, 6}}},
|
||||
{"a*", nil, "", []Group{{0, 0}}},
|
||||
{"a|b", nil, "c", []Group{}},
|
||||
{"(a|b)*c", nil, "aabbc", []Group{{0, 5}}},
|
||||
{"a(b|b)", nil, "ab", []Group{{0, 2}}},
|
||||
{"a*", nil, "aaaaaaaa", []Group{{0, 8}, {8, 8}}},
|
||||
|
||||
{"ab?", "ab", []Group{{0, 2}}},
|
||||
{"a?b", "ab", []Group{{0, 2}}},
|
||||
{"a?", "", []Group{{0, 0}}},
|
||||
{"a?b?c", "a", []Group{}},
|
||||
{"a?b?c?", "ab", []Group{{0, 2}, {2, 2}}},
|
||||
{"a?b?c?", "ac", []Group{{0, 2}, {2, 2}}},
|
||||
{"a?b?c", "abc", []Group{{0, 3}}},
|
||||
{"a?b?c", "acb", []Group{{0, 2}}},
|
||||
{"ab?", nil, "ab", []Group{{0, 2}}},
|
||||
{"a?b", nil, "ab", []Group{{0, 2}}},
|
||||
{"a?", nil, "", []Group{{0, 0}}},
|
||||
{"a?b?c", nil, "a", []Group{}},
|
||||
{"a?b?c?", nil, "ab", []Group{{0, 2}, {2, 2}}},
|
||||
{"a?b?c?", nil, "ac", []Group{{0, 2}, {2, 2}}},
|
||||
{"a?b?c", nil, "abc", []Group{{0, 3}}},
|
||||
{"a?b?c", nil, "acb", []Group{{0, 2}}},
|
||||
|
||||
{"[abc]", "defadefbdefce", []Group{{3, 4}, {7, 8}, {11, 12}}},
|
||||
{"[ab]c", "ab", []Group{}},
|
||||
{"g[ab]c", "gac", []Group{{0, 3}}},
|
||||
{"g[ab]c", "gbc", []Group{{0, 3}}},
|
||||
{"g[ab]c", "gc", []Group{}},
|
||||
{"g[ab]c", "gfc", []Group{}},
|
||||
{"[ab]*", "aabbbabaababab", []Group{{0, 14}, {14, 14}}},
|
||||
{"[ab]+", "aabbbablaababab", []Group{{0, 7}, {8, 15}}},
|
||||
{"[Ff]r[Uu]it", "fruit", []Group{{0, 5}}},
|
||||
{"[Ff]r[Uu]it", "FrUit", []Group{{0, 5}}},
|
||||
{"[Ff]r[Uu|]it", "Fr|it", []Group{{0, 5}}},
|
||||
{"[Ff]r([Uu]|[pP])it", "Frpit", []Group{{0, 5}}},
|
||||
{"[Ff]r[Uu]|[pP]it", "Frpit", []Group{{2, 5}}},
|
||||
{"[a-zA-Z]+", "Hello, how is it going?", []Group{{0, 5}, {7, 10}, {11, 13}, {14, 16}, {17, 22}}},
|
||||
{"[abc]", nil, "defadefbdefce", []Group{{3, 4}, {7, 8}, {11, 12}}},
|
||||
{"[ab]c", nil, "ab", []Group{}},
|
||||
{"g[ab]c", nil, "gac", []Group{{0, 3}}},
|
||||
{"g[ab]c", nil, "gbc", []Group{{0, 3}}},
|
||||
{"g[ab]c", nil, "gc", []Group{}},
|
||||
{"g[ab]c", nil, "gfc", []Group{}},
|
||||
{"[ab]*", nil, "aabbbabaababab", []Group{{0, 14}, {14, 14}}},
|
||||
{"[ab]+", nil, "aabbbablaababab", []Group{{0, 7}, {8, 15}}},
|
||||
{"[Ff]r[Uu]it", nil, "fruit", []Group{{0, 5}}},
|
||||
{"[Ff]r[Uu]it", nil, "FrUit", []Group{{0, 5}}},
|
||||
{"[Ff]r[Uu|]it", nil, "Fr|it", []Group{{0, 5}}},
|
||||
{"[Ff]r([Uu]|[pP])it", nil, "Frpit", []Group{{0, 5}}},
|
||||
{"[Ff]r[Uu]|[pP]it", nil, "Frpit", []Group{{2, 5}}},
|
||||
{"[a-zA-Z]+", nil, "Hello, how is it going?", []Group{{0, 5}, {7, 10}, {11, 13}, {14, 16}, {17, 22}}},
|
||||
|
||||
{".+", "Hello, how is it going?", []Group{{0, 23}}},
|
||||
{"a.", "a ", []Group{{0, 2}}},
|
||||
{"a.b", "a/b", []Group{{0, 3}}},
|
||||
{".", "a ", []Group{{0, 1}, {1, 2}}},
|
||||
{"a.", "a ", []Group{{0, 2}}},
|
||||
{".+b", "abc", []Group{{0, 2}}},
|
||||
{".+", nil, "Hello, how is it going?", []Group{{0, 23}}},
|
||||
{"a.", nil, "a ", []Group{{0, 2}}},
|
||||
{"a.b", nil, "a/b", []Group{{0, 3}}},
|
||||
{".", nil, "a ", []Group{{0, 1}, {1, 2}}},
|
||||
{"a.", nil, "a ", []Group{{0, 2}}},
|
||||
{".+b", nil, "abc", []Group{{0, 2}}},
|
||||
|
||||
{`\d`, "1a0a3s'''34343s", []Group{{0, 1}, {2, 3}, {4, 5}, {9, 10}, {10, 11}, {11, 12}, {12, 13}, {13, 14}}},
|
||||
{`\\`, `a\b\c\qwe\`, []Group{{1, 2}, {3, 4}, {5, 6}, {9, 10}}},
|
||||
{`\W`, `"Hello", he said. How are you doing?`, []Group{{0, 1}, {6, 7}, {7, 8}, {8, 9}, {11, 12}, {16, 17}, {17, 18}, {21, 22}, {25, 26}, {29, 30}, {35, 36}}},
|
||||
{`\w`, ";';';';';'qwe12", []Group{{10, 11}, {11, 12}, {12, 13}, {13, 14}, {14, 15}}},
|
||||
{`\s`, "a b c d", []Group{{1, 2}, {3, 4}, {5, 6}, {6, 7}}},
|
||||
{`\<`, "<HTML><body>", []Group{{0, 1}, {6, 7}}},
|
||||
{`\(.+\)`, "Not (paranthesized), (so) is (this) not", []Group{{4, 35}}},
|
||||
{`\d`, nil, "1a0a3s'''34343s", []Group{{0, 1}, {2, 3}, {4, 5}, {9, 10}, {10, 11}, {11, 12}, {12, 13}, {13, 14}}},
|
||||
{`\\`, nil, `a\b\c\qwe\`, []Group{{1, 2}, {3, 4}, {5, 6}, {9, 10}}},
|
||||
{`\W`, nil, `"Hello", he said. How are you doing?`, []Group{{0, 1}, {6, 7}, {7, 8}, {8, 9}, {11, 12}, {16, 17}, {17, 18}, {21, 22}, {25, 26}, {29, 30}, {35, 36}}},
|
||||
{`\w`, nil, ";';';';';'qwe12", []Group{{10, 11}, {11, 12}, {12, 13}, {13, 14}, {14, 15}}},
|
||||
{`\s`, nil, "a b c d", []Group{{1, 2}, {3, 4}, {5, 6}, {6, 7}}},
|
||||
{`\<`, nil, "<HTML><body>", []Group{{0, 1}, {6, 7}}},
|
||||
{`\(.+\)`, nil, "Not (paranthesized), (so) is (this) not", []Group{{4, 35}}},
|
||||
|
||||
{"[^abc]+", "qarbtopsaplpclkpasdmb prejip0r,p", []Group{{0, 1}, {2, 3}, {4, 8}, {9, 12}, {13, 16}, {17, 20}, {21, 32}}},
|
||||
{"[^a]+", "qqqaq", []Group{{0, 3}, {4, 5}}},
|
||||
{"[^0-9]+", "a1b2c3dd", []Group{{0, 1}, {2, 3}, {4, 5}, {6, 8}}},
|
||||
{"[^abc]+", "ababababbababaccacacacaca", []Group{}},
|
||||
{`\[`, "a[b[c[]]]", []Group{{1, 2}, {3, 4}, {5, 6}}},
|
||||
{`\([^)]+\)`, "Not (paranthesized), (so) is (this) not", []Group{{4, 19}, {21, 25}, {29, 35}}},
|
||||
{"[^abc]+", nil, "qarbtopsaplpclkpasdmb prejip0r,p", []Group{{0, 1}, {2, 3}, {4, 8}, {9, 12}, {13, 16}, {17, 20}, {21, 32}}},
|
||||
{"[^a]+", nil, "qqqaq", []Group{{0, 3}, {4, 5}}},
|
||||
{"[^0-9]+", nil, "a1b2c3dd", []Group{{0, 1}, {2, 3}, {4, 5}, {6, 8}}},
|
||||
{"[^abc]+", nil, "ababababbababaccacacacaca", []Group{}},
|
||||
{`\[`, nil, "a[b[c[]]]", []Group{{1, 2}, {3, 4}, {5, 6}}},
|
||||
{`\([^)]+\)`, nil, "Not (paranthesized), (so) is (this) not", []Group{{4, 19}, {21, 25}, {29, 35}}},
|
||||
|
||||
{"^ab", "ab bab", []Group{{0, 2}}},
|
||||
{"^aaaa^", "aaaaaaaa", []Group{}},
|
||||
{"^([bB][Gg])", "bG", []Group{{0, 2}}},
|
||||
{"b$", "ba", []Group{}},
|
||||
{"(boy|girl)$", "girlf", []Group{}},
|
||||
{`\bint\b`, "print int integer", []Group{{6, 9}}},
|
||||
{`int\b`, "ints", []Group{}},
|
||||
{`int(\b|a)`, "inta", []Group{{0, 4}}},
|
||||
{`\b\d+\b`, "511 a3 43", []Group{{0, 3}, {7, 9}}},
|
||||
{`\Bint\B`, "prints int integer print", []Group{{2, 5}}},
|
||||
{`^`, "5^3^2", []Group{{0, 0}}},
|
||||
{`\^`, "5^3^2", []Group{{1, 2}, {3, 4}}},
|
||||
{`pool$`, "pool carpool", []Group{{8, 12}}},
|
||||
{`^int$`, "print int integer", []Group{}},
|
||||
{`^int$`, "int", []Group{{0, 3}}},
|
||||
{`b*`, "aaaaaaaaaaqweqwe", []Group{{0, 0}, {1, 1}, {2, 2}, {3, 3}, {4, 4}, {5, 5}, {6, 6}, {7, 7}, {8, 8}, {9, 9}, {10, 10}, {11, 11}, {12, 12}, {13, 13}, {14, 14}, {15, 15}, {16, 16}}},
|
||||
{"^ab", nil, "ab bab", []Group{{0, 2}}},
|
||||
{"^aaaa^", nil, "aaaaaaaa", []Group{}},
|
||||
{"^([bB][Gg])", nil, "bG", []Group{{0, 2}}},
|
||||
{"b$", nil, "ba", []Group{}},
|
||||
{"(boy|girl)$", nil, "girlf", []Group{}},
|
||||
{`\bint\b`, nil, "print int integer", []Group{{6, 9}}},
|
||||
{`int\b`, nil, "ints", []Group{}},
|
||||
{`int(\b|a)`, nil, "inta", []Group{{0, 4}}},
|
||||
{`\b\d+\b`, nil, "511 a3 43", []Group{{0, 3}, {7, 9}}},
|
||||
{`\Bint\B`, nil, "prints int integer print", []Group{{2, 5}}},
|
||||
{`^`, nil, "5^3^2", []Group{{0, 0}}},
|
||||
{`\^`, nil, "5^3^2", []Group{{1, 2}, {3, 4}}},
|
||||
{`pool$`, nil, "pool carpool", []Group{{8, 12}}},
|
||||
{`^int$`, nil, "print int integer", []Group{}},
|
||||
{`^int$`, nil, "int", []Group{{0, 3}}},
|
||||
{`b*`, nil, "aaaaaaaaaaqweqwe", []Group{{0, 0}, {1, 1}, {2, 2}, {3, 3}, {4, 4}, {5, 5}, {6, 6}, {7, 7}, {8, 8}, {9, 9}, {10, 10}, {11, 11}, {12, 12}, {13, 13}, {14, 14}, {15, 15}, {16, 16}}},
|
||||
|
||||
{"a{4}", "aabaaa", []Group{}},
|
||||
{"ab{5}", "abbbbbab", []Group{{0, 6}}},
|
||||
{"(a|b){3,4}", "aba", []Group{{0, 3}}},
|
||||
{"(a|b){3,4}", "ababaa", []Group{{0, 4}}},
|
||||
{"(bc){5,}", "bcbcbcbcbcbcbcbc", []Group{{0, 16}}},
|
||||
{`\d{3,4}`, "1209", []Group{{0, 4}}},
|
||||
{`\d{3,4}`, "109", []Group{{0, 3}}},
|
||||
{`\d{3,4}`, "5", []Group{}},
|
||||
{`\d{3,4}`, "123135", []Group{{0, 4}}},
|
||||
{`\d{3,4}`, "89a-0", []Group{}},
|
||||
{`\d{3,4}`, "ababab555", []Group{{6, 9}}},
|
||||
{`\bpaint\b`, "paints", []Group{}},
|
||||
{`\b\w{5}\b`, "paint", []Group{{0, 5}}},
|
||||
{`[^\w]`, "abcdef1230[]qq';;'", []Group{{10, 11}, {11, 12}, {14, 15}, {15, 16}, {16, 17}, {17, 18}}},
|
||||
{`[^\W]`, "abcdef1230[]qq';;'", []Group{{0, 1}, {1, 2}, {2, 3}, {3, 4}, {4, 5}, {5, 6}, {6, 7}, {7, 8}, {8, 9}, {9, 10}, {12, 13}, {13, 14}}},
|
||||
{`[\[\]]`, "a[b[l]]", []Group{{1, 2}, {3, 4}, {5, 6}, {6, 7}}},
|
||||
{"a{4}", nil, "aabaaa", []Group{}},
|
||||
{"ab{5}", nil, "abbbbbab", []Group{{0, 6}}},
|
||||
{"(a|b){3,4}", nil, "aba", []Group{{0, 3}}},
|
||||
{"(a|b){3,4}", nil, "ababaa", []Group{{0, 4}}},
|
||||
{"(bc){5,}", nil, "bcbcbcbcbcbcbcbc", []Group{{0, 16}}},
|
||||
{`\d{3,4}`, nil, "1209", []Group{{0, 4}}},
|
||||
{`\d{3,4}`, nil, "109", []Group{{0, 3}}},
|
||||
{`\d{3,4}`, nil, "5", []Group{}},
|
||||
{`\d{3,4}`, nil, "123135", []Group{{0, 4}}},
|
||||
{`\d{3,4}`, nil, "89a-0", []Group{}},
|
||||
{`\d{3,4}`, nil, "ababab555", []Group{{6, 9}}},
|
||||
{`\bpaint\b`, nil, "paints", []Group{}},
|
||||
{`\b\w{5}\b`, nil, "paint", []Group{{0, 5}}},
|
||||
{`[^\w]`, nil, "abcdef1230[]qq';;'", []Group{{10, 11}, {11, 12}, {14, 15}, {15, 16}, {16, 17}, {17, 18}}},
|
||||
{`[^\W]`, nil, "abcdef1230[]qq';;'", []Group{{0, 1}, {1, 2}, {2, 3}, {3, 4}, {4, 5}, {5, 6}, {6, 7}, {7, 8}, {8, 9}, {9, 10}, {12, 13}, {13, 14}}},
|
||||
{`[\[\]]`, nil, "a[b[l]]", []Group{{1, 2}, {3, 4}, {5, 6}, {6, 7}}},
|
||||
|
||||
// Unicode tests
|
||||
{`.+`, "úïäö´«åæïëòöê»éãçâï«úïòíñ", []Group{{0, 25}}},
|
||||
{`a.b`, "a²b", []Group{{0, 3}}},
|
||||
{`[^a]+`, "úïäö´«åæïëòöê»éãçâï«úïòíñ", []Group{{0, 25}}},
|
||||
{`.+`, nil, "úïäö´«åæïëòöê»éãçâï«úïòíñ", []Group{{0, 25}}},
|
||||
{`a.b`, nil, "a²b", []Group{{0, 3}}},
|
||||
{`[^a]+`, nil, "úïäö´«åæïëòöê»éãçâï«úïòíñ", []Group{{0, 25}}},
|
||||
|
||||
// Fun experiment - AI-generated tests
|
||||
{"(abc|def|ghi)", "abcdefg", []Group{{0, 3}, {3, 6}}},
|
||||
{"a(b|c)d", "abcd", []Group{}},
|
||||
{"a(b|c)*d", "abcbcd", []Group{{0, 6}}},
|
||||
{"a(b|c)+d", "abcbcd", []Group{{0, 6}}},
|
||||
{"a(b|c)?d", "abd", []Group{{0, 3}}},
|
||||
{".+", "hello world", []Group{{0, 11}}},
|
||||
{"a.b", "aXb", []Group{{0, 3}}},
|
||||
{"a.*b", "aXb", []Group{{0, 3}}},
|
||||
{"a.{2,3}b", "aXXb", []Group{{0, 4}}},
|
||||
{"a.{2,}b", "aXXXb", []Group{{0, 5}}},
|
||||
{"a.{0,3}b", "ab", []Group{{0, 2}}},
|
||||
{"[abc]+", "abcabc", []Group{{0, 6}}},
|
||||
{"[a-zA-Z]+", "HelloWorld", []Group{{0, 10}}},
|
||||
{"[^abc]+", "defghi", []Group{{0, 6}}},
|
||||
{"^hello", "hello world", []Group{{0, 5}}},
|
||||
{"world$", "hello world", []Group{{6, 11}}},
|
||||
{`\bhello\b`, "hello world", []Group{{0, 5}}},
|
||||
{`\Bhello\B`, "hello world", []Group{}},
|
||||
{"(hello|world)", "hello world", []Group{{0, 5}, {6, 11}}},
|
||||
{"(hello|world)+", "hello world", []Group{{0, 5}, {6, 11}}},
|
||||
{"(hello|world)*", "hello world", []Group{{0, 5}, {5, 5}, {6, 11}, {11, 11}}},
|
||||
{"(hello|world)?", "hello world", []Group{{0, 5}, {5, 5}, {6, 11}, {11, 11}}},
|
||||
{"ú.+ï", "úïäö´«åæïëòöê»éãçâï«úïòíñ", []Group{{0, 22}}},
|
||||
{"(?=hello)", "hello world", []Group{{0, 0}}},
|
||||
{"(?!hello)", "hello world", []Group{{1, 1}, {2, 2}, {3, 3}, {4, 4}, {5, 5}, {6, 6}, {7, 7}, {8, 8}, {9, 9}, {10, 10}, {11, 11}}},
|
||||
{"(?<=hello)", "hello world", []Group{{5, 5}}},
|
||||
{"(?<!hello)", "hello world", []Group{{0, 0}, {1, 1}, {2, 2}, {3, 3}, {4, 4}, {6, 6}, {7, 7}, {8, 8}, {9, 9}, {10, 10}, {11, 11}}},
|
||||
{"^((3[7-9])|([4-9][0-9])|([1-9][0-9][0-9])|(1000))$", "40", []Group{{0, 2}}},
|
||||
{"^((3[7-9])|([4-9][0-9])|([1-9][0-9][0-9])|(1000))$", "040", []Group{}},
|
||||
{"^((3[7-9])|([4-9][0-9])|([1-9][0-9][0-9])|(1000))$", "400", []Group{{0, 3}}},
|
||||
{"^((3[7-9])|([4-9][0-9])|([1-9][0-9][0-9])|(1000))$", "4000", []Group{}},
|
||||
{"a{1,3}", "aaaaa", []Group{{0, 3}, {3, 5}}},
|
||||
{`\\[ab\\]`, "a", []Group{}},
|
||||
{`\\[ab\\]`, `\a`, []Group{{0, 2}}},
|
||||
{"(abc|def|ghi)", nil, "abcdefg", []Group{{0, 3}, {3, 6}}},
|
||||
{"a(b|c)d", nil, "abcd", []Group{}},
|
||||
{"a(b|c)*d", nil, "abcbcd", []Group{{0, 6}}},
|
||||
{"a(b|c)+d", nil, "abcbcd", []Group{{0, 6}}},
|
||||
{"a(b|c)?d", nil, "abd", []Group{{0, 3}}},
|
||||
{".+", nil, "hello world", []Group{{0, 11}}},
|
||||
{"a.b", nil, "aXb", []Group{{0, 3}}},
|
||||
{"a.*b", nil, "aXb", []Group{{0, 3}}},
|
||||
{"a.{2,3}b", nil, "aXXb", []Group{{0, 4}}},
|
||||
{"a.{2,}b", nil, "aXXXb", []Group{{0, 5}}},
|
||||
{"a.{0,3}b", nil, "ab", []Group{{0, 2}}},
|
||||
{"[abc]+", nil, "abcabc", []Group{{0, 6}}},
|
||||
{"[a-zA-Z]+", nil, "HelloWorld", []Group{{0, 10}}},
|
||||
{"[^abc]+", nil, "defghi", []Group{{0, 6}}},
|
||||
{"^hello", nil, "hello world", []Group{{0, 5}}},
|
||||
{"world$", nil, "hello world", []Group{{6, 11}}},
|
||||
{`\bhello\b`, nil, "hello world", []Group{{0, 5}}},
|
||||
{`\Bhello\B`, nil, "hello world", []Group{}},
|
||||
{"(hello|world)", nil, "hello world", []Group{{0, 5}, {6, 11}}},
|
||||
{"(hello|world)+", nil, "hello world", []Group{{0, 5}, {6, 11}}},
|
||||
{"(hello|world)*", nil, "hello world", []Group{{0, 5}, {5, 5}, {6, 11}, {11, 11}}},
|
||||
{"(hello|world)?", nil, "hello world", []Group{{0, 5}, {5, 5}, {6, 11}, {11, 11}}},
|
||||
{"ú.+ï", nil, "úïäö´«åæïëòöê»éãçâï«úïòíñ", []Group{{0, 22}}},
|
||||
{"(?=hello)", nil, "hello world", []Group{{0, 0}}},
|
||||
{"(?!hello)", nil, "hello world", []Group{{1, 1}, {2, 2}, {3, 3}, {4, 4}, {5, 5}, {6, 6}, {7, 7}, {8, 8}, {9, 9}, {10, 10}, {11, 11}}},
|
||||
{"(?<=hello)", nil, "hello world", []Group{{5, 5}}},
|
||||
{"(?<!hello)", nil, "hello world", []Group{{0, 0}, {1, 1}, {2, 2}, {3, 3}, {4, 4}, {6, 6}, {7, 7}, {8, 8}, {9, 9}, {10, 10}, {11, 11}}},
|
||||
{"^((3[7-9])|([4-9][0-9])|([1-9][0-9][0-9])|(1000))$", nil, "40", []Group{{0, 2}}},
|
||||
{"^((3[7-9])|([4-9][0-9])|([1-9][0-9][0-9])|(1000))$", nil, "040", []Group{}},
|
||||
{"^((3[7-9])|([4-9][0-9])|([1-9][0-9][0-9])|(1000))$", nil, "400", []Group{{0, 3}}},
|
||||
{"^((3[7-9])|([4-9][0-9])|([1-9][0-9][0-9])|(1000))$", nil, "4000", []Group{}},
|
||||
{"a{1,3}", nil, "aaaaa", []Group{{0, 3}, {3, 5}}},
|
||||
{`\\[ab\\]`, nil, "a", []Group{}},
|
||||
{`\\[ab\\]`, nil, `\a`, []Group{{0, 2}}},
|
||||
|
||||
// Lookaround tests
|
||||
{"(?<=bo)y", "boy", []Group{{2, 3}}},
|
||||
{"bo(?=y)", "boy", []Group{{0, 2}}},
|
||||
{"(?<=f)f+(?=f)", "fffff", []Group{{1, 4}}},
|
||||
{"(?<=f)f+(?=f)", "fffffa", []Group{{1, 4}}},
|
||||
{"(?<=bo)y", nil, "boy", []Group{{2, 3}}},
|
||||
{"bo(?=y)", nil, "boy", []Group{{0, 2}}},
|
||||
{"(?<=f)f+(?=f)", nil, "fffff", []Group{{1, 4}}},
|
||||
{"(?<=f)f+(?=f)", nil, "fffffa", []Group{{1, 4}}},
|
||||
|
||||
// Test cases from Python's RE test suite
|
||||
{`[\1]`, "\x01", []Group{{0, 1}}},
|
||||
{`[\1]`, nil, "\x01", []Group{{0, 1}}},
|
||||
|
||||
{`\0`, "\x00", []Group{{0, 1}}},
|
||||
{`[\0a]`, "\x00", []Group{{0, 1}}},
|
||||
{`[\0a]`, "\x00", []Group{{0, 1}}},
|
||||
{`[a\0]`, "\x00", []Group{{0, 1}}},
|
||||
{`[^a\0]`, "\x00", []Group{}},
|
||||
{`\0`, nil, "\x00", []Group{{0, 1}}},
|
||||
{`[\0a]`, nil, "\x00", []Group{{0, 1}}},
|
||||
{`[\0a]`, nil, "\x00", []Group{{0, 1}}},
|
||||
{`[a\0]`, nil, "\x00", []Group{{0, 1}}},
|
||||
{`[^a\0]`, nil, "\x00", []Group{}},
|
||||
|
||||
{`\a[\b]\f\n\r\t\v`, "\a\b\f\n\r\t\v", []Group{{0, 7}}},
|
||||
{`[\a][\b][\f][\n][\r][\t][\v]`, "\a\b\f\n\r\t\v", []Group{{0, 7}}},
|
||||
{`\u`, "", nil},
|
||||
{`\xff`, "ÿ", []Group{{0, 1}}},
|
||||
{`\x00ffffffffffffff`, "\xff", []Group{}},
|
||||
{`\x00f`, "\x0f", []Group{}},
|
||||
{`\x00fe`, "\xfe", []Group{}},
|
||||
{`^\w+=(\\[\000-\277]|[^\n\\])*`, "SRC=eval.c g.c blah blah blah \\\\\n\tapes.c", []Group{{0, 32}}},
|
||||
{`\a[\b]\f\n\r\t\v`, nil, "\a\b\f\n\r\t\v", []Group{{0, 7}}},
|
||||
{`[\a][\b][\f][\n][\r][\t][\v]`, nil, "\a\b\f\n\r\t\v", []Group{{0, 7}}},
|
||||
{`\u`, nil, "", nil},
|
||||
{`\xff`, nil, "ÿ", []Group{{0, 1}}},
|
||||
{`\x00ffffffffffffff`, nil, "\xff", []Group{}},
|
||||
{`\x00f`, nil, "\x0f", []Group{}},
|
||||
{`\x00fe`, nil, "\xfe", []Group{}},
|
||||
{`^\w+=(\\[\000-\277]|[^\n\\])*`, nil, "SRC=eval.c g.c blah blah blah \\\\\n\tapes.c", []Group{{0, 32}}},
|
||||
|
||||
{`a.b`, nil, `acb`, []Group{{0, 3}}},
|
||||
{`a.b`, nil, "a\nb", []Group{}},
|
||||
{`a.*b`, nil, "acc\nccb", []Group{}},
|
||||
{`a.{4,5}b`, nil, "acc\nccb", []Group{}},
|
||||
{`a.b`, nil, "a\rb", []Group{{0, 3}}},
|
||||
{`a.b`, []ReFlag{RE_MULTILINE}, "a\nb", []Group{{0, 3}}},
|
||||
{`a.*b`, []ReFlag{RE_MULTILINE}, "acc\nccb", []Group{{0, 7}}},
|
||||
{`a.{4,5}b`, []ReFlag{RE_MULTILINE}, "acc\nccb", []Group{{0, 7}}},
|
||||
|
||||
{`)`, nil, ``, nil},
|
||||
{`^$`, nil, ``, []Group{{0, 0}}},
|
||||
{`abc`, nil, `abc`, []Group{{0, 3}}},
|
||||
{`abc`, nil, `xbc`, []Group{}},
|
||||
{`abc`, nil, `axc`, []Group{}},
|
||||
{`abc`, nil, `abx`, []Group{}},
|
||||
{`abc`, nil, `xabcy`, []Group{{1, 4}}},
|
||||
{`abc`, nil, `ababc`, []Group{{2, 5}}},
|
||||
{`ab*c`, nil, `abc`, []Group{{0, 3}}},
|
||||
{`ab*bc`, nil, `abc`, []Group{{0, 3}}},
|
||||
{`ab*bc`, nil, `abbc`, []Group{{0, 4}}},
|
||||
{`ab*bc`, nil, `abbbbc`, []Group{{0, 6}}},
|
||||
{`ab+bc`, nil, `abbc`, []Group{{0, 4}}},
|
||||
{`ab+bc`, nil, `abc`, []Group{}},
|
||||
{`ab+bc`, nil, `abq`, []Group{}},
|
||||
{`ab+bc`, nil, `abbbbc`, []Group{{0, 6}}},
|
||||
{`ab?bc`, nil, `abbc`, []Group{{0, 4}}},
|
||||
{`ab?bc`, nil, `abc`, []Group{{0, 3}}},
|
||||
{`ab?bc`, nil, `abbbbc`, []Group{}},
|
||||
{`ab?c`, nil, `abc`, []Group{{0, 3}}},
|
||||
{`^abc$`, nil, `abc`, []Group{{0, 3}}},
|
||||
{`^abc$`, nil, `abcc`, []Group{}},
|
||||
{`^abc`, nil, `abcc`, []Group{{0, 3}}},
|
||||
{`^abc$`, nil, `aabc`, []Group{}},
|
||||
{`abc$`, nil, `aabc`, []Group{{1, 4}}},
|
||||
{`^`, nil, `abc`, []Group{{0, 0}}},
|
||||
{`$`, nil, `abc`, []Group{{3, 3}}},
|
||||
{`a.c`, nil, `abc`, []Group{{0, 3}}},
|
||||
{`a.c`, nil, `axc`, []Group{{0, 3}}},
|
||||
{`a.*c`, nil, `axyzc`, []Group{{0, 5}}},
|
||||
{`a.*c`, nil, `axyzd`, []Group{}},
|
||||
{`a[bc]d`, nil, `abc`, []Group{}},
|
||||
{`a[bc]d`, nil, `abd`, []Group{{0, 3}}},
|
||||
{`a[b-d]e`, nil, `abd`, []Group{}},
|
||||
{`a[b-d]e`, nil, `ace`, []Group{{0, 3}}},
|
||||
{`a[b-d]`, nil, `aac`, []Group{{1, 3}}},
|
||||
{`a[-b]`, nil, `a-`, []Group{{0, 2}}}, // If a character class has a hyphen without a start or end character, it is treated as a literal hyphen
|
||||
{`a[\-b]`, nil, `a-`, []Group{{0, 2}}},
|
||||
{`a[b-]`, nil, `a-`, []Group{{0, 2}}}, // If a character class has a hyphen without a start or end character, it is treated as a literal hyphen
|
||||
|
||||
{`a[]b`, nil, `-`, nil},
|
||||
{`a[`, nil, `-`, nil},
|
||||
{`a\`, nil, `-`, nil},
|
||||
{`abc)`, nil, `-`, nil},
|
||||
{`(abc`, nil, `-`, nil},
|
||||
{`a]`, nil, `a]`, []Group{{0, 2}}},
|
||||
{`a[]]b`, nil, `a]b`, []Group{{0, 3}}},
|
||||
{`a[\]]b`, nil, `a]b`, []Group{{0, 3}}},
|
||||
{`a[^bc]d`, nil, `aed`, []Group{{0, 3}}},
|
||||
{`a[^bc]d`, nil, `abd`, []Group{}},
|
||||
{`a[^-b]c`, nil, `adc`, []Group{{0, 3}}},
|
||||
{`a[^-b]c`, nil, `a-c`, []Group{}},
|
||||
{`a[^]b]c`, nil, `a]c`, []Group{}},
|
||||
{`a[^]b]c`, nil, `adc`, []Group{{0, 3}}},
|
||||
{`\ba\b`, nil, `a-`, []Group{{0, 1}}},
|
||||
{`\ba\b`, nil, `-a`, []Group{{1, 2}}},
|
||||
{`\ba\b`, nil, `-a-`, []Group{{1, 2}}},
|
||||
{`\by\b`, nil, `xy`, []Group{}},
|
||||
{`\by\b`, nil, `yz`, []Group{}},
|
||||
{`\by\b`, nil, `xyz`, []Group{}},
|
||||
{`x\b`, nil, `xyz`, []Group{}},
|
||||
{`x\B`, nil, `xyz`, []Group{{0, 1}}},
|
||||
{`\Bz`, nil, `xyz`, []Group{{2, 3}}},
|
||||
{`z\B`, nil, `xyz`, []Group{}},
|
||||
{`\Bx`, nil, `xyz`, []Group{}},
|
||||
{`\Ba\B`, nil, `a-`, []Group{}},
|
||||
{`\Ba\B`, nil, `-a`, []Group{}},
|
||||
{`\Ba\B`, nil, `-a-`, []Group{}},
|
||||
{`\By\B`, nil, `xy`, []Group{}},
|
||||
{`\By\B`, nil, `yz`, []Group{}},
|
||||
{`\By\b`, nil, `xy`, []Group{{1, 2}}},
|
||||
{`\by\B`, nil, `yz`, []Group{{0, 1}}},
|
||||
{`\By\B`, nil, `xyz`, []Group{{1, 2}}},
|
||||
{`ab|cd`, nil, `abc`, []Group{{0, 2}}},
|
||||
{`ab|cd`, nil, `abcd`, []Group{{0, 2}, {2, 4}}},
|
||||
|
||||
// Todo - add numeric range tests
|
||||
}
|
||||
|
||||
var groupTests = []struct {
|
||||
re string
|
||||
flags []ReFlag
|
||||
str string
|
||||
result []Match
|
||||
}{
|
||||
{"(a)(b)", "ab", []Match{[]Group{{0, 2}, {0, 1}, {1, 2}}}},
|
||||
{"((a))(b)", "ab", []Match{[]Group{{0, 2}, {0, 1}, {0, 1}, {1, 2}}}},
|
||||
{"(0)", "ab", []Match{[]Group{}}},
|
||||
{"(a)b", "ab", []Match{[]Group{{0, 2}, {0, 1}}}},
|
||||
{"a(b)", "ab", []Match{[]Group{{0, 2}, {1, 2}}}},
|
||||
{"(a|b)", "ab", []Match{[]Group{{0, 1}, {0, 1}}, []Group{{1, 2}, {1, 2}}}},
|
||||
{"(a)|(b)", "ab", []Match{[]Group{{0, 1}, {0, 1}, {-1, -1}}, []Group{{1, 2}, {-1, -1}, {1, 2}}}},
|
||||
{"(a+)(a)", "aaaa", []Match{[]Group{{0, 4}, {0, 3}, {3, 4}}}},
|
||||
{"(a+)|(a)", "aaaa", []Match{[]Group{{0, 4}, {0, 4}, {-1, -1}}}},
|
||||
{"(a+)(aa)", "aaaa", []Match{[]Group{{0, 4}, {0, 2}, {2, 4}}}},
|
||||
{"(aaaa)|(aaaa)", "aaaa", []Match{[]Group{{0, 4}, {0, 4}, {-1, -1}}}},
|
||||
{"(aaa)|(aaaa)", "aaaa", []Match{[]Group{{0, 4}, {-1, -1}, {0, 4}}}},
|
||||
{"(aaa)|(aaaa)", "aaaa", []Match{[]Group{{0, 4}, {-1, -1}, {0, 4}}}},
|
||||
{"(aaaa)|(aaa)", "aaaa", []Match{[]Group{{0, 4}, {0, 4}, {-1, -1}}}},
|
||||
{"(a)|(aa)", "aa", []Match{[]Group{{0, 2}, {-1, -1}, {0, 2}}}},
|
||||
{"(a?)a?", "b", []Match{[]Group{{0, 0}, {0, 0}}, []Group{{1, 1}, {1, 1}}}},
|
||||
{"(a?)a?", "ab", []Match{[]Group{{0, 1}, {0, 1}}, []Group{{1, 1}, {1, 1}}, []Group{{2, 2}, {2, 2}}}},
|
||||
{"(a?)a?", "aa", []Match{[]Group{{0, 2}, {0, 1}}, []Group{{2, 2}, {2, 2}}}},
|
||||
{"a((b.d){3})", "abfdbhdbid", []Match{[]Group{{0, 10}, {1, 10}, {7, 10}}}},
|
||||
{`(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)(l)\071`, `abcdefghijkl9`, []Match{[]Group{{0, 13}, {0, 1}, {1, 2}, {2, 3}, {3, 4}, {4, 5}, {5, 6}, {6, 7}, {7, 8}, {8, 9}, {9, 10}, {10, 11}, {11, 12}}}},
|
||||
{"(a)(b)", nil, "ab", []Match{[]Group{{0, 2}, {0, 1}, {1, 2}}}},
|
||||
{"((a))(b)", nil, "ab", []Match{[]Group{{0, 2}, {0, 1}, {0, 1}, {1, 2}}}},
|
||||
{"(0)", nil, "ab", []Match{[]Group{}}},
|
||||
{"(a)b", nil, "ab", []Match{[]Group{{0, 2}, {0, 1}}}},
|
||||
{"a(b)", nil, "ab", []Match{[]Group{{0, 2}, {1, 2}}}},
|
||||
{"(a|b)", nil, "ab", []Match{[]Group{{0, 1}, {0, 1}}, []Group{{1, 2}, {1, 2}}}},
|
||||
{"(a)|(b)", nil, "ab", []Match{[]Group{{0, 1}, {0, 1}, {-1, -1}}, []Group{{1, 2}, {-1, -1}, {1, 2}}}},
|
||||
{"(a+)(a)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 3}, {3, 4}}}},
|
||||
{"(a+)|(a)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 4}, {-1, -1}}}},
|
||||
{"(a+)(aa)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 2}, {2, 4}}}},
|
||||
{"(aaaa)|(aaaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 4}, {-1, -1}}}},
|
||||
{"(aaa)|(aaaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {-1, -1}, {0, 4}}}},
|
||||
{"(aaa)|(aaaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {-1, -1}, {0, 4}}}},
|
||||
{"(aaaa)|(aaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 4}, {-1, -1}}}},
|
||||
{"(a)|(aa)", nil, "aa", []Match{[]Group{{0, 2}, {-1, -1}, {0, 2}}}},
|
||||
{"(a?)a?", nil, "b", []Match{[]Group{{0, 0}, {0, 0}}, []Group{{1, 1}, {1, 1}}}},
|
||||
{"(a?)a?", nil, "ab", []Match{[]Group{{0, 1}, {0, 1}}, []Group{{1, 1}, {1, 1}}, []Group{{2, 2}, {2, 2}}}},
|
||||
{"(a?)a?", nil, "aa", []Match{[]Group{{0, 2}, {0, 1}}, []Group{{2, 2}, {2, 2}}}},
|
||||
{"a((b.d){3})", nil, "abfdbhdbid", []Match{[]Group{{0, 10}, {1, 10}, {7, 10}}}},
|
||||
{`(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)(l)\071`, nil, `abcdefghijkl9`, []Match{[]Group{{0, 13}, {0, 1}, {1, 2}, {2, 3}, {3, 4}, {4, 5}, {5, 6}, {6, 7}, {7, 8}, {8, 9}, {9, 10}, {10, 11}, {11, 12}}}},
|
||||
{`()ef`, nil, `def`, []Match{[]Group{{1, 3}, {1, 1}}}},
|
||||
{`(?:)ef`, nil, `def`, []Match{[]Group{{1, 3}}}},
|
||||
{`(?:)`, nil, `def`, []Match{[]Group{{0, 0}}, []Group{{1, 1}}, []Group{{2, 2}}, []Group{{3, 3}}}},
|
||||
}
|
||||
|
||||
func TestFindAllMatches(t *testing.T) {
|
||||
for _, test := range reTests {
|
||||
t.Run(test.re+" "+test.str, func(t *testing.T) {
|
||||
regComp, err := Compile(test.re)
|
||||
regComp, err := Compile(test.re, test.flags...)
|
||||
if err != nil {
|
||||
if test.result != nil {
|
||||
panic(err)
|
||||
panic(fmt.Errorf("Test Error: %v", err))
|
||||
}
|
||||
} else {
|
||||
matchIndices := FindAllMatches(regComp, test.str)
|
||||
@@ -234,7 +325,7 @@ func TestFindAllMatches(t *testing.T) {
|
||||
func TestFindString(t *testing.T) {
|
||||
for _, test := range reTests {
|
||||
t.Run(test.re+" "+test.str, func(t *testing.T) {
|
||||
regComp, err := Compile(test.re)
|
||||
regComp, err := Compile(test.re, test.flags...)
|
||||
if err != nil {
|
||||
if test.result != nil {
|
||||
panic(err)
|
||||
@@ -259,7 +350,7 @@ func TestFindString(t *testing.T) {
|
||||
func TestFindAllGroups(t *testing.T) {
|
||||
for _, test := range groupTests {
|
||||
t.Run(test.re+" "+test.str, func(t *testing.T) {
|
||||
regComp, err := Compile(test.re)
|
||||
regComp, err := Compile(test.re, test.flags...)
|
||||
if err != nil {
|
||||
if test.result != nil {
|
||||
panic(err)
|
||||
|
Reference in New Issue
Block a user