11 Commits

Author SHA1 Message Date
Aadhavan Srinivasan
ccb82f781b Enforce the rule that character classes must have at least one character; interpret literal closing brackets as regular characters 2025-01-24 15:50:36 -05:00
Aadhavan Srinivasan
09bbf8d3f1 Refactored isNormalChar(), wrote function to get special characters that have metachar replacements 2025-01-24 15:49:33 -05:00
Aadhavan Srinivasan
d5b4450e50 Added more test cases (1 failing) 2025-01-24 14:58:18 -05:00
Aadhavan Srinivasan
45827b5dd3 Allow hyphen to be escaped inside character class 2025-01-24 14:58:07 -05:00
Aadhavan Srinivasan
c26edcb0c4 Fixed edge cases with character ranges and character classes 2025-01-24 14:57:47 -05:00
Aadhavan Srinivasan
110298b6a6 Added 'flags' field to test struct for all-group tests 2025-01-24 11:11:48 -05:00
Aadhavan Srinivasan
eff4c5a5df Added 'flags' field to test struct for 0-group tests 2025-01-24 11:10:01 -05:00
0bd7a87797 Removed old comment 2025-01-22 20:27:35 -05:00
9cf1c66653 Implemented character range detection later in the code, using a metacharacter 2025-01-22 20:26:58 -05:00
9edc99d73c Modified genRange() so that it can work on ints and runes 2025-01-22 20:25:49 -05:00
Aadhavan Srinivasan
6850396bf9 Removed character range creation from the first part of shuntingYard() (the part that adds concatenation operators), because octal and hex values haven't yet been deciphered at this point in the code 2025-01-22 16:51:00 -05:00
25 changed files with 2108 additions and 4753 deletions

11
LICENSE
View File

@@ -1,11 +0,0 @@
The MIT License (MIT)
Copyright (c) 2025 Aadhavan Srinivasan
All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

View File

@@ -1,13 +1,9 @@
# NOTE(review): .DEFAULT_GOAL and .PHONY are each assigned twice below, and both
# the buildLib/buildCmd/test targets and a single build target are present. This
# looks like the old and new sides of a diff shown together - confirm which set
# is current before relying on it.
.DEFAULT_GOAL := buildCmd
.PHONY: fmt vet buildLib buildCmd test
.DEFAULT_GOAL := build
.PHONY: fmt vet build
# fmt: format every package in the module.
fmt:
go fmt ./...
# vet: run go vet on every package; runs fmt first.
vet: fmt
go vet ./...
# buildLib: compile every package after vet has run. The -N -l compiler flags
# disable optimizations and inlining (debugger-friendly builds).
buildLib: vet
go build -gcflags="all=-N -l" ./...
# buildCmd: build the command under cmd/ into a binary named "re"; runs buildLib first.
buildCmd: buildLib
go build -C cmd/ -gcflags="all=-N -l" -o re ./...
# test: run the test suite verbosely; runs buildCmd first.
test: buildCmd
go test -v ./...
# build: compile every package after vet has run, with optimizations and inlining disabled.
build: vet
go build -gcflags="-N -l" ./...

View File

@@ -1,27 +0,0 @@
package main
import "slices"
// character is a type constraint satisfied by int and rune. It is used by
// genRange so that the same range generator works for plain integers and for
// Unicode code points.
type character interface {
	int | rune
}
// setDifference returns the elements of s1 that do not occur in s2, keeping
// their original order (duplicates in s1 are kept as well). The result is
// always a non-nil slice, even when it is empty.
func setDifference[T comparable](s1 []T, s2 []T) []T {
	diff := make([]T, 0, len(s1))
	for idx := range s1 {
		if slices.Contains(s2, s1[idx]) {
			continue // Present in s2 - not part of the difference
		}
		diff = append(diff, s1[idx])
	}
	return diff
}
// genRange returns every value from start (inclusive) to end (exclusive), in
// ascending order. It works for both ints and runes.
//
// When end <= start the range is empty and an empty (non-nil) slice is
// returned. Without this guard a reversed range would ask make() for a
// negative length, which panics at runtime.
func genRange[T character](start, end T) []T {
	if end <= start {
		return []T{}
	}
	toRet := make([]T, 0, end-start)
	for i := start; i < end; i++ {
		toRet = append(toRet, i)
	}
	return toRet
}

854
compile.go Normal file
View File

@@ -0,0 +1,854 @@
package main
import (
"fmt"
"slices"
"strconv"
"unicode"
)
// notDotChars holds a list of all characters that are _not_ matched by the dot metacharacter.
// It is package-level state that shuntingYard overwrites on every call: it is
// emptied when RE_MULTILINE is passed, and set to just '\n' otherwise.
var notDotChars []rune

// A Reg represents the result of compiling a regular expression. It contains
// the startState of the NFA representation of the regex, and the number of capturing
// groups in the regex.
type Reg struct {
	start     *State // Entry state of the compiled NFA
	numGroups int    // Number of capturing groups found while compiling
}

// CONCAT is the explicit concatenation operator that shuntingYard inserts
// between adjacent operands before converting the expression to postfix.
// NOTE(review): a literal '~' in the input looks identical to this operator
// once inserted - confirm how literal tildes are handled.
const CONCAT rune = '~'
// ReFlag is a flag that controls the behavior of shuntingYard. Flags are
// passed as a variadic list and looked up by value (slices.Contains); the
// constants below are sequential iota values, not bit masks, so they must not
// be OR-ed together.
type ReFlag int

const (
	// RE_NO_FLAGS is the zero value; shuntingYard does not test for it.
	RE_NO_FLAGS ReFlag = iota
	// RE_CASE_INSENSITIVE makes shuntingYard expand each literal character via allCases().
	RE_CASE_INSENSITIVE
	// RE_MULTILINE empties notDotChars, so the dot metacharacter also matches a newline.
	RE_MULTILINE
)
// isOperator reports whether c is one of the regex operators handled by the
// shunting-yard pass: '+', '?', '*', '|' or the explicit CONCAT operator.
func isOperator(c rune) bool {
	switch c {
	case '+', '?', '*', '|', CONCAT:
		return true
	default:
		return false
	}
}
// priority returns the precedence of the given operator: a larger value binds
// more tightly. From lowest to highest the order is '|', CONCAT, '+', '*', '?'.
// Any rune that is not one of these operators yields -1.
func priority(op rune) int {
	ranking := [...]rune{'|', CONCAT, '+', '*', '?'}
	return slices.Index(ranking[:], op)
}
// shuntingYard converts the given infix (regular) expression to postfix using
// the Shunting-Yard algorithm. The primary benefit of this is getting rid of
// parentheses. It also inserts explicit concatenation operators to make
// parsing easier in Thompson's algorithm.
//
// The work is done in three passes:
//  1. Rewrite pass: numeric ranges (<num1-num2>) are replaced by an equivalent
//     regex, "(?:" is replaced by NONCAPLPAREN_CHAR and escaped backslashes are
//     replaced by ESC_BACKSLASH.
//  2. Concatenation pass: explicit CONCAT operators are inserted; character
//     classes, numeric specifiers, escapes and lookarounds are copied through.
//  3. The shunting-yard pass proper, which builds the postfixNode queue.
//
// The function takes 0 or more flags, which control the behavior of the
// parser: RE_MULTILINE makes the dot metacharacter match a newline, and
// RE_CASE_INSENSITIVE expands literal characters to all of their cases.
//
// Side effect: the package-level notDotChars variable is overwritten on every call.
//
// An error can be returned for a multitude of reasons - the reason is
// specified in the error string. Malformed input is always reported through
// the returned error.
//
// See: https://blog.cernera.me/converting-regular-expressions-to-postfix-notation-with-the-shunting-yard-algorithm/
func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
	// Check which flags are enabled
	caseInsensitive := false
	// In Multiline mode, the newline character is considered a
	// 'dot' character ie. the dot metacharacter matches a newline as well.
	if slices.Contains(flags, RE_MULTILINE) {
		notDotChars = []rune{}
	} else {
		notDotChars = []rune{'\n'}
	}
	if slices.Contains(flags, RE_CASE_INSENSITIVE) {
		caseInsensitive = true
	}

	re_postfix := make([]rune, 0)
	// Convert the string to a slice of runes to allow iteration through it
	re_runes_orig := []rune(re) // This is the rune slice before the first parsing loop (which detects and replaces numeric ranges)
	re_runes := make([]rune, 0)
	// Check for numeric range. If we are at the start of a numeric range,
	// skip to end and construct the equivalent regex for the range.
	// The reason this is outside the loop below, is that it actually modifies
	// the given regex (we 'cut' the numeric range and 'paste' an equivalent regex).
	// It also makes the overall parsing easier, since the later passes don't have to worry
	// about the numeric range anymore.
	// A numeric range has the syntax: <num1-num2>. It matches all numbers in this range.
	//
	// Also check for non-capturing groups. The LPAREN of a non-capturing group looks like this: '(?:'
	// This is taken out and replaced with a special character - NONCAPLPAREN_CHAR.
	//
	// Finally, check for escaped backslashes. Replace these with the ESC_BACKSLASH metacharacter. Later, in thompson(),
	// these will be converted back. This avoids confusion in detecting whether a character is escaped eg. detecting
	// whether '\\[a]' has an escaped opening bracket (it doesn't).
	for i := 0; i < len(re_runes_orig); i++ {
		c := re_runes_orig[i]
		if c == '<' && (i == 0 || (re_runes_orig[i-1] != '\\' && re_runes_orig[i-1] != '?')) {
			i++ // Step over opening angle bracket
			tmpStr := ""
			hyphenFound := false
			for i < len(re_runes_orig) && re_runes_orig[i] != '>' {
				if !unicode.IsDigit(re_runes_orig[i]) {
					if re_runes_orig[i] != '-' || (hyphenFound) {
						return nil, fmt.Errorf("Invalid numeric range.")
					}
				}
				if re_runes_orig[i] == '-' {
					hyphenFound = true
				}
				tmpStr += string(re_runes_orig[i])
				i++
			}
			// End of string reached and last character doesn't close the range
			if i == len(re_runes_orig) && re_runes_orig[len(re_runes_orig)-1] != '>' {
				return nil, fmt.Errorf("Numeric range not closed.")
			}
			if len(tmpStr) == 0 {
				return nil, fmt.Errorf("Empty numeric range.")
			}
			// Closing bracket will be skipped when the loop variable increments
			var rangeStart int
			var rangeEnd int
			// Both bounds must parse. The check above only guarantees "digits and at most one hyphen",
			// so inputs such as <5>, <5-> or <-5> get this far and must be rejected here rather than
			// silently producing a range with a zero bound.
			if n, err := fmt.Sscanf(tmpStr, "%d-%d", &rangeStart, &rangeEnd); err != nil || n != 2 {
				return nil, fmt.Errorf("Invalid numeric range.")
			}
			regex := range2regex(rangeStart, rangeEnd)
			re_runes = append(re_runes, []rune(regex)...)
		} else if c == '(' && i < len(re_runes_orig)-2 && re_runes_orig[i+1] == '?' && re_runes_orig[i+2] == ':' {
			re_runes = append(re_runes, NONCAPLPAREN_CHAR)
			i += 2
		} else if c == '\\' && i < len(re_runes_orig)-1 && re_runes_orig[i+1] == '\\' { // Escaped backslash
			re_runes = append(re_runes, ESC_BACKSLASH)
			i++
		} else {
			re_runes = append(re_runes, c)
		}
	}

	/* Add concatenation operators.
	Only add a concatenation operator between two characters if both the following conditions are met:
		1. The first character isn't an opening parenthesis or alternation operator (or an escape character)
			a. This makes sense, because these operators can't be _concatenated_ with anything else.
		2. The second character isn't a 'closing operator' - one that applies to something before it
			a. Again, these operators can't be concatenated _to_. They can, however, be concatenated _from_.
	Caveats:
		1. Don't mess with anything inside brackets - character class
		2. Don't mess with anything inside braces - numeric repetition
		3. Don't mess with any lookarounds.
	*/
	i := 0
	for i < len(re_runes) {
		re_postfix = append(re_postfix, re_runes[i])
		if re_runes[i] == '[' && (i == 0 || re_runes[i-1] != '\\') { // We do not touch things inside brackets, unless they are escaped. Inside this block, the only task is to mark character ranges with CHAR_RANGE.
			re_postfix[len(re_postfix)-1] = LBRACKET // Replace the '[' character with LBRACKET. This allows for easier parsing of all characters (including opening and closing brackets) within the character class
			toAppend := make([]rune, 0)              // Holds all the runes in the current character class
			if i < len(re_runes)-1 && re_runes[i+1] == '^' { // Inverting class - match everything NOT in brackets
				re_postfix = append(re_postfix, '^')
				i++ // Skip opening bracket and caret
			}
			if i < len(re_runes)-1 && re_runes[i+1] == ']' { // Nothing inside brackets - error.
				return nil, fmt.Errorf("Empty character class.")
			}
			for re_runes[i] != ']' || i == 0 || re_runes[i-1] == '\\' {
				i++ // Skip all characters inside _unescaped_ brackets (we are _not_ at a closing bracket, or if we are, the previous character is a backslash)
				// Make sure we haven't exceeded the length of the string. If we did, then the regex doesn't actually have a closing bracket and we should return an error.
				if i >= len(re_runes) {
					return nil, fmt.Errorf("Opening bracket without closing bracket.")
				}
				if re_runes[i] == '-' && (i > 0 && re_runes[i-1] != '\\') && (i < len(re_runes)-1 && re_runes[i+1] != ']') { // Unescaped hyphen, that has some character (not a RBRACKET) after it - This represents a character range, so we replace with CHAR_RANGE. This metacharacter will be used later on to construct the range
					re_runes[i] = CHAR_RANGE
				}
				toAppend = append(toAppend, re_runes[i])
			}
			// Replace the last character (which should have been ']') with RBRACKET
			toAppend[len(toAppend)-1] = RBRACKET
			re_postfix = append(re_postfix, toAppend...)
		}
		if i < len(re_runes) && re_runes[i] == '{' && (i > 0 && re_runes[i-1] != '\\') { // We don't touch things inside braces, either
			i++ // Skip opening brace
			for i < len(re_runes) && re_runes[i] != '}' {
				re_postfix = append(re_postfix, re_runes[i])
				i++
			}
			if i == len(re_runes) {
				return nil, fmt.Errorf("Invalid numeric specifier.")
			}
			re_postfix = append(re_postfix, re_runes[i]) // Append closing brace
		}
		if i < len(re_runes)-3 && string(re_runes[i+1:i+4]) == "(?:" { // Non-capturing lparen
			re_postfix = append(re_postfix, NONCAPLPAREN_CHAR)
			i += 3
		}
		if i < len(re_runes) && re_runes[i] == '\\' { // Something is being escaped (the backslash isn't added to re_postfix here, because it was already added at the top of the loop)
			i++
			if i >= len(re_runes) {
				return nil, fmt.Errorf("Stray backslash in expression.")
			}
			if re_runes[i] == 'x' {
				re_postfix = append(re_postfix, re_runes[i])
				i++
				if i >= len(re_runes) {
					return nil, fmt.Errorf("Stray backslash in expression.")
				}
				if re_runes[i] == '{' {
					// An expanded hex code occupies 8 runes: '{', six hex digits and '}'.
					// Check the bounds _before_ slicing; slicing past the end either panics or
					// (when the backing array has spare capacity) copies garbage runes.
					if i+8 > len(re_runes) {
						return nil, fmt.Errorf("Invalid hex value in expression.")
					}
					re_postfix = append(re_postfix, re_runes[i:i+8]...)
					i += 7 // Land on the closing brace; the increment at the bottom of the loop moves past it
				} else if isHex(re_runes[i]) {
					// A two-digit hex code needs two runes.
					if i+2 > len(re_runes) {
						return nil, fmt.Errorf("Invalid hex value in expression.")
					}
					re_postfix = append(re_postfix, re_runes[i:i+2]...)
					i++ // Land on the second hex digit (not past it), so that a concatenation operator can be added if necessary and the increment at the bottom of the loop doesn't skip the next character
				} else {
					return nil, fmt.Errorf("Invalid hex value in expression.")
				}
			} else if isOctal(re_runes[i]) {
				numDigits := 1
				for i+numDigits < len(re_runes) && numDigits < 3 && isOctal(re_runes[i+numDigits]) { // Skip while we see an octal character (max of 3)
					numDigits++
				}
				re_postfix = append(re_postfix, re_runes[i:i+numDigits]...)
				i += (numDigits - 1) // Land on the last digit, so that a concatenation operator can be added if necessary, and so that the increment at the bottom of the loop works as intended
			} else {
				re_postfix = append(re_postfix, re_runes[i])
			}
		}
		if i < len(re_runes) && re_runes[i] == '(' && (i == 0 || re_runes[i-1] != '\\') && (i < len(re_runes)-2 && re_runes[i+1] == '?' && slices.Contains([]rune{'=', '!', '<'}, re_runes[i+2])) { // Unescaped open parenthesis followed by question mark then '<', '!' or '=' => lookaround. Don't mess with it.
			i++ // Step inside
			if i == len(re_runes)-1 || (re_runes[i+1] != '=' && re_runes[i+1] != '!' && re_runes[i+1] != '<') {
				return nil, fmt.Errorf("Invalid regex. Lookaround intended?")
			}
			re_postfix = append(re_postfix, re_runes[i])
			i++
			numOpenParens := 1
			for numOpenParens != 0 {
				if i >= len(re_runes) {
					return nil, fmt.Errorf("Unclosed lookaround.")
				}
				if re_runes[i] == '(' || re_runes[i] == NONCAPLPAREN_CHAR {
					numOpenParens++
				}
				if re_runes[i] == ')' {
					numOpenParens--
					if numOpenParens == 0 {
						break
					}
				}
				re_postfix = append(re_postfix, re_runes[i])
				i++
			}
			continue // i is on the closing parenthesis, which the next iteration appends
		}
		if i < len(re_runes) && (re_runes[i] != '(' && re_runes[i] != NONCAPLPAREN_CHAR && re_runes[i] != '|' && re_runes[i] != '\\') || (i > 0 && re_runes[i-1] == '\\') { // Every character should be concatenated if it is escaped
			if i < len(re_runes)-1 {
				if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != '+' && re_runes[i+1] != '?' && re_runes[i+1] != ')' && re_runes[i+1] != '{' {
					re_postfix = append(re_postfix, CONCAT)
				}
			}
		}
		i++
	}

	opStack := make([]rune, 0)         // Operator stack
	outQueue := make([]postfixNode, 0) // Output queue

	// Actual algorithm
	numOpenParens := 0 // Number of open parentheses
	for i := 0; i < len(re_postfix); i++ {
		/* Cases:
		1. Current character is alphanumeric - send to output queue
		2. Current character is operator - do the following:
			a. If current character has greater priority than top of opStack, push to opStack.
			b. If not, keep popping from opStack (and appending to outQueue) until:
				i. opStack is empty, OR
				ii. current character has greater priority than top of opStack
		3. If current character is '(' or NONCAPLPAREN_CHAR, push to opStack
		4. If current character is ')', pop from opStack (and append to outQueue) until '(' is found. Discard parentheses.
		5. If current character is LBRACKET, find all the characters until RBRACKET, then create a postfixNode containing all these contents. Add this node to outQueue.
		6. If current character is '{', find the appropriate numeric specifier (range start, range end). Apply the range to the postfixNode at the end of outQueue.
		*/
		c := re_postfix[i]
		if isNormalChar(c) || isSpecialCharWithMetacharReplacement(c) {
			if caseInsensitive {
				outQueue = append(outQueue, newPostfixNode(allCases(c)...))
			} else {
				outQueue = append(outQueue, newPostfixNode(c))
			}
			continue
		}
		if c == '\\' { // Escape character - invert special and non-special characters eg. \( is treated as a literal parenthesis, \b is treated as word boundary
			if i == len(re_postfix)-1 { // End of string - error, because backslash is an escape character (something needs to come after it)
				return nil, fmt.Errorf("ERROR: Backslash with no escape character.")
			}
			i++
			if re_postfix[i] == 'x' { // Hex value
				i++
				if i >= len(re_postfix) { // Nothing follows the 'x' - guard the index below
					return nil, fmt.Errorf("Not enough hex characters found in expression.")
				}
				if re_postfix[i] == '{' && i < len(re_postfix)-6 { // Expanded hex code
					var hexVal int
					n, err := fmt.Sscanf(string(re_postfix[i:]), "{%x}", &hexVal)
					if n < 1 || err != nil {
						return nil, fmt.Errorf("Error parsing expanded hex code in expression.")
					}
					outQueue = append(outQueue, newPostfixCharNode(rune(hexVal)))
					i += 7
				} else if i < len(re_postfix)-1 { // Two-digit hex code
					hexVal, err := strconv.ParseInt(string([]rune{re_postfix[i], re_postfix[i+1]}), 16, 64) // Convert the two hex values into a rune slice, then to a string. Parse the string into an int with strconv.ParseInt()
					if err != nil {
						return nil, fmt.Errorf("Error parsing hex characters in expression.")
					}
					i++ // Loop increment will take care of going forward
					outQueue = append(outQueue, newPostfixCharNode(rune(hexVal)))
				} else {
					return nil, fmt.Errorf("Not enough hex characters found in expression.")
				}
			} else if isOctal(re_postfix[i]) { // Octal value
				var octVal int64
				var octValStr string
				numDigitsParsed := 0
				// At most 3 octal digits, matching the limit used by the concatenation pass above
				for (i+numDigitsParsed) < len(re_postfix) && isOctal(re_postfix[i+numDigitsParsed]) && numDigitsParsed < 3 {
					octValStr += string(re_postfix[i+numDigitsParsed])
					numDigitsParsed++
				}
				octVal, err := strconv.ParseInt(octValStr, 8, 32)
				if err != nil {
					return nil, fmt.Errorf("Error parsing octal value in expression.")
				}
				if octVal > 0777 {
					return nil, fmt.Errorf("Invalid octal value in expression.")
				}
				i += numDigitsParsed - 1 // Shift forward by the number of digits that were parsed. Stop one character short, because the loop increment will move us to the next character automatically
				outQueue = append(outQueue, newPostfixCharNode(rune(octVal)))
			} else {
				escapedNode, err := newEscapedNode(re_postfix[i], false)
				if err != nil {
					return nil, fmt.Errorf("Invalid escape character in expression.")
				}
				outQueue = append(outQueue, escapedNode)
			}
			continue // Escaped character will automatically be skipped when loop variable increments
		}
		if c == '.' { // Dot metacharacter
			outQueue = append(outQueue, newPostfixDotNode())
			continue
		}
		if c == '^' { // Start-of-string assertion
			outQueue = append(outQueue, newPostfixNode(c))
		}
		if c == '$' { // End-of-string assertion
			outQueue = append(outQueue, newPostfixNode(c))
		}
		// Check if we're at the start of a lookaround
		if c == '(' && i < len(re_postfix)-1 && re_postfix[i+1] == '?' {
			i += 2      // Skip opening paren and question mark
			regex := "" // Stores lookaround regex
			depth := 1  // Nesting depth of parentheses inside the lookaround
			for depth != 0 {
				if i >= len(re_postfix) {
					return nil, fmt.Errorf("Unclosed lookaround.")
				}
				if re_postfix[i] == '(' || re_postfix[i] == NONCAPLPAREN_CHAR {
					depth++
				}
				if re_postfix[i] == ')' {
					depth--
					if depth == 0 {
						break
					}
				}
				regex += string(re_postfix[i])
				i++
			}
			if len(regex) <= 1 { // Nothing in regex - error
				return nil, fmt.Errorf("Invalid lookaround. (too short?)")
			}
			// 'regex' should now contain the lookaround regex, plus the characters at the start (which indicate pos/neg, ahead/behind)
			// Now we should filter that out.
			toAppend := postfixNode{nodetype: ASSERTION, startReps: 1, endReps: 1}
			if regex[0] == '<' { // Lookbehind
				toAppend.lookaroundDir = LOOKBEHIND
				regex = regex[1:]
			} else if regex[0] == '=' || regex[0] == '!' {
				toAppend.lookaroundDir = LOOKAHEAD
			} else {
				return nil, fmt.Errorf("Invalid lookaround.")
			}
			// Positive or negative
			if regex[0] == '=' { // Positive
				toAppend.lookaroundSign = POSITIVE
				toAppend.contents = []rune(regex[1:])
			} else if regex[0] == '!' { // Negative
				toAppend.lookaroundSign = NEGATIVE
				toAppend.contents = []rune(regex[1:])
			} else {
				return nil, fmt.Errorf("Invalid lookaround.")
			}
			outQueue = append(outQueue, toAppend)
			continue
		}
		if isOperator(c) {
			if len(opStack) == 0 {
				opStack = append(opStack, c)
			} else {
				topStack, err := peek(opStack)
				if err != nil {
					return nil, fmt.Errorf("Operator without operand.")
				}
				if priority(c) > priority(topStack) { // 2a
					opStack = append(opStack, c)
				} else {
					for priority(c) <= priority(topStack) { // 2b
						to_append := mustPop(&opStack)
						outQueue = append(outQueue, newPostfixNode(to_append))
						topStack, _ = peek(opStack)
					}
					opStack = append(opStack, c)
				}
			}
		}
		if c == LBRACKET { // Used for character classes
			firstCharAdded := false // A character class must have at least 1 character. This flag checks if the first character has been added.
			endOfRange := false     // Set to 'true' when we encounter a CHAR_RANGE metacharacter
			i++                     // Step forward so we can look at the character class
			var invertMatch bool
			if re_postfix[i] == '^' {
				invertMatch = true
				i++
			}
			chars := make([]postfixNode, 0) // List of nodes - used only for character classes
			for i < len(re_postfix) {
				if firstCharAdded && re_postfix[i] == RBRACKET {
					break
				}
				if re_postfix[i] == CHAR_RANGE {
					endOfRange = true
					i++
					continue
				}
				if re_postfix[i] == '\\' { // Backslash indicates a character to be escaped
					if i == len(re_postfix)-1 {
						return nil, fmt.Errorf("Stray backslash in character class.")
					}
					i++ // Step past backslash
					if re_postfix[i] == 'x' { // Hex value
						i++
						if re_postfix[i] == '{' && i < len(re_postfix)-7 { // Expanded hex code
							var hexVal int
							n, err := fmt.Sscanf(string(re_postfix[i:]), "{%x}", &hexVal)
							if n < 1 || err != nil {
								return nil, fmt.Errorf("Error parsing expanded hex code in character class.")
							}
							chars = append(chars, newPostfixCharNode(rune(hexVal)))
							i += 8
						} else if i < len(re_postfix)-2 { // Two-digit hex code
							hexVal, err := strconv.ParseInt(string([]rune{re_postfix[i], re_postfix[i+1]}), 16, 64) // Convert the two hex values into a rune slice, then to a string. Parse the string into an int with strconv.ParseInt()
							if err != nil {
								return nil, fmt.Errorf("Error parsing hex characters in character class.")
							}
							i += 2
							chars = append(chars, newPostfixCharNode(rune(hexVal)))
						} else {
							return nil, fmt.Errorf("Not enough hex characters found in character class.")
						}
					} else if isOctal(re_postfix[i]) { // Octal value
						var octVal int64
						var octValStr string
						numDigitsParsed := 0
						// The '-1' exists, because even in the worst case (the character class extends till the end), the last character must be a closing bracket (and nothing else).
						// At most 3 octal digits are consumed; a fourth digit is an ordinary member of the class.
						for (i+numDigitsParsed) < len(re_postfix)-1 && isOctal(re_postfix[i+numDigitsParsed]) && numDigitsParsed < 3 {
							octValStr += string(re_postfix[i+numDigitsParsed])
							numDigitsParsed++
						}
						octVal, err := strconv.ParseInt(octValStr, 8, 32)
						if err != nil {
							return nil, fmt.Errorf("Error parsing octal value in character class.")
						}
						if octVal > 0777 {
							return nil, fmt.Errorf("Invalid octal value in character class.")
						}
						i += numDigitsParsed // Shift forward by the number of characters parsed
						chars = append(chars, newPostfixCharNode(rune(octVal)))
					} else {
						escapedNode, err := newEscapedNode(re_postfix[i], true)
						if err != nil {
							return nil, fmt.Errorf("Invalid escape character in character class.")
						}
						chars = append(chars, escapedNode)
						i++
					}
				} else {
					if !firstCharAdded && re_postfix[i] > 0xF0000 { // It's one of the internal metacharacters; convert it back to the regular character before adding it, because no characters have been added yet. For example, in '[[]', the second LBRACKET should be treated like a literal bracket.
						switch re_postfix[i] {
						case LBRACKET:
							chars = append(chars, newPostfixCharNode('['))
						case RBRACKET:
							chars = append(chars, newPostfixCharNode(']'))
						default:
							return nil, fmt.Errorf("Error parsing high-range unicode value in character class.")
						}
					}
					chars = append(chars, newPostfixCharNode(re_postfix[i]))
					i++
				}
				firstCharAdded = true
				if endOfRange { // The previous character was an unescaped hyphen, which (in the context of a character class) means the character that was last appended is the end of a character range
					// Things to note:
					// 	1. In PCRE and Go's regex engine, a letter _can_ be surrounded by hyphens in a character class.
					// 		Eg. [a-b-c]
					// 		While this might look like a syntax error, the engine picks 'a-b' as a range,
					// 		then treats the second '-' and 'c' as regular characters in the character class.
					// 		So this regex becomes "Match a character from 'a' to 'b', a literal hyphen, or 'c' ".
					// 	2. To account for this, the following logic is followed:
					// 		a. If the second-to-last postfixNode ie. the start of the range has only one element, then we are in a range.
					// 			i. If it has more than one element, then we are actually looking at a literal hyphen, and we will treat it as such.
					// 			ii. If either the start or end of the range don't exist in 'chars' ie. something like [-a] or [a-], then too will we treat it as a literal hyphen.
					// 		b. The last postfixNode added to 'chars' _must_ only have one character (because it's the end of the range).
					endRangePostfixNode, err1 := pop(&chars)
					startRangePostfixNode, err2 := pop(&chars)
					if (err1 != nil || err2 != nil) || len(startRangePostfixNode.contents) != 1 { // Treat it as a regular hyphen
						chars = append(chars, startRangePostfixNode, newPostfixCharNode('-'), endRangePostfixNode)
					} else if len(endRangePostfixNode.contents) != 1 { // Sanity check
						return nil, fmt.Errorf("Error parsing character range.")
					} else {
						// We have established that they both have a length of 1
						startRangeRune := startRangePostfixNode.contents[0]
						endRangeRune := endRangePostfixNode.contents[0]
						// A reversed range such as [z-a] has no members; reject it here instead of
						// asking genRange for a negative-length slice.
						if startRangeRune > endRangeRune {
							return nil, fmt.Errorf("Character range out of order.")
						}
						chars = append(chars, newPostfixCharNode(genRange(startRangeRune, endRangeRune+1)...))
					}
					endOfRange = false // Reset the flag
				}
			}
			if i == len(re_postfix) { // We have reached the end of the string, so we didn't encounter a closing bracket. Error.
				return nil, fmt.Errorf("Opening bracket without closing bracket.")
			}
			outQueue = append(outQueue, newCharClassNode(chars, invertMatch))
			continue
		}
		if c == '{' {
			i++ // Skip opening brace
			// Three possibilities:
			// 1. Single number - {5}
			// 2. Range - {3,5}
			// 3. Start with no end, {3,}
			startRange := make([]rune, 0)
			startRangeNum := 0
			endRange := make([]rune, 0)
			endRangeNum := 0
			for i < len(re_postfix) && unicode.IsDigit(re_postfix[i]) {
				startRange = append(startRange, re_postfix[i])
				i++
			}
			if len(startRange) == 0 { // {} is not valid, neither is {,5}
				return nil, fmt.Errorf("Invalid numeric specifier.")
			}
			if i == len(re_postfix) {
				return nil, fmt.Errorf("Brace not closed.")
			}
			startRangeNum, err := strconv.Atoi(string(startRange))
			if err != nil { // Overflowing counts and non-ASCII digits end up here; report them instead of panicking
				return nil, fmt.Errorf("Invalid numeric specifier.")
			}

			if re_postfix[i] == '}' { // Case 1 above
				endRangeNum = startRangeNum
			} else {
				if re_postfix[i] != ',' {
					return nil, fmt.Errorf("Invalid numeric specifier.")
				}
				i++ // Skip comma
				for i < len(re_postfix) && unicode.IsDigit(re_postfix[i]) {
					endRange = append(endRange, re_postfix[i])
					i++
				}
				if i == len(re_postfix) {
					return nil, fmt.Errorf("Brace not closed.")
				}
				if re_postfix[i] != '}' {
					return nil, fmt.Errorf("Invalid numeric specifier.")
				}
				if len(endRange) == 0 { // Case 3 above
					endRangeNum = INFINITE_REPS
				} else { // Case 2 above
					var err error
					endRangeNum, err = strconv.Atoi(string(endRange))
					if err != nil { // Same reasoning as for the start of the range
						return nil, fmt.Errorf("Invalid numeric specifier.")
					}
				}
			}

			idx := len(outQueue) - 1
			// Get the last added node
			if idx < 0 || outQueue[idx].nodetype == LPAREN {
				return nil, fmt.Errorf("Numeric specifier with no content.")
			}
			outQueue[idx].startReps = startRangeNum
			outQueue[idx].endReps = endRangeNum
		}
		if c == '(' || c == NONCAPLPAREN_CHAR {
			opStack = append(opStack, c)
			if c == '(' { // We only push _capturing_ group parentheses to outQueue
				outQueue = append(outQueue, newPostfixNode(c))
			}
			numOpenParens++
		}
		if c == ')' {
			// Keep popping from opStack until we encounter an opening parenthesis or a NONCAPLPAREN_CHAR. Return an error if we reach the end of the stack.
			var val rune
			var err error
			for val, err = peek(opStack); val != '(' && val != NONCAPLPAREN_CHAR; val, err = peek(opStack) {
				if err != nil {
					return nil, fmt.Errorf("Imbalanced parantheses.")
				}
				to_append := mustPop(&opStack)
				outQueue = append(outQueue, newPostfixNode(to_append))
			}
			_ = mustPop(&opStack) // Get rid of opening parenthesis
			if val == '(' {       // Whatever was inside the parentheses was a _capturing_ group, so we append the closing parenthesis as well
				outQueue = append(outQueue, newPostfixNode(')')) // Add closing parenthesis
			}
			numOpenParens--
		}
	}

	// Pop all remaining operators (and append to outQueue)
	for len(opStack) > 0 {
		to_append := mustPop(&opStack)
		outQueue = append(outQueue, newPostfixNode(to_append))
	}

	if numOpenParens != 0 {
		return nil, fmt.Errorf("Imbalanced parantheses.")
	}

	return outQueue, nil
}
// Thompson's algorithm. Constructs Finite-State Automaton from given string.
// Returns start state and number of groups in regex.
func thompson(re []postfixNode) (Reg, error) {
nfa := make([]*State, 0) // Stack of states
numGroups := 0 // Number of capturing groups
for _, c := range re {
if c.nodetype == CHARACTER || c.nodetype == ASSERTION {
state := State{}
state.transitions = make(map[int][]*State)
if c.allChars {
state.allChars = true
if len(c.except) != 0 {
// For each node that I am 'excepting' (eg. in an inverted character class):
// - If the node itself has exceptions, then the exceptions cancel out.
// Eg. [^\w] == [\W]
// - Since an allChars node is the only kind that _can_ have exceptions, that's what I check for.
// - If the node doesn't have exceptions (allChars == false) then the contents of the node are added to the except list.
for _, node := range c.except {
if node.allChars {
state.allChars = false
// For each postfixNode in node.except, extract the contents of the postfixNode. Concatenate them all,
// and them to the state's _content_. As mentioned above, if the exception has exceptions, then we can match
// those.
nodeExceptChars := slices.Concat(Map(node.except, func(node postfixNode) []rune {
return node.contents
})...)
state.content = rune2Contents(nodeExceptChars)
} else {
state.except = append(state.except, node.contents...)
}
}
}
}
// Convert the current contents to []int, convert the result of rune2contents to []int, append then
// convert back to stateContents.
state.content = stateContents(append([]int(state.content), []int(rune2Contents(c.contents))...))
state.output = make([]*State, 0)
state.output = append(state.output, &state)
state.isEmpty = false
if c.nodetype == ASSERTION {
state.isEmpty = true // This is a little weird. A lookaround has the 'isEmpty' flag set, even though it _isn't_ empty (the contents are the regex). But, there's so much error-checking that relies on this flag that it's better to keep it this way.
state.content = newContents(EPSILON) // Ideally, an assertion shouldn't have any content, since it doesn't say anything about the content of string
if c.lookaroundDir == 0 || c.lookaroundSign == 0 {
switch c.contents[0] {
case '^':
state.assert = SOS
case '$':
state.assert = EOS
case 'b':
state.assert = WBOUND
case 'B':
state.assert = NONWBOUND
}
} else { // Lookaround
state.lookaroundRegex = string(c.contents)
if c.lookaroundDir == LOOKAHEAD {
if c.lookaroundSign == POSITIVE {
state.assert = PLA
}
if c.lookaroundSign == NEGATIVE {
state.assert = NLA
}
}
if c.lookaroundDir == LOOKBEHIND {
if c.lookaroundSign == POSITIVE {
state.assert = PLB
}
if c.lookaroundSign == NEGATIVE {
state.assert = NLB
}
}
tmpRe, err := shuntingYard(state.lookaroundRegex)
if err != nil {
return Reg{}, fmt.Errorf("Error parsing lookaround: %w", err)
}
reg, err := thompson(tmpRe)
if err != nil {
return Reg{}, fmt.Errorf("Error compiling lookaround: %w", err)
}
state.lookaroundNFA = reg.start
state.lookaroundNumCaptureGroups = reg.numGroups
}
}
// Replace ESC_BACKSLASH with actual backslash, so that we can actually check if we encounter it
replaceByValue([]int(state.content), int(ESC_BACKSLASH), '\\')
replaceByValue(state.except, ESC_BACKSLASH, '\\')
nfa = append(nfa, &state)
}
if c.nodetype == LPAREN || c.nodetype == RPAREN {
s := &State{}
s.assert = NONE
s.content = newContents(EPSILON)
s.isEmpty = true
s.output = make([]*State, 0)
s.output = append(s.output, s)
s.transitions = make(map[int][]*State)
// LPAREN nodes are just added normally
if c.nodetype == LPAREN {
numGroups++
s.groupBegin = true
s.groupNum = numGroups
nfa = append(nfa, s)
continue
}
// For RPAREN nodes, I assume that the last two nodes in the list are an LPAREN,
// and then some other node.
// These three nodes (LPAREN, the middle node and RPAREN) are extracted together, concatenated
// and added back in.
if c.nodetype == RPAREN {
s.groupEnd = true
middleNode := mustPop(&nfa)
lparenNode := mustPop(&nfa)
s.groupNum = lparenNode.groupNum
tmp := concatenate(lparenNode, middleNode)
to_add := concatenate(tmp, s)
nfa = append(nfa, to_add)
}
}
if c.nodetype == CHARCLASS { // A Character class consists of all the nodes in it, alternated
// Map the list of nodes to a list of states, each state containing the contents of a specific node
states := Map(c.nodeContents, func(node postfixNode) *State {
s := newState()
s.content = rune2Contents(node.contents)
return &s
})
// Reduce the list of states down to a single state by alternating them
toAdd := Reduce(states, func(s1 *State, s2 *State) *State {
return alternate(s1, s2)
})
nfa = append(nfa, toAdd)
}
// Must be an operator if it isn't a character
switch c.nodetype {
case CONCATENATE:
s2 := mustPop(&nfa)
s1 := mustPop(&nfa)
s1 = concatenate(s1, s2)
nfa = append(nfa, s1)
case KLEENE: // Create a 0-state, concat the popped state after it, concat the 0-state after the popped state
s1 := mustPop(&nfa)
stateToAdd := kleene(*s1)
nfa = append(nfa, stateToAdd)
case PLUS: // a+ is equivalent to aa*
s1 := mustPop(&nfa)
s2 := kleene(*s1)
s1 = concatenate(s1, s2)
nfa = append(nfa, s1)
case QUESTION: // ab? is equivalent to a(b|)
s1 := mustPop(&nfa)
s2 := question(s1)
nfa = append(nfa, s2)
case PIPE:
s1 := mustPop(&nfa)
s2 := mustPop(&nfa)
s3 := alternate(s1, s2)
nfa = append(nfa, s3)
}
if c.startReps != 1 || c.endReps != 1 { // Must have a numeric specifier attached to it
if c.endReps != -1 && c.endReps < c.startReps {
return Reg{}, fmt.Errorf("Numeric specifier - start greater than end.")
}
state := mustPop(&nfa)
var stateToAdd *State = nil
// Take advantage of the following facts:
// a{5} == aaaaa
// a{3,5} == aaaa?a?
// a{5,} == aaaaa+
// Nov. 3 2024 - I have two choices on how I want to implement numeric
// specifiers.
// a. Encode the logic while creating the states. I will have to create a function
// that creates a deep-copy of a given state / NFA, so that I can concatenate them to
// each other (concatenating them with the 'concatenate' method - which takes addresses - does
// not work). Creating this function might be a lot of work.
// b. Encode the logic while parsing the string (shunting-yard). If I can expand the numeric specifier
// at this point, I can leave thompson untouched.
for i := 0; i < c.startReps; i++ { // Case 1
stateToAdd = concatenate(stateToAdd, cloneState(state))
}
if c.endReps == INFINITE_REPS { // Case 3
s2 := kleene(*state)
stateToAdd = concatenate(stateToAdd, s2)
} else { // Case 2
for i := c.startReps; i < c.endReps; i++ {
stateToAdd = concatenate(stateToAdd, question(cloneState(state)))
}
}
nfa = append(nfa, stateToAdd)
}
}
if len(nfa) != 1 {
return Reg{}, fmt.Errorf("Invalid Regex.")
}
verifyLastStates(nfa)
return Reg{nfa[0], numGroups}, nil
}
// Compile turns the given regular expression into a Reg that can be handed to
// the matching functions. The expression is first parsed by shuntingYard (which
// receives the optional flags unchanged) and the resulting nodes are then
// converted by thompson.
//
// The returned error is non-nil if either step fails; in that case the returned
// Reg is the zero value and must not be used.
func Compile(re string, flags ...ReFlag) (Reg, error) {
	postfix, parseErr := shuntingYard(re, flags...)
	if parseErr != nil {
		return Reg{}, fmt.Errorf("Error parsing regex: %w", parseErr)
	}
	compiled, buildErr := thompson(postfix)
	if buildErr != nil {
		return Reg{}, fmt.Errorf("Error compiling regex: %w", buildErr)
	}
	return compiled, nil
}

2
go.mod
View File

@@ -1,4 +1,4 @@
module gitea.twomorecents.org/Rockingcool/kleingrep
module re
go 1.23.1

View File

@@ -8,13 +8,11 @@ import (
"os"
"github.com/fatih/color"
reg "gitea.twomorecents.org/Rockingcool/kleingrep/regex"
)
func main() {
// Flags for the regex Compile function
flagsToCompile := make([]reg.ReFlag, 0)
flagsToCompile := make([]ReFlag, 0)
invertFlag := flag.Bool("v", false, "Invert match.")
// This flag has two 'modes':
@@ -31,10 +29,10 @@ func main() {
// These flags have to be passed to the Compile function
if *multiLineFlag {
flagsToCompile = append(flagsToCompile, reg.RE_MULTILINE, reg.RE_SINGLE_LINE)
flagsToCompile = append(flagsToCompile, RE_MULTILINE)
}
if *caseInsensitiveFlag {
flagsToCompile = append(flagsToCompile, reg.RE_CASE_INSENSITIVE)
flagsToCompile = append(flagsToCompile, RE_CASE_INSENSITIVE)
}
// -l and -o are mutually exclusive: -o overrides -l
@@ -78,7 +76,7 @@ func main() {
reader := bufio.NewReader(os.Stdin)
out := bufio.NewWriter(os.Stdout)
regComp, err := reg.Compile(re, flagsToCompile...)
regComp, err := Compile(re, flagsToCompile...)
if err != nil {
fmt.Println(err)
return
@@ -119,14 +117,14 @@ func main() {
panic(err)
}
}
matchIndices := make([]reg.Match, 0)
matchIndices := make([]Match, 0)
if matchNumFlagEnabled {
tmp, err := regComp.FindNthMatch(test_str, *matchNum)
tmp, err := FindNthMatch(regComp, test_str, *matchNum)
if err == nil {
matchIndices = append(matchIndices, tmp)
}
} else {
matchIndices = regComp.FindAllSubmatch(test_str)
matchIndices = FindAllMatches(regComp, test_str)
}
if *printMatchesFlag {
@@ -137,7 +135,7 @@ func main() {
fmt.Fprintf(out, "Line %d:\n", lineNum)
}
for _, m := range matchIndices {
fmt.Fprintf(out, "%s\n", m.String())
fmt.Fprintf(out, "%s\n", m.toString())
}
err := out.Flush()
if err != nil {
@@ -150,7 +148,7 @@ func main() {
// This should make checking O(1) instead of O(n)
indicesToPrint := new_uniq_arr[int]()
for _, idx := range matchIndices {
indicesToPrint.add(genRange(idx[0].StartIdx, idx[0].EndIdx)...)
indicesToPrint.add(genRange(idx[0].startIdx, idx[0].endIdx)...)
}
// If we are inverting, then we should print the indices which _didn't_ match
// in color.
@@ -185,9 +183,9 @@ func main() {
for i := range test_str {
inMatchIndex := false
for _, m := range matchIndices {
if i == m[0].StartIdx {
if i == m[0].startIdx {
fmt.Fprintf(out, "%s", *substituteText)
i = m[0].EndIdx
i = m[0].endIdx
inMatchIndex = true
break
}
@@ -203,7 +201,7 @@ func main() {
// Newline after every match - only if -o is enabled and -v is disabled.
if *onlyFlag && !(*invertFlag) {
for _, idx := range matchIndices {
if i+1 == idx[0].EndIdx { // End index is one more than last index of match
if i+1 == idx[0].endIdx { // End index is one more than last index of match
fmt.Fprintf(out, "\n")
break
}

415
matching.go Normal file
View File

@@ -0,0 +1,415 @@
package main
import (
"fmt"
"sort"
)
// A Match stores a slice of all the groups in a match.
// The matching code in this file stores the bounds of the whole match at index 0,
// and the bounds of capturing group j at index j.
type Match []Group
// A Group represents a single group of a match. It contains the start index and end index
// of the text covered by the group. newMatch initializes both fields to -1, and isValid
// treats any negative index as 'not set'.
type Group struct {
startIdx int
endIdx int
}
// newMatch builds a Match holding 'size' groups, every one of which is marked
// as unset (both indices are -1).
func newMatch(size int) Match {
	groups := make(Match, size)
	for i := 0; i < size; i++ {
		groups[i] = Group{-1, -1}
	}
	return groups
}
// numValidGroups counts the groups in the match whose start and end indices
// are both non-negative.
func (m Match) numValidGroups() int {
	count := 0
	for i := range m {
		if m[i].startIdx < 0 || m[i].endIdx < 0 {
			continue
		}
		count++
	}
	return count
}
// toString renders every valid group of the match. Each valid group produces a
// "Group <index>" line followed by a line with the group's own string form.
// Invalid groups are skipped, but they still count towards the index.
func (m Match) toString() string {
	out := ""
	for i := range m {
		if !m[i].isValid() {
			continue
		}
		out += fmt.Sprintf("Group %d\n%s\n", i, m[i].toString())
	}
	return out
}
// toString formats the group as "<start index>\t<end index>".
func (g Group) toString() string {
	start, end := g.startIdx, g.endIdx
	return fmt.Sprintf("%d\t%d", start, end)
}
// isValid reports whether neither index of the group is negative.
func (g Group) isValid() bool {
	if g.startIdx < 0 || g.endIdx < 0 {
		return false
	}
	return true
}
// takeZeroState follows the EPSILON transitions of every state in the given
// slice and returns all the destination states (in order, duplicates included).
// The second return value is true if at least one of the returned states has
// an EPSILON transition of its own.
//
// Each destination inherits the capturing-group indices ('thread') of the state
// it was reached from. If the destination begins or ends a capturing group,
// the corresponding start or end index of that group is set to idx.
func takeZeroState(states []*State, numGroups int, idx int) (rtv []*State, isZero bool) {
	for _, src := range states {
		targets := src.transitions[EPSILON]
		if len(targets) == 0 {
			continue
		}
		for _, dst := range targets {
			if dst.threadGroups == nil {
				dst.threadGroups = newMatch(numGroups + 1)
			}
			copy(dst.threadGroups, src.threadGroups)
			if dst.groupBegin {
				dst.threadGroups[dst.groupNum].startIdx = idx
			}
			if dst.groupEnd {
				dst.threadGroups[dst.groupNum].endIdx = idx
			}
		}
		rtv = append(rtv, targets...)
	}
	for _, reached := range rtv {
		if len(reached.transitions[EPSILON]) > 0 {
			isZero = true
			break
		}
	}
	return rtv, isZero
}
// zeroMatchPossible reports whether a zero-length match is possible from any of
// the given states, at position idx of the string.
//
// It repeatedly follows EPSILON transitions (via takeZeroState) until no new
// states are discovered, and then looks for a state that is empty, is a last
// state, and either asserts nothing or has an assertion that holds at idx.
func zeroMatchPossible(str []rune, idx int, numGroups int, states ...*State) bool {
	reached, more := takeZeroState(states, numGroups, idx)
	frontier := make([]*State, 0, len(reached)+len(states))
	frontier = append(frontier, states...)
	frontier = append(frontier, reached...)
	for more {
		var added int
		reached, more = takeZeroState(frontier, numGroups, idx)
		frontier, added = unique_append(frontier, reached...)
		if added == 0 { // Nothing new was discovered, so we are done
			break
		}
	}
	for _, st := range frontier {
		if !st.isEmpty {
			continue
		}
		// The assertion is evaluated before isLast is inspected, and only for states that assert something.
		if st.assert != NONE && !st.checkAssertion(str, idx) {
			continue
		}
		if st.isLast {
			return true
		}
	}
	return false
}
// pruneIndices removes overlapping matches from the given slice and returns the
// result. Only the whole-match group (index 0) of every Match is considered.
//
// The input slice is sorted in place by start index. The matches are then
// scanned in that order while keeping track of a 'current' match:
//   - a match starting at or after the end of the current one doesn't overlap,
//     so the current match is emitted and the new one becomes current;
//   - an overlapping match that ends later than the current one replaces it;
//   - any other overlapping match is dropped.
//
// An empty (or nil) slice is returned unchanged.
func pruneIndices(indices []Match) []Match {
	// Nothing to prune - also avoids indexing into an empty slice below.
	if len(indices) == 0 {
		return indices
	}
	// First, sort the slice by the start indices
	sort.Slice(indices, func(i, j int) bool {
		return indices[i][0].startIdx < indices[j][0].startIdx
	})
	toRet := make([]Match, 0, len(indices))
	current := indices[0]
	for _, idx := range indices[1:] {
		if idx[0].startIdx >= current[0].endIdx {
			// idx doesn't overlap with current (starts after current ends), so add current to result
			// and update the current.
			toRet = append(toRet, current)
			current = idx
		} else if idx[0].endIdx > current[0].endIdx {
			// idx overlaps, but it extends further, so update current
			current = idx
		}
	}
	// Add the last match we were tracking
	toRet = append(toRet, current)
	return toRet
}
// FindString returns a _string_ containing the _text_ of the _leftmost_ match of
// the regex, in the given string. The return value will be an empty string in two situations:
// 1. No match was found
// 2. The match was an empty string
func FindString(regex Reg, str string) string {
	match, err := FindNthMatch(regex, str, 1)
	if err != nil || len(match) == 0 {
		return ""
	}
	// The matcher works on a []rune version of the string, so the indices it reports are
	// rune offsets, not byte offsets. Slicing 'str' directly would return the wrong text
	// (or cut a character in half) as soon as the string contains a multi-byte character.
	runes := []rune(str)
	return string(runes[match[0].startIdx:match[0].endIdx])
}
// FindAllString is the 'all' version of FindString.
// It returns a _slice of strings_ containing the _text_ of _all_ matches of
// the regex, in the given string.
//func FindAllString(regex Reg, str []string) []string {
//
//}
// FindNthMatch finds the 'n'th match (counting from 1) of the given regex in
// the given string, scanning from left to right.
//
// It returns an error (!= nil) if n is less than 1, or if there are fewer than
// 'n' matches in the string. The returned Match must not be used in that case.
func FindNthMatch(regex Reg, str string, n int) (Match, error) {
	// Matches are counted from 1. Without this check, n == 0 would 'succeed' with an
	// empty Match as soon as one search step found nothing.
	if n < 1 {
		return nil, fmt.Errorf("Invalid match index. Match index must be at least 1.")
	}
	idx := 0
	matchNum := 0
	str_runes := []rune(str)
	var matchFound bool
	var matchIdx Match
	for idx <= len(str_runes) {
		// The helper also tells us where the next search should start from
		matchFound, matchIdx, idx = findAllMatchesHelper(regex.start, str_runes, idx, regex.numGroups)
		if !matchFound {
			continue
		}
		matchNum++
		if matchNum == n {
			return matchIdx, nil
		}
	}
	// We haven't found the nth match after scanning the string - Return an error
	return nil, fmt.Errorf("Invalid match index. Too few matches found.")
}
// FindAllMatches collects every match of the given regex in the given string.
// The string is searched repeatedly with findAllMatchesHelper, each search
// starting where the previous one told us to continue. If anything was found,
// overlapping matches are removed with pruneIndices before returning.
// When nothing matches, an empty (non-nil) slice is returned.
func FindAllMatches(regex Reg, str string) []Match {
	runes := []rune(str)
	found := make([]Match, 0)
	for pos := 0; pos <= len(runes); {
		ok, m, next := findAllMatchesHelper(regex.start, runes, pos, regex.numGroups)
		if ok {
			found = append(found, m)
		}
		pos = next
	}
	if len(found) == 0 {
		return found
	}
	return pruneIndices(found)
}
// Helper for FindAllMatches. Returns whether it found a match, the
// first Match it finds, and how far it got into the string ie. where
// the next search should start from.
//
// Parameters:
//   - start: the start state of the NFA to run
//   - str: the string being searched, as a slice of runes
//   - offset: the index in str at which this search begins
//   - numGroups: the number of capturing groups in the regex; the returned Match has numGroups+1 entries
//     (entry 0 is the whole match)
//
// The third return value is always greater than 'offset' when no match is found, and callers
// loop until it exceeds len(str).
//
// Might return duplicates or overlapping indices, so care must be taken to prune the resulting array.
func findAllMatchesHelper(start *State, str []rune, offset int, numGroups int) (bool, Match, int) {
// Base case - exit if offset exceeds string's length
if offset > len(str) {
// The second value here shouldn't be used, because we should exit when the third return value is > than len(str)
return false, []Group{}, offset
}
// Hold a list of match indices for the current run. When we
// can no longer find a match, the match with the largest range is
// chosen as the match for the entire string.
// This allows us to pick the longest possible match (which is how greedy matching works).
// COMMENT ABOVE IS CURRENTLY NOT UP-TO-DATE
tempIndices := newMatch(numGroups + 1)
foundPath := false
startIdx := offset
endIdx := offset
currentStates := make([]*State, 0)
tempStates := make([]*State, 0) // Used to store states that should be used in next loop iteration
i := offset // Index in string
startingFrom := i // Store starting index
// If the first state is an assertion, make sure the assertion
// is true before we do _anything_ else.
if start.assert != NONE {
if start.checkAssertion(str, offset) == false {
i++
return false, []Group{}, i
}
}
// Increment until we hit a character matching the start state (assuming not 0-state)
if start.isEmpty == false {
for i < len(str) && !start.contentContains(str, i) {
i++
}
startIdx = i
startingFrom = i
i++ // Advance to next character (if we aren't at a 0-state, which doesn't match anything), so that we can check for transitions. If we advance at a 0-state, we will never get a chance to match the first character
}
// Every search starts with a fresh set of group indices on the start state
start.threadGroups = newMatch(numGroups + 1)
// Check if the start state begins a group - if so, add the start index to our list
if start.groupBegin {
start.threadGroups[start.groupNum].startIdx = i
// tempIndices[start.groupNum].startIdx = i
}
currentStates = append(currentStates, start)
// Main loop - each iteration looks at the character at index i
for i < len(str) {
foundPath = false
zeroStates := make([]*State, 0)
// Keep taking zero-states, until there are no more left to take
// Objective: If any of our current states have transitions to 0-states, replace them with the 0-state. Do this until there are no more transitions to 0-states, or there are no more unique 0-states to take.
zeroStates, isZero := takeZeroState(currentStates, numGroups, i)
tempStates = append(tempStates, zeroStates...)
num_appended := 0
for isZero == true {
zeroStates, isZero = takeZeroState(tempStates, numGroups, i)
tempStates, num_appended = unique_append(tempStates, zeroStates...)
if num_appended == 0 { // Break if we haven't appended any more unique values
break
}
}
currentStates, _ = unique_append(currentStates, tempStates...)
tempStates = nil
// Take any transitions corresponding to current character
numStatesMatched := 0 // The number of states which had at least 1 match for this round
assertionFailed := false // Whether or not an assertion failed for this round
lastStateInList := false // Whether or not a last state was in our list of states
var lastStatePtr *State = nil // Pointer to the last-state, if it was found
lastLookaroundInList := false // Whether or not a last state (that is a lookaround) was in our list of states
for _, state := range currentStates {
matches, numMatches := state.matchesFor(str, i)
if numMatches > 0 {
numStatesMatched++
tempStates = append(tempStates, matches...)
foundPath = true
// The states we move to inherit the group indices of the state we came from
for _, m := range matches {
if m.threadGroups == nil {
m.threadGroups = newMatch(numGroups + 1)
}
copy(m.threadGroups, state.threadGroups)
}
}
// matchesFor signals a failed assertion with a negative count
if numMatches < 0 {
assertionFailed = true
}
if state.isLast {
if state.isLookaround() {
lastLookaroundInList = true
}
lastStateInList = true
lastStatePtr = state
}
}
if assertionFailed && numStatesMatched == 0 { // Nothing has matched and an assertion has failed
// If I'm being completely honest, I'm not sure why I have to check specifically for a _lookaround_
// state. The explanation below is my attempt to explain this behavior.
// If you replace 'lastLookaroundInList' with 'lastStateInList', one of the test cases fails.
//
// One of the states in our list was a last state and a lookaround. In this case, we
// don't abort upon failure of the assertion, because we have found
// another path to a final state.
// Even if the last state _was_ an assertion, we can use the previously
// saved indices to find a match.
if lastLookaroundInList {
break
} else {
// Make sure the next search doesn't start from the same index again
if i == startingFrom {
i++
}
return false, []Group{}, i
}
}
// Check if we can find a state in our list that is:
// a. A last-state
// b. Empty
// c. Doesn't assert anything
for _, s := range currentStates {
if s.isLast && s.isEmpty && s.assert == NONE {
lastStatePtr = s
lastStateInList = true
}
}
if lastStateInList { // A last-state was in the list of states. Record the match (and its capturing groups) in tempIndices
for j := 1; j < numGroups+1; j++ {
tempIndices[j] = lastStatePtr.threadGroups[j]
}
endIdx = i
tempIndices[0] = Group{startIdx, endIdx}
}
// Check if we can find a zero-length match
if foundPath == false {
if ok := zeroMatchPossible(str, i, numGroups, currentStates...); ok {
if tempIndices[0].isValid() == false {
tempIndices[0] = Group{startIdx, startIdx}
}
}
// If we haven't moved in the string, increment the counter by 1
// to ensure we don't keep trying the same string over and over.
// if i == startingFrom {
startIdx++
// i++
// }
if tempIndices.numValidGroups() > 0 && tempIndices[0].isValid() {
if tempIndices[0].startIdx == tempIndices[0].endIdx { // If we have a zero-length match, we have to shift the index at which we start. Otherwise we keep looking at the same part of the string over and over.
return true, tempIndices, tempIndices[0].endIdx + 1
} else {
return true, tempIndices, tempIndices[0].endIdx
}
}
return false, []Group{}, startIdx
}
// The states we transitioned to become the current states for the next character
currentStates = make([]*State, len(tempStates))
copy(currentStates, tempStates)
tempStates = nil
i++
}
// End-of-string reached. Go to any 0-states, until there are no more 0-states to go to. Then check if any of our states are in the end position.
// This is the exact same algorithm used inside the loop, so I should probably put it in a function.
zeroStates, isZero := takeZeroState(currentStates, numGroups, i)
tempStates = append(tempStates, zeroStates...)
num_appended := 0 // Number of unique states added to tempStates
for isZero == true {
zeroStates, isZero = takeZeroState(tempStates, numGroups, i)
tempStates, num_appended = unique_append(tempStates, zeroStates...)
if num_appended == 0 { // Break if we haven't appended any more unique values
break
}
}
currentStates = append(currentStates, tempStates...)
tempStates = nil
for _, state := range currentStates {
// Only add the match if the start index is in bounds. If the state has an assertion,
// make sure the assertion checks out.
if state.isLast && i <= len(str) {
if state.assert == NONE || state.checkAssertion(str, i) {
for j := 1; j < numGroups+1; j++ {
tempIndices[j] = state.threadGroups[j]
}
endIdx = i
tempIndices[0] = Group{startIdx, endIdx}
}
}
}
if tempIndices.numValidGroups() > 0 {
if tempIndices[0].startIdx == tempIndices[0].endIdx { // If we have a zero-length match, we have to shift the index at which we start. Otherwise we keep looking at the same part of the string over and over.
return true, tempIndices, tempIndices[0].endIdx + 1
} else {
return true, tempIndices, tempIndices[0].endIdx
}
}
if startIdx == startingFrom { // Increment starting index if we haven't moved in the string. Prevents us from matching the same part of the string over and over.
startIdx++
}
return false, []Group{}, startIdx
}

View File

@@ -1,4 +1,4 @@
package regex
package main
import (
"slices"
@@ -8,16 +8,16 @@ import (
var whitespaceChars = []rune{' ', '\t', '\n'}
var digitChars = []rune{'0', '1', '2', '3', '4', '5', '6', '7', '8', '9'}
var wordChars = []rune("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_")
var lbracketRune rune = 0xF0002
var rbracketRune rune = 0xF0003
var anyCharRune rune = 0xF0004 // Represents any character - used for states where the allChars flag is on.
var lparenRune rune = 0xF0005 // Parentheses in regex are concatenated with this - it acts as a pseudio-parentheses
var rparenRune rune = 0xF0006
var nonCapLparenRune rune = 0xF0007 // Represents a non-capturing group's LPAREN
var escBackslashRune rune = 0xF0008 // Represents an escaped backslash
var charRangeRune rune = 0xF0009 // Represents a character range
var LBRACKET rune = 0xF0001
var RBRACKET rune = 0xF0002
var ANY_CHAR rune = 0xF0003 // Represents any character - used for states where the allChars flag is on.
var LPAREN_CHAR rune = 0xF0004 // Parentheses in regex are concatenated with this - it acts as a pseudio-parentheses
var RPAREN_CHAR rune = 0xF0005
var NONCAPLPAREN_CHAR rune = 0xF0006 // Represents a non-capturing group's LPAREN
var ESC_BACKSLASH rune = 0xF0007 // Represents an escaped backslash
var CHAR_RANGE rune = 0xF0008 // Represents a character range
var specialChars = []rune{'?', '*', '\\', '^', '$', '{', '}', '(', ')', '[', ']', '+', '|', '.', concatRune, '<', '>', lbracketRune, rbracketRune, nonCapLparenRune}
var specialChars = []rune{'?', '*', '\\', '^', '$', '{', '}', '(', ')', '[', ']', '+', '|', '.', '~', '<', '>', LBRACKET, RBRACKET, NONCAPLPAREN_CHAR}
// An interface for int and rune, which are identical
type character interface {
@@ -48,9 +48,60 @@ func isNormalChar(c rune) bool {
return !slices.Contains(specialChars, c)
}
// assert panics with the message "Assertion Failed" if the given condition
// does not hold. It does nothing otherwise.
func assert(cond bool) {
	if cond {
		return
	}
	panic("Assertion Failed")
}
// deleteFromSlice returns a new slice holding the elements of slc, in order,
// with every occurrence of val left out. The given slice is not modified.
func deleteFromSlice[T comparable](slc []T, val T) []T {
	kept := make([]T, 0, len(slc))
	for i := range slc {
		if slc[i] == val {
			continue
		}
		kept = append(kept, slc[i])
	}
	return kept
}
// unique_append appends each of the given items to the slice, skipping any item
// that the slice already contains (this includes items appended earlier in the
// same call). It returns the resulting slice, and the number of items that were
// actually appended.
func unique_append[T comparable](slc []T, items ...T) ([]T, int) {
	before := len(slc)
	for i := range items {
		if slices.Index(slc, items[i]) >= 0 {
			continue
		}
		slc = append(slc, items[i])
	}
	return slc, len(slc) - before
}
// allEqual returns true only if all the given elements are equal to each other.
// When called with no elements (or just one), there is nothing that could
// differ, so it returns true.
func allEqual[T comparable](items ...T) bool {
	// Guard against indexing into an empty argument list
	if len(items) == 0 {
		return true
	}
	first := items[0]
	for _, item := range items[1:] {
		if item != first {
			return false
		}
	}
	return true
}
// setDifference returns a new slice with every element of s1 that does not
// occur in s2. The order of s1 is kept, and so are duplicates within s1.
func setDifference[T comparable](s1 []T, s2 []T) []T {
	remaining := make([]T, 0, len(s1))
	for i := range s1 {
		if slices.Index(s2, s1[i]) >= 0 {
			continue
		}
		remaining = append(remaining, s1[i])
	}
	return remaining
}
// Map function - convert a slice of T to a slice of V, based on a function
// that maps a T to a V
func funcMap[T, V any](slc []T, fn func(T) V) []V {
func Map[T, V any](slc []T, fn func(T) V) []V {
toReturn := make([]V, len(slc))
for i, val := range slc {
toReturn[i] = fn(val)
@@ -60,7 +111,7 @@ func funcMap[T, V any](slc []T, fn func(T) V) []V {
// Reduce function - reduces a slice of a type into a value of the type,
// based on the given function.
func funcReduce[T any](slc []T, fn func(T, T) T) T {
func Reduce[T any](slc []T, fn func(T, T) T) T {
if len(slc) == 0 {
panic("Reduce on empty slice.")
}
@@ -73,30 +124,29 @@ func funcReduce[T any](slc []T, fn func(T, T) T) T {
return slc[0]
}
// Generate numbers in a range - start to end (both inclusive)
func genRangeInclusive[T character](start, end T) []T {
toRet := make([]T, (end-start)+1)
for i := start; i <= end; i++ {
// Generate numbers in a range - start (inclusive) to end (exclusive)
func genRange[T character](start, end T) []T {
toRet := make([]T, end-start)
for i := start; i < end; i++ {
toRet[i-start] = i
}
return toRet
}
// Returns a rune-slice containing all possible cases of the given rune, given the
// 'caseInsensitive' boolean variable.
// If this variable is false, the rune is returned as-is, without modifications.
// If it is true, then we return all possible cases of the
// rune.
// Returns a rune-slice containing all possible cases of the given rune.
// At the moment, this includes:
// 1. Upper case
// 2. Lower case
// 3. Title case
func allCases(r rune, caseInsensitive bool) []rune {
if caseInsensitive {
return []rune{unicode.ToLower(r), unicode.ToUpper(r), unicode.ToTitle(r)}
} else {
return []rune{r}
}
func allCases(r rune) []rune {
return []rune{unicode.ToLower(r), unicode.ToUpper(r), unicode.ToTitle(r)}
}
// expandSlice returns a new slice of length newSize that starts with the
// elements of slc. Any remaining positions hold the zero value of T. If
// newSize is smaller than len(slc), only the first newSize elements are kept.
func expandSlice[T any](slc []T, newSize int) []T {
	var resized []T = make([]T, newSize)
	_ = copy(resized, slc) // copy stops at the shorter of the two slices
	return resized
}
func isHex(c rune) bool {

331
nfa.go Normal file
View File

@@ -0,0 +1,331 @@
package main
import (
"slices"
)
// EPSILON is the key under which a State's transitions map stores the transitions that
// are followed without looking at a character of the input (see takeZeroState).
const EPSILON int = 0xF0000
// assertType identifies the kind of assertion that a State makes. Assertions are
// evaluated by State.checkAssertion.
type assertType int
const (
NONE assertType = iota // The state doesn't assert anything
SOS // Holds only at index 0 of the string
EOS // Holds at the end of the string, or just before a final newline
WBOUND // Holds when isWordBoundary reports true for the index
NONWBOUND // Holds when isWordBoundary reports false for the index
PLA // Positive lookahead
NLA // Negative lookahead
PLB // Positive lookbehind
NLB // Negative lookbehind
)
// State is a single node of the NFA that a regex is compiled into. States are linked
// to each other through their 'transitions' map. Some fields describe the (fixed) structure
// of the NFA, while the ones at the bottom are scratch space used while matching.
type State struct {
content stateContents // Contents of current state
isEmpty bool // If it is empty - Union operator and Kleene star states will be empty
isLast bool // If it is the last state (accept state)
output []*State // The outputs of the current state ie. the 'outward arrows'. A union operator state will have more than one of these.
transitions map[int][]*State // Transitions to different states (maps a character (int representation) to a _list_ of states. This is useful if one character can lead to multiple states eg. ab|aa)
isKleene bool // Identifies whether current node is a 0-state representing Kleene star
assert assertType // Type of assertion of current node - NONE means that the node doesn't assert anything
allChars bool // Whether or not the state represents all characters (eg. a 'dot' metacharacter). A 'dot' node doesn't store any contents directly, as it would take up too much space
except []rune // Only valid if allChars is true - match all characters _except_ the ones in this block. Useful for inverting character classes.
lookaroundRegex string // Only for lookaround states - Contents of the regex that the lookaround state holds
lookaroundNFA *State // Holds the NFA of the lookaroundRegex - if it exists
lookaroundNumCaptureGroups int // Number of capturing groups in lookaround regex if current node is a lookaround
groupBegin bool // Whether or not the node starts a capturing group
groupEnd bool // Whether or not the node ends a capturing group
groupNum int // Which capturing group the node starts / ends
// The following properties depend on the current match - I should think about resetting them for every match.
zeroMatchFound bool // Whether or not the state has been used for a zero-length match - only relevant for zero states
threadGroups []Group // Assuming that a state is part of a 'thread' in the matching process, this array stores the indices of capturing groups in the current thread. As matches are found for this state, its groups will be copied over.
}
// cloneState returns a copy of the NFA that is reachable from the given state.
// It is a thin wrapper around cloneStateHelper, which does the actual work.
func cloneState(start *State) *State {
	alreadyCloned := map[*State]*State{}
	return cloneStateHelper(start, alreadyCloned)
}
// Helper function for cloneState. It returns a copy of the given state, along with
// copies of every state reachable from it (through its outputs, its transitions and its
// lookaround NFA).
//
// The map is used to keep track of which states have already been copied, and which
// ones haven't: it maps an original state to its clone. This is what allows cycles
// (eg. a state that transitions to itself) to be cloned without recursing forever.
// A nil state is cloned as nil.
//
// The per-match field 'threadGroups' is not copied - the clone starts without any.
// This function was created using output from Llama3.1:405B.
func cloneStateHelper(state *State, cloneMap map[*State]*State) *State {
	if state == nil {
		return nil
	}
	// Base case - if the clone exists in our map, return it.
	if clone, exists := cloneMap[state]; exists {
		return clone
	}
	// Recursive case - if the clone doesn't exist, create it, add it to the map,
	// and recursively call for each of the transition states.
	clone := &State{
		content:         append([]int{}, state.content...),
		isEmpty:         state.isEmpty,
		isLast:          state.isLast,
		output:          make([]*State, len(state.output)),
		transitions:     make(map[int][]*State),
		isKleene:        state.isKleene,
		assert:          state.assert,
		zeroMatchFound:  state.zeroMatchFound,
		allChars:        state.allChars,
		except:          append([]rune{}, state.except...),
		lookaroundRegex: state.lookaroundRegex,
		// Without this, a cloned lookaround would claim to have zero capturing groups,
		// even though its NFA still contains group states.
		lookaroundNumCaptureGroups: state.lookaroundNumCaptureGroups,
		groupEnd:                   state.groupEnd,
		groupBegin:                 state.groupBegin,
		groupNum:                   state.groupNum,
	}
	// Register the clone _before_ recursing, so that any reference back to 'state'
	// (including a self-reference) resolves to the clone we just created.
	cloneMap[state] = clone
	for i, s := range state.output {
		clone.output[i] = cloneStateHelper(s, cloneMap)
	}
	for k, v := range state.transitions {
		clone.transitions[k] = make([]*State, len(v))
		for i, s := range v {
			clone.transitions[k][i] = cloneStateHelper(s, cloneMap)
		}
	}
	clone.lookaroundNFA = cloneStateHelper(state.lookaroundNFA, cloneMap)
	return clone
}
// checkAssertion reports whether the state's assertion holds at index idx of
// the given string. A state that doesn't assert anything always passes.
//
// For lookarounds, the lookaround's NFA is run (with FindAllMatches) on the
// part of the string after idx (lookahead) or before idx (lookbehind):
//   - a lookahead is satisfied by a match starting at the very beginning of
//     that part, ie. at idx;
//   - a lookbehind is satisfied by a match ending at idx.
//
// Positive lookarounds pass if such a match exists, negative ones pass if it
// doesn't.
func (s State) checkAssertion(str []rune, idx int) bool {
	switch s.assert {
	case SOS:
		return idx == 0
	case EOS:
		// Index is at the end of the string, or it points to the last character which is a newline
		atEnd := idx == len(str)
		return atEnd || (idx == len(str)-1 && str[len(str)-1] == '\n')
	case WBOUND:
		return isWordBoundary(str, idx)
	case NONWBOUND:
		return !isWordBoundary(str, idx)
	case PLA, NLA, PLB, NLB:
		lookingAhead := s.assert == PLA || s.assert == NLA
		var window []rune
		if lookingAhead {
			window = str[idx:]
		} else {
			window = str[:idx]
		}
		// Converting an empty rune-slice gives us an empty string
		found := FindAllMatches(Reg{s.lookaroundNFA, s.lookaroundNumCaptureGroups}, string(window))
		satisfied := false
		for _, m := range found {
			if lookingAhead {
				// Zero is used because the window _starts_ from idx
				if m[0].startIdx == 0 {
					satisfied = true
					break
				}
			} else if m[0].endIdx == idx {
				satisfied = true
				break
			}
		}
		if s.assert == PLA || s.assert == PLB { // Positive assertions want at least one match
			return satisfied
		}
		return !satisfied // Negative assertions only want zero matches
	}
	return true
}
// contentContains reports whether the state accepts the character at index idx
// of the given string:
//   - a state with an assertion accepts if the assertion holds at idx;
//   - an 'all characters' state accepts anything that is neither in notDotChars
//     nor in the state's exception list;
//   - any other state accepts the characters listed in its content.
func (s State) contentContains(str []rune, idx int) bool {
	switch {
	case s.assert != NONE:
		return s.checkAssertion(str, idx)
	case s.allChars:
		rejected := slices.Concat(notDotChars, s.except)
		return !slices.Contains(rejected, str[idx])
	default:
		return slices.Contains(s.content, int(str[idx]))
	}
}
// isLookaround reports whether the state's assertion is one of the four
// lookaround kinds (positive / negative lookahead / lookbehind).
func (s State) isLookaround() bool {
	switch s.assert {
	case PLA, PLB, NLA, NLB:
		return true
	}
	return false
}
// matchesFor returns the states that can be reached from 's' by consuming the
// character at index idx of the given string, together with how many there are.
//
// Assertions can be viewed as 'checks'. If the state has an assertion and the
// check fails, an empty slice and -1 are returned. If it passes, the state is
// treated like any other state.
func (s State) matchesFor(str []rune, idx int) ([]*State, int) {
	if s.assert != NONE && !s.checkAssertion(str, idx) {
		return make([]*State, 0), -1
	}
	ch := str[idx]
	reachable := s.transitions[int(ch)]
	for _, wildcard := range s.transitions[int(ANY_CHAR)] {
		// An allChar state is only reachable if the current character:
		// a. Isn't a 'notDotChars' character. In single line mode, this includes newline. In multiline mode, it doesn't.
		// b. Isn't in the state's exception list.
		if slices.Contains(slices.Concat(notDotChars, wildcard.except), ch) {
			continue
		}
		reachable = append(reachable, wildcard)
	}
	return reachable, len(reachable)
}
// NFA bundles a start state with a list of output states.
// NOTE(review): no code in this part of the file uses this type, so the
// meaning of each field below is presumed from its name - confirm with callers.
type NFA struct {
// Presumably the entry state of the automaton.
start State
// Presumably the states at which the automaton ends.
outputs []State
}
// verifyLastStatesHelper walks the transition graph depth-first, starting at
// state, and sets isLast on the states it recognises as last states.
// visited records the states whose outgoing transitions have already been
// followed, so that cycles in the graph do not recurse forever.
func verifyLastStatesHelper(state *State, visited map[*State]bool) {
	// A state with no outgoing transitions is trivially a last state.
	if len(state.transitions) == 0 {
		state.isLast = true
		return
	}

	// Exactly one transition key (Eg. a*): the state is a last state when,
	// for each of its content values, the only destination is the state itself.
	if len(state.transitions) == 1 {
		onlySelfLoops := true
		for _, c := range state.content {
			dests := state.transitions[c]
			if len(dests) != 1 || dests[0] != state {
				onlySelfLoops = false
				break
			}
		}
		state.isLast = onlySelfLoops
	}

	// A Kleene-star state has outgoing transitions that loop back to it. When
	// every destination is one and the same state, it must be a last state.
	if state.isKleene {
		allDests := make([]*State, 0)
		for _, group := range state.transitions {
			allDests = append(allDests, group...)
		}
		if allEqual(allDests...) {
			state.isLast = true
			return
		}
	}

	if visited[state] {
		return
	}
	visited[state] = true

	for _, group := range state.transitions {
		for _, next := range group {
			if next == state {
				continue // Self-loops need no further descent.
			}
			verifyLastStatesHelper(next, visited)
		}
	}
}
// verifyLastStates enables the 'isLast' flag for the leaf nodes (last states)
// reachable from the first state in start.
//
// Only start[0] is used as the root of the traversal. An empty slice, or one
// whose first element is nil, is treated as a no-op instead of panicking.
func verifyLastStates(start []*State) {
	if len(start) == 0 || start[0] == nil {
		return
	}
	verifyLastStatesHelper(start[0], make(map[*State]bool))
}
// concatenate joins s2 onto the end of s1 and returns the start of the
// combined chain.
//
// Each output state of s1 gains a transition to s2 for every value in s2's
// content, after which s1 takes over s2's outputs. When s1 is nil there is
// nothing to prepend, so s2 is returned as-is.
func concatenate(s1 *State, s2 *State) *State {
	if s1 == nil {
		return s2
	}
	for _, tail := range s1.output {
		for _, c := range s2.content {
			tail.transitions[c], _ = unique_append(tail.transitions[c], s2)
		}
	}
	s1.output = s2.output
	return s1
}
// kleene builds the state representing zero or more repetitions of s1 and
// returns it.
//
// The new state is an empty (EPSILON) state flagged as a Kleene state, and it
// is its own single output. Every output of s1 is wired back to the new
// state, and the new state is wired to s1 for each value in s1's content.
//
// Note that s1 is received by value: the transitions created here point at
// this function's copy of s1, not at the caller's variable.
func kleene(s1 State) *State {
	star := &State{
		transitions: make(map[int][]*State),
		content:     newContents(EPSILON),
		isEmpty:     true,
		isKleene:    true,
	}
	star.output = append(star.output, star)

	// Loop back: from the end of s1 into the star state.
	for _, tail := range s1.output {
		for _, c := range star.content {
			tail.transitions[c], _ = unique_append(tail.transitions[c], star)
		}
	}

	// Enter: from the star state into s1.
	inner := &s1
	for _, c := range s1.content {
		star.transitions[c], _ = unique_append(star.transitions[c], inner)
	}
	return star
}
// alternate builds the state representing "s1 or s2" and returns it.
//
// The result is an empty (EPSILON) state whose outputs are the outputs of s1
// followed by those of s2, and which has a transition to each alternative for
// every value in that alternative's content.
func alternate(s1 *State, s2 *State) *State {
	branch := &State{
		transitions: make(map[int][]*State),
		content:     newContents(EPSILON),
		isEmpty:     true,
	}
	branch.output = append(branch.output, s1.output...)
	branch.output = append(branch.output, s2.output...)

	// unique_append keeps a state from being listed more than once under the
	// same transition key. A duplicate entry would make that state count as a
	// match twice and produce repeated sets of match indices.
	for _, arm := range []*State{s1, s2} {
		for _, c := range arm.content {
			branch.transitions[c], _ = unique_append(branch.transitions[c], arm)
		}
	}
	return branch
}
// question builds the state for an optional s1, relying on the identity
// ab? == a(b|): s1 is alternated with a fresh empty (EPSILON) state that is
// its own single output.
func question(s1 *State) *State {
	empty := &State{
		transitions: make(map[int][]*State),
		content:     newContents(EPSILON),
		isEmpty:     true,
	}
	empty.output = append(empty.output, empty)
	return alternate(s1, empty)
}
// newState returns a State populated with the 'default' values: an empty
// transition map, no assertion (NONE), an exception list holding only the
// rune 0, and an output list with a single entry. All remaining fields keep
// their zero values.
//
// NOTE(review): the single output entry is the address of the local variable
// inside this function, while the caller receives a copy of it - confirm that
// callers do not rely on output[0] being the address of the returned value.
func newState() State {
	var ret State
	ret.transitions = make(map[int][]*State)
	ret.assert = NONE
	ret.except = append([]rune{}, 0)
	ret.output = append(make([]*State, 0), &ret)
	return ret
}

View File

@@ -1,8 +1,8 @@
package regex
package main
import "fmt"
type nodeType int
type NodeType int
// This is a slice containing all escapable characters that have special meaning.
// Eg. \b is word boundary, \w is word character etc.
@@ -10,28 +10,28 @@ var escapedChars []rune = []rune("wWdDbBnaftrvsS0")
// This is a list of the possible node types
const (
characterNode nodeType = iota
charclassNode
pipeNode
concatenateNode
kleeneNode
questionNode
plusNode
assertionNode
lparenNode
rparenNode
CHARACTER NodeType = iota
CHARCLASS
PIPE
CONCATENATE
KLEENE
QUESTION
PLUS
ASSERTION
LPAREN
RPAREN
)
// Helper constants for lookarounds
const positive = 1
const negative = -1
const lookahead = 1
const lookbehind = -1
const POSITIVE = 1
const NEGATIVE = -1
const LOOKAHEAD = 1
const LOOKBEHIND = -1
var infinite_reps int = -1 // Represents infinite reps eg. the end range in {5,}
var INFINITE_REPS int = -1 // Represents infinite reps eg. the end range in {5,}
// This represents a node in the postfix representation of the expression
type postfixNode struct {
nodetype nodeType
nodetype NodeType
contents []rune // Contents of the node
startReps int // Minimum number of times the node should be repeated - used with numeric specifiers
endReps int // Maximum number of times the node should be repeated - used with numeric specifiers
@@ -49,12 +49,12 @@ type postfixNode struct {
// it will not match.
func newCharClassNode(nodes []postfixNode, negated bool) postfixNode {
rtv := postfixNode{}
rtv.nodetype = charclassNode
rtv.nodetype = CHARCLASS
rtv.startReps = 1
rtv.endReps = 1
if negated {
rtv.nodetype = characterNode
rtv.contents = []rune{anyCharRune}
rtv.nodetype = CHARACTER
rtv.contents = []rune{ANY_CHAR}
rtv.allChars = true
rtv.except = nodes
} else {
@@ -70,74 +70,64 @@ func newEscapedNode(c rune, inCharClass bool) (postfixNode, error) {
toReturn.endReps = 1
switch c {
case 's': // Whitespace
toReturn.nodetype = characterNode
toReturn.nodetype = CHARACTER
toReturn.contents = append(toReturn.contents, whitespaceChars...)
case 'S': // Non-whitespace
toReturn = newPostfixDotNode()
toReturn.except = append([]postfixNode{}, newPostfixNode(whitespaceChars...))
case 'd': // Digits
toReturn.nodetype = characterNode
toReturn.nodetype = CHARACTER
toReturn.contents = append(toReturn.contents, digitChars...)
case 'D': // Non-digits
toReturn = newPostfixDotNode()
toReturn.except = append([]postfixNode{}, newPostfixNode(digitChars...))
case 'w': // word character
toReturn.nodetype = characterNode
toReturn.nodetype = CHARACTER
toReturn.contents = append(toReturn.contents, wordChars...)
case 'W': // Non-word character
toReturn = newPostfixDotNode()
toReturn.except = append([]postfixNode{}, newPostfixNode(wordChars...))
case 'b', 'B':
if c == 'b' && inCharClass {
toReturn.nodetype = characterNode
toReturn.nodetype = CHARACTER
toReturn.contents = append(toReturn.contents, rune(8))
} else {
toReturn.nodetype = assertionNode
toReturn.contents = append(toReturn.contents, c)
}
if c == 'B' && inCharClass { // Invalid
return postfixNode{}, fmt.Errorf("word boundaries are not allowed in character class")
}
case 'A', 'z': // A is start of input, z is end of input (regardless of RE_MULTILINE)
if inCharClass {
return postfixNode{}, fmt.Errorf("input boundaries are not allowed in character class")
} else {
toReturn.nodetype = assertionNode
toReturn.nodetype = ASSERTION
toReturn.contents = append(toReturn.contents, c)
}
case 'n': // Newline character
toReturn.nodetype = characterNode
toReturn.nodetype = CHARACTER
toReturn.contents = append(toReturn.contents, '\n')
case '0': // NULL character
toReturn.nodetype = characterNode
toReturn.nodetype = CHARACTER
toReturn.contents = append(toReturn.contents, rune(0))
case 'a': // Bell character
toReturn.nodetype = characterNode
toReturn.nodetype = CHARACTER
toReturn.contents = append(toReturn.contents, rune(7))
case 'f': // Form feed character
toReturn.nodetype = characterNode
toReturn.nodetype = CHARACTER
toReturn.contents = append(toReturn.contents, rune(12))
case 't': // Horizontal tab character
toReturn.nodetype = characterNode
toReturn.nodetype = CHARACTER
toReturn.contents = append(toReturn.contents, rune(9))
case 'r': // Carriage return
toReturn.nodetype = characterNode
toReturn.nodetype = CHARACTER
toReturn.contents = append(toReturn.contents, rune(13))
case 'v': // Vertical tab
toReturn.nodetype = characterNode
toReturn.nodetype = CHARACTER
toReturn.contents = append(toReturn.contents, rune(11))
case '-': // Literal hyphen - only in character class
if inCharClass {
toReturn.nodetype = characterNode
toReturn.nodetype = CHARACTER
toReturn.contents = append(toReturn.contents, '-')
} else {
return postfixNode{}, fmt.Errorf("invalid escape character")
return postfixNode{}, fmt.Errorf("Invalid escape character.")
}
default: // None of the above - append it as a regular character
if isNormalChar(c) { // Normal characters cannot be escaped
return postfixNode{}, fmt.Errorf("invalid escape character")
return postfixNode{}, fmt.Errorf("Invalid escape character.")
}
toReturn.nodetype = characterNode
toReturn.nodetype = CHARACTER
toReturn.contents = append(toReturn.contents, c)
}
return toReturn, nil
@@ -152,37 +142,37 @@ func newPostfixNode(contents ...rune) postfixNode {
to_return.startReps = 1
to_return.endReps = 1
if len(contents) > 1 { // If the node has more than element, it must be a character class - the type must be CHARACTER
to_return.nodetype = characterNode
to_return.nodetype = CHARACTER
to_return.contents = contents
} else { // Node has one element, could be anything
switch contents[0] {
case '+':
to_return.nodetype = plusNode
to_return.nodetype = PLUS
case '?':
to_return.nodetype = questionNode
to_return.nodetype = QUESTION
case '*':
to_return.nodetype = kleeneNode
to_return.nodetype = KLEENE
case '|':
to_return.nodetype = pipeNode
case concatRune:
to_return.nodetype = concatenateNode
to_return.nodetype = PIPE
case CONCAT:
to_return.nodetype = CONCATENATE
case '^', '$':
to_return.nodetype = assertionNode
to_return.nodetype = ASSERTION
case '(':
to_return.nodetype = lparenNode
to_return.nodetype = LPAREN
case ')':
to_return.nodetype = rparenNode
to_return.nodetype = RPAREN
default:
to_return.nodetype = characterNode
to_return.nodetype = CHARACTER
}
to_return.contents = append(to_return.contents, contents...)
// Special cases for LPAREN and RPAREN - they have special characters defined for them
if to_return.nodetype == lparenNode {
to_return.contents = []rune{lparenRune}
if to_return.nodetype == LPAREN {
to_return.contents = []rune{LPAREN_CHAR}
}
if to_return.nodetype == rparenNode {
to_return.contents = []rune{rparenRune}
if to_return.nodetype == RPAREN {
to_return.contents = []rune{RPAREN_CHAR}
}
}
return to_return
@@ -193,9 +183,9 @@ func newPostfixDotNode() postfixNode {
toReturn := postfixNode{}
toReturn.startReps = 1
toReturn.endReps = 1
toReturn.nodetype = characterNode
toReturn.nodetype = CHARACTER
toReturn.allChars = true
toReturn.contents = []rune{anyCharRune}
toReturn.contents = []rune{ANY_CHAR}
return toReturn
}
@@ -204,7 +194,7 @@ func newPostfixCharNode(contents ...rune) postfixNode {
toReturn := postfixNode{}
toReturn.startReps = 1
toReturn.endReps = 1
toReturn.nodetype = characterNode
toReturn.nodetype = CHARACTER
toReturn.contents = append(toReturn.contents, contents...)
return toReturn
}

View File

@@ -1,11 +1,9 @@
package regex
package main
import (
"fmt"
"math"
"slices"
"strconv"
"strings"
)
type numRange struct {
@@ -46,11 +44,11 @@ func intToSlc(val int) []int {
return toRet
}
func range2regex(start int, end int) (string, error) {
func range2regex(start int, end int) string {
rangeStart := start
rangeEnd := end
if rangeStart > rangeEnd {
return "", fmt.Errorf("numeric range start greater than range end")
panic("Range start greater than range end.")
}
ranges := make([]numRange, 0)
@@ -101,39 +99,28 @@ func range2regex(start int, end int) (string, error) {
// Last range - tmp to rangeEnd
ranges = append(ranges, numRange{tmp, rangeEnd})
regexSlice := make([]string, 0)
regex := "("
// Generate the regex
for _, rg := range ranges {
tmpStr := ""
tmpStr += string(nonCapLparenRune)
for i, rg := range ranges {
if i > 0 {
regex += "|"
}
regex += "("
startSlc := intToSlc(rg.start)
endSlc := intToSlc(rg.end)
if len(startSlc) != len(endSlc) {
return "", fmt.Errorf("error parsing numeric range")
panic("Ranges have unequal lengths.")
}
for i := range startSlc {
if startSlc[i] == endSlc[i] {
tmpStr += string(rune(startSlc[i] + 48)) // '0' is ascii value 48, 1 is 49 etc. To convert the digit to its character form, we can just add 48.
regex += string(rune(startSlc[i] + 48)) // '0' is ascii value 48, 1 is 49 etc. To convert the digit to its character form, we can just add 48.
} else {
tmpStr += fmt.Sprintf("%c%c-%c%c", lbracketRune, rune(startSlc[i]+48), rune(endSlc[i]+48), rbracketRune)
regex += fmt.Sprintf("[%c-%c]", rune(startSlc[i]+48), rune(endSlc[i]+48))
}
}
tmpStr += ")"
regexSlice = append(regexSlice, tmpStr)
regex += ")"
}
// Each element of the slice represents one 'group'. Taking 0-255 as an example, the elements would be:
// 1. 0-9
// 2. 10-99
// 3. 100-199
// 4. 200-249
// 5. 250-255
//
// The reason this is reversed before joining it, is because it is incompatible with the PCRE rule for matching.
// The PCRE rule specifies that the left-branch of an alternation is preferred. Even though this engine uses the POSIX
// rule at the moment (which prefers the longest match regardless of the order of the alternation), reversing the string
// has no downsides. It doesn't affect POSIX matching, and it will reduce my burden if I decide to switch to PCRE matching.
slices.Reverse(regexSlice)
regex := string(nonCapLparenRune) + strings.Join(regexSlice, "|") + ")"
return regex, nil
regex += ")"
return regex
}

339
re_test.go Normal file
View File

@@ -0,0 +1,339 @@
package main
import (
"fmt"
"slices"
"testing"
)
// reTests is the table shared by TestFindAllMatches and TestFindString.
// Each entry compiles 're' with 'flags', runs it against 'str', and compares
// the whole-match (zero-group) indices with 'result'.
var reTests = []struct {
// Pattern handed to Compile.
re string
// Flags handed to Compile (nil for none).
flags []ReFlag
// Input string that is searched.
str string
// A nil value is the only case in which the tests tolerate an error from Compile.
result []Group // Stores all zero-groups in the match
}{
{"a", nil, "abc", []Group{{0, 1}}},
{"a", nil, "bca", []Group{{2, 3}}},
{"l", nil, "ggllgg", []Group{{2, 3}, {3, 4}}},
{"(b|c)", nil, "abdceb", []Group{{1, 2}, {3, 4}, {5, 6}}},
{"a+", nil, "brerereraaaaabbbbb", []Group{{8, 13}}},
{"ab+", nil, "qweqweqweaqweqweabbbbbr", []Group{{16, 22}}},
{"(b|c|A)", nil, "ooaoobocA", []Group{{5, 6}, {7, 8}, {8, 9}}},
{"ab*", nil, "a", []Group{{0, 1}}},
{"ab*", nil, "abb", []Group{{0, 3}}},
{"a*b", nil, "aaab", []Group{{0, 4}}},
{"a*b", nil, "qwqw", []Group{}},
{"(abc)*", nil, "abcabcabc", []Group{{0, 9}, {9, 9}}},
{"((abc)|(def))*", nil, "abcdef", []Group{{0, 6}, {6, 6}}},
{"(abc)*|(def)*", nil, "abcdef", []Group{{0, 3}, {3, 6}, {6, 6}}},
{"b*a*a", nil, "bba", []Group{{0, 3}}},
{"(ab)+", nil, "abcabddd", []Group{{0, 2}, {3, 5}}},
{"a(b(c|d)*)*", nil, "abccbd", []Group{{0, 6}}},
{"a(b|c)*d+", nil, "abccdd", []Group{{0, 6}}},
{"a*", nil, "", []Group{{0, 0}}},
{"a|b", nil, "c", []Group{}},
{"(a|b)*c", nil, "aabbc", []Group{{0, 5}}},
{"a(b|b)", nil, "ab", []Group{{0, 2}}},
{"a*", nil, "aaaaaaaa", []Group{{0, 8}, {8, 8}}},
{"ab?", nil, "ab", []Group{{0, 2}}},
{"a?b", nil, "ab", []Group{{0, 2}}},
{"a?", nil, "", []Group{{0, 0}}},
{"a?b?c", nil, "a", []Group{}},
{"a?b?c?", nil, "ab", []Group{{0, 2}, {2, 2}}},
{"a?b?c?", nil, "ac", []Group{{0, 2}, {2, 2}}},
{"a?b?c", nil, "abc", []Group{{0, 3}}},
{"a?b?c", nil, "acb", []Group{{0, 2}}},
{"[abc]", nil, "defadefbdefce", []Group{{3, 4}, {7, 8}, {11, 12}}},
{"[ab]c", nil, "ab", []Group{}},
{"g[ab]c", nil, "gac", []Group{{0, 3}}},
{"g[ab]c", nil, "gbc", []Group{{0, 3}}},
{"g[ab]c", nil, "gc", []Group{}},
{"g[ab]c", nil, "gfc", []Group{}},
{"[ab]*", nil, "aabbbabaababab", []Group{{0, 14}, {14, 14}}},
{"[ab]+", nil, "aabbbablaababab", []Group{{0, 7}, {8, 15}}},
{"[Ff]r[Uu]it", nil, "fruit", []Group{{0, 5}}},
{"[Ff]r[Uu]it", nil, "FrUit", []Group{{0, 5}}},
{"[Ff]r[Uu|]it", nil, "Fr|it", []Group{{0, 5}}},
{"[Ff]r([Uu]|[pP])it", nil, "Frpit", []Group{{0, 5}}},
{"[Ff]r[Uu]|[pP]it", nil, "Frpit", []Group{{2, 5}}},
{"[a-zA-Z]+", nil, "Hello, how is it going?", []Group{{0, 5}, {7, 10}, {11, 13}, {14, 16}, {17, 22}}},
{".+", nil, "Hello, how is it going?", []Group{{0, 23}}},
{"a.", nil, "a ", []Group{{0, 2}}},
{"a.b", nil, "a/b", []Group{{0, 3}}},
{".", nil, "a ", []Group{{0, 1}, {1, 2}}},
{"a.", nil, "a ", []Group{{0, 2}}},
{".+b", nil, "abc", []Group{{0, 2}}},
{`\d`, nil, "1a0a3s'''34343s", []Group{{0, 1}, {2, 3}, {4, 5}, {9, 10}, {10, 11}, {11, 12}, {12, 13}, {13, 14}}},
{`\\`, nil, `a\b\c\qwe\`, []Group{{1, 2}, {3, 4}, {5, 6}, {9, 10}}},
{`\W`, nil, `"Hello", he said. How are you doing?`, []Group{{0, 1}, {6, 7}, {7, 8}, {8, 9}, {11, 12}, {16, 17}, {17, 18}, {21, 22}, {25, 26}, {29, 30}, {35, 36}}},
{`\w`, nil, ";';';';';'qwe12", []Group{{10, 11}, {11, 12}, {12, 13}, {13, 14}, {14, 15}}},
{`\s`, nil, "a b c d", []Group{{1, 2}, {3, 4}, {5, 6}, {6, 7}}},
{`\<`, nil, "<HTML><body>", []Group{{0, 1}, {6, 7}}},
{`\(.+\)`, nil, "Not (paranthesized), (so) is (this) not", []Group{{4, 35}}},
{"[^abc]+", nil, "qarbtopsaplpclkpasdmb prejip0r,p", []Group{{0, 1}, {2, 3}, {4, 8}, {9, 12}, {13, 16}, {17, 20}, {21, 32}}},
{"[^a]+", nil, "qqqaq", []Group{{0, 3}, {4, 5}}},
{"[^0-9]+", nil, "a1b2c3dd", []Group{{0, 1}, {2, 3}, {4, 5}, {6, 8}}},
{"[^abc]+", nil, "ababababbababaccacacacaca", []Group{}},
{`\[`, nil, "a[b[c[]]]", []Group{{1, 2}, {3, 4}, {5, 6}}},
{`\([^)]+\)`, nil, "Not (paranthesized), (so) is (this) not", []Group{{4, 19}, {21, 25}, {29, 35}}},
{"^ab", nil, "ab bab", []Group{{0, 2}}},
{"^aaaa^", nil, "aaaaaaaa", []Group{}},
{"^([bB][Gg])", nil, "bG", []Group{{0, 2}}},
{"b$", nil, "ba", []Group{}},
{"(boy|girl)$", nil, "girlf", []Group{}},
{`\bint\b`, nil, "print int integer", []Group{{6, 9}}},
{`int\b`, nil, "ints", []Group{}},
{`int(\b|a)`, nil, "inta", []Group{{0, 4}}},
{`\b\d+\b`, nil, "511 a3 43", []Group{{0, 3}, {7, 9}}},
{`\Bint\B`, nil, "prints int integer print", []Group{{2, 5}}},
{`^`, nil, "5^3^2", []Group{{0, 0}}},
{`\^`, nil, "5^3^2", []Group{{1, 2}, {3, 4}}},
{`pool$`, nil, "pool carpool", []Group{{8, 12}}},
{`^int$`, nil, "print int integer", []Group{}},
{`^int$`, nil, "int", []Group{{0, 3}}},
{`b*`, nil, "aaaaaaaaaaqweqwe", []Group{{0, 0}, {1, 1}, {2, 2}, {3, 3}, {4, 4}, {5, 5}, {6, 6}, {7, 7}, {8, 8}, {9, 9}, {10, 10}, {11, 11}, {12, 12}, {13, 13}, {14, 14}, {15, 15}, {16, 16}}},
{"a{4}", nil, "aabaaa", []Group{}},
{"ab{5}", nil, "abbbbbab", []Group{{0, 6}}},
{"(a|b){3,4}", nil, "aba", []Group{{0, 3}}},
{"(a|b){3,4}", nil, "ababaa", []Group{{0, 4}}},
{"(bc){5,}", nil, "bcbcbcbcbcbcbcbc", []Group{{0, 16}}},
{`\d{3,4}`, nil, "1209", []Group{{0, 4}}},
{`\d{3,4}`, nil, "109", []Group{{0, 3}}},
{`\d{3,4}`, nil, "5", []Group{}},
{`\d{3,4}`, nil, "123135", []Group{{0, 4}}},
{`\d{3,4}`, nil, "89a-0", []Group{}},
{`\d{3,4}`, nil, "ababab555", []Group{{6, 9}}},
{`\bpaint\b`, nil, "paints", []Group{}},
{`\b\w{5}\b`, nil, "paint", []Group{{0, 5}}},
{`[^\w]`, nil, "abcdef1230[]qq';;'", []Group{{10, 11}, {11, 12}, {14, 15}, {15, 16}, {16, 17}, {17, 18}}},
{`[^\W]`, nil, "abcdef1230[]qq';;'", []Group{{0, 1}, {1, 2}, {2, 3}, {3, 4}, {4, 5}, {5, 6}, {6, 7}, {7, 8}, {8, 9}, {9, 10}, {12, 13}, {13, 14}}},
{`[\[\]]`, nil, "a[b[l]]", []Group{{1, 2}, {3, 4}, {5, 6}, {6, 7}}},
// Unicode tests
{`.+`, nil, "úïäö´«åæïëòöê»éãçâï«úïòíñ", []Group{{0, 25}}},
{`a.b`, nil, "a²b", []Group{{0, 3}}},
{`[^a]+`, nil, "úïäö´«åæïëòöê»éãçâï«úïòíñ", []Group{{0, 25}}},
// Fun experiment - AI-generated tests
{"(abc|def|ghi)", nil, "abcdefg", []Group{{0, 3}, {3, 6}}},
{"a(b|c)d", nil, "abcd", []Group{}},
{"a(b|c)*d", nil, "abcbcd", []Group{{0, 6}}},
{"a(b|c)+d", nil, "abcbcd", []Group{{0, 6}}},
{"a(b|c)?d", nil, "abd", []Group{{0, 3}}},
{".+", nil, "hello world", []Group{{0, 11}}},
{"a.b", nil, "aXb", []Group{{0, 3}}},
{"a.*b", nil, "aXb", []Group{{0, 3}}},
{"a.{2,3}b", nil, "aXXb", []Group{{0, 4}}},
{"a.{2,}b", nil, "aXXXb", []Group{{0, 5}}},
{"a.{0,3}b", nil, "ab", []Group{{0, 2}}},
{"[abc]+", nil, "abcabc", []Group{{0, 6}}},
{"[a-zA-Z]+", nil, "HelloWorld", []Group{{0, 10}}},
{"[^abc]+", nil, "defghi", []Group{{0, 6}}},
{"^hello", nil, "hello world", []Group{{0, 5}}},
{"world$", nil, "hello world", []Group{{6, 11}}},
{`\bhello\b`, nil, "hello world", []Group{{0, 5}}},
{`\Bhello\B`, nil, "hello world", []Group{}},
{"(hello|world)", nil, "hello world", []Group{{0, 5}, {6, 11}}},
{"(hello|world)+", nil, "hello world", []Group{{0, 5}, {6, 11}}},
{"(hello|world)*", nil, "hello world", []Group{{0, 5}, {5, 5}, {6, 11}, {11, 11}}},
{"(hello|world)?", nil, "hello world", []Group{{0, 5}, {5, 5}, {6, 11}, {11, 11}}},
{"ú.+ï", nil, "úïäö´«åæïëòöê»éãçâï«úïòíñ", []Group{{0, 22}}},
{"(?=hello)", nil, "hello world", []Group{{0, 0}}},
{"(?!hello)", nil, "hello world", []Group{{1, 1}, {2, 2}, {3, 3}, {4, 4}, {5, 5}, {6, 6}, {7, 7}, {8, 8}, {9, 9}, {10, 10}, {11, 11}}},
{"(?<=hello)", nil, "hello world", []Group{{5, 5}}},
{"(?<!hello)", nil, "hello world", []Group{{0, 0}, {1, 1}, {2, 2}, {3, 3}, {4, 4}, {6, 6}, {7, 7}, {8, 8}, {9, 9}, {10, 10}, {11, 11}}},
{"^((3[7-9])|([4-9][0-9])|([1-9][0-9][0-9])|(1000))$", nil, "40", []Group{{0, 2}}},
{"^((3[7-9])|([4-9][0-9])|([1-9][0-9][0-9])|(1000))$", nil, "040", []Group{}},
{"^((3[7-9])|([4-9][0-9])|([1-9][0-9][0-9])|(1000))$", nil, "400", []Group{{0, 3}}},
{"^((3[7-9])|([4-9][0-9])|([1-9][0-9][0-9])|(1000))$", nil, "4000", []Group{}},
{"a{1,3}", nil, "aaaaa", []Group{{0, 3}, {3, 5}}},
{`\\[ab\\]`, nil, "a", []Group{}},
{`\\[ab\\]`, nil, `\a`, []Group{{0, 2}}},
// Lookaround tests
{"(?<=bo)y", nil, "boy", []Group{{2, 3}}},
{"bo(?=y)", nil, "boy", []Group{{0, 2}}},
{"(?<=f)f+(?=f)", nil, "fffff", []Group{{1, 4}}},
{"(?<=f)f+(?=f)", nil, "fffffa", []Group{{1, 4}}},
// Test cases from Python's RE test suite
{`[\1]`, nil, "\x01", []Group{{0, 1}}},
{`\0`, nil, "\x00", []Group{{0, 1}}},
{`[\0a]`, nil, "\x00", []Group{{0, 1}}},
{`[\0a]`, nil, "\x00", []Group{{0, 1}}},
{`[a\0]`, nil, "\x00", []Group{{0, 1}}},
{`[^a\0]`, nil, "\x00", []Group{}},
{`\a[\b]\f\n\r\t\v`, nil, "\a\b\f\n\r\t\v", []Group{{0, 7}}},
{`[\a][\b][\f][\n][\r][\t][\v]`, nil, "\a\b\f\n\r\t\v", []Group{{0, 7}}},
{`\u`, nil, "", nil},
{`\xff`, nil, "ÿ", []Group{{0, 1}}},
{`\x00ffffffffffffff`, nil, "\xff", []Group{}},
{`\x00f`, nil, "\x0f", []Group{}},
{`\x00fe`, nil, "\xfe", []Group{}},
{`^\w+=(\\[\000-\277]|[^\n\\])*`, nil, "SRC=eval.c g.c blah blah blah \\\\\n\tapes.c", []Group{{0, 32}}},
{`a.b`, nil, `acb`, []Group{{0, 3}}},
{`a.b`, nil, "a\nb", []Group{}},
{`a.*b`, nil, "acc\nccb", []Group{}},
{`a.{4,5}b`, nil, "acc\nccb", []Group{}},
{`a.b`, nil, "a\rb", []Group{{0, 3}}},
{`a.b`, []ReFlag{RE_MULTILINE}, "a\nb", []Group{{0, 3}}},
{`a.*b`, []ReFlag{RE_MULTILINE}, "acc\nccb", []Group{{0, 7}}},
{`a.{4,5}b`, []ReFlag{RE_MULTILINE}, "acc\nccb", []Group{{0, 7}}},
{`)`, nil, ``, nil},
{`^$`, nil, ``, []Group{{0, 0}}},
{`abc`, nil, `abc`, []Group{{0, 3}}},
{`abc`, nil, `xbc`, []Group{}},
{`abc`, nil, `axc`, []Group{}},
{`abc`, nil, `abx`, []Group{}},
{`abc`, nil, `xabcy`, []Group{{1, 4}}},
{`abc`, nil, `ababc`, []Group{{2, 5}}},
{`ab*c`, nil, `abc`, []Group{{0, 3}}},
{`ab*bc`, nil, `abc`, []Group{{0, 3}}},
{`ab*bc`, nil, `abbc`, []Group{{0, 4}}},
{`ab*bc`, nil, `abbbbc`, []Group{{0, 6}}},
{`ab+bc`, nil, `abbc`, []Group{{0, 4}}},
{`ab+bc`, nil, `abc`, []Group{}},
{`ab+bc`, nil, `abq`, []Group{}},
{`ab+bc`, nil, `abbbbc`, []Group{{0, 6}}},
{`ab?bc`, nil, `abbc`, []Group{{0, 4}}},
{`ab?bc`, nil, `abc`, []Group{{0, 3}}},
{`ab?bc`, nil, `abbbbc`, []Group{}},
{`ab?c`, nil, `abc`, []Group{{0, 3}}},
{`^abc$`, nil, `abc`, []Group{{0, 3}}},
{`^abc$`, nil, `abcc`, []Group{}},
{`^abc`, nil, `abcc`, []Group{{0, 3}}},
{`^abc$`, nil, `aabc`, []Group{}},
{`abc$`, nil, `aabc`, []Group{{1, 4}}},
{`^`, nil, `abc`, []Group{{0, 0}}},
{`$`, nil, `abc`, []Group{{3, 3}}},
{`a.c`, nil, `abc`, []Group{{0, 3}}},
{`a.c`, nil, `axc`, []Group{{0, 3}}},
{`a.*c`, nil, `axyzc`, []Group{{0, 5}}},
{`a.*c`, nil, `axyzd`, []Group{}},
{`a[bc]d`, nil, `abc`, []Group{}},
{`a[bc]d`, nil, `abd`, []Group{{0, 3}}},
{`a[b-d]e`, nil, `abd`, []Group{}},
{`a[b-d]e`, nil, `ace`, []Group{{0, 3}}},
{`a[b-d]`, nil, `aac`, []Group{{1, 3}}},
{`a[-b]`, nil, `a-`, []Group{{0, 2}}}, // If a character class has a hyphen without a start or end character, it is treated as a literal hyphen
{`a[\-b]`, nil, `a-`, []Group{{0, 2}}},
{`a[b-]`, nil, `a-`, []Group{{0, 2}}}, // If a character class has a hyphen without a start or end character, it is treated as a literal hyphen
{`a[]b`, nil, `-`, nil},
{`a[`, nil, `-`, nil},
{`a\`, nil, `-`, nil},
{`abc)`, nil, `-`, nil},
{`(abc`, nil, `-`, nil},
{`a]`, nil, `a]`, []Group{{0, 2}}},
// Todo - add numeric range tests
}
// groupTests is the table used by TestFindAllGroups. Each entry compiles 're'
// with 'flags', runs it against 'str', and compares the groups of every match
// with 'result'.
var groupTests = []struct {
// Pattern handed to Compile.
re string
// Flags handed to Compile (nil for none).
flags []ReFlag
// Input string that is searched.
str string
// Expected matches; result[i][j] is compared with group j of match i, and
// only for groups that report isValid().
result []Match
}{
{"(a)(b)", nil, "ab", []Match{[]Group{{0, 2}, {0, 1}, {1, 2}}}},
{"((a))(b)", nil, "ab", []Match{[]Group{{0, 2}, {0, 1}, {0, 1}, {1, 2}}}},
{"(0)", nil, "ab", []Match{[]Group{}}},
{"(a)b", nil, "ab", []Match{[]Group{{0, 2}, {0, 1}}}},
{"a(b)", nil, "ab", []Match{[]Group{{0, 2}, {1, 2}}}},
{"(a|b)", nil, "ab", []Match{[]Group{{0, 1}, {0, 1}}, []Group{{1, 2}, {1, 2}}}},
{"(a)|(b)", nil, "ab", []Match{[]Group{{0, 1}, {0, 1}, {-1, -1}}, []Group{{1, 2}, {-1, -1}, {1, 2}}}},
{"(a+)(a)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 3}, {3, 4}}}},
{"(a+)|(a)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 4}, {-1, -1}}}},
{"(a+)(aa)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 2}, {2, 4}}}},
{"(aaaa)|(aaaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 4}, {-1, -1}}}},
{"(aaa)|(aaaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {-1, -1}, {0, 4}}}},
{"(aaa)|(aaaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {-1, -1}, {0, 4}}}},
{"(aaaa)|(aaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 4}, {-1, -1}}}},
{"(a)|(aa)", nil, "aa", []Match{[]Group{{0, 2}, {-1, -1}, {0, 2}}}},
{"(a?)a?", nil, "b", []Match{[]Group{{0, 0}, {0, 0}}, []Group{{1, 1}, {1, 1}}}},
{"(a?)a?", nil, "ab", []Match{[]Group{{0, 1}, {0, 1}}, []Group{{1, 1}, {1, 1}}, []Group{{2, 2}, {2, 2}}}},
{"(a?)a?", nil, "aa", []Match{[]Group{{0, 2}, {0, 1}}, []Group{{2, 2}, {2, 2}}}},
{"a((b.d){3})", nil, "abfdbhdbid", []Match{[]Group{{0, 10}, {1, 10}, {7, 10}}}},
{`(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)(l)\071`, nil, `abcdefghijkl9`, []Match{[]Group{{0, 13}, {0, 1}, {1, 2}, {2, 3}, {3, 4}, {4, 5}, {5, 6}, {6, 7}, {7, 8}, {8, 9}, {9, 10}, {10, 11}, {11, 12}}}},
}
// TestFindAllMatches checks, for every entry in reTests, that the zero-group
// (whole-match) indices returned by FindAllMatches equal the expected ones.
//
// A nil 'result' marks a pattern that Compile is expected to reject: a compile
// error is tolerated only for such entries, and a successful compile of such
// an entry is reported as a failure.
func TestFindAllMatches(t *testing.T) {
	for _, test := range reTests {
		t.Run(test.re+" "+test.str, func(t *testing.T) {
			regComp, err := Compile(test.re, test.flags...)
			if err != nil {
				if test.result != nil {
					// Fail only this subtest; a panic would abort the whole run.
					// The error is wrapped via fmt so the file's fmt import stays in use.
					t.Fatal(fmt.Errorf("compiling %q: %w", test.re, err))
				}
				return
			}
			if test.result == nil {
				t.Fatalf("expected %q to fail compilation, but it compiled", test.re)
			}

			matchIndices := FindAllMatches(regComp, test.str)
			zeroGroups := make([]Group, len(matchIndices))
			for i, m := range matchIndices {
				zeroGroups[i] = m[0]
			}
			if !slices.Equal(test.result, zeroGroups) {
				t.Errorf("Wanted %v Got %v\n", test.result, zeroGroups)
			}
		})
	}
}
// TestFindString checks, for every entry in reTests, that FindString returns
// the text of the first expected match (or the empty string when no match is
// expected).
//
// The indices stored in reTests count runes, not bytes (see the Unicode
// entries), so the expected text is cut from the rune form of the input.
func TestFindString(t *testing.T) {
	for _, test := range reTests {
		t.Run(test.re+" "+test.str, func(t *testing.T) {
			regComp, err := Compile(test.re, test.flags...)
			if err != nil {
				if test.result != nil {
					// Fail only this subtest; a panic would abort the whole run.
					t.Fatalf("compiling %q: %v", test.re, err)
				}
				return
			}

			foundString := FindString(regComp, test.str)
			if len(test.result) == 0 {
				if foundString != "" {
					t.Errorf("Expected no match got %v\n", foundString)
				}
				return
			}

			first := test.result[0]
			// Slice by rune: byte offsets would cut multi-byte characters apart.
			expectedString := string([]rune(test.str)[first.startIdx:first.endIdx])
			if foundString != expectedString {
				t.Errorf("Wanted %v Got %v\n", expectedString, foundString)
			}
		})
	}
}
// TestFindAllGroups checks, for every entry in groupTests, that each valid
// group of each match returned by FindAllMatches equals the expected group at
// the same position. Groups that report !isValid() are not compared.
func TestFindAllGroups(t *testing.T) {
	for _, test := range groupTests {
		t.Run(test.re+" "+test.str, func(t *testing.T) {
			regComp, err := Compile(test.re, test.flags...)
			if err != nil {
				if test.result != nil {
					// Fail only this subtest; a panic would abort the whole run.
					t.Fatalf("compiling %q: %v", test.re, err)
				}
				// Compilation failed as tolerated; there is nothing to match with.
				return
			}

			matchIndices := FindAllMatches(regComp, test.str)
			for i := range matchIndices {
				for j := range matchIndices[i] {
					if !matchIndices[i][j].isValid() {
						continue
					}
					// Report an unexpected extra match/group instead of
					// panicking on an out-of-range index.
					if i >= len(test.result) || j >= len(test.result[i]) {
						t.Fatalf("Unexpected group %d in match %d: Wanted %v Got %v\n", j, i, test.result, matchIndices)
					}
					if test.result[i][j] != matchIndices[i][j] {
						t.Errorf("Wanted %v Got %v\n", test.result, matchIndices)
					}
				}
			}
		})
	}
}

File diff suppressed because it is too large Load Diff

View File

@@ -1,162 +0,0 @@
/*
Package regex implements regular expression search, using a custom non-backtracking engine with support for lookarounds and numeric ranges.
The engine relies completely on UTF-8 codepoints. As such, it is capable of matching characters
from other languages, emojis and symbols.
The API and regex syntax are largely compatible with that of the stdlib's [regexp], with a few key differences (see 'Key Differences with regexp').
The full syntax is specified below.
# Syntax
Single characters:
. Match any character. Newline matching is dependent on the RE_SINGLE_LINE flag.
[abc] Character class - match a, b or c
[a-z] Character range - match any character from a to z
[^abc] Negated character class - match any character except a, b and c
[^a-z] Negated character range - do not match any character from a to z
\[ Match a literal '['. Backslashes can escape any character with special meaning, including another backslash.
\452 Match the character with the octal value 452 (up to 3 digits)
\xFF Match the character with the hex value FF (exactly 2 characters)
\x{0000FF} Match the character with the hex value 0000FF (exactly 6 characters)
\n Newline
\a Bell character
\f Form-feed character
\r Carriage return
\t Horizontal tab
\v Vertical tab
Perl classes:
\d Match any digit character ([0-9])
\D Match any non-digit character ([^0-9])
\w Match any word character ([a-zA-Z0-9_])
\W Match any non-word character ([^a-zA-Z0-9_])
\s Match any whitespace character ([ \t\n])
\S Match any non-whitespace character ([^ \t\n])
POSIX classes (inside normal character classes):
[:digit:] All digit characters ([0-9])
[:upper:] All upper-case letters ([A-Z])
[:lower:] All lower-case letters ([a-z])
[:alpha:] All letters ([a-zA-Z])
[:alnum:] All alphanumeric characters ([a-zA-Z0-9])
[:xdigit:] All hexadecimal characters ([a-fA-F0-9])
[:blank:] All blank characters ([ \t])
[:space:] All whitespace characters ([ \t\n\r\f\v])
[:cntrl:] All control characters ([\x00-\x1F\x7F])
[:punct:] All punctuation characters
[:graph:] All graphical characters ([\x21-\x7E])
[:print:] All graphical characters + space ([\x20-\x7E])
[:word:] All word characters (\w)
[:ascii:] All ASCII values ([\x00-\x7F])
Composition:
def Match d, followed by e, followed by f
x|y Match x or y (prefer x)
xy|z Match xy or z (prefer xy)
Repetition (always greedy, preferring more):
x* Match x zero or more times
x+ Match x one or more times
x? Match x zero or one time
x{m,n} Match x between m and n times (inclusive)
x{m,} Match x at least m times
x{,n} Match x between 0 and n times (inclusive)
x{m} Match x exactly m times
Grouping:
(expr) Create a capturing group. The contents of the group can be retrieved with [Reg.FindAllSubmatch]
x(y|z) Match x followed by y or z. Given a successful match, the contents of group 1 will include either y or z
(?:expr) Create a non-capturing group. The contents of the group aren't saved.
x(?:y|z) Match x followed by y or z. No groups are created.
Assertions:
^ Match at the start of the input string. If RE_MULTILINE is enabled, it also matches at the start of every line.
$ Match at the end of the input string. If RE_MULTILINE is enabled, it also matches at the end of every line.
\A Always match at the start of the string, regardless of RE_MULTILINE
\z Always match at the end of the string, regardless of RE_MULTILINE
\b Match at a word boundary (a word character followed by a non-word character, or vice-versa)
\B Match at a non-word boundary (a word character followed by a word character, or a non-word character followed by a non-word character)
Lookarounds:
x(?=y) Positive lookahead - Match x if followed by y
x(?!y) Negative lookahead - Match x if NOT followed by y
(?<=x)y Positive lookbehind - Match y if preceded by x
(?<!x)y Negative lookbehind - Match y if NOT preceded by x
Numeric ranges:
<x-y> Match any number from x to y (inclusive) (x and y must be positive numbers)
\<x Match a literal '<' followed by x
# Key Differences with regexp
The engine and the API differ from [regexp] in a few ways, some of them very subtle.
The key differences are mentioned below.
1. Greediness:
This engine currently does not support non-greedy operators.
2. Byte-slices and runes:
My engine does not support byte-slices. When a matching function receives a string, it converts it into a
rune-slice to iterate through it. While this has some space overhead, the convenience of built-in unicode
support made the tradeoff worth it.
3. Return values
Rather than using primitives for return values, my engine defines two types that are used as return
values: a [Group] represents a capturing group, and a [Match] represents a list of groups.
[regexp] specifies a regular expression that gives a list of all the matching functions that it supports. The
equivalent expression for this engine is shown below. Note that 'Index' is the default.
Find(All)?(String)?(Submatch)?
[Reg.Find] returns the index of the leftmost match in the string.
If a function contains 'All' it returns all matches instead of just the leftmost one.
If a function contains 'String' it returns the matched text, rather than the index in the string.
If a function contains 'Submatch' it returns the match, including all submatches found by
capturing groups.
The term '0-group' is used to refer to the 0th capturing group of a match (which is the entire match).
Given the following regex:
x(y)
and the input string:
xyz
The 0th group would contain 'xy' and the 1st group would contain 'y'. Any matching function without 'Submatch' in its name
returns the 0-group.
# Feature Differences
The following features from [regexp] are (currently) NOT supported:
1. Named capturing groups
2. Non-greedy operators
3. Unicode character classes
4. Embedded flags (flags are instead passed as arguments to [Compile])
5. Literal text with \Q ... \E
The following features are not available in [regexp], but are supported in my engine:
1. Lookarounds
2. Numeric ranges
I hope to shorten the first list, and expand the second.
*/
package regex

View File

@@ -1,181 +0,0 @@
package regex_test
import (
"fmt"
"strings"
"gitea.twomorecents.org/Rockingcool/kleingrep/regex"
)
func ExampleReg_Find() {
	// With an alternation, the leftmost match in "banana" is the leading 'b'.
	re := regex.MustCompile("b|a")
	zeroGroup, _ := re.Find("banana")
	fmt.Println(zeroGroup.String())
	// Output: 0 1
}
func ExampleReg_FindAll() {
	re := regex.MustCompile("b|a")
	// Print the indices of every match in the input, one match per line.
	for _, zeroGroup := range re.FindAll("banana") {
		fmt.Println(zeroGroup.String())
	}
	// Output: 0 1
	// 1 2
	// 3 4
	// 5 6
}
func ExampleReg_FindString() {
	// The lookahead requires "sheep" to follow, but is not part of the match.
	re := regex.MustCompile(`\w+\s+(?=sheep)`)
	fmt.Println(re.FindString("pink cows and yellow sheep"))
	// Output: yellow
}
func ExampleReg_FindSubmatch() {
	re := regex.MustCompile(`(\d)\.(\d)(\d)`)
	groups, _ := re.FindSubmatch("3.14")
	// Print the 0-group (the entire match), then capturing groups 1 and 2.
	for _, g := range groups[:3] {
		fmt.Println(g)
	}
	// Output: 0 4
	// 0 1
	// 2 3
}
func ExampleReg_FindStringSubmatch() {
	// Three capturing groups: a 4-digit, a 2-digit and another 2-digit number.
	re := regex.MustCompile(`(\d{4})-(\d{2})-(\d{2})`)
	parts := re.FindStringSubmatch(`The date is 2025-02-10`)
	fmt.Println(parts[1])
	fmt.Println(parts[3])
	// Output: 2025
	// 10
}
func ExampleReg_FindAllSubmatch() {
	re := regex.MustCompile(`(\d)\.(\d)(\d)`)
	all := re.FindAllSubmatch("3.14+8.97")
	first, second := all[0], all[1]
	fmt.Println(first[0])  // 0-group (entire match) of the 1st match
	fmt.Println(first[1])  // 1st group of the 1st match
	fmt.Println(second[0]) // 0-group of the 2nd match
	fmt.Println(second[1]) // 1st group of the 2nd match
	// Output: 0 4
	// 0 1
	// 5 9
	// 5 6
}
func ExampleReg_FindAllString() {
	// Four numeric ranges, separated by literal periods.
	re := regex.MustCompile(`<0-255>\.<0-255>\.<0-255>\.<0-255>`)
	addrs := re.FindAllString(`192.168.220.7 pings 9.9.9.9`)
	fmt.Println(addrs[0])
	fmt.Println(addrs[1])
	// Output: 192.168.220.7
	// 9.9.9.9
}
func ExampleReg_FindAllStringSubmatch() {
	// The expression reads as:
	//   the literal 'https://' ...
	//   group 1: one or more letters, digits or periods ...
	//   a forward slash ...
	//   group 2: one or more of: word character, question mark, period, equals sign
	re := regex.MustCompile(`https://([a-z0-9\.]+)/([\w.?=]+)`, regex.RE_CASE_INSENSITIVE)
	text := `You can find me at https://twomorecents.org/index.html and https://news.ycombinator.com/user?id=aadhavans`
	// Print groups 1 and 2 of the first two matches.
	for _, groups := range re.FindAllStringSubmatch(text)[:2] {
		fmt.Println(groups[1])
		fmt.Println(groups[2])
	}
	// Output: twomorecents.org
	// index.html
	// news.ycombinator.com
	// user?id=aadhavans
}
func ExampleReg_Expand() {
	src := `option1: value1
option2: value2`
	re := regex.MustCompile(`(\w+): (\w+)`, regex.RE_MULTILINE)
	const tmpl = "$1 = $2\n"
	// Expand appends to its first argument, so feed the running output back in.
	out := ""
	for _, m := range re.FindAllSubmatch(src) {
		out = re.Expand(out, tmpl, src, m)
	}
	fmt.Println(out)
	// Output: option1 = value1
	// option2 = value2
}
func ExampleReg_LiteralPrefix() {
	re := regex.MustCompile(`a(b|c)d*`)
	// Only the leading 'a' is reported; the prefix does not cover the whole expression.
	literal, whole := re.LiteralPrefix()
	fmt.Println(literal)
	fmt.Println(whole)
	// Output: a
	// false
}
func ExampleReg_Longest() {
	const input = "xx"
	re := regex.MustCompile(`x|xx`)
	// By default the first alternative is preferred.
	fmt.Println(re.FindString(input))
	// After calling Longest, the longer alternative is reported instead.
	re.Longest()
	fmt.Println(re.FindString(input))
	// Output: x
	// xx
}
func ExampleReg_ReplaceAll() {
	re := regex.MustCompile(`(\d)(\w)`)
	// Swap the two captured characters in every match.
	swapped := re.ReplaceAll("5d9t", `$2$1`)
	fmt.Println(swapped)
	// Output: d5t9
}
func ExampleReg_ReplaceAllLiteral() {
	re := regex.MustCompile(`fox|dog`)
	// Every match is replaced by the literal text, with no '$' expansion.
	replaced := re.ReplaceAllLiteral("the quick brown fox jumped over the lazy dog", `duck`)
	fmt.Println(replaced)
	// Output: the quick brown duck jumped over the lazy duck
}
func ExampleReg_ReplaceAllFunc() {
	// Match runs of five or more word characters and upper-case each of them.
	re := regex.MustCompile(`\w{5,}`)
	sentence := `all five or more letter words in this string are capitalized`
	fmt.Println(re.ReplaceAllFunc(sentence, strings.ToUpper))
	// Output: all five or more LETTER WORDS in this STRING are CAPITALIZED
}

View File

@@ -1,460 +0,0 @@
package regex
import (
"fmt"
"strconv"
"unicode"
)
// A Match represents a match found by the regex in a given string.
// It is represented as a list of groups, where the nth element contains
// the indices of the nth capturing group. Note that a group may not be valid
// (see [Group.IsValid]). The element at index 0 is known
// as the 0-group, and represents the entire match.
//
// See [Reg.FindSubmatch] for an example.
type Match []Group
// A Group represents a capturing group. It contains the start and end index of the group.
// The indices refer to positions in the input after it has been converted to a
// rune slice. A freshly created group has both indices set to -1, which marks it as
// invalid (see [Group.IsValid]).
type Group struct {
	StartIdx int // Index at which the group starts
	EndIdx   int // Index just past the last character of the group
}
// newMatch builds a Match holding 'size' groups. Every group starts out
// invalid, ie. with both of its indices set to -1.
func newMatch(size int) Match {
	groups := make(Match, 0, size)
	for len(groups) < size {
		groups = append(groups, Group{StartIdx: -1, EndIdx: -1})
	}
	return groups
}
// String lists the indices of every valid group in the match. Each valid group
// contributes a "Group n" header line followed by a line with its indices;
// invalid groups are left out.
func (m Match) String() string {
	out := ""
	for num, grp := range m {
		if !grp.IsValid() {
			continue
		}
		out += fmt.Sprintf("Group %d\n", num) + grp.String() + "\n"
	}
	return out
}
// String renders the group as its start and end index, separated by a tab.
func (g Group) String() string {
	start, end := g.StartIdx, g.EndIdx
	return fmt.Sprintf("%d\t%d", start, end)
}
// IsValid reports whether the group is valid (ie. whether it matched any text).
// A group is valid when neither of its indices is negative.
func (g Group) IsValid() bool {
	return !(g.StartIdx < 0 || g.EndIdx < 0)
}
// getZeroGroup returns the 0-group (the entire match) of m. Having this as a
// named function makes it easy to map over a list of matches.
func getZeroGroup(m Match) Group {
	whole := m[0]
	return whole
}
// copyThread gives 'to' its own copy of the capturing-group indices held by 'from'.
// The copy never shares a backing array with the original.
func copyThread(to *nfaState, from nfaState) {
	groups := make([]Group, len(from.threadGroups))
	copy(groups, from.threadGroups)
	to.threadGroups = groups
}
// Find returns the 0-group of the leftmost match of the regex in the given string.
// A non-nil error indicates that no match was found.
func (re Reg) Find(str string) (Group, error) {
	first, err := re.FindNthMatch(str, 1)
	if err == nil {
		return getZeroGroup(first), nil
	}
	return Group{}, fmt.Errorf("no matches found")
}
// Match reports whether the regex finds a match anywhere in the given string.
func (re Reg) Match(str string) bool {
	if _, err := re.Find(str); err != nil {
		return false
	}
	return true
}
// CompileMatch compiles expr and reports whether str contains a match of the expression.
// It is equivalent to [regexp.Match].
// An optional list of flags may be provided (see [ReFlag]).
// If the expression fails to compile, the result is false and the compilation
// error is returned.
func CompileMatch(expr string, str string, flags ...ReFlag) (bool, error) {
	compiled, compileErr := Compile(expr, flags...)
	if compileErr == nil {
		return compiled.Match(str), nil
	}
	return false, compileErr
}
// FindAll returns a slice containing the 0-group of every match of the regex in the
// given string. A 0-group represents the match without any submatches.
func (re Reg) FindAll(str string) []Group {
	return funcMap(re.FindAllSubmatch(str), getZeroGroup)
}
// FindString returns the text of the leftmost match of the regex in the given string.
// The return value will be an empty string in two situations:
//  1. No match was found
//  2. The match was an empty string
func (re Reg) FindString(str string) string {
	match, err := re.FindNthMatch(str, 1)
	if err != nil {
		return ""
	}
	zeroGroup := getZeroGroup(match)
	// Group indices count runes (the matcher works on a rune slice), so the text
	// must be cut from the rune view of the input rather than from its bytes.
	return string([]rune(str)[zeroGroup.StartIdx:zeroGroup.EndIdx])
}
// FindSubmatch returns the leftmost match of the regex in the given string, including
// the submatches matched by capturing groups. The returned [Match] will always contain the same
// number of groups. The validity of a group (whether or not it matched anything) can be determined with
// [Group.IsValid], or by checking that both indices of the group are >= 0.
// The returned error is non-nil if no match was found; in that case the returned
// Match is empty.
func (re Reg) FindSubmatch(str string) (Match, error) {
	match, err := re.FindNthMatch(str, 1)
	if err != nil {
		return Match{}, fmt.Errorf("no match found")
	}
	return match, nil
}
// FindStringSubmatch is the 'string' version of [Reg.FindSubmatch]. It returns a slice of strings,
// where the string at index i contains the text matched by the i-th capturing group.
// The 0-th index represents the entire match.
// An empty string at index n could mean:
//
//  1. Group n did not find a match
//  2. Group n found a zero-length match
//
// A return value of nil indicates no match.
func (re Reg) FindStringSubmatch(str string) []string {
	match, err := re.FindSubmatch(str)
	if err != nil {
		return nil
	}
	// Group indices count runes, not bytes, so slice the rune view of the input.
	strRunes := []rune(str)
	matchStr := make([]string, re.numGroups+1)
	validGroupFound := false
	for i := range match {
		// Invalid groups keep the zero value, ie. the empty string.
		if match[i].IsValid() {
			matchStr[i] = string(strRunes[match[i].StartIdx:match[i].EndIdx])
			validGroupFound = true
		}
	}
	if !validGroupFound {
		return nil
	}
	return matchStr
}
// FindAllString is the 'all' version of [Reg.FindString].
// It returns a slice of strings containing the text of all matches of
// the regex in the given string.
func (re Reg) FindAllString(str string) []string {
	zerogroups := re.FindAll(str)
	// Group indices count runes, not bytes, so slice the rune view of the input.
	strRunes := []rune(str)
	return funcMap(zerogroups, func(g Group) string {
		return string(strRunes[g.StartIdx:g.EndIdx])
	})
}
// FindNthMatch returns the 'n'th match of the regex in the given string, where the
// leftmost match is match number 1.
// It returns a non-nil error if n is less than 1, or if there are fewer than 'n'
// matches in the string.
func (re Reg) FindNthMatch(str string, n int) (Match, error) {
	if n < 1 {
		// Matches are counted from 1. Without this guard, a failed first attempt
		// would compare equal to n == 0 and be reported as a successful match.
		return nil, fmt.Errorf("invalid match index - must be at least 1")
	}
	strRunes := []rune(str)
	matchNum := 0
	for idx := 0; idx <= len(strRunes); {
		var matchFound bool
		var match Match
		// The helper also reports where the next search should resume.
		matchFound, match, idx = findAllSubmatchHelper(re.start, strRunes, idx, re.numGroups, re.preferLongest)
		if !matchFound {
			continue
		}
		matchNum++
		if matchNum == n {
			return match, nil
		}
	}
	// We haven't found the nth match after scanning the string - Return an error
	return nil, fmt.Errorf("invalid match index - too few matches found")
}
// FindAllSubmatch returns a slice with every match of the regex in the given string,
// in the order in which they were found. The slice is empty (but not nil) if there
// are no matches.
func (re Reg) FindAllSubmatch(str string) []Match {
	runes := []rune(str)
	found := []Match{}
	for pos := 0; pos <= len(runes); {
		ok, m, next := findAllSubmatchHelper(re.start, runes, pos, re.numGroups, re.preferLongest)
		if ok {
			found = append(found, m)
		}
		// Resume the search where the helper tells us to.
		pos = next
	}
	return found
}
// FindAllStringSubmatch returns a double-slice of strings. Each inner slice contains the text
// of one match, including all submatches; a group that did not match is represented
// by the empty string.
// A return value of nil indicates no match.
func (re Reg) FindAllStringSubmatch(str string) [][]string {
	matches := re.FindAllSubmatch(str)
	if len(matches) == 0 {
		return nil
	}
	// Group indices count runes, not bytes, so slice the rune view of the input.
	strRunes := []rune(str)
	return funcMap(matches, func(m Match) []string {
		return funcMap(m, func(g Group) string {
			if g.IsValid() {
				return string(strRunes[g.StartIdx:g.EndIdx])
			}
			return ""
		})
	})
}
// addStateToList adds 'state' to 'list' and returns the updated list. States that
// do not consume a character (Kleene, question, alternation, assertion and group
// states) are not added themselves; instead, the states they lead to are added
// recursively.
//
// str and idx are the input and the current index into it. threadGroups holds the
// capturing-group indices of the thread that reached this state. visited records the
// states already expanded along the current chain of recursive calls.
func addStateToList(str []rune, idx int, list []nfaState, state nfaState, threadGroups []Group, visited []nfaState, preferLongest bool) []nfaState {
	// Nothing to do if the state is already in the list, or was already expanded.
	if stateExists(list, state) || stateExists(visited, state) {
		return list
	}
	visited = append(visited, state)

	// Kleene and question states: follow splitState first, then next.
	if state.isKleene || state.isQuestion {
		copyThread(state.splitState, state)
		list = addStateToList(str, idx, list, *state.splitState, threadGroups, visited, preferLongest)
		copyThread(state.next, state)
		list = addStateToList(str, idx, list, *state.next, threadGroups, visited, preferLongest)
		return list
	}
	// Alternation states: follow next first, then splitState.
	if state.isAlternation {
		copyThread(state.next, state)
		list = addStateToList(str, idx, list, *state.next, threadGroups, visited, preferLongest)
		copyThread(state.splitState, state)
		list = addStateToList(str, idx, list, *state.splitState, threadGroups, visited, preferLongest)
		return list
	}
	// From here on the state works on its own copy of the thread's groups.
	state.threadGroups = append([]Group{}, threadGroups...)
	if state.assert != noneAssert {
		// If the assertion holds, move straight on to the next state.
		// If it fails, execution falls through to the checks below.
		if state.checkAssertion(str, idx, preferLongest) {
			copyThread(state.next, state)
			return addStateToList(str, idx, list, *state.next, state.threadGroups, visited, preferLongest)
		}
	}
	// Group boundaries record the current index in this thread's groups, then move on.
	if state.groupBegin {
		state.threadGroups[state.groupNum].StartIdx = idx
		return addStateToList(str, idx, list, *state.next, state.threadGroups, visited, preferLongest)
	}
	if state.groupEnd {
		state.threadGroups[state.groupNum].EndIdx = idx
		return addStateToList(str, idx, list, *state.next, state.threadGroups, visited, preferLongest)
	}
	return append(list, state)
}
// findAllSubmatchHelper is the helper behind FindNthMatch and FindAllSubmatch. It runs
// the NFA beginning at 'start' against str, starting at index 'offset'.
// It returns whether it found a match, the Match it found, and how far it got
// into the string ie. where the next search should start from.
func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups int, preferLongest bool) (bool, Match, int) {
	// Base case - exit if offset exceeds string's length
	if offset > len(str) {
		// The second value here shouldn't be used, because we should exit when the third return value is > than len(str)
		return false, []Group{}, offset
	}
	resetThreads(start)
	currentStates := make([]nfaState, 0)
	nextStates := make([]nfaState, 0)
	i := offset // Index in string
	// If the first state is an assertion, makes sure the assertion
	// is true before we do _anything_ else.
	if start.assert != noneAssert {
		if start.checkAssertion(str, offset, preferLongest) == false {
			i++
			return false, []Group{}, i
		}
	}
	// Seed the simulation: the 0-group of the initial thread starts at the offset.
	start.threadGroups = newMatch(numGroups + 1)
	start.threadGroups[0].StartIdx = i
	currentStates = addStateToList(str, i, currentStates, *start, start.threadGroups, nil, preferLongest)
	var match Match = nil
	// Advance through the input one index at a time, until no states are left.
	for idx := i; idx <= len(str); idx++ {
		if len(currentStates) == 0 {
			break
		}
		for currentStateIdx := 0; currentStateIdx < len(currentStates); currentStateIdx++ {
			currentState := currentStates[currentStateIdx]
			if currentState.threadGroups == nil {
				currentState.threadGroups = newMatch(numGroups + 1)
				currentState.threadGroups[0].StartIdx = idx
			}
			if currentState.isLast {
				// Accept state: close the 0-group and remember a copy of this thread's groups.
				currentState.threadGroups[0].EndIdx = idx
				match = append([]Group{}, currentState.threadGroups...)
				if !preferLongest {
					// Skip the states that come after this one in the list.
					break
				}
			} else if !currentState.isAlternation && !currentState.isKleene && !currentState.isQuestion && !currentState.groupBegin && !currentState.groupEnd && currentState.assert == noneAssert { // Normal character
				if currentState.contentContains(str, idx, preferLongest) {
					nextStates = addStateToList(str, idx+1, nextStates, *currentState.next, currentState.threadGroups, nil, preferLongest)
				}
			}
		}
		currentStates = append([]nfaState{}, nextStates...)
		nextStates = nil
	}
	if match != nil {
		// A zero-length match at the offset must still advance the search by one,
		// otherwise the caller would be told to search from the same index again.
		if offset == match[0].EndIdx {
			return true, match, match[0].EndIdx + 1
		}
		return true, match, match[0].EndIdx
	}
	return false, []Group{}, i + 1
}
// Expand appends template to dst, expanding any variables in template to the relevant capturing group.
//
// A variable is of the form '$n', where 'n' is a number. It will be replaced by the contents of the n-th capturing group.
// A group that did not take part in the match expands to the empty string. If 'n' is not a usable group
// number (it is too large, or cannot be parsed), the variable is copied to the output unchanged.
// To insert a literal $, do not put a number after it. Alternatively, you can use $$.
// src is the input string, and match must be the result of [Reg.FindSubmatch].
func (re Reg) Expand(dst string, template string, src string, match Match) string {
	templateRunes := []rune(template)
	srcRunes := []rune(src) // Group indices count runes, so index the rune view of src
	out := []byte(dst)
	i := 0
	for i < len(templateRunes) {
		c := templateRunes[i]
		if c != '$' {
			out = append(out, string(c)...)
			i++
			continue
		}
		i++ // Step past the dollar sign
		// The dollar sign is the last character of the template, or it is followed by another dollar sign
		if i >= len(templateRunes) || templateRunes[i] == '$' {
			out = append(out, '$')
			i++
			continue
		}
		numStart := i
		for i < len(templateRunes) && unicode.IsDigit(templateRunes[i]) {
			i++
		}
		numStr := string(templateRunes[numStart:i])
		if numStr == "" {
			// Not a variable - keep the dollar sign. The character after it is
			// handled by the next iteration.
			out = append(out, '$')
			continue
		}
		num, err := strconv.Atoi(numStr)
		if err != nil || num >= len(match) {
			// unicode.IsDigit also accepts digits that Atoi cannot parse, and very long
			// numbers overflow. Neither names a group, so the text is kept as it is.
			out = append(out, '$')
			out = append(out, numStr...)
			continue
		}
		// Groups that did not take part in the match have negative indices;
		// slicing with those would panic, so they expand to nothing.
		if g := match[num]; g.StartIdx >= 0 && g.EndIdx >= 0 {
			out = append(out, string(srcRunes[g.StartIdx:g.EndIdx])...)
		}
	}
	return string(out)
}
// LiteralPrefix returns a string that must begin any match of the given regular expression.
// The second return value is true if the string comprises the entire expression.
func (re Reg) LiteralPrefix() (prefix string, complete bool) {
	cur := re.start
	// A leading assertion is stepped over.
	if cur.assert != noneAssert {
		cur = cur.next
	}
	for {
		// Stop at the accept state, at an alternation, at an assertion, or at any
		// state that doesn't hold exactly one character.
		if cur.isLast || cur.isAlternation || len(cur.content) != 1 || cur.assert != noneAssert {
			break
		}
		// Group boundaries are passed over without contributing to the prefix.
		if !cur.groupBegin && !cur.groupEnd {
			prefix += string(rune(cur.content[0]))
		}
		cur = cur.next
	}
	return prefix, cur.isLast
}
// ReplaceAll replaces all matches of the expression in src, with the text in repl. In repl, variables are interpreted
// as they are in [Reg.Expand]. The resulting string is returned.
func (re Reg) ReplaceAll(src string, repl string) string {
	matches := re.FindAllSubmatch(src)
	// Match indices count runes, so src has to be walked rune by rune. Walking it
	// byte by byte misplaces replacements and mangles non-ASCII text.
	srcRunes := []rune(src)
	dst := make([]byte, 0, len(src))
	currentMatch := 0
	for i := 0; i < len(srcRunes); {
		if currentMatch < len(matches) && matches[currentMatch][0].IsValid() && i == matches[currentMatch][0].StartIdx {
			dst = append(dst, re.Expand("", repl, src, matches[currentMatch])...)
			// Jump past the matched text. A zero-length match leaves i where it is,
			// so the character at i is copied by the next iteration.
			i = matches[currentMatch][0].EndIdx
			currentMatch++
			continue
		}
		dst = append(dst, string(srcRunes[i])...)
		i++
	}
	// NOTE(review): as before, a zero-length match at the very end of src is not replaced.
	return string(dst)
}
// ReplaceAllLiteral replaces all matches of the expression in src, with the text in repl. The text is replaced directly,
// without any expansion.
func (re Reg) ReplaceAllLiteral(src string, repl string) string {
	zerogroups := re.FindAll(src)
	// Match indices count runes, so src has to be walked rune by rune. Walking it
	// byte by byte misplaces replacements and mangles non-ASCII text.
	srcRunes := []rune(src)
	dst := make([]byte, 0, len(src))
	currentMatch := 0
	for i := 0; i < len(srcRunes); {
		if currentMatch < len(zerogroups) && i == zerogroups[currentMatch].StartIdx {
			dst = append(dst, repl...)
			// Jump past the matched text. A zero-length match leaves i where it is,
			// so the character at i is copied by the next iteration.
			i = zerogroups[currentMatch].EndIdx
			currentMatch++
			continue
		}
		dst = append(dst, string(srcRunes[i])...)
		i++
	}
	// NOTE(review): as before, a zero-length match at the very end of src is not replaced.
	return string(dst)
}
// ReplaceAllFunc replaces every match of the expression in src, with the return value of the function replFunc.
// replFunc takes in the matched string. The return value is substituted in directly without expansion.
func (re Reg) ReplaceAllFunc(src string, replFunc func(string) string) string {
	zerogroups := re.FindAll(src)
	// Match indices count runes, so src has to be walked (and the matched text
	// cut out) by rune. Doing it by byte mangles non-ASCII text.
	srcRunes := []rune(src)
	dst := make([]byte, 0, len(src))
	currentMatch := 0
	for i := 0; i < len(srcRunes); {
		if currentMatch < len(zerogroups) && i == zerogroups[currentMatch].StartIdx {
			g := zerogroups[currentMatch]
			dst = append(dst, replFunc(string(srcRunes[g.StartIdx:g.EndIdx]))...)
			// Jump past the matched text. A zero-length match leaves i where it is,
			// so the character at i is copied by the next iteration.
			i = g.EndIdx
			currentMatch++
			continue
		}
		dst = append(dst, string(srcRunes[i])...)
		i++
	}
	// NOTE(review): as before, a zero-length match at the very end of src is not replaced.
	return string(dst)
}

View File

@@ -1,441 +0,0 @@
package regex
import (
"fmt"
"slices"
)
// epsilon is a reserved content value.
// NOTE(review): presumably it labels empty (epsilon) transitions; its users are
// not visible in this part of the file - confirm.
const epsilon int = 0xF0000

// assertType identifies the kind of assertion that a state makes (if any).
type assertType int

const (
	noneAssert       assertType = iota // The state doesn't assert anything
	sosAssert                          // Start of string (^) - also start of line in multiline mode
	soiAssert                          // Start of input (\A)
	eosAssert                          // End of string ($) - also end of line in multiline mode
	eoiAssert                          // End of input (\z)
	wboundAssert                       // Word boundary (\b)
	nonwboundAssert                    // Non-word boundary (\B)
	plaAssert                          // Positive lookahead
	nlaAssert                          // Negative lookahead
	plbAssert                          // Positive lookbehind
	nlbAssert                          // Negative lookbehind
	alwaysTrueAssert                   // An assertion that is always true
)
// nfaState is a single state (node) of the NFA that a regex is compiled into.
type nfaState struct {
	content stateContents // Contents of current state
	isEmpty bool // If it is empty - Union operator and Kleene star states will be empty
	isLast bool // If it is the last state (accept state)
	output []*nfaState // The outputs of the current state ie. the 'outward arrows'. A union operator state will have more than one of these.
	// transitions map[int][]*nfaState // Transitions to different states (maps a character (int representation) to a _list of states. This is useful if one character can lead multiple states eg. ab|aa)
	next *nfaState // The next state (for alternation, kleene and question states, this is one of two branches)
	isKleene bool // Identifies whether current node is a 0-state representing Kleene star
	isQuestion bool // Identifies whether current node is a 0-state representing the question operator
	isAlternation bool // Identifies whether current node is a 0-state representing an alternation
	splitState *nfaState // For alternation, kleene and question states - the 'other' branch ('next' is the first)
	assert assertType // Type of assertion of current node - noneAssert means that the node doesn't assert anything
	allChars bool // Whether or not the state represents all characters (eg. a 'dot' metacharacter). A 'dot' node doesn't store any contents directly, as it would take up too much space
	except []rune // Only valid if allChars is true - match all characters _except_ the ones in this block. Useful for inverting character classes.
	lookaroundRegex string // Only for lookaround states - Contents of the regex that the lookaround state holds
	lookaroundNFA *nfaState // Holds the NFA of the lookaroundRegex - if it exists
	lookaroundNumCaptureGroups int // Number of capturing groups in lookaround regex if current node is a lookaround
	groupBegin bool // Whether or not the node starts a capturing group
	groupEnd bool // Whether or not the node ends a capturing group
	groupNum int // Which capturing group the node starts / ends
	// The following properties depend on the current match - I should think about resetting them for every match.
	zeroMatchFound bool // Whether or not the state has been used for a zero-length match - only relevant for zero states
	threadGroups []Group // Assuming that a state is part of a 'thread' in the matching process, this array stores the indices of capturing groups in the current thread. As matches are found for this state, its groups will be copied over.
}
// cloneState returns a copy of the NFA that starts at the given state.
func cloneState(start *nfaState) *nfaState {
	// Maps every original state to its copy, so that each state is copied once.
	copies := map[*nfaState]*nfaState{}
	return cloneStateHelper(start, copies)
}
// cloneStateHelper is the recursive helper for cloneState. cloneMap maps every state
// that has already been copied to its copy; this keeps shared states shared in the
// clone, and stops the recursion when the NFA contains cycles.
// It returns the copy of stateToClone, or nil if stateToClone is nil.
// The thread-specific field threadGroups is not carried over to the copy.
// This function was created using output from Llama3.1:405B.
func cloneStateHelper(stateToClone *nfaState, cloneMap map[*nfaState]*nfaState) *nfaState {
	if stateToClone == nil {
		return nil
	}
	// Base case - if the clone exists in our map, return it.
	if clone, exists := cloneMap[stateToClone]; exists {
		return clone
	}
	// Recursive case - if the clone doesn't exist, create it, add it to the map,
	// and recursively call for each of the transition states.
	clone := &nfaState{
		content:         append([]int{}, stateToClone.content...),
		isEmpty:         stateToClone.isEmpty,
		isLast:          stateToClone.isLast,
		output:          make([]*nfaState, len(stateToClone.output)),
		isKleene:        stateToClone.isKleene,
		isQuestion:      stateToClone.isQuestion,
		isAlternation:   stateToClone.isAlternation,
		assert:          stateToClone.assert,
		zeroMatchFound:  stateToClone.zeroMatchFound,
		allChars:        stateToClone.allChars,
		except:          append([]rune{}, stateToClone.except...),
		lookaroundRegex: stateToClone.lookaroundRegex,
		// The group count must travel with the lookaround NFA, as it determines how
		// many groups are allocated when the lookaround is evaluated.
		lookaroundNumCaptureGroups: stateToClone.lookaroundNumCaptureGroups,
		groupEnd:                   stateToClone.groupEnd,
		groupBegin:                 stateToClone.groupBegin,
		groupNum:                   stateToClone.groupNum,
	}
	// Register the clone _before_ recursing. A state that points back to itself (or
	// to one of its ancestors) then resolves to the existing clone through the map.
	cloneMap[stateToClone] = clone
	for i, s := range stateToClone.output {
		clone.output[i] = cloneStateHelper(s, cloneMap)
	}
	clone.lookaroundNFA = cloneStateHelper(stateToClone.lookaroundNFA, cloneMap)
	clone.splitState = cloneStateHelper(stateToClone.splitState, cloneMap)
	clone.next = cloneStateHelper(stateToClone.next, cloneMap)
	return clone
}
// resetThreads clears the thread-related fields of the NFA that starts at the given state.
func resetThreads(start *nfaState) {
	// Only the keys of the map are of interest; they record the states already seen.
	resetThreadsHelper(start, map[*nfaState]bool{})
}
// resetThreadsHelper clears threadGroups on 'state' and on the states reachable from it.
// 'next' is always followed; 'splitState' is only followed for alternation states.
// visitedMap records the states that were already handled, so that cycles terminate.
func resetThreadsHelper(state *nfaState, visitedMap map[*nfaState]bool) {
	_, seen := visitedMap[state]
	if state == nil || seen {
		return
	}
	visitedMap[state] = true
	state.threadGroups = nil
	resetThreadsHelper(state.next, visitedMap)
	if state.isAlternation {
		resetThreadsHelper(state.splitState, visitedMap)
	}
}
// checkAssertion reports whether the assertion of the given state holds at index idx
// of str. It returns true if the state doesn't have an assertion.
func (s nfaState) checkAssertion(str []rune, idx int, preferLongest bool) bool {
	switch s.assert {
	case alwaysTrueAssert:
		return true
	case sosAssert:
		// Single-line mode: Beginning of string
		// Multi-line mode: Previous character was newline
		return idx == 0 || (multilineMode && (idx > 0 && str[idx-1] == '\n'))
	case eosAssert:
		// Single-line mode: End of string
		// Multi-line mode: current character is newline
		return idx == len(str) || (multilineMode && str[idx] == '\n')
	case soiAssert:
		// Only true at the start of the input, regardless of mode
		return idx == 0
	case eoiAssert:
		// Only true at the end of the input, regardless of mode
		return idx == len(str)
	case wboundAssert:
		return isWordBoundary(str, idx)
	case nonwboundAssert:
		return !isWordBoundary(str, idx)
	}
	if !s.isLookaround() {
		return true
	}

	// Lookarounds:
	// 1. Take the part of the input that the lookaround applies to - everything from
	//    idx onwards for a lookahead, everything before idx for a lookbehind.
	// 2. Run the lookaround's NFA on it.
	// 3. Based on the kind of lookaround (and the indices we get), decide the result.
	isLookahead := s.assert == plaAssert || s.assert == nlaAssert
	var runesToMatch []rune
	if isLookahead {
		runesToMatch = str[idx:]
	} else {
		runesToMatch = str[:idx]
	}
	strToMatch := ""
	if len(runesToMatch) != 0 {
		strToMatch = string(runesToMatch)
	}
	regComp := Reg{s.lookaroundNFA, s.lookaroundNumCaptureGroups, s.lookaroundRegex, preferLongest}
	numMatchesFound := 0
	for _, matchIdx := range regComp.FindAll(strToMatch) {
		if isLookahead {
			// Lookahead - the match must start at 0. Zero is used because the test-string _starts_ from idx.
			if matchIdx.StartIdx == 0 {
				numMatchesFound++
			}
		} else if matchIdx.EndIdx == idx {
			// Lookbehind - the match must _end_ at the current index.
			numMatchesFound++
		}
	}
	if s.assert == plaAssert || s.assert == plbAssert {
		// Positive assertions want at least one match
		return numMatchesFound > 0
	}
	// Negative assertions only want zero matches
	return numMatchesFound == 0
}
// contentContains reports whether the contents of 's' contain the character at the
// given index of the given string. If 's' is an assertion state, the result of the
// assertion is returned instead. An index past the end of the string never matches.
func (s nfaState) contentContains(str []rune, idx int, preferLongest bool) bool {
	if s.assert != noneAssert {
		return s.checkAssertion(str, idx, preferLongest)
	}
	if idx >= len(str) {
		return false
	}
	c := str[idx]
	if s.allChars {
		// Match only if the character isn't a 'notDotChar', and isn't one of the
		// exception characters for the current node. The two lists are searched
		// separately, which avoids building a combined slice for every character.
		return !slices.Contains(notDotChars, c) && !slices.Contains(s.except, c)
	}
	// Default - s.assert must be NONE
	return slices.Contains(s.content, int(c))
}
// isLookaround reports whether the state is a lookahead or lookbehind assertion
// (positive or negative).
func (s nfaState) isLookaround() bool {
	switch s.assert {
	case plaAssert, plbAssert, nlaAssert, nlbAssert:
		return true
	default:
		return false
	}
}
// numTransitions returns how many of the state's two outgoing pointers
// ('next' and 'splitState') are set.
func (s nfaState) numTransitions() int {
	count := 0
	if s.next != nil {
		count++
	}
	if s.splitState != nil {
		count++
	}
	return count
}
// Returns the matches for the character at the given index of the given string.
// Also returns the number of matches. Returns -1 if an assertion failed.
//func (s nfaState) matchesFor(str []rune, idx int) ([]*nfaState, int) {
// // Assertions can be viewed as 'checks'. If the check fails, we return
// // an empty array and 0.
// // If it passes, we treat it like any other state, and return all the transitions.
// if s.assert != noneAssert {
// if s.checkAssertion(str, idx) == false {
// return make([]*nfaState, 0), -1
// }
// }
// listTransitions := s.transitions[int(str[idx])]
// for _, dest := range s.transitions[int(anyCharRune)] {
// if !slices.Contains(slices.Concat(notDotChars, dest.except), str[idx]) {
// // Add an allChar state to the list of matches if:
// // a. The current character isn't a 'notDotChars' character. In single line mode, this includes newline. In multiline mode, it doesn't.
// // b. The current character isn't the state's exception list.
// listTransitions = append(listTransitions, dest)
// }
// }
// numTransitions := len(listTransitions)
// return listTransitions, numTransitions
//}
// verifyLastStatesHelper performs the depth-first recursion needed for verifyLastStates
//func verifyLastStatesHelper(st *nfaState, visited map[*nfaState]bool) {
// if st.numTransitions() == 0 {
// st.isLast = true
// return
// }
// // if len(state.transitions) == 1 && len(state.transitions[state.content]) == 1 && state.transitions[state.content][0] == state { // Eg. a*
// if st.numTransitions() == 1 { // Eg. a*
// var moreThanOneTrans bool // Dummy variable, check if all the transitions for the current's state's contents have a length of one
// for _, c := range st.content {
// if len(st.transitions[c]) != 1 || st.transitions[c][0] != st {
// moreThanOneTrans = true
// }
// }
// st.isLast = !moreThanOneTrans
// }
//
// if st.isKleene { // A State representing a Kleene Star has transitions going out, which loop back to it. If all those transitions point to the same (single) state, then it must be a last state
// transitionDests := make([]*nfaState, 0)
// for _, v := range st.transitions {
// transitionDests = append(transitionDests, v...)
// }
// if allEqual(transitionDests...) {
// st.isLast = true
// return
// }
// }
// if visited[st] == true {
// return
// }
// visited[st] = true
// for _, states := range st.transitions {
// for i := range states {
// if states[i] != st {
// verifyLastStatesHelper(states[i], visited)
// }
// }
// }
//}
// verifyLastStates enables the 'isLast' flag for the leaf nodes (last states)
//func verifyLastStates(start []*nfaState) {
// verifyLastStatesHelper(start[0], make(map[*nfaState]bool))
//}
// concatenate joins the fragment starting at s1 to the fragment starting at
// s2 and returns the start of the combined fragment.
//
// Every state listed in s1.output gets its 'next' pointed at s2, and s1 then
// takes over s2.output as its own output list (the slice is shared, not
// copied). If s1 is nil there is nothing to join and s2 is returned as-is.
// s2 itself must be non-nil whenever s1 is non-nil, because s2.output is read
// unconditionally.
func concatenate(s1 *nfaState, s2 *nfaState) *nfaState {
	if s1 == nil {
		return s2
	}
	for _, dangling := range s1.output {
		dangling.next = s2
	}
	s1.output = s2.output
	return s1
}
// kleene wraps the fragment starting at s1 in a Kleene-star construct and
// returns the new entry state.
//
// The new state is an empty (epsilon) alternation state flagged 'isKleene'
// whose 'splitState' leads into s1. Every state in s1.output is linked back
// to the new state through 'next', which forms the loop. The new state is the
// only member of its own output list.
//
// An error is returned (with a nil state) when s1 is an empty state that
// carries an assertion, since such a token cannot be quantified.
func kleene(s1 *nfaState) (*nfaState, error) {
	if s1.isEmpty && s1.assert != noneAssert {
		return nil, fmt.Errorf("previous token is not quantifiable")
	}
	star := &nfaState{
		isEmpty:       true,
		isAlternation: true,
		isKleene:      true,
		content:       newContents(epsilon),
		splitState:    s1,
	}
	star.output = []*nfaState{star}
	// Close the loop: everything that leaves s1 comes back to the star state.
	for _, tail := range s1.output {
		tail.next = star
	}
	return star, nil
}
// alternate builds the alternation of the fragments starting at s1 and s2 and
// returns the new entry state.
//
// The new state is an empty (epsilon) state flagged 'isAlternation' with two
// outgoing links: 'next' leads to s1 and 'splitState' leads to s2. Its output
// list is s1.output followed by s2.output, so whatever is concatenated after
// the alternation is attached to the ends of both branches.
func alternate(s1 *nfaState, s2 *nfaState) *nfaState {
	branch := &nfaState{
		isEmpty:       true,
		isAlternation: true,
		content:       newContents(epsilon),
		next:          s1,
		splitState:    s2,
	}
	branch.output = append(branch.output, s1.output...)
	branch.output = append(branch.output, s2.output...)
	return branch
}
// question makes the fragment starting at s1 optional and returns the new
// entry state. It relies on the identity ab? == a(b|).
//
// The new state is an empty (epsilon) alternation state flagged 'isQuestion'
// whose 'splitState' leads into s1; its 'next' is left unset here. Its output
// list is the new state itself followed by every state in s1.output, so a
// later concatenation attaches to both the "skip" path and the "take" path.
//
// An error is returned (with a nil state) when s1 is an empty state that
// carries an assertion, since such a token cannot be quantified.
func question(s1 *nfaState) (*nfaState, error) {
	if s1.isEmpty && s1.assert != noneAssert {
		return nil, fmt.Errorf("previous token is not quantifiable")
	}
	opt := &nfaState{
		isEmpty:       true,
		isAlternation: true,
		isQuestion:    true,
		content:       newContents(epsilon),
		splitState:    s1,
	}
	opt.output = append([]*nfaState{opt}, s1.output...)
	return opt, nil
}
// newState returns a state initialised with the package defaults: no
// assertion (noneAssert), an 'except' list holding the single rune 0, and an
// output list with one entry. All remaining fields keep their zero values.
//
// NOTE(review): the state is returned by value, so the single entry in
// 'output' is the address of this function's local variable, not of the copy
// the caller receives. Callers that need a self-referencing output list have
// to re-point it at their own copy.
func newState() nfaState {
	ret := nfaState{
		assert: noneAssert,
		except: append([]rune{}, 0),
	}
	ret.output = []*nfaState{&ret}
	return ret
}
// zeroLengthMatchState returns a new state that _always_ produces a
// zero-length match: an empty (epsilon) state whose assertion is
// alwaysTrueAssert. The state is the only member of its own output list.
func zeroLengthMatchState() *nfaState {
	st := &nfaState{
		content: newContents(epsilon),
		isEmpty: true,
		assert:  alwaysTrueAssert,
	}
	st.output = []*nfaState{st}
	return st
}
// equals reports whether 's' and 'other' agree on every field compared below.
//
// Slice fields (output, content, except, threadGroups) are compared element
// by element with slices.Equal; for 'output' that means the same state
// pointers in the same order. 'next' and 'splitState' are compared by pointer
// identity, not by structure. Fields not mentioned here are not taken into
// account.
func (s nfaState) equals(other nfaState) bool {
	if s.isEmpty != other.isEmpty || s.isLast != other.isLast {
		return false
	}
	if !slices.Equal(s.output, other.output) || !slices.Equal(s.content, other.content) {
		return false
	}
	if s.next != other.next {
		return false
	}
	if s.isKleene != other.isKleene || s.isQuestion != other.isQuestion || s.isAlternation != other.isAlternation {
		return false
	}
	if s.splitState != other.splitState {
		return false
	}
	if s.assert != other.assert || s.allChars != other.allChars {
		return false
	}
	if !slices.Equal(s.except, other.except) {
		return false
	}
	if s.lookaroundNFA != other.lookaroundNFA {
		return false
	}
	if s.groupBegin != other.groupBegin || s.groupEnd != other.groupEnd || s.groupNum != other.groupNum {
		return false
	}
	return slices.Equal(s.threadGroups, other.threadGroups)
}
// stateExists reports whether 'list' contains a state that is equal to 's'
// according to nfaState.equals. An empty or nil list yields false.
func stateExists(list []nfaState, s nfaState) bool {
	return slices.ContainsFunc(list, func(candidate nfaState) bool {
		return candidate.equals(s)
	})
}

View File

@@ -1,7 +0,0 @@
In PCRE, following a backreference by _any_ digit seems to turn it into an octal escape. Why is this?
Eg.
`(a)\1` <-- Backreference
`(a)\17` <-- Octal '17'
`(a)\19` <-- Octal 1, then literal 9
`(a)\1a` <-- Backreference, then 'a'

View File

@@ -1,943 +0,0 @@
package regex
import (
"fmt"
"slices"
"testing"
)
var reTests = []struct {
re string
flags []ReFlag
str string
result []Group // Stores all zero-groups in the match
}{
{"a", nil, "abc", []Group{{0, 1}}},
{"a", nil, "bca", []Group{{2, 3}}},
{"l", nil, "ggllgg", []Group{{2, 3}, {3, 4}}},
{"(b|c)", nil, "abdceb", []Group{{1, 2}, {3, 4}, {5, 6}}},
{"a+", nil, "brerereraaaaabbbbb", []Group{{8, 13}}},
{"ab+", nil, "qweqweqweaqweqweabbbbbr", []Group{{16, 22}}},
{"(b|c|A)", nil, "ooaoobocA", []Group{{5, 6}, {7, 8}, {8, 9}}},
{"ab*", nil, "a", []Group{{0, 1}}},
{"ab*", nil, "abb", []Group{{0, 3}}},
{"a*b", nil, "aaab", []Group{{0, 4}}},
{"a*b", nil, "qwqw", []Group{}},
{"(abc)*", nil, "abcabcabc", []Group{{0, 9}, {9, 9}}},
{"((abc)|(def))*", nil, "abcdef", []Group{{0, 6}, {6, 6}}},
// This match will only happen with Longest()
// {"(abc)*|(def)*", nil, "abcdef", []Group{{0, 3}, {3, 6}, {6, 6}}},
{"(abc)*|(def)*", nil, "abcdef", []Group{{0, 3}, {3, 3}, {4, 4}, {5, 5}, {6, 6}}},
{"b*a*a", nil, "bba", []Group{{0, 3}}},
{"(ab)+", nil, "abcabddd", []Group{{0, 2}, {3, 5}}},
{"a(b(c|d)*)*", nil, "abccbd", []Group{{0, 6}}},
{"a(b|c)*d+", nil, "abccdd", []Group{{0, 6}}},
{"a*", nil, "", []Group{{0, 0}}},
{"a|b", nil, "c", []Group{}},
{"(a|b)*c", nil, "aabbc", []Group{{0, 5}}},
{"a(b|b)", nil, "ab", []Group{{0, 2}}},
{"a*", nil, "aaaaaaaa", []Group{{0, 8}, {8, 8}}},
{"ab?", nil, "ab", []Group{{0, 2}}},
{"a?b", nil, "ab", []Group{{0, 2}}},
{"a?", nil, "", []Group{{0, 0}}},
{"a?b?c", nil, "a", []Group{}},
{"a?b?c?", nil, "ab", []Group{{0, 2}, {2, 2}}},
{"a?b?c?", nil, "ac", []Group{{0, 2}, {2, 2}}},
{"a?b?c", nil, "abc", []Group{{0, 3}}},
{"a?b?c", nil, "acb", []Group{{0, 2}}},
{"[abc]", nil, "defadefbdefce", []Group{{3, 4}, {7, 8}, {11, 12}}},
{"[ab]c", nil, "ab", []Group{}},
{"g[ab]c", nil, "gac", []Group{{0, 3}}},
{"g[ab]c", nil, "gbc", []Group{{0, 3}}},
{"g[ab]c", nil, "gc", []Group{}},
{"g[ab]c", nil, "gfc", []Group{}},
{"[ab]*", nil, "aabbbabaababab", []Group{{0, 14}, {14, 14}}},
{"[ab]+", nil, "aabbbablaababab", []Group{{0, 7}, {8, 15}}},
{"[Ff]r[Uu]it", nil, "fruit", []Group{{0, 5}}},
{"[Ff]r[Uu]it", nil, "FrUit", []Group{{0, 5}}},
{"[Ff]r[Uu|]it", nil, "Fr|it", []Group{{0, 5}}},
{"[Ff]r([Uu]|[pP])it", nil, "Frpit", []Group{{0, 5}}},
{"[Ff]r[Uu]|[pP]it", nil, "Frpit", []Group{{2, 5}}},
{"[a-zA-Z]+", nil, "Hello, how is it going?", []Group{{0, 5}, {7, 10}, {11, 13}, {14, 16}, {17, 22}}},
{".+", nil, "Hello, how is it going?", []Group{{0, 23}}},
{"a.", nil, "a ", []Group{{0, 2}}},
{"a.b", nil, "a/b", []Group{{0, 3}}},
{".", nil, "a ", []Group{{0, 1}, {1, 2}}},
{"a.", nil, "a ", []Group{{0, 2}}},
{".+b", nil, "abc", []Group{{0, 2}}},
{`\d`, nil, "1a0a3s'''34343s", []Group{{0, 1}, {2, 3}, {4, 5}, {9, 10}, {10, 11}, {11, 12}, {12, 13}, {13, 14}}},
{`\\`, nil, `a\b\c\qwe\`, []Group{{1, 2}, {3, 4}, {5, 6}, {9, 10}}},
{`\W`, nil, `"Hello", he said. How are you doing?`, []Group{{0, 1}, {6, 7}, {7, 8}, {8, 9}, {11, 12}, {16, 17}, {17, 18}, {21, 22}, {25, 26}, {29, 30}, {35, 36}}},
{`\w`, nil, ";';';';';'qwe12", []Group{{10, 11}, {11, 12}, {12, 13}, {13, 14}, {14, 15}}},
{`\s`, nil, "a b c d", []Group{{1, 2}, {3, 4}, {5, 6}, {6, 7}}},
{`\<`, nil, "<HTML><body>", []Group{{0, 1}, {6, 7}}},
{`\(.+\)`, nil, "Not (paranthesized), (so) is (this) not", []Group{{4, 35}}},
{"[^abc]+", nil, "qarbtopsaplpclkpasdmb prejip0r,p", []Group{{0, 1}, {2, 3}, {4, 8}, {9, 12}, {13, 16}, {17, 20}, {21, 32}}},
{"[^a]+", nil, "qqqaq", []Group{{0, 3}, {4, 5}}},
{"[^0-9]+", nil, "a1b2c3dd", []Group{{0, 1}, {2, 3}, {4, 5}, {6, 8}}},
{"[^abc]+", nil, "ababababbababaccacacacaca", []Group{}},
{`\[`, nil, "a[b[c[]]]", []Group{{1, 2}, {3, 4}, {5, 6}}},
{`\([^)]+\)`, nil, "Not (paranthesized), (so) is (this) not", []Group{{4, 19}, {21, 25}, {29, 35}}},
{"^ab", nil, "ab bab", []Group{{0, 2}}},
{"^aaaa^", nil, "aaaaaaaa", []Group{}},
{"^([bB][Gg])", nil, "bG", []Group{{0, 2}}},
{"b$", nil, "ba", []Group{}},
{"(boy|girl)$", nil, "girlf", []Group{}},
{`\bint\b`, nil, "print int integer", []Group{{6, 9}}},
{`int\b`, nil, "ints", []Group{}},
{`int(\b|a)`, nil, "inta", []Group{{0, 4}}},
{`\b\d+\b`, nil, "511 a3 43", []Group{{0, 3}, {7, 9}}},
{`\Bint\B`, nil, "prints int integer print", []Group{{2, 5}}},
{`^`, nil, "5^3^2", []Group{{0, 0}}},
{`\^`, nil, "5^3^2", []Group{{1, 2}, {3, 4}}},
{`pool$`, nil, "pool carpool", []Group{{8, 12}}},
{`^int$`, nil, "print int integer", []Group{}},
{`^int$`, nil, "int", []Group{{0, 3}}},
{`b*`, nil, "aaaaaaaaaaqweqwe", []Group{{0, 0}, {1, 1}, {2, 2}, {3, 3}, {4, 4}, {5, 5}, {6, 6}, {7, 7}, {8, 8}, {9, 9}, {10, 10}, {11, 11}, {12, 12}, {13, 13}, {14, 14}, {15, 15}, {16, 16}}},
{"a{4}", nil, "aabaaa", []Group{}},
{"ab{5}", nil, "abbbbbab", []Group{{0, 6}}},
{"(a|b){3,4}", nil, "aba", []Group{{0, 3}}},
{"(a|b){3,4}", nil, "ababaa", []Group{{0, 4}}},
{"(bc){5,}", nil, "bcbcbcbcbcbcbcbc", []Group{{0, 16}}},
{`\d{3,4}`, nil, "1209", []Group{{0, 4}}},
{`\d{3,4}`, nil, "120", []Group{{0, 3}}},
{`\d{3,4}`, nil, "12709", []Group{{0, 4}}},
{`\d{3,4}`, nil, "12", []Group{}},
{`\d{3,4}`, nil, "109", []Group{{0, 3}}},
{`\d{3,4}`, nil, "5", []Group{}},
{`\d{3,4}`, nil, "123135", []Group{{0, 4}}},
{`\d{3,4}`, nil, "89a-0", []Group{}},
{`\d{3,4}`, nil, "ababab555", []Group{{6, 9}}},
{`\bpaint\b`, nil, "paints", []Group{}},
{`\b\w{5}\b`, nil, "paint", []Group{{0, 5}}},
{`[^\w]`, nil, "abcdef1230[]qq';;'", []Group{{10, 11}, {11, 12}, {14, 15}, {15, 16}, {16, 17}, {17, 18}}},
{`[^\W]`, nil, "abcdef1230[]qq';;'", []Group{{0, 1}, {1, 2}, {2, 3}, {3, 4}, {4, 5}, {5, 6}, {6, 7}, {7, 8}, {8, 9}, {9, 10}, {12, 13}, {13, 14}}},
{`[\[\]]`, nil, "a[b[l]]", []Group{{1, 2}, {3, 4}, {5, 6}, {6, 7}}},
// Unicode tests
{`.+`, nil, "úïäö´«åæïëòöê»éãçâï«úïòíñ", []Group{{0, 25}}},
{`a.b`, nil, "a²b", []Group{{0, 3}}},
{`[^a]+`, nil, "úïäö´«åæïëòöê»éãçâï«úïòíñ", []Group{{0, 25}}},
// Fun experiment - AI-generated tests
{"(abc|def|ghi)", nil, "abcdefg", []Group{{0, 3}, {3, 6}}},
{"a(b|c)d", nil, "abcd", []Group{}},
{"a(b|c)*d", nil, "abcbcd", []Group{{0, 6}}},
{"a(b|c)+d", nil, "abcbcd", []Group{{0, 6}}},
{"a(b|c)?d", nil, "abd", []Group{{0, 3}}},
{".+", nil, "hello world", []Group{{0, 11}}},
{"a.b", nil, "aXb", []Group{{0, 3}}},
{"a.*b", nil, "aXb", []Group{{0, 3}}},
{"a.{2,3}b", nil, "aXXb", []Group{{0, 4}}},
{"a.{2,}b", nil, "aXXXb", []Group{{0, 5}}},
{"a.{0,3}b", nil, "ab", []Group{{0, 2}}},
{"[abc]+", nil, "abcabc", []Group{{0, 6}}},
{"[a-zA-Z]+", nil, "HelloWorld", []Group{{0, 10}}},
{"[^abc]+", nil, "defghi", []Group{{0, 6}}},
{"^hello", nil, "hello world", []Group{{0, 5}}},
{"world$", nil, "hello world", []Group{{6, 11}}},
{`\bhello\b`, nil, "hello world", []Group{{0, 5}}},
{`\Bhello\B`, nil, "hello world", []Group{}},
{"(hello|world)", nil, "hello world", []Group{{0, 5}, {6, 11}}},
{"(hello|world)+", nil, "hello world", []Group{{0, 5}, {6, 11}}},
{"(hello|world)*", nil, "hello world", []Group{{0, 5}, {5, 5}, {6, 11}, {11, 11}}},
{"(hello|world)?", nil, "hello world", []Group{{0, 5}, {5, 5}, {6, 11}, {11, 11}}},
{"ú.+ï", nil, "úïäö´«åæïëòöê»éãçâï«úïòíñ", []Group{{0, 22}}},
{"(?=hello)", nil, "hello world", []Group{{0, 0}}},
{"(?!hello)", nil, "hello world", []Group{{1, 1}, {2, 2}, {3, 3}, {4, 4}, {5, 5}, {6, 6}, {7, 7}, {8, 8}, {9, 9}, {10, 10}, {11, 11}}},
{"(?<=hello)", nil, "hello world", []Group{{5, 5}}},
{"(?<!hello)", nil, "hello world", []Group{{0, 0}, {1, 1}, {2, 2}, {3, 3}, {4, 4}, {6, 6}, {7, 7}, {8, 8}, {9, 9}, {10, 10}, {11, 11}}},
{"^((3[7-9])|([4-9][0-9])|([1-9][0-9][0-9])|(1000))$", nil, "40", []Group{{0, 2}}},
{"^((3[7-9])|([4-9][0-9])|([1-9][0-9][0-9])|(1000))$", nil, "040", []Group{}},
{"^((3[7-9])|([4-9][0-9])|([1-9][0-9][0-9])|(1000))$", nil, "400", []Group{{0, 3}}},
{"^((3[7-9])|([4-9][0-9])|([1-9][0-9][0-9])|(1000))$", nil, "4000", []Group{}},
{"a{1,3}", nil, "aaaaa", []Group{{0, 3}, {3, 5}}},
{`\\[ab\\]`, nil, "a", []Group{}},
{`\\[ab\\]`, nil, `\a`, []Group{{0, 2}}},
// Lookaround tests
{"(?<=bo)y", nil, "boy", []Group{{2, 3}}},
{"bo(?=y)", nil, "boy", []Group{{0, 2}}},
{"(?<=f)f+(?=f)", nil, "fffff", []Group{{1, 4}}},
{"(?<=f)f+(?=f)", nil, "fffffa", []Group{{1, 4}}},
// Some POSIX charclass tests
{"[[:lower:]]+", nil, "abcdefghijklmnopqrstuvwyxzABCDEFGHIJKLMNOPRQSTUVWXYZ0123456789!@#$%^&*", []Group{{0, 26}}},
{"[[:upper:]]+", nil, "abcdefghijklmnopqrstuvwyxzABCDEFGHIJKLMNOPRQSTUVWXYZ0123456789!@#$%^&*", []Group{{26, 52}}},
{"[[:alpha:]]+", nil, "abcdefghijklmnopqrstuvwyxzABCDEFGHIJKLMNOPRQSTUVWXYZ0123456789!@#$%^&*", []Group{{0, 52}}},
{"[[:digit:]]+", nil, "abcdefghijklmnopqrstuvwyxzABCDEFGHIJKLMNOPRQSTUVWXYZ0123456789!@#$%^&*", []Group{{52, 62}}},
{"[[:alnum:]]+", nil, "abcdefghijklmnopqrstuvwyxzABCDEFGHIJKLMNOPRQSTUVWXYZ0123456789!@#$%^&*", []Group{{0, 62}}},
{"[[:punct:]]+", nil, "abcdefghijklmnopqrstuvwyxzABCDEFGHIJKLMNOPRQSTUVWXYZ0123456789!@#$%^&*", []Group{{62, 70}}},
{"[[:ascii:]]+", nil, "abcdefghijklmnopqrstuvwyxzABCDEFGHIJKLMNOPRQSTUVWXYZ0123456789!@#$%^&*", []Group{{0, 70}}},
{"[[:graph:]]+", nil, "abcdefghijklmnopqrstuvwyxzABCDEFGHIJKLMNOPRQSTUVWXYZ0123456789!@#$%^&*", []Group{{0, 70}}},
// Test cases from Python's RE test suite
{`[\1]`, nil, "\x01", []Group{{0, 1}}},
{`\0`, nil, "\x00", []Group{{0, 1}}},
{`[\0a]`, nil, "\x00", []Group{{0, 1}}},
{`[\0a]`, nil, "\x00", []Group{{0, 1}}},
{`[a\0]`, nil, "\x00", []Group{{0, 1}}},
{`[^a\0]`, nil, "\x00", []Group{}},
{`\a[\b]\f\n\r\t\v`, nil, "\a\b\f\n\r\t\v", []Group{{0, 7}}},
{`[\a][\b][\f][\n][\r][\t][\v]`, nil, "\a\b\f\n\r\t\v", []Group{{0, 7}}},
{`\u`, nil, "", nil},
{`\xff`, nil, "ÿ", []Group{{0, 1}}},
{`\x00ffffffffffffff`, nil, "\xff", []Group{}},
{`\x00f`, nil, "\x0f", []Group{}},
{`\x00fe`, nil, "\xfe", []Group{}},
{`^\w+=(\\[\000-\277]|[^\n\\])*`, nil, "SRC=eval.c g.c blah blah blah \\\\\n\tapes.c", []Group{{0, 32}}},
{`a.b`, nil, `acb`, []Group{{0, 3}}},
{`a.b`, nil, "a\nb", []Group{}},
{`a.*b`, nil, "acc\nccb", []Group{}},
{`a.{4,5}b`, nil, "acc\nccb", []Group{}},
{`a.b`, nil, "a\rb", []Group{{0, 3}}},
{`a.b`, []ReFlag{RE_SINGLE_LINE}, "a\nb", []Group{{0, 3}}},
{`a.*b`, []ReFlag{RE_SINGLE_LINE}, "acc\nccb", []Group{{0, 7}}},
{`a.{4,5}b`, []ReFlag{RE_SINGLE_LINE}, "acc\nccb", []Group{{0, 7}}},
{`)`, nil, ``, nil},
{`^$`, nil, ``, []Group{{0, 0}}},
{`abc`, nil, `abc`, []Group{{0, 3}}},
{`abc`, nil, `xbc`, []Group{}},
{`abc`, nil, `axc`, []Group{}},
{`abc`, nil, `abx`, []Group{}},
{`abc`, nil, `xabcy`, []Group{{1, 4}}},
{`abc`, nil, `ababc`, []Group{{2, 5}}},
{`ab*c`, nil, `abc`, []Group{{0, 3}}},
{`ab*bc`, nil, `abc`, []Group{{0, 3}}},
{`ab*bc`, nil, `abbc`, []Group{{0, 4}}},
{`ab*bc`, nil, `abbbbc`, []Group{{0, 6}}},
{`ab{0,}c`, nil, `abbbbc`, []Group{{0, 6}}},
{`ab+bc`, nil, `abbc`, []Group{{0, 4}}},
{`ab+bc`, nil, `abc`, []Group{}},
{`ab+bc`, nil, `abq`, []Group{}},
{`ab{1,}bc`, nil, `abq`, []Group{}},
{`ab+bc`, nil, `abbbbc`, []Group{{0, 6}}},
{`ab{1,}bc`, nil, `abbbbc`, []Group{{0, 6}}},
{`ab{1,3}bc`, nil, `abbbbc`, []Group{{0, 6}}},
{`ab{3,4}bc`, nil, `abbbbc`, []Group{{0, 6}}},
{`ab{4,5}bc`, nil, `abbbbc`, []Group{}},
{`ab?bc`, nil, `abbc`, []Group{{0, 4}}},
{`ab?bc`, nil, `abc`, []Group{{0, 3}}},
{`ab{0,1}bc`, nil, `abc`, []Group{{0, 3}}},
{`ab?bc`, nil, `abbbbc`, []Group{}},
{`ab?c`, nil, `abc`, []Group{{0, 3}}},
{`^abc$`, nil, `abc`, []Group{{0, 3}}},
{`^abc$`, nil, `abcc`, []Group{}},
{`^abc`, nil, `abcc`, []Group{{0, 3}}},
{`^abc$`, nil, `aabc`, []Group{}},
{`abc$`, nil, `aabc`, []Group{{1, 4}}},
{`^`, nil, `abc`, []Group{{0, 0}}},
{`$`, nil, `abc`, []Group{{3, 3}}},
{`a.c`, nil, `abc`, []Group{{0, 3}}},
{`a.c`, nil, `axc`, []Group{{0, 3}}},
{`a.*c`, nil, `axyzc`, []Group{{0, 5}}},
{`a.*c`, nil, `axyzd`, []Group{}},
{`a[bc]d`, nil, `abc`, []Group{}},
{`a[bc]d`, nil, `abd`, []Group{{0, 3}}},
{`a[b-d]e`, nil, `abd`, []Group{}},
{`a[b-d]e`, nil, `ace`, []Group{{0, 3}}},
{`a[b-d]`, nil, `aac`, []Group{{1, 3}}},
{`a[-b]`, nil, `a-`, []Group{{0, 2}}}, // If a character class has a hyphen without a start or end character, it is treated as a literal hyphen
{`a[\-b]`, nil, `a-`, []Group{{0, 2}}},
{`a[b-]`, nil, `a-`, []Group{{0, 2}}}, // If a character class has a hyphen without a start or end character, it is treated as a literal hyphen
{`a[]b`, nil, `-`, nil},
{`a[`, nil, `-`, nil},
{`a\`, nil, `-`, nil},
{`abc)`, nil, `-`, nil},
{`(abc`, nil, `-`, nil},
{`a]`, nil, `a]`, []Group{{0, 2}}},
{`a[]]b`, nil, `a]b`, []Group{{0, 3}}},
{`a[\]]b`, nil, `a]b`, []Group{{0, 3}}},
{`a[^bc]d`, nil, `aed`, []Group{{0, 3}}},
{`a[^bc]d`, nil, `abd`, []Group{}},
{`a[^-b]c`, nil, `adc`, []Group{{0, 3}}},
{`a[^-b]c`, nil, `a-c`, []Group{}},
{`a[^]b]c`, nil, `a]c`, []Group{}},
{`a[^]b]c`, nil, `adc`, []Group{{0, 3}}},
{`\ba\b`, nil, `a-`, []Group{{0, 1}}},
{`\ba\b`, nil, `-a`, []Group{{1, 2}}},
{`\ba\b`, nil, `-a-`, []Group{{1, 2}}},
{`\by\b`, nil, `xy`, []Group{}},
{`\by\b`, nil, `yz`, []Group{}},
{`\by\b`, nil, `xyz`, []Group{}},
{`x\b`, nil, `xyz`, []Group{}},
{`x\B`, nil, `xyz`, []Group{{0, 1}}},
{`\Bz`, nil, `xyz`, []Group{{2, 3}}},
{`z\B`, nil, `xyz`, []Group{}},
{`\Bx`, nil, `xyz`, []Group{}},
{`\Ba\B`, nil, `a-`, []Group{}},
{`\Ba\B`, nil, `-a`, []Group{}},
{`\Ba\B`, nil, `-a-`, []Group{}},
{`\By\B`, nil, `xy`, []Group{}},
{`\By\B`, nil, `yz`, []Group{}},
{`\By\b`, nil, `xy`, []Group{{1, 2}}},
{`\by\B`, nil, `yz`, []Group{{0, 1}}},
{`\By\B`, nil, `xyz`, []Group{{1, 2}}},
{`ab|cd`, nil, `abc`, []Group{{0, 2}}},
{`ab|cd`, nil, `abcd`, []Group{{0, 2}, {2, 4}}},
{`$b`, nil, `b`, []Group{}},
{`a\(b`, nil, `a(b`, []Group{{0, 3}}},
{`a\(*b`, nil, `ab`, []Group{{0, 2}}},
{`a\(*b`, nil, `a((b`, []Group{{0, 4}}},
{`a\\b`, nil, `a\b`, []Group{{0, 3}}},
{`a+b+c`, nil, `aabbabc`, []Group{{4, 7}}},
{`a{1,}b{1,}c`, nil, `aabbabc`, []Group{{4, 7}}},
{`)(`, nil, `-`, nil},
{`[^ab]*`, nil, `cde`, []Group{{0, 3}, {3, 3}}},
{`abc`, nil, ``, []Group{}},
{`a*`, nil, ``, []Group{{0, 0}}},
{`a|b|c|d|e`, nil, `e`, []Group{{0, 1}}},
{`abcd*efg`, nil, `abcdefg`, []Group{{0, 7}}},
{`ab*`, nil, `xabyabbbz`, []Group{{1, 3}, {4, 8}}},
{`ab*`, nil, `xayabbbz`, []Group{{1, 2}, {3, 7}}},
{`[abhgefdc]ij`, nil, `hij`, []Group{{0, 3}}},
{`a[bcd]*dcdcde`, nil, `adcdcde`, []Group{{0, 7}}},
{`a[bcd]+dcdcde`, nil, `adcdcde`, []Group{}},
{`[a-zA-Z_][a-zA-Z0-9_]*`, nil, `alpha`, []Group{{0, 5}}},
{`multiple words of text`, nil, `uh-uh`, []Group{}},
{`multiple words`, nil, `multiple words, yeah`, []Group{{0, 14}}},
{`[k]`, nil, `ab`, []Group{}},
{`a[-]?c`, nil, `ac`, []Group{{0, 2}}},
{`^(.+)?B`, nil, `AB`, []Group{{0, 2}}},
{`\0009`, nil, "\x009", []Group{{0, 2}}},
{`\141`, nil, "a", []Group{{0, 1}}},
// At this point, the Python test suite has a bunch
// of backreference tests. Since my engine doesn't
// implement backreferences, I've skipped those tests.
{`*a`, nil, ``, nil},
{`(*)b`, nil, ``, nil},
{`a**`, nil, ``, nil},
{`^`, nil, `abc`, []Group{{0, 0}}},
{`$`, nil, `abc`, []Group{{3, 3}}},
{`a[b-]`, nil, `a-`, []Group{{0, 2}}},
{`a[b-a]`, nil, `a-`, nil},
// Case-insensitive matching tests
{`abc`, []ReFlag{RE_CASE_INSENSITIVE}, `ABC`, []Group{{0, 3}}},
{`abc`, []ReFlag{RE_CASE_INSENSITIVE}, `XBC`, []Group{}},
{`abc`, []ReFlag{RE_CASE_INSENSITIVE}, `AXC`, []Group{}},
{`abc`, []ReFlag{RE_CASE_INSENSITIVE}, `ABX`, []Group{}},
{`abc`, []ReFlag{RE_CASE_INSENSITIVE}, `XABCY`, []Group{{1, 4}}},
{`abc`, []ReFlag{RE_CASE_INSENSITIVE}, `ABABC`, []Group{{2, 5}}},
{`ab*c`, []ReFlag{RE_CASE_INSENSITIVE}, `ABC`, []Group{{0, 3}}},
{`ab*bc`, []ReFlag{RE_CASE_INSENSITIVE}, `ABC`, []Group{{0, 3}}},
{`ab*bc`, []ReFlag{RE_CASE_INSENSITIVE}, `ABBC`, []Group{{0, 4}}},
{`ab*bc`, []ReFlag{RE_CASE_INSENSITIVE}, `ABBBBC`, []Group{{0, 6}}},
{`ab{0,}c`, []ReFlag{RE_CASE_INSENSITIVE}, `ABBBBC`, []Group{{0, 6}}},
{`ab+bc`, []ReFlag{RE_CASE_INSENSITIVE}, `ABBC`, []Group{{0, 4}}},
{`ab+bc`, []ReFlag{RE_CASE_INSENSITIVE}, `ABC`, []Group{}},
{`ab+bc`, []ReFlag{RE_CASE_INSENSITIVE}, `ABQ`, []Group{}},
{`ab{1,}bc`, []ReFlag{RE_CASE_INSENSITIVE}, `ABQ`, []Group{}},
{`ab+bc`, []ReFlag{RE_CASE_INSENSITIVE}, `ABBBBC`, []Group{{0, 6}}},
{`ab{1,}bc`, []ReFlag{RE_CASE_INSENSITIVE}, `ABBBBC`, []Group{{0, 6}}},
{`ab{1,3}bc`, []ReFlag{RE_CASE_INSENSITIVE}, `ABBBBC`, []Group{{0, 6}}},
{`ab{3,4}bc`, []ReFlag{RE_CASE_INSENSITIVE}, `ABBBBC`, []Group{{0, 6}}},
{`ab{4,5}bc`, []ReFlag{RE_CASE_INSENSITIVE}, `ABBBBC`, []Group{}},
{`ab?bc`, []ReFlag{RE_CASE_INSENSITIVE}, `ABBC`, []Group{{0, 4}}},
{`ab?bc`, []ReFlag{RE_CASE_INSENSITIVE}, `ABC`, []Group{{0, 3}}},
{`ab{0,1}bc`, []ReFlag{RE_CASE_INSENSITIVE}, `ABC`, []Group{{0, 3}}},
{`ab?bc`, []ReFlag{RE_CASE_INSENSITIVE}, `ABBBBC`, []Group{}},
{`ab?c`, []ReFlag{RE_CASE_INSENSITIVE}, `ABC`, []Group{{0, 3}}},
{`^abc$`, []ReFlag{RE_CASE_INSENSITIVE}, `ABC`, []Group{{0, 3}}},
{`^abc$`, []ReFlag{RE_CASE_INSENSITIVE}, `ABCC`, []Group{}},
{`^abc`, []ReFlag{RE_CASE_INSENSITIVE}, `ABCC`, []Group{{0, 3}}},
{`^abc$`, []ReFlag{RE_CASE_INSENSITIVE}, `AABC`, []Group{}},
{`abc$`, []ReFlag{RE_CASE_INSENSITIVE}, `AABC`, []Group{{1, 4}}},
{`^`, []ReFlag{RE_CASE_INSENSITIVE}, `ABC`, []Group{{0, 0}}},
{`$`, []ReFlag{RE_CASE_INSENSITIVE}, `ABC`, []Group{{3, 3}}},
{`a.c`, []ReFlag{RE_CASE_INSENSITIVE}, `ABC`, []Group{{0, 3}}},
{`a.c`, []ReFlag{RE_CASE_INSENSITIVE}, `AXC`, []Group{{0, 3}}},
{`a.*c`, []ReFlag{RE_CASE_INSENSITIVE}, `AXYZC`, []Group{{0, 5}}},
{`a.*c`, []ReFlag{RE_CASE_INSENSITIVE}, `AXYZD`, []Group{}},
{`a[bc]d`, []ReFlag{RE_CASE_INSENSITIVE}, `ABC`, []Group{}},
{`a[bc]d`, []ReFlag{RE_CASE_INSENSITIVE}, `ABD`, []Group{{0, 3}}},
{`a[b-d]e`, []ReFlag{RE_CASE_INSENSITIVE}, `ABD`, []Group{}},
{`a[b-d]e`, []ReFlag{RE_CASE_INSENSITIVE}, `ACE`, []Group{{0, 3}}},
{`a[b-d]`, []ReFlag{RE_CASE_INSENSITIVE}, `AAC`, []Group{{1, 3}}},
{`a[-b]`, []ReFlag{RE_CASE_INSENSITIVE}, `A-`, []Group{{0, 2}}}, // If a character class has a hyphen without a start or end character, it is treated as a literal hyphen
{`a[\-b]`, []ReFlag{RE_CASE_INSENSITIVE}, `A-`, []Group{{0, 2}}},
{`a[b-]`, []ReFlag{RE_CASE_INSENSITIVE}, `A-`, []Group{{0, 2}}}, // If a character class has a hyphen without a start or end character, it is treated as a literal hyphen
{`a[]b`, []ReFlag{RE_CASE_INSENSITIVE}, `-`, nil},
{`a[`, []ReFlag{RE_CASE_INSENSITIVE}, `-`, nil},
{`a\`, []ReFlag{RE_CASE_INSENSITIVE}, `-`, nil},
{`abc)`, []ReFlag{RE_CASE_INSENSITIVE}, `-`, nil},
{`(abc`, []ReFlag{RE_CASE_INSENSITIVE}, `-`, nil},
{`a]`, []ReFlag{RE_CASE_INSENSITIVE}, `A]`, []Group{{0, 2}}},
{`a[]]b`, []ReFlag{RE_CASE_INSENSITIVE}, `A]B`, []Group{{0, 3}}},
{`a[\]]b`, []ReFlag{RE_CASE_INSENSITIVE}, `A]B`, []Group{{0, 3}}},
{`a[^bc]d`, []ReFlag{RE_CASE_INSENSITIVE}, `AED`, []Group{{0, 3}}},
{`a[^bc]d`, []ReFlag{RE_CASE_INSENSITIVE}, `ABD`, []Group{}},
{`a[^-b]c`, []ReFlag{RE_CASE_INSENSITIVE}, `ADC`, []Group{{0, 3}}},
{`a[^-b]c`, []ReFlag{RE_CASE_INSENSITIVE}, `A-C`, []Group{}},
{`a[^]b]c`, []ReFlag{RE_CASE_INSENSITIVE}, `A]C`, []Group{}},
{`a[^]b]c`, []ReFlag{RE_CASE_INSENSITIVE}, `ADC`, []Group{{0, 3}}},
{`\ba\b`, []ReFlag{RE_CASE_INSENSITIVE}, `A-`, []Group{{0, 1}}},
{`\ba\b`, []ReFlag{RE_CASE_INSENSITIVE}, `-A`, []Group{{1, 2}}},
{`\ba\b`, []ReFlag{RE_CASE_INSENSITIVE}, `-A-`, []Group{{1, 2}}},
{`\by\b`, []ReFlag{RE_CASE_INSENSITIVE}, `XY`, []Group{}},
{`\by\b`, []ReFlag{RE_CASE_INSENSITIVE}, `YZ`, []Group{}},
{`\by\b`, []ReFlag{RE_CASE_INSENSITIVE}, `XYZ`, []Group{}},
{`x\b`, []ReFlag{RE_CASE_INSENSITIVE}, `XYZ`, []Group{}},
{`x\B`, []ReFlag{RE_CASE_INSENSITIVE}, `XYZ`, []Group{{0, 1}}},
{`\Bz`, []ReFlag{RE_CASE_INSENSITIVE}, `XYZ`, []Group{{2, 3}}},
{`z\B`, []ReFlag{RE_CASE_INSENSITIVE}, `XYZ`, []Group{}},
{`\Bx`, []ReFlag{RE_CASE_INSENSITIVE}, `XYZ`, []Group{}},
{`\Ba\B`, []ReFlag{RE_CASE_INSENSITIVE}, `A-`, []Group{}},
{`\Ba\B`, []ReFlag{RE_CASE_INSENSITIVE}, `-A`, []Group{}},
{`\Ba\B`, []ReFlag{RE_CASE_INSENSITIVE}, `-A-`, []Group{}},
{`\By\B`, []ReFlag{RE_CASE_INSENSITIVE}, `XY`, []Group{}},
{`\By\B`, []ReFlag{RE_CASE_INSENSITIVE}, `YZ`, []Group{}},
{`\By\b`, []ReFlag{RE_CASE_INSENSITIVE}, `XY`, []Group{{1, 2}}},
{`\by\B`, []ReFlag{RE_CASE_INSENSITIVE}, `YZ`, []Group{{0, 1}}},
{`\By\B`, []ReFlag{RE_CASE_INSENSITIVE}, `XYZ`, []Group{{1, 2}}},
{`ab|cd`, []ReFlag{RE_CASE_INSENSITIVE}, `ABC`, []Group{{0, 2}}},
{`ab|cd`, []ReFlag{RE_CASE_INSENSITIVE}, `ABCD`, []Group{{0, 2}, {2, 4}}},
{`$b`, []ReFlag{RE_CASE_INSENSITIVE}, `B`, []Group{}},
{`a\(b`, []ReFlag{RE_CASE_INSENSITIVE}, `A(B`, []Group{{0, 3}}},
{`a\(*b`, []ReFlag{RE_CASE_INSENSITIVE}, `AB`, []Group{{0, 2}}},
{`a\(*b`, []ReFlag{RE_CASE_INSENSITIVE}, `A((B`, []Group{{0, 4}}},
{`a\\b`, []ReFlag{RE_CASE_INSENSITIVE}, `A\B`, []Group{{0, 3}}},
{`a+b+c`, []ReFlag{RE_CASE_INSENSITIVE}, `AABBABC`, []Group{{4, 7}}},
{`a{1,}b{1,}c`, []ReFlag{RE_CASE_INSENSITIVE}, `AABBABC`, []Group{{4, 7}}},
{`)(`, []ReFlag{RE_CASE_INSENSITIVE}, `-`, nil},
{`[^ab]*`, []ReFlag{RE_CASE_INSENSITIVE}, `CDE`, []Group{{0, 3}, {3, 3}}},
{`abc`, []ReFlag{RE_CASE_INSENSITIVE}, ``, []Group{}},
{`a*`, []ReFlag{RE_CASE_INSENSITIVE}, ``, []Group{{0, 0}}},
{`a|b|c|d|e`, []ReFlag{RE_CASE_INSENSITIVE}, `E`, []Group{{0, 1}}},
{`abcd*efg`, []ReFlag{RE_CASE_INSENSITIVE}, `ABCDEFG`, []Group{{0, 7}}},
{`ab*`, []ReFlag{RE_CASE_INSENSITIVE}, `XABYABBBZ`, []Group{{1, 3}, {4, 8}}},
{`ab*`, []ReFlag{RE_CASE_INSENSITIVE}, `XAYABBBZ`, []Group{{1, 2}, {3, 7}}},
{`[abhgefdc]ij`, []ReFlag{RE_CASE_INSENSITIVE}, `HIJ`, []Group{{0, 3}}},
{`a[bcd]*dcdcde`, []ReFlag{RE_CASE_INSENSITIVE}, `ADCDCDE`, []Group{{0, 7}}},
{`a[bcd]+dcdcde`, []ReFlag{RE_CASE_INSENSITIVE}, `ADCDCDE`, []Group{}},
{`[a-zA-Z_][a-zA-Z0-9_]*`, []ReFlag{RE_CASE_INSENSITIVE}, `ALPHA`, []Group{{0, 5}}},
{`multiple words of text`, []ReFlag{RE_CASE_INSENSITIVE}, `UH-UH`, []Group{}},
{`multiple words`, []ReFlag{RE_CASE_INSENSITIVE}, `MULTIPLE WORDS, YEAH`, []Group{{0, 14}}},
{`[k]`, []ReFlag{RE_CASE_INSENSITIVE}, `AB`, []Group{}},
{`a[-]?c`, []ReFlag{RE_CASE_INSENSITIVE}, `AC`, []Group{{0, 2}}},
{`^(.+)?B`, []ReFlag{RE_CASE_INSENSITIVE}, `ab`, []Group{{0, 2}}},
{`\0009`, []ReFlag{RE_CASE_INSENSITIVE}, "\x009", []Group{{0, 2}}},
{`\141`, []ReFlag{RE_CASE_INSENSITIVE}, "A", []Group{{0, 1}}},
{`a[-]?c`, []ReFlag{RE_CASE_INSENSITIVE}, `AC`, []Group{{0, 2}}},
{`a(?!b).`, nil, `abad`, []Group{{2, 4}}},
{`a(?=d).`, nil, `abad`, []Group{{2, 4}}},
{`a(?=c|d).`, nil, `abad`, []Group{{2, 4}}},
{`^abc`, nil, "jkl\nabc\nxyz", []Group{}},
{`^abc`, []ReFlag{RE_MULTILINE}, "jkl\nabc\nxyz", []Group{{4, 7}}},
{`abc$`, nil, "jkl\nabc\nxyz", []Group{}},
{`abc$`, []ReFlag{RE_MULTILINE}, "jkl\nabc\nxyz", []Group{{4, 7}}},
{`abc$`, []ReFlag{RE_MULTILINE}, "jkl\n123abc\nxyz", []Group{{7, 10}}},
{`^`, nil, "jkl\n123abc\nxyz", []Group{{0, 0}}},
{`^`, []ReFlag{RE_MULTILINE}, "jkl\n123abc\nxyz", []Group{{0, 0}, {4, 4}, {11, 11}}},
{`\A`, []ReFlag{RE_MULTILINE}, "jkl\n123abc\nxyz", []Group{{0, 0}}},
{`$`, nil, "jkl\n123abc\nxyz", []Group{{14, 14}}},
{`$`, []ReFlag{RE_MULTILINE}, "jkl\n123abc\nxyz", []Group{{3, 3}, {10, 10}, {14, 14}}},
{`\z`, []ReFlag{RE_MULTILINE}, "jkl\n123abc\nxyz", []Group{{14, 14}}},
{`^abc\z`, []ReFlag{RE_MULTILINE}, "abc\nabc\nabc", []Group{{8, 11}}},
{`a.b`, nil, "a\nb", []Group{}},
{`a.b`, []ReFlag{RE_SINGLE_LINE}, "a\nb", []Group{{0, 3}}},
{`\w+`, nil, `--ab_cd0123--`, []Group{{2, 11}}},
{`[\w]+`, nil, `--ab_cd0123--`, []Group{{2, 11}}},
{`\D+`, nil, `1234abc5678`, []Group{{4, 7}}},
{`[\D]+`, nil, `1234abc5678`, []Group{{4, 7}}},
{`[\D5]+`, nil, `1234abc5678`, []Group{{4, 8}}},
{`[\da-fA-F]+`, nil, `123abc`, []Group{{0, 6}}},
{`\xff`, nil, "\u00ff", []Group{{0, 1}}},
{`\xFF`, nil, "\u00ff", []Group{{0, 1}}},
{`\x00ff`, nil, "\u00ff", []Group{}},
{`\x{0000ff}`, nil, "\u00ff", []Group{{0, 1}}},
{`\x{0000FF}`, nil, "\u00ff", []Group{{0, 1}}},
{"\t\n\v\r\f\a", nil, "\t\n\v\r\f\a", []Group{{0, 6}}},
{`\t\n\v\r\f\a`, nil, "\t\n\v\r\f\a", []Group{{0, 6}}},
{`[\t][\n][\v][\r][\f][\b]`, nil, "\t\n\v\r\f\b", []Group{{0, 6}}},
{`.*d`, nil, "abc\nabd", []Group{{4, 7}}},
{`(`, nil, "-", nil},
{`[\41]`, nil, `!`, []Group{{0, 1}}},
{`(?<!abc)(d.f)`, nil, `abcdefdof`, []Group{{6, 9}}},
{`[\w-]+`, nil, `laser_beam`, []Group{{0, 10}}},
{`M+`, []ReFlag{RE_CASE_INSENSITIVE}, `MMM`, []Group{{0, 3}}},
{`m+`, []ReFlag{RE_CASE_INSENSITIVE}, `MMM`, []Group{{0, 3}}},
{`[M]+`, []ReFlag{RE_CASE_INSENSITIVE}, `MMM`, []Group{{0, 3}}},
{`[m]+`, []ReFlag{RE_CASE_INSENSITIVE}, `MMM`, []Group{{0, 3}}},
{`^*`, nil, `-`, nil},
{`a[^>]*b`, nil, `a>b`, []Group{}},
{`^a*$`, nil, `foo`, []Group{}},
// Out-of-bounds for character classes
{`[b-e]`, nil, `a`, []Group{}},
{`[b-e]`, nil, `f`, []Group{}},
{`*?`, nil, `-`, nil},
{`a*?`, nil, `-`, nil}, // non-greedy operators are not supported
// Numeric range tests - this is a feature that I added, and doesn't exist
// in any other mainstream regex engine
{`<0-255>`, nil, `0`, []Group{{0, 1}}},
{`<0-255>`, nil, `7`, []Group{{0, 1}}},
{`<0-255>`, nil, `46`, []Group{{0, 2}}},
{`<0-255>`, nil, `90`, []Group{{0, 2}}},
{`<0-255>`, nil, `107`, []Group{{0, 3}}},
{`<0-255>`, nil, `198`, []Group{{0, 3}}},
{`<0-255>`, nil, `254`, []Group{{0, 3}}},
{`<0-255>`, nil, `255`, []Group{{0, 3}}},
{`<0-255>`, nil, `256`, []Group{{0, 2}, {2, 3}}},
{`^<0-255>$`, nil, `256`, []Group{}},
{`^<0-299792458>$`, nil, `299000999`, []Group{{0, 9}}},
{`^<0-299792458>$`, nil, `299792531`, []Group{}},
{`^<3-0>$`, nil, `-`, nil},
{`^<0-0>$`, nil, `0`, []Group{{0, 1}}},
{`2<0-55>`, nil, `231`, []Group{{0, 3}}},
{`2<0-55>`, nil, `271`, []Group{{0, 2}}},
{`^2<0-55>$`, nil, `271`, []Group{}},
{`<389`, nil, `-`, nil},
{`<389>`, nil, `-`, nil},
{`<-389>`, nil, `-`, nil},
{`<389->`, nil, `-`, nil},
{`<389-400`, nil, `-`, nil},
{`<389-400>`, nil, `391`, []Group{{0, 3}}},
{`\b<1-10000>\b`, nil, `America declared independence in 1776.`, []Group{{33, 37}}},
}
// groupTests is the table of capture-group cases that the submatch tests
// (TestFindSubmatch, TestFindStringSubmatch, TestFindAllStringSubmatch and
// TestFindAllSubmatch) iterate over.
//
// Each entry holds the pattern (re), the flags passed to Compile, the input
// string (str) and the expected matches (result). Within a Match the first
// Group is the span of the whole match and the following Groups are the
// capturing groups in order. {-1, -1} is written where a group is expected
// not to take part in the match. The tests only compare an unmatched group
// against the expectation when the expectation has an entry at that index,
// so trailing unmatched groups may be left out of an expected Match.
var groupTests = []struct {
re string
flags []ReFlag
str string
result []Match
}{
{"(a)(b)", nil, "ab", []Match{[]Group{{0, 2}, {0, 1}, {1, 2}}}},
{"((a))(b)", nil, "ab", []Match{[]Group{{0, 2}, {0, 1}, {0, 1}, {1, 2}}}},
{"(0)", nil, "ab", []Match{}},
{"(a)b", nil, "ab", []Match{[]Group{{0, 2}, {0, 1}}}},
{"a(b)", nil, "ab", []Match{[]Group{{0, 2}, {1, 2}}}},
{"(a|b)", nil, "ab", []Match{[]Group{{0, 1}, {0, 1}}, []Group{{1, 2}, {1, 2}}}},
{"(a)|(b)", nil, "ab", []Match{[]Group{{0, 1}, {0, 1}, {-1, -1}}, []Group{{1, 2}, {-1, -1}, {1, 2}}}},
{"(a+)(a)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 3}, {3, 4}}}},
{"(a+)|(a)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 4}, {-1, -1}}}},
{"(a+)(aa)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 2}, {2, 4}}}},
{"(aaaa)|(aaaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 4}, {-1, -1}}}},
// This match will only happen with Longest()
// {"(aaa)|(aaaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {-1, -1}, {0, 4}}}},
{"(aaa)|(aaaa)", nil, "aaaa", []Match{[]Group{{0, 3}, {0, 3}, {-1, -1}}}},
{"(aaaa)|(aaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 4}, {-1, -1}}}},
{"(a)|(aa)", nil, "aa", []Match{[]Group{{0, 1}, {0, 1}}, []Group{{1, 2}, {1, 2}}}},
{"(a?)a?", nil, "b", []Match{[]Group{{0, 0}, {0, 0}}, []Group{{1, 1}, {1, 1}}}},
{"(a?)a?", nil, "ab", []Match{[]Group{{0, 1}, {0, 1}}, []Group{{1, 1}, {1, 1}}, []Group{{2, 2}, {2, 2}}}},
{"(a?)a?", nil, "aa", []Match{[]Group{{0, 2}, {0, 1}}, []Group{{2, 2}, {2, 2}}}},
{"a((b.d){3})", nil, "abfdbhdbid", []Match{[]Group{{0, 10}, {1, 10}, {7, 10}}}},
// Test cases from Python's RE test suite
{`(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)(l)\071`, nil, `abcdefghijkl9`, []Match{[]Group{{0, 13}, {0, 1}, {1, 2}, {2, 3}, {3, 4}, {4, 5}, {5, 6}, {6, 7}, {7, 8}, {8, 9}, {9, 10}, {10, 11}, {11, 12}}}},
{`()ef`, nil, `def`, []Match{[]Group{{1, 3}, {1, 1}}}},
{`(?:)ef`, nil, `def`, []Match{[]Group{{1, 3}}}},
{`(?:)`, nil, `def`, []Match{[]Group{{0, 0}}, []Group{{1, 1}}, []Group{{2, 2}}, []Group{{3, 3}}}},
{`((a))`, nil, `abc`, []Match{[]Group{{0, 1}, {0, 1}, {0, 1}}}},
{`(a)b(c)`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 1}, {2, 3}}}},
{`(a+|b)*`, nil, `ab`, []Match{[]Group{{0, 2}, {1, 2}}, []Group{{2, 2}}}},
{`(a+|b){0,}`, nil, `ab`, []Match{[]Group{{0, 2}, {1, 2}}, []Group{{2, 2}}}},
{`(a+|b)+`, nil, `ab`, []Match{[]Group{{0, 2}, {1, 2}}}},
{`(a+|b){1,}`, nil, `ab`, []Match{[]Group{{0, 2}, {1, 2}}}},
{`(a+|b)?`, nil, `ab`, []Match{[]Group{{0, 1}, {0, 1}}, []Group{{1, 2}, {1, 2}}, []Group{{2, 2}}}},
{`(a+|b){0,1}`, nil, `ab`, []Match{[]Group{{0, 1}, {0, 1}}, []Group{{1, 2}, {1, 2}}, []Group{{2, 2}}}},
{`(a|b|c|d|e)f`, nil, `ef`, []Match{[]Group{{0, 2}, {0, 1}}}},
{`(ab|cd)e`, nil, `abcde`, []Match{[]Group{{2, 5}, {2, 4}}}},
{`^(ab|cd)e`, nil, `abcde`, []Match{}},
{`(abc|)ef`, nil, `abcdef`, []Match{[]Group{{4, 6}, {4, 4}}}},
{`(a|b)c*d`, nil, `abcd`, []Match{[]Group{{1, 4}, {1, 2}}}},
{`(ab|ab*)bc`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 1}}}},
{`a([bc]*)c*`, nil, `abc`, []Match{[]Group{{0, 3}, {1, 3}}}},
{`a([bc]*)(c*d)`, nil, `abcd`, []Match{[]Group{{0, 4}, {1, 3}, {3, 4}}}},
{`a([bc]+)(c*d)`, nil, `abcd`, []Match{[]Group{{0, 4}, {1, 3}, {3, 4}}}},
{`a([bc]*)(c+d)`, nil, `abcd`, []Match{[]Group{{0, 4}, {1, 2}, {2, 4}}}},
{`(ab|a)b*c`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 2}}}},
{`((a)(b)c)(d)`, nil, `abcd`, []Match{[]Group{{0, 4}, {0, 3}, {0, 1}, {1, 2}, {3, 4}}}},
{`^a(bc+|b[eh])g|.h$`, nil, `abh`, []Match{[]Group{{1, 3}}}},
{`(bc+d$|ef*g.|h?i(j|k))`, nil, `effgz`, []Match{[]Group{{0, 5}, {0, 5}}}},
{`(bc+d$|ef*g.|h?i(j|k))`, nil, `ij`, []Match{[]Group{{0, 2}, {0, 2}, {1, 2}}}},
{`(bc+d$|ef*g.|h?i(j|k))`, nil, `effg`, []Match{}},
{`(bc+d$|ef*g.|h?i(j|k))`, nil, `bcdd`, []Match{}},
{`(bc+d$|ef*g.|h?i(j|k))`, nil, `reffgz`, []Match{[]Group{{1, 6}, {1, 6}}}},
{`(((((((((a)))))))))`, nil, `a`, []Match{[]Group{{0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}},
{`(((((((((a)))))))))\41`, nil, `a!`, []Match{[]Group{{0, 2}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}},
{`(.*)c(.*)`, nil, `abcde`, []Match{[]Group{{0, 5}, {0, 2}, {3, 5}}}},
{`\((.*), (.*)\)`, nil, `(a, b)`, []Match{[]Group{{0, 6}, {1, 2}, {4, 5}}}},
// At this point, the python test suite has a bunch
// of backreference tests. Since my engine doesn't
// implement backreferences, I've skipped those tests.
{`(a)(b)c|ab`, nil, `ab`, []Match{[]Group{{0, 2}}}},
{`(a)+x`, nil, `aaax`, []Match{[]Group{{0, 4}, {2, 3}}}},
{`([ac])+x`, nil, `aacx`, []Match{[]Group{{0, 4}, {2, 3}}}},
{`([^/]*/)*sub1/`, nil, `d:msgs/tdir/sub1/trial/away.cpp`, []Match{[]Group{{0, 17}, {7, 12}}}},
{`([^.]*)\.([^:]*):[T ]+(.*)`, nil, `track1.title:TBlah blah blah`, []Match{[]Group{{0, 28}, {0, 6}, {7, 12}, {14, 28}}}},
{`([^N]*N)+`, nil, `abNNxyzN`, []Match{[]Group{{0, 8}, {4, 8}}}},
{`([^N]*N)+`, nil, `abNNxyz`, []Match{[]Group{{0, 4}, {3, 4}}}},
{`([abc]*)x`, nil, `abcx`, []Match{[]Group{{0, 4}, {0, 3}}}},
{`([abc]*)x`, nil, `abc`, []Match{}},
{`([xyz]*)x`, nil, `abcx`, []Match{[]Group{{3, 4}, {3, 3}}}},
{`(a)+b|aac`, nil, `aac`, []Match{[]Group{{0, 3}}}},
{`([abc])*d`, nil, `abbbcd`, []Match{[]Group{{0, 6}, {4, 5}}}},
{`([abc])*bcd`, nil, `abcd`, []Match{[]Group{{0, 4}, {0, 1}}}},
{`^(ab|cd)e`, nil, `abcde`, []Match{}},
// Case-insensitive tests
{`(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)(l)\071`, []ReFlag{RE_CASE_INSENSITIVE}, `ABCDEFGHIJKL9`, []Match{[]Group{{0, 13}, {0, 1}, {1, 2}, {2, 3}, {3, 4}, {4, 5}, {5, 6}, {6, 7}, {7, 8}, {8, 9}, {9, 10}, {10, 11}, {11, 12}}}},
{`()ef`, []ReFlag{RE_CASE_INSENSITIVE}, `DEF`, []Match{[]Group{{1, 3}, {1, 1}}}},
{`(?:)ef`, []ReFlag{RE_CASE_INSENSITIVE}, `DEF`, []Match{[]Group{{1, 3}}}},
{`(?:)`, []ReFlag{RE_CASE_INSENSITIVE}, `DEF`, []Match{[]Group{{0, 0}}, []Group{{1, 1}}, []Group{{2, 2}}, []Group{{3, 3}}}},
{`((a))`, []ReFlag{RE_CASE_INSENSITIVE}, `ABC`, []Match{[]Group{{0, 1}, {0, 1}, {0, 1}}}},
{`(a)b(c)`, []ReFlag{RE_CASE_INSENSITIVE}, `ABC`, []Match{[]Group{{0, 3}, {0, 1}, {2, 3}}}},
{`(a+|b)*`, []ReFlag{RE_CASE_INSENSITIVE}, `AB`, []Match{[]Group{{0, 2}, {1, 2}}, []Group{{2, 2}}}},
{`(a+|b){0,}`, []ReFlag{RE_CASE_INSENSITIVE}, `AB`, []Match{[]Group{{0, 2}, {1, 2}}, []Group{{2, 2}}}},
{`(a+|b)+`, []ReFlag{RE_CASE_INSENSITIVE}, `AB`, []Match{[]Group{{0, 2}, {1, 2}}}},
{`(a+|b){1,}`, []ReFlag{RE_CASE_INSENSITIVE}, `AB`, []Match{[]Group{{0, 2}, {1, 2}}}},
{`(a+|b)?`, []ReFlag{RE_CASE_INSENSITIVE}, `AB`, []Match{[]Group{{0, 1}, {0, 1}}, []Group{{1, 2}, {1, 2}}, []Group{{2, 2}}}},
{`(a+|b){0,1}`, []ReFlag{RE_CASE_INSENSITIVE}, `AB`, []Match{[]Group{{0, 1}, {0, 1}}, []Group{{1, 2}, {1, 2}}, []Group{{2, 2}}}},
{`(a|b|c|d|e)f`, []ReFlag{RE_CASE_INSENSITIVE}, `EF`, []Match{[]Group{{0, 2}, {0, 1}}}},
{`(ab|cd)e`, []ReFlag{RE_CASE_INSENSITIVE}, `ABCDE`, []Match{[]Group{{2, 5}, {2, 4}}}},
{`^(ab|cd)e`, []ReFlag{RE_CASE_INSENSITIVE}, `ABCDE`, []Match{}},
{`(abc|)ef`, []ReFlag{RE_CASE_INSENSITIVE}, `ABCDEF`, []Match{[]Group{{4, 6}, {4, 4}}}},
{`(a|b)c*d`, []ReFlag{RE_CASE_INSENSITIVE}, `ABCD`, []Match{[]Group{{1, 4}, {1, 2}}}},
{`(ab|ab*)bc`, []ReFlag{RE_CASE_INSENSITIVE}, `ABC`, []Match{[]Group{{0, 3}, {0, 1}}}},
{`a([bc]*)c*`, []ReFlag{RE_CASE_INSENSITIVE}, `ABC`, []Match{[]Group{{0, 3}, {1, 3}}}},
{`a([bc]*)(c*d)`, []ReFlag{RE_CASE_INSENSITIVE}, `ABCD`, []Match{[]Group{{0, 4}, {1, 3}, {3, 4}}}},
{`a([bc]+)(c*d)`, []ReFlag{RE_CASE_INSENSITIVE}, `ABCD`, []Match{[]Group{{0, 4}, {1, 3}, {3, 4}}}},
{`a([bc]*)(c+d)`, []ReFlag{RE_CASE_INSENSITIVE}, `ABCD`, []Match{[]Group{{0, 4}, {1, 2}, {2, 4}}}},
{`(ab|a)b*c`, []ReFlag{RE_CASE_INSENSITIVE}, `ABC`, []Match{[]Group{{0, 3}, {0, 2}}}},
{`((a)(b)c)(d)`, []ReFlag{RE_CASE_INSENSITIVE}, `ABCD`, []Match{[]Group{{0, 4}, {0, 3}, {0, 1}, {1, 2}, {3, 4}}}},
{`^a(bc+|b[eh])g|.h$`, []ReFlag{RE_CASE_INSENSITIVE}, `ABH`, []Match{[]Group{{1, 3}}}},
{`(bc+d$|ef*g.|h?i(j|k))`, []ReFlag{RE_CASE_INSENSITIVE}, `EFFGZ`, []Match{[]Group{{0, 5}, {0, 5}}}},
{`(bc+d$|ef*g.|h?i(j|k))`, []ReFlag{RE_CASE_INSENSITIVE}, `IJ`, []Match{[]Group{{0, 2}, {0, 2}, {1, 2}}}},
{`(bc+d$|ef*g.|h?i(j|k))`, []ReFlag{RE_CASE_INSENSITIVE}, `EFFG`, []Match{}},
{`(bc+d$|ef*g.|h?i(j|k))`, []ReFlag{RE_CASE_INSENSITIVE}, `BCDD`, []Match{}},
{`(bc+d$|ef*g.|h?i(j|k))`, []ReFlag{RE_CASE_INSENSITIVE}, `reffgz`, []Match{[]Group{{1, 6}, {1, 6}}}},
{`(((((((((a)))))))))`, []ReFlag{RE_CASE_INSENSITIVE}, `A`, []Match{[]Group{{0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}},
{`(((((((((a)))))))))\41`, []ReFlag{RE_CASE_INSENSITIVE}, `A!`, []Match{[]Group{{0, 2}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}},
{`(.*)c(.*)`, []ReFlag{RE_CASE_INSENSITIVE}, `ABCDE`, []Match{[]Group{{0, 5}, {0, 2}, {3, 5}}}},
{`\((.*), (.*)\)`, []ReFlag{RE_CASE_INSENSITIVE}, `(A, B)`, []Match{[]Group{{0, 6}, {1, 2}, {4, 5}}}},
{`(a)(b)c|ab`, []ReFlag{RE_CASE_INSENSITIVE}, `AB`, []Match{[]Group{{0, 2}}}},
{`(a)+x`, []ReFlag{RE_CASE_INSENSITIVE}, `AAAX`, []Match{[]Group{{0, 4}, {2, 3}}}},
{`([ac])+x`, []ReFlag{RE_CASE_INSENSITIVE}, `AACX`, []Match{[]Group{{0, 4}, {2, 3}}}},
{`([^/]*/)*sub1/`, []ReFlag{RE_CASE_INSENSITIVE}, `D:MSGS/TDIR/SUB1/TRIAL/AWAY.CPP`, []Match{[]Group{{0, 17}, {7, 12}}}},
{`([^.]*)\.([^:]*):[T ]+(.*)`, []ReFlag{RE_CASE_INSENSITIVE}, `TRACK1.TITLE:TBLAH BLAH BLAH`, []Match{[]Group{{0, 28}, {0, 6}, {7, 12}, {14, 28}}}},
{`([^N]*N)+`, []ReFlag{RE_CASE_INSENSITIVE}, `ABNNXYZN`, []Match{[]Group{{0, 8}, {4, 8}}}},
{`([^N]*N)+`, []ReFlag{RE_CASE_INSENSITIVE}, `ABNNXYZ`, []Match{[]Group{{0, 4}, {3, 4}}}},
{`([abc]*)x`, []ReFlag{RE_CASE_INSENSITIVE}, `ABCX`, []Match{[]Group{{0, 4}, {0, 3}}}},
{`([abc]*)x`, []ReFlag{RE_CASE_INSENSITIVE}, `ABC`, []Match{}},
{`([xyz]*)x`, []ReFlag{RE_CASE_INSENSITIVE}, `ABCX`, []Match{[]Group{{3, 4}, {3, 3}}}},
{`(a)+b|aac`, []ReFlag{RE_CASE_INSENSITIVE}, `AAC`, []Match{[]Group{{0, 3}}}},
{`([abc])*d`, []ReFlag{RE_CASE_INSENSITIVE}, `ABBBCD`, []Match{[]Group{{0, 6}, {4, 5}}}},
{`([abc])*bcd`, []ReFlag{RE_CASE_INSENSITIVE}, `ABCD`, []Match{[]Group{{0, 4}, {0, 1}}}},
{`^(ab|cd)e`, []ReFlag{RE_CASE_INSENSITIVE}, `ABCDE`, []Match{}},
{`(?:(?:(?:(?:(?:(?:a))))))`, nil, `a`, []Match{[]Group{{0, 1}}}},
{`a(?:b|c|d)(.)`, nil, `ace`, []Match{[]Group{{0, 3}, {2, 3}}}},
{`a(?:b|c|d)*(.)`, nil, `ace`, []Match{[]Group{{0, 3}, {2, 3}}}},
{`a(?:b|c|d)+(.)`, nil, `ace`, []Match{[]Group{{0, 3}, {2, 3}}}},
{`a(?:b|(c|e){1,2}?|d)+(.)`, nil, `ace`, []Match{[]Group{{0, 3}, {1, 2}, {2, 3}}}},
{`(?<!-):(.*)(?<!-):`, nil, `a:bc-:de:f`, []Match{[]Group{{1, 9}, {2, 8}}}},
{`(?<!\\):(.*)(?<!\\):`, nil, `a:bc\:de:f`, []Match{[]Group{{1, 9}, {2, 8}}}},
{`(?<!\?)'(.*)(?<!\?)'`, nil, `a'bc?'de'f`, []Match{[]Group{{1, 9}, {2, 8}}}},
{`([\s]*)([\S]*)([\s]*)`, nil, ` testing!1972`, []Match{[]Group{{0, 13}, {0, 1}, {1, 13}, {13, 13}}, []Group{{13, 13}, {13, 13}, {13, 13}, {13, 13}}}},
{`(\s*)(\S*)(\s*)`, nil, ` testing!1972`, []Match{[]Group{{0, 13}, {0, 1}, {1, 13}, {13, 13}}, []Group{{13, 13}, {13, 13}, {13, 13}, {13, 13}}}},
{`(([a-z]+):)?([a-z]+)$`, nil, `smil`, []Match{[]Group{{0, 4}, {-1, -1}, {-1, -1}, {0, 4}}}},
{`(x?)?`, nil, `x`, []Match{[]Group{{0, 1}, {0, 1}}, []Group{{1, 1}, {1, 1}}}},
{`"(?:\\"|[^"])*"`, nil, `"\""`, []Match{[]Group{{0, 4}}}},
{`^((a)c)?(ab)$`, nil, `ab`, []Match{[]Group{{0, 2}, {-1, -1}, {-1, -1}, {0, 2}}}},
{`^([ab]*)(?=(b)?)c`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 2}}}},
{`^([ab]*)(?!(b))c`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 2}}}},
{`^([ab]*)(?<!(a))c`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 2}}}},
{`(<389-400>)`, nil, `391`, []Match{[]Group{{0, 3}, {0, 3}}}},
// // Tests from https://wiki.haskell.org/Regex_Posix
// {`(()|.)(b)`, nil, `ab`, []Match{[]Group{{0, 2}, {0, 1}, {-1, -1}, {1, 2}}}},
// {`(()|[ab])(b)`, nil, `ab`, []Match{[]Group{{0, 2}, {0, 1}, {-1, -1}, {1, 2}}}},
// {`(()|[ab])+b`, nil, `aaab`, []Match{[]Group{{0, 4}, {2, 3}, {-1, -1}}}},
// {`([ab]|())+b`, nil, `aaab`, []Match{[]Group{{0, 4}, {2, 3}, {-1, -1}}}},
// // Bug - this should give {0,6},{3,6},{-1,-1} but it gives {0,6},{3,6},{3,3}
// // {`yyyyyy`, nil, `(yyy|(x?)){2,4}`, []Match{[]Group{{0, 6}, {3, 6}, {-1, -1}}, []Group{{6, 6}, {6, 6}, {6, 6}}}},
// {`(a|ab|c|bcd)*(d*)`, nil, `ababcd`, []Match{[]Group{{0, 6}, {3, 6}, {6, 6}}, []Group{{6, 6}, {6, 6}, {6, 6}}}},
// // Bug - this should give {0,3},{0,3},{0,0},{0,3},{3,3} but it gives {0,3},{0,2},{0,1},{1,2},{2,3}
// // {`((a*)(b|abc))(c*)`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 3}, {0, 0}, {0, 3}, {3, 3}}}},
}
// TestFind runs Find for every entry of reTests and compares the returned
// group with the first expected group.
//
// A compile error is tolerated only when the expected result is nil. An error
// from Find is treated as "no match" and is accepted only when no match is
// expected.
func TestFind(t *testing.T) {
	for _, test := range reTests {
		t.Run(test.re+" "+test.str, func(t *testing.T) {
			regComp, err := Compile(test.re, test.flags...)
			if err != nil {
				if test.result != nil {
					// Fail this subtest only; a panic would abort the whole test binary.
					t.Fatal(fmt.Errorf("Test Error: %v", err))
				}
				return
			}

			groupIndex, err := regComp.Find(test.str)
			if err != nil { // No matches found
				if len(test.result) != 0 {
					t.Errorf("Wanted %v Got no matches\n", test.result)
				}
				return
			}

			// Guard the test.result[0] access below: a match was found although
			// none was expected.
			if len(test.result) == 0 {
				t.Errorf("Wanted no match Got %v\n", groupIndex)
				return
			}
			if groupIndex != test.result[0] {
				t.Errorf("Wanted %v Got %v\n", test.result, groupIndex)
			}
		})
	}
}
// TestFindAll runs FindAll for every entry of reTests and requires the
// returned groups to equal the expected ones element by element.
//
// A compile error is tolerated only when the expected result is nil.
func TestFindAll(t *testing.T) {
	for _, test := range reTests {
		t.Run(test.re+" "+test.str, func(t *testing.T) {
			regComp, err := Compile(test.re, test.flags...)
			if err != nil {
				if test.result != nil {
					// Fail this subtest only; a panic would abort the whole test binary.
					t.Fatal(fmt.Errorf("Test Error: %v", err))
				}
				return
			}

			matchIndices := regComp.FindAll(test.str)
			if !slices.Equal(test.result, matchIndices) {
				t.Errorf("Wanted %v Got %v\n", test.result, matchIndices)
			}
		})
	}
}
// TestFindString runs FindString for every entry of reTests and compares the
// returned text with the slice of the input covered by the first expected
// group. When no match is expected, FindString must return "".
//
// A compile error is tolerated only when the expected result is nil.
func TestFindString(t *testing.T) {
	for _, test := range reTests {
		t.Run(test.re+" "+test.str, func(t *testing.T) {
			regComp, err := Compile(test.re, test.flags...)
			if err != nil {
				if test.result != nil {
					// Fail this subtest only; a panic would abort the whole test binary.
					t.Fatalf("Test Error: %v", err)
				}
				return
			}

			foundString := regComp.FindString(test.str)
			if len(test.result) == 0 {
				if foundString != "" {
					t.Errorf("Wanted no match got %v\n", foundString)
				}
				return
			}

			first := test.result[0]
			expectedString := test.str[first.StartIdx:first.EndIdx]
			if foundString != expectedString {
				t.Errorf("Wanted %v Got %v\n", expectedString, foundString)
			}
		})
	}
}
// TestFindAllString runs FindAllString for every entry of reTests. The number
// of returned strings must equal the number of expected groups, and each
// string must equal the slice of the input covered by the matching group.
//
// A compile error is tolerated only when the expected result is nil.
func TestFindAllString(t *testing.T) {
	for _, test := range reTests {
		t.Run(test.re+" "+test.str, func(t *testing.T) {
			regComp, err := Compile(test.re, test.flags...)
			if err != nil {
				if test.result != nil {
					// Fail this subtest only; a panic would abort the whole test binary.
					t.Fatalf("Test Error: %v", err)
				}
				return
			}

			foundStrings := regComp.FindAllString(test.str)
			if len(test.result) != len(foundStrings) {
				t.Errorf("Differing number of matches: Wanted %v matches Got %v matches\n", len(test.result), len(foundStrings))
				return
			}
			for idx, group := range test.result {
				groupStr := test.str[group.StartIdx:group.EndIdx]
				if groupStr != foundStrings[idx] {
					t.Errorf("Wanted %v Got %v\n", groupStr, foundStrings[idx])
				}
			}
		})
	}
}
// TestFindSubmatch runs FindSubmatch for every entry of groupTests and
// compares the returned groups with the first expected Match.
//
// A valid returned group must equal the expected group at the same index. An
// invalid returned group is an error only when the expectation has a valid
// group at that index, so trailing unmatched groups may be omitted from the
// expected Match.
func TestFindSubmatch(t *testing.T) {
	for _, test := range groupTests {
		t.Run(test.re+" "+test.str, func(t *testing.T) {
			regComp, err := Compile(test.re, test.flags...)
			if err != nil {
				if test.result != nil {
					// Fail this subtest only; a panic would abort the whole test binary.
					t.Fatalf("Test Error: %v", err)
				}
				// Nothing usable was compiled, so there is nothing left to run.
				return
			}

			match, err := regComp.FindSubmatch(test.str)
			if err != nil {
				if len(test.result) != 0 {
					t.Errorf("Wanted %v got no match\n", test.result[0])
				}
				return
			}
			if len(test.result) == 0 {
				// Stop here: there is no test.result[0] to compare against.
				t.Errorf("Wanted no match got %v\n", match)
				return
			}

			want := test.result[0]
			for i := range match {
				if match[i].IsValid() {
					// A valid group beyond the expected ones is a mismatch, not a panic.
					if i >= len(want) || want[i] != match[i] {
						t.Errorf("Wanted %v Got %v\n", want, match)
					}
				} else if i < len(want) && want[i].IsValid() {
					// Bound by the number of expected groups, not the number of matches.
					t.Errorf("Wanted %v Got %v\n", want, match)
				}
			}
		})
	}
}
// TestFindStringSubmatch runs FindStringSubmatch for every entry of
// groupTests and compares the returned strings with the text covered by the
// groups of the first expected Match.
//
// An empty returned string is an error only when the expectation has a
// non-empty string at that index; a non-empty returned string must equal the
// expected string at the same index.
func TestFindStringSubmatch(t *testing.T) {
	for _, test := range groupTests {
		t.Run(test.re+" "+test.str, func(t *testing.T) {
			regComp, err := Compile(test.re, test.flags...)
			if err != nil {
				if test.result != nil {
					// Fail this subtest only; a panic would abort the whole test binary.
					t.Fatalf("Test Error: %v", err)
				}
				// Nothing usable was compiled, so there is nothing left to run.
				return
			}

			// groupText returns the input text covered by g, or "" for an invalid group.
			groupText := func(g Group) string {
				if g.IsValid() {
					return test.str[g.StartIdx:g.EndIdx]
				}
				return ""
			}

			matchStr := regComp.FindStringSubmatch(test.str)
			if matchStr == nil {
				if len(test.result) != 0 {
					t.Errorf("Wanted %v got no match\n", funcMap(test.result[0], groupText))
				}
				return
			}
			if len(test.result) == 0 {
				t.Errorf("Wanted no match got %v\n", matchStr)
				return
			}

			expectedStr := funcMap(test.result[0], groupText)
			for i, groupStr := range matchStr {
				if groupStr == "" {
					if i < len(expectedStr) && expectedStr[i] != "" {
						t.Errorf("Wanted %v Got %v\n", expectedStr, matchStr)
					}
				} else if i >= len(expectedStr) || expectedStr[i] != groupStr {
					// A non-empty group beyond the expected ones is a mismatch, not a panic.
					t.Errorf("Wanted %v Got %v\n", expectedStr, matchStr)
				}
			}
		})
	}
}
// TestFindAllStringSubmatch runs FindAllStringSubmatch for every entry of
// groupTests and compares every returned match, group by group, with the text
// covered by the expected groups.
//
// The number of returned matches must equal the number of expected matches.
// Within a match, an empty returned string is an error only when the
// expectation has a non-empty string at that index; a non-empty returned
// string must equal the expected string at the same index.
func TestFindAllStringSubmatch(t *testing.T) {
	for _, test := range groupTests {
		t.Run(test.re+" "+test.str, func(t *testing.T) {
			regComp, err := Compile(test.re, test.flags...)
			if err != nil {
				if test.result != nil {
					// Fail this subtest only; a panic would abort the whole test binary.
					t.Fatalf("Test Error: %v", err)
				}
				// Nothing usable was compiled, so there is nothing left to run.
				return
			}

			// matchText converts an expected Match into the strings its groups cover,
			// using "" for an invalid group.
			matchText := func(m Match) []string {
				return funcMap(m, func(g Group) string {
					if g.IsValid() {
						return test.str[g.StartIdx:g.EndIdx]
					}
					return ""
				})
			}

			matchStrs := regComp.FindAllStringSubmatch(test.str)
			if matchStrs == nil {
				if len(test.result) != 0 {
					t.Errorf("Wanted %v got no match\n", funcMap(test.result, matchText))
				}
				return
			}
			if len(test.result) == 0 {
				t.Errorf("Wanted no match got %v\n", matchStrs)
				return
			}

			expectedStrs := funcMap(test.result, matchText)
			// Without this check missing matches pass silently and extra matches
			// index past the end of expectedStrs.
			if len(matchStrs) != len(expectedStrs) {
				t.Errorf("Differing number of matches: Wanted %v Got %v\n", expectedStrs, matchStrs)
				return
			}
			for i, matchStr := range matchStrs {
				want := expectedStrs[i]
				for j, groupStr := range matchStr {
					if groupStr == "" {
						if j < len(want) && want[j] != "" {
							t.Errorf("Wanted %v Got %v\n", expectedStrs, matchStrs)
						}
					} else if j >= len(want) || want[j] != groupStr {
						t.Errorf("Wanted %v Got %v\n", expectedStrs, matchStrs)
					}
				}
			}
		})
	}
}
// TestFindAllSubmatch runs FindAllSubmatch for every entry of groupTests and
// compares every returned match, group by group, with the expected matches.
//
// The number of returned matches must equal the number of expected matches.
// A valid returned group must equal the expected group at the same index. An
// invalid returned group is an error only when the expectation has a valid
// group at that index.
func TestFindAllSubmatch(t *testing.T) {
	for _, test := range groupTests {
		t.Run(test.re+" "+test.str, func(t *testing.T) {
			regComp, err := Compile(test.re, test.flags...)
			if err != nil {
				if test.result != nil {
					// Fail this subtest only; a panic would abort the whole test binary.
					t.Fatalf("Test Error: %v", err)
				}
				// Nothing usable was compiled, so there is nothing left to run.
				return
			}

			matchIndices := regComp.FindAllSubmatch(test.str)
			// Without this check an empty result would pass every case, and an
			// extra match would index past the end of test.result.
			if len(matchIndices) != len(test.result) {
				t.Errorf("Differing number of matches: Wanted %v Got %v\n", test.result, matchIndices)
				return
			}
			for i := range matchIndices {
				want := test.result[i]
				for j := range matchIndices[i] {
					if matchIndices[i][j].IsValid() {
						if j >= len(want) || want[j] != matchIndices[i][j] {
							t.Errorf("Wanted %v Got %v\n", test.result, matchIndices)
						}
					} else if j < len(want) && want[j].IsValid() {
						t.Errorf("Wanted %v Got %v\n", test.result, matchIndices)
					}
				}
			}
		})
	}
}

View File

@@ -1,670 +0,0 @@
#!/usr/bin/env python3
# -*- mode: python -*-
# Re test suite and benchmark suite v1.5
# Outcome codes that a test-suite entry can expect for its pattern.
SUCCEED, FAIL, SYNTAX_ERROR = 0, 1, 2
# Benchmark suite (needs expansion)
#
# These entries measure speed only; they do not check correctness.
# Each tuple is (regex pattern, string to match it against). The
# benchmarking code embeds the second string inside several sizes of
# padding to see how matching performs on large strings.
benchmarks = [
    # test common prefix
    ("Python|Perl", "Perl"),  # Alternation
    ("(Python|Perl)", "Perl"),  # Grouped alternation
    ("Python|Perl|Tcl", "Perl"),  # Alternation
    ("(Python|Perl|Tcl)", "Perl"),  # Grouped alternation
    (r"(Python)\1", "PythonPython"),  # Backreference
    ("([0a-z][a-z0-9]*,)+", "a5,b7,c9,"),  # Disable the fastmap optimization
    ("([a-z][a-z0-9]*,)+", "a5,b7,c9,"),  # A few sets
    ("Python", "Python"),  # Simple text literal
    (".*Python", "Python"),  # Bad text literal
    (".*Python.*", "Python"),  # Worse text literal
    (".*(Python)", "Python"),  # Bad text literal with grouping
]
# Test suite (for verifying correctness)
#
# The test suite is a list of 5- or 3-tuples. The 5 parts of a
# complete tuple are:
# element 0: a string containing the pattern
# 1: the string to match against the pattern
# 2: the expected result (SUCCEED, FAIL, SYNTAX_ERROR)
# 3: a string that will be eval()'ed to produce a test string.
# This is an arbitrary Python expression; the available
# variables are "found" (the whole match), and "g1", "g2", ...
# up to "g99" contain the contents of each group, or the
# string 'None' if the group wasn't given a value, or the
# string 'Error' if the group index was out of range;
# also "groups", the return value of m.group() (a tuple).
# 4: The expected result of evaluating the expression.
# If the two don't match, an error is reported.
#
# If the regex isn't expected to work, the latter two elements can be omitted.
tests = [
# Test ?P< and ?P= extensions
('(?P<foo_123', '', SYNTAX_ERROR), # Unterminated group identifier
('(?P<1>a)', '', SYNTAX_ERROR), # Begins with a digit
('(?P<!>a)', '', SYNTAX_ERROR), # Begins with an illegal char
('(?P<foo!>a)', '', SYNTAX_ERROR), # Begins with an illegal char
# Same tests, for the ?P= form
('(?P<foo_123>a)(?P=foo_123', 'aa', SYNTAX_ERROR),
('(?P<foo_123>a)(?P=1)', 'aa', SYNTAX_ERROR),
('(?P<foo_123>a)(?P=!)', 'aa', SYNTAX_ERROR),
('(?P<foo_123>a)(?P=foo_124', 'aa', SYNTAX_ERROR), # Backref to undefined group
('(?P<foo_123>a)', 'a', SUCCEED, 'g1', 'a'),
('(?P<foo_123>a)(?P=foo_123)', 'aa', SUCCEED, 'g1', 'a'),
# Test octal escapes
('\\1', 'a', SYNTAX_ERROR), # Backreference
('[\\1]', '\1', SUCCEED, 'found', '\1'), # Character
('\\09', chr(0) + '9', SUCCEED, 'found', chr(0) + '9'),
('\\141', 'a', SUCCEED, 'found', 'a'),
('(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)(l)\\119', 'abcdefghijklk9', SUCCEED, 'found+"-"+g11', 'abcdefghijklk9-k'),
# Test \0 is handled everywhere
(r'\0', '\0', SUCCEED, 'found', '\0'),
(r'[\0a]', '\0', SUCCEED, 'found', '\0'),
(r'[a\0]', '\0', SUCCEED, 'found', '\0'),
(r'[^a\0]', '\0', FAIL),
# Test various letter escapes
(r'\a[\b]\f\n\r\t\v', '\a\b\f\n\r\t\v', SUCCEED, 'found', '\a\b\f\n\r\t\v'),
(r'[\a][\b][\f][\n][\r][\t][\v]', '\a\b\f\n\r\t\v', SUCCEED, 'found', '\a\b\f\n\r\t\v'),
# NOTE: not an error under PCRE/PRE:
# (r'\u', '', SYNTAX_ERROR), # A Perl escape
(r'\c\e\g\h\i\j\k\m\o\p\q\y\z', 'ceghijkmopqyz', SUCCEED, 'found', 'ceghijkmopqyz'),
(r'\xff', '\377', SUCCEED, 'found', chr(255)),
# new \x semantics
(r'\x00ffffffffffffff', '\377', FAIL, 'found', chr(255)),
(r'\x00f', '\017', FAIL, 'found', chr(15)),
(r'\x00fe', '\376', FAIL, 'found', chr(254)),
# (r'\x00ffffffffffffff', '\377', SUCCEED, 'found', chr(255)),
# (r'\x00f', '\017', SUCCEED, 'found', chr(15)),
# (r'\x00fe', '\376', SUCCEED, 'found', chr(254)),
(r"^\w+=(\\[\000-\277]|[^\n\\])*", "SRC=eval.c g.c blah blah blah \\\\\n\tapes.c",
SUCCEED, 'found', "SRC=eval.c g.c blah blah blah \\\\"),
# Test that . only matches \n in DOTALL mode
('a.b', 'acb', SUCCEED, 'found', 'acb'),
('a.b', 'a\nb', FAIL),
('a.*b', 'acc\nccb', FAIL),
('a.{4,5}b', 'acc\nccb', FAIL),
('a.b', 'a\rb', SUCCEED, 'found', 'a\rb'),
('a.b(?s)', 'a\nb', SUCCEED, 'found', 'a\nb'),
('a.*(?s)b', 'acc\nccb', SUCCEED, 'found', 'acc\nccb'),
('(?s)a.{4,5}b', 'acc\nccb', SUCCEED, 'found', 'acc\nccb'),
('(?s)a.b', 'a\nb', SUCCEED, 'found', 'a\nb'),
(')', '', SYNTAX_ERROR), # Unmatched right bracket
('', '', SUCCEED, 'found', ''), # Empty pattern
('abc', 'abc', SUCCEED, 'found', 'abc'),
('abc', 'xbc', FAIL),
('abc', 'axc', FAIL),
('abc', 'abx', FAIL),
('abc', 'xabcy', SUCCEED, 'found', 'abc'),
('abc', 'ababc', SUCCEED, 'found', 'abc'),
('ab*c', 'abc', SUCCEED, 'found', 'abc'),
('ab*bc', 'abc', SUCCEED, 'found', 'abc'),
('ab*bc', 'abbc', SUCCEED, 'found', 'abbc'),
('ab*bc', 'abbbbc', SUCCEED, 'found', 'abbbbc'),
('ab+bc', 'abbc', SUCCEED, 'found', 'abbc'),
('ab+bc', 'abc', FAIL),
('ab+bc', 'abq', FAIL),
('ab+bc', 'abbbbc', SUCCEED, 'found', 'abbbbc'),
('ab?bc', 'abbc', SUCCEED, 'found', 'abbc'),
('ab?bc', 'abc', SUCCEED, 'found', 'abc'),
('ab?bc', 'abbbbc', FAIL),
('ab?c', 'abc', SUCCEED, 'found', 'abc'),
('^abc$', 'abc', SUCCEED, 'found', 'abc'),
('^abc$', 'abcc', FAIL),
('^abc', 'abcc', SUCCEED, 'found', 'abc'),
('^abc$', 'aabc', FAIL),
('abc$', 'aabc', SUCCEED, 'found', 'abc'),
('^', 'abc', SUCCEED, 'found+"-"', '-'),
('$', 'abc', SUCCEED, 'found+"-"', '-'),
('a.c', 'abc', SUCCEED, 'found', 'abc'),
('a.c', 'axc', SUCCEED, 'found', 'axc'),
('a.*c', 'axyzc', SUCCEED, 'found', 'axyzc'),
('a.*c', 'axyzd', FAIL),
('a[bc]d', 'abc', FAIL),
('a[bc]d', 'abd', SUCCEED, 'found', 'abd'),
('a[b-d]e', 'abd', FAIL),
('a[b-d]e', 'ace', SUCCEED, 'found', 'ace'),
('a[b-d]', 'aac', SUCCEED, 'found', 'ac'),
('a[-b]', 'a-', SUCCEED, 'found', 'a-'),
('a[\\-b]', 'a-', SUCCEED, 'found', 'a-'),
# NOTE: not an error under PCRE/PRE:
# ('a[b-]', 'a-', SYNTAX_ERROR),
('a[]b', '-', SYNTAX_ERROR),
('a[', '-', SYNTAX_ERROR),
('a\\', '-', SYNTAX_ERROR),
('abc)', '-', SYNTAX_ERROR),
('(abc', '-', SYNTAX_ERROR),
('a]', 'a]', SUCCEED, 'found', 'a]'),
('a[]]b', 'a]b', SUCCEED, 'found', 'a]b'),
('a[\]]b', 'a]b', SUCCEED, 'found', 'a]b'),
('a[^bc]d', 'aed', SUCCEED, 'found', 'aed'),
('a[^bc]d', 'abd', FAIL),
('a[^-b]c', 'adc', SUCCEED, 'found', 'adc'),
('a[^-b]c', 'a-c', FAIL),
('a[^]b]c', 'a]c', FAIL),
('a[^]b]c', 'adc', SUCCEED, 'found', 'adc'),
('\\ba\\b', 'a-', SUCCEED, '"-"', '-'),
('\\ba\\b', '-a', SUCCEED, '"-"', '-'),
('\\ba\\b', '-a-', SUCCEED, '"-"', '-'),
('\\by\\b', 'xy', FAIL),
('\\by\\b', 'yz', FAIL),
('\\by\\b', 'xyz', FAIL),
('x\\b', 'xyz', FAIL),
('x\\B', 'xyz', SUCCEED, '"-"', '-'),
('\\Bz', 'xyz', SUCCEED, '"-"', '-'),
('z\\B', 'xyz', FAIL),
('\\Bx', 'xyz', FAIL),
('\\Ba\\B', 'a-', FAIL, '"-"', '-'),
('\\Ba\\B', '-a', FAIL, '"-"', '-'),
('\\Ba\\B', '-a-', FAIL, '"-"', '-'),
('\\By\\B', 'xy', FAIL),
('\\By\\B', 'yz', FAIL),
('\\By\\b', 'xy', SUCCEED, '"-"', '-'),
('\\by\\B', 'yz', SUCCEED, '"-"', '-'),
('\\By\\B', 'xyz', SUCCEED, '"-"', '-'),
('ab|cd', 'abc', SUCCEED, 'found', 'ab'),
('ab|cd', 'abcd', SUCCEED, 'found', 'ab'),
('()ef', 'def', SUCCEED, 'found+"-"+g1', 'ef-'),
('$b', 'b', FAIL),
('a\\(b', 'a(b', SUCCEED, 'found+"-"+g1', 'a(b-Error'),
('a\\(*b', 'ab', SUCCEED, 'found', 'ab'),
('a\\(*b', 'a((b', SUCCEED, 'found', 'a((b'),
('a\\\\b', 'a\\b', SUCCEED, 'found', 'a\\b'),
('((a))', 'abc', SUCCEED, 'found+"-"+g1+"-"+g2', 'a-a-a'),
('(a)b(c)', 'abc', SUCCEED, 'found+"-"+g1+"-"+g2', 'abc-a-c'),
('a+b+c', 'aabbabc', SUCCEED, 'found', 'abc'),
('(a+|b)*', 'ab', SUCCEED, 'found+"-"+g1', 'ab-b'),
('(a+|b)+', 'ab', SUCCEED, 'found+"-"+g1', 'ab-b'),
('(a+|b)?', 'ab', SUCCEED, 'found+"-"+g1', 'a-a'),
(')(', '-', SYNTAX_ERROR),
('[^ab]*', 'cde', SUCCEED, 'found', 'cde'),
('abc', '', FAIL),
('a*', '', SUCCEED, 'found', ''),
('a|b|c|d|e', 'e', SUCCEED, 'found', 'e'),
('(a|b|c|d|e)f', 'ef', SUCCEED, 'found+"-"+g1', 'ef-e'),
('abcd*efg', 'abcdefg', SUCCEED, 'found', 'abcdefg'),
('ab*', 'xabyabbbz', SUCCEED, 'found', 'ab'),
('ab*', 'xayabbbz', SUCCEED, 'found', 'a'),
('(ab|cd)e', 'abcde', SUCCEED, 'found+"-"+g1', 'cde-cd'),
('[abhgefdc]ij', 'hij', SUCCEED, 'found', 'hij'),
('^(ab|cd)e', 'abcde', FAIL, 'xg1y', 'xy'),
('(abc|)ef', 'abcdef', SUCCEED, 'found+"-"+g1', 'ef-'),
('(a|b)c*d', 'abcd', SUCCEED, 'found+"-"+g1', 'bcd-b'),
('(ab|ab*)bc', 'abc', SUCCEED, 'found+"-"+g1', 'abc-a'),
('a([bc]*)c*', 'abc', SUCCEED, 'found+"-"+g1', 'abc-bc'),
('a([bc]*)(c*d)', 'abcd', SUCCEED, 'found+"-"+g1+"-"+g2', 'abcd-bc-d'),
('a([bc]+)(c*d)', 'abcd', SUCCEED, 'found+"-"+g1+"-"+g2', 'abcd-bc-d'),
('a([bc]*)(c+d)', 'abcd', SUCCEED, 'found+"-"+g1+"-"+g2', 'abcd-b-cd'),
('a[bcd]*dcdcde', 'adcdcde', SUCCEED, 'found', 'adcdcde'),
('a[bcd]+dcdcde', 'adcdcde', FAIL),
('(ab|a)b*c', 'abc', SUCCEED, 'found+"-"+g1', 'abc-ab'),
('((a)(b)c)(d)', 'abcd', SUCCEED, 'g1+"-"+g2+"-"+g3+"-"+g4', 'abc-a-b-d'),
('[a-zA-Z_][a-zA-Z0-9_]*', 'alpha', SUCCEED, 'found', 'alpha'),
('^a(bc+|b[eh])g|.h$', 'abh', SUCCEED, 'found+"-"+g1', 'bh-None'),
('(bc+d$|ef*g.|h?i(j|k))', 'effgz', SUCCEED, 'found+"-"+g1+"-"+g2', 'effgz-effgz-None'),
('(bc+d$|ef*g.|h?i(j|k))', 'ij', SUCCEED, 'found+"-"+g1+"-"+g2', 'ij-ij-j'),
('(bc+d$|ef*g.|h?i(j|k))', 'effg', FAIL),
('(bc+d$|ef*g.|h?i(j|k))', 'bcdd', FAIL),
('(bc+d$|ef*g.|h?i(j|k))', 'reffgz', SUCCEED, 'found+"-"+g1+"-"+g2', 'effgz-effgz-None'),
('(((((((((a)))))))))', 'a', SUCCEED, 'found', 'a'),
('multiple words of text', 'uh-uh', FAIL),
('multiple words', 'multiple words, yeah', SUCCEED, 'found', 'multiple words'),
('(.*)c(.*)', 'abcde', SUCCEED, 'found+"-"+g1+"-"+g2', 'abcde-ab-de'),
('\\((.*), (.*)\\)', '(a, b)', SUCCEED, 'g2+"-"+g1', 'b-a'),
('[k]', 'ab', FAIL),
('a[-]?c', 'ac', SUCCEED, 'found', 'ac'),
('(abc)\\1', 'abcabc', SUCCEED, 'g1', 'abc'),
('([a-c]*)\\1', 'abcabc', SUCCEED, 'g1', 'abc'),
('^(.+)?B', 'AB', SUCCEED, 'g1', 'A'),
('(a+).\\1$', 'aaaaa', SUCCEED, 'found+"-"+g1', 'aaaaa-aa'),
('^(a+).\\1$', 'aaaa', FAIL),
('(abc)\\1', 'abcabc', SUCCEED, 'found+"-"+g1', 'abcabc-abc'),
('([a-c]+)\\1', 'abcabc', SUCCEED, 'found+"-"+g1', 'abcabc-abc'),
('(a)\\1', 'aa', SUCCEED, 'found+"-"+g1', 'aa-a'),
('(a+)\\1', 'aa', SUCCEED, 'found+"-"+g1', 'aa-a'),
('(a+)+\\1', 'aa', SUCCEED, 'found+"-"+g1', 'aa-a'),
('(a).+\\1', 'aba', SUCCEED, 'found+"-"+g1', 'aba-a'),
('(a)ba*\\1', 'aba', SUCCEED, 'found+"-"+g1', 'aba-a'),
('(aa|a)a\\1$', 'aaa', SUCCEED, 'found+"-"+g1', 'aaa-a'),
('(a|aa)a\\1$', 'aaa', SUCCEED, 'found+"-"+g1', 'aaa-a'),
('(a+)a\\1$', 'aaa', SUCCEED, 'found+"-"+g1', 'aaa-a'),
('([abc]*)\\1', 'abcabc', SUCCEED, 'found+"-"+g1', 'abcabc-abc'),
('(a)(b)c|ab', 'ab', SUCCEED, 'found+"-"+g1+"-"+g2', 'ab-None-None'),
('(a)+x', 'aaax', SUCCEED, 'found+"-"+g1', 'aaax-a'),
('([ac])+x', 'aacx', SUCCEED, 'found+"-"+g1', 'aacx-c'),
('([^/]*/)*sub1/', 'd:msgs/tdir/sub1/trial/away.cpp', SUCCEED, 'found+"-"+g1', 'd:msgs/tdir/sub1/-tdir/'),
('([^.]*)\\.([^:]*):[T ]+(.*)', 'track1.title:TBlah blah blah', SUCCEED, 'found+"-"+g1+"-"+g2+"-"+g3', 'track1.title:TBlah blah blah-track1-title-Blah blah blah'),
('([^N]*N)+', 'abNNxyzN', SUCCEED, 'found+"-"+g1', 'abNNxyzN-xyzN'),
('([^N]*N)+', 'abNNxyz', SUCCEED, 'found+"-"+g1', 'abNN-N'),
('([abc]*)x', 'abcx', SUCCEED, 'found+"-"+g1', 'abcx-abc'),
('([abc]*)x', 'abc', FAIL),
('([xyz]*)x', 'abcx', SUCCEED, 'found+"-"+g1', 'x-'),
('(a)+b|aac', 'aac', SUCCEED, 'found+"-"+g1', 'aac-None'),
# Test symbolic groups
('(?P<i d>aaa)a', 'aaaa', SYNTAX_ERROR),
('(?P<id>aaa)a', 'aaaa', SUCCEED, 'found+"-"+id', 'aaaa-aaa'),
('(?P<id>aa)(?P=id)', 'aaaa', SUCCEED, 'found+"-"+id', 'aaaa-aa'),
('(?P<id>aa)(?P=xd)', 'aaaa', SYNTAX_ERROR),
# Test octal escapes/memory references
('\\1', 'a', SYNTAX_ERROR),
('\\09', chr(0) + '9', SUCCEED, 'found', chr(0) + '9'),
('\\141', 'a', SUCCEED, 'found', 'a'),
('(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)(l)\\119', 'abcdefghijklk9', SUCCEED, 'found+"-"+g11', 'abcdefghijklk9-k'),
# All tests from Perl
('abc', 'abc', SUCCEED, 'found', 'abc'),
('abc', 'xbc', FAIL),
('abc', 'axc', FAIL),
('abc', 'abx', FAIL),
('abc', 'xabcy', SUCCEED, 'found', 'abc'),
('abc', 'ababc', SUCCEED, 'found', 'abc'),
('ab*c', 'abc', SUCCEED, 'found', 'abc'),
('ab*bc', 'abc', SUCCEED, 'found', 'abc'),
('ab*bc', 'abbc', SUCCEED, 'found', 'abbc'),
('ab*bc', 'abbbbc', SUCCEED, 'found', 'abbbbc'),
('ab{0,}bc', 'abbbbc', SUCCEED, 'found', 'abbbbc'),
('ab+bc', 'abbc', SUCCEED, 'found', 'abbc'),
('ab+bc', 'abc', FAIL),
('ab+bc', 'abq', FAIL),
('ab{1,}bc', 'abq', FAIL),
('ab+bc', 'abbbbc', SUCCEED, 'found', 'abbbbc'),
('ab{1,}bc', 'abbbbc', SUCCEED, 'found', 'abbbbc'),
('ab{1,3}bc', 'abbbbc', SUCCEED, 'found', 'abbbbc'),
('ab{3,4}bc', 'abbbbc', SUCCEED, 'found', 'abbbbc'),
('ab{4,5}bc', 'abbbbc', FAIL),
('ab?bc', 'abbc', SUCCEED, 'found', 'abbc'),
('ab?bc', 'abc', SUCCEED, 'found', 'abc'),
('ab{0,1}bc', 'abc', SUCCEED, 'found', 'abc'),
('ab?bc', 'abbbbc', FAIL),
('ab?c', 'abc', SUCCEED, 'found', 'abc'),
('ab{0,1}c', 'abc', SUCCEED, 'found', 'abc'),
('^abc$', 'abc', SUCCEED, 'found', 'abc'),
('^abc$', 'abcc', FAIL),
('^abc', 'abcc', SUCCEED, 'found', 'abc'),
('^abc$', 'aabc', FAIL),
('abc$', 'aabc', SUCCEED, 'found', 'abc'),
('^', 'abc', SUCCEED, 'found', ''),
('$', 'abc', SUCCEED, 'found', ''),
('a.c', 'abc', SUCCEED, 'found', 'abc'),
('a.c', 'axc', SUCCEED, 'found', 'axc'),
('a.*c', 'axyzc', SUCCEED, 'found', 'axyzc'),
('a.*c', 'axyzd', FAIL),
('a[bc]d', 'abc', FAIL),
('a[bc]d', 'abd', SUCCEED, 'found', 'abd'),
('a[b-d]e', 'abd', FAIL),
('a[b-d]e', 'ace', SUCCEED, 'found', 'ace'),
('a[b-d]', 'aac', SUCCEED, 'found', 'ac'),
('a[-b]', 'a-', SUCCEED, 'found', 'a-'),
('a[b-]', 'a-', SUCCEED, 'found', 'a-'),
('a[b-a]', '-', SYNTAX_ERROR),
('a[]b', '-', SYNTAX_ERROR),
('a[', '-', SYNTAX_ERROR),
('a]', 'a]', SUCCEED, 'found', 'a]'),
('a[]]b', 'a]b', SUCCEED, 'found', 'a]b'),
('a[^bc]d', 'aed', SUCCEED, 'found', 'aed'),
('a[^bc]d', 'abd', FAIL),
('a[^-b]c', 'adc', SUCCEED, 'found', 'adc'),
('a[^-b]c', 'a-c', FAIL),
('a[^]b]c', 'a]c', FAIL),
('a[^]b]c', 'adc', SUCCEED, 'found', 'adc'),
('ab|cd', 'abc', SUCCEED, 'found', 'ab'),
('ab|cd', 'abcd', SUCCEED, 'found', 'ab'),
('()ef', 'def', SUCCEED, 'found+"-"+g1', 'ef-'),
('*a', '-', SYNTAX_ERROR),
('(*)b', '-', SYNTAX_ERROR),
('$b', 'b', FAIL),
('a\\', '-', SYNTAX_ERROR),
('a\\(b', 'a(b', SUCCEED, 'found+"-"+g1', 'a(b-Error'),
('a\\(*b', 'ab', SUCCEED, 'found', 'ab'),
('a\\(*b', 'a((b', SUCCEED, 'found', 'a((b'),
('a\\\\b', 'a\\b', SUCCEED, 'found', 'a\\b'),
('abc)', '-', SYNTAX_ERROR),
('(abc', '-', SYNTAX_ERROR),
('((a))', 'abc', SUCCEED, 'found+"-"+g1+"-"+g2', 'a-a-a'),
('(a)b(c)', 'abc', SUCCEED, 'found+"-"+g1+"-"+g2', 'abc-a-c'),
('a+b+c', 'aabbabc', SUCCEED, 'found', 'abc'),
('a{1,}b{1,}c', 'aabbabc', SUCCEED, 'found', 'abc'),
('a**', '-', SYNTAX_ERROR),
('a.+?c', 'abcabc', SUCCEED, 'found', 'abc'),
('(a+|b)*', 'ab', SUCCEED, 'found+"-"+g1', 'ab-b'),
('(a+|b){0,}', 'ab', SUCCEED, 'found+"-"+g1', 'ab-b'),
('(a+|b)+', 'ab', SUCCEED, 'found+"-"+g1', 'ab-b'),
('(a+|b){1,}', 'ab', SUCCEED, 'found+"-"+g1', 'ab-b'),
('(a+|b)?', 'ab', SUCCEED, 'found+"-"+g1', 'a-a'),
('(a+|b){0,1}', 'ab', SUCCEED, 'found+"-"+g1', 'a-a'),
(')(', '-', SYNTAX_ERROR),
('[^ab]*', 'cde', SUCCEED, 'found', 'cde'),
('abc', '', FAIL),
('a*', '', SUCCEED, 'found', ''),
('([abc])*d', 'abbbcd', SUCCEED, 'found+"-"+g1', 'abbbcd-c'),
('([abc])*bcd', 'abcd', SUCCEED, 'found+"-"+g1', 'abcd-a'),
('a|b|c|d|e', 'e', SUCCEED, 'found', 'e'),
('(a|b|c|d|e)f', 'ef', SUCCEED, 'found+"-"+g1', 'ef-e'),
('abcd*efg', 'abcdefg', SUCCEED, 'found', 'abcdefg'),
('ab*', 'xabyabbbz', SUCCEED, 'found', 'ab'),
('ab*', 'xayabbbz', SUCCEED, 'found', 'a'),
('(ab|cd)e', 'abcde', SUCCEED, 'found+"-"+g1', 'cde-cd'),
('[abhgefdc]ij', 'hij', SUCCEED, 'found', 'hij'),
('^(ab|cd)e', 'abcde', FAIL),
('(abc|)ef', 'abcdef', SUCCEED, 'found+"-"+g1', 'ef-'),
('(a|b)c*d', 'abcd', SUCCEED, 'found+"-"+g1', 'bcd-b'),
('(ab|ab*)bc', 'abc', SUCCEED, 'found+"-"+g1', 'abc-a'),
('a([bc]*)c*', 'abc', SUCCEED, 'found+"-"+g1', 'abc-bc'),
('a([bc]*)(c*d)', 'abcd', SUCCEED, 'found+"-"+g1+"-"+g2', 'abcd-bc-d'),
('a([bc]+)(c*d)', 'abcd', SUCCEED, 'found+"-"+g1+"-"+g2', 'abcd-bc-d'),
('a([bc]*)(c+d)', 'abcd', SUCCEED, 'found+"-"+g1+"-"+g2', 'abcd-b-cd'),
('a[bcd]*dcdcde', 'adcdcde', SUCCEED, 'found', 'adcdcde'),
('a[bcd]+dcdcde', 'adcdcde', FAIL),
('(ab|a)b*c', 'abc', SUCCEED, 'found+"-"+g1', 'abc-ab'),
('((a)(b)c)(d)', 'abcd', SUCCEED, 'g1+"-"+g2+"-"+g3+"-"+g4', 'abc-a-b-d'),
('[a-zA-Z_][a-zA-Z0-9_]*', 'alpha', SUCCEED, 'found', 'alpha'),
('^a(bc+|b[eh])g|.h$', 'abh', SUCCEED, 'found+"-"+g1', 'bh-None'),
('(bc+d$|ef*g.|h?i(j|k))', 'effgz', SUCCEED, 'found+"-"+g1+"-"+g2', 'effgz-effgz-None'),
('(bc+d$|ef*g.|h?i(j|k))', 'ij', SUCCEED, 'found+"-"+g1+"-"+g2', 'ij-ij-j'),
('(bc+d$|ef*g.|h?i(j|k))', 'effg', FAIL),
('(bc+d$|ef*g.|h?i(j|k))', 'bcdd', FAIL),
('(bc+d$|ef*g.|h?i(j|k))', 'reffgz', SUCCEED, 'found+"-"+g1+"-"+g2', 'effgz-effgz-None'),
('((((((((((a))))))))))', 'a', SUCCEED, 'g10', 'a'),
('((((((((((a))))))))))\\10', 'aa', SUCCEED, 'found', 'aa'),
# Python does not have the same rules for \\41 so this is a syntax error
# ('((((((((((a))))))))))\\41', 'aa', FAIL),
# ('((((((((((a))))))))))\\41', 'a!', SUCCEED, 'found', 'a!'),
('((((((((((a))))))))))\\41', '', SYNTAX_ERROR),
('(?i)((((((((((a))))))))))\\41', '', SYNTAX_ERROR),
('(((((((((a)))))))))', 'a', SUCCEED, 'found', 'a'),
('multiple words of text', 'uh-uh', FAIL),
('multiple words', 'multiple words, yeah', SUCCEED, 'found', 'multiple words'),
('(.*)c(.*)', 'abcde', SUCCEED, 'found+"-"+g1+"-"+g2', 'abcde-ab-de'),
('\\((.*), (.*)\\)', '(a, b)', SUCCEED, 'g2+"-"+g1', 'b-a'),
('[k]', 'ab', FAIL),
('a[-]?c', 'ac', SUCCEED, 'found', 'ac'),
('(abc)\\1', 'abcabc', SUCCEED, 'g1', 'abc'),
('([a-c]*)\\1', 'abcabc', SUCCEED, 'g1', 'abc'),
('(?i)abc', 'ABC', SUCCEED, 'found', 'ABC'),
('(?i)abc', 'XBC', FAIL),
('(?i)abc', 'AXC', FAIL),
('(?i)abc', 'ABX', FAIL),
('(?i)abc', 'XABCY', SUCCEED, 'found', 'ABC'),
('(?i)abc', 'ABABC', SUCCEED, 'found', 'ABC'),
('(?i)ab*c', 'ABC', SUCCEED, 'found', 'ABC'),
('(?i)ab*bc', 'ABC', SUCCEED, 'found', 'ABC'),
('(?i)ab*bc', 'ABBC', SUCCEED, 'found', 'ABBC'),
('(?i)ab*?bc', 'ABBBBC', SUCCEED, 'found', 'ABBBBC'),
('(?i)ab{0,}?bc', 'ABBBBC', SUCCEED, 'found', 'ABBBBC'),
('(?i)ab+?bc', 'ABBC', SUCCEED, 'found', 'ABBC'),
('(?i)ab+bc', 'ABC', FAIL),
('(?i)ab+bc', 'ABQ', FAIL),
('(?i)ab{1,}bc', 'ABQ', FAIL),
('(?i)ab+bc', 'ABBBBC', SUCCEED, 'found', 'ABBBBC'),
('(?i)ab{1,}?bc', 'ABBBBC', SUCCEED, 'found', 'ABBBBC'),
('(?i)ab{1,3}?bc', 'ABBBBC', SUCCEED, 'found', 'ABBBBC'),
('(?i)ab{3,4}?bc', 'ABBBBC', SUCCEED, 'found', 'ABBBBC'),
('(?i)ab{4,5}?bc', 'ABBBBC', FAIL),
('(?i)ab??bc', 'ABBC', SUCCEED, 'found', 'ABBC'),
('(?i)ab??bc', 'ABC', SUCCEED, 'found', 'ABC'),
('(?i)ab{0,1}?bc', 'ABC', SUCCEED, 'found', 'ABC'),
('(?i)ab??bc', 'ABBBBC', FAIL),
('(?i)ab??c', 'ABC', SUCCEED, 'found', 'ABC'),
('(?i)ab{0,1}?c', 'ABC', SUCCEED, 'found', 'ABC'),
('(?i)^abc$', 'ABC', SUCCEED, 'found', 'ABC'),
('(?i)^abc$', 'ABCC', FAIL),
('(?i)^abc', 'ABCC', SUCCEED, 'found', 'ABC'),
('(?i)^abc$', 'AABC', FAIL),
('(?i)abc$', 'AABC', SUCCEED, 'found', 'ABC'),
('(?i)^', 'ABC', SUCCEED, 'found', ''),
('(?i)$', 'ABC', SUCCEED, 'found', ''),
('(?i)a.c', 'ABC', SUCCEED, 'found', 'ABC'),
('(?i)a.c', 'AXC', SUCCEED, 'found', 'AXC'),
('(?i)a.*?c', 'AXYZC', SUCCEED, 'found', 'AXYZC'),
('(?i)a.*c', 'AXYZD', FAIL),
('(?i)a[bc]d', 'ABC', FAIL),
('(?i)a[bc]d', 'ABD', SUCCEED, 'found', 'ABD'),
('(?i)a[b-d]e', 'ABD', FAIL),
('(?i)a[b-d]e', 'ACE', SUCCEED, 'found', 'ACE'),
('(?i)a[b-d]', 'AAC', SUCCEED, 'found', 'AC'),
('(?i)a[-b]', 'A-', SUCCEED, 'found', 'A-'),
('(?i)a[b-]', 'A-', SUCCEED, 'found', 'A-'),
('(?i)a[b-a]', '-', SYNTAX_ERROR),
('(?i)a[]b', '-', SYNTAX_ERROR),
('(?i)a[', '-', SYNTAX_ERROR),
('(?i)a]', 'A]', SUCCEED, 'found', 'A]'),
('(?i)a[]]b', 'A]B', SUCCEED, 'found', 'A]B'),
('(?i)a[^bc]d', 'AED', SUCCEED, 'found', 'AED'),
('(?i)a[^bc]d', 'ABD', FAIL),
('(?i)a[^-b]c', 'ADC', SUCCEED, 'found', 'ADC'),
('(?i)a[^-b]c', 'A-C', FAIL),
('(?i)a[^]b]c', 'A]C', FAIL),
('(?i)a[^]b]c', 'ADC', SUCCEED, 'found', 'ADC'),
('(?i)ab|cd', 'ABC', SUCCEED, 'found', 'AB'),
('(?i)ab|cd', 'ABCD', SUCCEED, 'found', 'AB'),
('(?i)()ef', 'DEF', SUCCEED, 'found+"-"+g1', 'EF-'),
('(?i)*a', '-', SYNTAX_ERROR),
('(?i)(*)b', '-', SYNTAX_ERROR),
('(?i)$b', 'B', FAIL),
('(?i)a\\', '-', SYNTAX_ERROR),
('(?i)a\\(b', 'A(B', SUCCEED, 'found+"-"+g1', 'A(B-Error'),
('(?i)a\\(*b', 'AB', SUCCEED, 'found', 'AB'),
('(?i)a\\(*b', 'A((B', SUCCEED, 'found', 'A((B'),
('(?i)a\\\\b', 'A\\B', SUCCEED, 'found', 'A\\B'),
('(?i)abc)', '-', SYNTAX_ERROR),
('(?i)(abc', '-', SYNTAX_ERROR),
('(?i)((a))', 'ABC', SUCCEED, 'found+"-"+g1+"-"+g2', 'A-A-A'),
('(?i)(a)b(c)', 'ABC', SUCCEED, 'found+"-"+g1+"-"+g2', 'ABC-A-C'),
('(?i)a+b+c', 'AABBABC', SUCCEED, 'found', 'ABC'),
('(?i)a{1,}b{1,}c', 'AABBABC', SUCCEED, 'found', 'ABC'),
('(?i)a**', '-', SYNTAX_ERROR),
('(?i)a.+?c', 'ABCABC', SUCCEED, 'found', 'ABC'),
('(?i)a.*?c', 'ABCABC', SUCCEED, 'found', 'ABC'),
('(?i)a.{0,5}?c', 'ABCABC', SUCCEED, 'found', 'ABC'),
('(?i)(a+|b)*', 'AB', SUCCEED, 'found+"-"+g1', 'AB-B'),
('(?i)(a+|b){0,}', 'AB', SUCCEED, 'found+"-"+g1', 'AB-B'),
('(?i)(a+|b)+', 'AB', SUCCEED, 'found+"-"+g1', 'AB-B'),
('(?i)(a+|b){1,}', 'AB', SUCCEED, 'found+"-"+g1', 'AB-B'),
('(?i)(a+|b)?', 'AB', SUCCEED, 'found+"-"+g1', 'A-A'),
('(?i)(a+|b){0,1}', 'AB', SUCCEED, 'found+"-"+g1', 'A-A'),
('(?i)(a+|b){0,1}?', 'AB', SUCCEED, 'found+"-"+g1', '-None'),
('(?i))(', '-', SYNTAX_ERROR),
('(?i)[^ab]*', 'CDE', SUCCEED, 'found', 'CDE'),
('(?i)abc', '', FAIL),
('(?i)a*', '', SUCCEED, 'found', ''),
('(?i)([abc])*d', 'ABBBCD', SUCCEED, 'found+"-"+g1', 'ABBBCD-C'),
('(?i)([abc])*bcd', 'ABCD', SUCCEED, 'found+"-"+g1', 'ABCD-A'),
('(?i)a|b|c|d|e', 'E', SUCCEED, 'found', 'E'),
('(?i)(a|b|c|d|e)f', 'EF', SUCCEED, 'found+"-"+g1', 'EF-E'),
('(?i)abcd*efg', 'ABCDEFG', SUCCEED, 'found', 'ABCDEFG'),
('(?i)ab*', 'XABYABBBZ', SUCCEED, 'found', 'AB'),
('(?i)ab*', 'XAYABBBZ', SUCCEED, 'found', 'A'),
('(?i)(ab|cd)e', 'ABCDE', SUCCEED, 'found+"-"+g1', 'CDE-CD'),
('(?i)[abhgefdc]ij', 'HIJ', SUCCEED, 'found', 'HIJ'),
('(?i)^(ab|cd)e', 'ABCDE', FAIL),
('(?i)(abc|)ef', 'ABCDEF', SUCCEED, 'found+"-"+g1', 'EF-'),
('(?i)(a|b)c*d', 'ABCD', SUCCEED, 'found+"-"+g1', 'BCD-B'),
('(?i)(ab|ab*)bc', 'ABC', SUCCEED, 'found+"-"+g1', 'ABC-A'),
('(?i)a([bc]*)c*', 'ABC', SUCCEED, 'found+"-"+g1', 'ABC-BC'),
('(?i)a([bc]*)(c*d)', 'ABCD', SUCCEED, 'found+"-"+g1+"-"+g2', 'ABCD-BC-D'),
('(?i)a([bc]+)(c*d)', 'ABCD', SUCCEED, 'found+"-"+g1+"-"+g2', 'ABCD-BC-D'),
('(?i)a([bc]*)(c+d)', 'ABCD', SUCCEED, 'found+"-"+g1+"-"+g2', 'ABCD-B-CD'),
('(?i)a[bcd]*dcdcde', 'ADCDCDE', SUCCEED, 'found', 'ADCDCDE'),
('(?i)a[bcd]+dcdcde', 'ADCDCDE', FAIL),
('(?i)(ab|a)b*c', 'ABC', SUCCEED, 'found+"-"+g1', 'ABC-AB'),
('(?i)((a)(b)c)(d)', 'ABCD', SUCCEED, 'g1+"-"+g2+"-"+g3+"-"+g4', 'ABC-A-B-D'),
('(?i)[a-zA-Z_][a-zA-Z0-9_]*', 'ALPHA', SUCCEED, 'found', 'ALPHA'),
('(?i)^a(bc+|b[eh])g|.h$', 'ABH', SUCCEED, 'found+"-"+g1', 'BH-None'),
('(?i)(bc+d$|ef*g.|h?i(j|k))', 'EFFGZ', SUCCEED, 'found+"-"+g1+"-"+g2', 'EFFGZ-EFFGZ-None'),
('(?i)(bc+d$|ef*g.|h?i(j|k))', 'IJ', SUCCEED, 'found+"-"+g1+"-"+g2', 'IJ-IJ-J'),
('(?i)(bc+d$|ef*g.|h?i(j|k))', 'EFFG', FAIL),
('(?i)(bc+d$|ef*g.|h?i(j|k))', 'BCDD', FAIL),
('(?i)(bc+d$|ef*g.|h?i(j|k))', 'REFFGZ', SUCCEED, 'found+"-"+g1+"-"+g2', 'EFFGZ-EFFGZ-None'),
('(?i)((((((((((a))))))))))', 'A', SUCCEED, 'g10', 'A'),
('(?i)((((((((((a))))))))))\\10', 'AA', SUCCEED, 'found', 'AA'),
#('(?i)((((((((((a))))))))))\\41', 'AA', FAIL),
#('(?i)((((((((((a))))))))))\\41', 'A!', SUCCEED, 'found', 'A!'),
('(?i)(((((((((a)))))))))', 'A', SUCCEED, 'found', 'A'),
('(?i)(?:(?:(?:(?:(?:(?:(?:(?:(?:(a))))))))))', 'A', SUCCEED, 'g1', 'A'),
('(?i)(?:(?:(?:(?:(?:(?:(?:(?:(?:(a|b|c))))))))))', 'C', SUCCEED, 'g1', 'C'),
('(?i)multiple words of text', 'UH-UH', FAIL),
('(?i)multiple words', 'MULTIPLE WORDS, YEAH', SUCCEED, 'found', 'MULTIPLE WORDS'),
('(?i)(.*)c(.*)', 'ABCDE', SUCCEED, 'found+"-"+g1+"-"+g2', 'ABCDE-AB-DE'),
('(?i)\\((.*), (.*)\\)', '(A, B)', SUCCEED, 'g2+"-"+g1', 'B-A'),
('(?i)[k]', 'AB', FAIL),
# ('(?i)abcd', 'ABCD', SUCCEED, 'found+"-"+\\found+"-"+\\\\found', 'ABCD-$&-\\ABCD'),
# ('(?i)a(bc)d', 'ABCD', SUCCEED, 'g1+"-"+\\g1+"-"+\\\\g1', 'BC-$1-\\BC'),
('(?i)a[-]?c', 'AC', SUCCEED, 'found', 'AC'),
('(?i)(abc)\\1', 'ABCABC', SUCCEED, 'g1', 'ABC'),
('(?i)([a-c]*)\\1', 'ABCABC', SUCCEED, 'g1', 'ABC'),
('a(?!b).', 'abad', SUCCEED, 'found', 'ad'),
('a(?=d).', 'abad', SUCCEED, 'found', 'ad'),
('a(?=c|d).', 'abad', SUCCEED, 'found', 'ad'),
('a(?:b|c|d)(.)', 'ace', SUCCEED, 'g1', 'e'),
('a(?:b|c|d)*(.)', 'ace', SUCCEED, 'g1', 'e'),
('a(?:b|c|d)+?(.)', 'ace', SUCCEED, 'g1', 'e'),
('a(?:b|(c|e){1,2}?|d)+?(.)', 'ace', SUCCEED, 'g1 + g2', 'ce'),
('^(.+)?B', 'AB', SUCCEED, 'g1', 'A'),
# lookbehind: split by : but not if it is escaped by -.
('(?<!-):(.*?)(?<!-):', 'a:bc-:de:f', SUCCEED, 'g1', 'bc-:de' ),
# escaping with \ as we know it
('(?<!\\\):(.*?)(?<!\\\):', 'a:bc\\:de:f', SUCCEED, 'g1', 'bc\\:de' ),
# terminating with ' and escaping with ? as in edifact
("(?<!\\?)'(.*?)(?<!\\?)'", "a'bc?'de'f", SUCCEED, 'g1', "bc?'de" ),
# Comments using the (?#...) syntax
('w(?# comment', 'w', SYNTAX_ERROR),
('w(?# comment 1)xy(?# comment 2)z', 'wxyz', SUCCEED, 'found', 'wxyz'),
# Check odd placement of embedded pattern modifiers
# not an error under PCRE/PRE:
('w(?i)', 'W', SUCCEED, 'found', 'W'),
# ('w(?i)', 'W', SYNTAX_ERROR),
# Comments using the x embedded pattern modifier
("""(?x)w# comment 1
x y
# comment 2
z""", 'wxyz', SUCCEED, 'found', 'wxyz'),
# using the m embedded pattern modifier
('^abc', """jkl
abc
xyz""", FAIL),
('(?m)^abc', """jkl
abc
xyz""", SUCCEED, 'found', 'abc'),
('(?m)abc$', """jkl
xyzabc
123""", SUCCEED, 'found', 'abc'),
# using the s embedded pattern modifier
('a.b', 'a\nb', FAIL),
('(?s)a.b', 'a\nb', SUCCEED, 'found', 'a\nb'),
# test \w, etc. both inside and outside character classes
('\\w+', '--ab_cd0123--', SUCCEED, 'found', 'ab_cd0123'),
('[\\w]+', '--ab_cd0123--', SUCCEED, 'found', 'ab_cd0123'),
('\\D+', '1234abc5678', SUCCEED, 'found', 'abc'),
('[\\D]+', '1234abc5678', SUCCEED, 'found', 'abc'),
('[\\da-fA-F]+', '123abc', SUCCEED, 'found', '123abc'),
# not an error under PCRE/PRE:
# ('[\\d-x]', '-', SYNTAX_ERROR),
(r'([\s]*)([\S]*)([\s]*)', ' testing!1972', SUCCEED, 'g3+g2+g1', 'testing!1972 '),
(r'(\s*)(\S*)(\s*)', ' testing!1972', SUCCEED, 'g3+g2+g1', 'testing!1972 '),
(r'\xff', '\377', SUCCEED, 'found', chr(255)),
# new \x semantics
(r'\x00ff', '\377', FAIL),
# (r'\x00ff', '\377', SUCCEED, 'found', chr(255)),
(r'\t\n\v\r\f\a\g', '\t\n\v\r\f\ag', SUCCEED, 'found', '\t\n\v\r\f\ag'),
('\t\n\v\r\f\a\g', '\t\n\v\r\f\ag', SUCCEED, 'found', '\t\n\v\r\f\ag'),
(r'\t\n\v\r\f\a', '\t\n\v\r\f\a', SUCCEED, 'found', chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)),
(r'[\t][\n][\v][\r][\f][\b]', '\t\n\v\r\f\b', SUCCEED, 'found', '\t\n\v\r\f\b'),
#
# post-1.5.2 additions
# xmllib problem
(r'(([a-z]+):)?([a-z]+)$', 'smil', SUCCEED, 'g1+"-"+g2+"-"+g3', 'None-None-smil'),
# bug 110866: reference to undefined group
(r'((.)\1+)', '', SYNTAX_ERROR),
# bug 111869: search (PRE/PCRE fails on this one, SRE doesn't)
(r'.*d', 'abc\nabd', SUCCEED, 'found', 'abd'),
# bug 112468: various expected syntax errors
(r'(', '', SYNTAX_ERROR),
(r'[\41]', '!', SUCCEED, 'found', '!'),
# bug 114033: nothing to repeat
(r'(x?)?', 'x', SUCCEED, 'found', 'x'),
# bug 115040: rescan if flags are modified inside pattern
(r' (?x)foo ', 'foo', SUCCEED, 'found', 'foo'),
# bug 115618: negative lookbehind
(r'(?<!abc)(d.f)', 'abcdefdof', SUCCEED, 'found', 'dof'),
# bug 116251: character class bug
(r'[\w-]+', 'laser_beam', SUCCEED, 'found', 'laser_beam'),
# bug 123769+127259: non-greedy backtracking bug
(r'.*?\S *:', 'xx:', SUCCEED, 'found', 'xx:'),
(r'a[ ]*?\ (\d+).*', 'a 10', SUCCEED, 'found', 'a 10'),
(r'a[ ]*?\ (\d+).*', 'a 10', SUCCEED, 'found', 'a 10'),
# bug 127259: \Z shouldn't depend on multiline mode
(r'(?ms).*?x\s*\Z(.*)','xx\nx\n', SUCCEED, 'g1', ''),
# bug 128899: uppercase literals under the ignorecase flag
(r'(?i)M+', 'MMM', SUCCEED, 'found', 'MMM'),
(r'(?i)m+', 'MMM', SUCCEED, 'found', 'MMM'),
(r'(?i)[M]+', 'MMM', SUCCEED, 'found', 'MMM'),
(r'(?i)[m]+', 'MMM', SUCCEED, 'found', 'MMM'),
# bug 130748: ^* should be an error (nothing to repeat)
(r'^*', '', SYNTAX_ERROR),
# bug 133283: minimizing repeat problem
(r'"(?:\\"|[^"])*?"', r'"\""', SUCCEED, 'found', r'"\""'),
# bug 477728: minimizing repeat problem
(r'^.*?$', 'one\ntwo\nthree\n', FAIL),
# bug 483789: minimizing repeat problem
(r'a[^>]*?b', 'a>b', FAIL),
# bug 490573: minimizing repeat problem
(r'^a*?$', 'foo', FAIL),
# bug 470582: nested groups problem
(r'^((a)c)?(ab)$', 'ab', SUCCEED, 'g1+"-"+g2+"-"+g3', 'None-None-ab'),
# another minimizing repeat problem (capturing groups in assertions)
('^([ab]*?)(?=(b)?)c', 'abc', SUCCEED, 'g1+"-"+g2', 'ab-None'),
('^([ab]*?)(?!(b))c', 'abc', SUCCEED, 'g1+"-"+g2', 'ab-None'),
('^([ab]*?)(?<!(a))c', 'abc', SUCCEED, 'g1+"-"+g2', 'ab-None'),
]
# Sample non-ASCII letter shared by the cases appended below.
u = '\N{LATIN CAPITAL LETTER A WITH DIAERESIS}'
# bug 410271: \b broken under locales
tests += [
    (r'\b.\b', 'a', SUCCEED, 'found', 'a'),
    (r'(?u)\b.\b', u, SUCCEED, 'found', u),
    (r'(?u)\w', u, SUCCEED, 'found', u),
]

View File

@@ -1,516 +0,0 @@
#!/usr/bin/env python3
# -*- mode: python -*-
# Re test suite and benchmark suite v1.5
# Outcome codes: each pattern is expected to produce exactly one of these.
SUCCEED, FAIL, SYNTAX_ERROR = 0, 1, 2
# Benchmark suite (needs expansion)
#
# These entries measure speed only; they do not check correctness.
# Each entry is a (pattern, text) pair: the regex pattern first, then a
# string to match it against.  The benchmarking code embeds that string
# inside several sizes of padding, to test how regex matching performs
# on large strings.
benchmarks = [
    # test common prefix

    # Alternation
    ('Python|Perl', 'Perl'),
    # Grouped alternation
    ('(Python|Perl)', 'Perl'),
    # Alternation
    ('Python|Perl|Tcl', 'Perl'),
    # Grouped alternation
    ('(Python|Perl|Tcl)', 'Perl'),
    # Backreference
    (r'(Python)\1', 'PythonPython'),
    # Disable the fastmap optimization
    ('([0a-z][a-z0-9]*,)+', 'a5,b7,c9,'),
    # A few sets
    ('([a-z][a-z0-9]*,)+', 'a5,b7,c9,'),
    # Simple text literal
    ('Python', 'Python'),
    # Bad text literal
    ('.*Python', 'Python'),
    # Worse text literal
    ('.*Python.*', 'Python'),
    # Bad text literal with grouping
    ('.*(Python)', 'Python'),
]
# Test suite (for verifying correctness)
# The test suite is a list of 5- or 3-tuples. The 5 parts of a
# complete tuple are:
# element 0: a string containing the pattern
# 1: the string to match against the pattern
# 2: the expected result (SUCCEED, FAIL, SYNTAX_ERROR)
# 3: a string that will be eval()'ed to produce a test string.
# This is an arbitrary Python expression; the available
# variables are "found" (the whole match), and "g1", "g2", ...
# up to "g99" contain the contents of each group, or the
# string 'None' if the group wasn't given a value, or the
# string 'Error' if the group index was out of range;
# also "groups", the return value of m.group() (a tuple).
# 4: The expected result of evaluating the expression.
# If the two don't match, an error is reported.
# If the regex isn't expected to work, the latter two elements can be omitted.
tests = [
# Test ?P< and ?P= extensions
('(?P<foo_123', '', SYNTAX_ERROR), # Unterminated group identifier
('(?P<1>a)', '', SYNTAX_ERROR), # Begins with a digit
('(?P<!>a)', '', SYNTAX_ERROR), # Begins with an illegal char
('(?P<foo!>a)', '', SYNTAX_ERROR), # Begins with an illegal char
# Same tests, for the ?P= form
('(?P<foo_123>a)(?P=foo_123', 'aa', SYNTAX_ERROR),
('(?P<foo_123>a)(?P=1)', 'aa', SYNTAX_ERROR),
('(?P<foo_123>a)(?P=!)', 'aa', SYNTAX_ERROR),
('(?P<foo_123>a)(?P=foo_124', 'aa', SYNTAX_ERROR), # Backref to undefined group
('(?P<foo_123>a)', 'a', SUCCEED, 'g1', 'a'),
('(?P<foo_123>a)(?P=foo_123)', 'aa', SUCCEED, 'g1', 'a'),
# Test octal escapes
('\\1', 'a', SYNTAX_ERROR), # Backreference
('[\\1]', '\1', SUCCEED, 'found', '\1'), # Character
('\\09', chr(0) + '9', SUCCEED, 'found', chr(0) + '9'),
('\\141', 'a', SUCCEED, 'found', 'a'),
('(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)(l)\\119', 'abcdefghijklk9', SUCCEED, 'found+"-"+g11', 'abcdefghijklk9-k'),
# Test \0 is handled everywhere
(r'\0', '\0', SUCCEED, 'found', '\0'),
(r'[\0a]', '\0', SUCCEED, 'found', '\0'),
(r'[a\0]', '\0', SUCCEED, 'found', '\0'),
(r'[^a\0]', '\0', FAIL),
# Test various letter escapes
(r'\a[\b]\f\n\r\t\v', '\a\b\f\n\r\t\v', SUCCEED, 'found', '\a\b\f\n\r\t\v'),
(r'[\a][\b][\f][\n][\r][\t][\v]', '\a\b\f\n\r\t\v', SUCCEED, 'found', '\a\b\f\n\r\t\v'),
# NOTE: not an error under PCRE/PRE:
# (r'\u', '', SYNTAX_ERROR), # A Perl escape
(r'\c\e\g\h\i\j\k\m\o\p\q\y\z', 'ceghijkmopqyz', SUCCEED, 'found', 'ceghijkmopqyz'),
(r'\xff', '\377', SUCCEED, 'found', chr(255)),
# new \x semantics
(r'\x00ffffffffffffff', '\377', FAIL, 'found', chr(255)),
(r'\x00f', '\017', FAIL, 'found', chr(15)),
(r'\x00fe', '\376', FAIL, 'found', chr(254)),
# (r'\x00ffffffffffffff', '\377', SUCCEED, 'found', chr(255)),
# (r'\x00f', '\017', SUCCEED, 'found', chr(15)),
# (r'\x00fe', '\376', SUCCEED, 'found', chr(254)),
(r"^\w+=(\\[\000-\277]|[^\n\\])*", "SRC=eval.c g.c blah blah blah \\\\\n\tapes.c",
SUCCEED, 'found', "SRC=eval.c g.c blah blah blah \\\\"),
# Test that . only matches \n in DOTALL mode
('a.b', 'acb', SUCCEED, 'found', 'acb'),
('a.b', 'a\nb', FAIL),
('a.*b', 'acc\nccb', FAIL),
('a.{4,5}b', 'acc\nccb', FAIL),
('a.b', 'a\rb', SUCCEED, 'found', 'a\rb'),
('a.b(?s)', 'a\nb', SUCCEED, 'found', 'a\nb'),
('a.*(?s)b', 'acc\nccb', SUCCEED, 'found', 'acc\nccb'),
('(?s)a.{4,5}b', 'acc\nccb', SUCCEED, 'found', 'acc\nccb'),
('(?s)a.b', 'a\nb', SUCCEED, 'found', 'a\nb'),
(')', '', SYNTAX_ERROR), # Unmatched right bracket
('', '', SUCCEED, 'found', ''), # Empty pattern
('abc', 'abc', SUCCEED, 'found', 'abc'),
('abc', 'xbc', FAIL),
('abc', 'axc', FAIL),
('abc', 'abx', FAIL),
('abc', 'xabcy', SUCCEED, 'found', 'abc'),
('abc', 'ababc', SUCCEED, 'found', 'abc'),
('ab*c', 'abc', SUCCEED, 'found', 'abc'),
('ab*bc', 'abc', SUCCEED, 'found', 'abc'),
('ab*bc', 'abbc', SUCCEED, 'found', 'abbc'),
('ab*bc', 'abbbbc', SUCCEED, 'found', 'abbbbc'),
('ab+bc', 'abbc', SUCCEED, 'found', 'abbc'),
('ab+bc', 'abc', FAIL),
('ab+bc', 'abq', FAIL),
('ab+bc', 'abbbbc', SUCCEED, 'found', 'abbbbc'),
('ab?bc', 'abbc', SUCCEED, 'found', 'abbc'),
('ab?bc', 'abc', SUCCEED, 'found', 'abc'),
('ab?bc', 'abbbbc', FAIL),
('ab?c', 'abc', SUCCEED, 'found', 'abc'),
('^abc$', 'abc', SUCCEED, 'found', 'abc'),
('^abc$', 'abcc', FAIL),
('^abc', 'abcc', SUCCEED, 'found', 'abc'),
('^abc$', 'aabc', FAIL),
('abc$', 'aabc', SUCCEED, 'found', 'abc'),
('^', 'abc', SUCCEED, 'found+"-"', '-'),
('$', 'abc', SUCCEED, 'found+"-"', '-'),
('a.c', 'abc', SUCCEED, 'found', 'abc'),
('a.c', 'axc', SUCCEED, 'found', 'axc'),
('a.*c', 'axyzc', SUCCEED, 'found', 'axyzc'),
('a.*c', 'axyzd', FAIL),
('a[bc]d', 'abc', FAIL),
('a[bc]d', 'abd', SUCCEED, 'found', 'abd'),
('a[b-d]e', 'abd', FAIL),
('a[b-d]e', 'ace', SUCCEED, 'found', 'ace'),
('a[b-d]', 'aac', SUCCEED, 'found', 'ac'),
('a[-b]', 'a-', SUCCEED, 'found', 'a-'),
('a[\\-b]', 'a-', SUCCEED, 'found', 'a-'),
# ('a[b-]', 'a-', SYNTAX_ERROR),
('a[]b', '-', SYNTAX_ERROR),
('a[', '-', SYNTAX_ERROR),
('a\\', '-', SYNTAX_ERROR),
('abc)', '-', SYNTAX_ERROR),
('(abc', '-', SYNTAX_ERROR),
('a]', 'a]', SUCCEED, 'found', 'a]'),
('a[]]b', 'a]b', SUCCEED, 'found', 'a]b'),
('a[\]]b', 'a]b', SUCCEED, 'found', 'a]b'),
('a[^bc]d', 'aed', SUCCEED, 'found', 'aed'),
('a[^bc]d', 'abd', FAIL),
('a[^-b]c', 'adc', SUCCEED, 'found', 'adc'),
('a[^-b]c', 'a-c', FAIL),
('a[^]b]c', 'a]c', FAIL),
('a[^]b]c', 'adc', SUCCEED, 'found', 'adc'),
('\\ba\\b', 'a-', SUCCEED, '"-"', '-'),
('\\ba\\b', '-a', SUCCEED, '"-"', '-'),
('\\ba\\b', '-a-', SUCCEED, '"-"', '-'),
('\\by\\b', 'xy', FAIL),
('\\by\\b', 'yz', FAIL),
('\\by\\b', 'xyz', FAIL),
('x\\b', 'xyz', FAIL),
('x\\B', 'xyz', SUCCEED, '"-"', '-'),
('\\Bz', 'xyz', SUCCEED, '"-"', '-'),
('z\\B', 'xyz', FAIL),
('\\Bx', 'xyz', FAIL),
('\\Ba\\B', 'a-', FAIL, '"-"', '-'),
('\\Ba\\B', '-a', FAIL, '"-"', '-'),
('\\Ba\\B', '-a-', FAIL, '"-"', '-'),
('\\By\\B', 'xy', FAIL),
('\\By\\B', 'yz', FAIL),
('\\By\\b', 'xy', SUCCEED, '"-"', '-'),
('\\by\\B', 'yz', SUCCEED, '"-"', '-'),
('\\By\\B', 'xyz', SUCCEED, '"-"', '-'),
('ab|cd', 'abc', SUCCEED, 'found', 'ab'),
('ab|cd', 'abcd', SUCCEED, 'found', 'ab'),
('()ef', 'def', SUCCEED, 'found+"-"+g1', 'ef-'),
('$b', 'b', FAIL),
('a\\(b', 'a(b', SUCCEED, 'found+"-"+g1', 'a(b-Error'),
('a\\(*b', 'ab', SUCCEED, 'found', 'ab'),
('a\\(*b', 'a((b', SUCCEED, 'found', 'a((b'),
('a\\\\b', 'a\\b', SUCCEED, 'found', 'a\\b'),
('((a))', 'abc', SUCCEED, 'found+"-"+g1+"-"+g2', 'a-a-a'),
('(a)b(c)', 'abc', SUCCEED, 'found+"-"+g1+"-"+g2', 'abc-a-c'),
('a+b+c', 'aabbabc', SUCCEED, 'found', 'abc'),
('(a+|b)*', 'ab', SUCCEED, 'found+"-"+g1', 'ab-b'),
('(a+|b)+', 'ab', SUCCEED, 'found+"-"+g1', 'ab-b'),
('(a+|b)?', 'ab', SUCCEED, 'found+"-"+g1', 'a-a'),
(')(', '-', SYNTAX_ERROR),
('[^ab]*', 'cde', SUCCEED, 'found', 'cde'),
('abc', '', FAIL),
('a*', '', SUCCEED, 'found', ''),
('a|b|c|d|e', 'e', SUCCEED, 'found', 'e'),
('(a|b|c|d|e)f', 'ef', SUCCEED, 'found+"-"+g1', 'ef-e'),
('abcd*efg', 'abcdefg', SUCCEED, 'found', 'abcdefg'),
('ab*', 'xabyabbbz', SUCCEED, 'found', 'ab'),
('ab*', 'xayabbbz', SUCCEED, 'found', 'a'),
('(ab|cd)e', 'abcde', SUCCEED, 'found+"-"+g1', 'cde-cd'),
('[abhgefdc]ij', 'hij', SUCCEED, 'found', 'hij'),
('^(ab|cd)e', 'abcde', FAIL, 'xg1y', 'xy'),
('(abc|)ef', 'abcdef', SUCCEED, 'found+"-"+g1', 'ef-'),
('(a|b)c*d', 'abcd', SUCCEED, 'found+"-"+g1', 'bcd-b'),
('(ab|ab*)bc', 'abc', SUCCEED, 'found+"-"+g1', 'abc-a'),
('a([bc]*)c*', 'abc', SUCCEED, 'found+"-"+g1', 'abc-bc'),
('a([bc]*)(c*d)', 'abcd', SUCCEED, 'found+"-"+g1+"-"+g2', 'abcd-bc-d'),
('a([bc]+)(c*d)', 'abcd', SUCCEED, 'found+"-"+g1+"-"+g2', 'abcd-bc-d'),
('a([bc]*)(c+d)', 'abcd', SUCCEED, 'found+"-"+g1+"-"+g2', 'abcd-b-cd'),
('a[bcd]*dcdcde', 'adcdcde', SUCCEED, 'found', 'adcdcde'),
('a[bcd]+dcdcde', 'adcdcde', FAIL),
('(ab|a)b*c', 'abc', SUCCEED, 'found+"-"+g1', 'abc-ab'),
('((a)(b)c)(d)', 'abcd', SUCCEED, 'g1+"-"+g2+"-"+g3+"-"+g4', 'abc-a-b-d'),
('[a-zA-Z_][a-zA-Z0-9_]*', 'alpha', SUCCEED, 'found', 'alpha'),
('^a(bc+|b[eh])g|.h$', 'abh', SUCCEED, 'found+"-"+g1', 'bh-None'),
('(bc+d$|ef*g.|h?i(j|k))', 'effgz', SUCCEED, 'found+"-"+g1+"-"+g2', 'effgz-effgz-None'),
('(bc+d$|ef*g.|h?i(j|k))', 'ij', SUCCEED, 'found+"-"+g1+"-"+g2', 'ij-ij-j'),
('(bc+d$|ef*g.|h?i(j|k))', 'effg', FAIL),
('(bc+d$|ef*g.|h?i(j|k))', 'bcdd', FAIL),
('(bc+d$|ef*g.|h?i(j|k))', 'reffgz', SUCCEED, 'found+"-"+g1+"-"+g2', 'effgz-effgz-None'),
('(((((((((a)))))))))', 'a', SUCCEED, 'found', 'a'),
('multiple words of text', 'uh-uh', FAIL),
('multiple words', 'multiple words, yeah', SUCCEED, 'found', 'multiple words'),
('(.*)c(.*)', 'abcde', SUCCEED, 'found+"-"+g1+"-"+g2', 'abcde-ab-de'),
('\\((.*), (.*)\\)', '(a, b)', SUCCEED, 'g2+"-"+g1', 'b-a'),
('[k]', 'ab', FAIL),
('a[-]?c', 'ac', SUCCEED, 'found', 'ac'),
('(abc)\\1', 'abcabc', SUCCEED, 'g1', 'abc'),
('([a-c]*)\\1', 'abcabc', SUCCEED, 'g1', 'abc'),
('^(.+)?B', 'AB', SUCCEED, 'g1', 'A'),
('(a+).\\1$', 'aaaaa', SUCCEED, 'found+"-"+g1', 'aaaaa-aa'),
('^(a+).\\1$', 'aaaa', FAIL),
('(abc)\\1', 'abcabc', SUCCEED, 'found+"-"+g1', 'abcabc-abc'),
('([a-c]+)\\1', 'abcabc', SUCCEED, 'found+"-"+g1', 'abcabc-abc'),
('(a)\\1', 'aa', SUCCEED, 'found+"-"+g1', 'aa-a'),
('(a+)\\1', 'aa', SUCCEED, 'found+"-"+g1', 'aa-a'),
('(a+)+\\1', 'aa', SUCCEED, 'found+"-"+g1', 'aa-a'),
('(a).+\\1', 'aba', SUCCEED, 'found+"-"+g1', 'aba-a'),
('(a)ba*\\1', 'aba', SUCCEED, 'found+"-"+g1', 'aba-a'),
('(aa|a)a\\1$', 'aaa', SUCCEED, 'found+"-"+g1', 'aaa-a'),
('(a|aa)a\\1$', 'aaa', SUCCEED, 'found+"-"+g1', 'aaa-a'),
('(a+)a\\1$', 'aaa', SUCCEED, 'found+"-"+g1', 'aaa-a'),
('([abc]*)\\1', 'abcabc', SUCCEED, 'found+"-"+g1', 'abcabc-abc'),
('(a)(b)c|ab', 'ab', SUCCEED, 'found+"-"+g1+"-"+g2', 'ab-None-None'),
('(a)+x', 'aaax', SUCCEED, 'found+"-"+g1', 'aaax-a'),
('([ac])+x', 'aacx', SUCCEED, 'found+"-"+g1', 'aacx-c'),
('([^/]*/)*sub1/', 'd:msgs/tdir/sub1/trial/away.cpp', SUCCEED, 'found+"-"+g1', 'd:msgs/tdir/sub1/-tdir/'),
('([^.]*)\\.([^:]*):[T ]+(.*)', 'track1.title:TBlah blah blah', SUCCEED, 'found+"-"+g1+"-"+g2+"-"+g3', 'track1.title:TBlah blah blah-track1-title-Blah blah blah'),
('([^N]*N)+', 'abNNxyzN', SUCCEED, 'found+"-"+g1', 'abNNxyzN-xyzN'),
('([^N]*N)+', 'abNNxyz', SUCCEED, 'found+"-"+g1', 'abNN-N'),
('([abc]*)x', 'abcx', SUCCEED, 'found+"-"+g1', 'abcx-abc'),
('([abc]*)x', 'abc', FAIL),
('([xyz]*)x', 'abcx', SUCCEED, 'found+"-"+g1', 'x-'),
('(a)+b|aac', 'aac', SUCCEED, 'found+"-"+g1', 'aac-None'),
# Test symbolic groups
('(?P<i d>aaa)a', 'aaaa', SYNTAX_ERROR),
('(?P<id>aaa)a', 'aaaa', SUCCEED, 'found+"-"+id', 'aaaa-aaa'),
('(?P<id>aa)(?P=id)', 'aaaa', SUCCEED, 'found+"-"+id', 'aaaa-aa'),
('(?P<id>aa)(?P=xd)', 'aaaa', SYNTAX_ERROR),
# Test octal escapes/memory references
('\\1', 'a', SYNTAX_ERROR),
# All tests from Perl
('ab{0,}bc', 'abbbbc', SUCCEED, 'found', 'abbbbc'),
('ab{1,}bc', 'abq', FAIL),
('ab{1,}bc', 'abbbbc', SUCCEED, 'found', 'abbbbc'),
('ab{1,3}bc', 'abbbbc', SUCCEED, 'found', 'abbbbc'),
('ab{3,4}bc', 'abbbbc', SUCCEED, 'found', 'abbbbc'),
('ab{4,5}bc', 'abbbbc', FAIL),
('ab{0,1}bc', 'abc', SUCCEED, 'found', 'abc'),
('ab{0,1}c', 'abc', SUCCEED, 'found', 'abc'),
('^', 'abc', SUCCEED, 'found', ''),
('$', 'abc', SUCCEED, 'found', ''),
('a[b-]', 'a-', SUCCEED, 'found', 'a-'),
('a[b-a]', '-', SYNTAX_ERROR),
('*a', '-', SYNTAX_ERROR),
('(*)b', '-', SYNTAX_ERROR),
('a{1,}b{1,}c', 'aabbabc', SUCCEED, 'found', 'abc'),
('a**', '-', SYNTAX_ERROR),
('a.+?c', 'abcabc', SUCCEED, 'found', 'abc'),
('(a+|b){0,}', 'ab', SUCCEED, 'found+"-"+g1', 'ab-b'),
('(a+|b){1,}', 'ab', SUCCEED, 'found+"-"+g1', 'ab-b'),
('(a+|b){0,1}', 'ab', SUCCEED, 'found+"-"+g1', 'a-a'),
('([abc])*d', 'abbbcd', SUCCEED, 'found+"-"+g1', 'abbbcd-c'),
('([abc])*bcd', 'abcd', SUCCEED, 'found+"-"+g1', 'abcd-a'),
('^(ab|cd)e', 'abcde', FAIL),
('((((((((((a))))))))))', 'a', SUCCEED, 'g10', 'a'),
('((((((((((a))))))))))\\10', 'aa', SUCCEED, 'found', 'aa'),
# Python does not have the same rules for \\41 so this is a syntax error
# ('((((((((((a))))))))))\\41', 'aa', FAIL),
# ('((((((((((a))))))))))\\41', 'a!', SUCCEED, 'found', 'a!'),
('((((((((((a))))))))))\\41', '', SYNTAX_ERROR),
('(?i)((((((((((a))))))))))\\41', '', SYNTAX_ERROR),
('(?i)abc', 'ABC', SUCCEED, 'found', 'ABC'),
('(?i)abc', 'XBC', FAIL),
('(?i)abc', 'AXC', FAIL),
('(?i)abc', 'ABX', FAIL),
('(?i)abc', 'XABCY', SUCCEED, 'found', 'ABC'),
('(?i)abc', 'ABABC', SUCCEED, 'found', 'ABC'),
('(?i)ab*c', 'ABC', SUCCEED, 'found', 'ABC'),
('(?i)ab*bc', 'ABC', SUCCEED, 'found', 'ABC'),
('(?i)ab*bc', 'ABBC', SUCCEED, 'found', 'ABBC'),
('(?i)ab*?bc', 'ABBBBC', SUCCEED, 'found', 'ABBBBC'),
('(?i)ab{0,}?bc', 'ABBBBC', SUCCEED, 'found', 'ABBBBC'),
('(?i)ab+?bc', 'ABBC', SUCCEED, 'found', 'ABBC'),
('(?i)ab+bc', 'ABC', FAIL),
('(?i)ab+bc', 'ABQ', FAIL),
('(?i)ab{1,}bc', 'ABQ', FAIL),
('(?i)ab+bc', 'ABBBBC', SUCCEED, 'found', 'ABBBBC'),
('(?i)ab{1,}?bc', 'ABBBBC', SUCCEED, 'found', 'ABBBBC'),
('(?i)ab{1,3}?bc', 'ABBBBC', SUCCEED, 'found', 'ABBBBC'),
('(?i)ab{3,4}?bc', 'ABBBBC', SUCCEED, 'found', 'ABBBBC'),
('(?i)ab{4,5}?bc', 'ABBBBC', FAIL),
('(?i)ab??bc', 'ABBC', SUCCEED, 'found', 'ABBC'),
('(?i)ab??bc', 'ABC', SUCCEED, 'found', 'ABC'),
('(?i)ab{0,1}?bc', 'ABC', SUCCEED, 'found', 'ABC'),
('(?i)ab??bc', 'ABBBBC', FAIL),
('(?i)ab??c', 'ABC', SUCCEED, 'found', 'ABC'),
('(?i)ab{0,1}?c', 'ABC', SUCCEED, 'found', 'ABC'),
('(?i)^abc$', 'ABC', SUCCEED, 'found', 'ABC'),
('(?i)^abc$', 'ABCC', FAIL),
('(?i)^abc', 'ABCC', SUCCEED, 'found', 'ABC'),
('(?i)^abc$', 'AABC', FAIL),
('(?i)abc$', 'AABC', SUCCEED, 'found', 'ABC'),
('(?i)^', 'ABC', SUCCEED, 'found', ''),
('(?i)$', 'ABC', SUCCEED, 'found', ''),
('(?i)a.c', 'ABC', SUCCEED, 'found', 'ABC'),
('(?i)a.c', 'AXC', SUCCEED, 'found', 'AXC'),
('(?i)a.*?c', 'AXYZC', SUCCEED, 'found', 'AXYZC'),
('(?i)a.*c', 'AXYZD', FAIL),
('(?i)a[bc]d', 'ABC', FAIL),
('(?i)a[bc]d', 'ABD', SUCCEED, 'found', 'ABD'),
('(?i)a[b-d]e', 'ABD', FAIL),
('(?i)a[b-d]e', 'ACE', SUCCEED, 'found', 'ACE'),
('(?i)a[b-d]', 'AAC', SUCCEED, 'found', 'AC'),
('(?i)a[-b]', 'A-', SUCCEED, 'found', 'A-'),
('(?i)a[b-]', 'A-', SUCCEED, 'found', 'A-'),
('(?i)a[b-a]', '-', SYNTAX_ERROR),
('(?i)a[]b', '-', SYNTAX_ERROR),
('(?i)a[', '-', SYNTAX_ERROR),
('(?i)a]', 'A]', SUCCEED, 'found', 'A]'),
('(?i)a[]]b', 'A]B', SUCCEED, 'found', 'A]B'),
('(?i)a[^bc]d', 'AED', SUCCEED, 'found', 'AED'),
('(?i)a[^bc]d', 'ABD', FAIL),
('(?i)a[^-b]c', 'ADC', SUCCEED, 'found', 'ADC'),
('(?i)a[^-b]c', 'A-C', FAIL),
('(?i)a[^]b]c', 'A]C', FAIL),
('(?i)a[^]b]c', 'ADC', SUCCEED, 'found', 'ADC'),
('(?i)ab|cd', 'ABC', SUCCEED, 'found', 'AB'),
('(?i)ab|cd', 'ABCD', SUCCEED, 'found', 'AB'),
('(?i)()ef', 'DEF', SUCCEED, 'found+"-"+g1', 'EF-'),
('(?i)*a', '-', SYNTAX_ERROR),
('(?i)(*)b', '-', SYNTAX_ERROR),
('(?i)$b', 'B', FAIL),
('(?i)a\\', '-', SYNTAX_ERROR),
('(?i)a\\(b', 'A(B', SUCCEED, 'found+"-"+g1', 'A(B-Error'),
('(?i)a\\(*b', 'AB', SUCCEED, 'found', 'AB'),
('(?i)a\\(*b', 'A((B', SUCCEED, 'found', 'A((B'),
('(?i)a\\\\b', 'A\\B', SUCCEED, 'found', 'A\\B'),
('(?i)abc)', '-', SYNTAX_ERROR),
('(?i)(abc', '-', SYNTAX_ERROR),
('(?i)((a))', 'ABC', SUCCEED, 'found+"-"+g1+"-"+g2', 'A-A-A'),
('(?i)(a)b(c)', 'ABC', SUCCEED, 'found+"-"+g1+"-"+g2', 'ABC-A-C'),
('(?i)a+b+c', 'AABBABC', SUCCEED, 'found', 'ABC'),
('(?i)a{1,}b{1,}c', 'AABBABC', SUCCEED, 'found', 'ABC'),
('(?i)a**', '-', SYNTAX_ERROR),
('(?i)a.+?c', 'ABCABC', SUCCEED, 'found', 'ABC'),
('(?i)a.*?c', 'ABCABC', SUCCEED, 'found', 'ABC'),
('(?i)a.{0,5}?c', 'ABCABC', SUCCEED, 'found', 'ABC'),
('(?i)(a+|b)*', 'AB', SUCCEED, 'found+"-"+g1', 'AB-B'),
('(?i)(a+|b){0,}', 'AB', SUCCEED, 'found+"-"+g1', 'AB-B'),
('(?i)(a+|b)+', 'AB', SUCCEED, 'found+"-"+g1', 'AB-B'),
('(?i)(a+|b){1,}', 'AB', SUCCEED, 'found+"-"+g1', 'AB-B'),
('(?i)(a+|b)?', 'AB', SUCCEED, 'found+"-"+g1', 'A-A'),
('(?i)(a+|b){0,1}', 'AB', SUCCEED, 'found+"-"+g1', 'A-A'),
('(?i)(a+|b){0,1}?', 'AB', SUCCEED, 'found+"-"+g1', '-None'),
('(?i))(', '-', SYNTAX_ERROR),
('(?i)[^ab]*', 'CDE', SUCCEED, 'found', 'CDE'),
('(?i)abc', '', FAIL),
('(?i)a*', '', SUCCEED, 'found', ''),
('(?i)([abc])*d', 'ABBBCD', SUCCEED, 'found+"-"+g1', 'ABBBCD-C'),
('(?i)([abc])*bcd', 'ABCD', SUCCEED, 'found+"-"+g1', 'ABCD-A'),
('(?i)a|b|c|d|e', 'E', SUCCEED, 'found', 'E'),
('(?i)(a|b|c|d|e)f', 'EF', SUCCEED, 'found+"-"+g1', 'EF-E'),
('(?i)abcd*efg', 'ABCDEFG', SUCCEED, 'found', 'ABCDEFG'),
('(?i)ab*', 'XABYABBBZ', SUCCEED, 'found', 'AB'),
('(?i)ab*', 'XAYABBBZ', SUCCEED, 'found', 'A'),
('(?i)(ab|cd)e', 'ABCDE', SUCCEED, 'found+"-"+g1', 'CDE-CD'),
('(?i)[abhgefdc]ij', 'HIJ', SUCCEED, 'found', 'HIJ'),
('(?i)^(ab|cd)e', 'ABCDE', FAIL),
('(?i)(abc|)ef', 'ABCDEF', SUCCEED, 'found+"-"+g1', 'EF-'),
('(?i)(a|b)c*d', 'ABCD', SUCCEED, 'found+"-"+g1', 'BCD-B'),
('(?i)(ab|ab*)bc', 'ABC', SUCCEED, 'found+"-"+g1', 'ABC-A'),
('(?i)a([bc]*)c*', 'ABC', SUCCEED, 'found+"-"+g1', 'ABC-BC'),
('(?i)a([bc]*)(c*d)', 'ABCD', SUCCEED, 'found+"-"+g1+"-"+g2', 'ABCD-BC-D'),
('(?i)a([bc]+)(c*d)', 'ABCD', SUCCEED, 'found+"-"+g1+"-"+g2', 'ABCD-BC-D'),
('(?i)a([bc]*)(c+d)', 'ABCD', SUCCEED, 'found+"-"+g1+"-"+g2', 'ABCD-B-CD'),
('(?i)a[bcd]*dcdcde', 'ADCDCDE', SUCCEED, 'found', 'ADCDCDE'),
('(?i)a[bcd]+dcdcde', 'ADCDCDE', FAIL),
('(?i)(ab|a)b*c', 'ABC', SUCCEED, 'found+"-"+g1', 'ABC-AB'),
('(?i)((a)(b)c)(d)', 'ABCD', SUCCEED, 'g1+"-"+g2+"-"+g3+"-"+g4', 'ABC-A-B-D'),
('(?i)[a-zA-Z_][a-zA-Z0-9_]*', 'ALPHA', SUCCEED, 'found', 'ALPHA'),
('(?i)^a(bc+|b[eh])g|.h$', 'ABH', SUCCEED, 'found+"-"+g1', 'BH-None'),
('(?i)(bc+d$|ef*g.|h?i(j|k))', 'EFFGZ', SUCCEED, 'found+"-"+g1+"-"+g2', 'EFFGZ-EFFGZ-None'),
('(?i)(bc+d$|ef*g.|h?i(j|k))', 'IJ', SUCCEED, 'found+"-"+g1+"-"+g2', 'IJ-IJ-J'),
('(?i)(bc+d$|ef*g.|h?i(j|k))', 'EFFG', FAIL),
('(?i)(bc+d$|ef*g.|h?i(j|k))', 'BCDD', FAIL),
('(?i)(bc+d$|ef*g.|h?i(j|k))', 'REFFGZ', SUCCEED, 'found+"-"+g1+"-"+g2', 'EFFGZ-EFFGZ-None'),
('(?i)((((((((((a))))))))))', 'A', SUCCEED, 'g10', 'A'),
('(?i)((((((((((a))))))))))\\10', 'AA', SUCCEED, 'found', 'AA'),
#('(?i)((((((((((a))))))))))\\41', 'AA', FAIL),
#('(?i)((((((((((a))))))))))\\41', 'A!', SUCCEED, 'found', 'A!'),
('(?i)(((((((((a)))))))))', 'A', SUCCEED, 'found', 'A'),
('(?i)(?:(?:(?:(?:(?:(?:(?:(?:(?:(a))))))))))', 'A', SUCCEED, 'g1', 'A'),
('(?i)(?:(?:(?:(?:(?:(?:(?:(?:(?:(a|b|c))))))))))', 'C', SUCCEED, 'g1', 'C'),
('(?i)multiple words of text', 'UH-UH', FAIL),
('(?i)multiple words', 'MULTIPLE WORDS, YEAH', SUCCEED, 'found', 'MULTIPLE WORDS'),
('(?i)(.*)c(.*)', 'ABCDE', SUCCEED, 'found+"-"+g1+"-"+g2', 'ABCDE-AB-DE'),
('(?i)\\((.*), (.*)\\)', '(A, B)', SUCCEED, 'g2+"-"+g1', 'B-A'),
('(?i)[k]', 'AB', FAIL),
# ('(?i)abcd', 'ABCD', SUCCEED, 'found+"-"+\\found+"-"+\\\\found', 'ABCD-$&-\\ABCD'),
# ('(?i)a(bc)d', 'ABCD', SUCCEED, 'g1+"-"+\\g1+"-"+\\\\g1', 'BC-$1-\\BC'),
('(?i)a[-]?c', 'AC', SUCCEED, 'found', 'AC'),
('(?i)(abc)\\1', 'ABCABC', SUCCEED, 'g1', 'ABC'),
('(?i)([a-c]*)\\1', 'ABCABC', SUCCEED, 'g1', 'ABC'),
('a(?!b).', 'abad', SUCCEED, 'found', 'ad'),
('a(?=d).', 'abad', SUCCEED, 'found', 'ad'),
('a(?=c|d).', 'abad', SUCCEED, 'found', 'ad'),
('a(?:b|c|d)(.)', 'ace', SUCCEED, 'g1', 'e'),
('a(?:b|c|d)*(.)', 'ace', SUCCEED, 'g1', 'e'),
('a(?:b|c|d)+?(.)', 'ace', SUCCEED, 'g1', 'e'),
('a(?:b|(c|e){1,2}?|d)+?(.)', 'ace', SUCCEED, 'g1 + g2', 'ce'),
# lookbehind: split by : but not if it is escaped by -.
('(?<!-):(.*?)(?<!-):', 'a:bc-:de:f', SUCCEED, 'g1', 'bc-:de' ),
# escaping with \ as we know it
('(?<!\\\):(.*?)(?<!\\\):', 'a:bc\\:de:f', SUCCEED, 'g1', 'bc\\:de' ),
# terminating with ' and escaping with ? as in edifact
("(?<!\\?)'(.*?)(?<!\\?)'", "a'bc?'de'f", SUCCEED, 'g1', "bc?'de" ),
# Comments using the (?#...) syntax
('w(?# comment', 'w', SYNTAX_ERROR),
('w(?# comment 1)xy(?# comment 2)z', 'wxyz', SUCCEED, 'found', 'wxyz'),
# Check odd placement of embedded pattern modifiers
# not an error under PCRE/PRE:
('w(?i)', 'W', SUCCEED, 'found', 'W'),
# ('w(?i)', 'W', SYNTAX_ERROR),
# Comments using the x embedded pattern modifier
("""(?x)w# comment 1
x y
# comment 2
z""", 'wxyz', SUCCEED, 'found', 'wxyz'),
# using the m embedded pattern modifier
('^abc', """jkl
abc
xyz""", FAIL),
('(?m)^abc', """jkl
xyz""", SUCCEED, 'found', 'abc'),
('(?m)abc$', """jkl
xyzabc
123""", SUCCEED, 'found', 'abc'),
# using the s embedded pattern modifier
# test \w, etc. both inside and outside character classes
('\\w+', '--ab_cd0123--', SUCCEED, 'found', 'ab_cd0123'),
('[\\w]+', '--ab_cd0123--', SUCCEED, 'found', 'ab_cd0123'),
('\\D+', '1234abc5678', SUCCEED, 'found', 'abc'),
('[\\D]+', '1234abc5678', SUCCEED, 'found', 'abc'),
('[\\da-fA-F]+', '123abc', SUCCEED, 'found', '123abc'),
# ('[\\d-x]', '-', SYNTAX_ERROR),
(r'([\s]*)([\S]*)([\s]*)', ' testing!1972', SUCCEED, 'g3+g2+g1', 'testing!1972 '),
(r'(\s*)(\S*)(\s*)', ' testing!1972', SUCCEED, 'g3+g2+g1', 'testing!1972 '),
(r'\x00ff', '\377', FAIL),
# (r'\x00ff', '\377', SUCCEED, 'found', chr(255)),
(r'\t\n\v\r\f\a\g', '\t\n\v\r\f\ag', SUCCEED, 'found', '\t\n\v\r\f\ag'),
('\t\n\v\r\f\a\g', '\t\n\v\r\f\ag', SUCCEED, 'found', '\t\n\v\r\f\ag'),
(r'\t\n\v\r\f\a', '\t\n\v\r\f\a', SUCCEED, 'found', chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)),
(r'[\t][\n][\v][\r][\f][\b]', '\t\n\v\r\f\b', SUCCEED, 'found', '\t\n\v\r\f\b'),
#
# post-1.5.2 additions
# xmllib problem
(r'(([a-z]+):)?([a-z]+)$', 'smil', SUCCEED, 'g1+"-"+g2+"-"+g3', 'None-None-smil'),
# bug 110866: reference to undefined group
(r'((.)\1+)', '', SYNTAX_ERROR),
# bug 111869: search (PRE/PCRE fails on this one, SRE doesn't)
(r'.*d', 'abc\nabd', SUCCEED, 'found', 'abd'),
# bug 112468: various expected syntax errors
(r'(', '', SYNTAX_ERROR),
(r'[\41]', '!', SUCCEED, 'found', '!'),
# bug 114033: nothing to repeat
(r'(x?)?', 'x', SUCCEED, 'found', 'x'),
# bug 115040: rescan if flags are modified inside pattern
(r' (?x)foo ', 'foo', SUCCEED, 'found', 'foo'),
# bug 115618: negative lookbehind
(r'(?<!abc)(d.f)', 'abcdefdof', SUCCEED, 'found', 'dof'),
# bug 116251: character class bug
(r'[\w-]+', 'laser_beam', SUCCEED, 'found', 'laser_beam'),
# bug 123769+127259: non-greedy backtracking bug
(r'.*?\S *:', 'xx:', SUCCEED, 'found', 'xx:'),
(r'a[ ]*?\ (\d+).*', 'a 10', SUCCEED, 'found', 'a 10'),
(r'a[ ]*?\ (\d+).*', 'a 10', SUCCEED, 'found', 'a 10'),
# bug 127259: \Z shouldn't depend on multiline mode
(r'(?ms).*?x\s*\Z(.*)','xx\nx\n', SUCCEED, 'g1', ''),
# bug 128899: uppercase literals under the ignorecase flag
(r'(?i)M+', 'MMM', SUCCEED, 'found', 'MMM'),
(r'(?i)m+', 'MMM', SUCCEED, 'found', 'MMM'),
(r'(?i)[M]+', 'MMM', SUCCEED, 'found', 'MMM'),
(r'(?i)[m]+', 'MMM', SUCCEED, 'found', 'MMM'),
# bug 130748: ^* should be an error (nothing to repeat)
(r'^*', '', SYNTAX_ERROR),
# bug 133283: minimizing repeat problem
(r'"(?:\\"|[^"])*?"', r'"\""', SUCCEED, 'found', r'"\""'),
# bug 477728: minimizing repeat problem
(r'^.*?$', 'one\ntwo\nthree\n', FAIL),
# bug 483789: minimizing repeat problem
(r'a[^>]*?b', 'a>b', FAIL),
# bug 490573: minimizing repeat problem
(r'^a*?$', 'foo', FAIL),
# bug 470582: nested groups problem
(r'^((a)c)?(ab)$', 'ab', SUCCEED, 'g1+"-"+g2+"-"+g3', 'None-None-ab'),
# another minimizing repeat problem (capturing groups in assertions)
('^([ab]*?)(?=(b)?)c', 'abc', SUCCEED, 'g1+"-"+g2', 'ab-None'),
('^([ab]*?)(?!(b))c', 'abc', SUCCEED, 'g1+"-"+g2', 'ab-None'),
('^([ab]*?)(?<!(a))c', 'abc', SUCCEED, 'g1+"-"+g2', 'ab-None'),
# Single non-ASCII letter shared by the Unicode-flag cases below.
u = '\N{LATIN CAPITAL LETTER A WITH DIAERESIS}'
# Append the word-boundary regression cases to the main test table in place.
tests += [
    # bug 410271: \b broken under locales
    (r'\b.\b', 'a', SUCCEED, 'found', 'a'),
    (r'(?u)\b.\b', u, SUCCEED, 'found', u),
    (r'(?u)\w', u, SUCCEED, 'found', u),
]

View File

@@ -1,11 +1,11 @@
package regex
package main
import "errors"
// Helper functions for slices, to make them behave more like stacks
// peek returns the top (last) element of the stack s without removing it.
// If s is empty, it returns the zero value of T and a non-nil error.
func peek[T any](s []T) (T, error) {
	if len(s) < 1 {
		// Error strings are kept lowercase, per Go convention.
		var zero T
		return zero, errors.New("stack empty")
	}
	return s[len(s)-1], nil
}
@@ -20,7 +20,7 @@ func mustPop[T any](sp *[]T) T {
func pop[T any](sp *[]T) (T, error) {
if len(*sp) < 1 {
return *new(T), errors.New("stack empty")
return *new(T), errors.New("Stack empty")
}
to_return := (*sp)[len(*sp)-1]
*sp = (*sp)[:len(*sp)-1]

View File

@@ -1,4 +1,4 @@
package regex
package main
// stateContents represents the contents of the current state. It is a slice
// because character classes can have multiple contents.
type stateContents []int

View File

@@ -16,6 +16,7 @@ func (s *uniq_arr[T]) add(vals ...T) {
s.backingMap[item] = struct{}{}
}
}
return
}
func (s uniq_arr[T]) contains(val T) bool {