Compare commits
54 Commits
v0.1.0
...
e489dc4c27
| Author | SHA1 | Date | |
|---|---|---|---|
| e489dc4c27 | |||
| 34149980a4 | |||
| e79c19a929 | |||
| d2bce37935 | |||
| bb3b866b77 | |||
| e07f27dc78 | |||
| 65d2317f79 | |||
| a631fc289c | |||
| d62a429cce | |||
| 7b31031553 | |||
| 38c842cb07 | |||
| 9f9af36be8 | |||
| 8217b67122 | |||
| 1f06dcef64 | |||
| 119475b41b | |||
| 6151cc8cf6 | |||
| 3eaf4eb19c | |||
| d453815831 | |||
| 3a2916baae | |||
| 9d6344719f | |||
| f5c868566b | |||
| 1cd6da218f | |||
| 277cbc0fc5 | |||
| 3924502b72 | |||
| 36b009747b | |||
| 6cd0a10a8f | |||
| 69fb96c43d | |||
| 46bc0c8529 | |||
| 1a890a1e75 | |||
| fde3784e5a | |||
| 7045711860 | |||
| d4d606d95b | |||
| 9cd330e521 | |||
| 44d6a2005c | |||
| f76cd6c3d9 | |||
| 375baa1722 | |||
| 2e47c631bb | |||
| 81b8b1b11c | |||
| 2934e7a20f | |||
| f466d4a8d5 | |||
| 8327450dd2 | |||
| 073f231b89 | |||
| 3b7257c921 | |||
| 668df8b70a | |||
| 214acf7e0f | |||
| 50221ff4d9 | |||
| 5ab95f512a | |||
| e7da678408 | |||
| ab363e2766 | |||
| c803e45415 | |||
| 525296f239 | |||
| eb0ab9f7ec | |||
| 17a7dbae4c | |||
| f2279acd98 |
17
README.md
Normal file
17
README.md
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
## Kleingrep
|
||||||
|
|
||||||
|
Kleingrep is a regular expression engine, providing a library and command-line tool written in Go.
|
||||||
|
|
||||||
|
It aims to provide a more featureful engine, compared to the one in Go's
|
||||||
|
[regexp](https://pkg.go.dev/regexp), while retaining some semblance of efficiency.
|
||||||
|
|
||||||
|
The engine does __not__ use backtracking, relying on the NFA-based method described in
|
||||||
|
[Russ Cox's articles](https://swtch.com/~rsc/regexp). As such, it is immune to catastrophic backtracking.
|
||||||
|
|
||||||
|
It also includes features not present in regexp, such as lookarounds and backreferences.
|
||||||
|
|
||||||
|
### Syntax
|
||||||
|
|
||||||
|
The syntax is, for the most part, a superset of Go's regexp. A full overview of the syntax can be found [here](https://pkg.go.dev/gitea.twomorecents.org/Rockingcool/kleingrep/regex#hdr-Syntax).
|
||||||
|
|
||||||
|
__For more information, see https://pkg.go.dev/gitea.twomorecents.org/Rockingcool/kleingrep/regex__.
|
||||||
278
cmd/main.go
278
cmd/main.go
@@ -6,6 +6,7 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"os"
|
"os"
|
||||||
|
"slices"
|
||||||
|
|
||||||
"github.com/fatih/color"
|
"github.com/fatih/color"
|
||||||
|
|
||||||
@@ -25,6 +26,8 @@ func main() {
|
|||||||
multiLineFlag := flag.Bool("t", false, "Multi-line mode. Treats newline just like any character.")
|
multiLineFlag := flag.Bool("t", false, "Multi-line mode. Treats newline just like any character.")
|
||||||
printMatchesFlag := flag.Bool("p", false, "Prints start and end index of each match. Can only be used with '-t' for multi-line mode.")
|
printMatchesFlag := flag.Bool("p", false, "Prints start and end index of each match. Can only be used with '-t' for multi-line mode.")
|
||||||
caseInsensitiveFlag := flag.Bool("i", false, "Case-insensitive. Disregard the case of all characters.")
|
caseInsensitiveFlag := flag.Bool("i", false, "Case-insensitive. Disregard the case of all characters.")
|
||||||
|
recursiveFlag := flag.Bool("r", false, "Recursively search all files in the given directory.")
|
||||||
|
lineNumFlag := flag.Bool("n", false, "For each line with a match, print the line number. Implies -l.")
|
||||||
matchNum := flag.Int("m", 0, "Print the match with the given index. Eg. -m 3 prints the third match.")
|
matchNum := flag.Int("m", 0, "Print the match with the given index. Eg. -m 3 prints the third match.")
|
||||||
substituteText := flag.String("s", "", "Substitute the contents of each match with the given string. Overrides -o and -v")
|
substituteText := flag.String("s", "", "Substitute the contents of each match with the given string. Overrides -o and -v")
|
||||||
flag.Parse()
|
flag.Parse()
|
||||||
@@ -64,162 +67,197 @@ func main() {
|
|||||||
// 2. Build NFA from postfix representation (Thompson's algorithm)
|
// 2. Build NFA from postfix representation (Thompson's algorithm)
|
||||||
// 3. Run the string against the NFA
|
// 3. Run the string against the NFA
|
||||||
|
|
||||||
if len(flag.Args()) != 1 { // flag.Args() also strips out program name
|
if len(flag.Args()) < 1 { // flag.Args() also strips out program name
|
||||||
|
fmt.Println("ERROR: Missing cmdline args")
|
||||||
|
os.Exit(22)
|
||||||
|
}
|
||||||
|
if *recursiveFlag && len(flag.Args()) < 2 { // File/Directory must be provided with '-r'
|
||||||
fmt.Println("ERROR: Missing cmdline args")
|
fmt.Println("ERROR: Missing cmdline args")
|
||||||
os.Exit(22)
|
os.Exit(22)
|
||||||
}
|
}
|
||||||
var re string
|
var re string
|
||||||
re = flag.Args()[0]
|
re = flag.Args()[0]
|
||||||
|
var inputFiles []*os.File
|
||||||
|
if len(flag.Args()) == 1 || flag.Args()[1] == "-" { // Either no file argument, or file argument is "-"
|
||||||
|
if !slices.Contains(inputFiles, os.Stdin) {
|
||||||
|
inputFiles = append(inputFiles, os.Stdin) // os.Stdin cannot be entered more than once into the file list
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
inputFilenames := flag.Args()[1:]
|
||||||
|
for _, inputFilename := range inputFilenames {
|
||||||
|
inputFile, err := os.Open(inputFilename)
|
||||||
|
if err != nil {
|
||||||
|
fmt.Printf("%s: No such file or directory\n", flag.Args()[1])
|
||||||
|
os.Exit(2)
|
||||||
|
}
|
||||||
|
inputFiles = append(inputFiles, inputFile)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
var test_str string
|
var test_str string
|
||||||
var err error
|
var err error
|
||||||
var linesRead bool // Whether or not we have read the lines in the file
|
var linesRead bool // Whether or not we have read the lines in the file
|
||||||
lineNum := 0 // Current line number
|
lineNum := 0 // Current line number
|
||||||
// Create reader for stdin and writer for stdout
|
// Create writer for stdout
|
||||||
reader := bufio.NewReader(os.Stdin)
|
|
||||||
out := bufio.NewWriter(os.Stdout)
|
out := bufio.NewWriter(os.Stdout)
|
||||||
|
// Compile regex
|
||||||
regComp, err := reg.Compile(re, flagsToCompile...)
|
regComp, err := reg.Compile(re, flagsToCompile...)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
fmt.Println(err)
|
fmt.Println(err)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
for true {
|
|
||||||
if linesRead {
|
for _, inputFile := range inputFiles {
|
||||||
break
|
reader := bufio.NewReader(inputFile)
|
||||||
}
|
linesRead = false
|
||||||
if !(*multiLineFlag) {
|
for true {
|
||||||
// Read every string from stdin until we encounter an error. If the error isn't EOF, panic.
|
if linesRead {
|
||||||
test_str, err = reader.ReadString('\n')
|
break
|
||||||
lineNum++
|
}
|
||||||
if err != nil {
|
if !(*multiLineFlag) {
|
||||||
|
// Read one line from the current input file. On EOF, mark the file as fully read; on any other error, panic.
|
||||||
|
test_str, err = reader.ReadString('\n')
|
||||||
|
lineNum++
|
||||||
|
if err != nil {
|
||||||
|
if err == io.EOF {
|
||||||
|
linesRead = true
|
||||||
|
} else {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(test_str) > 0 && test_str[len(test_str)-1] == '\n' {
|
||||||
|
test_str = test_str[:len(test_str)-1]
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Multi-line mode - read every line of input into a temp. string.
|
||||||
|
// test_str will contain all lines of input (including newline characters)
|
||||||
|
// as one string.
|
||||||
|
var temp string
|
||||||
|
for temp, err = reader.ReadString('\n'); err == nil; temp, err = reader.ReadString('\n') {
|
||||||
|
test_str += temp
|
||||||
|
}
|
||||||
|
// Assuming err != nil
|
||||||
if err == io.EOF {
|
if err == io.EOF {
|
||||||
|
if len(temp) > 0 {
|
||||||
|
test_str += temp // Add the last line (if it is non-empty)
|
||||||
|
}
|
||||||
linesRead = true
|
linesRead = true
|
||||||
} else {
|
} else {
|
||||||
panic(err)
|
panic(err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if len(test_str) > 0 && test_str[len(test_str)-1] == '\n' {
|
matchIndices := make([]reg.Match, 0)
|
||||||
test_str = test_str[:len(test_str)-1]
|
if matchNumFlagEnabled {
|
||||||
}
|
tmp, err := regComp.FindNthMatch(test_str, *matchNum)
|
||||||
} else {
|
if err == nil {
|
||||||
// Multi-line mode - read every line of input into a temp. string.
|
matchIndices = append(matchIndices, tmp)
|
||||||
// test_str will contain all lines of input (including newline characters)
|
|
||||||
// as one string.
|
|
||||||
var temp string
|
|
||||||
for temp, err = reader.ReadString('\n'); err == nil; temp, err = reader.ReadString('\n') {
|
|
||||||
test_str += temp
|
|
||||||
}
|
|
||||||
// Assuming err != nil
|
|
||||||
if err == io.EOF {
|
|
||||||
if len(temp) > 0 {
|
|
||||||
test_str += temp // Add the last line (if it is non-empty)
|
|
||||||
}
|
}
|
||||||
linesRead = true
|
|
||||||
} else {
|
} else {
|
||||||
panic(err)
|
matchIndices = regComp.FindAllSubmatch(test_str)
|
||||||
}
|
}
|
||||||
}
|
|
||||||
matchIndices := make([]reg.Match, 0)
|
|
||||||
if matchNumFlagEnabled {
|
|
||||||
tmp, err := regComp.FindNthMatch(test_str, *matchNum)
|
|
||||||
if err == nil {
|
|
||||||
matchIndices = append(matchIndices, tmp)
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
matchIndices = regComp.FindAllSubmatch(test_str)
|
|
||||||
}
|
|
||||||
|
|
||||||
if *printMatchesFlag {
|
test_str_runes := []rune(test_str) // Converting to runes preserves unicode characters
|
||||||
// if we are in single line mode, print the line on which
|
|
||||||
// the matches occur
|
|
||||||
if len(matchIndices) > 0 {
|
|
||||||
if !(*multiLineFlag) {
|
|
||||||
fmt.Fprintf(out, "Line %d:\n", lineNum)
|
|
||||||
}
|
|
||||||
for _, m := range matchIndices {
|
|
||||||
fmt.Fprintf(out, "%s\n", m.String())
|
|
||||||
}
|
|
||||||
err := out.Flush()
|
|
||||||
if err != nil {
|
|
||||||
panic(err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
// Decompose the array of matchIndex structs into a flat unique array of ints - if matchIndex is {4,7}, flat array will contain 4,5,6
|
|
||||||
// This should make checking O(1) instead of O(n)
|
|
||||||
indicesToPrint := new_uniq_arr[int]()
|
|
||||||
for _, idx := range matchIndices {
|
|
||||||
indicesToPrint.add(genRange(idx[0].StartIdx, idx[0].EndIdx)...)
|
|
||||||
}
|
|
||||||
// If we are inverting, then we should print the indices which _didn't_ match
|
|
||||||
// in color.
|
|
||||||
if *invertFlag {
|
|
||||||
oldIndices := indicesToPrint.values()
|
|
||||||
indicesToPrint = new_uniq_arr[int]()
|
|
||||||
// Explanation:
|
|
||||||
// Find all numbers from 0 to len(test_str) that are NOT in oldIndices.
|
|
||||||
// These are the values we want to print, now that we have inverted the match.
|
|
||||||
// Re-initialize indicesToPrint and add all of these values to it.
|
|
||||||
indicesToPrint.add(setDifference(genRange(0, len(test_str)), oldIndices)...)
|
|
||||||
|
|
||||||
}
|
if *printMatchesFlag {
|
||||||
// If lineFlag is enabled, we should only print something if:
|
// if we are in single line mode, print the line on which
|
||||||
// a. We are not inverting, and have at least one match on the current line
|
// the matches occur
|
||||||
// OR
|
if len(matchIndices) > 0 {
|
||||||
// b. We are inverting, and have no matches at all on the current line.
|
if !(*multiLineFlag) {
|
||||||
// This checks for the inverse, and continues if it is true.
|
fmt.Fprintf(out, "Line %d:\n", lineNum)
|
||||||
if *lineFlag {
|
}
|
||||||
if !(*invertFlag) && len(matchIndices) == 0 || *invertFlag && len(matchIndices) > 0 {
|
for _, m := range matchIndices {
|
||||||
|
fmt.Fprintf(out, "%s\n", m.String())
|
||||||
|
}
|
||||||
|
err := out.Flush()
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
}
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
}
|
// Decompose the array of matchIndex structs into a flat unique array of ints - if matchIndex is {4,7}, flat array will contain 4,5,6
|
||||||
|
// This should make checking O(1) instead of O(n)
|
||||||
|
indicesToPrint := new_uniq_arr[int]()
|
||||||
|
for _, idx := range matchIndices {
|
||||||
|
indicesToPrint.add(genRange(idx[0].StartIdx, idx[0].EndIdx)...)
|
||||||
|
}
|
||||||
|
// If we are inverting, then we should print the indices which _didn't_ match
|
||||||
|
// in color.
|
||||||
|
if *invertFlag {
|
||||||
|
oldIndices := indicesToPrint.values()
|
||||||
|
indicesToPrint = new_uniq_arr[int]()
|
||||||
|
// Explanation:
|
||||||
|
// Find all numbers from 0 to len(test_str_runes) that are NOT in oldIndices.
|
||||||
|
// These are the values we want to print, now that we have inverted the match.
|
||||||
|
// Re-initialize indicesToPrint and add all of these values to it.
|
||||||
|
indicesToPrint.add(setDifference(genRange(0, len(test_str_runes)), oldIndices)...)
|
||||||
|
|
||||||
// If we are substituting, we need a different behavior, as follows:
|
}
|
||||||
// For every character in the test string:
|
// If lineFlag is enabled, we should only print something if:
|
||||||
// 1. Check if the index is the start of any matchIndex
|
// a. We are not inverting, and have at least one match on the current line
|
||||||
// 2. If so, print the substitute text, and set our index to
|
// OR
|
||||||
// the corresponding end index.
|
// b. We are inverting, and have no matches at all on the current line.
|
||||||
// 3. If not, just print the character.
|
// This checks for the inverse, and continues if it is true.
|
||||||
if substituteFlagEnabled {
|
if *lineFlag {
|
||||||
for i := range test_str {
|
if !(*invertFlag) && len(matchIndices) == 0 || *invertFlag && len(matchIndices) > 0 {
|
||||||
inMatchIndex := false
|
continue
|
||||||
for _, m := range matchIndices {
|
} else {
|
||||||
if i == m[0].StartIdx {
|
color.New(color.FgMagenta).Fprintf(out, "%s: ", inputFile.Name()) // Print filename
|
||||||
fmt.Fprintf(out, "%s", *substituteText)
|
|
||||||
i = m[0].EndIdx
|
|
||||||
inMatchIndex = true
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if !inMatchIndex {
|
|
||||||
fmt.Fprintf(out, "%c", test_str[i])
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
for i, c := range test_str {
|
// If we are substituting, we need a different behavior, as follows:
|
||||||
if indicesToPrint.contains(i) {
|
// For every character in the test string:
|
||||||
color.New(color.FgRed).Fprintf(out, "%c", c)
|
// 1. Check if the index is the start of any matchIndex
|
||||||
// Newline after every match - only if -o is enabled and -v is disabled.
|
// 2. If so, print the substitute text, and set our index to
|
||||||
if *onlyFlag && !(*invertFlag) {
|
// the corresponding end index.
|
||||||
for _, idx := range matchIndices {
|
// 3. If not, just print the character.
|
||||||
if i+1 == idx[0].EndIdx { // End index is one more than last index of match
|
if substituteFlagEnabled {
|
||||||
fmt.Fprintf(out, "\n")
|
for i := range test_str_runes {
|
||||||
break
|
inMatchIndex := false
|
||||||
}
|
for _, m := range matchIndices {
|
||||||
|
if i == m[0].StartIdx {
|
||||||
|
fmt.Fprintf(out, "%s", *substituteText)
|
||||||
|
i = m[0].EndIdx
|
||||||
|
inMatchIndex = true
|
||||||
|
break
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
if !inMatchIndex {
|
||||||
if !(*onlyFlag) {
|
fmt.Fprintf(out, "%c", test_str_runes[i])
|
||||||
fmt.Fprintf(out, "%c", c)
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for i, c := range test_str_runes {
|
||||||
|
if indicesToPrint.contains(i) {
|
||||||
|
color.New(color.FgRed, color.Bold).Fprintf(out, "%c", c)
|
||||||
|
// Newline after every match - only if -o is enabled and -v is disabled.
|
||||||
|
if *onlyFlag && !(*invertFlag) {
|
||||||
|
for matchIdxNum, idx := range matchIndices {
|
||||||
|
if matchIdxNum < len(matchIndices)-1 { // Only print a newline after printing a match if there are multiple matches on the line and we aren't on the last one. The newline that gets added at the end of the line takes care of the last match.
|
||||||
|
if i+1 == idx[0].EndIdx { // End index is one more than last index of match
|
||||||
|
fmt.Fprintf(out, "\n")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if !(*onlyFlag) {
|
||||||
|
fmt.Fprintf(out, "%c", c)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
err = out.Flush()
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
// If the last character in the string wasn't a newline, AND either -o is not set or it is set and we've matched something, then print a newline
|
||||||
|
if (len(test_str_runes) > 0 && test_str_runes[len(test_str_runes)-1] != '\n') &&
|
||||||
|
(!*onlyFlag || indicesToPrint.len() > 0) {
|
||||||
|
fmt.Println()
|
||||||
|
}
|
||||||
}
|
}
|
||||||
err = out.Flush()
|
|
||||||
if err != nil {
|
|
||||||
panic(err)
|
|
||||||
}
|
|
||||||
fmt.Println()
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -36,3 +36,7 @@ func (s uniq_arr[T]) values() []T {
|
|||||||
}
|
}
|
||||||
return toRet
|
return toRet
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (s uniq_arr[T]) len() int {
|
||||||
|
return len(s.backingMap)
|
||||||
|
}
|
||||||
|
|||||||
253
regex/compile.go
253
regex/compile.go
@@ -31,6 +31,22 @@ func (re Reg) String() string {
|
|||||||
return re.str
|
return re.str
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// MarshalText implements [encoding.TextMarshaler]. The output is equivalent to that of [Reg.String].
|
||||||
|
// Any flags passed as arguments (including calling [Reg.Longest]) are lost.
|
||||||
|
func (re *Reg) MarshalText() ([]byte, error) {
|
||||||
|
return []byte(re.String()), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// UnmarshalText implements [encoding.TextUnmarshaler]. It calls [Compile] on the given byte-slice. If compilation succeeds,
|
||||||
|
// the compiled expression is stored in re. The error returned by [Compile] (nil on success) is returned.
|
||||||
|
func (re *Reg) UnmarshalText(text []byte) error {
|
||||||
|
newReg, err := Compile(string(text))
|
||||||
|
if err == nil {
|
||||||
|
*re = newReg
|
||||||
|
}
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
func (re *Reg) Longest() {
|
func (re *Reg) Longest() {
|
||||||
re.preferLongest = true
|
re.preferLongest = true
|
||||||
}
|
}
|
||||||
@@ -48,7 +64,7 @@ const (
|
|||||||
)
|
)
|
||||||
|
|
||||||
func isOperator(c rune) bool {
|
func isOperator(c rune) bool {
|
||||||
if c == '+' || c == '?' || c == '*' || c == '|' || c == concatRune {
|
if c == '+' || c == '?' || c == '*' || c == '|' || c == concatRune || c == lazyPlusRune || c == lazyKleeneRune || c == lazyQuestionRune {
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
return false
|
return false
|
||||||
@@ -56,7 +72,7 @@ func isOperator(c rune) bool {
|
|||||||
|
|
||||||
/* priority returns the priority of the given operator */
|
/* priority returns the priority of the given operator */
|
||||||
func priority(op rune) int {
|
func priority(op rune) int {
|
||||||
precedence := []rune{'|', concatRune, '+', '*', '?'}
|
precedence := []rune{'|', concatRune, '+', lazyPlusRune, '*', lazyKleeneRune, '?', lazyQuestionRune}
|
||||||
return slices.Index(precedence, op)
|
return slices.Index(precedence, op)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -92,6 +108,48 @@ func getPOSIXClass(str []rune) (bool, string) {
|
|||||||
return true, rtv
|
return true, rtv
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// isUnicodeCharClassLetter returns whether or not the given letter represents a unicode character class.
|
||||||
|
func isUnicodeCharClassLetter(c rune) bool {
|
||||||
|
return slices.Contains([]rune{'L', 'M', 'S', 'N', 'P', 'C', 'Z'}, c)
|
||||||
|
}
|
||||||
|
|
||||||
|
// rangeTableToRuneSlice converts the given range table into a rune slice and returns it.
|
||||||
|
func rangeTableToRuneSlice(rangetable *unicode.RangeTable) []rune {
|
||||||
|
var rtv []rune
|
||||||
|
for _, r := range rangetable.R16 {
|
||||||
|
for c := r.Lo; c <= r.Hi; c += r.Stride {
|
||||||
|
rtv = append(rtv, rune(c))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for _, r := range rangetable.R32 {
|
||||||
|
for c := r.Lo; c <= r.Hi; c += r.Stride {
|
||||||
|
rtv = append(rtv, rune(c))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return rtv
|
||||||
|
}
|
||||||
|
|
||||||
|
// unicodeCharClassToRange converts the given unicode character class name into a list of characters in that class.
|
||||||
|
// This class could also be a single letter eg. 'C'.
|
||||||
|
func unicodeCharClassToRange(class string) ([]rune, error) {
|
||||||
|
if len(class) == 0 {
|
||||||
|
return nil, fmt.Errorf("empty unicode character class")
|
||||||
|
}
|
||||||
|
if len(class) == 1 || len(class) == 2 {
|
||||||
|
if rangeTable, ok := unicode.Categories[class]; ok {
|
||||||
|
return rangeTableToRuneSlice(rangeTable), nil
|
||||||
|
} else {
|
||||||
|
return nil, fmt.Errorf("invalid short unicode character class")
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if rangeTable, ok := unicode.Scripts[class]; ok {
|
||||||
|
return rangeTableToRuneSlice(rangeTable), nil
|
||||||
|
} else {
|
||||||
|
return nil, fmt.Errorf("invalid long unicode character class")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Stores whether the case-insensitive flag has been enabled.
|
// Stores whether the case-insensitive flag has been enabled.
|
||||||
var caseInsensitive bool
|
var caseInsensitive bool
|
||||||
|
|
||||||
@@ -150,9 +208,6 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
|||||||
// metacharacter. Later, in thompson(), these will be converted back. This avoids
|
// metacharacter. Later, in thompson(), these will be converted back. This avoids
|
||||||
// confusion in detecting whether a character is escaped eg. detecting
|
// confusion in detecting whether a character is escaped eg. detecting
|
||||||
// whether '\\[a]' has an escaped opening bracket (it doesn't).
|
// whether '\\[a]' has an escaped opening bracket (it doesn't).
|
||||||
//
|
|
||||||
// 5. Check for non-greedy operators. These are not supported at the moment, so an error
|
|
||||||
// must be thrown if the user attempts to use a non-greedy operator.
|
|
||||||
for i := 0; i < len(re_runes_orig); i++ {
|
for i := 0; i < len(re_runes_orig); i++ {
|
||||||
c := re_runes_orig[i]
|
c := re_runes_orig[i]
|
||||||
if c == '<' && (i == 0 || (re_runes_orig[i-1] != '\\' && re_runes_orig[i-1] != '?')) {
|
if c == '<' && (i == 0 || (re_runes_orig[i-1] != '\\' && re_runes_orig[i-1] != '?')) {
|
||||||
@@ -199,8 +254,16 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
|||||||
} else if c == ']' && (i == 0 || re_runes[len(re_runes)-1] != '\\') {
|
} else if c == ']' && (i == 0 || re_runes[len(re_runes)-1] != '\\') {
|
||||||
re_runes = append(re_runes, rbracketRune)
|
re_runes = append(re_runes, rbracketRune)
|
||||||
continue
|
continue
|
||||||
} else if slices.Contains([]rune{'+', '*', '?'}, c) && (i < len(re_runes_orig)-1 && re_runes_orig[i+1] == '?') {
|
} else if slices.Contains([]rune{'+', '*', '?'}, c) && (i > 0 && re_runes_orig[i-1] != '\\') && (i < len(re_runes_orig)-1 && re_runes_orig[i+1] == '?') {
|
||||||
return nil, fmt.Errorf("non-greedy operators are not supported")
|
switch c {
|
||||||
|
case '+':
|
||||||
|
re_runes = append(re_runes, lazyPlusRune)
|
||||||
|
case '*':
|
||||||
|
re_runes = append(re_runes, lazyKleeneRune)
|
||||||
|
case '?':
|
||||||
|
re_runes = append(re_runes, lazyQuestionRune)
|
||||||
|
}
|
||||||
|
i++
|
||||||
} else {
|
} else {
|
||||||
re_runes = append(re_runes, c)
|
re_runes = append(re_runes, c)
|
||||||
}
|
}
|
||||||
@@ -293,17 +356,44 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
|||||||
}
|
}
|
||||||
} else if isHex(re_runes[i]) {
|
} else if isHex(re_runes[i]) {
|
||||||
re_postfix = append(re_postfix, re_runes[i:i+2]...)
|
re_postfix = append(re_postfix, re_runes[i:i+2]...)
|
||||||
i += 2
|
i += 1 // I don't skip forward 2 steps, because the second step will happen with the loop increment
|
||||||
} else {
|
} else {
|
||||||
return nil, fmt.Errorf("invalid hex value in expression")
|
return nil, fmt.Errorf("invalid hex value in expression")
|
||||||
}
|
}
|
||||||
} else if isOctal(re_runes[i]) {
|
} else if re_runes[i] == 'p' || re_runes[i] == 'P' { // Unicode character class (P is negated unicode charclass)
|
||||||
|
re_postfix = append(re_postfix, re_runes[i])
|
||||||
|
i++
|
||||||
|
if i >= len(re_runes) {
|
||||||
|
return nil, fmt.Errorf("error parsing unicode character class in expression")
|
||||||
|
}
|
||||||
|
if re_runes[i] == '{' { // Full name charclass
|
||||||
|
for re_runes[i] != '}' {
|
||||||
|
re_postfix = append(re_postfix, re_runes[i])
|
||||||
|
i++
|
||||||
|
}
|
||||||
|
re_postfix = append(re_postfix, re_runes[i])
|
||||||
|
i++
|
||||||
|
} else if isUnicodeCharClassLetter(re_runes[i]) {
|
||||||
|
re_postfix = append(re_postfix, re_runes[i])
|
||||||
|
i++
|
||||||
|
} else {
|
||||||
|
return nil, fmt.Errorf("error parsing unicode character class in expression")
|
||||||
|
}
|
||||||
|
i-- // The loop increment at the top will move us forward
|
||||||
|
} else if re_runes[i] == '0' { // Start of octal value
|
||||||
numDigits := 1
|
numDigits := 1
|
||||||
for i+numDigits < len(re_runes) && numDigits < 3 && isOctal(re_runes[i+numDigits]) { // Skip while we see an octal character (max of 3)
|
for i+numDigits < len(re_runes) && numDigits < 4 && isOctal(re_runes[i+numDigits]) { // Skip while we see an octal character (max of 4, starting with 0)
|
||||||
numDigits++
|
numDigits++
|
||||||
}
|
}
|
||||||
re_postfix = append(re_postfix, re_runes[i:i+numDigits]...)
|
re_postfix = append(re_postfix, re_runes[i:i+numDigits]...)
|
||||||
i += (numDigits - 1) // I have to move back a step, so that I can add a concatenation operator if necessary, and so that the increment at the bottom of the loop works as intended
|
i += (numDigits - 1) // I have to move back a step, so that I can add a concatenation operator if necessary, and so that the increment at the bottom of the loop works as intended
|
||||||
|
} else if unicode.IsDigit(re_runes[i]) { // Any other number - backreference
|
||||||
|
numDigits := 1
|
||||||
|
for i+numDigits < len(re_runes) && unicode.IsDigit(re_runes[i+numDigits]) { // Skip while we see a digit
|
||||||
|
numDigits++
|
||||||
|
}
|
||||||
|
re_postfix = append(re_postfix, re_runes[i:i+numDigits]...)
|
||||||
|
i += (numDigits - 1) // Move back a step to add concatenation operator
|
||||||
} else {
|
} else {
|
||||||
re_postfix = append(re_postfix, re_runes[i])
|
re_postfix = append(re_postfix, re_runes[i])
|
||||||
}
|
}
|
||||||
@@ -320,10 +410,10 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
|||||||
if i >= len(re_runes) {
|
if i >= len(re_runes) {
|
||||||
return nil, fmt.Errorf("unclosed lookaround")
|
return nil, fmt.Errorf("unclosed lookaround")
|
||||||
}
|
}
|
||||||
if re_runes[i] == '(' || re_runes[i] == nonCapLparenRune {
|
if (re_runes[i] == '(' && re_runes[i-1] != '\\') || re_runes[i] == nonCapLparenRune {
|
||||||
numOpenParens++
|
numOpenParens++
|
||||||
}
|
}
|
||||||
if re_runes[i] == ')' {
|
if re_runes[i] == ')' && re_runes[i-1] != '\\' {
|
||||||
numOpenParens--
|
numOpenParens--
|
||||||
if numOpenParens == 0 {
|
if numOpenParens == 0 {
|
||||||
break
|
break
|
||||||
@@ -336,7 +426,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
|||||||
}
|
}
|
||||||
if i < len(re_runes) && (re_runes[i] != '(' && re_runes[i] != nonCapLparenRune && re_runes[i] != '|' && re_runes[i] != '\\') || (i > 0 && re_runes[i-1] == '\\') { // Every character should be concatenated if it is escaped
|
if i < len(re_runes) && (re_runes[i] != '(' && re_runes[i] != nonCapLparenRune && re_runes[i] != '|' && re_runes[i] != '\\') || (i > 0 && re_runes[i-1] == '\\') { // Every character should be concatenated if it is escaped
|
||||||
if i < len(re_runes)-1 {
|
if i < len(re_runes)-1 {
|
||||||
if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != '+' && re_runes[i+1] != '?' && re_runes[i+1] != ')' && re_runes[i+1] != '{' {
|
if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != lazyKleeneRune && re_runes[i+1] != '+' && re_runes[i+1] != lazyPlusRune && re_runes[i+1] != '?' && re_runes[i+1] != lazyQuestionRune && re_runes[i+1] != ')' && re_runes[i+1] != '{' {
|
||||||
re_postfix = append(re_postfix, concatRune)
|
re_postfix = append(re_postfix, concatRune)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -348,7 +438,9 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
|||||||
outQueue := make([]postfixNode, 0) // Output queue
|
outQueue := make([]postfixNode, 0) // Output queue
|
||||||
|
|
||||||
// Actual algorithm
|
// Actual algorithm
|
||||||
numOpenParens := 0 // Number of open parentheses
|
numOpenParens := 0 // Number of open parentheses
|
||||||
|
parenIndices := make([]Group, 0) // I really shouldn't be using Group here, because that's strictly for matching purposes, but it's a convenient way to store the indices of the opening and closing parens.
|
||||||
|
parenIndices = append(parenIndices, Group{0, 0}) // I append a weird value here, because the 0-th group doesn't have any parens. This way, the 1st group will be at index 1, 2nd at 2 ...
|
||||||
for i := 0; i < len(re_postfix); i++ {
|
for i := 0; i < len(re_postfix); i++ {
|
||||||
/* Two cases:
|
/* Two cases:
|
||||||
1. Current character is alphanumeric - send to output queue
|
1. Current character is alphanumeric - send to output queue
|
||||||
@@ -404,11 +496,44 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
|||||||
} else {
|
} else {
|
||||||
return nil, fmt.Errorf("not enough hex characters found in expression")
|
return nil, fmt.Errorf("not enough hex characters found in expression")
|
||||||
}
|
}
|
||||||
} else if isOctal(re_postfix[i]) { // Octal value
|
} else if re_postfix[i] == 'p' || re_postfix[i] == 'P' {
|
||||||
|
charClassInverted := (re_postfix[i] == 'P')
|
||||||
|
var charsInClass []rune
|
||||||
|
i++
|
||||||
|
if isUnicodeCharClassLetter(re_postfix[i]) {
|
||||||
|
var err error
|
||||||
|
charsInClass, err = unicodeCharClassToRange(string(re_postfix[i]))
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
} else if re_postfix[i] == '{' {
|
||||||
|
i++ // Skip opening bracket
|
||||||
|
unicodeCharClassStr := ""
|
||||||
|
for re_postfix[i] != '}' {
|
||||||
|
unicodeCharClassStr += string(re_postfix[i])
|
||||||
|
i++
|
||||||
|
}
|
||||||
|
var err error
|
||||||
|
charsInClass, err = unicodeCharClassToRange(unicodeCharClassStr)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
return nil, fmt.Errorf("error parsing unicode character class in expression")
|
||||||
|
}
|
||||||
|
var toAppend postfixNode
|
||||||
|
if !charClassInverted { // \p
|
||||||
|
toAppend = newPostfixNode(charsInClass...)
|
||||||
|
} else { // \P
|
||||||
|
toAppend = newPostfixDotNode()
|
||||||
|
toAppend.except = append([]postfixNode{}, newPostfixNode(charsInClass...))
|
||||||
|
}
|
||||||
|
outQueue = append(outQueue, toAppend)
|
||||||
|
} else if re_postfix[i] == '0' { // Octal value
|
||||||
var octVal int64
|
var octVal int64
|
||||||
var octValStr string
|
var octValStr string
|
||||||
numDigitsParsed := 0
|
numDigitsParsed := 0
|
||||||
for (i+numDigitsParsed) < len(re_postfix) && isOctal(re_postfix[i+numDigitsParsed]) && numDigitsParsed <= 3 {
|
for (i+numDigitsParsed) < len(re_postfix) && isOctal(re_postfix[i+numDigitsParsed]) && numDigitsParsed <= 4 {
|
||||||
octValStr += string(re_postfix[i+numDigitsParsed])
|
octValStr += string(re_postfix[i+numDigitsParsed])
|
||||||
numDigitsParsed++
|
numDigitsParsed++
|
||||||
}
|
}
|
||||||
@@ -421,6 +546,20 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
|||||||
}
|
}
|
||||||
i += numDigitsParsed - 1 // Shift forward by the number of digits that were parsed. Move back one character, because the loop increment will move us back to the next character automatically
|
i += numDigitsParsed - 1 // Shift forward by the number of digits that were parsed. Move back one character, because the loop increment will move us back to the next character automatically
|
||||||
outQueue = append(outQueue, newPostfixCharNode(rune(octVal)))
|
outQueue = append(outQueue, newPostfixCharNode(rune(octVal)))
|
||||||
|
} else if unicode.IsDigit(re_postfix[i]) { // Backreference
|
||||||
|
var num int64
|
||||||
|
var numStr string
|
||||||
|
numDigitsParsed := 0
|
||||||
|
for (i+numDigitsParsed) < len(re_postfix) && unicode.IsDigit(re_postfix[i+numDigitsParsed]) {
|
||||||
|
numStr += string(re_postfix[i+numDigitsParsed])
|
||||||
|
numDigitsParsed++
|
||||||
|
}
|
||||||
|
num, err := strconv.ParseInt(numStr, 10, 32)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("error parsing backreference in expresion")
|
||||||
|
}
|
||||||
|
i += numDigitsParsed - 1
|
||||||
|
outQueue = append(outQueue, newPostfixBackreferenceNode(int(num)))
|
||||||
} else {
|
} else {
|
||||||
escapedNode, err := newEscapedNode(re_postfix[i], false)
|
escapedNode, err := newEscapedNode(re_postfix[i], false)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -450,10 +589,10 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
|||||||
if i >= len(re_postfix) {
|
if i >= len(re_postfix) {
|
||||||
return nil, fmt.Errorf("unclosed lookaround")
|
return nil, fmt.Errorf("unclosed lookaround")
|
||||||
}
|
}
|
||||||
if re_postfix[i] == '(' || re_postfix[i] == nonCapLparenRune {
|
if (re_postfix[i] == '(' && re_postfix[i-1] != '\\') || re_postfix[i] == nonCapLparenRune {
|
||||||
numOpenParens++
|
numOpenParens++
|
||||||
}
|
}
|
||||||
if re_postfix[i] == ')' {
|
if re_postfix[i] == ')' && re_postfix[i-1] != '\\' {
|
||||||
numOpenParens--
|
numOpenParens--
|
||||||
if numOpenParens == 0 {
|
if numOpenParens == 0 {
|
||||||
break
|
break
|
||||||
@@ -572,11 +711,44 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
|||||||
} else {
|
} else {
|
||||||
return nil, fmt.Errorf("not enough hex characters found in character class")
|
return nil, fmt.Errorf("not enough hex characters found in character class")
|
||||||
}
|
}
|
||||||
} else if isOctal(re_postfix[i]) { // Octal value
|
} else if re_postfix[i] == 'p' || re_postfix[i] == 'P' {
|
||||||
|
charClassInverted := (re_postfix[i] == 'P')
|
||||||
|
var charsInList []rune
|
||||||
|
i++
|
||||||
|
if isUnicodeCharClassLetter(re_postfix[i]) {
|
||||||
|
var err error
|
||||||
|
charsInList, err = unicodeCharClassToRange(string(re_postfix[i]))
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
} else if re_postfix[i] == '{' {
|
||||||
|
i++ // Skip opening bracket
|
||||||
|
unicodeCharClassStr := ""
|
||||||
|
for re_postfix[i] != '}' {
|
||||||
|
unicodeCharClassStr += string(re_postfix[i])
|
||||||
|
i++
|
||||||
|
}
|
||||||
|
var err error
|
||||||
|
charsInList, err = unicodeCharClassToRange(unicodeCharClassStr)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
return nil, fmt.Errorf("error parsing unicode character class in expression")
|
||||||
|
}
|
||||||
|
if !charClassInverted {
|
||||||
|
chars = append(chars, newPostfixNode(charsInList...))
|
||||||
|
} else {
|
||||||
|
toAppend := newPostfixDotNode()
|
||||||
|
toAppend.except = append([]postfixNode{}, newPostfixNode(charsInList...))
|
||||||
|
chars = append(chars, toAppend)
|
||||||
|
}
|
||||||
|
} else if re_postfix[i] == '0' { // Octal value
|
||||||
|
|
||||||
var octVal int64
|
var octVal int64
|
||||||
var octValStr string
|
var octValStr string
|
||||||
numDigitsParsed := 0
|
numDigitsParsed := 0
|
||||||
for (i+numDigitsParsed) < len(re_postfix)-1 && isOctal(re_postfix[i+numDigitsParsed]) && numDigitsParsed <= 3 { // The '-1' exists, because even in the worst case (the character class extends till the end), the last character must be a closing bracket (and nothing else)
|
for (i+numDigitsParsed) < len(re_postfix)-1 && isOctal(re_postfix[i+numDigitsParsed]) && numDigitsParsed <= 4 { // The '-1' exists, because even in the worst case (the character class extends till the end), the last character must be a closing bracket (and nothing else)
|
||||||
octValStr += string(re_postfix[i+numDigitsParsed])
|
octValStr += string(re_postfix[i+numDigitsParsed])
|
||||||
numDigitsParsed++
|
numDigitsParsed++
|
||||||
}
|
}
|
||||||
@@ -773,6 +945,10 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
|||||||
}
|
}
|
||||||
outQueue[idx].startReps = startRangeNum
|
outQueue[idx].startReps = startRangeNum
|
||||||
outQueue[idx].endReps = endRangeNum
|
outQueue[idx].endReps = endRangeNum
|
||||||
|
if i < len(re_postfix)-1 && re_postfix[i+1] == '?' { // lazy repetition
|
||||||
|
outQueue[idx].isLazy = true
|
||||||
|
i++
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if c == '(' || c == nonCapLparenRune {
|
if c == '(' || c == nonCapLparenRune {
|
||||||
opStack = append(opStack, c)
|
opStack = append(opStack, c)
|
||||||
@@ -780,6 +956,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
|||||||
outQueue = append(outQueue, newPostfixNode(c))
|
outQueue = append(outQueue, newPostfixNode(c))
|
||||||
}
|
}
|
||||||
numOpenParens++
|
numOpenParens++
|
||||||
|
parenIndices = append(parenIndices, Group{StartIdx: len(outQueue) - 1}) // Push the index of the lparen into parenIndices
|
||||||
}
|
}
|
||||||
if c == ')' {
|
if c == ')' {
|
||||||
// Keep popping from opStack until we encounter an opening parenthesis or a NONCAPLPAREN_CHAR. Throw error if we reach the end of the stack.
|
// Keep popping from opStack until we encounter an opening parenthesis or a NONCAPLPAREN_CHAR. Throw error if we reach the end of the stack.
|
||||||
@@ -796,6 +973,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
|||||||
if val == '(' { // Whatever was inside the parentheses was a _capturing_ group, so we append the closing parentheses as well
|
if val == '(' { // Whatever was inside the parentheses was a _capturing_ group, so we append the closing parentheses as well
|
||||||
outQueue = append(outQueue, newPostfixNode(')')) // Add closing parentheses
|
outQueue = append(outQueue, newPostfixNode(')')) // Add closing parentheses
|
||||||
}
|
}
|
||||||
|
parenIndices[numOpenParens].EndIdx = len(outQueue) - 1
|
||||||
numOpenParens--
|
numOpenParens--
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -810,6 +988,11 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
|||||||
return nil, fmt.Errorf("imbalanced parantheses")
|
return nil, fmt.Errorf("imbalanced parantheses")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// outQueue, _, err := rewriteBackreferences(outQueue, parenIndices)
|
||||||
|
// if err != nil {
|
||||||
|
// return nil, err
|
||||||
|
// }
|
||||||
|
|
||||||
return outQueue, nil
|
return outQueue, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1021,6 +1204,21 @@ func thompson(re []postfixNode) (Reg, error) {
|
|||||||
})
|
})
|
||||||
nfa = append(nfa, toAdd)
|
nfa = append(nfa, toAdd)
|
||||||
}
|
}
|
||||||
|
if c.nodetype == backreferenceNode {
|
||||||
|
if c.referencedGroup > numGroups {
|
||||||
|
return Reg{}, fmt.Errorf("invalid backreference")
|
||||||
|
}
|
||||||
|
stateToAdd := &nfaState{}
|
||||||
|
stateToAdd.assert = noneAssert
|
||||||
|
stateToAdd.content = newContents(epsilon)
|
||||||
|
stateToAdd.isEmpty = true
|
||||||
|
stateToAdd.isBackreference = true
|
||||||
|
stateToAdd.output = make([]*nfaState, 0)
|
||||||
|
stateToAdd.output = append(stateToAdd.output, stateToAdd)
|
||||||
|
stateToAdd.referredGroup = c.referencedGroup
|
||||||
|
stateToAdd.threadBackref = 0
|
||||||
|
nfa = append(nfa, stateToAdd)
|
||||||
|
}
|
||||||
// Must be an operator if it isn't a character
|
// Must be an operator if it isn't a character
|
||||||
switch c.nodetype {
|
switch c.nodetype {
|
||||||
case concatenateNode:
|
case concatenateNode:
|
||||||
@@ -1044,6 +1242,9 @@ func thompson(re []postfixNode) (Reg, error) {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return Reg{}, err
|
return Reg{}, err
|
||||||
}
|
}
|
||||||
|
if c.isLazy {
|
||||||
|
stateToAdd.isLazy = true
|
||||||
|
}
|
||||||
nfa = append(nfa, stateToAdd)
|
nfa = append(nfa, stateToAdd)
|
||||||
case plusNode: // a+ is equivalent to aa*
|
case plusNode: // a+ is equivalent to aa*
|
||||||
s1 := mustPop(&nfa)
|
s1 := mustPop(&nfa)
|
||||||
@@ -1051,6 +1252,9 @@ func thompson(re []postfixNode) (Reg, error) {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return Reg{}, err
|
return Reg{}, err
|
||||||
}
|
}
|
||||||
|
if c.isLazy {
|
||||||
|
s2.isLazy = true
|
||||||
|
}
|
||||||
s1 = concatenate(s1, s2)
|
s1 = concatenate(s1, s2)
|
||||||
nfa = append(nfa, s1)
|
nfa = append(nfa, s1)
|
||||||
case questionNode: // ab? is equivalent to a(b|)
|
case questionNode: // ab? is equivalent to a(b|)
|
||||||
@@ -1062,6 +1266,9 @@ func thompson(re []postfixNode) (Reg, error) {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return Reg{}, err
|
return Reg{}, err
|
||||||
}
|
}
|
||||||
|
if c.isLazy {
|
||||||
|
s2.isLazy = true
|
||||||
|
}
|
||||||
nfa = append(nfa, s2)
|
nfa = append(nfa, s2)
|
||||||
case pipeNode:
|
case pipeNode:
|
||||||
// A pipe operator doesn't actually need either operand to be present. If an operand isn't present,
|
// A pipe operator doesn't actually need either operand to be present. If an operand isn't present,
|
||||||
@@ -1117,6 +1324,9 @@ func thompson(re []postfixNode) (Reg, error) {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return Reg{}, err
|
return Reg{}, err
|
||||||
}
|
}
|
||||||
|
if c.isLazy {
|
||||||
|
s2.isLazy = true
|
||||||
|
}
|
||||||
stateToAdd = concatenate(stateToAdd, s2)
|
stateToAdd = concatenate(stateToAdd, s2)
|
||||||
} else { // Case 2
|
} else { // Case 2
|
||||||
for i := c.startReps; i < c.endReps; i++ {
|
for i := c.startReps; i < c.endReps; i++ {
|
||||||
@@ -1124,6 +1334,9 @@ func thompson(re []postfixNode) (Reg, error) {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return Reg{}, fmt.Errorf("error processing bounded repetition")
|
return Reg{}, fmt.Errorf("error processing bounded repetition")
|
||||||
}
|
}
|
||||||
|
if c.isLazy {
|
||||||
|
tmp.isLazy = true
|
||||||
|
}
|
||||||
stateToAdd = concatenate(stateToAdd, tmp)
|
stateToAdd = concatenate(stateToAdd, tmp)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
65
regex/doc.go
65
regex/doc.go
@@ -18,7 +18,7 @@ Single characters:
|
|||||||
[^abc] Negated character class - match any character except a, b and c
|
[^abc] Negated character class - match any character except a, b and c
|
||||||
[^a-z] Negated character range - do not match any character from a to z
|
[^a-z] Negated character range - do not match any character from a to z
|
||||||
\[ Match a literal '['. Backslashes can escape any character with special meaning, including another backslash.
|
\[ Match a literal '['. Backslashes can escape any character with special meaning, including another backslash.
|
||||||
\452 Match the character with the octal value 452 (up to 3 digits)
|
\0452 Match the character with the octal value 452 (up to 4 digits, first digit must be 0)
|
||||||
\xFF Match the character with the hex value FF (exactly 2 characters)
|
\xFF Match the character with the hex value FF (exactly 2 characters)
|
||||||
\x{0000FF} Match the character with the hex value 0000FF (exactly 6 characters)
|
\x{0000FF} Match the character with the hex value 0000FF (exactly 6 characters)
|
||||||
\n Newline
|
\n Newline
|
||||||
@@ -33,7 +33,7 @@ Perl classes:
|
|||||||
\d Match any digit character ([0-9])
|
\d Match any digit character ([0-9])
|
||||||
\D Match any non-digit character ([^0-9])
|
\D Match any non-digit character ([^0-9])
|
||||||
\w Match any word character ([a-zA-Z0-9_])
|
\w Match any word character ([a-zA-Z0-9_])
|
||||||
\W Match any word character ([^a-zA-Z0-9_])
|
\W Match any non-word character ([^a-zA-Z0-9_])
|
||||||
\s Match any whitespace character ([ \t\n])
|
\s Match any whitespace character ([ \t\n])
|
||||||
\S Match any non-whitespace character ([^ \t\n])
|
\S Match any non-whitespace character ([^ \t\n])
|
||||||
|
|
||||||
@@ -60,14 +60,24 @@ Composition:
|
|||||||
x|y Match x or y (prefer x)
|
x|y Match x or y (prefer x)
|
||||||
xy|z Match xy or z (prefer xy)
|
xy|z Match xy or z (prefer xy)
|
||||||
|
|
||||||
Repitition (always greedy, preferring more):
|
Repetition:
|
||||||
|
|
||||||
x* Match x zero or more times
|
Greedy:
|
||||||
x+ Match x one or more times
|
x* Match x zero or more times, prefer more
|
||||||
x? Match x zero or one time
|
x+ Match x one or more times, prefer more
|
||||||
x{m,n} Match x between m and n times (inclusive)
|
x? Match x zero or one time, prefer one
|
||||||
x{m,} Match x atleast m times
|
x{m,n} Match x between m and n times (inclusive), prefer more
|
||||||
x{,n} Match x between 0 and n times (inclusive)
|
x{m,} Match x at least m times, prefer more
|
||||||
|
x{,n} Match x between 0 and n times (inclusive), prefer more
|
||||||
|
x{m} Match x exactly m times
|
||||||
|
|
||||||
|
Lazy:
|
||||||
|
x*? Match x zero or more times, prefer fewer
|
||||||
|
x+? Match x one or more times, prefer fewer
|
||||||
|
x?? Match x zero or one time, prefer zero
|
||||||
|
x{m,n}? Match x between m and n times (inclusive), prefer fewer
|
||||||
|
x{m,}? Match x at least m times, prefer fewer
|
||||||
|
x{,n}? Match x between 0 and n times (inclusive), prefer fewer
|
||||||
x{m} Match x exactly m times
|
x{m} Match x exactly m times
|
||||||
|
|
||||||
Grouping:
|
Grouping:
|
||||||
@@ -93,6 +103,10 @@ Lookarounds:
|
|||||||
(?<=x)y Positive lookbehind - Match y if preceded by x
|
(?<=x)y Positive lookbehind - Match y if preceded by x
|
||||||
(?<!x)y Negative lookbehind - Match y if NOT preceded by x
|
(?<!x)y Negative lookbehind - Match y if NOT preceded by x
|
||||||
|
|
||||||
|
Backreferences:
|
||||||
|
|
||||||
|
(xy)\1 Match 'xy' followed by the text most recently captured by group 1 (in this case, 'xy')
|
||||||
|
|
||||||
Numeric ranges:
|
Numeric ranges:
|
||||||
|
|
||||||
<x-y> Match any number from x to y (inclusive) (x and y must be positive numbers)
|
<x-y> Match any number from x to y (inclusive) (x and y must be positive numbers)
|
||||||
@@ -103,33 +117,13 @@ Numeric ranges:
|
|||||||
The engine and the API differ from [regexp] in a few ways, some of them very subtle.
|
The engine and the API differ from [regexp] in a few ways, some of them very subtle.
|
||||||
The key differences are mentioned below.
|
The key differences are mentioned below.
|
||||||
|
|
||||||
1. Greediness:
|
1. Byte-slices and runes:
|
||||||
|
|
||||||
This engine does not support non-greedy operators. All operators are always greedy in nature, and will try
|
|
||||||
to match as much as they can, while still allowing for a successful match. For example, given the regex:
|
|
||||||
|
|
||||||
y*y
|
|
||||||
|
|
||||||
The engine will match as many 'y's as it can, while still allowing the trailing 'y' to be matched.
|
|
||||||
|
|
||||||
Another, more subtle example is the following regex:
|
|
||||||
|
|
||||||
x|xx
|
|
||||||
|
|
||||||
While the stdlib implementation (and most other engines) will prefer matching the first item of the alternation,
|
|
||||||
this engine will go for the longest possible match, regardless of the order of the alternation. Although this
|
|
||||||
strays from the convention, it results in a nice rule-of-thumb - the engine is ALWAYS greedy.
|
|
||||||
|
|
||||||
The stdlib implementation has a function [regexp.Regexp.Longest] which makes future searches prefer the longest match.
|
|
||||||
That is the default (and unchangable) behavior in this engine.
|
|
||||||
|
|
||||||
2. Byte-slices and runes:
|
|
||||||
|
|
||||||
My engine does not support byte-slices. When a matching function receives a string, it converts it into a
|
My engine does not support byte-slices. When a matching function receives a string, it converts it into a
|
||||||
rune-slice to iterate through it. While this has some space overhead, the convenience of built-in unicode
|
rune-slice to iterate through it. While this has some space overhead, the convenience of built-in unicode
|
||||||
support made the tradeoff worth it.
|
support made the tradeoff worth it.
|
||||||
|
|
||||||
3. Return values
|
2. Return values
|
||||||
|
|
||||||
Rather than using primitives for return values, my engine defines two types that are used as return
|
Rather than using primitives for return values, my engine defines two types that are used as return
|
||||||
values: a [Group] represents a capturing group, and a [Match] represents a list of groups.
|
values: a [Group] represents a capturing group, and a [Match] represents a list of groups.
|
||||||
@@ -164,14 +158,15 @@ returns the 0-group.
|
|||||||
|
|
||||||
The following features from [regexp] are (currently) NOT supported:
|
The following features from [regexp] are (currently) NOT supported:
|
||||||
1. Named capturing groups
|
1. Named capturing groups
|
||||||
2. Non-greedy operators
|
2. Negated POSIX classes
|
||||||
3. Unicode character classes
|
3. Embedded flags (flags are instead passed as arguments to [Compile])
|
||||||
4. Embedded flags (flags are passed as arguments to [Compile])
|
4. Literal text with \Q ... \E
|
||||||
5. Literal text with \Q ... \E
|
5. Finite repetition with no start (defaulting at 0)
|
||||||
|
|
||||||
The following features are not available in [regexp], but are supported in my engine:
|
The following features are not available in [regexp], but are supported in my engine:
|
||||||
1. Lookarounds
|
1. Lookarounds
|
||||||
2. Numeric ranges
|
2. Numeric ranges
|
||||||
|
3. Backreferences
|
||||||
|
|
||||||
I hope to shorten the first list, and expand the second.
|
I hope to shorten the first list, and expand the second.
|
||||||
*/
|
*/
|
||||||
|
|||||||
@@ -2,6 +2,7 @@ package regex_test
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"strings"
|
||||||
|
|
||||||
"gitea.twomorecents.org/Rockingcool/kleingrep/regex"
|
"gitea.twomorecents.org/Rockingcool/kleingrep/regex"
|
||||||
)
|
)
|
||||||
@@ -32,12 +33,12 @@ func ExampleReg_FindAll() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func ExampleReg_FindString() {
|
func ExampleReg_FindString() {
|
||||||
regexStr := `\d+`
|
regexStr := `\w+\s+(?=sheep)`
|
||||||
regexComp := regex.MustCompile(regexStr)
|
regexComp := regex.MustCompile(regexStr)
|
||||||
|
|
||||||
matchStr := regexComp.FindString("The year of our lord, 2025")
|
matchStr := regexComp.FindString("pink cows and yellow sheep")
|
||||||
fmt.Println(matchStr)
|
fmt.Println(matchStr)
|
||||||
// Output: 2025
|
// Output: yellow
|
||||||
}
|
}
|
||||||
|
|
||||||
func ExampleReg_FindSubmatch() {
|
func ExampleReg_FindSubmatch() {
|
||||||
@@ -53,6 +54,71 @@ func ExampleReg_FindSubmatch() {
|
|||||||
// 2 3
|
// 2 3
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func ExampleReg_FindStringSubmatch() {
|
||||||
|
regexStr := `(\d{4})-(\d{2})-(\d{2})`
|
||||||
|
regexComp := regex.MustCompile(regexStr)
|
||||||
|
inputStr := `The date is 2025-02-10`
|
||||||
|
|
||||||
|
match := regexComp.FindStringSubmatch(inputStr)
|
||||||
|
fmt.Println(match[1])
|
||||||
|
fmt.Println(match[3])
|
||||||
|
// Output: 2025
|
||||||
|
// 10
|
||||||
|
}
|
||||||
|
|
||||||
|
func ExampleReg_FindAllSubmatch() {
|
||||||
|
regexStr := `(\d)\.(\d)(\d)`
|
||||||
|
regexComp := regex.MustCompile(regexStr)
|
||||||
|
|
||||||
|
matches := regexComp.FindAllSubmatch("3.14+8.97")
|
||||||
|
fmt.Println(matches[0][0]) // 0-group (entire match) of 1st match (0-indexed)
|
||||||
|
fmt.Println(matches[0][1]) // 1st group of 1st match
|
||||||
|
fmt.Println(matches[1][0]) // 0-group of 2nd match
|
||||||
|
fmt.Println(matches[1][1]) // 1st group of 2nd match
|
||||||
|
// Output: 0 4
|
||||||
|
// 0 1
|
||||||
|
// 5 9
|
||||||
|
// 5 6
|
||||||
|
}
|
||||||
|
|
||||||
|
func ExampleReg_FindAllString() {
|
||||||
|
regexStr := `<0-255>\.<0-255>\.<0-255>\.<0-255>`
|
||||||
|
inputStr := `192.168.220.7 pings 9.9.9.9`
|
||||||
|
regexComp := regex.MustCompile(regexStr)
|
||||||
|
|
||||||
|
matchStrs := regexComp.FindAllString(inputStr)
|
||||||
|
|
||||||
|
fmt.Println(matchStrs[0])
|
||||||
|
fmt.Println(matchStrs[1])
|
||||||
|
// Output: 192.168.220.7
|
||||||
|
// 9.9.9.9
|
||||||
|
}
|
||||||
|
|
||||||
|
func ExampleReg_FindAllStringSubmatch() {
|
||||||
|
// 'https' ...
|
||||||
|
// followed by 1 or more alphanumeric characters (including period) ...
|
||||||
|
// then a forward slash ...
|
||||||
|
// followed by one or more of:
|
||||||
|
// word character,
|
||||||
|
// question mark,
|
||||||
|
// period,
|
||||||
|
// equals sign
|
||||||
|
regexStr := `https://([a-z0-9\.]+)/([\w.?=]+)`
|
||||||
|
regexComp := regex.MustCompile(regexStr, regex.RE_CASE_INSENSITIVE)
|
||||||
|
inputStr := `You can find me at https://twomorecents.org/index.html and https://news.ycombinator.com/user?id=aadhavans`
|
||||||
|
|
||||||
|
matchIndices := regexComp.FindAllStringSubmatch(inputStr)
|
||||||
|
fmt.Println(matchIndices[0][1]) // 1st group of 1st match (0-indexed)
|
||||||
|
fmt.Println(matchIndices[0][2]) // 2nd group of 1st match
|
||||||
|
fmt.Println(matchIndices[1][1]) // 1st group of 2nd match
|
||||||
|
fmt.Println(matchIndices[1][2]) // 2nd group of 2nd match
|
||||||
|
// Output: twomorecents.org
|
||||||
|
// index.html
|
||||||
|
// news.ycombinator.com
|
||||||
|
// user?id=aadhavans
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
func ExampleReg_Expand() {
|
func ExampleReg_Expand() {
|
||||||
inputStr := `option1: value1
|
inputStr := `option1: value1
|
||||||
option2: value2`
|
option2: value2`
|
||||||
@@ -89,3 +155,27 @@ func ExampleReg_Longest() {
|
|||||||
// Output: x
|
// Output: x
|
||||||
// xx
|
// xx
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func ExampleReg_ReplaceAll() {
|
||||||
|
regexStr := `(\d)(\w)`
|
||||||
|
inputStr := "5d9t"
|
||||||
|
regexComp := regex.MustCompile(regexStr)
|
||||||
|
fmt.Println(regexComp.ReplaceAll(inputStr, `$2$1`))
|
||||||
|
// Output: d5t9
|
||||||
|
}
|
||||||
|
|
||||||
|
func ExampleReg_ReplaceAllLiteral() {
|
||||||
|
regexStr := `fox|dog`
|
||||||
|
inputStr := "the quick brown fox jumped over the lazy dog"
|
||||||
|
regexComp := regex.MustCompile(regexStr)
|
||||||
|
fmt.Println(regexComp.ReplaceAllLiteral(inputStr, `duck`))
|
||||||
|
// Output: the quick brown duck jumped over the lazy duck
|
||||||
|
}
|
||||||
|
|
||||||
|
func ExampleReg_ReplaceAllFunc() {
|
||||||
|
regexStr := `\w{5,}`
|
||||||
|
inputStr := `all five or more letter words in this string are capitalized`
|
||||||
|
regexComp := regex.MustCompile(regexStr)
|
||||||
|
fmt.Println(regexComp.ReplaceAllFunc(inputStr, strings.ToUpper))
|
||||||
|
// Output: all five or more LETTER WORDS in this STRING are CAPITALIZED
|
||||||
|
}
|
||||||
|
|||||||
@@ -205,22 +205,45 @@ func (re Reg) FindAllSubmatch(str string) []Match {
|
|||||||
return indices
|
return indices
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// FindAllStringSubmatch returns a slice of string slices. Each inner slice contains the text of a match, including all submatches.
|
||||||
|
// A return value of nil indicates no match.
|
||||||
|
func (re Reg) FindAllStringSubmatch(str string) [][]string {
|
||||||
|
match := re.FindAllSubmatch(str)
|
||||||
|
if len(match) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
rtv := make([][]string, len(match))
|
||||||
|
for i := range rtv {
|
||||||
|
rtv[i] = make([]string, re.numGroups+1)
|
||||||
|
}
|
||||||
|
rtv = funcMap(match, func(m Match) []string {
|
||||||
|
return funcMap(m, func(g Group) string {
|
||||||
|
if g.IsValid() {
|
||||||
|
return str[g.StartIdx:g.EndIdx]
|
||||||
|
} else {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
})
|
||||||
|
})
|
||||||
|
return rtv
|
||||||
|
}
|
||||||
|
|
||||||
func addStateToList(str []rune, idx int, list []nfaState, state nfaState, threadGroups []Group, visited []nfaState, preferLongest bool) []nfaState {
|
func addStateToList(str []rune, idx int, list []nfaState, state nfaState, threadGroups []Group, visited []nfaState, preferLongest bool) []nfaState {
|
||||||
if stateExists(list, state) || stateExists(visited, state) {
|
if stateExists(list, state) || stateExists(visited, state) {
|
||||||
return list
|
return list
|
||||||
}
|
}
|
||||||
visited = append(visited, state)
|
visited = append(visited, state)
|
||||||
|
|
||||||
if state.isKleene || state.isQuestion {
|
if (state.isKleene || state.isQuestion) && (state.isLazy == false) { // Greedy quantifiers
|
||||||
copyThread(state.splitState, state)
|
copyThread(state.splitState, state)
|
||||||
list = addStateToList(str, idx, list, *state.splitState, threadGroups, visited, preferLongest)
|
list := addStateToList(str, idx, list, *state.splitState, threadGroups, visited, preferLongest)
|
||||||
copyThread(state.next, state)
|
copyThread(state.next, state)
|
||||||
list = addStateToList(str, idx, list, *state.next, threadGroups, visited, preferLongest)
|
list = addStateToList(str, idx, list, *state.next, threadGroups, visited, preferLongest)
|
||||||
return list
|
return list
|
||||||
}
|
}
|
||||||
if state.isAlternation {
|
if state.isAlternation || ((state.isKleene || state.isQuestion) && state.isLazy) { // Alternation or lazy quantifier
|
||||||
copyThread(state.next, state)
|
copyThread(state.next, state)
|
||||||
list = addStateToList(str, idx, list, *state.next, threadGroups, visited, preferLongest)
|
list := addStateToList(str, idx, list, *state.next, threadGroups, visited, preferLongest)
|
||||||
copyThread(state.splitState, state)
|
copyThread(state.splitState, state)
|
||||||
list = addStateToList(str, idx, list, *state.splitState, threadGroups, visited, preferLongest)
|
list = addStateToList(str, idx, list, *state.splitState, threadGroups, visited, preferLongest)
|
||||||
return list
|
return list
|
||||||
@@ -234,10 +257,12 @@ func addStateToList(str []rune, idx int, list []nfaState, state nfaState, thread
|
|||||||
}
|
}
|
||||||
if state.groupBegin {
|
if state.groupBegin {
|
||||||
state.threadGroups[state.groupNum].StartIdx = idx
|
state.threadGroups[state.groupNum].StartIdx = idx
|
||||||
|
copyThread(state.next, state)
|
||||||
return addStateToList(str, idx, list, *state.next, state.threadGroups, visited, preferLongest)
|
return addStateToList(str, idx, list, *state.next, state.threadGroups, visited, preferLongest)
|
||||||
}
|
}
|
||||||
if state.groupEnd {
|
if state.groupEnd {
|
||||||
state.threadGroups[state.groupNum].EndIdx = idx
|
state.threadGroups[state.groupNum].EndIdx = idx
|
||||||
|
copyThread(state.next, state)
|
||||||
return addStateToList(str, idx, list, *state.next, state.threadGroups, visited, preferLongest)
|
return addStateToList(str, idx, list, *state.next, state.threadGroups, visited, preferLongest)
|
||||||
}
|
}
|
||||||
return append(list, state)
|
return append(list, state)
|
||||||
@@ -290,11 +315,25 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
|
|||||||
if !preferLongest {
|
if !preferLongest {
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
} else if !currentState.isAlternation && !currentState.isKleene && !currentState.isQuestion && !currentState.groupBegin && !currentState.groupEnd && currentState.assert == noneAssert { // Normal character
|
} else if !currentState.isAlternation && !currentState.isKleene && !currentState.isQuestion && !currentState.isBackreference && !currentState.groupBegin && !currentState.groupEnd && currentState.assert == noneAssert { // Normal character
|
||||||
if currentState.contentContains(str, idx, preferLongest) {
|
if currentState.contentContains(str, idx, preferLongest) {
|
||||||
nextStates = addStateToList(str, idx+1, nextStates, *currentState.next, currentState.threadGroups, nil, preferLongest)
|
nextStates = addStateToList(str, idx+1, nextStates, *currentState.next, currentState.threadGroups, nil, preferLongest)
|
||||||
}
|
}
|
||||||
|
} else if currentState.isBackreference && currentState.threadGroups[currentState.referredGroup].IsValid() {
|
||||||
|
groupLength := currentState.threadGroups[currentState.referredGroup].EndIdx - currentState.threadGroups[currentState.referredGroup].StartIdx
|
||||||
|
if currentState.threadBackref == groupLength {
|
||||||
|
currentState.threadBackref = 0
|
||||||
|
copyThread(currentState.next, currentState)
|
||||||
|
currentStates = addStateToList(str, idx, currentStates, *currentState.next, currentState.threadGroups, nil, preferLongest)
|
||||||
|
} else {
|
||||||
|
idxInReferredGroup := currentState.threadGroups[currentState.referredGroup].StartIdx + currentState.threadBackref
|
||||||
|
if idxInReferredGroup < len(str) && idx < len(str) && str[idxInReferredGroup] == str[idx] {
|
||||||
|
currentState.threadBackref += 1
|
||||||
|
nextStates = append(nextStates, currentState)
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
currentStates = append([]nfaState{}, nextStates...)
|
currentStates = append([]nfaState{}, nextStates...)
|
||||||
nextStates = nil
|
nextStates = nil
|
||||||
@@ -327,7 +366,7 @@ func (re Reg) Expand(dst string, template string, src string, match Match) strin
|
|||||||
i++
|
i++
|
||||||
} else {
|
} else {
|
||||||
numStr := ""
|
numStr := ""
|
||||||
for unicode.IsDigit(templateRuneSlc[i]) {
|
for i < len(templateRuneSlc) && unicode.IsDigit(templateRuneSlc[i]) {
|
||||||
numStr += string(templateRuneSlc[i])
|
numStr += string(templateRuneSlc[i])
|
||||||
i++
|
i++
|
||||||
}
|
}
|
||||||
@@ -372,3 +411,66 @@ func (re Reg) LiteralPrefix() (prefix string, complete bool) {
|
|||||||
}
|
}
|
||||||
return prefix, complete
|
return prefix, complete
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ReplaceAll replaces all matches of the expression in src, with the text in repl. In repl, variables are interpreted
|
||||||
|
// as they are in [Reg.Expand]. The resulting string is returned.
|
||||||
|
func (re Reg) ReplaceAll(src string, repl string) string {
|
||||||
|
matches := re.FindAllSubmatch(src)
|
||||||
|
i := 0
|
||||||
|
currentMatch := 0
|
||||||
|
dst := ""
|
||||||
|
for i < len(src) {
|
||||||
|
if currentMatch < len(matches) && matches[currentMatch][0].IsValid() && i == matches[currentMatch][0].StartIdx {
|
||||||
|
dst += re.Expand("", repl, src, matches[currentMatch])
|
||||||
|
i = matches[currentMatch][0].EndIdx
|
||||||
|
currentMatch++
|
||||||
|
} else {
|
||||||
|
dst += string(src[i])
|
||||||
|
i++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return dst
|
||||||
|
}
|
||||||
|
|
||||||
|
// ReplaceAllLiteral replaces all matches of the expression in src, with the text in repl. The text is replaced directly,
|
||||||
|
// without any expansion.
|
||||||
|
func (re Reg) ReplaceAllLiteral(src string, repl string) string {
|
||||||
|
zerogroups := re.FindAll(src)
|
||||||
|
currentMatch := 0
|
||||||
|
i := 0
|
||||||
|
dst := ""
|
||||||
|
|
||||||
|
for i < len(src) {
|
||||||
|
if currentMatch < len(zerogroups) && i == zerogroups[currentMatch].StartIdx {
|
||||||
|
dst += repl
|
||||||
|
i = zerogroups[currentMatch].EndIdx
|
||||||
|
currentMatch += 1
|
||||||
|
} else {
|
||||||
|
dst += string(src[i])
|
||||||
|
i++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return dst
|
||||||
|
}
|
||||||
|
|
||||||
|
// ReplaceAllFunc replaces every match of the expression in src, with the return value of the function replFunc.
|
||||||
|
// replFunc takes in the matched string. The return value is substituted in directly without expansion.
|
||||||
|
func (re Reg) ReplaceAllFunc(src string, replFunc func(string) string) string {
|
||||||
|
zerogroups := re.FindAll(src)
|
||||||
|
currentMatch := 0
|
||||||
|
i := 0
|
||||||
|
dst := ""
|
||||||
|
|
||||||
|
for i < len(src) {
|
||||||
|
if currentMatch < len(zerogroups) && i == zerogroups[currentMatch].StartIdx {
|
||||||
|
dst += replFunc(src[zerogroups[currentMatch].StartIdx:zerogroups[currentMatch].EndIdx])
|
||||||
|
i = zerogroups[currentMatch].EndIdx
|
||||||
|
currentMatch += 1
|
||||||
|
} else {
|
||||||
|
dst += string(src[i])
|
||||||
|
i++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return dst
|
||||||
|
|
||||||
|
}
|
||||||
|
|||||||
@@ -16,8 +16,11 @@ var rparenRune rune = 0xF0006
|
|||||||
var nonCapLparenRune rune = 0xF0007 // Represents a non-capturing group's LPAREN
|
var nonCapLparenRune rune = 0xF0007 // Represents a non-capturing group's LPAREN
|
||||||
var escBackslashRune rune = 0xF0008 // Represents an escaped backslash
|
var escBackslashRune rune = 0xF0008 // Represents an escaped backslash
|
||||||
var charRangeRune rune = 0xF0009 // Represents a character range
|
var charRangeRune rune = 0xF0009 // Represents a character range
|
||||||
|
var lazyKleeneRune rune = 0xF000A // Represents a lazy kleene star
|
||||||
|
var lazyPlusRune rune = 0xF000B // Represents a lazy plus operator
|
||||||
|
var lazyQuestionRune rune = 0xF000C // Represents a lazy question operator
|
||||||
|
|
||||||
var specialChars = []rune{'?', '*', '\\', '^', '$', '{', '}', '(', ')', '[', ']', '+', '|', '.', concatRune, '<', '>', lbracketRune, rbracketRune, nonCapLparenRune}
|
var specialChars = []rune{'?', lazyQuestionRune, '*', lazyKleeneRune, '\\', '^', '$', '{', '}', '(', ')', '[', ']', '+', lazyPlusRune, '|', '.', concatRune, '<', '>', lbracketRune, rbracketRune, nonCapLparenRune}
|
||||||
|
|
||||||
// An interface for int and rune, which are identical
|
// An interface for int and rune, which are identical
|
||||||
type character interface {
|
type character interface {
|
||||||
|
|||||||
16
regex/nfa.go
16
regex/nfa.go
@@ -34,6 +34,7 @@ type nfaState struct {
|
|||||||
isKleene bool // Identifies whether current node is a 0-state representing Kleene star
|
isKleene bool // Identifies whether current node is a 0-state representing Kleene star
|
||||||
isQuestion bool // Identifies whether current node is a 0-state representing the question operator
|
isQuestion bool // Identifies whether current node is a 0-state representing the question operator
|
||||||
isAlternation bool // Identifies whether current node is a 0-state representing an alternation
|
isAlternation bool // Identifies whether current node is a 0-state representing an alternation
|
||||||
|
isLazy bool // Only for split states - Identifies whether or not to flip the order of branches (try one branch before the other)
|
||||||
splitState *nfaState // Only for alternation states - the 'other' branch of the alternation ('next' is the first)
|
splitState *nfaState // Only for alternation states - the 'other' branch of the alternation ('next' is the first)
|
||||||
assert assertType // Type of assertion of current node - NONE means that the node doesn't assert anything
|
assert assertType // Type of assertion of current node - NONE means that the node doesn't assert anything
|
||||||
allChars bool // Whether or not the state represents all characters (eg. a 'dot' metacharacter). A 'dot' node doesn't store any contents directly, as it would take up too much space
|
allChars bool // Whether or not the state represents all characters (eg. a 'dot' metacharacter). A 'dot' node doesn't store any contents directly, as it would take up too much space
|
||||||
@@ -44,9 +45,11 @@ type nfaState struct {
|
|||||||
groupBegin bool // Whether or not the node starts a capturing group
|
groupBegin bool // Whether or not the node starts a capturing group
|
||||||
groupEnd bool // Whether or not the node ends a capturing group
|
groupEnd bool // Whether or not the node ends a capturing group
|
||||||
groupNum int // Which capturing group the node starts / ends
|
groupNum int // Which capturing group the node starts / ends
|
||||||
|
isBackreference bool // Whether or not current node is backreference
|
||||||
|
referredGroup int // If current node is a backreference, the node that it points to
|
||||||
// The following properties depend on the current match - I should think about resetting them for every match.
|
// The following properties depend on the current match - I should think about resetting them for every match.
|
||||||
zeroMatchFound bool // Whether or not the state has been used for a zero-length match - only relevant for zero states
|
threadGroups []Group // Assuming that a state is part of a 'thread' in the matching process, this array stores the indices of capturing groups in the current thread. As matches are found for this state, its groups will be copied over.
|
||||||
threadGroups []Group // Assuming that a state is part of a 'thread' in the matching process, this array stores the indices of capturing groups in the current thread. As matches are found for this state, its groups will be copied over.
|
threadBackref int // If current node is a backreference, how many characters to look forward into the referred group
|
||||||
}
|
}
|
||||||
|
|
||||||
// Clones the NFA starting from the given state.
|
// Clones the NFA starting from the given state.
|
||||||
@@ -75,14 +78,16 @@ func cloneStateHelper(stateToClone *nfaState, cloneMap map[*nfaState]*nfaState)
|
|||||||
isKleene: stateToClone.isKleene,
|
isKleene: stateToClone.isKleene,
|
||||||
isQuestion: stateToClone.isQuestion,
|
isQuestion: stateToClone.isQuestion,
|
||||||
isAlternation: stateToClone.isAlternation,
|
isAlternation: stateToClone.isAlternation,
|
||||||
|
isLazy: stateToClone.isLazy,
|
||||||
assert: stateToClone.assert,
|
assert: stateToClone.assert,
|
||||||
zeroMatchFound: stateToClone.zeroMatchFound,
|
|
||||||
allChars: stateToClone.allChars,
|
allChars: stateToClone.allChars,
|
||||||
except: append([]rune{}, stateToClone.except...),
|
except: append([]rune{}, stateToClone.except...),
|
||||||
lookaroundRegex: stateToClone.lookaroundRegex,
|
lookaroundRegex: stateToClone.lookaroundRegex,
|
||||||
groupEnd: stateToClone.groupEnd,
|
groupEnd: stateToClone.groupEnd,
|
||||||
groupBegin: stateToClone.groupBegin,
|
groupBegin: stateToClone.groupBegin,
|
||||||
groupNum: stateToClone.groupNum,
|
groupNum: stateToClone.groupNum,
|
||||||
|
isBackreference: stateToClone.isBackreference,
|
||||||
|
referredGroup: stateToClone.referredGroup,
|
||||||
}
|
}
|
||||||
cloneMap[stateToClone] = clone
|
cloneMap[stateToClone] = clone
|
||||||
for i, s := range stateToClone.output {
|
for i, s := range stateToClone.output {
|
||||||
@@ -122,6 +127,7 @@ func resetThreadsHelper(state *nfaState, visitedMap map[*nfaState]bool) {
|
|||||||
}
|
}
|
||||||
// Assuming it hasn't been visited
|
// Assuming it hasn't been visited
|
||||||
state.threadGroups = nil
|
state.threadGroups = nil
|
||||||
|
state.threadBackref = 0
|
||||||
visitedMap[state] = true
|
visitedMap[state] = true
|
||||||
if state.isAlternation {
|
if state.isAlternation {
|
||||||
resetThreadsHelper(state.next, visitedMap)
|
resetThreadsHelper(state.next, visitedMap)
|
||||||
@@ -419,6 +425,7 @@ func (s nfaState) equals(other nfaState) bool {
|
|||||||
s.next == other.next &&
|
s.next == other.next &&
|
||||||
s.isKleene == other.isKleene &&
|
s.isKleene == other.isKleene &&
|
||||||
s.isQuestion == other.isQuestion &&
|
s.isQuestion == other.isQuestion &&
|
||||||
|
s.isLazy == other.isLazy &&
|
||||||
s.isAlternation == other.isAlternation &&
|
s.isAlternation == other.isAlternation &&
|
||||||
s.splitState == other.splitState &&
|
s.splitState == other.splitState &&
|
||||||
s.assert == other.assert &&
|
s.assert == other.assert &&
|
||||||
@@ -428,7 +435,8 @@ func (s nfaState) equals(other nfaState) bool {
|
|||||||
s.groupBegin == other.groupBegin &&
|
s.groupBegin == other.groupBegin &&
|
||||||
s.groupEnd == other.groupEnd &&
|
s.groupEnd == other.groupEnd &&
|
||||||
s.groupNum == other.groupNum &&
|
s.groupNum == other.groupNum &&
|
||||||
slices.Equal(s.threadGroups, other.threadGroups)
|
slices.Equal(s.threadGroups, other.threadGroups) &&
|
||||||
|
s.threadBackref == other.threadBackref
|
||||||
}
|
}
|
||||||
|
|
||||||
func stateExists(list []nfaState, s nfaState) bool {
|
func stateExists(list []nfaState, s nfaState) bool {
|
||||||
|
|||||||
@@ -1,6 +1,8 @@
|
|||||||
package regex
|
package regex
|
||||||
|
|
||||||
import "fmt"
|
import (
|
||||||
|
"fmt"
|
||||||
|
)
|
||||||
|
|
||||||
type nodeType int
|
type nodeType int
|
||||||
|
|
||||||
@@ -20,6 +22,7 @@ const (
|
|||||||
assertionNode
|
assertionNode
|
||||||
lparenNode
|
lparenNode
|
||||||
rparenNode
|
rparenNode
|
||||||
|
backreferenceNode
|
||||||
)
|
)
|
||||||
|
|
||||||
// Helper constants for lookarounds
|
// Helper constants for lookarounds
|
||||||
@@ -31,15 +34,17 @@ const lookbehind = -1
|
|||||||
var infinite_reps int = -1 // Represents infinite reps eg. the end range in {5,}
|
var infinite_reps int = -1 // Represents infinite reps eg. the end range in {5,}
|
||||||
// This represents a node in the postfix representation of the expression
|
// This represents a node in the postfix representation of the expression
|
||||||
type postfixNode struct {
|
type postfixNode struct {
|
||||||
nodetype nodeType
|
nodetype nodeType
|
||||||
contents []rune // Contents of the node
|
contents []rune // Contents of the node
|
||||||
startReps int // Minimum number of times the node should be repeated - used with numeric specifiers
|
startReps int // Minimum number of times the node should be repeated - used with numeric specifiers
|
||||||
endReps int // Maximum number of times the node should be repeated - used with numeric specifiers
|
endReps int // Maximum number of times the node should be repeated - used with numeric specifiers
|
||||||
allChars bool // Whether or not the current node represents all characters (eg. dot metacharacter)
|
allChars bool // Whether or not the current node represents all characters (eg. dot metacharacter)
|
||||||
except []postfixNode // For inverted character classes, we match every unicode character _except_ a few. In this case, allChars is true and the exceptions are placed here.
|
except []postfixNode // For inverted character classes, we match every unicode character _except_ a few. In this case, allChars is true and the exceptions are placed here.
|
||||||
lookaroundSign int // ONLY USED WHEN nodetype == ASSERTION. Whether we have a positive or negative lookaround.
|
lookaroundSign int // ONLY USED WHEN nodetype == ASSERTION. Whether we have a positive or negative lookaround.
|
||||||
lookaroundDir int // Lookbehind or lookahead
|
lookaroundDir int // Lookbehind or lookahead
|
||||||
nodeContents []postfixNode // ONLY USED WHEN nodetype == CHARCLASS. Holds all the nodes inside the given CHARCLASS node.
|
nodeContents []postfixNode // ONLY USED WHEN nodetype == CHARCLASS. Holds all the nodes inside the given CHARCLASS node.
|
||||||
|
referencedGroup int // ONLY USED WHEN nodetype == backreferenceNode. Holds the group which this one refers to. After parsing is done, the expression will be rewritten eg. (a)\1 will become (a)(a). So the return value of ShuntingYard() shouldn't contain a backreferenceNode.
|
||||||
|
isLazy bool // ONLY USED WHEN nodetype == kleene or question
|
||||||
}
|
}
|
||||||
|
|
||||||
// Converts the given list of postfixNodes to one node of type CHARCLASS.
|
// Converts the given list of postfixNodes to one node of type CHARCLASS.
|
||||||
@@ -158,10 +163,19 @@ func newPostfixNode(contents ...rune) postfixNode {
|
|||||||
switch contents[0] {
|
switch contents[0] {
|
||||||
case '+':
|
case '+':
|
||||||
to_return.nodetype = plusNode
|
to_return.nodetype = plusNode
|
||||||
|
case lazyPlusRune:
|
||||||
|
to_return.nodetype = plusNode
|
||||||
|
to_return.isLazy = true
|
||||||
case '?':
|
case '?':
|
||||||
to_return.nodetype = questionNode
|
to_return.nodetype = questionNode
|
||||||
|
case lazyQuestionRune:
|
||||||
|
to_return.nodetype = questionNode
|
||||||
|
to_return.isLazy = true
|
||||||
case '*':
|
case '*':
|
||||||
to_return.nodetype = kleeneNode
|
to_return.nodetype = kleeneNode
|
||||||
|
case lazyKleeneRune:
|
||||||
|
to_return.nodetype = kleeneNode
|
||||||
|
to_return.isLazy = true
|
||||||
case '|':
|
case '|':
|
||||||
to_return.nodetype = pipeNode
|
to_return.nodetype = pipeNode
|
||||||
case concatRune:
|
case concatRune:
|
||||||
@@ -208,3 +222,44 @@ func newPostfixCharNode(contents ...rune) postfixNode {
|
|||||||
toReturn.contents = append(toReturn.contents, contents...)
|
toReturn.contents = append(toReturn.contents, contents...)
|
||||||
return toReturn
|
return toReturn
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// newPostfixBackreferenceNode creates and returns a backreference node, referring to the given group
|
||||||
|
func newPostfixBackreferenceNode(referred int) postfixNode {
|
||||||
|
toReturn := postfixNode{}
|
||||||
|
toReturn.startReps = 1
|
||||||
|
toReturn.endReps = 1
|
||||||
|
toReturn.nodetype = backreferenceNode
|
||||||
|
toReturn.referencedGroup = referred
|
||||||
|
return toReturn
|
||||||
|
}
|
||||||
|
|
||||||
|
// rewriteBackreferences rewrites any backreferences in the given postfixNode slice, into their respective groups.
|
||||||
|
// It stores the relation in a map, and returns it as the second return value.
|
||||||
|
// It uses parenIndices to determine where a group starts and ends in nodes.
|
||||||
|
// For example, \1(a) will be rewritten into (a)(a), and 1 -> 2 will be the hashmap value.
|
||||||
|
// It returns an error if a backreference points to an invalid group.
|
||||||
|
// func rewriteBackreferences(nodes []postfixNode, parenIndices []Group) ([]postfixNode, map[int]int, error) {
|
||||||
|
// rtv := make([]postfixNode, 0)
|
||||||
|
// referMap := make(map[int]int)
|
||||||
|
// numGroups := 0
|
||||||
|
// groupIncrement := 0 // If we have a backreference before the group its referring to, then the group its referring to will have its group number incremented.
|
||||||
|
// for i, node := range nodes {
|
||||||
|
// if node.nodetype == backreferenceNode {
|
||||||
|
// if node.referencedGroup >= len(parenIndices) {
|
||||||
|
// return nil, nil, fmt.Errorf("invalid backreference")
|
||||||
|
// }
|
||||||
|
// rtv = slices.Concat(rtv, nodes[parenIndices[node.referencedGroup].StartIdx:parenIndices[node.referencedGroup].EndIdx+1]) // Add all the nodes in the group to rtv
|
||||||
|
// numGroups += 1
|
||||||
|
// if i < parenIndices[node.referencedGroup].StartIdx {
|
||||||
|
// groupIncrement += 1
|
||||||
|
// }
|
||||||
|
// referMap[numGroups] = node.referencedGroup + groupIncrement
|
||||||
|
// } else {
|
||||||
|
// rtv = append(rtv, node)
|
||||||
|
// if node.nodetype == lparenNode {
|
||||||
|
// numGroups += 1
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
// return rtv, referMap, nil
|
||||||
|
// }
|
||||||
|
|||||||
268
regex/re_test.go
268
regex/re_test.go
@@ -117,6 +117,7 @@ var reTests = []struct {
|
|||||||
{`\d{3,4}`, nil, "ababab555", []Group{{6, 9}}},
|
{`\d{3,4}`, nil, "ababab555", []Group{{6, 9}}},
|
||||||
{`\bpaint\b`, nil, "paints", []Group{}},
|
{`\bpaint\b`, nil, "paints", []Group{}},
|
||||||
{`\b\w{5}\b`, nil, "paint", []Group{{0, 5}}},
|
{`\b\w{5}\b`, nil, "paint", []Group{{0, 5}}},
|
||||||
|
{`\w{}`, nil, "test", nil},
|
||||||
{`[^\w]`, nil, "abcdef1230[]qq';;'", []Group{{10, 11}, {11, 12}, {14, 15}, {15, 16}, {16, 17}, {17, 18}}},
|
{`[^\w]`, nil, "abcdef1230[]qq';;'", []Group{{10, 11}, {11, 12}, {14, 15}, {15, 16}, {16, 17}, {17, 18}}},
|
||||||
{`[^\W]`, nil, "abcdef1230[]qq';;'", []Group{{0, 1}, {1, 2}, {2, 3}, {3, 4}, {4, 5}, {5, 6}, {6, 7}, {7, 8}, {8, 9}, {9, 10}, {12, 13}, {13, 14}}},
|
{`[^\W]`, nil, "abcdef1230[]qq';;'", []Group{{0, 1}, {1, 2}, {2, 3}, {3, 4}, {4, 5}, {5, 6}, {6, 7}, {7, 8}, {8, 9}, {9, 10}, {12, 13}, {13, 14}}},
|
||||||
{`[\[\]]`, nil, "a[b[l]]", []Group{{1, 2}, {3, 4}, {5, 6}, {6, 7}}},
|
{`[\[\]]`, nil, "a[b[l]]", []Group{{1, 2}, {3, 4}, {5, 6}, {6, 7}}},
|
||||||
@@ -179,7 +180,7 @@ var reTests = []struct {
|
|||||||
{"[[:graph:]]+", nil, "abcdefghijklmnopqrstuvwyxzABCDEFGHIJKLMNOPRQSTUVWXYZ0123456789!@#$%^&*", []Group{{0, 70}}},
|
{"[[:graph:]]+", nil, "abcdefghijklmnopqrstuvwyxzABCDEFGHIJKLMNOPRQSTUVWXYZ0123456789!@#$%^&*", []Group{{0, 70}}},
|
||||||
|
|
||||||
// Test cases from Python's RE test suite
|
// Test cases from Python's RE test suite
|
||||||
{`[\1]`, nil, "\x01", []Group{{0, 1}}},
|
{`[\01]`, nil, "\x01", []Group{{0, 1}}},
|
||||||
|
|
||||||
{`\0`, nil, "\x00", []Group{{0, 1}}},
|
{`\0`, nil, "\x00", []Group{{0, 1}}},
|
||||||
{`[\0a]`, nil, "\x00", []Group{{0, 1}}},
|
{`[\0a]`, nil, "\x00", []Group{{0, 1}}},
|
||||||
@@ -194,7 +195,7 @@ var reTests = []struct {
|
|||||||
{`\x00ffffffffffffff`, nil, "\xff", []Group{}},
|
{`\x00ffffffffffffff`, nil, "\xff", []Group{}},
|
||||||
{`\x00f`, nil, "\x0f", []Group{}},
|
{`\x00f`, nil, "\x0f", []Group{}},
|
||||||
{`\x00fe`, nil, "\xfe", []Group{}},
|
{`\x00fe`, nil, "\xfe", []Group{}},
|
||||||
{`^\w+=(\\[\000-\277]|[^\n\\])*`, nil, "SRC=eval.c g.c blah blah blah \\\\\n\tapes.c", []Group{{0, 32}}},
|
{`^\w+=(\\[\000-\0277]|[^\n\\])*`, nil, "SRC=eval.c g.c blah blah blah \\\\\n\tapes.c", []Group{{0, 32}}},
|
||||||
|
|
||||||
{`a.b`, nil, `acb`, []Group{{0, 3}}},
|
{`a.b`, nil, `acb`, []Group{{0, 3}}},
|
||||||
{`a.b`, nil, "a\nb", []Group{}},
|
{`a.b`, nil, "a\nb", []Group{}},
|
||||||
@@ -312,11 +313,7 @@ var reTests = []struct {
|
|||||||
{`a[-]?c`, nil, `ac`, []Group{{0, 2}}},
|
{`a[-]?c`, nil, `ac`, []Group{{0, 2}}},
|
||||||
{`^(.+)?B`, nil, `AB`, []Group{{0, 2}}},
|
{`^(.+)?B`, nil, `AB`, []Group{{0, 2}}},
|
||||||
{`\0009`, nil, "\x009", []Group{{0, 2}}},
|
{`\0009`, nil, "\x009", []Group{{0, 2}}},
|
||||||
{`\141`, nil, "a", []Group{{0, 1}}},
|
{`\0141`, nil, "a", []Group{{0, 1}}},
|
||||||
|
|
||||||
// At this point, the python test suite has a bunch
|
|
||||||
// of backreference tests. Since my engine doesn't
|
|
||||||
// implement backreferences, I've skipped those tests.
|
|
||||||
|
|
||||||
{`*a`, nil, ``, nil},
|
{`*a`, nil, ``, nil},
|
||||||
{`(*)b`, nil, ``, nil},
|
{`(*)b`, nil, ``, nil},
|
||||||
@@ -433,7 +430,8 @@ var reTests = []struct {
|
|||||||
{`a[-]?c`, []ReFlag{RE_CASE_INSENSITIVE}, `AC`, []Group{{0, 2}}},
|
{`a[-]?c`, []ReFlag{RE_CASE_INSENSITIVE}, `AC`, []Group{{0, 2}}},
|
||||||
{`^(.+)?B`, []ReFlag{RE_CASE_INSENSITIVE}, `ab`, []Group{{0, 2}}},
|
{`^(.+)?B`, []ReFlag{RE_CASE_INSENSITIVE}, `ab`, []Group{{0, 2}}},
|
||||||
{`\0009`, []ReFlag{RE_CASE_INSENSITIVE}, "\x009", []Group{{0, 2}}},
|
{`\0009`, []ReFlag{RE_CASE_INSENSITIVE}, "\x009", []Group{{0, 2}}},
|
||||||
{`\141`, []ReFlag{RE_CASE_INSENSITIVE}, "A", []Group{{0, 1}}},
|
{`\0141`, []ReFlag{RE_CASE_INSENSITIVE}, "A", []Group{{0, 1}}},
|
||||||
|
{`\0141\0141`, []ReFlag{RE_CASE_INSENSITIVE}, "AA", []Group{{0, 2}}},
|
||||||
|
|
||||||
{`a[-]?c`, []ReFlag{RE_CASE_INSENSITIVE}, `AC`, []Group{{0, 2}}},
|
{`a[-]?c`, []ReFlag{RE_CASE_INSENSITIVE}, `AC`, []Group{{0, 2}}},
|
||||||
|
|
||||||
@@ -464,8 +462,10 @@ var reTests = []struct {
|
|||||||
{`[\D5]+`, nil, `1234abc5678`, []Group{{4, 8}}},
|
{`[\D5]+`, nil, `1234abc5678`, []Group{{4, 8}}},
|
||||||
{`[\da-fA-F]+`, nil, `123abc`, []Group{{0, 6}}},
|
{`[\da-fA-F]+`, nil, `123abc`, []Group{{0, 6}}},
|
||||||
{`\xff`, nil, "\u00ff", []Group{{0, 1}}},
|
{`\xff`, nil, "\u00ff", []Group{{0, 1}}},
|
||||||
|
{`\xff+`, nil, "\u00ff\u00ff", []Group{{0, 2}}},
|
||||||
{`\xFF`, nil, "\u00ff", []Group{{0, 1}}},
|
{`\xFF`, nil, "\u00ff", []Group{{0, 1}}},
|
||||||
{`\x00ff`, nil, "\u00ff", []Group{}},
|
{`\x00ff`, nil, "\u00ff", []Group{}},
|
||||||
|
{`\x{0000ff}+`, nil, "\u00ff\u00ff", []Group{{0, 2}}},
|
||||||
{`\x{0000ff}`, nil, "\u00ff", []Group{{0, 1}}},
|
{`\x{0000ff}`, nil, "\u00ff", []Group{{0, 1}}},
|
||||||
{`\x{0000FF}`, nil, "\u00ff", []Group{{0, 1}}},
|
{`\x{0000FF}`, nil, "\u00ff", []Group{{0, 1}}},
|
||||||
{"\t\n\v\r\f\a", nil, "\t\n\v\r\f\a", []Group{{0, 6}}},
|
{"\t\n\v\r\f\a", nil, "\t\n\v\r\f\a", []Group{{0, 6}}},
|
||||||
@@ -473,7 +473,7 @@ var reTests = []struct {
|
|||||||
{`[\t][\n][\v][\r][\f][\b]`, nil, "\t\n\v\r\f\b", []Group{{0, 6}}},
|
{`[\t][\n][\v][\r][\f][\b]`, nil, "\t\n\v\r\f\b", []Group{{0, 6}}},
|
||||||
{`.*d`, nil, "abc\nabd", []Group{{4, 7}}},
|
{`.*d`, nil, "abc\nabd", []Group{{4, 7}}},
|
||||||
{`(`, nil, "-", nil},
|
{`(`, nil, "-", nil},
|
||||||
{`[\41]`, nil, `!`, []Group{{0, 1}}},
|
{`[\041]`, nil, `!`, []Group{{0, 1}}},
|
||||||
{`(?<!abc)(d.f)`, nil, `abcdefdof`, []Group{{6, 9}}},
|
{`(?<!abc)(d.f)`, nil, `abcdefdof`, []Group{{6, 9}}},
|
||||||
{`[\w-]+`, nil, `laser_beam`, []Group{{0, 10}}},
|
{`[\w-]+`, nil, `laser_beam`, []Group{{0, 10}}},
|
||||||
{`M+`, []ReFlag{RE_CASE_INSENSITIVE}, `MMM`, []Group{{0, 3}}},
|
{`M+`, []ReFlag{RE_CASE_INSENSITIVE}, `MMM`, []Group{{0, 3}}},
|
||||||
@@ -489,7 +489,25 @@ var reTests = []struct {
|
|||||||
{`[b-e]`, nil, `f`, []Group{}},
|
{`[b-e]`, nil, `f`, []Group{}},
|
||||||
|
|
||||||
{`*?`, nil, `-`, nil},
|
{`*?`, nil, `-`, nil},
|
||||||
{`a*?`, nil, `-`, nil}, // non-greedy operators are not supported
|
{`a.+c`, nil, `abcabc`, []Group{{0, 6}}},
|
||||||
|
// Lazy quantifier tests
|
||||||
|
{`a.+?c`, nil, `abcabc`, []Group{{0, 3}, {3, 6}}},
|
||||||
|
{`ab*?bc`, []ReFlag{RE_CASE_INSENSITIVE}, `ABBBBC`, []Group{{0, 6}}},
|
||||||
|
{`ab+?bc`, []ReFlag{RE_CASE_INSENSITIVE}, `ABBC`, []Group{{0, 4}}},
|
||||||
|
{`ab??bc`, []ReFlag{RE_CASE_INSENSITIVE}, `ABBC`, []Group{{0, 4}}},
|
||||||
|
{`ab??bc`, []ReFlag{RE_CASE_INSENSITIVE}, `ABC`, []Group{{0, 3}}},
|
||||||
|
{`ab??bc`, []ReFlag{RE_CASE_INSENSITIVE}, `ABBBBC`, []Group{}},
|
||||||
|
{`ab??c`, []ReFlag{RE_CASE_INSENSITIVE}, `ABC`, []Group{{0, 3}}},
|
||||||
|
{`a.*?c`, []ReFlag{RE_CASE_INSENSITIVE}, `AXYZC`, []Group{{0, 5}}},
|
||||||
|
{`a.+?c`, []ReFlag{RE_CASE_INSENSITIVE}, `ABCABC`, []Group{{0, 3}, {3, 6}}},
|
||||||
|
{`a.*?c`, []ReFlag{RE_CASE_INSENSITIVE}, `ABCABC`, []Group{{0, 3}, {3, 6}}},
|
||||||
|
{`.*?\S *:`, nil, `xx:`, []Group{{0, 3}}},
|
||||||
|
{`a[ ]*? (\d+).*`, nil, `a 10`, []Group{{0, 6}}},
|
||||||
|
{`a[ ]*? (\d+).*`, nil, `a 10`, []Group{{0, 7}}},
|
||||||
|
{`"(?:\\"|[^"])*?"`, nil, `"\""`, []Group{{0, 4}}},
|
||||||
|
{`^.*?$`, nil, "one\ntwo\nthree", []Group{}},
|
||||||
|
{`a[^>]*?b`, nil, `a>b`, []Group{}},
|
||||||
|
{`^a*?$`, nil, `foo`, []Group{}},
|
||||||
|
|
||||||
// Numeric range tests - this is a feature that I added, and doesn't exist
|
// Numeric range tests - this is a feature that I added, and doesn't exist
|
||||||
// in any other mainstream regex engine
|
// in any other mainstream regex engine
|
||||||
@@ -520,6 +538,30 @@ var reTests = []struct {
|
|||||||
{`<389-400`, nil, `-`, nil},
|
{`<389-400`, nil, `-`, nil},
|
||||||
{`<389-400>`, nil, `391`, []Group{{0, 3}}},
|
{`<389-400>`, nil, `391`, []Group{{0, 3}}},
|
||||||
{`\b<1-10000>\b`, nil, `America declared independence in 1776.`, []Group{{33, 37}}},
|
{`\b<1-10000>\b`, nil, `America declared independence in 1776.`, []Group{{33, 37}}},
|
||||||
|
|
||||||
|
{`\p{Tamil}+`, nil, `உயிரெழுத்து`, []Group{{0, 11}}}, // Each letter and matra is counted as a separate rune, so 'u', 'ya', 'e (matra), 'ra', 'e (matra)', 'zha', (oo (matra), 'tha', 'ith', 'tha', 'oo (matra)'.
|
||||||
|
{`\P{Tamil}+`, nil, `vowel=உயிரெழுத்து`, []Group{{0, 6}}},
|
||||||
|
{`\P`, nil, `உயிரெழுத்து`, nil},
|
||||||
|
{`\PM\pM*`, nil, `உயிரெழுத்து`, []Group{{0, 1}, {1, 3}, {3, 5}, {5, 7}, {7, 9}, {9, 11}}},
|
||||||
|
{`\pN+`, nil, `123abc456def`, []Group{{0, 3}, {6, 9}}},
|
||||||
|
{`\PN+`, nil, `123abc456def`, []Group{{3, 6}, {9, 12}}},
|
||||||
|
{`[\p{Greek}\p{Cyrillic}]`, nil, `ΣωШД`, []Group{{0, 1}, {1, 2}, {2, 3}, {3, 4}}},
|
||||||
|
|
||||||
|
{`(?<=\().*?(?=\))`, nil, `(abc)`, []Group{{1, 4}}},
|
||||||
|
|
||||||
|
{`((a|b)\2)`, nil, `aa`, []Group{{0, 2}}},
|
||||||
|
{`((a|b)\2)`, nil, `bb`, []Group{{0, 2}}},
|
||||||
|
{`((a|b)\2)`, nil, `ab`, []Group{}},
|
||||||
|
{`((a|b)\2)`, nil, `ba`, []Group{}},
|
||||||
|
|
||||||
|
{`((a|b)\2){3}`, nil, `aaaaaa`, []Group{{0, 6}}},
|
||||||
|
{`((a|b)\2){3}`, nil, `bbbbbb`, []Group{{0, 6}}},
|
||||||
|
{`((a|b)\2){3}`, nil, `bbaaaa`, []Group{{0, 6}}},
|
||||||
|
{`((a|b)\2){3}`, nil, `aabbaa`, []Group{{0, 6}}},
|
||||||
|
{`((a|b)\2){3}`, nil, `aaaabb`, []Group{{0, 6}}},
|
||||||
|
{`((a|b)\2){3}`, nil, `bbaabb`, []Group{{0, 6}}},
|
||||||
|
{`((a|b)\2){3}`, nil, `baabab`, []Group{}},
|
||||||
|
{`((a|b)\2){3}`, nil, `bbabab`, []Group{}},
|
||||||
}
|
}
|
||||||
|
|
||||||
var groupTests = []struct {
|
var groupTests = []struct {
|
||||||
@@ -581,13 +623,37 @@ var groupTests = []struct {
|
|||||||
{`(bc+d$|ef*g.|h?i(j|k))`, nil, `bcdd`, []Match{}},
|
{`(bc+d$|ef*g.|h?i(j|k))`, nil, `bcdd`, []Match{}},
|
||||||
{`(bc+d$|ef*g.|h?i(j|k))`, nil, `reffgz`, []Match{[]Group{{1, 6}, {1, 6}}}},
|
{`(bc+d$|ef*g.|h?i(j|k))`, nil, `reffgz`, []Match{[]Group{{1, 6}, {1, 6}}}},
|
||||||
{`(((((((((a)))))))))`, nil, `a`, []Match{[]Group{{0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}},
|
{`(((((((((a)))))))))`, nil, `a`, []Match{[]Group{{0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}},
|
||||||
{`(((((((((a)))))))))\41`, nil, `a!`, []Match{[]Group{{0, 2}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}},
|
{`(((((((((a)))))))))\041`, nil, `a!`, []Match{[]Group{{0, 2}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}},
|
||||||
{`(.*)c(.*)`, nil, `abcde`, []Match{[]Group{{0, 5}, {0, 2}, {3, 5}}}},
|
{`(.*)c(.*)`, nil, `abcde`, []Match{[]Group{{0, 5}, {0, 2}, {3, 5}}}},
|
||||||
{`\((.*), (.*)\)`, nil, `(a, b)`, []Match{[]Group{{0, 6}, {1, 2}, {4, 5}}}},
|
{`\((.*), (.*)\)`, nil, `(a, b)`, []Match{[]Group{{0, 6}, {1, 2}, {4, 5}}}},
|
||||||
|
|
||||||
// At this point, the python test suite has a bunch
|
// Backreference tests
|
||||||
// of backreference tests. Since my engine doesn't
|
{`(abc)\1`, nil, `abcabc`, []Match{[]Group{{0, 6}, {0, 3}}}},
|
||||||
// implement backreferences, I've skipped those tests.
|
{`([a-c]+)\1`, nil, `abcabc`, []Match{[]Group{{0, 6}, {0, 3}}}},
|
||||||
|
{`([a-c]*)\1`, nil, `abcabc`, []Match{[]Group{{0, 6}, {0, 3}}, []Group{{6, 6}, {6, 6}}}},
|
||||||
|
{`^(.+)?B`, nil, `AB`, []Match{[]Group{{0, 2}, {0, 1}}}},
|
||||||
|
{`(a+).\1$`, nil, `aaaaa`, []Match{[]Group{{0, 5}, {0, 2}}}},
|
||||||
|
{`^(a+).\1$`, nil, `aaaa`, []Match{}},
|
||||||
|
{`(a)\1`, nil, `aa`, []Match{[]Group{{0, 2}, {0, 1}}}},
|
||||||
|
{`(a+)\1`, nil, `aa`, []Match{[]Group{{0, 2}, {0, 1}}}},
|
||||||
|
{`(a+)+\1`, nil, `aa`, []Match{[]Group{{0, 2}, {0, 1}}}},
|
||||||
|
{`(a).+\1`, nil, `aba`, []Match{[]Group{{0, 3}, {0, 1}}}},
|
||||||
|
{`(a)ba*\1`, nil, `aba`, []Match{[]Group{{0, 3}, {0, 1}}}},
|
||||||
|
{`(aa|a)a\1$`, nil, `aaa`, []Match{[]Group{{0, 3}, {0, 1}}}},
|
||||||
|
{`(a|aa)a\1$`, nil, `aaa`, []Match{[]Group{{0, 3}, {0, 1}}}},
|
||||||
|
{`(a+)a\1$`, nil, `aaa`, []Match{[]Group{{0, 3}, {0, 1}}}},
|
||||||
|
{`([abc]*)\1`, nil, `abcabc`, []Match{[]Group{{0, 6}, {0, 3}}, []Group{{6, 6}, {6, 6}}}},
|
||||||
|
{`(a)(?:b)\1`, nil, `aba`, []Match{[]Group{{0, 3}, {0, 1}}}},
|
||||||
|
{`(a)(?:b)\1`, nil, `abb`, []Match{}},
|
||||||
|
{`(?:a)(b)\1`, nil, `aba`, []Match{}},
|
||||||
|
{`(?:a)(b)\1`, nil, `abb`, []Match{[]Group{{0, 3}, {1, 2}}}},
|
||||||
|
{`(?:(cat)|(dog))\2`, nil, `catdog`, []Match{}},
|
||||||
|
{`(?:a)\1`, nil, `aa`, nil},
|
||||||
|
{`((cat)|(dog)|(cow)|(bat))\4`, nil, `cowcow`, []Match{[]Group{{0, 6}, {0, 3}, {-1, -1}, {-1, -1}, {0, 3}, {-1, -1}}}},
|
||||||
|
{`(a|b)*\1`, nil, `abb`, []Match{[]Group{{0, 3}, {1, 2}}}},
|
||||||
|
{`(a|b)*\1`, nil, `aba`, []Match{}},
|
||||||
|
{`(a|b)*\1`, nil, `bab`, []Match{}},
|
||||||
|
{`(a|b)*\1`, nil, `baa`, []Match{[]Group{{0, 3}, {1, 2}}}},
|
||||||
|
|
||||||
{`(a)(b)c|ab`, nil, `ab`, []Match{[]Group{{0, 2}}}},
|
{`(a)(b)c|ab`, nil, `ab`, []Match{[]Group{{0, 2}}}},
|
||||||
{`(a)+x`, nil, `aaax`, []Match{[]Group{{0, 4}, {2, 3}}}},
|
{`(a)+x`, nil, `aaax`, []Match{[]Group{{0, 4}, {2, 3}}}},
|
||||||
@@ -636,7 +702,7 @@ var groupTests = []struct {
|
|||||||
{`(bc+d$|ef*g.|h?i(j|k))`, []ReFlag{RE_CASE_INSENSITIVE}, `BCDD`, []Match{}},
|
{`(bc+d$|ef*g.|h?i(j|k))`, []ReFlag{RE_CASE_INSENSITIVE}, `BCDD`, []Match{}},
|
||||||
{`(bc+d$|ef*g.|h?i(j|k))`, []ReFlag{RE_CASE_INSENSITIVE}, `reffgz`, []Match{[]Group{{1, 6}, {1, 6}}}},
|
{`(bc+d$|ef*g.|h?i(j|k))`, []ReFlag{RE_CASE_INSENSITIVE}, `reffgz`, []Match{[]Group{{1, 6}, {1, 6}}}},
|
||||||
{`(((((((((a)))))))))`, []ReFlag{RE_CASE_INSENSITIVE}, `A`, []Match{[]Group{{0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}},
|
{`(((((((((a)))))))))`, []ReFlag{RE_CASE_INSENSITIVE}, `A`, []Match{[]Group{{0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}},
|
||||||
{`(((((((((a)))))))))\41`, []ReFlag{RE_CASE_INSENSITIVE}, `A!`, []Match{[]Group{{0, 2}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}},
|
{`(((((((((a)))))))))\041`, []ReFlag{RE_CASE_INSENSITIVE}, `A!`, []Match{[]Group{{0, 2}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}},
|
||||||
{`(.*)c(.*)`, []ReFlag{RE_CASE_INSENSITIVE}, `ABCDE`, []Match{[]Group{{0, 5}, {0, 2}, {3, 5}}}},
|
{`(.*)c(.*)`, []ReFlag{RE_CASE_INSENSITIVE}, `ABCDE`, []Match{[]Group{{0, 5}, {0, 2}, {3, 5}}}},
|
||||||
{`\((.*), (.*)\)`, []ReFlag{RE_CASE_INSENSITIVE}, `(A, B)`, []Match{[]Group{{0, 6}, {1, 2}, {4, 5}}}},
|
{`\((.*), (.*)\)`, []ReFlag{RE_CASE_INSENSITIVE}, `(A, B)`, []Match{[]Group{{0, 6}, {1, 2}, {4, 5}}}},
|
||||||
{`(a)(b)c|ab`, []ReFlag{RE_CASE_INSENSITIVE}, `AB`, []Match{[]Group{{0, 2}}}},
|
{`(a)(b)c|ab`, []ReFlag{RE_CASE_INSENSITIVE}, `AB`, []Match{[]Group{{0, 2}}}},
|
||||||
@@ -688,6 +754,18 @@ var groupTests = []struct {
|
|||||||
// {`(a|ab|c|bcd)*(d*)`, nil, `ababcd`, []Match{[]Group{{0, 6}, {3, 6}, {6, 6}}, []Group{{6, 6}, {6, 6}, {6, 6}}}},
|
// {`(a|ab|c|bcd)*(d*)`, nil, `ababcd`, []Match{[]Group{{0, 6}, {3, 6}, {6, 6}}, []Group{{6, 6}, {6, 6}, {6, 6}}}},
|
||||||
// // Bug - this should give {0,3},{0,3},{0,0},{0,3},{3,3} but it gives {0,3},{0,2},{0,1},{1,2},{2,3}
|
// // Bug - this should give {0,3},{0,3},{0,0},{0,3},{3,3} but it gives {0,3},{0,2},{0,1},{1,2},{2,3}
|
||||||
// // {`((a*)(b|abc))(c*)`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 3}, {0, 0}, {0, 3}, {3, 3}}}},
|
// // {`((a*)(b|abc))(c*)`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 3}, {0, 0}, {0, 3}, {3, 3}}}},
|
||||||
|
|
||||||
|
// Lazy quantifier tests
|
||||||
|
{`a(?:b|c|d)+?(.)`, nil, `ace`, []Match{[]Group{{0, 3}, {2, 3}}}},
|
||||||
|
{`a(?:b|(c|e){1,2}?|d)+?(.)`, nil, `ace`, []Match{[]Group{{0, 3}, {1, 2}, {2, 3}}}},
|
||||||
|
{`(?<!-):(.*?)(?<!-):`, nil, `a:bc-:de:f`, []Match{[]Group{{1, 9}, {2, 8}}}},
|
||||||
|
{`(?<!\\):(.*?)(?<!\\):`, nil, `a:bc\:de:f`, []Match{[]Group{{1, 9}, {2, 8}}}},
|
||||||
|
{`(?<!\?)'(.*?)(?<!\?)'`, nil, `a'bc?'de'f`, []Match{[]Group{{1, 9}, {2, 8}}}},
|
||||||
|
{`.*?x\s*\z(.*)`, []ReFlag{RE_MULTILINE, RE_SINGLE_LINE}, "xx\nx\n", []Match{[]Group{{0, 5}, {5, 5}}}},
|
||||||
|
{`.*?x\s*\z(.*)`, []ReFlag{RE_MULTILINE}, "xx\nx\n", []Match{[]Group{{3, 5}, {5, 5}}}},
|
||||||
|
{`^([ab]*?)(?=(b)?)c`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 2}, {-1, -1}}}},
|
||||||
|
{`^([ab]*?)(?!(b))c`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 2}, {-1, -1}}}},
|
||||||
|
{`^([ab]*?)(?<!(a))c`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 2}, {-1, -1}}}},
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestFind(t *testing.T) {
|
func TestFind(t *testing.T) {
|
||||||
@@ -792,23 +870,24 @@ func TestFindSubmatch(t *testing.T) {
|
|||||||
if test.result != nil {
|
if test.result != nil {
|
||||||
panic(err)
|
panic(err)
|
||||||
}
|
}
|
||||||
}
|
} else {
|
||||||
match, err := regComp.FindSubmatch(test.str)
|
match, err := regComp.FindSubmatch(test.str)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if len(test.result) != 0 {
|
if len(test.result) != 0 {
|
||||||
t.Errorf("Wanted %v got no match\n", test.result[0])
|
t.Errorf("Wanted %v got no match\n", test.result[0])
|
||||||
}
|
|
||||||
} else if len(test.result) == 0 {
|
|
||||||
t.Errorf("Wanted no match got %v\n", match)
|
|
||||||
}
|
|
||||||
for i := range match {
|
|
||||||
if match[i].IsValid() {
|
|
||||||
if test.result[0][i] != match[i] {
|
|
||||||
t.Errorf("Wanted %v Got %v\n", test.result[0], match)
|
|
||||||
}
|
}
|
||||||
} else {
|
} else if len(test.result) == 0 {
|
||||||
if i < len(test.result) && test.result[0][i].IsValid() {
|
t.Errorf("Wanted no match got %v\n", match)
|
||||||
t.Errorf("Wanted %v Got %v\n", test.result[0], match)
|
}
|
||||||
|
for i := range match {
|
||||||
|
if match[i].IsValid() {
|
||||||
|
if test.result[0][i] != match[i] {
|
||||||
|
t.Errorf("Wanted %v Got %v\n", test.result[0], match)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if i < len(test.result) && test.result[0][i].IsValid() {
|
||||||
|
t.Errorf("Wanted %v Got %v\n", test.result[0], match)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -823,10 +902,22 @@ func TestFindStringSubmatch(t *testing.T) {
|
|||||||
if test.result != nil {
|
if test.result != nil {
|
||||||
panic(err)
|
panic(err)
|
||||||
}
|
}
|
||||||
}
|
} else {
|
||||||
matchStr := regComp.FindStringSubmatch(test.str)
|
matchStr := regComp.FindStringSubmatch(test.str)
|
||||||
if matchStr == nil {
|
if matchStr == nil {
|
||||||
if len(test.result) != 0 {
|
if len(test.result) != 0 {
|
||||||
|
expectedStr := funcMap(test.result[0], func(g Group) string {
|
||||||
|
if g.IsValid() {
|
||||||
|
return test.str[g.StartIdx:g.EndIdx]
|
||||||
|
} else {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
})
|
||||||
|
t.Errorf("Wanted %v got no match\n", expectedStr)
|
||||||
|
}
|
||||||
|
} else if len(test.result) == 0 {
|
||||||
|
t.Errorf("Wanted no match got %v\n", matchStr)
|
||||||
|
} else {
|
||||||
expectedStr := funcMap(test.result[0], func(g Group) string {
|
expectedStr := funcMap(test.result[0], func(g Group) string {
|
||||||
if g.IsValid() {
|
if g.IsValid() {
|
||||||
return test.str[g.StartIdx:g.EndIdx]
|
return test.str[g.StartIdx:g.EndIdx]
|
||||||
@@ -834,26 +925,15 @@ func TestFindStringSubmatch(t *testing.T) {
|
|||||||
return ""
|
return ""
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
t.Errorf("Wanted %v got no match\n", expectedStr)
|
for i, groupStr := range matchStr {
|
||||||
}
|
if groupStr == "" {
|
||||||
} else if len(test.result) == 0 {
|
if i < len(expectedStr) && expectedStr[i] != "" {
|
||||||
t.Errorf("Wanted no match got %v\n", matchStr)
|
t.Errorf("Wanted %v Got %v\n", expectedStr, matchStr)
|
||||||
} else {
|
}
|
||||||
expectedStr := funcMap(test.result[0], func(g Group) string {
|
} else {
|
||||||
if g.IsValid() {
|
if expectedStr[i] != groupStr {
|
||||||
return test.str[g.StartIdx:g.EndIdx]
|
t.Errorf("Wanted %v Got %v\n", expectedStr, matchStr)
|
||||||
} else {
|
}
|
||||||
return ""
|
|
||||||
}
|
|
||||||
})
|
|
||||||
for i, groupStr := range matchStr {
|
|
||||||
if groupStr == "" {
|
|
||||||
if i < len(expectedStr) && expectedStr[i] != "" {
|
|
||||||
t.Errorf("Wanted %v Got %v\n", expectedStr, matchStr)
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
if expectedStr[i] != groupStr {
|
|
||||||
t.Errorf("Wanted %v Got %v\n", expectedStr, matchStr)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -861,6 +941,61 @@ func TestFindStringSubmatch(t *testing.T) {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestFindAllStringSubmatch(t *testing.T) {
|
||||||
|
for _, test := range groupTests {
|
||||||
|
t.Run(test.re+" "+test.str, func(t *testing.T) {
|
||||||
|
regComp, err := Compile(test.re, test.flags...)
|
||||||
|
if err != nil {
|
||||||
|
if test.result != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
matchStrs := regComp.FindAllStringSubmatch(test.str)
|
||||||
|
if matchStrs == nil {
|
||||||
|
if len(test.result) != 0 {
|
||||||
|
expectedStrs := funcMap(test.result, func(m Match) []string {
|
||||||
|
return funcMap(m, func(g Group) string {
|
||||||
|
if g.IsValid() {
|
||||||
|
return test.str[g.StartIdx:g.EndIdx]
|
||||||
|
} else {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
})
|
||||||
|
})
|
||||||
|
t.Errorf("Wanted %v got no match\n", expectedStrs)
|
||||||
|
}
|
||||||
|
} else if len(test.result) == 0 {
|
||||||
|
t.Errorf("Wanted no match got %v\n", matchStrs)
|
||||||
|
} else {
|
||||||
|
expectedStrs := funcMap(test.result, func(m Match) []string {
|
||||||
|
return funcMap(m, func(g Group) string {
|
||||||
|
if g.IsValid() {
|
||||||
|
return test.str[g.StartIdx:g.EndIdx]
|
||||||
|
} else {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
})
|
||||||
|
})
|
||||||
|
for i, matchStr := range matchStrs {
|
||||||
|
for j, groupStr := range matchStr {
|
||||||
|
if groupStr == "" {
|
||||||
|
if j < len(expectedStrs[i]) && expectedStrs[i][j] != "" {
|
||||||
|
t.Errorf("Wanted %v Got %v\n", expectedStrs, matchStrs)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if expectedStrs[i][j] != groupStr {
|
||||||
|
t.Errorf("Wanted %v Got %v\n", expectedStrs, matchStrs)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestFindAllSubmatch(t *testing.T) {
|
func TestFindAllSubmatch(t *testing.T) {
|
||||||
for _, test := range groupTests {
|
for _, test := range groupTests {
|
||||||
t.Run(test.re+" "+test.str, func(t *testing.T) {
|
t.Run(test.re+" "+test.str, func(t *testing.T) {
|
||||||
@@ -869,17 +1004,18 @@ func TestFindAllSubmatch(t *testing.T) {
|
|||||||
if test.result != nil {
|
if test.result != nil {
|
||||||
panic(err)
|
panic(err)
|
||||||
}
|
}
|
||||||
}
|
} else {
|
||||||
matchIndices := regComp.FindAllSubmatch(test.str)
|
matchIndices := regComp.FindAllSubmatch(test.str)
|
||||||
for i := range matchIndices {
|
for i := range matchIndices {
|
||||||
for j := range matchIndices[i] {
|
for j := range matchIndices[i] {
|
||||||
if matchIndices[i][j].IsValid() {
|
if matchIndices[i][j].IsValid() {
|
||||||
if test.result[i][j] != matchIndices[i][j] {
|
if test.result[i][j] != matchIndices[i][j] {
|
||||||
t.Errorf("Wanted %v Got %v\n", test.result, matchIndices)
|
t.Errorf("Wanted %v Got %v\n", test.result, matchIndices)
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if i < len(test.result) && j < len(test.result[i]) && test.result[i][j].IsValid() {
|
if i < len(test.result) && j < len(test.result[i]) && test.result[i][j].IsValid() {
|
||||||
t.Errorf("Wanted %v Got %v\n", test.result, matchIndices)
|
t.Errorf("Wanted %v Got %v\n", test.result, matchIndices)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -4,4 +4,5 @@
|
|||||||
Ideas for flags:
|
Ideas for flags:
|
||||||
-m <num> : Print <num>th match (-m 1 = first match, -m 2 = second match)
|
-m <num> : Print <num>th match (-m 1 = first match, -m 2 = second match)
|
||||||
-g <num> : Print the <num>th group
|
-g <num> : Print the <num>th group
|
||||||
|
-r : Specify a directory instead of a file, reads recursively
|
||||||
4. Refactor code for flags - make each flag's code a function, which modifies the result of findAllMatches
|
4. Refactor code for flags - make each flag's code a function, which modifies the result of findAllMatches
|
||||||
|
|||||||
Reference in New Issue
Block a user