Compare commits
193 Commits
ca8d32cd7f
...
master
Author | SHA1 | Date | |
---|---|---|---|
3c61ab16ae | |||
17e897e2d6 | |||
777c590072 | |||
1d32865e76 | |||
66f4ca31d1 | |||
d81c72590a | |||
83632f2abc | |||
fc0af1ccc5 | |||
980fb77114 | |||
4c4d747a9c | |||
595b86df60 | |||
5f9bab528a | |||
530564b920 | |||
02b3b469c4 | |||
e489dc4c27 | |||
34149980a4 | |||
e79c19a929 | |||
d2bce37935 | |||
bb3b866b77 | |||
e07f27dc78 | |||
65d2317f79 | |||
a631fc289c | |||
d62a429cce | |||
7b31031553 | |||
38c842cb07 | |||
9f9af36be8 | |||
8217b67122 | |||
1f06dcef64 | |||
119475b41b | |||
6151cc8cf6 | |||
3eaf4eb19c | |||
d453815831 | |||
3a2916baae | |||
9d6344719f | |||
f5c868566b | |||
1cd6da218f | |||
277cbc0fc5 | |||
3924502b72 | |||
36b009747b | |||
6cd0a10a8f | |||
69fb96c43d | |||
46bc0c8529 | |||
1a890a1e75 | |||
fde3784e5a | |||
7045711860 | |||
d4d606d95b | |||
9cd330e521 | |||
44d6a2005c | |||
f76cd6c3d9 | |||
375baa1722 | |||
2e47c631bb | |||
81b8b1b11c | |||
2934e7a20f | |||
f466d4a8d5 | |||
8327450dd2 | |||
073f231b89 | |||
3b7257c921 | |||
668df8b70a | |||
214acf7e0f | |||
50221ff4d9 | |||
5ab95f512a | |||
e7da678408 | |||
ab363e2766 | |||
c803e45415 | |||
525296f239 | |||
eb0ab9f7ec | |||
17a7dbae4c | |||
f2279acd98 | |||
662527c478 | |||
d1958f289c | |||
15ee49f42e | |||
b60ded4136 | |||
9fbb99f86c | |||
af15904f3b | |||
d522f50b50 | |||
fb47e082eb | |||
1f5a363539 | |||
9e12f9dcb3 | |||
47f88c817f | |||
835d495990 | |||
76e0170cb9 | |||
d172a58258 | |||
7231169270 | |||
e546f01c20 | |||
b7467a00f1 | |||
c6ad4caa0d | |||
6334435b83 | |||
78fb5606dd | |||
eddd2ae700 | |||
c577064977 | |||
d4e3942d27 | |||
f15a5cae34 | |||
62ca1a872a | |||
99230b49de | |||
22ead83625 | |||
3604486a9b | |||
052de55826 | |||
d2ad0d95a8 | |||
ccf3b3b299 | |||
1d4f695f8f | |||
8534174ea1 | |||
ed4ffde64e | |||
fbc9bea9fb | |||
cca8c7cda2 | |||
858e535fba | |||
7c62ba6bfd | |||
d4e8cb74fd | |||
3ce611d121 | |||
e0253dfaf3 | |||
753e973d82 | |||
5563a70568 | |||
de0d7345a8 | |||
ad273b0c68 | |||
e167cdb2cb | |||
1fd48ae614 | |||
09812956ac | |||
fbc9dfcc95 | |||
bc32e0cb76 | |||
ad0f7d0178 | |||
4e597f8eb1 | |||
ef476e8875 | |||
7e6b02632f | |||
f94e3f2e71 | |||
b129d83c3f | |||
43aa7b5876 | |||
9a3bfca313 | |||
b6ab54f6dd | |||
6a96c98d04 | |||
3cfc2a6854 | |||
5d7a02e796 | |||
a46d2f4546 | |||
c88ebd1aa5 | |||
fd102292c6 | |||
6d692d0dfc | |||
7c4538a259 | |||
2a9ae0b68a | |||
783ae2ad10 | |||
b5e6bc112c | |||
206fea34cd | |||
fcdb23524a | |||
ac936659b6 | |||
e6dba9fdcf | |||
30779a446b | |||
f629a0f08f | |||
6869cd00a2 | |||
02bc8f30a2 | |||
ac05bceda3 | |||
037ac75ea6 | |||
e9d4e857cf | |||
b685d2fd5f | |||
8eda5055ff | |||
45b6566b2c | |||
e22822e619 | |||
692de2a32b | |||
0d19664044 | |||
1bfb09b6c7 | |||
b0b8bf23af | |||
00570f07fe | |||
7431b1a7b2 | |||
ee51e39d59 | |||
db7c884b83 | |||
c3059fe899 | |||
4f577592ba | |||
b734d61a03 | |||
00c39857eb | |||
aa9e2324ee | |||
66b96bf9e8 | |||
0ac39bfb7b | |||
dbc9fe2c3b | |||
eeeb9387d5 | |||
57eb935bd1 | |||
cbd679949f | |||
a63426d965 | |||
2e3450285c | |||
7e792f1248 | |||
b8f5b9af7c | |||
be60f2fb51 | |||
7aee4280cc | |||
e01ef48cbc | |||
93474c5159 | |||
d81b2ddaaa | |||
429d286439 | |||
198a2c12a7 | |||
7e88b8a4b0 | |||
af5b6ebe08 | |||
289bba35e2 | |||
7e6377a4c4 | |||
73c6a442ce | |||
ca8f8e1030 | |||
24a5045ebe | |||
f6d56b74e1 | |||
dc53951408 | |||
aef8152fc1 |
3
.gitignore
vendored
3
.gitignore
vendored
@@ -1,2 +1 @@
|
||||
re
|
||||
|
||||
kg/kg
|
||||
|
11
LICENSE
Normal file
11
LICENSE
Normal file
@@ -0,0 +1,11 @@
|
||||
The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2025 Aadhavan Srinivasan
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
16
Makefile
16
Makefile
@@ -1,9 +1,17 @@
|
||||
.DEFAULT_GOAL := build
|
||||
.PHONY: fmt vet build
|
||||
.DEFAULT_GOAL := buildCmd
|
||||
.PHONY: fmt vet buildLib buildCmd test
|
||||
|
||||
fmt:
|
||||
go fmt ./...
|
||||
vet: fmt
|
||||
go vet ./...
|
||||
build: vet
|
||||
go build -gcflags="-N -l" ./...
|
||||
buildLibUnopt: vet
|
||||
go build -gcflags="all=-N -l" ./...
|
||||
unopt: buildLibUnopt
|
||||
go build -C kg/ -gcflags="all=-N -l" -o kg ./...
|
||||
buildLib: vet
|
||||
go build ./...
|
||||
buildCmd: buildLib
|
||||
go build -C kg/ -o kg ./...
|
||||
test: buildCmd
|
||||
go test -v ./...
|
||||
|
21
README.md
Normal file
21
README.md
Normal file
@@ -0,0 +1,21 @@
|
||||
## Kleingrep
|
||||
|
||||
Kleingrep is a regular expression engine, providing a library and command-line tool written in Go.
|
||||
|
||||
It aims to provide a more featureful engine, compared to the one in Go's
|
||||
[regexp](https://pkg.go.dev/regexp), while retaining some semblance of efficiency.
|
||||
|
||||
The engine does __not__ use backtracking, relying on the NFA-based method described in
|
||||
[Russ Cox's articles](https://swtch.com/~rsc/regexp). As such, it is immune to catastrophic backtracking.
|
||||
|
||||
It also includes features not present in regexp, such as lookarounds and backreferences.
|
||||
|
||||
### Syntax
|
||||
|
||||
The syntax is, for the most part, a superset of Go's regexp. A full overview of the syntax can be found [here](https://pkg.go.dev/gitea.twomorecents.org/Rockingcool/kleingrep/regex#hdr-Syntax).
|
||||
|
||||
__For more information, see https://pkg.go.dev/gitea.twomorecents.org/Rockingcool/kleingrep/regex__.
|
||||
|
||||
### How it works
|
||||
|
||||
I've written about the inner workings of the engine [on my blog](https://twomorecents.org/writing-regex-engine/index.html).
|
225
cmd/main.go
225
cmd/main.go
@@ -1,225 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"flag"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
|
||||
"github.com/fatih/color"
|
||||
|
||||
"gitea.twomorecents.org/Rockingcool/kg/greg"
|
||||
)
|
||||
|
||||
func main() {
|
||||
// Flags for the regex Compile function
|
||||
flagsToCompile := make([]greg.ReFlag, 0)
|
||||
|
||||
invertFlag := flag.Bool("v", false, "Invert match.")
|
||||
// This flag has two 'modes':
|
||||
// 1. Without '-v': Prints only matches. Prints a newline after every match.
|
||||
// 2. With '-v': Substitutes all matches with empty string.
|
||||
onlyFlag := flag.Bool("o", false, "Print only colored content. Overrides -l.")
|
||||
lineFlag := flag.Bool("l", false, "Only print lines with a match (or with no matches, if -v is enabled). Similar to grep's default.")
|
||||
multiLineFlag := flag.Bool("t", false, "Multi-line mode. Treats newline just like any character.")
|
||||
printMatchesFlag := flag.Bool("p", false, "Prints start and end index of each match. Can only be used with '-t' for multi-line mode.")
|
||||
caseInsensitiveFlag := flag.Bool("i", false, "Case-insensitive. Disregard the case of all characters.")
|
||||
matchNum := flag.Int("m", 0, "Print the match with the given index. Eg. -m 3 prints the third match.")
|
||||
substituteText := flag.String("s", "", "Substitute the contents of each match with the given string. Overrides -o and -v")
|
||||
flag.Parse()
|
||||
|
||||
// These flags have to be passed to the Compile function
|
||||
if *multiLineFlag {
|
||||
flagsToCompile = append(flagsToCompile, greg.RE_MULTILINE, greg.RE_SINGLE_LINE)
|
||||
}
|
||||
if *caseInsensitiveFlag {
|
||||
flagsToCompile = append(flagsToCompile, greg.RE_CASE_INSENSITIVE)
|
||||
}
|
||||
|
||||
// -l and -o are mutually exclusive: -o overrides -l
|
||||
if *onlyFlag {
|
||||
*lineFlag = false
|
||||
}
|
||||
// Check if substitute and matchNum flags have been enabled
|
||||
substituteFlagEnabled := false
|
||||
matchNumFlagEnabled := false
|
||||
flag.Visit(func(f *flag.Flag) {
|
||||
if f.Name == "s" {
|
||||
substituteFlagEnabled = true
|
||||
}
|
||||
if f.Name == "m" {
|
||||
matchNumFlagEnabled = true
|
||||
}
|
||||
})
|
||||
|
||||
// Validate matchNumFlag - must be positive integer
|
||||
if matchNumFlagEnabled && *matchNum < 1 {
|
||||
panic("Invalid match number to print.")
|
||||
}
|
||||
|
||||
// Process:
|
||||
// 1. Convert regex into postfix notation (Shunting-Yard algorithm)
|
||||
// a. Add explicit concatenation operators to facilitate this
|
||||
// 2. Build NFA from postfix representation (Thompson's algorithm)
|
||||
// 3. Run the string against the NFA
|
||||
|
||||
if len(flag.Args()) != 1 { // flag.Args() also strips out program name
|
||||
fmt.Println("ERROR: Missing cmdline args")
|
||||
os.Exit(22)
|
||||
}
|
||||
var re string
|
||||
re = flag.Args()[0]
|
||||
var test_str string
|
||||
var err error
|
||||
var linesRead bool // Whether or not we have read the lines in the file
|
||||
lineNum := 0 // Current line number
|
||||
// Create reader for stdin and writer for stdout
|
||||
reader := bufio.NewReader(os.Stdin)
|
||||
out := bufio.NewWriter(os.Stdout)
|
||||
|
||||
regComp, err := greg.Compile(re, flagsToCompile...)
|
||||
if err != nil {
|
||||
fmt.Println(err)
|
||||
return
|
||||
}
|
||||
for true {
|
||||
if linesRead {
|
||||
break
|
||||
}
|
||||
if !(*multiLineFlag) {
|
||||
// Read every string from stdin until we encounter an error. If the error isn't EOF, panic.
|
||||
test_str, err = reader.ReadString('\n')
|
||||
lineNum++
|
||||
if err != nil {
|
||||
if err == io.EOF {
|
||||
linesRead = true
|
||||
} else {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
if len(test_str) > 0 && test_str[len(test_str)-1] == '\n' {
|
||||
test_str = test_str[:len(test_str)-1]
|
||||
}
|
||||
} else {
|
||||
// Multi-line mode - read every line of input into a temp. string.
|
||||
// test_str will contain all lines of input (including newline characters)
|
||||
// as one string.
|
||||
var temp string
|
||||
for temp, err = reader.ReadString('\n'); err == nil; temp, err = reader.ReadString('\n') {
|
||||
test_str += temp
|
||||
}
|
||||
// Assuming err != nil
|
||||
if err == io.EOF {
|
||||
if len(temp) > 0 {
|
||||
test_str += temp // Add the last line (if it is non-empty)
|
||||
}
|
||||
linesRead = true
|
||||
} else {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
matchIndices := make([]greg.Match, 0)
|
||||
if matchNumFlagEnabled {
|
||||
tmp, err := greg.FindNthMatch(regComp, test_str, *matchNum)
|
||||
if err == nil {
|
||||
matchIndices = append(matchIndices, tmp)
|
||||
}
|
||||
} else {
|
||||
matchIndices = greg.FindAllMatches(regComp, test_str)
|
||||
}
|
||||
|
||||
if *printMatchesFlag {
|
||||
// if we are in single line mode, print the line on which
|
||||
// the matches occur
|
||||
if len(matchIndices) > 0 {
|
||||
if !(*multiLineFlag) {
|
||||
fmt.Fprintf(out, "Line %d:\n", lineNum)
|
||||
}
|
||||
for _, m := range matchIndices {
|
||||
fmt.Fprintf(out, "%s\n", m.ToString())
|
||||
}
|
||||
err := out.Flush()
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
continue
|
||||
}
|
||||
// Decompose the array of matchIndex structs into a flat unique array of ints - if matchIndex is {4,7}, flat array will contain 4,5,6
|
||||
// This should make checking O(1) instead of O(n)
|
||||
indicesToPrint := new_uniq_arr[int]()
|
||||
for _, idx := range matchIndices {
|
||||
indicesToPrint.add(genRange(idx[0].StartIdx, idx[0].EndIdx)...)
|
||||
}
|
||||
// If we are inverting, then we should print the indices which _didn't_ match
|
||||
// in color.
|
||||
if *invertFlag {
|
||||
oldIndices := indicesToPrint.values()
|
||||
indicesToPrint = new_uniq_arr[int]()
|
||||
// Explanation:
|
||||
// Find all numbers from 0 to len(test_str) that are NOT in oldIndices.
|
||||
// These are the values we want to print, now that we have inverted the match.
|
||||
// Re-initialize indicesToPrint and add all of these values to it.
|
||||
indicesToPrint.add(setDifference(genRange(0, len(test_str)), oldIndices)...)
|
||||
|
||||
}
|
||||
// If lineFlag is enabled, we should only print something if:
|
||||
// a. We are not inverting, and have at least one match on the current line
|
||||
// OR
|
||||
// b. We are inverting, and have no matches at all on the current line.
|
||||
// This checks for the inverse, and continues if it is true.
|
||||
if *lineFlag {
|
||||
if !(*invertFlag) && len(matchIndices) == 0 || *invertFlag && len(matchIndices) > 0 {
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
// If we are substituting, we need a different behavior, as follows:
|
||||
// For every character in the test string:
|
||||
// 1. Check if the index is the start of any matchIndex
|
||||
// 2. If so, print the substitute text, and set our index to
|
||||
// the corresponding end index.
|
||||
// 3. If not, just print the character.
|
||||
if substituteFlagEnabled {
|
||||
for i := range test_str {
|
||||
inMatchIndex := false
|
||||
for _, m := range matchIndices {
|
||||
if i == m[0].StartIdx {
|
||||
fmt.Fprintf(out, "%s", *substituteText)
|
||||
i = m[0].EndIdx
|
||||
inMatchIndex = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !inMatchIndex {
|
||||
fmt.Fprintf(out, "%c", test_str[i])
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for i, c := range test_str {
|
||||
if indicesToPrint.contains(i) {
|
||||
color.New(color.FgRed).Fprintf(out, "%c", c)
|
||||
// Newline after every match - only if -o is enabled and -v is disabled.
|
||||
if *onlyFlag && !(*invertFlag) {
|
||||
for _, idx := range matchIndices {
|
||||
if i+1 == idx[0].EndIdx { // End index is one more than last index of match
|
||||
fmt.Fprintf(out, "\n")
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if !(*onlyFlag) {
|
||||
fmt.Fprintf(out, "%c", c)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
err = out.Flush()
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
fmt.Println()
|
||||
}
|
||||
}
|
2
go.mod
2
go.mod
@@ -1,4 +1,4 @@
|
||||
module gitea.twomorecents.org/Rockingcool/kg
|
||||
module gitea.twomorecents.org/Rockingcool/kleingrep
|
||||
|
||||
go 1.23.1
|
||||
|
||||
|
415
greg/matching.go
415
greg/matching.go
@@ -1,415 +0,0 @@
|
||||
package greg
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sort"
|
||||
)
|
||||
|
||||
// a Match stores a slice of all the capturing groups in a match.
|
||||
type Match []Group
|
||||
|
||||
// a Group represents a group. It contains the start index and end index of the match
|
||||
type Group struct {
|
||||
StartIdx int
|
||||
EndIdx int
|
||||
}
|
||||
|
||||
func newMatch(size int) Match {
|
||||
toRet := make([]Group, size)
|
||||
for i := range toRet {
|
||||
toRet[i].StartIdx = -1
|
||||
toRet[i].EndIdx = -1
|
||||
}
|
||||
return toRet
|
||||
}
|
||||
|
||||
// Returns the number of valid groups in the match
|
||||
func (m Match) numValidGroups() int {
|
||||
numValid := 0
|
||||
for _, g := range m {
|
||||
if g.StartIdx >= 0 && g.EndIdx >= 0 {
|
||||
numValid++
|
||||
}
|
||||
}
|
||||
return numValid
|
||||
}
|
||||
|
||||
// Returns a string containing the indices of all (valid) groups in the match
|
||||
func (m Match) ToString() string {
|
||||
var toRet string
|
||||
for i, g := range m {
|
||||
if g.isValid() {
|
||||
toRet += fmt.Sprintf("Group %d\n", i)
|
||||
toRet += g.toString()
|
||||
toRet += "\n"
|
||||
}
|
||||
}
|
||||
return toRet
|
||||
}
|
||||
|
||||
// Converts the Group into a string representation:
|
||||
func (idx Group) toString() string {
|
||||
return fmt.Sprintf("%d\t%d", idx.StartIdx, idx.EndIdx)
|
||||
}
|
||||
|
||||
// Returns whether a group contains valid indices
|
||||
func (g Group) isValid() bool {
|
||||
return g.StartIdx >= 0 && g.EndIdx >= 0
|
||||
}
|
||||
|
||||
// takeZeroState takes the 0-state (if such a transition exists) for all states in the
|
||||
// given slice. It returns the resulting states. If any of the resulting states is a 0-state,
|
||||
// the second ret val is true.
|
||||
// If a state begins or ends a capturing group, its 'thread' is updated to contain the correct index.
|
||||
func takeZeroState(states []*State, numGroups int, idx int) (rtv []*State, isZero bool) {
|
||||
for _, state := range states {
|
||||
if len(state.transitions[EPSILON]) > 0 {
|
||||
for _, s := range state.transitions[EPSILON] {
|
||||
if s.threadGroups == nil {
|
||||
s.threadGroups = newMatch(numGroups + 1)
|
||||
}
|
||||
copy(s.threadGroups, state.threadGroups)
|
||||
if s.groupBegin {
|
||||
s.threadGroups[s.groupNum].StartIdx = idx
|
||||
// openParenGroups = append(openParenGroups, s.groupNum)
|
||||
}
|
||||
if s.groupEnd {
|
||||
s.threadGroups[s.groupNum].EndIdx = idx
|
||||
// closeParenGroups = append(closeParenGroups, s.groupNum)
|
||||
}
|
||||
}
|
||||
rtv = append(rtv, state.transitions[EPSILON]...)
|
||||
}
|
||||
}
|
||||
for _, state := range rtv {
|
||||
if len(state.transitions[EPSILON]) > 0 {
|
||||
return rtv, true
|
||||
}
|
||||
}
|
||||
return rtv, false
|
||||
}
|
||||
|
||||
// zeroMatchPossible returns true if a zero-length match is possible
|
||||
// from any of the given states, given the string and our position in it.
|
||||
// It uses the same algorithm to find zero-states as the one inside the loop,
|
||||
// so I should probably put it in a function.
|
||||
func zeroMatchPossible(str []rune, idx int, numGroups int, states ...*State) bool {
|
||||
zeroStates, isZero := takeZeroState(states, numGroups, idx)
|
||||
tempstates := make([]*State, 0, len(zeroStates)+len(states))
|
||||
tempstates = append(tempstates, states...)
|
||||
tempstates = append(tempstates, zeroStates...)
|
||||
num_appended := 0 // number of unique states addded to tempstates
|
||||
for isZero == true {
|
||||
zeroStates, isZero = takeZeroState(tempstates, numGroups, idx)
|
||||
tempstates, num_appended = unique_append(tempstates, zeroStates...)
|
||||
if num_appended == 0 { // break if we haven't appended any more unique values
|
||||
break
|
||||
}
|
||||
}
|
||||
for _, state := range tempstates {
|
||||
if state.isEmpty && (state.assert == NONE || state.checkAssertion(str, idx)) && state.isLast {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// Prunes the slice by removing overlapping indices.
|
||||
func pruneIndices(indices []Match) []Match {
|
||||
// First, sort the slice by the start indices
|
||||
sort.Slice(indices, func(i, j int) bool {
|
||||
return indices[i][0].StartIdx < indices[j][0].StartIdx
|
||||
})
|
||||
toRet := make([]Match, 0, len(indices))
|
||||
current := indices[0]
|
||||
for _, idx := range indices[1:] {
|
||||
// idx doesn't overlap with current (starts after current ends), so add current to result
|
||||
// and update the current.
|
||||
if idx[0].StartIdx >= current[0].EndIdx {
|
||||
toRet = append(toRet, current)
|
||||
current = idx
|
||||
} else if idx[0].EndIdx > current[0].EndIdx {
|
||||
// idx overlaps, but it is longer, so update current
|
||||
current = idx
|
||||
}
|
||||
}
|
||||
// Add last state
|
||||
toRet = append(toRet, current)
|
||||
return toRet
|
||||
}
|
||||
|
||||
// FindString returns a _string_ containing the _text_ of the _leftmost_ match of
|
||||
// the regex, in the given string. The return value will be an empty string in two situations:
|
||||
// 1. No match was found
|
||||
// 2. The match was an empty string
|
||||
func FindString(regex Reg, str string) string {
|
||||
match, err := FindNthMatch(regex, str, 1)
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
return str[match[0].StartIdx:match[0].EndIdx]
|
||||
}
|
||||
|
||||
// FindAllString is the 'all' version of FindString.
|
||||
// It returns a _slice of strings_ containing the _text_ of _all_ matches of
|
||||
// the regex, in the given string.
|
||||
//func FindAllString(regex Reg, str []string) []string {
|
||||
//
|
||||
//}
|
||||
|
||||
// FindNthMatch finds the 'n'th match of the regex represented by the given start-state, with
|
||||
// the given string.
|
||||
// It returns an error (!= nil) if there are fewer than 'n' matches in the string.
|
||||
func FindNthMatch(regex Reg, str string, n int) (Match, error) {
|
||||
idx := 0
|
||||
matchNum := 0
|
||||
str_runes := []rune(str)
|
||||
var matchFound bool
|
||||
var matchIdx Match
|
||||
for idx <= len(str_runes) {
|
||||
matchFound, matchIdx, idx = findAllMatchesHelper(regex.start, str_runes, idx, regex.numGroups)
|
||||
if matchFound {
|
||||
matchNum++
|
||||
}
|
||||
if matchNum == n {
|
||||
return matchIdx, nil
|
||||
}
|
||||
}
|
||||
// We haven't found the nth match after scanning the string - Return an error
|
||||
return nil, fmt.Errorf("invalid match index - too few matches found")
|
||||
}
|
||||
|
||||
// FindAllMatches tries to find all matches of the regex represented by given start-state, with
|
||||
// the given string
|
||||
func FindAllMatches(regex Reg, str string) []Match {
|
||||
idx := 0
|
||||
str_runes := []rune(str)
|
||||
var matchFound bool
|
||||
var matchIdx Match
|
||||
indices := make([]Match, 0)
|
||||
for idx <= len(str_runes) {
|
||||
matchFound, matchIdx, idx = findAllMatchesHelper(regex.start, str_runes, idx, regex.numGroups)
|
||||
if matchFound {
|
||||
indices = append(indices, matchIdx)
|
||||
}
|
||||
}
|
||||
if len(indices) > 0 {
|
||||
return pruneIndices(indices)
|
||||
}
|
||||
return indices
|
||||
}
|
||||
|
||||
// Helper for FindAllMatches. Returns whether it found a match, the
|
||||
// first Match it finds, and how far it got into the string ie. where
|
||||
// the next search should start from.
|
||||
//
|
||||
// Might return duplicates or overlapping indices, so care must be taken to prune the resulting array.
|
||||
func findAllMatchesHelper(start *State, str []rune, offset int, numGroups int) (bool, Match, int) {
|
||||
// Base case - exit if offset exceeds string's length
|
||||
if offset > len(str) {
|
||||
// The second value here shouldn't be used, because we should exit when the third return value is > than len(str)
|
||||
return false, []Group{}, offset
|
||||
}
|
||||
|
||||
// Hold a list of match indices for the current run. When we
|
||||
// can no longer find a match, the match with the largest range is
|
||||
// chosen as the match for the entire string.
|
||||
// This allows us to pick the longest possible match (which is how greedy matching works).
|
||||
// COMMENT ABOVE IS CURRENTLY NOT UP-TO-DATE
|
||||
tempIndices := newMatch(numGroups + 1)
|
||||
|
||||
foundPath := false
|
||||
startIdx := offset
|
||||
endIdx := offset
|
||||
currentStates := make([]*State, 0)
|
||||
tempStates := make([]*State, 0) // Used to store states that should be used in next loop iteration
|
||||
i := offset // Index in string
|
||||
startingFrom := i // Store starting index
|
||||
|
||||
// If the first state is an assertion, makes sure the assertion
|
||||
// is true before we do _anything_ else.
|
||||
if start.assert != NONE {
|
||||
if start.checkAssertion(str, offset) == false {
|
||||
i++
|
||||
return false, []Group{}, i
|
||||
}
|
||||
}
|
||||
// Increment until we hit a character matching the start state (assuming not 0-state)
|
||||
if start.isEmpty == false {
|
||||
for i < len(str) && !start.contentContains(str, i) {
|
||||
i++
|
||||
}
|
||||
startIdx = i
|
||||
startingFrom = i
|
||||
i++ // Advance to next character (if we aren't at a 0-state, which doesn't match anything), so that we can check for transitions. If we advance at a 0-state, we will never get a chance to match the first character
|
||||
}
|
||||
|
||||
start.threadGroups = newMatch(numGroups + 1)
|
||||
// Check if the start state begins a group - if so, add the start index to our list
|
||||
if start.groupBegin {
|
||||
start.threadGroups[start.groupNum].StartIdx = i
|
||||
// tempIndices[start.groupNum].startIdx = i
|
||||
}
|
||||
|
||||
currentStates = append(currentStates, start)
|
||||
|
||||
// Main loop
|
||||
for i < len(str) {
|
||||
foundPath = false
|
||||
|
||||
zeroStates := make([]*State, 0)
|
||||
// Keep taking zero-states, until there are no more left to take
|
||||
// Objective: If any of our current states have transitions to 0-states, replace them with the 0-state. Do this until there are no more transitions to 0-states, or there are no more unique 0-states to take.
|
||||
zeroStates, isZero := takeZeroState(currentStates, numGroups, i)
|
||||
tempStates = append(tempStates, zeroStates...)
|
||||
num_appended := 0
|
||||
for isZero == true {
|
||||
zeroStates, isZero = takeZeroState(tempStates, numGroups, i)
|
||||
tempStates, num_appended = unique_append(tempStates, zeroStates...)
|
||||
if num_appended == 0 { // Break if we haven't appended any more unique values
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
currentStates, _ = unique_append(currentStates, tempStates...)
|
||||
tempStates = nil
|
||||
|
||||
// Take any transitions corresponding to current character
|
||||
numStatesMatched := 0 // The number of states which had at least 1 match for this round
|
||||
assertionFailed := false // Whether or not an assertion failed for this round
|
||||
lastStateInList := false // Whether or not a last state was in our list of states
|
||||
var lastStatePtr *State = nil // Pointer to the last-state, if it was found
|
||||
lastLookaroundInList := false // Whether or not a last state (that is a lookaround) was in our list of states
|
||||
for _, state := range currentStates {
|
||||
matches, numMatches := state.matchesFor(str, i)
|
||||
if numMatches > 0 {
|
||||
numStatesMatched++
|
||||
tempStates = append(tempStates, matches...)
|
||||
foundPath = true
|
||||
for _, m := range matches {
|
||||
if m.threadGroups == nil {
|
||||
m.threadGroups = newMatch(numGroups + 1)
|
||||
}
|
||||
copy(m.threadGroups, state.threadGroups)
|
||||
}
|
||||
}
|
||||
if numMatches < 0 {
|
||||
assertionFailed = true
|
||||
}
|
||||
if state.isLast {
|
||||
if state.isLookaround() {
|
||||
lastLookaroundInList = true
|
||||
}
|
||||
lastStateInList = true
|
||||
lastStatePtr = state
|
||||
}
|
||||
}
|
||||
|
||||
if assertionFailed && numStatesMatched == 0 { // Nothing has matched and an assertion has failed
|
||||
// If I'm being completely honest, I'm not sure why I have to check specifically for a _lookaround_
|
||||
// state. The explanation below is my attempt to explain this behavior.
|
||||
// If you replace 'lastLookaroundInList' with 'lastStateInList', one of the test cases fails.
|
||||
//
|
||||
// One of the states in our list was a last state and a lookaround. In this case, we
|
||||
// don't abort upon failure of the assertion, because we have found
|
||||
// another path to a final state.
|
||||
// Even if the last state _was_ an assertion, we can use the previously
|
||||
// saved indices to find a match.
|
||||
if lastLookaroundInList {
|
||||
break
|
||||
} else {
|
||||
if i == startingFrom {
|
||||
i++
|
||||
}
|
||||
return false, []Group{}, i
|
||||
}
|
||||
}
|
||||
// Check if we can find a state in our list that is:
|
||||
// a. A last-state
|
||||
// b. Empty
|
||||
// c. Doesn't assert anything
|
||||
for _, s := range currentStates {
|
||||
if s.isLast && s.isEmpty && s.assert == NONE {
|
||||
lastStatePtr = s
|
||||
lastStateInList = true
|
||||
}
|
||||
}
|
||||
if lastStateInList { // A last-state was in the list of states. add the matchIndex to our MatchIndex list
|
||||
for j := 1; j < numGroups+1; j++ {
|
||||
tempIndices[j] = lastStatePtr.threadGroups[j]
|
||||
}
|
||||
endIdx = i
|
||||
tempIndices[0] = Group{startIdx, endIdx}
|
||||
}
|
||||
|
||||
// Check if we can find a zero-length match
|
||||
if foundPath == false {
|
||||
if ok := zeroMatchPossible(str, i, numGroups, currentStates...); ok {
|
||||
if tempIndices[0].isValid() == false {
|
||||
tempIndices[0] = Group{startIdx, startIdx}
|
||||
}
|
||||
}
|
||||
// If we haven't moved in the string, increment the counter by 1
|
||||
// to ensure we don't keep trying the same string over and over.
|
||||
// if i == startingFrom {
|
||||
startIdx++
|
||||
// i++
|
||||
// }
|
||||
if tempIndices.numValidGroups() > 0 && tempIndices[0].isValid() {
|
||||
if tempIndices[0].StartIdx == tempIndices[0].EndIdx { // If we have a zero-length match, we have to shift the index at which we start. Otherwise we keep looking at the same paert of the string over and over.
|
||||
return true, tempIndices, tempIndices[0].EndIdx + 1
|
||||
} else {
|
||||
return true, tempIndices, tempIndices[0].EndIdx
|
||||
}
|
||||
}
|
||||
return false, []Group{}, startIdx
|
||||
}
|
||||
currentStates = make([]*State, len(tempStates))
|
||||
copy(currentStates, tempStates)
|
||||
tempStates = nil
|
||||
|
||||
i++
|
||||
}
|
||||
|
||||
// End-of-string reached. Go to any 0-states, until there are no more 0-states to go to. Then check if any of our states are in the end position.
|
||||
// This is the exact same algorithm used inside the loop, so I should probably put it in a function.
|
||||
zeroStates, isZero := takeZeroState(currentStates, numGroups, i)
|
||||
tempStates = append(tempStates, zeroStates...)
|
||||
num_appended := 0 // Number of unique states addded to tempStates
|
||||
for isZero == true {
|
||||
zeroStates, isZero = takeZeroState(tempStates, numGroups, i)
|
||||
tempStates, num_appended = unique_append(tempStates, zeroStates...)
|
||||
if num_appended == 0 { // Break if we haven't appended any more unique values
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
currentStates = append(currentStates, tempStates...)
|
||||
tempStates = nil
|
||||
|
||||
for _, state := range currentStates {
|
||||
// Only add the match if the start index is in bounds. If the state has an assertion,
|
||||
// make sure the assertion checks out.
|
||||
if state.isLast && i <= len(str) {
|
||||
if state.assert == NONE || state.checkAssertion(str, i) {
|
||||
for j := 1; j < numGroups+1; j++ {
|
||||
tempIndices[j] = state.threadGroups[j]
|
||||
}
|
||||
endIdx = i
|
||||
tempIndices[0] = Group{startIdx, endIdx}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if tempIndices.numValidGroups() > 0 {
|
||||
if tempIndices[0].StartIdx == tempIndices[0].EndIdx { // If we have a zero-length match, we have to shift the index at which we start. Otherwise we keep looking at the same paert of the string over and over.
|
||||
return true, tempIndices, tempIndices[0].EndIdx + 1
|
||||
} else {
|
||||
return true, tempIndices, tempIndices[0].EndIdx
|
||||
}
|
||||
}
|
||||
if startIdx == startingFrom { // Increment starting index if we haven't moved in the string. Prevents us from matching the same part of the string over and over.
|
||||
startIdx++
|
||||
}
|
||||
return false, []Group{}, startIdx
|
||||
}
|
348
greg/nfa.go
348
greg/nfa.go
@@ -1,348 +0,0 @@
|
||||
package greg
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"slices"
|
||||
)
|
||||
|
||||
const EPSILON int = 0xF0000
|
||||
|
||||
type assertType int
|
||||
|
||||
const (
|
||||
NONE assertType = iota
|
||||
SOS
|
||||
EOS
|
||||
WBOUND
|
||||
NONWBOUND
|
||||
PLA // Positive lookahead
|
||||
NLA // Negative lookahead
|
||||
PLB // Positive lookbehind
|
||||
NLB // Negative lookbehind
|
||||
ALWAYS_TRUE // An assertion that is always true
|
||||
)
|
||||
|
||||
type State struct {
|
||||
content stateContents // Contents of current state
|
||||
isEmpty bool // If it is empty - Union operator and Kleene star states will be empty
|
||||
isLast bool // If it is the last state (acept state)
|
||||
output []*State // The outputs of the current state ie. the 'outward arrows'. A union operator state will have more than one of these.
|
||||
transitions map[int][]*State // Transitions to different states (maps a character (int representation) to a _list of states. This is useful if one character can lead multiple states eg. ab|aa)
|
||||
isKleene bool // Identifies whether current node is a 0-state representing Kleene star
|
||||
assert assertType // Type of assertion of current node - NONE means that the node doesn't assert anything
|
||||
allChars bool // Whether or not the state represents all characters (eg. a 'dot' metacharacter). A 'dot' node doesn't store any contents directly, as it would take up too much space
|
||||
except []rune // Only valid if allChars is true - match all characters _except_ the ones in this block. Useful for inverting character classes.
|
||||
lookaroundRegex string // Only for lookaround states - Contents of the regex that the lookaround state holds
|
||||
lookaroundNFA *State // Holds the NFA of the lookaroundRegex - if it exists
|
||||
lookaroundNumCaptureGroups int // Number of capturing groups in lookaround regex if current node is a lookaround
|
||||
groupBegin bool // Whether or not the node starts a capturing group
|
||||
groupEnd bool // Whether or not the node ends a capturing group
|
||||
groupNum int // Which capturing group the node starts / ends
|
||||
// The following properties depend on the current match - I should think about resetting them for every match.
|
||||
zeroMatchFound bool // Whether or not the state has been used for a zero-length match - only relevant for zero states
|
||||
threadGroups []Group // Assuming that a state is part of a 'thread' in the matching process, this array stores the indices of capturing groups in the current thread. As matches are found for this state, its groups will be copied over.
|
||||
}
|
||||
|
||||
// Clones the NFA starting from the given state.
|
||||
func cloneState(start *State) *State {
|
||||
return cloneStateHelper(start, make(map[*State]*State))
|
||||
}
|
||||
|
||||
// Helper function for clone. The map is used to keep track of which states have
|
||||
// already been copied, and which ones haven't.
|
||||
// This function was created using output from Llama3.1:405B.
|
||||
func cloneStateHelper(state *State, cloneMap map[*State]*State) *State {
|
||||
// Base case - if the clone exists in our map, return it.
|
||||
if clone, exists := cloneMap[state]; exists {
|
||||
return clone
|
||||
}
|
||||
if state == nil {
|
||||
return nil
|
||||
}
|
||||
// Recursive case - if the clone doesn't exist, create it, add it to the map,
|
||||
// and recursively call for each of the transition states.
|
||||
clone := &State{
|
||||
content: append([]int{}, state.content...),
|
||||
isEmpty: state.isEmpty,
|
||||
isLast: state.isLast,
|
||||
output: make([]*State, len(state.output)),
|
||||
transitions: make(map[int][]*State),
|
||||
isKleene: state.isKleene,
|
||||
assert: state.assert,
|
||||
zeroMatchFound: state.zeroMatchFound,
|
||||
allChars: state.allChars,
|
||||
except: append([]rune{}, state.except...),
|
||||
lookaroundRegex: state.lookaroundRegex,
|
||||
groupEnd: state.groupEnd,
|
||||
groupBegin: state.groupBegin,
|
||||
groupNum: state.groupNum,
|
||||
}
|
||||
cloneMap[state] = clone
|
||||
for i, s := range state.output {
|
||||
if s == state {
|
||||
clone.output[i] = clone
|
||||
} else {
|
||||
clone.output[i] = cloneStateHelper(s, cloneMap)
|
||||
}
|
||||
}
|
||||
for k, v := range state.transitions {
|
||||
clone.transitions[k] = make([]*State, len(v))
|
||||
for i, s := range v {
|
||||
if s == state {
|
||||
clone.transitions[k][i] = clone
|
||||
} else {
|
||||
clone.transitions[k][i] = cloneStateHelper(s, cloneMap)
|
||||
}
|
||||
}
|
||||
}
|
||||
if state.lookaroundNFA == state {
|
||||
clone.lookaroundNFA = clone
|
||||
}
|
||||
clone.lookaroundNFA = cloneStateHelper(state.lookaroundNFA, cloneMap)
|
||||
return clone
|
||||
}
|
||||
|
||||
// Checks if the given state's assertion is true. Returns true if the given
|
||||
// state doesn't have an assertion.
|
||||
func (s State) checkAssertion(str []rune, idx int) bool {
|
||||
if s.assert == ALWAYS_TRUE {
|
||||
return true
|
||||
}
|
||||
if s.assert == SOS {
|
||||
// Single-line mode: Beginning of string
|
||||
// Multi-line mode: Previous character was newline
|
||||
return idx == 0 || (multilineMode && (idx > 0 && str[idx-1] == '\n'))
|
||||
}
|
||||
if s.assert == EOS {
|
||||
// Single-line mode: End of string
|
||||
// Multi-line mode: current character is newline
|
||||
// Index is at the end of the string, or it points to the last character which is a newline
|
||||
return idx == len(str) || (multilineMode && str[idx] == '\n')
|
||||
}
|
||||
if s.assert == WBOUND {
|
||||
return isWordBoundary(str, idx)
|
||||
}
|
||||
if s.assert == NONWBOUND {
|
||||
return !isWordBoundary(str, idx)
|
||||
}
|
||||
if s.isLookaround() {
|
||||
// The process here is simple:
|
||||
// 1. Compile the regex stored in the state's contents.
|
||||
// 2. Run it on a subset of the test string, that ends after the current index in the string
|
||||
// 3. Based on the kind of lookaround (and the indices we get), determine what action to take.
|
||||
startState := s.lookaroundNFA
|
||||
var runesToMatch []rune
|
||||
var strToMatch string
|
||||
if s.assert == PLA || s.assert == NLA {
|
||||
runesToMatch = str[idx:]
|
||||
} else {
|
||||
runesToMatch = str[:idx]
|
||||
}
|
||||
|
||||
if len(runesToMatch) == 0 {
|
||||
strToMatch = ""
|
||||
} else {
|
||||
strToMatch = string(runesToMatch)
|
||||
}
|
||||
|
||||
matchIndices := FindAllMatches(Reg{startState, s.lookaroundNumCaptureGroups}, strToMatch)
|
||||
|
||||
numMatchesFound := 0
|
||||
for _, matchIdx := range matchIndices {
|
||||
if s.assert == PLA || s.assert == NLA { // Lookahead - return true (or false) if at least one match starts at 0. Zero is used because the test-string _starts_ from idx.
|
||||
if matchIdx[0].StartIdx == 0 {
|
||||
numMatchesFound++
|
||||
}
|
||||
}
|
||||
if s.assert == PLB || s.assert == NLB { // Lookbehind - return true (or false) if at least one match _ends_ at the current index.
|
||||
if matchIdx[0].EndIdx == idx {
|
||||
numMatchesFound++
|
||||
}
|
||||
}
|
||||
}
|
||||
if s.assert == PLA || s.assert == PLB { // Positive assertions want at least one match
|
||||
return numMatchesFound > 0
|
||||
}
|
||||
if s.assert == NLA || s.assert == NLB { // Negative assertions only want zero matches
|
||||
return numMatchesFound == 0
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// Returns true if the contents of 's' contain the value at the given index of the given string
|
||||
func (s State) contentContains(str []rune, idx int) bool {
|
||||
if s.assert != NONE {
|
||||
return s.checkAssertion(str, idx)
|
||||
}
|
||||
if s.allChars {
|
||||
return !slices.Contains(slices.Concat(notDotChars, s.except), str[idx]) // Return true only if the index isn't a 'notDotChar', or isn't one of the exception characters for the current node.
|
||||
}
|
||||
// Default - s.assert must be NONE
|
||||
return slices.Contains(s.content, int(str[idx]))
|
||||
}
|
||||
|
||||
func (s State) isLookaround() bool {
|
||||
return s.assert == PLA || s.assert == PLB || s.assert == NLA || s.assert == NLB
|
||||
}
|
||||
|
||||
// Returns the matches for the character at the given index of the given string.
|
||||
// Also returns the number of matches. Returns -1 if an assertion failed.
|
||||
func (s State) matchesFor(str []rune, idx int) ([]*State, int) {
|
||||
// Assertions can be viewed as 'checks'. If the check fails, we return
|
||||
// an empty array and 0.
|
||||
// If it passes, we treat it like any other state, and return all the transitions.
|
||||
if s.assert != NONE {
|
||||
if s.checkAssertion(str, idx) == false {
|
||||
return make([]*State, 0), -1
|
||||
}
|
||||
}
|
||||
listTransitions := s.transitions[int(str[idx])]
|
||||
for _, dest := range s.transitions[int(ANY_CHAR)] {
|
||||
if !slices.Contains(slices.Concat(notDotChars, dest.except), str[idx]) {
|
||||
// Add an allChar state to the list of matches if:
|
||||
// a. The current character isn't a 'notDotChars' character. In single line mode, this includes newline. In multiline mode, it doesn't.
|
||||
// b. The current character isn't the state's exception list.
|
||||
listTransitions = append(listTransitions, dest)
|
||||
}
|
||||
}
|
||||
numTransitions := len(listTransitions)
|
||||
return listTransitions, numTransitions
|
||||
}
|
||||
|
||||
// verifyLastStatesHelper performs the depth-first recursion needed for verifyLastStates
|
||||
func verifyLastStatesHelper(state *State, visited map[*State]bool) {
|
||||
if len(state.transitions) == 0 {
|
||||
state.isLast = true
|
||||
return
|
||||
}
|
||||
// if len(state.transitions) == 1 && len(state.transitions[state.content]) == 1 && state.transitions[state.content][0] == state { // Eg. a*
|
||||
if len(state.transitions) == 1 { // Eg. a*
|
||||
var moreThanOneTrans bool // Dummy variable, check if all the transitions for the current's state's contents have a length of one
|
||||
for _, c := range state.content {
|
||||
if len(state.transitions[c]) != 1 || state.transitions[c][0] != state {
|
||||
moreThanOneTrans = true
|
||||
}
|
||||
}
|
||||
state.isLast = !moreThanOneTrans
|
||||
}
|
||||
|
||||
if state.isKleene { // A State representing a Kleene Star has transitions going out, which loop back to it. If all those transitions point to the same (single) state, then it must be a last state
|
||||
transitionDests := make([]*State, 0)
|
||||
for _, v := range state.transitions {
|
||||
transitionDests = append(transitionDests, v...)
|
||||
}
|
||||
if allEqual(transitionDests...) {
|
||||
state.isLast = true
|
||||
return
|
||||
}
|
||||
}
|
||||
if visited[state] == true {
|
||||
return
|
||||
}
|
||||
visited[state] = true
|
||||
for _, states := range state.transitions {
|
||||
for i := range states {
|
||||
if states[i] != state {
|
||||
verifyLastStatesHelper(states[i], visited)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// verifyLastStates enables the 'isLast' flag for the leaf nodes (last states)
|
||||
func verifyLastStates(start []*State) {
|
||||
verifyLastStatesHelper(start[0], make(map[*State]bool))
|
||||
}
|
||||
|
||||
// Concatenates s1 and s2, returns the start of the concatenation.
|
||||
func concatenate(s1 *State, s2 *State) *State {
|
||||
if s1 == nil {
|
||||
return s2
|
||||
}
|
||||
for i := range s1.output {
|
||||
for _, c := range s2.content { // Create transitions for every element in s1's content to s2'
|
||||
s1.output[i].transitions[c], _ = unique_append(s1.output[i].transitions[c], s2)
|
||||
}
|
||||
}
|
||||
s1.output = s2.output
|
||||
return s1
|
||||
}
|
||||
|
||||
func kleene(s1 State) (*State, error) {
|
||||
if s1.isEmpty && s1.assert != NONE {
|
||||
return nil, fmt.Errorf("previous token is not quantifiable")
|
||||
}
|
||||
|
||||
toReturn := &State{}
|
||||
toReturn.transitions = make(map[int][]*State)
|
||||
toReturn.content = newContents(EPSILON)
|
||||
toReturn.isEmpty = true
|
||||
toReturn.isKleene = true
|
||||
toReturn.output = append(toReturn.output, toReturn)
|
||||
for i := range s1.output {
|
||||
for _, c := range toReturn.content {
|
||||
s1.output[i].transitions[c], _ = unique_append(s1.output[i].transitions[c], toReturn)
|
||||
}
|
||||
}
|
||||
for _, c := range s1.content {
|
||||
toReturn.transitions[c], _ = unique_append(toReturn.transitions[c], &s1)
|
||||
}
|
||||
return toReturn, nil
|
||||
}
|
||||
|
||||
func alternate(s1 *State, s2 *State) *State {
|
||||
toReturn := &State{}
|
||||
toReturn.transitions = make(map[int][]*State)
|
||||
toReturn.output = append(toReturn.output, s1.output...)
|
||||
toReturn.output = append(toReturn.output, s2.output...)
|
||||
// Unique append is used here (and elsewhere) to ensure that,
|
||||
// for any given transition, a state can only be mentioned once.
|
||||
// For example, given the transition 'a', the state 's1' can only be mentioned once.
|
||||
// This would lead to multiple instances of the same set of match indices, since both
|
||||
// 's1' states would be considered to match.
|
||||
for _, c := range s1.content {
|
||||
toReturn.transitions[c], _ = unique_append(toReturn.transitions[c], s1)
|
||||
}
|
||||
for _, c := range s2.content {
|
||||
toReturn.transitions[c], _ = unique_append(toReturn.transitions[c], s2)
|
||||
}
|
||||
toReturn.content = newContents(EPSILON)
|
||||
toReturn.isEmpty = true
|
||||
|
||||
return toReturn
|
||||
}
|
||||
|
||||
func question(s1 *State) *State { // Use the fact that ab? == a(b|)
|
||||
s2 := &State{}
|
||||
s2.transitions = make(map[int][]*State)
|
||||
s2.content = newContents(EPSILON)
|
||||
s2.output = append(s2.output, s2)
|
||||
s2.isEmpty = true
|
||||
s3 := alternate(s1, s2)
|
||||
return s3
|
||||
}
|
||||
|
||||
// Creates and returns a new state with the 'default' values.
|
||||
func newState() State {
|
||||
ret := State{
|
||||
output: make([]*State, 0),
|
||||
transitions: make(map[int][]*State),
|
||||
assert: NONE,
|
||||
except: append([]rune{}, 0),
|
||||
lookaroundRegex: "",
|
||||
groupEnd: false,
|
||||
groupBegin: false,
|
||||
}
|
||||
ret.output = append(ret.output, &ret)
|
||||
return ret
|
||||
}
|
||||
|
||||
// Creates and returns a state that _always_ has a zero-length match.
|
||||
func zeroLengthMatchState() State {
|
||||
start := newState()
|
||||
start.content = newContents(EPSILON)
|
||||
start.isEmpty = true
|
||||
start.assert = ALWAYS_TRUE
|
||||
return start
|
||||
}
|
@@ -1,200 +0,0 @@
|
||||
package greg
|
||||
|
||||
import "fmt"
|
||||
|
||||
type NodeType int
|
||||
|
||||
// This is a slice containing all escapable characters that have special meaning.
|
||||
// Eg. \b is word boundary, \w is word character etc.
|
||||
var escapedChars []rune = []rune("wWdDbBnaftrvsS0")
|
||||
|
||||
// This is a list of the possible node types
|
||||
const (
|
||||
CHARACTER NodeType = iota
|
||||
CHARCLASS
|
||||
PIPE
|
||||
CONCATENATE
|
||||
KLEENE
|
||||
QUESTION
|
||||
PLUS
|
||||
ASSERTION
|
||||
LPAREN
|
||||
RPAREN
|
||||
)
|
||||
|
||||
// Helper constants for lookarounds
|
||||
const POSITIVE = 1
|
||||
const NEGATIVE = -1
|
||||
const LOOKAHEAD = 1
|
||||
const LOOKBEHIND = -1
|
||||
|
||||
var INFINITE_REPS int = -1 // Represents infinite reps eg. the end range in {5,}
|
||||
// This represents a node in the postfix representation of the expression
|
||||
type postfixNode struct {
|
||||
nodetype NodeType
|
||||
contents []rune // Contents of the node
|
||||
startReps int // Minimum number of times the node should be repeated - used with numeric specifiers
|
||||
endReps int // Maximum number of times the node should be repeated - used with numeric specifiers
|
||||
allChars bool // Whether or not the current node represents all characters (eg. dot metacharacter)
|
||||
except []postfixNode // For inverted character classes, we match every unicode character _except_ a few. In this case, allChars is true and the exceptions are placed here.
|
||||
lookaroundSign int // ONLY USED WHEN nodetype == ASSERTION. Whether we have a positive or negative lookaround.
|
||||
lookaroundDir int // Lookbehind or lookahead
|
||||
nodeContents []postfixNode // ONLY USED WHEN nodetype == CHARCLASS. Holds all the nodes inside the given CHARCLASS node.
|
||||
}
|
||||
|
||||
// Converts the given list of postfixNodes to one node of type CHARCLASS.
|
||||
// Used to convert eg. 'a', 'b' and 'c' to '[abc]'.
|
||||
// If the character class is negated, it returns a postfixNode of type CHARACTER.
|
||||
// This node will behave like the dot metacharacter, but it has a longer list of runes that
|
||||
// it will not match.
|
||||
func newCharClassNode(nodes []postfixNode, negated bool) postfixNode {
|
||||
rtv := postfixNode{}
|
||||
rtv.nodetype = CHARCLASS
|
||||
rtv.startReps = 1
|
||||
rtv.endReps = 1
|
||||
if negated {
|
||||
rtv.nodetype = CHARACTER
|
||||
rtv.contents = []rune{ANY_CHAR}
|
||||
rtv.allChars = true
|
||||
rtv.except = nodes
|
||||
} else {
|
||||
rtv.nodeContents = nodes
|
||||
}
|
||||
return rtv
|
||||
}
|
||||
|
||||
// Creates a new escaped node - the given character is assumed to have been preceded by a backslash
|
||||
func newEscapedNode(c rune, inCharClass bool) (postfixNode, error) {
|
||||
toReturn := postfixNode{}
|
||||
toReturn.startReps = 1
|
||||
toReturn.endReps = 1
|
||||
switch c {
|
||||
case 's': // Whitespace
|
||||
toReturn.nodetype = CHARACTER
|
||||
toReturn.contents = append(toReturn.contents, whitespaceChars...)
|
||||
case 'S': // Non-whitespace
|
||||
toReturn = newPostfixDotNode()
|
||||
toReturn.except = append([]postfixNode{}, newPostfixNode(whitespaceChars...))
|
||||
case 'd': // Digits
|
||||
toReturn.nodetype = CHARACTER
|
||||
toReturn.contents = append(toReturn.contents, digitChars...)
|
||||
case 'D': // Non-digits
|
||||
toReturn = newPostfixDotNode()
|
||||
toReturn.except = append([]postfixNode{}, newPostfixNode(digitChars...))
|
||||
case 'w': // word character
|
||||
toReturn.nodetype = CHARACTER
|
||||
toReturn.contents = append(toReturn.contents, wordChars...)
|
||||
case 'W': // Non-word character
|
||||
toReturn = newPostfixDotNode()
|
||||
toReturn.except = append([]postfixNode{}, newPostfixNode(wordChars...))
|
||||
case 'b', 'B':
|
||||
if c == 'b' && inCharClass {
|
||||
toReturn.nodetype = CHARACTER
|
||||
toReturn.contents = append(toReturn.contents, rune(8))
|
||||
} else {
|
||||
toReturn.nodetype = ASSERTION
|
||||
toReturn.contents = append(toReturn.contents, c)
|
||||
}
|
||||
case 'n': // Newline character
|
||||
toReturn.nodetype = CHARACTER
|
||||
toReturn.contents = append(toReturn.contents, '\n')
|
||||
case '0': // NULL character
|
||||
toReturn.nodetype = CHARACTER
|
||||
toReturn.contents = append(toReturn.contents, rune(0))
|
||||
case 'a': // Bell character
|
||||
toReturn.nodetype = CHARACTER
|
||||
toReturn.contents = append(toReturn.contents, rune(7))
|
||||
case 'f': // Form feed character
|
||||
toReturn.nodetype = CHARACTER
|
||||
toReturn.contents = append(toReturn.contents, rune(12))
|
||||
case 't': // Horizontal tab character
|
||||
toReturn.nodetype = CHARACTER
|
||||
toReturn.contents = append(toReturn.contents, rune(9))
|
||||
case 'r': // Carriage return
|
||||
toReturn.nodetype = CHARACTER
|
||||
toReturn.contents = append(toReturn.contents, rune(13))
|
||||
case 'v': // Vertical tab
|
||||
toReturn.nodetype = CHARACTER
|
||||
toReturn.contents = append(toReturn.contents, rune(11))
|
||||
case '-': // Literal hyphen - only in character class
|
||||
if inCharClass {
|
||||
toReturn.nodetype = CHARACTER
|
||||
toReturn.contents = append(toReturn.contents, '-')
|
||||
} else {
|
||||
return postfixNode{}, fmt.Errorf("invalid escape character")
|
||||
}
|
||||
default: // None of the above - append it as a regular character
|
||||
if isNormalChar(c) { // Normal characters cannot be escaped
|
||||
return postfixNode{}, fmt.Errorf("invalid escape character")
|
||||
}
|
||||
toReturn.nodetype = CHARACTER
|
||||
toReturn.contents = append(toReturn.contents, c)
|
||||
}
|
||||
return toReturn, nil
|
||||
}
|
||||
|
||||
// Creates and returns a postfixNode based on the given contents
|
||||
func newPostfixNode(contents ...rune) postfixNode {
|
||||
if len(contents) < 1 {
|
||||
panic("Empty node.")
|
||||
}
|
||||
to_return := postfixNode{}
|
||||
to_return.startReps = 1
|
||||
to_return.endReps = 1
|
||||
if len(contents) > 1 { // If the node has more than element, it must be a character class - the type must be CHARACTER
|
||||
to_return.nodetype = CHARACTER
|
||||
to_return.contents = contents
|
||||
} else { // Node has one element, could be anything
|
||||
switch contents[0] {
|
||||
case '+':
|
||||
to_return.nodetype = PLUS
|
||||
case '?':
|
||||
to_return.nodetype = QUESTION
|
||||
case '*':
|
||||
to_return.nodetype = KLEENE
|
||||
case '|':
|
||||
to_return.nodetype = PIPE
|
||||
case CONCAT:
|
||||
to_return.nodetype = CONCATENATE
|
||||
case '^', '$':
|
||||
to_return.nodetype = ASSERTION
|
||||
case '(':
|
||||
to_return.nodetype = LPAREN
|
||||
case ')':
|
||||
to_return.nodetype = RPAREN
|
||||
default:
|
||||
to_return.nodetype = CHARACTER
|
||||
}
|
||||
to_return.contents = append(to_return.contents, contents...)
|
||||
|
||||
// Special cases for LPAREN and RPAREN - they have special characters defined for them
|
||||
if to_return.nodetype == LPAREN {
|
||||
to_return.contents = []rune{LPAREN_CHAR}
|
||||
}
|
||||
if to_return.nodetype == RPAREN {
|
||||
to_return.contents = []rune{RPAREN_CHAR}
|
||||
}
|
||||
}
|
||||
return to_return
|
||||
}
|
||||
|
||||
// Creates and returns a postfixNode representing the 'dot' metacharacter.
|
||||
func newPostfixDotNode() postfixNode {
|
||||
toReturn := postfixNode{}
|
||||
toReturn.startReps = 1
|
||||
toReturn.endReps = 1
|
||||
toReturn.nodetype = CHARACTER
|
||||
toReturn.allChars = true
|
||||
toReturn.contents = []rune{ANY_CHAR}
|
||||
return toReturn
|
||||
}
|
||||
|
||||
// Creates a character node, regardless of the contents
|
||||
func newPostfixCharNode(contents ...rune) postfixNode {
|
||||
toReturn := postfixNode{}
|
||||
toReturn.startReps = 1
|
||||
toReturn.endReps = 1
|
||||
toReturn.nodetype = CHARACTER
|
||||
toReturn.contents = append(toReturn.contents, contents...)
|
||||
return toReturn
|
||||
}
|
@@ -1,6 +1,11 @@
|
||||
package main
|
||||
|
||||
import "slices"
|
||||
import (
|
||||
"bufio"
|
||||
"os"
|
||||
"slices"
|
||||
"strings"
|
||||
)
|
||||
|
||||
type character interface {
|
||||
int | rune
|
||||
@@ -25,3 +30,23 @@ func genRange[T character](start, end T) []T {
|
||||
}
|
||||
return toRet
|
||||
}
|
||||
|
||||
// Returns whether or not the given file contains a NULL character
|
||||
func fileContainsNullChar(filename string) (bool, error) {
|
||||
file, err := os.Open(filename)
|
||||
if err != nil {
|
||||
return true, err
|
||||
}
|
||||
defer file.Close()
|
||||
scanner := bufio.NewScanner(file)
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
if strings.Contains(line, "\000") {
|
||||
return true, nil
|
||||
}
|
||||
}
|
||||
if err := scanner.Err(); err != nil {
|
||||
return true, err
|
||||
}
|
||||
return false, nil
|
||||
}
|
317
kg/main.go
Normal file
317
kg/main.go
Normal file
@@ -0,0 +1,317 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"flag"
|
||||
"fmt"
|
||||
"io"
|
||||
"io/fs"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"slices"
|
||||
|
||||
"github.com/fatih/color"
|
||||
|
||||
reg "gitea.twomorecents.org/Rockingcool/kleingrep/regex"
|
||||
)
|
||||
|
||||
func main() {
|
||||
// Flags for the regex Compile function
|
||||
flagsToCompile := make([]reg.ReFlag, 0)
|
||||
|
||||
invertFlag := flag.Bool("v", false, "Invert match.")
|
||||
// This flag has two 'modes':
|
||||
// 1. Without '-v': Prints only matches. Prints a newline after every match.
|
||||
// 2. With '-v': Substitutes all matches with empty string.
|
||||
onlyFlag := flag.Bool("o", false, "Print only colored content. Overrides -l.")
|
||||
lineFlag := flag.Bool("l", false, "Only print lines with a match (or with no matches, if -v is enabled). Similar to grep's default.")
|
||||
multiLineFlag := flag.Bool("t", false, "Multi-line mode. Treats newline just like any character.")
|
||||
printMatchesFlag := flag.Bool("p", false, "Prints start and end index of each match. Can only be used with '-t' for multi-line mode.")
|
||||
caseInsensitiveFlag := flag.Bool("i", false, "Case-insensitive. Disregard the case of all characters.")
|
||||
recursiveFlag := flag.Bool("r", false, "Recursively search all files in the given directory.")
|
||||
lineNumFlag := flag.Bool("n", false, "For each line with a match, print the line number. Implies -l.")
|
||||
matchNum := flag.Int("m", 0, "Print the match with the given index. Eg. -m 3 prints the third match.")
|
||||
substituteText := flag.String("s", "", "Substitute the contents of each match with the given string. Overrides -o and -v")
|
||||
flag.Parse()
|
||||
|
||||
// These flags have to be passed to the Compile function
|
||||
if *multiLineFlag {
|
||||
flagsToCompile = append(flagsToCompile, reg.RE_MULTILINE, reg.RE_SINGLE_LINE)
|
||||
}
|
||||
if *caseInsensitiveFlag {
|
||||
flagsToCompile = append(flagsToCompile, reg.RE_CASE_INSENSITIVE)
|
||||
}
|
||||
|
||||
// -l and -o are mutually exclusive: -o overrides -l
|
||||
if *onlyFlag {
|
||||
*lineFlag = false
|
||||
}
|
||||
// Check if substitute and matchNum flags have been enabled
|
||||
substituteFlagEnabled := false
|
||||
matchNumFlagEnabled := false
|
||||
flag.Visit(func(f *flag.Flag) {
|
||||
if f.Name == "s" {
|
||||
substituteFlagEnabled = true
|
||||
}
|
||||
if f.Name == "m" {
|
||||
matchNumFlagEnabled = true
|
||||
}
|
||||
})
|
||||
|
||||
// Validate matchNumFlag - must be positive integer
|
||||
if matchNumFlagEnabled && *matchNum < 1 {
|
||||
panic("Invalid match number to print.")
|
||||
}
|
||||
|
||||
// Enable lineFlag if lineNumFlag is enabled
|
||||
if *lineNumFlag {
|
||||
*lineFlag = true
|
||||
}
|
||||
|
||||
// Process:
|
||||
// 1. Convert regex into postfix notation (Shunting-Yard algorithm)
|
||||
// a. Add explicit concatenation operators to facilitate this
|
||||
// 2. Build NFA from postfix representation (Thompson's algorithm)
|
||||
// 3. Run the string against the NFA
|
||||
|
||||
if len(flag.Args()) < 1 { // flag.Args() also strips out program name
|
||||
fmt.Printf("%s: ERROR: Missing cmdline args\n", os.Args[0])
|
||||
os.Exit(22)
|
||||
}
|
||||
if *recursiveFlag && len(flag.Args()) < 2 { // File/Directory must be provided with '-r'
|
||||
fmt.Printf("%s: ERROR: Missing cmdline args\n", os.Args[0])
|
||||
os.Exit(22)
|
||||
}
|
||||
var re string
|
||||
re = flag.Args()[0]
|
||||
var inputFiles []*os.File
|
||||
if len(flag.Args()) == 1 || flag.Args()[1] == "-" { // Either no file argument, or file argument is "-"
|
||||
if !slices.Contains(inputFiles, os.Stdin) {
|
||||
inputFiles = append(inputFiles, os.Stdin) // os.Stdin cannot be entered more than once into the file list
|
||||
}
|
||||
} else {
|
||||
inputFilenames := flag.Args()[1:]
|
||||
for _, inputFilename := range inputFilenames {
|
||||
inputFile, err := os.Open(inputFilename)
|
||||
defer inputFile.Close()
|
||||
if err != nil {
|
||||
fmt.Printf("%s: %s: No such file or directory\n", os.Args[0], inputFilename)
|
||||
} else {
|
||||
fileStat, err := inputFile.Stat()
|
||||
if err != nil {
|
||||
fmt.Printf("%v\n", err)
|
||||
os.Exit(2)
|
||||
} else {
|
||||
if fileStat.Mode().IsDir() {
|
||||
if *recursiveFlag {
|
||||
// Walk the directory and open every file in it. Add each file to the filelist.
|
||||
filepath.WalkDir(inputFilename, func(filename string, d fs.DirEntry, err error) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if !d.IsDir() {
|
||||
f, err := os.Open(filename)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
inputFiles = append(inputFiles, f)
|
||||
}
|
||||
return nil
|
||||
})
|
||||
} else {
|
||||
fmt.Printf("%s: %s: Is a directory\n", os.Args[0], inputFilename)
|
||||
}
|
||||
} else {
|
||||
var nullCharPresent bool
|
||||
if nullCharPresent, err = fileContainsNullChar(inputFilename); nullCharPresent {
|
||||
if err != nil {
|
||||
fmt.Printf("%v\n", err)
|
||||
os.Exit(1)
|
||||
} else {
|
||||
fmt.Printf("%s: %s: Is a binary file\n", os.Args[0], inputFilename)
|
||||
}
|
||||
} else {
|
||||
inputFiles = append(inputFiles, inputFile)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if len(inputFiles) == 0 { // No valid files given
|
||||
os.Exit(2)
|
||||
}
|
||||
|
||||
var test_str string
|
||||
var err error
|
||||
var linesRead bool // Whether or not we have read the lines in the file
|
||||
lineNum := 0 // Current line number
|
||||
// Create writer for stdout
|
||||
out := bufio.NewWriter(os.Stdout)
|
||||
// Compile regex
|
||||
regComp, err := reg.Compile(re, flagsToCompile...)
|
||||
if err != nil {
|
||||
fmt.Println(err)
|
||||
return
|
||||
}
|
||||
|
||||
for _, inputFile := range inputFiles {
|
||||
lineNum = 0
|
||||
reader := bufio.NewReader(inputFile)
|
||||
linesRead = false
|
||||
for true {
|
||||
if linesRead {
|
||||
break
|
||||
}
|
||||
if !(*multiLineFlag) {
|
||||
// Read every string from stdin until we encounter an error. If the error isn't EOF, panic.
|
||||
test_str, err = reader.ReadString('\n')
|
||||
lineNum++
|
||||
if err != nil {
|
||||
if err == io.EOF {
|
||||
linesRead = true
|
||||
} else {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
if len(test_str) > 0 && test_str[len(test_str)-1] == '\n' {
|
||||
test_str = test_str[:len(test_str)-1]
|
||||
}
|
||||
} else {
|
||||
// Multi-line mode - read every line of input into a temp. string.
|
||||
// test_str will contain all lines of input (including newline characters)
|
||||
// as one string.
|
||||
var temp string
|
||||
for temp, err = reader.ReadString('\n'); err == nil; temp, err = reader.ReadString('\n') {
|
||||
test_str += temp
|
||||
}
|
||||
// Assuming err != nil
|
||||
if err == io.EOF {
|
||||
if len(temp) > 0 {
|
||||
test_str += temp // Add the last line (if it is non-empty)
|
||||
}
|
||||
linesRead = true
|
||||
} else {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
matchIndices := make([]reg.Match, 0)
|
||||
if matchNumFlagEnabled {
|
||||
tmp, err := regComp.FindNthMatch(test_str, *matchNum)
|
||||
if err == nil {
|
||||
matchIndices = append(matchIndices, tmp)
|
||||
}
|
||||
} else {
|
||||
matchIndices = regComp.FindAllSubmatch(test_str)
|
||||
}
|
||||
|
||||
test_str_runes := []rune(test_str) // Converting to runes preserves unicode characters
|
||||
|
||||
if *printMatchesFlag {
|
||||
// if we are in single line mode, print the line on which
|
||||
// the matches occur
|
||||
if len(matchIndices) > 0 {
|
||||
if !(*multiLineFlag) {
|
||||
fmt.Fprintf(out, "Line %d:\n", lineNum)
|
||||
}
|
||||
for _, m := range matchIndices {
|
||||
fmt.Fprintf(out, "%s\n", m.String())
|
||||
}
|
||||
err := out.Flush()
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
continue
|
||||
}
|
||||
// Decompose the array of matchIndex structs into a flat unique array of ints - if matchIndex is {4,7}, flat array will contain 4,5,6
|
||||
// This should make checking O(1) instead of O(n)
|
||||
indicesToPrint := new_uniq_arr[int]()
|
||||
for _, idx := range matchIndices {
|
||||
indicesToPrint.add(genRange(idx[0].StartIdx, idx[0].EndIdx)...)
|
||||
}
|
||||
// If we are inverting, then we should print the indices which _didn't_ match
|
||||
// in color.
|
||||
if *invertFlag {
|
||||
oldIndices := indicesToPrint.values()
|
||||
indicesToPrint = new_uniq_arr[int]()
|
||||
// Explanation:
|
||||
// Find all numbers from 0 to len(test_str_runes) that are NOT in oldIndices.
|
||||
// These are the values we want to print, now that we have inverted the match.
|
||||
// Re-initialize indicesToPrint and add all of these values to it.
|
||||
indicesToPrint.add(setDifference(genRange(0, len(test_str_runes)), oldIndices)...)
|
||||
|
||||
}
|
||||
// If lineFlag is enabled, we should only print something if:
|
||||
// a. We are not inverting, and have at least one match on the current line
|
||||
// OR
|
||||
// b. We are inverting, and have no matches at all on the current line.
|
||||
// This checks for the inverse, and continues if it is true.
|
||||
if *lineFlag {
|
||||
if !(*invertFlag) && len(matchIndices) == 0 || *invertFlag && len(matchIndices) > 0 {
|
||||
continue
|
||||
} else {
|
||||
if *recursiveFlag || len(flag.Args()) > 2 { // If we have 2 args, then we're only searching 1 file. We should only print the filename if there's more than 1 file.
|
||||
color.New(color.FgMagenta).Fprintf(out, "%s:", inputFile.Name()) // Print filename
|
||||
}
|
||||
if *lineNumFlag {
|
||||
color.New(color.FgGreen).Fprintf(out, "%d:", lineNum) // Print filename
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If we are substituting, we need a different behavior, as follows:
|
||||
// For every character in the test string:
|
||||
// 1. Check if the index is the start of any matchIndex
|
||||
// 2. If so, print the substitute text, and set our index to
|
||||
// the corresponding end index.
|
||||
// 3. If not, just print the character.
|
||||
if substituteFlagEnabled {
|
||||
for i := range test_str_runes {
|
||||
inMatchIndex := false
|
||||
for _, m := range matchIndices {
|
||||
if i == m[0].StartIdx {
|
||||
fmt.Fprintf(out, "%s", *substituteText)
|
||||
i = m[0].EndIdx
|
||||
inMatchIndex = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !inMatchIndex {
|
||||
fmt.Fprintf(out, "%c", test_str_runes[i])
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for i, c := range test_str_runes {
|
||||
if indicesToPrint.contains(i) {
|
||||
color.New(color.FgRed, color.Bold).Fprintf(out, "%c", c)
|
||||
// Newline after every match - only if -o is enabled and -v is disabled.
|
||||
if *onlyFlag && !(*invertFlag) {
|
||||
for matchIdxNum, idx := range matchIndices {
|
||||
if matchIdxNum < len(matchIndices)-1 { // Only print a newline afte printing a match, if there are multiple matches on the line, and we aren't on the last one. This is because the newline that gets added at the end will take care of that.
|
||||
if i+1 == idx[0].EndIdx { // End index is one more than last index of match
|
||||
fmt.Fprintf(out, "\n")
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if !(*onlyFlag) {
|
||||
fmt.Fprintf(out, "%c", c)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
err = out.Flush()
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
// If the last character in the string wasn't a newline, AND we either have don't -o set or we do (and we've matched something), then print a newline
|
||||
if (len(test_str_runes) > 0 && test_str_runes[len(test_str_runes)-1] != '\n') &&
|
||||
(!*onlyFlag || indicesToPrint.len() > 0) {
|
||||
fmt.Println()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@@ -16,7 +16,6 @@ func (s *uniq_arr[T]) add(vals ...T) {
|
||||
s.backingMap[item] = struct{}{}
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func (s uniq_arr[T]) contains(val T) bool {
|
||||
@@ -37,3 +36,7 @@ func (s uniq_arr[T]) values() []T {
|
||||
}
|
||||
return toRet
|
||||
}
|
||||
|
||||
func (s uniq_arr[T]) len() int {
|
||||
return len(s.backingMap)
|
||||
}
|
File diff suppressed because it is too large
Load Diff
173
regex/doc.go
Normal file
173
regex/doc.go
Normal file
@@ -0,0 +1,173 @@
|
||||
/*
|
||||
Package regex implements regular expression search, using a custom non-bracktracking engine with support for lookarounds and numeric ranges.
|
||||
|
||||
The engine relies completely on UTF-8 codepoints. As such, it is capable of matching characters
|
||||
from other languages, emojis and symbols.
|
||||
|
||||
The API and regex syntax are largely compatible with that of the stdlib's [regexp], with a few key differences (see 'Key Differences with regexp').
|
||||
|
||||
The full syntax is specified below.
|
||||
|
||||
# Syntax
|
||||
|
||||
Single characters:
|
||||
|
||||
. Match any character. Newline matching is dependent on the RE_SINGLE_LINE flag.
|
||||
[abc] Character class - match a, b or c
|
||||
[a-z] Character range - match any character from a to z
|
||||
[^abc] Negated character class - match any character except a, b and c
|
||||
[^a-z] Negated character range - do not match any character from a to z
|
||||
\[ Match a literal '['. Backslashes can escape any character with special meaning, including another backslash.
|
||||
\0452 Match the character with the octal value 452 (up to 4 digits, first digit must be 0)
|
||||
\xFF Match the character with the hex value FF (exactly 2 characters)
|
||||
\x{0000FF} Match the character with the hex value 0000FF (exactly 6 characters)
|
||||
\n Newline
|
||||
\a Bell character
|
||||
\f Form-feed character
|
||||
\r Carriage return
|
||||
\t Horizontal tab
|
||||
\v Vertical tab
|
||||
|
||||
Perl classes:
|
||||
|
||||
\d Match any digit character ([0-9])
|
||||
\D Match any non-digit character ([^0-9])
|
||||
\w Match any word character ([a-zA-Z0-9_])
|
||||
\W Match any non-word character ([^a-zA-Z0-9_])
|
||||
\s Match any whitespace character ([ \t\n])
|
||||
\S Match any non-whitespace character ([^ \t\n])
|
||||
|
||||
POSIX classes (inside normal character classes):
|
||||
|
||||
[:digit:] All digit characters ([0-9])
|
||||
[:upper:] All upper-case letters ([A-Z])
|
||||
[:lower:] All lower-case letters ([a-z])
|
||||
[:alpha:] All letters ([a-zA-Z])
|
||||
[:alnum:] All alphanumeric characters ([a-zA-Z0-9])
|
||||
[:xdigit:] All hexadecimal characters ([a-fA-F0-9])
|
||||
[:blank:] All blank characters ([ \t])
|
||||
[:space:] All whitespace characters ([ \t\n\r\f\v])
|
||||
[:cntrl:] All control characters ([\x00-\x1F\x7F])
|
||||
[:punct:] All punctuation characters
|
||||
[:graph:] All graphical characters ([\x21-\x7E])
|
||||
[:print:] All graphical characters + space ([\x20-\x7E])
|
||||
[:word:] All word characters (\w)
|
||||
[:ascii:] All ASCII values ([\x00-\x7F])
|
||||
|
||||
Composition:
|
||||
|
||||
def Match d, followed by e, followed by f
|
||||
x|y Match x or y (prefer x)
|
||||
xy|z Match xy or z (prefer xy)
|
||||
|
||||
Repitition:
|
||||
|
||||
Greedy:
|
||||
x* Match x zero or more times, prefer more
|
||||
x+ Match x one or more times, prefer more
|
||||
x? Match x zero or one time, prefer one
|
||||
x{m,n} Match x between m and n times (inclusive), prefer more
|
||||
x{m,} Match x atleast m times, prefer more
|
||||
x{,n} Match x between 0 and n times (inclusive), prefer more
|
||||
x{m} Match x exactly m times
|
||||
|
||||
Lazy:
|
||||
x*? Match x zero or more times, prefer fewer
|
||||
x+? Match x one or more times, prefer fewer
|
||||
x?? Match x zero or one time, prefer zero
|
||||
x{m,n}? Match x between m and n times (inclusive), prefer fewer
|
||||
x{m,}? Match x atleast m times, prefer fewer
|
||||
x{,n}? Match x between 0 and n times (inclusive), prefer fewer
|
||||
x{m} Match x exactly m times
|
||||
|
||||
Grouping:
|
||||
|
||||
(expr) Create a capturing group. The contents of the group can be retrieved with [FindAllMatches]
|
||||
x(y|z) Match x followed by y or z. Given a successful match, the contents of group 1 will include either y or z
|
||||
(?:expr) Create a non-capturing group. The contents of the group aren't saved.
|
||||
x(?:y|z) Match x followed by y or z. No groups are created.
|
||||
|
||||
Assertions:
|
||||
|
||||
^ Match at the start of the input string. If RE_MULTILINE is enabled, it also matches at the start of every line.
|
||||
$ Match at the end of the input string. If RE_MULTILINE is enabled, it also matches at the end of every line.
|
||||
\A Always match at the start of the string, regardless of RE_MULTILINE
|
||||
\z Always match at the end of the string, regardless of RE_MULTILINE
|
||||
\b Match at a word boundary (a word character followed by a non-word character, or vice-versa)
|
||||
\B Match at a non-word boundary (a word character followed by a word character, or vice-versa)
|
||||
|
||||
Lookarounds:
|
||||
|
||||
x(?=y) Positive lookahead - Match x if followed by y
|
||||
x(?!y) Negative lookahead - Match x if NOT followed by y
|
||||
(?<=x)y Positive lookbehind - Match y if preceded by x
|
||||
(?<!x)y Negative lookbehind - Match y if NOT preceded by x
|
||||
|
||||
Backreferences:
|
||||
|
||||
(xy)\1 Match 'xy' followed by the text most recently captured by group 1 (in this case, 'xy')
|
||||
|
||||
Numeric ranges:
|
||||
|
||||
<x-y> Match any number from x to y (inclusive) (x and y must be positive numbers)
|
||||
\<x Match a literal '<' followed by x
|
||||
|
||||
# Key Differences with regexp
|
||||
|
||||
The engine and the API differ from [regexp] in a few ways, some of them very subtle.
|
||||
The key differences are mentioned below.
|
||||
|
||||
1. Byte-slices and runes:
|
||||
|
||||
My engine does not support byte-slices. When a matching function receives a string, it converts it into a
|
||||
rune-slice to iterate through it. While this has some space overhead, the convenience of built-in unicode
|
||||
support made the tradeoff worth it.
|
||||
|
||||
2. Return values
|
||||
|
||||
Rather than using primitives for return values, my engine defines two types that are used as return
|
||||
values: a [Group] represents a capturing group, and a [Match] represents a list of groups.
|
||||
|
||||
[regexp] specifies a regular expression that gives a list of all the matching functions that it supports. The
|
||||
equivalent expression for this engine is shown below. Note that 'Index' is the default.
|
||||
|
||||
Find(All)?(String)?(Submatch)?
|
||||
|
||||
[Reg.Find] returns the index of the leftmost match in the string.
|
||||
|
||||
If a function contains 'All' it returns all matches instead of just the leftmost one.
|
||||
|
||||
If a function contains 'String' it returns the matched text, rather than the index in the string.
|
||||
|
||||
If a function contains 'Submatch' it returns the match, including all submatches found by
|
||||
capturing groups.
|
||||
|
||||
The term '0-group' is used to refer to the 0th capturing group of a match (which is the entire match).
|
||||
Given the following regex:
|
||||
|
||||
x(y)
|
||||
|
||||
and the input string:
|
||||
|
||||
xyz
|
||||
|
||||
The 0th group would contain 'xy' and the 1st group would contain 'y'. Any matching function without 'Submatch' in its name
|
||||
returns the 0-group.
|
||||
|
||||
# Feature Differences
|
||||
|
||||
The following features from [regexp] are (currently) NOT supported:
|
||||
1. Named capturing groups
|
||||
2. Negated POSIX classes
|
||||
3. Embedded flags (flags are instead passed as arguments to [Compile])
|
||||
4. Literal text with \Q ... \E
|
||||
5. Finite repetition with no start (defaulting at 0)
|
||||
|
||||
The following features are not available in [regexp], but are supported in my engine:
|
||||
1. Lookarounds
|
||||
2. Numeric ranges
|
||||
3. Backreferences
|
||||
|
||||
I hope to shorten the first list, and expand the second.
|
||||
*/
|
||||
package regex
|
181
regex/example_test.go
Normal file
181
regex/example_test.go
Normal file
@@ -0,0 +1,181 @@
|
||||
package regex_test
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"gitea.twomorecents.org/Rockingcool/kleingrep/regex"
|
||||
)
|
||||
|
||||
func ExampleReg_Find() {
|
||||
regexStr := "b|a"
|
||||
regexComp := regex.MustCompile(regexStr)
|
||||
|
||||
match, _ := regexComp.Find("banana")
|
||||
fmt.Println(match.String())
|
||||
|
||||
// Output: 0 1
|
||||
}
|
||||
|
||||
func ExampleReg_FindAll() {
|
||||
regexStr := "b|a"
|
||||
regexComp := regex.MustCompile(regexStr)
|
||||
|
||||
matches := regexComp.FindAll("banana")
|
||||
for _, group := range matches {
|
||||
fmt.Println(group.String())
|
||||
}
|
||||
|
||||
// Output: 0 1
|
||||
// 1 2
|
||||
// 3 4
|
||||
// 5 6
|
||||
}
|
||||
|
||||
func ExampleReg_FindString() {
|
||||
regexStr := `\w+\s+(?=sheep)`
|
||||
regexComp := regex.MustCompile(regexStr)
|
||||
|
||||
matchStr := regexComp.FindString("pink cows and yellow sheep")
|
||||
fmt.Println(matchStr)
|
||||
// Output: yellow
|
||||
}
|
||||
|
||||
func ExampleReg_FindSubmatch() {
|
||||
regexStr := `(\d)\.(\d)(\d)`
|
||||
regexComp := regex.MustCompile(regexStr)
|
||||
|
||||
match, _ := regexComp.FindSubmatch("3.14")
|
||||
fmt.Println(match[0])
|
||||
fmt.Println(match[1])
|
||||
fmt.Println(match[2])
|
||||
// Output: 0 4
|
||||
// 0 1
|
||||
// 2 3
|
||||
}
|
||||
|
||||
func ExampleReg_FindStringSubmatch() {
|
||||
regexStr := `(\d{4})-(\d{2})-(\d{2})`
|
||||
regexComp := regex.MustCompile(regexStr)
|
||||
inputStr := `The date is 2025-02-10`
|
||||
|
||||
match := regexComp.FindStringSubmatch(inputStr)
|
||||
fmt.Println(match[1])
|
||||
fmt.Println(match[3])
|
||||
// Output: 2025
|
||||
// 10
|
||||
}
|
||||
|
||||
func ExampleReg_FindAllSubmatch() {
|
||||
regexStr := `(\d)\.(\d)(\d)`
|
||||
regexComp := regex.MustCompile(regexStr)
|
||||
|
||||
matches := regexComp.FindAllSubmatch("3.14+8.97")
|
||||
fmt.Println(matches[0][0]) // 0-group (entire match) of 1st match (0-indexed)
|
||||
fmt.Println(matches[0][1]) // 1st group of 1st match
|
||||
fmt.Println(matches[1][0]) // 0-group of 2nd match
|
||||
fmt.Println(matches[1][1]) // 1st group of 2nd math
|
||||
// Output: 0 4
|
||||
// 0 1
|
||||
// 5 9
|
||||
// 5 6
|
||||
}
|
||||
|
||||
func ExampleReg_FindAllString() {
|
||||
regexStr := `<0-255>\.<0-255>\.<0-255>\.<0-255>`
|
||||
inputStr := `192.168.220.7 pings 9.9.9.9`
|
||||
regexComp := regex.MustCompile(regexStr)
|
||||
|
||||
matchStrs := regexComp.FindAllString(inputStr)
|
||||
|
||||
fmt.Println(matchStrs[0])
|
||||
fmt.Println(matchStrs[1])
|
||||
// Output: 192.168.220.7
|
||||
// 9.9.9.9
|
||||
}
|
||||
|
||||
func ExampleReg_FindAllStringSubmatch() {
|
||||
// 'https' ...
|
||||
// followed by 1 or more alphanumeric characters (including period) ...
|
||||
// then a forward slash ...
|
||||
// followed by one more of :
|
||||
// word character,
|
||||
// question mark,
|
||||
// period,
|
||||
// equals sign
|
||||
regexStr := `https://([a-z0-9\.]+)/([\w.?=]+)`
|
||||
regexComp := regex.MustCompile(regexStr, regex.RE_CASE_INSENSITIVE)
|
||||
inputStr := `You can find me at https://twomorecents.org/index.html and https://news.ycombinator.com/user?id=aadhavans`
|
||||
|
||||
matchIndices := regexComp.FindAllStringSubmatch(inputStr)
|
||||
fmt.Println(matchIndices[0][1]) // 1st group of 1st match (0-indexed)
|
||||
fmt.Println(matchIndices[0][2]) // 2nd group of 1st match
|
||||
fmt.Println(matchIndices[1][1]) // 1st group of 2nd match
|
||||
fmt.Println(matchIndices[1][2]) // 2nd group of 2nd match
|
||||
// Output: twomorecents.org
|
||||
// index.html
|
||||
// news.ycombinator.com
|
||||
// user?id=aadhavans
|
||||
|
||||
}
|
||||
|
||||
func ExampleReg_Expand() {
|
||||
inputStr := `option1: value1
|
||||
option2: value2`
|
||||
regexStr := `(\w+): (\w+)`
|
||||
templateStr := "$1 = $2\n"
|
||||
regexComp := regex.MustCompile(regexStr, regex.RE_MULTILINE)
|
||||
result := ""
|
||||
for _, submatches := range regexComp.FindAllSubmatch(inputStr) {
|
||||
result = regexComp.Expand(result, templateStr, inputStr, submatches)
|
||||
}
|
||||
fmt.Println(result)
|
||||
// Output: option1 = value1
|
||||
// option2 = value2
|
||||
|
||||
}
|
||||
|
||||
func ExampleReg_LiteralPrefix() {
|
||||
regexStr := `a(b|c)d*`
|
||||
regexComp := regex.MustCompile(regexStr)
|
||||
prefix, complete := regexComp.LiteralPrefix()
|
||||
fmt.Println(prefix)
|
||||
fmt.Println(complete)
|
||||
// Output: a
|
||||
// false
|
||||
}
|
||||
|
||||
func ExampleReg_Longest() {
|
||||
regexStr := `x|xx`
|
||||
inputStr := "xx"
|
||||
regexComp := regex.MustCompile(regexStr)
|
||||
fmt.Println(regexComp.FindString(inputStr))
|
||||
regexComp.Longest()
|
||||
fmt.Println(regexComp.FindString(inputStr))
|
||||
// Output: x
|
||||
// xx
|
||||
}
|
||||
|
||||
func ExampleReg_ReplaceAll() {
|
||||
regexStr := `(\d)(\w)`
|
||||
inputStr := "5d9t"
|
||||
regexComp := regex.MustCompile(regexStr)
|
||||
fmt.Println(regexComp.ReplaceAll(inputStr, `$2$1`))
|
||||
// Output: d5t9
|
||||
}
|
||||
|
||||
func ExampleReg_ReplaceAllLiteral() {
|
||||
regexStr := `fox|dog`
|
||||
inputStr := "the quick brown fox jumped over the lazy dog"
|
||||
regexComp := regex.MustCompile(regexStr)
|
||||
fmt.Println(regexComp.ReplaceAllLiteral(inputStr, `duck`))
|
||||
// Output: the quick brown duck jumped over the lazy duck
|
||||
}
|
||||
|
||||
func ExampleReg_ReplaceAllFunc() {
|
||||
regexStr := `\w{5,}`
|
||||
inputStr := `all five or more letter words in this string are capitalized`
|
||||
regexComp := regex.MustCompile(regexStr)
|
||||
fmt.Println(regexComp.ReplaceAllFunc(inputStr, strings.ToUpper))
|
||||
// Output: all five or more LETTER WORDS in this STRING are CAPITALIZED
|
||||
}
|
476
regex/matching.go
Normal file
476
regex/matching.go
Normal file
@@ -0,0 +1,476 @@
|
||||
package regex
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strconv"
|
||||
"unicode"
|
||||
)
|
||||
|
||||
// A Match represents a match found by the regex in a given string.
|
||||
// It is represented as a list of groups, where the nth element contains
|
||||
// the contents of the nth capturing group. Note that the group may not be valid
|
||||
// (see [Group.IsValid]). The element at index 0 is known
|
||||
// as the 0-group, and represents the contents of the entire match.
|
||||
//
|
||||
// See [Reg.FindSubmatch] for an example.
|
||||
type Match []Group
|
||||
|
||||
// a Group represents a capturing group. It contains the start and index of the group.
|
||||
type Group struct {
|
||||
StartIdx int
|
||||
EndIdx int
|
||||
}
|
||||
|
||||
func newMatch(size int) Match {
|
||||
toRet := make([]Group, size)
|
||||
for i := range toRet {
|
||||
toRet[i].StartIdx = -1
|
||||
toRet[i].EndIdx = -1
|
||||
}
|
||||
return toRet
|
||||
}
|
||||
|
||||
// Returns a string containing the indices of all (valid) groups in the match
|
||||
func (m Match) String() string {
|
||||
var toRet string
|
||||
for i, g := range m {
|
||||
if g.IsValid() {
|
||||
toRet += fmt.Sprintf("Group %d\n", i)
|
||||
toRet += g.String()
|
||||
toRet += "\n"
|
||||
}
|
||||
}
|
||||
return toRet
|
||||
}
|
||||
|
||||
// String converts the Group into a string representation.
|
||||
func (idx Group) String() string {
|
||||
return fmt.Sprintf("%d\t%d", idx.StartIdx, idx.EndIdx)
|
||||
}
|
||||
|
||||
// IsValid returns whether a group is valid (ie. whether it matched any text). It
|
||||
// simply ensures that both indices of the group are >= 0.
|
||||
func (g Group) IsValid() bool {
|
||||
return g.StartIdx >= 0 && g.EndIdx >= 0
|
||||
}
|
||||
|
||||
// Simple function, makes it easier to map this over a list of matches
|
||||
func getZeroGroup(m Match) Group {
|
||||
return m[0]
|
||||
}
|
||||
|
||||
func copyThread(to *nfaState, from nfaState) {
|
||||
to.threadGroups = append([]Group{}, from.threadGroups...)
|
||||
}
|
||||
|
||||
// Find returns the 0-group of the leftmost match of the regex in the given string.
|
||||
// An error value != nil indicates that no match was found.
|
||||
func (re Reg) Find(str string) (Group, error) {
|
||||
match, err := re.FindNthMatch(str, 1)
|
||||
if err != nil {
|
||||
return Group{}, fmt.Errorf("no matches found")
|
||||
}
|
||||
return getZeroGroup(match), nil
|
||||
}
|
||||
|
||||
// Match returns a boolean value, indicating whether the regex found a match in the given string.
|
||||
func (re Reg) Match(str string) bool {
|
||||
_, err := re.Find(str)
|
||||
return err == nil
|
||||
}
|
||||
|
||||
// CompileMatch compiles expr and returns true if str contains a match of the expression.
|
||||
// It is equivalent to [regexp.Match].
|
||||
// An optional list of flags may be provided (see [ReFlag]).
|
||||
// It returns an error (!= nil) if there was an error compiling the expression.
|
||||
func CompileMatch(expr string, str string, flags ...ReFlag) (bool, error) {
|
||||
re, err := Compile(expr, flags...)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
return re.Match(str), nil
|
||||
}
|
||||
|
||||
// FindAll returns a slice containing all the 0-groups of the regex in the given string.
|
||||
// A 0-group represents the match without any submatches.
|
||||
func (re Reg) FindAll(str string) []Group {
|
||||
indices := re.FindAllSubmatch(str)
|
||||
zeroGroups := funcMap(indices, getZeroGroup)
|
||||
return zeroGroups
|
||||
}
|
||||
|
||||
// FindString returns the text of the leftmost match of the regex in the given string.
|
||||
// The return value will be an empty string in two situations:
|
||||
// 1. No match was found
|
||||
// 2. The match was an empty string
|
||||
func (re Reg) FindString(str string) string {
|
||||
match, err := re.FindNthMatch(str, 1)
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
zeroGroup := getZeroGroup(match)
|
||||
return str[zeroGroup.StartIdx:zeroGroup.EndIdx]
|
||||
}
|
||||
|
||||
// FindSubmatch returns the leftmost match of the regex in the given string, including
|
||||
// the submatches matched by capturing groups. The returned [Match] will always contain the same
|
||||
// number of groups. The validity of a group (whether or not it matched anything) can be determined with
|
||||
// [Group.IsValid], or by checking that both indices of the group are >= 0.
|
||||
// The second-return value is nil if no match was found.
|
||||
func (re Reg) FindSubmatch(str string) (Match, error) {
|
||||
match, err := re.FindNthMatch(str, 1)
|
||||
if err != nil {
|
||||
return Match{}, fmt.Errorf("no match found")
|
||||
} else {
|
||||
return match, nil
|
||||
}
|
||||
}
|
||||
|
||||
// FindStringSubmatch is the 'string' version of [FindSubmatch]. It returns a slice of strings,
|
||||
// where the string at index i contains the text matched by the i-th capturing group.
|
||||
// The 0-th index represents the entire match.
|
||||
// An empty string at index n could mean:
|
||||
// ,
|
||||
// 1. Group n did not find a match
|
||||
// 2. Group n found a zero-length match
|
||||
//
|
||||
// A return value of nil indicates no match.
|
||||
func (re Reg) FindStringSubmatch(str string) []string {
|
||||
matchStr := make([]string, re.numGroups+1)
|
||||
match, err := re.FindSubmatch(str)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
nonEmptyMatchFound := false
|
||||
for i := range match {
|
||||
if match[i].IsValid() {
|
||||
matchStr[i] = str[match[i].StartIdx:match[i].EndIdx]
|
||||
nonEmptyMatchFound = true
|
||||
} else {
|
||||
matchStr[i] = ""
|
||||
}
|
||||
}
|
||||
if nonEmptyMatchFound == false {
|
||||
return nil
|
||||
}
|
||||
return matchStr
|
||||
}
|
||||
|
||||
// FindAllString is the 'all' version of [FindString].
|
||||
// It returns a slice of strings containing the text of all matches of
|
||||
// the regex in the given string.
|
||||
func (re Reg) FindAllString(str string) []string {
|
||||
zerogroups := re.FindAll(str)
|
||||
matchStrs := funcMap(zerogroups, func(g Group) string {
|
||||
return str[g.StartIdx:g.EndIdx]
|
||||
})
|
||||
return matchStrs
|
||||
}
|
||||
|
||||
// FindNthMatch return the 'n'th match of the regex in the given string.
|
||||
// It returns an error (!= nil) if there are fewer than 'n' matches in the string.
|
||||
func (re Reg) FindNthMatch(str string, n int) (Match, error) {
|
||||
idx := 0
|
||||
matchNum := 0
|
||||
str_runes := []rune(str)
|
||||
var matchFound bool
|
||||
var matchIdx Match
|
||||
for idx <= len(str_runes) {
|
||||
matchFound, matchIdx, idx = findAllSubmatchHelper(re.start, str_runes, idx, re.numGroups, re.preferLongest)
|
||||
if matchFound {
|
||||
matchNum++
|
||||
}
|
||||
if matchNum == n {
|
||||
return matchIdx, nil
|
||||
}
|
||||
}
|
||||
// We haven't found the nth match after scanning the string - Return an error
|
||||
return nil, fmt.Errorf("invalid match index - too few matches found")
|
||||
}
|
||||
|
||||
// FindAllSubmatch returns a slice of matches in the given string.
|
||||
func (re Reg) FindAllSubmatch(str string) []Match {
|
||||
idx := 0
|
||||
str_runes := []rune(str)
|
||||
var matchFound bool
|
||||
var matchIdx Match
|
||||
indices := make([]Match, 0)
|
||||
for idx <= len(str_runes) {
|
||||
matchFound, matchIdx, idx = findAllSubmatchHelper(re.start, str_runes, idx, re.numGroups, re.preferLongest)
|
||||
if matchFound {
|
||||
indices = append(indices, matchIdx)
|
||||
}
|
||||
}
|
||||
|
||||
return indices
|
||||
}
|
||||
|
||||
// FindAllSubmatch returns a double-slice of strings. Each slice contains the text of a match, including all submatches.
|
||||
// A return value of nil indicates no match.
|
||||
func (re Reg) FindAllStringSubmatch(str string) [][]string {
|
||||
match := re.FindAllSubmatch(str)
|
||||
if len(match) == 0 {
|
||||
return nil
|
||||
}
|
||||
rtv := make([][]string, len(match))
|
||||
for i := range rtv {
|
||||
rtv[i] = make([]string, re.numGroups+1)
|
||||
}
|
||||
rtv = funcMap(match, func(m Match) []string {
|
||||
return funcMap(m, func(g Group) string {
|
||||
if g.IsValid() {
|
||||
return str[g.StartIdx:g.EndIdx]
|
||||
} else {
|
||||
return ""
|
||||
}
|
||||
})
|
||||
})
|
||||
return rtv
|
||||
}
|
||||
|
||||
func addStateToList(str []rune, idx int, list []nfaState, state nfaState, threadGroups []Group, visited []nfaState, preferLongest bool) []nfaState {
|
||||
if stateExists(list, state) || stateExists(visited, state) {
|
||||
return list
|
||||
}
|
||||
visited = append(visited, state)
|
||||
|
||||
if (state.isKleene || state.isQuestion) && (state.isLazy == false) { // Greedy quantifiers
|
||||
copyThread(state.splitState, state)
|
||||
list := addStateToList(str, idx, list, *state.splitState, threadGroups, visited, preferLongest)
|
||||
copyThread(state.next, state)
|
||||
list = addStateToList(str, idx, list, *state.next, threadGroups, visited, preferLongest)
|
||||
return list
|
||||
}
|
||||
if state.isAlternation || ((state.isKleene || state.isQuestion) && state.isLazy) { // Alternation or lazy quantifier
|
||||
copyThread(state.next, state)
|
||||
list := addStateToList(str, idx, list, *state.next, threadGroups, visited, preferLongest)
|
||||
copyThread(state.splitState, state)
|
||||
list = addStateToList(str, idx, list, *state.splitState, threadGroups, visited, preferLongest)
|
||||
return list
|
||||
}
|
||||
state.threadGroups = append([]Group{}, threadGroups...)
|
||||
if state.assert != noneAssert {
|
||||
if state.checkAssertion(str, idx, preferLongest) {
|
||||
copyThread(state.next, state)
|
||||
return addStateToList(str, idx, list, *state.next, state.threadGroups, visited, preferLongest)
|
||||
}
|
||||
}
|
||||
if state.groupBegin {
|
||||
state.threadGroups[state.groupNum].StartIdx = idx
|
||||
copyThread(state.next, state)
|
||||
return addStateToList(str, idx, list, *state.next, state.threadGroups, visited, preferLongest)
|
||||
}
|
||||
if state.groupEnd {
|
||||
state.threadGroups[state.groupNum].EndIdx = idx
|
||||
copyThread(state.next, state)
|
||||
return addStateToList(str, idx, list, *state.next, state.threadGroups, visited, preferLongest)
|
||||
}
|
||||
return append(list, state)
|
||||
|
||||
}
|
||||
|
||||
// Helper for FindAllMatches. Returns whether it found a match, the
|
||||
// first Match it finds, and how far it got into the string ie. where
|
||||
// the next search should start from.
|
||||
func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups int, preferLongest bool) (bool, Match, int) {
|
||||
// Base case - exit if offset exceeds string's length
|
||||
if offset > len(str) {
|
||||
// The second value here shouldn't be used, because we should exit when the third return value is > than len(str)
|
||||
return false, []Group{}, offset
|
||||
}
|
||||
resetThreads(start)
|
||||
|
||||
currentStates := make([]nfaState, 0)
|
||||
nextStates := make([]nfaState, 0)
|
||||
i := offset // Index in string
|
||||
|
||||
// If the first state is an assertion, makes sure the assertion
|
||||
// is true before we do _anything_ else.
|
||||
if start.assert != noneAssert {
|
||||
if start.checkAssertion(str, offset, preferLongest) == false {
|
||||
i++
|
||||
return false, []Group{}, i
|
||||
}
|
||||
}
|
||||
|
||||
start.threadGroups = newMatch(numGroups + 1)
|
||||
start.threadGroups[0].StartIdx = i
|
||||
currentStates = addStateToList(str, i, currentStates, *start, start.threadGroups, nil, preferLongest)
|
||||
var match Match = nil
|
||||
for idx := i; idx <= len(str); idx++ {
|
||||
if len(currentStates) == 0 {
|
||||
break
|
||||
}
|
||||
for currentStateIdx := 0; currentStateIdx < len(currentStates); currentStateIdx++ {
|
||||
currentState := currentStates[currentStateIdx]
|
||||
|
||||
if currentState.threadGroups == nil {
|
||||
currentState.threadGroups = newMatch(numGroups + 1)
|
||||
currentState.threadGroups[0].StartIdx = idx
|
||||
}
|
||||
|
||||
if currentState.isLast {
|
||||
currentState.threadGroups[0].EndIdx = idx
|
||||
match = append([]Group{}, currentState.threadGroups...)
|
||||
if !preferLongest {
|
||||
break
|
||||
}
|
||||
} else if !currentState.isAlternation && !currentState.isKleene && !currentState.isQuestion && !currentState.isBackreference && !currentState.groupBegin && !currentState.groupEnd && currentState.assert == noneAssert { // Normal character
|
||||
if currentState.contentContains(str, idx, preferLongest) {
|
||||
nextStates = addStateToList(str, idx+1, nextStates, *currentState.next, currentState.threadGroups, nil, preferLongest)
|
||||
}
|
||||
} else if currentState.isBackreference && currentState.threadGroups[currentState.referredGroup].IsValid() {
|
||||
groupLength := currentState.threadGroups[currentState.referredGroup].EndIdx - currentState.threadGroups[currentState.referredGroup].StartIdx
|
||||
if currentState.threadBackref == groupLength {
|
||||
currentState.threadBackref = 0
|
||||
copyThread(currentState.next, currentState)
|
||||
currentStates = addStateToList(str, idx, currentStates, *currentState.next, currentState.threadGroups, nil, preferLongest)
|
||||
} else {
|
||||
idxInReferredGroup := currentState.threadGroups[currentState.referredGroup].StartIdx + currentState.threadBackref
|
||||
if idxInReferredGroup < len(str) && idx < len(str) && str[idxInReferredGroup] == str[idx] {
|
||||
currentState.threadBackref += 1
|
||||
nextStates = append(nextStates, currentState)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
currentStates = append([]nfaState{}, nextStates...)
|
||||
nextStates = nil
|
||||
}
|
||||
if match != nil {
|
||||
if offset == match[0].EndIdx {
|
||||
return true, match, match[0].EndIdx + 1
|
||||
}
|
||||
return true, match, match[0].EndIdx
|
||||
}
|
||||
return false, []Group{}, i + 1
|
||||
}
|
||||
|
||||
// Expand appends template to dst, expanding any variables in template to the relevant capturing group.
|
||||
//
|
||||
// A variable is of the form '$n', where 'n' is a number. It will be replaced by the contents of the n-th capturing group.
|
||||
// To insert a literal $, do not put a number after it. Alternatively, you can use $$.
|
||||
// src is the input string, and match must be the result of [Reg.FindSubmatch].
|
||||
func (re Reg) Expand(dst string, template string, src string, match Match) string {
|
||||
templateRuneSlc := []rune(template)
|
||||
srcRuneSlc := []rune(src)
|
||||
i := 0
|
||||
for i < len(templateRuneSlc) {
|
||||
c := templateRuneSlc[i]
|
||||
if c == '$' {
|
||||
i += 1
|
||||
// The dollar sign is the last character of the string, or it is proceeded by another dollar sign
|
||||
if i >= len(templateRuneSlc) || templateRuneSlc[i] == '$' {
|
||||
dst += "$"
|
||||
i++
|
||||
} else {
|
||||
numStr := ""
|
||||
for i < len(templateRuneSlc) && unicode.IsDigit(templateRuneSlc[i]) {
|
||||
numStr += string(templateRuneSlc[i])
|
||||
i++
|
||||
}
|
||||
if numStr == "" {
|
||||
dst += "$"
|
||||
} else {
|
||||
num, _ := strconv.Atoi(numStr)
|
||||
if num < len(match) {
|
||||
dst += string(srcRuneSlc[match[num].StartIdx:match[num].EndIdx])
|
||||
} else {
|
||||
dst += "$" + numStr
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
dst += string(c)
|
||||
i++
|
||||
}
|
||||
}
|
||||
return dst
|
||||
}
|
||||
|
||||
// LiteralPrefix returns a string that must begin any match of the given regular expression.
|
||||
// The second return value is true if the string comprises the entire expression.
|
||||
func (re Reg) LiteralPrefix() (prefix string, complete bool) {
|
||||
state := re.start
|
||||
if state.assert != noneAssert {
|
||||
state = state.next
|
||||
}
|
||||
for !(state.isLast) && (!state.isAlternation) && len(state.content) == 1 && state.assert == noneAssert {
|
||||
if state.groupBegin || state.groupEnd {
|
||||
state = state.next
|
||||
continue
|
||||
}
|
||||
prefix += string(rune(state.content[0]))
|
||||
state = state.next
|
||||
}
|
||||
if state.isLast {
|
||||
complete = true
|
||||
} else {
|
||||
complete = false
|
||||
}
|
||||
return prefix, complete
|
||||
}
|
||||
|
||||
// ReplaceAll replaces all matches of the expression in src, with the text in repl. In repl, variables are interpreted
|
||||
// as they are in [Reg.Expand]. The resulting string is returned.
|
||||
func (re Reg) ReplaceAll(src string, repl string) string {
|
||||
matches := re.FindAllSubmatch(src)
|
||||
i := 0
|
||||
currentMatch := 0
|
||||
dst := ""
|
||||
for i < len(src) {
|
||||
if currentMatch < len(matches) && matches[currentMatch][0].IsValid() && i == matches[currentMatch][0].StartIdx {
|
||||
dst += re.Expand("", repl, src, matches[currentMatch])
|
||||
i = matches[currentMatch][0].EndIdx
|
||||
currentMatch++
|
||||
} else {
|
||||
dst += string(src[i])
|
||||
i++
|
||||
}
|
||||
}
|
||||
return dst
|
||||
}
|
||||
|
||||
// ReplaceAllLiteral replaces all matches of the expression in src, with the text in repl. The text is replaced directly,
|
||||
// without any expansion.
|
||||
func (re Reg) ReplaceAllLiteral(src string, repl string) string {
|
||||
zerogroups := re.FindAll(src)
|
||||
currentMatch := 0
|
||||
i := 0
|
||||
dst := ""
|
||||
|
||||
for i < len(src) {
|
||||
if currentMatch < len(zerogroups) && i == zerogroups[currentMatch].StartIdx {
|
||||
dst += repl
|
||||
i = zerogroups[currentMatch].EndIdx
|
||||
currentMatch += 1
|
||||
} else {
|
||||
dst += string(src[i])
|
||||
i++
|
||||
}
|
||||
}
|
||||
return dst
|
||||
}
|
||||
|
||||
// ReplaceAllFunc replaces every match of the expression in src, with the return value of the function replFunc.
|
||||
// replFunc takes in the matched string. The return value is substituted in directly without expasion.
|
||||
func (re Reg) ReplaceAllFunc(src string, replFunc func(string) string) string {
|
||||
zerogroups := re.FindAll(src)
|
||||
currentMatch := 0
|
||||
i := 0
|
||||
dst := ""
|
||||
|
||||
for i < len(src) {
|
||||
if currentMatch < len(zerogroups) && i == zerogroups[currentMatch].StartIdx {
|
||||
dst += replFunc(src[zerogroups[currentMatch].StartIdx:zerogroups[currentMatch].EndIdx])
|
||||
i = zerogroups[currentMatch].EndIdx
|
||||
currentMatch += 1
|
||||
} else {
|
||||
dst += string(src[i])
|
||||
i++
|
||||
}
|
||||
}
|
||||
return dst
|
||||
|
||||
}
|
@@ -1,4 +1,4 @@
|
||||
package greg
|
||||
package regex
|
||||
|
||||
import (
|
||||
"slices"
|
||||
@@ -8,16 +8,19 @@ import (
|
||||
var whitespaceChars = []rune{' ', '\t', '\n'}
|
||||
var digitChars = []rune{'0', '1', '2', '3', '4', '5', '6', '7', '8', '9'}
|
||||
var wordChars = []rune("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_")
|
||||
var LBRACKET rune = 0xF0001
|
||||
var RBRACKET rune = 0xF0002
|
||||
var ANY_CHAR rune = 0xF0003 // Represents any character - used for states where the allChars flag is on.
|
||||
var LPAREN_CHAR rune = 0xF0004 // Parentheses in regex are concatenated with this - it acts as a pseudio-parentheses
|
||||
var RPAREN_CHAR rune = 0xF0005
|
||||
var NONCAPLPAREN_CHAR rune = 0xF0006 // Represents a non-capturing group's LPAREN
|
||||
var ESC_BACKSLASH rune = 0xF0007 // Represents an escaped backslash
|
||||
var CHAR_RANGE rune = 0xF0008 // Represents a character range
|
||||
var lbracketRune rune = 0xF0002
|
||||
var rbracketRune rune = 0xF0003
|
||||
var anyCharRune rune = 0xF0004 // Represents any character - used for states where the allChars flag is on.
|
||||
var lparenRune rune = 0xF0005 // Parentheses in regex are concatenated with this - it acts as a pseudio-parentheses
|
||||
var rparenRune rune = 0xF0006
|
||||
var nonCapLparenRune rune = 0xF0007 // Represents a non-capturing group's LPAREN
|
||||
var escBackslashRune rune = 0xF0008 // Represents an escaped backslash
|
||||
var charRangeRune rune = 0xF0009 // Represents a character range
|
||||
var lazyKleeneRune rune = 0xF000A // Represents a lazy kleene star
|
||||
var lazyPlusRune rune = 0xF000B // Represents a lazy plus operator
|
||||
var lazyQuestionRune rune = 0xF000C // Represents a lazy question operator
|
||||
|
||||
var specialChars = []rune{'?', '*', '\\', '^', '$', '{', '}', '(', ')', '[', ']', '+', '|', '.', CONCAT, '<', '>', LBRACKET, RBRACKET, NONCAPLPAREN_CHAR}
|
||||
var specialChars = []rune{'?', lazyQuestionRune, '*', lazyKleeneRune, '\\', '^', '$', '{', '}', '(', ')', '[', ']', '+', lazyPlusRune, '|', '.', concatRune, '<', '>', lbracketRune, rbracketRune, nonCapLparenRune}
|
||||
|
||||
// An interface for int and rune, which are identical
|
||||
type character interface {
|
||||
@@ -48,33 +51,9 @@ func isNormalChar(c rune) bool {
|
||||
return !slices.Contains(specialChars, c)
|
||||
}
|
||||
|
||||
// Ensure that the given elements are only appended to the given slice if they
// don't already exist. Returns the new slice, and the number of unique items appended.
func unique_append[T comparable](slc []T, items ...T) ([]T, int) {
	appended := 0
	for _, candidate := range items {
		// Skip anything already present (including duplicates appended
		// earlier in this same call).
		if slices.Contains(slc, candidate) {
			continue
		}
		slc = append(slc, candidate)
		appended++
	}
	return slc, appended
}
|
||||
|
||||
// Returns true only if all the given elements are equal.
// An empty argument list is trivially considered equal.
func allEqual[T comparable](items ...T) bool {
	// Guard the empty case - indexing items[0] below would panic.
	if len(items) == 0 {
		return true
	}
	first := items[0]
	for _, item := range items[1:] {
		if item != first {
			return false
		}
	}
	return true
}
|
||||
|
||||
// Map function - convert a slice of T to a slice of V, based on a function
|
||||
// that maps a T to a V
|
||||
func Map[T, V any](slc []T, fn func(T) V) []V {
|
||||
func funcMap[T, V any](slc []T, fn func(T) V) []V {
|
||||
toReturn := make([]V, len(slc))
|
||||
for i, val := range slc {
|
||||
toReturn[i] = fn(val)
|
||||
@@ -84,7 +63,7 @@ func Map[T, V any](slc []T, fn func(T) V) []V {
|
||||
|
||||
// Reduce function - reduces a slice of a type into a value of the type,
|
||||
// based on the given function.
|
||||
func Reduce[T any](slc []T, fn func(T, T) T) T {
|
||||
func funcReduce[T any](slc []T, fn func(T, T) T) T {
|
||||
if len(slc) == 0 {
|
||||
panic("Reduce on empty slice.")
|
||||
}
|
449
regex/nfa.go
Normal file
449
regex/nfa.go
Normal file
@@ -0,0 +1,449 @@
|
||||
package regex
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"slices"
|
||||
)
|
||||
|
||||
const epsilon int = 0xF0000
|
||||
|
||||
type assertType int
|
||||
|
||||
const (
|
||||
noneAssert assertType = iota
|
||||
sosAssert // Start of string (^)
|
||||
soiAssert // Start of input (\A)
|
||||
eosAssert // End of string ($)
|
||||
eoiAssert // End of input (\Z)
|
||||
wboundAssert
|
||||
nonwboundAssert
|
||||
plaAssert // Positive lookahead
|
||||
nlaAssert // Negative lookahead
|
||||
plbAssert // Positive lookbehind
|
||||
nlbAssert // Negative lookbehind
|
||||
alwaysTrueAssert // An assertion that is always true
|
||||
)
|
||||
|
||||
// nfaState is a single node of the NFA built from a parsed regular expression.
// Linear states chain through 'next'; alternation / Kleene states additionally
// branch through 'splitState'.
type nfaState struct {
	content stateContents // Contents of current state
	isEmpty bool          // If it is empty - Union operator and Kleene star states will be empty
	isLast  bool          // If it is the last state (acept state)
	output  []*nfaState   // The outputs of the current state ie. the 'outward arrows'. A union operator state will have more than one of these.
	// transitions map[int][]*nfaState // Transitions to different states (maps a character (int representation) to a _list of states. This is useful if one character can lead multiple states eg. ab|aa)
	next                       *nfaState  // The next state (not for alternation or kleene states)
	isKleene                   bool       // Identifies whether current node is a 0-state representing Kleene star
	isQuestion                 bool       // Identifies whether current node is a 0-state representing the question operator
	isAlternation              bool       // Identifies whether current node is a 0-state representing an alternation
	isLazy                     bool       // Only for split states - Identifies whether or not to flip the order of branches (try one branch before the other)
	splitState                 *nfaState  // Only for alternation states - the 'other' branch of the alternation ('next' is the first)
	assert                     assertType // Type of assertion of current node - NONE means that the node doesn't assert anything
	allChars                   bool       // Whether or not the state represents all characters (eg. a 'dot' metacharacter). A 'dot' node doesn't store any contents directly, as it would take up too much space
	except                     []rune     // Only valid if allChars is true - match all characters _except_ the ones in this block. Useful for inverting character classes.
	lookaroundRegex            string     // Only for lookaround states - Contents of the regex that the lookaround state holds
	lookaroundNFA              *nfaState  // Holds the NFA of the lookaroundRegex - if it exists
	lookaroundNumCaptureGroups int        // Number of capturing groups in lookaround regex if current node is a lookaround
	groupBegin                 bool       // Whether or not the node starts a capturing group
	groupEnd                   bool       // Whether or not the node ends a capturing group
	groupNum                   int        // Which capturing group the node starts / ends
	isBackreference            bool       // Whether or not current node is backreference
	referredGroup              int        // If current node is a backreference, the node that it points to
	// The following properties depend on the current match - I should think about resetting them for every match.
	threadGroups  []Group // Assuming that a state is part of a 'thread' in the matching process, this array stores the indices of capturing groups in the current thread. As matches are found for this state, its groups will be copied over.
	threadBackref int     // If current node is a backreference, how many characters to look forward into the referred group
}
|
||||
|
||||
// Clones the NFA starting from the given state.
|
||||
func cloneState(start *nfaState) *nfaState {
|
||||
return cloneStateHelper(start, make(map[*nfaState]*nfaState))
|
||||
}
|
||||
|
||||
// Helper function for clone. The map is used to keep track of which states have
|
||||
// already been copied, and which ones haven't.
|
||||
// This function was created using output from Llama3.1:405B.
|
||||
func cloneStateHelper(stateToClone *nfaState, cloneMap map[*nfaState]*nfaState) *nfaState {
|
||||
// Base case - if the clone exists in our map, return it.
|
||||
if clone, exists := cloneMap[stateToClone]; exists {
|
||||
return clone
|
||||
}
|
||||
if stateToClone == nil {
|
||||
return nil
|
||||
}
|
||||
// Recursive case - if the clone doesn't exist, create it, add it to the map,
|
||||
// and recursively call for each of the transition states.
|
||||
clone := &nfaState{
|
||||
content: append([]int{}, stateToClone.content...),
|
||||
isEmpty: stateToClone.isEmpty,
|
||||
isLast: stateToClone.isLast,
|
||||
output: make([]*nfaState, len(stateToClone.output)),
|
||||
isKleene: stateToClone.isKleene,
|
||||
isQuestion: stateToClone.isQuestion,
|
||||
isAlternation: stateToClone.isAlternation,
|
||||
isLazy: stateToClone.isLazy,
|
||||
assert: stateToClone.assert,
|
||||
allChars: stateToClone.allChars,
|
||||
except: append([]rune{}, stateToClone.except...),
|
||||
lookaroundRegex: stateToClone.lookaroundRegex,
|
||||
groupEnd: stateToClone.groupEnd,
|
||||
groupBegin: stateToClone.groupBegin,
|
||||
groupNum: stateToClone.groupNum,
|
||||
isBackreference: stateToClone.isBackreference,
|
||||
referredGroup: stateToClone.referredGroup,
|
||||
}
|
||||
cloneMap[stateToClone] = clone
|
||||
for i, s := range stateToClone.output {
|
||||
if s == stateToClone {
|
||||
clone.output[i] = clone
|
||||
} else {
|
||||
clone.output[i] = cloneStateHelper(s, cloneMap)
|
||||
}
|
||||
}
|
||||
if stateToClone.lookaroundNFA == stateToClone {
|
||||
clone.lookaroundNFA = clone
|
||||
}
|
||||
clone.lookaroundNFA = cloneStateHelper(stateToClone.lookaroundNFA, cloneMap)
|
||||
if stateToClone.splitState == stateToClone {
|
||||
clone.splitState = clone
|
||||
}
|
||||
clone.splitState = cloneStateHelper(stateToClone.splitState, cloneMap)
|
||||
if stateToClone.next == stateToClone {
|
||||
clone.next = clone
|
||||
}
|
||||
clone.next = cloneStateHelper(stateToClone.next, cloneMap)
|
||||
return clone
|
||||
}
|
||||
|
||||
// Reset any thread-related fields of the NFA starting from the given state.
|
||||
func resetThreads(start *nfaState) {
|
||||
visitedMap := make(map[*nfaState]bool) // The value type doesn't matter here
|
||||
resetThreadsHelper(start, visitedMap)
|
||||
}
|
||||
|
||||
func resetThreadsHelper(state *nfaState, visitedMap map[*nfaState]bool) {
|
||||
if state == nil {
|
||||
return
|
||||
}
|
||||
if _, ok := visitedMap[state]; ok {
|
||||
return
|
||||
}
|
||||
// Assuming it hasn't been visited
|
||||
state.threadGroups = nil
|
||||
state.threadBackref = 0
|
||||
visitedMap[state] = true
|
||||
if state.isAlternation {
|
||||
resetThreadsHelper(state.next, visitedMap)
|
||||
resetThreadsHelper(state.splitState, visitedMap)
|
||||
} else {
|
||||
resetThreadsHelper(state.next, visitedMap)
|
||||
}
|
||||
}
|
||||
|
||||
// Checks if the given state's assertion is true. Returns true if the given
|
||||
// state doesn't have an assertion.
|
||||
func (s nfaState) checkAssertion(str []rune, idx int, preferLongest bool) bool {
|
||||
if s.assert == alwaysTrueAssert {
|
||||
return true
|
||||
}
|
||||
if s.assert == sosAssert {
|
||||
// Single-line mode: Beginning of string
|
||||
// Multi-line mode: Previous character was newline
|
||||
return idx == 0 || (multilineMode && (idx > 0 && str[idx-1] == '\n'))
|
||||
}
|
||||
if s.assert == eosAssert {
|
||||
// Single-line mode: End of string
|
||||
// Multi-line mode: current character is newline
|
||||
// Index is at the end of the string, or it points to the last character which is a newline
|
||||
return idx == len(str) || (multilineMode && str[idx] == '\n')
|
||||
}
|
||||
if s.assert == soiAssert {
|
||||
// Only true at the start of the input, regardless of mode
|
||||
return idx == 0
|
||||
}
|
||||
if s.assert == eoiAssert {
|
||||
// Only true at the end of the input, regardless of mode
|
||||
return idx == len(str)
|
||||
}
|
||||
|
||||
if s.assert == wboundAssert {
|
||||
return isWordBoundary(str, idx)
|
||||
}
|
||||
if s.assert == nonwboundAssert {
|
||||
return !isWordBoundary(str, idx)
|
||||
}
|
||||
if s.isLookaround() {
|
||||
// The process here is simple:
|
||||
// 1. Compile the regex stored in the state's contents.
|
||||
// 2. Run it on a subset of the test string, that ends after the current index in the string
|
||||
// 3. Based on the kind of lookaround (and the indices we get), determine what action to take.
|
||||
startState := s.lookaroundNFA
|
||||
var runesToMatch []rune
|
||||
var strToMatch string
|
||||
if s.assert == plaAssert || s.assert == nlaAssert {
|
||||
runesToMatch = str[idx:]
|
||||
} else {
|
||||
runesToMatch = str[:idx]
|
||||
}
|
||||
|
||||
if len(runesToMatch) == 0 {
|
||||
strToMatch = ""
|
||||
} else {
|
||||
strToMatch = string(runesToMatch)
|
||||
}
|
||||
|
||||
regComp := Reg{startState, s.lookaroundNumCaptureGroups, s.lookaroundRegex, preferLongest}
|
||||
matchIndices := regComp.FindAll(strToMatch)
|
||||
|
||||
numMatchesFound := 0
|
||||
for _, matchIdx := range matchIndices {
|
||||
if s.assert == plaAssert || s.assert == nlaAssert { // Lookahead - return true (or false) if at least one match starts at 0. Zero is used because the test-string _starts_ from idx.
|
||||
if matchIdx.StartIdx == 0 {
|
||||
numMatchesFound++
|
||||
}
|
||||
}
|
||||
if s.assert == plbAssert || s.assert == nlbAssert { // Lookbehind - return true (or false) if at least one match _ends_ at the current index.
|
||||
if matchIdx.EndIdx == idx {
|
||||
numMatchesFound++
|
||||
}
|
||||
}
|
||||
}
|
||||
if s.assert == plaAssert || s.assert == plbAssert { // Positive assertions want at least one match
|
||||
return numMatchesFound > 0
|
||||
}
|
||||
if s.assert == nlaAssert || s.assert == nlbAssert { // Negative assertions only want zero matches
|
||||
return numMatchesFound == 0
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// Returns true if the contents of 's' contain the value at the given index of the given string
|
||||
func (s nfaState) contentContains(str []rune, idx int, preferLongest bool) bool {
|
||||
if s.assert != noneAssert {
|
||||
return s.checkAssertion(str, idx, preferLongest)
|
||||
}
|
||||
if idx >= len(str) {
|
||||
return false
|
||||
}
|
||||
if s.allChars {
|
||||
return !slices.Contains(slices.Concat(notDotChars, s.except), str[idx]) // Return true only if the index isn't a 'notDotChar', or isn't one of the exception characters for the current node.
|
||||
}
|
||||
// Default - s.assert must be NONE
|
||||
return slices.Contains(s.content, int(str[idx]))
|
||||
}
|
||||
|
||||
func (s nfaState) isLookaround() bool {
|
||||
return s.assert == plaAssert || s.assert == plbAssert || s.assert == nlaAssert || s.assert == nlbAssert
|
||||
}
|
||||
|
||||
func (s nfaState) numTransitions() int {
|
||||
if s.next == nil && s.splitState == nil {
|
||||
return 0
|
||||
}
|
||||
if s.next == nil || s.splitState == nil {
|
||||
return 1
|
||||
}
|
||||
return 2
|
||||
}
|
||||
|
||||
// Returns the matches for the character at the given index of the given string.
|
||||
// Also returns the number of matches. Returns -1 if an assertion failed.
|
||||
//func (s nfaState) matchesFor(str []rune, idx int) ([]*nfaState, int) {
|
||||
// // Assertions can be viewed as 'checks'. If the check fails, we return
|
||||
// // an empty array and 0.
|
||||
// // If it passes, we treat it like any other state, and return all the transitions.
|
||||
// if s.assert != noneAssert {
|
||||
// if s.checkAssertion(str, idx) == false {
|
||||
// return make([]*nfaState, 0), -1
|
||||
// }
|
||||
// }
|
||||
// listTransitions := s.transitions[int(str[idx])]
|
||||
// for _, dest := range s.transitions[int(anyCharRune)] {
|
||||
// if !slices.Contains(slices.Concat(notDotChars, dest.except), str[idx]) {
|
||||
// // Add an allChar state to the list of matches if:
|
||||
// // a. The current character isn't a 'notDotChars' character. In single line mode, this includes newline. In multiline mode, it doesn't.
|
||||
// // b. The current character isn't the state's exception list.
|
||||
// listTransitions = append(listTransitions, dest)
|
||||
// }
|
||||
// }
|
||||
// numTransitions := len(listTransitions)
|
||||
// return listTransitions, numTransitions
|
||||
//}
|
||||
|
||||
// verifyLastStatesHelper performs the depth-first recursion needed for verifyLastStates
|
||||
//func verifyLastStatesHelper(st *nfaState, visited map[*nfaState]bool) {
|
||||
// if st.numTransitions() == 0 {
|
||||
// st.isLast = true
|
||||
// return
|
||||
// }
|
||||
// // if len(state.transitions) == 1 && len(state.transitions[state.content]) == 1 && state.transitions[state.content][0] == state { // Eg. a*
|
||||
// if st.numTransitions() == 1 { // Eg. a*
|
||||
// var moreThanOneTrans bool // Dummy variable, check if all the transitions for the current's state's contents have a length of one
|
||||
// for _, c := range st.content {
|
||||
// if len(st.transitions[c]) != 1 || st.transitions[c][0] != st {
|
||||
// moreThanOneTrans = true
|
||||
// }
|
||||
// }
|
||||
// st.isLast = !moreThanOneTrans
|
||||
// }
|
||||
//
|
||||
// if st.isKleene { // A State representing a Kleene Star has transitions going out, which loop back to it. If all those transitions point to the same (single) state, then it must be a last state
|
||||
// transitionDests := make([]*nfaState, 0)
|
||||
// for _, v := range st.transitions {
|
||||
// transitionDests = append(transitionDests, v...)
|
||||
// }
|
||||
// if allEqual(transitionDests...) {
|
||||
// st.isLast = true
|
||||
// return
|
||||
// }
|
||||
// }
|
||||
// if visited[st] == true {
|
||||
// return
|
||||
// }
|
||||
// visited[st] = true
|
||||
// for _, states := range st.transitions {
|
||||
// for i := range states {
|
||||
// if states[i] != st {
|
||||
// verifyLastStatesHelper(states[i], visited)
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
//}
|
||||
|
||||
// verifyLastStates enables the 'isLast' flag for the leaf nodes (last states)
|
||||
//func verifyLastStates(start []*nfaState) {
|
||||
// verifyLastStatesHelper(start[0], make(map[*nfaState]bool))
|
||||
//}
|
||||
|
||||
// Concatenates s1 and s2, returns the start of the concatenation.
|
||||
func concatenate(s1 *nfaState, s2 *nfaState) *nfaState {
|
||||
if s1 == nil {
|
||||
return s2
|
||||
}
|
||||
for i := range s1.output {
|
||||
s1.output[i].next = s2
|
||||
}
|
||||
s1.output = s2.output
|
||||
return s1
|
||||
}
|
||||
|
||||
func kleene(s1 *nfaState) (*nfaState, error) {
|
||||
if s1.isEmpty && s1.assert != noneAssert {
|
||||
return nil, fmt.Errorf("previous token is not quantifiable")
|
||||
}
|
||||
|
||||
toReturn := &nfaState{}
|
||||
toReturn.isEmpty = true
|
||||
toReturn.isAlternation = true
|
||||
toReturn.content = newContents(epsilon)
|
||||
toReturn.splitState = s1
|
||||
|
||||
// toReturn := &nfaState{}
|
||||
// toReturn.transitions = make(map[int][]*nfaState)
|
||||
// toReturn.content = newContents(epsilon)
|
||||
toReturn.isKleene = true
|
||||
toReturn.output = append([]*nfaState{}, toReturn)
|
||||
for i := range s1.output {
|
||||
s1.output[i].next = toReturn
|
||||
}
|
||||
// for _, c := range s1.content {
|
||||
// toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], &s1)
|
||||
// }
|
||||
//toReturn.kleeneState = &s1
|
||||
return toReturn, nil
|
||||
}
|
||||
|
||||
func alternate(s1 *nfaState, s2 *nfaState) *nfaState {
|
||||
toReturn := &nfaState{}
|
||||
// toReturn.transitions = make(map[int][]*nfaState)
|
||||
toReturn.output = append(toReturn.output, s1.output...)
|
||||
toReturn.output = append(toReturn.output, s2.output...)
|
||||
// // Unique append is used here (and elsewhere) to ensure that,
|
||||
// // for any given transition, a state can only be mentioned once.
|
||||
// // For example, given the transition 'a', the state 's1' can only be mentioned once.
|
||||
// // This would lead to multiple instances of the same set of match indices, since both
|
||||
// // 's1' states would be considered to match.
|
||||
// for _, c := range s1.content {
|
||||
// toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], s1)
|
||||
// }
|
||||
// for _, c := range s2.content {
|
||||
// toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], s2)
|
||||
// }
|
||||
toReturn.content = newContents(epsilon)
|
||||
toReturn.isEmpty = true
|
||||
toReturn.isAlternation = true
|
||||
toReturn.next = s1
|
||||
toReturn.splitState = s2
|
||||
|
||||
return toReturn
|
||||
}
|
||||
|
||||
func question(s1 *nfaState) (*nfaState, error) { // Use the fact that ab? == a(b|)
|
||||
if s1.isEmpty && s1.assert != noneAssert {
|
||||
return nil, fmt.Errorf("previous token is not quantifiable")
|
||||
}
|
||||
toReturn := &nfaState{}
|
||||
toReturn.isEmpty = true
|
||||
toReturn.isAlternation = true
|
||||
toReturn.isQuestion = true
|
||||
toReturn.content = newContents(epsilon)
|
||||
toReturn.splitState = s1
|
||||
toReturn.output = append([]*nfaState{}, toReturn)
|
||||
toReturn.output = append(toReturn.output, s1.output...)
|
||||
// s2.transitions = make(map[int][]*nfaState)
|
||||
return toReturn, nil
|
||||
}
|
||||
|
||||
// Creates and returns a new state with the 'default' values.
|
||||
func newState() nfaState {
|
||||
ret := nfaState{
|
||||
output: make([]*nfaState, 0),
|
||||
// transitions: make(map[int][]*nfaState),
|
||||
assert: noneAssert,
|
||||
except: append([]rune{}, 0),
|
||||
lookaroundRegex: "",
|
||||
groupEnd: false,
|
||||
groupBegin: false,
|
||||
}
|
||||
ret.output = append(ret.output, &ret)
|
||||
return ret
|
||||
}
|
||||
|
||||
// Creates and returns a state that _always_ has a zero-length match.
|
||||
func zeroLengthMatchState() *nfaState {
|
||||
start := &nfaState{}
|
||||
start.content = newContents(epsilon)
|
||||
start.isEmpty = true
|
||||
start.assert = alwaysTrueAssert
|
||||
start.output = append([]*nfaState{}, start)
|
||||
return start
|
||||
}
|
||||
|
||||
// equals reports whether two states match field-for-field.
// NOTE(review): lookaroundRegex, lookaroundNumCaptureGroups, isBackreference
// and referredGroup are not part of the comparison - verify this is intentional.
func (s nfaState) equals(other nfaState) bool {
	return s.isEmpty == other.isEmpty &&
		s.isLast == other.isLast &&
		slices.Equal(s.output, other.output) &&
		slices.Equal(s.content, other.content) &&
		s.next == other.next &&
		s.isKleene == other.isKleene &&
		s.isQuestion == other.isQuestion &&
		s.isLazy == other.isLazy &&
		s.isAlternation == other.isAlternation &&
		s.splitState == other.splitState &&
		s.assert == other.assert &&
		s.allChars == other.allChars &&
		slices.Equal(s.except, other.except) &&
		s.lookaroundNFA == other.lookaroundNFA && // Pointer comparison - the lookaround NFAs must be the *same* object
		s.groupBegin == other.groupBegin &&
		s.groupEnd == other.groupEnd &&
		s.groupNum == other.groupNum &&
		slices.Equal(s.threadGroups, other.threadGroups) &&
		s.threadBackref == other.threadBackref
}
|
||||
|
||||
func stateExists(list []nfaState, s nfaState) bool {
|
||||
for i := range list {
|
||||
if list[i].equals(s) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
261
regex/postfixNode.go
Normal file
261
regex/postfixNode.go
Normal file
@@ -0,0 +1,261 @@
|
||||
package regex
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
)
|
||||
|
||||
type nodeType int
|
||||
|
||||
// This is a slice containing all escapable characters that have special meaning.
|
||||
// Eg. \b is word boundary, \w is word character etc.
|
||||
var escapedChars []rune = []rune("wWdDbBnaftrvsS0")
|
||||
|
||||
// This is a list of the possible node types
|
||||
const (
|
||||
characterNode nodeType = iota
|
||||
charclassNode
|
||||
pipeNode
|
||||
concatenateNode
|
||||
kleeneNode
|
||||
questionNode
|
||||
plusNode
|
||||
assertionNode
|
||||
lparenNode
|
||||
rparenNode
|
||||
backreferenceNode
|
||||
)
|
||||
|
||||
// Helper constants for lookarounds
|
||||
const positive = 1
|
||||
const negative = -1
|
||||
const lookahead = 1
|
||||
const lookbehind = -1
|
||||
|
||||
var infinite_reps int = -1 // Represents infinite reps eg. the end range in {5,}
|
||||
// This represents a node in the postfix representation of the expression,
// as produced by the shunting-yard conversion. Each node is either a piece of
// matchable content (character / character class) or an operator.
type postfixNode struct {
	nodetype        nodeType
	contents        []rune        // Contents of the node
	startReps       int           // Minimum number of times the node should be repeated - used with numeric specifiers
	endReps         int           // Maximum number of times the node should be repeated - used with numeric specifiers
	allChars        bool          // Whether or not the current node represents all characters (eg. dot metacharacter)
	except          []postfixNode // For inverted character classes, we match every unicode character _except_ a few. In this case, allChars is true and the exceptions are placed here.
	lookaroundSign  int           // ONLY USED WHEN nodetype == ASSERTION. Whether we have a positive or negative lookaround.
	lookaroundDir   int           // Lookbehind or lookahead
	nodeContents    []postfixNode // ONLY USED WHEN nodetype == CHARCLASS. Holds all the nodes inside the given CHARCLASS node.
	referencedGroup int           // ONLY USED WHEN nodetype == backreferenceNode. Holds the group which this one refers to. After parsing is done, the expression will be rewritten eg. (a)\1 will become (a)(a). So the return value of ShuntingYard() shouldn't contain a backreferenceNode.
	isLazy          bool          // ONLY USED WHEN nodetype == kleene or question
}
|
||||
|
||||
// Converts the given list of postfixNodes to one node of type CHARCLASS.
|
||||
// Used to convert eg. 'a', 'b' and 'c' to '[abc]'.
|
||||
// If the character class is negated, it returns a postfixNode of type CHARACTER.
|
||||
// This node will behave like the dot metacharacter, but it has a longer list of runes that
|
||||
// it will not match.
|
||||
func newCharClassNode(nodes []postfixNode, negated bool) postfixNode {
|
||||
rtv := postfixNode{}
|
||||
rtv.nodetype = charclassNode
|
||||
rtv.startReps = 1
|
||||
rtv.endReps = 1
|
||||
if negated {
|
||||
rtv.nodetype = characterNode
|
||||
rtv.contents = []rune{anyCharRune}
|
||||
rtv.allChars = true
|
||||
rtv.except = nodes
|
||||
} else {
|
||||
rtv.nodeContents = nodes
|
||||
}
|
||||
return rtv
|
||||
}
|
||||
|
||||
// Creates a new escaped node - the given character is assumed to have been preceded by a backslash.
// Returns an error for escapes that are invalid in the current position
// (eg. a word boundary inside a character class, or escaping a normal character).
func newEscapedNode(c rune, inCharClass bool) (postfixNode, error) {
	toReturn := postfixNode{}
	toReturn.startReps = 1
	toReturn.endReps = 1
	switch c {
	case 's': // Whitespace
		toReturn.nodetype = characterNode
		toReturn.contents = append(toReturn.contents, whitespaceChars...)
	case 'S': // Non-whitespace - a dot node excepting whitespace
		toReturn = newPostfixDotNode()
		toReturn.except = append([]postfixNode{}, newPostfixNode(whitespaceChars...))
	case 'd': // Digits
		toReturn.nodetype = characterNode
		toReturn.contents = append(toReturn.contents, digitChars...)
	case 'D': // Non-digits - a dot node excepting digits
		toReturn = newPostfixDotNode()
		toReturn.except = append([]postfixNode{}, newPostfixNode(digitChars...))
	case 'w': // word character
		toReturn.nodetype = characterNode
		toReturn.contents = append(toReturn.contents, wordChars...)
	case 'W': // Non-word character - a dot node excepting word characters
		toReturn = newPostfixDotNode()
		toReturn.except = append([]postfixNode{}, newPostfixNode(wordChars...))
	case 'b', 'B':
		// Inside a character class, \b is the backspace character (0x08);
		// elsewhere \b and \B are word-boundary assertions.
		if c == 'b' && inCharClass {
			toReturn.nodetype = characterNode
			toReturn.contents = append(toReturn.contents, rune(8))
		} else {
			toReturn.nodetype = assertionNode
			toReturn.contents = append(toReturn.contents, c)
		}
		if c == 'B' && inCharClass { // Invalid
			return postfixNode{}, fmt.Errorf("word boundaries are not allowed in character class")
		}
	case 'A', 'z': // A is start of input, z is end of input (regardless of RE_MULTILINE)
		// NOTE(review): escapedChars ("wWdDbBnaftrvsS0") does not include 'A'
		// or 'z'; if callers gate on that list, this case may be unreachable - verify.
		if inCharClass {
			return postfixNode{}, fmt.Errorf("input boundaries are not allowed in character class")
		} else {
			toReturn.nodetype = assertionNode
			toReturn.contents = append(toReturn.contents, c)
		}
	case 'n': // Newline character
		toReturn.nodetype = characterNode
		toReturn.contents = append(toReturn.contents, '\n')
	case '0': // NULL character
		toReturn.nodetype = characterNode
		toReturn.contents = append(toReturn.contents, rune(0))
	case 'a': // Bell character
		toReturn.nodetype = characterNode
		toReturn.contents = append(toReturn.contents, rune(7))
	case 'f': // Form feed character
		toReturn.nodetype = characterNode
		toReturn.contents = append(toReturn.contents, rune(12))
	case 't': // Horizontal tab character
		toReturn.nodetype = characterNode
		toReturn.contents = append(toReturn.contents, rune(9))
	case 'r': // Carriage return
		toReturn.nodetype = characterNode
		toReturn.contents = append(toReturn.contents, rune(13))
	case 'v': // Vertical tab
		toReturn.nodetype = characterNode
		toReturn.contents = append(toReturn.contents, rune(11))
	case '-': // Literal hyphen
		toReturn.nodetype = characterNode
		toReturn.contents = append(toReturn.contents, '-')
	default: // None of the above - append it as a regular character
		if isNormalChar(c) { // Normal characters cannot be escaped
			return postfixNode{}, fmt.Errorf("invalid escape character")
		}
		toReturn.nodetype = characterNode
		toReturn.contents = append(toReturn.contents, c)
	}
	return toReturn, nil
}
|
||||
|
||||
// Creates and returns a postfixNode based on the given contents
|
||||
func newPostfixNode(contents ...rune) postfixNode {
|
||||
if len(contents) < 1 {
|
||||
panic("Empty node.")
|
||||
}
|
||||
to_return := postfixNode{}
|
||||
to_return.startReps = 1
|
||||
to_return.endReps = 1
|
||||
if len(contents) > 1 { // If the node has more than element, it must be a character class - the type must be CHARACTER
|
||||
to_return.nodetype = characterNode
|
||||
to_return.contents = contents
|
||||
} else { // Node has one element, could be anything
|
||||
switch contents[0] {
|
||||
case '+':
|
||||
to_return.nodetype = plusNode
|
||||
case lazyPlusRune:
|
||||
to_return.nodetype = plusNode
|
||||
to_return.isLazy = true
|
||||
case '?':
|
||||
to_return.nodetype = questionNode
|
||||
case lazyQuestionRune:
|
||||
to_return.nodetype = questionNode
|
||||
to_return.isLazy = true
|
||||
case '*':
|
||||
to_return.nodetype = kleeneNode
|
||||
case lazyKleeneRune:
|
||||
to_return.nodetype = kleeneNode
|
||||
to_return.isLazy = true
|
||||
case '|':
|
||||
to_return.nodetype = pipeNode
|
||||
case concatRune:
|
||||
to_return.nodetype = concatenateNode
|
||||
case '^', '$':
|
||||
to_return.nodetype = assertionNode
|
||||
case '(':
|
||||
to_return.nodetype = lparenNode
|
||||
case ')':
|
||||
to_return.nodetype = rparenNode
|
||||
default:
|
||||
to_return.nodetype = characterNode
|
||||
}
|
||||
to_return.contents = append(to_return.contents, contents...)
|
||||
|
||||
// Special cases for LPAREN and RPAREN - they have special characters defined for them
|
||||
if to_return.nodetype == lparenNode {
|
||||
to_return.contents = []rune{lparenRune}
|
||||
}
|
||||
if to_return.nodetype == rparenNode {
|
||||
to_return.contents = []rune{rparenRune}
|
||||
}
|
||||
}
|
||||
return to_return
|
||||
}
|
||||
|
||||
// Creates and returns a postfixNode representing the 'dot' metacharacter.
|
||||
func newPostfixDotNode() postfixNode {
|
||||
toReturn := postfixNode{}
|
||||
toReturn.startReps = 1
|
||||
toReturn.endReps = 1
|
||||
toReturn.nodetype = characterNode
|
||||
toReturn.allChars = true
|
||||
toReturn.contents = []rune{anyCharRune}
|
||||
return toReturn
|
||||
}
|
||||
|
||||
// Creates a character node, regardless of the contents
|
||||
func newPostfixCharNode(contents ...rune) postfixNode {
|
||||
toReturn := postfixNode{}
|
||||
toReturn.startReps = 1
|
||||
toReturn.endReps = 1
|
||||
toReturn.nodetype = characterNode
|
||||
toReturn.contents = append(toReturn.contents, contents...)
|
||||
return toReturn
|
||||
}
|
||||
|
||||
// newPostfixBackreferenceNode creates and returns a backreference node, referring to the given group
|
||||
func newPostfixBackreferenceNode(referred int) postfixNode {
|
||||
toReturn := postfixNode{}
|
||||
toReturn.startReps = 1
|
||||
toReturn.endReps = 1
|
||||
toReturn.nodetype = backreferenceNode
|
||||
toReturn.referencedGroup = referred
|
||||
return toReturn
|
||||
}
|
||||
|
||||
// rewriteBackreferences rewrites any backreferences in the given postfixNode slice, into their respective groups.
|
||||
// It stores the relation in a map, and returns it as the second return value.
|
||||
// It uses parenIndices to determine where a group starts and ends in nodes.
|
||||
// For example, \1(a) will be rewritten into (a)(a), and 1 -> 2 will be the hashmap value.
|
||||
// It returns an error if a backreference points to an invalid group.
|
||||
// func rewriteBackreferences(nodes []postfixNode, parenIndices []Group) ([]postfixNode, map[int]int, error) {
|
||||
// rtv := make([]postfixNode, 0)
|
||||
// referMap := make(map[int]int)
|
||||
// numGroups := 0
|
||||
// groupIncrement := 0 // If we have a backreference before the group it's referring to, then the group it's referring to will have its group number incremented.
|
||||
// for i, node := range nodes {
|
||||
// if node.nodetype == backreferenceNode {
|
||||
// if node.referencedGroup >= len(parenIndices) {
|
||||
// return nil, nil, fmt.Errorf("invalid backreference")
|
||||
// }
|
||||
// rtv = slices.Concat(rtv, nodes[parenIndices[node.referencedGroup].StartIdx:parenIndices[node.referencedGroup].EndIdx+1]) // Add all the nodes in the group to rtv
|
||||
// numGroups += 1
|
||||
// if i < parenIndices[node.referencedGroup].StartIdx {
|
||||
// groupIncrement += 1
|
||||
// }
|
||||
// referMap[numGroups] = node.referencedGroup + groupIncrement
|
||||
// } else {
|
||||
// rtv = append(rtv, node)
|
||||
// if node.nodetype == lparenNode {
|
||||
// numGroups += 1
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// return rtv, referMap, nil
|
||||
// }
|
@@ -1,9 +1,11 @@
|
||||
package greg
|
||||
package regex
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"math"
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
type numRange struct {
|
||||
@@ -99,28 +101,39 @@ func range2regex(start int, end int) (string, error) {
|
||||
// Last range - tmp to rangeEnd
|
||||
ranges = append(ranges, numRange{tmp, rangeEnd})
|
||||
|
||||
regex := string(NONCAPLPAREN_CHAR)
|
||||
regexSlice := make([]string, 0)
|
||||
// Generate the regex
|
||||
for i, rg := range ranges {
|
||||
if i > 0 {
|
||||
regex += "|"
|
||||
}
|
||||
regex += string(NONCAPLPAREN_CHAR)
|
||||
for _, rg := range ranges {
|
||||
tmpStr := ""
|
||||
tmpStr += string(nonCapLparenRune)
|
||||
startSlc := intToSlc(rg.start)
|
||||
endSlc := intToSlc(rg.end)
|
||||
if len(startSlc) != len(endSlc) {
|
||||
return "", fmt.Errorf("Error parsing numeric range")
|
||||
return "", fmt.Errorf("error parsing numeric range")
|
||||
}
|
||||
for i := range startSlc {
|
||||
if startSlc[i] == endSlc[i] {
|
||||
regex += string(rune(startSlc[i] + 48)) // '0' is ascii value 48, 1 is 49 etc. To convert the digit to its character form, we can just add 48.
|
||||
tmpStr += string(rune(startSlc[i] + 48)) // '0' is ascii value 48, 1 is 49 etc. To convert the digit to its character form, we can just add 48.
|
||||
} else {
|
||||
regex += fmt.Sprintf("%c%c-%c%c", LBRACKET, rune(startSlc[i]+48), rune(endSlc[i]+48), RBRACKET)
|
||||
tmpStr += fmt.Sprintf("%c%c-%c%c", lbracketRune, rune(startSlc[i]+48), rune(endSlc[i]+48), rbracketRune)
|
||||
}
|
||||
}
|
||||
regex += ")"
|
||||
tmpStr += ")"
|
||||
regexSlice = append(regexSlice, tmpStr)
|
||||
}
|
||||
regex += ")"
|
||||
// Each element of the slice represents one 'group'. Taking 0-255 as an example, the elements would be:
|
||||
// 1. 0-9
|
||||
// 2. 10-99
|
||||
// 3. 100-199
|
||||
// 4. 200-249
|
||||
// 5. 250-255
|
||||
//
|
||||
// The reason this is reversed before joining it, is because it is incompatible with the PCRE rule for matching.
|
||||
// The PCRE rule specifies that the left-branch of an alternation is preferred. Even though this engine uses the POSIX
|
||||
// rule at the moment (which prefers the longest match regardless of the order of the alternation), reversing the string
|
||||
// has no downsides. It doesn't affect POSIX matching, and it will reduce my burden if I decide to switch to PCRE matching.
|
||||
slices.Reverse(regexSlice)
|
||||
regex := string(nonCapLparenRune) + strings.Join(regexSlice, "|") + ")"
|
||||
return regex, nil
|
||||
|
||||
}
|
@@ -1,4 +1,4 @@
|
||||
package greg
|
||||
package regex
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
@@ -25,7 +25,9 @@ var reTests = []struct {
|
||||
{"a*b", nil, "qwqw", []Group{}},
|
||||
{"(abc)*", nil, "abcabcabc", []Group{{0, 9}, {9, 9}}},
|
||||
{"((abc)|(def))*", nil, "abcdef", []Group{{0, 6}, {6, 6}}},
|
||||
{"(abc)*|(def)*", nil, "abcdef", []Group{{0, 3}, {3, 6}, {6, 6}}},
|
||||
// This match will only happen with Longest()
|
||||
// {"(abc)*|(def)*", nil, "abcdef", []Group{{0, 3}, {3, 6}, {6, 6}}},
|
||||
{"(abc)*|(def)*", nil, "abcdef", []Group{{0, 3}, {3, 3}, {4, 4}, {5, 5}, {6, 6}}},
|
||||
{"b*a*a", nil, "bba", []Group{{0, 3}}},
|
||||
{"(ab)+", nil, "abcabddd", []Group{{0, 2}, {3, 5}}},
|
||||
{"a(b(c|d)*)*", nil, "abccbd", []Group{{0, 6}}},
|
||||
@@ -105,6 +107,9 @@ var reTests = []struct {
|
||||
{"(a|b){3,4}", nil, "ababaa", []Group{{0, 4}}},
|
||||
{"(bc){5,}", nil, "bcbcbcbcbcbcbcbc", []Group{{0, 16}}},
|
||||
{`\d{3,4}`, nil, "1209", []Group{{0, 4}}},
|
||||
{`\d{3,4}`, nil, "120", []Group{{0, 3}}},
|
||||
{`\d{3,4}`, nil, "12709", []Group{{0, 4}}},
|
||||
{`\d{3,4}`, nil, "12", []Group{}},
|
||||
{`\d{3,4}`, nil, "109", []Group{{0, 3}}},
|
||||
{`\d{3,4}`, nil, "5", []Group{}},
|
||||
{`\d{3,4}`, nil, "123135", []Group{{0, 4}}},
|
||||
@@ -112,6 +117,7 @@ var reTests = []struct {
|
||||
{`\d{3,4}`, nil, "ababab555", []Group{{6, 9}}},
|
||||
{`\bpaint\b`, nil, "paints", []Group{}},
|
||||
{`\b\w{5}\b`, nil, "paint", []Group{{0, 5}}},
|
||||
{`\w{}`, nil, "test", nil},
|
||||
{`[^\w]`, nil, "abcdef1230[]qq';;'", []Group{{10, 11}, {11, 12}, {14, 15}, {15, 16}, {16, 17}, {17, 18}}},
|
||||
{`[^\W]`, nil, "abcdef1230[]qq';;'", []Group{{0, 1}, {1, 2}, {2, 3}, {3, 4}, {4, 5}, {5, 6}, {6, 7}, {7, 8}, {8, 9}, {9, 10}, {12, 13}, {13, 14}}},
|
||||
{`[\[\]]`, nil, "a[b[l]]", []Group{{1, 2}, {3, 4}, {5, 6}, {6, 7}}},
|
||||
@@ -174,7 +180,7 @@ var reTests = []struct {
|
||||
{"[[:graph:]]+", nil, "abcdefghijklmnopqrstuvwyxzABCDEFGHIJKLMNOPRQSTUVWXYZ0123456789!@#$%^&*", []Group{{0, 70}}},
|
||||
|
||||
// Test cases from Python's RE test suite
|
||||
{`[\1]`, nil, "\x01", []Group{{0, 1}}},
|
||||
{`[\01]`, nil, "\x01", []Group{{0, 1}}},
|
||||
|
||||
{`\0`, nil, "\x00", []Group{{0, 1}}},
|
||||
{`[\0a]`, nil, "\x00", []Group{{0, 1}}},
|
||||
@@ -189,7 +195,7 @@ var reTests = []struct {
|
||||
{`\x00ffffffffffffff`, nil, "\xff", []Group{}},
|
||||
{`\x00f`, nil, "\x0f", []Group{}},
|
||||
{`\x00fe`, nil, "\xfe", []Group{}},
|
||||
{`^\w+=(\\[\000-\277]|[^\n\\])*`, nil, "SRC=eval.c g.c blah blah blah \\\\\n\tapes.c", []Group{{0, 32}}},
|
||||
{`^\w+=(\\[\000-\0277]|[^\n\\])*`, nil, "SRC=eval.c g.c blah blah blah \\\\\n\tapes.c", []Group{{0, 32}}},
|
||||
|
||||
{`a.b`, nil, `acb`, []Group{{0, 3}}},
|
||||
{`a.b`, nil, "a\nb", []Group{}},
|
||||
@@ -307,11 +313,7 @@ var reTests = []struct {
|
||||
{`a[-]?c`, nil, `ac`, []Group{{0, 2}}},
|
||||
{`^(.+)?B`, nil, `AB`, []Group{{0, 2}}},
|
||||
{`\0009`, nil, "\x009", []Group{{0, 2}}},
|
||||
{`\141`, nil, "a", []Group{{0, 1}}},
|
||||
|
||||
// At this point, the python test suite has a bunch
|
||||
// of backreference tests. Since my engine doesn't
|
||||
// implement backreferences, I've skipped those tests.
|
||||
{`\0141`, nil, "a", []Group{{0, 1}}},
|
||||
|
||||
{`*a`, nil, ``, nil},
|
||||
{`(*)b`, nil, ``, nil},
|
||||
@@ -428,7 +430,8 @@ var reTests = []struct {
|
||||
{`a[-]?c`, []ReFlag{RE_CASE_INSENSITIVE}, `AC`, []Group{{0, 2}}},
|
||||
{`^(.+)?B`, []ReFlag{RE_CASE_INSENSITIVE}, `ab`, []Group{{0, 2}}},
|
||||
{`\0009`, []ReFlag{RE_CASE_INSENSITIVE}, "\x009", []Group{{0, 2}}},
|
||||
{`\141`, []ReFlag{RE_CASE_INSENSITIVE}, "A", []Group{{0, 1}}},
|
||||
{`\0141`, []ReFlag{RE_CASE_INSENSITIVE}, "A", []Group{{0, 1}}},
|
||||
{`\0141\0141`, []ReFlag{RE_CASE_INSENSITIVE}, "AA", []Group{{0, 2}}},
|
||||
|
||||
{`a[-]?c`, []ReFlag{RE_CASE_INSENSITIVE}, `AC`, []Group{{0, 2}}},
|
||||
|
||||
@@ -443,8 +446,11 @@ var reTests = []struct {
|
||||
{`abc$`, []ReFlag{RE_MULTILINE}, "jkl\n123abc\nxyz", []Group{{7, 10}}},
|
||||
{`^`, nil, "jkl\n123abc\nxyz", []Group{{0, 0}}},
|
||||
{`^`, []ReFlag{RE_MULTILINE}, "jkl\n123abc\nxyz", []Group{{0, 0}, {4, 4}, {11, 11}}},
|
||||
{`\A`, []ReFlag{RE_MULTILINE}, "jkl\n123abc\nxyz", []Group{{0, 0}}},
|
||||
{`$`, nil, "jkl\n123abc\nxyz", []Group{{14, 14}}},
|
||||
{`$`, []ReFlag{RE_MULTILINE}, "jkl\n123abc\nxyz", []Group{{3, 3}, {10, 10}, {14, 14}}},
|
||||
{`\z`, []ReFlag{RE_MULTILINE}, "jkl\n123abc\nxyz", []Group{{14, 14}}},
|
||||
{`^abc\z`, []ReFlag{RE_MULTILINE}, "abc\nabc\nabc", []Group{{8, 11}}},
|
||||
|
||||
{`a.b`, nil, "a\nb", []Group{}},
|
||||
{`a.b`, []ReFlag{RE_SINGLE_LINE}, "a\nb", []Group{{0, 3}}},
|
||||
@@ -456,8 +462,10 @@ var reTests = []struct {
|
||||
{`[\D5]+`, nil, `1234abc5678`, []Group{{4, 8}}},
|
||||
{`[\da-fA-F]+`, nil, `123abc`, []Group{{0, 6}}},
|
||||
{`\xff`, nil, "\u00ff", []Group{{0, 1}}},
|
||||
{`\xff+`, nil, "\u00ff\u00ff", []Group{{0, 2}}},
|
||||
{`\xFF`, nil, "\u00ff", []Group{{0, 1}}},
|
||||
{`\x00ff`, nil, "\u00ff", []Group{}},
|
||||
{`\x{0000ff}+`, nil, "\u00ff\u00ff", []Group{{0, 2}}},
|
||||
{`\x{0000ff}`, nil, "\u00ff", []Group{{0, 1}}},
|
||||
{`\x{0000FF}`, nil, "\u00ff", []Group{{0, 1}}},
|
||||
{"\t\n\v\r\f\a", nil, "\t\n\v\r\f\a", []Group{{0, 6}}},
|
||||
@@ -465,7 +473,7 @@ var reTests = []struct {
|
||||
{`[\t][\n][\v][\r][\f][\b]`, nil, "\t\n\v\r\f\b", []Group{{0, 6}}},
|
||||
{`.*d`, nil, "abc\nabd", []Group{{4, 7}}},
|
||||
{`(`, nil, "-", nil},
|
||||
{`[\41]`, nil, `!`, []Group{{0, 1}}},
|
||||
{`[\041]`, nil, `!`, []Group{{0, 1}}},
|
||||
{`(?<!abc)(d.f)`, nil, `abcdefdof`, []Group{{6, 9}}},
|
||||
{`[\w-]+`, nil, `laser_beam`, []Group{{0, 10}}},
|
||||
{`M+`, []ReFlag{RE_CASE_INSENSITIVE}, `MMM`, []Group{{0, 3}}},
|
||||
@@ -481,7 +489,25 @@ var reTests = []struct {
|
||||
{`[b-e]`, nil, `f`, []Group{}},
|
||||
|
||||
{`*?`, nil, `-`, nil},
|
||||
{`a*?`, nil, `-`, nil}, // non-greedy operators are not supported
|
||||
{`a.+c`, nil, `abcabc`, []Group{{0, 6}}},
|
||||
// Lazy quantifier tests
|
||||
{`a.+?c`, nil, `abcabc`, []Group{{0, 3}, {3, 6}}},
|
||||
{`ab*?bc`, []ReFlag{RE_CASE_INSENSITIVE}, `ABBBBC`, []Group{{0, 6}}},
|
||||
{`ab+?bc`, []ReFlag{RE_CASE_INSENSITIVE}, `ABBC`, []Group{{0, 4}}},
|
||||
{`ab??bc`, []ReFlag{RE_CASE_INSENSITIVE}, `ABBC`, []Group{{0, 4}}},
|
||||
{`ab??bc`, []ReFlag{RE_CASE_INSENSITIVE}, `ABC`, []Group{{0, 3}}},
|
||||
{`ab??bc`, []ReFlag{RE_CASE_INSENSITIVE}, `ABBBBC`, []Group{}},
|
||||
{`ab??c`, []ReFlag{RE_CASE_INSENSITIVE}, `ABC`, []Group{{0, 3}}},
|
||||
{`a.*?c`, []ReFlag{RE_CASE_INSENSITIVE}, `AXYZC`, []Group{{0, 5}}},
|
||||
{`a.+?c`, []ReFlag{RE_CASE_INSENSITIVE}, `ABCABC`, []Group{{0, 3}, {3, 6}}},
|
||||
{`a.*?c`, []ReFlag{RE_CASE_INSENSITIVE}, `ABCABC`, []Group{{0, 3}, {3, 6}}},
|
||||
{`.*?\S *:`, nil, `xx:`, []Group{{0, 3}}},
|
||||
{`a[ ]*? (\d+).*`, nil, `a 10`, []Group{{0, 6}}},
|
||||
{`a[ ]*? (\d+).*`, nil, `a 10`, []Group{{0, 7}}},
|
||||
{`"(?:\\"|[^"])*?"`, nil, `"\""`, []Group{{0, 4}}},
|
||||
{`^.*?$`, nil, "one\ntwo\nthree", []Group{}},
|
||||
{`a[^>]*?b`, nil, `a>b`, []Group{}},
|
||||
{`^a*?$`, nil, `foo`, []Group{}},
|
||||
|
||||
// Numeric range tests - this is a feature that I added, and doesn't exist
|
||||
// in any other mainstream regex engine
|
||||
@@ -512,6 +538,30 @@ var reTests = []struct {
|
||||
{`<389-400`, nil, `-`, nil},
|
||||
{`<389-400>`, nil, `391`, []Group{{0, 3}}},
|
||||
{`\b<1-10000>\b`, nil, `America declared independence in 1776.`, []Group{{33, 37}}},
|
||||
|
||||
{`\p{Tamil}+`, nil, `உயிரெழுத்து`, []Group{{0, 11}}}, // Each letter and matra is counted as a separate rune, so 'u', 'ya', 'e (matra), 'ra', 'e (matra)', 'zha', (oo (matra), 'tha', 'ith', 'tha', 'oo (matra)'.
|
||||
{`\P{Tamil}+`, nil, `vowel=உயிரெழுத்து`, []Group{{0, 6}}},
|
||||
{`\P`, nil, `உயிரெழுத்து`, nil},
|
||||
{`\PM\pM*`, nil, `உயிரெழுத்து`, []Group{{0, 1}, {1, 3}, {3, 5}, {5, 7}, {7, 9}, {9, 11}}},
|
||||
{`\pN+`, nil, `123abc456def`, []Group{{0, 3}, {6, 9}}},
|
||||
{`\PN+`, nil, `123abc456def`, []Group{{3, 6}, {9, 12}}},
|
||||
{`[\p{Greek}\p{Cyrillic}]`, nil, `ΣωШД`, []Group{{0, 1}, {1, 2}, {2, 3}, {3, 4}}},
|
||||
|
||||
{`(?<=\().*?(?=\))`, nil, `(abc)`, []Group{{1, 4}}},
|
||||
|
||||
{`((a|b)\2)`, nil, `aa`, []Group{{0, 2}}},
|
||||
{`((a|b)\2)`, nil, `bb`, []Group{{0, 2}}},
|
||||
{`((a|b)\2)`, nil, `ab`, []Group{}},
|
||||
{`((a|b)\2)`, nil, `ba`, []Group{}},
|
||||
|
||||
{`((a|b)\2){3}`, nil, `aaaaaa`, []Group{{0, 6}}},
|
||||
{`((a|b)\2){3}`, nil, `bbbbbb`, []Group{{0, 6}}},
|
||||
{`((a|b)\2){3}`, nil, `bbaaaa`, []Group{{0, 6}}},
|
||||
{`((a|b)\2){3}`, nil, `aabbaa`, []Group{{0, 6}}},
|
||||
{`((a|b)\2){3}`, nil, `aaaabb`, []Group{{0, 6}}},
|
||||
{`((a|b)\2){3}`, nil, `bbaabb`, []Group{{0, 6}}},
|
||||
{`((a|b)\2){3}`, nil, `baabab`, []Group{}},
|
||||
{`((a|b)\2){3}`, nil, `bbabab`, []Group{}},
|
||||
}
|
||||
|
||||
var groupTests = []struct {
|
||||
@@ -522,7 +572,7 @@ var groupTests = []struct {
|
||||
}{
|
||||
{"(a)(b)", nil, "ab", []Match{[]Group{{0, 2}, {0, 1}, {1, 2}}}},
|
||||
{"((a))(b)", nil, "ab", []Match{[]Group{{0, 2}, {0, 1}, {0, 1}, {1, 2}}}},
|
||||
{"(0)", nil, "ab", []Match{[]Group{}}},
|
||||
{"(0)", nil, "ab", []Match{}},
|
||||
{"(a)b", nil, "ab", []Match{[]Group{{0, 2}, {0, 1}}}},
|
||||
{"a(b)", nil, "ab", []Match{[]Group{{0, 2}, {1, 2}}}},
|
||||
{"(a|b)", nil, "ab", []Match{[]Group{{0, 1}, {0, 1}}, []Group{{1, 2}, {1, 2}}}},
|
||||
@@ -531,10 +581,11 @@ var groupTests = []struct {
|
||||
{"(a+)|(a)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 4}, {-1, -1}}}},
|
||||
{"(a+)(aa)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 2}, {2, 4}}}},
|
||||
{"(aaaa)|(aaaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 4}, {-1, -1}}}},
|
||||
{"(aaa)|(aaaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {-1, -1}, {0, 4}}}},
|
||||
{"(aaa)|(aaaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {-1, -1}, {0, 4}}}},
|
||||
// This match will only happen with Longest()
|
||||
// {"(aaa)|(aaaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {-1, -1}, {0, 4}}}},
|
||||
{"(aaa)|(aaaa)", nil, "aaaa", []Match{[]Group{{0, 3}, {0, 3}, {-1, -1}}}},
|
||||
{"(aaaa)|(aaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 4}, {-1, -1}}}},
|
||||
{"(a)|(aa)", nil, "aa", []Match{[]Group{{0, 2}, {-1, -1}, {0, 2}}}},
|
||||
{"(a)|(aa)", nil, "aa", []Match{[]Group{{0, 1}, {0, 1}}, []Group{{1, 2}, {1, 2}}}},
|
||||
{"(a?)a?", nil, "b", []Match{[]Group{{0, 0}, {0, 0}}, []Group{{1, 1}, {1, 1}}}},
|
||||
{"(a?)a?", nil, "ab", []Match{[]Group{{0, 1}, {0, 1}}, []Group{{1, 1}, {1, 1}}, []Group{{2, 2}, {2, 2}}}},
|
||||
{"(a?)a?", nil, "aa", []Match{[]Group{{0, 2}, {0, 1}}, []Group{{2, 2}, {2, 2}}}},
|
||||
@@ -572,13 +623,37 @@ var groupTests = []struct {
|
||||
{`(bc+d$|ef*g.|h?i(j|k))`, nil, `bcdd`, []Match{}},
|
||||
{`(bc+d$|ef*g.|h?i(j|k))`, nil, `reffgz`, []Match{[]Group{{1, 6}, {1, 6}}}},
|
||||
{`(((((((((a)))))))))`, nil, `a`, []Match{[]Group{{0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}},
|
||||
{`(((((((((a)))))))))\41`, nil, `a`, []Match{[]Group{{0, 2}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}},
|
||||
{`(((((((((a)))))))))\041`, nil, `a!`, []Match{[]Group{{0, 2}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}},
|
||||
{`(.*)c(.*)`, nil, `abcde`, []Match{[]Group{{0, 5}, {0, 2}, {3, 5}}}},
|
||||
{`\((.*), (.*)\)`, nil, `(a, b)`, []Match{[]Group{{0, 6}, {1, 2}, {4, 5}}}},
|
||||
|
||||
// At this point, the python test suite has a bunch
|
||||
// of backreference tests. Since my engine doesn't
|
||||
// implement backreferences, I've skipped those tests.
|
||||
// Backreference tests
|
||||
{`(abc)\1`, nil, `abcabc`, []Match{[]Group{{0, 6}, {0, 3}}}},
|
||||
{`([a-c]+)\1`, nil, `abcabc`, []Match{[]Group{{0, 6}, {0, 3}}}},
|
||||
{`([a-c]*)\1`, nil, `abcabc`, []Match{[]Group{{0, 6}, {0, 3}}, []Group{{6, 6}, {6, 6}}}},
|
||||
{`^(.+)?B`, nil, `AB`, []Match{[]Group{{0, 2}, {0, 1}}}},
|
||||
{`(a+).\1$`, nil, `aaaaa`, []Match{[]Group{{0, 5}, {0, 2}}}},
|
||||
{`^(a+).\1$`, nil, `aaaa`, []Match{}},
|
||||
{`(a)\1`, nil, `aa`, []Match{[]Group{{0, 2}, {0, 1}}}},
|
||||
{`(a+)\1`, nil, `aa`, []Match{[]Group{{0, 2}, {0, 1}}}},
|
||||
{`(a+)+\1`, nil, `aa`, []Match{[]Group{{0, 2}, {0, 1}}}},
|
||||
{`(a).+\1`, nil, `aba`, []Match{[]Group{{0, 3}, {0, 1}}}},
|
||||
{`(a)ba*\1`, nil, `aba`, []Match{[]Group{{0, 3}, {0, 1}}}},
|
||||
{`(aa|a)a\1$`, nil, `aaa`, []Match{[]Group{{0, 3}, {0, 1}}}},
|
||||
{`(a|aa)a\1$`, nil, `aaa`, []Match{[]Group{{0, 3}, {0, 1}}}},
|
||||
{`(a+)a\1$`, nil, `aaa`, []Match{[]Group{{0, 3}, {0, 1}}}},
|
||||
{`([abc]*)\1`, nil, `abcabc`, []Match{[]Group{{0, 6}, {0, 3}}, []Group{{6, 6}, {6, 6}}}},
|
||||
{`(a)(?:b)\1`, nil, `aba`, []Match{[]Group{{0, 3}, {0, 1}}}},
|
||||
{`(a)(?:b)\1`, nil, `abb`, []Match{}},
|
||||
{`(?:a)(b)\1`, nil, `aba`, []Match{}},
|
||||
{`(?:a)(b)\1`, nil, `abb`, []Match{[]Group{{0, 3}, {1, 2}}}},
|
||||
{`(?:(cat)|(dog))\2`, nil, `catdog`, []Match{}},
|
||||
{`(?:a)\1`, nil, `aa`, nil},
|
||||
{`((cat)|(dog)|(cow)|(bat))\4`, nil, `cowcow`, []Match{[]Group{{0, 6}, {0, 3}, {-1, -1}, {-1, -1}, {0, 3}, {-1, -1}}}},
|
||||
{`(a|b)*\1`, nil, `abb`, []Match{[]Group{{0, 3}, {1, 2}}}},
|
||||
{`(a|b)*\1`, nil, `aba`, []Match{}},
|
||||
{`(a|b)*\1`, nil, `bab`, []Match{}},
|
||||
{`(a|b)*\1`, nil, `baa`, []Match{[]Group{{0, 3}, {1, 2}}}},
|
||||
|
||||
{`(a)(b)c|ab`, nil, `ab`, []Match{[]Group{{0, 2}}}},
|
||||
{`(a)+x`, nil, `aaax`, []Match{[]Group{{0, 4}, {2, 3}}}},
|
||||
@@ -627,7 +702,7 @@ var groupTests = []struct {
|
||||
{`(bc+d$|ef*g.|h?i(j|k))`, []ReFlag{RE_CASE_INSENSITIVE}, `BCDD`, []Match{}},
|
||||
{`(bc+d$|ef*g.|h?i(j|k))`, []ReFlag{RE_CASE_INSENSITIVE}, `reffgz`, []Match{[]Group{{1, 6}, {1, 6}}}},
|
||||
{`(((((((((a)))))))))`, []ReFlag{RE_CASE_INSENSITIVE}, `A`, []Match{[]Group{{0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}},
|
||||
{`(((((((((a)))))))))\41`, []ReFlag{RE_CASE_INSENSITIVE}, `A`, []Match{[]Group{{0, 2}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}},
|
||||
{`(((((((((a)))))))))\041`, []ReFlag{RE_CASE_INSENSITIVE}, `A!`, []Match{[]Group{{0, 2}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}},
|
||||
{`(.*)c(.*)`, []ReFlag{RE_CASE_INSENSITIVE}, `ABCDE`, []Match{[]Group{{0, 5}, {0, 2}, {3, 5}}}},
|
||||
{`\((.*), (.*)\)`, []ReFlag{RE_CASE_INSENSITIVE}, `(A, B)`, []Match{[]Group{{0, 6}, {1, 2}, {4, 5}}}},
|
||||
{`(a)(b)c|ab`, []ReFlag{RE_CASE_INSENSITIVE}, `AB`, []Match{[]Group{{0, 2}}}},
|
||||
@@ -668,9 +743,32 @@ var groupTests = []struct {
|
||||
{`^([ab]*)(?<!(a))c`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 2}}}},
|
||||
|
||||
{`(<389-400>)`, nil, `391`, []Match{[]Group{{0, 3}, {0, 3}}}},
|
||||
|
||||
// // Tests from https://wiki.haskell.org/Regex_Posix
|
||||
// {`(()|.)(b)`, nil, `ab`, []Match{[]Group{{0, 2}, {0, 1}, {-1, -1}, {1, 2}}}},
|
||||
// {`(()|[ab])(b)`, nil, `ab`, []Match{[]Group{{0, 2}, {0, 1}, {-1, -1}, {1, 2}}}},
|
||||
// {`(()|[ab])+b`, nil, `aaab`, []Match{[]Group{{0, 4}, {2, 3}, {-1, -1}}}},
|
||||
// {`([ab]|())+b`, nil, `aaab`, []Match{[]Group{{0, 4}, {2, 3}, {-1, -1}}}},
|
||||
// // Bug - this should give {0,6},{3,6},{-1,-1} but it gives {0,6},{3,6},{3,3}
|
||||
// // {`yyyyyy`, nil, `(yyy|(x?)){2,4}`, []Match{[]Group{{0, 6}, {3, 6}, {-1, -1}}, []Group{{6, 6}, {6, 6}, {6, 6}}}},
|
||||
// {`(a|ab|c|bcd)*(d*)`, nil, `ababcd`, []Match{[]Group{{0, 6}, {3, 6}, {6, 6}}, []Group{{6, 6}, {6, 6}, {6, 6}}}},
|
||||
// // Bug - this should give {0,3},{0,3},{0,0},{0,3},{3,3} but it gives {0,3},{0,2},{0,1},{1,2},{2,3}
|
||||
// // {`((a*)(b|abc))(c*)`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 3}, {0, 0}, {0, 3}, {3, 3}}}},
|
||||
|
||||
// Lazy quantifier tests
|
||||
{`a(?:b|c|d)+?(.)`, nil, `ace`, []Match{[]Group{{0, 3}, {2, 3}}}},
|
||||
{`a(?:b|(c|e){1,2}?|d)+?(.)`, nil, `ace`, []Match{[]Group{{0, 3}, {1, 2}, {2, 3}}}},
|
||||
{`(?<!-):(.*?)(?<!-):`, nil, `a:bc-:de:f`, []Match{[]Group{{1, 9}, {2, 8}}}},
|
||||
{`(?<!\\):(.*?)(?<!\\):`, nil, `a:bc\:de:f`, []Match{[]Group{{1, 9}, {2, 8}}}},
|
||||
{`(?<!\?)'(.*?)(?<!\?)'`, nil, `a'bc?'de'f`, []Match{[]Group{{1, 9}, {2, 8}}}},
|
||||
{`.*?x\s*\z(.*)`, []ReFlag{RE_MULTILINE, RE_SINGLE_LINE}, "xx\nx\n", []Match{[]Group{{0, 5}, {5, 5}}}},
|
||||
{`.*?x\s*\z(.*)`, []ReFlag{RE_MULTILINE}, "xx\nx\n", []Match{[]Group{{3, 5}, {5, 5}}}},
|
||||
{`^([ab]*?)(?=(b)?)c`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 2}, {-1, -1}}}},
|
||||
{`^([ab]*?)(?!(b))c`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 2}, {-1, -1}}}},
|
||||
{`^([ab]*?)(?<!(a))c`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 2}, {-1, -1}}}},
|
||||
}
|
||||
|
||||
func TestFindAllMatches(t *testing.T) {
|
||||
func TestFind(t *testing.T) {
|
||||
for _, test := range reTests {
|
||||
t.Run(test.re+" "+test.str, func(t *testing.T) {
|
||||
regComp, err := Compile(test.re, test.flags...)
|
||||
@@ -679,13 +777,35 @@ func TestFindAllMatches(t *testing.T) {
|
||||
panic(fmt.Errorf("Test Error: %v", err))
|
||||
}
|
||||
} else {
|
||||
matchIndices := FindAllMatches(regComp, test.str)
|
||||
zeroGroups := make([]Group, len(matchIndices))
|
||||
for i, m := range matchIndices {
|
||||
zeroGroups[i] = m[0]
|
||||
groupIndex, err := regComp.Find(test.str)
|
||||
if err != nil { // No matches found
|
||||
if len(test.result) == 0 {
|
||||
return // Manually pass the test, because this is the expected behavior
|
||||
} else {
|
||||
t.Errorf("Wanted %v Got no matches\n", test.result)
|
||||
}
|
||||
} else {
|
||||
if groupIndex != test.result[0] {
|
||||
t.Errorf("Wanted %v Got %v\n", test.result, groupIndex)
|
||||
}
|
||||
}
|
||||
if !slices.Equal(test.result, zeroGroups) {
|
||||
t.Errorf("Wanted %v Got %v\n", test.result, zeroGroups)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestFindAll(t *testing.T) {
|
||||
for _, test := range reTests {
|
||||
t.Run(test.re+" "+test.str, func(t *testing.T) {
|
||||
regComp, err := Compile(test.re, test.flags...)
|
||||
if err != nil {
|
||||
if test.result != nil {
|
||||
panic(fmt.Errorf("Test Error: %v", err))
|
||||
}
|
||||
} else {
|
||||
matchIndices := regComp.FindAll(test.str)
|
||||
if !slices.Equal(test.result, matchIndices) {
|
||||
t.Errorf("Wanted %v Got %v\n", test.result, matchIndices)
|
||||
}
|
||||
}
|
||||
})
|
||||
@@ -701,10 +821,10 @@ func TestFindString(t *testing.T) {
|
||||
panic(err)
|
||||
}
|
||||
} else {
|
||||
foundString := FindString(regComp, test.str)
|
||||
foundString := regComp.FindString(test.str)
|
||||
if len(test.result) == 0 {
|
||||
if foundString != "" {
|
||||
t.Errorf("Expected no match got %v\n", foundString)
|
||||
t.Errorf("Wanted no match got %v\n", foundString)
|
||||
}
|
||||
} else {
|
||||
expectedString := test.str[test.result[0].StartIdx:test.result[0].EndIdx]
|
||||
@@ -717,21 +837,185 @@ func TestFindString(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestFindAllGroups(t *testing.T) {
|
||||
for _, test := range groupTests {
|
||||
func TestFindAllString(t *testing.T) {
|
||||
for _, test := range reTests {
|
||||
t.Run(test.re+" "+test.str, func(t *testing.T) {
|
||||
regComp, err := Compile(test.re, test.flags...)
|
||||
if err != nil {
|
||||
if test.result != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
matchIndices := FindAllMatches(regComp, test.str)
|
||||
for i := range matchIndices {
|
||||
for j := range matchIndices[i] {
|
||||
if matchIndices[i][j].isValid() {
|
||||
if test.result[i][j] != matchIndices[i][j] {
|
||||
t.Errorf("Wanted %v Got %v\n", test.result, matchIndices)
|
||||
} else {
|
||||
foundStrings := regComp.FindAllString(test.str)
|
||||
if len(test.result) != len(foundStrings) {
|
||||
t.Errorf("Differing number of matches: Wanted %v matches Got %v matches\n", len(test.result), len(foundStrings))
|
||||
} else {
|
||||
for idx, group := range test.result {
|
||||
groupStr := test.str[group.StartIdx:group.EndIdx]
|
||||
if groupStr != foundStrings[idx] {
|
||||
t.Errorf("Wanted %v Got %v\n", groupStr, foundStrings[idx])
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestFindSubmatch(t *testing.T) {
|
||||
for _, test := range groupTests {
|
||||
t.Run(test.re+" "+test.str, func(t *testing.T) {
|
||||
regComp, err := Compile(test.re, test.flags...)
|
||||
if err != nil {
|
||||
if test.result != nil {
|
||||
panic(err)
|
||||
}
|
||||
} else {
|
||||
match, err := regComp.FindSubmatch(test.str)
|
||||
if err != nil {
|
||||
if len(test.result) != 0 {
|
||||
t.Errorf("Wanted %v got no match\n", test.result[0])
|
||||
}
|
||||
} else if len(test.result) == 0 {
|
||||
t.Errorf("Wanted no match got %v\n", match)
|
||||
}
|
||||
for i := range match {
|
||||
if match[i].IsValid() {
|
||||
if test.result[0][i] != match[i] {
|
||||
t.Errorf("Wanted %v Got %v\n", test.result[0], match)
|
||||
}
|
||||
} else {
|
||||
if i < len(test.result) && test.result[0][i].IsValid() {
|
||||
t.Errorf("Wanted %v Got %v\n", test.result[0], match)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
func TestFindStringSubmatch(t *testing.T) {
|
||||
for _, test := range groupTests {
|
||||
t.Run(test.re+" "+test.str, func(t *testing.T) {
|
||||
regComp, err := Compile(test.re, test.flags...)
|
||||
if err != nil {
|
||||
if test.result != nil {
|
||||
panic(err)
|
||||
}
|
||||
} else {
|
||||
matchStr := regComp.FindStringSubmatch(test.str)
|
||||
if matchStr == nil {
|
||||
if len(test.result) != 0 {
|
||||
expectedStr := funcMap(test.result[0], func(g Group) string {
|
||||
if g.IsValid() {
|
||||
return test.str[g.StartIdx:g.EndIdx]
|
||||
} else {
|
||||
return ""
|
||||
}
|
||||
})
|
||||
t.Errorf("Wanted %v got no match\n", expectedStr)
|
||||
}
|
||||
} else if len(test.result) == 0 {
|
||||
t.Errorf("Wanted no match got %v\n", matchStr)
|
||||
} else {
|
||||
expectedStr := funcMap(test.result[0], func(g Group) string {
|
||||
if g.IsValid() {
|
||||
return test.str[g.StartIdx:g.EndIdx]
|
||||
} else {
|
||||
return ""
|
||||
}
|
||||
})
|
||||
for i, groupStr := range matchStr {
|
||||
if groupStr == "" {
|
||||
if i < len(expectedStr) && expectedStr[i] != "" {
|
||||
t.Errorf("Wanted %v Got %v\n", expectedStr, matchStr)
|
||||
}
|
||||
} else {
|
||||
if expectedStr[i] != groupStr {
|
||||
t.Errorf("Wanted %v Got %v\n", expectedStr, matchStr)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestFindAllStringSubmatch(t *testing.T) {
|
||||
for _, test := range groupTests {
|
||||
t.Run(test.re+" "+test.str, func(t *testing.T) {
|
||||
regComp, err := Compile(test.re, test.flags...)
|
||||
if err != nil {
|
||||
if test.result != nil {
|
||||
panic(err)
|
||||
}
|
||||
} else {
|
||||
matchStrs := regComp.FindAllStringSubmatch(test.str)
|
||||
if matchStrs == nil {
|
||||
if len(test.result) != 0 {
|
||||
expectedStrs := funcMap(test.result, func(m Match) []string {
|
||||
return funcMap(m, func(g Group) string {
|
||||
if g.IsValid() {
|
||||
return test.str[g.StartIdx:g.EndIdx]
|
||||
} else {
|
||||
return ""
|
||||
}
|
||||
})
|
||||
})
|
||||
t.Errorf("Wanted %v got no match\n", expectedStrs)
|
||||
}
|
||||
} else if len(test.result) == 0 {
|
||||
t.Errorf("Wanted no match got %v\n", matchStrs)
|
||||
} else {
|
||||
expectedStrs := funcMap(test.result, func(m Match) []string {
|
||||
return funcMap(m, func(g Group) string {
|
||||
if g.IsValid() {
|
||||
return test.str[g.StartIdx:g.EndIdx]
|
||||
} else {
|
||||
return ""
|
||||
}
|
||||
})
|
||||
})
|
||||
for i, matchStr := range matchStrs {
|
||||
for j, groupStr := range matchStr {
|
||||
if groupStr == "" {
|
||||
if j < len(expectedStrs[i]) && expectedStrs[i][j] != "" {
|
||||
t.Errorf("Wanted %v Got %v\n", expectedStrs, matchStrs)
|
||||
}
|
||||
} else {
|
||||
if expectedStrs[i][j] != groupStr {
|
||||
t.Errorf("Wanted %v Got %v\n", expectedStrs, matchStrs)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestFindAllSubmatch(t *testing.T) {
|
||||
for _, test := range groupTests {
|
||||
t.Run(test.re+" "+test.str, func(t *testing.T) {
|
||||
regComp, err := Compile(test.re, test.flags...)
|
||||
if err != nil {
|
||||
if test.result != nil {
|
||||
panic(err)
|
||||
}
|
||||
} else {
|
||||
matchIndices := regComp.FindAllSubmatch(test.str)
|
||||
for i := range matchIndices {
|
||||
for j := range matchIndices[i] {
|
||||
if matchIndices[i][j].IsValid() {
|
||||
if test.result[i][j] != matchIndices[i][j] {
|
||||
t.Errorf("Wanted %v Got %v\n", test.result, matchIndices)
|
||||
}
|
||||
} else {
|
||||
if i < len(test.result) && j < len(test.result[i]) && test.result[i][j].IsValid() {
|
||||
t.Errorf("Wanted %v Got %v\n", test.result, matchIndices)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@@ -1,4 +1,4 @@
|
||||
package greg
|
||||
package regex
|
||||
|
||||
import "errors"
|
||||
|
@@ -1,4 +1,4 @@
|
||||
package greg
|
||||
package regex
|
||||
|
||||
type stateContents []int // Represents the contents of the current state - character classes can have multiple contents, which is why it is represented as a slice
|
||||
|
@@ -4,4 +4,5 @@
|
||||
Ideas for flags:
|
||||
-m <num> : Print <num>th match (-m 1 = first match, -m 2 = second match)
|
||||
-g <num> : Print the <num>th group
|
||||
-r : Specify a directory instead of a file, reads recursively
|
||||
4. Refactor code for flags - make each flag's code a function, which modifies the result of findAllMatches
|
Reference in New Issue
Block a user