Compare commits
23 Commits
47f88c817f
...
v0.2.0
| Author | SHA1 | Date | |
|---|---|---|---|
| 073f231b89 | |||
| 3b7257c921 | |||
| 668df8b70a | |||
| 214acf7e0f | |||
| 50221ff4d9 | |||
| 5ab95f512a | |||
| e7da678408 | |||
| ab363e2766 | |||
| c803e45415 | |||
| 525296f239 | |||
| eb0ab9f7ec | |||
| 17a7dbae4c | |||
| f2279acd98 | |||
| 662527c478 | |||
| d1958f289c | |||
| 15ee49f42e | |||
| b60ded4136 | |||
| 9fbb99f86c | |||
| af15904f3b | |||
| d522f50b50 | |||
| fb47e082eb | |||
| 1f5a363539 | |||
| 9e12f9dcb3 |
@@ -14,20 +14,41 @@ var notDotChars []rune
|
|||||||
// the startState of the NFA representation of the regex, and the number of capturing
|
// the startState of the NFA representation of the regex, and the number of capturing
|
||||||
// groups in the regex. It also contains the expression string.
|
// groups in the regex. It also contains the expression string.
|
||||||
type Reg struct {
|
type Reg struct {
|
||||||
start *nfaState
|
start *nfaState
|
||||||
numGroups int
|
numGroups int
|
||||||
str string
|
str string
|
||||||
|
preferLongest bool
|
||||||
}
|
}
|
||||||
|
|
||||||
// NumSubexp returns the number of sub-expressions in the given [Reg]. This is equivalent
|
// NumSubexp returns the number of sub-expressions in the given [Reg]. This is equivalent
|
||||||
// to the number of capturing groups.
|
// to the number of capturing groups.
|
||||||
func (r Reg) NumSubexp() int {
|
func (re Reg) NumSubexp() int {
|
||||||
return r.numGroups
|
return re.numGroups
|
||||||
}
|
}
|
||||||
|
|
||||||
// String returns the string used to compile the expression.
|
// String returns the string used to compile the expression.
|
||||||
func (r Reg) String() string {
|
func (re Reg) String() string {
|
||||||
return r.str
|
return re.str
|
||||||
|
}
|
||||||
|
|
||||||
|
// MarshalText implements [encoding.TextMarshaler]. The output is equivalent to that of [Reg.String].
|
||||||
|
// Any flags passed as arguments (including calling [Reg.Longest]) are lost.
|
||||||
|
func (re *Reg) MarshalText() ([]byte, error) {
|
||||||
|
return []byte(re.String()), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// UnmarshalText implements [encoding.TextUnmarshaler]. It calls [Reg.Compile] on the given byte-slice. If it returns successfully,
|
||||||
|
// then the result of the compilation is stored in re. The result of [Reg.Compile] is returned.
|
||||||
|
func (re *Reg) UnmarshalText(text []byte) error {
|
||||||
|
newReg, err := Compile(string(text))
|
||||||
|
if err == nil {
|
||||||
|
*re = newReg
|
||||||
|
}
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Longest makes future searches with re prefer the longest match, by setting
// the preferLongest flag on re. It takes a pointer receiver because it modifies re.
func (re *Reg) Longest() {
|
||||||
|
re.preferLongest = true
|
||||||
}
|
}
|
||||||
|
|
||||||
const concatRune rune = 0xF0001
|
const concatRune rune = 0xF0001
|
||||||
@@ -1135,7 +1156,7 @@ func thompson(re []postfixNode) (Reg, error) {
|
|||||||
concatenate(nfa[0], &lastState)
|
concatenate(nfa[0], &lastState)
|
||||||
|
|
||||||
// The string is empty here, because we add it in Compile()
|
// The string is empty here, because we add it in Compile()
|
||||||
return Reg{nfa[0], numGroups, ""}, nil
|
return Reg{nfa[0], numGroups, "", false}, nil
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
24
regex/doc.go
24
regex/doc.go
@@ -33,7 +33,7 @@ Perl classes:
|
|||||||
\d Match any digit character ([0-9])
|
\d Match any digit character ([0-9])
|
||||||
\D Match any non-digit character ([^0-9])
|
\D Match any non-digit character ([^0-9])
|
||||||
\w Match any word character ([a-zA-Z0-9_])
|
\w Match any word character ([a-zA-Z0-9_])
|
||||||
\W Match any word character ([^a-zA-Z0-9_])
|
\W Match any non-word character ([^a-zA-Z0-9_])
|
||||||
\s Match any whitespace character ([ \t\n])
|
\s Match any whitespace character ([ \t\n])
|
||||||
\S Match any non-whitespace character ([^ \t\n])
|
\S Match any non-whitespace character ([^ \t\n])
|
||||||
|
|
||||||
@@ -105,23 +105,7 @@ The key differences are mentioned below.
|
|||||||
|
|
||||||
1. Greediness:
|
1. Greediness:
|
||||||
|
|
||||||
This engine does not support non-greedy operators. All operators are always greedy in nature, and will try
|
This engine currently does not support non-greedy operators.
|
||||||
to match as much as they can, while still allowing for a successful match. For example, given the regex:
|
|
||||||
|
|
||||||
y*y
|
|
||||||
|
|
||||||
The engine will match as many 'y's as it can, while still allowing the trailing 'y' to be matched.
|
|
||||||
|
|
||||||
Another, more subtle example is the following regex:
|
|
||||||
|
|
||||||
x|xx
|
|
||||||
|
|
||||||
While the stdlib implementation (and most other engines) will prefer matching the first item of the alternation,
|
|
||||||
this engine will go for the longest possible match, regardless of the order of the alternation. Although this
|
|
||||||
strays from the convention, it results in a nice rule-of-thumb - the engine is ALWAYS greedy.
|
|
||||||
|
|
||||||
The stdlib implementation has a function [regexp.Regexp.Longest] which makes future searches prefer the longest match.
|
|
||||||
That is the default (and unchangable) behavior in this engine.
|
|
||||||
|
|
||||||
2. Byte-slices and runes:
|
2. Byte-slices and runes:
|
||||||
|
|
||||||
@@ -166,13 +150,13 @@ The following features from [regexp] are (currently) NOT supported:
|
|||||||
1. Named capturing groups
|
1. Named capturing groups
|
||||||
2. Non-greedy operators
|
2. Non-greedy operators
|
||||||
3. Unicode character classes
|
3. Unicode character classes
|
||||||
4. Embedded flags (flags are passed as arguments to [Compile])
|
4. Embedded flags (flags are instead passed as arguments to [Compile])
|
||||||
5. Literal text with \Q ... \E
|
5. Literal text with \Q ... \E
|
||||||
|
|
||||||
The following features are not available in [regexp], but are supported in my engine:
|
The following features are not available in [regexp], but are supported in my engine:
|
||||||
1. Lookarounds
|
1. Lookarounds
|
||||||
2. Numeric ranges
|
2. Numeric ranges
|
||||||
|
|
||||||
The goal is to shorten the first list, and expand the second.
|
I hope to shorten the first list, and expand the second.
|
||||||
*/
|
*/
|
||||||
package regex
|
package regex
|
||||||
|
|||||||
@@ -2,6 +2,7 @@ package regex_test
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"strings"
|
||||||
|
|
||||||
"gitea.twomorecents.org/Rockingcool/kleingrep/regex"
|
"gitea.twomorecents.org/Rockingcool/kleingrep/regex"
|
||||||
)
|
)
|
||||||
@@ -32,12 +33,12 @@ func ExampleReg_FindAll() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func ExampleReg_FindString() {
|
func ExampleReg_FindString() {
|
||||||
regexStr := `\d+`
|
regexStr := `\w+\s+(?=sheep)`
|
||||||
regexComp := regex.MustCompile(regexStr)
|
regexComp := regex.MustCompile(regexStr)
|
||||||
|
|
||||||
matchStr := regexComp.FindString("The year of our lord, 2025")
|
matchStr := regexComp.FindString("pink cows and yellow sheep")
|
||||||
fmt.Println(matchStr)
|
fmt.Println(matchStr)
|
||||||
// Output: 2025
|
// Output: yellow
|
||||||
}
|
}
|
||||||
|
|
||||||
func ExampleReg_FindSubmatch() {
|
func ExampleReg_FindSubmatch() {
|
||||||
@@ -52,3 +53,129 @@ func ExampleReg_FindSubmatch() {
|
|||||||
// 0 1
|
// 0 1
|
||||||
// 2 3
|
// 2 3
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func ExampleReg_FindStringSubmatch() {
|
||||||
|
regexStr := `(\d{4})-(\d{2})-(\d{2})`
|
||||||
|
regexComp := regex.MustCompile(regexStr)
|
||||||
|
inputStr := `The date is 2025-02-10`
|
||||||
|
|
||||||
|
match := regexComp.FindStringSubmatch(inputStr)
|
||||||
|
fmt.Println(match[1])
|
||||||
|
fmt.Println(match[3])
|
||||||
|
// Output: 2025
|
||||||
|
// 10
|
||||||
|
}
|
||||||
|
|
||||||
|
func ExampleReg_FindAllSubmatch() {
|
||||||
|
regexStr := `(\d)\.(\d)(\d)`
|
||||||
|
regexComp := regex.MustCompile(regexStr)
|
||||||
|
|
||||||
|
matches := regexComp.FindAllSubmatch("3.14+8.97")
|
||||||
|
fmt.Println(matches[0][0]) // 0-group (entire match) of 1st match (0-indexed)
|
||||||
|
fmt.Println(matches[0][1]) // 1st group of 1st match
|
||||||
|
fmt.Println(matches[1][0]) // 0-group of 2nd match
|
||||||
|
fmt.Println(matches[1][1]) // 1st group of 2nd match
|
||||||
|
// Output: 0 4
|
||||||
|
// 0 1
|
||||||
|
// 5 9
|
||||||
|
// 5 6
|
||||||
|
}
|
||||||
|
|
||||||
|
func ExampleReg_FindAllString() {
|
||||||
|
regexStr := `<0-255>\.<0-255>\.<0-255>\.<0-255>`
|
||||||
|
inputStr := `192.168.220.7 pings 9.9.9.9`
|
||||||
|
regexComp := regex.MustCompile(regexStr)
|
||||||
|
|
||||||
|
matchStrs := regexComp.FindAllString(inputStr)
|
||||||
|
|
||||||
|
fmt.Println(matchStrs[0])
|
||||||
|
fmt.Println(matchStrs[1])
|
||||||
|
// Output: 192.168.220.7
|
||||||
|
// 9.9.9.9
|
||||||
|
}
|
||||||
|
|
||||||
|
func ExampleReg_FindAllStringSubmatch() {
|
||||||
|
// 'https' ...
|
||||||
|
// followed by 1 or more alphanumeric characters (including period) ...
|
||||||
|
// then a forward slash ...
|
||||||
|
// followed by one or more of:
|
||||||
|
// word character,
|
||||||
|
// question mark,
|
||||||
|
// period,
|
||||||
|
// equals sign
|
||||||
|
regexStr := `https://([a-z0-9\.]+)/([\w.?=]+)`
|
||||||
|
regexComp := regex.MustCompile(regexStr, regex.RE_CASE_INSENSITIVE)
|
||||||
|
inputStr := `You can find me at https://twomorecents.org/index.html and https://news.ycombinator.com/user?id=aadhavans`
|
||||||
|
|
||||||
|
matchIndices := regexComp.FindAllStringSubmatch(inputStr)
|
||||||
|
fmt.Println(matchIndices[0][1]) // 1st group of 1st match (0-indexed)
|
||||||
|
fmt.Println(matchIndices[0][2]) // 2nd group of 1st match
|
||||||
|
fmt.Println(matchIndices[1][1]) // 1st group of 2nd match
|
||||||
|
fmt.Println(matchIndices[1][2]) // 2nd group of 2nd match
|
||||||
|
// Output: twomorecents.org
|
||||||
|
// index.html
|
||||||
|
// news.ycombinator.com
|
||||||
|
// user?id=aadhavans
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
func ExampleReg_Expand() {
|
||||||
|
inputStr := `option1: value1
|
||||||
|
option2: value2`
|
||||||
|
regexStr := `(\w+): (\w+)`
|
||||||
|
templateStr := "$1 = $2\n"
|
||||||
|
regexComp := regex.MustCompile(regexStr, regex.RE_MULTILINE)
|
||||||
|
result := ""
|
||||||
|
for _, submatches := range regexComp.FindAllSubmatch(inputStr) {
|
||||||
|
result = regexComp.Expand(result, templateStr, inputStr, submatches)
|
||||||
|
}
|
||||||
|
fmt.Println(result)
|
||||||
|
// Output: option1 = value1
|
||||||
|
// option2 = value2
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
func ExampleReg_LiteralPrefix() {
|
||||||
|
regexStr := `a(b|c)d*`
|
||||||
|
regexComp := regex.MustCompile(regexStr)
|
||||||
|
prefix, complete := regexComp.LiteralPrefix()
|
||||||
|
fmt.Println(prefix)
|
||||||
|
fmt.Println(complete)
|
||||||
|
// Output: a
|
||||||
|
// false
|
||||||
|
}
|
||||||
|
|
||||||
|
func ExampleReg_Longest() {
|
||||||
|
regexStr := `x|xx`
|
||||||
|
inputStr := "xx"
|
||||||
|
regexComp := regex.MustCompile(regexStr)
|
||||||
|
fmt.Println(regexComp.FindString(inputStr))
|
||||||
|
regexComp.Longest()
|
||||||
|
fmt.Println(regexComp.FindString(inputStr))
|
||||||
|
// Output: x
|
||||||
|
// xx
|
||||||
|
}
|
||||||
|
|
||||||
|
func ExampleReg_ReplaceAll() {
|
||||||
|
regexStr := `(\d)(\w)`
|
||||||
|
inputStr := "5d9t"
|
||||||
|
regexComp := regex.MustCompile(regexStr)
|
||||||
|
fmt.Println(regexComp.ReplaceAll(inputStr, `$2$1`))
|
||||||
|
// Output: d5t9
|
||||||
|
}
|
||||||
|
|
||||||
|
func ExampleReg_ReplaceAllLiteral() {
|
||||||
|
regexStr := `fox|dog`
|
||||||
|
inputStr := "the quick brown fox jumped over the lazy dog"
|
||||||
|
regexComp := regex.MustCompile(regexStr)
|
||||||
|
fmt.Println(regexComp.ReplaceAllLiteral(inputStr, `duck`))
|
||||||
|
// Output: the quick brown duck jumped over the lazy duck
|
||||||
|
}
|
||||||
|
|
||||||
|
func ExampleReg_ReplaceAllFunc() {
|
||||||
|
regexStr := `\w{5,}`
|
||||||
|
inputStr := `all five or more letter words in this string are capitalized`
|
||||||
|
regexComp := regex.MustCompile(regexStr)
|
||||||
|
fmt.Println(regexComp.ReplaceAllFunc(inputStr, strings.ToUpper))
|
||||||
|
// Output: all five or more LETTER WORDS in this STRING are CAPITALIZED
|
||||||
|
}
|
||||||
|
|||||||
@@ -2,6 +2,8 @@ package regex
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"strconv"
|
||||||
|
"unicode"
|
||||||
)
|
)
|
||||||
|
|
||||||
// A Match represents a match found by the regex in a given string.
|
// A Match represents a match found by the regex in a given string.
|
||||||
@@ -63,8 +65,8 @@ func copyThread(to *nfaState, from nfaState) {
|
|||||||
|
|
||||||
// Find returns the 0-group of the leftmost match of the regex in the given string.
|
// Find returns the 0-group of the leftmost match of the regex in the given string.
|
||||||
// An error value != nil indicates that no match was found.
|
// An error value != nil indicates that no match was found.
|
||||||
func (regex Reg) Find(str string) (Group, error) {
|
func (re Reg) Find(str string) (Group, error) {
|
||||||
match, err := regex.FindNthMatch(str, 1)
|
match, err := re.FindNthMatch(str, 1)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return Group{}, fmt.Errorf("no matches found")
|
return Group{}, fmt.Errorf("no matches found")
|
||||||
}
|
}
|
||||||
@@ -72,15 +74,27 @@ func (regex Reg) Find(str string) (Group, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Match returns a boolean value, indicating whether the regex found a match in the given string.
|
// Match returns a boolean value, indicating whether the regex found a match in the given string.
|
||||||
func (regex Reg) Match(str string) bool {
|
func (re Reg) Match(str string) bool {
|
||||||
_, err := regex.Find(str)
|
_, err := re.Find(str)
|
||||||
return err == nil
|
return err == nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// CompileMatch compiles expr and returns true if str contains a match of the expression.
|
||||||
|
// It is equivalent to [regexp.Match].
|
||||||
|
// An optional list of flags may be provided (see [ReFlag]).
|
||||||
|
// It returns an error (!= nil) if there was an error compiling the expression.
|
||||||
|
func CompileMatch(expr string, str string, flags ...ReFlag) (bool, error) {
|
||||||
|
re, err := Compile(expr, flags...)
|
||||||
|
if err != nil {
|
||||||
|
return false, err
|
||||||
|
}
|
||||||
|
return re.Match(str), nil
|
||||||
|
}
|
||||||
|
|
||||||
// FindAll returns a slice containing all the 0-groups of the regex in the given string.
|
// FindAll returns a slice containing all the 0-groups of the regex in the given string.
|
||||||
// A 0-group represents the match without any submatches.
|
// A 0-group represents the match without any submatches.
|
||||||
func (regex Reg) FindAll(str string) []Group {
|
func (re Reg) FindAll(str string) []Group {
|
||||||
indices := regex.FindAllSubmatch(str)
|
indices := re.FindAllSubmatch(str)
|
||||||
zeroGroups := funcMap(indices, getZeroGroup)
|
zeroGroups := funcMap(indices, getZeroGroup)
|
||||||
return zeroGroups
|
return zeroGroups
|
||||||
}
|
}
|
||||||
@@ -89,8 +103,8 @@ func (regex Reg) FindAll(str string) []Group {
|
|||||||
// The return value will be an empty string in two situations:
|
// The return value will be an empty string in two situations:
|
||||||
// 1. No match was found
|
// 1. No match was found
|
||||||
// 2. The match was an empty string
|
// 2. The match was an empty string
|
||||||
func (regex Reg) FindString(str string) string {
|
func (re Reg) FindString(str string) string {
|
||||||
match, err := regex.FindNthMatch(str, 1)
|
match, err := re.FindNthMatch(str, 1)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return ""
|
return ""
|
||||||
}
|
}
|
||||||
@@ -103,8 +117,8 @@ func (regex Reg) FindString(str string) string {
|
|||||||
// number of groups. The validity of a group (whether or not it matched anything) can be determined with
|
// number of groups. The validity of a group (whether or not it matched anything) can be determined with
|
||||||
// [Group.IsValid], or by checking that both indices of the group are >= 0.
|
// [Group.IsValid], or by checking that both indices of the group are >= 0.
|
||||||
// The second return value is non-nil if no match was found.
|
// The second return value is non-nil if no match was found.
|
||||||
func (regex Reg) FindSubmatch(str string) (Match, error) {
|
func (re Reg) FindSubmatch(str string) (Match, error) {
|
||||||
match, err := regex.FindNthMatch(str, 1)
|
match, err := re.FindNthMatch(str, 1)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return Match{}, fmt.Errorf("no match found")
|
return Match{}, fmt.Errorf("no match found")
|
||||||
} else {
|
} else {
|
||||||
@@ -121,9 +135,9 @@ func (regex Reg) FindSubmatch(str string) (Match, error) {
|
|||||||
// 2. Group n found a zero-length match
|
// 2. Group n found a zero-length match
|
||||||
//
|
//
|
||||||
// A return value of nil indicates no match.
|
// A return value of nil indicates no match.
|
||||||
func (regex Reg) FindStringSubmatch(str string) []string {
|
func (re Reg) FindStringSubmatch(str string) []string {
|
||||||
matchStr := make([]string, regex.numGroups+1)
|
matchStr := make([]string, re.numGroups+1)
|
||||||
match, err := regex.FindSubmatch(str)
|
match, err := re.FindSubmatch(str)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@@ -145,8 +159,8 @@ func (regex Reg) FindStringSubmatch(str string) []string {
|
|||||||
// FindAllString is the 'all' version of [Reg.FindString].
|
// FindAllString is the 'all' version of [Reg.FindString].
|
||||||
// It returns a slice of strings containing the text of all matches of
|
// It returns a slice of strings containing the text of all matches of
|
||||||
// the regex in the given string.
|
// the regex in the given string.
|
||||||
func (regex Reg) FindAllString(str string) []string {
|
func (re Reg) FindAllString(str string) []string {
|
||||||
zerogroups := regex.FindAll(str)
|
zerogroups := re.FindAll(str)
|
||||||
matchStrs := funcMap(zerogroups, func(g Group) string {
|
matchStrs := funcMap(zerogroups, func(g Group) string {
|
||||||
return str[g.StartIdx:g.EndIdx]
|
return str[g.StartIdx:g.EndIdx]
|
||||||
})
|
})
|
||||||
@@ -155,14 +169,14 @@ func (regex Reg) FindAllString(str string) []string {
|
|||||||
|
|
||||||
// FindNthMatch returns the 'n'th match of the regex in the given string.
|
// FindNthMatch returns the 'n'th match of the regex in the given string.
|
||||||
// It returns an error (!= nil) if there are fewer than 'n' matches in the string.
|
// It returns an error (!= nil) if there are fewer than 'n' matches in the string.
|
||||||
func (regex Reg) FindNthMatch(str string, n int) (Match, error) {
|
func (re Reg) FindNthMatch(str string, n int) (Match, error) {
|
||||||
idx := 0
|
idx := 0
|
||||||
matchNum := 0
|
matchNum := 0
|
||||||
str_runes := []rune(str)
|
str_runes := []rune(str)
|
||||||
var matchFound bool
|
var matchFound bool
|
||||||
var matchIdx Match
|
var matchIdx Match
|
||||||
for idx <= len(str_runes) {
|
for idx <= len(str_runes) {
|
||||||
matchFound, matchIdx, idx = findAllSubmatchHelper(regex.start, str_runes, idx, regex.numGroups)
|
matchFound, matchIdx, idx = findAllSubmatchHelper(re.start, str_runes, idx, re.numGroups, re.preferLongest)
|
||||||
if matchFound {
|
if matchFound {
|
||||||
matchNum++
|
matchNum++
|
||||||
}
|
}
|
||||||
@@ -175,14 +189,14 @@ func (regex Reg) FindNthMatch(str string, n int) (Match, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// FindAllSubmatch returns a slice of matches in the given string.
|
// FindAllSubmatch returns a slice of matches in the given string.
|
||||||
func (regex Reg) FindAllSubmatch(str string) []Match {
|
func (re Reg) FindAllSubmatch(str string) []Match {
|
||||||
idx := 0
|
idx := 0
|
||||||
str_runes := []rune(str)
|
str_runes := []rune(str)
|
||||||
var matchFound bool
|
var matchFound bool
|
||||||
var matchIdx Match
|
var matchIdx Match
|
||||||
indices := make([]Match, 0)
|
indices := make([]Match, 0)
|
||||||
for idx <= len(str_runes) {
|
for idx <= len(str_runes) {
|
||||||
matchFound, matchIdx, idx = findAllSubmatchHelper(regex.start, str_runes, idx, regex.numGroups)
|
matchFound, matchIdx, idx = findAllSubmatchHelper(re.start, str_runes, idx, re.numGroups, re.preferLongest)
|
||||||
if matchFound {
|
if matchFound {
|
||||||
indices = append(indices, matchIdx)
|
indices = append(indices, matchIdx)
|
||||||
}
|
}
|
||||||
@@ -191,7 +205,30 @@ func (regex Reg) FindAllSubmatch(str string) []Match {
|
|||||||
return indices
|
return indices
|
||||||
}
|
}
|
||||||
|
|
||||||
func addStateToList(str []rune, idx int, list []nfaState, state nfaState, threadGroups []Group, visited []nfaState) []nfaState {
|
// FindAllSubmatch returns a double-slice of strings. Each slice contains the text of a match, including all submatches.
|
||||||
|
// A return value of nil indicates no match.
|
||||||
|
func (re Reg) FindAllStringSubmatch(str string) [][]string {
|
||||||
|
match := re.FindAllSubmatch(str)
|
||||||
|
if len(match) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
rtv := make([][]string, len(match))
|
||||||
|
for i := range rtv {
|
||||||
|
rtv[i] = make([]string, re.numGroups+1)
|
||||||
|
}
|
||||||
|
rtv = funcMap(match, func(m Match) []string {
|
||||||
|
return funcMap(m, func(g Group) string {
|
||||||
|
if g.IsValid() {
|
||||||
|
return str[g.StartIdx:g.EndIdx]
|
||||||
|
} else {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
})
|
||||||
|
})
|
||||||
|
return rtv
|
||||||
|
}
|
||||||
|
|
||||||
|
func addStateToList(str []rune, idx int, list []nfaState, state nfaState, threadGroups []Group, visited []nfaState, preferLongest bool) []nfaState {
|
||||||
if stateExists(list, state) || stateExists(visited, state) {
|
if stateExists(list, state) || stateExists(visited, state) {
|
||||||
return list
|
return list
|
||||||
}
|
}
|
||||||
@@ -199,32 +236,32 @@ func addStateToList(str []rune, idx int, list []nfaState, state nfaState, thread
|
|||||||
|
|
||||||
if state.isKleene || state.isQuestion {
|
if state.isKleene || state.isQuestion {
|
||||||
copyThread(state.splitState, state)
|
copyThread(state.splitState, state)
|
||||||
list = addStateToList(str, idx, list, *state.splitState, threadGroups, visited)
|
list = addStateToList(str, idx, list, *state.splitState, threadGroups, visited, preferLongest)
|
||||||
copyThread(state.next, state)
|
copyThread(state.next, state)
|
||||||
list = addStateToList(str, idx, list, *state.next, threadGroups, visited)
|
list = addStateToList(str, idx, list, *state.next, threadGroups, visited, preferLongest)
|
||||||
return list
|
return list
|
||||||
}
|
}
|
||||||
if state.isAlternation {
|
if state.isAlternation {
|
||||||
copyThread(state.next, state)
|
copyThread(state.next, state)
|
||||||
list = addStateToList(str, idx, list, *state.next, threadGroups, visited)
|
list = addStateToList(str, idx, list, *state.next, threadGroups, visited, preferLongest)
|
||||||
copyThread(state.splitState, state)
|
copyThread(state.splitState, state)
|
||||||
list = addStateToList(str, idx, list, *state.splitState, threadGroups, visited)
|
list = addStateToList(str, idx, list, *state.splitState, threadGroups, visited, preferLongest)
|
||||||
return list
|
return list
|
||||||
}
|
}
|
||||||
state.threadGroups = append([]Group{}, threadGroups...)
|
state.threadGroups = append([]Group{}, threadGroups...)
|
||||||
if state.assert != noneAssert {
|
if state.assert != noneAssert {
|
||||||
if state.checkAssertion(str, idx) {
|
if state.checkAssertion(str, idx, preferLongest) {
|
||||||
copyThread(state.next, state)
|
copyThread(state.next, state)
|
||||||
return addStateToList(str, idx, list, *state.next, state.threadGroups, visited)
|
return addStateToList(str, idx, list, *state.next, state.threadGroups, visited, preferLongest)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if state.groupBegin {
|
if state.groupBegin {
|
||||||
state.threadGroups[state.groupNum].StartIdx = idx
|
state.threadGroups[state.groupNum].StartIdx = idx
|
||||||
return addStateToList(str, idx, list, *state.next, state.threadGroups, visited)
|
return addStateToList(str, idx, list, *state.next, state.threadGroups, visited, preferLongest)
|
||||||
}
|
}
|
||||||
if state.groupEnd {
|
if state.groupEnd {
|
||||||
state.threadGroups[state.groupNum].EndIdx = idx
|
state.threadGroups[state.groupNum].EndIdx = idx
|
||||||
return addStateToList(str, idx, list, *state.next, state.threadGroups, visited)
|
return addStateToList(str, idx, list, *state.next, state.threadGroups, visited, preferLongest)
|
||||||
}
|
}
|
||||||
return append(list, state)
|
return append(list, state)
|
||||||
|
|
||||||
@@ -233,7 +270,7 @@ func addStateToList(str []rune, idx int, list []nfaState, state nfaState, thread
|
|||||||
// Helper for FindAllMatches. Returns whether it found a match, the
|
// Helper for FindAllMatches. Returns whether it found a match, the
|
||||||
// first Match it finds, and how far it got into the string ie. where
|
// first Match it finds, and how far it got into the string ie. where
|
||||||
// the next search should start from.
|
// the next search should start from.
|
||||||
func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups int) (bool, Match, int) {
|
func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups int, preferLongest bool) (bool, Match, int) {
|
||||||
// Base case - exit if offset exceeds string's length
|
// Base case - exit if offset exceeds string's length
|
||||||
if offset > len(str) {
|
if offset > len(str) {
|
||||||
// The second value here shouldn't be used, because we should exit when the third return value is > than len(str)
|
// The second value here shouldn't be used, because we should exit when the third return value is > than len(str)
|
||||||
@@ -248,7 +285,7 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
|
|||||||
// If the first state is an assertion, makes sure the assertion
|
// If the first state is an assertion, makes sure the assertion
|
||||||
// is true before we do _anything_ else.
|
// is true before we do _anything_ else.
|
||||||
if start.assert != noneAssert {
|
if start.assert != noneAssert {
|
||||||
if start.checkAssertion(str, offset) == false {
|
if start.checkAssertion(str, offset, preferLongest) == false {
|
||||||
i++
|
i++
|
||||||
return false, []Group{}, i
|
return false, []Group{}, i
|
||||||
}
|
}
|
||||||
@@ -256,7 +293,7 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
|
|||||||
|
|
||||||
start.threadGroups = newMatch(numGroups + 1)
|
start.threadGroups = newMatch(numGroups + 1)
|
||||||
start.threadGroups[0].StartIdx = i
|
start.threadGroups[0].StartIdx = i
|
||||||
currentStates = addStateToList(str, i, currentStates, *start, start.threadGroups, nil)
|
currentStates = addStateToList(str, i, currentStates, *start, start.threadGroups, nil, preferLongest)
|
||||||
var match Match = nil
|
var match Match = nil
|
||||||
for idx := i; idx <= len(str); idx++ {
|
for idx := i; idx <= len(str); idx++ {
|
||||||
if len(currentStates) == 0 {
|
if len(currentStates) == 0 {
|
||||||
@@ -273,10 +310,12 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
|
|||||||
if currentState.isLast {
|
if currentState.isLast {
|
||||||
currentState.threadGroups[0].EndIdx = idx
|
currentState.threadGroups[0].EndIdx = idx
|
||||||
match = append([]Group{}, currentState.threadGroups...)
|
match = append([]Group{}, currentState.threadGroups...)
|
||||||
break
|
if !preferLongest {
|
||||||
} else if !currentState.isAlternation && !currentState.isKleene && !currentState.isQuestion && !currentState.groupBegin && !currentState.groupEnd { // Normal character or assertion
|
break
|
||||||
if currentState.contentContains(str, idx) {
|
}
|
||||||
nextStates = addStateToList(str, idx+1, nextStates, *currentState.next, currentState.threadGroups, nil)
|
} else if !currentState.isAlternation && !currentState.isKleene && !currentState.isQuestion && !currentState.groupBegin && !currentState.groupEnd && currentState.assert == noneAssert { // Normal character
|
||||||
|
if currentState.contentContains(str, idx, preferLongest) {
|
||||||
|
nextStates = addStateToList(str, idx+1, nextStates, *currentState.next, currentState.threadGroups, nil, preferLongest)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -291,3 +330,131 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
|
|||||||
}
|
}
|
||||||
return false, []Group{}, i + 1
|
return false, []Group{}, i + 1
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Expand appends template to dst, expanding any variables in template to the relevant capturing group.
|
||||||
|
//
|
||||||
|
// A variable is of the form '$n', where 'n' is a number. It will be replaced by the contents of the n-th capturing group.
|
||||||
|
// To insert a literal $, do not put a number after it. Alternatively, you can use $$.
|
||||||
|
// src is the input string, and match must be the result of [Reg.FindSubmatch].
|
||||||
|
func (re Reg) Expand(dst string, template string, src string, match Match) string {
|
||||||
|
templateRuneSlc := []rune(template)
|
||||||
|
srcRuneSlc := []rune(src)
|
||||||
|
i := 0
|
||||||
|
for i < len(templateRuneSlc) {
|
||||||
|
c := templateRuneSlc[i]
|
||||||
|
if c == '$' {
|
||||||
|
i += 1
|
||||||
|
// The dollar sign is the last character of the string, or it is proceeded by another dollar sign
|
||||||
|
if i >= len(templateRuneSlc) || templateRuneSlc[i] == '$' {
|
||||||
|
dst += "$"
|
||||||
|
i++
|
||||||
|
} else {
|
||||||
|
numStr := ""
|
||||||
|
for i < len(templateRuneSlc) && unicode.IsDigit(templateRuneSlc[i]) {
|
||||||
|
numStr += string(templateRuneSlc[i])
|
||||||
|
i++
|
||||||
|
}
|
||||||
|
if numStr == "" {
|
||||||
|
dst += "$"
|
||||||
|
} else {
|
||||||
|
num, _ := strconv.Atoi(numStr)
|
||||||
|
if num < len(match) {
|
||||||
|
dst += string(srcRuneSlc[match[num].StartIdx:match[num].EndIdx])
|
||||||
|
} else {
|
||||||
|
dst += "$" + numStr
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
dst += string(c)
|
||||||
|
i++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return dst
|
||||||
|
}
|
||||||
|
|
||||||
|
// LiteralPrefix returns a string that must begin any match of the given regular expression.
|
||||||
|
// The second return value is true if the string comprises the entire expression.
|
||||||
|
func (re Reg) LiteralPrefix() (prefix string, complete bool) {
|
||||||
|
state := re.start
|
||||||
|
if state.assert != noneAssert {
|
||||||
|
state = state.next
|
||||||
|
}
|
||||||
|
for !(state.isLast) && (!state.isAlternation) && len(state.content) == 1 && state.assert == noneAssert {
|
||||||
|
if state.groupBegin || state.groupEnd {
|
||||||
|
state = state.next
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
prefix += string(rune(state.content[0]))
|
||||||
|
state = state.next
|
||||||
|
}
|
||||||
|
if state.isLast {
|
||||||
|
complete = true
|
||||||
|
} else {
|
||||||
|
complete = false
|
||||||
|
}
|
||||||
|
return prefix, complete
|
||||||
|
}
|
||||||
|
|
||||||
|
// ReplaceAll replaces all matches of the expression in src, with the text in repl. In repl, variables are interpreted
|
||||||
|
// as they are in [Reg.Expand]. The resulting string is returned.
|
||||||
|
func (re Reg) ReplaceAll(src string, repl string) string {
|
||||||
|
matches := re.FindAllSubmatch(src)
|
||||||
|
i := 0
|
||||||
|
currentMatch := 0
|
||||||
|
dst := ""
|
||||||
|
for i < len(src) {
|
||||||
|
if currentMatch < len(matches) && matches[currentMatch][0].IsValid() && i == matches[currentMatch][0].StartIdx {
|
||||||
|
dst += re.Expand("", repl, src, matches[currentMatch])
|
||||||
|
i = matches[currentMatch][0].EndIdx
|
||||||
|
currentMatch++
|
||||||
|
} else {
|
||||||
|
dst += string(src[i])
|
||||||
|
i++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return dst
|
||||||
|
}
|
||||||
|
|
||||||
|
// ReplaceAllLiteral replaces all matches of the expression in src, with the text in repl. The text is replaced directly,
|
||||||
|
// without any expansion.
|
||||||
|
func (re Reg) ReplaceAllLiteral(src string, repl string) string {
|
||||||
|
zerogroups := re.FindAll(src)
|
||||||
|
currentMatch := 0
|
||||||
|
i := 0
|
||||||
|
dst := ""
|
||||||
|
|
||||||
|
for i < len(src) {
|
||||||
|
if currentMatch < len(zerogroups) && i == zerogroups[currentMatch].StartIdx {
|
||||||
|
dst += repl
|
||||||
|
i = zerogroups[currentMatch].EndIdx
|
||||||
|
currentMatch += 1
|
||||||
|
} else {
|
||||||
|
dst += string(src[i])
|
||||||
|
i++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return dst
|
||||||
|
}
|
||||||
|
|
||||||
|
// ReplaceAllFunc replaces every match of the expression in src, with the return value of the function replFunc.
|
||||||
|
// replFunc takes in the matched string. The return value is substituted in directly without expasion.
|
||||||
|
func (re Reg) ReplaceAllFunc(src string, replFunc func(string) string) string {
|
||||||
|
zerogroups := re.FindAll(src)
|
||||||
|
currentMatch := 0
|
||||||
|
i := 0
|
||||||
|
dst := ""
|
||||||
|
|
||||||
|
for i < len(src) {
|
||||||
|
if currentMatch < len(zerogroups) && i == zerogroups[currentMatch].StartIdx {
|
||||||
|
dst += replFunc(src[zerogroups[currentMatch].StartIdx:zerogroups[currentMatch].EndIdx])
|
||||||
|
i = zerogroups[currentMatch].EndIdx
|
||||||
|
currentMatch += 1
|
||||||
|
} else {
|
||||||
|
dst += string(src[i])
|
||||||
|
i++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return dst
|
||||||
|
|
||||||
|
}
|
||||||
|
|||||||
@@ -133,7 +133,7 @@ func resetThreadsHelper(state *nfaState, visitedMap map[*nfaState]bool) {
|
|||||||
|
|
||||||
// Checks if the given state's assertion is true. Returns true if the given
|
// Checks if the given state's assertion is true. Returns true if the given
|
||||||
// state doesn't have an assertion.
|
// state doesn't have an assertion.
|
||||||
func (s nfaState) checkAssertion(str []rune, idx int) bool {
|
func (s nfaState) checkAssertion(str []rune, idx int, preferLongest bool) bool {
|
||||||
if s.assert == alwaysTrueAssert {
|
if s.assert == alwaysTrueAssert {
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
@@ -183,7 +183,7 @@ func (s nfaState) checkAssertion(str []rune, idx int) bool {
|
|||||||
strToMatch = string(runesToMatch)
|
strToMatch = string(runesToMatch)
|
||||||
}
|
}
|
||||||
|
|
||||||
regComp := Reg{startState, s.lookaroundNumCaptureGroups, s.lookaroundRegex}
|
regComp := Reg{startState, s.lookaroundNumCaptureGroups, s.lookaroundRegex, preferLongest}
|
||||||
matchIndices := regComp.FindAll(strToMatch)
|
matchIndices := regComp.FindAll(strToMatch)
|
||||||
|
|
||||||
numMatchesFound := 0
|
numMatchesFound := 0
|
||||||
@@ -210,9 +210,9 @@ func (s nfaState) checkAssertion(str []rune, idx int) bool {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Returns true if the contents of 's' contain the value at the given index of the given string
|
// Returns true if the contents of 's' contain the value at the given index of the given string
|
||||||
func (s nfaState) contentContains(str []rune, idx int) bool {
|
func (s nfaState) contentContains(str []rune, idx int, preferLongest bool) bool {
|
||||||
if s.assert != noneAssert {
|
if s.assert != noneAssert {
|
||||||
return s.checkAssertion(str, idx)
|
return s.checkAssertion(str, idx, preferLongest)
|
||||||
}
|
}
|
||||||
if idx >= len(str) {
|
if idx >= len(str) {
|
||||||
return false
|
return false
|
||||||
|
|||||||
@@ -25,7 +25,9 @@ var reTests = []struct {
|
|||||||
{"a*b", nil, "qwqw", []Group{}},
|
{"a*b", nil, "qwqw", []Group{}},
|
||||||
{"(abc)*", nil, "abcabcabc", []Group{{0, 9}, {9, 9}}},
|
{"(abc)*", nil, "abcabcabc", []Group{{0, 9}, {9, 9}}},
|
||||||
{"((abc)|(def))*", nil, "abcdef", []Group{{0, 6}, {6, 6}}},
|
{"((abc)|(def))*", nil, "abcdef", []Group{{0, 6}, {6, 6}}},
|
||||||
{"(abc)*|(def)*", nil, "abcdef", []Group{{0, 3}, {3, 6}, {6, 6}}},
|
// This match will only happen with Longest()
|
||||||
|
// {"(abc)*|(def)*", nil, "abcdef", []Group{{0, 3}, {3, 6}, {6, 6}}},
|
||||||
|
{"(abc)*|(def)*", nil, "abcdef", []Group{{0, 3}, {3, 3}, {4, 4}, {5, 5}, {6, 6}}},
|
||||||
{"b*a*a", nil, "bba", []Group{{0, 3}}},
|
{"b*a*a", nil, "bba", []Group{{0, 3}}},
|
||||||
{"(ab)+", nil, "abcabddd", []Group{{0, 2}, {3, 5}}},
|
{"(ab)+", nil, "abcabddd", []Group{{0, 2}, {3, 5}}},
|
||||||
{"a(b(c|d)*)*", nil, "abccbd", []Group{{0, 6}}},
|
{"a(b(c|d)*)*", nil, "abccbd", []Group{{0, 6}}},
|
||||||
@@ -537,7 +539,9 @@ var groupTests = []struct {
|
|||||||
{"(a+)|(a)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 4}, {-1, -1}}}},
|
{"(a+)|(a)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 4}, {-1, -1}}}},
|
||||||
{"(a+)(aa)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 2}, {2, 4}}}},
|
{"(a+)(aa)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 2}, {2, 4}}}},
|
||||||
{"(aaaa)|(aaaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 4}, {-1, -1}}}},
|
{"(aaaa)|(aaaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 4}, {-1, -1}}}},
|
||||||
{"(aaa)|(aaaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {-1, -1}, {0, 4}}}},
|
// This match will only happen with Longest()
|
||||||
|
// {"(aaa)|(aaaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {-1, -1}, {0, 4}}}},
|
||||||
|
{"(aaa)|(aaaa)", nil, "aaaa", []Match{[]Group{{0, 3}, {0, 3}, {-1, -1}}}},
|
||||||
{"(aaaa)|(aaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 4}, {-1, -1}}}},
|
{"(aaaa)|(aaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 4}, {-1, -1}}}},
|
||||||
{"(a)|(aa)", nil, "aa", []Match{[]Group{{0, 1}, {0, 1}}, []Group{{1, 2}, {1, 2}}}},
|
{"(a)|(aa)", nil, "aa", []Match{[]Group{{0, 1}, {0, 1}}, []Group{{1, 2}, {1, 2}}}},
|
||||||
{"(a?)a?", nil, "b", []Match{[]Group{{0, 0}, {0, 0}}, []Group{{1, 1}, {1, 1}}}},
|
{"(a?)a?", nil, "b", []Match{[]Group{{0, 0}, {0, 0}}, []Group{{1, 1}, {1, 1}}}},
|
||||||
@@ -857,6 +861,60 @@ func TestFindStringSubmatch(t *testing.T) {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestFindAllStringSubmatch(t *testing.T) {
|
||||||
|
for _, test := range groupTests {
|
||||||
|
t.Run(test.re+" "+test.str, func(t *testing.T) {
|
||||||
|
regComp, err := Compile(test.re, test.flags...)
|
||||||
|
if err != nil {
|
||||||
|
if test.result != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
matchStrs := regComp.FindAllStringSubmatch(test.str)
|
||||||
|
if matchStrs == nil {
|
||||||
|
if len(test.result) != 0 {
|
||||||
|
expectedStrs := funcMap(test.result, func(m Match) []string {
|
||||||
|
return funcMap(m, func(g Group) string {
|
||||||
|
if g.IsValid() {
|
||||||
|
return test.str[g.StartIdx:g.EndIdx]
|
||||||
|
} else {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
})
|
||||||
|
})
|
||||||
|
t.Errorf("Wanted %v got no match\n", expectedStrs)
|
||||||
|
}
|
||||||
|
} else if len(test.result) == 0 {
|
||||||
|
t.Errorf("Wanted no match got %v\n", matchStrs)
|
||||||
|
} else {
|
||||||
|
expectedStrs := funcMap(test.result, func(m Match) []string {
|
||||||
|
return funcMap(m, func(g Group) string {
|
||||||
|
if g.IsValid() {
|
||||||
|
return test.str[g.StartIdx:g.EndIdx]
|
||||||
|
} else {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
})
|
||||||
|
})
|
||||||
|
for i, matchStr := range matchStrs {
|
||||||
|
for j, groupStr := range matchStr {
|
||||||
|
if groupStr == "" {
|
||||||
|
if j < len(expectedStrs[i]) && expectedStrs[i][j] != "" {
|
||||||
|
t.Errorf("Wanted %v Got %v\n", expectedStrs, matchStrs)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if expectedStrs[i][j] != groupStr {
|
||||||
|
t.Errorf("Wanted %v Got %v\n", expectedStrs, matchStrs)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestFindAllSubmatch(t *testing.T) {
|
func TestFindAllSubmatch(t *testing.T) {
|
||||||
for _, test := range groupTests {
|
for _, test := range groupTests {
|
||||||
t.Run(test.re+" "+test.str, func(t *testing.T) {
|
t.Run(test.re+" "+test.str, func(t *testing.T) {
|
||||||
|
|||||||
Reference in New Issue
Block a user