Implement PCRE Matching (prefer left-branch) #2

Merged
Rockingcool merged 48 commits from implementPCREMatchingRules into master 4 weeks ago

@ -6,8 +6,8 @@ fmt:
vet: fmt vet: fmt
go vet ./... go vet ./...
buildLib: vet buildLib: vet
go build -gcflags="-N -l" ./... go build -gcflags="all=-N -l" ./...
buildCmd: buildLib buildCmd: buildLib
go build -C cmd/ -gcflags="-N -l" -o re ./... go build -C cmd/ -gcflags="all=-N -l" -o re ./...
test: buildCmd test: buildCmd
go test -v ./... go test -v ./...

@ -16,7 +16,6 @@ func (s *uniq_arr[T]) add(vals ...T) {
s.backingMap[item] = struct{}{} s.backingMap[item] = struct{}{}
} }
} }
return
} }
func (s uniq_arr[T]) contains(val T) bool { func (s uniq_arr[T]) contains(val T) bool {

@ -12,16 +12,27 @@ var notDotChars []rune
// A Reg represents the result of compiling a regular expression. It contains // A Reg represents the result of compiling a regular expression. It contains
// the startState of the NFA representation of the regex, and the number of capturing // the startState of the NFA representation of the regex, and the number of capturing
// groups in the regex. // groups in the regex. It also contains the expression string.
type Reg struct { type Reg struct {
start *nfaState start *nfaState
numGroups int numGroups int
str string
preferLongest bool
} }
// numSubexp eturns the number of sub-expressions in the given [Reg]. This is equivalent // NumSubexp returns the number of sub-expressions in the given [Reg]. This is equivalent
// to the number of capturing groups. // to the number of capturing groups.
func (r Reg) NumSubexp() int { func (re Reg) NumSubexp() int {
return r.numGroups return re.numGroups
}
// String returns the string used to compile the expression.
func (re Reg) String() string {
return re.str
}
func (re *Reg) Longest() {
re.preferLongest = true
} }
const concatRune rune = 0xF0001 const concatRune rune = 0xF0001
@ -816,13 +827,12 @@ func thompson(re []postfixNode) (Reg, error) {
// In these cases, we will return an NFA with 1 state, with an assertion that is always true. // In these cases, we will return an NFA with 1 state, with an assertion that is always true.
if len(re) == 0 { if len(re) == 0 {
start := zeroLengthMatchState() start := zeroLengthMatchState()
nfa = append(nfa, &start) nfa = append(nfa, start)
} }
for _, c := range re { for _, c := range re {
if c.nodetype == characterNode || c.nodetype == assertionNode { if c.nodetype == characterNode || c.nodetype == assertionNode {
stateToAdd := nfaState{} stateToAdd := nfaState{}
stateToAdd.transitions = make(map[int][]*nfaState)
if c.allChars { if c.allChars {
stateToAdd.allChars = true stateToAdd.allChars = true
if len(c.except) != 0 { if len(c.except) != 0 {
@ -934,7 +944,6 @@ func thompson(re []postfixNode) (Reg, error) {
s.isEmpty = true s.isEmpty = true
s.output = make([]*nfaState, 0) s.output = make([]*nfaState, 0)
s.output = append(s.output, s) s.output = append(s.output, s)
s.transitions = make(map[int][]*nfaState)
// LPAREN nodes are just added normally // LPAREN nodes are just added normally
if c.nodetype == lparenNode { if c.nodetype == lparenNode {
numGroups++ numGroups++
@ -966,7 +975,7 @@ func thompson(re []postfixNode) (Reg, error) {
s.groupNum = lparenNode.groupNum s.groupNum = lparenNode.groupNum
to_add := concatenate(lparenNode, s) to_add := concatenate(lparenNode, s)
nfa = append(nfa, to_add) nfa = append(nfa, to_add)
} else if middleNode.groupBegin && len(middleNode.transitions) == 0 { // The middle node is a lone lparen - something like '(())', and I'm looking at the first rparen } else if middleNode.groupBegin && middleNode.numTransitions() == 0 { // The middle node is a lone lparen - something like '(())', and I'm looking at the first rparen
nfa = append(nfa, lparenNode) // I shouldn't have popped this out, because it is not involved in the current capturing group nfa = append(nfa, lparenNode) // I shouldn't have popped this out, because it is not involved in the current capturing group
s.groupNum = middleNode.groupNum // In this case, the 'middle' node is actually an lparen s.groupNum = middleNode.groupNum // In this case, the 'middle' node is actually an lparen
to_add := concatenate(middleNode, s) to_add := concatenate(middleNode, s)
@ -989,7 +998,8 @@ func thompson(re []postfixNode) (Reg, error) {
if c.nodetype == charclassNode { // A Character class consists of all the nodes in it, alternated if c.nodetype == charclassNode { // A Character class consists of all the nodes in it, alternated
// Map the list of nodes to a list of states, each state containing the contents of a specific node // Map the list of nodes to a list of states, each state containing the contents of a specific node
states := funcMap(c.nodeContents, func(node postfixNode) *nfaState { states := funcMap(c.nodeContents, func(node postfixNode) *nfaState {
s := newState() s := &nfaState{}
s.output = append(s.output, s)
nodeContents := node.contents nodeContents := node.contents
if caseInsensitive { if caseInsensitive {
nodeContents = slices.Concat(funcMap(nodeContents, func(r rune) []rune { nodeContents = slices.Concat(funcMap(nodeContents, func(r rune) []rune {
@ -1003,7 +1013,7 @@ func thompson(re []postfixNode) (Reg, error) {
return n.contents return n.contents
})...) })...)
} }
return &s return s
}) })
// Reduce the list of states down to a single state by alternating them // Reduce the list of states down to a single state by alternating them
toAdd := funcReduce(states, func(s1 *nfaState, s2 *nfaState) *nfaState { toAdd := funcReduce(states, func(s1 *nfaState, s2 *nfaState) *nfaState {
@ -1030,14 +1040,14 @@ func thompson(re []postfixNode) (Reg, error) {
if err != nil { if err != nil {
return Reg{}, fmt.Errorf("error applying kleene star") return Reg{}, fmt.Errorf("error applying kleene star")
} }
stateToAdd, err := kleene(*s1) stateToAdd, err := kleene(s1)
if err != nil { if err != nil {
return Reg{}, err return Reg{}, err
} }
nfa = append(nfa, stateToAdd) nfa = append(nfa, stateToAdd)
case plusNode: // a+ is equivalent to aa* case plusNode: // a+ is equivalent to aa*
s1 := mustPop(&nfa) s1 := mustPop(&nfa)
s2, err := kleene(*s1) s2, err := kleene(s1)
if err != nil { if err != nil {
return Reg{}, err return Reg{}, err
} }
@ -1048,7 +1058,10 @@ func thompson(re []postfixNode) (Reg, error) {
if err != nil { if err != nil {
return Reg{}, fmt.Errorf("error applying question operator") return Reg{}, fmt.Errorf("error applying question operator")
} }
s2 := question(s1) s2, err := question(s1)
if err != nil {
return Reg{}, err
}
nfa = append(nfa, s2) nfa = append(nfa, s2)
case pipeNode: case pipeNode:
// A pipe operator doesn't actually need either operand to be present. If an operand isn't present, // A pipe operator doesn't actually need either operand to be present. If an operand isn't present,
@ -1059,21 +1072,21 @@ func thompson(re []postfixNode) (Reg, error) {
// '|a' // '|a'
// '^a|' // '^a|'
// '^|a' // '^|a'
s1, err1 := pop(&nfa) s2, err1 := pop(&nfa)
s2, err2 := pop(&nfa) s1, err2 := pop(&nfa)
if err2 != nil || (s2.groupBegin && len(s2.transitions) == 0) { // Doesn't exist, or its just an LPAREN if err2 != nil || (s2.groupBegin && s2.numTransitions() == 0) { // Doesn't exist, or its just an LPAREN
if err2 == nil { // Roundabout way of saying that this node existed, but it was an LPAREN, so we append it back if err2 == nil { // Roundabout way of saying that this node existed, but it was an LPAREN, so we append it back
nfa = append(nfa, s2) nfa = append(nfa, s2)
} }
tmp := zeroLengthMatchState() tmp := zeroLengthMatchState()
s2 = &tmp s2 = tmp
} }
if err1 != nil || (s1.groupBegin && len(s1.transitions) == 0) { // Doesn't exist, or its just an LPAREN if err1 != nil || (s1.groupBegin && s1.numTransitions() == 0) { // Doesn't exist, or its just an LPAREN
if err1 == nil { // See above for explanation if err1 == nil { // See above for explanation
nfa = append(nfa, s1) nfa = append(nfa, s1)
} }
tmp := zeroLengthMatchState() tmp := zeroLengthMatchState()
s1 = &tmp s1 = tmp
} }
s3 := alternate(s1, s2) s3 := alternate(s1, s2)
nfa = append(nfa, s3) nfa = append(nfa, s3)
@ -1100,14 +1113,18 @@ func thompson(re []postfixNode) (Reg, error) {
stateToAdd = concatenate(stateToAdd, cloneState(poppedState)) stateToAdd = concatenate(stateToAdd, cloneState(poppedState))
} }
if c.endReps == infinite_reps { // Case 3 if c.endReps == infinite_reps { // Case 3
s2, err := kleene(*poppedState) s2, err := kleene(poppedState)
if err != nil { if err != nil {
return Reg{}, err return Reg{}, err
} }
stateToAdd = concatenate(stateToAdd, s2) stateToAdd = concatenate(stateToAdd, s2)
} else { // Case 2 } else { // Case 2
for i := c.startReps; i < c.endReps; i++ { for i := c.startReps; i < c.endReps; i++ {
stateToAdd = concatenate(stateToAdd, question(cloneState(poppedState))) tmp, err := question(cloneState(poppedState))
if err != nil {
return Reg{}, fmt.Errorf("error processing bounded repetition")
}
stateToAdd = concatenate(stateToAdd, tmp)
} }
} }
nfa = append(nfa, stateToAdd) nfa = append(nfa, stateToAdd)
@ -1117,9 +1134,13 @@ func thompson(re []postfixNode) (Reg, error) {
return Reg{}, fmt.Errorf("invalid regex") return Reg{}, fmt.Errorf("invalid regex")
} }
verifyLastStates(nfa) lastState := newState()
lastState.isLast = true
concatenate(nfa[0], &lastState)
return Reg{nfa[0], numGroups}, nil // The string is empty here, because we add it in Compile()
return Reg{nfa[0], numGroups, "", false}, nil
} }
@ -1137,10 +1158,11 @@ func Compile(re string, flags ...ReFlag) (Reg, error) {
if err != nil { if err != nil {
return Reg{}, fmt.Errorf("error compiling regex: %w", err) return Reg{}, fmt.Errorf("error compiling regex: %w", err)
} }
reg.str = re
return reg, nil return reg, nil
} }
// MustCompile panicks if Compile returns an error. They are identical in all other respects. // MustCompile panics if Compile returns an error. They are identical in all other respects.
func MustCompile(re string, flags ...ReFlag) Reg { func MustCompile(re string, flags ...ReFlag) Reg {
reg, err := Compile(re, flags...) reg, err := Compile(re, flags...)
if err != nil { if err != nil {

@ -4,6 +4,8 @@ Package regex implements regular expression search, using a custom non-bracktrac
The engine relies completely on UTF-8 codepoints. As such, it is capable of matching characters The engine relies completely on UTF-8 codepoints. As such, it is capable of matching characters
from other languages, emojis and symbols. from other languages, emojis and symbols.
The API and regex syntax are largely compatible with that of the stdlib's [regexp], with a few key differences (see 'Key Differences with regexp').
The full syntax is specified below. The full syntax is specified below.
# Syntax # Syntax
@ -55,8 +57,8 @@ POSIX classes (inside normal character classes):
Composition: Composition:
def Match d, followed by e, followed by f def Match d, followed by e, followed by f
x|y Match x or y (prefer longer one) x|y Match x or y (prefer x)
xy|z Match xy or z xy|z Match xy or z (prefer xy)
Repetition (always greedy, preferring more):
@ -94,10 +96,11 @@ Lookarounds:
Numeric ranges: Numeric ranges:
<x-y> Match any number from x to y (inclusive) (x and y must be positive numbers) <x-y> Match any number from x to y (inclusive) (x and y must be positive numbers)
\<x Match a literal '<' followed by x
# Key Differences with regexp # Key Differences with regexp
The engine and the API differ from [regexp] in a number of ways, some of them very subtle. The engine and the API differ from [regexp] in a few ways, some of them very subtle.
The key differences are mentioned below. The key differences are mentioned below.
1. Greediness: 1. Greediness:
@ -132,7 +135,7 @@ Rather than using primitives for return values, my engine defines two types that
values: a [Group] represents a capturing group, and a [Match] represents a list of groups. values: a [Group] represents a capturing group, and a [Match] represents a list of groups.
[regexp] specifies a regular expression that gives a list of all the matching functions that it supports. The [regexp] specifies a regular expression that gives a list of all the matching functions that it supports. The
equivalent expression for this engine is: equivalent expression for this engine is shown below. Note that 'Index' is the default.
Find(All)?(String)?(Submatch)? Find(All)?(String)?(Submatch)?
@ -140,7 +143,7 @@ equivalent expression for this engine is:
If a function contains 'All' it returns all matches instead of just the leftmost one. If a function contains 'All' it returns all matches instead of just the leftmost one.
If a function contains 'String' it returns the matched text, rather than the indices. If a function contains 'String' it returns the matched text, rather than the index in the string.
If a function contains 'Submatch' it returns the match, including all submatches found by If a function contains 'Submatch' it returns the match, including all submatches found by
capturing groups. capturing groups.
@ -156,5 +159,20 @@ and the input string:
The 0th group would contain 'xy' and the 1st group would contain 'y'. Any matching function without 'Submatch' in its name The 0th group would contain 'xy' and the 1st group would contain 'y'. Any matching function without 'Submatch' in its name
returns the 0-group. returns the 0-group.
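As a rough sketch of how these names map onto calls (return values are paraphrased from the description above, not verified output):

	re := regex.MustCompile(`x(y)`)
	re.FindString("xy")          // "xy" - the text of the 0-group
	re.FindSubmatch("xy")        // a Match holding the indices of the 0-group and the 1-group
	re.FindStringSubmatch("xy")  // []string{"xy", "y"}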
# Feature Differences
The following features from [regexp] are (currently) NOT supported:
1. Named capturing groups
2. Non-greedy operators
3. Unicode character classes
4. Embedded flags (flags are passed as arguments to [Compile])
5. Literal text with \Q ... \E
The following features are not available in [regexp], but are supported in my engine:
1. Lookarounds
2. Numeric ranges
I hope to shorten the first list, and expand the second.
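As a small illustration of the second list (assuming the usual PCRE-style lookahead syntax):

	a(?=b)		Matches the 'a' in "ab", but not the 'a' in "ac"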
*/ */
package regex package regex

@ -52,3 +52,40 @@ func ExampleReg_FindSubmatch() {
// 0 1 // 0 1
// 2 3 // 2 3
} }
func ExampleReg_Expand() {
inputStr := `option1: value1
option2: value2`
regexStr := `(\w+): (\w+)`
templateStr := "$1 = $2\n"
regexComp := regex.MustCompile(regexStr, regex.RE_MULTILINE)
result := ""
for _, submatches := range regexComp.FindAllSubmatch(inputStr) {
result = regexComp.Expand(result, templateStr, inputStr, submatches)
}
fmt.Println(result)
// Output: option1 = value1
// option2 = value2
}
func ExampleReg_LiteralPrefix() {
regexStr := `a(b|c)d*`
regexComp := regex.MustCompile(regexStr)
prefix, complete := regexComp.LiteralPrefix()
fmt.Println(prefix)
fmt.Println(complete)
// Output: a
// false
}
func ExampleReg_Longest() {
regexStr := `x|xx`
inputStr := "xx"
regexComp := regex.MustCompile(regexStr)
fmt.Println(regexComp.FindString(inputStr))
regexComp.Longest()
fmt.Println(regexComp.FindString(inputStr))
// Output: x
// xx
}
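A similar example could be added for the new FindStringSubmatch method; this is only a sketch mirroring the x(y) illustration from the package documentation, and the expected output is inferred from the documented semantics rather than taken from the PR:

func ExampleReg_FindStringSubmatch() {
	regexStr := `x(y)`
	inputStr := "xy"
	regexComp := regex.MustCompile(regexStr)
	submatches := regexComp.FindStringSubmatch(inputStr)
	fmt.Println(submatches[0])
	fmt.Println(submatches[1])
	// Output: xy
	// y
}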

@ -2,8 +2,8 @@ package regex
import ( import (
"fmt" "fmt"
"slices" "strconv"
"sort" "unicode"
) )
// A Match represents a match found by the regex in a given string. // A Match represents a match found by the regex in a given string.
@ -15,7 +15,7 @@ import (
// See [Reg.FindSubmatch] for an example. // See [Reg.FindSubmatch] for an example.
type Match []Group type Match []Group
// a Group represents a group. It contains the start index and end index of the match // a Group represents a capturing group. It contains the start and end indices of the group.
type Group struct { type Group struct {
StartIdx int StartIdx int
EndIdx int EndIdx int
@ -30,17 +30,6 @@ func newMatch(size int) Match {
return toRet return toRet
} }
// Returns the number of valid groups in the match
func (m Match) numValidGroups() int {
numValid := 0
for _, g := range m {
if g.StartIdx >= 0 && g.EndIdx >= 0 {
numValid++
}
}
return numValid
}
// Returns a string containing the indices of all (valid) groups in the match // Returns a string containing the indices of all (valid) groups in the match
func (m Match) String() string { func (m Match) String() string {
var toRet string var toRet string
@ -59,7 +48,7 @@ func (idx Group) String() string {
return fmt.Sprintf("%d\t%d", idx.StartIdx, idx.EndIdx) return fmt.Sprintf("%d\t%d", idx.StartIdx, idx.EndIdx)
} }
// Returns whether a group is valid (ie. whether it matched any text). It // IsValid returns whether a group is valid (ie. whether it matched any text). It
// simply ensures that both indices of the group are >= 0. // simply ensures that both indices of the group are >= 0.
func (g Group) IsValid() bool { func (g Group) IsValid() bool {
return g.StartIdx >= 0 && g.EndIdx >= 0 return g.StartIdx >= 0 && g.EndIdx >= 0
@ -70,101 +59,42 @@ func getZeroGroup(m Match) Group {
return m[0] return m[0]
} }
// takeZeroState takes the 0-state (if such a transition exists) for all states in the func copyThread(to *nfaState, from nfaState) {
// given slice. It returns the resulting states. If any of the resulting states is a 0-state, to.threadGroups = append([]Group{}, from.threadGroups...)
// the second ret val is true.
// If a state begins or ends a capturing group, its 'thread' is updated to contain the correct index.
func takeZeroState(states []*nfaState, numGroups int, idx int) (rtv []*nfaState, isZero bool) {
for _, state := range states {
if len(state.transitions[epsilon]) > 0 {
for _, s := range state.transitions[epsilon] {
if s.threadGroups == nil {
s.threadGroups = newMatch(numGroups + 1)
}
copy(s.threadGroups, state.threadGroups)
if s.groupBegin {
s.threadGroups[s.groupNum].StartIdx = idx
// openParenGroups = append(openParenGroups, s.groupNum)
}
if s.groupEnd {
s.threadGroups[s.groupNum].EndIdx = idx
// closeParenGroups = append(closeParenGroups, s.groupNum)
}
}
rtv = append(rtv, state.transitions[epsilon]...)
}
}
for _, state := range rtv {
if len(state.transitions[epsilon]) > 0 {
return rtv, true
}
}
return rtv, false
} }
// zeroMatchPossible returns true if a zero-length match is possible // Find returns the 0-group of the leftmost match of the regex in the given string.
// from any of the given states, given the string and our position in it. // An error value != nil indicates that no match was found.
// It uses the same algorithm to find zero-states as the one inside the loop, func (re Reg) Find(str string) (Group, error) {
// so I should probably put it in a function. match, err := re.FindNthMatch(str, 1)
func zeroMatchPossible(str []rune, idx int, numGroups int, states ...*nfaState) bool { if err != nil {
zeroStates, isZero := takeZeroState(states, numGroups, idx) return Group{}, fmt.Errorf("no matches found")
tempstates := make([]*nfaState, 0, len(zeroStates)+len(states))
tempstates = append(tempstates, states...)
tempstates = append(tempstates, zeroStates...)
num_appended := 0 // number of unique states addded to tempstates
for isZero == true {
zeroStates, isZero = takeZeroState(tempstates, numGroups, idx)
tempstates, num_appended = uniqueAppend(tempstates, zeroStates...)
if num_appended == 0 { // break if we haven't appended any more unique values
break
}
}
for _, state := range tempstates {
if state.isEmpty && (state.assert == noneAssert || state.checkAssertion(str, idx)) && state.isLast {
return true
}
} }
return false return getZeroGroup(match), nil
} }
// Prunes the slice by removing overlapping indices. // Match returns a boolean value, indicating whether the regex found a match in the given string.
func pruneIndices(indices []Match) []Match { func (re Reg) Match(str string) bool {
// First, sort the slice by the start indices _, err := re.Find(str)
sort.Slice(indices, func(i, j int) bool { return err == nil
return indices[i][0].StartIdx < indices[j][0].StartIdx
})
toRet := make([]Match, 0, len(indices))
current := indices[0]
for _, idx := range indices[1:] {
// idx doesn't overlap with current (starts after current ends), so add current to result
// and update the current.
if idx[0].StartIdx >= current[0].EndIdx {
toRet = append(toRet, current)
current = idx
} else if idx[0].EndIdx > current[0].EndIdx {
// idx overlaps, but it is longer, so update current
current = idx
}
}
// Add last state
toRet = append(toRet, current)
return toRet
} }
// Find returns the 0-group of the leftmost match of the regex in the given string. // CompileMatch compiles expr and returns true if str contains a match of the expression.
// An error value != nil indicates that no match was found. // It is equivalent to [regexp.Match].
func (regex Reg) Find(str string) (Group, error) { // An optional list of flags may be provided (see [ReFlag]).
match, err := regex.FindNthMatch(str, 1) // It returns an error (!= nil) if there was an error compiling the expression.
func CompileMatch(expr string, str string, flags ...ReFlag) (bool, error) {
re, err := Compile(expr, flags...)
if err != nil { if err != nil {
return Group{}, fmt.Errorf("no matches found") return false, err
} }
return getZeroGroup(match), nil return re.Match(str), nil
} }
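For reference, a minimal usage sketch of CompileMatch (the pattern and input here are illustrative, not taken from the PR's tests):

	matched, err := regex.CompileMatch(`ab+c`, "xabbbcy")
	if err != nil {
		// handle the compilation error
	}
	fmt.Println(matched) // expected to print true, since "abbbc" occurs in the input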
// FindAll returns a slice containing all the 0-groups of the regex in the given string. // FindAll returns a slice containing all the 0-groups of the regex in the given string.
// A 0-group represents the match without any submatches. // A 0-group represents the match without any submatches.
func (regex Reg) FindAll(str string) []Group { func (re Reg) FindAll(str string) []Group {
indices := regex.FindAllSubmatch(str) indices := re.FindAllSubmatch(str)
zeroGroups := funcMap(indices, getZeroGroup) zeroGroups := funcMap(indices, getZeroGroup)
return zeroGroups return zeroGroups
} }
@ -173,8 +103,8 @@ func (regex Reg) FindAll(str string) []Group {
// The return value will be an empty string in two situations: // The return value will be an empty string in two situations:
// 1. No match was found // 1. No match was found
// 2. The match was an empty string // 2. The match was an empty string
func (regex Reg) FindString(str string) string { func (re Reg) FindString(str string) string {
match, err := regex.FindNthMatch(str, 1) match, err := re.FindNthMatch(str, 1)
if err != nil { if err != nil {
return "" return ""
} }
@ -187,8 +117,8 @@ func (regex Reg) FindString(str string) string {
// number of groups. The validity of a group (whether or not it matched anything) can be determined with // number of groups. The validity of a group (whether or not it matched anything) can be determined with
// [Group.IsValid], or by checking that both indices of the group are >= 0. // [Group.IsValid], or by checking that both indices of the group are >= 0.
// The second-return value is nil if no match was found. // The second-return value is nil if no match was found.
func (regex Reg) FindSubmatch(str string) (Match, error) { func (re Reg) FindSubmatch(str string) (Match, error) {
match, err := regex.FindNthMatch(str, 1) match, err := re.FindNthMatch(str, 1)
if err != nil { if err != nil {
return Match{}, fmt.Errorf("no match found") return Match{}, fmt.Errorf("no match found")
} else { } else {
@ -196,11 +126,41 @@ func (regex Reg) FindSubmatch(str string) (Match, error) {
} }
} }
// FindAllString is the 'all' version of FindString. // FindStringSubmatch is the 'string' version of [FindSubmatch]. It returns a slice of strings,
// where the string at index i contains the text matched by the i-th capturing group.
// The 0-th index represents the entire match.
// An empty string at index n could mean:
//
// 1. Group n did not find a match
// 2. Group n found a zero-length match
//
// A return value of nil indicates no match.
func (re Reg) FindStringSubmatch(str string) []string {
matchStr := make([]string, re.numGroups+1)
match, err := re.FindSubmatch(str)
if err != nil {
return nil
}
nonEmptyMatchFound := false
for i := range match {
if match[i].IsValid() {
matchStr[i] = str[match[i].StartIdx:match[i].EndIdx]
nonEmptyMatchFound = true
} else {
matchStr[i] = ""
}
}
if nonEmptyMatchFound == false {
return nil
}
return matchStr
}
// FindAllString is the 'all' version of [FindString].
// It returns a slice of strings containing the text of all matches of // It returns a slice of strings containing the text of all matches of
// the regex in the given string. // the regex in the given string.
func (regex Reg) FindAllString(str string) []string { func (re Reg) FindAllString(str string) []string {
zerogroups := regex.FindAll(str) zerogroups := re.FindAll(str)
matchStrs := funcMap(zerogroups, func(g Group) string { matchStrs := funcMap(zerogroups, func(g Group) string {
return str[g.StartIdx:g.EndIdx] return str[g.StartIdx:g.EndIdx]
}) })
@ -209,14 +169,14 @@ func (regex Reg) FindAllString(str string) []string {
// FindNthMatch returns the 'n'th match of the regex in the given string. // FindNthMatch returns the 'n'th match of the regex in the given string.
// It returns an error (!= nil) if there are fewer than 'n' matches in the string. // It returns an error (!= nil) if there are fewer than 'n' matches in the string.
func (regex Reg) FindNthMatch(str string, n int) (Match, error) { func (re Reg) FindNthMatch(str string, n int) (Match, error) {
idx := 0 idx := 0
matchNum := 0 matchNum := 0
str_runes := []rune(str) str_runes := []rune(str)
var matchFound bool var matchFound bool
var matchIdx Match var matchIdx Match
for idx <= len(str_runes) { for idx <= len(str_runes) {
matchFound, matchIdx, idx = findAllSubmatchHelper(regex.start, str_runes, idx, regex.numGroups) matchFound, matchIdx, idx = findAllSubmatchHelper(re.start, str_runes, idx, re.numGroups, re.preferLongest)
if matchFound { if matchFound {
matchNum++ matchNum++
} }
@ -229,31 +189,65 @@ func (regex Reg) FindNthMatch(str string, n int) (Match, error) {
} }
// FindAllSubmatch returns a slice of matches in the given string. // FindAllSubmatch returns a slice of matches in the given string.
func (regex Reg) FindAllSubmatch(str string) []Match { func (re Reg) FindAllSubmatch(str string) []Match {
idx := 0 idx := 0
str_runes := []rune(str) str_runes := []rune(str)
var matchFound bool var matchFound bool
var matchIdx Match var matchIdx Match
indices := make([]Match, 0) indices := make([]Match, 0)
for idx <= len(str_runes) { for idx <= len(str_runes) {
matchFound, matchIdx, idx = findAllSubmatchHelper(regex.start, str_runes, idx, regex.numGroups) matchFound, matchIdx, idx = findAllSubmatchHelper(re.start, str_runes, idx, re.numGroups, re.preferLongest)
if matchFound { if matchFound {
indices = append(indices, matchIdx) indices = append(indices, matchIdx)
} }
} }
if len(indices) > 0 {
return pruneIndices(indices)
}
return indices return indices
} }
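// addStateToList adds the given state to the list of active states. Instead of adding split nodes directly, it recursively
// follows kleene/question/alternation branches, satisfied assertions, and group-begin/group-end markers, copying the
// thread's capture groups as it goes. The visited slice guards against infinite recursion on cycles.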
func addStateToList(str []rune, idx int, list []nfaState, state nfaState, threadGroups []Group, visited []nfaState, preferLongest bool) []nfaState {
if stateExists(list, state) || stateExists(visited, state) {
return list
}
visited = append(visited, state)
if state.isKleene || state.isQuestion {
copyThread(state.splitState, state)
list = addStateToList(str, idx, list, *state.splitState, threadGroups, visited, preferLongest)
copyThread(state.next, state)
list = addStateToList(str, idx, list, *state.next, threadGroups, visited, preferLongest)
return list
}
if state.isAlternation {
copyThread(state.next, state)
list = addStateToList(str, idx, list, *state.next, threadGroups, visited, preferLongest)
copyThread(state.splitState, state)
list = addStateToList(str, idx, list, *state.splitState, threadGroups, visited, preferLongest)
return list
}
state.threadGroups = append([]Group{}, threadGroups...)
if state.assert != noneAssert {
if state.checkAssertion(str, idx, preferLongest) {
copyThread(state.next, state)
return addStateToList(str, idx, list, *state.next, state.threadGroups, visited, preferLongest)
}
}
if state.groupBegin {
state.threadGroups[state.groupNum].StartIdx = idx
return addStateToList(str, idx, list, *state.next, state.threadGroups, visited, preferLongest)
}
if state.groupEnd {
state.threadGroups[state.groupNum].EndIdx = idx
return addStateToList(str, idx, list, *state.next, state.threadGroups, visited, preferLongest)
}
return append(list, state)
}
// Helper for FindAllMatches. Returns whether it found a match, the // Helper for FindAllMatches. Returns whether it found a match, the
// first Match it finds, and how far it got into the string ie. where // first Match it finds, and how far it got into the string ie. where
// the next search should start from. // the next search should start from.
// func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups int, preferLongest bool) (bool, Match, int) {
// Might return duplicates or overlapping indices, so care must be taken to prune the resulting array.
func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups int) (bool, Match, int) {
// Base case - exit if offset exceeds string's length // Base case - exit if offset exceeds string's length
if offset > len(str) { if offset > len(str) {
// The second value here shouldn't be used, because we should exit when the third return value is > than len(str) // The second value here shouldn't be used, because we should exit when the third return value is > than len(str)
@ -261,214 +255,120 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
} }
resetThreads(start) resetThreads(start)
// Hold a list of match indices for the current run. When we currentStates := make([]nfaState, 0)
// can no longer find a match, the match with the largest range is nextStates := make([]nfaState, 0)
// chosen as the match for the entire string.
// This allows us to pick the longest possible match (which is how greedy matching works).
// COMMENT ABOVE IS CURRENTLY NOT UP-TO-DATE
tempIndices := newMatch(numGroups + 1)
foundPath := false
startIdx := offset
endIdx := offset
currentStates := make([]*nfaState, 0)
tempStates := make([]*nfaState, 0) // Used to store states that should be used in next loop iteration
i := offset // Index in string i := offset // Index in string
startingFrom := i // Store starting index
// If the first state is an assertion, make sure the assertion // If the first state is an assertion, make sure the assertion
// is true before we do _anything_ else. // is true before we do _anything_ else.
if start.assert != noneAssert { if start.assert != noneAssert {
if start.checkAssertion(str, offset) == false { if start.checkAssertion(str, offset, preferLongest) == false {
i++ i++
return false, []Group{}, i return false, []Group{}, i
} }
} }
// Increment until we hit a character matching the start state (assuming not 0-state)
if start.isEmpty == false {
for i < len(str) && !start.contentContains(str, i) {
i++
}
startIdx = i
startingFrom = i
i++ // Advance to next character (if we aren't at a 0-state, which doesn't match anything), so that we can check for transitions. If we advance at a 0-state, we will never get a chance to match the first character
}
start.threadGroups = newMatch(numGroups + 1) start.threadGroups = newMatch(numGroups + 1)
// Check if the start state begins a group - if so, add the start index to our list start.threadGroups[0].StartIdx = i
if start.groupBegin { currentStates = addStateToList(str, i, currentStates, *start, start.threadGroups, nil, preferLongest)
start.threadGroups[start.groupNum].StartIdx = i var match Match = nil
// tempIndices[start.groupNum].startIdx = i for idx := i; idx <= len(str); idx++ {
} if len(currentStates) == 0 {
currentStates = append(currentStates, start)
// Main loop
for i < len(str) {
foundPath = false
zeroStates := make([]*nfaState, 0)
// Keep taking zero-states, until there are no more left to take
// Objective: If any of our current states have transitions to 0-states, replace them with the 0-state. Do this until there are no more transitions to 0-states, or there are no more unique 0-states to take.
zeroStates, isZero := takeZeroState(currentStates, numGroups, i)
tempStates = append(tempStates, zeroStates...)
num_appended := 0
for isZero == true {
zeroStates, isZero = takeZeroState(tempStates, numGroups, i)
tempStates, num_appended = uniqueAppend(tempStates, zeroStates...)
if num_appended == 0 { // Break if we haven't appended any more unique values
break break
} }
} for currentStateIdx := 0; currentStateIdx < len(currentStates); currentStateIdx++ {
currentState := currentStates[currentStateIdx]
currentStates = slices.Concat(currentStates, tempStates) if currentState.threadGroups == nil {
tempStates = nil currentState.threadGroups = newMatch(numGroups + 1)
currentState.threadGroups[0].StartIdx = idx
}
// Take any transitions corresponding to current character if currentState.isLast {
numStatesMatched := 0 // The number of states which had at least 1 match for this round currentState.threadGroups[0].EndIdx = idx
assertionFailed := false // Whether or not an assertion failed for this round match = append([]Group{}, currentState.threadGroups...)
lastStateInList := false // Whether or not a last state was in our list of states if !preferLongest {
var lastStatePtr *nfaState = nil // Pointer to the last-state, if it was found
lastLookaroundInList := false // Whether or not a last state (that is a lookaround) was in our list of states
for numStatesMatched == 0 && lastStateInList == false {
if len(currentStates) == 0 {
break break
} }
state, _ := pop(&currentStates) } else if !currentState.isAlternation && !currentState.isKleene && !currentState.isQuestion && !currentState.groupBegin && !currentState.groupEnd && currentState.assert == noneAssert { // Normal character
matches, numMatches := state.matchesFor(str, i) if currentState.contentContains(str, idx, preferLongest) {
if numMatches > 0 { nextStates = addStateToList(str, idx+1, nextStates, *currentState.next, currentState.threadGroups, nil, preferLongest)
numStatesMatched++
tempStates = append([]*nfaState(nil), matches...)
foundPath = true
for _, m := range matches {
if m.threadGroups == nil {
m.threadGroups = newMatch(numGroups + 1)
} }
copy(m.threadGroups, state.threadGroups)
} }
} }
if numMatches < 0 { currentStates = append([]nfaState{}, nextStates...)
assertionFailed = true nextStates = nil
} }
if state.isLast { if match != nil {
if state.isLookaround() { if offset == match[0].EndIdx {
lastLookaroundInList = true return true, match, match[0].EndIdx + 1
} }
lastStateInList = true return true, match, match[0].EndIdx
lastStatePtr = state
} }
return false, []Group{}, i + 1
} }
if assertionFailed && numStatesMatched == 0 { // Nothing has matched and an assertion has failed // Expand appends template to dst, expanding any variables in template to the relevant capturing group.
// If I'm being completely honest, I'm not sure why I have to check specifically for a _lookaround_
// state. The explanation below is my attempt to explain this behavior.
// If you replace 'lastLookaroundInList' with 'lastStateInList', one of the test cases fails.
// //
// One of the states in our list was a last state and a lookaround. In this case, we // A variable is of the form '$n', where 'n' is a number. It will be replaced by the contents of the n-th capturing group.
// don't abort upon failure of the assertion, because we have found // To insert a literal $, do not put a number after it. Alternatively, you can use $$.
// another path to a final state. // src is the input string, and match must be the result of [Reg.FindSubmatch].
// Even if the last state _was_ an assertion, we can use the previously func (re Reg) Expand(dst string, template string, src string, match Match) string {
// saved indices to find a match. templateRuneSlc := []rune(template)
if lastLookaroundInList { srcRuneSlc := []rune(src)
break i := 0
for i < len(templateRuneSlc) {
c := templateRuneSlc[i]
if c == '$' {
i += 1
// The dollar sign is the last character of the string, or it is followed by another dollar sign
if i >= len(templateRuneSlc) || templateRuneSlc[i] == '$' {
dst += "$"
i++
} else { } else {
if i == startingFrom { numStr := ""
for unicode.IsDigit(templateRuneSlc[i]) {
numStr += string(templateRuneSlc[i])
i++ i++
} }
return false, []Group{}, i if numStr == "" {
} dst += "$"
}
// Check if we can find a state in our list that is:
// a. A last-state
// b. Empty
// c. Doesn't assert anything
for _, s := range currentStates {
if s.isLast && s.isEmpty && s.assert == noneAssert {
lastStatePtr = s
lastStateInList = true
}
}
if lastStateInList && numStatesMatched == 0 { // A last-state was in the list of states. add the matchIndex to our MatchIndex list
for j := 1; j < numGroups+1; j++ {
tempIndices[j] = lastStatePtr.threadGroups[j]
}
endIdx = i
tempIndices[0] = Group{startIdx, endIdx}
if tempIndices[0].StartIdx == tempIndices[0].EndIdx {
return true, tempIndices, tempIndices[0].EndIdx + 1
} else { } else {
return true, tempIndices, tempIndices[0].EndIdx num, _ := strconv.Atoi(numStr)
} if num < len(match) {
} dst += string(srcRuneSlc[match[num].StartIdx:match[num].EndIdx])
// Check if we can find a zero-length match
if foundPath == false {
if ok := zeroMatchPossible(str, i, numGroups, currentStates...); ok {
if tempIndices[0].IsValid() == false {
tempIndices[0] = Group{startIdx, startIdx}
}
}
// If we haven't moved in the string, increment the counter by 1
// to ensure we don't keep trying the same string over and over.
// if i == startingFrom {
startIdx++
// i++
// }
if tempIndices.numValidGroups() > 0 && tempIndices[0].IsValid() {
if tempIndices[0].StartIdx == tempIndices[0].EndIdx { // If we have a zero-length match, we have to shift the index at which we start. Otherwise we keep looking at the same paert of the string over and over.
return true, tempIndices, tempIndices[0].EndIdx + 1
} else { } else {
return true, tempIndices, tempIndices[0].EndIdx dst += "$" + numStr
} }
} }
return false, []Group{}, startIdx
} }
currentStates = make([]*nfaState, len(tempStates)) } else {
copy(currentStates, tempStates) dst += string(c)
tempStates = nil
i++ i++
} }
// End-of-string reached. Go to any 0-states, until there are no more 0-states to go to. Then check if any of our states are in the end position.
// This is the exact same algorithm used inside the loop, so I should probably put it in a function.
zeroStates, isZero := takeZeroState(currentStates, numGroups, i)
tempStates = append(tempStates, zeroStates...)
num_appended := 0 // Number of unique states addded to tempStates
for isZero == true {
zeroStates, isZero = takeZeroState(tempStates, numGroups, i)
tempStates, num_appended = uniqueAppend(tempStates, zeroStates...)
if num_appended == 0 { // Break if we haven't appended any more unique values
break
} }
return dst
} }
currentStates = append(currentStates, tempStates...) // LiteralPrefix returns a string that must begin any match of the given regular expression.
tempStates = nil // The second return value is true if the string comprises the entire expression.
func (re Reg) LiteralPrefix() (prefix string, complete bool) {
for _, state := range currentStates { state := re.start
// Only add the match if the start index is in bounds. If the state has an assertion, if state.assert != noneAssert {
// make sure the assertion checks out. state = state.next
if state.isLast && i <= len(str) {
if state.assert == noneAssert || state.checkAssertion(str, i) {
for j := 1; j < numGroups+1; j++ {
tempIndices[j] = state.threadGroups[j]
} }
endIdx = i for !(state.isLast) && (!state.isAlternation) && len(state.content) == 1 && state.assert == noneAssert {
tempIndices[0] = Group{startIdx, endIdx} if state.groupBegin || state.groupEnd {
state = state.next
continue
} }
prefix += string(rune(state.content[0]))
state = state.next
} }
} if state.isLast {
complete = true
if tempIndices.numValidGroups() > 0 {
if tempIndices[0].StartIdx == tempIndices[0].EndIdx { // If we have a zero-length match, we have to shift the index at which we start. Otherwise we keep looking at the same paert of the string over and over.
return true, tempIndices, tempIndices[0].EndIdx + 1
} else { } else {
return true, tempIndices, tempIndices[0].EndIdx complete = false
}
}
if startIdx == startingFrom { // Increment starting index if we haven't moved in the string. Prevents us from matching the same part of the string over and over.
startIdx++
} }
return false, []Group{}, startIdx return prefix, complete
} }

@ -48,49 +48,6 @@ func isNormalChar(c rune) bool {
return !slices.Contains(specialChars, c) return !slices.Contains(specialChars, c)
} }
// Ensure that the given elements are only appended to the given slice if they
// don't already exist. Returns the new slice, and the number of unique items appended.
func uniqueAppend[T comparable](slc []T, items ...T) ([]T, int) {
num_appended := 0
for _, item := range items {
if !slices.Contains(slc, item) {
slc = append(slc, item)
num_appended++
}
}
return slc, num_appended
}
func uniqueAppendFunc[T any](slc []T, fn func(T, T) bool, items ...T) ([]T, int) {
toRet := make([]T, len(slc))
num_appended := 0
copy(toRet, slc)
for _, item := range items {
itemExists := false
for _, val := range slc {
if fn(item, val) {
itemExists = true
}
}
if !itemExists {
toRet = append(toRet, item)
num_appended++
}
}
return toRet, num_appended
}
// Returns true only if all the given elements are equal
func allEqual[T comparable](items ...T) bool {
first := items[0]
for _, item := range items {
if item != first {
return false
}
}
return true
}
// Map function - convert a slice of T to a slice of V, based on a function // Map function - convert a slice of T to a slice of V, based on a function
// that maps a T to a V // that maps a T to a V
func funcMap[T, V any](slc []T, fn func(T) V) []V { func funcMap[T, V any](slc []T, fn func(T) V) []V {

@ -29,10 +29,12 @@ type nfaState struct {
isEmpty bool // If it is empty - Union operator and Kleene star states will be empty isEmpty bool // If it is empty - Union operator and Kleene star states will be empty
isLast bool // If it is the last state (accept state) isLast bool // If it is the last state (accept state)
output []*nfaState // The outputs of the current state ie. the 'outward arrows'. A union operator state will have more than one of these. output []*nfaState // The outputs of the current state ie. the 'outward arrows'. A union operator state will have more than one of these.
transitions map[int][]*nfaState // Transitions to different states (maps a character (int representation) to a _list of states. This is useful if one character can lead multiple states eg. ab|aa) // transitions map[int][]*nfaState // Transitions to different states (maps a character (int representation) to a _list of states. This is useful if one character can lead multiple states eg. ab|aa)
next *nfaState // The next state (not for alternation or kleene states)
isKleene bool // Identifies whether current node is a 0-state representing Kleene star isKleene bool // Identifies whether current node is a 0-state representing Kleene star
isQuestion bool // Identifies whether current node is a 0-state representing the question operator isQuestion bool // Identifies whether current node is a 0-state representing the question operator
isAlternation bool // Identifies whether current node is a 0-state representing an alternation isAlternation bool // Identifies whether current node is a 0-state representing an alternation
splitState *nfaState // Only for alternation states - the 'other' branch of the alternation ('next' is the first)
assert assertType // Type of assertion of current node - NONE means that the node doesn't assert anything assert assertType // Type of assertion of current node - NONE means that the node doesn't assert anything
allChars bool // Whether or not the state represents all characters (eg. a 'dot' metacharacter). A 'dot' node doesn't store any contents directly, as it would take up too much space allChars bool // Whether or not the state represents all characters (eg. a 'dot' metacharacter). A 'dot' node doesn't store any contents directly, as it would take up too much space
except []rune // Only valid if allChars is true - match all characters _except_ the ones in this block. Useful for inverting character classes. except []rune // Only valid if allChars is true - match all characters _except_ the ones in this block. Useful for inverting character classes.
@ -70,7 +72,6 @@ func cloneStateHelper(stateToClone *nfaState, cloneMap map[*nfaState]*nfaState)
isEmpty: stateToClone.isEmpty, isEmpty: stateToClone.isEmpty,
isLast: stateToClone.isLast, isLast: stateToClone.isLast,
output: make([]*nfaState, len(stateToClone.output)), output: make([]*nfaState, len(stateToClone.output)),
transitions: make(map[int][]*nfaState),
isKleene: stateToClone.isKleene, isKleene: stateToClone.isKleene,
isQuestion: stateToClone.isQuestion, isQuestion: stateToClone.isQuestion,
isAlternation: stateToClone.isAlternation, isAlternation: stateToClone.isAlternation,
@ -91,20 +92,18 @@ func cloneStateHelper(stateToClone *nfaState, cloneMap map[*nfaState]*nfaState)
clone.output[i] = cloneStateHelper(s, cloneMap) clone.output[i] = cloneStateHelper(s, cloneMap)
} }
} }
for k, v := range stateToClone.transitions {
clone.transitions[k] = make([]*nfaState, len(v))
for i, s := range v {
if s == stateToClone {
clone.transitions[k][i] = clone
} else {
clone.transitions[k][i] = cloneStateHelper(s, cloneMap)
}
}
}
if stateToClone.lookaroundNFA == stateToClone { if stateToClone.lookaroundNFA == stateToClone {
clone.lookaroundNFA = clone clone.lookaroundNFA = clone
} }
clone.lookaroundNFA = cloneStateHelper(stateToClone.lookaroundNFA, cloneMap) clone.lookaroundNFA = cloneStateHelper(stateToClone.lookaroundNFA, cloneMap)
if stateToClone.splitState == stateToClone {
clone.splitState = clone
}
clone.splitState = cloneStateHelper(stateToClone.splitState, cloneMap)
if stateToClone.next == stateToClone {
clone.next = clone
}
clone.next = cloneStateHelper(stateToClone.next, cloneMap)
return clone return clone
} }
@ -115,22 +114,26 @@ func resetThreads(start *nfaState) {
} }
func resetThreadsHelper(state *nfaState, visitedMap map[*nfaState]bool) { func resetThreadsHelper(state *nfaState, visitedMap map[*nfaState]bool) {
if state == nil {
return
}
if _, ok := visitedMap[state]; ok { if _, ok := visitedMap[state]; ok {
return return
} }
// Assuming it hasn't been visited // Assuming it hasn't been visited
state.threadGroups = nil state.threadGroups = nil
visitedMap[state] = true visitedMap[state] = true
for _, v := range state.transitions { if state.isAlternation {
for _, nextState := range v { resetThreadsHelper(state.next, visitedMap)
resetThreadsHelper(nextState, visitedMap) resetThreadsHelper(state.splitState, visitedMap)
} } else {
resetThreadsHelper(state.next, visitedMap)
} }
} }
// Checks if the given state's assertion is true. Returns true if the given // Checks if the given state's assertion is true. Returns true if the given
// state doesn't have an assertion. // state doesn't have an assertion.
func (s nfaState) checkAssertion(str []rune, idx int) bool { func (s nfaState) checkAssertion(str []rune, idx int, preferLongest bool) bool {
if s.assert == alwaysTrueAssert { if s.assert == alwaysTrueAssert {
return true return true
} }
@ -180,7 +183,7 @@ func (s nfaState) checkAssertion(str []rune, idx int) bool {
strToMatch = string(runesToMatch) strToMatch = string(runesToMatch)
} }
regComp := Reg{startState, s.lookaroundNumCaptureGroups} regComp := Reg{startState, s.lookaroundNumCaptureGroups, s.lookaroundRegex, preferLongest}
matchIndices := regComp.FindAll(strToMatch) matchIndices := regComp.FindAll(strToMatch)
numMatchesFound := 0 numMatchesFound := 0
@ -207,9 +210,12 @@ func (s nfaState) checkAssertion(str []rune, idx int) bool {
} }
// Returns true if the contents of 's' contain the value at the given index of the given string // Returns true if the contents of 's' contain the value at the given index of the given string
func (s nfaState) contentContains(str []rune, idx int) bool { func (s nfaState) contentContains(str []rune, idx int, preferLongest bool) bool {
if s.assert != noneAssert { if s.assert != noneAssert {
return s.checkAssertion(str, idx) return s.checkAssertion(str, idx, preferLongest)
}
if idx >= len(str) {
return false
} }
if s.allChars { if s.allChars {
return !slices.Contains(slices.Concat(notDotChars, s.except), str[idx]) // Return true only if the index isn't a 'notDotChar', or isn't one of the exception characters for the current node. return !slices.Contains(slices.Concat(notDotChars, s.except), str[idx]) // Return true only if the index isn't a 'notDotChar', or isn't one of the exception characters for the current node.
@ -222,74 +228,84 @@ func (s nfaState) isLookaround() bool {
return s.assert == plaAssert || s.assert == plbAssert || s.assert == nlaAssert || s.assert == nlbAssert return s.assert == plaAssert || s.assert == plbAssert || s.assert == nlaAssert || s.assert == nlbAssert
} }
// Returns the matches for the character at the given index of the given string. func (s nfaState) numTransitions() int {
// Also returns the number of matches. Returns -1 if an assertion failed. if s.next == nil && s.splitState == nil {
func (s nfaState) matchesFor(str []rune, idx int) ([]*nfaState, int) { return 0
// Assertions can be viewed as 'checks'. If the check fails, we return
// an empty array and 0.
// If it passes, we treat it like any other state, and return all the transitions.
if s.assert != noneAssert {
if s.checkAssertion(str, idx) == false {
return make([]*nfaState, 0), -1
}
} }
listTransitions := s.transitions[int(str[idx])] if s.next == nil || s.splitState == nil {
for _, dest := range s.transitions[int(anyCharRune)] { return 1
if !slices.Contains(slices.Concat(notDotChars, dest.except), str[idx]) {
// Add an allChar state to the list of matches if:
// a. The current character isn't a 'notDotChars' character. In single line mode, this includes newline. In multiline mode, it doesn't.
// b. The current character isn't the state's exception list.
listTransitions = append(listTransitions, dest)
} }
} return 2
numTransitions := len(listTransitions)
return listTransitions, numTransitions
} }
// verifyLastStatesHelper performs the depth-first recursion needed for verifyLastStates // Returns the matches for the character at the given index of the given string.
func verifyLastStatesHelper(st *nfaState, visited map[*nfaState]bool) { // Also returns the number of matches. Returns -1 if an assertion failed.
if len(st.transitions) == 0 { //func (s nfaState) matchesFor(str []rune, idx int) ([]*nfaState, int) {
st.isLast = true // // Assertions can be viewed as 'checks'. If the check fails, we return
return // // an empty array and 0.
} // // If it passes, we treat it like any other state, and return all the transitions.
// if len(state.transitions) == 1 && len(state.transitions[state.content]) == 1 && state.transitions[state.content][0] == state { // Eg. a* // if s.assert != noneAssert {
if len(st.transitions) == 1 { // Eg. a* // if s.checkAssertion(str, idx) == false {
var moreThanOneTrans bool // Dummy variable, check if all the transitions for the current's state's contents have a length of one // return make([]*nfaState, 0), -1
for _, c := range st.content { // }
if len(st.transitions[c]) != 1 || st.transitions[c][0] != st { // }
moreThanOneTrans = true // listTransitions := s.transitions[int(str[idx])]
} // for _, dest := range s.transitions[int(anyCharRune)] {
} // if !slices.Contains(slices.Concat(notDotChars, dest.except), str[idx]) {
st.isLast = !moreThanOneTrans // // Add an allChar state to the list of matches if:
} // // a. The current character isn't a 'notDotChars' character. In single line mode, this includes newline. In multiline mode, it doesn't.
// // b. The current character isn't the state's exception list.
// listTransitions = append(listTransitions, dest)
// }
// }
// numTransitions := len(listTransitions)
// return listTransitions, numTransitions
//}
if st.isKleene { // A State representing a Kleene Star has transitions going out, which loop back to it. If all those transitions point to the same (single) state, then it must be a last state // verifyLastStatesHelper performs the depth-first recursion needed for verifyLastStates
transitionDests := make([]*nfaState, 0) //func verifyLastStatesHelper(st *nfaState, visited map[*nfaState]bool) {
for _, v := range st.transitions { // if st.numTransitions() == 0 {
transitionDests = append(transitionDests, v...) // st.isLast = true
} // return
if allEqual(transitionDests...) { // }
st.isLast = true // // if len(state.transitions) == 1 && len(state.transitions[state.content]) == 1 && state.transitions[state.content][0] == state { // Eg. a*
return // if st.numTransitions() == 1 { // Eg. a*
} // var moreThanOneTrans bool // Dummy variable, check if all the transitions for the current's state's contents have a length of one
} // for _, c := range st.content {
if visited[st] == true { // if len(st.transitions[c]) != 1 || st.transitions[c][0] != st {
return // moreThanOneTrans = true
} // }
visited[st] = true // }
for _, states := range st.transitions { // st.isLast = !moreThanOneTrans
for i := range states { // }
if states[i] != st { //
verifyLastStatesHelper(states[i], visited) // if st.isKleene { // A State representing a Kleene Star has transitions going out, which loop back to it. If all those transitions point to the same (single) state, then it must be a last state
} // transitionDests := make([]*nfaState, 0)
} // for _, v := range st.transitions {
} // transitionDests = append(transitionDests, v...)
} // }
// if allEqual(transitionDests...) {
// st.isLast = true
// return
// }
// }
// if visited[st] == true {
// return
// }
// visited[st] = true
// for _, states := range st.transitions {
// for i := range states {
// if states[i] != st {
// verifyLastStatesHelper(states[i], visited)
// }
// }
// }
//}
// verifyLastStates enables the 'isLast' flag for the leaf nodes (last states) // verifyLastStates enables the 'isLast' flag for the leaf nodes (last states)
func verifyLastStates(start []*nfaState) { //func verifyLastStates(start []*nfaState) {
verifyLastStatesHelper(start[0], make(map[*nfaState]bool)) // verifyLastStatesHelper(start[0], make(map[*nfaState]bool))
} //}
// Concatenates s1 and s2, returns the start of the concatenation. // Concatenates s1 and s2, returns the start of the concatenation.
func concatenate(s1 *nfaState, s2 *nfaState) *nfaState { func concatenate(s1 *nfaState, s2 *nfaState) *nfaState {
@ -297,75 +313,84 @@ func concatenate(s1 *nfaState, s2 *nfaState) *nfaState {
return s2 return s2
} }
for i := range s1.output { for i := range s1.output {
for _, c := range s2.content { // Create transitions for every element in s1's content to s2' s1.output[i].next = s2
s1.output[i].transitions[c], _ = uniqueAppend(s1.output[i].transitions[c], s2)
}
} }
s1.output = s2.output s1.output = s2.output
return s1 return s1
} }
func kleene(s1 nfaState) (*nfaState, error) { func kleene(s1 *nfaState) (*nfaState, error) {
if s1.isEmpty && s1.assert != noneAssert { if s1.isEmpty && s1.assert != noneAssert {
return nil, fmt.Errorf("previous token is not quantifiable") return nil, fmt.Errorf("previous token is not quantifiable")
} }
toReturn := &nfaState{} toReturn := &nfaState{}
toReturn.transitions = make(map[int][]*nfaState)
toReturn.content = newContents(epsilon)
toReturn.isEmpty = true toReturn.isEmpty = true
toReturn.isAlternation = true
toReturn.content = newContents(epsilon)
toReturn.splitState = s1
// toReturn := &nfaState{}
// toReturn.transitions = make(map[int][]*nfaState)
// toReturn.content = newContents(epsilon)
toReturn.isKleene = true toReturn.isKleene = true
toReturn.output = append(toReturn.output, toReturn) toReturn.output = append([]*nfaState{}, toReturn)
for i := range s1.output { for i := range s1.output {
for _, c := range toReturn.content { s1.output[i].next = toReturn
s1.output[i].transitions[c], _ = uniqueAppend(s1.output[i].transitions[c], toReturn)
}
}
for _, c := range s1.content {
toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], &s1)
} }
// for _, c := range s1.content {
// toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], &s1)
// }
//toReturn.kleeneState = &s1
return toReturn, nil return toReturn, nil
} }
func alternate(s1 *nfaState, s2 *nfaState) *nfaState { func alternate(s1 *nfaState, s2 *nfaState) *nfaState {
toReturn := &nfaState{} toReturn := &nfaState{}
toReturn.transitions = make(map[int][]*nfaState) // toReturn.transitions = make(map[int][]*nfaState)
toReturn.output = append(toReturn.output, s1.output...) toReturn.output = append(toReturn.output, s1.output...)
toReturn.output = append(toReturn.output, s2.output...) toReturn.output = append(toReturn.output, s2.output...)
// Unique append is used here (and elsewhere) to ensure that, // // Unique append is used here (and elsewhere) to ensure that,
// for any given transition, a state can only be mentioned once. // // for any given transition, a state can only be mentioned once.
// For example, given the transition 'a', the state 's1' can only be mentioned once. // // For example, given the transition 'a', the state 's1' can only be mentioned once.
// This would lead to multiple instances of the same set of match indices, since both // // This would lead to multiple instances of the same set of match indices, since both
// 's1' states would be considered to match. // // 's1' states would be considered to match.
for _, c := range s1.content { // for _, c := range s1.content {
toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], s1) // toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], s1)
} // }
for _, c := range s2.content { // for _, c := range s2.content {
toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], s2) // toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], s2)
} // }
toReturn.content = newContents(epsilon) toReturn.content = newContents(epsilon)
toReturn.isEmpty = true toReturn.isEmpty = true
toReturn.isAlternation = true toReturn.isAlternation = true
toReturn.next = s1
toReturn.splitState = s2
return toReturn return toReturn
} }
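Alternation reduces to the same kind of split state, with the left alternative on next and the right on splitState, which is presumably how the prefer-left-branch behaviour in this PR's title falls out. A sketch with the same stand-in type:

package main

import "fmt"

// frag is the same simplified stand-in for nfaState used in the sketches above.
type frag struct {
	label         rune
	isAlternation bool
	next          *frag
	splitState    *frag
	output        []*frag
}

func newFrag(r rune) *frag {
	f := &frag{label: r}
	f.output = []*frag{f}
	return f
}

// alternate builds a split state whose next points at the left alternative
// and whose splitState points at the right one; its dangling ends are the
// union of both alternatives' dangling ends.
func alternate(left, right *frag) *frag {
	split := &frag{isAlternation: true, next: left, splitState: right}
	split.output = append(split.output, left.output...)
	split.output = append(split.output, right.output...)
	return split
}

func main() {
	aOrB := alternate(newFrag('a'), newFrag('b')) // fragment for "a|b"
	fmt.Printf("left=%c right=%c dangling=%d\n", aOrB.next.label, aOrB.splitState.label, len(aOrB.output))
}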
func question(s1 *nfaState) *nfaState { // Use the fact that ab? == a(b|) func question(s1 *nfaState) (*nfaState, error) { // Use the fact that ab? == a(b|)
s2 := &nfaState{} if s1.isEmpty && s1.assert != noneAssert {
s2.transitions = make(map[int][]*nfaState) return nil, fmt.Errorf("previous token is not quantifiable")
s2.content = newContents(epsilon) }
s2.output = append(s2.output, s2) toReturn := &nfaState{}
s2.isEmpty = true toReturn.isEmpty = true
s2.isQuestion = true toReturn.isAlternation = true
s3 := alternate(s1, s2) toReturn.isQuestion = true
return s3 toReturn.content = newContents(epsilon)
toReturn.splitState = s1
toReturn.output = append([]*nfaState{}, toReturn)
toReturn.output = append(toReturn.output, s1.output...)
// s2.transitions = make(map[int][]*nfaState)
return toReturn, nil
} }
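The comment above leans on the identity ab? == a(b|), which is easy to observe through the public API. A hypothetical test along these lines (not part of this PR, and the function name is made up):

package regex

import "testing"

func TestQuestionExample(t *testing.T) {
	re, err := Compile(`ab?`)
	if err != nil {
		t.Fatal(err)
	}
	// With the trailing 'b' present, the optional branch is taken...
	if got := re.FindString("ab"); got != "ab" {
		t.Errorf("wanted ab, got %q", got)
	}
	// ...and without it, only the 'a' matches.
	if got := re.FindString("ac"); got != "a" {
		t.Errorf("wanted a, got %q", got)
	}
}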
// Creates and returns a new state with the 'default' values. // Creates and returns a new state with the 'default' values.
func newState() nfaState { func newState() nfaState {
ret := nfaState{ ret := nfaState{
output: make([]*nfaState, 0), output: make([]*nfaState, 0),
transitions: make(map[int][]*nfaState), // transitions: make(map[int][]*nfaState),
assert: noneAssert, assert: noneAssert,
except: append([]rune{}, 0), except: append([]rune{}, 0),
lookaroundRegex: "", lookaroundRegex: "",
@ -377,10 +402,40 @@ func newState() nfaState {
} }
// Creates and returns a state that _always_ has a zero-length match. // Creates and returns a state that _always_ has a zero-length match.
func zeroLengthMatchState() nfaState { func zeroLengthMatchState() *nfaState {
start := newState() start := &nfaState{}
start.content = newContents(epsilon) start.content = newContents(epsilon)
start.isEmpty = true start.isEmpty = true
start.assert = alwaysTrueAssert start.assert = alwaysTrueAssert
start.output = append([]*nfaState{}, start)
return start return start
} }
func (s nfaState) equals(other nfaState) bool {
return s.isEmpty == other.isEmpty &&
s.isLast == other.isLast &&
slices.Equal(s.output, other.output) &&
slices.Equal(s.content, other.content) &&
s.next == other.next &&
s.isKleene == other.isKleene &&
s.isQuestion == other.isQuestion &&
s.isAlternation == other.isAlternation &&
s.splitState == other.splitState &&
s.assert == other.assert &&
s.allChars == other.allChars &&
slices.Equal(s.except, other.except) &&
s.lookaroundNFA == other.lookaroundNFA &&
s.groupBegin == other.groupBegin &&
s.groupEnd == other.groupEnd &&
s.groupNum == other.groupNum &&
slices.Equal(s.threadGroups, other.threadGroups)
}
func stateExists(list []nfaState, s nfaState) bool {
for i := range list {
if list[i].equals(s) {
return true
}
}
return false
}

@ -1,76 +0,0 @@
package regex
import "container/heap"
// Implement a priority queue using container/heap
const (
min_priority int = iota
zerostate_priority
alternation_priority
kleene_priority
char_priority
max_priority
)
func getPriority(state *nfaState) int {
if state.isKleene {
return kleene_priority
} else if state.isQuestion || state.isAlternation {
return alternation_priority
} else {
if state.isEmpty {
return zerostate_priority
} else {
return char_priority
}
}
}
type priorQueueItem struct {
state *nfaState
priority int
index int
}
type priorityQueue []*priorQueueItem
func (pq priorityQueue) Len() int {
return len(pq)
}
func (pq priorityQueue) Less(i, j int) bool {
if pq[i].priority == pq[j].priority {
return pq[i].index > pq[j].index
}
return pq[i].priority > pq[j].priority // We want max-heap, so we use greater-than
}
func (pq priorityQueue) Swap(i, j int) {
pq[i], pq[j] = pq[j], pq[i]
pq[i].index = i
pq[j].index = j
}
func (pq *priorityQueue) Push(x any) {
length := len(*pq)
item := x.(*priorQueueItem)
item.index = length
*pq = append(*pq, item)
}
func (pq *priorityQueue) Pop() any {
old := *pq
n := len(old)
item := old[n-1]
old[n-1] = nil
item.index = -1
*pq = old[0 : n-1]
return item
}
func (pq *priorityQueue) update(item *priorQueueItem, value *nfaState, priority int) {
item.state = value
item.priority = priority
heap.Fix(pq, item.index)
}

@ -109,7 +109,7 @@ func range2regex(start int, end int) (string, error) {
startSlc := intToSlc(rg.start) startSlc := intToSlc(rg.start)
endSlc := intToSlc(rg.end) endSlc := intToSlc(rg.end)
if len(startSlc) != len(endSlc) { if len(startSlc) != len(endSlc) {
return "", fmt.Errorf("Error parsing numeric range") return "", fmt.Errorf("error parsing numeric range")
} }
for i := range startSlc { for i := range startSlc {
if startSlc[i] == endSlc[i] { if startSlc[i] == endSlc[i] {

@ -25,7 +25,9 @@ var reTests = []struct {
{"a*b", nil, "qwqw", []Group{}}, {"a*b", nil, "qwqw", []Group{}},
{"(abc)*", nil, "abcabcabc", []Group{{0, 9}, {9, 9}}}, {"(abc)*", nil, "abcabcabc", []Group{{0, 9}, {9, 9}}},
{"((abc)|(def))*", nil, "abcdef", []Group{{0, 6}, {6, 6}}}, {"((abc)|(def))*", nil, "abcdef", []Group{{0, 6}, {6, 6}}},
{"(abc)*|(def)*", nil, "abcdef", []Group{{0, 3}, {3, 6}, {6, 6}}}, // This match will only happen with Longest()
// {"(abc)*|(def)*", nil, "abcdef", []Group{{0, 3}, {3, 6}, {6, 6}}},
{"(abc)*|(def)*", nil, "abcdef", []Group{{0, 3}, {3, 3}, {4, 4}, {5, 5}, {6, 6}}},
{"b*a*a", nil, "bba", []Group{{0, 3}}}, {"b*a*a", nil, "bba", []Group{{0, 3}}},
{"(ab)+", nil, "abcabddd", []Group{{0, 2}, {3, 5}}}, {"(ab)+", nil, "abcabddd", []Group{{0, 2}, {3, 5}}},
{"a(b(c|d)*)*", nil, "abccbd", []Group{{0, 6}}}, {"a(b(c|d)*)*", nil, "abccbd", []Group{{0, 6}}},
@ -528,7 +530,7 @@ var groupTests = []struct {
}{ }{
{"(a)(b)", nil, "ab", []Match{[]Group{{0, 2}, {0, 1}, {1, 2}}}}, {"(a)(b)", nil, "ab", []Match{[]Group{{0, 2}, {0, 1}, {1, 2}}}},
{"((a))(b)", nil, "ab", []Match{[]Group{{0, 2}, {0, 1}, {0, 1}, {1, 2}}}}, {"((a))(b)", nil, "ab", []Match{[]Group{{0, 2}, {0, 1}, {0, 1}, {1, 2}}}},
{"(0)", nil, "ab", []Match{[]Group{}}}, {"(0)", nil, "ab", []Match{}},
{"(a)b", nil, "ab", []Match{[]Group{{0, 2}, {0, 1}}}}, {"(a)b", nil, "ab", []Match{[]Group{{0, 2}, {0, 1}}}},
{"a(b)", nil, "ab", []Match{[]Group{{0, 2}, {1, 2}}}}, {"a(b)", nil, "ab", []Match{[]Group{{0, 2}, {1, 2}}}},
{"(a|b)", nil, "ab", []Match{[]Group{{0, 1}, {0, 1}}, []Group{{1, 2}, {1, 2}}}}, {"(a|b)", nil, "ab", []Match{[]Group{{0, 1}, {0, 1}}, []Group{{1, 2}, {1, 2}}}},
@ -537,10 +539,11 @@ var groupTests = []struct {
{"(a+)|(a)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 4}, {-1, -1}}}}, {"(a+)|(a)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 4}, {-1, -1}}}},
{"(a+)(aa)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 2}, {2, 4}}}}, {"(a+)(aa)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 2}, {2, 4}}}},
{"(aaaa)|(aaaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 4}, {-1, -1}}}}, {"(aaaa)|(aaaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 4}, {-1, -1}}}},
{"(aaa)|(aaaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {-1, -1}, {0, 4}}}}, // This match will only happen with Longest()
{"(aaa)|(aaaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {-1, -1}, {0, 4}}}}, // {"(aaa)|(aaaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {-1, -1}, {0, 4}}}},
{"(aaa)|(aaaa)", nil, "aaaa", []Match{[]Group{{0, 3}, {0, 3}, {-1, -1}}}},
{"(aaaa)|(aaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 4}, {-1, -1}}}}, {"(aaaa)|(aaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 4}, {-1, -1}}}},
{"(a)|(aa)", nil, "aa", []Match{[]Group{{0, 2}, {-1, -1}, {0, 2}}}}, {"(a)|(aa)", nil, "aa", []Match{[]Group{{0, 1}, {0, 1}}, []Group{{1, 2}, {1, 2}}}},
{"(a?)a?", nil, "b", []Match{[]Group{{0, 0}, {0, 0}}, []Group{{1, 1}, {1, 1}}}}, {"(a?)a?", nil, "b", []Match{[]Group{{0, 0}, {0, 0}}, []Group{{1, 1}, {1, 1}}}},
{"(a?)a?", nil, "ab", []Match{[]Group{{0, 1}, {0, 1}}, []Group{{1, 1}, {1, 1}}, []Group{{2, 2}, {2, 2}}}}, {"(a?)a?", nil, "ab", []Match{[]Group{{0, 1}, {0, 1}}, []Group{{1, 1}, {1, 1}}, []Group{{2, 2}, {2, 2}}}},
{"(a?)a?", nil, "aa", []Match{[]Group{{0, 2}, {0, 1}}, []Group{{2, 2}, {2, 2}}}}, {"(a?)a?", nil, "aa", []Match{[]Group{{0, 2}, {0, 1}}, []Group{{2, 2}, {2, 2}}}},
@ -578,7 +581,7 @@ var groupTests = []struct {
{`(bc+d$|ef*g.|h?i(j|k))`, nil, `bcdd`, []Match{}}, {`(bc+d$|ef*g.|h?i(j|k))`, nil, `bcdd`, []Match{}},
{`(bc+d$|ef*g.|h?i(j|k))`, nil, `reffgz`, []Match{[]Group{{1, 6}, {1, 6}}}}, {`(bc+d$|ef*g.|h?i(j|k))`, nil, `reffgz`, []Match{[]Group{{1, 6}, {1, 6}}}},
{`(((((((((a)))))))))`, nil, `a`, []Match{[]Group{{0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}}, {`(((((((((a)))))))))`, nil, `a`, []Match{[]Group{{0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}},
{`(((((((((a)))))))))\41`, nil, `a`, []Match{[]Group{{0, 2}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}}, {`(((((((((a)))))))))\41`, nil, `a!`, []Match{[]Group{{0, 2}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}},
{`(.*)c(.*)`, nil, `abcde`, []Match{[]Group{{0, 5}, {0, 2}, {3, 5}}}}, {`(.*)c(.*)`, nil, `abcde`, []Match{[]Group{{0, 5}, {0, 2}, {3, 5}}}},
{`\((.*), (.*)\)`, nil, `(a, b)`, []Match{[]Group{{0, 6}, {1, 2}, {4, 5}}}}, {`\((.*), (.*)\)`, nil, `(a, b)`, []Match{[]Group{{0, 6}, {1, 2}, {4, 5}}}},
@ -633,7 +636,7 @@ var groupTests = []struct {
{`(bc+d$|ef*g.|h?i(j|k))`, []ReFlag{RE_CASE_INSENSITIVE}, `BCDD`, []Match{}}, {`(bc+d$|ef*g.|h?i(j|k))`, []ReFlag{RE_CASE_INSENSITIVE}, `BCDD`, []Match{}},
{`(bc+d$|ef*g.|h?i(j|k))`, []ReFlag{RE_CASE_INSENSITIVE}, `reffgz`, []Match{[]Group{{1, 6}, {1, 6}}}}, {`(bc+d$|ef*g.|h?i(j|k))`, []ReFlag{RE_CASE_INSENSITIVE}, `reffgz`, []Match{[]Group{{1, 6}, {1, 6}}}},
{`(((((((((a)))))))))`, []ReFlag{RE_CASE_INSENSITIVE}, `A`, []Match{[]Group{{0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}}, {`(((((((((a)))))))))`, []ReFlag{RE_CASE_INSENSITIVE}, `A`, []Match{[]Group{{0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}},
{`(((((((((a)))))))))\41`, []ReFlag{RE_CASE_INSENSITIVE}, `A`, []Match{[]Group{{0, 2}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}}, {`(((((((((a)))))))))\41`, []ReFlag{RE_CASE_INSENSITIVE}, `A!`, []Match{[]Group{{0, 2}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}},
{`(.*)c(.*)`, []ReFlag{RE_CASE_INSENSITIVE}, `ABCDE`, []Match{[]Group{{0, 5}, {0, 2}, {3, 5}}}}, {`(.*)c(.*)`, []ReFlag{RE_CASE_INSENSITIVE}, `ABCDE`, []Match{[]Group{{0, 5}, {0, 2}, {3, 5}}}},
{`\((.*), (.*)\)`, []ReFlag{RE_CASE_INSENSITIVE}, `(A, B)`, []Match{[]Group{{0, 6}, {1, 2}, {4, 5}}}}, {`\((.*), (.*)\)`, []ReFlag{RE_CASE_INSENSITIVE}, `(A, B)`, []Match{[]Group{{0, 6}, {1, 2}, {4, 5}}}},
{`(a)(b)c|ab`, []ReFlag{RE_CASE_INSENSITIVE}, `AB`, []Match{[]Group{{0, 2}}}}, {`(a)(b)c|ab`, []ReFlag{RE_CASE_INSENSITIVE}, `AB`, []Match{[]Group{{0, 2}}}},
@ -701,7 +704,7 @@ func TestFind(t *testing.T) {
if len(test.result) == 0 { if len(test.result) == 0 {
return // Manually pass the test, because this is the expected behavior return // Manually pass the test, because this is the expected behavior
} else { } else {
t.Errorf("Wanted no match Got %v\n", groupIndex) t.Errorf("Wanted %v Got no matches\n", test.result)
} }
} else { } else {
if groupIndex != test.result[0] { if groupIndex != test.result[0] {
@ -743,7 +746,7 @@ func TestFindString(t *testing.T) {
foundString := regComp.FindString(test.str) foundString := regComp.FindString(test.str)
if len(test.result) == 0 { if len(test.result) == 0 {
if foundString != "" { if foundString != "" {
t.Errorf("Expected no match got %v\n", foundString) t.Errorf("Wanted no match got %v\n", foundString)
} }
} else { } else {
expectedString := test.str[test.result[0].StartIdx:test.result[0].EndIdx] expectedString := test.str[test.result[0].StartIdx:test.result[0].EndIdx]
@ -791,11 +794,68 @@ func TestFindSubmatch(t *testing.T) {
} }
} }
match, err := regComp.FindSubmatch(test.str) match, err := regComp.FindSubmatch(test.str)
if err != nil {
if len(test.result) != 0 {
t.Errorf("Wanted %v got no match\n", test.result[0])
}
} else if len(test.result) == 0 {
t.Errorf("Wanted no match got %v\n", match)
}
for i := range match { for i := range match {
if match[i].IsValid() { if match[i].IsValid() {
if test.result[0][i] != match[i] { if test.result[0][i] != match[i] {
t.Errorf("Wanted %v Got %v\n", test.result[0], match) t.Errorf("Wanted %v Got %v\n", test.result[0], match)
} }
} else {
if len(test.result) > 0 && i < len(test.result[0]) && test.result[0][i].IsValid() {
t.Errorf("Wanted %v Got %v\n", test.result[0], match)
}
}
}
})
}
}
func TestFindStringSubmatch(t *testing.T) {
for _, test := range groupTests {
t.Run(test.re+" "+test.str, func(t *testing.T) {
regComp, err := Compile(test.re, test.flags...)
if err != nil {
if test.result != nil {
panic(err)
}
}
matchStr := regComp.FindStringSubmatch(test.str)
if matchStr == nil {
if len(test.result) != 0 {
expectedStr := funcMap(test.result[0], func(g Group) string {
if g.IsValid() {
return test.str[g.StartIdx:g.EndIdx]
} else {
return ""
}
})
t.Errorf("Wanted %v got no match\n", expectedStr)
}
} else if len(test.result) == 0 {
t.Errorf("Wanted no match got %v\n", matchStr)
} else {
expectedStr := funcMap(test.result[0], func(g Group) string {
if g.IsValid() {
return test.str[g.StartIdx:g.EndIdx]
} else {
return ""
}
})
for i, groupStr := range matchStr {
if groupStr == "" {
if i < len(expectedStr) && expectedStr[i] != "" {
t.Errorf("Wanted %v Got %v\n", expectedStr, matchStr)
}
} else {
if expectedStr[i] != groupStr {
t.Errorf("Wanted %v Got %v\n", expectedStr, matchStr)
}
}
} }
} }
}) })
@ -817,6 +877,10 @@ func TestFindAllSubmatch(t *testing.T) {
if test.result[i][j] != matchIndices[i][j] { if test.result[i][j] != matchIndices[i][j] {
t.Errorf("Wanted %v Got %v\n", test.result, matchIndices) t.Errorf("Wanted %v Got %v\n", test.result, matchIndices)
} }
} else {
if i < len(test.result) && j < len(test.result[i]) && test.result[i][j].IsValid() {
t.Errorf("Wanted %v Got %v\n", test.result, matchIndices)
}
} }
} }
} }
