20 Commits

Author SHA1 Message Date
ef476e8875 Reverse the order of the numeric range before adding it, to maintain compatibility with PCRE matching rules (even though I don't implement them, if I do in the future) 2025-02-02 13:46:48 -05:00
7e6b02632f Added more tests; commented out tests that I am failing 2025-02-02 13:46:08 -05:00
f94e3f2e71 Added comments 2025-02-02 12:44:06 -05:00
b129d83c3f Added function to reset threads 2025-02-02 12:43:40 -05:00
43aa7b5876 Updated documentation 2025-02-02 12:42:38 -05:00
9a3bfca313 Renamed unique_append to uniqueAppend 2025-02-02 12:42:29 -05:00
b6ab54f6dd Reset threads when findAllSubmatchHelper is called 2025-02-02 12:42:00 -05:00
6a96c98d04 Fixed bug where the regex '(()|.)(b)' wouldn't compile 2025-02-01 19:20:33 -05:00
3cfc2a6854 Updated Makefile 2025-02-01 18:52:26 -05:00
5d7a02e796 Added gcflags to go build 2025-02-01 18:51:58 -05:00
a46d2f4546 Updated comments 2025-02-01 18:07:31 -05:00
c88ebd1aa5 Added comments explaining what a Match is 2025-02-01 18:05:55 -05:00
fd102292c6 Added example for FindSubmatch 2025-02-01 18:05:43 -05:00
6d692d0dfc Rename Group.toString() to Group.String() 2025-02-01 12:51:32 -05:00
7c4538a259 Added 'example' file that contains testable examples 2025-02-01 12:50:49 -05:00
2a9ae0b68a Wrote test for 'FindSubmatch' 2025-02-01 11:09:05 -05:00
783ae2ad10 Updated call to 'isValid' with call to 'IsValid' 2025-02-01 11:06:26 -05:00
b5e6bc112c Wrote 'reg.FindSubmatch()' which returns the leftmost match with submatches, renamed 'isValid' to 'IsValid' to export it, renamed 'ToString' to 'String' 2025-02-01 11:06:03 -05:00
206fea34cd Added function to return the number of subexpressions in the group 2025-02-01 11:04:49 -05:00
fcdb23524a Added more documentation 2025-02-01 11:04:24 -05:00
10 changed files with 245 additions and 35 deletions

View File

@@ -8,6 +8,6 @@ vet: fmt
buildLib: vet buildLib: vet
go build -gcflags="-N -l" ./... go build -gcflags="-N -l" ./...
buildCmd: buildLib buildCmd: buildLib
go build -C cmd/ -o re ./... go build -C cmd/ -gcflags="-N -l" -o re ./...
test: buildCmd test: buildCmd
go test -v ./... go test -v ./...

View File

@@ -137,7 +137,7 @@ func main() {
fmt.Fprintf(out, "Line %d:\n", lineNum) fmt.Fprintf(out, "Line %d:\n", lineNum)
} }
for _, m := range matchIndices { for _, m := range matchIndices {
fmt.Fprintf(out, "%s\n", m.ToString()) fmt.Fprintf(out, "%s\n", m.String())
} }
err := out.Flush() err := out.Flush()
if err != nil { if err != nil {

View File

@@ -18,6 +18,12 @@ type Reg struct {
numGroups int numGroups int
} }
// numSubexp eturns the number of sub-expressions in the given [Reg]. This is equivalent
// to the number of capturing groups.
func (r Reg) NumSubexp() int {
return r.numGroups
}
const concatRune rune = 0xF0001 const concatRune rune = 0xF0001
// Flags for shuntingYard - control its behavior // Flags for shuntingYard - control its behavior
@@ -943,7 +949,9 @@ func thompson(re []postfixNode) (Reg, error) {
// and added back in. // and added back in.
// If the middle node doesn't exist (ie. something like '()' ), that's fine, I just connect the LPAREN // If the middle node doesn't exist (ie. something like '()' ), that's fine, I just connect the LPAREN
// and RPAREN nodes. // and RPAREN nodes.
// If neither node exists, that's a problem so I return an error. // If the middle node exists but is itself the start of a group, then that _must_ be the opening paren for
// the closing paren that I'm on. I put the third node back (because it isn't involved in the capturing group), then
// I concatenate those two and add them. If neither node exists, that's a problem so I return an error.
if c.nodetype == rparenNode { if c.nodetype == rparenNode {
s.groupEnd = true s.groupEnd = true
middleNode, err1 := pop(&nfa) middleNode, err1 := pop(&nfa)
@@ -958,6 +966,11 @@ func thompson(re []postfixNode) (Reg, error) {
s.groupNum = lparenNode.groupNum s.groupNum = lparenNode.groupNum
to_add := concatenate(lparenNode, s) to_add := concatenate(lparenNode, s)
nfa = append(nfa, to_add) nfa = append(nfa, to_add)
} else if middleNode.groupBegin && len(middleNode.transitions) == 0 { // The middle node is a lone lparen - something like '(())', and I'm looking at the first rparen
nfa = append(nfa, lparenNode) // I shouldn't have popped this out, because it is not involved in the current capturing group
s.groupNum = middleNode.groupNum // In this case, the 'middle' node is actually an lparen
to_add := concatenate(middleNode, s)
nfa = append(nfa, to_add)
} else { } else {
// At this point, we assume all three nodes are valid ('lparenNode', 'middleNode' and 's') // At this point, we assume all three nodes are valid ('lparenNode', 'middleNode' and 's')
if lparenNode.groupBegin { if lparenNode.groupBegin {

View File

@@ -114,12 +114,47 @@ Another, more subtle example is the following regex:
x|xx x|xx
While the stdlib implementation (and most other engines) will prefer matching the first item of the alternation, While the stdlib implementation (and most other engines) will prefer matching the first item of the alternation,
this engine will _always_ go for the longest possible match, regardless of the order of the alternation. this engine will go for the longest possible match, regardless of the order of the alternation. Although this
strays from the convention, it results in a nice rule-of-thumb - the engine is ALWAYS greedy.
The stdlib implementation has a function [regexp.Regexp.Longest] which makes future searches prefer the longest match.
That is the default (and unchangable) behavior in this engine.
2. Byte-slices and runes: 2. Byte-slices and runes:
My engine does not support byte-slices. When a matching function receives a string, it converts it into a My engine does not support byte-slices. When a matching function receives a string, it converts it into a
rune-slice to iterate through it. While this has some space overhead, the convenience of built-in unicode rune-slice to iterate through it. While this has some space overhead, the convenience of built-in unicode
support made the tradeoff worth it. support made the tradeoff worth it.
3. Return values
Rather than using primitives for return values, my engine defines two types that are used as return
values: a [Group] represents a capturing group, and a [Match] represents a list of groups.
[regexp] specifies a regular expression that gives a list of all the matching functions that it supports. The
equivalent expression for this engine is:
Find(All)?(String)?(Submatch)?
[Reg.Find] returns the index of the leftmost match in the string.
If a function contains 'All' it returns all matches instead of just the leftmost one.
If a function contains 'String' it returns the matched text, rather than the indices.
If a function contains 'Submatch' it returns the match, including all submatches found by
capturing groups.
The term '0-group' is used to refer to the 0th capturing group of a match (which is the entire match).
Given the following regex:
x(y)
and the input string:
xyz
The 0th group would contain 'xy' and the 1st group would contain 'y'. Any matching function without 'Submatch' in its name
returns the 0-group.
*/ */
package regex package regex

54
regex/example_test.go Normal file
View File

@@ -0,0 +1,54 @@
package regex_test
import (
"fmt"
"gitea.twomorecents.org/Rockingcool/kleingrep/regex"
)
func ExampleReg_Find() {
regexStr := "b|a"
regexComp := regex.MustCompile(regexStr)
match, _ := regexComp.Find("banana")
fmt.Println(match.String())
// Output: 0 1
}
func ExampleReg_FindAll() {
regexStr := "b|a"
regexComp := regex.MustCompile(regexStr)
matches := regexComp.FindAll("banana")
for _, group := range matches {
fmt.Println(group.String())
}
// Output: 0 1
// 1 2
// 3 4
// 5 6
}
func ExampleReg_FindString() {
regexStr := `\d+`
regexComp := regex.MustCompile(regexStr)
matchStr := regexComp.FindString("The year of our lord, 2025")
fmt.Println(matchStr)
// Output: 2025
}
func ExampleReg_FindSubmatch() {
regexStr := `(\d)\.(\d)(\d)`
regexComp := regex.MustCompile(regexStr)
match, _ := regexComp.FindSubmatch("3.14")
fmt.Println(match[0])
fmt.Println(match[1])
fmt.Println(match[2])
// Output: 0 4
// 0 1
// 2 3
}

View File

@@ -5,7 +5,13 @@ import (
"sort" "sort"
) )
// a Match stores a slice of all the capturing groups in a match. // A Match represents a match found by the regex in a given string.
// It is represented as a list of groups, where the nth element contains
// the contents of the nth capturing group. Note that the group may not be valid
// (see [Group.IsValid]). The element at index 0 is known
// as the 0-group, and represents the contents of the entire match.
//
// See [Reg.FindSubmatch] for an example.
type Match []Group type Match []Group
// a Group represents a group. It contains the start index and end index of the match // a Group represents a group. It contains the start index and end index of the match
@@ -35,25 +41,26 @@ func (m Match) numValidGroups() int {
} }
// Returns a string containing the indices of all (valid) groups in the match // Returns a string containing the indices of all (valid) groups in the match
func (m Match) ToString() string { func (m Match) String() string {
var toRet string var toRet string
for i, g := range m { for i, g := range m {
if g.isValid() { if g.IsValid() {
toRet += fmt.Sprintf("Group %d\n", i) toRet += fmt.Sprintf("Group %d\n", i)
toRet += g.toString() toRet += g.String()
toRet += "\n" toRet += "\n"
} }
} }
return toRet return toRet
} }
// Converts the Group into a string representation: // String converts the Group into a string representation.
func (idx Group) toString() string { func (idx Group) String() string {
return fmt.Sprintf("%d\t%d", idx.StartIdx, idx.EndIdx) return fmt.Sprintf("%d\t%d", idx.StartIdx, idx.EndIdx)
} }
// Returns whether a group contains valid indices // Returns whether a group is valid (ie. whether it matched any text). It
func (g Group) isValid() bool { // simply ensures that both indices of the group are >= 0.
func (g Group) IsValid() bool {
return g.StartIdx >= 0 && g.EndIdx >= 0 return g.StartIdx >= 0 && g.EndIdx >= 0
} }
@@ -106,7 +113,7 @@ func zeroMatchPossible(str []rune, idx int, numGroups int, states ...*nfaState)
num_appended := 0 // number of unique states addded to tempstates num_appended := 0 // number of unique states addded to tempstates
for isZero == true { for isZero == true {
zeroStates, isZero = takeZeroState(tempstates, numGroups, idx) zeroStates, isZero = takeZeroState(tempstates, numGroups, idx)
tempstates, num_appended = unique_append(tempstates, zeroStates...) tempstates, num_appended = uniqueAppend(tempstates, zeroStates...)
if num_appended == 0 { // break if we haven't appended any more unique values if num_appended == 0 { // break if we haven't appended any more unique values
break break
} }
@@ -174,6 +181,20 @@ func (regex Reg) FindString(str string) string {
return str[zeroGroup.StartIdx:zeroGroup.EndIdx] return str[zeroGroup.StartIdx:zeroGroup.EndIdx]
} }
// FindSubmatch returns the leftmost match of the regex in the given string, including
// the submatches matched by capturing groups. The returned [Match] will always contain the same
// number of groups. The validity of a group (whether or not it matched anything) can be determined with
// [Group.IsValid], or by checking that both indices of the group are >= 0.
// The second-return value is nil if no match was found.
func (regex Reg) FindSubmatch(str string) (Match, error) {
match, err := regex.FindNthMatch(str, 1)
if err != nil {
return Match{}, fmt.Errorf("no match found")
} else {
return match, nil
}
}
// FindAllString is the 'all' version of FindString. // FindAllString is the 'all' version of FindString.
// It returns a slice of strings containing the text of all matches of // It returns a slice of strings containing the text of all matches of
// the regex in the given string. // the regex in the given string.
@@ -237,6 +258,7 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
// The second value here shouldn't be used, because we should exit when the third return value is > than len(str) // The second value here shouldn't be used, because we should exit when the third return value is > than len(str)
return false, []Group{}, offset return false, []Group{}, offset
} }
resetThreads(start)
// Hold a list of match indices for the current run. When we // Hold a list of match indices for the current run. When we
// can no longer find a match, the match with the largest range is // can no longer find a match, the match with the largest range is
@@ -292,13 +314,13 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
num_appended := 0 num_appended := 0
for isZero == true { for isZero == true {
zeroStates, isZero = takeZeroState(tempStates, numGroups, i) zeroStates, isZero = takeZeroState(tempStates, numGroups, i)
tempStates, num_appended = unique_append(tempStates, zeroStates...) tempStates, num_appended = uniqueAppend(tempStates, zeroStates...)
if num_appended == 0 { // Break if we haven't appended any more unique values if num_appended == 0 { // Break if we haven't appended any more unique values
break break
} }
} }
currentStates, _ = unique_append(currentStates, tempStates...) currentStates, _ = uniqueAppend(currentStates, tempStates...)
tempStates = nil tempStates = nil
// Take any transitions corresponding to current character // Take any transitions corresponding to current character
@@ -372,7 +394,7 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
// Check if we can find a zero-length match // Check if we can find a zero-length match
if foundPath == false { if foundPath == false {
if ok := zeroMatchPossible(str, i, numGroups, currentStates...); ok { if ok := zeroMatchPossible(str, i, numGroups, currentStates...); ok {
if tempIndices[0].isValid() == false { if tempIndices[0].IsValid() == false {
tempIndices[0] = Group{startIdx, startIdx} tempIndices[0] = Group{startIdx, startIdx}
} }
} }
@@ -382,7 +404,7 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
startIdx++ startIdx++
// i++ // i++
// } // }
if tempIndices.numValidGroups() > 0 && tempIndices[0].isValid() { if tempIndices.numValidGroups() > 0 && tempIndices[0].IsValid() {
if tempIndices[0].StartIdx == tempIndices[0].EndIdx { // If we have a zero-length match, we have to shift the index at which we start. Otherwise we keep looking at the same paert of the string over and over. if tempIndices[0].StartIdx == tempIndices[0].EndIdx { // If we have a zero-length match, we have to shift the index at which we start. Otherwise we keep looking at the same paert of the string over and over.
return true, tempIndices, tempIndices[0].EndIdx + 1 return true, tempIndices, tempIndices[0].EndIdx + 1
} else { } else {
@@ -405,7 +427,7 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
num_appended := 0 // Number of unique states addded to tempStates num_appended := 0 // Number of unique states addded to tempStates
for isZero == true { for isZero == true {
zeroStates, isZero = takeZeroState(tempStates, numGroups, i) zeroStates, isZero = takeZeroState(tempStates, numGroups, i)
tempStates, num_appended = unique_append(tempStates, zeroStates...) tempStates, num_appended = uniqueAppend(tempStates, zeroStates...)
if num_appended == 0 { // Break if we haven't appended any more unique values if num_appended == 0 { // Break if we haven't appended any more unique values
break break
} }

View File

@@ -50,7 +50,7 @@ func isNormalChar(c rune) bool {
// Ensure that the given elements are only appended to the given slice if they // Ensure that the given elements are only appended to the given slice if they
// don't already exist. Returns the new slice, and the number of unique items appended. // don't already exist. Returns the new slice, and the number of unique items appended.
func unique_append[T comparable](slc []T, items ...T) ([]T, int) { func uniqueAppend[T comparable](slc []T, items ...T) ([]T, int) {
num_appended := 0 num_appended := 0
for _, item := range items { for _, item := range items {
if !slices.Contains(slc, item) { if !slices.Contains(slc, item) {
@@ -61,6 +61,25 @@ func unique_append[T comparable](slc []T, items ...T) ([]T, int) {
return slc, num_appended return slc, num_appended
} }
func uniqueAppendFunc[T any](slc []T, fn func(T, T) bool, items ...T) ([]T, int) {
toRet := make([]T, len(slc))
num_appended := 0
copy(toRet, slc)
for _, item := range items {
itemExists := false
for _, val := range slc {
if fn(item, val) {
itemExists = true
}
}
if !itemExists {
toRet = append(toRet, item)
num_appended++
}
}
return toRet, num_appended
}
// Returns true only if all the given elements are equal // Returns true only if all the given elements are equal
func allEqual[T comparable](items ...T) bool { func allEqual[T comparable](items ...T) bool {
first := items[0] first := items[0]

View File

@@ -104,6 +104,26 @@ func cloneStateHelper(stateToClone *nfaState, cloneMap map[*nfaState]*nfaState)
return clone return clone
} }
// Reset any thread-related fields of the NFA starting from the given state.
func resetThreads(start *nfaState) {
visitedMap := make(map[*nfaState]bool) // The value type doesn't matter here
resetThreadsHelper(start, visitedMap)
}
func resetThreadsHelper(state *nfaState, visitedMap map[*nfaState]bool) {
if _, ok := visitedMap[state]; ok {
return
}
// Assuming it hasn't been visited
state.threadGroups = nil
visitedMap[state] = true
for _, v := range state.transitions {
for _, nextState := range v {
resetThreadsHelper(nextState, visitedMap)
}
}
}
// Checks if the given state's assertion is true. Returns true if the given // Checks if the given state's assertion is true. Returns true if the given
// state doesn't have an assertion. // state doesn't have an assertion.
func (s nfaState) checkAssertion(str []rune, idx int) bool { func (s nfaState) checkAssertion(str []rune, idx int) bool {
@@ -274,7 +294,7 @@ func concatenate(s1 *nfaState, s2 *nfaState) *nfaState {
} }
for i := range s1.output { for i := range s1.output {
for _, c := range s2.content { // Create transitions for every element in s1's content to s2' for _, c := range s2.content { // Create transitions for every element in s1's content to s2'
s1.output[i].transitions[c], _ = unique_append(s1.output[i].transitions[c], s2) s1.output[i].transitions[c], _ = uniqueAppend(s1.output[i].transitions[c], s2)
} }
} }
s1.output = s2.output s1.output = s2.output
@@ -294,11 +314,11 @@ func kleene(s1 nfaState) (*nfaState, error) {
toReturn.output = append(toReturn.output, toReturn) toReturn.output = append(toReturn.output, toReturn)
for i := range s1.output { for i := range s1.output {
for _, c := range toReturn.content { for _, c := range toReturn.content {
s1.output[i].transitions[c], _ = unique_append(s1.output[i].transitions[c], toReturn) s1.output[i].transitions[c], _ = uniqueAppend(s1.output[i].transitions[c], toReturn)
} }
} }
for _, c := range s1.content { for _, c := range s1.content {
toReturn.transitions[c], _ = unique_append(toReturn.transitions[c], &s1) toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], &s1)
} }
return toReturn, nil return toReturn, nil
} }
@@ -314,10 +334,10 @@ func alternate(s1 *nfaState, s2 *nfaState) *nfaState {
// This would lead to multiple instances of the same set of match indices, since both // This would lead to multiple instances of the same set of match indices, since both
// 's1' states would be considered to match. // 's1' states would be considered to match.
for _, c := range s1.content { for _, c := range s1.content {
toReturn.transitions[c], _ = unique_append(toReturn.transitions[c], s1) toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], s1)
} }
for _, c := range s2.content { for _, c := range s2.content {
toReturn.transitions[c], _ = unique_append(toReturn.transitions[c], s2) toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], s2)
} }
toReturn.content = newContents(epsilon) toReturn.content = newContents(epsilon)
toReturn.isEmpty = true toReturn.isEmpty = true

View File

@@ -3,7 +3,9 @@ package regex
import ( import (
"fmt" "fmt"
"math" "math"
"slices"
"strconv" "strconv"
"strings"
) )
type numRange struct { type numRange struct {
@@ -99,13 +101,11 @@ func range2regex(start int, end int) (string, error) {
// Last range - tmp to rangeEnd // Last range - tmp to rangeEnd
ranges = append(ranges, numRange{tmp, rangeEnd}) ranges = append(ranges, numRange{tmp, rangeEnd})
regex := string(nonCapLparenRune) regexSlice := make([]string, 0)
// Generate the regex // Generate the regex
for i, rg := range ranges { for _, rg := range ranges {
if i > 0 { tmpStr := ""
regex += "|" tmpStr += string(nonCapLparenRune)
}
regex += string(nonCapLparenRune)
startSlc := intToSlc(rg.start) startSlc := intToSlc(rg.start)
endSlc := intToSlc(rg.end) endSlc := intToSlc(rg.end)
if len(startSlc) != len(endSlc) { if len(startSlc) != len(endSlc) {
@@ -113,14 +113,27 @@ func range2regex(start int, end int) (string, error) {
} }
for i := range startSlc { for i := range startSlc {
if startSlc[i] == endSlc[i] { if startSlc[i] == endSlc[i] {
regex += string(rune(startSlc[i] + 48)) // '0' is ascii value 48, 1 is 49 etc. To convert the digit to its character form, we can just add 48. tmpStr += string(rune(startSlc[i] + 48)) // '0' is ascii value 48, 1 is 49 etc. To convert the digit to its character form, we can just add 48.
} else { } else {
regex += fmt.Sprintf("%c%c-%c%c", lbracketRune, rune(startSlc[i]+48), rune(endSlc[i]+48), rbracketRune) tmpStr += fmt.Sprintf("%c%c-%c%c", lbracketRune, rune(startSlc[i]+48), rune(endSlc[i]+48), rbracketRune)
} }
} }
regex += ")" tmpStr += ")"
regexSlice = append(regexSlice, tmpStr)
} }
regex += ")" // Each element of the slice represents one 'group'. Taking 0-255 as an example, the elements would be:
// 1. 0-9
// 2. 10-99
// 3. 100-199
// 4. 200-249
// 5. 250-255
//
// The reason this is reversed before joining it, is because it is incompatible with the PCRE rule for matching.
// The PCRE rule specifies that the left-branch of an alternation is preferred. Even though this engine uses the POSIX
// rule at the moment (which prefers the longest match regardless of the order of the alternation), reversing the string
// has no downsides. It doesn't affect POSIX matching, and it will reduce my burden if I decide to switch to PCRE matching.
slices.Reverse(regexSlice)
regex := string(nonCapLparenRune) + strings.Join(regexSlice, "|") + ")"
return regex, nil return regex, nil
} }

View File

@@ -105,6 +105,9 @@ var reTests = []struct {
{"(a|b){3,4}", nil, "ababaa", []Group{{0, 4}}}, {"(a|b){3,4}", nil, "ababaa", []Group{{0, 4}}},
{"(bc){5,}", nil, "bcbcbcbcbcbcbcbc", []Group{{0, 16}}}, {"(bc){5,}", nil, "bcbcbcbcbcbcbcbc", []Group{{0, 16}}},
{`\d{3,4}`, nil, "1209", []Group{{0, 4}}}, {`\d{3,4}`, nil, "1209", []Group{{0, 4}}},
{`\d{3,4}`, nil, "120", []Group{{0, 3}}},
{`\d{3,4}`, nil, "12709", []Group{{0, 4}}},
{`\d{3,4}`, nil, "12", []Group{}},
{`\d{3,4}`, nil, "109", []Group{{0, 3}}}, {`\d{3,4}`, nil, "109", []Group{{0, 3}}},
{`\d{3,4}`, nil, "5", []Group{}}, {`\d{3,4}`, nil, "5", []Group{}},
{`\d{3,4}`, nil, "123135", []Group{{0, 4}}}, {`\d{3,4}`, nil, "123135", []Group{{0, 4}}},
@@ -671,6 +674,17 @@ var groupTests = []struct {
{`^([ab]*)(?<!(a))c`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 2}}}}, {`^([ab]*)(?<!(a))c`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 2}}}},
{`(<389-400>)`, nil, `391`, []Match{[]Group{{0, 3}, {0, 3}}}}, {`(<389-400>)`, nil, `391`, []Match{[]Group{{0, 3}, {0, 3}}}},
// // Tests from https://wiki.haskell.org/Regex_Posix
// {`(()|.)(b)`, nil, `ab`, []Match{[]Group{{0, 2}, {0, 1}, {-1, -1}, {1, 2}}}},
// {`(()|[ab])(b)`, nil, `ab`, []Match{[]Group{{0, 2}, {0, 1}, {-1, -1}, {1, 2}}}},
// {`(()|[ab])+b`, nil, `aaab`, []Match{[]Group{{0, 4}, {2, 3}, {-1, -1}}}},
// {`([ab]|())+b`, nil, `aaab`, []Match{[]Group{{0, 4}, {2, 3}, {-1, -1}}}},
// // Bug - this should give {0,6},{3,6},{-1,-1} but it gives {0,6},{3,6},{3,3}
// // {`yyyyyy`, nil, `(yyy|(x?)){2,4}`, []Match{[]Group{{0, 6}, {3, 6}, {-1, -1}}, []Group{{6, 6}, {6, 6}, {6, 6}}}},
// {`(a|ab|c|bcd)*(d*)`, nil, `ababcd`, []Match{[]Group{{0, 6}, {3, 6}, {6, 6}}, []Group{{6, 6}, {6, 6}, {6, 6}}}},
// // Bug - this should give {0,3},{0,3},{0,0},{0,3},{3,3} but it gives {0,3},{0,2},{0,1},{1,2},{2,3}
// // {`((a*)(b|abc))(c*)`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 3}, {0, 0}, {0, 3}, {3, 3}}}},
} }
func TestFind(t *testing.T) { func TestFind(t *testing.T) {
@@ -767,6 +781,26 @@ func TestFindAllString(t *testing.T) {
} }
} }
func TestFindSubmatch(t *testing.T) {
for _, test := range groupTests {
t.Run(test.re+" "+test.str, func(t *testing.T) {
regComp, err := Compile(test.re, test.flags...)
if err != nil {
if test.result != nil {
panic(err)
}
}
match, err := regComp.FindSubmatch(test.str)
for i := range match {
if match[i].IsValid() {
if test.result[0][i] != match[i] {
t.Errorf("Wanted %v Got %v\n", test.result[0], match)
}
}
}
})
}
}
func TestFindAllSubmatch(t *testing.T) { func TestFindAllSubmatch(t *testing.T) {
for _, test := range groupTests { for _, test := range groupTests {
t.Run(test.re+" "+test.str, func(t *testing.T) { t.Run(test.re+" "+test.str, func(t *testing.T) {
@@ -779,7 +813,7 @@ func TestFindAllSubmatch(t *testing.T) {
matchIndices := regComp.FindAllSubmatch(test.str) matchIndices := regComp.FindAllSubmatch(test.str)
for i := range matchIndices { for i := range matchIndices {
for j := range matchIndices[i] { for j := range matchIndices[i] {
if matchIndices[i][j].isValid() { if matchIndices[i][j].IsValid() {
if test.result[i][j] != matchIndices[i][j] { if test.result[i][j] != matchIndices[i][j] {
t.Errorf("Wanted %v Got %v\n", test.result, matchIndices) t.Errorf("Wanted %v Got %v\n", test.result, matchIndices)
} }