50 Commits

Author SHA1 Message Date
46bc0c8529 Removed unicode character classes from 'features not supported' list 2025-02-13 10:48:23 -05:00
1a890a1e75 Refactoring - remove duplicate code 2025-02-13 09:10:40 -05:00
fde3784e5a Added unicode charclass support within character classes; Fixed bugs with hex classes and unicode classes 2025-02-13 08:58:02 -05:00
7045711860 Convert test_str into a rune slice for better unicode compatibility, it also fixed the bug where all unicode characters wouldn't be colored 2025-02-13 08:57:06 -05:00
d4d606d95b Added tests for unicode character classes; more tests for hex characters 2025-02-13 08:55:12 -05:00
9cd330e521 More work on unicode character class support - fix bug where all characters aren't being matched 2025-02-12 23:04:10 -05:00
44d6a2005c Started working on unicode character classes 2025-02-12 22:19:30 -05:00
f76cd6c3d9 Merge pull request 'Implement Backreferences' (#3) from implementBackreferences into master
Reviewed-on: #3
2025-02-12 21:17:32 -06:00
375baa1722 Wrote more backreference tests 2025-02-12 07:51:20 -05:00
2e47c631bb Updated documentation to include backreferences 2025-02-12 07:50:59 -05:00
81b8b1b11c Do not validate a backreference if the group that it refers to is not valid 2025-02-11 19:12:58 -05:00
2934e7a20f Wrote tests for backreferences 2025-02-11 19:12:40 -05:00
f466d4a8d5 More progress on backreference implementation 2025-02-11 17:06:39 -05:00
8327450dd2 Started implementing backreferences (octal values should now be prefaced with \0) 2025-02-11 16:14:54 -05:00
073f231b89 Added function and examples for ReplaceAllFunc() 2025-02-10 21:35:51 -05:00
3b7257c921 Wrote function and example for ReplaceAllLiteral() 2025-02-10 21:25:49 -05:00
668df8b70a Wrote MarshalText() and UnmarshalText() to implement TextMarshaler and TextUnmarshaler 2025-02-10 12:30:48 -05:00
214acf7e0f Wrote example for ReplaceAll(); fixed out-of-bounds bug in Expand() 2025-02-10 12:30:17 -05:00
50221ff4d9 Wrote ReplaceAll(), to replace all matches of the regex with a given string 2025-02-10 12:29:54 -05:00
5ab95f512a Updated docs 2025-02-10 09:36:00 -05:00
e7da678408 Removed obsolete documentation 2025-02-10 09:35:16 -05:00
ab363e2766 Rewrote test for 'FindString()' to use lookarounds 2025-02-10 09:24:47 -05:00
c803e45415 Added example for 'FindStringSubmatch()' 2025-02-10 09:19:24 -05:00
525296f239 Added examples for 'FindAllString()' , 'FindAllSubmatch()' and 'FindAllStringSubmatch()' 2025-02-10 09:10:39 -05:00
eb0ab9f7ec Wrote test for FindAllStringSubmatch() 2025-02-10 08:39:20 -05:00
17a7dbae4c Wrote FindAllStringSubmatch() 2025-02-10 08:39:10 -05:00
f2279acd98 Fixed mistake in docs 2025-02-10 08:12:09 -05:00
662527c478 Merge pull request 'Implement PCRE Matching (prefer left-branch)' (#2) from implementPCREMatchingRules into master
Reviewed-on: #2
2025-02-09 15:24:26 -06:00
d1958f289c Commented out tests that would only pass with Longest() 2025-02-09 16:08:16 -05:00
15ee49f42e Rename method receivers from 'regex' to 're' (it's shorter) 2025-02-09 15:51:46 -05:00
b60ded4136 Don't break when a match is found, if we are looking for the longest match 2025-02-09 15:48:33 -05:00
9fbb99f86c Wrote example for Longest() 2025-02-09 15:47:57 -05:00
af15904f3b Updated documentation 2025-02-09 15:41:13 -05:00
d522f50b50 Wrote new example functions 2025-02-09 15:40:59 -05:00
fb47e082eb Wrote new methods Expand() and preferLongest(); Use new function signatures (with preferLongest); only characters should be added to next state list 2025-02-09 15:40:39 -05:00
1f5a363539 Use new function signatures (with preferLongest) 2025-02-09 15:39:09 -05:00
9e12f9dcb3 Added field to Reg, denoting if we prefer longest match (POSIX style) or not (perl style) 2025-02-09 15:38:26 -05:00
47f88c817f Fixed typo 2025-02-09 15:14:17 -05:00
835d495990 Removed capitalization for error message (staticcheck) 2025-02-09 09:14:45 -05:00
76e0170cb9 Removed unused function 2025-02-09 09:13:52 -05:00
d172a58258 Throw error if match isn't found but test.result has >0 elements 2025-02-09 09:13:29 -05:00
7231169270 Removed unused functions 2025-02-09 09:13:03 -05:00
e546f01c20 Removed redundant return (staticcheck) 2025-02-09 09:12:55 -05:00
b7467a00f1 Removed priorityQueue (unused) 2025-02-09 09:07:43 -05:00
c6ad4caa0d Removed a bunch of unused code (let's go!!!) 2025-02-09 09:06:40 -05:00
6334435b83 Updated tests since the engine uses Perl matching instead of POSIX matching; added tests for FindStringSubmatch 2025-02-09 09:01:42 -05:00
78fb5606dd Use new definition of Reg 2025-02-09 08:59:16 -05:00
eddd2ae700 Updated documentation 2025-02-09 08:58:58 -05:00
c577064977 Added string field to Reg, that contains the expression string; wrote method to return the string 2025-02-09 08:58:46 -05:00
d4e3942d27 Added Match() and FindStringSubmatch(); removed old code; updated comments 2025-02-09 08:58:09 -05:00
12 changed files with 889 additions and 655 deletions

View File

@@ -129,6 +129,8 @@ func main() {
matchIndices = regComp.FindAllSubmatch(test_str) matchIndices = regComp.FindAllSubmatch(test_str)
} }
test_str_runes := []rune(test_str) // Converting to runes preserves unicode characters
if *printMatchesFlag { if *printMatchesFlag {
// if we are in single line mode, print the line on which // if we are in single line mode, print the line on which
// the matches occur // the matches occur
@@ -158,10 +160,10 @@ func main() {
oldIndices := indicesToPrint.values() oldIndices := indicesToPrint.values()
indicesToPrint = new_uniq_arr[int]() indicesToPrint = new_uniq_arr[int]()
// Explanation: // Explanation:
// Find all numbers from 0 to len(test_str) that are NOT in oldIndices. // Find all numbers from 0 to len(test_str_runes) that are NOT in oldIndices.
// These are the values we want to print, now that we have inverted the match. // These are the values we want to print, now that we have inverted the match.
// Re-initialize indicesToPrint and add all of these values to it. // Re-initialize indicesToPrint and add all of these values to it.
indicesToPrint.add(setDifference(genRange(0, len(test_str)), oldIndices)...) indicesToPrint.add(setDifference(genRange(0, len(test_str_runes)), oldIndices)...)
} }
// If lineFlag is enabled, we should only print something if: // If lineFlag is enabled, we should only print something if:
@@ -182,7 +184,7 @@ func main() {
// the corresponding end index. // the corresponding end index.
// 3. If not, just print the character. // 3. If not, just print the character.
if substituteFlagEnabled { if substituteFlagEnabled {
for i := range test_str { for i := range test_str_runes {
inMatchIndex := false inMatchIndex := false
for _, m := range matchIndices { for _, m := range matchIndices {
if i == m[0].StartIdx { if i == m[0].StartIdx {
@@ -193,11 +195,11 @@ func main() {
} }
} }
if !inMatchIndex { if !inMatchIndex {
fmt.Fprintf(out, "%c", test_str[i]) fmt.Fprintf(out, "%c", test_str_runes[i])
} }
} }
} else { } else {
for i, c := range test_str { for i, c := range test_str_runes {
if indicesToPrint.contains(i) { if indicesToPrint.contains(i) {
color.New(color.FgRed).Fprintf(out, "%c", c) color.New(color.FgRed).Fprintf(out, "%c", c)
// Newline after every match - only if -o is enabled and -v is disabled. // Newline after every match - only if -o is enabled and -v is disabled.

View File

@@ -16,7 +16,6 @@ func (s *uniq_arr[T]) add(vals ...T) {
s.backingMap[item] = struct{}{} s.backingMap[item] = struct{}{}
} }
} }
return
} }
func (s uniq_arr[T]) contains(val T) bool { func (s uniq_arr[T]) contains(val T) bool {

View File

@@ -12,16 +12,43 @@ var notDotChars []rune
// A Reg represents the result of compiling a regular expression. It contains // A Reg represents the result of compiling a regular expression. It contains
// the startState of the NFA representation of the regex, and the number of capturing // the startState of the NFA representation of the regex, and the number of capturing
// groups in the regex. // groups in the regex. It also contains the expression string.
type Reg struct { type Reg struct {
start *nfaState start *nfaState
numGroups int numGroups int
str string
preferLongest bool
} }
// numSubexp eturns the number of sub-expressions in the given [Reg]. This is equivalent // NumSubexp returns the number of sub-expressions in the given [Reg]. This is equivalent
// to the number of capturing groups. // to the number of capturing groups.
func (r Reg) NumSubexp() int { func (re Reg) NumSubexp() int {
return r.numGroups return re.numGroups
}
// String returns the expression string stored in the [Reg], i.e. the source
// text that the expression was compiled from.
func (re Reg) String() string {
return re.str
}
// MarshalText implements [encoding.TextMarshaler]. The marshaled form is the
// expression string, exactly as returned by [Reg.String]. Flags passed at
// compile time, and the effect of calling [Reg.Longest], are not encoded and
// are therefore lost.
func (re *Reg) MarshalText() ([]byte, error) {
	exprText := re.String()
	return []byte(exprText), nil
}
// UnmarshalText implements [encoding.TextUnmarshaler]. It compiles the given
// byte-slice with [Compile]. On success the compiled expression replaces the
// value that re points to; on failure re is left untouched. The error from
// [Compile] is returned as-is.
func (re *Reg) UnmarshalText(text []byte) error {
	compiled, err := Compile(string(text))
	if err != nil {
		return err
	}
	*re = compiled
	return nil
}
func (re *Reg) Longest() {
re.preferLongest = true
} }
const concatRune rune = 0xF0001 const concatRune rune = 0xF0001
@@ -81,6 +108,48 @@ func getPOSIXClass(str []rune) (bool, string) {
return true, rtv return true, rtv
} }
// isUnicodeCharClassLetter reports whether c is one of the single-letter
// unicode character class names: L, M, S, N, P, C or Z.
func isUnicodeCharClassLetter(c rune) bool {
	classLetters := []rune("LMSNPCZ")
	return slices.Index(classLetters, c) != -1
}
// rangeTableToRuneSlice expands the given range table into a slice holding
// every code point it describes, first the 16-bit ranges and then the 32-bit
// ranges, each in table order.
//
// The loop counters are deliberately wider than the table entries. Stepping a
// uint16 counter past 0xFFFF wraps around to a small value, which would either
// never terminate (a range ending at 0xFFFF) or emit code points that are not
// in the table (a strided range ending just below 0xFFFF).
func rangeTableToRuneSlice(rangetable *unicode.RangeTable) []rune {
	var rtv []rune
	for _, r := range rangetable.R16 {
		// A rune is 32 bits wide, so Hi+Stride can never overflow here.
		lo, hi, stride := rune(r.Lo), rune(r.Hi), rune(r.Stride)
		for c := lo; c <= hi; c += stride {
			rtv = append(rtv, c)
		}
	}
	for _, r := range rangetable.R32 {
		// Same reasoning as above, one size up.
		lo, hi, stride := uint64(r.Lo), uint64(r.Hi), uint64(r.Stride)
		for c := lo; c <= hi; c += stride {
			rtv = append(rtv, rune(c))
		}
	}
	return rtv
}
// unicodeCharClassToRange converts the given unicode character class name into
// a list of the characters in that class. The name may be a general category
// (a single letter such as 'C', or two letters such as 'Lu') or a script name
// (such as 'Greek').
//
// It returns an error if the name is empty, or if it names neither a category
// nor a script.
func unicodeCharClassToRange(class string) ([]rune, error) {
	if len(class) == 0 {
		return nil, fmt.Errorf("empty unicode character class")
	}
	isShort := len(class) <= 2
	// General categories only have one- or two-letter names.
	if isShort {
		if rangeTable, ok := unicode.Categories[class]; ok {
			return rangeTableToRuneSlice(rangeTable), nil
		}
	}
	// Script names are usually longer, but not always ("Yi" is a two-letter
	// script), so every name that is not a category is also tried as a script.
	if rangeTable, ok := unicode.Scripts[class]; ok {
		return rangeTableToRuneSlice(rangeTable), nil
	}
	if isShort {
		return nil, fmt.Errorf("invalid short unicode character class")
	}
	return nil, fmt.Errorf("invalid long unicode character class")
}
// Stores whether the case-insensitive flag has been enabled. // Stores whether the case-insensitive flag has been enabled.
var caseInsensitive bool var caseInsensitive bool
@@ -282,17 +351,44 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
} }
} else if isHex(re_runes[i]) { } else if isHex(re_runes[i]) {
re_postfix = append(re_postfix, re_runes[i:i+2]...) re_postfix = append(re_postfix, re_runes[i:i+2]...)
i += 2 i += 1 // I don't skip forward 2 steps, because the second step will happen with the loop increment
} else { } else {
return nil, fmt.Errorf("invalid hex value in expression") return nil, fmt.Errorf("invalid hex value in expression")
} }
} else if isOctal(re_runes[i]) { } else if re_runes[i] == 'p' || re_runes[i] == 'P' { // Unicode character class (P is negated unicode charclass)
re_postfix = append(re_postfix, re_runes[i])
i++
if i >= len(re_runes) {
return nil, fmt.Errorf("error parsing unicode character class in expression")
}
if re_runes[i] == '{' { // Full name charclass
for re_runes[i] != '}' {
re_postfix = append(re_postfix, re_runes[i])
i++
}
re_postfix = append(re_postfix, re_runes[i])
i++
} else if isUnicodeCharClassLetter(re_runes[i]) {
re_postfix = append(re_postfix, re_runes[i])
i++
} else {
return nil, fmt.Errorf("error parsing unicode character class in expression")
}
i-- // The loop increment at the top will move us forward
} else if re_runes[i] == '0' { // Start of octal value
numDigits := 1 numDigits := 1
for i+numDigits < len(re_runes) && numDigits < 3 && isOctal(re_runes[i+numDigits]) { // Skip while we see an octal character (max of 3) for i+numDigits < len(re_runes) && numDigits < 4 && isOctal(re_runes[i+numDigits]) { // Skip while we see an octal character (max of 4, starting with 0)
numDigits++ numDigits++
} }
re_postfix = append(re_postfix, re_runes[i:i+numDigits]...) re_postfix = append(re_postfix, re_runes[i:i+numDigits]...)
i += (numDigits - 1) // I have to move back a step, so that I can add a concatenation operator if necessary, and so that the increment at the bottom of the loop works as intended i += (numDigits - 1) // I have to move back a step, so that I can add a concatenation operator if necessary, and so that the increment at the bottom of the loop works as intended
} else if unicode.IsDigit(re_runes[i]) { // Any other number - backreference
numDigits := 1
for i+numDigits < len(re_runes) && unicode.IsDigit(re_runes[i+numDigits]) { // Skip while we see a digit
numDigits++
}
re_postfix = append(re_postfix, re_runes[i:i+numDigits]...)
i += (numDigits - 1) // Move back a step to add concatenation operator
} else { } else {
re_postfix = append(re_postfix, re_runes[i]) re_postfix = append(re_postfix, re_runes[i])
} }
@@ -338,6 +434,8 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
// Actual algorithm // Actual algorithm
numOpenParens := 0 // Number of open parentheses numOpenParens := 0 // Number of open parentheses
parenIndices := make([]Group, 0) // I really shouldn't be using Group here, because that's strictly for matching purposes, but its a convenient way to store the indices of the opening and closing parens.
parenIndices = append(parenIndices, Group{0, 0}) // I append a weird value here, because the 0-th group doesn't have any parens. This way, the 1st group will be at index 1, 2nd at 2 ...
for i := 0; i < len(re_postfix); i++ { for i := 0; i < len(re_postfix); i++ {
/* Two cases: /* Two cases:
1. Current character is alphanumeric - send to output queue 1. Current character is alphanumeric - send to output queue
@@ -393,11 +491,44 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
} else { } else {
return nil, fmt.Errorf("not enough hex characters found in expression") return nil, fmt.Errorf("not enough hex characters found in expression")
} }
} else if isOctal(re_postfix[i]) { // Octal value } else if re_postfix[i] == 'p' || re_postfix[i] == 'P' {
charClassInverted := (re_postfix[i] == 'P')
charsInClass := []rune{}
i++
if isUnicodeCharClassLetter(re_postfix[i]) {
var err error
charsInClass, err = unicodeCharClassToRange(string(re_postfix[i]))
if err != nil {
return nil, err
}
} else if re_postfix[i] == '{' {
i++ // Skip opening bracket
unicodeCharClassStr := ""
for re_postfix[i] != '}' {
unicodeCharClassStr += string(re_postfix[i])
i++
}
var err error
charsInClass, err = unicodeCharClassToRange(unicodeCharClassStr)
if err != nil {
return nil, err
}
} else {
return nil, fmt.Errorf("error parsing unicode character class in expression")
}
var toAppend postfixNode
if !charClassInverted { // \p
toAppend = newPostfixNode(charsInClass...)
} else { // \P
toAppend = newPostfixDotNode()
toAppend.except = append([]postfixNode{}, newPostfixNode(charsInClass...))
}
outQueue = append(outQueue, toAppend)
} else if re_postfix[i] == '0' { // Octal value
var octVal int64 var octVal int64
var octValStr string var octValStr string
numDigitsParsed := 0 numDigitsParsed := 0
for (i+numDigitsParsed) < len(re_postfix) && isOctal(re_postfix[i+numDigitsParsed]) && numDigitsParsed <= 3 { for (i+numDigitsParsed) < len(re_postfix) && isOctal(re_postfix[i+numDigitsParsed]) && numDigitsParsed <= 4 {
octValStr += string(re_postfix[i+numDigitsParsed]) octValStr += string(re_postfix[i+numDigitsParsed])
numDigitsParsed++ numDigitsParsed++
} }
@@ -410,6 +541,20 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
} }
i += numDigitsParsed - 1 // Shift forward by the number of digits that were parsed. Move back one character, because the loop increment will move us back to the next character automatically i += numDigitsParsed - 1 // Shift forward by the number of digits that were parsed. Move back one character, because the loop increment will move us back to the next character automatically
outQueue = append(outQueue, newPostfixCharNode(rune(octVal))) outQueue = append(outQueue, newPostfixCharNode(rune(octVal)))
} else if unicode.IsDigit(re_postfix[i]) { // Backreference
var num int64
var numStr string
numDigitsParsed := 0
for (i+numDigitsParsed) < len(re_postfix) && unicode.IsDigit(re_postfix[i+numDigitsParsed]) {
numStr += string(re_postfix[i+numDigitsParsed])
numDigitsParsed++
}
num, err := strconv.ParseInt(numStr, 10, 32)
if err != nil {
return nil, fmt.Errorf("error parsing backreference in expresion")
}
i += numDigitsParsed - 1
outQueue = append(outQueue, newPostfixBackreferenceNode(int(num)))
} else { } else {
escapedNode, err := newEscapedNode(re_postfix[i], false) escapedNode, err := newEscapedNode(re_postfix[i], false)
if err != nil { if err != nil {
@@ -561,11 +706,44 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
} else { } else {
return nil, fmt.Errorf("not enough hex characters found in character class") return nil, fmt.Errorf("not enough hex characters found in character class")
} }
} else if isOctal(re_postfix[i]) { // Octal value } else if re_postfix[i] == 'p' || re_postfix[i] == 'P' {
charClassInverted := (re_postfix[i] == 'P')
charsInList := []rune{}
i++
if isUnicodeCharClassLetter(re_postfix[i]) {
var err error
charsInList, err = unicodeCharClassToRange(string(re_postfix[i]))
if err != nil {
return nil, err
}
} else if re_postfix[i] == '{' {
i++ // Skip opening bracket
unicodeCharClassStr := ""
for re_postfix[i] != '}' {
unicodeCharClassStr += string(re_postfix[i])
i++
}
var err error
charsInList, err = unicodeCharClassToRange(unicodeCharClassStr)
if err != nil {
return nil, err
}
} else {
return nil, fmt.Errorf("error parsing unicode character class in expression")
}
if !charClassInverted {
chars = append(chars, newPostfixNode(charsInList...))
} else {
toAppend := newPostfixDotNode()
toAppend.except = append([]postfixNode{}, newPostfixNode(charsInList...))
chars = append(chars, toAppend)
}
} else if re_postfix[i] == '0' { // Octal value
var octVal int64 var octVal int64
var octValStr string var octValStr string
numDigitsParsed := 0 numDigitsParsed := 0
for (i+numDigitsParsed) < len(re_postfix)-1 && isOctal(re_postfix[i+numDigitsParsed]) && numDigitsParsed <= 3 { // The '-1' exists, because even in the worst case (the character class extends till the end), the last character must be a closing bracket (and nothing else) for (i+numDigitsParsed) < len(re_postfix)-1 && isOctal(re_postfix[i+numDigitsParsed]) && numDigitsParsed <= 4 { // The '-1' exists, because even in the worst case (the character class extends till the end), the last character must be a closing bracket (and nothing else)
octValStr += string(re_postfix[i+numDigitsParsed]) octValStr += string(re_postfix[i+numDigitsParsed])
numDigitsParsed++ numDigitsParsed++
} }
@@ -769,6 +947,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
outQueue = append(outQueue, newPostfixNode(c)) outQueue = append(outQueue, newPostfixNode(c))
} }
numOpenParens++ numOpenParens++
parenIndices = append(parenIndices, Group{StartIdx: len(outQueue) - 1}) // Push the index of the lparen into parenIndices
} }
if c == ')' { if c == ')' {
// Keep popping from opStack until we encounter an opening parantheses or a NONCAPLPAREN_CHAR. Throw error if we reach the end of the stack. // Keep popping from opStack until we encounter an opening parantheses or a NONCAPLPAREN_CHAR. Throw error if we reach the end of the stack.
@@ -785,6 +964,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
if val == '(' { // Whatever was inside the parentheses was a _capturing_ group, so we append the closing parentheses as well if val == '(' { // Whatever was inside the parentheses was a _capturing_ group, so we append the closing parentheses as well
outQueue = append(outQueue, newPostfixNode(')')) // Add closing parentheses outQueue = append(outQueue, newPostfixNode(')')) // Add closing parentheses
} }
parenIndices[numOpenParens].EndIdx = len(outQueue) - 1
numOpenParens-- numOpenParens--
} }
} }
@@ -799,6 +979,11 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
return nil, fmt.Errorf("imbalanced parantheses") return nil, fmt.Errorf("imbalanced parantheses")
} }
// outQueue, _, err := rewriteBackreferences(outQueue, parenIndices)
// if err != nil {
// return nil, err
// }
return outQueue, nil return outQueue, nil
} }
@@ -1010,6 +1195,21 @@ func thompson(re []postfixNode) (Reg, error) {
}) })
nfa = append(nfa, toAdd) nfa = append(nfa, toAdd)
} }
if c.nodetype == backreferenceNode {
if c.referencedGroup > numGroups {
return Reg{}, fmt.Errorf("invalid backreference")
}
stateToAdd := &nfaState{}
stateToAdd.assert = noneAssert
stateToAdd.content = newContents(epsilon)
stateToAdd.isEmpty = true
stateToAdd.isBackreference = true
stateToAdd.output = make([]*nfaState, 0)
stateToAdd.output = append(stateToAdd.output, stateToAdd)
stateToAdd.referredGroup = c.referencedGroup
stateToAdd.threadBackref = 0
nfa = append(nfa, stateToAdd)
}
// Must be an operator if it isn't a character // Must be an operator if it isn't a character
switch c.nodetype { switch c.nodetype {
case concatenateNode: case concatenateNode:
@@ -1128,7 +1328,8 @@ func thompson(re []postfixNode) (Reg, error) {
concatenate(nfa[0], &lastState) concatenate(nfa[0], &lastState)
return Reg{nfa[0], numGroups}, nil // The string is empty here, because we add it in Compile()
return Reg{nfa[0], numGroups, "", false}, nil
} }
@@ -1146,10 +1347,11 @@ func Compile(re string, flags ...ReFlag) (Reg, error) {
if err != nil { if err != nil {
return Reg{}, fmt.Errorf("error compiling regex: %w", err) return Reg{}, fmt.Errorf("error compiling regex: %w", err)
} }
reg.str = re
return reg, nil return reg, nil
} }
// MustCompile panicks if Compile returns an error. They are identical in all other respects. // MustCompile panics if Compile returns an error. They are identical in all other respects.
func MustCompile(re string, flags ...ReFlag) Reg { func MustCompile(re string, flags ...ReFlag) Reg {
reg, err := Compile(re, flags...) reg, err := Compile(re, flags...)
if err != nil { if err != nil {

View File

@@ -4,6 +4,8 @@ Package regex implements regular expression search, using a custom non-bracktrac
The engine relies completely on UTF-8 codepoints. As such, it is capable of matching characters The engine relies completely on UTF-8 codepoints. As such, it is capable of matching characters
from other languages, emojis and symbols. from other languages, emojis and symbols.
The API and regex syntax are largely compatible with that of the stdlib's [regexp], with a few key differences (see 'Key Differences with regexp').
The full syntax is specified below. The full syntax is specified below.
# Syntax # Syntax
@@ -16,7 +18,7 @@ Single characters:
[^abc] Negated character class - match any character except a, b and c [^abc] Negated character class - match any character except a, b and c
[^a-z] Negated character range - do not match any character from a to z [^a-z] Negated character range - do not match any character from a to z
\[ Match a literal '['. Backslashes can escape any character with special meaning, including another backslash. \[ Match a literal '['. Backslashes can escape any character with special meaning, including another backslash.
\452 Match the character with the octal value 452 (up to 3 digits) \0452 Match the character with the octal value 452 (up to 4 digits, first digit must be 0)
\xFF Match the character with the hex value FF (exactly 2 characters) \xFF Match the character with the hex value FF (exactly 2 characters)
\x{0000FF} Match the character with the hex value 0000FF (exactly 6 characters) \x{0000FF} Match the character with the hex value 0000FF (exactly 6 characters)
\n Newline \n Newline
@@ -31,7 +33,7 @@ Perl classes:
\d Match any digit character ([0-9]) \d Match any digit character ([0-9])
\D Match any non-digit character ([^0-9]) \D Match any non-digit character ([^0-9])
\w Match any word character ([a-zA-Z0-9_]) \w Match any word character ([a-zA-Z0-9_])
\W Match any word character ([^a-zA-Z0-9_]) \W Match any non-word character ([^a-zA-Z0-9_])
\s Match any whitespace character ([ \t\n]) \s Match any whitespace character ([ \t\n])
\S Match any non-whitespace character ([^ \t\n]) \S Match any non-whitespace character ([^ \t\n])
@@ -55,8 +57,8 @@ POSIX classes (inside normal character classes):
Composition: Composition:
def Match d, followed by e, followed by f def Match d, followed by e, followed by f
x|y Match x or y (prefer longer one) x|y Match x or y (prefer x)
xy|z Match xy or z xy|z Match xy or z (prefer xy)
Repetition (always greedy, preferring more): Repetition (always greedy, preferring more):
@@ -91,34 +93,23 @@ Lookarounds:
(?<=x)y Positive lookbehind - Match y if preceded by x (?<=x)y Positive lookbehind - Match y if preceded by x
(?<!x)y Negative lookbehind - Match y if NOT preceded by x (?<!x)y Negative lookbehind - Match y if NOT preceded by x
Backreferences:
(xy)\1 Match 'xy' followed by the text most recently captured by group 1 (in this case, 'xy')
Numeric ranges: Numeric ranges:
<x-y> Match any number from x to y (inclusive) (x and y must be positive numbers) <x-y> Match any number from x to y (inclusive) (x and y must be positive numbers)
\<x Match a literal '<' followed by x
# Key Differences with regexp # Key Differences with regexp
The engine and the API differ from [regexp] in a number of ways, some of them very subtle. The engine and the API differ from [regexp] in a few ways, some of them very subtle.
The key differences are mentioned below. The key differences are mentioned below.
1. Greediness: 1. Greediness:
This engine does not support non-greedy operators. All operators are always greedy in nature, and will try This engine currently does not support non-greedy operators.
to match as much as they can, while still allowing for a successful match. For example, given the regex:
y*y
The engine will match as many 'y's as it can, while still allowing the trailing 'y' to be matched.
Another, more subtle example is the following regex:
x|xx
While the stdlib implementation (and most other engines) will prefer matching the first item of the alternation,
this engine will go for the longest possible match, regardless of the order of the alternation. Although this
strays from the convention, it results in a nice rule-of-thumb - the engine is ALWAYS greedy.
The stdlib implementation has a function [regexp.Regexp.Longest] which makes future searches prefer the longest match.
That is the default (and unchangable) behavior in this engine.
2. Byte-slices and runes: 2. Byte-slices and runes:
@@ -132,7 +123,7 @@ Rather than using primitives for return values, my engine defines two types that
values: a [Group] represents a capturing group, and a [Match] represents a list of groups. values: a [Group] represents a capturing group, and a [Match] represents a list of groups.
[regexp] specifies a regular expression that gives a list of all the matching functions that it supports. The [regexp] specifies a regular expression that gives a list of all the matching functions that it supports. The
equivalent expression for this engine is: equivalent expression for this engine is shown below. Note that 'Index' is the default.
Find(All)?(String)?(Submatch)? Find(All)?(String)?(Submatch)?
@@ -140,7 +131,7 @@ equivalent expression for this engine is:
If a function contains 'All' it returns all matches instead of just the leftmost one. If a function contains 'All' it returns all matches instead of just the leftmost one.
If a function contains 'String' it returns the matched text, rather than the indices. If a function contains 'String' it returns the matched text, rather than the index in the string.
If a function contains 'Submatch' it returns the match, including all submatches found by If a function contains 'Submatch' it returns the match, including all submatches found by
capturing groups. capturing groups.
@@ -156,5 +147,20 @@ and the input string:
The 0th group would contain 'xy' and the 1st group would contain 'y'. Any matching function without 'Submatch' in its name The 0th group would contain 'xy' and the 1st group would contain 'y'. Any matching function without 'Submatch' in its name
returns the 0-group. returns the 0-group.
# Feature Differences
The following features from [regexp] are (currently) NOT supported:
1. Named capturing groups
2. Non-greedy operators
3. Embedded flags (flags are instead passed as arguments to [Compile])
4. Literal text with \Q ... \E
The following features are not available in [regexp], but are supported in my engine:
1. Lookarounds
2. Numeric ranges
3. Backreferences
I hope to shorten the first list, and expand the second.
*/ */
package regex package regex

View File

@@ -2,6 +2,7 @@ package regex_test
import ( import (
"fmt" "fmt"
"strings"
"gitea.twomorecents.org/Rockingcool/kleingrep/regex" "gitea.twomorecents.org/Rockingcool/kleingrep/regex"
) )
@@ -32,12 +33,12 @@ func ExampleReg_FindAll() {
} }
func ExampleReg_FindString() { func ExampleReg_FindString() {
regexStr := `\d+` regexStr := `\w+\s+(?=sheep)`
regexComp := regex.MustCompile(regexStr) regexComp := regex.MustCompile(regexStr)
matchStr := regexComp.FindString("The year of our lord, 2025") matchStr := regexComp.FindString("pink cows and yellow sheep")
fmt.Println(matchStr) fmt.Println(matchStr)
// Output: 2025 // Output: yellow
} }
func ExampleReg_FindSubmatch() { func ExampleReg_FindSubmatch() {
@@ -52,3 +53,129 @@ func ExampleReg_FindSubmatch() {
// 0 1 // 0 1
// 2 3 // 2 3
} }
// ExampleReg_FindStringSubmatch matches a date with three capturing groups
// and prints the text of the first and the third group.
func ExampleReg_FindStringSubmatch() {
regexStr := `(\d{4})-(\d{2})-(\d{2})`
regexComp := regex.MustCompile(regexStr)
inputStr := `The date is 2025-02-10`
match := regexComp.FindStringSubmatch(inputStr)
fmt.Println(match[1])
fmt.Println(match[3])
// Output: 2025
// 10
}
// ExampleReg_FindAllSubmatch finds both decimal numbers in the input and
// prints the 0-group and the 1st group of each match.
func ExampleReg_FindAllSubmatch() {
regexStr := `(\d)\.(\d)(\d)`
regexComp := regex.MustCompile(regexStr)
matches := regexComp.FindAllSubmatch("3.14+8.97")
fmt.Println(matches[0][0]) // 0-group (entire match) of 1st match (0-indexed)
fmt.Println(matches[0][1]) // 1st group of 1st match
fmt.Println(matches[1][0]) // 0-group of 2nd match
fmt.Println(matches[1][1]) // 1st group of 2nd match
// Output: 0 4
// 0 1
// 5 9
// 5 6
}
// ExampleReg_FindAllString uses numeric ranges to find every dotted quad in
// the input whose four fields each lie between 0 and 255, and prints the
// matched text.
func ExampleReg_FindAllString() {
regexStr := `<0-255>\.<0-255>\.<0-255>\.<0-255>`
inputStr := `192.168.220.7 pings 9.9.9.9`
regexComp := regex.MustCompile(regexStr)
matchStrs := regexComp.FindAllString(inputStr)
fmt.Println(matchStrs[0])
fmt.Println(matchStrs[1])
// Output: 192.168.220.7
// 9.9.9.9
}
// ExampleReg_FindAllStringSubmatch extracts the host and the path of every
// https URL in the input, compiling the expression case-insensitively.
func ExampleReg_FindAllStringSubmatch() {
// 'https://' ...
// followed by 1 or more alphanumeric characters (including period) ...
// then a forward slash ...
// followed by one or more of:
// word character,
// question mark,
// period,
// equals sign
regexStr := `https://([a-z0-9\.]+)/([\w.?=]+)`
regexComp := regex.MustCompile(regexStr, regex.RE_CASE_INSENSITIVE)
inputStr := `You can find me at https://twomorecents.org/index.html and https://news.ycombinator.com/user?id=aadhavans`
matchIndices := regexComp.FindAllStringSubmatch(inputStr)
fmt.Println(matchIndices[0][1]) // 1st group of 1st match (0-indexed)
fmt.Println(matchIndices[0][2]) // 2nd group of 1st match
fmt.Println(matchIndices[1][1]) // 1st group of 2nd match
fmt.Println(matchIndices[1][2]) // 2nd group of 2nd match
// Output: twomorecents.org
// index.html
// news.ycombinator.com
// user?id=aadhavans
}
// ExampleReg_Expand rewrites every "key: value" pair of the input as
// "key = value" by expanding a template once per match and accumulating
// the result.
func ExampleReg_Expand() {
	src := `option1: value1
option2: value2`
	const pattern = `(\w+): (\w+)`
	const tmpl = "$1 = $2\n"
	re := regex.MustCompile(pattern, regex.RE_MULTILINE)
	var out string
	matches := re.FindAllSubmatch(src)
	for i := range matches {
		// Each call appends the expanded template to what was built so far.
		out = re.Expand(out, tmpl, src, matches[i])
	}
	fmt.Println(out)
	// Output: option1 = value1
	// option2 = value2
}
// ExampleReg_LiteralPrefix prints the literal prefix of an expression and
// whether that prefix makes up the whole expression.
func ExampleReg_LiteralPrefix() {
	re := regex.MustCompile(`a(b|c)d*`)
	literal, isWholeExpr := re.LiteralPrefix()
	fmt.Println(literal)
	fmt.Println(isWholeExpr)
	// Output: a
	// false
}
// ExampleReg_Longest runs the same search before and after calling Longest,
// showing that the reported match changes from "x" to "xx".
func ExampleReg_Longest() {
	const text = "xx"
	re := regex.MustCompile(`x|xx`)
	before := re.FindString(text)
	fmt.Println(before)
	re.Longest()
	after := re.FindString(text)
	fmt.Println(after)
	// Output: x
	// xx
}
// ExampleReg_ReplaceAll swaps the digit and the word character of every
// match by referring to the capturing groups in reverse order.
func ExampleReg_ReplaceAll() {
	re := regex.MustCompile(`(\d)(\w)`)
	swapped := re.ReplaceAll("5d9t", `$2$1`)
	fmt.Println(swapped)
	// Output: d5t9
}
// ExampleReg_ReplaceAllLiteral replaces every animal matched by the
// expression with a fixed word.
func ExampleReg_ReplaceAllLiteral() {
	const sentence = "the quick brown fox jumped over the lazy dog"
	re := regex.MustCompile(`fox|dog`)
	replaced := re.ReplaceAllLiteral(sentence, `duck`)
	fmt.Println(replaced)
	// Output: the quick brown duck jumped over the lazy duck
}
// ExampleReg_ReplaceAllFunc upper-cases every run of five or more word
// characters by passing each match through strings.ToUpper.
func ExampleReg_ReplaceAllFunc() {
	const sentence = `all five or more letter words in this string are capitalized`
	re := regex.MustCompile(`\w{5,}`)
	shouted := re.ReplaceAllFunc(sentence, strings.ToUpper)
	fmt.Println(shouted)
	// Output: all five or more LETTER WORDS in this STRING are CAPITALIZED
}

View File

@@ -2,7 +2,8 @@ package regex
import ( import (
"fmt" "fmt"
"sort" "strconv"
"unicode"
) )
// A Match represents a match found by the regex in a given string. // A Match represents a match found by the regex in a given string.
@@ -14,7 +15,7 @@ import (
// See [Reg.FindSubmatch] for an example. // See [Reg.FindSubmatch] for an example.
type Match []Group type Match []Group
// a Group represents a group. It contains the start index and end index of the match // a Group represents a capturing group. It contains the start and end index of the group.
type Group struct { type Group struct {
StartIdx int StartIdx int
EndIdx int EndIdx int
@@ -29,17 +30,6 @@ func newMatch(size int) Match {
return toRet return toRet
} }
// numValidGroups counts the groups in the match whose start and end
// indices are both non-negative.
func (m Match) numValidGroups() int {
	count := 0
	for i := range m {
		// A negative index on either side marks the group as not valid.
		if m[i].StartIdx < 0 || m[i].EndIdx < 0 {
			continue
		}
		count++
	}
	return count
}
// Returns a string containing the indices of all (valid) groups in the match // Returns a string containing the indices of all (valid) groups in the match
func (m Match) String() string { func (m Match) String() string {
var toRet string var toRet string
@@ -58,7 +48,7 @@ func (idx Group) String() string {
return fmt.Sprintf("%d\t%d", idx.StartIdx, idx.EndIdx) return fmt.Sprintf("%d\t%d", idx.StartIdx, idx.EndIdx)
} }
// Returns whether a group is valid (ie. whether it matched any text). It // IsValid returns whether a group is valid (ie. whether it matched any text). It
// simply ensures that both indices of the group are >= 0. // simply ensures that both indices of the group are >= 0.
func (g Group) IsValid() bool { func (g Group) IsValid() bool {
return g.StartIdx >= 0 && g.EndIdx >= 0 return g.StartIdx >= 0 && g.EndIdx >= 0
@@ -69,105 +59,42 @@ func getZeroGroup(m Match) Group {
return m[0] return m[0]
} }
// takeZeroState takes the 0-state (if such a transition exists) for all states in the
// given slice. It returns the resulting states. If any of the resulting states is a 0-state,
// the second ret val is true.
// If a state begins or ends a capturing group, its 'thread' is updated to contain the correct index.
//func takeZeroState(states []*nfaState, numGroups int, idx int) (rtv []*nfaState, isZero bool) {
// for _, state := range states {
// if len(state.transitions[epsilon]) > 0 {
// for _, s := range state.transitions[epsilon] {
// if s.threadGroups == nil {
// s.threadGroups = newMatch(numGroups + 1)
// }
// copy(s.threadGroups, state.threadGroups)
// if s.groupBegin {
// s.threadGroups[s.groupNum].StartIdx = idx
// // openParenGroups = append(openParenGroups, s.groupNum)
// }
// if s.groupEnd {
// s.threadGroups[s.groupNum].EndIdx = idx
// // closeParenGroups = append(closeParenGroups, s.groupNum)
// }
// }
// rtv = append(rtv, state.transitions[epsilon]...)
// }
// }
// for _, state := range rtv {
// if len(state.transitions[epsilon]) > 0 {
// return rtv, true
// }
// }
// return rtv, false
//}
// zeroMatchPossible returns true if a zero-length match is possible
// from any of the given states, given the string and our position in it.
// It uses the same algorithm to find zero-states as the one inside the loop,
// so I should probably put it in a function.
//func zeroMatchPossible(str []rune, idx int, numGroups int, states ...*nfaState) bool {
// zeroStates, isZero := takeZeroState(states, numGroups, idx)
// tempstates := make([]*nfaState, 0, len(zeroStates)+len(states))
// tempstates = append(tempstates, states...)
// tempstates = append(tempstates, zeroStates...)
// num_appended := 0 // number of unique states addded to tempstates
// for isZero == true {
// zeroStates, isZero = takeZeroState(tempstates, numGroups, idx)
// tempstates, num_appended = uniqueAppend(tempstates, zeroStates...)
// if num_appended == 0 { // break if we haven't appended any more unique values
// break
// }
// }
// for _, state := range tempstates {
// if state.isEmpty && (state.assert == noneAssert || state.checkAssertion(str, idx)) && state.isLast {
// return true
// }
// }
// return false
//}
// pruneIndices removes overlapping matches from indices, comparing matches
// by their 0-group.
//
// The slice is first sorted in place by the start index of each 0-group
// (the caller's slice is reordered). It is then scanned once while tracking
// a "current" match:
//   - a match that starts at or after the end of current does not overlap,
//     so current is emitted and the new match becomes current;
//   - a match that overlaps current but ends later replaces current;
//   - any other overlapping match is dropped.
//
// An empty or nil input is returned unchanged instead of panicking.
func pruneIndices(indices []Match) []Match {
	// Nothing to prune; also avoids indexing indices[0] below.
	if len(indices) == 0 {
		return indices
	}
	// First, sort the slice by the start indices
	sort.Slice(indices, func(i, j int) bool {
		return indices[i][0].StartIdx < indices[j][0].StartIdx
	})
	toRet := make([]Match, 0, len(indices))
	current := indices[0]
	for _, idx := range indices[1:] {
		switch {
		case idx[0].StartIdx >= current[0].EndIdx:
			// idx doesn't overlap with current (starts after current ends),
			// so add current to the result and update the current.
			toRet = append(toRet, current)
			current = idx
		case idx[0].EndIdx > current[0].EndIdx:
			// idx overlaps, but it ends later, so update current
			current = idx
		}
	}
	// The last tracked match has not been emitted yet.
	toRet = append(toRet, current)
	return toRet
}
func copyThread(to *nfaState, from nfaState) { func copyThread(to *nfaState, from nfaState) {
to.threadGroups = append([]Group{}, from.threadGroups...) to.threadGroups = append([]Group{}, from.threadGroups...)
} }
// Find returns the 0-group of the leftmost match of the regex in the given string. // Find returns the 0-group of the leftmost match of the regex in the given string.
// An error value != nil indicates that no match was found. // An error value != nil indicates that no match was found.
func (regex Reg) Find(str string) (Group, error) { func (re Reg) Find(str string) (Group, error) {
match, err := regex.FindNthMatch(str, 1) match, err := re.FindNthMatch(str, 1)
if err != nil { if err != nil {
return Group{}, fmt.Errorf("no matches found") return Group{}, fmt.Errorf("no matches found")
} }
return getZeroGroup(match), nil return getZeroGroup(match), nil
} }
// Match reports whether the regex finds at least one match in str.
// It runs [Reg.Find] and treats a nil error as "match found".
func (re Reg) Match(str string) bool {
	if _, err := re.Find(str); err != nil {
		return false
	}
	return true
}
// CompileMatch is a one-shot helper: it compiles expr with the optional
// flags (see [ReFlag]) and reports whether str contains a match of it.
// It plays the role that [regexp.MatchString] plays in the standard library.
//
// If expr fails to compile, the result is false and the compilation error
// is returned; otherwise the error is nil.
func CompileMatch(expr string, str string, flags ...ReFlag) (bool, error) {
	compiled, compileErr := Compile(expr, flags...)
	if compileErr != nil {
		return false, compileErr
	}
	found := compiled.Match(str)
	return found, nil
}
// FindAll returns a slice containing all the 0-groups of the regex in the given string. // FindAll returns a slice containing all the 0-groups of the regex in the given string.
// A 0-group represents the match without any submatches. // A 0-group represents the match without any submatches.
func (regex Reg) FindAll(str string) []Group { func (re Reg) FindAll(str string) []Group {
indices := regex.FindAllSubmatch(str) indices := re.FindAllSubmatch(str)
zeroGroups := funcMap(indices, getZeroGroup) zeroGroups := funcMap(indices, getZeroGroup)
return zeroGroups return zeroGroups
} }
@@ -176,8 +103,8 @@ func (regex Reg) FindAll(str string) []Group {
// The return value will be an empty string in two situations: // The return value will be an empty string in two situations:
// 1. No match was found // 1. No match was found
// 2. The match was an empty string // 2. The match was an empty string
func (regex Reg) FindString(str string) string { func (re Reg) FindString(str string) string {
match, err := regex.FindNthMatch(str, 1) match, err := re.FindNthMatch(str, 1)
if err != nil { if err != nil {
return "" return ""
} }
@@ -190,8 +117,8 @@ func (regex Reg) FindString(str string) string {
// number of groups. The validity of a group (whether or not it matched anything) can be determined with // number of groups. The validity of a group (whether or not it matched anything) can be determined with
// [Group.IsValid], or by checking that both indices of the group are >= 0. // [Group.IsValid], or by checking that both indices of the group are >= 0.
// The second-return value is nil if no match was found. // The second-return value is nil if no match was found.
func (regex Reg) FindSubmatch(str string) (Match, error) { func (re Reg) FindSubmatch(str string) (Match, error) {
match, err := regex.FindNthMatch(str, 1) match, err := re.FindNthMatch(str, 1)
if err != nil { if err != nil {
return Match{}, fmt.Errorf("no match found") return Match{}, fmt.Errorf("no match found")
} else { } else {
@@ -199,11 +126,41 @@ func (regex Reg) FindSubmatch(str string) (Match, error) {
} }
} }
// FindAllString is the 'all' version of FindString. // FindStringSubmatch is the 'string' version of [FindSubmatch]. It returns a slice of strings,
// where the string at index i contains the text matched by the i-th capturing group.
// The 0-th index represents the entire match.
// An empty string at index n could mean:
//
// 1. Group n did not find a match
// 2. Group n found a zero-length match
//
// A return value of nil indicates no match.
func (re Reg) FindStringSubmatch(str string) []string {
	match, err := re.FindSubmatch(str)
	if err != nil {
		return nil
	}
	// The matcher runs over []rune(str), so group indices are rune offsets.
	// Slicing str directly would treat them as byte offsets and return the
	// wrong text (or split a character) for non-ASCII input.
	strRunes := []rune(str)
	matchStr := make([]string, re.numGroups+1)
	validGroupFound := false
	for i, group := range match {
		if !group.IsValid() {
			// Groups that did not participate keep the empty string.
			continue
		}
		matchStr[i] = string(strRunes[group.StartIdx:group.EndIdx])
		validGroupFound = true
	}
	// No group (not even the 0-group) was valid: report it as no match.
	if !validGroupFound {
		return nil
	}
	return matchStr
}
// FindAllString is the 'all' version of [FindString].
// It returns a slice of strings containing the text of all matches of // It returns a slice of strings containing the text of all matches of
// the regex in the given string. // the regex in the given string.
func (regex Reg) FindAllString(str string) []string { func (re Reg) FindAllString(str string) []string {
zerogroups := regex.FindAll(str) zerogroups := re.FindAll(str)
matchStrs := funcMap(zerogroups, func(g Group) string { matchStrs := funcMap(zerogroups, func(g Group) string {
return str[g.StartIdx:g.EndIdx] return str[g.StartIdx:g.EndIdx]
}) })
@@ -212,14 +169,14 @@ func (regex Reg) FindAllString(str string) []string {
// FindNthMatch returns the 'n'th match of the regex in the given string. // FindNthMatch returns the 'n'th match of the regex in the given string.
// It returns an error (!= nil) if there are fewer than 'n' matches in the string. // It returns an error (!= nil) if there are fewer than 'n' matches in the string.
func (regex Reg) FindNthMatch(str string, n int) (Match, error) { func (re Reg) FindNthMatch(str string, n int) (Match, error) {
idx := 0 idx := 0
matchNum := 0 matchNum := 0
str_runes := []rune(str) str_runes := []rune(str)
var matchFound bool var matchFound bool
var matchIdx Match var matchIdx Match
for idx <= len(str_runes) { for idx <= len(str_runes) {
matchFound, matchIdx, idx = findAllSubmatchHelper(regex.start, str_runes, idx, regex.numGroups) matchFound, matchIdx, idx = findAllSubmatchHelper(re.start, str_runes, idx, re.numGroups, re.preferLongest)
if matchFound { if matchFound {
matchNum++ matchNum++
} }
@@ -232,26 +189,46 @@ func (regex Reg) FindNthMatch(str string, n int) (Match, error) {
} }
// FindAllSubmatch returns a slice of matches in the given string. // FindAllSubmatch returns a slice of matches in the given string.
func (regex Reg) FindAllSubmatch(str string) []Match { func (re Reg) FindAllSubmatch(str string) []Match {
idx := 0 idx := 0
str_runes := []rune(str) str_runes := []rune(str)
var matchFound bool var matchFound bool
var matchIdx Match var matchIdx Match
indices := make([]Match, 0) indices := make([]Match, 0)
for idx <= len(str_runes) { for idx <= len(str_runes) {
matchFound, matchIdx, idx = findAllSubmatchHelper(regex.start, str_runes, idx, regex.numGroups) matchFound, matchIdx, idx = findAllSubmatchHelper(re.start, str_runes, idx, re.numGroups, re.preferLongest)
if matchFound { if matchFound {
indices = append(indices, matchIdx) indices = append(indices, matchIdx)
} }
} }
if len(indices) > 0 {
return pruneIndices(indices)
}
return indices return indices
} }
func addStateToList(str []rune, idx int, list []nfaState, state nfaState, threadGroups []Group, visited []nfaState) []nfaState { // FindAllStringSubmatch returns a double-slice of strings. Each slice contains the text of a match, including all submatches.
// A return value of nil indicates no match.
func (re Reg) FindAllStringSubmatch(str string) [][]string {
	matches := re.FindAllSubmatch(str)
	if len(matches) == 0 {
		return nil
	}
	// The matcher runs over []rune(str), so group indices are rune offsets.
	// Convert once and slice the rune view; slicing str by these indices
	// would return the wrong text for non-ASCII input.
	strRunes := []rune(str)
	rtv := make([][]string, len(matches))
	for i, m := range matches {
		groupTexts := make([]string, len(m))
		for j, g := range m {
			// Groups that did not participate keep the empty string.
			if g.IsValid() {
				groupTexts[j] = string(strRunes[g.StartIdx:g.EndIdx])
			}
		}
		rtv[i] = groupTexts
	}
	return rtv
}
func addStateToList(str []rune, idx int, list []nfaState, state nfaState, threadGroups []Group, visited []nfaState, preferLongest bool) []nfaState {
if stateExists(list, state) || stateExists(visited, state) { if stateExists(list, state) || stateExists(visited, state) {
return list return list
} }
@@ -259,32 +236,34 @@ func addStateToList(str []rune, idx int, list []nfaState, state nfaState, thread
if state.isKleene || state.isQuestion { if state.isKleene || state.isQuestion {
copyThread(state.splitState, state) copyThread(state.splitState, state)
list = addStateToList(str, idx, list, *state.splitState, threadGroups, visited) list := addStateToList(str, idx, list, *state.splitState, threadGroups, visited, preferLongest)
copyThread(state.next, state) copyThread(state.next, state)
list = addStateToList(str, idx, list, *state.next, threadGroups, visited) list = addStateToList(str, idx, list, *state.next, threadGroups, visited, preferLongest)
return list return list
} }
if state.isAlternation { if state.isAlternation {
copyThread(state.next, state) copyThread(state.next, state)
list = addStateToList(str, idx, list, *state.next, threadGroups, visited) list := addStateToList(str, idx, list, *state.next, threadGroups, visited, preferLongest)
copyThread(state.splitState, state) copyThread(state.splitState, state)
list = addStateToList(str, idx, list, *state.splitState, threadGroups, visited) list = addStateToList(str, idx, list, *state.splitState, threadGroups, visited, preferLongest)
return list return list
} }
state.threadGroups = append([]Group{}, threadGroups...) state.threadGroups = append([]Group{}, threadGroups...)
if state.assert != noneAssert { if state.assert != noneAssert {
if state.checkAssertion(str, idx) { if state.checkAssertion(str, idx, preferLongest) {
copyThread(state.next, state) copyThread(state.next, state)
return addStateToList(str, idx, list, *state.next, state.threadGroups, visited) return addStateToList(str, idx, list, *state.next, state.threadGroups, visited, preferLongest)
} }
} }
if state.groupBegin { if state.groupBegin {
state.threadGroups[state.groupNum].StartIdx = idx state.threadGroups[state.groupNum].StartIdx = idx
return addStateToList(str, idx, list, *state.next, state.threadGroups, visited) copyThread(state.next, state)
return addStateToList(str, idx, list, *state.next, state.threadGroups, visited, preferLongest)
} }
if state.groupEnd { if state.groupEnd {
state.threadGroups[state.groupNum].EndIdx = idx state.threadGroups[state.groupNum].EndIdx = idx
return addStateToList(str, idx, list, *state.next, state.threadGroups, visited) copyThread(state.next, state)
return addStateToList(str, idx, list, *state.next, state.threadGroups, visited, preferLongest)
} }
return append(list, state) return append(list, state)
@@ -293,9 +272,7 @@ func addStateToList(str []rune, idx int, list []nfaState, state nfaState, thread
// Helper for FindAllMatches. Returns whether it found a match, the // Helper for FindAllMatches. Returns whether it found a match, the
// first Match it finds, and how far it got into the string ie. where // first Match it finds, and how far it got into the string ie. where
// the next search should start from. // the next search should start from.
// func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups int, preferLongest bool) (bool, Match, int) {
// Might return duplicates or overlapping indices, so care must be taken to prune the resulting array.
func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups int) (bool, Match, int) {
// Base case - exit if offset exceeds string's length // Base case - exit if offset exceeds string's length
if offset > len(str) { if offset > len(str) {
// The second value here shouldn't be used, because we should exit when the third return value is > than len(str) // The second value here shouldn't be used, because we should exit when the third return value is > than len(str)
@@ -303,53 +280,23 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
} }
resetThreads(start) resetThreads(start)
// Hold a list of match indices for the current run. When we
// can no longer find a match, the match with the largest range is
// chosen as the match for the entire string.
// This allows us to pick the longest possible match (which is how greedy matching works).
// COMMENT ABOVE IS CURRENTLY NOT UP-TO-DATE
// tempIndices := newMatch(numGroups + 1)
// foundPath := false
//startIdx := offset
//endIdx := offset
currentStates := make([]nfaState, 0) currentStates := make([]nfaState, 0)
nextStates := make([]nfaState, 0) nextStates := make([]nfaState, 0)
// tempStates := make([]*nfaState, 0) // Used to store states that should be used in next loop iteration
i := offset // Index in string i := offset // Index in string
//startingFrom := i // Store starting index
// If the first state is an assertion, makes sure the assertion // If the first state is an assertion, makes sure the assertion
// is true before we do _anything_ else. // is true before we do _anything_ else.
if start.assert != noneAssert { if start.assert != noneAssert {
if start.checkAssertion(str, offset) == false { if start.checkAssertion(str, offset, preferLongest) == false {
i++ i++
return false, []Group{}, i return false, []Group{}, i
} }
} }
// Increment until we hit a character matching the start state (assuming not 0-state)
// if start.isEmpty == false {
// for i < len(str) && !start.contentContains(str, i) {
// i++
// }
// startIdx = i
// startingFrom = i
// i++ // Advance to next character (if we aren't at a 0-state, which doesn't match anything), so that we can check for transitions. If we advance at a 0-state, we will never get a chance to match the first character
// }
// start.threadGroups = newMatch(numGroups + 1)
// Check if the start state begins a group - if so, add the start index to our list
//if start.groupBegin {
// start.threadGroups[start.groupNum].StartIdx = i
// tempIndices[start.groupNum].startIdx = i
//}
start.threadGroups = newMatch(numGroups + 1) start.threadGroups = newMatch(numGroups + 1)
start.threadGroups[0].StartIdx = i start.threadGroups[0].StartIdx = i
currentStates = addStateToList(str, i, currentStates, *start, start.threadGroups, nil) currentStates = addStateToList(str, i, currentStates, *start, start.threadGroups, nil, preferLongest)
var match Match = nil var match Match = nil
// var isEmptyAndNoAssertion bool
// Main loop
for idx := i; idx <= len(str); idx++ { for idx := i; idx <= len(str); idx++ {
if len(currentStates) == 0 { if len(currentStates) == 0 {
break break
@@ -365,82 +312,28 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
if currentState.isLast { if currentState.isLast {
currentState.threadGroups[0].EndIdx = idx currentState.threadGroups[0].EndIdx = idx
match = append([]Group{}, currentState.threadGroups...) match = append([]Group{}, currentState.threadGroups...)
if !preferLongest {
break break
} else if !currentState.isAlternation && !currentState.isKleene && !currentState.isQuestion && !currentState.groupBegin && !currentState.groupEnd { // Normal character or assertion }
if currentState.contentContains(str, idx) { } else if !currentState.isAlternation && !currentState.isKleene && !currentState.isQuestion && !currentState.isBackreference && !currentState.groupBegin && !currentState.groupEnd && currentState.assert == noneAssert { // Normal character
nextStates = addStateToList(str, idx+1, nextStates, *currentState.next, currentState.threadGroups, nil) if currentState.contentContains(str, idx, preferLongest) {
nextStates = addStateToList(str, idx+1, nextStates, *currentState.next, currentState.threadGroups, nil, preferLongest)
}
} else if currentState.isBackreference && currentState.threadGroups[currentState.referredGroup].IsValid() {
groupLength := currentState.threadGroups[currentState.referredGroup].EndIdx - currentState.threadGroups[currentState.referredGroup].StartIdx
if currentState.threadBackref == groupLength {
currentState.threadBackref = 0
copyThread(currentState.next, currentState)
currentStates = addStateToList(str, idx, currentStates, *currentState.next, currentState.threadGroups, nil, preferLongest)
} else {
idxInReferredGroup := currentState.threadGroups[currentState.referredGroup].StartIdx + currentState.threadBackref
if idxInReferredGroup < len(str) && idx < len(str) && str[idxInReferredGroup] == str[idx] {
currentState.threadBackref += 1
nextStates = append(nextStates, currentState)
}
} }
} }
// if currentState.groupBegin {
// currentState.threadGroups[currentState.groupNum].StartIdx = idx
// }
// if currentState.groupEnd {
// currentState.threadGroups[currentState.groupNum].EndIdx = idx
// }
// Alternation - enqueue left then right state, and continue
// if currentState.isAlternation {
// if currentState.isKleene { // Reverse order of adding things
// rightState := currentState.splitState
// copyThread(rightState, currentState)
// currentStates = slices.Insert(currentStates, currentStateIdx+1, *rightState)
// leftState := currentState.next
// copyThread(leftState, currentState)
// currentStates = slices.Insert(currentStates, currentStateIdx+2, *leftState)
// } else {
// leftState := currentState.next
// copyThread(leftState, currentState)
// currentStates = slices.Insert(currentStates, currentStateIdx+1, *leftState)
// rightState := currentState.splitState
// copyThread(rightState, currentState)
// currentStates = slices.Insert(currentStates, currentStateIdx+2, *rightState)
// }
// continue
// }
// Empty state - enqueue next state, do _not_ increment the SP
// if !currentState.isAlternation && currentState.isEmpty && currentState.assert == noneAssert { //&& currentState.groupBegin == false && currentState.groupEnd == false {
// isEmptyAndNoAssertion = true
// }
//
// if currentState.contentContains(str, idx) {
// foundMatch = true
// }
//
// if isEmptyAndNoAssertion || foundMatch {
// nextMatch := *(currentState.next)
// copyThread(&nextMatch, currentState)
// if currentState.groupBegin {
// // if !stateExists(currentStates, nextMatch) {
// currentStates = slices.Insert(currentStates, currentStateIdx+1, nextMatch)
// //}
// } else if currentState.groupEnd {
// if !stateExists(currentStates, nextMatch) {
// currentStates = slices.Insert(currentStates, currentStateIdx+1, nextMatch) // append(currentStates, nextMatch)
// }
// } else if currentState.assert != noneAssert {
// if !stateExists(currentStates, nextMatch) {
// currentStates = append(currentStates, nextMatch)
// }
// } else if currentState.isEmpty && !currentState.groupBegin && !currentState.groupEnd {
// if !stateExists(currentStates, nextMatch) {
// currentStates = append(currentStates, nextMatch)
// }
// } else {
// if !stateExists(nextStates, nextMatch) {
// nextStates = append(nextStates, nextMatch)
// }
// }
// }
//
// if currentState.isLast && len(nextStates) == 0 { // Last state reached
// currentState.threadGroups[0].EndIdx = idx
// if idx == currentState.threadGroups[0].StartIdx {
// idx += 1
// }
// return true, currentState.threadGroups, idx
// }
} }
currentStates = append([]nfaState{}, nextStates...) currentStates = append([]nfaState{}, nextStates...)
nextStates = nil nextStates = nil
@@ -452,196 +345,132 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
return true, match, match[0].EndIdx return true, match, match[0].EndIdx
} }
return false, []Group{}, i + 1 return false, []Group{}, i + 1
// zeroStates := make([]*nfaState, 0) }
// // Keep taking zero-states, until there are no more left to take
// // Objective: If any of our current states have transitions to 0-states, replace them with the 0-state. Do this until there are no more transitions to 0-states, or there are no more unique 0-states to take. // Expand appends template to dst, expanding any variables in template to the relevant capturing group.
// topStateItem := currentStates.peek() //
// topState := topStateItem.(*priorQueueItem).state // A variable is of the form '$n', where 'n' is a number. It will be replaced by the contents of the n-th capturing group.
// zeroStates, isZero := takeZeroState([]*nfaState{topState}, numGroups, i) // To insert a literal $, do not put a number after it. Alternatively, you can use $$.
// tempStates = append(tempStates, zeroStates...) // src is the input string, and match must be the result of [Reg.FindSubmatch].
// num_appended := 0 func (re Reg) Expand(dst string, template string, src string, match Match) string {
// for isZero == true { templateRuneSlc := []rune(template)
// zeroStates, isZero = takeZeroState(tempStates, numGroups, i) srcRuneSlc := []rune(src)
// tempStates, num_appended = uniqueAppend(tempStates, zeroStates...) i := 0
// if num_appended == 0 { // Break if we haven't appended any more unique values for i < len(templateRuneSlc) {
// break c := templateRuneSlc[i]
// } if c == '$' {
// } i += 1
// if isZero == true { // The dollar sign is the last character of the string, or it is followed by another dollar sign
// currentStates.Pop() if i >= len(templateRuneSlc) || templateRuneSlc[i] == '$' {
// } dst += "$"
// i++
// for _, state := range tempStates { } else {
// heap.Push(currentStates, newPriorQueueItem(state)) numStr := ""
// } for i < len(templateRuneSlc) && unicode.IsDigit(templateRuneSlc[i]) {
// tempStates = nil numStr += string(templateRuneSlc[i])
// i++
// // Take any transitions corresponding to current character }
// numStatesMatched := 0 // The number of states which had at least 1 match for this round if numStr == "" {
// assertionFailed := false // Whether or not an assertion failed for this round dst += "$"
// lastStateInList := false // Whether or not a last state was in our list of states } else {
// var lastStatePtr *nfaState = nil // Pointer to the last-state, if it was found num, _ := strconv.Atoi(numStr)
// lastLookaroundInList := false // Whether or not a last state (that is a lookaround) was in our list of states if num < len(match) {
// for numStatesMatched == 0 && lastStateInList == false { dst += string(srcRuneSlc[match[num].StartIdx:match[num].EndIdx])
// if currentStates.Len() == 0 { } else {
// break dst += "$" + numStr
// } }
// stateItem := heap.Pop(currentStates) }
// state := stateItem.(*priorQueueItem).state }
// matches, numMatches := state.matchesFor(str, i) } else {
// if numMatches > 0 { dst += string(c)
// numStatesMatched++ i++
// tempStates = append([]*nfaState(nil), matches...) }
// foundPath = true }
// for _, m := range matches { return dst
// if m.threadGroups == nil { }
// m.threadGroups = newMatch(numGroups + 1)
// } // LiteralPrefix returns a string that must begin any match of the given regular expression.
// m.threadSP = state.threadSP + 1 // The second return value is true if the string comprises the entire expression.
// copy(m.threadGroups, state.threadGroups) func (re Reg) LiteralPrefix() (prefix string, complete bool) {
// } state := re.start
// } if state.assert != noneAssert {
// if numMatches < 0 { state = state.next
// assertionFailed = true }
// } for !(state.isLast) && (!state.isAlternation) && len(state.content) == 1 && state.assert == noneAssert {
// if state.isLast { if state.groupBegin || state.groupEnd {
// if state.isLookaround() { state = state.next
// lastLookaroundInList = true continue
// } }
// lastStateInList = true prefix += string(rune(state.content[0]))
// lastStatePtr = state state = state.next
// } }
// } if state.isLast {
// complete = true
// if assertionFailed && numStatesMatched == 0 { // Nothing has matched and an assertion has failed } else {
// // If I'm being completely honest, I'm not sure why I have to check specifically for a _lookaround_ complete = false
// // state. The explanation below is my attempt to explain this behavior. }
// // If you replace 'lastLookaroundInList' with 'lastStateInList', one of the test cases fails. return prefix, complete
// // }
// // One of the states in our list was a last state and a lookaround. In this case, we
// // don't abort upon failure of the assertion, because we have found // ReplaceAll replaces all matches of the expression in src, with the text in repl. In repl, variables are interpreted
// // another path to a final state. // as they are in [Reg.Expand]. The resulting string is returned.
// // Even if the last state _was_ an assertion, we can use the previously func (re Reg) ReplaceAll(src string, repl string) string {
// // saved indices to find a match. matches := re.FindAllSubmatch(src)
// if lastLookaroundInList { i := 0
// break currentMatch := 0
// } else { dst := ""
// if i == startingFrom { for i < len(src) {
// i++ if currentMatch < len(matches) && matches[currentMatch][0].IsValid() && i == matches[currentMatch][0].StartIdx {
// } dst += re.Expand("", repl, src, matches[currentMatch])
// return false, []Group{}, i i = matches[currentMatch][0].EndIdx
// } currentMatch++
// } } else {
// // Check if we can find a state in our list that is: dst += string(src[i])
// // a. A last-state i++
// // b. Empty }
// // c. Doesn't assert anything }
// for _, stateItem := range *currentStates { return dst
// s := stateItem.state }
// if s.isLast && s.isEmpty && s.assert == noneAssert {
// lastStatePtr = s // ReplaceAllLiteral replaces all matches of the expression in src, with the text in repl. The text is replaced directly,
// lastStateInList = true // without any expansion.
// } func (re Reg) ReplaceAllLiteral(src string, repl string) string {
// } zerogroups := re.FindAll(src)
// if lastStateInList && numStatesMatched == 0 { // A last-state was in the list of states. add the matchIndex to our MatchIndex list currentMatch := 0
// for j := 1; j < numGroups+1; j++ { i := 0
// tempIndices[j] = lastStatePtr.threadGroups[j] dst := ""
// }
// endIdx = i for i < len(src) {
// tempIndices[0] = Group{startIdx, endIdx} if currentMatch < len(zerogroups) && i == zerogroups[currentMatch].StartIdx {
// if tempIndices[0].StartIdx == tempIndices[0].EndIdx { dst += repl
// return true, tempIndices, tempIndices[0].EndIdx + 1 i = zerogroups[currentMatch].EndIdx
// } else { currentMatch += 1
// return true, tempIndices, tempIndices[0].EndIdx } else {
// } dst += string(src[i])
// } i++
// }
// // Check if we can find a zero-length match }
// if foundPath == false { return dst
// currentStatesList := funcMap(*currentStates, func(item *priorQueueItem) *nfaState { }
// return item.state
// }) // ReplaceAllFunc replaces every match of the expression in src, with the return value of the function replFunc.
// if ok := zeroMatchPossible(str, i, numGroups, currentStatesList...); ok { // replFunc takes in the matched string. The return value is substituted in directly without expansion.
// if tempIndices[0].IsValid() == false { func (re Reg) ReplaceAllFunc(src string, replFunc func(string) string) string {
// tempIndices[0] = Group{startIdx, startIdx} zerogroups := re.FindAll(src)
// } currentMatch := 0
// } i := 0
// // If we haven't moved in the string, increment the counter by 1 dst := ""
// // to ensure we don't keep trying the same string over and over.
// // if i == startingFrom { for i < len(src) {
// startIdx++ if currentMatch < len(zerogroups) && i == zerogroups[currentMatch].StartIdx {
// // i++ dst += replFunc(src[zerogroups[currentMatch].StartIdx:zerogroups[currentMatch].EndIdx])
// // } i = zerogroups[currentMatch].EndIdx
// if tempIndices.numValidGroups() > 0 && tempIndices[0].IsValid() { currentMatch += 1
// if tempIndices[0].StartIdx == tempIndices[0].EndIdx { // If we have a zero-length match, we have to shift the index at which we start. Otherwise we keep looking at the same paert of the string over and over. } else {
// return true, tempIndices, tempIndices[0].EndIdx + 1 dst += string(src[i])
// } else { i++
// return true, tempIndices, tempIndices[0].EndIdx }
// } }
// } return dst
// return false, []Group{}, startIdx
// }
// currentStates = &priorityQueue{}
// slices.Reverse(tempStates)
// for _, state := range tempStates {
// heap.Push(currentStates, newPriorQueueItem(state))
// }
// tempStates = nil
//
// i++
// }
//
// // End-of-string reached. Go to any 0-states, until there are no more 0-states to go to. Then check if any of our states are in the end position.
// // This is the exact same algorithm used inside the loop, so I should probably put it in a function.
//
// if currentStates.Len() > 0 {
// topStateItem := currentStates.peek()
// topState := topStateItem.(*priorQueueItem).state
// zeroStates, isZero := takeZeroState([]*nfaState{topState}, numGroups, i)
// tempStates = append(tempStates, zeroStates...)
// num_appended := 0 // Number of unique states added to tempStates
// for isZero == true {
// zeroStates, isZero = takeZeroState(tempStates, numGroups, i)
// tempStates, num_appended = uniqueAppend(tempStates, zeroStates...)
// if num_appended == 0 { // Break if we haven't appended any more unique values
// break
// }
// }
// }
//
// for _, state := range tempStates {
// heap.Push(currentStates, newPriorQueueItem(state))
// }
//
// tempStates = nil
//
// for _, stateItem := range *currentStates {
// state := stateItem.state
// // Only add the match if the start index is in bounds. If the state has an assertion,
// // make sure the assertion checks out.
// if state.isLast && i <= len(str) {
// if state.assert == noneAssert || state.checkAssertion(str, i) {
// for j := 1; j < numGroups+1; j++ {
// tempIndices[j] = state.threadGroups[j]
// }
// endIdx = i
// tempIndices[0] = Group{startIdx, endIdx}
// }
// }
// }
//
// if tempIndices.numValidGroups() > 0 {
// if tempIndices[0].StartIdx == tempIndices[0].EndIdx { // If we have a zero-length match, we have to shift the index at which we start. Otherwise we keep looking at the same part of the string over and over.
// return true, tempIndices, tempIndices[0].EndIdx + 1
// } else {
// return true, tempIndices, tempIndices[0].EndIdx
// }
// }
//
// if startIdx == startingFrom { // Increment starting index if we haven't moved in the string. Prevents us from matching the same part of the string over and over.
//
// startIdx++
// }
//
// return false, []Group{}, startIdx
} }

View File

@@ -48,49 +48,6 @@ func isNormalChar(c rune) bool {
return !slices.Contains(specialChars, c) return !slices.Contains(specialChars, c)
} }
// uniqueAppend appends each element of items to slc, skipping any element that
// is already present in slc (including elements appended earlier in the same
// call). It returns the resulting slice and the number of elements that were
// actually appended.
func uniqueAppend[T comparable](slc []T, items ...T) ([]T, int) {
	added := 0
	for idx := range items {
		if slices.Contains(slc, items[idx]) {
			continue
		}
		slc = append(slc, items[idx])
		added++
	}
	return slc, added
}
// uniqueAppendFunc is the predicate-based counterpart of uniqueAppend. It
// returns a copy of slc with every element of items appended for which fn
// reports no match against any element already in the result. fn is called as
// fn(item, existing) and should return true when the two are considered equal.
//
// Candidates are compared against the growing result rather than only against
// the original slc, so an item that occurs more than once in items is appended
// at most once. The input slice is never modified.
//
// The second return value is the number of elements that were appended.
func uniqueAppendFunc[T any](slc []T, fn func(T, T) bool, items ...T) ([]T, int) {
	// Copy up front so that appends never write into slc's backing array.
	toRet := make([]T, len(slc), len(slc)+len(items))
	copy(toRet, slc)

	numAppended := 0
	for _, item := range items {
		alreadyPresent := slices.ContainsFunc(toRet, func(existing T) bool {
			return fn(item, existing)
		})
		if alreadyPresent {
			continue
		}
		toRet = append(toRet, item)
		numAppended++
	}
	return toRet, numAppended
}
// allEqual reports whether all of the given elements are equal to each other.
// It returns true when called with no elements (there is nothing that could
// differ) instead of panicking on the empty argument list.
func allEqual[T comparable](items ...T) bool {
	if len(items) == 0 {
		return true
	}
	first := items[0]
	for _, item := range items {
		if item != first {
			return false
		}
	}
	return true
}
// Map function - convert a slice of T to a slice of V, based on a function // Map function - convert a slice of T to a slice of V, based on a function
// that maps a T to a V // that maps a T to a V
func funcMap[T, V any](slc []T, fn func(T) V) []V { func funcMap[T, V any](slc []T, fn func(T) V) []V {

View File

@@ -45,8 +45,10 @@ type nfaState struct {
groupEnd bool // Whether or not the node ends a capturing group groupEnd bool // Whether or not the node ends a capturing group
groupNum int // Which capturing group the node starts / ends groupNum int // Which capturing group the node starts / ends
// The following properties depend on the current match - I should think about resetting them for every match. // The following properties depend on the current match - I should think about resetting them for every match.
zeroMatchFound bool // Whether or not the state has been used for a zero-length match - only relevant for zero states
threadGroups []Group // Assuming that a state is part of a 'thread' in the matching process, this array stores the indices of capturing groups in the current thread. As matches are found for this state, its groups will be copied over. threadGroups []Group // Assuming that a state is part of a 'thread' in the matching process, this array stores the indices of capturing groups in the current thread. As matches are found for this state, its groups will be copied over.
isBackreference bool // Whether or not current node is backreference
referredGroup int // If current node is a backreference, the node that it points to
threadBackref int // If current node is a backreference, how many characters to look forward into the referred group
} }
// Clones the NFA starting from the given state. // Clones the NFA starting from the given state.
@@ -76,7 +78,6 @@ func cloneStateHelper(stateToClone *nfaState, cloneMap map[*nfaState]*nfaState)
isQuestion: stateToClone.isQuestion, isQuestion: stateToClone.isQuestion,
isAlternation: stateToClone.isAlternation, isAlternation: stateToClone.isAlternation,
assert: stateToClone.assert, assert: stateToClone.assert,
zeroMatchFound: stateToClone.zeroMatchFound,
allChars: stateToClone.allChars, allChars: stateToClone.allChars,
except: append([]rune{}, stateToClone.except...), except: append([]rune{}, stateToClone.except...),
lookaroundRegex: stateToClone.lookaroundRegex, lookaroundRegex: stateToClone.lookaroundRegex,
@@ -122,6 +123,7 @@ func resetThreadsHelper(state *nfaState, visitedMap map[*nfaState]bool) {
} }
// Assuming it hasn't been visited // Assuming it hasn't been visited
state.threadGroups = nil state.threadGroups = nil
state.threadBackref = 0
visitedMap[state] = true visitedMap[state] = true
if state.isAlternation { if state.isAlternation {
resetThreadsHelper(state.next, visitedMap) resetThreadsHelper(state.next, visitedMap)
@@ -133,7 +135,7 @@ func resetThreadsHelper(state *nfaState, visitedMap map[*nfaState]bool) {
// Checks if the given state's assertion is true. Returns true if the given // Checks if the given state's assertion is true. Returns true if the given
// state doesn't have an assertion. // state doesn't have an assertion.
func (s nfaState) checkAssertion(str []rune, idx int) bool { func (s nfaState) checkAssertion(str []rune, idx int, preferLongest bool) bool {
if s.assert == alwaysTrueAssert { if s.assert == alwaysTrueAssert {
return true return true
} }
@@ -183,7 +185,7 @@ func (s nfaState) checkAssertion(str []rune, idx int) bool {
strToMatch = string(runesToMatch) strToMatch = string(runesToMatch)
} }
regComp := Reg{startState, s.lookaroundNumCaptureGroups} regComp := Reg{startState, s.lookaroundNumCaptureGroups, s.lookaroundRegex, preferLongest}
matchIndices := regComp.FindAll(strToMatch) matchIndices := regComp.FindAll(strToMatch)
numMatchesFound := 0 numMatchesFound := 0
@@ -210,9 +212,9 @@ func (s nfaState) checkAssertion(str []rune, idx int) bool {
} }
// Returns true if the contents of 's' contain the value at the given index of the given string // Returns true if the contents of 's' contain the value at the given index of the given string
func (s nfaState) contentContains(str []rune, idx int) bool { func (s nfaState) contentContains(str []rune, idx int, preferLongest bool) bool {
if s.assert != noneAssert { if s.assert != noneAssert {
return s.checkAssertion(str, idx) return s.checkAssertion(str, idx, preferLongest)
} }
if idx >= len(str) { if idx >= len(str) {
return false return false
@@ -428,7 +430,8 @@ func (s nfaState) equals(other nfaState) bool {
s.groupBegin == other.groupBegin && s.groupBegin == other.groupBegin &&
s.groupEnd == other.groupEnd && s.groupEnd == other.groupEnd &&
s.groupNum == other.groupNum && s.groupNum == other.groupNum &&
slices.Equal(s.threadGroups, other.threadGroups) slices.Equal(s.threadGroups, other.threadGroups) &&
s.threadBackref == other.threadBackref
} }
func stateExists(list []nfaState, s nfaState) bool { func stateExists(list []nfaState, s nfaState) bool {

View File

@@ -1,6 +1,8 @@
package regex package regex
import "fmt" import (
"fmt"
)
type nodeType int type nodeType int
@@ -20,6 +22,7 @@ const (
assertionNode assertionNode
lparenNode lparenNode
rparenNode rparenNode
backreferenceNode
) )
// Helper constants for lookarounds // Helper constants for lookarounds
@@ -40,6 +43,7 @@ type postfixNode struct {
lookaroundSign int // ONLY USED WHEN nodetype == ASSERTION. Whether we have a positive or negative lookaround. lookaroundSign int // ONLY USED WHEN nodetype == ASSERTION. Whether we have a positive or negative lookaround.
lookaroundDir int // Lookbehind or lookahead lookaroundDir int // Lookbehind or lookahead
nodeContents []postfixNode // ONLY USED WHEN nodetype == CHARCLASS. Holds all the nodes inside the given CHARCLASS node. nodeContents []postfixNode // ONLY USED WHEN nodetype == CHARCLASS. Holds all the nodes inside the given CHARCLASS node.
referencedGroup int // ONLY USED WHEN nodetype == backreferenceNode. Holds the group which this one refers to. After parsing is done, the expression will be rewritten eg. (a)\1 will become (a)(a). So the return value of ShuntingYard() shouldn't contain a backreferenceNode.
} }
// Converts the given list of postfixNodes to one node of type CHARCLASS. // Converts the given list of postfixNodes to one node of type CHARCLASS.
@@ -208,3 +212,44 @@ func newPostfixCharNode(contents ...rune) postfixNode {
toReturn.contents = append(toReturn.contents, contents...) toReturn.contents = append(toReturn.contents, contents...)
return toReturn return toReturn
} }
// newPostfixBackreferenceNode builds a postfixNode of type backreferenceNode
// whose referencedGroup is the given group number. startReps and endReps are
// both set to 1; every other field keeps its zero value.
func newPostfixBackreferenceNode(referred int) postfixNode {
	return postfixNode{
		nodetype:        backreferenceNode,
		startReps:       1,
		endReps:         1,
		referencedGroup: referred,
	}
}
// rewriteBackreferences rewrites any backreferences in the given postfixNode slice, into their respective groups.
// It stores the relation in a map, and returns it as the second return value.
// It uses parenIndices to determine where a group starts and ends in nodes.
// For example, \1(a) will be rewritten into (a)(a), and 1 -> 2 will be the hashmap value.
// It returns an error if a backreference points to an invalid group.
// func rewriteBackreferences(nodes []postfixNode, parenIndices []Group) ([]postfixNode, map[int]int, error) {
// rtv := make([]postfixNode, 0)
// referMap := make(map[int]int)
// numGroups := 0
// groupIncrement := 0 // If we have a backreference before the group its referring to, then the group its referring to will have its group number incremented.
// for i, node := range nodes {
// if node.nodetype == backreferenceNode {
// if node.referencedGroup >= len(parenIndices) {
// return nil, nil, fmt.Errorf("invalid backreference")
// }
// rtv = slices.Concat(rtv, nodes[parenIndices[node.referencedGroup].StartIdx:parenIndices[node.referencedGroup].EndIdx+1]) // Add all the nodes in the group to rtv
// numGroups += 1
// if i < parenIndices[node.referencedGroup].StartIdx {
// groupIncrement += 1
// }
// referMap[numGroups] = node.referencedGroup + groupIncrement
// } else {
// rtv = append(rtv, node)
// if node.nodetype == lparenNode {
// numGroups += 1
// }
// }
// }
// return rtv, referMap, nil
// }

View File

@@ -1,89 +0,0 @@
package regex
import "container/heap"
// Implement a priority queue using container/heap
// Priority levels used to order items in the priority queue. The constants are
// consecutive ints generated by iota, starting at min_priority (0) and ending
// at max_priority; a larger value means a higher priority.
const (
min_priority int = iota
zerostate_priority
alternation_priority
kleene_priority
char_priority
max_priority
)
// getPriority returns the queue priority for the given NFA state. The checks
// are made in this order:
//   - a Kleene state gets zerostate_priority,
//   - an alternation state gets alternation_priority,
//   - an empty state gets zerostate_priority,
//   - anything else gets char_priority.
func getPriority(state *nfaState) int {
	switch {
	case state.isKleene:
		return zerostate_priority
	case state.isAlternation:
		return alternation_priority
	case state.isEmpty:
		return zerostate_priority
	default:
		return char_priority
	}
}
// priorQueueItem is a single entry of a priorityQueue: an NFA state together
// with the bookkeeping needed to order it inside the queue.
type priorQueueItem struct {
state *nfaState // The NFA state carried by this entry
priority int // Ordering key; newPriorQueueItem derives it with getPriority
index int // Position of the entry in the queue's slice (kept up to date by Push and Swap); -1 when the entry is not in a queue
}
// newPriorQueueItem wraps the given state in a queue entry. The entry's
// priority is computed with getPriority, and its index is -1 because it has
// not been pushed onto a queue yet.
func newPriorQueueItem(state *nfaState) *priorQueueItem {
	item := new(priorQueueItem)
	item.state = state
	item.priority = getPriority(state)
	item.index = -1
	return item
}
// priorityQueue is a slice of queue entries that provides the Len, Less, Swap,
// Push and Pop methods required by container/heap's heap.Interface, plus the
// helpers peek and update.
type priorityQueue []*priorQueueItem
// Len returns the number of entries in the queue.
func (pq priorityQueue) Len() int {
return len(pq)
}
// Less orders entries by descending priority. Entries with the same priority
// are ordered by ascending index.
func (pq priorityQueue) Less(i, j int) bool {
if pq[i].priority == pq[j].priority {
return pq[i].index < pq[j].index
}
return pq[i].priority > pq[j].priority // We want max-heap, so we use greater-than
}
// Swap exchanges the entries at positions i and j, and updates their index
// fields to match their new positions.
func (pq priorityQueue) Swap(i, j int) {
pq[i], pq[j] = pq[j], pq[i]
pq[i].index = i
pq[j].index = j
}
// Push appends x (which must be a *priorQueueItem, otherwise the type
// assertion panics) to the end of the slice and records its position in the
// entry's index field.
func (pq *priorityQueue) Push(x any) {
length := len(*pq)
item := x.(*priorQueueItem)
item.index = length
*pq = append(*pq, item)
}
// Pop removes the last entry of the slice and returns it. The vacated slot is
// set to nil and the entry's index is reset to -1.
func (pq *priorityQueue) Pop() any {
old := *pq
n := len(old)
item := old[n-1]
old[n-1] = nil
item.index = -1
*pq = old[0 : n-1]
return item
}
// peek returns the last entry of the underlying slice without removing it.
// It panics with an index-out-of-range error if the queue is empty.
// NOTE(review): container/heap keeps the first element according to Less at
// index 0, so the last slice element is not necessarily the highest-priority
// entry - confirm that this is what callers expect.
func (pq *priorityQueue) peek() any {
queue := *pq
n := len(queue)
return queue[n-1]
}
// update replaces the state and priority of the given entry, then calls
// heap.Fix so that the heap ordering is re-established for the entry at
// item.index.
func (pq *priorityQueue) update(item *priorQueueItem, value *nfaState, priority int) {
item.state = value
item.priority = priority
heap.Fix(pq, item.index)
}

View File

@@ -109,7 +109,7 @@ func range2regex(start int, end int) (string, error) {
startSlc := intToSlc(rg.start) startSlc := intToSlc(rg.start)
endSlc := intToSlc(rg.end) endSlc := intToSlc(rg.end)
if len(startSlc) != len(endSlc) { if len(startSlc) != len(endSlc) {
return "", fmt.Errorf("Error parsing numeric range") return "", fmt.Errorf("error parsing numeric range")
} }
for i := range startSlc { for i := range startSlc {
if startSlc[i] == endSlc[i] { if startSlc[i] == endSlc[i] {

View File

@@ -25,7 +25,9 @@ var reTests = []struct {
{"a*b", nil, "qwqw", []Group{}}, {"a*b", nil, "qwqw", []Group{}},
{"(abc)*", nil, "abcabcabc", []Group{{0, 9}, {9, 9}}}, {"(abc)*", nil, "abcabcabc", []Group{{0, 9}, {9, 9}}},
{"((abc)|(def))*", nil, "abcdef", []Group{{0, 6}, {6, 6}}}, {"((abc)|(def))*", nil, "abcdef", []Group{{0, 6}, {6, 6}}},
{"(abc)*|(def)*", nil, "abcdef", []Group{{0, 3}, {3, 6}, {6, 6}}}, // This match will only happen with Longest()
// {"(abc)*|(def)*", nil, "abcdef", []Group{{0, 3}, {3, 6}, {6, 6}}},
{"(abc)*|(def)*", nil, "abcdef", []Group{{0, 3}, {3, 3}, {4, 4}, {5, 5}, {6, 6}}},
{"b*a*a", nil, "bba", []Group{{0, 3}}}, {"b*a*a", nil, "bba", []Group{{0, 3}}},
{"(ab)+", nil, "abcabddd", []Group{{0, 2}, {3, 5}}}, {"(ab)+", nil, "abcabddd", []Group{{0, 2}, {3, 5}}},
{"a(b(c|d)*)*", nil, "abccbd", []Group{{0, 6}}}, {"a(b(c|d)*)*", nil, "abccbd", []Group{{0, 6}}},
@@ -177,7 +179,7 @@ var reTests = []struct {
{"[[:graph:]]+", nil, "abcdefghijklmnopqrstuvwyxzABCDEFGHIJKLMNOPRQSTUVWXYZ0123456789!@#$%^&*", []Group{{0, 70}}}, {"[[:graph:]]+", nil, "abcdefghijklmnopqrstuvwyxzABCDEFGHIJKLMNOPRQSTUVWXYZ0123456789!@#$%^&*", []Group{{0, 70}}},
// Test cases from Python's RE test suite // Test cases from Python's RE test suite
{`[\1]`, nil, "\x01", []Group{{0, 1}}}, {`[\01]`, nil, "\x01", []Group{{0, 1}}},
{`\0`, nil, "\x00", []Group{{0, 1}}}, {`\0`, nil, "\x00", []Group{{0, 1}}},
{`[\0a]`, nil, "\x00", []Group{{0, 1}}}, {`[\0a]`, nil, "\x00", []Group{{0, 1}}},
@@ -192,7 +194,7 @@ var reTests = []struct {
{`\x00ffffffffffffff`, nil, "\xff", []Group{}}, {`\x00ffffffffffffff`, nil, "\xff", []Group{}},
{`\x00f`, nil, "\x0f", []Group{}}, {`\x00f`, nil, "\x0f", []Group{}},
{`\x00fe`, nil, "\xfe", []Group{}}, {`\x00fe`, nil, "\xfe", []Group{}},
{`^\w+=(\\[\000-\277]|[^\n\\])*`, nil, "SRC=eval.c g.c blah blah blah \\\\\n\tapes.c", []Group{{0, 32}}}, {`^\w+=(\\[\000-\0277]|[^\n\\])*`, nil, "SRC=eval.c g.c blah blah blah \\\\\n\tapes.c", []Group{{0, 32}}},
{`a.b`, nil, `acb`, []Group{{0, 3}}}, {`a.b`, nil, `acb`, []Group{{0, 3}}},
{`a.b`, nil, "a\nb", []Group{}}, {`a.b`, nil, "a\nb", []Group{}},
@@ -310,11 +312,7 @@ var reTests = []struct {
{`a[-]?c`, nil, `ac`, []Group{{0, 2}}}, {`a[-]?c`, nil, `ac`, []Group{{0, 2}}},
{`^(.+)?B`, nil, `AB`, []Group{{0, 2}}}, {`^(.+)?B`, nil, `AB`, []Group{{0, 2}}},
{`\0009`, nil, "\x009", []Group{{0, 2}}}, {`\0009`, nil, "\x009", []Group{{0, 2}}},
{`\141`, nil, "a", []Group{{0, 1}}}, {`\0141`, nil, "a", []Group{{0, 1}}},
// At this point, the python test suite has a bunch
// of backreference tests. Since my engine doesn't
// implement backreferences, I've skipped those tests.
{`*a`, nil, ``, nil}, {`*a`, nil, ``, nil},
{`(*)b`, nil, ``, nil}, {`(*)b`, nil, ``, nil},
@@ -431,7 +429,8 @@ var reTests = []struct {
{`a[-]?c`, []ReFlag{RE_CASE_INSENSITIVE}, `AC`, []Group{{0, 2}}}, {`a[-]?c`, []ReFlag{RE_CASE_INSENSITIVE}, `AC`, []Group{{0, 2}}},
{`^(.+)?B`, []ReFlag{RE_CASE_INSENSITIVE}, `ab`, []Group{{0, 2}}}, {`^(.+)?B`, []ReFlag{RE_CASE_INSENSITIVE}, `ab`, []Group{{0, 2}}},
{`\0009`, []ReFlag{RE_CASE_INSENSITIVE}, "\x009", []Group{{0, 2}}}, {`\0009`, []ReFlag{RE_CASE_INSENSITIVE}, "\x009", []Group{{0, 2}}},
{`\141`, []ReFlag{RE_CASE_INSENSITIVE}, "A", []Group{{0, 1}}}, {`\0141`, []ReFlag{RE_CASE_INSENSITIVE}, "A", []Group{{0, 1}}},
{`\0141\0141`, []ReFlag{RE_CASE_INSENSITIVE}, "AA", []Group{{0, 2}}},
{`a[-]?c`, []ReFlag{RE_CASE_INSENSITIVE}, `AC`, []Group{{0, 2}}}, {`a[-]?c`, []ReFlag{RE_CASE_INSENSITIVE}, `AC`, []Group{{0, 2}}},
@@ -462,8 +461,10 @@ var reTests = []struct {
{`[\D5]+`, nil, `1234abc5678`, []Group{{4, 8}}}, {`[\D5]+`, nil, `1234abc5678`, []Group{{4, 8}}},
{`[\da-fA-F]+`, nil, `123abc`, []Group{{0, 6}}}, {`[\da-fA-F]+`, nil, `123abc`, []Group{{0, 6}}},
{`\xff`, nil, "\u00ff", []Group{{0, 1}}}, {`\xff`, nil, "\u00ff", []Group{{0, 1}}},
{`\xff+`, nil, "\u00ff\u00ff", []Group{{0, 2}}},
{`\xFF`, nil, "\u00ff", []Group{{0, 1}}}, {`\xFF`, nil, "\u00ff", []Group{{0, 1}}},
{`\x00ff`, nil, "\u00ff", []Group{}}, {`\x00ff`, nil, "\u00ff", []Group{}},
{`\x{0000ff}+`, nil, "\u00ff\u00ff", []Group{{0, 2}}},
{`\x{0000ff}`, nil, "\u00ff", []Group{{0, 1}}}, {`\x{0000ff}`, nil, "\u00ff", []Group{{0, 1}}},
{`\x{0000FF}`, nil, "\u00ff", []Group{{0, 1}}}, {`\x{0000FF}`, nil, "\u00ff", []Group{{0, 1}}},
{"\t\n\v\r\f\a", nil, "\t\n\v\r\f\a", []Group{{0, 6}}}, {"\t\n\v\r\f\a", nil, "\t\n\v\r\f\a", []Group{{0, 6}}},
@@ -471,7 +472,7 @@ var reTests = []struct {
{`[\t][\n][\v][\r][\f][\b]`, nil, "\t\n\v\r\f\b", []Group{{0, 6}}}, {`[\t][\n][\v][\r][\f][\b]`, nil, "\t\n\v\r\f\b", []Group{{0, 6}}},
{`.*d`, nil, "abc\nabd", []Group{{4, 7}}}, {`.*d`, nil, "abc\nabd", []Group{{4, 7}}},
{`(`, nil, "-", nil}, {`(`, nil, "-", nil},
{`[\41]`, nil, `!`, []Group{{0, 1}}}, {`[\041]`, nil, `!`, []Group{{0, 1}}},
{`(?<!abc)(d.f)`, nil, `abcdefdof`, []Group{{6, 9}}}, {`(?<!abc)(d.f)`, nil, `abcdefdof`, []Group{{6, 9}}},
{`[\w-]+`, nil, `laser_beam`, []Group{{0, 10}}}, {`[\w-]+`, nil, `laser_beam`, []Group{{0, 10}}},
{`M+`, []ReFlag{RE_CASE_INSENSITIVE}, `MMM`, []Group{{0, 3}}}, {`M+`, []ReFlag{RE_CASE_INSENSITIVE}, `MMM`, []Group{{0, 3}}},
@@ -518,6 +519,14 @@ var reTests = []struct {
{`<389-400`, nil, `-`, nil}, {`<389-400`, nil, `-`, nil},
{`<389-400>`, nil, `391`, []Group{{0, 3}}}, {`<389-400>`, nil, `391`, []Group{{0, 3}}},
{`\b<1-10000>\b`, nil, `America declared independence in 1776.`, []Group{{33, 37}}}, {`\b<1-10000>\b`, nil, `America declared independence in 1776.`, []Group{{33, 37}}},
{`\p{Tamil}+`, nil, `உயிரெழுத்து`, []Group{{0, 11}}}, // Each letter and matra is counted as a separate rune, so 'u', 'ya', 'e (matra), 'ra', 'e (matra)', 'zha', (oo (matra), 'tha', 'ith', 'tha', 'oo (matra)'.
{`\P{Tamil}+`, nil, `vowel=உயிரெழுத்து`, []Group{{0, 6}}},
{`\P`, nil, `உயிரெழுத்து`, nil},
{`\PM\pM*`, nil, `உயிரெழுத்து`, []Group{{0, 1}, {1, 3}, {3, 5}, {5, 7}, {7, 9}, {9, 11}}},
{`\pN+`, nil, `123abc456def`, []Group{{0, 3}, {6, 9}}},
{`\PN+`, nil, `123abc456def`, []Group{{3, 6}, {9, 12}}},
{`[\p{Greek}\p{Cyrillic}]`, nil, `ΣωШД`, []Group{{0, 1}, {1, 2}, {2, 3}, {3, 4}}},
} }
var groupTests = []struct { var groupTests = []struct {
@@ -528,7 +537,7 @@ var groupTests = []struct {
}{ }{
{"(a)(b)", nil, "ab", []Match{[]Group{{0, 2}, {0, 1}, {1, 2}}}}, {"(a)(b)", nil, "ab", []Match{[]Group{{0, 2}, {0, 1}, {1, 2}}}},
{"((a))(b)", nil, "ab", []Match{[]Group{{0, 2}, {0, 1}, {0, 1}, {1, 2}}}}, {"((a))(b)", nil, "ab", []Match{[]Group{{0, 2}, {0, 1}, {0, 1}, {1, 2}}}},
{"(0)", nil, "ab", []Match{[]Group{}}}, {"(0)", nil, "ab", []Match{}},
{"(a)b", nil, "ab", []Match{[]Group{{0, 2}, {0, 1}}}}, {"(a)b", nil, "ab", []Match{[]Group{{0, 2}, {0, 1}}}},
{"a(b)", nil, "ab", []Match{[]Group{{0, 2}, {1, 2}}}}, {"a(b)", nil, "ab", []Match{[]Group{{0, 2}, {1, 2}}}},
{"(a|b)", nil, "ab", []Match{[]Group{{0, 1}, {0, 1}}, []Group{{1, 2}, {1, 2}}}}, {"(a|b)", nil, "ab", []Match{[]Group{{0, 1}, {0, 1}}, []Group{{1, 2}, {1, 2}}}},
@@ -537,10 +546,11 @@ var groupTests = []struct {
{"(a+)|(a)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 4}, {-1, -1}}}}, {"(a+)|(a)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 4}, {-1, -1}}}},
{"(a+)(aa)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 2}, {2, 4}}}}, {"(a+)(aa)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 2}, {2, 4}}}},
{"(aaaa)|(aaaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 4}, {-1, -1}}}}, {"(aaaa)|(aaaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 4}, {-1, -1}}}},
{"(aaa)|(aaaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {-1, -1}, {0, 4}}}}, // This match will only happen with Longest()
{"(aaa)|(aaaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {-1, -1}, {0, 4}}}}, // {"(aaa)|(aaaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {-1, -1}, {0, 4}}}},
{"(aaa)|(aaaa)", nil, "aaaa", []Match{[]Group{{0, 3}, {0, 3}, {-1, -1}}}},
{"(aaaa)|(aaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 4}, {-1, -1}}}}, {"(aaaa)|(aaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 4}, {-1, -1}}}},
{"(a)|(aa)", nil, "aa", []Match{[]Group{{0, 2}, {-1, -1}, {0, 2}}}}, {"(a)|(aa)", nil, "aa", []Match{[]Group{{0, 1}, {0, 1}}, []Group{{1, 2}, {1, 2}}}},
{"(a?)a?", nil, "b", []Match{[]Group{{0, 0}, {0, 0}}, []Group{{1, 1}, {1, 1}}}}, {"(a?)a?", nil, "b", []Match{[]Group{{0, 0}, {0, 0}}, []Group{{1, 1}, {1, 1}}}},
{"(a?)a?", nil, "ab", []Match{[]Group{{0, 1}, {0, 1}}, []Group{{1, 1}, {1, 1}}, []Group{{2, 2}, {2, 2}}}}, {"(a?)a?", nil, "ab", []Match{[]Group{{0, 1}, {0, 1}}, []Group{{1, 1}, {1, 1}}, []Group{{2, 2}, {2, 2}}}},
{"(a?)a?", nil, "aa", []Match{[]Group{{0, 2}, {0, 1}}, []Group{{2, 2}, {2, 2}}}}, {"(a?)a?", nil, "aa", []Match{[]Group{{0, 2}, {0, 1}}, []Group{{2, 2}, {2, 2}}}},
@@ -578,13 +588,37 @@ var groupTests = []struct {
{`(bc+d$|ef*g.|h?i(j|k))`, nil, `bcdd`, []Match{}}, {`(bc+d$|ef*g.|h?i(j|k))`, nil, `bcdd`, []Match{}},
{`(bc+d$|ef*g.|h?i(j|k))`, nil, `reffgz`, []Match{[]Group{{1, 6}, {1, 6}}}}, {`(bc+d$|ef*g.|h?i(j|k))`, nil, `reffgz`, []Match{[]Group{{1, 6}, {1, 6}}}},
{`(((((((((a)))))))))`, nil, `a`, []Match{[]Group{{0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}}, {`(((((((((a)))))))))`, nil, `a`, []Match{[]Group{{0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}},
{`(((((((((a)))))))))\41`, nil, `a`, []Match{[]Group{{0, 2}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}}, {`(((((((((a)))))))))\041`, nil, `a!`, []Match{[]Group{{0, 2}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}},
{`(.*)c(.*)`, nil, `abcde`, []Match{[]Group{{0, 5}, {0, 2}, {3, 5}}}}, {`(.*)c(.*)`, nil, `abcde`, []Match{[]Group{{0, 5}, {0, 2}, {3, 5}}}},
{`\((.*), (.*)\)`, nil, `(a, b)`, []Match{[]Group{{0, 6}, {1, 2}, {4, 5}}}}, {`\((.*), (.*)\)`, nil, `(a, b)`, []Match{[]Group{{0, 6}, {1, 2}, {4, 5}}}},
// At this point, the python test suite has a bunch // Backreference tests
// of backreference tests. Since my engine doesn't {`(abc)\1`, nil, `abcabc`, []Match{[]Group{{0, 6}, {0, 3}}}},
// implement backreferences, I've skipped those tests. {`([a-c]+)\1`, nil, `abcabc`, []Match{[]Group{{0, 6}, {0, 3}}}},
{`([a-c]*)\1`, nil, `abcabc`, []Match{[]Group{{0, 6}, {0, 3}}, []Group{{6, 6}, {6, 6}}}},
{`^(.+)?B`, nil, `AB`, []Match{[]Group{{0, 2}, {0, 1}}}},
{`(a+).\1$`, nil, `aaaaa`, []Match{[]Group{{0, 5}, {0, 2}}}},
{`^(a+).\1$`, nil, `aaaa`, []Match{}},
{`(a)\1`, nil, `aa`, []Match{[]Group{{0, 2}, {0, 1}}}},
{`(a+)\1`, nil, `aa`, []Match{[]Group{{0, 2}, {0, 1}}}},
{`(a+)+\1`, nil, `aa`, []Match{[]Group{{0, 2}, {0, 1}}}},
{`(a).+\1`, nil, `aba`, []Match{[]Group{{0, 3}, {0, 1}}}},
{`(a)ba*\1`, nil, `aba`, []Match{[]Group{{0, 3}, {0, 1}}}},
{`(aa|a)a\1$`, nil, `aaa`, []Match{[]Group{{0, 3}, {0, 1}}}},
{`(a|aa)a\1$`, nil, `aaa`, []Match{[]Group{{0, 3}, {0, 1}}}},
{`(a+)a\1$`, nil, `aaa`, []Match{[]Group{{0, 3}, {0, 1}}}},
{`([abc]*)\1`, nil, `abcabc`, []Match{[]Group{{0, 6}, {0, 3}}, []Group{{6, 6}, {6, 6}}}},
{`(a)(?:b)\1`, nil, `aba`, []Match{[]Group{{0, 3}, {0, 1}}}},
{`(a)(?:b)\1`, nil, `abb`, []Match{}},
{`(?:a)(b)\1`, nil, `aba`, []Match{}},
{`(?:a)(b)\1`, nil, `abb`, []Match{[]Group{{0, 3}, {1, 2}}}},
{`(?:(cat)|(dog))\2`, nil, `catdog`, []Match{}},
{`(?:a)\1`, nil, `aa`, nil},
{`((cat)|(dog)|(cow)|(bat))\4`, nil, `cowcow`, []Match{[]Group{{0, 6}, {0, 3}, {-1, -1}, {-1, -1}, {0, 3}, {-1, -1}}}},
{`(a|b)*\1`, nil, `abb`, []Match{[]Group{{0, 3}, {1, 2}}}},
{`(a|b)*\1`, nil, `aba`, []Match{}},
{`(a|b)*\1`, nil, `bab`, []Match{}},
{`(a|b)*\1`, nil, `baa`, []Match{[]Group{{0, 3}, {1, 2}}}},
{`(a)(b)c|ab`, nil, `ab`, []Match{[]Group{{0, 2}}}}, {`(a)(b)c|ab`, nil, `ab`, []Match{[]Group{{0, 2}}}},
{`(a)+x`, nil, `aaax`, []Match{[]Group{{0, 4}, {2, 3}}}}, {`(a)+x`, nil, `aaax`, []Match{[]Group{{0, 4}, {2, 3}}}},
@@ -633,7 +667,7 @@ var groupTests = []struct {
{`(bc+d$|ef*g.|h?i(j|k))`, []ReFlag{RE_CASE_INSENSITIVE}, `BCDD`, []Match{}}, {`(bc+d$|ef*g.|h?i(j|k))`, []ReFlag{RE_CASE_INSENSITIVE}, `BCDD`, []Match{}},
{`(bc+d$|ef*g.|h?i(j|k))`, []ReFlag{RE_CASE_INSENSITIVE}, `reffgz`, []Match{[]Group{{1, 6}, {1, 6}}}}, {`(bc+d$|ef*g.|h?i(j|k))`, []ReFlag{RE_CASE_INSENSITIVE}, `reffgz`, []Match{[]Group{{1, 6}, {1, 6}}}},
{`(((((((((a)))))))))`, []ReFlag{RE_CASE_INSENSITIVE}, `A`, []Match{[]Group{{0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}}, {`(((((((((a)))))))))`, []ReFlag{RE_CASE_INSENSITIVE}, `A`, []Match{[]Group{{0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}},
{`(((((((((a)))))))))\41`, []ReFlag{RE_CASE_INSENSITIVE}, `A`, []Match{[]Group{{0, 2}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}}, {`(((((((((a)))))))))\041`, []ReFlag{RE_CASE_INSENSITIVE}, `A!`, []Match{[]Group{{0, 2}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}},
{`(.*)c(.*)`, []ReFlag{RE_CASE_INSENSITIVE}, `ABCDE`, []Match{[]Group{{0, 5}, {0, 2}, {3, 5}}}}, {`(.*)c(.*)`, []ReFlag{RE_CASE_INSENSITIVE}, `ABCDE`, []Match{[]Group{{0, 5}, {0, 2}, {3, 5}}}},
{`\((.*), (.*)\)`, []ReFlag{RE_CASE_INSENSITIVE}, `(A, B)`, []Match{[]Group{{0, 6}, {1, 2}, {4, 5}}}}, {`\((.*), (.*)\)`, []ReFlag{RE_CASE_INSENSITIVE}, `(A, B)`, []Match{[]Group{{0, 6}, {1, 2}, {4, 5}}}},
{`(a)(b)c|ab`, []ReFlag{RE_CASE_INSENSITIVE}, `AB`, []Match{[]Group{{0, 2}}}}, {`(a)(b)c|ab`, []ReFlag{RE_CASE_INSENSITIVE}, `AB`, []Match{[]Group{{0, 2}}}},
@@ -743,7 +777,7 @@ func TestFindString(t *testing.T) {
foundString := regComp.FindString(test.str) foundString := regComp.FindString(test.str)
if len(test.result) == 0 { if len(test.result) == 0 {
if foundString != "" { if foundString != "" {
t.Errorf("Expected no match got %v\n", foundString) t.Errorf("Wanted no match got %v\n", foundString)
} }
} else { } else {
expectedString := test.str[test.result[0].StartIdx:test.result[0].EndIdx] expectedString := test.str[test.result[0].StartIdx:test.result[0].EndIdx]
@@ -789,18 +823,132 @@ func TestFindSubmatch(t *testing.T) {
if test.result != nil { if test.result != nil {
panic(err) panic(err)
} }
} } else {
match, err := regComp.FindSubmatch(test.str) match, err := regComp.FindSubmatch(test.str)
if err != nil {
if len(test.result) != 0 {
t.Errorf("Wanted %v got no match\n", test.result[0])
}
} else if len(test.result) == 0 {
t.Errorf("Wanted no match got %v\n", match)
}
for i := range match { for i := range match {
if match[i].IsValid() { if match[i].IsValid() {
if test.result[0][i] != match[i] { if test.result[0][i] != match[i] {
t.Errorf("Wanted %v Got %v\n", test.result[0], match) t.Errorf("Wanted %v Got %v\n", test.result[0], match)
} }
} else {
if i < len(test.result) && test.result[0][i].IsValid() {
t.Errorf("Wanted %v Got %v\n", test.result[0], match)
}
}
} }
} }
}) })
} }
} }
// TestFindStringSubmatch runs every entry of groupTests through
// FindStringSubmatch and compares the returned group strings with the text
// spanned by the groups of the first expected match.
//
// An expected group that is invalid, or that lies beyond the end of the
// expected list, is treated as the empty string. A returned group with no
// expected counterpart is therefore reported as a test failure rather than
// causing an index-out-of-range panic.
func TestFindStringSubmatch(t *testing.T) {
	for _, test := range groupTests {
		t.Run(test.re+" "+test.str, func(t *testing.T) {
			regComp, err := Compile(test.re, test.flags...)
			if err != nil {
				// A nil result marks a pattern that is expected not to compile.
				if test.result != nil {
					t.Fatalf("unexpected compile error: %v", err)
				}
				return
			}

			// Text of each group of the first expected match ("" for invalid groups).
			var expectedStr []string
			if len(test.result) != 0 {
				expectedStr = funcMap(test.result[0], func(g Group) string {
					if g.IsValid() {
						return test.str[g.StartIdx:g.EndIdx]
					}
					return ""
				})
			}

			matchStr := regComp.FindStringSubmatch(test.str)
			switch {
			case matchStr == nil:
				if len(test.result) != 0 {
					t.Errorf("Wanted %v got no match\n", expectedStr)
				}
			case len(test.result) == 0:
				t.Errorf("Wanted no match got %v\n", matchStr)
			default:
				for i, groupStr := range matchStr {
					want := ""
					if i < len(expectedStr) {
						want = expectedStr[i]
					}
					if groupStr != want {
						t.Errorf("Wanted %v Got %v\n", expectedStr, matchStr)
					}
				}
			}
		})
	}
}
// TestFindAllStringSubmatch runs every entry of groupTests through
// FindAllStringSubmatch and compares the returned group strings, match by
// match, with the substrings of test.str spanned by the expected groups.
//
// A pattern that fails to compile is tolerated only when the test declares no
// result (test.result == nil); otherwise the compile error panics.
func TestFindAllStringSubmatch(t *testing.T) {
	for _, test := range groupTests {
		t.Run(test.re+" "+test.str, func(t *testing.T) {
			regComp, err := Compile(test.re, test.flags...)
			if err != nil {
				if test.result != nil {
					panic(err)
				}
				return
			}

			matchStrs := regComp.FindAllStringSubmatch(test.str)
			if len(test.result) == 0 {
				if matchStrs != nil {
					t.Errorf("Wanted no match got %v\n", matchStrs)
				}
				return
			}

			// Text of each expected group of each expected match; an invalid
			// group is represented by the empty string.
			expectedStrs := funcMap(test.result, func(m Match) []string {
				return funcMap(m, func(g Group) string {
					if g.IsValid() {
						return test.str[g.StartIdx:g.EndIdx]
					}
					return ""
				})
			})
			if matchStrs == nil {
				t.Errorf("Wanted %v got no match\n", expectedStrs)
				return
			}

			// The per-match comparison below indexes expectedStrs by match
			// position, so a differing match count is reported up front:
			// extra matches would index out of range, and missing matches
			// would otherwise go unnoticed.
			if len(matchStrs) != len(expectedStrs) {
				t.Errorf("Wanted %v Got %v\n", expectedStrs, matchStrs)
				return
			}

			for i, matchStr := range matchStrs {
				for j, groupStr := range matchStr {
					// A returned group with no expected counterpart is
					// compared against "", so an unexpected non-empty group
					// fails the test instead of indexing out of range.
					want := ""
					if j < len(expectedStrs[i]) {
						want = expectedStrs[i][j]
					}
					if groupStr != want {
						t.Errorf("Wanted %v Got %v\n", expectedStrs, matchStrs)
					}
				}
			}
		})
	}
}
func TestFindAllSubmatch(t *testing.T) { func TestFindAllSubmatch(t *testing.T) {
for _, test := range groupTests { for _, test := range groupTests {
t.Run(test.re+" "+test.str, func(t *testing.T) { t.Run(test.re+" "+test.str, func(t *testing.T) {
@@ -809,7 +957,7 @@ func TestFindAllSubmatch(t *testing.T) {
if test.result != nil { if test.result != nil {
panic(err) panic(err)
} }
} } else {
matchIndices := regComp.FindAllSubmatch(test.str) matchIndices := regComp.FindAllSubmatch(test.str)
for i := range matchIndices { for i := range matchIndices {
for j := range matchIndices[i] { for j := range matchIndices[i] {
@@ -817,6 +965,11 @@ func TestFindAllSubmatch(t *testing.T) {
if test.result[i][j] != matchIndices[i][j] { if test.result[i][j] != matchIndices[i][j] {
t.Errorf("Wanted %v Got %v\n", test.result, matchIndices) t.Errorf("Wanted %v Got %v\n", test.result, matchIndices)
} }
} else {
if i < len(test.result) && j < len(test.result[i]) && test.result[i][j].IsValid() {
t.Errorf("Wanted %v Got %v\n", test.result, matchIndices)
}
}
} }
} }
} }