Compare commits
54 Commits
7e792f1248
...
posixStyle
| Author | SHA1 | Date | |
|---|---|---|---|
| ef476e8875 | |||
| 7e6b02632f | |||
| f94e3f2e71 | |||
| b129d83c3f | |||
| 43aa7b5876 | |||
| 9a3bfca313 | |||
| b6ab54f6dd | |||
| 6a96c98d04 | |||
| 3cfc2a6854 | |||
| 5d7a02e796 | |||
| a46d2f4546 | |||
| c88ebd1aa5 | |||
| fd102292c6 | |||
| 6d692d0dfc | |||
| 7c4538a259 | |||
| 2a9ae0b68a | |||
| 783ae2ad10 | |||
| b5e6bc112c | |||
| 206fea34cd | |||
| fcdb23524a | |||
| ac936659b6 | |||
| e6dba9fdcf | |||
| 30779a446b | |||
| f629a0f08f | |||
| 6869cd00a2 | |||
| 02bc8f30a2 | |||
| ac05bceda3 | |||
| 037ac75ea6 | |||
| e9d4e857cf | |||
| b685d2fd5f | |||
| 8eda5055ff | |||
| 45b6566b2c | |||
| e22822e619 | |||
| 692de2a32b | |||
| 0d19664044 | |||
| 1bfb09b6c7 | |||
| b0b8bf23af | |||
| 00570f07fe | |||
| 7431b1a7b2 | |||
| ee51e39d59 | |||
| db7c884b83 | |||
| c3059fe899 | |||
| 4f577592ba | |||
| b734d61a03 | |||
| 00c39857eb | |||
| aa9e2324ee | |||
| 66b96bf9e8 | |||
| 0ac39bfb7b | |||
| dbc9fe2c3b | |||
| eeeb9387d5 | |||
| 57eb935bd1 | |||
| cbd679949f | |||
| a63426d965 | |||
| 2e3450285c |
11
LICENSE
Normal file
11
LICENSE
Normal file
@@ -0,0 +1,11 @@
|
||||
The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2025 Aadhavan Srinivasan
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
10
Makefile
10
Makefile
@@ -1,9 +1,13 @@
|
||||
.DEFAULT_GOAL := build
|
||||
.PHONY: fmt vet build
|
||||
.DEFAULT_GOAL := buildCmd
|
||||
.PHONY: fmt vet buildLib buildCmd test
|
||||
|
||||
fmt:
|
||||
go fmt ./...
|
||||
vet: fmt
|
||||
go vet ./...
|
||||
build: vet
|
||||
buildLib: vet
|
||||
go build -gcflags="-N -l" ./...
|
||||
buildCmd: buildLib
|
||||
go build -C cmd/ -gcflags="-N -l" -o re ./...
|
||||
test: buildCmd
|
||||
go test -v ./...
|
||||
|
||||
@@ -121,12 +121,12 @@ func main() {
|
||||
}
|
||||
matchIndices := make([]reg.Match, 0)
|
||||
if matchNumFlagEnabled {
|
||||
tmp, err := reg.FindNthMatch(regComp, test_str, *matchNum)
|
||||
tmp, err := regComp.FindNthMatch(test_str, *matchNum)
|
||||
if err == nil {
|
||||
matchIndices = append(matchIndices, tmp)
|
||||
}
|
||||
} else {
|
||||
matchIndices = reg.FindAllMatches(regComp, test_str)
|
||||
matchIndices = regComp.FindAllSubmatch(test_str)
|
||||
}
|
||||
|
||||
if *printMatchesFlag {
|
||||
@@ -137,7 +137,7 @@ func main() {
|
||||
fmt.Fprintf(out, "Line %d:\n", lineNum)
|
||||
}
|
||||
for _, m := range matchIndices {
|
||||
fmt.Fprintf(out, "%s\n", m.ToString())
|
||||
fmt.Fprintf(out, "%s\n", m.String())
|
||||
}
|
||||
err := out.Flush()
|
||||
if err != nil {
|
||||
|
||||
@@ -18,6 +18,12 @@ type Reg struct {
|
||||
numGroups int
|
||||
}
|
||||
|
||||
// NumSubexp returns the number of sub-expressions in the given [Reg]. This is equivalent
|
||||
// to the number of capturing groups.
|
||||
func (r Reg) NumSubexp() int {
|
||||
return r.numGroups
|
||||
}
|
||||
|
||||
const concatRune rune = 0xF0001
|
||||
|
||||
// Flags for shuntingYard - control its behavior
|
||||
@@ -171,7 +177,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
}
|
||||
re_runes = append(re_runes, []rune(regex)...)
|
||||
} else if c == '(' && i < len(re_runes_orig)-2 && re_runes_orig[i+1] == '?' && re_runes_orig[i+2] == ':' {
|
||||
re_runes = append(re_runes, NONCAPLPAREN_CHAR)
|
||||
re_runes = append(re_runes, nonCapLparenRune)
|
||||
i += 2
|
||||
} else if c == '\\' && i < len(re_runes_orig)-1 && re_runes_orig[i+1] == '\\' { // Escaped backslash
|
||||
re_runes = append(re_runes, escBackslashRune)
|
||||
@@ -233,7 +239,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
i++
|
||||
}
|
||||
if re_runes[i] == '-' && (i > 0 && re_runes[i-1] != '\\') && (i < len(re_runes)-1 && re_runes[i+1] != rbracketRune) { // Unescaped hyphen, that has some character (not a RBRACKET) after it - This represents a character range, so we replace with CHAR_RANGE. This metacharacter will be used later on to construct the range
|
||||
re_runes[i] = CHAR_RANGE
|
||||
re_runes[i] = charRangeRune
|
||||
}
|
||||
toAppend = append(toAppend, re_runes[i])
|
||||
i++
|
||||
@@ -254,7 +260,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
re_postfix = append(re_postfix, re_runes[i]) // Append closing brace
|
||||
}
|
||||
if i < len(re_runes)-3 && string(re_runes[i+1:i+4]) == "(?:" { // Non-capturing lparen
|
||||
re_postfix = append(re_postfix, NONCAPLPAREN_CHAR)
|
||||
re_postfix = append(re_postfix, nonCapLparenRune)
|
||||
i += 3
|
||||
}
|
||||
if i < len(re_runes) && re_runes[i] == '\\' { // Something is being escaped (I don't add the backslash to re_postfix, because it was already added earlier)
|
||||
@@ -303,7 +309,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
if i >= len(re_runes) {
|
||||
return nil, fmt.Errorf("unclosed lookaround")
|
||||
}
|
||||
if re_runes[i] == '(' || re_runes[i] == NONCAPLPAREN_CHAR {
|
||||
if re_runes[i] == '(' || re_runes[i] == nonCapLparenRune {
|
||||
numOpenParens++
|
||||
}
|
||||
if re_runes[i] == ')' {
|
||||
@@ -317,7 +323,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
}
|
||||
continue
|
||||
}
|
||||
if i < len(re_runes) && (re_runes[i] != '(' && re_runes[i] != NONCAPLPAREN_CHAR && re_runes[i] != '|' && re_runes[i] != '\\') || (i > 0 && re_runes[i-1] == '\\') { // Every character should be concatenated if it is escaped
|
||||
if i < len(re_runes) && (re_runes[i] != '(' && re_runes[i] != nonCapLparenRune && re_runes[i] != '|' && re_runes[i] != '\\') || (i > 0 && re_runes[i-1] == '\\') { // Every character should be concatenated if it is escaped
|
||||
if i < len(re_runes)-1 {
|
||||
if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != '+' && re_runes[i+1] != '?' && re_runes[i+1] != ')' && re_runes[i+1] != '{' {
|
||||
re_postfix = append(re_postfix, concatRune)
|
||||
@@ -407,7 +413,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
} else {
|
||||
escapedNode, err := newEscapedNode(re_postfix[i], false)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("invalid escape character in expression")
|
||||
return nil, err
|
||||
}
|
||||
outQueue = append(outQueue, escapedNode)
|
||||
}
|
||||
@@ -433,7 +439,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
if i >= len(re_postfix) {
|
||||
return nil, fmt.Errorf("unclosed lookaround")
|
||||
}
|
||||
if re_postfix[i] == '(' || re_postfix[i] == NONCAPLPAREN_CHAR {
|
||||
if re_postfix[i] == '(' || re_postfix[i] == nonCapLparenRune {
|
||||
numOpenParens++
|
||||
}
|
||||
if re_postfix[i] == ')' {
|
||||
@@ -524,7 +530,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
if firstCharAdded && re_postfix[i] == rbracketRune {
|
||||
break
|
||||
}
|
||||
if re_postfix[i] == CHAR_RANGE {
|
||||
if re_postfix[i] == charRangeRune {
|
||||
endOfRange = true
|
||||
i++
|
||||
continue
|
||||
@@ -575,7 +581,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
} else {
|
||||
escapedNode, err := newEscapedNode(re_postfix[i], true)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("invalid escape character in character class")
|
||||
return nil, err
|
||||
}
|
||||
chars = append(chars, escapedNode)
|
||||
i++
|
||||
@@ -757,7 +763,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
outQueue[idx].startReps = startRangeNum
|
||||
outQueue[idx].endReps = endRangeNum
|
||||
}
|
||||
if c == '(' || c == NONCAPLPAREN_CHAR {
|
||||
if c == '(' || c == nonCapLparenRune {
|
||||
opStack = append(opStack, c)
|
||||
if c == '(' { // We only push _capturing_ group parentheses to outQueue
|
||||
outQueue = append(outQueue, newPostfixNode(c))
|
||||
@@ -768,7 +774,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
// Keep popping from opStack until we encounter an opening parenthesis or a nonCapLparenRune. Throw error if we reach the end of the stack.
|
||||
var val rune
|
||||
var err error
|
||||
for val, err = peek(opStack); val != '(' && val != NONCAPLPAREN_CHAR; val, err = peek(opStack) {
|
||||
for val, err = peek(opStack); val != '(' && val != nonCapLparenRune; val, err = peek(opStack) {
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("imbalanced parantheses")
|
||||
}
|
||||
@@ -878,6 +884,10 @@ func thompson(re []postfixNode) (Reg, error) {
|
||||
stateToAdd.assert = wboundAssert
|
||||
case 'B':
|
||||
stateToAdd.assert = nonwboundAssert
|
||||
case 'A':
|
||||
stateToAdd.assert = soiAssert
|
||||
case 'z':
|
||||
stateToAdd.assert = eoiAssert
|
||||
}
|
||||
} else { // Lookaround
|
||||
stateToAdd.lookaroundRegex = string(c.contents)
|
||||
@@ -939,7 +949,9 @@ func thompson(re []postfixNode) (Reg, error) {
|
||||
// and added back in.
|
||||
// If the middle node doesn't exist (ie. something like '()' ), that's fine, I just connect the LPAREN
|
||||
// and RPAREN nodes.
|
||||
// If neither node exists, that's a problem so I return an error.
|
||||
// If the middle node exists but is itself the start of a group, then that _must_ be the opening paren for
|
||||
// the closing paren that I'm on. I put the third node back (because it isn't involved in the capturing group), then
|
||||
// I concatenate those two and add them. If neither node exists, that's a problem so I return an error.
|
||||
if c.nodetype == rparenNode {
|
||||
s.groupEnd = true
|
||||
middleNode, err1 := pop(&nfa)
|
||||
@@ -954,6 +966,11 @@ func thompson(re []postfixNode) (Reg, error) {
|
||||
s.groupNum = lparenNode.groupNum
|
||||
to_add := concatenate(lparenNode, s)
|
||||
nfa = append(nfa, to_add)
|
||||
} else if middleNode.groupBegin && len(middleNode.transitions) == 0 { // The middle node is a lone lparen - something like '(())', and I'm looking at the first rparen
|
||||
nfa = append(nfa, lparenNode) // I shouldn't have popped this out, because it is not involved in the current capturing group
|
||||
s.groupNum = middleNode.groupNum // In this case, the 'middle' node is actually an lparen
|
||||
to_add := concatenate(middleNode, s)
|
||||
nfa = append(nfa, to_add)
|
||||
} else {
|
||||
// At this point, we assume all three nodes are valid ('lparenNode', 'middleNode' and 's')
|
||||
if lparenNode.groupBegin {
|
||||
@@ -1000,7 +1017,7 @@ func thompson(re []postfixNode) (Reg, error) {
|
||||
s2 := mustPop(&nfa)
|
||||
// Relax the requirements for concatenation a little bit - If
|
||||
// the second element is not found ie. the postfixNodes look
|
||||
// like 'a~', then that's fine, we just skip the concatenation.
|
||||
// like 'a'+CONCAT, then that's fine, we just skip the concatenation.
|
||||
s1, err := pop(&nfa)
|
||||
if err != nil {
|
||||
nfa = append(nfa, s2)
|
||||
@@ -1106,10 +1123,11 @@ func thompson(re []postfixNode) (Reg, error) {
|
||||
|
||||
}
|
||||
|
||||
// Compiles the given regular expression into a Reg type, suitable for use with the
|
||||
// matching functions. The second return value is non-nil if a compilation error has
|
||||
// occured. As such, the error value must be checked before using the Reg returned by this function.
|
||||
// The second parameter is an optional list of flags, passed to the parsing function shuntingYard.
|
||||
// Compile compiles the given regular expression into a [Reg].
|
||||
//
|
||||
// An error value != nil indicates that the regex was invalid; the error message should provide
|
||||
// detailed information on the nature of the error.
|
||||
// The second parameter is a sequence of zero or more [ReFlag] values, that modify the behavior of the regex.
|
||||
func Compile(re string, flags ...ReFlag) (Reg, error) {
|
||||
nodes, err := shuntingYard(re, flags...)
|
||||
if err != nil {
|
||||
@@ -1121,3 +1139,12 @@ func Compile(re string, flags ...ReFlag) (Reg, error) {
|
||||
}
|
||||
return reg, nil
|
||||
}
|
||||
|
||||
// MustCompile panics if Compile returns an error. They are identical in all other respects.
|
||||
func MustCompile(re string, flags ...ReFlag) Reg {
|
||||
reg, err := Compile(re, flags...)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
return reg
|
||||
}
|
||||
|
||||
160
regex/doc.go
Normal file
160
regex/doc.go
Normal file
@@ -0,0 +1,160 @@
|
||||
/*
|
||||
Package regex implements regular expression search, using a custom non-backtracking engine with support for lookarounds and numeric ranges.
|
||||
|
||||
The engine relies completely on UTF-8 codepoints. As such, it is capable of matching characters
|
||||
from other languages, emojis and symbols.
|
||||
|
||||
The full syntax is specified below.
|
||||
|
||||
# Syntax
|
||||
|
||||
Single characters:
|
||||
|
||||
. Match any character. Newline matching is dependent on the RE_SINGLE_LINE flag.
|
||||
[abc] Character class - match a, b or c
|
||||
[a-z] Character range - match any character from a to z
|
||||
[^abc] Negated character class - match any character except a, b and c
|
||||
[^a-z] Negated character range - do not match any character from a to z
|
||||
\[ Match a literal '['. Backslashes can escape any character with special meaning, including another backslash.
|
||||
\452 Match the character with the octal value 452 (up to 3 digits)
|
||||
\xFF Match the character with the hex value FF (exactly 2 characters)
|
||||
\x{0000FF} Match the character with the hex value 0000FF (exactly 6 characters)
|
||||
\n Newline
|
||||
\a Bell character
|
||||
\f Form-feed character
|
||||
\r Carriage return
|
||||
\t Horizontal tab
|
||||
\v Vertical tab
|
||||
|
||||
Perl classes:
|
||||
|
||||
\d Match any digit character ([0-9])
|
||||
\D Match any non-digit character ([^0-9])
|
||||
\w Match any word character ([a-zA-Z0-9_])
|
||||
\W Match any non-word character ([^a-zA-Z0-9_])
|
||||
\s Match any whitespace character ([ \t\n])
|
||||
\S Match any non-whitespace character ([^ \t\n])
|
||||
|
||||
POSIX classes (inside normal character classes):
|
||||
|
||||
[:digit:] All digit characters ([0-9])
|
||||
[:upper:] All upper-case letters ([A-Z])
|
||||
[:lower:] All lower-case letters ([a-z])
|
||||
[:alpha:] All letters ([a-zA-Z])
|
||||
[:alnum:] All alphanumeric characters ([a-zA-Z0-9])
|
||||
[:xdigit:] All hexadecimal characters ([a-fA-F0-9])
|
||||
[:blank:] All blank characters ([ \t])
|
||||
[:space:] All whitespace characters ([ \t\n\r\f\v])
|
||||
[:cntrl:] All control characters ([\x00-\x1F\x7F])
|
||||
[:punct:] All punctuation characters
|
||||
[:graph:] All graphical characters ([\x21-\x7E])
|
||||
[:print:] All graphical characters + space ([\x20-\x7E])
|
||||
[:word:] All word characters (\w)
|
||||
[:ascii:] All ASCII values ([\x00-\x7F])
|
||||
|
||||
Composition:
|
||||
|
||||
def Match d, followed by e, followed by f
|
||||
x|y Match x or y (prefer longer one)
|
||||
xy|z Match xy or z
|
||||
|
||||
Repetition (always greedy, preferring more):
|
||||
|
||||
x* Match x zero or more times
|
||||
x+ Match x one or more times
|
||||
x? Match x zero or one time
|
||||
x{m,n} Match x between m and n times (inclusive)
|
||||
x{m,} Match x at least m times
|
||||
x{,n} Match x between 0 and n times (inclusive)
|
||||
x{m} Match x exactly m times
|
||||
|
||||
Grouping:
|
||||
|
||||
(expr) Create a capturing group. The contents of the group can be retrieved with [Reg.FindSubmatch] or [Reg.FindAllSubmatch]
|
||||
x(y|z) Match x followed by y or z. Given a successful match, the contents of group 1 will include either y or z
|
||||
(?:expr) Create a non-capturing group. The contents of the group aren't saved.
|
||||
x(?:y|z) Match x followed by y or z. No groups are created.
|
||||
|
||||
Assertions:
|
||||
|
||||
^ Match at the start of the input string. If RE_MULTILINE is enabled, it also matches at the start of every line.
|
||||
$ Match at the end of the input string. If RE_MULTILINE is enabled, it also matches at the end of every line.
|
||||
\A Always match at the start of the string, regardless of RE_MULTILINE
|
||||
\z Always match at the end of the string, regardless of RE_MULTILINE
|
||||
\b Match at a word boundary (a word character followed by a non-word character, or vice-versa)
|
||||
\B Match at a non-word boundary (a word character followed by a word character, or a non-word character followed by a non-word character)
|
||||
|
||||
Lookarounds:
|
||||
|
||||
x(?=y) Positive lookahead - Match x if followed by y
|
||||
x(?!y) Negative lookahead - Match x if NOT followed by y
|
||||
(?<=x)y Positive lookbehind - Match y if preceded by x
|
||||
(?<!x)y Negative lookbehind - Match y if NOT preceded by x
|
||||
|
||||
Numeric ranges:
|
||||
|
||||
<x-y> Match any number from x to y (inclusive) (x and y must be positive numbers)
|
||||
|
||||
# Key Differences with regexp
|
||||
|
||||
The engine and the API differ from [regexp] in a number of ways, some of them very subtle.
|
||||
The key differences are mentioned below.
|
||||
|
||||
1. Greediness:
|
||||
|
||||
This engine does not support non-greedy operators. All operators are always greedy in nature, and will try
|
||||
to match as much as they can, while still allowing for a successful match. For example, given the regex:
|
||||
|
||||
y*y
|
||||
|
||||
The engine will match as many 'y's as it can, while still allowing the trailing 'y' to be matched.
|
||||
|
||||
Another, more subtle example is the following regex:
|
||||
|
||||
x|xx
|
||||
|
||||
While the stdlib implementation (and most other engines) will prefer matching the first item of the alternation,
|
||||
this engine will go for the longest possible match, regardless of the order of the alternation. Although this
|
||||
strays from the convention, it results in a nice rule-of-thumb - the engine is ALWAYS greedy.
|
||||
|
||||
The stdlib implementation has a function [regexp.Regexp.Longest] which makes future searches prefer the longest match.
|
||||
That is the default (and unchangeable) behavior in this engine.
|
||||
|
||||
2. Byte-slices and runes:
|
||||
|
||||
My engine does not support byte-slices. When a matching function receives a string, it converts it into a
|
||||
rune-slice to iterate through it. While this has some space overhead, the convenience of built-in unicode
|
||||
support made the tradeoff worth it.
|
||||
|
||||
3. Return values
|
||||
|
||||
Rather than using primitives for return values, my engine defines two types that are used as return
|
||||
values: a [Group] represents a capturing group, and a [Match] represents a list of groups.
|
||||
|
||||
[regexp] specifies a regular expression that gives a list of all the matching functions that it supports. The
|
||||
equivalent expression for this engine is:
|
||||
|
||||
Find(All)?(String)?(Submatch)?
|
||||
|
||||
[Reg.Find] returns the index of the leftmost match in the string.
|
||||
|
||||
If a function contains 'All' it returns all matches instead of just the leftmost one.
|
||||
|
||||
If a function contains 'String' it returns the matched text, rather than the indices.
|
||||
|
||||
If a function contains 'Submatch' it returns the match, including all submatches found by
|
||||
capturing groups.
|
||||
|
||||
The term '0-group' is used to refer to the 0th capturing group of a match (which is the entire match).
|
||||
Given the following regex:
|
||||
|
||||
x(y)
|
||||
|
||||
and the input string:
|
||||
|
||||
xyz
|
||||
|
||||
The 0th group would contain 'xy' and the 1st group would contain 'y'. Any matching function without 'Submatch' in its name
|
||||
returns the 0-group.
|
||||
*/
|
||||
package regex
|
||||
54
regex/example_test.go
Normal file
54
regex/example_test.go
Normal file
@@ -0,0 +1,54 @@
|
||||
package regex_test
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"gitea.twomorecents.org/Rockingcool/kleingrep/regex"
|
||||
)
|
||||
|
||||
func ExampleReg_Find() {
|
||||
regexStr := "b|a"
|
||||
regexComp := regex.MustCompile(regexStr)
|
||||
|
||||
match, _ := regexComp.Find("banana")
|
||||
fmt.Println(match.String())
|
||||
|
||||
// Output: 0 1
|
||||
}
|
||||
|
||||
func ExampleReg_FindAll() {
|
||||
regexStr := "b|a"
|
||||
regexComp := regex.MustCompile(regexStr)
|
||||
|
||||
matches := regexComp.FindAll("banana")
|
||||
for _, group := range matches {
|
||||
fmt.Println(group.String())
|
||||
}
|
||||
|
||||
// Output: 0 1
|
||||
// 1 2
|
||||
// 3 4
|
||||
// 5 6
|
||||
}
|
||||
|
||||
func ExampleReg_FindString() {
|
||||
regexStr := `\d+`
|
||||
regexComp := regex.MustCompile(regexStr)
|
||||
|
||||
matchStr := regexComp.FindString("The year of our lord, 2025")
|
||||
fmt.Println(matchStr)
|
||||
// Output: 2025
|
||||
}
|
||||
|
||||
func ExampleReg_FindSubmatch() {
|
||||
regexStr := `(\d)\.(\d)(\d)`
|
||||
regexComp := regex.MustCompile(regexStr)
|
||||
|
||||
match, _ := regexComp.FindSubmatch("3.14")
|
||||
fmt.Println(match[0])
|
||||
fmt.Println(match[1])
|
||||
fmt.Println(match[2])
|
||||
// Output: 0 4
|
||||
// 0 1
|
||||
// 2 3
|
||||
}
|
||||
@@ -5,7 +5,13 @@ import (
|
||||
"sort"
|
||||
)
|
||||
|
||||
// a Match stores a slice of all the capturing groups in a match.
|
||||
// A Match represents a match found by the regex in a given string.
|
||||
// It is represented as a list of groups, where the nth element contains
|
||||
// the contents of the nth capturing group. Note that the group may not be valid
|
||||
// (see [Group.IsValid]). The element at index 0 is known
|
||||
// as the 0-group, and represents the contents of the entire match.
|
||||
//
|
||||
// See [Reg.FindSubmatch] for an example.
|
||||
type Match []Group
|
||||
|
||||
// a Group represents a group. It contains the start index and end index of the match
|
||||
@@ -35,28 +41,34 @@ func (m Match) numValidGroups() int {
|
||||
}
|
||||
|
||||
// Returns a string containing the indices of all (valid) groups in the match
|
||||
func (m Match) ToString() string {
|
||||
func (m Match) String() string {
|
||||
var toRet string
|
||||
for i, g := range m {
|
||||
if g.isValid() {
|
||||
if g.IsValid() {
|
||||
toRet += fmt.Sprintf("Group %d\n", i)
|
||||
toRet += g.toString()
|
||||
toRet += g.String()
|
||||
toRet += "\n"
|
||||
}
|
||||
}
|
||||
return toRet
|
||||
}
|
||||
|
||||
// Converts the Group into a string representation:
|
||||
func (idx Group) toString() string {
|
||||
// String converts the Group into a string representation.
|
||||
func (idx Group) String() string {
|
||||
return fmt.Sprintf("%d\t%d", idx.StartIdx, idx.EndIdx)
|
||||
}
|
||||
|
||||
// Returns whether a group contains valid indices
|
||||
func (g Group) isValid() bool {
|
||||
// IsValid returns whether a group is valid (ie. whether it matched any text). It
|
||||
// simply ensures that both indices of the group are >= 0.
|
||||
func (g Group) IsValid() bool {
|
||||
return g.StartIdx >= 0 && g.EndIdx >= 0
|
||||
}
|
||||
|
||||
// Simple function, makes it easier to map this over a list of matches
|
||||
func getZeroGroup(m Match) Group {
|
||||
return m[0]
|
||||
}
|
||||
|
||||
// takeZeroState takes the 0-state (if such a transition exists) for all states in the
|
||||
// given slice. It returns the resulting states. If any of the resulting states is a 0-state,
|
||||
// the second ret val is true.
|
||||
@@ -101,7 +113,7 @@ func zeroMatchPossible(str []rune, idx int, numGroups int, states ...*nfaState)
|
||||
num_appended := 0 // number of unique states addded to tempstates
|
||||
for isZero == true {
|
||||
zeroStates, isZero = takeZeroState(tempstates, numGroups, idx)
|
||||
tempstates, num_appended = unique_append(tempstates, zeroStates...)
|
||||
tempstates, num_appended = uniqueAppend(tempstates, zeroStates...)
|
||||
if num_appended == 0 { // break if we haven't appended any more unique values
|
||||
break
|
||||
}
|
||||
@@ -138,36 +150,72 @@ func pruneIndices(indices []Match) []Match {
|
||||
return toRet
|
||||
}
|
||||
|
||||
// FindString returns a _string_ containing the _text_ of the _leftmost_ match of
|
||||
// the regex, in the given string. The return value will be an empty string in two situations:
|
||||
// Find returns the 0-group of the leftmost match of the regex in the given string.
|
||||
// An error value != nil indicates that no match was found.
|
||||
func (regex Reg) Find(str string) (Group, error) {
|
||||
match, err := regex.FindNthMatch(str, 1)
|
||||
if err != nil {
|
||||
return Group{}, fmt.Errorf("no matches found")
|
||||
}
|
||||
return getZeroGroup(match), nil
|
||||
}
|
||||
|
||||
// FindAll returns a slice containing all the 0-groups of the regex in the given string.
|
||||
// A 0-group represents the match without any submatches.
|
||||
func (regex Reg) FindAll(str string) []Group {
|
||||
indices := regex.FindAllSubmatch(str)
|
||||
zeroGroups := funcMap(indices, getZeroGroup)
|
||||
return zeroGroups
|
||||
}
|
||||
|
||||
// FindString returns the text of the leftmost match of the regex in the given string.
|
||||
// The return value will be an empty string in two situations:
|
||||
// 1. No match was found
|
||||
// 2. The match was an empty string
|
||||
func FindString(regex Reg, str string) string {
|
||||
match, err := FindNthMatch(regex, str, 1)
|
||||
func (regex Reg) FindString(str string) string {
|
||||
match, err := regex.FindNthMatch(str, 1)
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
return str[match[0].StartIdx:match[0].EndIdx]
|
||||
zeroGroup := getZeroGroup(match)
|
||||
return str[zeroGroup.StartIdx:zeroGroup.EndIdx]
|
||||
}
|
||||
|
||||
// FindSubmatch returns the leftmost match of the regex in the given string, including
|
||||
// the submatches matched by capturing groups. The returned [Match] will always contain the same
|
||||
// number of groups. The validity of a group (whether or not it matched anything) can be determined with
|
||||
// [Group.IsValid], or by checking that both indices of the group are >= 0.
|
||||
// The second return value is non-nil if no match was found.
|
||||
func (regex Reg) FindSubmatch(str string) (Match, error) {
|
||||
match, err := regex.FindNthMatch(str, 1)
|
||||
if err != nil {
|
||||
return Match{}, fmt.Errorf("no match found")
|
||||
} else {
|
||||
return match, nil
|
||||
}
|
||||
}
|
||||
|
||||
// FindAllString is the 'all' version of FindString.
|
||||
// It returns a _slice of strings_ containing the _text_ of _all_ matches of
|
||||
// the regex, in the given string.
|
||||
//func FindAllString(regex Reg, str []string) []string {
|
||||
//
|
||||
//}
|
||||
// It returns a slice of strings containing the text of all matches of
|
||||
// the regex in the given string.
|
||||
func (regex Reg) FindAllString(str string) []string {
|
||||
zerogroups := regex.FindAll(str)
|
||||
matchStrs := funcMap(zerogroups, func(g Group) string {
|
||||
return str[g.StartIdx:g.EndIdx]
|
||||
})
|
||||
return matchStrs
|
||||
}
|
||||
|
||||
// FindNthMatch finds the 'n'th match of the regex represented by the given start-state, with
|
||||
// the given string.
|
||||
// FindNthMatch returns the 'n'th match of the regex in the given string.
|
||||
// It returns an error (!= nil) if there are fewer than 'n' matches in the string.
|
||||
func FindNthMatch(regex Reg, str string, n int) (Match, error) {
|
||||
func (regex Reg) FindNthMatch(str string, n int) (Match, error) {
|
||||
idx := 0
|
||||
matchNum := 0
|
||||
str_runes := []rune(str)
|
||||
var matchFound bool
|
||||
var matchIdx Match
|
||||
for idx <= len(str_runes) {
|
||||
matchFound, matchIdx, idx = findAllMatchesHelper(regex.start, str_runes, idx, regex.numGroups)
|
||||
matchFound, matchIdx, idx = findAllSubmatchHelper(regex.start, str_runes, idx, regex.numGroups)
|
||||
if matchFound {
|
||||
matchNum++
|
||||
}
|
||||
@@ -179,16 +227,15 @@ func FindNthMatch(regex Reg, str string, n int) (Match, error) {
|
||||
return nil, fmt.Errorf("invalid match index - too few matches found")
|
||||
}
|
||||
|
||||
// FindAllMatches tries to find all matches of the regex represented by given start-state, with
|
||||
// the given string
|
||||
func FindAllMatches(regex Reg, str string) []Match {
|
||||
// FindAllSubmatch returns a slice of matches in the given string.
|
||||
func (regex Reg) FindAllSubmatch(str string) []Match {
|
||||
idx := 0
|
||||
str_runes := []rune(str)
|
||||
var matchFound bool
|
||||
var matchIdx Match
|
||||
indices := make([]Match, 0)
|
||||
for idx <= len(str_runes) {
|
||||
matchFound, matchIdx, idx = findAllMatchesHelper(regex.start, str_runes, idx, regex.numGroups)
|
||||
matchFound, matchIdx, idx = findAllSubmatchHelper(regex.start, str_runes, idx, regex.numGroups)
|
||||
if matchFound {
|
||||
indices = append(indices, matchIdx)
|
||||
}
|
||||
@@ -196,6 +243,7 @@ func FindAllMatches(regex Reg, str string) []Match {
|
||||
if len(indices) > 0 {
|
||||
return pruneIndices(indices)
|
||||
}
|
||||
|
||||
return indices
|
||||
}
|
||||
|
||||
@@ -204,12 +252,13 @@ func FindAllMatches(regex Reg, str string) []Match {
|
||||
// the next search should start from.
|
||||
//
|
||||
// Might return duplicates or overlapping indices, so care must be taken to prune the resulting array.
|
||||
func findAllMatchesHelper(start *nfaState, str []rune, offset int, numGroups int) (bool, Match, int) {
|
||||
func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups int) (bool, Match, int) {
|
||||
// Base case - exit if offset exceeds string's length
|
||||
if offset > len(str) {
|
||||
// The second value here shouldn't be used, because we should exit when the third return value is > than len(str)
|
||||
return false, []Group{}, offset
|
||||
}
|
||||
resetThreads(start)
|
||||
|
||||
// Hold a list of match indices for the current run. When we
|
||||
// can no longer find a match, the match with the largest range is
|
||||
@@ -265,13 +314,13 @@ func findAllMatchesHelper(start *nfaState, str []rune, offset int, numGroups int
|
||||
num_appended := 0
|
||||
for isZero == true {
|
||||
zeroStates, isZero = takeZeroState(tempStates, numGroups, i)
|
||||
tempStates, num_appended = unique_append(tempStates, zeroStates...)
|
||||
tempStates, num_appended = uniqueAppend(tempStates, zeroStates...)
|
||||
if num_appended == 0 { // Break if we haven't appended any more unique values
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
currentStates, _ = unique_append(currentStates, tempStates...)
|
||||
currentStates, _ = uniqueAppend(currentStates, tempStates...)
|
||||
tempStates = nil
|
||||
|
||||
// Take any transitions corresponding to current character
|
||||
@@ -345,7 +394,7 @@ func findAllMatchesHelper(start *nfaState, str []rune, offset int, numGroups int
|
||||
// Check if we can find a zero-length match
|
||||
if foundPath == false {
|
||||
if ok := zeroMatchPossible(str, i, numGroups, currentStates...); ok {
|
||||
if tempIndices[0].isValid() == false {
|
||||
if tempIndices[0].IsValid() == false {
|
||||
tempIndices[0] = Group{startIdx, startIdx}
|
||||
}
|
||||
}
|
||||
@@ -355,7 +404,7 @@ func findAllMatchesHelper(start *nfaState, str []rune, offset int, numGroups int
|
||||
startIdx++
|
||||
// i++
|
||||
// }
|
||||
if tempIndices.numValidGroups() > 0 && tempIndices[0].isValid() {
|
||||
if tempIndices.numValidGroups() > 0 && tempIndices[0].IsValid() {
|
||||
if tempIndices[0].StartIdx == tempIndices[0].EndIdx { // If we have a zero-length match, we have to shift the index at which we start. Otherwise we keep looking at the same part of the string over and over.
|
||||
return true, tempIndices, tempIndices[0].EndIdx + 1
|
||||
} else {
|
||||
@@ -378,7 +427,7 @@ func findAllMatchesHelper(start *nfaState, str []rune, offset int, numGroups int
|
||||
num_appended := 0 // Number of unique states added to tempStates
|
||||
for isZero == true {
|
||||
zeroStates, isZero = takeZeroState(tempStates, numGroups, i)
|
||||
tempStates, num_appended = unique_append(tempStates, zeroStates...)
|
||||
tempStates, num_appended = uniqueAppend(tempStates, zeroStates...)
|
||||
if num_appended == 0 { // Break if we haven't appended any more unique values
|
||||
break
|
||||
}
|
||||
|
||||
@@ -15,7 +15,7 @@ var lparenRune rune = 0xF0005 // Parentheses in regex are concatenated with thi
|
||||
var rparenRune rune = 0xF0006
|
||||
var nonCapLparenRune rune = 0xF0007 // Represents a non-capturing group's LPAREN
|
||||
var escBackslashRune rune = 0xF0008 // Represents an escaped backslash
|
||||
var CHAR_RANGE rune = 0xF0009 // Represents a character range
|
||||
var charRangeRune rune = 0xF0009 // Represents a character range
|
||||
|
||||
var specialChars = []rune{'?', '*', '\\', '^', '$', '{', '}', '(', ')', '[', ']', '+', '|', '.', concatRune, '<', '>', lbracketRune, rbracketRune, nonCapLparenRune}
|
||||
|
||||
@@ -50,7 +50,7 @@ func isNormalChar(c rune) bool {
|
||||
|
||||
// Ensure that the given elements are only appended to the given slice if they
|
||||
// don't already exist. Returns the new slice, and the number of unique items appended.
|
||||
func unique_append[T comparable](slc []T, items ...T) ([]T, int) {
|
||||
func uniqueAppend[T comparable](slc []T, items ...T) ([]T, int) {
|
||||
num_appended := 0
|
||||
for _, item := range items {
|
||||
if !slices.Contains(slc, item) {
|
||||
@@ -61,6 +61,25 @@ func unique_append[T comparable](slc []T, items ...T) ([]T, int) {
|
||||
return slc, num_appended
|
||||
}
|
||||
|
||||
// uniqueAppendFunc returns a copy of slc extended with every element of items
// for which no equivalent element is already present. Equivalence is decided
// by fn, which is called as fn(item, existing) and should report whether the
// two values are to be treated as the same.
//
// Each item is compared against the original elements of slc and against the
// items appended earlier in the same call, so a value repeated within items
// is added only once. slc itself is never modified. The second return value
// is the number of items that were appended.
func uniqueAppendFunc[T any](slc []T, fn func(T, T) bool, items ...T) ([]T, int) {
	// Work on a private copy so the caller's backing array is left untouched.
	toRet := make([]T, len(slc), len(slc)+len(items))
	copy(toRet, slc)

	numAppended := 0
	for _, item := range items {
		// Search toRet rather than slc, so that duplicates inside items are caught as well.
		exists := slices.ContainsFunc(toRet, func(val T) bool {
			return fn(item, val)
		})
		if exists {
			continue
		}
		toRet = append(toRet, item)
		numAppended++
	}
	return toRet, numAppended
}
|
||||
|
||||
// Returns true only if all the given elements are equal
|
||||
func allEqual[T comparable](items ...T) bool {
|
||||
first := items[0]
|
||||
|
||||
52
regex/nfa.go
52
regex/nfa.go
@@ -11,8 +11,10 @@ type assertType int
|
||||
|
||||
const (
|
||||
noneAssert assertType = iota
|
||||
sosAssert
|
||||
eosAssert
|
||||
sosAssert // Start of string (^)
|
||||
soiAssert // Start of input (\A)
|
||||
eosAssert // End of string ($)
|
||||
eoiAssert // End of input (\Z)
|
||||
wboundAssert
|
||||
nonwboundAssert
|
||||
plaAssert // Positive lookahead
|
||||
@@ -102,6 +104,26 @@ func cloneStateHelper(stateToClone *nfaState, cloneMap map[*nfaState]*nfaState)
|
||||
return clone
|
||||
}
|
||||
|
||||
// Reset any thread-related fields of the NFA starting from the given state.
|
||||
func resetThreads(start *nfaState) {
|
||||
visitedMap := make(map[*nfaState]bool) // The value type doesn't matter here
|
||||
resetThreadsHelper(start, visitedMap)
|
||||
}
|
||||
|
||||
func resetThreadsHelper(state *nfaState, visitedMap map[*nfaState]bool) {
|
||||
if _, ok := visitedMap[state]; ok {
|
||||
return
|
||||
}
|
||||
// Assuming it hasn't been visited
|
||||
state.threadGroups = nil
|
||||
visitedMap[state] = true
|
||||
for _, v := range state.transitions {
|
||||
for _, nextState := range v {
|
||||
resetThreadsHelper(nextState, visitedMap)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Checks if the given state's assertion is true. Returns true if the given
|
||||
// state doesn't have an assertion.
|
||||
func (s nfaState) checkAssertion(str []rune, idx int) bool {
|
||||
@@ -119,6 +141,15 @@ func (s nfaState) checkAssertion(str []rune, idx int) bool {
|
||||
// Index is at the end of the string, or it points to the last character which is a newline
|
||||
return idx == len(str) || (multilineMode && str[idx] == '\n')
|
||||
}
|
||||
if s.assert == soiAssert {
|
||||
// Only true at the start of the input, regardless of mode
|
||||
return idx == 0
|
||||
}
|
||||
if s.assert == eoiAssert {
|
||||
// Only true at the end of the input, regardless of mode
|
||||
return idx == len(str)
|
||||
}
|
||||
|
||||
if s.assert == wboundAssert {
|
||||
return isWordBoundary(str, idx)
|
||||
}
|
||||
@@ -145,17 +176,18 @@ func (s nfaState) checkAssertion(str []rune, idx int) bool {
|
||||
strToMatch = string(runesToMatch)
|
||||
}
|
||||
|
||||
matchIndices := FindAllMatches(Reg{startState, s.lookaroundNumCaptureGroups}, strToMatch)
|
||||
regComp := Reg{startState, s.lookaroundNumCaptureGroups}
|
||||
matchIndices := regComp.FindAll(strToMatch)
|
||||
|
||||
numMatchesFound := 0
|
||||
for _, matchIdx := range matchIndices {
|
||||
if s.assert == plaAssert || s.assert == nlaAssert { // Lookahead - return true (or false) if at least one match starts at 0. Zero is used because the test-string _starts_ from idx.
|
||||
if matchIdx[0].StartIdx == 0 {
|
||||
if matchIdx.StartIdx == 0 {
|
||||
numMatchesFound++
|
||||
}
|
||||
}
|
||||
if s.assert == plbAssert || s.assert == nlbAssert { // Lookbehind - return true (or false) if at least one match _ends_ at the current index.
|
||||
if matchIdx[0].EndIdx == idx {
|
||||
if matchIdx.EndIdx == idx {
|
||||
numMatchesFound++
|
||||
}
|
||||
}
|
||||
@@ -262,7 +294,7 @@ func concatenate(s1 *nfaState, s2 *nfaState) *nfaState {
|
||||
}
|
||||
for i := range s1.output {
|
||||
for _, c := range s2.content { // Create a transition from each of s1's output states to s2, for every element in s2's content
|
||||
s1.output[i].transitions[c], _ = unique_append(s1.output[i].transitions[c], s2)
|
||||
s1.output[i].transitions[c], _ = uniqueAppend(s1.output[i].transitions[c], s2)
|
||||
}
|
||||
}
|
||||
s1.output = s2.output
|
||||
@@ -282,11 +314,11 @@ func kleene(s1 nfaState) (*nfaState, error) {
|
||||
toReturn.output = append(toReturn.output, toReturn)
|
||||
for i := range s1.output {
|
||||
for _, c := range toReturn.content {
|
||||
s1.output[i].transitions[c], _ = unique_append(s1.output[i].transitions[c], toReturn)
|
||||
s1.output[i].transitions[c], _ = uniqueAppend(s1.output[i].transitions[c], toReturn)
|
||||
}
|
||||
}
|
||||
for _, c := range s1.content {
|
||||
toReturn.transitions[c], _ = unique_append(toReturn.transitions[c], &s1)
|
||||
toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], &s1)
|
||||
}
|
||||
return toReturn, nil
|
||||
}
|
||||
@@ -302,10 +334,10 @@ func alternate(s1 *nfaState, s2 *nfaState) *nfaState {
|
||||
// This would lead to multiple instances of the same set of match indices, since both
|
||||
// 's1' states would be considered to match.
|
||||
for _, c := range s1.content {
|
||||
toReturn.transitions[c], _ = unique_append(toReturn.transitions[c], s1)
|
||||
toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], s1)
|
||||
}
|
||||
for _, c := range s2.content {
|
||||
toReturn.transitions[c], _ = unique_append(toReturn.transitions[c], s2)
|
||||
toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], s2)
|
||||
}
|
||||
toReturn.content = newContents(epsilon)
|
||||
toReturn.isEmpty = true
|
||||
|
||||
@@ -95,6 +95,16 @@ func newEscapedNode(c rune, inCharClass bool) (postfixNode, error) {
|
||||
toReturn.nodetype = assertionNode
|
||||
toReturn.contents = append(toReturn.contents, c)
|
||||
}
|
||||
if c == 'B' && inCharClass { // Invalid
|
||||
return postfixNode{}, fmt.Errorf("word boundaries are not allowed in character class")
|
||||
}
|
||||
case 'A', 'z': // A is start of input, z is end of input (regardless of RE_MULTILINE)
|
||||
if inCharClass {
|
||||
return postfixNode{}, fmt.Errorf("input boundaries are not allowed in character class")
|
||||
} else {
|
||||
toReturn.nodetype = assertionNode
|
||||
toReturn.contents = append(toReturn.contents, c)
|
||||
}
|
||||
case 'n': // Newline character
|
||||
toReturn.nodetype = characterNode
|
||||
toReturn.contents = append(toReturn.contents, '\n')
|
||||
|
||||
@@ -3,7 +3,9 @@ package regex
|
||||
import (
|
||||
"fmt"
|
||||
"math"
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
type numRange struct {
|
||||
@@ -99,13 +101,11 @@ func range2regex(start int, end int) (string, error) {
|
||||
// Last range - tmp to rangeEnd
|
||||
ranges = append(ranges, numRange{tmp, rangeEnd})
|
||||
|
||||
regex := string(NONCAPLPAREN_CHAR)
|
||||
regexSlice := make([]string, 0)
|
||||
// Generate the regex
|
||||
for i, rg := range ranges {
|
||||
if i > 0 {
|
||||
regex += "|"
|
||||
}
|
||||
regex += string(NONCAPLPAREN_CHAR)
|
||||
for _, rg := range ranges {
|
||||
tmpStr := ""
|
||||
tmpStr += string(nonCapLparenRune)
|
||||
startSlc := intToSlc(rg.start)
|
||||
endSlc := intToSlc(rg.end)
|
||||
if len(startSlc) != len(endSlc) {
|
||||
@@ -113,14 +113,27 @@ func range2regex(start int, end int) (string, error) {
|
||||
}
|
||||
for i := range startSlc {
|
||||
if startSlc[i] == endSlc[i] {
|
||||
regex += string(rune(startSlc[i] + 48)) // '0' is ascii value 48, 1 is 49 etc. To convert the digit to its character form, we can just add 48.
|
||||
tmpStr += string(rune(startSlc[i] + 48)) // '0' is ascii value 48, 1 is 49 etc. To convert the digit to its character form, we can just add 48.
|
||||
} else {
|
||||
regex += fmt.Sprintf("%c%c-%c%c", LBRACKET, rune(startSlc[i]+48), rune(endSlc[i]+48), RBRACKET)
|
||||
tmpStr += fmt.Sprintf("%c%c-%c%c", lbracketRune, rune(startSlc[i]+48), rune(endSlc[i]+48), rbracketRune)
|
||||
}
|
||||
}
|
||||
regex += ")"
|
||||
tmpStr += ")"
|
||||
regexSlice = append(regexSlice, tmpStr)
|
||||
}
|
||||
regex += ")"
|
||||
// Each element of the slice represents one 'group'. Taking 0-255 as an example, the elements would be:
|
||||
// 1. 0-9
|
||||
// 2. 10-99
|
||||
// 3. 100-199
|
||||
// 4. 200-249
|
||||
// 5. 250-255
|
||||
//
|
||||
// The slice is reversed before joining because the original (ascending) order is incompatible with the PCRE rule for matching.
|
||||
// The PCRE rule specifies that the left-branch of an alternation is preferred. Even though this engine uses the POSIX
|
||||
// rule at the moment (which prefers the longest match regardless of the order of the alternation), reversing the string
|
||||
// has no downsides. It doesn't affect POSIX matching, and it will reduce my burden if I decide to switch to PCRE matching.
|
||||
slices.Reverse(regexSlice)
|
||||
regex := string(nonCapLparenRune) + strings.Join(regexSlice, "|") + ")"
|
||||
return regex, nil
|
||||
|
||||
}
|
||||
|
||||
106
regex/re_test.go
106
regex/re_test.go
@@ -105,6 +105,9 @@ var reTests = []struct {
|
||||
{"(a|b){3,4}", nil, "ababaa", []Group{{0, 4}}},
|
||||
{"(bc){5,}", nil, "bcbcbcbcbcbcbcbc", []Group{{0, 16}}},
|
||||
{`\d{3,4}`, nil, "1209", []Group{{0, 4}}},
|
||||
{`\d{3,4}`, nil, "120", []Group{{0, 3}}},
|
||||
{`\d{3,4}`, nil, "12709", []Group{{0, 4}}},
|
||||
{`\d{3,4}`, nil, "12", []Group{}},
|
||||
{`\d{3,4}`, nil, "109", []Group{{0, 3}}},
|
||||
{`\d{3,4}`, nil, "5", []Group{}},
|
||||
{`\d{3,4}`, nil, "123135", []Group{{0, 4}}},
|
||||
@@ -443,8 +446,11 @@ var reTests = []struct {
|
||||
{`abc$`, []ReFlag{RE_MULTILINE}, "jkl\n123abc\nxyz", []Group{{7, 10}}},
|
||||
{`^`, nil, "jkl\n123abc\nxyz", []Group{{0, 0}}},
|
||||
{`^`, []ReFlag{RE_MULTILINE}, "jkl\n123abc\nxyz", []Group{{0, 0}, {4, 4}, {11, 11}}},
|
||||
{`\A`, []ReFlag{RE_MULTILINE}, "jkl\n123abc\nxyz", []Group{{0, 0}}},
|
||||
{`$`, nil, "jkl\n123abc\nxyz", []Group{{14, 14}}},
|
||||
{`$`, []ReFlag{RE_MULTILINE}, "jkl\n123abc\nxyz", []Group{{3, 3}, {10, 10}, {14, 14}}},
|
||||
{`\z`, []ReFlag{RE_MULTILINE}, "jkl\n123abc\nxyz", []Group{{14, 14}}},
|
||||
{`^abc\z`, []ReFlag{RE_MULTILINE}, "abc\nabc\nabc", []Group{{8, 11}}},
|
||||
|
||||
{`a.b`, nil, "a\nb", []Group{}},
|
||||
{`a.b`, []ReFlag{RE_SINGLE_LINE}, "a\nb", []Group{{0, 3}}},
|
||||
@@ -668,9 +674,20 @@ var groupTests = []struct {
|
||||
{`^([ab]*)(?<!(a))c`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 2}}}},
|
||||
|
||||
{`(<389-400>)`, nil, `391`, []Match{[]Group{{0, 3}, {0, 3}}}},
|
||||
|
||||
// // Tests from https://wiki.haskell.org/Regex_Posix
|
||||
// {`(()|.)(b)`, nil, `ab`, []Match{[]Group{{0, 2}, {0, 1}, {-1, -1}, {1, 2}}}},
|
||||
// {`(()|[ab])(b)`, nil, `ab`, []Match{[]Group{{0, 2}, {0, 1}, {-1, -1}, {1, 2}}}},
|
||||
// {`(()|[ab])+b`, nil, `aaab`, []Match{[]Group{{0, 4}, {2, 3}, {-1, -1}}}},
|
||||
// {`([ab]|())+b`, nil, `aaab`, []Match{[]Group{{0, 4}, {2, 3}, {-1, -1}}}},
|
||||
// // Bug - this should give {0,6},{3,6},{-1,-1} but it gives {0,6},{3,6},{3,3}
|
||||
// // {`yyyyyy`, nil, `(yyy|(x?)){2,4}`, []Match{[]Group{{0, 6}, {3, 6}, {-1, -1}}, []Group{{6, 6}, {6, 6}, {6, 6}}}},
|
||||
// {`(a|ab|c|bcd)*(d*)`, nil, `ababcd`, []Match{[]Group{{0, 6}, {3, 6}, {6, 6}}, []Group{{6, 6}, {6, 6}, {6, 6}}}},
|
||||
// // Bug - this should give {0,3},{0,3},{0,0},{0,3},{3,3} but it gives {0,3},{0,2},{0,1},{1,2},{2,3}
|
||||
// // {`((a*)(b|abc))(c*)`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 3}, {0, 0}, {0, 3}, {3, 3}}}},
|
||||
}
|
||||
|
||||
func TestFindAllMatches(t *testing.T) {
|
||||
func TestFind(t *testing.T) {
|
||||
for _, test := range reTests {
|
||||
t.Run(test.re+" "+test.str, func(t *testing.T) {
|
||||
regComp, err := Compile(test.re, test.flags...)
|
||||
@@ -679,13 +696,35 @@ func TestFindAllMatches(t *testing.T) {
|
||||
panic(fmt.Errorf("Test Error: %v", err))
|
||||
}
|
||||
} else {
|
||||
matchIndices := FindAllMatches(regComp, test.str)
|
||||
zeroGroups := make([]Group, len(matchIndices))
|
||||
for i, m := range matchIndices {
|
||||
zeroGroups[i] = m[0]
|
||||
groupIndex, err := regComp.Find(test.str)
|
||||
if err != nil { // No matches found
|
||||
if len(test.result) == 0 {
|
||||
return // Manually pass the test, because this is the expected behavior
|
||||
} else {
|
||||
t.Errorf("Wanted no match Got %v\n", groupIndex)
|
||||
}
|
||||
if !slices.Equal(test.result, zeroGroups) {
|
||||
t.Errorf("Wanted %v Got %v\n", test.result, zeroGroups)
|
||||
} else {
|
||||
if groupIndex != test.result[0] {
|
||||
t.Errorf("Wanted %v Got %v\n", test.result, groupIndex)
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestFindAll(t *testing.T) {
|
||||
for _, test := range reTests {
|
||||
t.Run(test.re+" "+test.str, func(t *testing.T) {
|
||||
regComp, err := Compile(test.re, test.flags...)
|
||||
if err != nil {
|
||||
if test.result != nil {
|
||||
panic(fmt.Errorf("Test Error: %v", err))
|
||||
}
|
||||
} else {
|
||||
matchIndices := regComp.FindAll(test.str)
|
||||
if !slices.Equal(test.result, matchIndices) {
|
||||
t.Errorf("Wanted %v Got %v\n", test.result, matchIndices)
|
||||
}
|
||||
}
|
||||
})
|
||||
@@ -701,7 +740,7 @@ func TestFindString(t *testing.T) {
|
||||
panic(err)
|
||||
}
|
||||
} else {
|
||||
foundString := FindString(regComp, test.str)
|
||||
foundString := regComp.FindString(test.str)
|
||||
if len(test.result) == 0 {
|
||||
if foundString != "" {
|
||||
t.Errorf("Expected no match got %v\n", foundString)
|
||||
@@ -717,7 +756,32 @@ func TestFindString(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestFindAllGroups(t *testing.T) {
|
||||
func TestFindAllString(t *testing.T) {
|
||||
for _, test := range reTests {
|
||||
t.Run(test.re+" "+test.str, func(t *testing.T) {
|
||||
regComp, err := Compile(test.re, test.flags...)
|
||||
if err != nil {
|
||||
if test.result != nil {
|
||||
panic(err)
|
||||
}
|
||||
} else {
|
||||
foundStrings := regComp.FindAllString(test.str)
|
||||
if len(test.result) != len(foundStrings) {
|
||||
t.Errorf("Differing number of matches: Wanted %v matches Got %v matches\n", len(test.result), len(foundStrings))
|
||||
} else {
|
||||
for idx, group := range test.result {
|
||||
groupStr := test.str[group.StartIdx:group.EndIdx]
|
||||
if groupStr != foundStrings[idx] {
|
||||
t.Errorf("Wanted %v Got %v\n", groupStr, foundStrings[idx])
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestFindSubmatch(t *testing.T) {
|
||||
for _, test := range groupTests {
|
||||
t.Run(test.re+" "+test.str, func(t *testing.T) {
|
||||
regComp, err := Compile(test.re, test.flags...)
|
||||
@@ -726,10 +790,30 @@ func TestFindAllGroups(t *testing.T) {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
matchIndices := FindAllMatches(regComp, test.str)
|
||||
match, err := regComp.FindSubmatch(test.str)
|
||||
for i := range match {
|
||||
if match[i].IsValid() {
|
||||
if test.result[0][i] != match[i] {
|
||||
t.Errorf("Wanted %v Got %v\n", test.result[0], match)
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
func TestFindAllSubmatch(t *testing.T) {
|
||||
for _, test := range groupTests {
|
||||
t.Run(test.re+" "+test.str, func(t *testing.T) {
|
||||
regComp, err := Compile(test.re, test.flags...)
|
||||
if err != nil {
|
||||
if test.result != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
matchIndices := regComp.FindAllSubmatch(test.str)
|
||||
for i := range matchIndices {
|
||||
for j := range matchIndices[i] {
|
||||
if matchIndices[i][j].isValid() {
|
||||
if matchIndices[i][j].IsValid() {
|
||||
if test.result[i][j] != matchIndices[i][j] {
|
||||
t.Errorf("Wanted %v Got %v\n", test.result, matchIndices)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user