123 Commits

Author SHA1 Message Date
662527c478 Merge pull request 'Implement PCRE Matching (prefer left-branch)' (#2) from implementPCREMatchingRules into master
Reviewed-on: #2
2025-02-09 15:24:26 -06:00
d1958f289c Commented out tests that would only pass with Longest() 2025-02-09 16:08:16 -05:00
15ee49f42e Rename method receivers from 'regex' to 're' (it's shorter) 2025-02-09 15:51:46 -05:00
b60ded4136 Don't break when a match is found, if we are looking for the longest match 2025-02-09 15:48:33 -05:00
9fbb99f86c Wrote example for Longest() 2025-02-09 15:47:57 -05:00
af15904f3b Updated documentation 2025-02-09 15:41:13 -05:00
d522f50b50 Wrote new example functions 2025-02-09 15:40:59 -05:00
fb47e082eb Wrote new methods Expand() and preferLongest(); Use new function signatures (with preferLongest); only characters should be added to next state list 2025-02-09 15:40:39 -05:00
1f5a363539 Use new function signatures (with preferLongest) 2025-02-09 15:39:09 -05:00
9e12f9dcb3 Added field to Reg, denoting if we prefer longest match (POSIX style) or not (perl style) 2025-02-09 15:38:26 -05:00
47f88c817f Fixed typo 2025-02-09 15:14:17 -05:00
835d495990 Removed capitalization for error message (staticcheck) 2025-02-09 09:14:45 -05:00
76e0170cb9 Removed unused function 2025-02-09 09:13:52 -05:00
d172a58258 Throw error if match isn't found but test.result has >0 elements 2025-02-09 09:13:29 -05:00
7231169270 Removed unused functions 2025-02-09 09:13:03 -05:00
e546f01c20 Removed redundant return (staticcheck) 2025-02-09 09:12:55 -05:00
b7467a00f1 Removed priorityQueue (unused) 2025-02-09 09:07:43 -05:00
c6ad4caa0d Removed a bunch of unused code (let's go!!!) 2025-02-09 09:06:40 -05:00
6334435b83 Updated tests since the engine uses Perl matching instead of POSIX matching; added tests for FindStringSubmatch 2025-02-09 09:01:42 -05:00
78fb5606dd Use new definition of Reg 2025-02-09 08:59:16 -05:00
eddd2ae700 Updated documentation 2025-02-09 08:58:58 -05:00
c577064977 Added string field to Reg, that contains the expression string; wrote method to return the string 2025-02-09 08:58:46 -05:00
d4e3942d27 Added Match() and FindStringSubmatch(); removed old code; updated comments 2025-02-09 08:58:09 -05:00
f15a5cae34 Store all states visited in a single run of 'addStateToList()' in a slice 2025-02-08 16:07:01 -05:00
62ca1a872a Made zeroLengthMatchState() return a pointer; reduced the number of comparisons performed by nfaState.equals 2025-02-08 16:06:14 -05:00
99230b49de Use new function signature for zeroLengthMatchState() 2025-02-08 16:05:35 -05:00
22ead83625 Fixed assertion matching 2025-02-07 16:19:36 -05:00
3604486a9b Used Pike's algorithm (an extension to Thompson's algorithm) (see Russ Cox's 2nd article); I think I almost have a working PCRE-style engine 2025-02-07 16:06:45 -05:00
052de55826 question() now returns 2 values 2025-02-07 16:04:46 -05:00
d2ad0d95a8 Modified question operator so that it doesn't create an unnecessary zero-state 2025-02-07 16:04:26 -05:00
ccf3b3b299 More progress on implementing PCRE matching 2025-02-06 22:08:56 -05:00
1d4f695f8f Wrote function to check if a state is in an nfaState, based on the Equals function 2025-02-06 22:06:51 -05:00
8534174ea1 Use pointers instead of values 2025-02-06 22:06:22 -05:00
ed4ffde64e REFACTOR NEEDED: Added another special case; insert instead of appending into currentStates 2025-02-05 22:51:55 -05:00
fbc9bea9fb Commented out unused functions; use new nfaState parameters 2025-02-05 22:23:31 -05:00
cca8c7cda2 Got rid of transitions parameter, changed how kleene state is processed
I replaced the transition parameter for nfaState, replacing it with a
single nfaState pointer. This is because any non-alternation state will
only have one next state, so the map was just added complexity.

I changed alternation processing - instead of having their own dedicated
fields, they just use the new 'next' parameter, and another one called
'splitState'.

I also changed the kleene state processing to remove the unnecessary
empty state in the right-side alternation (it actually messed up my
matching).
2025-02-05 22:20:28 -05:00
858e535fba Continued implementing Thompson's algorithm 2025-02-05 18:01:36 -05:00
7c62ba6bfd Started implementing Thompson's algorithm for matching, because the old one was completely backtracking (so it would enter infinite loops on something like '(a*)*' )
The git diff claims that a ton of code was changed, but most of it was just indentation changes.
2025-02-05 12:21:12 -05:00
d4e8cb74fd Replaced pointer to nfaState with nfaState 2025-02-05 11:32:20 -05:00
3ce611d121 More work towards implementing PCRE matching 2025-02-04 14:09:24 -05:00
e0253dfaf3 Change kleene() to an alternation-style construct 2025-02-04 14:09:04 -05:00
753e973d82 Started rewrite of matching algorithm, got concatenation and alternation done, kleene and zero-state stuff is next 2025-02-03 22:01:52 -05:00
5563a70568 Reverse the order in which I pop states for alternation, because this messes with the left branch-right branch thing 2025-02-03 21:59:41 -05:00
de0d7345a8 Store left and right branches of alternation separately 2025-02-03 21:59:05 -05:00
ad273b0c68 Trying to emulate backtracking by using string pointers within threads (something similar to rsc's 2nd regexp article) 2025-02-03 16:50:11 -05:00
e167cdb2cb Fixed mistake in test output 2025-02-03 16:49:30 -05:00
1fd48ae614 Store the current string pointer as a 'thread variable' (allows us to simulate backtracking) 2025-02-03 16:49:10 -05:00
09812956ac Disable all optimizations 2025-02-03 16:48:09 -05:00
fbc9dfcc95 Trying something out; we'll see if it works 2025-02-03 16:47:53 -05:00
bc32e0cb76 Started working on converting to PCRE matching rules (prefer left branch of alternation) 2025-02-03 14:06:14 -05:00
ad0f7d0178 Added new state fields to tell if a state is a question or alternation 2025-02-03 14:05:53 -05:00
4e597f8eb1 Implemented a priority-queue to use while matching 2025-02-03 14:05:30 -05:00
ef476e8875 Reverse the order of the numeric range before adding it, to maintain compatibility with PCRE matching rules (even though I don't implement them, if I do in the future) 2025-02-02 13:46:48 -05:00
7e6b02632f Added more tests; commented out tests that I am failing 2025-02-02 13:46:08 -05:00
f94e3f2e71 Added comments 2025-02-02 12:44:06 -05:00
b129d83c3f Added function to reset threads 2025-02-02 12:43:40 -05:00
43aa7b5876 Updated documentation 2025-02-02 12:42:38 -05:00
9a3bfca313 Renamed unique_append to uniqueAppend 2025-02-02 12:42:29 -05:00
b6ab54f6dd Reset threads when findAllSubmatchHelper is called 2025-02-02 12:42:00 -05:00
6a96c98d04 Fixed bug where the regex '(()|.)(b)' wouldn't compile 2025-02-01 19:20:33 -05:00
3cfc2a6854 Updated Makefile 2025-02-01 18:52:26 -05:00
5d7a02e796 Added gcflags to go build 2025-02-01 18:51:58 -05:00
a46d2f4546 Updated comments 2025-02-01 18:07:31 -05:00
c88ebd1aa5 Added comments explaining what a Match is 2025-02-01 18:05:55 -05:00
fd102292c6 Added example for FindSubmatch 2025-02-01 18:05:43 -05:00
6d692d0dfc Rename Group.toString() to Group.String() 2025-02-01 12:51:32 -05:00
7c4538a259 Added 'example' file that contains testable examples 2025-02-01 12:50:49 -05:00
2a9ae0b68a Wrote test for 'FindSubmatch' 2025-02-01 11:09:05 -05:00
783ae2ad10 Updated call to 'isValid' with call to 'IsValid' 2025-02-01 11:06:26 -05:00
b5e6bc112c Wrote 'reg.FindSubmatch()' which returns the leftmost match with submatches, renamed 'isValid' to 'IsValid' to export it, renamed 'ToString' to 'String' 2025-02-01 11:06:03 -05:00
206fea34cd Added function to return the number of subexpressions in the group 2025-02-01 11:04:49 -05:00
fcdb23524a Added more documentation 2025-02-01 11:04:24 -05:00
ac936659b6 Updated documentation 2025-01-31 16:52:26 -05:00
e6dba9fdcf Updated documentation 2025-01-31 16:51:46 -05:00
30779a446b Updated documentation 2025-01-31 16:46:19 -05:00
f629a0f08f Added 'mustCompile' which panics if there is an error compiling 2025-01-31 16:46:05 -05:00
6869cd00a2 Return error instead of nil when 'Find' fails 2025-01-31 10:52:38 -05:00
02bc8f30a2 Added test for 'Find' 2025-01-31 10:52:27 -05:00
ac05bceda3 Use method instead of function 2025-01-31 10:13:02 -05:00
037ac75ea6 Wrote new method to return 0-group of leftmost match; reorganized some functions for better clarity; made 'FindNthMatch' a method 2025-01-31 10:12:53 -05:00
e9d4e857cf Run 'TestFindAllStrings' since that function has been implemented 2025-01-31 10:11:52 -05:00
b685d2fd5f Renamed 'findAllMatchesHelper' to 'findAllSubmatchHelper' 2025-01-31 09:56:30 -05:00
8eda5055ff Replaced call to 'FindAllMatches' with call to 'FindAll' or 'FindAllSubmatch' depending on whether I need submatches 2025-01-31 09:55:36 -05:00
45b6566b2c Replaced function call with method call 2025-01-31 09:54:35 -05:00
e22822e619 Renamed 'FindAllMatches' to 'FindAll' and made it a method; made it return a slice of 0-groups; the functionality of 'FindAllMatches' is now in 'FindAllSubmatch' 2025-01-31 09:54:09 -05:00
692de2a32b Added lookarounds and numeric ranges to documentation 2025-01-31 09:26:21 -05:00
0d19664044 Cleared up some comments, wrote a skeleton for FindAllString 2025-01-30 22:57:35 -05:00
1bfb09b6c7 Made 'FindString' a method of 'Reg' 2025-01-30 22:51:31 -05:00
b0b8bf23af Updated documentation 2025-01-30 22:51:16 -05:00
00570f07fe Wrote documentation on syntax 2025-01-30 17:51:46 -05:00
7431b1a7b2 Changed \Z to \z to fit with Go's naming 2025-01-30 15:08:18 -05:00
ee51e39d59 Added support for start-of-input (\A) and end-of-input (\Z) assertions 2025-01-30 13:56:56 -05:00
db7c884b83 Added test for start-of-input and end-of-input assertion 2025-01-30 13:56:26 -05:00
c3059fe899 Return a new error instead of rethrowing a non-existent error 2025-01-30 13:47:51 -05:00
4f577592ba Added rule to run tests 2025-01-30 13:46:41 -05:00
b734d61a03 Throw error if \B is used in character class 2025-01-30 12:27:22 -05:00
00c39857eb Rethrow errors instead of rewriting them 2025-01-30 12:26:50 -05:00
aa9e2324ee Removed unnecessary space 2025-01-30 11:25:19 -05:00
66b96bf9e8 Updated license 2025-01-30 11:20:05 -05:00
0ac39bfb7b Started working on package-level documentation 2025-01-30 11:19:53 -05:00
dbc9fe2c3b Added license 2025-01-30 11:08:28 -05:00
eeeb9387d5 Updated Makefile to build library and command separately 2025-01-30 10:58:33 -05:00
57eb935bd1 Updated comment 2025-01-30 10:48:59 -05:00
cbd679949f Updated more references to constants 2025-01-30 10:47:35 -05:00
a63426d965 Updated references to constants 2025-01-30 10:47:01 -05:00
2e3450285c Renamed one more variable to avoid exporting 2025-01-30 10:45:11 -05:00
7e792f1248 Renamed more constants to avoid exporting 2025-01-30 10:44:34 -05:00
b8f5b9af7c Updated one more reference to epsilon 2025-01-30 10:39:00 -05:00
be60f2fb51 Updated references to 'epsilon' 2025-01-30 10:38:26 -05:00
7aee4280cc Renamed 'EPSILON' to 'epsilon' to avoid exporting 2025-01-30 10:36:10 -05:00
e01ef48cbc Updated CONCAT to be a metacharacter instead of just a tilde, and renamed it to avoid exporting 2025-01-30 10:34:05 -05:00
93474c5159 Renamed 'state' to 'nfaState' because 'state' by itself means nothing 2025-01-30 10:31:02 -05:00
d81b2ddaaa Renamed 'State' to 'state' to avoid exposing the insides of the engine 2025-01-30 10:27:56 -05:00
429d286439 Renamed variable to avoid conflicting with type name 2025-01-30 10:26:31 -05:00
198a2c12a7 Renamed variable to avoid conflicting with type name 2025-01-30 10:25:24 -05:00
7e88b8a4b0 Renamed variable to avoid conflicting with type name 2025-01-30 10:24:24 -05:00
af5b6ebe08 Renamed type to avoid exporting 2025-01-30 10:23:01 -05:00
289bba35e2 Updated assertion constants so that they aren't exported 2025-01-30 10:18:18 -05:00
7e6377a4c4 Updated more constants, so that they aren't exported 2025-01-30 10:15:54 -05:00
73c6a442ce Updated nodeType constants so that they aren't exported 2025-01-30 10:13:55 -05:00
ca8f8e1030 Renamed function 2025-01-30 10:02:59 -05:00
24a5045ebe Updated map and reduce function names so that they aren't exported 2025-01-30 09:52:00 -05:00
f6d56b74e1 Updated module name to 'kleingrep' (Let's goo!) 2025-01-30 09:22:31 -05:00
14 changed files with 1276 additions and 745 deletions

11
LICENSE Normal file
View File

@@ -0,0 +1,11 @@
The MIT License (MIT)
Copyright (c) 2025 Aadhavan Srinivasan
All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

View File

@@ -1,9 +1,13 @@
.DEFAULT_GOAL := build .DEFAULT_GOAL := buildCmd
.PHONY: fmt vet build .PHONY: fmt vet buildLib buildCmd test
fmt: fmt:
go fmt ./... go fmt ./...
vet: fmt vet: fmt
go vet ./... go vet ./...
build: vet buildLib: vet
go build -gcflags="-N -l" ./... go build -gcflags="all=-N -l" ./...
buildCmd: buildLib
go build -C cmd/ -gcflags="all=-N -l" -o re ./...
test: buildCmd
go test -v ./...

View File

@@ -9,7 +9,7 @@ import (
"github.com/fatih/color" "github.com/fatih/color"
reg "gitea.twomorecents.org/Rockingcool/kg/regex" reg "gitea.twomorecents.org/Rockingcool/kleingrep/regex"
) )
func main() { func main() {
@@ -121,12 +121,12 @@ func main() {
} }
matchIndices := make([]reg.Match, 0) matchIndices := make([]reg.Match, 0)
if matchNumFlagEnabled { if matchNumFlagEnabled {
tmp, err := reg.FindNthMatch(regComp, test_str, *matchNum) tmp, err := regComp.FindNthMatch(test_str, *matchNum)
if err == nil { if err == nil {
matchIndices = append(matchIndices, tmp) matchIndices = append(matchIndices, tmp)
} }
} else { } else {
matchIndices = reg.FindAllMatches(regComp, test_str) matchIndices = regComp.FindAllSubmatch(test_str)
} }
if *printMatchesFlag { if *printMatchesFlag {
@@ -137,7 +137,7 @@ func main() {
fmt.Fprintf(out, "Line %d:\n", lineNum) fmt.Fprintf(out, "Line %d:\n", lineNum)
} }
for _, m := range matchIndices { for _, m := range matchIndices {
fmt.Fprintf(out, "%s\n", m.ToString()) fmt.Fprintf(out, "%s\n", m.String())
} }
err := out.Flush() err := out.Flush()
if err != nil { if err != nil {

View File

@@ -16,7 +16,6 @@ func (s *uniq_arr[T]) add(vals ...T) {
s.backingMap[item] = struct{}{} s.backingMap[item] = struct{}{}
} }
} }
return
} }
func (s uniq_arr[T]) contains(val T) bool { func (s uniq_arr[T]) contains(val T) bool {

2
go.mod
View File

@@ -1,4 +1,4 @@
module gitea.twomorecents.org/Rockingcool/kg module gitea.twomorecents.org/Rockingcool/kleingrep
go 1.23.1 go 1.23.1

View File

@@ -12,13 +12,30 @@ var notDotChars []rune
// A Reg represents the result of compiling a regular expression. It contains // A Reg represents the result of compiling a regular expression. It contains
// the startState of the NFA representation of the regex, and the number of capturing // the startState of the NFA representation of the regex, and the number of capturing
// groups in the regex. // groups in the regex. It also contains the expression string.
type Reg struct { type Reg struct {
start *State start *nfaState
numGroups int numGroups int
str string
preferLongest bool
} }
const CONCAT rune = '~' // NumSubexp returns the number of sub-expressions in the given [Reg]. This is equivalent
// to the number of capturing groups.
func (re Reg) NumSubexp() int {
return re.numGroups
}
// String returns the string used to compile the expression.
func (re Reg) String() string {
return re.str
}
func (re *Reg) Longest() {
re.preferLongest = true
}
const concatRune rune = 0xF0001
// Flags for shuntingYard - control its behavior // Flags for shuntingYard - control its behavior
type ReFlag int type ReFlag int
@@ -31,7 +48,7 @@ const (
) )
func isOperator(c rune) bool { func isOperator(c rune) bool {
if c == '+' || c == '?' || c == '*' || c == '|' || c == CONCAT { if c == '+' || c == '?' || c == '*' || c == '|' || c == concatRune {
return true return true
} }
return false return false
@@ -39,7 +56,7 @@ func isOperator(c rune) bool {
/* priority returns the priority of the given operator */ /* priority returns the priority of the given operator */
func priority(op rune) int { func priority(op rune) int {
precedence := []rune{'|', CONCAT, '+', '*', '?'} precedence := []rune{'|', concatRune, '+', '*', '?'}
return slices.Index(precedence, op) return slices.Index(precedence, op)
} }
@@ -59,7 +76,7 @@ func priority(op rune) int {
func getPOSIXClass(str []rune) (bool, string) { func getPOSIXClass(str []rune) (bool, string) {
i := 0 i := 0
rtv := "" rtv := ""
for i < len(str) && (str[i] != ':' && str[i] != RBRACKET) { for i < len(str) && (str[i] != ':' && str[i] != rbracketRune) {
rtv += string(str[i]) rtv += string(str[i])
i++ i++
} }
@@ -69,7 +86,7 @@ func getPOSIXClass(str []rune) (bool, string) {
if str[i] != ':' { // The POSIX class must end with a colon and a closing bracket. It cannot end with a closing bracket first. if str[i] != ':' { // The POSIX class must end with a colon and a closing bracket. It cannot end with a closing bracket first.
return false, "" return false, ""
} }
if str[i+1] != RBRACKET { if str[i+1] != rbracketRune {
return false, "" return false, ""
} }
return true, rtv return true, rtv
@@ -171,16 +188,16 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
} }
re_runes = append(re_runes, []rune(regex)...) re_runes = append(re_runes, []rune(regex)...)
} else if c == '(' && i < len(re_runes_orig)-2 && re_runes_orig[i+1] == '?' && re_runes_orig[i+2] == ':' { } else if c == '(' && i < len(re_runes_orig)-2 && re_runes_orig[i+1] == '?' && re_runes_orig[i+2] == ':' {
re_runes = append(re_runes, NONCAPLPAREN_CHAR) re_runes = append(re_runes, nonCapLparenRune)
i += 2 i += 2
} else if c == '\\' && i < len(re_runes_orig)-1 && re_runes_orig[i+1] == '\\' { // Escaped backslash } else if c == '\\' && i < len(re_runes_orig)-1 && re_runes_orig[i+1] == '\\' { // Escaped backslash
re_runes = append(re_runes, ESC_BACKSLASH) re_runes = append(re_runes, escBackslashRune)
i++ i++
} else if c == '[' && (i == 0 || re_runes[len(re_runes)-1] != '\\') { } else if c == '[' && (i == 0 || re_runes[len(re_runes)-1] != '\\') {
re_runes = append(re_runes, LBRACKET) re_runes = append(re_runes, lbracketRune)
continue continue
} else if c == ']' && (i == 0 || re_runes[len(re_runes)-1] != '\\') { } else if c == ']' && (i == 0 || re_runes[len(re_runes)-1] != '\\') {
re_runes = append(re_runes, RBRACKET) re_runes = append(re_runes, rbracketRune)
continue continue
} else if slices.Contains([]rune{'+', '*', '?'}, c) && (i < len(re_runes_orig)-1 && re_runes_orig[i+1] == '?') { } else if slices.Contains([]rune{'+', '*', '?'}, c) && (i < len(re_runes_orig)-1 && re_runes_orig[i+1] == '?') {
return nil, fmt.Errorf("non-greedy operators are not supported") return nil, fmt.Errorf("non-greedy operators are not supported")
@@ -203,7 +220,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
i := 0 i := 0
for i < len(re_runes) { for i < len(re_runes) {
re_postfix = append(re_postfix, re_runes[i]) re_postfix = append(re_postfix, re_runes[i])
if re_runes[i] == LBRACKET && (i == 0 || re_runes[i-1] != '\\') { // We do not touch things inside brackets, unless they are escaped. if re_runes[i] == lbracketRune && (i == 0 || re_runes[i-1] != '\\') { // We do not touch things inside brackets, unless they are escaped.
toAppend := make([]rune, 0) // Holds all the runes in the current character class toAppend := make([]rune, 0) // Holds all the runes in the current character class
i++ // Skip past LBRACKET, because it was already added i++ // Skip past LBRACKET, because it was already added
@@ -211,13 +228,13 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
return nil, fmt.Errorf("opening bracket without closing bracket") return nil, fmt.Errorf("opening bracket without closing bracket")
} }
for re_runes[i] != RBRACKET || i == 0 || re_runes[i-1] == '\\' { // Skip all characters inside _unescaped_ brackets (we are _not_ at a closing bracket, or if we are, the previous character is a backslash) for re_runes[i] != rbracketRune || i == 0 || re_runes[i-1] == '\\' { // Skip all characters inside _unescaped_ brackets (we are _not_ at a closing bracket, or if we are, the previous character is a backslash)
// Make sure we haven't exceeded the length of the string. If we did, then the regex doesn't actually have a closing bracket and we should throw an error. // Make sure we haven't exceeded the length of the string. If we did, then the regex doesn't actually have a closing bracket and we should throw an error.
if i >= len(re_runes) { if i >= len(re_runes) {
return nil, fmt.Errorf("opening bracket without closing bracket") return nil, fmt.Errorf("opening bracket without closing bracket")
} }
if re_runes[i] == LBRACKET && re_runes[i+1] == ':' { // POSIX character class if re_runes[i] == lbracketRune && re_runes[i+1] == ':' { // POSIX character class
toAppend = append(toAppend, re_runes[i]) toAppend = append(toAppend, re_runes[i])
i++ i++
toAppend = append(toAppend, re_runes[i]) toAppend = append(toAppend, re_runes[i])
@@ -232,14 +249,14 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
toAppend = append(toAppend, re_runes[i]) toAppend = append(toAppend, re_runes[i])
i++ i++
} }
if re_runes[i] == '-' && (i > 0 && re_runes[i-1] != '\\') && (i < len(re_runes)-1 && re_runes[i+1] != RBRACKET) { // Unescaped hyphen, that has some character (not a RBRACKET) after it - This represents a character range, so we replace with CHAR_RANGE. This metacharacter will be used later on to construct the range if re_runes[i] == '-' && (i > 0 && re_runes[i-1] != '\\') && (i < len(re_runes)-1 && re_runes[i+1] != rbracketRune) { // Unescaped hyphen, that has some character (not a RBRACKET) after it - This represents a character range, so we replace with CHAR_RANGE. This metacharacter will be used later on to construct the range
re_runes[i] = CHAR_RANGE re_runes[i] = charRangeRune
} }
toAppend = append(toAppend, re_runes[i]) toAppend = append(toAppend, re_runes[i])
i++ i++
} }
// Add in the RBRACKET // Add in the RBRACKET
toAppend = append(toAppend, RBRACKET) toAppend = append(toAppend, rbracketRune)
re_postfix = append(re_postfix, toAppend...) re_postfix = append(re_postfix, toAppend...)
} }
if i < len(re_runes) && re_runes[i] == '{' && (i > 0 && re_runes[i-1] != '\\') { // We don't touch things inside braces, either if i < len(re_runes) && re_runes[i] == '{' && (i > 0 && re_runes[i-1] != '\\') { // We don't touch things inside braces, either
@@ -254,7 +271,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
re_postfix = append(re_postfix, re_runes[i]) // Append closing brace re_postfix = append(re_postfix, re_runes[i]) // Append closing brace
} }
if i < len(re_runes)-3 && string(re_runes[i+1:i+4]) == "(?:" { // Non-capturing lparen if i < len(re_runes)-3 && string(re_runes[i+1:i+4]) == "(?:" { // Non-capturing lparen
re_postfix = append(re_postfix, NONCAPLPAREN_CHAR) re_postfix = append(re_postfix, nonCapLparenRune)
i += 3 i += 3
} }
if i < len(re_runes) && re_runes[i] == '\\' { // Something is being escaped (I don't add the backslash to re_postfix, because it was already added earlier) if i < len(re_runes) && re_runes[i] == '\\' { // Something is being escaped (I don't add the backslash to re_postfix, because it was already added earlier)
@@ -303,7 +320,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
if i >= len(re_runes) { if i >= len(re_runes) {
return nil, fmt.Errorf("unclosed lookaround") return nil, fmt.Errorf("unclosed lookaround")
} }
if re_runes[i] == '(' || re_runes[i] == NONCAPLPAREN_CHAR { if re_runes[i] == '(' || re_runes[i] == nonCapLparenRune {
numOpenParens++ numOpenParens++
} }
if re_runes[i] == ')' { if re_runes[i] == ')' {
@@ -317,10 +334,10 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
} }
continue continue
} }
if i < len(re_runes) && (re_runes[i] != '(' && re_runes[i] != NONCAPLPAREN_CHAR && re_runes[i] != '|' && re_runes[i] != '\\') || (i > 0 && re_runes[i-1] == '\\') { // Every character should be concatenated if it is escaped if i < len(re_runes) && (re_runes[i] != '(' && re_runes[i] != nonCapLparenRune && re_runes[i] != '|' && re_runes[i] != '\\') || (i > 0 && re_runes[i-1] == '\\') { // Every character should be concatenated if it is escaped
if i < len(re_runes)-1 { if i < len(re_runes)-1 {
if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != '+' && re_runes[i+1] != '?' && re_runes[i+1] != ')' && re_runes[i+1] != '{' { if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != '+' && re_runes[i+1] != '?' && re_runes[i+1] != ')' && re_runes[i+1] != '{' {
re_postfix = append(re_postfix, CONCAT) re_postfix = append(re_postfix, concatRune)
} }
} }
} }
@@ -357,7 +374,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
// To deal with this, I make the following assertion: // To deal with this, I make the following assertion:
// If at any point I see an RBRACKET 'in the wild' (not in a character class), then it must be // If at any point I see an RBRACKET 'in the wild' (not in a character class), then it must be
// a regular character, with no special significance. // a regular character, with no special significance.
if c == RBRACKET { if c == rbracketRune {
outQueue = append(outQueue, newPostfixCharNode(']')) outQueue = append(outQueue, newPostfixCharNode(']'))
continue continue
} }
@@ -407,7 +424,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
} else { } else {
escapedNode, err := newEscapedNode(re_postfix[i], false) escapedNode, err := newEscapedNode(re_postfix[i], false)
if err != nil { if err != nil {
return nil, fmt.Errorf("invalid escape character in expression") return nil, err
} }
outQueue = append(outQueue, escapedNode) outQueue = append(outQueue, escapedNode)
} }
@@ -433,7 +450,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
if i >= len(re_postfix) { if i >= len(re_postfix) {
return nil, fmt.Errorf("unclosed lookaround") return nil, fmt.Errorf("unclosed lookaround")
} }
if re_postfix[i] == '(' || re_postfix[i] == NONCAPLPAREN_CHAR { if re_postfix[i] == '(' || re_postfix[i] == nonCapLparenRune {
numOpenParens++ numOpenParens++
} }
if re_postfix[i] == ')' { if re_postfix[i] == ')' {
@@ -450,21 +467,21 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
} }
// 'regex' should now contain the lookaround regex, plus the characters at the start (which indicate pos/neg, ahead/behind) // 'regex' should now contain the lookaround regex, plus the characters at the start (which indicate pos/neg, ahead/behind)
// Now we should filter that out. // Now we should filter that out.
toAppend := postfixNode{nodetype: ASSERTION, startReps: 1, endReps: 1} toAppend := postfixNode{nodetype: assertionNode, startReps: 1, endReps: 1}
if regex[0] == '<' { // Lookbehind if regex[0] == '<' { // Lookbehind
toAppend.lookaroundDir = LOOKBEHIND toAppend.lookaroundDir = lookbehind
regex = regex[1:] regex = regex[1:]
} else if regex[0] == '=' || regex[0] == '!' { } else if regex[0] == '=' || regex[0] == '!' {
toAppend.lookaroundDir = LOOKAHEAD toAppend.lookaroundDir = lookahead
} else { } else {
return nil, fmt.Errorf("invalid lookaround") return nil, fmt.Errorf("invalid lookaround")
} }
// Positive or negative // Positive or negative
if regex[0] == '=' { // Positive if regex[0] == '=' { // Positive
toAppend.lookaroundSign = POSITIVE toAppend.lookaroundSign = positive
toAppend.contents = []rune(regex[1:]) toAppend.contents = []rune(regex[1:])
} else if regex[0] == '!' { // Negative } else if regex[0] == '!' { // Negative
toAppend.lookaroundSign = NEGATIVE toAppend.lookaroundSign = negative
toAppend.contents = []rune(regex[1:]) toAppend.contents = []rune(regex[1:])
} else { } else {
return nil, fmt.Errorf("invalid lookaround") return nil, fmt.Errorf("invalid lookaround")
@@ -489,14 +506,14 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
topStack, _ = peek(opStack) topStack, _ = peek(opStack)
} }
outQueueFinalElement, _ := peek(outQueue) outQueueFinalElement, _ := peek(outQueue)
if (c == '*' && outQueueFinalElement.nodetype == KLEENE) || (c == '+' && outQueueFinalElement.nodetype == PLUS) { // You cannot apply a quantifier to a quantifier in this way if (c == '*' && outQueueFinalElement.nodetype == kleeneNode) || (c == '+' && outQueueFinalElement.nodetype == plusNode) { // You cannot apply a quantifier to a quantifier in this way
return nil, fmt.Errorf("illegal use of token '%c'", c) return nil, fmt.Errorf("illegal use of token '%c'", c)
} }
opStack = append(opStack, c) opStack = append(opStack, c)
} }
} }
} }
if c == LBRACKET { // Used for character classes if c == lbracketRune { // Used for character classes
firstCharAdded := false // A character class must have at least 1 character. This flag checks if the first character has been added. firstCharAdded := false // A character class must have at least 1 character. This flag checks if the first character has been added.
endOfRange := false // Set to 'true' when we encounter a CHAR_RANGE metacharacter endOfRange := false // Set to 'true' when we encounter a CHAR_RANGE metacharacter
i++ // Step forward so we can look at the character class i++ // Step forward so we can look at the character class
@@ -521,10 +538,10 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
} }
chars := make([]postfixNode, 0) // List of nodes - used only for character classes chars := make([]postfixNode, 0) // List of nodes - used only for character classes
for i < len(re_postfix) { for i < len(re_postfix) {
if firstCharAdded && re_postfix[i] == RBRACKET { if firstCharAdded && re_postfix[i] == rbracketRune {
break break
} }
if re_postfix[i] == CHAR_RANGE { if re_postfix[i] == charRangeRune {
endOfRange = true endOfRange = true
i++ i++
continue continue
@@ -575,13 +592,13 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
} else { } else {
escapedNode, err := newEscapedNode(re_postfix[i], true) escapedNode, err := newEscapedNode(re_postfix[i], true)
if err != nil { if err != nil {
return nil, fmt.Errorf("invalid escape character in character class") return nil, err
} }
chars = append(chars, escapedNode) chars = append(chars, escapedNode)
i++ i++
} }
} else { } else {
if re_postfix[i] == LBRACKET && i < len(re_postfix)-8 { // Could be the start of a POSIX class - the smallest POSIX class by word-length [[:word:]] takes 8 more characters if re_postfix[i] == lbracketRune && i < len(re_postfix)-8 { // Could be the start of a POSIX class - the smallest POSIX class by word-length [[:word:]] takes 8 more characters
temp_i := i temp_i := i
temp_i++ temp_i++
if re_postfix[temp_i] == ':' { if re_postfix[temp_i] == ':' {
@@ -643,9 +660,9 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
// will prevent it from running, as the outer if-statement will have evaluated to true. // will prevent it from running, as the outer if-statement will have evaluated to true.
if !firstCharAdded && re_postfix[i] > 0xF0000 { // It's a metacharacter that I defined, I'll have to convert it back to the regular character before adding it back, because I haven't added any characters yet. For example, '[[]', the second LBRACKET should be treated like a literal bracket. if !firstCharAdded && re_postfix[i] > 0xF0000 { // It's a metacharacter that I defined, I'll have to convert it back to the regular character before adding it back, because I haven't added any characters yet. For example, '[[]', the second LBRACKET should be treated like a literal bracket.
switch re_postfix[i] { switch re_postfix[i] {
case LBRACKET: case lbracketRune:
chars = append(chars, newPostfixCharNode('[')) chars = append(chars, newPostfixCharNode('['))
case RBRACKET: case rbracketRune:
chars = append(chars, newPostfixCharNode(']')) chars = append(chars, newPostfixCharNode(']'))
default: default:
return nil, fmt.Errorf("error parsing high-range unicode value in character class") return nil, fmt.Errorf("error parsing high-range unicode value in character class")
@@ -739,7 +756,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
return nil, fmt.Errorf("invalid start range for numeric specifier") return nil, fmt.Errorf("invalid start range for numeric specifier")
} }
if len(endRange) == 0 { // Case 3 above if len(endRange) == 0 { // Case 3 above
endRangeNum = INFINITE_REPS endRangeNum = infinite_reps
} else { // Case 2 above } else { // Case 2 above
var err error var err error
endRangeNum, err = strconv.Atoi(string(endRange)) endRangeNum, err = strconv.Atoi(string(endRange))
@@ -751,13 +768,13 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
idx := len(outQueue) - 1 idx := len(outQueue) - 1
// Get the last added node // Get the last added node
if idx < 0 || outQueue[idx].nodetype == LPAREN { if idx < 0 || outQueue[idx].nodetype == lparenNode {
return nil, fmt.Errorf("numeric specifier with no content") return nil, fmt.Errorf("numeric specifier with no content")
} }
outQueue[idx].startReps = startRangeNum outQueue[idx].startReps = startRangeNum
outQueue[idx].endReps = endRangeNum outQueue[idx].endReps = endRangeNum
} }
if c == '(' || c == NONCAPLPAREN_CHAR { if c == '(' || c == nonCapLparenRune {
opStack = append(opStack, c) opStack = append(opStack, c)
if c == '(' { // We only push _capturing_ group parentheses to outQueue if c == '(' { // We only push _capturing_ group parentheses to outQueue
outQueue = append(outQueue, newPostfixNode(c)) outQueue = append(outQueue, newPostfixNode(c))
@@ -768,7 +785,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
// Keep popping from opStack until we encounter an opening parantheses or a NONCAPLPAREN_CHAR. Throw error if we reach the end of the stack. // Keep popping from opStack until we encounter an opening parantheses or a NONCAPLPAREN_CHAR. Throw error if we reach the end of the stack.
var val rune var val rune
var err error var err error
for val, err = peek(opStack); val != '(' && val != NONCAPLPAREN_CHAR; val, err = peek(opStack) { for val, err = peek(opStack); val != '(' && val != nonCapLparenRune; val, err = peek(opStack) {
if err != nil { if err != nil {
return nil, fmt.Errorf("imbalanced parantheses") return nil, fmt.Errorf("imbalanced parantheses")
} }
@@ -799,8 +816,8 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
// Thompson's algorithm. Constructs Finite-State Automaton from given string. // Thompson's algorithm. Constructs Finite-State Automaton from given string.
// Returns start state and number of groups in regex. // Returns start state and number of groups in regex.
func thompson(re []postfixNode) (Reg, error) { func thompson(re []postfixNode) (Reg, error) {
nfa := make([]*State, 0) // Stack of states nfa := make([]*nfaState, 0) // Stack of states
numGroups := 0 // Number of capturing groups numGroups := 0 // Number of capturing groups
// If thompson() receives an empty regex, then whatever was given to shuntingYard() // If thompson() receives an empty regex, then whatever was given to shuntingYard()
// was parsed away. This doesn't mean that the regex itself is empty. // was parsed away. This doesn't mean that the regex itself is empty.
@@ -810,15 +827,14 @@ func thompson(re []postfixNode) (Reg, error) {
// In these cases, we will return an NFA with 1 state, with an assertion that is always true. // In these cases, we will return an NFA with 1 state, with an assertion that is always true.
if len(re) == 0 { if len(re) == 0 {
start := zeroLengthMatchState() start := zeroLengthMatchState()
nfa = append(nfa, &start) nfa = append(nfa, start)
} }
for _, c := range re { for _, c := range re {
if c.nodetype == CHARACTER || c.nodetype == ASSERTION { if c.nodetype == characterNode || c.nodetype == assertionNode {
state := State{} stateToAdd := nfaState{}
state.transitions = make(map[int][]*State)
if c.allChars { if c.allChars {
state.allChars = true stateToAdd.allChars = true
if len(c.except) != 0 { if len(c.except) != 0 {
// For each node that I am 'excepting' (eg. in an inverted character class): // For each node that I am 'excepting' (eg. in an inverted character class):
// - If the node itself has exceptions, then the exceptions cancel out. // - If the node itself has exceptions, then the exceptions cancel out.
@@ -827,28 +843,28 @@ func thompson(re []postfixNode) (Reg, error) {
// - If the node doesn't have exceptions (allChars == false) then the contents of the node are added to the except list. // - If the node doesn't have exceptions (allChars == false) then the contents of the node are added to the except list.
for _, node := range c.except { for _, node := range c.except {
if node.allChars { if node.allChars {
state.allChars = false stateToAdd.allChars = false
// For each postfixNode in node.except, extract the contents of the postfixNode. Concatenate them all, // For each postfixNode in node.except, extract the contents of the postfixNode. Concatenate them all,
// and them to the state's _content_. As mentioned above, if the exception has exceptions, then we can match // and them to the state's _content_. As mentioned above, if the exception has exceptions, then we can match
// those. // those.
nodeExceptChars := slices.Concat(Map(node.except, func(node postfixNode) []rune { nodeExceptChars := slices.Concat(funcMap(node.except, func(node postfixNode) []rune {
nodeContents := node.contents nodeContents := node.contents
if caseInsensitive { if caseInsensitive {
nodeContents = slices.Concat(Map(nodeContents, func(r rune) []rune { nodeContents = slices.Concat(funcMap(nodeContents, func(r rune) []rune {
return allCases(r, caseInsensitive) return allCases(r, caseInsensitive)
})...) })...)
} }
return nodeContents return nodeContents
})...) })...)
state.content = rune2Contents(nodeExceptChars) stateToAdd.content = rune2Contents(nodeExceptChars)
} else { } else {
charsToAdd := node.contents charsToAdd := node.contents
if caseInsensitive { if caseInsensitive {
charsToAdd = slices.Concat(Map(charsToAdd, func(r rune) []rune { charsToAdd = slices.Concat(funcMap(charsToAdd, func(r rune) []rune {
return allCases(r, caseInsensitive) return allCases(r, caseInsensitive)
})...) })...)
} }
state.except = append(state.except, charsToAdd...) stateToAdd.except = append(stateToAdd.except, charsToAdd...)
} }
} }
} }
@@ -857,47 +873,51 @@ func thompson(re []postfixNode) (Reg, error) {
// convert back to stateContents. // convert back to stateContents.
runesToAdd := c.contents runesToAdd := c.contents
if caseInsensitive { if caseInsensitive {
runesToAdd = slices.Concat(Map(runesToAdd, func(r rune) []rune { runesToAdd = slices.Concat(funcMap(runesToAdd, func(r rune) []rune {
return allCases(r, caseInsensitive) return allCases(r, caseInsensitive)
})...) })...)
} }
state.content = stateContents(append([]int(state.content), []int(rune2Contents(runesToAdd))...)) stateToAdd.content = stateContents(append([]int(stateToAdd.content), []int(rune2Contents(runesToAdd))...))
state.output = make([]*State, 0) stateToAdd.output = make([]*nfaState, 0)
state.output = append(state.output, &state) stateToAdd.output = append(stateToAdd.output, &stateToAdd)
state.isEmpty = false stateToAdd.isEmpty = false
if c.nodetype == ASSERTION { if c.nodetype == assertionNode {
state.isEmpty = true // This is a little weird. A lookaround has the 'isEmpty' flag set, even though it _isn't_ empty (the contents are the regex). But, there's so much error-checking that relies on this flag that it's better to keep it this way. stateToAdd.isEmpty = true // This is a little weird. A lookaround has the 'isEmpty' flag set, even though it _isn't_ empty (the contents are the regex). But, there's so much error-checking that relies on this flag that it's better to keep it this way.
state.content = newContents(EPSILON) // Ideally, an assertion shouldn't have any content, since it doesn't say anything about the content of string stateToAdd.content = newContents(epsilon) // Ideally, an assertion shouldn't have any content, since it doesn't say anything about the content of string
if c.lookaroundDir == 0 || c.lookaroundSign == 0 { if c.lookaroundDir == 0 || c.lookaroundSign == 0 {
switch c.contents[0] { switch c.contents[0] {
case '^': case '^':
state.assert = SOS stateToAdd.assert = sosAssert
case '$': case '$':
state.assert = EOS stateToAdd.assert = eosAssert
case 'b': case 'b':
state.assert = WBOUND stateToAdd.assert = wboundAssert
case 'B': case 'B':
state.assert = NONWBOUND stateToAdd.assert = nonwboundAssert
case 'A':
stateToAdd.assert = soiAssert
case 'z':
stateToAdd.assert = eoiAssert
} }
} else { // Lookaround } else { // Lookaround
state.lookaroundRegex = string(c.contents) stateToAdd.lookaroundRegex = string(c.contents)
if c.lookaroundDir == LOOKAHEAD { if c.lookaroundDir == lookahead {
if c.lookaroundSign == POSITIVE { if c.lookaroundSign == positive {
state.assert = PLA stateToAdd.assert = plaAssert
} }
if c.lookaroundSign == NEGATIVE { if c.lookaroundSign == negative {
state.assert = NLA stateToAdd.assert = nlaAssert
} }
} }
if c.lookaroundDir == LOOKBEHIND { if c.lookaroundDir == lookbehind {
if c.lookaroundSign == POSITIVE { if c.lookaroundSign == positive {
state.assert = PLB stateToAdd.assert = plbAssert
} }
if c.lookaroundSign == NEGATIVE { if c.lookaroundSign == negative {
state.assert = NLB stateToAdd.assert = nlbAssert
} }
} }
tmpRe, err := shuntingYard(state.lookaroundRegex) tmpRe, err := shuntingYard(stateToAdd.lookaroundRegex)
if err != nil { if err != nil {
return Reg{}, fmt.Errorf("error parsing lookaround: %w", err) return Reg{}, fmt.Errorf("error parsing lookaround: %w", err)
} }
@@ -905,28 +925,27 @@ func thompson(re []postfixNode) (Reg, error) {
if err != nil { if err != nil {
return Reg{}, fmt.Errorf("error compiling lookaround: %w", err) return Reg{}, fmt.Errorf("error compiling lookaround: %w", err)
} }
state.lookaroundNFA = reg.start stateToAdd.lookaroundNFA = reg.start
state.lookaroundNumCaptureGroups = reg.numGroups stateToAdd.lookaroundNumCaptureGroups = reg.numGroups
} }
} }
// Replace ESC_BACKSLASH with actual backslash, so that we can actually check if we encounter it // Replace ESC_BACKSLASH with actual backslash, so that we can actually check if we encounter it
replaceByValue([]int(state.content), int(ESC_BACKSLASH), '\\') replaceByValue([]int(stateToAdd.content), int(escBackslashRune), '\\')
replaceByValue(state.except, ESC_BACKSLASH, '\\') replaceByValue(stateToAdd.except, escBackslashRune, '\\')
nfa = append(nfa, &state) nfa = append(nfa, &stateToAdd)
} }
if c.nodetype == LPAREN || c.nodetype == RPAREN { if c.nodetype == lparenNode || c.nodetype == rparenNode {
s := &State{} s := &nfaState{}
s.assert = NONE s.assert = noneAssert
s.content = newContents(EPSILON) s.content = newContents(epsilon)
s.isEmpty = true s.isEmpty = true
s.output = make([]*State, 0) s.output = make([]*nfaState, 0)
s.output = append(s.output, s) s.output = append(s.output, s)
s.transitions = make(map[int][]*State)
// LPAREN nodes are just added normally // LPAREN nodes are just added normally
if c.nodetype == LPAREN { if c.nodetype == lparenNode {
numGroups++ numGroups++
s.groupBegin = true s.groupBegin = true
s.groupNum = numGroups s.groupNum = numGroups
@@ -939,8 +958,10 @@ func thompson(re []postfixNode) (Reg, error) {
// and added back in. // and added back in.
// If the middle node doesn't exist (ie. something like '()' ), that's fine, I just connect the LPAREN // If the middle node doesn't exist (ie. something like '()' ), that's fine, I just connect the LPAREN
// and RPAREN nodes. // and RPAREN nodes.
// If neither node exists, that's a problem so I return an error. // If the middle node exists but is itself the start of a group, then that _must_ be the opening paren for
if c.nodetype == RPAREN { // the closing paren that I'm on. I put the third node back (because it isn't involved in the capturing group), then
// I concatenate those two and add them. If neither node exists, that's a problem so I return an error.
if c.nodetype == rparenNode {
s.groupEnd = true s.groupEnd = true
middleNode, err1 := pop(&nfa) middleNode, err1 := pop(&nfa)
lparenNode, err2 := pop(&nfa) lparenNode, err2 := pop(&nfa)
@@ -954,6 +975,11 @@ func thompson(re []postfixNode) (Reg, error) {
s.groupNum = lparenNode.groupNum s.groupNum = lparenNode.groupNum
to_add := concatenate(lparenNode, s) to_add := concatenate(lparenNode, s)
nfa = append(nfa, to_add) nfa = append(nfa, to_add)
} else if middleNode.groupBegin && middleNode.numTransitions() == 0 { // The middle node is a lone lparen - something like '(())', and I'm looking at the first rparen
nfa = append(nfa, lparenNode) // I shouldn't have popped this out, because it is not involved in the current capturing group
s.groupNum = middleNode.groupNum // In this case, the 'middle' node is actually an lparen
to_add := concatenate(middleNode, s)
nfa = append(nfa, to_add)
} else { } else {
// At this point, we assume all three nodes are valid ('lparenNode', 'middleNode' and 's') // At this point, we assume all three nodes are valid ('lparenNode', 'middleNode' and 's')
if lparenNode.groupBegin { if lparenNode.groupBegin {
@@ -969,38 +995,39 @@ func thompson(re []postfixNode) (Reg, error) {
} }
} }
} }
if c.nodetype == CHARCLASS { // A Character class consists of all the nodes in it, alternated if c.nodetype == charclassNode { // A Character class consists of all the nodes in it, alternated
// Map the list of nodes to a list of states, each state containing the contents of a specific node // Map the list of nodes to a list of states, each state containing the contents of a specific node
states := Map(c.nodeContents, func(node postfixNode) *State { states := funcMap(c.nodeContents, func(node postfixNode) *nfaState {
s := newState() s := &nfaState{}
s.output = append(s.output, s)
nodeContents := node.contents nodeContents := node.contents
if caseInsensitive { if caseInsensitive {
nodeContents = slices.Concat(Map(nodeContents, func(r rune) []rune { nodeContents = slices.Concat(funcMap(nodeContents, func(r rune) []rune {
return allCases(r, caseInsensitive) return allCases(r, caseInsensitive)
})...) })...)
} }
s.content = rune2Contents(nodeContents) s.content = rune2Contents(nodeContents)
if len(node.except) > 0 { if len(node.except) > 0 {
s.allChars = true s.allChars = true
s.except = slices.Concat(Map(node.except, func(n postfixNode) []rune { s.except = slices.Concat(funcMap(node.except, func(n postfixNode) []rune {
return n.contents return n.contents
})...) })...)
} }
return &s return s
}) })
// Reduce the list of states down to a single state by alternating them // Reduce the list of states down to a single state by alternating them
toAdd := Reduce(states, func(s1 *State, s2 *State) *State { toAdd := funcReduce(states, func(s1 *nfaState, s2 *nfaState) *nfaState {
return alternate(s1, s2) return alternate(s1, s2)
}) })
nfa = append(nfa, toAdd) nfa = append(nfa, toAdd)
} }
// Must be an operator if it isn't a character // Must be an operator if it isn't a character
switch c.nodetype { switch c.nodetype {
case CONCATENATE: case concatenateNode:
s2 := mustPop(&nfa) s2 := mustPop(&nfa)
// Relax the requirements for concatenation a little bit - If // Relax the requirements for concatenation a little bit - If
// the second element is not found ie. the postfixNodes look // the second element is not found ie. the postfixNodes look
// like 'a~', then that's fine, we just skip the concatenation. // like 'a'+CONCAT, then that's fine, we just skip the concatenation.
s1, err := pop(&nfa) s1, err := pop(&nfa)
if err != nil { if err != nil {
nfa = append(nfa, s2) nfa = append(nfa, s2)
@@ -1008,32 +1035,35 @@ func thompson(re []postfixNode) (Reg, error) {
s1 = concatenate(s1, s2) s1 = concatenate(s1, s2)
nfa = append(nfa, s1) nfa = append(nfa, s1)
} }
case KLEENE: // Create a 0-state, concat the popped state after it, concat the 0-state after the popped state case kleeneNode: // Create a 0-state, concat the popped state after it, concat the 0-state after the popped state
s1, err := pop(&nfa) s1, err := pop(&nfa)
if err != nil { if err != nil {
return Reg{}, fmt.Errorf("error applying kleene star") return Reg{}, fmt.Errorf("error applying kleene star")
} }
stateToAdd, err := kleene(*s1) stateToAdd, err := kleene(s1)
if err != nil { if err != nil {
return Reg{}, err return Reg{}, err
} }
nfa = append(nfa, stateToAdd) nfa = append(nfa, stateToAdd)
case PLUS: // a+ is equivalent to aa* case plusNode: // a+ is equivalent to aa*
s1 := mustPop(&nfa) s1 := mustPop(&nfa)
s2, err := kleene(*s1) s2, err := kleene(s1)
if err != nil { if err != nil {
return Reg{}, err return Reg{}, err
} }
s1 = concatenate(s1, s2) s1 = concatenate(s1, s2)
nfa = append(nfa, s1) nfa = append(nfa, s1)
case QUESTION: // ab? is equivalent to a(b|) case questionNode: // ab? is equivalent to a(b|)
s1, err := pop(&nfa) s1, err := pop(&nfa)
if err != nil { if err != nil {
return Reg{}, fmt.Errorf("error applying question operator") return Reg{}, fmt.Errorf("error applying question operator")
} }
s2 := question(s1) s2, err := question(s1)
if err != nil {
return Reg{}, err
}
nfa = append(nfa, s2) nfa = append(nfa, s2)
case PIPE: case pipeNode:
// A pipe operator doesn't actually need either operand to be present. If an operand isn't present, // A pipe operator doesn't actually need either operand to be present. If an operand isn't present,
// it is replaced with an implicit 'matchZeroLength' state (this is the same thing that we add at the top if our // it is replaced with an implicit 'matchZeroLength' state (this is the same thing that we add at the top if our
// input has zero postfixNodes). // input has zero postfixNodes).
@@ -1042,21 +1072,21 @@ func thompson(re []postfixNode) (Reg, error) {
// '|a' // '|a'
// '^a|' // '^a|'
// '^|a' // '^|a'
s1, err1 := pop(&nfa) s2, err1 := pop(&nfa)
s2, err2 := pop(&nfa) s1, err2 := pop(&nfa)
if err2 != nil || (s2.groupBegin && len(s2.transitions) == 0) { // Doesn't exist, or its just an LPAREN if err2 != nil || (s2.groupBegin && s2.numTransitions() == 0) { // Doesn't exist, or its just an LPAREN
if err2 == nil { // Roundabout way of saying that this node existed, but it was an LPAREN, so we append it back if err2 == nil { // Roundabout way of saying that this node existed, but it was an LPAREN, so we append it back
nfa = append(nfa, s2) nfa = append(nfa, s2)
} }
tmp := zeroLengthMatchState() tmp := zeroLengthMatchState()
s2 = &tmp s2 = tmp
} }
if err1 != nil || (s1.groupBegin && len(s1.transitions) == 0) { // Doesn't exist, or its just an LPAREN if err1 != nil || (s1.groupBegin && s1.numTransitions() == 0) { // Doesn't exist, or its just an LPAREN
if err1 == nil { // See above for explanation if err1 == nil { // See above for explanation
nfa = append(nfa, s1) nfa = append(nfa, s1)
} }
tmp := zeroLengthMatchState() tmp := zeroLengthMatchState()
s1 = &tmp s1 = tmp
} }
s3 := alternate(s1, s2) s3 := alternate(s1, s2)
nfa = append(nfa, s3) nfa = append(nfa, s3)
@@ -1065,8 +1095,8 @@ func thompson(re []postfixNode) (Reg, error) {
if c.endReps != -1 && c.endReps < c.startReps { if c.endReps != -1 && c.endReps < c.startReps {
return Reg{}, fmt.Errorf("numeric specifier - start greater than end") return Reg{}, fmt.Errorf("numeric specifier - start greater than end")
} }
state := mustPop(&nfa) poppedState := mustPop(&nfa)
var stateToAdd *State = nil var stateToAdd *nfaState = nil
// Take advantage of the following facts: // Take advantage of the following facts:
// a{5} == aaaaa // a{5} == aaaaa
// a{3,5} == aaaa?a? // a{3,5} == aaaa?a?
@@ -1080,17 +1110,21 @@ func thompson(re []postfixNode) (Reg, error) {
// b. Encode the logic while parsing the string (shunting-yard). If I can expand the numeric specifier // b. Encode the logic while parsing the string (shunting-yard). If I can expand the numeric specifier
// at this point, I can leave thompson untouched. // at this point, I can leave thompson untouched.
for i := 0; i < c.startReps; i++ { // Case 1 for i := 0; i < c.startReps; i++ { // Case 1
stateToAdd = concatenate(stateToAdd, cloneState(state)) stateToAdd = concatenate(stateToAdd, cloneState(poppedState))
} }
if c.endReps == INFINITE_REPS { // Case 3 if c.endReps == infinite_reps { // Case 3
s2, err := kleene(*state) s2, err := kleene(poppedState)
if err != nil { if err != nil {
return Reg{}, err return Reg{}, err
} }
stateToAdd = concatenate(stateToAdd, s2) stateToAdd = concatenate(stateToAdd, s2)
} else { // Case 2 } else { // Case 2
for i := c.startReps; i < c.endReps; i++ { for i := c.startReps; i < c.endReps; i++ {
stateToAdd = concatenate(stateToAdd, question(cloneState(state))) tmp, err := question(cloneState(poppedState))
if err != nil {
return Reg{}, fmt.Errorf("error processing bounded repetition")
}
stateToAdd = concatenate(stateToAdd, tmp)
} }
} }
nfa = append(nfa, stateToAdd) nfa = append(nfa, stateToAdd)
@@ -1100,16 +1134,21 @@ func thompson(re []postfixNode) (Reg, error) {
return Reg{}, fmt.Errorf("invalid regex") return Reg{}, fmt.Errorf("invalid regex")
} }
verifyLastStates(nfa) lastState := newState()
lastState.isLast = true
return Reg{nfa[0], numGroups}, nil concatenate(nfa[0], &lastState)
// The string is empty here, because we add it in Compile()
return Reg{nfa[0], numGroups, "", false}, nil
} }
// Compiles the given regular expression into a Reg type, suitable for use with the // Compile compiles the given regular expression into a [Reg].
// matching functions. The second return value is non-nil if a compilation error has //
// occured. As such, the error value must be checked before using the Reg returned by this function. // An error value != nil indicates that the regex was invalid; the error message should provide
// The second parameter is an optional list of flags, passed to the parsing function shuntingYard. // detailed information on the nature of the error.
// The second parameter is a sequence of zero or more [ReFlag] values, that modify the behavior of the regex.
func Compile(re string, flags ...ReFlag) (Reg, error) { func Compile(re string, flags ...ReFlag) (Reg, error) {
nodes, err := shuntingYard(re, flags...) nodes, err := shuntingYard(re, flags...)
if err != nil { if err != nil {
@@ -1119,5 +1158,15 @@ func Compile(re string, flags ...ReFlag) (Reg, error) {
if err != nil { if err != nil {
return Reg{}, fmt.Errorf("error compiling regex: %w", err) return Reg{}, fmt.Errorf("error compiling regex: %w", err)
} }
reg.str = re
return reg, nil return reg, nil
} }
// MustCompile panics if Compile returns an error. They are identical in all other respects.
func MustCompile(re string, flags ...ReFlag) Reg {
reg, err := Compile(re, flags...)
if err != nil {
panic(err)
}
return reg
}

178
regex/doc.go Normal file
View File

@@ -0,0 +1,178 @@
/*
Package regex implements regular expression search, using a custom non-backtracking engine with support for lookarounds and numeric ranges.
The engine relies completely on UTF-8 codepoints. As such, it is capable of matching characters
from other languages, emojis and symbols.
The API and regex syntax are largely compatible with that of the stdlib's [regexp], with a few key differences (see 'Key Differences with regexp').
The full syntax is specified below.
# Syntax
Single characters:
. Match any character. Newline matching is dependent on the RE_SINGLE_LINE flag.
[abc] Character class - match a, b or c
[a-z] Character range - match any character from a to z
[^abc] Negated character class - match any character except a, b and c
[^a-z] Negated character range - do not match any character from a to z
\[ Match a literal '['. Backslashes can escape any character with special meaning, including another backslash.
\452 Match the character with the octal value 452 (up to 3 digits)
\xFF Match the character with the hex value FF (exactly 2 characters)
\x{0000FF} Match the character with the hex value 0000FF (exactly 6 characters)
\n Newline
\a Bell character
\f Form-feed character
\r Carriage return
\t Horizontal tab
\v Vertical tab
Perl classes:
\d Match any digit character ([0-9])
\D Match any non-digit character ([^0-9])
\w Match any word character ([a-zA-Z0-9_])
\W Match any non-word character ([^a-zA-Z0-9_])
\s Match any whitespace character ([ \t\n])
\S Match any non-whitespace character ([^ \t\n])
POSIX classes (inside normal character classes):
[:digit:] All digit characters ([0-9])
[:upper:] All upper-case letters ([A-Z])
[:lower:] All lower-case letters ([a-z])
[:alpha:] All letters ([a-zA-Z])
[:alnum:] All alphanumeric characters ([a-zA-Z0-9])
[:xdigit:] All hexadecimal characters ([a-fA-F0-9])
[:blank:] All blank characters ([ \t])
[:space:] All whitespace characters ([ \t\n\r\f\v])
[:cntrl:] All control characters ([\x00-\x1F\x7F])
[:punct:] All punctuation characters
[:graph:] All graphical characters ([\x21-\x7E])
[:print:] All graphical characters + space ([\x20-\x7E])
[:word:] All word characters (\w)
[:ascii:] All ASCII values ([\x00-\x7F])
Composition:
def Match d, followed by e, followed by f
x|y Match x or y (prefer x)
xy|z Match xy or z (prefer xy)
Repetition (always greedy, preferring more):
x* Match x zero or more times
x+ Match x one or more times
x? Match x zero or one time
x{m,n} Match x between m and n times (inclusive)
x{m,} Match x at least m times
x{,n} Match x between 0 and n times (inclusive)
x{m} Match x exactly m times
Grouping:
(expr) Create a capturing group. The contents of the group can be retrieved with [FindAllMatches]
x(y|z) Match x followed by y or z. Given a successful match, the contents of group 1 will include either y or z
(?:expr) Create a non-capturing group. The contents of the group aren't saved.
x(?:y|z) Match x followed by y or z. No groups are created.
Assertions:
^ Match at the start of the input string. If RE_MULTILINE is enabled, it also matches at the start of every line.
$ Match at the end of the input string. If RE_MULTILINE is enabled, it also matches at the end of every line.
\A Always match at the start of the string, regardless of RE_MULTILINE
\z Always match at the end of the string, regardless of RE_MULTILINE
\b Match at a word boundary (a word character followed by a non-word character, or vice-versa)
\B Match at a non-word boundary (a word character followed by a word character, or vice-versa)
Lookarounds:
x(?=y) Positive lookahead - Match x if followed by y
x(?!y) Negative lookahead - Match x if NOT followed by y
(?<=x)y Positive lookbehind - Match y if preceded by x
(?<!x)y Negative lookbehind - Match y if NOT preceded by x
Numeric ranges:
<x-y> Match any number from x to y (inclusive) (x and y must be positive numbers)
\<x Match a literal '<' followed by x
# Key Differences with regexp
The engine and the API differ from [regexp] in a few ways, some of them very subtle.
The key differences are mentioned below.
1. Greediness:
This engine does not support non-greedy operators. All operators are always greedy in nature, and will try
to match as much as they can, while still allowing for a successful match. For example, given the regex:
y*y
The engine will match as many 'y's as it can, while still allowing the trailing 'y' to be matched.
Another, more subtle example is the following regex:
x|xx
While the stdlib implementation (and most other engines) will prefer matching the first item of the alternation,
this engine will go for the longest possible match, regardless of the order of the alternation. Although this
strays from the convention, it results in a nice rule-of-thumb - the engine is ALWAYS greedy.
The stdlib implementation has a function [regexp.Regexp.Longest] which makes future searches prefer the longest match.
That is the default (and unchangeable) behavior in this engine.
2. Byte-slices and runes:
My engine does not support byte-slices. When a matching function receives a string, it converts it into a
rune-slice to iterate through it. While this has some space overhead, the convenience of built-in unicode
support made the tradeoff worth it.
3. Return values
Rather than using primitives for return values, my engine defines two types that are used as return
values: a [Group] represents a capturing group, and a [Match] represents a list of groups.
[regexp] specifies a regular expression that gives a list of all the matching functions that it supports. The
equivalent expression for this engine is shown below. Note that 'Index' is the default.
Find(All)?(String)?(Submatch)?
[Reg.Find] returns the index of the leftmost match in the string.
If a function contains 'All' it returns all matches instead of just the leftmost one.
If a function contains 'String' it returns the matched text, rather than the index in the string.
If a function contains 'Submatch' it returns the match, including all submatches found by
capturing groups.
The term '0-group' is used to refer to the 0th capturing group of a match (which is the entire match).
Given the following regex:
x(y)
and the input string:
xyz
The 0th group would contain 'xy' and the 1st group would contain 'y'. Any matching function without 'Submatch' in its name
returns the 0-group.
# Feature Differences
The following features from [regexp] are (currently) NOT supported:
1. Named capturing groups
2. Non-greedy operators
3. Unicode character classes
4. Embedded flags (flags are passed as arguments to [Compile])
5. Literal text with \Q ... \E
The following features are not available in [regexp], but are supported in my engine:
1. Lookarounds
2. Numeric ranges
I hope to shorten the first list, and expand the second.
*/
package regex

91
regex/example_test.go Normal file
View File

@@ -0,0 +1,91 @@
package regex_test
import (
"fmt"
"gitea.twomorecents.org/Rockingcool/kleingrep/regex"
)
// ExampleReg_Find demonstrates Find, which returns the leftmost match
// of the pattern in the input string as a pair of indices.
func ExampleReg_Find() {
	re := regex.MustCompile("b|a")
	firstMatch, _ := re.Find("banana")
	fmt.Println(firstMatch.String())
	// Output: 0 1
}
// ExampleReg_FindAll demonstrates FindAll, which returns every
// non-overlapping match of the pattern in the input.
func ExampleReg_FindAll() {
	re := regex.MustCompile("b|a")
	for _, m := range re.FindAll("banana") {
		fmt.Println(m.String())
	}
	// Output: 0 1
	// 1 2
	// 3 4
	// 5 6
}
// ExampleReg_FindString demonstrates FindString, which returns the
// matched text of the leftmost match rather than its indices.
func ExampleReg_FindString() {
	re := regex.MustCompile(`\d+`)
	fmt.Println(re.FindString("The year of our lord, 2025"))
	// Output: 2025
}
// ExampleReg_FindSubmatch demonstrates FindSubmatch. Index 0 of the
// returned match is the 0-group (the whole match); subsequent indices
// hold the contents of each capturing group.
func ExampleReg_FindSubmatch() {
	re := regex.MustCompile(`(\d)\.(\d)(\d)`)
	m, _ := re.FindSubmatch("3.14")
	for _, g := range m[:3] {
		fmt.Println(g)
	}
	// Output: 0 4
	// 0 1
	// 2 3
}
// ExampleReg_Expand shows how Expand substitutes capture-group
// references (here $1 and $2) in a template with the corresponding
// submatches, appending each expansion to an accumulator string.
func ExampleReg_Expand() {
	// Two key-value pairs, one per line. The second line of the raw
	// string literal is intentionally unindented — it is part of the data.
	inputStr := `option1: value1
option2: value2`
	regexStr := `(\w+): (\w+)`
	templateStr := "$1 = $2\n"
	// NOTE(review): the pattern has no ^/$ anchors, so RE_MULTILINE may
	// be unnecessary here — confirm against the flag's semantics.
	regexComp := regex.MustCompile(regexStr, regex.RE_MULTILINE)
	result := ""
	// Expand the template once per submatch set, accumulating into result.
	for _, submatches := range regexComp.FindAllSubmatch(inputStr) {
		result = regexComp.Expand(result, templateStr, inputStr, submatches)
	}
	fmt.Println(result)
	// Output: option1 = value1
	// option2 = value2
}
// ExampleReg_LiteralPrefix demonstrates extracting the literal text that every
// match of the expression must start with.
func ExampleReg_LiteralPrefix() {
	pattern := regex.MustCompile(`a(b|c)d*`)
	prefix, complete := pattern.LiteralPrefix()
	fmt.Println(prefix)
	fmt.Println(complete)
	// Output: a
	// false
}
// ExampleReg_Longest demonstrates switching the engine from leftmost (Perl-style)
// to leftmost-longest (POSIX-style) matching.
func ExampleReg_Longest() {
	pattern := regex.MustCompile(`x|xx`)
	fmt.Println(pattern.FindString("xx"))
	pattern.Longest()
	fmt.Println(pattern.FindString("xx"))
	// Output: x
	// xx
}

View File

@@ -2,13 +2,20 @@ package regex
import ( import (
"fmt" "fmt"
"sort" "strconv"
"unicode"
) )
// a Match stores a slice of all the capturing groups in a match. // A Match represents a match found by the regex in a given string.
// It is represented as a list of groups, where the nth element contains
// the contents of the nth capturing group. Note that the group may not be valid
// (see [Group.IsValid]). The element at index 0 is known
// as the 0-group, and represents the contents of the entire match.
//
// See [Reg.FindSubmatch] for an example.
type Match []Group type Match []Group
// a Group represents a group. It contains the start index and end index of the match // a Group represents a capturing group. It contains the start and index of the group.
type Group struct { type Group struct {
StartIdx int StartIdx int
EndIdx int EndIdx int
@@ -23,151 +30,153 @@ func newMatch(size int) Match {
return toRet return toRet
} }
// Returns the number of groups in the match whose start and end
// indices are both set (ie. >= 0).
func (m Match) numValidGroups() int {
	count := 0
	for _, group := range m {
		if group.StartIdx >= 0 && group.EndIdx >= 0 {
			count++
		}
	}
	return count
}
// Returns a string containing the indices of all (valid) groups in the match // Returns a string containing the indices of all (valid) groups in the match
func (m Match) ToString() string { func (m Match) String() string {
var toRet string var toRet string
for i, g := range m { for i, g := range m {
if g.isValid() { if g.IsValid() {
toRet += fmt.Sprintf("Group %d\n", i) toRet += fmt.Sprintf("Group %d\n", i)
toRet += g.toString() toRet += g.String()
toRet += "\n" toRet += "\n"
} }
} }
return toRet return toRet
} }
// Converts the Group into a string representation: // String converts the Group into a string representation.
func (idx Group) toString() string { func (idx Group) String() string {
return fmt.Sprintf("%d\t%d", idx.StartIdx, idx.EndIdx) return fmt.Sprintf("%d\t%d", idx.StartIdx, idx.EndIdx)
} }
// Returns whether a group contains valid indices // IsValid returns whether a group is valid (ie. whether it matched any text). It
func (g Group) isValid() bool { // simply ensures that both indices of the group are >= 0.
func (g Group) IsValid() bool {
return g.StartIdx >= 0 && g.EndIdx >= 0 return g.StartIdx >= 0 && g.EndIdx >= 0
} }
// takeZeroState takes the 0-state (if such a transition exists) for all states in the // Simple function, makes it easier to map this over a list of matches
// given slice. It returns the resulting states. If any of the resulting states is a 0-state, func getZeroGroup(m Match) Group {
// the second ret val is true. return m[0]
// If a state begins or ends a capturing group, its 'thread' is updated to contain the correct index.
// takeZeroState follows one step of epsilon (0-cost) transitions out of every
// state in 'states', collecting the destination states into rtv. The second
// return value is true if any destination itself has further epsilon
// transitions (ie. another round is needed).
// When a destination opens or closes a capturing group, its per-thread group
// slice is updated with the current string index 'idx'.
func takeZeroState(states []*State, numGroups int, idx int) (rtv []*State, isZero bool) {
for _, state := range states {
if len(state.transitions[EPSILON]) > 0 {
for _, s := range state.transitions[EPSILON] {
// Lazily allocate the destination's group slice before copying
// the parent thread's groups into it.
if s.threadGroups == nil {
s.threadGroups = newMatch(numGroups + 1)
}
copy(s.threadGroups, state.threadGroups)
if s.groupBegin {
s.threadGroups[s.groupNum].StartIdx = idx
// openParenGroups = append(openParenGroups, s.groupNum)
}
if s.groupEnd {
s.threadGroups[s.groupNum].EndIdx = idx
// closeParenGroups = append(closeParenGroups, s.groupNum)
}
}
rtv = append(rtv, state.transitions[EPSILON]...)
}
}
// Report whether any collected state still has epsilon transitions, so the
// caller knows to call us again.
for _, state := range rtv {
if len(state.transitions[EPSILON]) > 0 {
return rtv, true
}
}
return rtv, false
} }
// zeroMatchPossible returns true if a zero-length match is possible func copyThread(to *nfaState, from nfaState) {
// from any of the given states, given the string and our position in it. to.threadGroups = append([]Group{}, from.threadGroups...)
// It uses the same algorithm to find zero-states as the one inside the loop,
// so I should probably put it in a function.
// zeroMatchPossible reports whether a zero-length match can succeed from any of
// the given states at string position idx: epsilon transitions are followed to
// saturation, then we look for an empty accepting state whose assertion (if
// any) holds at this position.
func zeroMatchPossible(str []rune, idx int, numGroups int, states ...*State) bool {
zeroStates, isZero := takeZeroState(states, numGroups, idx)
tempstates := make([]*State, 0, len(zeroStates)+len(states))
tempstates = append(tempstates, states...)
tempstates = append(tempstates, zeroStates...)
num_appended := 0 // number of unique states added to tempstates
// Keep following epsilon transitions until none remain, or until no new
// unique state is discovered (prevents looping on epsilon cycles).
for isZero == true {
zeroStates, isZero = takeZeroState(tempstates, numGroups, idx)
tempstates, num_appended = unique_append(tempstates, zeroStates...)
if num_appended == 0 { // break if we haven't appended any more unique values
break
}
}
// Accept if any reachable state is an empty accepting state whose
// assertion (if present) checks out.
for _, state := range tempstates {
if state.isEmpty && (state.assert == NONE || state.checkAssertion(str, idx)) && state.isLast {
return true
}
}
return false
} }
// Prunes the slice by removing overlapping indices. // Find returns the 0-group of the leftmost match of the regex in the given string.
func pruneIndices(indices []Match) []Match { // An error value != nil indicates that no match was found.
// First, sort the slice by the start indices func (re Reg) Find(str string) (Group, error) {
sort.Slice(indices, func(i, j int) bool { match, err := re.FindNthMatch(str, 1)
return indices[i][0].StartIdx < indices[j][0].StartIdx if err != nil {
}) return Group{}, fmt.Errorf("no matches found")
toRet := make([]Match, 0, len(indices))
current := indices[0]
for _, idx := range indices[1:] {
// idx doesn't overlap with current (starts after current ends), so add current to result
// and update the current.
if idx[0].StartIdx >= current[0].EndIdx {
toRet = append(toRet, current)
current = idx
} else if idx[0].EndIdx > current[0].EndIdx {
// idx overlaps, but it is longer, so update current
current = idx
}
} }
// Add last state return getZeroGroup(match), nil
toRet = append(toRet, current)
return toRet
} }
// FindString returns a _string_ containing the _text_ of the _leftmost_ match of // Match returns a boolean value, indicating whether the regex found a match in the given string.
// the regex, in the given string. The return value will be an empty string in two situations: func (re Reg) Match(str string) bool {
_, err := re.Find(str)
return err == nil
}
// CompileMatch compiles expr and returns true if str contains a match of the expression.
// It is equivalent to [regexp.Match].
// An optional list of flags may be provided (see [ReFlag]).
// It returns an error (!= nil) if there was an error compiling the expression.
func CompileMatch(expr string, str string, flags ...ReFlag) (bool, error) {
	compiled, compileErr := Compile(expr, flags...)
	if compileErr != nil {
		return false, compileErr
	}
	return compiled.Match(str), nil
}
// FindAll returns a slice containing all the 0-groups of the regex in the given string.
// A 0-group represents the match without any submatches.
func (re Reg) FindAll(str string) []Group {
	return funcMap(re.FindAllSubmatch(str), getZeroGroup)
}
// FindString returns the text of the leftmost match of the regex in the given string.
// The return value will be an empty string in two situations:
// 1. No match was found // 1. No match was found
// 2. The match was an empty string // 2. The match was an empty string
func FindString(regex Reg, str string) string { func (re Reg) FindString(str string) string {
match, err := FindNthMatch(regex, str, 1) match, err := re.FindNthMatch(str, 1)
if err != nil { if err != nil {
return "" return ""
} }
return str[match[0].StartIdx:match[0].EndIdx] zeroGroup := getZeroGroup(match)
return str[zeroGroup.StartIdx:zeroGroup.EndIdx]
} }
// FindAllString is the 'all' version of FindString. // FindSubmatch returns the leftmost match of the regex in the given string, including
// It returns a _slice of strings_ containing the _text_ of _all_ matches of // the submatches matched by capturing groups. The returned [Match] will always contain the same
// the regex, in the given string. // number of groups. The validity of a group (whether or not it matched anything) can be determined with
//func FindAllString(regex Reg, str []string) []string { // [Group.IsValid], or by checking that both indices of the group are >= 0.
// // The second-return value is nil if no match was found.
//} func (re Reg) FindSubmatch(str string) (Match, error) {
match, err := re.FindNthMatch(str, 1)
if err != nil {
return Match{}, fmt.Errorf("no match found")
} else {
return match, nil
}
}
// FindNthMatch finds the 'n'th match of the regex represented by the given start-state, with // FindStringSubmatch is the 'string' version of [FindSubmatch]. It returns a slice of strings,
// the given string. // where the string at index i contains the text matched by the i-th capturing group.
// The 0-th index of the returned slice represents the entire match.
// An empty string at index n could mean:
//
//  1. Group n did not find a match
//  2. Group n found a zero-length match
//
// A return value of nil indicates no match.
func (re Reg) FindStringSubmatch(str string) []string {
	matchStr := make([]string, re.numGroups+1)
	match, err := re.FindSubmatch(str)
	if err != nil {
		return nil
	}
	// Track whether at least one group was valid; if none were, report
	// no match by returning nil.
	validGroupFound := false
	for i := range match {
		// Invalid groups are left as "" (the zero value from make).
		if match[i].IsValid() {
			matchStr[i] = str[match[i].StartIdx:match[i].EndIdx]
			validGroupFound = true
		}
	}
	if !validGroupFound {
		return nil
	}
	return matchStr
}
// FindAllString is the 'all' version of [FindString].
// It returns a slice of strings containing the text of all matches of
// the regex in the given string.
func (re Reg) FindAllString(str string) []string {
	extractText := func(g Group) string {
		return str[g.StartIdx:g.EndIdx]
	}
	return funcMap(re.FindAll(str), extractText)
}
// FindNthMatch return the 'n'th match of the regex in the given string.
// It returns an error (!= nil) if there are fewer than 'n' matches in the string. // It returns an error (!= nil) if there are fewer than 'n' matches in the string.
func FindNthMatch(regex Reg, str string, n int) (Match, error) { func (re Reg) FindNthMatch(str string, n int) (Match, error) {
idx := 0 idx := 0
matchNum := 0 matchNum := 0
str_runes := []rune(str) str_runes := []rune(str)
var matchFound bool var matchFound bool
var matchIdx Match var matchIdx Match
for idx <= len(str_runes) { for idx <= len(str_runes) {
matchFound, matchIdx, idx = findAllMatchesHelper(regex.start, str_runes, idx, regex.numGroups) matchFound, matchIdx, idx = findAllSubmatchHelper(re.start, str_runes, idx, re.numGroups, re.preferLongest)
if matchFound { if matchFound {
matchNum++ matchNum++
} }
@@ -179,237 +188,187 @@ func FindNthMatch(regex Reg, str string, n int) (Match, error) {
return nil, fmt.Errorf("invalid match index - too few matches found") return nil, fmt.Errorf("invalid match index - too few matches found")
} }
// FindAllMatches tries to find all matches of the regex represented by given start-state, with // FindAllSubmatch returns a slice of matches in the given string.
// the given string func (re Reg) FindAllSubmatch(str string) []Match {
func FindAllMatches(regex Reg, str string) []Match {
idx := 0 idx := 0
str_runes := []rune(str) str_runes := []rune(str)
var matchFound bool var matchFound bool
var matchIdx Match var matchIdx Match
indices := make([]Match, 0) indices := make([]Match, 0)
for idx <= len(str_runes) { for idx <= len(str_runes) {
matchFound, matchIdx, idx = findAllMatchesHelper(regex.start, str_runes, idx, regex.numGroups) matchFound, matchIdx, idx = findAllSubmatchHelper(re.start, str_runes, idx, re.numGroups, re.preferLongest)
if matchFound { if matchFound {
indices = append(indices, matchIdx) indices = append(indices, matchIdx)
} }
} }
if len(indices) > 0 {
return pruneIndices(indices)
}
return indices return indices
} }
// addStateToList expands 'state' - following split constructs (kleene,
// question, alternation), assertions, and group markers - and appends the
// resulting consumable states to 'list', which is returned. 'visited' guards
// against epsilon cycles; 'threadGroups' carries the capturing-group indices
// for the current thread.
func addStateToList(str []rune, idx int, list []nfaState, state nfaState, threadGroups []Group, visited []nfaState, preferLongest bool) []nfaState {
if stateExists(list, state) || stateExists(visited, state) {
return list
}
visited = append(visited, state)
// Kleene/question states: explore both branches. NOTE(review): the split
// branch is taken first here, while the alternation case below takes 'next'
// first - presumably this ordering implements branch priority; confirm
// before changing.
if state.isKleene || state.isQuestion {
copyThread(state.splitState, state)
list = addStateToList(str, idx, list, *state.splitState, threadGroups, visited, preferLongest)
copyThread(state.next, state)
list = addStateToList(str, idx, list, *state.next, threadGroups, visited, preferLongest)
return list
}
if state.isAlternation {
copyThread(state.next, state)
list = addStateToList(str, idx, list, *state.next, threadGroups, visited, preferLongest)
copyThread(state.splitState, state)
list = addStateToList(str, idx, list, *state.splitState, threadGroups, visited, preferLongest)
return list
}
state.threadGroups = append([]Group{}, threadGroups...)
// Assertion states contribute their successor only when the assertion holds.
// NOTE(review): if the assertion fails, control falls through and the
// assertion state itself is appended below - verify that is intentional.
if state.assert != noneAssert {
if state.checkAssertion(str, idx, preferLongest) {
copyThread(state.next, state)
return addStateToList(str, idx, list, *state.next, state.threadGroups, visited, preferLongest)
}
}
// Group-open / group-close markers record the current index in the thread's
// groups, then continue past the marker.
if state.groupBegin {
state.threadGroups[state.groupNum].StartIdx = idx
return addStateToList(str, idx, list, *state.next, state.threadGroups, visited, preferLongest)
}
if state.groupEnd {
state.threadGroups[state.groupNum].EndIdx = idx
return addStateToList(str, idx, list, *state.next, state.threadGroups, visited, preferLongest)
}
// Plain consumable state - add it to the list.
return append(list, state)
}
// Helper for FindAllMatches. Returns whether it found a match, the // Helper for FindAllMatches. Returns whether it found a match, the
// first Match it finds, and how far it got into the string ie. where // first Match it finds, and how far it got into the string ie. where
// the next search should start from. // the next search should start from.
// func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups int, preferLongest bool) (bool, Match, int) {
// Might return duplicates or overlapping indices, so care must be taken to prune the resulting array.
func findAllMatchesHelper(start *State, str []rune, offset int, numGroups int) (bool, Match, int) {
// Base case - exit if offset exceeds string's length // Base case - exit if offset exceeds string's length
if offset > len(str) { if offset > len(str) {
// The second value here shouldn't be used, because we should exit when the third return value is > than len(str) // The second value here shouldn't be used, because we should exit when the third return value is > than len(str)
return false, []Group{}, offset return false, []Group{}, offset
} }
resetThreads(start)
// Hold a list of match indices for the current run. When we currentStates := make([]nfaState, 0)
// can no longer find a match, the match with the largest range is nextStates := make([]nfaState, 0)
// chosen as the match for the entire string. i := offset // Index in string
// This allows us to pick the longest possible match (which is how greedy matching works).
// COMMENT ABOVE IS CURRENTLY NOT UP-TO-DATE
tempIndices := newMatch(numGroups + 1)
foundPath := false
startIdx := offset
endIdx := offset
currentStates := make([]*State, 0)
tempStates := make([]*State, 0) // Used to store states that should be used in next loop iteration
i := offset // Index in string
startingFrom := i // Store starting index
// If the first state is an assertion, makes sure the assertion // If the first state is an assertion, makes sure the assertion
// is true before we do _anything_ else. // is true before we do _anything_ else.
if start.assert != NONE { if start.assert != noneAssert {
if start.checkAssertion(str, offset) == false { if start.checkAssertion(str, offset, preferLongest) == false {
i++ i++
return false, []Group{}, i return false, []Group{}, i
} }
} }
// Increment until we hit a character matching the start state (assuming not 0-state)
if start.isEmpty == false {
for i < len(str) && !start.contentContains(str, i) {
i++
}
startIdx = i
startingFrom = i
i++ // Advance to next character (if we aren't at a 0-state, which doesn't match anything), so that we can check for transitions. If we advance at a 0-state, we will never get a chance to match the first character
}
start.threadGroups = newMatch(numGroups + 1) start.threadGroups = newMatch(numGroups + 1)
// Check if the start state begins a group - if so, add the start index to our list start.threadGroups[0].StartIdx = i
if start.groupBegin { currentStates = addStateToList(str, i, currentStates, *start, start.threadGroups, nil, preferLongest)
start.threadGroups[start.groupNum].StartIdx = i var match Match = nil
// tempIndices[start.groupNum].startIdx = i for idx := i; idx <= len(str); idx++ {
} if len(currentStates) == 0 {
currentStates = append(currentStates, start)
// Main loop
for i < len(str) {
foundPath = false
zeroStates := make([]*State, 0)
// Keep taking zero-states, until there are no more left to take
// Objective: If any of our current states have transitions to 0-states, replace them with the 0-state. Do this until there are no more transitions to 0-states, or there are no more unique 0-states to take.
zeroStates, isZero := takeZeroState(currentStates, numGroups, i)
tempStates = append(tempStates, zeroStates...)
num_appended := 0
for isZero == true {
zeroStates, isZero = takeZeroState(tempStates, numGroups, i)
tempStates, num_appended = unique_append(tempStates, zeroStates...)
if num_appended == 0 { // Break if we haven't appended any more unique values
break
}
}
currentStates, _ = unique_append(currentStates, tempStates...)
tempStates = nil
// Take any transitions corresponding to current character
numStatesMatched := 0 // The number of states which had at least 1 match for this round
assertionFailed := false // Whether or not an assertion failed for this round
lastStateInList := false // Whether or not a last state was in our list of states
var lastStatePtr *State = nil // Pointer to the last-state, if it was found
lastLookaroundInList := false // Whether or not a last state (that is a lookaround) was in our list of states
for _, state := range currentStates {
matches, numMatches := state.matchesFor(str, i)
if numMatches > 0 {
numStatesMatched++
tempStates = append(tempStates, matches...)
foundPath = true
for _, m := range matches {
if m.threadGroups == nil {
m.threadGroups = newMatch(numGroups + 1)
}
copy(m.threadGroups, state.threadGroups)
}
}
if numMatches < 0 {
assertionFailed = true
}
if state.isLast {
if state.isLookaround() {
lastLookaroundInList = true
}
lastStateInList = true
lastStatePtr = state
}
}
if assertionFailed && numStatesMatched == 0 { // Nothing has matched and an assertion has failed
// If I'm being completely honest, I'm not sure why I have to check specifically for a _lookaround_
// state. The explanation below is my attempt to explain this behavior.
// If you replace 'lastLookaroundInList' with 'lastStateInList', one of the test cases fails.
//
// One of the states in our list was a last state and a lookaround. In this case, we
// don't abort upon failure of the assertion, because we have found
// another path to a final state.
// Even if the last state _was_ an assertion, we can use the previously
// saved indices to find a match.
if lastLookaroundInList {
break
} else {
if i == startingFrom {
i++
}
return false, []Group{}, i
}
}
// Check if we can find a state in our list that is:
// a. A last-state
// b. Empty
// c. Doesn't assert anything
for _, s := range currentStates {
if s.isLast && s.isEmpty && s.assert == NONE {
lastStatePtr = s
lastStateInList = true
}
}
if lastStateInList { // A last-state was in the list of states. add the matchIndex to our MatchIndex list
for j := 1; j < numGroups+1; j++ {
tempIndices[j] = lastStatePtr.threadGroups[j]
}
endIdx = i
tempIndices[0] = Group{startIdx, endIdx}
}
// Check if we can find a zero-length match
if foundPath == false {
if ok := zeroMatchPossible(str, i, numGroups, currentStates...); ok {
if tempIndices[0].isValid() == false {
tempIndices[0] = Group{startIdx, startIdx}
}
}
// If we haven't moved in the string, increment the counter by 1
// to ensure we don't keep trying the same string over and over.
// if i == startingFrom {
startIdx++
// i++
// }
if tempIndices.numValidGroups() > 0 && tempIndices[0].isValid() {
if tempIndices[0].StartIdx == tempIndices[0].EndIdx { // If we have a zero-length match, we have to shift the index at which we start. Otherwise we keep looking at the same paert of the string over and over.
return true, tempIndices, tempIndices[0].EndIdx + 1
} else {
return true, tempIndices, tempIndices[0].EndIdx
}
}
return false, []Group{}, startIdx
}
currentStates = make([]*State, len(tempStates))
copy(currentStates, tempStates)
tempStates = nil
i++
}
// End-of-string reached. Go to any 0-states, until there are no more 0-states to go to. Then check if any of our states are in the end position.
// This is the exact same algorithm used inside the loop, so I should probably put it in a function.
zeroStates, isZero := takeZeroState(currentStates, numGroups, i)
tempStates = append(tempStates, zeroStates...)
num_appended := 0 // Number of unique states addded to tempStates
for isZero == true {
zeroStates, isZero = takeZeroState(tempStates, numGroups, i)
tempStates, num_appended = unique_append(tempStates, zeroStates...)
if num_appended == 0 { // Break if we haven't appended any more unique values
break break
} }
} for currentStateIdx := 0; currentStateIdx < len(currentStates); currentStateIdx++ {
currentState := currentStates[currentStateIdx]
currentStates = append(currentStates, tempStates...) if currentState.threadGroups == nil {
tempStates = nil currentState.threadGroups = newMatch(numGroups + 1)
currentState.threadGroups[0].StartIdx = idx
}
for _, state := range currentStates { if currentState.isLast {
// Only add the match if the start index is in bounds. If the state has an assertion, currentState.threadGroups[0].EndIdx = idx
// make sure the assertion checks out. match = append([]Group{}, currentState.threadGroups...)
if state.isLast && i <= len(str) { if !preferLongest {
if state.assert == NONE || state.checkAssertion(str, i) { break
for j := 1; j < numGroups+1; j++ { }
tempIndices[j] = state.threadGroups[j] } else if !currentState.isAlternation && !currentState.isKleene && !currentState.isQuestion && !currentState.groupBegin && !currentState.groupEnd && currentState.assert == noneAssert { // Normal character
if currentState.contentContains(str, idx, preferLongest) {
nextStates = addStateToList(str, idx+1, nextStates, *currentState.next, currentState.threadGroups, nil, preferLongest)
} }
endIdx = i
tempIndices[0] = Group{startIdx, endIdx}
} }
} }
currentStates = append([]nfaState{}, nextStates...)
nextStates = nil
} }
if match != nil {
if offset == match[0].EndIdx {
return true, match, match[0].EndIdx + 1
}
return true, match, match[0].EndIdx
}
return false, []Group{}, i + 1
}
if tempIndices.numValidGroups() > 0 { // Expand appends template to dst, expanding any variables in template to the relevant capturing group.
if tempIndices[0].StartIdx == tempIndices[0].EndIdx { // If we have a zero-length match, we have to shift the index at which we start. Otherwise we keep looking at the same paert of the string over and over. //
return true, tempIndices, tempIndices[0].EndIdx + 1 // A variable is of the form '$n', where 'n' is a number. It will be replaced by the contents of the n-th capturing group.
// To insert a literal $, do not put a number after it. Alternatively, you can use $$.
// src is the input string, and match must be the result of [Reg.FindSubmatch].
func (re Reg) Expand(dst string, template string, src string, match Match) string {
templateRuneSlc := []rune(template)
srcRuneSlc := []rune(src)
i := 0
for i < len(templateRuneSlc) {
c := templateRuneSlc[i]
if c == '$' {
i += 1
// The dollar sign is the last character of the string, or it is proceeded by another dollar sign
if i >= len(templateRuneSlc) || templateRuneSlc[i] == '$' {
dst += "$"
i++
} else {
numStr := ""
for unicode.IsDigit(templateRuneSlc[i]) {
numStr += string(templateRuneSlc[i])
i++
}
if numStr == "" {
dst += "$"
} else {
num, _ := strconv.Atoi(numStr)
if num < len(match) {
dst += string(srcRuneSlc[match[num].StartIdx:match[num].EndIdx])
} else {
dst += "$" + numStr
}
}
}
} else { } else {
return true, tempIndices, tempIndices[0].EndIdx dst += string(c)
i++
} }
} }
if startIdx == startingFrom { // Increment starting index if we haven't moved in the string. Prevents us from matching the same part of the string over and over. return dst
startIdx++ }
}
return false, []Group{}, startIdx // LiteralPrefix returns a string that must begin any match of the given regular expression.
// The second return value is true if the string comprises the entire expression.
// LiteralPrefix returns a string that must begin any match of the given
// regular expression. The second return value is true if the string
// comprises the entire expression.
func (re Reg) LiteralPrefix() (prefix string, complete bool) {
state := re.start
// Skip a leading assertion (eg. '^') - it contributes no literal text.
if state.assert != noneAssert {
state = state.next
}
// Walk the chain of single-character states, collecting their contents.
// Stop at the first accept state, alternation, multi-character content,
// or assertion.
for !(state.isLast) && (!state.isAlternation) && len(state.content) == 1 && state.assert == noneAssert {
// Group markers hold no text; step over them.
if state.groupBegin || state.groupEnd {
state = state.next
continue
}
prefix += string(rune(state.content[0]))
state = state.next
}
// The prefix comprises the whole expression only if we stopped at the
// accept state.
if state.isLast {
complete = true
} else {
complete = false
}
return prefix, complete
} }

View File

@@ -8,16 +8,16 @@ import (
var whitespaceChars = []rune{' ', '\t', '\n'} var whitespaceChars = []rune{' ', '\t', '\n'}
var digitChars = []rune{'0', '1', '2', '3', '4', '5', '6', '7', '8', '9'} var digitChars = []rune{'0', '1', '2', '3', '4', '5', '6', '7', '8', '9'}
var wordChars = []rune("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_") var wordChars = []rune("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_")
var LBRACKET rune = 0xF0001 var lbracketRune rune = 0xF0002
var RBRACKET rune = 0xF0002 var rbracketRune rune = 0xF0003
var ANY_CHAR rune = 0xF0003 // Represents any character - used for states where the allChars flag is on. var anyCharRune rune = 0xF0004 // Represents any character - used for states where the allChars flag is on.
var LPAREN_CHAR rune = 0xF0004 // Parentheses in regex are concatenated with this - it acts as a pseudio-parentheses var lparenRune rune = 0xF0005 // Parentheses in regex are concatenated with this - it acts as a pseudio-parentheses
var RPAREN_CHAR rune = 0xF0005 var rparenRune rune = 0xF0006
var NONCAPLPAREN_CHAR rune = 0xF0006 // Represents a non-capturing group's LPAREN var nonCapLparenRune rune = 0xF0007 // Represents a non-capturing group's LPAREN
var ESC_BACKSLASH rune = 0xF0007 // Represents an escaped backslash var escBackslashRune rune = 0xF0008 // Represents an escaped backslash
var CHAR_RANGE rune = 0xF0008 // Represents a character range var charRangeRune rune = 0xF0009 // Represents a character range
var specialChars = []rune{'?', '*', '\\', '^', '$', '{', '}', '(', ')', '[', ']', '+', '|', '.', CONCAT, '<', '>', LBRACKET, RBRACKET, NONCAPLPAREN_CHAR} var specialChars = []rune{'?', '*', '\\', '^', '$', '{', '}', '(', ')', '[', ']', '+', '|', '.', concatRune, '<', '>', lbracketRune, rbracketRune, nonCapLparenRune}
// An interface for int and rune, which are identical // An interface for int and rune, which are identical
type character interface { type character interface {
@@ -48,33 +48,9 @@ func isNormalChar(c rune) bool {
return !slices.Contains(specialChars, c) return !slices.Contains(specialChars, c)
} }
// Appends to slc only those of the given items that are not already present.
// Returns the grown slice, along with a count of how many elements were
// actually added.
func unique_append[T comparable](slc []T, items ...T) ([]T, int) {
	added := 0
	for _, candidate := range items {
		if slices.Contains(slc, candidate) {
			continue
		}
		slc = append(slc, candidate)
		added++
	}
	return slc, added
}
// Returns true only if every given element equals the first one.
// Panics if called with no elements.
func allEqual[T comparable](items ...T) bool {
	reference := items[0]
	for _, candidate := range items {
		if candidate != reference {
			return false
		}
	}
	return true
}
// Map function - convert a slice of T to a slice of V, based on a function // Map function - convert a slice of T to a slice of V, based on a function
// that maps a T to a V // that maps a T to a V
func Map[T, V any](slc []T, fn func(T) V) []V { func funcMap[T, V any](slc []T, fn func(T) V) []V {
toReturn := make([]V, len(slc)) toReturn := make([]V, len(slc))
for i, val := range slc { for i, val := range slc {
toReturn[i] = fn(val) toReturn[i] = fn(val)
@@ -84,7 +60,7 @@ func Map[T, V any](slc []T, fn func(T) V) []V {
// Reduce function - reduces a slice of a type into a value of the type, // Reduce function - reduces a slice of a type into a value of the type,
// based on the given function. // based on the given function.
func Reduce[T any](slc []T, fn func(T, T) T) T { func funcReduce[T any](slc []T, fn func(T, T) T) T {
if len(slc) == 0 { if len(slc) == 0 {
panic("Reduce on empty slice.") panic("Reduce on empty slice.")
} }

View File

@@ -5,124 +5,162 @@ import (
"slices" "slices"
) )
const EPSILON int = 0xF0000 const epsilon int = 0xF0000
type assertType int type assertType int
const ( const (
NONE assertType = iota noneAssert assertType = iota
SOS sosAssert // Start of string (^)
EOS soiAssert // Start of input (\A)
WBOUND eosAssert // End of string ($)
NONWBOUND eoiAssert // End of input (\Z)
PLA // Positive lookahead wboundAssert
NLA // Negative lookahead nonwboundAssert
PLB // Positive lookbehind plaAssert // Positive lookahead
NLB // Negative lookbehind nlaAssert // Negative lookahead
ALWAYS_TRUE // An assertion that is always true plbAssert // Positive lookbehind
nlbAssert // Negative lookbehind
alwaysTrueAssert // An assertion that is always true
) )
type State struct { type nfaState struct {
content stateContents // Contents of current state content stateContents // Contents of current state
isEmpty bool // If it is empty - Union operator and Kleene star states will be empty isEmpty bool // If it is empty - Union operator and Kleene star states will be empty
isLast bool // If it is the last state (acept state) isLast bool // If it is the last state (acept state)
output []*State // The outputs of the current state ie. the 'outward arrows'. A union operator state will have more than one of these. output []*nfaState // The outputs of the current state ie. the 'outward arrows'. A union operator state will have more than one of these.
transitions map[int][]*State // Transitions to different states (maps a character (int representation) to a _list of states. This is useful if one character can lead multiple states eg. ab|aa) // transitions map[int][]*nfaState // Transitions to different states (maps a character (int representation) to a _list of states. This is useful if one character can lead multiple states eg. ab|aa)
isKleene bool // Identifies whether current node is a 0-state representing Kleene star next *nfaState // The next state (not for alternation or kleene states)
assert assertType // Type of assertion of current node - NONE means that the node doesn't assert anything isKleene bool // Identifies whether current node is a 0-state representing Kleene star
allChars bool // Whether or not the state represents all characters (eg. a 'dot' metacharacter). A 'dot' node doesn't store any contents directly, as it would take up too much space isQuestion bool // Identifies whether current node is a 0-state representing the question operator
except []rune // Only valid if allChars is true - match all characters _except_ the ones in this block. Useful for inverting character classes. isAlternation bool // Identifies whether current node is a 0-state representing an alternation
lookaroundRegex string // Only for lookaround states - Contents of the regex that the lookaround state holds splitState *nfaState // Only for alternation states - the 'other' branch of the alternation ('next' is the first)
lookaroundNFA *State // Holds the NFA of the lookaroundRegex - if it exists assert assertType // Type of assertion of current node - NONE means that the node doesn't assert anything
lookaroundNumCaptureGroups int // Number of capturing groups in lookaround regex if current node is a lookaround allChars bool // Whether or not the state represents all characters (eg. a 'dot' metacharacter). A 'dot' node doesn't store any contents directly, as it would take up too much space
groupBegin bool // Whether or not the node starts a capturing group except []rune // Only valid if allChars is true - match all characters _except_ the ones in this block. Useful for inverting character classes.
groupEnd bool // Whether or not the node ends a capturing group lookaroundRegex string // Only for lookaround states - Contents of the regex that the lookaround state holds
groupNum int // Which capturing group the node starts / ends lookaroundNFA *nfaState // Holds the NFA of the lookaroundRegex - if it exists
lookaroundNumCaptureGroups int // Number of capturing groups in lookaround regex if current node is a lookaround
groupBegin bool // Whether or not the node starts a capturing group
groupEnd bool // Whether or not the node ends a capturing group
groupNum int // Which capturing group the node starts / ends
// The following properties depend on the current match - I should think about resetting them for every match. // The following properties depend on the current match - I should think about resetting them for every match.
zeroMatchFound bool // Whether or not the state has been used for a zero-length match - only relevant for zero states zeroMatchFound bool // Whether or not the state has been used for a zero-length match - only relevant for zero states
threadGroups []Group // Assuming that a state is part of a 'thread' in the matching process, this array stores the indices of capturing groups in the current thread. As matches are found for this state, its groups will be copied over. threadGroups []Group // Assuming that a state is part of a 'thread' in the matching process, this array stores the indices of capturing groups in the current thread. As matches are found for this state, its groups will be copied over.
} }
// Clones the NFA starting from the given state. // Clones the NFA starting from the given state.
func cloneState(start *State) *State { func cloneState(start *nfaState) *nfaState {
return cloneStateHelper(start, make(map[*State]*State)) return cloneStateHelper(start, make(map[*nfaState]*nfaState))
} }
// Helper function for clone. The map is used to keep track of which states have // Helper function for clone. The map is used to keep track of which states have
// already been copied, and which ones haven't. // already been copied, and which ones haven't.
// This function was created using output from Llama3.1:405B. // This function was created using output from Llama3.1:405B.
func cloneStateHelper(state *State, cloneMap map[*State]*State) *State { func cloneStateHelper(stateToClone *nfaState, cloneMap map[*nfaState]*nfaState) *nfaState {
// Base case - if the clone exists in our map, return it. // Base case - if the clone exists in our map, return it.
if clone, exists := cloneMap[state]; exists { if clone, exists := cloneMap[stateToClone]; exists {
return clone return clone
} }
if state == nil { if stateToClone == nil {
return nil return nil
} }
// Recursive case - if the clone doesn't exist, create it, add it to the map, // Recursive case - if the clone doesn't exist, create it, add it to the map,
// and recursively call for each of the transition states. // and recursively call for each of the transition states.
clone := &State{ clone := &nfaState{
content: append([]int{}, state.content...), content: append([]int{}, stateToClone.content...),
isEmpty: state.isEmpty, isEmpty: stateToClone.isEmpty,
isLast: state.isLast, isLast: stateToClone.isLast,
output: make([]*State, len(state.output)), output: make([]*nfaState, len(stateToClone.output)),
transitions: make(map[int][]*State), isKleene: stateToClone.isKleene,
isKleene: state.isKleene, isQuestion: stateToClone.isQuestion,
assert: state.assert, isAlternation: stateToClone.isAlternation,
zeroMatchFound: state.zeroMatchFound, assert: stateToClone.assert,
allChars: state.allChars, zeroMatchFound: stateToClone.zeroMatchFound,
except: append([]rune{}, state.except...), allChars: stateToClone.allChars,
lookaroundRegex: state.lookaroundRegex, except: append([]rune{}, stateToClone.except...),
groupEnd: state.groupEnd, lookaroundRegex: stateToClone.lookaroundRegex,
groupBegin: state.groupBegin, groupEnd: stateToClone.groupEnd,
groupNum: state.groupNum, groupBegin: stateToClone.groupBegin,
groupNum: stateToClone.groupNum,
} }
cloneMap[state] = clone cloneMap[stateToClone] = clone
for i, s := range state.output { for i, s := range stateToClone.output {
if s == state { if s == stateToClone {
clone.output[i] = clone clone.output[i] = clone
} else { } else {
clone.output[i] = cloneStateHelper(s, cloneMap) clone.output[i] = cloneStateHelper(s, cloneMap)
} }
} }
for k, v := range state.transitions { if stateToClone.lookaroundNFA == stateToClone {
clone.transitions[k] = make([]*State, len(v))
for i, s := range v {
if s == state {
clone.transitions[k][i] = clone
} else {
clone.transitions[k][i] = cloneStateHelper(s, cloneMap)
}
}
}
if state.lookaroundNFA == state {
clone.lookaroundNFA = clone clone.lookaroundNFA = clone
} }
clone.lookaroundNFA = cloneStateHelper(state.lookaroundNFA, cloneMap) clone.lookaroundNFA = cloneStateHelper(stateToClone.lookaroundNFA, cloneMap)
if stateToClone.splitState == stateToClone {
clone.splitState = clone
}
clone.splitState = cloneStateHelper(stateToClone.splitState, cloneMap)
if stateToClone.next == stateToClone {
clone.next = clone
}
clone.next = cloneStateHelper(stateToClone.next, cloneMap)
return clone return clone
} }
// resetThreads clears all per-match ("thread") bookkeeping on every state
// reachable from start, so the NFA can be reused for a fresh match.
func resetThreads(start *nfaState) {
	resetThreadsHelper(start, map[*nfaState]bool{})
}
// resetThreadsHelper walks the NFA graph depth-first, clearing the
// threadGroups field on every reachable state. visitedMap guards against
// infinite recursion on cyclic graphs (eg. Kleene-star loops).
func resetThreadsHelper(state *nfaState, visitedMap map[*nfaState]bool) {
	if state == nil {
		return
	}
	if visitedMap[state] {
		return
	}
	visitedMap[state] = true
	state.threadGroups = nil
	// Every state has a 'next' branch; only alternation states also carry
	// a split branch. (Both branches of the old if/else recursed on 'next',
	// so the common call is hoisted out.)
	resetThreadsHelper(state.next, visitedMap)
	if state.isAlternation {
		resetThreadsHelper(state.splitState, visitedMap)
	}
}
// Checks if the given state's assertion is true. Returns true if the given // Checks if the given state's assertion is true. Returns true if the given
// state doesn't have an assertion. // state doesn't have an assertion.
func (s State) checkAssertion(str []rune, idx int) bool { func (s nfaState) checkAssertion(str []rune, idx int, preferLongest bool) bool {
if s.assert == ALWAYS_TRUE { if s.assert == alwaysTrueAssert {
return true return true
} }
if s.assert == SOS { if s.assert == sosAssert {
// Single-line mode: Beginning of string // Single-line mode: Beginning of string
// Multi-line mode: Previous character was newline // Multi-line mode: Previous character was newline
return idx == 0 || (multilineMode && (idx > 0 && str[idx-1] == '\n')) return idx == 0 || (multilineMode && (idx > 0 && str[idx-1] == '\n'))
} }
if s.assert == EOS { if s.assert == eosAssert {
// Single-line mode: End of string // Single-line mode: End of string
// Multi-line mode: current character is newline // Multi-line mode: current character is newline
// Index is at the end of the string, or it points to the last character which is a newline // Index is at the end of the string, or it points to the last character which is a newline
return idx == len(str) || (multilineMode && str[idx] == '\n') return idx == len(str) || (multilineMode && str[idx] == '\n')
} }
if s.assert == WBOUND { if s.assert == soiAssert {
// Only true at the start of the input, regardless of mode
return idx == 0
}
if s.assert == eoiAssert {
// Only true at the end of the input, regardless of mode
return idx == len(str)
}
if s.assert == wboundAssert {
return isWordBoundary(str, idx) return isWordBoundary(str, idx)
} }
if s.assert == NONWBOUND { if s.assert == nonwboundAssert {
return !isWordBoundary(str, idx) return !isWordBoundary(str, idx)
} }
if s.isLookaround() { if s.isLookaround() {
@@ -133,7 +171,7 @@ func (s State) checkAssertion(str []rune, idx int) bool {
startState := s.lookaroundNFA startState := s.lookaroundNFA
var runesToMatch []rune var runesToMatch []rune
var strToMatch string var strToMatch string
if s.assert == PLA || s.assert == NLA { if s.assert == plaAssert || s.assert == nlaAssert {
runesToMatch = str[idx:] runesToMatch = str[idx:]
} else { } else {
runesToMatch = str[:idx] runesToMatch = str[:idx]
@@ -145,25 +183,26 @@ func (s State) checkAssertion(str []rune, idx int) bool {
strToMatch = string(runesToMatch) strToMatch = string(runesToMatch)
} }
matchIndices := FindAllMatches(Reg{startState, s.lookaroundNumCaptureGroups}, strToMatch) regComp := Reg{startState, s.lookaroundNumCaptureGroups, s.lookaroundRegex, preferLongest}
matchIndices := regComp.FindAll(strToMatch)
numMatchesFound := 0 numMatchesFound := 0
for _, matchIdx := range matchIndices { for _, matchIdx := range matchIndices {
if s.assert == PLA || s.assert == NLA { // Lookahead - return true (or false) if at least one match starts at 0. Zero is used because the test-string _starts_ from idx. if s.assert == plaAssert || s.assert == nlaAssert { // Lookahead - return true (or false) if at least one match starts at 0. Zero is used because the test-string _starts_ from idx.
if matchIdx[0].StartIdx == 0 { if matchIdx.StartIdx == 0 {
numMatchesFound++ numMatchesFound++
} }
} }
if s.assert == PLB || s.assert == NLB { // Lookbehind - return true (or false) if at least one match _ends_ at the current index. if s.assert == plbAssert || s.assert == nlbAssert { // Lookbehind - return true (or false) if at least one match _ends_ at the current index.
if matchIdx[0].EndIdx == idx { if matchIdx.EndIdx == idx {
numMatchesFound++ numMatchesFound++
} }
} }
} }
if s.assert == PLA || s.assert == PLB { // Positive assertions want at least one match if s.assert == plaAssert || s.assert == plbAssert { // Positive assertions want at least one match
return numMatchesFound > 0 return numMatchesFound > 0
} }
if s.assert == NLA || s.assert == NLB { // Negative assertions only want zero matches if s.assert == nlaAssert || s.assert == nlbAssert { // Negative assertions only want zero matches
return numMatchesFound == 0 return numMatchesFound == 0
} }
} }
@@ -171,9 +210,12 @@ func (s State) checkAssertion(str []rune, idx int) bool {
} }
// Returns true if the contents of 's' contain the value at the given index of the given string // Returns true if the contents of 's' contain the value at the given index of the given string
func (s State) contentContains(str []rune, idx int) bool { func (s nfaState) contentContains(str []rune, idx int, preferLongest bool) bool {
if s.assert != NONE { if s.assert != noneAssert {
return s.checkAssertion(str, idx) return s.checkAssertion(str, idx, preferLongest)
}
if idx >= len(str) {
return false
} }
if s.allChars { if s.allChars {
return !slices.Contains(slices.Concat(notDotChars, s.except), str[idx]) // Return true only if the index isn't a 'notDotChar', or isn't one of the exception characters for the current node. return !slices.Contains(slices.Concat(notDotChars, s.except), str[idx]) // Return true only if the index isn't a 'notDotChar', or isn't one of the exception characters for the current node.
@@ -182,153 +224,174 @@ func (s State) contentContains(str []rune, idx int) bool {
return slices.Contains(s.content, int(str[idx])) return slices.Contains(s.content, int(str[idx]))
} }
func (s State) isLookaround() bool { func (s nfaState) isLookaround() bool {
return s.assert == PLA || s.assert == PLB || s.assert == NLA || s.assert == NLB return s.assert == plaAssert || s.assert == plbAssert || s.assert == nlaAssert || s.assert == nlbAssert
}
func (s nfaState) numTransitions() int {
if s.next == nil && s.splitState == nil {
return 0
}
if s.next == nil || s.splitState == nil {
return 1
}
return 2
} }
// Returns the matches for the character at the given index of the given string. // Returns the matches for the character at the given index of the given string.
// Also returns the number of matches. Returns -1 if an assertion failed. // Also returns the number of matches. Returns -1 if an assertion failed.
func (s State) matchesFor(str []rune, idx int) ([]*State, int) { //func (s nfaState) matchesFor(str []rune, idx int) ([]*nfaState, int) {
// Assertions can be viewed as 'checks'. If the check fails, we return // // Assertions can be viewed as 'checks'. If the check fails, we return
// an empty array and 0. // // an empty array and 0.
// If it passes, we treat it like any other state, and return all the transitions. // // If it passes, we treat it like any other state, and return all the transitions.
if s.assert != NONE { // if s.assert != noneAssert {
if s.checkAssertion(str, idx) == false { // if s.checkAssertion(str, idx) == false {
return make([]*State, 0), -1 // return make([]*nfaState, 0), -1
} // }
} // }
listTransitions := s.transitions[int(str[idx])] // listTransitions := s.transitions[int(str[idx])]
for _, dest := range s.transitions[int(ANY_CHAR)] { // for _, dest := range s.transitions[int(anyCharRune)] {
if !slices.Contains(slices.Concat(notDotChars, dest.except), str[idx]) { // if !slices.Contains(slices.Concat(notDotChars, dest.except), str[idx]) {
// Add an allChar state to the list of matches if: // // Add an allChar state to the list of matches if:
// a. The current character isn't a 'notDotChars' character. In single line mode, this includes newline. In multiline mode, it doesn't. // // a. The current character isn't a 'notDotChars' character. In single line mode, this includes newline. In multiline mode, it doesn't.
// b. The current character isn't the state's exception list. // // b. The current character isn't the state's exception list.
listTransitions = append(listTransitions, dest) // listTransitions = append(listTransitions, dest)
} // }
} // }
numTransitions := len(listTransitions) // numTransitions := len(listTransitions)
return listTransitions, numTransitions // return listTransitions, numTransitions
} //}
// verifyLastStatesHelper performs the depth-first recursion needed for verifyLastStates // verifyLastStatesHelper performs the depth-first recursion needed for verifyLastStates
func verifyLastStatesHelper(state *State, visited map[*State]bool) { //func verifyLastStatesHelper(st *nfaState, visited map[*nfaState]bool) {
if len(state.transitions) == 0 { // if st.numTransitions() == 0 {
state.isLast = true // st.isLast = true
return // return
} // }
// if len(state.transitions) == 1 && len(state.transitions[state.content]) == 1 && state.transitions[state.content][0] == state { // Eg. a* // // if len(state.transitions) == 1 && len(state.transitions[state.content]) == 1 && state.transitions[state.content][0] == state { // Eg. a*
if len(state.transitions) == 1 { // Eg. a* // if st.numTransitions() == 1 { // Eg. a*
var moreThanOneTrans bool // Dummy variable, check if all the transitions for the current's state's contents have a length of one // var moreThanOneTrans bool // Dummy variable, check if all the transitions for the current's state's contents have a length of one
for _, c := range state.content { // for _, c := range st.content {
if len(state.transitions[c]) != 1 || state.transitions[c][0] != state { // if len(st.transitions[c]) != 1 || st.transitions[c][0] != st {
moreThanOneTrans = true // moreThanOneTrans = true
} // }
} // }
state.isLast = !moreThanOneTrans // st.isLast = !moreThanOneTrans
} // }
//
if state.isKleene { // A State representing a Kleene Star has transitions going out, which loop back to it. If all those transitions point to the same (single) state, then it must be a last state // if st.isKleene { // A State representing a Kleene Star has transitions going out, which loop back to it. If all those transitions point to the same (single) state, then it must be a last state
transitionDests := make([]*State, 0) // transitionDests := make([]*nfaState, 0)
for _, v := range state.transitions { // for _, v := range st.transitions {
transitionDests = append(transitionDests, v...) // transitionDests = append(transitionDests, v...)
} // }
if allEqual(transitionDests...) { // if allEqual(transitionDests...) {
state.isLast = true // st.isLast = true
return // return
} // }
} // }
if visited[state] == true { // if visited[st] == true {
return // return
} // }
visited[state] = true // visited[st] = true
for _, states := range state.transitions { // for _, states := range st.transitions {
for i := range states { // for i := range states {
if states[i] != state { // if states[i] != st {
verifyLastStatesHelper(states[i], visited) // verifyLastStatesHelper(states[i], visited)
} // }
} // }
} // }
} //}
// verifyLastStates enables the 'isLast' flag for the leaf nodes (last states) // verifyLastStates enables the 'isLast' flag for the leaf nodes (last states)
func verifyLastStates(start []*State) { //func verifyLastStates(start []*nfaState) {
verifyLastStatesHelper(start[0], make(map[*State]bool)) // verifyLastStatesHelper(start[0], make(map[*nfaState]bool))
} //}
// Concatenates s1 and s2, returns the start of the concatenation. // Concatenates s1 and s2, returns the start of the concatenation.
func concatenate(s1 *State, s2 *State) *State { func concatenate(s1 *nfaState, s2 *nfaState) *nfaState {
if s1 == nil { if s1 == nil {
return s2 return s2
} }
for i := range s1.output { for i := range s1.output {
for _, c := range s2.content { // Create transitions for every element in s1's content to s2' s1.output[i].next = s2
s1.output[i].transitions[c], _ = unique_append(s1.output[i].transitions[c], s2)
}
} }
s1.output = s2.output s1.output = s2.output
return s1 return s1
} }
func kleene(s1 State) (*State, error) { func kleene(s1 *nfaState) (*nfaState, error) {
if s1.isEmpty && s1.assert != NONE { if s1.isEmpty && s1.assert != noneAssert {
return nil, fmt.Errorf("previous token is not quantifiable") return nil, fmt.Errorf("previous token is not quantifiable")
} }
toReturn := &State{} toReturn := &nfaState{}
toReturn.transitions = make(map[int][]*State)
toReturn.content = newContents(EPSILON)
toReturn.isEmpty = true toReturn.isEmpty = true
toReturn.isAlternation = true
toReturn.content = newContents(epsilon)
toReturn.splitState = s1
// toReturn := &nfaState{}
// toReturn.transitions = make(map[int][]*nfaState)
// toReturn.content = newContents(epsilon)
toReturn.isKleene = true toReturn.isKleene = true
toReturn.output = append(toReturn.output, toReturn) toReturn.output = append([]*nfaState{}, toReturn)
for i := range s1.output { for i := range s1.output {
for _, c := range toReturn.content { s1.output[i].next = toReturn
s1.output[i].transitions[c], _ = unique_append(s1.output[i].transitions[c], toReturn)
}
}
for _, c := range s1.content {
toReturn.transitions[c], _ = unique_append(toReturn.transitions[c], &s1)
} }
// for _, c := range s1.content {
// toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], &s1)
// }
//toReturn.kleeneState = &s1
return toReturn, nil return toReturn, nil
} }
func alternate(s1 *State, s2 *State) *State { func alternate(s1 *nfaState, s2 *nfaState) *nfaState {
toReturn := &State{} toReturn := &nfaState{}
toReturn.transitions = make(map[int][]*State) // toReturn.transitions = make(map[int][]*nfaState)
toReturn.output = append(toReturn.output, s1.output...) toReturn.output = append(toReturn.output, s1.output...)
toReturn.output = append(toReturn.output, s2.output...) toReturn.output = append(toReturn.output, s2.output...)
// Unique append is used here (and elsewhere) to ensure that, // // Unique append is used here (and elsewhere) to ensure that,
// for any given transition, a state can only be mentioned once. // // for any given transition, a state can only be mentioned once.
// For example, given the transition 'a', the state 's1' can only be mentioned once. // // For example, given the transition 'a', the state 's1' can only be mentioned once.
// This would lead to multiple instances of the same set of match indices, since both // // This would lead to multiple instances of the same set of match indices, since both
// 's1' states would be considered to match. // // 's1' states would be considered to match.
for _, c := range s1.content { // for _, c := range s1.content {
toReturn.transitions[c], _ = unique_append(toReturn.transitions[c], s1) // toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], s1)
} // }
for _, c := range s2.content { // for _, c := range s2.content {
toReturn.transitions[c], _ = unique_append(toReturn.transitions[c], s2) // toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], s2)
} // }
toReturn.content = newContents(EPSILON) toReturn.content = newContents(epsilon)
toReturn.isEmpty = true toReturn.isEmpty = true
toReturn.isAlternation = true
toReturn.next = s1
toReturn.splitState = s2
return toReturn return toReturn
} }
func question(s1 *State) *State { // Use the fact that ab? == a(b|) func question(s1 *nfaState) (*nfaState, error) { // Use the fact that ab? == a(b|)
s2 := &State{} if s1.isEmpty && s1.assert != noneAssert {
s2.transitions = make(map[int][]*State) return nil, fmt.Errorf("previous token is not quantifiable")
s2.content = newContents(EPSILON) }
s2.output = append(s2.output, s2) toReturn := &nfaState{}
s2.isEmpty = true toReturn.isEmpty = true
s3 := alternate(s1, s2) toReturn.isAlternation = true
return s3 toReturn.isQuestion = true
toReturn.content = newContents(epsilon)
toReturn.splitState = s1
toReturn.output = append([]*nfaState{}, toReturn)
toReturn.output = append(toReturn.output, s1.output...)
// s2.transitions = make(map[int][]*nfaState)
return toReturn, nil
} }
// Creates and returns a new state with the 'default' values. // Creates and returns a new state with the 'default' values.
func newState() State { func newState() nfaState {
ret := State{ ret := nfaState{
output: make([]*State, 0), output: make([]*nfaState, 0),
transitions: make(map[int][]*State), // transitions: make(map[int][]*nfaState),
assert: NONE, assert: noneAssert,
except: append([]rune{}, 0), except: append([]rune{}, 0),
lookaroundRegex: "", lookaroundRegex: "",
groupEnd: false, groupEnd: false,
@@ -339,10 +402,40 @@ func newState() State {
} }
// Creates and returns a state that _always_ has a zero-length match. // Creates and returns a state that _always_ has a zero-length match.
func zeroLengthMatchState() State { func zeroLengthMatchState() *nfaState {
start := newState() start := &nfaState{}
start.content = newContents(EPSILON) start.content = newContents(epsilon)
start.isEmpty = true start.isEmpty = true
start.assert = ALWAYS_TRUE start.assert = alwaysTrueAssert
start.output = append([]*nfaState{}, start)
return start return start
} }
// equals reports whether s and other are field-for-field identical,
// comparing pointer fields (next, splitState, lookaroundNFA) by identity
// and slice fields element-wise. Per-match fields like zeroMatchFound and
// the lookaround bookkeeping strings are intentionally not compared,
// matching the original comparison set.
func (s nfaState) equals(other nfaState) bool {
	if s.isEmpty != other.isEmpty || s.isLast != other.isLast {
		return false
	}
	if s.isKleene != other.isKleene || s.isQuestion != other.isQuestion || s.isAlternation != other.isAlternation {
		return false
	}
	if s.next != other.next || s.splitState != other.splitState {
		return false
	}
	if s.assert != other.assert || s.allChars != other.allChars {
		return false
	}
	if s.lookaroundNFA != other.lookaroundNFA {
		return false
	}
	if s.groupBegin != other.groupBegin || s.groupEnd != other.groupEnd || s.groupNum != other.groupNum {
		return false
	}
	return slices.Equal(s.output, other.output) &&
		slices.Equal(s.content, other.content) &&
		slices.Equal(s.except, other.except) &&
		slices.Equal(s.threadGroups, other.threadGroups)
}
// stateExists reports whether 's' is deep-equal (via nfaState.equals) to
// any element of 'list'. Uses slices.ContainsFunc instead of a hand-rolled
// index loop; the file already relies on the slices package.
func stateExists(list []nfaState, s nfaState) bool {
	return slices.ContainsFunc(list, func(elem nfaState) bool {
		return elem.equals(s)
	})
}

View File

@@ -2,7 +2,7 @@ package regex
import "fmt" import "fmt"
type NodeType int type nodeType int
// This is a slice containing all escapable characters that have special meaning. // This is a slice containing all escapable characters that have special meaning.
// Eg. \b is word boundary, \w is word character etc. // Eg. \b is word boundary, \w is word character etc.
@@ -10,28 +10,28 @@ var escapedChars []rune = []rune("wWdDbBnaftrvsS0")
// This is a list of the possible node types // This is a list of the possible node types
const ( const (
CHARACTER NodeType = iota characterNode nodeType = iota
CHARCLASS charclassNode
PIPE pipeNode
CONCATENATE concatenateNode
KLEENE kleeneNode
QUESTION questionNode
PLUS plusNode
ASSERTION assertionNode
LPAREN lparenNode
RPAREN rparenNode
) )
// Helper constants for lookarounds // Helper constants for lookarounds
const POSITIVE = 1 const positive = 1
const NEGATIVE = -1 const negative = -1
const LOOKAHEAD = 1 const lookahead = 1
const LOOKBEHIND = -1 const lookbehind = -1
var INFINITE_REPS int = -1 // Represents infinite reps eg. the end range in {5,} var infinite_reps int = -1 // Represents infinite reps eg. the end range in {5,}
// This represents a node in the postfix representation of the expression // This represents a node in the postfix representation of the expression
type postfixNode struct { type postfixNode struct {
nodetype NodeType nodetype nodeType
contents []rune // Contents of the node contents []rune // Contents of the node
startReps int // Minimum number of times the node should be repeated - used with numeric specifiers startReps int // Minimum number of times the node should be repeated - used with numeric specifiers
endReps int // Maximum number of times the node should be repeated - used with numeric specifiers endReps int // Maximum number of times the node should be repeated - used with numeric specifiers
@@ -49,12 +49,12 @@ type postfixNode struct {
// it will not match. // it will not match.
func newCharClassNode(nodes []postfixNode, negated bool) postfixNode { func newCharClassNode(nodes []postfixNode, negated bool) postfixNode {
rtv := postfixNode{} rtv := postfixNode{}
rtv.nodetype = CHARCLASS rtv.nodetype = charclassNode
rtv.startReps = 1 rtv.startReps = 1
rtv.endReps = 1 rtv.endReps = 1
if negated { if negated {
rtv.nodetype = CHARACTER rtv.nodetype = characterNode
rtv.contents = []rune{ANY_CHAR} rtv.contents = []rune{anyCharRune}
rtv.allChars = true rtv.allChars = true
rtv.except = nodes rtv.except = nodes
} else { } else {
@@ -70,55 +70,65 @@ func newEscapedNode(c rune, inCharClass bool) (postfixNode, error) {
toReturn.endReps = 1 toReturn.endReps = 1
switch c { switch c {
case 's': // Whitespace case 's': // Whitespace
toReturn.nodetype = CHARACTER toReturn.nodetype = characterNode
toReturn.contents = append(toReturn.contents, whitespaceChars...) toReturn.contents = append(toReturn.contents, whitespaceChars...)
case 'S': // Non-whitespace case 'S': // Non-whitespace
toReturn = newPostfixDotNode() toReturn = newPostfixDotNode()
toReturn.except = append([]postfixNode{}, newPostfixNode(whitespaceChars...)) toReturn.except = append([]postfixNode{}, newPostfixNode(whitespaceChars...))
case 'd': // Digits case 'd': // Digits
toReturn.nodetype = CHARACTER toReturn.nodetype = characterNode
toReturn.contents = append(toReturn.contents, digitChars...) toReturn.contents = append(toReturn.contents, digitChars...)
case 'D': // Non-digits case 'D': // Non-digits
toReturn = newPostfixDotNode() toReturn = newPostfixDotNode()
toReturn.except = append([]postfixNode{}, newPostfixNode(digitChars...)) toReturn.except = append([]postfixNode{}, newPostfixNode(digitChars...))
case 'w': // word character case 'w': // word character
toReturn.nodetype = CHARACTER toReturn.nodetype = characterNode
toReturn.contents = append(toReturn.contents, wordChars...) toReturn.contents = append(toReturn.contents, wordChars...)
case 'W': // Non-word character case 'W': // Non-word character
toReturn = newPostfixDotNode() toReturn = newPostfixDotNode()
toReturn.except = append([]postfixNode{}, newPostfixNode(wordChars...)) toReturn.except = append([]postfixNode{}, newPostfixNode(wordChars...))
case 'b', 'B': case 'b', 'B':
if c == 'b' && inCharClass { if c == 'b' && inCharClass {
toReturn.nodetype = CHARACTER toReturn.nodetype = characterNode
toReturn.contents = append(toReturn.contents, rune(8)) toReturn.contents = append(toReturn.contents, rune(8))
} else { } else {
toReturn.nodetype = ASSERTION toReturn.nodetype = assertionNode
toReturn.contents = append(toReturn.contents, c)
}
if c == 'B' && inCharClass { // Invalid
return postfixNode{}, fmt.Errorf("word boundaries are not allowed in character class")
}
case 'A', 'z': // A is start of input, z is end of input (regardless of RE_MULTILINE)
if inCharClass {
return postfixNode{}, fmt.Errorf("input boundaries are not allowed in character class")
} else {
toReturn.nodetype = assertionNode
toReturn.contents = append(toReturn.contents, c) toReturn.contents = append(toReturn.contents, c)
} }
case 'n': // Newline character case 'n': // Newline character
toReturn.nodetype = CHARACTER toReturn.nodetype = characterNode
toReturn.contents = append(toReturn.contents, '\n') toReturn.contents = append(toReturn.contents, '\n')
case '0': // NULL character case '0': // NULL character
toReturn.nodetype = CHARACTER toReturn.nodetype = characterNode
toReturn.contents = append(toReturn.contents, rune(0)) toReturn.contents = append(toReturn.contents, rune(0))
case 'a': // Bell character case 'a': // Bell character
toReturn.nodetype = CHARACTER toReturn.nodetype = characterNode
toReturn.contents = append(toReturn.contents, rune(7)) toReturn.contents = append(toReturn.contents, rune(7))
case 'f': // Form feed character case 'f': // Form feed character
toReturn.nodetype = CHARACTER toReturn.nodetype = characterNode
toReturn.contents = append(toReturn.contents, rune(12)) toReturn.contents = append(toReturn.contents, rune(12))
case 't': // Horizontal tab character case 't': // Horizontal tab character
toReturn.nodetype = CHARACTER toReturn.nodetype = characterNode
toReturn.contents = append(toReturn.contents, rune(9)) toReturn.contents = append(toReturn.contents, rune(9))
case 'r': // Carriage return case 'r': // Carriage return
toReturn.nodetype = CHARACTER toReturn.nodetype = characterNode
toReturn.contents = append(toReturn.contents, rune(13)) toReturn.contents = append(toReturn.contents, rune(13))
case 'v': // Vertical tab case 'v': // Vertical tab
toReturn.nodetype = CHARACTER toReturn.nodetype = characterNode
toReturn.contents = append(toReturn.contents, rune(11)) toReturn.contents = append(toReturn.contents, rune(11))
case '-': // Literal hyphen - only in character class case '-': // Literal hyphen - only in character class
if inCharClass { if inCharClass {
toReturn.nodetype = CHARACTER toReturn.nodetype = characterNode
toReturn.contents = append(toReturn.contents, '-') toReturn.contents = append(toReturn.contents, '-')
} else { } else {
return postfixNode{}, fmt.Errorf("invalid escape character") return postfixNode{}, fmt.Errorf("invalid escape character")
@@ -127,7 +137,7 @@ func newEscapedNode(c rune, inCharClass bool) (postfixNode, error) {
if isNormalChar(c) { // Normal characters cannot be escaped if isNormalChar(c) { // Normal characters cannot be escaped
return postfixNode{}, fmt.Errorf("invalid escape character") return postfixNode{}, fmt.Errorf("invalid escape character")
} }
toReturn.nodetype = CHARACTER toReturn.nodetype = characterNode
toReturn.contents = append(toReturn.contents, c) toReturn.contents = append(toReturn.contents, c)
} }
return toReturn, nil return toReturn, nil
@@ -142,37 +152,37 @@ func newPostfixNode(contents ...rune) postfixNode {
to_return.startReps = 1 to_return.startReps = 1
to_return.endReps = 1 to_return.endReps = 1
if len(contents) > 1 { // If the node has more than element, it must be a character class - the type must be CHARACTER if len(contents) > 1 { // If the node has more than element, it must be a character class - the type must be CHARACTER
to_return.nodetype = CHARACTER to_return.nodetype = characterNode
to_return.contents = contents to_return.contents = contents
} else { // Node has one element, could be anything } else { // Node has one element, could be anything
switch contents[0] { switch contents[0] {
case '+': case '+':
to_return.nodetype = PLUS to_return.nodetype = plusNode
case '?': case '?':
to_return.nodetype = QUESTION to_return.nodetype = questionNode
case '*': case '*':
to_return.nodetype = KLEENE to_return.nodetype = kleeneNode
case '|': case '|':
to_return.nodetype = PIPE to_return.nodetype = pipeNode
case CONCAT: case concatRune:
to_return.nodetype = CONCATENATE to_return.nodetype = concatenateNode
case '^', '$': case '^', '$':
to_return.nodetype = ASSERTION to_return.nodetype = assertionNode
case '(': case '(':
to_return.nodetype = LPAREN to_return.nodetype = lparenNode
case ')': case ')':
to_return.nodetype = RPAREN to_return.nodetype = rparenNode
default: default:
to_return.nodetype = CHARACTER to_return.nodetype = characterNode
} }
to_return.contents = append(to_return.contents, contents...) to_return.contents = append(to_return.contents, contents...)
// Special cases for LPAREN and RPAREN - they have special characters defined for them // Special cases for LPAREN and RPAREN - they have special characters defined for them
if to_return.nodetype == LPAREN { if to_return.nodetype == lparenNode {
to_return.contents = []rune{LPAREN_CHAR} to_return.contents = []rune{lparenRune}
} }
if to_return.nodetype == RPAREN { if to_return.nodetype == rparenNode {
to_return.contents = []rune{RPAREN_CHAR} to_return.contents = []rune{rparenRune}
} }
} }
return to_return return to_return
@@ -183,9 +193,9 @@ func newPostfixDotNode() postfixNode {
toReturn := postfixNode{} toReturn := postfixNode{}
toReturn.startReps = 1 toReturn.startReps = 1
toReturn.endReps = 1 toReturn.endReps = 1
toReturn.nodetype = CHARACTER toReturn.nodetype = characterNode
toReturn.allChars = true toReturn.allChars = true
toReturn.contents = []rune{ANY_CHAR} toReturn.contents = []rune{anyCharRune}
return toReturn return toReturn
} }
@@ -194,7 +204,7 @@ func newPostfixCharNode(contents ...rune) postfixNode {
toReturn := postfixNode{} toReturn := postfixNode{}
toReturn.startReps = 1 toReturn.startReps = 1
toReturn.endReps = 1 toReturn.endReps = 1
toReturn.nodetype = CHARACTER toReturn.nodetype = characterNode
toReturn.contents = append(toReturn.contents, contents...) toReturn.contents = append(toReturn.contents, contents...)
return toReturn return toReturn
} }

View File

@@ -3,7 +3,9 @@ package regex
import ( import (
"fmt" "fmt"
"math" "math"
"slices"
"strconv" "strconv"
"strings"
) )
type numRange struct { type numRange struct {
@@ -99,28 +101,39 @@ func range2regex(start int, end int) (string, error) {
// Last range - tmp to rangeEnd // Last range - tmp to rangeEnd
ranges = append(ranges, numRange{tmp, rangeEnd}) ranges = append(ranges, numRange{tmp, rangeEnd})
regex := string(NONCAPLPAREN_CHAR) regexSlice := make([]string, 0)
// Generate the regex // Generate the regex
for i, rg := range ranges { for _, rg := range ranges {
if i > 0 { tmpStr := ""
regex += "|" tmpStr += string(nonCapLparenRune)
}
regex += string(NONCAPLPAREN_CHAR)
startSlc := intToSlc(rg.start) startSlc := intToSlc(rg.start)
endSlc := intToSlc(rg.end) endSlc := intToSlc(rg.end)
if len(startSlc) != len(endSlc) { if len(startSlc) != len(endSlc) {
return "", fmt.Errorf("Error parsing numeric range") return "", fmt.Errorf("error parsing numeric range")
} }
for i := range startSlc { for i := range startSlc {
if startSlc[i] == endSlc[i] { if startSlc[i] == endSlc[i] {
regex += string(rune(startSlc[i] + 48)) // '0' is ascii value 48, 1 is 49 etc. To convert the digit to its character form, we can just add 48. tmpStr += string(rune(startSlc[i] + 48)) // '0' is ascii value 48, 1 is 49 etc. To convert the digit to its character form, we can just add 48.
} else { } else {
regex += fmt.Sprintf("%c%c-%c%c", LBRACKET, rune(startSlc[i]+48), rune(endSlc[i]+48), RBRACKET) tmpStr += fmt.Sprintf("%c%c-%c%c", lbracketRune, rune(startSlc[i]+48), rune(endSlc[i]+48), rbracketRune)
} }
} }
regex += ")" tmpStr += ")"
regexSlice = append(regexSlice, tmpStr)
} }
regex += ")" // Each element of the slice represents one 'group'. Taking 0-255 as an example, the elements would be:
// 1. 0-9
// 2. 10-99
// 3. 100-199
// 4. 200-249
// 5. 250-255
//
// The reason this is reversed before joining it, is because it is incompatible with the PCRE rule for matching.
// The PCRE rule specifies that the left-branch of an alternation is preferred. Even though this engine uses the POSIX
// rule at the moment (which prefers the longest match regardless of the order of the alternation), reversing the string
// has no downsides. It doesn't affect POSIX matching, and it will reduce my burden if I decide to switch to PCRE matching.
slices.Reverse(regexSlice)
regex := string(nonCapLparenRune) + strings.Join(regexSlice, "|") + ")"
return regex, nil return regex, nil
} }

View File

@@ -25,7 +25,9 @@ var reTests = []struct {
{"a*b", nil, "qwqw", []Group{}}, {"a*b", nil, "qwqw", []Group{}},
{"(abc)*", nil, "abcabcabc", []Group{{0, 9}, {9, 9}}}, {"(abc)*", nil, "abcabcabc", []Group{{0, 9}, {9, 9}}},
{"((abc)|(def))*", nil, "abcdef", []Group{{0, 6}, {6, 6}}}, {"((abc)|(def))*", nil, "abcdef", []Group{{0, 6}, {6, 6}}},
{"(abc)*|(def)*", nil, "abcdef", []Group{{0, 3}, {3, 6}, {6, 6}}}, // This match will only happen with Longest()
// {"(abc)*|(def)*", nil, "abcdef", []Group{{0, 3}, {3, 6}, {6, 6}}},
{"(abc)*|(def)*", nil, "abcdef", []Group{{0, 3}, {3, 3}, {4, 4}, {5, 5}, {6, 6}}},
{"b*a*a", nil, "bba", []Group{{0, 3}}}, {"b*a*a", nil, "bba", []Group{{0, 3}}},
{"(ab)+", nil, "abcabddd", []Group{{0, 2}, {3, 5}}}, {"(ab)+", nil, "abcabddd", []Group{{0, 2}, {3, 5}}},
{"a(b(c|d)*)*", nil, "abccbd", []Group{{0, 6}}}, {"a(b(c|d)*)*", nil, "abccbd", []Group{{0, 6}}},
@@ -105,6 +107,9 @@ var reTests = []struct {
{"(a|b){3,4}", nil, "ababaa", []Group{{0, 4}}}, {"(a|b){3,4}", nil, "ababaa", []Group{{0, 4}}},
{"(bc){5,}", nil, "bcbcbcbcbcbcbcbc", []Group{{0, 16}}}, {"(bc){5,}", nil, "bcbcbcbcbcbcbcbc", []Group{{0, 16}}},
{`\d{3,4}`, nil, "1209", []Group{{0, 4}}}, {`\d{3,4}`, nil, "1209", []Group{{0, 4}}},
{`\d{3,4}`, nil, "120", []Group{{0, 3}}},
{`\d{3,4}`, nil, "12709", []Group{{0, 4}}},
{`\d{3,4}`, nil, "12", []Group{}},
{`\d{3,4}`, nil, "109", []Group{{0, 3}}}, {`\d{3,4}`, nil, "109", []Group{{0, 3}}},
{`\d{3,4}`, nil, "5", []Group{}}, {`\d{3,4}`, nil, "5", []Group{}},
{`\d{3,4}`, nil, "123135", []Group{{0, 4}}}, {`\d{3,4}`, nil, "123135", []Group{{0, 4}}},
@@ -443,8 +448,11 @@ var reTests = []struct {
{`abc$`, []ReFlag{RE_MULTILINE}, "jkl\n123abc\nxyz", []Group{{7, 10}}}, {`abc$`, []ReFlag{RE_MULTILINE}, "jkl\n123abc\nxyz", []Group{{7, 10}}},
{`^`, nil, "jkl\n123abc\nxyz", []Group{{0, 0}}}, {`^`, nil, "jkl\n123abc\nxyz", []Group{{0, 0}}},
{`^`, []ReFlag{RE_MULTILINE}, "jkl\n123abc\nxyz", []Group{{0, 0}, {4, 4}, {11, 11}}}, {`^`, []ReFlag{RE_MULTILINE}, "jkl\n123abc\nxyz", []Group{{0, 0}, {4, 4}, {11, 11}}},
{`\A`, []ReFlag{RE_MULTILINE}, "jkl\n123abc\nxyz", []Group{{0, 0}}},
{`$`, nil, "jkl\n123abc\nxyz", []Group{{14, 14}}}, {`$`, nil, "jkl\n123abc\nxyz", []Group{{14, 14}}},
{`$`, []ReFlag{RE_MULTILINE}, "jkl\n123abc\nxyz", []Group{{3, 3}, {10, 10}, {14, 14}}}, {`$`, []ReFlag{RE_MULTILINE}, "jkl\n123abc\nxyz", []Group{{3, 3}, {10, 10}, {14, 14}}},
{`\z`, []ReFlag{RE_MULTILINE}, "jkl\n123abc\nxyz", []Group{{14, 14}}},
{`^abc\z`, []ReFlag{RE_MULTILINE}, "abc\nabc\nabc", []Group{{8, 11}}},
{`a.b`, nil, "a\nb", []Group{}}, {`a.b`, nil, "a\nb", []Group{}},
{`a.b`, []ReFlag{RE_SINGLE_LINE}, "a\nb", []Group{{0, 3}}}, {`a.b`, []ReFlag{RE_SINGLE_LINE}, "a\nb", []Group{{0, 3}}},
@@ -522,7 +530,7 @@ var groupTests = []struct {
}{ }{
{"(a)(b)", nil, "ab", []Match{[]Group{{0, 2}, {0, 1}, {1, 2}}}}, {"(a)(b)", nil, "ab", []Match{[]Group{{0, 2}, {0, 1}, {1, 2}}}},
{"((a))(b)", nil, "ab", []Match{[]Group{{0, 2}, {0, 1}, {0, 1}, {1, 2}}}}, {"((a))(b)", nil, "ab", []Match{[]Group{{0, 2}, {0, 1}, {0, 1}, {1, 2}}}},
{"(0)", nil, "ab", []Match{[]Group{}}}, {"(0)", nil, "ab", []Match{}},
{"(a)b", nil, "ab", []Match{[]Group{{0, 2}, {0, 1}}}}, {"(a)b", nil, "ab", []Match{[]Group{{0, 2}, {0, 1}}}},
{"a(b)", nil, "ab", []Match{[]Group{{0, 2}, {1, 2}}}}, {"a(b)", nil, "ab", []Match{[]Group{{0, 2}, {1, 2}}}},
{"(a|b)", nil, "ab", []Match{[]Group{{0, 1}, {0, 1}}, []Group{{1, 2}, {1, 2}}}}, {"(a|b)", nil, "ab", []Match{[]Group{{0, 1}, {0, 1}}, []Group{{1, 2}, {1, 2}}}},
@@ -531,10 +539,11 @@ var groupTests = []struct {
{"(a+)|(a)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 4}, {-1, -1}}}}, {"(a+)|(a)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 4}, {-1, -1}}}},
{"(a+)(aa)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 2}, {2, 4}}}}, {"(a+)(aa)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 2}, {2, 4}}}},
{"(aaaa)|(aaaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 4}, {-1, -1}}}}, {"(aaaa)|(aaaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 4}, {-1, -1}}}},
{"(aaa)|(aaaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {-1, -1}, {0, 4}}}}, // This match will only happen with Longest()
{"(aaa)|(aaaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {-1, -1}, {0, 4}}}}, // {"(aaa)|(aaaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {-1, -1}, {0, 4}}}},
{"(aaa)|(aaaa)", nil, "aaaa", []Match{[]Group{{0, 3}, {0, 3}, {-1, -1}}}},
{"(aaaa)|(aaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 4}, {-1, -1}}}}, {"(aaaa)|(aaa)", nil, "aaaa", []Match{[]Group{{0, 4}, {0, 4}, {-1, -1}}}},
{"(a)|(aa)", nil, "aa", []Match{[]Group{{0, 2}, {-1, -1}, {0, 2}}}}, {"(a)|(aa)", nil, "aa", []Match{[]Group{{0, 1}, {0, 1}}, []Group{{1, 2}, {1, 2}}}},
{"(a?)a?", nil, "b", []Match{[]Group{{0, 0}, {0, 0}}, []Group{{1, 1}, {1, 1}}}}, {"(a?)a?", nil, "b", []Match{[]Group{{0, 0}, {0, 0}}, []Group{{1, 1}, {1, 1}}}},
{"(a?)a?", nil, "ab", []Match{[]Group{{0, 1}, {0, 1}}, []Group{{1, 1}, {1, 1}}, []Group{{2, 2}, {2, 2}}}}, {"(a?)a?", nil, "ab", []Match{[]Group{{0, 1}, {0, 1}}, []Group{{1, 1}, {1, 1}}, []Group{{2, 2}, {2, 2}}}},
{"(a?)a?", nil, "aa", []Match{[]Group{{0, 2}, {0, 1}}, []Group{{2, 2}, {2, 2}}}}, {"(a?)a?", nil, "aa", []Match{[]Group{{0, 2}, {0, 1}}, []Group{{2, 2}, {2, 2}}}},
@@ -572,7 +581,7 @@ var groupTests = []struct {
{`(bc+d$|ef*g.|h?i(j|k))`, nil, `bcdd`, []Match{}}, {`(bc+d$|ef*g.|h?i(j|k))`, nil, `bcdd`, []Match{}},
{`(bc+d$|ef*g.|h?i(j|k))`, nil, `reffgz`, []Match{[]Group{{1, 6}, {1, 6}}}}, {`(bc+d$|ef*g.|h?i(j|k))`, nil, `reffgz`, []Match{[]Group{{1, 6}, {1, 6}}}},
{`(((((((((a)))))))))`, nil, `a`, []Match{[]Group{{0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}}, {`(((((((((a)))))))))`, nil, `a`, []Match{[]Group{{0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}},
{`(((((((((a)))))))))\41`, nil, `a`, []Match{[]Group{{0, 2}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}}, {`(((((((((a)))))))))\41`, nil, `a!`, []Match{[]Group{{0, 2}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}},
{`(.*)c(.*)`, nil, `abcde`, []Match{[]Group{{0, 5}, {0, 2}, {3, 5}}}}, {`(.*)c(.*)`, nil, `abcde`, []Match{[]Group{{0, 5}, {0, 2}, {3, 5}}}},
{`\((.*), (.*)\)`, nil, `(a, b)`, []Match{[]Group{{0, 6}, {1, 2}, {4, 5}}}}, {`\((.*), (.*)\)`, nil, `(a, b)`, []Match{[]Group{{0, 6}, {1, 2}, {4, 5}}}},
@@ -627,7 +636,7 @@ var groupTests = []struct {
{`(bc+d$|ef*g.|h?i(j|k))`, []ReFlag{RE_CASE_INSENSITIVE}, `BCDD`, []Match{}}, {`(bc+d$|ef*g.|h?i(j|k))`, []ReFlag{RE_CASE_INSENSITIVE}, `BCDD`, []Match{}},
{`(bc+d$|ef*g.|h?i(j|k))`, []ReFlag{RE_CASE_INSENSITIVE}, `reffgz`, []Match{[]Group{{1, 6}, {1, 6}}}}, {`(bc+d$|ef*g.|h?i(j|k))`, []ReFlag{RE_CASE_INSENSITIVE}, `reffgz`, []Match{[]Group{{1, 6}, {1, 6}}}},
{`(((((((((a)))))))))`, []ReFlag{RE_CASE_INSENSITIVE}, `A`, []Match{[]Group{{0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}}, {`(((((((((a)))))))))`, []ReFlag{RE_CASE_INSENSITIVE}, `A`, []Match{[]Group{{0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}},
{`(((((((((a)))))))))\41`, []ReFlag{RE_CASE_INSENSITIVE}, `A`, []Match{[]Group{{0, 2}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}}, {`(((((((((a)))))))))\41`, []ReFlag{RE_CASE_INSENSITIVE}, `A!`, []Match{[]Group{{0, 2}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}},
{`(.*)c(.*)`, []ReFlag{RE_CASE_INSENSITIVE}, `ABCDE`, []Match{[]Group{{0, 5}, {0, 2}, {3, 5}}}}, {`(.*)c(.*)`, []ReFlag{RE_CASE_INSENSITIVE}, `ABCDE`, []Match{[]Group{{0, 5}, {0, 2}, {3, 5}}}},
{`\((.*), (.*)\)`, []ReFlag{RE_CASE_INSENSITIVE}, `(A, B)`, []Match{[]Group{{0, 6}, {1, 2}, {4, 5}}}}, {`\((.*), (.*)\)`, []ReFlag{RE_CASE_INSENSITIVE}, `(A, B)`, []Match{[]Group{{0, 6}, {1, 2}, {4, 5}}}},
{`(a)(b)c|ab`, []ReFlag{RE_CASE_INSENSITIVE}, `AB`, []Match{[]Group{{0, 2}}}}, {`(a)(b)c|ab`, []ReFlag{RE_CASE_INSENSITIVE}, `AB`, []Match{[]Group{{0, 2}}}},
@@ -668,9 +677,20 @@ var groupTests = []struct {
{`^([ab]*)(?<!(a))c`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 2}}}}, {`^([ab]*)(?<!(a))c`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 2}}}},
{`(<389-400>)`, nil, `391`, []Match{[]Group{{0, 3}, {0, 3}}}}, {`(<389-400>)`, nil, `391`, []Match{[]Group{{0, 3}, {0, 3}}}},
// // Tests from https://wiki.haskell.org/Regex_Posix
// {`(()|.)(b)`, nil, `ab`, []Match{[]Group{{0, 2}, {0, 1}, {-1, -1}, {1, 2}}}},
// {`(()|[ab])(b)`, nil, `ab`, []Match{[]Group{{0, 2}, {0, 1}, {-1, -1}, {1, 2}}}},
// {`(()|[ab])+b`, nil, `aaab`, []Match{[]Group{{0, 4}, {2, 3}, {-1, -1}}}},
// {`([ab]|())+b`, nil, `aaab`, []Match{[]Group{{0, 4}, {2, 3}, {-1, -1}}}},
// // Bug - this should give {0,6},{3,6},{-1,-1} but it gives {0,6},{3,6},{3,3}
// // {`yyyyyy`, nil, `(yyy|(x?)){2,4}`, []Match{[]Group{{0, 6}, {3, 6}, {-1, -1}}, []Group{{6, 6}, {6, 6}, {6, 6}}}},
// {`(a|ab|c|bcd)*(d*)`, nil, `ababcd`, []Match{[]Group{{0, 6}, {3, 6}, {6, 6}}, []Group{{6, 6}, {6, 6}, {6, 6}}}},
// // Bug - this should give {0,3},{0,3},{0,0},{0,3},{3,3} but it gives {0,3},{0,2},{0,1},{1,2},{2,3}
// // {`((a*)(b|abc))(c*)`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 3}, {0, 0}, {0, 3}, {3, 3}}}},
} }
func TestFindAllMatches(t *testing.T) { func TestFind(t *testing.T) {
for _, test := range reTests { for _, test := range reTests {
t.Run(test.re+" "+test.str, func(t *testing.T) { t.Run(test.re+" "+test.str, func(t *testing.T) {
regComp, err := Compile(test.re, test.flags...) regComp, err := Compile(test.re, test.flags...)
@@ -679,13 +699,35 @@ func TestFindAllMatches(t *testing.T) {
panic(fmt.Errorf("Test Error: %v", err)) panic(fmt.Errorf("Test Error: %v", err))
} }
} else { } else {
matchIndices := FindAllMatches(regComp, test.str) groupIndex, err := regComp.Find(test.str)
zeroGroups := make([]Group, len(matchIndices)) if err != nil { // No matches found
for i, m := range matchIndices { if len(test.result) == 0 {
zeroGroups[i] = m[0] return // Manually pass the test, because this is the expected behavior
} else {
t.Errorf("Wanted %v Got no matches\n", test.result)
}
} else {
if groupIndex != test.result[0] {
t.Errorf("Wanted %v Got %v\n", test.result, groupIndex)
}
} }
if !slices.Equal(test.result, zeroGroups) { }
t.Errorf("Wanted %v Got %v\n", test.result, zeroGroups) })
}
}
func TestFindAll(t *testing.T) {
for _, test := range reTests {
t.Run(test.re+" "+test.str, func(t *testing.T) {
regComp, err := Compile(test.re, test.flags...)
if err != nil {
if test.result != nil {
panic(fmt.Errorf("Test Error: %v", err))
}
} else {
matchIndices := regComp.FindAll(test.str)
if !slices.Equal(test.result, matchIndices) {
t.Errorf("Wanted %v Got %v\n", test.result, matchIndices)
} }
} }
}) })
@@ -701,10 +743,10 @@ func TestFindString(t *testing.T) {
panic(err) panic(err)
} }
} else { } else {
foundString := FindString(regComp, test.str) foundString := regComp.FindString(test.str)
if len(test.result) == 0 { if len(test.result) == 0 {
if foundString != "" { if foundString != "" {
t.Errorf("Expected no match got %v\n", foundString) t.Errorf("Wanted no match got %v\n", foundString)
} }
} else { } else {
expectedString := test.str[test.result[0].StartIdx:test.result[0].EndIdx] expectedString := test.str[test.result[0].StartIdx:test.result[0].EndIdx]
@@ -717,7 +759,32 @@ func TestFindString(t *testing.T) {
} }
} }
func TestFindAllGroups(t *testing.T) { func TestFindAllString(t *testing.T) {
for _, test := range reTests {
t.Run(test.re+" "+test.str, func(t *testing.T) {
regComp, err := Compile(test.re, test.flags...)
if err != nil {
if test.result != nil {
panic(err)
}
} else {
foundStrings := regComp.FindAllString(test.str)
if len(test.result) != len(foundStrings) {
t.Errorf("Differing number of matches: Wanted %v matches Got %v matches\n", len(test.result), len(foundStrings))
} else {
for idx, group := range test.result {
groupStr := test.str[group.StartIdx:group.EndIdx]
if groupStr != foundStrings[idx] {
t.Errorf("Wanted %v Got %v\n", groupStr, foundStrings[idx])
}
}
}
}
})
}
}
func TestFindSubmatch(t *testing.T) {
for _, test := range groupTests { for _, test := range groupTests {
t.Run(test.re+" "+test.str, func(t *testing.T) { t.Run(test.re+" "+test.str, func(t *testing.T) {
regComp, err := Compile(test.re, test.flags...) regComp, err := Compile(test.re, test.flags...)
@@ -726,13 +793,94 @@ func TestFindAllGroups(t *testing.T) {
panic(err) panic(err)
} }
} }
matchIndices := FindAllMatches(regComp, test.str) match, err := regComp.FindSubmatch(test.str)
if err != nil {
if len(test.result) != 0 {
t.Errorf("Wanted %v got no match\n", test.result[0])
}
} else if len(test.result) == 0 {
t.Errorf("Wanted no match got %v\n", match)
}
for i := range match {
if match[i].IsValid() {
if test.result[0][i] != match[i] {
t.Errorf("Wanted %v Got %v\n", test.result[0], match)
}
} else {
if i < len(test.result) && test.result[0][i].IsValid() {
t.Errorf("Wanted %v Got %v\n", test.result[0], match)
}
}
}
})
}
}
func TestFindStringSubmatch(t *testing.T) {
for _, test := range groupTests {
t.Run(test.re+" "+test.str, func(t *testing.T) {
regComp, err := Compile(test.re, test.flags...)
if err != nil {
if test.result != nil {
panic(err)
}
}
matchStr := regComp.FindStringSubmatch(test.str)
if matchStr == nil {
if len(test.result) != 0 {
expectedStr := funcMap(test.result[0], func(g Group) string {
if g.IsValid() {
return test.str[g.StartIdx:g.EndIdx]
} else {
return ""
}
})
t.Errorf("Wanted %v got no match\n", expectedStr)
}
} else if len(test.result) == 0 {
t.Errorf("Wanted no match got %v\n", matchStr)
} else {
expectedStr := funcMap(test.result[0], func(g Group) string {
if g.IsValid() {
return test.str[g.StartIdx:g.EndIdx]
} else {
return ""
}
})
for i, groupStr := range matchStr {
if groupStr == "" {
if i < len(expectedStr) && expectedStr[i] != "" {
t.Errorf("Wanted %v Got %v\n", expectedStr, matchStr)
}
} else {
if expectedStr[i] != groupStr {
t.Errorf("Wanted %v Got %v\n", expectedStr, matchStr)
}
}
}
}
})
}
}
func TestFindAllSubmatch(t *testing.T) {
for _, test := range groupTests {
t.Run(test.re+" "+test.str, func(t *testing.T) {
regComp, err := Compile(test.re, test.flags...)
if err != nil {
if test.result != nil {
panic(err)
}
}
matchIndices := regComp.FindAllSubmatch(test.str)
for i := range matchIndices { for i := range matchIndices {
for j := range matchIndices[i] { for j := range matchIndices[i] {
if matchIndices[i][j].isValid() { if matchIndices[i][j].IsValid() {
if test.result[i][j] != matchIndices[i][j] { if test.result[i][j] != matchIndices[i][j] {
t.Errorf("Wanted %v Got %v\n", test.result, matchIndices) t.Errorf("Wanted %v Got %v\n", test.result, matchIndices)
} }
} else {
if i < len(test.result) && j < len(test.result[i]) && test.result[i][j].IsValid() {
t.Errorf("Wanted %v Got %v\n", test.result, matchIndices)
}
} }
} }
} }