37 Commits

Author SHA1 Message Date
ef476e8875 Reverse the order of the numeric range before adding it, to maintain compatibility with PCRE matching rules (even though I don't implement them, if I do in the future) 2025-02-02 13:46:48 -05:00
7e6b02632f Added more tests; commented out tests that I am failing 2025-02-02 13:46:08 -05:00
f94e3f2e71 Added comments 2025-02-02 12:44:06 -05:00
b129d83c3f Added function to reset threads 2025-02-02 12:43:40 -05:00
43aa7b5876 Updated documentation 2025-02-02 12:42:38 -05:00
9a3bfca313 Renamed unique_append to uniqueAppend 2025-02-02 12:42:29 -05:00
b6ab54f6dd Reset threads when findAllSubmatchHelper is called 2025-02-02 12:42:00 -05:00
6a96c98d04 Fixed bug where the regex '(()|.)(b)' wouldn't compile 2025-02-01 19:20:33 -05:00
3cfc2a6854 Updated Makefile 2025-02-01 18:52:26 -05:00
5d7a02e796 Added gcflags to go build 2025-02-01 18:51:58 -05:00
a46d2f4546 Updated comments 2025-02-01 18:07:31 -05:00
c88ebd1aa5 Added comments explaining what a Match is 2025-02-01 18:05:55 -05:00
fd102292c6 Added example for FindSubmatch 2025-02-01 18:05:43 -05:00
6d692d0dfc Rename Group.toString() to Group.String() 2025-02-01 12:51:32 -05:00
7c4538a259 Added 'example' file that contains testable examples 2025-02-01 12:50:49 -05:00
2a9ae0b68a Wrote test for 'FindSubmatch' 2025-02-01 11:09:05 -05:00
783ae2ad10 Updated call to 'isValid' with call to 'IsValid' 2025-02-01 11:06:26 -05:00
b5e6bc112c Wrote 'reg.FindSubmatch()' which returns the leftmost match with submatches, renamed 'isValid' to 'IsValid' to export it, renamed 'ToString' to 'String' 2025-02-01 11:06:03 -05:00
206fea34cd Added function to return the number of subexpressions in the group 2025-02-01 11:04:49 -05:00
fcdb23524a Added more documentation 2025-02-01 11:04:24 -05:00
ac936659b6 Updated documentation 2025-01-31 16:52:26 -05:00
e6dba9fdcf Updated documentation 2025-01-31 16:51:46 -05:00
30779a446b Updated documentation 2025-01-31 16:46:19 -05:00
f629a0f08f Added 'mustCompile' which panicks if there is an error compiling 2025-01-31 16:46:05 -05:00
6869cd00a2 Return error instead of nil when 'Find' fails 2025-01-31 10:52:38 -05:00
02bc8f30a2 Added test for 'Find' 2025-01-31 10:52:27 -05:00
ac05bceda3 Use method instead of function 2025-01-31 10:13:02 -05:00
037ac75ea6 Wrote new method to return 0-group of leftmost match; reorganized some functions for better clarity; made 'FindNthMatch' a method 2025-01-31 10:12:53 -05:00
e9d4e857cf Run 'TestFindAllStrings' since that function has been implemented 2025-01-31 10:11:52 -05:00
b685d2fd5f Renamed 'findAllMatchesHelper' to 'findAllSubmatchHelper' 2025-01-31 09:56:30 -05:00
8eda5055ff Replaced call to 'FindAllMatches' with call to 'FindAll' or 'FindAllSubmatch' depending on whether I need submatches 2025-01-31 09:55:36 -05:00
45b6566b2c Replaced function call with method call 2025-01-31 09:54:35 -05:00
e22822e619 Renamed 'FindAllMatches' to 'FindAll' and made it a method; made it return a slice of 0-groups; the functionality of 'FindAllMatches' is now in 'FindAllSubmatch' 2025-01-31 09:54:09 -05:00
692de2a32b Added lookarounds and numeric ranges to documentation 2025-01-31 09:26:21 -05:00
0d19664044 Cleared up some comments, wrote a skeleton for FindAllString 2025-01-30 22:57:35 -05:00
1bfb09b6c7 Made 'FindString' a method of 'Reg' 2025-01-30 22:51:31 -05:00
b0b8bf23af Updated documentation 2025-01-30 22:51:16 -05:00
10 changed files with 403 additions and 75 deletions

View File

@@ -8,6 +8,6 @@ vet: fmt
buildLib: vet
go build -gcflags="-N -l" ./...
buildCmd: buildLib
go build -C cmd/ -o re ./...
go build -C cmd/ -gcflags="-N -l" -o re ./...
test: buildCmd
go test -v ./...

View File

@@ -121,12 +121,12 @@ func main() {
}
matchIndices := make([]reg.Match, 0)
if matchNumFlagEnabled {
tmp, err := reg.FindNthMatch(regComp, test_str, *matchNum)
tmp, err := regComp.FindNthMatch(test_str, *matchNum)
if err == nil {
matchIndices = append(matchIndices, tmp)
}
} else {
matchIndices = reg.FindAllMatches(regComp, test_str)
matchIndices = regComp.FindAllSubmatch(test_str)
}
if *printMatchesFlag {
@@ -137,7 +137,7 @@ func main() {
fmt.Fprintf(out, "Line %d:\n", lineNum)
}
for _, m := range matchIndices {
fmt.Fprintf(out, "%s\n", m.ToString())
fmt.Fprintf(out, "%s\n", m.String())
}
err := out.Flush()
if err != nil {

View File

@@ -18,6 +18,12 @@ type Reg struct {
numGroups int
}
// NumSubexp returns the number of sub-expressions in the given [Reg]. This is equivalent
// to the number of capturing groups.
func (regex Reg) NumSubexp() int {
	return regex.numGroups
}
const concatRune rune = 0xF0001
// Flags for shuntingYard - control its behavior
@@ -943,7 +949,9 @@ func thompson(re []postfixNode) (Reg, error) {
// and added back in.
// If the middle node doesn't exist (ie. something like '()' ), that's fine, I just connect the LPAREN
// and RPAREN nodes.
// If neither node exists, that's a problem so I return an error.
// If the middle node exists but is itself the start of a group, then that _must_ be the opening paren for
// the closing paren that I'm on. I put the third node back (because it isn't involved in the capturing group), then
// I concatenate those two and add them. If neither node exists, that's a problem so I return an error.
if c.nodetype == rparenNode {
s.groupEnd = true
middleNode, err1 := pop(&nfa)
@@ -958,6 +966,11 @@ func thompson(re []postfixNode) (Reg, error) {
s.groupNum = lparenNode.groupNum
to_add := concatenate(lparenNode, s)
nfa = append(nfa, to_add)
} else if middleNode.groupBegin && len(middleNode.transitions) == 0 { // The middle node is a lone lparen - something like '(())', and I'm looking at the first rparen
nfa = append(nfa, lparenNode) // I shouldn't have popped this out, because it is not involved in the current capturing group
s.groupNum = middleNode.groupNum // In this case, the 'middle' node is actually an lparen
to_add := concatenate(middleNode, s)
nfa = append(nfa, to_add)
} else {
// At this point, we assume all three nodes are valid ('lparenNode', 'middleNode' and 's')
if lparenNode.groupBegin {
@@ -1110,10 +1123,11 @@ func thompson(re []postfixNode) (Reg, error) {
}
// Compiles the given regular expression into a Reg type, suitable for use with the
// matching functions. The second return value is non-nil if a compilation error has
// occured. As such, the error value must be checked before using the Reg returned by this function.
// The second parameter is an optional list of flags, passed to the parsing function shuntingYard.
// Compile compiles the given regular expression into a [Reg].
//
// An error value != nil indicates that the regex was invalid; the error message should provide
// detailed information on the nature of the error.
// The second parameter is a sequence of zero or more [ReFlag] values, that modify the behavior of the regex.
func Compile(re string, flags ...ReFlag) (Reg, error) {
nodes, err := shuntingYard(re, flags...)
if err != nil {
@@ -1125,3 +1139,12 @@ func Compile(re string, flags ...ReFlag) (Reg, error) {
}
return reg, nil
}
// MustCompile behaves exactly like Compile, except that it panics with the
// compilation error instead of returning it.
func MustCompile(re string, flags ...ReFlag) Reg {
	compiled, compileErr := Compile(re, flags...)
	if compileErr == nil {
		return compiled
	}
	panic(compileErr)
}

View File

@@ -84,9 +84,77 @@ Assertions:
\b Match at a word boundary (a word character followed by a non-word character, or vice-versa)
\B Match at a non-word boundary (a word character followed by a word character, or vice-versa)
# Flags
Lookarounds:
Flags are used to change the behavior of the engine. None of them are enabled by default. They are passed as an [ReFlag] slice to [Compile].
The list of flags, and their purpose, is provided in the type definition.
x(?=y) Positive lookahead - Match x if followed by y
x(?!y) Negative lookahead - Match x if NOT followed by y
(?<=x)y Positive lookbehind - Match y if preceded by x
(?<!x)y Negative lookbehind - Match y if NOT preceded by x
Numeric ranges:
<x-y> Match any number from x to y (inclusive) (x and y must be positive numbers)
# Key Differences with regexp
The engine and the API differ from [regexp] in a number of ways, some of them very subtle.
The key differences are mentioned below.
1. Greediness:
This engine does not support non-greedy operators. All operators are always greedy in nature, and will try
to match as much as they can, while still allowing for a successful match. For example, given the regex:
y*y
The engine will match as many 'y's as it can, while still allowing the trailing 'y' to be matched.
Another, more subtle example is the following regex:
x|xx
While the stdlib implementation (and most other engines) will prefer matching the first item of the alternation,
this engine will go for the longest possible match, regardless of the order of the alternation. Although this
strays from the convention, it results in a nice rule-of-thumb - the engine is ALWAYS greedy.
The stdlib implementation has a function [regexp.Regexp.Longest] which makes future searches prefer the longest match.
That is the default (and unchangeable) behavior in this engine.
2. Byte-slices and runes:
My engine does not support byte-slices. When a matching function receives a string, it converts it into a
rune-slice to iterate through it. While this has some space overhead, the convenience of built-in unicode
support made the tradeoff worth it.
3. Return values
Rather than using primitives for return values, my engine defines two types that are used as return
values: a [Group] represents a capturing group, and a [Match] represents a list of groups.
[regexp] specifies a regular expression that gives a list of all the matching functions that it supports. The
equivalent expression for this engine is:
Find(All)?(String)?(Submatch)?
[Reg.Find] returns the index of the leftmost match in the string.
If a function contains 'All' it returns all matches instead of just the leftmost one.
If a function contains 'String' it returns the matched text, rather than the indices.
If a function contains 'Submatch' it returns the match, including all submatches found by
capturing groups.
The term '0-group' is used to refer to the 0th capturing group of a match (which is the entire match).
Given the following regex:
x(y)
and the input string:
xyz
The 0th group would contain 'xy' and the 1st group would contain 'y'. Any matching function without 'Submatch' in its name
returns the 0-group.
*/
package regex

54
regex/example_test.go Normal file
View File

@@ -0,0 +1,54 @@
package regex_test
import (
"fmt"
"gitea.twomorecents.org/Rockingcool/kleingrep/regex"
)
func ExampleReg_Find() {
regexStr := "b|a"
regexComp := regex.MustCompile(regexStr)
match, _ := regexComp.Find("banana")
fmt.Println(match.String())
// Output: 0 1
}
func ExampleReg_FindAll() {
regexStr := "b|a"
regexComp := regex.MustCompile(regexStr)
matches := regexComp.FindAll("banana")
for _, group := range matches {
fmt.Println(group.String())
}
// Output: 0 1
// 1 2
// 3 4
// 5 6
}
func ExampleReg_FindString() {
regexStr := `\d+`
regexComp := regex.MustCompile(regexStr)
matchStr := regexComp.FindString("The year of our lord, 2025")
fmt.Println(matchStr)
// Output: 2025
}
func ExampleReg_FindSubmatch() {
regexStr := `(\d)\.(\d)(\d)`
regexComp := regex.MustCompile(regexStr)
match, _ := regexComp.FindSubmatch("3.14")
fmt.Println(match[0])
fmt.Println(match[1])
fmt.Println(match[2])
// Output: 0 4
// 0 1
// 2 3
}

View File

@@ -5,7 +5,13 @@ import (
"sort"
)
// a Match stores a slice of all the capturing groups in a match.
// A Match represents a match found by the regex in a given string.
// It is represented as a list of groups, where the nth element contains
// the contents of the nth capturing group. Note that the group may not be valid
// (see [Group.IsValid]). The element at index 0 is known
// as the 0-group, and represents the contents of the entire match.
//
// See [Reg.FindSubmatch] for an example.
type Match []Group
// a Group represents a group. It contains the start index and end index of the match
@@ -35,28 +41,34 @@ func (m Match) numValidGroups() int {
}
// Returns a string containing the indices of all (valid) groups in the match
func (m Match) ToString() string {
func (m Match) String() string {
var toRet string
for i, g := range m {
if g.isValid() {
if g.IsValid() {
toRet += fmt.Sprintf("Group %d\n", i)
toRet += g.toString()
toRet += g.String()
toRet += "\n"
}
}
return toRet
}
// Converts the Group into a string representation:
func (idx Group) toString() string {
// String converts the Group into a string representation.
func (idx Group) String() string {
return fmt.Sprintf("%d\t%d", idx.StartIdx, idx.EndIdx)
}
// Returns whether a group contains valid indices
func (g Group) isValid() bool {
// Returns whether a group is valid (ie. whether it matched any text). It
// simply ensures that both indices of the group are >= 0.
func (g Group) IsValid() bool {
return g.StartIdx >= 0 && g.EndIdx >= 0
}
// Simple function, makes it easier to map this over a list of matches
func getZeroGroup(m Match) Group {
return m[0]
}
// takeZeroState takes the 0-state (if such a transition exists) for all states in the
// given slice. It returns the resulting states. If any of the resulting states is a 0-state,
// the second ret val is true.
@@ -101,7 +113,7 @@ func zeroMatchPossible(str []rune, idx int, numGroups int, states ...*nfaState)
num_appended := 0 // number of unique states addded to tempstates
for isZero == true {
zeroStates, isZero = takeZeroState(tempstates, numGroups, idx)
tempstates, num_appended = unique_append(tempstates, zeroStates...)
tempstates, num_appended = uniqueAppend(tempstates, zeroStates...)
if num_appended == 0 { // break if we haven't appended any more unique values
break
}
@@ -138,36 +150,72 @@ func pruneIndices(indices []Match) []Match {
return toRet
}
// FindString returns a _string_ containing the _text_ of the _leftmost_ match of
// the regex, in the given string. The return value will be an empty string in two situations:
// Find looks for the leftmost match of the regex in str and returns its 0-group
// (the span of the entire match). A non-nil error means that no match was found.
func (regex Reg) Find(str string) (Group, error) {
	leftmost, findErr := regex.FindNthMatch(str, 1)
	if findErr == nil {
		// Index 0 of a Match is its 0-group.
		return leftmost[0], nil
	}
	return Group{}, fmt.Errorf("no matches found")
}
// FindAll collects the 0-group of every match of the regex in the given string.
// The 0-group is the match as a whole, with the submatches left out.
func (regex Reg) FindAll(str string) []Group {
	// Find every match with its submatches, then keep only the 0-group of each.
	return funcMap(regex.FindAllSubmatch(str), getZeroGroup)
}
// FindString returns the text of the leftmost match of the regex in the given string.
// The return value will be an empty string in two situations:
// 1. No match was found
// 2. The match was an empty string
func FindString(regex Reg, str string) string {
match, err := FindNthMatch(regex, str, 1)
func (regex Reg) FindString(str string) string {
match, err := regex.FindNthMatch(str, 1)
if err != nil {
return ""
}
return str[match[0].StartIdx:match[0].EndIdx]
zeroGroup := getZeroGroup(match)
return str[zeroGroup.StartIdx:zeroGroup.EndIdx]
}
// FindSubmatch returns the leftmost match of the regex in the given string, including
// the submatches matched by capturing groups. The returned [Match] will always contain the same
// number of groups. The validity of a group (whether or not it matched anything) can be determined with
// [Group.IsValid], or by checking that both indices of the group are >= 0.
//
// The returned error is non-nil if no match was found; in that case the returned
// [Match] is empty and must not be used.
func (regex Reg) FindSubmatch(str string) (Match, error) {
	match, err := regex.FindNthMatch(str, 1)
	if err != nil {
		return Match{}, fmt.Errorf("no match found")
	}
	return match, nil
}
// FindAllString is the 'all' version of FindString.
// It returns a _slice of strings_ containing the _text_ of _all_ matches of
// the regex, in the given string.
//func FindAllString(regex Reg, str []string) []string {
//
//}
// It returns a slice of strings containing the text of all matches of
// the regex in the given string.
//
// The indices stored in each group are offsets into the rune-slice form of str
// (the matching functions operate on []rune(str)), not byte offsets. The text is
// therefore extracted from the runes, so that the result is also correct when str
// contains multi-byte characters.
func (regex Reg) FindAllString(str string) []string {
	zerogroups := regex.FindAll(str)
	strRunes := []rune(str)
	matchStrs := funcMap(zerogroups, func(g Group) string {
		return string(strRunes[g.StartIdx:g.EndIdx])
	})
	return matchStrs
}
// FindNthMatch finds the 'n'th match of the regex represented by the given start-state, with
// the given string.
// FindNthMatch returns the 'n'th match of the regex in the given string.
// It returns an error (!= nil) if there are fewer than 'n' matches in the string.
func FindNthMatch(regex Reg, str string, n int) (Match, error) {
func (regex Reg) FindNthMatch(str string, n int) (Match, error) {
idx := 0
matchNum := 0
str_runes := []rune(str)
var matchFound bool
var matchIdx Match
for idx <= len(str_runes) {
matchFound, matchIdx, idx = findAllMatchesHelper(regex.start, str_runes, idx, regex.numGroups)
matchFound, matchIdx, idx = findAllSubmatchHelper(regex.start, str_runes, idx, regex.numGroups)
if matchFound {
matchNum++
}
@@ -179,16 +227,15 @@ func FindNthMatch(regex Reg, str string, n int) (Match, error) {
return nil, fmt.Errorf("invalid match index - too few matches found")
}
// FindAllMatches tries to find all matches of the regex represented by given start-state, with
// the given string
func FindAllMatches(regex Reg, str string) []Match {
// FindAllSubmatch returns a slice of matches in the given string.
func (regex Reg) FindAllSubmatch(str string) []Match {
idx := 0
str_runes := []rune(str)
var matchFound bool
var matchIdx Match
indices := make([]Match, 0)
for idx <= len(str_runes) {
matchFound, matchIdx, idx = findAllMatchesHelper(regex.start, str_runes, idx, regex.numGroups)
matchFound, matchIdx, idx = findAllSubmatchHelper(regex.start, str_runes, idx, regex.numGroups)
if matchFound {
indices = append(indices, matchIdx)
}
@@ -196,6 +243,7 @@ func FindAllMatches(regex Reg, str string) []Match {
if len(indices) > 0 {
return pruneIndices(indices)
}
return indices
}
@@ -204,12 +252,13 @@ func FindAllMatches(regex Reg, str string) []Match {
// the next search should start from.
//
// Might return duplicates or overlapping indices, so care must be taken to prune the resulting array.
func findAllMatchesHelper(start *nfaState, str []rune, offset int, numGroups int) (bool, Match, int) {
func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups int) (bool, Match, int) {
// Base case - exit if offset exceeds string's length
if offset > len(str) {
// The second value here shouldn't be used, because we should exit when the third return value is > than len(str)
return false, []Group{}, offset
}
resetThreads(start)
// Hold a list of match indices for the current run. When we
// can no longer find a match, the match with the largest range is
@@ -265,13 +314,13 @@ func findAllMatchesHelper(start *nfaState, str []rune, offset int, numGroups int
num_appended := 0
for isZero == true {
zeroStates, isZero = takeZeroState(tempStates, numGroups, i)
tempStates, num_appended = unique_append(tempStates, zeroStates...)
tempStates, num_appended = uniqueAppend(tempStates, zeroStates...)
if num_appended == 0 { // Break if we haven't appended any more unique values
break
}
}
currentStates, _ = unique_append(currentStates, tempStates...)
currentStates, _ = uniqueAppend(currentStates, tempStates...)
tempStates = nil
// Take any transitions corresponding to current character
@@ -345,7 +394,7 @@ func findAllMatchesHelper(start *nfaState, str []rune, offset int, numGroups int
// Check if we can find a zero-length match
if foundPath == false {
if ok := zeroMatchPossible(str, i, numGroups, currentStates...); ok {
if tempIndices[0].isValid() == false {
if tempIndices[0].IsValid() == false {
tempIndices[0] = Group{startIdx, startIdx}
}
}
@@ -355,7 +404,7 @@ func findAllMatchesHelper(start *nfaState, str []rune, offset int, numGroups int
startIdx++
// i++
// }
if tempIndices.numValidGroups() > 0 && tempIndices[0].isValid() {
if tempIndices.numValidGroups() > 0 && tempIndices[0].IsValid() {
if tempIndices[0].StartIdx == tempIndices[0].EndIdx { // If we have a zero-length match, we have to shift the index at which we start. Otherwise we keep looking at the same paert of the string over and over.
return true, tempIndices, tempIndices[0].EndIdx + 1
} else {
@@ -378,7 +427,7 @@ func findAllMatchesHelper(start *nfaState, str []rune, offset int, numGroups int
num_appended := 0 // Number of unique states addded to tempStates
for isZero == true {
zeroStates, isZero = takeZeroState(tempStates, numGroups, i)
tempStates, num_appended = unique_append(tempStates, zeroStates...)
tempStates, num_appended = uniqueAppend(tempStates, zeroStates...)
if num_appended == 0 { // Break if we haven't appended any more unique values
break
}

View File

@@ -50,7 +50,7 @@ func isNormalChar(c rune) bool {
// Ensure that the given elements are only appended to the given slice if they
// don't already exist. Returns the new slice, and the number of unique items appended.
func unique_append[T comparable](slc []T, items ...T) ([]T, int) {
func uniqueAppend[T comparable](slc []T, items ...T) ([]T, int) {
num_appended := 0
for _, item := range items {
if !slices.Contains(slc, item) {
@@ -61,6 +61,25 @@ func unique_append[T comparable](slc []T, items ...T) ([]T, int) {
return slc, num_appended
}
// uniqueAppendFunc is the predicate-based variant of uniqueAppend, for element types
// that cannot be compared with ==. An item is appended only if fn(item, existing)
// is false for every element already in the result - including items appended
// earlier in the same call, so duplicates within items are added only once.
//
// Unlike uniqueAppend, the given slice is never modified: the result is a fresh slice.
// It returns the resulting slice and the number of items that were appended.
func uniqueAppendFunc[T any](slc []T, fn func(T, T) bool, items ...T) ([]T, int) {
	toRet := make([]T, len(slc), len(slc)+len(items))
	copy(toRet, slc)
	numAppended := 0
	for _, item := range items {
		itemExists := false
		// Compare against toRet (not slc), so that items appended in previous
		// iterations are taken into account as well.
		for _, val := range toRet {
			if fn(item, val) {
				itemExists = true
				break
			}
		}
		if !itemExists {
			toRet = append(toRet, item)
			numAppended++
		}
	}
	return toRet, numAppended
}
// Returns true only if all the given elements are equal
func allEqual[T comparable](items ...T) bool {
first := items[0]

View File

@@ -104,6 +104,26 @@ func cloneStateHelper(stateToClone *nfaState, cloneMap map[*nfaState]*nfaState)
return clone
}
// resetThreads clears the thread-related fields of the NFA whose start state is given.
func resetThreads(start *nfaState) {
	// Records the states that have already been handled, so that each state is
	// processed only once. Only the presence of a key is significant.
	seen := map[*nfaState]bool{}
	resetThreadsHelper(start, seen)
}
// resetThreadsHelper sets threadGroups to nil on the given state and on every state
// reachable from it through its transitions. States already present in visitedMap
// are skipped; every state that gets reset is recorded in visitedMap.
func resetThreadsHelper(state *nfaState, visitedMap map[*nfaState]bool) {
	// Explicit worklist of states that still have to be looked at.
	pending := []*nfaState{state}
	for len(pending) > 0 {
		current := pending[len(pending)-1]
		pending = pending[:len(pending)-1]
		if _, done := visitedMap[current]; done {
			continue
		}
		current.threadGroups = nil
		visitedMap[current] = true
		for _, targets := range current.transitions {
			for _, next := range targets {
				pending = append(pending, next)
			}
		}
	}
}
// Checks if the given state's assertion is true. Returns true if the given
// state doesn't have an assertion.
func (s nfaState) checkAssertion(str []rune, idx int) bool {
@@ -156,17 +176,18 @@ func (s nfaState) checkAssertion(str []rune, idx int) bool {
strToMatch = string(runesToMatch)
}
matchIndices := FindAllMatches(Reg{startState, s.lookaroundNumCaptureGroups}, strToMatch)
regComp := Reg{startState, s.lookaroundNumCaptureGroups}
matchIndices := regComp.FindAll(strToMatch)
numMatchesFound := 0
for _, matchIdx := range matchIndices {
if s.assert == plaAssert || s.assert == nlaAssert { // Lookahead - return true (or false) if at least one match starts at 0. Zero is used because the test-string _starts_ from idx.
if matchIdx[0].StartIdx == 0 {
if matchIdx.StartIdx == 0 {
numMatchesFound++
}
}
if s.assert == plbAssert || s.assert == nlbAssert { // Lookbehind - return true (or false) if at least one match _ends_ at the current index.
if matchIdx[0].EndIdx == idx {
if matchIdx.EndIdx == idx {
numMatchesFound++
}
}
@@ -273,7 +294,7 @@ func concatenate(s1 *nfaState, s2 *nfaState) *nfaState {
}
for i := range s1.output {
for _, c := range s2.content { // Create transitions for every element in s1's content to s2'
s1.output[i].transitions[c], _ = unique_append(s1.output[i].transitions[c], s2)
s1.output[i].transitions[c], _ = uniqueAppend(s1.output[i].transitions[c], s2)
}
}
s1.output = s2.output
@@ -293,11 +314,11 @@ func kleene(s1 nfaState) (*nfaState, error) {
toReturn.output = append(toReturn.output, toReturn)
for i := range s1.output {
for _, c := range toReturn.content {
s1.output[i].transitions[c], _ = unique_append(s1.output[i].transitions[c], toReturn)
s1.output[i].transitions[c], _ = uniqueAppend(s1.output[i].transitions[c], toReturn)
}
}
for _, c := range s1.content {
toReturn.transitions[c], _ = unique_append(toReturn.transitions[c], &s1)
toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], &s1)
}
return toReturn, nil
}
@@ -313,10 +334,10 @@ func alternate(s1 *nfaState, s2 *nfaState) *nfaState {
// This would lead to multiple instances of the same set of match indices, since both
// 's1' states would be considered to match.
for _, c := range s1.content {
toReturn.transitions[c], _ = unique_append(toReturn.transitions[c], s1)
toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], s1)
}
for _, c := range s2.content {
toReturn.transitions[c], _ = unique_append(toReturn.transitions[c], s2)
toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], s2)
}
toReturn.content = newContents(epsilon)
toReturn.isEmpty = true

View File

@@ -3,7 +3,9 @@ package regex
import (
"fmt"
"math"
"slices"
"strconv"
"strings"
)
type numRange struct {
@@ -99,13 +101,11 @@ func range2regex(start int, end int) (string, error) {
// Last range - tmp to rangeEnd
ranges = append(ranges, numRange{tmp, rangeEnd})
regex := string(nonCapLparenRune)
regexSlice := make([]string, 0)
// Generate the regex
for i, rg := range ranges {
if i > 0 {
regex += "|"
}
regex += string(nonCapLparenRune)
for _, rg := range ranges {
tmpStr := ""
tmpStr += string(nonCapLparenRune)
startSlc := intToSlc(rg.start)
endSlc := intToSlc(rg.end)
if len(startSlc) != len(endSlc) {
@@ -113,14 +113,27 @@ func range2regex(start int, end int) (string, error) {
}
for i := range startSlc {
if startSlc[i] == endSlc[i] {
regex += string(rune(startSlc[i] + 48)) // '0' is ascii value 48, 1 is 49 etc. To convert the digit to its character form, we can just add 48.
tmpStr += string(rune(startSlc[i] + 48)) // '0' is ascii value 48, 1 is 49 etc. To convert the digit to its character form, we can just add 48.
} else {
regex += fmt.Sprintf("%c%c-%c%c", lbracketRune, rune(startSlc[i]+48), rune(endSlc[i]+48), rbracketRune)
tmpStr += fmt.Sprintf("%c%c-%c%c", lbracketRune, rune(startSlc[i]+48), rune(endSlc[i]+48), rbracketRune)
}
}
regex += ")"
tmpStr += ")"
regexSlice = append(regexSlice, tmpStr)
}
regex += ")"
// Each element of the slice represents one 'group'. Taking 0-255 as an example, the elements would be:
// 1. 0-9
// 2. 10-99
// 3. 100-199
// 4. 200-249
// 5. 250-255
//
// The slice is reversed before joining because the ascending order shown above is incompatible with the PCRE rule for matching.
// The PCRE rule specifies that the left-branch of an alternation is preferred. Even though this engine uses the POSIX
// rule at the moment (which prefers the longest match regardless of the order of the alternation), reversing the string
// has no downsides. It doesn't affect POSIX matching, and it will reduce my burden if I decide to switch to PCRE matching.
slices.Reverse(regexSlice)
regex := string(nonCapLparenRune) + strings.Join(regexSlice, "|") + ")"
return regex, nil
}

View File

@@ -105,6 +105,9 @@ var reTests = []struct {
{"(a|b){3,4}", nil, "ababaa", []Group{{0, 4}}},
{"(bc){5,}", nil, "bcbcbcbcbcbcbcbc", []Group{{0, 16}}},
{`\d{3,4}`, nil, "1209", []Group{{0, 4}}},
{`\d{3,4}`, nil, "120", []Group{{0, 3}}},
{`\d{3,4}`, nil, "12709", []Group{{0, 4}}},
{`\d{3,4}`, nil, "12", []Group{}},
{`\d{3,4}`, nil, "109", []Group{{0, 3}}},
{`\d{3,4}`, nil, "5", []Group{}},
{`\d{3,4}`, nil, "123135", []Group{{0, 4}}},
@@ -671,9 +674,20 @@ var groupTests = []struct {
{`^([ab]*)(?<!(a))c`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 2}}}},
{`(<389-400>)`, nil, `391`, []Match{[]Group{{0, 3}, {0, 3}}}},
// // Tests from https://wiki.haskell.org/Regex_Posix
// {`(()|.)(b)`, nil, `ab`, []Match{[]Group{{0, 2}, {0, 1}, {-1, -1}, {1, 2}}}},
// {`(()|[ab])(b)`, nil, `ab`, []Match{[]Group{{0, 2}, {0, 1}, {-1, -1}, {1, 2}}}},
// {`(()|[ab])+b`, nil, `aaab`, []Match{[]Group{{0, 4}, {2, 3}, {-1, -1}}}},
// {`([ab]|())+b`, nil, `aaab`, []Match{[]Group{{0, 4}, {2, 3}, {-1, -1}}}},
// // Bug - this should give {0,6},{3,6},{-1,-1} but it gives {0,6},{3,6},{3,3}
// // {`yyyyyy`, nil, `(yyy|(x?)){2,4}`, []Match{[]Group{{0, 6}, {3, 6}, {-1, -1}}, []Group{{6, 6}, {6, 6}, {6, 6}}}},
// {`(a|ab|c|bcd)*(d*)`, nil, `ababcd`, []Match{[]Group{{0, 6}, {3, 6}, {6, 6}}, []Group{{6, 6}, {6, 6}, {6, 6}}}},
// // Bug - this should give {0,3},{0,3},{0,0},{0,3},{3,3} but it gives {0,3},{0,2},{0,1},{1,2},{2,3}
// // {`((a*)(b|abc))(c*)`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 3}, {0, 0}, {0, 3}, {3, 3}}}},
}
func TestFindAllMatches(t *testing.T) {
func TestFind(t *testing.T) {
for _, test := range reTests {
t.Run(test.re+" "+test.str, func(t *testing.T) {
regComp, err := Compile(test.re, test.flags...)
@@ -682,13 +696,35 @@ func TestFindAllMatches(t *testing.T) {
panic(fmt.Errorf("Test Error: %v", err))
}
} else {
matchIndices := FindAllMatches(regComp, test.str)
zeroGroups := make([]Group, len(matchIndices))
for i, m := range matchIndices {
zeroGroups[i] = m[0]
groupIndex, err := regComp.Find(test.str)
if err != nil { // No matches found
if len(test.result) == 0 {
return // Manually pass the test, because this is the expected behavior
} else {
t.Errorf("Wanted no match Got %v\n", groupIndex)
}
} else {
if groupIndex != test.result[0] {
t.Errorf("Wanted %v Got %v\n", test.result, groupIndex)
}
}
if !slices.Equal(test.result, zeroGroups) {
t.Errorf("Wanted %v Got %v\n", test.result, zeroGroups)
}
})
}
}
func TestFindAll(t *testing.T) {
for _, test := range reTests {
t.Run(test.re+" "+test.str, func(t *testing.T) {
regComp, err := Compile(test.re, test.flags...)
if err != nil {
if test.result != nil {
panic(fmt.Errorf("Test Error: %v", err))
}
} else {
matchIndices := regComp.FindAll(test.str)
if !slices.Equal(test.result, matchIndices) {
t.Errorf("Wanted %v Got %v\n", test.result, matchIndices)
}
}
})
@@ -704,7 +740,7 @@ func TestFindString(t *testing.T) {
panic(err)
}
} else {
foundString := FindString(regComp, test.str)
foundString := regComp.FindString(test.str)
if len(test.result) == 0 {
if foundString != "" {
t.Errorf("Expected no match got %v\n", foundString)
@@ -720,7 +756,32 @@ func TestFindString(t *testing.T) {
}
}
func TestFindAllGroups(t *testing.T) {
func TestFindAllString(t *testing.T) {
for _, test := range reTests {
t.Run(test.re+" "+test.str, func(t *testing.T) {
regComp, err := Compile(test.re, test.flags...)
if err != nil {
if test.result != nil {
panic(err)
}
} else {
foundStrings := regComp.FindAllString(test.str)
if len(test.result) != len(foundStrings) {
t.Errorf("Differing number of matches: Wanted %v matches Got %v matches\n", len(test.result), len(foundStrings))
} else {
for idx, group := range test.result {
groupStr := test.str[group.StartIdx:group.EndIdx]
if groupStr != foundStrings[idx] {
t.Errorf("Wanted %v Got %v\n", groupStr, foundStrings[idx])
}
}
}
}
})
}
}
func TestFindSubmatch(t *testing.T) {
for _, test := range groupTests {
t.Run(test.re+" "+test.str, func(t *testing.T) {
regComp, err := Compile(test.re, test.flags...)
@@ -729,10 +790,30 @@ func TestFindAllGroups(t *testing.T) {
panic(err)
}
}
matchIndices := FindAllMatches(regComp, test.str)
match, err := regComp.FindSubmatch(test.str)
for i := range match {
if match[i].IsValid() {
if test.result[0][i] != match[i] {
t.Errorf("Wanted %v Got %v\n", test.result[0], match)
}
}
}
})
}
}
func TestFindAllSubmatch(t *testing.T) {
for _, test := range groupTests {
t.Run(test.re+" "+test.str, func(t *testing.T) {
regComp, err := Compile(test.re, test.flags...)
if err != nil {
if test.result != nil {
panic(err)
}
}
matchIndices := regComp.FindAllSubmatch(test.str)
for i := range matchIndices {
for j := range matchIndices[i] {
if matchIndices[i][j].isValid() {
if matchIndices[i][j].IsValid() {
if test.result[i][j] != matchIndices[i][j] {
t.Errorf("Wanted %v Got %v\n", test.result, matchIndices)
}