Compare commits
35 Commits
1bfb09b6c7
...
posixStyle
| Author | SHA1 | Date | |
|---|---|---|---|
| ef476e8875 | |||
| 7e6b02632f | |||
| f94e3f2e71 | |||
| b129d83c3f | |||
| 43aa7b5876 | |||
| 9a3bfca313 | |||
| b6ab54f6dd | |||
| 6a96c98d04 | |||
| 3cfc2a6854 | |||
| 5d7a02e796 | |||
| a46d2f4546 | |||
| c88ebd1aa5 | |||
| fd102292c6 | |||
| 6d692d0dfc | |||
| 7c4538a259 | |||
| 2a9ae0b68a | |||
| 783ae2ad10 | |||
| b5e6bc112c | |||
| 206fea34cd | |||
| fcdb23524a | |||
| ac936659b6 | |||
| e6dba9fdcf | |||
| 30779a446b | |||
| f629a0f08f | |||
| 6869cd00a2 | |||
| 02bc8f30a2 | |||
| ac05bceda3 | |||
| 037ac75ea6 | |||
| e9d4e857cf | |||
| b685d2fd5f | |||
| 8eda5055ff | |||
| 45b6566b2c | |||
| e22822e619 | |||
| 692de2a32b | |||
| 0d19664044 |
2
Makefile
2
Makefile
@@ -8,6 +8,6 @@ vet: fmt
|
||||
buildLib: vet
|
||||
go build -gcflags="-N -l" ./...
|
||||
buildCmd: buildLib
|
||||
go build -C cmd/ -o re ./...
|
||||
go build -C cmd/ -gcflags="-N -l" -o re ./...
|
||||
test: buildCmd
|
||||
go test -v ./...
|
||||
|
||||
@@ -121,12 +121,12 @@ func main() {
|
||||
}
|
||||
matchIndices := make([]reg.Match, 0)
|
||||
if matchNumFlagEnabled {
|
||||
tmp, err := reg.FindNthMatch(regComp, test_str, *matchNum)
|
||||
tmp, err := regComp.FindNthMatch(test_str, *matchNum)
|
||||
if err == nil {
|
||||
matchIndices = append(matchIndices, tmp)
|
||||
}
|
||||
} else {
|
||||
matchIndices = reg.FindAllMatches(regComp, test_str)
|
||||
matchIndices = regComp.FindAllSubmatch(test_str)
|
||||
}
|
||||
|
||||
if *printMatchesFlag {
|
||||
@@ -137,7 +137,7 @@ func main() {
|
||||
fmt.Fprintf(out, "Line %d:\n", lineNum)
|
||||
}
|
||||
for _, m := range matchIndices {
|
||||
fmt.Fprintf(out, "%s\n", m.ToString())
|
||||
fmt.Fprintf(out, "%s\n", m.String())
|
||||
}
|
||||
err := out.Flush()
|
||||
if err != nil {
|
||||
|
||||
@@ -18,6 +18,12 @@ type Reg struct {
|
||||
numGroups int
|
||||
}
|
||||
|
||||
// NumSubexp returns the number of sub-expressions in the given [Reg]. This is equivalent
|
||||
// to the number of capturing groups.
|
||||
func (r Reg) NumSubexp() int {
|
||||
return r.numGroups
|
||||
}
|
||||
|
||||
const concatRune rune = 0xF0001
|
||||
|
||||
// Flags for shuntingYard - control its behavior
|
||||
@@ -943,7 +949,9 @@ func thompson(re []postfixNode) (Reg, error) {
|
||||
// and added back in.
|
||||
// If the middle node doesn't exist (ie. something like '()' ), that's fine, I just connect the LPAREN
|
||||
// and RPAREN nodes.
|
||||
// If neither node exists, that's a problem so I return an error.
|
||||
// If the middle node exists but is itself the start of a group, then that _must_ be the opening paren for
|
||||
// the closing paren that I'm on. I put the third node back (because it isn't involved in the capturing group), then
|
||||
// I concatenate those two and add them. If neither node exists, that's a problem so I return an error.
|
||||
if c.nodetype == rparenNode {
|
||||
s.groupEnd = true
|
||||
middleNode, err1 := pop(&nfa)
|
||||
@@ -958,6 +966,11 @@ func thompson(re []postfixNode) (Reg, error) {
|
||||
s.groupNum = lparenNode.groupNum
|
||||
to_add := concatenate(lparenNode, s)
|
||||
nfa = append(nfa, to_add)
|
||||
} else if middleNode.groupBegin && len(middleNode.transitions) == 0 { // The middle node is a lone lparen - something like '(())', and I'm looking at the first rparen
|
||||
nfa = append(nfa, lparenNode) // I shouldn't have popped this out, because it is not involved in the current capturing group
|
||||
s.groupNum = middleNode.groupNum // In this case, the 'middle' node is actually an lparen
|
||||
to_add := concatenate(middleNode, s)
|
||||
nfa = append(nfa, to_add)
|
||||
} else {
|
||||
// At this point, we assume all three nodes are valid ('lparenNode', 'middleNode' and 's')
|
||||
if lparenNode.groupBegin {
|
||||
@@ -1110,10 +1123,11 @@ func thompson(re []postfixNode) (Reg, error) {
|
||||
|
||||
}
|
||||
|
||||
// Compiles the given regular expression into a Reg type, suitable for use with the
|
||||
// matching functions. The second return value is non-nil if a compilation error has
|
||||
// occured. As such, the error value must be checked before using the Reg returned by this function.
|
||||
// The second parameter is an optional list of flags, passed to the parsing function shuntingYard.
|
||||
// Compile compiles the given regular expression into a [Reg].
|
||||
//
|
||||
// An error value != nil indicates that the regex was invalid; the error message should provide
|
||||
// detailed information on the nature of the error.
|
||||
// The second parameter is a sequence of zero or more [ReFlag] values, that modify the behavior of the regex.
|
||||
func Compile(re string, flags ...ReFlag) (Reg, error) {
|
||||
nodes, err := shuntingYard(re, flags...)
|
||||
if err != nil {
|
||||
@@ -1125,3 +1139,12 @@ func Compile(re string, flags ...ReFlag) (Reg, error) {
|
||||
}
|
||||
return reg, nil
|
||||
}
|
||||
|
||||
// MustCompile panics if Compile returns an error. They are identical in all other respects.
|
||||
func MustCompile(re string, flags ...ReFlag) Reg {
|
||||
reg, err := Compile(re, flags...)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
return reg
|
||||
}
|
||||
|
||||
74
regex/doc.go
74
regex/doc.go
@@ -84,9 +84,77 @@ Assertions:
|
||||
\b Match at a word boundary (a word character followed by a non-word character, or vice-versa)
|
||||
\B Match at a non-word boundary (a word character followed by a word character, or a non-word character followed by a non-word character)
|
||||
|
||||
# Flags
|
||||
Lookarounds:
|
||||
|
||||
Flags are used to change the behavior of the engine. None of them are enabled by default. They are passed as variadic arguments to [Compile].
|
||||
The list of flags is provided in the type definition for [ReFlag].
|
||||
x(?=y) Positive lookahead - Match x if followed by y
|
||||
x(?!y) Negative lookahead - Match x if NOT followed by y
|
||||
(?<=x)y Positive lookbehind - Match y if preceded by x
|
||||
(?<!x)y Negative lookbehind - Match y if NOT preceded by x
|
||||
|
||||
Numeric ranges:
|
||||
|
||||
<x-y> Match any number from x to y (inclusive) (x and y must be positive numbers)
|
||||
|
||||
# Key Differences with regexp
|
||||
|
||||
The engine and the API differ from [regexp] in a number of ways, some of them very subtle.
|
||||
The key differences are mentioned below.
|
||||
|
||||
1. Greediness:
|
||||
|
||||
This engine does not support non-greedy operators. All operators are always greedy in nature, and will try
|
||||
to match as much as they can, while still allowing for a successful match. For example, given the regex:
|
||||
|
||||
y*y
|
||||
|
||||
The engine will match as many 'y's as it can, while still allowing the trailing 'y' to be matched.
|
||||
|
||||
Another, more subtle example is the following regex:
|
||||
|
||||
x|xx
|
||||
|
||||
While the stdlib implementation (and most other engines) will prefer matching the first item of the alternation,
|
||||
this engine will go for the longest possible match, regardless of the order of the alternation. Although this
|
||||
strays from the convention, it results in a nice rule-of-thumb - the engine is ALWAYS greedy.
|
||||
|
||||
The stdlib implementation has a function [regexp.Regexp.Longest] which makes future searches prefer the longest match.
|
||||
That is the default (and unchangeable) behavior in this engine.
|
||||
|
||||
2. Byte-slices and runes:
|
||||
|
||||
My engine does not support byte-slices. When a matching function receives a string, it converts it into a
|
||||
rune-slice to iterate through it. While this has some space overhead, the convenience of built-in unicode
|
||||
support made the tradeoff worth it.
|
||||
|
||||
3. Return values
|
||||
|
||||
Rather than using primitives for return values, my engine defines two types that are used as return
|
||||
values: a [Group] represents a capturing group, and a [Match] represents a list of groups.
|
||||
|
||||
[regexp] specifies a regular expression that gives a list of all the matching functions that it supports. The
|
||||
equivalent expression for this engine is:
|
||||
|
||||
Find(All)?(String)?(Submatch)?
|
||||
|
||||
[Reg.Find] returns the index of the leftmost match in the string.
|
||||
|
||||
If a function contains 'All' it returns all matches instead of just the leftmost one.
|
||||
|
||||
If a function contains 'String' it returns the matched text, rather than the indices.
|
||||
|
||||
If a function contains 'Submatch' it returns the match, including all submatches found by
|
||||
capturing groups.
|
||||
|
||||
The term '0-group' is used to refer to the 0th capturing group of a match (which is the entire match).
|
||||
Given the following regex:
|
||||
|
||||
x(y)
|
||||
|
||||
and the input string:
|
||||
|
||||
xyz
|
||||
|
||||
The 0th group would contain 'xy' and the 1st group would contain 'y'. Any matching function without 'Submatch' in its name
|
||||
returns the 0-group.
|
||||
*/
|
||||
package regex
|
||||
|
||||
54
regex/example_test.go
Normal file
54
regex/example_test.go
Normal file
@@ -0,0 +1,54 @@
|
||||
package regex_test
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"gitea.twomorecents.org/Rockingcool/kleingrep/regex"
|
||||
)
|
||||
|
||||
func ExampleReg_Find() {
|
||||
regexStr := "b|a"
|
||||
regexComp := regex.MustCompile(regexStr)
|
||||
|
||||
match, _ := regexComp.Find("banana")
|
||||
fmt.Println(match.String())
|
||||
|
||||
// Output: 0 1
|
||||
}
|
||||
|
||||
func ExampleReg_FindAll() {
|
||||
regexStr := "b|a"
|
||||
regexComp := regex.MustCompile(regexStr)
|
||||
|
||||
matches := regexComp.FindAll("banana")
|
||||
for _, group := range matches {
|
||||
fmt.Println(group.String())
|
||||
}
|
||||
|
||||
// Output: 0 1
|
||||
// 1 2
|
||||
// 3 4
|
||||
// 5 6
|
||||
}
|
||||
|
||||
func ExampleReg_FindString() {
|
||||
regexStr := `\d+`
|
||||
regexComp := regex.MustCompile(regexStr)
|
||||
|
||||
matchStr := regexComp.FindString("The year of our lord, 2025")
|
||||
fmt.Println(matchStr)
|
||||
// Output: 2025
|
||||
}
|
||||
|
||||
func ExampleReg_FindSubmatch() {
|
||||
regexStr := `(\d)\.(\d)(\d)`
|
||||
regexComp := regex.MustCompile(regexStr)
|
||||
|
||||
match, _ := regexComp.FindSubmatch("3.14")
|
||||
fmt.Println(match[0])
|
||||
fmt.Println(match[1])
|
||||
fmt.Println(match[2])
|
||||
// Output: 0 4
|
||||
// 0 1
|
||||
// 2 3
|
||||
}
|
||||
@@ -5,7 +5,13 @@ import (
|
||||
"sort"
|
||||
)
|
||||
|
||||
// a Match stores a slice of all the capturing groups in a match.
|
||||
// A Match represents a match found by the regex in a given string.
|
||||
// It is represented as a list of groups, where the nth element contains
|
||||
// the contents of the nth capturing group. Note that the group may not be valid
|
||||
// (see [Group.IsValid]). The element at index 0 is known
|
||||
// as the 0-group, and represents the contents of the entire match.
|
||||
//
|
||||
// See [Reg.FindSubmatch] for an example.
|
||||
type Match []Group
|
||||
|
||||
// a Group represents a group. It contains the start index and end index of the match
|
||||
@@ -35,28 +41,34 @@ func (m Match) numValidGroups() int {
|
||||
}
|
||||
|
||||
// Returns a string containing the indices of all (valid) groups in the match
|
||||
func (m Match) ToString() string {
|
||||
func (m Match) String() string {
|
||||
var toRet string
|
||||
for i, g := range m {
|
||||
if g.isValid() {
|
||||
if g.IsValid() {
|
||||
toRet += fmt.Sprintf("Group %d\n", i)
|
||||
toRet += g.toString()
|
||||
toRet += g.String()
|
||||
toRet += "\n"
|
||||
}
|
||||
}
|
||||
return toRet
|
||||
}
|
||||
|
||||
// Converts the Group into a string representation:
|
||||
func (idx Group) toString() string {
|
||||
// String converts the Group into a string representation.
|
||||
func (idx Group) String() string {
|
||||
return fmt.Sprintf("%d\t%d", idx.StartIdx, idx.EndIdx)
|
||||
}
|
||||
|
||||
// Returns whether a group contains valid indices
|
||||
func (g Group) isValid() bool {
|
||||
// Returns whether a group is valid (ie. whether it matched any text). It
|
||||
// simply ensures that both indices of the group are >= 0.
|
||||
func (g Group) IsValid() bool {
|
||||
return g.StartIdx >= 0 && g.EndIdx >= 0
|
||||
}
|
||||
|
||||
// Simple function, makes it easier to map this over a list of matches
|
||||
func getZeroGroup(m Match) Group {
|
||||
return m[0]
|
||||
}
|
||||
|
||||
// takeZeroState takes the 0-state (if such a transition exists) for all states in the
|
||||
// given slice. It returns the resulting states. If any of the resulting states is a 0-state,
|
||||
// the second ret val is true.
|
||||
@@ -101,7 +113,7 @@ func zeroMatchPossible(str []rune, idx int, numGroups int, states ...*nfaState)
|
||||
num_appended := 0 // number of unique states addded to tempstates
|
||||
for isZero == true {
|
||||
zeroStates, isZero = takeZeroState(tempstates, numGroups, idx)
|
||||
tempstates, num_appended = unique_append(tempstates, zeroStates...)
|
||||
tempstates, num_appended = uniqueAppend(tempstates, zeroStates...)
|
||||
if num_appended == 0 { // break if we haven't appended any more unique values
|
||||
break
|
||||
}
|
||||
@@ -138,36 +150,72 @@ func pruneIndices(indices []Match) []Match {
|
||||
return toRet
|
||||
}
|
||||
|
||||
// FindString returns a _string_ containing the _text_ of the _leftmost_ match of
|
||||
// the regex, in the given string. The return value will be an empty string in two situations:
|
||||
// Find returns the 0-group of the leftmost match of the regex in the given string.
|
||||
// An error value != nil indicates that no match was found.
|
||||
func (regex Reg) Find(str string) (Group, error) {
|
||||
match, err := regex.FindNthMatch(str, 1)
|
||||
if err != nil {
|
||||
return Group{}, fmt.Errorf("no matches found")
|
||||
}
|
||||
return getZeroGroup(match), nil
|
||||
}
|
||||
|
||||
// FindAll returns a slice containing all the 0-groups of the regex in the given string.
|
||||
// A 0-group represents the match without any submatches.
|
||||
func (regex Reg) FindAll(str string) []Group {
|
||||
indices := regex.FindAllSubmatch(str)
|
||||
zeroGroups := funcMap(indices, getZeroGroup)
|
||||
return zeroGroups
|
||||
}
|
||||
|
||||
// FindString returns the text of the leftmost match of the regex in the given string.
|
||||
// The return value will be an empty string in two situations:
|
||||
// 1. No match was found
|
||||
// 2. The match was an empty string
|
||||
func (regex Reg) FindString(str string) string {
|
||||
match, err := FindNthMatch(regex, str, 1)
|
||||
match, err := regex.FindNthMatch(str, 1)
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
return str[match[0].StartIdx:match[0].EndIdx]
|
||||
zeroGroup := getZeroGroup(match)
|
||||
return str[zeroGroup.StartIdx:zeroGroup.EndIdx]
|
||||
}
|
||||
|
||||
// FindSubmatch returns the leftmost match of the regex in the given string, including
|
||||
// the submatches matched by capturing groups. The returned [Match] will always contain the same
|
||||
// number of groups. The validity of a group (whether or not it matched anything) can be determined with
|
||||
// [Group.IsValid], or by checking that both indices of the group are >= 0.
|
||||
// The second return value is non-nil if no match was found.
|
||||
func (regex Reg) FindSubmatch(str string) (Match, error) {
|
||||
match, err := regex.FindNthMatch(str, 1)
|
||||
if err != nil {
|
||||
return Match{}, fmt.Errorf("no match found")
|
||||
} else {
|
||||
return match, nil
|
||||
}
|
||||
}
|
||||
|
||||
// FindAllString is the 'all' version of FindString.
|
||||
// It returns a _slice of strings_ containing the _text_ of _all_ matches of
|
||||
// the regex, in the given string.
|
||||
//func FindAllString(regex Reg, str []string) []string {
|
||||
//
|
||||
//}
|
||||
// It returns a slice of strings containing the text of all matches of
|
||||
// the regex in the given string.
|
||||
func (regex Reg) FindAllString(str string) []string {
|
||||
zerogroups := regex.FindAll(str)
|
||||
matchStrs := funcMap(zerogroups, func(g Group) string {
|
||||
return str[g.StartIdx:g.EndIdx]
|
||||
})
|
||||
return matchStrs
|
||||
}
|
||||
|
||||
// FindNthMatch finds the 'n'th match of the regex represented by the given start-state, with
|
||||
// the given string.
|
||||
// FindNthMatch returns the 'n'th match of the regex in the given string.
|
||||
// It returns an error (!= nil) if there are fewer than 'n' matches in the string.
|
||||
func FindNthMatch(regex Reg, str string, n int) (Match, error) {
|
||||
func (regex Reg) FindNthMatch(str string, n int) (Match, error) {
|
||||
idx := 0
|
||||
matchNum := 0
|
||||
str_runes := []rune(str)
|
||||
var matchFound bool
|
||||
var matchIdx Match
|
||||
for idx <= len(str_runes) {
|
||||
matchFound, matchIdx, idx = findAllMatchesHelper(regex.start, str_runes, idx, regex.numGroups)
|
||||
matchFound, matchIdx, idx = findAllSubmatchHelper(regex.start, str_runes, idx, regex.numGroups)
|
||||
if matchFound {
|
||||
matchNum++
|
||||
}
|
||||
@@ -179,16 +227,15 @@ func FindNthMatch(regex Reg, str string, n int) (Match, error) {
|
||||
return nil, fmt.Errorf("invalid match index - too few matches found")
|
||||
}
|
||||
|
||||
// FindAllMatches tries to find all matches of the regex represented by given start-state, with
|
||||
// the given string
|
||||
func FindAllMatches(regex Reg, str string) []Match {
|
||||
// FindAllSubmatch returns a slice of matches in the given string.
|
||||
func (regex Reg) FindAllSubmatch(str string) []Match {
|
||||
idx := 0
|
||||
str_runes := []rune(str)
|
||||
var matchFound bool
|
||||
var matchIdx Match
|
||||
indices := make([]Match, 0)
|
||||
for idx <= len(str_runes) {
|
||||
matchFound, matchIdx, idx = findAllMatchesHelper(regex.start, str_runes, idx, regex.numGroups)
|
||||
matchFound, matchIdx, idx = findAllSubmatchHelper(regex.start, str_runes, idx, regex.numGroups)
|
||||
if matchFound {
|
||||
indices = append(indices, matchIdx)
|
||||
}
|
||||
@@ -196,6 +243,7 @@ func FindAllMatches(regex Reg, str string) []Match {
|
||||
if len(indices) > 0 {
|
||||
return pruneIndices(indices)
|
||||
}
|
||||
|
||||
return indices
|
||||
}
|
||||
|
||||
@@ -204,12 +252,13 @@ func FindAllMatches(regex Reg, str string) []Match {
|
||||
// the next search should start from.
|
||||
//
|
||||
// Might return duplicates or overlapping indices, so care must be taken to prune the resulting array.
|
||||
func findAllMatchesHelper(start *nfaState, str []rune, offset int, numGroups int) (bool, Match, int) {
|
||||
func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups int) (bool, Match, int) {
|
||||
// Base case - exit if offset exceeds string's length
|
||||
if offset > len(str) {
|
||||
// The second value here shouldn't be used, because we should exit when the third return value is > than len(str)
|
||||
return false, []Group{}, offset
|
||||
}
|
||||
resetThreads(start)
|
||||
|
||||
// Hold a list of match indices for the current run. When we
|
||||
// can no longer find a match, the match with the largest range is
|
||||
@@ -265,13 +314,13 @@ func findAllMatchesHelper(start *nfaState, str []rune, offset int, numGroups int
|
||||
num_appended := 0
|
||||
for isZero == true {
|
||||
zeroStates, isZero = takeZeroState(tempStates, numGroups, i)
|
||||
tempStates, num_appended = unique_append(tempStates, zeroStates...)
|
||||
tempStates, num_appended = uniqueAppend(tempStates, zeroStates...)
|
||||
if num_appended == 0 { // Break if we haven't appended any more unique values
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
currentStates, _ = unique_append(currentStates, tempStates...)
|
||||
currentStates, _ = uniqueAppend(currentStates, tempStates...)
|
||||
tempStates = nil
|
||||
|
||||
// Take any transitions corresponding to current character
|
||||
@@ -345,7 +394,7 @@ func findAllMatchesHelper(start *nfaState, str []rune, offset int, numGroups int
|
||||
// Check if we can find a zero-length match
|
||||
if foundPath == false {
|
||||
if ok := zeroMatchPossible(str, i, numGroups, currentStates...); ok {
|
||||
if tempIndices[0].isValid() == false {
|
||||
if tempIndices[0].IsValid() == false {
|
||||
tempIndices[0] = Group{startIdx, startIdx}
|
||||
}
|
||||
}
|
||||
@@ -355,7 +404,7 @@ func findAllMatchesHelper(start *nfaState, str []rune, offset int, numGroups int
|
||||
startIdx++
|
||||
// i++
|
||||
// }
|
||||
if tempIndices.numValidGroups() > 0 && tempIndices[0].isValid() {
|
||||
if tempIndices.numValidGroups() > 0 && tempIndices[0].IsValid() {
|
||||
if tempIndices[0].StartIdx == tempIndices[0].EndIdx { // If we have a zero-length match, we have to shift the index at which we start. Otherwise we keep looking at the same paert of the string over and over.
|
||||
return true, tempIndices, tempIndices[0].EndIdx + 1
|
||||
} else {
|
||||
@@ -378,7 +427,7 @@ func findAllMatchesHelper(start *nfaState, str []rune, offset int, numGroups int
|
||||
num_appended := 0 // Number of unique states addded to tempStates
|
||||
for isZero == true {
|
||||
zeroStates, isZero = takeZeroState(tempStates, numGroups, i)
|
||||
tempStates, num_appended = unique_append(tempStates, zeroStates...)
|
||||
tempStates, num_appended = uniqueAppend(tempStates, zeroStates...)
|
||||
if num_appended == 0 { // Break if we haven't appended any more unique values
|
||||
break
|
||||
}
|
||||
|
||||
@@ -50,7 +50,7 @@ func isNormalChar(c rune) bool {
|
||||
|
||||
// Ensure that the given elements are only appended to the given slice if they
|
||||
// don't already exist. Returns the new slice, and the number of unique items appended.
|
||||
func unique_append[T comparable](slc []T, items ...T) ([]T, int) {
|
||||
func uniqueAppend[T comparable](slc []T, items ...T) ([]T, int) {
|
||||
num_appended := 0
|
||||
for _, item := range items {
|
||||
if !slices.Contains(slc, item) {
|
||||
@@ -61,6 +61,25 @@ func unique_append[T comparable](slc []T, items ...T) ([]T, int) {
|
||||
return slc, num_appended
|
||||
}
|
||||
|
||||
// uniqueAppendFunc is the predicate-based counterpart of uniqueAppend, for
// element types that are not comparable with ==.
//
// It returns a new slice holding a copy of slc followed by every element of
// items that is not already present, plus the number of elements appended.
// An item counts as present when fn(item, existing) reports true for some
// element already in the result - including items appended earlier in the
// same call, so duplicates within items are added only once.
//
// The input slice slc is never modified.
func uniqueAppendFunc[T any](slc []T, fn func(T, T) bool, items ...T) ([]T, int) {
	// Reserve room for the worst case (every item is new) so that append
	// never has to reallocate.
	toRet := make([]T, len(slc), len(slc)+len(items))
	copy(toRet, slc)

	numAppended := 0
	for _, item := range items {
		// Search the result built so far rather than the original slice;
		// otherwise two equal items in the same call would both be appended.
		alreadyPresent := slices.ContainsFunc(toRet, func(existing T) bool {
			return fn(item, existing)
		})
		if alreadyPresent {
			continue
		}
		toRet = append(toRet, item)
		numAppended++
	}
	return toRet, numAppended
}
|
||||
|
||||
// Returns true only if all the given elements are equal
|
||||
func allEqual[T comparable](items ...T) bool {
|
||||
first := items[0]
|
||||
|
||||
37
regex/nfa.go
37
regex/nfa.go
@@ -104,6 +104,26 @@ func cloneStateHelper(stateToClone *nfaState, cloneMap map[*nfaState]*nfaState)
|
||||
return clone
|
||||
}
|
||||
|
||||
// Reset any thread-related fields of the NFA starting from the given state.
|
||||
func resetThreads(start *nfaState) {
|
||||
visitedMap := make(map[*nfaState]bool) // The value type doesn't matter here
|
||||
resetThreadsHelper(start, visitedMap)
|
||||
}
|
||||
|
||||
func resetThreadsHelper(state *nfaState, visitedMap map[*nfaState]bool) {
|
||||
if _, ok := visitedMap[state]; ok {
|
||||
return
|
||||
}
|
||||
// Assuming it hasn't been visited
|
||||
state.threadGroups = nil
|
||||
visitedMap[state] = true
|
||||
for _, v := range state.transitions {
|
||||
for _, nextState := range v {
|
||||
resetThreadsHelper(nextState, visitedMap)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Checks if the given state's assertion is true. Returns true if the given
|
||||
// state doesn't have an assertion.
|
||||
func (s nfaState) checkAssertion(str []rune, idx int) bool {
|
||||
@@ -156,17 +176,18 @@ func (s nfaState) checkAssertion(str []rune, idx int) bool {
|
||||
strToMatch = string(runesToMatch)
|
||||
}
|
||||
|
||||
matchIndices := FindAllMatches(Reg{startState, s.lookaroundNumCaptureGroups}, strToMatch)
|
||||
regComp := Reg{startState, s.lookaroundNumCaptureGroups}
|
||||
matchIndices := regComp.FindAll(strToMatch)
|
||||
|
||||
numMatchesFound := 0
|
||||
for _, matchIdx := range matchIndices {
|
||||
if s.assert == plaAssert || s.assert == nlaAssert { // Lookahead - return true (or false) if at least one match starts at 0. Zero is used because the test-string _starts_ from idx.
|
||||
if matchIdx[0].StartIdx == 0 {
|
||||
if matchIdx.StartIdx == 0 {
|
||||
numMatchesFound++
|
||||
}
|
||||
}
|
||||
if s.assert == plbAssert || s.assert == nlbAssert { // Lookbehind - return true (or false) if at least one match _ends_ at the current index.
|
||||
if matchIdx[0].EndIdx == idx {
|
||||
if matchIdx.EndIdx == idx {
|
||||
numMatchesFound++
|
||||
}
|
||||
}
|
||||
@@ -273,7 +294,7 @@ func concatenate(s1 *nfaState, s2 *nfaState) *nfaState {
|
||||
}
|
||||
for i := range s1.output {
|
||||
for _, c := range s2.content { // Create transitions for every element in s1's content to s2'
|
||||
s1.output[i].transitions[c], _ = unique_append(s1.output[i].transitions[c], s2)
|
||||
s1.output[i].transitions[c], _ = uniqueAppend(s1.output[i].transitions[c], s2)
|
||||
}
|
||||
}
|
||||
s1.output = s2.output
|
||||
@@ -293,11 +314,11 @@ func kleene(s1 nfaState) (*nfaState, error) {
|
||||
toReturn.output = append(toReturn.output, toReturn)
|
||||
for i := range s1.output {
|
||||
for _, c := range toReturn.content {
|
||||
s1.output[i].transitions[c], _ = unique_append(s1.output[i].transitions[c], toReturn)
|
||||
s1.output[i].transitions[c], _ = uniqueAppend(s1.output[i].transitions[c], toReturn)
|
||||
}
|
||||
}
|
||||
for _, c := range s1.content {
|
||||
toReturn.transitions[c], _ = unique_append(toReturn.transitions[c], &s1)
|
||||
toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], &s1)
|
||||
}
|
||||
return toReturn, nil
|
||||
}
|
||||
@@ -313,10 +334,10 @@ func alternate(s1 *nfaState, s2 *nfaState) *nfaState {
|
||||
// This would lead to multiple instances of the same set of match indices, since both
|
||||
// 's1' states would be considered to match.
|
||||
for _, c := range s1.content {
|
||||
toReturn.transitions[c], _ = unique_append(toReturn.transitions[c], s1)
|
||||
toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], s1)
|
||||
}
|
||||
for _, c := range s2.content {
|
||||
toReturn.transitions[c], _ = unique_append(toReturn.transitions[c], s2)
|
||||
toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], s2)
|
||||
}
|
||||
toReturn.content = newContents(epsilon)
|
||||
toReturn.isEmpty = true
|
||||
|
||||
@@ -3,7 +3,9 @@ package regex
|
||||
import (
|
||||
"fmt"
|
||||
"math"
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
type numRange struct {
|
||||
@@ -99,13 +101,11 @@ func range2regex(start int, end int) (string, error) {
|
||||
// Last range - tmp to rangeEnd
|
||||
ranges = append(ranges, numRange{tmp, rangeEnd})
|
||||
|
||||
regex := string(nonCapLparenRune)
|
||||
regexSlice := make([]string, 0)
|
||||
// Generate the regex
|
||||
for i, rg := range ranges {
|
||||
if i > 0 {
|
||||
regex += "|"
|
||||
}
|
||||
regex += string(nonCapLparenRune)
|
||||
for _, rg := range ranges {
|
||||
tmpStr := ""
|
||||
tmpStr += string(nonCapLparenRune)
|
||||
startSlc := intToSlc(rg.start)
|
||||
endSlc := intToSlc(rg.end)
|
||||
if len(startSlc) != len(endSlc) {
|
||||
@@ -113,14 +113,27 @@ func range2regex(start int, end int) (string, error) {
|
||||
}
|
||||
for i := range startSlc {
|
||||
if startSlc[i] == endSlc[i] {
|
||||
regex += string(rune(startSlc[i] + 48)) // '0' is ascii value 48, 1 is 49 etc. To convert the digit to its character form, we can just add 48.
|
||||
tmpStr += string(rune(startSlc[i] + 48)) // '0' is ascii value 48, 1 is 49 etc. To convert the digit to its character form, we can just add 48.
|
||||
} else {
|
||||
regex += fmt.Sprintf("%c%c-%c%c", lbracketRune, rune(startSlc[i]+48), rune(endSlc[i]+48), rbracketRune)
|
||||
tmpStr += fmt.Sprintf("%c%c-%c%c", lbracketRune, rune(startSlc[i]+48), rune(endSlc[i]+48), rbracketRune)
|
||||
}
|
||||
}
|
||||
regex += ")"
|
||||
tmpStr += ")"
|
||||
regexSlice = append(regexSlice, tmpStr)
|
||||
}
|
||||
regex += ")"
|
||||
// Each element of the slice represents one 'group'. Taking 0-255 as an example, the elements would be:
|
||||
// 1. 0-9
|
||||
// 2. 10-99
|
||||
// 3. 100-199
|
||||
// 4. 200-249
|
||||
// 5. 250-255
|
||||
//
|
||||
// The slice is reversed before joining because the ascending order above is incompatible with the PCRE rule for matching.
|
||||
// The PCRE rule specifies that the left-branch of an alternation is preferred. Even though this engine uses the POSIX
|
||||
// rule at the moment (which prefers the longest match regardless of the order of the alternation), reversing the string
|
||||
// has no downsides. It doesn't affect POSIX matching, and it will reduce my burden if I decide to switch to PCRE matching.
|
||||
slices.Reverse(regexSlice)
|
||||
regex := string(nonCapLparenRune) + strings.Join(regexSlice, "|") + ")"
|
||||
return regex, nil
|
||||
|
||||
}
|
||||
|
||||
101
regex/re_test.go
101
regex/re_test.go
@@ -105,6 +105,9 @@ var reTests = []struct {
|
||||
{"(a|b){3,4}", nil, "ababaa", []Group{{0, 4}}},
|
||||
{"(bc){5,}", nil, "bcbcbcbcbcbcbcbc", []Group{{0, 16}}},
|
||||
{`\d{3,4}`, nil, "1209", []Group{{0, 4}}},
|
||||
{`\d{3,4}`, nil, "120", []Group{{0, 3}}},
|
||||
{`\d{3,4}`, nil, "12709", []Group{{0, 4}}},
|
||||
{`\d{3,4}`, nil, "12", []Group{}},
|
||||
{`\d{3,4}`, nil, "109", []Group{{0, 3}}},
|
||||
{`\d{3,4}`, nil, "5", []Group{}},
|
||||
{`\d{3,4}`, nil, "123135", []Group{{0, 4}}},
|
||||
@@ -671,9 +674,20 @@ var groupTests = []struct {
|
||||
{`^([ab]*)(?<!(a))c`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 2}}}},
|
||||
|
||||
{`(<389-400>)`, nil, `391`, []Match{[]Group{{0, 3}, {0, 3}}}},
|
||||
|
||||
// // Tests from https://wiki.haskell.org/Regex_Posix
|
||||
// {`(()|.)(b)`, nil, `ab`, []Match{[]Group{{0, 2}, {0, 1}, {-1, -1}, {1, 2}}}},
|
||||
// {`(()|[ab])(b)`, nil, `ab`, []Match{[]Group{{0, 2}, {0, 1}, {-1, -1}, {1, 2}}}},
|
||||
// {`(()|[ab])+b`, nil, `aaab`, []Match{[]Group{{0, 4}, {2, 3}, {-1, -1}}}},
|
||||
// {`([ab]|())+b`, nil, `aaab`, []Match{[]Group{{0, 4}, {2, 3}, {-1, -1}}}},
|
||||
// // Bug - this should give {0,6},{3,6},{-1,-1} but it gives {0,6},{3,6},{3,3}
|
||||
// // {`yyyyyy`, nil, `(yyy|(x?)){2,4}`, []Match{[]Group{{0, 6}, {3, 6}, {-1, -1}}, []Group{{6, 6}, {6, 6}, {6, 6}}}},
|
||||
// {`(a|ab|c|bcd)*(d*)`, nil, `ababcd`, []Match{[]Group{{0, 6}, {3, 6}, {6, 6}}, []Group{{6, 6}, {6, 6}, {6, 6}}}},
|
||||
// // Bug - this should give {0,3},{0,3},{0,0},{0,3},{3,3} but it gives {0,3},{0,2},{0,1},{1,2},{2,3}
|
||||
// // {`((a*)(b|abc))(c*)`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 3}, {0, 0}, {0, 3}, {3, 3}}}},
|
||||
}
|
||||
|
||||
func TestFindAllMatches(t *testing.T) {
|
||||
func TestFind(t *testing.T) {
|
||||
for _, test := range reTests {
|
||||
t.Run(test.re+" "+test.str, func(t *testing.T) {
|
||||
regComp, err := Compile(test.re, test.flags...)
|
||||
@@ -682,13 +696,35 @@ func TestFindAllMatches(t *testing.T) {
|
||||
panic(fmt.Errorf("Test Error: %v", err))
|
||||
}
|
||||
} else {
|
||||
matchIndices := FindAllMatches(regComp, test.str)
|
||||
zeroGroups := make([]Group, len(matchIndices))
|
||||
for i, m := range matchIndices {
|
||||
zeroGroups[i] = m[0]
|
||||
groupIndex, err := regComp.Find(test.str)
|
||||
if err != nil { // No matches found
|
||||
if len(test.result) == 0 {
|
||||
return // Manually pass the test, because this is the expected behavior
|
||||
} else {
|
||||
t.Errorf("Wanted no match Got %v\n", groupIndex)
|
||||
}
|
||||
} else {
|
||||
if groupIndex != test.result[0] {
|
||||
t.Errorf("Wanted %v Got %v\n", test.result, groupIndex)
|
||||
}
|
||||
}
|
||||
if !slices.Equal(test.result, zeroGroups) {
|
||||
t.Errorf("Wanted %v Got %v\n", test.result, zeroGroups)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestFindAll(t *testing.T) {
|
||||
for _, test := range reTests {
|
||||
t.Run(test.re+" "+test.str, func(t *testing.T) {
|
||||
regComp, err := Compile(test.re, test.flags...)
|
||||
if err != nil {
|
||||
if test.result != nil {
|
||||
panic(fmt.Errorf("Test Error: %v", err))
|
||||
}
|
||||
} else {
|
||||
matchIndices := regComp.FindAll(test.str)
|
||||
if !slices.Equal(test.result, matchIndices) {
|
||||
t.Errorf("Wanted %v Got %v\n", test.result, matchIndices)
|
||||
}
|
||||
}
|
||||
})
|
||||
@@ -720,7 +756,32 @@ func TestFindString(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestFindAllGroups(t *testing.T) {
|
||||
func TestFindAllString(t *testing.T) {
|
||||
for _, test := range reTests {
|
||||
t.Run(test.re+" "+test.str, func(t *testing.T) {
|
||||
regComp, err := Compile(test.re, test.flags...)
|
||||
if err != nil {
|
||||
if test.result != nil {
|
||||
panic(err)
|
||||
}
|
||||
} else {
|
||||
foundStrings := regComp.FindAllString(test.str)
|
||||
if len(test.result) != len(foundStrings) {
|
||||
t.Errorf("Differing number of matches: Wanted %v matches Got %v matches\n", len(test.result), len(foundStrings))
|
||||
} else {
|
||||
for idx, group := range test.result {
|
||||
groupStr := test.str[group.StartIdx:group.EndIdx]
|
||||
if groupStr != foundStrings[idx] {
|
||||
t.Errorf("Wanted %v Got %v\n", groupStr, foundStrings[idx])
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestFindSubmatch(t *testing.T) {
|
||||
for _, test := range groupTests {
|
||||
t.Run(test.re+" "+test.str, func(t *testing.T) {
|
||||
regComp, err := Compile(test.re, test.flags...)
|
||||
@@ -729,10 +790,30 @@ func TestFindAllGroups(t *testing.T) {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
matchIndices := FindAllMatches(regComp, test.str)
|
||||
match, err := regComp.FindSubmatch(test.str)
|
||||
for i := range match {
|
||||
if match[i].IsValid() {
|
||||
if test.result[0][i] != match[i] {
|
||||
t.Errorf("Wanted %v Got %v\n", test.result[0], match)
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
func TestFindAllSubmatch(t *testing.T) {
|
||||
for _, test := range groupTests {
|
||||
t.Run(test.re+" "+test.str, func(t *testing.T) {
|
||||
regComp, err := Compile(test.re, test.flags...)
|
||||
if err != nil {
|
||||
if test.result != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
matchIndices := regComp.FindAllSubmatch(test.str)
|
||||
for i := range matchIndices {
|
||||
for j := range matchIndices[i] {
|
||||
if matchIndices[i][j].isValid() {
|
||||
if matchIndices[i][j].IsValid() {
|
||||
if test.result[i][j] != matchIndices[i][j] {
|
||||
t.Errorf("Wanted %v Got %v\n", test.result, matchIndices)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user