Compare commits
13 Commits
6d692d0dfc
...
posixStyle
| Author | SHA1 | Date | |
|---|---|---|---|
| ef476e8875 | |||
| 7e6b02632f | |||
| f94e3f2e71 | |||
| b129d83c3f | |||
| 43aa7b5876 | |||
| 9a3bfca313 | |||
| b6ab54f6dd | |||
| 6a96c98d04 | |||
| 3cfc2a6854 | |||
| 5d7a02e796 | |||
| a46d2f4546 | |||
| c88ebd1aa5 | |||
| fd102292c6 |
2
Makefile
2
Makefile
@@ -8,6 +8,6 @@ vet: fmt
|
||||
buildLib: vet
|
||||
go build -gcflags="-N -l" ./...
|
||||
buildCmd: buildLib
|
||||
go build -C cmd/ -o re ./...
|
||||
go build -C cmd/ -gcflags="-N -l" -o re ./...
|
||||
test: buildCmd
|
||||
go test -v ./...
|
||||
|
||||
@@ -949,7 +949,9 @@ func thompson(re []postfixNode) (Reg, error) {
|
||||
// and added back in.
|
||||
// If the middle node doesn't exist (ie. something like '()' ), that's fine, I just connect the LPAREN
|
||||
// and RPAREN nodes.
|
||||
// If neither node exists, that's a problem so I return an error.
|
||||
// If the middle node exists but is itself the start of a group, then that _must_ be the opening paren for
|
||||
// the closing paren that I'm on. I put the third node back (because it isn't involved in the capturing group), then
|
||||
// I concatenate those two and add them. If neither node exists, that's a problem so I return an error.
|
||||
if c.nodetype == rparenNode {
|
||||
s.groupEnd = true
|
||||
middleNode, err1 := pop(&nfa)
|
||||
@@ -964,6 +966,11 @@ func thompson(re []postfixNode) (Reg, error) {
|
||||
s.groupNum = lparenNode.groupNum
|
||||
to_add := concatenate(lparenNode, s)
|
||||
nfa = append(nfa, to_add)
|
||||
} else if middleNode.groupBegin && len(middleNode.transitions) == 0 { // The middle node is a lone lparen - something like '(())', and I'm looking at the first rparen
|
||||
nfa = append(nfa, lparenNode) // I shouldn't have popped this out, because it is not involved in the current capturing group
|
||||
s.groupNum = middleNode.groupNum // In this case, the 'middle' node is actually an lparen
|
||||
to_add := concatenate(middleNode, s)
|
||||
nfa = append(nfa, to_add)
|
||||
} else {
|
||||
// At this point, we assume all three nodes are valid ('lparenNode', 'middleNode' and 's')
|
||||
if lparenNode.groupBegin {
|
||||
|
||||
@@ -114,7 +114,11 @@ Another, more subtle example is the following regex:
|
||||
x|xx
|
||||
|
||||
While the stdlib implementation (and most other engines) will prefer matching the first item of the alternation,
|
||||
this engine will _always_ go for the longest possible match, regardless of the order of the alternation.
|
||||
this engine will go for the longest possible match, regardless of the order of the alternation. Although this
|
||||
strays from the convention, it results in a nice rule-of-thumb - the engine is ALWAYS greedy.
|
||||
|
||||
The stdlib implementation has a function [regexp.Regexp.Longest] which makes future searches prefer the longest match.
|
||||
That is the default (and unchangeable) behavior in this engine.
|
||||
|
||||
2. Byte-slices and runes:
|
||||
|
||||
|
||||
@@ -30,3 +30,25 @@ func ExampleReg_FindAll() {
|
||||
// 3 4
|
||||
// 5 6
|
||||
}
|
||||
|
||||
func ExampleReg_FindString() {
|
||||
regexStr := `\d+`
|
||||
regexComp := regex.MustCompile(regexStr)
|
||||
|
||||
matchStr := regexComp.FindString("The year of our lord, 2025")
|
||||
fmt.Println(matchStr)
|
||||
// Output: 2025
|
||||
}
|
||||
|
||||
func ExampleReg_FindSubmatch() {
|
||||
regexStr := `(\d)\.(\d)(\d)`
|
||||
regexComp := regex.MustCompile(regexStr)
|
||||
|
||||
match, _ := regexComp.FindSubmatch("3.14")
|
||||
fmt.Println(match[0])
|
||||
fmt.Println(match[1])
|
||||
fmt.Println(match[2])
|
||||
// Output: 0 4
|
||||
// 0 1
|
||||
// 2 3
|
||||
}
|
||||
|
||||
@@ -5,7 +5,13 @@ import (
|
||||
"sort"
|
||||
)
|
||||
|
||||
// a Match stores a slice of all the capturing groups in a match.
|
||||
// A Match represents a match found by the regex in a given string.
|
||||
// It is represented as a list of groups, where the nth element contains
|
||||
// the contents of the nth capturing group. Note that the group may not be valid
|
||||
// (see [Group.IsValid]). The element at index 0 is known
|
||||
// as the 0-group, and represents the contents of the entire match.
|
||||
//
|
||||
// See [Reg.FindSubmatch] for an example.
|
||||
type Match []Group
|
||||
|
||||
// a Group represents a group. It contains the start index and end index of the match
|
||||
@@ -107,7 +113,7 @@ func zeroMatchPossible(str []rune, idx int, numGroups int, states ...*nfaState)
|
||||
num_appended := 0 // number of unique states addded to tempstates
|
||||
for isZero == true {
|
||||
zeroStates, isZero = takeZeroState(tempstates, numGroups, idx)
|
||||
tempstates, num_appended = unique_append(tempstates, zeroStates...)
|
||||
tempstates, num_appended = uniqueAppend(tempstates, zeroStates...)
|
||||
if num_appended == 0 { // break if we haven't appended any more unique values
|
||||
break
|
||||
}
|
||||
@@ -252,6 +258,7 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
|
||||
// The second value here shouldn't be used, because we should exit when the third return value is > than len(str)
|
||||
return false, []Group{}, offset
|
||||
}
|
||||
resetThreads(start)
|
||||
|
||||
// Hold a list of match indices for the current run. When we
|
||||
// can no longer find a match, the match with the largest range is
|
||||
@@ -307,13 +314,13 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
|
||||
num_appended := 0
|
||||
for isZero == true {
|
||||
zeroStates, isZero = takeZeroState(tempStates, numGroups, i)
|
||||
tempStates, num_appended = unique_append(tempStates, zeroStates...)
|
||||
tempStates, num_appended = uniqueAppend(tempStates, zeroStates...)
|
||||
if num_appended == 0 { // Break if we haven't appended any more unique values
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
currentStates, _ = unique_append(currentStates, tempStates...)
|
||||
currentStates, _ = uniqueAppend(currentStates, tempStates...)
|
||||
tempStates = nil
|
||||
|
||||
// Take any transitions corresponding to current character
|
||||
@@ -420,7 +427,7 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
|
||||
num_appended := 0 // Number of unique states addded to tempStates
|
||||
for isZero == true {
|
||||
zeroStates, isZero = takeZeroState(tempStates, numGroups, i)
|
||||
tempStates, num_appended = unique_append(tempStates, zeroStates...)
|
||||
tempStates, num_appended = uniqueAppend(tempStates, zeroStates...)
|
||||
if num_appended == 0 { // Break if we haven't appended any more unique values
|
||||
break
|
||||
}
|
||||
|
||||
@@ -50,7 +50,7 @@ func isNormalChar(c rune) bool {
|
||||
|
||||
// Ensure that the given elements are only appended to the given slice if they
|
||||
// don't already exist. Returns the new slice, and the number of unique items appended.
|
||||
func unique_append[T comparable](slc []T, items ...T) ([]T, int) {
|
||||
func uniqueAppend[T comparable](slc []T, items ...T) ([]T, int) {
|
||||
num_appended := 0
|
||||
for _, item := range items {
|
||||
if !slices.Contains(slc, item) {
|
||||
@@ -61,6 +61,25 @@ func unique_append[T comparable](slc []T, items ...T) ([]T, int) {
|
||||
return slc, num_appended
|
||||
}
|
||||
|
||||
// uniqueAppendFunc appends each of the given items to a copy of slc, but only
// if no equivalent element is already present. Two elements are considered
// equivalent if fn(item, existing) returns true. This allows it to be used
// with element types that aren't comparable.
//
// Every item is checked against the elements of slc AND the items that have
// already been appended during this call, so duplicates within 'items' are
// only added once.
//
// The input slice is never modified. Returns the new slice, and the number of
// unique items appended.
func uniqueAppendFunc[T any](slc []T, fn func(T, T) bool, items ...T) ([]T, int) {
	// Work on a copy, so that appending never writes into the caller's backing array.
	toRet := make([]T, len(slc), len(slc)+len(items))
	copy(toRet, slc)

	numAppended := 0
	for _, item := range items {
		// Search the result built so far (not just the original slice), so that
		// an item appended earlier in this loop isn't appended a second time.
		itemExists := slices.ContainsFunc(toRet, func(val T) bool {
			return fn(item, val)
		})
		if !itemExists {
			toRet = append(toRet, item)
			numAppended++
		}
	}
	return toRet, numAppended
}
|
||||
|
||||
// Returns true only if all the given elements are equal
|
||||
func allEqual[T comparable](items ...T) bool {
|
||||
first := items[0]
|
||||
|
||||
30
regex/nfa.go
30
regex/nfa.go
@@ -104,6 +104,26 @@ func cloneStateHelper(stateToClone *nfaState, cloneMap map[*nfaState]*nfaState)
|
||||
return clone
|
||||
}
|
||||
|
||||
// Reset any thread-related fields of the NFA starting from the given state.
|
||||
func resetThreads(start *nfaState) {
|
||||
visitedMap := make(map[*nfaState]bool) // The value type doesn't matter here
|
||||
resetThreadsHelper(start, visitedMap)
|
||||
}
|
||||
|
||||
func resetThreadsHelper(state *nfaState, visitedMap map[*nfaState]bool) {
|
||||
if _, ok := visitedMap[state]; ok {
|
||||
return
|
||||
}
|
||||
// Assuming it hasn't been visited
|
||||
state.threadGroups = nil
|
||||
visitedMap[state] = true
|
||||
for _, v := range state.transitions {
|
||||
for _, nextState := range v {
|
||||
resetThreadsHelper(nextState, visitedMap)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Checks if the given state's assertion is true. Returns true if the given
|
||||
// state doesn't have an assertion.
|
||||
func (s nfaState) checkAssertion(str []rune, idx int) bool {
|
||||
@@ -274,7 +294,7 @@ func concatenate(s1 *nfaState, s2 *nfaState) *nfaState {
|
||||
}
|
||||
for i := range s1.output {
|
||||
for _, c := range s2.content { // Create transitions for every element in s1's content to s2'
|
||||
s1.output[i].transitions[c], _ = unique_append(s1.output[i].transitions[c], s2)
|
||||
s1.output[i].transitions[c], _ = uniqueAppend(s1.output[i].transitions[c], s2)
|
||||
}
|
||||
}
|
||||
s1.output = s2.output
|
||||
@@ -294,11 +314,11 @@ func kleene(s1 nfaState) (*nfaState, error) {
|
||||
toReturn.output = append(toReturn.output, toReturn)
|
||||
for i := range s1.output {
|
||||
for _, c := range toReturn.content {
|
||||
s1.output[i].transitions[c], _ = unique_append(s1.output[i].transitions[c], toReturn)
|
||||
s1.output[i].transitions[c], _ = uniqueAppend(s1.output[i].transitions[c], toReturn)
|
||||
}
|
||||
}
|
||||
for _, c := range s1.content {
|
||||
toReturn.transitions[c], _ = unique_append(toReturn.transitions[c], &s1)
|
||||
toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], &s1)
|
||||
}
|
||||
return toReturn, nil
|
||||
}
|
||||
@@ -314,10 +334,10 @@ func alternate(s1 *nfaState, s2 *nfaState) *nfaState {
|
||||
// This would lead to multiple instances of the same set of match indices, since both
|
||||
// 's1' states would be considered to match.
|
||||
for _, c := range s1.content {
|
||||
toReturn.transitions[c], _ = unique_append(toReturn.transitions[c], s1)
|
||||
toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], s1)
|
||||
}
|
||||
for _, c := range s2.content {
|
||||
toReturn.transitions[c], _ = unique_append(toReturn.transitions[c], s2)
|
||||
toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], s2)
|
||||
}
|
||||
toReturn.content = newContents(epsilon)
|
||||
toReturn.isEmpty = true
|
||||
|
||||
@@ -3,7 +3,9 @@ package regex
|
||||
import (
|
||||
"fmt"
|
||||
"math"
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
type numRange struct {
|
||||
@@ -99,13 +101,11 @@ func range2regex(start int, end int) (string, error) {
|
||||
// Last range - tmp to rangeEnd
|
||||
ranges = append(ranges, numRange{tmp, rangeEnd})
|
||||
|
||||
regex := string(nonCapLparenRune)
|
||||
regexSlice := make([]string, 0)
|
||||
// Generate the regex
|
||||
for i, rg := range ranges {
|
||||
if i > 0 {
|
||||
regex += "|"
|
||||
}
|
||||
regex += string(nonCapLparenRune)
|
||||
for _, rg := range ranges {
|
||||
tmpStr := ""
|
||||
tmpStr += string(nonCapLparenRune)
|
||||
startSlc := intToSlc(rg.start)
|
||||
endSlc := intToSlc(rg.end)
|
||||
if len(startSlc) != len(endSlc) {
|
||||
@@ -113,14 +113,27 @@ func range2regex(start int, end int) (string, error) {
|
||||
}
|
||||
for i := range startSlc {
|
||||
if startSlc[i] == endSlc[i] {
|
||||
regex += string(rune(startSlc[i] + 48)) // '0' is ascii value 48, 1 is 49 etc. To convert the digit to its character form, we can just add 48.
|
||||
tmpStr += string(rune(startSlc[i] + 48)) // '0' is ascii value 48, 1 is 49 etc. To convert the digit to its character form, we can just add 48.
|
||||
} else {
|
||||
regex += fmt.Sprintf("%c%c-%c%c", lbracketRune, rune(startSlc[i]+48), rune(endSlc[i]+48), rbracketRune)
|
||||
tmpStr += fmt.Sprintf("%c%c-%c%c", lbracketRune, rune(startSlc[i]+48), rune(endSlc[i]+48), rbracketRune)
|
||||
}
|
||||
}
|
||||
regex += ")"
|
||||
tmpStr += ")"
|
||||
regexSlice = append(regexSlice, tmpStr)
|
||||
}
|
||||
regex += ")"
|
||||
// Each element of the slice represents one 'group'. Taking 0-255 as an example, the elements would be:
|
||||
// 1. 0-9
|
||||
// 2. 10-99
|
||||
// 3. 100-199
|
||||
// 4. 200-249
|
||||
// 5. 250-255
|
||||
//
|
||||
// The slice is reversed before joining because the original (ascending) order is incompatible with the PCRE rule for matching.
|
||||
// The PCRE rule specifies that the left-branch of an alternation is preferred. Even though this engine uses the POSIX
|
||||
// rule at the moment (which prefers the longest match regardless of the order of the alternation), reversing the string
|
||||
// has no downsides. It doesn't affect POSIX matching, and it will reduce my burden if I decide to switch to PCRE matching.
|
||||
slices.Reverse(regexSlice)
|
||||
regex := string(nonCapLparenRune) + strings.Join(regexSlice, "|") + ")"
|
||||
return regex, nil
|
||||
|
||||
}
|
||||
|
||||
@@ -105,6 +105,9 @@ var reTests = []struct {
|
||||
{"(a|b){3,4}", nil, "ababaa", []Group{{0, 4}}},
|
||||
{"(bc){5,}", nil, "bcbcbcbcbcbcbcbc", []Group{{0, 16}}},
|
||||
{`\d{3,4}`, nil, "1209", []Group{{0, 4}}},
|
||||
{`\d{3,4}`, nil, "120", []Group{{0, 3}}},
|
||||
{`\d{3,4}`, nil, "12709", []Group{{0, 4}}},
|
||||
{`\d{3,4}`, nil, "12", []Group{}},
|
||||
{`\d{3,4}`, nil, "109", []Group{{0, 3}}},
|
||||
{`\d{3,4}`, nil, "5", []Group{}},
|
||||
{`\d{3,4}`, nil, "123135", []Group{{0, 4}}},
|
||||
@@ -671,6 +674,17 @@ var groupTests = []struct {
|
||||
{`^([ab]*)(?<!(a))c`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 2}}}},
|
||||
|
||||
{`(<389-400>)`, nil, `391`, []Match{[]Group{{0, 3}, {0, 3}}}},
|
||||
|
||||
// // Tests from https://wiki.haskell.org/Regex_Posix
|
||||
// {`(()|.)(b)`, nil, `ab`, []Match{[]Group{{0, 2}, {0, 1}, {-1, -1}, {1, 2}}}},
|
||||
// {`(()|[ab])(b)`, nil, `ab`, []Match{[]Group{{0, 2}, {0, 1}, {-1, -1}, {1, 2}}}},
|
||||
// {`(()|[ab])+b`, nil, `aaab`, []Match{[]Group{{0, 4}, {2, 3}, {-1, -1}}}},
|
||||
// {`([ab]|())+b`, nil, `aaab`, []Match{[]Group{{0, 4}, {2, 3}, {-1, -1}}}},
|
||||
// // Bug - this should give {0,6},{3,6},{-1,-1} but it gives {0,6},{3,6},{3,3}
|
||||
// // {`yyyyyy`, nil, `(yyy|(x?)){2,4}`, []Match{[]Group{{0, 6}, {3, 6}, {-1, -1}}, []Group{{6, 6}, {6, 6}, {6, 6}}}},
|
||||
// {`(a|ab|c|bcd)*(d*)`, nil, `ababcd`, []Match{[]Group{{0, 6}, {3, 6}, {6, 6}}, []Group{{6, 6}, {6, 6}, {6, 6}}}},
|
||||
// // Bug - this should give {0,3},{0,3},{0,0},{0,3},{3,3} but it gives {0,3},{0,2},{0,1},{1,2},{2,3}
|
||||
// // {`((a*)(b|abc))(c*)`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 3}, {0, 0}, {0, 3}, {3, 3}}}},
|
||||
}
|
||||
|
||||
func TestFind(t *testing.T) {
|
||||
|
||||
Reference in New Issue
Block a user