Compare commits
13 Commits
6d692d0dfc
...
posixStyle
| Author | SHA1 | Date | |
|---|---|---|---|
| ef476e8875 | |||
| 7e6b02632f | |||
| f94e3f2e71 | |||
| b129d83c3f | |||
| 43aa7b5876 | |||
| 9a3bfca313 | |||
| b6ab54f6dd | |||
| 6a96c98d04 | |||
| 3cfc2a6854 | |||
| 5d7a02e796 | |||
| a46d2f4546 | |||
| c88ebd1aa5 | |||
| fd102292c6 |
2
Makefile
2
Makefile
@@ -8,6 +8,6 @@ vet: fmt
|
|||||||
buildLib: vet
|
buildLib: vet
|
||||||
go build -gcflags="-N -l" ./...
|
go build -gcflags="-N -l" ./...
|
||||||
buildCmd: buildLib
|
buildCmd: buildLib
|
||||||
go build -C cmd/ -o re ./...
|
go build -C cmd/ -gcflags="-N -l" -o re ./...
|
||||||
test: buildCmd
|
test: buildCmd
|
||||||
go test -v ./...
|
go test -v ./...
|
||||||
|
|||||||
@@ -949,7 +949,9 @@ func thompson(re []postfixNode) (Reg, error) {
|
|||||||
// and added back in.
|
// and added back in.
|
||||||
// If the middle node doesn't exist (ie. something like '()' ), that's fine, I just connect the LPAREN
|
// If the middle node doesn't exist (ie. something like '()' ), that's fine, I just connect the LPAREN
|
||||||
// and RPAREN nodes.
|
// and RPAREN nodes.
|
||||||
// If neither node exists, that's a problem so I return an error.
|
// If the middle node exists but is itself the start of a group, then that _must_ be the opening paren for
|
||||||
|
// the closing paren that I'm on. I put the third node back (because it isn't involved in the capturing group), then
|
||||||
|
// I concatenate those two and add them. If neither node exists, that's a problem so I return an error.
|
||||||
if c.nodetype == rparenNode {
|
if c.nodetype == rparenNode {
|
||||||
s.groupEnd = true
|
s.groupEnd = true
|
||||||
middleNode, err1 := pop(&nfa)
|
middleNode, err1 := pop(&nfa)
|
||||||
@@ -964,6 +966,11 @@ func thompson(re []postfixNode) (Reg, error) {
|
|||||||
s.groupNum = lparenNode.groupNum
|
s.groupNum = lparenNode.groupNum
|
||||||
to_add := concatenate(lparenNode, s)
|
to_add := concatenate(lparenNode, s)
|
||||||
nfa = append(nfa, to_add)
|
nfa = append(nfa, to_add)
|
||||||
|
} else if middleNode.groupBegin && len(middleNode.transitions) == 0 { // The middle node is a lone lparen - something like '(())', and I'm looking at the first rparen
|
||||||
|
nfa = append(nfa, lparenNode) // I shouldn't have popped this out, because it is not involved in the current capturing group
|
||||||
|
s.groupNum = middleNode.groupNum // In this case, the 'middle' node is actually an lparen
|
||||||
|
to_add := concatenate(middleNode, s)
|
||||||
|
nfa = append(nfa, to_add)
|
||||||
} else {
|
} else {
|
||||||
// At this point, we assume all three nodes are valid ('lparenNode', 'middleNode' and 's')
|
// At this point, we assume all three nodes are valid ('lparenNode', 'middleNode' and 's')
|
||||||
if lparenNode.groupBegin {
|
if lparenNode.groupBegin {
|
||||||
|
|||||||
@@ -114,7 +114,11 @@ Another, more subtle example is the following regex:
|
|||||||
x|xx
|
x|xx
|
||||||
|
|
||||||
While the stdlib implementation (and most other engines) will prefer matching the first item of the alternation,
|
While the stdlib implementation (and most other engines) will prefer matching the first item of the alternation,
|
||||||
this engine will _always_ go for the longest possible match, regardless of the order of the alternation.
|
this engine will go for the longest possible match, regardless of the order of the alternation. Although this
|
||||||
|
strays from the convention, it results in a nice rule-of-thumb - the engine is ALWAYS greedy.
|
||||||
|
|
||||||
|
The stdlib implementation has a function [regexp.Regexp.Longest] which makes future searches prefer the longest match.
|
||||||
|
That is the default (and unchangeable) behavior in this engine.
|
||||||
|
|
||||||
2. Byte-slices and runes:
|
2. Byte-slices and runes:
|
||||||
|
|
||||||
|
|||||||
@@ -30,3 +30,25 @@ func ExampleReg_FindAll() {
|
|||||||
// 3 4
|
// 3 4
|
||||||
// 5 6
|
// 5 6
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func ExampleReg_FindString() {
|
||||||
|
regexStr := `\d+`
|
||||||
|
regexComp := regex.MustCompile(regexStr)
|
||||||
|
|
||||||
|
matchStr := regexComp.FindString("The year of our lord, 2025")
|
||||||
|
fmt.Println(matchStr)
|
||||||
|
// Output: 2025
|
||||||
|
}
|
||||||
|
|
||||||
|
func ExampleReg_FindSubmatch() {
|
||||||
|
regexStr := `(\d)\.(\d)(\d)`
|
||||||
|
regexComp := regex.MustCompile(regexStr)
|
||||||
|
|
||||||
|
match, _ := regexComp.FindSubmatch("3.14")
|
||||||
|
fmt.Println(match[0])
|
||||||
|
fmt.Println(match[1])
|
||||||
|
fmt.Println(match[2])
|
||||||
|
// Output: 0 4
|
||||||
|
// 0 1
|
||||||
|
// 2 3
|
||||||
|
}
|
||||||
|
|||||||
@@ -5,7 +5,13 @@ import (
|
|||||||
"sort"
|
"sort"
|
||||||
)
|
)
|
||||||
|
|
||||||
// a Match stores a slice of all the capturing groups in a match.
|
// A Match represents a match found by the regex in a given string.
|
||||||
|
// It is represented as a list of groups, where the nth element contains
|
||||||
|
// the contents of the nth capturing group. Note that the group may not be valid
|
||||||
|
// (see [Group.IsValid]). The element at index 0 is known
|
||||||
|
// as the 0-group, and represents the contents of the entire match.
|
||||||
|
//
|
||||||
|
// See [Reg.FindSubmatch] for an example.
|
||||||
type Match []Group
|
type Match []Group
|
||||||
|
|
||||||
// a Group represents a group. It contains the start index and end index of the match
|
// a Group represents a group. It contains the start index and end index of the match
|
||||||
@@ -107,7 +113,7 @@ func zeroMatchPossible(str []rune, idx int, numGroups int, states ...*nfaState)
|
|||||||
num_appended := 0 // number of unique states addded to tempstates
|
num_appended := 0 // number of unique states addded to tempstates
|
||||||
for isZero == true {
|
for isZero == true {
|
||||||
zeroStates, isZero = takeZeroState(tempstates, numGroups, idx)
|
zeroStates, isZero = takeZeroState(tempstates, numGroups, idx)
|
||||||
tempstates, num_appended = unique_append(tempstates, zeroStates...)
|
tempstates, num_appended = uniqueAppend(tempstates, zeroStates...)
|
||||||
if num_appended == 0 { // break if we haven't appended any more unique values
|
if num_appended == 0 { // break if we haven't appended any more unique values
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
@@ -252,6 +258,7 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
|
|||||||
// The second value here shouldn't be used, because we should exit when the third return value is > than len(str)
|
// The second value here shouldn't be used, because we should exit when the third return value is > than len(str)
|
||||||
return false, []Group{}, offset
|
return false, []Group{}, offset
|
||||||
}
|
}
|
||||||
|
resetThreads(start)
|
||||||
|
|
||||||
// Hold a list of match indices for the current run. When we
|
// Hold a list of match indices for the current run. When we
|
||||||
// can no longer find a match, the match with the largest range is
|
// can no longer find a match, the match with the largest range is
|
||||||
@@ -307,13 +314,13 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
|
|||||||
num_appended := 0
|
num_appended := 0
|
||||||
for isZero == true {
|
for isZero == true {
|
||||||
zeroStates, isZero = takeZeroState(tempStates, numGroups, i)
|
zeroStates, isZero = takeZeroState(tempStates, numGroups, i)
|
||||||
tempStates, num_appended = unique_append(tempStates, zeroStates...)
|
tempStates, num_appended = uniqueAppend(tempStates, zeroStates...)
|
||||||
if num_appended == 0 { // Break if we haven't appended any more unique values
|
if num_appended == 0 { // Break if we haven't appended any more unique values
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
currentStates, _ = unique_append(currentStates, tempStates...)
|
currentStates, _ = uniqueAppend(currentStates, tempStates...)
|
||||||
tempStates = nil
|
tempStates = nil
|
||||||
|
|
||||||
// Take any transitions corresponding to current character
|
// Take any transitions corresponding to current character
|
||||||
@@ -420,7 +427,7 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
|
|||||||
num_appended := 0 // Number of unique states addded to tempStates
|
num_appended := 0 // Number of unique states addded to tempStates
|
||||||
for isZero == true {
|
for isZero == true {
|
||||||
zeroStates, isZero = takeZeroState(tempStates, numGroups, i)
|
zeroStates, isZero = takeZeroState(tempStates, numGroups, i)
|
||||||
tempStates, num_appended = unique_append(tempStates, zeroStates...)
|
tempStates, num_appended = uniqueAppend(tempStates, zeroStates...)
|
||||||
if num_appended == 0 { // Break if we haven't appended any more unique values
|
if num_appended == 0 { // Break if we haven't appended any more unique values
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -50,7 +50,7 @@ func isNormalChar(c rune) bool {
|
|||||||
|
|
||||||
// Ensure that the given elements are only appended to the given slice if they
|
// Ensure that the given elements are only appended to the given slice if they
|
||||||
// don't already exist. Returns the new slice, and the number of unique items appended.
|
// don't already exist. Returns the new slice, and the number of unique items appended.
|
||||||
func unique_append[T comparable](slc []T, items ...T) ([]T, int) {
|
func uniqueAppend[T comparable](slc []T, items ...T) ([]T, int) {
|
||||||
num_appended := 0
|
num_appended := 0
|
||||||
for _, item := range items {
|
for _, item := range items {
|
||||||
if !slices.Contains(slc, item) {
|
if !slices.Contains(slc, item) {
|
||||||
@@ -61,6 +61,25 @@ func unique_append[T comparable](slc []T, items ...T) ([]T, int) {
|
|||||||
return slc, num_appended
|
return slc, num_appended
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// uniqueAppendFunc returns a copy of slc extended with each element of items
// that is not already present, using fn as the equality test. fn is called as
// fn(item, existing) and must report whether the two values are equal.
//
// Each item is checked against the slice being built, not just the original
// slc, so a value repeated within items is appended only once. The input
// slice is never modified. The second return value is the number of elements
// that were appended.
func uniqueAppendFunc[T any](slc []T, fn func(T, T) bool, items ...T) ([]T, int) {
	// Reserve room for the worst case (every item is new) so that the appends
	// below never reallocate.
	toRet := make([]T, len(slc), len(slc)+len(items))
	copy(toRet, slc)

	numAppended := 0
	for _, item := range items {
		// ContainsFunc stops at the first hit, and scanning toRet (rather than
		// slc) also catches items appended earlier in this same call.
		alreadyPresent := slices.ContainsFunc(toRet, func(existing T) bool {
			return fn(item, existing)
		})
		if alreadyPresent {
			continue
		}
		toRet = append(toRet, item)
		numAppended++
	}
	return toRet, numAppended
}
|
||||||
|
|
||||||
// Returns true only if all the given elements are equal
|
// Returns true only if all the given elements are equal
|
||||||
func allEqual[T comparable](items ...T) bool {
|
func allEqual[T comparable](items ...T) bool {
|
||||||
first := items[0]
|
first := items[0]
|
||||||
|
|||||||
30
regex/nfa.go
30
regex/nfa.go
@@ -104,6 +104,26 @@ func cloneStateHelper(stateToClone *nfaState, cloneMap map[*nfaState]*nfaState)
|
|||||||
return clone
|
return clone
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Reset any thread-related fields of the NFA starting from the given state.
|
||||||
|
func resetThreads(start *nfaState) {
|
||||||
|
visitedMap := make(map[*nfaState]bool) // The value type doesn't matter here
|
||||||
|
resetThreadsHelper(start, visitedMap)
|
||||||
|
}
|
||||||
|
|
||||||
|
func resetThreadsHelper(state *nfaState, visitedMap map[*nfaState]bool) {
|
||||||
|
if _, ok := visitedMap[state]; ok {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
// Assuming it hasn't been visited
|
||||||
|
state.threadGroups = nil
|
||||||
|
visitedMap[state] = true
|
||||||
|
for _, v := range state.transitions {
|
||||||
|
for _, nextState := range v {
|
||||||
|
resetThreadsHelper(nextState, visitedMap)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Checks if the given state's assertion is true. Returns true if the given
|
// Checks if the given state's assertion is true. Returns true if the given
|
||||||
// state doesn't have an assertion.
|
// state doesn't have an assertion.
|
||||||
func (s nfaState) checkAssertion(str []rune, idx int) bool {
|
func (s nfaState) checkAssertion(str []rune, idx int) bool {
|
||||||
@@ -274,7 +294,7 @@ func concatenate(s1 *nfaState, s2 *nfaState) *nfaState {
|
|||||||
}
|
}
|
||||||
for i := range s1.output {
|
for i := range s1.output {
|
||||||
for _, c := range s2.content { // Create transitions for every element in s1's content to s2'
|
for _, c := range s2.content { // Create transitions for every element in s1's content to s2'
|
||||||
s1.output[i].transitions[c], _ = unique_append(s1.output[i].transitions[c], s2)
|
s1.output[i].transitions[c], _ = uniqueAppend(s1.output[i].transitions[c], s2)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
s1.output = s2.output
|
s1.output = s2.output
|
||||||
@@ -294,11 +314,11 @@ func kleene(s1 nfaState) (*nfaState, error) {
|
|||||||
toReturn.output = append(toReturn.output, toReturn)
|
toReturn.output = append(toReturn.output, toReturn)
|
||||||
for i := range s1.output {
|
for i := range s1.output {
|
||||||
for _, c := range toReturn.content {
|
for _, c := range toReturn.content {
|
||||||
s1.output[i].transitions[c], _ = unique_append(s1.output[i].transitions[c], toReturn)
|
s1.output[i].transitions[c], _ = uniqueAppend(s1.output[i].transitions[c], toReturn)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for _, c := range s1.content {
|
for _, c := range s1.content {
|
||||||
toReturn.transitions[c], _ = unique_append(toReturn.transitions[c], &s1)
|
toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], &s1)
|
||||||
}
|
}
|
||||||
return toReturn, nil
|
return toReturn, nil
|
||||||
}
|
}
|
||||||
@@ -314,10 +334,10 @@ func alternate(s1 *nfaState, s2 *nfaState) *nfaState {
|
|||||||
// This would lead to multiple instances of the same set of match indices, since both
|
// This would lead to multiple instances of the same set of match indices, since both
|
||||||
// 's1' states would be considered to match.
|
// 's1' states would be considered to match.
|
||||||
for _, c := range s1.content {
|
for _, c := range s1.content {
|
||||||
toReturn.transitions[c], _ = unique_append(toReturn.transitions[c], s1)
|
toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], s1)
|
||||||
}
|
}
|
||||||
for _, c := range s2.content {
|
for _, c := range s2.content {
|
||||||
toReturn.transitions[c], _ = unique_append(toReturn.transitions[c], s2)
|
toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], s2)
|
||||||
}
|
}
|
||||||
toReturn.content = newContents(epsilon)
|
toReturn.content = newContents(epsilon)
|
||||||
toReturn.isEmpty = true
|
toReturn.isEmpty = true
|
||||||
|
|||||||
@@ -3,7 +3,9 @@ package regex
|
|||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
"math"
|
"math"
|
||||||
|
"slices"
|
||||||
"strconv"
|
"strconv"
|
||||||
|
"strings"
|
||||||
)
|
)
|
||||||
|
|
||||||
type numRange struct {
|
type numRange struct {
|
||||||
@@ -99,13 +101,11 @@ func range2regex(start int, end int) (string, error) {
|
|||||||
// Last range - tmp to rangeEnd
|
// Last range - tmp to rangeEnd
|
||||||
ranges = append(ranges, numRange{tmp, rangeEnd})
|
ranges = append(ranges, numRange{tmp, rangeEnd})
|
||||||
|
|
||||||
regex := string(nonCapLparenRune)
|
regexSlice := make([]string, 0)
|
||||||
// Generate the regex
|
// Generate the regex
|
||||||
for i, rg := range ranges {
|
for _, rg := range ranges {
|
||||||
if i > 0 {
|
tmpStr := ""
|
||||||
regex += "|"
|
tmpStr += string(nonCapLparenRune)
|
||||||
}
|
|
||||||
regex += string(nonCapLparenRune)
|
|
||||||
startSlc := intToSlc(rg.start)
|
startSlc := intToSlc(rg.start)
|
||||||
endSlc := intToSlc(rg.end)
|
endSlc := intToSlc(rg.end)
|
||||||
if len(startSlc) != len(endSlc) {
|
if len(startSlc) != len(endSlc) {
|
||||||
@@ -113,14 +113,27 @@ func range2regex(start int, end int) (string, error) {
|
|||||||
}
|
}
|
||||||
for i := range startSlc {
|
for i := range startSlc {
|
||||||
if startSlc[i] == endSlc[i] {
|
if startSlc[i] == endSlc[i] {
|
||||||
regex += string(rune(startSlc[i] + 48)) // '0' is ascii value 48, 1 is 49 etc. To convert the digit to its character form, we can just add 48.
|
tmpStr += string(rune(startSlc[i] + 48)) // '0' is ascii value 48, 1 is 49 etc. To convert the digit to its character form, we can just add 48.
|
||||||
} else {
|
} else {
|
||||||
regex += fmt.Sprintf("%c%c-%c%c", lbracketRune, rune(startSlc[i]+48), rune(endSlc[i]+48), rbracketRune)
|
tmpStr += fmt.Sprintf("%c%c-%c%c", lbracketRune, rune(startSlc[i]+48), rune(endSlc[i]+48), rbracketRune)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
regex += ")"
|
tmpStr += ")"
|
||||||
|
regexSlice = append(regexSlice, tmpStr)
|
||||||
}
|
}
|
||||||
regex += ")"
|
// Each element of the slice represents one 'group'. Taking 0-255 as an example, the elements would be:
|
||||||
|
// 1. 0-9
|
||||||
|
// 2. 10-99
|
||||||
|
// 3. 100-199
|
||||||
|
// 4. 200-249
|
||||||
|
// 5. 250-255
|
||||||
|
//
|
||||||
|
// The reason this is reversed before joining it, is because it is incompatible with the PCRE rule for matching.
|
||||||
|
// The PCRE rule specifies that the left-branch of an alternation is preferred. Even though this engine uses the POSIX
|
||||||
|
// rule at the moment (which prefers the longest match regardless of the order of the alternation), reversing the string
|
||||||
|
// has no downsides. It doesn't affect POSIX matching, and it will reduce my burden if I decide to switch to PCRE matching.
|
||||||
|
slices.Reverse(regexSlice)
|
||||||
|
regex := string(nonCapLparenRune) + strings.Join(regexSlice, "|") + ")"
|
||||||
return regex, nil
|
return regex, nil
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -105,6 +105,9 @@ var reTests = []struct {
|
|||||||
{"(a|b){3,4}", nil, "ababaa", []Group{{0, 4}}},
|
{"(a|b){3,4}", nil, "ababaa", []Group{{0, 4}}},
|
||||||
{"(bc){5,}", nil, "bcbcbcbcbcbcbcbc", []Group{{0, 16}}},
|
{"(bc){5,}", nil, "bcbcbcbcbcbcbcbc", []Group{{0, 16}}},
|
||||||
{`\d{3,4}`, nil, "1209", []Group{{0, 4}}},
|
{`\d{3,4}`, nil, "1209", []Group{{0, 4}}},
|
||||||
|
{`\d{3,4}`, nil, "120", []Group{{0, 3}}},
|
||||||
|
{`\d{3,4}`, nil, "12709", []Group{{0, 4}}},
|
||||||
|
{`\d{3,4}`, nil, "12", []Group{}},
|
||||||
{`\d{3,4}`, nil, "109", []Group{{0, 3}}},
|
{`\d{3,4}`, nil, "109", []Group{{0, 3}}},
|
||||||
{`\d{3,4}`, nil, "5", []Group{}},
|
{`\d{3,4}`, nil, "5", []Group{}},
|
||||||
{`\d{3,4}`, nil, "123135", []Group{{0, 4}}},
|
{`\d{3,4}`, nil, "123135", []Group{{0, 4}}},
|
||||||
@@ -671,6 +674,17 @@ var groupTests = []struct {
|
|||||||
{`^([ab]*)(?<!(a))c`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 2}}}},
|
{`^([ab]*)(?<!(a))c`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 2}}}},
|
||||||
|
|
||||||
{`(<389-400>)`, nil, `391`, []Match{[]Group{{0, 3}, {0, 3}}}},
|
{`(<389-400>)`, nil, `391`, []Match{[]Group{{0, 3}, {0, 3}}}},
|
||||||
|
|
||||||
|
// // Tests from https://wiki.haskell.org/Regex_Posix
|
||||||
|
// {`(()|.)(b)`, nil, `ab`, []Match{[]Group{{0, 2}, {0, 1}, {-1, -1}, {1, 2}}}},
|
||||||
|
// {`(()|[ab])(b)`, nil, `ab`, []Match{[]Group{{0, 2}, {0, 1}, {-1, -1}, {1, 2}}}},
|
||||||
|
// {`(()|[ab])+b`, nil, `aaab`, []Match{[]Group{{0, 4}, {2, 3}, {-1, -1}}}},
|
||||||
|
// {`([ab]|())+b`, nil, `aaab`, []Match{[]Group{{0, 4}, {2, 3}, {-1, -1}}}},
|
||||||
|
// // Bug - this should give {0,6},{3,6},{-1,-1} but it gives {0,6},{3,6},{3,3}
|
||||||
|
// // {`yyyyyy`, nil, `(yyy|(x?)){2,4}`, []Match{[]Group{{0, 6}, {3, 6}, {-1, -1}}, []Group{{6, 6}, {6, 6}, {6, 6}}}},
|
||||||
|
// {`(a|ab|c|bcd)*(d*)`, nil, `ababcd`, []Match{[]Group{{0, 6}, {3, 6}, {6, 6}}, []Group{{6, 6}, {6, 6}, {6, 6}}}},
|
||||||
|
// // Bug - this should give {0,3},{0,3},{0,0},{0,3},{3,3} but it gives {0,3},{0,2},{0,1},{1,2},{2,3}
|
||||||
|
// // {`((a*)(b|abc))(c*)`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 3}, {0, 0}, {0, 3}, {3, 3}}}},
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestFind(t *testing.T) {
|
func TestFind(t *testing.T) {
|
||||||
|
|||||||
Reference in New Issue
Block a user