13 Commits

9 changed files with 130 additions and 24 deletions

View File

@@ -8,6 +8,6 @@ vet: fmt
buildLib: vet
go build -gcflags="-N -l" ./...
buildCmd: buildLib
go build -C cmd/ -o re ./...
go build -C cmd/ -gcflags="-N -l" -o re ./...
test: buildCmd
go test -v ./...

View File

@@ -949,7 +949,9 @@ func thompson(re []postfixNode) (Reg, error) {
// and added back in.
// If the middle node doesn't exist (ie. something like '()' ), that's fine, I just connect the LPAREN
// and RPAREN nodes.
// If neither node exists, that's a problem so I return an error.
// If the middle node exists but is itself the start of a group, then that _must_ be the opening paren for
// the closing paren that I'm on. I put the third node back (because it isn't involved in the capturing group), then
// I concatenate those two and add them. If neither node exists, that's a problem so I return an error.
if c.nodetype == rparenNode {
s.groupEnd = true
middleNode, err1 := pop(&nfa)
@@ -964,6 +966,11 @@ func thompson(re []postfixNode) (Reg, error) {
s.groupNum = lparenNode.groupNum
to_add := concatenate(lparenNode, s)
nfa = append(nfa, to_add)
} else if middleNode.groupBegin && len(middleNode.transitions) == 0 { // The middle node is a lone lparen - something like '(())', and I'm looking at the first rparen
nfa = append(nfa, lparenNode) // I shouldn't have popped this out, because it is not involved in the current capturing group
s.groupNum = middleNode.groupNum // In this case, the 'middle' node is actually an lparen
to_add := concatenate(middleNode, s)
nfa = append(nfa, to_add)
} else {
// At this point, we assume all three nodes are valid ('lparenNode', 'middleNode' and 's')
if lparenNode.groupBegin {

View File

@@ -114,7 +114,11 @@ Another, more subtle example is the following regex:
x|xx
While the stdlib implementation (and most other engines) will prefer matching the first item of the alternation,
this engine will _always_ go for the longest possible match, regardless of the order of the alternation.
this engine will go for the longest possible match, regardless of the order of the alternation. Although this
strays from the convention, it results in a nice rule-of-thumb - the engine is ALWAYS greedy.
The stdlib implementation has a function [regexp.Regexp.Longest] which makes future searches prefer the longest match.
That is the default (and unchangable) behavior in this engine.
2. Byte-slices and runes:

View File

@@ -30,3 +30,25 @@ func ExampleReg_FindAll() {
// 3 4
// 5 6
}
func ExampleReg_FindString() {
regexStr := `\d+`
regexComp := regex.MustCompile(regexStr)
matchStr := regexComp.FindString("The year of our lord, 2025")
fmt.Println(matchStr)
// Output: 2025
}
func ExampleReg_FindSubmatch() {
regexStr := `(\d)\.(\d)(\d)`
regexComp := regex.MustCompile(regexStr)
match, _ := regexComp.FindSubmatch("3.14")
fmt.Println(match[0])
fmt.Println(match[1])
fmt.Println(match[2])
// Output: 0 4
// 0 1
// 2 3
}

View File

@@ -5,7 +5,13 @@ import (
"sort"
)
// a Match stores a slice of all the capturing groups in a match.
// A Match represents a match found by the regex in a given string.
// It is represented as a list of groups, where the nth element contains
// the contents of the nth capturing group. Note that the group may not be valid
// (see [Group.IsValid]). The element at index 0 is known
// as the 0-group, and represents the contents of the entire match.
//
// See [Reg.FindSubmatch] for an example.
type Match []Group
// a Group represents a group. It contains the start index and end index of the match
@@ -107,7 +113,7 @@ func zeroMatchPossible(str []rune, idx int, numGroups int, states ...*nfaState)
num_appended := 0 // number of unique states addded to tempstates
for isZero == true {
zeroStates, isZero = takeZeroState(tempstates, numGroups, idx)
tempstates, num_appended = unique_append(tempstates, zeroStates...)
tempstates, num_appended = uniqueAppend(tempstates, zeroStates...)
if num_appended == 0 { // break if we haven't appended any more unique values
break
}
@@ -252,6 +258,7 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
// The second value here shouldn't be used, because we should exit when the third return value is > than len(str)
return false, []Group{}, offset
}
resetThreads(start)
// Hold a list of match indices for the current run. When we
// can no longer find a match, the match with the largest range is
@@ -307,13 +314,13 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
num_appended := 0
for isZero == true {
zeroStates, isZero = takeZeroState(tempStates, numGroups, i)
tempStates, num_appended = unique_append(tempStates, zeroStates...)
tempStates, num_appended = uniqueAppend(tempStates, zeroStates...)
if num_appended == 0 { // Break if we haven't appended any more unique values
break
}
}
currentStates, _ = unique_append(currentStates, tempStates...)
currentStates, _ = uniqueAppend(currentStates, tempStates...)
tempStates = nil
// Take any transitions corresponding to current character
@@ -420,7 +427,7 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
num_appended := 0 // Number of unique states addded to tempStates
for isZero == true {
zeroStates, isZero = takeZeroState(tempStates, numGroups, i)
tempStates, num_appended = unique_append(tempStates, zeroStates...)
tempStates, num_appended = uniqueAppend(tempStates, zeroStates...)
if num_appended == 0 { // Break if we haven't appended any more unique values
break
}

View File

@@ -50,7 +50,7 @@ func isNormalChar(c rune) bool {
// Ensure that the given elements are only appended to the given slice if they
// don't already exist. Returns the new slice, and the number of unique items appended.
func unique_append[T comparable](slc []T, items ...T) ([]T, int) {
func uniqueAppend[T comparable](slc []T, items ...T) ([]T, int) {
num_appended := 0
for _, item := range items {
if !slices.Contains(slc, item) {
@@ -61,6 +61,25 @@ func unique_append[T comparable](slc []T, items ...T) ([]T, int) {
return slc, num_appended
}
func uniqueAppendFunc[T any](slc []T, fn func(T, T) bool, items ...T) ([]T, int) {
toRet := make([]T, len(slc))
num_appended := 0
copy(toRet, slc)
for _, item := range items {
itemExists := false
for _, val := range slc {
if fn(item, val) {
itemExists = true
}
}
if !itemExists {
toRet = append(toRet, item)
num_appended++
}
}
return toRet, num_appended
}
// Returns true only if all the given elements are equal
func allEqual[T comparable](items ...T) bool {
first := items[0]

View File

@@ -104,6 +104,26 @@ func cloneStateHelper(stateToClone *nfaState, cloneMap map[*nfaState]*nfaState)
return clone
}
// Reset any thread-related fields of the NFA starting from the given state.
func resetThreads(start *nfaState) {
visitedMap := make(map[*nfaState]bool) // The value type doesn't matter here
resetThreadsHelper(start, visitedMap)
}
func resetThreadsHelper(state *nfaState, visitedMap map[*nfaState]bool) {
if _, ok := visitedMap[state]; ok {
return
}
// Assuming it hasn't been visited
state.threadGroups = nil
visitedMap[state] = true
for _, v := range state.transitions {
for _, nextState := range v {
resetThreadsHelper(nextState, visitedMap)
}
}
}
// Checks if the given state's assertion is true. Returns true if the given
// state doesn't have an assertion.
func (s nfaState) checkAssertion(str []rune, idx int) bool {
@@ -274,7 +294,7 @@ func concatenate(s1 *nfaState, s2 *nfaState) *nfaState {
}
for i := range s1.output {
for _, c := range s2.content { // Create transitions for every element in s1's content to s2'
s1.output[i].transitions[c], _ = unique_append(s1.output[i].transitions[c], s2)
s1.output[i].transitions[c], _ = uniqueAppend(s1.output[i].transitions[c], s2)
}
}
s1.output = s2.output
@@ -294,11 +314,11 @@ func kleene(s1 nfaState) (*nfaState, error) {
toReturn.output = append(toReturn.output, toReturn)
for i := range s1.output {
for _, c := range toReturn.content {
s1.output[i].transitions[c], _ = unique_append(s1.output[i].transitions[c], toReturn)
s1.output[i].transitions[c], _ = uniqueAppend(s1.output[i].transitions[c], toReturn)
}
}
for _, c := range s1.content {
toReturn.transitions[c], _ = unique_append(toReturn.transitions[c], &s1)
toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], &s1)
}
return toReturn, nil
}
@@ -314,10 +334,10 @@ func alternate(s1 *nfaState, s2 *nfaState) *nfaState {
// This would lead to multiple instances of the same set of match indices, since both
// 's1' states would be considered to match.
for _, c := range s1.content {
toReturn.transitions[c], _ = unique_append(toReturn.transitions[c], s1)
toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], s1)
}
for _, c := range s2.content {
toReturn.transitions[c], _ = unique_append(toReturn.transitions[c], s2)
toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], s2)
}
toReturn.content = newContents(epsilon)
toReturn.isEmpty = true

View File

@@ -3,7 +3,9 @@ package regex
import (
"fmt"
"math"
"slices"
"strconv"
"strings"
)
type numRange struct {
@@ -99,13 +101,11 @@ func range2regex(start int, end int) (string, error) {
// Last range - tmp to rangeEnd
ranges = append(ranges, numRange{tmp, rangeEnd})
regex := string(nonCapLparenRune)
regexSlice := make([]string, 0)
// Generate the regex
for i, rg := range ranges {
if i > 0 {
regex += "|"
}
regex += string(nonCapLparenRune)
for _, rg := range ranges {
tmpStr := ""
tmpStr += string(nonCapLparenRune)
startSlc := intToSlc(rg.start)
endSlc := intToSlc(rg.end)
if len(startSlc) != len(endSlc) {
@@ -113,14 +113,27 @@ func range2regex(start int, end int) (string, error) {
}
for i := range startSlc {
if startSlc[i] == endSlc[i] {
regex += string(rune(startSlc[i] + 48)) // '0' is ascii value 48, 1 is 49 etc. To convert the digit to its character form, we can just add 48.
tmpStr += string(rune(startSlc[i] + 48)) // '0' is ascii value 48, 1 is 49 etc. To convert the digit to its character form, we can just add 48.
} else {
regex += fmt.Sprintf("%c%c-%c%c", lbracketRune, rune(startSlc[i]+48), rune(endSlc[i]+48), rbracketRune)
tmpStr += fmt.Sprintf("%c%c-%c%c", lbracketRune, rune(startSlc[i]+48), rune(endSlc[i]+48), rbracketRune)
}
}
regex += ")"
tmpStr += ")"
regexSlice = append(regexSlice, tmpStr)
}
regex += ")"
// Each element of the slice represents one 'group'. Taking 0-255 as an example, the elements would be:
// 1. 0-9
// 2. 10-99
// 3. 100-199
// 4. 200-249
// 5. 250-255
//
// The reason this is reversed before joining it, is because it is incompatible with the PCRE rule for matching.
// The PCRE rule specifies that the left-branch of an alternation is preferred. Even though this engine uses the POSIX
// rule at the moment (which prefers the longest match regardless of the order of the alternation), reversing the string
// has no downsides. It doesn't affect POSIX matching, and it will reduce my burden if I decide to switch to PCRE matching.
slices.Reverse(regexSlice)
regex := string(nonCapLparenRune) + strings.Join(regexSlice, "|") + ")"
return regex, nil
}

View File

@@ -105,6 +105,9 @@ var reTests = []struct {
{"(a|b){3,4}", nil, "ababaa", []Group{{0, 4}}},
{"(bc){5,}", nil, "bcbcbcbcbcbcbcbc", []Group{{0, 16}}},
{`\d{3,4}`, nil, "1209", []Group{{0, 4}}},
{`\d{3,4}`, nil, "120", []Group{{0, 3}}},
{`\d{3,4}`, nil, "12709", []Group{{0, 4}}},
{`\d{3,4}`, nil, "12", []Group{}},
{`\d{3,4}`, nil, "109", []Group{{0, 3}}},
{`\d{3,4}`, nil, "5", []Group{}},
{`\d{3,4}`, nil, "123135", []Group{{0, 4}}},
@@ -671,6 +674,17 @@ var groupTests = []struct {
{`^([ab]*)(?<!(a))c`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 2}}}},
{`(<389-400>)`, nil, `391`, []Match{[]Group{{0, 3}, {0, 3}}}},
// // Tests from https://wiki.haskell.org/Regex_Posix
// {`(()|.)(b)`, nil, `ab`, []Match{[]Group{{0, 2}, {0, 1}, {-1, -1}, {1, 2}}}},
// {`(()|[ab])(b)`, nil, `ab`, []Match{[]Group{{0, 2}, {0, 1}, {-1, -1}, {1, 2}}}},
// {`(()|[ab])+b`, nil, `aaab`, []Match{[]Group{{0, 4}, {2, 3}, {-1, -1}}}},
// {`([ab]|())+b`, nil, `aaab`, []Match{[]Group{{0, 4}, {2, 3}, {-1, -1}}}},
// // Bug - this should give {0,6},{3,6},{-1,-1} but it gives {0,6},{3,6},{3,3}
// // {`yyyyyy`, nil, `(yyy|(x?)){2,4}`, []Match{[]Group{{0, 6}, {3, 6}, {-1, -1}}, []Group{{6, 6}, {6, 6}, {6, 6}}}},
// {`(a|ab|c|bcd)*(d*)`, nil, `ababcd`, []Match{[]Group{{0, 6}, {3, 6}, {6, 6}}, []Group{{6, 6}, {6, 6}, {6, 6}}}},
// // Bug - this should give {0,3},{0,3},{0,0},{0,3},{3,3} but it gives {0,3},{0,2},{0,1},{1,2},{2,3}
// // {`((a*)(b|abc))(c*)`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 3}, {0, 0}, {0, 3}, {3, 3}}}},
}
func TestFind(t *testing.T) {