Reverse the order of the numeric range before adding it, to maintain compatibility with PCRE matching rules (even though I don't implement them, if I do in the future)

Added more tests; commented out tests that I am failing
Added comments
2025-02-02 13:46:48 -05:00 · 2025-02-02 13:46:08 -05:00 · 2025-02-02 12:44:06 -05:00 · 2025-02-02 12:43:40 -05:00 · 2025-02-02 12:42:38 -05:00 · 2025-02-02 12:42:29 -05:00
9 changed files with 130 additions and 24 deletions
--- a/2
+++ b/2
@@ -8,6 +8,6 @@ vet: fmt
 buildLib: vet
 	go build -gcflags="-N -l" ./...
 buildCmd: buildLib
-	go build -C cmd/ -o re ./...
+	go build -C cmd/ -gcflags="-N -l" -o re ./...
 test: buildCmd
 	go test -v ./...
--- a/regex/compile.go
+++ b/regex/compile.go
@@ -949,7 +949,9 @@ func thompson(re []postfixNode) (Reg, error) {
 			// and added back in.
 			// If the middle node doesn't exist (ie. something like '()' ), that's fine, I just connect the LPAREN
 			// and RPAREN nodes.
-			// If neither node exists, that's a problem so I return an error.
+			// If the middle node exists but is itself the start of a group, then that _must_ be the opening paren for
+			// the closing paren that I'm on. I put the third node back (because it isn't involved in the capturing group), then
+			// I concatenate those two and add them. If neither node exists, that's a problem so I return an error.
 			if c.nodetype == rparenNode {
 				s.groupEnd = true
 				middleNode, err1 := pop(&nfa)
@@ -964,6 +966,11 @@ func thompson(re []postfixNode) (Reg, error) {
 					s.groupNum = lparenNode.groupNum
 					to_add := concatenate(lparenNode, s)
 					nfa = append(nfa, to_add)
+				} else if middleNode.groupBegin && len(middleNode.transitions) == 0 { // The middle node is a lone lparen - something like '(())', and I'm looking at the first rparen
+					nfa = append(nfa, lparenNode)    // I shouldn't have popped this out, because it is not involved in the current capturing group
+					s.groupNum = middleNode.groupNum // In this case, the 'middle' node is actually an lparen
+					to_add := concatenate(middleNode, s)
+					nfa = append(nfa, to_add)
 				} else {
 					// At this point, we assume all three nodes are valid ('lparenNode', 'middleNode' and 's')
 					if lparenNode.groupBegin {
--- a/regex/doc.go
+++ b/regex/doc.go
@@ -114,7 +114,11 @@ Another, more subtle example is the following regex:
 	x|xx

 While the stdlib implementation (and most other engines) will prefer matching the first item of the alternation,
-this engine will _always_ go for the longest possible match, regardless of the order of the alternation.
+this engine will go for the longest possible match, regardless of the order of the alternation. Although this
+strays from the convention, it results in a nice rule-of-thumb - the engine is ALWAYS greedy.
+
+The stdlib implementation has a function [regexp.Regexp.Longest] which makes future searches prefer the longest match.
+That is the default (and unchangeable) behavior in this engine.

 2. Byte-slices and runes:

--- a/regex/example_test.go
+++ b/regex/example_test.go
@@ -30,3 +30,25 @@ func ExampleReg_FindAll() {
 	// 3	4
 	// 5	6
 }
+
+func ExampleReg_FindString() {
+	regexStr := `\d+` // One or more digits
+	regexComp := regex.MustCompile(regexStr)
+
+	matchStr := regexComp.FindString("The year of our lord, 2025") // The only digits in the input are "2025"
+	fmt.Println(matchStr)
+	// Output: 2025
+}
+
+func ExampleReg_FindSubmatch() {
+	regexStr := `(\d)\.(\d)(\d)` // Three capturing groups, each around a single digit
+	regexComp := regex.MustCompile(regexStr)
+
+	match, _ := regexComp.FindSubmatch("3.14") // The second return value is discarded in this example
+	fmt.Println(match[0]) // The 0-group: the entire match
+	fmt.Println(match[1]) // The first capturing group
+	fmt.Println(match[2])
+	// Output: 0	4
+	// 0	1
+	// 2	3
+}
--- a/regex/matching.go
+++ b/regex/matching.go
@@ -5,7 +5,13 @@ import (
 	"sort"
 )

-// a Match stores a slice of all the capturing groups in a match.
+// A Match represents a match found by the regex in a given string.
+// It is represented as a list of groups, where the nth element contains
+// the contents of the nth capturing group. Note that the group may not be valid
+// (see [Group.IsValid]). The element at index 0 is known
+// as the 0-group, and represents the contents of the entire match.
+//
+// See [Reg.FindSubmatch] for an example.
 type Match []Group

 // a Group represents a group. It contains the start index and end index of the match
@@ -107,7 +113,7 @@ func zeroMatchPossible(str []rune, idx int, numGroups int, states ...*nfaState)
 	num_appended := 0 // number of unique states added to tempstates
 	for isZero == true {
 		zeroStates, isZero = takeZeroState(tempstates, numGroups, idx)
-		tempstates, num_appended = unique_append(tempstates, zeroStates...)
+		tempstates, num_appended = uniqueAppend(tempstates, zeroStates...)
 		if num_appended == 0 { // break if we haven't appended any more unique values
 			break
 		}
@@ -252,6 +258,7 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
 		// The second value here shouldn't be used, because we should exit when the third return value is > than len(str)
 		return false, []Group{}, offset
 	}
+	resetThreads(start)

 	// Hold a list of match indices for the current run. When we
 	// can no longer find a match, the match with the largest range is
@@ -307,13 +314,13 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
 		num_appended := 0
 		for isZero == true {
 			zeroStates, isZero = takeZeroState(tempStates, numGroups, i)
-			tempStates, num_appended = unique_append(tempStates, zeroStates...)
+			tempStates, num_appended = uniqueAppend(tempStates, zeroStates...)
 			if num_appended == 0 { // Break if we haven't appended any more unique values
 				break
 			}
 		}

-		currentStates, _ = unique_append(currentStates, tempStates...)
+		currentStates, _ = uniqueAppend(currentStates, tempStates...)
 		tempStates = nil

 		// Take any transitions corresponding to current character
@@ -420,7 +427,7 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
 	num_appended := 0 // Number of unique states added to tempStates
 	for isZero == true {
 		zeroStates, isZero = takeZeroState(tempStates, numGroups, i)
-		tempStates, num_appended = unique_append(tempStates, zeroStates...)
+		tempStates, num_appended = uniqueAppend(tempStates, zeroStates...)
 		if num_appended == 0 { // Break if we haven't appended any more unique values
 			break
 		}
--- a/regex/misc.go
+++ b/regex/misc.go
@@ -50,7 +50,7 @@ func isNormalChar(c rune) bool {

 // Ensure that the given elements are only appended to the given slice if they
 // don't already exist. Returns the new slice, and the number of unique items appended.
-func unique_append[T comparable](slc []T, items ...T) ([]T, int) {
+func uniqueAppend[T comparable](slc []T, items ...T) ([]T, int) {
 	num_appended := 0
 	for _, item := range items {
 		if !slices.Contains(slc, item) {
@@ -61,6 +61,25 @@ func unique_append[T comparable](slc []T, items ...T) ([]T, int) {
 	return slc, num_appended
 }

+// uniqueAppendFunc is like uniqueAppend, but uses 'fn' (rather than ==) to decide whether two elements are equal.
+// It returns a copy of 'slc' followed by every item that has no equal element already in the result, and the
+// number of items appended. 'slc' itself is never modified.
+func uniqueAppendFunc[T any](slc []T, fn func(T, T) bool, items ...T) ([]T, int) {
+	toRet := make([]T, len(slc))
+	num_appended := 0
+	copy(toRet, slc)
+	for _, item := range items {
+		// Compare against 'toRet' rather than 'slc', so that an item equal to one appended earlier in this
+		// call is also rejected (checking only 'slc' would let duplicates within 'items' through).
+		itemExists := slices.ContainsFunc(toRet, func(val T) bool { return fn(item, val) })
+		if !itemExists {
+			toRet = append(toRet, item)
+			num_appended++
+		}
+	}
+	return toRet, num_appended
+}
+
 // Returns true only if all the given elements are equal
 func allEqual[T comparable](items ...T) bool {
 	first := items[0]
--- a/regex/nfa.go
+++ b/regex/nfa.go
@@ -104,6 +104,26 @@ func cloneStateHelper(stateToClone *nfaState, cloneMap map[*nfaState]*nfaState)
 	return clone
 }

+// resetThreads clears the thread-related field (threadGroups) of every state reachable from the given state via its transitions.
+func resetThreads(start *nfaState) {
+	visitedMap := make(map[*nfaState]bool) // Used as a set of already-reset states; only key presence is checked, so the value type doesn't matter
+	resetThreadsHelper(start, visitedMap)
+}
+
+func resetThreadsHelper(state *nfaState, visitedMap map[*nfaState]bool) { // Recursive worker for resetThreads
+	if _, ok := visitedMap[state]; ok { // Already reset - returning here also keeps transitions that loop back from recursing forever
+		return
+	}
+	// First visit: clear this state's thread data, and mark it as visited before recursing into its neighbors
+	state.threadGroups = nil
+	visitedMap[state] = true
+	for _, v := range state.transitions { // 'v' is the list of states stored under one transition key
+		for _, nextState := range v {
+			resetThreadsHelper(nextState, visitedMap)
+		}
+	}
+}
+
 // Checks if the given state's assertion is true. Returns true if the given
 // state doesn't have an assertion.
 func (s nfaState) checkAssertion(str []rune, idx int) bool {
@@ -274,7 +294,7 @@ func concatenate(s1 *nfaState, s2 *nfaState) *nfaState {
 	}
 	for i := range s1.output {
 		for _, c := range s2.content { // Create transitions for every element in s1's content to s2'
-			s1.output[i].transitions[c], _ = unique_append(s1.output[i].transitions[c], s2)
+			s1.output[i].transitions[c], _ = uniqueAppend(s1.output[i].transitions[c], s2)
 		}
 	}
 	s1.output = s2.output
@@ -294,11 +314,11 @@ func kleene(s1 nfaState) (*nfaState, error) {
 	toReturn.output = append(toReturn.output, toReturn)
 	for i := range s1.output {
 		for _, c := range toReturn.content {
-			s1.output[i].transitions[c], _ = unique_append(s1.output[i].transitions[c], toReturn)
+			s1.output[i].transitions[c], _ = uniqueAppend(s1.output[i].transitions[c], toReturn)
 		}
 	}
 	for _, c := range s1.content {
-		toReturn.transitions[c], _ = unique_append(toReturn.transitions[c], &s1)
+		toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], &s1)
 	}
 	return toReturn, nil
 }
@@ -314,10 +334,10 @@ func alternate(s1 *nfaState, s2 *nfaState) *nfaState {
 	// This would lead to multiple instances of the same set of match indices, since both
 	// 's1' states would be considered to match.
 	for _, c := range s1.content {
-		toReturn.transitions[c], _ = unique_append(toReturn.transitions[c], s1)
+		toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], s1)
 	}
 	for _, c := range s2.content {
-		toReturn.transitions[c], _ = unique_append(toReturn.transitions[c], s2)
+		toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], s2)
 	}
 	toReturn.content = newContents(epsilon)
 	toReturn.isEmpty = true
--- a/regex/range2regex.go
+++ b/regex/range2regex.go
@@ -3,7 +3,9 @@ package regex
 import (
 	"fmt"
 	"math"
+	"slices"
 	"strconv"
+	"strings"
 )

 type numRange struct {
@@ -99,13 +101,11 @@ func range2regex(start int, end int) (string, error) {
 	// Last range - tmp to rangeEnd
 	ranges = append(ranges, numRange{tmp, rangeEnd})

-	regex := string(nonCapLparenRune)
+	regexSlice := make([]string, 0)
 	// Generate the regex
-	for i, rg := range ranges {
-		if i > 0 {
-			regex += "|"
-		}
-		regex += string(nonCapLparenRune)
+	for _, rg := range ranges {
+		tmpStr := ""
+		tmpStr += string(nonCapLparenRune)
 		startSlc := intToSlc(rg.start)
 		endSlc := intToSlc(rg.end)
 		if len(startSlc) != len(endSlc) {
@@ -113,14 +113,27 @@ func range2regex(start int, end int) (string, error) {
 		}
 		for i := range startSlc {
 			if startSlc[i] == endSlc[i] {
-				regex += string(rune(startSlc[i] + 48)) // '0' is ascii value 48, 1 is 49 etc. To convert the digit to its character form, we can just add 48.
+				tmpStr += string(rune(startSlc[i] + 48)) // '0' is ascii value 48, 1 is 49 etc. To convert the digit to its character form, we can just add 48.
 			} else {
-				regex += fmt.Sprintf("%c%c-%c%c", lbracketRune, rune(startSlc[i]+48), rune(endSlc[i]+48), rbracketRune)
+				tmpStr += fmt.Sprintf("%c%c-%c%c", lbracketRune, rune(startSlc[i]+48), rune(endSlc[i]+48), rbracketRune)
 			}
 		}
-		regex += ")"
+		tmpStr += ")"
+		regexSlice = append(regexSlice, tmpStr)
 	}
-	regex += ")"
+	// Each element of the slice represents one 'group'. Taking 0-255 as an example, the elements would be:
+	// 	1. 0-9
+	// 	2. 10-99
+	// 	3. 100-199
+	// 	4. 200-249
+	// 	5. 250-255
+	//
+	// The slice is reversed before joining, because the ascending order (shortest alternatives first) is incompatible with the
+	// PCRE rule for matching. The PCRE rule specifies that the left-branch of an alternation is preferred. Even though this engine
+	// uses the POSIX rule at the moment (which prefers the longest match regardless of the order of the alternation), reversing the
+	// slice has no downsides. It doesn't affect POSIX matching, and it will reduce my burden if I decide to switch to PCRE matching.
+	slices.Reverse(regexSlice)
+	regex := string(nonCapLparenRune) + strings.Join(regexSlice, "|") + ")"
 	return regex, nil

 }
--- a/regex/re_test.go
+++ b/regex/re_test.go
@@ -105,6 +105,9 @@ var reTests = []struct {
 	{"(a|b){3,4}", nil, "ababaa", []Group{{0, 4}}},
 	{"(bc){5,}", nil, "bcbcbcbcbcbcbcbc", []Group{{0, 16}}},
 	{`\d{3,4}`, nil, "1209", []Group{{0, 4}}},
+	{`\d{3,4}`, nil, "120", []Group{{0, 3}}},
+	{`\d{3,4}`, nil, "12709", []Group{{0, 4}}},
+	{`\d{3,4}`, nil, "12", []Group{}},
 	{`\d{3,4}`, nil, "109", []Group{{0, 3}}},
 	{`\d{3,4}`, nil, "5", []Group{}},
 	{`\d{3,4}`, nil, "123135", []Group{{0, 4}}},
@@ -671,6 +674,17 @@ var groupTests = []struct {
 	{`^([ab]*)(?<!(a))c`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 2}}}},

 	{`(<389-400>)`, nil, `391`, []Match{[]Group{{0, 3}, {0, 3}}}},
+
+	// // Tests from https://wiki.haskell.org/Regex_Posix
+	// {`(()|.)(b)`, nil, `ab`, []Match{[]Group{{0, 2}, {0, 1}, {-1, -1}, {1, 2}}}},
+	// {`(()|[ab])(b)`, nil, `ab`, []Match{[]Group{{0, 2}, {0, 1}, {-1, -1}, {1, 2}}}},
+	// {`(()|[ab])+b`, nil, `aaab`, []Match{[]Group{{0, 4}, {2, 3}, {-1, -1}}}},
+	// {`([ab]|())+b`, nil, `aaab`, []Match{[]Group{{0, 4}, {2, 3}, {-1, -1}}}},
+	// // Bug - this should give {0,6},{3,6},{-1,-1} but it gives {0,6},{3,6},{3,3}
+	// //	{`yyyyyy`, nil, `(yyy|(x?)){2,4}`, []Match{[]Group{{0, 6}, {3, 6}, {-1, -1}}, []Group{{6, 6}, {6, 6}, {6, 6}}}},
+	// {`(a|ab|c|bcd)*(d*)`, nil, `ababcd`, []Match{[]Group{{0, 6}, {3, 6}, {6, 6}}, []Group{{6, 6}, {6, 6}, {6, 6}}}},
+	// // Bug - this should give {0,3},{0,3},{0,0},{0,3},{3,3} but it gives {0,3},{0,2},{0,1},{1,2},{2,3}
+	// //	{`((a*)(b|abc))(c*)`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 3}, {0, 0}, {0, 3}, {3, 3}}}},
 }

 func TestFind(t *testing.T) {
Author	SHA1	Message	Date
Aadhavan Srinivasan	ef476e8875	Reverse the order of the numeric range before adding it, to maintain compatibility with PCRE matching rules (even though I don't implement them, if I do in the future)	2025-02-02 13:46:48 -05:00
Aadhavan Srinivasan	7e6b02632f	Added more tests; commented out tests that I am failing	2025-02-02 13:46:08 -05:00
Aadhavan Srinivasan	f94e3f2e71	Added comments	2025-02-02 12:44:06 -05:00
Aadhavan Srinivasan	b129d83c3f	Added function to reset threads	2025-02-02 12:43:40 -05:00
Aadhavan Srinivasan	43aa7b5876	Updated documentation	2025-02-02 12:42:38 -05:00
Aadhavan Srinivasan	9a3bfca313	Renamed unique_append to uniqueAppend	2025-02-02 12:42:29 -05:00
Aadhavan Srinivasan	b6ab54f6dd	Reset threads when findAllSubmatchHelper is called	2025-02-02 12:42:00 -05:00
Aadhavan Srinivasan	6a96c98d04	Fixed bug where the regex '(()\|.)(b)' wouldn't compile	2025-02-01 19:20:33 -05:00
Aadhavan Srinivasan	3cfc2a6854	Updated Makefile	2025-02-01 18:52:26 -05:00
Aadhavan Srinivasan	5d7a02e796	Added gcflags to go build	2025-02-01 18:51:58 -05:00
Aadhavan Srinivasan	a46d2f4546	Updated comments	2025-02-01 18:07:31 -05:00
Aadhavan Srinivasan	c88ebd1aa5	Added comments explaining what a Match is	2025-02-01 18:05:55 -05:00
Aadhavan Srinivasan	fd102292c6	Added example for FindSubmatch	2025-02-01 18:05:43 -05:00