Reverse the order of the numeric range before adding it, to maintain compatibility with PCRE matching rules (even though I don't implement them, if I do in the future)

Added more tests; commented out tests that I am failing
Added comments
2025-02-02 13:46:48 -05:00 · 2025-02-02 13:46:08 -05:00 · 2025-02-02 12:44:06 -05:00 · 2025-02-02 12:43:40 -05:00 · 2025-02-02 12:42:38 -05:00 · 2025-02-02 12:42:29 -05:00
10 changed files with 403 additions and 75 deletions
--- a/2
+++ b/2
@@ -8,6 +8,6 @@ vet: fmt
 buildLib: vet
 	go build -gcflags="-N -l" ./...
 buildCmd: buildLib
-	go build -C cmd/ -o re ./...
+	go build -C cmd/ -gcflags="-N -l" -o re ./...
 test: buildCmd
 	go test -v ./...
--- a/cmd/main.go
+++ b/cmd/main.go
@@ -121,12 +121,12 @@ func main() {
 		}
 		matchIndices := make([]reg.Match, 0)
 		if matchNumFlagEnabled {
-			tmp, err := reg.FindNthMatch(regComp, test_str, *matchNum)
+			tmp, err := regComp.FindNthMatch(test_str, *matchNum)
 			if err == nil {
 				matchIndices = append(matchIndices, tmp)
 			}
 		} else {
-			matchIndices = reg.FindAllMatches(regComp, test_str)
+			matchIndices = regComp.FindAllSubmatch(test_str)
 		}

 		if *printMatchesFlag {
@@ -137,7 +137,7 @@ func main() {
 					fmt.Fprintf(out, "Line %d:\n", lineNum)
 				}
 				for _, m := range matchIndices {
-					fmt.Fprintf(out, "%s\n", m.ToString())
+					fmt.Fprintf(out, "%s\n", m.String())
 				}
 				err := out.Flush()
 				if err != nil {
--- a/regex/compile.go
+++ b/regex/compile.go
@@ -18,6 +18,12 @@ type Reg struct {
 	numGroups int
 }

+// NumSubexp returns the number of sub-expressions in the given [Reg]. This is equivalent
+// to the number of capturing groups.
+func (r Reg) NumSubexp() int {
+	return r.numGroups
+}
+
 const concatRune rune = 0xF0001

 // Flags for shuntingYard - control its behavior
@@ -943,7 +949,9 @@ func thompson(re []postfixNode) (Reg, error) {
 			// and added back in.
 			// If the middle node doesn't exist (ie. something like '()' ), that's fine, I just connect the LPAREN
 			// and RPAREN nodes.
-			// If neither node exists, that's a problem so I return an error.
+			// If the middle node exists but is itself the start of a group, then that _must_ be the opening paren for
+			// the closing paren that I'm on. I put the third node back (because it isn't involved in the capturing group), then
+			// I concatenate those two and add them. If neither node exists, that's a problem so I return an error.
 			if c.nodetype == rparenNode {
 				s.groupEnd = true
 				middleNode, err1 := pop(&nfa)
@@ -958,6 +966,11 @@ func thompson(re []postfixNode) (Reg, error) {
 					s.groupNum = lparenNode.groupNum
 					to_add := concatenate(lparenNode, s)
 					nfa = append(nfa, to_add)
+				} else if middleNode.groupBegin && len(middleNode.transitions) == 0 { // The middle node is a lone lparen - something like '(())', and I'm looking at the first rparen
+					nfa = append(nfa, lparenNode)    // I shouldn't have popped this out, because it is not involved in the current capturing group
+					s.groupNum = middleNode.groupNum // In this case, the 'middle' node is actually an lparen
+					to_add := concatenate(middleNode, s)
+					nfa = append(nfa, to_add)
 				} else {
 					// At this point, we assume all three nodes are valid ('lparenNode', 'middleNode' and 's')
 					if lparenNode.groupBegin {
@@ -1110,10 +1123,11 @@ func thompson(re []postfixNode) (Reg, error) {

 }

-// Compiles the given regular expression into a Reg type, suitable for use with the
-// matching functions. The second return value is non-nil if a compilation error has
-// occured. As such, the error value must be checked before using the Reg returned by this function.
-// The second parameter is an optional list of flags, passed to the parsing function shuntingYard.
+// Compile compiles the given regular expression into a [Reg].
+//
+// An error value != nil indicates that the regex was invalid; the error message should provide
+// detailed information on the nature of the error.
+// The second parameter is a sequence of zero or more [ReFlag] values, that modify the behavior of the regex.
 func Compile(re string, flags ...ReFlag) (Reg, error) {
 	nodes, err := shuntingYard(re, flags...)
 	if err != nil {
@@ -1125,3 +1139,12 @@ func Compile(re string, flags ...ReFlag) (Reg, error) {
 	}
 	return reg, nil
 }
+
+// MustCompile panics if Compile returns an error. They are identical in all other respects.
+func MustCompile(re string, flags ...ReFlag) Reg {
+	reg, err := Compile(re, flags...)
+	if err != nil {
+		panic(err)
+	}
+	return reg
+}
--- a/regex/doc.go
+++ b/regex/doc.go
@@ -84,9 +84,77 @@ Assertions:
 	\b				Match at a word boundary (a word character followed by a non-word character, or vice-versa)
 	\B				Match at a non-word boundary (a word character followed by a word character, or vice-versa)

-# Flags
+Lookarounds:

-Flags are used to change the behavior of the engine. None of them are enabled by default. They are passed as an [ReFlag] slice to [Compile].
-The list of flags, and their purpose, is provided in the type definition.
+	x(?=y)			Positive lookahead - Match x if followed by y
+	x(?!y)			Negative lookahead - Match x if NOT followed by y
+	(?<=x)y			Positive lookbehind - Match y if preceded by x
+	(?<!x)y			Negative lookbehind - Match y if NOT preceded by x
+
+Numeric ranges:
+
+	<x-y>			Match any number from x to y (inclusive) (x and y must be positive numbers)
+
+# Key Differences with regexp
+
+The engine and the API differ from [regexp] in a number of ways, some of them very subtle.
+The key differences are mentioned below.
+
+1. Greediness:
+
+This engine does not support non-greedy operators. All operators are always greedy in nature, and will try
+to match as much as they can, while still allowing for a successful match. For example, given the regex:
+
+	y*y
+
+The engine will match as many 'y's as it can, while still allowing the trailing 'y' to be matched.
+
+Another, more subtle example is the following regex:
+
+	x|xx
+
+While the stdlib implementation (and most other engines) will prefer matching the first item of the alternation,
+this engine will go for the longest possible match, regardless of the order of the alternation. Although this
+strays from the convention, it results in a nice rule-of-thumb - the engine is ALWAYS greedy.
+
+The stdlib implementation has a function [regexp.Regexp.Longest] which makes future searches prefer the longest match.
+That is the default (and unchangeable) behavior in this engine.
+
+2. Byte-slices and runes:
+
+My engine does not support byte-slices. When a matching function receives a string, it converts it into a
+rune-slice to iterate through it. While this has some space overhead, the convenience of built-in unicode
+support made the tradeoff worth it.
+
+3. Return values:
+
+Rather than using primitives for return values, my engine defines two types that are used as return
+values: a [Group] represents a capturing group, and a [Match] represents a list of groups.
+
+[regexp] specifies a regular expression that gives a list of all the matching functions that it supports. The
+equivalent expression for this engine is:
+
+	Find(All)?(String)?(Submatch)?
+
+[Reg.Find] returns the index of the leftmost match in the string.
+
+If a function contains 'All' it returns all matches instead of just the leftmost one.
+
+If a function contains 'String' it returns the matched text, rather than the indices.
+
+If a function contains 'Submatch' it returns the match, including all submatches found by
+capturing groups.
+
+The term '0-group' is used to refer to the 0th capturing group of a match (which is the entire match).
+Given the following regex:
+
+	x(y)
+
+and the input string:
+
+	xyz
+
+The 0th group would contain 'xy' and the 1st group would contain 'y'. Any matching function without 'Submatch' in its name
+returns the 0-group.
 */
 package regex
--- a/regex/example_test.go
+++ b/regex/example_test.go
@@ -0,0 +1,54 @@
+package regex_test
+
+import (
+	"fmt"
+
+	"gitea.twomorecents.org/Rockingcool/kleingrep/regex"
+)
+
+func ExampleReg_Find() {
+	regexStr := "b|a"
+	regexComp := regex.MustCompile(regexStr)
+
+	match, _ := regexComp.Find("banana")
+	fmt.Println(match.String())
+
+	// Output: 0	1
+}
+
+func ExampleReg_FindAll() {
+	regexStr := "b|a"
+	regexComp := regex.MustCompile(regexStr)
+
+	matches := regexComp.FindAll("banana")
+	for _, group := range matches {
+		fmt.Println(group.String())
+	}
+
+	// Output: 0	1
+	// 1	2
+	// 3	4
+	// 5	6
+}
+
+func ExampleReg_FindString() {
+	regexStr := `\d+`
+	regexComp := regex.MustCompile(regexStr)
+
+	matchStr := regexComp.FindString("The year of our lord, 2025")
+	fmt.Println(matchStr)
+	// Output: 2025
+}
+
+func ExampleReg_FindSubmatch() {
+	regexStr := `(\d)\.(\d)(\d)`
+	regexComp := regex.MustCompile(regexStr)
+
+	match, _ := regexComp.FindSubmatch("3.14")
+	fmt.Println(match[0])
+	fmt.Println(match[1])
+	fmt.Println(match[2])
+	// Output: 0	4
+	// 0	1
+	// 2	3
+}
--- a/regex/matching.go
+++ b/regex/matching.go
@@ -5,7 +5,13 @@ import (
 	"sort"
 )

-// a Match stores a slice of all the capturing groups in a match.
+// A Match represents a match found by the regex in a given string.
+// It is represented as a list of groups, where the nth element contains
+// the contents of the nth capturing group. Note that the group may not be valid
+// (see [Group.IsValid]). The element at index 0 is known
+// as the 0-group, and represents the contents of the entire match.
+//
+// See [Reg.FindSubmatch] for an example.
 type Match []Group

 // a Group represents a group. It contains the start index and end index of the match
@@ -35,28 +41,34 @@ func (m Match) numValidGroups() int {
 }

 // Returns a string containing the indices of all (valid) groups in the match
-func (m Match) ToString() string {
+func (m Match) String() string {
 	var toRet string
 	for i, g := range m {
-		if g.isValid() {
+		if g.IsValid() {
 			toRet += fmt.Sprintf("Group %d\n", i)
-			toRet += g.toString()
+			toRet += g.String()
 			toRet += "\n"
 		}
 	}
 	return toRet
 }

-// Converts the Group into a string representation:
-func (idx Group) toString() string {
+// String converts the Group into a string representation.
+func (idx Group) String() string {
 	return fmt.Sprintf("%d\t%d", idx.StartIdx, idx.EndIdx)
 }

-// Returns whether a group contains valid indices
-func (g Group) isValid() bool {
+// IsValid returns whether a group is valid (i.e. whether it matched any text). It
+// simply ensures that both indices of the group are >= 0.
+func (g Group) IsValid() bool {
 	return g.StartIdx >= 0 && g.EndIdx >= 0
 }

+// getZeroGroup returns the 0-group (element 0) of the given match. It exists so that it can be mapped over a list of matches.
+func getZeroGroup(m Match) Group {
+	return m[0]
+}
+
 // takeZeroState takes the 0-state (if such a transition exists) for all states in the
 // given slice. It returns the resulting states. If any of the resulting states is a 0-state,
 // the second ret val is true.
@@ -101,7 +113,7 @@ func zeroMatchPossible(str []rune, idx int, numGroups int, states ...*nfaState)
 	num_appended := 0 // number of unique states addded to tempstates
 	for isZero == true {
 		zeroStates, isZero = takeZeroState(tempstates, numGroups, idx)
-		tempstates, num_appended = unique_append(tempstates, zeroStates...)
+		tempstates, num_appended = uniqueAppend(tempstates, zeroStates...)
 		if num_appended == 0 { // break if we haven't appended any more unique values
 			break
 		}
@@ -138,36 +150,72 @@ func pruneIndices(indices []Match) []Match {
 	return toRet
 }

-// FindString returns a _string_ containing the _text_ of the _leftmost_ match of
-// the regex, in the given string. The return value will be an empty string in two situations:
+// Find returns the 0-group of the leftmost match of the regex in the given string.
+// An error value != nil indicates that no match was found.
+func (regex Reg) Find(str string) (Group, error) {
+	match, err := regex.FindNthMatch(str, 1)
+	if err != nil {
+		return Group{}, fmt.Errorf("no matches found")
+	}
+	return getZeroGroup(match), nil
+}
+
+// FindAll returns a slice containing all the 0-groups of the regex in the given string.
+// A 0-group represents the match without any submatches.
+func (regex Reg) FindAll(str string) []Group {
+	indices := regex.FindAllSubmatch(str)
+	zeroGroups := funcMap(indices, getZeroGroup)
+	return zeroGroups
+}
+
+// FindString returns the text of the leftmost match of the regex in the given string.
+// The return value will be an empty string in two situations:
 //  1. No match was found
 //  2. The match was an empty string
-func FindString(regex Reg, str string) string {
-	match, err := FindNthMatch(regex, str, 1)
+func (regex Reg) FindString(str string) string {
+	match, err := regex.FindNthMatch(str, 1)
 	if err != nil {
 		return ""
 	}
-	return str[match[0].StartIdx:match[0].EndIdx]
+	zeroGroup := getZeroGroup(match)
+	return str[zeroGroup.StartIdx:zeroGroup.EndIdx]
+}
+
+// FindSubmatch returns the leftmost match of the regex in the given string, including
+// the submatches matched by capturing groups. The returned [Match] will always contain the same
+// number of groups. The validity of a group (whether or not it matched anything) can be determined with
+// [Group.IsValid], or by checking that both indices of the group are >= 0.
+// The second return value is non-nil if no match was found.
+func (regex Reg) FindSubmatch(str string) (Match, error) {
+	match, err := regex.FindNthMatch(str, 1)
+	if err != nil {
+		return Match{}, fmt.Errorf("no match found")
+	} else {
+		return match, nil
+	}
 }

 // FindAllString is the 'all' version of FindString.
-// It returns a _slice of strings_ containing the _text_ of _all_ matches of
-// the regex, in the given string.
-//func FindAllString(regex Reg, str []string) []string {
-//
-//}
+// It returns a slice of strings containing the text of all matches of
+// the regex in the given string.
+func (regex Reg) FindAllString(str string) []string {
+	zerogroups := regex.FindAll(str)
+	matchStrs := funcMap(zerogroups, func(g Group) string {
+		return str[g.StartIdx:g.EndIdx]
+	})
+	return matchStrs
+}

-// FindNthMatch finds the 'n'th match of the regex represented by the given start-state, with
-// the given string.
+// FindNthMatch returns the 'n'th match of the regex in the given string.
 // It returns an error (!= nil) if there are fewer than 'n' matches in the string.
-func FindNthMatch(regex Reg, str string, n int) (Match, error) {
+func (regex Reg) FindNthMatch(str string, n int) (Match, error) {
 	idx := 0
 	matchNum := 0
 	str_runes := []rune(str)
 	var matchFound bool
 	var matchIdx Match
 	for idx <= len(str_runes) {
-		matchFound, matchIdx, idx = findAllMatchesHelper(regex.start, str_runes, idx, regex.numGroups)
+		matchFound, matchIdx, idx = findAllSubmatchHelper(regex.start, str_runes, idx, regex.numGroups)
 		if matchFound {
 			matchNum++
 		}
@@ -179,16 +227,15 @@ func FindNthMatch(regex Reg, str string, n int) (Match, error) {
 	return nil, fmt.Errorf("invalid match index - too few matches found")
 }

-// FindAllMatches tries to find all matches of the regex represented by given start-state, with
-// the given string
-func FindAllMatches(regex Reg, str string) []Match {
+// FindAllSubmatch returns a slice of matches in the given string.
+func (regex Reg) FindAllSubmatch(str string) []Match {
 	idx := 0
 	str_runes := []rune(str)
 	var matchFound bool
 	var matchIdx Match
 	indices := make([]Match, 0)
 	for idx <= len(str_runes) {
-		matchFound, matchIdx, idx = findAllMatchesHelper(regex.start, str_runes, idx, regex.numGroups)
+		matchFound, matchIdx, idx = findAllSubmatchHelper(regex.start, str_runes, idx, regex.numGroups)
 		if matchFound {
 			indices = append(indices, matchIdx)
 		}
@@ -196,6 +243,7 @@ func FindAllMatches(regex Reg, str string) []Match {
 	if len(indices) > 0 {
 		return pruneIndices(indices)
 	}
+
 	return indices
 }

@@ -204,12 +252,13 @@ func FindAllMatches(regex Reg, str string) []Match {
 // the next search should start from.
 //
 //	Might return duplicates or overlapping indices, so care must be taken to prune the resulting array.
-func findAllMatchesHelper(start *nfaState, str []rune, offset int, numGroups int) (bool, Match, int) {
+func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups int) (bool, Match, int) {
 	// Base case - exit if offset exceeds string's length
 	if offset > len(str) {
 		// The second value here shouldn't be used, because we should exit when the third return value is > than len(str)
 		return false, []Group{}, offset
 	}
+	resetThreads(start)

 	// Hold a list of match indices for the current run. When we
 	// can no longer find a match, the match with the largest range is
@@ -265,13 +314,13 @@ func findAllMatchesHelper(start *nfaState, str []rune, offset int, numGroups int
 		num_appended := 0
 		for isZero == true {
 			zeroStates, isZero = takeZeroState(tempStates, numGroups, i)
-			tempStates, num_appended = unique_append(tempStates, zeroStates...)
+			tempStates, num_appended = uniqueAppend(tempStates, zeroStates...)
 			if num_appended == 0 { // Break if we haven't appended any more unique values
 				break
 			}
 		}

-		currentStates, _ = unique_append(currentStates, tempStates...)
+		currentStates, _ = uniqueAppend(currentStates, tempStates...)
 		tempStates = nil

 		// Take any transitions corresponding to current character
@@ -345,7 +394,7 @@ func findAllMatchesHelper(start *nfaState, str []rune, offset int, numGroups int
 		// Check if we can find a zero-length match
 		if foundPath == false {
 			if ok := zeroMatchPossible(str, i, numGroups, currentStates...); ok {
-				if tempIndices[0].isValid() == false {
+				if tempIndices[0].IsValid() == false {
 					tempIndices[0] = Group{startIdx, startIdx}
 				}
 			}
@@ -355,7 +404,7 @@ func findAllMatchesHelper(start *nfaState, str []rune, offset int, numGroups int
 			startIdx++
 			//	i++
 			//			}
-			if tempIndices.numValidGroups() > 0 && tempIndices[0].isValid() {
+			if tempIndices.numValidGroups() > 0 && tempIndices[0].IsValid() {
 				if tempIndices[0].StartIdx == tempIndices[0].EndIdx { // If we have a zero-length match, we have to shift the index at which we start. Otherwise we keep looking at the same paert of the string over and over.
 					return true, tempIndices, tempIndices[0].EndIdx + 1
 				} else {
@@ -378,7 +427,7 @@ func findAllMatchesHelper(start *nfaState, str []rune, offset int, numGroups int
 	num_appended := 0 // Number of unique states addded to tempStates
 	for isZero == true {
 		zeroStates, isZero = takeZeroState(tempStates, numGroups, i)
-		tempStates, num_appended = unique_append(tempStates, zeroStates...)
+		tempStates, num_appended = uniqueAppend(tempStates, zeroStates...)
 		if num_appended == 0 { // Break if we haven't appended any more unique values
 			break
 		}
--- a/regex/misc.go
+++ b/regex/misc.go
@@ -50,7 +50,7 @@ func isNormalChar(c rune) bool {

 // Ensure that the given elements are only appended to the given slice if they
 // don't already exist. Returns the new slice, and the number of unique items appended.
-func unique_append[T comparable](slc []T, items ...T) ([]T, int) {
+func uniqueAppend[T comparable](slc []T, items ...T) ([]T, int) {
 	num_appended := 0
 	for _, item := range items {
 		if !slices.Contains(slc, item) {
@@ -61,6 +61,25 @@ func unique_append[T comparable](slc []T, items ...T) ([]T, int) {
 	return slc, num_appended
 }

+func uniqueAppendFunc[T any](slc []T, fn func(T, T) bool, items ...T) ([]T, int) {
+	toRet := make([]T, len(slc))
+	num_appended := 0
+	copy(toRet, slc)
+	for _, item := range items {
+		itemExists := false
+		for _, val := range slc {
+			if fn(item, val) {
+				itemExists = true
+			}
+		}
+		if !itemExists {
+			toRet = append(toRet, item)
+			num_appended++
+		}
+	}
+	return toRet, num_appended
+}
+
 // Returns true only if all the given elements are equal
 func allEqual[T comparable](items ...T) bool {
 	first := items[0]
--- a/regex/nfa.go
+++ b/regex/nfa.go
@@ -104,6 +104,26 @@ func cloneStateHelper(stateToClone *nfaState, cloneMap map[*nfaState]*nfaState)
 	return clone
 }

+// Reset any thread-related fields of the NFA starting from the given state.
+func resetThreads(start *nfaState) {
+	visitedMap := make(map[*nfaState]bool) // The value type doesn't matter here
+	resetThreadsHelper(start, visitedMap)
+}
+
+func resetThreadsHelper(state *nfaState, visitedMap map[*nfaState]bool) {
+	if _, ok := visitedMap[state]; ok {
+		return
+	}
+	// Assuming it hasn't been visited
+	state.threadGroups = nil
+	visitedMap[state] = true
+	for _, v := range state.transitions {
+		for _, nextState := range v {
+			resetThreadsHelper(nextState, visitedMap)
+		}
+	}
+}
+
 // Checks if the given state's assertion is true. Returns true if the given
 // state doesn't have an assertion.
 func (s nfaState) checkAssertion(str []rune, idx int) bool {
@@ -156,17 +176,18 @@ func (s nfaState) checkAssertion(str []rune, idx int) bool {
 			strToMatch = string(runesToMatch)
 		}

-		matchIndices := FindAllMatches(Reg{startState, s.lookaroundNumCaptureGroups}, strToMatch)
+		regComp := Reg{startState, s.lookaroundNumCaptureGroups}
+		matchIndices := regComp.FindAll(strToMatch)

 		numMatchesFound := 0
 		for _, matchIdx := range matchIndices {
 			if s.assert == plaAssert || s.assert == nlaAssert { // Lookahead - return true (or false) if at least one match starts at 0. Zero is used because the test-string _starts_ from idx.
-				if matchIdx[0].StartIdx == 0 {
+				if matchIdx.StartIdx == 0 {
 					numMatchesFound++
 				}
 			}
 			if s.assert == plbAssert || s.assert == nlbAssert { // Lookbehind - return true (or false) if at least one match _ends_ at the current index.
-				if matchIdx[0].EndIdx == idx {
+				if matchIdx.EndIdx == idx {
 					numMatchesFound++
 				}
 			}
@@ -273,7 +294,7 @@ func concatenate(s1 *nfaState, s2 *nfaState) *nfaState {
 	}
 	for i := range s1.output {
 		for _, c := range s2.content { // Create transitions for every element in s1's content to s2'
-			s1.output[i].transitions[c], _ = unique_append(s1.output[i].transitions[c], s2)
+			s1.output[i].transitions[c], _ = uniqueAppend(s1.output[i].transitions[c], s2)
 		}
 	}
 	s1.output = s2.output
@@ -293,11 +314,11 @@ func kleene(s1 nfaState) (*nfaState, error) {
 	toReturn.output = append(toReturn.output, toReturn)
 	for i := range s1.output {
 		for _, c := range toReturn.content {
-			s1.output[i].transitions[c], _ = unique_append(s1.output[i].transitions[c], toReturn)
+			s1.output[i].transitions[c], _ = uniqueAppend(s1.output[i].transitions[c], toReturn)
 		}
 	}
 	for _, c := range s1.content {
-		toReturn.transitions[c], _ = unique_append(toReturn.transitions[c], &s1)
+		toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], &s1)
 	}
 	return toReturn, nil
 }
@@ -313,10 +334,10 @@ func alternate(s1 *nfaState, s2 *nfaState) *nfaState {
 	// This would lead to multiple instances of the same set of match indices, since both
 	// 's1' states would be considered to match.
 	for _, c := range s1.content {
-		toReturn.transitions[c], _ = unique_append(toReturn.transitions[c], s1)
+		toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], s1)
 	}
 	for _, c := range s2.content {
-		toReturn.transitions[c], _ = unique_append(toReturn.transitions[c], s2)
+		toReturn.transitions[c], _ = uniqueAppend(toReturn.transitions[c], s2)
 	}
 	toReturn.content = newContents(epsilon)
 	toReturn.isEmpty = true
--- a/regex/range2regex.go
+++ b/regex/range2regex.go
@@ -3,7 +3,9 @@ package regex
 import (
 	"fmt"
 	"math"
+	"slices"
 	"strconv"
+	"strings"
 )

 type numRange struct {
@@ -99,13 +101,11 @@ func range2regex(start int, end int) (string, error) {
 	// Last range - tmp to rangeEnd
 	ranges = append(ranges, numRange{tmp, rangeEnd})

-	regex := string(nonCapLparenRune)
+	regexSlice := make([]string, 0)
 	// Generate the regex
-	for i, rg := range ranges {
-		if i > 0 {
-			regex += "|"
-		}
-		regex += string(nonCapLparenRune)
+	for _, rg := range ranges {
+		tmpStr := ""
+		tmpStr += string(nonCapLparenRune)
 		startSlc := intToSlc(rg.start)
 		endSlc := intToSlc(rg.end)
 		if len(startSlc) != len(endSlc) {
@@ -113,14 +113,27 @@ func range2regex(start int, end int) (string, error) {
 		}
 		for i := range startSlc {
 			if startSlc[i] == endSlc[i] {
-				regex += string(rune(startSlc[i] + 48)) // '0' is ascii value 48, 1 is 49 etc. To convert the digit to its character form, we can just add 48.
+				tmpStr += string(rune(startSlc[i] + 48)) // '0' is ascii value 48, 1 is 49 etc. To convert the digit to its character form, we can just add 48.
 			} else {
-				regex += fmt.Sprintf("%c%c-%c%c", lbracketRune, rune(startSlc[i]+48), rune(endSlc[i]+48), rbracketRune)
+				tmpStr += fmt.Sprintf("%c%c-%c%c", lbracketRune, rune(startSlc[i]+48), rune(endSlc[i]+48), rbracketRune)
 			}
 		}
-		regex += ")"
+		tmpStr += ")"
+		regexSlice = append(regexSlice, tmpStr)
 	}
-	regex += ")"
+	// Each element of the slice represents one 'group'. Taking 0-255 as an example, the elements would be:
+	// 	1. 0-9
+	// 	2. 10-99
+	// 	3. 100-199
+	// 	4. 200-249
+	// 	5. 250-255
+	//
+	// The slice is reversed before joining because the original (ascending) order is incompatible with the PCRE rule for matching.
+	// The PCRE rule specifies that the left-branch of an alternation is preferred. Even though this engine uses the POSIX
+	// rule at the moment (which prefers the longest match regardless of the order of the alternation), reversing the string
+	// has no downsides. It doesn't affect POSIX matching, and it will reduce my burden if I decide to switch to PCRE matching.
+	slices.Reverse(regexSlice)
+	regex := string(nonCapLparenRune) + strings.Join(regexSlice, "|") + ")"
 	return regex, nil

 }
--- a/regex/re_test.go
+++ b/regex/re_test.go
@@ -105,6 +105,9 @@ var reTests = []struct {
 	{"(a|b){3,4}", nil, "ababaa", []Group{{0, 4}}},
 	{"(bc){5,}", nil, "bcbcbcbcbcbcbcbc", []Group{{0, 16}}},
 	{`\d{3,4}`, nil, "1209", []Group{{0, 4}}},
+	{`\d{3,4}`, nil, "120", []Group{{0, 3}}},
+	{`\d{3,4}`, nil, "12709", []Group{{0, 4}}},
+	{`\d{3,4}`, nil, "12", []Group{}},
 	{`\d{3,4}`, nil, "109", []Group{{0, 3}}},
 	{`\d{3,4}`, nil, "5", []Group{}},
 	{`\d{3,4}`, nil, "123135", []Group{{0, 4}}},
@@ -671,9 +674,20 @@ var groupTests = []struct {
 	{`^([ab]*)(?<!(a))c`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 2}}}},

 	{`(<389-400>)`, nil, `391`, []Match{[]Group{{0, 3}, {0, 3}}}},
+
+	// // Tests from https://wiki.haskell.org/Regex_Posix
+	// {`(()|.)(b)`, nil, `ab`, []Match{[]Group{{0, 2}, {0, 1}, {-1, -1}, {1, 2}}}},
+	// {`(()|[ab])(b)`, nil, `ab`, []Match{[]Group{{0, 2}, {0, 1}, {-1, -1}, {1, 2}}}},
+	// {`(()|[ab])+b`, nil, `aaab`, []Match{[]Group{{0, 4}, {2, 3}, {-1, -1}}}},
+	// {`([ab]|())+b`, nil, `aaab`, []Match{[]Group{{0, 4}, {2, 3}, {-1, -1}}}},
+	// // Bug - this should give {0,6},{3,6},{-1,-1} but it gives {0,6},{3,6},{3,3}
+	// //	{`yyyyyy`, nil, `(yyy|(x?)){2,4}`, []Match{[]Group{{0, 6}, {3, 6}, {-1, -1}}, []Group{{6, 6}, {6, 6}, {6, 6}}}},
+	// {`(a|ab|c|bcd)*(d*)`, nil, `ababcd`, []Match{[]Group{{0, 6}, {3, 6}, {6, 6}}, []Group{{6, 6}, {6, 6}, {6, 6}}}},
+	// // Bug - this should give {0,3},{0,3},{0,0},{0,3},{3,3} but it gives {0,3},{0,2},{0,1},{1,2},{2,3}
+	// //	{`((a*)(b|abc))(c*)`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 3}, {0, 0}, {0, 3}, {3, 3}}}},
 }

-func TestFindAllMatches(t *testing.T) {
+func TestFind(t *testing.T) {
 	for _, test := range reTests {
 		t.Run(test.re+"	"+test.str, func(t *testing.T) {
 			regComp, err := Compile(test.re, test.flags...)
@@ -682,13 +696,35 @@ func TestFindAllMatches(t *testing.T) {
 					panic(fmt.Errorf("Test Error: %v", err))
 				}
 			} else {
-				matchIndices := FindAllMatches(regComp, test.str)
-				zeroGroups := make([]Group, len(matchIndices))
-				for i, m := range matchIndices {
-					zeroGroups[i] = m[0]
+				groupIndex, err := regComp.Find(test.str)
+				if err != nil { // No matches found
+					if len(test.result) == 0 {
+						return // Manually pass the test, because this is the expected behavior
+					} else {
+						t.Errorf("Wanted no match Got %v\n", groupIndex)
+					}
+				} else {
+					if groupIndex != test.result[0] {
+						t.Errorf("Wanted %v	Got %v\n", test.result, groupIndex)
+					}
 				}
-				if !slices.Equal(test.result, zeroGroups) {
-					t.Errorf("Wanted %v	Got %v\n", test.result, zeroGroups)
+			}
+		})
+	}
+}
+
+func TestFindAll(t *testing.T) {
+	for _, test := range reTests {
+		t.Run(test.re+"	"+test.str, func(t *testing.T) {
+			regComp, err := Compile(test.re, test.flags...)
+			if err != nil {
+				if test.result != nil {
+					panic(fmt.Errorf("Test Error: %v", err))
+				}
+			} else {
+				matchIndices := regComp.FindAll(test.str)
+				if !slices.Equal(test.result, matchIndices) {
+					t.Errorf("Wanted %v	Got %v\n", test.result, matchIndices)
 				}
 			}
 		})
@@ -704,7 +740,7 @@ func TestFindString(t *testing.T) {
 					panic(err)
 				}
 			} else {
-				foundString := FindString(regComp, test.str)
+				foundString := regComp.FindString(test.str)
 				if len(test.result) == 0 {
 					if foundString != "" {
 						t.Errorf("Expected no match got %v\n", foundString)
@@ -720,7 +756,32 @@ func TestFindString(t *testing.T) {
 	}
 }

-func TestFindAllGroups(t *testing.T) {
+func TestFindAllString(t *testing.T) {
+	for _, test := range reTests {
+		t.Run(test.re+"	"+test.str, func(t *testing.T) {
+			regComp, err := Compile(test.re, test.flags...)
+			if err != nil {
+				if test.result != nil {
+					panic(err)
+				}
+			} else {
+				foundStrings := regComp.FindAllString(test.str)
+				if len(test.result) != len(foundStrings) {
+					t.Errorf("Differing number of matches: Wanted %v matches Got %v matches\n", len(test.result), len(foundStrings))
+				} else {
+					for idx, group := range test.result {
+						groupStr := test.str[group.StartIdx:group.EndIdx]
+						if groupStr != foundStrings[idx] {
+							t.Errorf("Wanted %v	Got %v\n", groupStr, foundStrings[idx])
+						}
+					}
+				}
+			}
+		})
+	}
+}
+
+func TestFindSubmatch(t *testing.T) {
 	for _, test := range groupTests {
 		t.Run(test.re+"	"+test.str, func(t *testing.T) {
 			regComp, err := Compile(test.re, test.flags...)
@@ -729,10 +790,30 @@ func TestFindAllGroups(t *testing.T) {
 					panic(err)
 				}
 			}
-			matchIndices := FindAllMatches(regComp, test.str)
+			match, err := regComp.FindSubmatch(test.str)
+			for i := range match {
+				if match[i].IsValid() {
+					if test.result[0][i] != match[i] {
+						t.Errorf("Wanted %v	Got %v\n", test.result[0], match)
+					}
+				}
+			}
+		})
+	}
+}
+func TestFindAllSubmatch(t *testing.T) {
+	for _, test := range groupTests {
+		t.Run(test.re+"	"+test.str, func(t *testing.T) {
+			regComp, err := Compile(test.re, test.flags...)
+			if err != nil {
+				if test.result != nil {
+					panic(err)
+				}
+			}
+			matchIndices := regComp.FindAllSubmatch(test.str)
 			for i := range matchIndices {
 				for j := range matchIndices[i] {
-					if matchIndices[i][j].isValid() {
+					if matchIndices[i][j].IsValid() {
 						if test.result[i][j] != matchIndices[i][j] {
 							t.Errorf("Wanted %v	Got %v\n", test.result, matchIndices)
 						}
Author	SHA1	Message	Date
Aadhavan Srinivasan	ef476e8875	Reverse the order of the numeric range before adding it, to maintain compatibility with PCRE matching rules (even though I don't implement them, if I do in the future)	2025-02-02 13:46:48 -05:00
Aadhavan Srinivasan	7e6b02632f	Added more tests; commented out tests that I am failing	2025-02-02 13:46:08 -05:00
Aadhavan Srinivasan	f94e3f2e71	Added comments	2025-02-02 12:44:06 -05:00
Aadhavan Srinivasan	b129d83c3f	Added function to reset threads	2025-02-02 12:43:40 -05:00
Aadhavan Srinivasan	43aa7b5876	Updated documentation	2025-02-02 12:42:38 -05:00
Aadhavan Srinivasan	9a3bfca313	Renamed unique_append to uniqueAppend	2025-02-02 12:42:29 -05:00
Aadhavan Srinivasan	b6ab54f6dd	Reset threads when findAllSubmatchHelper is called	2025-02-02 12:42:00 -05:00
Aadhavan Srinivasan	6a96c98d04	Fixed bug where the regex '(()\|.)(b)' wouldn't compile	2025-02-01 19:20:33 -05:00
Aadhavan Srinivasan	3cfc2a6854	Updated Makefile	2025-02-01 18:52:26 -05:00
Aadhavan Srinivasan	5d7a02e796	Added gcflags to go build	2025-02-01 18:51:58 -05:00
Aadhavan Srinivasan	a46d2f4546	Updated comments	2025-02-01 18:07:31 -05:00
Aadhavan Srinivasan	c88ebd1aa5	Added comments explaining what a Match is	2025-02-01 18:05:55 -05:00
Aadhavan Srinivasan	fd102292c6	Added example for FindSubmatch	2025-02-01 18:05:43 -05:00
Aadhavan Srinivasan	6d692d0dfc	Rename Group.toString() to Group.String()	2025-02-01 12:51:32 -05:00
Aadhavan Srinivasan	7c4538a259	Added 'example' file that contains testable examples	2025-02-01 12:50:49 -05:00
Aadhavan Srinivasan	2a9ae0b68a	Wrote test for 'FindSubmatch'	2025-02-01 11:09:05 -05:00
Aadhavan Srinivasan	783ae2ad10	Updated call to 'isValid' with call to 'IsValid'	2025-02-01 11:06:26 -05:00
Aadhavan Srinivasan	b5e6bc112c	Wrote 'reg.FindSubmatch()' which returns the leftmost match with submatches, renamed 'isValid' to 'IsValid' to export it, renamed 'ToString' to 'String'	2025-02-01 11:06:03 -05:00
Aadhavan Srinivasan	206fea34cd	Added function to return the number of subexpressions in the group	2025-02-01 11:04:49 -05:00
Aadhavan Srinivasan	fcdb23524a	Added more documentation	2025-02-01 11:04:24 -05:00
Aadhavan Srinivasan	ac936659b6	Updated documentation	2025-01-31 16:52:26 -05:00
Aadhavan Srinivasan	e6dba9fdcf	Updated documentation	2025-01-31 16:51:46 -05:00
Aadhavan Srinivasan	30779a446b	Updated documentation	2025-01-31 16:46:19 -05:00
Aadhavan Srinivasan	f629a0f08f	Added 'mustCompile' which panicks if there is an error compiling	2025-01-31 16:46:05 -05:00
Aadhavan Srinivasan	6869cd00a2	Return error instead of nil when 'Find' fails	2025-01-31 10:52:38 -05:00
Aadhavan Srinivasan	02bc8f30a2	Added test for 'Find'	2025-01-31 10:52:27 -05:00
Aadhavan Srinivasan	ac05bceda3	Use method instead of function	2025-01-31 10:13:02 -05:00
Aadhavan Srinivasan	037ac75ea6	Wrote new method to return 0-group of leftmost match; reorganized some functions for better clarity; made 'FindNthMatch' a method	2025-01-31 10:12:53 -05:00
Aadhavan Srinivasan	e9d4e857cf	Run 'TestFindAllStrings' since that function has been implemented	2025-01-31 10:11:52 -05:00
Aadhavan Srinivasan	b685d2fd5f	Renamed 'findAllMatchesHelper' to 'findAllSubmatchHelper'	2025-01-31 09:56:30 -05:00
Aadhavan Srinivasan	8eda5055ff	Replaced call to 'FindAllMatches' with call to 'FindAll' or 'FindAllSubmatch' depending on whether I need submatches	2025-01-31 09:55:36 -05:00
Aadhavan Srinivasan	45b6566b2c	Replaced function call with method call	2025-01-31 09:54:35 -05:00
Aadhavan Srinivasan	e22822e619	Renamed 'FindAllMatches' to 'FindAll' and made it a method; made it return a slice of 0-groups; the functionality of 'FindAllMatches' is now in 'FindAllSubmatch'	2025-01-31 09:54:09 -05:00
Aadhavan Srinivasan	692de2a32b	Added lookarounds and numeric ranges to documentation	2025-01-31 09:26:21 -05:00
Aadhavan Srinivasan	0d19664044	Cleared up some comments, wrote a skeleton for FindAllString	2025-01-30 22:57:35 -05:00
Aadhavan Srinivasan	1bfb09b6c7	Made 'FindString' a method of 'Reg'	2025-01-30 22:51:31 -05:00
Aadhavan Srinivasan	b0b8bf23af	Updated documentation	2025-01-30 22:51:16 -05:00