Merge branch 'master' of https://gitea.twomorecents.org/Rockingcool/kleingrep

Added more backreference tests
Clone 'isBackreference' and 'referredGroup' NFA fields, because they aren't thread variables
10 changed files with 312 additions and 49 deletions
--- a/README.md
+++ b/README.md
@ -0,0 +1,17 @@
 ## Kleingrep
 Kleingrep is a regular expression engine, providing a library and command-line tool written in Go.
 It aims to provide a more featureful engine, compared to the one in Go's
 [regexp](https://pkg.go.dev/regexp), while retaining some semblance of efficiency.
 The engine does __not__ use backtracking, relying on the NFA-based method described in
 [Russ Cox's articles](https://swtch.com/~rsc/regexp). As such, it is immune to catastrophic backtracking.
 It also includes features not present in regexp, such as lookarounds and backreferences.
 ### Syntax
 The syntax is, for the most part, a superset of Go's regexp. A full overview of the syntax can be found [here](https://pkg.go.dev/gitea.twomorecents.org/Rockingcool/kleingrep/regex#hdr-Syntax).
 __For more information, see https://pkg.go.dev/gitea.twomorecents.org/Rockingcool/kleingrep/regex__.
--- a/cmd/main.go
+++ b/cmd/main.go
@ -129,6 +129,8 @@ func main() {
 			matchIndices = regComp.FindAllSubmatch(test_str)
 		}
 		test_str_runes := []rune(test_str) // Converting to runes preserves unicode characters
 		if *printMatchesFlag {
 			// if we are in single line mode, print the line on which
 			// the matches occur
@ -158,10 +160,10 @@ func main() {
 			oldIndices := indicesToPrint.values()
 			indicesToPrint = new_uniq_arr[int]()
 			// Explanation:
-			// Find all numbers from 0 to len(test_str) that are NOT in oldIndices.
+			// Find all numbers from 0 to len(test_str_runes) that are NOT in oldIndices.
 			// These are the values we want to print, now that we have inverted the match.
 			// Re-initialize indicesToPrint and add all of these values to it.
-			indicesToPrint.add(setDifference(genRange(0, len(test_str)), oldIndices)...)
+			indicesToPrint.add(setDifference(genRange(0, len(test_str_runes)), oldIndices)...)
 		}
 		// If lineFlag is enabled, we should only print something if:
@ -182,7 +184,7 @@ func main() {
 		//			the corresponding end index.
 		// 		3. If not, just print the character.
 		if substituteFlagEnabled {
-			for i := range test_str {
+			for i := range test_str_runes {
 				inMatchIndex := false
 				for _, m := range matchIndices {
 					if i == m[0].StartIdx {
@ -193,22 +195,24 @@ func main() {
 					}
 				}
 				if !inMatchIndex {
-					fmt.Fprintf(out, "%c", test_str[i])
+					fmt.Fprintf(out, "%c", test_str_runes[i])
 				}
 			}
 		} else {
-			for i, c := range test_str {
+			for i, c := range test_str_runes {
 				if indicesToPrint.contains(i) {
 					color.New(color.FgRed).Fprintf(out, "%c", c)
 					// Newline after every match - only if -o is enabled and -v is disabled.
 					if *onlyFlag && !(*invertFlag) {
-						for _, idx := range matchIndices {
+						for matchIdxNum, idx := range matchIndices {
 							if matchIdxNum < len(matchIndices)-1 { // Only print a newline afte printing a match, if there are multiple matches on the line, and we aren't on the last one. This is because the newline that gets added at the end will take care of that.
 								if i+1 == idx[0].EndIdx { // End index is one more than last index of match
 									fmt.Fprintf(out, "\n")
 									break
 								}
 							}
 						}
 					}
 				} else {
 					if !(*onlyFlag) {
 						fmt.Fprintf(out, "%c", c)
@ -220,6 +224,10 @@ func main() {
 		if err != nil {
 			panic(err)
 		}
 		// If the last character in the string wasn't a newline, AND we either don't have -o set, or we do (and we've matched something), then print a newline
 		if (len(test_str_runes) > 0 && test_str_runes[len(test_str_runes)-1] != '\n') &&
 			(!*onlyFlag || indicesToPrint.len() > 0) {
 			fmt.Println()
 		}
 	}
 }
--- a/cmd/unique_array.go
+++ b/cmd/unique_array.go
@ -36,3 +36,7 @@ func (s uniq_arr[T]) values() []T {
 	}
 	return toRet
 }
 // len returns the number of entries currently held in s.backingMap.
 func (s uniq_arr[T]) len() int {
 	count := len(s.backingMap)
 	return count
 }
--- a/regex/compile.go
+++ b/regex/compile.go
@ -64,7 +64,7 @@ const (
 )
 func isOperator(c rune) bool {
-	if c == '+' || c == '?' || c == '*' || c == '|' || c == concatRune {
+	if c == '+' || c == '?' || c == '*' || c == '|' || c == concatRune || c == lazyPlusRune || c == lazyKleeneRune || c == lazyQuestionRune {
 		return true
 	}
 	return false
@ -72,7 +72,7 @@ func isOperator(c rune) bool {
 /* priority returns the priority of the given operator */
 func priority(op rune) int {
-	precedence := []rune{'|', concatRune, '+', '*', '?'}
+	precedence := []rune{'|', concatRune, '+', lazyPlusRune, '*', lazyKleeneRune, '?', lazyQuestionRune}
 	return slices.Index(precedence, op)
 }
@ -108,6 +108,48 @@ func getPOSIXClass(str []rune) (bool, string) {
 	return true, rtv
 }
 // isUnicodeCharClassLetter reports whether c is one of the single-letter
 // unicode character class names: L, M, S, N, P, C or Z.
 func isUnicodeCharClassLetter(c rune) bool {
 	classLetters := []rune("LMSNPCZ")
 	return slices.Index(classLetters, c) != -1
 }
 // rangeTableToRuneSlice expands the given range table into a flat rune slice and returns it.
 // The 16-bit ranges (R16) are emitted first, followed by the 32-bit ranges (R32), each in
 // table order. Every range contributes Lo, Lo+Stride, Lo+2*Stride, ... up to and including Hi.
 func rangeTableToRuneSlice(rangetable *unicode.RangeTable) []rune {
 	var rtv []rune
 	for _, r := range rangetable.R16 {
 		// Iterate in a wider type than the table's uint16. A range that ends near 0xFFFF
 		// (eg. {0xfeff, 0xfff9, 250}) would otherwise wrap around on 'c += stride', stay
 		// below Hi forever, and the loop would never terminate.
 		hi, stride := uint32(r.Hi), uint32(r.Stride)
 		for c := uint32(r.Lo); c <= hi; c += stride {
 			rtv = append(rtv, rune(c))
 		}
 	}
 	for _, r := range rangetable.R32 {
 		// Same reasoning as above: a 64-bit counter cannot wrap for any 32-bit Hi/Stride.
 		hi, stride := uint64(r.Hi), uint64(r.Stride)
 		for c := uint64(r.Lo); c <= hi; c += stride {
 			rtv = append(rtv, rune(c))
 		}
 	}
 	return rtv
 }
 // unicodeCharClassToRange converts the given unicode character class name into a list of characters in that class.
 // The name is looked up first in unicode.Categories (eg. 'C', 'Lu') and then in unicode.Scripts (eg. 'Tamil').
 // Both tables are consulted regardless of the name's length, so that two-letter script names such as 'Yi'
 // are accepted as well.
 //
 // An error is returned if the name is empty, or if it isn't found in either table. For unknown names, the
 // error message distinguishes between short (one or two characters) and long names.
 func unicodeCharClassToRange(class string) ([]rune, error) {
 	if len(class) == 0 {
 		return nil, fmt.Errorf("empty unicode character class")
 	}
 	if rangeTable, ok := unicode.Categories[class]; ok {
 		return rangeTableToRuneSlice(rangeTable), nil
 	}
 	if rangeTable, ok := unicode.Scripts[class]; ok {
 		return rangeTableToRuneSlice(rangeTable), nil
 	}
 	if len(class) <= 2 {
 		return nil, fmt.Errorf("invalid short unicode character class")
 	}
 	return nil, fmt.Errorf("invalid long unicode character class")
 }
 // Stores whether the case-insensitive flag has been enabled.
 var caseInsensitive bool
@ -166,9 +208,6 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
 	//		metacharacter. Later, in thompson(), these will be converted back. This avoids
 	//		confusion in detecting whether a character is escaped eg. detecting
 	// 		whether '\\[a]' has an escaped opening bracket (it doesn't).
 	//
 	// 	5. 	Check for non-greedy operators. These are not supported at the moment, so an error
 	// 		must be thrown if the user attempts to use a non-greedy operator.
 	for i := 0; i < len(re_runes_orig); i++ {
 		c := re_runes_orig[i]
 		if c == '<' && (i == 0 || (re_runes_orig[i-1] != '\\' && re_runes_orig[i-1] != '?')) {
@ -215,8 +254,16 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
 		} else if c == ']' && (i == 0 || re_runes[len(re_runes)-1] != '\\') {
 			re_runes = append(re_runes, rbracketRune)
 			continue
-		} else if slices.Contains([]rune{'+', '*', '?'}, c) && (i < len(re_runes_orig)-1 && re_runes_orig[i+1] == '?') {
+		} else if slices.Contains([]rune{'+', '*', '?'}, c) && (i > 0 && re_runes_orig[i-1] != '\\') && (i < len(re_runes_orig)-1 && re_runes_orig[i+1] == '?') {
-			return nil, fmt.Errorf("non-greedy operators are not supported")
+			switch c {
 			case '+':
 				re_runes = append(re_runes, lazyPlusRune)
 			case '*':
 				re_runes = append(re_runes, lazyKleeneRune)
 			case '?':
 				re_runes = append(re_runes, lazyQuestionRune)
 			}
 			i++
 		} else {
 			re_runes = append(re_runes, c)
 		}
@ -309,10 +356,30 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
 					}
 				} else if isHex(re_runes[i]) {
 					re_postfix = append(re_postfix, re_runes[i:i+2]...)
-					i += 2
+					i += 1 // I don't skip forward 2 steps, because the second step will happen with the loop increment
 				} else {
 					return nil, fmt.Errorf("invalid hex value in expression")
 				}
 			} else if re_runes[i] == 'p' || re_runes[i] == 'P' { // Unicode character class (P is negated unicode charclass)
 				re_postfix = append(re_postfix, re_runes[i])
 				i++
 				if i >= len(re_runes) {
 					return nil, fmt.Errorf("error parsing unicode character class in expression")
 				}
 				if re_runes[i] == '{' { // Full name charclass
 					for re_runes[i] != '}' {
 						re_postfix = append(re_postfix, re_runes[i])
 						i++
 					}
 					re_postfix = append(re_postfix, re_runes[i])
 					i++
 				} else if isUnicodeCharClassLetter(re_runes[i]) {
 					re_postfix = append(re_postfix, re_runes[i])
 					i++
 				} else {
 					return nil, fmt.Errorf("error parsing unicode character class in expression")
 				}
 				i-- // The loop increment at the top will move us forward
 			} else if re_runes[i] == '0' { // Start of octal value
 				numDigits := 1
 				for i+numDigits < len(re_runes) && numDigits < 4 && isOctal(re_runes[i+numDigits]) { // Skip while we see an octal character (max of 4, starting with 0)
@ -343,10 +410,10 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
 				if i >= len(re_runes) {
 					return nil, fmt.Errorf("unclosed lookaround")
 				}
-				if re_runes[i] == '(' || re_runes[i] == nonCapLparenRune {
+				if (re_runes[i] == '(' && re_runes[i-1] != '\\') || re_runes[i] == nonCapLparenRune {
 					numOpenParens++
 				}
-				if re_runes[i] == ')' {
+				if re_runes[i] == ')' && re_runes[i-1] != '\\' {
 					numOpenParens--
 					if numOpenParens == 0 {
 						break
@ -359,7 +426,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
 		}
 		if i < len(re_runes) && (re_runes[i] != '(' && re_runes[i] != nonCapLparenRune && re_runes[i] != '|' && re_runes[i] != '\\') || (i > 0 && re_runes[i-1] == '\\') { // Every character should be concatenated if it is escaped
 			if i < len(re_runes)-1 {
-				if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != '+' && re_runes[i+1] != '?' && re_runes[i+1] != ')' && re_runes[i+1] != '{' {
+				if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != lazyKleeneRune && re_runes[i+1] != '+' && re_runes[i+1] != lazyPlusRune && re_runes[i+1] != '?' && re_runes[i+1] != lazyQuestionRune && re_runes[i+1] != ')' && re_runes[i+1] != '{' {
 					re_postfix = append(re_postfix, concatRune)
 				}
 			}
@ -429,6 +496,39 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
 				} else {
 					return nil, fmt.Errorf("not enough hex characters found in expression")
 				}
 			} else if re_postfix[i] == 'p' || re_postfix[i] == 'P' {
 				charClassInverted := (re_postfix[i] == 'P')
 				var charsInClass []rune
 				i++
 				if isUnicodeCharClassLetter(re_postfix[i]) {
 					var err error
 					charsInClass, err = unicodeCharClassToRange(string(re_postfix[i]))
 					if err != nil {
 						return nil, err
 					}
 				} else if re_postfix[i] == '{' {
 					i++ // Skip opening bracket
 					unicodeCharClassStr := ""
 					for re_postfix[i] != '}' {
 						unicodeCharClassStr += string(re_postfix[i])
 						i++
 					}
 					var err error
 					charsInClass, err = unicodeCharClassToRange(unicodeCharClassStr)
 					if err != nil {
 						return nil, err
 					}
 				} else {
 					return nil, fmt.Errorf("error parsing unicode character class in expression")
 				}
 				var toAppend postfixNode
 				if !charClassInverted { // \p
 					toAppend = newPostfixNode(charsInClass...)
 				} else { // \P
 					toAppend = newPostfixDotNode()
 					toAppend.except = append([]postfixNode{}, newPostfixNode(charsInClass...))
 				}
 				outQueue = append(outQueue, toAppend)
 			} else if re_postfix[i] == '0' { // Octal value
 				var octVal int64
 				var octValStr string
@ -489,10 +589,10 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
 				if i >= len(re_postfix) {
 					return nil, fmt.Errorf("unclosed lookaround")
 				}
-				if re_postfix[i] == '(' || re_postfix[i] == nonCapLparenRune {
+				if (re_postfix[i] == '(' && re_postfix[i-1] != '\\') || re_postfix[i] == nonCapLparenRune {
 					numOpenParens++
 				}
-				if re_postfix[i] == ')' {
+				if re_postfix[i] == ')' && re_postfix[i-1] != '\\' {
 					numOpenParens--
 					if numOpenParens == 0 {
 						break
@ -611,7 +711,40 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
 						} else {
 							return nil, fmt.Errorf("not enough hex characters found in character class")
 						}
 					} else if re_postfix[i] == 'p' || re_postfix[i] == 'P' {
 						charClassInverted := (re_postfix[i] == 'P')
 						var charsInList []rune
 						i++
 						if isUnicodeCharClassLetter(re_postfix[i]) {
 							var err error
 							charsInList, err = unicodeCharClassToRange(string(re_postfix[i]))
 							if err != nil {
 								return nil, err
 							}
 						} else if re_postfix[i] == '{' {
 							i++ // Skip opening bracket
 							unicodeCharClassStr := ""
 							for re_postfix[i] != '}' {
 								unicodeCharClassStr += string(re_postfix[i])
 								i++
 							}
 							var err error
 							charsInList, err = unicodeCharClassToRange(unicodeCharClassStr)
 							if err != nil {
 								return nil, err
 							}
 						} else {
 							return nil, fmt.Errorf("error parsing unicode character class in expression")
 						}
 						if !charClassInverted {
 							chars = append(chars, newPostfixNode(charsInList...))
 						} else {
 							toAppend := newPostfixDotNode()
 							toAppend.except = append([]postfixNode{}, newPostfixNode(charsInList...))
 							chars = append(chars, toAppend)
 						}
 					} else if re_postfix[i] == '0' { // Octal value
 						var octVal int64
 						var octValStr string
 						numDigitsParsed := 0
@ -812,6 +945,10 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
 			}
 			outQueue[idx].startReps = startRangeNum
 			outQueue[idx].endReps = endRangeNum
 			if i < len(re_postfix)-1 && re_postfix[i+1] == '?' { // lazy repitition
 				outQueue[idx].isLazy = true
 				i++
 			}
 		}
 		if c == '(' || c == nonCapLparenRune {
 			opStack = append(opStack, c)
@ -1105,6 +1242,9 @@ func thompson(re []postfixNode) (Reg, error) {
 			if err != nil {
 				return Reg{}, err
 			}
 			if c.isLazy {
 				stateToAdd.isLazy = true
 			}
 			nfa = append(nfa, stateToAdd)
 		case plusNode: // a+ is equivalent to aa*
 			s1 := mustPop(&nfa)
@ -1112,6 +1252,9 @@ func thompson(re []postfixNode) (Reg, error) {
 			if err != nil {
 				return Reg{}, err
 			}
 			if c.isLazy {
 				s2.isLazy = true
 			}
 			s1 = concatenate(s1, s2)
 			nfa = append(nfa, s1)
 		case questionNode: // ab? is equivalent to a(b|)
@ -1123,6 +1266,9 @@ func thompson(re []postfixNode) (Reg, error) {
 			if err != nil {
 				return Reg{}, err
 			}
 			if c.isLazy {
 				s2.isLazy = true
 			}
 			nfa = append(nfa, s2)
 		case pipeNode:
 			// A pipe operator doesn't actually need either operand to be present. If an operand isn't present,
@ -1178,6 +1324,9 @@ func thompson(re []postfixNode) (Reg, error) {
 				if err != nil {
 					return Reg{}, err
 				}
 				if c.isLazy {
 					s2.isLazy = true
 				}
 				stateToAdd = concatenate(stateToAdd, s2)
 			} else { // Case 2
 				for i := c.startReps; i < c.endReps; i++ {
@ -1185,6 +1334,9 @@ func thompson(re []postfixNode) (Reg, error) {
 					if err != nil {
 						return Reg{}, fmt.Errorf("error processing bounded repetition")
 					}
 					if c.isLazy {
 						tmp.isLazy = true
 					}
 					stateToAdd = concatenate(stateToAdd, tmp)
 				}
 			}
--- a/regex/doc.go
+++ b/regex/doc.go
@ -60,14 +60,24 @@ Composition:
 	x|y				Match x or y (prefer x)
 	xy|z			Match xy or z (prefer xy)
-Repitition (always greedy, preferring more):
+Repetition:
-
+
-	x*				Match x zero or more times
+	Greedy:
-	x+				Match x one or more times
+	x*				Match x zero or more times, prefer more
-	x?				Match x zero or one time
+	x+				Match x one or more times, prefer more
-	x{m,n}			Match x between m and n times (inclusive)
+	x?				Match x zero or one time, prefer one
-	x{m,}			Match x atleast m times
+	x{m,n}			Match x between m and n times (inclusive), prefer more
-	x{,n}			Match x between 0 and n times (inclusive)
+	x{m,}			Match x at least m times, prefer more
 	x{,n}			Match x between 0 and n times (inclusive), prefer more
 	x{m}			Match x exactly m times
 	Lazy:
 	x*?				Match x zero or more times, prefer fewer
 	x+?				Match x one or more times, prefer fewer
 	x??				Match x zero or one time, prefer zero
 	x{m,n}?			Match x between m and n times (inclusive), prefer fewer
 	x{m,}?			Match x at least m times, prefer fewer
 	x{,n}?			Match x between 0 and n times (inclusive), prefer fewer
 	x{m}			Match x exactly m times
 Grouping:
@ -107,17 +117,13 @@ Numeric ranges:
 The engine and the API differ from [regexp] in a few ways, some of them very subtle.
 The key differences are mentioned below.
-1. Greediness:
+1. Byte-slices and runes:
 This engine currently does not support non-greedy operators.
 2. Byte-slices and runes:
 My engine does not support byte-slices. When a matching function receives a string, it converts it into a
 rune-slice to iterate through it. While this has some space overhead, the convenience of built-in unicode
 support made the tradeoff worth it.
-3. Return values
+2. Return values
 Rather than using primitives for return values, my engine defines two types that are used as return
 values: a [Group] represents a capturing group, and a [Match] represents a list of groups.
@ -152,10 +158,10 @@ returns the 0-group.
 The following features from [regexp] are (currently) NOT supported:
 1. Named capturing groups
- 2. Non-greedy operators
+ 2. Negated POSIX classes
- 3. Unicode character classes
+ 3. Embedded flags (flags are instead passed as arguments to [Compile])
- 4. Embedded flags (flags are instead passed as arguments to [Compile])
+ 4. Literal text with \Q ... \E
- 5. Literal text with \Q ... \E
+ 5. Finite repetition with no start (defaulting at 0)
 The following features are not available in [regexp], but are supported in my engine:
 1. Lookarounds
--- a/regex/matching.go
+++ b/regex/matching.go
@ -234,14 +234,14 @@ func addStateToList(str []rune, idx int, list []nfaState, state nfaState, thread
 	}
 	visited = append(visited, state)
-	if state.isKleene || state.isQuestion {
+	if (state.isKleene || state.isQuestion) && (state.isLazy == false) { // Greedy quantifiers
 		copyThread(state.splitState, state)
 		list := addStateToList(str, idx, list, *state.splitState, threadGroups, visited, preferLongest)
 		copyThread(state.next, state)
 		list = addStateToList(str, idx, list, *state.next, threadGroups, visited, preferLongest)
 		return list
 	}
-	if state.isAlternation {
+	if state.isAlternation || ((state.isKleene || state.isQuestion) && state.isLazy) { // Alternation or lazy quantifier
 		copyThread(state.next, state)
 		list := addStateToList(str, idx, list, *state.next, threadGroups, visited, preferLongest)
 		copyThread(state.splitState, state)
--- a/regex/misc.go
+++ b/regex/misc.go
@ -16,8 +16,11 @@ var rparenRune rune = 0xF0006
 var nonCapLparenRune rune = 0xF0007 // Represents a non-capturing group's LPAREN
 var escBackslashRune rune = 0xF0008 // Represents an escaped backslash
 var charRangeRune rune = 0xF0009    // Represents a character range
 var lazyKleeneRune rune = 0xF000A   // Represents a lazy kleene star
 var lazyPlusRune rune = 0xF000B     // Represents a lazy plus operator
 var lazyQuestionRune rune = 0xF000C // Represents a lazy question operator
-var specialChars = []rune{'?', '*', '\\', '^', '$', '{', '}', '(', ')', '[', ']', '+', '|', '.', concatRune, '<', '>', lbracketRune, rbracketRune, nonCapLparenRune}
+var specialChars = []rune{'?', lazyQuestionRune, '*', lazyKleeneRune, '\\', '^', '$', '{', '}', '(', ')', '[', ']', '+', lazyPlusRune, '|', '.', concatRune, '<', '>', lbracketRune, rbracketRune, nonCapLparenRune}
 // An interface for int and rune, which are identical
 type character interface {
--- a/regex/nfa.go
+++ b/regex/nfa.go
@ -34,6 +34,7 @@ type nfaState struct {
 	isKleene                   bool       // Identifies whether current node is a 0-state representing Kleene star
 	isQuestion                 bool       // Identifies whether current node is a 0-state representing the question operator
 	isAlternation              bool       // Identifies whether current node is a 0-state representing an alternation
 	isLazy                     bool       // Only for split states - Identifies whether or not to flip the order of branches (try one branch before the other)
 	splitState                 *nfaState  // Only for alternation states - the 'other' branch of the alternation ('next' is the first)
 	assert                     assertType // Type of assertion of current node - NONE means that the node doesn't assert anything
 	allChars                   bool       // Whether or not the state represents all characters (eg. a 'dot' metacharacter). A 'dot' node doesn't store any contents directly, as it would take up too much space
@ -44,10 +45,10 @@ type nfaState struct {
 	groupBegin                 bool       // Whether or not the node starts a capturing group
 	groupEnd                   bool       // Whether or not the node ends a capturing group
 	groupNum                   int        // Which capturing group the node starts / ends
 	// The following properties depend on the current match - I should think about resetting them for every match.
 	threadGroups    []Group // Assuming that a state is part of a 'thread' in the matching process, this array stores the indices of capturing groups in the current thread. As matches are found for this state, its groups will be copied over.
 	isBackreference            bool       // Whether or not current node is backreference
 	referredGroup              int        // If current node is a backreference, the node that it points to
 	// The following properties depend on the current match - I should think about resetting them for every match.
 	threadGroups  []Group // Assuming that a state is part of a 'thread' in the matching process, this array stores the indices of capturing groups in the current thread. As matches are found for this state, its groups will be copied over.
 	threadBackref int     // If current node is a backreference, how many characters to look forward into the referred group
 }
@ -77,6 +78,7 @@ func cloneStateHelper(stateToClone *nfaState, cloneMap map[*nfaState]*nfaState)
 		isKleene:        stateToClone.isKleene,
 		isQuestion:      stateToClone.isQuestion,
 		isAlternation:   stateToClone.isAlternation,
 		isLazy:          stateToClone.isLazy,
 		assert:          stateToClone.assert,
 		allChars:        stateToClone.allChars,
 		except:          append([]rune{}, stateToClone.except...),
@ -84,6 +86,8 @@ func cloneStateHelper(stateToClone *nfaState, cloneMap map[*nfaState]*nfaState)
 		groupEnd:        stateToClone.groupEnd,
 		groupBegin:      stateToClone.groupBegin,
 		groupNum:        stateToClone.groupNum,
 		isBackreference: stateToClone.isBackreference,
 		referredGroup:   stateToClone.referredGroup,
 	}
 	cloneMap[stateToClone] = clone
 	for i, s := range stateToClone.output {
@ -421,6 +425,7 @@ func (s nfaState) equals(other nfaState) bool {
 		s.next == other.next &&
 		s.isKleene == other.isKleene &&
 		s.isQuestion == other.isQuestion &&
 		s.isLazy == other.isLazy &&
 		s.isAlternation == other.isAlternation &&
 		s.splitState == other.splitState &&
 		s.assert == other.assert &&
--- a/regex/postfixNode.go
+++ b/regex/postfixNode.go
@ -44,6 +44,7 @@ type postfixNode struct {
 	lookaroundDir   int           // Lookbehind or lookahead
 	nodeContents    []postfixNode // ONLY USED WHEN nodetype == CHARCLASS. Holds all the nodes inside the given CHARCLASS node.
 	referencedGroup int           // ONLY USED WHEN nodetype == backreferenceNode. Holds the group which this one refers to. After parsing is done, the expression will be rewritten eg. (a)\1 will become (a)(a). So the return value of ShuntingYard() shouldn't contain a backreferenceNode.
 	isLazy          bool          // ONLY USED WHEN nodetype == kleene or question
 }
 // Converts the given list of postfixNodes to one node of type CHARCLASS.
@ -162,10 +163,19 @@ func newPostfixNode(contents ...rune) postfixNode {
 		switch contents[0] {
 		case '+':
 			to_return.nodetype = plusNode
 		case lazyPlusRune:
 			to_return.nodetype = plusNode
 			to_return.isLazy = true
 		case '?':
 			to_return.nodetype = questionNode
 		case lazyQuestionRune:
 			to_return.nodetype = questionNode
 			to_return.isLazy = true
 		case '*':
 			to_return.nodetype = kleeneNode
 		case lazyKleeneRune:
 			to_return.nodetype = kleeneNode
 			to_return.isLazy = true
 		case '|':
 			to_return.nodetype = pipeNode
 		case concatRune:
--- a/regex/re_test.go
+++ b/regex/re_test.go
@ -117,6 +117,7 @@ var reTests = []struct {
 	{`\d{3,4}`, nil, "ababab555", []Group{{6, 9}}},
 	{`\bpaint\b`, nil, "paints", []Group{}},
 	{`\b\w{5}\b`, nil, "paint", []Group{{0, 5}}},
 	{`\w{}`, nil, "test", nil},
 	{`[^\w]`, nil, "abcdef1230[]qq';;'", []Group{{10, 11}, {11, 12}, {14, 15}, {15, 16}, {16, 17}, {17, 18}}},
 	{`[^\W]`, nil, "abcdef1230[]qq';;'", []Group{{0, 1}, {1, 2}, {2, 3}, {3, 4}, {4, 5}, {5, 6}, {6, 7}, {7, 8}, {8, 9}, {9, 10}, {12, 13}, {13, 14}}},
 	{`[\[\]]`, nil, "a[b[l]]", []Group{{1, 2}, {3, 4}, {5, 6}, {6, 7}}},
@ -430,6 +431,7 @@ var reTests = []struct {
 	{`^(.+)?B`, []ReFlag{RE_CASE_INSENSITIVE}, `ab`, []Group{{0, 2}}},
 	{`\0009`, []ReFlag{RE_CASE_INSENSITIVE}, "\x009", []Group{{0, 2}}},
 	{`\0141`, []ReFlag{RE_CASE_INSENSITIVE}, "A", []Group{{0, 1}}},
 	{`\0141\0141`, []ReFlag{RE_CASE_INSENSITIVE}, "AA", []Group{{0, 2}}},
 	{`a[-]?c`, []ReFlag{RE_CASE_INSENSITIVE}, `AC`, []Group{{0, 2}}},
@ -460,8 +462,10 @@ var reTests = []struct {
 	{`[\D5]+`, nil, `1234abc5678`, []Group{{4, 8}}},
 	{`[\da-fA-F]+`, nil, `123abc`, []Group{{0, 6}}},
 	{`\xff`, nil, "\u00ff", []Group{{0, 1}}},
 	{`\xff+`, nil, "\u00ff\u00ff", []Group{{0, 2}}},
 	{`\xFF`, nil, "\u00ff", []Group{{0, 1}}},
 	{`\x00ff`, nil, "\u00ff", []Group{}},
 	{`\x{0000ff}+`, nil, "\u00ff\u00ff", []Group{{0, 2}}},
 	{`\x{0000ff}`, nil, "\u00ff", []Group{{0, 1}}},
 	{`\x{0000FF}`, nil, "\u00ff", []Group{{0, 1}}},
 	{"\t\n\v\r\f\a", nil, "\t\n\v\r\f\a", []Group{{0, 6}}},
@ -485,7 +489,25 @@ var reTests = []struct {
 	{`[b-e]`, nil, `f`, []Group{}},
 	{`*?`, nil, `-`, nil},
-	{`a*?`, nil, `-`, nil}, // non-greedy operators are not supported
+	{`a.+c`, nil, `abcabc`, []Group{{0, 6}}},
 	// Lazy quantifier tests
 	{`a.+?c`, nil, `abcabc`, []Group{{0, 3}, {3, 6}}},
 	{`ab*?bc`, []ReFlag{RE_CASE_INSENSITIVE}, `ABBBBC`, []Group{{0, 6}}},
 	{`ab+?bc`, []ReFlag{RE_CASE_INSENSITIVE}, `ABBC`, []Group{{0, 4}}},
 	{`ab??bc`, []ReFlag{RE_CASE_INSENSITIVE}, `ABBC`, []Group{{0, 4}}},
 	{`ab??bc`, []ReFlag{RE_CASE_INSENSITIVE}, `ABC`, []Group{{0, 3}}},
 	{`ab??bc`, []ReFlag{RE_CASE_INSENSITIVE}, `ABBBBC`, []Group{}},
 	{`ab??c`, []ReFlag{RE_CASE_INSENSITIVE}, `ABC`, []Group{{0, 3}}},
 	{`a.*?c`, []ReFlag{RE_CASE_INSENSITIVE}, `AXYZC`, []Group{{0, 5}}},
 	{`a.+?c`, []ReFlag{RE_CASE_INSENSITIVE}, `ABCABC`, []Group{{0, 3}, {3, 6}}},
 	{`a.*?c`, []ReFlag{RE_CASE_INSENSITIVE}, `ABCABC`, []Group{{0, 3}, {3, 6}}},
 	{`.*?\S *:`, nil, `xx:`, []Group{{0, 3}}},
 	{`a[ ]*? (\d+).*`, nil, `a   10`, []Group{{0, 6}}},
 	{`a[ ]*? (\d+).*`, nil, `a    10`, []Group{{0, 7}}},
 	{`"(?:\\"|[^"])*?"`, nil, `"\""`, []Group{{0, 4}}},
 	{`^.*?$`, nil, "one\ntwo\nthree", []Group{}},
 	{`a[^>]*?b`, nil, `a>b`, []Group{}},
 	{`^a*?$`, nil, `foo`, []Group{}},
 	// Numeric range tests - this is a feature that I added, and doesn't exist
 	// in any other mainstream regex engine
@ -516,6 +538,30 @@ var reTests = []struct {
 	{`<389-400`, nil, `-`, nil},
 	{`<389-400>`, nil, `391`, []Group{{0, 3}}},
 	{`\b<1-10000>\b`, nil, `America declared independence in 1776.`, []Group{{33, 37}}},
 	{`\p{Tamil}+`, nil, `உயிரெழுத்து`, []Group{{0, 11}}}, // Each letter and matra is counted as a separate rune, so 'u', 'ya', 'e (matra), 'ra', 'e (matra)', 'zha', (oo (matra), 'tha', 'ith', 'tha', 'oo (matra)'.
 	{`\P{Tamil}+`, nil, `vowel=உயிரெழுத்து`, []Group{{0, 6}}},
 	{`\P`, nil, `உயிரெழுத்து`, nil},
 	{`\PM\pM*`, nil, `உயிரெழுத்து`, []Group{{0, 1}, {1, 3}, {3, 5}, {5, 7}, {7, 9}, {9, 11}}},
 	{`\pN+`, nil, `123abc456def`, []Group{{0, 3}, {6, 9}}},
 	{`\PN+`, nil, `123abc456def`, []Group{{3, 6}, {9, 12}}},
 	{`[\p{Greek}\p{Cyrillic}]`, nil, `ΣωШД`, []Group{{0, 1}, {1, 2}, {2, 3}, {3, 4}}},
 	{`(?<=\().*?(?=\))`, nil, `(abc)`, []Group{{1, 4}}},
 	{`((a|b)\2)`, nil, `aa`, []Group{{0, 2}}},
 	{`((a|b)\2)`, nil, `bb`, []Group{{0, 2}}},
 	{`((a|b)\2)`, nil, `ab`, []Group{}},
 	{`((a|b)\2)`, nil, `ba`, []Group{}},
 	{`((a|b)\2){3}`, nil, `aaaaaa`, []Group{{0, 6}}},
 	{`((a|b)\2){3}`, nil, `bbbbbb`, []Group{{0, 6}}},
 	{`((a|b)\2){3}`, nil, `bbaaaa`, []Group{{0, 6}}},
 	{`((a|b)\2){3}`, nil, `aabbaa`, []Group{{0, 6}}},
 	{`((a|b)\2){3}`, nil, `aaaabb`, []Group{{0, 6}}},
 	{`((a|b)\2){3}`, nil, `bbaabb`, []Group{{0, 6}}},
 	{`((a|b)\2){3}`, nil, `baabab`, []Group{}},
 	{`((a|b)\2){3}`, nil, `bbabab`, []Group{}},
 }
 var groupTests = []struct {
@ -708,6 +754,18 @@ var groupTests = []struct {
 	// {`(a|ab|c|bcd)*(d*)`, nil, `ababcd`, []Match{[]Group{{0, 6}, {3, 6}, {6, 6}}, []Group{{6, 6}, {6, 6}, {6, 6}}}},
 	// // Bug - this should give {0,3},{0,3},{0,0},{0,3},{3,3} but it gives {0,3},{0,2},{0,1},{1,2},{2,3}
 	// //	{`((a*)(b|abc))(c*)`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 3}, {0, 0}, {0, 3}, {3, 3}}}},
 	// Lazy quantifier tests
 	{`a(?:b|c|d)+?(.)`, nil, `ace`, []Match{[]Group{{0, 3}, {2, 3}}}},
 	{`a(?:b|(c|e){1,2}?|d)+?(.)`, nil, `ace`, []Match{[]Group{{0, 3}, {1, 2}, {2, 3}}}},
 	{`(?<!-):(.*?)(?<!-):`, nil, `a:bc-:de:f`, []Match{[]Group{{1, 9}, {2, 8}}}},
 	{`(?<!\\):(.*?)(?<!\\):`, nil, `a:bc\:de:f`, []Match{[]Group{{1, 9}, {2, 8}}}},
 	{`(?<!\?)'(.*?)(?<!\?)'`, nil, `a'bc?'de'f`, []Match{[]Group{{1, 9}, {2, 8}}}},
 	{`.*?x\s*\z(.*)`, []ReFlag{RE_MULTILINE, RE_SINGLE_LINE}, "xx\nx\n", []Match{[]Group{{0, 5}, {5, 5}}}},
 	{`.*?x\s*\z(.*)`, []ReFlag{RE_MULTILINE}, "xx\nx\n", []Match{[]Group{{3, 5}, {5, 5}}}},
 	{`^([ab]*?)(?=(b)?)c`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 2}, {-1, -1}}}},
 	{`^([ab]*?)(?!(b))c`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 2}, {-1, -1}}}},
 	{`^([ab]*?)(?<!(a))c`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 2}, {-1, -1}}}},
 }
 func TestFind(t *testing.T) {
Author	SHA1	Message	Date
Aadhavan Srinivasan	e07f27dc78	Merge branch 'master' of https://gitea.twomorecents.org/Rockingcool/kleingrep	2 weeks ago
Aadhavan Srinivasan	65d2317f79	Added more backreference tests	2 weeks ago
Aadhavan Srinivasan	a631fc289c	Clone 'isBackreference' and 'referredGroup' NFA fields, because they aren't thread variables	2 weeks ago
Aadhavan Srinivasan	d62a429cce	Updated documentation	3 weeks ago
Aadhavan Srinivasan	7b31031553	Change when a newline is printed; so that we don't print extraneous newlines	3 weeks ago
Aadhavan Srinivasan	38c842cb07	Added method to get length of unique array	3 weeks ago
Aadhavan Srinivasan	9f9af36be8	Fixed bug where escaped parentheses in lookarounds were counted as regular parentheses instead of literals	3 weeks ago
Aadhavan Srinivasan	8217b67122	Added test for escaped parentheses in lookarounds	3 weeks ago
Aadhavan Srinivasan	1f06dcef64	Just declare the variable instead of initializing it as well	3 weeks ago
Aadhavan Srinivasan	119475b41b	Updated README	3 weeks ago
Aadhavan Srinivasan	6151cc8cf6	Updated documentation	3 weeks ago
Aadhavan Srinivasan	3eaf4eb19c	Updated README	3 weeks ago
Aadhavan Srinivasan	d453815831	Added README	3 weeks ago
Aadhavan Srinivasan	3a2916baae	Set 'isLazy' to true in the NFA, if the postfixNode has the flag set	3 weeks ago
Aadhavan Srinivasan	9d6344719f	Reverse order of trying branches if the quantifier is lazy	3 weeks ago
Aadhavan Srinivasan	f5c868566b	Added field to NFA, denoting if a node is lazy or not	3 weeks ago
Aadhavan Srinivasan	1cd6da218f	Added lazy quantifier tests	3 weeks ago
Aadhavan Srinivasan	277cbc0fc5	Started working on lazy quantifier support	4 weeks ago
Aadhavan Srinivasan	3924502b72	Added code to return lazy quantifier postfixNodes	4 weeks ago
Aadhavan Srinivasan	36b009747b	Added metacharacters for lazy quantifiers	4 weeks ago
Aadhavan Srinivasan	6cd0a10a8f	Added more documentation	4 weeks ago
Aadhavan Srinivasan	69fb96c43d	Merge pull request 'Implement Unicode character classes' (#4 ) from implementUnicodeCharClass into master Reviewed-on: #4	4 weeks ago
Aadhavan Srinivasan	46bc0c8529	Removed unicode character classes from 'features not supported' list	4 weeks ago
Aadhavan Srinivasan	1a890a1e75	Refactoring - remove duplicate code	4 weeks ago
Aadhavan Srinivasan	fde3784e5a	Added unicode charclass support within character classes; Fixed bugs with hex classes and unicode classes	4 weeks ago
Aadhavan Srinivasan	7045711860	Convert test_str into a rune slice for better unicode compatibility, it also fixed the bug where all unicode characters wouldn't be colored	4 weeks ago
Aadhavan Srinivasan	d4d606d95b	Added tests for unicode character classes; more tests for hex characters	4 weeks ago
Aadhavan Srinivasan	9cd330e521	More work on unicode character class support - fix bug where all characters aren't being matched	4 weeks ago
Aadhavan Srinivasan	44d6a2005c	Started working on unicode character classes	4 weeks ago
Aadhavan Srinivasan	f76cd6c3d9	Merge pull request 'Implement Backreferences' (#3 ) from implementBackreferences into master Reviewed-on: #3	4 weeks ago