Updated TODO

Updated argument count validation
Started working on file arguments - stdin is used if arg is "-"
2025-03-12 16:46:57 -04:00 · 2025-03-12 16:46:05 -04:00 · 2025-03-12 16:44:40 -04:00 · 2025-02-24 07:46:54 -05:00 · 2025-02-21 08:44:33 -05:00 · 2025-02-21 08:44:24 -05:00
11 changed files with 326 additions and 50 deletions
--- a/README.md
+++ b/README.md
@@ -0,0 +1,17 @@
 ## Kleingrep
 Kleingrep is a regular expression engine, providing a library and command-line tool written in Go.
 It aims to provide a more featureful engine, compared to the one in Go's
 [regexp](https://pkg.go.dev/regexp), while retaining some semblance of efficiency.
 The engine does __not__ use backtracking, relying on the NFA-based method described in
 [Russ Cox's articles](https://swtch.com/~rsc/regexp). As such, it is immune to catastrophic backtracking.
 It also includes features not present in regexp, such as lookarounds and backreferences.
 ### Syntax
 The syntax is, for the most part, a superset of Go's regexp. A full overview of the syntax can be found [here](https://pkg.go.dev/gitea.twomorecents.org/Rockingcool/kleingrep/regex#hdr-Syntax).
 __For more information, see https://pkg.go.dev/gitea.twomorecents.org/Rockingcool/kleingrep/regex__.
--- a/cmd/main.go
+++ b/cmd/main.go
@@ -64,18 +64,30 @@ func main() {
 	// 2. Build NFA from postfix representation (Thompson's algorithm)
 	// 3. Run the string against the NFA
-	if len(flag.Args()) != 1 { // flag.Args() also strips out program name
+	if len(flag.Args()) < 1 || len(flag.Args()) > 2 { // flag.Args() also strips out program name
 		fmt.Println("ERROR: Missing cmdline args")
 		os.Exit(22)
 	}
 	var re string
 	re = flag.Args()[0]
 	var inputFile *os.File
 	if len(flag.Args()) == 1 || flag.Args()[1] == "-" { // Either no file argument, or file argument is "-"
 		inputFile = os.Stdin
 	} else {
 		var err error
 		inputFile, err = os.Open(flag.Args()[1])
 		if err != nil {
 			fmt.Printf("%s: No such file or directory\n", flag.Args()[1])
 			os.Exit(2)
 		}
 	}
 	var test_str string
 	var err error
 	var linesRead bool // Whether or not we have read the lines in the file
 	lineNum := 0       // Current line number
 	// Create reader for stdin and writer for stdout
-	reader := bufio.NewReader(os.Stdin)
+	reader := bufio.NewReader(inputFile)
 	out := bufio.NewWriter(os.Stdout)
 	regComp, err := reg.Compile(re, flagsToCompile...)
@@ -129,6 +141,8 @@ func main() {
 			matchIndices = regComp.FindAllSubmatch(test_str)
 		}
 		test_str_runes := []rune(test_str) // Converting to runes preserves unicode characters
 		if *printMatchesFlag {
 			// if we are in single line mode, print the line on which
 			// the matches occur
@@ -158,10 +172,10 @@ func main() {
 			oldIndices := indicesToPrint.values()
 			indicesToPrint = new_uniq_arr[int]()
 			// Explanation:
-			// Find all numbers from 0 to len(test_str) that are NOT in oldIndices.
+			// Find all numbers from 0 to len(test_str_runes) that are NOT in oldIndices.
 			// These are the values we want to print, now that we have inverted the match.
 			// Re-initialize indicesToPrint and add all of these values to it.
-			indicesToPrint.add(setDifference(genRange(0, len(test_str)), oldIndices)...)
+			indicesToPrint.add(setDifference(genRange(0, len(test_str_runes)), oldIndices)...)
 		}
 		// If lineFlag is enabled, we should only print something if:
@@ -182,7 +196,7 @@ func main() {
 		//			the corresponding end index.
 		// 		3. If not, just print the character.
 		if substituteFlagEnabled {
-			for i := range test_str {
+			for i := range test_str_runes {
 				inMatchIndex := false
 				for _, m := range matchIndices {
 					if i == m[0].StartIdx {
@@ -193,19 +207,21 @@ func main() {
 					}
 				}
 				if !inMatchIndex {
-					fmt.Fprintf(out, "%c", test_str[i])
+					fmt.Fprintf(out, "%c", test_str_runes[i])
 				}
 			}
 		} else {
-			for i, c := range test_str {
+			for i, c := range test_str_runes {
 				if indicesToPrint.contains(i) {
 					color.New(color.FgRed).Fprintf(out, "%c", c)
 					// Newline after every match - only if -o is enabled and -v is disabled.
 					if *onlyFlag && !(*invertFlag) {
-						for _, idx := range matchIndices {
+						for matchIdxNum, idx := range matchIndices {
-							if i+1 == idx[0].EndIdx { // End index is one more than last index of match
+							if matchIdxNum < len(matchIndices)-1 { // Only print a newline afte printing a match, if there are multiple matches on the line, and we aren't on the last one. This is because the newline that gets added at the end will take care of that.
-								fmt.Fprintf(out, "\n")
+								if i+1 == idx[0].EndIdx { // End index is one more than last index of match
-								break
+									fmt.Fprintf(out, "\n")
 									break
 								}
 							}
 						}
 					}
@@ -220,6 +236,10 @@ func main() {
 		if err != nil {
 			panic(err)
 		}
-		fmt.Println()
+		// If the last character in the string wasn't a newline, AND either -o isn't set, or it is set and we've matched something, then print a newline
 		if (len(test_str_runes) > 0 && test_str_runes[len(test_str_runes)-1] != '\n') &&
 			(!*onlyFlag || indicesToPrint.len() > 0) {
 			fmt.Println()
 		}
 	}
 }
--- a/cmd/unique_array.go
+++ b/cmd/unique_array.go
@@ -36,3 +36,7 @@ func (s uniq_arr[T]) values() []T {
 	}
 	return toRet
 }
 // len returns the number of entries currently stored in the backing map of s.
 func (s uniq_arr[T]) len() int {
 	count := len(s.backingMap)
 	return count
 }
--- a/regex/compile.go
+++ b/regex/compile.go
@@ -64,7 +64,7 @@ const (
 )
 func isOperator(c rune) bool {
-	if c == '+' || c == '?' || c == '*' || c == '|' || c == concatRune {
+	if c == '+' || c == '?' || c == '*' || c == '|' || c == concatRune || c == lazyPlusRune || c == lazyKleeneRune || c == lazyQuestionRune {
 		return true
 	}
 	return false
@@ -72,7 +72,7 @@ func isOperator(c rune) bool {
 /* priority returns the priority of the given operator */
 func priority(op rune) int {
-	precedence := []rune{'|', concatRune, '+', '*', '?'}
+	precedence := []rune{'|', concatRune, '+', lazyPlusRune, '*', lazyKleeneRune, '?', lazyQuestionRune}
 	return slices.Index(precedence, op)
 }
@@ -108,6 +108,48 @@ func getPOSIXClass(str []rune) (bool, string) {
 	return true, rtv
 }
 // isUnicodeCharClassLetter reports whether c is one of the single-letter names
 // accepted for a unicode character class: L, M, S, N, P, C or Z (upper-case only).
 func isUnicodeCharClassLetter(c rune) bool {
 	classLetters := []rune{'L', 'M', 'S', 'N', 'P', 'C', 'Z'}
 	position := slices.Index(classLetters, c)
 	return position != -1
 }
 // rangeTableToRuneSlice expands the given range table into a flat slice holding
 // every code point it describes: the 16-bit ranges first, then the 32-bit ranges,
 // each in table order. It returns nil if the table contains no ranges.
 //
 // The loop counters are deliberately wider than the uint16/uint32 fields of the
 // table. Iterating in the field's own type wraps around to 0 when a range ends at
 // the type's maximum value (e.g. a Range16 with Hi == 0xFFFF), which makes the
 // 'c <= Hi' condition permanently true and the loop never terminate.
 func rangeTableToRuneSlice(rangetable *unicode.RangeTable) []rune {
 	var rtv []rune
 	for _, r := range rangetable.R16 {
 		hi, stride := int(r.Hi), int(r.Stride)
 		for c := int(r.Lo); c <= hi; c += stride {
 			rtv = append(rtv, rune(c))
 		}
 	}
 	for _, r := range rangetable.R32 {
 		hi, stride := int64(r.Hi), int64(r.Stride)
 		for c := int64(r.Lo); c <= hi; c += stride {
 			rtv = append(rtv, rune(c))
 		}
 	}
 	return rtv
 }
 // unicodeCharClassToRange converts the given unicode character class name into a list of characters in that class.
 // The name may be a general category (eg. 'C' or 'Lu') or a script (eg. 'Greek').
 //
 // Categories are tried first and scripts second, regardless of the length of the name. Choosing
 // the table from the length alone rejects two-letter script names such as 'Yi', which only exist
 // in unicode.Scripts.
 //
 // An error is returned if the name is empty, or if it is neither a known category nor a known script.
 func unicodeCharClassToRange(class string) ([]rune, error) {
 	if len(class) == 0 {
 		return nil, fmt.Errorf("empty unicode character class")
 	}
 	if rangeTable, ok := unicode.Categories[class]; ok {
 		return rangeTableToRuneSlice(rangeTable), nil
 	}
 	if rangeTable, ok := unicode.Scripts[class]; ok {
 		return rangeTableToRuneSlice(rangeTable), nil
 	}
 	// The short/long wording of the error is kept as-is, based on the length of the name.
 	if len(class) <= 2 {
 		return nil, fmt.Errorf("invalid short unicode character class")
 	}
 	return nil, fmt.Errorf("invalid long unicode character class")
 }
 // Stores whether the case-insensitive flag has been enabled.
 var caseInsensitive bool
@@ -166,9 +208,6 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
 	//		metacharacter. Later, in thompson(), these will be converted back. This avoids
 	//		confusion in detecting whether a character is escaped eg. detecting
 	// 		whether '\\[a]' has an escaped opening bracket (it doesn't).
 	//
 	// 	5. 	Check for non-greedy operators. These are not supported at the moment, so an error
 	// 		must be thrown if the user attempts to use a non-greedy operator.
 	for i := 0; i < len(re_runes_orig); i++ {
 		c := re_runes_orig[i]
 		if c == '<' && (i == 0 || (re_runes_orig[i-1] != '\\' && re_runes_orig[i-1] != '?')) {
@@ -215,8 +254,16 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
 		} else if c == ']' && (i == 0 || re_runes[len(re_runes)-1] != '\\') {
 			re_runes = append(re_runes, rbracketRune)
 			continue
-		} else if slices.Contains([]rune{'+', '*', '?'}, c) && (i < len(re_runes_orig)-1 && re_runes_orig[i+1] == '?') {
+		} else if slices.Contains([]rune{'+', '*', '?'}, c) && (i > 0 && re_runes_orig[i-1] != '\\') && (i < len(re_runes_orig)-1 && re_runes_orig[i+1] == '?') {
-			return nil, fmt.Errorf("non-greedy operators are not supported")
+			switch c {
 			case '+':
 				re_runes = append(re_runes, lazyPlusRune)
 			case '*':
 				re_runes = append(re_runes, lazyKleeneRune)
 			case '?':
 				re_runes = append(re_runes, lazyQuestionRune)
 			}
 			i++
 		} else {
 			re_runes = append(re_runes, c)
 		}
@@ -309,10 +356,30 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
 					}
 				} else if isHex(re_runes[i]) {
 					re_postfix = append(re_postfix, re_runes[i:i+2]...)
-					i += 2
+					i += 1 // I don't skip forward 2 steps, because the second step will happen with the loop increment
 				} else {
 					return nil, fmt.Errorf("invalid hex value in expression")
 				}
 			} else if re_runes[i] == 'p' || re_runes[i] == 'P' { // Unicode character class (P is negated unicode charclass)
 				re_postfix = append(re_postfix, re_runes[i])
 				i++
 				if i >= len(re_runes) {
 					return nil, fmt.Errorf("error parsing unicode character class in expression")
 				}
 				if re_runes[i] == '{' { // Full name charclass
 					for re_runes[i] != '}' {
 						re_postfix = append(re_postfix, re_runes[i])
 						i++
 					}
 					re_postfix = append(re_postfix, re_runes[i])
 					i++
 				} else if isUnicodeCharClassLetter(re_runes[i]) {
 					re_postfix = append(re_postfix, re_runes[i])
 					i++
 				} else {
 					return nil, fmt.Errorf("error parsing unicode character class in expression")
 				}
 				i-- // The loop increment at the top will move us forward
 			} else if re_runes[i] == '0' { // Start of octal value
 				numDigits := 1
 				for i+numDigits < len(re_runes) && numDigits < 4 && isOctal(re_runes[i+numDigits]) { // Skip while we see an octal character (max of 4, starting with 0)
@@ -343,10 +410,10 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
 				if i >= len(re_runes) {
 					return nil, fmt.Errorf("unclosed lookaround")
 				}
-				if re_runes[i] == '(' || re_runes[i] == nonCapLparenRune {
+				if (re_runes[i] == '(' && re_runes[i-1] != '\\') || re_runes[i] == nonCapLparenRune {
 					numOpenParens++
 				}
-				if re_runes[i] == ')' {
+				if re_runes[i] == ')' && re_runes[i-1] != '\\' {
 					numOpenParens--
 					if numOpenParens == 0 {
 						break
@@ -359,7 +426,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
 		}
 		if i < len(re_runes) && (re_runes[i] != '(' && re_runes[i] != nonCapLparenRune && re_runes[i] != '|' && re_runes[i] != '\\') || (i > 0 && re_runes[i-1] == '\\') { // Every character should be concatenated if it is escaped
 			if i < len(re_runes)-1 {
-				if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != '+' && re_runes[i+1] != '?' && re_runes[i+1] != ')' && re_runes[i+1] != '{' {
+				if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != lazyKleeneRune && re_runes[i+1] != '+' && re_runes[i+1] != lazyPlusRune && re_runes[i+1] != '?' && re_runes[i+1] != lazyQuestionRune && re_runes[i+1] != ')' && re_runes[i+1] != '{' {
 					re_postfix = append(re_postfix, concatRune)
 				}
 			}
@@ -429,6 +496,39 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
 				} else {
 					return nil, fmt.Errorf("not enough hex characters found in expression")
 				}
 			} else if re_postfix[i] == 'p' || re_postfix[i] == 'P' {
 				charClassInverted := (re_postfix[i] == 'P')
 				var charsInClass []rune
 				i++
 				if isUnicodeCharClassLetter(re_postfix[i]) {
 					var err error
 					charsInClass, err = unicodeCharClassToRange(string(re_postfix[i]))
 					if err != nil {
 						return nil, err
 					}
 				} else if re_postfix[i] == '{' {
 					i++ // Skip opening bracket
 					unicodeCharClassStr := ""
 					for re_postfix[i] != '}' {
 						unicodeCharClassStr += string(re_postfix[i])
 						i++
 					}
 					var err error
 					charsInClass, err = unicodeCharClassToRange(unicodeCharClassStr)
 					if err != nil {
 						return nil, err
 					}
 				} else {
 					return nil, fmt.Errorf("error parsing unicode character class in expression")
 				}
 				var toAppend postfixNode
 				if !charClassInverted { // \p
 					toAppend = newPostfixNode(charsInClass...)
 				} else { // \P
 					toAppend = newPostfixDotNode()
 					toAppend.except = append([]postfixNode{}, newPostfixNode(charsInClass...))
 				}
 				outQueue = append(outQueue, toAppend)
 			} else if re_postfix[i] == '0' { // Octal value
 				var octVal int64
 				var octValStr string
@@ -489,10 +589,10 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
 				if i >= len(re_postfix) {
 					return nil, fmt.Errorf("unclosed lookaround")
 				}
-				if re_postfix[i] == '(' || re_postfix[i] == nonCapLparenRune {
+				if (re_postfix[i] == '(' && re_postfix[i-1] != '\\') || re_postfix[i] == nonCapLparenRune {
 					numOpenParens++
 				}
-				if re_postfix[i] == ')' {
+				if re_postfix[i] == ')' && re_postfix[i-1] != '\\' {
 					numOpenParens--
 					if numOpenParens == 0 {
 						break
@@ -611,7 +711,40 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
 						} else {
 							return nil, fmt.Errorf("not enough hex characters found in character class")
 						}
 					} else if re_postfix[i] == 'p' || re_postfix[i] == 'P' {
 						charClassInverted := (re_postfix[i] == 'P')
 						var charsInList []rune
 						i++
 						if isUnicodeCharClassLetter(re_postfix[i]) {
 							var err error
 							charsInList, err = unicodeCharClassToRange(string(re_postfix[i]))
 							if err != nil {
 								return nil, err
 							}
 						} else if re_postfix[i] == '{' {
 							i++ // Skip opening bracket
 							unicodeCharClassStr := ""
 							for re_postfix[i] != '}' {
 								unicodeCharClassStr += string(re_postfix[i])
 								i++
 							}
 							var err error
 							charsInList, err = unicodeCharClassToRange(unicodeCharClassStr)
 							if err != nil {
 								return nil, err
 							}
 						} else {
 							return nil, fmt.Errorf("error parsing unicode character class in expression")
 						}
 						if !charClassInverted {
 							chars = append(chars, newPostfixNode(charsInList...))
 						} else {
 							toAppend := newPostfixDotNode()
 							toAppend.except = append([]postfixNode{}, newPostfixNode(charsInList...))
 							chars = append(chars, toAppend)
 						}
 					} else if re_postfix[i] == '0' { // Octal value
 						var octVal int64
 						var octValStr string
 						numDigitsParsed := 0
@@ -812,6 +945,10 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
 			}
 			outQueue[idx].startReps = startRangeNum
 			outQueue[idx].endReps = endRangeNum
 			if i < len(re_postfix)-1 && re_postfix[i+1] == '?' { // lazy repitition
 				outQueue[idx].isLazy = true
 				i++
 			}
 		}
 		if c == '(' || c == nonCapLparenRune {
 			opStack = append(opStack, c)
@@ -1105,6 +1242,9 @@ func thompson(re []postfixNode) (Reg, error) {
 			if err != nil {
 				return Reg{}, err
 			}
 			if c.isLazy {
 				stateToAdd.isLazy = true
 			}
 			nfa = append(nfa, stateToAdd)
 		case plusNode: // a+ is equivalent to aa*
 			s1 := mustPop(&nfa)
@@ -1112,6 +1252,9 @@ func thompson(re []postfixNode) (Reg, error) {
 			if err != nil {
 				return Reg{}, err
 			}
 			if c.isLazy {
 				s2.isLazy = true
 			}
 			s1 = concatenate(s1, s2)
 			nfa = append(nfa, s1)
 		case questionNode: // ab? is equivalent to a(b|)
@@ -1123,6 +1266,9 @@ func thompson(re []postfixNode) (Reg, error) {
 			if err != nil {
 				return Reg{}, err
 			}
 			if c.isLazy {
 				s2.isLazy = true
 			}
 			nfa = append(nfa, s2)
 		case pipeNode:
 			// A pipe operator doesn't actually need either operand to be present. If an operand isn't present,
@@ -1178,6 +1324,9 @@ func thompson(re []postfixNode) (Reg, error) {
 				if err != nil {
 					return Reg{}, err
 				}
 				if c.isLazy {
 					s2.isLazy = true
 				}
 				stateToAdd = concatenate(stateToAdd, s2)
 			} else { // Case 2
 				for i := c.startReps; i < c.endReps; i++ {
@@ -1185,6 +1334,9 @@ func thompson(re []postfixNode) (Reg, error) {
 					if err != nil {
 						return Reg{}, fmt.Errorf("error processing bounded repetition")
 					}
 					if c.isLazy {
 						tmp.isLazy = true
 					}
 					stateToAdd = concatenate(stateToAdd, tmp)
 				}
 			}
--- a/regex/doc.go
+++ b/regex/doc.go
@@ -60,14 +60,24 @@ Composition:
 	x|y				Match x or y (prefer x)
 	xy|z			Match xy or z (prefer xy)
-Repitition (always greedy, preferring more):
+Repetition:
-	x*				Match x zero or more times
+	Greedy:
-	x+				Match x one or more times
+	x*				Match x zero or more times, prefer more
-	x?				Match x zero or one time
+	x+				Match x one or more times, prefer more
-	x{m,n}			Match x between m and n times (inclusive)
+	x?				Match x zero or one time, prefer one
-	x{m,}			Match x atleast m times
+	x{m,n}			Match x between m and n times (inclusive), prefer more
-	x{,n}			Match x between 0 and n times (inclusive)
+	x{m,}			Match x at least m times, prefer more
 	x{,n}			Match x between 0 and n times (inclusive), prefer more
 	x{m}			Match x exactly m times
 	Lazy:
 	x*?				Match x zero or more times, prefer fewer
 	x+?				Match x one or more times, prefer fewer
 	x??				Match x zero or one time, prefer zero
 	x{m,n}?			Match x between m and n times (inclusive), prefer fewer
 	x{m,}?			Match x at least m times, prefer fewer
 	x{,n}?			Match x between 0 and n times (inclusive), prefer fewer
 	x{m}			Match x exactly m times
 Grouping:
@@ -107,17 +117,13 @@ Numeric ranges:
 The engine and the API differ from [regexp] in a few ways, some of them very subtle.
 The key differences are mentioned below.
-1. Greediness:
+1. Byte-slices and runes:
 This engine currently does not support non-greedy operators.
 2. Byte-slices and runes:
 My engine does not support byte-slices. When a matching function receives a string, it converts it into a
 rune-slice to iterate through it. While this has some space overhead, the convenience of built-in unicode
 support made the tradeoff worth it.
-3. Return values
+2. Return values
 Rather than using primitives for return values, my engine defines two types that are used as return
 values: a [Group] represents a capturing group, and a [Match] represents a list of groups.
@@ -152,10 +158,10 @@ returns the 0-group.
 The following features from [regexp] are (currently) NOT supported:
 1. Named capturing groups
- 2. Non-greedy operators
+ 2. Negated POSIX classes
- 3. Unicode character classes
+ 3. Embedded flags (flags are instead passed as arguments to [Compile])
- 4. Embedded flags (flags are instead passed as arguments to [Compile])
+ 4. Literal text with \Q ... \E
- 5. Literal text with \Q ... \E
+ 5. Finite repetition with no start (defaulting at 0)
 The following features are not available in [regexp], but are supported in my engine:
 1. Lookarounds
--- a/regex/matching.go
+++ b/regex/matching.go
@@ -234,14 +234,14 @@ func addStateToList(str []rune, idx int, list []nfaState, state nfaState, thread
 	}
 	visited = append(visited, state)
-	if state.isKleene || state.isQuestion {
+	if (state.isKleene || state.isQuestion) && (state.isLazy == false) { // Greedy quantifiers
 		copyThread(state.splitState, state)
 		list := addStateToList(str, idx, list, *state.splitState, threadGroups, visited, preferLongest)
 		copyThread(state.next, state)
 		list = addStateToList(str, idx, list, *state.next, threadGroups, visited, preferLongest)
 		return list
 	}
-	if state.isAlternation {
+	if state.isAlternation || ((state.isKleene || state.isQuestion) && state.isLazy) { // Alternation or lazy quantifier
 		copyThread(state.next, state)
 		list := addStateToList(str, idx, list, *state.next, threadGroups, visited, preferLongest)
 		copyThread(state.splitState, state)
--- a/regex/misc.go
+++ b/regex/misc.go
@@ -16,8 +16,11 @@ var rparenRune rune = 0xF0006
 var nonCapLparenRune rune = 0xF0007 // Represents a non-capturing group's LPAREN
 var escBackslashRune rune = 0xF0008 // Represents an escaped backslash
 var charRangeRune rune = 0xF0009    // Represents a character range
 var lazyKleeneRune rune = 0xF000A   // Represents a lazy kleene star
 var lazyPlusRune rune = 0xF000B     // Represents a lazy plus operator
 var lazyQuestionRune rune = 0xF000C // Represents a lazy question operator
-var specialChars = []rune{'?', '*', '\\', '^', '$', '{', '}', '(', ')', '[', ']', '+', '|', '.', concatRune, '<', '>', lbracketRune, rbracketRune, nonCapLparenRune}
+var specialChars = []rune{'?', lazyQuestionRune, '*', lazyKleeneRune, '\\', '^', '$', '{', '}', '(', ')', '[', ']', '+', lazyPlusRune, '|', '.', concatRune, '<', '>', lbracketRune, rbracketRune, nonCapLparenRune}
 // An interface for int and rune, which are identical
 type character interface {
--- a/regex/nfa.go
+++ b/regex/nfa.go
@@ -34,6 +34,7 @@ type nfaState struct {
 	isKleene                   bool       // Identifies whether current node is a 0-state representing Kleene star
 	isQuestion                 bool       // Identifies whether current node is a 0-state representing the question operator
 	isAlternation              bool       // Identifies whether current node is a 0-state representing an alternation
 	isLazy                     bool       // Only for split states - Identifies whether or not to flip the order of branches (try one branch before the other)
 	splitState                 *nfaState  // Only for alternation states - the 'other' branch of the alternation ('next' is the first)
 	assert                     assertType // Type of assertion of current node - NONE means that the node doesn't assert anything
 	allChars                   bool       // Whether or not the state represents all characters (eg. a 'dot' metacharacter). A 'dot' node doesn't store any contents directly, as it would take up too much space
@@ -44,11 +45,11 @@ type nfaState struct {
 	groupBegin                 bool       // Whether or not the node starts a capturing group
 	groupEnd                   bool       // Whether or not the node ends a capturing group
 	groupNum                   int        // Which capturing group the node starts / ends
 	isBackreference            bool       // Whether or not current node is backreference
 	referredGroup              int        // If current node is a backreference, the node that it points to
 	// The following properties depend on the current match - I should think about resetting them for every match.
-	threadGroups    []Group // Assuming that a state is part of a 'thread' in the matching process, this array stores the indices of capturing groups in the current thread. As matches are found for this state, its groups will be copied over.
+	threadGroups  []Group // Assuming that a state is part of a 'thread' in the matching process, this array stores the indices of capturing groups in the current thread. As matches are found for this state, its groups will be copied over.
-	isBackreference bool    // Whether or not current node is backreference
+	threadBackref int     // If current node is a backreference, how many characters to look forward into the referred group
 	referredGroup   int     // If current node is a backreference, the node that it points to
 	threadBackref   int     // If current node is a backreference, how many characters to look forward into the referred group
 }
 // Clones the NFA starting from the given state.
@@ -77,6 +78,7 @@ func cloneStateHelper(stateToClone *nfaState, cloneMap map[*nfaState]*nfaState)
 		isKleene:        stateToClone.isKleene,
 		isQuestion:      stateToClone.isQuestion,
 		isAlternation:   stateToClone.isAlternation,
 		isLazy:          stateToClone.isLazy,
 		assert:          stateToClone.assert,
 		allChars:        stateToClone.allChars,
 		except:          append([]rune{}, stateToClone.except...),
@@ -84,6 +86,8 @@ func cloneStateHelper(stateToClone *nfaState, cloneMap map[*nfaState]*nfaState)
 		groupEnd:        stateToClone.groupEnd,
 		groupBegin:      stateToClone.groupBegin,
 		groupNum:        stateToClone.groupNum,
 		isBackreference: stateToClone.isBackreference,
 		referredGroup:   stateToClone.referredGroup,
 	}
 	cloneMap[stateToClone] = clone
 	for i, s := range stateToClone.output {
@@ -421,6 +425,7 @@ func (s nfaState) equals(other nfaState) bool {
 		s.next == other.next &&
 		s.isKleene == other.isKleene &&
 		s.isQuestion == other.isQuestion &&
 		s.isLazy == other.isLazy &&
 		s.isAlternation == other.isAlternation &&
 		s.splitState == other.splitState &&
 		s.assert == other.assert &&
--- a/regex/postfixNode.go
+++ b/regex/postfixNode.go
@@ -44,6 +44,7 @@ type postfixNode struct {
 	lookaroundDir   int           // Lookbehind or lookahead
 	nodeContents    []postfixNode // ONLY USED WHEN nodetype == CHARCLASS. Holds all the nodes inside the given CHARCLASS node.
 	referencedGroup int           // ONLY USED WHEN nodetype == backreferenceNode. Holds the group which this one refers to. After parsing is done, the expression will be rewritten eg. (a)\1 will become (a)(a). So the return value of ShuntingYard() shouldn't contain a backreferenceNode.
 	isLazy          bool          // ONLY USED WHEN nodetype == kleene or question
 }
 // Converts the given list of postfixNodes to one node of type CHARCLASS.
@@ -162,10 +163,19 @@ func newPostfixNode(contents ...rune) postfixNode {
 		switch contents[0] {
 		case '+':
 			to_return.nodetype = plusNode
 		case lazyPlusRune:
 			to_return.nodetype = plusNode
 			to_return.isLazy = true
 		case '?':
 			to_return.nodetype = questionNode
 		case lazyQuestionRune:
 			to_return.nodetype = questionNode
 			to_return.isLazy = true
 		case '*':
 			to_return.nodetype = kleeneNode
 		case lazyKleeneRune:
 			to_return.nodetype = kleeneNode
 			to_return.isLazy = true
 		case '|':
 			to_return.nodetype = pipeNode
 		case concatRune:
--- a/regex/re_test.go
+++ b/regex/re_test.go
@@ -117,6 +117,7 @@ var reTests = []struct {
 	{`\d{3,4}`, nil, "ababab555", []Group{{6, 9}}},
 	{`\bpaint\b`, nil, "paints", []Group{}},
 	{`\b\w{5}\b`, nil, "paint", []Group{{0, 5}}},
 	{`\w{}`, nil, "test", nil},
 	{`[^\w]`, nil, "abcdef1230[]qq';;'", []Group{{10, 11}, {11, 12}, {14, 15}, {15, 16}, {16, 17}, {17, 18}}},
 	{`[^\W]`, nil, "abcdef1230[]qq';;'", []Group{{0, 1}, {1, 2}, {2, 3}, {3, 4}, {4, 5}, {5, 6}, {6, 7}, {7, 8}, {8, 9}, {9, 10}, {12, 13}, {13, 14}}},
 	{`[\[\]]`, nil, "a[b[l]]", []Group{{1, 2}, {3, 4}, {5, 6}, {6, 7}}},
@@ -430,6 +431,7 @@ var reTests = []struct {
 	{`^(.+)?B`, []ReFlag{RE_CASE_INSENSITIVE}, `ab`, []Group{{0, 2}}},
 	{`\0009`, []ReFlag{RE_CASE_INSENSITIVE}, "\x009", []Group{{0, 2}}},
 	{`\0141`, []ReFlag{RE_CASE_INSENSITIVE}, "A", []Group{{0, 1}}},
 	{`\0141\0141`, []ReFlag{RE_CASE_INSENSITIVE}, "AA", []Group{{0, 2}}},
 	{`a[-]?c`, []ReFlag{RE_CASE_INSENSITIVE}, `AC`, []Group{{0, 2}}},
@@ -460,8 +462,10 @@ var reTests = []struct {
 	{`[\D5]+`, nil, `1234abc5678`, []Group{{4, 8}}},
 	{`[\da-fA-F]+`, nil, `123abc`, []Group{{0, 6}}},
 	{`\xff`, nil, "\u00ff", []Group{{0, 1}}},
 	{`\xff+`, nil, "\u00ff\u00ff", []Group{{0, 2}}},
 	{`\xFF`, nil, "\u00ff", []Group{{0, 1}}},
 	{`\x00ff`, nil, "\u00ff", []Group{}},
 	{`\x{0000ff}+`, nil, "\u00ff\u00ff", []Group{{0, 2}}},
 	{`\x{0000ff}`, nil, "\u00ff", []Group{{0, 1}}},
 	{`\x{0000FF}`, nil, "\u00ff", []Group{{0, 1}}},
 	{"\t\n\v\r\f\a", nil, "\t\n\v\r\f\a", []Group{{0, 6}}},
@@ -485,7 +489,25 @@ var reTests = []struct {
 	{`[b-e]`, nil, `f`, []Group{}},
 	{`*?`, nil, `-`, nil},
-	{`a*?`, nil, `-`, nil}, // non-greedy operators are not supported
+	{`a.+c`, nil, `abcabc`, []Group{{0, 6}}},
 	// Lazy quantifier tests
 	{`a.+?c`, nil, `abcabc`, []Group{{0, 3}, {3, 6}}},
 	{`ab*?bc`, []ReFlag{RE_CASE_INSENSITIVE}, `ABBBBC`, []Group{{0, 6}}},
 	{`ab+?bc`, []ReFlag{RE_CASE_INSENSITIVE}, `ABBC`, []Group{{0, 4}}},
 	{`ab??bc`, []ReFlag{RE_CASE_INSENSITIVE}, `ABBC`, []Group{{0, 4}}},
 	{`ab??bc`, []ReFlag{RE_CASE_INSENSITIVE}, `ABC`, []Group{{0, 3}}},
 	{`ab??bc`, []ReFlag{RE_CASE_INSENSITIVE}, `ABBBBC`, []Group{}},
 	{`ab??c`, []ReFlag{RE_CASE_INSENSITIVE}, `ABC`, []Group{{0, 3}}},
 	{`a.*?c`, []ReFlag{RE_CASE_INSENSITIVE}, `AXYZC`, []Group{{0, 5}}},
 	{`a.+?c`, []ReFlag{RE_CASE_INSENSITIVE}, `ABCABC`, []Group{{0, 3}, {3, 6}}},
 	{`a.*?c`, []ReFlag{RE_CASE_INSENSITIVE}, `ABCABC`, []Group{{0, 3}, {3, 6}}},
 	{`.*?\S *:`, nil, `xx:`, []Group{{0, 3}}},
 	{`a[ ]*? (\d+).*`, nil, `a   10`, []Group{{0, 6}}},
 	{`a[ ]*? (\d+).*`, nil, `a    10`, []Group{{0, 7}}},
 	{`"(?:\\"|[^"])*?"`, nil, `"\""`, []Group{{0, 4}}},
 	{`^.*?$`, nil, "one\ntwo\nthree", []Group{}},
 	{`a[^>]*?b`, nil, `a>b`, []Group{}},
 	{`^a*?$`, nil, `foo`, []Group{}},
 	// Numeric range tests - this is a feature that I added, and doesn't exist
 	// in any other mainstream regex engine
@@ -516,6 +538,30 @@ var reTests = []struct {
 	{`<389-400`, nil, `-`, nil},
 	{`<389-400>`, nil, `391`, []Group{{0, 3}}},
 	{`\b<1-10000>\b`, nil, `America declared independence in 1776.`, []Group{{33, 37}}},
 	{`\p{Tamil}+`, nil, `உயிரெழுத்து`, []Group{{0, 11}}}, // Each letter and matra is counted as a separate rune, so 'u', 'ya', 'e (matra), 'ra', 'e (matra)', 'zha', (oo (matra), 'tha', 'ith', 'tha', 'oo (matra)'.
 	{`\P{Tamil}+`, nil, `vowel=உயிரெழுத்து`, []Group{{0, 6}}},
 	{`\P`, nil, `உயிரெழுத்து`, nil},
 	{`\PM\pM*`, nil, `உயிரெழுத்து`, []Group{{0, 1}, {1, 3}, {3, 5}, {5, 7}, {7, 9}, {9, 11}}},
 	{`\pN+`, nil, `123abc456def`, []Group{{0, 3}, {6, 9}}},
 	{`\PN+`, nil, `123abc456def`, []Group{{3, 6}, {9, 12}}},
 	{`[\p{Greek}\p{Cyrillic}]`, nil, `ΣωШД`, []Group{{0, 1}, {1, 2}, {2, 3}, {3, 4}}},
 	{`(?<=\().*?(?=\))`, nil, `(abc)`, []Group{{1, 4}}},
 	{`((a|b)\2)`, nil, `aa`, []Group{{0, 2}}},
 	{`((a|b)\2)`, nil, `bb`, []Group{{0, 2}}},
 	{`((a|b)\2)`, nil, `ab`, []Group{}},
 	{`((a|b)\2)`, nil, `ba`, []Group{}},
 	{`((a|b)\2){3}`, nil, `aaaaaa`, []Group{{0, 6}}},
 	{`((a|b)\2){3}`, nil, `bbbbbb`, []Group{{0, 6}}},
 	{`((a|b)\2){3}`, nil, `bbaaaa`, []Group{{0, 6}}},
 	{`((a|b)\2){3}`, nil, `aabbaa`, []Group{{0, 6}}},
 	{`((a|b)\2){3}`, nil, `aaaabb`, []Group{{0, 6}}},
 	{`((a|b)\2){3}`, nil, `bbaabb`, []Group{{0, 6}}},
 	{`((a|b)\2){3}`, nil, `baabab`, []Group{}},
 	{`((a|b)\2){3}`, nil, `bbabab`, []Group{}},
 }
 var groupTests = []struct {
@@ -708,6 +754,18 @@ var groupTests = []struct {
 	// {`(a|ab|c|bcd)*(d*)`, nil, `ababcd`, []Match{[]Group{{0, 6}, {3, 6}, {6, 6}}, []Group{{6, 6}, {6, 6}, {6, 6}}}},
 	// // Bug - this should give {0,3},{0,3},{0,0},{0,3},{3,3} but it gives {0,3},{0,2},{0,1},{1,2},{2,3}
 	// //	{`((a*)(b|abc))(c*)`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 3}, {0, 0}, {0, 3}, {3, 3}}}},
 	// Lazy quantifier tests
 	{`a(?:b|c|d)+?(.)`, nil, `ace`, []Match{[]Group{{0, 3}, {2, 3}}}},
 	{`a(?:b|(c|e){1,2}?|d)+?(.)`, nil, `ace`, []Match{[]Group{{0, 3}, {1, 2}, {2, 3}}}},
 	{`(?<!-):(.*?)(?<!-):`, nil, `a:bc-:de:f`, []Match{[]Group{{1, 9}, {2, 8}}}},
 	{`(?<!\\):(.*?)(?<!\\):`, nil, `a:bc\:de:f`, []Match{[]Group{{1, 9}, {2, 8}}}},
 	{`(?<!\?)'(.*?)(?<!\?)'`, nil, `a'bc?'de'f`, []Match{[]Group{{1, 9}, {2, 8}}}},
 	{`.*?x\s*\z(.*)`, []ReFlag{RE_MULTILINE, RE_SINGLE_LINE}, "xx\nx\n", []Match{[]Group{{0, 5}, {5, 5}}}},
 	{`.*?x\s*\z(.*)`, []ReFlag{RE_MULTILINE}, "xx\nx\n", []Match{[]Group{{3, 5}, {5, 5}}}},
 	{`^([ab]*?)(?=(b)?)c`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 2}, {-1, -1}}}},
 	{`^([ab]*?)(?!(b))c`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 2}, {-1, -1}}}},
 	{`^([ab]*?)(?<!(a))c`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 2}, {-1, -1}}}},
 }
 func TestFind(t *testing.T) {
--- a/regex/todo.txt
+++ b/regex/todo.txt
@@ -4,4 +4,5 @@
 Ideas for flags:
    -m <num> : Print <num>th match (-m 1 = first match, -m 2 = second match)
    -g <num> : Print the <num>th group
    -r : Specify a directory instead of a file, reads recursively
 4. Refactor code for flags - make each flag's code a function, which modifies the result of findAllMatches
Author	SHA1	Message	Date
Aadhavan Srinivasan	e79c19a929	Updated TODO	2025-03-12 16:46:57 -04:00
Aadhavan Srinivasan	d2bce37935	Updated argument count validation	2025-03-12 16:46:05 -04:00
Aadhavan Srinivasan	bb3b866b77	Started working on file arguments - stdin is used if arg is "-"	2025-03-12 16:44:40 -04:00
Aadhavan Srinivasan	e07f27dc78	Merge branch 'master' of https://gitea.twomorecents.org/Rockingcool/kleingrep	2025-02-24 07:46:54 -05:00
Aadhavan Srinivasan	65d2317f79	Added more backreference tests	2025-02-21 08:44:33 -05:00
Aadhavan Srinivasan	a631fc289c	Clone 'isBackreference' and 'referredGroup' NFA fields, because they aren't thread variables	2025-02-21 08:44:24 -05:00
Aadhavan Srinivasan	d62a429cce	Updated documentation	2025-02-20 19:58:07 -05:00
Aadhavan Srinivasan	7b31031553	Change when a newline is printed, so that we don't print extraneous newlines	2025-02-17 09:37:31 -05:00
Aadhavan Srinivasan	38c842cb07	Added method to get length of unique array	2025-02-17 09:36:38 -05:00
Aadhavan Srinivasan	9f9af36be8	Fixed bug where escaped parentheses in lookarounds were counted as regular parentheses instead of literals	2025-02-17 09:36:17 -05:00
Aadhavan Srinivasan	8217b67122	Added test for escaped parentheses in lookarounds	2025-02-17 09:35:06 -05:00
Aadhavan Srinivasan	1f06dcef64	Just declare the variable instead of initializing it as well	2025-02-16 15:51:53 -05:00
Aadhavan Srinivasan	119475b41b	Updated README	2025-02-14 12:13:01 -05:00
Aadhavan Srinivasan	6151cc8cf6	Updated documentation	2025-02-14 12:07:43 -05:00
Aadhavan Srinivasan	3eaf4eb19c	Updated README	2025-02-14 12:00:33 -05:00
Aadhavan Srinivasan	d453815831	Added README	2025-02-14 11:59:43 -05:00
Aadhavan Srinivasan	3a2916baae	Set 'isLazy' to true in the NFA, if the postfixNode has the flag set	2025-02-14 11:37:48 -05:00
Aadhavan Srinivasan	9d6344719f	Reverse order of trying branches if the quantifier is lazy	2025-02-14 11:37:28 -05:00
Aadhavan Srinivasan	f5c868566b	Added field to NFA, denoting if a node is lazy or not	2025-02-14 11:37:14 -05:00
Aadhavan Srinivasan	1cd6da218f	Added lazy quantifier tests	2025-02-14 11:36:56 -05:00
Aadhavan Srinivasan	277cbc0fc5	Started working on lazy quantifier support	2025-02-13 20:50:30 -05:00
Aadhavan Srinivasan	3924502b72	Added code to return lazy quantifier postfixNodes	2025-02-13 20:50:11 -05:00
Aadhavan Srinivasan	36b009747b	Added metacharacters for lazy quantifiers	2025-02-13 20:49:54 -05:00
Aadhavan Srinivasan	6cd0a10a8f	Added more documentation	2025-02-13 14:14:00 -05:00
Aadhavan Srinivasan	69fb96c43d	Merge pull request 'Implement Unicode character classes' (#4) from implementUnicodeCharClass into master. Reviewed-on: #4	2025-02-13 09:51:44 -06:00
Aadhavan Srinivasan	46bc0c8529	Removed unicode character classes from 'features not supported' list	2025-02-13 10:48:23 -05:00
Aadhavan Srinivasan	1a890a1e75	Refactoring - remove duplicate code	2025-02-13 09:10:40 -05:00
Aadhavan Srinivasan	fde3784e5a	Added unicode charclass support within character classes; Fixed bugs with hex classes and unicode classes	2025-02-13 08:58:02 -05:00
Aadhavan Srinivasan	7045711860	Convert test_str into a rune slice for better unicode compatibility, it also fixed the bug where all unicode characters wouldn't be colored	2025-02-13 08:57:06 -05:00
Aadhavan Srinivasan	d4d606d95b	Added tests for unicode character classes; more tests for hex characters	2025-02-13 08:55:12 -05:00
Aadhavan Srinivasan	9cd330e521	More work on unicode character class support - fix bug where all characters aren't being matched	2025-02-12 23:04:10 -05:00
Aadhavan Srinivasan	44d6a2005c	Started working on unicode character classes	2025-02-12 22:19:30 -05:00
Aadhavan Srinivasan	f76cd6c3d9	Merge pull request 'Implement Backreferences' (#3) from implementBackreferences into master. Reviewed-on: #3	2025-02-12 21:17:32 -06:00