Started working on line number flag

Started working on multiple filename arguments; prefix each line with filename containing the line; mostly indentation changes
Updated TODO
2025-03-15 16:24:50 -04:00 · 2025-03-13 12:11:54 -04:00 · 2025-03-12 16:46:57 -04:00 · 2025-03-12 16:46:05 -04:00 · 2025-03-12 16:44:40 -04:00 · 2025-02-24 07:46:54 -05:00
8 changed files with 194 additions and 133 deletions
--- a/README.md
+++ b/README.md
@@ -2,8 +2,8 @@

 Kleingrep is a regular expression engine, providing a library and command-line tool written in Go.

-It aims to provide a more featureful engine, compared to the one in
-[Go's standard library](https://pkg.go.dev/regexp), while retaining some semblance of efficiency.
+It aims to provide a more featureful engine, compared to the one in Go's
+[regexp](https://pkg.go.dev/regexp), while retaining some semblance of efficiency.

 The engine does __not__ use backtracking, relying on the NFA-based method described in
 [Russ Cox's articles](https://swtch.com/~rsc/regexp). As such, it is immune to catastrophic backtracking.
--- a/cmd/main.go
+++ b/cmd/main.go
@@ -6,6 +6,7 @@ import (
 	"fmt"
 	"io"
 	"os"
+	"slices"

 	"github.com/fatih/color"

@@ -25,6 +26,8 @@ func main() {
 	multiLineFlag := flag.Bool("t", false, "Multi-line mode. Treats newline just like any character.")
 	printMatchesFlag := flag.Bool("p", false, "Prints start and end index of each match. Can only be used with '-t' for multi-line mode.")
 	caseInsensitiveFlag := flag.Bool("i", false, "Case-insensitive. Disregard the case of all characters.")
+	recursiveFlag := flag.Bool("r", false, "Recursively search all files in the given directory.")
+	lineNumFlag := flag.Bool("n", false, "For each line with a match, print the line number. Implies -l.")
 	matchNum := flag.Int("m", 0, "Print the match with the given index. Eg. -m 3 prints the third match.")
 	substituteText := flag.String("s", "", "Substitute the contents of each match with the given string. Overrides -o and -v")
 	flag.Parse()
@@ -64,164 +67,197 @@ func main() {
 	// 2. Build NFA from postfix representation (Thompson's algorithm)
 	// 3. Run the string against the NFA

-	if len(flag.Args()) != 1 { // flag.Args() also strips out program name
+	if len(flag.Args()) < 1 { // flag.Args() also strips out program name
+		fmt.Println("ERROR: Missing cmdline args")
+		os.Exit(22)
+	}
+	if *recursiveFlag && len(flag.Args()) < 2 { // File/Directory must be provided with '-r'
 		fmt.Println("ERROR: Missing cmdline args")
 		os.Exit(22)
 	}
 	var re string
 	re = flag.Args()[0]
+	var inputFiles []*os.File
+	if len(flag.Args()) == 1 || flag.Args()[1] == "-" { // Either no file argument, or file argument is "-"
+		if !slices.Contains(inputFiles, os.Stdin) {
+			inputFiles = append(inputFiles, os.Stdin) // os.Stdin cannot be entered more than once into the file list
+		}
+	} else {
+		inputFilenames := flag.Args()[1:]
+		for _, inputFilename := range inputFilenames {
+			inputFile, err := os.Open(inputFilename)
+			if err != nil {
+				fmt.Printf("%s: No such file or directory\n", flag.Args()[1])
+				os.Exit(2)
+			}
+			inputFiles = append(inputFiles, inputFile)
+		}
+	}
+
 	var test_str string
 	var err error
 	var linesRead bool // Whether or not we have read the lines in the file
 	lineNum := 0       // Current line number
-	// Create reader for stdin and writer for stdout
-	reader := bufio.NewReader(os.Stdin)
+	// Create writer for stdout
 	out := bufio.NewWriter(os.Stdout)
-
+	// Compile regex
 	regComp, err := reg.Compile(re, flagsToCompile...)
 	if err != nil {
 		fmt.Println(err)
 		return
 	}
-	for true {
-		if linesRead {
-			break
-		}
-		if !(*multiLineFlag) {
-			// Read every string from stdin until we encounter an error. If the error isn't EOF, panic.
-			test_str, err = reader.ReadString('\n')
-			lineNum++
-			if err != nil {
+
+	for _, inputFile := range inputFiles {
+		reader := bufio.NewReader(inputFile)
+		linesRead = false
+		for true {
+			if linesRead {
+				break
+			}
+			if !(*multiLineFlag) {
+				// Read every line from the current input until we encounter an error. If the error isn't EOF, panic.
+				test_str, err = reader.ReadString('\n')
+				lineNum++
+				if err != nil {
+					if err == io.EOF {
+						linesRead = true
+					} else {
+						panic(err)
+					}
+				}
+				if len(test_str) > 0 && test_str[len(test_str)-1] == '\n' {
+					test_str = test_str[:len(test_str)-1]
+				}
+			} else {
+				// Multi-line mode - read every line of input into a temp. string.
+				// test_str will contain all lines of input (including newline characters)
+				// as one string.
+				var temp string
+				for temp, err = reader.ReadString('\n'); err == nil; temp, err = reader.ReadString('\n') {
+					test_str += temp
+				}
+				// Assuming err != nil
 				if err == io.EOF {
+					if len(temp) > 0 {
+						test_str += temp // Add the last line (if it is non-empty)
+					}
 					linesRead = true
 				} else {
 					panic(err)
 				}
 			}
-			if len(test_str) > 0 && test_str[len(test_str)-1] == '\n' {
-				test_str = test_str[:len(test_str)-1]
-			}
-		} else {
-			// Multi-line mode - read every line of input into a temp. string.
-			// test_str will contain all lines of input (including newline characters)
-			// as one string.
-			var temp string
-			for temp, err = reader.ReadString('\n'); err == nil; temp, err = reader.ReadString('\n') {
-				test_str += temp
-			}
-			// Assuming err != nil
-			if err == io.EOF {
-				if len(temp) > 0 {
-					test_str += temp // Add the last line (if it is non-empty)
+			matchIndices := make([]reg.Match, 0)
+			if matchNumFlagEnabled {
+				tmp, err := regComp.FindNthMatch(test_str, *matchNum)
+				if err == nil {
+					matchIndices = append(matchIndices, tmp)
 				}
-				linesRead = true
 			} else {
-				panic(err)
+				matchIndices = regComp.FindAllSubmatch(test_str)
 			}
-		}
-		matchIndices := make([]reg.Match, 0)
-		if matchNumFlagEnabled {
-			tmp, err := regComp.FindNthMatch(test_str, *matchNum)
-			if err == nil {
-				matchIndices = append(matchIndices, tmp)
-			}
-		} else {
-			matchIndices = regComp.FindAllSubmatch(test_str)
-		}

-		test_str_runes := []rune(test_str) // Converting to runes preserves unicode characters
+			test_str_runes := []rune(test_str) // Converting to runes preserves unicode characters

-		if *printMatchesFlag {
-			// if we are in single line mode, print the line on which
-			// the matches occur
-			if len(matchIndices) > 0 {
-				if !(*multiLineFlag) {
-					fmt.Fprintf(out, "Line %d:\n", lineNum)
+			if *printMatchesFlag {
+				// if we are in single line mode, print the line on which
+				// the matches occur
+				if len(matchIndices) > 0 {
+					if !(*multiLineFlag) {
+						fmt.Fprintf(out, "Line %d:\n", lineNum)
+					}
+					for _, m := range matchIndices {
+						fmt.Fprintf(out, "%s\n", m.String())
+					}
+					err := out.Flush()
+					if err != nil {
+						panic(err)
+					}
 				}
-				for _, m := range matchIndices {
-					fmt.Fprintf(out, "%s\n", m.String())
-				}
-				err := out.Flush()
-				if err != nil {
-					panic(err)
-				}
-			}
-			continue
-		}
-		// Decompose the array of matchIndex structs into a flat unique array of ints - if matchIndex is {4,7}, flat array will contain 4,5,6
-		// This should make checking O(1) instead of O(n)
-		indicesToPrint := new_uniq_arr[int]()
-		for _, idx := range matchIndices {
-			indicesToPrint.add(genRange(idx[0].StartIdx, idx[0].EndIdx)...)
-		}
-		// If we are inverting, then we should print the indices which _didn't_ match
-		// in color.
-		if *invertFlag {
-			oldIndices := indicesToPrint.values()
-			indicesToPrint = new_uniq_arr[int]()
-			// Explanation:
-			// Find all numbers from 0 to len(test_str_runes) that are NOT in oldIndices.
-			// These are the values we want to print, now that we have inverted the match.
-			// Re-initialize indicesToPrint and add all of these values to it.
-			indicesToPrint.add(setDifference(genRange(0, len(test_str_runes)), oldIndices)...)
-
-		}
-		// If lineFlag is enabled, we should only print something if:
-		// 		a. We are not inverting, and have at least one match on the current line
-		// 		OR
-		// 		b. We are inverting, and have no matches at all on the current line.
-		// This checks for the inverse, and continues if it is true.
-		if *lineFlag {
-			if !(*invertFlag) && len(matchIndices) == 0 || *invertFlag && len(matchIndices) > 0 {
 				continue
 			}
-		}
+			// Decompose the array of matchIndex structs into a flat unique array of ints - if matchIndex is {4,7}, flat array will contain 4,5,6
+			// This should make checking O(1) instead of O(n)
+			indicesToPrint := new_uniq_arr[int]()
+			for _, idx := range matchIndices {
+				indicesToPrint.add(genRange(idx[0].StartIdx, idx[0].EndIdx)...)
+			}
+			// If we are inverting, then we should print the indices which _didn't_ match
+			// in color.
+			if *invertFlag {
+				oldIndices := indicesToPrint.values()
+				indicesToPrint = new_uniq_arr[int]()
+				// Explanation:
+				// Find all numbers from 0 to len(test_str_runes) that are NOT in oldIndices.
+				// These are the values we want to print, now that we have inverted the match.
+				// Re-initialize indicesToPrint and add all of these values to it.
+				indicesToPrint.add(setDifference(genRange(0, len(test_str_runes)), oldIndices)...)

-		// If we are substituting, we need a different behavior, as follows:
-		// For every character in the test string:
-		// 		1. Check if the index is the start of any matchIndex
-		// 		2. If so, print the substitute text, and set our index to
-		//			the corresponding end index.
-		// 		3. If not, just print the character.
-		if substituteFlagEnabled {
-			for i := range test_str_runes {
-				inMatchIndex := false
-				for _, m := range matchIndices {
-					if i == m[0].StartIdx {
-						fmt.Fprintf(out, "%s", *substituteText)
-						i = m[0].EndIdx
-						inMatchIndex = true
-						break
-					}
-				}
-				if !inMatchIndex {
-					fmt.Fprintf(out, "%c", test_str_runes[i])
+			}
+			// If lineFlag is enabled, we should only print something if:
+			// 		a. We are not inverting, and have at least one match on the current line
+			// 		OR
+			// 		b. We are inverting, and have no matches at all on the current line.
+			// This checks for the inverse, and continues if it is true.
+			if *lineFlag {
+				if !(*invertFlag) && len(matchIndices) == 0 || *invertFlag && len(matchIndices) > 0 {
+					continue
+				} else {
+					color.New(color.FgMagenta).Fprintf(out, "%s: ", inputFile.Name()) // Print filename
 				}
 			}
-		} else {
-			for i, c := range test_str_runes {
-				if indicesToPrint.contains(i) {
-					color.New(color.FgRed).Fprintf(out, "%c", c)
-					// Newline after every match - only if -o is enabled and -v is disabled.
-					if *onlyFlag && !(*invertFlag) {
-						for _, idx := range matchIndices {
-							if i+1 == idx[0].EndIdx { // End index is one more than last index of match
-								fmt.Fprintf(out, "\n")
-								break
-							}
+
+			// If we are substituting, we need a different behavior, as follows:
+			// For every character in the test string:
+			// 		1. Check if the index is the start of any matchIndex
+			// 		2. If so, print the substitute text, and set our index to
+			//			the corresponding end index.
+			// 		3. If not, just print the character.
+			if substituteFlagEnabled {
+				for i := range test_str_runes {
+					inMatchIndex := false
+					for _, m := range matchIndices {
+						if i == m[0].StartIdx {
+							fmt.Fprintf(out, "%s", *substituteText)
+							i = m[0].EndIdx
+							inMatchIndex = true
+							break
 						}
 					}
-				} else {
-					if !(*onlyFlag) {
-						fmt.Fprintf(out, "%c", c)
+					if !inMatchIndex {
+						fmt.Fprintf(out, "%c", test_str_runes[i])
+					}
+				}
+			} else {
+				for i, c := range test_str_runes {
+					if indicesToPrint.contains(i) {
+						color.New(color.FgRed, color.Bold).Fprintf(out, "%c", c)
+						// Newline after every match - only if -o is enabled and -v is disabled.
+						if *onlyFlag && !(*invertFlag) {
+							for matchIdxNum, idx := range matchIndices {
+								if matchIdxNum < len(matchIndices)-1 { // Only print a newline after printing a match if there are multiple matches on the line and we aren't on the last one. The newline that gets added at the end takes care of the last match.
+									if i+1 == idx[0].EndIdx { // End index is one more than last index of match
+										fmt.Fprintf(out, "\n")
+										break
+									}
+								}
+							}
+						}
+					} else {
+						if !(*onlyFlag) {
+							fmt.Fprintf(out, "%c", c)
+						}
 					}
 				}
 			}
+			err = out.Flush()
+			if err != nil {
+				panic(err)
+			}
+			// If the last character in the string wasn't a newline, AND either -o is not set, or it is set and we've matched something, then print a newline
+			if (len(test_str_runes) > 0 && test_str_runes[len(test_str_runes)-1] != '\n') &&
+				(!*onlyFlag || indicesToPrint.len() > 0) {
+				fmt.Println()
+			}
 		}
-		err = out.Flush()
-		if err != nil {
-			panic(err)
-		}
-		fmt.Println()
 	}
 }
--- a/cmd/unique_array.go
+++ b/cmd/unique_array.go
@@ -36,3 +36,7 @@ func (s uniq_arr[T]) values() []T {
 	}
 	return toRet
 }
+
+func (s uniq_arr[T]) len() int {
+	return len(s.backingMap)
+}
--- a/regex/compile.go
+++ b/regex/compile.go
@@ -410,10 +410,10 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
 				if i >= len(re_runes) {
 					return nil, fmt.Errorf("unclosed lookaround")
 				}
-				if re_runes[i] == '(' || re_runes[i] == nonCapLparenRune {
+				if (re_runes[i] == '(' && re_runes[i-1] != '\\') || re_runes[i] == nonCapLparenRune {
 					numOpenParens++
 				}
-				if re_runes[i] == ')' {
+				if re_runes[i] == ')' && re_runes[i-1] != '\\' {
 					numOpenParens--
 					if numOpenParens == 0 {
 						break
@@ -498,7 +498,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
 				}
 			} else if re_postfix[i] == 'p' || re_postfix[i] == 'P' {
 				charClassInverted := (re_postfix[i] == 'P')
-				charsInClass := []rune{}
+				var charsInClass []rune
 				i++
 				if isUnicodeCharClassLetter(re_postfix[i]) {
 					var err error
@@ -589,10 +589,10 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
 				if i >= len(re_postfix) {
 					return nil, fmt.Errorf("unclosed lookaround")
 				}
-				if re_postfix[i] == '(' || re_postfix[i] == nonCapLparenRune {
+				if (re_postfix[i] == '(' && re_postfix[i-1] != '\\') || re_postfix[i] == nonCapLparenRune {
 					numOpenParens++
 				}
-				if re_postfix[i] == ')' {
+				if re_postfix[i] == ')' && re_postfix[i-1] != '\\' {
 					numOpenParens--
 					if numOpenParens == 0 {
 						break
@@ -713,7 +713,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
 						}
 					} else if re_postfix[i] == 'p' || re_postfix[i] == 'P' {
 						charClassInverted := (re_postfix[i] == 'P')
-						charsInList := []rune{}
+						var charsInList []rune
 						i++
 						if isUnicodeCharClassLetter(re_postfix[i]) {
 							var err error
--- a/regex/doc.go
+++ b/regex/doc.go
@@ -161,6 +161,7 @@ The following features from [regexp] are (currently) NOT supported:
 2. Negated POSIX classes
 3. Embedded flags (flags are instead passed as arguments to [Compile])
 4. Literal text with \Q ... \E
+ 5. Finite repetition with no start (defaulting at 0)

 The following features are not available in [regexp], but are supported in my engine:
 1. Lookarounds
--- a/regex/nfa.go
+++ b/regex/nfa.go
@@ -45,11 +45,11 @@ type nfaState struct {
 	groupBegin                 bool       // Whether or not the node starts a capturing group
 	groupEnd                   bool       // Whether or not the node ends a capturing group
 	groupNum                   int        // Which capturing group the node starts / ends
+	isBackreference            bool       // Whether or not current node is backreference
+	referredGroup              int        // If current node is a backreference, the node that it points to
 	// The following properties depend on the current match - I should think about resetting them for every match.
-	threadGroups    []Group // Assuming that a state is part of a 'thread' in the matching process, this array stores the indices of capturing groups in the current thread. As matches are found for this state, its groups will be copied over.
-	isBackreference bool    // Whether or not current node is backreference
-	referredGroup   int     // If current node is a backreference, the node that it points to
-	threadBackref   int     // If current node is a backreference, how many characters to look forward into the referred group
+	threadGroups  []Group // Assuming that a state is part of a 'thread' in the matching process, this array stores the indices of capturing groups in the current thread. As matches are found for this state, its groups will be copied over.
+	threadBackref int     // If current node is a backreference, how many characters to look forward into the referred group
 }

 // Clones the NFA starting from the given state.
@@ -86,6 +86,8 @@ func cloneStateHelper(stateToClone *nfaState, cloneMap map[*nfaState]*nfaState)
 		groupEnd:        stateToClone.groupEnd,
 		groupBegin:      stateToClone.groupBegin,
 		groupNum:        stateToClone.groupNum,
+		isBackreference: stateToClone.isBackreference,
+		referredGroup:   stateToClone.referredGroup,
 	}
 	cloneMap[stateToClone] = clone
 	for i, s := range stateToClone.output {
--- a/regex/re_test.go
+++ b/regex/re_test.go
@@ -117,6 +117,7 @@ var reTests = []struct {
 	{`\d{3,4}`, nil, "ababab555", []Group{{6, 9}}},
 	{`\bpaint\b`, nil, "paints", []Group{}},
 	{`\b\w{5}\b`, nil, "paint", []Group{{0, 5}}},
+	{`\w{}`, nil, "test", nil},
 	{`[^\w]`, nil, "abcdef1230[]qq';;'", []Group{{10, 11}, {11, 12}, {14, 15}, {15, 16}, {16, 17}, {17, 18}}},
 	{`[^\W]`, nil, "abcdef1230[]qq';;'", []Group{{0, 1}, {1, 2}, {2, 3}, {3, 4}, {4, 5}, {5, 6}, {6, 7}, {7, 8}, {8, 9}, {9, 10}, {12, 13}, {13, 14}}},
 	{`[\[\]]`, nil, "a[b[l]]", []Group{{1, 2}, {3, 4}, {5, 6}, {6, 7}}},
@@ -545,6 +546,22 @@ var reTests = []struct {
 	{`\pN+`, nil, `123abc456def`, []Group{{0, 3}, {6, 9}}},
 	{`\PN+`, nil, `123abc456def`, []Group{{3, 6}, {9, 12}}},
 	{`[\p{Greek}\p{Cyrillic}]`, nil, `ΣωШД`, []Group{{0, 1}, {1, 2}, {2, 3}, {3, 4}}},
+
+	{`(?<=\().*?(?=\))`, nil, `(abc)`, []Group{{1, 4}}},
+
+	{`((a|b)\2)`, nil, `aa`, []Group{{0, 2}}},
+	{`((a|b)\2)`, nil, `bb`, []Group{{0, 2}}},
+	{`((a|b)\2)`, nil, `ab`, []Group{}},
+	{`((a|b)\2)`, nil, `ba`, []Group{}},
+
+	{`((a|b)\2){3}`, nil, `aaaaaa`, []Group{{0, 6}}},
+	{`((a|b)\2){3}`, nil, `bbbbbb`, []Group{{0, 6}}},
+	{`((a|b)\2){3}`, nil, `bbaaaa`, []Group{{0, 6}}},
+	{`((a|b)\2){3}`, nil, `aabbaa`, []Group{{0, 6}}},
+	{`((a|b)\2){3}`, nil, `aaaabb`, []Group{{0, 6}}},
+	{`((a|b)\2){3}`, nil, `bbaabb`, []Group{{0, 6}}},
+	{`((a|b)\2){3}`, nil, `baabab`, []Group{}},
+	{`((a|b)\2){3}`, nil, `bbabab`, []Group{}},
 }

 var groupTests = []struct {
--- a/regex/todo.txt
+++ b/regex/todo.txt
@@ -4,4 +4,5 @@
 Ideas for flags:
    -m <num> : Print <num>th match (-m 1 = first match, -m 2 = second match)
    -g <num> : Print the <num>th group
+    -r : Specify a directory instead of a file, reads recursively
 4. Refactor code for flags - make each flag's code a function, which modifies the result of findAllMatches
Author	SHA1	Message	Date
Aadhavan Srinivasan	e489dc4c27	Started working on line number flag	2025-03-15 16:24:50 -04:00
Aadhavan Srinivasan	34149980a4	Started working on multiple filename arguments; prefix each line with filename containing the line; mostly indentation changes	2025-03-13 12:11:54 -04:00
Aadhavan Srinivasan	e79c19a929	Updated TODO	2025-03-12 16:46:57 -04:00
Aadhavan Srinivasan	d2bce37935	Updated argument count validation	2025-03-12 16:46:05 -04:00
Aadhavan Srinivasan	bb3b866b77	Started working on file arguments - stdin is used if arg is "-"	2025-03-12 16:44:40 -04:00
Aadhavan Srinivasan	e07f27dc78	Merge branch 'master' of https://gitea.twomorecents.org/Rockingcool/kleingrep	2025-02-24 07:46:54 -05:00
Aadhavan Srinivasan	65d2317f79	Added more backreference tests	2025-02-21 08:44:33 -05:00
Aadhavan Srinivasan	a631fc289c	Clone 'isBackreference' and 'referredGroup' NFA fields, because they aren't thread variables	2025-02-21 08:44:24 -05:00
Aadhavan Srinivasan	d62a429cce	Updated documentation	2025-02-20 19:58:07 -05:00
Aadhavan Srinivasan	7b31031553	Change when a newline is printed; so that we don't print extraneous newlines	2025-02-17 09:37:31 -05:00
Aadhavan Srinivasan	38c842cb07	Added method to get length of unique array	2025-02-17 09:36:38 -05:00
Aadhavan Srinivasan	9f9af36be8	Fixed bug where escaped parentheses in lookarounds were counted as regular parentheses instead of literals	2025-02-17 09:36:17 -05:00
Aadhavan Srinivasan	8217b67122	Added test for escaped parentheses in lookarounds	2025-02-17 09:35:06 -05:00
Aadhavan Srinivasan	1f06dcef64	Just declare the variable instead of initializing it as well	2025-02-16 15:51:53 -05:00
Aadhavan Srinivasan	119475b41b	Updated README	2025-02-14 12:13:01 -05:00