Started working on line number flag

Started working on multiple filename arguments; prefix each line with filename containing the line; mostly indentation changes
Updated TODO
2025-03-15 16:24:50 -04:00 · 2025-03-13 12:11:54 -04:00 · 2025-03-12 16:46:57 -04:00 · 2025-03-12 16:46:05 -04:00 · 2025-03-12 16:44:40 -04:00 · 2025-02-24 07:46:54 -05:00
8 changed files with 194 additions and 133 deletions
--- a/README.md
+++ b/README.md
@@ -2,8 +2,8 @@

 Kleingrep is a regular expression engine, providing a library and command-line tool written in Go.

-It aims to provide a more featureful engine, compared to the one in
-[Go's standard library](https://pkg.go.dev/regexp), while retaining some semblance of efficiency.
+It aims to provide a more featureful engine, compared to the one in Go's
+[regexp](https://pkg.go.dev/regexp), while retaining some semblance of efficiency.

 The engine does __not__ use backtracking, relying on the NFA-based method described in
 [Russ Cox's articles](https://swtch.com/~rsc/regexp). As such, it is immune to catastrophic backtracking.
--- a/cmd/main.go
+++ b/cmd/main.go
@@ -6,6 +6,7 @@ import (
 	"fmt"
 	"io"
 	"os"
+	"slices"

 	"github.com/fatih/color"

@@ -25,6 +26,8 @@ func main() {
 	multiLineFlag := flag.Bool("t", false, "Multi-line mode. Treats newline just like any character.")
 	printMatchesFlag := flag.Bool("p", false, "Prints start and end index of each match. Can only be used with '-t' for multi-line mode.")
 	caseInsensitiveFlag := flag.Bool("i", false, "Case-insensitive. Disregard the case of all characters.")
+	recursiveFlag := flag.Bool("r", false, "Recursively search all files in the given directory.")
+	lineNumFlag := flag.Bool("n", false, "For each line with a match, print the line number. Implies -l.")
 	matchNum := flag.Int("m", 0, "Print the match with the given index. Eg. -m 3 prints the third match.")
 	substituteText := flag.String("s", "", "Substitute the contents of each match with the given string. Overrides -o and -v")
 	flag.Parse()
@@ -64,25 +67,49 @@ func main() {
 	// 2. Build NFA from postfix representation (Thompson's algorithm)
 	// 3. Run the string against the NFA

-	if len(flag.Args()) != 1 { // flag.Args() also strips out program name
+	if len(flag.Args()) < 1 { // flag.Args() also strips out program name
+		fmt.Println("ERROR: Missing cmdline args")
+		os.Exit(22)
+	}
+	if *recursiveFlag && len(flag.Args()) < 2 { // File/Directory must be provided with '-r'
 		fmt.Println("ERROR: Missing cmdline args")
 		os.Exit(22)
 	}
 	var re string
 	re = flag.Args()[0]
+	var inputFiles []*os.File
+	if len(flag.Args()) == 1 || flag.Args()[1] == "-" { // Either no file argument, or file argument is "-"
+		if !slices.Contains(inputFiles, os.Stdin) {
+			inputFiles = append(inputFiles, os.Stdin) // os.Stdin cannot be entered more than once into the file list
+		}
+	} else {
+		inputFilenames := flag.Args()[1:]
+		for _, inputFilename := range inputFilenames {
+			inputFile, err := os.Open(inputFilename)
+			if err != nil {
+				fmt.Printf("%s: No such file or directory\n", flag.Args()[1])
+				os.Exit(2)
+			}
+			inputFiles = append(inputFiles, inputFile)
+		}
+	}
+
 	var test_str string
 	var err error
 	var linesRead bool // Whether or not we have read the lines in the file
 	lineNum := 0       // Current line number
-	// Create reader for stdin and writer for stdout
-	reader := bufio.NewReader(os.Stdin)
+	// Create writer for stdout
 	out := bufio.NewWriter(os.Stdout)
-
+	// Compile regex
 	regComp, err := reg.Compile(re, flagsToCompile...)
 	if err != nil {
 		fmt.Println(err)
 		return
 	}
+
+	for _, inputFile := range inputFiles {
+		reader := bufio.NewReader(inputFile)
+		linesRead = false
 		for true {
 			if linesRead {
 				break
@@ -174,6 +201,8 @@ func main() {
 			if *lineFlag {
 				if !(*invertFlag) && len(matchIndices) == 0 || *invertFlag && len(matchIndices) > 0 {
 					continue
+				} else {
+					color.New(color.FgMagenta).Fprintf(out, "%s: ", inputFile.Name()) // Print filename
 				}
 			}

@@ -201,16 +230,18 @@ func main() {
 			} else {
 				for i, c := range test_str_runes {
 					if indicesToPrint.contains(i) {
-					color.New(color.FgRed).Fprintf(out, "%c", c)
+						color.New(color.FgRed, color.Bold).Fprintf(out, "%c", c)
 						// Newline after every match - only if -o is enabled and -v is disabled.
 						if *onlyFlag && !(*invertFlag) {
-						for _, idx := range matchIndices {
+							for matchIdxNum, idx := range matchIndices {
+								if matchIdxNum < len(matchIndices)-1 { // Only print a newline after printing a match, if there are multiple matches on the line, and we aren't on the last one. This is because the newline that gets added at the end will take care of that.
 									if i+1 == idx[0].EndIdx { // End index is one more than last index of match
 										fmt.Fprintf(out, "\n")
 										break
 									}
 								}
 							}
+						}
 					} else {
 						if !(*onlyFlag) {
 							fmt.Fprintf(out, "%c", c)
@@ -222,6 +253,11 @@ func main() {
 			if err != nil {
 				panic(err)
 			}
+			// If the last character in the string wasn't a newline, AND we either don't have -o set or we do (and we've matched something), then print a newline
+			if (len(test_str_runes) > 0 && test_str_runes[len(test_str_runes)-1] != '\n') &&
+				(!*onlyFlag || indicesToPrint.len() > 0) {
 				fmt.Println()
 			}
+		}
+	}
 }
--- a/cmd/unique_array.go
+++ b/cmd/unique_array.go
@@ -36,3 +36,7 @@ func (s uniq_arr[T]) values() []T {
 	}
 	return toRet
 }
+
+func (s uniq_arr[T]) len() int {
+	return len(s.backingMap)
+}
--- a/regex/compile.go
+++ b/regex/compile.go
@@ -410,10 +410,10 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
 				if i >= len(re_runes) {
 					return nil, fmt.Errorf("unclosed lookaround")
 				}
-				if re_runes[i] == '(' || re_runes[i] == nonCapLparenRune {
+				if (re_runes[i] == '(' && re_runes[i-1] != '\\') || re_runes[i] == nonCapLparenRune {
 					numOpenParens++
 				}
-				if re_runes[i] == ')' {
+				if re_runes[i] == ')' && re_runes[i-1] != '\\' {
 					numOpenParens--
 					if numOpenParens == 0 {
 						break
@@ -498,7 +498,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
 				}
 			} else if re_postfix[i] == 'p' || re_postfix[i] == 'P' {
 				charClassInverted := (re_postfix[i] == 'P')
-				charsInClass := []rune{}
+				var charsInClass []rune
 				i++
 				if isUnicodeCharClassLetter(re_postfix[i]) {
 					var err error
@@ -589,10 +589,10 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
 				if i >= len(re_postfix) {
 					return nil, fmt.Errorf("unclosed lookaround")
 				}
-				if re_postfix[i] == '(' || re_postfix[i] == nonCapLparenRune {
+				if (re_postfix[i] == '(' && re_postfix[i-1] != '\\') || re_postfix[i] == nonCapLparenRune {
 					numOpenParens++
 				}
-				if re_postfix[i] == ')' {
+				if re_postfix[i] == ')' && re_postfix[i-1] != '\\' {
 					numOpenParens--
 					if numOpenParens == 0 {
 						break
@@ -713,7 +713,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
 						}
 					} else if re_postfix[i] == 'p' || re_postfix[i] == 'P' {
 						charClassInverted := (re_postfix[i] == 'P')
-						charsInList := []rune{}
+						var charsInList []rune
 						i++
 						if isUnicodeCharClassLetter(re_postfix[i]) {
 							var err error
--- a/regex/doc.go
+++ b/regex/doc.go
@@ -161,6 +161,7 @@ The following features from [regexp] are (currently) NOT supported:
 2. Negated POSIX classes
 3. Embedded flags (flags are instead passed as arguments to [Compile])
 4. Literal text with \Q ... \E
+ 5. Finite repetition with no start (defaulting to 0)

 The following features are not available in [regexp], but are supported in my engine:
 1. Lookarounds
--- a/regex/nfa.go
+++ b/regex/nfa.go
@@ -45,10 +45,10 @@ type nfaState struct {
 	groupBegin                 bool       // Whether or not the node starts a capturing group
 	groupEnd                   bool       // Whether or not the node ends a capturing group
 	groupNum                   int        // Which capturing group the node starts / ends
-	// The following properties depend on the current match - I should think about resetting them for every match.
-	threadGroups    []Group // Assuming that a state is part of a 'thread' in the matching process, this array stores the indices of capturing groups in the current thread. As matches are found for this state, its groups will be copied over.
 	isBackreference            bool       // Whether or not current node is backreference
 	referredGroup              int        // If current node is a backreference, the node that it points to
+	// The following properties depend on the current match - I should think about resetting them for every match.
+	threadGroups  []Group // Assuming that a state is part of a 'thread' in the matching process, this array stores the indices of capturing groups in the current thread. As matches are found for this state, its groups will be copied over.
 	threadBackref int     // If current node is a backreference, how many characters to look forward into the referred group
 }

@@ -86,6 +86,8 @@ func cloneStateHelper(stateToClone *nfaState, cloneMap map[*nfaState]*nfaState)
 		groupEnd:        stateToClone.groupEnd,
 		groupBegin:      stateToClone.groupBegin,
 		groupNum:        stateToClone.groupNum,
+		isBackreference: stateToClone.isBackreference,
+		referredGroup:   stateToClone.referredGroup,
 	}
 	cloneMap[stateToClone] = clone
 	for i, s := range stateToClone.output {
--- a/regex/re_test.go
+++ b/regex/re_test.go
@@ -117,6 +117,7 @@ var reTests = []struct {
 	{`\d{3,4}`, nil, "ababab555", []Group{{6, 9}}},
 	{`\bpaint\b`, nil, "paints", []Group{}},
 	{`\b\w{5}\b`, nil, "paint", []Group{{0, 5}}},
+	{`\w{}`, nil, "test", nil},
 	{`[^\w]`, nil, "abcdef1230[]qq';;'", []Group{{10, 11}, {11, 12}, {14, 15}, {15, 16}, {16, 17}, {17, 18}}},
 	{`[^\W]`, nil, "abcdef1230[]qq';;'", []Group{{0, 1}, {1, 2}, {2, 3}, {3, 4}, {4, 5}, {5, 6}, {6, 7}, {7, 8}, {8, 9}, {9, 10}, {12, 13}, {13, 14}}},
 	{`[\[\]]`, nil, "a[b[l]]", []Group{{1, 2}, {3, 4}, {5, 6}, {6, 7}}},
@@ -545,6 +546,22 @@ var reTests = []struct {
 	{`\pN+`, nil, `123abc456def`, []Group{{0, 3}, {6, 9}}},
 	{`\PN+`, nil, `123abc456def`, []Group{{3, 6}, {9, 12}}},
 	{`[\p{Greek}\p{Cyrillic}]`, nil, `ΣωШД`, []Group{{0, 1}, {1, 2}, {2, 3}, {3, 4}}},
+
+	{`(?<=\().*?(?=\))`, nil, `(abc)`, []Group{{1, 4}}},
+
+	{`((a|b)\2)`, nil, `aa`, []Group{{0, 2}}},
+	{`((a|b)\2)`, nil, `bb`, []Group{{0, 2}}},
+	{`((a|b)\2)`, nil, `ab`, []Group{}},
+	{`((a|b)\2)`, nil, `ba`, []Group{}},
+
+	{`((a|b)\2){3}`, nil, `aaaaaa`, []Group{{0, 6}}},
+	{`((a|b)\2){3}`, nil, `bbbbbb`, []Group{{0, 6}}},
+	{`((a|b)\2){3}`, nil, `bbaaaa`, []Group{{0, 6}}},
+	{`((a|b)\2){3}`, nil, `aabbaa`, []Group{{0, 6}}},
+	{`((a|b)\2){3}`, nil, `aaaabb`, []Group{{0, 6}}},
+	{`((a|b)\2){3}`, nil, `bbaabb`, []Group{{0, 6}}},
+	{`((a|b)\2){3}`, nil, `baabab`, []Group{}},
+	{`((a|b)\2){3}`, nil, `bbabab`, []Group{}},
 }

 var groupTests = []struct {
--- a/regex/todo.txt
+++ b/regex/todo.txt
@@ -4,4 +4,5 @@
 Ideas for flags:
    -m <num> : Print <num>th match (-m 1 = first match, -m 2 = second match)
    -g <num> : Print the <num>th group
+    -r : Specify a directory instead of a file, reads recursively
 4. Refactor code for flags - make each flag's code a function, which modifies the result of findAllMatches
Author	SHA1	Message	Date
Aadhavan Srinivasan	e489dc4c27	Started working on line number flag	2025-03-15 16:24:50 -04:00
Aadhavan Srinivasan	34149980a4	Started working on multiple filename arguments; prefix each line with filename containing the line; mostly indentation changes	2025-03-13 12:11:54 -04:00
Aadhavan Srinivasan	e79c19a929	Updated TODO	2025-03-12 16:46:57 -04:00
Aadhavan Srinivasan	d2bce37935	Updated argument count validation	2025-03-12 16:46:05 -04:00
Aadhavan Srinivasan	bb3b866b77	Started working on file arguments - stdin is used if arg is "-"	2025-03-12 16:44:40 -04:00
Aadhavan Srinivasan	e07f27dc78	Merge branch 'master' of https://gitea.twomorecents.org/Rockingcool/kleingrep	2025-02-24 07:46:54 -05:00
Aadhavan Srinivasan	65d2317f79	Added more backreference tests	2025-02-21 08:44:33 -05:00
Aadhavan Srinivasan	a631fc289c	Clone 'isBackreference' and 'referredGroup' NFA fields, because they aren't thread variables	2025-02-21 08:44:24 -05:00
Aadhavan Srinivasan	d62a429cce	Updated documentation	2025-02-20 19:58:07 -05:00
Aadhavan Srinivasan	7b31031553	Change when a newline is printed; so that we don't print extraneous newlines	2025-02-17 09:37:31 -05:00
Aadhavan Srinivasan	38c842cb07	Added method to get length of unique array	2025-02-17 09:36:38 -05:00
Aadhavan Srinivasan	9f9af36be8	Fixed bug where escaped parentheses in lookarounds were counted as regular parentheses instead of literals	2025-02-17 09:36:17 -05:00
Aadhavan Srinivasan	8217b67122	Added test for escaped parentheses in lookarounds	2025-02-17 09:35:06 -05:00
Aadhavan Srinivasan	1f06dcef64	Just declare the variable instead of initializing it as well	2025-02-16 15:51:53 -05:00
Aadhavan Srinivasan	119475b41b	Updated README	2025-02-14 12:13:01 -05:00