15 Commits

Author SHA1 Message Date
e489dc4c27 Started working on line number flag 2025-03-15 16:24:50 -04:00
34149980a4 Started working on multiple filename arguments; prefix each line with filename containing the line; mostly indentation changes 2025-03-13 12:11:54 -04:00
e79c19a929 Updated TODO 2025-03-12 16:46:57 -04:00
d2bce37935 Updated argument count validation 2025-03-12 16:46:05 -04:00
bb3b866b77 Started working on file arguments - stdin is used if arg is "-" 2025-03-12 16:44:40 -04:00
e07f27dc78 Merge branch 'master' of https://gitea.twomorecents.org/Rockingcool/kleingrep 2025-02-24 07:46:54 -05:00
65d2317f79 Added more backreference tests 2025-02-21 08:44:33 -05:00
a631fc289c Clone 'isBackreference' and 'referredGroup' NFA fields, because they aren't thread variables 2025-02-21 08:44:24 -05:00
d62a429cce Updated documentation 2025-02-20 19:58:07 -05:00
7b31031553 Change when a newline is printed; so that we don't print extraneous newlinesraneous newlines 2025-02-17 09:37:31 -05:00
38c842cb07 Added method to get length of unique array 2025-02-17 09:36:38 -05:00
9f9af36be8 Fixed bug where escaped parentheses in lookarounds were counted as regular parentheses instead of literals 2025-02-17 09:36:17 -05:00
8217b67122 Added test for escaped parentheses in lookarounds 2025-02-17 09:35:06 -05:00
1f06dcef64 Just declare the variable instead of initializing it as well 2025-02-16 15:51:53 -05:00
119475b41b Updated README 2025-02-14 12:13:01 -05:00
8 changed files with 194 additions and 133 deletions

View File

@@ -2,8 +2,8 @@
Kleingrep is a regular expression engine, providing a library and command-line tool written in Go.
It aims to provide a more featureful engine, compared to the one in
[Go's standard library](https://pkg.go.dev/regexp), while retaining some semblance of efficiency.
It aims to provide a more featureful engine, compared to the one in Go's
[regexp](https://pkg.go.dev/regexp), while retaining some semblance of efficiency.
The engine does __not__ use backtracking, relying on the NFA-based method described in
[Russ Cox's articles](https://swtch.com/~rsc/regexp). As such, it is immune to catastrophic backtracking.

View File

@@ -6,6 +6,7 @@ import (
"fmt"
"io"
"os"
"slices"
"github.com/fatih/color"
@@ -25,6 +26,8 @@ func main() {
multiLineFlag := flag.Bool("t", false, "Multi-line mode. Treats newline just like any character.")
printMatchesFlag := flag.Bool("p", false, "Prints start and end index of each match. Can only be used with '-t' for multi-line mode.")
caseInsensitiveFlag := flag.Bool("i", false, "Case-insensitive. Disregard the case of all characters.")
recursiveFlag := flag.Bool("r", false, "Recursively search all files in the given directory.")
lineNumFlag := flag.Bool("n", false, "For each line with a match, print the line number. Implies -l.")
matchNum := flag.Int("m", 0, "Print the match with the given index. Eg. -m 3 prints the third match.")
substituteText := flag.String("s", "", "Substitute the contents of each match with the given string. Overrides -o and -v")
flag.Parse()
@@ -64,25 +67,49 @@ func main() {
// 2. Build NFA from postfix representation (Thompson's algorithm)
// 3. Run the string against the NFA
if len(flag.Args()) != 1 { // flag.Args() also strips out program name
if len(flag.Args()) < 1 { // flag.Args() also strips out program name
fmt.Println("ERROR: Missing cmdline args")
os.Exit(22)
}
if *recursiveFlag && len(flag.Args()) < 2 { // File/Directory must be provided with '-r'
fmt.Println("ERROR: Missing cmdline args")
os.Exit(22)
}
var re string
re = flag.Args()[0]
var inputFiles []*os.File
if len(flag.Args()) == 1 || flag.Args()[1] == "-" { // Either no file argument, or file argument is "-"
if !slices.Contains(inputFiles, os.Stdin) {
inputFiles = append(inputFiles, os.Stdin) // os.Stdin cannot be entered more than once into the file list
}
} else {
inputFilenames := flag.Args()[1:]
for _, inputFilename := range inputFilenames {
inputFile, err := os.Open(inputFilename)
if err != nil {
fmt.Printf("%s: No such file or directory\n", flag.Args()[1])
os.Exit(2)
}
inputFiles = append(inputFiles, inputFile)
}
}
var test_str string
var err error
var linesRead bool // Whether or not we have read the lines in the file
lineNum := 0 // Current line number
// Create reader for stdin and writer for stdout
reader := bufio.NewReader(os.Stdin)
// Create writer for stdout
out := bufio.NewWriter(os.Stdout)
// Compile regex
regComp, err := reg.Compile(re, flagsToCompile...)
if err != nil {
fmt.Println(err)
return
}
for _, inputFile := range inputFiles {
reader := bufio.NewReader(inputFile)
linesRead = false
for true {
if linesRead {
break
@@ -174,6 +201,8 @@ func main() {
if *lineFlag {
if !(*invertFlag) && len(matchIndices) == 0 || *invertFlag && len(matchIndices) > 0 {
continue
} else {
color.New(color.FgMagenta).Fprintf(out, "%s: ", inputFile.Name()) // Print filename
}
}
@@ -201,16 +230,18 @@ func main() {
} else {
for i, c := range test_str_runes {
if indicesToPrint.contains(i) {
color.New(color.FgRed).Fprintf(out, "%c", c)
color.New(color.FgRed, color.Bold).Fprintf(out, "%c", c)
// Newline after every match - only if -o is enabled and -v is disabled.
if *onlyFlag && !(*invertFlag) {
for _, idx := range matchIndices {
for matchIdxNum, idx := range matchIndices {
if matchIdxNum < len(matchIndices)-1 { // Only print a newline afte printing a match, if there are multiple matches on the line, and we aren't on the last one. This is because the newline that gets added at the end will take care of that.
if i+1 == idx[0].EndIdx { // End index is one more than last index of match
fmt.Fprintf(out, "\n")
break
}
}
}
}
} else {
if !(*onlyFlag) {
fmt.Fprintf(out, "%c", c)
@@ -222,6 +253,11 @@ func main() {
if err != nil {
panic(err)
}
// If the last character in the string wasn't a newline, AND we either have don't -o set or we do (and we've matched something), then print a newline
if (len(test_str_runes) > 0 && test_str_runes[len(test_str_runes)-1] != '\n') &&
(!*onlyFlag || indicesToPrint.len() > 0) {
fmt.Println()
}
}
}
}

View File

@@ -36,3 +36,7 @@ func (s uniq_arr[T]) values() []T {
}
return toRet
}
func (s uniq_arr[T]) len() int {
return len(s.backingMap)
}

View File

@@ -410,10 +410,10 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
if i >= len(re_runes) {
return nil, fmt.Errorf("unclosed lookaround")
}
if re_runes[i] == '(' || re_runes[i] == nonCapLparenRune {
if (re_runes[i] == '(' && re_runes[i-1] != '\\') || re_runes[i] == nonCapLparenRune {
numOpenParens++
}
if re_runes[i] == ')' {
if re_runes[i] == ')' && re_runes[i-1] != '\\' {
numOpenParens--
if numOpenParens == 0 {
break
@@ -498,7 +498,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
}
} else if re_postfix[i] == 'p' || re_postfix[i] == 'P' {
charClassInverted := (re_postfix[i] == 'P')
charsInClass := []rune{}
var charsInClass []rune
i++
if isUnicodeCharClassLetter(re_postfix[i]) {
var err error
@@ -589,10 +589,10 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
if i >= len(re_postfix) {
return nil, fmt.Errorf("unclosed lookaround")
}
if re_postfix[i] == '(' || re_postfix[i] == nonCapLparenRune {
if (re_postfix[i] == '(' && re_postfix[i-1] != '\\') || re_postfix[i] == nonCapLparenRune {
numOpenParens++
}
if re_postfix[i] == ')' {
if re_postfix[i] == ')' && re_postfix[i-1] != '\\' {
numOpenParens--
if numOpenParens == 0 {
break
@@ -713,7 +713,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
}
} else if re_postfix[i] == 'p' || re_postfix[i] == 'P' {
charClassInverted := (re_postfix[i] == 'P')
charsInList := []rune{}
var charsInList []rune
i++
if isUnicodeCharClassLetter(re_postfix[i]) {
var err error

View File

@@ -161,6 +161,7 @@ The following features from [regexp] are (currently) NOT supported:
2. Negated POSIX classes
3. Embedded flags (flags are instead passed as arguments to [Compile])
4. Literal text with \Q ... \E
5. Finite repetition with no start (defaulting at 0)
The following features are not available in [regexp], but are supported in my engine:
1. Lookarounds

View File

@@ -45,10 +45,10 @@ type nfaState struct {
groupBegin bool // Whether or not the node starts a capturing group
groupEnd bool // Whether or not the node ends a capturing group
groupNum int // Which capturing group the node starts / ends
// The following properties depend on the current match - I should think about resetting them for every match.
threadGroups []Group // Assuming that a state is part of a 'thread' in the matching process, this array stores the indices of capturing groups in the current thread. As matches are found for this state, its groups will be copied over.
isBackreference bool // Whether or not current node is backreference
referredGroup int // If current node is a backreference, the node that it points to
// The following properties depend on the current match - I should think about resetting them for every match.
threadGroups []Group // Assuming that a state is part of a 'thread' in the matching process, this array stores the indices of capturing groups in the current thread. As matches are found for this state, its groups will be copied over.
threadBackref int // If current node is a backreference, how many characters to look forward into the referred group
}
@@ -86,6 +86,8 @@ func cloneStateHelper(stateToClone *nfaState, cloneMap map[*nfaState]*nfaState)
groupEnd: stateToClone.groupEnd,
groupBegin: stateToClone.groupBegin,
groupNum: stateToClone.groupNum,
isBackreference: stateToClone.isBackreference,
referredGroup: stateToClone.referredGroup,
}
cloneMap[stateToClone] = clone
for i, s := range stateToClone.output {

View File

@@ -117,6 +117,7 @@ var reTests = []struct {
{`\d{3,4}`, nil, "ababab555", []Group{{6, 9}}},
{`\bpaint\b`, nil, "paints", []Group{}},
{`\b\w{5}\b`, nil, "paint", []Group{{0, 5}}},
{`\w{}`, nil, "test", nil},
{`[^\w]`, nil, "abcdef1230[]qq';;'", []Group{{10, 11}, {11, 12}, {14, 15}, {15, 16}, {16, 17}, {17, 18}}},
{`[^\W]`, nil, "abcdef1230[]qq';;'", []Group{{0, 1}, {1, 2}, {2, 3}, {3, 4}, {4, 5}, {5, 6}, {6, 7}, {7, 8}, {8, 9}, {9, 10}, {12, 13}, {13, 14}}},
{`[\[\]]`, nil, "a[b[l]]", []Group{{1, 2}, {3, 4}, {5, 6}, {6, 7}}},
@@ -545,6 +546,22 @@ var reTests = []struct {
{`\pN+`, nil, `123abc456def`, []Group{{0, 3}, {6, 9}}},
{`\PN+`, nil, `123abc456def`, []Group{{3, 6}, {9, 12}}},
{`[\p{Greek}\p{Cyrillic}]`, nil, `ΣωШД`, []Group{{0, 1}, {1, 2}, {2, 3}, {3, 4}}},
{`(?<=\().*?(?=\))`, nil, `(abc)`, []Group{{1, 4}}},
{`((a|b)\2)`, nil, `aa`, []Group{{0, 2}}},
{`((a|b)\2)`, nil, `bb`, []Group{{0, 2}}},
{`((a|b)\2)`, nil, `ab`, []Group{}},
{`((a|b)\2)`, nil, `ba`, []Group{}},
{`((a|b)\2){3}`, nil, `aaaaaa`, []Group{{0, 6}}},
{`((a|b)\2){3}`, nil, `bbbbbb`, []Group{{0, 6}}},
{`((a|b)\2){3}`, nil, `bbaaaa`, []Group{{0, 6}}},
{`((a|b)\2){3}`, nil, `aabbaa`, []Group{{0, 6}}},
{`((a|b)\2){3}`, nil, `aaaabb`, []Group{{0, 6}}},
{`((a|b)\2){3}`, nil, `bbaabb`, []Group{{0, 6}}},
{`((a|b)\2){3}`, nil, `baabab`, []Group{}},
{`((a|b)\2){3}`, nil, `bbabab`, []Group{}},
}
var groupTests = []struct {

View File

@@ -4,4 +4,5 @@
Ideas for flags:
-m <num> : Print <num>th match (-m 1 = first match, -m 2 = second match)
-g <num> : Print the <num>th group
-r : Specify a directory instead of a file, reads recursively
4. Refactor code for flags - make each flag's code a function, which modifies the result of findAllMatches