Compare commits
15 Commits
v0.3.0
...
e489dc4c27
Author | SHA1 | Date | |
---|---|---|---|
e489dc4c27 | |||
34149980a4 | |||
e79c19a929 | |||
d2bce37935 | |||
bb3b866b77 | |||
e07f27dc78 | |||
65d2317f79 | |||
a631fc289c | |||
d62a429cce | |||
7b31031553 | |||
38c842cb07 | |||
9f9af36be8 | |||
8217b67122 | |||
1f06dcef64 | |||
119475b41b |
@@ -2,8 +2,8 @@
|
||||
|
||||
Kleingrep is a regular expression engine, providing a library and command-line tool written in Go.
|
||||
|
||||
It aims to provide a more featureful engine, compared to the one in
|
||||
[Go's standard library](https://pkg.go.dev/regexp), while retaining some semblance of efficiency.
|
||||
It aims to provide a more featureful engine, compared to the one in Go's
|
||||
[regexp](https://pkg.go.dev/regexp), while retaining some semblance of efficiency.
|
||||
|
||||
The engine does __not__ use backtracking, relying on the NFA-based method described in
|
||||
[Russ Cox's articles](https://swtch.com/~rsc/regexp). As such, it is immune to catastrophic backtracking.
|
||||
|
48
cmd/main.go
48
cmd/main.go
@@ -6,6 +6,7 @@ import (
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"slices"
|
||||
|
||||
"github.com/fatih/color"
|
||||
|
||||
@@ -25,6 +26,8 @@ func main() {
|
||||
multiLineFlag := flag.Bool("t", false, "Multi-line mode. Treats newline just like any character.")
|
||||
printMatchesFlag := flag.Bool("p", false, "Prints start and end index of each match. Can only be used with '-t' for multi-line mode.")
|
||||
caseInsensitiveFlag := flag.Bool("i", false, "Case-insensitive. Disregard the case of all characters.")
|
||||
recursiveFlag := flag.Bool("r", false, "Recursively search all files in the given directory.")
|
||||
lineNumFlag := flag.Bool("n", false, "For each line with a match, print the line number. Implies -l.")
|
||||
matchNum := flag.Int("m", 0, "Print the match with the given index. Eg. -m 3 prints the third match.")
|
||||
substituteText := flag.String("s", "", "Substitute the contents of each match with the given string. Overrides -o and -v")
|
||||
flag.Parse()
|
||||
@@ -64,25 +67,49 @@ func main() {
|
||||
// 2. Build NFA from postfix representation (Thompson's algorithm)
|
||||
// 3. Run the string against the NFA
|
||||
|
||||
if len(flag.Args()) != 1 { // flag.Args() also strips out program name
|
||||
if len(flag.Args()) < 1 { // flag.Args() also strips out program name
|
||||
fmt.Println("ERROR: Missing cmdline args")
|
||||
os.Exit(22)
|
||||
}
|
||||
if *recursiveFlag && len(flag.Args()) < 2 { // File/Directory must be provided with '-r'
|
||||
fmt.Println("ERROR: Missing cmdline args")
|
||||
os.Exit(22)
|
||||
}
|
||||
var re string
|
||||
re = flag.Args()[0]
|
||||
var inputFiles []*os.File
|
||||
if len(flag.Args()) == 1 || flag.Args()[1] == "-" { // Either no file argument, or file argument is "-"
|
||||
if !slices.Contains(inputFiles, os.Stdin) {
|
||||
inputFiles = append(inputFiles, os.Stdin) // os.Stdin cannot be entered more than once into the file list
|
||||
}
|
||||
} else {
|
||||
inputFilenames := flag.Args()[1:]
|
||||
for _, inputFilename := range inputFilenames {
|
||||
inputFile, err := os.Open(inputFilename)
|
||||
if err != nil {
|
||||
fmt.Printf("%s: No such file or directory\n", flag.Args()[1])
|
||||
os.Exit(2)
|
||||
}
|
||||
inputFiles = append(inputFiles, inputFile)
|
||||
}
|
||||
}
|
||||
|
||||
var test_str string
|
||||
var err error
|
||||
var linesRead bool // Whether or not we have read the lines in the file
|
||||
lineNum := 0 // Current line number
|
||||
// Create reader for stdin and writer for stdout
|
||||
reader := bufio.NewReader(os.Stdin)
|
||||
// Create writer for stdout
|
||||
out := bufio.NewWriter(os.Stdout)
|
||||
|
||||
// Compile regex
|
||||
regComp, err := reg.Compile(re, flagsToCompile...)
|
||||
if err != nil {
|
||||
fmt.Println(err)
|
||||
return
|
||||
}
|
||||
|
||||
for _, inputFile := range inputFiles {
|
||||
reader := bufio.NewReader(inputFile)
|
||||
linesRead = false
|
||||
for true {
|
||||
if linesRead {
|
||||
break
|
||||
@@ -174,6 +201,8 @@ func main() {
|
||||
if *lineFlag {
|
||||
if !(*invertFlag) && len(matchIndices) == 0 || *invertFlag && len(matchIndices) > 0 {
|
||||
continue
|
||||
} else {
|
||||
color.New(color.FgMagenta).Fprintf(out, "%s: ", inputFile.Name()) // Print filename
|
||||
}
|
||||
}
|
||||
|
||||
@@ -201,16 +230,18 @@ func main() {
|
||||
} else {
|
||||
for i, c := range test_str_runes {
|
||||
if indicesToPrint.contains(i) {
|
||||
color.New(color.FgRed).Fprintf(out, "%c", c)
|
||||
color.New(color.FgRed, color.Bold).Fprintf(out, "%c", c)
|
||||
// Newline after every match - only if -o is enabled and -v is disabled.
|
||||
if *onlyFlag && !(*invertFlag) {
|
||||
for _, idx := range matchIndices {
|
||||
for matchIdxNum, idx := range matchIndices {
|
||||
if matchIdxNum < len(matchIndices)-1 { // Only print a newline after printing a match if there are multiple matches on the line and we aren't on the last one. The newline that gets added at the end takes care of the last match.
|
||||
if i+1 == idx[0].EndIdx { // End index is one more than last index of match
|
||||
fmt.Fprintf(out, "\n")
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if !(*onlyFlag) {
|
||||
fmt.Fprintf(out, "%c", c)
|
||||
@@ -222,6 +253,11 @@ func main() {
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
// If the last character in the string wasn't a newline, AND either -o is not set, or -o is set and we've matched something, then print a newline
|
||||
if (len(test_str_runes) > 0 && test_str_runes[len(test_str_runes)-1] != '\n') &&
|
||||
(!*onlyFlag || indicesToPrint.len() > 0) {
|
||||
fmt.Println()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -36,3 +36,7 @@ func (s uniq_arr[T]) values() []T {
|
||||
}
|
||||
return toRet
|
||||
}
|
||||
|
||||
func (s uniq_arr[T]) len() int {
|
||||
return len(s.backingMap)
|
||||
}
|
||||
|
@@ -410,10 +410,10 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
if i >= len(re_runes) {
|
||||
return nil, fmt.Errorf("unclosed lookaround")
|
||||
}
|
||||
if re_runes[i] == '(' || re_runes[i] == nonCapLparenRune {
|
||||
if (re_runes[i] == '(' && re_runes[i-1] != '\\') || re_runes[i] == nonCapLparenRune {
|
||||
numOpenParens++
|
||||
}
|
||||
if re_runes[i] == ')' {
|
||||
if re_runes[i] == ')' && re_runes[i-1] != '\\' {
|
||||
numOpenParens--
|
||||
if numOpenParens == 0 {
|
||||
break
|
||||
@@ -498,7 +498,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
}
|
||||
} else if re_postfix[i] == 'p' || re_postfix[i] == 'P' {
|
||||
charClassInverted := (re_postfix[i] == 'P')
|
||||
charsInClass := []rune{}
|
||||
var charsInClass []rune
|
||||
i++
|
||||
if isUnicodeCharClassLetter(re_postfix[i]) {
|
||||
var err error
|
||||
@@ -589,10 +589,10 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
if i >= len(re_postfix) {
|
||||
return nil, fmt.Errorf("unclosed lookaround")
|
||||
}
|
||||
if re_postfix[i] == '(' || re_postfix[i] == nonCapLparenRune {
|
||||
if (re_postfix[i] == '(' && re_postfix[i-1] != '\\') || re_postfix[i] == nonCapLparenRune {
|
||||
numOpenParens++
|
||||
}
|
||||
if re_postfix[i] == ')' {
|
||||
if re_postfix[i] == ')' && re_postfix[i-1] != '\\' {
|
||||
numOpenParens--
|
||||
if numOpenParens == 0 {
|
||||
break
|
||||
@@ -713,7 +713,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
}
|
||||
} else if re_postfix[i] == 'p' || re_postfix[i] == 'P' {
|
||||
charClassInverted := (re_postfix[i] == 'P')
|
||||
charsInList := []rune{}
|
||||
var charsInList []rune
|
||||
i++
|
||||
if isUnicodeCharClassLetter(re_postfix[i]) {
|
||||
var err error
|
||||
|
@@ -161,6 +161,7 @@ The following features from [regexp] are (currently) NOT supported:
|
||||
2. Negated POSIX classes
|
||||
3. Embedded flags (flags are instead passed as arguments to [Compile])
|
||||
4. Literal text with \Q ... \E
|
||||
5. Finite repetition with no start (defaulting at 0)
|
||||
|
||||
The following features are not available in [regexp], but are supported in my engine:
|
||||
1. Lookarounds
|
||||
|
@@ -45,10 +45,10 @@ type nfaState struct {
|
||||
groupBegin bool // Whether or not the node starts a capturing group
|
||||
groupEnd bool // Whether or not the node ends a capturing group
|
||||
groupNum int // Which capturing group the node starts / ends
|
||||
// The following properties depend on the current match - I should think about resetting them for every match.
|
||||
threadGroups []Group // Assuming that a state is part of a 'thread' in the matching process, this array stores the indices of capturing groups in the current thread. As matches are found for this state, its groups will be copied over.
|
||||
isBackreference bool // Whether or not current node is backreference
|
||||
referredGroup int // If current node is a backreference, the capturing group that it refers to (presumably the group number - confirm against the matcher)
|
||||
// The following properties depend on the current match - I should think about resetting them for every match.
|
||||
threadGroups []Group // Assuming that a state is part of a 'thread' in the matching process, this array stores the indices of capturing groups in the current thread. As matches are found for this state, its groups will be copied over.
|
||||
threadBackref int // If current node is a backreference, how many characters to look forward into the referred group
|
||||
}
|
||||
|
||||
@@ -86,6 +86,8 @@ func cloneStateHelper(stateToClone *nfaState, cloneMap map[*nfaState]*nfaState)
|
||||
groupEnd: stateToClone.groupEnd,
|
||||
groupBegin: stateToClone.groupBegin,
|
||||
groupNum: stateToClone.groupNum,
|
||||
isBackreference: stateToClone.isBackreference,
|
||||
referredGroup: stateToClone.referredGroup,
|
||||
}
|
||||
cloneMap[stateToClone] = clone
|
||||
for i, s := range stateToClone.output {
|
||||
|
@@ -117,6 +117,7 @@ var reTests = []struct {
|
||||
{`\d{3,4}`, nil, "ababab555", []Group{{6, 9}}},
|
||||
{`\bpaint\b`, nil, "paints", []Group{}},
|
||||
{`\b\w{5}\b`, nil, "paint", []Group{{0, 5}}},
|
||||
{`\w{}`, nil, "test", nil},
|
||||
{`[^\w]`, nil, "abcdef1230[]qq';;'", []Group{{10, 11}, {11, 12}, {14, 15}, {15, 16}, {16, 17}, {17, 18}}},
|
||||
{`[^\W]`, nil, "abcdef1230[]qq';;'", []Group{{0, 1}, {1, 2}, {2, 3}, {3, 4}, {4, 5}, {5, 6}, {6, 7}, {7, 8}, {8, 9}, {9, 10}, {12, 13}, {13, 14}}},
|
||||
{`[\[\]]`, nil, "a[b[l]]", []Group{{1, 2}, {3, 4}, {5, 6}, {6, 7}}},
|
||||
@@ -545,6 +546,22 @@ var reTests = []struct {
|
||||
{`\pN+`, nil, `123abc456def`, []Group{{0, 3}, {6, 9}}},
|
||||
{`\PN+`, nil, `123abc456def`, []Group{{3, 6}, {9, 12}}},
|
||||
{`[\p{Greek}\p{Cyrillic}]`, nil, `ΣωШД`, []Group{{0, 1}, {1, 2}, {2, 3}, {3, 4}}},
|
||||
|
||||
{`(?<=\().*?(?=\))`, nil, `(abc)`, []Group{{1, 4}}},
|
||||
|
||||
{`((a|b)\2)`, nil, `aa`, []Group{{0, 2}}},
|
||||
{`((a|b)\2)`, nil, `bb`, []Group{{0, 2}}},
|
||||
{`((a|b)\2)`, nil, `ab`, []Group{}},
|
||||
{`((a|b)\2)`, nil, `ba`, []Group{}},
|
||||
|
||||
{`((a|b)\2){3}`, nil, `aaaaaa`, []Group{{0, 6}}},
|
||||
{`((a|b)\2){3}`, nil, `bbbbbb`, []Group{{0, 6}}},
|
||||
{`((a|b)\2){3}`, nil, `bbaaaa`, []Group{{0, 6}}},
|
||||
{`((a|b)\2){3}`, nil, `aabbaa`, []Group{{0, 6}}},
|
||||
{`((a|b)\2){3}`, nil, `aaaabb`, []Group{{0, 6}}},
|
||||
{`((a|b)\2){3}`, nil, `bbaabb`, []Group{{0, 6}}},
|
||||
{`((a|b)\2){3}`, nil, `baabab`, []Group{}},
|
||||
{`((a|b)\2){3}`, nil, `bbabab`, []Group{}},
|
||||
}
|
||||
|
||||
var groupTests = []struct {
|
||||
|
@@ -4,4 +4,5 @@
|
||||
Ideas for flags:
|
||||
-m <num> : Print <num>th match (-m 1 = first match, -m 2 = second match)
|
||||
-g <num> : Print the <num>th group
|
||||
-r : Specify a directory instead of a file, reads recursively
|
||||
4. Refactor code for flags - make each flag's code a function, which modifies the result of findAllMatches
|
||||
|
Reference in New Issue
Block a user