15 Commits

Author SHA1 Message Date
e489dc4c27 Started working on line number flag 2025-03-15 16:24:50 -04:00
34149980a4 Started working on multiple filename arguments; prefix each line with filename containing the line; mostly indentation changes 2025-03-13 12:11:54 -04:00
e79c19a929 Updated TODO 2025-03-12 16:46:57 -04:00
d2bce37935 Updated argument count validation 2025-03-12 16:46:05 -04:00
bb3b866b77 Started working on file arguments - stdin is used if arg is "-" 2025-03-12 16:44:40 -04:00
e07f27dc78 Merge branch 'master' of https://gitea.twomorecents.org/Rockingcool/kleingrep 2025-02-24 07:46:54 -05:00
65d2317f79 Added more backreference tests 2025-02-21 08:44:33 -05:00
a631fc289c Clone 'isBackreference' and 'referredGroup' NFA fields, because they aren't thread variables 2025-02-21 08:44:24 -05:00
d62a429cce Updated documentation 2025-02-20 19:58:07 -05:00
7b31031553 Change when a newline is printed; so that we don't print extraneous newlinesraneous newlines 2025-02-17 09:37:31 -05:00
38c842cb07 Added method to get length of unique array 2025-02-17 09:36:38 -05:00
9f9af36be8 Fixed bug where escaped parentheses in lookarounds were counted as regular parentheses instead of literals 2025-02-17 09:36:17 -05:00
8217b67122 Added test for escaped parentheses in lookarounds 2025-02-17 09:35:06 -05:00
1f06dcef64 Just declare the variable instead of initializing it as well 2025-02-16 15:51:53 -05:00
119475b41b Updated README 2025-02-14 12:13:01 -05:00
8 changed files with 194 additions and 133 deletions

View File

@@ -2,8 +2,8 @@
Kleingrep is a regular expression engine, providing a library and command-line tool written in Go.
It aims to provide a more featureful engine, compared to the one in
[Go's standard library](https://pkg.go.dev/regexp), while retaining some semblance of efficiency.
It aims to provide a more featureful engine, compared to the one in Go's
[regexp](https://pkg.go.dev/regexp), while retaining some semblance of efficiency.
The engine does __not__ use backtracking, relying on the NFA-based method described in
[Russ Cox's articles](https://swtch.com/~rsc/regexp). As such, it is immune to catastrophic backtracking.

View File

@@ -6,6 +6,7 @@ import (
"fmt"
"io"
"os"
"slices"
"github.com/fatih/color"
@@ -25,6 +26,8 @@ func main() {
multiLineFlag := flag.Bool("t", false, "Multi-line mode. Treats newline just like any character.")
printMatchesFlag := flag.Bool("p", false, "Prints start and end index of each match. Can only be used with '-t' for multi-line mode.")
caseInsensitiveFlag := flag.Bool("i", false, "Case-insensitive. Disregard the case of all characters.")
recursiveFlag := flag.Bool("r", false, "Recursively search all files in the given directory.")
lineNumFlag := flag.Bool("n", false, "For each line with a match, print the line number. Implies -l.")
matchNum := flag.Int("m", 0, "Print the match with the given index. Eg. -m 3 prints the third match.")
substituteText := flag.String("s", "", "Substitute the contents of each match with the given string. Overrides -o and -v")
flag.Parse()
@@ -64,164 +67,197 @@ func main() {
// 2. Build NFA from postfix representation (Thompson's algorithm)
// 3. Run the string against the NFA
if len(flag.Args()) != 1 { // flag.Args() also strips out program name
if len(flag.Args()) < 1 { // flag.Args() also strips out program name
fmt.Println("ERROR: Missing cmdline args")
os.Exit(22)
}
if *recursiveFlag && len(flag.Args()) < 2 { // File/Directory must be provided with '-r'
fmt.Println("ERROR: Missing cmdline args")
os.Exit(22)
}
var re string
re = flag.Args()[0]
var inputFiles []*os.File
if len(flag.Args()) == 1 || flag.Args()[1] == "-" { // Either no file argument, or file argument is "-"
if !slices.Contains(inputFiles, os.Stdin) {
inputFiles = append(inputFiles, os.Stdin) // os.Stdin cannot be entered more than once into the file list
}
} else {
inputFilenames := flag.Args()[1:]
for _, inputFilename := range inputFilenames {
inputFile, err := os.Open(inputFilename)
if err != nil {
fmt.Printf("%s: No such file or directory\n", flag.Args()[1])
os.Exit(2)
}
inputFiles = append(inputFiles, inputFile)
}
}
var test_str string
var err error
var linesRead bool // Whether or not we have read the lines in the file
lineNum := 0 // Current line number
// Create reader for stdin and writer for stdout
reader := bufio.NewReader(os.Stdin)
// Create writer for stdout
out := bufio.NewWriter(os.Stdout)
// Compile regex
regComp, err := reg.Compile(re, flagsToCompile...)
if err != nil {
fmt.Println(err)
return
}
for true {
if linesRead {
break
}
if !(*multiLineFlag) {
// Read every string from stdin until we encounter an error. If the error isn't EOF, panic.
test_str, err = reader.ReadString('\n')
lineNum++
if err != nil {
for _, inputFile := range inputFiles {
reader := bufio.NewReader(inputFile)
linesRead = false
for true {
if linesRead {
break
}
if !(*multiLineFlag) {
// Read every string from stdin until we encounter an error. If the error isn't EOF, panic.
test_str, err = reader.ReadString('\n')
lineNum++
if err != nil {
if err == io.EOF {
linesRead = true
} else {
panic(err)
}
}
if len(test_str) > 0 && test_str[len(test_str)-1] == '\n' {
test_str = test_str[:len(test_str)-1]
}
} else {
// Multi-line mode - read every line of input into a temp. string.
// test_str will contain all lines of input (including newline characters)
// as one string.
var temp string
for temp, err = reader.ReadString('\n'); err == nil; temp, err = reader.ReadString('\n') {
test_str += temp
}
// Assuming err != nil
if err == io.EOF {
if len(temp) > 0 {
test_str += temp // Add the last line (if it is non-empty)
}
linesRead = true
} else {
panic(err)
}
}
if len(test_str) > 0 && test_str[len(test_str)-1] == '\n' {
test_str = test_str[:len(test_str)-1]
}
} else {
// Multi-line mode - read every line of input into a temp. string.
// test_str will contain all lines of input (including newline characters)
// as one string.
var temp string
for temp, err = reader.ReadString('\n'); err == nil; temp, err = reader.ReadString('\n') {
test_str += temp
}
// Assuming err != nil
if err == io.EOF {
if len(temp) > 0 {
test_str += temp // Add the last line (if it is non-empty)
matchIndices := make([]reg.Match, 0)
if matchNumFlagEnabled {
tmp, err := regComp.FindNthMatch(test_str, *matchNum)
if err == nil {
matchIndices = append(matchIndices, tmp)
}
linesRead = true
} else {
panic(err)
matchIndices = regComp.FindAllSubmatch(test_str)
}
}
matchIndices := make([]reg.Match, 0)
if matchNumFlagEnabled {
tmp, err := regComp.FindNthMatch(test_str, *matchNum)
if err == nil {
matchIndices = append(matchIndices, tmp)
}
} else {
matchIndices = regComp.FindAllSubmatch(test_str)
}
test_str_runes := []rune(test_str) // Converting to runes preserves unicode characters
test_str_runes := []rune(test_str) // Converting to runes preserves unicode characters
if *printMatchesFlag {
// if we are in single line mode, print the line on which
// the matches occur
if len(matchIndices) > 0 {
if !(*multiLineFlag) {
fmt.Fprintf(out, "Line %d:\n", lineNum)
if *printMatchesFlag {
// if we are in single line mode, print the line on which
// the matches occur
if len(matchIndices) > 0 {
if !(*multiLineFlag) {
fmt.Fprintf(out, "Line %d:\n", lineNum)
}
for _, m := range matchIndices {
fmt.Fprintf(out, "%s\n", m.String())
}
err := out.Flush()
if err != nil {
panic(err)
}
}
for _, m := range matchIndices {
fmt.Fprintf(out, "%s\n", m.String())
}
err := out.Flush()
if err != nil {
panic(err)
}
}
continue
}
// Decompose the array of matchIndex structs into a flat unique array of ints - if matchIndex is {4,7}, flat array will contain 4,5,6
// This should make checking O(1) instead of O(n)
indicesToPrint := new_uniq_arr[int]()
for _, idx := range matchIndices {
indicesToPrint.add(genRange(idx[0].StartIdx, idx[0].EndIdx)...)
}
// If we are inverting, then we should print the indices which _didn't_ match
// in color.
if *invertFlag {
oldIndices := indicesToPrint.values()
indicesToPrint = new_uniq_arr[int]()
// Explanation:
// Find all numbers from 0 to len(test_str_runes) that are NOT in oldIndices.
// These are the values we want to print, now that we have inverted the match.
// Re-initialize indicesToPrint and add all of these values to it.
indicesToPrint.add(setDifference(genRange(0, len(test_str_runes)), oldIndices)...)
}
// If lineFlag is enabled, we should only print something if:
// a. We are not inverting, and have at least one match on the current line
// OR
// b. We are inverting, and have no matches at all on the current line.
// This checks for the inverse, and continues if it is true.
if *lineFlag {
if !(*invertFlag) && len(matchIndices) == 0 || *invertFlag && len(matchIndices) > 0 {
continue
}
}
// Decompose the array of matchIndex structs into a flat unique array of ints - if matchIndex is {4,7}, flat array will contain 4,5,6
// This should make checking O(1) instead of O(n)
indicesToPrint := new_uniq_arr[int]()
for _, idx := range matchIndices {
indicesToPrint.add(genRange(idx[0].StartIdx, idx[0].EndIdx)...)
}
// If we are inverting, then we should print the indices which _didn't_ match
// in color.
if *invertFlag {
oldIndices := indicesToPrint.values()
indicesToPrint = new_uniq_arr[int]()
// Explanation:
// Find all numbers from 0 to len(test_str_runes) that are NOT in oldIndices.
// These are the values we want to print, now that we have inverted the match.
// Re-initialize indicesToPrint and add all of these values to it.
indicesToPrint.add(setDifference(genRange(0, len(test_str_runes)), oldIndices)...)
// If we are substituting, we need a different behavior, as follows:
// For every character in the test string:
// 1. Check if the index is the start of any matchIndex
// 2. If so, print the substitute text, and set our index to
// the corresponding end index.
// 3. If not, just print the character.
if substituteFlagEnabled {
for i := range test_str_runes {
inMatchIndex := false
for _, m := range matchIndices {
if i == m[0].StartIdx {
fmt.Fprintf(out, "%s", *substituteText)
i = m[0].EndIdx
inMatchIndex = true
break
}
}
if !inMatchIndex {
fmt.Fprintf(out, "%c", test_str_runes[i])
}
// If lineFlag is enabled, we should only print something if:
// a. We are not inverting, and have at least one match on the current line
// OR
// b. We are inverting, and have no matches at all on the current line.
// This checks for the inverse, and continues if it is true.
if *lineFlag {
if !(*invertFlag) && len(matchIndices) == 0 || *invertFlag && len(matchIndices) > 0 {
continue
} else {
color.New(color.FgMagenta).Fprintf(out, "%s: ", inputFile.Name()) // Print filename
}
}
} else {
for i, c := range test_str_runes {
if indicesToPrint.contains(i) {
color.New(color.FgRed).Fprintf(out, "%c", c)
// Newline after every match - only if -o is enabled and -v is disabled.
if *onlyFlag && !(*invertFlag) {
for _, idx := range matchIndices {
if i+1 == idx[0].EndIdx { // End index is one more than last index of match
fmt.Fprintf(out, "\n")
break
}
// If we are substituting, we need a different behavior, as follows:
// For every character in the test string:
// 1. Check if the index is the start of any matchIndex
// 2. If so, print the substitute text, and set our index to
// the corresponding end index.
// 3. If not, just print the character.
if substituteFlagEnabled {
for i := range test_str_runes {
inMatchIndex := false
for _, m := range matchIndices {
if i == m[0].StartIdx {
fmt.Fprintf(out, "%s", *substituteText)
i = m[0].EndIdx
inMatchIndex = true
break
}
}
} else {
if !(*onlyFlag) {
fmt.Fprintf(out, "%c", c)
if !inMatchIndex {
fmt.Fprintf(out, "%c", test_str_runes[i])
}
}
} else {
for i, c := range test_str_runes {
if indicesToPrint.contains(i) {
color.New(color.FgRed, color.Bold).Fprintf(out, "%c", c)
// Newline after every match - only if -o is enabled and -v is disabled.
if *onlyFlag && !(*invertFlag) {
for matchIdxNum, idx := range matchIndices {
if matchIdxNum < len(matchIndices)-1 { // Only print a newline afte printing a match, if there are multiple matches on the line, and we aren't on the last one. This is because the newline that gets added at the end will take care of that.
if i+1 == idx[0].EndIdx { // End index is one more than last index of match
fmt.Fprintf(out, "\n")
break
}
}
}
}
} else {
if !(*onlyFlag) {
fmt.Fprintf(out, "%c", c)
}
}
}
}
err = out.Flush()
if err != nil {
panic(err)
}
// If the last character in the string wasn't a newline, AND we either have don't -o set or we do (and we've matched something), then print a newline
if (len(test_str_runes) > 0 && test_str_runes[len(test_str_runes)-1] != '\n') &&
(!*onlyFlag || indicesToPrint.len() > 0) {
fmt.Println()
}
}
err = out.Flush()
if err != nil {
panic(err)
}
fmt.Println()
}
}

View File

@@ -36,3 +36,7 @@ func (s uniq_arr[T]) values() []T {
}
return toRet
}
func (s uniq_arr[T]) len() int {
return len(s.backingMap)
}

View File

@@ -410,10 +410,10 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
if i >= len(re_runes) {
return nil, fmt.Errorf("unclosed lookaround")
}
if re_runes[i] == '(' || re_runes[i] == nonCapLparenRune {
if (re_runes[i] == '(' && re_runes[i-1] != '\\') || re_runes[i] == nonCapLparenRune {
numOpenParens++
}
if re_runes[i] == ')' {
if re_runes[i] == ')' && re_runes[i-1] != '\\' {
numOpenParens--
if numOpenParens == 0 {
break
@@ -498,7 +498,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
}
} else if re_postfix[i] == 'p' || re_postfix[i] == 'P' {
charClassInverted := (re_postfix[i] == 'P')
charsInClass := []rune{}
var charsInClass []rune
i++
if isUnicodeCharClassLetter(re_postfix[i]) {
var err error
@@ -589,10 +589,10 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
if i >= len(re_postfix) {
return nil, fmt.Errorf("unclosed lookaround")
}
if re_postfix[i] == '(' || re_postfix[i] == nonCapLparenRune {
if (re_postfix[i] == '(' && re_postfix[i-1] != '\\') || re_postfix[i] == nonCapLparenRune {
numOpenParens++
}
if re_postfix[i] == ')' {
if re_postfix[i] == ')' && re_postfix[i-1] != '\\' {
numOpenParens--
if numOpenParens == 0 {
break
@@ -713,7 +713,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
}
} else if re_postfix[i] == 'p' || re_postfix[i] == 'P' {
charClassInverted := (re_postfix[i] == 'P')
charsInList := []rune{}
var charsInList []rune
i++
if isUnicodeCharClassLetter(re_postfix[i]) {
var err error

View File

@@ -161,6 +161,7 @@ The following features from [regexp] are (currently) NOT supported:
2. Negated POSIX classes
3. Embedded flags (flags are instead passed as arguments to [Compile])
4. Literal text with \Q ... \E
5. Finite repetition with no start (defaulting at 0)
The following features are not available in [regexp], but are supported in my engine:
1. Lookarounds

View File

@@ -45,11 +45,11 @@ type nfaState struct {
groupBegin bool // Whether or not the node starts a capturing group
groupEnd bool // Whether or not the node ends a capturing group
groupNum int // Which capturing group the node starts / ends
isBackreference bool // Whether or not current node is backreference
referredGroup int // If current node is a backreference, the node that it points to
// The following properties depend on the current match - I should think about resetting them for every match.
threadGroups []Group // Assuming that a state is part of a 'thread' in the matching process, this array stores the indices of capturing groups in the current thread. As matches are found for this state, its groups will be copied over.
isBackreference bool // Whether or not current node is backreference
referredGroup int // If current node is a backreference, the node that it points to
threadBackref int // If current node is a backreference, how many characters to look forward into the referred group
threadGroups []Group // Assuming that a state is part of a 'thread' in the matching process, this array stores the indices of capturing groups in the current thread. As matches are found for this state, its groups will be copied over.
threadBackref int // If current node is a backreference, how many characters to look forward into the referred group
}
// Clones the NFA starting from the given state.
@@ -86,6 +86,8 @@ func cloneStateHelper(stateToClone *nfaState, cloneMap map[*nfaState]*nfaState)
groupEnd: stateToClone.groupEnd,
groupBegin: stateToClone.groupBegin,
groupNum: stateToClone.groupNum,
isBackreference: stateToClone.isBackreference,
referredGroup: stateToClone.referredGroup,
}
cloneMap[stateToClone] = clone
for i, s := range stateToClone.output {

View File

@@ -117,6 +117,7 @@ var reTests = []struct {
{`\d{3,4}`, nil, "ababab555", []Group{{6, 9}}},
{`\bpaint\b`, nil, "paints", []Group{}},
{`\b\w{5}\b`, nil, "paint", []Group{{0, 5}}},
{`\w{}`, nil, "test", nil},
{`[^\w]`, nil, "abcdef1230[]qq';;'", []Group{{10, 11}, {11, 12}, {14, 15}, {15, 16}, {16, 17}, {17, 18}}},
{`[^\W]`, nil, "abcdef1230[]qq';;'", []Group{{0, 1}, {1, 2}, {2, 3}, {3, 4}, {4, 5}, {5, 6}, {6, 7}, {7, 8}, {8, 9}, {9, 10}, {12, 13}, {13, 14}}},
{`[\[\]]`, nil, "a[b[l]]", []Group{{1, 2}, {3, 4}, {5, 6}, {6, 7}}},
@@ -545,6 +546,22 @@ var reTests = []struct {
{`\pN+`, nil, `123abc456def`, []Group{{0, 3}, {6, 9}}},
{`\PN+`, nil, `123abc456def`, []Group{{3, 6}, {9, 12}}},
{`[\p{Greek}\p{Cyrillic}]`, nil, `ΣωШД`, []Group{{0, 1}, {1, 2}, {2, 3}, {3, 4}}},
{`(?<=\().*?(?=\))`, nil, `(abc)`, []Group{{1, 4}}},
{`((a|b)\2)`, nil, `aa`, []Group{{0, 2}}},
{`((a|b)\2)`, nil, `bb`, []Group{{0, 2}}},
{`((a|b)\2)`, nil, `ab`, []Group{}},
{`((a|b)\2)`, nil, `ba`, []Group{}},
{`((a|b)\2){3}`, nil, `aaaaaa`, []Group{{0, 6}}},
{`((a|b)\2){3}`, nil, `bbbbbb`, []Group{{0, 6}}},
{`((a|b)\2){3}`, nil, `bbaaaa`, []Group{{0, 6}}},
{`((a|b)\2){3}`, nil, `aabbaa`, []Group{{0, 6}}},
{`((a|b)\2){3}`, nil, `aaaabb`, []Group{{0, 6}}},
{`((a|b)\2){3}`, nil, `bbaabb`, []Group{{0, 6}}},
{`((a|b)\2){3}`, nil, `baabab`, []Group{}},
{`((a|b)\2){3}`, nil, `bbabab`, []Group{}},
}
var groupTests = []struct {

View File

@@ -4,4 +4,5 @@
Ideas for flags:
-m <num> : Print <num>th match (-m 1 = first match, -m 2 = second match)
-g <num> : Print the <num>th group
-r : Specify a directory instead of a file, reads recursively
4. Refactor code for flags - make each flag's code a function, which modifies the result of findAllMatches