diff --git a/compile.go b/compile.go index 9311a21..9ce650f 100644 --- a/compile.go +++ b/compile.go @@ -7,6 +7,9 @@ import ( "unicode" ) +// Holds a list of all characters that are _not_ matched by the dot metacharacter +var notDotChars []rune + // A Reg represents the result of compiling a regular expression. It contains // the startState of the NFA representation of the regex, and the number of capturing // groups in the regex. @@ -17,6 +20,15 @@ type Reg struct { const CONCAT rune = '~' +// Flags for shuntingYard - control its behavior +type ReFlag int + +const ( + RE_NO_FLAGS ReFlag = iota + RE_CASE_INSENSITIVE + RE_MULTILINE +) + func isOperator(c rune) bool { if c == '+' || c == '?' || c == '*' || c == '|' || c == CONCAT { return true @@ -35,9 +47,24 @@ The Shunting-Yard algorithm is used to convert the given infix (regeular) expres The primary benefit of this is getting rid of parentheses. It also inserts explicit concatenation operators to make parsing easier in Thompson's algorithm. An error can be returned for a multitude of reasons - the reason is specified in the error string. +The function also takes in 0 or more flags, which control the behavior of the parser. See: https://blog.cernera.me/converting-regular-expressions-to-postfix-notation-with-the-shunting-yard-algorithm/ */ -func shuntingYard(re string) ([]postfixNode, error) { +func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) { + // Check which flags are enabled + + caseInsensitive := false + // In Multiline mode, the newline character is considered a + // 'dot' character ie. the dot metacharacter matches a newline as well. 
+ if slices.Contains(flags, RE_MULTILINE) { + notDotChars = []rune{} + } else { + notDotChars = []rune{'\n'} + } + if slices.Contains(flags, RE_CASE_INSENSITIVE) { + caseInsensitive = true + } + re_postfix := make([]rune, 0) // Convert the string to a slice of runes to allow iteration through it re_runes_orig := []rune(re) // This is the rune slice before the first parsing loop (which detects and replaces numeric ranges) @@ -169,7 +196,7 @@ func shuntingYard(re string) ([]postfixNode, error) { if i >= len(re_runes) { return nil, fmt.Errorf("Unclosed lookaround.") } - if re_runes[i] == '(' { + if re_runes[i] == '(' || re_runes[i] == NONCAPLPAREN_CHAR { numOpenParens++ } if re_runes[i] == ')' { @@ -213,7 +240,7 @@ func shuntingYard(re string) ([]postfixNode, error) { */ c := re_postfix[i] if isNormalChar(c) { - if caseInsensitiveFlag != nil && *caseInsensitiveFlag { + if caseInsensitive { outQueue = append(outQueue, newPostfixNode(allCases(c)...)) } else { outQueue = append(outQueue, newPostfixNode(c)) @@ -249,7 +276,7 @@ func shuntingYard(re string) ([]postfixNode, error) { if i >= len(re_postfix) { return nil, fmt.Errorf("Unclosed lookaround.") } - if re_postfix[i] == '(' { + if re_postfix[i] == '(' || re_postfix[i] == NONCAPLPAREN_CHAR { numOpenParens++ } if re_postfix[i] == ')' { @@ -605,8 +632,9 @@ func thompson(re []postfixNode) (Reg, error) { // Compiles the given regular expression into a Reg type, suitable for use with the // matching functions. The second return value is non-nil if a compilation error has // occured. As such, the error value must be checked before using the Reg returned by this function. -func Compile(re string) (Reg, error) { - nodes, err := shuntingYard(re) +// The second parameter is an optional list of flags, passed to the parsing function shuntingYard. +func Compile(re string, flags ...ReFlag) (Reg, error) { + nodes, err := shuntingYard(re, flags...) 
if err != nil { return Reg{}, fmt.Errorf("Error parsing regex: %w", err) } diff --git a/main.go b/main.go index 782ed02..ae56164 100644 --- a/main.go +++ b/main.go @@ -10,10 +10,10 @@ import ( "github.com/fatih/color" ) -var notDotChars []rune -var caseInsensitiveFlag *bool // Whether we are running in case-insensitive mode - func main() { + // Flags for the regex Compile function + flagsToCompile := make([]ReFlag, 0) + invertFlag := flag.Bool("v", false, "Invert match.") // This flag has two 'modes': // 1. Without '-v': Prints only matches. Prints a newline after every match. @@ -22,17 +22,19 @@ func main() { lineFlag := flag.Bool("l", false, "Only print lines with a match (or with no matches, if -v is enabled). Similar to grep's default.") multiLineFlag := flag.Bool("t", false, "Multi-line mode. Treats newline just like any character.") printMatchesFlag := flag.Bool("p", false, "Prints start and end index of each match. Can only be used with '-t' for multi-line mode.") - caseInsensitiveFlag = flag.Bool("i", false, "Case-insensitive. Disregard the case of all characters.") + caseInsensitiveFlag := flag.Bool("i", false, "Case-insensitive. Disregard the case of all characters.") matchNum := flag.Int("m", 0, "Print the match with the given index. Eg. -m 3 prints the third match.") substituteText := flag.String("s", "", "Substitute the contents of each match with the given string. 
Overrides -o and -v") flag.Parse() - // In multi-line mode, 'dot' metacharacter also matches newline - if !(*multiLineFlag) { - notDotChars = []rune{'\n'} - } else { - notDotChars = []rune{} + // Flag values are only populated by flag.Parse(), so translate them into Compile flags after parsing. + if *caseInsensitiveFlag { + flagsToCompile = append(flagsToCompile, RE_CASE_INSENSITIVE) + } + if *multiLineFlag { + flagsToCompile = append(flagsToCompile, RE_MULTILINE) } + // -l and -o are mutually exclusive: -o overrides -l if *onlyFlag { *lineFlag = false @@ -74,7 +76,7 @@ func main() { reader := bufio.NewReader(os.Stdin) out := bufio.NewWriter(os.Stdout) - regComp, err := Compile(re) + regComp, err := Compile(re, flagsToCompile...) if err != nil { fmt.Println(err) return