Added 'flags' to the Compile function, instead of maintaining global state to check whether certain features were enabled

master
Aadhavan Srinivasan 2 weeks ago
parent 24fa365be1
commit 0b84806fc4

@ -7,6 +7,9 @@ import (
"unicode" "unicode"
) )
// Holds a list of all characters that are _not_ matched by the dot metacharacter
var notDotChars []rune
// A Reg represents the result of compiling a regular expression. It contains // A Reg represents the result of compiling a regular expression. It contains
// the startState of the NFA representation of the regex, and the number of capturing // the startState of the NFA representation of the regex, and the number of capturing
// groups in the regex. // groups in the regex.
@ -17,6 +20,15 @@ type Reg struct {
const CONCAT rune = '~' const CONCAT rune = '~'
// ReFlag is an option that changes how shuntingYard (and therefore Compile)
// parses a regular expression. Flags are passed as a variadic list and each
// one is looked up with slices.Contains, so the values are plain sequential
// identifiers, not bit masks - pass them as separate arguments rather than
// OR-ing them together.
type ReFlag int
const (
// RE_NO_FLAGS is the zero value of ReFlag (iota starts at 0).
RE_NO_FLAGS ReFlag = iota
// RE_CASE_INSENSITIVE makes shuntingYard expand every normal character
// into all of its case variants (via allCases) when building the output queue.
RE_CASE_INSENSITIVE
// RE_MULTILINE empties notDotChars, so the dot metacharacter also matches
// a newline. Without this flag, notDotChars contains '\n'.
RE_MULTILINE
)
func isOperator(c rune) bool { func isOperator(c rune) bool {
if c == '+' || c == '?' || c == '*' || c == '|' || c == CONCAT { if c == '+' || c == '?' || c == '*' || c == '|' || c == CONCAT {
return true return true
@ -35,9 +47,24 @@ The Shunting-Yard algorithm is used to convert the given infix (regular) expres
The primary benefit of this is getting rid of parentheses. The primary benefit of this is getting rid of parentheses.
It also inserts explicit concatenation operators to make parsing easier in Thompson's algorithm. It also inserts explicit concatenation operators to make parsing easier in Thompson's algorithm.
An error can be returned for a multitude of reasons - the reason is specified in the error string. An error can be returned for a multitude of reasons - the reason is specified in the error string.
The function also takes in 0 or more flags, which control the behavior of the parser.
See: https://blog.cernera.me/converting-regular-expressions-to-postfix-notation-with-the-shunting-yard-algorithm/ See: https://blog.cernera.me/converting-regular-expressions-to-postfix-notation-with-the-shunting-yard-algorithm/
*/ */
func shuntingYard(re string) ([]postfixNode, error) { func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
// Check which flags are enabled
caseInsensitive := false
// In Multiline mode, the newline character is considered a
// 'dot' character ie. the dot metacharacter matches a newline as well.
if slices.Contains(flags, RE_MULTILINE) {
notDotChars = []rune{}
} else {
notDotChars = []rune{'\n'}
}
if slices.Contains(flags, RE_CASE_INSENSITIVE) {
caseInsensitive = true
}
re_postfix := make([]rune, 0) re_postfix := make([]rune, 0)
// Convert the string to a slice of runes to allow iteration through it // Convert the string to a slice of runes to allow iteration through it
re_runes_orig := []rune(re) // This is the rune slice before the first parsing loop (which detects and replaces numeric ranges) re_runes_orig := []rune(re) // This is the rune slice before the first parsing loop (which detects and replaces numeric ranges)
@ -169,7 +196,7 @@ func shuntingYard(re string) ([]postfixNode, error) {
if i >= len(re_runes) { if i >= len(re_runes) {
return nil, fmt.Errorf("Unclosed lookaround.") return nil, fmt.Errorf("Unclosed lookaround.")
} }
if re_runes[i] == '(' { if re_runes[i] == '(' || re_runes[i] == NONCAPLPAREN_CHAR {
numOpenParens++ numOpenParens++
} }
if re_runes[i] == ')' { if re_runes[i] == ')' {
@ -213,7 +240,7 @@ func shuntingYard(re string) ([]postfixNode, error) {
*/ */
c := re_postfix[i] c := re_postfix[i]
if isNormalChar(c) { if isNormalChar(c) {
if caseInsensitiveFlag != nil && *caseInsensitiveFlag { if caseInsensitive {
outQueue = append(outQueue, newPostfixNode(allCases(c)...)) outQueue = append(outQueue, newPostfixNode(allCases(c)...))
} else { } else {
outQueue = append(outQueue, newPostfixNode(c)) outQueue = append(outQueue, newPostfixNode(c))
@ -249,7 +276,7 @@ func shuntingYard(re string) ([]postfixNode, error) {
if i >= len(re_postfix) { if i >= len(re_postfix) {
return nil, fmt.Errorf("Unclosed lookaround.") return nil, fmt.Errorf("Unclosed lookaround.")
} }
if re_postfix[i] == '(' { if re_postfix[i] == '(' || re_postfix[i] == NONCAPLPAREN_CHAR {
numOpenParens++ numOpenParens++
} }
if re_postfix[i] == ')' { if re_postfix[i] == ')' {
@ -605,8 +632,9 @@ func thompson(re []postfixNode) (Reg, error) {
// Compiles the given regular expression into a Reg type, suitable for use with the // Compiles the given regular expression into a Reg type, suitable for use with the
// matching functions. The second return value is non-nil if a compilation error has // matching functions. The second return value is non-nil if a compilation error has
// occurred. As such, the error value must be checked before using the Reg returned by this function. // occurred. As such, the error value must be checked before using the Reg returned by this function.
func Compile(re string) (Reg, error) { // The second parameter is an optional list of flags, passed to the parsing function shuntingYard.
nodes, err := shuntingYard(re) func Compile(re string, flags ...ReFlag) (Reg, error) {
nodes, err := shuntingYard(re, flags...)
if err != nil { if err != nil {
return Reg{}, fmt.Errorf("Error parsing regex: %w", err) return Reg{}, fmt.Errorf("Error parsing regex: %w", err)
} }

@ -10,10 +10,10 @@ import (
"github.com/fatih/color" "github.com/fatih/color"
) )
var notDotChars []rune
var caseInsensitiveFlag *bool // Whether we are running in case-insensitive mode
func main() { func main() {
// Flags for the regex Compile function
flagsToCompile := make([]ReFlag, 0)
invertFlag := flag.Bool("v", false, "Invert match.") invertFlag := flag.Bool("v", false, "Invert match.")
// This flag has two 'modes': // This flag has two 'modes':
// 1. Without '-v': Prints only matches. Prints a newline after every match. // 1. Without '-v': Prints only matches. Prints a newline after every match.
@ -22,17 +22,18 @@ func main() {
lineFlag := flag.Bool("l", false, "Only print lines with a match (or with no matches, if -v is enabled). Similar to grep's default.") lineFlag := flag.Bool("l", false, "Only print lines with a match (or with no matches, if -v is enabled). Similar to grep's default.")
multiLineFlag := flag.Bool("t", false, "Multi-line mode. Treats newline just like any character.") multiLineFlag := flag.Bool("t", false, "Multi-line mode. Treats newline just like any character.")
printMatchesFlag := flag.Bool("p", false, "Prints start and end index of each match. Can only be used with '-t' for multi-line mode.") printMatchesFlag := flag.Bool("p", false, "Prints start and end index of each match. Can only be used with '-t' for multi-line mode.")
caseInsensitiveFlag = flag.Bool("i", false, "Case-insensitive. Disregard the case of all characters.") caseInsensitiveFlag := flag.Bool("i", false, "Case-insensitive. Disregard the case of all characters.")
if *caseInsensitiveFlag {
flagsToCompile = append(flagsToCompile, RE_CASE_INSENSITIVE)
}
matchNum := flag.Int("m", 0, "Print the match with the given index. Eg. -m 3 prints the third match.") matchNum := flag.Int("m", 0, "Print the match with the given index. Eg. -m 3 prints the third match.")
substituteText := flag.String("s", "", "Substitute the contents of each match with the given string. Overrides -o and -v") substituteText := flag.String("s", "", "Substitute the contents of each match with the given string. Overrides -o and -v")
flag.Parse() flag.Parse()
// In multi-line mode, 'dot' metacharacter also matches newline if *multiLineFlag {
if !(*multiLineFlag) { flagsToCompile = append(flagsToCompile, RE_MULTILINE)
notDotChars = []rune{'\n'}
} else {
notDotChars = []rune{}
} }
// -l and -o are mutually exclusive: -o overrides -l // -l and -o are mutually exclusive: -o overrides -l
if *onlyFlag { if *onlyFlag {
*lineFlag = false *lineFlag = false
@ -74,7 +75,7 @@ func main() {
reader := bufio.NewReader(os.Stdin) reader := bufio.NewReader(os.Stdin)
out := bufio.NewWriter(os.Stdout) out := bufio.NewWriter(os.Stdout)
regComp, err := Compile(re) regComp, err := Compile(re, flagsToCompile...)
if err != nil { if err != nil {
fmt.Println(err) fmt.Println(err)
return return

Loading…
Cancel
Save