Added 'flags' to the Compile function, instead of maintaining global state to check whether certain features were enabled

master
Aadhavan Srinivasan 2 weeks ago
parent 24fa365be1
commit 0b84806fc4

@ -7,6 +7,9 @@ import (
"unicode"
)
// Holds a list of all characters that are _not_ matched by the dot metacharacter
var notDotChars []rune
// A Reg represents the result of compiling a regular expression. It contains
// the startState of the NFA representation of the regex, and the number of capturing
// groups in the regex.
@ -17,6 +20,15 @@ type Reg struct {
const CONCAT rune = '~'
// ReFlag is an option that alters how shuntingYard parses a regular expression.
// Flags are supplied as a variadic list (to Compile or shuntingYard) and are
// looked up individually, so they are plain enumerated values, not bitmasks.
type ReFlag int

const (
	// RE_NO_FLAGS is the zero value; it enables no optional behavior.
	RE_NO_FLAGS ReFlag = 0
	// RE_CASE_INSENSITIVE makes the parser expand each normal character
	// into the set of runes returned by allCases.
	RE_CASE_INSENSITIVE ReFlag = 1
	// RE_MULTILINE makes the dot metacharacter match a newline as well.
	RE_MULTILINE ReFlag = 2
)
func isOperator(c rune) bool {
if c == '+' || c == '?' || c == '*' || c == '|' || c == CONCAT {
return true
@ -35,9 +47,24 @@ The Shunting-Yard algorithm is used to convert the given infix (regeular) expres
The primary benefit of this is getting rid of parentheses.
It also inserts explicit concatenation operators to make parsing easier in Thompson's algorithm.
An error can be returned for a multitude of reasons - the reason is specified in the error string.
The function also takes in 0 or more flags, which control the behavior of the parser.
See: https://blog.cernera.me/converting-regular-expressions-to-postfix-notation-with-the-shunting-yard-algorithm/
*/
func shuntingYard(re string) ([]postfixNode, error) {
func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
// Check which flags are enabled
caseInsensitive := false
// In multiline mode, the newline character is treated as a 'dot'
// character, i.e. the dot metacharacter matches a newline as well.
if slices.Contains(flags, RE_MULTILINE) {
notDotChars = []rune{}
} else {
notDotChars = []rune{'\n'}
}
if slices.Contains(flags, RE_CASE_INSENSITIVE) {
caseInsensitive = true
}
re_postfix := make([]rune, 0)
// Convert the string to a slice of runes to allow iteration through it
re_runes_orig := []rune(re) // This is the rune slice before the first parsing loop (which detects and replaces numeric ranges)
@ -169,7 +196,7 @@ func shuntingYard(re string) ([]postfixNode, error) {
if i >= len(re_runes) {
return nil, fmt.Errorf("Unclosed lookaround.")
}
if re_runes[i] == '(' {
if re_runes[i] == '(' || re_runes[i] == NONCAPLPAREN_CHAR {
numOpenParens++
}
if re_runes[i] == ')' {
@ -213,7 +240,7 @@ func shuntingYard(re string) ([]postfixNode, error) {
*/
c := re_postfix[i]
if isNormalChar(c) {
if caseInsensitiveFlag != nil && *caseInsensitiveFlag {
if caseInsensitive {
outQueue = append(outQueue, newPostfixNode(allCases(c)...))
} else {
outQueue = append(outQueue, newPostfixNode(c))
@ -249,7 +276,7 @@ func shuntingYard(re string) ([]postfixNode, error) {
if i >= len(re_postfix) {
return nil, fmt.Errorf("Unclosed lookaround.")
}
if re_postfix[i] == '(' {
if re_postfix[i] == '(' || re_postfix[i] == NONCAPLPAREN_CHAR {
numOpenParens++
}
if re_postfix[i] == ')' {
@ -605,8 +632,9 @@ func thompson(re []postfixNode) (Reg, error) {
// Compiles the given regular expression into a Reg type, suitable for use with the
// matching functions. The second return value is non-nil if a compilation error has
// occurred. As such, the error value must be checked before using the Reg returned by this function.
func Compile(re string) (Reg, error) {
nodes, err := shuntingYard(re)
// The second parameter is an optional list of flags, passed to the parsing function shuntingYard.
func Compile(re string, flags ...ReFlag) (Reg, error) {
nodes, err := shuntingYard(re, flags...)
if err != nil {
return Reg{}, fmt.Errorf("Error parsing regex: %w", err)
}

@ -10,10 +10,10 @@ import (
"github.com/fatih/color"
)
var notDotChars []rune
var caseInsensitiveFlag *bool // Whether we are running in case-insensitive mode
func main() {
// Flags for the regex Compile function
flagsToCompile := make([]ReFlag, 0)
invertFlag := flag.Bool("v", false, "Invert match.")
// This flag has two 'modes':
// 1. Without '-v': Prints only matches. Prints a newline after every match.
@ -22,17 +22,18 @@ func main() {
lineFlag := flag.Bool("l", false, "Only print lines with a match (or with no matches, if -v is enabled). Similar to grep's default.")
multiLineFlag := flag.Bool("t", false, "Multi-line mode. Treats newline just like any character.")
printMatchesFlag := flag.Bool("p", false, "Prints start and end index of each match. Can only be used with '-t' for multi-line mode.")
caseInsensitiveFlag = flag.Bool("i", false, "Case-insensitive. Disregard the case of all characters.")
caseInsensitiveFlag := flag.Bool("i", false, "Case-insensitive. Disregard the case of all characters.")
if *caseInsensitiveFlag {
flagsToCompile = append(flagsToCompile, RE_CASE_INSENSITIVE)
}
matchNum := flag.Int("m", 0, "Print the match with the given index. Eg. -m 3 prints the third match.")
substituteText := flag.String("s", "", "Substitute the contents of each match with the given string. Overrides -o and -v")
flag.Parse()
// In multi-line mode, 'dot' metacharacter also matches newline
if !(*multiLineFlag) {
notDotChars = []rune{'\n'}
} else {
notDotChars = []rune{}
if *multiLineFlag {
flagsToCompile = append(flagsToCompile, RE_MULTILINE)
}
// -l and -o are mutually exclusive: -o overrides -l
if *onlyFlag {
*lineFlag = false
@ -74,7 +75,7 @@ func main() {
reader := bufio.NewReader(os.Stdin)
out := bufio.NewWriter(os.Stdout)
regComp, err := Compile(re)
regComp, err := Compile(re, flagsToCompile...)
if err != nil {
fmt.Println(err)
return

Loading…
Cancel
Save