|
|
@ -7,6 +7,9 @@ import (
|
|
|
|
"unicode"
|
|
|
|
"unicode"
|
|
|
|
)
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Holds a list of all characters that are _not_ matched by the dot metacharacter.
// This is package-level state: shuntingYard reassigns it on every call, to an empty
// slice when RE_MULTILINE is passed and to {'\n'} otherwise.
// NOTE(review): because each call overwrites it, compiles with different flags may
// affect one another if the value is read after parsing - confirm against the matcher.
|
|
|
|
|
|
|
|
var notDotChars []rune
|
|
|
|
|
|
|
|
|
|
|
|
// A Reg represents the result of compiling a regular expression. It contains
|
|
|
|
// A Reg represents the result of compiling a regular expression. It contains
|
|
|
|
// the startState of the NFA representation of the regex, and the number of capturing
|
|
|
|
// the startState of the NFA representation of the regex, and the number of capturing
|
|
|
|
// groups in the regex.
|
|
|
|
// groups in the regex.
|
|
|
@ -17,6 +20,15 @@ type Reg struct {
|
|
|
|
|
|
|
|
|
|
|
|
const CONCAT rune = '~'
|
|
|
|
const CONCAT rune = '~'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// ReFlag is a flag that controls the behavior of shuntingYard. Flags are passed as a
// variadic list (Compile forwards its flags to shuntingYard) and are looked up with
// slices.Contains, so the values are plain sequential identifiers, not bitmasks.
|
|
|
|
|
|
|
|
type ReFlag int
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
const (
|
|
|
|
|
|
|
|
// RE_NO_FLAGS is the zero value of ReFlag. The visible parser code never tests for
// it; presumably it exists as an explicit "no options" placeholder.
RE_NO_FLAGS ReFlag = iota
|
|
|
|
|
|
|
|
// RE_CASE_INSENSITIVE makes shuntingYard emit, for every normal character c, a
// postfix node built from allCases(c) instead of from c alone.
RE_CASE_INSENSITIVE
|
|
|
|
|
|
|
|
// RE_MULTILINE makes shuntingYard set notDotChars to an empty slice, so the dot
// metacharacter also matches a newline. Without it, notDotChars is {'\n'}.
RE_MULTILINE
|
|
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
func isOperator(c rune) bool {
|
|
|
|
func isOperator(c rune) bool {
|
|
|
|
if c == '+' || c == '?' || c == '*' || c == '|' || c == CONCAT {
|
|
|
|
if c == '+' || c == '?' || c == '*' || c == '|' || c == CONCAT {
|
|
|
|
return true
|
|
|
|
return true
|
|
|
@ -35,9 +47,24 @@ The Shunting-Yard algorithm is used to convert the given infix (regeular) expres
|
|
|
|
The primary benefit of this is getting rid of parentheses.
|
|
|
|
The primary benefit of this is getting rid of parentheses.
|
|
|
|
It also inserts explicit concatenation operators to make parsing easier in Thompson's algorithm.
|
|
|
|
It also inserts explicit concatenation operators to make parsing easier in Thompson's algorithm.
|
|
|
|
An error can be returned for a multitude of reasons - the reason is specified in the error string.
|
|
|
|
An error can be returned for a multitude of reasons - the reason is specified in the error string.
|
|
|
|
|
|
|
|
The function also takes in 0 or more flags, which control the behavior of the parser.
|
|
|
|
See: https://blog.cernera.me/converting-regular-expressions-to-postfix-notation-with-the-shunting-yard-algorithm/
|
|
|
|
See: https://blog.cernera.me/converting-regular-expressions-to-postfix-notation-with-the-shunting-yard-algorithm/
|
|
|
|
*/
|
|
|
|
*/
|
|
|
|
func shuntingYard(re string) ([]postfixNode, error) {
|
|
|
|
func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
|
|
|
|
|
|
|
// Check which flags are enabled
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
caseInsensitive := false
|
|
|
|
|
|
|
|
// In Multiline mode, the newline character is considered a
|
|
|
|
|
|
|
|
// 'dot' character, i.e., the dot metacharacter matches a newline as well.
|
|
|
|
|
|
|
|
if slices.Contains(flags, RE_MULTILINE) {
|
|
|
|
|
|
|
|
notDotChars = []rune{}
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
|
|
|
notDotChars = []rune{'\n'}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
if slices.Contains(flags, RE_CASE_INSENSITIVE) {
|
|
|
|
|
|
|
|
caseInsensitive = true
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
re_postfix := make([]rune, 0)
|
|
|
|
re_postfix := make([]rune, 0)
|
|
|
|
// Convert the string to a slice of runes to allow iteration through it
|
|
|
|
// Convert the string to a slice of runes to allow iteration through it
|
|
|
|
re_runes_orig := []rune(re) // This is the rune slice before the first parsing loop (which detects and replaces numeric ranges)
|
|
|
|
re_runes_orig := []rune(re) // This is the rune slice before the first parsing loop (which detects and replaces numeric ranges)
|
|
|
@ -169,7 +196,7 @@ func shuntingYard(re string) ([]postfixNode, error) {
|
|
|
|
if i >= len(re_runes) {
|
|
|
|
if i >= len(re_runes) {
|
|
|
|
return nil, fmt.Errorf("Unclosed lookaround.")
|
|
|
|
return nil, fmt.Errorf("Unclosed lookaround.")
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if re_runes[i] == '(' {
|
|
|
|
if re_runes[i] == '(' || re_runes[i] == NONCAPLPAREN_CHAR {
|
|
|
|
numOpenParens++
|
|
|
|
numOpenParens++
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if re_runes[i] == ')' {
|
|
|
|
if re_runes[i] == ')' {
|
|
|
@ -213,7 +240,7 @@ func shuntingYard(re string) ([]postfixNode, error) {
|
|
|
|
*/
|
|
|
|
*/
|
|
|
|
c := re_postfix[i]
|
|
|
|
c := re_postfix[i]
|
|
|
|
if isNormalChar(c) {
|
|
|
|
if isNormalChar(c) {
|
|
|
|
if caseInsensitiveFlag != nil && *caseInsensitiveFlag {
|
|
|
|
if caseInsensitive {
|
|
|
|
outQueue = append(outQueue, newPostfixNode(allCases(c)...))
|
|
|
|
outQueue = append(outQueue, newPostfixNode(allCases(c)...))
|
|
|
|
} else {
|
|
|
|
} else {
|
|
|
|
outQueue = append(outQueue, newPostfixNode(c))
|
|
|
|
outQueue = append(outQueue, newPostfixNode(c))
|
|
|
@ -249,7 +276,7 @@ func shuntingYard(re string) ([]postfixNode, error) {
|
|
|
|
if i >= len(re_postfix) {
|
|
|
|
if i >= len(re_postfix) {
|
|
|
|
return nil, fmt.Errorf("Unclosed lookaround.")
|
|
|
|
return nil, fmt.Errorf("Unclosed lookaround.")
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if re_postfix[i] == '(' {
|
|
|
|
if re_postfix[i] == '(' || re_postfix[i] == NONCAPLPAREN_CHAR {
|
|
|
|
numOpenParens++
|
|
|
|
numOpenParens++
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if re_postfix[i] == ')' {
|
|
|
|
if re_postfix[i] == ')' {
|
|
|
@ -605,8 +632,9 @@ func thompson(re []postfixNode) (Reg, error) {
|
|
|
|
// Compiles the given regular expression into a Reg type, suitable for use with the
|
|
|
|
// Compiles the given regular expression into a Reg type, suitable for use with the
|
|
|
|
// matching functions. The second return value is non-nil if a compilation error has
|
|
|
|
// matching functions. The second return value is non-nil if a compilation error has
|
|
|
|
// occurred. As such, the error value must be checked before using the Reg returned by this function.
|
|
|
|
// occurred. As such, the error value must be checked before using the Reg returned by this function.
|
|
|
|
func Compile(re string) (Reg, error) {
|
|
|
|
// The second parameter is an optional list of flags, passed to the parsing function shuntingYard.
|
|
|
|
nodes, err := shuntingYard(re)
|
|
|
|
func Compile(re string, flags ...ReFlag) (Reg, error) {
|
|
|
|
|
|
|
|
nodes, err := shuntingYard(re, flags...)
|
|
|
|
if err != nil {
|
|
|
|
if err != nil {
|
|
|
|
return Reg{}, fmt.Errorf("Error parsing regex: %w", err)
|
|
|
|
return Reg{}, fmt.Errorf("Error parsing regex: %w", err)
|
|
|
|
}
|
|
|
|
}
|
|
|
|