You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1124 lines
46 KiB
Go

package regex
import (
"fmt"
"slices"
"strconv"
"unicode"
)
// notDotChars holds the characters that are _not_ matched by the dot metacharacter.
// It is overwritten by every call to shuntingYard: it is set to an empty list when
// RE_SINGLE_LINE is passed, and to a list containing only '\n' otherwise.
var notDotChars []rune
// A Reg represents the result of compiling a regular expression. It contains
// the start state of the NFA representation of the regex, and the number of
// capturing groups in the regex.
type Reg struct {
start *nfaState // Start state of the NFA built from the regex
numGroups int // Number of capturing groups in the regex
}
// concatRune is the explicit concatenation operator that shuntingYard inserts
// between adjacent operands. Its value lies in Unicode's Supplementary Private
// Use Area-A (U+F0000 and up).
const concatRune rune = 0xF0001
// ReFlag is an option passed to shuntingYard to control its behavior.
// Flags are passed as separate variadic arguments and are looked up individually
// with slices.Contains. They are sequential iota values, not a bitmask, so they
// must not be OR-ed together (RE_CASE_INSENSITIVE|RE_MULTILINE would equal RE_SINGLE_LINE).
type ReFlag int
const (
RE_NO_FLAGS ReFlag = iota // No flags set (the zero value)
RE_CASE_INSENSITIVE // Case insensitive matching
RE_MULTILINE // '^' and '$' assert at start and end of _line_, rather than start and end of input string
RE_SINGLE_LINE // Dot metacharacter matches newline characters.
)
// isOperator reports whether c is one of the operators that shuntingYard places
// on its operator stack: the quantifiers '+', '?' and '*', the alternation
// operator '|', or the internal concatenation operator.
func isOperator(c rune) bool {
	switch c {
	case '+', '?', '*', '|', concatRune:
		return true
	default:
		return false
	}
}
// priority returns the precedence of the given operator, where a larger value
// binds more tightly. Runes that are not operators yield -1, which is lower
// than the priority of every real operator.
func priority(op rune) int {
	// Listed from loosest-binding to tightest-binding; the position in this
	// list is the priority.
	byBinding := [...]rune{'|', concatRune, '+', '*', '?'}
	return slices.Index(byBinding[:], op)
}
// getPOSIXClass extracts the name of the POSIX character class at the start of
// the given string.
// The given string must be of the form:
//
//	'blah1:]blah2'
//
// in order to be a _syntactically_ valid POSIX class, where ']' is the
// rbracketRune metacharacter (not a literal closing bracket).
// Whether or not such a class actually exists is not relevant to this function, it just
// parses and returns 'blah1'.
// For example, if the regex was something like '[[:digit:]]', the caller must parse through the opening
// brackets and the colon, and call this function with the remainder.
//
// The first return value is true if the given string represents a
// syntactically valid POSIX class, and the second return value is then the
// class name. Otherwise, the return values are false and the empty string.
func getPOSIXClass(str []rune) (bool, string) {
	// Scan up to the first colon or closing bracket; everything before it is
	// the candidate class name.
	end := 0
	for end < len(str) && str[end] != ':' && str[end] != rbracketRune {
		end++
	}
	// The terminator must be followed by at least one more character, because
	// the closing bracket has to come after the colon.
	if end >= len(str)-1 {
		return false, ""
	}
	// The class must end with a colon and then a closing bracket. It cannot
	// end with a closing bracket first.
	if str[end] != ':' || str[end+1] != rbracketRune {
		return false, ""
	}
	return true, string(str[:end])
}
// caseInsensitive stores whether the case-insensitive flag has been enabled.
// It is reset to false and then set again by every call to shuntingYard, and
// read by thompson when expanding node contents.
// NOTE(review): this is package-level state, so concurrent compilations with
// different flags would presumably interfere with each other - confirm.
var caseInsensitive bool
// multilineMode stores whether the multiline flag has been enabled.
// In multi-line mode, '^' and '$' assert position at the start and
// end of a _line_ rather than the entire input.
// Like caseInsensitive, it is reset and set by every call to shuntingYard.
var multilineMode bool
// shuntingYard uses the Shunting-Yard algorithm to convert the given infix
// (regular) expression to a postfix list of nodes.
// The primary benefit of this is getting rid of parentheses.
// It also inserts explicit concatenation operators to make parsing easier in Thompson's algorithm.
//
// The function takes in 0 or more flags, which control the behavior of the
// parser. As a side effect, every call overwrites the package-level variables
// caseInsensitive, multilineMode and notDotChars according to those flags.
//
// The work is done in three passes:
//  1. Pre-processing: numeric ranges (<a-b>) are expanded, and non-capturing
//     group openers, unescaped brackets and escaped backslashes are replaced
//     by internal metacharacters.
//  2. Explicit concatenation operators are inserted.
//  3. The actual Shunting-Yard conversion to postfix nodes.
//
// An error can be returned for a multitude of reasons - the reason is specified in the error string.
// On error, the returned slice is nil.
//
// See: https://blog.cernera.me/converting-regular-expressions-to-postfix-notation-with-the-shunting-yard-algorithm/
func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
	// Check which flags are enabled. These are package-level variables, so
	// they are reset on every call.
	caseInsensitive = false
	multilineMode = false
	if slices.Contains(flags, RE_MULTILINE) {
		multilineMode = true
	}
	if slices.Contains(flags, RE_SINGLE_LINE) {
		notDotChars = []rune{}
	} else {
		notDotChars = []rune{'\n'}
	}
	if slices.Contains(flags, RE_CASE_INSENSITIVE) {
		caseInsensitive = true
	}

	re_postfix := make([]rune, 0)
	// Convert the string to a slice of runes to allow iteration through it
	re_runes_orig := []rune(re) // This is the rune slice before the first parsing loop (which detects and replaces numeric ranges)
	re_runes := make([]rune, 0)

	// The following checks are performed here:
	// 1. Check for numeric range. If we are at the start of a numeric range,
	// skip to end and construct the equivalent regex for the range.
	// The reason this is outside the loop below, is that it actually modifies
	// the given regex (we 'cut' the numeric range and 'paste' an equivalent regex).
	// It also makes the overall parsing easier, since I don't have to worry about the numeric range
	// anymore.
	// Eventually, I might be able to add it into the main parsing loop, to reduce the time
	// complexity.
	// A numeric range has the syntax: <num1-num2>. It matches all numbers in this range.
	//
	// 2. Check for non-capturing groups. The LPAREN of a non-capturing group looks like this: '(?:'
	// I take this out, and put in a special character - NONCAPLPAREN_CHAR.
	//
	// 3. Another check is made for unescaped brackets - opening brackets are replaced with
	// LBRACKET and closing brackets are replaced with RBRACKET.
	//
	// 4. Check for escaped backslashes. Replace these with the BACKSLASH
	// metacharacter. Later, in thompson(), these will be converted back. This avoids
	// confusion in detecting whether a character is escaped eg. detecting
	// whether '\\[a]' has an escaped opening bracket (it doesn't).
	//
	// 5. Check for non-greedy operators. These are not supported at the moment, so an error
	// must be thrown if the user attempts to use a non-greedy operator.
	for i := 0; i < len(re_runes_orig); i++ {
		c := re_runes_orig[i]
		if c == '<' && (i == 0 || (re_runes_orig[i-1] != '\\' && re_runes_orig[i-1] != '?')) {
			i++ // Step over opening angle bracket
			tmpStr := ""
			hyphenFound := false
			for i < len(re_runes_orig) && re_runes_orig[i] != '>' {
				if !unicode.IsDigit(re_runes_orig[i]) {
					if re_runes_orig[i] != '-' || (hyphenFound) {
						return nil, fmt.Errorf("invalid numeric range")
					}
				}
				if re_runes_orig[i] == '-' {
					hyphenFound = true
				}
				tmpStr += string(re_runes_orig[i])
				i++
			}
			// End of string reached and last character doesn't close the range
			if i == len(re_runes_orig) && re_runes_orig[len(re_runes_orig)-1] != '>' {
				return nil, fmt.Errorf("numeric range not closed")
			}
			if len(tmpStr) == 0 {
				return nil, fmt.Errorf("empty numeric range")
			}
			// Closing bracket will be skipped when the loop variable increments
			var rangeStart int
			var rangeEnd int
			// Both bounds must be present and parseable. Without this check,
			// something like '<5>' or '<-5>' would silently reach range2regex
			// with only partially-filled values.
			if n, err := fmt.Sscanf(tmpStr, "%d-%d", &rangeStart, &rangeEnd); n != 2 || err != nil {
				return nil, fmt.Errorf("invalid numeric range")
			}
			regex, err := range2regex(rangeStart, rangeEnd)
			if err != nil {
				return nil, err
			}
			re_runes = append(re_runes, []rune(regex)...)
		} else if c == '(' && i < len(re_runes_orig)-2 && re_runes_orig[i+1] == '?' && re_runes_orig[i+2] == ':' {
			re_runes = append(re_runes, nonCapLparenRune)
			i += 2
		} else if c == '\\' && i < len(re_runes_orig)-1 && re_runes_orig[i+1] == '\\' { // Escaped backslash
			re_runes = append(re_runes, escBackslashRune)
			i++
		} else if c == '[' && (len(re_runes) == 0 || re_runes[len(re_runes)-1] != '\\') {
			// The guard is on len(re_runes) rather than i, because it is
			// re_runes (the rewritten regex) that gets indexed.
			re_runes = append(re_runes, lbracketRune)
		} else if c == ']' && (len(re_runes) == 0 || re_runes[len(re_runes)-1] != '\\') {
			re_runes = append(re_runes, rbracketRune)
		} else if slices.Contains([]rune{'+', '*', '?'}, c) && (i < len(re_runes_orig)-1 && re_runes_orig[i+1] == '?') {
			return nil, fmt.Errorf("non-greedy operators are not supported")
		} else {
			re_runes = append(re_runes, c)
		}
	}

	/* Add concatenation operators.
	Only add a concatenation operator between two characters if both the following conditions are met:
		1. The first character isn't an opening parantheses or alteration operator (or an escape character)
			a. This makes sense, because these operators can't be _concatenated_ with anything else.
		2. The second character isn't a 'closing operator' - one that applies to something before it
			a. Again, these operators can't be concatenated _to_. They can, however, be concatenated _from_.
	Caveats:
		1. Don't mess with anything inside brackets - character class
		2. Don't mess with anything inside braces - numeric repetition
		3. Don't mess with any lookarounds.
	*/
	i := 0
	for i < len(re_runes) {
		re_postfix = append(re_postfix, re_runes[i])
		if re_runes[i] == lbracketRune && (i == 0 || re_runes[i-1] != '\\') { // We do not touch things inside brackets, unless they are escaped.
			toAppend := make([]rune, 0) // Holds all the runes in the current character class
			i++                         // Skip past LBRACKET, because it was already added
			for {
				// Make sure we haven't exceeded the length of the string. If we did, then the regex doesn't
				// actually have a closing bracket and we should throw an error. This must be checked
				// _before_ re_runes[i] is read, otherwise an unclosed class would index out of range.
				if i >= len(re_runes) {
					return nil, fmt.Errorf("opening bracket without closing bracket")
				}
				// Stop at the first _unescaped_ closing bracket.
				if re_runes[i] == rbracketRune && i > 0 && re_runes[i-1] != '\\' {
					break
				}
				if re_runes[i] == lbracketRune && i+1 < len(re_runes) && re_runes[i+1] == ':' { // POSIX character class
					toAppend = append(toAppend, re_runes[i])
					i++
					toAppend = append(toAppend, re_runes[i])
					i++
					for i < len(re_runes)-1 && re_runes[i] != ':' && re_runes[i] != ']' {
						toAppend = append(toAppend, re_runes[i])
						i++
					}
					if i >= len(re_runes) || (i >= len(re_runes)-1 && re_runes[i] != ':') {
						return nil, fmt.Errorf("unable to parse what looks like a POSIX character class")
					}
					toAppend = append(toAppend, re_runes[i])
					i++
					// The character after the POSIX class (normally its closing bracket) is
					// consumed below without being treated as the end of the outer class, so
					// it has to exist.
					if i >= len(re_runes) {
						return nil, fmt.Errorf("opening bracket without closing bracket")
					}
				}
				if re_runes[i] == '-' && (i > 0 && re_runes[i-1] != '\\') && (i < len(re_runes)-1 && re_runes[i+1] != rbracketRune) { // Unescaped hyphen, that has some character (not a RBRACKET) after it - This represents a character range, so we replace with CHAR_RANGE. This metacharacter will be used later on to construct the range
					re_runes[i] = charRangeRune
				}
				toAppend = append(toAppend, re_runes[i])
				i++
			}
			// Add in the RBRACKET
			toAppend = append(toAppend, rbracketRune)
			re_postfix = append(re_postfix, toAppend...)
		}
		if i < len(re_runes) && re_runes[i] == '{' && (i > 0 && re_runes[i-1] != '\\') { // We don't touch things inside braces, either
			i++ // Skip opening brace
			for i < len(re_runes) && re_runes[i] != '}' {
				re_postfix = append(re_postfix, re_runes[i])
				i++
			}
			if i == len(re_runes) {
				return nil, fmt.Errorf("invalid numeric specifier")
			}
			re_postfix = append(re_postfix, re_runes[i]) // Append closing brace
		}
		if i < len(re_runes)-3 && string(re_runes[i+1:i+4]) == "(?:" { // Non-capturing lparen
			re_postfix = append(re_postfix, nonCapLparenRune)
			i += 3
		}
		if i < len(re_runes) && re_runes[i] == '\\' { // Something is being escaped (I don't add the backslash to re_postfix, because it was already added earlier)
			i++
			if i >= len(re_runes) {
				return nil, fmt.Errorf("stray backslash in expression")
			}
			if re_runes[i] == 'x' {
				re_postfix = append(re_postfix, re_runes[i])
				i++
				if i >= len(re_runes) {
					return nil, fmt.Errorf("stray backslash in expression")
				}
				if re_runes[i] == '{' {
					// An expanded hex code is copied as exactly 8 runes (opening brace to
					// closing brace). Verify that they exist before slicing.
					if i+8 > len(re_runes) {
						return nil, fmt.Errorf("stray backslash in expression")
					}
					re_postfix = append(re_postfix, re_runes[i:i+8]...)
					i += 7 // Stay on the last copied rune, so the concatenation check and the increment at the bottom of the loop work as intended
				} else if isHex(re_runes[i]) {
					if i+2 > len(re_runes) {
						return nil, fmt.Errorf("not enough hex characters found in expression")
					}
					re_postfix = append(re_postfix, re_runes[i:i+2]...)
					// Stay on the second hex digit (like the octal case below). Stepping past
					// it here would make the increment at the bottom of the loop skip the
					// character that follows the escape.
					i++
				} else {
					return nil, fmt.Errorf("invalid hex value in expression")
				}
			} else if isOctal(re_runes[i]) {
				numDigits := 1
				for i+numDigits < len(re_runes) && numDigits < 3 && isOctal(re_runes[i+numDigits]) { // Skip while we see an octal character (max of 3)
					numDigits++
				}
				re_postfix = append(re_postfix, re_runes[i:i+numDigits]...)
				i += (numDigits - 1) // I have to move back a step, so that I can add a concatenation operator if necessary, and so that the increment at the bottom of the loop works as intended
			} else {
				re_postfix = append(re_postfix, re_runes[i])
			}
		}
		if i < len(re_runes) && re_runes[i] == '(' && (i == 0 || re_runes[i-1] != '\\') && (i < len(re_runes)-2 && re_runes[i+1] == '?' && slices.Contains([]rune{'=', '!', '<'}, re_runes[i+2])) { // Unescaped open parentheses followed by question mark then '<', '!' or '=' => lookaround. Don't mess with it.
			i++ // Step inside
			if i == len(re_runes)-1 || (re_runes[i+1] != '=' && re_runes[i+1] != '!' && re_runes[i+1] != '<') {
				return nil, fmt.Errorf("invalid regex - lookaround intended?")
			}
			re_postfix = append(re_postfix, re_runes[i])
			i++
			lookaroundDepth := 1 // Number of parentheses that are still open inside the lookaround
			for lookaroundDepth != 0 {
				if i >= len(re_runes) {
					return nil, fmt.Errorf("unclosed lookaround")
				}
				if re_runes[i] == '(' || re_runes[i] == nonCapLparenRune {
					lookaroundDepth++
				}
				if re_runes[i] == ')' {
					lookaroundDepth--
					if lookaroundDepth == 0 {
						break
					}
				}
				re_postfix = append(re_postfix, re_runes[i])
				i++
			}
			// i now points at the closing parenthesis, which is appended at the top of the next iteration.
			continue
		}
		if (i < len(re_runes) && (re_runes[i] != '(' && re_runes[i] != nonCapLparenRune && re_runes[i] != '|' && re_runes[i] != '\\')) || (i > 0 && re_runes[i-1] == '\\') { // Every character should be concatenated if it is escaped
			if i < len(re_runes)-1 {
				if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != '+' && re_runes[i+1] != '?' && re_runes[i+1] != ')' && re_runes[i+1] != '{' {
					re_postfix = append(re_postfix, concatRune)
				}
			}
		}
		i++
	}

	opStack := make([]rune, 0)         // Operator stack
	outQueue := make([]postfixNode, 0) // Output queue

	// Actual algorithm
	numOpenParens := 0 // Number of open parentheses
	for i := 0; i < len(re_postfix); i++ {
		/* Two cases:
		1. Current character is alphanumeric - send to output queue
		2. Current character is operator - do the following:
			a. If current character has greater priority than top of opStack, push to opStack.
			b. If not, keep popping from opStack (and appending to outQueue) until:
				i. opStack is empty, OR
				ii. current character has greater priority than top of opStack
		3. If current character is '(' or NONCAPLPAREN_CHAR, push to opStack
		4. If current character is ')', pop from opStack (and append to outQueue) until '(' is found. Discard parantheses.
		5. If current character is '[', find all the characters until ']', then create a postfixNode containing all these contents. Add this node to outQueue.
		6. If current character is '{', find the appropriate numeric specifier (range start, range end). Apply the range to the postfixNode at the end of outQueue.
		*/
		c := re_postfix[i]
		if isNormalChar(c) || isSpecialCharWithMetacharReplacement(c) {
			outQueue = append(outQueue, newPostfixNode(c))
			continue
		}
		// Since every unescaped bracket is replaced by a LBRACKET / RBRACKET, there may
		// have been false positives. For example, the regex ']' has a closing bracket, but it
		// isn't denoting a character class; it's just a regular character. Since it's not escaped,
		// though, I would have converted this into an RBRACKET.
		// To deal with this, I make the following assertion:
		// If at any point I see an RBRACKET 'in the wild' (not in a character class), then it must be
		// a regular character, with no special significance.
		if c == rbracketRune {
			outQueue = append(outQueue, newPostfixCharNode(']'))
			continue
		}
		if c == '\\' { // Escape character - invert special and non-special characters eg. \( is treated as a literal parentheses, \b is treated as word boundary
			if i == len(re_postfix)-1 { // End of string - throw error, because backslash is an escape character (something needs to come after it)
				return nil, fmt.Errorf("backslash with no escape character")
			}
			i++
			if re_postfix[i] == 'x' { // Hex value
				i++
				// The bounds are checked before re_postfix[i] is read.
				if i < len(re_postfix)-6 && re_postfix[i] == '{' { // Expanded hex code
					var hexVal int
					n, err := fmt.Sscanf(string(re_postfix[i:]), "{%x}", &hexVal)
					if n < 1 || err != nil {
						return nil, fmt.Errorf("error parsing expanded hex code in expression")
					}
					outQueue = append(outQueue, newPostfixCharNode(rune(hexVal)))
					i += 7
				} else if i < len(re_postfix)-1 { // Two-digit hex code
					hexVal, err := strconv.ParseInt(string([]rune{re_postfix[i], re_postfix[i+1]}), 16, 64) // Convert the two hex values into a rune slice, then to a string. Parse the string into an int with strconv.ParseInt()
					if err != nil {
						return nil, fmt.Errorf("error parsing hex characters in expression")
					}
					i++ // Loop increment will take care of going forward
					outQueue = append(outQueue, newPostfixCharNode(rune(hexVal)))
				} else {
					return nil, fmt.Errorf("not enough hex characters found in expression")
				}
			} else if isOctal(re_postfix[i]) { // Octal value
				var octValStr string
				numDigitsParsed := 0
				for (i+numDigitsParsed) < len(re_postfix) && isOctal(re_postfix[i+numDigitsParsed]) && numDigitsParsed <= 3 {
					octValStr += string(re_postfix[i+numDigitsParsed])
					numDigitsParsed++
				}
				octVal, err := strconv.ParseInt(octValStr, 8, 32)
				if err != nil {
					return nil, fmt.Errorf("error parsing octal value in expression")
				}
				if octVal > 0777 {
					return nil, fmt.Errorf("invalid octal value in expression")
				}
				i += numDigitsParsed - 1 // Shift forward by the number of digits that were parsed. Move back one character, because the loop increment will move us back to the next character automatically
				outQueue = append(outQueue, newPostfixCharNode(rune(octVal)))
			} else {
				escapedNode, err := newEscapedNode(re_postfix[i], false)
				if err != nil {
					return nil, err
				}
				outQueue = append(outQueue, escapedNode)
			}
			continue // Escaped character will automatically be skipped when loop variable increments
		}
		if c == '.' { // Dot metacharacter
			outQueue = append(outQueue, newPostfixDotNode())
			continue
		}
		if c == '^' || c == '$' { // Start-of-string and end-of-string assertions
			outQueue = append(outQueue, newPostfixNode(c))
			continue
		}
		// Check if we're at the start of a lookaround
		if c == '(' && i < len(re_postfix)-1 && re_postfix[i+1] == '?' {
			i += 2      // Skip opening paren and question mark
			regex := "" // Stores lookaround regex
			lookaroundDepth := 1
			for lookaroundDepth != 0 {
				if i >= len(re_postfix) {
					return nil, fmt.Errorf("unclosed lookaround")
				}
				if re_postfix[i] == '(' || re_postfix[i] == nonCapLparenRune {
					lookaroundDepth++
				}
				if re_postfix[i] == ')' {
					lookaroundDepth--
					if lookaroundDepth == 0 {
						break
					}
				}
				regex += string(re_postfix[i])
				i++
			}
			if len(regex) <= 1 { // Nothing in regex - throw error
				return nil, fmt.Errorf("invalid lookaround. (too short?)")
			}
			// 'regex' should now contain the lookaround regex, plus the characters at the start (which indicate pos/neg, ahead/behind)
			// Now we should filter that out.
			toAppend := postfixNode{nodetype: assertionNode, startReps: 1, endReps: 1}
			if regex[0] == '<' { // Lookbehind
				toAppend.lookaroundDir = lookbehind
				regex = regex[1:]
			} else if regex[0] == '=' || regex[0] == '!' {
				toAppend.lookaroundDir = lookahead
			} else {
				return nil, fmt.Errorf("invalid lookaround")
			}
			// Positive or negative
			if regex[0] == '=' { // Positive
				toAppend.lookaroundSign = positive
				toAppend.contents = []rune(regex[1:])
			} else if regex[0] == '!' { // Negative
				toAppend.lookaroundSign = negative
				toAppend.contents = []rune(regex[1:])
			} else {
				return nil, fmt.Errorf("invalid lookaround")
			}
			outQueue = append(outQueue, toAppend)
			continue
		}
		if isOperator(c) {
			if len(opStack) == 0 {
				opStack = append(opStack, c)
			} else {
				topStack, err := peek(opStack)
				if err != nil {
					return nil, fmt.Errorf("operator without operand")
				}
				if priority(c) > priority(topStack) { // 2a
					opStack = append(opStack, c)
				} else {
					// 2b. When the stack runs empty, peek yields the zero rune, whose
					// priority (-1) is below every operator, so the loop terminates.
					for priority(c) <= priority(topStack) {
						to_append := mustPop(&opStack)
						outQueue = append(outQueue, newPostfixNode(to_append))
						topStack, _ = peek(opStack)
					}
					outQueueFinalElement, _ := peek(outQueue)
					if (c == '*' && outQueueFinalElement.nodetype == kleeneNode) || (c == '+' && outQueueFinalElement.nodetype == plusNode) { // You cannot apply a quantifier to a quantifier in this way
						return nil, fmt.Errorf("illegal use of token '%c'", c)
					}
					opStack = append(opStack, c)
				}
			}
		}
		if c == lbracketRune { // Used for character classes
			firstCharAdded := false // A character class must have at least 1 character. This flag checks if the first character has been added.
			endOfRange := false     // Set to 'true' when we encounter a CHAR_RANGE metacharacter
			i++                     // Step forward so we can look at the character class
			// Oops, there's nothing there to look at
			if i >= len(re_postfix) {
				return nil, fmt.Errorf("opening bracket with no closing bracket")
			}
			// Check if a POSIX character class was specified outside a bracket. This is an error.
			// Eg. [:digit:] should lead to an error, telling the user that the right syntax is [[:digit:]]
			if re_postfix[i] == ':' {
				posixClassPresent, _ := getPOSIXClass(re_postfix[i+1:])
				if posixClassPresent {
					return nil, fmt.Errorf("the syntax for POSIX character classes is [[:digit:]], not [:digit:]")
				}
			}
			var invertMatch bool
			if re_postfix[i] == '^' {
				invertMatch = true
				i++
			}
			chars := make([]postfixNode, 0) // List of nodes - used only for character classes
			for i < len(re_postfix) {
				if firstCharAdded && re_postfix[i] == rbracketRune {
					break
				}
				if re_postfix[i] == charRangeRune {
					endOfRange = true
					i++
					continue
				}
				if re_postfix[i] == '\\' { // Backslash indicates a character to be escaped
					if i == len(re_postfix)-1 {
						return nil, fmt.Errorf("stray backslash in character class")
					}
					i++                       // Step past backslash
					if re_postfix[i] == 'x' { // Hex value
						i++
						// The bounds are checked before re_postfix[i] is read.
						if i < len(re_postfix)-7 && re_postfix[i] == '{' { // Expanded hex code
							var hexVal int
							n, err := fmt.Sscanf(string(re_postfix[i:]), "{%x}", &hexVal)
							if n < 1 || err != nil {
								return nil, fmt.Errorf("error parsing expanded hex code in character class")
							}
							chars = append(chars, newPostfixCharNode(rune(hexVal)))
							i += 8
						} else if i < len(re_postfix)-2 { // Two-digit hex code
							hexVal, err := strconv.ParseInt(string([]rune{re_postfix[i], re_postfix[i+1]}), 16, 64) // Convert the two hex values into a rune slice, then to a string. Parse the string into an int with strconv.ParseInt()
							if err != nil {
								return nil, fmt.Errorf("error parsing hex characters in character class")
							}
							i += 2
							chars = append(chars, newPostfixCharNode(rune(hexVal)))
						} else {
							return nil, fmt.Errorf("not enough hex characters found in character class")
						}
					} else if isOctal(re_postfix[i]) { // Octal value
						var octValStr string
						numDigitsParsed := 0
						for (i+numDigitsParsed) < len(re_postfix)-1 && isOctal(re_postfix[i+numDigitsParsed]) && numDigitsParsed <= 3 { // The '-1' exists, because even in the worst case (the character class extends till the end), the last character must be a closing bracket (and nothing else)
							octValStr += string(re_postfix[i+numDigitsParsed])
							numDigitsParsed++
						}
						octVal, err := strconv.ParseInt(octValStr, 8, 32)
						if err != nil {
							return nil, fmt.Errorf("error parsing octal value in character class")
						}
						if octVal > 0777 {
							return nil, fmt.Errorf("invalid octal value in character class")
						}
						i += numDigitsParsed // Shift forward by the number of characters parsed
						chars = append(chars, newPostfixCharNode(rune(octVal)))
					} else {
						escapedNode, err := newEscapedNode(re_postfix[i], true)
						if err != nil {
							return nil, err
						}
						chars = append(chars, escapedNode)
						i++
					}
				} else {
					if re_postfix[i] == lbracketRune && i < len(re_postfix)-8 { // Could be the start of a POSIX class - the smallest POSIX class by word-length [[:word:]] takes 8 more characters
						temp_i := i
						temp_i++
						if re_postfix[temp_i] == ':' {
							temp_i++
							posixClassPresent, posixClass := getPOSIXClass(re_postfix[temp_i:])
							// getPOSIXClass returns true if there is some set of characters that
							// ends in a colon and then a closing bracket. If this is not the case, we
							// just treat all the characters as literals.
							// For example, [[:digit:a]] is _not_ a POSIX class, its just a regular
							// character class contains the letters '[', ':', 'd', 'i', 'g', 'i', 't', ':', 'a'.
							// The final 'closing bracket' has no special meaning, its just another character.
							if posixClassPresent {
								var nodeToAdd postfixNode
								switch posixClass {
								case "digit": // Equivalent to '\d'
									nodeToAdd = newPostfixCharNode(genRangeInclusive('0', '9')...)
								case "upper": // [A-Z]
									charsToAdd := genRangeInclusive('A', 'Z')
									nodeToAdd = newPostfixCharNode(charsToAdd...)
								case "lower": // [a-z]
									charsToAdd := genRangeInclusive('a', 'z')
									nodeToAdd = newPostfixCharNode(charsToAdd...)
								case "alpha": //[A-Za-z]
									nodeToAdd = newPostfixCharNode(slices.Concat(genRangeInclusive('A', 'Z'), genRangeInclusive('a', 'z'))...)
								case "xdigit": // [0-9A-Fa-f]
									nodeToAdd = newPostfixCharNode(slices.Concat(genRangeInclusive('A', 'F'), genRangeInclusive('a', 'f'), genRangeInclusive('0', '9'))...)
								case "alnum": // [A-Za-z0-9]
									nodeToAdd = newPostfixCharNode(slices.Concat(genRangeInclusive('A', 'Z'), genRangeInclusive('a', 'z'), genRangeInclusive('0', '9'))...)
								case "blank": // [ \t]
									nodeToAdd = newPostfixCharNode(' ', '\t')
								case "space": // [ \t\n\r\f\v]
									nodeToAdd = newPostfixCharNode(' ', '\t', '\n', '\r', '\f', '\v')
								case "cntrl": // Control characters
									nodeToAdd = newPostfixCharNode(append(genRangeInclusive('\x00', '\x1F'), '\x7F')...)
								case "punct": // Punctuation and symbols
									nodeToAdd = newPostfixCharNode([]rune(`!"\#$%&'()*+,\-./:;<=>?@\[\\\]^_` + "`" + `{|}~`)...)
								case "graph": // Graphic characters
									nodeToAdd = newPostfixCharNode(genRangeInclusive('\x21', '\x7E')...)
								case "print": // Graphic characters + space
									nodeToAdd = newPostfixCharNode(genRangeInclusive('\x20', '\x7E')...)
								case "ascii": // ASCII values
									nodeToAdd = newPostfixCharNode(genRangeInclusive('\x00', '\x7F')...)
								case "word": // Word characters
									nodeToAdd, _ = newEscapedNode('w', true) // This isn't going to error, so I suppress it
								default:
									// Syntactically valid, but not a class we know about. An explicit
									// error is built here, because no error value is available at this
									// point to pass along.
									return nil, fmt.Errorf("invalid POSIX character class '%s'", posixClass)
								}
								chars = append(chars, nodeToAdd)
								i = temp_i + len(posixClass) + 2 // Skip over the class name, the closing colon and the closing bracket
								firstCharAdded = true
								continue
							}
						}
					}
					// This used to be an else statement - I removed it, because if the previous if-block fails
					// (ie. if it didn't actually find a character class), then this block must still execute.
					// However, the checks for character classes are nested, so placing this inside an 'else' block
					// will prevent it from running, as the outer if-statement will have evaluated to true.
					if !firstCharAdded && re_postfix[i] > 0xF0000 { // It's a metacharacter that I defined, I'll have to convert it back to the regular character before adding it back, because I haven't added any characters yet. For example, '[[]', the second LBRACKET should be treated like a literal bracket.
						switch re_postfix[i] {
						case lbracketRune:
							chars = append(chars, newPostfixCharNode('['))
						case rbracketRune:
							chars = append(chars, newPostfixCharNode(']'))
						default:
							return nil, fmt.Errorf("error parsing high-range unicode value in character class")
						}
					}
					// NOTE(review): when the block above ran, the metacharacter itself is
					// appended here as well, in addition to its literal form - confirm that
					// this is intended.
					chars = append(chars, newPostfixCharNode(re_postfix[i]))
					i++
				}
				firstCharAdded = true
				if endOfRange { // The previous character was an unescaped hyphen, which (in the context of a character class) means the character that was last appended is the end of a character range
					// Things to note:
					// 1. In PCRE and Go's regex engine, a letter _can_ be surrounded by hyphens in a character class.
					// Eg. [a-b-c]
					// While you might think this leads to a syntax error (I thought so), the engine picks 'a-b' as a range,
					// then treats the second '-' and 'c' as regular characters in the character class.
					// So this regex becomes "Match a character from 'a' to 'b', a literal hyphen, or 'c' ".
					// 2. To account for this, the following logic is followed:
					// a. If the second-to-last postfixNode ie. the start of the range has only one element, then we are in a range.
					// i. If it has more than one element, then we are actually looking at a literal hyphen, and we will treat is as such.
					// ii. If either the start or end of the range don't exist in 'chars' ie. something like [-a] or [a-], then too will we treat it as a literal hyphen.
					// b. The last postfixNode added to 'chars' _must_ only have one character (because it's the end of the range).
					endRangePostfixNode, err1 := pop(&chars)
					startRangePostfixNode, err2 := pop(&chars)
					if (err1 != nil || err2 != nil) || len(startRangePostfixNode.contents) != 1 { // Treat it as a regular hyphen
						chars = append(chars, startRangePostfixNode, newPostfixCharNode('-'), endRangePostfixNode)
					} else if len(endRangePostfixNode.contents) != 1 { // I don't even know what this would look like, this is just a sanity check
						return nil, fmt.Errorf("error parsing character range")
					} else {
						// We have established that they both have a length of 1
						startRangeRune := startRangePostfixNode.contents[0]
						endRangeRune := endRangePostfixNode.contents[0]
						if startRangeRune > endRangeRune {
							return nil, fmt.Errorf("character range syntax is [a-b], not [b-a]")
						}
						chars = append(chars, newPostfixCharNode(genRangeInclusive(startRangeRune, endRangeRune)...))
					}
					endOfRange = false // Reset the flag
				}
			}
			if i == len(re_postfix) { // We have reached the end of the string, so we didn't encounter a closing bracket. Throw error.
				return nil, fmt.Errorf("opening bracket without closing bracket")
			}
			outQueue = append(outQueue, newCharClassNode(chars, invertMatch))
			continue
		}
		if c == '{' {
			i++ // Skip opening brace
			// Three possibilities:
			// 1. Single number - {5}
			// 2. Range - {3,5}
			// 3. Start with no end, {3,}
			startRange := make([]rune, 0)
			endRange := make([]rune, 0)
			endRangeNum := 0
			for i < len(re_postfix) && unicode.IsDigit(re_postfix[i]) {
				startRange = append(startRange, re_postfix[i])
				i++
			}
			if len(startRange) == 0 { // {} is not valid, neither is {,5}
				return nil, fmt.Errorf("invalid numeric specifier")
			}
			if i == len(re_postfix) {
				return nil, fmt.Errorf("brace not closed")
			}
			startRangeNum, err := strconv.Atoi(string(startRange))
			if err != nil {
				return nil, fmt.Errorf("invalid numeric range")
			}
			if re_postfix[i] == '}' { // Case 1 above
				endRangeNum = startRangeNum
			} else {
				if re_postfix[i] != ',' {
					return nil, fmt.Errorf("invalid numeric specifier")
				}
				i++ // Skip comma
				for i < len(re_postfix) && unicode.IsDigit(re_postfix[i]) {
					endRange = append(endRange, re_postfix[i])
					i++
				}
				if i == len(re_postfix) {
					return nil, fmt.Errorf("brace not closed")
				}
				if re_postfix[i] != '}' {
					return nil, fmt.Errorf("invalid start range for numeric specifier")
				}
				if len(endRange) == 0 { // Case 3 above
					endRangeNum = infinite_reps
				} else { // Case 2 above
					endRangeNum, err = strconv.Atoi(string(endRange))
					if err != nil {
						return nil, fmt.Errorf("invalid end range for numeric specifier")
					}
				}
			}
			idx := len(outQueue) - 1
			// Get the last added node
			if idx < 0 || outQueue[idx].nodetype == lparenNode {
				return nil, fmt.Errorf("numeric specifier with no content")
			}
			outQueue[idx].startReps = startRangeNum
			outQueue[idx].endReps = endRangeNum
		}
		if c == '(' || c == nonCapLparenRune {
			opStack = append(opStack, c)
			if c == '(' { // We only push _capturing_ group parentheses to outQueue
				outQueue = append(outQueue, newPostfixNode(c))
			}
			numOpenParens++
		}
		if c == ')' {
			// Keep popping from opStack until we encounter an opening parantheses or a NONCAPLPAREN_CHAR. Throw error if we reach the end of the stack.
			var val rune
			var err error
			for val, err = peek(opStack); val != '(' && val != nonCapLparenRune; val, err = peek(opStack) {
				if err != nil {
					return nil, fmt.Errorf("imbalanced parantheses")
				}
				to_append := mustPop(&opStack)
				outQueue = append(outQueue, newPostfixNode(to_append))
			}
			_ = mustPop(&opStack) // Get rid of opening parentheses
			if val == '(' {       // Whatever was inside the parentheses was a _capturing_ group, so we append the closing parentheses as well
				outQueue = append(outQueue, newPostfixNode(')')) // Add closing parentheses
			}
			numOpenParens--
		}
	}

	// Pop all remaining operators (and append to outQueue)
	for len(opStack) > 0 {
		to_append := mustPop(&opStack)
		outQueue = append(outQueue, newPostfixNode(to_append))
	}
	if numOpenParens != 0 {
		return nil, fmt.Errorf("imbalanced parantheses")
	}
	return outQueue, nil
}
// thompson implements Thompson's construction: it consumes a regex in postfix
// form (as produced by shuntingYard) and builds the equivalent NFA.
//
// The postfix nodes are processed left to right against a stack of partially
// built NFA fragments. Operand nodes (characters, assertions, character
// classes, group parentheses) push a fragment; operator nodes (concatenation,
// alternation, '*', '+', '?') pop their operands and push the combined
// fragment. A numeric specifier attached to a node is expanded after the node
// itself has been processed.
//
// On success, the returned Reg holds the start state of the NFA and the number
// of capturing groups encountered. A non-nil error is returned if the postfix
// input is malformed (missing operands, imbalanced parentheses, invalid numeric
// specifier, more than one fragment left over) or if a lookaround sub-expression
// fails to parse or compile. The Reg is the zero value whenever the error is non-nil.
func thompson(re []postfixNode) (Reg, error) {
	nfa := make([]*nfaState, 0) // Stack of states
	numGroups := 0              // Number of capturing groups

	// expandCases returns rs unchanged when case-insensitive matching is off.
	// Otherwise, every rune is replaced by the runes returned by allCases for it.
	expandCases := func(rs []rune) []rune {
		if !caseInsensitive {
			return rs
		}
		return slices.Concat(funcMap(rs, func(r rune) []rune {
			return allCases(r, caseInsensitive)
		})...)
	}

	// If thompson() receives an empty regex, then whatever was given to shuntingYard()
	// was parsed away. This doesn't mean that the regex itself is empty.
	// For example, it could have been '(?:)'. This is an empty non-capturing group. Since
	// shuntingYard() doesn't include non-capturing groups in its output (and the group contains
	// nothing), the output of shuntingYard() (and the input to thompson()) ends up being empty.
	// In these cases, we will return an NFA with 1 state, with an assertion that is always true.
	if len(re) == 0 {
		start := zeroLengthMatchState()
		nfa = append(nfa, &start)
	}

	for _, c := range re {
		if c.nodetype == characterNode || c.nodetype == assertionNode {
			stateToAdd := nfaState{}
			stateToAdd.transitions = make(map[int][]*nfaState)
			if c.allChars {
				stateToAdd.allChars = true
				// For each node that is being 'excepted' (eg. in an inverted character class):
				//   - If the node itself has exceptions, then the exceptions cancel out.
				//     Eg. [^\w] == [\W]
				//   - Since an allChars node is the only kind that _can_ have exceptions, that's what is checked.
				//   - If the node doesn't have exceptions (allChars == false) then the contents of the
				//     node are added to the except list.
				for _, node := range c.except {
					if node.allChars {
						stateToAdd.allChars = false
						// Extract the contents of every postfixNode in node.except, concatenate them all,
						// and make them the state's _content_. As mentioned above, if the exception has
						// exceptions, then we can match those.
						nodeExceptChars := slices.Concat(funcMap(node.except, func(inner postfixNode) []rune {
							return expandCases(inner.contents)
						})...)
						stateToAdd.content = rune2Contents(nodeExceptChars)
					} else {
						stateToAdd.except = append(stateToAdd.except, expandCases(node.contents)...)
					}
				}
			}
			// Convert the current contents to []int, convert the result of rune2Contents to []int, append, then
			// convert back to stateContents.
			runesToAdd := expandCases(c.contents)
			stateToAdd.content = stateContents(append([]int(stateToAdd.content), []int(rune2Contents(runesToAdd))...))
			stateToAdd.output = []*nfaState{&stateToAdd}
			stateToAdd.isEmpty = false
			if c.nodetype == assertionNode {
				// This is a little weird. A lookaround has the 'isEmpty' flag set, even though it _isn't_ empty
				// (the contents are the regex). But there is so much error-checking that relies on this flag
				// that it's better to keep it this way.
				stateToAdd.isEmpty = true
				// Ideally, an assertion shouldn't have any content, since it doesn't say anything about the
				// content of the string.
				stateToAdd.content = newContents(epsilon)
				if c.lookaroundDir == 0 || c.lookaroundSign == 0 {
					// Plain assertion: the first rune of the contents selects which one.
					switch c.contents[0] {
					case '^':
						stateToAdd.assert = sosAssert
					case '$':
						stateToAdd.assert = eosAssert
					case 'b':
						stateToAdd.assert = wboundAssert
					case 'B':
						stateToAdd.assert = nonwboundAssert
					}
				} else { // Lookaround
					stateToAdd.lookaroundRegex = string(c.contents)
					if c.lookaroundDir == lookahead {
						if c.lookaroundSign == positive {
							stateToAdd.assert = plaAssert
						}
						if c.lookaroundSign == negative {
							stateToAdd.assert = nlaAssert
						}
					}
					if c.lookaroundDir == lookbehind {
						if c.lookaroundSign == positive {
							stateToAdd.assert = plbAssert
						}
						if c.lookaroundSign == negative {
							stateToAdd.assert = nlbAssert
						}
					}
					// The lookaround's own regex is compiled recursively into a separate NFA.
					// NOTE(review): no flags are forwarded to shuntingYard here. If shuntingYard
					// (re)initialises package-level flag state such as caseInsensitive, nodes processed
					// after this lookaround could be affected - confirm.
					tmpRe, err := shuntingYard(stateToAdd.lookaroundRegex)
					if err != nil {
						return Reg{}, fmt.Errorf("error parsing lookaround: %w", err)
					}
					reg, err := thompson(tmpRe)
					if err != nil {
						return Reg{}, fmt.Errorf("error compiling lookaround: %w", err)
					}
					stateToAdd.lookaroundNFA = reg.start
					stateToAdd.lookaroundNumCaptureGroups = reg.numGroups
				}
			}
			// Replace ESC_BACKSLASH with actual backslash, so that we can actually check if we encounter it
			replaceByValue([]int(stateToAdd.content), int(escBackslashRune), '\\')
			replaceByValue(stateToAdd.except, escBackslashRune, '\\')
			nfa = append(nfa, &stateToAdd)
		}

		if c.nodetype == lparenNode || c.nodetype == rparenNode {
			s := &nfaState{}
			s.assert = noneAssert
			s.content = newContents(epsilon)
			s.isEmpty = true
			s.output = []*nfaState{s}
			s.transitions = make(map[int][]*nfaState)
			// LPAREN nodes are just added normally
			if c.nodetype == lparenNode {
				numGroups++
				s.groupBegin = true
				s.groupNum = numGroups
				nfa = append(nfa, s)
				continue
			}
			// For RPAREN nodes, assume that the last two states on the stack are an LPAREN,
			// and then some other state.
			// These three states (LPAREN, the middle state and RPAREN) are extracted together, concatenated
			// and added back in.
			// If the middle state doesn't exist (ie. something like '()' ), that's fine, the LPAREN
			// and RPAREN states are simply connected.
			// If neither state exists, that's a problem so an error is returned.
			s.groupEnd = true
			innerState, err1 := pop(&nfa)
			openState, err2 := pop(&nfa)
			switch {
			case err1 != nil && err2 != nil:
				return Reg{}, fmt.Errorf("imbalanced parentheses")
			case err2 != nil: // There was no third state. ie. something like '()'
				openState = innerState
				if !openState.groupBegin { // There are only two states, but the first one isn't an LPAREN.
					return Reg{}, fmt.Errorf("imbalanced parentheses")
				}
				s.groupNum = openState.groupNum
				nfa = append(nfa, concatenate(openState, s))
			default:
				// At this point, all three states exist ('openState', 'innerState' and 's')
				if openState.groupBegin {
					s.groupNum = openState.groupNum
				} else if innerState.groupBegin { // Something like 'a()'
					s.groupNum = innerState.groupNum
				} else { // Both states exist, but neither is actually an LPAREN.
					return Reg{}, fmt.Errorf("imbalanced parentheses")
				}
				tmp := concatenate(openState, innerState)
				nfa = append(nfa, concatenate(tmp, s))
			}
		}

		if c.nodetype == charclassNode { // A character class consists of all the nodes in it, alternated
			// Map the list of nodes to a list of states, each state containing the contents of a specific node
			states := funcMap(c.nodeContents, func(node postfixNode) *nfaState {
				s := newState()
				s.content = rune2Contents(expandCases(node.contents))
				if len(node.except) > 0 {
					s.allChars = true
					s.except = slices.Concat(funcMap(node.except, func(n postfixNode) []rune {
						return n.contents
					})...)
				}
				return &s
			})
			// Reduce the list of states down to a single state by alternating them
			toAdd := funcReduce(states, func(s1 *nfaState, s2 *nfaState) *nfaState {
				return alternate(s1, s2)
			})
			nfa = append(nfa, toAdd)
		}

		// Must be an operator if it isn't a character
		switch c.nodetype {
		case concatenateNode:
			// An empty stack means the postfix input is malformed; report it instead of panicking.
			s2, err := pop(&nfa)
			if err != nil {
				return Reg{}, fmt.Errorf("error applying concatenation")
			}
			// Relax the requirements for concatenation a little bit - If
			// the second element is not found ie. the postfixNodes look
			// like 'a'+CONCAT, then that's fine, we just skip the concatenation.
			s1, err := pop(&nfa)
			if err != nil {
				nfa = append(nfa, s2)
			} else {
				nfa = append(nfa, concatenate(s1, s2))
			}
		case kleeneNode: // Create a 0-state, concat the popped state after it, concat the 0-state after the popped state
			s1, err := pop(&nfa)
			if err != nil {
				return Reg{}, fmt.Errorf("error applying kleene star")
			}
			stateToAdd, err := kleene(*s1)
			if err != nil {
				return Reg{}, err
			}
			nfa = append(nfa, stateToAdd)
		case plusNode: // a+ is equivalent to aa*
			s1, err := pop(&nfa)
			if err != nil {
				return Reg{}, fmt.Errorf("error applying plus operator")
			}
			s2, err := kleene(*s1)
			if err != nil {
				return Reg{}, err
			}
			nfa = append(nfa, concatenate(s1, s2))
		case questionNode: // ab? is equivalent to a(b|)
			s1, err := pop(&nfa)
			if err != nil {
				return Reg{}, fmt.Errorf("error applying question operator")
			}
			nfa = append(nfa, question(s1))
		case pipeNode:
			// A pipe operator doesn't actually need either operand to be present. If an operand isn't present,
			// it is replaced with an implicit 'matchZeroLength' state (this is the same thing that is added at
			// the top if the input has zero postfixNodes).
			// Things to think about:
			// 		'a|'
			// 		'|a'
			// 		'^a|'
			// 		'^|a'
			s1, err1 := pop(&nfa)
			s2, err2 := pop(&nfa)
			if err2 != nil || (s2.groupBegin && len(s2.transitions) == 0) { // Doesn't exist, or it's just an LPAREN
				if err2 == nil { // The state existed, but it was an LPAREN, so it is pushed back
					nfa = append(nfa, s2)
				}
				tmp := zeroLengthMatchState()
				s2 = &tmp
			}
			if err1 != nil || (s1.groupBegin && len(s1.transitions) == 0) { // Doesn't exist, or it's just an LPAREN
				if err1 == nil { // See above for explanation
					nfa = append(nfa, s1)
				}
				tmp := zeroLengthMatchState()
				s1 = &tmp
			}
			nfa = append(nfa, alternate(s1, s2))
		}

		if c.startReps != 1 || c.endReps != 1 { // Must have a numeric specifier attached to it
			// NOTE(review): the literal -1 is presumably the same value as infinite_reps (used below) - confirm.
			if c.endReps != -1 && c.endReps < c.startReps {
				return Reg{}, fmt.Errorf("numeric specifier - start greater than end")
			}
			poppedState, err := pop(&nfa)
			if err != nil {
				return Reg{}, fmt.Errorf("numeric specifier with no operand")
			}
			var stateToAdd *nfaState
			// Take advantage of the following facts:
			// 		a{5} == aaaaa
			// 		a{3,5} == aaaa?a?
			// 		a{5,} == aaaaa+
			// Each repetition uses a clone of the popped state, because concatenating the
			// same state (by address) to itself does not work.
			for i := 0; i < c.startReps; i++ { // Case 1
				stateToAdd = concatenate(stateToAdd, cloneState(poppedState))
			}
			if c.endReps == infinite_reps { // Case 3
				s2, err := kleene(*poppedState)
				if err != nil {
					return Reg{}, err
				}
				stateToAdd = concatenate(stateToAdd, s2)
			} else { // Case 2
				for i := c.startReps; i < c.endReps; i++ {
					stateToAdd = concatenate(stateToAdd, question(cloneState(poppedState)))
				}
			}
			if stateToAdd == nil {
				// Zero repetitions (eg. 'a{0}'): neither loop above ran, so nothing was built.
				// Push a zero-length match state rather than a nil pointer.
				tmp := zeroLengthMatchState()
				stateToAdd = &tmp
			}
			nfa = append(nfa, stateToAdd)
		}
	}

	// A well-formed postfix expression reduces to exactly one fragment.
	if len(nfa) != 1 {
		return Reg{}, fmt.Errorf("invalid regex")
	}
	verifyLastStates(nfa)
	return Reg{start: nfa[0], numGroups: numGroups}, nil
}
// Compile turns the regular expression re into a Reg that can be handed to the
// matching functions. Any flags supplied are forwarded to the parsing stage
// (shuntingYard); the resulting postfix nodes are then compiled into an NFA by
// thompson.
//
// If either stage fails, the zero Reg is returned together with a non-nil error
// wrapping the underlying cause. The error must therefore be checked before the
// returned Reg is used.
func Compile(re string, flags ...ReFlag) (Reg, error) {
	postfix, parseErr := shuntingYard(re, flags...)
	if parseErr != nil {
		return Reg{}, fmt.Errorf("error parsing regex: %w", parseErr)
	}

	compiled, compileErr := thompson(postfix)
	if compileErr != nil {
		return Reg{}, fmt.Errorf("error compiling regex: %w", compileErr)
	}

	return compiled, nil
}