package main

import (
	"fmt"
	"slices"
	"strconv"
	"unicode"
)

// Holds a list of all characters that are _not_ matched by the dot metacharacter.
// It is (re)assigned at the start of every call to shuntingYard, based on the flags.
var notDotChars []rune

// A Reg represents the result of compiling a regular expression. It contains
// the startState of the NFA representation of the regex, and the number of capturing
// groups in the regex.
type Reg struct {
	start     *State
	numGroups int
}

// CONCAT is the explicit concatenation operator that shuntingYard inserts between
// adjacent operands before converting the expression to postfix.
const CONCAT rune = '~'

// Flags for shuntingYard - control its behavior
type ReFlag int

const (
	RE_NO_FLAGS ReFlag = iota
	RE_CASE_INSENSITIVE
	RE_MULTILINE
)

// isOperator reports whether c is one of the regex operators handled by the
// operator stack in shuntingYard: '+', '?', '*', '|' or CONCAT.
func isOperator(c rune) bool {
	return c == '+' || c == '?' || c == '*' || c == '|' || c == CONCAT
}

/* priority returns the priority of the given operator. A larger value means the
operator binds more tightly. Runes that are not in the precedence table yield -1. */
func priority(op rune) int {
	precedence := []rune{'|', CONCAT, '+', '*', '?'}
	return slices.Index(precedence, op)
}

/*
The Shunting-Yard algorithm is used to convert the given infix (regular) expression to postfix.
The primary benefit of this is getting rid of parentheses.
It also inserts explicit concatenation operators to make parsing easier in Thompson's algorithm.
An error can be returned for a multitude of reasons - the reason is specified in the error string.
The function also takes in 0 or more flags, which control the behavior of the parser.

The function works in three passes:
 1. Rewrite numeric ranges, non-capturing group openers and escaped backslashes.
 2. Insert explicit CONCAT operators (leaving character classes, braces and lookarounds untouched).
 3. Run the actual shunting-yard conversion, producing a slice of postfixNode.

Malformed input is reported through the returned error rather than by panicking.

See: https://blog.cernera.me/converting-regular-expressions-to-postfix-notation-with-the-shunting-yard-algorithm/
*/
func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
	// Check which flags are enabled
	caseInsensitive := false
	// In Multiline mode, the newline character is considered a
	// 'dot' character ie. the dot metacharacter matches a newline as well.
	if slices.Contains(flags, RE_MULTILINE) {
		notDotChars = []rune{}
	} else {
		notDotChars = []rune{'\n'}
	}
	if slices.Contains(flags, RE_CASE_INSENSITIVE) {
		caseInsensitive = true
	}

	re_postfix := make([]rune, 0)
	// Convert the string to a slice of runes to allow iteration through it
	re_runes_orig := []rune(re) // This is the rune slice before the first parsing loop (which detects and replaces numeric ranges)
	re_runes := make([]rune, 0)

	// Check for numeric range. If we are at the start of a numeric range,
	// skip to end and construct the equivalent regex for the range.
	// The reason this is outside the loop below, is that it actually modifies
	// the given regex (we 'cut' the numeric range and 'paste' an equivalent regex).
	// It also makes the overall parsing easier, since I don't have to worry about the numeric range
	// anymore.
	// Eventually, I might be able to add it into the main parsing loop, to reduce the time
	// complexity.
	// A numeric range is written as '<', digits, a single '-', digits, '>'. It matches all numbers in this range.
	//
	// Also check for non-capturing groups. The LPAREN of a non-capturing group looks like this: '(?:'
	// I take this out, and put in a special character - NONCAPLPAREN_CHAR.
	//
	// Finally, check for escaped backslashes. Replace these with the BACKSLASH metacharacter. Later, in thompson(),
	// these will be converted back. This avoids confusion in detecting whether a character is escaped eg. detecting
	// whether '\\[a]' has an escaped opening bracket (it doesn't).
	for i := 0; i < len(re_runes_orig); i++ {
		c := re_runes_orig[i]
		if c == '<' && (i == 0 || (re_runes_orig[i-1] != '\\' && re_runes_orig[i-1] != '?')) {
			i++ // Step over opening angle bracket
			tmpStr := ""
			hyphenFound := false
			for i < len(re_runes_orig) && re_runes_orig[i] != '>' {
				if !unicode.IsDigit(re_runes_orig[i]) {
					if re_runes_orig[i] != '-' || (hyphenFound) {
						return nil, fmt.Errorf("Invalid numeric range.")
					}
				}
				if re_runes_orig[i] == '-' {
					hyphenFound = true
				}
				tmpStr += string(re_runes_orig[i])
				i++
			}
			// End of string reached and last character doesn't close the range
			if i == len(re_runes_orig) && re_runes_orig[len(re_runes_orig)-1] != '>' {
				return nil, fmt.Errorf("Numeric range not closed.")
			}
			if len(tmpStr) == 0 {
				return nil, fmt.Errorf("Empty numeric range.")
			}
			// Closing bracket will be skipped when the loop variable increments
			var rangeStart int
			var rangeEnd int
			// Both bounds must be present. Inputs such as "<5>" or "<3->" pass the
			// character checks above, but are not valid ranges.
			if n, err := fmt.Sscanf(tmpStr, "%d-%d", &rangeStart, &rangeEnd); n != 2 || err != nil {
				return nil, fmt.Errorf("Invalid numeric range.")
			}
			regex := range2regex(rangeStart, rangeEnd)
			re_runes = append(re_runes, []rune(regex)...)
		} else if c == '(' && i < len(re_runes_orig)-2 && re_runes_orig[i+1] == '?' && re_runes_orig[i+2] == ':' {
			re_runes = append(re_runes, NONCAPLPAREN_CHAR)
			i += 2
		} else if c == '\\' && i < len(re_runes_orig)-1 && re_runes_orig[i+1] == '\\' { // Escaped backslash
			re_runes = append(re_runes, ESC_BACKSLASH)
			i++
		} else {
			re_runes = append(re_runes, c)
		}
	}

	/* Add concatenation operators.
	Only add a concatenation operator between two characters if both the following conditions are met:
		1. The first character isn't an opening parenthesis or alternation operator (or an escape character)
			a. This makes sense, because these operators can't be _concatenated_ with anything else.
		2. The second character isn't a 'closing operator' - one that applies to something before it
			a. Again, these operators can't be concatenated _to_. They can, however, be concatenated _from_.
	Caveats:
		1. Don't mess with anything inside brackets - character class
		2. Don't mess with anything inside braces - numeric repetition
		3. Don't mess with any lookarounds.
	*/
	i := 0
	for i < len(re_runes) {
		re_postfix = append(re_postfix, re_runes[i])
		if re_runes[i] == '[' && (i == 0 || re_runes[i-1] != '\\') { // We do not touch things inside brackets, unless they are escaped. Inside this block, the only task is to expand character ranges into their constituent characters.
			re_postfix[len(re_postfix)-1] = LBRACKET // Replace the '[' character with LBRACKET. This allows for easier parsing of all characters (including opening and closing brackets) within the character class
			toAppend := make([]rune, 0)              // Holds all the runes in the current character class
			if i < len(re_runes)-1 && re_runes[i+1] == '^' { // Inverting class - match everything NOT in brackets
				re_postfix = append(re_postfix, '^')
				i++ // Skip opening bracket and caret
			}
			if i < len(re_runes)-1 && re_runes[i+1] == ']' { // Nothing inside brackets - error.
				return nil, fmt.Errorf("Empty character class.")
			}
			for re_runes[i] != ']' || i == 0 || re_runes[i-1] == '\\' {
				i++ // Skip all characters inside _unescaped_ brackets (we are _not_ at a closing bracket, or if we are, the previous character is a backslash)
				// Running off the end means the class was never closed. Report it
				// instead of indexing past the end of the slice.
				if i >= len(re_runes) {
					return nil, fmt.Errorf("Opening bracket without closing bracket.")
				}
				// TODO: Check for escaped characters

				// Check ahead for character range
				if i < len(re_runes)-2 && re_runes[i+1] == '-' {
					rangeStart := re_runes[i]
					rangeEnd := re_runes[i+2]
					if int(rangeEnd) < int(rangeStart) {
						return nil, fmt.Errorf("Range is out of order.")
					}
					for r := rangeStart; r <= rangeEnd; r++ {
						toAppend = append(toAppend, r)
					}
					i += 2 // Skip start and hyphen (end will automatically be skipped on next iteration of loop)
					continue
				}
				toAppend = append(toAppend, re_runes[i])
			}
			// Replace the last character (which should have been ']'), with RBRACKET
			toAppend[len(toAppend)-1] = RBRACKET
			re_postfix = append(re_postfix, toAppend...)
		}
		if i < len(re_runes) && re_runes[i] == '{' && (i > 0 && re_runes[i-1] != '\\') { // We don't touch things inside braces, either
			i++ // Skip opening brace
			for i < len(re_runes) && re_runes[i] != '}' {
				re_postfix = append(re_postfix, re_runes[i])
				i++
			}
			if i == len(re_runes) {
				return nil, fmt.Errorf("Invalid numeric specifier.")
			}
			re_postfix = append(re_postfix, re_runes[i]) // Append closing brace
		}
		if i < len(re_runes)-3 && string(re_runes[i+1:i+4]) == "(?:" { // Non-capturing lparen
			re_postfix = append(re_postfix, NONCAPLPAREN_CHAR)
			i += 3
		}
		if i < len(re_runes) && re_runes[i] == '\\' { // Something is being escaped (I don't add the backslash to re_postfix, because it was already added earlier)
			i++
			if i >= len(re_runes) {
				return nil, fmt.Errorf("Stray backslash in expression.")
			}
			if re_runes[i] == 'x' {
				re_postfix = append(re_postfix, re_runes[i])
				i++
				if i >= len(re_runes) {
					return nil, fmt.Errorf("Stray backslash in expression.")
				}
				if re_runes[i] == '{' {
					// The expanded form occupies 8 runes: '{', six hex digits, '}'.
					// Make sure they all exist before slicing.
					if i+8 > len(re_runes) {
						return nil, fmt.Errorf("Invalid hex value in expression.")
					}
					re_postfix = append(re_postfix, re_runes[i:i+8]...)
					i += 7 // Leave i on the closing brace; the increment at the bottom of the loop moves past it
				} else if isHex(re_runes[i]) {
					// Two hex digits are required.
					if i+1 >= len(re_runes) {
						return nil, fmt.Errorf("Not enough hex characters found in expression.")
					}
					re_postfix = append(re_postfix, re_runes[i:i+2]...)
					i++ // Leave i on the second hex digit, so that the rune following the escape is not skipped by the increment at the bottom of the loop
				} else {
					return nil, fmt.Errorf("Invalid hex value in expression.")
				}
			} else if isOctal(re_runes[i]) {
				numDigits := 1
				for i+numDigits < len(re_runes) && numDigits < 3 && isOctal(re_runes[i+numDigits]) { // Skip while we see an octal character (max of 3)
					numDigits++
				}
				re_postfix = append(re_postfix, re_runes[i:i+numDigits]...)
				i += (numDigits - 1) // I have to move back a step, so that I can add a concatenation operator if necessary, and so that the increment at the bottom of the loop works as intended
			} else {
				re_postfix = append(re_postfix, re_runes[i])
			}
		}
		if i < len(re_runes) && re_runes[i] == '(' && (i == 0 || re_runes[i-1] != '\\') && (i < len(re_runes)-2 && re_runes[i+1] == '?' && slices.Contains([]rune{'=', '!', '<'}, re_runes[i+2])) { // Unescaped open parentheses followed by question mark then '<', '!' or '=' => lookaround. Don't mess with it.
			i++ // Step inside
			if i == len(re_runes)-1 || (re_runes[i+1] != '=' && re_runes[i+1] != '!' && re_runes[i+1] != '<') {
				return nil, fmt.Errorf("Invalid regex. Lookaround intended?")
			}
			re_postfix = append(re_postfix, re_runes[i])
			i++
			numOpenParens := 1
			for numOpenParens != 0 {
				if i >= len(re_runes) {
					return nil, fmt.Errorf("Unclosed lookaround.")
				}
				if re_runes[i] == '(' || re_runes[i] == NONCAPLPAREN_CHAR {
					numOpenParens++
				}
				if re_runes[i] == ')' {
					numOpenParens--
					if numOpenParens == 0 {
						break
					}
				}
				re_postfix = append(re_postfix, re_runes[i])
				i++
			}
			continue
		}
		if i < len(re_runes) && (re_runes[i] != '(' && re_runes[i] != NONCAPLPAREN_CHAR && re_runes[i] != '|' && re_runes[i] != '\\') || (i > 0 && re_runes[i-1] == '\\') { // Every character should be concatenated if it is escaped
			if i < len(re_runes)-1 {
				if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != '+' && re_runes[i+1] != '?' && re_runes[i+1] != ')' && re_runes[i+1] != '{' {
					re_postfix = append(re_postfix, CONCAT)
				}
			}
		}
		i++
	}

	opStack := make([]rune, 0)         // Operator stack
	outQueue := make([]postfixNode, 0) // Output queue

	// Actual algorithm
	numOpenParens := 0 // Number of open parentheses
	for i := 0; i < len(re_postfix); i++ {
		/* Cases:
		1. Current character is alphanumeric - send to output queue
		2. Current character is operator - do the following:
			a. If current character has greater priority than top of opStack, push to opStack.
			b. If not, keep popping from opStack (and appending to outQueue) until:
				i. opStack is empty, OR
				ii. current character has greater priority than top of opStack
		3. If current character is '(' or NONCAPLPAREN_CHAR, push to opStack
		4. If current character is ')', pop from opStack (and append to outQueue) until '(' is found. Discard parentheses.
		5. If current character is '[', find all the characters until ']', then create a postfixNode containing all these contents. Add this node to outQueue.
		6. If current character is '{', find the appropriate numeric specifier (range start, range end). Apply the range to the postfixNode at the end of outQueue.
		*/
		c := re_postfix[i]
		if isNormalChar(c) {
			if caseInsensitive {
				outQueue = append(outQueue, newPostfixNode(allCases(c)...))
			} else {
				outQueue = append(outQueue, newPostfixNode(c))
			}
			continue
		}
		// Escape character
		if c == '\\' { // Escape character - invert special and non-special characters eg. \( is treated as a literal parentheses, \b is treated as word boundary
			if i == len(re_postfix)-1 { // End of string - error, because backslash is an escape character (something needs to come after it)
				return nil, fmt.Errorf("ERROR: Backslash with no escape character.")
			}
			i++
			if re_postfix[i] == 'x' { // Hex value
				i++
				// The bounds check comes first, so that re_postfix[i] is never read out of range.
				if i < len(re_postfix)-6 && re_postfix[i] == '{' { // Expanded hex code
					var hexVal int
					n, err := fmt.Sscanf(string(re_postfix[i:]), "{%x}", &hexVal)
					if n < 1 || err != nil {
						return nil, fmt.Errorf("Error parsing expanded hex code in expression.")
					}
					outQueue = append(outQueue, newPostfixCharNode(rune(hexVal)))
					i += 7
				} else if i < len(re_postfix)-1 { // Two-digit hex code
					hexVal, err := strconv.ParseInt(string([]rune{re_postfix[i], re_postfix[i+1]}), 16, 64) // Convert the two hex values into a rune slice, then to a string. Parse the string into an int with strconv.ParseInt()
					if err != nil {
						return nil, fmt.Errorf("Error parsing hex characters in expression.")
					}
					i++ // Loop increment will take care of going forward
					outQueue = append(outQueue, newPostfixCharNode(rune(hexVal)))
				} else {
					return nil, fmt.Errorf("Not enough hex characters found in expression.")
				}
			} else if isOctal(re_postfix[i]) { // Octal value
				var octValStr string
				numDigitsParsed := 0
				// At most three octal digits form one escape (the largest accepted value is 0777).
				for (i+numDigitsParsed) < len(re_postfix) && isOctal(re_postfix[i+numDigitsParsed]) && numDigitsParsed < 3 {
					octValStr += string(re_postfix[i+numDigitsParsed])
					numDigitsParsed++
				}
				octVal, err := strconv.ParseInt(octValStr, 8, 32)
				if err != nil {
					return nil, fmt.Errorf("Error parsing octal value in expression.")
				}
				if octVal > 0777 {
					return nil, fmt.Errorf("Invalid octal value in expression.")
				}
				i += numDigitsParsed - 1 // Shift forward by the number of digits that were parsed. Move back one character, because the loop increment will move us to the next character automatically
				outQueue = append(outQueue, newPostfixCharNode(rune(octVal)))
			} else {
				escapedNode, err := newEscapedNode(re_postfix[i], false)
				if err != nil {
					return nil, fmt.Errorf("Invalid escape character in expression.")
				}
				outQueue = append(outQueue, escapedNode)
			}
			continue // Escaped character will automatically be skipped when loop variable increments
		}

		if c == '.' { // Dot metacharacter
			outQueue = append(outQueue, newPostfixDotNode())
			continue
		}
		if c == '^' { // Start-of-string assertion
			outQueue = append(outQueue, newPostfixNode(c))
		}
		if c == '$' { // End-of-string assertion
			outQueue = append(outQueue, newPostfixNode(c))
		}
		// Check if we're at the start of a lookaround
		if c == '(' && i < len(re_postfix)-1 && re_postfix[i+1] == '?' {
			i += 2      // Skip opening paren and question mark
			regex := "" // Stores lookaround regex
			numOpenParens := 1
			for numOpenParens != 0 {
				if i >= len(re_postfix) {
					return nil, fmt.Errorf("Unclosed lookaround.")
				}
				if re_postfix[i] == '(' || re_postfix[i] == NONCAPLPAREN_CHAR {
					numOpenParens++
				}
				if re_postfix[i] == ')' {
					numOpenParens--
					if numOpenParens == 0 {
						break
					}
				}
				regex += string(re_postfix[i])
				i++
			}
			if len(regex) <= 1 { // Nothing in regex - error
				return nil, fmt.Errorf("Invalid lookaround. (too short?)")
			}
			// 'regex' should now contain the lookaround regex, plus the characters at the start (which indicate pos/neg, ahead/behind)
			// Now we should filter that out.
			toAppend := postfixNode{nodetype: ASSERTION, startReps: 1, endReps: 1}
			if regex[0] == '<' { // Lookbehind
				toAppend.lookaroundDir = LOOKBEHIND
				regex = regex[1:]
			} else if regex[0] == '=' || regex[0] == '!' {
				toAppend.lookaroundDir = LOOKAHEAD
			} else {
				return nil, fmt.Errorf("Invalid lookaround.")
			}
			// Positive or negative
			if regex[0] == '=' { // Positive
				toAppend.lookaroundSign = POSITIVE
				toAppend.contents = []rune(regex[1:])
			} else if regex[0] == '!' { // Negative
				toAppend.lookaroundSign = NEGATIVE
				toAppend.contents = []rune(regex[1:])
			} else {
				return nil, fmt.Errorf("Invalid lookaround.")
			}
			outQueue = append(outQueue, toAppend)
			continue
		}
		if isOperator(c) {
			if len(opStack) == 0 {
				opStack = append(opStack, c)
			} else {
				topStack, err := peek(opStack)
				if err != nil {
					return nil, fmt.Errorf("Operator without operand.")
				}
				if priority(c) > priority(topStack) { // 2a
					opStack = append(opStack, c)
				} else {
					for priority(c) <= priority(topStack) { // 2b
						to_append := mustPop(&opStack)
						outQueue = append(outQueue, newPostfixNode(to_append))
						topStack, _ = peek(opStack)
					}
					opStack = append(opStack, c)
				}
			}
		}
		if c == LBRACKET { // Used for character classes
			i++ // Step forward so we can look at the character class
			var invertMatch bool
			if re_postfix[i] == '^' {
				invertMatch = true
				i++
			}
			chars := make([]postfixNode, 0) // List of nodes - used only for character classes
			for i < len(re_postfix) {
				if re_postfix[i] == RBRACKET {
					break
				}
				if re_postfix[i] == '\\' { // Backslash indicates a character to be escaped
					if i == len(re_postfix)-1 {
						return nil, fmt.Errorf("Stray backslash in character class.")
					}
					i++ // Step past backslash

					if re_postfix[i] == 'x' { // Hex value
						i++
						// The bounds check comes first, so that re_postfix[i] is never read out of range.
						if i < len(re_postfix)-7 && re_postfix[i] == '{' { // Expanded hex code
							var hexVal int
							n, err := fmt.Sscanf(string(re_postfix[i:]), "{%x}", &hexVal)
							if n < 1 || err != nil {
								return nil, fmt.Errorf("Error parsing expanded hex code in character class.")
							}
							chars = append(chars, newPostfixCharNode(rune(hexVal)))
							i += 8
						} else if i < len(re_postfix)-2 { // Two-digit hex code
							hexVal, err := strconv.ParseInt(string([]rune{re_postfix[i], re_postfix[i+1]}), 16, 64) // Convert the two hex values into a rune slice, then to a string. Parse the string into an int with strconv.ParseInt()
							if err != nil {
								return nil, fmt.Errorf("Error parsing hex characters in character class.")
							}
							i += 2
							chars = append(chars, newPostfixCharNode(rune(hexVal)))
						} else {
							return nil, fmt.Errorf("Not enough hex characters found in character class.")
						}
					} else if isOctal(re_postfix[i]) { // Octal value
						var octValStr string
						numDigitsParsed := 0
						// At most three octal digits form one escape (the largest accepted value is 0777).
						for (i+numDigitsParsed) < len(re_postfix)-1 && isOctal(re_postfix[i+numDigitsParsed]) && numDigitsParsed < 3 { // The '-1' exists, because even in the worst case (the character class extends till the end), the last character must be a closing bracket (and nothing else)
							octValStr += string(re_postfix[i+numDigitsParsed])
							numDigitsParsed++
						}
						octVal, err := strconv.ParseInt(octValStr, 8, 32)
						if err != nil {
							return nil, fmt.Errorf("Error parsing octal value in character class.")
						}
						if octVal > 0777 {
							return nil, fmt.Errorf("Invalid octal value in character class.")
						}
						i += numDigitsParsed // Shift forward by the number of characters parsed
						chars = append(chars, newPostfixCharNode(rune(octVal)))
					} else {
						escapedNode, err := newEscapedNode(re_postfix[i], true)
						if err != nil {
							return nil, fmt.Errorf("Invalid escape character in character class.")
						}
						chars = append(chars, escapedNode)
						i++
					}
				} else {
					chars = append(chars, newPostfixCharNode(re_postfix[i]))
					i++
				}
			}
			if i == len(re_postfix) { // We have reached the end of the string, so we didn't encounter a closing bracket. Error.
				return nil, fmt.Errorf("Opening bracket without closing bracket.")
			}
			outQueue = append(outQueue, newCharClassNode(chars, invertMatch))
			continue
		}
		if c == '{' {
			i++ // Skip opening brace
			// Three possibilities:
			// 1. Single number - {5}
			// 2. Range - {3,5}
			// 3. Start with no end, {3,}
			startRange := make([]rune, 0)
			endRange := make([]rune, 0)
			endRangeNum := 0
			for i < len(re_postfix) && unicode.IsDigit(re_postfix[i]) {
				startRange = append(startRange, re_postfix[i])
				i++
			}
			if len(startRange) == 0 { // {} is not valid, neither is {,5}
				return nil, fmt.Errorf("Invalid numeric specifier.")
			}
			if i == len(re_postfix) {
				return nil, fmt.Errorf("Brace not closed.")
			}

			// Atoi can fail on input that passed unicode.IsDigit (eg. a value that
			// overflows int). Report that as a parse error instead of panicking.
			startRangeNum, err := strconv.Atoi(string(startRange))
			if err != nil {
				return nil, fmt.Errorf("Invalid numeric specifier.")
			}

			if re_postfix[i] == '}' { // Case 1 above
				endRangeNum = startRangeNum
			} else {
				if re_postfix[i] != ',' {
					return nil, fmt.Errorf("Invalid numeric specifier.")
				}
				i++ // Skip comma
				for i < len(re_postfix) && unicode.IsDigit(re_postfix[i]) {
					endRange = append(endRange, re_postfix[i])
					i++
				}
				if i == len(re_postfix) {
					return nil, fmt.Errorf("Brace not closed.")
				}
				if re_postfix[i] != '}' {
					return nil, fmt.Errorf("Invalid numeric specifier.")
				}
				if len(endRange) == 0 { // Case 3 above
					endRangeNum = INFINITE_REPS
				} else { // Case 2 above
					endRangeNum, err = strconv.Atoi(string(endRange))
					if err != nil {
						return nil, fmt.Errorf("Invalid numeric specifier.")
					}
				}
			}

			idx := len(outQueue) - 1 // Get the last added node
			if idx < 0 || outQueue[idx].nodetype == LPAREN {
				return nil, fmt.Errorf("Numeric specifier with no content.")
			}
			outQueue[idx].startReps = startRangeNum
			outQueue[idx].endReps = endRangeNum
		}
		if c == '(' || c == NONCAPLPAREN_CHAR {
			opStack = append(opStack, c)
			if c == '(' { // We only push _capturing_ group parentheses to outQueue
				outQueue = append(outQueue, newPostfixNode(c))
			}
			numOpenParens++
		}
		if c == ')' {
			// Keep popping from opStack until we encounter an opening parenthesis or a NONCAPLPAREN_CHAR. Error if we reach the end of the stack.
			var val rune
			var err error
			for val, err = peek(opStack); val != '(' && val != NONCAPLPAREN_CHAR; val, err = peek(opStack) {
				if err != nil {
					return nil, fmt.Errorf("Imbalanced parantheses.")
				}
				to_append := mustPop(&opStack)
				outQueue = append(outQueue, newPostfixNode(to_append))
			}
			_ = mustPop(&opStack) // Get rid of opening parentheses
			if val == '(' {       // Whatever was inside the parentheses was a _capturing_ group, so we append the closing parentheses as well
				outQueue = append(outQueue, newPostfixNode(')')) // Add closing parentheses
			}
			numOpenParens--
		}
	}

	// Pop all remaining operators (and append to outQueue)
	for len(opStack) > 0 {
		to_append := mustPop(&opStack)
		outQueue = append(outQueue, newPostfixNode(to_append))
	}

	if numOpenParens != 0 {
		return nil, fmt.Errorf("Imbalanced parantheses.")
	}

	return outQueue, nil
}

// Thompson's algorithm. Constructs Finite-State Automaton from given string.
// Returns start state and number of groups in regex.
//
// The input is the postfix node list produced by shuntingYard. Operands are pushed
// onto a stack of states, and operator nodes pop their operands and push the combined
// state. An error is returned if a numeric specifier has its start greater than its
// end, if a lookaround fails to compile, or if the stack does not reduce to exactly one state.
func thompson(re []postfixNode) (Reg, error) {
	nfa := make([]*State, 0) // Stack of states
	numGroups := 0           // Number of capturing groups
	for _, c := range re {
		if c.nodetype == CHARACTER || c.nodetype == ASSERTION {
			state := State{}
			state.transitions = make(map[int][]*State)
			if c.allChars {
				state.allChars = true
				if len(c.except) != 0 {
					// For each node that I am 'excepting' (eg. in an inverted character class):
					// - If the node itself has exceptions, then the exceptions cancel out.
					//		Eg. [^\w] == [\W]
					// - Since an allChars node is the only kind that _can_ have exceptions, that's what I check for.
					// - If the node doesn't have exceptions (allChars == false) then the contents of the node are added to the except list.
					for _, node := range c.except {
						if node.allChars {
							state.allChars = false
							// For each postfixNode in node.except, extract the contents of the postfixNode. Concatenate them all,
							// and assign them to the state's _content_. As mentioned above, if the exception has exceptions, then we can match
							// those.
							nodeExceptChars := slices.Concat(Map(node.except, func(node postfixNode) []rune {
								return node.contents
							})...)
							state.content = rune2Contents(nodeExceptChars)
						} else {
							state.except = append(state.except, node.contents...)
						}
					}
				}
			}
			// Convert the current contents to []int, convert the result of rune2contents to []int, append then
			// convert back to stateContents.
			state.content = stateContents(append([]int(state.content), []int(rune2Contents(c.contents))...))
			state.output = make([]*State, 0)
			state.output = append(state.output, &state)
			state.isEmpty = false
			if c.nodetype == ASSERTION {
				state.isEmpty = true                 // This is a little weird. A lookaround has the 'isEmpty' flag set, even though it _isn't_ empty (the contents are the regex). But, there's so much error-checking that relies on this flag that it's better to keep it this way.
				state.content = newContents(EPSILON) // Ideally, an assertion shouldn't have any content, since it doesn't say anything about the content of string
				if c.lookaroundDir == 0 || c.lookaroundSign == 0 {
					switch c.contents[0] {
					case '^':
						state.assert = SOS
					case '$':
						state.assert = EOS
					case 'b':
						state.assert = WBOUND
					case 'B':
						state.assert = NONWBOUND
					}
				} else { // Lookaround
					state.lookaroundRegex = string(c.contents)
					if c.lookaroundDir == LOOKAHEAD {
						if c.lookaroundSign == POSITIVE {
							state.assert = PLA
						}
						if c.lookaroundSign == NEGATIVE {
							state.assert = NLA
						}
					}
					if c.lookaroundDir == LOOKBEHIND {
						if c.lookaroundSign == POSITIVE {
							state.assert = PLB
						}
						if c.lookaroundSign == NEGATIVE {
							state.assert = NLB
						}
					}
					// The lookaround body is itself a regex: parse and compile it recursively.
					tmpRe, err := shuntingYard(state.lookaroundRegex)
					if err != nil {
						return Reg{}, fmt.Errorf("Error parsing lookaround: %w", err)
					}
					reg, err := thompson(tmpRe)
					if err != nil {
						return Reg{}, fmt.Errorf("Error compiling lookaround: %w", err)
					}
					state.lookaroundNFA = reg.start
					state.lookaroundNumCaptureGroups = reg.numGroups
				}
			}
			// Replace ESC_BACKSLASH with actual backslash, so that we can actually check if we encounter it
			replaceByValue([]int(state.content), int(ESC_BACKSLASH), '\\')
			replaceByValue(state.except, ESC_BACKSLASH, '\\')

			nfa = append(nfa, &state)
		}
		if c.nodetype == LPAREN || c.nodetype == RPAREN {
			s := &State{}
			s.assert = NONE
			s.content = newContents(EPSILON)
			s.isEmpty = true
			s.output = make([]*State, 0)
			s.output = append(s.output, s)
			s.transitions = make(map[int][]*State)
			// LPAREN nodes are just added normally
			if c.nodetype == LPAREN {
				numGroups++
				s.groupBegin = true
				s.groupNum = numGroups
				nfa = append(nfa, s)
				continue
			}
			// For RPAREN nodes, I assume that the last two nodes in the list are an LPAREN,
			// and then some other node.
			// These three nodes (LPAREN, the middle node and RPAREN) are extracted together, concatenated
			// and added back in.
			if c.nodetype == RPAREN {
				s.groupEnd = true
				middleNode := mustPop(&nfa)
				lparenNode := mustPop(&nfa)
				s.groupNum = lparenNode.groupNum
				tmp := concatenate(lparenNode, middleNode)
				to_add := concatenate(tmp, s)
				nfa = append(nfa, to_add)
			}
		}
		if c.nodetype == CHARCLASS { // A Character class consists of all the nodes in it, alternated
			// Map the list of nodes to a list of states, each state containing the contents of a specific node
			states := Map(c.nodeContents, func(node postfixNode) *State {
				s := newState()
				s.content = rune2Contents(node.contents)
				return &s
			})
			// Reduce the list of states down to a single state by alternating them
			toAdd := Reduce(states, func(s1 *State, s2 *State) *State {
				return alternate(s1, s2)
			})
			nfa = append(nfa, toAdd)
		}
		// Must be an operator if it isn't a character
		switch c.nodetype {
		case CONCATENATE:
			s2 := mustPop(&nfa)
			s1 := mustPop(&nfa)
			s1 = concatenate(s1, s2)
			nfa = append(nfa, s1)
		case KLEENE: // Create a 0-state, concat the popped state after it, concat the 0-state after the popped state
			s1 := mustPop(&nfa)
			stateToAdd := kleene(*s1)
			nfa = append(nfa, stateToAdd)
		case PLUS: // a+ is equivalent to aa*
			s1 := mustPop(&nfa)
			s2 := kleene(*s1)
			s1 = concatenate(s1, s2)
			nfa = append(nfa, s1)
		case QUESTION: // ab? is equivalent to a(b|)
			s1 := mustPop(&nfa)
			s2 := question(s1)
			nfa = append(nfa, s2)
		case PIPE:
			s1 := mustPop(&nfa)
			s2 := mustPop(&nfa)
			s3 := alternate(s1, s2)
			nfa = append(nfa, s3)
		}
		if c.startReps != 1 || c.endReps != 1 { // Must have a numeric specifier attached to it
			if c.endReps != -1 && c.endReps < c.startReps {
				return Reg{}, fmt.Errorf("Numeric specifier - start greater than end.")
			}
			state := mustPop(&nfa)
			var stateToAdd *State = nil
			// Take advantage of the following facts:
			// a{5} == aaaaa
			// a{3,5} == aaaa?a?
			// a{5,} == aaaaa+
			// Nov. 3 2024 - I have two choices on how I want to implement numeric
			// specifiers.
			// a. Encode the logic while creating the states. I will have to create a function
			// that creates a deep-copy of a given state / NFA, so that I can concatenate them to
			// each other (concatenating them with the 'concatenate' method - which takes addresses - does
			// not work). Creating this function might be a lot of work.
			// b. Encode the logic while parsing the string (shunting-yard). If I can expand the numeric specifier
			// at this point, I can leave thompson untouched.
			for i := 0; i < c.startReps; i++ { // Case 1
				stateToAdd = concatenate(stateToAdd, cloneState(state))
			}
			if c.endReps == INFINITE_REPS { // Case 3
				s2 := kleene(*state)
				stateToAdd = concatenate(stateToAdd, s2)
			} else { // Case 2
				for i := c.startReps; i < c.endReps; i++ {
					stateToAdd = concatenate(stateToAdd, question(cloneState(state)))
				}
			}
			nfa = append(nfa, stateToAdd)
		}
	}
	if len(nfa) != 1 {
		return Reg{}, fmt.Errorf("Invalid Regex.")
	}

	verifyLastStates(nfa)

	return Reg{nfa[0], numGroups}, nil
}

// Compiles the given regular expression into a Reg type, suitable for use with the
// matching functions. The second return value is non-nil if a compilation error has
// occurred. As such, the error value must be checked before using the Reg returned by this function.
// The second parameter is an optional list of flags, passed to the parsing function shuntingYard.
func Compile(re string, flags ...ReFlag) (Reg, error) { nodes, err := shuntingYard(re, flags...) if err != nil { return Reg{}, fmt.Errorf("Error parsing regex: %w", err) } reg, err := thompson(nodes) if err != nil { return Reg{}, fmt.Errorf("Error compiling regex: %w", err) } return reg, nil }