You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

422 lines
14 KiB
Go

2 months ago
package main
import (
"bufio"
2 months ago
"fmt"
"io"
"os"
2 months ago
"slices"
"strconv"
"unicode"
"github.com/fatih/color"
2 months ago
)
// CONCAT is the rune used as the explicit concatenation operator. It is
// inserted between adjacent operands before the expression is converted to
// postfix, and is recognised as an operator alongside '+', '?', '*' and '|'.
// NOTE(review): a literal '~' in the user's pattern looks indistinguishable
// from this marker - confirm how such input is meant to be handled.
const CONCAT rune = '~'
func isOperator(c rune) bool {
if c == '+' || c == '?' || c == '*' || c == '|' || c == CONCAT {
2 months ago
return true
}
return false
}
/* priority returns the priority of the given operator */
func priority(op rune) int {
precedence := []rune{'|', CONCAT, '+', '*', '?'}
2 months ago
return slices.Index(precedence, op)
}
/*
2 months ago
The Shunting-Yard algorithm is used to convert the given infix (regeular) expression to postfix.
The primary benefit of this is getting rid of parentheses.
It also inserts explicit concatenation operators to make parsing easier in Thompson's algorithm.
2 months ago
See: https://blog.cernera.me/converting-regular-expressions-to-postfix-notation-with-the-shunting-yard-algorithm/
*/
func shuntingYard(re string) []postfixNode {
2 months ago
re_postfix := make([]rune, 0)
2 months ago
re_runes := []rune(re) // Convert the string to a slice of runes to allow iteration through it
/* Add concatenation operators.
Only add a concatenation operator between two characters if both the following conditions are met:
1. The first character isn't an opening parantheses or alteration operator (or an escape character)
2 months ago
a. This makes sense, because these operators can't be _concatenated_ with anything else.
2. The second character isn't a 'closing operator' - one that applies to something before it
a. Again, these operators can'be concatenated _to_. They can, however, be concatenated _from_.
*/
i := 0
for i < len(re_runes) {
2 months ago
re_postfix = append(re_postfix, re_runes[i])
if re_runes[i] == '[' && (i == 0 || re_runes[i-1] != '\\') { // We do not touch things inside brackets, unless they are escaped
re_postfix[len(re_postfix)-1] = LBRACKET // Replace the '[' character with LBRACKET. This allows for easier parsing of all characters (including opening and closing brackets) within the character class
invertMatch := false
toAppend := make([]rune, 0) // Holds all the runes in the current character class
if i < len(re_runes)-1 && re_runes[i+1] == '^' { // Inverting class - match everything NOT in brackets
invertMatch = true
i++
}
if i < len(re_runes)-1 && re_runes[i+1] == ']' { // Nothing inside brackets - panic.
panic("Empty character class.")
}
for re_runes[i] != ']' {
i++ // Skip all characters inside brackets
// TODO: Check for escaped characters
// Check ahead for character range
if i < len(re_runes)-2 && re_runes[i+1] == '-' {
rangeStart := re_runes[i]
rangeEnd := re_runes[i+2]
if int(rangeEnd) < int(rangeStart) {
panic("Range is out of order.")
}
for i := rangeStart; i <= rangeEnd; i++ {
toAppend = append(toAppend, i)
}
i += 2 // Skip start and hyphen (end will automatically be skipped on next iteration of loop)
continue
}
toAppend = append(toAppend, re_runes[i])
}
// Replace the last character (which should have been ']', with RBRACKET
toAppend[len(toAppend)-1] = RBRACKET
if invertMatch {
toAppend = setDifference(dotChars(), toAppend) // Take the inverse of the set by getting the difference between it and all dot characters
toAppend = append(toAppend, RBRACKET) // Since RBRACKET doesn't exist in dotChars, it wouldn't have been return in setDifference. We manually append it here.
}
re_postfix = append(re_postfix, toAppend...)
}
if re_runes[i] == '{' && (i > 0 && re_runes[i-1] != '\\') { // We don't touch things inside braces, either
i++ // Skip opening brace
for i < len(re_runes) && re_runes[i] != '}' {
re_postfix = append(re_postfix, re_runes[i])
i++
}
if i == len(re_runes) {
panic("Invalid numeric specifier.")
}
re_postfix = append(re_postfix, re_runes[i]) // Append closing brace
}
if (re_runes[i] != '(' && re_runes[i] != '|' && re_runes[i] != '\\') || (i > 0 && re_runes[i-1] == '\\') { // Every character should be concatenated if it is escaped
2 months ago
if i < len(re_runes)-1 {
if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != '+' && re_runes[i+1] != '?' && re_runes[i+1] != ')' && re_runes[i+1] != '{' {
2 months ago
re_postfix = append(re_postfix, CONCAT)
}
}
}
i++
2 months ago
}
opStack := make([]rune, 0) // Operator stack
outQueue := make([]postfixNode, 0) // Output queue
2 months ago
// Actual algorithm
for i := 0; i < len(re_postfix); i++ {
2 months ago
/* Two cases:
1. Current character is alphanumeric - send to output queue
2. Current character is operator - do the following:
a. If current character has greater priority than top of opStack, push to opStack.
b. If not, keep popping from opStack (and appending to outQueue) until:
i. opStack is empty, OR
ii. current character has greater priority than top of opStack
3. If current character is '(', push to opStack
4. If current character is ')', pop from opStack (and append to outQueue) until '(' is found. Discard parantheses.
5. If current character is '[', find all the characters until ']', then create a postfixNode containing all these contents. Add this node to outQueue.
6. If current character is '{', find the appropriate numeric specifier (range start, range end). Apply the range to the postfixNode at the end of outQueue.
2 months ago
*/
c := re_postfix[i]
2 months ago
if isAlphaNum(c) {
outQueue = append(outQueue, newPostfixNode(c))
continue
2 months ago
}
// Escape character
if c == '\\' { // Escape character - invert special and non-special characters eg. \( is treated as a literal parentheses, \b is treated as word boundary
if i == len(re_postfix)-1 { // End of string - panic, because backslash is an escape character (something needs to come after it)
panic("ERROR: Backslash with no escape character.")
}
i++
outQueue = append(outQueue, newEscapedNode(re_postfix[i]))
continue // Escaped character will automatically be skipped when loop variable increments
}
if c == '.' { // Dot metacharacter - represents 'any' character, but I am only adding Unicode 0020-007E
outQueue = append(outQueue, newPostfixNode(dotChars()...))
continue
}
if c == '^' { // Start-of-string assertion
outQueue = append(outQueue, newPostfixNode(c))
}
if c == '$' { // End-of-string assertion
outQueue = append(outQueue, newPostfixNode(c))
}
2 months ago
if isOperator(c) {
if len(opStack) == 0 {
opStack = append(opStack, c)
} else {
topStack, err := peek(opStack)
if err != nil {
panic("ERROR: Operator without operand.")
}
if priority(c) > priority(topStack) { // 2a
2 months ago
opStack = append(opStack, c)
} else {
for priority(c) <= priority(topStack) { // 2b
to_append := mustPop(&opStack)
outQueue = append(outQueue, newPostfixNode(to_append))
topStack, _ = peek(opStack)
2 months ago
}
opStack = append(opStack, c)
}
}
}
if c == LBRACKET { // Used for character classes
i++ // Step forward so we can look at the character class
chars := make([]rune, 0) // List of characters - used only for character classes
for i < len(re_postfix) {
if re_postfix[i] == RBRACKET {
break
}
chars = append(chars, re_postfix[i])
i++
}
if i == len(re_postfix) { // We have reached the end of the string, so we didn't encounter a closing brakcet. Panic.
panic("ERROR: Opening bracket without closing bracket.")
}
outQueue = append(outQueue, newPostfixNode(chars...))
// i++ // Step forward to skip closing bracket
continue
}
if c == '{' {
i++ // Skip opening brace
// Three possibilities:
// 1. Single number - {5}
// 2. Range - {3,5}
// 3. Start with no end, {3,}
startRange := make([]rune, 0)
startRangeNum := 0
endRange := make([]rune, 0)
endRangeNum := 0
for i < len(re_postfix) && unicode.IsDigit(re_postfix[i]) {
startRange = append(startRange, re_postfix[i])
i++
}
if len(startRange) == 0 { // {} is not valid, neither is {,5}
panic("ERROR: Invalid numeric specifier.")
}
if i == len(re_postfix) {
panic("ERROR: Brace not closed.")
}
startRangeNum, err := strconv.Atoi(string(startRange))
if err != nil {
panic(err)
}
if re_postfix[i] == '}' { // Case 1 above
endRangeNum = startRangeNum
} else {
if re_postfix[i] != ',' {
panic("ERROR: Invalid numeric specifier.")
}
i++ // Skip comma
for i < len(re_postfix) && unicode.IsDigit(re_postfix[i]) {
endRange = append(endRange, re_postfix[i])
i++
}
if i == len(re_postfix) {
panic("ERROR: Brace not closed.")
}
if re_postfix[i] != '}' {
panic("ERROR: Invalid numeric specifier.")
}
if len(endRange) == 0 { // Case 3 above
endRangeNum = INFINITE_REPS
} else { // Case 2 above
var err error
endRangeNum, err = strconv.Atoi(string(endRange))
if err != nil {
panic(err)
}
}
}
node, err := pop(&outQueue)
if err != nil {
panic("Numeric specifier with no content.")
}
node.startReps = startRangeNum
node.endReps = endRangeNum
outQueue = append(outQueue, node)
}
2 months ago
if c == '(' {
opStack = append(opStack, c)
}
if c == ')' {
// Keep popping from opStack until we encounter an opening parantheses. Panic if we reach the end of the stack.
for val, err := peek(opStack); val != '('; val, err = peek(opStack) {
if err != nil {
panic("ERROR: Imbalanced parantheses.")
}
to_append := mustPop(&opStack)
outQueue = append(outQueue, newPostfixNode(to_append))
2 months ago
}
_ = mustPop(&opStack) // Get rid of opening parantheses
2 months ago
}
}
// Pop all remaining operators (and append to outQueue)
for len(opStack) > 0 {
to_append := mustPop(&opStack)
outQueue = append(outQueue, newPostfixNode(to_append))
2 months ago
}
return outQueue
2 months ago
}
// Thompson's algorithm. Constructs Finite-State Automaton from given string.
// Returns start state.
func thompson(re []postfixNode) *State {
nfa := make([]*State, 0) // Stack of states
2 months ago
for _, c := range re {
if c.nodetype == CHARACTER || c.nodetype == ASSERTION {
2 months ago
state := State{}
state.transitions = make(map[int][]*State)
state.content = rune2Contents(c.contents)
2 months ago
state.output = make([]*State, 0)
state.output = append(state.output, &state)
state.isEmpty = false
if c.nodetype == ASSERTION {
state.content = newContents(EPSILON) // Ideally, an assertion shouldn't have any content, since it doesn't say anything about the content of string
state.isEmpty = true
switch c.contents[0] {
case '^':
state.assert = SOS
case '$':
state.assert = EOS
case 'b':
state.assert = WBOUND
case 'B':
state.assert = NONWBOUND
}
}
nfa = append(nfa, &state)
2 months ago
}
// Must be an operator if it isn't a character
switch c.nodetype {
case CONCATENATE:
s2 := mustPop(&nfa)
s1 := mustPop(&nfa)
s1 = concatenate(s1, s2)
2 months ago
nfa = append(nfa, s1)
case KLEENE: // Create a 0-state, concat the popped state after it, concat the 0-state after the popped state
s1 := mustPop(&nfa)
stateToAdd := kleene(*s1)
nfa = append(nfa, stateToAdd)
case PLUS: // a+ is equivalent to aa*
s1 := mustPop(&nfa)
s2 := kleene(*s1)
s1 = concatenate(s1, s2)
nfa = append(nfa, s1)
case QUESTION: // ab? is equivalent to a(b|)
s1 := mustPop(&nfa)
s2 := question(s1)
nfa = append(nfa, s2)
case PIPE:
s1 := mustPop(&nfa)
s2 := mustPop(&nfa)
s3 := alternate(s1, s2)
nfa = append(nfa, s3)
2 months ago
}
if c.startReps != 1 || c.endReps != 1 { // Must have a numeric specifier attached to it
if c.endReps != -1 && c.endReps < c.startReps {
panic("ERROR: Numeric specifier - start greater than end.")
}
state := mustPop(&nfa)
var stateToAdd *State = nil
// Take advantage of the following facts:
// a{5} == aaaaa
// a{3,5} == aaaa?a?
// a{5,} == aaaaa+
// Nov. 3 2024 - I have two choices on how I want to implement numeric
// specifiers.
// a. Encode the logic while creating the states. I will have to create a function
// that creates a deep-copy of a given state / NFA, so that I can concatenate them to
// each other (concatenating them with the 'concatenate' method - which takes addresses - does
// not work). Creating this function might be a lot of work.
// b. Encode the logic while parsing the string (shunting-yard). If I can expand the numeric specifier
// at this point, I can leave thompson untouched.
for i := 0; i < c.startReps; i++ { // Case 1
stateToAdd = concatenate(stateToAdd, cloneState(state))
}
if c.endReps == INFINITE_REPS { // Case 3
s2 := kleene(*state)
stateToAdd = concatenate(stateToAdd, s2)
} else { // Case 2
for i := c.startReps; i < c.endReps; i++ {
stateToAdd = concatenate(stateToAdd, question(state))
}
}
nfa = append(nfa, stateToAdd)
}
2 months ago
}
if len(nfa) != 1 {
panic("ERROR: Invalid Regex.")
}
verifyLastStates(nfa)
return nfa[0]
}
func main() {
// Process:
// 1. Convert regex into postfix notation (Shunting-Yard algorithm)
// a. Add explicit concatenation operators to facilitate this
// 2. Build NFA from postfix representation (Thompson's algorithm)
// 3. Run the string against the NFA
if len(os.Args) != 2 {
fmt.Println("ERROR: Missing cmdline args")
os.Exit(22)
}
2 months ago
var re string
re = os.Args[1]
var test_str string
var err error
// Create reader for stdin and writer for stdout
reader := bufio.NewReader(os.Stdin)
out := bufio.NewWriter(os.Stdout)
2 months ago
re_postfix := shuntingYard(re)
startState := thompson(re_postfix)
// Read every string from stdin until we encounter an error. If the error isn't EOF, panic.'
for test_str, err = reader.ReadString('\n'); err == nil; test_str, err = reader.ReadString('\n') {
matchIndices := findAllMatches(startState, test_str)
// Decompose the array of matchIndex structs into a flat unique array of ints - if matchIndex is {4,7}, flat array will contain 4,5,6
// This should make checking O(1) instead of O(n)
indicesToPrint := new_uniq_arr[int]()
for _, idx := range matchIndices {
indicesToPrint.add(genRange(idx.startIdx, idx.endIdx)...)
}
if len(matchIndices) > 0 {
for i, c := range test_str {
if indicesToPrint.contains(i) {
color.New(color.FgRed).Fprintf(out, "%c", c)
} else {
fmt.Fprintf(out, "%c", c)
}
}
} else {
fmt.Fprint(out, test_str)
}
err = out.Flush()
if err != nil {
panic(err)
}
}
if err != io.EOF {
panic(err)
}
2 months ago
}