You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

212 lines
6.4 KiB
Go

2 months ago
package main
import (
"fmt"
"os"
2 months ago
"slices"
"github.com/fatih/color"
2 months ago
)
// CONCAT is the rune used as the explicit concatenation operator. shuntingYard
// inserts it between adjacent operands so that concatenation can be handled
// like any other binary operator. Because isOperator recognises it, a literal
// '~' in a pattern is treated as an operator rather than as a character.
const CONCAT rune = '~'
func isOperator(c rune) bool {
if c == '+' || c == '*' || c == '|' || c == CONCAT {
2 months ago
return true
}
return false
}
/* priority returns the priority of the given operator */
func priority(op rune) int {
precedence := []rune{'|', CONCAT, '+', '*'}
2 months ago
return slices.Index(precedence, op)
}
/*
2 months ago
The Shunting-Yard algorithm is used to convert the given infix (regeular) expression to postfix.
The primary benefit of this is getting rid of parentheses.
It also inserts explicit concatenation operators to make parsing easier in Thompson's algorithm.
2 months ago
See: https://blog.cernera.me/converting-regular-expressions-to-postfix-notation-with-the-shunting-yard-algorithm/
*/
func shuntingYard(re string) string {
re_postfix := make([]rune, 0)
2 months ago
re_runes := []rune(re) // Convert the string to a slice of runes to allow iteration through it
/* Add concatenation operators.
Only add a concatenation operator between two characters if both the following conditions are met:
1. The first character isn't an opening parantheses or alteration operator.
a. This makes sense, because these operators can't be _concatenated_ with anything else.
2. The second character isn't a 'closing operator' - one that applies to something before it
a. Again, these operators can'be concatenated _to_. They can, however, be concatenated _from_.
*/
2 months ago
for i := 0; i < len(re_runes); i++ {
re_postfix = append(re_postfix, re_runes[i])
if re_runes[i] != '(' && re_runes[i] != '|' {
if i < len(re_runes)-1 {
if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != '+' && re_runes[i+1] != ')' {
2 months ago
re_postfix = append(re_postfix, CONCAT)
}
}
}
}
opStack := make([]rune, 0) // Operator stack
outQueue := make([]rune, 0) // Output queue
// Actual algorithm
for _, c := range re_postfix {
/* Two cases:
1. Current character is alphanumeric - send to output queue
2. Current character is operator - do the following:
a. If current character has greater priority than top of opStack, push to opStack.
b. If not, keep popping from opStack (and appending to outQueue) until:
i. opStack is empty, OR
ii. current character has greater priority than top of opStack
3. If current character is '(', push to opStack
4. If current character is ')', pop from opStack (and append to outQueue) until '(' is found. Discard parantheses.
*/
if isAlphaNum(c) {
outQueue = append(outQueue, c)
}
if isOperator(c) {
if len(opStack) == 0 {
opStack = append(opStack, c)
} else {
topStack, err := peek(opStack)
if err != nil {
panic("ERROR: Operator without operand.")
}
if priority(c) > priority(topStack) { // 2a
2 months ago
opStack = append(opStack, c)
} else {
for priority(c) <= priority(topStack) { // 2b
to_append := mustPop(&opStack)
2 months ago
outQueue = append(outQueue, to_append)
topStack, _ = peek(opStack)
2 months ago
}
opStack = append(opStack, c)
}
}
}
if c == '(' {
opStack = append(opStack, c)
}
if c == ')' {
// Keep popping from opStack until we encounter an opening parantheses. Panic if we reach the end of the stack.
for val, err := peek(opStack); val != '('; val, err = peek(opStack) {
if err != nil {
panic("ERROR: Imbalanced parantheses.")
}
to_append := mustPop(&opStack)
2 months ago
outQueue = append(outQueue, to_append)
}
_ = mustPop(&opStack) // Get rid of opening parantheses
2 months ago
}
}
// Pop all remaining operators (and append to outQueue)
for len(opStack) > 0 {
to_append := mustPop(&opStack)
2 months ago
outQueue = append(outQueue, to_append)
}
return string(outQueue)
}
// Thompson's algorithm. Constructs Finite-State Automaton from given string.
// Returns start state.
func thompson(re string) *State {
nfa := make([]*State, 0) // Stack of states
2 months ago
for _, c := range re {
if isAlphaNum(c) {
state := State{}
state.transitions = make(map[int][]*State)
2 months ago
state.content = int(c)
state.output = make([]*State, 0)
state.output = append(state.output, &state)
state.isEmpty = false
nfa = append(nfa, &state)
2 months ago
}
// Must be an operator if it isn't alphanumeric
switch c {
case CONCAT:
s2 := mustPop(&nfa)
s1 := mustPop(&nfa)
s1 = concatenate(s1, s2)
2 months ago
nfa = append(nfa, s1)
case '*': // Create a 0-state, concat the popped state after it, concat the 0-state after the popped state
s1 := mustPop(&nfa)
stateToAdd := kleene(*s1)
nfa = append(nfa, stateToAdd)
case '+': // a+ is equivalent to aa*
s1 := mustPop(&nfa)
s2 := kleene(*s1)
s1 = concatenate(s1, s2)
nfa = append(nfa, s1)
2 months ago
case '|':
s1 := mustPop(&nfa)
s2 := mustPop(&nfa)
2 months ago
s3 := State{}
s3.transitions = make(map[int][]*State)
s3.output = append(s3.output, s1.output...)
s3.output = append(s3.output, s2.output...)
// Unique append is used here (and elsewhere) to ensure that,
// for any given transition, a state can only be mentioned once.
// For example, given the transition 'a', the state 's1' can only be mentioned once.
// This would lead to multiple instances of the same set of match indices, since both
// 's1' states would be considered to match.
s3.transitions[s1.content] = unique_append(s3.transitions[s1.content], s1)
s3.transitions[s2.content] = unique_append(s3.transitions[s2.content], s2)
s3.content = EPSILON
2 months ago
s3.isEmpty = true
nfa = append(nfa, &s3)
2 months ago
}
}
if len(nfa) != 1 {
panic("ERROR: Invalid Regex.")
}
verifyLastStates(nfa)
return nfa[0]
}
func main() {
// Process:
// 1. Convert regex into postfix notation (Shunting-Yard algorithm)
// a. Add explicit concatenation operators to facilitate this
// 2. Build NFA from postfix representation (Thompson's algorithm)
// 3. Run the string against the NFA
if len(os.Args) < 3 {
fmt.Println("ERROR: Missing cmdline args")
os.Exit(22)
}
2 months ago
var re string
re = os.Args[1]
2 months ago
re_postfix := shuntingYard(re)
// fmt.Println(re_postfix)
startState := thompson(re_postfix)
matchIndices := findAllMatches(startState, os.Args[2])
inColor := false
if len(matchIndices) > 0 {
for i, c := range os.Args[2] {
for _, indices := range matchIndices {
if i >= indices.startIdx && i < indices.endIdx {
color.New(color.FgRed).Printf("%c", c)
inColor = true
break
}
}
if inColor == false {
fmt.Printf("%c", c)
}
inColor = false
}
fmt.Printf("\n")
} else {
fmt.Println(os.Args[2])
}
2 months ago
}