|
|
|
package main
|
|
|
|
|
|
|
|
import (
|
|
|
|
"fmt"
|
|
|
|
"os"
|
|
|
|
"slices"
|
|
|
|
|
|
|
|
"github.com/fatih/color"
|
|
|
|
)
|
|
|
|
|
|
|
|
// CONCAT is the rune used as the explicit concatenation operator.
// shuntingYard inserts it between adjacent operands, and thompson
// consumes it when joining two states.
const CONCAT rune = '~'
|
|
|
|
|
|
|
|
func isOperator(c rune) bool {
|
|
|
|
if c == '+' || c == '*' || c == '|' || c == CONCAT {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
|
|
|
/* priority returns the priority of the given operator */
|
|
|
|
func priority(op rune) int {
|
|
|
|
precedence := []rune{'|', CONCAT, '+', '*'}
|
|
|
|
return slices.Index(precedence, op)
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
shuntingYard applies the Shunting-Yard algorithm
|
|
|
|
|
|
|
|
to convert the given infix expression to postfix. This makes
|
|
|
|
it easier to parse the algorithm when doing Thompson.
|
|
|
|
See: https://blog.cernera.me/converting-regular-expressions-to-postfix-notation-with-the-shunting-yard-algorithm/
|
|
|
|
*/
|
|
|
|
func shuntingYard(re string) string {
|
|
|
|
re_postfix := make([]rune, 0)
|
|
|
|
re_runes := []rune(re)
|
|
|
|
/* Add concatenation operators */
|
|
|
|
for i := 0; i < len(re_runes); i++ {
|
|
|
|
re_postfix = append(re_postfix, re_runes[i])
|
|
|
|
if re_runes[i] != '(' && re_runes[i] != '|' {
|
|
|
|
if i < len(re_runes)-1 {
|
|
|
|
if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != '+' && re_runes[i+1] != ')' {
|
|
|
|
re_postfix = append(re_postfix, CONCAT)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// fmt.Println(string(re_postfix))
|
|
|
|
|
|
|
|
opStack := make([]rune, 0) // Operator stack
|
|
|
|
outQueue := make([]rune, 0) // Output queue
|
|
|
|
|
|
|
|
// Actual algorithm
|
|
|
|
for _, c := range re_postfix {
|
|
|
|
/* Two cases:
|
|
|
|
1. Current character is alphanumeric - send to output queue
|
|
|
|
2. Current character is operator - do the following:
|
|
|
|
a. If current character has greater priority than top of opStack, push to opStack.
|
|
|
|
b. If not, keep popping from opStack (and appending to outQueue) until:
|
|
|
|
i. opStack is empty, OR
|
|
|
|
ii. current character has greater priority than top of opStack
|
|
|
|
3. If current character is '(', push to opStack
|
|
|
|
4. If current character is ')', pop from opStack (and append to outQueue) until '(' is found. Discard parantheses.
|
|
|
|
*/
|
|
|
|
if isAlphaNum(c) {
|
|
|
|
outQueue = append(outQueue, c)
|
|
|
|
}
|
|
|
|
if isOperator(c) {
|
|
|
|
if len(opStack) == 0 {
|
|
|
|
opStack = append(opStack, c)
|
|
|
|
} else {
|
|
|
|
topStack, err := peek(opStack)
|
|
|
|
if err != nil {
|
|
|
|
panic("ERROR: Operator without operand.")
|
|
|
|
}
|
|
|
|
if priority(c) > priority(topStack) { // 2a
|
|
|
|
opStack = append(opStack, c)
|
|
|
|
} else {
|
|
|
|
for priority(c) <= priority(topStack) { // 2b
|
|
|
|
to_append := mustPop(&opStack)
|
|
|
|
outQueue = append(outQueue, to_append)
|
|
|
|
topStack, _ = peek(opStack)
|
|
|
|
}
|
|
|
|
opStack = append(opStack, c)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if c == '(' {
|
|
|
|
opStack = append(opStack, c)
|
|
|
|
}
|
|
|
|
if c == ')' {
|
|
|
|
// Keep popping from opStack until we encounter an opening parantheses. Panic if we reach the end of the stack.
|
|
|
|
for val, err := peek(opStack); val != '('; val, err = peek(opStack) {
|
|
|
|
if err != nil {
|
|
|
|
panic("ERROR: Imbalanced parantheses.")
|
|
|
|
}
|
|
|
|
to_append := mustPop(&opStack)
|
|
|
|
outQueue = append(outQueue, to_append)
|
|
|
|
}
|
|
|
|
_ = mustPop(&opStack) // Get rid of opening parantheses
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Pop all remaining operators (and append to outQueue)
|
|
|
|
for len(opStack) > 0 {
|
|
|
|
to_append := mustPop(&opStack)
|
|
|
|
outQueue = append(outQueue, to_append)
|
|
|
|
}
|
|
|
|
|
|
|
|
return string(outQueue)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Thompson's algorithm. Constructs Finite-State Automaton from given string.
|
|
|
|
// Returns start state.
|
|
|
|
func thompson(re string) *State {
|
|
|
|
nfa := make([]*State, 0) // Stack of states
|
|
|
|
for _, c := range re {
|
|
|
|
if isAlphaNum(c) {
|
|
|
|
state := State{}
|
|
|
|
state.transitions = make(map[int][]*State)
|
|
|
|
state.content = int(c)
|
|
|
|
state.output = make([]*State, 0)
|
|
|
|
state.output = append(state.output, &state)
|
|
|
|
state.isEmpty = false
|
|
|
|
nfa = append(nfa, &state)
|
|
|
|
}
|
|
|
|
// Must be an operator if it isn't alphanumeric
|
|
|
|
switch c {
|
|
|
|
case CONCAT:
|
|
|
|
s2 := mustPop(&nfa)
|
|
|
|
s1 := mustPop(&nfa)
|
|
|
|
s1 = concatenate(s1, s2)
|
|
|
|
nfa = append(nfa, s1)
|
|
|
|
case '*': // Create a 0-state, concat the popped state after it, concat the 0-state after the popped state
|
|
|
|
s1 := mustPop(&nfa)
|
|
|
|
stateToAdd := kleene(*s1)
|
|
|
|
nfa = append(nfa, stateToAdd)
|
|
|
|
case '+': // a+ is equivalent to aa*
|
|
|
|
s1 := mustPop(&nfa)
|
|
|
|
s2 := kleene(*s1)
|
|
|
|
s1 = concatenate(s1, s2)
|
|
|
|
nfa = append(nfa, s1)
|
|
|
|
case '|':
|
|
|
|
s1 := mustPop(&nfa)
|
|
|
|
s2 := mustPop(&nfa)
|
|
|
|
s3 := State{}
|
|
|
|
s3.transitions = make(map[int][]*State)
|
|
|
|
s3.output = append(s3.output, s1.output...)
|
|
|
|
s3.output = append(s3.output, s2.output...)
|
|
|
|
// Unique append is used here (and elsewhere) to ensure that,
|
|
|
|
// for any given transition, a state can only be mentioned once.
|
|
|
|
// For example, given the transition 'a', the state 's1' can only be mentioned once.
|
|
|
|
// This would lead to multiple instances of the same set of match indices, since both
|
|
|
|
// 's1' states would be considered to match.
|
|
|
|
s3.transitions[s1.content] = unique_append(s3.transitions[s1.content], s1)
|
|
|
|
s3.transitions[s2.content] = unique_append(s3.transitions[s2.content], s2)
|
|
|
|
s3.content = EPSILON
|
|
|
|
s3.isEmpty = true
|
|
|
|
|
|
|
|
nfa = append(nfa, &s3)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if len(nfa) != 1 {
|
|
|
|
panic("ERROR: Invalid Regex.")
|
|
|
|
}
|
|
|
|
|
|
|
|
verifyLastStates(nfa)
|
|
|
|
|
|
|
|
return nfa[0]
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
func main() {
|
|
|
|
// Process:
|
|
|
|
// 1. Convert regex into postfix notation (Shunting-Yard algorithm)
|
|
|
|
// a. Add explicit concatenation operators to facilitate this
|
|
|
|
// 2. Build NFA from postfix representation (Thompson's algorithm)
|
|
|
|
// 3. Run the string against the NFA
|
|
|
|
if len(os.Args) < 3 {
|
|
|
|
fmt.Println("ERROR: Missing cmdline args")
|
|
|
|
os.Exit(22)
|
|
|
|
}
|
|
|
|
var re string
|
|
|
|
re = os.Args[1]
|
|
|
|
re_postfix := shuntingYard(re)
|
|
|
|
// fmt.Println(re_postfix)
|
|
|
|
startState := thompson(re_postfix)
|
|
|
|
matchIndices := findAllMatches(startState, os.Args[2])
|
|
|
|
inColor := false
|
|
|
|
if len(matchIndices) > 0 {
|
|
|
|
for i, c := range os.Args[2] {
|
|
|
|
for _, indices := range matchIndices {
|
|
|
|
if i >= indices.startIdx && i < indices.endIdx {
|
|
|
|
color.New(color.FgRed).Printf("%c", c)
|
|
|
|
inColor = true
|
|
|
|
break
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if inColor == false {
|
|
|
|
fmt.Printf("%c", c)
|
|
|
|
}
|
|
|
|
inColor = false
|
|
|
|
}
|
|
|
|
fmt.Printf("\n")
|
|
|
|
} else {
|
|
|
|
fmt.Println(os.Args[2])
|
|
|
|
}
|
|
|
|
}
|