package main import ( "fmt" "os" "slices" "github.com/fatih/color" ) const CONCAT rune = '~' func isOperator(c rune) bool { if c == '+' || c == '*' || c == '|' || c == CONCAT { return true } return false } /* priority returns the priority of the given operator */ func priority(op rune) int { precedence := []rune{'|', CONCAT, '+', '*'} return slices.Index(precedence, op) } /* shuntingYard applies the Shunting-Yard algorithm to convert the given infix expression to postfix. This makes it easier to parse the algorithm when doing Thompson. See: https://blog.cernera.me/converting-regular-expressions-to-postfix-notation-with-the-shunting-yard-algorithm/ */ func shuntingYard(re string) string { re_postfix := make([]rune, 0) re_runes := []rune(re) /* Add concatenation operators */ for i := 0; i < len(re_runes); i++ { re_postfix = append(re_postfix, re_runes[i]) if re_runes[i] != '(' && re_runes[i] != '|' { if i < len(re_runes)-1 { if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != '+' && re_runes[i+1] != ')' { re_postfix = append(re_postfix, CONCAT) } } } } // fmt.Println(string(re_postfix)) opStack := make([]rune, 0) // Operator stack outQueue := make([]rune, 0) // Output queue // Actual algorithm for _, c := range re_postfix { /* Two cases: 1. Current character is alphanumeric - send to output queue 2. Current character is operator - do the following: a. If current character has greater priority than top of opStack, push to opStack. b. If not, keep popping from opStack (and appending to outQueue) until: i. opStack is empty, OR ii. current character has greater priority than top of opStack 3. If current character is '(', push to opStack 4. If current character is ')', pop from opStack (and append to outQueue) until '(' is found. Discard parantheses. */ if isAlphaNum(c) { outQueue = append(outQueue, c) } if isOperator(c) { if len(opStack) == 0 { opStack = append(opStack, c) } else { topStack, err := peek(opStack) if err != nil { panic("ERROR: Operator without operand.") } if priority(c) > priority(topStack) { // 2a opStack = append(opStack, c) } else { for priority(c) <= priority(topStack) { // 2b to_append := mustPop(&opStack) outQueue = append(outQueue, to_append) topStack, _ = peek(opStack) } opStack = append(opStack, c) } } } if c == '(' { opStack = append(opStack, c) } if c == ')' { // Keep popping from opStack until we encounter an opening parantheses. Panic if we reach the end of the stack. for val, err := peek(opStack); val != '('; val, err = peek(opStack) { if err != nil { panic("ERROR: Imbalanced parantheses.") } to_append := mustPop(&opStack) outQueue = append(outQueue, to_append) } _ = mustPop(&opStack) // Get rid of opening parantheses } } // Pop all remaining operators (and append to outQueue) for len(opStack) > 0 { to_append := mustPop(&opStack) outQueue = append(outQueue, to_append) } return string(outQueue) } // Thompson's algorithm. Constructs Finite-State Automaton from given string. // Returns start state. func thompson(re string) *State { nfa := make([]*State, 0) // Stack of states for _, c := range re { if isAlphaNum(c) { state := State{} state.transitions = make(map[int][]*State) state.content = int(c) state.output = make([]*State, 0) state.output = append(state.output, &state) state.isEmpty = false nfa = append(nfa, &state) } // Must be an operator if it isn't alphanumeric switch c { case CONCAT: s2 := mustPop(&nfa) s1 := mustPop(&nfa) s1 = concatenate(s1, s2) nfa = append(nfa, s1) case '*': // Create a 0-state, concat the popped state after it, concat the 0-state after the popped state s1 := mustPop(&nfa) stateToAdd := kleene(*s1) nfa = append(nfa, stateToAdd) case '+': // a+ is equivalent to aa* s1 := mustPop(&nfa) s2 := kleene(*s1) s1 = concatenate(s1, s2) nfa = append(nfa, s1) case '|': s1 := mustPop(&nfa) s2 := mustPop(&nfa) s3 := State{} s3.transitions = make(map[int][]*State) s3.output = append(s3.output, s1.output...) s3.output = append(s3.output, s2.output...) // Unique append is used here (and elsewhere) to ensure that, // for any given transition, a state can only be mentioned once. // For example, given the transition 'a', the state 's1' can only be mentioned once. // This would lead to multiple instances of the same set of match indices, since both // 's1' states would be considered to match. s3.transitions[s1.content] = unique_append(s3.transitions[s1.content], s1) s3.transitions[s2.content] = unique_append(s3.transitions[s2.content], s2) s3.content = EPSILON s3.isEmpty = true nfa = append(nfa, &s3) } } if len(nfa) != 1 { panic("ERROR: Invalid Regex.") } verifyLastStates(nfa) return nfa[0] } func main() { // Process: // 1. Convert regex into postfix notation (Shunting-Yard algorithm) // a. Add explicit concatenation operators to facilitate this // 2. Build NFA from postfix representation (Thompson's algorithm) // 3. Run the string against the NFA if len(os.Args) < 3 { fmt.Println("ERROR: Missing cmdline args") os.Exit(22) } var re string re = os.Args[1] re_postfix := shuntingYard(re) // fmt.Println(re_postfix) startState := thompson(re_postfix) matchIndices := findAllMatches(startState, os.Args[2]) inColor := false if len(matchIndices) > 0 { for i, c := range os.Args[2] { for _, indices := range matchIndices { if i >= indices.startIdx && i < indices.endIdx { color.New(color.FgRed).Printf("%c", c) inColor = true break } } if inColor == false { fmt.Printf("%c", c) } inColor = false } fmt.Printf("\n") } else { fmt.Println(os.Args[2]) } }