package main import ( "bufio" "fmt" "io" "os" "slices" "github.com/fatih/color" ) const CONCAT rune = '~' func isOperator(c rune) bool { if c == '+' || c == '?' || c == '*' || c == '|' || c == CONCAT { return true } return false } /* priority returns the priority of the given operator */ func priority(op rune) int { precedence := []rune{'|', CONCAT, '+', '*', '?'} return slices.Index(precedence, op) } /* The Shunting-Yard algorithm is used to convert the given infix (regeular) expression to postfix. The primary benefit of this is getting rid of parentheses. It also inserts explicit concatenation operators to make parsing easier in Thompson's algorithm. See: https://blog.cernera.me/converting-regular-expressions-to-postfix-notation-with-the-shunting-yard-algorithm/ */ func shuntingYard(re string) []postfixNode { re_postfix := make([]rune, 0) re_runes := []rune(re) // Convert the string to a slice of runes to allow iteration through it /* Add concatenation operators. Only add a concatenation operator between two characters if both the following conditions are met: 1. The first character isn't an opening parantheses or alteration operator (or an escape character) a. This makes sense, because these operators can't be _concatenated_ with anything else. 2. The second character isn't a 'closing operator' - one that applies to something before it a. Again, these operators can'be concatenated _to_. They can, however, be concatenated _from_. */ i := 0 for i < len(re_runes) { re_postfix = append(re_postfix, re_runes[i]) if re_runes[i] == '[' && (i == 0 || re_runes[i-1] != '\\') { // We do not touch things inside brackets, unless they are escaped re_postfix[len(re_postfix)-1] = LBRACKET // Replace the '[' character with LBRACKET. This allows for easier parsing og all characters (including opening and closing brackets) within the character class invertMatch := false toAppend := make([]rune, 0) // Holds all the runes in the current character class if i < len(re_runes)-1 && re_runes[i+1] == '^' { // Inverting class - match everything NOT in brackets invertMatch = true i++ } if i < len(re_runes)-1 && re_runes[i+1] == ']' { // Nothing inside brackets - panic. panic("Empty character class.") } for re_runes[i] != ']' { i++ // Skip all characters inside brackets // TODO: Check for escaped characters // Check ahead for character range if i < len(re_runes)-2 && re_runes[i+1] == '-' { rangeStart := re_runes[i] rangeEnd := re_runes[i+2] if int(rangeEnd) < int(rangeStart) { panic("Range is out of order.") } for i := rangeStart; i <= rangeEnd; i++ { toAppend = append(toAppend, i) } i += 2 // Skip start and hyphen (end will automatically be skipped on next iteration of loop) continue } toAppend = append(toAppend, re_runes[i]) } // Replace the last character (which should have been ']', with RBRACKET toAppend[len(toAppend)-1] = RBRACKET if invertMatch { toAppend = setDifference(dotChars(), toAppend) // Take the inverse of the set by getting the difference between it and all dot characters toAppend = append(toAppend, RBRACKET) // Since RBRACKET doesn't exist in dotChars, it wouldn't have been return in setDifference. We manually append it here. } re_postfix = append(re_postfix, toAppend...) } if (re_runes[i] != '(' && re_runes[i] != '|' && re_runes[i] != '\\') || (i > 0 && re_runes[i-1] == '\\') { // Every character should be concatenated if it is escaped if i < len(re_runes)-1 { if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != '+' && re_runes[i+1] != '?' && re_runes[i+1] != ')' { re_postfix = append(re_postfix, CONCAT) } } } i++ } opStack := make([]rune, 0) // Operator stack outQueue := make([]postfixNode, 0) // Output queue // Actual algorithm for i := 0; i < len(re_postfix); i++ { /* Two cases: 1. Current character is alphanumeric - send to output queue 2. Current character is operator - do the following: a. If current character has greater priority than top of opStack, push to opStack. b. If not, keep popping from opStack (and appending to outQueue) until: i. opStack is empty, OR ii. current character has greater priority than top of opStack 3. If current character is '(', push to opStack 4. If current character is ')', pop from opStack (and append to outQueue) until '(' is found. Discard parantheses. 5. If current character is '[', find all the characters until ']', then create a postfixNode containing all these contents. Add this node to outQueue. */ c := re_postfix[i] if isAlphaNum(c) { outQueue = append(outQueue, newPostfixNode(c)) continue } // Escape character if c == '\\' { // Escape character - invert special and non-special characters eg. \( is treated as a literal parentheses, \b is treated as word boundary if i == len(re_postfix)-1 { // End of string - panic, because backslash is an escape character (something needs to come after it) panic("ERROR: Backslash with no escape character.") } i++ outQueue = append(outQueue, newEscapedNode(re_postfix[i])) continue // Escaped character will automatically be skipped when loop variable increments } if c == '.' { // Dot metacharacter - represents 'any' character, but I am only adding Unicode 0020-007E outQueue = append(outQueue, newPostfixNode(dotChars()...)) continue } if c == '^' { // Start-of-string assertion outQueue = append(outQueue, newPostfixNode(c)) } if c == '$' { // End-of-string assertion outQueue = append(outQueue, newPostfixNode(c)) } if isOperator(c) { if len(opStack) == 0 { opStack = append(opStack, c) } else { topStack, err := peek(opStack) if err != nil { panic("ERROR: Operator without operand.") } if priority(c) > priority(topStack) { // 2a opStack = append(opStack, c) } else { for priority(c) <= priority(topStack) { // 2b to_append := mustPop(&opStack) outQueue = append(outQueue, newPostfixNode(to_append)) topStack, _ = peek(opStack) } opStack = append(opStack, c) } } } if c == LBRACKET { // Used for character classes i++ // Step forward so we can look at the character class chars := make([]rune, 0) // List of characters - used only for character classes for i < len(re_postfix) { if re_postfix[i] == RBRACKET { break } chars = append(chars, re_postfix[i]) i++ } if i == len(re_postfix) { // We have reached the end of the string, so we didn't encounter a closing brakcet. Panic. panic("ERROR: Opening bracket without closing bracket.") } outQueue = append(outQueue, newPostfixNode(chars...)) // i++ // Step forward to skip closing bracket continue } if c == '(' { opStack = append(opStack, c) } if c == ')' { // Keep popping from opStack until we encounter an opening parantheses. Panic if we reach the end of the stack. for val, err := peek(opStack); val != '('; val, err = peek(opStack) { if err != nil { panic("ERROR: Imbalanced parantheses.") } to_append := mustPop(&opStack) outQueue = append(outQueue, newPostfixNode(to_append)) } _ = mustPop(&opStack) // Get rid of opening parantheses } } // Pop all remaining operators (and append to outQueue) for len(opStack) > 0 { to_append := mustPop(&opStack) outQueue = append(outQueue, newPostfixNode(to_append)) } return outQueue } // Thompson's algorithm. Constructs Finite-State Automaton from given string. // Returns start state. func thompson(re []postfixNode) *State { nfa := make([]*State, 0) // Stack of states for _, c := range re { if c.nodetype == CHARACTER || c.nodetype == ASSERTION { state := State{} state.transitions = make(map[int][]*State) state.content = rune2Contents(c.contents) state.output = make([]*State, 0) state.output = append(state.output, &state) state.isEmpty = false if c.nodetype == ASSERTION { state.content = newContents(EPSILON) // Ideally, an assertion shouldn't have any content, since it doesn't say anything about the content of string state.isEmpty = true switch c.contents[0] { case '^': state.assert = SOS case '$': state.assert = EOS case 'b': state.assert = WBOUND case 'B': state.assert = NONWBOUND } } nfa = append(nfa, &state) } // Must be an operator if it isn't a character switch c.nodetype { case CONCATENATE: s2 := mustPop(&nfa) s1 := mustPop(&nfa) s1 = concatenate(s1, s2) nfa = append(nfa, s1) case KLEENE: // Create a 0-state, concat the popped state after it, concat the 0-state after the popped state s1 := mustPop(&nfa) stateToAdd := kleene(*s1) nfa = append(nfa, stateToAdd) case PLUS: // a+ is equivalent to aa* s1 := mustPop(&nfa) s2 := kleene(*s1) s1 = concatenate(s1, s2) nfa = append(nfa, s1) case QUESTION: // ab? is equivalent to a(b|) s1 := mustPop(&nfa) s2 := &State{} s2.transitions = make(map[int][]*State) s2.content = newContents(EPSILON) s2.output = append(s2.output, s2) s2.isEmpty = true s3 := alternate(s1, s2) nfa = append(nfa, s3) case PIPE: s1 := mustPop(&nfa) s2 := mustPop(&nfa) s3 := alternate(s1, s2) nfa = append(nfa, s3) } } if len(nfa) != 1 { panic("ERROR: Invalid Regex.") } verifyLastStates(nfa) return nfa[0] } func main() { // Process: // 1. Convert regex into postfix notation (Shunting-Yard algorithm) // a. Add explicit concatenation operators to facilitate this // 2. Build NFA from postfix representation (Thompson's algorithm) // 3. Run the string against the NFA if len(os.Args) != 2 { fmt.Println("ERROR: Missing cmdline args") os.Exit(22) } var re string re = os.Args[1] var test_str string // Read test string from stdin reader := bufio.NewReader(os.Stdin) test_str, err := reader.ReadString('\n') if err != nil && err != io.EOF { panic(err) } fmt.Scanln(&test_str) re_postfix := shuntingYard(re) startState := thompson(re_postfix) matchIndices := findAllMatches(startState, test_str) inColor := false if len(matchIndices) > 0 { for i, c := range test_str { for _, indices := range matchIndices { if i >= indices.startIdx && i < indices.endIdx { color.New(color.FgRed).Printf("%c", c) inColor = true break } } if inColor == false { fmt.Printf("%c", c) } inColor = false } } else { fmt.Print(test_str) } }