package main
import (
"bufio"
"fmt"
"os"
"slices"
"github.com/fatih/color"
)
// CONCAT is the rune used as the explicit concatenation operator. It is
// inserted between adjacent operands while the regex is converted to postfix
// and is treated as an operator by isOperator and priority.
const CONCAT rune = '~'
// isOperator reports whether c is one of the regex operators: '+', '?', '*',
// '|' or the explicit concatenation marker CONCAT.
func isOperator(c rune) bool {
	switch c {
	case '+', '?', '*', '|', CONCAT:
		return true
	}
	return false
}
// priority returns the binding strength of op as its position in the list
// '|', CONCAT, '+', '*', '?' (0 binds loosest, 4 binds tightest). Any rune that
// is not in that list yields -1.
func priority(op rune) int {
	byStrength := [...]rune{'|', CONCAT, '+', '*', '?'}
	rank := slices.Index(byStrength[:], op)
	return rank
}
/ *
The Shunting - Yard algorithm is used to convert the given infix ( regeular ) expression to postfix .
The primary benefit of this is getting rid of parentheses .
It also inserts explicit concatenation operators to make parsing easier in Thompson ' s algorithm .
See : https : //blog.cernera.me/converting-regular-expressions-to-postfix-notation-with-the-shunting-yard-algorithm/
* /
func shuntingYard ( re string ) [ ] postfixNode {
re_postfix := make ( [ ] rune , 0 )
re_runes := [ ] rune ( re ) // Convert the string to a slice of runes to allow iteration through it
/ * Add concatenation operators .
Only add a concatenation operator between two characters if both the following conditions are met :
1. The first character isn ' t an opening parantheses or alteration operator ( or an escape character )
a . This makes sense , because these operators can ' t be _concatenated_ with anything else .
2. The second character isn ' t a ' closing operator ' - one that applies to something before it
a . Again , these operators can ' be concatenated _to_ . They can , however , be concatenated _from_ .
* /
i := 0
for i < len ( re_runes ) {
re_postfix = append ( re_postfix , re_runes [ i ] )
if re_runes [ i ] == '[' && ( i == 0 || re_runes [ i - 1 ] != '\\' ) { // We do not touch things inside brackets, unless they are escaped
re_postfix [ len ( re_postfix ) - 1 ] = LBRACKET // Replace the '[' character with LBRACKET. This allows for easier parsing og all characters (including opening and closing brackets) within the character class
invertMatch := false
toAppend := make ( [ ] rune , 0 ) // Holds all the runes in the current character class
if i < len ( re_runes ) - 1 && re_runes [ i + 1 ] == '^' { // Inverting class - match everything NOT in brackets
invertMatch = true
i ++
}
if i < len ( re_runes ) - 1 && re_runes [ i + 1 ] == ']' { // Nothing inside brackets - panic.
panic ( "Empty character class." )
}
for re_runes [ i ] != ']' {
i ++ // Skip all characters inside brackets
// TODO: Check for escaped characters
// Check ahead for character range
if i < len ( re_runes ) - 2 && re_runes [ i + 1 ] == '-' {
rangeStart := re_runes [ i ]
rangeEnd := re_runes [ i + 2 ]
if int ( rangeEnd ) < int ( rangeStart ) {
panic ( "Range is out of order." )
}
for i := rangeStart ; i <= rangeEnd ; i ++ {
toAppend = append ( toAppend , i )
}
i += 2 // Skip start and hyphen (end will automatically be skipped on next iteration of loop)
continue
}
toAppend = append ( toAppend , re_runes [ i ] )
}
// Replace the last character (which should have been ']', with RBRACKET
toAppend [ len ( toAppend ) - 1 ] = RBRACKET
if invertMatch {
toAppend = setDifference ( dotChars ( ) , toAppend ) // Take the inverse of the set by getting the difference between it and all dot characters
toAppend = append ( toAppend , RBRACKET ) // Since RBRACKET doesn't exist in dotChars, it wouldn't have been return in setDifference. We manually append it here.
}
re_postfix = append ( re_postfix , toAppend ... )
}
if ( re_runes [ i ] != '(' && re_runes [ i ] != '|' && re_runes [ i ] != '\\' ) || ( i > 0 && re_runes [ i - 1 ] == '\\' ) { // Every character should be concatenated if it is escaped
if i < len ( re_runes ) - 1 {
if re_runes [ i + 1 ] != '|' && re_runes [ i + 1 ] != '*' && re_runes [ i + 1 ] != '+' && re_runes [ i + 1 ] != '?' && re_runes [ i + 1 ] != ')' {
re_postfix = append ( re_postfix , CONCAT )
}
}
}
i ++
}
opStack := make ( [ ] rune , 0 ) // Operator stack
outQueue := make ( [ ] postfixNode , 0 ) // Output queue
// Actual algorithm
for i := 0 ; i < len ( re_postfix ) ; i ++ {
/ * Two cases :
1. Current character is alphanumeric - send to output queue
2. Current character is operator - do the following :
a . If current character has greater priority than top of opStack , push to opStack .
b . If not , keep popping from opStack ( and appending to outQueue ) until :
i . opStack is empty , OR
ii . current character has greater priority than top of opStack
3. If current character is '(' , push to opStack
4. If current character is ')' , pop from opStack ( and append to outQueue ) until '(' is found . Discard parantheses .
5. If current character is '[' , find all the characters until ']' , then create a postfixNode containing all these contents . Add this node to outQueue .
* /
c := re_postfix [ i ]
if isAlphaNum ( c ) {
outQueue = append ( outQueue , newPostfixNode ( c ) )
continue
}
// Escape character
if c == '\\' { // Escape character - invert special and non-special characters eg. \( is treated as a literal parentheses, \b is treated as word boundary
if i == len ( re_postfix ) - 1 { // End of string - panic, because backslash is an escape character (something needs to come after it)
panic ( "ERROR: Backslash with no escape character." )
}
i ++
outQueue = append ( outQueue , newEscapedNode ( re_postfix [ i ] ) )
continue // Escaped character will automatically be skipped when loop variable increments
}
if c == '.' { // Dot metacharacter - represents 'any' character, but I am only adding Unicode 0020-007E
outQueue = append ( outQueue , newPostfixNode ( dotChars ( ) ... ) )
continue
}
if isOperator ( c ) {
if len ( opStack ) == 0 {
opStack = append ( opStack , c )
} else {
topStack , err := peek ( opStack )
if err != nil {
panic ( "ERROR: Operator without operand." )
}
if priority ( c ) > priority ( topStack ) { // 2a
opStack = append ( opStack , c )
} else {
for priority ( c ) <= priority ( topStack ) { // 2b
to_append := mustPop ( & opStack )
outQueue = append ( outQueue , newPostfixNode ( to_append ) )
topStack , _ = peek ( opStack )
}
opStack = append ( opStack , c )
}
}
}
if c == LBRACKET { // Used for character classes
i ++ // Step forward so we can look at the character class
chars := make ( [ ] rune , 0 ) // List of characters - used only for character classes
for i < len ( re_postfix ) {
if re_postfix [ i ] == RBRACKET {
break
}
chars = append ( chars , re_postfix [ i ] )
i ++
}
if i == len ( re_postfix ) { // We have reached the end of the string, so we didn't encounter a closing brakcet. Panic.
panic ( "ERROR: Opening bracket without closing bracket." )
}
outQueue = append ( outQueue , newPostfixNode ( chars ... ) )
// i++ // Step forward to skip closing bracket
continue
}
if c == '(' {
opStack = append ( opStack , c )
}
if c == ')' {
// Keep popping from opStack until we encounter an opening parantheses. Panic if we reach the end of the stack.
for val , err := peek ( opStack ) ; val != '(' ; val , err = peek ( opStack ) {
if err != nil {
panic ( "ERROR: Imbalanced parantheses." )
}
to_append := mustPop ( & opStack )
outQueue = append ( outQueue , newPostfixNode ( to_append ) )
}
_ = mustPop ( & opStack ) // Get rid of opening parantheses
}
}
// Pop all remaining operators (and append to outQueue)
for len ( opStack ) > 0 {
to_append := mustPop ( & opStack )
outQueue = append ( outQueue , newPostfixNode ( to_append ) )
}
return outQueue
}
// thompson runs Thompson's construction over a postfix expression, keeping a
// stack of partially built automata: character nodes push a new state, and
// operator nodes pop their operands and push the combined result.
//
// It returns the start state. It panics with "ERROR: Invalid Regex." unless
// exactly one automaton is left on the stack once every node is consumed.
func thompson(re []postfixNode) *State {
	stack := []*State{} // Stack of states

	for _, node := range re {
		switch node.nodetype {
		case CHARACTER:
			// A fresh state that matches the node's contents and is its own
			// single output.
			leaf := &State{}
			leaf.transitions = make(map[int][]*State)
			leaf.content = rune2Contents(node.contents)
			leaf.output = []*State{leaf}
			leaf.isEmpty = false
			stack = append(stack, leaf)

		case CONCATENATE:
			// The right operand was pushed last, so it comes off first.
			right := mustPop(&stack)
			left := mustPop(&stack)
			stack = append(stack, concatenate(left, right))

		case KLEENE:
			operand := mustPop(&stack)
			stack = append(stack, kleene(*operand))

		case PLUS: // a+ is equivalent to aa*
			operand := mustPop(&stack)
			starred := kleene(*operand)
			stack = append(stack, concatenate(operand, starred))

		case QUESTION: // ab? is equivalent to a(b|)
			operand := mustPop(&stack)
			// The empty alternative: an epsilon state that is its own output.
			empty := &State{}
			empty.transitions = make(map[int][]*State)
			empty.content = newContents(EPSILON)
			empty.output = append(empty.output, empty)
			empty.isEmpty = true
			stack = append(stack, alternate(operand, empty))

		case PIPE:
			first := mustPop(&stack)
			second := mustPop(&stack)
			stack = append(stack, alternate(first, second))
		}
	}

	if len(stack) != 1 {
		panic("ERROR: Invalid Regex.")
	}
	verifyLastStates(stack)
	return stack[0]
}
// main matches the regex given as the only command-line argument against one
// line read from stdin and prints that line with every matched character in
// red. If nothing matches, the line is printed unchanged.
//
// Process:
//  1. Convert regex into postfix notation (Shunting-Yard algorithm)
//     a. Add explicit concatenation operators to facilitate this
//  2. Build NFA from postfix representation (Thompson's algorithm)
//  3. Run the string against the NFA
//
// It exits with status 22 when the argument count is wrong and panics when
// nothing can be read from stdin.
func main() {
	if len(os.Args) != 2 {
		fmt.Println("ERROR: Missing cmdline args")
		os.Exit(22)
	}
	re := os.Args[1]

	// Read the test string from stdin. ReadString returns whatever it managed
	// to read together with the error, so a last line with no trailing newline
	// arrives as data plus an error. Only give up when nothing was read.
	reader := bufio.NewReader(os.Stdin)
	testStr, err := reader.ReadString('\n')
	if err != nil && testStr == "" {
		panic(err)
	}
	// No second read here: the line has already been consumed, and reading
	// os.Stdin directly would bypass the reader's buffer.

	rePostfix := shuntingYard(re)
	startState := thompson(rePostfix)
	matchIndices := findAllMatches(startState, testStr)

	if len(matchIndices) == 0 {
		fmt.Print(testStr)
		return
	}

	highlight := color.New(color.FgRed) // Built once instead of once per character
	// NOTE(review): i is a byte offset (range over a string); this assumes
	// findAllMatches reports byte offsets as well - confirm.
	for i, c := range testStr {
		matched := false
		for _, indices := range matchIndices {
			if i >= indices.startIdx && i < indices.endIdx {
				matched = true
				break
			}
		}
		if matched {
			highlight.Printf("%c", c)
		} else {
			fmt.Printf("%c", c)
		}
	}
}