Added support for character classes (not ranges, yet); also take input from stdin instead of cmdline arg

master
Aadhavan Srinivasan 2 months ago
parent cd680371fb
commit 11073759e3

@ -1,6 +1,7 @@
package main package main
import ( import (
"bufio"
"fmt" "fmt"
"os" "os"
"slices" "slices"
@ -29,7 +30,7 @@ The primary benefit of this is getting rid of parentheses.
It also inserts explicit concatenation operators to make parsing easier in Thompson's algorithm. It also inserts explicit concatenation operators to make parsing easier in Thompson's algorithm.
See: https://blog.cernera.me/converting-regular-expressions-to-postfix-notation-with-the-shunting-yard-algorithm/ See: https://blog.cernera.me/converting-regular-expressions-to-postfix-notation-with-the-shunting-yard-algorithm/
*/ */
func shuntingYard(re string) string { func shuntingYard(re string) []postfixNode {
re_postfix := make([]rune, 0) re_postfix := make([]rune, 0)
re_runes := []rune(re) // Convert the string to a slice of runes to allow iteration through it re_runes := []rune(re) // Convert the string to a slice of runes to allow iteration through it
/* Add concatenation operators. /* Add concatenation operators.
@ -39,8 +40,16 @@ func shuntingYard(re string) string {
2. The second character isn't a 'closing operator' - one that applies to something before it 2. The second character isn't a 'closing operator' - one that applies to something before it
a. Again, these operators can't be concatenated _to_. They can, however, be concatenated _from_. a. Again, these operators can't be concatenated _to_. They can, however, be concatenated _from_.
*/ */
for i := 0; i < len(re_runes); i++ { i := 0
for i < len(re_runes) {
re_postfix = append(re_postfix, re_runes[i]) re_postfix = append(re_postfix, re_runes[i])
if re_runes[i] == '[' && (i == 0 || re_runes[i-1] != '\\') { // We do not touch things inside brackets, unless they are escaped
for re_runes[i] != ']' {
i++ // Skip all characters inside brackets
re_postfix = append(re_postfix, re_runes[i])
}
continue
}
if re_runes[i] != '(' && re_runes[i] != '|' { if re_runes[i] != '(' && re_runes[i] != '|' {
if i < len(re_runes)-1 { if i < len(re_runes)-1 {
if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != '+' && re_runes[i+1] != '?' && re_runes[i+1] != ')' { if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != '+' && re_runes[i+1] != '?' && re_runes[i+1] != ')' {
@ -48,10 +57,11 @@ func shuntingYard(re string) string {
} }
} }
} }
i++
} }
opStack := make([]rune, 0) // Operator stack opStack := make([]rune, 0) // Operator stack
outQueue := make([]rune, 0) // Output queue outQueue := make([]postfixNode, 0) // Output queue
// Actual algorithm // Actual algorithm
for i := 0; i < len(re_postfix); i++ { for i := 0; i < len(re_postfix); i++ {
@ -67,7 +77,7 @@ func shuntingYard(re string) string {
*/ */
c := re_postfix[i] c := re_postfix[i]
if isAlphaNum(c) { if isAlphaNum(c) {
outQueue = append(outQueue, c) outQueue = append(outQueue, newPostfixNode(c))
continue continue
} }
// Escape character - NOT IMPLEMENTED YET - DO NOT USE // Escape character - NOT IMPLEMENTED YET - DO NOT USE
@ -91,13 +101,30 @@ func shuntingYard(re string) string {
} else { } else {
for priority(c) <= priority(topStack) { // 2b for priority(c) <= priority(topStack) { // 2b
to_append := mustPop(&opStack) to_append := mustPop(&opStack)
outQueue = append(outQueue, to_append) outQueue = append(outQueue, newPostfixNode(to_append))
topStack, _ = peek(opStack) topStack, _ = peek(opStack)
} }
opStack = append(opStack, c) opStack = append(opStack, c)
} }
} }
} }
if c == '[' { // Used for character classes
i++ // Step forward so we can look at the character class
chars := make([]rune, 0) // List of characters - used only for character classes
for i < len(re_postfix) {
if re_postfix[i] == ']' {
break
}
chars = append(chars, re_postfix[i])
i++
}
if i == len(re_postfix) { // We have reached the end of the string, so we didn't encounter a closing bracket. Panic.
panic("ERROR: Opening bracket without closing bracket.")
}
outQueue = append(outQueue, newPostfixNode(chars...))
i++ // Step forward to skip closing bracket
continue
}
if c == '(' { if c == '(' {
opStack = append(opStack, c) opStack = append(opStack, c)
} }
@ -108,7 +135,7 @@ func shuntingYard(re string) string {
panic("ERROR: Imbalanced parantheses.") panic("ERROR: Imbalanced parantheses.")
} }
to_append := mustPop(&opStack) to_append := mustPop(&opStack)
outQueue = append(outQueue, to_append) outQueue = append(outQueue, newPostfixNode(to_append))
} }
_ = mustPop(&opStack) // Get rid of opening parenthesis _ = mustPop(&opStack) // Get rid of opening parenthesis
} }
@ -117,52 +144,52 @@ func shuntingYard(re string) string {
// Pop all remaining operators (and append to outQueue) // Pop all remaining operators (and append to outQueue)
for len(opStack) > 0 { for len(opStack) > 0 {
to_append := mustPop(&opStack) to_append := mustPop(&opStack)
outQueue = append(outQueue, to_append) outQueue = append(outQueue, newPostfixNode(to_append))
} }
return string(outQueue) return outQueue
} }
// Thompson's algorithm. Constructs Finite-State Automaton from given string. // Thompson's algorithm. Constructs Finite-State Automaton from given string.
// Returns start state. // Returns start state.
func thompson(re string) *State { func thompson(re []postfixNode) *State {
nfa := make([]*State, 0) // Stack of states nfa := make([]*State, 0) // Stack of states
for _, c := range re { for _, c := range re {
if isAlphaNum(c) { if c.nodetype == CHARACTER {
state := State{} state := State{}
state.transitions = make(map[int][]*State) state.transitions = make(map[int][]*State)
state.content = int(c) state.content = rune2Contents(c.contents)
state.output = make([]*State, 0) state.output = make([]*State, 0)
state.output = append(state.output, &state) state.output = append(state.output, &state)
state.isEmpty = false state.isEmpty = false
nfa = append(nfa, &state) nfa = append(nfa, &state)
} }
// Must be an operator if it isn't alphanumeric // Must be an operator if it isn't a character
switch c { switch c.nodetype {
case CONCAT: case CONCATENATE:
s2 := mustPop(&nfa) s2 := mustPop(&nfa)
s1 := mustPop(&nfa) s1 := mustPop(&nfa)
s1 = concatenate(s1, s2) s1 = concatenate(s1, s2)
nfa = append(nfa, s1) nfa = append(nfa, s1)
case '*': // Create a 0-state, concat the popped state after it, concat the 0-state after the popped state case KLEENE: // Create a 0-state, concat the popped state after it, concat the 0-state after the popped state
s1 := mustPop(&nfa) s1 := mustPop(&nfa)
stateToAdd := kleene(*s1) stateToAdd := kleene(*s1)
nfa = append(nfa, stateToAdd) nfa = append(nfa, stateToAdd)
case '+': // a+ is equivalent to aa* case PLUS: // a+ is equivalent to aa*
s1 := mustPop(&nfa) s1 := mustPop(&nfa)
s2 := kleene(*s1) s2 := kleene(*s1)
s1 = concatenate(s1, s2) s1 = concatenate(s1, s2)
nfa = append(nfa, s1) nfa = append(nfa, s1)
case '?': // ab? is equivalent to a(b|) case QUESTION: // ab? is equivalent to a(b|)
s1 := mustPop(&nfa) s1 := mustPop(&nfa)
s2 := &State{} s2 := &State{}
s2.transitions = make(map[int][]*State) s2.transitions = make(map[int][]*State)
s2.content = EPSILON s2.content = newContents(EPSILON)
s2.output = append(s2.output, s2) s2.output = append(s2.output, s2)
s2.isEmpty = true s2.isEmpty = true
s3 := alternate(s1, s2) s3 := alternate(s1, s2)
nfa = append(nfa, s3) nfa = append(nfa, s3)
case '|': case PIPE:
s1 := mustPop(&nfa) s1 := mustPop(&nfa)
s2 := mustPop(&nfa) s2 := mustPop(&nfa)
s3 := alternate(s1, s2) s3 := alternate(s1, s2)
@ -185,19 +212,28 @@ func main() {
// a. Add explicit concatenation operators to facilitate this // a. Add explicit concatenation operators to facilitate this
// 2. Build NFA from postfix representation (Thompson's algorithm) // 2. Build NFA from postfix representation (Thompson's algorithm)
// 3. Run the string against the NFA // 3. Run the string against the NFA
if len(os.Args) < 3 { if len(os.Args) != 2 {
fmt.Println("ERROR: Missing cmdline args") fmt.Println("ERROR: Missing cmdline args")
os.Exit(22) os.Exit(22)
} }
var re string var re string
re = os.Args[1] re = os.Args[1]
var test_str string
// Read test string from stdin
reader := bufio.NewReader(os.Stdin)
test_str, err := reader.ReadString('\n')
if err != nil {
panic(err)
}
fmt.Scanln(&test_str)
re_postfix := shuntingYard(re) re_postfix := shuntingYard(re)
// fmt.Println(re_postfix) // fmt.Println(re_postfix)
startState := thompson(re_postfix) startState := thompson(re_postfix)
matchIndices := findAllMatches(startState, os.Args[2]) matchIndices := findAllMatches(startState, test_str)
inColor := false inColor := false
if len(matchIndices) > 0 { if len(matchIndices) > 0 {
for i, c := range os.Args[2] { for i, c := range test_str {
for _, indices := range matchIndices { for _, indices := range matchIndices {
if i >= indices.startIdx && i < indices.endIdx { if i >= indices.startIdx && i < indices.endIdx {
color.New(color.FgRed).Printf("%c", c) color.New(color.FgRed).Printf("%c", c)
@ -210,8 +246,7 @@ func main() {
} }
inColor = false inColor = false
} }
fmt.Printf("\n")
} else { } else {
fmt.Println(os.Args[2]) fmt.Print(test_str)
} }
} }

Loading…
Cancel
Save