// Consolidated Go sources (main.go, misc.go, nfa.go, sliceQueue.go) from commit
// 82b33f3c9abf3f2cf0a8c6e8c9e7134509108d6f, "First commit", by Aadhavan Srinivasan,
// Mon Oct 21 23:08:52 2024 -0400.
//
// The declarations below belong to package main and require the
// standard-library imports fmt and unicode.
//
// The commit also added these two non-Go files:
//
// Makefile:
//
//	.DEFAULT_GOAL := build
//	.PHONY: fmt vet build
//
//	fmt:
//		go fmt ./...
//	vet: fmt
//		go vet ./...
//	build: vet
//		go build ./...
//
// go.mod:
//
//	module re
//
//	go 1.23.1

const (
	// CONCAT is the explicit concatenation operator that shuntingYard inserts
	// between adjacent operands. Because isOperator treats it as an operator,
	// a literal '~' cannot appear in a pattern.
	CONCAT rune = '~'
	// UNION is the content value given to the state created for the '|' operator.
	UNION int = 0
	// EPSILON shares the value 0 with UNION. It is declared but not used by
	// any code in this file.
	EPSILON int = 0
)

// errInvalidRegex is the panic value used for every malformed-pattern failure.
const errInvalidRegex = "ERROR: Invalid Regex."

// isOperator reports whether c is one of the regex operators: '*', '|' or CONCAT.
func isOperator(c rune) bool {
	return c == '*' || c == '|' || c == CONCAT
}

// priority returns the precedence of the given operator: '|' is 0, CONCAT is 1
// and '*' is 2. Any other rune, including '(', yields -1, which is lower than
// every operator.
func priority(op rune) int {
	switch op {
	case '|':
		return 0
	case CONCAT:
		return 1
	case '*':
		return 2
	default:
		return -1
	}
}

// shuntingYard applies the Shunting-Yard algorithm to convert the given infix
// regular expression to postfix, which is easier to process when building the
// automaton in thompson.
// See: https://blog.cernera.me/converting-regular-expressions-to-postfix-notation-with-the-shunting-yard-algorithm/
//
// It first makes concatenation explicit by inserting CONCAT between adjacent
// operands and prints that intermediate form to standard output. Runes that
// are neither alphanumeric, operators nor parentheses are not copied to the
// result. It panics with "ERROR: Invalid Regex." when parentheses are
// unbalanced.
func shuntingYard(re string) string {
	reRunes := []rune(re)

	// Pass 1: add concatenation operators. One is needed after every rune
	// that can end an operand (anything but '(' and '|') when the next rune
	// can start one (anything but '|', '*' and ')').
	infix := make([]rune, 0, 2*len(reRunes))
	for i, c := range reRunes {
		infix = append(infix, c)
		if c == '(' || c == '|' || i == len(reRunes)-1 {
			continue
		}
		switch reRunes[i+1] {
		case '|', '*', ')':
			// The next rune continues or closes the current operand.
		default:
			infix = append(infix, CONCAT)
		}
	}

	// Debug output of the infix form with explicit concatenation.
	fmt.Println(string(infix))

	var opStack []rune                      // Operator stack
	outQueue := make([]rune, 0, len(infix)) // Output queue

	// Pass 2: the actual algorithm.
	for _, c := range infix {
		switch {
		case isAlphaNum(c):
			// Operands go straight to the output queue.
			outQueue = append(outQueue, c)
		case isOperator(c):
			// Pop operators of greater or equal priority before pushing c.
			// '(' has priority -1, so the loop never pops past it.
			for len(opStack) > 0 && priority(c) <= priority(peek(opStack)) {
				outQueue = append(outQueue, pop(&opStack))
			}
			opStack = append(opStack, c)
		case c == '(':
			opStack = append(opStack, c)
		case c == ')':
			// Pop until the matching '(' is found; the parentheses
			// themselves are discarded.
			for len(opStack) > 0 && peek(opStack) != '(' {
				outQueue = append(outQueue, pop(&opStack))
			}
			if len(opStack) == 0 {
				// ')' without a matching '('.
				panic(errInvalidRegex)
			}
			pop(&opStack) // Get rid of the opening parenthesis
		}
	}

	// Pop all remaining operators (and append to outQueue).
	for len(opStack) > 0 {
		op := pop(&opStack)
		if op == '(' {
			// '(' that was never closed.
			panic(errInvalidRegex)
		}
		outQueue = append(outQueue, op)
	}

	return string(outQueue)
}

// thompson implements Thompson's algorithm: it builds a finite-state automaton
// from the given postfix expression (as produced by shuntingYard) and returns
// the start state.
//
// Fragments are kept on a stack of State values. A State is copied whenever it
// is pushed or popped, but every copy shares the same transitions map, so
// adding a transition through a pointer in an output list is visible through
// all copies of that state.
//
// It panics with "ERROR: Invalid Regex." if an operator has too few operands
// or if the expression does not reduce to exactly one fragment.
func thompson(re string) State {
	var nfa []State // Stack of fragments, each represented by its entry state

	// popFragment removes the top fragment and rejects operand underflow.
	popFragment := func() State {
		if len(nfa) == 0 {
			panic(errInvalidRegex)
		}
		return pop(&nfa)
	}

	for _, c := range re {
		switch {
		case isAlphaNum(c):
			// A literal is a single state whose only dangling output is itself.
			state := State{
				content:     int(c),
				transitions: make(map[int]*State),
			}
			state.output = []*State{&state}
			nfa = append(nfa, state)
		case c == CONCAT:
			s2 := popFragment()
			s1 := popFragment()
			// Wire every dangling output of the left fragment to the right one.
			for _, out := range s1.output {
				out.transitions[s2.content] = &s2
			}
			s1.output = s2.output
			nfa = append(nfa, s1)
		case c == '*':
			s1 := popFragment()
			// Loop every dangling output back to the fragment's entry state.
			for _, out := range s1.output {
				out.transitions[s1.content] = &s1
			}
			nfa = append(nfa, s1)
		case c == '|':
			s1 := popFragment()
			s2 := popFragment()
			s3 := State{
				content:     UNION,
				isEmpty:     true,
				transitions: make(map[int]*State),
			}
			s3.transitions[s1.content] = &s1
			s3.transitions[s2.content] = &s2
			// The union's dangling outputs are those of both branches, not
			// the branch entry states. Otherwise whatever follows a
			// multi-state branch would be wired to the start of that branch.
			s3.output = append(s3.output, s1.output...)
			s3.output = append(s3.output, s2.output...)
			nfa = append(nfa, s3)
		}
	}
	if len(nfa) != 1 {
		panic(errInvalidRegex)
	}

	verifyLastStates(nfa)

	return nfa[0]
}

// main converts a fixed sample pattern to postfix, prints it, builds the
// automaton and checks two properties of the resulting start state.
func main() {
	// fmt.Scanln(&re)
	re := "a(b|c)*d"
	rePostfix := shuntingYard(re)
	fmt.Println(rePostfix)
	start := thompson(rePostfix)

	assert(len(start.transitions) == 1)
	assert(len(start.transitions[UNION].transitions) == 2)
}

// isAlphaNum reports whether c is a Unicode letter or number.
func isAlphaNum(c rune) bool {
	return unicode.IsLetter(c) || unicode.IsNumber(c)
}

// assert panics with "Assertion Failed" when cond is false.
func assert(cond bool) {
	if !cond {
		panic("Assertion Failed")
	}
}

// State is a single node of the automaton built by thompson.
type State struct {
	// Contents of the current state: the rune value for a literal, UNION
	// for a union operator state.
	content int
	// If it is empty - union operator states will be empty.
	isEmpty bool
	// If it is the last state (accept state).
	isLast bool
	// The outputs of the current state, i.e. the 'outward arrows' that have
	// not been connected yet. A union operator state will have more than one
	// of these.
	output []*State
	// Transitions to different states, keyed by the content of the
	// destination state.
	transitions map[int]*State
}

// NFA pairs a start state with a list of output states. It is declared but
// not used by any code in this file.
type NFA struct {
	start   State
	outputs []State
}

// verifyLastStatesHelper performs the depth-first recursion needed for
// verifyLastStates. A state without outgoing transitions is marked as last;
// visited prevents revisiting states that lie on a cycle.
func verifyLastStatesHelper(state *State, visited map[*State]bool) {
	if len(state.transitions) == 0 {
		state.isLast = true
		return
	}
	if visited[state] {
		return
	}
	visited[state] = true
	for _, next := range state.transitions {
		if next != state { // Skip direct self-loops
			verifyLastStatesHelper(next, visited)
		}
	}
}

// verifyLastStates enables the 'isLast' flag for the leaf nodes (last states)
// reachable from start[0]. An empty slice is left untouched.
func verifyLastStates(start []State) {
	if len(start) == 0 {
		return
	}
	verifyLastStatesHelper(&start[0], make(map[*State]bool))
}

// Helper functions for slices, to make them behave more like stacks.

// peek returns the last element of s without removing it. s must not be
// empty; indexing an empty slice panics.
func peek[T any](s []T) T {
	return s[len(s)-1]
}

// pop removes the last element of *sp and returns it. *sp must not be empty;
// indexing an empty slice panics.
func pop[T any](sp *[]T) T {
	s := *sp
	last := s[len(s)-1]
	*sp = s[:len(s)-1]
	return last
}