Added support for character classes (not ranges, yet); also take input from stdin instead of cmdline arg
This commit is contained in:
		
							
								
								
									
										85
									
								
								main.go
									
									
									
									
									
								
							
							
						
						
									
										85
									
								
								main.go
									
									
									
									
									
								
							| @@ -1,6 +1,7 @@ | ||||
| package main | ||||
|  | ||||
| import ( | ||||
| 	"bufio" | ||||
| 	"fmt" | ||||
| 	"os" | ||||
| 	"slices" | ||||
| @@ -29,7 +30,7 @@ The primary benefit of this is getting rid of parentheses. | ||||
| It also inserts explicit concatenation operators to make parsing easier in Thompson's algorithm. | ||||
| See: https://blog.cernera.me/converting-regular-expressions-to-postfix-notation-with-the-shunting-yard-algorithm/ | ||||
| */ | ||||
| func shuntingYard(re string) string { | ||||
| func shuntingYard(re string) []postfixNode { | ||||
| 	re_postfix := make([]rune, 0) | ||||
| 	re_runes := []rune(re) // Convert the string to a slice of runes to allow iteration through it | ||||
| 	/* 	Add concatenation operators. | ||||
| @@ -39,8 +40,16 @@ func shuntingYard(re string) string { | ||||
| 		2. The second character isn't a 'closing operator' - one that applies to something before it | ||||
| 			a. Again, these operators can't be concatenated _to_. They can, however, be concatenated _from_. | ||||
| 	*/ | ||||
| 	for i := 0; i < len(re_runes); i++ { | ||||
| 	i := 0 | ||||
| 	for i < len(re_runes) { | ||||
| 		re_postfix = append(re_postfix, re_runes[i]) | ||||
| 		if re_runes[i] == '[' && (i == 0 || re_runes[i-1] != '\\') { // We do not touch things inside brackets, unless the opening bracket itself is escaped | ||||
| 			for re_runes[i] != ']' { | ||||
| 				i++ // Skip all characters inside brackets | ||||
| 				re_postfix = append(re_postfix, re_runes[i]) | ||||
| 			} | ||||
| 			continue | ||||
| 		} | ||||
| 		if re_runes[i] != '(' && re_runes[i] != '|' { | ||||
| 			if i < len(re_runes)-1 { | ||||
| 				if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != '+' && re_runes[i+1] != '?' && re_runes[i+1] != ')' { | ||||
| @@ -48,10 +57,11 @@ func shuntingYard(re string) string { | ||||
| 				} | ||||
| 			} | ||||
| 		} | ||||
| 		i++ | ||||
| 	} | ||||
|  | ||||
| 	opStack := make([]rune, 0)  // Operator stack | ||||
| 	outQueue := make([]rune, 0) // Output queue | ||||
| 	opStack := make([]rune, 0)         // Operator stack | ||||
| 	outQueue := make([]postfixNode, 0) // Output queue | ||||
|  | ||||
| 	// Actual algorithm | ||||
| 	for i := 0; i < len(re_postfix); i++ { | ||||
| @@ -67,7 +77,7 @@ func shuntingYard(re string) string { | ||||
| 		*/ | ||||
| 		c := re_postfix[i] | ||||
| 		if isAlphaNum(c) { | ||||
| 			outQueue = append(outQueue, c) | ||||
| 			outQueue = append(outQueue, newPostfixNode(c)) | ||||
| 			continue | ||||
| 		} | ||||
| 		// Escape character - NOT IMPLEMENTED YET - DO NOT USE | ||||
| @@ -91,13 +101,30 @@ func shuntingYard(re string) string { | ||||
| 				} else { | ||||
| 					for priority(c) <= priority(topStack) { // 2b | ||||
| 						to_append := mustPop(&opStack) | ||||
| 						outQueue = append(outQueue, to_append) | ||||
| 						outQueue = append(outQueue, newPostfixNode(to_append)) | ||||
| 						topStack, _ = peek(opStack) | ||||
| 					} | ||||
| 					opStack = append(opStack, c) | ||||
| 				} | ||||
| 			} | ||||
| 		} | ||||
| 		if c == '[' { // Used for character classes | ||||
| 			i++                      // Step forward so we can look at the character class | ||||
| 			chars := make([]rune, 0) // List of characters -  used only for character classes | ||||
| 			for i < len(re_postfix) { | ||||
| 				if re_postfix[i] == ']' { | ||||
| 					break | ||||
| 				} | ||||
| 				chars = append(chars, re_postfix[i]) | ||||
| 				i++ | ||||
| 			} | ||||
| 			if i == len(re_postfix) { // We have reached the end of the string, so we didn't encounter a closing bracket. Panic. | ||||
| 				panic("ERROR: Opening bracket without closing bracket.") | ||||
| 			} | ||||
| 			outQueue = append(outQueue, newPostfixNode(chars...)) | ||||
| 			i++ // Step forward to skip closing bracket | ||||
| 			continue | ||||
| 		} | ||||
| 		if c == '(' { | ||||
| 			opStack = append(opStack, c) | ||||
| 		} | ||||
| @@ -108,7 +135,7 @@ func shuntingYard(re string) string { | ||||
| 					panic("ERROR: Imbalanced parantheses.") | ||||
| 				} | ||||
| 				to_append := mustPop(&opStack) | ||||
| 				outQueue = append(outQueue, to_append) | ||||
| 				outQueue = append(outQueue, newPostfixNode(to_append)) | ||||
| 			} | ||||
| 			_ = mustPop(&opStack) // Get rid of opening parenthesis | ||||
| 		} | ||||
| @@ -117,52 +144,52 @@ func shuntingYard(re string) string { | ||||
| 	// Pop all remaining operators (and append to outQueue) | ||||
| 	for len(opStack) > 0 { | ||||
| 		to_append := mustPop(&opStack) | ||||
| 		outQueue = append(outQueue, to_append) | ||||
| 		outQueue = append(outQueue, newPostfixNode(to_append)) | ||||
| 	} | ||||
|  | ||||
| 	return string(outQueue) | ||||
| 	return outQueue | ||||
| } | ||||
|  | ||||
| // Thompson's algorithm. Constructs Finite-State Automaton from given string. | ||||
| // Returns start state. | ||||
| func thompson(re string) *State { | ||||
| func thompson(re []postfixNode) *State { | ||||
| 	nfa := make([]*State, 0) // Stack of states | ||||
| 	for _, c := range re { | ||||
| 		if isAlphaNum(c) { | ||||
| 		if c.nodetype == CHARACTER { | ||||
| 			state := State{} | ||||
| 			state.transitions = make(map[int][]*State) | ||||
| 			state.content = int(c) | ||||
| 			state.content = rune2Contents(c.contents) | ||||
| 			state.output = make([]*State, 0) | ||||
| 			state.output = append(state.output, &state) | ||||
| 			state.isEmpty = false | ||||
| 			nfa = append(nfa, &state) | ||||
| 		} | ||||
| 		// Must be an operator if it isn't alphanumeric | ||||
| 		switch c { | ||||
| 		case CONCAT: | ||||
| 		// Must be an operator if it isn't a character | ||||
| 		switch c.nodetype { | ||||
| 		case CONCATENATE: | ||||
| 			s2 := mustPop(&nfa) | ||||
| 			s1 := mustPop(&nfa) | ||||
| 			s1 = concatenate(s1, s2) | ||||
| 			nfa = append(nfa, s1) | ||||
| 		case '*': // Create a 0-state, concat the popped state after it, concat the 0-state after the popped state | ||||
| 		case KLEENE: // Create a 0-state, concat the popped state after it, concat the 0-state after the popped state | ||||
| 			s1 := mustPop(&nfa) | ||||
| 			stateToAdd := kleene(*s1) | ||||
| 			nfa = append(nfa, stateToAdd) | ||||
| 		case '+': // a+ is equivalent to aa* | ||||
| 		case PLUS: // a+ is equivalent to aa* | ||||
| 			s1 := mustPop(&nfa) | ||||
| 			s2 := kleene(*s1) | ||||
| 			s1 = concatenate(s1, s2) | ||||
| 			nfa = append(nfa, s1) | ||||
| 		case '?': // ab? is equivalent to a(b|) | ||||
| 		case QUESTION: // ab? is equivalent to a(b|) | ||||
| 			s1 := mustPop(&nfa) | ||||
| 			s2 := &State{} | ||||
| 			s2.transitions = make(map[int][]*State) | ||||
| 			s2.content = EPSILON | ||||
| 			s2.content = newContents(EPSILON) | ||||
| 			s2.output = append(s2.output, s2) | ||||
| 			s2.isEmpty = true | ||||
| 			s3 := alternate(s1, s2) | ||||
| 			nfa = append(nfa, s3) | ||||
| 		case '|': | ||||
| 		case PIPE: | ||||
| 			s1 := mustPop(&nfa) | ||||
| 			s2 := mustPop(&nfa) | ||||
| 			s3 := alternate(s1, s2) | ||||
| @@ -185,19 +212,28 @@ func main() { | ||||
| 	// 		a. Add explicit concatenation operators to facilitate this | ||||
| 	// 2. Build NFA from postfix representation (Thompson's algorithm) | ||||
| 	// 3. Run the string against the NFA | ||||
| 	if len(os.Args) < 3 { | ||||
| 	if len(os.Args) != 2 { | ||||
| 		fmt.Println("ERROR: Missing cmdline args") | ||||
| 		os.Exit(22) | ||||
| 	} | ||||
| 	var re string | ||||
| 	re = os.Args[1] | ||||
| 	var test_str string | ||||
| 	// Read test string from stdin | ||||
| 	reader := bufio.NewReader(os.Stdin) | ||||
| 	test_str, err := reader.ReadString('\n') | ||||
| 	if err != nil { | ||||
| 		panic(err) | ||||
| 	} | ||||
|  | ||||
| 	fmt.Scanln(&test_str) | ||||
| 	re_postfix := shuntingYard(re) | ||||
| 	// fmt.Println(re_postfix) | ||||
| 	startState := thompson(re_postfix) | ||||
| 	matchIndices := findAllMatches(startState, os.Args[2]) | ||||
| 	matchIndices := findAllMatches(startState, test_str) | ||||
| 	inColor := false | ||||
| 	if len(matchIndices) > 0 { | ||||
| 		for i, c := range os.Args[2] { | ||||
| 		for i, c := range test_str { | ||||
| 			for _, indices := range matchIndices { | ||||
| 				if i >= indices.startIdx && i < indices.endIdx { | ||||
| 					color.New(color.FgRed).Printf("%c", c) | ||||
| @@ -210,8 +246,7 @@ func main() { | ||||
| 			} | ||||
| 			inColor = false | ||||
| 		} | ||||
| 		fmt.Printf("\n") | ||||
| 	} else { | ||||
| 		fmt.Println(os.Args[2]) | ||||
| 		fmt.Print(test_str) | ||||
| 	} | ||||
| } | ||||
|   | ||||
		Reference in New Issue
	
	Block a user