Moved some auxiliary functions into compile.go; use new API for compiling and finding matches

Changed API for match-finding functions - take in a Reg instead of start state and numGroups separately
Rewrote to use new API for compiling and finding matches
2025-01-06 20:14:57 -06:00 · 2025-01-06 20:14:19 -06:00 · 2025-01-06 20:12:18 -06:00 · 2025-01-06 20:10:25 -06:00 · 2025-01-06 20:08:24 -06:00 · 2025-01-06 12:29:04 -06:00
5 changed files with 692 additions and 632 deletions
--- a/compile.go
+++ b/compile.go
@@ -0,0 +1,618 @@
 package main
 import (
 	"fmt"
 	"slices"
 	"strconv"
 	"unicode"
 )
 // A Reg represents the result of compiling a regular expression. It contains
 // the start state of the NFA representation of the regex, and the number of capturing
 // groups in the regex. Values of this type are produced by thompson (and therefore
 // by Compile).
 type Reg struct {
 	start     *State // Start state of the compiled NFA
 	numGroups int    // Number of capturing groups in the regex
 }
 // CONCAT is the rune used as the explicit concatenation operator. It is treated
 // as an operator by isOperator and has a rank in priority.
 const CONCAT rune = '~'
 // isOperator reports whether c is one of the regex operators:
 // '+', '?', '*', '|' or CONCAT.
 func isOperator(c rune) bool {
 	switch c {
 	case '+', '?', '*', '|', CONCAT:
 		return true
 	default:
 		return false
 	}
 }
 // priority returns the priority of the given operator: its position in the
 // ordering '|' < CONCAT < '+' < '*' < '?'. A rune that is not an operator
 // yields -1.
 func priority(op rune) int {
 	ranked := [...]rune{'|', CONCAT, '+', '*', '?'}
 	return slices.Index(ranked[:], op)
 }
 /*
 shuntingYard converts the given infix (regular) expression to postfix using the
 Shunting-Yard algorithm, and returns the result as a slice of postfixNodes.
 The primary benefit of this is getting rid of parentheses.
 It also inserts explicit concatenation operators to make parsing easier in Thompson's algorithm.
 The work is done in three passes:
  1. Numeric ranges (<num1-num2>) are replaced with an equivalent regex, and the '(?:' of a
     non-capturing group is replaced with NONCAPLPAREN_CHAR.
  2. Explicit CONCAT operators are inserted. Character classes are expanded and delimited with
     LBRACKET / RBRACKET; brace specifiers and lookarounds are copied through untouched.
  3. The Shunting-Yard algorithm proper converts the resulting runes into postfix nodes.
 An error can be returned for a multitude of reasons - the reason is specified in the error string.
 Malformed numeric ranges, unclosed character classes and unparseable brace counts are reported
 as errors rather than panicking.
 See: https://blog.cernera.me/converting-regular-expressions-to-postfix-notation-with-the-shunting-yard-algorithm/
 */
 func shuntingYard(re string) ([]postfixNode, error) {
 	re_postfix := make([]rune, 0)
 	// Convert the string to a slice of runes to allow iteration through it
 	re_runes_orig := []rune(re) // This is the rune slice before the first parsing loop (which detects and replaces numeric ranges)
 	re_runes := make([]rune, 0)
 	// Check for numeric range. If we are at the start of a numeric range,
 	// skip to end and construct the equivalent regex for the range.
 	// The reason this is outside the loop below, is that it actually modifies
 	// the given regex (we 'cut' the numeric range and 'paste' an equivalent regex).
 	// It also makes the overall parsing easier, since I don't have to worry about the numeric range
 	// anymore.
 	// Eventually, I might be able to add it into the main parsing loop, to reduce the time
 	// complexity.
 	// A numeric range has the syntax: <num1-num2>. It matches all numbers in this range.
 	//
 	// Also check for non-capturing groups. The LPAREN of a non-capturing group looks like this: '(?:'
 	// I take this out, and put in a special character - NONCAPLPAREN_CHAR.
 	for i := 0; i < len(re_runes_orig); i++ {
 		c := re_runes_orig[i]
 		// An opening angle bracket starts a numeric range unless it is escaped, or preceded by '?' (lookbehind)
 		if c == '<' && (i == 0 || (re_runes_orig[i-1] != '\\' && re_runes_orig[i-1] != '?')) {
 			i++ // Step over opening angle bracket
 			tmpStr := ""
 			hyphenFound := false
 			for i < len(re_runes_orig) && re_runes_orig[i] != '>' {
 				if !unicode.IsDigit(re_runes_orig[i]) {
 					if re_runes_orig[i] != '-' || (hyphenFound) {
 						return nil, fmt.Errorf("Invalid numeric range.")
 					}
 				}
 				if re_runes_orig[i] == '-' {
 					hyphenFound = true
 				}
 				tmpStr += string(re_runes_orig[i])
 				i++
 			}
 			// End of string reached and last character doesn't close the range
 			if i == len(re_runes_orig) && re_runes_orig[len(re_runes_orig)-1] != '>' {
 				return nil, fmt.Errorf("Numeric range not closed.")
 			}
 			if len(tmpStr) == 0 {
 				return nil, fmt.Errorf("Empty numeric range.")
 			}
 			// Closing bracket will be skipped when the loop variable increments
 			var rangeStart int
 			var rangeEnd int
 			// Both bounds must be present. Without this check, input such as <5> or <5-> would
 			// leave rangeEnd at zero and silently produce a bogus range.
 			if n, err := fmt.Sscanf(tmpStr, "%d-%d", &rangeStart, &rangeEnd); err != nil || n != 2 {
 				return nil, fmt.Errorf("Invalid numeric range.")
 			}
 			regex := range2regex(rangeStart, rangeEnd)
 			re_runes = append(re_runes, []rune(regex)...)
 		} else if c == '(' && i < len(re_runes_orig)-2 && re_runes_orig[i+1] == '?' && re_runes_orig[i+2] == ':' {
 			re_runes = append(re_runes, NONCAPLPAREN_CHAR)
 			i += 2
 		} else {
 			re_runes = append(re_runes, c)
 		}
 	}
 	/* 	Add concatenation operators.
 	Only add a concatenation operator between two characters if both the following conditions are met:
 		1. 	The first character isn't an opening parenthesis or alternation operator (or an escape character)
 			a. This makes sense, because these operators can't be _concatenated_ with anything else.
 		2. The second character isn't a 'closing operator' - one that applies to something before it
 			a. Again, these operators can't be concatenated _to_. They can, however, be concatenated _from_.
 	Caveats:
 		1. Don't mess with anything inside brackets - character class
 		2. Don't mess with anything inside braces - numeric repetition
 		3. Don't mess with any lookarounds.
 	*/
 	i := 0
 	for i < len(re_runes) {
 		re_postfix = append(re_postfix, re_runes[i])
 		if re_runes[i] == '[' && (i == 0 || re_runes[i-1] != '\\') { // We do not touch things inside brackets, unless they are escaped. Inside this block, the only task is to expand character ranges into their constituent characters.
 			re_postfix[len(re_postfix)-1] = LBRACKET // Replace the '[' character with LBRACKET. This allows for easier parsing of all characters (including opening and closing brackets) within the character class
 			toAppend := make([]rune, 0) // Holds all the runes in the current character class
 			if i < len(re_runes)-1 && re_runes[i+1] == '^' { // Inverting class - match everything NOT in brackets
 				re_postfix = append(re_postfix, '^')
 				i++ // Skip opening bracket and caret
 			}
 			if i < len(re_runes)-1 && re_runes[i+1] == ']' { // Nothing inside brackets - error.
 				return nil, fmt.Errorf("Empty character class.")
 			}
 			for re_runes[i] != ']' {
 				i++ // Skip all characters inside brackets
 				// Ran off the end of the regex without finding ']' - report it instead of indexing out of range
 				if i >= len(re_runes) {
 					return nil, fmt.Errorf("Opening bracket without closing bracket.")
 				}
 				// TODO: Check for escaped characters
 				// Check ahead for character range
 				if i < len(re_runes)-2 && re_runes[i+1] == '-' {
 					rangeStart := re_runes[i]
 					rangeEnd := re_runes[i+2]
 					if int(rangeEnd) < int(rangeStart) {
 						return nil, fmt.Errorf("Range is out of order.")
 					}
 					for r := rangeStart; r <= rangeEnd; r++ {
 						toAppend = append(toAppend, r)
 					}
 					i += 2 // Skip start and hyphen (end will automatically be skipped on next iteration of loop)
 					continue
 				}
 				toAppend = append(toAppend, re_runes[i])
 			}
 			// Replace the last character (which should have been ']') with RBRACKET
 			toAppend[len(toAppend)-1] = RBRACKET
 			re_postfix = append(re_postfix, toAppend...)
 		}
 		if i < len(re_runes) && re_runes[i] == '{' && (i > 0 && re_runes[i-1] != '\\') { // We don't touch things inside braces, either
 			i++ // Skip opening brace
 			for i < len(re_runes) && re_runes[i] != '}' {
 				re_postfix = append(re_postfix, re_runes[i])
 				i++
 			}
 			if i == len(re_runes) {
 				return nil, fmt.Errorf("Invalid numeric specifier.")
 			}
 			re_postfix = append(re_postfix, re_runes[i]) // Append closing brace
 		}
 		if i < len(re_runes)-3 && string(re_runes[i+1:i+4]) == "(?:" { // Non-capturing lparen
 			re_postfix = append(re_postfix, NONCAPLPAREN_CHAR)
 			i += 3
 		}
 		if i < len(re_runes) && re_runes[i] == '(' && (i == 0 || re_runes[i-1] != '\\') && (i < len(re_runes)-2 && re_runes[i+1] == '?' && slices.Contains([]rune{'=', '!', '<'}, re_runes[i+2])) { // Unescaped open parenthesis followed by question mark then '<', '!' or '=' => lookaround. Don't mess with it.
 			i++ // Step inside
 			if i == len(re_runes)-1 || (re_runes[i+1] != '=' && re_runes[i+1] != '!' && re_runes[i+1] != '<') {
 				return nil, fmt.Errorf("Invalid regex. Lookaround intended?")
 			}
 			re_postfix = append(re_postfix, re_runes[i])
 			i++
 			// Copy the lookaround body verbatim, tracking nesting so we stop at the matching ')'
 			numOpenParens := 1
 			for numOpenParens != 0 {
 				if i >= len(re_runes) {
 					return nil, fmt.Errorf("Unclosed lookaround.")
 				}
 				if re_runes[i] == '(' {
 					numOpenParens++
 				}
 				if re_runes[i] == ')' {
 					numOpenParens--
 					if numOpenParens == 0 {
 						break
 					}
 				}
 				re_postfix = append(re_postfix, re_runes[i])
 				i++
 			}
 			// i now points at the closing ')' - it is appended at the top of the next iteration
 			continue
 		}
 		if i < len(re_runes) && (re_runes[i] != '(' && re_runes[i] != NONCAPLPAREN_CHAR && re_runes[i] != '|' && re_runes[i] != '\\') || (i > 0 && re_runes[i-1] == '\\') { // Every character should be concatenated if it is escaped
 			if i < len(re_runes)-1 {
 				if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != '+' && re_runes[i+1] != '?' && re_runes[i+1] != ')' && re_runes[i+1] != '{' {
 					re_postfix = append(re_postfix, CONCAT)
 				}
 			}
 		}
 		i++
 	}
 	opStack := make([]rune, 0)         // Operator stack
 	outQueue := make([]postfixNode, 0) // Output queue
 	// Actual algorithm
 	numOpenParens := 0 // Number of open parentheses
 	for i := 0; i < len(re_postfix); i++ {
 		/* Cases:
 		1. Current character is alphanumeric - send to output queue
 		2. Current character is operator - do the following:
 			a. If current character has greater priority than top of opStack, push to opStack.
 			b. If not, keep popping from opStack (and appending to outQueue) until:
 				i. opStack is empty, OR
 				ii. current character has greater priority than top of opStack
 		3. If current character is '(' or NONCAPLPAREN_CHAR, push to opStack
 		4. If current character is ')', pop from opStack (and append to outQueue) until '(' is found. Discard parentheses.
 		5. If current character is '[', find all the characters until ']', then create a postfixNode containing all these contents. Add this node to outQueue.
 		6. If current character is '{', find the appropriate numeric specifier (range start, range end). Apply the range to the postfixNode at the end of outQueue.
 		*/
 		c := re_postfix[i]
 		if isNormalChar(c) {
 			if caseInsensitiveFlag != nil && *caseInsensitiveFlag {
 				outQueue = append(outQueue, newPostfixNode(allCases(c)...))
 			} else {
 				outQueue = append(outQueue, newPostfixNode(c))
 			}
 			continue
 		}
 		// Escape character
 		if c == '\\' { // Escape character - invert special and non-special characters eg. \( is treated as a literal parenthesis, \b is treated as word boundary
 			if i == len(re_postfix)-1 { // End of string - error, because backslash is an escape character (something needs to come after it)
 				return nil, fmt.Errorf("ERROR: Backslash with no escape character.")
 			}
 			i++
 			outQueue = append(outQueue, newEscapedNode(re_postfix[i]))
 			continue // Escaped character will automatically be skipped when loop variable increments
 		}
 		if c == '.' { // Dot metacharacter - represents 'any' character, but I am only adding Unicode 0020-007E
 			outQueue = append(outQueue, newPostfixDotNode())
 			continue
 		}
 		if c == '^' { // Start-of-string assertion
 			outQueue = append(outQueue, newPostfixNode(c))
 		}
 		if c == '$' { // End-of-string assertion
 			outQueue = append(outQueue, newPostfixNode(c))
 		}
 		// Check if we're at the start of a lookaround
 		if c == '(' && i < len(re_postfix)-1 && re_postfix[i+1] == '?' {
 			i += 2      // Skip opening paren and question mark
 			regex := "" // Stores lookaround regex
 			// This counter deliberately shadows the outer numOpenParens: it only tracks nesting inside the lookaround
 			numOpenParens := 1
 			for numOpenParens != 0 {
 				if i >= len(re_postfix) {
 					return nil, fmt.Errorf("Unclosed lookaround.")
 				}
 				if re_postfix[i] == '(' {
 					numOpenParens++
 				}
 				if re_postfix[i] == ')' {
 					numOpenParens--
 					if numOpenParens == 0 {
 						break
 					}
 				}
 				regex += string(re_postfix[i])
 				i++
 			}
 			if len(regex) <= 1 { // Nothing in regex - error
 				return nil, fmt.Errorf("Invalid lookaround. (too short?)")
 			}
 			// 'regex' should now contain the lookaround regex, plus the characters at the start (which indicate pos/neg, ahead/behind)
 			// Now we should filter that out.
 			toAppend := postfixNode{nodetype: ASSERTION, startReps: 1, endReps: 1}
 			if regex[0] == '<' { // Lookbehind
 				toAppend.lookaroundDir = LOOKBEHIND
 				regex = regex[1:]
 			} else if regex[0] == '=' || regex[0] == '!' {
 				toAppend.lookaroundDir = LOOKAHEAD
 			} else {
 				return nil, fmt.Errorf("Invalid lookaround.")
 			}
 			// Positive or negative
 			if regex[0] == '=' { // Positive
 				toAppend.lookaroundSign = POSITIVE
 				toAppend.contents = []rune(regex[1:])
 			} else if regex[0] == '!' { // Negative
 				toAppend.lookaroundSign = NEGATIVE
 				toAppend.contents = []rune(regex[1:])
 			} else {
 				return nil, fmt.Errorf("Invalid lookaround.")
 			}
 			outQueue = append(outQueue, toAppend)
 			continue
 		}
 		if isOperator(c) {
 			if len(opStack) == 0 {
 				opStack = append(opStack, c)
 			} else {
 				topStack, err := peek(opStack)
 				if err != nil {
 					return nil, fmt.Errorf("Operator without operand.")
 				}
 				if priority(c) > priority(topStack) { // 2a
 					opStack = append(opStack, c)
 				} else {
 					for priority(c) <= priority(topStack) { // 2b
 						to_append := mustPop(&opStack)
 						outQueue = append(outQueue, newPostfixNode(to_append))
 						topStack, _ = peek(opStack)
 					}
 					opStack = append(opStack, c)
 				}
 			}
 		}
 		if c == LBRACKET { // Used for character classes
 			i++ // Step forward so we can look at the character class
 			var invertMatch bool
 			if re_postfix[i] == '^' {
 				invertMatch = true
 				i++
 			}
 			chars := make([]rune, 0) // List of characters -  used only for character classes
 			for i < len(re_postfix) {
 				if re_postfix[i] == RBRACKET {
 					break
 				}
 				chars = append(chars, re_postfix[i])
 				i++
 			}
 			if i == len(re_postfix) { // We have reached the end of the string, so we didn't encounter a closing bracket. Error.
 				return nil, fmt.Errorf("Opening bracket without closing bracket.")
 			}
 			if !invertMatch {
 				outQueue = append(outQueue, newPostfixCharNode(chars...))
 			} else {
 				// Invert match - create an allChars postfixNode, then add the given states to its 'except' list.
 				toAdd := newPostfixDotNode()
 				toAdd.except = chars
 				outQueue = append(outQueue, toAdd)
 			}
 			continue
 		}
 		if c == '{' {
 			i++ // Skip opening brace
 			// Three possibilities:
 			// 1. Single number - {5}
 			// 2. Range - {3,5}
 			// 3. Start with no end, {3,}
 			startRange := make([]rune, 0)
 			startRangeNum := 0
 			endRange := make([]rune, 0)
 			endRangeNum := 0
 			for i < len(re_postfix) && unicode.IsDigit(re_postfix[i]) {
 				startRange = append(startRange, re_postfix[i])
 				i++
 			}
 			if len(startRange) == 0 { // {} is not valid, neither is {,5}
 				return nil, fmt.Errorf("Invalid numeric specifier.")
 			}
 			if i == len(re_postfix) {
 				return nil, fmt.Errorf("Brace not closed.")
 			}
 			startRangeNum, err := strconv.Atoi(string(startRange))
 			if err != nil { // e.g. a count too large for an int - report it rather than panicking
 				return nil, fmt.Errorf("Invalid numeric specifier: %w", err)
 			}
 			if re_postfix[i] == '}' { // Case 1 above
 				endRangeNum = startRangeNum
 			} else {
 				if re_postfix[i] != ',' {
 					return nil, fmt.Errorf("Invalid numeric specifier.")
 				}
 				i++ // Skip comma
 				for i < len(re_postfix) && unicode.IsDigit(re_postfix[i]) {
 					endRange = append(endRange, re_postfix[i])
 					i++
 				}
 				if i == len(re_postfix) {
 					return nil, fmt.Errorf("Brace not closed.")
 				}
 				if re_postfix[i] != '}' {
 					return nil, fmt.Errorf("Invalid numeric specifier.")
 				}
 				if len(endRange) == 0 { // Case 3 above
 					endRangeNum = INFINITE_REPS
 				} else { // Case 2 above
 					var err error
 					endRangeNum, err = strconv.Atoi(string(endRange))
 					if err != nil { // e.g. a count too large for an int - report it rather than panicking
 						return nil, fmt.Errorf("Invalid numeric specifier: %w", err)
 					}
 				}
 			}
 			idx := len(outQueue) - 1
 			// Get the last added node
 			if idx < 0 || outQueue[idx].nodetype == LPAREN {
 				return nil, fmt.Errorf("Numeric specifier with no content.")
 			}
 			outQueue[idx].startReps = startRangeNum
 			outQueue[idx].endReps = endRangeNum
 		}
 		if c == '(' || c == NONCAPLPAREN_CHAR {
 			opStack = append(opStack, c)
 			if c == '(' { // We only push _capturing_ group parentheses to outQueue
 				outQueue = append(outQueue, newPostfixNode(c))
 			}
 			numOpenParens++
 		}
 		if c == ')' {
 			// Keep popping from opStack until we encounter an opening parenthesis or a NONCAPLPAREN_CHAR. Error if we reach the end of the stack.
 			var val rune
 			var err error
 			for val, err = peek(opStack); val != '(' && val != NONCAPLPAREN_CHAR; val, err = peek(opStack) {
 				if err != nil {
 					return nil, fmt.Errorf("Imbalanced parantheses.")
 				}
 				to_append := mustPop(&opStack)
 				outQueue = append(outQueue, newPostfixNode(to_append))
 			}
 			_ = mustPop(&opStack) // Get rid of opening parentheses
 			if val == '(' {       // Whatever was inside the parentheses was a _capturing_ group, so we append the closing parentheses as well
 				outQueue = append(outQueue, newPostfixNode(')')) // Add closing parentheses
 			}
 			numOpenParens--
 		}
 	}
 	// Pop all remaining operators (and append to outQueue)
 	for len(opStack) > 0 {
 		to_append := mustPop(&opStack)
 		outQueue = append(outQueue, newPostfixNode(to_append))
 	}
 	if numOpenParens != 0 {
 		return nil, fmt.Errorf("Imbalanced parantheses.")
 	}
 	return outQueue, nil
 }
 // Thompson's algorithm. Constructs Finite-State Automaton from given string.
 // Returns start state and number of groups in regex.
 //
 // The input is the postfix node list produced by shuntingYard. Nodes are processed
 // left to right using 'nfa' as a stack of partial automata: character and assertion
 // nodes push a new state, while operator nodes pop their operand(s), combine them and
 // push the result.
 //
 // The returned Reg holds the single state left on the stack and the number of
 // capturing groups seen. An error is returned if:
 //   - a lookaround's inner regex fails to parse or compile,
 //   - a numeric specifier has a start greater than its end, or
 //   - the stack does not contain exactly one state once all nodes are consumed.
 //
 // NOTE(review): operands are fetched with mustPop, which presumably panics on an empty
 // stack - confirm against its definition.
 func thompson(re []postfixNode) (Reg, error) {
 	nfa := make([]*State, 0) // Stack of states
 	numGroups := 0           // Number of capturing groups
 	for _, c := range re {
 		// Characters and assertions each become a fresh state that is pushed onto the stack
 		if c.nodetype == CHARACTER || c.nodetype == ASSERTION {
 			state := State{}
 			state.transitions = make(map[int][]*State)
 			if c.allChars {
 				state.allChars = true
 				if len(c.except) != 0 {
 					// Copy the exception list so the state doesn't share a backing array with the node
 					state.except = append([]rune{}, c.except...)
 				}
 			}
 			state.content = rune2Contents(c.contents)
 			state.output = make([]*State, 0)
 			state.output = append(state.output, &state) // A lone state is its own output
 			state.isEmpty = false
 			if c.nodetype == ASSERTION {
 				state.isEmpty = true                 // This is a little weird. A lookaround has the 'isEmpty' flag set, even though it _isn't_ empty (the contents are the regex). But, there's so much error-checking that relies on this flag that it's better to keep it this way.
 				state.content = newContents(EPSILON) // Ideally, an assertion shouldn't have any content, since it doesn't say anything about the content of string
 				// No lookaround direction / sign set => a simple assertion, identified by its first content rune
 				if c.lookaroundDir == 0 || c.lookaroundSign == 0 {
 					switch c.contents[0] {
 					case '^':
 						state.assert = SOS
 					case '$':
 						state.assert = EOS
 					case 'b':
 						state.assert = WBOUND
 					case 'B':
 						state.assert = NONWBOUND
 					}
 				} else { // Lookaround
 					state.lookaroundRegex = string(c.contents)
 					if c.lookaroundDir == LOOKAHEAD {
 						if c.lookaroundSign == POSITIVE {
 							state.assert = PLA
 						}
 						if c.lookaroundSign == NEGATIVE {
 							state.assert = NLA
 						}
 					}
 					if c.lookaroundDir == LOOKBEHIND {
 						if c.lookaroundSign == POSITIVE {
 							state.assert = PLB
 						}
 						if c.lookaroundSign == NEGATIVE {
 							state.assert = NLB
 						}
 					}
 					// The lookaround's own regex is compiled recursively into a separate NFA
 					tmpRe, err := shuntingYard(state.lookaroundRegex)
 					if err != nil {
 						return Reg{}, fmt.Errorf("Error parsing lookaround: %w", err)
 					}
 					reg, err := thompson(tmpRe)
 					if err != nil {
 						return Reg{}, fmt.Errorf("Error compiling lookaround: %w", err)
 					}
 					state.lookaroundNFA = reg.start
 					state.lookaroundNumCaptureGroups = reg.numGroups
 				}
 			}
 			nfa = append(nfa, &state)
 		}
 		// Group delimiters become empty (epsilon) states that mark where a capturing group begins / ends
 		if c.nodetype == LPAREN || c.nodetype == RPAREN {
 			s := &State{}
 			s.assert = NONE
 			s.content = newContents(EPSILON)
 			s.isEmpty = true
 			s.output = make([]*State, 0)
 			s.output = append(s.output, s)
 			s.transitions = make(map[int][]*State)
 			// LPAREN nodes are just added normally
 			if c.nodetype == LPAREN {
 				numGroups++
 				s.groupBegin = true
 				s.groupNum = numGroups
 				nfa = append(nfa, s)
 				continue
 			}
 			// For RPAREN nodes, I assume that the last two nodes in the list are an LPAREN,
 			// and then some other node.
 			// These three nodes (LPAREN, the middle node and RPAREN) are extracted together, concatenated
 			// and added back in.
 			if c.nodetype == RPAREN {
 				s.groupEnd = true
 				middleNode := mustPop(&nfa)
 				lparenNode := mustPop(&nfa)
 				s.groupNum = lparenNode.groupNum // The closing state shares the group number of its opening state
 				tmp := concatenate(lparenNode, middleNode)
 				to_add := concatenate(tmp, s)
 				nfa = append(nfa, to_add)
 			}
 		}
 		// Must be an operator if it isn't a character
 		switch c.nodetype {
 		case CONCATENATE:
 			// Operands come off the stack in reverse order: s2 was pushed last
 			s2 := mustPop(&nfa)
 			s1 := mustPop(&nfa)
 			s1 = concatenate(s1, s2)
 			nfa = append(nfa, s1)
 		case KLEENE: // Create a 0-state, concat the popped state after it, concat the 0-state after the popped state
 			s1 := mustPop(&nfa)
 			stateToAdd := kleene(*s1)
 			nfa = append(nfa, stateToAdd)
 		case PLUS: // a+ is equivalent to aa*
 			s1 := mustPop(&nfa)
 			s2 := kleene(*s1)
 			s1 = concatenate(s1, s2)
 			nfa = append(nfa, s1)
 		case QUESTION: // ab? is equivalent to a(b|)
 			s1 := mustPop(&nfa)
 			s2 := question(s1)
 			nfa = append(nfa, s2)
 		case PIPE:
 			s1 := mustPop(&nfa)
 			s2 := mustPop(&nfa)
 			s3 := alternate(s1, s2)
 			nfa = append(nfa, s3)
 		}
 		if c.startReps != 1 || c.endReps != 1 { // Must have a numeric specifier attached to it
 			// NOTE(review): -1 is presumably the value of INFINITE_REPS (used below) - confirm
 			if c.endReps != -1 && c.endReps < c.startReps {
 				return Reg{}, fmt.Errorf("Numeric specifier - start greater than end.")
 			}
 			// The state just pushed for this node is replaced by its repeated form
 			state := mustPop(&nfa)
 			var stateToAdd *State = nil
 			// Take advantage of the following facts:
 			// a{5} == aaaaa
 			// a{3,5} == aaaa?a?
 			// a{5,} == aaaaa+
 			// Nov. 3 2024 - I have two choices on how I want to implement numeric
 			// specifiers.
 			// a. Encode the logic while creating the states. I will have to create a function
 			// that creates a deep-copy of a given state / NFA, so that I can concatenate them to
 			// each other (concatenating them with the 'concatenate' method - which takes addresses - does
 			// not work). Creating this function might be a lot of work.
 			// b. Encode the logic while parsing the string (shunting-yard). If I can expand the numeric specifier
 			// at this point, I can leave thompson untouched.
 			for i := 0; i < c.startReps; i++ { // Case 1
 				stateToAdd = concatenate(stateToAdd, cloneState(state))
 			}
 			if c.endReps == INFINITE_REPS { // Case 3
 				s2 := kleene(*state)
 				stateToAdd = concatenate(stateToAdd, s2)
 			} else { // Case 2
 				// One optional copy for every repetition between startReps and endReps
 				for i := c.startReps; i < c.endReps; i++ {
 					stateToAdd = concatenate(stateToAdd, question(state))
 				}
 			}
 			nfa = append(nfa, stateToAdd)
 		}
 	}
 	// A well-formed regex reduces to exactly one automaton on the stack
 	if len(nfa) != 1 {
 		return Reg{}, fmt.Errorf("Invalid Regex.")
 	}
 	verifyLastStates(nfa)
 	return Reg{nfa[0], numGroups}, nil
 }
 // Compile turns the given regular expression into a Reg, suitable for use with the
 // matching functions. It runs shuntingYard to obtain the postfix form of the regex,
 // then thompson to build the NFA from it.
 //
 // The returned error is non-nil if either stage fails; it wraps the underlying error.
 // In that case the returned Reg is the zero value and must not be used, so the
 // error must be checked first.
 func Compile(re string) (Reg, error) {
 	postfix, parseErr := shuntingYard(re)
 	if parseErr != nil {
 		return Reg{}, fmt.Errorf("Error parsing regex: %w", parseErr)
 	}
 	compiled, compileErr := thompson(postfix)
 	if compileErr != nil {
 		return Reg{}, fmt.Errorf("Error compiling regex: %w", compileErr)
 	}
 	return compiled, nil
 }
--- a/main.go
+++ b/main.go
@@ -6,596 +6,13 @@ import (
 	"fmt"
 	"io"
 	"os"
 	"slices"
 	"strconv"
 	"unicode"
 	"github.com/fatih/color"
 )
 // CONCAT is the rune that stands for the explicit concatenation operator.
 const CONCAT rune = '~'
 var notDotChars []rune
 var caseInsensitiveFlag *bool // Whether we are running in case-insensitive mode
 // isOperator reports whether c is a regex operator, i.e. one of
 // CONCAT, '|', '*', '?' or '+'.
 func isOperator(c rune) bool {
 	return c == '+' || c == '?' || c == '*' || c == '|' || c == CONCAT
 }
 // priority returns the priority of the given operator, which is its index in the
 // list below (lowest priority first). Runes that are not operators yield -1.
 func priority(op rune) int {
 	lowestFirst := [5]rune{'|', CONCAT, '+', '*', '?'}
 	return slices.Index(lowestFirst[:], op)
 }
 /*
 The Shunting-Yard algorithm is used to convert the given infix (regeular) expression to postfix.
 The primary benefit of this is getting rid of parentheses.
 It also inserts explicit concatenation operators to make parsing easier in Thompson's algorithm.
 See: https://blog.cernera.me/converting-regular-expressions-to-postfix-notation-with-the-shunting-yard-algorithm/
 */
 func shuntingYard(re string) []postfixNode {
 	re_postfix := make([]rune, 0)
 	// Convert the string to a slice of runes to allow iteration through it
 	re_runes_orig := []rune(re) // This is the rune slice before the first parsing loop (which detects and replaces numeric ranges)
 	re_runes := make([]rune, 0)
 	// Check for numeric range. If we are at the start of a numeric range,
 	// skip to end and construct the equivalent regex for the range.
 	// The reason this is outside the loop below, is that it actually modifies
 	// the given regex (we 'cut' the numeric range and 'paste' an equivalent regex).
 	// It also makes the overall parsing easier, since I don't have to worry about the numeric range
 	// anymore.
 	// Eventually, I might be able to add it into the main parsing loop, to reduce the time
 	// complexity.
 	// A numeric range has the syntax: <num1-num2>. Ir matches all numbers in this range.
 	//
 	// Also check for non-capturing groups. The LPAREN of a non-capturing group looks like this: '(?:'
 	// I take this out, and put in a special character - NONCAPLPAREN_CHAR.
 	for i := 0; i < len(re_runes_orig); i++ {
 		c := re_runes_orig[i]
 		if c == '<' && (i == 0 || (re_runes_orig[i-1] != '\\' && re_runes_orig[i-1] != '?')) {
 			i++ // Step over opening angle bracket
 			tmpStr := ""
 			hyphenFound := false
 			for i < len(re_runes_orig) && re_runes_orig[i] != '>' {
 				if !unicode.IsDigit(re_runes_orig[i]) {
 					if re_runes_orig[i] != '-' || (hyphenFound) {
 						panic("ERROR: Invalid numeric range.")
 					}
 				}
 				if re_runes_orig[i] == '-' {
 					hyphenFound = true
 				}
 				tmpStr += string(re_runes_orig[i])
 				i++
 			}
 			// End of string reached and last character doesn't close the range
 			if i == len(re_runes_orig) && re_runes_orig[len(re_runes_orig)-1] != '>' {
 				panic("ERROR: Numeric range not closed.")
 			}
 			if len(tmpStr) == 0 {
 				panic("ERROR: Empty numeric range.")
 			}
 			// Closing bracket will be skipped when the loop variable increments
 			var rangeStart int
 			var rangeEnd int
 			fmt.Sscanf(tmpStr, "%d-%d", &rangeStart, &rangeEnd)
 			regex := range2regex(rangeStart, rangeEnd)
 			re_runes = append(re_runes, []rune(regex)...)
 		} else if c == '(' && i < len(re_runes_orig)-2 && re_runes_orig[i+1] == '?' && re_runes_orig[i+2] == ':' {
 			re_runes = append(re_runes, NONCAPLPAREN_CHAR)
 			i += 2
 		} else {
 			re_runes = append(re_runes, c)
 		}
 	}
 	/* 	Add concatenation operators.
 	Only add a concatenation operator between two characters if both the following conditions are met:
 		1. 	The first character isn't an opening parantheses or alteration operator (or an escape character)
 			a. This makes sense, because these operators can't be _concatenated_ with anything else.
 		2. The second character isn't a 'closing operator' - one that applies to something before it
 			a. Again, these operators can'be concatenated _to_. They can, however, be concatenated _from_.
 	Caveats:
 		1. Don't mess with anything inside brackets - character class
 		2. Don't mess with anything inside braces - numeric repetition
 		3. Don't mess with any lookarounds.
 	*/
 	i := 0
 	for i < len(re_runes) {
 		re_postfix = append(re_postfix, re_runes[i])
 		if re_runes[i] == '[' && (i == 0 || re_runes[i-1] != '\\') { // We do not touch things inside brackets, unless they are escaped. Inside this block, the only task is to expand character ranges into their constituent characters.
 			re_postfix[len(re_postfix)-1] = LBRACKET         // Replace the '[' character with LBRACKET. This allows for easier parsing of all characters (including opening and closing brackets) within the character class
 			toAppend := make([]rune, 0)                      // Holds all the runes in the current character class
 			if i < len(re_runes)-1 && re_runes[i+1] == '^' { // Inverting class - match everything NOT in brackets
 				re_postfix = append(re_postfix, '^')
 				i++ // Skip opening bracket and caret
 			}
 			if i < len(re_runes)-1 && re_runes[i+1] == ']' { // Nothing inside brackets - panic.
 				panic("Empty character class.")
 			}
 			for re_runes[i] != ']' {
 				i++ // Skip all characters inside brackets
 				// TODO: Check for escaped characters
 				// Check ahead for character range
 				if i < len(re_runes)-2 && re_runes[i+1] == '-' {
 					rangeStart := re_runes[i]
 					rangeEnd := re_runes[i+2]
 					if int(rangeEnd) < int(rangeStart) {
 						panic("Range is out of order.")
 					}
 					for i := rangeStart; i <= rangeEnd; i++ {
 						toAppend = append(toAppend, i)
 					}
 					i += 2 // Skip start and hyphen (end will automatically be skipped on next iteration of loop)
 					continue
 				}
 				toAppend = append(toAppend, re_runes[i])
 			}
 			// Replace the last character (which should have been ']', with RBRACKET
 			toAppend[len(toAppend)-1] = RBRACKET
 			re_postfix = append(re_postfix, toAppend...)
 		}
 		if i < len(re_runes) && re_runes[i] == '{' && (i > 0 && re_runes[i-1] != '\\') { // We don't touch things inside braces, either
 			i++ // Skip opening brace
 			for i < len(re_runes) && re_runes[i] != '}' {
 				re_postfix = append(re_postfix, re_runes[i])
 				i++
 			}
 			if i == len(re_runes) {
 				panic("Invalid numeric specifier.")
 			}
 			re_postfix = append(re_postfix, re_runes[i]) // Append closing brace
 		}
 		if i < len(re_runes)-3 && string(re_runes[i+1:i+4]) == "(?:" { // Non-capturing lparen
 			re_postfix = append(re_postfix, NONCAPLPAREN_CHAR)
 			i += 3
 		}
 		if i < len(re_runes) && re_runes[i] == '(' && (i == 0 || re_runes[i-1] != '\\') && (i < len(re_runes)-2 && re_runes[i+1] == '?' && slices.Contains([]rune{'=', '!', '<'}, re_runes[i+2])) { // Unescaped open parentheses followed by question mark then '<', '!' or '=' => lokaround. Don't mess with it.
 			i++ // Step inside
 			if i == len(re_runes)-1 || (re_runes[i+1] != '=' && re_runes[i+1] != '!' && re_runes[i+1] != '<') {
 				panic("Invalid regex. Lookaround intended?")
 			}
 			re_postfix = append(re_postfix, re_runes[i])
 			i++
 			numOpenParens := 1
 			for numOpenParens != 0 {
 				if i >= len(re_runes) {
 					panic("Unclosed lookaround.")
 				}
 				if re_runes[i] == '(' {
 					numOpenParens++
 				}
 				if re_runes[i] == ')' {
 					numOpenParens--
 					if numOpenParens == 0 {
 						break
 					}
 				}
 				re_postfix = append(re_postfix, re_runes[i])
 				i++
 			}
 			continue
 		}
 		if i < len(re_runes) && (re_runes[i] != '(' && re_runes[i] != NONCAPLPAREN_CHAR && re_runes[i] != '|' && re_runes[i] != '\\') || (i > 0 && re_runes[i-1] == '\\') { // Every character should be concatenated if it is escaped
 			if i < len(re_runes)-1 {
 				if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != '+' && re_runes[i+1] != '?' && re_runes[i+1] != ')' && re_runes[i+1] != '{' {
 					re_postfix = append(re_postfix, CONCAT)
 				}
 			}
 		}
 		i++
 	}
 	opStack := make([]rune, 0)         // Operator stack
 	outQueue := make([]postfixNode, 0) // Output queue
 	// Actual algorithm
 	numOpenParens := 0 // Number of open parentheses
 	for i := 0; i < len(re_postfix); i++ {
 		/* Cases:
 		1. Current character is alphanumeric - send to output queue
 		2. Current character is operator - do the following:
 			a. If current character has greater priority than top of opStack, push to opStack.
 			b. If not, keep popping from opStack (and appending to outQueue) until:
 				i. opStack is empty, OR
 				ii. current character has greater priority than top of opStack
 		3. If current character is '(' or NONCAPLPAREN_CHAR, push to opStack
 		4. If current character is ')', pop from opStack (and append to outQueue) until '(' is found. Discard parentheses.
 		5. If current character is '[', find all the characters until ']', then create a postfixNode containing all these contents. Add this node to outQueue.
 		6. If current character is '{', find the appropriate numeric specifier (range start, range end). Apply the range to the postfixNode at the end of outQueue.
 		*/
 		c := re_postfix[i]
 		if isNormalChar(c) {
 			if caseInsensitiveFlag != nil && *caseInsensitiveFlag {
 				outQueue = append(outQueue, newPostfixNode(allCases(c)...))
 			} else {
 				outQueue = append(outQueue, newPostfixNode(c))
 			}
 			continue
 		}
 		// Escape character
 		if c == '\\' { // Escape character - invert special and non-special characters eg. \( is treated as a literal parentheses, \b is treated as word boundary
 			if i == len(re_postfix)-1 { // End of string - panic, because backslash is an escape character (something needs to come after it)
 				panic("ERROR: Backslash with no escape character.")
 			}
 			i++
 			outQueue = append(outQueue, newEscapedNode(re_postfix[i]))
 			continue // Escaped character will automatically be skipped when loop variable increments
 		}
 		if c == '.' { // Dot metacharacter - represents 'any' character, but I am only adding Unicode 0020-007E
 			outQueue = append(outQueue, newPostfixDotNode())
 			continue
 		}
 		if c == '^' { // Start-of-string assertion
 			outQueue = append(outQueue, newPostfixNode(c))
 		}
 		if c == '$' { // End-of-string assertion
 			outQueue = append(outQueue, newPostfixNode(c))
 		}
 		// Check if we're at the start of a lookaround
 		if c == '(' && i < len(re_postfix)-1 && re_postfix[i+1] == '?' {
 			i += 2      // Skip opening paren and question mark
 			regex := "" // Stores lookaround regex
 			numOpenParens := 1
 			for numOpenParens != 0 {
 				if i >= len(re_postfix) {
 					panic("Unclosed lookaround.")
 				}
 				if re_postfix[i] == '(' {
 					numOpenParens++
 				}
 				if re_postfix[i] == ')' {
 					numOpenParens--
 					if numOpenParens == 0 {
 						break
 					}
 				}
 				regex += string(re_postfix[i])
 				i++
 			}
 			if len(regex) <= 1 { // Nothing in regex - panic
 				panic("Invalid lookaround. (too short?)")
 			}
 			// 'regex' should now contain the lookaround regex, plus the characters at the start (which indicate pos/neg, ahead/behind)
 			// Now we should filter that out.
 			toAppend := postfixNode{nodetype: ASSERTION, startReps: 1, endReps: 1}
 			if regex[0] == '<' { // Lookbehind
 				toAppend.lookaroundDir = LOOKBEHIND
 				regex = regex[1:]
 			} else if regex[0] == '=' || regex[0] == '!' {
 				toAppend.lookaroundDir = LOOKAHEAD
 			} else {
 				panic("Invalid lookaround.")
 			}
 			// Positive or negative
 			if regex[0] == '=' { // Positive
 				toAppend.lookaroundSign = POSITIVE
 				toAppend.contents = []rune(regex[1:])
 			} else if regex[0] == '!' { // Negative
 				toAppend.lookaroundSign = NEGATIVE
 				toAppend.contents = []rune(regex[1:])
 			} else {
 				panic("Invalid lookaround.")
 			}
 			outQueue = append(outQueue, toAppend)
 			continue
 		}
 		if isOperator(c) {
 			if len(opStack) == 0 {
 				opStack = append(opStack, c)
 			} else {
 				topStack, err := peek(opStack)
 				if err != nil {
 					panic("ERROR: Operator without operand.")
 				}
 				if priority(c) > priority(topStack) { // 2a
 					opStack = append(opStack, c)
 				} else {
 					for priority(c) <= priority(topStack) { // 2b
 						to_append := mustPop(&opStack)
 						outQueue = append(outQueue, newPostfixNode(to_append))
 						topStack, _ = peek(opStack)
 					}
 					opStack = append(opStack, c)
 				}
 			}
 		}
 		if c == LBRACKET { // Used for character classes
 			i++ // Step forward so we can look at the character class
 			var invertMatch bool
 			if re_postfix[i] == '^' {
 				invertMatch = true
 				i++
 			}
 			chars := make([]rune, 0) // List of characters -  used only for character classes
 			for i < len(re_postfix) {
 				if re_postfix[i] == RBRACKET {
 					break
 				}
 				chars = append(chars, re_postfix[i])
 				i++
 			}
 			if i == len(re_postfix) { // We have reached the end of the string, so we didn't encounter a closing brakcet. Panic.
 				panic("ERROR: Opening bracket without closing bracket.")
 			}
 			if !invertMatch {
 				outQueue = append(outQueue, newPostfixCharNode(chars...))
 			} else {
 				// Invert match - create an allChars postfixNode, then add the given states to its 'except' list.
 				toAdd := newPostfixDotNode()
 				toAdd.except = chars
 				outQueue = append(outQueue, toAdd)
 			}
 			continue
 		}
 		if c == '{' {
 			i++ // Skip opening brace
 			// Three possibilities:
 			// 1. Single number - {5}
 			// 2. Range - {3,5}
 			// 3. Start with no end, {3,}
 			startRange := make([]rune, 0)
 			startRangeNum := 0
 			endRange := make([]rune, 0)
 			endRangeNum := 0
 			for i < len(re_postfix) && unicode.IsDigit(re_postfix[i]) {
 				startRange = append(startRange, re_postfix[i])
 				i++
 			}
 			if len(startRange) == 0 { // {} is not valid, neither is {,5}
 				panic("ERROR: Invalid numeric specifier.")
 			}
 			if i == len(re_postfix) {
 				panic("ERROR: Brace not closed.")
 			}
 			startRangeNum, err := strconv.Atoi(string(startRange))
 			if err != nil {
 				panic(err)
 			}
 			if re_postfix[i] == '}' { // Case 1 above
 				endRangeNum = startRangeNum
 			} else {
 				if re_postfix[i] != ',' {
 					panic("ERROR: Invalid numeric specifier.")
 				}
 				i++ // Skip comma
 				for i < len(re_postfix) && unicode.IsDigit(re_postfix[i]) {
 					endRange = append(endRange, re_postfix[i])
 					i++
 				}
 				if i == len(re_postfix) {
 					panic("ERROR: Brace not closed.")
 				}
 				if re_postfix[i] != '}' {
 					panic("ERROR: Invalid numeric specifier.")
 				}
 				if len(endRange) == 0 { // Case 3 above
 					endRangeNum = INFINITE_REPS
 				} else { // Case 2 above
 					var err error
 					endRangeNum, err = strconv.Atoi(string(endRange))
 					if err != nil {
 						panic(err)
 					}
 				}
 			}
 			idx := len(outQueue) - 1
 			// Get the last added node
 			if idx < 0 || outQueue[idx].nodetype == LPAREN {
 				panic("Numeric specifier with no content.")
 			}
 			outQueue[idx].startReps = startRangeNum
 			outQueue[idx].endReps = endRangeNum
 		}
 		if c == '(' || c == NONCAPLPAREN_CHAR {
 			opStack = append(opStack, c)
 			if c == '(' { // We only push _capturing_ group parentheses to outQueue
 				outQueue = append(outQueue, newPostfixNode(c))
 			}
 			numOpenParens++
 		}
 		if c == ')' {
 			// Keep popping from opStack until we encounter an opening parantheses or a NONCAPLPAREN_CHAR. Panic if we reach the end of the stack.
 			var val rune
 			var err error
 			for val, err = peek(opStack); val != '(' && val != NONCAPLPAREN_CHAR; val, err = peek(opStack) {
 				if err != nil {
 					panic("ERROR: Imbalanced parantheses.")
 				}
 				to_append := mustPop(&opStack)
 				outQueue = append(outQueue, newPostfixNode(to_append))
 			}
 			_ = mustPop(&opStack) // Get rid of opening parentheses
 			if val == '(' {       // Whatever was inside the parentheses was a _capturing_ group, so we append the closing parentheses as well
 				outQueue = append(outQueue, newPostfixNode(')')) // Add closing parentheses
 			}
 			numOpenParens--
 		}
 	}
 	// Pop all remaining operators (and append to outQueue)
 	for len(opStack) > 0 {
 		to_append := mustPop(&opStack)
 		outQueue = append(outQueue, newPostfixNode(to_append))
 	}
 	if numOpenParens != 0 {
 		panic("ERROR: Imbalanced parantheses.")
 	}
 	return outQueue
 }
 // thompson builds an NFA from a regex given in postfix form, following
 // Thompson's construction. A stack of partially-built fragments is kept:
 // character, assertion and group nodes push a fragment, while operator nodes
 // pop their operands and push the combined fragment.
 //
 // It returns the start state of the finished NFA and the number of capturing
 // groups that were encountered. It panics if a numeric specifier has a start
 // greater than its end, or if the stack does not contain exactly one fragment
 // after every node has been processed.
 func thompson(re []postfixNode) (*State, int) {
 	stack := make([]*State, 0) // Fragments built so far
 	groupCount := 0            // Capturing groups seen so far
 	for _, node := range re {
 		switch node.nodetype {
 		case CHARACTER, ASSERTION:
 			leaf := &State{}
 			leaf.transitions = make(map[int][]*State)
 			if node.allChars {
 				leaf.allChars = true
 				if len(node.except) != 0 {
 					// Copy the exclusion list so the state does not share it with the node.
 					leaf.except = append([]rune{}, node.except...)
 				}
 			}
 			leaf.content = rune2Contents(node.contents)
 			leaf.output = []*State{leaf}
 			leaf.isEmpty = false
 			if node.nodetype == ASSERTION {
 				// Assertions are flagged as empty and carry epsilon content: they
 				// constrain the position in the string, not the characters consumed.
 				leaf.isEmpty = true
 				leaf.content = newContents(EPSILON)
 				if node.lookaroundDir != 0 && node.lookaroundSign != 0 {
 					// Lookaround: the node's contents are themselves a regex.
 					leaf.lookaroundRegex = string(node.contents)
 					switch {
 					case node.lookaroundDir == LOOKAHEAD && node.lookaroundSign == POSITIVE:
 						leaf.assert = PLA
 					case node.lookaroundDir == LOOKAHEAD && node.lookaroundSign == NEGATIVE:
 						leaf.assert = NLA
 					case node.lookaroundDir == LOOKBEHIND && node.lookaroundSign == POSITIVE:
 						leaf.assert = PLB
 					case node.lookaroundDir == LOOKBEHIND && node.lookaroundSign == NEGATIVE:
 						leaf.assert = NLB
 					}
 					// Compile the inner regex into its own NFA, stored on this state.
 					innerPostfix := shuntingYard(leaf.lookaroundRegex)
 					leaf.lookaroundNFA, leaf.lookaroundNumCaptureGroups = thompson(innerPostfix)
 				} else {
 					// Plain assertion, identified by its first content rune.
 					switch node.contents[0] {
 					case '^':
 						leaf.assert = SOS
 					case '$':
 						leaf.assert = EOS
 					case 'b':
 						leaf.assert = WBOUND
 					case 'B':
 						leaf.assert = NONWBOUND
 					}
 				}
 			}
 			stack = append(stack, leaf)
 		case LPAREN, RPAREN:
 			// Both group delimiters become empty (epsilon) marker states.
 			marker := &State{
 				assert:      NONE,
 				content:     newContents(EPSILON),
 				isEmpty:     true,
 				transitions: make(map[int][]*State),
 			}
 			marker.output = []*State{marker}
 			if node.nodetype == LPAREN {
 				// Opening markers are simply pushed; numbering starts at 1.
 				groupCount++
 				marker.groupBegin = true
 				marker.groupNum = groupCount
 				stack = append(stack, marker)
 				continue
 			}
 			// Closing marker: the top two fragments are assumed to be the group's
 			// body and, beneath it, the matching opening marker. The three are
 			// joined into one fragment and pushed back.
 			marker.groupEnd = true
 			body := mustPop(&stack)
 			opener := mustPop(&stack)
 			marker.groupNum = opener.groupNum
 			stack = append(stack, concatenate(concatenate(opener, body), marker))
 		case CONCATENATE:
 			second := mustPop(&stack)
 			first := mustPop(&stack)
 			stack = append(stack, concatenate(first, second))
 		case KLEENE:
 			operand := mustPop(&stack)
 			stack = append(stack, kleene(*operand))
 		case PLUS: // a+ is equivalent to aa*
 			operand := mustPop(&stack)
 			starred := kleene(*operand)
 			stack = append(stack, concatenate(operand, starred))
 		case QUESTION: // ab? is equivalent to a(b|)
 			operand := mustPop(&stack)
 			stack = append(stack, question(operand))
 		case PIPE:
 			top := mustPop(&stack)
 			below := mustPop(&stack)
 			stack = append(stack, alternate(top, below))
 		}
 		// Anything other than exactly one repetition means a numeric specifier
 		// is attached to this node.
 		if node.startReps == 1 && node.endReps == 1 {
 			continue
 		}
 		if node.endReps != -1 && node.endReps < node.startReps {
 			panic("ERROR: Numeric specifier - start greater than end.")
 		}
 		base := mustPop(&stack)
 		var repeated *State
 		// The specifier is expanded using these identities:
 		//   a{5}   == aaaaa
 		//   a{3,5} == aaaa?a?
 		//   a{5,}  == aaaaa+
 		// Mandatory part: startReps clones of the fragment, chained together.
 		for n := 0; n < node.startReps; n++ {
 			repeated = concatenate(repeated, cloneState(base))
 		}
 		if node.endReps == INFINITE_REPS {
 			// Open-ended range: follow the mandatory part with a starred copy.
 			repeated = concatenate(repeated, kleene(*base))
 		} else {
 			// Bounded range: one optional copy per repetition beyond the start.
 			for n := node.startReps; n < node.endReps; n++ {
 				repeated = concatenate(repeated, question(base))
 			}
 		}
 		stack = append(stack, repeated)
 	}
 	if len(stack) != 1 {
 		panic("ERROR: Invalid Regex.")
 	}
 	verifyLastStates(stack)
 	return stack[0], groupCount
 }
 func main() {
 	invertFlag := flag.Bool("v", false, "Invert match.")
 	// This flag has two 'modes':
@@ -650,7 +67,6 @@ func main() {
 	var re string
 	re = flag.Args()[0]
 	var test_str string
 	var test_runes []rune // Rune-slice representation of test_str
 	var err error
 	var linesRead bool // Whether or not we have read the lines in the file
 	lineNum := 0       // Current line number
@@ -658,8 +74,11 @@ func main() {
 	reader := bufio.NewReader(os.Stdin)
 	out := bufio.NewWriter(os.Stdout)
-	re_postfix := shuntingYard(re)
+	regComp, err := Compile(re)
-	startState, numGroups := thompson(re_postfix)
+	if err != nil {
 		fmt.Println(err)
 		return
 	}
 	for true {
 		if linesRead {
 			break
@@ -696,12 +115,14 @@ func main() {
 				panic(err)
 			}
 		}
-		test_runes = []rune(test_str)
+		matchIndices := make([]Match, 0)
-		matchIndices := findAllMatches(startState, test_runes, numGroups)
+		if matchNumFlagEnabled {
-
+			tmp, err := findNthMatch(regComp, test_str, *matchNum)
-		// If we are trying to print an invalid index, we just assume no specific matches will be printed.
+			if err == nil {
-		if matchNumFlagEnabled && *matchNum > len(matchIndices) {
+				matchIndices = append(matchIndices, tmp)
-			matchNumFlagEnabled = false
+			}
 		} else {
 			matchIndices = findAllMatches(regComp, test_str)
 		}
 		if *printMatchesFlag {
@@ -711,15 +132,9 @@ func main() {
 				if !(*multiLineFlag) {
 					fmt.Fprintf(out, "Line %d:\n", lineNum)
 				}
-				for i, m := range matchIndices {
+				for _, m := range matchIndices {
 					// Only print a match if:
 					// 	a. We are _not_ printing just one match
 					// 	OR
 					// 	b. We _are_ printing just one match, and this is that match
 					if !matchNumFlagEnabled || (i+1) == *matchNum { // Match indexes start from 1; loop counter starts from 0
 					fmt.Fprintf(out, "%s\n", m.toString())
 				}
 				}
 				err := out.Flush()
 				if err != nil {
 					panic(err)
@@ -742,7 +157,7 @@ func main() {
 			// Find all numbers from 0 to len(test_str) that are NOT in oldIndices.
 			// These are the values we want to print, now that we have inverted the match.
 			// Re-initialize indicesToPrint and add all of these values to it.
-			indicesToPrint.add(setDifference(genRange(0, len(test_runes)), oldIndices)...)
+			indicesToPrint.add(setDifference(genRange(0, len(test_str)), oldIndices)...)
 		}
 		// If lineFlag is enabled, we should only print something if:
@@ -763,10 +178,9 @@ func main() {
 		//			the corresponding end index.
 		// 		3. If not, just print the character.
 		if substituteFlagEnabled {
-			for i := range test_runes {
+			for i := range test_str {
 				inMatchIndex := false
-				for idx, m := range matchIndices {
+				for _, m := range matchIndices {
 					if !matchNumFlagEnabled || (idx+1) == *matchNum {
 					if i == m[0].startIdx {
 						fmt.Fprintf(out, "%s", *substituteText)
 						i = m[0].endIdx
@@ -774,20 +188,13 @@ func main() {
 						break
 					}
 				}
 				}
 				if !inMatchIndex {
-					fmt.Fprintf(out, "%c", test_runes[i])
+					fmt.Fprintf(out, "%c", test_str[i])
 				}
 			}
 		} else {
-			for i, c := range test_runes {
+			for i, c := range test_str {
-				// Explanation:
+				if indicesToPrint.contains(i) {
 				// 	We print a letter in red if:
 				// 		1. It is in the 'indicesToPrint'
 				// 		2. One of the following:
 				// 			a. The '-m' flag is disabled
 				// 			b. The '-m' flag is enabled, and our current index is in the bounds of the specific match
 				if indicesToPrint.contains(i) && (!matchNumFlagEnabled || (i >= matchIndices[*matchNum-1][0].startIdx && i < matchIndices[*matchNum-1][0].endIdx)) {
 					color.New(color.FgRed).Fprintf(out, "%c", c)
 					// Newline after every match - only if -o is enabled and -v is disabled.
 					if *onlyFlag && !(*invertFlag) {
--- a/matching.go
+++ b/matching.go
@@ -138,15 +138,38 @@ func pruneIndices(indices []Match) []Match {
 	return toRet
 }
 // findNthMatch finds the 'n'th match (counting from 1) of the compiled regex 'regex' in
 // the given string.
 // It returns an error (!= nil) if 'n' is less than 1, or if there are fewer than 'n'
 // matches in the string. The returned Match is nil whenever the error is non-nil.
 func findNthMatch(regex Reg, str string, n int) (Match, error) {
 	// Match numbers start at 1. Without this guard, n == 0 would compare equal to the
 	// initial match count and hand back the helper's result even when nothing matched.
 	if n < 1 {
 		return nil, fmt.Errorf("Invalid match index. Match indices start from 1.")
 	}
 	idx := 0
 	matchNum := 0
 	str_runes := []rune(str)
 	var matchFound bool
 	var matchIdx Match
 	for idx <= len(str_runes) {
 		// The helper reports whether a match was found, the match itself, and the
 		// index from which the next search should resume.
 		matchFound, matchIdx, idx = findAllMatchesHelper(regex.start, str_runes, idx, regex.numGroups)
 		if !matchFound {
 			continue
 		}
 		// Only compare against 'n' right after a real match has been counted.
 		matchNum++
 		if matchNum == n {
 			return matchIdx, nil
 		}
 	}
 	// We haven't found the nth match after scanning the string - Return an error
 	return nil, fmt.Errorf("Invalid match index. Too few matches found.")
 }
 // findAllMatches tries to find all matches of the regex represented by given start-state, with
 // the given string
-func findAllMatches(start *State, str []rune, numGroups int) []Match {
+func findAllMatches(regex Reg, str string) []Match {
 	idx := 0
 	str_runes := []rune(str)
 	var matchFound bool
 	var matchIdx Match
 	indices := make([]Match, 0)
-	for idx <= len(str) {
+	for idx <= len(str_runes) {
-		matchFound, matchIdx, idx = findAllMatchesHelper(start, str, idx, numGroups)
+		matchFound, matchIdx, idx = findAllMatchesHelper(regex.start, str_runes, idx, regex.numGroups)
 		if matchFound {
 			indices = append(indices, matchIdx)
 		}
--- a/nfa.go
+++ b/nfa.go
@@ -122,13 +122,21 @@ func (s State) checkAssertion(str []rune, idx int) bool {
 		// 		2. Run it on a subset of the test string, that ends after the current index in the string
 		// 		3. Based on the kind of lookaround (and the indices we get), determine what action to take.
 		startState := s.lookaroundNFA
-		var strToMatch []rune
+		var runesToMatch []rune
 		var strToMatch string
 		if s.assert == PLA || s.assert == NLA {
-			strToMatch = str[idx:]
+			runesToMatch = str[idx:]
 		} else {
-			strToMatch = str[:idx]
+			runesToMatch = str[:idx]
 		}
-		matchIndices := findAllMatches(startState, strToMatch, startState.lookaroundNumCaptureGroups)
+
 		if len(runesToMatch) == 0 {
 			strToMatch = ""
 		} else {
 			strToMatch = string(runesToMatch)
 		}
 		matchIndices := findAllMatches(Reg{startState, startState.lookaroundNumCaptureGroups}, strToMatch)
 		numMatchesFound := 0
 		for _, matchIdx := range matchIndices {
--- a/re_test.go
+++ b/re_test.go
@@ -187,9 +187,11 @@ var groupTests = []struct {
 func TestFindAllMatches(t *testing.T) {
 	for _, test := range reTests {
 		t.Run(test.re+"	"+test.str, func(t *testing.T) {
-			re_postfix := shuntingYard(test.re)
+			regComp, err := Compile(test.re)
-			startState, numGroups := thompson(re_postfix)
+			if err != nil {
-			matchIndices := findAllMatches(startState, []rune(test.str), numGroups)
+				panic(err)
 			}
 			matchIndices := findAllMatches(regComp, test.str)
 			zeroGroups := make([]Group, len(matchIndices))
 			for i, m := range matchIndices {
 				zeroGroups[i] = m[0]
@@ -204,9 +206,11 @@ func TestFindAllMatches(t *testing.T) {
 func TestFindAllGroups(t *testing.T) {
 	for _, test := range groupTests {
 		t.Run(test.re+"	"+test.str, func(t *testing.T) {
-			re_postfix := shuntingYard(test.re)
+			regComp, err := Compile(test.re)
-			startState, numGroups := thompson(re_postfix)
+			if err != nil {
-			matchIndices := findAllMatches(startState, []rune(test.str), numGroups)
+				panic(err)
 			}
 			matchIndices := findAllMatches(regComp, test.str)
 			for i := range matchIndices {
 				for j := range matchIndices[i] {
 					if matchIndices[i][j].isValid() {
Author	SHA1	Message	Date
Rockingcool	24fa365be1	Moved some auxiliary functions into compile.go; use new API for compiling and finding matches	2025-01-06 20:14:57 -06:00
Rockingcool	1da3f7f0e0	Changed API for match-finding functions - take in a Reg instead of start state and numGroups separately	2025-01-06 20:14:19 -06:00
Rockingcool	8e8067482a	Rewrote to use new API for compiling and finding matches	2025-01-06 20:12:18 -06:00
Rockingcool	644ed15af0	Use new API for findAllMatches	2025-01-06 20:10:25 -06:00
Rockingcool	c8613c1ba2	Major restructuring - added new type, changed return types for shuntingYard and thompson I added a new function 'Compile' that calls shuntingYard and thompson. I also added a new type 'Reg' that this function returns - it represents the starting state and contains the number of capturing groups in the regex. I also rewrote shuntingYard and thompson to return errors instead of panicking.	2025-01-06 20:08:24 -06:00
Rockingcool	ddbcb309b0	Made shuntingYard return an error instead of panicking, moved it and thompson to compile.go	2025-01-06 12:29:04 -06:00
Rockingcool	72263509d3	Rewrote behavior of '-m' flag to use the 'nth match' function from matching.go	2025-01-05 21:41:14 -06:00
Rockingcool	4373d35216	Wrote function to find the 'n'th match of a regex	2025-01-05 21:40:53 -06:00