Moved some auxiliary functions into compile.go; use new API for compiling and finding matches

Changed API for match-finding functions - take in a Reg instead of start state and numGroups separately
Rewrote to use new API for compiling and finding matches
2025-01-06 20:14:57 -06:00 · 2025-01-06 20:14:19 -06:00 · 2025-01-06 20:12:18 -06:00 · 2025-01-06 20:10:25 -06:00 · 2025-01-06 20:08:24 -06:00 · 2025-01-06 12:29:04 -06:00
5 changed files with 692 additions and 632 deletions
--- a/compile.go
+++ b/compile.go
@@ -0,0 +1,618 @@
+package main
+
+import (
+	"fmt"
+	"slices"
+	"strconv"
+	"unicode"
+)
+
+// A Reg represents the result of compiling a regular expression. It contains
+// the startState of the NFA representation of the regex, and the number of capturing
+// groups in the regex.
+type Reg struct {
+	start     *State
+	numGroups int
+}
+
+const CONCAT rune = '~'
+
+func isOperator(c rune) bool {
+	if c == '+' || c == '?' || c == '*' || c == '|' || c == CONCAT {
+		return true
+	}
+	return false
+}
+
+/* priority returns the priority of the given operator */
+func priority(op rune) int {
+	precedence := []rune{'|', CONCAT, '+', '*', '?'}
+	return slices.Index(precedence, op)
+}
+
+/*
+The Shunting-Yard algorithm is used to convert the given infix (regular) expression to postfix.
+The primary benefit of this is getting rid of parentheses.
+It also inserts explicit concatenation operators to make parsing easier in Thompson's algorithm.
+An error can be returned for a multitude of reasons - the reason is specified in the error string.
+See: https://blog.cernera.me/converting-regular-expressions-to-postfix-notation-with-the-shunting-yard-algorithm/
+*/
+func shuntingYard(re string) ([]postfixNode, error) {
+	re_postfix := make([]rune, 0)
+	// Convert the string to a slice of runes to allow iteration through it
+	re_runes_orig := []rune(re) // This is the rune slice before the first parsing loop (which detects and replaces numeric ranges)
+	re_runes := make([]rune, 0)
+	// Check for numeric range. If we are at the start of a numeric range,
+	// skip to end and construct the equivalent regex for the range.
+	// The reason this is outside the loop below, is that it actually modifies
+	// the given regex (we 'cut' the numeric range and 'paste' an equivalent regex).
+	// It also makes the overall parsing easier, since I don't have to worry about the numeric range
+	// anymore.
+	// Eventually, I might be able to add it into the main parsing loop, to reduce the time
+	// complexity.
+	// A numeric range has the syntax: <num1-num2>. It matches all numbers in this range.
+	//
+	// Also check for non-capturing groups. The LPAREN of a non-capturing group looks like this: '(?:'
+	// I take this out, and put in a special character - NONCAPLPAREN_CHAR.
+	for i := 0; i < len(re_runes_orig); i++ {
+		c := re_runes_orig[i]
+		if c == '<' && (i == 0 || (re_runes_orig[i-1] != '\\' && re_runes_orig[i-1] != '?')) {
+			i++ // Step over opening angle bracket
+			tmpStr := ""
+			hyphenFound := false
+			for i < len(re_runes_orig) && re_runes_orig[i] != '>' {
+				if !unicode.IsDigit(re_runes_orig[i]) {
+					if re_runes_orig[i] != '-' || (hyphenFound) {
+						return nil, fmt.Errorf("Invalid numeric range.")
+					}
+				}
+				if re_runes_orig[i] == '-' {
+					hyphenFound = true
+				}
+				tmpStr += string(re_runes_orig[i])
+				i++
+			}
+			// End of string reached and last character doesn't close the range
+			if i == len(re_runes_orig) && re_runes_orig[len(re_runes_orig)-1] != '>' {
+				return nil, fmt.Errorf("Numeric range not closed.")
+			}
+			if len(tmpStr) == 0 {
+				return nil, fmt.Errorf("Empty numeric range.")
+			}
+			// Closing bracket will be skipped when the loop variable increments
+			var rangeStart int
+			var rangeEnd int
+			fmt.Sscanf(tmpStr, "%d-%d", &rangeStart, &rangeEnd)
+			regex := range2regex(rangeStart, rangeEnd)
+			re_runes = append(re_runes, []rune(regex)...)
+		} else if c == '(' && i < len(re_runes_orig)-2 && re_runes_orig[i+1] == '?' && re_runes_orig[i+2] == ':' {
+			re_runes = append(re_runes, NONCAPLPAREN_CHAR)
+			i += 2
+		} else {
+			re_runes = append(re_runes, c)
+		}
+	}
+
+	/* 	Add concatenation operators.
+	Only add a concatenation operator between two characters if both the following conditions are met:
+		1. 	The first character isn't an opening parenthesis or alternation operator (or an escape character)
+			a. This makes sense, because these operators can't be _concatenated_ with anything else.
+		2. The second character isn't a 'closing operator' - one that applies to something before it
+			a. Again, these operators can't be concatenated _to_. They can, however, be concatenated _from_.
+	Caveats:
+		1. Don't mess with anything inside brackets - character class
+		2. Don't mess with anything inside braces - numeric repetition
+		3. Don't mess with any lookarounds.
+	*/
+	i := 0
+	for i < len(re_runes) {
+		re_postfix = append(re_postfix, re_runes[i])
+		if re_runes[i] == '[' && (i == 0 || re_runes[i-1] != '\\') { // We do not touch things inside brackets, unless they are escaped. Inside this block, the only task is to expand character ranges into their constituent characters.
+			re_postfix[len(re_postfix)-1] = LBRACKET         // Replace the '[' character with LBRACKET. This allows for easier parsing of all characters (including opening and closing brackets) within the character class
+			toAppend := make([]rune, 0)                      // Holds all the runes in the current character class
+			if i < len(re_runes)-1 && re_runes[i+1] == '^' { // Inverting class - match everything NOT in brackets
+				re_postfix = append(re_postfix, '^')
+				i++ // Skip opening bracket and caret
+			}
+			if i < len(re_runes)-1 && re_runes[i+1] == ']' { // Nothing inside brackets - panic.
+				return nil, fmt.Errorf("Empty character class.")
+			}
+			for re_runes[i] != ']' {
+				i++ // Skip all characters inside brackets
+				// TODO: Check for escaped characters
+
+				// Check ahead for character range
+				if i < len(re_runes)-2 && re_runes[i+1] == '-' {
+					rangeStart := re_runes[i]
+					rangeEnd := re_runes[i+2]
+					if int(rangeEnd) < int(rangeStart) {
+						return nil, fmt.Errorf("Range is out of order.")
+					}
+
+					for i := rangeStart; i <= rangeEnd; i++ {
+						toAppend = append(toAppend, i)
+					}
+
+					i += 2 // Skip start and hyphen (end will automatically be skipped on next iteration of loop)
+					continue
+				}
+				toAppend = append(toAppend, re_runes[i])
+			}
+			// Replace the last character (which should have been ']') with RBRACKET
+			toAppend[len(toAppend)-1] = RBRACKET
+			re_postfix = append(re_postfix, toAppend...)
+		}
+		if i < len(re_runes) && re_runes[i] == '{' && (i > 0 && re_runes[i-1] != '\\') { // We don't touch things inside braces, either
+			i++ // Skip opening brace
+			for i < len(re_runes) && re_runes[i] != '}' {
+				re_postfix = append(re_postfix, re_runes[i])
+				i++
+			}
+			if i == len(re_runes) {
+				return nil, fmt.Errorf("Invalid numeric specifier.")
+			}
+			re_postfix = append(re_postfix, re_runes[i]) // Append closing brace
+		}
+		if i < len(re_runes)-3 && string(re_runes[i+1:i+4]) == "(?:" { // Non-capturing lparen
+			re_postfix = append(re_postfix, NONCAPLPAREN_CHAR)
+			i += 3
+		}
+		if i < len(re_runes) && re_runes[i] == '(' && (i == 0 || re_runes[i-1] != '\\') && (i < len(re_runes)-2 && re_runes[i+1] == '?' && slices.Contains([]rune{'=', '!', '<'}, re_runes[i+2])) { // Unescaped open parentheses followed by question mark then '<', '!' or '=' => lokaround. Don't mess with it.
+			i++ // Step inside
+			if i == len(re_runes)-1 || (re_runes[i+1] != '=' && re_runes[i+1] != '!' && re_runes[i+1] != '<') {
+				return nil, fmt.Errorf("Invalid regex. Lookaround intended?")
+			}
+			re_postfix = append(re_postfix, re_runes[i])
+			i++
+			numOpenParens := 1
+			for numOpenParens != 0 {
+				if i >= len(re_runes) {
+					return nil, fmt.Errorf("Unclosed lookaround.")
+				}
+				if re_runes[i] == '(' {
+					numOpenParens++
+				}
+				if re_runes[i] == ')' {
+					numOpenParens--
+					if numOpenParens == 0 {
+						break
+					}
+				}
+				re_postfix = append(re_postfix, re_runes[i])
+				i++
+			}
+			continue
+		}
+		if i < len(re_runes) && (re_runes[i] != '(' && re_runes[i] != NONCAPLPAREN_CHAR && re_runes[i] != '|' && re_runes[i] != '\\') || (i > 0 && re_runes[i-1] == '\\') { // Every character should be concatenated if it is escaped
+			if i < len(re_runes)-1 {
+				if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != '+' && re_runes[i+1] != '?' && re_runes[i+1] != ')' && re_runes[i+1] != '{' {
+					re_postfix = append(re_postfix, CONCAT)
+				}
+			}
+		}
+		i++
+	}
+
+	opStack := make([]rune, 0)         // Operator stack
+	outQueue := make([]postfixNode, 0) // Output queue
+
+	// Actual algorithm
+	numOpenParens := 0 // Number of open parentheses
+	for i := 0; i < len(re_postfix); i++ {
+		/* Cases:
+		1. Current character is alphanumeric - send to output queue
+		2. Current character is operator - do the following:
+			a. If current character has greater priority than top of opStack, push to opStack.
+			b. If not, keep popping from opStack (and appending to outQueue) until:
+				i. opStack is empty, OR
+				ii. current character has greater priority than top of opStack
+		3. If current character is '(' or NONCAPLPAREN_CHAR, push to opStack
+		4. If current character is ')', pop from opStack (and append to outQueue) until '(' is found. Discard parentheses.
+		5. If current character is '[', find all the characters until ']', then create a postfixNode containing all these contents. Add this node to outQueue.
+		6. If current character is '{', find the appropriate numeric specifier (range start, range end). Apply the range to the postfixNode at the end of outQueue.
+		*/
+		c := re_postfix[i]
+		if isNormalChar(c) {
+			if caseInsensitiveFlag != nil && *caseInsensitiveFlag {
+				outQueue = append(outQueue, newPostfixNode(allCases(c)...))
+			} else {
+				outQueue = append(outQueue, newPostfixNode(c))
+			}
+			continue
+		}
+		// Escape character
+		if c == '\\' { // Escape character - invert special and non-special characters eg. \( is treated as a literal parentheses, \b is treated as word boundary
+			if i == len(re_postfix)-1 { // End of string - panic, because backslash is an escape character (something needs to come after it)
+				return nil, fmt.Errorf("ERROR: Backslash with no escape character.")
+			}
+			i++
+			outQueue = append(outQueue, newEscapedNode(re_postfix[i]))
+			continue // Escaped character will automatically be skipped when loop variable increments
+		}
+
+		if c == '.' { // Dot metacharacter - represents 'any' character, but I am only adding Unicode 0020-007E
+			outQueue = append(outQueue, newPostfixDotNode())
+			continue
+		}
+		if c == '^' { // Start-of-string assertion
+			outQueue = append(outQueue, newPostfixNode(c))
+		}
+		if c == '$' { // End-of-string assertion
+			outQueue = append(outQueue, newPostfixNode(c))
+		}
+		// Check if we're at the start of a lookaround
+		if c == '(' && i < len(re_postfix)-1 && re_postfix[i+1] == '?' {
+			i += 2      // Skip opening paren and question mark
+			regex := "" // Stores lookaround regex
+			numOpenParens := 1
+			for numOpenParens != 0 {
+				if i >= len(re_postfix) {
+					return nil, fmt.Errorf("Unclosed lookaround.")
+				}
+				if re_postfix[i] == '(' {
+					numOpenParens++
+				}
+				if re_postfix[i] == ')' {
+					numOpenParens--
+					if numOpenParens == 0 {
+						break
+					}
+				}
+				regex += string(re_postfix[i])
+				i++
+			}
+			if len(regex) <= 1 { // Nothing in regex - panic
+				return nil, fmt.Errorf("Invalid lookaround. (too short?)")
+			}
+			// 'regex' should now contain the lookaround regex, plus the characters at the start (which indicate pos/neg, ahead/behind)
+			// Now we should filter that out.
+			toAppend := postfixNode{nodetype: ASSERTION, startReps: 1, endReps: 1}
+			if regex[0] == '<' { // Lookbehind
+				toAppend.lookaroundDir = LOOKBEHIND
+				regex = regex[1:]
+			} else if regex[0] == '=' || regex[0] == '!' {
+				toAppend.lookaroundDir = LOOKAHEAD
+			} else {
+				return nil, fmt.Errorf("Invalid lookaround.")
+			}
+			// Positive or negative
+			if regex[0] == '=' { // Positive
+				toAppend.lookaroundSign = POSITIVE
+				toAppend.contents = []rune(regex[1:])
+			} else if regex[0] == '!' { // Negative
+				toAppend.lookaroundSign = NEGATIVE
+				toAppend.contents = []rune(regex[1:])
+			} else {
+				return nil, fmt.Errorf("Invalid lookaround.")
+			}
+			outQueue = append(outQueue, toAppend)
+			continue
+		}
+		if isOperator(c) {
+			if len(opStack) == 0 {
+				opStack = append(opStack, c)
+			} else {
+				topStack, err := peek(opStack)
+				if err != nil {
+					return nil, fmt.Errorf("Operator without operand.")
+				}
+				if priority(c) > priority(topStack) { // 2a
+					opStack = append(opStack, c)
+				} else {
+					for priority(c) <= priority(topStack) { // 2b
+						to_append := mustPop(&opStack)
+						outQueue = append(outQueue, newPostfixNode(to_append))
+						topStack, _ = peek(opStack)
+					}
+					opStack = append(opStack, c)
+				}
+			}
+		}
+		if c == LBRACKET { // Used for character classes
+			i++ // Step forward so we can look at the character class
+			var invertMatch bool
+			if re_postfix[i] == '^' {
+				invertMatch = true
+				i++
+			}
+			chars := make([]rune, 0) // List of characters -  used only for character classes
+			for i < len(re_postfix) {
+				if re_postfix[i] == RBRACKET {
+					break
+				}
+				chars = append(chars, re_postfix[i])
+				i++
+			}
+			if i == len(re_postfix) { // We have reached the end of the string, so we didn't encounter a closing brakcet. Panic.
+				return nil, fmt.Errorf("Opening bracket without closing bracket.")
+			}
+			if !invertMatch {
+				outQueue = append(outQueue, newPostfixCharNode(chars...))
+			} else {
+				// Invert match - create an allChars postfixNode, then add the given states to its 'except' list.
+				toAdd := newPostfixDotNode()
+				toAdd.except = chars
+				outQueue = append(outQueue, toAdd)
+			}
+			continue
+		}
+		if c == '{' {
+			i++ // Skip opening brace
+			// Three possibilities:
+			// 1. Single number - {5}
+			// 2. Range - {3,5}
+			// 3. Start with no end, {3,}
+			startRange := make([]rune, 0)
+			startRangeNum := 0
+			endRange := make([]rune, 0)
+			endRangeNum := 0
+			for i < len(re_postfix) && unicode.IsDigit(re_postfix[i]) {
+				startRange = append(startRange, re_postfix[i])
+				i++
+			}
+			if len(startRange) == 0 { // {} is not valid, neither is {,5}
+				return nil, fmt.Errorf("Invalid numeric specifier.")
+			}
+			if i == len(re_postfix) {
+				return nil, fmt.Errorf("Brace not closed.")
+			}
+
+			startRangeNum, err := strconv.Atoi(string(startRange))
+			if err != nil {
+				panic(err)
+			}
+
+			if re_postfix[i] == '}' { // Case 1 above
+				endRangeNum = startRangeNum
+			} else {
+				if re_postfix[i] != ',' {
+					return nil, fmt.Errorf("Invalid numeric specifier.")
+				}
+				i++ // Skip comma
+				for i < len(re_postfix) && unicode.IsDigit(re_postfix[i]) {
+					endRange = append(endRange, re_postfix[i])
+					i++
+				}
+				if i == len(re_postfix) {
+					return nil, fmt.Errorf("Brace not closed.")
+				}
+				if re_postfix[i] != '}' {
+					return nil, fmt.Errorf("Invalid numeric specifier.")
+				}
+				if len(endRange) == 0 { // Case 3 above
+					endRangeNum = INFINITE_REPS
+				} else { // Case 2 above
+					var err error
+					endRangeNum, err = strconv.Atoi(string(endRange))
+					if err != nil {
+						panic(err)
+					}
+				}
+			}
+
+			idx := len(outQueue) - 1
+			// Get the last added node
+			if idx < 0 || outQueue[idx].nodetype == LPAREN {
+				return nil, fmt.Errorf("Numeric specifier with no content.")
+			}
+			outQueue[idx].startReps = startRangeNum
+			outQueue[idx].endReps = endRangeNum
+		}
+		if c == '(' || c == NONCAPLPAREN_CHAR {
+			opStack = append(opStack, c)
+			if c == '(' { // We only push _capturing_ group parentheses to outQueue
+				outQueue = append(outQueue, newPostfixNode(c))
+			}
+			numOpenParens++
+		}
+		if c == ')' {
+			// Keep popping from opStack until we encounter an opening parenthesis or a NONCAPLPAREN_CHAR. Return an error if we reach the end of the stack.
+			var val rune
+			var err error
+			for val, err = peek(opStack); val != '(' && val != NONCAPLPAREN_CHAR; val, err = peek(opStack) {
+				if err != nil {
+					return nil, fmt.Errorf("Imbalanced parantheses.")
+				}
+				to_append := mustPop(&opStack)
+				outQueue = append(outQueue, newPostfixNode(to_append))
+			}
+			_ = mustPop(&opStack) // Get rid of opening parentheses
+			if val == '(' {       // Whatever was inside the parentheses was a _capturing_ group, so we append the closing parentheses as well
+				outQueue = append(outQueue, newPostfixNode(')')) // Add closing parentheses
+			}
+			numOpenParens--
+		}
+	}
+
+	// Pop all remaining operators (and append to outQueue)
+	for len(opStack) > 0 {
+		to_append := mustPop(&opStack)
+		outQueue = append(outQueue, newPostfixNode(to_append))
+	}
+
+	if numOpenParens != 0 {
+		return nil, fmt.Errorf("Imbalanced parantheses.")
+	}
+
+	return outQueue, nil
+}
+
+// Thompson's algorithm. Constructs a finite-state automaton (NFA) from the given postfix nodes.
+// Returns a Reg holding the NFA's start state and the number of capturing groups, or a non-nil error if the regex is invalid.
+func thompson(re []postfixNode) (Reg, error) {
+	nfa := make([]*State, 0) // Stack of states
+	numGroups := 0           // Number of capturing groups
+	for _, c := range re {
+		if c.nodetype == CHARACTER || c.nodetype == ASSERTION {
+			state := State{}
+			state.transitions = make(map[int][]*State)
+			if c.allChars {
+				state.allChars = true
+				if len(c.except) != 0 {
+					state.except = append([]rune{}, c.except...)
+				}
+			}
+			state.content = rune2Contents(c.contents)
+			state.output = make([]*State, 0)
+			state.output = append(state.output, &state)
+			state.isEmpty = false
+			if c.nodetype == ASSERTION {
+				state.isEmpty = true                 // This is a little weird. A lookaround has the 'isEmpty' flag set, even though it _isn't_ empty (the contents are the regex). But, there's so much error-checking that relies on this flag that it's better to keep it this way.
+				state.content = newContents(EPSILON) // Ideally, an assertion shouldn't have any content, since it doesn't say anything about the content of string
+				if c.lookaroundDir == 0 || c.lookaroundSign == 0 {
+					switch c.contents[0] {
+					case '^':
+						state.assert = SOS
+					case '$':
+						state.assert = EOS
+					case 'b':
+						state.assert = WBOUND
+					case 'B':
+						state.assert = NONWBOUND
+					}
+				} else { // Lookaround
+					state.lookaroundRegex = string(c.contents)
+					if c.lookaroundDir == LOOKAHEAD {
+						if c.lookaroundSign == POSITIVE {
+							state.assert = PLA
+						}
+						if c.lookaroundSign == NEGATIVE {
+							state.assert = NLA
+						}
+					}
+					if c.lookaroundDir == LOOKBEHIND {
+						if c.lookaroundSign == POSITIVE {
+							state.assert = PLB
+						}
+						if c.lookaroundSign == NEGATIVE {
+							state.assert = NLB
+						}
+					}
+					tmpRe, err := shuntingYard(state.lookaroundRegex)
+					if err != nil {
+						return Reg{}, fmt.Errorf("Error parsing lookaround: %w", err)
+					}
+					reg, err := thompson(tmpRe)
+					if err != nil {
+						return Reg{}, fmt.Errorf("Error compiling lookaround: %w", err)
+					}
+					state.lookaroundNFA = reg.start
+					state.lookaroundNumCaptureGroups = reg.numGroups
+
+				}
+			}
+			nfa = append(nfa, &state)
+		}
+		if c.nodetype == LPAREN || c.nodetype == RPAREN {
+			s := &State{}
+			s.assert = NONE
+			s.content = newContents(EPSILON)
+			s.isEmpty = true
+			s.output = make([]*State, 0)
+			s.output = append(s.output, s)
+			s.transitions = make(map[int][]*State)
+			// LPAREN nodes are just added normally
+			if c.nodetype == LPAREN {
+				numGroups++
+				s.groupBegin = true
+				s.groupNum = numGroups
+				nfa = append(nfa, s)
+				continue
+			}
+			// For RPAREN nodes, I assume that the last two nodes in the list are an LPAREN,
+			// and then some other node.
+			// These three nodes (LPAREN, the middle node and RPAREN) are extracted together, concatenated
+			// and added back in.
+			if c.nodetype == RPAREN {
+				s.groupEnd = true
+				middleNode := mustPop(&nfa)
+				lparenNode := mustPop(&nfa)
+				s.groupNum = lparenNode.groupNum
+				tmp := concatenate(lparenNode, middleNode)
+				to_add := concatenate(tmp, s)
+				nfa = append(nfa, to_add)
+
+			}
+		}
+		// Must be an operator if it isn't a character
+		switch c.nodetype {
+		case CONCATENATE:
+			s2 := mustPop(&nfa)
+			s1 := mustPop(&nfa)
+			s1 = concatenate(s1, s2)
+			nfa = append(nfa, s1)
+		case KLEENE: // Create a 0-state, concat the popped state after it, concat the 0-state after the popped state
+			s1 := mustPop(&nfa)
+			stateToAdd := kleene(*s1)
+			nfa = append(nfa, stateToAdd)
+		case PLUS: // a+ is equivalent to aa*
+			s1 := mustPop(&nfa)
+			s2 := kleene(*s1)
+			s1 = concatenate(s1, s2)
+			nfa = append(nfa, s1)
+		case QUESTION: // ab? is equivalent to a(b|)
+			s1 := mustPop(&nfa)
+			s2 := question(s1)
+			nfa = append(nfa, s2)
+		case PIPE:
+			s1 := mustPop(&nfa)
+			s2 := mustPop(&nfa)
+			s3 := alternate(s1, s2)
+			nfa = append(nfa, s3)
+		}
+		if c.startReps != 1 || c.endReps != 1 { // Must have a numeric specifier attached to it
+			if c.endReps != -1 && c.endReps < c.startReps {
+				return Reg{}, fmt.Errorf("Numeric specifier - start greater than end.")
+			}
+			state := mustPop(&nfa)
+			var stateToAdd *State = nil
+			// Take advantage of the following facts:
+			// a{5} == aaaaa
+			// a{3,5} == aaaa?a?
+			// a{5,} == aaaaa+
+			// Nov. 3 2024 - I have two choices on how I want to implement numeric
+			// specifiers.
+			// a. Encode the logic while creating the states. I will have to create a function
+			// that creates a deep-copy of a given state / NFA, so that I can concatenate them to
+			// each other (concatenating them with the 'concatenate' method - which takes addresses - does
+			// not work). Creating this function might be a lot of work.
+			// b. Encode the logic while parsing the string (shunting-yard). If I can expand the numeric specifier
+			// at this point, I can leave thompson untouched.
+			for i := 0; i < c.startReps; i++ { // Case 1
+				stateToAdd = concatenate(stateToAdd, cloneState(state))
+			}
+			if c.endReps == INFINITE_REPS { // Case 3
+				s2 := kleene(*state)
+				stateToAdd = concatenate(stateToAdd, s2)
+			} else { // Case 2
+				for i := c.startReps; i < c.endReps; i++ {
+					stateToAdd = concatenate(stateToAdd, question(state))
+				}
+			}
+			nfa = append(nfa, stateToAdd)
+		}
+	}
+	if len(nfa) != 1 {
+		return Reg{}, fmt.Errorf("Invalid Regex.")
+	}
+
+	verifyLastStates(nfa)
+
+	return Reg{nfa[0], numGroups}, nil
+
+}
+
+// Compiles the given regular expression into a Reg type, suitable for use with the
+// matching functions. The second return value is non-nil if a compilation error has
+// occurred. As such, the error value must be checked before using the Reg returned by this function.
+func Compile(re string) (Reg, error) {
+	nodes, err := shuntingYard(re)
+	if err != nil {
+		return Reg{}, fmt.Errorf("Error parsing regex: %w", err)
+	}
+	reg, err := thompson(nodes)
+	if err != nil {
+		return Reg{}, fmt.Errorf("Error compiling regex: %w", err)
+	}
+	return reg, nil
+}
--- a/main.go
+++ b/main.go
@@ -6,596 +6,13 @@ import (
 	"fmt"
 	"io"
 	"os"
-	"slices"
-	"strconv"
-	"unicode"

 	"github.com/fatih/color"
 )

-const CONCAT rune = '~'
-
 var notDotChars []rune
 var caseInsensitiveFlag *bool // Whether we are running in case-insensitive mode

-func isOperator(c rune) bool {
-	if c == '+' || c == '?' || c == '*' || c == '|' || c == CONCAT {
-		return true
-	}
-	return false
-}
-
-/* priority returns the priority of the given operator */
-func priority(op rune) int {
-	precedence := []rune{'|', CONCAT, '+', '*', '?'}
-	return slices.Index(precedence, op)
-}
-
-/*
-The Shunting-Yard algorithm is used to convert the given infix (regeular) expression to postfix.
-The primary benefit of this is getting rid of parentheses.
-It also inserts explicit concatenation operators to make parsing easier in Thompson's algorithm.
-See: https://blog.cernera.me/converting-regular-expressions-to-postfix-notation-with-the-shunting-yard-algorithm/
-*/
-func shuntingYard(re string) []postfixNode {
-	re_postfix := make([]rune, 0)
-	// Convert the string to a slice of runes to allow iteration through it
-	re_runes_orig := []rune(re) // This is the rune slice before the first parsing loop (which detects and replaces numeric ranges)
-	re_runes := make([]rune, 0)
-	// Check for numeric range. If we are at the start of a numeric range,
-	// skip to end and construct the equivalent regex for the range.
-	// The reason this is outside the loop below, is that it actually modifies
-	// the given regex (we 'cut' the numeric range and 'paste' an equivalent regex).
-	// It also makes the overall parsing easier, since I don't have to worry about the numeric range
-	// anymore.
-	// Eventually, I might be able to add it into the main parsing loop, to reduce the time
-	// complexity.
-	// A numeric range has the syntax: <num1-num2>. Ir matches all numbers in this range.
-	//
-	// Also check for non-capturing groups. The LPAREN of a non-capturing group looks like this: '(?:'
-	// I take this out, and put in a special character - NONCAPLPAREN_CHAR.
-	for i := 0; i < len(re_runes_orig); i++ {
-		c := re_runes_orig[i]
-		if c == '<' && (i == 0 || (re_runes_orig[i-1] != '\\' && re_runes_orig[i-1] != '?')) {
-			i++ // Step over opening angle bracket
-			tmpStr := ""
-			hyphenFound := false
-			for i < len(re_runes_orig) && re_runes_orig[i] != '>' {
-				if !unicode.IsDigit(re_runes_orig[i]) {
-					if re_runes_orig[i] != '-' || (hyphenFound) {
-						panic("ERROR: Invalid numeric range.")
-					}
-				}
-				if re_runes_orig[i] == '-' {
-					hyphenFound = true
-				}
-				tmpStr += string(re_runes_orig[i])
-				i++
-			}
-			// End of string reached and last character doesn't close the range
-			if i == len(re_runes_orig) && re_runes_orig[len(re_runes_orig)-1] != '>' {
-				panic("ERROR: Numeric range not closed.")
-			}
-			if len(tmpStr) == 0 {
-				panic("ERROR: Empty numeric range.")
-			}
-			// Closing bracket will be skipped when the loop variable increments
-			var rangeStart int
-			var rangeEnd int
-			fmt.Sscanf(tmpStr, "%d-%d", &rangeStart, &rangeEnd)
-			regex := range2regex(rangeStart, rangeEnd)
-			re_runes = append(re_runes, []rune(regex)...)
-		} else if c == '(' && i < len(re_runes_orig)-2 && re_runes_orig[i+1] == '?' && re_runes_orig[i+2] == ':' {
-			re_runes = append(re_runes, NONCAPLPAREN_CHAR)
-			i += 2
-		} else {
-			re_runes = append(re_runes, c)
-		}
-	}
-
-	/* 	Add concatenation operators.
-	Only add a concatenation operator between two characters if both the following conditions are met:
-		1. 	The first character isn't an opening parantheses or alteration operator (or an escape character)
-			a. This makes sense, because these operators can't be _concatenated_ with anything else.
-		2. The second character isn't a 'closing operator' - one that applies to something before it
-			a. Again, these operators can'be concatenated _to_. They can, however, be concatenated _from_.
-	Caveats:
-		1. Don't mess with anything inside brackets - character class
-		2. Don't mess with anything inside braces - numeric repetition
-		3. Don't mess with any lookarounds.
-	*/
-	i := 0
-	for i < len(re_runes) {
-		re_postfix = append(re_postfix, re_runes[i])
-		if re_runes[i] == '[' && (i == 0 || re_runes[i-1] != '\\') { // We do not touch things inside brackets, unless they are escaped. Inside this block, the only task is to expand character ranges into their constituent characters.
-			re_postfix[len(re_postfix)-1] = LBRACKET         // Replace the '[' character with LBRACKET. This allows for easier parsing of all characters (including opening and closing brackets) within the character class
-			toAppend := make([]rune, 0)                      // Holds all the runes in the current character class
-			if i < len(re_runes)-1 && re_runes[i+1] == '^' { // Inverting class - match everything NOT in brackets
-				re_postfix = append(re_postfix, '^')
-				i++ // Skip opening bracket and caret
-			}
-			if i < len(re_runes)-1 && re_runes[i+1] == ']' { // Nothing inside brackets - panic.
-				panic("Empty character class.")
-			}
-			for re_runes[i] != ']' {
-				i++ // Skip all characters inside brackets
-				// TODO: Check for escaped characters
-
-				// Check ahead for character range
-				if i < len(re_runes)-2 && re_runes[i+1] == '-' {
-					rangeStart := re_runes[i]
-					rangeEnd := re_runes[i+2]
-					if int(rangeEnd) < int(rangeStart) {
-						panic("Range is out of order.")
-					}
-
-					for i := rangeStart; i <= rangeEnd; i++ {
-						toAppend = append(toAppend, i)
-					}
-
-					i += 2 // Skip start and hyphen (end will automatically be skipped on next iteration of loop)
-					continue
-				}
-				toAppend = append(toAppend, re_runes[i])
-			}
-			// Replace the last character (which should have been ']', with RBRACKET
-			toAppend[len(toAppend)-1] = RBRACKET
-			re_postfix = append(re_postfix, toAppend...)
-		}
-		if i < len(re_runes) && re_runes[i] == '{' && (i > 0 && re_runes[i-1] != '\\') { // We don't touch things inside braces, either
-			i++ // Skip opening brace
-			for i < len(re_runes) && re_runes[i] != '}' {
-				re_postfix = append(re_postfix, re_runes[i])
-				i++
-			}
-			if i == len(re_runes) {
-				panic("Invalid numeric specifier.")
-			}
-			re_postfix = append(re_postfix, re_runes[i]) // Append closing brace
-		}
-		if i < len(re_runes)-3 && string(re_runes[i+1:i+4]) == "(?:" { // Non-capturing lparen
-			re_postfix = append(re_postfix, NONCAPLPAREN_CHAR)
-			i += 3
-		}
-		if i < len(re_runes) && re_runes[i] == '(' && (i == 0 || re_runes[i-1] != '\\') && (i < len(re_runes)-2 && re_runes[i+1] == '?' && slices.Contains([]rune{'=', '!', '<'}, re_runes[i+2])) { // Unescaped open parentheses followed by question mark then '<', '!' or '=' => lokaround. Don't mess with it.
-			i++ // Step inside
-			if i == len(re_runes)-1 || (re_runes[i+1] != '=' && re_runes[i+1] != '!' && re_runes[i+1] != '<') {
-				panic("Invalid regex. Lookaround intended?")
-			}
-			re_postfix = append(re_postfix, re_runes[i])
-			i++
-			numOpenParens := 1
-			for numOpenParens != 0 {
-				if i >= len(re_runes) {
-					panic("Unclosed lookaround.")
-				}
-				if re_runes[i] == '(' {
-					numOpenParens++
-				}
-				if re_runes[i] == ')' {
-					numOpenParens--
-					if numOpenParens == 0 {
-						break
-					}
-				}
-				re_postfix = append(re_postfix, re_runes[i])
-				i++
-			}
-			continue
-		}
-		if i < len(re_runes) && (re_runes[i] != '(' && re_runes[i] != NONCAPLPAREN_CHAR && re_runes[i] != '|' && re_runes[i] != '\\') || (i > 0 && re_runes[i-1] == '\\') { // Every character should be concatenated if it is escaped
-			if i < len(re_runes)-1 {
-				if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != '+' && re_runes[i+1] != '?' && re_runes[i+1] != ')' && re_runes[i+1] != '{' {
-					re_postfix = append(re_postfix, CONCAT)
-				}
-			}
-		}
-		i++
-	}
-
-	opStack := make([]rune, 0)         // Operator stack
-	outQueue := make([]postfixNode, 0) // Output queue
-
-	// Actual algorithm
-	numOpenParens := 0 // Number of open parentheses
-	for i := 0; i < len(re_postfix); i++ {
-		/* Two cases:
-		1. Current character is alphanumeric - send to output queue
-		2. Current character is operator - do the following:
-			a. If current character has greater priority than top of opStack, push to opStack.
-			b. If not, keep popping from opStack (and appending to outQueue) until:
-				i. opStack is empty, OR
-				ii. current character has greater priority than top of opStack
-		3. If current character is '(' or NONCAPLPAREN_CHAR, push to opStack
-		4. If current character is ')', pop from opStack (and append to outQueue) until '(' is found. Discard parantheses.
-		5. If current character is '[', find all the characters until ']', then create a postfixNode containing all these contents. Add this node to outQueue.
-		6. If current character is '{', find the appropriate numeric specifier (range start, range end). Apply the range to the postfixNode at the end of outQueue.
-		*/
-		c := re_postfix[i]
-		if isNormalChar(c) {
-			if caseInsensitiveFlag != nil && *caseInsensitiveFlag {
-				outQueue = append(outQueue, newPostfixNode(allCases(c)...))
-			} else {
-				outQueue = append(outQueue, newPostfixNode(c))
-			}
-			continue
-		}
-		// Escape character
-		if c == '\\' { // Escape character - invert special and non-special characters eg. \( is treated as a literal parentheses, \b is treated as word boundary
-			if i == len(re_postfix)-1 { // End of string - panic, because backslash is an escape character (something needs to come after it)
-				panic("ERROR: Backslash with no escape character.")
-			}
-			i++
-			outQueue = append(outQueue, newEscapedNode(re_postfix[i]))
-			continue // Escaped character will automatically be skipped when loop variable increments
-		}
-
-		if c == '.' { // Dot metacharacter - represents 'any' character, but I am only adding Unicode 0020-007E
-			outQueue = append(outQueue, newPostfixDotNode())
-			continue
-		}
-		if c == '^' { // Start-of-string assertion
-			outQueue = append(outQueue, newPostfixNode(c))
-		}
-		if c == '$' { // End-of-string assertion
-			outQueue = append(outQueue, newPostfixNode(c))
-		}
-		// Check if we're at the start of a lookaround
-		if c == '(' && i < len(re_postfix)-1 && re_postfix[i+1] == '?' {
-			i += 2      // Skip opening paren and question mark
-			regex := "" // Stores lookaround regex
-			numOpenParens := 1
-			for numOpenParens != 0 {
-				if i >= len(re_postfix) {
-					panic("Unclosed lookaround.")
-				}
-				if re_postfix[i] == '(' {
-					numOpenParens++
-				}
-				if re_postfix[i] == ')' {
-					numOpenParens--
-					if numOpenParens == 0 {
-						break
-					}
-				}
-				regex += string(re_postfix[i])
-				i++
-			}
-			if len(regex) <= 1 { // Nothing in regex - panic
-				panic("Invalid lookaround. (too short?)")
-			}
-			// 'regex' should now contain the lookaround regex, plus the characters at the start (which indicate pos/neg, ahead/behind)
-			// Now we should filter that out.
-			toAppend := postfixNode{nodetype: ASSERTION, startReps: 1, endReps: 1}
-			if regex[0] == '<' { // Lookbehind
-				toAppend.lookaroundDir = LOOKBEHIND
-				regex = regex[1:]
-			} else if regex[0] == '=' || regex[0] == '!' {
-				toAppend.lookaroundDir = LOOKAHEAD
-			} else {
-				panic("Invalid lookaround.")
-			}
-			// Positive or negative
-			if regex[0] == '=' { // Positive
-				toAppend.lookaroundSign = POSITIVE
-				toAppend.contents = []rune(regex[1:])
-			} else if regex[0] == '!' { // Negative
-				toAppend.lookaroundSign = NEGATIVE
-				toAppend.contents = []rune(regex[1:])
-			} else {
-				panic("Invalid lookaround.")
-			}
-			outQueue = append(outQueue, toAppend)
-			continue
-		}
-		if isOperator(c) {
-			if len(opStack) == 0 {
-				opStack = append(opStack, c)
-			} else {
-				topStack, err := peek(opStack)
-				if err != nil {
-					panic("ERROR: Operator without operand.")
-				}
-				if priority(c) > priority(topStack) { // 2a
-					opStack = append(opStack, c)
-				} else {
-					for priority(c) <= priority(topStack) { // 2b
-						to_append := mustPop(&opStack)
-						outQueue = append(outQueue, newPostfixNode(to_append))
-						topStack, _ = peek(opStack)
-					}
-					opStack = append(opStack, c)
-				}
-			}
-		}
-		if c == LBRACKET { // Used for character classes
-			i++ // Step forward so we can look at the character class
-			var invertMatch bool
-			if re_postfix[i] == '^' {
-				invertMatch = true
-				i++
-			}
-			chars := make([]rune, 0) // List of characters -  used only for character classes
-			for i < len(re_postfix) {
-				if re_postfix[i] == RBRACKET {
-					break
-				}
-				chars = append(chars, re_postfix[i])
-				i++
-			}
-			if i == len(re_postfix) { // We have reached the end of the string, so we didn't encounter a closing brakcet. Panic.
-				panic("ERROR: Opening bracket without closing bracket.")
-			}
-			if !invertMatch {
-				outQueue = append(outQueue, newPostfixCharNode(chars...))
-			} else {
-				// Invert match - create an allChars postfixNode, then add the given states to its 'except' list.
-				toAdd := newPostfixDotNode()
-				toAdd.except = chars
-				outQueue = append(outQueue, toAdd)
-			}
-			continue
-		}
-		if c == '{' {
-			i++ // Skip opening brace
-			// Three possibilities:
-			// 1. Single number - {5}
-			// 2. Range - {3,5}
-			// 3. Start with no end, {3,}
-			startRange := make([]rune, 0)
-			startRangeNum := 0
-			endRange := make([]rune, 0)
-			endRangeNum := 0
-			for i < len(re_postfix) && unicode.IsDigit(re_postfix[i]) {
-				startRange = append(startRange, re_postfix[i])
-				i++
-			}
-			if len(startRange) == 0 { // {} is not valid, neither is {,5}
-				panic("ERROR: Invalid numeric specifier.")
-			}
-			if i == len(re_postfix) {
-				panic("ERROR: Brace not closed.")
-			}
-
-			startRangeNum, err := strconv.Atoi(string(startRange))
-			if err != nil {
-				panic(err)
-			}
-
-			if re_postfix[i] == '}' { // Case 1 above
-				endRangeNum = startRangeNum
-			} else {
-				if re_postfix[i] != ',' {
-					panic("ERROR: Invalid numeric specifier.")
-				}
-				i++ // Skip comma
-				for i < len(re_postfix) && unicode.IsDigit(re_postfix[i]) {
-					endRange = append(endRange, re_postfix[i])
-					i++
-				}
-				if i == len(re_postfix) {
-					panic("ERROR: Brace not closed.")
-				}
-				if re_postfix[i] != '}' {
-					panic("ERROR: Invalid numeric specifier.")
-				}
-				if len(endRange) == 0 { // Case 3 above
-					endRangeNum = INFINITE_REPS
-				} else { // Case 2 above
-					var err error
-					endRangeNum, err = strconv.Atoi(string(endRange))
-					if err != nil {
-						panic(err)
-					}
-				}
-			}
-
-			idx := len(outQueue) - 1
-			// Get the last added node
-			if idx < 0 || outQueue[idx].nodetype == LPAREN {
-				panic("Numeric specifier with no content.")
-			}
-			outQueue[idx].startReps = startRangeNum
-			outQueue[idx].endReps = endRangeNum
-		}
-		if c == '(' || c == NONCAPLPAREN_CHAR {
-			opStack = append(opStack, c)
-			if c == '(' { // We only push _capturing_ group parentheses to outQueue
-				outQueue = append(outQueue, newPostfixNode(c))
-			}
-			numOpenParens++
-		}
-		if c == ')' {
-			// Keep popping from opStack until we encounter an opening parantheses or a NONCAPLPAREN_CHAR. Panic if we reach the end of the stack.
-			var val rune
-			var err error
-			for val, err = peek(opStack); val != '(' && val != NONCAPLPAREN_CHAR; val, err = peek(opStack) {
-				if err != nil {
-					panic("ERROR: Imbalanced parantheses.")
-				}
-				to_append := mustPop(&opStack)
-				outQueue = append(outQueue, newPostfixNode(to_append))
-			}
-			_ = mustPop(&opStack) // Get rid of opening parentheses
-			if val == '(' {       // Whatever was inside the parentheses was a _capturing_ group, so we append the closing parentheses as well
-				outQueue = append(outQueue, newPostfixNode(')')) // Add closing parentheses
-			}
-			numOpenParens--
-		}
-	}
-
-	// Pop all remaining operators (and append to outQueue)
-	for len(opStack) > 0 {
-		to_append := mustPop(&opStack)
-		outQueue = append(outQueue, newPostfixNode(to_append))
-	}
-
-	if numOpenParens != 0 {
-		panic("ERROR: Imbalanced parantheses.")
-	}
-
-	return outQueue
-}
-
-// Thompson's algorithm. Constructs Finite-State Automaton from given string.
-// Returns start state and number of groups in regex.
-func thompson(re []postfixNode) (*State, int) {
-	nfa := make([]*State, 0) // Stack of states
-	numGroups := 0           // Number of capturing groups
-	for _, c := range re {
-		if c.nodetype == CHARACTER || c.nodetype == ASSERTION {
-			state := State{}
-			state.transitions = make(map[int][]*State)
-			if c.allChars {
-				state.allChars = true
-				if len(c.except) != 0 {
-					state.except = append([]rune{}, c.except...)
-				}
-			}
-			state.content = rune2Contents(c.contents)
-			state.output = make([]*State, 0)
-			state.output = append(state.output, &state)
-			state.isEmpty = false
-			if c.nodetype == ASSERTION {
-				state.isEmpty = true                 // This is a little weird. A lookaround has the 'isEmpty' flag set, even though it _isn't_ empty (the contents are the regex). But, there's so much error-checking that relies on this flag that it's better to keep it this way.
-				state.content = newContents(EPSILON) // Ideally, an assertion shouldn't have any content, since it doesn't say anything about the content of string
-				if c.lookaroundDir == 0 || c.lookaroundSign == 0 {
-					switch c.contents[0] {
-					case '^':
-						state.assert = SOS
-					case '$':
-						state.assert = EOS
-					case 'b':
-						state.assert = WBOUND
-					case 'B':
-						state.assert = NONWBOUND
-					}
-				} else { // Lookaround
-					state.lookaroundRegex = string(c.contents)
-					if c.lookaroundDir == LOOKAHEAD {
-						if c.lookaroundSign == POSITIVE {
-							state.assert = PLA
-						}
-						if c.lookaroundSign == NEGATIVE {
-							state.assert = NLA
-						}
-					}
-					if c.lookaroundDir == LOOKBEHIND {
-						if c.lookaroundSign == POSITIVE {
-							state.assert = PLB
-						}
-						if c.lookaroundSign == NEGATIVE {
-							state.assert = NLB
-						}
-					}
-					tmpRe := shuntingYard(state.lookaroundRegex)
-					var numGroupsLookaround int
-					state.lookaroundNFA, numGroupsLookaround = thompson(tmpRe)
-					state.lookaroundNumCaptureGroups = numGroupsLookaround
-
-				}
-			}
-			nfa = append(nfa, &state)
-		}
-		if c.nodetype == LPAREN || c.nodetype == RPAREN {
-			s := &State{}
-			s.assert = NONE
-			s.content = newContents(EPSILON)
-			s.isEmpty = true
-			s.output = make([]*State, 0)
-			s.output = append(s.output, s)
-			s.transitions = make(map[int][]*State)
-			// LPAREN nodes are just added normally
-			if c.nodetype == LPAREN {
-				numGroups++
-				s.groupBegin = true
-				s.groupNum = numGroups
-				nfa = append(nfa, s)
-				continue
-			}
-			// For RPAREN nodes, I assume that the last two nodes in the list are an LPAREN,
-			// and then some other node.
-			// These three nodes (LPAREN, the middle node and RPAREN) are extracted together, concatenated
-			// and added back in.
-			if c.nodetype == RPAREN {
-				s.groupEnd = true
-				middleNode := mustPop(&nfa)
-				lparenNode := mustPop(&nfa)
-				s.groupNum = lparenNode.groupNum
-				tmp := concatenate(lparenNode, middleNode)
-				to_add := concatenate(tmp, s)
-				nfa = append(nfa, to_add)
-
-			}
-		}
-		// Must be an operator if it isn't a character
-		switch c.nodetype {
-		case CONCATENATE:
-			s2 := mustPop(&nfa)
-			s1 := mustPop(&nfa)
-			s1 = concatenate(s1, s2)
-			nfa = append(nfa, s1)
-		case KLEENE: // Create a 0-state, concat the popped state after it, concat the 0-state after the popped state
-			s1 := mustPop(&nfa)
-			stateToAdd := kleene(*s1)
-			nfa = append(nfa, stateToAdd)
-		case PLUS: // a+ is equivalent to aa*
-			s1 := mustPop(&nfa)
-			s2 := kleene(*s1)
-			s1 = concatenate(s1, s2)
-			nfa = append(nfa, s1)
-		case QUESTION: // ab? is equivalent to a(b|)
-			s1 := mustPop(&nfa)
-			s2 := question(s1)
-			nfa = append(nfa, s2)
-		case PIPE:
-			s1 := mustPop(&nfa)
-			s2 := mustPop(&nfa)
-			s3 := alternate(s1, s2)
-			nfa = append(nfa, s3)
-		}
-		if c.startReps != 1 || c.endReps != 1 { // Must have a numeric specifier attached to it
-			if c.endReps != -1 && c.endReps < c.startReps {
-				panic("ERROR: Numeric specifier - start greater than end.")
-			}
-			state := mustPop(&nfa)
-			var stateToAdd *State = nil
-			// Take advantage of the following facts:
-			// a{5} == aaaaa
-			// a{3,5} == aaaa?a?
-			// a{5,} == aaaaa+
-			// Nov. 3 2024 - I have two choices on how I want to implement numeric
-			// specifiers.
-			// a. Encode the logic while creating the states. I will have to create a function
-			// that creates a deep-copy of a given state / NFA, so that I can concatenate them to
-			// each other (concatenating them with the 'concatenate' method - which takes addresses - does
-			// not work). Creating this function might be a lot of work.
-			// b. Encode the logic while parsing the string (shunting-yard). If I can expand the numeric specifier
-			// at this point, I can leave thompson untouched.
-			for i := 0; i < c.startReps; i++ { // Case 1
-				stateToAdd = concatenate(stateToAdd, cloneState(state))
-			}
-			if c.endReps == INFINITE_REPS { // Case 3
-				s2 := kleene(*state)
-				stateToAdd = concatenate(stateToAdd, s2)
-			} else { // Case 2
-				for i := c.startReps; i < c.endReps; i++ {
-					stateToAdd = concatenate(stateToAdd, question(state))
-				}
-			}
-			nfa = append(nfa, stateToAdd)
-		}
-	}
-	if len(nfa) != 1 {
-		panic("ERROR: Invalid Regex.")
-	}
-
-	verifyLastStates(nfa)
-
-	return nfa[0], numGroups
-
-}
-
 func main() {
 	invertFlag := flag.Bool("v", false, "Invert match.")
 	// This flag has two 'modes':
@@ -650,7 +67,6 @@ func main() {
 	var re string
 	re = flag.Args()[0]
 	var test_str string
-	var test_runes []rune // Rune-slice representation of test_str
 	var err error
 	var linesRead bool // Whether or not we have read the lines in the file
 	lineNum := 0       // Current line number
@@ -658,8 +74,11 @@ func main() {
 	reader := bufio.NewReader(os.Stdin)
 	out := bufio.NewWriter(os.Stdout)

-	re_postfix := shuntingYard(re)
-	startState, numGroups := thompson(re_postfix)
+	regComp, err := Compile(re)
+	if err != nil {
+		fmt.Println(err)
+		return
+	}
 	for true {
 		if linesRead {
 			break
@@ -696,12 +115,14 @@ func main() {
 				panic(err)
 			}
 		}
-		test_runes = []rune(test_str)
-		matchIndices := findAllMatches(startState, test_runes, numGroups)
-
-		// If we are trying to print an invalid index, we just assume no specific matches will be printed.
-		if matchNumFlagEnabled && *matchNum > len(matchIndices) {
-			matchNumFlagEnabled = false
+		matchIndices := make([]Match, 0)
+		if matchNumFlagEnabled {
+			tmp, err := findNthMatch(regComp, test_str, *matchNum)
+			if err == nil {
+				matchIndices = append(matchIndices, tmp)
+			}
+		} else {
+			matchIndices = findAllMatches(regComp, test_str)
 		}

 		if *printMatchesFlag {
@@ -711,15 +132,9 @@ func main() {
 				if !(*multiLineFlag) {
 					fmt.Fprintf(out, "Line %d:\n", lineNum)
 				}
-				for i, m := range matchIndices {
-					// Only print a match if:
-					// 	a. We are _not_ printing just one match
-					// 	OR
-					// 	b. We _are_ printing just one match, and this is that match
-					if !matchNumFlagEnabled || (i+1) == *matchNum { // Match indexes start from 1; loop counter starts from 0
+				for _, m := range matchIndices {
 					fmt.Fprintf(out, "%s\n", m.toString())
 				}
-				}
 				err := out.Flush()
 				if err != nil {
 					panic(err)
@@ -742,7 +157,7 @@ func main() {
 			// Find all numbers from 0 to len(test_str) that are NOT in oldIndices.
 			// These are the values we want to print, now that we have inverted the match.
 			// Re-initialize indicesToPrint and add all of these values to it.
-			indicesToPrint.add(setDifference(genRange(0, len(test_runes)), oldIndices)...)
+			indicesToPrint.add(setDifference(genRange(0, len(test_str)), oldIndices)...)

 		}
 		// If lineFlag is enabled, we should only print something if:
@@ -763,10 +178,9 @@ func main() {
 		//			the corresponding end index.
 		// 		3. If not, just print the character.
 		if substituteFlagEnabled {
-			for i := range test_runes {
+			for i := range test_str {
 				inMatchIndex := false
-				for idx, m := range matchIndices {
-					if !matchNumFlagEnabled || (idx+1) == *matchNum {
+				for _, m := range matchIndices {
 					if i == m[0].startIdx {
 						fmt.Fprintf(out, "%s", *substituteText)
 						i = m[0].endIdx
@@ -774,20 +188,13 @@ func main() {
 						break
 					}
 				}
-				}
 				if !inMatchIndex {
-					fmt.Fprintf(out, "%c", test_runes[i])
+					fmt.Fprintf(out, "%c", test_str[i])
 				}
 			}
 		} else {
-			for i, c := range test_runes {
-				// Explanation:
-				// 	We print a letter in red if:
-				// 		1. It is in the 'indicesToPrint'
-				// 		2. One of the following:
-				// 			a. The '-m' flag is disabled
-				// 			b. The '-m' flag is enabled, and our current index is in the bounds of the specific match
-				if indicesToPrint.contains(i) && (!matchNumFlagEnabled || (i >= matchIndices[*matchNum-1][0].startIdx && i < matchIndices[*matchNum-1][0].endIdx)) {
+			for i, c := range test_str {
+				if indicesToPrint.contains(i) {
 					color.New(color.FgRed).Fprintf(out, "%c", c)
 					// Newline after every match - only if -o is enabled and -v is disabled.
 					if *onlyFlag && !(*invertFlag) {
--- a/matching.go
+++ b/matching.go
@@ -138,15 +138,38 @@ func pruneIndices(indices []Match) []Match {
 	return toRet
 }

+// findNthMatch finds the 'n'th match (counting from 1) of the given compiled regex in
+// the given string.
+// It returns an error (!= nil) if n < 1, or if there are fewer than 'n' matches in the string.
+func findNthMatch(regex Reg, str string, n int) (Match, error) {
+	idx := 0
+	matchNum := 0
+	str_runes := []rune(str)
+	var matchFound bool
+	var matchIdx Match
+	for idx <= len(str_runes) {
+		matchFound, matchIdx, idx = findAllMatchesHelper(regex.start, str_runes, idx, regex.numGroups)
+		if matchFound {
+			matchNum++
+			if matchNum == n { // Only compare right after a real match, so n < 1 can never return an unmatched result
+				return matchIdx, nil
+			}
+		}
+	}
+	// We haven't found the nth match after scanning the string - Return an error
+	return nil, fmt.Errorf("Invalid match index. Too few matches found.")
+}
+
 // findAllMatches tries to find all matches of the regex represented by given start-state, with
 // the given string
-func findAllMatches(start *State, str []rune, numGroups int) []Match {
+func findAllMatches(regex Reg, str string) []Match {
 	idx := 0
+	str_runes := []rune(str)
 	var matchFound bool
 	var matchIdx Match
 	indices := make([]Match, 0)
-	for idx <= len(str) {
-		matchFound, matchIdx, idx = findAllMatchesHelper(start, str, idx, numGroups)
+	for idx <= len(str_runes) {
+		matchFound, matchIdx, idx = findAllMatchesHelper(regex.start, str_runes, idx, regex.numGroups)
 		if matchFound {
 			indices = append(indices, matchIdx)
 		}
--- a/nfa.go
+++ b/nfa.go
@@ -122,13 +122,21 @@ func (s State) checkAssertion(str []rune, idx int) bool {
 		// 		2. Run it on a subset of the test string, that ends after the current index in the string
 		// 		3. Based on the kind of lookaround (and the indices we get), determine what action to take.
 		startState := s.lookaroundNFA
-		var strToMatch []rune
+		var runesToMatch []rune
+		var strToMatch string
 		if s.assert == PLA || s.assert == NLA {
-			strToMatch = str[idx:]
+			runesToMatch = str[idx:]
 		} else {
-			strToMatch = str[:idx]
+			runesToMatch = str[:idx]
 		}
-		matchIndices := findAllMatches(startState, strToMatch, startState.lookaroundNumCaptureGroups)
+
+		if len(runesToMatch) == 0 {
+			strToMatch = ""
+		} else {
+			strToMatch = string(runesToMatch)
+		}
+
+		matchIndices := findAllMatches(Reg{startState, startState.lookaroundNumCaptureGroups}, strToMatch)

 		numMatchesFound := 0
 		for _, matchIdx := range matchIndices {
--- a/re_test.go
+++ b/re_test.go
@@ -187,9 +187,11 @@ var groupTests = []struct {
 func TestFindAllMatches(t *testing.T) {
 	for _, test := range reTests {
 		t.Run(test.re+"	"+test.str, func(t *testing.T) {
-			re_postfix := shuntingYard(test.re)
-			startState, numGroups := thompson(re_postfix)
-			matchIndices := findAllMatches(startState, []rune(test.str), numGroups)
+			regComp, err := Compile(test.re)
+			if err != nil {
+				panic(err)
+			}
+			matchIndices := findAllMatches(regComp, test.str)
 			zeroGroups := make([]Group, len(matchIndices))
 			for i, m := range matchIndices {
 				zeroGroups[i] = m[0]
@@ -204,9 +206,11 @@ func TestFindAllMatches(t *testing.T) {
 func TestFindAllGroups(t *testing.T) {
 	for _, test := range groupTests {
 		t.Run(test.re+"	"+test.str, func(t *testing.T) {
-			re_postfix := shuntingYard(test.re)
-			startState, numGroups := thompson(re_postfix)
-			matchIndices := findAllMatches(startState, []rune(test.str), numGroups)
+			regComp, err := Compile(test.re)
+			if err != nil {
+				panic(err)
+			}
+			matchIndices := findAllMatches(regComp, test.str)
 			for i := range matchIndices {
 				for j := range matchIndices[i] {
 					if matchIndices[i][j].isValid() {
Author	SHA1	Message	Date
Rockingcool	24fa365be1	Moved some auxiliary functions into compile.go; use new API for compiling and finding matches	2025-01-06 20:14:57 -06:00
Rockingcool	1da3f7f0e0	Changed API for match-finding functions - take in a Reg instead of start state and numGroups separately	2025-01-06 20:14:19 -06:00
Rockingcool	8e8067482a	Rewrote to use new API for compiling and finding matches	2025-01-06 20:12:18 -06:00
Rockingcool	644ed15af0	Use new API for findAllMatches	2025-01-06 20:10:25 -06:00
Rockingcool	c8613c1ba2	Major restructuring - added new type, changed return types for shuntingYard and thompson. I added a new function 'Compile' that calls shuntingYard and thompson. I also added a new type 'Reg' that this function returns - it represents the starting state and contains the number of capturing groups in the regex. I also rewrote shuntingYard and thompson to return errors instead of panicking.	2025-01-06 20:08:24 -06:00
Rockingcool	ddbcb309b0	Made shuntingYard return an error instead of panicking, moved it and thompson to compile.go	2025-01-06 12:29:04 -06:00
Rockingcool	72263509d3	Rewrote behavior of '-m' flag to use the 'nth match' function from matching.go	2025-01-05 21:41:14 -06:00
Rockingcool	4373d35216	Wrote function to find the 'n'th match of a regex	2025-01-05 21:40:53 -06:00