regex/main.go

package main

import (
	"bufio"
	"flag"
	"fmt"
	"io"
	"os"
	"slices"
	"strconv"
	"unicode"

	"github.com/fatih/color"
)

// CONCAT is the explicit concatenation operator that shuntingYard inserts
// between adjacent operands before converting the regex to postfix form.
const CONCAT = rune('~')

var (
	// notDotChars holds the runes excluded from the '.' metacharacter.
	// main sets it to {'\n'} unless multi-line mode (-t) is enabled.
	notDotChars []rune

	// caseInsensitiveFlag points at the value of the -i command-line flag
	// (case-insensitive mode). It stays nil until main registers the flag.
	caseInsensitiveFlag *bool
)

// isOperator reports whether c is one of the regex operators handled by the
// shunting-yard pass: '+', '?', '*', '|' or the explicit CONCAT marker.
func isOperator(c rune) bool {
	switch c {
	case '+', '?', '*', '|', CONCAT:
		return true
	default:
		return false
	}
}

// priority returns the binding strength of the operator op: a higher value
// binds tighter. Runes that are not operators yield -1.
func priority(op rune) int {
	// Operators listed from loosest to tightest binding; an operator's
	// position in this list is its priority.
	ranking := [...]rune{'|', CONCAT, '+', '*', '?'}
	return slices.Index(ranking[:], op)
}

/*
shuntingYard converts the given infix (regular) expression to postfix using the
Shunting-Yard algorithm, and returns the result as a slice of postfixNodes.
The primary benefit of this is getting rid of parentheses.
It also inserts explicit concatenation operators to make parsing easier in Thompson's algorithm.
See: https://blog.cernera.me/converting-regular-expressions-to-postfix-notation-with-the-shunting-yard-algorithm/

The conversion runs in three passes over the regex:
 1. Numeric ranges (<num1-num2>) are replaced by an equivalent regex, and the
    '(?:' of a non-capturing group is replaced by NONCAPLPAREN_CHAR.
 2. Explicit CONCAT operators are inserted and character classes are expanded.
 3. The Shunting-Yard algorithm proper turns the rune stream into postfixNodes.

The function panics with a descriptive message when the regex is malformed.
*/
func shuntingYard(re string) []postfixNode {
	re_postfix := make([]rune, 0)
	// Convert the string to a slice of runes to allow iteration through it
	re_runes_orig := []rune(re) // This is the rune slice before the first parsing loop (which detects and replaces numeric ranges)
	re_runes := make([]rune, 0)
	// Check for numeric range. If we are at the start of a numeric range,
	// skip to end and construct the equivalent regex for the range.
	// The reason this is outside the loop below, is that it actually modifies
	// the given regex (we 'cut' the numeric range and 'paste' an equivalent regex).
	// It also makes the overall parsing easier, since the later passes don't have to
	// worry about the numeric range anymore.
	// A numeric range has the syntax: <num1-num2>. It matches all numbers in this range.
	//
	// Also check for non-capturing groups. The LPAREN of a non-capturing group looks like this: '(?:'
	// It is taken out, and replaced with a special character - NONCAPLPAREN_CHAR.
	for i := 0; i < len(re_runes_orig); i++ {
		c := re_runes_orig[i]
		// A '<' preceded by '\' is escaped, and one preceded by '?' belongs to a lookbehind.
		if c == '<' && (i == 0 || (re_runes_orig[i-1] != '\\' && re_runes_orig[i-1] != '?')) {
			i++ // Step over opening angle bracket
			tmpStr := ""
			hyphenFound := false
			for i < len(re_runes_orig) && re_runes_orig[i] != '>' {
				if !unicode.IsDigit(re_runes_orig[i]) {
					if re_runes_orig[i] != '-' || (hyphenFound) {
						panic("ERROR: Invalid numeric range.")
					}
				}
				if re_runes_orig[i] == '-' {
					hyphenFound = true
				}
				tmpStr += string(re_runes_orig[i])
				i++
			}
			// End of string reached without finding the closing angle bracket
			if i == len(re_runes_orig) {
				panic("ERROR: Numeric range not closed.")
			}
			if len(tmpStr) == 0 {
				panic("ERROR: Empty numeric range.")
			}
			// Closing bracket will be skipped when the loop variable increments
			var rangeStart int
			var rangeEnd int
			// Both bounds must be present and parseable: without this check, input
			// such as <5>, <-5> or <5-> would silently produce a zero/garbage bound.
			if n, err := fmt.Sscanf(tmpStr, "%d-%d", &rangeStart, &rangeEnd); err != nil || n != 2 {
				panic("ERROR: Invalid numeric range.")
			}
			regex := range2regex(rangeStart, rangeEnd)
			re_runes = append(re_runes, []rune(regex)...)
		} else if c == '(' && i < len(re_runes_orig)-2 && re_runes_orig[i+1] == '?' && re_runes_orig[i+2] == ':' {
			re_runes = append(re_runes, NONCAPLPAREN_CHAR)
			i += 2
		} else {
			re_runes = append(re_runes, c)
		}
	}

	/* 	Add concatenation operators.
	Only add a concatenation operator between two characters if both the following conditions are met:
		1. 	The first character isn't an opening parenthesis or alternation operator (or an escape character)
			a. This makes sense, because these operators can't be _concatenated_ with anything else.
		2. The second character isn't a 'closing operator' - one that applies to something before it
			a. Again, these operators can't be concatenated _to_. They can, however, be concatenated _from_.
	Caveats:
		1. Don't mess with anything inside brackets - character class
		2. Don't mess with anything inside braces - numeric repetition
		3. Don't mess with any lookarounds.
	*/
	i := 0
	for i < len(re_runes) {
		re_postfix = append(re_postfix, re_runes[i])
		if re_runes[i] == '[' && (i == 0 || re_runes[i-1] != '\\') { // We do not touch things inside brackets, unless they are escaped. Inside this block, the only task is to expand character ranges into their constituent characters.
			re_postfix[len(re_postfix)-1] = LBRACKET         // Replace the '[' character with LBRACKET. This allows for easier parsing of all characters (including opening and closing brackets) within the character class
			toAppend := make([]rune, 0)                      // Holds all the runes in the current character class
			if i < len(re_runes)-1 && re_runes[i+1] == '^' { // Inverting class - match everything NOT in brackets
				re_postfix = append(re_postfix, '^')
				i++ // Skip opening bracket and caret
			}
			if i < len(re_runes)-1 && re_runes[i+1] == ']' { // Nothing inside brackets - panic.
				panic("Empty character class.")
			}
			for re_runes[i] != ']' {
				i++ // Skip all characters inside brackets
				if i >= len(re_runes) { // Ran off the end of the regex without seeing ']'
					panic("ERROR: Opening bracket without closing bracket.")
				}
				// TODO: Check for escaped characters

				// Check ahead for character range. The closing bracket itself never starts a
				// range, otherwise a regex like "[a]-b" would be read as the range ']'-'b'.
				if re_runes[i] != ']' && i < len(re_runes)-2 && re_runes[i+1] == '-' {
					rangeStart := re_runes[i]
					rangeEnd := re_runes[i+2]
					if int(rangeEnd) < int(rangeStart) {
						panic("Range is out of order.")
					}

					for r := rangeStart; r <= rangeEnd; r++ {
						toAppend = append(toAppend, r)
					}

					i += 2 // Skip start and hyphen (end will automatically be skipped on next iteration of loop)
					continue
				}
				toAppend = append(toAppend, re_runes[i])
			}
			// Replace the last character (which should have been ']') with RBRACKET
			toAppend[len(toAppend)-1] = RBRACKET
			re_postfix = append(re_postfix, toAppend...)
		}
		if i < len(re_runes) && re_runes[i] == '{' && (i > 0 && re_runes[i-1] != '\\') { // We don't touch things inside braces, either
			i++ // Skip opening brace
			for i < len(re_runes) && re_runes[i] != '}' {
				re_postfix = append(re_postfix, re_runes[i])
				i++
			}
			if i == len(re_runes) {
				panic("Invalid numeric specifier.")
			}
			re_postfix = append(re_postfix, re_runes[i]) // Append closing brace
		}
		if i < len(re_runes)-3 && string(re_runes[i+1:i+4]) == "(?:" { // Non-capturing lparen
			re_postfix = append(re_postfix, NONCAPLPAREN_CHAR)
			i += 3
		}
		if i < len(re_runes) && re_runes[i] == '(' && (i == 0 || re_runes[i-1] != '\\') && (i < len(re_runes)-2 && re_runes[i+1] == '?' && slices.Contains([]rune{'=', '!', '<'}, re_runes[i+2])) { // Unescaped open parenthesis followed by question mark then '<', '!' or '=' => lookaround. Don't mess with it.
			i++ // Step inside
			if i == len(re_runes)-1 || (re_runes[i+1] != '=' && re_runes[i+1] != '!' && re_runes[i+1] != '<') {
				panic("Invalid regex. Lookaround intended?")
			}
			re_postfix = append(re_postfix, re_runes[i])
			i++
			// Copy the lookaround body verbatim, up to (but excluding) its matching ')'.
			numOpenParens := 1
			for numOpenParens != 0 {
				if i >= len(re_runes) {
					panic("Unclosed lookaround.")
				}
				if re_runes[i] == '(' {
					numOpenParens++
				}
				if re_runes[i] == ')' {
					numOpenParens--
					if numOpenParens == 0 {
						break
					}
				}
				re_postfix = append(re_postfix, re_runes[i])
				i++
			}
			// i now sits on the closing ')'; it is appended by the next iteration.
			continue
		}
		if i < len(re_runes) && (re_runes[i] != '(' && re_runes[i] != NONCAPLPAREN_CHAR && re_runes[i] != '|' && re_runes[i] != '\\') || (i > 0 && re_runes[i-1] == '\\') { // Every character should be concatenated if it is escaped
			if i < len(re_runes)-1 {
				if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != '+' && re_runes[i+1] != '?' && re_runes[i+1] != ')' && re_runes[i+1] != '{' {
					re_postfix = append(re_postfix, CONCAT)
				}
			}
		}
		i++
	}

	opStack := make([]rune, 0)         // Operator stack
	outQueue := make([]postfixNode, 0) // Output queue

	// Actual algorithm
	numOpenParens := 0 // Number of open parentheses
	for i := 0; i < len(re_postfix); i++ {
		/* Cases:
		1. Current character is alphanumeric - send to output queue
		2. Current character is operator - do the following:
			a. If current character has greater priority than top of opStack, push to opStack.
			b. If not, keep popping from opStack (and appending to outQueue) until:
				i. opStack is empty, OR
				ii. current character has greater priority than top of opStack
		3. If current character is '(' or NONCAPLPAREN_CHAR, push to opStack
		4. If current character is ')', pop from opStack (and append to outQueue) until '(' is found. Discard parentheses.
		5. If current character is '[', find all the characters until ']', then create a postfixNode containing all these contents. Add this node to outQueue.
		6. If current character is '{', find the appropriate numeric specifier (range start, range end). Apply the range to the postfixNode at the end of outQueue.
		*/
		c := re_postfix[i]
		if isNormalChar(c) {
			if caseInsensitiveFlag != nil && *caseInsensitiveFlag {
				outQueue = append(outQueue, newPostfixNode(allCases(c)...))
			} else {
				outQueue = append(outQueue, newPostfixNode(c))
			}
			continue
		}
		// Escape character
		if c == '\\' { // Escape character - invert special and non-special characters eg. \( is treated as a literal parenthesis, \b is treated as word boundary
			if i == len(re_postfix)-1 { // End of string - panic, because backslash is an escape character (something needs to come after it)
				panic("ERROR: Backslash with no escape character.")
			}
			i++
			outQueue = append(outQueue, newEscapedNode(re_postfix[i]))
			continue // Escaped character will automatically be skipped when loop variable increments
		}

		if c == '.' { // Dot metacharacter - represents 'any' character
			outQueue = append(outQueue, newPostfixDotNode())
			continue
		}
		if c == '^' { // Start-of-string assertion
			outQueue = append(outQueue, newPostfixNode(c))
		}
		if c == '$' { // End-of-string assertion
			outQueue = append(outQueue, newPostfixNode(c))
		}
		// Check if we're at the start of a lookaround
		if c == '(' && i < len(re_postfix)-1 && re_postfix[i+1] == '?' {
			i += 2      // Skip opening paren and question mark
			regex := "" // Stores lookaround regex
			numOpenParens := 1
			for numOpenParens != 0 {
				if i >= len(re_postfix) {
					panic("Unclosed lookaround.")
				}
				if re_postfix[i] == '(' {
					numOpenParens++
				}
				if re_postfix[i] == ')' {
					numOpenParens--
					if numOpenParens == 0 {
						break
					}
				}
				regex += string(re_postfix[i])
				i++
			}
			if len(regex) <= 1 { // Nothing in regex - panic
				panic("Invalid lookaround. (too short?)")
			}
			// 'regex' should now contain the lookaround regex, plus the characters at the start (which indicate pos/neg, ahead/behind)
			// Now we should filter that out.
			toAppend := postfixNode{nodetype: ASSERTION, startReps: 1, endReps: 1}
			if regex[0] == '<' { // Lookbehind
				toAppend.lookaroundDir = LOOKBEHIND
				regex = regex[1:]
			} else if regex[0] == '=' || regex[0] == '!' {
				toAppend.lookaroundDir = LOOKAHEAD
			} else {
				panic("Invalid lookaround.")
			}
			// Positive or negative
			if regex[0] == '=' { // Positive
				toAppend.lookaroundSign = POSITIVE
				toAppend.contents = []rune(regex[1:])
			} else if regex[0] == '!' { // Negative
				toAppend.lookaroundSign = NEGATIVE
				toAppend.contents = []rune(regex[1:])
			} else {
				panic("Invalid lookaround.")
			}
			outQueue = append(outQueue, toAppend)
			continue
		}
		if isOperator(c) {
			if len(opStack) == 0 {
				opStack = append(opStack, c)
			} else {
				topStack, err := peek(opStack)
				if err != nil {
					panic("ERROR: Operator without operand.")
				}
				if priority(c) > priority(topStack) { // 2a
					opStack = append(opStack, c)
				} else {
					for priority(c) <= priority(topStack) { // 2b
						to_append := mustPop(&opStack)
						outQueue = append(outQueue, newPostfixNode(to_append))
						// The error from peek is deliberately ignored here: the loop relies on
						// the value peek returns for an empty stack ranking below every operator.
						topStack, _ = peek(opStack)
					}
					opStack = append(opStack, c)
				}
			}
		}
		if c == LBRACKET { // Used for character classes
			i++ // Step forward so we can look at the character class
			var invertMatch bool
			if re_postfix[i] == '^' {
				invertMatch = true
				i++
			}
			chars := make([]rune, 0) // List of characters -  used only for character classes
			for i < len(re_postfix) {
				if re_postfix[i] == RBRACKET {
					break
				}
				chars = append(chars, re_postfix[i])
				i++
			}
			if i == len(re_postfix) { // We have reached the end of the string, so we didn't encounter a closing bracket. Panic.
				panic("ERROR: Opening bracket without closing bracket.")
			}
			if !invertMatch {
				outQueue = append(outQueue, newPostfixCharNode(chars...))
			} else {
				// Invert match - create an allChars postfixNode, then add the given states to its 'except' list.
				toAdd := newPostfixDotNode()
				toAdd.except = chars
				outQueue = append(outQueue, toAdd)
			}
			continue
		}
		if c == '{' {
			i++ // Skip opening brace
			// Three possibilities:
			// 1. Single number - {5}
			// 2. Range - {3,5}
			// 3. Start with no end, {3,}
			startRange := make([]rune, 0)
			endRange := make([]rune, 0)
			endRangeNum := 0
			for i < len(re_postfix) && unicode.IsDigit(re_postfix[i]) {
				startRange = append(startRange, re_postfix[i])
				i++
			}
			if len(startRange) == 0 { // {} is not valid, neither is {,5}
				panic("ERROR: Invalid numeric specifier.")
			}
			if i == len(re_postfix) {
				panic("ERROR: Brace not closed.")
			}

			startRangeNum, err := strconv.Atoi(string(startRange))
			if err != nil {
				panic(err)
			}

			if re_postfix[i] == '}' { // Case 1 above
				endRangeNum = startRangeNum
			} else {
				if re_postfix[i] != ',' {
					panic("ERROR: Invalid numeric specifier.")
				}
				i++ // Skip comma
				for i < len(re_postfix) && unicode.IsDigit(re_postfix[i]) {
					endRange = append(endRange, re_postfix[i])
					i++
				}
				if i == len(re_postfix) {
					panic("ERROR: Brace not closed.")
				}
				if re_postfix[i] != '}' {
					panic("ERROR: Invalid numeric specifier.")
				}
				if len(endRange) == 0 { // Case 3 above
					endRangeNum = INFINITE_REPS
				} else { // Case 2 above
					var err error
					endRangeNum, err = strconv.Atoi(string(endRange))
					if err != nil {
						panic(err)
					}
				}
			}

			idx := len(outQueue) - 1
			// Get the last added node
			if idx < 0 || outQueue[idx].nodetype == LPAREN {
				panic("Numeric specifier with no content.")
			}
			outQueue[idx].startReps = startRangeNum
			outQueue[idx].endReps = endRangeNum
		}
		if c == '(' || c == NONCAPLPAREN_CHAR {
			opStack = append(opStack, c)
			if c == '(' { // We only push _capturing_ group parentheses to outQueue
				outQueue = append(outQueue, newPostfixNode(c))
			}
			numOpenParens++
		}
		if c == ')' {
			// Keep popping from opStack until we encounter an opening parenthesis or a NONCAPLPAREN_CHAR. Panic if we reach the end of the stack.
			var val rune
			var err error
			for val, err = peek(opStack); val != '(' && val != NONCAPLPAREN_CHAR; val, err = peek(opStack) {
				if err != nil {
					panic("ERROR: Imbalanced parantheses.")
				}
				to_append := mustPop(&opStack)
				outQueue = append(outQueue, newPostfixNode(to_append))
			}
			_ = mustPop(&opStack) // Get rid of opening parenthesis
			if val == '(' {       // Whatever was inside the parentheses was a _capturing_ group, so we append the closing parenthesis as well
				outQueue = append(outQueue, newPostfixNode(')')) // Add closing parenthesis
			}
			numOpenParens--
		}
	}

	// Pop all remaining operators (and append to outQueue)
	for len(opStack) > 0 {
		to_append := mustPop(&opStack)
		outQueue = append(outQueue, newPostfixNode(to_append))
	}

	if numOpenParens != 0 {
		panic("ERROR: Imbalanced parantheses.")
	}

	return outQueue
}

// thompson implements Thompson's construction: it walks the postfix node
// sequence re and assembles a Finite-State Automaton on a stack of partially
// built fragments.
// Returns the start state and the number of capturing groups in the regex.
// Panics if the sequence does not reduce to exactly one fragment, or if a
// numeric specifier has its start greater than its end.
func thompson(re []postfixNode) (*State, int) {
	fragments := make([]*State, 0) // Stack of NFA fragments built so far
	groupCount := 0                // Number of capturing groups seen so far

	// newGroupMarker builds the empty (epsilon) state used to mark the
	// beginning or end of a capturing group. Its only output is itself.
	newGroupMarker := func() *State {
		marker := &State{}
		marker.assert = NONE
		marker.content = newContents(EPSILON)
		marker.isEmpty = true
		marker.output = []*State{marker}
		marker.transitions = make(map[int][]*State)
		return marker
	}

	for _, node := range re {
		switch node.nodetype {
		case CHARACTER, ASSERTION:
			st := &State{}
			st.transitions = make(map[int][]*State)
			if node.allChars {
				st.allChars = true
				if len(node.except) != 0 {
					st.except = append([]rune{}, node.except...) // Copy, so the state doesn't alias the node's slice
				}
			}
			st.content = rune2Contents(node.contents)
			st.output = []*State{st}
			st.isEmpty = false
			if node.nodetype == ASSERTION {
				// An assertion carries the 'isEmpty' flag and EPSILON content even though a
				// lookaround does have contents (its regex): the flag is what the rest of the
				// error-checking relies on, and an assertion says nothing about string content.
				st.isEmpty = true
				st.content = newContents(EPSILON)
				if node.lookaroundDir == 0 || node.lookaroundSign == 0 {
					// Plain assertion - identified by its first content rune
					switch node.contents[0] {
					case '^':
						st.assert = SOS
					case '$':
						st.assert = EOS
					case 'b':
						st.assert = WBOUND
					case 'B':
						st.assert = NONWBOUND
					}
				} else {
					// Lookaround - pick the assertion kind from direction and sign, then
					// compile the lookaround's own regex into a nested NFA.
					st.lookaroundRegex = string(node.contents)
					switch {
					case node.lookaroundDir == LOOKAHEAD && node.lookaroundSign == POSITIVE:
						st.assert = PLA
					case node.lookaroundDir == LOOKAHEAD && node.lookaroundSign == NEGATIVE:
						st.assert = NLA
					case node.lookaroundDir == LOOKBEHIND && node.lookaroundSign == POSITIVE:
						st.assert = PLB
					case node.lookaroundDir == LOOKBEHIND && node.lookaroundSign == NEGATIVE:
						st.assert = NLB
					}
					innerPostfix := shuntingYard(st.lookaroundRegex)
					innerStart, innerGroups := thompson(innerPostfix)
					st.lookaroundNFA = innerStart
					st.lookaroundNumCaptureGroups = innerGroups
				}
			}
			fragments = append(fragments, st)

		case LPAREN:
			// Group-begin markers are simply pushed; they are consumed by the matching RPAREN.
			opener := newGroupMarker()
			groupCount++
			opener.groupBegin = true
			opener.groupNum = groupCount
			fragments = append(fragments, opener)
			continue

		case RPAREN:
			// Assumes the top two fragments are the group's body and, below it, the LPAREN
			// marker. The three pieces (LPAREN, body, RPAREN) are joined into one fragment.
			closer := newGroupMarker()
			closer.groupEnd = true
			body := mustPop(&fragments)
			opener := mustPop(&fragments)
			closer.groupNum = opener.groupNum
			opened := concatenate(opener, body)
			fragments = append(fragments, concatenate(opened, closer))

		case CONCATENATE:
			right := mustPop(&fragments)
			left := mustPop(&fragments)
			joined := concatenate(left, right)
			fragments = append(fragments, joined)

		case KLEENE:
			operand := mustPop(&fragments)
			starred := kleene(*operand)
			fragments = append(fragments, starred)

		case PLUS: // a+ is equivalent to aa*
			operand := mustPop(&fragments)
			starred := kleene(*operand)
			joined := concatenate(operand, starred)
			fragments = append(fragments, joined)

		case QUESTION: // ab? is equivalent to a(b|)
			operand := mustPop(&fragments)
			optional := question(operand)
			fragments = append(fragments, optional)

		case PIPE:
			first := mustPop(&fragments)
			second := mustPop(&fragments)
			either := alternate(first, second)
			fragments = append(fragments, either)
		}

		// Anything other than exactly-once repetition means a numeric specifier is attached.
		if node.startReps == 1 && node.endReps == 1 {
			continue
		}
		// NOTE(review): -1 here presumably matches INFINITE_REPS - confirm against its definition.
		if node.endReps != -1 && node.endReps < node.startReps {
			panic("ERROR: Numeric specifier - start greater than end.")
		}
		base := mustPop(&fragments)
		var repeated *State
		// The expansion relies on these equivalences:
		// a{5} == aaaaa
		// a{3,5} == aaaa?a?
		// a{5,} == aaaaa+
		// The mandatory copies are clones of the popped fragment, because concatenate
		// works on addresses and the same fragment cannot be chained to itself.
		for n := 0; n < node.startReps; n++ { // Mandatory repetitions
			repeated = concatenate(repeated, cloneState(base))
		}
		if node.endReps == INFINITE_REPS { // Open-ended: append a Kleene star
			unbounded := kleene(*base)
			repeated = concatenate(repeated, unbounded)
		} else { // Bounded: append the optional repetitions
			for n := node.startReps; n < node.endReps; n++ {
				repeated = concatenate(repeated, question(base))
			}
		}
		fragments = append(fragments, repeated)
	}
	if len(fragments) != 1 {
		panic("ERROR: Invalid Regex.")
	}

	verifyLastStates(fragments)

	return fragments[0], groupCount
}

// main is the command-line entry point. It parses the flags, compiles the
// regex given as the single positional argument (shuntingYard followed by
// thompson), then reads stdin and prints it with the matches highlighted,
// substituted, listed or inverted, depending on the flags.
//
// Input is processed one line at a time, or as a single string in multi-line
// mode (-t). The program exits with status 22 if the regex argument is missing
// and panics on invalid flag values or I/O errors.
func main() {
	invertFlag := flag.Bool("v", false, "Invert match.")
	// This flag has two 'modes':
	// 1. Without '-v': Prints only matches. Prints a newline after every match.
	// 2. With '-v': Substitutes all matches with empty string.
	onlyFlag := flag.Bool("o", false, "Print only colored content. Overrides -l.")
	lineFlag := flag.Bool("l", false, "Only print lines with a match (or with no matches, if -v is enabled). Similar to grep's default.")
	multiLineFlag := flag.Bool("t", false, "Multi-line mode. Treats newline just like any character.")
	printMatchesFlag := flag.Bool("p", false, "Prints start and end index of each match. Can only be used with '-t' for multi-line mode.")
	caseInsensitiveFlag = flag.Bool("i", false, "Case-insensitive. Disregard the case of all characters.")
	// NOTE(review): -m is validated below but its value is not otherwise used in this function.
	matchNum := flag.Int("m", 0, "Print the match with the given index. Eg. -m 3 prints the third match.")
	substituteText := flag.String("s", "", "Substitute the contents of each match with the given string. Overrides -o and -v")
	flag.Parse()

	// In multi-line mode, 'dot' metacharacter also matches newline
	if !(*multiLineFlag) {
		notDotChars = []rune{'\n'}
	} else {
		notDotChars = []rune{}
	}
	// -l and -o are mutually exclusive: -o overrides -l
	if *onlyFlag {
		*lineFlag = false
	}
	// Check if substitute and matchNum flags have been enabled. flag.Visit only
	// visits flags that were actually set, which distinguishes `-s ""` from no -s.
	substituteFlagEnabled := false
	matchNumFlagEnabled := false
	flag.Visit(func(f *flag.Flag) {
		if f.Name == "s" {
			substituteFlagEnabled = true
		}
		if f.Name == "m" {
			matchNumFlagEnabled = true
		}
	})

	// Validate matchNumFlag - must be positive integer
	if matchNumFlagEnabled && *matchNum < 1 {
		panic("Invalid match number to print.")
	}

	// Process:
	// 1. Convert regex into postfix notation (Shunting-Yard algorithm)
	// 		a. Add explicit concatenation operators to facilitate this
	// 2. Build NFA from postfix representation (Thompson's algorithm)
	// 3. Run the string against the NFA

	if flag.NArg() != 1 { // Positional args only - the program name is already stripped
		fmt.Println("ERROR: Missing cmdline args")
		os.Exit(22)
	}
	re := flag.Arg(0)
	var testStr string
	var testRunes []rune // Rune-slice representation of testStr
	var err error
	linesRead := false // Whether or not we have read all the lines of input
	lineNum := 0       // Current line number
	// Create reader for stdin and writer for stdout
	reader := bufio.NewReader(os.Stdin)
	out := bufio.NewWriter(os.Stdout)
	red := color.New(color.FgRed) // Created once, instead of once per highlighted rune

	rePostfix := shuntingYard(re)
	startState, numGroups := thompson(rePostfix)
	for !linesRead {
		if !(*multiLineFlag) {
			// Read every string from stdin until we encounter an error. If the error isn't EOF, panic.
			testStr, err = reader.ReadString('\n')
			lineNum++
			if err != nil {
				if err != io.EOF {
					panic(err)
				}
				linesRead = true
				if len(testStr) == 0 {
					// EOF with nothing read: the input was empty or ended with a newline.
					// There is no further line, so don't process a phantom empty one.
					break
				}
			}
			if len(testStr) > 0 && testStr[len(testStr)-1] == '\n' {
				testStr = testStr[:len(testStr)-1]
			}
		} else {
			// Multi-line mode - read every line of input into a temp. string.
			// testStr will contain all lines of input (including newline characters)
			// as one string.
			var temp string
			for temp, err = reader.ReadString('\n'); err == nil; temp, err = reader.ReadString('\n') {
				testStr += temp
			}
			if err != io.EOF {
				panic(err)
			}
			testStr += temp // Add the last (unterminated) line; a no-op if it is empty
			linesRead = true
		}
		testRunes = []rune(testStr)
		matchIndices := findAllMatches(startState, testRunes, numGroups)
		if *printMatchesFlag {
			// if we are in single line mode, print the line on which
			// the matches occur
			if len(matchIndices) > 0 {
				if !(*multiLineFlag) {
					fmt.Fprintf(out, "Line %d:\n", lineNum)
				}
				for _, idx := range matchIndices {
					fmt.Fprintf(out, "%s\n", idx.toString())
				}
				if err := out.Flush(); err != nil {
					panic(err)
				}
			}
			continue
		}
		// Decompose the array of matchIndex structs into a flat unique array of ints - if matchIndex is {4,7}, flat array will contain 4,5,6
		// This should make checking O(1) instead of O(n)
		indicesToPrint := new_uniq_arr[int]()
		for _, idx := range matchIndices {
			indicesToPrint.add(genRange(idx[0].startIdx, idx[0].endIdx)...)
		}
		// If we are inverting, then we should print the indices which _didn't_ match
		// in color.
		if *invertFlag {
			oldIndices := indicesToPrint.values()
			indicesToPrint = new_uniq_arr[int]()
			// Explanation:
			// Find all numbers from 0 to len(testRunes) that are NOT in oldIndices.
			// These are the values we want to print, now that we have inverted the match.
			// Re-initialize indicesToPrint and add all of these values to it.
			indicesToPrint.add(setDifference(genRange(0, len(testRunes)), oldIndices)...)
		}
		// If lineFlag is enabled, we should only print something if:
		// 		a. We are not inverting, and have at least one match on the current line
		// 		OR
		// 		b. We are inverting, and have no matches at all on the current line.
		// This checks for the inverse, and continues if it is true.
		if *lineFlag {
			if (!(*invertFlag) && len(matchIndices) == 0) || (*invertFlag && len(matchIndices) > 0) {
				continue
			}
		}

		// If we are substituting, we need a different behavior, as follows:
		// For every character in the test string:
		// 		1. Check if the index is the start of any matchIndex
		// 		2. If so, print the substitute text, and set our index to
		//			the corresponding end index.
		// 		3. If not, just print the character.
		if substituteFlagEnabled {
			// An explicit index loop is required here: assigning to the variable of a
			// `for i := range` loop does not change which element comes next, so the
			// remainder of each match would still be printed after the substitute text.
			for i := 0; i < len(testRunes); {
				substituted := false
				for _, idx := range matchIndices {
					if i != idx[0].startIdx {
						continue
					}
					fmt.Fprintf(out, "%s", *substituteText)
					substituted = true
					if idx[0].endIdx > i {
						i = idx[0].endIdx // endIdx is one past the last matched index
					} else {
						// Zero-length match: it consumes nothing, so the current character
						// is still printed. Advancing also guarantees the loop terminates.
						fmt.Fprintf(out, "%c", testRunes[i])
						i++
					}
					break
				}
				if !substituted {
					fmt.Fprintf(out, "%c", testRunes[i])
					i++
				}
			}
		} else {
			for i, c := range testRunes {
				if indicesToPrint.contains(i) {
					red.Fprintf(out, "%c", c)
					// Newline after every match - only if -o is enabled and -v is disabled.
					if *onlyFlag && !(*invertFlag) {
						for _, idx := range matchIndices {
							if i+1 == idx[0].endIdx { // End index is one more than last index of match
								fmt.Fprintf(out, "\n")
								break
							}
						}
					}
				} else {
					if !(*onlyFlag) {
						fmt.Fprintf(out, "%c", c)
					}
				}
			}
		}
		err = out.Flush()
		if err != nil {
			panic(err)
		}
		fmt.Println()
	}
}