6 changed files with 61 additions and 68 deletions
--- a/main.go
+++ b/main.go
@ -51,9 +51,6 @@ func shuntingYard(re string) []postfixNode {
 	// Eventually, I might be able to add it into the main parsing loop, to reduce the time
 	// complexity.
 	// A numeric range has the syntax: <num1-num2>. It matches all numbers in this range.
 	//
 	// Also check for non-capturing groups. The LPAREN of a non-capturing group looks like this: '(?:'
 	// I take this out, and put in a special character - NONCAPLPAREN_CHAR.
 	for i := 0; i < len(re_runes_orig); i++ {
 		c := re_runes_orig[i]
 		if c == '<' && (i == 0 || (re_runes_orig[i-1] != '\\' && re_runes_orig[i-1] != '?')) {
@ -85,9 +82,7 @@ func shuntingYard(re string) []postfixNode {
 			fmt.Sscanf(tmpStr, "%d-%d", &rangeStart, &rangeEnd)
 			regex := range2regex(rangeStart, rangeEnd)
 			re_runes = append(re_runes, []rune(regex)...)
-		} else if c == '(' && i < len(re_runes_orig)-2 && re_runes_orig[i+1] == '?' && re_runes_orig[i+2] == ':' {
+
 			re_runes = append(re_runes, NONCAPLPAREN_CHAR)
 			i += 2
 		} else {
 			re_runes = append(re_runes, c)
 		}
@ -153,11 +148,7 @@ func shuntingYard(re string) []postfixNode {
 			}
 			re_postfix = append(re_postfix, re_runes[i]) // Append closing brace
 		}
-		if i < len(re_runes)-3 && string(re_runes[i+1:i+4]) == "(?:" { // Non-capturing lparen
+		if i < len(re_runes) && re_runes[i] == '(' && (i == 0 || re_runes[i-1] != '\\') && (i < len(re_runes)-1 && re_runes[i+1] == '?') { // Unescaped open parentheses followed by question mark = lokaround. Don't mess with it.
 			re_postfix = append(re_postfix, NONCAPLPAREN_CHAR)
 			i += 3
 		}
 		if i < len(re_runes) && re_runes[i] == '(' && (i == 0 || re_runes[i-1] != '\\') && (i < len(re_runes)-2 && re_runes[i+1] == '?' && slices.Contains([]rune{'=', '!', '<'}, re_runes[i+2])) { // Unescaped open parentheses followed by question mark then '<', '!' or '=' => lokaround. Don't mess with it.
 			i++ // Step inside
 			if i == len(re_runes)-1 || (re_runes[i+1] != '=' && re_runes[i+1] != '!' && re_runes[i+1] != '<') {
 				panic("Invalid regex. Lookaround intended?")
@ -183,7 +174,7 @@ func shuntingYard(re string) []postfixNode {
 			}
 			continue
 		}
-		if i < len(re_runes) && (re_runes[i] != '(' && re_runes[i] != NONCAPLPAREN_CHAR && re_runes[i] != '|' && re_runes[i] != '\\') || (i > 0 && re_runes[i-1] == '\\') { // Every character should be concatenated if it is escaped
+		if i < len(re_runes) && (re_runes[i] != '(' && re_runes[i] != '|' && re_runes[i] != '\\') || (i > 0 && re_runes[i-1] == '\\') { // Every character should be concatenated if it is escaped
 			if i < len(re_runes)-1 {
 				if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != '+' && re_runes[i+1] != '?' && re_runes[i+1] != ')' && re_runes[i+1] != '{' {
 					re_postfix = append(re_postfix, CONCAT)
@ -206,7 +197,7 @@ func shuntingYard(re string) []postfixNode {
 			b. If not, keep popping from opStack (and appending to outQueue) until:
 				i. opStack is empty, OR
 				ii. current character has greater priority than top of opStack
-		3. If current character is '(' or NONCAPLPAREN_CHAR, push to opStack
+		3. If current character is '(', push to opStack
 		4. If current character is ')', pop from opStack (and append to outQueue) until '(' is found. Discard parentheses.
 		5. If current character is '[', find all the characters until ']', then create a postfixNode containing all these contents. Add this node to outQueue.
 		6. If current character is '{', find the appropriate numeric specifier (range start, range end). Apply the range to the postfixNode at the end of outQueue.
@ -391,25 +382,24 @@ func shuntingYard(re string) []postfixNode {
 			}
 			idx := len(outQueue) - 1
-			// Get the last added node
+			// Get the most recently added non-paren node
-			if idx < 0 || outQueue[idx].nodetype == LPAREN {
+			for node := outQueue[idx]; idx >= 0 && (node.nodetype == RPAREN || node.nodetype == LPAREN); node = outQueue[idx] {
 				idx--
 			}
 			if idx < 0 {
 				panic("Numeric specifier with no content.")
 			}
 			outQueue[idx].startReps = startRangeNum
 			outQueue[idx].endReps = endRangeNum
 		}
-		if c == '(' || c == NONCAPLPAREN_CHAR {
+		if c == '(' {
 			opStack = append(opStack, c)
 			if c == '(' { // We only push _capturing_ group parentheses to outQueue
 			outQueue = append(outQueue, newPostfixNode(c))
 			}
 			numOpenParens++
 		}
 		if c == ')' {
-			// Keep popping from opStack until we encounter an opening parantheses or a NONCAPLPAREN_CHAR. Panic if we reach the end of the stack.
+			// Keep popping from opStack until we encounter an opening parenthesis. Panic if we reach the end of the stack.
-			var val rune
+			for val, err := peek(opStack); val != '('; val, err = peek(opStack) {
 			var err error
 			for val, err = peek(opStack); val != '(' && val != NONCAPLPAREN_CHAR; val, err = peek(opStack) {
 				if err != nil {
 					panic("ERROR: Imbalanced parantheses.")
 				}
@ -417,9 +407,7 @@ func shuntingYard(re string) []postfixNode {
 				outQueue = append(outQueue, newPostfixNode(to_append))
 			}
 			_ = mustPop(&opStack)                            // Get rid of opening parentheses
 			if val == '(' {       // Whatever was inside the parentheses was a _capturing_ group, so we append the closing parentheses as well
 			outQueue = append(outQueue, newPostfixNode(')')) // Add closing parentheses
 			}
 			numOpenParens--
 		}
 	}
@ -606,7 +594,6 @@ func main() {
 	multiLineFlag := flag.Bool("t", false, "Multi-line mode. Treats newline just like any character.")
 	printMatchesFlag := flag.Bool("p", false, "Prints start and end index of each match. Can only be used with '-t' for multi-line mode.")
 	caseInsensitiveFlag = flag.Bool("i", false, "Case-insensitive. Disregard the case of all characters.")
 	matchNum := flag.Int("m", 0, "Print the match with the given index. Eg. -m 3 prints the third match.")
 	substituteText := flag.String("s", "", "Substitute the contents  of each match with the given string. Overrides -o and -v")
 	flag.Parse()
@ -620,23 +607,14 @@ func main() {
 	if *onlyFlag {
 		*lineFlag = false
 	}
-	// Check if substitute and matchNum flags have been enabled
+	// Check if substitute text has been enabled
 	substituteFlagEnabled := false
 	matchNumFlagEnabled := false
 	flag.Visit(func(f *flag.Flag) {
 		if f.Name == "s" {
 			substituteFlagEnabled = true
 		}
 		if f.Name == "m" {
 			matchNumFlagEnabled = true
 		}
 	})
 	// Validate matchNumFlag - must be positive integer
 	if matchNumFlagEnabled && *matchNum < 1 {
 		panic("Invalid match number to print.")
 	}
 	// Process:
 	// 1. Convert regex into postfix notation (Shunting-Yard algorithm)
 	// 		a. Add explicit concatenation operators to facilitate this
--- a/matching.go
+++ b/matching.go
@ -60,7 +60,8 @@ func (g Group) isValid() bool {
 // takeZeroState takes the 0-state (if such a transition exists) for all states in the
 // given slice. It returns the resulting states. If any of the resulting states is a 0-state,
 // the second ret val is true.
-// If a state begins or ends a capturing group, its 'thread' is updated to contain the correct index.
+// The third ret val is a list of all the group numbers of all the opening parentheses we crossed,
 // and the fourth is a list of all the closing parentheses we passed
 func takeZeroState(states []*State, numGroups int, idx int) (rtv []*State, isZero bool) {
 	for _, state := range states {
 		if len(state.transitions[EPSILON]) > 0 {
@ -93,7 +94,11 @@ func takeZeroState(states []*State, numGroups int, idx int) (rtv []*State, isZer
 // from any of the given states, given the string and our position in it.
 // It uses the same algorithm to find zero-states as the one inside the loop,
 // so I should probably put it in a function.
-func zeroMatchPossible(str []rune, idx int, numGroups int, states ...*State) bool {
+// It also returns all the capturing groups that both begin and end at the current index.
 // This is because, by definition, zero-states don't move forward in the string.
 func zeroMatchPossible(str []rune, idx int, numGroups int, states ...*State) (bool, []int, []int) {
 	allOpenParenGroups := make([]int, 0)
 	allCloseParenGroups := make([]int, 0)
 	zeroStates, isZero := takeZeroState(states, numGroups, idx)
 	tempstates := make([]*State, 0, len(zeroStates)+len(states))
 	tempstates = append(tempstates, states...)
@ -108,10 +113,10 @@ func zeroMatchPossible(str []rune, idx int, numGroups int, states ...*State) boo
 	}
 	for _, state := range tempstates {
 		if state.isEmpty && (state.assert == NONE || state.checkAssertion(str, idx)) && state.isLast {
-			return true
+			return true, allOpenParenGroups, allCloseParenGroups
 		}
 	}
-	return false
+	return false, allOpenParenGroups, allCloseParenGroups
 }
 // Prunes the slice by removing overlapping indices.
@ -168,6 +173,28 @@ func findAllMatchesHelper(start *State, str []rune, offset int, numGroups int) (
 		// The second value here shouldn't be used, because we should exit when the third return value is greater than len(str)
 		return false, []Group{}, offset
 	}
 	// 'Base case' - if we are at the end of the string, check if we can add a zero-length match
 	if offset == len(str) {
 		// Get all zero-state matches. If we can get to a zero-state without matching anything, we
 		// can add a zero-length match. This is all true only if the start state itself matches nothing.
 		if start.isEmpty {
 			to_return := newMatch(numGroups + 1)
 			if start.groupBegin {
 				to_return[start.groupNum].startIdx = offset
 			}
 			if ok, openGrps, closeGrps := zeroMatchPossible(str, offset, numGroups, start); ok {
 				for _, gIdx := range openGrps {
 					to_return[gIdx].startIdx = offset
 				}
 				for _, gIdx := range closeGrps {
 					to_return[gIdx].endIdx = offset
 				}
 				to_return[0] = Group{offset, offset}
 				return true, to_return, offset + 1
 			}
 		}
 		return false, []Group{}, offset + 1
 	}
 	// Hold a list of match indices for the current run. When we
 	// can no longer find a match, the match with the largest range is
@ -302,7 +329,7 @@ func findAllMatchesHelper(start *State, str []rune, offset int, numGroups int) (
 		// Check if we can find a zero-length match
 		if foundPath == false {
-			if ok := zeroMatchPossible(str, i, numGroups, currentStates...); ok {
+			if ok, _, _ := zeroMatchPossible(str, i, numGroups, currentStates...); ok {
 				if tempIndices[0].isValid() == false {
 					tempIndices[0] = Group{startIdx, startIdx}
 				}
@ -348,7 +375,7 @@ func findAllMatchesHelper(start *State, str []rune, offset int, numGroups int) (
 	for _, state := range currentStates {
 		// Only add the match if the start index is in bounds. If the state has an assertion,
 		// make sure the assertion checks out.
-		if state.isLast && i <= len(str) {
+		if state.isLast && startIdx < len(str) {
 			if state.assert == NONE || state.checkAssertion(str, i) {
 				for j := 1; j < numGroups+1; j++ {
 					tempIndices[j] = state.threadGroups[j]
--- a/misc.go
+++ b/misc.go
@ -13,7 +13,6 @@ var RBRACKET rune = 0xF0001
 var ANY_CHAR rune = 0xF0002    // Represents any character - used for states where the allChars flag is on.
 var LPAREN_CHAR rune = 0xF0003 // Parentheses in regex are concatenated with this - it acts as a pseudio-parentheses
 var RPAREN_CHAR rune = 0xF0004
 var NONCAPLPAREN_CHAR rune = 0xF0005 // Represents a non-capturing group's LPAREN
 // Returns true if str[idx] and str[idx-1] are separated by a word boundary.
 func isWordBoundary(str []rune, idx int) bool {
@ -27,7 +26,7 @@ func isWordBoundary(str []rune, idx int) bool {
 func isNormalChar(c rune) bool {
 	specialChars := []rune(`?*\^${}()+|[].~<>`)
-	specialChars = append(specialChars, LBRACKET, RBRACKET, NONCAPLPAREN_CHAR)
+	specialChars = append(specialChars, LBRACKET, RBRACKET)
 	return !slices.Contains(specialChars, c)
 }
--- a/nfa.go
+++ b/nfa.go
@ -28,6 +28,7 @@ type State struct {
 	transitions                map[int][]*State // Transitions to different states (maps a character (int representation) to a _list of states. This is useful if one character can lead multiple states eg. ab|aa)
 	isKleene                   bool             // Identifies whether current node is a 0-state representing Kleene star
 	assert                     assertType       // Type of assertion of current node - NONE means that the node doesn't assert anything
 	zeroMatchFound             bool             // Whether or not the state has been used for a zero-length match - only relevant for zero states
 	allChars                   bool             // Whether or not the state represents all characters (eg. a 'dot' metacharacter). A 'dot' node doesn't store any contents directly, as it would take up too much space
 	except                     []rune           // Only valid if allChars is true - match all characters _except_ the ones in this block. Useful for inverting character classes.
 	lookaroundRegex            string           // Only for lookaround states - Contents of the regex that the lookaround state holds
@ -36,8 +37,6 @@ type State struct {
 	groupBegin                 bool             // Whether or not the node starts a capturing group
 	groupEnd                   bool             // Whether or not the node ends a capturing group
 	groupNum                   int              // Which capturing group the node starts / ends
 	// The following properties depend on the current match - I should think about resetting them for every match.
 	zeroMatchFound bool    // Whether or not the state has been used for a zero-length match - only relevant for zero states
 	threadGroups               []Group          // Assuming that a state is part of a 'thread' in the matching process, this array stores the indices of capturing groups in the current thread. As matches are found for this state, its groups will be copied over.
 }
@ -119,21 +118,15 @@ func (s State) checkAssertion(str []rune, idx int) bool {
 	if s.isLookaround() {
 		// The process here is simple:
 		// 		1. Compile the regex stored in the state's contents.
-		// 		2. Run it on a subset of the test string, that ends after the current index in the string
+		// 		2. Run it on the test string.
 		// 		3. Based on the kind of lookaround (and the indices we get), determine what action to take.
 		startState := s.lookaroundNFA
-		var strToMatch []rune
+		matchIndices := findAllMatches(startState, str, startState.lookaroundNumCaptureGroups)
 		if s.assert == PLA || s.assert == NLA {
 			strToMatch = str[idx:]
 		} else {
 			strToMatch = str[:idx]
 		}
 		matchIndices := findAllMatches(startState, strToMatch, startState.lookaroundNumCaptureGroups)
 		numMatchesFound := 0
 		for _, matchIdx := range matchIndices {
-			if s.assert == PLA || s.assert == NLA { // Lookahead - return true (or false) if at least one match starts at 0. Zero is used because the test-string _starts_ from idx.
+			if s.assert == PLA || s.assert == NLA { // Lookahead - return true (or false) if at least one match starts at the current index
-				if matchIdx[0].startIdx == 0 {
+				if matchIdx[0].startIdx == idx {
 					numMatchesFound++
 				}
 			}
--- a/re_test.go
+++ b/re_test.go
@ -149,13 +149,14 @@ var reTests = []struct {
 	{"^((3[7-9])|([4-9][0-9])|([1-9][0-9][0-9])|(1000))$", "400", []Group{{0, 3}}},
 	{"^((3[7-9])|([4-9][0-9])|([1-9][0-9][0-9])|(1000))$", "4000", []Group{}},
-	// Lookaround tests
+	// Todo - add lookaround tests
 	{"(?<=bo)y", "boy", []Group{{2, 3}}},
 	{"bo(?=y)", "boy", []Group{{0, 2}}},
 	{"(?<=f)f+(?=f)", "fffff", []Group{{1, 4}}},
 	{"(?<=f)f+(?=f)", "fffffa", []Group{{1, 4}}},
 	// Todo - add numeric range tests
 	// Todo - add capturing group tests
 	{"(?<=f)f+(?=f)", "fffff", []Group{{1, 4}}},
 	{"(?<=f)f+(?=f)", "fffffa", []Group{{1, 4}}},
 }
 var groupTests = []struct {
@ -178,10 +179,6 @@ var groupTests = []struct {
 	{"(aaa)|(aaaa)", "aaaa", []Match{[]Group{{0, 4}, {-1, -1}, {0, 4}}}},
 	{"(aaaa)|(aaa)", "aaaa", []Match{[]Group{{0, 4}, {0, 4}, {-1, -1}}}},
 	{"(a)|(aa)", "aa", []Match{[]Group{{0, 2}, {-1, -1}, {0, 2}}}},
 	{"(a?)a?", "b", []Match{[]Group{{0, 0}, {0, 0}}, []Group{{1, 1}, {1, 1}}}},
 	{"(a?)a?", "ab", []Match{[]Group{{0, 1}, {0, 1}}, []Group{{1, 1}, {1, 1}}, []Group{{2, 2}, {2, 2}}}},
 	{"(a?)a?", "aa", []Match{[]Group{{0, 2}, {0, 1}}, []Group{{2, 2}, {2, 2}}}},
 	{"a((b.d){3})", "abfdbhdbid", []Match{[]Group{{0, 10}, {1, 10}, {7, 10}}}},
 }
 func TestFindAllMatches(t *testing.T) {
--- a/todo.txt
+++ b/todo.txt
@ -3,4 +3,3 @@
 3. Fix adding concatenation operators in shunting-yard function (very janky, compares against operators individually)
 Ideas for flags:
    -m <num> : Print <num>th match (-m 1 = first match, -m 2 = second match)
    -g <num> : Print the <num>th group