Added more descriptive comments; throw error if non-greedy operator is used; use new definition for kleene()

2025-01-29 10:28:18 -05:00
parent ecab7cc522
commit 833dd269a8
1 changed files with 39 additions and 24 deletions
--- a/compile.go
+++ b/compile.go
@@ -112,23 +112,30 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
 	// Convert the string to a slice of runes to allow iteration through it
 	re_runes_orig := []rune(re) // This is the rune slice before the first parsing loop (which detects and replaces numeric ranges)
 	re_runes := make([]rune, 0)
-	// Check for numeric range. If we are at the start of a numeric range,
-	// skip to end and construct the equivalent regex for the range.
-	// The reason this is outside the loop below, is that it actually modifies
-	// the given regex (we 'cut' the numeric range and 'paste' an equivalent regex).
-	// It also makes the overall parsing easier, since I don't have to worry about the numeric range
-	// anymore.
-	// Eventually, I might be able to add it into the main parsing loop, to reduce the time
-	// complexity.
-	// A numeric range has the syntax: <num1-num2>. Ir matches all numbers in this range.
+	// The following checks are performed here:
+	// 	1. 	Check for numeric range. If we are at the start of a numeric range,
+	// 		skip to end and construct the equivalent regex for the range.
+	// 		The reason this is outside the loop below, is that it actually modifies
+	// 		the given regex (we 'cut' the numeric range and 'paste' an equivalent regex).
+	// 		It also makes the overall parsing easier, since I don't have to worry about the numeric range
+	// 		anymore.
+	// 		Eventually, I might be able to add it into the main parsing loop, to reduce the time
+	// 		complexity.
+	// 		A numeric range has the syntax: <num1-num2>. It matches all numbers in this range.
 	//
-	// Also check for non-capturing groups. The LPAREN of a non-capturing group looks like this: '(?:'
-	// I take this out, and put in a special character - NONCAPLPAREN_CHAR.
+	// 	2. 	Check for non-capturing groups. The LPAREN of a non-capturing group looks like this: '(?:'
+	// 		I take this out, and put in a special character - NONCAPLPAREN_CHAR.
 	//
-	// Another check is made for unescaped brackets - opening brackets are replaced with LBRACKET and closing brackets are replaced with RBRACKET.
-	// Finally, check for escaped backslashes. Replace these with the BACKSLASH metacharacter. Later, in thompson(),
-	// these will be converted back. This avoids confusiuon in detecting whether a character is escaped eg. detecting
-	// whether '\\[a]' has an escaped opening bracket (it doesn't).
+	// 	3. 	Another check is made for unescaped brackets - opening brackets are replaced with
+	//		LBRACKET and closing brackets are replaced with RBRACKET.
+	//
+	// 	4. 	Check for escaped backslashes. Replace these with the BACKSLASH
+	//		metacharacter. Later, in thompson(), these will be converted back. This avoids
+	//		confusion in detecting whether a character is escaped, e.g. detecting
+	// 		whether '\\[a]' has an escaped opening bracket (it doesn't).
+	//
+	// 	5. 	Check for non-greedy operators. These are not supported at the moment, so an error
+	// 		must be thrown if the user attempts to use a non-greedy operator.
 	for i := 0; i < len(re_runes_orig); i++ {
 		c := re_runes_orig[i]
 		if c == '<' && (i == 0 || (re_runes_orig[i-1] != '\\' && re_runes_orig[i-1] != '?')) {
@@ -172,6 +179,8 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
 		} else if c == ']' && (i == 0 || re_runes[len(re_runes)-1] != '\\') {
 			re_runes = append(re_runes, RBRACKET)
 			continue
+		} else if slices.Contains([]rune{'+', '*', '?'}, c) && (i < len(re_runes_orig)-1 && re_runes_orig[i+1] == '?') {
+			return nil, fmt.Errorf("non-greedy operators are not supported")
 		} else {
 			re_runes = append(re_runes, c)
 		}
@@ -480,9 +489,6 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
 					if (c == '*' && outQueueFinalElement.nodetype == KLEENE) || (c == '+' && outQueueFinalElement.nodetype == PLUS) { // You cannot apply a quantifier to a quantifier in this way
 						return nil, fmt.Errorf("illegal use of token '%c'", c)
 					}
-					if c == '?' && slices.Contains([]NodeType{KLEENE, PLUS, QUESTION}, outQueueFinalElement.nodetype) {
-						return nil, fmt.Errorf("non-greedy operators not supported")
-					}
 					opStack = append(opStack, c)
 				}
 			}
@@ -1004,18 +1010,24 @@ func thompson(re []postfixNode) (Reg, error) {
 			if err != nil {
 				return Reg{}, fmt.Errorf("error applying kleene star")
 			}
-			if s1.isEmpty && s1.assert != NONE {
-				return Reg{}, fmt.Errorf("previous token is not quantifiable")
+			stateToAdd, err := kleene(*s1)
+			if err != nil {
+				return Reg{}, err
 			}
-			stateToAdd := kleene(*s1)
 			nfa = append(nfa, stateToAdd)
 		case PLUS: // a+ is equivalent to aa*
 			s1 := mustPop(&nfa)
-			s2 := kleene(*s1)
+			s2, err := kleene(*s1)
+			if err != nil {
+				return Reg{}, err
+			}
 			s1 = concatenate(s1, s2)
 			nfa = append(nfa, s1)
 		case QUESTION: // ab? is equivalent to a(b|)
-			s1 := mustPop(&nfa)
+			s1, err := pop(&nfa)
+			if err != nil {
+				return Reg{}, fmt.Errorf("error applying question operator")
+			}
 			s2 := question(s1)
 			nfa = append(nfa, s2)
 		case PIPE:
@@ -1068,7 +1080,10 @@ func thompson(re []postfixNode) (Reg, error) {
 				stateToAdd = concatenate(stateToAdd, cloneState(state))
 			}
 			if c.endReps == INFINITE_REPS { // Case 3
-				s2 := kleene(*state)
+				s2, err := kleene(*state)
+				if err != nil {
+					return Reg{}, err
+				}
 				stateToAdd = concatenate(stateToAdd, s2)
 			} else { // Case 2
 				for i := c.startReps; i < c.endReps; i++ {