Removed old comment

Implemented character range detection later in the code, using a metacharacter
Modified genRange() so that it can work on ints and runes
2025-01-22 20:27:35 -05:00 · 2025-01-22 20:26:58 -05:00 · 2025-01-22 20:25:49 -05:00
2 changed files with 47 additions and 5 deletions
--- a/compile.go
+++ b/compile.go
@@ -154,6 +154,9 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
 			for re_runes[i] != ']' || i == 0 || re_runes[i-1] == '\\' {
 				i++ // Skip all characters inside _unescaped_ brackets (we are _not_ at a closing bracket, or if we are, the previous character is a backslash)
 				// TODO: Check for escaped characters
 				if re_runes[i] == '-' && i > 0 && re_runes[i-1] != '\\' { // Unescaped hyphen - replace with CHAR_RANGE. This metacharacter will be used later on to construct the range
 					re_runes[i] = CHAR_RANGE
 				}
 				toAppend = append(toAppend, re_runes[i])
 			}
@@ -405,7 +408,8 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
 			}
 		}
 		if c == LBRACKET { // Used for character classes
-			i++ // Step forward so we can look at the character class
+			endOfRange := false // Set to 'true' when we encounter a CHAR_RANGE metacharacter
 			i++                 // Step forward so we can look at the character class
 			var invertMatch bool
 			if re_postfix[i] == '^' {
 				invertMatch = true
@@ -416,6 +420,11 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
 				if re_postfix[i] == RBRACKET {
 					break
 				}
 				if re_postfix[i] == CHAR_RANGE {
 					endOfRange = true
 					i++
 					continue
 				}
 				if re_postfix[i] == '\\' { // Backslash indicates a character to be escaped
 					if i == len(re_postfix)-1 {
 						return nil, fmt.Errorf("Stray backslash in character class.")
@@ -471,10 +480,38 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
 					chars = append(chars, newPostfixCharNode(re_postfix[i]))
 					i++
 				}
 				if endOfRange { // The previous character was an unescaped hyphen, which (in the context of a character class) means the character that was last appended is the end of a character range
 					// Things to note:
 					// 	1. In PCRE and Go's regex engine, a letter _can_ be surrounded by hyphens in a character class.
 					// 		Eg. [a-b-c]
 					// 		While you might think this leads to a syntax error (I thought so), the engine picks 'a-b' as a range,
 					// 		then treats the second '-' and 'c' as regular characters in the character class.
 					// 		So this regex becomes "Match a character from 'a' to 'b', a literal hyphen, or 'c' ".
 					//  2. To account for this, the following logic is followed:
 					// 		a. If the second-to-last postfixNode ie. the start of the range has only one element, then we are in a range.
 					// 			i. If it has more than one element, then we are actually looking at a literal hyphen, and we will treat it as such.
 					// 		b. The last postfixNode added to 'chars' _must_ only have one character (because it's the end of the range).
 					endRangePostfixNode := mustPop(&chars)
 					startRangePostfixNode := mustPop(&chars)
 					if len(endRangePostfixNode.contents) != 1 {
 						return nil, fmt.Errorf("Error parsing character range.")
 					} else if len(startRangePostfixNode.contents) != 1 { // This is actually a regular hyphen
 						chars = append(chars, startRangePostfixNode, newPostfixCharNode('-'), endRangePostfixNode)
 					} else {
 						// We have established that they both have a length of 1
 						startRangeRune := startRangePostfixNode.contents[0]
 						endRangeRune := endRangePostfixNode.contents[0]
 						chars = append(chars, newPostfixCharNode(genRange(startRangeRune, endRangeRune+1)...))
 					}
 					endOfRange = false // Reset the flag
 				}
 			}
 			if i == len(re_postfix) { // We have reached the end of the string, so we didn't encounter a closing bracket. Return an error.
 				return nil, fmt.Errorf("Opening bracket without closing bracket.")
 			}
 			outQueue = append(outQueue, newCharClassNode(chars, invertMatch))
 			continue
 		}
@@ -666,8 +703,7 @@ func thompson(re []postfixNode) (Reg, error) {
 			// Replace ESC_BACKSLASH with actual backslash, so that we can actually check if we encounter it
 			replaceByValue([]int(state.content), int(ESC_BACKSLASH), '\\')
-			// Uncommenting this seems to make one of the test cases fail. Why?
+			replaceByValue(state.except, ESC_BACKSLASH, '\\')
 			//			replaceByValue(state.except, ESC_BACKSLASH, '\\')
 			nfa = append(nfa, &state)
 		}
--- a/misc.go
+++ b/misc.go
@@ -15,6 +15,12 @@ var LPAREN_CHAR rune = 0xF0004 // Parentheses in regex are concatenated with thi
 var RPAREN_CHAR rune = 0xF0005
 var NONCAPLPAREN_CHAR rune = 0xF0006 // Represents a non-capturing group's LPAREN
 var ESC_BACKSLASH rune = 0xF0007     // Represents an escaped backslash
 var CHAR_RANGE rune = 0xF0008        // Represents a character range
 // character is a type constraint satisfied by exactly int and rune.
 // Note that the two are distinct types: rune is an alias for int32, not int.
 // The constraint lets generic helpers such as genRange accept either one.
 type character interface {
 	int | rune
 }
 // Returns true if str[idx] and str[idx-1] are separated by a word boundary.
 func isWordBoundary(str []rune, idx int) bool {
@@ -109,8 +115,8 @@ func Reduce[T any](slc []T, fn func(T, T) T) T {
 }
 // Generate numbers in a range - start (inclusive) to end (exclusive)
-func genRange(start, end int) []int {
+func genRange[T character](start, end T) []T {
-	toRet := make([]int, end-start)
+	toRet := make([]T, end-start)
 	for i := start; i < end; i++ {
 		toRet[i-start] = i
 	}
Author	SHA1	Message	Date
Aadhavan Srinivasan	0bd7a87797	Removed old comment	2025-01-22 20:27:35 -05:00
Aadhavan Srinivasan	9cf1c66653	Implemented character range detection later in the code, using a metacharacter	2025-01-22 20:26:58 -05:00
Aadhavan Srinivasan	9edc99d73c	Modified genRange() so that it can work on ints and runes	2025-01-22 20:25:49 -05:00