Enforce the rule that character classes must have at least one character; interpret literal closing brackets as regular characters

Refactored isNormalChar(), wrote function to get special characters that have metachar replacements
Added more test cases (1 failing)
2025-01-24 15:50:36 -05:00 · 2025-01-24 15:49:33 -05:00 · 2025-01-24 14:58:18 -05:00 · 2025-01-24 14:58:07 -05:00 · 2025-01-24 14:57:47 -05:00
4 changed files with 103 additions and 17 deletions
--- a/compile.go
+++ b/compile.go
@@ -153,8 +153,12 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
 			}
 			for re_runes[i] != ']' || i == 0 || re_runes[i-1] == '\\' {
 				i++ // Skip all characters inside _unescaped_ brackets (we are _not_ at a closing bracket, or if we are, the previous character is a backslash)
-				// TODO: Check for escaped characters
+				// Make sure we haven't exceeded the length of the string. If we did, then the regex doesn't actually have a closing bracket and we should throw an error.
-				if re_runes[i] == '-' && i > 0 && re_runes[i-1] != '\\' { // Unescaped hyphen - replace with CHAR_RANGE. This metacharacter will be used later on to construct the range
+				if i >= len(re_runes) {
 					return nil, fmt.Errorf("Opening bracket without closing bracket.")
 				}
 				if re_runes[i] == '-' && (i > 0 && re_runes[i-1] != '\\') && (i < len(re_runes)-1 && re_runes[i+1] != ']') { // Unescaped hyphen, that has some character (not a RBRACKET) after it - This represents a character range, so we replace with CHAR_RANGE. This metacharacter will be used later on to construct the range
 					re_runes[i] = CHAR_RANGE
 				}
@@ -268,7 +272,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
 		6. If current character is '{', find the appropriate numeric specifier (range start, range end). Apply the range to the postfixNode at the end of outQueue.
 		*/
 		c := re_postfix[i]
-		if isNormalChar(c) {
+		if isNormalChar(c) || isSpecialCharWithMetacharReplacement(c) {
 			if caseInsensitive {
 				outQueue = append(outQueue, newPostfixNode(allCases(c)...))
 			} else {
@@ -276,7 +280,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
 			}
 			continue
 		}
-		// Escape character
+
 		if c == '\\' { // Escape character - invert special and non-special characters eg. \( is treated as a literal parentheses, \b is treated as word boundary
 			if i == len(re_postfix)-1 { // End of string - panic, because backslash is an escape character (something needs to come after it)
 				return nil, fmt.Errorf("ERROR: Backslash with no escape character.")
@@ -408,8 +412,9 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
 			}
 		}
 		if c == LBRACKET { // Used for character classes
-			endOfRange := false // Set to 'true' when we encounter a CHAR_RANGE metacharacter
+			firstCharAdded := false // A character class must have at least 1 character. This flag checks if the first character has been added.
-			i++                 // Step forward so we can look at the character class
+			endOfRange := false     // Set to 'true' when we encounter a CHAR_RANGE metacharacter
 			i++                     // Step forward so we can look at the character class
 			var invertMatch bool
 			if re_postfix[i] == '^' {
 				invertMatch = true
@@ -417,7 +422,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
 			}
 			chars := make([]postfixNode, 0) // List of nodes - used only for character classes
 			for i < len(re_postfix) {
-				if re_postfix[i] == RBRACKET {
+				if firstCharAdded && re_postfix[i] == RBRACKET {
 					break
 				}
 				if re_postfix[i] == CHAR_RANGE {
@@ -477,9 +482,20 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
 						i++
 					}
 				} else {
 					if !firstCharAdded && re_postfix[i] > 0xF0000 { // It's a metacharacter that I defined, I'll have to convert it back to the regular character before adding it back, because I haven't added any characters yet. For example, '[[]', the second LBRACKET should be treated like a literal bracket.
 						switch re_postfix[i] {
 						case LBRACKET:
 							chars = append(chars, newPostfixCharNode('['))
 						case RBRACKET:
 							chars = append(chars, newPostfixCharNode(']'))
 						default:
 							return nil, fmt.Errorf("Error parsing high-range unicode value in character class.")
 						}
 					}
 					chars = append(chars, newPostfixCharNode(re_postfix[i]))
 					i++
 				}
 				firstCharAdded = true
 				if endOfRange { // The previous character was an unescaped hyphen, which (in the context of a character class) means the character that was last appended is the end of a character range
 					// Things to note:
@@ -491,13 +507,15 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
 					//  2. To account for this, the following logic is followed:
 					// 		a. If the second-to-last postfixNode ie. the start of the range has only one element, then we are in a range.
 					// 			i. If it has more than one element, then we are actually looking at a literal hyphen, and we will treat it as such.
 					// 		   ii. If either the start or end of the range don't exist in 'chars' ie. something like [-a] or [a-], then too will we treat it as a literal hyphen.
 					// 		b. The last postfixNode added to 'chars' _must_ only have one character (because it's the end of the range).
-					endRangePostfixNode := mustPop(&chars)
+					endRangePostfixNode, err1 := pop(&chars)
-					startRangePostfixNode := mustPop(&chars)
+					startRangePostfixNode, err2 := pop(&chars)
-					if len(endRangePostfixNode.contents) != 1 {
+
-						return nil, fmt.Errorf("Error parsing character range.")
+					if (err1 != nil || err2 != nil) || len(startRangePostfixNode.contents) != 1 { // Treat it as a regular hyphen
 					} else if len(startRangePostfixNode.contents) != 1 { // This is actually a regular hyphen
 						chars = append(chars, startRangePostfixNode, newPostfixCharNode('-'), endRangePostfixNode)
 					} else if len(endRangePostfixNode.contents) != 1 { // I don't even know what this would look like, this is just a sanity check
 						return nil, fmt.Errorf("Error parsing character range.")
 					} else {
 						// We have established that they both have a length of 1
 						startRangeRune := startRangePostfixNode.contents[0]
--- a/misc.go
+++ b/misc.go
@@ -17,6 +17,8 @@ var NONCAPLPAREN_CHAR rune = 0xF0006 // Represents a non-capturing group's LPARE
 var ESC_BACKSLASH rune = 0xF0007     // Represents an escaped backslash
 var CHAR_RANGE rune = 0xF0008        // Represents a character range
 var specialChars = []rune{'?', '*', '\\', '^', '$', '{', '}', '(', ')', '[', ']', '+', '|', '.', '~', '<', '>', LBRACKET, RBRACKET, NONCAPLPAREN_CHAR}
 // An interface for int and rune, which are identical
 type character interface {
 	int | rune
@@ -32,9 +34,17 @@ func isWordBoundary(str []rune, idx int) bool {
 	return wbounded
 }
 func isSpecialChar(c rune) bool {
 	return slices.Contains(specialChars, c)
 }
 // Some special characters have metacharacter replacements. These characters, when encountered in their literal form, can be treated as regular characters.
 func isSpecialCharWithMetacharReplacement(c rune) bool {
 	return slices.Contains([]rune{'[', ']'}, c)
 }
 func isNormalChar(c rune) bool {
 	specialChars := []rune(`?*\^${}()+|[].~<>`)
 	specialChars = append(specialChars, LBRACKET, RBRACKET, NONCAPLPAREN_CHAR)
 	return !slices.Contains(specialChars, c)
 }
--- a/postfixNode.go
+++ b/postfixNode.go
@@ -116,6 +116,13 @@ func newEscapedNode(c rune, inCharClass bool) (postfixNode, error) {
 	case 'v': // Vertical tab
 		toReturn.nodetype = CHARACTER
 		toReturn.contents = append(toReturn.contents, rune(11))
 	case '-': // Literal hyphen - only in character class
 		if inCharClass {
 			toReturn.nodetype = CHARACTER
 			toReturn.contents = append(toReturn.contents, '-')
 		} else {
 			return postfixNode{}, fmt.Errorf("Invalid escape character.")
 		}
 	default: // None of the above - append it as a regular character
 		if isNormalChar(c) { // Normal characters cannot be escaped
 			return postfixNode{}, fmt.Errorf("Invalid escape character.")
--- a/re_test.go
+++ b/re_test.go
@@ -1,6 +1,7 @@
 package main
 import (
 	"fmt"
 	"slices"
 	"testing"
 )
@@ -185,6 +186,56 @@ var reTests = []struct {
 	{`a.*b`, nil, "acc\nccb", []Group{}},
 	{`a.{4,5}b`, nil, "acc\nccb", []Group{}},
 	{`a.b`, nil, "a\rb", []Group{{0, 3}}},
 	{`a.b`, []ReFlag{RE_MULTILINE}, "a\nb", []Group{{0, 3}}},
 	{`a.*b`, []ReFlag{RE_MULTILINE}, "acc\nccb", []Group{{0, 7}}},
 	{`a.{4,5}b`, []ReFlag{RE_MULTILINE}, "acc\nccb", []Group{{0, 7}}},
 	{`)`, nil, ``, nil},
 	{`^$`, nil, ``, []Group{{0, 0}}},
 	{`abc`, nil, `abc`, []Group{{0, 3}}},
 	{`abc`, nil, `xbc`, []Group{}},
 	{`abc`, nil, `axc`, []Group{}},
 	{`abc`, nil, `abx`, []Group{}},
 	{`abc`, nil, `xabcy`, []Group{{1, 4}}},
 	{`abc`, nil, `ababc`, []Group{{2, 5}}},
 	{`ab*c`, nil, `abc`, []Group{{0, 3}}},
 	{`ab*bc`, nil, `abc`, []Group{{0, 3}}},
 	{`ab*bc`, nil, `abbc`, []Group{{0, 4}}},
 	{`ab*bc`, nil, `abbbbc`, []Group{{0, 6}}},
 	{`ab+bc`, nil, `abbc`, []Group{{0, 4}}},
 	{`ab+bc`, nil, `abc`, []Group{}},
 	{`ab+bc`, nil, `abq`, []Group{}},
 	{`ab+bc`, nil, `abbbbc`, []Group{{0, 6}}},
 	{`ab?bc`, nil, `abbc`, []Group{{0, 4}}},
 	{`ab?bc`, nil, `abc`, []Group{{0, 3}}},
 	{`ab?bc`, nil, `abbbbc`, []Group{}},
 	{`ab?c`, nil, `abc`, []Group{{0, 3}}},
 	{`^abc$`, nil, `abc`, []Group{{0, 3}}},
 	{`^abc$`, nil, `abcc`, []Group{}},
 	{`^abc`, nil, `abcc`, []Group{{0, 3}}},
 	{`^abc$`, nil, `aabc`, []Group{}},
 	{`abc$`, nil, `aabc`, []Group{{1, 4}}},
 	{`^`, nil, `abc`, []Group{{0, 0}}},
 	{`$`, nil, `abc`, []Group{{3, 3}}},
 	{`a.c`, nil, `abc`, []Group{{0, 3}}},
 	{`a.c`, nil, `axc`, []Group{{0, 3}}},
 	{`a.*c`, nil, `axyzc`, []Group{{0, 5}}},
 	{`a.*c`, nil, `axyzd`, []Group{}},
 	{`a[bc]d`, nil, `abc`, []Group{}},
 	{`a[bc]d`, nil, `abd`, []Group{{0, 3}}},
 	{`a[b-d]e`, nil, `abd`, []Group{}},
 	{`a[b-d]e`, nil, `ace`, []Group{{0, 3}}},
 	{`a[b-d]`, nil, `aac`, []Group{{1, 3}}},
 	{`a[-b]`, nil, `a-`, []Group{{0, 2}}}, // If a character class has a hyphen without a start or end character, it is treated as a literal hyphen
 	{`a[\-b]`, nil, `a-`, []Group{{0, 2}}},
 	{`a[b-]`, nil, `a-`, []Group{{0, 2}}}, // If a character class has a hyphen without a start or end character, it is treated as a literal hyphen
 	{`a[]b`, nil, `-`, nil},
 	{`a[`, nil, `-`, nil},
 	{`a\`, nil, `-`, nil},
 	{`abc)`, nil, `-`, nil},
 	{`(abc`, nil, `-`, nil},
 	{`a]`, nil, `a]`, []Group{{0, 2}}},
 	// Todo - add numeric range tests
 }
@@ -223,7 +274,7 @@ func TestFindAllMatches(t *testing.T) {
 			regComp, err := Compile(test.re, test.flags...)
 			if err != nil {
 				if test.result != nil {
-					panic(err)
+					panic(fmt.Errorf("Test Error: %v", err))
 				}
 			} else {
 				matchIndices := FindAllMatches(regComp, test.str)
@@ -242,7 +293,7 @@ func TestFindAllMatches(t *testing.T) {
 func TestFindString(t *testing.T) {
 	for _, test := range reTests {
 		t.Run(test.re+"	"+test.str, func(t *testing.T) {
-			regComp, err := Compile(test.re)
+			regComp, err := Compile(test.re, test.flags...)
 			if err != nil {
 				if test.result != nil {
 					panic(err)
@@ -267,7 +318,7 @@ func TestFindString(t *testing.T) {
 func TestFindAllGroups(t *testing.T) {
 	for _, test := range groupTests {
 		t.Run(test.re+"	"+test.str, func(t *testing.T) {
-			regComp, err := Compile(test.re)
+			regComp, err := Compile(test.re, test.flags...)
 			if err != nil {
 				if test.result != nil {
 					panic(err)
Author	SHA1	Message	Date
Aadhavan Srinivasan	ccb82f781b	Enforce the rule that character classes must have at least one character; interpret literal closing brackets as regular characters	2025-01-24 15:50:36 -05:00
Aadhavan Srinivasan	09bbf8d3f1	Refactored isNormalChar(), wrote function to get special characters that have metachar replacements	2025-01-24 15:49:33 -05:00
Aadhavan Srinivasan	d5b4450e50	Added more test cases (1 failing)	2025-01-24 14:58:18 -05:00
Aadhavan Srinivasan	45827b5dd3	Allow hyphen to be escaped inside character class	2025-01-24 14:58:07 -05:00
Aadhavan Srinivasan	c26edcb0c4	Fixed edge cases with character ranges and character classes	2025-01-24 14:57:47 -05:00