From 44d6a2005c87c2fc63f14c408bcc56dd0f8fc417 Mon Sep 17 00:00:00 2001
From: Aadhavan Srinivasan <aadhavan@twomorecents.org>
Date: Wed, 12 Feb 2025 22:19:30 -0500
Subject: [PATCH 1/7] Started working on unicode character classes

---
 regex/compile.go | 61 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)

diff --git a/regex/compile.go b/regex/compile.go
index 0414ac8..dfee016 100644
--- a/regex/compile.go
+++ b/regex/compile.go
@@ -108,6 +108,48 @@ func getPOSIXClass(str []rune) (bool, string) {
 	return true, rtv
 }
 
+// isUnicodeCharClassLetter reports whether c is one of the letters L, M, S, N, P, C or Z, each of which names a single-letter unicode character class.
+func isUnicodeCharClassLetter(c rune) bool {
+	return slices.Index([]rune("LMSNPCZ"), c) >= 0
+}
+
+// rangeTableToRuneSlice walks the R16 ranges and then the R32 ranges of the given table, stepping through each range by its Stride, and returns the visited code points as a flat rune slice.
+func rangeTableToRuneSlice(rangetable *unicode.RangeTable) []rune {
+	var rtv []rune
+	for _, r := range rangetable.R16 {
+		for c := r.Lo; c < r.Hi; c += r.Stride {
+			rtv = append(rtv, rune(c))
+		}
+	}
+	for _, r := range rangetable.R32 {
+		for c := r.Lo; c < r.Hi; c += r.Stride {
+			rtv = append(rtv, rune(c))
+		}
+	}
+	return rtv
+}
+
+// unicodeCharClassToRange converts the given unicode character class name into a list of characters in that class.
+// The name may be a general category (eg. 'C' or 'Lu') or a script (eg. 'Tamil'). Categories are checked first and
+// scripts second, regardless of the name's length, so that two-letter script names such as 'Yi' are also found.
+// An error is returned if the name is empty or matches neither table.
+func unicodeCharClassToRange(class string) ([]rune, error) {
+	if len(class) == 0 {
+		return nil, fmt.Errorf("empty unicode character class")
+	}
+	if rangeTable, ok := unicode.Categories[class]; ok {
+		return rangeTableToRuneSlice(rangeTable), nil
+	}
+	if rangeTable, ok := unicode.Scripts[class]; ok {
+		return rangeTableToRuneSlice(rangeTable), nil
+	}
+	// Keep the short/long wording of the error so that existing messages stay the same.
+	if len(class) <= 2 {
+		return nil, fmt.Errorf("invalid short unicode character class")
+	}
+	return nil, fmt.Errorf("invalid long unicode character class")
+}
+
 // Stores whether the case-insensitive flag has been enabled.
 var caseInsensitive bool
 
@@ -313,6 +355,25 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
 				} else {
 					return nil, fmt.Errorf("invalid hex value in expression")
 				}
+			} else if re_runes[i] == 'p' || re_runes[i] == 'P' { // Unicode character class (P is negated unicode charclass)
+				re_postfix = append(re_postfix, re_runes[i])
+				i++
+				if i >= len(re_runes) {
+					return nil, fmt.Errorf("error parsing unicode character class in expression")
+				}
+				if re_runes[i] == '{' { // Full name charclass
+					for re_runes[i] != '}' {
+						re_postfix = append(re_postfix, re_runes[i])
+						i++
+					}
+					re_postfix = append(re_postfix, re_runes[i])
+					i++
+				} else if isUnicodeCharClassLetter(re_runes[i]) {
+					re_postfix = append(re_postfix, re_runes[i])
+					i++
+				} else {
+					return nil, fmt.Errorf("error parsing unicode character class in expression")
+				}
 			} else if re_runes[i] == '0' { // Start of octal value
 				numDigits := 1
 				for i+numDigits < len(re_runes) && numDigits < 4 && isOctal(re_runes[i+numDigits]) { // Skip while we see an octal character (max of 4, starting with 0)

From 9cd330e521fe467b3018bab8097aca3f655b9751 Mon Sep 17 00:00:00 2001
From: Aadhavan Srinivasan <aadhavan@twomorecents.org>
Date: Wed, 12 Feb 2025 23:04:10 -0500
Subject: [PATCH 2/7] More work on unicode character class support - fix bug
 where all characters aren't being matched

---
 regex/compile.go | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/regex/compile.go b/regex/compile.go
index dfee016..428df30 100644
--- a/regex/compile.go
+++ b/regex/compile.go
@@ -490,6 +490,45 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
 				} else {
 					return nil, fmt.Errorf("not enough hex characters found in expression")
 				}
+			} else if re_postfix[i] == 'p' || re_postfix[i] == 'P' {
+				charClassInverted := (re_postfix[i] == 'P')
+				i++
+				if isUnicodeCharClassLetter(re_postfix[i]) {
+					chars, err := unicodeCharClassToRange(string(re_postfix[i]))
+					if err != nil {
+						return nil, err
+					}
+					var toAppend postfixNode
+					if re_postfix[i] == 'p' {
+						toAppend = newPostfixNode(chars...)
+					}
+					if re_postfix[i] == 'P' {
+						toAppend = newPostfixDotNode()
+						toAppend.except = append([]postfixNode{}, newPostfixNode(chars...))
+					}
+					outQueue = append(outQueue, toAppend)
+				} else if re_postfix[i] == '{' {
+					i++ // Skip opening bracket
+					unicodeCharClassStr := ""
+					for re_postfix[i] != '}' {
+						unicodeCharClassStr += string(re_postfix[i])
+						i++
+					}
+					chars, err := unicodeCharClassToRange(unicodeCharClassStr)
+					if err != nil {
+						return nil, err
+					}
+					var toAppend postfixNode
+					if !charClassInverted { // \p
+						toAppend = newPostfixNode(chars...)
+					} else { // \P
+						toAppend = newPostfixDotNode()
+						toAppend.except = append([]postfixNode{}, newPostfixNode(chars...))
+					}
+					outQueue = append(outQueue, toAppend)
+				} else {
+					return nil, fmt.Errorf("error parsing unicode character class in expression")
+				}
 			} else if re_postfix[i] == '0' { // Octal value
 				var octVal int64
 				var octValStr string

From d4d606d95bdfcce5002860612be77ba422513ab3 Mon Sep 17 00:00:00 2001
From: Aadhavan Srinivasan <aadhavan@twomorecents.org>
Date: Thu, 13 Feb 2025 08:55:12 -0500
Subject: [PATCH 3/7] Added tests for unicode character classes; more tests for
 hex characters

---
 regex/re_test.go | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/regex/re_test.go b/regex/re_test.go
index 1b717c4..f085761 100644
--- a/regex/re_test.go
+++ b/regex/re_test.go
@@ -430,6 +430,7 @@ var reTests = []struct {
 	{`^(.+)?B`, []ReFlag{RE_CASE_INSENSITIVE}, `ab`, []Group{{0, 2}}},
 	{`\0009`, []ReFlag{RE_CASE_INSENSITIVE}, "\x009", []Group{{0, 2}}},
 	{`\0141`, []ReFlag{RE_CASE_INSENSITIVE}, "A", []Group{{0, 1}}},
+	{`\0141\0141`, []ReFlag{RE_CASE_INSENSITIVE}, "AA", []Group{{0, 2}}},
 
 	{`a[-]?c`, []ReFlag{RE_CASE_INSENSITIVE}, `AC`, []Group{{0, 2}}},
 
@@ -460,8 +461,10 @@ var reTests = []struct {
 	{`[\D5]+`, nil, `1234abc5678`, []Group{{4, 8}}},
 	{`[\da-fA-F]+`, nil, `123abc`, []Group{{0, 6}}},
 	{`\xff`, nil, "\u00ff", []Group{{0, 1}}},
+	{`\xff+`, nil, "\u00ff\u00ff", []Group{{0, 2}}},
 	{`\xFF`, nil, "\u00ff", []Group{{0, 1}}},
 	{`\x00ff`, nil, "\u00ff", []Group{}},
+	{`\x{0000ff}+`, nil, "\u00ff\u00ff", []Group{{0, 2}}},
 	{`\x{0000ff}`, nil, "\u00ff", []Group{{0, 1}}},
 	{`\x{0000FF}`, nil, "\u00ff", []Group{{0, 1}}},
 	{"\t\n\v\r\f\a", nil, "\t\n\v\r\f\a", []Group{{0, 6}}},
@@ -516,6 +519,14 @@ var reTests = []struct {
 	{`<389-400`, nil, `-`, nil},
 	{`<389-400>`, nil, `391`, []Group{{0, 3}}},
 	{`\b<1-10000>\b`, nil, `America declared independence in 1776.`, []Group{{33, 37}}},
+
+	{`\p{Tamil}+`, nil, `உயிரெழுத்து`, []Group{{0, 11}}}, // Each letter and matra is counted as a separate rune, so 'u', 'ya', 'e (matra)', 'ra', 'e (matra)', 'zha', 'oo (matra)', 'tha', 'ith', 'tha', 'oo (matra)'.
+	{`\P{Tamil}+`, nil, `vowel=உயிரெழுத்து`, []Group{{0, 6}}},
+	{`\P`, nil, `உயிரெழுத்து`, nil},
+	{`\PM\pM*`, nil, `உயிரெழுத்து`, []Group{{0, 1}, {1, 3}, {3, 5}, {5, 7}, {7, 9}, {9, 11}}},
+	{`\pN+`, nil, `123abc456def`, []Group{{0, 3}, {6, 9}}},
+	{`\PN+`, nil, `123abc456def`, []Group{{3, 6}, {9, 12}}},
+	{`[\p{Greek}\p{Cyrillic}]`, nil, `ΣωШД`, []Group{{0, 1}, {1, 2}, {2, 3}, {3, 4}}},
 }
 
 var groupTests = []struct {

From 70457118603d7ed762f9ff3cccdf3365d3621e3c Mon Sep 17 00:00:00 2001
From: Aadhavan Srinivasan <aadhavan@twomorecents.org>
Date: Thu, 13 Feb 2025 08:55:41 -0500
Subject: [PATCH 4/7] Convert test_str into a rune slice for better unicode
 compatibility, it also fixed the bug where all unicode characters wouldn't be
 colored

---
 cmd/main.go | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/cmd/main.go b/cmd/main.go
index 82c5748..06b9a58 100644
--- a/cmd/main.go
+++ b/cmd/main.go
@@ -129,6 +129,8 @@ func main() {
 			matchIndices = regComp.FindAllSubmatch(test_str)
 		}
 
+		test_str_runes := []rune(test_str) // Converting to runes preserves unicode characters
+
 		if *printMatchesFlag {
 			// if we are in single line mode, print the line on which
 			// the matches occur
@@ -158,10 +160,10 @@ func main() {
 			oldIndices := indicesToPrint.values()
 			indicesToPrint = new_uniq_arr[int]()
 			// Explanation:
-			// Find all numbers from 0 to len(test_str) that are NOT in oldIndices.
+			// Find all numbers from 0 to len(test_str_runes) that are NOT in oldIndices.
 			// These are the values we want to print, now that we have inverted the match.
 			// Re-initialize indicesToPrint and add all of these values to it.
-			indicesToPrint.add(setDifference(genRange(0, len(test_str)), oldIndices)...)
+			indicesToPrint.add(setDifference(genRange(0, len(test_str_runes)), oldIndices)...)
 
 		}
 		// If lineFlag is enabled, we should only print something if:
@@ -182,7 +184,7 @@ func main() {
 		//			the corresponding end index.
 		// 		3. If not, just print the character.
 		if substituteFlagEnabled {
-			for i := range test_str {
+			for i := range test_str_runes {
 				inMatchIndex := false
 				for _, m := range matchIndices {
 					if i == m[0].StartIdx {
@@ -193,11 +195,11 @@ func main() {
 					}
 				}
 				if !inMatchIndex {
-					fmt.Fprintf(out, "%c", test_str[i])
+					fmt.Fprintf(out, "%c", test_str_runes[i])
 				}
 			}
 		} else {
-			for i, c := range test_str {
+			for i, c := range test_str_runes {
 				if indicesToPrint.contains(i) {
 					color.New(color.FgRed).Fprintf(out, "%c", c)
 					// Newline after every match - only if -o is enabled and -v is disabled.

From fde3784e5a4a6525ae6ecec2fc945d34d3396f36 Mon Sep 17 00:00:00 2001
From: Aadhavan Srinivasan <aadhavan@twomorecents.org>
Date: Thu, 13 Feb 2025 08:58:02 -0500
Subject: [PATCH 5/7] Added unicode charclass support within character classes;
 Fixed bugs with hex classes and unicode classes

---
 regex/compile.go | 50 ++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 44 insertions(+), 6 deletions(-)

diff --git a/regex/compile.go b/regex/compile.go
index 428df30..62dbfb9 100644
--- a/regex/compile.go
+++ b/regex/compile.go
@@ -117,12 +117,12 @@ func isUnicodeCharClassLetter(c rune) bool {
 func rangeTableToRuneSlice(rangetable *unicode.RangeTable) []rune {
 	var rtv []rune
 	for _, r := range rangetable.R16 {
-		for c := r.Lo; c < r.Hi; c += r.Stride {
+		for c := r.Lo; c <= r.Hi; c += r.Stride {
 			rtv = append(rtv, rune(c))
 		}
 	}
 	for _, r := range rangetable.R32 {
-		for c := r.Lo; c < r.Hi; c += r.Stride {
+		for c := r.Lo; c <= r.Hi; c += r.Stride {
 			rtv = append(rtv, rune(c))
 		}
 	}
@@ -351,7 +351,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
 					}
 				} else if isHex(re_runes[i]) {
 					re_postfix = append(re_postfix, re_runes[i:i+2]...)
-					i += 2
+					i += 1 // I don't skip forward 2 steps, because the second step will happen with the loop increment
 				} else {
 					return nil, fmt.Errorf("invalid hex value in expression")
 				}
@@ -374,6 +374,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
 				} else {
 					return nil, fmt.Errorf("error parsing unicode character class in expression")
 				}
+				i-- // The loop increment at the top will move us forward
 			} else if re_runes[i] == '0' { // Start of octal value
 				numDigits := 1
 				for i+numDigits < len(re_runes) && numDigits < 4 && isOctal(re_runes[i+numDigits]) { // Skip while we see an octal character (max of 4, starting with 0)
@@ -499,10 +500,9 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
 						return nil, err
 					}
 					var toAppend postfixNode
-					if re_postfix[i] == 'p' {
+					if !charClassInverted {
 						toAppend = newPostfixNode(chars...)
-					}
-					if re_postfix[i] == 'P' {
+					} else {
 						toAppend = newPostfixDotNode()
 						toAppend.except = append([]postfixNode{}, newPostfixNode(chars...))
 					}
@@ -711,7 +711,45 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
 						} else {
 							return nil, fmt.Errorf("not enough hex characters found in character class")
 						}
+					} else if re_postfix[i] == 'p' || re_postfix[i] == 'P' {
+						charClassInverted := (re_postfix[i] == 'P')
+						i++
+						if isUnicodeCharClassLetter(re_postfix[i]) {
+							charsInList, err := unicodeCharClassToRange(string(re_postfix[i]))
+							if err != nil {
+								return nil, err
+							}
+							if !charClassInverted {
+								chars = append(chars, newPostfixNode(charsInList...))
+							} else {
+								toAppend := newPostfixDotNode()
+								toAppend.except = append([]postfixNode{}, newPostfixNode(charsInList...))
+								chars = append(chars, toAppend)
+							}
+						} else if re_postfix[i] == '{' {
+							i++ // Skip opening bracket
+							unicodeCharClassStr := ""
+							for re_postfix[i] != '}' {
+								unicodeCharClassStr += string(re_postfix[i])
+								i++
+							}
+							charsInList, err := unicodeCharClassToRange(unicodeCharClassStr)
+							if err != nil {
+								return nil, err
+							}
+							if !charClassInverted {
+								chars = append(chars, newPostfixNode(charsInList...))
+							} else {
+								toAppend := newPostfixDotNode()
+								toAppend.except = append([]postfixNode{}, newPostfixNode(charsInList...))
+								chars = append(chars, toAppend)
+							}
+						} else {
+							return nil, fmt.Errorf("error parsing unicode character class in expression")
+						}
+
 					} else if re_postfix[i] == '0' { // Octal value
+
 						var octVal int64
 						var octValStr string
 						numDigitsParsed := 0

From 1a890a1e75aa5212c4625efea60e6b21dd54c098 Mon Sep 17 00:00:00 2001
From: Aadhavan Srinivasan <aadhavan@twomorecents.org>
Date: Thu, 13 Feb 2025 09:10:40 -0500
Subject: [PATCH 6/7] Refactoring - remove duplicate code

---
 regex/compile.go | 60 ++++++++++++++++++++----------------------------
 1 file changed, 25 insertions(+), 35 deletions(-)

diff --git a/regex/compile.go b/regex/compile.go
index 62dbfb9..904bf26 100644
--- a/regex/compile.go
+++ b/regex/compile.go
@@ -493,20 +493,14 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
 				}
 			} else if re_postfix[i] == 'p' || re_postfix[i] == 'P' {
 				charClassInverted := (re_postfix[i] == 'P')
+				charsInClass := []rune{}
 				i++
 				if isUnicodeCharClassLetter(re_postfix[i]) {
-					chars, err := unicodeCharClassToRange(string(re_postfix[i]))
+					var err error
+					charsInClass, err = unicodeCharClassToRange(string(re_postfix[i]))
 					if err != nil {
 						return nil, err
 					}
-					var toAppend postfixNode
-					if !charClassInverted {
-						toAppend = newPostfixNode(chars...)
-					} else {
-						toAppend = newPostfixDotNode()
-						toAppend.except = append([]postfixNode{}, newPostfixNode(chars...))
-					}
-					outQueue = append(outQueue, toAppend)
 				} else if re_postfix[i] == '{' {
 					i++ // Skip opening bracket
 					unicodeCharClassStr := ""
@@ -514,21 +508,22 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
 						unicodeCharClassStr += string(re_postfix[i])
 						i++
 					}
-					chars, err := unicodeCharClassToRange(unicodeCharClassStr)
+					var err error
+					charsInClass, err = unicodeCharClassToRange(unicodeCharClassStr)
 					if err != nil {
 						return nil, err
 					}
-					var toAppend postfixNode
-					if !charClassInverted { // \p
-						toAppend = newPostfixNode(chars...)
-					} else { // \P
-						toAppend = newPostfixDotNode()
-						toAppend.except = append([]postfixNode{}, newPostfixNode(chars...))
-					}
-					outQueue = append(outQueue, toAppend)
 				} else {
 					return nil, fmt.Errorf("error parsing unicode character class in expression")
 				}
+				var toAppend postfixNode
+				if !charClassInverted { // \p
+					toAppend = newPostfixNode(charsInClass...)
+				} else { // \P
+					toAppend = newPostfixDotNode()
+					toAppend.except = append([]postfixNode{}, newPostfixNode(charsInClass...))
+				}
+				outQueue = append(outQueue, toAppend)
 			} else if re_postfix[i] == '0' { // Octal value
 				var octVal int64
 				var octValStr string
@@ -713,19 +708,14 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
 						}
 					} else if re_postfix[i] == 'p' || re_postfix[i] == 'P' {
 						charClassInverted := (re_postfix[i] == 'P')
+						charsInList := []rune{}
 						i++
 						if isUnicodeCharClassLetter(re_postfix[i]) {
-							charsInList, err := unicodeCharClassToRange(string(re_postfix[i]))
+							var err error
+							charsInList, err = unicodeCharClassToRange(string(re_postfix[i]))
 							if err != nil {
 								return nil, err
 							}
-							if !charClassInverted {
-								chars = append(chars, newPostfixNode(charsInList...))
-							} else {
-								toAppend := newPostfixDotNode()
-								toAppend.except = append([]postfixNode{}, newPostfixNode(charsInList...))
-								chars = append(chars, toAppend)
-							}
 						} else if re_postfix[i] == '{' {
 							i++ // Skip opening bracket
 							unicodeCharClassStr := ""
@@ -733,21 +723,21 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
 								unicodeCharClassStr += string(re_postfix[i])
 								i++
 							}
-							charsInList, err := unicodeCharClassToRange(unicodeCharClassStr)
+							var err error
+							charsInList, err = unicodeCharClassToRange(unicodeCharClassStr)
 							if err != nil {
 								return nil, err
 							}
-							if !charClassInverted {
-								chars = append(chars, newPostfixNode(charsInList...))
-							} else {
-								toAppend := newPostfixDotNode()
-								toAppend.except = append([]postfixNode{}, newPostfixNode(charsInList...))
-								chars = append(chars, toAppend)
-							}
 						} else {
 							return nil, fmt.Errorf("error parsing unicode character class in expression")
 						}
-
+						if !charClassInverted {
+							chars = append(chars, newPostfixNode(charsInList...))
+						} else {
+							toAppend := newPostfixDotNode()
+							toAppend.except = append([]postfixNode{}, newPostfixNode(charsInList...))
+							chars = append(chars, toAppend)
+						}
 					} else if re_postfix[i] == '0' { // Octal value
 
 						var octVal int64

From 46bc0c85296181362b214cfb44afe615f8aa2f8a Mon Sep 17 00:00:00 2001
From: Aadhavan Srinivasan <aadhavan@twomorecents.org>
Date: Thu, 13 Feb 2025 10:48:23 -0500
Subject: [PATCH 7/7] Removed unicode character classes from 'features not
 supported' list

---
 regex/doc.go | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/regex/doc.go b/regex/doc.go
index b35c02f..c77faf6 100644
--- a/regex/doc.go
+++ b/regex/doc.go
@@ -153,9 +153,8 @@ returns the 0-group.
 The following features from [regexp] are (currently) NOT supported:
  1. Named capturing groups
  2. Non-greedy operators
- 3. Unicode character classes
- 4. Embedded flags (flags are instead passed as arguments to [Compile])
- 5. Literal text with \Q ... \E
+ 3. Embedded flags (flags are instead passed as arguments to [Compile])
+ 4. Literal text with \Q ... \E
 
 The following features are not available in [regexp], but are supported in my engine:
  1. Lookarounds