From 8a1f1dc621e7f4228ebf60be0571f08551f0398e Mon Sep 17 00:00:00 2001
From: Aadhavan Srinivasan <aadhavan@twomorecents.org>
Date: Mon, 18 Nov 2024 10:41:50 -0500
Subject: [PATCH] Added unicode support

Replaced string parameters with rune slices ([]rune), so that indexing
and length checks operate on Unicode code points instead of bytes.
---
 main.go     | 2 +-
 matching.go | 4 ++--
 misc.go     | 2 +-
 nfa.go      | 6 +++---
 re_test.go  | 2 +-
 5 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/main.go b/main.go
index c4f9dff..ea47319 100644
--- a/main.go
+++ b/main.go
@@ -406,7 +406,7 @@ func main() {
 	startState := thompson(re_postfix)
 	// Read every string from stdin until we encounter an error. If the error isn't EOF, panic.'
 	for test_str, err = reader.ReadString('\n'); err == nil; test_str, err = reader.ReadString('\n') {
-		matchIndices := findAllMatches(startState, test_str)
+		matchIndices := findAllMatches(startState, []rune(test_str))
 		// Decompose the array of matchIndex structs into a flat unique array of ints - if matchIndex is {4,7}, flat array will contain 4,5,6
 		// This should make checking O(1) instead of O(n)
 		indicesToPrint := new_uniq_arr[int]()
diff --git a/matching.go b/matching.go
index 8cbfb60..27313f5 100644
--- a/matching.go
+++ b/matching.go
@@ -81,7 +81,7 @@ func pruneIndices(indices []MatchIndex) []MatchIndex {
 
 // findAllMatches tries to find all matches of the regex represented by given start-state, with
 // the given string
-func findAllMatches(start *State, str string) []MatchIndex {
+func findAllMatches(start *State, str []rune) []MatchIndex {
 	idx := 0
 	var matchFound bool
 	var matchIdx MatchIndex
@@ -104,7 +104,7 @@ func findAllMatches(start *State, str string) []MatchIndex {
 // the next search should start from.
 //
 //	Might return duplicates or overlapping indices, so care must be taken to prune the resulting array.
-func findAllMatchesHelper(start *State, str string, offset int) (bool, MatchIndex, int) {
+func findAllMatchesHelper(start *State, str []rune, offset int) (bool, MatchIndex, int) {
 	// Base case - exit if offset exceeds string's length
 	if offset > len(str) {
 		// The first value here shouldn't be used, because we should exit when the second return value is > than len(str)
diff --git a/misc.go b/misc.go
index 282d65c..f3c8fd5 100644
--- a/misc.go
+++ b/misc.go
@@ -22,7 +22,7 @@ func dotChars() []rune { // Returns all possible characters represented by the d
 }
 
 // Returns true if str[idx] and str[idx-1] are separated by a word boundary.
-func isWordBoundary(str string, idx int) bool {
+func isWordBoundary(str []rune, idx int) bool {
 	str_runes := []rune(str)
 	wbounded := idx == 0 ||
 		idx >= len(str) ||
diff --git a/nfa.go b/nfa.go
index dd2a2b6..b472020 100644
--- a/nfa.go
+++ b/nfa.go
@@ -73,7 +73,7 @@ func cloneStateHelper(state *State, cloneMap map[*State]*State) *State {
 
 // Checks if the given state's assertion is true. Returns true if the given
 // state doesn't have an assertion.
-func (s State) checkAssertion(str string, idx int) bool {
+func (s State) checkAssertion(str []rune, idx int) bool {
 	if s.assert == SOS {
 		return idx == 0
 	}
@@ -90,7 +90,7 @@ func (s State) checkAssertion(str string, idx int) bool {
 }
 
 // Returns true if the contents of 's' contain the value at the given index of the given string
-func (s State) contentContains(str string, idx int) bool {
+func (s State) contentContains(str []rune, idx int) bool {
 	if s.assert != NONE {
 		return s.checkAssertion(str, idx)
 	}
@@ -100,7 +100,7 @@ func (s State) contentContains(str string, idx int) bool {
 
 // Returns the matches for the character at the given index of the given string.
 // Also returns the number of matches. Returns -1 if an assertion failed.
-func (s State) matchesFor(str string, idx int) ([]*State, int) {
+func (s State) matchesFor(str []rune, idx int) ([]*State, int) {
 	// Assertions can be viewed as 'checks'. If the check fails, we return
 	// an empty array and 0.
 	// If it passes, we treat it like any other state, and return all the transitions.
diff --git a/re_test.go b/re_test.go
index 20cf847..36b4590 100644
--- a/re_test.go
+++ b/re_test.go
@@ -116,7 +116,7 @@ func TestFindAllMatches(t *testing.T) {
 		t.Run(test.re+"	"+test.str, func(t *testing.T) {
 			re_postfix := shuntingYard(test.re)
 			startState := thompson(re_postfix)
-			matchIndices := findAllMatches(startState, test.str)
+			matchIndices := findAllMatches(startState, []rune(test.str))
 			if !slices.Equal(test.result, matchIndices) {
 				t.Errorf("Wanted %v	Got %v\n", test.result, matchIndices)
 			}