From 8a1f1dc621e7f4228ebf60be0571f08551f0398e Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Mon, 18 Nov 2024 10:41:50 -0500 Subject: [PATCH] Added unicode support Replaced strings with rune-slices, which capture unicode codepoints more accurately. --- main.go | 2 +- matching.go | 4 ++-- misc.go | 2 +- nfa.go | 6 +++--- re_test.go | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/main.go b/main.go index c4f9dff..ea47319 100644 --- a/main.go +++ b/main.go @@ -406,7 +406,7 @@ func main() { startState := thompson(re_postfix) // Read every string from stdin until we encounter an error. If the error isn't EOF, panic.' for test_str, err = reader.ReadString('\n'); err == nil; test_str, err = reader.ReadString('\n') { - matchIndices := findAllMatches(startState, test_str) + matchIndices := findAllMatches(startState, []rune(test_str)) // Decompose the array of matchIndex structs into a flat unique array of ints - if matchIndex is {4,7}, flat array will contain 4,5,6 // This should make checking O(1) instead of O(n) indicesToPrint := new_uniq_arr[int]() diff --git a/matching.go b/matching.go index 8cbfb60..27313f5 100644 --- a/matching.go +++ b/matching.go @@ -81,7 +81,7 @@ func pruneIndices(indices []MatchIndex) []MatchIndex { // findAllMatches tries to find all matches of the regex represented by given start-state, with // the given string -func findAllMatches(start *State, str string) []MatchIndex { +func findAllMatches(start *State, str []rune) []MatchIndex { idx := 0 var matchFound bool var matchIdx MatchIndex @@ -104,7 +104,7 @@ func findAllMatches(start *State, str string) []MatchIndex { // the next search should start from. // // Might return duplicates or overlapping indices, so care must be taken to prune the resulting array. -func findAllMatchesHelper(start *State, str string, offset int) (bool, MatchIndex, int) { +func findAllMatchesHelper(start *State, str []rune, offset int) (bool, MatchIndex, int) { // Base case - exit if offset exceeds string's length if offset > len(str) { // The first value here shouldn't be used, because we should exit when the second return value is > than len(str) diff --git a/misc.go b/misc.go index 282d65c..f3c8fd5 100644 --- a/misc.go +++ b/misc.go @@ -22,7 +22,7 @@ func dotChars() []rune { // Returns all possible characters represented by the d } // Returns true if str[idx] and str[idx-1] are separated by a word boundary. -func isWordBoundary(str string, idx int) bool { +func isWordBoundary(str []rune, idx int) bool { str_runes := []rune(str) wbounded := idx == 0 || idx >= len(str) || diff --git a/nfa.go b/nfa.go index dd2a2b6..b472020 100644 --- a/nfa.go +++ b/nfa.go @@ -73,7 +73,7 @@ func cloneStateHelper(state *State, cloneMap map[*State]*State) *State { // Checks if the given state's assertion is true. Returns true if the given // state doesn't have an assertion. -func (s State) checkAssertion(str string, idx int) bool { +func (s State) checkAssertion(str []rune, idx int) bool { if s.assert == SOS { return idx == 0 } @@ -90,7 +90,7 @@ func (s State) checkAssertion(str string, idx int) bool { } // Returns true if the contents of 's' contain the value at the given index of the given string -func (s State) contentContains(str string, idx int) bool { +func (s State) contentContains(str []rune, idx int) bool { if s.assert != NONE { return s.checkAssertion(str, idx) } @@ -100,7 +100,7 @@ func (s State) contentContains(str string, idx int) bool { // Returns the matches for the character at the given index of the given string. // Also returns the number of matches. Returns -1 if an assertion failed. -func (s State) matchesFor(str string, idx int) ([]*State, int) { +func (s State) matchesFor(str []rune, idx int) ([]*State, int) { // Assertions can be viewed as 'checks'. If the check fails, we return // an empty array and 0. // If it passes, we treat it like any other state, and return all the transitions. diff --git a/re_test.go b/re_test.go index 20cf847..36b4590 100644 --- a/re_test.go +++ b/re_test.go @@ -116,7 +116,7 @@ func TestFindAllMatches(t *testing.T) { t.Run(test.re+" "+test.str, func(t *testing.T) { re_postfix := shuntingYard(test.re) startState := thompson(re_postfix) - matchIndices := findAllMatches(startState, test.str) + matchIndices := findAllMatches(startState, []rune(test.str)) if !slices.Equal(test.result, matchIndices) { t.Errorf("Wanted %v Got %v\n", test.result, matchIndices) }