Added Unicode support

Replaced strings with rune slices, which capture Unicode code points more
accurately.
master
Aadhavan Srinivasan 2 months ago
parent 805766a5ba
commit 8a1f1dc621

@ -406,7 +406,7 @@ func main() {
startState := thompson(re_postfix)
// Read every string from stdin until we encounter an error. If the error isn't EOF, panic.
for test_str, err = reader.ReadString('\n'); err == nil; test_str, err = reader.ReadString('\n') {
matchIndices := findAllMatches(startState, test_str)
matchIndices := findAllMatches(startState, []rune(test_str))
// Decompose the array of matchIndex structs into a flat unique array of ints - if matchIndex is {4,7}, flat array will contain 4,5,6
// This should make checking O(1) instead of O(n)
indicesToPrint := new_uniq_arr[int]()

@ -81,7 +81,7 @@ func pruneIndices(indices []MatchIndex) []MatchIndex {
// findAllMatches tries to find all matches of the regex represented by the given start state
// in the given string
func findAllMatches(start *State, str string) []MatchIndex {
func findAllMatches(start *State, str []rune) []MatchIndex {
idx := 0
var matchFound bool
var matchIdx MatchIndex
@ -104,7 +104,7 @@ func findAllMatches(start *State, str string) []MatchIndex {
// the next search should start from.
//
// Might return duplicates or overlapping indices, so care must be taken to prune the resulting array.
func findAllMatchesHelper(start *State, str string, offset int) (bool, MatchIndex, int) {
func findAllMatchesHelper(start *State, str []rune, offset int) (bool, MatchIndex, int) {
// Base case - exit if offset exceeds string's length
if offset > len(str) {
// The first value here shouldn't be used, because we should exit when the second return value is greater than len(str)

@ -22,7 +22,7 @@ func dotChars() []rune { // Returns all possible characters represented by the d
}
// Returns true if str[idx] and str[idx-1] are separated by a word boundary.
func isWordBoundary(str string, idx int) bool {
func isWordBoundary(str []rune, idx int) bool {
str_runes := []rune(str)
wbounded := idx == 0 ||
idx >= len(str) ||

@ -73,7 +73,7 @@ func cloneStateHelper(state *State, cloneMap map[*State]*State) *State {
// Checks if the given state's assertion is true. Returns true if the given
// state doesn't have an assertion.
func (s State) checkAssertion(str string, idx int) bool {
func (s State) checkAssertion(str []rune, idx int) bool {
if s.assert == SOS {
return idx == 0
}
@ -90,7 +90,7 @@ func (s State) checkAssertion(str string, idx int) bool {
}
// Returns true if the contents of 's' contain the value at the given index of the given string
func (s State) contentContains(str string, idx int) bool {
func (s State) contentContains(str []rune, idx int) bool {
if s.assert != NONE {
return s.checkAssertion(str, idx)
}
@ -100,7 +100,7 @@ func (s State) contentContains(str string, idx int) bool {
// Returns the matches for the character at the given index of the given string.
// Also returns the number of matches. Returns -1 if an assertion failed.
func (s State) matchesFor(str string, idx int) ([]*State, int) {
func (s State) matchesFor(str []rune, idx int) ([]*State, int) {
// Assertions can be viewed as 'checks'. If the check fails, we return
// an empty array and 0.
// If it passes, we treat it like any other state, and return all the transitions.

@ -116,7 +116,7 @@ func TestFindAllMatches(t *testing.T) {
t.Run(test.re+" "+test.str, func(t *testing.T) {
re_postfix := shuntingYard(test.re)
startState := thompson(re_postfix)
matchIndices := findAllMatches(startState, test.str)
matchIndices := findAllMatches(startState, []rune(test.str))
if !slices.Equal(test.result, matchIndices) {
t.Errorf("Wanted %v Got %v\n", test.result, matchIndices)
}

Loading…
Cancel
Save