Added Unicode support
Replaced strings with rune slices, so that indexing operates on Unicode code points rather than on bytes.
This commit is contained in:
2
main.go
2
main.go
@@ -406,7 +406,7 @@ func main() {
|
|||||||
startState := thompson(re_postfix)
|
startState := thompson(re_postfix)
|
||||||
// Read every string from stdin until we encounter an error. If the error isn't EOF, panic.
|
// Read every string from stdin until we encounter an error. If the error isn't EOF, panic.
|
||||||
for test_str, err = reader.ReadString('\n'); err == nil; test_str, err = reader.ReadString('\n') {
|
for test_str, err = reader.ReadString('\n'); err == nil; test_str, err = reader.ReadString('\n') {
|
||||||
matchIndices := findAllMatches(startState, test_str)
|
matchIndices := findAllMatches(startState, []rune(test_str))
|
||||||
// Decompose the array of matchIndex structs into a flat unique array of ints - if matchIndex is {4,7}, flat array will contain 4,5,6
|
// Decompose the array of matchIndex structs into a flat unique array of ints - if matchIndex is {4,7}, flat array will contain 4,5,6
|
||||||
// This should make checking O(1) instead of O(n)
|
// This should make checking O(1) instead of O(n)
|
||||||
indicesToPrint := new_uniq_arr[int]()
|
indicesToPrint := new_uniq_arr[int]()
|
||||||
|
@@ -81,7 +81,7 @@ func pruneIndices(indices []MatchIndex) []MatchIndex {
|
|||||||
|
|
||||||
// findAllMatches tries to find all matches of the regex represented by given start-state, with
|
// findAllMatches tries to find all matches of the regex represented by given start-state, with
|
||||||
// the given string
|
// the given string
|
||||||
func findAllMatches(start *State, str string) []MatchIndex {
|
func findAllMatches(start *State, str []rune) []MatchIndex {
|
||||||
idx := 0
|
idx := 0
|
||||||
var matchFound bool
|
var matchFound bool
|
||||||
var matchIdx MatchIndex
|
var matchIdx MatchIndex
|
||||||
@@ -104,7 +104,7 @@ func findAllMatches(start *State, str string) []MatchIndex {
|
|||||||
// the next search should start from.
|
// the next search should start from.
|
||||||
//
|
//
|
||||||
// Might return duplicates or overlapping indices, so care must be taken to prune the resulting array.
|
// Might return duplicates or overlapping indices, so care must be taken to prune the resulting array.
|
||||||
func findAllMatchesHelper(start *State, str string, offset int) (bool, MatchIndex, int) {
|
func findAllMatchesHelper(start *State, str []rune, offset int) (bool, MatchIndex, int) {
|
||||||
// Base case - exit if offset exceeds string's length
|
// Base case - exit if offset exceeds string's length
|
||||||
if offset > len(str) {
|
if offset > len(str) {
|
||||||
// The first value here shouldn't be used, because we should exit when the second return value is greater than len(str)
|
// The first value here shouldn't be used, because we should exit when the second return value is greater than len(str)
|
||||||
|
2
misc.go
2
misc.go
@@ -22,7 +22,7 @@ func dotChars() []rune { // Returns all possible characters represented by the d
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Returns true if str[idx] and str[idx-1] are separated by a word boundary.
|
// Returns true if str[idx] and str[idx-1] are separated by a word boundary.
|
||||||
func isWordBoundary(str string, idx int) bool {
|
func isWordBoundary(str []rune, idx int) bool {
|
||||||
str_runes := []rune(str)
|
str_runes := []rune(str)
|
||||||
wbounded := idx == 0 ||
|
wbounded := idx == 0 ||
|
||||||
idx >= len(str) ||
|
idx >= len(str) ||
|
||||||
|
6
nfa.go
6
nfa.go
@@ -73,7 +73,7 @@ func cloneStateHelper(state *State, cloneMap map[*State]*State) *State {
|
|||||||
|
|
||||||
// Checks if the given state's assertion is true. Returns true if the given
|
// Checks if the given state's assertion is true. Returns true if the given
|
||||||
// state doesn't have an assertion.
|
// state doesn't have an assertion.
|
||||||
func (s State) checkAssertion(str string, idx int) bool {
|
func (s State) checkAssertion(str []rune, idx int) bool {
|
||||||
if s.assert == SOS {
|
if s.assert == SOS {
|
||||||
return idx == 0
|
return idx == 0
|
||||||
}
|
}
|
||||||
@@ -90,7 +90,7 @@ func (s State) checkAssertion(str string, idx int) bool {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Returns true if the contents of 's' contain the value at the given index of the given string
|
// Returns true if the contents of 's' contain the value at the given index of the given string
|
||||||
func (s State) contentContains(str string, idx int) bool {
|
func (s State) contentContains(str []rune, idx int) bool {
|
||||||
if s.assert != NONE {
|
if s.assert != NONE {
|
||||||
return s.checkAssertion(str, idx)
|
return s.checkAssertion(str, idx)
|
||||||
}
|
}
|
||||||
@@ -100,7 +100,7 @@ func (s State) contentContains(str string, idx int) bool {
|
|||||||
|
|
||||||
// Returns the matches for the character at the given index of the given string.
|
// Returns the matches for the character at the given index of the given string.
|
||||||
// Also returns the number of matches. Returns -1 if an assertion failed.
|
// Also returns the number of matches. Returns -1 if an assertion failed.
|
||||||
func (s State) matchesFor(str string, idx int) ([]*State, int) {
|
func (s State) matchesFor(str []rune, idx int) ([]*State, int) {
|
||||||
// Assertions can be viewed as 'checks'. If the check fails, we return
|
// Assertions can be viewed as 'checks'. If the check fails, we return
|
||||||
// an empty array and 0.
|
// an empty array and 0.
|
||||||
// If it passes, we treat it like any other state, and return all the transitions.
|
// If it passes, we treat it like any other state, and return all the transitions.
|
||||||
|
@@ -116,7 +116,7 @@ func TestFindAllMatches(t *testing.T) {
|
|||||||
t.Run(test.re+" "+test.str, func(t *testing.T) {
|
t.Run(test.re+" "+test.str, func(t *testing.T) {
|
||||||
re_postfix := shuntingYard(test.re)
|
re_postfix := shuntingYard(test.re)
|
||||||
startState := thompson(re_postfix)
|
startState := thompson(re_postfix)
|
||||||
matchIndices := findAllMatches(startState, test.str)
|
matchIndices := findAllMatches(startState, []rune(test.str))
|
||||||
if !slices.Equal(test.result, matchIndices) {
|
if !slices.Equal(test.result, matchIndices) {
|
||||||
t.Errorf("Wanted %v Got %v\n", test.result, matchIndices)
|
t.Errorf("Wanted %v Got %v\n", test.result, matchIndices)
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user