Added unicode support to dot metacharacter - it now matches _any_ unicode character (almost)
This commit is contained in:
13
main.go
13
main.go
@@ -141,7 +141,7 @@ func shuntingYard(re string) []postfixNode {
|
||||
}
|
||||
|
||||
if c == '.' { // Dot metacharacter - represents any character except those listed in notDotChars (currently only '\n')
|
||||
outQueue = append(outQueue, newPostfixNode(dotChars()...))
|
||||
outQueue = append(outQueue, newPostfixDotNode())
|
||||
continue
|
||||
}
|
||||
if c == '^' { // Start-of-string assertion
|
||||
@@ -282,6 +282,9 @@ func thompson(re []postfixNode) *State {
|
||||
if c.nodetype == CHARACTER || c.nodetype == ASSERTION {
|
||||
state := State{}
|
||||
state.transitions = make(map[int][]*State)
|
||||
if c.isDot {
|
||||
state.isDot = true
|
||||
}
|
||||
state.content = rune2Contents(c.contents)
|
||||
state.output = make([]*State, 0)
|
||||
state.output = append(state.output, &state)
|
||||
@@ -397,6 +400,7 @@ func main() {
|
||||
var re string
|
||||
re = flag.Args()[0]
|
||||
var test_str string
|
||||
var test_runes []rune // Rune-slice representation of test_str
|
||||
var err error
|
||||
// Create reader for stdin and writer for stdout // End index is one more than last index of match
|
||||
reader := bufio.NewReader(os.Stdin)
|
||||
@@ -406,7 +410,8 @@ func main() {
|
||||
startState := thompson(re_postfix)
|
||||
// Read every string from stdin until we encounter an error. If the error isn't EOF, panic.
|
||||
for test_str, err = reader.ReadString('\n'); err == nil; test_str, err = reader.ReadString('\n') {
|
||||
matchIndices := findAllMatches(startState, []rune(test_str))
|
||||
test_runes = []rune(test_str)
|
||||
matchIndices := findAllMatches(startState, []rune(test_runes))
|
||||
// Decompose the array of matchIndex structs into a flat unique array of ints - if matchIndex is {4,7}, flat array will contain 4,5,6
|
||||
// This should make checking O(1) instead of O(n)
|
||||
indicesToPrint := new_uniq_arr[int]()
|
||||
@@ -422,7 +427,7 @@ func main() {
|
||||
// Find all numbers from 0 to len(test_runes) that are NOT in oldIndices.
|
||||
// These are the values we want to print, now that we have inverted the match.
|
||||
// Re-initialize indicesToPrint and add all of these values to it.
|
||||
indicesToPrint.add(setDifference(genRange(0, len(test_str)), oldIndices)...)
|
||||
indicesToPrint.add(setDifference(genRange(0, len(test_runes)), oldIndices)...)
|
||||
|
||||
}
|
||||
// If lineFlag is enabled, we should only print something if:
|
||||
@@ -435,7 +440,7 @@ func main() {
|
||||
continue
|
||||
}
|
||||
}
|
||||
for i, c := range test_str {
|
||||
for i, c := range test_runes {
|
||||
if indicesToPrint.contains(i) {
|
||||
color.New(color.FgRed).Fprintf(out, "%c", c)
|
||||
// Newline after every match - only if -o is enabled and -v is disabled.
|
||||
|
2
misc.go
2
misc.go
@@ -8,8 +8,10 @@ import (
|
||||
var whitespaceChars = []rune{' ', '\t', '\n'}
|
||||
var digitChars = []rune{'0', '1', '2', '3', '4', '5', '6', '7', '8', '9'}
|
||||
var wordChars = []rune("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_")
|
||||
var notDotChars = []rune{'\n'}
|
||||
var LBRACKET rune = 0xF0000
|
||||
var RBRACKET rune = 0xF0001
|
||||
var ANY_CHAR rune = 0xF0002 // Represents any character - used for 'dot' metacharacter
|
||||
|
||||
func dotChars() []rune { // Returns all possible characters represented by the dot metacharacter - this is too tedious to define as a variable, which is why it is a function
|
||||
start := 0x0020
|
||||
|
11
nfa.go
11
nfa.go
@@ -23,6 +23,7 @@ type State struct {
|
||||
isKleene bool // Identifies whether current node is a 0-state representing Kleene star
|
||||
assert assertType // Type of assertion of current node - NONE means that the node doesn't assert anything
|
||||
zeroMatchFound bool // Whether or not the state has been used for a zero-length match - only relevant for zero states
|
||||
isDot bool // Whether or not the state represents a 'dot' metacharacter. A 'dot' node doesn't store any contents directly, as it would take up too much space
|
||||
}
|
||||
|
||||
// Clones the NFA starting from the given state.
|
||||
@@ -94,6 +95,9 @@ func (s State) contentContains(str []rune, idx int) bool {
|
||||
if s.assert != NONE {
|
||||
return s.checkAssertion(str, idx)
|
||||
}
|
||||
if s.isDot {
|
||||
return !slices.Contains(notDotChars, str[idx])
|
||||
}
|
||||
// Default - s.assert must be NONE
|
||||
return slices.Contains(s.content, int(str[idx]))
|
||||
}
|
||||
@@ -116,7 +120,12 @@ func (s State) matchesFor(str []rune, idx int) ([]*State, int) {
|
||||
if s.assert == NONWBOUND && isWordBoundary(str, idx) {
|
||||
return make([]*State, 0), -1
|
||||
}
|
||||
return s.transitions[int(str[idx])], len(s.transitions[int(str[idx])])
|
||||
listTransitions := s.transitions[int(str[idx])]
|
||||
for _, dest := range s.transitions[int(ANY_CHAR)] {
|
||||
listTransitions = append(listTransitions, dest)
|
||||
}
|
||||
numTransitions := len(listTransitions)
|
||||
return listTransitions, numTransitions
|
||||
}
|
||||
|
||||
type NFA struct {
|
||||
|
@@ -26,6 +26,7 @@ type postfixNode struct {
|
||||
contents []rune // Contents of the node - the length of this would only be >1 if the node represents a character class
|
||||
startReps int // Minimum number of times the node should be repeated - used with numeric specifiers
|
||||
endReps int // Maximum number of times the node should be repeated - used with numeric specifiers
|
||||
isDot bool // Whether or not the current node represents a 'dot' metacharacter
|
||||
}
|
||||
|
||||
// Creates a new escaped node - the given character is assumed to have been preceded by a backslash
|
||||
@@ -105,9 +106,22 @@ func newPostfixNode(contents ...rune) postfixNode {
|
||||
return to_return
|
||||
}
|
||||
|
||||
// Creates and returns a postfixNode representing the 'dot' metacharacter.
|
||||
func newPostfixDotNode() postfixNode {
|
||||
toReturn := postfixNode{}
|
||||
toReturn.startReps = 1
|
||||
toReturn.endReps = 1
|
||||
toReturn.nodetype = CHARACTER
|
||||
toReturn.isDot = true
|
||||
toReturn.contents = []rune{ANY_CHAR}
|
||||
return toReturn
|
||||
}
|
||||
|
||||
// Creates a character node, regardless of the contents
|
||||
func newPostfixCharNode(contents ...rune) postfixNode {
|
||||
toReturn := postfixNode{}
|
||||
toReturn.startReps = 1
|
||||
toReturn.endReps = 1
|
||||
toReturn.nodetype = CHARACTER
|
||||
toReturn.contents = append(toReturn.contents, contents...)
|
||||
return toReturn
|
||||
|
Reference in New Issue
Block a user