Added unicode support to dot metacharacter - it now matches _any_ unicode character (almost)

master
Aadhavan Srinivasan 1 month ago
parent 8a1f1dc621
commit c56d81a335

@ -141,7 +141,7 @@ func shuntingYard(re string) []postfixNode {
}
if c == '.' { // Dot metacharacter - represents 'any' character, but I am only adding Unicode 0020-007E
outQueue = append(outQueue, newPostfixNode(dotChars()...))
outQueue = append(outQueue, newPostfixDotNode())
continue
}
if c == '^' { // Start-of-string assertion
@ -282,6 +282,9 @@ func thompson(re []postfixNode) *State {
if c.nodetype == CHARACTER || c.nodetype == ASSERTION {
state := State{}
state.transitions = make(map[int][]*State)
if c.isDot {
state.isDot = true
}
state.content = rune2Contents(c.contents)
state.output = make([]*State, 0)
state.output = append(state.output, &state)
@ -397,6 +400,7 @@ func main() {
var re string
re = flag.Args()[0]
var test_str string
var test_runes []rune // Rune-slice representation of test_str
var err error
// Create reader for stdin and writer for stdout // End index is one more than last index of match
reader := bufio.NewReader(os.Stdin)
@ -406,7 +410,8 @@ func main() {
startState := thompson(re_postfix)
// Read every string from stdin until we encounter an error. If the error isn't EOF, panic.
for test_str, err = reader.ReadString('\n'); err == nil; test_str, err = reader.ReadString('\n') {
matchIndices := findAllMatches(startState, []rune(test_str))
test_runes = []rune(test_str)
matchIndices := findAllMatches(startState, []rune(test_runes))
// Decompose the array of matchIndex structs into a flat unique array of ints - if matchIndex is {4,7}, flat array will contain 4,5,6
// This should make checking O(1) instead of O(n)
indicesToPrint := new_uniq_arr[int]()
@ -422,7 +427,7 @@ func main() {
// Find all numbers from 0 to len(test_str) that are NOT in oldIndices.
// These are the values we want to print, now that we have inverted the match.
// Re-initialize indicesToPrint and add all of these values to it.
indicesToPrint.add(setDifference(genRange(0, len(test_str)), oldIndices)...)
indicesToPrint.add(setDifference(genRange(0, len(test_runes)), oldIndices)...)
}
// If lineFlag is enabled, we should only print something if:
@ -435,7 +440,7 @@ func main() {
continue
}
}
for i, c := range test_str {
for i, c := range test_runes {
if indicesToPrint.contains(i) {
color.New(color.FgRed).Fprintf(out, "%c", c)
// Newline after every match - only if -o is enabled and -v is disabled.

@ -8,8 +8,10 @@ import (
var whitespaceChars = []rune{' ', '\t', '\n'}
var digitChars = []rune{'0', '1', '2', '3', '4', '5', '6', '7', '8', '9'}
var wordChars = []rune("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_")
var notDotChars = []rune{'\n'}
var LBRACKET rune = 0xF0000
var RBRACKET rune = 0xF0001
var ANY_CHAR rune = 0xF0002 // Represents any character - used for 'dot' metacharacter
func dotChars() []rune { // Returns all possible characters represented by the dot metacharacter - this is too tedious to define as a variable, which is why it is a function
start := 0x0020

@ -23,6 +23,7 @@ type State struct {
isKleene bool // Identifies whether current node is a 0-state representing Kleene star
assert assertType // Type of assertion of current node - NONE means that the node doesn't assert anything
zeroMatchFound bool // Whether or not the state has been used for a zero-length match - only relevant for zero states
isDot bool // Whether or not the state represents a 'dot' metacharacter. A 'dot' node doesn't store any contents directly, as it would take up too much space
}
// Clones the NFA starting from the given state.
@ -94,6 +95,9 @@ func (s State) contentContains(str []rune, idx int) bool {
if s.assert != NONE {
return s.checkAssertion(str, idx)
}
if s.isDot {
return !slices.Contains(notDotChars, str[idx])
}
// Default - s.assert must be NONE
return slices.Contains(s.content, int(str[idx]))
}
@ -116,7 +120,12 @@ func (s State) matchesFor(str []rune, idx int) ([]*State, int) {
if s.assert == NONWBOUND && isWordBoundary(str, idx) {
return make([]*State, 0), -1
}
return s.transitions[int(str[idx])], len(s.transitions[int(str[idx])])
listTransitions := s.transitions[int(str[idx])]
for _, dest := range s.transitions[int(ANY_CHAR)] {
listTransitions = append(listTransitions, dest)
}
numTransitions := len(listTransitions)
return listTransitions, numTransitions
}
type NFA struct {

@ -26,6 +26,7 @@ type postfixNode struct {
contents []rune // Contents of the node - the length of this would only be >1 if the node represents a character class
startReps int // Minimum number of times the node should be repeated - used with numeric specifiers
endReps int // Maximum number of times the node should be repeated - used with numeric specifiers
isDot bool // Whether or not the current node represents a 'dot' metacharacter
}
// Creates a new escaped node - the given character is assumed to have been preceded by a backslash
@ -105,9 +106,22 @@ func newPostfixNode(contents ...rune) postfixNode {
return to_return
}
// newPostfixDotNode builds the postfixNode that stands for the 'dot'
// metacharacter. The node is a CHARACTER node repeated exactly once, flagged
// with isDot, whose only content is the ANY_CHAR placeholder rune.
func newPostfixDotNode() postfixNode {
	return postfixNode{
		nodetype:  CHARACTER,
		contents:  []rune{ANY_CHAR},
		startReps: 1,
		endReps:   1,
		isDot:     true,
	}
}
// Creates a character node, regardless of the contents
func newPostfixCharNode(contents ...rune) postfixNode {
toReturn := postfixNode{}
toReturn.startReps = 1
toReturn.endReps = 1
toReturn.nodetype = CHARACTER
toReturn.contents = append(toReturn.contents, contents...)
return toReturn

Loading…
Cancel
Save