Added a new node type 'CHARCLASS', which represents a character class containing other postfixNodes. The 'except' field now contains a list of postfixNodes rather than runes.

master
Aadhavan Srinivasan 3 days ago
parent b81a2f8452
commit 7056026e10

@ -2,9 +2,14 @@ package main
type NodeType int type NodeType int
// escapedChars lists every character that takes on a special meaning when it
// is preceded by a backslash (for example, \b is a word boundary and \w is a
// word character).
var escapedChars = []rune("wWdDbBnaftrvsS0")
// This is a list of the possible node types // This is a list of the possible node types
const ( const (
CHARACTER NodeType = iota CHARACTER NodeType = iota
CHARCLASS
PIPE PIPE
CONCATENATE CONCATENATE
KLEENE KLEENE
@ -25,13 +30,35 @@ var INFINITE_REPS int = -1 // Represents infinite reps eg. the end range in {5,}
// This represents a node in the postfix representation of the expression // This represents a node in the postfix representation of the expression
type postfixNode struct { type postfixNode struct {
nodetype NodeType nodetype NodeType
contents []rune // Contents of the node contents []rune // Contents of the node
startReps int // Minimum number of times the node should be repeated - used with numeric specifiers startReps int // Minimum number of times the node should be repeated - used with numeric specifiers
endReps int // Maximum number of times the node should be repeated - used with numeric specifiers endReps int // Maximum number of times the node should be repeated - used with numeric specifiers
allChars bool // Whether or not the current node represents all characters (eg. dot metacharacter) allChars bool // Whether or not the current node represents all characters (eg. dot metacharacter)
except []rune // For inverted character classes, we match every unicode character _except_ a few. In this case, allChars is true and the exceptions are placed here. except []postfixNode // For inverted character classes, we match every unicode character _except_ a few. In this case, allChars is true and the exceptions are placed here.
lookaroundSign int // ONLY USED WHEN nodetype == ASSERTION. Whether we have a positive or negative lookaround. lookaroundSign int // ONLY USED WHEN nodetype == ASSERTION. Whether we have a positive or negative lookaround.
lookaroundDir int // Lookbehind or lookahead lookaroundDir int // Lookbehind or lookahead
nodeContents []postfixNode // ONLY USED WHEN nodetype == CHARCLASS. Holds all the nodes inside the given CHARCLASS node.
}
// newCharClassNode converts the given list of postfixNodes into a single node.
// Used to convert eg. 'a', 'b' and 'c' to '[abc]'.
//
// Parameters:
//   - nodes: the nodes that make up the character class.
//   - negated: whether the character class is negated.
//
// Returns:
//   - If negated is false, a node of type CHARCLASS whose nodeContents field holds 'nodes'.
//   - If negated is true, a node of type CHARACTER with allChars set to true, contents
//     holding only ANY_CHAR, and 'nodes' stored in the 'except' field. This node is meant
//     to behave like the dot metacharacter, but with a list of nodes that it will not match.
//
// In both cases startReps and endReps are set to 1. The 'nodes' slice is stored as-is
// (it is not copied), so the returned node refers to the same slice the caller passed in.
func newCharClassNode(nodes []postfixNode, negated bool) postfixNode {
rtv := postfixNode{}
// Default to a CHARCLASS node; the negated branch below overrides the type.
rtv.nodetype = CHARCLASS
// Minimum and maximum repetition counts are both 1.
rtv.startReps = 1
rtv.endReps = 1
if negated {
// Negated class: an all-characters CHARACTER node, with the class members
// recorded as exceptions. nodeContents keeps its zero value in this branch.
rtv.nodetype = CHARACTER
rtv.contents = []rune{ANY_CHAR}
rtv.allChars = true
rtv.except = nodes
} else {
// Regular class: the members are kept in nodeContents.
rtv.nodeContents = nodes
}
return rtv
} }
// Creates a new escaped node - the given character is assumed to have been preceded by a backslash // Creates a new escaped node - the given character is assumed to have been preceded by a backslash
@ -45,25 +72,43 @@ func newEscapedNode(c rune) postfixNode {
toReturn.contents = append(toReturn.contents, whitespaceChars...) toReturn.contents = append(toReturn.contents, whitespaceChars...)
case 'S': // Non-whitespace case 'S': // Non-whitespace
toReturn = newPostfixDotNode() toReturn = newPostfixDotNode()
toReturn.except = append([]rune{}, whitespaceChars...) toReturn.except = append([]postfixNode{}, newPostfixNode(whitespaceChars...))
case 'd': // Digits case 'd': // Digits
toReturn.nodetype = CHARACTER toReturn.nodetype = CHARACTER
toReturn.contents = append(toReturn.contents, digitChars...) toReturn.contents = append(toReturn.contents, digitChars...)
case 'D': // Non-digits case 'D': // Non-digits
toReturn = newPostfixDotNode() toReturn = newPostfixDotNode()
toReturn.except = append([]rune{}, digitChars...) toReturn.except = append([]postfixNode{}, newPostfixNode(digitChars...))
case 'w': // word character case 'w': // word character
toReturn.nodetype = CHARACTER toReturn.nodetype = CHARACTER
toReturn.contents = append(toReturn.contents, wordChars...) toReturn.contents = append(toReturn.contents, wordChars...)
case 'W': // Non-word character case 'W': // Non-word character
toReturn = newPostfixDotNode() toReturn = newPostfixDotNode()
toReturn.except = append([]rune{}, wordChars...) toReturn.except = append([]postfixNode{}, newPostfixNode(wordChars...))
case 'b', 'B': case 'b', 'B':
toReturn.nodetype = ASSERTION toReturn.nodetype = ASSERTION
toReturn.contents = append(toReturn.contents, c) toReturn.contents = append(toReturn.contents, c)
case 'n': // Newline character case 'n': // Newline character
toReturn.nodetype = CHARACTER toReturn.nodetype = CHARACTER
toReturn.contents = append(toReturn.contents, '\n') toReturn.contents = append(toReturn.contents, '\n')
case '0': // NULL character
toReturn.nodetype = CHARACTER
toReturn.contents = append(toReturn.contents, rune(0))
case 'a': // Bell character
toReturn.nodetype = CHARACTER
toReturn.contents = append(toReturn.contents, rune(7))
case 'f': // Form feed character
toReturn.nodetype = CHARACTER
toReturn.contents = append(toReturn.contents, rune(12))
case 't': // Horizontal tab character
toReturn.nodetype = CHARACTER
toReturn.contents = append(toReturn.contents, rune(9))
case 'r': // Carriage return
toReturn.nodetype = CHARACTER
toReturn.contents = append(toReturn.contents, rune(13))
case 'v': // Vertical tab
toReturn.nodetype = CHARACTER
toReturn.contents = append(toReturn.contents, rune(11))
default: // None of the above - append it as a regular character default: // None of the above - append it as a regular character
toReturn.nodetype = CHARACTER toReturn.nodetype = CHARACTER
toReturn.contents = append(toReturn.contents, c) toReturn.contents = append(toReturn.contents, c)

Loading…
Cancel
Save