diff --git a/postfixNode.go b/postfixNode.go
index ed60b80..68f589d 100644
--- a/postfixNode.go
+++ b/postfixNode.go
@@ -2,9 +2,14 @@ package main
 
 type NodeType int
 
+// escapedChars lists the characters that have a special meaning when escaped.
+// Eg. \b is word boundary, \w is word character etc.
+var escapedChars = []rune("wWdDbBnaftrvsS0")
+
 // This is a list of the possible node types
 const (
 	CHARACTER NodeType = iota
+	CHARCLASS
 	PIPE
 	CONCATENATE
 	KLEENE
@@ -25,13 +30,35 @@ var INFINITE_REPS int = -1 // Represents infinite reps eg. the end range in {5,}
 // This represents a node in the postfix representation of the expression
 type postfixNode struct {
 	nodetype       NodeType
-	contents       []rune // Contents of the node
-	startReps      int    // Minimum number of times the node should be repeated - used with numeric specifiers
-	endReps        int    // Maximum number of times the node should be repeated - used with numeric specifiers
-	allChars       bool   // Whether or not the current node represents all characters (eg. dot metacharacter)
-	except         []rune // For inverted character classes, we match every unicode character _except_ a few. In this case, allChars is true and the exceptions are placed here.
-	lookaroundSign int    // ONLY USED WHEN nodetype == ASSERTION. Whether we have a positive or negative lookaround.
-	lookaroundDir  int    // Lookbehind or lookahead
+	contents       []rune        // Contents of the node
+	startReps      int           // Minimum number of times the node should be repeated - used with numeric specifiers
+	endReps        int           // Maximum number of times the node should be repeated - used with numeric specifiers
+	allChars       bool          // Whether or not the current node represents all characters (eg. dot metacharacter)
+	except         []postfixNode // For inverted character classes, we match every unicode character _except_ a few. In this case, allChars is true and the excluded nodes are placed here.
+	lookaroundSign int           // ONLY USED WHEN nodetype == ASSERTION. Whether we have a positive or negative lookaround.
+	lookaroundDir  int           // Lookbehind or lookahead
+	nodeContents   []postfixNode // ONLY USED WHEN nodetype == CHARCLASS. Holds all the nodes inside the given CHARCLASS node.
+}
+
+// newCharClassNode combines the given postfixNodes into a single node.
+// Used to convert eg. 'a', 'b' and 'c' to '[abc]'.
+// A non-negated class yields a CHARCLASS node holding the nodes in nodeContents.
+// A negated class instead yields a CHARACTER node that behaves like the dot
+// metacharacter (allChars is set), with the given nodes stored in except.
+func newCharClassNode(nodes []postfixNode, negated bool) postfixNode {
+	rtv := postfixNode{}
+	rtv.nodetype = CHARCLASS
+	rtv.startReps = 1
+	rtv.endReps = 1
+	if negated {
+		rtv.nodetype = CHARACTER
+		rtv.contents = []rune{ANY_CHAR}
+		rtv.allChars = true
+		rtv.except = nodes
+	} else {
+		rtv.nodeContents = nodes
+	}
+	return rtv
 }
 
 // Creates a new escaped node - the given character is assumed to have been preceded by a backslash
@@ -45,25 +72,43 @@ func newEscapedNode(c rune) postfixNode {
 		toReturn.contents = append(toReturn.contents, whitespaceChars...)
 	case 'S': // Non-whitespace
 		toReturn = newPostfixDotNode()
-		toReturn.except = append([]rune{}, whitespaceChars...)
+		toReturn.except = []postfixNode{newPostfixNode(whitespaceChars...)}
 	case 'd': // Digits
 		toReturn.nodetype = CHARACTER
 		toReturn.contents = append(toReturn.contents, digitChars...)
 	case 'D': // Non-digits
 		toReturn = newPostfixDotNode()
-		toReturn.except = append([]rune{}, digitChars...)
+		toReturn.except = []postfixNode{newPostfixNode(digitChars...)}
 	case 'w': // word character
 		toReturn.nodetype = CHARACTER
 		toReturn.contents = append(toReturn.contents, wordChars...)
 	case 'W': // Non-word character
 		toReturn = newPostfixDotNode()
-		toReturn.except = append([]rune{}, wordChars...)
+		toReturn.except = []postfixNode{newPostfixNode(wordChars...)}
 	case 'b', 'B':
 		toReturn.nodetype = ASSERTION
 		toReturn.contents = append(toReturn.contents, c)
 	case 'n': // Newline character
 		toReturn.nodetype = CHARACTER
 		toReturn.contents = append(toReturn.contents, '\n')
+	case '0': // NULL character
+		toReturn.nodetype = CHARACTER
+		toReturn.contents = append(toReturn.contents, '\x00')
+	case 'a': // Bell character
+		toReturn.nodetype = CHARACTER
+		toReturn.contents = append(toReturn.contents, '\a')
+	case 'f': // Form feed character
+		toReturn.nodetype = CHARACTER
+		toReturn.contents = append(toReturn.contents, '\f')
+	case 't': // Horizontal tab character
+		toReturn.nodetype = CHARACTER
+		toReturn.contents = append(toReturn.contents, '\t')
+	case 'r': // Carriage return
+		toReturn.nodetype = CHARACTER
+		toReturn.contents = append(toReturn.contents, '\r')
+	case 'v': // Vertical tab
+		toReturn.nodetype = CHARACTER
+		toReturn.contents = append(toReturn.contents, '\v')
 	default: // None of the above - append it as a regular character
 		toReturn.nodetype = CHARACTER
 		toReturn.contents = append(toReturn.contents, c)