package main

// NodeType identifies the kind of a postfixNode.
type NodeType int

// escapedChars lists every character that takes on a special meaning when it
// is escaped, e.g. \b is a word boundary and \w is a word character.
var escapedChars = []rune("wWdDbBnaftrvsS0")

// The possible node types.
const (
	CHARACTER NodeType = iota
	CHARCLASS
	PIPE
	CONCATENATE
	KLEENE
	QUESTION
	PLUS
	ASSERTION
	LPAREN
	RPAREN
)

// Helper constants for lookarounds: the sign of the assertion and the
// direction in which it looks.
const (
	POSITIVE   = 1
	NEGATIVE   = -1
	LOOKAHEAD  = 1
	LOOKBEHIND = -1
)

// INFINITE_REPS represents an unbounded repetition count, e.g. the end of the
// range in {5,}.
var INFINITE_REPS = -1

// postfixNode is a single node in the postfix representation of the
// expression.
type postfixNode struct {
	nodetype NodeType
	// contents holds the runes carried by the node.
	contents []rune
	// startReps is the minimum number of times the node should be repeated.
	// Used with numeric specifiers.
	startReps int
	// endReps is the maximum number of times the node should be repeated.
	// Used with numeric specifiers.
	endReps int
	// allChars reports whether the node represents all characters
	// (e.g. the dot metacharacter).
	allChars bool
	// except is used for inverted character classes, which match every
	// unicode character _except_ a few: allChars is true and the exceptions
	// are placed here.
	except []postfixNode
	// lookaroundSign is ONLY USED WHEN nodetype == ASSERTION. It says whether
	// the lookaround is positive or negative.
	lookaroundSign int
	// lookaroundDir says whether the assertion is a lookbehind or a lookahead.
	lookaroundDir int
	// nodeContents is ONLY USED WHEN nodetype == CHARCLASS. It holds all the
	// nodes inside the given CHARCLASS node.
	nodeContents []postfixNode
}

// Converts the given list of postfixNodes to one node of type CHARCLASS.
// Used to convert eg. 'a', 'b' and 'c' to '[abc]'.
// If the character class is negated, it returns a postfixNode of type CHARACTER
// instead. That node behaves like the dot metacharacter, except that it carries
// a list of nodes that it will not match.
func newCharClassNode(nodes []postfixNode, negated bool) postfixNode { rtv := postfixNode{} rtv.nodetype = CHARCLASS rtv.startReps = 1 rtv.endReps = 1 if negated { rtv.nodetype = CHARACTER rtv.contents = []rune{ANY_CHAR} rtv.allChars = true rtv.except = nodes } else { rtv.nodeContents = nodes } return rtv } // Creates a new escaped node - the given character is assumed to have been preceded by a backslash func newEscapedNode(c rune) postfixNode { toReturn := postfixNode{} toReturn.startReps = 1 toReturn.endReps = 1 switch c { case 's': // Whitespace toReturn.nodetype = CHARACTER toReturn.contents = append(toReturn.contents, whitespaceChars...) case 'S': // Non-whitespace toReturn = newPostfixDotNode() toReturn.except = append([]postfixNode{}, newPostfixNode(whitespaceChars...)) case 'd': // Digits toReturn.nodetype = CHARACTER toReturn.contents = append(toReturn.contents, digitChars...) case 'D': // Non-digits toReturn = newPostfixDotNode() toReturn.except = append([]postfixNode{}, newPostfixNode(digitChars...)) case 'w': // word character toReturn.nodetype = CHARACTER toReturn.contents = append(toReturn.contents, wordChars...) 
case 'W': // Non-word character toReturn = newPostfixDotNode() toReturn.except = append([]postfixNode{}, newPostfixNode(wordChars...)) case 'b', 'B': toReturn.nodetype = ASSERTION toReturn.contents = append(toReturn.contents, c) case 'n': // Newline character toReturn.nodetype = CHARACTER toReturn.contents = append(toReturn.contents, '\n') case '0': // NULL character toReturn.nodetype = CHARACTER toReturn.contents = append(toReturn.contents, rune(0)) case 'a': // Bell character toReturn.nodetype = CHARACTER toReturn.contents = append(toReturn.contents, rune(7)) case 'f': // Form feed character toReturn.nodetype = CHARACTER toReturn.contents = append(toReturn.contents, rune(12)) case 't': // Horizontal tab character toReturn.nodetype = CHARACTER toReturn.contents = append(toReturn.contents, rune(9)) case 'r': // Carriage return toReturn.nodetype = CHARACTER toReturn.contents = append(toReturn.contents, rune(13)) case 'v': // Vertical tab toReturn.nodetype = CHARACTER toReturn.contents = append(toReturn.contents, rune(11)) default: // None of the above - append it as a regular character toReturn.nodetype = CHARACTER toReturn.contents = append(toReturn.contents, c) } return toReturn } // Creates and returns a postfixNode based on the given contents func newPostfixNode(contents ...rune) postfixNode { if len(contents) < 1 { panic("Empty node.") } to_return := postfixNode{} to_return.startReps = 1 to_return.endReps = 1 if len(contents) > 1 { // If the node has more than element, it must be a character class - the type must be CHARACTER to_return.nodetype = CHARACTER to_return.contents = contents } else { // Node has one element, could be anything switch contents[0] { case '+': to_return.nodetype = PLUS case '?': to_return.nodetype = QUESTION case '*': to_return.nodetype = KLEENE case '|': to_return.nodetype = PIPE case CONCAT: to_return.nodetype = CONCATENATE case '^', '$': to_return.nodetype = ASSERTION case '(': to_return.nodetype = LPAREN case ')': to_return.nodetype 
= RPAREN default: to_return.nodetype = CHARACTER } to_return.contents = append(to_return.contents, contents...) // Special cases for LPAREN and RPAREN - they have special characters defined for them if to_return.nodetype == LPAREN { to_return.contents = []rune{LPAREN_CHAR} } if to_return.nodetype == RPAREN { to_return.contents = []rune{RPAREN_CHAR} } } return to_return } // Creates and returns a postfixNode representing the 'dot' metacharacter. func newPostfixDotNode() postfixNode { toReturn := postfixNode{} toReturn.startReps = 1 toReturn.endReps = 1 toReturn.nodetype = CHARACTER toReturn.allChars = true toReturn.contents = []rune{ANY_CHAR} return toReturn } // Creates a character node, regardless of the contents func newPostfixCharNode(contents ...rune) postfixNode { toReturn := postfixNode{} toReturn.startReps = 1 toReturn.endReps = 1 toReturn.nodetype = CHARACTER toReturn.contents = append(toReturn.contents, contents...) return toReturn }