package regex

import "fmt"

// nodeType identifies the kind of a postfixNode (see the constants below).
type nodeType int

// escapedChars lists the characters that take on a special meaning when
// preceded by a backslash, e.g. \b is a word boundary and \w is a word
// character.
var escapedChars []rune = []rune("wWdDbBnaftrvsS0")

// The possible node types.
const (
	characterNode nodeType = iota
	charclassNode
	pipeNode
	concatenateNode
	kleeneNode
	questionNode
	plusNode
	assertionNode
	lparenNode
	rparenNode
)

// Helper constants for lookarounds: positive/negative are the values stored in
// postfixNode.lookaroundSign, lookahead/lookbehind the values stored in
// postfixNode.lookaroundDir.
const positive = 1
const negative = -1
const lookahead = 1
const lookbehind = -1

// infinite_reps represents an unbounded repetition count, e.g. the missing
// upper bound in {5,}.
var infinite_reps int = -1

// postfixNode is a single node in the postfix representation of the
// expression.
type postfixNode struct {
	nodetype       nodeType
	contents       []rune        // Contents of the node.
	startReps      int           // Minimum number of times the node should be repeated - used with numeric specifiers.
	endReps        int           // Maximum number of times the node should be repeated - used with numeric specifiers.
	allChars       bool          // Whether the node represents all characters (e.g. the dot metacharacter).
	except         []postfixNode // For inverted character classes we match every character _except_ a few: allChars is true and the exceptions are placed here.
	lookaroundSign int           // Only used when nodetype == assertionNode: whether the lookaround is positive or negative.
	lookaroundDir  int           // Lookbehind or lookahead.
	nodeContents   []postfixNode // Only used when nodetype == charclassNode: holds all the nodes inside the character class.
}

// newCharClassNode converts the given list of postfixNodes into a single node
// that is matched exactly once (startReps and endReps are both 1).
// It is used to convert e.g. 'a', 'b' and 'c' into '[abc]'.
//
// When negated is false the result has type charclassNode and the given nodes
// are stored in nodeContents.
// When negated is true the result has type characterNode instead: it behaves
// like the dot metacharacter (allChars is set and contents holds anyCharRune),
// and the given nodes are stored in except as the characters it will not match.
func newCharClassNode(nodes []postfixNode, negated bool) postfixNode { rtv := postfixNode{} rtv.nodetype = charclassNode rtv.startReps = 1 rtv.endReps = 1 if negated { rtv.nodetype = characterNode rtv.contents = []rune{anyCharRune} rtv.allChars = true rtv.except = nodes } else { rtv.nodeContents = nodes } return rtv } // Creates a new escaped node - the given character is assumed to have been preceded by a backslash func newEscapedNode(c rune, inCharClass bool) (postfixNode, error) { toReturn := postfixNode{} toReturn.startReps = 1 toReturn.endReps = 1 switch c { case 's': // Whitespace toReturn.nodetype = characterNode toReturn.contents = append(toReturn.contents, whitespaceChars...) case 'S': // Non-whitespace toReturn = newPostfixDotNode() toReturn.except = append([]postfixNode{}, newPostfixNode(whitespaceChars...)) case 'd': // Digits toReturn.nodetype = characterNode toReturn.contents = append(toReturn.contents, digitChars...) case 'D': // Non-digits toReturn = newPostfixDotNode() toReturn.except = append([]postfixNode{}, newPostfixNode(digitChars...)) case 'w': // word character toReturn.nodetype = characterNode toReturn.contents = append(toReturn.contents, wordChars...) 
case 'W': // Non-word character toReturn = newPostfixDotNode() toReturn.except = append([]postfixNode{}, newPostfixNode(wordChars...)) case 'b', 'B': if c == 'b' && inCharClass { toReturn.nodetype = characterNode toReturn.contents = append(toReturn.contents, rune(8)) } else { toReturn.nodetype = assertionNode toReturn.contents = append(toReturn.contents, c) } if c == 'B' && inCharClass { // Invalid return postfixNode{}, fmt.Errorf("word boundaries are not allowed in character class") } case 'n': // Newline character toReturn.nodetype = characterNode toReturn.contents = append(toReturn.contents, '\n') case '0': // NULL character toReturn.nodetype = characterNode toReturn.contents = append(toReturn.contents, rune(0)) case 'a': // Bell character toReturn.nodetype = characterNode toReturn.contents = append(toReturn.contents, rune(7)) case 'f': // Form feed character toReturn.nodetype = characterNode toReturn.contents = append(toReturn.contents, rune(12)) case 't': // Horizontal tab character toReturn.nodetype = characterNode toReturn.contents = append(toReturn.contents, rune(9)) case 'r': // Carriage return toReturn.nodetype = characterNode toReturn.contents = append(toReturn.contents, rune(13)) case 'v': // Vertical tab toReturn.nodetype = characterNode toReturn.contents = append(toReturn.contents, rune(11)) case '-': // Literal hyphen - only in character class if inCharClass { toReturn.nodetype = characterNode toReturn.contents = append(toReturn.contents, '-') } else { return postfixNode{}, fmt.Errorf("invalid escape character") } default: // None of the above - append it as a regular character if isNormalChar(c) { // Normal characters cannot be escaped return postfixNode{}, fmt.Errorf("invalid escape character") } toReturn.nodetype = characterNode toReturn.contents = append(toReturn.contents, c) } return toReturn, nil } // Creates and returns a postfixNode based on the given contents func newPostfixNode(contents ...rune) postfixNode { if len(contents) < 1 { 
panic("Empty node.") } to_return := postfixNode{} to_return.startReps = 1 to_return.endReps = 1 if len(contents) > 1 { // If the node has more than element, it must be a character class - the type must be CHARACTER to_return.nodetype = characterNode to_return.contents = contents } else { // Node has one element, could be anything switch contents[0] { case '+': to_return.nodetype = plusNode case '?': to_return.nodetype = questionNode case '*': to_return.nodetype = kleeneNode case '|': to_return.nodetype = pipeNode case concatRune: to_return.nodetype = concatenateNode case '^', '$': to_return.nodetype = assertionNode case '(': to_return.nodetype = lparenNode case ')': to_return.nodetype = rparenNode default: to_return.nodetype = characterNode } to_return.contents = append(to_return.contents, contents...) // Special cases for LPAREN and RPAREN - they have special characters defined for them if to_return.nodetype == lparenNode { to_return.contents = []rune{lparenRune} } if to_return.nodetype == rparenNode { to_return.contents = []rune{rparenRune} } } return to_return } // Creates and returns a postfixNode representing the 'dot' metacharacter. func newPostfixDotNode() postfixNode { toReturn := postfixNode{} toReturn.startReps = 1 toReturn.endReps = 1 toReturn.nodetype = characterNode toReturn.allChars = true toReturn.contents = []rune{anyCharRune} return toReturn } // Creates a character node, regardless of the contents func newPostfixCharNode(contents ...rune) postfixNode { toReturn := postfixNode{} toReturn.startReps = 1 toReturn.endReps = 1 toReturn.nodetype = characterNode toReturn.contents = append(toReturn.contents, contents...) return toReturn }