package main
// NodeType identifies the kind of a node in the postfix representation of an
// expression.
type NodeType int

// The possible node types. Values are assigned consecutively starting at zero,
// so CHARACTER is the zero value of NodeType.
const (
	// CHARACTER is a node holding literal runes. It is also the type used for
	// the dot metacharacter and for negated character classes.
	CHARACTER NodeType = iota
	// CHARCLASS is a (non-negated) character class such as [abc].
	CHARCLASS
	// PIPE is the alternation operator '|'.
	PIPE
	// CONCATENATE is the concatenation operator.
	CONCATENATE
	// KLEENE is the '*' operator.
	KLEENE
	// QUESTION is the '?' operator.
	QUESTION
	// PLUS is the '+' operator.
	PLUS
	// ASSERTION covers '^', '$', \b, \B and lookarounds.
	ASSERTION
	// LPAREN is an opening parenthesis.
	LPAREN
	// RPAREN is a closing parenthesis.
	RPAREN
)

// escapedChars lists every character that takes on a special meaning when it
// follows a backslash, e.g. \b is a word boundary and \w is a word character.
var escapedChars = []rune("wWdDbBnaftrvsS0")
// Helper constants for lookarounds: POSITIVE and NEGATIVE name the sign of a
// lookaround, LOOKAHEAD and LOOKBEHIND its direction.
const (
	POSITIVE   = 1
	NEGATIVE   = -1
	LOOKAHEAD  = 1
	LOOKBEHIND = -1
)

// INFINITE_REPS stands for an unbounded number of repetitions, e.g. the
// missing upper bound in {5,}.
var INFINITE_REPS = -1
// postfixNode is a single node in the postfix representation of an expression.
type postfixNode struct {
	// nodetype says what kind of node this is.
	nodetype NodeType
	// contents holds the runes of the node.
	contents []rune
	// startReps is the minimum number of times the node should be repeated;
	// used with numeric specifiers.
	startReps int
	// endReps is the maximum number of times the node should be repeated;
	// used with numeric specifiers.
	endReps int
	// allChars reports whether the node stands for all characters (e.g. the
	// dot metacharacter).
	allChars bool
	// except lists what an all-characters node must NOT match. Inverted
	// character classes match every unicode character apart from a few: they
	// set allChars and put the exceptions here.
	except []postfixNode
	// lookaroundSign tells a positive lookaround from a negative one.
	// Only used when nodetype == ASSERTION.
	lookaroundSign int
	// lookaroundDir tells a lookahead from a lookbehind.
	lookaroundDir int
	// nodeContents holds all the nodes inside a character class.
	// Only used when nodetype == CHARCLASS.
	nodeContents []postfixNode
}
// newCharClassNode folds the given postfixNodes into a single node, e.g.
// turning 'a', 'b' and 'c' into '[abc]'.
//
// For a regular class the result is a CHARCLASS node whose nodeContents is the
// given slice. For a negated class the result is a CHARACTER node instead: it
// behaves like the dot metacharacter (allChars is set, contents is ANY_CHAR)
// and the given slice becomes its except list, i.e. what it will not match.
//
// Either way the node is set to repeat exactly once, and the slice is stored
// as passed in, not copied.
func newCharClassNode(nodes []postfixNode, negated bool) postfixNode {
	if !negated {
		return postfixNode{
			nodetype:     CHARCLASS,
			startReps:    1,
			endReps:      1,
			nodeContents: nodes,
		}
	}
	return postfixNode{
		nodetype:  CHARACTER,
		contents:  []rune{ANY_CHAR},
		startReps: 1,
		endReps:   1,
		allChars:  true,
		except:    nodes,
	}
}
// newEscapedNode builds the node for an escape sequence. c is the character
// that is assumed to have followed a backslash.
//
// \s, \d and \w give a CHARACTER node holding a copy of the whitespace, digit
// or word table. Their upper-case forms give a dot node whose except list
// holds one node built from the same table. \b and \B give an ASSERTION node
// holding c itself. \n, \0, \a, \f, \t, \r and \v give a CHARACTER node
// holding the matching control character. Any other character is kept as a
// regular CHARACTER node.
func newEscapedNode(c rune) postfixNode {
	// literal wraps a copy of the runes in a CHARACTER node that repeats once.
	literal := func(runes ...rune) postfixNode {
		return postfixNode{
			nodetype:  CHARACTER,
			contents:  append([]rune(nil), runes...),
			startReps: 1,
			endReps:   1,
		}
	}
	// anythingBut builds a dot node that excludes the given runes.
	anythingBut := func(runes []rune) postfixNode {
		dot := newPostfixDotNode()
		dot.except = []postfixNode{newPostfixNode(runes...)}
		return dot
	}

	switch c {
	case 's': // Whitespace
		return literal(whitespaceChars...)
	case 'S': // Non-whitespace
		return anythingBut(whitespaceChars)
	case 'd': // Digits
		return literal(digitChars...)
	case 'D': // Non-digits
		return anythingBut(digitChars)
	case 'w': // Word character
		return literal(wordChars...)
	case 'W': // Non-word character
		return anythingBut(wordChars)
	case 'b', 'B': // Assertions: the node keeps the escape letter itself
		boundary := literal(c)
		boundary.nodetype = ASSERTION
		return boundary
	case 'n': // Newline
		return literal('\n')
	case '0': // NULL character
		return literal(0)
	case 'a': // Bell
		return literal('\a')
	case 'f': // Form feed
		return literal('\f')
	case 't': // Horizontal tab
		return literal('\t')
	case 'r': // Carriage return
		return literal('\r')
	case 'v': // Vertical tab
		return literal('\v')
	}
	// None of the above - keep it as a regular character.
	return literal(c)
}
// newPostfixNode creates a postfixNode from the given runes.
//
// With more than one rune the node is always of type CHARACTER. With exactly
// one rune the type is derived from that rune: '+', '?', '*' and '|' become
// PLUS, QUESTION, KLEENE and PIPE, CONCAT becomes CONCATENATE, '^' and '$'
// become ASSERTION, '(' and ')' become LPAREN and RPAREN, and anything else is
// a CHARACTER. Parenthesis nodes store LPAREN_CHAR / RPAREN_CHAR as their
// contents instead of the literal '(' / ')'.
//
// The returned node always has startReps == endReps == 1, and its contents
// never share storage with the contents argument.
//
// It panics if no runes are given.
func newPostfixNode(contents ...rune) postfixNode {
	if len(contents) < 1 {
		panic("Empty node.")
	}
	node := postfixNode{startReps: 1, endReps: 1}

	// Always copy: a caller passing `table...` hands over the table's own
	// slice (no new slice is made for the call), so storing it directly would
	// make the node alias shared lookup tables such as whitespaceChars.
	node.contents = append([]rune(nil), contents...)

	if len(contents) > 1 {
		// More than one element: it must be a set of characters.
		node.nodetype = CHARACTER
		return node
	}

	// Exactly one element: it could be anything.
	switch contents[0] {
	case '+':
		node.nodetype = PLUS
	case '?':
		node.nodetype = QUESTION
	case '*':
		node.nodetype = KLEENE
	case '|':
		node.nodetype = PIPE
	case CONCAT:
		node.nodetype = CONCATENATE
	case '^', '$':
		node.nodetype = ASSERTION
	case '(':
		// Parentheses have special characters defined for them.
		node.nodetype = LPAREN
		node.contents = []rune{LPAREN_CHAR}
	case ')':
		node.nodetype = RPAREN
		node.contents = []rune{RPAREN_CHAR}
	default:
		node.nodetype = CHARACTER
	}
	return node
}
// newPostfixDotNode returns the node for the 'dot' metacharacter: a CHARACTER
// node with allChars set, ANY_CHAR as its only content, and a repetition count
// of exactly one.
func newPostfixDotNode() postfixNode {
	return postfixNode{
		nodetype:  CHARACTER,
		contents:  []rune{ANY_CHAR},
		startReps: 1,
		endReps:   1,
		allChars:  true,
	}
}
// newPostfixCharNode returns a CHARACTER node holding a copy of the given
// runes, whatever they are: unlike newPostfixNode, the runes are never
// inspected to pick the node type. The node repeats exactly once.
func newPostfixCharNode(contents ...rune) postfixNode {
	return postfixNode{
		nodetype:  CHARACTER,
		contents:  append([]rune(nil), contents...),
		startReps: 1,
		endReps:   1,
	}
}