Renamed package 'greg' to 'regex'
This commit is contained in:
200
regex/postfixNode.go
Normal file
200
regex/postfixNode.go
Normal file
@@ -0,0 +1,200 @@
|
||||
package regex
|
||||
|
||||
import "fmt"
|
||||
|
||||
// NodeType identifies the kind of a postfixNode (character, operator,
// assertion, parenthesis, ...). It is the type of the node-type constants
// declared in this file and of the postfixNode.nodetype field.
type NodeType int
|
||||
|
||||
// escapedChars lists every character that takes on a special meaning when it
// is preceded by a backslash - for example \b is a word boundary and \w is a
// word character.
var escapedChars = []rune("wWdDbBnaftrvsS0")
|
||||
|
||||
// This is a list of the possible node types
|
||||
const (
|
||||
CHARACTER NodeType = iota
|
||||
CHARCLASS
|
||||
PIPE
|
||||
CONCATENATE
|
||||
KLEENE
|
||||
QUESTION
|
||||
PLUS
|
||||
ASSERTION
|
||||
LPAREN
|
||||
RPAREN
|
||||
)
|
||||
|
||||
// Helper constants for lookarounds: POSITIVE/NEGATIVE give the sign of a
// lookaround, LOOKAHEAD/LOOKBEHIND give its direction.
const (
	POSITIVE   = 1
	NEGATIVE   = -1
	LOOKAHEAD  = 1
	LOOKBEHIND = -1
)

// INFINITE_REPS stands for an unbounded repetition count, e.g. the missing
// upper bound in {5,}.
var INFINITE_REPS = -1
|
||||
// This represents a node in the postfix representation of the expression
|
||||
type postfixNode struct {
|
||||
nodetype NodeType
|
||||
contents []rune // Contents of the node
|
||||
startReps int // Minimum number of times the node should be repeated - used with numeric specifiers
|
||||
endReps int // Maximum number of times the node should be repeated - used with numeric specifiers
|
||||
allChars bool // Whether or not the current node represents all characters (eg. dot metacharacter)
|
||||
except []postfixNode // For inverted character classes, we match every unicode character _except_ a few. In this case, allChars is true and the exceptions are placed here.
|
||||
lookaroundSign int // ONLY USED WHEN nodetype == ASSERTION. Whether we have a positive or negative lookaround.
|
||||
lookaroundDir int // Lookbehind or lookahead
|
||||
nodeContents []postfixNode // ONLY USED WHEN nodetype == CHARCLASS. Holds all the nodes inside the given CHARCLASS node.
|
||||
}
|
||||
|
||||
// Converts the given list of postfixNodes to one node of type CHARCLASS.
|
||||
// Used to convert eg. 'a', 'b' and 'c' to '[abc]'.
|
||||
// If the character class is negated, it returns a postfixNode of type CHARACTER.
|
||||
// This node will behave like the dot metacharacter, but it has a longer list of runes that
|
||||
// it will not match.
|
||||
func newCharClassNode(nodes []postfixNode, negated bool) postfixNode {
|
||||
rtv := postfixNode{}
|
||||
rtv.nodetype = CHARCLASS
|
||||
rtv.startReps = 1
|
||||
rtv.endReps = 1
|
||||
if negated {
|
||||
rtv.nodetype = CHARACTER
|
||||
rtv.contents = []rune{ANY_CHAR}
|
||||
rtv.allChars = true
|
||||
rtv.except = nodes
|
||||
} else {
|
||||
rtv.nodeContents = nodes
|
||||
}
|
||||
return rtv
|
||||
}
|
||||
|
||||
// Creates a new escaped node - the given character is assumed to have been preceded by a backslash
|
||||
func newEscapedNode(c rune, inCharClass bool) (postfixNode, error) {
|
||||
toReturn := postfixNode{}
|
||||
toReturn.startReps = 1
|
||||
toReturn.endReps = 1
|
||||
switch c {
|
||||
case 's': // Whitespace
|
||||
toReturn.nodetype = CHARACTER
|
||||
toReturn.contents = append(toReturn.contents, whitespaceChars...)
|
||||
case 'S': // Non-whitespace
|
||||
toReturn = newPostfixDotNode()
|
||||
toReturn.except = append([]postfixNode{}, newPostfixNode(whitespaceChars...))
|
||||
case 'd': // Digits
|
||||
toReturn.nodetype = CHARACTER
|
||||
toReturn.contents = append(toReturn.contents, digitChars...)
|
||||
case 'D': // Non-digits
|
||||
toReturn = newPostfixDotNode()
|
||||
toReturn.except = append([]postfixNode{}, newPostfixNode(digitChars...))
|
||||
case 'w': // word character
|
||||
toReturn.nodetype = CHARACTER
|
||||
toReturn.contents = append(toReturn.contents, wordChars...)
|
||||
case 'W': // Non-word character
|
||||
toReturn = newPostfixDotNode()
|
||||
toReturn.except = append([]postfixNode{}, newPostfixNode(wordChars...))
|
||||
case 'b', 'B':
|
||||
if c == 'b' && inCharClass {
|
||||
toReturn.nodetype = CHARACTER
|
||||
toReturn.contents = append(toReturn.contents, rune(8))
|
||||
} else {
|
||||
toReturn.nodetype = ASSERTION
|
||||
toReturn.contents = append(toReturn.contents, c)
|
||||
}
|
||||
case 'n': // Newline character
|
||||
toReturn.nodetype = CHARACTER
|
||||
toReturn.contents = append(toReturn.contents, '\n')
|
||||
case '0': // NULL character
|
||||
toReturn.nodetype = CHARACTER
|
||||
toReturn.contents = append(toReturn.contents, rune(0))
|
||||
case 'a': // Bell character
|
||||
toReturn.nodetype = CHARACTER
|
||||
toReturn.contents = append(toReturn.contents, rune(7))
|
||||
case 'f': // Form feed character
|
||||
toReturn.nodetype = CHARACTER
|
||||
toReturn.contents = append(toReturn.contents, rune(12))
|
||||
case 't': // Horizontal tab character
|
||||
toReturn.nodetype = CHARACTER
|
||||
toReturn.contents = append(toReturn.contents, rune(9))
|
||||
case 'r': // Carriage return
|
||||
toReturn.nodetype = CHARACTER
|
||||
toReturn.contents = append(toReturn.contents, rune(13))
|
||||
case 'v': // Vertical tab
|
||||
toReturn.nodetype = CHARACTER
|
||||
toReturn.contents = append(toReturn.contents, rune(11))
|
||||
case '-': // Literal hyphen - only in character class
|
||||
if inCharClass {
|
||||
toReturn.nodetype = CHARACTER
|
||||
toReturn.contents = append(toReturn.contents, '-')
|
||||
} else {
|
||||
return postfixNode{}, fmt.Errorf("invalid escape character")
|
||||
}
|
||||
default: // None of the above - append it as a regular character
|
||||
if isNormalChar(c) { // Normal characters cannot be escaped
|
||||
return postfixNode{}, fmt.Errorf("invalid escape character")
|
||||
}
|
||||
toReturn.nodetype = CHARACTER
|
||||
toReturn.contents = append(toReturn.contents, c)
|
||||
}
|
||||
return toReturn, nil
|
||||
}
|
||||
|
||||
// Creates and returns a postfixNode based on the given contents
|
||||
func newPostfixNode(contents ...rune) postfixNode {
|
||||
if len(contents) < 1 {
|
||||
panic("Empty node.")
|
||||
}
|
||||
to_return := postfixNode{}
|
||||
to_return.startReps = 1
|
||||
to_return.endReps = 1
|
||||
if len(contents) > 1 { // If the node has more than element, it must be a character class - the type must be CHARACTER
|
||||
to_return.nodetype = CHARACTER
|
||||
to_return.contents = contents
|
||||
} else { // Node has one element, could be anything
|
||||
switch contents[0] {
|
||||
case '+':
|
||||
to_return.nodetype = PLUS
|
||||
case '?':
|
||||
to_return.nodetype = QUESTION
|
||||
case '*':
|
||||
to_return.nodetype = KLEENE
|
||||
case '|':
|
||||
to_return.nodetype = PIPE
|
||||
case CONCAT:
|
||||
to_return.nodetype = CONCATENATE
|
||||
case '^', '$':
|
||||
to_return.nodetype = ASSERTION
|
||||
case '(':
|
||||
to_return.nodetype = LPAREN
|
||||
case ')':
|
||||
to_return.nodetype = RPAREN
|
||||
default:
|
||||
to_return.nodetype = CHARACTER
|
||||
}
|
||||
to_return.contents = append(to_return.contents, contents...)
|
||||
|
||||
// Special cases for LPAREN and RPAREN - they have special characters defined for them
|
||||
if to_return.nodetype == LPAREN {
|
||||
to_return.contents = []rune{LPAREN_CHAR}
|
||||
}
|
||||
if to_return.nodetype == RPAREN {
|
||||
to_return.contents = []rune{RPAREN_CHAR}
|
||||
}
|
||||
}
|
||||
return to_return
|
||||
}
|
||||
|
||||
// Creates and returns a postfixNode representing the 'dot' metacharacter.
|
||||
func newPostfixDotNode() postfixNode {
|
||||
toReturn := postfixNode{}
|
||||
toReturn.startReps = 1
|
||||
toReturn.endReps = 1
|
||||
toReturn.nodetype = CHARACTER
|
||||
toReturn.allChars = true
|
||||
toReturn.contents = []rune{ANY_CHAR}
|
||||
return toReturn
|
||||
}
|
||||
|
||||
// Creates a character node, regardless of the contents
|
||||
func newPostfixCharNode(contents ...rune) postfixNode {
|
||||
toReturn := postfixNode{}
|
||||
toReturn.startReps = 1
|
||||
toReturn.endReps = 1
|
||||
toReturn.nodetype = CHARACTER
|
||||
toReturn.contents = append(toReturn.contents, contents...)
|
||||
return toReturn
|
||||
}
|
Reference in New Issue
Block a user