Renamed package 'greg' to 'regex'

2025-01-30 09:15:29 -05:00
parent ca8d32cd7f
commit aef8152fc1
13 changed files with 9 additions and 9 deletions
--- a/regex/postfixNode.go
+++ b/regex/postfixNode.go
@@ -0,0 +1,200 @@
+package regex
+
+import "fmt"
+
+type NodeType int // NodeType identifies what kind of token a postfixNode represents (stored in postfixNode.nodetype).
+
+// escapedChars lists the characters that take on a special meaning when they follow a backslash.
+// Eg. \b is word boundary, \w is word character etc. Each of them has its own case in newEscapedNode.
+var escapedChars []rune = []rune("wWdDbBnaftrvsS0")
+
+// The possible node types. The values come from iota, so the order of this list is significant.
+const (
+	CHARACTER NodeType = iota // One or more runes to match; also the fallback type for any rune without special meaning
+	CHARCLASS                 // A character class such as [abc], built by newCharClassNode
+	PIPE                      // The '|' operator
+	CONCATENATE               // The CONCAT marker rune
+	KLEENE                    // The '*' operator
+	QUESTION                  // The '?' operator
+	PLUS                      // The '+' operator
+	ASSERTION                 // '^', '$', and the \b and \B escapes; lookarounds also use this type (see postfixNode.lookaroundSign)
+	LPAREN                    // '(' - its contents are replaced by LPAREN_CHAR in newPostfixNode
+	RPAREN                    // ')' - its contents are replaced by RPAREN_CHAR in newPostfixNode
+)
+
+// Helper constants for lookarounds (untyped integer constants)
+const POSITIVE = 1    // Positive lookaround - presumably a value for postfixNode.lookaroundSign; confirm against the parser
+const NEGATIVE = -1   // Negative lookaround - presumably a value for postfixNode.lookaroundSign; confirm against the parser
+const LOOKAHEAD = 1   // Lookahead - presumably a value for postfixNode.lookaroundDir; confirm against the parser
+const LOOKBEHIND = -1 // Lookbehind - presumably a value for postfixNode.lookaroundDir; confirm against the parser
+
+var INFINITE_REPS int = -1 // Sentinel for an unbounded repetition count, eg. the end range in {5,}
+// postfixNode is a single node in the postfix representation of the expression.
+type postfixNode struct {
+	nodetype       NodeType      // Kind of node - one of the NodeType constants
+	contents       []rune        // Runes held by the node (several runes for sets such as \d; marker runes for the dot and for parentheses)
+	startReps      int           // Minimum number of times the node should be repeated - used with numeric specifiers. The constructors in this file set it to 1.
+	endReps        int           // Maximum number of times the node should be repeated - used with numeric specifiers. The constructors in this file set it to 1.
+	allChars       bool          // Whether or not the current node represents all characters (eg. dot metacharacter)
+	except         []postfixNode // For inverted character classes (and \S, \D, \W), we match every unicode character _except_ a few. In this case, allChars is true and the exceptions are placed here.
+	lookaroundSign int           // ONLY USED WHEN nodetype == ASSERTION. Whether we have a positive or negative lookaround.
+	lookaroundDir  int           // Direction of the lookaround: lookbehind or lookahead
+	nodeContents   []postfixNode // ONLY USED WHEN nodetype == CHARCLASS. Holds all the nodes inside the given CHARCLASS node.
+}
+
+// newCharClassNode folds the given nodes into a single node, e.g. turning
+// 'a', 'b' and 'c' into '[abc]'.
+//
+// A non-negated class yields a CHARCLASS node whose nodeContents are the given
+// nodes. A negated class instead yields a CHARACTER node that is set up like the
+// dot metacharacter (allChars set, contents holding ANY_CHAR), with the given
+// nodes recorded in except as the things it must not match.
+// In both cases the nodes slice is stored as-is, not copied, and the node is
+// repeated exactly once (startReps == endReps == 1).
+func newCharClassNode(nodes []postfixNode, negated bool) postfixNode {
+	if !negated {
+		return postfixNode{
+			nodetype:     CHARCLASS,
+			startReps:    1,
+			endReps:      1,
+			nodeContents: nodes,
+		}
+	}
+	// Negated class: "everything except nodes".
+	return postfixNode{
+		nodetype:  CHARACTER,
+		contents:  []rune{ANY_CHAR},
+		startReps: 1,
+		endReps:   1,
+		allChars:  true,
+		except:    nodes,
+	}
+}
+
+// newEscapedNode builds the node for an escape sequence. c is the rune that
+// followed the backslash; inCharClass reports whether the escape appeared
+// inside a character class.
+//
+// Results by escape:
+//   - \s, \d, \w: CHARACTER node holding a copy of the matching rune table.
+//   - \S, \D, \W: dot-style node whose except list holds the matching rune table.
+//   - \b: backspace (CHARACTER) inside a character class, otherwise an ASSERTION.
+//   - \B: always an ASSERTION holding 'B'.
+//   - \n, \0, \a, \f, \t, \r, \v: CHARACTER node holding the control character.
+//   - \-: a literal hyphen inside a character class, an error outside of one.
+//   - anything else: an error if isNormalChar(c) is true, otherwise a CHARACTER
+//     node holding c itself.
+//
+// On error the returned node is the zero postfixNode.
+func newEscapedNode(c rune, inCharClass bool) (postfixNode, error) {
+	// literal builds a CHARACTER node, repeated exactly once, holding a copy of runes.
+	literal := func(runes ...rune) postfixNode {
+		return postfixNode{
+			nodetype:  CHARACTER,
+			contents:  append([]rune(nil), runes...),
+			startReps: 1,
+			endReps:   1,
+		}
+	}
+	// allBut builds a dot node that carries runes as its single exception node.
+	allBut := func(runes ...rune) postfixNode {
+		node := newPostfixDotNode()
+		node.except = []postfixNode{newPostfixNode(runes...)}
+		return node
+	}
+
+	switch c {
+	case 's': // Whitespace
+		return literal(whitespaceChars...), nil
+	case 'S': // Non-whitespace
+		return allBut(whitespaceChars...), nil
+	case 'd': // Digits
+		return literal(digitChars...), nil
+	case 'D': // Non-digits
+		return allBut(digitChars...), nil
+	case 'w': // Word character
+		return literal(wordChars...), nil
+	case 'W': // Non-word character
+		return allBut(wordChars...), nil
+	case 'b', 'B':
+		// Only a lowercase \b inside a character class is a backspace.
+		if c == 'b' && inCharClass {
+			return literal('\b'), nil
+		}
+		assertion := literal(c)
+		assertion.nodetype = ASSERTION
+		return assertion, nil
+	case 'n': // Newline character
+		return literal('\n'), nil
+	case '0': // NULL character
+		return literal('\x00'), nil
+	case 'a': // Bell character
+		return literal('\a'), nil
+	case 'f': // Form feed character
+		return literal('\f'), nil
+	case 't': // Horizontal tab character
+		return literal('\t'), nil
+	case 'r': // Carriage return
+		return literal('\r'), nil
+	case 'v': // Vertical tab
+		return literal('\v'), nil
+	case '-': // Literal hyphen - only in character class
+		if !inCharClass {
+			return postfixNode{}, fmt.Errorf("invalid escape character")
+		}
+		return literal('-'), nil
+	default: // None of the above - treat it as a regular character
+		if isNormalChar(c) { // Normal characters cannot be escaped
+			return postfixNode{}, fmt.Errorf("invalid escape character")
+		}
+		return literal(c), nil
+	}
+}
+
+// newPostfixNode creates and returns a postfixNode based on the given contents.
+//
+// With more than one rune, the result is a CHARACTER node holding all of them
+// (this is how a set such as the digits is represented). With exactly one rune,
+// the node type is derived from that rune:
+//
+//	'+' -> PLUS, '?' -> QUESTION, '*' -> KLEENE, '|' -> PIPE,
+//	CONCAT -> CONCATENATE, '^' and '$' -> ASSERTION,
+//	'(' -> LPAREN, ')' -> RPAREN, anything else -> CHARACTER.
+//
+// The returned node is repeated exactly once (startReps == endReps == 1).
+// It panics if no runes are given.
+func newPostfixNode(contents ...rune) postfixNode {
+	if len(contents) < 1 {
+		panic("Empty node.")
+	}
+	node := postfixNode{}
+	node.startReps = 1
+	node.endReps = 1
+	// Always keep a private copy. A call such as newPostfixNode(table...) passes
+	// the caller's slice through unchanged, so storing it directly would make the
+	// node share (and potentially corrupt) the caller's backing array.
+	node.contents = append([]rune(nil), contents...)
+
+	if len(contents) > 1 { // More than one element: it must be a character set - the type must be CHARACTER
+		node.nodetype = CHARACTER
+		return node
+	}
+
+	// Exactly one element, which could be anything.
+	switch contents[0] {
+	case '+':
+		node.nodetype = PLUS
+	case '?':
+		node.nodetype = QUESTION
+	case '*':
+		node.nodetype = KLEENE
+	case '|':
+		node.nodetype = PIPE
+	case CONCAT:
+		node.nodetype = CONCATENATE
+	case '^', '$':
+		node.nodetype = ASSERTION
+	case '(':
+		// Parentheses are stored using their dedicated marker runes, not '(' / ')'.
+		node.nodetype = LPAREN
+		node.contents = []rune{LPAREN_CHAR}
+	case ')':
+		node.nodetype = RPAREN
+		node.contents = []rune{RPAREN_CHAR}
+	default:
+		node.nodetype = CHARACTER
+	}
+	return node
+}
+
+// newPostfixDotNode returns the node for the 'dot' metacharacter: a CHARACTER
+// node with allChars set, ANY_CHAR as its only content, repeated exactly once.
+func newPostfixDotNode() postfixNode {
+	return postfixNode{
+		nodetype:  CHARACTER,
+		contents:  []rune{ANY_CHAR},
+		startReps: 1,
+		endReps:   1,
+		allChars:  true,
+	}
+}
+
+// newPostfixCharNode returns a CHARACTER node holding a copy of the given runes,
+// repeated exactly once. Unlike newPostfixNode, the runes are never inspected,
+// so characters such as '+' or '(' stay plain characters.
+func newPostfixCharNode(contents ...rune) postfixNode {
+	return postfixNode{
+		nodetype:  CHARACTER,
+		contents:  append([]rune(nil), contents...),
+		startReps: 1,
+		endReps:   1,
+	}
+}