Convert an inverting character class into an 'allChars' node, with the characters marked as exceptions
This commit is contained in:
41
main.go
41
main.go
@@ -47,13 +47,12 @@ func shuntingYard(re string) []postfixNode {
|
||||
i := 0
|
||||
for i < len(re_runes) {
|
||||
re_postfix = append(re_postfix, re_runes[i])
|
||||
if re_runes[i] == '[' && (i == 0 || re_runes[i-1] != '\\') { // We do not touch things inside brackets, unless they are escaped
|
||||
re_postfix[len(re_postfix)-1] = LBRACKET // Replace the '[' character with LBRACKET. This allows for easier parsing of all characters (including opening and closing brackets) within the character class
|
||||
invertMatch := false
|
||||
if re_runes[i] == '[' && (i == 0 || re_runes[i-1] != '\\') { // We do not touch things inside brackets, unless they are escaped. Inside this block, the only task is to expand character ranges into their constituent characters.
|
||||
re_postfix[len(re_postfix)-1] = LBRACKET // Replace the '[' character with LBRACKET. This allows for easier parsing of all characters (including opening and closing brackets) within the character class
|
||||
toAppend := make([]rune, 0) // Holds all the runes in the current character class
|
||||
if i < len(re_runes)-1 && re_runes[i+1] == '^' { // Inverting class - match everything NOT in brackets
|
||||
invertMatch = true
|
||||
i++
|
||||
re_postfix = append(re_postfix, '^')
|
||||
i++ // Skip opening bracket and caret
|
||||
}
|
||||
if i < len(re_runes)-1 && re_runes[i+1] == ']' { // Nothing inside brackets - panic.
|
||||
panic("Empty character class.")
|
||||
@@ -81,13 +80,9 @@ func shuntingYard(re string) []postfixNode {
|
||||
}
|
||||
// Replace the last character (which should have been ']') with RBRACKET
|
||||
toAppend[len(toAppend)-1] = RBRACKET
|
||||
if invertMatch {
|
||||
toAppend = setDifference(dotChars(), toAppend) // Take the inverse of the set by getting the difference between it and all dot characters
|
||||
toAppend = append(toAppend, RBRACKET) // Since RBRACKET doesn't exist in dotChars, it wouldn't have been returned by setDifference. We manually append it here.
|
||||
}
|
||||
re_postfix = append(re_postfix, toAppend...)
|
||||
}
|
||||
if re_runes[i] == '{' && (i > 0 && re_runes[i-1] != '\\') { // We don't touch things inside braces, either
|
||||
if i < len(re_runes) && re_runes[i] == '{' && (i > 0 && re_runes[i-1] != '\\') { // We don't touch things inside braces, either
|
||||
i++ // Skip opening brace
|
||||
for i < len(re_runes) && re_runes[i] != '}' {
|
||||
re_postfix = append(re_postfix, re_runes[i])
|
||||
@@ -98,7 +93,7 @@ func shuntingYard(re string) []postfixNode {
|
||||
}
|
||||
re_postfix = append(re_postfix, re_runes[i]) // Append closing brace
|
||||
}
|
||||
if (re_runes[i] != '(' && re_runes[i] != '|' && re_runes[i] != '\\') || (i > 0 && re_runes[i-1] == '\\') { // Every character should be concatenated if it is escaped
|
||||
if i < len(re_runes) && (re_runes[i] != '(' && re_runes[i] != '|' && re_runes[i] != '\\') || (i > 0 && re_runes[i-1] == '\\') { // Every character should be concatenated if it is escaped
|
||||
if i < len(re_runes)-1 {
|
||||
if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != '+' && re_runes[i+1] != '?' && re_runes[i+1] != ')' && re_runes[i+1] != '{' {
|
||||
re_postfix = append(re_postfix, CONCAT)
|
||||
@@ -172,7 +167,12 @@ func shuntingYard(re string) []postfixNode {
|
||||
}
|
||||
}
|
||||
if c == LBRACKET { // Used for character classes
|
||||
i++ // Step forward so we can look at the character class
|
||||
i++ // Step forward so we can look at the character class
|
||||
var invertMatch bool
|
||||
if re_postfix[i] == '^' {
|
||||
invertMatch = true
|
||||
i++
|
||||
}
|
||||
chars := make([]rune, 0) // List of characters - used only for character classes
|
||||
for i < len(re_postfix) {
|
||||
if re_postfix[i] == RBRACKET {
|
||||
@@ -184,8 +184,14 @@ func shuntingYard(re string) []postfixNode {
|
||||
if i == len(re_postfix) { // We have reached the end of the string, so we didn't encounter a closing bracket. Panic.
|
||||
panic("ERROR: Opening bracket without closing bracket.")
|
||||
}
|
||||
outQueue = append(outQueue, newPostfixNode(chars...))
|
||||
// i++ // Step forward to skip closing bracket
|
||||
if !invertMatch {
|
||||
outQueue = append(outQueue, newPostfixCharNode(chars...))
|
||||
} else {
|
||||
// Invert match - create an allChars postfixNode, then add the given states to its 'except' list.
|
||||
toAdd := newPostfixDotNode()
|
||||
toAdd.except = chars
|
||||
outQueue = append(outQueue, toAdd)
|
||||
}
|
||||
continue
|
||||
}
|
||||
if c == '{' {
|
||||
@@ -282,8 +288,11 @@ func thompson(re []postfixNode) *State {
|
||||
if c.nodetype == CHARACTER || c.nodetype == ASSERTION {
|
||||
state := State{}
|
||||
state.transitions = make(map[int][]*State)
|
||||
if c.isDot {
|
||||
state.isDot = true
|
||||
if c.allChars {
|
||||
state.allChars = true
|
||||
if len(c.except) != 0 {
|
||||
state.except = append([]rune{}, c.except...)
|
||||
}
|
||||
}
|
||||
state.content = rune2Contents(c.contents)
|
||||
state.output = make([]*State, 0)
|
||||
|
Reference in New Issue
Block a user