Convert an inverting character class into an 'allChars' node, with the characters marked as exceptions

master
Aadhavan Srinivasan 1 month ago
parent 708a9e1303
commit b3ee1fe5e8

@ -47,13 +47,12 @@ func shuntingYard(re string) []postfixNode {
i := 0
for i < len(re_runes) {
re_postfix = append(re_postfix, re_runes[i])
if re_runes[i] == '[' && (i == 0 || re_runes[i-1] != '\\') { // We do not touch things inside brackets, unless they are escaped
re_postfix[len(re_postfix)-1] = LBRACKET // Replace the '[' character with LBRACKET. This allows for easier parsing of all characters (including opening and closing brackets) within the character class
invertMatch := false
if re_runes[i] == '[' && (i == 0 || re_runes[i-1] != '\\') { // We do not touch things inside brackets, unless they are escaped. Inside this block, the only task is to expand character ranges into their constituent characters.
re_postfix[len(re_postfix)-1] = LBRACKET // Replace the '[' character with LBRACKET. This allows for easier parsing of all characters (including opening and closing brackets) within the character class
toAppend := make([]rune, 0) // Holds all the runes in the current character class
if i < len(re_runes)-1 && re_runes[i+1] == '^' { // Inverting class - match everything NOT in brackets
invertMatch = true
i++
re_postfix = append(re_postfix, '^')
i++ // Skip opening bracket and caret
}
if i < len(re_runes)-1 && re_runes[i+1] == ']' { // Nothing inside brackets - panic.
panic("Empty character class.")
@ -81,13 +80,9 @@ func shuntingYard(re string) []postfixNode {
}
// Replace the last character (which should have been ']') with RBRACKET
toAppend[len(toAppend)-1] = RBRACKET
if invertMatch {
toAppend = setDifference(dotChars(), toAppend) // Take the inverse of the set by getting the difference between it and all dot characters
toAppend = append(toAppend, RBRACKET) // Since RBRACKET doesn't exist in dotChars, it wouldn't have been returned by setDifference. We manually append it here.
}
re_postfix = append(re_postfix, toAppend...)
}
if re_runes[i] == '{' && (i > 0 && re_runes[i-1] != '\\') { // We don't touch things inside braces, either
if i < len(re_runes) && re_runes[i] == '{' && (i > 0 && re_runes[i-1] != '\\') { // We don't touch things inside braces, either
i++ // Skip opening brace
for i < len(re_runes) && re_runes[i] != '}' {
re_postfix = append(re_postfix, re_runes[i])
@ -98,7 +93,7 @@ func shuntingYard(re string) []postfixNode {
}
re_postfix = append(re_postfix, re_runes[i]) // Append closing brace
}
if (re_runes[i] != '(' && re_runes[i] != '|' && re_runes[i] != '\\') || (i > 0 && re_runes[i-1] == '\\') { // Every character should be concatenated if it is escaped
if i < len(re_runes) && (re_runes[i] != '(' && re_runes[i] != '|' && re_runes[i] != '\\') || (i > 0 && re_runes[i-1] == '\\') { // Every character should be concatenated if it is escaped
if i < len(re_runes)-1 {
if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != '+' && re_runes[i+1] != '?' && re_runes[i+1] != ')' && re_runes[i+1] != '{' {
re_postfix = append(re_postfix, CONCAT)
@ -172,7 +167,12 @@ func shuntingYard(re string) []postfixNode {
}
}
if c == LBRACKET { // Used for character classes
i++ // Step forward so we can look at the character class
i++ // Step forward so we can look at the character class
var invertMatch bool
if re_postfix[i] == '^' {
invertMatch = true
i++
}
chars := make([]rune, 0) // List of characters - used only for character classes
for i < len(re_postfix) {
if re_postfix[i] == RBRACKET {
@ -184,8 +184,14 @@ func shuntingYard(re string) []postfixNode {
if i == len(re_postfix) { // We have reached the end of the string, so we didn't encounter a closing bracket. Panic.
panic("ERROR: Opening bracket without closing bracket.")
}
outQueue = append(outQueue, newPostfixNode(chars...))
// i++ // Step forward to skip closing bracket
if !invertMatch {
outQueue = append(outQueue, newPostfixCharNode(chars...))
} else {
// Invert match - create an allChars postfixNode, then add the given states to its 'except' list.
toAdd := newPostfixDotNode()
toAdd.except = chars
outQueue = append(outQueue, toAdd)
}
continue
}
if c == '{' {
@ -282,8 +288,11 @@ func thompson(re []postfixNode) *State {
if c.nodetype == CHARACTER || c.nodetype == ASSERTION {
state := State{}
state.transitions = make(map[int][]*State)
if c.isDot {
state.isDot = true
if c.allChars {
state.allChars = true
if len(c.except) != 0 {
state.except = append([]rune{}, c.except...)
}
}
state.content = rune2Contents(c.contents)
state.output = make([]*State, 0)

Loading…
Cancel
Save