Convert an inverting character class into an 'allChars' node, with the characters marked as exceptions

master
Aadhavan Srinivasan 1 month ago
parent 708a9e1303
commit b3ee1fe5e8

@ -47,13 +47,12 @@ func shuntingYard(re string) []postfixNode {
i := 0 i := 0
for i < len(re_runes) { for i < len(re_runes) {
re_postfix = append(re_postfix, re_runes[i]) re_postfix = append(re_postfix, re_runes[i])
if re_runes[i] == '[' && (i == 0 || re_runes[i-1] != '\\') { // We do not touch things inside brackets, unless they are escaped if re_runes[i] == '[' && (i == 0 || re_runes[i-1] != '\\') { // We do not touch things inside brackets, unless they are escaped. Inside this block, the only task is to expand character ranges into their constituent characters.
re_postfix[len(re_postfix)-1] = LBRACKET // Replace the '[' character with LBRACKET. This allows for easier parsing of all characters (including opening and closing brackets) within the character class re_postfix[len(re_postfix)-1] = LBRACKET // Replace the '[' character with LBRACKET. This allows for easier parsing of all characters (including opening and closing brackets) within the character class
invertMatch := false
toAppend := make([]rune, 0) // Holds all the runes in the current character class toAppend := make([]rune, 0) // Holds all the runes in the current character class
if i < len(re_runes)-1 && re_runes[i+1] == '^' { // Inverting class - match everything NOT in brackets if i < len(re_runes)-1 && re_runes[i+1] == '^' { // Inverting class - match everything NOT in brackets
invertMatch = true re_postfix = append(re_postfix, '^')
i++ i++ // Skip opening bracket and caret
} }
if i < len(re_runes)-1 && re_runes[i+1] == ']' { // Nothing inside brackets - panic. if i < len(re_runes)-1 && re_runes[i+1] == ']' { // Nothing inside brackets - panic.
panic("Empty character class.") panic("Empty character class.")
@ -81,13 +80,9 @@ func shuntingYard(re string) []postfixNode {
} }
// Replace the last character (which should have been ']') with RBRACKET // Replace the last character (which should have been ']') with RBRACKET
toAppend[len(toAppend)-1] = RBRACKET toAppend[len(toAppend)-1] = RBRACKET
if invertMatch {
toAppend = setDifference(dotChars(), toAppend) // Take the inverse of the set by getting the difference between it and all dot characters
toAppend = append(toAppend, RBRACKET) // Since RBRACKET doesn't exist in dotChars, it wouldn't have been returned by setDifference. We manually append it here.
}
re_postfix = append(re_postfix, toAppend...) re_postfix = append(re_postfix, toAppend...)
} }
if re_runes[i] == '{' && (i > 0 && re_runes[i-1] != '\\') { // We don't touch things inside braces, either if i < len(re_runes) && re_runes[i] == '{' && (i > 0 && re_runes[i-1] != '\\') { // We don't touch things inside braces, either
i++ // Skip opening brace i++ // Skip opening brace
for i < len(re_runes) && re_runes[i] != '}' { for i < len(re_runes) && re_runes[i] != '}' {
re_postfix = append(re_postfix, re_runes[i]) re_postfix = append(re_postfix, re_runes[i])
@ -98,7 +93,7 @@ func shuntingYard(re string) []postfixNode {
} }
re_postfix = append(re_postfix, re_runes[i]) // Append closing brace re_postfix = append(re_postfix, re_runes[i]) // Append closing brace
} }
if (re_runes[i] != '(' && re_runes[i] != '|' && re_runes[i] != '\\') || (i > 0 && re_runes[i-1] == '\\') { // Every character should be concatenated if it is escaped if i < len(re_runes) && (re_runes[i] != '(' && re_runes[i] != '|' && re_runes[i] != '\\') || (i > 0 && re_runes[i-1] == '\\') { // Every character should be concatenated if it is escaped
if i < len(re_runes)-1 { if i < len(re_runes)-1 {
if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != '+' && re_runes[i+1] != '?' && re_runes[i+1] != ')' && re_runes[i+1] != '{' { if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != '+' && re_runes[i+1] != '?' && re_runes[i+1] != ')' && re_runes[i+1] != '{' {
re_postfix = append(re_postfix, CONCAT) re_postfix = append(re_postfix, CONCAT)
@ -173,6 +168,11 @@ func shuntingYard(re string) []postfixNode {
} }
if c == LBRACKET { // Used for character classes if c == LBRACKET { // Used for character classes
i++ // Step forward so we can look at the character class i++ // Step forward so we can look at the character class
var invertMatch bool
if re_postfix[i] == '^' {
invertMatch = true
i++
}
chars := make([]rune, 0) // List of characters - used only for character classes chars := make([]rune, 0) // List of characters - used only for character classes
for i < len(re_postfix) { for i < len(re_postfix) {
if re_postfix[i] == RBRACKET { if re_postfix[i] == RBRACKET {
@ -184,8 +184,14 @@ func shuntingYard(re string) []postfixNode {
if i == len(re_postfix) { // We have reached the end of the string, so we didn't encounter a closing bracket. Panic. if i == len(re_postfix) { // We have reached the end of the string, so we didn't encounter a closing bracket. Panic.
panic("ERROR: Opening bracket without closing bracket.") panic("ERROR: Opening bracket without closing bracket.")
} }
outQueue = append(outQueue, newPostfixNode(chars...)) if !invertMatch {
// i++ // Step forward to skip closing bracket outQueue = append(outQueue, newPostfixCharNode(chars...))
} else {
// Invert match - create an allChars postfixNode, then add the given states to its 'except' list.
toAdd := newPostfixDotNode()
toAdd.except = chars
outQueue = append(outQueue, toAdd)
}
continue continue
} }
if c == '{' { if c == '{' {
@ -282,8 +288,11 @@ func thompson(re []postfixNode) *State {
if c.nodetype == CHARACTER || c.nodetype == ASSERTION { if c.nodetype == CHARACTER || c.nodetype == ASSERTION {
state := State{} state := State{}
state.transitions = make(map[int][]*State) state.transitions = make(map[int][]*State)
if c.isDot { if c.allChars {
state.isDot = true state.allChars = true
if len(c.except) != 0 {
state.except = append([]rune{}, c.except...)
}
} }
state.content = rune2Contents(c.contents) state.content = rune2Contents(c.contents)
state.output = make([]*State, 0) state.output = make([]*State, 0)

Loading…
Cancel
Save