Fixed bug in character class implementation
This commit is contained in:
51
compile.go
51
compile.go
@@ -123,9 +123,12 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
|||||||
} else if c == '\\' && i < len(re_runes_orig)-1 && re_runes_orig[i+1] == '\\' { // Escaped backslash
|
} else if c == '\\' && i < len(re_runes_orig)-1 && re_runes_orig[i+1] == '\\' { // Escaped backslash
|
||||||
re_runes = append(re_runes, ESC_BACKSLASH)
|
re_runes = append(re_runes, ESC_BACKSLASH)
|
||||||
i++
|
i++
|
||||||
} else if c == '[' && (i == 0 || re_runes_orig[i-1] != '\\')
|
} else if c == '[' && (i == 0 || re_runes[len(re_runes)-1] != '\\') {
|
||||||
|
re_runes = append(re_runes, LBRACKET)
|
||||||
|
continue
|
||||||
|
} else if c == ']' && (i == 0 || re_runes[len(re_runes)-1] != '\\') {
|
||||||
|
re_runes = append(re_runes, RBRACKET)
|
||||||
|
continue
|
||||||
} else {
|
} else {
|
||||||
re_runes = append(re_runes, c)
|
re_runes = append(re_runes, c)
|
||||||
}
|
}
|
||||||
@@ -145,31 +148,28 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
|||||||
i := 0
|
i := 0
|
||||||
for i < len(re_runes) {
|
for i < len(re_runes) {
|
||||||
re_postfix = append(re_postfix, re_runes[i])
|
re_postfix = append(re_postfix, re_runes[i])
|
||||||
if re_runes[i] == '[' && (i == 0 || re_runes[i-1] != '\\') { // We do not touch things inside brackets, unless they are escaped. Inside this block, the only task is to expand character ranges into their constituent characters.
|
if re_runes[i] == LBRACKET && (i == 0 || re_runes[i-1] != '\\') { // We do not touch things inside brackets, unless they are escaped.
|
||||||
re_postfix[len(re_postfix)-1] = LBRACKET // Replace the '[' character with LBRACKET. This allows for easier parsing of all characters (including opening and closing brackets) within the character class
|
toAppend := make([]rune, 0) // Holds all the runes in the current character class
|
||||||
toAppend := make([]rune, 0) // Holds all the runes in the current character class
|
|
||||||
if i < len(re_runes)-1 && re_runes[i+1] == '^' { // Inverting class - match everything NOT in brackets
|
i++ // Skip past LBRACKET, because it was already added
|
||||||
re_postfix = append(re_postfix, '^')
|
if i >= len(re_runes) { // Sanity check before we start
|
||||||
i++ // Skip opening bracket and caret
|
return nil, fmt.Errorf("Opening bracket without closing bracket.")
|
||||||
}
|
}
|
||||||
if i < len(re_runes)-1 && re_runes[i+1] == ']' { // Nothing inside brackets - panic.
|
|
||||||
return nil, fmt.Errorf("Empty character class.")
|
for re_runes[i] != RBRACKET || i == 0 || re_runes[i-1] == '\\' { // Skip all characters inside _unescaped_ brackets (we are _not_ at a closing bracket, or if we are, the previous character is a backslash)
|
||||||
}
|
|
||||||
for re_runes[i] != ']' || i == 0 || re_runes[i-1] == '\\' {
|
|
||||||
i++ // Skip all characters inside _unescaped_ brackets (we are _not_ at a closing bracket, or if we are, the previous character is a backslash)
|
|
||||||
// Make sure we haven't exceeded the length of the string. If we did, then the regex doesn't actually have a closing bracket and we should throw an error.
|
// Make sure we haven't exceeded the length of the string. If we did, then the regex doesn't actually have a closing bracket and we should throw an error.
|
||||||
if i >= len(re_runes) {
|
if i >= len(re_runes) {
|
||||||
return nil, fmt.Errorf("Opening bracket without closing bracket.")
|
return nil, fmt.Errorf("Opening bracket without closing bracket.")
|
||||||
}
|
}
|
||||||
|
|
||||||
if re_runes[i] == '-' && (i > 0 && re_runes[i-1] != '\\') && (i < len(re_runes)-1 && re_runes[i+1] != ']') { // Unescaped hyphen, that has some character (not a RBRACKET) after it - This represents a character range, so we replace with CHAR_RANGE. This metacharacter will be used later on to construct the range
|
if re_runes[i] == '-' && (i > 0 && re_runes[i-1] != '\\') && (i < len(re_runes)-1 && re_runes[i+1] != RBRACKET) { // Unescaped hyphen, that has some character (not a RBRACKET) after it - This represents a character range, so we replace with CHAR_RANGE. This metacharacter will be used later on to construct the range
|
||||||
re_runes[i] = CHAR_RANGE
|
re_runes[i] = CHAR_RANGE
|
||||||
}
|
}
|
||||||
|
|
||||||
toAppend = append(toAppend, re_runes[i])
|
toAppend = append(toAppend, re_runes[i])
|
||||||
|
i++
|
||||||
}
|
}
|
||||||
// Replace the last character (which should have been ']') with RBRACKET
|
// Add in the RBRACKET
|
||||||
toAppend[len(toAppend)-1] = RBRACKET
|
toAppend = append(toAppend, RBRACKET)
|
||||||
re_postfix = append(re_postfix, toAppend...)
|
re_postfix = append(re_postfix, toAppend...)
|
||||||
}
|
}
|
||||||
if i < len(re_runes) && re_runes[i] == '{' && (i > 0 && re_runes[i-1] != '\\') { // We don't touch things inside braces, either
|
if i < len(re_runes) && re_runes[i] == '{' && (i > 0 && re_runes[i-1] != '\\') { // We don't touch things inside braces, either
|
||||||
@@ -284,6 +284,17 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
|||||||
}
|
}
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
// Since every unescaped bracket is replaced by a LBRACKET / RBRACKET, there may
|
||||||
|
// have been false positives. For example, the regex ']' has a closing bracket, but it
|
||||||
|
// isn't denoting a character class; it's just a regular character. Since it's not escaped,
|
||||||
|
// though, I would have converted this into an RBRACKET.
|
||||||
|
// To deal with this, I make the following assertion:
|
||||||
|
// If at any point I see an RBRACKET 'in the wild' (not in a character class), then it must be
|
||||||
|
// a regular character, with no special significance.
|
||||||
|
if c == RBRACKET {
|
||||||
|
outQueue = append(outQueue, newPostfixCharNode(']'))
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
if c == '\\' { // Escape character - invert special and non-special characters eg. \( is treated as a literal parentheses, \b is treated as word boundary
|
if c == '\\' { // Escape character - invert special and non-special characters eg. \( is treated as a literal parentheses, \b is treated as word boundary
|
||||||
if i == len(re_postfix)-1 { // End of string - panic, because backslash is an escape character (something needs to come after it)
|
if i == len(re_postfix)-1 { // End of string - panic, because backslash is an escape character (something needs to come after it)
|
||||||
@@ -419,6 +430,10 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
|||||||
firstCharAdded := false // A character class must have at least 1 character. This flag checks if the first character has been added.
|
firstCharAdded := false // A character class must have at least 1 character. This flag checks if the first character has been added.
|
||||||
endOfRange := false // Set to 'true' when we encounter a CHAR_RANGE metacharacter
|
endOfRange := false // Set to 'true' when we encounter a CHAR_RANGE metacharacter
|
||||||
i++ // Step forward so we can look at the character class
|
i++ // Step forward so we can look at the character class
|
||||||
|
// Oops, there's nothing there to look at
|
||||||
|
if i >= len(re_postfix) {
|
||||||
|
return nil, fmt.Errorf("Opening bracket with no closing bracket.")
|
||||||
|
}
|
||||||
var invertMatch bool
|
var invertMatch bool
|
||||||
if re_postfix[i] == '^' {
|
if re_postfix[i] == '^' {
|
||||||
invertMatch = true
|
invertMatch = true
|
||||||
|
Reference in New Issue
Block a user