From 1520edad55b3d5f19bbc39cba5ae2991d3fdfc77 Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Fri, 24 Jan 2025 15:50:36 -0500 Subject: [PATCH] Enforce the rule that character classes must have at least one character; interpret literal closing brackets as regular characters --- compile.go | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/compile.go b/compile.go index bc967df..e3707ca 100644 --- a/compile.go +++ b/compile.go @@ -272,7 +272,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) { 6. If current character is '{', find the appropriate numeric specifier (range start, range end). Apply the range to the postfixNode at the end of outQueue. */ c := re_postfix[i] - if isNormalChar(c) { + if isNormalChar(c) || isSpecialCharWithMetacharReplacement(c) { if caseInsensitive { outQueue = append(outQueue, newPostfixNode(allCases(c)...)) } else { @@ -280,7 +280,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) { } continue } - // Escape character + if c == '\\' { // Escape character - invert special and non-special characters eg. \( is treated as a literal parentheses, \b is treated as word boundary if i == len(re_postfix)-1 { // End of string - panic, because backslash is an escape character (something needs to come after it) return nil, fmt.Errorf("ERROR: Backslash with no escape character.") @@ -412,8 +412,9 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) { } } if c == LBRACKET { // Used for character classes - endOfRange := false // Set to 'true' when we encounter a CHAR_RANGE metacharacter - i++ // Step forward so we can look at the character class + firstCharAdded := false // A character class must have at least 1 character. This flag checks if the first character has been added. + endOfRange := false // Set to 'true' when we encounter a CHAR_RANGE metacharacter + i++ // Step forward so we can look at the character class var invertMatch bool if re_postfix[i] == '^' { invertMatch = true @@ -421,7 +422,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) { } chars := make([]postfixNode, 0) // List of nodes - used only for character classes for i < len(re_postfix) { - if re_postfix[i] == RBRACKET { + if firstCharAdded && re_postfix[i] == RBRACKET { break } if re_postfix[i] == CHAR_RANGE { @@ -481,9 +482,20 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) { i++ } } else { + if !firstCharAdded && re_postfix[i] > 0xF0000 { // It's a metacharacter that I defined, I'll have to convert it back to the regular character before adding it back, because I haven't added any characters yet. For example, '[[]', the second LBRACKET should be treated like a literal bracket. + switch re_postfix[i] { + case LBRACKET: + chars = append(chars, newPostfixCharNode('[')) + case RBRACKET: + chars = append(chars, newPostfixCharNode(']')) + default: + return nil, fmt.Errorf("Error parsing high-range unicode value in character class.") + } + } chars = append(chars, newPostfixCharNode(re_postfix[i])) i++ } + firstCharAdded = true if endOfRange { // The previous character was an unescaped hyphen, which (in the context of a character class) means the character that was last appended is the end of a character range // Things to note: