From 9d3c228ace0e64b5eba509aecab82ee9a60344e0 Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Fri, 24 Jan 2025 14:57:47 -0500 Subject: [PATCH] Fixed edge cases with character ranges and character classes --- compile.go | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/compile.go b/compile.go index 14b6b1e..bc967df 100644 --- a/compile.go +++ b/compile.go @@ -153,8 +153,12 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) { } for re_runes[i] != ']' || i == 0 || re_runes[i-1] == '\\' { i++ // Skip all characters inside _unescaped_ brackets (we are _not_ at a closing bracket, or if we are, the previous character is a backslash) - // TODO: Check for escaped characters - if re_runes[i] == '-' && i > 0 && re_runes[i-1] != '\\' { // Unescaped hyphen - replace with CHAR_RANGE. This metacharacter will be used later on to construct the range + // Make sure we haven't exceeded the length of the string. If we did, then the regex doesn't actually have a closing bracket and we should throw an error. + if i >= len(re_runes) { + return nil, fmt.Errorf("Opening bracket without closing bracket.") + } + + if re_runes[i] == '-' && (i > 0 && re_runes[i-1] != '\\') && (i < len(re_runes)-1 && re_runes[i+1] != ']') { // Unescaped hyphen, that has some character (not a RBRACKET) after it - This represents a character range, so we replace with CHAR_RANGE. This metacharacter will be used later on to construct the range re_runes[i] = CHAR_RANGE } @@ -491,13 +495,15 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) { // 2. To account for this, the following logic is followed: // a. If the second-to-last postfixNode ie. the start of the range has only one element, then we are in a range. // i. If it has more than one element, then we are actually looking at a literal hyphen, and we will treat is as such. + // ii. If either the start or end of the range don't exist in 'chars' ie. 
something like [-a] or [a-], then we also treat it as a literal hyphen. // b. The last postfixNode added to 'chars' _must_ only have one character (because it's the end of the range). - endRangePostfixNode := mustPop(&chars) - startRangePostfixNode := mustPop(&chars) - if len(endRangePostfixNode.contents) != 1 { - return nil, fmt.Errorf("Error parsing character range.") - } else if len(startRangePostfixNode.contents) != 1 { // This is actually a regular hyphen + endRangePostfixNode, err1 := pop(&chars) + startRangePostfixNode, err2 := pop(&chars) + + if (err1 != nil || err2 != nil) || len(startRangePostfixNode.contents) != 1 { // Treat it as a regular hyphen chars = append(chars, startRangePostfixNode, newPostfixCharNode('-'), endRangePostfixNode) + } else if len(endRangePostfixNode.contents) != 1 { // I don't even know what this would look like, this is just a sanity check + return nil, fmt.Errorf("Error parsing character range.") } else { // We have established that they both have a length of 1 startRangeRune := startRangePostfixNode.contents[0]