diff --git a/compile.go b/compile.go index 8f8eb8c..5aa0a5c 100644 --- a/compile.go +++ b/compile.go @@ -154,6 +154,9 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) { for re_runes[i] != ']' || i == 0 || re_runes[i-1] == '\\' { i++ // Skip all characters inside _unescaped_ brackets (we are _not_ at a closing bracket, or if we are, the previous character is a backslash) // TODO: Check for escaped characters + if re_runes[i] == '-' && i > 0 && re_runes[i-1] != '\\' { // Unescaped hyphen - replace with CHAR_RANGE. This metacharacter will be used later on to construct the range + re_runes[i] = CHAR_RANGE + } toAppend = append(toAppend, re_runes[i]) } @@ -405,7 +408,8 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) { } } if c == LBRACKET { // Used for character classes - i++ // Step forward so we can look at the character class + endOfRange := false // Set to 'true' when we encounter a CHAR_RANGE metacharacter + i++ // Step forward so we can look at the character class var invertMatch bool if re_postfix[i] == '^' { invertMatch = true @@ -416,6 +420,11 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) { if re_postfix[i] == RBRACKET { break } + if re_postfix[i] == CHAR_RANGE { + endOfRange = true + i++ + continue + } if re_postfix[i] == '\\' { // Backslash indicates a character to be escaped if i == len(re_postfix)-1 { return nil, fmt.Errorf("Stray backslash in character class.") @@ -471,10 +480,38 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) { chars = append(chars, newPostfixCharNode(re_postfix[i])) i++ } + + if endOfRange { // The previous character was an unescaped hyphen, which (in the context of a character class) means the character that was last appended is the end of a character range + // Things to note: + // 1. In PCRE and Go's regex engine, a letter _can_ be surrounded by hyphens in a character class. + // Eg. [a-b-c] + // While you might think this leads to a syntax error (I thought so), the engine picks 'a-b' as a range, + // then treats the second '-' and 'c' as regular characters in the character class. + // So this regex becomes "Match a character from 'a' to 'b', a literal hyphen, or 'c' ". + // 2. To account for this, the following logic is followed: + // a. If the second-to-last postfixNode ie. the start of the range has only one element, then we are in a range. + // i. If it has more than one element, then we are actually looking at a literal hyphen, and we will treat is as such. + // b. The last postfixNode added to 'chars' _must_ only have one character (because it's the end of the range). + endRangePostfixNode := mustPop(&chars) + startRangePostfixNode := mustPop(&chars) + if len(endRangePostfixNode.contents) != 1 { + return nil, fmt.Errorf("Error parsing character range.") + } else if len(startRangePostfixNode.contents) != 1 { // This is actually a regular hyphen + chars = append(chars, startRangePostfixNode, newPostfixCharNode('-'), endRangePostfixNode) + } else { + // We have established that they both have a length of 1 + startRangeRune := startRangePostfixNode.contents[0] + endRangeRune := endRangePostfixNode.contents[0] + chars = append(chars, newPostfixCharNode(genRange(startRangeRune, endRangeRune+1)...)) + } + + endOfRange = false // Reset the flag + } } if i == len(re_postfix) { // We have reached the end of the string, so we didn't encounter a closing brakcet. Panic. return nil, fmt.Errorf("Opening bracket without closing bracket.") } + outQueue = append(outQueue, newCharClassNode(chars, invertMatch)) continue } @@ -667,7 +704,7 @@ func thompson(re []postfixNode) (Reg, error) { // Replace ESC_BACKSLASH with actual backslash, so that we can actually check if we encounter it replaceByValue([]int(state.content), int(ESC_BACKSLASH), '\\') // Uncommenting this seems to make one of the test cases fail. Why? - // replaceByValue(state.except, ESC_BACKSLASH, '\\') + replaceByValue(state.except, ESC_BACKSLASH, '\\') nfa = append(nfa, &state) }