From 9cd330e521fe467b3018bab8097aca3f655b9751 Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Wed, 12 Feb 2025 23:04:10 -0500 Subject: [PATCH] More work on unicode character class support - fix bug where all characters aren't being matched --- regex/compile.go | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/regex/compile.go b/regex/compile.go index dfee016..428df30 100644 --- a/regex/compile.go +++ b/regex/compile.go @@ -490,6 +490,45 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) { } else { return nil, fmt.Errorf("not enough hex characters found in expression") } + } else if re_postfix[i] == 'p' || re_postfix[i] == 'P' { + charClassInverted := (re_postfix[i] == 'P') + i++ + if isUnicodeCharClassLetter(re_postfix[i]) { + chars, err := unicodeCharClassToRange(string(re_postfix[i])) + if err != nil { + return nil, err + } + var toAppend postfixNode + if re_postfix[i] == 'p' { + toAppend = newPostfixNode(chars...) + } + if re_postfix[i] == 'P' { + toAppend = newPostfixDotNode() + toAppend.except = append([]postfixNode{}, newPostfixNode(chars...)) + } + outQueue = append(outQueue, toAppend) + } else if re_postfix[i] == '{' { + i++ // Skip opening bracket + unicodeCharClassStr := "" + for re_postfix[i] != '}' { + unicodeCharClassStr += string(re_postfix[i]) + i++ + } + chars, err := unicodeCharClassToRange(unicodeCharClassStr) + if err != nil { + return nil, err + } + var toAppend postfixNode + if !charClassInverted { // \p + toAppend = newPostfixNode(chars...) + } else { // \P + toAppend = newPostfixDotNode() + toAppend.except = append([]postfixNode{}, newPostfixNode(chars...)) + } + outQueue = append(outQueue, toAppend) + } else { + return nil, fmt.Errorf("error parsing unicode character class in expression") + } } else if re_postfix[i] == '0' { // Octal value var octVal int64 var octValStr string