// isUnicodeCharClassLetter returns whether or not the given letter represents
// a single-letter unicode character class (one of L, M, S, N, P, C, Z).
func isUnicodeCharClassLetter(c rune) bool {
	// A switch avoids allocating a lookup slice on every call.
	switch c {
	case 'L', 'M', 'S', 'N', 'P', 'C', 'Z':
		return true
	}
	return false
}

// appendRuneRange appends every code point in the inclusive range [lo, hi],
// stepping by stride, to dst and returns the extended slice.
//
// The bounds are taken as int64 so that advancing the counter can never wrap
// around, even when hi is the maximum value of the table's native uint16 or
// uint32 field. A stride below 1 is treated as 1 so a malformed table cannot
// cause an endless loop.
func appendRuneRange(dst []rune, lo, hi, stride int64) []rune {
	if stride < 1 {
		stride = 1
	}
	for c := lo; c <= hi; c += stride {
		dst = append(dst, rune(c))
	}
	return dst
}

// rangeTableToRuneSlice converts the given range table into a rune slice and returns it.
//
// The runes are emitted in table order: all 16-bit ranges first, then all
// 32-bit ranges. Both Lo and Hi of each range are included, because the
// unicode package defines Hi as an inclusive upper bound. A nil table yields
// a nil slice.
func rangeTableToRuneSlice(rangetable *unicode.RangeTable) []rune {
	if rangetable == nil {
		return nil
	}
	var rtv []rune
	for _, r := range rangetable.R16 {
		rtv = appendRuneRange(rtv, int64(r.Lo), int64(r.Hi), int64(r.Stride))
	}
	for _, r := range rangetable.R32 {
		rtv = appendRuneRange(rtv, int64(r.Lo), int64(r.Hi), int64(r.Stride))
	}
	return rtv
}

// unicodeCharClassToRange converts the given unicode character class name into a list of characters in that class.
// This class could also be a single letter eg. 'C'.
//
// The name is looked up first among the general categories
// (unicode.Categories, e.g. "L", "Lu", "Nd") and then among the scripts
// (unicode.Scripts, e.g. "Greek", "Yi"). Trying both tables, rather than
// choosing one by name length, lets two-letter script names resolve.
//
// It returns an error if the name is empty or is found in neither table. The
// error text distinguishes short names (one or two bytes) from long ones.
func unicodeCharClassToRange(class string) ([]rune, error) {
	if len(class) == 0 {
		return nil, fmt.Errorf("empty unicode character class")
	}
	if rangeTable, ok := unicode.Categories[class]; ok {
		return rangeTableToRuneSlice(rangeTable), nil
	}
	if rangeTable, ok := unicode.Scripts[class]; ok {
		return rangeTableToRuneSlice(rangeTable), nil
	}
	if len(class) <= 2 {
		return nil, fmt.Errorf("invalid short unicode character class")
	}
	return nil, fmt.Errorf("invalid long unicode character class")
}

// Stores whether the case-insensitive flag has been enabled.
var caseInsensitive bool @@ -313,6 +355,25 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) { } else { return nil, fmt.Errorf("invalid hex value in expression") } + } else if re_runes[i] == 'p' || re_runes[i] == 'P' { // Unicode character class (P is negated unicode charclass) + re_postfix = append(re_postfix, re_runes[i]) + i++ + if i >= len(re_runes) { + return nil, fmt.Errorf("error parsing unicode character class in expression") + } + if re_runes[i] == '{' { // Full name charclass + for re_runes[i] != '}' { + re_postfix = append(re_postfix, re_runes[i]) + i++ + } + re_postfix = append(re_postfix, re_runes[i]) + i++ + } else if isUnicodeCharClassLetter(re_runes[i]) { + re_postfix = append(re_postfix, re_runes[i]) + i++ + } else { + return nil, fmt.Errorf("error parsing unicode character class in expression") + } } else if re_runes[i] == '0' { // Start of octal value numDigits := 1 for i+numDigits < len(re_runes) && numDigits < 4 && isOctal(re_runes[i+numDigits]) { // Skip while we see an octal character (max of 4, starting with 0)