From 44d6a2005c87c2fc63f14c408bcc56dd0f8fc417 Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Wed, 12 Feb 2025 22:19:30 -0500 Subject: [PATCH 1/7] Started working on unicode character classes --- regex/compile.go | 61 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/regex/compile.go b/regex/compile.go index 0414ac8..dfee016 100644 --- a/regex/compile.go +++ b/regex/compile.go @@ -108,6 +108,48 @@ func getPOSIXClass(str []rune) (bool, string) { return true, rtv } +// isUnicodeCharClassLetter returns whether or not the given letter represents a unicode character class. +func isUnicodeCharClassLetter(c rune) bool { + return slices.Contains([]rune{'L', 'M', 'S', 'N', 'P', 'C', 'Z'}, c) +} + +// rangeTableToRuneSlice converts the given range table into a rune slice and returns it. +func rangeTableToRuneSlice(rangetable *unicode.RangeTable) []rune { + var rtv []rune + for _, r := range rangetable.R16 { + for c := r.Lo; c < r.Hi; c += r.Stride { + rtv = append(rtv, rune(c)) + } + } + for _, r := range rangetable.R32 { + for c := r.Lo; c < r.Hi; c += r.Stride { + rtv = append(rtv, rune(c)) + } + } + return rtv +} + +// unicodeCharClassToRange converts the given unicode character class name into a list of characters in that class. +// This class could also be a single letter eg. 'C'. +func unicodeCharClassToRange(class string) ([]rune, error) { + if len(class) == 0 { + return nil, fmt.Errorf("empty unicode character class") + } + if len(class) == 1 || len(class) == 2 { + if rangeTable, ok := unicode.Categories[class]; ok { + return rangeTableToRuneSlice(rangeTable), nil + } else { + return nil, fmt.Errorf("invalid short unicode character class") + } + } else { + if rangeTable, ok := unicode.Scripts[class]; ok { + return rangeTableToRuneSlice(rangeTable), nil + } else { + return nil, fmt.Errorf("invalid long unicode character class") + } + } +} + // Stores whether the case-insensitive flag has been enabled. var caseInsensitive bool @@ -313,6 +355,25 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) { } else { return nil, fmt.Errorf("invalid hex value in expression") } + } else if re_runes[i] == 'p' || re_runes[i] == 'P' { // Unicode character class (P is negated unicode charclass) + re_postfix = append(re_postfix, re_runes[i]) + i++ + if i >= len(re_runes) { + return nil, fmt.Errorf("error parsing unicode character class in expression") + } + if re_runes[i] == '{' { // Full name charclass + for re_runes[i] != '}' { + re_postfix = append(re_postfix, re_runes[i]) + i++ + } + re_postfix = append(re_postfix, re_runes[i]) + i++ + } else if isUnicodeCharClassLetter(re_runes[i]) { + re_postfix = append(re_postfix, re_runes[i]) + i++ + } else { + return nil, fmt.Errorf("error parsing unicode character class in expression") + } } else if re_runes[i] == '0' { // Start of octal value numDigits := 1 for i+numDigits < len(re_runes) && numDigits < 4 && isOctal(re_runes[i+numDigits]) { // Skip while we see an octal character (max of 4, starting with 0) From 9cd330e521fe467b3018bab8097aca3f655b9751 Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Wed, 12 Feb 2025 23:04:10 -0500 Subject: [PATCH 2/7] More work on unicode character class support - fix bug where all characters aren't being matched --- regex/compile.go | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/regex/compile.go b/regex/compile.go index dfee016..428df30 100644 --- a/regex/compile.go +++ b/regex/compile.go @@ -490,6 +490,45 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) { } else { return nil, fmt.Errorf("not enough hex characters found in expression") } + } else if re_postfix[i] == 'p' || re_postfix[i] == 'P' { + charClassInverted := (re_postfix[i] == 'P') + i++ + if isUnicodeCharClassLetter(re_postfix[i]) { + chars, err := unicodeCharClassToRange(string(re_postfix[i])) + if err != nil { + return nil, err + } + var toAppend postfixNode + if re_postfix[i] == 'p' { + toAppend = newPostfixNode(chars...) + } + if re_postfix[i] == 'P' { + toAppend = newPostfixDotNode() + toAppend.except = append([]postfixNode{}, newPostfixNode(chars...)) + } + outQueue = append(outQueue, toAppend) + } else if re_postfix[i] == '{' { + i++ // Skip opening bracket + unicodeCharClassStr := "" + for re_postfix[i] != '}' { + unicodeCharClassStr += string(re_postfix[i]) + i++ + } + chars, err := unicodeCharClassToRange(unicodeCharClassStr) + if err != nil { + return nil, err + } + var toAppend postfixNode + if !charClassInverted { // \p + toAppend = newPostfixNode(chars...) + } else { // \P + toAppend = newPostfixDotNode() + toAppend.except = append([]postfixNode{}, newPostfixNode(chars...)) + } + outQueue = append(outQueue, toAppend) + } else { + return nil, fmt.Errorf("error parsing unicode character class in expression") + } } else if re_postfix[i] == '0' { // Octal value var octVal int64 var octValStr string From d4d606d95bdfcce5002860612be77ba422513ab3 Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Thu, 13 Feb 2025 08:55:12 -0500 Subject: [PATCH 3/7] Added tests for unicode character classes; more tests for hex characters --- regex/re_test.go | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/regex/re_test.go b/regex/re_test.go index 1b717c4..f085761 100644 --- a/regex/re_test.go +++ b/regex/re_test.go @@ -430,6 +430,7 @@ var reTests = []struct { {`^(.+)?B`, []ReFlag{RE_CASE_INSENSITIVE}, `ab`, []Group{{0, 2}}}, {`\0009`, []ReFlag{RE_CASE_INSENSITIVE}, "\x009", []Group{{0, 2}}}, {`\0141`, []ReFlag{RE_CASE_INSENSITIVE}, "A", []Group{{0, 1}}}, + {`\0141\0141`, []ReFlag{RE_CASE_INSENSITIVE}, "AA", []Group{{0, 2}}}, {`a[-]?c`, []ReFlag{RE_CASE_INSENSITIVE}, `AC`, []Group{{0, 2}}}, @@ -460,8 +461,10 @@ var reTests = []struct { {`[\D5]+`, nil, `1234abc5678`, []Group{{4, 8}}}, {`[\da-fA-F]+`, nil, `123abc`, []Group{{0, 6}}}, {`\xff`, nil, "\u00ff", []Group{{0, 1}}}, + {`\xff+`, nil, "\u00ff\u00ff", []Group{{0, 2}}}, {`\xFF`, nil, "\u00ff", []Group{{0, 1}}}, {`\x00ff`, nil, "\u00ff", []Group{}}, + {`\x{0000ff}+`, nil, "\u00ff\u00ff", []Group{{0, 2}}}, {`\x{0000ff}`, nil, "\u00ff", []Group{{0, 1}}}, {`\x{0000FF}`, nil, "\u00ff", []Group{{0, 1}}}, {"\t\n\v\r\f\a", nil, "\t\n\v\r\f\a", []Group{{0, 6}}}, @@ -516,6 +519,14 @@ var reTests = []struct { {`<389-400`, nil, `-`, nil}, {`<389-400>`, nil, `391`, []Group{{0, 3}}}, {`\b<1-10000>\b`, nil, `America declared independence in 1776.`, []Group{{33, 37}}}, + + {`\p{Tamil}+`, nil, `உயிரெழுத்து`, []Group{{0, 11}}}, // Each letter and matra is counted as a separate rune, so 'u', 'ya', 'e (matra), 'ra', 'e (matra)', 'zha', (oo (matra), 'tha', 'ith', 'tha', 'oo (matra)'. + {`\P{Tamil}+`, nil, `vowel=உயிரெழுத்து`, []Group{{0, 6}}}, + {`\P`, nil, `உயிரெழுத்து`, nil}, + {`\PM\pM*`, nil, `உயிரெழுத்து`, []Group{{0, 1}, {1, 3}, {3, 5}, {5, 7}, {7, 9}, {9, 11}}}, + {`\pN+`, nil, `123abc456def`, []Group{{0, 3}, {6, 9}}}, + {`\PN+`, nil, `123abc456def`, []Group{{3, 6}, {9, 12}}}, + {`[\p{Greek}\p{Cyrillic}]`, nil, `ΣωШД`, []Group{{0, 1}, {1, 2}, {2, 3}, {3, 4}}}, } var groupTests = []struct { From 70457118603d7ed762f9ff3cccdf3365d3621e3c Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Thu, 13 Feb 2025 08:55:41 -0500 Subject: [PATCH 4/7] Convert test_str into a rune slice for better unicode compatibility, it also fixed the bug where all unicode characters wouldn't be colored --- cmd/main.go | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/cmd/main.go b/cmd/main.go index 82c5748..06b9a58 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -129,6 +129,8 @@ func main() { matchIndices = regComp.FindAllSubmatch(test_str) } + test_str_runes := []rune(test_str) // Converting to runes preserves unicode characters + if *printMatchesFlag { // if we are in single line mode, print the line on which // the matches occur @@ -158,10 +160,10 @@ func main() { oldIndices := indicesToPrint.values() indicesToPrint = new_uniq_arr[int]() // Explanation: - // Find all numbers from 0 to len(test_str) that are NOT in oldIndices. + // Find all numbers from 0 to len(test_str_runes) that are NOT in oldIndices. // These are the values we want to print, now that we have inverted the match. // Re-initialize indicesToPrint and add all of these values to it. - indicesToPrint.add(setDifference(genRange(0, len(test_str)), oldIndices)...) + indicesToPrint.add(setDifference(genRange(0, len(test_str_runes)), oldIndices)...) } // If lineFlag is enabled, we should only print something if: @@ -182,7 +184,7 @@ func main() { // the corresponding end index. // 3. If not, just print the character. if substituteFlagEnabled { - for i := range test_str { + for i := range test_str_runes { inMatchIndex := false for _, m := range matchIndices { if i == m[0].StartIdx { @@ -193,11 +195,11 @@ func main() { } } if !inMatchIndex { - fmt.Fprintf(out, "%c", test_str[i]) + fmt.Fprintf(out, "%c", test_str_runes[i]) } } } else { - for i, c := range test_str { + for i, c := range test_str_runes { if indicesToPrint.contains(i) { color.New(color.FgRed).Fprintf(out, "%c", c) // Newline after every match - only if -o is enabled and -v is disabled. From fde3784e5a4a6525ae6ecec2fc945d34d3396f36 Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Thu, 13 Feb 2025 08:58:02 -0500 Subject: [PATCH 5/7] Added unicode charclass support within character classes; Fixed bugs with hex classes and unicode classes --- regex/compile.go | 50 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 44 insertions(+), 6 deletions(-) diff --git a/regex/compile.go b/regex/compile.go index 428df30..62dbfb9 100644 --- a/regex/compile.go +++ b/regex/compile.go @@ -117,12 +117,12 @@ func isUnicodeCharClassLetter(c rune) bool { func rangeTableToRuneSlice(rangetable *unicode.RangeTable) []rune { var rtv []rune for _, r := range rangetable.R16 { - for c := r.Lo; c < r.Hi; c += r.Stride { + for c := r.Lo; c <= r.Hi; c += r.Stride { rtv = append(rtv, rune(c)) } } for _, r := range rangetable.R32 { - for c := r.Lo; c < r.Hi; c += r.Stride { + for c := r.Lo; c <= r.Hi; c += r.Stride { rtv = append(rtv, rune(c)) } } @@ -351,7 +351,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) { } } else if isHex(re_runes[i]) { re_postfix = append(re_postfix, re_runes[i:i+2]...) - i += 2 + i += 1 // I don't skip forward 2 steps, because the second step will happen with the loop increment } else { return nil, fmt.Errorf("invalid hex value in expression") } @@ -374,6 +374,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) { } else { return nil, fmt.Errorf("error parsing unicode character class in expression") } + i-- // The loop increment at the top will move us forward } else if re_runes[i] == '0' { // Start of octal value numDigits := 1 for i+numDigits < len(re_runes) && numDigits < 4 && isOctal(re_runes[i+numDigits]) { // Skip while we see an octal character (max of 4, starting with 0) @@ -499,10 +500,9 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) { return nil, err } var toAppend postfixNode - if re_postfix[i] == 'p' { + if !charClassInverted { toAppend = newPostfixNode(chars...) - } - if re_postfix[i] == 'P' { + } else { toAppend = newPostfixDotNode() toAppend.except = append([]postfixNode{}, newPostfixNode(chars...)) } @@ -711,7 +711,45 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) { } else { return nil, fmt.Errorf("not enough hex characters found in character class") } + } else if re_postfix[i] == 'p' || re_postfix[i] == 'P' { + charClassInverted := (re_postfix[i] == 'P') + i++ + if isUnicodeCharClassLetter(re_postfix[i]) { + charsInList, err := unicodeCharClassToRange(string(re_postfix[i])) + if err != nil { + return nil, err + } + if !charClassInverted { + chars = append(chars, newPostfixNode(charsInList...)) + } else { + toAppend := newPostfixDotNode() + toAppend.except = append([]postfixNode{}, newPostfixNode(charsInList...)) + chars = append(chars, toAppend) + } + } else if re_postfix[i] == '{' { + i++ // Skip opening bracket + unicodeCharClassStr := "" + for re_postfix[i] != '}' { + unicodeCharClassStr += string(re_postfix[i]) + i++ + } + charsInList, err := unicodeCharClassToRange(unicodeCharClassStr) + if err != nil { + return nil, err + } + if !charClassInverted { + chars = append(chars, newPostfixNode(charsInList...)) + } else { + toAppend := newPostfixDotNode() + toAppend.except = append([]postfixNode{}, newPostfixNode(charsInList...)) + chars = append(chars, toAppend) + } + } else { + return nil, fmt.Errorf("error parsing unicode character class in expression") + } + } else if re_postfix[i] == '0' { // Octal value + var octVal int64 var octValStr string numDigitsParsed := 0 From 1a890a1e75aa5212c4625efea60e6b21dd54c098 Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Thu, 13 Feb 2025 09:10:40 -0500 Subject: [PATCH 6/7] Refactoring - remove duplicate code --- regex/compile.go | 60 ++++++++++++++++++++---------------------------- 1 file changed, 25 insertions(+), 35 deletions(-) diff --git a/regex/compile.go b/regex/compile.go index 62dbfb9..904bf26 100644 --- a/regex/compile.go +++ b/regex/compile.go @@ -493,20 +493,14 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) { } } else if re_postfix[i] == 'p' || re_postfix[i] == 'P' { charClassInverted := (re_postfix[i] == 'P') + charsInClass := []rune{} i++ if isUnicodeCharClassLetter(re_postfix[i]) { - chars, err := unicodeCharClassToRange(string(re_postfix[i])) + var err error + charsInClass, err = unicodeCharClassToRange(string(re_postfix[i])) if err != nil { return nil, err } - var toAppend postfixNode - if !charClassInverted { - toAppend = newPostfixNode(chars...) - } else { - toAppend = newPostfixDotNode() - toAppend.except = append([]postfixNode{}, newPostfixNode(chars...)) - } - outQueue = append(outQueue, toAppend) } else if re_postfix[i] == '{' { i++ // Skip opening bracket unicodeCharClassStr := "" @@ -514,21 +508,22 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) { unicodeCharClassStr += string(re_postfix[i]) i++ } - chars, err := unicodeCharClassToRange(unicodeCharClassStr) + var err error + charsInClass, err = unicodeCharClassToRange(unicodeCharClassStr) if err != nil { return nil, err } - var toAppend postfixNode - if !charClassInverted { // \p - toAppend = newPostfixNode(chars...) - } else { // \P - toAppend = newPostfixDotNode() - toAppend.except = append([]postfixNode{}, newPostfixNode(chars...)) - } - outQueue = append(outQueue, toAppend) } else { return nil, fmt.Errorf("error parsing unicode character class in expression") } + var toAppend postfixNode + if !charClassInverted { // \p + toAppend = newPostfixNode(charsInClass...) + } else { // \P + toAppend = newPostfixDotNode() + toAppend.except = append([]postfixNode{}, newPostfixNode(charsInClass...)) + } + outQueue = append(outQueue, toAppend) } else if re_postfix[i] == '0' { // Octal value var octVal int64 var octValStr string @@ -713,19 +708,14 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) { } } else if re_postfix[i] == 'p' || re_postfix[i] == 'P' { charClassInverted := (re_postfix[i] == 'P') + charsInList := []rune{} i++ if isUnicodeCharClassLetter(re_postfix[i]) { - charsInList, err := unicodeCharClassToRange(string(re_postfix[i])) + var err error + charsInList, err = unicodeCharClassToRange(string(re_postfix[i])) if err != nil { return nil, err } - if !charClassInverted { - chars = append(chars, newPostfixNode(charsInList...)) - } else { - toAppend := newPostfixDotNode() - toAppend.except = append([]postfixNode{}, newPostfixNode(charsInList...)) - chars = append(chars, toAppend) - } } else if re_postfix[i] == '{' { i++ // Skip opening bracket unicodeCharClassStr := "" @@ -733,21 +723,21 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) { unicodeCharClassStr += string(re_postfix[i]) i++ } - charsInList, err := unicodeCharClassToRange(unicodeCharClassStr) + var err error + charsInList, err = unicodeCharClassToRange(unicodeCharClassStr) if err != nil { return nil, err } - if !charClassInverted { - chars = append(chars, newPostfixNode(charsInList...)) - } else { - toAppend := newPostfixDotNode() - toAppend.except = append([]postfixNode{}, newPostfixNode(charsInList...)) - chars = append(chars, toAppend) - } } else { return nil, fmt.Errorf("error parsing unicode character class in expression") } - + if !charClassInverted { + chars = append(chars, newPostfixNode(charsInList...)) + } else { + toAppend := newPostfixDotNode() + toAppend.except = append([]postfixNode{}, newPostfixNode(charsInList...)) + chars = append(chars, toAppend) + } } else if re_postfix[i] == '0' { // Octal value var octVal int64 From 46bc0c85296181362b214cfb44afe615f8aa2f8a Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Thu, 13 Feb 2025 10:48:23 -0500 Subject: [PATCH 7/7] Removed unicode character classes from 'features not supported' list --- regex/doc.go | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/regex/doc.go b/regex/doc.go index b35c02f..c77faf6 100644 --- a/regex/doc.go +++ b/regex/doc.go @@ -153,9 +153,8 @@ returns the 0-group. The following features from [regexp] are (currently) NOT supported: 1. Named capturing groups 2. Non-greedy operators - 3. Unicode character classes - 4. Embedded flags (flags are instead passed as arguments to [Compile]) - 5. Literal text with \Q ... \E + 3. Embedded flags (flags are instead passed as arguments to [Compile]) + 4. Literal text with \Q ... \E The following features are not available in [regexp], but are supported in my engine: 1. Lookarounds