Merge pull request 'Implement Unicode character classes' (#4) from implementUnicodeCharClass into master
Reviewed-on: #4
This commit is contained in:
12
cmd/main.go
12
cmd/main.go
@@ -129,6 +129,8 @@ func main() {
|
|||||||
matchIndices = regComp.FindAllSubmatch(test_str)
|
matchIndices = regComp.FindAllSubmatch(test_str)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
test_str_runes := []rune(test_str) // Converting to runes preserves unicode characters
|
||||||
|
|
||||||
if *printMatchesFlag {
|
if *printMatchesFlag {
|
||||||
// if we are in single line mode, print the line on which
|
// if we are in single line mode, print the line on which
|
||||||
// the matches occur
|
// the matches occur
|
||||||
@@ -158,10 +160,10 @@ func main() {
|
|||||||
oldIndices := indicesToPrint.values()
|
oldIndices := indicesToPrint.values()
|
||||||
indicesToPrint = new_uniq_arr[int]()
|
indicesToPrint = new_uniq_arr[int]()
|
||||||
// Explanation:
|
// Explanation:
|
||||||
// Find all numbers from 0 to len(test_str) that are NOT in oldIndices.
|
// Find all numbers from 0 to len(test_str_runes) that are NOT in oldIndices.
|
||||||
// These are the values we want to print, now that we have inverted the match.
|
// These are the values we want to print, now that we have inverted the match.
|
||||||
// Re-initialize indicesToPrint and add all of these values to it.
|
// Re-initialize indicesToPrint and add all of these values to it.
|
||||||
indicesToPrint.add(setDifference(genRange(0, len(test_str)), oldIndices)...)
|
indicesToPrint.add(setDifference(genRange(0, len(test_str_runes)), oldIndices)...)
|
||||||
|
|
||||||
}
|
}
|
||||||
// If lineFlag is enabled, we should only print something if:
|
// If lineFlag is enabled, we should only print something if:
|
||||||
@@ -182,7 +184,7 @@ func main() {
|
|||||||
// the corresponding end index.
|
// the corresponding end index.
|
||||||
// 3. If not, just print the character.
|
// 3. If not, just print the character.
|
||||||
if substituteFlagEnabled {
|
if substituteFlagEnabled {
|
||||||
for i := range test_str {
|
for i := range test_str_runes {
|
||||||
inMatchIndex := false
|
inMatchIndex := false
|
||||||
for _, m := range matchIndices {
|
for _, m := range matchIndices {
|
||||||
if i == m[0].StartIdx {
|
if i == m[0].StartIdx {
|
||||||
@@ -193,11 +195,11 @@ func main() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !inMatchIndex {
|
if !inMatchIndex {
|
||||||
fmt.Fprintf(out, "%c", test_str[i])
|
fmt.Fprintf(out, "%c", test_str_runes[i])
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for i, c := range test_str {
|
for i, c := range test_str_runes {
|
||||||
if indicesToPrint.contains(i) {
|
if indicesToPrint.contains(i) {
|
||||||
color.New(color.FgRed).Fprintf(out, "%c", c)
|
color.New(color.FgRed).Fprintf(out, "%c", c)
|
||||||
// Newline after every match - only if -o is enabled and -v is disabled.
|
// Newline after every match - only if -o is enabled and -v is disabled.
|
||||||
|
130
regex/compile.go
130
regex/compile.go
@@ -108,6 +108,48 @@ func getPOSIXClass(str []rune) (bool, string) {
|
|||||||
return true, rtv
|
return true, rtv
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// isUnicodeCharClassLetter reports whether c is one of the single-letter
// unicode character class names: L, M, S, N, P, C or Z. The check is
// case-sensitive.
func isUnicodeCharClassLetter(c rune) bool {
	switch c {
	case 'L', 'M', 'S', 'N', 'P', 'C', 'Z':
		return true
	default:
		return false
	}
}
|
||||||
|
|
||||||
|
// rangeTableToRuneSlice expands the given range table into a flat slice of
// every rune it describes and returns it.
//
// The 16-bit ranges are emitted first, followed by the 32-bit ranges, each in
// the order they appear in the table. Within a range, runes are produced from
// Lo to Hi (inclusive) in steps of Stride. A table with no ranges yields a nil
// slice.
func rangeTableToRuneSlice(rangetable *unicode.RangeTable) []rune {
	var rtv []rune
	for _, r := range rangetable.R16 {
		// The counter is widened to uint32 on purpose: with a uint16 counter,
		// stepping past a range that ends at (or near) 0xFFFF wraps around to 0,
		// the 'c <= Hi' condition stays true, and the loop never terminates.
		for c := uint32(r.Lo); c <= uint32(r.Hi); c += uint32(r.Stride) {
			rtv = append(rtv, rune(c))
		}
	}
	for _, r := range rangetable.R32 {
		// Same reasoning as above, one size up: uint64 cannot wrap for any
		// uint32 Hi/Stride combination.
		for c := uint64(r.Lo); c <= uint64(r.Hi); c += uint64(r.Stride) {
			rtv = append(rtv, rune(c))
		}
	}
	return rtv
}
|
||||||
|
|
||||||
|
// unicodeCharClassToRange converts the given unicode character class name into a list of characters in that class.
|
||||||
|
// This class could also be a single letter eg. 'C'.
|
||||||
|
func unicodeCharClassToRange(class string) ([]rune, error) {
|
||||||
|
if len(class) == 0 {
|
||||||
|
return nil, fmt.Errorf("empty unicode character class")
|
||||||
|
}
|
||||||
|
if len(class) == 1 || len(class) == 2 {
|
||||||
|
if rangeTable, ok := unicode.Categories[class]; ok {
|
||||||
|
return rangeTableToRuneSlice(rangeTable), nil
|
||||||
|
} else {
|
||||||
|
return nil, fmt.Errorf("invalid short unicode character class")
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if rangeTable, ok := unicode.Scripts[class]; ok {
|
||||||
|
return rangeTableToRuneSlice(rangeTable), nil
|
||||||
|
} else {
|
||||||
|
return nil, fmt.Errorf("invalid long unicode character class")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Stores whether the case-insensitive flag has been enabled.
|
// Stores whether the case-insensitive flag has been enabled.
|
||||||
var caseInsensitive bool
|
var caseInsensitive bool
|
||||||
|
|
||||||
@@ -309,10 +351,30 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
|||||||
}
|
}
|
||||||
} else if isHex(re_runes[i]) {
|
} else if isHex(re_runes[i]) {
|
||||||
re_postfix = append(re_postfix, re_runes[i:i+2]...)
|
re_postfix = append(re_postfix, re_runes[i:i+2]...)
|
||||||
i += 2
|
i += 1 // I don't skip forward 2 steps, because the second step will happen with the loop increment
|
||||||
} else {
|
} else {
|
||||||
return nil, fmt.Errorf("invalid hex value in expression")
|
return nil, fmt.Errorf("invalid hex value in expression")
|
||||||
}
|
}
|
||||||
|
} else if re_runes[i] == 'p' || re_runes[i] == 'P' { // Unicode character class (P is negated unicode charclass)
|
||||||
|
re_postfix = append(re_postfix, re_runes[i])
|
||||||
|
i++
|
||||||
|
if i >= len(re_runes) {
|
||||||
|
return nil, fmt.Errorf("error parsing unicode character class in expression")
|
||||||
|
}
|
||||||
|
if re_runes[i] == '{' { // Full name charclass
|
||||||
|
for re_runes[i] != '}' {
|
||||||
|
re_postfix = append(re_postfix, re_runes[i])
|
||||||
|
i++
|
||||||
|
}
|
||||||
|
re_postfix = append(re_postfix, re_runes[i])
|
||||||
|
i++
|
||||||
|
} else if isUnicodeCharClassLetter(re_runes[i]) {
|
||||||
|
re_postfix = append(re_postfix, re_runes[i])
|
||||||
|
i++
|
||||||
|
} else {
|
||||||
|
return nil, fmt.Errorf("error parsing unicode character class in expression")
|
||||||
|
}
|
||||||
|
i-- // The loop increment at the top will move us forward
|
||||||
} else if re_runes[i] == '0' { // Start of octal value
|
} else if re_runes[i] == '0' { // Start of octal value
|
||||||
numDigits := 1
|
numDigits := 1
|
||||||
for i+numDigits < len(re_runes) && numDigits < 4 && isOctal(re_runes[i+numDigits]) { // Skip while we see an octal character (max of 4, starting with 0)
|
for i+numDigits < len(re_runes) && numDigits < 4 && isOctal(re_runes[i+numDigits]) { // Skip while we see an octal character (max of 4, starting with 0)
|
||||||
@@ -429,6 +491,39 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
|||||||
} else {
|
} else {
|
||||||
return nil, fmt.Errorf("not enough hex characters found in expression")
|
return nil, fmt.Errorf("not enough hex characters found in expression")
|
||||||
}
|
}
|
||||||
|
} else if re_postfix[i] == 'p' || re_postfix[i] == 'P' {
|
||||||
|
charClassInverted := (re_postfix[i] == 'P')
|
||||||
|
charsInClass := []rune{}
|
||||||
|
i++
|
||||||
|
if isUnicodeCharClassLetter(re_postfix[i]) {
|
||||||
|
var err error
|
||||||
|
charsInClass, err = unicodeCharClassToRange(string(re_postfix[i]))
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
} else if re_postfix[i] == '{' {
|
||||||
|
i++ // Skip opening bracket
|
||||||
|
unicodeCharClassStr := ""
|
||||||
|
for re_postfix[i] != '}' {
|
||||||
|
unicodeCharClassStr += string(re_postfix[i])
|
||||||
|
i++
|
||||||
|
}
|
||||||
|
var err error
|
||||||
|
charsInClass, err = unicodeCharClassToRange(unicodeCharClassStr)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
return nil, fmt.Errorf("error parsing unicode character class in expression")
|
||||||
|
}
|
||||||
|
var toAppend postfixNode
|
||||||
|
if !charClassInverted { // \p
|
||||||
|
toAppend = newPostfixNode(charsInClass...)
|
||||||
|
} else { // \P
|
||||||
|
toAppend = newPostfixDotNode()
|
||||||
|
toAppend.except = append([]postfixNode{}, newPostfixNode(charsInClass...))
|
||||||
|
}
|
||||||
|
outQueue = append(outQueue, toAppend)
|
||||||
} else if re_postfix[i] == '0' { // Octal value
|
} else if re_postfix[i] == '0' { // Octal value
|
||||||
var octVal int64
|
var octVal int64
|
||||||
var octValStr string
|
var octValStr string
|
||||||
@@ -611,7 +706,40 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
|||||||
} else {
|
} else {
|
||||||
return nil, fmt.Errorf("not enough hex characters found in character class")
|
return nil, fmt.Errorf("not enough hex characters found in character class")
|
||||||
}
|
}
|
||||||
|
} else if re_postfix[i] == 'p' || re_postfix[i] == 'P' {
|
||||||
|
charClassInverted := (re_postfix[i] == 'P')
|
||||||
|
charsInList := []rune{}
|
||||||
|
i++
|
||||||
|
if isUnicodeCharClassLetter(re_postfix[i]) {
|
||||||
|
var err error
|
||||||
|
charsInList, err = unicodeCharClassToRange(string(re_postfix[i]))
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
} else if re_postfix[i] == '{' {
|
||||||
|
i++ // Skip opening bracket
|
||||||
|
unicodeCharClassStr := ""
|
||||||
|
for re_postfix[i] != '}' {
|
||||||
|
unicodeCharClassStr += string(re_postfix[i])
|
||||||
|
i++
|
||||||
|
}
|
||||||
|
var err error
|
||||||
|
charsInList, err = unicodeCharClassToRange(unicodeCharClassStr)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
return nil, fmt.Errorf("error parsing unicode character class in expression")
|
||||||
|
}
|
||||||
|
if !charClassInverted {
|
||||||
|
chars = append(chars, newPostfixNode(charsInList...))
|
||||||
|
} else {
|
||||||
|
toAppend := newPostfixDotNode()
|
||||||
|
toAppend.except = append([]postfixNode{}, newPostfixNode(charsInList...))
|
||||||
|
chars = append(chars, toAppend)
|
||||||
|
}
|
||||||
} else if re_postfix[i] == '0' { // Octal value
|
} else if re_postfix[i] == '0' { // Octal value
|
||||||
|
|
||||||
var octVal int64
|
var octVal int64
|
||||||
var octValStr string
|
var octValStr string
|
||||||
numDigitsParsed := 0
|
numDigitsParsed := 0
|
||||||
|
@@ -153,9 +153,8 @@ returns the 0-group.
|
|||||||
The following features from [regexp] are (currently) NOT supported:
|
The following features from [regexp] are (currently) NOT supported:
|
||||||
1. Named capturing groups
|
1. Named capturing groups
|
||||||
2. Non-greedy operators
|
2. Non-greedy operators
|
||||||
3. Unicode character classes
|
3. Embedded flags (flags are instead passed as arguments to [Compile])
|
||||||
4. Embedded flags (flags are instead passed as arguments to [Compile])
|
4. Literal text with \Q ... \E
|
||||||
5. Literal text with \Q ... \E
|
|
||||||
|
|
||||||
The following features are not available in [regexp], but are supported in my engine:
|
The following features are not available in [regexp], but are supported in my engine:
|
||||||
1. Lookarounds
|
1. Lookarounds
|
||||||
|
@@ -430,6 +430,7 @@ var reTests = []struct {
|
|||||||
{`^(.+)?B`, []ReFlag{RE_CASE_INSENSITIVE}, `ab`, []Group{{0, 2}}},
|
{`^(.+)?B`, []ReFlag{RE_CASE_INSENSITIVE}, `ab`, []Group{{0, 2}}},
|
||||||
{`\0009`, []ReFlag{RE_CASE_INSENSITIVE}, "\x009", []Group{{0, 2}}},
|
{`\0009`, []ReFlag{RE_CASE_INSENSITIVE}, "\x009", []Group{{0, 2}}},
|
||||||
{`\0141`, []ReFlag{RE_CASE_INSENSITIVE}, "A", []Group{{0, 1}}},
|
{`\0141`, []ReFlag{RE_CASE_INSENSITIVE}, "A", []Group{{0, 1}}},
|
||||||
|
{`\0141\0141`, []ReFlag{RE_CASE_INSENSITIVE}, "AA", []Group{{0, 2}}},
|
||||||
|
|
||||||
{`a[-]?c`, []ReFlag{RE_CASE_INSENSITIVE}, `AC`, []Group{{0, 2}}},
|
{`a[-]?c`, []ReFlag{RE_CASE_INSENSITIVE}, `AC`, []Group{{0, 2}}},
|
||||||
|
|
||||||
@@ -460,8 +461,10 @@ var reTests = []struct {
|
|||||||
{`[\D5]+`, nil, `1234abc5678`, []Group{{4, 8}}},
|
{`[\D5]+`, nil, `1234abc5678`, []Group{{4, 8}}},
|
||||||
{`[\da-fA-F]+`, nil, `123abc`, []Group{{0, 6}}},
|
{`[\da-fA-F]+`, nil, `123abc`, []Group{{0, 6}}},
|
||||||
{`\xff`, nil, "\u00ff", []Group{{0, 1}}},
|
{`\xff`, nil, "\u00ff", []Group{{0, 1}}},
|
||||||
|
{`\xff+`, nil, "\u00ff\u00ff", []Group{{0, 2}}},
|
||||||
{`\xFF`, nil, "\u00ff", []Group{{0, 1}}},
|
{`\xFF`, nil, "\u00ff", []Group{{0, 1}}},
|
||||||
{`\x00ff`, nil, "\u00ff", []Group{}},
|
{`\x00ff`, nil, "\u00ff", []Group{}},
|
||||||
|
{`\x{0000ff}+`, nil, "\u00ff\u00ff", []Group{{0, 2}}},
|
||||||
{`\x{0000ff}`, nil, "\u00ff", []Group{{0, 1}}},
|
{`\x{0000ff}`, nil, "\u00ff", []Group{{0, 1}}},
|
||||||
{`\x{0000FF}`, nil, "\u00ff", []Group{{0, 1}}},
|
{`\x{0000FF}`, nil, "\u00ff", []Group{{0, 1}}},
|
||||||
{"\t\n\v\r\f\a", nil, "\t\n\v\r\f\a", []Group{{0, 6}}},
|
{"\t\n\v\r\f\a", nil, "\t\n\v\r\f\a", []Group{{0, 6}}},
|
||||||
@@ -516,6 +519,14 @@ var reTests = []struct {
|
|||||||
{`<389-400`, nil, `-`, nil},
|
{`<389-400`, nil, `-`, nil},
|
||||||
{`<389-400>`, nil, `391`, []Group{{0, 3}}},
|
{`<389-400>`, nil, `391`, []Group{{0, 3}}},
|
||||||
{`\b<1-10000>\b`, nil, `America declared independence in 1776.`, []Group{{33, 37}}},
|
{`\b<1-10000>\b`, nil, `America declared independence in 1776.`, []Group{{33, 37}}},
|
||||||
|
|
||||||
|
{`\p{Tamil}+`, nil, `உயிரெழுத்து`, []Group{{0, 11}}}, // Each letter and matra is counted as a separate rune, so 'u', 'ya', 'e (matra), 'ra', 'e (matra)', 'zha', (oo (matra), 'tha', 'ith', 'tha', 'oo (matra)'.
|
||||||
|
{`\P{Tamil}+`, nil, `vowel=உயிரெழுத்து`, []Group{{0, 6}}},
|
||||||
|
{`\P`, nil, `உயிரெழுத்து`, nil},
|
||||||
|
{`\PM\pM*`, nil, `உயிரெழுத்து`, []Group{{0, 1}, {1, 3}, {3, 5}, {5, 7}, {7, 9}, {9, 11}}},
|
||||||
|
{`\pN+`, nil, `123abc456def`, []Group{{0, 3}, {6, 9}}},
|
||||||
|
{`\PN+`, nil, `123abc456def`, []Group{{3, 6}, {9, 12}}},
|
||||||
|
{`[\p{Greek}\p{Cyrillic}]`, nil, `ΣωШД`, []Group{{0, 1}, {1, 2}, {2, 3}, {3, 4}}},
|
||||||
}
|
}
|
||||||
|
|
||||||
var groupTests = []struct {
|
var groupTests = []struct {
|
||||||
|
Reference in New Issue
Block a user