Merge pull request 'Implement Unicode character classes' (#4) from implementUnicodeCharClass into master
Reviewed-on: #4
This commit is contained in:
		
							
								
								
									
										12
									
								
								cmd/main.go
									
									
									
									
									
								
							
							
						
						
									
										12
									
								
								cmd/main.go
									
									
									
									
									
								
							| @@ -129,6 +129,8 @@ func main() { | ||||
| 			matchIndices = regComp.FindAllSubmatch(test_str) | ||||
| 		} | ||||
|  | ||||
| 		test_str_runes := []rune(test_str) // Converting to runes preserves unicode characters | ||||
|  | ||||
| 		if *printMatchesFlag { | ||||
| 			// if we are in single line mode, print the line on which | ||||
| 			// the matches occur | ||||
| @@ -158,10 +160,10 @@ func main() { | ||||
| 			oldIndices := indicesToPrint.values() | ||||
| 			indicesToPrint = new_uniq_arr[int]() | ||||
| 			// Explanation: | ||||
| 			// Find all numbers from 0 to len(test_str) that are NOT in oldIndices. | ||||
| 			// Find all numbers from 0 to len(test_str_runes) that are NOT in oldIndices. | ||||
| 			// These are the values we want to print, now that we have inverted the match. | ||||
| 			// Re-initialize indicesToPrint and add all of these values to it. | ||||
| 			indicesToPrint.add(setDifference(genRange(0, len(test_str)), oldIndices)...) | ||||
| 			indicesToPrint.add(setDifference(genRange(0, len(test_str_runes)), oldIndices)...) | ||||
|  | ||||
| 		} | ||||
| 		// If lineFlag is enabled, we should only print something if: | ||||
| @@ -182,7 +184,7 @@ func main() { | ||||
| 		//			the corresponding end index. | ||||
| 		// 		3. If not, just print the character. | ||||
| 		if substituteFlagEnabled { | ||||
| 			for i := range test_str { | ||||
| 			for i := range test_str_runes { | ||||
| 				inMatchIndex := false | ||||
| 				for _, m := range matchIndices { | ||||
| 					if i == m[0].StartIdx { | ||||
| @@ -193,11 +195,11 @@ func main() { | ||||
| 					} | ||||
| 				} | ||||
| 				if !inMatchIndex { | ||||
| 					fmt.Fprintf(out, "%c", test_str[i]) | ||||
| 					fmt.Fprintf(out, "%c", test_str_runes[i]) | ||||
| 				} | ||||
| 			} | ||||
| 		} else { | ||||
| 			for i, c := range test_str { | ||||
| 			for i, c := range test_str_runes { | ||||
| 				if indicesToPrint.contains(i) { | ||||
| 					color.New(color.FgRed).Fprintf(out, "%c", c) | ||||
| 					// Newline after every match - only if -o is enabled and -v is disabled. | ||||
|   | ||||
							
								
								
									
										130
									
								
								regex/compile.go
									
									
									
									
									
								
							
							
						
						
									
										130
									
								
								regex/compile.go
									
									
									
									
									
								
							| @@ -108,6 +108,48 @@ func getPOSIXClass(str []rune) (bool, string) { | ||||
| 	return true, rtv | ||||
| } | ||||
|  | ||||
// isUnicodeCharClassLetter reports whether c is one of the single-letter names
// accepted for a unicode character class: 'L', 'M', 'S', 'N', 'P', 'C' or 'Z'.
// The comparison is case-sensitive, so lowercase letters are rejected.
func isUnicodeCharClassLetter(c rune) bool {
	// A switch over the fixed set avoids building a slice on every call.
	switch c {
	case 'L', 'M', 'S', 'N', 'P', 'C', 'Z':
		return true
	default:
		return false
	}
}
|  | ||||
// rangeTableToRuneSlice expands the given range table into a slice containing
// every rune described by its 16-bit and 32-bit ranges, and returns that slice.
// Runes appear in table order: all R16 ranges first, then all R32 ranges.
// A nil table yields a nil slice.
func rangeTableToRuneSlice(rangetable *unicode.RangeTable) []rune {
	if rangetable == nil {
		return nil
	}
	var rtv []rune
	for _, r := range rangetable.R16 {
		rtv = appendStridedRunes(rtv, int64(r.Lo), int64(r.Hi), int64(r.Stride))
	}
	for _, r := range rangetable.R32 {
		rtv = appendStridedRunes(rtv, int64(r.Lo), int64(r.Hi), int64(r.Stride))
	}
	return rtv
}

// appendStridedRunes appends lo, lo+stride, lo+2*stride, ... (up to and including hi)
// to dst and returns the extended slice.
//
// The walk is done with int64 values on purpose: stepping a uint16 or uint32
// counter past its maximum wraps around to a small value, which is again <= hi,
// so a range whose last step crosses the type's limit would never terminate.
func appendStridedRunes(dst []rune, lo, hi, stride int64) []rune {
	if stride <= 0 {
		// A zero stride can never advance; emit the first element only
		// (when the range is non-empty) instead of looping forever.
		if lo <= hi {
			dst = append(dst, rune(lo))
		}
		return dst
	}
	for c := lo; c <= hi; c += stride {
		dst = append(dst, rune(c))
	}
	return dst
}
|  | ||||
| // unicodeCharClassToRange converts the given unicode character class name into a list of characters in that class. | ||||
| // This class could also be a single letter eg. 'C'. | ||||
| func unicodeCharClassToRange(class string) ([]rune, error) { | ||||
| 	if len(class) == 0 { | ||||
| 		return nil, fmt.Errorf("empty unicode character class") | ||||
| 	} | ||||
| 	if len(class) == 1 || len(class) == 2 { | ||||
| 		if rangeTable, ok := unicode.Categories[class]; ok { | ||||
| 			return rangeTableToRuneSlice(rangeTable), nil | ||||
| 		} else { | ||||
| 			return nil, fmt.Errorf("invalid short unicode character class") | ||||
| 		} | ||||
| 	} else { | ||||
| 		if rangeTable, ok := unicode.Scripts[class]; ok { | ||||
| 			return rangeTableToRuneSlice(rangeTable), nil | ||||
| 		} else { | ||||
| 			return nil, fmt.Errorf("invalid long unicode character class") | ||||
| 		} | ||||
| 	} | ||||
| } | ||||
|  | ||||
| // Stores whether the case-insensitive flag has been enabled. | ||||
| var caseInsensitive bool | ||||
|  | ||||
| @@ -309,10 +351,30 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) { | ||||
| 					} | ||||
| 				} else if isHex(re_runes[i]) { | ||||
| 					re_postfix = append(re_postfix, re_runes[i:i+2]...) | ||||
| 					i += 2 | ||||
| 					i += 1 // I don't skip forward 2 steps, because the second step will happen with the loop increment | ||||
| 				} else { | ||||
| 					return nil, fmt.Errorf("invalid hex value in expression") | ||||
| 				} | ||||
| 			} else if re_runes[i] == 'p' || re_runes[i] == 'P' { // Unicode character class (P is negated unicode charclass) | ||||
| 				re_postfix = append(re_postfix, re_runes[i]) | ||||
| 				i++ | ||||
| 				if i >= len(re_runes) { | ||||
| 					return nil, fmt.Errorf("error parsing unicode character class in expression") | ||||
| 				} | ||||
| 				if re_runes[i] == '{' { // Full name charclass | ||||
| 					for re_runes[i] != '}' { | ||||
| 						re_postfix = append(re_postfix, re_runes[i]) | ||||
| 						i++ | ||||
| 					} | ||||
| 					re_postfix = append(re_postfix, re_runes[i]) | ||||
| 					i++ | ||||
| 				} else if isUnicodeCharClassLetter(re_runes[i]) { | ||||
| 					re_postfix = append(re_postfix, re_runes[i]) | ||||
| 					i++ | ||||
| 				} else { | ||||
| 					return nil, fmt.Errorf("error parsing unicode character class in expression") | ||||
| 				} | ||||
| 				i-- // The loop increment at the top will move us forward | ||||
| 			} else if re_runes[i] == '0' { // Start of octal value | ||||
| 				numDigits := 1 | ||||
| 				for i+numDigits < len(re_runes) && numDigits < 4 && isOctal(re_runes[i+numDigits]) { // Skip while we see an octal character (max of 4, starting with 0) | ||||
| @@ -429,6 +491,39 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) { | ||||
| 				} else { | ||||
| 					return nil, fmt.Errorf("not enough hex characters found in expression") | ||||
| 				} | ||||
| 			} else if re_postfix[i] == 'p' || re_postfix[i] == 'P' { | ||||
| 				charClassInverted := (re_postfix[i] == 'P') | ||||
| 				charsInClass := []rune{} | ||||
| 				i++ | ||||
| 				if isUnicodeCharClassLetter(re_postfix[i]) { | ||||
| 					var err error | ||||
| 					charsInClass, err = unicodeCharClassToRange(string(re_postfix[i])) | ||||
| 					if err != nil { | ||||
| 						return nil, err | ||||
| 					} | ||||
| 				} else if re_postfix[i] == '{' { | ||||
| 					i++ // Skip opening bracket | ||||
| 					unicodeCharClassStr := "" | ||||
| 					for re_postfix[i] != '}' { | ||||
| 						unicodeCharClassStr += string(re_postfix[i]) | ||||
| 						i++ | ||||
| 					} | ||||
| 					var err error | ||||
| 					charsInClass, err = unicodeCharClassToRange(unicodeCharClassStr) | ||||
| 					if err != nil { | ||||
| 						return nil, err | ||||
| 					} | ||||
| 				} else { | ||||
| 					return nil, fmt.Errorf("error parsing unicode character class in expression") | ||||
| 				} | ||||
| 				var toAppend postfixNode | ||||
| 				if !charClassInverted { // \p | ||||
| 					toAppend = newPostfixNode(charsInClass...) | ||||
| 				} else { // \P | ||||
| 					toAppend = newPostfixDotNode() | ||||
| 					toAppend.except = append([]postfixNode{}, newPostfixNode(charsInClass...)) | ||||
| 				} | ||||
| 				outQueue = append(outQueue, toAppend) | ||||
| 			} else if re_postfix[i] == '0' { // Octal value | ||||
| 				var octVal int64 | ||||
| 				var octValStr string | ||||
| @@ -611,7 +706,40 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) { | ||||
| 						} else { | ||||
| 							return nil, fmt.Errorf("not enough hex characters found in character class") | ||||
| 						} | ||||
| 					} else if re_postfix[i] == 'p' || re_postfix[i] == 'P' { | ||||
| 						charClassInverted := (re_postfix[i] == 'P') | ||||
| 						charsInList := []rune{} | ||||
| 						i++ | ||||
| 						if isUnicodeCharClassLetter(re_postfix[i]) { | ||||
| 							var err error | ||||
| 							charsInList, err = unicodeCharClassToRange(string(re_postfix[i])) | ||||
| 							if err != nil { | ||||
| 								return nil, err | ||||
| 							} | ||||
| 						} else if re_postfix[i] == '{' { | ||||
| 							i++ // Skip opening bracket | ||||
| 							unicodeCharClassStr := "" | ||||
| 							for re_postfix[i] != '}' { | ||||
| 								unicodeCharClassStr += string(re_postfix[i]) | ||||
| 								i++ | ||||
| 							} | ||||
| 							var err error | ||||
| 							charsInList, err = unicodeCharClassToRange(unicodeCharClassStr) | ||||
| 							if err != nil { | ||||
| 								return nil, err | ||||
| 							} | ||||
| 						} else { | ||||
| 							return nil, fmt.Errorf("error parsing unicode character class in expression") | ||||
| 						} | ||||
| 						if !charClassInverted { | ||||
| 							chars = append(chars, newPostfixNode(charsInList...)) | ||||
| 						} else { | ||||
| 							toAppend := newPostfixDotNode() | ||||
| 							toAppend.except = append([]postfixNode{}, newPostfixNode(charsInList...)) | ||||
| 							chars = append(chars, toAppend) | ||||
| 						} | ||||
| 					} else if re_postfix[i] == '0' { // Octal value | ||||
|  | ||||
| 						var octVal int64 | ||||
| 						var octValStr string | ||||
| 						numDigitsParsed := 0 | ||||
|   | ||||
| @@ -153,9 +153,8 @@ returns the 0-group. | ||||
| The following features from [regexp] are (currently) NOT supported: | ||||
|  1. Named capturing groups | ||||
|  2. Non-greedy operators | ||||
|  3. Unicode character classes | ||||
|  4. Embedded flags (flags are instead passed as arguments to [Compile]) | ||||
|  5. Literal text with \Q ... \E | ||||
|  3. Embedded flags (flags are instead passed as arguments to [Compile]) | ||||
|  4. Literal text with \Q ... \E | ||||
|  | ||||
| The following features are not available in [regexp], but are supported in my engine: | ||||
|  1. Lookarounds | ||||
|   | ||||
| @@ -430,6 +430,7 @@ var reTests = []struct { | ||||
| 	{`^(.+)?B`, []ReFlag{RE_CASE_INSENSITIVE}, `ab`, []Group{{0, 2}}}, | ||||
| 	{`\0009`, []ReFlag{RE_CASE_INSENSITIVE}, "\x009", []Group{{0, 2}}}, | ||||
| 	{`\0141`, []ReFlag{RE_CASE_INSENSITIVE}, "A", []Group{{0, 1}}}, | ||||
| 	{`\0141\0141`, []ReFlag{RE_CASE_INSENSITIVE}, "AA", []Group{{0, 2}}}, | ||||
|  | ||||
| 	{`a[-]?c`, []ReFlag{RE_CASE_INSENSITIVE}, `AC`, []Group{{0, 2}}}, | ||||
|  | ||||
| @@ -460,8 +461,10 @@ var reTests = []struct { | ||||
| 	{`[\D5]+`, nil, `1234abc5678`, []Group{{4, 8}}}, | ||||
| 	{`[\da-fA-F]+`, nil, `123abc`, []Group{{0, 6}}}, | ||||
| 	{`\xff`, nil, "\u00ff", []Group{{0, 1}}}, | ||||
| 	{`\xff+`, nil, "\u00ff\u00ff", []Group{{0, 2}}}, | ||||
| 	{`\xFF`, nil, "\u00ff", []Group{{0, 1}}}, | ||||
| 	{`\x00ff`, nil, "\u00ff", []Group{}}, | ||||
| 	{`\x{0000ff}+`, nil, "\u00ff\u00ff", []Group{{0, 2}}}, | ||||
| 	{`\x{0000ff}`, nil, "\u00ff", []Group{{0, 1}}}, | ||||
| 	{`\x{0000FF}`, nil, "\u00ff", []Group{{0, 1}}}, | ||||
| 	{"\t\n\v\r\f\a", nil, "\t\n\v\r\f\a", []Group{{0, 6}}}, | ||||
| @@ -516,6 +519,14 @@ var reTests = []struct { | ||||
| 	{`<389-400`, nil, `-`, nil}, | ||||
| 	{`<389-400>`, nil, `391`, []Group{{0, 3}}}, | ||||
| 	{`\b<1-10000>\b`, nil, `America declared independence in 1776.`, []Group{{33, 37}}}, | ||||
|  | ||||
| 	{`\p{Tamil}+`, nil, `உயிரெழுத்து`, []Group{{0, 11}}}, // Each letter and matra is counted as a separate rune, so 'u', 'ya', 'e (matra), 'ra', 'e (matra)', 'zha', (oo (matra), 'tha', 'ith', 'tha', 'oo (matra)'. | ||||
| 	{`\P{Tamil}+`, nil, `vowel=உயிரெழுத்து`, []Group{{0, 6}}}, | ||||
| 	{`\P`, nil, `உயிரெழுத்து`, nil}, | ||||
| 	{`\PM\pM*`, nil, `உயிரெழுத்து`, []Group{{0, 1}, {1, 3}, {3, 5}, {5, 7}, {7, 9}, {9, 11}}}, | ||||
| 	{`\pN+`, nil, `123abc456def`, []Group{{0, 3}, {6, 9}}}, | ||||
| 	{`\PN+`, nil, `123abc456def`, []Group{{3, 6}, {9, 12}}}, | ||||
| 	{`[\p{Greek}\p{Cyrillic}]`, nil, `ΣωШД`, []Group{{0, 1}, {1, 2}, {2, 3}, {3, 4}}}, | ||||
| } | ||||
|  | ||||
| var groupTests = []struct { | ||||
|   | ||||
		Reference in New Issue
	
	Block a user