Compare commits
33 Commits
implementB
...
e79c19a929
| Author | SHA1 | Date | |
|---|---|---|---|
| e79c19a929 | |||
| d2bce37935 | |||
| bb3b866b77 | |||
| e07f27dc78 | |||
| 65d2317f79 | |||
| a631fc289c | |||
| d62a429cce | |||
| 7b31031553 | |||
| 38c842cb07 | |||
| 9f9af36be8 | |||
| 8217b67122 | |||
| 1f06dcef64 | |||
| 119475b41b | |||
| 6151cc8cf6 | |||
| 3eaf4eb19c | |||
| d453815831 | |||
| 3a2916baae | |||
| 9d6344719f | |||
| f5c868566b | |||
| 1cd6da218f | |||
| 277cbc0fc5 | |||
| 3924502b72 | |||
| 36b009747b | |||
| 6cd0a10a8f | |||
| 69fb96c43d | |||
| 46bc0c8529 | |||
| 1a890a1e75 | |||
| fde3784e5a | |||
| 7045711860 | |||
| d4d606d95b | |||
| 9cd330e521 | |||
| 44d6a2005c | |||
| f76cd6c3d9 |
17
README.md
Normal file
17
README.md
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
## Kleingrep
|
||||||
|
|
||||||
|
Kleingrep is a regular expression engine, providing a library and command-line tool written in Go.
|
||||||
|
|
||||||
|
It aims to provide a more featureful engine, compared to the one in Go's
|
||||||
|
[regexp](https://pkg.go.dev/regexp), while retaining some semblance of efficiency.
|
||||||
|
|
||||||
|
The engine does __not__ use backtracking, relying on the NFA-based method described in
|
||||||
|
[Russ Cox's articles](https://swtch.com/~rsc/regexp). As such, it is immune to catastrophic backtracking.
|
||||||
|
|
||||||
|
It also includes features not present in regexp, such as lookarounds and backreferences.
|
||||||
|
|
||||||
|
### Syntax
|
||||||
|
|
||||||
|
The syntax is, for the most part, a superset of Go's regexp. A full overview of the syntax can be found [here](https://pkg.go.dev/gitea.twomorecents.org/Rockingcool/kleingrep/regex#hdr-Syntax).
|
||||||
|
|
||||||
|
__For more information, see https://pkg.go.dev/gitea.twomorecents.org/Rockingcool/kleingrep/regex__.
|
||||||
44
cmd/main.go
44
cmd/main.go
@@ -64,18 +64,30 @@ func main() {
|
|||||||
// 2. Build NFA from postfix representation (Thompson's algorithm)
|
// 2. Build NFA from postfix representation (Thompson's algorithm)
|
||||||
// 3. Run the string against the NFA
|
// 3. Run the string against the NFA
|
||||||
|
|
||||||
if len(flag.Args()) != 1 { // flag.Args() also strips out program name
|
if len(flag.Args()) < 1 || len(flag.Args()) > 2 { // flag.Args() also strips out program name
|
||||||
fmt.Println("ERROR: Missing cmdline args")
|
fmt.Println("ERROR: Missing cmdline args")
|
||||||
os.Exit(22)
|
os.Exit(22)
|
||||||
}
|
}
|
||||||
var re string
|
var re string
|
||||||
re = flag.Args()[0]
|
re = flag.Args()[0]
|
||||||
|
var inputFile *os.File
|
||||||
|
if len(flag.Args()) == 1 || flag.Args()[1] == "-" { // Either no file argument, or file argument is "-"
|
||||||
|
inputFile = os.Stdin
|
||||||
|
} else {
|
||||||
|
var err error
|
||||||
|
inputFile, err = os.Open(flag.Args()[1])
|
||||||
|
if err != nil {
|
||||||
|
fmt.Printf("%s: No such file or directory\n", flag.Args()[1])
|
||||||
|
os.Exit(2)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
var test_str string
|
var test_str string
|
||||||
var err error
|
var err error
|
||||||
var linesRead bool // Whether or not we have read the lines in the file
|
var linesRead bool // Whether or not we have read the lines in the file
|
||||||
lineNum := 0 // Current line number
|
lineNum := 0 // Current line number
|
||||||
// Create reader for stdin and writer for stdout
|
// Create reader for the input (the given file, or stdin) and writer for stdout
|
||||||
reader := bufio.NewReader(os.Stdin)
|
reader := bufio.NewReader(inputFile)
|
||||||
out := bufio.NewWriter(os.Stdout)
|
out := bufio.NewWriter(os.Stdout)
|
||||||
|
|
||||||
regComp, err := reg.Compile(re, flagsToCompile...)
|
regComp, err := reg.Compile(re, flagsToCompile...)
|
||||||
@@ -129,6 +141,8 @@ func main() {
|
|||||||
matchIndices = regComp.FindAllSubmatch(test_str)
|
matchIndices = regComp.FindAllSubmatch(test_str)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
test_str_runes := []rune(test_str) // Converting to runes preserves unicode characters
|
||||||
|
|
||||||
if *printMatchesFlag {
|
if *printMatchesFlag {
|
||||||
// if we are in single line mode, print the line on which
|
// if we are in single line mode, print the line on which
|
||||||
// the matches occur
|
// the matches occur
|
||||||
@@ -158,10 +172,10 @@ func main() {
|
|||||||
oldIndices := indicesToPrint.values()
|
oldIndices := indicesToPrint.values()
|
||||||
indicesToPrint = new_uniq_arr[int]()
|
indicesToPrint = new_uniq_arr[int]()
|
||||||
// Explanation:
|
// Explanation:
|
||||||
// Find all numbers from 0 to len(test_str) that are NOT in oldIndices.
|
// Find all numbers from 0 to len(test_str_runes) that are NOT in oldIndices.
|
||||||
// These are the values we want to print, now that we have inverted the match.
|
// These are the values we want to print, now that we have inverted the match.
|
||||||
// Re-initialize indicesToPrint and add all of these values to it.
|
// Re-initialize indicesToPrint and add all of these values to it.
|
||||||
indicesToPrint.add(setDifference(genRange(0, len(test_str)), oldIndices)...)
|
indicesToPrint.add(setDifference(genRange(0, len(test_str_runes)), oldIndices)...)
|
||||||
|
|
||||||
}
|
}
|
||||||
// If lineFlag is enabled, we should only print something if:
|
// If lineFlag is enabled, we should only print something if:
|
||||||
@@ -182,7 +196,7 @@ func main() {
|
|||||||
// the corresponding end index.
|
// the corresponding end index.
|
||||||
// 3. If not, just print the character.
|
// 3. If not, just print the character.
|
||||||
if substituteFlagEnabled {
|
if substituteFlagEnabled {
|
||||||
for i := range test_str {
|
for i := range test_str_runes {
|
||||||
inMatchIndex := false
|
inMatchIndex := false
|
||||||
for _, m := range matchIndices {
|
for _, m := range matchIndices {
|
||||||
if i == m[0].StartIdx {
|
if i == m[0].StartIdx {
|
||||||
@@ -193,19 +207,21 @@ func main() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !inMatchIndex {
|
if !inMatchIndex {
|
||||||
fmt.Fprintf(out, "%c", test_str[i])
|
fmt.Fprintf(out, "%c", test_str_runes[i])
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for i, c := range test_str {
|
for i, c := range test_str_runes {
|
||||||
if indicesToPrint.contains(i) {
|
if indicesToPrint.contains(i) {
|
||||||
color.New(color.FgRed).Fprintf(out, "%c", c)
|
color.New(color.FgRed).Fprintf(out, "%c", c)
|
||||||
// Newline after every match - only if -o is enabled and -v is disabled.
|
// Newline after every match - only if -o is enabled and -v is disabled.
|
||||||
if *onlyFlag && !(*invertFlag) {
|
if *onlyFlag && !(*invertFlag) {
|
||||||
for _, idx := range matchIndices {
|
for matchIdxNum, idx := range matchIndices {
|
||||||
if i+1 == idx[0].EndIdx { // End index is one more than last index of match
|
if matchIdxNum < len(matchIndices)-1 { // Only print a newline afte printing a match, if there are multiple matches on the line, and we aren't on the last one. This is because the newline that gets added at the end will take care of that.
|
||||||
fmt.Fprintf(out, "\n")
|
if i+1 == idx[0].EndIdx { // End index is one more than last index of match
|
||||||
break
|
fmt.Fprintf(out, "\n")
|
||||||
|
break
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -220,6 +236,10 @@ func main() {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
panic(err)
|
panic(err)
|
||||||
}
|
}
|
||||||
fmt.Println()
|
// If the last character in the string wasn't a newline, AND either -o isn't set, or it is set and we've matched something, then print a newline
|
||||||
|
if (len(test_str_runes) > 0 && test_str_runes[len(test_str_runes)-1] != '\n') &&
|
||||||
|
(!*onlyFlag || indicesToPrint.len() > 0) {
|
||||||
|
fmt.Println()
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -36,3 +36,7 @@ func (s uniq_arr[T]) values() []T {
|
|||||||
}
|
}
|
||||||
return toRet
|
return toRet
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (s uniq_arr[T]) len() int {
|
||||||
|
return len(s.backingMap)
|
||||||
|
}
|
||||||
|
|||||||
178
regex/compile.go
178
regex/compile.go
@@ -64,7 +64,7 @@ const (
|
|||||||
)
|
)
|
||||||
|
|
||||||
func isOperator(c rune) bool {
|
func isOperator(c rune) bool {
|
||||||
if c == '+' || c == '?' || c == '*' || c == '|' || c == concatRune {
|
if c == '+' || c == '?' || c == '*' || c == '|' || c == concatRune || c == lazyPlusRune || c == lazyKleeneRune || c == lazyQuestionRune {
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
return false
|
return false
|
||||||
@@ -72,7 +72,7 @@ func isOperator(c rune) bool {
|
|||||||
|
|
||||||
/* priority returns the priority of the given operator */
|
/* priority returns the priority of the given operator */
|
||||||
func priority(op rune) int {
|
func priority(op rune) int {
|
||||||
precedence := []rune{'|', concatRune, '+', '*', '?'}
|
precedence := []rune{'|', concatRune, '+', lazyPlusRune, '*', lazyKleeneRune, '?', lazyQuestionRune}
|
||||||
return slices.Index(precedence, op)
|
return slices.Index(precedence, op)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -108,6 +108,48 @@ func getPOSIXClass(str []rune) (bool, string) {
|
|||||||
return true, rtv
|
return true, rtv
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// isUnicodeCharClassLetter reports whether c is one of the single-letter
// unicode character class names: L, M, S, N, P, C or Z. The check is
// case-sensitive.
func isUnicodeCharClassLetter(c rune) bool {
	// A switch over the fixed set avoids building and scanning a slice on
	// every call.
	switch c {
	case 'L', 'M', 'S', 'N', 'P', 'C', 'Z':
		return true
	}
	return false
}
|
||||||
|
|
||||||
|
// rangeTableToRuneSlice expands the given range table into a rune slice and
// returns it. The code points of every R16 range come first, followed by
// those of every R32 range, each in table order. A nil table yields a nil
// slice.
func rangeTableToRuneSlice(rangetable *unicode.RangeTable) []rune {
	if rangetable == nil {
		return nil
	}
	var rtv []rune
	for _, r := range rangetable.R16 {
		rtv = appendRuneRange(rtv, uint64(r.Lo), uint64(r.Hi), uint64(r.Stride))
	}
	for _, r := range rangetable.R32 {
		rtv = appendRuneRange(rtv, uint64(r.Lo), uint64(r.Hi), uint64(r.Stride))
	}
	return rtv
}

// appendRuneRange appends lo, lo+stride, lo+2*stride, ... (not exceeding hi)
// to dst and returns the extended slice.
//
// The bounds are taken as uint64 so that the counter cannot wrap around when
// hi is the largest value of the original 16- or 32-bit field; iterating in
// the field's own width would never terminate in that case.
func appendRuneRange(dst []rune, lo, hi, stride uint64) []rune {
	if lo > hi {
		return dst
	}
	if stride == 0 {
		// A zero stride never advances past lo; emit it once rather than
		// looping forever.
		return append(dst, rune(lo))
	}
	for c := lo; c <= hi; c += stride {
		dst = append(dst, rune(c))
	}
	return dst
}
|
||||||
|
|
||||||
|
// unicodeCharClassToRange converts the given unicode character class name into a list of characters in that class.
|
||||||
|
// This class could also be a single letter eg. 'C'.
|
||||||
|
func unicodeCharClassToRange(class string) ([]rune, error) {
|
||||||
|
if len(class) == 0 {
|
||||||
|
return nil, fmt.Errorf("empty unicode character class")
|
||||||
|
}
|
||||||
|
if len(class) == 1 || len(class) == 2 {
|
||||||
|
if rangeTable, ok := unicode.Categories[class]; ok {
|
||||||
|
return rangeTableToRuneSlice(rangeTable), nil
|
||||||
|
} else {
|
||||||
|
return nil, fmt.Errorf("invalid short unicode character class")
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if rangeTable, ok := unicode.Scripts[class]; ok {
|
||||||
|
return rangeTableToRuneSlice(rangeTable), nil
|
||||||
|
} else {
|
||||||
|
return nil, fmt.Errorf("invalid long unicode character class")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Stores whether the case-insensitive flag has been enabled.
|
// Stores whether the case-insensitive flag has been enabled.
|
||||||
var caseInsensitive bool
|
var caseInsensitive bool
|
||||||
|
|
||||||
@@ -166,9 +208,6 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
|||||||
// metacharacter. Later, in thompson(), these will be converted back. This avoids
|
// metacharacter. Later, in thompson(), these will be converted back. This avoids
|
||||||
// confusion in detecting whether a character is escaped eg. detecting
|
// confusion in detecting whether a character is escaped eg. detecting
|
||||||
// whether '\\[a]' has an escaped opening bracket (it doesn't).
|
// whether '\\[a]' has an escaped opening bracket (it doesn't).
|
||||||
//
|
|
||||||
// 5. Check for non-greedy operators. These are not supported at the moment, so an error
|
|
||||||
// must be thrown if the user attempts to use a non-greedy operator.
|
|
||||||
for i := 0; i < len(re_runes_orig); i++ {
|
for i := 0; i < len(re_runes_orig); i++ {
|
||||||
c := re_runes_orig[i]
|
c := re_runes_orig[i]
|
||||||
if c == '<' && (i == 0 || (re_runes_orig[i-1] != '\\' && re_runes_orig[i-1] != '?')) {
|
if c == '<' && (i == 0 || (re_runes_orig[i-1] != '\\' && re_runes_orig[i-1] != '?')) {
|
||||||
@@ -215,8 +254,16 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
|||||||
} else if c == ']' && (i == 0 || re_runes[len(re_runes)-1] != '\\') {
|
} else if c == ']' && (i == 0 || re_runes[len(re_runes)-1] != '\\') {
|
||||||
re_runes = append(re_runes, rbracketRune)
|
re_runes = append(re_runes, rbracketRune)
|
||||||
continue
|
continue
|
||||||
} else if slices.Contains([]rune{'+', '*', '?'}, c) && (i < len(re_runes_orig)-1 && re_runes_orig[i+1] == '?') {
|
} else if slices.Contains([]rune{'+', '*', '?'}, c) && (i > 0 && re_runes_orig[i-1] != '\\') && (i < len(re_runes_orig)-1 && re_runes_orig[i+1] == '?') {
|
||||||
return nil, fmt.Errorf("non-greedy operators are not supported")
|
switch c {
|
||||||
|
case '+':
|
||||||
|
re_runes = append(re_runes, lazyPlusRune)
|
||||||
|
case '*':
|
||||||
|
re_runes = append(re_runes, lazyKleeneRune)
|
||||||
|
case '?':
|
||||||
|
re_runes = append(re_runes, lazyQuestionRune)
|
||||||
|
}
|
||||||
|
i++
|
||||||
} else {
|
} else {
|
||||||
re_runes = append(re_runes, c)
|
re_runes = append(re_runes, c)
|
||||||
}
|
}
|
||||||
@@ -309,10 +356,30 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
|||||||
}
|
}
|
||||||
} else if isHex(re_runes[i]) {
|
} else if isHex(re_runes[i]) {
|
||||||
re_postfix = append(re_postfix, re_runes[i:i+2]...)
|
re_postfix = append(re_postfix, re_runes[i:i+2]...)
|
||||||
i += 2
|
i += 1 // I don't skip forward 2 steps, because the second step will happen with the loop increment
|
||||||
} else {
|
} else {
|
||||||
return nil, fmt.Errorf("invalid hex value in expression")
|
return nil, fmt.Errorf("invalid hex value in expression")
|
||||||
}
|
}
|
||||||
|
} else if re_runes[i] == 'p' || re_runes[i] == 'P' { // Unicode character class (P is negated unicode charclass)
|
||||||
|
re_postfix = append(re_postfix, re_runes[i])
|
||||||
|
i++
|
||||||
|
if i >= len(re_runes) {
|
||||||
|
return nil, fmt.Errorf("error parsing unicode character class in expression")
|
||||||
|
}
|
||||||
|
if re_runes[i] == '{' { // Full name charclass
|
||||||
|
for re_runes[i] != '}' {
|
||||||
|
re_postfix = append(re_postfix, re_runes[i])
|
||||||
|
i++
|
||||||
|
}
|
||||||
|
re_postfix = append(re_postfix, re_runes[i])
|
||||||
|
i++
|
||||||
|
} else if isUnicodeCharClassLetter(re_runes[i]) {
|
||||||
|
re_postfix = append(re_postfix, re_runes[i])
|
||||||
|
i++
|
||||||
|
} else {
|
||||||
|
return nil, fmt.Errorf("error parsing unicode character class in expression")
|
||||||
|
}
|
||||||
|
i-- // The loop increment at the top will move us forward
|
||||||
} else if re_runes[i] == '0' { // Start of octal value
|
} else if re_runes[i] == '0' { // Start of octal value
|
||||||
numDigits := 1
|
numDigits := 1
|
||||||
for i+numDigits < len(re_runes) && numDigits < 4 && isOctal(re_runes[i+numDigits]) { // Skip while we see an octal character (max of 4, starting with 0)
|
for i+numDigits < len(re_runes) && numDigits < 4 && isOctal(re_runes[i+numDigits]) { // Skip while we see an octal character (max of 4, starting with 0)
|
||||||
@@ -343,10 +410,10 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
|||||||
if i >= len(re_runes) {
|
if i >= len(re_runes) {
|
||||||
return nil, fmt.Errorf("unclosed lookaround")
|
return nil, fmt.Errorf("unclosed lookaround")
|
||||||
}
|
}
|
||||||
if re_runes[i] == '(' || re_runes[i] == nonCapLparenRune {
|
if (re_runes[i] == '(' && re_runes[i-1] != '\\') || re_runes[i] == nonCapLparenRune {
|
||||||
numOpenParens++
|
numOpenParens++
|
||||||
}
|
}
|
||||||
if re_runes[i] == ')' {
|
if re_runes[i] == ')' && re_runes[i-1] != '\\' {
|
||||||
numOpenParens--
|
numOpenParens--
|
||||||
if numOpenParens == 0 {
|
if numOpenParens == 0 {
|
||||||
break
|
break
|
||||||
@@ -359,7 +426,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
|||||||
}
|
}
|
||||||
if i < len(re_runes) && (re_runes[i] != '(' && re_runes[i] != nonCapLparenRune && re_runes[i] != '|' && re_runes[i] != '\\') || (i > 0 && re_runes[i-1] == '\\') { // Every character should be concatenated if it is escaped
|
if i < len(re_runes) && (re_runes[i] != '(' && re_runes[i] != nonCapLparenRune && re_runes[i] != '|' && re_runes[i] != '\\') || (i > 0 && re_runes[i-1] == '\\') { // Every character should be concatenated if it is escaped
|
||||||
if i < len(re_runes)-1 {
|
if i < len(re_runes)-1 {
|
||||||
if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != '+' && re_runes[i+1] != '?' && re_runes[i+1] != ')' && re_runes[i+1] != '{' {
|
if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != lazyKleeneRune && re_runes[i+1] != '+' && re_runes[i+1] != lazyPlusRune && re_runes[i+1] != '?' && re_runes[i+1] != lazyQuestionRune && re_runes[i+1] != ')' && re_runes[i+1] != '{' {
|
||||||
re_postfix = append(re_postfix, concatRune)
|
re_postfix = append(re_postfix, concatRune)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -429,6 +496,39 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
|||||||
} else {
|
} else {
|
||||||
return nil, fmt.Errorf("not enough hex characters found in expression")
|
return nil, fmt.Errorf("not enough hex characters found in expression")
|
||||||
}
|
}
|
||||||
|
} else if re_postfix[i] == 'p' || re_postfix[i] == 'P' {
|
||||||
|
charClassInverted := (re_postfix[i] == 'P')
|
||||||
|
var charsInClass []rune
|
||||||
|
i++
|
||||||
|
if isUnicodeCharClassLetter(re_postfix[i]) {
|
||||||
|
var err error
|
||||||
|
charsInClass, err = unicodeCharClassToRange(string(re_postfix[i]))
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
} else if re_postfix[i] == '{' {
|
||||||
|
i++ // Skip opening bracket
|
||||||
|
unicodeCharClassStr := ""
|
||||||
|
for re_postfix[i] != '}' {
|
||||||
|
unicodeCharClassStr += string(re_postfix[i])
|
||||||
|
i++
|
||||||
|
}
|
||||||
|
var err error
|
||||||
|
charsInClass, err = unicodeCharClassToRange(unicodeCharClassStr)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
return nil, fmt.Errorf("error parsing unicode character class in expression")
|
||||||
|
}
|
||||||
|
var toAppend postfixNode
|
||||||
|
if !charClassInverted { // \p
|
||||||
|
toAppend = newPostfixNode(charsInClass...)
|
||||||
|
} else { // \P
|
||||||
|
toAppend = newPostfixDotNode()
|
||||||
|
toAppend.except = append([]postfixNode{}, newPostfixNode(charsInClass...))
|
||||||
|
}
|
||||||
|
outQueue = append(outQueue, toAppend)
|
||||||
} else if re_postfix[i] == '0' { // Octal value
|
} else if re_postfix[i] == '0' { // Octal value
|
||||||
var octVal int64
|
var octVal int64
|
||||||
var octValStr string
|
var octValStr string
|
||||||
@@ -489,10 +589,10 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
|||||||
if i >= len(re_postfix) {
|
if i >= len(re_postfix) {
|
||||||
return nil, fmt.Errorf("unclosed lookaround")
|
return nil, fmt.Errorf("unclosed lookaround")
|
||||||
}
|
}
|
||||||
if re_postfix[i] == '(' || re_postfix[i] == nonCapLparenRune {
|
if (re_postfix[i] == '(' && re_postfix[i-1] != '\\') || re_postfix[i] == nonCapLparenRune {
|
||||||
numOpenParens++
|
numOpenParens++
|
||||||
}
|
}
|
||||||
if re_postfix[i] == ')' {
|
if re_postfix[i] == ')' && re_postfix[i-1] != '\\' {
|
||||||
numOpenParens--
|
numOpenParens--
|
||||||
if numOpenParens == 0 {
|
if numOpenParens == 0 {
|
||||||
break
|
break
|
||||||
@@ -611,7 +711,40 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
|||||||
} else {
|
} else {
|
||||||
return nil, fmt.Errorf("not enough hex characters found in character class")
|
return nil, fmt.Errorf("not enough hex characters found in character class")
|
||||||
}
|
}
|
||||||
|
} else if re_postfix[i] == 'p' || re_postfix[i] == 'P' {
|
||||||
|
charClassInverted := (re_postfix[i] == 'P')
|
||||||
|
var charsInList []rune
|
||||||
|
i++
|
||||||
|
if isUnicodeCharClassLetter(re_postfix[i]) {
|
||||||
|
var err error
|
||||||
|
charsInList, err = unicodeCharClassToRange(string(re_postfix[i]))
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
} else if re_postfix[i] == '{' {
|
||||||
|
i++ // Skip opening bracket
|
||||||
|
unicodeCharClassStr := ""
|
||||||
|
for re_postfix[i] != '}' {
|
||||||
|
unicodeCharClassStr += string(re_postfix[i])
|
||||||
|
i++
|
||||||
|
}
|
||||||
|
var err error
|
||||||
|
charsInList, err = unicodeCharClassToRange(unicodeCharClassStr)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
return nil, fmt.Errorf("error parsing unicode character class in expression")
|
||||||
|
}
|
||||||
|
if !charClassInverted {
|
||||||
|
chars = append(chars, newPostfixNode(charsInList...))
|
||||||
|
} else {
|
||||||
|
toAppend := newPostfixDotNode()
|
||||||
|
toAppend.except = append([]postfixNode{}, newPostfixNode(charsInList...))
|
||||||
|
chars = append(chars, toAppend)
|
||||||
|
}
|
||||||
} else if re_postfix[i] == '0' { // Octal value
|
} else if re_postfix[i] == '0' { // Octal value
|
||||||
|
|
||||||
var octVal int64
|
var octVal int64
|
||||||
var octValStr string
|
var octValStr string
|
||||||
numDigitsParsed := 0
|
numDigitsParsed := 0
|
||||||
@@ -812,6 +945,10 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
|||||||
}
|
}
|
||||||
outQueue[idx].startReps = startRangeNum
|
outQueue[idx].startReps = startRangeNum
|
||||||
outQueue[idx].endReps = endRangeNum
|
outQueue[idx].endReps = endRangeNum
|
||||||
|
if i < len(re_postfix)-1 && re_postfix[i+1] == '?' { // lazy repitition
|
||||||
|
outQueue[idx].isLazy = true
|
||||||
|
i++
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if c == '(' || c == nonCapLparenRune {
|
if c == '(' || c == nonCapLparenRune {
|
||||||
opStack = append(opStack, c)
|
opStack = append(opStack, c)
|
||||||
@@ -1105,6 +1242,9 @@ func thompson(re []postfixNode) (Reg, error) {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return Reg{}, err
|
return Reg{}, err
|
||||||
}
|
}
|
||||||
|
if c.isLazy {
|
||||||
|
stateToAdd.isLazy = true
|
||||||
|
}
|
||||||
nfa = append(nfa, stateToAdd)
|
nfa = append(nfa, stateToAdd)
|
||||||
case plusNode: // a+ is equivalent to aa*
|
case plusNode: // a+ is equivalent to aa*
|
||||||
s1 := mustPop(&nfa)
|
s1 := mustPop(&nfa)
|
||||||
@@ -1112,6 +1252,9 @@ func thompson(re []postfixNode) (Reg, error) {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return Reg{}, err
|
return Reg{}, err
|
||||||
}
|
}
|
||||||
|
if c.isLazy {
|
||||||
|
s2.isLazy = true
|
||||||
|
}
|
||||||
s1 = concatenate(s1, s2)
|
s1 = concatenate(s1, s2)
|
||||||
nfa = append(nfa, s1)
|
nfa = append(nfa, s1)
|
||||||
case questionNode: // ab? is equivalent to a(b|)
|
case questionNode: // ab? is equivalent to a(b|)
|
||||||
@@ -1123,6 +1266,9 @@ func thompson(re []postfixNode) (Reg, error) {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return Reg{}, err
|
return Reg{}, err
|
||||||
}
|
}
|
||||||
|
if c.isLazy {
|
||||||
|
s2.isLazy = true
|
||||||
|
}
|
||||||
nfa = append(nfa, s2)
|
nfa = append(nfa, s2)
|
||||||
case pipeNode:
|
case pipeNode:
|
||||||
// A pipe operator doesn't actually need either operand to be present. If an operand isn't present,
|
// A pipe operator doesn't actually need either operand to be present. If an operand isn't present,
|
||||||
@@ -1178,6 +1324,9 @@ func thompson(re []postfixNode) (Reg, error) {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return Reg{}, err
|
return Reg{}, err
|
||||||
}
|
}
|
||||||
|
if c.isLazy {
|
||||||
|
s2.isLazy = true
|
||||||
|
}
|
||||||
stateToAdd = concatenate(stateToAdd, s2)
|
stateToAdd = concatenate(stateToAdd, s2)
|
||||||
} else { // Case 2
|
} else { // Case 2
|
||||||
for i := c.startReps; i < c.endReps; i++ {
|
for i := c.startReps; i < c.endReps; i++ {
|
||||||
@@ -1185,6 +1334,9 @@ func thompson(re []postfixNode) (Reg, error) {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return Reg{}, fmt.Errorf("error processing bounded repetition")
|
return Reg{}, fmt.Errorf("error processing bounded repetition")
|
||||||
}
|
}
|
||||||
|
if c.isLazy {
|
||||||
|
tmp.isLazy = true
|
||||||
|
}
|
||||||
stateToAdd = concatenate(stateToAdd, tmp)
|
stateToAdd = concatenate(stateToAdd, tmp)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
40
regex/doc.go
40
regex/doc.go
@@ -60,14 +60,24 @@ Composition:
|
|||||||
x|y Match x or y (prefer x)
|
x|y Match x or y (prefer x)
|
||||||
xy|z Match xy or z (prefer xy)
|
xy|z Match xy or z (prefer xy)
|
||||||
|
|
||||||
Repitition (always greedy, preferring more):
|
Repetition:
|
||||||
|
|
||||||
x* Match x zero or more times
|
Greedy:
|
||||||
x+ Match x one or more times
|
x* Match x zero or more times, prefer more
|
||||||
x? Match x zero or one time
|
x+ Match x one or more times, prefer more
|
||||||
x{m,n} Match x between m and n times (inclusive)
|
x? Match x zero or one time, prefer one
|
||||||
x{m,} Match x atleast m times
|
x{m,n} Match x between m and n times (inclusive), prefer more
|
||||||
x{,n} Match x between 0 and n times (inclusive)
|
x{m,} Match x at least m times, prefer more
|
||||||
|
x{,n} Match x between 0 and n times (inclusive), prefer more
|
||||||
|
x{m} Match x exactly m times
|
||||||
|
|
||||||
|
Lazy:
|
||||||
|
x*? Match x zero or more times, prefer fewer
|
||||||
|
x+? Match x one or more times, prefer fewer
|
||||||
|
x?? Match x zero or one time, prefer zero
|
||||||
|
x{m,n}? Match x between m and n times (inclusive), prefer fewer
|
||||||
|
x{m,}? Match x at least m times, prefer fewer
|
||||||
|
x{,n}? Match x between 0 and n times (inclusive), prefer fewer
|
||||||
x{m} Match x exactly m times
|
x{m} Match x exactly m times
|
||||||
|
|
||||||
Grouping:
|
Grouping:
|
||||||
@@ -107,17 +117,13 @@ Numeric ranges:
|
|||||||
The engine and the API differ from [regexp] in a few ways, some of them very subtle.
|
The engine and the API differ from [regexp] in a few ways, some of them very subtle.
|
||||||
The key differences are mentioned below.
|
The key differences are mentioned below.
|
||||||
|
|
||||||
1. Greediness:
|
1. Byte-slices and runes:
|
||||||
|
|
||||||
This engine currently does not support non-greedy operators.
|
|
||||||
|
|
||||||
2. Byte-slices and runes:
|
|
||||||
|
|
||||||
My engine does not support byte-slices. When a matching function receives a string, it converts it into a
|
My engine does not support byte-slices. When a matching function receives a string, it converts it into a
|
||||||
rune-slice to iterate through it. While this has some space overhead, the convenience of built-in unicode
|
rune-slice to iterate through it. While this has some space overhead, the convenience of built-in unicode
|
||||||
support made the tradeoff worth it.
|
support made the tradeoff worth it.
|
||||||
|
|
||||||
3. Return values
|
2. Return values
|
||||||
|
|
||||||
Rather than using primitives for return values, my engine defines two types that are used as return
|
Rather than using primitives for return values, my engine defines two types that are used as return
|
||||||
values: a [Group] represents a capturing group, and a [Match] represents a list of groups.
|
values: a [Group] represents a capturing group, and a [Match] represents a list of groups.
|
||||||
@@ -152,10 +158,10 @@ returns the 0-group.
|
|||||||
|
|
||||||
The following features from [regexp] are (currently) NOT supported:
|
The following features from [regexp] are (currently) NOT supported:
|
||||||
1. Named capturing groups
|
1. Named capturing groups
|
||||||
2. Non-greedy operators
|
2. Negated POSIX classes
|
||||||
3. Unicode character classes
|
3. Embedded flags (flags are instead passed as arguments to [Compile])
|
||||||
4. Embedded flags (flags are instead passed as arguments to [Compile])
|
4. Literal text with \Q ... \E
|
||||||
5. Literal text with \Q ... \E
|
5. Finite repetition with no start (defaulting at 0)
|
||||||
|
|
||||||
The following features are not available in [regexp], but are supported in my engine:
|
The following features are not available in [regexp], but are supported in my engine:
|
||||||
1. Lookarounds
|
1. Lookarounds
|
||||||
|
|||||||
@@ -234,14 +234,14 @@ func addStateToList(str []rune, idx int, list []nfaState, state nfaState, thread
|
|||||||
}
|
}
|
||||||
visited = append(visited, state)
|
visited = append(visited, state)
|
||||||
|
|
||||||
if state.isKleene || state.isQuestion {
|
if (state.isKleene || state.isQuestion) && (state.isLazy == false) { // Greedy quantifiers
|
||||||
copyThread(state.splitState, state)
|
copyThread(state.splitState, state)
|
||||||
list := addStateToList(str, idx, list, *state.splitState, threadGroups, visited, preferLongest)
|
list := addStateToList(str, idx, list, *state.splitState, threadGroups, visited, preferLongest)
|
||||||
copyThread(state.next, state)
|
copyThread(state.next, state)
|
||||||
list = addStateToList(str, idx, list, *state.next, threadGroups, visited, preferLongest)
|
list = addStateToList(str, idx, list, *state.next, threadGroups, visited, preferLongest)
|
||||||
return list
|
return list
|
||||||
}
|
}
|
||||||
if state.isAlternation {
|
if state.isAlternation || ((state.isKleene || state.isQuestion) && state.isLazy) { // Alternation or lazy quantifier
|
||||||
copyThread(state.next, state)
|
copyThread(state.next, state)
|
||||||
list := addStateToList(str, idx, list, *state.next, threadGroups, visited, preferLongest)
|
list := addStateToList(str, idx, list, *state.next, threadGroups, visited, preferLongest)
|
||||||
copyThread(state.splitState, state)
|
copyThread(state.splitState, state)
|
||||||
|
|||||||
@@ -16,8 +16,11 @@ var rparenRune rune = 0xF0006
|
|||||||
var nonCapLparenRune rune = 0xF0007 // Represents a non-capturing group's LPAREN
|
var nonCapLparenRune rune = 0xF0007 // Represents a non-capturing group's LPAREN
|
||||||
var escBackslashRune rune = 0xF0008 // Represents an escaped backslash
|
var escBackslashRune rune = 0xF0008 // Represents an escaped backslash
|
||||||
var charRangeRune rune = 0xF0009 // Represents a character range
|
var charRangeRune rune = 0xF0009 // Represents a character range
|
||||||
|
var lazyKleeneRune rune = 0xF000A // Represents a lazy kleene star
|
||||||
|
var lazyPlusRune rune = 0xF000B // Represents a lazy plus operator
|
||||||
|
var lazyQuestionRune rune = 0xF000C // Represents a lazy question operator
|
||||||
|
|
||||||
var specialChars = []rune{'?', '*', '\\', '^', '$', '{', '}', '(', ')', '[', ']', '+', '|', '.', concatRune, '<', '>', lbracketRune, rbracketRune, nonCapLparenRune}
|
var specialChars = []rune{'?', lazyQuestionRune, '*', lazyKleeneRune, '\\', '^', '$', '{', '}', '(', ')', '[', ']', '+', lazyPlusRune, '|', '.', concatRune, '<', '>', lbracketRune, rbracketRune, nonCapLparenRune}
|
||||||
|
|
||||||
// An interface for int and rune, which are identical
|
// An interface for int and rune, which are identical
|
||||||
type character interface {
|
type character interface {
|
||||||
|
|||||||
13
regex/nfa.go
13
regex/nfa.go
@@ -34,6 +34,7 @@ type nfaState struct {
|
|||||||
isKleene bool // Identifies whether current node is a 0-state representing Kleene star
|
isKleene bool // Identifies whether current node is a 0-state representing Kleene star
|
||||||
isQuestion bool // Identifies whether current node is a 0-state representing the question operator
|
isQuestion bool // Identifies whether current node is a 0-state representing the question operator
|
||||||
isAlternation bool // Identifies whether current node is a 0-state representing an alternation
|
isAlternation bool // Identifies whether current node is a 0-state representing an alternation
|
||||||
|
isLazy bool // Only for split states - Identifies whether or not to flip the order of branches (try one branch before the other)
|
||||||
splitState *nfaState // Only for alternation states - the 'other' branch of the alternation ('next' is the first)
|
splitState *nfaState // Only for alternation states - the 'other' branch of the alternation ('next' is the first)
|
||||||
assert assertType // Type of assertion of current node - NONE means that the node doesn't assert anything
|
assert assertType // Type of assertion of current node - NONE means that the node doesn't assert anything
|
||||||
allChars bool // Whether or not the state represents all characters (eg. a 'dot' metacharacter). A 'dot' node doesn't store any contents directly, as it would take up too much space
|
allChars bool // Whether or not the state represents all characters (eg. a 'dot' metacharacter). A 'dot' node doesn't store any contents directly, as it would take up too much space
|
||||||
@@ -44,11 +45,11 @@ type nfaState struct {
|
|||||||
groupBegin bool // Whether or not the node starts a capturing group
|
groupBegin bool // Whether or not the node starts a capturing group
|
||||||
groupEnd bool // Whether or not the node ends a capturing group
|
groupEnd bool // Whether or not the node ends a capturing group
|
||||||
groupNum int // Which capturing group the node starts / ends
|
groupNum int // Which capturing group the node starts / ends
|
||||||
|
isBackreference bool // Whether or not current node is backreference
|
||||||
|
referredGroup int // If current node is a backreference, the number of the capturing group that it refers to
|
||||||
// The following properties depend on the current match - I should think about resetting them for every match.
|
// The following properties depend on the current match - I should think about resetting them for every match.
|
||||||
threadGroups []Group // Assuming that a state is part of a 'thread' in the matching process, this array stores the indices of capturing groups in the current thread. As matches are found for this state, its groups will be copied over.
|
threadGroups []Group // Assuming that a state is part of a 'thread' in the matching process, this array stores the indices of capturing groups in the current thread. As matches are found for this state, its groups will be copied over.
|
||||||
isBackreference bool // Whether or not current node is backreference
|
threadBackref int // If current node is a backreference, how many characters to look forward into the referred group
|
||||||
referredGroup int // If current node is a backreference, the node that it points to
|
|
||||||
threadBackref int // If current node is a backreference, how many characters to look forward into the referred group
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Clones the NFA starting from the given state.
|
// Clones the NFA starting from the given state.
|
||||||
@@ -77,6 +78,7 @@ func cloneStateHelper(stateToClone *nfaState, cloneMap map[*nfaState]*nfaState)
|
|||||||
isKleene: stateToClone.isKleene,
|
isKleene: stateToClone.isKleene,
|
||||||
isQuestion: stateToClone.isQuestion,
|
isQuestion: stateToClone.isQuestion,
|
||||||
isAlternation: stateToClone.isAlternation,
|
isAlternation: stateToClone.isAlternation,
|
||||||
|
isLazy: stateToClone.isLazy,
|
||||||
assert: stateToClone.assert,
|
assert: stateToClone.assert,
|
||||||
allChars: stateToClone.allChars,
|
allChars: stateToClone.allChars,
|
||||||
except: append([]rune{}, stateToClone.except...),
|
except: append([]rune{}, stateToClone.except...),
|
||||||
@@ -84,6 +86,8 @@ func cloneStateHelper(stateToClone *nfaState, cloneMap map[*nfaState]*nfaState)
|
|||||||
groupEnd: stateToClone.groupEnd,
|
groupEnd: stateToClone.groupEnd,
|
||||||
groupBegin: stateToClone.groupBegin,
|
groupBegin: stateToClone.groupBegin,
|
||||||
groupNum: stateToClone.groupNum,
|
groupNum: stateToClone.groupNum,
|
||||||
|
isBackreference: stateToClone.isBackreference,
|
||||||
|
referredGroup: stateToClone.referredGroup,
|
||||||
}
|
}
|
||||||
cloneMap[stateToClone] = clone
|
cloneMap[stateToClone] = clone
|
||||||
for i, s := range stateToClone.output {
|
for i, s := range stateToClone.output {
|
||||||
@@ -421,6 +425,7 @@ func (s nfaState) equals(other nfaState) bool {
|
|||||||
s.next == other.next &&
|
s.next == other.next &&
|
||||||
s.isKleene == other.isKleene &&
|
s.isKleene == other.isKleene &&
|
||||||
s.isQuestion == other.isQuestion &&
|
s.isQuestion == other.isQuestion &&
|
||||||
|
s.isLazy == other.isLazy &&
|
||||||
s.isAlternation == other.isAlternation &&
|
s.isAlternation == other.isAlternation &&
|
||||||
s.splitState == other.splitState &&
|
s.splitState == other.splitState &&
|
||||||
s.assert == other.assert &&
|
s.assert == other.assert &&
|
||||||
|
|||||||
@@ -44,6 +44,7 @@ type postfixNode struct {
|
|||||||
lookaroundDir int // Lookbehind or lookahead
|
lookaroundDir int // Lookbehind or lookahead
|
||||||
nodeContents []postfixNode // ONLY USED WHEN nodetype == CHARCLASS. Holds all the nodes inside the given CHARCLASS node.
|
nodeContents []postfixNode // ONLY USED WHEN nodetype == CHARCLASS. Holds all the nodes inside the given CHARCLASS node.
|
||||||
referencedGroup int // ONLY USED WHEN nodetype == backreferenceNode. Holds the group which this one refers to. After parsing is done, the expression will be rewritten eg. (a)\1 will become (a)(a). So the return value of ShuntingYard() shouldn't contain a backreferenceNode.
|
referencedGroup int // ONLY USED WHEN nodetype == backreferenceNode. Holds the group which this one refers to. After parsing is done, the expression will be rewritten eg. (a)\1 will become (a)(a). So the return value of ShuntingYard() shouldn't contain a backreferenceNode.
|
||||||
|
isLazy bool // ONLY USED WHEN nodetype == kleeneNode, plusNode or questionNode. Marks the quantifier as lazy.
|
||||||
}
|
}
|
||||||
|
|
||||||
// Converts the given list of postfixNodes to one node of type CHARCLASS.
|
// Converts the given list of postfixNodes to one node of type CHARCLASS.
|
||||||
@@ -162,10 +163,19 @@ func newPostfixNode(contents ...rune) postfixNode {
|
|||||||
switch contents[0] {
|
switch contents[0] {
|
||||||
case '+':
|
case '+':
|
||||||
to_return.nodetype = plusNode
|
to_return.nodetype = plusNode
|
||||||
|
case lazyPlusRune:
|
||||||
|
to_return.nodetype = plusNode
|
||||||
|
to_return.isLazy = true
|
||||||
case '?':
|
case '?':
|
||||||
to_return.nodetype = questionNode
|
to_return.nodetype = questionNode
|
||||||
|
case lazyQuestionRune:
|
||||||
|
to_return.nodetype = questionNode
|
||||||
|
to_return.isLazy = true
|
||||||
case '*':
|
case '*':
|
||||||
to_return.nodetype = kleeneNode
|
to_return.nodetype = kleeneNode
|
||||||
|
case lazyKleeneRune:
|
||||||
|
to_return.nodetype = kleeneNode
|
||||||
|
to_return.isLazy = true
|
||||||
case '|':
|
case '|':
|
||||||
to_return.nodetype = pipeNode
|
to_return.nodetype = pipeNode
|
||||||
case concatRune:
|
case concatRune:
|
||||||
|
|||||||
@@ -117,6 +117,7 @@ var reTests = []struct {
|
|||||||
{`\d{3,4}`, nil, "ababab555", []Group{{6, 9}}},
|
{`\d{3,4}`, nil, "ababab555", []Group{{6, 9}}},
|
||||||
{`\bpaint\b`, nil, "paints", []Group{}},
|
{`\bpaint\b`, nil, "paints", []Group{}},
|
||||||
{`\b\w{5}\b`, nil, "paint", []Group{{0, 5}}},
|
{`\b\w{5}\b`, nil, "paint", []Group{{0, 5}}},
|
||||||
|
{`\w{}`, nil, "test", nil},
|
||||||
{`[^\w]`, nil, "abcdef1230[]qq';;'", []Group{{10, 11}, {11, 12}, {14, 15}, {15, 16}, {16, 17}, {17, 18}}},
|
{`[^\w]`, nil, "abcdef1230[]qq';;'", []Group{{10, 11}, {11, 12}, {14, 15}, {15, 16}, {16, 17}, {17, 18}}},
|
||||||
{`[^\W]`, nil, "abcdef1230[]qq';;'", []Group{{0, 1}, {1, 2}, {2, 3}, {3, 4}, {4, 5}, {5, 6}, {6, 7}, {7, 8}, {8, 9}, {9, 10}, {12, 13}, {13, 14}}},
|
{`[^\W]`, nil, "abcdef1230[]qq';;'", []Group{{0, 1}, {1, 2}, {2, 3}, {3, 4}, {4, 5}, {5, 6}, {6, 7}, {7, 8}, {8, 9}, {9, 10}, {12, 13}, {13, 14}}},
|
||||||
{`[\[\]]`, nil, "a[b[l]]", []Group{{1, 2}, {3, 4}, {5, 6}, {6, 7}}},
|
{`[\[\]]`, nil, "a[b[l]]", []Group{{1, 2}, {3, 4}, {5, 6}, {6, 7}}},
|
||||||
@@ -430,6 +431,7 @@ var reTests = []struct {
|
|||||||
{`^(.+)?B`, []ReFlag{RE_CASE_INSENSITIVE}, `ab`, []Group{{0, 2}}},
|
{`^(.+)?B`, []ReFlag{RE_CASE_INSENSITIVE}, `ab`, []Group{{0, 2}}},
|
||||||
{`\0009`, []ReFlag{RE_CASE_INSENSITIVE}, "\x009", []Group{{0, 2}}},
|
{`\0009`, []ReFlag{RE_CASE_INSENSITIVE}, "\x009", []Group{{0, 2}}},
|
||||||
{`\0141`, []ReFlag{RE_CASE_INSENSITIVE}, "A", []Group{{0, 1}}},
|
{`\0141`, []ReFlag{RE_CASE_INSENSITIVE}, "A", []Group{{0, 1}}},
|
||||||
|
{`\0141\0141`, []ReFlag{RE_CASE_INSENSITIVE}, "AA", []Group{{0, 2}}},
|
||||||
|
|
||||||
{`a[-]?c`, []ReFlag{RE_CASE_INSENSITIVE}, `AC`, []Group{{0, 2}}},
|
{`a[-]?c`, []ReFlag{RE_CASE_INSENSITIVE}, `AC`, []Group{{0, 2}}},
|
||||||
|
|
||||||
@@ -460,8 +462,10 @@ var reTests = []struct {
|
|||||||
{`[\D5]+`, nil, `1234abc5678`, []Group{{4, 8}}},
|
{`[\D5]+`, nil, `1234abc5678`, []Group{{4, 8}}},
|
||||||
{`[\da-fA-F]+`, nil, `123abc`, []Group{{0, 6}}},
|
{`[\da-fA-F]+`, nil, `123abc`, []Group{{0, 6}}},
|
||||||
{`\xff`, nil, "\u00ff", []Group{{0, 1}}},
|
{`\xff`, nil, "\u00ff", []Group{{0, 1}}},
|
||||||
|
{`\xff+`, nil, "\u00ff\u00ff", []Group{{0, 2}}},
|
||||||
{`\xFF`, nil, "\u00ff", []Group{{0, 1}}},
|
{`\xFF`, nil, "\u00ff", []Group{{0, 1}}},
|
||||||
{`\x00ff`, nil, "\u00ff", []Group{}},
|
{`\x00ff`, nil, "\u00ff", []Group{}},
|
||||||
|
{`\x{0000ff}+`, nil, "\u00ff\u00ff", []Group{{0, 2}}},
|
||||||
{`\x{0000ff}`, nil, "\u00ff", []Group{{0, 1}}},
|
{`\x{0000ff}`, nil, "\u00ff", []Group{{0, 1}}},
|
||||||
{`\x{0000FF}`, nil, "\u00ff", []Group{{0, 1}}},
|
{`\x{0000FF}`, nil, "\u00ff", []Group{{0, 1}}},
|
||||||
{"\t\n\v\r\f\a", nil, "\t\n\v\r\f\a", []Group{{0, 6}}},
|
{"\t\n\v\r\f\a", nil, "\t\n\v\r\f\a", []Group{{0, 6}}},
|
||||||
@@ -485,7 +489,25 @@ var reTests = []struct {
|
|||||||
{`[b-e]`, nil, `f`, []Group{}},
|
{`[b-e]`, nil, `f`, []Group{}},
|
||||||
|
|
||||||
{`*?`, nil, `-`, nil},
|
{`*?`, nil, `-`, nil},
|
||||||
{`a*?`, nil, `-`, nil}, // non-greedy operators are not supported
|
{`a.+c`, nil, `abcabc`, []Group{{0, 6}}},
|
||||||
|
// Lazy quantifier tests
|
||||||
|
{`a.+?c`, nil, `abcabc`, []Group{{0, 3}, {3, 6}}},
|
||||||
|
{`ab*?bc`, []ReFlag{RE_CASE_INSENSITIVE}, `ABBBBC`, []Group{{0, 6}}},
|
||||||
|
{`ab+?bc`, []ReFlag{RE_CASE_INSENSITIVE}, `ABBC`, []Group{{0, 4}}},
|
||||||
|
{`ab??bc`, []ReFlag{RE_CASE_INSENSITIVE}, `ABBC`, []Group{{0, 4}}},
|
||||||
|
{`ab??bc`, []ReFlag{RE_CASE_INSENSITIVE}, `ABC`, []Group{{0, 3}}},
|
||||||
|
{`ab??bc`, []ReFlag{RE_CASE_INSENSITIVE}, `ABBBBC`, []Group{}},
|
||||||
|
{`ab??c`, []ReFlag{RE_CASE_INSENSITIVE}, `ABC`, []Group{{0, 3}}},
|
||||||
|
{`a.*?c`, []ReFlag{RE_CASE_INSENSITIVE}, `AXYZC`, []Group{{0, 5}}},
|
||||||
|
{`a.+?c`, []ReFlag{RE_CASE_INSENSITIVE}, `ABCABC`, []Group{{0, 3}, {3, 6}}},
|
||||||
|
{`a.*?c`, []ReFlag{RE_CASE_INSENSITIVE}, `ABCABC`, []Group{{0, 3}, {3, 6}}},
|
||||||
|
{`.*?\S *:`, nil, `xx:`, []Group{{0, 3}}},
|
||||||
|
{`a[ ]*? (\d+).*`, nil, `a   10`, []Group{{0, 6}}},
|
||||||
|
{`a[ ]*? (\d+).*`, nil, `a    10`, []Group{{0, 7}}},
|
||||||
|
{`"(?:\\"|[^"])*?"`, nil, `"\""`, []Group{{0, 4}}},
|
||||||
|
{`^.*?$`, nil, "one\ntwo\nthree", []Group{}},
|
||||||
|
{`a[^>]*?b`, nil, `a>b`, []Group{}},
|
||||||
|
{`^a*?$`, nil, `foo`, []Group{}},
|
||||||
|
|
||||||
// Numeric range tests - this is a feature that I added, and doesn't exist
|
// Numeric range tests - this is a feature that I added, and doesn't exist
|
||||||
// in any other mainstream regex engine
|
// in any other mainstream regex engine
|
||||||
@@ -516,6 +538,30 @@ var reTests = []struct {
|
|||||||
{`<389-400`, nil, `-`, nil},
|
{`<389-400`, nil, `-`, nil},
|
||||||
{`<389-400>`, nil, `391`, []Group{{0, 3}}},
|
{`<389-400>`, nil, `391`, []Group{{0, 3}}},
|
||||||
{`\b<1-10000>\b`, nil, `America declared independence in 1776.`, []Group{{33, 37}}},
|
{`\b<1-10000>\b`, nil, `America declared independence in 1776.`, []Group{{33, 37}}},
|
||||||
|
|
||||||
|
{`\p{Tamil}+`, nil, `உயிரெழுத்து`, []Group{{0, 11}}}, // Each letter and matra is counted as a separate rune: 'u', 'ya', 'e (matra)', 'ra', 'e (matra)', 'zha', 'oo (matra)', 'tha', 'ith', 'tha', 'oo (matra)'.
|
||||||
|
{`\P{Tamil}+`, nil, `vowel=உயிரெழுத்து`, []Group{{0, 6}}},
|
||||||
|
{`\P`, nil, `உயிரெழுத்து`, nil},
|
||||||
|
{`\PM\pM*`, nil, `உயிரெழுத்து`, []Group{{0, 1}, {1, 3}, {3, 5}, {5, 7}, {7, 9}, {9, 11}}},
|
||||||
|
{`\pN+`, nil, `123abc456def`, []Group{{0, 3}, {6, 9}}},
|
||||||
|
{`\PN+`, nil, `123abc456def`, []Group{{3, 6}, {9, 12}}},
|
||||||
|
{`[\p{Greek}\p{Cyrillic}]`, nil, `ΣωШД`, []Group{{0, 1}, {1, 2}, {2, 3}, {3, 4}}},
|
||||||
|
|
||||||
|
{`(?<=\().*?(?=\))`, nil, `(abc)`, []Group{{1, 4}}},
|
||||||
|
|
||||||
|
{`((a|b)\2)`, nil, `aa`, []Group{{0, 2}}},
|
||||||
|
{`((a|b)\2)`, nil, `bb`, []Group{{0, 2}}},
|
||||||
|
{`((a|b)\2)`, nil, `ab`, []Group{}},
|
||||||
|
{`((a|b)\2)`, nil, `ba`, []Group{}},
|
||||||
|
|
||||||
|
{`((a|b)\2){3}`, nil, `aaaaaa`, []Group{{0, 6}}},
|
||||||
|
{`((a|b)\2){3}`, nil, `bbbbbb`, []Group{{0, 6}}},
|
||||||
|
{`((a|b)\2){3}`, nil, `bbaaaa`, []Group{{0, 6}}},
|
||||||
|
{`((a|b)\2){3}`, nil, `aabbaa`, []Group{{0, 6}}},
|
||||||
|
{`((a|b)\2){3}`, nil, `aaaabb`, []Group{{0, 6}}},
|
||||||
|
{`((a|b)\2){3}`, nil, `bbaabb`, []Group{{0, 6}}},
|
||||||
|
{`((a|b)\2){3}`, nil, `baabab`, []Group{}},
|
||||||
|
{`((a|b)\2){3}`, nil, `bbabab`, []Group{}},
|
||||||
}
|
}
|
||||||
|
|
||||||
var groupTests = []struct {
|
var groupTests = []struct {
|
||||||
@@ -708,6 +754,18 @@ var groupTests = []struct {
|
|||||||
// {`(a|ab|c|bcd)*(d*)`, nil, `ababcd`, []Match{[]Group{{0, 6}, {3, 6}, {6, 6}}, []Group{{6, 6}, {6, 6}, {6, 6}}}},
|
// {`(a|ab|c|bcd)*(d*)`, nil, `ababcd`, []Match{[]Group{{0, 6}, {3, 6}, {6, 6}}, []Group{{6, 6}, {6, 6}, {6, 6}}}},
|
||||||
// // Bug - this should give {0,3},{0,3},{0,0},{0,3},{3,3} but it gives {0,3},{0,2},{0,1},{1,2},{2,3}
|
// // Bug - this should give {0,3},{0,3},{0,0},{0,3},{3,3} but it gives {0,3},{0,2},{0,1},{1,2},{2,3}
|
||||||
// // {`((a*)(b|abc))(c*)`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 3}, {0, 0}, {0, 3}, {3, 3}}}},
|
// // {`((a*)(b|abc))(c*)`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 3}, {0, 0}, {0, 3}, {3, 3}}}},
|
||||||
|
|
||||||
|
// Lazy quantifier tests
|
||||||
|
{`a(?:b|c|d)+?(.)`, nil, `ace`, []Match{[]Group{{0, 3}, {2, 3}}}},
|
||||||
|
{`a(?:b|(c|e){1,2}?|d)+?(.)`, nil, `ace`, []Match{[]Group{{0, 3}, {1, 2}, {2, 3}}}},
|
||||||
|
{`(?<!-):(.*?)(?<!-):`, nil, `a:bc-:de:f`, []Match{[]Group{{1, 9}, {2, 8}}}},
|
||||||
|
{`(?<!\\):(.*?)(?<!\\):`, nil, `a:bc\:de:f`, []Match{[]Group{{1, 9}, {2, 8}}}},
|
||||||
|
{`(?<!\?)'(.*?)(?<!\?)'`, nil, `a'bc?'de'f`, []Match{[]Group{{1, 9}, {2, 8}}}},
|
||||||
|
{`.*?x\s*\z(.*)`, []ReFlag{RE_MULTILINE, RE_SINGLE_LINE}, "xx\nx\n", []Match{[]Group{{0, 5}, {5, 5}}}},
|
||||||
|
{`.*?x\s*\z(.*)`, []ReFlag{RE_MULTILINE}, "xx\nx\n", []Match{[]Group{{3, 5}, {5, 5}}}},
|
||||||
|
{`^([ab]*?)(?=(b)?)c`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 2}, {-1, -1}}}},
|
||||||
|
{`^([ab]*?)(?!(b))c`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 2}, {-1, -1}}}},
|
||||||
|
{`^([ab]*?)(?<!(a))c`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 2}, {-1, -1}}}},
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestFind(t *testing.T) {
|
func TestFind(t *testing.T) {
|
||||||
|
|||||||
@@ -4,4 +4,5 @@
|
|||||||
Ideas for flags:
|
Ideas for flags:
|
||||||
-m <num> : Print <num>th match (-m 1 = first match, -m 2 = second match)
|
-m <num> : Print <num>th match (-m 1 = first match, -m 2 = second match)
|
||||||
-g <num> : Print the <num>th group
|
-g <num> : Print the <num>th group
|
||||||
|
-r : Specify a directory instead of a file, reads recursively
|
||||||
4. Refactor code for flags - make each flag's code a function, which modifies the result of findAllMatches
|
4. Refactor code for flags - make each flag's code a function, which modifies the result of findAllMatches
|
||||||
|
|||||||
Reference in New Issue
Block a user