Compare commits
5 Commits
110298b6a6
...
ccb82f781b
Author | SHA1 | Date | |
---|---|---|---|
|
ccb82f781b | ||
|
09bbf8d3f1 | ||
|
d5b4450e50 | ||
|
45827b5dd3 | ||
|
c26edcb0c4 |
38
compile.go
38
compile.go
@@ -153,8 +153,12 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
}
|
||||
for re_runes[i] != ']' || i == 0 || re_runes[i-1] == '\\' {
|
||||
i++ // Skip all characters inside _unescaped_ brackets (we are _not_ at a closing bracket, or if we are, the previous character is a backslash)
|
||||
// TODO: Check for escaped characters
|
||||
if re_runes[i] == '-' && i > 0 && re_runes[i-1] != '\\' { // Unescaped hyphen - replace with CHAR_RANGE. This metacharacter will be used later on to construct the range
|
||||
// Make sure we haven't exceeded the length of the string. If we did, then the regex doesn't actually have a closing bracket and we should throw an error.
|
||||
if i >= len(re_runes) {
|
||||
return nil, fmt.Errorf("Opening bracket without closing bracket.")
|
||||
}
|
||||
|
||||
if re_runes[i] == '-' && (i > 0 && re_runes[i-1] != '\\') && (i < len(re_runes)-1 && re_runes[i+1] != ']') { // Unescaped hyphen, that has some character (not a RBRACKET) after it - This represents a character range, so we replace with CHAR_RANGE. This metacharacter will be used later on to construct the range
|
||||
re_runes[i] = CHAR_RANGE
|
||||
}
|
||||
|
||||
@@ -268,7 +272,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
6. If current character is '{', find the appropriate numeric specifier (range start, range end). Apply the range to the postfixNode at the end of outQueue.
|
||||
*/
|
||||
c := re_postfix[i]
|
||||
if isNormalChar(c) {
|
||||
if isNormalChar(c) || isSpecialCharWithMetacharReplacement(c) {
|
||||
if caseInsensitive {
|
||||
outQueue = append(outQueue, newPostfixNode(allCases(c)...))
|
||||
} else {
|
||||
@@ -276,7 +280,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
}
|
||||
continue
|
||||
}
|
||||
// Escape character
|
||||
|
||||
if c == '\\' { // Escape character - invert special and non-special characters eg. \( is treated as a literal parentheses, \b is treated as word boundary
|
||||
if i == len(re_postfix)-1 { // End of string - panic, because backslash is an escape character (something needs to come after it)
|
||||
return nil, fmt.Errorf("ERROR: Backslash with no escape character.")
|
||||
@@ -408,6 +412,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
}
|
||||
}
|
||||
if c == LBRACKET { // Used for character classes
|
||||
firstCharAdded := false // A character class must have at least 1 character. This flag checks if the first character has been added.
|
||||
endOfRange := false // Set to 'true' when we encounter a CHAR_RANGE metacharacter
|
||||
i++ // Step forward so we can look at the character class
|
||||
var invertMatch bool
|
||||
@@ -417,7 +422,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
}
|
||||
chars := make([]postfixNode, 0) // List of nodes - used only for character classes
|
||||
for i < len(re_postfix) {
|
||||
if re_postfix[i] == RBRACKET {
|
||||
if firstCharAdded && re_postfix[i] == RBRACKET {
|
||||
break
|
||||
}
|
||||
if re_postfix[i] == CHAR_RANGE {
|
||||
@@ -477,9 +482,20 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
i++
|
||||
}
|
||||
} else {
|
||||
if !firstCharAdded && re_postfix[i] > 0xF0000 { // It's a metacharacter that I defined, I'll have to convert it back to the regular character before adding it back, because I haven't added any characters yet. For example, '[[]', the second LBRACKET should be treated like a literal bracket.
|
||||
switch re_postfix[i] {
|
||||
case LBRACKET:
|
||||
chars = append(chars, newPostfixCharNode('['))
|
||||
case RBRACKET:
|
||||
chars = append(chars, newPostfixCharNode(']'))
|
||||
default:
|
||||
return nil, fmt.Errorf("Error parsing high-range unicode value in character class.")
|
||||
}
|
||||
}
|
||||
chars = append(chars, newPostfixCharNode(re_postfix[i]))
|
||||
i++
|
||||
}
|
||||
firstCharAdded = true
|
||||
|
||||
if endOfRange { // The previous character was an unescaped hyphen, which (in the context of a character class) means the character that was last appended is the end of a character range
|
||||
// Things to note:
|
||||
@@ -491,13 +507,15 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
// 2. To account for this, the following logic is followed:
|
||||
// a. If the second-to-last postfixNode ie. the start of the range has only one element, then we are in a range.
|
||||
// i. If it has more than one element, then we are actually looking at a literal hyphen, and we will treat it as such.
|
||||
// ii. If either the start or end of the range doesn't exist in 'chars', i.e. something like [-a] or [a-], then we will also treat it as a literal hyphen.
|
||||
// b. The last postfixNode added to 'chars' _must_ only have one character (because it's the end of the range).
|
||||
endRangePostfixNode := mustPop(&chars)
|
||||
startRangePostfixNode := mustPop(&chars)
|
||||
if len(endRangePostfixNode.contents) != 1 {
|
||||
return nil, fmt.Errorf("Error parsing character range.")
|
||||
} else if len(startRangePostfixNode.contents) != 1 { // This is actually a regular hyphen
|
||||
endRangePostfixNode, err1 := pop(&chars)
|
||||
startRangePostfixNode, err2 := pop(&chars)
|
||||
|
||||
if (err1 != nil || err2 != nil) || len(startRangePostfixNode.contents) != 1 { // Treat it as a regular hyphen
|
||||
chars = append(chars, startRangePostfixNode, newPostfixCharNode('-'), endRangePostfixNode)
|
||||
} else if len(endRangePostfixNode.contents) != 1 { // I don't even know what this would look like, this is just a sanity check
|
||||
return nil, fmt.Errorf("Error parsing character range.")
|
||||
} else {
|
||||
// We have established that they both have a length of 1
|
||||
startRangeRune := startRangePostfixNode.contents[0]
|
||||
|
14
misc.go
14
misc.go
@@ -17,6 +17,8 @@ var NONCAPLPAREN_CHAR rune = 0xF0006 // Represents a non-capturing group's LPARE
|
||||
var ESC_BACKSLASH rune = 0xF0007 // Represents an escaped backslash
|
||||
var CHAR_RANGE rune = 0xF0008 // Represents a character range
|
||||
|
||||
var specialChars = []rune{'?', '*', '\\', '^', '$', '{', '}', '(', ')', '[', ']', '+', '|', '.', '~', '<', '>', LBRACKET, RBRACKET, NONCAPLPAREN_CHAR}
|
||||
|
||||
// An interface for int and rune, which are identical
|
||||
type character interface {
|
||||
int | rune
|
||||
@@ -32,9 +34,17 @@ func isWordBoundary(str []rune, idx int) bool {
|
||||
return wbounded
|
||||
}
|
||||
|
||||
func isSpecialChar(c rune) bool {
|
||||
return slices.Contains(specialChars, c)
|
||||
|
||||
}
|
||||
|
||||
// isSpecialCharWithMetacharReplacement reports whether c is a literal '[' or
// ']'. Some special characters have metacharacter replacements; when such a
// character is encountered in its literal form, it can be treated as a regular
// character.
//
// The check is a direct comparison, so no slice is built on each call.
func isSpecialCharWithMetacharReplacement(c rune) bool {
	return c == '[' || c == ']'
}
|
||||
|
||||
func isNormalChar(c rune) bool {
|
||||
specialChars := []rune(`?*\^${}()+|[].~<>`)
|
||||
specialChars = append(specialChars, LBRACKET, RBRACKET, NONCAPLPAREN_CHAR)
|
||||
return !slices.Contains(specialChars, c)
|
||||
}
|
||||
|
||||
|
@@ -116,6 +116,13 @@ func newEscapedNode(c rune, inCharClass bool) (postfixNode, error) {
|
||||
case 'v': // Vertical tab
|
||||
toReturn.nodetype = CHARACTER
|
||||
toReturn.contents = append(toReturn.contents, rune(11))
|
||||
case '-': // Literal hyphen - only in character class
|
||||
if inCharClass {
|
||||
toReturn.nodetype = CHARACTER
|
||||
toReturn.contents = append(toReturn.contents, '-')
|
||||
} else {
|
||||
return postfixNode{}, fmt.Errorf("Invalid escape character.")
|
||||
}
|
||||
default: // None of the above - append it as a regular character
|
||||
if isNormalChar(c) { // Normal characters cannot be escaped
|
||||
return postfixNode{}, fmt.Errorf("Invalid escape character.")
|
||||
|
57
re_test.go
57
re_test.go
@@ -1,6 +1,7 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"slices"
|
||||
"testing"
|
||||
)
|
||||
@@ -185,6 +186,56 @@ var reTests = []struct {
|
||||
{`a.*b`, nil, "acc\nccb", []Group{}},
|
||||
{`a.{4,5}b`, nil, "acc\nccb", []Group{}},
|
||||
{`a.b`, nil, "a\rb", []Group{{0, 3}}},
|
||||
{`a.b`, []ReFlag{RE_MULTILINE}, "a\nb", []Group{{0, 3}}},
|
||||
{`a.*b`, []ReFlag{RE_MULTILINE}, "acc\nccb", []Group{{0, 7}}},
|
||||
{`a.{4,5}b`, []ReFlag{RE_MULTILINE}, "acc\nccb", []Group{{0, 7}}},
|
||||
|
||||
{`)`, nil, ``, nil},
|
||||
{`^$`, nil, ``, []Group{{0, 0}}},
|
||||
{`abc`, nil, `abc`, []Group{{0, 3}}},
|
||||
{`abc`, nil, `xbc`, []Group{}},
|
||||
{`abc`, nil, `axc`, []Group{}},
|
||||
{`abc`, nil, `abx`, []Group{}},
|
||||
{`abc`, nil, `xabcy`, []Group{{1, 4}}},
|
||||
{`abc`, nil, `ababc`, []Group{{2, 5}}},
|
||||
{`ab*c`, nil, `abc`, []Group{{0, 3}}},
|
||||
{`ab*bc`, nil, `abc`, []Group{{0, 3}}},
|
||||
{`ab*bc`, nil, `abbc`, []Group{{0, 4}}},
|
||||
{`ab*bc`, nil, `abbbbc`, []Group{{0, 6}}},
|
||||
{`ab+bc`, nil, `abbc`, []Group{{0, 4}}},
|
||||
{`ab+bc`, nil, `abc`, []Group{}},
|
||||
{`ab+bc`, nil, `abq`, []Group{}},
|
||||
{`ab+bc`, nil, `abbbbc`, []Group{{0, 6}}},
|
||||
{`ab?bc`, nil, `abbc`, []Group{{0, 4}}},
|
||||
{`ab?bc`, nil, `abc`, []Group{{0, 3}}},
|
||||
{`ab?bc`, nil, `abbbbc`, []Group{}},
|
||||
{`ab?c`, nil, `abc`, []Group{{0, 3}}},
|
||||
{`^abc$`, nil, `abc`, []Group{{0, 3}}},
|
||||
{`^abc$`, nil, `abcc`, []Group{}},
|
||||
{`^abc`, nil, `abcc`, []Group{{0, 3}}},
|
||||
{`^abc$`, nil, `aabc`, []Group{}},
|
||||
{`abc$`, nil, `aabc`, []Group{{1, 4}}},
|
||||
{`^`, nil, `abc`, []Group{{0, 0}}},
|
||||
{`$`, nil, `abc`, []Group{{3, 3}}},
|
||||
{`a.c`, nil, `abc`, []Group{{0, 3}}},
|
||||
{`a.c`, nil, `axc`, []Group{{0, 3}}},
|
||||
{`a.*c`, nil, `axyzc`, []Group{{0, 5}}},
|
||||
{`a.*c`, nil, `axyzd`, []Group{}},
|
||||
{`a[bc]d`, nil, `abc`, []Group{}},
|
||||
{`a[bc]d`, nil, `abd`, []Group{{0, 3}}},
|
||||
{`a[b-d]e`, nil, `abd`, []Group{}},
|
||||
{`a[b-d]e`, nil, `ace`, []Group{{0, 3}}},
|
||||
{`a[b-d]`, nil, `aac`, []Group{{1, 3}}},
|
||||
{`a[-b]`, nil, `a-`, []Group{{0, 2}}}, // If a character class has a hyphen without a start or end character, it is treated as a literal hyphen
|
||||
{`a[\-b]`, nil, `a-`, []Group{{0, 2}}},
|
||||
{`a[b-]`, nil, `a-`, []Group{{0, 2}}}, // If a character class has a hyphen without a start or end character, it is treated as a literal hyphen
|
||||
|
||||
{`a[]b`, nil, `-`, nil},
|
||||
{`a[`, nil, `-`, nil},
|
||||
{`a\`, nil, `-`, nil},
|
||||
{`abc)`, nil, `-`, nil},
|
||||
{`(abc`, nil, `-`, nil},
|
||||
{`a]`, nil, `a]`, []Group{{0, 2}}},
|
||||
|
||||
// Todo - add numeric range tests
|
||||
}
|
||||
@@ -223,7 +274,7 @@ func TestFindAllMatches(t *testing.T) {
|
||||
regComp, err := Compile(test.re, test.flags...)
|
||||
if err != nil {
|
||||
if test.result != nil {
|
||||
panic(err)
|
||||
panic(fmt.Errorf("Test Error: %v", err))
|
||||
}
|
||||
} else {
|
||||
matchIndices := FindAllMatches(regComp, test.str)
|
||||
@@ -242,7 +293,7 @@ func TestFindAllMatches(t *testing.T) {
|
||||
func TestFindString(t *testing.T) {
|
||||
for _, test := range reTests {
|
||||
t.Run(test.re+" "+test.str, func(t *testing.T) {
|
||||
regComp, err := Compile(test.re)
|
||||
regComp, err := Compile(test.re, test.flags...)
|
||||
if err != nil {
|
||||
if test.result != nil {
|
||||
panic(err)
|
||||
@@ -267,7 +318,7 @@ func TestFindString(t *testing.T) {
|
||||
func TestFindAllGroups(t *testing.T) {
|
||||
for _, test := range groupTests {
|
||||
t.Run(test.re+" "+test.str, func(t *testing.T) {
|
||||
regComp, err := Compile(test.re)
|
||||
regComp, err := Compile(test.re, test.flags...)
|
||||
if err != nil {
|
||||
if test.result != nil {
|
||||
panic(err)
|
||||
|
Reference in New Issue
Block a user