Compare commits
5 Commits
110298b6a6
...
ccb82f781b
Author | SHA1 | Date | |
---|---|---|---|
|
ccb82f781b | ||
|
09bbf8d3f1 | ||
|
d5b4450e50 | ||
|
45827b5dd3 | ||
|
c26edcb0c4 |
42
compile.go
42
compile.go
@@ -153,8 +153,12 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
|||||||
}
|
}
|
||||||
for re_runes[i] != ']' || i == 0 || re_runes[i-1] == '\\' {
|
for re_runes[i] != ']' || i == 0 || re_runes[i-1] == '\\' {
|
||||||
i++ // Skip all characters inside _unescaped_ brackets (we are _not_ at a closing bracket, or if we are, the previous character is a backslash)
|
i++ // Skip all characters inside _unescaped_ brackets (we are _not_ at a closing bracket, or if we are, the previous character is a backslash)
|
||||||
// TODO: Check for escaped characters
|
// Make sure we haven't exceeded the length of the string. If we did, then the regex doesn't actually have a closing bracket and we should throw an error.
|
||||||
if re_runes[i] == '-' && i > 0 && re_runes[i-1] != '\\' { // Unescaped hyphen - replace with CHAR_RANGE. This metacharacter will be used later on to construct the range
|
if i >= len(re_runes) {
|
||||||
|
return nil, fmt.Errorf("Opening bracket without closing bracket.")
|
||||||
|
}
|
||||||
|
|
||||||
|
if re_runes[i] == '-' && (i > 0 && re_runes[i-1] != '\\') && (i < len(re_runes)-1 && re_runes[i+1] != ']') { // Unescaped hyphen, that has some character (not a RBRACKET) after it - This represents a character range, so we replace with CHAR_RANGE. This metacharacter will be used later on to construct the range
|
||||||
re_runes[i] = CHAR_RANGE
|
re_runes[i] = CHAR_RANGE
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -268,7 +272,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
|||||||
6. If current character is '{', find the appropriate numeric specifier (range start, range end). Apply the range to the postfixNode at the end of outQueue.
|
6. If current character is '{', find the appropriate numeric specifier (range start, range end). Apply the range to the postfixNode at the end of outQueue.
|
||||||
*/
|
*/
|
||||||
c := re_postfix[i]
|
c := re_postfix[i]
|
||||||
if isNormalChar(c) {
|
if isNormalChar(c) || isSpecialCharWithMetacharReplacement(c) {
|
||||||
if caseInsensitive {
|
if caseInsensitive {
|
||||||
outQueue = append(outQueue, newPostfixNode(allCases(c)...))
|
outQueue = append(outQueue, newPostfixNode(allCases(c)...))
|
||||||
} else {
|
} else {
|
||||||
@@ -276,7 +280,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
|||||||
}
|
}
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
// Escape character
|
|
||||||
if c == '\\' { // Escape character - invert special and non-special characters eg. \( is treated as a literal parentheses, \b is treated as word boundary
|
if c == '\\' { // Escape character - invert special and non-special characters eg. \( is treated as a literal parentheses, \b is treated as word boundary
|
||||||
if i == len(re_postfix)-1 { // End of string - panic, because backslash is an escape character (something needs to come after it)
|
if i == len(re_postfix)-1 { // End of string - panic, because backslash is an escape character (something needs to come after it)
|
||||||
return nil, fmt.Errorf("ERROR: Backslash with no escape character.")
|
return nil, fmt.Errorf("ERROR: Backslash with no escape character.")
|
||||||
@@ -408,8 +412,9 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
if c == LBRACKET { // Used for character classes
|
if c == LBRACKET { // Used for character classes
|
||||||
endOfRange := false // Set to 'true' when we encounter a CHAR_RANGE metacharacter
|
firstCharAdded := false // A character class must have at least 1 character. This flag checks if the first character has been added.
|
||||||
i++ // Step forward so we can look at the character class
|
endOfRange := false // Set to 'true' when we encounter a CHAR_RANGE metacharacter
|
||||||
|
i++ // Step forward so we can look at the character class
|
||||||
var invertMatch bool
|
var invertMatch bool
|
||||||
if re_postfix[i] == '^' {
|
if re_postfix[i] == '^' {
|
||||||
invertMatch = true
|
invertMatch = true
|
||||||
@@ -417,7 +422,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
|||||||
}
|
}
|
||||||
chars := make([]postfixNode, 0) // List of nodes - used only for character classes
|
chars := make([]postfixNode, 0) // List of nodes - used only for character classes
|
||||||
for i < len(re_postfix) {
|
for i < len(re_postfix) {
|
||||||
if re_postfix[i] == RBRACKET {
|
if firstCharAdded && re_postfix[i] == RBRACKET {
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
if re_postfix[i] == CHAR_RANGE {
|
if re_postfix[i] == CHAR_RANGE {
|
||||||
@@ -477,9 +482,20 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
|||||||
i++
|
i++
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
if !firstCharAdded && re_postfix[i] > 0xF0000 { // It's a metacharacter that I defined, I'll have to convert it back to the regular character before adding it back, because I haven't added any characters yet. For example, '[[]', the second LBRACKET should be treated like a literal bracket.
|
||||||
|
switch re_postfix[i] {
|
||||||
|
case LBRACKET:
|
||||||
|
chars = append(chars, newPostfixCharNode('['))
|
||||||
|
case RBRACKET:
|
||||||
|
chars = append(chars, newPostfixCharNode(']'))
|
||||||
|
default:
|
||||||
|
return nil, fmt.Errorf("Error parsing high-range unicode value in character class.")
|
||||||
|
}
|
||||||
|
}
|
||||||
chars = append(chars, newPostfixCharNode(re_postfix[i]))
|
chars = append(chars, newPostfixCharNode(re_postfix[i]))
|
||||||
i++
|
i++
|
||||||
}
|
}
|
||||||
|
firstCharAdded = true
|
||||||
|
|
||||||
if endOfRange { // The previous character was an unescaped hyphen, which (in the context of a character class) means the character that was last appended is the end of a character range
|
if endOfRange { // The previous character was an unescaped hyphen, which (in the context of a character class) means the character that was last appended is the end of a character range
|
||||||
// Things to note:
|
// Things to note:
|
||||||
@@ -491,13 +507,15 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
|||||||
// 2. To account for this, the following logic is followed:
|
// 2. To account for this, the following logic is followed:
|
||||||
// a. If the second-to-last postfixNode ie. the start of the range has only one element, then we are in a range.
|
// a. If the second-to-last postfixNode ie. the start of the range has only one element, then we are in a range.
|
||||||
// i. If it has more than one element, then we are actually looking at a literal hyphen, and we will treat it as such.
|
// i. If it has more than one element, then we are actually looking at a literal hyphen, and we will treat it as such.
|
||||||
|
// ii. If either the start or end of the range don't exist in 'chars' ie. something like [-a] or [a-], then too will we treat it as a literal hyphen.
|
||||||
// b. The last postfixNode added to 'chars' _must_ only have one character (because it's the end of the range).
|
// b. The last postfixNode added to 'chars' _must_ only have one character (because it's the end of the range).
|
||||||
endRangePostfixNode := mustPop(&chars)
|
endRangePostfixNode, err1 := pop(&chars)
|
||||||
startRangePostfixNode := mustPop(&chars)
|
startRangePostfixNode, err2 := pop(&chars)
|
||||||
if len(endRangePostfixNode.contents) != 1 {
|
|
||||||
return nil, fmt.Errorf("Error parsing character range.")
|
if (err1 != nil || err2 != nil) || len(startRangePostfixNode.contents) != 1 { // Treat it as a regular hyphen
|
||||||
} else if len(startRangePostfixNode.contents) != 1 { // This is actually a regular hyphen
|
|
||||||
chars = append(chars, startRangePostfixNode, newPostfixCharNode('-'), endRangePostfixNode)
|
chars = append(chars, startRangePostfixNode, newPostfixCharNode('-'), endRangePostfixNode)
|
||||||
|
} else if len(endRangePostfixNode.contents) != 1 { // I don't even know what this would look like, this is just a sanity check
|
||||||
|
return nil, fmt.Errorf("Error parsing character range.")
|
||||||
} else {
|
} else {
|
||||||
// We have established that they both have a length of 1
|
// We have established that they both have a length of 1
|
||||||
startRangeRune := startRangePostfixNode.contents[0]
|
startRangeRune := startRangePostfixNode.contents[0]
|
||||||
|
14
misc.go
14
misc.go
@@ -17,6 +17,8 @@ var NONCAPLPAREN_CHAR rune = 0xF0006 // Represents a non-capturing group's LPARE
|
|||||||
var ESC_BACKSLASH rune = 0xF0007 // Represents an escaped backslash
|
var ESC_BACKSLASH rune = 0xF0007 // Represents an escaped backslash
|
||||||
var CHAR_RANGE rune = 0xF0008 // Represents a character range
|
var CHAR_RANGE rune = 0xF0008 // Represents a character range
|
||||||
|
|
||||||
|
var specialChars = []rune{'?', '*', '\\', '^', '$', '{', '}', '(', ')', '[', ']', '+', '|', '.', '~', '<', '>', LBRACKET, RBRACKET, NONCAPLPAREN_CHAR}
|
||||||
|
|
||||||
// An interface for int and rune, which are identical
|
// An interface for int and rune, which are identical
|
||||||
type character interface {
|
type character interface {
|
||||||
int | rune
|
int | rune
|
||||||
@@ -32,9 +34,17 @@ func isWordBoundary(str []rune, idx int) bool {
|
|||||||
return wbounded
|
return wbounded
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func isSpecialChar(c rune) bool {
|
||||||
|
return slices.Contains(specialChars, c)
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
// isSpecialCharWithMetacharReplacement reports whether c is a literal '[' or
// ']'. Some special characters have metacharacter replacements; when one of
// them is encountered in its literal form, it can be treated as a regular
// character.
func isSpecialCharWithMetacharReplacement(c rune) bool {
	// A direct comparison avoids building a two-element slice on every call.
	return c == '[' || c == ']'
}
|
||||||
|
|
||||||
func isNormalChar(c rune) bool {
|
func isNormalChar(c rune) bool {
|
||||||
specialChars := []rune(`?*\^${}()+|[].~<>`)
|
|
||||||
specialChars = append(specialChars, LBRACKET, RBRACKET, NONCAPLPAREN_CHAR)
|
|
||||||
return !slices.Contains(specialChars, c)
|
return !slices.Contains(specialChars, c)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -116,6 +116,13 @@ func newEscapedNode(c rune, inCharClass bool) (postfixNode, error) {
|
|||||||
case 'v': // Vertical tab
|
case 'v': // Vertical tab
|
||||||
toReturn.nodetype = CHARACTER
|
toReturn.nodetype = CHARACTER
|
||||||
toReturn.contents = append(toReturn.contents, rune(11))
|
toReturn.contents = append(toReturn.contents, rune(11))
|
||||||
|
case '-': // Literal hyphen - only in character class
|
||||||
|
if inCharClass {
|
||||||
|
toReturn.nodetype = CHARACTER
|
||||||
|
toReturn.contents = append(toReturn.contents, '-')
|
||||||
|
} else {
|
||||||
|
return postfixNode{}, fmt.Errorf("Invalid escape character.")
|
||||||
|
}
|
||||||
default: // None of the above - append it as a regular character
|
default: // None of the above - append it as a regular character
|
||||||
if isNormalChar(c) { // Normal characters cannot be escaped
|
if isNormalChar(c) { // Normal characters cannot be escaped
|
||||||
return postfixNode{}, fmt.Errorf("Invalid escape character.")
|
return postfixNode{}, fmt.Errorf("Invalid escape character.")
|
||||||
|
57
re_test.go
57
re_test.go
@@ -1,6 +1,7 @@
|
|||||||
package main
|
package main
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"fmt"
|
||||||
"slices"
|
"slices"
|
||||||
"testing"
|
"testing"
|
||||||
)
|
)
|
||||||
@@ -185,6 +186,56 @@ var reTests = []struct {
|
|||||||
{`a.*b`, nil, "acc\nccb", []Group{}},
|
{`a.*b`, nil, "acc\nccb", []Group{}},
|
||||||
{`a.{4,5}b`, nil, "acc\nccb", []Group{}},
|
{`a.{4,5}b`, nil, "acc\nccb", []Group{}},
|
||||||
{`a.b`, nil, "a\rb", []Group{{0, 3}}},
|
{`a.b`, nil, "a\rb", []Group{{0, 3}}},
|
||||||
|
{`a.b`, []ReFlag{RE_MULTILINE}, "a\nb", []Group{{0, 3}}},
|
||||||
|
{`a.*b`, []ReFlag{RE_MULTILINE}, "acc\nccb", []Group{{0, 7}}},
|
||||||
|
{`a.{4,5}b`, []ReFlag{RE_MULTILINE}, "acc\nccb", []Group{{0, 7}}},
|
||||||
|
|
||||||
|
{`)`, nil, ``, nil},
|
||||||
|
{`^$`, nil, ``, []Group{{0, 0}}},
|
||||||
|
{`abc`, nil, `abc`, []Group{{0, 3}}},
|
||||||
|
{`abc`, nil, `xbc`, []Group{}},
|
||||||
|
{`abc`, nil, `axc`, []Group{}},
|
||||||
|
{`abc`, nil, `abx`, []Group{}},
|
||||||
|
{`abc`, nil, `xabcy`, []Group{{1, 4}}},
|
||||||
|
{`abc`, nil, `ababc`, []Group{{2, 5}}},
|
||||||
|
{`ab*c`, nil, `abc`, []Group{{0, 3}}},
|
||||||
|
{`ab*bc`, nil, `abc`, []Group{{0, 3}}},
|
||||||
|
{`ab*bc`, nil, `abbc`, []Group{{0, 4}}},
|
||||||
|
{`ab*bc`, nil, `abbbbc`, []Group{{0, 6}}},
|
||||||
|
{`ab+bc`, nil, `abbc`, []Group{{0, 4}}},
|
||||||
|
{`ab+bc`, nil, `abc`, []Group{}},
|
||||||
|
{`ab+bc`, nil, `abq`, []Group{}},
|
||||||
|
{`ab+bc`, nil, `abbbbc`, []Group{{0, 6}}},
|
||||||
|
{`ab?bc`, nil, `abbc`, []Group{{0, 4}}},
|
||||||
|
{`ab?bc`, nil, `abc`, []Group{{0, 3}}},
|
||||||
|
{`ab?bc`, nil, `abbbbc`, []Group{}},
|
||||||
|
{`ab?c`, nil, `abc`, []Group{{0, 3}}},
|
||||||
|
{`^abc$`, nil, `abc`, []Group{{0, 3}}},
|
||||||
|
{`^abc$`, nil, `abcc`, []Group{}},
|
||||||
|
{`^abc`, nil, `abcc`, []Group{{0, 3}}},
|
||||||
|
{`^abc$`, nil, `aabc`, []Group{}},
|
||||||
|
{`abc$`, nil, `aabc`, []Group{{1, 4}}},
|
||||||
|
{`^`, nil, `abc`, []Group{{0, 0}}},
|
||||||
|
{`$`, nil, `abc`, []Group{{3, 3}}},
|
||||||
|
{`a.c`, nil, `abc`, []Group{{0, 3}}},
|
||||||
|
{`a.c`, nil, `axc`, []Group{{0, 3}}},
|
||||||
|
{`a.*c`, nil, `axyzc`, []Group{{0, 5}}},
|
||||||
|
{`a.*c`, nil, `axyzd`, []Group{}},
|
||||||
|
{`a[bc]d`, nil, `abc`, []Group{}},
|
||||||
|
{`a[bc]d`, nil, `abd`, []Group{{0, 3}}},
|
||||||
|
{`a[b-d]e`, nil, `abd`, []Group{}},
|
||||||
|
{`a[b-d]e`, nil, `ace`, []Group{{0, 3}}},
|
||||||
|
{`a[b-d]`, nil, `aac`, []Group{{1, 3}}},
|
||||||
|
{`a[-b]`, nil, `a-`, []Group{{0, 2}}}, // If a character class has a hyphen without a start or end character, it is treated as a literal hyphen
|
||||||
|
{`a[\-b]`, nil, `a-`, []Group{{0, 2}}},
|
||||||
|
{`a[b-]`, nil, `a-`, []Group{{0, 2}}}, // If a character class has a hyphen without a start or end character, it is treated as a literal hyphen
|
||||||
|
|
||||||
|
{`a[]b`, nil, `-`, nil},
|
||||||
|
{`a[`, nil, `-`, nil},
|
||||||
|
{`a\`, nil, `-`, nil},
|
||||||
|
{`abc)`, nil, `-`, nil},
|
||||||
|
{`(abc`, nil, `-`, nil},
|
||||||
|
{`a]`, nil, `a]`, []Group{{0, 2}}},
|
||||||
|
|
||||||
// Todo - add numeric range tests
|
// Todo - add numeric range tests
|
||||||
}
|
}
|
||||||
@@ -223,7 +274,7 @@ func TestFindAllMatches(t *testing.T) {
|
|||||||
regComp, err := Compile(test.re, test.flags...)
|
regComp, err := Compile(test.re, test.flags...)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if test.result != nil {
|
if test.result != nil {
|
||||||
panic(err)
|
panic(fmt.Errorf("Test Error: %v", err))
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
matchIndices := FindAllMatches(regComp, test.str)
|
matchIndices := FindAllMatches(regComp, test.str)
|
||||||
@@ -242,7 +293,7 @@ func TestFindAllMatches(t *testing.T) {
|
|||||||
func TestFindString(t *testing.T) {
|
func TestFindString(t *testing.T) {
|
||||||
for _, test := range reTests {
|
for _, test := range reTests {
|
||||||
t.Run(test.re+" "+test.str, func(t *testing.T) {
|
t.Run(test.re+" "+test.str, func(t *testing.T) {
|
||||||
regComp, err := Compile(test.re)
|
regComp, err := Compile(test.re, test.flags...)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if test.result != nil {
|
if test.result != nil {
|
||||||
panic(err)
|
panic(err)
|
||||||
@@ -267,7 +318,7 @@ func TestFindString(t *testing.T) {
|
|||||||
func TestFindAllGroups(t *testing.T) {
|
func TestFindAllGroups(t *testing.T) {
|
||||||
for _, test := range groupTests {
|
for _, test := range groupTests {
|
||||||
t.Run(test.re+" "+test.str, func(t *testing.T) {
|
t.Run(test.re+" "+test.str, func(t *testing.T) {
|
||||||
regComp, err := Compile(test.re)
|
regComp, err := Compile(test.re, test.flags...)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if test.result != nil {
|
if test.result != nil {
|
||||||
panic(err)
|
panic(err)
|
||||||
|
Reference in New Issue
Block a user