5 Commits

4 changed files with 103 additions and 17 deletions

View File

@@ -153,8 +153,12 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
}
for re_runes[i] != ']' || i == 0 || re_runes[i-1] == '\\' {
i++ // Skip all characters inside _unescaped_ brackets (we are _not_ at a closing bracket, or if we are, the previous character is a backslash)
// TODO: Check for escaped characters
if re_runes[i] == '-' && i > 0 && re_runes[i-1] != '\\' { // Unescaped hyphen - replace with CHAR_RANGE. This metacharacter will be used later on to construct the range
// Make sure we haven't exceeded the length of the string. If we did, then the regex doesn't actually have a closing bracket and we should throw an error.
if i >= len(re_runes) {
return nil, fmt.Errorf("Opening bracket without closing bracket.")
}
if re_runes[i] == '-' && (i > 0 && re_runes[i-1] != '\\') && (i < len(re_runes)-1 && re_runes[i+1] != ']') { // Unescaped hyphen, that has some character (not a RBRACKET) after it - This represents a character range, so we replace with CHAR_RANGE. This metacharacter will be used later on to construct the range
re_runes[i] = CHAR_RANGE
}
@@ -268,7 +272,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
6. If current character is '{', find the appropriate numeric specifier (range start, range end). Apply the range to the postfixNode at the end of outQueue.
*/
c := re_postfix[i]
if isNormalChar(c) {
if isNormalChar(c) || isSpecialCharWithMetacharReplacement(c) {
if caseInsensitive {
outQueue = append(outQueue, newPostfixNode(allCases(c)...))
} else {
@@ -276,7 +280,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
}
continue
}
// Escape character
if c == '\\' { // Escape character - invert special and non-special characters eg. \( is treated as a literal parentheses, \b is treated as word boundary
if i == len(re_postfix)-1 { // End of string - panic, because backslash is an escape character (something needs to come after it)
return nil, fmt.Errorf("ERROR: Backslash with no escape character.")
@@ -408,8 +412,9 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
}
}
if c == LBRACKET { // Used for character classes
endOfRange := false // Set to 'true' when we encounter a CHAR_RANGE metacharacter
i++ // Step forward so we can look at the character class
firstCharAdded := false // A character class must have at least 1 character. This flag checks if the first character has been added.
endOfRange := false // Set to 'true' when we encounter a CHAR_RANGE metacharacter
i++ // Step forward so we can look at the character class
var invertMatch bool
if re_postfix[i] == '^' {
invertMatch = true
@@ -417,7 +422,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
}
chars := make([]postfixNode, 0) // List of nodes - used only for character classes
for i < len(re_postfix) {
if re_postfix[i] == RBRACKET {
if firstCharAdded && re_postfix[i] == RBRACKET {
break
}
if re_postfix[i] == CHAR_RANGE {
@@ -477,9 +482,20 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
i++
}
} else {
if !firstCharAdded && re_postfix[i] > 0xF0000 { // It's a metacharacter that I defined, I'll have to convert it back to the regular character before adding it back, because I haven't added any characters yet. For example, '[[]', the second LBRACKET should be treated like a literal bracket.
switch re_postfix[i] {
case LBRACKET:
chars = append(chars, newPostfixCharNode('['))
case RBRACKET:
chars = append(chars, newPostfixCharNode(']'))
default:
return nil, fmt.Errorf("Error parsing high-range unicode value in character class.")
}
}
chars = append(chars, newPostfixCharNode(re_postfix[i]))
i++
}
firstCharAdded = true
if endOfRange { // The previous character was an unescaped hyphen, which (in the context of a character class) means the character that was last appended is the end of a character range
// Things to note:
@@ -491,13 +507,15 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
// 2. To account for this, the following logic is followed:
// a. If the second-to-last postfixNode ie. the start of the range has only one element, then we are in a range.
// i. If it has more than one element, then we are actually looking at a literal hyphen, and we will treat it as such.
// ii. If either the start or the end of the range doesn't exist in 'chars', i.e. something like [-a] or [a-], then we also treat it as a literal hyphen.
// b. The last postfixNode added to 'chars' _must_ only have one character (because it's the end of the range).
endRangePostfixNode := mustPop(&chars)
startRangePostfixNode := mustPop(&chars)
if len(endRangePostfixNode.contents) != 1 {
return nil, fmt.Errorf("Error parsing character range.")
} else if len(startRangePostfixNode.contents) != 1 { // This is actually a regular hyphen
endRangePostfixNode, err1 := pop(&chars)
startRangePostfixNode, err2 := pop(&chars)
if (err1 != nil || err2 != nil) || len(startRangePostfixNode.contents) != 1 { // Treat it as a regular hyphen
chars = append(chars, startRangePostfixNode, newPostfixCharNode('-'), endRangePostfixNode)
} else if len(endRangePostfixNode.contents) != 1 { // I don't even know what this would look like, this is just a sanity check
return nil, fmt.Errorf("Error parsing character range.")
} else {
// We have established that they both have a length of 1
startRangeRune := startRangePostfixNode.contents[0]

14
misc.go
View File

@@ -17,6 +17,8 @@ var NONCAPLPAREN_CHAR rune = 0xF0006 // Represents a non-capturing group's LPARE
var ESC_BACKSLASH rune = 0xF0007 // Represents an escaped backslash
var CHAR_RANGE rune = 0xF0008 // Represents a character range
var specialChars = []rune{'?', '*', '\\', '^', '$', '{', '}', '(', ')', '[', ']', '+', '|', '.', '~', '<', '>', LBRACKET, RBRACKET, NONCAPLPAREN_CHAR}
// A type constraint satisfied by both int and rune (rune is an alias for int32), so generic helpers can accept either
type character interface {
int | rune
@@ -32,9 +34,17 @@ func isWordBoundary(str []rune, idx int) bool {
return wbounded
}
// isSpecialChar reports whether c is listed in the package-level specialChars slice.
func isSpecialChar(c rune) bool {
	position := slices.Index(specialChars, c)
	return position >= 0
}
// isSpecialCharWithMetacharReplacement reports whether c is a literal '[' or ']'.
// These special characters have metacharacter replacements, so when they are encountered in their literal form they can be treated as regular characters.
func isSpecialCharWithMetacharReplacement(c rune) bool {
	literalBrackets := [...]rune{'[', ']'}
	return slices.Index(literalBrackets[:], c) != -1
}
// isNormalChar reports whether c is an ordinary character: it returns false
// for the regex special characters ? * \ ^ $ { } ( ) + | [ ] . ~ < > and for
// the LBRACKET, RBRACKET and NONCAPLPAREN_CHAR metacharacters, and true for
// everything else.
//
// The membership check is a switch, so no slice is built on each call.
func isNormalChar(c rune) bool {
	switch c {
	case '?', '*', '\\', '^', '$', '{', '}', '(', ')', '+', '|', '[', ']', '.', '~', '<', '>':
		// Literal regex special characters.
		return false
	case LBRACKET, RBRACKET, NONCAPLPAREN_CHAR:
		// Internal metacharacters that stand in for special characters.
		return false
	}
	return true
}

View File

@@ -116,6 +116,13 @@ func newEscapedNode(c rune, inCharClass bool) (postfixNode, error) {
case 'v': // Vertical tab
toReturn.nodetype = CHARACTER
toReturn.contents = append(toReturn.contents, rune(11))
case '-': // Literal hyphen - only in character class
if inCharClass {
toReturn.nodetype = CHARACTER
toReturn.contents = append(toReturn.contents, '-')
} else {
return postfixNode{}, fmt.Errorf("Invalid escape character.")
}
default: // None of the above - append it as a regular character
if isNormalChar(c) { // Normal characters cannot be escaped
return postfixNode{}, fmt.Errorf("Invalid escape character.")

View File

@@ -1,6 +1,7 @@
package main
import (
"fmt"
"slices"
"testing"
)
@@ -185,6 +186,56 @@ var reTests = []struct {
{`a.*b`, nil, "acc\nccb", []Group{}},
{`a.{4,5}b`, nil, "acc\nccb", []Group{}},
{`a.b`, nil, "a\rb", []Group{{0, 3}}},
{`a.b`, []ReFlag{RE_MULTILINE}, "a\nb", []Group{{0, 3}}},
{`a.*b`, []ReFlag{RE_MULTILINE}, "acc\nccb", []Group{{0, 7}}},
{`a.{4,5}b`, []ReFlag{RE_MULTILINE}, "acc\nccb", []Group{{0, 7}}},
{`)`, nil, ``, nil},
{`^$`, nil, ``, []Group{{0, 0}}},
{`abc`, nil, `abc`, []Group{{0, 3}}},
{`abc`, nil, `xbc`, []Group{}},
{`abc`, nil, `axc`, []Group{}},
{`abc`, nil, `abx`, []Group{}},
{`abc`, nil, `xabcy`, []Group{{1, 4}}},
{`abc`, nil, `ababc`, []Group{{2, 5}}},
{`ab*c`, nil, `abc`, []Group{{0, 3}}},
{`ab*bc`, nil, `abc`, []Group{{0, 3}}},
{`ab*bc`, nil, `abbc`, []Group{{0, 4}}},
{`ab*bc`, nil, `abbbbc`, []Group{{0, 6}}},
{`ab+bc`, nil, `abbc`, []Group{{0, 4}}},
{`ab+bc`, nil, `abc`, []Group{}},
{`ab+bc`, nil, `abq`, []Group{}},
{`ab+bc`, nil, `abbbbc`, []Group{{0, 6}}},
{`ab?bc`, nil, `abbc`, []Group{{0, 4}}},
{`ab?bc`, nil, `abc`, []Group{{0, 3}}},
{`ab?bc`, nil, `abbbbc`, []Group{}},
{`ab?c`, nil, `abc`, []Group{{0, 3}}},
{`^abc$`, nil, `abc`, []Group{{0, 3}}},
{`^abc$`, nil, `abcc`, []Group{}},
{`^abc`, nil, `abcc`, []Group{{0, 3}}},
{`^abc$`, nil, `aabc`, []Group{}},
{`abc$`, nil, `aabc`, []Group{{1, 4}}},
{`^`, nil, `abc`, []Group{{0, 0}}},
{`$`, nil, `abc`, []Group{{3, 3}}},
{`a.c`, nil, `abc`, []Group{{0, 3}}},
{`a.c`, nil, `axc`, []Group{{0, 3}}},
{`a.*c`, nil, `axyzc`, []Group{{0, 5}}},
{`a.*c`, nil, `axyzd`, []Group{}},
{`a[bc]d`, nil, `abc`, []Group{}},
{`a[bc]d`, nil, `abd`, []Group{{0, 3}}},
{`a[b-d]e`, nil, `abd`, []Group{}},
{`a[b-d]e`, nil, `ace`, []Group{{0, 3}}},
{`a[b-d]`, nil, `aac`, []Group{{1, 3}}},
{`a[-b]`, nil, `a-`, []Group{{0, 2}}}, // If a character class has a hyphen without a start or end character, it is treated as a literal hyphen
{`a[\-b]`, nil, `a-`, []Group{{0, 2}}},
{`a[b-]`, nil, `a-`, []Group{{0, 2}}}, // If a character class has a hyphen without a start or end character, it is treated as a literal hyphen
{`a[]b`, nil, `-`, nil},
{`a[`, nil, `-`, nil},
{`a\`, nil, `-`, nil},
{`abc)`, nil, `-`, nil},
{`(abc`, nil, `-`, nil},
{`a]`, nil, `a]`, []Group{{0, 2}}},
// Todo - add numeric range tests
}
@@ -223,7 +274,7 @@ func TestFindAllMatches(t *testing.T) {
regComp, err := Compile(test.re, test.flags...)
if err != nil {
if test.result != nil {
panic(err)
panic(fmt.Errorf("Test Error: %v", err))
}
} else {
matchIndices := FindAllMatches(regComp, test.str)
@@ -242,7 +293,7 @@ func TestFindAllMatches(t *testing.T) {
func TestFindString(t *testing.T) {
for _, test := range reTests {
t.Run(test.re+" "+test.str, func(t *testing.T) {
regComp, err := Compile(test.re)
regComp, err := Compile(test.re, test.flags...)
if err != nil {
if test.result != nil {
panic(err)
@@ -267,7 +318,7 @@ func TestFindString(t *testing.T) {
func TestFindAllGroups(t *testing.T) {
for _, test := range groupTests {
t.Run(test.re+" "+test.str, func(t *testing.T) {
regComp, err := Compile(test.re)
regComp, err := Compile(test.re, test.flags...)
if err != nil {
if test.result != nil {
panic(err)