16 Commits

Author SHA1 Message Date
6cd0a10a8f Added more documentation 2025-02-13 14:14:00 -05:00
69fb96c43d Merge pull request 'Implement Unicode character classes' (#4) from implementUnicodeCharClass into master
Reviewed-on: #4
2025-02-13 09:51:44 -06:00
46bc0c8529 Removed unicode character classes from 'features not supported' list 2025-02-13 10:48:23 -05:00
1a890a1e75 Refactoring - remove duplicate code 2025-02-13 09:10:40 -05:00
fde3784e5a Added unicode charclass support within character classes; Fixed bugs with hex classes and unicode classes 2025-02-13 08:58:02 -05:00
7045711860 Convert test_str into a rune slice for better unicode compatibility, it also fixed the bug where all unicode characters wouldn't be colored 2025-02-13 08:57:06 -05:00
d4d606d95b Added tests for unicode character classes; more tests for hex characters 2025-02-13 08:55:12 -05:00
9cd330e521 More work on unicode character class support - fix bug where all characters aren't being matched 2025-02-12 23:04:10 -05:00
44d6a2005c Started working on unicode character classes 2025-02-12 22:19:30 -05:00
f76cd6c3d9 Merge pull request 'Implement Backreferences' (#3) from implementBackreferences into master
Reviewed-on: #3
2025-02-12 21:17:32 -06:00
375baa1722 Wrote more backreference tests 2025-02-12 07:51:20 -05:00
2e47c631bb Updated documentation to include backreferences 2025-02-12 07:50:59 -05:00
81b8b1b11c Do not validate a backreference if the group that it refers to is not valid 2025-02-11 19:12:58 -05:00
2934e7a20f Wrote tests for backreferences 2025-02-11 19:12:40 -05:00
f466d4a8d5 More progress on backreference implementation 2025-02-11 17:06:39 -05:00
8327450dd2 Started implementing backreferences (octal values should now be prefaced with \0) 2025-02-11 16:14:54 -05:00
7 changed files with 403 additions and 124 deletions

View File

@@ -129,6 +129,8 @@ func main() {
matchIndices = regComp.FindAllSubmatch(test_str)
}
test_str_runes := []rune(test_str) // Converting to runes preserves unicode characters
if *printMatchesFlag {
// if we are in single line mode, print the line on which
// the matches occur
@@ -158,10 +160,10 @@ func main() {
oldIndices := indicesToPrint.values()
indicesToPrint = new_uniq_arr[int]()
// Explanation:
// Find all numbers from 0 to len(test_str) that are NOT in oldIndices.
// Find all numbers from 0 to len(test_str_runes) that are NOT in oldIndices.
// These are the values we want to print, now that we have inverted the match.
// Re-initialize indicesToPrint and add all of these values to it.
indicesToPrint.add(setDifference(genRange(0, len(test_str)), oldIndices)...)
indicesToPrint.add(setDifference(genRange(0, len(test_str_runes)), oldIndices)...)
}
// If lineFlag is enabled, we should only print something if:
@@ -182,7 +184,7 @@ func main() {
// the corresponding end index.
// 3. If not, just print the character.
if substituteFlagEnabled {
for i := range test_str {
for i := range test_str_runes {
inMatchIndex := false
for _, m := range matchIndices {
if i == m[0].StartIdx {
@@ -193,11 +195,11 @@ func main() {
}
}
if !inMatchIndex {
fmt.Fprintf(out, "%c", test_str[i])
fmt.Fprintf(out, "%c", test_str_runes[i])
}
}
} else {
for i, c := range test_str {
for i, c := range test_str_runes {
if indicesToPrint.contains(i) {
color.New(color.FgRed).Fprintf(out, "%c", c)
// Newline after every match - only if -o is enabled and -v is disabled.

View File

@@ -108,6 +108,48 @@ func getPOSIXClass(str []rune) (bool, string) {
return true, rtv
}
// isUnicodeCharClassLetter returns whether or not the given letter represents a unicode character class.
func isUnicodeCharClassLetter(c rune) bool {
return slices.Contains([]rune{'L', 'M', 'S', 'N', 'P', 'C', 'Z'}, c)
}
// rangeTableToRuneSlice converts the given range table into a rune slice and returns it.
func rangeTableToRuneSlice(rangetable *unicode.RangeTable) []rune {
var rtv []rune
for _, r := range rangetable.R16 {
for c := r.Lo; c <= r.Hi; c += r.Stride {
rtv = append(rtv, rune(c))
}
}
for _, r := range rangetable.R32 {
for c := r.Lo; c <= r.Hi; c += r.Stride {
rtv = append(rtv, rune(c))
}
}
return rtv
}
// unicodeCharClassToRange converts the given unicode character class name into a list of characters in that class.
// This class could also be a single letter eg. 'C'.
func unicodeCharClassToRange(class string) ([]rune, error) {
if len(class) == 0 {
return nil, fmt.Errorf("empty unicode character class")
}
if len(class) == 1 || len(class) == 2 {
if rangeTable, ok := unicode.Categories[class]; ok {
return rangeTableToRuneSlice(rangeTable), nil
} else {
return nil, fmt.Errorf("invalid short unicode character class")
}
} else {
if rangeTable, ok := unicode.Scripts[class]; ok {
return rangeTableToRuneSlice(rangeTable), nil
} else {
return nil, fmt.Errorf("invalid long unicode character class")
}
}
}
// Stores whether the case-insensitive flag has been enabled.
var caseInsensitive bool
@@ -309,17 +351,44 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
}
} else if isHex(re_runes[i]) {
re_postfix = append(re_postfix, re_runes[i:i+2]...)
i += 2
i += 1 // I don't skip forward 2 steps, because the second step will happen with the loop increment
} else {
return nil, fmt.Errorf("invalid hex value in expression")
}
} else if isOctal(re_runes[i]) {
} else if re_runes[i] == 'p' || re_runes[i] == 'P' { // Unicode character class (P is negated unicode charclass)
re_postfix = append(re_postfix, re_runes[i])
i++
if i >= len(re_runes) {
return nil, fmt.Errorf("error parsing unicode character class in expression")
}
if re_runes[i] == '{' { // Full name charclass
for re_runes[i] != '}' {
re_postfix = append(re_postfix, re_runes[i])
i++
}
re_postfix = append(re_postfix, re_runes[i])
i++
} else if isUnicodeCharClassLetter(re_runes[i]) {
re_postfix = append(re_postfix, re_runes[i])
i++
} else {
return nil, fmt.Errorf("error parsing unicode character class in expression")
}
i-- // The loop increment at the top will move us forward
} else if re_runes[i] == '0' { // Start of octal value
numDigits := 1
for i+numDigits < len(re_runes) && numDigits < 3 && isOctal(re_runes[i+numDigits]) { // Skip while we see an octal character (max of 3)
for i+numDigits < len(re_runes) && numDigits < 4 && isOctal(re_runes[i+numDigits]) { // Skip while we see an octal character (max of 4, starting with 0)
numDigits++
}
re_postfix = append(re_postfix, re_runes[i:i+numDigits]...)
i += (numDigits - 1) // I have to move back a step, so that I can add a concatenation operator if necessary, and so that the increment at the bottom of the loop works as intended
} else if unicode.IsDigit(re_runes[i]) { // Any other number - backreference
numDigits := 1
for i+numDigits < len(re_runes) && unicode.IsDigit(re_runes[i+numDigits]) { // Skip while we see a digit
numDigits++
}
re_postfix = append(re_postfix, re_runes[i:i+numDigits]...)
i += (numDigits - 1) // Move back a step to add concatenation operator
} else {
re_postfix = append(re_postfix, re_runes[i])
}
@@ -364,7 +433,9 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
outQueue := make([]postfixNode, 0) // Output queue
// Actual algorithm
numOpenParens := 0 // Number of open parentheses
numOpenParens := 0 // Number of open parentheses
parenIndices := make([]Group, 0) // I really shouldn't be using Group here, because that's strictly for matching purposes, but its a convenient way to store the indices of the opening and closing parens.
parenIndices = append(parenIndices, Group{0, 0}) // I append a weird value here, because the 0-th group doesn't have any parens. This way, the 1st group will be at index 1, 2nd at 2 ...
for i := 0; i < len(re_postfix); i++ {
/* Two cases:
1. Current character is alphanumeric - send to output queue
@@ -420,11 +491,44 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
} else {
return nil, fmt.Errorf("not enough hex characters found in expression")
}
} else if isOctal(re_postfix[i]) { // Octal value
} else if re_postfix[i] == 'p' || re_postfix[i] == 'P' {
charClassInverted := (re_postfix[i] == 'P')
charsInClass := []rune{}
i++
if isUnicodeCharClassLetter(re_postfix[i]) {
var err error
charsInClass, err = unicodeCharClassToRange(string(re_postfix[i]))
if err != nil {
return nil, err
}
} else if re_postfix[i] == '{' {
i++ // Skip opening bracket
unicodeCharClassStr := ""
for re_postfix[i] != '}' {
unicodeCharClassStr += string(re_postfix[i])
i++
}
var err error
charsInClass, err = unicodeCharClassToRange(unicodeCharClassStr)
if err != nil {
return nil, err
}
} else {
return nil, fmt.Errorf("error parsing unicode character class in expression")
}
var toAppend postfixNode
if !charClassInverted { // \p
toAppend = newPostfixNode(charsInClass...)
} else { // \P
toAppend = newPostfixDotNode()
toAppend.except = append([]postfixNode{}, newPostfixNode(charsInClass...))
}
outQueue = append(outQueue, toAppend)
} else if re_postfix[i] == '0' { // Octal value
var octVal int64
var octValStr string
numDigitsParsed := 0
for (i+numDigitsParsed) < len(re_postfix) && isOctal(re_postfix[i+numDigitsParsed]) && numDigitsParsed <= 3 {
for (i+numDigitsParsed) < len(re_postfix) && isOctal(re_postfix[i+numDigitsParsed]) && numDigitsParsed <= 4 {
octValStr += string(re_postfix[i+numDigitsParsed])
numDigitsParsed++
}
@@ -437,6 +541,20 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
}
i += numDigitsParsed - 1 // Shift forward by the number of digits that were parsed. Move back one character, because the loop increment will move us back to the next character automatically
outQueue = append(outQueue, newPostfixCharNode(rune(octVal)))
} else if unicode.IsDigit(re_postfix[i]) { // Backreference
var num int64
var numStr string
numDigitsParsed := 0
for (i+numDigitsParsed) < len(re_postfix) && unicode.IsDigit(re_postfix[i+numDigitsParsed]) {
numStr += string(re_postfix[i+numDigitsParsed])
numDigitsParsed++
}
num, err := strconv.ParseInt(numStr, 10, 32)
if err != nil {
return nil, fmt.Errorf("error parsing backreference in expresion")
}
i += numDigitsParsed - 1
outQueue = append(outQueue, newPostfixBackreferenceNode(int(num)))
} else {
escapedNode, err := newEscapedNode(re_postfix[i], false)
if err != nil {
@@ -588,11 +706,44 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
} else {
return nil, fmt.Errorf("not enough hex characters found in character class")
}
} else if isOctal(re_postfix[i]) { // Octal value
} else if re_postfix[i] == 'p' || re_postfix[i] == 'P' {
charClassInverted := (re_postfix[i] == 'P')
charsInList := []rune{}
i++
if isUnicodeCharClassLetter(re_postfix[i]) {
var err error
charsInList, err = unicodeCharClassToRange(string(re_postfix[i]))
if err != nil {
return nil, err
}
} else if re_postfix[i] == '{' {
i++ // Skip opening bracket
unicodeCharClassStr := ""
for re_postfix[i] != '}' {
unicodeCharClassStr += string(re_postfix[i])
i++
}
var err error
charsInList, err = unicodeCharClassToRange(unicodeCharClassStr)
if err != nil {
return nil, err
}
} else {
return nil, fmt.Errorf("error parsing unicode character class in expression")
}
if !charClassInverted {
chars = append(chars, newPostfixNode(charsInList...))
} else {
toAppend := newPostfixDotNode()
toAppend.except = append([]postfixNode{}, newPostfixNode(charsInList...))
chars = append(chars, toAppend)
}
} else if re_postfix[i] == '0' { // Octal value
var octVal int64
var octValStr string
numDigitsParsed := 0
for (i+numDigitsParsed) < len(re_postfix)-1 && isOctal(re_postfix[i+numDigitsParsed]) && numDigitsParsed <= 3 { // The '-1' exists, because even in the worst case (the character class extends till the end), the last character must be a closing bracket (and nothing else)
for (i+numDigitsParsed) < len(re_postfix)-1 && isOctal(re_postfix[i+numDigitsParsed]) && numDigitsParsed <= 4 { // The '-1' exists, because even in the worst case (the character class extends till the end), the last character must be a closing bracket (and nothing else)
octValStr += string(re_postfix[i+numDigitsParsed])
numDigitsParsed++
}
@@ -796,6 +947,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
outQueue = append(outQueue, newPostfixNode(c))
}
numOpenParens++
parenIndices = append(parenIndices, Group{StartIdx: len(outQueue) - 1}) // Push the index of the lparen into parenIndices
}
if c == ')' {
// Keep popping from opStack until we encounter an opening parantheses or a NONCAPLPAREN_CHAR. Throw error if we reach the end of the stack.
@@ -812,6 +964,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
if val == '(' { // Whatever was inside the parentheses was a _capturing_ group, so we append the closing parentheses as well
outQueue = append(outQueue, newPostfixNode(')')) // Add closing parentheses
}
parenIndices[numOpenParens].EndIdx = len(outQueue) - 1
numOpenParens--
}
}
@@ -826,6 +979,11 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
return nil, fmt.Errorf("imbalanced parantheses")
}
// outQueue, _, err := rewriteBackreferences(outQueue, parenIndices)
// if err != nil {
// return nil, err
// }
return outQueue, nil
}
@@ -1037,6 +1195,21 @@ func thompson(re []postfixNode) (Reg, error) {
})
nfa = append(nfa, toAdd)
}
if c.nodetype == backreferenceNode {
if c.referencedGroup > numGroups {
return Reg{}, fmt.Errorf("invalid backreference")
}
stateToAdd := &nfaState{}
stateToAdd.assert = noneAssert
stateToAdd.content = newContents(epsilon)
stateToAdd.isEmpty = true
stateToAdd.isBackreference = true
stateToAdd.output = make([]*nfaState, 0)
stateToAdd.output = append(stateToAdd.output, stateToAdd)
stateToAdd.referredGroup = c.referencedGroup
stateToAdd.threadBackref = 0
nfa = append(nfa, stateToAdd)
}
// Must be an operator if it isn't a character
switch c.nodetype {
case concatenateNode:

View File

@@ -18,7 +18,7 @@ Single characters:
[^abc] Negated character class - match any character except a, b and c
[^a-z] Negated character range - do not match any character from a to z
\[ Match a literal '['. Backslashes can escape any character with special meaning, including another backslash.
\452 Match the character with the octal value 452 (up to 3 digits)
\0452 Match the character with the octal value 452 (up to 4 digits, first digit must be 0)
\xFF Match the character with the hex value FF (exactly 2 characters)
\x{0000FF} Match the character with the hex value 0000FF (exactly 6 characters)
\n Newline
@@ -93,6 +93,10 @@ Lookarounds:
(?<=x)y Positive lookbehind - Match y if preceded by x
(?<!x)y Negative lookbehind - Match y if NOT preceded by x
Backreferences:
(xy)\1 Match 'xy' followed by the text most recently captured by group 1 (in this case, 'xy')
Numeric ranges:
<x-y> Match any number from x to y (inclusive) (x and y must be positive numbers)
@@ -149,13 +153,14 @@ returns the 0-group.
The following features from [regexp] are (currently) NOT supported:
1. Named capturing groups
2. Non-greedy operators
3. Unicode character classes
3. Negated POSIX classes
4. Embedded flags (flags are instead passed as arguments to [Compile])
5. Literal text with \Q ... \E
The following features are not available in [regexp], but are supported in my engine:
1. Lookarounds
2. Numeric ranges
3. Backreferences
I hope to shorten the first list, and expand the second.
*/

View File

@@ -236,14 +236,14 @@ func addStateToList(str []rune, idx int, list []nfaState, state nfaState, thread
if state.isKleene || state.isQuestion {
copyThread(state.splitState, state)
list = addStateToList(str, idx, list, *state.splitState, threadGroups, visited, preferLongest)
list := addStateToList(str, idx, list, *state.splitState, threadGroups, visited, preferLongest)
copyThread(state.next, state)
list = addStateToList(str, idx, list, *state.next, threadGroups, visited, preferLongest)
return list
}
if state.isAlternation {
copyThread(state.next, state)
list = addStateToList(str, idx, list, *state.next, threadGroups, visited, preferLongest)
list := addStateToList(str, idx, list, *state.next, threadGroups, visited, preferLongest)
copyThread(state.splitState, state)
list = addStateToList(str, idx, list, *state.splitState, threadGroups, visited, preferLongest)
return list
@@ -257,10 +257,12 @@ func addStateToList(str []rune, idx int, list []nfaState, state nfaState, thread
}
if state.groupBegin {
state.threadGroups[state.groupNum].StartIdx = idx
copyThread(state.next, state)
return addStateToList(str, idx, list, *state.next, state.threadGroups, visited, preferLongest)
}
if state.groupEnd {
state.threadGroups[state.groupNum].EndIdx = idx
copyThread(state.next, state)
return addStateToList(str, idx, list, *state.next, state.threadGroups, visited, preferLongest)
}
return append(list, state)
@@ -313,11 +315,25 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
if !preferLongest {
break
}
} else if !currentState.isAlternation && !currentState.isKleene && !currentState.isQuestion && !currentState.groupBegin && !currentState.groupEnd && currentState.assert == noneAssert { // Normal character
} else if !currentState.isAlternation && !currentState.isKleene && !currentState.isQuestion && !currentState.isBackreference && !currentState.groupBegin && !currentState.groupEnd && currentState.assert == noneAssert { // Normal character
if currentState.contentContains(str, idx, preferLongest) {
nextStates = addStateToList(str, idx+1, nextStates, *currentState.next, currentState.threadGroups, nil, preferLongest)
}
} else if currentState.isBackreference && currentState.threadGroups[currentState.referredGroup].IsValid() {
groupLength := currentState.threadGroups[currentState.referredGroup].EndIdx - currentState.threadGroups[currentState.referredGroup].StartIdx
if currentState.threadBackref == groupLength {
currentState.threadBackref = 0
copyThread(currentState.next, currentState)
currentStates = addStateToList(str, idx, currentStates, *currentState.next, currentState.threadGroups, nil, preferLongest)
} else {
idxInReferredGroup := currentState.threadGroups[currentState.referredGroup].StartIdx + currentState.threadBackref
if idxInReferredGroup < len(str) && idx < len(str) && str[idxInReferredGroup] == str[idx] {
currentState.threadBackref += 1
nextStates = append(nextStates, currentState)
}
}
}
}
currentStates = append([]nfaState{}, nextStates...)
nextStates = nil

View File

@@ -45,8 +45,10 @@ type nfaState struct {
groupEnd bool // Whether or not the node ends a capturing group
groupNum int // Which capturing group the node starts / ends
// The following properties depend on the current match - I should think about resetting them for every match.
zeroMatchFound bool // Whether or not the state has been used for a zero-length match - only relevant for zero states
threadGroups []Group // Assuming that a state is part of a 'thread' in the matching process, this array stores the indices of capturing groups in the current thread. As matches are found for this state, its groups will be copied over.
threadGroups []Group // Assuming that a state is part of a 'thread' in the matching process, this array stores the indices of capturing groups in the current thread. As matches are found for this state, its groups will be copied over.
isBackreference bool // Whether or not current node is backreference
referredGroup int // If current node is a backreference, the node that it points to
threadBackref int // If current node is a backreference, how many characters to look forward into the referred group
}
// Clones the NFA starting from the given state.
@@ -76,7 +78,6 @@ func cloneStateHelper(stateToClone *nfaState, cloneMap map[*nfaState]*nfaState)
isQuestion: stateToClone.isQuestion,
isAlternation: stateToClone.isAlternation,
assert: stateToClone.assert,
zeroMatchFound: stateToClone.zeroMatchFound,
allChars: stateToClone.allChars,
except: append([]rune{}, stateToClone.except...),
lookaroundRegex: stateToClone.lookaroundRegex,
@@ -122,6 +123,7 @@ func resetThreadsHelper(state *nfaState, visitedMap map[*nfaState]bool) {
}
// Assuming it hasn't been visited
state.threadGroups = nil
state.threadBackref = 0
visitedMap[state] = true
if state.isAlternation {
resetThreadsHelper(state.next, visitedMap)
@@ -428,7 +430,8 @@ func (s nfaState) equals(other nfaState) bool {
s.groupBegin == other.groupBegin &&
s.groupEnd == other.groupEnd &&
s.groupNum == other.groupNum &&
slices.Equal(s.threadGroups, other.threadGroups)
slices.Equal(s.threadGroups, other.threadGroups) &&
s.threadBackref == other.threadBackref
}
func stateExists(list []nfaState, s nfaState) bool {

View File

@@ -1,6 +1,8 @@
package regex
import "fmt"
import (
"fmt"
)
type nodeType int
@@ -20,6 +22,7 @@ const (
assertionNode
lparenNode
rparenNode
backreferenceNode
)
// Helper constants for lookarounds
@@ -31,15 +34,16 @@ const lookbehind = -1
var infinite_reps int = -1 // Represents infinite reps eg. the end range in {5,}
// This represents a node in the postfix representation of the expression
type postfixNode struct {
nodetype nodeType
contents []rune // Contents of the node
startReps int // Minimum number of times the node should be repeated - used with numeric specifiers
endReps int // Maximum number of times the node should be repeated - used with numeric specifiers
allChars bool // Whether or not the current node represents all characters (eg. dot metacharacter)
except []postfixNode // For inverted character classes, we match every unicode character _except_ a few. In this case, allChars is true and the exceptions are placed here.
lookaroundSign int // ONLY USED WHEN nodetype == ASSERTION. Whether we have a positive or negative lookaround.
lookaroundDir int // Lookbehind or lookahead
nodeContents []postfixNode // ONLY USED WHEN nodetype == CHARCLASS. Holds all the nodes inside the given CHARCLASS node.
nodetype nodeType
contents []rune // Contents of the node
startReps int // Minimum number of times the node should be repeated - used with numeric specifiers
endReps int // Maximum number of times the node should be repeated - used with numeric specifiers
allChars bool // Whether or not the current node represents all characters (eg. dot metacharacter)
except []postfixNode // For inverted character classes, we match every unicode character _except_ a few. In this case, allChars is true and the exceptions are placed here.
lookaroundSign int // ONLY USED WHEN nodetype == ASSERTION. Whether we have a positive or negative lookaround.
lookaroundDir int // Lookbehind or lookahead
nodeContents []postfixNode // ONLY USED WHEN nodetype == CHARCLASS. Holds all the nodes inside the given CHARCLASS node.
referencedGroup int // ONLY USED WHEN nodetype == backreferenceNode. Holds the group which this one refers to. After parsing is done, the expression will be rewritten eg. (a)\1 will become (a)(a). So the return value of ShuntingYard() shouldn't contain a backreferenceNode.
}
// Converts the given list of postfixNodes to one node of type CHARCLASS.
@@ -208,3 +212,44 @@ func newPostfixCharNode(contents ...rune) postfixNode {
toReturn.contents = append(toReturn.contents, contents...)
return toReturn
}
// newPostfixBackreferenceNode creates and returns a backreference node, referring to the given group
func newPostfixBackreferenceNode(referred int) postfixNode {
toReturn := postfixNode{}
toReturn.startReps = 1
toReturn.endReps = 1
toReturn.nodetype = backreferenceNode
toReturn.referencedGroup = referred
return toReturn
}
// rewriteBackreferences rewrites any backreferences in the given postfixNode slice, into their respective groups.
// It stores the relation in a map, and returns it as the second return value.
// It uses parenIndices to determine where a group starts and ends in nodes.
// For example, \1(a) will be rewritten into (a)(a), and 1 -> 2 will be the hashmap value.
// It returns an error if a backreference points to an invalid group.
// func rewriteBackreferences(nodes []postfixNode, parenIndices []Group) ([]postfixNode, map[int]int, error) {
// rtv := make([]postfixNode, 0)
// referMap := make(map[int]int)
// numGroups := 0
// groupIncrement := 0 // If we have a backreference before the group its referring to, then the group its referring to will have its group number incremented.
// for i, node := range nodes {
// if node.nodetype == backreferenceNode {
// if node.referencedGroup >= len(parenIndices) {
// return nil, nil, fmt.Errorf("invalid backreference")
// }
// rtv = slices.Concat(rtv, nodes[parenIndices[node.referencedGroup].StartIdx:parenIndices[node.referencedGroup].EndIdx+1]) // Add all the nodes in the group to rtv
// numGroups += 1
// if i < parenIndices[node.referencedGroup].StartIdx {
// groupIncrement += 1
// }
// referMap[numGroups] = node.referencedGroup + groupIncrement
// } else {
// rtv = append(rtv, node)
// if node.nodetype == lparenNode {
// numGroups += 1
// }
// }
// }
// return rtv, referMap, nil
// }

View File

@@ -179,7 +179,7 @@ var reTests = []struct {
{"[[:graph:]]+", nil, "abcdefghijklmnopqrstuvwyxzABCDEFGHIJKLMNOPRQSTUVWXYZ0123456789!@#$%^&*", []Group{{0, 70}}},
// Test cases from Python's RE test suite
{`[\1]`, nil, "\x01", []Group{{0, 1}}},
{`[\01]`, nil, "\x01", []Group{{0, 1}}},
{`\0`, nil, "\x00", []Group{{0, 1}}},
{`[\0a]`, nil, "\x00", []Group{{0, 1}}},
@@ -194,7 +194,7 @@ var reTests = []struct {
{`\x00ffffffffffffff`, nil, "\xff", []Group{}},
{`\x00f`, nil, "\x0f", []Group{}},
{`\x00fe`, nil, "\xfe", []Group{}},
{`^\w+=(\\[\000-\277]|[^\n\\])*`, nil, "SRC=eval.c g.c blah blah blah \\\\\n\tapes.c", []Group{{0, 32}}},
{`^\w+=(\\[\000-\0277]|[^\n\\])*`, nil, "SRC=eval.c g.c blah blah blah \\\\\n\tapes.c", []Group{{0, 32}}},
{`a.b`, nil, `acb`, []Group{{0, 3}}},
{`a.b`, nil, "a\nb", []Group{}},
@@ -312,11 +312,7 @@ var reTests = []struct {
{`a[-]?c`, nil, `ac`, []Group{{0, 2}}},
{`^(.+)?B`, nil, `AB`, []Group{{0, 2}}},
{`\0009`, nil, "\x009", []Group{{0, 2}}},
{`\141`, nil, "a", []Group{{0, 1}}},
// At this point, the python test suite has a bunch
// of backreference tests. Since my engine doesn't
// implement backreferences, I've skipped those tests.
{`\0141`, nil, "a", []Group{{0, 1}}},
{`*a`, nil, ``, nil},
{`(*)b`, nil, ``, nil},
@@ -433,7 +429,8 @@ var reTests = []struct {
{`a[-]?c`, []ReFlag{RE_CASE_INSENSITIVE}, `AC`, []Group{{0, 2}}},
{`^(.+)?B`, []ReFlag{RE_CASE_INSENSITIVE}, `ab`, []Group{{0, 2}}},
{`\0009`, []ReFlag{RE_CASE_INSENSITIVE}, "\x009", []Group{{0, 2}}},
{`\141`, []ReFlag{RE_CASE_INSENSITIVE}, "A", []Group{{0, 1}}},
{`\0141`, []ReFlag{RE_CASE_INSENSITIVE}, "A", []Group{{0, 1}}},
{`\0141\0141`, []ReFlag{RE_CASE_INSENSITIVE}, "AA", []Group{{0, 2}}},
{`a[-]?c`, []ReFlag{RE_CASE_INSENSITIVE}, `AC`, []Group{{0, 2}}},
@@ -464,8 +461,10 @@ var reTests = []struct {
{`[\D5]+`, nil, `1234abc5678`, []Group{{4, 8}}},
{`[\da-fA-F]+`, nil, `123abc`, []Group{{0, 6}}},
{`\xff`, nil, "\u00ff", []Group{{0, 1}}},
{`\xff+`, nil, "\u00ff\u00ff", []Group{{0, 2}}},
{`\xFF`, nil, "\u00ff", []Group{{0, 1}}},
{`\x00ff`, nil, "\u00ff", []Group{}},
{`\x{0000ff}+`, nil, "\u00ff\u00ff", []Group{{0, 2}}},
{`\x{0000ff}`, nil, "\u00ff", []Group{{0, 1}}},
{`\x{0000FF}`, nil, "\u00ff", []Group{{0, 1}}},
{"\t\n\v\r\f\a", nil, "\t\n\v\r\f\a", []Group{{0, 6}}},
@@ -473,7 +472,7 @@ var reTests = []struct {
{`[\t][\n][\v][\r][\f][\b]`, nil, "\t\n\v\r\f\b", []Group{{0, 6}}},
{`.*d`, nil, "abc\nabd", []Group{{4, 7}}},
{`(`, nil, "-", nil},
{`[\41]`, nil, `!`, []Group{{0, 1}}},
{`[\041]`, nil, `!`, []Group{{0, 1}}},
{`(?<!abc)(d.f)`, nil, `abcdefdof`, []Group{{6, 9}}},
{`[\w-]+`, nil, `laser_beam`, []Group{{0, 10}}},
{`M+`, []ReFlag{RE_CASE_INSENSITIVE}, `MMM`, []Group{{0, 3}}},
@@ -520,6 +519,14 @@ var reTests = []struct {
{`<389-400`, nil, `-`, nil},
{`<389-400>`, nil, `391`, []Group{{0, 3}}},
{`\b<1-10000>\b`, nil, `America declared independence in 1776.`, []Group{{33, 37}}},
{`\p{Tamil}+`, nil, `உயிரெழுத்து`, []Group{{0, 11}}}, // Each letter and matra is counted as a separate rune, so 'u', 'ya', 'e (matra), 'ra', 'e (matra)', 'zha', (oo (matra), 'tha', 'ith', 'tha', 'oo (matra)'.
{`\P{Tamil}+`, nil, `vowel=உயிரெழுத்து`, []Group{{0, 6}}},
{`\P`, nil, `உயிரெழுத்து`, nil},
{`\PM\pM*`, nil, `உயிரெழுத்து`, []Group{{0, 1}, {1, 3}, {3, 5}, {5, 7}, {7, 9}, {9, 11}}},
{`\pN+`, nil, `123abc456def`, []Group{{0, 3}, {6, 9}}},
{`\PN+`, nil, `123abc456def`, []Group{{3, 6}, {9, 12}}},
{`[\p{Greek}\p{Cyrillic}]`, nil, `ΣωШД`, []Group{{0, 1}, {1, 2}, {2, 3}, {3, 4}}},
}
var groupTests = []struct {
@@ -581,13 +588,37 @@ var groupTests = []struct {
{`(bc+d$|ef*g.|h?i(j|k))`, nil, `bcdd`, []Match{}},
{`(bc+d$|ef*g.|h?i(j|k))`, nil, `reffgz`, []Match{[]Group{{1, 6}, {1, 6}}}},
{`(((((((((a)))))))))`, nil, `a`, []Match{[]Group{{0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}},
{`(((((((((a)))))))))\41`, nil, `a!`, []Match{[]Group{{0, 2}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}},
{`(((((((((a)))))))))\041`, nil, `a!`, []Match{[]Group{{0, 2}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}},
{`(.*)c(.*)`, nil, `abcde`, []Match{[]Group{{0, 5}, {0, 2}, {3, 5}}}},
{`\((.*), (.*)\)`, nil, `(a, b)`, []Match{[]Group{{0, 6}, {1, 2}, {4, 5}}}},
// At this point, the python test suite has a bunch
// of backreference tests. Since my engine doesn't
// implement backreferences, I've skipped those tests.
// Backreference tests
{`(abc)\1`, nil, `abcabc`, []Match{[]Group{{0, 6}, {0, 3}}}},
{`([a-c]+)\1`, nil, `abcabc`, []Match{[]Group{{0, 6}, {0, 3}}}},
{`([a-c]*)\1`, nil, `abcabc`, []Match{[]Group{{0, 6}, {0, 3}}, []Group{{6, 6}, {6, 6}}}},
{`^(.+)?B`, nil, `AB`, []Match{[]Group{{0, 2}, {0, 1}}}},
{`(a+).\1$`, nil, `aaaaa`, []Match{[]Group{{0, 5}, {0, 2}}}},
{`^(a+).\1$`, nil, `aaaa`, []Match{}},
{`(a)\1`, nil, `aa`, []Match{[]Group{{0, 2}, {0, 1}}}},
{`(a+)\1`, nil, `aa`, []Match{[]Group{{0, 2}, {0, 1}}}},
{`(a+)+\1`, nil, `aa`, []Match{[]Group{{0, 2}, {0, 1}}}},
{`(a).+\1`, nil, `aba`, []Match{[]Group{{0, 3}, {0, 1}}}},
{`(a)ba*\1`, nil, `aba`, []Match{[]Group{{0, 3}, {0, 1}}}},
{`(aa|a)a\1$`, nil, `aaa`, []Match{[]Group{{0, 3}, {0, 1}}}},
{`(a|aa)a\1$`, nil, `aaa`, []Match{[]Group{{0, 3}, {0, 1}}}},
{`(a+)a\1$`, nil, `aaa`, []Match{[]Group{{0, 3}, {0, 1}}}},
{`([abc]*)\1`, nil, `abcabc`, []Match{[]Group{{0, 6}, {0, 3}}, []Group{{6, 6}, {6, 6}}}},
{`(a)(?:b)\1`, nil, `aba`, []Match{[]Group{{0, 3}, {0, 1}}}},
{`(a)(?:b)\1`, nil, `abb`, []Match{}},
{`(?:a)(b)\1`, nil, `aba`, []Match{}},
{`(?:a)(b)\1`, nil, `abb`, []Match{[]Group{{0, 3}, {1, 2}}}},
{`(?:(cat)|(dog))\2`, nil, `catdog`, []Match{}},
{`(?:a)\1`, nil, `aa`, nil},
{`((cat)|(dog)|(cow)|(bat))\4`, nil, `cowcow`, []Match{[]Group{{0, 6}, {0, 3}, {-1, -1}, {-1, -1}, {0, 3}, {-1, -1}}}},
{`(a|b)*\1`, nil, `abb`, []Match{[]Group{{0, 3}, {1, 2}}}},
{`(a|b)*\1`, nil, `aba`, []Match{}},
{`(a|b)*\1`, nil, `bab`, []Match{}},
{`(a|b)*\1`, nil, `baa`, []Match{[]Group{{0, 3}, {1, 2}}}},
{`(a)(b)c|ab`, nil, `ab`, []Match{[]Group{{0, 2}}}},
{`(a)+x`, nil, `aaax`, []Match{[]Group{{0, 4}, {2, 3}}}},
@@ -636,7 +667,7 @@ var groupTests = []struct {
{`(bc+d$|ef*g.|h?i(j|k))`, []ReFlag{RE_CASE_INSENSITIVE}, `BCDD`, []Match{}},
{`(bc+d$|ef*g.|h?i(j|k))`, []ReFlag{RE_CASE_INSENSITIVE}, `reffgz`, []Match{[]Group{{1, 6}, {1, 6}}}},
{`(((((((((a)))))))))`, []ReFlag{RE_CASE_INSENSITIVE}, `A`, []Match{[]Group{{0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}},
{`(((((((((a)))))))))\41`, []ReFlag{RE_CASE_INSENSITIVE}, `A!`, []Match{[]Group{{0, 2}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}},
{`(((((((((a)))))))))\041`, []ReFlag{RE_CASE_INSENSITIVE}, `A!`, []Match{[]Group{{0, 2}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}}}},
{`(.*)c(.*)`, []ReFlag{RE_CASE_INSENSITIVE}, `ABCDE`, []Match{[]Group{{0, 5}, {0, 2}, {3, 5}}}},
{`\((.*), (.*)\)`, []ReFlag{RE_CASE_INSENSITIVE}, `(A, B)`, []Match{[]Group{{0, 6}, {1, 2}, {4, 5}}}},
{`(a)(b)c|ab`, []ReFlag{RE_CASE_INSENSITIVE}, `AB`, []Match{[]Group{{0, 2}}}},
@@ -792,23 +823,24 @@ func TestFindSubmatch(t *testing.T) {
if test.result != nil {
panic(err)
}
}
match, err := regComp.FindSubmatch(test.str)
if err != nil {
if len(test.result) != 0 {
t.Errorf("Wanted %v got no match\n", test.result[0])
}
} else if len(test.result) == 0 {
t.Errorf("Wanted no match got %v\n", match)
}
for i := range match {
if match[i].IsValid() {
if test.result[0][i] != match[i] {
t.Errorf("Wanted %v Got %v\n", test.result[0], match)
} else {
match, err := regComp.FindSubmatch(test.str)
if err != nil {
if len(test.result) != 0 {
t.Errorf("Wanted %v got no match\n", test.result[0])
}
} else {
if i < len(test.result) && test.result[0][i].IsValid() {
t.Errorf("Wanted %v Got %v\n", test.result[0], match)
} else if len(test.result) == 0 {
t.Errorf("Wanted no match got %v\n", match)
}
for i := range match {
if match[i].IsValid() {
if test.result[0][i] != match[i] {
t.Errorf("Wanted %v Got %v\n", test.result[0], match)
}
} else {
if i < len(test.result) && test.result[0][i].IsValid() {
t.Errorf("Wanted %v Got %v\n", test.result[0], match)
}
}
}
}
@@ -823,10 +855,22 @@ func TestFindStringSubmatch(t *testing.T) {
if test.result != nil {
panic(err)
}
}
matchStr := regComp.FindStringSubmatch(test.str)
if matchStr == nil {
if len(test.result) != 0 {
} else {
matchStr := regComp.FindStringSubmatch(test.str)
if matchStr == nil {
if len(test.result) != 0 {
expectedStr := funcMap(test.result[0], func(g Group) string {
if g.IsValid() {
return test.str[g.StartIdx:g.EndIdx]
} else {
return ""
}
})
t.Errorf("Wanted %v got no match\n", expectedStr)
}
} else if len(test.result) == 0 {
t.Errorf("Wanted no match got %v\n", matchStr)
} else {
expectedStr := funcMap(test.result[0], func(g Group) string {
if g.IsValid() {
return test.str[g.StartIdx:g.EndIdx]
@@ -834,26 +878,15 @@ func TestFindStringSubmatch(t *testing.T) {
return ""
}
})
t.Errorf("Wanted %v got no match\n", expectedStr)
}
} else if len(test.result) == 0 {
t.Errorf("Wanted no match got %v\n", matchStr)
} else {
expectedStr := funcMap(test.result[0], func(g Group) string {
if g.IsValid() {
return test.str[g.StartIdx:g.EndIdx]
} else {
return ""
}
})
for i, groupStr := range matchStr {
if groupStr == "" {
if i < len(expectedStr) && expectedStr[i] != "" {
t.Errorf("Wanted %v Got %v\n", expectedStr, matchStr)
}
} else {
if expectedStr[i] != groupStr {
t.Errorf("Wanted %v Got %v\n", expectedStr, matchStr)
for i, groupStr := range matchStr {
if groupStr == "" {
if i < len(expectedStr) && expectedStr[i] != "" {
t.Errorf("Wanted %v Got %v\n", expectedStr, matchStr)
}
} else {
if expectedStr[i] != groupStr {
t.Errorf("Wanted %v Got %v\n", expectedStr, matchStr)
}
}
}
}
@@ -870,10 +903,24 @@ func TestFindAllStringSubmatch(t *testing.T) {
if test.result != nil {
panic(err)
}
}
matchStrs := regComp.FindAllStringSubmatch(test.str)
if matchStrs == nil {
if len(test.result) != 0 {
} else {
matchStrs := regComp.FindAllStringSubmatch(test.str)
if matchStrs == nil {
if len(test.result) != 0 {
expectedStrs := funcMap(test.result, func(m Match) []string {
return funcMap(m, func(g Group) string {
if g.IsValid() {
return test.str[g.StartIdx:g.EndIdx]
} else {
return ""
}
})
})
t.Errorf("Wanted %v got no match\n", expectedStrs)
}
} else if len(test.result) == 0 {
t.Errorf("Wanted no match got %v\n", matchStrs)
} else {
expectedStrs := funcMap(test.result, func(m Match) []string {
return funcMap(m, func(g Group) string {
if g.IsValid() {
@@ -883,29 +930,16 @@ func TestFindAllStringSubmatch(t *testing.T) {
}
})
})
t.Errorf("Wanted %v got no match\n", expectedStrs)
}
} else if len(test.result) == 0 {
t.Errorf("Wanted no match got %v\n", matchStrs)
} else {
expectedStrs := funcMap(test.result, func(m Match) []string {
return funcMap(m, func(g Group) string {
if g.IsValid() {
return test.str[g.StartIdx:g.EndIdx]
} else {
return ""
}
})
})
for i, matchStr := range matchStrs {
for j, groupStr := range matchStr {
if groupStr == "" {
if j < len(expectedStrs[i]) && expectedStrs[i][j] != "" {
t.Errorf("Wanted %v Got %v\n", expectedStrs, matchStrs)
}
} else {
if expectedStrs[i][j] != groupStr {
t.Errorf("Wanted %v Got %v\n", expectedStrs, matchStrs)
for i, matchStr := range matchStrs {
for j, groupStr := range matchStr {
if groupStr == "" {
if j < len(expectedStrs[i]) && expectedStrs[i][j] != "" {
t.Errorf("Wanted %v Got %v\n", expectedStrs, matchStrs)
}
} else {
if expectedStrs[i][j] != groupStr {
t.Errorf("Wanted %v Got %v\n", expectedStrs, matchStrs)
}
}
}
}
@@ -923,17 +957,18 @@ func TestFindAllSubmatch(t *testing.T) {
if test.result != nil {
panic(err)
}
}
matchIndices := regComp.FindAllSubmatch(test.str)
for i := range matchIndices {
for j := range matchIndices[i] {
if matchIndices[i][j].IsValid() {
if test.result[i][j] != matchIndices[i][j] {
t.Errorf("Wanted %v Got %v\n", test.result, matchIndices)
}
} else {
if i < len(test.result) && j < len(test.result[i]) && test.result[i][j].IsValid() {
t.Errorf("Wanted %v Got %v\n", test.result, matchIndices)
} else {
matchIndices := regComp.FindAllSubmatch(test.str)
for i := range matchIndices {
for j := range matchIndices[i] {
if matchIndices[i][j].IsValid() {
if test.result[i][j] != matchIndices[i][j] {
t.Errorf("Wanted %v Got %v\n", test.result, matchIndices)
}
} else {
if i < len(test.result) && j < len(test.result[i]) && test.result[i][j].IsValid() {
t.Errorf("Wanted %v Got %v\n", test.result, matchIndices)
}
}
}
}