From 8327450dd2ed52763a0b2b62abddb71d58299f1a Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Tue, 11 Feb 2025 16:14:48 -0500 Subject: [PATCH 1/6] Started implementing backreferences (octal values should now be prefaced with \0) --- regex/compile.go | 59 +++++++++++++++++++++++++++++++++++----- regex/matching.go | 46 +++++++++++++++++++++++-------- regex/nfa.go | 11 +++++--- regex/postfixNode.go | 65 +++++++++++++++++++++++++++++++++++++------- regex/re_test.go | 14 +++++----- 5 files changed, 156 insertions(+), 39 deletions(-) diff --git a/regex/compile.go b/regex/compile.go index 0ae3d6b..0414ac8 100644 --- a/regex/compile.go +++ b/regex/compile.go @@ -313,13 +313,20 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) { } else { return nil, fmt.Errorf("invalid hex value in expression") } - } else if isOctal(re_runes[i]) { + } else if re_runes[i] == '0' { // Start of octal value numDigits := 1 - for i+numDigits < len(re_runes) && numDigits < 3 && isOctal(re_runes[i+numDigits]) { // Skip while we see an octal character (max of 3) + for i+numDigits < len(re_runes) && numDigits < 4 && isOctal(re_runes[i+numDigits]) { // Skip while we see an octal character (max of 4, starting with 0) numDigits++ } re_postfix = append(re_postfix, re_runes[i:i+numDigits]...) i += (numDigits - 1) // I have to move back a step, so that I can add a concatenation operator if necessary, and so that the increment at the bottom of the loop works as intended + } else if unicode.IsDigit(re_runes[i]) { // Any other number - backreference + numDigits := 1 + for i+numDigits < len(re_runes) && unicode.IsDigit(re_runes[i+numDigits]) { // Skip while we see a digit + numDigits++ + } + re_postfix = append(re_postfix, re_runes[i:i+numDigits]...) + i += (numDigits - 1) // Move back a step to add concatenation operator } else { re_postfix = append(re_postfix, re_runes[i]) } @@ -364,7 +371,9 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) { outQueue := make([]postfixNode, 0) // Output queue // Actual algorithm - numOpenParens := 0 // Number of open parentheses + numOpenParens := 0 // Number of open parentheses + parenIndices := make([]Group, 0) // I really shouldn't be using Group here, because that's strictly for matching purposes, but its a convenient way to store the indices of the opening and closing parens. + parenIndices = append(parenIndices, Group{0, 0}) // I append a weird value here, because the 0-th group doesn't have any parens. This way, the 1st group will be at index 1, 2nd at 2 ... for i := 0; i < len(re_postfix); i++ { /* Two cases: 1. Current character is alphanumeric - send to output queue @@ -420,11 +429,11 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) { } else { return nil, fmt.Errorf("not enough hex characters found in expression") } - } else if isOctal(re_postfix[i]) { // Octal value + } else if re_postfix[i] == '0' { // Octal value var octVal int64 var octValStr string numDigitsParsed := 0 - for (i+numDigitsParsed) < len(re_postfix) && isOctal(re_postfix[i+numDigitsParsed]) && numDigitsParsed <= 3 { + for (i+numDigitsParsed) < len(re_postfix) && isOctal(re_postfix[i+numDigitsParsed]) && numDigitsParsed <= 4 { octValStr += string(re_postfix[i+numDigitsParsed]) numDigitsParsed++ } @@ -437,6 +446,20 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) { } i += numDigitsParsed - 1 // Shift forward by the number of digits that were parsed. Move back one character, because the loop increment will move us back to the next character automatically outQueue = append(outQueue, newPostfixCharNode(rune(octVal))) + } else if unicode.IsDigit(re_postfix[i]) { // Backreference + var num int64 + var numStr string + numDigitsParsed := 0 + for (i+numDigitsParsed) < len(re_postfix) && unicode.IsDigit(re_postfix[i+numDigitsParsed]) { + numStr += string(re_postfix[i+numDigitsParsed]) + numDigitsParsed++ + } + num, err := strconv.ParseInt(numStr, 10, 32) + if err != nil { + return nil, fmt.Errorf("error parsing backreference in expresion") + } + i += numDigitsParsed - 1 + outQueue = append(outQueue, newPostfixBackreferenceNode(int(num))) } else { escapedNode, err := newEscapedNode(re_postfix[i], false) if err != nil { @@ -588,11 +611,11 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) { } else { return nil, fmt.Errorf("not enough hex characters found in character class") } - } else if isOctal(re_postfix[i]) { // Octal value + } else if re_postfix[i] == '0' { // Octal value var octVal int64 var octValStr string numDigitsParsed := 0 - for (i+numDigitsParsed) < len(re_postfix)-1 && isOctal(re_postfix[i+numDigitsParsed]) && numDigitsParsed <= 3 { // The '-1' exists, because even in the worst case (the character class extends till the end), the last character must be a closing bracket (and nothing else) + for (i+numDigitsParsed) < len(re_postfix)-1 && isOctal(re_postfix[i+numDigitsParsed]) && numDigitsParsed <= 4 { // The '-1' exists, because even in the worst case (the character class extends till the end), the last character must be a closing bracket (and nothing else) octValStr += string(re_postfix[i+numDigitsParsed]) numDigitsParsed++ } @@ -796,6 +819,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) { outQueue = append(outQueue, newPostfixNode(c)) } numOpenParens++ + parenIndices = append(parenIndices, Group{StartIdx: len(outQueue) - 1}) // Push the index of the lparen into parenIndices } if c == ')' { // Keep popping from opStack until we encounter an opening parantheses or a NONCAPLPAREN_CHAR. Throw error if we reach the end of the stack. @@ -812,6 +836,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) { if val == '(' { // Whatever was inside the parentheses was a _capturing_ group, so we append the closing parentheses as well outQueue = append(outQueue, newPostfixNode(')')) // Add closing parentheses } + parenIndices[numOpenParens].EndIdx = len(outQueue) - 1 numOpenParens-- } } @@ -826,6 +851,11 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) { return nil, fmt.Errorf("imbalanced parantheses") } + // outQueue, _, err := rewriteBackreferences(outQueue, parenIndices) + // if err != nil { + // return nil, err + // } + return outQueue, nil } @@ -1037,6 +1067,21 @@ func thompson(re []postfixNode) (Reg, error) { }) nfa = append(nfa, toAdd) } + if c.nodetype == backreferenceNode { + if c.referencedGroup > numGroups { + return Reg{}, fmt.Errorf("invalid backreference") + } + stateToAdd := &nfaState{} + stateToAdd.assert = noneAssert + stateToAdd.content = newContents(epsilon) + stateToAdd.isEmpty = true + stateToAdd.isBackreference = true + stateToAdd.output = make([]*nfaState, 0) + stateToAdd.output = append(stateToAdd.output, stateToAdd) + stateToAdd.referredGroup = c.referencedGroup + stateToAdd.threadBackref = 0 + nfa = append(nfa, stateToAdd) + } // Must be an operator if it isn't a character switch c.nodetype { case concatenateNode: diff --git a/regex/matching.go b/regex/matching.go index a344a40..230a658 100644 --- a/regex/matching.go +++ b/regex/matching.go @@ -228,25 +228,45 @@ func (re Reg) FindAllStringSubmatch(str string) [][]string { return rtv } -func addStateToList(str []rune, idx int, list []nfaState, state nfaState, threadGroups []Group, visited []nfaState, preferLongest bool) []nfaState { +// Second parameter is the 'new index' +func addStateToList(str []rune, idx int, list []nfaState, state nfaState, threadGroups []Group, visited []nfaState, preferLongest bool) ([]nfaState, int) { if stateExists(list, state) || stateExists(visited, state) { - return list + return list, idx } visited = append(visited, state) + if state.isBackreference { + if threadGroups[state.referredGroup].IsValid() { + groupLength := threadGroups[state.referredGroup].EndIdx - threadGroups[state.referredGroup].StartIdx + if state.threadBackref == groupLength { + state.threadBackref = 0 + copyThread(state.next, state) + return addStateToList(str, idx+groupLength, list, *state.next, threadGroups, visited, preferLongest) + } + idxInReferredGroup := threadGroups[state.referredGroup].StartIdx + state.threadBackref + if idxInReferredGroup < len(str) && idx+state.threadBackref < len(str) && str[idxInReferredGroup] == str[idx+state.threadBackref] { + state.threadBackref += 1 + return addStateToList(str, idx, list, state, threadGroups, visited, preferLongest) + } else { + return list, idx + } + } else { + return list, idx + } + } if state.isKleene || state.isQuestion { copyThread(state.splitState, state) - list = addStateToList(str, idx, list, *state.splitState, threadGroups, visited, preferLongest) + list, newIdx := addStateToList(str, idx, list, *state.splitState, threadGroups, visited, preferLongest) copyThread(state.next, state) - list = addStateToList(str, idx, list, *state.next, threadGroups, visited, preferLongest) - return list + list, newIdx = addStateToList(str, newIdx, list, *state.next, threadGroups, visited, preferLongest) + return list, newIdx } if state.isAlternation { copyThread(state.next, state) - list = addStateToList(str, idx, list, *state.next, threadGroups, visited, preferLongest) + list, newIdx := addStateToList(str, idx, list, *state.next, threadGroups, visited, preferLongest) copyThread(state.splitState, state) - list = addStateToList(str, idx, list, *state.splitState, threadGroups, visited, preferLongest) - return list + list, newIdx = addStateToList(str, newIdx, list, *state.splitState, threadGroups, visited, preferLongest) + return list, newIdx } state.threadGroups = append([]Group{}, threadGroups...) if state.assert != noneAssert { @@ -257,13 +277,15 @@ func addStateToList(str []rune, idx int, list []nfaState, state nfaState, thread } if state.groupBegin { state.threadGroups[state.groupNum].StartIdx = idx + copyThread(state.next, state) return addStateToList(str, idx, list, *state.next, state.threadGroups, visited, preferLongest) } if state.groupEnd { state.threadGroups[state.groupNum].EndIdx = idx + copyThread(state.next, state) return addStateToList(str, idx, list, *state.next, state.threadGroups, visited, preferLongest) } - return append(list, state) + return append(list, state), idx } @@ -293,7 +315,7 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in start.threadGroups = newMatch(numGroups + 1) start.threadGroups[0].StartIdx = i - currentStates = addStateToList(str, i, currentStates, *start, start.threadGroups, nil, preferLongest) + currentStates, _ = addStateToList(str, i, currentStates, *start, start.threadGroups, nil, preferLongest) // We can't go forward at the beginning, so I discard the second retval var match Match = nil for idx := i; idx <= len(str); idx++ { if len(currentStates) == 0 { @@ -315,7 +337,9 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in } } else if !currentState.isAlternation && !currentState.isKleene && !currentState.isQuestion && !currentState.groupBegin && !currentState.groupEnd && currentState.assert == noneAssert { // Normal character if currentState.contentContains(str, idx, preferLongest) { - nextStates = addStateToList(str, idx+1, nextStates, *currentState.next, currentState.threadGroups, nil, preferLongest) + var newIdx int + nextStates, newIdx = addStateToList(str, idx+1, nextStates, *currentState.next, currentState.threadGroups, nil, preferLongest) + idx = newIdx - 1 } } } diff --git a/regex/nfa.go b/regex/nfa.go index c649712..8f454cf 100644 --- a/regex/nfa.go +++ b/regex/nfa.go @@ -45,8 +45,10 @@ type nfaState struct { groupEnd bool // Whether or not the node ends a capturing group groupNum int // Which capturing group the node starts / ends // The following properties depend on the current match - I should think about resetting them for every match. - zeroMatchFound bool // Whether or not the state has been used for a zero-length match - only relevant for zero states - threadGroups []Group // Assuming that a state is part of a 'thread' in the matching process, this array stores the indices of capturing groups in the current thread. As matches are found for this state, its groups will be copied over. + threadGroups []Group // Assuming that a state is part of a 'thread' in the matching process, this array stores the indices of capturing groups in the current thread. As matches are found for this state, its groups will be copied over. + isBackreference bool // Whether or not current node is backreference + referredGroup int // If current node is a backreference, the node that it points to + threadBackref int // If current node is a backreference, how many characters to look forward into the referred group } // Clones the NFA starting from the given state. @@ -76,7 +78,6 @@ func cloneStateHelper(stateToClone *nfaState, cloneMap map[*nfaState]*nfaState) isQuestion: stateToClone.isQuestion, isAlternation: stateToClone.isAlternation, assert: stateToClone.assert, - zeroMatchFound: stateToClone.zeroMatchFound, allChars: stateToClone.allChars, except: append([]rune{}, stateToClone.except...), lookaroundRegex: stateToClone.lookaroundRegex, @@ -122,6 +123,7 @@ func resetThreadsHelper(state *nfaState, visitedMap map[*nfaState]bool) { } // Assuming it hasn't been visited state.threadGroups = nil + state.threadBackref = 0 visitedMap[state] = true if state.isAlternation { resetThreadsHelper(state.next, visitedMap) @@ -428,7 +430,8 @@ func (s nfaState) equals(other nfaState) bool { s.groupBegin == other.groupBegin && s.groupEnd == other.groupEnd && s.groupNum == other.groupNum && - slices.Equal(s.threadGroups, other.threadGroups) + slices.Equal(s.threadGroups, other.threadGroups) && + s.threadBackref == other.threadBackref } func stateExists(list []nfaState, s nfaState) bool { diff --git a/regex/postfixNode.go b/regex/postfixNode.go index c60de47..88e4c62 100644 --- a/regex/postfixNode.go +++ b/regex/postfixNode.go @@ -1,6 +1,8 @@ package regex -import "fmt" +import ( + "fmt" +) type nodeType int @@ -20,6 +22,7 @@ const ( assertionNode lparenNode rparenNode + backreferenceNode ) // Helper constants for lookarounds @@ -31,15 +34,16 @@ const lookbehind = -1 var infinite_reps int = -1 // Represents infinite reps eg. the end range in {5,} // This represents a node in the postfix representation of the expression type postfixNode struct { - nodetype nodeType - contents []rune // Contents of the node - startReps int // Minimum number of times the node should be repeated - used with numeric specifiers - endReps int // Maximum number of times the node should be repeated - used with numeric specifiers - allChars bool // Whether or not the current node represents all characters (eg. dot metacharacter) - except []postfixNode // For inverted character classes, we match every unicode character _except_ a few. In this case, allChars is true and the exceptions are placed here. - lookaroundSign int // ONLY USED WHEN nodetype == ASSERTION. Whether we have a positive or negative lookaround. - lookaroundDir int // Lookbehind or lookahead - nodeContents []postfixNode // ONLY USED WHEN nodetype == CHARCLASS. Holds all the nodes inside the given CHARCLASS node. + nodetype nodeType + contents []rune // Contents of the node + startReps int // Minimum number of times the node should be repeated - used with numeric specifiers + endReps int // Maximum number of times the node should be repeated - used with numeric specifiers + allChars bool // Whether or not the current node represents all characters (eg. dot metacharacter) + except []postfixNode // For inverted character classes, we match every unicode character _except_ a few. In this case, allChars is true and the exceptions are placed here. + lookaroundSign int // ONLY USED WHEN nodetype == ASSERTION. Whether we have a positive or negative lookaround. + lookaroundDir int // Lookbehind or lookahead + nodeContents []postfixNode // ONLY USED WHEN nodetype == CHARCLASS. Holds all the nodes inside the given CHARCLASS node. + referencedGroup int // ONLY USED WHEN nodetype == backreferenceNode. Holds the group which this one refers to. After parsing is done, the expression will be rewritten eg. (a)\1 will become (a)(a). So the return value of ShuntingYard() shouldn't contain a backreferenceNode. } // Converts the given list of postfixNodes to one node of type CHARCLASS. @@ -208,3 +212,44 @@ func newPostfixCharNode(contents ...rune) postfixNode { toReturn.contents = append(toReturn.contents, contents...) return toReturn } + +// newPostfixBackreferenceNode creates and returns a backreference node, referring to the given group +func newPostfixBackreferenceNode(referred int) postfixNode { + toReturn := postfixNode{} + toReturn.startReps = 1 + toReturn.endReps = 1 + toReturn.nodetype = backreferenceNode + toReturn.referencedGroup = referred + return toReturn +} + +// rewriteBackreferences rewrites any backreferences in the given postfixNode slice, into their respective groups. +// It stores the relation in a map, and returns it as the second return value. +// It uses parenIndices to determine where a group starts and ends in nodes. +// For example, \1(a) will be rewritten into (a)(a), and 1 -> 2 will be the hashmap value. +// It returns an error if a backreference points to an invalid group. +// func rewriteBackreferences(nodes []postfixNode, parenIndices []Group) ([]postfixNode, map[int]int, error) { +// rtv := make([]postfixNode, 0) +// referMap := make(map[int]int) +// numGroups := 0 +// groupIncrement := 0 // If we have a backreference before the group its referring to, then the group its referring to will have its group number incremented. +// for i, node := range nodes { +// if node.nodetype == backreferenceNode { +// if node.referencedGroup >= len(parenIndices) { +// return nil, nil, fmt.Errorf("invalid backreference") +// } +// rtv = slices.Concat(rtv, nodes[parenIndices[node.referencedGroup].StartIdx:parenIndices[node.referencedGroup].EndIdx+1]) // Add all the nodes in the group to rtv +// numGroups += 1 +// if i < parenIndices[node.referencedGroup].StartIdx { +// groupIncrement += 1 +// } +// referMap[numGroups] = node.referencedGroup + groupIncrement +// } else { +// rtv = append(rtv, node) +// if node.nodetype == lparenNode { +// numGroups += 1 +// } +// } +// } +// return rtv, referMap, nil +// } diff --git a/regex/re_test.go b/regex/re_test.go index f697e81..05230a1 100644 --- a/regex/re_test.go +++ b/regex/re_test.go @@ -179,7 +179,7 @@ var reTests = []struct { {"[[:graph:]]+", nil, "abcdefghijklmnopqrstuvwyxzABCDEFGHIJKLMNOPRQSTUVWXYZ0123456789!@#$%^&*", []Group{{0, 70}}}, // Test cases from Python's RE test suite - {`[\1]`, nil, "\x01", []Group{{0, 1}}}, + {`[\01]`, nil, "\x01", []Group{{0, 1}}}, {`\0`, nil, "\x00", []Group{{0, 1}}}, {`[\0a]`, nil, "\x00", []Group{{0, 1}}}, @@ -194,7 +194,7 @@ var reTests = []struct { {`\x00ffffffffffffff`, nil, "\xff", []Group{}}, {`\x00f`, nil, "\x0f", []Group{}}, {`\x00fe`, nil, "\xfe", []Group{}}, - {`^\w+=(\\[\000-\277]|[^\n\\])*`, nil, "SRC=eval.c g.c blah blah blah \\\\\n\tapes.c", []Group{{0, 32}}}, + {`^\w+=(\\[\000-\0277]|[^\n\\])*`, nil, "SRC=eval.c g.c blah blah blah \\\\\n\tapes.c", []Group{{0, 32}}}, {`a.b`, nil, `acb`, []Group{{0, 3}}}, {`a.b`, nil, "a\nb", []Group{}}, @@ -312,7 +312,7 @@ var reTests = []struct { {`a[-]?c`, nil, `ac`, []Group{{0, 2}}}, {`^(.+)?B`, nil, `AB`, []Group{{0, 2}}}, {`\0009`, nil, "\x009", []Group{{0, 2}}}, - {`\141`, nil, "a", []Group{{0, 1}}}, + {`\0141`, nil, "a", []Group{{0, 1}}}, // At this point, the python test suite has a bunch // of backreference tests. Since my engine doesn't @@ -433,7 +433,7 @@ var reTests = []struct { {`a[-]?c`, []ReFlag{RE_CASE_INSENSITIVE}, `AC`, []Group{{0, 2}}}, {`^(.+)?B`, []ReFlag{RE_CASE_INSENSITIVE}, `ab`, []Group{{0, 2}}}, {`\0009`, []ReFlag{RE_CASE_INSENSITIVE}, "\x009", []Group{{0, 2}}}, - {`\141`, []ReFlag{RE_CASE_INSENSITIVE}, "A", []Group{{0, 1}}}, + {`\0141`, []ReFlag{RE_CASE_INSENSITIVE}, "A", []Group{{0, 1}}}, {`a[-]?c`, []ReFlag{RE_CASE_INSENSITIVE}, `AC`, []Group{{0, 2}}}, @@ -473,7 +473,7 @@ var reTests = []struct { {`[\t][\n][\v][\r][\f][\b]`, nil, "\t\n\v\r\f\b", []Group{{0, 6}}}, {`.*d`, nil, "abc\nabd", []Group{{4, 7}}}, {`(`, nil, "-", nil}, - {`[\41]`, nil, `!`, []Group{{0, 1}}}, + {`[\041]`, nil, `!`, []Group{{0, 1}}}, {`(? Date: Tue, 11 Feb 2025 17:06:39 -0500 Subject: [PATCH 2/6] More progress on backreference implementation --- regex/matching.go | 60 ++++++++++++++++++++--------------------------- 1 file changed, 26 insertions(+), 34 deletions(-) diff --git a/regex/matching.go b/regex/matching.go index 230a658..3dba3f0 100644 --- a/regex/matching.go +++ b/regex/matching.go @@ -228,45 +228,25 @@ func (re Reg) FindAllStringSubmatch(str string) [][]string { return rtv } -// Second parameter is the 'new index' -func addStateToList(str []rune, idx int, list []nfaState, state nfaState, threadGroups []Group, visited []nfaState, preferLongest bool) ([]nfaState, int) { +func addStateToList(str []rune, idx int, list []nfaState, state nfaState, threadGroups []Group, visited []nfaState, preferLongest bool) []nfaState { if stateExists(list, state) || stateExists(visited, state) { - return list, idx + return list } visited = append(visited, state) - if state.isBackreference { - if threadGroups[state.referredGroup].IsValid() { - groupLength := threadGroups[state.referredGroup].EndIdx - threadGroups[state.referredGroup].StartIdx - if state.threadBackref == groupLength { - state.threadBackref = 0 - copyThread(state.next, state) - return addStateToList(str, idx+groupLength, list, *state.next, threadGroups, visited, preferLongest) - } - idxInReferredGroup := threadGroups[state.referredGroup].StartIdx + state.threadBackref - if idxInReferredGroup < len(str) && idx+state.threadBackref < len(str) && str[idxInReferredGroup] == str[idx+state.threadBackref] { - state.threadBackref += 1 - return addStateToList(str, idx, list, state, threadGroups, visited, preferLongest) - } else { - return list, idx - } - } else { - return list, idx - } - } if state.isKleene || state.isQuestion { copyThread(state.splitState, state) - list, newIdx := addStateToList(str, idx, list, *state.splitState, threadGroups, visited, preferLongest) + list := addStateToList(str, idx, list, *state.splitState, threadGroups, visited, preferLongest) copyThread(state.next, state) - list, newIdx = addStateToList(str, newIdx, list, *state.next, threadGroups, visited, preferLongest) - return list, newIdx + list = addStateToList(str, idx, list, *state.next, threadGroups, visited, preferLongest) + return list } if state.isAlternation { copyThread(state.next, state) - list, newIdx := addStateToList(str, idx, list, *state.next, threadGroups, visited, preferLongest) + list := addStateToList(str, idx, list, *state.next, threadGroups, visited, preferLongest) copyThread(state.splitState, state) - list, newIdx = addStateToList(str, newIdx, list, *state.splitState, threadGroups, visited, preferLongest) - return list, newIdx + list = addStateToList(str, idx, list, *state.splitState, threadGroups, visited, preferLongest) + return list } state.threadGroups = append([]Group{}, threadGroups...) if state.assert != noneAssert { @@ -285,7 +265,7 @@ func addStateToList(str []rune, idx int, list []nfaState, state nfaState, thread copyThread(state.next, state) return addStateToList(str, idx, list, *state.next, state.threadGroups, visited, preferLongest) } - return append(list, state), idx + return append(list, state) } @@ -315,7 +295,7 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in start.threadGroups = newMatch(numGroups + 1) start.threadGroups[0].StartIdx = i - currentStates, _ = addStateToList(str, i, currentStates, *start, start.threadGroups, nil, preferLongest) // We can't go forward at the beginning, so I discard the second retval + currentStates = addStateToList(str, i, currentStates, *start, start.threadGroups, nil, preferLongest) var match Match = nil for idx := i; idx <= len(str); idx++ { if len(currentStates) == 0 { @@ -335,13 +315,25 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in if !preferLongest { break } - } else if !currentState.isAlternation && !currentState.isKleene && !currentState.isQuestion && !currentState.groupBegin && !currentState.groupEnd && currentState.assert == noneAssert { // Normal character + } else if !currentState.isAlternation && !currentState.isKleene && !currentState.isQuestion && !currentState.isBackreference && !currentState.groupBegin && !currentState.groupEnd && currentState.assert == noneAssert { // Normal character if currentState.contentContains(str, idx, preferLongest) { - var newIdx int - nextStates, newIdx = addStateToList(str, idx+1, nextStates, *currentState.next, currentState.threadGroups, nil, preferLongest) - idx = newIdx - 1 + nextStates = addStateToList(str, idx+1, nextStates, *currentState.next, currentState.threadGroups, nil, preferLongest) + } + } else if currentState.isBackreference { + groupLength := currentState.threadGroups[currentState.referredGroup].EndIdx - currentState.threadGroups[currentState.referredGroup].StartIdx + if currentState.threadBackref == groupLength { + currentState.threadBackref = 0 + copyThread(currentState.next, currentState) + currentStates = addStateToList(str, idx, currentStates, *currentState.next, currentState.threadGroups, nil, preferLongest) + } else { + idxInReferredGroup := currentState.threadGroups[currentState.referredGroup].StartIdx + currentState.threadBackref + if idxInReferredGroup < len(str) && idx < len(str) && str[idxInReferredGroup] == str[idx] { + currentState.threadBackref += 1 + nextStates = append(nextStates, currentState) + } } } + } currentStates = append([]nfaState{}, nextStates...) nextStates = nil From 2934e7a20fa076f9b2f0ecc6bda589b7d7fdb74b Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Tue, 11 Feb 2025 19:12:40 -0500 Subject: [PATCH 3/6] Wrote tests for backreferences --- regex/re_test.go | 190 ++++++++++++++++++++++++++--------------------- 1 file changed, 105 insertions(+), 85 deletions(-) diff --git a/regex/re_test.go b/regex/re_test.go index 05230a1..c751c26 100644 --- a/regex/re_test.go +++ b/regex/re_test.go @@ -314,10 +314,6 @@ var reTests = []struct { {`\0009`, nil, "\x009", []Group{{0, 2}}}, {`\0141`, nil, "a", []Group{{0, 1}}}, - // At this point, the python test suite has a bunch - // of backreference tests. Since my engine doesn't - // implement backreferences, I've skipped those tests. - {`*a`, nil, ``, nil}, {`(*)b`, nil, ``, nil}, {`a**`, nil, ``, nil}, @@ -585,9 +581,29 @@ var groupTests = []struct { {`(.*)c(.*)`, nil, `abcde`, []Match{[]Group{{0, 5}, {0, 2}, {3, 5}}}}, {`\((.*), (.*)\)`, nil, `(a, b)`, []Match{[]Group{{0, 6}, {1, 2}, {4, 5}}}}, - // At this point, the python test suite has a bunch - // of backreference tests. Since my engine doesn't - // implement backreferences, I've skipped those tests. + // Backreference tests + {`(abc)\1`, nil, `abcabc`, []Match{[]Group{{0, 6}, {0, 3}}}}, + {`([a-c]+)\1`, nil, `abcabc`, []Match{[]Group{{0, 6}, {0, 3}}}}, + {`([a-c]*)\1`, nil, `abcabc`, []Match{[]Group{{0, 6}, {0, 3}}, []Group{{6, 6}, {6, 6}}}}, + {`^(.+)?B`, nil, `AB`, []Match{[]Group{{0, 2}, {0, 1}}}}, + {`(a+).\1$`, nil, `aaaaa`, []Match{[]Group{{0, 5}, {0, 2}}}}, + {`^(a+).\1$`, nil, `aaaa`, []Match{}}, + {`(a)\1`, nil, `aa`, []Match{[]Group{{0, 2}, {0, 1}}}}, + {`(a+)\1`, nil, `aa`, []Match{[]Group{{0, 2}, {0, 1}}}}, + {`(a+)+\1`, nil, `aa`, []Match{[]Group{{0, 2}, {0, 1}}}}, + {`(a).+\1`, nil, `aba`, []Match{[]Group{{0, 3}, {0, 1}}}}, + {`(a)ba*\1`, nil, `aba`, []Match{[]Group{{0, 3}, {0, 1}}}}, + {`(aa|a)a\1$`, nil, `aaa`, []Match{[]Group{{0, 3}, {0, 1}}}}, + {`(a|aa)a\1$`, nil, `aaa`, []Match{[]Group{{0, 3}, {0, 1}}}}, + {`(a+)a\1$`, nil, `aaa`, []Match{[]Group{{0, 3}, {0, 1}}}}, + {`([abc]*)\1`, nil, `abcabc`, []Match{[]Group{{0, 6}, {0, 3}}, []Group{{6, 6}, {6, 6}}}}, + {`(a)(?:b)\1`, nil, `aba`, []Match{[]Group{{0, 3}, {0, 1}}}}, + {`(a)(?:b)\1`, nil, `abb`, []Match{}}, + {`(?:a)(b)\1`, nil, `aba`, []Match{}}, + {`(?:a)(b)\1`, nil, `abb`, []Match{[]Group{{0, 3}, {1, 2}}}}, + {`(?:(cat)|(dog))\2`, nil, `catdog`, []Match{}}, + {`(?:a)\1`, nil, `aa`, nil}, + {`((cat)|(dog)|(cow)|(bat))\4`, nil, `cowcow`, []Match{[]Group{{0, 6}, {0, 3}, {-1, -1}, {-1, -1}, {0, 3}, {-1, -1}}}}, {`(a)(b)c|ab`, nil, `ab`, []Match{[]Group{{0, 2}}}}, {`(a)+x`, nil, `aaax`, []Match{[]Group{{0, 4}, {2, 3}}}}, @@ -792,23 +808,24 @@ func TestFindSubmatch(t *testing.T) { if test.result != nil { panic(err) } - } - match, err := regComp.FindSubmatch(test.str) - if err != nil { - if len(test.result) != 0 { - t.Errorf("Wanted %v got no match\n", test.result[0]) - } - } else if len(test.result) == 0 { - t.Errorf("Wanted no match got %v\n", match) - } - for i := range match { - if match[i].IsValid() { - if test.result[0][i] != match[i] { - t.Errorf("Wanted %v Got %v\n", test.result[0], match) + } else { + match, err := regComp.FindSubmatch(test.str) + if err != nil { + if len(test.result) != 0 { + t.Errorf("Wanted %v got no match\n", test.result[0]) } - } else { - if i < len(test.result) && test.result[0][i].IsValid() { - t.Errorf("Wanted %v Got %v\n", test.result[0], match) + } else if len(test.result) == 0 { + t.Errorf("Wanted no match got %v\n", match) + } + for i := range match { + if match[i].IsValid() { + if test.result[0][i] != match[i] { + t.Errorf("Wanted %v Got %v\n", test.result[0], match) + } + } else { + if i < len(test.result) && test.result[0][i].IsValid() { + t.Errorf("Wanted %v Got %v\n", test.result[0], match) + } } } } @@ -823,10 +840,22 @@ func TestFindStringSubmatch(t *testing.T) { if test.result != nil { panic(err) } - } - matchStr := regComp.FindStringSubmatch(test.str) - if matchStr == nil { - if len(test.result) != 0 { + } else { + matchStr := regComp.FindStringSubmatch(test.str) + if matchStr == nil { + if len(test.result) != 0 { + expectedStr := funcMap(test.result[0], func(g Group) string { + if g.IsValid() { + return test.str[g.StartIdx:g.EndIdx] + } else { + return "" + } + }) + t.Errorf("Wanted %v got no match\n", expectedStr) + } + } else if len(test.result) == 0 { + t.Errorf("Wanted no match got %v\n", matchStr) + } else { expectedStr := funcMap(test.result[0], func(g Group) string { if g.IsValid() { return test.str[g.StartIdx:g.EndIdx] @@ -834,26 +863,15 @@ func TestFindStringSubmatch(t *testing.T) { return "" } }) - t.Errorf("Wanted %v got no match\n", expectedStr) - } - } else if len(test.result) == 0 { - t.Errorf("Wanted no match got %v\n", matchStr) - } else { - expectedStr := funcMap(test.result[0], func(g Group) string { - if g.IsValid() { - return test.str[g.StartIdx:g.EndIdx] - } else { - return "" - } - }) - for i, groupStr := range matchStr { - if groupStr == "" { - if i < len(expectedStr) && expectedStr[i] != "" { - t.Errorf("Wanted %v Got %v\n", expectedStr, matchStr) - } - } else { - if expectedStr[i] != groupStr { - t.Errorf("Wanted %v Got %v\n", expectedStr, matchStr) + for i, groupStr := range matchStr { + if groupStr == "" { + if i < len(expectedStr) && expectedStr[i] != "" { + t.Errorf("Wanted %v Got %v\n", expectedStr, matchStr) + } + } else { + if expectedStr[i] != groupStr { + t.Errorf("Wanted %v Got %v\n", expectedStr, matchStr) + } } } } @@ -870,10 +888,24 @@ func TestFindAllStringSubmatch(t *testing.T) { if test.result != nil { panic(err) } - } - matchStrs := regComp.FindAllStringSubmatch(test.str) - if matchStrs == nil { - if len(test.result) != 0 { + } else { + matchStrs := regComp.FindAllStringSubmatch(test.str) + if matchStrs == nil { + if len(test.result) != 0 { + expectedStrs := funcMap(test.result, func(m Match) []string { + return funcMap(m, func(g Group) string { + if g.IsValid() { + return test.str[g.StartIdx:g.EndIdx] + } else { + return "" + } + }) + }) + t.Errorf("Wanted %v got no match\n", expectedStrs) + } + } else if len(test.result) == 0 { + t.Errorf("Wanted no match got %v\n", matchStrs) + } else { expectedStrs := funcMap(test.result, func(m Match) []string { return funcMap(m, func(g Group) string { if g.IsValid() { @@ -883,29 +915,16 @@ func TestFindAllStringSubmatch(t *testing.T) { } }) }) - t.Errorf("Wanted %v got no match\n", expectedStrs) - } - } else if len(test.result) == 0 { - t.Errorf("Wanted no match got %v\n", matchStrs) - } else { - expectedStrs := funcMap(test.result, func(m Match) []string { - return funcMap(m, func(g Group) string { - if g.IsValid() { - return test.str[g.StartIdx:g.EndIdx] - } else { - return "" - } - }) - }) - for i, matchStr := range matchStrs { - for j, groupStr := range matchStr { - if groupStr == "" { - if j < len(expectedStrs[i]) && expectedStrs[i][j] != "" { - t.Errorf("Wanted %v Got %v\n", expectedStrs, matchStrs) - } - } else { - if expectedStrs[i][j] != groupStr { - t.Errorf("Wanted %v Got %v\n", expectedStrs, matchStrs) + for i, matchStr := range matchStrs { + for j, groupStr := range matchStr { + if groupStr == "" { + if j < len(expectedStrs[i]) && expectedStrs[i][j] != "" { + t.Errorf("Wanted %v Got %v\n", expectedStrs, matchStrs) + } + } else { + if expectedStrs[i][j] != groupStr { + t.Errorf("Wanted %v Got %v\n", expectedStrs, matchStrs) + } } } } @@ -923,17 +942,18 @@ func TestFindAllSubmatch(t *testing.T) { if test.result != nil { panic(err) } - } - matchIndices := regComp.FindAllSubmatch(test.str) - for i := range matchIndices { - for j := range matchIndices[i] { - if matchIndices[i][j].IsValid() { - if test.result[i][j] != matchIndices[i][j] { - t.Errorf("Wanted %v Got %v\n", test.result, matchIndices) - } - } else { - if i < len(test.result) && j < len(test.result[i]) && test.result[i][j].IsValid() { - t.Errorf("Wanted %v Got %v\n", test.result, matchIndices) + } else { + matchIndices := regComp.FindAllSubmatch(test.str) + for i := range matchIndices { + for j := range matchIndices[i] { + if matchIndices[i][j].IsValid() { + if test.result[i][j] != matchIndices[i][j] { + t.Errorf("Wanted %v Got %v\n", test.result, matchIndices) + } + } else { + if i < len(test.result) && j < len(test.result[i]) && test.result[i][j].IsValid() { + t.Errorf("Wanted %v Got %v\n", test.result, matchIndices) + } } } } From 81b8b1b11ce09a2778440a5890dfd2de0065855b Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Tue, 11 Feb 2025 19:12:58 -0500 Subject: [PATCH 4/6] Do not validate a backreference if the group that it refers to is not valid --- regex/matching.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex/matching.go b/regex/matching.go index 3dba3f0..e1e7d9b 100644 --- a/regex/matching.go +++ b/regex/matching.go @@ -319,7 +319,7 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in if currentState.contentContains(str, idx, preferLongest) { nextStates = addStateToList(str, idx+1, nextStates, *currentState.next, currentState.threadGroups, nil, preferLongest) } - } else if currentState.isBackreference { + } else if currentState.isBackreference && currentState.threadGroups[currentState.referredGroup].IsValid() { groupLength := currentState.threadGroups[currentState.referredGroup].EndIdx - currentState.threadGroups[currentState.referredGroup].StartIdx if currentState.threadBackref == groupLength { currentState.threadBackref = 0 From 2e47c631bbd75229b5d6c755ecd85b8db2d373cb Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Wed, 12 Feb 2025 07:50:24 -0500 Subject: [PATCH 5/6] Updated documentation to include backreferences --- regex/doc.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/regex/doc.go b/regex/doc.go index 3dd4456..b35c02f 100644 --- a/regex/doc.go +++ b/regex/doc.go @@ -18,7 +18,7 @@ Single characters: [^abc] Negated character class - match any character except a, b and c [^a-z] Negated character range - do not match any character from a to z \[ Match a literal '['. Backslashes can escape any character with special meaning, including another backslash. - \452 Match the character with the octal value 452 (up to 3 digits) + \0452 Match the character with the octal value 452 (up to 4 digits, first digit must be 0) \xFF Match the character with the hex value FF (exactly 2 characters) \x{0000FF} Match the character with the hex value 0000FF (exactly 6 characters) \n Newline @@ -93,6 +93,10 @@ Lookarounds: (?<=x)y Positive lookbehind - Match y if preceded by x (? Match any number from x to y (inclusive) (x and y must be positive numbers) @@ -156,6 +160,7 @@ The following features from [regexp] are (currently) NOT supported: The following features are not available in [regexp], but are supported in my engine: 1. Lookarounds 2. Numeric ranges + 3. Backreferences I hope to shorten the first list, and expand the second. */ From 375baa172229976a47acffbc01f8659c490462a7 Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Wed, 12 Feb 2025 07:51:20 -0500 Subject: [PATCH 6/6] Wrote more backreference tests --- regex/re_test.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/regex/re_test.go b/regex/re_test.go index c751c26..1b717c4 100644 --- a/regex/re_test.go +++ b/regex/re_test.go @@ -604,6 +604,10 @@ var groupTests = []struct { {`(?:(cat)|(dog))\2`, nil, `catdog`, []Match{}}, {`(?:a)\1`, nil, `aa`, nil}, {`((cat)|(dog)|(cow)|(bat))\4`, nil, `cowcow`, []Match{[]Group{{0, 6}, {0, 3}, {-1, -1}, {-1, -1}, {0, 3}, {-1, -1}}}}, + {`(a|b)*\1`, nil, `abb`, []Match{[]Group{{0, 3}, {1, 2}}}}, + {`(a|b)*\1`, nil, `aba`, []Match{}}, + {`(a|b)*\1`, nil, `bab`, []Match{}}, + {`(a|b)*\1`, nil, `baa`, []Match{[]Group{{0, 3}, {1, 2}}}}, {`(a)(b)c|ab`, nil, `ab`, []Match{[]Group{{0, 2}}}}, {`(a)+x`, nil, `aaax`, []Match{[]Group{{0, 4}, {2, 3}}}},