Wrote new methods Expand() and preferLongest(); Use new function signatures (with preferLongest); only characters should be added to next state list

implementPCREMatchingRules
Aadhavan Srinivasan 4 weeks ago
parent 1f5a363539
commit fb47e082eb

@ -2,6 +2,8 @@ package regex
import ( import (
"fmt" "fmt"
"strconv"
"unicode"
) )
// A Match represents a match found by the regex in a given string. // A Match represents a match found by the regex in a given string.
@ -77,6 +79,18 @@ func (regex Reg) Match(str string) bool {
return err == nil return err == nil
} }
// CompileMatch is a one-shot convenience wrapper: it compiles expr with the
// given optional flags (see [ReFlag]) and reports whether str contains a match
// of the resulting expression. It mirrors [regexp.Match] from the standard library.
//
// If expr fails to compile, the result is false together with the (non-nil)
// error returned by Compile.
func CompileMatch(expr string, str string, flags ...ReFlag) (bool, error) {
	compiled, compileErr := Compile(expr, flags...)
	if compileErr == nil {
		return compiled.Match(str), nil
	}
	return false, compileErr
}
// FindAll returns a slice containing all the 0-groups of the regex in the given string. // FindAll returns a slice containing all the 0-groups of the regex in the given string.
// A 0-group represents the match without any submatches. // A 0-group represents the match without any submatches.
func (regex Reg) FindAll(str string) []Group { func (regex Reg) FindAll(str string) []Group {
@ -162,7 +176,7 @@ func (regex Reg) FindNthMatch(str string, n int) (Match, error) {
var matchFound bool var matchFound bool
var matchIdx Match var matchIdx Match
for idx <= len(str_runes) { for idx <= len(str_runes) {
matchFound, matchIdx, idx = findAllSubmatchHelper(regex.start, str_runes, idx, regex.numGroups) matchFound, matchIdx, idx = findAllSubmatchHelper(regex.start, str_runes, idx, regex.numGroups, regex.preferLongest)
if matchFound { if matchFound {
matchNum++ matchNum++
} }
@ -182,7 +196,7 @@ func (regex Reg) FindAllSubmatch(str string) []Match {
var matchIdx Match var matchIdx Match
indices := make([]Match, 0) indices := make([]Match, 0)
for idx <= len(str_runes) { for idx <= len(str_runes) {
matchFound, matchIdx, idx = findAllSubmatchHelper(regex.start, str_runes, idx, regex.numGroups) matchFound, matchIdx, idx = findAllSubmatchHelper(regex.start, str_runes, idx, regex.numGroups, regex.preferLongest)
if matchFound { if matchFound {
indices = append(indices, matchIdx) indices = append(indices, matchIdx)
} }
@ -191,7 +205,7 @@ func (regex Reg) FindAllSubmatch(str string) []Match {
return indices return indices
} }
func addStateToList(str []rune, idx int, list []nfaState, state nfaState, threadGroups []Group, visited []nfaState) []nfaState { func addStateToList(str []rune, idx int, list []nfaState, state nfaState, threadGroups []Group, visited []nfaState, preferLongest bool) []nfaState {
if stateExists(list, state) || stateExists(visited, state) { if stateExists(list, state) || stateExists(visited, state) {
return list return list
} }
@ -199,32 +213,32 @@ func addStateToList(str []rune, idx int, list []nfaState, state nfaState, thread
if state.isKleene || state.isQuestion { if state.isKleene || state.isQuestion {
copyThread(state.splitState, state) copyThread(state.splitState, state)
list = addStateToList(str, idx, list, *state.splitState, threadGroups, visited) list = addStateToList(str, idx, list, *state.splitState, threadGroups, visited, preferLongest)
copyThread(state.next, state) copyThread(state.next, state)
list = addStateToList(str, idx, list, *state.next, threadGroups, visited) list = addStateToList(str, idx, list, *state.next, threadGroups, visited, preferLongest)
return list return list
} }
if state.isAlternation { if state.isAlternation {
copyThread(state.next, state) copyThread(state.next, state)
list = addStateToList(str, idx, list, *state.next, threadGroups, visited) list = addStateToList(str, idx, list, *state.next, threadGroups, visited, preferLongest)
copyThread(state.splitState, state) copyThread(state.splitState, state)
list = addStateToList(str, idx, list, *state.splitState, threadGroups, visited) list = addStateToList(str, idx, list, *state.splitState, threadGroups, visited, preferLongest)
return list return list
} }
state.threadGroups = append([]Group{}, threadGroups...) state.threadGroups = append([]Group{}, threadGroups...)
if state.assert != noneAssert { if state.assert != noneAssert {
if state.checkAssertion(str, idx) { if state.checkAssertion(str, idx, preferLongest) {
copyThread(state.next, state) copyThread(state.next, state)
return addStateToList(str, idx, list, *state.next, state.threadGroups, visited) return addStateToList(str, idx, list, *state.next, state.threadGroups, visited, preferLongest)
} }
} }
if state.groupBegin { if state.groupBegin {
state.threadGroups[state.groupNum].StartIdx = idx state.threadGroups[state.groupNum].StartIdx = idx
return addStateToList(str, idx, list, *state.next, state.threadGroups, visited) return addStateToList(str, idx, list, *state.next, state.threadGroups, visited, preferLongest)
} }
if state.groupEnd { if state.groupEnd {
state.threadGroups[state.groupNum].EndIdx = idx state.threadGroups[state.groupNum].EndIdx = idx
return addStateToList(str, idx, list, *state.next, state.threadGroups, visited) return addStateToList(str, idx, list, *state.next, state.threadGroups, visited, preferLongest)
} }
return append(list, state) return append(list, state)
@ -233,7 +247,7 @@ func addStateToList(str []rune, idx int, list []nfaState, state nfaState, thread
// Helper for FindAllMatches. Returns whether it found a match, the // Helper for FindAllMatches. Returns whether it found a match, the
// first Match it finds, and how far it got into the string ie. where // first Match it finds, and how far it got into the string ie. where
// the next search should start from. // the next search should start from.
func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups int) (bool, Match, int) { func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups int, preferLongest bool) (bool, Match, int) {
// Base case - exit if offset exceeds string's length // Base case - exit if offset exceeds string's length
if offset > len(str) { if offset > len(str) {
// The second value here shouldn't be used, because we should exit when the third return value is > than len(str) // The second value here shouldn't be used, because we should exit when the third return value is > than len(str)
@ -248,7 +262,7 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
// If the first state is an assertion, makes sure the assertion // If the first state is an assertion, makes sure the assertion
// is true before we do _anything_ else. // is true before we do _anything_ else.
if start.assert != noneAssert { if start.assert != noneAssert {
if start.checkAssertion(str, offset) == false { if start.checkAssertion(str, offset, preferLongest) == false {
i++ i++
return false, []Group{}, i return false, []Group{}, i
} }
@ -256,7 +270,7 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
start.threadGroups = newMatch(numGroups + 1) start.threadGroups = newMatch(numGroups + 1)
start.threadGroups[0].StartIdx = i start.threadGroups[0].StartIdx = i
currentStates = addStateToList(str, i, currentStates, *start, start.threadGroups, nil) currentStates = addStateToList(str, i, currentStates, *start, start.threadGroups, nil, preferLongest)
var match Match = nil var match Match = nil
for idx := i; idx <= len(str); idx++ { for idx := i; idx <= len(str); idx++ {
if len(currentStates) == 0 { if len(currentStates) == 0 {
@ -274,9 +288,9 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
currentState.threadGroups[0].EndIdx = idx currentState.threadGroups[0].EndIdx = idx
match = append([]Group{}, currentState.threadGroups...) match = append([]Group{}, currentState.threadGroups...)
break break
} else if !currentState.isAlternation && !currentState.isKleene && !currentState.isQuestion && !currentState.groupBegin && !currentState.groupEnd { // Normal character or assertion } else if !currentState.isAlternation && !currentState.isKleene && !currentState.isQuestion && !currentState.groupBegin && !currentState.groupEnd && currentState.assert == noneAssert { // Normal character
if currentState.contentContains(str, idx) { if currentState.contentContains(str, idx, preferLongest) {
nextStates = addStateToList(str, idx+1, nextStates, *currentState.next, currentState.threadGroups, nil) nextStates = addStateToList(str, idx+1, nextStates, *currentState.next, currentState.threadGroups, nil, preferLongest)
} }
} }
} }
@ -291,3 +305,68 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
} }
return false, []Group{}, i + 1 return false, []Group{}, i + 1
} }
// Expand appends template to dst, expanding any variables in template to the relevant capturing group,
// and returns the resulting string.
//
// A variable is of the form '$n', where 'n' is a run of ASCII decimal digits. It will be replaced by the
// contents of the n-th capturing group. If n is not a valid index into match (including numbers too
// large to parse), the variable is copied to the output unchanged. If the group's indices do not
// describe a valid span of src, the variable expands to nothing.
// To insert a literal $, do not put a number after it. Alternatively, you can use $$.
// src is the input string, and match must be the result of [Reg.FindSubmatch].
func (regex Reg) Expand(dst string, template string, src string, match Match) string {
	templateRunes := []rune(template)
	// Group indices are applied to the rune slice of src, not to its bytes.
	srcRunes := []rune(src)
	// Accumulate into a byte buffer instead of repeatedly concatenating strings.
	out := []byte(dst)

	i := 0
	for i < len(templateRunes) {
		c := templateRunes[i]
		if c != '$' {
			out = append(out, string(c)...)
			i++
			continue
		}

		i++ // step past the '$'
		// The dollar sign is the last character of the template, or it is followed by another dollar sign.
		if i >= len(templateRunes) || templateRunes[i] == '$' {
			out = append(out, '$')
			i++
			continue
		}

		// Collect the group number. The bounds check matters when the digits run
		// to the end of the template; the ASCII check keeps the scan in sync with
		// what strconv.Atoi is able to parse.
		digitsStart := i
		for i < len(templateRunes) && templateRunes[i] <= unicode.MaxASCII && unicode.IsDigit(templateRunes[i]) {
			i++
		}
		if i == digitsStart {
			// Not a variable: emit the '$' and let the next iteration handle the following rune.
			out = append(out, '$')
			continue
		}

		numStr := string(templateRunes[digitsStart:i])
		num, err := strconv.Atoi(numStr)
		if err != nil || num >= len(match) {
			// Unparseable (overflow) or no such group: keep the variable text as written.
			out = append(out, '$')
			out = append(out, numStr...)
			continue
		}

		// Only slice src when the indices form a valid span; otherwise expand to nothing
		// rather than panicking.
		start, end := match[num].StartIdx, match[num].EndIdx
		if 0 <= start && start <= end && end <= len(srcRunes) {
			out = append(out, string(srcRunes[start:end])...)
		}
	}
	return string(out)
}
// LiteralPrefix returns a string that must begin any match of the given regular expression.
// The second return value is true if the string comprises the entire expression.
//
// It walks the state chain from the start state (stepping over a leading
// assertion state, if there is one) and collects the single content value of
// each state it passes. The walk ends at the last state, at an alternation, at
// an assertion, or at any state whose content does not hold exactly one value.
func (regex Reg) LiteralPrefix() (prefix string, complete bool) {
	cur := regex.start
	if cur.assert != noneAssert {
		cur = cur.next
	}
	for {
		if cur.isLast || cur.isAlternation || len(cur.content) != 1 || cur.assert != noneAssert {
			break
		}
		// Group boundary states add nothing to the prefix; just step over them.
		if !cur.groupBegin && !cur.groupEnd {
			prefix += string(rune(cur.content[0]))
		}
		cur = cur.next
	}
	// The prefix is the whole expression only if the walk reached the last state.
	complete = cur.isLast
	return prefix, complete
}

Loading…
Cancel
Save