You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
375 lines
12 KiB
Go
375 lines
12 KiB
Go
package regex
|
|
|
|
import (
|
|
"fmt"
|
|
"strconv"
|
|
"unicode"
|
|
)
|
|
|
|
// A Match represents a match found by the regex in a given string.
|
|
// It is represented as a list of groups, where the nth element contains
|
|
// the contents of the nth capturing group. Note that the group may not be valid
|
|
// (see [Group.IsValid]). The element at index 0 is known
|
|
// as the 0-group, and represents the contents of the entire match.
|
|
//
|
|
// See [Reg.FindSubmatch] for an example.
|
|
type Match []Group
|
|
|
|
// Group describes the span of text captured by a single capturing group.
//
// The span is half-open: it begins at StartIdx and ends just before EndIdx.
// The search functions in this package run over the rune form of the input,
// so both values are rune offsets. A group that matched nothing has negative
// indices (see [Group.IsValid]).
type Group struct {
	// StartIdx is the offset at which the captured text begins.
	StartIdx int
	// EndIdx is the offset just past the end of the captured text.
	EndIdx int
}
|
|
|
|
func newMatch(size int) Match {
|
|
toRet := make([]Group, size)
|
|
for i := range toRet {
|
|
toRet[i].StartIdx = -1
|
|
toRet[i].EndIdx = -1
|
|
}
|
|
return toRet
|
|
}
|
|
|
|
// Returns a string containing the indices of all (valid) groups in the match
|
|
func (m Match) String() string {
|
|
var toRet string
|
|
for i, g := range m {
|
|
if g.IsValid() {
|
|
toRet += fmt.Sprintf("Group %d\n", i)
|
|
toRet += g.String()
|
|
toRet += "\n"
|
|
}
|
|
}
|
|
return toRet
|
|
}
|
|
|
|
// String converts the Group into a string representation.
|
|
func (idx Group) String() string {
|
|
return fmt.Sprintf("%d\t%d", idx.StartIdx, idx.EndIdx)
|
|
}
|
|
|
|
// IsValid returns whether a group is valid (ie. whether it matched any text). It
|
|
// simply ensures that both indices of the group are >= 0.
|
|
func (g Group) IsValid() bool {
|
|
return g.StartIdx >= 0 && g.EndIdx >= 0
|
|
}
|
|
|
|
// Simple function, makes it easier to map this over a list of matches
|
|
func getZeroGroup(m Match) Group {
|
|
return m[0]
|
|
}
|
|
|
|
func copyThread(to *nfaState, from nfaState) {
|
|
to.threadGroups = append([]Group{}, from.threadGroups...)
|
|
}
|
|
|
|
// Find returns the 0-group of the leftmost match of the regex in the given string.
|
|
// An error value != nil indicates that no match was found.
|
|
func (re Reg) Find(str string) (Group, error) {
|
|
match, err := re.FindNthMatch(str, 1)
|
|
if err != nil {
|
|
return Group{}, fmt.Errorf("no matches found")
|
|
}
|
|
return getZeroGroup(match), nil
|
|
}
|
|
|
|
// Match returns a boolean value, indicating whether the regex found a match in the given string.
|
|
func (re Reg) Match(str string) bool {
|
|
_, err := re.Find(str)
|
|
return err == nil
|
|
}
|
|
|
|
// CompileMatch compiles expr and returns true if str contains a match of the expression.
|
|
// It is equivalent to [regexp.Match].
|
|
// An optional list of flags may be provided (see [ReFlag]).
|
|
// It returns an error (!= nil) if there was an error compiling the expression.
|
|
func CompileMatch(expr string, str string, flags ...ReFlag) (bool, error) {
|
|
re, err := Compile(expr, flags...)
|
|
if err != nil {
|
|
return false, err
|
|
}
|
|
return re.Match(str), nil
|
|
}
|
|
|
|
// FindAll returns a slice containing all the 0-groups of the regex in the given string.
|
|
// A 0-group represents the match without any submatches.
|
|
func (re Reg) FindAll(str string) []Group {
|
|
indices := re.FindAllSubmatch(str)
|
|
zeroGroups := funcMap(indices, getZeroGroup)
|
|
return zeroGroups
|
|
}
|
|
|
|
// FindString returns the text of the leftmost match of the regex in the given string.
|
|
// The return value will be an empty string in two situations:
|
|
// 1. No match was found
|
|
// 2. The match was an empty string
|
|
func (re Reg) FindString(str string) string {
|
|
match, err := re.FindNthMatch(str, 1)
|
|
if err != nil {
|
|
return ""
|
|
}
|
|
zeroGroup := getZeroGroup(match)
|
|
return str[zeroGroup.StartIdx:zeroGroup.EndIdx]
|
|
}
|
|
|
|
// FindSubmatch returns the leftmost match of the regex in the given string, including
|
|
// the submatches matched by capturing groups. The returned [Match] will always contain the same
|
|
// number of groups. The validity of a group (whether or not it matched anything) can be determined with
|
|
// [Group.IsValid], or by checking that both indices of the group are >= 0.
|
|
// The second-return value is nil if no match was found.
|
|
func (re Reg) FindSubmatch(str string) (Match, error) {
|
|
match, err := re.FindNthMatch(str, 1)
|
|
if err != nil {
|
|
return Match{}, fmt.Errorf("no match found")
|
|
} else {
|
|
return match, nil
|
|
}
|
|
}
|
|
|
|
// FindStringSubmatch is the 'string' version of [FindSubmatch]. It returns a slice of strings,
|
|
// where the string at index i contains the text matched by the i-th capturing group.
|
|
// The 0-th index represents the entire match.
|
|
// An empty string at index n could mean:
|
|
// ,
|
|
// 1. Group n did not find a match
|
|
// 2. Group n found a zero-length match
|
|
//
|
|
// A return value of nil indicates no match.
|
|
func (re Reg) FindStringSubmatch(str string) []string {
|
|
matchStr := make([]string, re.numGroups+1)
|
|
match, err := re.FindSubmatch(str)
|
|
if err != nil {
|
|
return nil
|
|
}
|
|
nonEmptyMatchFound := false
|
|
for i := range match {
|
|
if match[i].IsValid() {
|
|
matchStr[i] = str[match[i].StartIdx:match[i].EndIdx]
|
|
nonEmptyMatchFound = true
|
|
} else {
|
|
matchStr[i] = ""
|
|
}
|
|
}
|
|
if nonEmptyMatchFound == false {
|
|
return nil
|
|
}
|
|
return matchStr
|
|
}
|
|
|
|
// FindAllString is the 'all' version of [FindString].
|
|
// It returns a slice of strings containing the text of all matches of
|
|
// the regex in the given string.
|
|
func (re Reg) FindAllString(str string) []string {
|
|
zerogroups := re.FindAll(str)
|
|
matchStrs := funcMap(zerogroups, func(g Group) string {
|
|
return str[g.StartIdx:g.EndIdx]
|
|
})
|
|
return matchStrs
|
|
}
|
|
|
|
// FindNthMatch return the 'n'th match of the regex in the given string.
|
|
// It returns an error (!= nil) if there are fewer than 'n' matches in the string.
|
|
func (re Reg) FindNthMatch(str string, n int) (Match, error) {
|
|
idx := 0
|
|
matchNum := 0
|
|
str_runes := []rune(str)
|
|
var matchFound bool
|
|
var matchIdx Match
|
|
for idx <= len(str_runes) {
|
|
matchFound, matchIdx, idx = findAllSubmatchHelper(re.start, str_runes, idx, re.numGroups, re.preferLongest)
|
|
if matchFound {
|
|
matchNum++
|
|
}
|
|
if matchNum == n {
|
|
return matchIdx, nil
|
|
}
|
|
}
|
|
// We haven't found the nth match after scanning the string - Return an error
|
|
return nil, fmt.Errorf("invalid match index - too few matches found")
|
|
}
|
|
|
|
// FindAllSubmatch returns a slice of matches in the given string.
|
|
func (re Reg) FindAllSubmatch(str string) []Match {
|
|
idx := 0
|
|
str_runes := []rune(str)
|
|
var matchFound bool
|
|
var matchIdx Match
|
|
indices := make([]Match, 0)
|
|
for idx <= len(str_runes) {
|
|
matchFound, matchIdx, idx = findAllSubmatchHelper(re.start, str_runes, idx, re.numGroups, re.preferLongest)
|
|
if matchFound {
|
|
indices = append(indices, matchIdx)
|
|
}
|
|
}
|
|
|
|
return indices
|
|
}
|
|
|
|
func addStateToList(str []rune, idx int, list []nfaState, state nfaState, threadGroups []Group, visited []nfaState, preferLongest bool) []nfaState {
|
|
if stateExists(list, state) || stateExists(visited, state) {
|
|
return list
|
|
}
|
|
visited = append(visited, state)
|
|
|
|
if state.isKleene || state.isQuestion {
|
|
copyThread(state.splitState, state)
|
|
list = addStateToList(str, idx, list, *state.splitState, threadGroups, visited, preferLongest)
|
|
copyThread(state.next, state)
|
|
list = addStateToList(str, idx, list, *state.next, threadGroups, visited, preferLongest)
|
|
return list
|
|
}
|
|
if state.isAlternation {
|
|
copyThread(state.next, state)
|
|
list = addStateToList(str, idx, list, *state.next, threadGroups, visited, preferLongest)
|
|
copyThread(state.splitState, state)
|
|
list = addStateToList(str, idx, list, *state.splitState, threadGroups, visited, preferLongest)
|
|
return list
|
|
}
|
|
state.threadGroups = append([]Group{}, threadGroups...)
|
|
if state.assert != noneAssert {
|
|
if state.checkAssertion(str, idx, preferLongest) {
|
|
copyThread(state.next, state)
|
|
return addStateToList(str, idx, list, *state.next, state.threadGroups, visited, preferLongest)
|
|
}
|
|
}
|
|
if state.groupBegin {
|
|
state.threadGroups[state.groupNum].StartIdx = idx
|
|
return addStateToList(str, idx, list, *state.next, state.threadGroups, visited, preferLongest)
|
|
}
|
|
if state.groupEnd {
|
|
state.threadGroups[state.groupNum].EndIdx = idx
|
|
return addStateToList(str, idx, list, *state.next, state.threadGroups, visited, preferLongest)
|
|
}
|
|
return append(list, state)
|
|
|
|
}
|
|
|
|
// Helper for FindAllMatches. Returns whether it found a match, the
|
|
// first Match it finds, and how far it got into the string ie. where
|
|
// the next search should start from.
|
|
func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups int, preferLongest bool) (bool, Match, int) {
|
|
// Base case - exit if offset exceeds string's length
|
|
if offset > len(str) {
|
|
// The second value here shouldn't be used, because we should exit when the third return value is > than len(str)
|
|
return false, []Group{}, offset
|
|
}
|
|
resetThreads(start)
|
|
|
|
currentStates := make([]nfaState, 0)
|
|
nextStates := make([]nfaState, 0)
|
|
i := offset // Index in string
|
|
|
|
// If the first state is an assertion, makes sure the assertion
|
|
// is true before we do _anything_ else.
|
|
if start.assert != noneAssert {
|
|
if start.checkAssertion(str, offset, preferLongest) == false {
|
|
i++
|
|
return false, []Group{}, i
|
|
}
|
|
}
|
|
|
|
start.threadGroups = newMatch(numGroups + 1)
|
|
start.threadGroups[0].StartIdx = i
|
|
currentStates = addStateToList(str, i, currentStates, *start, start.threadGroups, nil, preferLongest)
|
|
var match Match = nil
|
|
for idx := i; idx <= len(str); idx++ {
|
|
if len(currentStates) == 0 {
|
|
break
|
|
}
|
|
for currentStateIdx := 0; currentStateIdx < len(currentStates); currentStateIdx++ {
|
|
currentState := currentStates[currentStateIdx]
|
|
|
|
if currentState.threadGroups == nil {
|
|
currentState.threadGroups = newMatch(numGroups + 1)
|
|
currentState.threadGroups[0].StartIdx = idx
|
|
}
|
|
|
|
if currentState.isLast {
|
|
currentState.threadGroups[0].EndIdx = idx
|
|
match = append([]Group{}, currentState.threadGroups...)
|
|
if !preferLongest {
|
|
break
|
|
}
|
|
} else if !currentState.isAlternation && !currentState.isKleene && !currentState.isQuestion && !currentState.groupBegin && !currentState.groupEnd && currentState.assert == noneAssert { // Normal character
|
|
if currentState.contentContains(str, idx, preferLongest) {
|
|
nextStates = addStateToList(str, idx+1, nextStates, *currentState.next, currentState.threadGroups, nil, preferLongest)
|
|
}
|
|
}
|
|
}
|
|
currentStates = append([]nfaState{}, nextStates...)
|
|
nextStates = nil
|
|
}
|
|
if match != nil {
|
|
if offset == match[0].EndIdx {
|
|
return true, match, match[0].EndIdx + 1
|
|
}
|
|
return true, match, match[0].EndIdx
|
|
}
|
|
return false, []Group{}, i + 1
|
|
}
|
|
|
|
// Expand appends template to dst, expanding any variables in template to the relevant capturing group.
|
|
//
|
|
// A variable is of the form '$n', where 'n' is a number. It will be replaced by the contents of the n-th capturing group.
|
|
// To insert a literal $, do not put a number after it. Alternatively, you can use $$.
|
|
// src is the input string, and match must be the result of [Reg.FindSubmatch].
|
|
func (re Reg) Expand(dst string, template string, src string, match Match) string {
|
|
templateRuneSlc := []rune(template)
|
|
srcRuneSlc := []rune(src)
|
|
i := 0
|
|
for i < len(templateRuneSlc) {
|
|
c := templateRuneSlc[i]
|
|
if c == '$' {
|
|
i += 1
|
|
// The dollar sign is the last character of the string, or it is proceeded by another dollar sign
|
|
if i >= len(templateRuneSlc) || templateRuneSlc[i] == '$' {
|
|
dst += "$"
|
|
i++
|
|
} else {
|
|
numStr := ""
|
|
for unicode.IsDigit(templateRuneSlc[i]) {
|
|
numStr += string(templateRuneSlc[i])
|
|
i++
|
|
}
|
|
if numStr == "" {
|
|
dst += "$"
|
|
} else {
|
|
num, _ := strconv.Atoi(numStr)
|
|
if num < len(match) {
|
|
dst += string(srcRuneSlc[match[num].StartIdx:match[num].EndIdx])
|
|
} else {
|
|
dst += "$" + numStr
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
dst += string(c)
|
|
i++
|
|
}
|
|
}
|
|
return dst
|
|
}
|
|
|
|
// LiteralPrefix returns a string that must begin any match of the given regular expression.
|
|
// The second return value is true if the string comprises the entire expression.
|
|
func (re Reg) LiteralPrefix() (prefix string, complete bool) {
|
|
state := re.start
|
|
if state.assert != noneAssert {
|
|
state = state.next
|
|
}
|
|
for !(state.isLast) && (!state.isAlternation) && len(state.content) == 1 && state.assert == noneAssert {
|
|
if state.groupBegin || state.groupEnd {
|
|
state = state.next
|
|
continue
|
|
}
|
|
prefix += string(rune(state.content[0]))
|
|
state = state.next
|
|
}
|
|
if state.isLast {
|
|
complete = true
|
|
} else {
|
|
complete = false
|
|
}
|
|
return prefix, complete
|
|
}
|