You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
kleingrep/regex/matching.go

461 lines
14 KiB
Go

package regex
import (
"fmt"
"strconv"
"unicode"
)
// A Match represents a match found by the regex in a given string.
1 month ago
// It is represented as a list of groups, where the nth element contains
// the contents of the nth capturing group. Note that the group may not be valid
// (see [Group.IsValid]). The element at index 0 is known
// as the 0-group, and represents the contents of the entire match.
//
1 month ago
// See [Reg.FindSubmatch] for an example.
type Match []Group
// a Group represents a capturing group. It contains the start and index of the group.
type Group struct {
StartIdx int
EndIdx int
}
func newMatch(size int) Match {
toRet := make([]Group, size)
for i := range toRet {
toRet[i].StartIdx = -1
toRet[i].EndIdx = -1
}
return toRet
}
// Returns a string containing the indices of all (valid) groups in the match
func (m Match) String() string {
var toRet string
for i, g := range m {
if g.IsValid() {
toRet += fmt.Sprintf("Group %d\n", i)
toRet += g.String()
toRet += "\n"
}
}
return toRet
}
// String converts the Group into a string representation.
func (idx Group) String() string {
return fmt.Sprintf("%d\t%d", idx.StartIdx, idx.EndIdx)
}
// IsValid returns whether a group is valid (ie. whether it matched any text). It
// simply ensures that both indices of the group are >= 0.
func (g Group) IsValid() bool {
return g.StartIdx >= 0 && g.EndIdx >= 0
}
// Simple function, makes it easier to map this over a list of matches
func getZeroGroup(m Match) Group {
return m[0]
}
func copyThread(to *nfaState, from nfaState) {
to.threadGroups = append([]Group{}, from.threadGroups...)
}
// Find returns the 0-group of the leftmost match of the regex in the given string.
// An error value != nil indicates that no match was found.
func (re Reg) Find(str string) (Group, error) {
match, err := re.FindNthMatch(str, 1)
if err != nil {
return Group{}, fmt.Errorf("no matches found")
}
return getZeroGroup(match), nil
}
// Match returns a boolean value, indicating whether the regex found a match in the given string.
func (re Reg) Match(str string) bool {
_, err := re.Find(str)
return err == nil
}
// CompileMatch compiles expr and returns true if str contains a match of the expression.
// It is equivalent to [regexp.Match].
// An optional list of flags may be provided (see [ReFlag]).
// It returns an error (!= nil) if there was an error compiling the expression.
func CompileMatch(expr string, str string, flags ...ReFlag) (bool, error) {
re, err := Compile(expr, flags...)
if err != nil {
return false, err
}
return re.Match(str), nil
}
// FindAll returns a slice containing all the 0-groups of the regex in the given string.
// A 0-group represents the match without any submatches.
func (re Reg) FindAll(str string) []Group {
indices := re.FindAllSubmatch(str)
zeroGroups := funcMap(indices, getZeroGroup)
return zeroGroups
}
// FindString returns the text of the leftmost match of the regex in the given string.
// The return value will be an empty string in two situations:
// 1. No match was found
// 2. The match was an empty string
func (re Reg) FindString(str string) string {
match, err := re.FindNthMatch(str, 1)
if err != nil {
return ""
}
zeroGroup := getZeroGroup(match)
return str[zeroGroup.StartIdx:zeroGroup.EndIdx]
}
// FindSubmatch returns the leftmost match of the regex in the given string, including
// the submatches matched by capturing groups. The returned [Match] will always contain the same
// number of groups. The validity of a group (whether or not it matched anything) can be determined with
// [Group.IsValid], or by checking that both indices of the group are >= 0.
// The second-return value is nil if no match was found.
func (re Reg) FindSubmatch(str string) (Match, error) {
match, err := re.FindNthMatch(str, 1)
if err != nil {
return Match{}, fmt.Errorf("no match found")
} else {
return match, nil
}
}
// FindStringSubmatch is the 'string' version of [FindSubmatch]. It returns a slice of strings,
// where the string at index i contains the text matched by the i-th capturing group.
// The 0-th index represents the entire match.
// An empty string at index n could mean:
// ,
// 1. Group n did not find a match
// 2. Group n found a zero-length match
//
// A return value of nil indicates no match.
func (re Reg) FindStringSubmatch(str string) []string {
matchStr := make([]string, re.numGroups+1)
match, err := re.FindSubmatch(str)
if err != nil {
return nil
}
nonEmptyMatchFound := false
for i := range match {
if match[i].IsValid() {
matchStr[i] = str[match[i].StartIdx:match[i].EndIdx]
nonEmptyMatchFound = true
} else {
matchStr[i] = ""
}
}
if nonEmptyMatchFound == false {
return nil
}
return matchStr
}
// FindAllString is the 'all' version of [FindString].
// It returns a slice of strings containing the text of all matches of
// the regex in the given string.
func (re Reg) FindAllString(str string) []string {
zerogroups := re.FindAll(str)
matchStrs := funcMap(zerogroups, func(g Group) string {
return str[g.StartIdx:g.EndIdx]
})
return matchStrs
}
// FindNthMatch return the 'n'th match of the regex in the given string.
// It returns an error (!= nil) if there are fewer than 'n' matches in the string.
func (re Reg) FindNthMatch(str string, n int) (Match, error) {
idx := 0
matchNum := 0
str_runes := []rune(str)
var matchFound bool
var matchIdx Match
for idx <= len(str_runes) {
matchFound, matchIdx, idx = findAllSubmatchHelper(re.start, str_runes, idx, re.numGroups, re.preferLongest)
if matchFound {
matchNum++
}
if matchNum == n {
return matchIdx, nil
}
}
// We haven't found the nth match after scanning the string - Return an error
return nil, fmt.Errorf("invalid match index - too few matches found")
}
// FindAllSubmatch returns a slice of matches in the given string.
func (re Reg) FindAllSubmatch(str string) []Match {
idx := 0
str_runes := []rune(str)
var matchFound bool
var matchIdx Match
indices := make([]Match, 0)
for idx <= len(str_runes) {
matchFound, matchIdx, idx = findAllSubmatchHelper(re.start, str_runes, idx, re.numGroups, re.preferLongest)
if matchFound {
indices = append(indices, matchIdx)
}
}
return indices
}
// FindAllSubmatch returns a double-slice of strings. Each slice contains the text of a match, including all submatches.
// A return value of nil indicates no match.
func (re Reg) FindAllStringSubmatch(str string) [][]string {
match := re.FindAllSubmatch(str)
if len(match) == 0 {
return nil
}
rtv := make([][]string, len(match))
for i := range rtv {
rtv[i] = make([]string, re.numGroups+1)
}
rtv = funcMap(match, func(m Match) []string {
return funcMap(m, func(g Group) string {
if g.IsValid() {
return str[g.StartIdx:g.EndIdx]
} else {
return ""
}
})
})
return rtv
}
func addStateToList(str []rune, idx int, list []nfaState, state nfaState, threadGroups []Group, visited []nfaState, preferLongest bool) []nfaState {
if stateExists(list, state) || stateExists(visited, state) {
return list
}
visited = append(visited, state)
if state.isKleene || state.isQuestion {
copyThread(state.splitState, state)
list = addStateToList(str, idx, list, *state.splitState, threadGroups, visited, preferLongest)
copyThread(state.next, state)
list = addStateToList(str, idx, list, *state.next, threadGroups, visited, preferLongest)
return list
}
if state.isAlternation {
copyThread(state.next, state)
list = addStateToList(str, idx, list, *state.next, threadGroups, visited, preferLongest)
copyThread(state.splitState, state)
list = addStateToList(str, idx, list, *state.splitState, threadGroups, visited, preferLongest)
return list
}
state.threadGroups = append([]Group{}, threadGroups...)
if state.assert != noneAssert {
if state.checkAssertion(str, idx, preferLongest) {
copyThread(state.next, state)
return addStateToList(str, idx, list, *state.next, state.threadGroups, visited, preferLongest)
}
}
if state.groupBegin {
state.threadGroups[state.groupNum].StartIdx = idx
return addStateToList(str, idx, list, *state.next, state.threadGroups, visited, preferLongest)
}
if state.groupEnd {
state.threadGroups[state.groupNum].EndIdx = idx
return addStateToList(str, idx, list, *state.next, state.threadGroups, visited, preferLongest)
}
return append(list, state)
}
// Helper for FindAllMatches. Returns whether it found a match, the
// first Match it finds, and how far it got into the string ie. where
// the next search should start from.
func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups int, preferLongest bool) (bool, Match, int) {
// Base case - exit if offset exceeds string's length
if offset > len(str) {
// The second value here shouldn't be used, because we should exit when the third return value is > than len(str)
return false, []Group{}, offset
}
resetThreads(start)
currentStates := make([]nfaState, 0)
nextStates := make([]nfaState, 0)
i := offset // Index in string
// If the first state is an assertion, makes sure the assertion
// is true before we do _anything_ else.
if start.assert != noneAssert {
if start.checkAssertion(str, offset, preferLongest) == false {
i++
return false, []Group{}, i
}
}
start.threadGroups = newMatch(numGroups + 1)
start.threadGroups[0].StartIdx = i
currentStates = addStateToList(str, i, currentStates, *start, start.threadGroups, nil, preferLongest)
var match Match = nil
for idx := i; idx <= len(str); idx++ {
if len(currentStates) == 0 {
break
}
for currentStateIdx := 0; currentStateIdx < len(currentStates); currentStateIdx++ {
currentState := currentStates[currentStateIdx]
if currentState.threadGroups == nil {
currentState.threadGroups = newMatch(numGroups + 1)
currentState.threadGroups[0].StartIdx = idx
}
if currentState.isLast {
currentState.threadGroups[0].EndIdx = idx
match = append([]Group{}, currentState.threadGroups...)
if !preferLongest {
break
}
} else if !currentState.isAlternation && !currentState.isKleene && !currentState.isQuestion && !currentState.groupBegin && !currentState.groupEnd && currentState.assert == noneAssert { // Normal character
if currentState.contentContains(str, idx, preferLongest) {
nextStates = addStateToList(str, idx+1, nextStates, *currentState.next, currentState.threadGroups, nil, preferLongest)
}
}
}
currentStates = append([]nfaState{}, nextStates...)
nextStates = nil
}
if match != nil {
if offset == match[0].EndIdx {
return true, match, match[0].EndIdx + 1
}
return true, match, match[0].EndIdx
}
return false, []Group{}, i + 1
}
// Expand appends template to dst, expanding any variables in template to the relevant capturing group.
//
// A variable is of the form '$n', where 'n' is a number. It will be replaced by the contents of the n-th capturing group.
// To insert a literal $, do not put a number after it. Alternatively, you can use $$.
// src is the input string, and match must be the result of [Reg.FindSubmatch].
func (re Reg) Expand(dst string, template string, src string, match Match) string {
templateRuneSlc := []rune(template)
srcRuneSlc := []rune(src)
i := 0
for i < len(templateRuneSlc) {
c := templateRuneSlc[i]
if c == '$' {
i += 1
// The dollar sign is the last character of the string, or it is proceeded by another dollar sign
if i >= len(templateRuneSlc) || templateRuneSlc[i] == '$' {
dst += "$"
i++
} else {
numStr := ""
for i < len(templateRuneSlc) && unicode.IsDigit(templateRuneSlc[i]) {
numStr += string(templateRuneSlc[i])
i++
}
if numStr == "" {
dst += "$"
} else {
num, _ := strconv.Atoi(numStr)
if num < len(match) {
dst += string(srcRuneSlc[match[num].StartIdx:match[num].EndIdx])
} else {
dst += "$" + numStr
}
}
}
} else {
dst += string(c)
i++
}
}
return dst
}
// LiteralPrefix returns a string that must begin any match of the given regular expression.
// The second return value is true if the string comprises the entire expression.
func (re Reg) LiteralPrefix() (prefix string, complete bool) {
state := re.start
if state.assert != noneAssert {
state = state.next
}
for !(state.isLast) && (!state.isAlternation) && len(state.content) == 1 && state.assert == noneAssert {
if state.groupBegin || state.groupEnd {
state = state.next
continue
}
prefix += string(rune(state.content[0]))
state = state.next
}
if state.isLast {
complete = true
} else {
complete = false
}
return prefix, complete
}
// ReplaceAll replaces all matches of the expression in src, with the text in repl. In repl, variables are interpreted
// as they are in [Reg.Expand]. The resulting string is returned.
func (re Reg) ReplaceAll(src string, repl string) string {
matches := re.FindAllSubmatch(src)
i := 0
currentMatch := 0
dst := ""
for i < len(src) {
if currentMatch < len(matches) && matches[currentMatch][0].IsValid() && i == matches[currentMatch][0].StartIdx {
dst += re.Expand("", repl, src, matches[currentMatch])
i = matches[currentMatch][0].EndIdx
currentMatch++
} else {
dst += string(src[i])
i++
}
}
return dst
}
// ReplaceAllLiteral replaces all matches of the expression in src, with the text in repl. The text is replaced directly,
// without any expansion.
func (re Reg) ReplaceAllLiteral(src string, repl string) string {
zerogroups := re.FindAll(src)
currentMatch := 0
i := 0
dst := ""
for i < len(src) {
if currentMatch < len(zerogroups) && i == zerogroups[currentMatch].StartIdx {
dst += repl
i = zerogroups[currentMatch].EndIdx
currentMatch += 1
} else {
dst += string(src[i])
i++
}
}
return dst
}
// ReplaceAllFunc replaces every match of the expression in src, with the return value of the function replFunc.
// replFunc takes in the matched string. The return value is substituted in directly without expasion.
func (re Reg) ReplaceAllFunc(src string, replFunc func(string) string) string {
zerogroups := re.FindAll(src)
currentMatch := 0
i := 0
dst := ""
for i < len(src) {
if currentMatch < len(zerogroups) && i == zerogroups[currentMatch].StartIdx {
dst += replFunc(src[zerogroups[currentMatch].StartIdx:zerogroups[currentMatch].EndIdx])
i = zerogroups[currentMatch].EndIdx
currentMatch += 1
} else {
dst += string(src[i])
i++
}
}
return dst
}