package regex import ( "fmt" "strconv" "unicode" ) // A Match represents a match found by the regex in a given string. // It is represented as a list of groups, where the nth element contains // the contents of the nth capturing group. Note that the group may not be valid // (see [Group.IsValid]). The element at index 0 is known // as the 0-group, and represents the contents of the entire match. // // See [Reg.FindSubmatch] for an example. type Match []Group // a Group represents a capturing group. It contains the start and index of the group. type Group struct { StartIdx int EndIdx int } func newMatch(size int) Match { toRet := make([]Group, size) for i := range toRet { toRet[i].StartIdx = -1 toRet[i].EndIdx = -1 } return toRet } // Returns a string containing the indices of all (valid) groups in the match func (m Match) String() string { var toRet string for i, g := range m { if g.IsValid() { toRet += fmt.Sprintf("Group %d\n", i) toRet += g.String() toRet += "\n" } } return toRet } // String converts the Group into a string representation. func (idx Group) String() string { return fmt.Sprintf("%d\t%d", idx.StartIdx, idx.EndIdx) } // IsValid returns whether a group is valid (ie. whether it matched any text). It // simply ensures that both indices of the group are >= 0. func (g Group) IsValid() bool { return g.StartIdx >= 0 && g.EndIdx >= 0 } // Simple function, makes it easier to map this over a list of matches func getZeroGroup(m Match) Group { return m[0] } func copyThread(to *nfaState, from nfaState) { to.threadGroups = append([]Group{}, from.threadGroups...) } // Find returns the 0-group of the leftmost match of the regex in the given string. // An error value != nil indicates that no match was found. func (regex Reg) Find(str string) (Group, error) { match, err := regex.FindNthMatch(str, 1) if err != nil { return Group{}, fmt.Errorf("no matches found") } return getZeroGroup(match), nil } // Match returns a boolean value, indicating whether the regex found a match in the given string. func (regex Reg) Match(str string) bool { _, err := regex.Find(str) return err == nil } // CompileMatch compiles expr and returns true if str contains a match of the expression. // It is equivalent to [regexp.Match]. // An optional list of flags may be provided (see [ReFlag]). // It returns an error (!= nil) if there was an error compiling the expression. func CompileMatch(expr string, str string, flags ...ReFlag) (bool, error) { re, err := Compile(expr, flags...) if err != nil { return false, err } return re.Match(str), nil } // FindAll returns a slice containing all the 0-groups of the regex in the given string. // A 0-group represents the match without any submatches. func (regex Reg) FindAll(str string) []Group { indices := regex.FindAllSubmatch(str) zeroGroups := funcMap(indices, getZeroGroup) return zeroGroups } // FindString returns the text of the leftmost match of the regex in the given string. // The return value will be an empty string in two situations: // 1. No match was found // 2. The match was an empty string func (regex Reg) FindString(str string) string { match, err := regex.FindNthMatch(str, 1) if err != nil { return "" } zeroGroup := getZeroGroup(match) return str[zeroGroup.StartIdx:zeroGroup.EndIdx] } // FindSubmatch returns the leftmost match of the regex in the given string, including // the submatches matched by capturing groups. The returned [Match] will always contain the same // number of groups. The validity of a group (whether or not it matched anything) can be determined with // [Group.IsValid], or by checking that both indices of the group are >= 0. // The second-return value is nil if no match was found. func (regex Reg) FindSubmatch(str string) (Match, error) { match, err := regex.FindNthMatch(str, 1) if err != nil { return Match{}, fmt.Errorf("no match found") } else { return match, nil } } // FindStringSubmatch is the 'string' version of [FindSubmatch]. It returns a slice of strings, // where the string at index i contains the text matched by the i-th capturing group. // The 0-th index represents the entire match. // An empty string at index n could mean: // , // 1. Group n did not find a match // 2. Group n found a zero-length match // // A return value of nil indicates no match. func (regex Reg) FindStringSubmatch(str string) []string { matchStr := make([]string, regex.numGroups+1) match, err := regex.FindSubmatch(str) if err != nil { return nil } nonEmptyMatchFound := false for i := range match { if match[i].IsValid() { matchStr[i] = str[match[i].StartIdx:match[i].EndIdx] nonEmptyMatchFound = true } else { matchStr[i] = "" } } if nonEmptyMatchFound == false { return nil } return matchStr } // FindAllString is the 'all' version of [FindString]. // It returns a slice of strings containing the text of all matches of // the regex in the given string. func (regex Reg) FindAllString(str string) []string { zerogroups := regex.FindAll(str) matchStrs := funcMap(zerogroups, func(g Group) string { return str[g.StartIdx:g.EndIdx] }) return matchStrs } // FindNthMatch return the 'n'th match of the regex in the given string. // It returns an error (!= nil) if there are fewer than 'n' matches in the string. func (regex Reg) FindNthMatch(str string, n int) (Match, error) { idx := 0 matchNum := 0 str_runes := []rune(str) var matchFound bool var matchIdx Match for idx <= len(str_runes) { matchFound, matchIdx, idx = findAllSubmatchHelper(regex.start, str_runes, idx, regex.numGroups, regex.preferLongest) if matchFound { matchNum++ } if matchNum == n { return matchIdx, nil } } // We haven't found the nth match after scanning the string - Return an error return nil, fmt.Errorf("invalid match index - too few matches found") } // FindAllSubmatch returns a slice of matches in the given string. func (regex Reg) FindAllSubmatch(str string) []Match { idx := 0 str_runes := []rune(str) var matchFound bool var matchIdx Match indices := make([]Match, 0) for idx <= len(str_runes) { matchFound, matchIdx, idx = findAllSubmatchHelper(regex.start, str_runes, idx, regex.numGroups, regex.preferLongest) if matchFound { indices = append(indices, matchIdx) } } return indices } func addStateToList(str []rune, idx int, list []nfaState, state nfaState, threadGroups []Group, visited []nfaState, preferLongest bool) []nfaState { if stateExists(list, state) || stateExists(visited, state) { return list } visited = append(visited, state) if state.isKleene || state.isQuestion { copyThread(state.splitState, state) list = addStateToList(str, idx, list, *state.splitState, threadGroups, visited, preferLongest) copyThread(state.next, state) list = addStateToList(str, idx, list, *state.next, threadGroups, visited, preferLongest) return list } if state.isAlternation { copyThread(state.next, state) list = addStateToList(str, idx, list, *state.next, threadGroups, visited, preferLongest) copyThread(state.splitState, state) list = addStateToList(str, idx, list, *state.splitState, threadGroups, visited, preferLongest) return list } state.threadGroups = append([]Group{}, threadGroups...) if state.assert != noneAssert { if state.checkAssertion(str, idx, preferLongest) { copyThread(state.next, state) return addStateToList(str, idx, list, *state.next, state.threadGroups, visited, preferLongest) } } if state.groupBegin { state.threadGroups[state.groupNum].StartIdx = idx return addStateToList(str, idx, list, *state.next, state.threadGroups, visited, preferLongest) } if state.groupEnd { state.threadGroups[state.groupNum].EndIdx = idx return addStateToList(str, idx, list, *state.next, state.threadGroups, visited, preferLongest) } return append(list, state) } // Helper for FindAllMatches. Returns whether it found a match, the // first Match it finds, and how far it got into the string ie. where // the next search should start from. func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups int, preferLongest bool) (bool, Match, int) { // Base case - exit if offset exceeds string's length if offset > len(str) { // The second value here shouldn't be used, because we should exit when the third return value is > than len(str) return false, []Group{}, offset } resetThreads(start) currentStates := make([]nfaState, 0) nextStates := make([]nfaState, 0) i := offset // Index in string // If the first state is an assertion, makes sure the assertion // is true before we do _anything_ else. if start.assert != noneAssert { if start.checkAssertion(str, offset, preferLongest) == false { i++ return false, []Group{}, i } } start.threadGroups = newMatch(numGroups + 1) start.threadGroups[0].StartIdx = i currentStates = addStateToList(str, i, currentStates, *start, start.threadGroups, nil, preferLongest) var match Match = nil for idx := i; idx <= len(str); idx++ { if len(currentStates) == 0 { break } for currentStateIdx := 0; currentStateIdx < len(currentStates); currentStateIdx++ { currentState := currentStates[currentStateIdx] if currentState.threadGroups == nil { currentState.threadGroups = newMatch(numGroups + 1) currentState.threadGroups[0].StartIdx = idx } if currentState.isLast { currentState.threadGroups[0].EndIdx = idx match = append([]Group{}, currentState.threadGroups...) break } else if !currentState.isAlternation && !currentState.isKleene && !currentState.isQuestion && !currentState.groupBegin && !currentState.groupEnd && currentState.assert == noneAssert { // Normal character if currentState.contentContains(str, idx, preferLongest) { nextStates = addStateToList(str, idx+1, nextStates, *currentState.next, currentState.threadGroups, nil, preferLongest) } } } currentStates = append([]nfaState{}, nextStates...) nextStates = nil } if match != nil { if offset == match[0].EndIdx { return true, match, match[0].EndIdx + 1 } return true, match, match[0].EndIdx } return false, []Group{}, i + 1 } // Expand appends template to dst, expanding any variables in template to the relevant capturing group. // // A variable is of the form '$n', where 'n' is a number. It will be replaced by the contents of the n-th capturing group. // To insert a literal $, do not put a number after it. Alternatively, you can use $$. // src is the input string, and match must be the result of [Reg.FindSubmatch]. func (regex Reg) Expand(dst string, template string, src string, match Match) string { templateRuneSlc := []rune(template) srcRuneSlc := []rune(src) i := 0 for i < len(templateRuneSlc) { c := templateRuneSlc[i] if c == '$' { i += 1 // The dollar sign is the last character of the string, or it is proceeded by another dollar sign if i >= len(templateRuneSlc) || templateRuneSlc[i] == '$' { dst += "$" i++ } else { numStr := "" for unicode.IsDigit(templateRuneSlc[i]) { numStr += string(templateRuneSlc[i]) i++ } if numStr == "" { dst += "$" } else { num, _ := strconv.Atoi(numStr) if num < len(match) { dst += string(srcRuneSlc[match[num].StartIdx:match[num].EndIdx]) } else { dst += "$" + numStr } } } } else { dst += string(c) i++ } } return dst } // LiteralPrefix returns a string that must begin any match of the given regular expression. // The second return value is true if the string comprises the entire expression. func (regex Reg) LiteralPrefix() (prefix string, complete bool) { state := regex.start if state.assert != noneAssert { state = state.next } for !(state.isLast) && (!state.isAlternation) && len(state.content) == 1 && state.assert == noneAssert { if state.groupBegin || state.groupEnd { state = state.next continue } prefix += string(rune(state.content[0])) state = state.next } if state.isLast { complete = true } else { complete = false } return prefix, complete }