package regex
import (
"fmt"
"strconv"
"unicode"
)
// A Match represents a single match found by the regex in a given string.
// It is a list of groups, where the nth element holds the indices of the
// nth capturing group. A group may be invalid, meaning it did not take part
// in the match (see [Group.IsValid]). The element at index 0 is known as the
// 0-group, and holds the indices of the entire match.
//
// See [Reg.FindSubmatch] for an example.
type Match [ ] Group
// A Group represents a capturing group. It records where the group starts
// and where it ends in the searched text; the text of the group is the
// half-open range [StartIdx, EndIdx). Both fields are negative for a group
// that did not take part in the match.
type Group struct {
	StartIdx, EndIdx int
}
// newMatch builds a Match holding size groups, every one of them initialised
// to the invalid sentinel (both indices -1), so that groups which are never
// written to can be told apart from groups that matched.
func newMatch(size int) Match {
	groups := make(Match, size)
	for i := 0; i < size; i++ {
		groups[i] = Group{StartIdx: -1, EndIdx: -1}
	}
	return groups
}
// String lists the indices of all valid groups in the match. Each valid group
// contributes a "Group <n>" header line followed by a line with the group's
// own string form; invalid groups are skipped.
func (m Match) String() string {
	out := ""
	for n := range m {
		if !m[n].IsValid() {
			continue
		}
		out += fmt.Sprintf("Group %d\n", n) + m[n].String() + "\n"
	}
	return out
}
// String formats the group as its start index and end index separated by a
// single tab character.
func (g Group) String() string {
	start, end := g.StartIdx, g.EndIdx
	return fmt.Sprintf("%d\t%d", start, end)
}
// NOTE(review): the text below appears to be a version-control commit message
// that ended up in the source file. It is kept here as a comment so that the
// file remains valid Go; it does not document the declaration that follows.
//
// Big rewrite - assertion handling, zero-match fixes, change in recursive calls
//
// I added support for transitions. I wrote a function to determine if
// a given state has transitions for a character at a given point in the
// string. This helps me check if the current state has an assertion, and
// take actions based on that.
//
// I also fixed zero-length matching (almost, see todo.txt). It works for
// nearly all cases I could think of, although I still need to write more
// tests. I wrote a function to check if zero-length matches are possible
// with a given state.
//
// I also changed the way recursive calls work. Rather than passing a
// modified string, the function stores the location in the input string.
// This location is updated with each call to the function.
//
// Finally, the function now increments the offset by 1 instead of
// incrementing by the length of the longest match. This leads to a bit of
// overhead, e.g. if a regex matches index 1-5, then 1-5, 2-5, 3-5, 4-5 are
// all stored. To fix this, I wrote (and used) a function to check if
// a match overlaps with any matches in a slice.
//
// (committed 4 months ago)

// IsValid reports whether the group is valid, i.e. whether it took part in
// the match. A group is valid exactly when neither of its indices is
// negative; a valid group may still be empty (StartIdx == EndIdx).
func (g Group) IsValid() bool {
	if g.StartIdx < 0 || g.EndIdx < 0 {
		return false
	}
	return true
}
// getZeroGroup returns the 0-group (the span of the entire match) of m.
// It exists as a named function so that it can be mapped over a list of
// matches. m must contain at least one group.
func getZeroGroup(m Match) Group {
	const zeroGroupIdx = 0
	return m[zeroGroupIdx]
}
// copyThread gives the state pointed to by 'to' its own copy of the capture
// groups recorded in 'from'. The copy uses fresh backing storage, so later
// writes through one state's groups are not visible through the other's.
func copyThread(to *nfaState, from nfaState) {
	groups := make([]Group, len(from.threadGroups))
	copy(groups, from.threadGroups)
	to.threadGroups = groups
}
// Find returns the 0-group (the span of the entire match) of the leftmost
// match of the regex in the given string.
// A non-nil error indicates that no match was found; the returned Group is
// then the zero Group and carries no information.
func (re Reg) Find(str string) (Group, error) {
	if match, err := re.FindNthMatch(str, 1); err == nil {
		return getZeroGroup(match), nil
	}
	return Group{}, fmt.Errorf("no matches found")
}
// Match reports whether the regex finds a match anywhere in the given string.
func (re Reg) Match(str string) bool {
	if _, err := re.Find(str); err != nil {
		return false
	}
	return true
}
// CompileMatch compiles expr and reports whether str contains a match of the
// expression. It is this package's counterpart to [regexp.Match].
// An optional list of flags may be provided (see [ReFlag]).
// The returned error is non-nil if the expression could not be compiled, in
// which case the boolean result is false.
func CompileMatch(expr string, str string, flags ...ReFlag) (bool, error) {
	compiled, compileErr := Compile(expr, flags...)
	if compileErr == nil {
		return compiled.Match(str), nil
	}
	return false, compileErr
}
// FindAll returns the 0-groups of all matches of the regex in the given
// string. A 0-group is the span of a whole match, without its submatches.
func (re Reg) FindAll(str string) []Group {
	return funcMap(re.FindAllSubmatch(str), getZeroGroup)
}
// FindString returns the text of the leftmost match of the regex in the given string.
// The return value will be an empty string in two situations:
//  1. No match was found
//  2. The match was an empty string
func (re Reg) FindString(str string) string {
	match, err := re.FindNthMatch(str, 1)
	if err != nil {
		return ""
	}
	zeroGroup := getZeroGroup(match)
	// The matcher walks the input as a []rune, so group indices count runes,
	// not bytes. Slicing the string directly would cut multi-byte characters
	// apart; slice the rune view instead.
	return string([]rune(str)[zeroGroup.StartIdx:zeroGroup.EndIdx])
}
// FindSubmatch returns the leftmost match of the regex in the given string,
// including the submatches matched by capturing groups. The returned [Match]
// will always contain the same number of groups. Whether a group matched
// anything can be determined with [Group.IsValid], or by checking that both
// indices of the group are >= 0.
// The returned error is non-nil if no match was found; the Match is then empty.
func (re Reg) FindSubmatch(str string) (Match, error) {
	if match, err := re.FindNthMatch(str, 1); err == nil {
		return match, nil
	}
	return Match{}, fmt.Errorf("no match found")
}
// FindStringSubmatch is the 'string' version of [Reg.FindSubmatch]. It returns a slice of strings,
// where the string at index i contains the text matched by the i-th capturing group.
// The 0-th index represents the entire match.
// An empty string at index n could mean:
//
//  1. Group n did not find a match
//  2. Group n found a zero-length match
//
// A return value of nil indicates no match.
func (re Reg) FindStringSubmatch(str string) []string {
	match, err := re.FindSubmatch(str)
	if err != nil {
		return nil
	}
	// Group indices count runes (the matcher scans a []rune), so the text of
	// a group has to be cut out of the rune view of the input, not out of
	// the byte string.
	runes := []rune(str)
	matchStr := make([]string, re.numGroups+1)
	validGroupFound := false
	for i, g := range match {
		if !g.IsValid() {
			// The entry keeps its zero value, the empty string.
			continue
		}
		matchStr[i] = string(runes[g.StartIdx:g.EndIdx])
		validGroupFound = true
	}
	if !validGroupFound {
		return nil
	}
	return matchStr
}
// FindAllString is the 'all' version of [Reg.FindString].
// It returns a slice of strings containing the text of all matches of
// the regex in the given string.
func (re Reg) FindAllString(str string) []string {
	// Group indices count runes (the matcher scans a []rune), so the text of
	// each match is cut out of the rune view of the input. The conversion is
	// done once, outside the per-match callback.
	runes := []rune(str)
	return funcMap(re.FindAll(str), func(g Group) string {
		return string(runes[g.StartIdx:g.EndIdx])
	})
}
// FindNthMatch returns the 'n'th match of the regex in the given string.
// Matches are counted from 1, so n == 1 is the leftmost match.
// It returns a non-nil error if n is smaller than 1, or if there are fewer
// than 'n' matches in the string.
func (re Reg) FindNthMatch(str string, n int) (Match, error) {
	// There is no 0th (or negative) match. Without this guard, n == 0 would
	// compare equal to the match counter before anything had matched and an
	// empty Match would be returned together with a nil error.
	if n < 1 {
		return nil, fmt.Errorf("invalid match index - must be at least 1")
	}
	strRunes := []rune(str)
	matchNum := 0
	// The helper reports where the next search has to start, so idx is only
	// ever advanced through its return value.
	for idx := 0; idx <= len(strRunes); {
		var (
			matchFound bool
			match      Match
		)
		matchFound, match, idx = findAllSubmatchHelper(re.start, strRunes, idx, re.numGroups, re.preferLongest)
		if !matchFound {
			continue
		}
		matchNum++
		if matchNum == n {
			return match, nil
		}
	}
	// We haven't found the nth match after scanning the string - Return an error
	return nil, fmt.Errorf("invalid match index - too few matches found")
}
// FindAllSubmatch returns all matches of the regex in the given string, in
// the order in which they are found. Each search resumes at the position
// reported by the previous one. The result is empty (but not nil) when
// nothing matches.
func (re Reg) FindAllSubmatch(str string) []Match {
	runes := []rune(str)
	matches := []Match{}
	for pos := 0; pos <= len(runes); {
		found, match, next := findAllSubmatchHelper(re.start, runes, pos, re.numGroups, re.preferLongest)
		if found {
			matches = append(matches, match)
		}
		pos = next
	}
	return matches
}
// FindAllStringSubmatch returns a double-slice of strings. Each inner slice
// contains the text of one match, including all submatches: index 0 is the
// text of the whole match and index i the text of the i-th capturing group.
// A group that did not take part in the match is represented by an empty string.
// A return value of nil indicates no match.
func (re Reg) FindAllStringSubmatch(str string) [][]string {
	matches := re.FindAllSubmatch(str)
	if len(matches) == 0 {
		return nil
	}
	// Group indices count runes (the matcher scans a []rune), so the text of
	// each group is cut out of the rune view of the input rather than out of
	// the byte string.
	runes := []rune(str)
	return funcMap(matches, func(m Match) []string {
		return funcMap(m, func(g Group) string {
			if !g.IsValid() {
				return ""
			}
			return string(runes[g.StartIdx:g.EndIdx])
		})
	})
}
// addStateToList adds state to list, following every transition that does not
// consume input, and returns the resulting list.
//
// str and idx are the input and the current position in it; they are used to
// evaluate assertions and are recorded as group boundaries. threadGroups holds
// the capture groups of the thread that reached state. visited lists the
// states already expanded along the current chain of recursive calls.
// preferLongest is passed through to checkAssertion.
//
// state is received by value, so the assignments to state.threadGroups below
// change only this call's copy of the state.
func addStateToList ( str [ ] rune , idx int , list [ ] nfaState , state nfaState , threadGroups [ ] Group , visited [ ] nfaState , preferLongest bool ) [ ] nfaState {
// Nothing to do if the state is already in the list or was already expanded.
if stateExists ( list , state ) || stateExists ( visited , state ) {
return list
}
visited = append ( visited , state )
// Kleene star and question mark: expand both successors, the split branch
// first and then next. The order of the two calls decides the order in which
// the resulting states are appended to list.
if state . isKleene || state . isQuestion {
copyThread ( state . splitState , state )
list = addStateToList ( str , idx , list , * state . splitState , threadGroups , visited , preferLongest )
copyThread ( state . next , state )
list = addStateToList ( str , idx , list , * state . next , threadGroups , visited , preferLongest )
return list
}
// Alternation: expand both successors, next first and then the split branch.
if state . isAlternation {
copyThread ( state . next , state )
list = addStateToList ( str , idx , list , * state . next , threadGroups , visited , preferLongest )
copyThread ( state . splitState , state )
list = addStateToList ( str , idx , list , * state . splitState , threadGroups , visited , preferLongest )
return list
}
// From here on the state carries its own copy of the thread's groups.
state . threadGroups = append ( [ ] Group { } , threadGroups ... )
// Assertion: if it holds at idx, continue with the successor. If it does not
// hold, execution falls through to the checks below.
if state . assert != noneAssert {
if state . checkAssertion ( str , idx , preferLongest ) {
copyThread ( state . next , state )
return addStateToList ( str , idx , list , * state . next , state . threadGroups , visited , preferLongest )
}
}
// Start of a capturing group: record idx as the group's start and move on.
if state . groupBegin {
state . threadGroups [ state . groupNum ] . StartIdx = idx
return addStateToList ( str , idx , list , * state . next , state . threadGroups , visited , preferLongest )
}
// End of a capturing group: record idx as the group's end and move on.
if state . groupEnd {
state . threadGroups [ state . groupNum ] . EndIdx = idx
return addStateToList ( str , idx , list , * state . next , state . threadGroups , visited , preferLongest )
}
// Any other state is appended to the list as it is.
return append ( list , state )
}
// Helper for FindAllMatches. Returns whether it found a match, the
// first Match it finds, and how far it got into the string ie. where
// the next search should start from.
func findAllSubmatchHelper ( start * nfaState , str [ ] rune , offset int , numGroups int , preferLongest bool ) ( bool , Match , int ) {
Big rewrite - assertion handling, zero-match fixes, change in recursive calls
I added support for transitions. I wrote a function to determine if
a given state has transitions for a character at a given point in the
string. This helps me check if the current state has an assertion, and
take actions based on that.
I also fixed zero-length matching (almost, see todo.txt). It works for
nearly all cases I could think of, although I still need to write more
tests. I wrote a function to check if zero-length matches are possible
with a given state.
I also changed the way recursive calls work. Rather than passing a
modified string, the function stores the location in the input string.
This location is updated with each call to the function.
Finally, the function now increments the offset by 1 instead of
incrementing by the length of the longest match. This leads to a bit of
overhead eg. if a regex matches index 1-5, then 1-5, 2-5, 3-5, 4-5 are
all stored. To fix this, I wrote (and used) a function to check if
a match overlaps with any matches in a slice.
4 months ago
// Base case - exit if offset exceeds string's length
if offset > len ( str ) {
// The second value here shouldn't be used, because we should exit when the third return value is > than len(str)
return false , [ ] Group { } , offset
Big rewrite - assertion handling, zero-match fixes, change in recursive calls
I added support for transitions. I wrote a function to determine if
a given state has transitions for a character at a given point in the
string. This helps me check if the current state has an assertion, and
take actions based on that.
I also fixed zero-length matching (almost, see todo.txt). It works for
nearly all cases I could think of, although I still need to write more
tests. I wrote a function to check if zero-length matches are possible
with a given state.
I also changed the way recursive calls work. Rather than passing a
modified string, the function stores the location in the input string.
This location is updated with each call to the function.
Finally, the function now increments the offset by 1 instead of
incrementing by the length of the longest match. This leads to a bit of
overhead eg. if a regex matches index 1-5, then 1-5, 2-5, 3-5, 4-5 are
all stored. To fix this, I wrote (and used) a function to check if
a match overlaps with any matches in a slice.
4 months ago
}
resetThreads ( start )
currentStates := make ( [ ] nfaState , 0 )
nextStates := make ( [ ] nfaState , 0 )
i := offset // Index in string
Big rewrite - assertion handling, zero-match fixes, change in recursive calls
I added support for transitions. I wrote a function to determine if
a given state has transitions for a character at a given point in the
string. This helps me check if the current state has an assertion, and
take actions based on that.
I also fixed zero-length matching (almost, see todo.txt). It works for
nearly all cases I could think of, although I still need to write more
tests. I wrote a function to check if zero-length matches are possible
with a given state.
I also changed the way recursive calls work. Rather than passing a
modified string, the function stores the location in the input string.
This location is updated with each call to the function.
Finally, the function now increments the offset by 1 instead of
incrementing by the length of the longest match. This leads to a bit of
overhead eg. if a regex matches index 1-5, then 1-5, 2-5, 3-5, 4-5 are
all stored. To fix this, I wrote (and used) a function to check if
a match overlaps with any matches in a slice.
4 months ago
// If the first state is an assertion, makes sure the assertion
// is true before we do _anything_ else.
if start . assert != noneAssert {
if start . checkAssertion ( str , offset , preferLongest ) == false {
i ++
return false , [ ] Group { } , i
}
}
start . threadGroups = newMatch ( numGroups + 1 )
start . threadGroups [ 0 ] . StartIdx = i
currentStates = addStateToList ( str , i , currentStates , * start , start . threadGroups , nil , preferLongest )
var match Match = nil
for idx := i ; idx <= len ( str ) ; idx ++ {
if len ( currentStates ) == 0 {
break
}
for currentStateIdx := 0 ; currentStateIdx < len ( currentStates ) ; currentStateIdx ++ {
currentState := currentStates [ currentStateIdx ]
if currentState . threadGroups == nil {
currentState . threadGroups = newMatch ( numGroups + 1 )
currentState . threadGroups [ 0 ] . StartIdx = idx
}
if currentState . isLast {
currentState . threadGroups [ 0 ] . EndIdx = idx
match = append ( [ ] Group { } , currentState . threadGroups ... )
if ! preferLongest {
break
}
} else if ! currentState . isAlternation && ! currentState . isKleene && ! currentState . isQuestion && ! currentState . groupBegin && ! currentState . groupEnd && currentState . assert == noneAssert { // Normal character
if currentState . contentContains ( str , idx , preferLongest ) {
nextStates = addStateToList ( str , idx + 1 , nextStates , * currentState . next , currentState . threadGroups , nil , preferLongest )
}
}
}
currentStates = append ( [ ] nfaState { } , nextStates ... )
nextStates = nil
}
if match != nil {
if offset == match [ 0 ] . EndIdx {
return true , match , match [ 0 ] . EndIdx + 1
}
return true , match , match [ 0 ] . EndIdx
}
return false , [ ] Group { } , i + 1
}
// Expand appends template to dst, expanding any variables in template to the relevant capturing group.
//
// A variable is of the form '$n', where 'n' is a number. It will be replaced by the contents of the n-th capturing group.
// To insert a literal $, do not put a number after it. Alternatively, you can use $$.
// src is the input string, and match must be the result of [Reg.FindSubmatch].
//
// A variable that refers to a group which did not take part in the match
// expands to nothing. A variable whose number is not a group of match (it is
// too large, or is not a plain decimal number) is copied to the output unchanged.
func (re Reg) Expand(dst string, template string, src string, match Match) string {
	templateRunes := []rune(template)
	// Group indices count runes, so the source is addressed as runes as well.
	srcRunes := []rune(src)
	out := []byte(dst)
	for i := 0; i < len(templateRunes); {
		c := templateRunes[i]
		if c != '$' {
			out = append(out, string(c)...)
			i++
			continue
		}
		i++ // step over the '$'
		// The dollar sign is the last character of the template, or it is followed by another dollar sign
		if i >= len(templateRunes) || templateRunes[i] == '$' {
			out = append(out, '$')
			i++
			continue
		}
		// Collect the digits after the '$'. The bounds check matters: the
		// number may run up to the very end of the template.
		digitsStart := i
		for i < len(templateRunes) && unicode.IsDigit(templateRunes[i]) {
			i++
		}
		numStr := string(templateRunes[digitsStart:i])
		if numStr == "" {
			// A '$' that is not followed by a number is a literal '$'.
			out = append(out, '$')
			continue
		}
		num, err := strconv.Atoi(numStr)
		if err != nil || num >= len(match) {
			// Not a usable group number (Atoi rejects overflowing values and
			// non-ASCII digits): keep the variable as written.
			out = append(out, '$')
			out = append(out, numStr...)
			continue
		}
		// An unmatched group has negative indices; it expands to nothing.
		// The range check also keeps a match that does not fit src from
		// slicing out of bounds.
		g := match[num]
		if g.StartIdx >= 0 && g.StartIdx <= g.EndIdx && g.EndIdx <= len(srcRunes) {
			out = append(out, string(srcRunes[g.StartIdx:g.EndIdx])...)
		}
	}
	return string(out)
}
// LiteralPrefix returns a string that must begin any match of the given regular expression.
// The second return value is true if the string comprises the entire expression.
//
// The prefix is collected by walking the chain of states from the start
// state (stepping over a leading assertion state) for as long as each state
// holds exactly one content element and is neither the last state, an
// alternation, nor an assertion.
func (re Reg) LiteralPrefix() (prefix string, complete bool) {
	cur := re.start
	if cur.assert != noneAssert {
		cur = cur.next
	}
	for {
		if cur.isLast || cur.isAlternation || len(cur.content) != 1 || cur.assert != noneAssert {
			break
		}
		// Group boundary states are stepped over without adding to the prefix.
		if !cur.groupBegin && !cur.groupEnd {
			prefix += string(rune(cur.content[0]))
		}
		cur = cur.next
	}
	// The prefix is the whole expression only if the walk reached the last state.
	return prefix, cur.isLast
}