package regex
import (
"fmt"
)
// A Match represents a single match found by the regex in a given string.
// It is a list of groups, where the nth element describes the nth capturing
// group. A group in the list is not necessarily valid (see [Group.IsValid]).
// The element at index 0 is known as the 0-group, and spans the entire match.
//
// See [Reg.FindSubmatch] for an example.
type Match [ ] Group
// Group represents a capturing group. It holds the index at which the group
// starts (StartIdx) and the index at which it ends (EndIdx). A negative index
// means that side of the group was never set; see IsValid.
type Group struct {
	StartIdx, EndIdx int
}
// newMatch allocates a Match holding size groups, every one of them marked
// invalid by setting both of its indices to -1.
func newMatch(size int) Match {
	unset := Group{StartIdx: -1, EndIdx: -1}
	groups := make(Match, size)
	for pos := range groups {
		groups[pos] = unset
	}
	return groups
}
// numValidGroups counts the groups in the match whose start and end indices
// are both non-negative.
func (m Match) numValidGroups() int {
	count := 0
	for pos := range m {
		if m[pos].StartIdx < 0 || m[pos].EndIdx < 0 {
			continue
		}
		count++
	}
	return count
}
// String lists every valid group of the match. Each valid group contributes a
// "Group <n>" header line followed by a line with the group's own String form.
// Invalid groups are skipped.
func (m Match) String() string {
	out := ""
	for num, grp := range m {
		if !grp.IsValid() {
			continue
		}
		out += fmt.Sprintf("Group %d\n", num) + grp.String() + "\n"
	}
	return out
}
// String renders the Group as its start index and end index, separated by a
// tab character.
func (g Group) String() string {
	start, end := g.StartIdx, g.EndIdx
	return fmt.Sprintf("%d\t%d", start, end)
}
// Commit note: Big rewrite - assertion handling, zero-match fixes, change in
// recursive calls.
//
// I added support for transitions. I wrote a function to determine if
// a given state has transitions for a character at a given point in the
// string. This helps me check if the current state has an assertion, and
// take actions based on that.
//
// I also fixed zero-length matching (almost, see todo.txt). It works for
// nearly all cases I could think of, although I still need to write more
// tests. I wrote a function to check if zero-length matches are possible
// with a given state.
//
// I also changed the way recursive calls work. Rather than passing a
// modified string, the function stores the location in the input string.
// This location is updated with each call to the function.
//
// Finally, the function now increments the offset by 1 instead of
// incrementing by the length of the longest match. This leads to a bit of
// overhead, e.g. if a regex matches index 1-5, then 1-5, 2-5, 3-5, 4-5 are
// all stored. To fix this, I wrote (and used) a function to check if
// a match overlaps with any matches in a slice.
//
// (4 months ago)

// IsValid reports whether a group is valid (i.e. whether it matched any text).
// A group is valid when neither of its indices is negative.
func (g Group) IsValid() bool {
	if g.StartIdx < 0 {
		return false
	}
	return g.EndIdx >= 0
}
// getZeroGroup returns the 0-group (the element at index 0) of a match. It
// exists as a standalone function so that it can be mapped over a list of
// matches.
func getZeroGroup(m Match) Group {
	zeroGroup := m[0]
	return zeroGroup
}
// copyThread gives 'to' its own copy of the capture groups currently held by
// 'from'. The groups are duplicated rather than shared, so writing to one
// state's groups afterwards does not affect the other's.
func copyThread(to *nfaState, from nfaState) {
	groups := make([]Group, len(from.threadGroups))
	copy(groups, from.threadGroups)
	to.threadGroups = groups
}
// Find returns the 0-group of the leftmost match of the regex in the given string.
// The returned error is non-nil when the string contains no match; the Group
// is the zero value in that case.
func (regex Reg) Find(str string) (Group, error) {
	if first, err := regex.FindNthMatch(str, 1); err == nil {
		return getZeroGroup(first), nil
	}
	return Group{}, fmt.Errorf("no matches found")
}
// Match reports whether the regex finds a match anywhere in the given string.
func (regex Reg) Match(str string) bool {
	if _, err := regex.Find(str); err != nil {
		return false
	}
	return true
}
// FindAll returns a slice containing the 0-group of every match of the regex
// in the given string. A 0-group represents the match without any submatches.
func (regex Reg) FindAll(str string) []Group {
	return funcMap(regex.FindAllSubmatch(str), getZeroGroup)
}
// FindString returns the text of the leftmost match of the regex in the given string.
// The return value will be an empty string in two situations:
//  1. No match was found
//  2. The match was an empty string
func (regex Reg) FindString(str string) string {
	match, err := regex.FindNthMatch(str, 1)
	if err != nil {
		return ""
	}
	zeroGroup := getZeroGroup(match)
	// The matcher runs over []rune(str), so group indices count runes, not
	// bytes. Slice the rune view; slicing str directly would cut multi-byte
	// characters apart and return the wrong text for non-ASCII input.
	return string([]rune(str)[zeroGroup.StartIdx:zeroGroup.EndIdx])
}
// FindSubmatch returns the leftmost match of the regex in the given string, including
// the submatches matched by capturing groups. The validity of a group (whether or not
// it matched anything) can be determined with [Group.IsValid], or by checking that
// both indices of the group are >= 0.
// The returned error is nil when a match was found and non-nil otherwise; on
// error the returned Match is empty.
func (regex Reg) FindSubmatch(str string) (Match, error) {
	first, err := regex.FindNthMatch(str, 1)
	if err == nil {
		return first, nil
	}
	return Match{}, fmt.Errorf("no match found")
}
// FindStringSubmatch is the 'string' version of [Reg.FindSubmatch]. It returns a slice of strings,
// where the string at index i contains the text matched by the i-th capturing group.
// The 0-th index represents the entire match.
// An empty string at index n could mean:
//
//  1. Group n did not find a match
//  2. Group n found a zero-length match
//
// A return value of nil indicates no match.
func (regex Reg) FindStringSubmatch(str string) []string {
	match, err := regex.FindSubmatch(str)
	if err != nil {
		return nil
	}
	// Group indices count runes (the matcher runs over []rune(str)), so the
	// text of each group must be cut from the rune view, not from the bytes.
	runes := []rune(str)
	matchStr := make([]string, regex.numGroups+1)
	validGroupFound := false
	for i, group := range match {
		if !group.IsValid() {
			// Invalid groups keep the zero value, the empty string.
			continue
		}
		matchStr[i] = string(runes[group.StartIdx:group.EndIdx])
		validGroupFound = true
	}
	if !validGroupFound {
		return nil
	}
	return matchStr
}
// FindAllString is the 'all' version of [Reg.FindString].
// It returns a slice of strings containing the text of all matches of
// the regex in the given string.
func (regex Reg) FindAllString(str string) []string {
	zerogroups := regex.FindAll(str)
	// Group indices count runes (the matcher runs over []rune(str)), so convert
	// once and cut every match out of the rune view rather than the raw bytes.
	runes := []rune(str)
	return funcMap(zerogroups, func(g Group) string {
		return string(runes[g.StartIdx:g.EndIdx])
	})
}
// FindNthMatch returns the 'n'th match of the regex in the given string, where
// the leftmost match is n = 1.
// It returns a non-nil error if n is less than 1, or if there are fewer than
// 'n' matches in the string. The returned Match is nil whenever the error is
// non-nil.
func (regex Reg) FindNthMatch(str string, n int) (Match, error) {
	// Matches are counted from 1. Without this guard, n == 0 would compare
	// equal to the match counter before anything had matched and hand back an
	// empty Match with a nil error.
	if n < 1 {
		return nil, fmt.Errorf("invalid match index - n must be >= 1")
	}
	strRunes := []rune(str)
	matchNum := 0
	// The helper reports where the next search should start, so the loop
	// variable is advanced by its third return value rather than by a post
	// statement.
	for idx := 0; idx <= len(strRunes); {
		var matchFound bool
		var matchIdx Match
		matchFound, matchIdx, idx = findAllSubmatchHelper(regex.start, strRunes, idx, regex.numGroups)
		if !matchFound {
			continue
		}
		matchNum++
		if matchNum == n {
			return matchIdx, nil
		}
	}
	// We haven't found the nth match after scanning the string - return an error.
	return nil, fmt.Errorf("invalid match index - too few matches found")
}
// FindAllSubmatch returns every match of the regex in the given string, in the
// order in which they were found. The result is empty (but not nil) when
// nothing matches.
func (regex Reg) FindAllSubmatch(str string) []Match {
	runes := []rune(str)
	found := []Match{}
	// Each call to the helper reports where the following search should start.
	for pos := 0; pos <= len(runes); {
		ok, m, next := findAllSubmatchHelper(regex.start, runes, pos, regex.numGroups)
		if ok {
			found = append(found, m)
		}
		pos = next
	}
	return found
}
// addStateToList adds state to list and returns the updated list. Kleene,
// question, alternation, group-boundary and (satisfied) assertion states are
// not added themselves; the function recurses into their successors instead,
// so only the states that fall through every branch end up appended.
//
// str and idx are the input and the current position in it; idx is what gets
// recorded as a group boundary and what assertions are checked against.
// threadGroups holds the capture groups of the thread being followed. visited
// holds the states already expanded along the current chain of recursive calls.
func addStateToList ( str [ ] rune , idx int , list [ ] nfaState , state nfaState , threadGroups [ ] Group , visited [ ] nfaState ) [ ] nfaState {
// Nothing to do if the state is already in the list or was already expanded.
if stateExists ( list , state ) || stateExists ( visited , state ) {
return list
}
visited = append ( visited , state )
// Kleene / question states: follow splitState first, then next.
if state . isKleene || state . isQuestion {
copyThread ( state . splitState , state )
list = addStateToList ( str , idx , list , * state . splitState , threadGroups , visited )
copyThread ( state . next , state )
list = addStateToList ( str , idx , list , * state . next , threadGroups , visited )
return list
}
// Alternation states: follow next first, then splitState.
if state . isAlternation {
copyThread ( state . next , state )
list = addStateToList ( str , idx , list , * state . next , threadGroups , visited )
copyThread ( state . splitState , state )
list = addStateToList ( str , idx , list , * state . splitState , threadGroups , visited )
return list
}
// Give this state its own copy of the thread's groups before writing to them.
state . threadGroups = append ( [ ] Group { } , threadGroups ... )
// An assertion that holds at idx is skipped over in favour of its successor.
// If it does not hold, control falls through to the checks below.
if state . assert != noneAssert {
if state . checkAssertion ( str , idx ) {
copyThread ( state . next , state )
return addStateToList ( str , idx , list , * state . next , state . threadGroups , visited )
}
}
// Group start: record idx as the start of group groupNum, then move on.
if state . groupBegin {
state . threadGroups [ state . groupNum ] . StartIdx = idx
return addStateToList ( str , idx , list , * state . next , state . threadGroups , visited )
}
// Group end: record idx as the end of group groupNum, then move on.
if state . groupEnd {
state . threadGroups [ state . groupNum ] . EndIdx = idx
return addStateToList ( str , idx , list , * state . next , state . threadGroups , visited )
}
// Any other state is added to the list itself.
return append ( list , state )
}
// Helper for FindAllMatches. Returns whether it found a match, the
// first Match it finds, and how far it got into the string ie. where
// the next search should start from.
func findAllSubmatchHelper ( start * nfaState , str [ ] rune , offset int , numGroups int ) ( bool , Match , int ) {
Big rewrite - assertion handling, zero-match fixes, change in recursive calls
I added support for transitions. I wrote a function to determine if
a given state has transitions for a character at a given point in the
string. This helps me check if the current state has an assertion, and
take actions based on that.
I also fixed zero-length matching (almost, see todo.txt). It works for
nearly all cases I could think of, although I still need to write more
tests. I wrote a function to check if zero-length matches are possible
with a given state.
I also changed the way recursive calls work. Rather than passing a
modified string, the function stores the location in the input string.
This location is updated with each call to the function.
Finally, the function now increments the offset by 1 instead of
incrementing by the length of the longest match. This leads to a bit of
overhead eg. if a regex matches index 1-5, then 1-5, 2-5, 3-5, 4-5 are
all stored. To fix this, I wrote (and used) a function to check if
a match overlaps with any matches in a slice.
4 months ago
// Base case - exit if offset exceeds string's length
if offset > len ( str ) {
// The second value here shouldn't be used, because we should exit when the third return value is > than len(str)
return false , [ ] Group { } , offset
Big rewrite - assertion handling, zero-match fixes, change in recursive calls
I added support for transitions. I wrote a function to determine if
a given state has transitions for a character at a given point in the
string. This helps me check if the current state has an assertion, and
take actions based on that.
I also fixed zero-length matching (almost, see todo.txt). It works for
nearly all cases I could think of, although I still need to write more
tests. I wrote a function to check if zero-length matches are possible
with a given state.
I also changed the way recursive calls work. Rather than passing a
modified string, the function stores the location in the input string.
This location is updated with each call to the function.
Finally, the function now increments the offset by 1 instead of
incrementing by the length of the longest match. This leads to a bit of
overhead eg. if a regex matches index 1-5, then 1-5, 2-5, 3-5, 4-5 are
all stored. To fix this, I wrote (and used) a function to check if
a match overlaps with any matches in a slice.
4 months ago
}
resetThreads ( start )
currentStates := make ( [ ] nfaState , 0 )
nextStates := make ( [ ] nfaState , 0 )
i := offset // Index in string
Big rewrite - assertion handling, zero-match fixes, change in recursive calls
I added support for transitions. I wrote a function to determine if
a given state has transitions for a character at a given point in the
string. This helps me check if the current state has an assertion, and
take actions based on that.
I also fixed zero-length matching (almost, see todo.txt). It works for
nearly all cases I could think of, although I still need to write more
tests. I wrote a function to check if zero-length matches are possible
with a given state.
I also changed the way recursive calls work. Rather than passing a
modified string, the function stores the location in the input string.
This location is updated with each call to the function.
Finally, the function now increments the offset by 1 instead of
incrementing by the length of the longest match. This leads to a bit of
overhead eg. if a regex matches index 1-5, then 1-5, 2-5, 3-5, 4-5 are
all stored. To fix this, I wrote (and used) a function to check if
a match overlaps with any matches in a slice.
4 months ago
// If the first state is an assertion, makes sure the assertion
// is true before we do _anything_ else.
if start . assert != noneAssert {
if start . checkAssertion ( str , offset ) == false {
i ++
return false , [ ] Group { } , i
}
}
start . threadGroups = newMatch ( numGroups + 1 )
start . threadGroups [ 0 ] . StartIdx = i
currentStates = addStateToList ( str , i , currentStates , * start , start . threadGroups , nil )
var match Match = nil
for idx := i ; idx <= len ( str ) ; idx ++ {
if len ( currentStates ) == 0 {
break
}
for currentStateIdx := 0 ; currentStateIdx < len ( currentStates ) ; currentStateIdx ++ {
currentState := currentStates [ currentStateIdx ]
if currentState . threadGroups == nil {
currentState . threadGroups = newMatch ( numGroups + 1 )
currentState . threadGroups [ 0 ] . StartIdx = idx
}
if currentState . isLast {
currentState . threadGroups [ 0 ] . EndIdx = idx
match = append ( [ ] Group { } , currentState . threadGroups ... )
break
} else if ! currentState . isAlternation && ! currentState . isKleene && ! currentState . isQuestion && ! currentState . groupBegin && ! currentState . groupEnd { // Normal character or assertion
if currentState . contentContains ( str , idx ) {
nextStates = addStateToList ( str , idx + 1 , nextStates , * currentState . next , currentState . threadGroups , nil )
}
}
}
currentStates = append ( [ ] nfaState { } , nextStates ... )
nextStates = nil
}
if match != nil {
if offset == match [ 0 ] . EndIdx {
return true , match , match [ 0 ] . EndIdx + 1
}
return true , match , match [ 0 ] . EndIdx
}
return false , [ ] Group { } , i + 1
}