@ -2,8 +2,8 @@ package regex
import (
"fmt"
"s lices "
" sort "
"s trconv "
" unicode "
)
// A Match represents a match found by the regex in a given string.
@ -15,7 +15,7 @@ import (
// See [Reg.FindSubmatch] for an example.
type Match [ ] Group
// a Group represents a group. It contains the start index and end index of the match
// A Group represents a capturing group. It contains the start and end index of the group.
type Group struct {
StartIdx int
EndIdx int
@ -30,17 +30,6 @@ func newMatch(size int) Match {
return toRet
}
// numValidGroups counts the groups in m that actually matched text, i.e.
// those whose start and end indices are both non-negative.
func (m Match) numValidGroups() int {
	count := 0
	for i := range m {
		// A negative index on either side marks a group that never matched.
		if m[i].StartIdx < 0 || m[i].EndIdx < 0 {
			continue
		}
		count++
	}
	return count
}
// Returns a string containing the indices of all (valid) groups in the match
func ( m Match ) String ( ) string {
var toRet string
@ -59,7 +48,7 @@ func (idx Group) String() string {
return fmt . Sprintf ( "%d\t%d" , idx . StartIdx , idx . EndIdx )
}
// Returns whether a group is valid (ie. whether it matched any text). It
// IsValid returns whether a group is valid (ie. whether it matched any text). It
// simply ensures that both indices of the group are >= 0.
func ( g Group ) IsValid ( ) bool {
return g . StartIdx >= 0 && g . EndIdx >= 0
@ -70,101 +59,42 @@ func getZeroGroup(m Match) Group {
return m [ 0 ]
}
// takeZeroState follows the epsilon ('0-state') transitions of every state in the
// given slice and returns the states those transitions lead to.
//
// Each destination state is handed a thread: if it has no threadGroups yet they are
// allocated with newMatch(numGroups + 1), and the source state's threadGroups are then
// copied into them. If the destination begins or ends a capturing group, idx is
// recorded as that group's start or end index in its thread.
//
// The second return value is true if at least one of the returned states itself has an
// outgoing epsilon transition, i.e. another call on the result could go further.
// No de-duplication is done here; a destination reachable from several states is
// appended once per source state.
func takeZeroState ( states [ ] * nfaState , numGroups int , idx int ) ( rtv [ ] * nfaState , isZero bool ) {
for _ , state := range states {
if len ( state . transitions [ epsilon ] ) > 0 {
for _ , s := range state . transitions [ epsilon ] {
// Lazily give the destination a thread before copying into it.
if s . threadGroups == nil {
s . threadGroups = newMatch ( numGroups + 1 )
}
// The destination inherits the group indices collected so far by the source.
copy ( s . threadGroups , state . threadGroups )
if s . groupBegin {
s . threadGroups [ s . groupNum ] . StartIdx = idx
// openParenGroups = append(openParenGroups, s.groupNum)
}
if s . groupEnd {
s . threadGroups [ s . groupNum ] . EndIdx = idx
// closeParenGroups = append(closeParenGroups, s.groupNum)
}
}
rtv = append ( rtv , state . transitions [ epsilon ] ... )
}
}
// Report whether any state we just reached can itself take an epsilon transition.
for _ , state := range rtv {
if len ( state . transitions [ epsilon ] ) > 0 {
return rtv , true
}
}
return rtv , false
// copyThread gives to its own copy of from's capture-group indices, so that
// later writes through either state's threadGroups do not affect the other.
func copyThread(to *nfaState, from nfaState) {
	cloned := make([]Group, len(from.threadGroups))
	copy(cloned, from.threadGroups)
	to.threadGroups = cloned
}
// zeroMatchPossible returns true if a zero-length match is possible
// from any of the given states, given the string and our position in it.
// It uses the same algorithm to find zero-states as the one inside the loop,
// so I should probably put it in a function.
func zeroMatchPossible ( str [ ] rune , idx int , numGroups int , states ... * nfaState ) bool {
zeroStates , isZero := takeZeroState ( states , numGroups , idx )
tempstates := make ( [ ] * nfaState , 0 , len ( zeroStates ) + len ( states ) )
tempstates = append ( tempstates , states ... )
tempstates = append ( tempstates , zeroStates ... )
num_appended := 0 // number of unique states addded to tempstates
for isZero == true {
zeroStates , isZero = takeZeroState ( tempstates , numGroups , idx )
tempstates , num_appended = uniqueAppend ( tempstates , zeroStates ... )
if num_appended == 0 { // break if we haven't appended any more unique values
break
}
}
for _ , state := range tempstates {
if state . isEmpty && ( state . assert == noneAssert || state . checkAssertion ( str , idx ) ) && state . isLast {
return true
}
// Find returns the 0-group of the leftmost match of the regex in the given string.
// An error value != nil indicates that no match was found.
func ( re Reg ) Find ( str string ) ( Group , error ) {
match , err := re . FindNthMatch ( str , 1 )
if err != nil {
return Group { } , fmt . Errorf ( "no matches found" )
}
return false
return getZeroGroup ( match ) , nil
}
// pruneIndices removes overlapping matches from indices and returns the survivors
// ordered by the start index of their 0-group.
//
// The slice is sorted in place by 0-group start index and then swept once. A match
// that starts at or after the end of the current candidate closes the candidate (it is
// kept) and becomes the new candidate; a match that overlaps the candidate replaces it
// only if its 0-group ends later. Overlapping matches that end no later are dropped.
//
// indices must be non-empty: indices[0] is read unconditionally.
func pruneIndices ( indices [ ] Match ) [ ] Match {
// First, sort the slice by the start index of each match's 0-group.
sort . Slice ( indices , func ( i , j int ) bool {
return indices [ i ] [ 0 ] . StartIdx < indices [ j ] [ 0 ] . StartIdx
} )
toRet := make ( [ ] Match , 0 , len ( indices ) )
// current is the candidate that later, overlapping matches compete against.
current := indices [ 0 ]
for _ , idx := range indices [ 1 : ] {
// idx doesn't overlap with current (it starts at or after current's end), so add
// current to the result and make idx the new candidate.
if idx [ 0 ] . StartIdx >= current [ 0 ] . EndIdx {
toRet = append ( toRet , current )
current = idx
} else if idx [ 0 ] . EndIdx > current [ 0 ] . EndIdx {
// idx overlaps current but ends later, so it replaces current.
current = idx
}
}
// The final candidate has not been emitted yet; add it.
toRet = append ( toRet , current )
return toRet
// Match reports whether the regex matches anywhere in the given string.
// It is true exactly when [Reg.Find] returns a nil error.
func (re Reg) Match(str string) bool {
	if _, err := re.Find(str); err != nil {
		return false
	}
	return true
}
// Find returns the 0-group of the leftmost match of the regex in the given string.
// An error value != nil indicates that no match was found.
func ( regex Reg ) Find ( str string ) ( Group , error ) {
match , err := regex . FindNthMatch ( str , 1 )
// CompileMatch compiles expr and returns true if str contains a match of the expression.
// It is equivalent to [regexp.MatchString].
// An optional list of flags may be provided (see [ReFlag]).
// It returns an error (!= nil) if there was an error compiling the expression.
func CompileMatch ( expr string , str string , flags ... ReFlag ) ( bool , error ) {
re , err := Compile ( expr , flags ... )
if err != nil {
return Group { } , fmt . Errorf ( "no matches found" )
return false , err
}
return getZeroGroup ( match ) , nil
return re . Match ( str ) , nil
}
// FindAll returns a slice containing all the 0-groups of the regex in the given string.
// A 0-group represents the match without any submatches.
func ( re gex Reg ) FindAll ( str string ) [ ] Group {
indices := re gex . FindAllSubmatch ( str )
func ( re Reg ) FindAll ( str string ) [ ] Group {
indices := re . FindAllSubmatch ( str )
zeroGroups := funcMap ( indices , getZeroGroup )
return zeroGroups
}
@ -173,8 +103,8 @@ func (regex Reg) FindAll(str string) []Group {
// The return value will be an empty string in two situations:
// 1. No match was found
// 2. The match was an empty string
func ( re gex Reg ) FindString ( str string ) string {
match , err := re gex . FindNthMatch ( str , 1 )
func ( re Reg ) FindString ( str string ) string {
match , err := re . FindNthMatch ( str , 1 )
if err != nil {
return ""
}
@ -187,8 +117,8 @@ func (regex Reg) FindString(str string) string {
// number of groups. The validity of a group (whether or not it matched anything) can be determined with
// [Group.IsValid], or by checking that both indices of the group are >= 0.
// The second return value is non-nil if no match was found.
func ( re gex Reg ) FindSubmatch ( str string ) ( Match , error ) {
match , err := re gex . FindNthMatch ( str , 1 )
func ( re Reg ) FindSubmatch ( str string ) ( Match , error ) {
match , err := re . FindNthMatch ( str , 1 )
if err != nil {
return Match { } , fmt . Errorf ( "no match found" )
} else {
@ -196,11 +126,41 @@ func (regex Reg) FindSubmatch(str string) (Match, error) {
}
}
// FindAllString is the 'all' version of FindString.
// FindStringSubmatch is the 'string' version of [Reg.FindSubmatch]. It returns a slice of strings,
// where the string at index i contains the text matched by the i-th capturing group.
// The 0-th index represents the entire match.
// An empty string at index n could mean:
//
//  1. Group n did not find a match
//  2. Group n found a zero-length match
//
// A return value of nil indicates no match.
func (re Reg) FindStringSubmatch(str string) []string {
	match, err := re.FindSubmatch(str)
	if err != nil {
		return nil
	}

	// Group indices are offsets into the rune slice the matcher ran over, not
	// byte offsets, so the text has to be cut from the runes. Slicing str
	// directly returns the wrong text (or splits a character) as soon as the
	// input contains a multi-byte character.
	runes := []rune(str)

	matchStr := make([]string, re.numGroups+1)
	validGroupFound := false
	for i, g := range match {
		if !g.IsValid() {
			// Groups that did not participate keep the zero value "".
			continue
		}
		matchStr[i] = string(runes[g.StartIdx:g.EndIdx])
		validGroupFound = true
	}
	// Defensive: a match in which no group is valid is reported as no match.
	if !validGroupFound {
		return nil
	}
	return matchStr
}
// FindAllString is the 'all' version of [FindString].
// It returns a slice of strings containing the text of all matches of
// the regex in the given string.
func ( regex Reg ) FindAllString ( str string ) [ ] string {
zerogroups := regex . FindAll ( str )
func ( re Reg ) FindAllString ( str string ) [ ] string {
zerogroups := re . FindAll ( str )
matchStrs := funcMap ( zerogroups , func ( g Group ) string {
return str [ g . StartIdx : g . EndIdx ]
} )
@ -209,14 +169,14 @@ func (regex Reg) FindAllString(str string) []string {
// FindNthMatch returns the 'n'th match of the regex in the given string.
// It returns an error (!= nil) if there are fewer than 'n' matches in the string.
func ( re gex Reg ) FindNthMatch ( str string , n int ) ( Match , error ) {
func ( re Reg ) FindNthMatch ( str string , n int ) ( Match , error ) {
idx := 0
matchNum := 0
str_runes := [ ] rune ( str )
var matchFound bool
var matchIdx Match
for idx <= len ( str_runes ) {
matchFound , matchIdx , idx = findAllSubmatchHelper ( re gex . start , str_runes , idx , re gex . numGroups )
matchFound , matchIdx , idx = findAllSubmatchHelper ( re . start , str_runes , idx , re . numGroups , re . preferLongest )
if matchFound {
matchNum ++
}
@ -229,31 +189,65 @@ func (regex Reg) FindNthMatch(str string, n int) (Match, error) {
}
// FindAllSubmatch returns a slice of matches in the given string.
func ( re gex Reg ) FindAllSubmatch ( str string ) [ ] Match {
func ( re Reg ) FindAllSubmatch ( str string ) [ ] Match {
idx := 0
str_runes := [ ] rune ( str )
var matchFound bool
var matchIdx Match
indices := make ( [ ] Match , 0 )
for idx <= len ( str_runes ) {
matchFound , matchIdx , idx = findAllSubmatchHelper ( re gex . start , str_runes , idx , re gex . numGroups )
matchFound , matchIdx , idx = findAllSubmatchHelper ( re . start , str_runes , idx , re . numGroups , re . preferLongest )
if matchFound {
indices = append ( indices , matchIdx )
}
}
if len ( indices ) > 0 {
return pruneIndices ( indices )
}
return indices
}
// addStateToList adds state to list and returns the extended list. States that
// do not consume a character (kleene, question, alternation, satisfied
// assertions and group boundaries) are not added themselves; the function
// recurses through them at the same idx and adds whatever they lead to.
//
// threadGroups holds the capture-group indices collected on the way to state.
// visited holds the states already expanded on the current path; together with
// list it is used to avoid expanding the same state twice.
func addStateToList(str []rune, idx int, list []nfaState, state nfaState, threadGroups []Group, visited []nfaState, preferLongest bool) []nfaState {
	// Nothing to do if the state is already queued or was already expanded.
	if stateExists(list, state) || stateExists(visited, state) {
		return list
	}
	visited = append(visited, state)

	// Branching states: expand both successors. The order differs between the
	// two cases (presumably it decides which thread gets priority - confirm).
	switch {
	case state.isKleene || state.isQuestion:
		// Split branch first, then the continuation.
		copyThread(state.splitState, state)
		list = addStateToList(str, idx, list, *state.splitState, threadGroups, visited, preferLongest)
		copyThread(state.next, state)
		return addStateToList(str, idx, list, *state.next, threadGroups, visited, preferLongest)
	case state.isAlternation:
		// Continuation first, then the split branch.
		copyThread(state.next, state)
		list = addStateToList(str, idx, list, *state.next, threadGroups, visited, preferLongest)
		copyThread(state.splitState, state)
		return addStateToList(str, idx, list, *state.splitState, threadGroups, visited, preferLongest)
	}

	// From here on the state works on its own copy of the caller's group indices.
	state.threadGroups = append([]Group{}, threadGroups...)

	// A satisfied assertion is stepped over. A failed one is not followed and
	// falls through to the checks below.
	if state.assert != noneAssert && state.checkAssertion(str, idx, preferLongest) {
		copyThread(state.next, state)
		return addStateToList(str, idx, list, *state.next, state.threadGroups, visited, preferLongest)
	}

	// Group boundaries record the current index and are stepped over; any
	// other state is queued as-is.
	switch {
	case state.groupBegin:
		state.threadGroups[state.groupNum].StartIdx = idx
	case state.groupEnd:
		state.threadGroups[state.groupNum].EndIdx = idx
	default:
		return append(list, state)
	}
	return addStateToList(str, idx, list, *state.next, state.threadGroups, visited, preferLongest)
}
// Helper for FindAllMatches. Returns whether it found a match, the
// first Match it finds, and how far it got into the string ie. where
// the next search should start from.
//
// Might return duplicates or overlapping indices, so care must be taken to prune the resulting array.
func findAllSubmatchHelper ( start * nfaState , str [ ] rune , offset int , numGroups int ) ( bool , Match , int ) {
func findAllSubmatchHelper ( start * nfaState , str [ ] rune , offset int , numGroups int , preferLongest bool ) ( bool , Match , int ) {
// Base case - exit if offset exceeds string's length
if offset > len ( str ) {
// The second value here shouldn't be used, because we should exit when the third return value is > than len(str)
@ -261,214 +255,120 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
}
resetThreads ( start )
// Hold a list of match indices for the current run. When we
// can no longer find a match, the match with the largest range is
// chosen as the match for the entire string.
// This allows us to pick the longest possible match (which is how greedy matching works).
// COMMENT ABOVE IS CURRENTLY NOT UP-TO-DATE
tempIndices := newMatch ( numGroups + 1 )
foundPath := false
startIdx := offset
endIdx := offset
currentStates := make ( [ ] * nfaState , 0 )
tempStates := make ( [ ] * nfaState , 0 ) // Used to store states that should be used in next loop iteration
currentStates := make ( [ ] nfaState , 0 )
nextStates := make ( [ ] nfaState , 0 )
i := offset // Index in string
startingFrom := i // Store starting index
// If the first state is an assertion, makes sure the assertion
// is true before we do _anything_ else.
if start . assert != noneAssert {
if start . checkAssertion ( str , offset ) == false {
if start . checkAssertion ( str , offset , preferLongest ) == false {
i ++
return false , [ ] Group { } , i
}
}
// Increment until we hit a character matching the start state (assuming not 0-state)
if start . isEmpty == false {
for i < len ( str ) && ! start . contentContains ( str , i ) {
i ++
}
startIdx = i
startingFrom = i
i ++ // Advance to next character (if we aren't at a 0-state, which doesn't match anything), so that we can check for transitions. If we advance at a 0-state, we will never get a chance to match the first character
}
start . threadGroups = newMatch ( numGroups + 1 )
// Check if the start state begins a group - if so, add the start index to our list
if start . groupBegin {
start . threadGroups [ start . groupNum ] . StartIdx = i
// tempIndices[start.groupNum].startIdx = i
}
currentStates = append ( currentStates , start )
// Main loop
for i < len ( str ) {
foundPath = false
zeroStates := make ( [ ] * nfaState , 0 )
// Keep taking zero-states, until there are no more left to take
// Objective: If any of our current states have transitions to 0-states, replace them with the 0-state. Do this until there are no more transitions to 0-states, or there are no more unique 0-states to take.
zeroStates , isZero := takeZeroState ( currentStates , numGroups , i )
tempStates = append ( tempStates , zeroStates ... )
num_appended := 0
for isZero == true {
zeroStates , isZero = takeZeroState ( tempStates , numGroups , i )
tempStates , num_appended = uniqueAppend ( tempStates , zeroStates ... )
if num_appended == 0 { // Break if we haven't appended any more unique values
start . threadGroups [ 0 ] . StartIdx = i
currentStates = addStateToList ( str , i , currentStates , * start , start . threadGroups , nil , preferLongest )
var match Match = nil
for idx := i ; idx <= len ( str ) ; idx ++ {
if len ( currentStates ) == 0 {
break
}
}
for currentStateIdx := 0 ; currentStateIdx < len ( currentStates ) ; currentStateIdx ++ {
currentState := currentStates [ currentStateIdx ]
currentStates = slices . Concat ( currentStates , tempStates )
tempStates = nil
if currentState . threadGroups == nil {
currentState . threadGroups = newMatch ( numGroups + 1 )
currentState . threadGroups [ 0 ] . StartIdx = idx
}
// Take any transitions corresponding to current character
numStatesMatched := 0 // The number of states which had at least 1 match for this round
assertionFailed := false // Whether or not an assertion failed for this round
lastStateInList := false // Whether or not a last state was in our list of states
var lastStatePtr * nfaState = nil // Pointer to the last-state, if it was found
lastLookaroundInList := false // Whether or not a last state (that is a lookaround) was in our list of states
for numStatesMatched == 0 && lastStateInList == false {
if len ( currentStates ) == 0 {
if currentState . isLast {
currentState . threadGroups [ 0 ] . EndIdx = idx
match = append ( [ ] Group { } , currentState . threadGroups ... )
if ! preferLongest {
break
}
state , _ := pop ( & currentStates )
matches , numMatches := state . matchesFor ( str , i )
if numMatches > 0 {
numStatesMatched ++
tempStates = append ( [ ] * nfaState ( nil ) , matches ... )
foundPath = true
for _ , m := range matches {
if m . threadGroups == nil {
m . threadGroups = newMatch ( numGroups + 1 )
} else if ! currentState . isAlternation && ! currentState . isKleene && ! currentState . isQuestion && ! currentState . groupBegin && ! currentState . groupEnd && currentState . assert == noneAssert { // Normal character
if currentState . contentContains ( str , idx , preferLongest ) {
nextStates = addStateToList ( str , idx + 1 , nextStates , * currentState . next , currentState . threadGroups , nil , preferLongest )
}
copy ( m . threadGroups , state . threadGroups )
}
}
if numMatches < 0 {
assertionFailed = true
currentStates = append ( [ ] nfaState { } , nextStates ... )
nextStates = nil
}
if state . isLast {
if state . isLookaround ( ) {
lastLookaroundInList = true
if match != nil {
if offset == match [ 0 ] . EndIdx {
return true , match , match [ 0 ] . EndIdx + 1
}
lastStateInList = true
lastStatePtr = state
return true , match , match [ 0 ] . EndIdx
}
return false , [ ] Group { } , i + 1
}
if assertionFailed && numStatesMatched == 0 { // Nothing has matched and an assertion has failed
// If I'm being completely honest, I'm not sure why I have to check specifically for a _lookaround_
// state. The explanation below is my attempt to explain this behavior.
// If you replace 'lastLookaroundInList' with 'lastStateInList', one of the test cases fails.
// Expand appends template to dst, expanding any variables in template to the relevant capturing group.
//
// One of the states in our list was a last state and a lookaround. In this case, we
// don't abort upon failure of the assertion, because we have found
// another path to a final state.
// Even if the last state _was_ an assertion, we can use the previously
// saved indices to find a match.
if lastLookaroundInList {
break
// A variable is of the form '$n', where 'n' is a number. It will be replaced by the contents of the n-th capturing group.
// To insert a literal $, do not put a number after it. Alternatively, you can use $$.
// src is the input string, and match must be the result of [Reg.FindSubmatch].
func ( re Reg ) Expand ( dst string , template string , src string , match Match ) string {
templateRuneSlc := [ ] rune ( template )
srcRuneSlc := [ ] rune ( src )
i := 0
for i < len ( templateRuneSlc ) {
c := templateRuneSlc [ i ]
if c == '$' {
i += 1
// The dollar sign is the last character of the string, or it is followed by another dollar sign
if i >= len ( templateRuneSlc ) || templateRuneSlc [ i ] == '$' {
dst += "$"
i ++
} else {
if i == startingFrom {
numStr := ""
for unicode . IsDigit ( templateRuneSlc [ i ] ) {
numStr += string ( templateRuneSlc [ i ] )
i ++
}
return false , [ ] Group { } , i
}
}
// Check if we can find a state in our list that is:
// a. A last-state
// b. Empty
// c. Doesn't assert anything
for _ , s := range currentStates {
if s . isLast && s . isEmpty && s . assert == noneAssert {
lastStatePtr = s
lastStateInList = true
}
}
if lastStateInList && numStatesMatched == 0 { // A last-state was in the list of states. add the matchIndex to our MatchIndex list
for j := 1 ; j < numGroups + 1 ; j ++ {
tempIndices [ j ] = lastStatePtr . threadGroups [ j ]
}
endIdx = i
tempIndices [ 0 ] = Group { startIdx , endIdx }
if tempIndices [ 0 ] . StartIdx == tempIndices [ 0 ] . EndIdx {
return true , tempIndices , tempIndices [ 0 ] . EndIdx + 1
if numStr == "" {
dst += "$"
} else {
return true , tempIndices , tempIndices [ 0 ] . EndIdx
}
}
// Check if we can find a zero-length match
if foundPath == false {
if ok := zeroMatchPossible ( str , i , numGroups , currentStates ... ) ; ok {
if tempIndices [ 0 ] . IsValid ( ) == false {
tempIndices [ 0 ] = Group { startIdx , startIdx }
}
}
// If we haven't moved in the string, increment the counter by 1
// to ensure we don't keep trying the same string over and over.
// if i == startingFrom {
startIdx ++
// i++
// }
if tempIndices . numValidGroups ( ) > 0 && tempIndices [ 0 ] . IsValid ( ) {
if tempIndices [ 0 ] . StartIdx == tempIndices [ 0 ] . EndIdx { // If we have a zero-length match, we have to shift the index at which we start. Otherwise we keep looking at the same paert of the string over and over.
return true , tempIndices , tempIndices [ 0 ] . EndIdx + 1
num , _ := strconv . Atoi ( numStr )
if num < len ( match ) {
dst += string ( srcRuneSlc [ match [ num ] . StartIdx : match [ num ] . EndIdx ] )
} else {
return true , tempIndices , tempIndices [ 0 ] . EndIdx
dst += "$" + numStr
}
}
return false , [ ] Group { } , startIdx
}
currentStates = make ( [ ] * nfaState , len ( tempStates ) )
copy ( currentStates , tempStates )
tempStates = nil
} else {
dst += string ( c )
i ++
}
// End-of-string reached. Go to any 0-states, until there are no more 0-states to go to. Then check if any of our states are in the end position.
// This is the exact same algorithm used inside the loop, so I should probably put it in a function.
zeroStates , isZero := takeZeroState ( currentStates , numGroups , i )
tempStates = append ( tempStates , zeroStates ... )
num_appended := 0 // Number of unique states addded to tempStates
for isZero == true {
zeroStates , isZero = takeZeroState ( tempStates , numGroups , i )
tempStates , num_appended = uniqueAppend ( tempStates , zeroStates ... )
if num_appended == 0 { // Break if we haven't appended any more unique values
break
}
return dst
}
currentStates = append ( currentStates , tempStates ... )
tempStates = nil
for _ , state := range currentStates {
// Only add the match if the start index is in bounds. If the state has an assertion,
// make sure the assertion checks out.
if state . isLast && i <= len ( str ) {
if state . assert == noneAssert || state . checkAssertion ( str , i ) {
for j := 1 ; j < numGroups + 1 ; j ++ {
tempIndices [ j ] = state . threadGroups [ j ]
// LiteralPrefix returns a string that must begin any match of the given regular expression.
// The second return value is true if the string comprises the entire expression.
func ( re Reg ) LiteralPrefix ( ) ( prefix string , complete bool ) {
state := re . start
if state . assert != noneAssert {
state = state . next
}
endIdx = i
tempIndices [ 0 ] = Group { startIdx , endIdx }
for ! ( state . isLast ) && ( ! state . isAlternation ) && len ( state . content ) == 1 && state . assert == noneAssert {
if state . groupBegin || state . groupEnd {
state = state . next
continue
}
prefix += string ( rune ( state . content [ 0 ] ) )
state = state . next
}
}
if tempIndices . numValidGroups ( ) > 0 {
if tempIndices [ 0 ] . StartIdx == tempIndices [ 0 ] . EndIdx { // If we have a zero-length match, we have to shift the index at which we start. Otherwise we keep looking at the same paert of the string over and over.
return true , tempIndices , tempIndices [ 0 ] . EndIdx + 1
if state . isLast {
complete = true
} else {
return true , tempIndices , tempIndices [ 0 ] . EndIdx
}
}
if startIdx == startingFrom { // Increment starting index if we haven't moved in the string. Prevents us from matching the same part of the string over and over.
startIdx ++
complete = false
}
return false , [ ] Group { } , startIdx
return prefix , complete
}