5 Commits

5 changed files with 80 additions and 8 deletions

View File

@@ -137,7 +137,7 @@ func main() {
fmt.Fprintf(out, "Line %d:\n", lineNum) fmt.Fprintf(out, "Line %d:\n", lineNum)
} }
for _, m := range matchIndices { for _, m := range matchIndices {
fmt.Fprintf(out, "%s\n", m.ToString()) fmt.Fprintf(out, "%s\n", m.String())
} }
err := out.Flush() err := out.Flush()
if err != nil { if err != nil {

View File

@@ -18,6 +18,12 @@ type Reg struct {
numGroups int numGroups int
} }
// NumSubexp returns the number of sub-expressions in the given [Reg]. This is equivalent
// to the number of capturing groups.
func (r Reg) NumSubexp() int {
return r.numGroups
}
const concatRune rune = 0xF0001 const concatRune rune = 0xF0001
// Flags for shuntingYard - control its behavior // Flags for shuntingYard - control its behavior

View File

@@ -121,5 +121,36 @@ this engine will _always_ go for the longest possible match, regardless of the o
My engine does not support byte-slices. When a matching function receives a string, it converts it into a My engine does not support byte-slices. When a matching function receives a string, it converts it into a
rune-slice to iterate through it. While this has some space overhead, the convenience of built-in unicode rune-slice to iterate through it. While this has some space overhead, the convenience of built-in unicode
support made the tradeoff worth it. support made the tradeoff worth it.
3. Return values
Rather than using primitives for return values, my engine defines two types that are used as return
values: a [Group] represents a capturing group, and a [Match] represents a list of groups.
The documentation for [regexp] gives a regular expression that describes the names of all the matching
functions it supports. The equivalent expression for this engine is:
Find(All)?(String)?(Submatch)?
[Reg.Find] returns the index of the leftmost match in the string.
If a function contains 'All' it returns all matches instead of just the leftmost one.
If a function contains 'String' it returns the matched text, rather than the indices.
If a function contains 'Submatch' it returns the match, including all submatches found by
capturing groups.
The term '0-group' is used to refer to the 0th capturing group of a match (which is the entire match).
Given the following regex:
x(y)
and the input string:
xyz
The 0th group would contain 'xy' and the 1st group would contain 'y'. Any matching function without 'Submatch' in its name
returns the 0-group.
*/ */
package regex package regex

View File

@@ -35,10 +35,10 @@ func (m Match) numValidGroups() int {
} }
// Returns a string containing the indices of all (valid) groups in the match // Returns a string containing the indices of all (valid) groups in the match
func (m Match) ToString() string { func (m Match) String() string {
var toRet string var toRet string
for i, g := range m { for i, g := range m {
if g.isValid() { if g.IsValid() {
toRet += fmt.Sprintf("Group %d\n", i) toRet += fmt.Sprintf("Group %d\n", i)
toRet += g.toString() toRet += g.toString()
toRet += "\n" toRet += "\n"
@@ -52,8 +52,9 @@ func (idx Group) toString() string {
return fmt.Sprintf("%d\t%d", idx.StartIdx, idx.EndIdx) return fmt.Sprintf("%d\t%d", idx.StartIdx, idx.EndIdx)
} }
// Returns whether a group contains valid indices // Returns whether a group is valid (ie. whether it matched any text). It
func (g Group) isValid() bool { // simply ensures that both indices of the group are >= 0.
func (g Group) IsValid() bool {
return g.StartIdx >= 0 && g.EndIdx >= 0 return g.StartIdx >= 0 && g.EndIdx >= 0
} }
@@ -174,6 +175,20 @@ func (regex Reg) FindString(str string) string {
return str[zeroGroup.StartIdx:zeroGroup.EndIdx] return str[zeroGroup.StartIdx:zeroGroup.EndIdx]
} }
// FindSubmatch returns the leftmost match of the regex in the given string, including
// the submatches matched by capturing groups. The returned [Match] will always contain the same
// number of groups. The validity of a group (whether or not it matched anything) can be determined with
// [Group.IsValid], or by checking that both indices of the group are >= 0.
//
// The returned error is nil if a match was found. If the underlying call to
// [Reg.FindNthMatch] reports an error, FindSubmatch returns an empty [Match]
// and a non-nil "no match found" error.
func (regex Reg) FindSubmatch(str string) (Match, error) {
	// The leftmost match is the first (n = 1) match.
	match, err := regex.FindNthMatch(str, 1)
	if err != nil {
		// Any error from FindNthMatch is reported uniformly as "no match found".
		return Match{}, fmt.Errorf("no match found")
	}
	return match, nil
}
// FindAllString is the 'all' version of FindString. // FindAllString is the 'all' version of FindString.
// It returns a slice of strings containing the text of all matches of // It returns a slice of strings containing the text of all matches of
// the regex in the given string. // the regex in the given string.
@@ -372,7 +387,7 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
// Check if we can find a zero-length match // Check if we can find a zero-length match
if foundPath == false { if foundPath == false {
if ok := zeroMatchPossible(str, i, numGroups, currentStates...); ok { if ok := zeroMatchPossible(str, i, numGroups, currentStates...); ok {
if tempIndices[0].isValid() == false { if tempIndices[0].IsValid() == false {
tempIndices[0] = Group{startIdx, startIdx} tempIndices[0] = Group{startIdx, startIdx}
} }
} }
@@ -382,7 +397,7 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
startIdx++ startIdx++
// i++ // i++
// } // }
if tempIndices.numValidGroups() > 0 && tempIndices[0].isValid() { if tempIndices.numValidGroups() > 0 && tempIndices[0].IsValid() {
if tempIndices[0].StartIdx == tempIndices[0].EndIdx { // If we have a zero-length match, we have to shift the index at which we start. Otherwise we keep looking at the same paert of the string over and over. if tempIndices[0].StartIdx == tempIndices[0].EndIdx { // If we have a zero-length match, we have to shift the index at which we start. Otherwise we keep looking at the same paert of the string over and over.
return true, tempIndices, tempIndices[0].EndIdx + 1 return true, tempIndices, tempIndices[0].EndIdx + 1
} else { } else {

View File

@@ -767,6 +767,26 @@ func TestFindAllString(t *testing.T) {
} }
} }
// TestFindSubmatch runs FindSubmatch against every entry in groupTests and
// compares each valid group of the returned match with the corresponding
// group of the first expected match (test.result[0]).
func TestFindSubmatch(t *testing.T) {
	for _, test := range groupTests {
		t.Run(test.re+" "+test.str, func(t *testing.T) {
			regComp, err := Compile(test.re, test.flags...)
			if err != nil {
				// A compile error is only tolerated when the test carries no
				// expected result (presumably marking an intentionally invalid
				// pattern - confirm against groupTests).
				if test.result != nil {
					t.Fatalf("compiling %q: %v", test.re, err)
				}
				// Nothing to match with: the compiled regex is unusable.
				return
			}
			match, err := regComp.FindSubmatch(test.str)
			if err != nil {
				// No match was found; that is a failure only if one was expected.
				if len(test.result) > 0 {
					t.Errorf("Wanted %v Got error: %v\n", test.result[0], err)
				}
				return
			}
			if len(test.result) == 0 {
				t.Fatalf("Wanted no match Got %v\n", match)
			}
			want := test.result[0]
			for i := range match {
				// Groups that did not participate in the match are not compared.
				if !match[i].IsValid() {
					continue
				}
				// Guard the index so a mismatched group count is reported as a
				// failure instead of an out-of-range panic.
				if i >= len(want) || want[i] != match[i] {
					t.Errorf("Wanted %v Got %v\n", want, match)
				}
			}
		})
	}
}
func TestFindAllSubmatch(t *testing.T) { func TestFindAllSubmatch(t *testing.T) {
for _, test := range groupTests { for _, test := range groupTests {
t.Run(test.re+" "+test.str, func(t *testing.T) { t.Run(test.re+" "+test.str, func(t *testing.T) {
@@ -779,7 +799,7 @@ func TestFindAllSubmatch(t *testing.T) {
matchIndices := regComp.FindAllSubmatch(test.str) matchIndices := regComp.FindAllSubmatch(test.str)
for i := range matchIndices { for i := range matchIndices {
for j := range matchIndices[i] { for j := range matchIndices[i] {
if matchIndices[i][j].isValid() { if matchIndices[i][j].IsValid() {
if test.result[i][j] != matchIndices[i][j] { if test.result[i][j] != matchIndices[i][j] {
t.Errorf("Wanted %v Got %v\n", test.result, matchIndices) t.Errorf("Wanted %v Got %v\n", test.result, matchIndices)
} }