5 Commits

5 changed files with 80 additions and 8 deletions

View File

@@ -137,7 +137,7 @@ func main() {
fmt.Fprintf(out, "Line %d:\n", lineNum)
}
for _, m := range matchIndices {
fmt.Fprintf(out, "%s\n", m.ToString())
fmt.Fprintf(out, "%s\n", m.String())
}
err := out.Flush()
if err != nil {

View File

@@ -18,6 +18,12 @@ type Reg struct {
numGroups int
}
// numSubexp eturns the number of sub-expressions in the given [Reg]. This is equivalent
// to the number of capturing groups.
func (r Reg) NumSubexp() int {
return r.numGroups
}
const concatRune rune = 0xF0001
// Flags for shuntingYard - control its behavior

View File

@@ -121,5 +121,36 @@ this engine will _always_ go for the longest possible match, regardless of the o
My engine does not support byte-slices. When a matching function receives a string, it converts it into a
rune-slice to iterate through it. While this has some space overhead, the convenience of built-in unicode
support made the tradeoff worth it.
3. Return values
Rather than using primitives for return values, my engine defines two types that are used as return
values: a [Group] represents a capturing group, and a [Match] represents a list of groups.
[regexp] specifies a regular expression that gives a list of all the matching functions that it supports. The
equivalent expression for this engine is:
Find(All)?(String)?(Submatch)?
[Reg.Find] returns the index of the leftmost match in the string.
If a function contains 'All' it returns all matches instead of just the leftmost one.
If a function contains 'String' it returns the matched text, rather than the indices.
If a function contains 'Submatch' it returns the match, including all submatches found by
capturing groups.
The term '0-group' is used to refer to the 0th capturing group of a match (which is the entire match).
Given the following regex:
x(y)
and the input string:
xyz
The 0th group would contain 'xy' and the 1st group would contain 'y'. Any matching function without 'Submatch' in its name
returns the 0-group.
*/
package regex

View File

@@ -35,10 +35,10 @@ func (m Match) numValidGroups() int {
}
// Returns a string containing the indices of all (valid) groups in the match
func (m Match) ToString() string {
func (m Match) String() string {
var toRet string
for i, g := range m {
if g.isValid() {
if g.IsValid() {
toRet += fmt.Sprintf("Group %d\n", i)
toRet += g.toString()
toRet += "\n"
@@ -52,8 +52,9 @@ func (idx Group) toString() string {
return fmt.Sprintf("%d\t%d", idx.StartIdx, idx.EndIdx)
}
// Returns whether a group contains valid indices
func (g Group) isValid() bool {
// Returns whether a group is valid (ie. whether it matched any text). It
// simply ensures that both indices of the group are >= 0.
func (g Group) IsValid() bool {
return g.StartIdx >= 0 && g.EndIdx >= 0
}
@@ -174,6 +175,20 @@ func (regex Reg) FindString(str string) string {
return str[zeroGroup.StartIdx:zeroGroup.EndIdx]
}
// FindSubmatch returns the leftmost match of the regex in the given string, including
// the submatches matched by capturing groups. The returned [Match] will always contain the same
// number of groups. The validity of a group (whether or not it matched anything) can be determined with
// [Group.IsValid], or by checking that both indices of the group are >= 0.
// The second-return value is nil if no match was found.
func (regex Reg) FindSubmatch(str string) (Match, error) {
match, err := regex.FindNthMatch(str, 1)
if err != nil {
return Match{}, fmt.Errorf("no match found")
} else {
return match, nil
}
}
// FindAllString is the 'all' version of FindString.
// It returns a slice of strings containing the text of all matches of
// the regex in the given string.
@@ -372,7 +387,7 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
// Check if we can find a zero-length match
if foundPath == false {
if ok := zeroMatchPossible(str, i, numGroups, currentStates...); ok {
if tempIndices[0].isValid() == false {
if tempIndices[0].IsValid() == false {
tempIndices[0] = Group{startIdx, startIdx}
}
}
@@ -382,7 +397,7 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
startIdx++
// i++
// }
if tempIndices.numValidGroups() > 0 && tempIndices[0].isValid() {
if tempIndices.numValidGroups() > 0 && tempIndices[0].IsValid() {
if tempIndices[0].StartIdx == tempIndices[0].EndIdx { // If we have a zero-length match, we have to shift the index at which we start. Otherwise we keep looking at the same paert of the string over and over.
return true, tempIndices, tempIndices[0].EndIdx + 1
} else {

View File

@@ -767,6 +767,26 @@ func TestFindAllString(t *testing.T) {
}
}
func TestFindSubmatch(t *testing.T) {
for _, test := range groupTests {
t.Run(test.re+" "+test.str, func(t *testing.T) {
regComp, err := Compile(test.re, test.flags...)
if err != nil {
if test.result != nil {
panic(err)
}
}
match, err := regComp.FindSubmatch(test.str)
for i := range match {
if match[i].IsValid() {
if test.result[0][i] != match[i] {
t.Errorf("Wanted %v Got %v\n", test.result[0], match)
}
}
}
})
}
}
func TestFindAllSubmatch(t *testing.T) {
for _, test := range groupTests {
t.Run(test.re+" "+test.str, func(t *testing.T) {
@@ -779,7 +799,7 @@ func TestFindAllSubmatch(t *testing.T) {
matchIndices := regComp.FindAllSubmatch(test.str)
for i := range matchIndices {
for j := range matchIndices[i] {
if matchIndices[i][j].isValid() {
if matchIndices[i][j].IsValid() {
if test.result[i][j] != matchIndices[i][j] {
t.Errorf("Wanted %v Got %v\n", test.result, matchIndices)
}