Compare commits
5 Commits
ac936659b6
...
2a9ae0b68a
| Author | SHA1 | Date | |
|---|---|---|---|
| 2a9ae0b68a | |||
| 783ae2ad10 | |||
| b5e6bc112c | |||
| 206fea34cd | |||
| fcdb23524a |
@@ -137,7 +137,7 @@ func main() {
|
|||||||
fmt.Fprintf(out, "Line %d:\n", lineNum)
|
fmt.Fprintf(out, "Line %d:\n", lineNum)
|
||||||
}
|
}
|
||||||
for _, m := range matchIndices {
|
for _, m := range matchIndices {
|
||||||
fmt.Fprintf(out, "%s\n", m.ToString())
|
fmt.Fprintf(out, "%s\n", m.String())
|
||||||
}
|
}
|
||||||
err := out.Flush()
|
err := out.Flush()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|||||||
@@ -18,6 +18,12 @@ type Reg struct {
|
|||||||
numGroups int
|
numGroups int
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// NumSubexp returns the number of sub-expressions in the given [Reg]. This is equivalent
|
||||||
|
// to the number of capturing groups.
|
||||||
|
func (r Reg) NumSubexp() int {
|
||||||
|
return r.numGroups
|
||||||
|
}
|
||||||
|
|
||||||
const concatRune rune = 0xF0001
|
const concatRune rune = 0xF0001
|
||||||
|
|
||||||
// Flags for shuntingYard - control its behavior
|
// Flags for shuntingYard - control its behavior
|
||||||
|
|||||||
31
regex/doc.go
31
regex/doc.go
@@ -121,5 +121,36 @@ this engine will _always_ go for the longest possible match, regardless of the o
|
|||||||
My engine does not support byte-slices. When a matching function receives a string, it converts it into a
|
My engine does not support byte-slices. When a matching function receives a string, it converts it into a
|
||||||
rune-slice to iterate through it. While this has some space overhead, the convenience of built-in unicode
|
rune-slice to iterate through it. While this has some space overhead, the convenience of built-in unicode
|
||||||
support made the tradeoff worth it.
|
support made the tradeoff worth it.
|
||||||
|
|
||||||
|
3. Return values
|
||||||
|
|
||||||
|
Rather than using primitives for return values, my engine defines two types that are used as return
|
||||||
|
values: a [Group] represents a capturing group, and a [Match] represents a list of groups.
|
||||||
|
|
||||||
|
[regexp] specifies a regular expression that gives a list of all the matching functions that it supports. The
|
||||||
|
equivalent expression for this engine is:
|
||||||
|
|
||||||
|
Find(All)?(String)?(Submatch)?
|
||||||
|
|
||||||
|
[Reg.Find] returns the index of the leftmost match in the string.
|
||||||
|
|
||||||
|
If a function contains 'All' it returns all matches instead of just the leftmost one.
|
||||||
|
|
||||||
|
If a function contains 'String' it returns the matched text, rather than the indices.
|
||||||
|
|
||||||
|
If a function contains 'Submatch' it returns the match, including all submatches found by
|
||||||
|
capturing groups.
|
||||||
|
|
||||||
|
The term '0-group' is used to refer to the 0th capturing group of a match (which is the entire match).
|
||||||
|
Given the following regex:
|
||||||
|
|
||||||
|
x(y)
|
||||||
|
|
||||||
|
and the input string:
|
||||||
|
|
||||||
|
xyz
|
||||||
|
|
||||||
|
The 0th group would contain 'xy' and the 1st group would contain 'y'. Any matching function without 'Submatch' in its name
|
||||||
|
returns the 0-group.
|
||||||
*/
|
*/
|
||||||
package regex
|
package regex
|
||||||
|
|||||||
@@ -35,10 +35,10 @@ func (m Match) numValidGroups() int {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Returns a string containing the indices of all (valid) groups in the match
|
// Returns a string containing the indices of all (valid) groups in the match
|
||||||
func (m Match) ToString() string {
|
func (m Match) String() string {
|
||||||
var toRet string
|
var toRet string
|
||||||
for i, g := range m {
|
for i, g := range m {
|
||||||
if g.isValid() {
|
if g.IsValid() {
|
||||||
toRet += fmt.Sprintf("Group %d\n", i)
|
toRet += fmt.Sprintf("Group %d\n", i)
|
||||||
toRet += g.toString()
|
toRet += g.toString()
|
||||||
toRet += "\n"
|
toRet += "\n"
|
||||||
@@ -52,8 +52,9 @@ func (idx Group) toString() string {
|
|||||||
return fmt.Sprintf("%d\t%d", idx.StartIdx, idx.EndIdx)
|
return fmt.Sprintf("%d\t%d", idx.StartIdx, idx.EndIdx)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Returns whether a group contains valid indices
|
// IsValid returns whether a group is valid (i.e. whether it matched any text). It
|
||||||
func (g Group) isValid() bool {
|
// simply ensures that both indices of the group are >= 0.
|
||||||
|
func (g Group) IsValid() bool {
|
||||||
return g.StartIdx >= 0 && g.EndIdx >= 0
|
return g.StartIdx >= 0 && g.EndIdx >= 0
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -174,6 +175,20 @@ func (regex Reg) FindString(str string) string {
|
|||||||
return str[zeroGroup.StartIdx:zeroGroup.EndIdx]
|
return str[zeroGroup.StartIdx:zeroGroup.EndIdx]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// FindSubmatch returns the leftmost match of the regex in the given string, including
|
||||||
|
// the submatches matched by capturing groups. The returned [Match] will always contain the same
|
||||||
|
// number of groups. The validity of a group (whether or not it matched anything) can be determined with
|
||||||
|
// [Group.IsValid], or by checking that both indices of the group are >= 0.
|
||||||
|
// The returned error is non-nil if no match was found, and nil otherwise.
|
||||||
|
func (regex Reg) FindSubmatch(str string) (Match, error) {
|
||||||
|
match, err := regex.FindNthMatch(str, 1)
|
||||||
|
if err != nil {
|
||||||
|
return Match{}, fmt.Errorf("no match found")
|
||||||
|
} else {
|
||||||
|
return match, nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// FindAllString is the 'all' version of FindString.
|
// FindAllString is the 'all' version of FindString.
|
||||||
// It returns a slice of strings containing the text of all matches of
|
// It returns a slice of strings containing the text of all matches of
|
||||||
// the regex in the given string.
|
// the regex in the given string.
|
||||||
@@ -372,7 +387,7 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
|
|||||||
// Check if we can find a zero-length match
|
// Check if we can find a zero-length match
|
||||||
if foundPath == false {
|
if foundPath == false {
|
||||||
if ok := zeroMatchPossible(str, i, numGroups, currentStates...); ok {
|
if ok := zeroMatchPossible(str, i, numGroups, currentStates...); ok {
|
||||||
if tempIndices[0].isValid() == false {
|
if tempIndices[0].IsValid() == false {
|
||||||
tempIndices[0] = Group{startIdx, startIdx}
|
tempIndices[0] = Group{startIdx, startIdx}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -382,7 +397,7 @@ func findAllSubmatchHelper(start *nfaState, str []rune, offset int, numGroups in
|
|||||||
startIdx++
|
startIdx++
|
||||||
// i++
|
// i++
|
||||||
// }
|
// }
|
||||||
if tempIndices.numValidGroups() > 0 && tempIndices[0].isValid() {
|
if tempIndices.numValidGroups() > 0 && tempIndices[0].IsValid() {
|
||||||
if tempIndices[0].StartIdx == tempIndices[0].EndIdx { // If we have a zero-length match, we have to shift the index at which we start. Otherwise we keep looking at the same part of the string over and over.
|
if tempIndices[0].StartIdx == tempIndices[0].EndIdx { // If we have a zero-length match, we have to shift the index at which we start. Otherwise we keep looking at the same part of the string over and over.
|
||||||
return true, tempIndices, tempIndices[0].EndIdx + 1
|
return true, tempIndices, tempIndices[0].EndIdx + 1
|
||||||
} else {
|
} else {
|
||||||
|
|||||||
@@ -767,6 +767,26 @@ func TestFindAllString(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestFindSubmatch(t *testing.T) {
|
||||||
|
for _, test := range groupTests {
|
||||||
|
t.Run(test.re+" "+test.str, func(t *testing.T) {
|
||||||
|
regComp, err := Compile(test.re, test.flags...)
|
||||||
|
if err != nil {
|
||||||
|
if test.result != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
match, err := regComp.FindSubmatch(test.str)
|
||||||
|
for i := range match {
|
||||||
|
if match[i].IsValid() {
|
||||||
|
if test.result[0][i] != match[i] {
|
||||||
|
t.Errorf("Wanted %v Got %v\n", test.result[0], match)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
func TestFindAllSubmatch(t *testing.T) {
|
func TestFindAllSubmatch(t *testing.T) {
|
||||||
for _, test := range groupTests {
|
for _, test := range groupTests {
|
||||||
t.Run(test.re+" "+test.str, func(t *testing.T) {
|
t.Run(test.re+" "+test.str, func(t *testing.T) {
|
||||||
@@ -779,7 +799,7 @@ func TestFindAllSubmatch(t *testing.T) {
|
|||||||
matchIndices := regComp.FindAllSubmatch(test.str)
|
matchIndices := regComp.FindAllSubmatch(test.str)
|
||||||
for i := range matchIndices {
|
for i := range matchIndices {
|
||||||
for j := range matchIndices[i] {
|
for j := range matchIndices[i] {
|
||||||
if matchIndices[i][j].isValid() {
|
if matchIndices[i][j].IsValid() {
|
||||||
if test.result[i][j] != matchIndices[i][j] {
|
if test.result[i][j] != matchIndices[i][j] {
|
||||||
t.Errorf("Wanted %v Got %v\n", test.result, matchIndices)
|
t.Errorf("Wanted %v Got %v\n", test.result, matchIndices)
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user