4 Commits

3 changed files with 70 additions and 22 deletions

View File

@@ -112,23 +112,30 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
// Convert the string to a slice of runes to allow iteration through it // Convert the string to a slice of runes to allow iteration through it
re_runes_orig := []rune(re) // This is the rune slice before the first parsing loop (which detects and replaces numeric ranges) re_runes_orig := []rune(re) // This is the rune slice before the first parsing loop (which detects and replaces numeric ranges)
re_runes := make([]rune, 0) re_runes := make([]rune, 0)
// Check for numeric range. If we are at the start of a numeric range, // The following checks are performed here:
// skip to end and construct the equivalent regex for the range. // 1. Check for numeric range. If we are at the start of a numeric range,
// The reason this is outside the loop below, is that it actually modifies // skip to end and construct the equivalent regex for the range.
// the given regex (we 'cut' the numeric range and 'paste' an equivalent regex). // The reason this is outside the loop below, is that it actually modifies
// It also makes the overall parsing easier, since I don't have to worry about the numeric range // the given regex (we 'cut' the numeric range and 'paste' an equivalent regex).
// anymore. // It also makes the overall parsing easier, since I don't have to worry about the numeric range
// Eventually, I might be able to add it into the main parsing loop, to reduce the time // anymore.
// complexity. // Eventually, I might be able to add it into the main parsing loop, to reduce the time
// A numeric range has the syntax: <num1-num2>. It matches all numbers in this range. // complexity.
// A numeric range has the syntax: <num1-num2>. It matches all numbers in this range.
// //
// Also check for non-capturing groups. The LPAREN of a non-capturing group looks like this: '(?:' // 2. Check for non-capturing groups. The LPAREN of a non-capturing group looks like this: '(?:'
// I take this out, and put in a special character - NONCAPLPAREN_CHAR. // I take this out, and put in a special character - NONCAPLPAREN_CHAR.
// //
// Another check is made for unescaped brackets - opening brackets are replaced with LBRACKET and closing brackets are replaced with RBRACKET. // 3. Another check is made for unescaped brackets - opening brackets are replaced with
// Finally, check for escaped backslashes. Replace these with the BACKSLASH metacharacter. Later, in thompson(), // LBRACKET and closing brackets are replaced with RBRACKET.
// these will be converted back. This avoids confusion in detecting whether a character is escaped eg. detecting //
// whether '\\[a]' has an escaped opening bracket (it doesn't). // 4. Check for escaped backslashes. Replace these with the BACKSLASH
// metacharacter. Later, in thompson(), these will be converted back. This avoids
// confusion in detecting whether a character is escaped eg. detecting
// whether '\\[a]' has an escaped opening bracket (it doesn't).
//
// 5. Check for non-greedy operators. These are not supported at the moment, so an error
// must be thrown if the user attempts to use a non-greedy operator.
for i := 0; i < len(re_runes_orig); i++ { for i := 0; i < len(re_runes_orig); i++ {
c := re_runes_orig[i] c := re_runes_orig[i]
if c == '<' && (i == 0 || (re_runes_orig[i-1] != '\\' && re_runes_orig[i-1] != '?')) { if c == '<' && (i == 0 || (re_runes_orig[i-1] != '\\' && re_runes_orig[i-1] != '?')) {
@@ -172,6 +179,8 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
} else if c == ']' && (i == 0 || re_runes[len(re_runes)-1] != '\\') { } else if c == ']' && (i == 0 || re_runes[len(re_runes)-1] != '\\') {
re_runes = append(re_runes, RBRACKET) re_runes = append(re_runes, RBRACKET)
continue continue
} else if slices.Contains([]rune{'+', '*', '?'}, c) && (i < len(re_runes_orig)-1 && re_runes_orig[i+1] == '?') {
return nil, fmt.Errorf("non-greedy operators are not supported")
} else { } else {
re_runes = append(re_runes, c) re_runes = append(re_runes, c)
} }
@@ -1001,15 +1010,24 @@ func thompson(re []postfixNode) (Reg, error) {
if err != nil { if err != nil {
return Reg{}, fmt.Errorf("error applying kleene star") return Reg{}, fmt.Errorf("error applying kleene star")
} }
stateToAdd := kleene(*s1) stateToAdd, err := kleene(*s1)
if err != nil {
return Reg{}, err
}
nfa = append(nfa, stateToAdd) nfa = append(nfa, stateToAdd)
case PLUS: // a+ is equivalent to aa* case PLUS: // a+ is equivalent to aa*
s1 := mustPop(&nfa) s1 := mustPop(&nfa)
s2 := kleene(*s1) s2, err := kleene(*s1)
if err != nil {
return Reg{}, err
}
s1 = concatenate(s1, s2) s1 = concatenate(s1, s2)
nfa = append(nfa, s1) nfa = append(nfa, s1)
case QUESTION: // ab? is equivalent to a(b|) case QUESTION: // ab? is equivalent to a(b|)
s1 := mustPop(&nfa) s1, err := pop(&nfa)
if err != nil {
return Reg{}, fmt.Errorf("error applying question operator")
}
s2 := question(s1) s2 := question(s1)
nfa = append(nfa, s2) nfa = append(nfa, s2)
case PIPE: case PIPE:
@@ -1062,7 +1080,10 @@ func thompson(re []postfixNode) (Reg, error) {
stateToAdd = concatenate(stateToAdd, cloneState(state)) stateToAdd = concatenate(stateToAdd, cloneState(state))
} }
if c.endReps == INFINITE_REPS { // Case 3 if c.endReps == INFINITE_REPS { // Case 3
s2 := kleene(*state) s2, err := kleene(*state)
if err != nil {
return Reg{}, err
}
stateToAdd = concatenate(stateToAdd, s2) stateToAdd = concatenate(stateToAdd, s2)
} else { // Case 2 } else { // Case 2
for i := c.startReps; i < c.endReps; i++ { for i := c.startReps; i < c.endReps; i++ {

9
nfa.go
View File

@@ -1,6 +1,7 @@
package main package main
import ( import (
"fmt"
"slices" "slices"
) )
@@ -268,7 +269,11 @@ func concatenate(s1 *State, s2 *State) *State {
return s1 return s1
} }
func kleene(s1 State) *State { func kleene(s1 State) (*State, error) {
if s1.isEmpty && s1.assert != NONE {
return nil, fmt.Errorf("previous token is not quantifiable")
}
toReturn := &State{} toReturn := &State{}
toReturn.transitions = make(map[int][]*State) toReturn.transitions = make(map[int][]*State)
toReturn.content = newContents(EPSILON) toReturn.content = newContents(EPSILON)
@@ -283,7 +288,7 @@ func kleene(s1 State) *State {
for _, c := range s1.content { for _, c := range s1.content {
toReturn.transitions[c], _ = unique_append(toReturn.transitions[c], &s1) toReturn.transitions[c], _ = unique_append(toReturn.transitions[c], &s1)
} }
return toReturn return toReturn, nil
} }
func alternate(s1 *State, s2 *State) *State { func alternate(s1 *State, s2 *State) *State {

View File

@@ -465,6 +465,19 @@ var reTests = []struct {
{`[\t][\n][\v][\r][\f][\b]`, nil, "\t\n\v\r\f\b", []Group{{0, 6}}}, {`[\t][\n][\v][\r][\f][\b]`, nil, "\t\n\v\r\f\b", []Group{{0, 6}}},
{`.*d`, nil, "abc\nabd", []Group{{4, 7}}}, {`.*d`, nil, "abc\nabd", []Group{{4, 7}}},
{`(`, nil, "-", nil}, {`(`, nil, "-", nil},
{`[\41]`, nil, `!`, []Group{{0, 1}}},
{`(?<!abc)(d.f)`, nil, `abcdefdof`, []Group{{6, 9}}},
{`[\w-]+`, nil, `laser_beam`, []Group{{0, 10}}},
{`M+`, []ReFlag{RE_CASE_INSENSITIVE}, `MMM`, []Group{{0, 3}}},
{`m+`, []ReFlag{RE_CASE_INSENSITIVE}, `MMM`, []Group{{0, 3}}},
{`[M]+`, []ReFlag{RE_CASE_INSENSITIVE}, `MMM`, []Group{{0, 3}}},
{`[m]+`, []ReFlag{RE_CASE_INSENSITIVE}, `MMM`, []Group{{0, 3}}},
{`^*`, nil, `-`, nil},
{`a[^>]*b`, nil, `a>b`, []Group{}},
{`^a*$`, nil, `foo`, []Group{}},
{`*?`, nil, `-`, nil},
{`a*?`, nil, `-`, nil}, // non-greedy operators are not supported
// Todo - add numeric range tests // Todo - add numeric range tests
} }
@@ -604,7 +617,7 @@ var groupTests = []struct {
{`a(?:b|c|d)(.)`, nil, `ace`, []Match{[]Group{{0, 3}, {2, 3}}}}, {`a(?:b|c|d)(.)`, nil, `ace`, []Match{[]Group{{0, 3}, {2, 3}}}},
{`a(?:b|c|d)*(.)`, nil, `ace`, []Match{[]Group{{0, 3}, {2, 3}}}}, {`a(?:b|c|d)*(.)`, nil, `ace`, []Match{[]Group{{0, 3}, {2, 3}}}},
{`a(?:b|c|d)+(.)`, nil, `ace`, []Match{[]Group{{0, 3}, {2, 3}}}}, {`a(?:b|c|d)+(.)`, nil, `ace`, []Match{[]Group{{0, 3}, {2, 3}}}},
{`a(?:b|(c|e){1,2}?|d)+?(.)`, nil, `ace`, []Match{[]Group{{0, 3}, {1, 2}, {2, 3}}}}, {`a(?:b|(c|e){1,2}?|d)+(.)`, nil, `ace`, []Match{[]Group{{0, 3}, {1, 2}, {2, 3}}}},
{`(?<!-):(.*)(?<!-):`, nil, `a:bc-:de:f`, []Match{[]Group{{1, 9}, {2, 8}}}}, {`(?<!-):(.*)(?<!-):`, nil, `a:bc-:de:f`, []Match{[]Group{{1, 9}, {2, 8}}}},
{`(?<!\\):(.*)(?<!\\):`, nil, `a:bc\:de:f`, []Match{[]Group{{1, 9}, {2, 8}}}}, {`(?<!\\):(.*)(?<!\\):`, nil, `a:bc\:de:f`, []Match{[]Group{{1, 9}, {2, 8}}}},
{`(?<!\?)'(.*)(?<!\?)'`, nil, `a'bc?'de'f`, []Match{[]Group{{1, 9}, {2, 8}}}}, {`(?<!\?)'(.*)(?<!\?)'`, nil, `a'bc?'de'f`, []Match{[]Group{{1, 9}, {2, 8}}}},
@@ -612,6 +625,15 @@ var groupTests = []struct {
{`([\s]*)([\S]*)([\s]*)`, nil, ` testing!1972`, []Match{[]Group{{0, 13}, {0, 1}, {1, 13}, {13, 13}}, []Group{{13, 13}, {13, 13}, {13, 13}, {13, 13}}}}, {`([\s]*)([\S]*)([\s]*)`, nil, ` testing!1972`, []Match{[]Group{{0, 13}, {0, 1}, {1, 13}, {13, 13}}, []Group{{13, 13}, {13, 13}, {13, 13}, {13, 13}}}},
{`(\s*)(\S*)(\s*)`, nil, ` testing!1972`, []Match{[]Group{{0, 13}, {0, 1}, {1, 13}, {13, 13}}, []Group{{13, 13}, {13, 13}, {13, 13}, {13, 13}}}}, {`(\s*)(\S*)(\s*)`, nil, ` testing!1972`, []Match{[]Group{{0, 13}, {0, 1}, {1, 13}, {13, 13}}, []Group{{13, 13}, {13, 13}, {13, 13}, {13, 13}}}},
{`(([a-z]+):)?([a-z]+)$`, nil, `smil`, []Match{[]Group{{0, 4}, {-1, -1}, {-1, -1}, {0, 4}}}}, {`(([a-z]+):)?([a-z]+)$`, nil, `smil`, []Match{[]Group{{0, 4}, {-1, -1}, {-1, -1}, {0, 4}}}},
{`(x?)?`, nil, `x`, []Match{[]Group{{0, 1}, {0, 1}}, []Group{{1, 1}, {1, 1}}}},
{`"(?:\\"|[^"])*"`, nil, `"\""`, []Match{[]Group{{0, 4}}}},
{`^((a)c)?(ab)$`, nil, `ab`, []Match{[]Group{{0, 2}, {-1, -1}, {-1, -1}, {0, 2}}}},
{`^([ab]*)(?=(b)?)c`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 2}}}},
{`^([ab]*)(?!(b))c`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 2}}}},
{`^([ab]*)(?<!(a))c`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 2}}}},
} }
func TestFindAllMatches(t *testing.T) { func TestFindAllMatches(t *testing.T) {