Compare commits

11 commits (implementU ... 3eaf4eb19c):

- 3eaf4eb19c
- d453815831
- 3a2916baae
- 9d6344719f
- f5c868566b
- 1cd6da218f
- 277cbc0fc5
- 3924502b72
- 36b009747b
- 6cd0a10a8f
- 69fb96c43d
README.md (new file, +17 lines)
@@ -0,0 +1,17 @@
+## Kleingrep
+
+Kleingrep is a regular expression engine, providing a library and command-line tool written in Go.
+
+It aims to provide a more featureful engine, compared to the one in
+[Go's standard library](https://pkg.go.dev/regexp), while retaining some semblance of efficiency.
+
+The engine does __not__ use backtracking, relying on the NFA-based method described in
+[Russ Cox's articles](https://swtch.com/~rsc/regexp). As such, it is immune to catastrophic backtracking.
+
+It also includes features not present in regexp, such as lookarounds and backreferences.
+
+### Syntax
+
+The syntax is, for the most part, a superset of Go's regexp. A full overview of the syntax can be found [here](https://pkg.go.dev/gitea.twomorecents.org/Rockingcool/kleingrep/regex#hdr-Syntax).
+
+__For more information, see https://pkg.go.dev/gitea.twomorecents.org/Rockingcool/kleingrep/regex__.
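
The package documentation linked above covers usage in detail. As a rough sketch of how the library might be called: the import path is the one given in the README, and `Compile`, `ReFlag`, and `RE_CASE_INSENSITIVE` all appear elsewhere in this diff, but the exact `Compile` signature is inferred from how flags are used in the tests, and no matching methods are assumed here.

```go
package main

import (
	"fmt"

	"gitea.twomorecents.org/Rockingcool/kleingrep/regex" // import path from the README
)

func main() {
	// Flags are passed as arguments to Compile rather than embedded in the
	// pattern (see the documentation change later in this diff).
	re, err := regex.Compile(`a.+?c`, regex.RE_CASE_INSENSITIVE)
	if err != nil {
		panic(err)
	}
	fmt.Printf("compiled: %v\n", re)
	// Matching functions are listed in the package docs linked above.
}
```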
@@ -64,7 +64,7 @@ const (
 )

 func isOperator(c rune) bool {
-    if c == '+' || c == '?' || c == '*' || c == '|' || c == concatRune {
+    if c == '+' || c == '?' || c == '*' || c == '|' || c == concatRune || c == lazyPlusRune || c == lazyKleeneRune || c == lazyQuestionRune {
         return true
     }
     return false
@@ -72,7 +72,7 @@ func isOperator(c rune) bool {

 /* priority returns the priority of the given operator */
 func priority(op rune) int {
-    precedence := []rune{'|', concatRune, '+', '*', '?'}
+    precedence := []rune{'|', concatRune, '+', lazyPlusRune, '*', lazyKleeneRune, '?', lazyQuestionRune}
     return slices.Index(precedence, op)
 }

@@ -208,9 +208,6 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
 // metacharacter. Later, in thompson(), these will be converted back. This avoids
 // confusion in detecting whether a character is escaped eg. detecting
 // whether '\\[a]' has an escaped opening bracket (it doesn't).
-//
-// 5. Check for non-greedy operators. These are not supported at the moment, so an error
-// must be thrown if the user attempts to use a non-greedy operator.
 for i := 0; i < len(re_runes_orig); i++ {
     c := re_runes_orig[i]
     if c == '<' && (i == 0 || (re_runes_orig[i-1] != '\\' && re_runes_orig[i-1] != '?')) {
@@ -257,8 +254,16 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
     } else if c == ']' && (i == 0 || re_runes[len(re_runes)-1] != '\\') {
         re_runes = append(re_runes, rbracketRune)
         continue
-    } else if slices.Contains([]rune{'+', '*', '?'}, c) && (i < len(re_runes_orig)-1 && re_runes_orig[i+1] == '?') {
-        return nil, fmt.Errorf("non-greedy operators are not supported")
+    } else if slices.Contains([]rune{'+', '*', '?'}, c) && (i > 0 && re_runes_orig[i-1] != '\\') && (i < len(re_runes_orig)-1 && re_runes_orig[i+1] == '?') {
+        switch c {
+        case '+':
+            re_runes = append(re_runes, lazyPlusRune)
+        case '*':
+            re_runes = append(re_runes, lazyKleeneRune)
+        case '?':
+            re_runes = append(re_runes, lazyQuestionRune)
+        }
+        i++
     } else {
         re_runes = append(re_runes, c)
     }
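
The effect of this branch is to collapse a two-character non-greedy operator (`*?`, `+?`, `??`) into a single placeholder rune before the shunting-yard conversion, instead of rejecting it. A self-contained sketch of the same idea follows; the placeholder code points match the ones declared later in this diff, while the function itself is only an illustration of the preprocessing step, not the engine's actual code.

```go
package main

import (
	"fmt"
	"slices"
)

// Placeholder runes for lazy operators (values as declared later in this diff).
const (
	lazyKleeneRune   rune = 0xF000A
	lazyPlusRune     rune = 0xF000B
	lazyQuestionRune rune = 0xF000C
)

// collapseLazy rewrites an unescaped "+?", "*?" or "??" into a single
// lazy-operator rune, mirroring the preprocessing branch shown above.
func collapseLazy(re []rune) []rune {
	out := make([]rune, 0, len(re))
	for i := 0; i < len(re); i++ {
		c := re[i]
		if slices.Contains([]rune{'+', '*', '?'}, c) && i > 0 && re[i-1] != '\\' && i < len(re)-1 && re[i+1] == '?' {
			switch c {
			case '+':
				out = append(out, lazyPlusRune)
			case '*':
				out = append(out, lazyKleeneRune)
			case '?':
				out = append(out, lazyQuestionRune)
			}
			i++ // skip the trailing '?'
			continue
		}
		out = append(out, c)
	}
	return out
}

func main() {
	fmt.Printf("%U\n", collapseLazy([]rune(`a.+?c`))) // [U+0061 U+002E U+F000B U+0063]
}
```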
@@ -421,7 +426,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
     }
     if i < len(re_runes) && (re_runes[i] != '(' && re_runes[i] != nonCapLparenRune && re_runes[i] != '|' && re_runes[i] != '\\') || (i > 0 && re_runes[i-1] == '\\') { // Every character should be concatenated if it is escaped
         if i < len(re_runes)-1 {
-            if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != '+' && re_runes[i+1] != '?' && re_runes[i+1] != ')' && re_runes[i+1] != '{' {
+            if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != lazyKleeneRune && re_runes[i+1] != '+' && re_runes[i+1] != lazyPlusRune && re_runes[i+1] != '?' && re_runes[i+1] != lazyQuestionRune && re_runes[i+1] != ')' && re_runes[i+1] != '{' {
                 re_postfix = append(re_postfix, concatRune)
             }
         }
@@ -940,6 +945,10 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
     }
     outQueue[idx].startReps = startRangeNum
     outQueue[idx].endReps = endRangeNum
+    if i < len(re_postfix)-1 && re_postfix[i+1] == '?' { // lazy repetition
+        outQueue[idx].isLazy = true
+        i++
+    }
 }
 if c == '(' || c == nonCapLparenRune {
     opStack = append(opStack, c)
@@ -1233,6 +1242,9 @@ func thompson(re []postfixNode) (Reg, error) {
             if err != nil {
                 return Reg{}, err
             }
+            if c.isLazy {
+                stateToAdd.isLazy = true
+            }
             nfa = append(nfa, stateToAdd)
         case plusNode: // a+ is equivalent to aa*
             s1 := mustPop(&nfa)
@@ -1240,6 +1252,9 @@ func thompson(re []postfixNode) (Reg, error) {
             if err != nil {
                 return Reg{}, err
             }
+            if c.isLazy {
+                s2.isLazy = true
+            }
             s1 = concatenate(s1, s2)
             nfa = append(nfa, s1)
         case questionNode: // ab? is equivalent to a(b|)
@@ -1251,6 +1266,9 @@ func thompson(re []postfixNode) (Reg, error) {
             if err != nil {
                 return Reg{}, err
             }
+            if c.isLazy {
+                s2.isLazy = true
+            }
             nfa = append(nfa, s2)
         case pipeNode:
             // A pipe operator doesn't actually need either operand to be present. If an operand isn't present,
@@ -1306,6 +1324,9 @@ func thompson(re []postfixNode) (Reg, error) {
             if err != nil {
                 return Reg{}, err
             }
+            if c.isLazy {
+                s2.isLazy = true
+            }
             stateToAdd = concatenate(stateToAdd, s2)
         } else { // Case 2
             for i := c.startReps; i < c.endReps; i++ {
@@ -1313,6 +1334,9 @@ func thompson(re []postfixNode) (Reg, error) {
                 if err != nil {
                     return Reg{}, fmt.Errorf("error processing bounded repetition")
                 }
+                if c.isLazy {
+                    tmp.isLazy = true
+                }
                 stateToAdd = concatenate(stateToAdd, tmp)
             }
         }

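All of these thompson() hunks make the same change: after building the split state for a quantifier (or a bounded repetition), the postfix node's isLazy flag is copied onto it. The construction itself is untouched, so the identities in the comments (a+ is equivalent to aa*, ab? is equivalent to a(b|)) still hold; only which branch of the split state is preferred changes. As a quick cross-check of the plus identity in its lazy form, Go's standard regexp (also a non-backtracking engine) produces the same matches for `a+?` and `aa*?`. This snippet is purely illustrative and not part of the change:

```go
package main

import (
	"fmt"
	"regexp"
)

func main() {
	// a+? should behave exactly like a(a*?): one mandatory 'a', then as few extras as possible.
	input := "aaab"
	fmt.Println(regexp.MustCompile(`a+?`).FindAllString(input, -1))  // [a a a]
	fmt.Println(regexp.MustCompile(`aa*?`).FindAllString(input, -1)) // [a a a]
}
```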
@@ -153,8 +153,9 @@ returns the 0-group.
 The following features from [regexp] are (currently) NOT supported:
 1. Named capturing groups
 2. Non-greedy operators
-3. Embedded flags (flags are instead passed as arguments to [Compile])
-4. Literal text with \Q ... \E
+3. Negated POSIX classes
+4. Embedded flags (flags are instead passed as arguments to [Compile])
+5. Literal text with \Q ... \E

 The following features are not available in [regexp], but are supported in my engine:
 1. Lookarounds
@@ -234,14 +234,14 @@ func addStateToList(str []rune, idx int, list []nfaState, state nfaState, thread
     }
     visited = append(visited, state)

-    if state.isKleene || state.isQuestion {
+    if (state.isKleene || state.isQuestion) && (state.isLazy == false) { // Greedy quantifiers
         copyThread(state.splitState, state)
         list := addStateToList(str, idx, list, *state.splitState, threadGroups, visited, preferLongest)
         copyThread(state.next, state)
         list = addStateToList(str, idx, list, *state.next, threadGroups, visited, preferLongest)
         return list
     }
-    if state.isAlternation {
+    if state.isAlternation || ((state.isKleene || state.isQuestion) && state.isLazy) { // Alternation or lazy quantifier
         copyThread(state.next, state)
         list := addStateToList(str, idx, list, *state.next, threadGroups, visited, preferLongest)
         copyThread(state.splitState, state)
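This is the heart of the feature: a greedy quantifier explores the looping branch (splitState) before the continuation (next), while a lazy quantifier is routed through the alternation path, which explores next before splitState. The order in which threads are added effectively encodes their priority, so flipping it turns "prefer to match more" into "prefer to match less". A self-contained sketch of that ordering idea follows; the splitState/next/isLazy names follow the diff, but the type and function here are purely illustrative.

```go
package main

import "fmt"

// splitChoice is a stand-in for the engine's split state (illustrative only).
type splitChoice struct {
	loop, next string // "loop" plays the role of splitState, "next" the continuation
	isLazy     bool
}

// branchOrder mirrors addStateToList above: greedy visits the looping branch
// first; lazy visits the continuation first.
func branchOrder(s splitChoice) []string {
	if s.isLazy {
		return []string{s.next, s.loop}
	}
	return []string{s.loop, s.next}
}

func main() {
	fmt.Println(branchOrder(splitChoice{loop: "repeat 'a'", next: "match 'c'"}))               // [repeat 'a' match 'c']
	fmt.Println(branchOrder(splitChoice{loop: "repeat 'a'", next: "match 'c'", isLazy: true})) // [match 'c' repeat 'a']
}
```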
@@ -16,8 +16,11 @@ var rparenRune rune = 0xF0006
 var nonCapLparenRune rune = 0xF0007 // Represents a non-capturing group's LPAREN
 var escBackslashRune rune = 0xF0008 // Represents an escaped backslash
 var charRangeRune rune = 0xF0009    // Represents a character range
+var lazyKleeneRune rune = 0xF000A   // Represents a lazy kleene star
+var lazyPlusRune rune = 0xF000B     // Represents a lazy plus operator
+var lazyQuestionRune rune = 0xF000C // Represents a lazy question operator

-var specialChars = []rune{'?', '*', '\\', '^', '$', '{', '}', '(', ')', '[', ']', '+', '|', '.', concatRune, '<', '>', lbracketRune, rbracketRune, nonCapLparenRune}
+var specialChars = []rune{'?', lazyQuestionRune, '*', lazyKleeneRune, '\\', '^', '$', '{', '}', '(', ')', '[', ']', '+', lazyPlusRune, '|', '.', concatRune, '<', '>', lbracketRune, rbracketRune, nonCapLparenRune}

 // An interface for int and rune, which are identical
 type character interface {
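The new placeholder runes follow the existing pattern of using code points from Unicode's Supplementary Private Use Area-A, so they will not clash with any character a user would ordinarily type in a pattern. A quick illustrative check, not part of the engine:

```go
package main

import (
	"fmt"
	"unicode"
)

func main() {
	// lazyKleeneRune, lazyPlusRune, lazyQuestionRune
	for _, r := range []rune{0xF000A, 0xF000B, 0xF000C} {
		fmt.Printf("%U private-use: %v\n", r, unicode.Is(unicode.Co, r)) // all true
	}
}
```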
@@ -34,6 +34,7 @@ type nfaState struct {
     isKleene      bool       // Identifies whether current node is a 0-state representing Kleene star
     isQuestion    bool       // Identifies whether current node is a 0-state representing the question operator
     isAlternation bool       // Identifies whether current node is a 0-state representing an alternation
+    isLazy        bool       // Only for split states - Identifies whether or not to flip the order of branches (try one branch before the other)
     splitState    *nfaState  // Only for alternation states - the 'other' branch of the alternation ('next' is the first)
     assert        assertType // Type of assertion of current node - NONE means that the node doesn't assert anything
     allChars      bool       // Whether or not the state represents all characters (eg. a 'dot' metacharacter). A 'dot' node doesn't store any contents directly, as it would take up too much space
@@ -77,6 +78,7 @@ func cloneStateHelper(stateToClone *nfaState, cloneMap map[*nfaState]*nfaState)
         isKleene:      stateToClone.isKleene,
         isQuestion:    stateToClone.isQuestion,
         isAlternation: stateToClone.isAlternation,
+        isLazy:        stateToClone.isLazy,
         assert:        stateToClone.assert,
         allChars:      stateToClone.allChars,
         except:        append([]rune{}, stateToClone.except...),
@@ -421,6 +423,7 @@ func (s nfaState) equals(other nfaState) bool {
         s.next == other.next &&
         s.isKleene == other.isKleene &&
         s.isQuestion == other.isQuestion &&
+        s.isLazy == other.isLazy &&
         s.isAlternation == other.isAlternation &&
         s.splitState == other.splitState &&
         s.assert == other.assert &&
@@ -44,6 +44,7 @@ type postfixNode struct {
     lookaroundDir   int           // Lookbehind or lookahead
     nodeContents    []postfixNode // ONLY USED WHEN nodetype == CHARCLASS. Holds all the nodes inside the given CHARCLASS node.
     referencedGroup int           // ONLY USED WHEN nodetype == backreferenceNode. Holds the group which this one refers to. After parsing is done, the expression will be rewritten eg. (a)\1 will become (a)(a). So the return value of ShuntingYard() shouldn't contain a backreferenceNode.
+    isLazy          bool          // ONLY USED WHEN nodetype == kleene or question
 }

 // Converts the given list of postfixNodes to one node of type CHARCLASS.
@@ -162,10 +163,19 @@ func newPostfixNode(contents ...rune) postfixNode {
     switch contents[0] {
     case '+':
         to_return.nodetype = plusNode
+    case lazyPlusRune:
+        to_return.nodetype = plusNode
+        to_return.isLazy = true
     case '?':
         to_return.nodetype = questionNode
+    case lazyQuestionRune:
+        to_return.nodetype = questionNode
+        to_return.isLazy = true
     case '*':
         to_return.nodetype = kleeneNode
+    case lazyKleeneRune:
+        to_return.nodetype = kleeneNode
+        to_return.isLazy = true
     case '|':
         to_return.nodetype = pipeNode
     case concatRune:
@@ -488,7 +488,25 @@ var reTests = []struct {
     {`[b-e]`, nil, `f`, []Group{}},

     {`*?`, nil, `-`, nil},
-    {`a*?`, nil, `-`, nil}, // non-greedy operators are not supported
+    {`a.+c`, nil, `abcabc`, []Group{{0, 6}}},
+    // Lazy quantifier tests
+    {`a.+?c`, nil, `abcabc`, []Group{{0, 3}, {3, 6}}},
+    {`ab*?bc`, []ReFlag{RE_CASE_INSENSITIVE}, `ABBBBC`, []Group{{0, 6}}},
+    {`ab+?bc`, []ReFlag{RE_CASE_INSENSITIVE}, `ABBC`, []Group{{0, 4}}},
+    {`ab??bc`, []ReFlag{RE_CASE_INSENSITIVE}, `ABBC`, []Group{{0, 4}}},
+    {`ab??bc`, []ReFlag{RE_CASE_INSENSITIVE}, `ABC`, []Group{{0, 3}}},
+    {`ab??bc`, []ReFlag{RE_CASE_INSENSITIVE}, `ABBBBC`, []Group{}},
+    {`ab??c`, []ReFlag{RE_CASE_INSENSITIVE}, `ABC`, []Group{{0, 3}}},
+    {`a.*?c`, []ReFlag{RE_CASE_INSENSITIVE}, `AXYZC`, []Group{{0, 5}}},
+    {`a.+?c`, []ReFlag{RE_CASE_INSENSITIVE}, `ABCABC`, []Group{{0, 3}, {3, 6}}},
+    {`a.*?c`, []ReFlag{RE_CASE_INSENSITIVE}, `ABCABC`, []Group{{0, 3}, {3, 6}}},
+    {`.*?\S *:`, nil, `xx:`, []Group{{0, 3}}},
+    {`a[ ]*? (\d+).*`, nil, `a   10`, []Group{{0, 6}}},
+    {`a[ ]*? (\d+).*`, nil, `a    10`, []Group{{0, 7}}},
+    {`"(?:\\"|[^"])*?"`, nil, `"\""`, []Group{{0, 4}}},
+    {`^.*?$`, nil, "one\ntwo\nthree", []Group{}},
+    {`a[^>]*?b`, nil, `a>b`, []Group{}},
+    {`^a*?$`, nil, `foo`, []Group{}},

     // Numeric range tests - this is a feature that I added, and doesn't exist
     // in any other mainstream regex engine
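The first new expectations capture the essential difference: on `abcabc`, greedy `a.+c` consumes the whole string, while lazy `a.+?c` stops at the first `c` and then matches a second time. Go's standard regexp agrees, which makes for an easy cross-check; the snippet below is illustrative only and does not use the engine under change.

```go
package main

import (
	"fmt"
	"regexp"
)

func main() {
	fmt.Println(regexp.MustCompile(`a.+c`).FindAllStringIndex("abcabc", -1))  // [[0 6]]
	fmt.Println(regexp.MustCompile(`a.+?c`).FindAllStringIndex("abcabc", -1)) // [[0 3] [3 6]]
}
```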
@@ -719,6 +737,18 @@ var groupTests = []struct {
     // {`(a|ab|c|bcd)*(d*)`, nil, `ababcd`, []Match{[]Group{{0, 6}, {3, 6}, {6, 6}}, []Group{{6, 6}, {6, 6}, {6, 6}}}},
     // // Bug - this should give {0,3},{0,3},{0,0},{0,3},{3,3} but it gives {0,3},{0,2},{0,1},{1,2},{2,3}
     // // {`((a*)(b|abc))(c*)`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 3}, {0, 0}, {0, 3}, {3, 3}}}},
+
+    // Lazy quantifier tests
+    {`a(?:b|c|d)+?(.)`, nil, `ace`, []Match{[]Group{{0, 3}, {2, 3}}}},
+    {`a(?:b|(c|e){1,2}?|d)+?(.)`, nil, `ace`, []Match{[]Group{{0, 3}, {1, 2}, {2, 3}}}},
+    {`(?<!-):(.*?)(?<!-):`, nil, `a:bc-:de:f`, []Match{[]Group{{1, 9}, {2, 8}}}},
+    {`(?<!\\):(.*?)(?<!\\):`, nil, `a:bc\:de:f`, []Match{[]Group{{1, 9}, {2, 8}}}},
+    {`(?<!\?)'(.*?)(?<!\?)'`, nil, `a'bc?'de'f`, []Match{[]Group{{1, 9}, {2, 8}}}},
+    {`.*?x\s*\z(.*)`, []ReFlag{RE_MULTILINE, RE_SINGLE_LINE}, "xx\nx\n", []Match{[]Group{{0, 5}, {5, 5}}}},
+    {`.*?x\s*\z(.*)`, []ReFlag{RE_MULTILINE}, "xx\nx\n", []Match{[]Group{{3, 5}, {5, 5}}}},
+    {`^([ab]*?)(?=(b)?)c`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 2}, {-1, -1}}}},
+    {`^([ab]*?)(?!(b))c`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 2}, {-1, -1}}}},
+    {`^([ab]*?)(?<!(a))c`, nil, `abc`, []Match{[]Group{{0, 3}, {0, 2}, {-1, -1}}}},
 }

 func TestFind(t *testing.T) {