Kleingrep is a regular expression engine, providing a library and command-line tool written in Go.
It aims to provide a more featureful engine than the one in Go's
[regexp](https://pkg.go.dev/regexp) package, while retaining some semblance of efficiency.
The engine does __not__ use backtracking, relying on the NFA-based method described in
[Russ Cox's articles](https://swtch.com/~rsc/regexp). As such, it is immune to catastrophic backtracking.
It also includes features not present in regexp, such as lookarounds and backreferences.
### Syntax
The syntax is, for the most part, a superset of Go's regexp. A full overview of the syntax can be found [here](https://pkg.go.dev/gitea.twomorecents.org/Rockingcool/kleingrep/regex#hdr-Syntax).
__For more information, see https://pkg.go.dev/gitea.twomorecents.org/Rockingcool/kleingrep/regex__.
// If lineFlag is enabled, we should only print something if:
// If lineFlag is enabled, we should only print something if:
@ -182,7 +184,7 @@ func main() {
// the corresponding end index.
// the corresponding end index.
// 3. If not, just print the character.
// 3. If not, just print the character.
ifsubstituteFlagEnabled{
ifsubstituteFlagEnabled{
fori:=rangetest_str{
fori:=rangetest_str_runes{
inMatchIndex:=false
inMatchIndex:=false
for_,m:=rangematchIndices{
for_,m:=rangematchIndices{
ifi==m[0].StartIdx{
ifi==m[0].StartIdx{
@ -193,19 +195,21 @@ func main() {
}
}
}
}
if!inMatchIndex{
if!inMatchIndex{
fmt.Fprintf(out,"%c",test_str[i])
fmt.Fprintf(out,"%c",test_str_runes[i])
}
}
}
}
}else{
}else{
fori,c:=rangetest_str{
fori,c:=rangetest_str_runes{
ifindicesToPrint.contains(i){
ifindicesToPrint.contains(i){
color.New(color.FgRed).Fprintf(out,"%c",c)
color.New(color.FgRed).Fprintf(out,"%c",c)
// Newline after every match - only if -o is enabled and -v is disabled.
// Newline after every match - only if -o is enabled and -v is disabled.
if*onlyFlag&&!(*invertFlag){
if*onlyFlag&&!(*invertFlag){
for_,idx:=rangematchIndices{
formatchIdxNum,idx:=rangematchIndices{
ifi+1==idx[0].EndIdx{// End index is one more than last index of match
ifmatchIdxNum<len(matchIndices)-1{// Only print a newline after printing a match if there are multiple matches on the line and we aren't on the last one. This is because the newline that gets added at the end will take care of the last match.
fmt.Fprintf(out,"\n")
ifi+1==idx[0].EndIdx{// End index is one more than last index of match
break
fmt.Fprintf(out,"\n")
break
}
}
}
}
}
}
}
@ -220,6 +224,10 @@ func main() {
iferr!=nil{
iferr!=nil{
panic(err)
panic(err)
}
}
fmt.Println()
// If the last character in the string wasn't a newline, AND we either don't have -o set or we do (and we've matched something), then print a newline
ifi<len(re_runes)&&(re_runes[i]!='('&&re_runes[i]!=nonCapLparenRune&&re_runes[i]!='|'&&re_runes[i]!='\\')||(i>0&&re_runes[i-1]=='\\'){// Every character should be concatenated if it is escaped
ifi<len(re_runes)&&(re_runes[i]!='('&&re_runes[i]!=nonCapLparenRune&&re_runes[i]!='|'&&re_runes[i]!='\\')||(i>0&&re_runes[i-1]=='\\'){// Every character should be concatenated if it is escaped
isKleenebool// Identifies whether current node is a 0-state representing Kleene star
isKleenebool// Identifies whether current node is a 0-state representing Kleene star
isQuestionbool// Identifies whether current node is a 0-state representing the question operator
isQuestionbool// Identifies whether current node is a 0-state representing the question operator
isAlternationbool// Identifies whether current node is a 0-state representing an alternation
isAlternationbool// Identifies whether current node is a 0-state representing an alternation
isLazybool// Only for split states - Identifies whether or not to flip the order of branches (try one branch before the other)
splitState*nfaState// Only for alternation states - the 'other' branch of the alternation ('next' is the first)
splitState*nfaState// Only for alternation states - the 'other' branch of the alternation ('next' is the first)
assertassertType// Type of assertion of current node - NONE means that the node doesn't assert anything
assertassertType// Type of assertion of current node - NONE means that the node doesn't assert anything
allCharsbool// Whether or not the state represents all characters (eg. a 'dot' metacharacter). A 'dot' node doesn't store any contents directly, as it would take up too much space
allCharsbool// Whether or not the state represents all characters (eg. a 'dot' metacharacter). A 'dot' node doesn't store any contents directly, as it would take up too much space
@ -44,11 +45,11 @@ type nfaState struct {
groupBeginbool// Whether or not the node starts a capturing group
groupBeginbool// Whether or not the node starts a capturing group
groupEndbool// Whether or not the node ends a capturing group
groupEndbool// Whether or not the node ends a capturing group
groupNumint// Which capturing group the node starts / ends
groupNumint// Which capturing group the node starts / ends
isBackreferencebool// Whether or not current node is backreference
referredGroupint// If current node is a backreference, the node that it points to
// The following properties depend on the current match - I should think about resetting them for every match.
// The following properties depend on the current match - I should think about resetting them for every match.
threadGroups[]Group// Assuming that a state is part of a 'thread' in the matching process, this array stores the indices of capturing groups in the current thread. As matches are found for this state, its groups will be copied over.
threadGroups[]Group// Assuming that a state is part of a 'thread' in the matching process, this array stores the indices of capturing groups in the current thread. As matches are found for this state, its groups will be copied over.
isBackreferencebool// Whether or not current node is backreference
threadBackrefint// If current node is a backreference, how many characters to look forward into the referred group
referredGroupint// If current node is a backreference, the node that it points to
threadBackrefint// If current node is a backreference, how many characters to look forward into the referred group
nodeContents[]postfixNode// ONLY USED WHEN nodetype == CHARCLASS. Holds all the nodes inside the given CHARCLASS node.
nodeContents[]postfixNode// ONLY USED WHEN nodetype == CHARCLASS. Holds all the nodes inside the given CHARCLASS node.
referencedGroupint// ONLY USED WHEN nodetype == backreferenceNode. Holds the group which this one refers to. After parsing is done, the expression will be rewritten eg. (a)\1 will become (a)(a). So the return value of ShuntingYard() shouldn't contain a backreferenceNode.
referencedGroupint// ONLY USED WHEN nodetype == backreferenceNode. Holds the group which this one refers to. After parsing is done, the expression will be rewritten eg. (a)\1 will become (a)(a). So the return value of ShuntingYard() shouldn't contain a backreferenceNode.
isLazybool// ONLY USED WHEN nodetype == kleene or question
}
}
// Converts the given list of postfixNodes to one node of type CHARCLASS.
// Converts the given list of postfixNodes to one node of type CHARCLASS.
// Numeric range tests - this is a feature that I added, and doesn't exist
// Numeric range tests - this is a feature that I added, and doesn't exist
// in any other mainstream regex engine
// in any other mainstream regex engine
@ -516,6 +538,30 @@ var reTests = []struct {
{`<389-400`,nil,`-`,nil},
{`<389-400`,nil,`-`,nil},
{`<389-400>`,nil,`391`,[]Group{{0,3}}},
{`<389-400>`,nil,`391`,[]Group{{0,3}}},
{`\b<1-10000>\b`,nil,`America declared independence in 1776.`,[]Group{{33,37}}},
{`\b<1-10000>\b`,nil,`America declared independence in 1776.`,[]Group{{33,37}}},
{`\p{Tamil}+`,nil,`உயிரெழுத்து`,[]Group{{0,11}}},// Each letter and matra is counted as a separate rune, so 'u', 'ya', 'e (matra)', 'ra', 'e (matra)', 'zha', 'oo (matra)', 'tha', 'ith', 'tha', 'oo (matra)'.