Kleingrep is a regular expression engine, providing a library and command-line tool written in Go.
It aims to provide a more featureful engine than the one in Go's
[regexp](https://pkg.go.dev/regexp) package, while retaining some semblance of efficiency.
The engine does __not__ use backtracking, relying on the NFA-based method described in
[Russ Cox's articles](https://swtch.com/~rsc/regexp). As such, it is immune to catastrophic backtracking.
It also includes features not present in regexp, such as lookarounds and backreferences.
### Syntax
The syntax is, for the most part, a superset of Go's regexp. A full overview of the syntax can be found [here](https://pkg.go.dev/gitea.twomorecents.org/Rockingcool/kleingrep/regex#hdr-Syntax).
__For more information, see https://pkg.go.dev/gitea.twomorecents.org/Rockingcool/kleingrep/regex__.
// If lineFlag is enabled, we should only print something if:
// If lineFlag is enabled, we should only print something if:
@ -184,7 +182,7 @@ func main() {
// the corresponding end index.
// the corresponding end index.
// 3. If not, just print the character.
// 3. If not, just print the character.
ifsubstituteFlagEnabled{
ifsubstituteFlagEnabled{
fori:=rangetest_str_runes{
fori:=rangetest_str{
inMatchIndex:=false
inMatchIndex:=false
for_,m:=rangematchIndices{
for_,m:=rangematchIndices{
ifi==m[0].StartIdx{
ifi==m[0].StartIdx{
@ -195,21 +193,19 @@ func main() {
}
}
}
}
if!inMatchIndex{
if!inMatchIndex{
fmt.Fprintf(out,"%c",test_str_runes[i])
fmt.Fprintf(out,"%c",test_str[i])
}
}
}
}
}else{
}else{
fori,c:=rangetest_str_runes{
fori,c:=rangetest_str{
ifindicesToPrint.contains(i){
ifindicesToPrint.contains(i){
color.New(color.FgRed).Fprintf(out,"%c",c)
color.New(color.FgRed).Fprintf(out,"%c",c)
// Newline after every match - only if -o is enabled and -v is disabled.
// Newline after every match - only if -o is enabled and -v is disabled.
if*onlyFlag&&!(*invertFlag){
if*onlyFlag&&!(*invertFlag){
formatchIdxNum,idx:=rangematchIndices{
for_,idx:=rangematchIndices{
ifmatchIdxNum<len(matchIndices)-1{// Only print a newline after printing a match, if there are multiple matches on the line, and we aren't on the last one. This is because the newline that gets added at the end will take care of that.
ifi+1==idx[0].EndIdx{// End index is one more than last index of match
ifi+1==idx[0].EndIdx{// End index is one more than last index of match
fmt.Fprintf(out,"\n")
fmt.Fprintf(out,"\n")
break
break
}
}
}
}
}
}
}
@ -224,10 +220,6 @@ func main() {
iferr!=nil{
iferr!=nil{
panic(err)
panic(err)
}
}
// If the last character in the string wasn't a newline, AND we either don't have -o set or we do (and we've matched something), then print a newline
i+=(numDigits-1)// I have to move back a step, so that I can add a concatenation operator if necessary, and so that the increment at the bottom of the loop works as intended
i+=(numDigits-1)// I have to move back a step, so that I can add a concatenation operator if necessary, and so that the increment at the bottom of the loop works as intended
}elseifunicode.IsDigit(re_runes[i]){// Any other number - backreference
numDigits:=1
fori+numDigits<len(re_runes)&&unicode.IsDigit(re_runes[i+numDigits]){// Skip while we see a digit
ifi<len(re_runes)&&(re_runes[i]!='('&&re_runes[i]!=nonCapLparenRune&&re_runes[i]!='|'&&re_runes[i]!='\\')||(i>0&&re_runes[i-1]=='\\'){// Every character should be concatenated if it is escaped
ifi<len(re_runes)&&(re_runes[i]!='('&&re_runes[i]!=nonCapLparenRune&&re_runes[i]!='|'&&re_runes[i]!='\\')||(i>0&&re_runes[i-1]=='\\'){// Every character should be concatenated if it is escaped
parenIndices:=make([]Group,0)// I really shouldn't be using Group here, because that's strictly for matching purposes, but its a convenient way to store the indices of the opening and closing parens.
parenIndices=append(parenIndices,Group{0,0})// I append a weird value here, because the 0-th group doesn't have any parens. This way, the 1st group will be at index 1, 2nd at 2 ...
i+=numDigitsParsed-1// Shift forward by the number of digits that were parsed. Move back one character, because the loop increment will move us back to the next character automatically
i+=numDigitsParsed-1// Shift forward by the number of digits that were parsed. Move back one character, because the loop increment will move us back to the next character automatically
for(i+numDigitsParsed)<len(re_postfix)-1&&isOctal(re_postfix[i+numDigitsParsed])&&numDigitsParsed<=4{// The '-1' exists, because even in the worst case (the character class extends till the end), the last character must be a closing bracket (and nothing else)
for(i+numDigitsParsed)<len(re_postfix)-1&&isOctal(re_postfix[i+numDigitsParsed])&&numDigitsParsed<=3{// The '-1' exists, because even in the worst case (the character class extends till the end), the last character must be a closing bracket (and nothing else)
}elseifmiddleNode.groupBegin&&middleNode.numTransitions()==0{// The middle node is a lone lparen - something like '(())', and I'm looking at the first rparen
}elseifmiddleNode.groupBegin&&len(middleNode.transitions)==0{// The middle node is a lone lparen - something like '(())', and I'm looking at the first rparen
nfa=append(nfa,lparenNode)// I shouldn't have popped this out, because it is not involved in the current capturing group
nfa=append(nfa,lparenNode)// I shouldn't have popped this out, because it is not involved in the current capturing group
s.groupNum=middleNode.groupNum// In this case, the 'middle' node is actually an lparen
s.groupNum=middleNode.groupNum// In this case, the 'middle' node is actually an lparen
// Increment until we hit a character matching the start state (assuming not 0-state)
ifstart.isEmpty==false{
fori<len(str)&&!start.contentContains(str,i){
i++
}
startIdx=i
startingFrom=i
i++// Advance to next character (if we aren't at a 0-state, which doesn't match anything), so that we can check for transitions. If we advance at a 0-state, we will never get a chance to match the first character
}
start.threadGroups=newMatch(numGroups+1)
start.threadGroups=newMatch(numGroups+1)
start.threadGroups[0].StartIdx=i
// Check if the start state begins a group - if so, add the start index to our list
// Keep taking zero-states, until there are no more left to take
// Objective: If any of our current states have transitions to 0-states, replace them with the 0-state. Do this until there are no more transitions to 0-states, or there are no more unique 0-states to take.
// Take any transitions corresponding to current character
break
numStatesMatched:=0// The number of states which had at least 1 match for this round
}
assertionFailed:=false// Whether or not an assertion failed for this round
}elseif!currentState.isAlternation&&!currentState.isKleene&&!currentState.isQuestion&&!currentState.isBackreference&&!currentState.groupBegin&&!currentState.groupEnd&&currentState.assert==noneAssert{// Normal character
lastStateInList:=false// Whether or not a last state was in our list of states
iftempIndices[0].StartIdx==tempIndices[0].EndIdx{// If we have a zero-length match, we have to shift the index at which we start. Otherwise we keep looking at the same part of the string over and over.
// ReplaceAllFunc replaces every match of the expression in src, with the return value of the function replFunc.
iftempIndices.numValidGroups()>0{
// replFunc takes in the matched string. The return value is substituted in directly without expansion.
iftempIndices[0].StartIdx==tempIndices[0].EndIdx{// If we have a zero-length match, we have to shift the index at which we start. Otherwise we keep looking at the same part of the string over and over.
ifstartIdx==startingFrom{// Increment starting index if we haven't moved in the string. Prevents us from matching the same part of the string over and over.
isEmptybool// If it is empty - Union operator and Kleene star states will be empty
isEmptybool// If it is empty - Union operator and Kleene star states will be empty
isLastbool// If it is the last state (accept state)
isLastbool// If it is the last state (accept state)
output[]*nfaState// The outputs of the current state ie. the 'outward arrows'. A union operator state will have more than one of these.
output[]*nfaState// The outputs of the current state ie. the 'outward arrows'. A union operator state will have more than one of these.
// transitions map[int][]*nfaState // Transitions to different states (maps a character (int representation) to a _list of states. This is useful if one character can lead multiple states eg. ab|aa)
transitionsmap[int][]*nfaState// Transitions to different states (maps a character (int representation) to a _list of states. This is useful if one character can lead multiple states eg. ab|aa)
next*nfaState// The next state (not for alternation or kleene states)
isKleenebool// Identifies whether current node is a 0-state representing Kleene star
isKleenebool// Identifies whether current node is a 0-state representing Kleene star
assertassertType// Type of assertion of current node - NONE means that the node doesn't assert anything
isQuestionbool// Identifies whether current node is a 0-state representing the question operator
allCharsbool// Whether or not the state represents all characters (eg. a 'dot' metacharacter). A 'dot' node doesn't store any contents directly, as it would take up too much space
isAlternationbool// Identifies whether current node is a 0-state representing an alternation
except[]rune// Only valid if allChars is true - match all characters _except_ the ones in this block. Useful for inverting character classes.
isLazybool// Only for split states - Identifies whether or not to flip the order of branches (try one branch before the other)
lookaroundRegexstring// Only for lookaround states - Contents of the regex that the lookaround state holds
splitState*nfaState// Only for alternation states - the 'other' branch of the alternation ('next' is the first)
lookaroundNFA*nfaState// Holds the NFA of the lookaroundRegex - if it exists
assertassertType// Type of assertion of current node - NONE means that the node doesn't assert anything
lookaroundNumCaptureGroupsint// Number of capturing groups in lookaround regex if current node is a lookaround
allCharsbool// Whether or not the state represents all characters (eg. a 'dot' metacharacter). A 'dot' node doesn't store any contents directly, as it would take up too much space
groupBeginbool// Whether or not the node starts a capturing group
except[]rune// Only valid if allChars is true - match all characters _except_ the ones in this block. Useful for inverting character classes.
groupEndbool// Whether or not the node ends a capturing group
lookaroundRegexstring// Only for lookaround states - Contents of the regex that the lookaround state holds
groupNumint// Which capturing group the node starts / ends
lookaroundNFA*nfaState// Holds the NFA of the lookaroundRegex - if it exists
lookaroundNumCaptureGroupsint// Number of capturing groups in lookaround regex if current node is a lookaround
groupBeginbool// Whether or not the node starts a capturing group
groupEndbool// Whether or not the node ends a capturing group
groupNumint// Which capturing group the node starts / ends
isBackreferencebool// Whether or not current node is backreference
referredGroupint// If current node is a backreference, the node that it points to
// The following properties depend on the current match - I should think about resetting them for every match.
// The following properties depend on the current match - I should think about resetting them for every match.
threadGroups []Group// Assuming that a state is part of a 'thread' in the matching process, this array stores the indices of capturing groups in the current thread. As matches are found for this state, its groups will be copied over.
zeroMatchFoundbool// Whether or not the state has been used for a zero-length match - only relevant for zero states
threadBackref int// If current node is a backreference, how many characters to look forward into the referred group
threadGroups[]Group// Assuming that a state is part of a 'thread' in the matching process, this array stores the indices of capturing groups in the current thread. As matches are found for this state, its groups will be copied over.
return!slices.Contains(slices.Concat(notDotChars,s.except),str[idx])// Return true only if the index isn't a 'notDotChar', or isn't one of the exception characters for the current node.
return!slices.Contains(slices.Concat(notDotChars,s.except),str[idx])// Return true only if the index isn't a 'notDotChar', or isn't one of the exception characters for the current node.
@ -234,84 +218,74 @@ func (s nfaState) isLookaround() bool {
// if st.isKleene { // A State representing a Kleene Star has transitions going out, which loop back to it. If all those transitions point to the same (single) state, then it must be a last state
ifst.isKleene{// A State representing a Kleene Star has transitions going out, which loop back to it. If all those transitions point to the same (single) state, then it must be a last state
varinfinite_repsint=-1// Represents infinite reps eg. the end range in {5,}
varinfinite_repsint=-1// Represents infinite reps eg. the end range in {5,}
// This represents a node in the postfix representation of the expression
// This represents a node in the postfix representation of the expression
typepostfixNodestruct{
typepostfixNodestruct{
nodetypenodeType
nodetypenodeType
contents[]rune// Contents of the node
contents[]rune// Contents of the node
startRepsint// Minimum number of times the node should be repeated - used with numeric specifiers
startRepsint// Minimum number of times the node should be repeated - used with numeric specifiers
endRepsint// Maximum number of times the node should be repeated - used with numeric specifiers
endRepsint// Maximum number of times the node should be repeated - used with numeric specifiers
allCharsbool// Whether or not the current node represents all characters (eg. dot metacharacter)
allCharsbool// Whether or not the current node represents all characters (eg. dot metacharacter)
except[]postfixNode// For inverted character classes, we match every unicode character _except_ a few. In this case, allChars is true and the exceptions are placed here.
except[]postfixNode// For inverted character classes, we match every unicode character _except_ a few. In this case, allChars is true and the exceptions are placed here.
lookaroundSignint// ONLY USED WHEN nodetype == ASSERTION. Whether we have a positive or negative lookaround.
lookaroundSignint// ONLY USED WHEN nodetype == ASSERTION. Whether we have a positive or negative lookaround.
lookaroundDirint// Lookbehind or lookahead
lookaroundDirint// Lookbehind or lookahead
nodeContents[]postfixNode// ONLY USED WHEN nodetype == CHARCLASS. Holds all the nodes inside the given CHARCLASS node.
nodeContents[]postfixNode// ONLY USED WHEN nodetype == CHARCLASS. Holds all the nodes inside the given CHARCLASS node.
referencedGroupint// ONLY USED WHEN nodetype == backreferenceNode. Holds the group which this one refers to. After parsing is done, the expression will be rewritten eg. (a)\1 will become (a)(a). So the return value of ShuntingYard() shouldn't contain a backreferenceNode.
isLazybool// ONLY USED WHEN nodetype == kleene or question
}
}
// Converts the given list of postfixNodes to one node of type CHARCLASS.
// Converts the given list of postfixNodes to one node of type CHARCLASS.
// groupIncrement := 0 // If we have a backreference before the group its referring to, then the group its referring to will have its group number incremented.
// rtv = slices.Concat(rtv, nodes[parenIndices[node.referencedGroup].StartIdx:parenIndices[node.referencedGroup].EndIdx+1]) // Add all the nodes in the group to rtv
// numGroups += 1
// if i < parenIndices[node.referencedGroup].StartIdx {
// Numeric range tests - this is a feature that I added, and doesn't exist
// Numeric range tests - this is a feature that I added, and doesn't exist
// in any other mainstream regex engine
// in any other mainstream regex engine
@ -538,30 +518,6 @@ var reTests = []struct {
{`<389-400`,nil,`-`,nil},
{`<389-400`,nil,`-`,nil},
{`<389-400>`,nil,`391`,[]Group{{0,3}}},
{`<389-400>`,nil,`391`,[]Group{{0,3}}},
{`\b<1-10000>\b`,nil,`America declared independence in 1776.`,[]Group{{33,37}}},
{`\b<1-10000>\b`,nil,`America declared independence in 1776.`,[]Group{{33,37}}},
{`\p{Tamil}+`,nil,`உயிரெழுத்து`,[]Group{{0,11}}},// Each letter and matra is counted as a separate rune, so 'u', 'ya', 'e (matra), 'ra', 'e (matra)', 'zha', (oo (matra), 'tha', 'ith', 'tha', 'oo (matra)'.