Compare commits

...

10 Commits

@ -51,6 +51,9 @@ func shuntingYard(re string) []postfixNode {
// Eventually, I might be able to add it into the main parsing loop, to reduce the time
// complexity.
// A numeric range has the syntax: <num1-num2>. It matches all numbers in this range.
//
// Also check for non-capturing groups. The LPAREN of a non-capturing group looks like this: '(?:'
// I take this out, and put in a special character - NONCAPLPAREN_CHAR.
for i := 0; i < len(re_runes_orig); i++ {
c := re_runes_orig[i]
if c == '<' && (i == 0 || (re_runes_orig[i-1] != '\\' && re_runes_orig[i-1] != '?')) {
@ -82,7 +85,9 @@ func shuntingYard(re string) []postfixNode {
fmt.Sscanf(tmpStr, "%d-%d", &rangeStart, &rangeEnd)
regex := range2regex(rangeStart, rangeEnd)
re_runes = append(re_runes, []rune(regex)...)
} else if c == '(' && i < len(re_runes_orig)-2 && re_runes_orig[i+1] == '?' && re_runes_orig[i+2] == ':' {
re_runes = append(re_runes, NONCAPLPAREN_CHAR)
i += 2
} else {
re_runes = append(re_runes, c)
}
@ -148,7 +153,11 @@ func shuntingYard(re string) []postfixNode {
}
re_postfix = append(re_postfix, re_runes[i]) // Append closing brace
}
if i < len(re_runes) && re_runes[i] == '(' && (i == 0 || re_runes[i-1] != '\\') && (i < len(re_runes)-1 && re_runes[i+1] == '?') { // Unescaped open parentheses followed by question mark = lokaround. Don't mess with it.
if i < len(re_runes)-3 && string(re_runes[i+1:i+4]) == "(?:" { // Non-capturing lparen
re_postfix = append(re_postfix, NONCAPLPAREN_CHAR)
i += 3
}
if i < len(re_runes) && re_runes[i] == '(' && (i == 0 || re_runes[i-1] != '\\') && (i < len(re_runes)-2 && re_runes[i+1] == '?' && slices.Contains([]rune{'=', '!', '<'}, re_runes[i+2])) { // Unescaped open parentheses followed by question mark then '<', '!' or '=' => lokaround. Don't mess with it.
i++ // Step inside
if i == len(re_runes)-1 || (re_runes[i+1] != '=' && re_runes[i+1] != '!' && re_runes[i+1] != '<') {
panic("Invalid regex. Lookaround intended?")
@ -174,7 +183,7 @@ func shuntingYard(re string) []postfixNode {
}
continue
}
if i < len(re_runes) && (re_runes[i] != '(' && re_runes[i] != '|' && re_runes[i] != '\\') || (i > 0 && re_runes[i-1] == '\\') { // Every character should be concatenated if it is escaped
if i < len(re_runes) && (re_runes[i] != '(' && re_runes[i] != NONCAPLPAREN_CHAR && re_runes[i] != '|' && re_runes[i] != '\\') || (i > 0 && re_runes[i-1] == '\\') { // Every character should be concatenated if it is escaped
if i < len(re_runes)-1 {
if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != '+' && re_runes[i+1] != '?' && re_runes[i+1] != ')' && re_runes[i+1] != '{' {
re_postfix = append(re_postfix, CONCAT)
@ -197,7 +206,7 @@ func shuntingYard(re string) []postfixNode {
b. If not, keep popping from opStack (and appending to outQueue) until:
i. opStack is empty, OR
ii. current character has greater priority than top of opStack
3. If current character is '(', push to opStack
3. If current character is '(' or NONCAPLPAREN_CHAR, push to opStack
4. If current character is ')', pop from opStack (and append to outQueue) until '(' is found. Discard parentheses.
5. If current character is '[', find all the characters until ']', then create a postfixNode containing all these contents. Add this node to outQueue.
6. If current character is '{', find the appropriate numeric specifier (range start, range end). Apply the range to the postfixNode at the end of outQueue.
@ -382,32 +391,35 @@ func shuntingYard(re string) []postfixNode {
}
idx := len(outQueue) - 1
// Get the most recently added non-paren node
for node := outQueue[idx]; idx >= 0 && (node.nodetype == RPAREN || node.nodetype == LPAREN); node = outQueue[idx] {
idx--
}
if idx < 0 {
// Get the last added node
if idx < 0 || outQueue[idx].nodetype == LPAREN {
panic("Numeric specifier with no content.")
}
outQueue[idx].startReps = startRangeNum
outQueue[idx].endReps = endRangeNum
}
if c == '(' {
if c == '(' || c == NONCAPLPAREN_CHAR {
opStack = append(opStack, c)
outQueue = append(outQueue, newPostfixNode(c))
if c == '(' { // We only push _capturing_ group parentheses to outQueue
outQueue = append(outQueue, newPostfixNode(c))
}
numOpenParens++
}
if c == ')' {
// Keep popping from opStack until we encounter an opening parantheses. Panic if we reach the end of the stack.
for val, err := peek(opStack); val != '('; val, err = peek(opStack) {
// Keep popping from opStack until we encounter an opening parenthesis or a NONCAPLPAREN_CHAR. Panic if we reach the end of the stack.
var val rune
var err error
for val, err = peek(opStack); val != '(' && val != NONCAPLPAREN_CHAR; val, err = peek(opStack) {
if err != nil {
panic("ERROR: Imbalanced parantheses.")
}
to_append := mustPop(&opStack)
outQueue = append(outQueue, newPostfixNode(to_append))
}
_ = mustPop(&opStack) // Get rid of opening parentheses
outQueue = append(outQueue, newPostfixNode(')')) // Add closing parentheses
_ = mustPop(&opStack) // Get rid of opening parentheses
if val == '(' { // Whatever was inside the parentheses was a _capturing_ group, so we append the closing parentheses as well
outQueue = append(outQueue, newPostfixNode(')')) // Add closing parentheses
}
numOpenParens--
}
}
@ -594,7 +606,8 @@ func main() {
multiLineFlag := flag.Bool("t", false, "Multi-line mode. Treats newline just like any character.")
printMatchesFlag := flag.Bool("p", false, "Prints start and end index of each match. Can only be used with '-t' for multi-line mode.")
caseInsensitiveFlag = flag.Bool("i", false, "Case-insensitive. Disregard the case of all characters.")
substituteText := flag.String("s", "", "Substitute the contents of each match with the given string. Overrides -o and -v")
matchNum := flag.Int("m", 0, "Print the match with the given index. Eg. -m 3 prints the third match.")
substituteText := flag.String("s", "", "Substitute the contents of each match with the given string. Overrides -o and -v")
flag.Parse()
// In multi-line mode, 'dot' metacharacter also matches newline
@ -607,14 +620,23 @@ func main() {
if *onlyFlag {
*lineFlag = false
}
// Check if substitute text has been enabled
// Check if substitute and matchNum flags have been enabled
substituteFlagEnabled := false
matchNumFlagEnabled := false
flag.Visit(func(f *flag.Flag) {
if f.Name == "s" {
substituteFlagEnabled = true
}
if f.Name == "m" {
matchNumFlagEnabled = true
}
})
// Validate matchNumFlag - must be positive integer
if matchNumFlagEnabled && *matchNum < 1 {
panic("Invalid match number to print.")
}
// Process:
// 1. Convert regex into postfix notation (Shunting-Yard algorithm)
// a. Add explicit concatenation operators to facilitate this

@ -60,8 +60,7 @@ func (g Group) isValid() bool {
// takeZeroState takes the 0-state (if such a transition exists) for all states in the
// given slice. It returns the resulting states. If any of the resulting states is a 0-state,
// the second ret val is true.
// The third ret val is a list of all the group numbers of all the opening parentheses we crossed,
// and the fourth is a list of all the closing parentheses we passed
// If a state begins or ends a capturing group, its 'thread' is updated to contain the correct index.
func takeZeroState(states []*State, numGroups int, idx int) (rtv []*State, isZero bool) {
for _, state := range states {
if len(state.transitions[EPSILON]) > 0 {
@ -94,11 +93,7 @@ func takeZeroState(states []*State, numGroups int, idx int) (rtv []*State, isZer
// from any of the given states, given the string and our position in it.
// It uses the same algorithm to find zero-states as the one inside the loop,
// so I should probably put it in a function.
// It also returns all the capturing groups that both begin and end at the current index.
// This is because, by definition, zero-states don't move forward in the string.
func zeroMatchPossible(str []rune, idx int, numGroups int, states ...*State) (bool, []int, []int) {
allOpenParenGroups := make([]int, 0)
allCloseParenGroups := make([]int, 0)
func zeroMatchPossible(str []rune, idx int, numGroups int, states ...*State) bool {
zeroStates, isZero := takeZeroState(states, numGroups, idx)
tempstates := make([]*State, 0, len(zeroStates)+len(states))
tempstates = append(tempstates, states...)
@ -113,10 +108,10 @@ func zeroMatchPossible(str []rune, idx int, numGroups int, states ...*State) (bo
}
for _, state := range tempstates {
if state.isEmpty && (state.assert == NONE || state.checkAssertion(str, idx)) && state.isLast {
return true, allOpenParenGroups, allCloseParenGroups
return true
}
}
return false, allOpenParenGroups, allCloseParenGroups
return false
}
// Prunes the slice by removing overlapping indices.
@ -173,28 +168,6 @@ func findAllMatchesHelper(start *State, str []rune, offset int, numGroups int) (
// The second value here shouldn't be used, because we should exit when the third return value is greater than len(str)
return false, []Group{}, offset
}
// 'Base case' - if we are at the end of the string, check if we can add a zero-length match
if offset == len(str) {
// Get all zero-state matches. If we can get to a zero-state without matching anything, we
// can add a zero-length match. This is all true only if the start state itself matches nothing.
if start.isEmpty {
to_return := newMatch(numGroups + 1)
if start.groupBegin {
to_return[start.groupNum].startIdx = offset
}
if ok, openGrps, closeGrps := zeroMatchPossible(str, offset, numGroups, start); ok {
for _, gIdx := range openGrps {
to_return[gIdx].startIdx = offset
}
for _, gIdx := range closeGrps {
to_return[gIdx].endIdx = offset
}
to_return[0] = Group{offset, offset}
return true, to_return, offset + 1
}
}
return false, []Group{}, offset + 1
}
// Hold a list of match indices for the current run. When we
// can no longer find a match, the match with the largest range is
@ -329,7 +302,7 @@ func findAllMatchesHelper(start *State, str []rune, offset int, numGroups int) (
// Check if we can find a zero-length match
if foundPath == false {
if ok, _, _ := zeroMatchPossible(str, i, numGroups, currentStates...); ok {
if ok := zeroMatchPossible(str, i, numGroups, currentStates...); ok {
if tempIndices[0].isValid() == false {
tempIndices[0] = Group{startIdx, startIdx}
}
@ -375,7 +348,7 @@ func findAllMatchesHelper(start *State, str []rune, offset int, numGroups int) (
for _, state := range currentStates {
// Only add the match if the start index is in bounds. If the state has an assertion,
// make sure the assertion checks out.
if state.isLast && startIdx < len(str) {
if state.isLast && i <= len(str) {
if state.assert == NONE || state.checkAssertion(str, i) {
for j := 1; j < numGroups+1; j++ {
tempIndices[j] = state.threadGroups[j]

@ -13,6 +13,7 @@ var RBRACKET rune = 0xF0001
var ANY_CHAR rune = 0xF0002 // Represents any character - used for states where the allChars flag is on.
var LPAREN_CHAR rune = 0xF0003 // Parentheses in regex are concatenated with this - it acts as a pseudo-parenthesis
var RPAREN_CHAR rune = 0xF0004
var NONCAPLPAREN_CHAR rune = 0xF0005 // Represents a non-capturing group's LPAREN
// Returns true if str[idx] and str[idx-1] are separated by a word boundary.
func isWordBoundary(str []rune, idx int) bool {
@ -26,7 +27,7 @@ func isWordBoundary(str []rune, idx int) bool {
// isNormalChar reports whether c is absent from the special-character set
// built below: the regex metacharacters in the raw string literal plus the
// sentinel runes appended to it. It returns false for any rune in that set.
func isNormalChar(c rune) bool {
specialChars := []rune(`?*\^${}()+|[].~<>`)
specialChars = append(specialChars, LBRACKET, RBRACKET)
specialChars = append(specialChars, LBRACKET, RBRACKET, NONCAPLPAREN_CHAR)
return !slices.Contains(specialChars, c)
}

@ -28,7 +28,6 @@ type State struct {
transitions map[int][]*State // Transitions to different states (maps a character (int representation) to a _list of states. This is useful if one character can lead multiple states eg. ab|aa)
isKleene bool // Identifies whether current node is a 0-state representing Kleene star
assert assertType // Type of assertion of current node - NONE means that the node doesn't assert anything
zeroMatchFound bool // Whether or not the state has been used for a zero-length match - only relevant for zero states
allChars bool // Whether or not the state represents all characters (eg. a 'dot' metacharacter). A 'dot' node doesn't store any contents directly, as it would take up too much space
except []rune // Only valid if allChars is true - match all characters _except_ the ones in this block. Useful for inverting character classes.
lookaroundRegex string // Only for lookaround states - Contents of the regex that the lookaround state holds
@ -37,7 +36,9 @@ type State struct {
groupBegin bool // Whether or not the node starts a capturing group
groupEnd bool // Whether or not the node ends a capturing group
groupNum int // Which capturing group the node starts / ends
threadGroups []Group // Assuming that a state is part of a 'thread' in the matching process, this array stores the indices of capturing groups in the current thread. As matches are found for this state, its groups will be copied over.
// The following properties depend on the current match - I should think about resetting them for every match.
zeroMatchFound bool // Whether or not the state has been used for a zero-length match - only relevant for zero states
threadGroups []Group // Assuming that a state is part of a 'thread' in the matching process, this array stores the indices of capturing groups in the current thread. As matches are found for this state, its groups will be copied over.
}
// Clones the NFA starting from the given state.
@ -118,15 +119,21 @@ func (s State) checkAssertion(str []rune, idx int) bool {
if s.isLookaround() {
// The process here is simple:
// 1. Compile the regex stored in the state's contents.
// 2. Run it on the test string.
// 2. Run it on a slice of the test string: for a lookahead (PLA/NLA), the part starting at the current index; otherwise, the part before the current index.
// 3. Based on the kind of lookaround (and the indices we get), determine what action to take.
startState := s.lookaroundNFA
matchIndices := findAllMatches(startState, str, startState.lookaroundNumCaptureGroups)
var strToMatch []rune
if s.assert == PLA || s.assert == NLA {
strToMatch = str[idx:]
} else {
strToMatch = str[:idx]
}
matchIndices := findAllMatches(startState, strToMatch, startState.lookaroundNumCaptureGroups)
numMatchesFound := 0
for _, matchIdx := range matchIndices {
if s.assert == PLA || s.assert == NLA { // Lookahead - return true (or false) if at least one match starts at the current index
if matchIdx[0].startIdx == idx {
if s.assert == PLA || s.assert == NLA { // Lookahead - return true (or false) if at least one match starts at 0. Zero is used because the test-string _starts_ from idx.
if matchIdx[0].startIdx == 0 {
numMatchesFound++
}
}

@ -149,14 +149,13 @@ var reTests = []struct {
{"^((3[7-9])|([4-9][0-9])|([1-9][0-9][0-9])|(1000))$", "400", []Group{{0, 3}}},
{"^((3[7-9])|([4-9][0-9])|([1-9][0-9][0-9])|(1000))$", "4000", []Group{}},
// Todo - add lookaround tests
// Lookaround tests
{"(?<=bo)y", "boy", []Group{{2, 3}}},
{"bo(?=y)", "boy", []Group{{0, 2}}},
// Todo - add numeric range tests
// Todo - add capturing group tests
{"(?<=f)f+(?=f)", "fffff", []Group{{1, 4}}},
{"(?<=f)f+(?=f)", "fffffa", []Group{{1, 4}}},
// Todo - add numeric range tests
}
var groupTests = []struct {
@ -179,6 +178,10 @@ var groupTests = []struct {
{"(aaa)|(aaaa)", "aaaa", []Match{[]Group{{0, 4}, {-1, -1}, {0, 4}}}},
{"(aaaa)|(aaa)", "aaaa", []Match{[]Group{{0, 4}, {0, 4}, {-1, -1}}}},
{"(a)|(aa)", "aa", []Match{[]Group{{0, 2}, {-1, -1}, {0, 2}}}},
{"(a?)a?", "b", []Match{[]Group{{0, 0}, {0, 0}}, []Group{{1, 1}, {1, 1}}}},
{"(a?)a?", "ab", []Match{[]Group{{0, 1}, {0, 1}}, []Group{{1, 1}, {1, 1}}, []Group{{2, 2}, {2, 2}}}},
{"(a?)a?", "aa", []Match{[]Group{{0, 2}, {0, 1}}, []Group{{2, 2}, {2, 2}}}},
{"a((b.d){3})", "abfdbhdbid", []Match{[]Group{{0, 10}, {1, 10}, {7, 10}}}},
}
func TestFindAllMatches(t *testing.T) {

@ -3,3 +3,4 @@
3. Fix adding concatenation operators in shunting-yard function (very janky, compares against operators individually)
Ideas for flags:
-m <num> : Print <num>th match (-m 1 = first match, -m 2 = second match)
-g <num> : Print the <num>th group

Loading…
Cancel
Save