Compare commits

..

No commits in common. '13ca954072bf343aa8bdb5c1ca32e76625ab747d' and '3fda07280ed8edb1079ad32143ee617e7bb0e010' have entirely different histories.

@ -51,9 +51,6 @@ func shuntingYard(re string) []postfixNode {
// Eventually, I might be able to add it into the main parsing loop, to reduce the time // Eventually, I might be able to add it into the main parsing loop, to reduce the time
// complexity. // complexity.
// A numeric range has the syntax: <num1-num2>. It matches all numbers in this range. // A numeric range has the syntax: <num1-num2>. It matches all numbers in this range.
//
// Also check for non-capturing groups. The LPAREN of a non-capturing group looks like this: '(?:'
// I take this out, and put in a special character - NONCAPLPAREN_CHAR.
for i := 0; i < len(re_runes_orig); i++ { for i := 0; i < len(re_runes_orig); i++ {
c := re_runes_orig[i] c := re_runes_orig[i]
if c == '<' && (i == 0 || (re_runes_orig[i-1] != '\\' && re_runes_orig[i-1] != '?')) { if c == '<' && (i == 0 || (re_runes_orig[i-1] != '\\' && re_runes_orig[i-1] != '?')) {
@ -85,9 +82,7 @@ func shuntingYard(re string) []postfixNode {
fmt.Sscanf(tmpStr, "%d-%d", &rangeStart, &rangeEnd) fmt.Sscanf(tmpStr, "%d-%d", &rangeStart, &rangeEnd)
regex := range2regex(rangeStart, rangeEnd) regex := range2regex(rangeStart, rangeEnd)
re_runes = append(re_runes, []rune(regex)...) re_runes = append(re_runes, []rune(regex)...)
} else if c == '(' && i < len(re_runes_orig)-2 && re_runes_orig[i+1] == '?' && re_runes_orig[i+2] == ':' {
re_runes = append(re_runes, NONCAPLPAREN_CHAR)
i += 2
} else { } else {
re_runes = append(re_runes, c) re_runes = append(re_runes, c)
} }
@ -153,11 +148,7 @@ func shuntingYard(re string) []postfixNode {
} }
re_postfix = append(re_postfix, re_runes[i]) // Append closing brace re_postfix = append(re_postfix, re_runes[i]) // Append closing brace
} }
if i < len(re_runes)-3 && string(re_runes[i+1:i+4]) == "(?:" { // Non-capturing lparen if i < len(re_runes) && re_runes[i] == '(' && (i == 0 || re_runes[i-1] != '\\') && (i < len(re_runes)-1 && re_runes[i+1] == '?') { // Unescaped open parentheses followed by question mark = lokaround. Don't mess with it.
re_postfix = append(re_postfix, NONCAPLPAREN_CHAR)
i += 3
}
if i < len(re_runes) && re_runes[i] == '(' && (i == 0 || re_runes[i-1] != '\\') && (i < len(re_runes)-2 && re_runes[i+1] == '?' && slices.Contains([]rune{'=', '!', '<'}, re_runes[i+2])) { // Unescaped open parentheses followed by question mark then '<', '!' or '=' => lokaround. Don't mess with it.
i++ // Step inside i++ // Step inside
if i == len(re_runes)-1 || (re_runes[i+1] != '=' && re_runes[i+1] != '!' && re_runes[i+1] != '<') { if i == len(re_runes)-1 || (re_runes[i+1] != '=' && re_runes[i+1] != '!' && re_runes[i+1] != '<') {
panic("Invalid regex. Lookaround intended?") panic("Invalid regex. Lookaround intended?")
@ -183,7 +174,7 @@ func shuntingYard(re string) []postfixNode {
} }
continue continue
} }
if i < len(re_runes) && (re_runes[i] != '(' && re_runes[i] != NONCAPLPAREN_CHAR && re_runes[i] != '|' && re_runes[i] != '\\') || (i > 0 && re_runes[i-1] == '\\') { // Every character should be concatenated if it is escaped if i < len(re_runes) && (re_runes[i] != '(' && re_runes[i] != '|' && re_runes[i] != '\\') || (i > 0 && re_runes[i-1] == '\\') { // Every character should be concatenated if it is escaped
if i < len(re_runes)-1 { if i < len(re_runes)-1 {
if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != '+' && re_runes[i+1] != '?' && re_runes[i+1] != ')' && re_runes[i+1] != '{' { if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != '+' && re_runes[i+1] != '?' && re_runes[i+1] != ')' && re_runes[i+1] != '{' {
re_postfix = append(re_postfix, CONCAT) re_postfix = append(re_postfix, CONCAT)
@ -206,7 +197,7 @@ func shuntingYard(re string) []postfixNode {
b. If not, keep popping from opStack (and appending to outQueue) until: b. If not, keep popping from opStack (and appending to outQueue) until:
i. opStack is empty, OR i. opStack is empty, OR
ii. current character has greater priority than top of opStack ii. current character has greater priority than top of opStack
3. If current character is '(' or NONCAPLPAREN_CHAR, push to opStack 3. If current character is '(', push to opStack
4. If current character is ')', pop from opStack (and append to outQueue) until '(' is found. Discard parentheses. 4. If current character is ')', pop from opStack (and append to outQueue) until '(' is found. Discard parentheses.
5. If current character is '[', find all the characters until ']', then create a postfixNode containing all these contents. Add this node to outQueue. 5. If current character is '[', find all the characters until ']', then create a postfixNode containing all these contents. Add this node to outQueue.
6. If current character is '{', find the appropriate numeric specifier (range start, range end). Apply the range to the postfixNode at the end of outQueue. 6. If current character is '{', find the appropriate numeric specifier (range start, range end). Apply the range to the postfixNode at the end of outQueue.
@ -391,25 +382,24 @@ func shuntingYard(re string) []postfixNode {
} }
idx := len(outQueue) - 1 idx := len(outQueue) - 1
// Get the last added node // Get the most recently added non-paren node
if idx < 0 || outQueue[idx].nodetype == LPAREN { for node := outQueue[idx]; idx >= 0 && (node.nodetype == RPAREN || node.nodetype == LPAREN); node = outQueue[idx] {
idx--
}
if idx < 0 {
panic("Numeric specifier with no content.") panic("Numeric specifier with no content.")
} }
outQueue[idx].startReps = startRangeNum outQueue[idx].startReps = startRangeNum
outQueue[idx].endReps = endRangeNum outQueue[idx].endReps = endRangeNum
} }
if c == '(' || c == NONCAPLPAREN_CHAR { if c == '(' {
opStack = append(opStack, c) opStack = append(opStack, c)
if c == '(' { // We only push _capturing_ group parentheses to outQueue
outQueue = append(outQueue, newPostfixNode(c)) outQueue = append(outQueue, newPostfixNode(c))
}
numOpenParens++ numOpenParens++
} }
if c == ')' { if c == ')' {
// Keep popping from opStack until we encounter an opening parenthesis or a NONCAPLPAREN_CHAR. Panic if we reach the end of the stack. // Keep popping from opStack until we encounter an opening parenthesis. Panic if we reach the end of the stack.
var val rune for val, err := peek(opStack); val != '('; val, err = peek(opStack) {
var err error
for val, err = peek(opStack); val != '(' && val != NONCAPLPAREN_CHAR; val, err = peek(opStack) {
if err != nil { if err != nil {
panic("ERROR: Imbalanced parantheses.") panic("ERROR: Imbalanced parantheses.")
} }
@ -417,9 +407,7 @@ func shuntingYard(re string) []postfixNode {
outQueue = append(outQueue, newPostfixNode(to_append)) outQueue = append(outQueue, newPostfixNode(to_append))
} }
_ = mustPop(&opStack) // Get rid of opening parentheses _ = mustPop(&opStack) // Get rid of opening parentheses
if val == '(' { // Whatever was inside the parentheses was a _capturing_ group, so we append the closing parentheses as well
outQueue = append(outQueue, newPostfixNode(')')) // Add closing parentheses outQueue = append(outQueue, newPostfixNode(')')) // Add closing parentheses
}
numOpenParens-- numOpenParens--
} }
} }
@ -606,7 +594,6 @@ func main() {
multiLineFlag := flag.Bool("t", false, "Multi-line mode. Treats newline just like any character.") multiLineFlag := flag.Bool("t", false, "Multi-line mode. Treats newline just like any character.")
printMatchesFlag := flag.Bool("p", false, "Prints start and end index of each match. Can only be used with '-t' for multi-line mode.") printMatchesFlag := flag.Bool("p", false, "Prints start and end index of each match. Can only be used with '-t' for multi-line mode.")
caseInsensitiveFlag = flag.Bool("i", false, "Case-insensitive. Disregard the case of all characters.") caseInsensitiveFlag = flag.Bool("i", false, "Case-insensitive. Disregard the case of all characters.")
matchNum := flag.Int("m", 0, "Print the match with the given index. Eg. -m 3 prints the third match.")
substituteText := flag.String("s", "", "Substitute the contents of each match with the given string. Overrides -o and -v") substituteText := flag.String("s", "", "Substitute the contents of each match with the given string. Overrides -o and -v")
flag.Parse() flag.Parse()
@ -620,23 +607,14 @@ func main() {
if *onlyFlag { if *onlyFlag {
*lineFlag = false *lineFlag = false
} }
// Check if substitute and matchNum flags have been enabled // Check if substitute text has been enabled
substituteFlagEnabled := false substituteFlagEnabled := false
matchNumFlagEnabled := false
flag.Visit(func(f *flag.Flag) { flag.Visit(func(f *flag.Flag) {
if f.Name == "s" { if f.Name == "s" {
substituteFlagEnabled = true substituteFlagEnabled = true
} }
if f.Name == "m" {
matchNumFlagEnabled = true
}
}) })
// Validate matchNumFlag - must be positive integer
if matchNumFlagEnabled && *matchNum < 1 {
panic("Invalid match number to print.")
}
// Process: // Process:
// 1. Convert regex into postfix notation (Shunting-Yard algorithm) // 1. Convert regex into postfix notation (Shunting-Yard algorithm)
// a. Add explicit concatenation operators to facilitate this // a. Add explicit concatenation operators to facilitate this

@ -60,7 +60,8 @@ func (g Group) isValid() bool {
// takeZeroState takes the 0-state (if such a transition exists) for all states in the // takeZeroState takes the 0-state (if such a transition exists) for all states in the
// given slice. It returns the resulting states. If any of the resulting states is a 0-state, // given slice. It returns the resulting states. If any of the resulting states is a 0-state,
// the second ret val is true. // the second ret val is true.
// If a state begins or ends a capturing group, its 'thread' is updated to contain the correct index. // The third ret val is a list of all the group numbers of all the opening parentheses we crossed,
// and the fourth is a list of all the closing parentheses we passed
func takeZeroState(states []*State, numGroups int, idx int) (rtv []*State, isZero bool) { func takeZeroState(states []*State, numGroups int, idx int) (rtv []*State, isZero bool) {
for _, state := range states { for _, state := range states {
if len(state.transitions[EPSILON]) > 0 { if len(state.transitions[EPSILON]) > 0 {
@ -93,7 +94,11 @@ func takeZeroState(states []*State, numGroups int, idx int) (rtv []*State, isZer
// from any of the given states, given the string and our position in it. // from any of the given states, given the string and our position in it.
// It uses the same algorithm to find zero-states as the one inside the loop, // It uses the same algorithm to find zero-states as the one inside the loop,
// so I should probably put it in a function. // so I should probably put it in a function.
func zeroMatchPossible(str []rune, idx int, numGroups int, states ...*State) bool { // It also returns all the capturing groups that both begin and end at the current index.
// This is because, by definition, zero-states don't move forward in the string.
func zeroMatchPossible(str []rune, idx int, numGroups int, states ...*State) (bool, []int, []int) {
allOpenParenGroups := make([]int, 0)
allCloseParenGroups := make([]int, 0)
zeroStates, isZero := takeZeroState(states, numGroups, idx) zeroStates, isZero := takeZeroState(states, numGroups, idx)
tempstates := make([]*State, 0, len(zeroStates)+len(states)) tempstates := make([]*State, 0, len(zeroStates)+len(states))
tempstates = append(tempstates, states...) tempstates = append(tempstates, states...)
@ -108,10 +113,10 @@ func zeroMatchPossible(str []rune, idx int, numGroups int, states ...*State) boo
} }
for _, state := range tempstates { for _, state := range tempstates {
if state.isEmpty && (state.assert == NONE || state.checkAssertion(str, idx)) && state.isLast { if state.isEmpty && (state.assert == NONE || state.checkAssertion(str, idx)) && state.isLast {
return true return true, allOpenParenGroups, allCloseParenGroups
} }
} }
return false return false, allOpenParenGroups, allCloseParenGroups
} }
// Prunes the slice by removing overlapping indices. // Prunes the slice by removing overlapping indices.
@ -168,6 +173,28 @@ func findAllMatchesHelper(start *State, str []rune, offset int, numGroups int) (
// The second value here shouldn't be used, because we should exit when the third return value is > than len(str) // The second value here shouldn't be used, because we should exit when the third return value is > than len(str)
return false, []Group{}, offset return false, []Group{}, offset
} }
// 'Base case' - if we are at the end of the string, check if we can add a zero-length match
if offset == len(str) {
// Get all zero-state matches. If we can get to a zero-state without matching anything, we
// can add a zero-length match. This is all true only if the start state itself matches nothing.
if start.isEmpty {
to_return := newMatch(numGroups + 1)
if start.groupBegin {
to_return[start.groupNum].startIdx = offset
}
if ok, openGrps, closeGrps := zeroMatchPossible(str, offset, numGroups, start); ok {
for _, gIdx := range openGrps {
to_return[gIdx].startIdx = offset
}
for _, gIdx := range closeGrps {
to_return[gIdx].endIdx = offset
}
to_return[0] = Group{offset, offset}
return true, to_return, offset + 1
}
}
return false, []Group{}, offset + 1
}
// Hold a list of match indices for the current run. When we // Hold a list of match indices for the current run. When we
// can no longer find a match, the match with the largest range is // can no longer find a match, the match with the largest range is
@ -302,7 +329,7 @@ func findAllMatchesHelper(start *State, str []rune, offset int, numGroups int) (
// Check if we can find a zero-length match // Check if we can find a zero-length match
if foundPath == false { if foundPath == false {
if ok := zeroMatchPossible(str, i, numGroups, currentStates...); ok { if ok, _, _ := zeroMatchPossible(str, i, numGroups, currentStates...); ok {
if tempIndices[0].isValid() == false { if tempIndices[0].isValid() == false {
tempIndices[0] = Group{startIdx, startIdx} tempIndices[0] = Group{startIdx, startIdx}
} }
@ -348,7 +375,7 @@ func findAllMatchesHelper(start *State, str []rune, offset int, numGroups int) (
for _, state := range currentStates { for _, state := range currentStates {
// Only add the match if the start index is in bounds. If the state has an assertion, // Only add the match if the start index is in bounds. If the state has an assertion,
// make sure the assertion checks out. // make sure the assertion checks out.
if state.isLast && i <= len(str) { if state.isLast && startIdx < len(str) {
if state.assert == NONE || state.checkAssertion(str, i) { if state.assert == NONE || state.checkAssertion(str, i) {
for j := 1; j < numGroups+1; j++ { for j := 1; j < numGroups+1; j++ {
tempIndices[j] = state.threadGroups[j] tempIndices[j] = state.threadGroups[j]

@ -13,7 +13,6 @@ var RBRACKET rune = 0xF0001
var ANY_CHAR rune = 0xF0002 // Represents any character - used for states where the allChars flag is on. var ANY_CHAR rune = 0xF0002 // Represents any character - used for states where the allChars flag is on.
var LPAREN_CHAR rune = 0xF0003 // Parentheses in regex are concatenated with this - it acts as a pseudio-parentheses var LPAREN_CHAR rune = 0xF0003 // Parentheses in regex are concatenated with this - it acts as a pseudio-parentheses
var RPAREN_CHAR rune = 0xF0004 var RPAREN_CHAR rune = 0xF0004
var NONCAPLPAREN_CHAR rune = 0xF0005 // Represents a non-capturing group's LPAREN
// Returns true if str[idx] and str[idx-1] are separated by a word boundary. // Returns true if str[idx] and str[idx-1] are separated by a word boundary.
func isWordBoundary(str []rune, idx int) bool { func isWordBoundary(str []rune, idx int) bool {
@ -27,7 +26,7 @@ func isWordBoundary(str []rune, idx int) bool {
func isNormalChar(c rune) bool { func isNormalChar(c rune) bool {
specialChars := []rune(`?*\^${}()+|[].~<>`) specialChars := []rune(`?*\^${}()+|[].~<>`)
specialChars = append(specialChars, LBRACKET, RBRACKET, NONCAPLPAREN_CHAR) specialChars = append(specialChars, LBRACKET, RBRACKET)
return !slices.Contains(specialChars, c) return !slices.Contains(specialChars, c)
} }

@ -28,6 +28,7 @@ type State struct {
transitions map[int][]*State // Transitions to different states (maps a character (int representation) to a _list of states. This is useful if one character can lead multiple states eg. ab|aa) transitions map[int][]*State // Transitions to different states (maps a character (int representation) to a _list of states. This is useful if one character can lead multiple states eg. ab|aa)
isKleene bool // Identifies whether current node is a 0-state representing Kleene star isKleene bool // Identifies whether current node is a 0-state representing Kleene star
assert assertType // Type of assertion of current node - NONE means that the node doesn't assert anything assert assertType // Type of assertion of current node - NONE means that the node doesn't assert anything
zeroMatchFound bool // Whether or not the state has been used for a zero-length match - only relevant for zero states
allChars bool // Whether or not the state represents all characters (eg. a 'dot' metacharacter). A 'dot' node doesn't store any contents directly, as it would take up too much space allChars bool // Whether or not the state represents all characters (eg. a 'dot' metacharacter). A 'dot' node doesn't store any contents directly, as it would take up too much space
except []rune // Only valid if allChars is true - match all characters _except_ the ones in this block. Useful for inverting character classes. except []rune // Only valid if allChars is true - match all characters _except_ the ones in this block. Useful for inverting character classes.
lookaroundRegex string // Only for lookaround states - Contents of the regex that the lookaround state holds lookaroundRegex string // Only for lookaround states - Contents of the regex that the lookaround state holds
@ -36,8 +37,6 @@ type State struct {
groupBegin bool // Whether or not the node starts a capturing group groupBegin bool // Whether or not the node starts a capturing group
groupEnd bool // Whether or not the node ends a capturing group groupEnd bool // Whether or not the node ends a capturing group
groupNum int // Which capturing group the node starts / ends groupNum int // Which capturing group the node starts / ends
// The following properties depend on the current match - I should think about resetting them for every match.
zeroMatchFound bool // Whether or not the state has been used for a zero-length match - only relevant for zero states
threadGroups []Group // Assuming that a state is part of a 'thread' in the matching process, this array stores the indices of capturing groups in the current thread. As matches are found for this state, its groups will be copied over. threadGroups []Group // Assuming that a state is part of a 'thread' in the matching process, this array stores the indices of capturing groups in the current thread. As matches are found for this state, its groups will be copied over.
} }
@ -119,21 +118,15 @@ func (s State) checkAssertion(str []rune, idx int) bool {
if s.isLookaround() { if s.isLookaround() {
// The process here is simple: // The process here is simple:
// 1. Compile the regex stored in the state's contents. // 1. Compile the regex stored in the state's contents.
// 2. Run it on a subset of the test string, that ends after the current index in the string // 2. Run it on the test string.
// 3. Based on the kind of lookaround (and the indices we get), determine what action to take. // 3. Based on the kind of lookaround (and the indices we get), determine what action to take.
startState := s.lookaroundNFA startState := s.lookaroundNFA
var strToMatch []rune matchIndices := findAllMatches(startState, str, startState.lookaroundNumCaptureGroups)
if s.assert == PLA || s.assert == NLA {
strToMatch = str[idx:]
} else {
strToMatch = str[:idx]
}
matchIndices := findAllMatches(startState, strToMatch, startState.lookaroundNumCaptureGroups)
numMatchesFound := 0 numMatchesFound := 0
for _, matchIdx := range matchIndices { for _, matchIdx := range matchIndices {
if s.assert == PLA || s.assert == NLA { // Lookahead - return true (or false) if at least one match starts at 0. Zero is used because the test-string _starts_ from idx. if s.assert == PLA || s.assert == NLA { // Lookahead - return true (or false) if at least one match starts at the current index
if matchIdx[0].startIdx == 0 { if matchIdx[0].startIdx == idx {
numMatchesFound++ numMatchesFound++
} }
} }

@ -149,13 +149,14 @@ var reTests = []struct {
{"^((3[7-9])|([4-9][0-9])|([1-9][0-9][0-9])|(1000))$", "400", []Group{{0, 3}}}, {"^((3[7-9])|([4-9][0-9])|([1-9][0-9][0-9])|(1000))$", "400", []Group{{0, 3}}},
{"^((3[7-9])|([4-9][0-9])|([1-9][0-9][0-9])|(1000))$", "4000", []Group{}}, {"^((3[7-9])|([4-9][0-9])|([1-9][0-9][0-9])|(1000))$", "4000", []Group{}},
// Lookaround tests // Todo - add lookaround tests
{"(?<=bo)y", "boy", []Group{{2, 3}}}, {"(?<=bo)y", "boy", []Group{{2, 3}}},
{"bo(?=y)", "boy", []Group{{0, 2}}}, {"bo(?=y)", "boy", []Group{{0, 2}}},
{"(?<=f)f+(?=f)", "fffff", []Group{{1, 4}}},
{"(?<=f)f+(?=f)", "fffffa", []Group{{1, 4}}},
// Todo - add numeric range tests // Todo - add numeric range tests
// Todo - add capturing group tests
{"(?<=f)f+(?=f)", "fffff", []Group{{1, 4}}},
{"(?<=f)f+(?=f)", "fffffa", []Group{{1, 4}}},
} }
var groupTests = []struct { var groupTests = []struct {
@ -178,10 +179,6 @@ var groupTests = []struct {
{"(aaa)|(aaaa)", "aaaa", []Match{[]Group{{0, 4}, {-1, -1}, {0, 4}}}}, {"(aaa)|(aaaa)", "aaaa", []Match{[]Group{{0, 4}, {-1, -1}, {0, 4}}}},
{"(aaaa)|(aaa)", "aaaa", []Match{[]Group{{0, 4}, {0, 4}, {-1, -1}}}}, {"(aaaa)|(aaa)", "aaaa", []Match{[]Group{{0, 4}, {0, 4}, {-1, -1}}}},
{"(a)|(aa)", "aa", []Match{[]Group{{0, 2}, {-1, -1}, {0, 2}}}}, {"(a)|(aa)", "aa", []Match{[]Group{{0, 2}, {-1, -1}, {0, 2}}}},
{"(a?)a?", "b", []Match{[]Group{{0, 0}, {0, 0}}, []Group{{1, 1}, {1, 1}}}},
{"(a?)a?", "ab", []Match{[]Group{{0, 1}, {0, 1}}, []Group{{1, 1}, {1, 1}}, []Group{{2, 2}, {2, 2}}}},
{"(a?)a?", "aa", []Match{[]Group{{0, 2}, {0, 1}}, []Group{{2, 2}, {2, 2}}}},
{"a((b.d){3})", "abfdbhdbid", []Match{[]Group{{0, 10}, {1, 10}, {7, 10}}}},
} }
func TestFindAllMatches(t *testing.T) { func TestFindAllMatches(t *testing.T) {

@ -3,4 +3,3 @@
3. Fix adding concatenation operators in shunting-yard function (very janky, compares against operators individually) 3. Fix adding concatenation operators in shunting-yard function (very janky, compares against operators individually)
Ideas for flags: Ideas for flags:
-m <num> : Print <num>th match (-m 1 = first match, -m 2 = second match) -m <num> : Print <num>th match (-m 1 = first match, -m 2 = second match)
-g <num> : Print the <num>th group

Loading…
Cancel
Save