Added initial support for capturing groups

master
Aadhavan Srinivasan 2 weeks ago
parent 745fab9639
commit 822d1f319f

@ -380,16 +380,21 @@ func shuntingYard(re string) []postfixNode {
} }
} }
} }
node, err := pop(&outQueue)
if err != nil { idx := len(outQueue) - 1
// Get the most recently added non-paren node
for node := outQueue[idx]; idx >= 0 && (node.nodetype == RPAREN || node.nodetype == LPAREN); node = outQueue[idx] {
idx--
}
if idx < 0 {
panic("Numeric specifier with no content.") panic("Numeric specifier with no content.")
} }
node.startReps = startRangeNum outQueue[idx].startReps = startRangeNum
node.endReps = endRangeNum outQueue[idx].endReps = endRangeNum
outQueue = append(outQueue, node)
} }
if c == '(' { if c == '(' {
opStack = append(opStack, c) opStack = append(opStack, c)
outQueue = append(outQueue, newPostfixNode(c))
numOpenParens++ numOpenParens++
} }
if c == ')' { if c == ')' {
@ -401,7 +406,8 @@ func shuntingYard(re string) []postfixNode {
to_append := mustPop(&opStack) to_append := mustPop(&opStack)
outQueue = append(outQueue, newPostfixNode(to_append)) outQueue = append(outQueue, newPostfixNode(to_append))
} }
_ = mustPop(&opStack) // Get rid of opening parantheses _ = mustPop(&opStack) // Get rid of opening parentheses
outQueue = append(outQueue, newPostfixNode(')')) // Add closing parentheses
numOpenParens-- numOpenParens--
} }
} }
@ -420,9 +426,10 @@ func shuntingYard(re string) []postfixNode {
} }
// Thompson's algorithm. Constructs Finite-State Automaton from given string. // Thompson's algorithm. Constructs Finite-State Automaton from given string.
// Returns start state. // Returns start state and number of groups in regex.
func thompson(re []postfixNode) *State { func thompson(re []postfixNode) (*State, int) {
nfa := make([]*State, 0) // Stack of states nfa := make([]*State, 0) // Stack of states
numGroups := 0 // Number of capturing groups
for _, c := range re { for _, c := range re {
if c.nodetype == CHARACTER || c.nodetype == ASSERTION { if c.nodetype == CHARACTER || c.nodetype == ASSERTION {
state := State{} state := State{}
@ -470,12 +477,45 @@ func thompson(re []postfixNode) *State {
} }
} }
tmpRe := shuntingYard(state.lookaroundRegex) tmpRe := shuntingYard(state.lookaroundRegex)
state.lookaroundNFA = thompson(tmpRe) var numGroupsLookaround int
state.lookaroundNFA, numGroupsLookaround = thompson(tmpRe)
state.lookaroundNumCaptureGroups = numGroupsLookaround
} }
} }
nfa = append(nfa, &state) nfa = append(nfa, &state)
} }
if c.nodetype == LPAREN || c.nodetype == RPAREN {
s := &State{}
s.assert = NONE
s.content = newContents(EPSILON)
s.isEmpty = true
s.output = make([]*State, 0)
s.output = append(s.output, s)
s.transitions = make(map[int][]*State)
// LPAREN nodes are just added normally
if c.nodetype == LPAREN {
numGroups++
s.groupBegin = true
s.groupNum = numGroups
nfa = append(nfa, s)
continue
}
// For RPAREN nodes, I assume that the last two nodes in the list are an LPAREN,
// and then some other node.
// These three nodes (LPAREN, the middle node and RPAREN) are extracted together, concatenated
// and added back in.
if c.nodetype == RPAREN {
s.groupEnd = true
middleNode := mustPop(&nfa)
lparenNode := mustPop(&nfa)
s.groupNum = lparenNode.groupNum
tmp := concatenate(lparenNode, middleNode)
to_add := concatenate(tmp, s)
nfa = append(nfa, to_add)
}
}
// Must be an operator if it isn't a character // Must be an operator if it isn't a character
switch c.nodetype { switch c.nodetype {
case CONCATENATE: case CONCATENATE:
@ -540,7 +580,7 @@ func thompson(re []postfixNode) *State {
verifyLastStates(nfa) verifyLastStates(nfa)
return nfa[0] return nfa[0], numGroups
} }
@ -597,7 +637,7 @@ func main() {
out := bufio.NewWriter(os.Stdout) out := bufio.NewWriter(os.Stdout)
re_postfix := shuntingYard(re) re_postfix := shuntingYard(re)
startState := thompson(re_postfix) startState, numGroups := thompson(re_postfix)
for true { for true {
if linesRead { if linesRead {
break break
@ -613,6 +653,9 @@ func main() {
panic(err) panic(err)
} }
} }
if len(test_str) > 0 && test_str[len(test_str)-1] == '\n' {
test_str = test_str[:len(test_str)-1]
}
} else { } else {
// Multi-line mode - read every line of input into a temp. string. // Multi-line mode - read every line of input into a temp. string.
// test_str will contain all lines of input (including newline characters) // test_str will contain all lines of input (including newline characters)
@ -632,7 +675,7 @@ func main() {
} }
} }
test_runes = []rune(test_str) test_runes = []rune(test_str)
matchIndices := findAllMatches(startState, test_runes) matchIndices := findAllMatches(startState, test_runes, numGroups)
if *printMatchesFlag { if *printMatchesFlag {
// if we are in single line mode, print the line on which // if we are in single line mode, print the line on which
// the matches occur // the matches occur
@ -654,7 +697,7 @@ func main() {
// This should make checking O(1) instead of O(n) // This should make checking O(1) instead of O(n)
indicesToPrint := new_uniq_arr[int]() indicesToPrint := new_uniq_arr[int]()
for _, idx := range matchIndices { for _, idx := range matchIndices {
indicesToPrint.add(genRange(idx.startIdx, idx.endIdx)...) indicesToPrint.add(genRange(idx[0].startIdx, idx[0].endIdx)...)
} }
// If we are inverting, then we should print the indices which _didn't_ match // If we are inverting, then we should print the indices which _didn't_ match
// in color. // in color.
@ -689,9 +732,9 @@ func main() {
for i := range test_runes { for i := range test_runes {
inMatchIndex := false inMatchIndex := false
for _, idx := range matchIndices { for _, idx := range matchIndices {
if i == idx.startIdx { if i == idx[0].startIdx {
fmt.Fprintf(out, "%s", *substituteText) fmt.Fprintf(out, "%s", *substituteText)
i = idx.endIdx i = idx[0].endIdx
inMatchIndex = true inMatchIndex = true
break break
} }
@ -707,7 +750,7 @@ func main() {
// Newline after every match - only if -o is enabled and -v is disabled. // Newline after every match - only if -o is enabled and -v is disabled.
if *onlyFlag && !(*invertFlag) { if *onlyFlag && !(*invertFlag) {
for _, idx := range matchIndices { for _, idx := range matchIndices {
if i+1 == idx.endIdx { // End index is one more than last index of match if i+1 == idx[0].endIdx { // End index is one more than last index of match
fmt.Fprintf(out, "\n") fmt.Fprintf(out, "\n")
break break
} }
@ -724,5 +767,6 @@ func main() {
if err != nil { if err != nil {
panic(err) panic(err)
} }
fmt.Println()
} }
} }

Loading…
Cancel
Save