Added initial support for capturing groups

master
Aadhavan Srinivasan 1 month ago
parent 745fab9639
commit 822d1f319f

@ -380,16 +380,21 @@ func shuntingYard(re string) []postfixNode {
}
}
}
node, err := pop(&outQueue)
if err != nil {
idx := len(outQueue) - 1
// Get the most recently added non-paren node
for node := outQueue[idx]; idx >= 0 && (node.nodetype == RPAREN || node.nodetype == LPAREN); node = outQueue[idx] {
idx--
}
if idx < 0 {
panic("Numeric specifier with no content.")
}
node.startReps = startRangeNum
node.endReps = endRangeNum
outQueue = append(outQueue, node)
outQueue[idx].startReps = startRangeNum
outQueue[idx].endReps = endRangeNum
}
if c == '(' {
opStack = append(opStack, c)
outQueue = append(outQueue, newPostfixNode(c))
numOpenParens++
}
if c == ')' {
@ -401,7 +406,8 @@ func shuntingYard(re string) []postfixNode {
to_append := mustPop(&opStack)
outQueue = append(outQueue, newPostfixNode(to_append))
}
_ = mustPop(&opStack) // Get rid of opening parantheses
_ = mustPop(&opStack) // Get rid of opening parentheses
outQueue = append(outQueue, newPostfixNode(')')) // Add closing parentheses
numOpenParens--
}
}
@ -420,9 +426,10 @@ func shuntingYard(re string) []postfixNode {
}
// Thompson's algorithm. Constructs a finite-state automaton from the given postfix nodes.
// Returns start state.
func thompson(re []postfixNode) *State {
// Returns start state and number of groups in regex.
func thompson(re []postfixNode) (*State, int) {
nfa := make([]*State, 0) // Stack of states
numGroups := 0 // Number of capturing groups
for _, c := range re {
if c.nodetype == CHARACTER || c.nodetype == ASSERTION {
state := State{}
@ -470,12 +477,45 @@ func thompson(re []postfixNode) *State {
}
}
tmpRe := shuntingYard(state.lookaroundRegex)
state.lookaroundNFA = thompson(tmpRe)
var numGroupsLookaround int
state.lookaroundNFA, numGroupsLookaround = thompson(tmpRe)
state.lookaroundNumCaptureGroups = numGroupsLookaround
}
}
nfa = append(nfa, &state)
}
if c.nodetype == LPAREN || c.nodetype == RPAREN {
s := &State{}
s.assert = NONE
s.content = newContents(EPSILON)
s.isEmpty = true
s.output = make([]*State, 0)
s.output = append(s.output, s)
s.transitions = make(map[int][]*State)
// LPAREN nodes are just added normally
if c.nodetype == LPAREN {
numGroups++
s.groupBegin = true
s.groupNum = numGroups
nfa = append(nfa, s)
continue
}
// For RPAREN nodes, this assumes the top two entries of the state stack are
// the group's contents (on top) and the matching LPAREN state beneath it.
// Those two are popped and concatenated with this RPAREN state, in the order
// LPAREN, contents, RPAREN, and the combined result is pushed back.
if c.nodetype == RPAREN {
s.groupEnd = true
middleNode := mustPop(&nfa)
lparenNode := mustPop(&nfa)
s.groupNum = lparenNode.groupNum
tmp := concatenate(lparenNode, middleNode)
to_add := concatenate(tmp, s)
nfa = append(nfa, to_add)
}
}
// Must be an operator if it isn't a character
switch c.nodetype {
case CONCATENATE:
@ -540,7 +580,7 @@ func thompson(re []postfixNode) *State {
verifyLastStates(nfa)
return nfa[0]
return nfa[0], numGroups
}
@ -597,7 +637,7 @@ func main() {
out := bufio.NewWriter(os.Stdout)
re_postfix := shuntingYard(re)
startState := thompson(re_postfix)
startState, numGroups := thompson(re_postfix)
for true {
if linesRead {
break
@ -613,6 +653,9 @@ func main() {
panic(err)
}
}
if len(test_str) > 0 && test_str[len(test_str)-1] == '\n' {
test_str = test_str[:len(test_str)-1]
}
} else {
// Multi-line mode - read every line of input into a temp. string.
// test_str will contain all lines of input (including newline characters)
@ -632,7 +675,7 @@ func main() {
}
}
test_runes = []rune(test_str)
matchIndices := findAllMatches(startState, test_runes)
matchIndices := findAllMatches(startState, test_runes, numGroups)
if *printMatchesFlag {
// if we are in single line mode, print the line on which
// the matches occur
@ -654,7 +697,7 @@ func main() {
// This should make checking O(1) instead of O(n)
indicesToPrint := new_uniq_arr[int]()
for _, idx := range matchIndices {
indicesToPrint.add(genRange(idx.startIdx, idx.endIdx)...)
indicesToPrint.add(genRange(idx[0].startIdx, idx[0].endIdx)...)
}
// If we are inverting, then we should print the indices which _didn't_ match
// in color.
@ -689,9 +732,9 @@ func main() {
for i := range test_runes {
inMatchIndex := false
for _, idx := range matchIndices {
if i == idx.startIdx {
if i == idx[0].startIdx {
fmt.Fprintf(out, "%s", *substituteText)
i = idx.endIdx
i = idx[0].endIdx
inMatchIndex = true
break
}
@ -707,7 +750,7 @@ func main() {
// Newline after every match - only if -o is enabled and -v is disabled.
if *onlyFlag && !(*invertFlag) {
for _, idx := range matchIndices {
if i+1 == idx.endIdx { // End index is one more than last index of match
if i+1 == idx[0].endIdx { // End index is one more than last index of match
fmt.Fprintf(out, "\n")
break
}
@ -724,5 +767,6 @@ func main() {
if err != nil {
panic(err)
}
fmt.Println()
}
}

Loading…
Cancel
Save