Added initial support for capturing groups
This commit is contained in:
76
main.go
76
main.go
@@ -380,16 +380,21 @@ func shuntingYard(re string) []postfixNode {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
node, err := pop(&outQueue)
|
|
||||||
if err != nil {
|
idx := len(outQueue) - 1
|
||||||
|
// Get the most recently added non-paren node
|
||||||
|
for node := outQueue[idx]; idx >= 0 && (node.nodetype == RPAREN || node.nodetype == LPAREN); node = outQueue[idx] {
|
||||||
|
idx--
|
||||||
|
}
|
||||||
|
if idx < 0 {
|
||||||
panic("Numeric specifier with no content.")
|
panic("Numeric specifier with no content.")
|
||||||
}
|
}
|
||||||
node.startReps = startRangeNum
|
outQueue[idx].startReps = startRangeNum
|
||||||
node.endReps = endRangeNum
|
outQueue[idx].endReps = endRangeNum
|
||||||
outQueue = append(outQueue, node)
|
|
||||||
}
|
}
|
||||||
if c == '(' {
|
if c == '(' {
|
||||||
opStack = append(opStack, c)
|
opStack = append(opStack, c)
|
||||||
|
outQueue = append(outQueue, newPostfixNode(c))
|
||||||
numOpenParens++
|
numOpenParens++
|
||||||
}
|
}
|
||||||
if c == ')' {
|
if c == ')' {
|
||||||
@@ -401,7 +406,8 @@ func shuntingYard(re string) []postfixNode {
|
|||||||
to_append := mustPop(&opStack)
|
to_append := mustPop(&opStack)
|
||||||
outQueue = append(outQueue, newPostfixNode(to_append))
|
outQueue = append(outQueue, newPostfixNode(to_append))
|
||||||
}
|
}
|
||||||
_ = mustPop(&opStack) // Get rid of opening parantheses
|
_ = mustPop(&opStack) // Get rid of opening parentheses
|
||||||
|
outQueue = append(outQueue, newPostfixNode(')')) // Add closing parentheses
|
||||||
numOpenParens--
|
numOpenParens--
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -420,9 +426,10 @@ func shuntingYard(re string) []postfixNode {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Thompson's algorithm. Constructs Finite-State Automaton from given string.
|
// Thompson's algorithm. Constructs Finite-State Automaton from given string.
|
||||||
// Returns start state.
|
// Returns start state and number of groups in regex.
|
||||||
func thompson(re []postfixNode) *State {
|
func thompson(re []postfixNode) (*State, int) {
|
||||||
nfa := make([]*State, 0) // Stack of states
|
nfa := make([]*State, 0) // Stack of states
|
||||||
|
numGroups := 0 // Number of capturing groups
|
||||||
for _, c := range re {
|
for _, c := range re {
|
||||||
if c.nodetype == CHARACTER || c.nodetype == ASSERTION {
|
if c.nodetype == CHARACTER || c.nodetype == ASSERTION {
|
||||||
state := State{}
|
state := State{}
|
||||||
@@ -470,12 +477,45 @@ func thompson(re []postfixNode) *State {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
tmpRe := shuntingYard(state.lookaroundRegex)
|
tmpRe := shuntingYard(state.lookaroundRegex)
|
||||||
state.lookaroundNFA = thompson(tmpRe)
|
var numGroupsLookaround int
|
||||||
|
state.lookaroundNFA, numGroupsLookaround = thompson(tmpRe)
|
||||||
|
state.lookaroundNumCaptureGroups = numGroupsLookaround
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
nfa = append(nfa, &state)
|
nfa = append(nfa, &state)
|
||||||
}
|
}
|
||||||
|
if c.nodetype == LPAREN || c.nodetype == RPAREN {
|
||||||
|
s := &State{}
|
||||||
|
s.assert = NONE
|
||||||
|
s.content = newContents(EPSILON)
|
||||||
|
s.isEmpty = true
|
||||||
|
s.output = make([]*State, 0)
|
||||||
|
s.output = append(s.output, s)
|
||||||
|
s.transitions = make(map[int][]*State)
|
||||||
|
// LPAREN nodes are just added normally
|
||||||
|
if c.nodetype == LPAREN {
|
||||||
|
numGroups++
|
||||||
|
s.groupBegin = true
|
||||||
|
s.groupNum = numGroups
|
||||||
|
nfa = append(nfa, s)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// For RPAREN nodes, I assume that the last two nodes in the list are an LPAREN,
|
||||||
|
// and then some other node.
|
||||||
|
// These three nodes (LPAREN, the middle node and RPAREN) are extracted together, concatenated
|
||||||
|
// and added back in.
|
||||||
|
if c.nodetype == RPAREN {
|
||||||
|
s.groupEnd = true
|
||||||
|
middleNode := mustPop(&nfa)
|
||||||
|
lparenNode := mustPop(&nfa)
|
||||||
|
s.groupNum = lparenNode.groupNum
|
||||||
|
tmp := concatenate(lparenNode, middleNode)
|
||||||
|
to_add := concatenate(tmp, s)
|
||||||
|
nfa = append(nfa, to_add)
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
// Must be an operator if it isn't a character
|
// Must be an operator if it isn't a character
|
||||||
switch c.nodetype {
|
switch c.nodetype {
|
||||||
case CONCATENATE:
|
case CONCATENATE:
|
||||||
@@ -540,7 +580,7 @@ func thompson(re []postfixNode) *State {
|
|||||||
|
|
||||||
verifyLastStates(nfa)
|
verifyLastStates(nfa)
|
||||||
|
|
||||||
return nfa[0]
|
return nfa[0], numGroups
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -597,7 +637,7 @@ func main() {
|
|||||||
out := bufio.NewWriter(os.Stdout)
|
out := bufio.NewWriter(os.Stdout)
|
||||||
|
|
||||||
re_postfix := shuntingYard(re)
|
re_postfix := shuntingYard(re)
|
||||||
startState := thompson(re_postfix)
|
startState, numGroups := thompson(re_postfix)
|
||||||
for true {
|
for true {
|
||||||
if linesRead {
|
if linesRead {
|
||||||
break
|
break
|
||||||
@@ -613,6 +653,9 @@ func main() {
|
|||||||
panic(err)
|
panic(err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if len(test_str) > 0 && test_str[len(test_str)-1] == '\n' {
|
||||||
|
test_str = test_str[:len(test_str)-1]
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
// Multi-line mode - read every line of input into a temp. string.
|
// Multi-line mode - read every line of input into a temp. string.
|
||||||
// test_str will contain all lines of input (including newline characters)
|
// test_str will contain all lines of input (including newline characters)
|
||||||
@@ -632,7 +675,7 @@ func main() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
test_runes = []rune(test_str)
|
test_runes = []rune(test_str)
|
||||||
matchIndices := findAllMatches(startState, test_runes)
|
matchIndices := findAllMatches(startState, test_runes, numGroups)
|
||||||
if *printMatchesFlag {
|
if *printMatchesFlag {
|
||||||
// if we are in single line mode, print the line on which
|
// if we are in single line mode, print the line on which
|
||||||
// the matches occur
|
// the matches occur
|
||||||
@@ -654,7 +697,7 @@ func main() {
|
|||||||
// This should make checking O(1) instead of O(n)
|
// This should make checking O(1) instead of O(n)
|
||||||
indicesToPrint := new_uniq_arr[int]()
|
indicesToPrint := new_uniq_arr[int]()
|
||||||
for _, idx := range matchIndices {
|
for _, idx := range matchIndices {
|
||||||
indicesToPrint.add(genRange(idx.startIdx, idx.endIdx)...)
|
indicesToPrint.add(genRange(idx[0].startIdx, idx[0].endIdx)...)
|
||||||
}
|
}
|
||||||
// If we are inverting, then we should print the indices which _didn't_ match
|
// If we are inverting, then we should print the indices which _didn't_ match
|
||||||
// in color.
|
// in color.
|
||||||
@@ -689,9 +732,9 @@ func main() {
|
|||||||
for i := range test_runes {
|
for i := range test_runes {
|
||||||
inMatchIndex := false
|
inMatchIndex := false
|
||||||
for _, idx := range matchIndices {
|
for _, idx := range matchIndices {
|
||||||
if i == idx.startIdx {
|
if i == idx[0].startIdx {
|
||||||
fmt.Fprintf(out, "%s", *substituteText)
|
fmt.Fprintf(out, "%s", *substituteText)
|
||||||
i = idx.endIdx
|
i = idx[0].endIdx
|
||||||
inMatchIndex = true
|
inMatchIndex = true
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
@@ -707,7 +750,7 @@ func main() {
|
|||||||
// Newline after every match - only if -o is enabled and -v is disabled.
|
// Newline after every match - only if -o is enabled and -v is disabled.
|
||||||
if *onlyFlag && !(*invertFlag) {
|
if *onlyFlag && !(*invertFlag) {
|
||||||
for _, idx := range matchIndices {
|
for _, idx := range matchIndices {
|
||||||
if i+1 == idx.endIdx { // End index is one more than last index of match
|
if i+1 == idx[0].endIdx { // End index is one more than last index of match
|
||||||
fmt.Fprintf(out, "\n")
|
fmt.Fprintf(out, "\n")
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
@@ -724,5 +767,6 @@ func main() {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
panic(err)
|
panic(err)
|
||||||
}
|
}
|
||||||
|
fmt.Println()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user