Added initial support for capturing groups
This commit is contained in:
		
							
								
								
									
										76
									
								
								main.go
									
									
									
									
									
								
							
							
						
						
									
										76
									
								
								main.go
									
									
									
									
									
								
							| @@ -380,16 +380,21 @@ func shuntingYard(re string) []postfixNode { | ||||
| 					} | ||||
| 				} | ||||
| 			} | ||||
| 			node, err := pop(&outQueue) | ||||
| 			if err != nil { | ||||
|  | ||||
| 			idx := len(outQueue) - 1 | ||||
| 			// Get the most recently added non-paren node | ||||
| 			for node := outQueue[idx]; idx >= 0 && (node.nodetype == RPAREN || node.nodetype == LPAREN); node = outQueue[idx] { | ||||
| 				idx-- | ||||
| 			} | ||||
| 			if idx < 0 { | ||||
| 				panic("Numeric specifier with no content.") | ||||
| 			} | ||||
| 			node.startReps = startRangeNum | ||||
| 			node.endReps = endRangeNum | ||||
| 			outQueue = append(outQueue, node) | ||||
| 			outQueue[idx].startReps = startRangeNum | ||||
| 			outQueue[idx].endReps = endRangeNum | ||||
| 		} | ||||
| 		if c == '(' { | ||||
| 			opStack = append(opStack, c) | ||||
| 			outQueue = append(outQueue, newPostfixNode(c)) | ||||
| 			numOpenParens++ | ||||
| 		} | ||||
| 		if c == ')' { | ||||
| @@ -401,7 +406,8 @@ func shuntingYard(re string) []postfixNode { | ||||
| 				to_append := mustPop(&opStack) | ||||
| 				outQueue = append(outQueue, newPostfixNode(to_append)) | ||||
| 			} | ||||
| 			_ = mustPop(&opStack) // Get rid of opening parantheses | ||||
| 			_ = mustPop(&opStack)                            // Get rid of opening parentheses | ||||
| 			outQueue = append(outQueue, newPostfixNode(')')) // Add closing parentheses | ||||
| 			numOpenParens-- | ||||
| 		} | ||||
| 	} | ||||
| @@ -420,9 +426,10 @@ func shuntingYard(re string) []postfixNode { | ||||
| } | ||||
|  | ||||
| // Thompson's algorithm. Constructs Finite-State Automaton from given string. | ||||
| // Returns start state. | ||||
| func thompson(re []postfixNode) *State { | ||||
| // Returns start state and number of groups in regex. | ||||
| func thompson(re []postfixNode) (*State, int) { | ||||
| 	nfa := make([]*State, 0) // Stack of states | ||||
| 	numGroups := 0           // Number of capturing groups | ||||
| 	for _, c := range re { | ||||
| 		if c.nodetype == CHARACTER || c.nodetype == ASSERTION { | ||||
| 			state := State{} | ||||
| @@ -470,12 +477,45 @@ func thompson(re []postfixNode) *State { | ||||
| 						} | ||||
| 					} | ||||
| 					tmpRe := shuntingYard(state.lookaroundRegex) | ||||
| 					state.lookaroundNFA = thompson(tmpRe) | ||||
| 					var numGroupsLookaround int | ||||
| 					state.lookaroundNFA, numGroupsLookaround = thompson(tmpRe) | ||||
| 					state.lookaroundNumCaptureGroups = numGroupsLookaround | ||||
|  | ||||
| 				} | ||||
| 			} | ||||
| 			nfa = append(nfa, &state) | ||||
| 		} | ||||
| 		if c.nodetype == LPAREN || c.nodetype == RPAREN { | ||||
| 			s := &State{} | ||||
| 			s.assert = NONE | ||||
| 			s.content = newContents(EPSILON) | ||||
| 			s.isEmpty = true | ||||
| 			s.output = make([]*State, 0) | ||||
| 			s.output = append(s.output, s) | ||||
| 			s.transitions = make(map[int][]*State) | ||||
| 			// LPAREN nodes are just added normally | ||||
| 			if c.nodetype == LPAREN { | ||||
| 				numGroups++ | ||||
| 				s.groupBegin = true | ||||
| 				s.groupNum = numGroups | ||||
| 				nfa = append(nfa, s) | ||||
| 				continue | ||||
| 			} | ||||
| 			// For RPAREN nodes, I assume that the last two nodes in the list are an LPAREN, | ||||
| 			// and then some other node. | ||||
| 			// These three nodes (LPAREN, the middle node and RPAREN) are extracted together, concatenated | ||||
| 			// and added back in. | ||||
| 			if c.nodetype == RPAREN { | ||||
| 				s.groupEnd = true | ||||
| 				middleNode := mustPop(&nfa) | ||||
| 				lparenNode := mustPop(&nfa) | ||||
| 				s.groupNum = lparenNode.groupNum | ||||
| 				tmp := concatenate(lparenNode, middleNode) | ||||
| 				to_add := concatenate(tmp, s) | ||||
| 				nfa = append(nfa, to_add) | ||||
|  | ||||
| 			} | ||||
| 		} | ||||
| 		// Must be an operator if it isn't a character | ||||
| 		switch c.nodetype { | ||||
| 		case CONCATENATE: | ||||
| @@ -540,7 +580,7 @@ func thompson(re []postfixNode) *State { | ||||
|  | ||||
| 	verifyLastStates(nfa) | ||||
|  | ||||
| 	return nfa[0] | ||||
| 	return nfa[0], numGroups | ||||
|  | ||||
| } | ||||
|  | ||||
| @@ -597,7 +637,7 @@ func main() { | ||||
| 	out := bufio.NewWriter(os.Stdout) | ||||
|  | ||||
| 	re_postfix := shuntingYard(re) | ||||
| 	startState := thompson(re_postfix) | ||||
| 	startState, numGroups := thompson(re_postfix) | ||||
| 	for true { | ||||
| 		if linesRead { | ||||
| 			break | ||||
| @@ -613,6 +653,9 @@ func main() { | ||||
| 					panic(err) | ||||
| 				} | ||||
| 			} | ||||
| 			if len(test_str) > 0 && test_str[len(test_str)-1] == '\n' { | ||||
| 				test_str = test_str[:len(test_str)-1] | ||||
| 			} | ||||
| 		} else { | ||||
| 			// Multi-line mode - read every line of input into a temp. string. | ||||
| 			// test_str will contain all lines of input (including newline characters) | ||||
| @@ -632,7 +675,7 @@ func main() { | ||||
| 			} | ||||
| 		} | ||||
| 		test_runes = []rune(test_str) | ||||
| 		matchIndices := findAllMatches(startState, test_runes) | ||||
| 		matchIndices := findAllMatches(startState, test_runes, numGroups) | ||||
| 		if *printMatchesFlag { | ||||
| 			// if we are in single line mode, print the line on which | ||||
| 			// the matches occur | ||||
| @@ -654,7 +697,7 @@ func main() { | ||||
| 		// This should make checking O(1) instead of O(n) | ||||
| 		indicesToPrint := new_uniq_arr[int]() | ||||
| 		for _, idx := range matchIndices { | ||||
| 			indicesToPrint.add(genRange(idx.startIdx, idx.endIdx)...) | ||||
| 			indicesToPrint.add(genRange(idx[0].startIdx, idx[0].endIdx)...) | ||||
| 		} | ||||
| 		// If we are inverting, then we should print the indices which _didn't_ match | ||||
| 		// in color. | ||||
| @@ -689,9 +732,9 @@ func main() { | ||||
| 			for i := range test_runes { | ||||
| 				inMatchIndex := false | ||||
| 				for _, idx := range matchIndices { | ||||
| 					if i == idx.startIdx { | ||||
| 					if i == idx[0].startIdx { | ||||
| 						fmt.Fprintf(out, "%s", *substituteText) | ||||
| 						i = idx.endIdx | ||||
| 						i = idx[0].endIdx | ||||
| 						inMatchIndex = true | ||||
| 						break | ||||
| 					} | ||||
| @@ -707,7 +750,7 @@ func main() { | ||||
| 					// Newline after every match - only if -o is enabled and -v is disabled. | ||||
| 					if *onlyFlag && !(*invertFlag) { | ||||
| 						for _, idx := range matchIndices { | ||||
| 							if i+1 == idx.endIdx { // End index is one more than last index of match | ||||
| 							if i+1 == idx[0].endIdx { // End index is one more than last index of match | ||||
| 								fmt.Fprintf(out, "\n") | ||||
| 								break | ||||
| 							} | ||||
| @@ -724,5 +767,6 @@ func main() { | ||||
| 		if err != nil { | ||||
| 			panic(err) | ||||
| 		} | ||||
| 		fmt.Println() | ||||
| 	} | ||||
| } | ||||
|   | ||||
		Reference in New Issue
	
	Block a user