package main
import (
"bufio"
"flag"
"fmt"
"io"
"os"
"slices"
"strconv"
"unicode"
"github.com/fatih/color"
)
// CONCAT is the rune used as the explicit concatenation operator that
// shuntingYard inserts between adjacent operands.
const CONCAT rune = '~'

var (
	// notDotChars holds the runes excluded from the dot metacharacter. main
	// sets it to a single newline unless multi-line mode (-t) is enabled, in
	// which case it is empty.
	notDotChars []rune

	// caseInsensitiveFlag reports whether we are running in case-insensitive
	// mode. It is nil until main registers the -i flag.
	caseInsensitiveFlag *bool
)
// isOperator reports whether c is one of the regex operators handled by the
// shunting-yard pass: '+', '?', '*', '|' or the explicit concatenation
// operator CONCAT.
func isOperator(c rune) bool {
	switch c {
	case '+', '?', '*', '|', CONCAT:
		return true
	default:
		return false
	}
}
// priority returns the priority of the given operator: its position in the
// list of operators ordered from lowest to highest priority. A rune that is
// not in the list yields -1, which compares lower than every real operator.
func priority(op rune) int {
	lowestToHighest := []rune{'|', CONCAT, '+', '*', '?'}
	return slices.Index(lowestToHighest, op)
}
/ *
The Shunting - Yard algorithm is used to convert the given infix ( regeular ) expression to postfix .
The primary benefit of this is getting rid of parentheses .
It also inserts explicit concatenation operators to make parsing easier in Thompson ' s algorithm .
See : https : //blog.cernera.me/converting-regular-expressions-to-postfix-notation-with-the-shunting-yard-algorithm/
* /
func shuntingYard ( re string ) [ ] postfixNode {
re_postfix := make ( [ ] rune , 0 )
// Convert the string to a slice of runes to allow iteration through it
re_runes_orig := [ ] rune ( re ) // This is the rune slice before the first parsing loop (which detects and replaces numeric ranges)
re_runes := make ( [ ] rune , 0 )
// Check for numeric range. If we are at the start of a numeric range,
// skip to end and construct the equivalent regex for the range.
// The reason this is outside the loop below, is that it actually modifies
// the given regex (we 'cut' the numeric range and 'paste' an equivalent regex).
// It also makes the overall parsing easier, since I don't have to worry about the numeric range
// anymore.
// Eventually, I might be able to add it into the main parsing loop, to reduce the time
// complexity.
// A numeric range has the syntax: <num1-num2>. Ir matches all numbers in this range.
//
// Also check for non-capturing groups. The LPAREN of a non-capturing group looks like this: '(?:'
// I take this out, and put in a special character - NONCAPLPAREN_CHAR.
for i := 0 ; i < len ( re_runes_orig ) ; i ++ {
c := re_runes_orig [ i ]
if c == '<' && ( i == 0 || ( re_runes_orig [ i - 1 ] != '\\' && re_runes_orig [ i - 1 ] != '?' ) ) {
i ++ // Step over opening angle bracket
tmpStr := ""
hyphenFound := false
for i < len ( re_runes_orig ) && re_runes_orig [ i ] != '>' {
if ! unicode . IsDigit ( re_runes_orig [ i ] ) {
if re_runes_orig [ i ] != '-' || ( hyphenFound ) {
panic ( "ERROR: Invalid numeric range." )
}
}
if re_runes_orig [ i ] == '-' {
hyphenFound = true
}
tmpStr += string ( re_runes_orig [ i ] )
i ++
}
// End of string reached and last character doesn't close the range
if i == len ( re_runes_orig ) && re_runes_orig [ len ( re_runes_orig ) - 1 ] != '>' {
panic ( "ERROR: Numeric range not closed." )
}
if len ( tmpStr ) == 0 {
panic ( "ERROR: Empty numeric range." )
}
// Closing bracket will be skipped when the loop variable increments
var rangeStart int
var rangeEnd int
fmt . Sscanf ( tmpStr , "%d-%d" , & rangeStart , & rangeEnd )
regex := range2regex ( rangeStart , rangeEnd )
re_runes = append ( re_runes , [ ] rune ( regex ) ... )
} else if c == '(' && i < len ( re_runes_orig ) - 2 && re_runes_orig [ i + 1 ] == '?' && re_runes_orig [ i + 2 ] == ':' {
re_runes = append ( re_runes , NONCAPLPAREN_CHAR )
i += 2
} else {
re_runes = append ( re_runes , c )
}
}
/ * Add concatenation operators .
Only add a concatenation operator between two characters if both the following conditions are met :
1. The first character isn ' t an opening parantheses or alteration operator ( or an escape character )
a . This makes sense , because these operators can ' t be _concatenated_ with anything else .
2. The second character isn ' t a ' closing operator ' - one that applies to something before it
a . Again , these operators can ' be concatenated _to_ . They can , however , be concatenated _from_ .
Caveats :
1. Don ' t mess with anything inside brackets - character class
2. Don ' t mess with anything inside braces - numeric repetition
3. Don ' t mess with any lookarounds .
* /
i := 0
for i < len ( re_runes ) {
re_postfix = append ( re_postfix , re_runes [ i ] )
if re_runes [ i ] == '[' && ( i == 0 || re_runes [ i - 1 ] != '\\' ) { // We do not touch things inside brackets, unless they are escaped. Inside this block, the only task is to expand character ranges into their constituent characters.
re_postfix [ len ( re_postfix ) - 1 ] = LBRACKET // Replace the '[' character with LBRACKET. This allows for easier parsing of all characters (including opening and closing brackets) within the character class
toAppend := make ( [ ] rune , 0 ) // Holds all the runes in the current character class
if i < len ( re_runes ) - 1 && re_runes [ i + 1 ] == '^' { // Inverting class - match everything NOT in brackets
re_postfix = append ( re_postfix , '^' )
i ++ // Skip opening bracket and caret
}
if i < len ( re_runes ) - 1 && re_runes [ i + 1 ] == ']' { // Nothing inside brackets - panic.
panic ( "Empty character class." )
}
for re_runes [ i ] != ']' {
i ++ // Skip all characters inside brackets
// TODO: Check for escaped characters
// Check ahead for character range
if i < len ( re_runes ) - 2 && re_runes [ i + 1 ] == '-' {
rangeStart := re_runes [ i ]
rangeEnd := re_runes [ i + 2 ]
if int ( rangeEnd ) < int ( rangeStart ) {
panic ( "Range is out of order." )
}
for i := rangeStart ; i <= rangeEnd ; i ++ {
toAppend = append ( toAppend , i )
}
i += 2 // Skip start and hyphen (end will automatically be skipped on next iteration of loop)
continue
}
toAppend = append ( toAppend , re_runes [ i ] )
}
// Replace the last character (which should have been ']', with RBRACKET
toAppend [ len ( toAppend ) - 1 ] = RBRACKET
re_postfix = append ( re_postfix , toAppend ... )
}
if i < len ( re_runes ) && re_runes [ i ] == '{' && ( i > 0 && re_runes [ i - 1 ] != '\\' ) { // We don't touch things inside braces, either
i ++ // Skip opening brace
for i < len ( re_runes ) && re_runes [ i ] != '}' {
re_postfix = append ( re_postfix , re_runes [ i ] )
i ++
}
if i == len ( re_runes ) {
panic ( "Invalid numeric specifier." )
}
re_postfix = append ( re_postfix , re_runes [ i ] ) // Append closing brace
}
if i < len ( re_runes ) - 3 && string ( re_runes [ i + 1 : i + 4 ] ) == "(?:" { // Non-capturing lparen
re_postfix = append ( re_postfix , NONCAPLPAREN_CHAR )
i += 3
}
if i < len ( re_runes ) && re_runes [ i ] == '(' && ( i == 0 || re_runes [ i - 1 ] != '\\' ) && ( i < len ( re_runes ) - 2 && re_runes [ i + 1 ] == '?' && slices . Contains ( [ ] rune { '=' , '!' , '<' } , re_runes [ i + 2 ] ) ) { // Unescaped open parentheses followed by question mark then '<', '!' or '=' => lokaround. Don't mess with it.
i ++ // Step inside
if i == len ( re_runes ) - 1 || ( re_runes [ i + 1 ] != '=' && re_runes [ i + 1 ] != '!' && re_runes [ i + 1 ] != '<' ) {
panic ( "Invalid regex. Lookaround intended?" )
}
re_postfix = append ( re_postfix , re_runes [ i ] )
i ++
numOpenParens := 1
for numOpenParens != 0 {
if i >= len ( re_runes ) {
panic ( "Unclosed lookaround." )
}
if re_runes [ i ] == '(' {
numOpenParens ++
}
if re_runes [ i ] == ')' {
numOpenParens --
if numOpenParens == 0 {
break
}
}
re_postfix = append ( re_postfix , re_runes [ i ] )
i ++
}
continue
}
if i < len ( re_runes ) && ( re_runes [ i ] != '(' && re_runes [ i ] != NONCAPLPAREN_CHAR && re_runes [ i ] != '|' && re_runes [ i ] != '\\' ) || ( i > 0 && re_runes [ i - 1 ] == '\\' ) { // Every character should be concatenated if it is escaped
if i < len ( re_runes ) - 1 {
if re_runes [ i + 1 ] != '|' && re_runes [ i + 1 ] != '*' && re_runes [ i + 1 ] != '+' && re_runes [ i + 1 ] != '?' && re_runes [ i + 1 ] != ')' && re_runes [ i + 1 ] != '{' {
re_postfix = append ( re_postfix , CONCAT )
}
}
}
i ++
}
opStack := make ( [ ] rune , 0 ) // Operator stack
outQueue := make ( [ ] postfixNode , 0 ) // Output queue
// Actual algorithm
numOpenParens := 0 // Number of open parentheses
for i := 0 ; i < len ( re_postfix ) ; i ++ {
/ * Two cases :
1. Current character is alphanumeric - send to output queue
2. Current character is operator - do the following :
a . If current character has greater priority than top of opStack , push to opStack .
b . If not , keep popping from opStack ( and appending to outQueue ) until :
i . opStack is empty , OR
ii . current character has greater priority than top of opStack
3. If current character is '(' or NONCAPLPAREN_CHAR , push to opStack
4. If current character is ')' , pop from opStack ( and append to outQueue ) until '(' is found . Discard parantheses .
5. If current character is '[' , find all the characters until ']' , then create a postfixNode containing all these contents . Add this node to outQueue .
6. If current character is '{' , find the appropriate numeric specifier ( range start , range end ) . Apply the range to the postfixNode at the end of outQueue .
* /
c := re_postfix [ i ]
if isNormalChar ( c ) {
if caseInsensitiveFlag != nil && * caseInsensitiveFlag {
outQueue = append ( outQueue , newPostfixNode ( allCases ( c ) ... ) )
} else {
outQueue = append ( outQueue , newPostfixNode ( c ) )
}
continue
}
// Escape character
if c == '\\' { // Escape character - invert special and non-special characters eg. \( is treated as a literal parentheses, \b is treated as word boundary
if i == len ( re_postfix ) - 1 { // End of string - panic, because backslash is an escape character (something needs to come after it)
panic ( "ERROR: Backslash with no escape character." )
}
i ++
outQueue = append ( outQueue , newEscapedNode ( re_postfix [ i ] ) )
continue // Escaped character will automatically be skipped when loop variable increments
}
if c == '.' { // Dot metacharacter - represents 'any' character, but I am only adding Unicode 0020-007E
outQueue = append ( outQueue , newPostfixDotNode ( ) )
continue
}
if c == '^' { // Start-of-string assertion
outQueue = append ( outQueue , newPostfixNode ( c ) )
}
if c == '$' { // End-of-string assertion
outQueue = append ( outQueue , newPostfixNode ( c ) )
}
// Check if we're at the start of a lookaround
if c == '(' && i < len ( re_postfix ) - 1 && re_postfix [ i + 1 ] == '?' {
i += 2 // Skip opening paren and question mark
regex := "" // Stores lookaround regex
numOpenParens := 1
for numOpenParens != 0 {
if i >= len ( re_postfix ) {
panic ( "Unclosed lookaround." )
}
if re_postfix [ i ] == '(' {
numOpenParens ++
}
if re_postfix [ i ] == ')' {
numOpenParens --
if numOpenParens == 0 {
break
}
}
regex += string ( re_postfix [ i ] )
i ++
}
if len ( regex ) <= 1 { // Nothing in regex - panic
panic ( "Invalid lookaround. (too short?)" )
}
// 'regex' should now contain the lookaround regex, plus the characters at the start (which indicate pos/neg, ahead/behind)
// Now we should filter that out.
toAppend := postfixNode { nodetype : ASSERTION , startReps : 1 , endReps : 1 }
if regex [ 0 ] == '<' { // Lookbehind
toAppend . lookaroundDir = LOOKBEHIND
regex = regex [ 1 : ]
} else if regex [ 0 ] == '=' || regex [ 0 ] == '!' {
toAppend . lookaroundDir = LOOKAHEAD
} else {
panic ( "Invalid lookaround." )
}
// Positive or negative
if regex [ 0 ] == '=' { // Positive
toAppend . lookaroundSign = POSITIVE
toAppend . contents = [ ] rune ( regex [ 1 : ] )
} else if regex [ 0 ] == '!' { // Negative
toAppend . lookaroundSign = NEGATIVE
toAppend . contents = [ ] rune ( regex [ 1 : ] )
} else {
panic ( "Invalid lookaround." )
}
outQueue = append ( outQueue , toAppend )
continue
}
if isOperator ( c ) {
if len ( opStack ) == 0 {
opStack = append ( opStack , c )
} else {
topStack , err := peek ( opStack )
if err != nil {
panic ( "ERROR: Operator without operand." )
}
if priority ( c ) > priority ( topStack ) { // 2a
opStack = append ( opStack , c )
} else {
for priority ( c ) <= priority ( topStack ) { // 2b
to_append := mustPop ( & opStack )
outQueue = append ( outQueue , newPostfixNode ( to_append ) )
topStack , _ = peek ( opStack )
}
opStack = append ( opStack , c )
}
}
}
if c == LBRACKET { // Used for character classes
i ++ // Step forward so we can look at the character class
var invertMatch bool
if re_postfix [ i ] == '^' {
invertMatch = true
i ++
}
chars := make ( [ ] rune , 0 ) // List of characters - used only for character classes
for i < len ( re_postfix ) {
if re_postfix [ i ] == RBRACKET {
break
}
chars = append ( chars , re_postfix [ i ] )
i ++
}
if i == len ( re_postfix ) { // We have reached the end of the string, so we didn't encounter a closing brakcet. Panic.
panic ( "ERROR: Opening bracket without closing bracket." )
}
if ! invertMatch {
outQueue = append ( outQueue , newPostfixCharNode ( chars ... ) )
} else {
// Invert match - create an allChars postfixNode, then add the given states to its 'except' list.
toAdd := newPostfixDotNode ( )
toAdd . except = chars
outQueue = append ( outQueue , toAdd )
}
continue
}
if c == '{' {
i ++ // Skip opening brace
// Three possibilities:
// 1. Single number - {5}
// 2. Range - {3,5}
// 3. Start with no end, {3,}
startRange := make ( [ ] rune , 0 )
startRangeNum := 0
endRange := make ( [ ] rune , 0 )
endRangeNum := 0
for i < len ( re_postfix ) && unicode . IsDigit ( re_postfix [ i ] ) {
startRange = append ( startRange , re_postfix [ i ] )
i ++
}
if len ( startRange ) == 0 { // {} is not valid, neither is {,5}
panic ( "ERROR: Invalid numeric specifier." )
}
if i == len ( re_postfix ) {
panic ( "ERROR: Brace not closed." )
}
startRangeNum , err := strconv . Atoi ( string ( startRange ) )
if err != nil {
panic ( err )
}
if re_postfix [ i ] == '}' { // Case 1 above
endRangeNum = startRangeNum
} else {
if re_postfix [ i ] != ',' {
panic ( "ERROR: Invalid numeric specifier." )
}
i ++ // Skip comma
for i < len ( re_postfix ) && unicode . IsDigit ( re_postfix [ i ] ) {
endRange = append ( endRange , re_postfix [ i ] )
i ++
}
if i == len ( re_postfix ) {
panic ( "ERROR: Brace not closed." )
}
if re_postfix [ i ] != '}' {
panic ( "ERROR: Invalid numeric specifier." )
}
if len ( endRange ) == 0 { // Case 3 above
endRangeNum = INFINITE_REPS
} else { // Case 2 above
var err error
endRangeNum , err = strconv . Atoi ( string ( endRange ) )
if err != nil {
panic ( err )
}
}
}
idx := len ( outQueue ) - 1
// Get the last added node
if idx < 0 || outQueue [ idx ] . nodetype == LPAREN {
panic ( "Numeric specifier with no content." )
}
outQueue [ idx ] . startReps = startRangeNum
outQueue [ idx ] . endReps = endRangeNum
}
if c == '(' || c == NONCAPLPAREN_CHAR {
opStack = append ( opStack , c )
if c == '(' { // We only push _capturing_ group parentheses to outQueue
outQueue = append ( outQueue , newPostfixNode ( c ) )
}
numOpenParens ++
}
if c == ')' {
// Keep popping from opStack until we encounter an opening parantheses or a NONCAPLPAREN_CHAR. Panic if we reach the end of the stack.
var val rune
var err error
for val , err = peek ( opStack ) ; val != '(' && val != NONCAPLPAREN_CHAR ; val , err = peek ( opStack ) {
if err != nil {
panic ( "ERROR: Imbalanced parantheses." )
}
to_append := mustPop ( & opStack )
outQueue = append ( outQueue , newPostfixNode ( to_append ) )
}
_ = mustPop ( & opStack ) // Get rid of opening parentheses
if val == '(' { // Whatever was inside the parentheses was a _capturing_ group, so we append the closing parentheses as well
outQueue = append ( outQueue , newPostfixNode ( ')' ) ) // Add closing parentheses
}
numOpenParens --
}
}
// Pop all remaining operators (and append to outQueue)
for len ( opStack ) > 0 {
to_append := mustPop ( & opStack )
outQueue = append ( outQueue , newPostfixNode ( to_append ) )
}
if numOpenParens != 0 {
panic ( "ERROR: Imbalanced parantheses." )
}
return outQueue
}
// thompson implements Thompson's algorithm: it constructs a finite-state
// automaton from the given postfix node sequence (as produced by
// shuntingYard).
//
// It returns the start state and the number of capturing groups in the regex.
// Lookaround assertions are compiled recursively into their own NFA, stored on
// the assertion state. The function panics if the expression does not reduce
// to exactly one automaton, or if a numeric specifier has its start greater
// than its end.
func thompson(re []postfixNode) (*State, int) {
	nfa := make([]*State, 0) // Stack of states
	numGroups := 0           // Number of capturing groups
	for _, c := range re {
		if c.nodetype == CHARACTER || c.nodetype == ASSERTION {
			state := State{}
			state.transitions = make(map[int][]*State)
			if c.allChars {
				state.allChars = true
				if len(c.except) != 0 {
					// Copy, so the state does not alias the node's slice.
					state.except = append([]rune{}, c.except...)
				}
			}
			state.content = rune2Contents(c.contents)
			state.output = make([]*State, 0)
			state.output = append(state.output, &state)
			state.isEmpty = false
			if c.nodetype == ASSERTION {
				// This is a little weird. A lookaround has the 'isEmpty' flag set, even though it
				// _isn't_ empty (the contents are the regex). But there's so much error-checking
				// that relies on this flag that it's better to keep it this way.
				state.isEmpty = true
				// Ideally, an assertion shouldn't have any content, since it doesn't say anything
				// about the content of the string.
				state.content = newContents(EPSILON)
				if c.lookaroundDir == 0 || c.lookaroundSign == 0 {
					// Not a lookaround: a simple assertion identified by its first content rune.
					switch c.contents[0] {
					case '^':
						state.assert = SOS
					case '$':
						state.assert = EOS
					case 'b':
						state.assert = WBOUND
					case 'B':
						state.assert = NONWBOUND
					}
				} else { // Lookaround
					state.lookaroundRegex = string(c.contents)
					if c.lookaroundDir == LOOKAHEAD {
						if c.lookaroundSign == POSITIVE {
							state.assert = PLA
						}
						if c.lookaroundSign == NEGATIVE {
							state.assert = NLA
						}
					}
					if c.lookaroundDir == LOOKBEHIND {
						if c.lookaroundSign == POSITIVE {
							state.assert = PLB
						}
						if c.lookaroundSign == NEGATIVE {
							state.assert = NLB
						}
					}
					// Compile the lookaround's own regex into a separate NFA.
					tmpRe := shuntingYard(state.lookaroundRegex)
					var numGroupsLookaround int
					state.lookaroundNFA, numGroupsLookaround = thompson(tmpRe)
					state.lookaroundNumCaptureGroups = numGroupsLookaround
				}
			}
			nfa = append(nfa, &state)
		}
		if c.nodetype == LPAREN || c.nodetype == RPAREN {
			s := &State{}
			s.assert = NONE
			s.content = newContents(EPSILON)
			s.isEmpty = true
			s.output = make([]*State, 0)
			s.output = append(s.output, s)
			s.transitions = make(map[int][]*State)
			// LPAREN nodes are just added normally
			if c.nodetype == LPAREN {
				numGroups++
				s.groupBegin = true
				s.groupNum = numGroups
				nfa = append(nfa, s)
				continue
			}
			// For RPAREN nodes, assume that the last two nodes in the list are an LPAREN,
			// and then some other node.
			// These three nodes (LPAREN, the middle node and RPAREN) are extracted together,
			// concatenated and added back in.
			if c.nodetype == RPAREN {
				s.groupEnd = true
				middleNode := mustPop(&nfa)
				lparenNode := mustPop(&nfa)
				s.groupNum = lparenNode.groupNum
				tmp := concatenate(lparenNode, middleNode)
				to_add := concatenate(tmp, s)
				nfa = append(nfa, to_add)
			}
		}
		// Must be an operator if it isn't a character
		switch c.nodetype {
		case CONCATENATE:
			s2 := mustPop(&nfa)
			s1 := mustPop(&nfa)
			s1 = concatenate(s1, s2)
			nfa = append(nfa, s1)
		case KLEENE: // Create a 0-state, concat the popped state after it, concat the 0-state after the popped state
			s1 := mustPop(&nfa)
			stateToAdd := kleene(*s1)
			nfa = append(nfa, stateToAdd)
		case PLUS: // a+ is equivalent to aa*
			s1 := mustPop(&nfa)
			s2 := kleene(*s1)
			s1 = concatenate(s1, s2)
			nfa = append(nfa, s1)
		case QUESTION: // ab? is equivalent to a(b|)
			s1 := mustPop(&nfa)
			s2 := question(s1)
			nfa = append(nfa, s2)
		case PIPE:
			s1 := mustPop(&nfa)
			s2 := mustPop(&nfa)
			s3 := alternate(s1, s2)
			nfa = append(nfa, s3)
		}
		if c.startReps != 1 || c.endReps != 1 { // Must have a numeric specifier attached to it
			// Use the named sentinel (the same one the unbounded branch below tests for)
			// rather than a literal, so an unbounded specifier is never rejected here.
			if c.endReps != INFINITE_REPS && c.endReps < c.startReps {
				panic("ERROR: Numeric specifier - start greater than end.")
			}
			state := mustPop(&nfa)
			var stateToAdd *State = nil
			// Take advantage of the following facts:
			// a{5} == aaaaa
			// a{3,5} == aaaa?a?
			// a{5,} == aaaaa+
			// Nov. 3 2024 - There are two choices on how to implement numeric specifiers.
			// a. Encode the logic while creating the states. This needs a function
			// that creates a deep-copy of a given state / NFA, so that the copies can be
			// concatenated to each other (concatenating them with the 'concatenate' method -
			// which takes addresses - does not work).
			// b. Encode the logic while parsing the string (shunting-yard). If the numeric
			// specifier is expanded at that point, thompson can be left untouched.
			for i := 0; i < c.startReps; i++ { // Case 1
				stateToAdd = concatenate(stateToAdd, cloneState(state))
			}
			// NOTE(review): the two branches below pass the same popped 'state' to
			// kleene/question without cloning it first (question possibly more than once).
			// Whether that is safe depends on those helpers - confirm.
			if c.endReps == INFINITE_REPS { // Case 3
				s2 := kleene(*state)
				stateToAdd = concatenate(stateToAdd, s2)
			} else { // Case 2
				for i := c.startReps; i < c.endReps; i++ {
					stateToAdd = concatenate(stateToAdd, question(state))
				}
			}
			nfa = append(nfa, stateToAdd)
		}
	}
	if len(nfa) != 1 {
		panic("ERROR: Invalid Regex.")
	}
	verifyLastStates(nfa)
	return nfa[0], numGroups
}
// main parses the command-line flags, compiles the regex given as the single
// positional argument, and runs it against standard input.
//
// Process:
//  1. Convert regex into postfix notation (Shunting-Yard algorithm)
//     a. Add explicit concatenation operators to facilitate this
//  2. Build NFA from postfix representation (Thompson's algorithm)
//  3. Run the input against the NFA
//
// In the default mode input is processed line by line; with -t the whole of
// stdin is treated as one string. Matches are printed in color, replaced (-s),
// inverted (-v) or listed by index (-p) depending on the flags.
func main() {
	invertFlag := flag.Bool("v", false, "Invert match.")
	// This flag has two 'modes':
	// 1. Without '-v': Prints only matches. Prints a newline after every match.
	// 2. With '-v': Substitutes all matches with empty string.
	onlyFlag := flag.Bool("o", false, "Print only colored content. Overrides -l.")
	lineFlag := flag.Bool("l", false, "Only print lines with a match (or with no matches, if -v is enabled). Similar to grep's default.")
	multiLineFlag := flag.Bool("t", false, "Multi-line mode. Treats newline just like any character.")
	printMatchesFlag := flag.Bool("p", false, "Prints start and end index of each match. Can only be used with '-t' for multi-line mode.")
	caseInsensitiveFlag = flag.Bool("i", false, "Case-insensitive. Disregard the case of all characters.")
	matchNum := flag.Int("m", 0, "Print the match with the given index. Eg. -m 3 prints the third match.")
	substituteText := flag.String("s", "", "Substitute the contents of each match with the given string. Overrides -o and -v")
	flag.Parse()

	// In multi-line mode, 'dot' metacharacter also matches newline
	if !(*multiLineFlag) {
		notDotChars = []rune{'\n'}
	} else {
		notDotChars = []rune{}
	}
	// -l and -o are mutually exclusive: -o overrides -l
	if *onlyFlag {
		*lineFlag = false
	}
	// Check if substitute and matchNum flags have been enabled. flag.Visit only
	// visits flags that were actually set, which distinguishes "-s ''" from no -s.
	substituteFlagEnabled := false
	matchNumFlagEnabled := false
	flag.Visit(func(f *flag.Flag) {
		if f.Name == "s" {
			substituteFlagEnabled = true
		}
		if f.Name == "m" {
			matchNumFlagEnabled = true
		}
	})
	// Validate matchNumFlag - must be positive integer
	if matchNumFlagEnabled && *matchNum < 1 {
		panic("Invalid match number to print.")
	}

	if len(flag.Args()) != 1 { // flag.Args() also strips out program name
		fmt.Println("ERROR: Missing cmdline args")
		os.Exit(22)
	}
	re := flag.Args()[0]

	var test_str string
	var test_runes []rune // Rune-slice representation of test_str
	var err error
	var linesRead bool // Whether or not we have read all the input
	lineNum := 0       // Current line number

	// Create reader for stdin and writer for stdout
	reader := bufio.NewReader(os.Stdin)
	out := bufio.NewWriter(os.Stdout)

	re_postfix := shuntingYard(re)
	startState, numGroups := thompson(re_postfix)

	// One color printer is enough for the whole run; no need to build one per rune.
	matchColor := color.New(color.FgRed)

	for !linesRead {
		if !(*multiLineFlag) {
			// Read every string from stdin until we encounter an error. If the error isn't EOF, panic.
			test_str, err = reader.ReadString('\n')
			lineNum++
			if err != nil {
				if err == io.EOF {
					linesRead = true
				} else {
					panic(err)
				}
			}
			// EOF with no data means the input is exhausted (e.g. it ended with a
			// newline); there is no further line to process.
			if linesRead && len(test_str) == 0 {
				break
			}
			if len(test_str) > 0 && test_str[len(test_str)-1] == '\n' {
				test_str = test_str[:len(test_str)-1]
			}
		} else {
			// Multi-line mode - test_str will contain all of the input
			// (including newline characters) as one string.
			data, readErr := io.ReadAll(reader)
			if readErr != nil {
				panic(readErr)
			}
			test_str = string(data)
			linesRead = true
		}
		test_runes = []rune(test_str)
		matchIndices := findAllMatches(startState, test_runes, numGroups)

		if *printMatchesFlag {
			// If we are in single line mode, print the line on which
			// the matches occur
			if len(matchIndices) > 0 {
				if !(*multiLineFlag) {
					fmt.Fprintf(out, "Line %d:\n", lineNum)
				}
				for i, m := range matchIndices {
					// Only print a match if:
					// a. We are _not_ printing just one match
					// OR
					// b. We _are_ printing just one match, and this is that match
					if !matchNumFlagEnabled || (i+1) == *matchNum { // Match indexes start from 1; loop counter starts from 0
						fmt.Fprintf(out, "%s\n", m.toString())
					}
				}
				if err := out.Flush(); err != nil {
					panic(err)
				}
			}
			continue
		}

		// Decompose the array of matchIndex structs into a flat unique array of ints - if matchIndex is {4,7}, flat array will contain 4,5,6
		// This should make checking O(1) instead of O(n)
		indicesToPrint := new_uniq_arr[int]()
		for _, idx := range matchIndices {
			indicesToPrint.add(genRange(idx[0].startIdx, idx[0].endIdx)...)
		}
		// If we are inverting, then we should print the indices which _didn't_ match
		// in color.
		if *invertFlag {
			oldIndices := indicesToPrint.values()
			indicesToPrint = new_uniq_arr[int]()
			// Explanation:
			// Find all numbers from 0 to len(test_runes) that are NOT in oldIndices.
			// These are the values we want to print, now that we have inverted the match.
			// Re-initialize indicesToPrint and add all of these values to it.
			indicesToPrint.add(setDifference(genRange(0, len(test_runes)), oldIndices)...)
		}
		// If lineFlag is enabled, we should only print something if:
		// a. We are not inverting, and have at least one match on the current line
		// OR
		// b. We are inverting, and have no matches at all on the current line.
		// This checks for the inverse, and continues if it is true.
		if *lineFlag {
			if !(*invertFlag) && len(matchIndices) == 0 || *invertFlag && len(matchIndices) > 0 {
				continue
			}
		}

		if substituteFlagEnabled {
			// Substitution walks the test string as follows:
			// 1. Check if the index is the start of any match
			// 2. If so, print the substitute text, and jump to the match's end index.
			// 3. If not, just print the character.
			// An explicit index loop is required: assigning to the variable of a
			// 'for range' loop does not change where the next iteration resumes.
			for i := 0; i < len(test_runes); {
				jumped := false
				for _, idx := range matchIndices {
					if i == idx[0].startIdx {
						fmt.Fprintf(out, "%s", *substituteText)
						// A zero-length match consumes nothing: fall through and print
						// the current character so the loop always makes progress.
						if idx[0].endIdx > i {
							i = idx[0].endIdx
							jumped = true
						}
						break
					}
				}
				if jumped {
					continue
				}
				fmt.Fprintf(out, "%c", test_runes[i])
				i++
			}
		} else {
			for i, c := range test_runes {
				if indicesToPrint.contains(i) {
					matchColor.Fprintf(out, "%c", c)
					// Newline after every match - only if -o is enabled and -v is disabled.
					if *onlyFlag && !(*invertFlag) {
						for _, idx := range matchIndices {
							if i+1 == idx[0].endIdx { // End index is one more than last index of match
								fmt.Fprintf(out, "\n")
								break
							}
						}
					}
				} else {
					if !(*onlyFlag) {
						fmt.Fprintf(out, "%c", c)
					}
				}
			}
		}
		// Flush before writing the line terminator straight to stdout, so the
		// buffered content and the newline come out in order.
		err = out.Flush()
		if err != nil {
			panic(err)
		}
		fmt.Println()
	}
}