@ -6,6 +6,8 @@ import (
"io"
"io"
"os"
"os"
"slices"
"slices"
"strconv"
"unicode"
"github.com/fatih/color"
"github.com/fatih/color"
)
)
@ -45,7 +47,7 @@ func shuntingYard(re string) []postfixNode {
for i < len ( re_runes ) {
for i < len ( re_runes ) {
re_postfix = append ( re_postfix , re_runes [ i ] )
re_postfix = append ( re_postfix , re_runes [ i ] )
if re_runes [ i ] == '[' && ( i == 0 || re_runes [ i - 1 ] != '\\' ) { // We do not touch things inside brackets, unless they are escaped
if re_runes [ i ] == '[' && ( i == 0 || re_runes [ i - 1 ] != '\\' ) { // We do not touch things inside brackets, unless they are escaped
re_postfix [ len ( re_postfix ) - 1 ] = LBRACKET // Replace the '[' character with LBRACKET. This allows for easier parsing o g all characters (including opening and closing brackets) within the character class
re_postfix [ len ( re_postfix ) - 1 ] = LBRACKET // Replace the '[' character with LBRACKET. This allows for easier parsing o f all characters (including opening and closing brackets) within the character class
invertMatch := false
invertMatch := false
toAppend := make ( [ ] rune , 0 ) // Holds all the runes in the current character class
toAppend := make ( [ ] rune , 0 ) // Holds all the runes in the current character class
if i < len ( re_runes ) - 1 && re_runes [ i + 1 ] == '^' { // Inverting class - match everything NOT in brackets
if i < len ( re_runes ) - 1 && re_runes [ i + 1 ] == '^' { // Inverting class - match everything NOT in brackets
@ -84,9 +86,20 @@ func shuntingYard(re string) []postfixNode {
}
}
re_postfix = append ( re_postfix , toAppend ... )
re_postfix = append ( re_postfix , toAppend ... )
}
}
if re_runes [ i ] == '{' && ( i > 0 && re_runes [ i - 1 ] != '\\' ) { // We don't touch things inside braces, either
i ++ // Skip opening brace
for i < len ( re_runes ) && re_runes [ i ] != '}' {
re_postfix = append ( re_postfix , re_runes [ i ] )
i ++
}
if i == len ( re_runes ) {
panic ( "Invalid numeric specifier." )
}
re_postfix = append ( re_postfix , re_runes [ i ] ) // Append closing brace
}
if ( re_runes [ i ] != '(' && re_runes [ i ] != '|' && re_runes [ i ] != '\\' ) || ( i > 0 && re_runes [ i - 1 ] == '\\' ) { // Every character should be concatenated if it is escaped
if ( re_runes [ i ] != '(' && re_runes [ i ] != '|' && re_runes [ i ] != '\\' ) || ( i > 0 && re_runes [ i - 1 ] == '\\' ) { // Every character should be concatenated if it is escaped
if i < len ( re_runes ) - 1 {
if i < len ( re_runes ) - 1 {
if re_runes [ i + 1 ] != '|' && re_runes [ i + 1 ] != '*' && re_runes [ i + 1 ] != '+' && re_runes [ i + 1 ] != '?' && re_runes [ i + 1 ] != ')' {
if re_runes [ i + 1 ] != '|' && re_runes [ i + 1 ] != '*' && re_runes [ i + 1 ] != '+' && re_runes [ i + 1 ] != '?' && re_runes [ i + 1 ] != ')' && re_runes [ i + 1 ] != '{' {
re_postfix = append ( re_postfix , CONCAT )
re_postfix = append ( re_postfix , CONCAT )
}
}
}
}
@ -109,6 +122,7 @@ func shuntingYard(re string) []postfixNode {
3. If current character is '(' , push to opStack
3. If current character is '(' , push to opStack
4. If current character is ')' , pop from opStack ( and append to outQueue ) until '(' is found . Discard parentheses .
4. If current character is ')' , pop from opStack ( and append to outQueue ) until '(' is found . Discard parentheses .
5. If current character is '[' , find all the characters until ']' , then create a postfixNode containing all these contents . Add this node to outQueue .
5. If current character is '[' , find all the characters until ']' , then create a postfixNode containing all these contents . Add this node to outQueue .
6. If current character is '{' , find the appropriate numeric specifier ( range start , range end ) . Apply the range to the postfixNode at the end of outQueue .
* /
* /
c := re_postfix [ i ]
c := re_postfix [ i ]
if isAlphaNum ( c ) {
if isAlphaNum ( c ) {
@ -173,6 +187,67 @@ func shuntingYard(re string) []postfixNode {
// i++ // Step forward to skip closing bracket
// i++ // Step forward to skip closing bracket
continue
continue
}
}
if c == '{' {
i ++ // Skip opening brace
// Three possibilities:
// 1. Single number - {5}
// 2. Range - {3,5}
// 3. Start with no end, {3,}
startRange := make ( [ ] rune , 0 )
startRangeNum := 0
endRange := make ( [ ] rune , 0 )
endRangeNum := 0
for i < len ( re_postfix ) && unicode . IsDigit ( re_postfix [ i ] ) {
startRange = append ( startRange , re_postfix [ i ] )
i ++
}
if len ( startRange ) == 0 { // {} is not valid, neither is {,5}
panic ( "ERROR: Invalid numeric specifier." )
}
if i == len ( re_postfix ) {
panic ( "ERROR: Brace not closed." )
}
startRangeNum , err := strconv . Atoi ( string ( startRange ) )
if err != nil {
panic ( err )
}
if re_postfix [ i ] == '}' { // Case 1 above
endRangeNum = startRangeNum
} else {
if re_postfix [ i ] != ',' {
panic ( "ERROR: Invalid numeric specifier." )
}
i ++ // Skip comma
for i < len ( re_postfix ) && unicode . IsDigit ( re_postfix [ i ] ) {
endRange = append ( endRange , re_postfix [ i ] )
i ++
}
if i == len ( re_postfix ) {
panic ( "ERROR: Brace not closed." )
}
if re_postfix [ i ] != '}' {
panic ( "ERROR: Invalid numeric specifier." )
}
if len ( endRange ) == 0 { // Case 3 above
endRangeNum = INFINITE_REPS
} else { // Case 2 above
var err error
endRangeNum , err = strconv . Atoi ( string ( endRange ) )
if err != nil {
panic ( err )
}
}
}
node , err := pop ( & outQueue )
if err != nil {
panic ( "Numeric specifier with no content." )
}
node . startReps = startRangeNum
node . endReps = endRangeNum
outQueue = append ( outQueue , node )
}
if c == '(' {
if c == '(' {
opStack = append ( opStack , c )
opStack = append ( opStack , c )
}
}
@ -244,19 +319,45 @@ func thompson(re []postfixNode) *State {
nfa = append ( nfa , s1 )
nfa = append ( nfa , s1 )
case QUESTION : // ab? is equivalent to a(b|)
case QUESTION : // ab? is equivalent to a(b|)
s1 := mustPop ( & nfa )
s1 := mustPop ( & nfa )
s2 := & State { }
s2 := question ( s1 )
s2 . transitions = make ( map [ int ] [ ] * State )
nfa = append ( nfa , s2 )
s2 . content = newContents ( EPSILON )
s2 . output = append ( s2 . output , s2 )
s2 . isEmpty = true
s3 := alternate ( s1 , s2 )
nfa = append ( nfa , s3 )
case PIPE :
case PIPE :
s1 := mustPop ( & nfa )
s1 := mustPop ( & nfa )
s2 := mustPop ( & nfa )
s2 := mustPop ( & nfa )
s3 := alternate ( s1 , s2 )
s3 := alternate ( s1 , s2 )
nfa = append ( nfa , s3 )
nfa = append ( nfa , s3 )
}
}
if c . startReps != 1 || c . endReps != 1 { // Must have a numeric specifier attached to it
if c . endReps != - 1 && c . endReps < c . startReps {
panic ( "ERROR: Numeric specifier - start greater than end." )
}
state := mustPop ( & nfa )
var stateToAdd * State = nil
// Take advantage of the following facts:
// a{5} == aaaaa
// a{3,5} == aaaa?a?
// a{5,} == aaaaa+
// Nov. 3 2024 - I have two choices on how I want to implement numeric
// specifiers.
// a. Encode the logic while creating the states. I will have to create a function
// that creates a deep-copy of a given state / NFA, so that I can concatenate them to
// each other (concatenating them with the 'concatenate' method - which takes addresses - does
// not work). Creating this function might be a lot of work.
// b. Encode the logic while parsing the string (shunting-yard). If I can expand the numeric specifier
// at this point, I can leave thompson untouched.
for i := 0 ; i < c . startReps ; i ++ { // Case 1
stateToAdd = concatenate ( stateToAdd , cloneState ( state ) )
}
if c . endReps == INFINITE_REPS { // Case 3
s2 := kleene ( * state )
stateToAdd = concatenate ( stateToAdd , s2 )
} else { // Case 2
for i := c . startReps ; i < c . endReps ; i ++ {
stateToAdd = concatenate ( stateToAdd , question ( state ) )
}
}
nfa = append ( nfa , stateToAdd )
}
}
}
if len ( nfa ) != 1 {
if len ( nfa ) != 1 {
panic ( "ERROR: Invalid Regex." )
panic ( "ERROR: Invalid Regex." )
@ -274,6 +375,7 @@ func main() {
// a. Add explicit concatenation operators to facilitate this
// a. Add explicit concatenation operators to facilitate this
// 2. Build NFA from postfix representation (Thompson's algorithm)
// 2. Build NFA from postfix representation (Thompson's algorithm)
// 3. Run the string against the NFA
// 3. Run the string against the NFA
if len ( os . Args ) != 2 {
if len ( os . Args ) != 2 {
fmt . Println ( "ERROR: Missing cmdline args" )
fmt . Println ( "ERROR: Missing cmdline args" )
os . Exit ( 22 )
os . Exit ( 22 )
@ -287,7 +389,7 @@ func main() {
if err != nil && err != io . EOF {
if err != nil && err != io . EOF {
panic ( err )
panic ( err )
}
}
fmt . Scanln ( & test_str )
//fmt.Scanln(&test_str)
re_postfix := shuntingYard ( re )
re_postfix := shuntingYard ( re )
startState := thompson ( re_postfix )
startState := thompson ( re_postfix )
matchIndices := findAllMatches ( startState , test_str )
matchIndices := findAllMatches ( startState , test_str )