Added support for numeric specifiers, moved question mark operator to its own function
This commit is contained in:
122
main.go
122
main.go
@@ -6,6 +6,8 @@ import (
|
|||||||
"io"
|
"io"
|
||||||
"os"
|
"os"
|
||||||
"slices"
|
"slices"
|
||||||
|
"strconv"
|
||||||
|
"unicode"
|
||||||
|
|
||||||
"github.com/fatih/color"
|
"github.com/fatih/color"
|
||||||
)
|
)
|
||||||
@@ -45,7 +47,7 @@ func shuntingYard(re string) []postfixNode {
|
|||||||
for i < len(re_runes) {
|
for i < len(re_runes) {
|
||||||
re_postfix = append(re_postfix, re_runes[i])
|
re_postfix = append(re_postfix, re_runes[i])
|
||||||
if re_runes[i] == '[' && (i == 0 || re_runes[i-1] != '\\') { // We do not touch things inside brackets, unless they are escaped
|
if re_runes[i] == '[' && (i == 0 || re_runes[i-1] != '\\') { // We do not touch things inside brackets, unless they are escaped
|
||||||
re_postfix[len(re_postfix)-1] = LBRACKET // Replace the '[' character with LBRACKET. This allows for easier parsing og all characters (including opening and closing brackets) within the character class
|
re_postfix[len(re_postfix)-1] = LBRACKET // Replace the '[' character with LBRACKET. This allows for easier parsing of all characters (including opening and closing brackets) within the character class
|
||||||
invertMatch := false
|
invertMatch := false
|
||||||
toAppend := make([]rune, 0) // Holds all the runes in the current character class
|
toAppend := make([]rune, 0) // Holds all the runes in the current character class
|
||||||
if i < len(re_runes)-1 && re_runes[i+1] == '^' { // Inverting class - match everything NOT in brackets
|
if i < len(re_runes)-1 && re_runes[i+1] == '^' { // Inverting class - match everything NOT in brackets
|
||||||
@@ -84,9 +86,20 @@ func shuntingYard(re string) []postfixNode {
|
|||||||
}
|
}
|
||||||
re_postfix = append(re_postfix, toAppend...)
|
re_postfix = append(re_postfix, toAppend...)
|
||||||
}
|
}
|
||||||
|
if re_runes[i] == '{' && (i > 0 && re_runes[i-1] != '\\') { // We don't touch things inside braces, either
|
||||||
|
i++ // Skip opening brace
|
||||||
|
for i < len(re_runes) && re_runes[i] != '}' {
|
||||||
|
re_postfix = append(re_postfix, re_runes[i])
|
||||||
|
i++
|
||||||
|
}
|
||||||
|
if i == len(re_runes) {
|
||||||
|
panic("Invalid numeric specifier.")
|
||||||
|
}
|
||||||
|
re_postfix = append(re_postfix, re_runes[i]) // Append closing brace
|
||||||
|
}
|
||||||
if (re_runes[i] != '(' && re_runes[i] != '|' && re_runes[i] != '\\') || (i > 0 && re_runes[i-1] == '\\') { // Every character should be concatenated if it is escaped
|
if (re_runes[i] != '(' && re_runes[i] != '|' && re_runes[i] != '\\') || (i > 0 && re_runes[i-1] == '\\') { // Every character should be concatenated if it is escaped
|
||||||
if i < len(re_runes)-1 {
|
if i < len(re_runes)-1 {
|
||||||
if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != '+' && re_runes[i+1] != '?' && re_runes[i+1] != ')' {
|
if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != '+' && re_runes[i+1] != '?' && re_runes[i+1] != ')' && re_runes[i+1] != '{' {
|
||||||
re_postfix = append(re_postfix, CONCAT)
|
re_postfix = append(re_postfix, CONCAT)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -109,6 +122,7 @@ func shuntingYard(re string) []postfixNode {
|
|||||||
3. If current character is '(', push to opStack
|
3. If current character is '(', push to opStack
|
||||||
4. If current character is ')', pop from opStack (and append to outQueue) until '(' is found. Discard parentheses.
|
4. If current character is ')', pop from opStack (and append to outQueue) until '(' is found. Discard parentheses.
|
||||||
5. If current character is '[', find all the characters until ']', then create a postfixNode containing all these contents. Add this node to outQueue.
|
5. If current character is '[', find all the characters until ']', then create a postfixNode containing all these contents. Add this node to outQueue.
|
||||||
|
6. If current character is '{', find the appropriate numeric specifier (range start, range end). Apply the range to the postfixNode at the end of outQueue.
|
||||||
*/
|
*/
|
||||||
c := re_postfix[i]
|
c := re_postfix[i]
|
||||||
if isAlphaNum(c) {
|
if isAlphaNum(c) {
|
||||||
@@ -173,6 +187,67 @@ func shuntingYard(re string) []postfixNode {
|
|||||||
// i++ // Step forward to skip closing bracket
|
// i++ // Step forward to skip closing bracket
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
if c == '{' {
|
||||||
|
i++ // Skip opening brace
|
||||||
|
// Three possibilities:
|
||||||
|
// 1. Single number - {5}
|
||||||
|
// 2. Range - {3,5}
|
||||||
|
// 3. Start with no end, {3,}
|
||||||
|
startRange := make([]rune, 0)
|
||||||
|
startRangeNum := 0
|
||||||
|
endRange := make([]rune, 0)
|
||||||
|
endRangeNum := 0
|
||||||
|
for i < len(re_postfix) && unicode.IsDigit(re_postfix[i]) {
|
||||||
|
startRange = append(startRange, re_postfix[i])
|
||||||
|
i++
|
||||||
|
}
|
||||||
|
if len(startRange) == 0 { // {} is not valid, neither is {,5}
|
||||||
|
panic("ERROR: Invalid numeric specifier.")
|
||||||
|
}
|
||||||
|
if i == len(re_postfix) {
|
||||||
|
panic("ERROR: Brace not closed.")
|
||||||
|
}
|
||||||
|
|
||||||
|
startRangeNum, err := strconv.Atoi(string(startRange))
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if re_postfix[i] == '}' { // Case 1 above
|
||||||
|
endRangeNum = startRangeNum
|
||||||
|
} else {
|
||||||
|
if re_postfix[i] != ',' {
|
||||||
|
panic("ERROR: Invalid numeric specifier.")
|
||||||
|
}
|
||||||
|
i++ // Skip comma
|
||||||
|
for i < len(re_postfix) && unicode.IsDigit(re_postfix[i]) {
|
||||||
|
endRange = append(endRange, re_postfix[i])
|
||||||
|
i++
|
||||||
|
}
|
||||||
|
if i == len(re_postfix) {
|
||||||
|
panic("ERROR: Brace not closed.")
|
||||||
|
}
|
||||||
|
if re_postfix[i] != '}' {
|
||||||
|
panic("ERROR: Invalid numeric specifier.")
|
||||||
|
}
|
||||||
|
if len(endRange) == 0 { // Case 3 above
|
||||||
|
endRangeNum = INFINITE_REPS
|
||||||
|
} else { // Case 2 above
|
||||||
|
var err error
|
||||||
|
endRangeNum, err = strconv.Atoi(string(endRange))
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
node, err := pop(&outQueue)
|
||||||
|
if err != nil {
|
||||||
|
panic("Numeric specifier with no content.")
|
||||||
|
}
|
||||||
|
node.startReps = startRangeNum
|
||||||
|
node.endReps = endRangeNum
|
||||||
|
outQueue = append(outQueue, node)
|
||||||
|
}
|
||||||
if c == '(' {
|
if c == '(' {
|
||||||
opStack = append(opStack, c)
|
opStack = append(opStack, c)
|
||||||
}
|
}
|
||||||
@@ -244,19 +319,45 @@ func thompson(re []postfixNode) *State {
|
|||||||
nfa = append(nfa, s1)
|
nfa = append(nfa, s1)
|
||||||
case QUESTION: // ab? is equivalent to a(b|)
|
case QUESTION: // ab? is equivalent to a(b|)
|
||||||
s1 := mustPop(&nfa)
|
s1 := mustPop(&nfa)
|
||||||
s2 := &State{}
|
s2 := question(s1)
|
||||||
s2.transitions = make(map[int][]*State)
|
nfa = append(nfa, s2)
|
||||||
s2.content = newContents(EPSILON)
|
|
||||||
s2.output = append(s2.output, s2)
|
|
||||||
s2.isEmpty = true
|
|
||||||
s3 := alternate(s1, s2)
|
|
||||||
nfa = append(nfa, s3)
|
|
||||||
case PIPE:
|
case PIPE:
|
||||||
s1 := mustPop(&nfa)
|
s1 := mustPop(&nfa)
|
||||||
s2 := mustPop(&nfa)
|
s2 := mustPop(&nfa)
|
||||||
s3 := alternate(s1, s2)
|
s3 := alternate(s1, s2)
|
||||||
nfa = append(nfa, s3)
|
nfa = append(nfa, s3)
|
||||||
}
|
}
|
||||||
|
if c.startReps != 1 || c.endReps != 1 { // Must have a numeric specifier attached to it
|
||||||
|
if c.endReps != -1 && c.endReps < c.startReps {
|
||||||
|
panic("ERROR: Numeric specifier - start greater than end.")
|
||||||
|
}
|
||||||
|
state := mustPop(&nfa)
|
||||||
|
var stateToAdd *State = nil
|
||||||
|
// Take advantage of the following facts:
|
||||||
|
// a{5} == aaaaa
|
||||||
|
// a{3,5} == aaaa?a?
|
||||||
|
// a{5,} == aaaaa+
|
||||||
|
// Nov. 3 2024 - I have two choices on how I want to implement numeric
|
||||||
|
// specifiers.
|
||||||
|
// a. Encode the logic while creating the states. I will have to create a function
|
||||||
|
// that creates a deep-copy of a given state / NFA, so that I can concatenate them to
|
||||||
|
// each other (concatenating them with the 'concatenate' method - which takes addresses - does
|
||||||
|
// not work). Creating this function might be a lot of work.
|
||||||
|
// b. Encode the logic while parsing the string (shunting-yard). If I can expand the numeric specifier
|
||||||
|
// at this point, I can leave thompson untouched.
|
||||||
|
for i := 0; i < c.startReps; i++ { // Case 1
|
||||||
|
stateToAdd = concatenate(stateToAdd, cloneState(state))
|
||||||
|
}
|
||||||
|
if c.endReps == INFINITE_REPS { // Case 3
|
||||||
|
s2 := kleene(*state)
|
||||||
|
stateToAdd = concatenate(stateToAdd, s2)
|
||||||
|
} else { // Case 2
|
||||||
|
for i := c.startReps; i < c.endReps; i++ {
|
||||||
|
stateToAdd = concatenate(stateToAdd, question(state))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
nfa = append(nfa, stateToAdd)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if len(nfa) != 1 {
|
if len(nfa) != 1 {
|
||||||
panic("ERROR: Invalid Regex.")
|
panic("ERROR: Invalid Regex.")
|
||||||
@@ -274,6 +375,7 @@ func main() {
|
|||||||
// a. Add explicit concatenation operators to facilitate this
|
// a. Add explicit concatenation operators to facilitate this
|
||||||
// 2. Build NFA from postfix representation (Thompson's algorithm)
|
// 2. Build NFA from postfix representation (Thompson's algorithm)
|
||||||
// 3. Run the string against the NFA
|
// 3. Run the string against the NFA
|
||||||
|
|
||||||
if len(os.Args) != 2 {
|
if len(os.Args) != 2 {
|
||||||
fmt.Println("ERROR: Missing cmdline args")
|
fmt.Println("ERROR: Missing cmdline args")
|
||||||
os.Exit(22)
|
os.Exit(22)
|
||||||
@@ -287,7 +389,7 @@ func main() {
|
|||||||
if err != nil && err != io.EOF {
|
if err != nil && err != io.EOF {
|
||||||
panic(err)
|
panic(err)
|
||||||
}
|
}
|
||||||
fmt.Scanln(&test_str)
|
//fmt.Scanln(&test_str)
|
||||||
re_postfix := shuntingYard(re)
|
re_postfix := shuntingYard(re)
|
||||||
startState := thompson(re_postfix)
|
startState := thompson(re_postfix)
|
||||||
matchIndices := findAllMatches(startState, test_str)
|
matchIndices := findAllMatches(startState, test_str)
|
||||||
|
Reference in New Issue
Block a user