Added support for numeric specifiers, moved question mark operator to its own function
This commit is contained in:
122
main.go
122
main.go
@@ -6,6 +6,8 @@ import (
|
||||
"io"
|
||||
"os"
|
||||
"slices"
|
||||
"strconv"
|
||||
"unicode"
|
||||
|
||||
"github.com/fatih/color"
|
||||
)
|
||||
@@ -45,7 +47,7 @@ func shuntingYard(re string) []postfixNode {
|
||||
for i < len(re_runes) {
|
||||
re_postfix = append(re_postfix, re_runes[i])
|
||||
if re_runes[i] == '[' && (i == 0 || re_runes[i-1] != '\\') { // We do not touch things inside brackets, unless they are escaped
|
||||
re_postfix[len(re_postfix)-1] = LBRACKET // Replace the '[' character with LBRACKET. This allows for easier parsing og all characters (including opening and closing brackets) within the character class
|
||||
re_postfix[len(re_postfix)-1] = LBRACKET // Replace the '[' character with LBRACKET. This allows for easier parsing of all characters (including opening and closing brackets) within the character class
|
||||
invertMatch := false
|
||||
toAppend := make([]rune, 0) // Holds all the runes in the current character class
|
||||
if i < len(re_runes)-1 && re_runes[i+1] == '^' { // Inverting class - match everything NOT in brackets
|
||||
@@ -84,9 +86,20 @@ func shuntingYard(re string) []postfixNode {
|
||||
}
|
||||
re_postfix = append(re_postfix, toAppend...)
|
||||
}
|
||||
if re_runes[i] == '{' && (i > 0 && re_runes[i-1] != '\\') { // We don't touch things inside braces, either
|
||||
i++ // Skip opening brace
|
||||
for i < len(re_runes) && re_runes[i] != '}' {
|
||||
re_postfix = append(re_postfix, re_runes[i])
|
||||
i++
|
||||
}
|
||||
if i == len(re_runes) {
|
||||
panic("Invalid numeric specifier.")
|
||||
}
|
||||
re_postfix = append(re_postfix, re_runes[i]) // Append closing brace
|
||||
}
|
||||
if (re_runes[i] != '(' && re_runes[i] != '|' && re_runes[i] != '\\') || (i > 0 && re_runes[i-1] == '\\') { // Every character should be concatenated if it is escaped
|
||||
if i < len(re_runes)-1 {
|
||||
if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != '+' && re_runes[i+1] != '?' && re_runes[i+1] != ')' {
|
||||
if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != '+' && re_runes[i+1] != '?' && re_runes[i+1] != ')' && re_runes[i+1] != '{' {
|
||||
re_postfix = append(re_postfix, CONCAT)
|
||||
}
|
||||
}
|
||||
@@ -109,6 +122,7 @@ func shuntingYard(re string) []postfixNode {
|
||||
3. If current character is '(', push to opStack
|
||||
4. If current character is ')', pop from opStack (and append to outQueue) until '(' is found. Discard parentheses.
|
||||
5. If current character is '[', find all the characters until ']', then create a postfixNode containing all these contents. Add this node to outQueue.
|
||||
6. If current character is '{', find the appropriate numeric specifier (range start, range end). Apply the range to the postfixNode at the end of outQueue.
|
||||
*/
|
||||
c := re_postfix[i]
|
||||
if isAlphaNum(c) {
|
||||
@@ -173,6 +187,67 @@ func shuntingYard(re string) []postfixNode {
|
||||
// i++ // Step forward to skip closing bracket
|
||||
continue
|
||||
}
|
||||
if c == '{' {
|
||||
i++ // Skip opening brace
|
||||
// Three possibilities:
|
||||
// 1. Single number - {5}
|
||||
// 2. Range - {3,5}
|
||||
// 3. Start with no end, {3,}
|
||||
startRange := make([]rune, 0)
|
||||
startRangeNum := 0
|
||||
endRange := make([]rune, 0)
|
||||
endRangeNum := 0
|
||||
for i < len(re_postfix) && unicode.IsDigit(re_postfix[i]) {
|
||||
startRange = append(startRange, re_postfix[i])
|
||||
i++
|
||||
}
|
||||
if len(startRange) == 0 { // {} is not valid, neither is {,5}
|
||||
panic("ERROR: Invalid numeric specifier.")
|
||||
}
|
||||
if i == len(re_postfix) {
|
||||
panic("ERROR: Brace not closed.")
|
||||
}
|
||||
|
||||
startRangeNum, err := strconv.Atoi(string(startRange))
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
if re_postfix[i] == '}' { // Case 1 above
|
||||
endRangeNum = startRangeNum
|
||||
} else {
|
||||
if re_postfix[i] != ',' {
|
||||
panic("ERROR: Invalid numeric specifier.")
|
||||
}
|
||||
i++ // Skip comma
|
||||
for i < len(re_postfix) && unicode.IsDigit(re_postfix[i]) {
|
||||
endRange = append(endRange, re_postfix[i])
|
||||
i++
|
||||
}
|
||||
if i == len(re_postfix) {
|
||||
panic("ERROR: Brace not closed.")
|
||||
}
|
||||
if re_postfix[i] != '}' {
|
||||
panic("ERROR: Invalid numeric specifier.")
|
||||
}
|
||||
if len(endRange) == 0 { // Case 3 above
|
||||
endRangeNum = INFINITE_REPS
|
||||
} else { // Case 2 above
|
||||
var err error
|
||||
endRangeNum, err = strconv.Atoi(string(endRange))
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
}
|
||||
node, err := pop(&outQueue)
|
||||
if err != nil {
|
||||
panic("Numeric specifier with no content.")
|
||||
}
|
||||
node.startReps = startRangeNum
|
||||
node.endReps = endRangeNum
|
||||
outQueue = append(outQueue, node)
|
||||
}
|
||||
if c == '(' {
|
||||
opStack = append(opStack, c)
|
||||
}
|
||||
@@ -244,19 +319,45 @@ func thompson(re []postfixNode) *State {
|
||||
nfa = append(nfa, s1)
|
||||
case QUESTION: // ab? is equivalent to a(b|)
|
||||
s1 := mustPop(&nfa)
|
||||
s2 := &State{}
|
||||
s2.transitions = make(map[int][]*State)
|
||||
s2.content = newContents(EPSILON)
|
||||
s2.output = append(s2.output, s2)
|
||||
s2.isEmpty = true
|
||||
s3 := alternate(s1, s2)
|
||||
nfa = append(nfa, s3)
|
||||
s2 := question(s1)
|
||||
nfa = append(nfa, s2)
|
||||
case PIPE:
|
||||
s1 := mustPop(&nfa)
|
||||
s2 := mustPop(&nfa)
|
||||
s3 := alternate(s1, s2)
|
||||
nfa = append(nfa, s3)
|
||||
}
|
||||
if c.startReps != 1 || c.endReps != 1 { // Must have a numeric specifier attached to it
|
||||
if c.endReps != -1 && c.endReps < c.startReps {
|
||||
panic("ERROR: Numeric specifier - start greater than end.")
|
||||
}
|
||||
state := mustPop(&nfa)
|
||||
var stateToAdd *State = nil
|
||||
// Take advantage of the following facts:
|
||||
// a{5} == aaaaa
|
||||
// a{3,5} == aaaa?a?
|
||||
// a{5,} == aaaaa+
|
||||
// Nov. 3 2024 - I have two choices on how I want to implement numeric
|
||||
// specifiers.
|
||||
// a. Encode the logic while creating the states. I will have to create a function
|
||||
// that creates a deep-copy of a given state / NFA, so that I can concatenate them to
|
||||
// each other (concatenating them with the 'concatenate' method - which takes addresses - does
|
||||
// not work). Creating this function might be a lot of work.
|
||||
// b. Encode the logic while parsing the string (shunting-yard). If I can expand the numeric specifier
|
||||
// at this point, I can leave thompson untouched.
|
||||
for i := 0; i < c.startReps; i++ { // Case 1
|
||||
stateToAdd = concatenate(stateToAdd, cloneState(state))
|
||||
}
|
||||
if c.endReps == INFINITE_REPS { // Case 3
|
||||
s2 := kleene(*state)
|
||||
stateToAdd = concatenate(stateToAdd, s2)
|
||||
} else { // Case 2
|
||||
for i := c.startReps; i < c.endReps; i++ {
|
||||
stateToAdd = concatenate(stateToAdd, question(state))
|
||||
}
|
||||
}
|
||||
nfa = append(nfa, stateToAdd)
|
||||
}
|
||||
}
|
||||
if len(nfa) != 1 {
|
||||
panic("ERROR: Invalid Regex.")
|
||||
@@ -274,6 +375,7 @@ func main() {
|
||||
// a. Add explicit concatenation operators to facilitate this
|
||||
// 2. Build NFA from postfix representation (Thompson's algorithm)
|
||||
// 3. Run the string against the NFA
|
||||
|
||||
if len(os.Args) != 2 {
|
||||
fmt.Println("ERROR: Missing cmdline args")
|
||||
os.Exit(22)
|
||||
@@ -287,7 +389,7 @@ func main() {
|
||||
if err != nil && err != io.EOF {
|
||||
panic(err)
|
||||
}
|
||||
fmt.Scanln(&test_str)
|
||||
//fmt.Scanln(&test_str)
|
||||
re_postfix := shuntingYard(re)
|
||||
startState := thompson(re_postfix)
|
||||
matchIndices := findAllMatches(startState, test_str)
|
||||
|
Reference in New Issue
Block a user