Added support for numeric specifiers, moved question mark operator to its own function

master
Aadhavan Srinivasan 2 months ago
parent dca81c1796
commit d8f52b8ccc

@ -6,6 +6,8 @@ import (
"io" "io"
"os" "os"
"slices" "slices"
"strconv"
"unicode"
"github.com/fatih/color" "github.com/fatih/color"
) )
@ -45,7 +47,7 @@ func shuntingYard(re string) []postfixNode {
for i < len(re_runes) { for i < len(re_runes) {
re_postfix = append(re_postfix, re_runes[i]) re_postfix = append(re_postfix, re_runes[i])
if re_runes[i] == '[' && (i == 0 || re_runes[i-1] != '\\') { // We do not touch things inside brackets, unless they are escaped if re_runes[i] == '[' && (i == 0 || re_runes[i-1] != '\\') { // We do not touch things inside brackets, unless they are escaped
re_postfix[len(re_postfix)-1] = LBRACKET // Replace the '[' character with LBRACKET. This allows for easier parsing og all characters (including opening and closing brackets) within the character class re_postfix[len(re_postfix)-1] = LBRACKET // Replace the '[' character with LBRACKET. This allows for easier parsing of all characters (including opening and closing brackets) within the character class
invertMatch := false invertMatch := false
toAppend := make([]rune, 0) // Holds all the runes in the current character class toAppend := make([]rune, 0) // Holds all the runes in the current character class
if i < len(re_runes)-1 && re_runes[i+1] == '^' { // Inverting class - match everything NOT in brackets if i < len(re_runes)-1 && re_runes[i+1] == '^' { // Inverting class - match everything NOT in brackets
@ -84,9 +86,20 @@ func shuntingYard(re string) []postfixNode {
} }
re_postfix = append(re_postfix, toAppend...) re_postfix = append(re_postfix, toAppend...)
} }
if re_runes[i] == '{' && (i > 0 && re_runes[i-1] != '\\') { // We don't touch things inside braces, either
i++ // Skip opening brace
for i < len(re_runes) && re_runes[i] != '}' {
re_postfix = append(re_postfix, re_runes[i])
i++
}
if i == len(re_runes) {
panic("Invalid numeric specifier.")
}
re_postfix = append(re_postfix, re_runes[i]) // Append closing brace
}
if (re_runes[i] != '(' && re_runes[i] != '|' && re_runes[i] != '\\') || (i > 0 && re_runes[i-1] == '\\') { // Every character should be concatenated if it is escaped if (re_runes[i] != '(' && re_runes[i] != '|' && re_runes[i] != '\\') || (i > 0 && re_runes[i-1] == '\\') { // Every character should be concatenated if it is escaped
if i < len(re_runes)-1 { if i < len(re_runes)-1 {
if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != '+' && re_runes[i+1] != '?' && re_runes[i+1] != ')' { if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != '+' && re_runes[i+1] != '?' && re_runes[i+1] != ')' && re_runes[i+1] != '{' {
re_postfix = append(re_postfix, CONCAT) re_postfix = append(re_postfix, CONCAT)
} }
} }
@ -109,6 +122,7 @@ func shuntingYard(re string) []postfixNode {
3. If current character is '(', push to opStack 3. If current character is '(', push to opStack
4. If current character is ')', pop from opStack (and append to outQueue) until '(' is found. Discard parentheses. 4. If current character is ')', pop from opStack (and append to outQueue) until '(' is found. Discard parentheses.
5. If current character is '[', find all the characters until ']', then create a postfixNode containing all these contents. Add this node to outQueue. 5. If current character is '[', find all the characters until ']', then create a postfixNode containing all these contents. Add this node to outQueue.
6. If current character is '{', find the appropriate numeric specifier (range start, range end). Apply the range to the postfixNode at the end of outQueue.
*/ */
c := re_postfix[i] c := re_postfix[i]
if isAlphaNum(c) { if isAlphaNum(c) {
@ -173,6 +187,67 @@ func shuntingYard(re string) []postfixNode {
// i++ // Step forward to skip closing bracket // i++ // Step forward to skip closing bracket
continue continue
} }
if c == '{' {
i++ // Skip opening brace
// Three possibilities:
// 1. Single number - {5}
// 2. Range - {3,5}
// 3. Start with no end, {3,}
startRange := make([]rune, 0)
startRangeNum := 0
endRange := make([]rune, 0)
endRangeNum := 0
for i < len(re_postfix) && unicode.IsDigit(re_postfix[i]) {
startRange = append(startRange, re_postfix[i])
i++
}
if len(startRange) == 0 { // {} is not valid, neither is {,5}
panic("ERROR: Invalid numeric specifier.")
}
if i == len(re_postfix) {
panic("ERROR: Brace not closed.")
}
startRangeNum, err := strconv.Atoi(string(startRange))
if err != nil {
panic(err)
}
if re_postfix[i] == '}' { // Case 1 above
endRangeNum = startRangeNum
} else {
if re_postfix[i] != ',' {
panic("ERROR: Invalid numeric specifier.")
}
i++ // Skip comma
for i < len(re_postfix) && unicode.IsDigit(re_postfix[i]) {
endRange = append(endRange, re_postfix[i])
i++
}
if i == len(re_postfix) {
panic("ERROR: Brace not closed.")
}
if re_postfix[i] != '}' {
panic("ERROR: Invalid numeric specifier.")
}
if len(endRange) == 0 { // Case 3 above
endRangeNum = INFINITE_REPS
} else { // Case 2 above
var err error
endRangeNum, err = strconv.Atoi(string(endRange))
if err != nil {
panic(err)
}
}
}
node, err := pop(&outQueue)
if err != nil {
panic("Numeric specifier with no content.")
}
node.startReps = startRangeNum
node.endReps = endRangeNum
outQueue = append(outQueue, node)
}
if c == '(' { if c == '(' {
opStack = append(opStack, c) opStack = append(opStack, c)
} }
@ -244,19 +319,45 @@ func thompson(re []postfixNode) *State {
nfa = append(nfa, s1) nfa = append(nfa, s1)
case QUESTION: // ab? is equivalent to a(b|) case QUESTION: // ab? is equivalent to a(b|)
s1 := mustPop(&nfa) s1 := mustPop(&nfa)
s2 := &State{} s2 := question(s1)
s2.transitions = make(map[int][]*State) nfa = append(nfa, s2)
s2.content = newContents(EPSILON)
s2.output = append(s2.output, s2)
s2.isEmpty = true
s3 := alternate(s1, s2)
nfa = append(nfa, s3)
case PIPE: case PIPE:
s1 := mustPop(&nfa) s1 := mustPop(&nfa)
s2 := mustPop(&nfa) s2 := mustPop(&nfa)
s3 := alternate(s1, s2) s3 := alternate(s1, s2)
nfa = append(nfa, s3) nfa = append(nfa, s3)
} }
if c.startReps != 1 || c.endReps != 1 { // Must have a numeric specifier attached to it
if c.endReps != -1 && c.endReps < c.startReps {
panic("ERROR: Numeric specifier - start greater than end.")
}
state := mustPop(&nfa)
var stateToAdd *State = nil
// Take advantage of the following facts:
// a{5} == aaaaa
// a{3,5} == aaaa?a?
// a{5,} == aaaaa+
// Nov. 3 2024 - I have two choices on how I want to implement numeric
// specifiers.
// a. Encode the logic while creating the states. I will have to create a function
// that creates a deep-copy of a given state / NFA, so that I can concatenate them to
// each other (concatenating them with the 'concatenate' method - which takes addresses - does
// not work). Creating this function might be a lot of work.
// b. Encode the logic while parsing the string (shunting-yard). If I can expand the numeric specifier
// at this point, I can leave thompson untouched.
for i := 0; i < c.startReps; i++ { // Case 1
stateToAdd = concatenate(stateToAdd, cloneState(state))
}
if c.endReps == INFINITE_REPS { // Case 3
s2 := kleene(*state)
stateToAdd = concatenate(stateToAdd, s2)
} else { // Case 2
for i := c.startReps; i < c.endReps; i++ {
stateToAdd = concatenate(stateToAdd, question(state))
}
}
nfa = append(nfa, stateToAdd)
}
} }
if len(nfa) != 1 { if len(nfa) != 1 {
panic("ERROR: Invalid Regex.") panic("ERROR: Invalid Regex.")
@ -274,6 +375,7 @@ func main() {
// a. Add explicit concatenation operators to facilitate this // a. Add explicit concatenation operators to facilitate this
// 2. Build NFA from postfix representation (Thompson's algorithm) // 2. Build NFA from postfix representation (Thompson's algorithm)
// 3. Run the string against the NFA // 3. Run the string against the NFA
if len(os.Args) != 2 { if len(os.Args) != 2 {
fmt.Println("ERROR: Missing cmdline args") fmt.Println("ERROR: Missing cmdline args")
os.Exit(22) os.Exit(22)
@ -287,7 +389,7 @@ func main() {
if err != nil && err != io.EOF { if err != nil && err != io.EOF {
panic(err) panic(err)
} }
fmt.Scanln(&test_str) //fmt.Scanln(&test_str)
re_postfix := shuntingYard(re) re_postfix := shuntingYard(re)
startState := thompson(re_postfix) startState := thompson(re_postfix)
matchIndices := findAllMatches(startState, test_str) matchIndices := findAllMatches(startState, test_str)

Loading…
Cancel
Save