From d8f52b8ccc09fbf59970a4261348589a1af4ffd7 Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Sun, 3 Nov 2024 14:36:04 -0500 Subject: [PATCH] Added support for numeric specifiers, moved question mark operator to its own function --- main.go | 122 +++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 112 insertions(+), 10 deletions(-) diff --git a/main.go b/main.go index b0a9409..d92d56f 100644 --- a/main.go +++ b/main.go @@ -6,6 +6,8 @@ import ( "io" "os" "slices" + "strconv" + "unicode" "github.com/fatih/color" ) @@ -45,7 +47,7 @@ func shuntingYard(re string) []postfixNode { for i < len(re_runes) { re_postfix = append(re_postfix, re_runes[i]) if re_runes[i] == '[' && (i == 0 || re_runes[i-1] != '\\') { // We do not touch things inside brackets, unless they are escaped - re_postfix[len(re_postfix)-1] = LBRACKET // Replace the '[' character with LBRACKET. This allows for easier parsing og all characters (including opening and closing brackets) within the character class + re_postfix[len(re_postfix)-1] = LBRACKET // Replace the '[' character with LBRACKET. This allows for easier parsing of all characters (including opening and closing brackets) within the character class invertMatch := false toAppend := make([]rune, 0) // Holds all the runes in the current character class if i < len(re_runes)-1 && re_runes[i+1] == '^' { // Inverting class - match everything NOT in brackets @@ -84,9 +86,20 @@ func shuntingYard(re string) []postfixNode { } re_postfix = append(re_postfix, toAppend...) } + if re_runes[i] == '{' && (i > 0 && re_runes[i-1] != '\\') { // We don't touch things inside braces, either + i++ // Skip opening brace + for i < len(re_runes) && re_runes[i] != '}' { + re_postfix = append(re_postfix, re_runes[i]) + i++ + } + if i == len(re_runes) { + panic("Invalid numeric specifier.") + } + re_postfix = append(re_postfix, re_runes[i]) // Append closing brace + } if (re_runes[i] != '(' && re_runes[i] != '|' && re_runes[i] != '\\') || (i > 0 && re_runes[i-1] == '\\') { // Every character should be concatenated if it is escaped if i < len(re_runes)-1 { - if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != '+' && re_runes[i+1] != '?' && re_runes[i+1] != ')' { + if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != '+' && re_runes[i+1] != '?' && re_runes[i+1] != ')' && re_runes[i+1] != '{' { re_postfix = append(re_postfix, CONCAT) } } @@ -109,6 +122,7 @@ func shuntingYard(re string) []postfixNode { 3. If current character is '(', push to opStack 4. If current character is ')', pop from opStack (and append to outQueue) until '(' is found. Discard parantheses. 5. If current character is '[', find all the characters until ']', then create a postfixNode containing all these contents. Add this node to outQueue. + 6. If current character is '{', find the appropriate numeric specifier (range start, range end). Apply the range to the postfixNode at the end of outQueue. */ c := re_postfix[i] if isAlphaNum(c) { @@ -173,6 +187,67 @@ func shuntingYard(re string) []postfixNode { // i++ // Step forward to skip closing bracket continue } + if c == '{' { + i++ // Skip opening brace + // Three possibilities: + // 1. Single number - {5} + // 2. Range - {3,5} + // 3. Start with no end, {3,} + startRange := make([]rune, 0) + startRangeNum := 0 + endRange := make([]rune, 0) + endRangeNum := 0 + for i < len(re_postfix) && unicode.IsDigit(re_postfix[i]) { + startRange = append(startRange, re_postfix[i]) + i++ + } + if len(startRange) == 0 { // {} is not valid, neither is {,5} + panic("ERROR: Invalid numeric specifier.") + } + if i == len(re_postfix) { + panic("ERROR: Brace not closed.") + } + + startRangeNum, err := strconv.Atoi(string(startRange)) + if err != nil { + panic(err) + } + + if re_postfix[i] == '}' { // Case 1 above + endRangeNum = startRangeNum + } else { + if re_postfix[i] != ',' { + panic("ERROR: Invalid numeric specifier.") + } + i++ // Skip comma + for i < len(re_postfix) && unicode.IsDigit(re_postfix[i]) { + endRange = append(endRange, re_postfix[i]) + i++ + } + if i == len(re_postfix) { + panic("ERROR: Brace not closed.") + } + if re_postfix[i] != '}' { + panic("ERROR: Invalid numeric specifier.") + } + if len(endRange) == 0 { // Case 3 above + endRangeNum = INFINITE_REPS + } else { // Case 2 above + var err error + endRangeNum, err = strconv.Atoi(string(endRange)) + if err != nil { + panic(err) + } + } + } + node, err := pop(&outQueue) + if err != nil { + panic("Numeric specifier with no content.") + } + node.startReps = startRangeNum + node.endReps = endRangeNum + outQueue = append(outQueue, node) + } if c == '(' { opStack = append(opStack, c) } @@ -244,19 +319,45 @@ func thompson(re []postfixNode) *State { nfa = append(nfa, s1) case QUESTION: // ab? is equivalent to a(b|) s1 := mustPop(&nfa) - s2 := &State{} - s2.transitions = make(map[int][]*State) - s2.content = newContents(EPSILON) - s2.output = append(s2.output, s2) - s2.isEmpty = true - s3 := alternate(s1, s2) - nfa = append(nfa, s3) + s2 := question(s1) + nfa = append(nfa, s2) case PIPE: s1 := mustPop(&nfa) s2 := mustPop(&nfa) s3 := alternate(s1, s2) nfa = append(nfa, s3) } + if c.startReps != 1 || c.endReps != 1 { // Must have a numeric specifier attached to it + if c.endReps != -1 && c.endReps < c.startReps { + panic("ERROR: Numeric specifier - start greater than end.") + } + state := mustPop(&nfa) + var stateToAdd *State = nil + // Take advantage of the following facts: + // a{5} == aaaaa + // a{3,5} == aaaa?a? + // a{5,} == aaaaa+ + // Nov. 3 2024 - I have two choices on how I want to implement numeric + // specifiers. + // a. Encode the logic while creating the states. I will have to create a function + // that creates a deep-copy of a given state / NFA, so that I can concatenate them to + // each other (concatenating them with the 'concatenate' method - which takes addresses - does + // not work). Creating this function might be a lot of work. + // b. Encode the logic while parsing the string (shunting-yard). If I can expand the numeric specifier + // at this point, I can leave thompson untouched. + for i := 0; i < c.startReps; i++ { // Case 1 + stateToAdd = concatenate(stateToAdd, cloneState(state)) + } + if c.endReps == INFINITE_REPS { // Case 3 + s2 := kleene(*state) + stateToAdd = concatenate(stateToAdd, s2) + } else { // Case 2 + for i := c.startReps; i < c.endReps; i++ { + stateToAdd = concatenate(stateToAdd, question(state)) + } + } + nfa = append(nfa, stateToAdd) + } } if len(nfa) != 1 { panic("ERROR: Invalid Regex.") @@ -274,6 +375,7 @@ func main() { // a. Add explicit concatenation operators to facilitate this // 2. Build NFA from postfix representation (Thompson's algorithm) // 3. Run the string against the NFA + if len(os.Args) != 2 { fmt.Println("ERROR: Missing cmdline args") os.Exit(22) @@ -287,7 +389,7 @@ func main() { if err != nil && err != io.EOF { panic(err) } - fmt.Scanln(&test_str) + //fmt.Scanln(&test_str) re_postfix := shuntingYard(re) startState := thompson(re_postfix) matchIndices := findAllMatches(startState, test_str)