From c8613c1ba2d162348af72caf499bdc790511e835 Mon Sep 17 00:00:00 2001 From: Rockingcool Date: Mon, 6 Jan 2025 20:08:24 -0600 Subject: [PATCH] Major restructuring - added new type, changed return types for shuntingYard and thompson I added a new function 'Compile' that calls shuntingYard and thompson. I also added a new type 'Reg' that this function returns - it represents the starting state and contains the number of capturing groups in the regex. I also rewrote shuntingYard and thompson to return errors instead of panicking. --- compile.go | 60 ++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 52 insertions(+), 8 deletions(-) diff --git a/compile.go b/compile.go index 9de2b81..9311a21 100644 --- a/compile.go +++ b/compile.go @@ -7,6 +7,29 @@ import ( "unicode" ) +// A Reg represents the result of compiling a regular expression. It contains +// the startState of the NFA representation of the regex, and the number of capturing +// groups in the regex. +type Reg struct { + start *State + numGroups int +} + +const CONCAT rune = '~' + +func isOperator(c rune) bool { + if c == '+' || c == '?' || c == '*' || c == '|' || c == CONCAT { + return true + } + return false +} + +/* priority returns the priority of the given operator */ +func priority(op rune) int { + precedence := []rune{'|', CONCAT, '+', '*', '?'} + return slices.Index(precedence, op) +} + /* The Shunting-Yard algorithm is used to convert the given infix (regeular) expression to postfix. The primary benefit of this is getting rid of parentheses. @@ -416,7 +439,7 @@ func shuntingYard(re string) ([]postfixNode, error) { // Thompson's algorithm. Constructs Finite-State Automaton from given string. // Returns start state and number of groups in regex. 
-func thompson(re []postfixNode) (*State, int) { +func thompson(re []postfixNode) (Reg, error) { nfa := make([]*State, 0) // Stack of states numGroups := 0 // Number of capturing groups for _, c := range re { @@ -465,10 +488,16 @@ func thompson(re []postfixNode) (*State, int) { state.assert = NLB } } - tmpRe := shuntingYard(state.lookaroundRegex) - var numGroupsLookaround int - state.lookaroundNFA, numGroupsLookaround = thompson(tmpRe) - state.lookaroundNumCaptureGroups = numGroupsLookaround + tmpRe, err := shuntingYard(state.lookaroundRegex) + if err != nil { + return Reg{}, fmt.Errorf("Error parsing lookaround: %w", err) + } + reg, err := thompson(tmpRe) + if err != nil { + return Reg{}, fmt.Errorf("Error compiling lookaround: %w", err) + } + state.lookaroundNFA = reg.start + state.lookaroundNumCaptureGroups = reg.numGroups } } @@ -533,7 +562,7 @@ func thompson(re []postfixNode) (*State, int) { } if c.startReps != 1 || c.endReps != 1 { // Must have a numeric specifier attached to it if c.endReps != -1 && c.endReps < c.startReps { - panic("ERROR: Numeric specifier - start greater than end.") + return Reg{}, fmt.Errorf("Numeric specifier - start greater than end.") } state := mustPop(&nfa) var stateToAdd *State = nil @@ -564,11 +593,26 @@ func thompson(re []postfixNode) (*State, int) { } } if len(nfa) != 1 { - panic("ERROR: Invalid Regex.") + return Reg{}, fmt.Errorf("Invalid Regex.") } verifyLastStates(nfa) - return nfa[0], numGroups + return Reg{nfa[0], numGroups}, nil } + +// Compiles the given regular expression into a Reg type, suitable for use with the +// matching functions. The second return value is non-nil if a compilation error has +// occurred. As such, the error value must be checked before using the Reg returned by this function. 
+func Compile(re string) (Reg, error) { + nodes, err := shuntingYard(re) + if err != nil { + return Reg{}, fmt.Errorf("Error parsing regex: %w", err) + } + reg, err := thompson(nodes) + if err != nil { + return Reg{}, fmt.Errorf("Error compiling regex: %w", err) + } + return reg, nil +}