Major restructuring - added new type, changed return types for shuntingYard and thompson

I added a new function 'Compile' that calls shuntingYard and thompson. I also added
a new type 'Reg' that this function returns - it represents the starting state and contains
the number of capturing groups in the regex. I also rewrote shuntingYard and thompson
to return errors instead of panicking.
master
Aadhavan Srinivasan 2 weeks ago
parent ddbcb309b0
commit c8613c1ba2

@ -7,6 +7,29 @@ import (
"unicode"
)
// A Reg represents the result of compiling a regular expression. It contains
// the start state of the NFA representation of the regex, and the number of
// capturing groups in the regex. Both fields are unexported; thompson builds
// the value and Compile returns it to the caller.
type Reg struct {
// start is the start state of the NFA built for the regex.
start *State
// numGroups is the number of capturing groups in the regex.
numGroups int
}
// CONCAT is the rune that isOperator and priority treat as an operator
// alongside '+', '?', '*' and '|'.
// NOTE(review): presumably it stands for explicit concatenation in the
// rewritten expression - confirm against shuntingYard.
const CONCAT rune = '~'

// isOperator reports whether c is one of the operator runes:
// '+', '?', '*', '|' or CONCAT.
func isOperator(c rune) bool {
	switch c {
	case '+', '?', '*', '|', CONCAT:
		return true
	default:
		return false
	}
}
// priority returns the priority of the given operator, defined as its
// zero-based position in the ordering '|', CONCAT, '+', '*', '?'.
// It returns -1 when op does not appear in that ordering.
func priority(op rune) int {
	ranking := []rune{'|', CONCAT, '+', '*', '?'}
	return slices.IndexFunc(ranking, func(candidate rune) bool {
		return candidate == op
	})
}
/*
The Shunting-Yard algorithm is used to convert the given infix (regular) expression to postfix.
The primary benefit of this is getting rid of parentheses.
@ -416,7 +439,7 @@ func shuntingYard(re string) ([]postfixNode, error) {
// Thompson's algorithm. Constructs a finite-state automaton (NFA) from the given postfix nodes.
// Returns a Reg holding the start state and the number of capturing groups, or an error if the regex is invalid.
func thompson(re []postfixNode) (*State, int) {
func thompson(re []postfixNode) (Reg, error) {
nfa := make([]*State, 0) // Stack of states
numGroups := 0 // Number of capturing groups
for _, c := range re {
@ -465,10 +488,16 @@ func thompson(re []postfixNode) (*State, int) {
state.assert = NLB
}
}
tmpRe := shuntingYard(state.lookaroundRegex)
var numGroupsLookaround int
state.lookaroundNFA, numGroupsLookaround = thompson(tmpRe)
state.lookaroundNumCaptureGroups = numGroupsLookaround
tmpRe, err := shuntingYard(state.lookaroundRegex)
if err != nil {
return Reg{}, fmt.Errorf("Error parsing lookaround: %w", err)
}
reg, err := thompson(tmpRe)
if err != nil {
return Reg{}, fmt.Errorf("Error compiling lookaround: %w", err)
}
state.lookaroundNFA = reg.start
state.lookaroundNumCaptureGroups = reg.numGroups
}
}
@ -533,7 +562,7 @@ func thompson(re []postfixNode) (*State, int) {
}
if c.startReps != 1 || c.endReps != 1 { // Must have a numeric specifier attached to it
if c.endReps != -1 && c.endReps < c.startReps {
panic("ERROR: Numeric specifier - start greater than end.")
return Reg{}, fmt.Errorf("Numeric specifier - start greater than end.")
}
state := mustPop(&nfa)
var stateToAdd *State = nil
@ -564,11 +593,26 @@ func thompson(re []postfixNode) (*State, int) {
}
}
if len(nfa) != 1 {
panic("ERROR: Invalid Regex.")
return Reg{}, fmt.Errorf("Invalid Regex.")
}
verifyLastStates(nfa)
return nfa[0], numGroups
return Reg{nfa[0], numGroups}, nil
}
// Compile compiles the given regular expression into a Reg, suitable for use
// with the matching functions. It first converts re to postfix form with
// shuntingYard and then builds the NFA with thompson.
//
// The returned error is non-nil if either step fails; it wraps the underlying
// error, and the returned Reg is then the zero value. The error must be
// checked before using the returned Reg.
func Compile(re string) (Reg, error) {
	postfix, parseErr := shuntingYard(re)
	if parseErr != nil {
		return Reg{}, fmt.Errorf("Error parsing regex: %w", parseErr)
	}

	compiled, buildErr := thompson(postfix)
	if buildErr != nil {
		return Reg{}, fmt.Errorf("Error compiling regex: %w", buildErr)
	}

	return compiled, nil
}

Loading…
Cancel
Save