Major restructuring - added new type, changed return types for shuntingYard and thompson
I added a new function 'Compile' that calls shuntingYard and thompson. I also added a new type 'Reg', which this function returns; it holds the start state of the compiled NFA and the number of capturing groups in the regex. I also rewrote shuntingYard and thompson to return errors instead of panicking.
This commit is contained in:
60
compile.go
60
compile.go
@@ -7,6 +7,29 @@ import (
|
|||||||
"unicode"
|
"unicode"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// A Reg represents the result of compiling a regular expression. It contains
|
||||||
|
// the startState of the NFA representation of the regex, and the number of capturing
|
||||||
|
// groups in the regex.
|
||||||
|
type Reg struct {
|
||||||
|
start *State // start state of the NFA built from the regex
|
||||||
|
numGroups int // number of capturing groups in the regex
|
||||||
|
}
|
||||||
|
|
||||||
|
// CONCAT is the rune that isOperator and priority treat as an operator,
// alongside '+', '?', '*' and '|'.
// NOTE(review): presumably the explicit concatenation marker inserted into the
// regex before postfix conversion - confirm against shuntingYard.
const CONCAT rune = '~'
|
||||||
|
|
||||||
|
func isOperator(c rune) bool {
|
||||||
|
if c == '+' || c == '?' || c == '*' || c == '|' || c == CONCAT {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
/* priority returns the priority of the given operator */
|
||||||
|
func priority(op rune) int {
|
||||||
|
precedence := []rune{'|', CONCAT, '+', '*', '?'}
|
||||||
|
return slices.Index(precedence, op)
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
The Shunting-Yard algorithm is used to convert the given infix (regular) expression to postfix.
|
The Shunting-Yard algorithm is used to convert the given infix (regular) expression to postfix.
|
||||||
The primary benefit of this is getting rid of parentheses.
|
The primary benefit of this is getting rid of parentheses.
|
||||||
@@ -416,7 +439,7 @@ func shuntingYard(re string) ([]postfixNode, error) {
|
|||||||
|
|
||||||
// Thompson's algorithm. Constructs Finite-State Automaton from given string.
|
// Thompson's algorithm. Constructs Finite-State Automaton from given string.
|
||||||
// Returns start state and number of groups in regex.
|
// Returns start state and number of groups in regex.
|
||||||
func thompson(re []postfixNode) (*State, int) {
|
func thompson(re []postfixNode) (Reg, error) {
|
||||||
nfa := make([]*State, 0) // Stack of states
|
nfa := make([]*State, 0) // Stack of states
|
||||||
numGroups := 0 // Number of capturing groups
|
numGroups := 0 // Number of capturing groups
|
||||||
for _, c := range re {
|
for _, c := range re {
|
||||||
@@ -465,10 +488,16 @@ func thompson(re []postfixNode) (*State, int) {
|
|||||||
state.assert = NLB
|
state.assert = NLB
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
tmpRe := shuntingYard(state.lookaroundRegex)
|
tmpRe, err := shuntingYard(state.lookaroundRegex)
|
||||||
var numGroupsLookaround int
|
if err != nil {
|
||||||
state.lookaroundNFA, numGroupsLookaround = thompson(tmpRe)
|
return Reg{}, fmt.Errorf("Error parsing lookaround: %w", err)
|
||||||
state.lookaroundNumCaptureGroups = numGroupsLookaround
|
}
|
||||||
|
reg, err := thompson(tmpRe)
|
||||||
|
if err != nil {
|
||||||
|
return Reg{}, fmt.Errorf("Error compiling lookaround: %w", err)
|
||||||
|
}
|
||||||
|
state.lookaroundNFA = reg.start
|
||||||
|
state.lookaroundNumCaptureGroups = reg.numGroups
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -533,7 +562,7 @@ func thompson(re []postfixNode) (*State, int) {
|
|||||||
}
|
}
|
||||||
if c.startReps != 1 || c.endReps != 1 { // Must have a numeric specifier attached to it
|
if c.startReps != 1 || c.endReps != 1 { // Must have a numeric specifier attached to it
|
||||||
if c.endReps != -1 && c.endReps < c.startReps {
|
if c.endReps != -1 && c.endReps < c.startReps {
|
||||||
panic("ERROR: Numeric specifier - start greater than end.")
|
return Reg{}, fmt.Errorf("Numeric specifier - start greater than end.")
|
||||||
}
|
}
|
||||||
state := mustPop(&nfa)
|
state := mustPop(&nfa)
|
||||||
var stateToAdd *State = nil
|
var stateToAdd *State = nil
|
||||||
@@ -564,11 +593,26 @@ func thompson(re []postfixNode) (*State, int) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
if len(nfa) != 1 {
|
if len(nfa) != 1 {
|
||||||
panic("ERROR: Invalid Regex.")
|
return Reg{}, fmt.Errorf("Invalid Regex.")
|
||||||
}
|
}
|
||||||
|
|
||||||
verifyLastStates(nfa)
|
verifyLastStates(nfa)
|
||||||
|
|
||||||
return nfa[0], numGroups
|
return Reg{nfa[0], numGroups}, nil
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Compiles the given regular expression into a Reg type, suitable for use with the
|
||||||
|
// matching functions. The second return value is non-nil if a compilation error has
|
||||||
|
// occured. As such, the error value must be checked before using the Reg returned by this function.
|
||||||
|
func Compile(re string) (Reg, error) {
|
||||||
|
nodes, err := shuntingYard(re)
|
||||||
|
if err != nil {
|
||||||
|
return Reg{}, fmt.Errorf("Error parsing regex: %w", err)
|
||||||
|
}
|
||||||
|
reg, err := thompson(nodes)
|
||||||
|
if err != nil {
|
||||||
|
return Reg{}, fmt.Errorf("Error compiling regex: %w", err)
|
||||||
|
}
|
||||||
|
return reg, nil
|
||||||
|
}
|
||||||
|
Reference in New Issue
Block a user