@ -7,6 +7,29 @@ import (
"unicode"
"unicode"
)
)
// A Reg represents the result of compiling a regular expression. It contains
// the startState of the NFA representation of the regex, and the number of capturing
// groups in the regex.
//
// Field order matters: thompson constructs this type with an unkeyed literal
// (start state first, group count second).
type Reg struct {
// start is the entry state of the compiled NFA.
start * State
// numGroups is the number of capturing groups counted while compiling.
numGroups int
}
// CONCAT is the rune treated as an operator alongside '+', '?', '*' and '|'
// by isOperator and priority.
// NOTE(review): presumably this stands for explicit concatenation inserted
// while preprocessing the regex — confirm against shuntingYard.
const CONCAT rune = '~'
// isOperator reports whether c is one of the operator runes:
// '+', '?', '*', '|' or CONCAT.
func isOperator(c rune) bool {
	switch c {
	case '+', '?', '*', '|', CONCAT:
		return true
	default:
		return false
	}
}
// priority returns the index of op within the precedence list
// '|', CONCAT, '+', '*', '?'. A rune that does not appear in the list
// yields -1, which is what slices.Index reports for a missing element.
func priority(op rune) int {
	ranking := []rune{
		'|',
		CONCAT,
		'+',
		'*',
		'?',
	}
	return slices.Index(ranking, op)
}
/ *
/ *
The Shunting-Yard algorithm is used to convert the given infix (regular) expression to postfix.
The Shunting-Yard algorithm is used to convert the given infix (regular) expression to postfix.
The primary benefit of this is getting rid of parentheses .
The primary benefit of this is getting rid of parentheses .
@ -416,7 +439,7 @@ func shuntingYard(re string) ([]postfixNode, error) {
// Thompson's algorithm. Constructs Finite-State Automaton from given string.
// Thompson's algorithm. Constructs Finite-State Automaton from given string.
// Returns start state and number of groups in regex.
// Returns start state and number of groups in regex.
func thompson ( re [ ] postfixNode ) ( * State , int ) {
func thompson ( re [ ] postfixNode ) ( Reg , error ) {
nfa := make ( [ ] * State , 0 ) // Stack of states
nfa := make ( [ ] * State , 0 ) // Stack of states
numGroups := 0 // Number of capturing groups
numGroups := 0 // Number of capturing groups
for _ , c := range re {
for _ , c := range re {
@ -465,10 +488,16 @@ func thompson(re []postfixNode) (*State, int) {
state . assert = NLB
state . assert = NLB
}
}
}
}
tmpRe := shuntingYard ( state . lookaroundRegex )
tmpRe , err := shuntingYard ( state . lookaroundRegex )
var numGroupsLookaround int
if err != nil {
state . lookaroundNFA , numGroupsLookaround = thompson ( tmpRe )
return Reg { } , fmt . Errorf ( "Error parsing lookaround: %w" , err )
state . lookaroundNumCaptureGroups = numGroupsLookaround
}
reg , err := thompson ( tmpRe )
if err != nil {
return Reg { } , fmt . Errorf ( "Error compiling lookaround: %w" , err )
}
state . lookaroundNFA = reg . start
state . lookaroundNumCaptureGroups = reg . numGroups
}
}
}
}
@ -533,7 +562,7 @@ func thompson(re []postfixNode) (*State, int) {
}
}
if c . startReps != 1 || c . endReps != 1 { // Must have a numeric specifier attached to it
if c . startReps != 1 || c . endReps != 1 { // Must have a numeric specifier attached to it
if c . endReps != - 1 && c . endReps < c . startReps {
if c . endReps != - 1 && c . endReps < c . startReps {
panic ( "ERROR: Numeric specifier - start greater than end.")
return Reg { } , fmt . Errorf ( " Numeric specifier - start greater than end.")
}
}
state := mustPop ( & nfa )
state := mustPop ( & nfa )
var stateToAdd * State = nil
var stateToAdd * State = nil
@ -564,11 +593,26 @@ func thompson(re []postfixNode) (*State, int) {
}
}
}
}
if len ( nfa ) != 1 {
if len ( nfa ) != 1 {
panic ( "ERROR: Invalid Regex.")
return Reg { } , fmt . Errorf ( " Invalid Regex.")
}
}
verifyLastStates ( nfa )
verifyLastStates ( nfa )
return nfa [ 0 ] , numGroups
return Reg { nfa [ 0 ] , numGroups } , nil
}
// Compile turns the regular expression re into a Reg suitable for use with the
// matching functions. It first converts re to postfix form with shuntingYard
// and then builds the NFA with thompson.
//
// If either step fails, Compile returns the zero Reg together with an error
// wrapping the underlying failure. The error must therefore be checked before
// the returned Reg is used.
func Compile(re string) (Reg, error) {
	postfix, parseErr := shuntingYard(re)
	if parseErr != nil {
		return Reg{}, fmt.Errorf("Error parsing regex: %w", parseErr)
	}

	compiled, buildErr := thompson(postfix)
	if buildErr != nil {
		return Reg{}, fmt.Errorf("Error compiling regex: %w", buildErr)
	}

	return compiled, nil
}
}