package regex
import (
"fmt"
"slices"
"strconv"
"unicode"
)
// notDotChars holds every character that is _not_ matched by the dot metacharacter.
// shuntingYard resets it on each call: it is empty when RE_SINGLE_LINE is passed,
// and contains only '\n' otherwise.
var notDotChars []rune
// A Reg represents the result of compiling a regular expression. It contains
// the start state of the NFA representation of the regex, and the number of capturing
// groups in the regex.
type Reg struct {
	start     *nfaState // Start state of the NFA
	numGroups int       // Number of capturing groups; reported by NumSubexp
}
// NumSubexp reports how many sub-expressions the given [Reg] contains. This is
// the same value as its number of capturing groups.
func (r Reg) NumSubexp() int {
	groups := r.numGroups
	return groups
}
// concatRune is the explicit concatenation operator that shuntingYard inserts
// between adjacent operands. isOperator and priority treat it like any other
// operator. Its value (0xF0001) is above 0xF0000, the range this package uses
// for its own metacharacters.
const concatRune rune = 0xF0001
// ReFlag is a flag that controls the behavior of shuntingYard.
// Flags are passed as a variadic list and looked up with slices.Contains.
// The values are consecutive integers generated by iota, not bit masks,
// so they cannot be OR-ed together.
type ReFlag int

const (
	RE_NO_FLAGS         ReFlag = iota // No flags; this is the zero value
	RE_CASE_INSENSITIVE               // Case insensitive matching
	RE_MULTILINE                      // '^' and '$' assert at start and end of _line_, rather than start and end of input string
	RE_SINGLE_LINE                    // Dot metacharacter matches newline characters.
)
// isOperator reports whether c is one of the operators understood by the
// shunting-yard pass: '+', '?', '*', '|' or the explicit concatenation rune.
func isOperator(c rune) bool {
	switch c {
	case '+', '?', '*', '|', concatRune:
		return true
	default:
		return false
	}
}
// priority returns the precedence of the given operator as an index into the
// precedence table: a larger value binds more tightly. Runes that are not in
// the table (including parentheses) yield -1.
func priority(op rune) int {
	// Ordered from lowest to highest precedence.
	table := [...]rune{'|', concatRune, '+', '*', '?'}
	return slices.Index(table[:], op)
}
// getPOSIXClass extracts the name of the POSIX character class at the start of str.
// The given string must be of the form:
//
//	'blah1:]blah2'
//
// in order to be a _syntactically_ valid POSIX class, where the closing bracket
// is the RBRACKET metacharacter (rbracketRune), not a literal ']'.
// Whether or not such a class actually exists is not relevant to this function;
// it just parses and returns 'blah1'.
// For example, if the regex was something like '[[:digit:]]', the caller must parse
// through the opening brackets and the colon, and call this function with the remainder.
//
// The first return value is true if str starts with a syntactically valid POSIX
// class, in which case the second return value is the class name. Otherwise the
// function returns false and the empty string.
func getPOSIXClass(str []rune) (bool, string) {
	// Find the first colon or closing bracket; everything before it is the candidate name.
	end := 0
	for end < len(str) && str[end] != ':' && str[end] != rbracketRune {
		end++
	}
	// The terminator must be followed by at least one more rune (the closing bracket).
	if end+1 >= len(str) {
		return false, ""
	}
	// The name must be terminated by a colon immediately followed by a closing
	// bracket. A closing bracket that comes first does not count.
	if str[end] != ':' || str[end+1] != rbracketRune {
		return false, ""
	}
	return true, string(str[:end])
}
// Stores whether the case-insensitive flag has been enabled.
// shuntingYard resets it to false on every call, and sets it to true when
// RE_CASE_INSENSITIVE is among the flags.
var caseInsensitive bool

// Stores whether the multiline flag has been enabled.
// In multi-line mode, '^' and '$' assert position at the start and
// end of a _line_ rather than the entire input.
// shuntingYard resets it to false on every call, and sets it to true when
// RE_MULTILINE is among the flags.
var multilineMode bool
/ *
The Shunting - Yard algorithm is used to convert the given infix ( regeular ) expression to postfix .
The primary benefit of this is getting rid of parentheses .
It also inserts explicit concatenation operators to make parsing easier in Thompson ' s algorithm .
An error can be returned for a multitude of reasons - the reason is specified in the error string .
The function also takes in 0 or more flags , which control the behavior of the parser .
See : https : //blog.cernera.me/converting-regular-expressions-to-postfix-notation-with-the-shunting-yard-algorithm/
* /
func shuntingYard ( re string , flags ... ReFlag ) ( [ ] postfixNode , error ) {
// Check which flags are enabled
caseInsensitive = false
multilineMode = false
if slices . Contains ( flags , RE_MULTILINE ) {
multilineMode = true
}
if slices . Contains ( flags , RE_SINGLE_LINE ) {
notDotChars = [ ] rune { }
} else {
notDotChars = [ ] rune { '\n' }
}
if slices . Contains ( flags , RE_CASE_INSENSITIVE ) {
caseInsensitive = true
}
re_postfix := make ( [ ] rune , 0 )
// Convert the string to a slice of runes to allow iteration through it
re_runes_orig := [ ] rune ( re ) // This is the rune slice before the first parsing loop (which detects and replaces numeric ranges)
re_runes := make ( [ ] rune , 0 )
// The following checks are performed here:
// 1. Check for numeric range. If we are at the start of a numeric range,
// skip to end and construct the equivalent regex for the range.
// The reason this is outside the loop below, is that it actually modifies
// the given regex (we 'cut' the numeric range and 'paste' an equivalent regex).
// It also makes the overall parsing easier, since I don't have to worry about the numeric range
// anymore.
// Eventually, I might be able to add it into the main parsing loop, to reduce the time
// complexity.
// A numeric range has the syntax: <num1-num2>. Ir matches all numbers in this range.
//
// 2. Check for non-capturing groups. The LPAREN of a non-capturing group looks like this: '(?:'
// I take this out, and put in a special character - NONCAPLPAREN_CHAR.
//
// 3. Another check is made for unescaped brackets - opening brackets are replaced with
// LBRACKET and closing brackets are replaced with RBRACKET.
//
// 4. Check for escaped backslashes. Replace these with the BACKSLASH
// metacharacter. Later, in thompson(), these will be converted back. This avoids
// confusion in detecting whether a character is escaped eg. detecting
// whether '\\[a]' has an escaped opening bracket (it doesn't).
//
// 5. Check for non-greedy operators. These are not supported at the moment, so an error
// must be thrown if the user attempts to use a non-greedy operator.
for i := 0 ; i < len ( re_runes_orig ) ; i ++ {
c := re_runes_orig [ i ]
if c == '<' && ( i == 0 || ( re_runes_orig [ i - 1 ] != '\\' && re_runes_orig [ i - 1 ] != '?' ) ) {
i ++ // Step over opening angle bracket
tmpStr := ""
hyphenFound := false
for i < len ( re_runes_orig ) && re_runes_orig [ i ] != '>' {
if ! unicode . IsDigit ( re_runes_orig [ i ] ) {
if re_runes_orig [ i ] != '-' || ( hyphenFound ) {
return nil , fmt . Errorf ( "invalid numeric range" )
}
}
if re_runes_orig [ i ] == '-' {
hyphenFound = true
}
tmpStr += string ( re_runes_orig [ i ] )
i ++
}
// End of string reached and last character doesn't close the range
if i == len ( re_runes_orig ) && re_runes_orig [ len ( re_runes_orig ) - 1 ] != '>' {
return nil , fmt . Errorf ( "numeric range not closed" )
}
if len ( tmpStr ) == 0 {
return nil , fmt . Errorf ( "empty numeric range" )
}
// Closing bracket will be skipped when the loop variable increments
var rangeStart int
var rangeEnd int
fmt . Sscanf ( tmpStr , "%d-%d" , & rangeStart , & rangeEnd )
regex , err := range2regex ( rangeStart , rangeEnd )
if err != nil {
return nil , err
}
re_runes = append ( re_runes , [ ] rune ( regex ) ... )
} else if c == '(' && i < len ( re_runes_orig ) - 2 && re_runes_orig [ i + 1 ] == '?' && re_runes_orig [ i + 2 ] == ':' {
re_runes = append ( re_runes , nonCapLparenRune )
i += 2
} else if c == '\\' && i < len ( re_runes_orig ) - 1 && re_runes_orig [ i + 1 ] == '\\' { // Escaped backslash
re_runes = append ( re_runes , escBackslashRune )
i ++
} else if c == '[' && ( i == 0 || re_runes [ len ( re_runes ) - 1 ] != '\\' ) {
re_runes = append ( re_runes , lbracketRune )
continue
} else if c == ']' && ( i == 0 || re_runes [ len ( re_runes ) - 1 ] != '\\' ) {
re_runes = append ( re_runes , rbracketRune )
continue
} else if slices . Contains ( [ ] rune { '+' , '*' , '?' } , c ) && ( i < len ( re_runes_orig ) - 1 && re_runes_orig [ i + 1 ] == '?' ) {
return nil , fmt . Errorf ( "non-greedy operators are not supported" )
} else {
re_runes = append ( re_runes , c )
}
}
/ * Add concatenation operators .
Only add a concatenation operator between two characters if both the following conditions are met :
1. The first character isn ' t an opening parantheses or alteration operator ( or an escape character )
a . This makes sense , because these operators can ' t be _concatenated_ with anything else .
2. The second character isn ' t a ' closing operator ' - one that applies to something before it
a . Again , these operators can ' be concatenated _to_ . They can , however , be concatenated _from_ .
Caveats :
1. Don ' t mess with anything inside brackets - character class
2. Don ' t mess with anything inside braces - numeric repetition
3. Don ' t mess with any lookarounds .
* /
i := 0
for i < len ( re_runes ) {
re_postfix = append ( re_postfix , re_runes [ i ] )
if re_runes [ i ] == lbracketRune && ( i == 0 || re_runes [ i - 1 ] != '\\' ) { // We do not touch things inside brackets, unless they are escaped.
toAppend := make ( [ ] rune , 0 ) // Holds all the runes in the current character class
i ++ // Skip past LBRACKET, because it was already added
if i >= len ( re_runes ) { // Sanity check before we start
return nil , fmt . Errorf ( "opening bracket without closing bracket" )
}
for re_runes [ i ] != rbracketRune || i == 0 || re_runes [ i - 1 ] == '\\' { // Skip all characters inside _unescaped_ brackets (we are _not_ at a closing bracket, or if we are, the previous character is a backslash)
// Make sure we haven't exceeded the length of the string. If we did, then the regex doesn't actually have a closing bracket and we should throw an error.
if i >= len ( re_runes ) {
return nil , fmt . Errorf ( "opening bracket without closing bracket" )
}
if re_runes [ i ] == lbracketRune && re_runes [ i + 1 ] == ':' { // POSIX character class
toAppend = append ( toAppend , re_runes [ i ] )
i ++
toAppend = append ( toAppend , re_runes [ i ] )
i ++
for i < len ( re_runes ) - 1 && re_runes [ i ] != ':' && re_runes [ i ] != ']' {
toAppend = append ( toAppend , re_runes [ i ] )
i ++
}
if i >= len ( re_runes ) - 1 && re_runes [ i ] != ':' {
return nil , fmt . Errorf ( "unable to parse what looks like a POSIX character class" )
}
toAppend = append ( toAppend , re_runes [ i ] )
i ++
}
if re_runes [ i ] == '-' && ( i > 0 && re_runes [ i - 1 ] != '\\' ) && ( i < len ( re_runes ) - 1 && re_runes [ i + 1 ] != rbracketRune ) { // Unescaped hyphen, that has some character (not a RBRACKET) after it - This represents a character range, so we replace with CHAR_RANGE. This metacharacter will be used later on to construct the range
re_runes [ i ] = charRangeRune
}
toAppend = append ( toAppend , re_runes [ i ] )
i ++
}
// Add in the RBRACKET
toAppend = append ( toAppend , rbracketRune )
re_postfix = append ( re_postfix , toAppend ... )
}
if i < len ( re_runes ) && re_runes [ i ] == '{' && ( i > 0 && re_runes [ i - 1 ] != '\\' ) { // We don't touch things inside braces, either
i ++ // Skip opening brace
for i < len ( re_runes ) && re_runes [ i ] != '}' {
re_postfix = append ( re_postfix , re_runes [ i ] )
i ++
}
if i == len ( re_runes ) {
return nil , fmt . Errorf ( "invalid numeric specifier" )
}
re_postfix = append ( re_postfix , re_runes [ i ] ) // Append closing brace
}
if i < len ( re_runes ) - 3 && string ( re_runes [ i + 1 : i + 4 ] ) == "(?:" { // Non-capturing lparen
re_postfix = append ( re_postfix , nonCapLparenRune )
i += 3
}
if i < len ( re_runes ) && re_runes [ i ] == '\\' { // Something is being escaped (I don't add the backslash to re_postfix, because it was already added earlier)
i ++
if i >= len ( re_runes ) {
return nil , fmt . Errorf ( "stray backslash in expression" )
}
if re_runes [ i ] == 'x' {
re_postfix = append ( re_postfix , re_runes [ i ] )
i ++
if i >= len ( re_runes ) {
return nil , fmt . Errorf ( "stray backslash in expression" )
}
if re_runes [ i ] == '{' {
re_postfix = append ( re_postfix , re_runes [ i : i + 8 ] ... )
i += 7
if i >= len ( re_runes ) {
return nil , fmt . Errorf ( "stray backslash in expression" )
}
} else if isHex ( re_runes [ i ] ) {
re_postfix = append ( re_postfix , re_runes [ i : i + 2 ] ... )
i += 2
} else {
return nil , fmt . Errorf ( "invalid hex value in expression" )
}
} else if isOctal ( re_runes [ i ] ) {
numDigits := 1
for i + numDigits < len ( re_runes ) && numDigits < 3 && isOctal ( re_runes [ i + numDigits ] ) { // Skip while we see an octal character (max of 3)
numDigits ++
}
re_postfix = append ( re_postfix , re_runes [ i : i + numDigits ] ... )
i += ( numDigits - 1 ) // I have to move back a step, so that I can add a concatenation operator if necessary, and so that the increment at the bottom of the loop works as intended
} else {
re_postfix = append ( re_postfix , re_runes [ i ] )
}
}
if i < len ( re_runes ) && re_runes [ i ] == '(' && ( i == 0 || re_runes [ i - 1 ] != '\\' ) && ( i < len ( re_runes ) - 2 && re_runes [ i + 1 ] == '?' && slices . Contains ( [ ] rune { '=' , '!' , '<' } , re_runes [ i + 2 ] ) ) { // Unescaped open parentheses followed by question mark then '<', '!' or '=' => lokaround. Don't mess with it.
i ++ // Step inside
if i == len ( re_runes ) - 1 || ( re_runes [ i + 1 ] != '=' && re_runes [ i + 1 ] != '!' && re_runes [ i + 1 ] != '<' ) {
return nil , fmt . Errorf ( "invalid regex - lookaround intended?" )
}
re_postfix = append ( re_postfix , re_runes [ i ] )
i ++
numOpenParens := 1
for numOpenParens != 0 {
if i >= len ( re_runes ) {
return nil , fmt . Errorf ( "unclosed lookaround" )
}
if re_runes [ i ] == '(' || re_runes [ i ] == nonCapLparenRune {
numOpenParens ++
}
if re_runes [ i ] == ')' {
numOpenParens --
if numOpenParens == 0 {
break
}
}
re_postfix = append ( re_postfix , re_runes [ i ] )
i ++
}
continue
}
if i < len ( re_runes ) && ( re_runes [ i ] != '(' && re_runes [ i ] != nonCapLparenRune && re_runes [ i ] != '|' && re_runes [ i ] != '\\' ) || ( i > 0 && re_runes [ i - 1 ] == '\\' ) { // Every character should be concatenated if it is escaped
if i < len ( re_runes ) - 1 {
if re_runes [ i + 1 ] != '|' && re_runes [ i + 1 ] != '*' && re_runes [ i + 1 ] != '+' && re_runes [ i + 1 ] != '?' && re_runes [ i + 1 ] != ')' && re_runes [ i + 1 ] != '{' {
re_postfix = append ( re_postfix , concatRune )
}
}
}
i ++
}
opStack := make ( [ ] rune , 0 ) // Operator stack
outQueue := make ( [ ] postfixNode , 0 ) // Output queue
// Actual algorithm
numOpenParens := 0 // Number of open parentheses
for i := 0 ; i < len ( re_postfix ) ; i ++ {
/ * Two cases :
1. Current character is alphanumeric - send to output queue
2. Current character is operator - do the following :
a . If current character has greater priority than top of opStack , push to opStack .
b . If not , keep popping from opStack ( and appending to outQueue ) until :
i . opStack is empty , OR
ii . current character has greater priority than top of opStack
3. If current character is '(' or NONCAPLPAREN_CHAR , push to opStack
4. If current character is ')' , pop from opStack ( and append to outQueue ) until '(' is found . Discard parantheses .
5. If current character is '[' , find all the characters until ']' , then create a postfixNode containing all these contents . Add this node to outQueue .
6. If current character is '{' , find the appropriate numeric specifier ( range start , range end ) . Apply the range to the postfixNode at the end of outQueue .
* /
c := re_postfix [ i ]
if isNormalChar ( c ) || isSpecialCharWithMetacharReplacement ( c ) {
outQueue = append ( outQueue , newPostfixNode ( c ) )
continue
}
// Since every unescaped bracket is replaced by a LBRACKET / RBRACKET, there may
// have been false positives. For example, the regex ']' has a closing bracket, but it
// isn't denoting a character class; it's just a regular character. Since it's not escaped,
// though, I would have converted this into an RBRACKET.
// To deal with this, I make the following assertion:
// If at any point I see an RBRACKET 'in the wild' (not in a character class), then it must be
// a regular character, with no special significance.
if c == rbracketRune {
outQueue = append ( outQueue , newPostfixCharNode ( ']' ) )
continue
}
if c == '\\' { // Escape character - invert special and non-special characters eg. \( is treated as a literal parentheses, \b is treated as word boundary
if i == len ( re_postfix ) - 1 { // End of string - throw error, because backslash is an escape character (something needs to come after it)
return nil , fmt . Errorf ( "backslash with no escape character" )
}
i ++
if re_postfix [ i ] == 'x' { // Hex value
i ++
if re_postfix [ i ] == '{' && i < len ( re_postfix ) - 6 { // Expanded hex code
var hexVal int
n , err := fmt . Sscanf ( string ( re_postfix [ i : ] ) , "{%x}" , & hexVal )
if n < 1 || err != nil {
return nil , fmt . Errorf ( "error parsing expanded hex code in expression" )
}
outQueue = append ( outQueue , newPostfixCharNode ( rune ( hexVal ) ) )
i += 7
} else if i < len ( re_postfix ) - 1 { // Two-digit hex code
hexVal , err := strconv . ParseInt ( string ( [ ] rune { re_postfix [ i ] , re_postfix [ i + 1 ] } ) , 16 , 64 ) // Convert the two hex values into a rune slice, then to a string. Parse the string into an int with strconv.ParseInt()
if err != nil {
return nil , fmt . Errorf ( "error parsing hex characters in expression" )
}
i ++ // Loop increment will take care of going forward
outQueue = append ( outQueue , newPostfixCharNode ( rune ( hexVal ) ) )
} else {
return nil , fmt . Errorf ( "not enough hex characters found in expression" )
}
} else if isOctal ( re_postfix [ i ] ) { // Octal value
var octVal int64
var octValStr string
numDigitsParsed := 0
for ( i + numDigitsParsed ) < len ( re_postfix ) && isOctal ( re_postfix [ i + numDigitsParsed ] ) && numDigitsParsed <= 3 {
octValStr += string ( re_postfix [ i + numDigitsParsed ] )
numDigitsParsed ++
}
octVal , err := strconv . ParseInt ( octValStr , 8 , 32 )
if err != nil {
return nil , fmt . Errorf ( "error parsing octal value in expression" )
}
if octVal > 0777 {
return nil , fmt . Errorf ( "invalid octal value in expression" )
}
i += numDigitsParsed - 1 // Shift forward by the number of digits that were parsed. Move back one character, because the loop increment will move us back to the next character automatically
outQueue = append ( outQueue , newPostfixCharNode ( rune ( octVal ) ) )
} else {
escapedNode , err := newEscapedNode ( re_postfix [ i ] , false )
if err != nil {
return nil , err
}
outQueue = append ( outQueue , escapedNode )
}
continue // Escaped character will automatically be skipped when loop variable increments
}
if c == '.' { // Dot metacharacter - represents 'any' character, but I am only adding Unicode 0020-007E
outQueue = append ( outQueue , newPostfixDotNode ( ) )
continue
}
if c == '^' { // Start-of-string assertion
outQueue = append ( outQueue , newPostfixNode ( c ) )
}
if c == '$' { // End-of-string assertion
outQueue = append ( outQueue , newPostfixNode ( c ) )
}
// Check if we're at the start of a lookaround
if c == '(' && i < len ( re_postfix ) - 1 && re_postfix [ i + 1 ] == '?' {
i += 2 // Skip opening paren and question mark
regex := "" // Stores lookaround regex
numOpenParens := 1
for numOpenParens != 0 {
if i >= len ( re_postfix ) {
return nil , fmt . Errorf ( "unclosed lookaround" )
}
if re_postfix [ i ] == '(' || re_postfix [ i ] == nonCapLparenRune {
numOpenParens ++
}
if re_postfix [ i ] == ')' {
numOpenParens --
if numOpenParens == 0 {
break
}
}
regex += string ( re_postfix [ i ] )
i ++
}
if len ( regex ) <= 1 { // Nothing in regex - throw error
return nil , fmt . Errorf ( "invalid lookaround. (too short?)" )
}
// 'regex' should now contain the lookaround regex, plus the characters at the start (which indicate pos/neg, ahead/behind)
// Now we should filter that out.
toAppend := postfixNode { nodetype : assertionNode , startReps : 1 , endReps : 1 }
if regex [ 0 ] == '<' { // Lookbehind
toAppend . lookaroundDir = lookbehind
regex = regex [ 1 : ]
} else if regex [ 0 ] == '=' || regex [ 0 ] == '!' {
toAppend . lookaroundDir = lookahead
} else {
return nil , fmt . Errorf ( "invalid lookaround" )
}
// Positive or negative
if regex [ 0 ] == '=' { // Positive
toAppend . lookaroundSign = positive
toAppend . contents = [ ] rune ( regex [ 1 : ] )
} else if regex [ 0 ] == '!' { // Negative
toAppend . lookaroundSign = negative
toAppend . contents = [ ] rune ( regex [ 1 : ] )
} else {
return nil , fmt . Errorf ( "invalid lookaround" )
}
outQueue = append ( outQueue , toAppend )
continue
}
if isOperator ( c ) {
if len ( opStack ) == 0 {
opStack = append ( opStack , c )
} else {
topStack , err := peek ( opStack )
if err != nil {
return nil , fmt . Errorf ( "operator without operand" )
}
if priority ( c ) > priority ( topStack ) { // 2a
opStack = append ( opStack , c )
} else {
for priority ( c ) <= priority ( topStack ) { // 2b
to_append := mustPop ( & opStack )
outQueue = append ( outQueue , newPostfixNode ( to_append ) )
topStack , _ = peek ( opStack )
}
outQueueFinalElement , _ := peek ( outQueue )
if ( c == '*' && outQueueFinalElement . nodetype == kleeneNode ) || ( c == '+' && outQueueFinalElement . nodetype == plusNode ) { // You cannot apply a quantifier to a quantifier in this way
return nil , fmt . Errorf ( "illegal use of token '%c'" , c )
}
opStack = append ( opStack , c )
}
}
}
if c == lbracketRune { // Used for character classes
firstCharAdded := false // A character class must have at least 1 character. This flag checks if the first character has been added.
endOfRange := false // Set to 'true' when we encounter a CHAR_RANGE metacharacter
i ++ // Step forward so we can look at the character class
// Oops, there's nothing there to look at
if i >= len ( re_postfix ) {
return nil , fmt . Errorf ( "opening bracket with no closing bracket" )
}
// Check if a POSIX character class was specified ouside a bracket. This is an error.
// Eg. [:digit:] should lead to an error, telling the user that the right syntax is [[:digit:]]
if re_postfix [ i ] == ':' {
posixClassPresent , _ := getPOSIXClass ( re_postfix [ i + 1 : ] )
if posixClassPresent {
return nil , fmt . Errorf ( "the syntax for POSIX character classes is [[:digit:]], not [:digit:]" )
}
}
var invertMatch bool
if re_postfix [ i ] == '^' {
invertMatch = true
i ++
}
chars := make ( [ ] postfixNode , 0 ) // List of nodes - used only for character classes
for i < len ( re_postfix ) {
if firstCharAdded && re_postfix [ i ] == rbracketRune {
break
}
if re_postfix [ i ] == charRangeRune {
endOfRange = true
i ++
continue
}
if re_postfix [ i ] == '\\' { // Backslash indicates a character to be escaped
if i == len ( re_postfix ) - 1 {
return nil , fmt . Errorf ( "stray backslash in character class" )
}
i ++ // Step past backslash
if re_postfix [ i ] == 'x' { // Hex value
i ++
if re_postfix [ i ] == '{' && i < len ( re_postfix ) - 7 { // Expanded hex code
var hexVal int
n , err := fmt . Sscanf ( string ( re_postfix [ i : ] ) , "{%x}" , & hexVal )
if n < 1 || err != nil {
return nil , fmt . Errorf ( "error parsing expanded hex code in character class" )
}
chars = append ( chars , newPostfixCharNode ( rune ( hexVal ) ) )
i += 8
} else if i < len ( re_postfix ) - 2 { // Two-digit hex code
hexVal , err := strconv . ParseInt ( string ( [ ] rune { re_postfix [ i ] , re_postfix [ i + 1 ] } ) , 16 , 64 ) // Convert the two hex values into a rune slice, then to a string. Parse the string into an int with strconv.ParseInt()
if err != nil {
return nil , fmt . Errorf ( "error parsing hex characters in character class" )
}
i += 2
chars = append ( chars , newPostfixCharNode ( rune ( hexVal ) ) )
} else {
return nil , fmt . Errorf ( "not enough hex characters found in character class" )
}
} else if isOctal ( re_postfix [ i ] ) { // Octal value
var octVal int64
var octValStr string
numDigitsParsed := 0
for ( i + numDigitsParsed ) < len ( re_postfix ) - 1 && isOctal ( re_postfix [ i + numDigitsParsed ] ) && numDigitsParsed <= 3 { // The '-1' exists, because even in the worst case (the character class extends till the end), the last character must be a closing bracket (and nothing else)
octValStr += string ( re_postfix [ i + numDigitsParsed ] )
numDigitsParsed ++
}
octVal , err := strconv . ParseInt ( octValStr , 8 , 32 )
if err != nil {
return nil , fmt . Errorf ( "error parsing octal value in character class" )
}
if octVal > 0777 {
return nil , fmt . Errorf ( "invalid octal value in character class" )
}
i += numDigitsParsed // Shift forward by the number of characters parsed
chars = append ( chars , newPostfixCharNode ( rune ( octVal ) ) )
} else {
escapedNode , err := newEscapedNode ( re_postfix [ i ] , true )
if err != nil {
return nil , err
}
chars = append ( chars , escapedNode )
i ++
}
} else {
if re_postfix [ i ] == lbracketRune && i < len ( re_postfix ) - 8 { // Could be the start of a POSIX class - the smallest POSIX class by word-length [[:word:]] takes 8 more characters
temp_i := i
temp_i ++
if re_postfix [ temp_i ] == ':' {
temp_i ++
posixClassPresent , posixClass := getPOSIXClass ( re_postfix [ temp_i : ] )
// getPOSIXClass returns true if there is some set of characters that
// ends in a colon and then a closing bracket. If this is not the case, we
// just treat all the characters as literals.
// For example, [[:digit:a]] is _not_ a POSIX class, its just a regular
// character class contains the letters '[', ':', 'd', 'i', 'g', 'i', 't', ':', 'a'.
// The final 'closing bracket' has no special meaning, its just another character.
if posixClassPresent {
var nodeToAdd postfixNode
switch posixClass {
case "digit" : // Equivalent to '\d'
nodeToAdd = newPostfixCharNode ( genRangeInclusive ( '0' , '9' ) ... )
case "upper" : // [A-Z]
charsToAdd := genRangeInclusive ( 'A' , 'Z' )
nodeToAdd = newPostfixCharNode ( charsToAdd ... )
case "lower" : // [a-z]
charsToAdd := genRangeInclusive ( 'a' , 'z' )
nodeToAdd = newPostfixCharNode ( charsToAdd ... )
case "alpha" : //[A-Za-z]
nodeToAdd = newPostfixCharNode ( slices . Concat ( genRangeInclusive ( 'A' , 'Z' ) , genRangeInclusive ( 'a' , 'z' ) ) ... )
case "xdigit" : // [0-9A-Fa-f]
nodeToAdd = newPostfixCharNode ( slices . Concat ( genRangeInclusive ( 'A' , 'F' ) , genRangeInclusive ( 'a' , 'f' ) , genRangeInclusive ( '0' , '9' ) ) ... )
case "alnum" : // [A-Za-z0-9]
nodeToAdd = newPostfixCharNode ( slices . Concat ( genRangeInclusive ( 'A' , 'Z' ) , genRangeInclusive ( 'a' , 'z' ) , genRangeInclusive ( '0' , '9' ) ) ... )
case "blank" : // [ \t]
nodeToAdd = newPostfixCharNode ( ' ' , '\t' )
case "space" : // [ \t\n\r\f\v]
nodeToAdd = newPostfixCharNode ( ' ' , '\t' , '\n' , '\r' , '\f' , '\v' )
case "cntrl" : // Control characters
nodeToAdd = newPostfixCharNode ( append ( genRangeInclusive ( '\x00' , '\x1F' ) , '\x7F' ) ... )
case "punct" : // Punctuation and symbols
nodeToAdd = newPostfixCharNode ( [ ] rune ( ` !"\#$%&'()*+,\-./:;<=>?@\[\\\]^_ ` + "`" + ` { |}~ ` ) ... )
case "graph" : // Graphic characters
nodeToAdd = newPostfixCharNode ( genRangeInclusive ( '\x21' , '\x7E' ) ... )
case "print" : // Graphic characters + space
nodeToAdd = newPostfixCharNode ( genRangeInclusive ( '\x20' , '\x7E' ) ... )
case "ascii" : // ASCII values
nodeToAdd = newPostfixCharNode ( genRangeInclusive ( '\x00' , '\x7F' ) ... )
case "word" : // Word characters
nodeToAdd , _ = newEscapedNode ( 'w' , true ) // This isn't going to error, so I suppress it
default :
return nil , fmt . Errorf ( "invalid POSIX character class" )
}
chars = append ( chars , nodeToAdd )
i = temp_i + len ( posixClass ) + 2 // Skip over the class name, the closing colon and the closing bracket
firstCharAdded = true
continue
}
}
}
// This used to be an else statement - I removed it, because if the previous if-block fails
// (ie. if it didn't actually find a character class), then this block must still execute.
// However, the checks for character classes are nested, so placing this inside an 'else' block
// will prevent it from running, as the outer if-statement will have evaluated to true.
if ! firstCharAdded && re_postfix [ i ] > 0xF0000 { // It's a metacharacter that I defined, I'll have to convert it back to the regular character before adding it back, because I haven't added any characters yet. For example, '[[]', the second LBRACKET should be treated like a literal bracket.
switch re_postfix [ i ] {
case lbracketRune :
chars = append ( chars , newPostfixCharNode ( '[' ) )
case rbracketRune :
chars = append ( chars , newPostfixCharNode ( ']' ) )
default :
return nil , fmt . Errorf ( "error parsing high-range unicode value in character class" )
}
}
chars = append ( chars , newPostfixCharNode ( re_postfix [ i ] ) )
i ++
}
firstCharAdded = true
if endOfRange { // The previous character was an unescaped hyphen, which (in the context of a character class) means the character that was last appended is the end of a character range
// Things to note:
// 1. In PCRE and Go's regex engine, a letter _can_ be surrounded by hyphens in a character class.
// Eg. [a-b-c]
// While you might think this leads to a syntax error (I thought so), the engine picks 'a-b' as a range,
// then treats the second '-' and 'c' as regular characters in the character class.
// So this regex becomes "Match a character from 'a' to 'b', a literal hyphen, or 'c' ".
// 2. To account for this, the following logic is followed:
// a. If the second-to-last postfixNode ie. the start of the range has only one element, then we are in a range.
// i. If it has more than one element, then we are actually looking at a literal hyphen, and we will treat is as such.
// ii. If either the start or end of the range don't exist in 'chars' ie. something like [-a] or [a-], then too will we treat it as a literal hyphen.
// b. The last postfixNode added to 'chars' _must_ only have one character (because it's the end of the range).
endRangePostfixNode , err1 := pop ( & chars )
startRangePostfixNode , err2 := pop ( & chars )
if ( err1 != nil || err2 != nil ) || len ( startRangePostfixNode . contents ) != 1 { // Treat it as a regular hyphen
chars = append ( chars , startRangePostfixNode , newPostfixCharNode ( '-' ) , endRangePostfixNode )
} else if len ( endRangePostfixNode . contents ) != 1 { // I don't even know what this would look like, this is just a sanity check
return nil , fmt . Errorf ( "error parsing character range" )
} else {
// We have established that they both have a length of 1
startRangeRune := startRangePostfixNode . contents [ 0 ]
endRangeRune := endRangePostfixNode . contents [ 0 ]
if startRangeRune > endRangeRune {
return nil , fmt . Errorf ( "character range syntax is [a-b], not [b-a]" )
}
chars = append ( chars , newPostfixCharNode ( genRangeInclusive ( startRangeRune , endRangeRune ) ... ) )
}
endOfRange = false // Reset the flag
}
}
if i == len ( re_postfix ) { // We have reached the end of the string, so we didn't encounter a closing brakcet. Throw error.
return nil , fmt . Errorf ( "opening bracket without closing bracket" )
}
outQueue = append ( outQueue , newCharClassNode ( chars , invertMatch ) )
continue
}
if c == '{' {
i ++ // Skip opening brace
// Three possibilities:
// 1. Single number - {5}
// 2. Range - {3,5}
// 3. Start with no end, {3,}
startRange := make ( [ ] rune , 0 )
startRangeNum := 0
endRange := make ( [ ] rune , 0 )
endRangeNum := 0
for i < len ( re_postfix ) && unicode . IsDigit ( re_postfix [ i ] ) {
startRange = append ( startRange , re_postfix [ i ] )
i ++
}
if len ( startRange ) == 0 { // {} is not valid, neither is {,5}
return nil , fmt . Errorf ( "invalid numeric specifier" )
}
if i == len ( re_postfix ) {
return nil , fmt . Errorf ( "brace not closed" )
}
startRangeNum , err := strconv . Atoi ( string ( startRange ) )
if err != nil {
return nil , fmt . Errorf ( "invalid numeric range" )
}
if re_postfix [ i ] == '}' { // Case 1 above
endRangeNum = startRangeNum
} else {
if re_postfix [ i ] != ',' {
return nil , fmt . Errorf ( "invalid numeric specifier" )
}
i ++ // Skip comma
for i < len ( re_postfix ) && unicode . IsDigit ( re_postfix [ i ] ) {
endRange = append ( endRange , re_postfix [ i ] )
i ++
}
if i == len ( re_postfix ) {
return nil , fmt . Errorf ( "brace not closed" )
}
if re_postfix [ i ] != '}' {
return nil , fmt . Errorf ( "invalid start range for numeric specifier" )
}
if len ( endRange ) == 0 { // Case 3 above
endRangeNum = infinite_reps
} else { // Case 2 above
var err error
endRangeNum , err = strconv . Atoi ( string ( endRange ) )
if err != nil {
return nil , fmt . Errorf ( "invalid end range for numeric specifier" )
}
}
}
idx := len ( outQueue ) - 1
// Get the last added node
if idx < 0 || outQueue [ idx ] . nodetype == lparenNode {
return nil , fmt . Errorf ( "numeric specifier with no content" )
}
outQueue [ idx ] . startReps = startRangeNum
outQueue [ idx ] . endReps = endRangeNum
}
if c == '(' || c == nonCapLparenRune {
opStack = append ( opStack , c )
if c == '(' { // We only push _capturing_ group parentheses to outQueue
outQueue = append ( outQueue , newPostfixNode ( c ) )
}
numOpenParens ++
}
if c == ')' {
// Keep popping from opStack until we encounter an opening parantheses or a NONCAPLPAREN_CHAR. Throw error if we reach the end of the stack.
var val rune
var err error
for val , err = peek ( opStack ) ; val != '(' && val != nonCapLparenRune ; val , err = peek ( opStack ) {
if err != nil {
return nil , fmt . Errorf ( "imbalanced parantheses" )
}
to_append := mustPop ( & opStack )
outQueue = append ( outQueue , newPostfixNode ( to_append ) )
}
_ = mustPop ( & opStack ) // Get rid of opening parentheses
if val == '(' { // Whatever was inside the parentheses was a _capturing_ group, so we append the closing parentheses as well
outQueue = append ( outQueue , newPostfixNode ( ')' ) ) // Add closing parentheses
}
numOpenParens --
}
}
// Pop all remaining operators (and append to outQueue)
for len ( opStack ) > 0 {
to_append := mustPop ( & opStack )
outQueue = append ( outQueue , newPostfixNode ( to_append ) )
}
if numOpenParens != 0 {
return nil , fmt . Errorf ( "imbalanced parantheses" )
}
return outQueue , nil
}
// thompson implements Thompson's construction: it walks the given postfix node
// sequence and assembles an NFA from it, using a stack of partially-built NFAs.
//
// It returns a Reg holding the start state of the resulting NFA and the number
// of capturing groups encountered. A non-nil error is returned if the node
// sequence cannot be reduced to exactly one NFA - for example, when an operator
// is missing an operand, the parentheses are imbalanced, or a lookaround fails
// to compile.
//
// Operands are always taken off the stack with pop (never mustPop), so that a
// missing operand is reported to the caller as an error.
func thompson(re []postfixNode) (Reg, error) {
	nfa := make([]*nfaState, 0) // Stack of states
	numGroups := 0              // Number of capturing groups

	// If thompson() receives an empty regex, then whatever was given to shuntingYard()
	// was parsed away. This doesn't mean that the regex itself is empty.
	// For example, it could have been '(?:)'. This is an empty non-capturing group. Since
	// shuntingYard() doesn't include non-capturing groups in its output (and the group contains
	// nothing), the output of shuntingYard() (and the input to thompson()) ends up being empty.
	// In these cases, we will return an NFA with 1 state, with an assertion that is always true.
	if len(re) == 0 {
		start := zeroLengthMatchState()
		nfa = append(nfa, &start)
	}

	for _, c := range re {
		if c.nodetype == characterNode || c.nodetype == assertionNode {
			stateToAdd := nfaState{}
			stateToAdd.transitions = make(map[int][]*nfaState)
			if c.allChars {
				stateToAdd.allChars = true
				if len(c.except) != 0 {
					// For each node that I am 'excepting' (eg. in an inverted character class):
					// - If the node itself has exceptions, then the exceptions cancel out.
					//   Eg. [^\w] == [\W]
					// - Since an allChars node is the only kind that _can_ have exceptions, that's what I check for.
					// - If the node doesn't have exceptions (allChars == false) then the contents of the node are added to the except list.
					for _, node := range c.except {
						if node.allChars {
							stateToAdd.allChars = false
							// For each postfixNode in node.except, extract the contents of the postfixNode. Concatenate them all,
							// and add them to the state's _content_. As mentioned above, if the exception has exceptions, then we
							// can match those.
							nodeExceptChars := slices.Concat(funcMap(node.except, func(exc postfixNode) []rune {
								excContents := exc.contents
								if caseInsensitive {
									excContents = slices.Concat(funcMap(excContents, func(r rune) []rune {
										return allCases(r, caseInsensitive)
									})...)
								}
								return excContents
							})...)
							stateToAdd.content = rune2Contents(nodeExceptChars)
						} else {
							charsToAdd := node.contents
							if caseInsensitive {
								charsToAdd = slices.Concat(funcMap(charsToAdd, func(r rune) []rune {
									return allCases(r, caseInsensitive)
								})...)
							}
							stateToAdd.except = append(stateToAdd.except, charsToAdd...)
						}
					}
				}
			}
			// Convert the current contents to []int, convert the result of rune2Contents to []int, append, then
			// convert back to stateContents.
			runesToAdd := c.contents
			if caseInsensitive {
				runesToAdd = slices.Concat(funcMap(runesToAdd, func(r rune) []rune {
					return allCases(r, caseInsensitive)
				})...)
			}
			stateToAdd.content = stateContents(append([]int(stateToAdd.content), []int(rune2Contents(runesToAdd))...))
			stateToAdd.output = make([]*nfaState, 0)
			stateToAdd.output = append(stateToAdd.output, &stateToAdd)
			stateToAdd.isEmpty = false
			if c.nodetype == assertionNode {
				// This is a little weird. A lookaround has the 'isEmpty' flag set, even though it _isn't_ empty
				// (the contents are the regex). But, there's so much error-checking that relies on this flag that
				// it's better to keep it this way.
				stateToAdd.isEmpty = true
				// Ideally, an assertion shouldn't have any content, since it doesn't say anything about the content of the string.
				stateToAdd.content = newContents(epsilon)
				if c.lookaroundDir == 0 || c.lookaroundSign == 0 {
					// Plain assertion: the first rune of the node's contents selects the kind.
					if len(c.contents) == 0 {
						return Reg{}, fmt.Errorf("assertion with no content")
					}
					switch c.contents[0] {
					case '^':
						stateToAdd.assert = sosAssert
					case '$':
						stateToAdd.assert = eosAssert
					case 'b':
						stateToAdd.assert = wboundAssert
					case 'B':
						stateToAdd.assert = nonwboundAssert
					case 'A':
						stateToAdd.assert = soiAssert
					case 'z':
						stateToAdd.assert = eoiAssert
					}
				} else { // Lookaround
					stateToAdd.lookaroundRegex = string(c.contents)
					if c.lookaroundDir == lookahead {
						if c.lookaroundSign == positive {
							stateToAdd.assert = plaAssert
						}
						if c.lookaroundSign == negative {
							stateToAdd.assert = nlaAssert
						}
					}
					if c.lookaroundDir == lookbehind {
						if c.lookaroundSign == positive {
							stateToAdd.assert = plbAssert
						}
						if c.lookaroundSign == negative {
							stateToAdd.assert = nlbAssert
						}
					}
					// The lookaround's own regex is parsed and compiled recursively; the resulting
					// NFA is stored on the assertion state.
					tmpRe, err := shuntingYard(stateToAdd.lookaroundRegex)
					if err != nil {
						return Reg{}, fmt.Errorf("error parsing lookaround: %w", err)
					}
					reg, err := thompson(tmpRe)
					if err != nil {
						return Reg{}, fmt.Errorf("error compiling lookaround: %w", err)
					}
					stateToAdd.lookaroundNFA = reg.start
					stateToAdd.lookaroundNumCaptureGroups = reg.numGroups
				}
			}
			// Replace ESC_BACKSLASH with actual backslash, so that we can actually check if we encounter it
			replaceByValue([]int(stateToAdd.content), int(escBackslashRune), '\\')
			replaceByValue(stateToAdd.except, escBackslashRune, '\\')
			nfa = append(nfa, &stateToAdd)
		}

		if c.nodetype == lparenNode || c.nodetype == rparenNode {
			s := &nfaState{}
			s.assert = noneAssert
			s.content = newContents(epsilon)
			s.isEmpty = true
			s.output = make([]*nfaState, 0)
			s.output = append(s.output, s)
			s.transitions = make(map[int][]*nfaState)
			// LPAREN nodes are just added normally
			if c.nodetype == lparenNode {
				numGroups++
				s.groupBegin = true
				s.groupNum = numGroups
				nfa = append(nfa, s)
				continue
			}
			// Only RPAREN nodes reach this point.
			// For RPAREN nodes, I assume that the last two nodes in the list are an LPAREN,
			// and then some other node.
			// These three nodes (LPAREN, the middle node and RPAREN) are extracted together, concatenated
			// and added back in.
			// If the middle node doesn't exist (ie. something like '()' ), that's fine, I just connect the LPAREN
			// and RPAREN nodes.
			// If neither node exists, that's a problem so I return an error.
			s.groupEnd = true
			middleNode, err1 := pop(&nfa)
			// Named openState (not lparenNode) so that it doesn't shadow the lparenNode node-type constant.
			openState, err2 := pop(&nfa)
			if err1 != nil && err2 != nil {
				return Reg{}, fmt.Errorf("imbalanced parentheses")
			} else if err2 != nil { // There was no third node. ie. something like '()'
				openState = middleNode
				if !openState.groupBegin { // There are only two nodes, but the first one isn't an LPAREN.
					return Reg{}, fmt.Errorf("imbalanced parentheses")
				}
				s.groupNum = openState.groupNum
				nfa = append(nfa, concatenate(openState, s))
			} else {
				// At this point, we assume all three nodes are valid ('openState', 'middleNode' and 's')
				if openState.groupBegin {
					s.groupNum = openState.groupNum
				} else if middleNode.groupBegin { // Something like 'a()'
					s.groupNum = middleNode.groupNum
				} else { // A middleNode and openState exist, but neither is actually an LPAREN.
					return Reg{}, fmt.Errorf("imbalanced parentheses")
				}
				tmp := concatenate(openState, middleNode)
				nfa = append(nfa, concatenate(tmp, s))
			}
			// No 'continue' here: an RPAREN node may carry a numeric specifier, which is handled below.
		}

		if c.nodetype == charclassNode { // A Character class consists of all the nodes in it, alternated
			// Map the list of nodes to a list of states, each state containing the contents of a specific node
			states := funcMap(c.nodeContents, func(node postfixNode) *nfaState {
				s := newState()
				nodeContents := node.contents
				if caseInsensitive {
					nodeContents = slices.Concat(funcMap(nodeContents, func(r rune) []rune {
						return allCases(r, caseInsensitive)
					})...)
				}
				s.content = rune2Contents(nodeContents)
				if len(node.except) > 0 {
					s.allChars = true
					s.except = slices.Concat(funcMap(node.except, func(n postfixNode) []rune {
						return n.contents
					})...)
				}
				return &s
			})
			// Reduce the list of states down to a single state by alternating them
			toAdd := funcReduce(states, func(s1 *nfaState, s2 *nfaState) *nfaState {
				return alternate(s1, s2)
			})
			nfa = append(nfa, toAdd)
		}

		// Must be an operator if it isn't a character
		switch c.nodetype {
		case concatenateNode:
			s2, err := pop(&nfa)
			if err != nil {
				return Reg{}, fmt.Errorf("error applying concatenation")
			}
			// Relax the requirements for concatenation a little bit - If
			// the second element is not found ie. the postfixNodes look
			// like 'a'+CONCAT, then that's fine, we just skip the concatenation.
			s1, err := pop(&nfa)
			if err != nil {
				nfa = append(nfa, s2)
			} else {
				s1 = concatenate(s1, s2)
				nfa = append(nfa, s1)
			}
		case kleeneNode: // Create a 0-state, concat the popped state after it, concat the 0-state after the popped state
			s1, err := pop(&nfa)
			if err != nil {
				return Reg{}, fmt.Errorf("error applying kleene star")
			}
			stateToAdd, err := kleene(*s1)
			if err != nil {
				return Reg{}, err
			}
			nfa = append(nfa, stateToAdd)
		case plusNode: // a+ is equivalent to aa*
			s1, err := pop(&nfa)
			if err != nil {
				return Reg{}, fmt.Errorf("error applying plus operator")
			}
			s2, err := kleene(*s1)
			if err != nil {
				return Reg{}, err
			}
			s1 = concatenate(s1, s2)
			nfa = append(nfa, s1)
		case questionNode: // ab? is equivalent to a(b|)
			s1, err := pop(&nfa)
			if err != nil {
				return Reg{}, fmt.Errorf("error applying question operator")
			}
			s2 := question(s1)
			nfa = append(nfa, s2)
		case pipeNode:
			// A pipe operator doesn't actually need either operand to be present. If an operand isn't present,
			// it is replaced with an implicit 'matchZeroLength' state (this is the same thing that we add at the top if our
			// input has zero postfixNodes).
			// Things to think about:
			// 	'a|'
			// 	'|a'
			// 	'^a|'
			// 	'^|a'
			s1, err1 := pop(&nfa)
			s2, err2 := pop(&nfa)
			if err2 != nil || (s2.groupBegin && len(s2.transitions) == 0) { // Doesn't exist, or its just an LPAREN
				if err2 == nil { // Roundabout way of saying that this node existed, but it was an LPAREN, so we append it back
					nfa = append(nfa, s2)
				}
				tmp := zeroLengthMatchState()
				s2 = &tmp
			}
			if err1 != nil || (s1.groupBegin && len(s1.transitions) == 0) { // Doesn't exist, or its just an LPAREN
				if err1 == nil { // See above for explanation
					nfa = append(nfa, s1)
				}
				tmp := zeroLengthMatchState()
				s1 = &tmp
			}
			s3 := alternate(s1, s2)
			nfa = append(nfa, s3)
		}

		if c.startReps != 1 || c.endReps != 1 { // Must have a numeric specifier attached to it
			if c.endReps != -1 && c.endReps < c.startReps {
				return Reg{}, fmt.Errorf("numeric specifier - start greater than end")
			}
			poppedState, err := pop(&nfa)
			if err != nil {
				return Reg{}, fmt.Errorf("numeric specifier with no content")
			}
			var stateToAdd *nfaState
			// Take advantage of the following facts:
			// 	a{5} == aaaaa
			// 	a{3,5} == aaaa?a?
			// 	a{5,} == aaaaa+
			// Nov. 3 2024 - I have two choices on how I want to implement numeric
			// specifiers.
			// a. Encode the logic while creating the states. I will have to create a function
			// that creates a deep-copy of a given state / NFA, so that I can concatenate them to
			// each other (concatenating them with the 'concatenate' method - which takes addresses - does
			// not work). Creating this function might be a lot of work.
			// b. Encode the logic while parsing the string (shunting-yard). If I can expand the numeric specifier
			// at this point, I can leave thompson untouched.
			//
			// The code below follows approach (a), using cloneState for the copies.
			//
			// NOTE(review): when startReps and endReps are both 0, neither loop below runs and
			// stateToAdd is still nil when it is appended - confirm that this case is rejected or
			// handled elsewhere.
			for i := 0; i < c.startReps; i++ { // Case 1
				stateToAdd = concatenate(stateToAdd, cloneState(poppedState))
			}
			if c.endReps == infinite_reps { // Case 3
				s2, err := kleene(*poppedState)
				if err != nil {
					return Reg{}, err
				}
				stateToAdd = concatenate(stateToAdd, s2)
			} else { // Case 2
				for i := c.startReps; i < c.endReps; i++ {
					stateToAdd = concatenate(stateToAdd, question(cloneState(poppedState)))
				}
			}
			nfa = append(nfa, stateToAdd)
		}
	}

	// A well-formed regex reduces to exactly one NFA on the stack.
	if len(nfa) != 1 {
		return Reg{}, fmt.Errorf("invalid regex")
	}

	verifyLastStates(nfa)

	return Reg{start: nfa[0], numGroups: numGroups}, nil
}
// Compile turns the given regular expression into a [Reg].
//
// The regex is first parsed into postfix nodes by shuntingYard, and those nodes
// are then assembled into an NFA by thompson. A failure in either stage is
// wrapped and returned; a non-nil error therefore means the regex was invalid,
// and its message describes the nature of the problem.
//
// flags is a sequence of zero or more [ReFlag] values that modify the behavior
// of the regex.
func Compile(re string, flags ...ReFlag) (Reg, error) {
	postfix, parseErr := shuntingYard(re, flags...)
	if parseErr != nil {
		return Reg{}, fmt.Errorf("error parsing regex: %w", parseErr)
	}

	compiled, buildErr := thompson(postfix)
	if buildErr != nil {
		return Reg{}, fmt.Errorf("error compiling regex: %w", buildErr)
	}

	return compiled, nil
}
// MustCompile behaves like [Compile], except that it panics with the returned
// error instead of handing it back to the caller.
func MustCompile(re string, flags ...ReFlag) Reg {
	compiled, err := Compile(re, flags...)
	if err == nil {
		return compiled
	}
	panic(err)
}