You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
kleingrep/regex/range2regex.go

140 lines
4.0 KiB
Go

package regex
import (
"fmt"
"math"
"slices"
"strconv"
"strings"
)
// numRange describes a run of consecutive integers by its two bounds.
// Both bounds are inclusive.
type numRange struct {
	start, end int // first and last value covered by the range
}
// floorPower10 returns the largest exponent e for which 10^e <= val, which is
// the number of decimal digits in val minus one (e.g. 999 -> 2, 1000 -> 3).
//
// The exponent is found with integer division instead of math.Log10, so the
// result does not depend on floating-point rounding at exact powers of ten.
// Values below 1 have no such exponent; 0 is returned for them.
func floorPower10(val int) int {
	exp := 0
	for val >= 10 {
		val /= 10
		exp++
	}
	return exp
}
// roundUpToNearest10Multiple rounds val up to a multiple of 10^exp.
//
// A non-zero val that already is such a multiple is returned unchanged.
// Zero is treated specially: it is rounded up to 10^exp rather than
// being left at 0.
func roundUpToNearest10Multiple(val int, exp int) int {
	step := int(math.Round(math.Pow10(exp)))
	rem := val % step
	if rem == 0 && val != 0 {
		// Already sitting on a multiple; nothing to add.
		return val
	}
	return val + (step - rem)
}
// roundDownToNearest10Multiple rounds val down to a multiple of 10^exp.
// A val that already is such a multiple is returned unchanged.
func roundDownToNearest10Multiple(val int, exp int) int {
	step := int(math.Round(math.Pow10(exp)))
	// Integer division truncates, so this drops the remainder modulo step.
	return (val / step) * step
}
// intToSlc splits val into its decimal digits, most significant first,
// e.g. 255 -> [2 5 5].
//
// Every character of the decimal string is converted by subtracting '0',
// so the sign of a negative value does not yield a digit: '-' becomes -3.
func intToSlc(val int) []int {
	text := strconv.Itoa(val)
	digits := make([]int, 0, len(text))
	for _, ch := range text {
		digits = append(digits, int(ch-'0'))
	}
	return digits
}
func range2regex(start int, end int) (string, error) {
rangeStart := start
rangeEnd := end
if rangeStart > rangeEnd {
return "", fmt.Errorf("numeric range start greater than range end")
}
ranges := make([]numRange, 0)
// If both numbers are in the same power of 10 eg. 15000 and 17000.
// the maximum power of 10 that we will go to, is determined by the largest
// power of 10 at which both numbers differ. Given 15000 and 17000, we will
// go up to 10^3, because that is the largestindex at which they differ.
startRangeSlc := intToSlc(rangeStart)
endRangeSlc := intToSlc(rangeEnd)
maxPower10 := 0
if len(startRangeSlc) != len(endRangeSlc) { // Different number of digits, so we will go up to the maximum (which must be rangeEnd)
maxPower10 = floorPower10(rangeEnd) // Maximum power of 10 that we will reach
} else {
maxPower10 = 0
for i := range startRangeSlc {
if startRangeSlc[i] != endRangeSlc[i] {
maxPower10 = len(startRangeSlc) - i - 1
break
}
}
}
tmp := rangeStart
exp := 1 // The exponent of 10 that we are finding the range to
// Increasing up to highest power
for exp <= maxPower10 {
tmpRangeEnd := roundUpToNearest10Multiple(tmp, exp)
if tmp != tmpRangeEnd {
ranges = append(ranges, numRange{tmp, tmpRangeEnd - 1})
}
tmp = tmpRangeEnd
exp++
}
exp--
// Decreasing down to lowest power
for exp >= 1 {
tmpRangeEnd := roundDownToNearest10Multiple(rangeEnd, exp)
if tmp != tmpRangeEnd {
ranges = append(ranges, numRange{tmp, tmpRangeEnd - 1})
}
tmp = tmpRangeEnd
exp--
}
// Last range - tmp to rangeEnd
ranges = append(ranges, numRange{tmp, rangeEnd})
regexSlice := make([]string, 0)
// Generate the regex
for _, rg := range ranges {
tmpStr := ""
tmpStr += string(nonCapLparenRune)
startSlc := intToSlc(rg.start)
endSlc := intToSlc(rg.end)
if len(startSlc) != len(endSlc) {
return "", fmt.Errorf("Error parsing numeric range")
}
for i := range startSlc {
if startSlc[i] == endSlc[i] {
tmpStr += string(rune(startSlc[i] + 48)) // '0' is ascii value 48, 1 is 49 etc. To convert the digit to its character form, we can just add 48.
} else {
tmpStr += fmt.Sprintf("%c%c-%c%c", lbracketRune, rune(startSlc[i]+48), rune(endSlc[i]+48), rbracketRune)
}
}
tmpStr += ")"
regexSlice = append(regexSlice, tmpStr)
}
// Each element of the slice represents one 'group'. Taking 0-255 as an example, the elements would be:
// 1. 0-9
// 2. 10-99
// 3. 100-199
// 4. 200-249
// 5. 250-255
//
// The reason this is reversed before joining it, is because it is incompatible with the PCRE rule for matching.
// The PCRE rule specifies that the left-branch of an alternation is preferred. Even though this engine uses the POSIX
// rule at the moment (which prefers the longest match regardless of the order of the alternation), reversing the string
// has no downsides. It doesn't affect POSIX matching, and it will reduce my burden if I decide to switch to PCRE matching.
slices.Reverse(regexSlice)
regex := string(nonCapLparenRune) + strings.Join(regexSlice, "|") + ")"
return regex, nil
}