3 Commits

3 changed files with 122 additions and 0 deletions

View File

@@ -42,6 +42,38 @@ func priority(op rune) int {
return slices.Index(precedence, op)
}
// getPOSIXClass extracts the name of a POSIX character class from str.
//
// str is expected to be the text that follows the opening "[:" of a class, i.e.
// something shaped like:
//
//	'name:]rest'
//
// For a regex such as '[[:digit:]]', the caller consumes both opening brackets
// and the colon, then passes everything after that to this function.
//
// Only the syntax is checked here: the name is returned verbatim and it is up
// to the caller to decide whether a class with that name exists.
//
// The first result reports whether str starts with a syntactically valid class
// (a possibly empty name, then ':', then RBRACKET). The second result is the
// name on success and "" otherwise.
func getPOSIXClass(str []rune) (bool, string) {
	// Advance to the first ':' or RBRACKET; everything before it is the name.
	end := 0
	for end < len(str) {
		if c := str[end]; c == ':' || c == RBRACKET {
			break
		}
		end++
	}

	// The terminator needs one more rune after it (the closing bracket), so it
	// cannot be the last rune of str - nor can the scan have run off the end.
	if end+1 >= len(str) {
		return false, ""
	}

	// The name must be closed by ':' immediately followed by RBRACKET. Hitting
	// RBRACKET before any ':' means this is not a POSIX class.
	if str[end] != ':' || str[end+1] != RBRACKET {
		return false, ""
	}

	return true, string(str[:end])
}
/*
The Shunting-Yard algorithm is used to convert the given infix (regular) expression to postfix.
The primary benefit of this is getting rid of parentheses.
@@ -162,6 +194,21 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
return nil, fmt.Errorf("Opening bracket without closing bracket.")
}
if re_runes[i] == LBRACKET && re_runes[i+1] == ':' { // POSIX character class
toAppend = append(toAppend, re_runes[i])
i++
toAppend = append(toAppend, re_runes[i])
i++
for i < len(re_runes)-1 && re_runes[i] != ':' && re_runes[i] != ']' {
toAppend = append(toAppend, re_runes[i])
i++
}
if i >= len(re_runes)-1 && re_runes[i] != ':' {
return nil, fmt.Errorf("Unable to parse what looks like a POSIX character class.")
}
toAppend = append(toAppend, re_runes[i])
i++
}
if re_runes[i] == '-' && (i > 0 && re_runes[i-1] != '\\') && (i < len(re_runes)-1 && re_runes[i+1] != RBRACKET) { // Unescaped hyphen, that has some character (not a RBRACKET) after it - This represents a character range, so we replace with CHAR_RANGE. This metacharacter will be used later on to construct the range
re_runes[i] = CHAR_RANGE
}
@@ -501,6 +548,64 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
i++
}
} else {
if re_postfix[i] == LBRACKET && i < len(re_postfix)-8 { // Could be the start of a POSIX class - the smallest POSIX class by word-length [[:word:]] takes 8 more characters
temp_i := i
temp_i++
if re_postfix[temp_i] == ':' {
temp_i++
posixClassPresent, posixClass := getPOSIXClass(re_postfix[temp_i:])
// getPOSIXClass returns true if there is some set of characters that
// ends in a colon and then a closing bracket. If this is not the case, we
// just treat all the characters as literals.
// For example, [[:digit:a]] is _not_ a POSIX class, it's just a regular
// character class containing the letters '[', ':', 'd', 'i', 'g', 'i', 't', ':', 'a'.
// The final 'closing bracket' has no special meaning, it's just another character.
if posixClassPresent {
var nodeToAdd postfixNode
switch posixClass {
case "digit": // Equivalent to '\d'
nodeToAdd = newPostfixCharNode(genRangeInclusive('0', '9')...)
case "upper": // [A-Z]
nodeToAdd = newPostfixCharNode(genRangeInclusive('A', 'Z')...)
case "lower": // [a-z]
nodeToAdd = newPostfixCharNode(genRangeInclusive('a', 'z')...)
case "alpha": //[A-Za-z]
nodeToAdd = newPostfixCharNode(slices.Concat(genRangeInclusive('A', 'Z'), genRangeInclusive('a', 'z'))...)
case "xdigit": // [0-9A-Fa-f]
nodeToAdd = newPostfixCharNode(slices.Concat(genRangeInclusive('A', 'F'), genRangeInclusive('a', 'f'), genRangeInclusive('0', '9'))...)
case "alnum": // [A-Za-z0-9]
nodeToAdd = newPostfixCharNode(slices.Concat(genRangeInclusive('A', 'Z'), genRangeInclusive('a', 'z'), genRangeInclusive('0', '9'))...)
case "blank": // [ \t]
nodeToAdd = newPostfixCharNode(' ', '\t')
case "space": // [ \t\n\r\f\v]
nodeToAdd = newPostfixCharNode(' ', '\t', '\n', '\r', '\f', '\v')
case "cntrl": // Control characters
nodeToAdd = newPostfixCharNode(append(genRangeInclusive('\x00', '\x1F'), '\x7F')...)
case "punct": // Punctuation and symbols
nodeToAdd = newPostfixCharNode([]rune(`!"\#$%&'()*+,\-./:;<=>?@\[\\\]^_` + "`" + `{|}~`)...)
case "graph": // Graphic characters
nodeToAdd = newPostfixCharNode(genRangeInclusive('\x21', '\x7E')...)
case "print": // Graphic characters + space
nodeToAdd = newPostfixCharNode(genRangeInclusive('\x20', '\x7E')...)
case "ascii": // ASCII values
nodeToAdd = newPostfixCharNode(genRangeInclusive('\x00', '\x7F')...)
case "word": // Word characters
nodeToAdd, _ = newEscapedNode('w', true) // This isn't going to error, so I suppress it
default:
return nil, fmt.Errorf("Invalid POSIX character class.")
}
chars = append(chars, nodeToAdd)
i = temp_i + len(posixClass) + 2 // Skip over the class name, the closing colon and the closing bracket
firstCharAdded = true
continue
}
}
}
// This used to be an else statement - I removed it, because if the previous if-block fails
// (ie. if it didn't actually find a character class), then this block must still execute.
// However, the checks for character classes are nested, so placing this inside an 'else' block
// will prevent it from running, as the outer if-statement will have evaluated to true.
if !firstCharAdded && re_postfix[i] > 0xF0000 { // It's a metacharacter that I defined, I'll have to convert it back to the regular character before adding it back, because I haven't added any characters yet. For example, '[[]', the second LBRACKET should be treated like a literal bracket.
switch re_postfix[i] {
case LBRACKET:

View File

@@ -133,6 +133,13 @@ func genRange[T character](start, end T) []T {
return toRet
}
// genRangeInclusive generates the values from start to end with both endpoints
// included. It takes whatever genRange(start, end) produces and appends end to
// it, so the last element of the result is always end.
// NOTE(review): this presumably relies on genRange stopping just short of end -
// confirm against genRange.
func genRangeInclusive[T character](start, end T) []T {
	return append(genRange(start, end), end)
}
// Returns a rune-slice containing all possible cases of the given rune.
// At the moment, this includes:
// 1. Upper case

View File

@@ -163,6 +163,16 @@ var reTests = []struct {
{"(?<=f)f+(?=f)", nil, "fffff", []Group{{1, 4}}},
{"(?<=f)f+(?=f)", nil, "fffffa", []Group{{1, 4}}},
// Some POSIX charclass tests
{"[[:lower:]]+", nil, "abcdefghijklmnopqrstuvwyxzABCDEFGHIJKLMNOPRQSTUVWXYZ0123456789!@#$%^&*", []Group{{0, 26}}},
{"[[:upper:]]+", nil, "abcdefghijklmnopqrstuvwyxzABCDEFGHIJKLMNOPRQSTUVWXYZ0123456789!@#$%^&*", []Group{{26, 52}}},
{"[[:alpha:]]+", nil, "abcdefghijklmnopqrstuvwyxzABCDEFGHIJKLMNOPRQSTUVWXYZ0123456789!@#$%^&*", []Group{{0, 52}}},
{"[[:digit:]]+", nil, "abcdefghijklmnopqrstuvwyxzABCDEFGHIJKLMNOPRQSTUVWXYZ0123456789!@#$%^&*", []Group{{52, 62}}},
{"[[:alnum:]]+", nil, "abcdefghijklmnopqrstuvwyxzABCDEFGHIJKLMNOPRQSTUVWXYZ0123456789!@#$%^&*", []Group{{0, 62}}},
{"[[:punct:]]+", nil, "abcdefghijklmnopqrstuvwyxzABCDEFGHIJKLMNOPRQSTUVWXYZ0123456789!@#$%^&*", []Group{{62, 70}}},
{"[[:ascii:]]+", nil, "abcdefghijklmnopqrstuvwyxzABCDEFGHIJKLMNOPRQSTUVWXYZ0123456789!@#$%^&*", []Group{{0, 70}}},
{"[[:graph:]]+", nil, "abcdefghijklmnopqrstuvwyxzABCDEFGHIJKLMNOPRQSTUVWXYZ0123456789!@#$%^&*", []Group{{0, 70}}},
// Test cases from Python's RE test suite
{`[\1]`, nil, "\x01", []Group{{0, 1}}},