Added support for POSIX character classes

master
Aadhavan Srinivasan 6 days ago
parent ae76e2e55e
commit 3fb9bc1446

@ -42,6 +42,38 @@ func priority(op rune) int {
return slices.Index(precedence, op) return slices.Index(precedence, op)
} }
// getPOSIXClass parses the name of a POSIX character class from the start of str.
//
// The given slice must have the form:
//
//	'blah1:]blah2'
//
// in order to be a _syntactically_ valid POSIX class, where the closing bracket
// is the RBRACKET rune. Whether or not such a class actually exists is not
// relevant to this function; it just parses and returns 'blah1'.
// For example, if the regex was something like '[[:digit:]]', the caller must
// consume the opening brackets and the first colon, and call this function with
// the remainder.
//
// Return values:
//   - the first (bool) is true if str starts with a syntactically valid POSIX
//     class, i.e. a run of runes followed immediately by ':' and RBRACKET;
//   - the second (string) is the class name ('blah1') when the first is true,
//     and the empty string otherwise. The name itself may be empty (':]').
func getPOSIXClass(str []rune) (bool, string) {
	// Advance to the first colon or closing bracket; everything before it is
	// the candidate class name.
	end := 0
	for end < len(str) && str[end] != ':' && str[end] != RBRACKET {
		end++
	}
	// The terminator must exist and must be followed by at least one more
	// rune, because the closing bracket has to come right after the colon.
	if end >= len(str)-1 {
		return false, ""
	}
	// The class must end with a colon and then a closing bracket. Hitting the
	// closing bracket first, or a colon followed by anything else, is invalid.
	if str[end] != ':' || str[end+1] != RBRACKET {
		return false, ""
	}
	// Convert the name in a single step instead of concatenating rune by rune.
	return true, string(str[:end])
}
/* /*
The Shunting-Yard algorithm is used to convert the given infix (regular) expression to postfix. The Shunting-Yard algorithm is used to convert the given infix (regular) expression to postfix.
The primary benefit of this is getting rid of parentheses. The primary benefit of this is getting rid of parentheses.
@ -162,6 +194,21 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
return nil, fmt.Errorf("Opening bracket without closing bracket.") return nil, fmt.Errorf("Opening bracket without closing bracket.")
} }
if re_runes[i] == LBRACKET && re_runes[i+1] == ':' { // POSIX character class
toAppend = append(toAppend, re_runes[i])
i++
toAppend = append(toAppend, re_runes[i])
i++
for i < len(re_runes)-1 && re_runes[i] != ':' && re_runes[i] != ']' {
toAppend = append(toAppend, re_runes[i])
i++
}
if i >= len(re_runes)-1 && re_runes[i] != ':' {
return nil, fmt.Errorf("Unable to parse what looks like a POSIX character class.")
}
toAppend = append(toAppend, re_runes[i])
i++
}
if re_runes[i] == '-' && (i > 0 && re_runes[i-1] != '\\') && (i < len(re_runes)-1 && re_runes[i+1] != RBRACKET) { // Unescaped hyphen, that has some character (not a RBRACKET) after it - This represents a character range, so we replace with CHAR_RANGE. This metacharacter will be used later on to construct the range if re_runes[i] == '-' && (i > 0 && re_runes[i-1] != '\\') && (i < len(re_runes)-1 && re_runes[i+1] != RBRACKET) { // Unescaped hyphen, that has some character (not a RBRACKET) after it - This represents a character range, so we replace with CHAR_RANGE. This metacharacter will be used later on to construct the range
re_runes[i] = CHAR_RANGE re_runes[i] = CHAR_RANGE
} }
@ -501,6 +548,64 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
i++ i++
} }
} else { } else {
if re_postfix[i] == LBRACKET && i < len(re_postfix)-8 { // Could be the start of a POSIX class - the smallest POSIX class by word-length [[:word:]] takes 8 more characters
temp_i := i
temp_i++
if re_postfix[temp_i] == ':' {
temp_i++
posixClassPresent, posixClass := getPOSIXClass(re_postfix[temp_i:])
// getPOSIXClass returns true if there is some set of characters that
// ends in a colon and then a closing bracket. If this is not the case, we
// just treat all the characters as literals.
// For example, [[:digit:a]] is _not_ a POSIX class; it's just a regular
// character class containing the characters '[', ':', 'd', 'i', 'g', 'i', 't', ':', 'a'.
// The final 'closing bracket' has no special meaning; it's just another character.
if posixClassPresent {
var nodeToAdd postfixNode
switch posixClass {
case "digit": // Equivalent to '\d'
nodeToAdd = newPostfixCharNode(genRangeInclusive('0', '9')...)
case "upper": // [A-Z]
nodeToAdd = newPostfixCharNode(genRangeInclusive('A', 'Z')...)
case "lower": // [a-z]
nodeToAdd = newPostfixCharNode(genRangeInclusive('a', 'z')...)
case "alpha": //[A-Za-z]
nodeToAdd = newPostfixCharNode(slices.Concat(genRangeInclusive('A', 'Z'), genRangeInclusive('a', 'z'))...)
case "xdigit": // [0-9A-Fa-f]
nodeToAdd = newPostfixCharNode(slices.Concat(genRangeInclusive('A', 'F'), genRangeInclusive('a', 'f'), genRangeInclusive('0', '9'))...)
case "alnum": // [A-Za-z0-9]
nodeToAdd = newPostfixCharNode(slices.Concat(genRangeInclusive('A', 'Z'), genRangeInclusive('a', 'z'), genRangeInclusive('0', '9'))...)
case "blank": // [ \t]
nodeToAdd = newPostfixCharNode(' ', '\t')
case "space": // [ \t\n\r\f\v]
nodeToAdd = newPostfixCharNode(' ', '\t', '\n', '\r', '\f', '\v')
case "cntrl": // Control characters
nodeToAdd = newPostfixCharNode(append(genRangeInclusive('\x00', '\x1F'), '\x7F')...)
case "punct": // Punctuation and symbols
nodeToAdd = newPostfixCharNode([]rune(`!"\#$%&'()*+,\-./:;<=>?@\[\\\]^_` + "`" + `{|}~`)...)
case "graph": // Graphic characters
nodeToAdd = newPostfixCharNode(genRangeInclusive('\x21', '\x7E')...)
case "print": // Graphic characters + space
nodeToAdd = newPostfixCharNode(genRangeInclusive('\x20', '\x7E')...)
case "ascii": // ASCII values
nodeToAdd = newPostfixCharNode(genRangeInclusive('\x00', '\x7F')...)
case "word": // Word characters
nodeToAdd, _ = newEscapedNode('w', true) // This isn't going to error, so I suppress it
default:
return nil, fmt.Errorf("Invalid POSIX character class.")
}
chars = append(chars, nodeToAdd)
i = temp_i + len(posixClass) + 2 // Skip over the class name, the closing colon and the closing bracket
firstCharAdded = true
continue
}
}
}
// This used to be an else statement - I removed it, because if the previous if-block fails
// (ie. if it didn't actually find a character class), then this block must still execute.
// However, the checks for character classes are nested, so placing this inside an 'else' block
// will prevent it from running, as the outer if-statement will have evaluated to true.
if !firstCharAdded && re_postfix[i] > 0xF0000 { // It's a metacharacter that I defined, I'll have to convert it back to the regular character before adding it back, because I haven't added any characters yet. For example, '[[]', the second LBRACKET should be treated like a literal bracket. if !firstCharAdded && re_postfix[i] > 0xF0000 { // It's a metacharacter that I defined, I'll have to convert it back to the regular character before adding it back, because I haven't added any characters yet. For example, '[[]', the second LBRACKET should be treated like a literal bracket.
switch re_postfix[i] { switch re_postfix[i] {
case LBRACKET: case LBRACKET:

Loading…
Cancel
Save