From 3fb9bc14465fb8a660df31c93a0079fd9695fe79 Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Mon, 27 Jan 2025 16:00:35 -0500 Subject: [PATCH] Added support for POSIX character classes --- compile.go | 105 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 105 insertions(+) diff --git a/compile.go b/compile.go index 28038cf..91b098c 100644 --- a/compile.go +++ b/compile.go @@ -42,6 +42,38 @@ func priority(op rune) int { return slices.Index(precedence, op) } +// getPOSIXClass returns the name of the POSIX character class found at the start of the given string. +// The given string must be of the form: +// +// 'blah1:]blah2' +// +// in order to be a _syntactically_ valid POSIX class. +// Whether or not such a class actually exists is not relevant to this function; it just +// parses and returns 'blah1'. +// For example, if the regex was something like '[[:digit:]]', the caller must parse through the opening +// brackets and the colon, and call this function with the remainder. +// +// If the given string represents a syntactically valid POSIX class, the first return value is true and the second is the class name ('blah1'). +// Otherwise, the first return value is false and the second is the empty string. +func getPOSIXClass(str []rune) (bool, string) { + i := 0 + rtv := "" + for i < len(str) && (str[i] != ':' && str[i] != RBRACKET) { + rtv += string(str[i]) + i++ + } + if i >= (len(str) - 1) { // The terminator we stopped on must not be the last character (this also covers running off the end of the input), because the closing bracket must follow it + return false, "" + } + if str[i] != ':' { // The POSIX class must end with a colon and a closing bracket. It cannot end with a closing bracket first. + return false, "" + } + if str[i+1] != RBRACKET { + return false, "" + } + return true, rtv +} + /* The Shunting-Yard algorithm is used to convert the given infix (regeular) expression to postfix. The primary benefit of this is getting rid of parentheses.
@@ -162,6 +194,21 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) { return nil, fmt.Errorf("Opening bracket without closing bracket.") } + if re_runes[i] == LBRACKET && re_runes[i+1] == ':' { // POSIX character class + toAppend = append(toAppend, re_runes[i]) + i++ + toAppend = append(toAppend, re_runes[i]) + i++ + for i < len(re_runes)-1 && re_runes[i] != ':' && re_runes[i] != ']' { + toAppend = append(toAppend, re_runes[i]) + i++ + } + if i >= len(re_runes)-1 && re_runes[i] != ':' { + return nil, fmt.Errorf("Unable to parse what looks like a POSIX character class.") + } + toAppend = append(toAppend, re_runes[i]) + i++ + } if re_runes[i] == '-' && (i > 0 && re_runes[i-1] != '\\') && (i < len(re_runes)-1 && re_runes[i+1] != RBRACKET) { // Unescaped hyphen, that has some character (not a RBRACKET) after it - This represents a character range, so we replace with CHAR_RANGE. This metacharacter will be used later on to construct the range re_runes[i] = CHAR_RANGE } @@ -501,6 +548,64 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) { i++ } } else { + if re_postfix[i] == LBRACKET && i < len(re_postfix)-8 { // Could be the start of a POSIX class - the smallest POSIX class by word-length [[:word:]] takes 8 more characters + temp_i := i + temp_i++ + if re_postfix[temp_i] == ':' { + temp_i++ + posixClassPresent, posixClass := getPOSIXClass(re_postfix[temp_i:]) + // getPOSIXClass returns true if there is some set of characters that + // ends in a colon and then a closing bracket. If this is not the case, we + // just treat all the characters as literals. + // For example, [[:digit:a]] is _not_ a POSIX class, its just a regular + // character class contains the letters '[', ':', 'd', 'i', 'g', 'i', 't', ':', 'a'. + // The final 'closing bracket' has no special meaning, its just another character. 
+ if posixClassPresent { + var nodeToAdd postfixNode + switch posixClass { + case "digit": // Equivalent to '\d' + nodeToAdd = newPostfixCharNode(genRangeInclusive('0', '9')...) + case "upper": // [A-Z] + nodeToAdd = newPostfixCharNode(genRangeInclusive('A', 'Z')...) + case "lower": // [a-z] + nodeToAdd = newPostfixCharNode(genRangeInclusive('a', 'z')...) + case "alpha": //[A-Za-z] + nodeToAdd = newPostfixCharNode(slices.Concat(genRangeInclusive('A', 'Z'), genRangeInclusive('a', 'z'))...) + case "xdigit": // [0-9A-Fa-f] + nodeToAdd = newPostfixCharNode(slices.Concat(genRangeInclusive('A', 'F'), genRangeInclusive('a', 'f'), genRangeInclusive('0', '9'))...) + case "alnum": // [A-Za-z0-9] + nodeToAdd = newPostfixCharNode(slices.Concat(genRangeInclusive('A', 'Z'), genRangeInclusive('a', 'z'), genRangeInclusive('0', '9'))...) + case "blank": // [ \t] + nodeToAdd = newPostfixCharNode(' ', '\t') + case "space": // [ \t\n\r\f\v] + nodeToAdd = newPostfixCharNode(' ', '\t', '\n', '\r', '\f', '\v') + case "cntrl": // Control characters + nodeToAdd = newPostfixCharNode(append(genRangeInclusive('\x00', '\x1F'), '\x7F')...) + case "punct": // Punctuation and symbols + nodeToAdd = newPostfixCharNode([]rune(`!"\#$%&'()*+,\-./:;<=>?@\[\\\]^_` + "`" + `{|}~`)...) + case "graph": // Graphic characters + nodeToAdd = newPostfixCharNode(genRangeInclusive('\x21', '\x7E')...) + case "print": // Graphic characters + space + nodeToAdd = newPostfixCharNode(genRangeInclusive('\x20', '\x7E')...) + case "ascii": // ASCII values + nodeToAdd = newPostfixCharNode(genRangeInclusive('\x00', '\x7F')...) 
+ case "word": // Word characters + nodeToAdd, _ = newEscapedNode('w', true) // This isn't going to error, so I suppress it + default: + return nil, fmt.Errorf("Invalid POSIX character class.") + } + chars = append(chars, nodeToAdd) + i = temp_i + len(posixClass) + 2 // Skip over the class name, the closing colon and the closing bracket + firstCharAdded = true + continue + } + } + } + + // This used to be an else statement - I removed it, because if the previous if-block fails + // (ie. if it didn't actually find a character class), then this block must still execute. + // However, the checks for character classes are nested, so placing this inside an 'else' block + // will prevent it from running, as the outer if-statement will have evaluated to true. if !firstCharAdded && re_postfix[i] > 0xF0000 { // It's a metacharacter that I defined, I'll have to convert it back to the regular character before adding it back, because I haven't added any characters yet. For example, '[[]', the second LBRACKET should be treated like a literal bracket. switch re_postfix[i] { case LBRACKET: