diff --git a/regex/compile.go b/regex/compile.go index ba59c9f..59ff738 100644 --- a/regex/compile.go +++ b/regex/compile.go @@ -59,7 +59,7 @@ func priority(op rune) int { func getPOSIXClass(str []rune) (bool, string) { i := 0 rtv := "" - for i < len(str) && (str[i] != ':' && str[i] != RBRACKET) { + for i < len(str) && (str[i] != ':' && str[i] != rbracketRune) { rtv += string(str[i]) i++ } @@ -69,7 +69,7 @@ func getPOSIXClass(str []rune) (bool, string) { if str[i] != ':' { // The POSIX class must end with a colon and a closing bracket. It cannot end with a closing bracket first. return false, "" } - if str[i+1] != RBRACKET { + if str[i+1] != rbracketRune { return false, "" } return true, rtv @@ -174,13 +174,13 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) { re_runes = append(re_runes, NONCAPLPAREN_CHAR) i += 2 } else if c == '\\' && i < len(re_runes_orig)-1 && re_runes_orig[i+1] == '\\' { // Escaped backslash - re_runes = append(re_runes, ESC_BACKSLASH) + re_runes = append(re_runes, escBackslashRune) i++ } else if c == '[' && (i == 0 || re_runes[len(re_runes)-1] != '\\') { - re_runes = append(re_runes, LBRACKET) + re_runes = append(re_runes, lbracketRune) continue } else if c == ']' && (i == 0 || re_runes[len(re_runes)-1] != '\\') { - re_runes = append(re_runes, RBRACKET) + re_runes = append(re_runes, rbracketRune) continue } else if slices.Contains([]rune{'+', '*', '?'}, c) && (i < len(re_runes_orig)-1 && re_runes_orig[i+1] == '?') { return nil, fmt.Errorf("non-greedy operators are not supported") @@ -203,7 +203,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) { i := 0 for i < len(re_runes) { re_postfix = append(re_postfix, re_runes[i]) - if re_runes[i] == LBRACKET && (i == 0 || re_runes[i-1] != '\\') { // We do not touch things inside brackets, unless they are escaped. + if re_runes[i] == lbracketRune && (i == 0 || re_runes[i-1] != '\\') { // We do not touch things inside brackets, unless they are escaped. 
toAppend := make([]rune, 0) // Holds all the runes in the current character class i++ // Skip past LBRACKET, because it was already added @@ -211,13 +211,13 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) { return nil, fmt.Errorf("opening bracket without closing bracket") } - for re_runes[i] != RBRACKET || i == 0 || re_runes[i-1] == '\\' { // Skip all characters inside _unescaped_ brackets (we are _not_ at a closing bracket, or if we are, the previous character is a backslash) + for re_runes[i] != rbracketRune || i == 0 || re_runes[i-1] == '\\' { // Skip all characters inside _unescaped_ brackets (we are _not_ at a closing bracket, or if we are, the previous character is a backslash) // Make sure we haven't exceeded the length of the string. If we did, then the regex doesn't actually have a closing bracket and we should throw an error. if i >= len(re_runes) { return nil, fmt.Errorf("opening bracket without closing bracket") } - if re_runes[i] == LBRACKET && re_runes[i+1] == ':' { // POSIX character class + if re_runes[i] == lbracketRune && re_runes[i+1] == ':' { // POSIX character class toAppend = append(toAppend, re_runes[i]) i++ toAppend = append(toAppend, re_runes[i]) @@ -232,14 +232,14 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) { toAppend = append(toAppend, re_runes[i]) i++ } - if re_runes[i] == '-' && (i > 0 && re_runes[i-1] != '\\') && (i < len(re_runes)-1 && re_runes[i+1] != RBRACKET) { // Unescaped hyphen, that has some character (not a RBRACKET) after it - This represents a character range, so we replace with CHAR_RANGE. This metacharacter will be used later on to construct the range + if re_runes[i] == '-' && (i > 0 && re_runes[i-1] != '\\') && (i < len(re_runes)-1 && re_runes[i+1] != rbracketRune) { // Unescaped hyphen, that has some character (not a RBRACKET) after it - This represents a character range, so we replace with CHAR_RANGE. 
This metacharacter will be used later on to construct the range re_runes[i] = CHAR_RANGE } toAppend = append(toAppend, re_runes[i]) i++ } // Add in the RBRACKET - toAppend = append(toAppend, RBRACKET) + toAppend = append(toAppend, rbracketRune) re_postfix = append(re_postfix, toAppend...) } if i < len(re_runes) && re_runes[i] == '{' && (i > 0 && re_runes[i-1] != '\\') { // We don't touch things inside braces, either @@ -357,7 +357,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) { // To deal with this, I make the following assertion: // If at any point I see an RBRACKET 'in the wild' (not in a character class), then it must be // a regular character, with no special significance. - if c == RBRACKET { + if c == rbracketRune { outQueue = append(outQueue, newPostfixCharNode(']')) continue } @@ -496,7 +496,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) { } } } - if c == LBRACKET { // Used for character classes + if c == lbracketRune { // Used for character classes firstCharAdded := false // A character class must have at least 1 character. This flag checks if the first character has been added. 
endOfRange := false // Set to 'true' when we encounter a CHAR_RANGE metacharacter i++ // Step forward so we can look at the character class @@ -521,7 +521,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) { } chars := make([]postfixNode, 0) // List of nodes - used only for character classes for i < len(re_postfix) { - if firstCharAdded && re_postfix[i] == RBRACKET { + if firstCharAdded && re_postfix[i] == rbracketRune { break } if re_postfix[i] == CHAR_RANGE { @@ -581,7 +581,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) { i++ } } else { - if re_postfix[i] == LBRACKET && i < len(re_postfix)-8 { // Could be the start of a POSIX class - the smallest POSIX class by word-length [[:word:]] takes 8 more characters + if re_postfix[i] == lbracketRune && i < len(re_postfix)-8 { // Could be the start of a POSIX class - the smallest POSIX class by word-length [[:word:]] takes 8 more characters temp_i := i temp_i++ if re_postfix[temp_i] == ':' { @@ -643,9 +643,9 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) { // will prevent it from running, as the outer if-statement will have evaluated to true. if !firstCharAdded && re_postfix[i] > 0xF0000 { // It's a metacharacter that I defined, I'll have to convert it back to the regular character before adding it back, because I haven't added any characters yet. For example, '[[]', the second LBRACKET should be treated like a literal bracket. 
switch re_postfix[i] { - case LBRACKET: + case lbracketRune: chars = append(chars, newPostfixCharNode('[')) - case RBRACKET: + case rbracketRune: chars = append(chars, newPostfixCharNode(']')) default: return nil, fmt.Errorf("error parsing high-range unicode value in character class") @@ -912,8 +912,8 @@ func thompson(re []postfixNode) (Reg, error) { } // Replace ESC_BACKSLASH with actual backslash, so that we can actually check if we encounter it - replaceByValue([]int(stateToAdd.content), int(ESC_BACKSLASH), '\\') - replaceByValue(stateToAdd.except, ESC_BACKSLASH, '\\') + replaceByValue([]int(stateToAdd.content), int(escBackslashRune), '\\') + replaceByValue(stateToAdd.except, escBackslashRune, '\\') nfa = append(nfa, &stateToAdd) } diff --git a/regex/misc.go b/regex/misc.go index d9bc33c..33da223 100644 --- a/regex/misc.go +++ b/regex/misc.go @@ -8,16 +8,16 @@ import ( var whitespaceChars = []rune{' ', '\t', '\n'} var digitChars = []rune{'0', '1', '2', '3', '4', '5', '6', '7', '8', '9'} var wordChars = []rune("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_") -var LBRACKET rune = 0xF0002 -var RBRACKET rune = 0xF0003 -var ANY_CHAR rune = 0xF0004 // Represents any character - used for states where the allChars flag is on. -var LPAREN_CHAR rune = 0xF0005 // Parentheses in regex are concatenated with this - it acts as a pseudio-parentheses -var RPAREN_CHAR rune = 0xF0006 -var NONCAPLPAREN_CHAR rune = 0xF0007 // Represents a non-capturing group's LPAREN -var ESC_BACKSLASH rune = 0xF0008 // Represents an escaped backslash -var CHAR_RANGE rune = 0xF0009 // Represents a character range - -var specialChars = []rune{'?', '*', '\\', '^', '$', '{', '}', '(', ')', '[', ']', '+', '|', '.', concatRune, '<', '>', LBRACKET, RBRACKET, NONCAPLPAREN_CHAR} +var lbracketRune rune = 0xF0002 +var rbracketRune rune = 0xF0003 +var anyCharRune rune = 0xF0004 // Represents any character - used for states where the allChars flag is on. 
+var lparenRune rune = 0xF0005 // Parentheses in regex are concatenated with this - it acts as a pseudo-parentheses +var rparenRune rune = 0xF0006 +var nonCapLparenRune rune = 0xF0007 // Represents a non-capturing group's LPAREN +var escBackslashRune rune = 0xF0008 // Represents an escaped backslash +var CHAR_RANGE rune = 0xF0009 // Represents a character range + +var specialChars = []rune{'?', '*', '\\', '^', '$', '{', '}', '(', ')', '[', ']', '+', '|', '.', concatRune, '<', '>', lbracketRune, rbracketRune, nonCapLparenRune} // An interface for int and rune, which are identical type character interface { diff --git a/regex/nfa.go b/regex/nfa.go index 42c0eff..bb7c9b6 100644 --- a/regex/nfa.go +++ b/regex/nfa.go @@ -198,7 +198,7 @@ func (s nfaState) matchesFor(str []rune, idx int) ([]*nfaState, int) { } } listTransitions := s.transitions[int(str[idx])] - for _, dest := range s.transitions[int(ANY_CHAR)] { + for _, dest := range s.transitions[int(anyCharRune)] { if !slices.Contains(slices.Concat(notDotChars, dest.except), str[idx]) { // Add an allChar state to the list of matches if: // a. The current character isn't a 'notDotChars' character. In single line mode, this includes newline. In multiline mode, it doesn't. 
diff --git a/regex/postfixNode.go b/regex/postfixNode.go index 42a2eab..355bc6c 100644 --- a/regex/postfixNode.go +++ b/regex/postfixNode.go @@ -54,7 +54,7 @@ func newCharClassNode(nodes []postfixNode, negated bool) postfixNode { rtv.endReps = 1 if negated { rtv.nodetype = characterNode - rtv.contents = []rune{ANY_CHAR} + rtv.contents = []rune{anyCharRune} rtv.allChars = true rtv.except = nodes } else { @@ -169,10 +169,10 @@ func newPostfixNode(contents ...rune) postfixNode { // Special cases for LPAREN and RPAREN - they have special characters defined for them if to_return.nodetype == lparenNode { - to_return.contents = []rune{LPAREN_CHAR} + to_return.contents = []rune{lparenRune} } if to_return.nodetype == rparenNode { - to_return.contents = []rune{RPAREN_CHAR} + to_return.contents = []rune{rparenRune} } } return to_return @@ -185,7 +185,7 @@ func newPostfixDotNode() postfixNode { toReturn.endReps = 1 toReturn.nodetype = characterNode toReturn.allChars = true - toReturn.contents = []rune{ANY_CHAR} + toReturn.contents = []rune{anyCharRune} return toReturn }