From 3a3333b38a134616925901c6cd0d6781560ed7b8 Mon Sep 17 00:00:00 2001 From: Rockingcool Date: Sun, 19 Jan 2025 21:26:56 -0600 Subject: [PATCH] New features, changed character class behavior I added support for hex values (eg. \x0F), octal values (eg. \012) and extended hex values (eg. \x{000F2A}). I also expanded the abilities of character classes, to include things like escaped characters (eg. [aefp\)]) and character ranges _inside_ inverted character classes (eg. [^\w] which is functionally equivalent to [\W]). --- compile.go | 170 ++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 155 insertions(+), 15 deletions(-) diff --git a/compile.go b/compile.go index 9ce650f..91724e1 100644 --- a/compile.go +++ b/compile.go @@ -2,6 +2,7 @@ package main import ( "fmt" + "math" "slices" "strconv" "unicode" @@ -184,6 +185,40 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) { re_postfix = append(re_postfix, NONCAPLPAREN_CHAR) i += 3 } + if i < len(re_runes) && re_runes[i] == '\\' { // Something is being escaped (I don't add the backslash to re_postfix, because it was already added earlier) + i++ + if i >= len(re_runes) { + return nil, fmt.Errorf("Stray backslash in expression.") + } + if re_runes[i] == 'x' { + re_postfix = append(re_postfix, re_runes[i]) + i++ + if i >= len(re_runes) { + return nil, fmt.Errorf("Stray backslash in expression.") + } + if re_runes[i] == '{' { + re_postfix = append(re_postfix, re_runes[i:i+8]...) + i += 7 + if i >= len(re_runes) { + return nil, fmt.Errorf("Stray backslash in expression.") + } + } else if isHex(re_runes[i]) { + re_postfix = append(re_postfix, re_runes[i:i+2]...) 
+ i += 2 + } else { + return nil, fmt.Errorf("Invalid hex value in expression.") + } + } else if isOctal(re_runes[i]) { + numDigits := 1 + for i+numDigits < len(re_runes) && numDigits < 3 && isOctal(re_runes[i+numDigits]) { // Skip while we see an octal character (max of 3) + numDigits++ + } + re_postfix = append(re_postfix, re_runes[i:i+numDigits]...) + i += (numDigits - 1) // I have to move back a step, so that I can add a concatenation operator if necessary, and so that the increment at the bottom of the loop works as intended + } else { + re_postfix = append(re_postfix, re_runes[i]) + } + } if i < len(re_runes) && re_runes[i] == '(' && (i == 0 || re_runes[i-1] != '\\') && (i < len(re_runes)-2 && re_runes[i+1] == '?' && slices.Contains([]rune{'=', '!', '<'}, re_runes[i+2])) { // Unescaped open parentheses followed by question mark then '<', '!' or '=' => lokaround. Don't mess with it. i++ // Step inside if i == len(re_runes)-1 || (re_runes[i+1] != '=' && re_runes[i+1] != '!' && re_runes[i+1] != '<') { @@ -253,7 +288,45 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) { return nil, fmt.Errorf("ERROR: Backslash with no escape character.") } i++ - outQueue = append(outQueue, newEscapedNode(re_postfix[i])) + if re_postfix[i] == 'x' { // Hex value + i++ + if re_postfix[i] == '{' && i < len(re_postfix)-6 { // Expanded hex code + var hexVal int + n, err := fmt.Sscanf(string(re_postfix[i:]), "{%x}", &hexVal) + if n < 1 || err != nil { + return nil, fmt.Errorf("Error parsing expanded hex code in expression.") + } + outQueue = append(outQueue, newPostfixCharNode(rune(hexVal))) + i += 7 + } else if i < len(re_postfix)-1 { // Two-digit hex code + hexVal, err := strconv.ParseInt(string([]rune{re_postfix[i], re_postfix[i+1]}), 16, 64) // Convert the two hex values into a rune slice, then to a string. 
Parse the string into an int with strconv.ParseInt() + if err != nil { + return nil, fmt.Errorf("Error parsing hex characters in expression.") + } + i += 2 + outQueue = append(outQueue, newPostfixCharNode(rune(hexVal))) + } else { + return nil, fmt.Errorf("Not enough hex characters found in expression.") + } + } else if isOctal(re_postfix[i]) { // Octal value + var octVal int + n, err := fmt.Sscanf(string(re_postfix[i:]), "%d", &octVal) + if n < 1 || err != nil { + return nil, fmt.Errorf("Error parsing octal value in expression.") + } + if octVal > 777 { + return nil, fmt.Errorf("Invalid octal value in expression.") + } + i += int(math.Ceil(math.Log10(float64(octVal)))) // Shift forward by the number of digits that were parsed + i-- // Move back one character, because the loop increment will move us back to the next character automatically + octValBase10, err := strconv.ParseInt(strconv.Itoa(octVal), 8, 0) + if err != nil { + return nil, fmt.Errorf("Error parsing octal value in expression.") + } + outQueue = append(outQueue, newPostfixCharNode(rune(octValBase10))) + } else { + outQueue = append(outQueue, newEscapedNode(re_postfix[i])) + } continue // Escaped character will automatically be skipped when loop variable increments } @@ -342,25 +415,60 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) { invertMatch = true i++ } - chars := make([]rune, 0) // List of characters - used only for character classes + chars := make([]postfixNode, 0) // List of nodes - used only for character classes for i < len(re_postfix) { if re_postfix[i] == RBRACKET { break } - chars = append(chars, re_postfix[i]) - i++ + if re_postfix[i] == '\\' { // Backslash indicates a character to be escaped + if i == len(re_postfix)-1 { + return nil, fmt.Errorf("Stray backslash in character class.") + } + i++ // Step past backslash + + if re_postfix[i] == 'x' { // Hex value + i++ + if re_postfix[i] == '{' && i < len(re_postfix)-7 { // Expanded hex code + var hexVal int + n, err 
:= fmt.Sscanf(string(re_postfix[i:]), "{%x}", &hexVal) + if n < 1 || err != nil { + return nil, fmt.Errorf("Error parsing expanded hex code in character class.") + } + chars = append(chars, newPostfixCharNode(rune(hexVal))) + i += 8 + } else if i < len(re_postfix)-2 { // Two-digit hex code + hexVal, err := strconv.ParseInt(string([]rune{re_postfix[i], re_postfix[i+1]}), 16, 64) // Convert the two hex values into a rune slice, then to a string. Parse the string into an int with strconv.ParseInt() + if err != nil { + return nil, fmt.Errorf("Error parsing hex characters in character class.") + } + i += 2 + chars = append(chars, newPostfixCharNode(rune(hexVal))) + } else { + return nil, fmt.Errorf("Not enough hex characters found in character class.") + } + } else if unicode.IsDigit(re_postfix[i]) { // Octal value + var octVal int + n, err := fmt.Sscanf(string(re_postfix[i:]), "%d", &octVal) + if n < 1 || err != nil { + return nil, fmt.Errorf("Error parsing octal value in character class.") + } + if octVal > 0777 { + return nil, fmt.Errorf("Invalid octal value in character class.") + } + i += int(math.Ceil(math.Log10(float64(octVal)) / math.Log10(8))) // Shift forward by the number of digits that were parsed + chars = append(chars, newPostfixCharNode(rune(octVal))) + } else { + chars = append(chars, newEscapedNode(re_postfix[i])) + } + } else { + chars = append(chars, newPostfixCharNode(re_postfix[i])) + i++ + } } if i == len(re_postfix) { // We have reached the end of the string, so we didn't encounter a closing brakcet. Panic. return nil, fmt.Errorf("Opening bracket without closing bracket.") } - if !invertMatch { - outQueue = append(outQueue, newPostfixCharNode(chars...)) - } else { - // Invert match - create an allChars postfixNode, then add the given states to its 'except' list. 
- toAdd := newPostfixDotNode() - toAdd.except = chars - outQueue = append(outQueue, toAdd) - } + outQueue = append(outQueue, newCharClassNode(chars, invertMatch)) continue } if c == '{' { @@ -476,10 +584,29 @@ func thompson(re []postfixNode) (Reg, error) { if c.allChars { state.allChars = true if len(c.except) != 0 { - state.except = append([]rune{}, c.except...) + // For each node that I am 'excepting' (eg. in an inverted character class): + // - If the node itself has exceptions, then the exceptions cancel out. + // Eg. [^\w] == [\W] + // - Since an allChars node is the only kind that _can_ have exceptions, that's what I check for. + // - If the node doesn't have exceptions (allChars == false) then the contents of the node are added to the except list. + for _, node := range c.except { + if node.allChars { + // For each postfixNode in node.except, extract the contents of the postfixNode. Concatenate them all, + // and them to the state's _content_. As mentioned above, if the exception has exceptions, then we can match + // those. + nodeExceptChars := slices.Concat(Map(node.except, func(node postfixNode) []rune { + return node.contents + })...) + state.content = rune2Contents(nodeExceptChars) + } else { + state.except = append(state.except, node.contents...) + } + } } } - state.content = rune2Contents(c.contents) + // Convert the current contents to []int, convert the result of rune2contents to []int, append then + // convert back to stateContents. 
+ state.content = stateContents(append([]int(state.content), []int(rune2Contents(c.contents))...)) state.output = make([]*State, 0) state.output = append(state.output, &state) state.isEmpty = false @@ -561,6 +688,19 @@ func thompson(re []postfixNode) (Reg, error) { } } + if c.nodetype == CHARCLASS { // A Character class consists of all the nodes in it, alternated + // Map the list of nodes to a list of states, each state containing the contents of a specific node + states := Map(c.nodeContents, func(node postfixNode) *State { + s := newState() + s.content = rune2Contents(node.contents) + return &s + }) + // Reduce the list of states down to a single state by alternating them + toAdd := Reduce(states, func(s1 *State, s2 *State) *State { + return alternate(s1, s2) + }) + nfa = append(nfa, toAdd) + } // Must be an operator if it isn't a character switch c.nodetype { case CONCATENATE: @@ -613,7 +753,7 @@ func thompson(re []postfixNode) (Reg, error) { stateToAdd = concatenate(stateToAdd, s2) } else { // Case 2 for i := c.startReps; i < c.endReps; i++ { - stateToAdd = concatenate(stateToAdd, question(state)) + stateToAdd = concatenate(stateToAdd, question(cloneState(state))) } } nfa = append(nfa, stateToAdd)