7 Commits

Author SHA1 Message Date
47ec95f7bb Created function that returns a 'default' state 2025-01-19 21:45:07 -06:00
a14ab81697 Updated function names, added new function 'FindString' that returns the _text_ of the match 2025-01-19 21:44:15 -06:00
7056026e10 Added a new class 'CHARCLASS', which represents a character class with some other postfixNodes in it. The 'except' field now contains a list of postfixNodes rather than runes 2025-01-19 21:43:21 -06:00
b81a2f8452 Added functions to find if a character is a valid hex value and a valid octal value 2025-01-19 21:31:18 -06:00
fcdb4a8868 Added another test, changed function calls to match new names 2025-01-19 21:30:56 -06:00
3a3333b38a New features, changed character class behavior
I added support for hex values (eg. \x0F), octal values (eg. \012) and
extended hex values (eg. \x{000F2A}). I also expanded the abilities of
character classes, to include things like escaped characters (eg. [aefp\)])
and character ranges _inside_ inverted character classes (eg. [^\w] which is
functionally equivalent to [\W]).
2025-01-19 21:26:56 -06:00
4376ccb77d Renamed function calls to use new names 2025-01-19 21:22:33 -06:00
7 changed files with 263 additions and 35 deletions

View File

@@ -2,6 +2,7 @@ package main
import ( import (
"fmt" "fmt"
"math"
"slices" "slices"
"strconv" "strconv"
"unicode" "unicode"
@@ -184,6 +185,40 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
re_postfix = append(re_postfix, NONCAPLPAREN_CHAR) re_postfix = append(re_postfix, NONCAPLPAREN_CHAR)
i += 3 i += 3
} }
if i < len(re_runes) && re_runes[i] == '\\' { // Something is being escaped (I don't add the backslash to re_postfix, because it was already added earlier)
i++
if i >= len(re_runes) {
return nil, fmt.Errorf("Stray backslash in expression.")
}
if re_runes[i] == 'x' {
re_postfix = append(re_postfix, re_runes[i])
i++
if i >= len(re_runes) {
return nil, fmt.Errorf("Stray backslash in expression.")
}
if re_runes[i] == '{' {
re_postfix = append(re_postfix, re_runes[i:i+8]...)
i += 7
if i >= len(re_runes) {
return nil, fmt.Errorf("Stray backslash in expression.")
}
} else if isHex(re_runes[i]) {
re_postfix = append(re_postfix, re_runes[i:i+2]...)
i += 2
} else {
return nil, fmt.Errorf("Invalid hex value in expression.")
}
} else if isOctal(re_runes[i]) {
numDigits := 1
for i+numDigits < len(re_runes) && numDigits < 3 && isOctal(re_runes[i+numDigits]) { // Skip while we see an octal character (max of 3)
numDigits++
}
re_postfix = append(re_postfix, re_runes[i:i+numDigits]...)
i += (numDigits - 1) // I have to move back a step, so that I can add a concatenation operator if necessary, and so that the increment at the bottom of the loop works as intended
} else {
re_postfix = append(re_postfix, re_runes[i])
}
}
if i < len(re_runes) && re_runes[i] == '(' && (i == 0 || re_runes[i-1] != '\\') && (i < len(re_runes)-2 && re_runes[i+1] == '?' && slices.Contains([]rune{'=', '!', '<'}, re_runes[i+2])) { // Unescaped open parentheses followed by question mark then '<', '!' or '=' => lokaround. Don't mess with it. if i < len(re_runes) && re_runes[i] == '(' && (i == 0 || re_runes[i-1] != '\\') && (i < len(re_runes)-2 && re_runes[i+1] == '?' && slices.Contains([]rune{'=', '!', '<'}, re_runes[i+2])) { // Unescaped open parentheses followed by question mark then '<', '!' or '=' => lokaround. Don't mess with it.
i++ // Step inside i++ // Step inside
if i == len(re_runes)-1 || (re_runes[i+1] != '=' && re_runes[i+1] != '!' && re_runes[i+1] != '<') { if i == len(re_runes)-1 || (re_runes[i+1] != '=' && re_runes[i+1] != '!' && re_runes[i+1] != '<') {
@@ -253,7 +288,45 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
return nil, fmt.Errorf("ERROR: Backslash with no escape character.") return nil, fmt.Errorf("ERROR: Backslash with no escape character.")
} }
i++ i++
if re_postfix[i] == 'x' { // Hex value
i++
if re_postfix[i] == '{' && i < len(re_postfix)-6 { // Expanded hex code
var hexVal int
n, err := fmt.Sscanf(string(re_postfix[i:]), "{%x}", &hexVal)
if n < 1 || err != nil {
return nil, fmt.Errorf("Error parsing expanded hex code in expression.")
}
outQueue = append(outQueue, newPostfixCharNode(rune(hexVal)))
i += 7
} else if i < len(re_postfix)-1 { // Two-digit hex code
hexVal, err := strconv.ParseInt(string([]rune{re_postfix[i], re_postfix[i+1]}), 16, 64) // Convert the two hex values into a rune slice, then to a string. Parse the string into an int with strconv.ParseInt()
if err != nil {
return nil, fmt.Errorf("Error parsing hex characters in expression.")
}
i += 2
outQueue = append(outQueue, newPostfixCharNode(rune(hexVal)))
} else {
return nil, fmt.Errorf("Not enough hex characters found in expression.")
}
} else if isOctal(re_postfix[i]) { // Octal value
var octVal int
n, err := fmt.Sscanf(string(re_postfix[i:]), "%d", &octVal)
if n < 1 || err != nil {
return nil, fmt.Errorf("Error parsing octal value in expression.")
}
if octVal > 777 {
return nil, fmt.Errorf("Invalid octal value in expression.")
}
i += int(math.Ceil(math.Log10(float64(octVal)))) // Shift forward by the number of digits that were parsed
i-- // Move back one character, because the loop increment will move us back to the next character automatically
octValBase10, err := strconv.ParseInt(strconv.Itoa(octVal), 8, 0)
if err != nil {
return nil, fmt.Errorf("Error parsing octal value in expression.")
}
outQueue = append(outQueue, newPostfixCharNode(rune(octValBase10)))
} else {
outQueue = append(outQueue, newEscapedNode(re_postfix[i])) outQueue = append(outQueue, newEscapedNode(re_postfix[i]))
}
continue // Escaped character will automatically be skipped when loop variable increments continue // Escaped character will automatically be skipped when loop variable increments
} }
@@ -342,25 +415,60 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
invertMatch = true invertMatch = true
i++ i++
} }
chars := make([]rune, 0) // List of characters - used only for character classes chars := make([]postfixNode, 0) // List of nodes - used only for character classes
for i < len(re_postfix) { for i < len(re_postfix) {
if re_postfix[i] == RBRACKET { if re_postfix[i] == RBRACKET {
break break
} }
chars = append(chars, re_postfix[i]) if re_postfix[i] == '\\' { // Backslash indicates a character to be escaped
if i == len(re_postfix)-1 {
return nil, fmt.Errorf("Stray backslash in character class.")
}
i++ // Step past backslash
if re_postfix[i] == 'x' { // Hex value
i++ i++
if re_postfix[i] == '{' && i < len(re_postfix)-7 { // Expanded hex code
var hexVal int
n, err := fmt.Sscanf(string(re_postfix[i:]), "{%x}", &hexVal)
if n < 1 || err != nil {
return nil, fmt.Errorf("Error parsing expanded hex code in character class.")
}
chars = append(chars, newPostfixCharNode(rune(hexVal)))
i += 8
} else if i < len(re_postfix)-2 { // Two-digit hex code
hexVal, err := strconv.ParseInt(string([]rune{re_postfix[i], re_postfix[i+1]}), 16, 64) // Convert the two hex values into a rune slice, then to a string. Parse the string into an int with strconv.ParseInt()
if err != nil {
return nil, fmt.Errorf("Error parsing hex characters in character class.")
}
i += 2
chars = append(chars, newPostfixCharNode(rune(hexVal)))
} else {
return nil, fmt.Errorf("Not enough hex characters found in character class.")
}
} else if unicode.IsDigit(re_postfix[i]) { // Octal value
var octVal int
n, err := fmt.Sscanf(string(re_postfix[i:]), "%d", &octVal)
if n < 1 || err != nil {
return nil, fmt.Errorf("Error parsing octal value in character class.")
}
if octVal > 0777 {
return nil, fmt.Errorf("Invalid octal value in character class.")
}
i += int(math.Ceil(math.Log10(float64(octVal)) / math.Log10(8))) // Shift forward by the number of digits that were parsed
chars = append(chars, newPostfixCharNode(rune(octVal)))
} else {
chars = append(chars, newEscapedNode(re_postfix[i]))
}
} else {
chars = append(chars, newPostfixCharNode(re_postfix[i]))
i++
}
} }
if i == len(re_postfix) { // We have reached the end of the string, so we didn't encounter a closing brakcet. Panic. if i == len(re_postfix) { // We have reached the end of the string, so we didn't encounter a closing brakcet. Panic.
return nil, fmt.Errorf("Opening bracket without closing bracket.") return nil, fmt.Errorf("Opening bracket without closing bracket.")
} }
if !invertMatch { outQueue = append(outQueue, newCharClassNode(chars, invertMatch))
outQueue = append(outQueue, newPostfixCharNode(chars...))
} else {
// Invert match - create an allChars postfixNode, then add the given states to its 'except' list.
toAdd := newPostfixDotNode()
toAdd.except = chars
outQueue = append(outQueue, toAdd)
}
continue continue
} }
if c == '{' { if c == '{' {
@@ -476,10 +584,29 @@ func thompson(re []postfixNode) (Reg, error) {
if c.allChars { if c.allChars {
state.allChars = true state.allChars = true
if len(c.except) != 0 { if len(c.except) != 0 {
state.except = append([]rune{}, c.except...) // For each node that I am 'excepting' (eg. in an inverted character class):
// - If the node itself has exceptions, then the exceptions cancel out.
// Eg. [^\w] == [\W]
// - Since an allChars node is the only kind that _can_ have exceptions, that's what I check for.
// - If the node doesn't have exceptions (allChars == false) then the contents of the node are added to the except list.
for _, node := range c.except {
if node.allChars {
// For each postfixNode in node.except, extract the contents of the postfixNode. Concatenate them all,
// and add them to the state's _content_. As mentioned above, if the exception has exceptions, then we can match
// those.
nodeExceptChars := slices.Concat(Map(node.except, func(node postfixNode) []rune {
return node.contents
})...)
state.content = rune2Contents(nodeExceptChars)
} else {
state.except = append(state.except, node.contents...)
} }
} }
state.content = rune2Contents(c.contents) }
}
// Convert the current contents to []int, convert the result of rune2contents to []int, append then
// convert back to stateContents.
state.content = stateContents(append([]int(state.content), []int(rune2Contents(c.contents))...))
state.output = make([]*State, 0) state.output = make([]*State, 0)
state.output = append(state.output, &state) state.output = append(state.output, &state)
state.isEmpty = false state.isEmpty = false
@@ -561,6 +688,19 @@ func thompson(re []postfixNode) (Reg, error) {
} }
} }
if c.nodetype == CHARCLASS { // A Character class consists of all the nodes in it, alternated
// Map the list of nodes to a list of states, each state containing the contents of a specific node
states := Map(c.nodeContents, func(node postfixNode) *State {
s := newState()
s.content = rune2Contents(node.contents)
return &s
})
// Reduce the list of states down to a single state by alternating them
toAdd := Reduce(states, func(s1 *State, s2 *State) *State {
return alternate(s1, s2)
})
nfa = append(nfa, toAdd)
}
// Must be an operator if it isn't a character // Must be an operator if it isn't a character
switch c.nodetype { switch c.nodetype {
case CONCATENATE: case CONCATENATE:
@@ -613,7 +753,7 @@ func thompson(re []postfixNode) (Reg, error) {
stateToAdd = concatenate(stateToAdd, s2) stateToAdd = concatenate(stateToAdd, s2)
} else { // Case 2 } else { // Case 2
for i := c.startReps; i < c.endReps; i++ { for i := c.startReps; i < c.endReps; i++ {
stateToAdd = concatenate(stateToAdd, question(state)) stateToAdd = concatenate(stateToAdd, question(cloneState(state)))
} }
} }
nfa = append(nfa, stateToAdd) nfa = append(nfa, stateToAdd)

View File

@@ -119,12 +119,12 @@ func main() {
} }
matchIndices := make([]Match, 0) matchIndices := make([]Match, 0)
if matchNumFlagEnabled { if matchNumFlagEnabled {
tmp, err := findNthMatch(regComp, test_str, *matchNum) tmp, err := FindNthMatch(regComp, test_str, *matchNum)
if err == nil { if err == nil {
matchIndices = append(matchIndices, tmp) matchIndices = append(matchIndices, tmp)
} }
} else { } else {
matchIndices = findAllMatches(regComp, test_str) matchIndices = FindAllMatches(regComp, test_str)
} }
if *printMatchesFlag { if *printMatchesFlag {

View File

@@ -138,10 +138,29 @@ func pruneIndices(indices []Match) []Match {
return toRet return toRet
} }
// findNthMatch finds the 'n'th match of the regex represented by the given start-state, with // FindString returns a _string_ containing the _text_ of the _leftmost_ match of
// the regex, in the given string. The return value will be an empty string in two situations:
// 1. No match was found
// 2. The match was an empty string
func FindString(regex Reg, str string) string {
	match, err := FindNthMatch(regex, str, 1)
	if err != nil {
		// FindNthMatch reports an error when there is no first match.
		return ""
	}
	// FindNthMatch scans []rune(str), so the offsets of group 0 are taken to be
	// rune positions. Indexing the string directly would treat them as byte
	// offsets and return the wrong text (or split a UTF-8 sequence) as soon as
	// the input contains a multi-byte character, so slice the rune view instead.
	runes := []rune(str)
	return string(runes[match[0].startIdx:match[0].endIdx])
}
// FindAllString is the 'all' version of FindString.
// It returns a _slice of strings_ containing the _text_ of _all_ matches of
// the regex, in the given string.
//func FindAllString(regex Reg, str []string) []string {
//
//}
// FindNthMatch finds the 'n'th match of the regex represented by the given start-state, with
// the given string. // the given string.
// It returns an error (!= nil) if there are fewer than 'n' matches in the string. // It returns an error (!= nil) if there are fewer than 'n' matches in the string.
func findNthMatch(regex Reg, str string, n int) (Match, error) { func FindNthMatch(regex Reg, str string, n int) (Match, error) {
idx := 0 idx := 0
matchNum := 0 matchNum := 0
str_runes := []rune(str) str_runes := []rune(str)
@@ -160,9 +179,9 @@ func findNthMatch(regex Reg, str string, n int) (Match, error) {
return nil, fmt.Errorf("Invalid match index. Too few matches found.") return nil, fmt.Errorf("Invalid match index. Too few matches found.")
} }
// findAllMatches tries to find all matches of the regex represented by given start-state, with // FindAllMatches tries to find all matches of the regex represented by given start-state, with
// the given string // the given string
func findAllMatches(regex Reg, str string) []Match { func FindAllMatches(regex Reg, str string) []Match {
idx := 0 idx := 0
str_runes := []rune(str) str_runes := []rune(str)
var matchFound bool var matchFound bool
@@ -180,7 +199,7 @@ func findAllMatches(regex Reg, str string) []Match {
return indices return indices
} }
// Helper for findAllMatches. Returns whether it found a match, the // Helper for FindAllMatches. Returns whether it found a match, the
// first Match it finds, and how far it got into the string ie. where // first Match it finds, and how far it got into the string ie. where
// the next search should start from. // the next search should start from.
// //

View File

@@ -131,3 +131,11 @@ func expandSlice[T any](slc []T, newSize int) []T {
copy(toRet, slc) copy(toRet, slc)
return toRet return toRet
} }
// isHex reports whether c is an ASCII hexadecimal digit: '0'-'9', 'a'-'f' or
// 'A'-'F'. It uses plain range comparisons so that no slice is built per call.
func isHex(c rune) bool {
	switch {
	case '0' <= c && c <= '9',
		'a' <= c && c <= 'f',
		'A' <= c && c <= 'F':
		return true
	}
	return false
}
// isOctal reports whether c is an ASCII octal digit, i.e. one of '0'-'7'.
// A single range comparison avoids building a slice on every call.
func isOctal(c rune) bool {
	return '0' <= c && c <= '7'
}

17
nfa.go
View File

@@ -136,7 +136,7 @@ func (s State) checkAssertion(str []rune, idx int) bool {
strToMatch = string(runesToMatch) strToMatch = string(runesToMatch)
} }
matchIndices := findAllMatches(Reg{startState, s.lookaroundNumCaptureGroups}, strToMatch) matchIndices := FindAllMatches(Reg{startState, s.lookaroundNumCaptureGroups}, strToMatch)
numMatchesFound := 0 numMatchesFound := 0
for _, matchIdx := range matchIndices { for _, matchIdx := range matchIndices {
@@ -314,3 +314,18 @@ func question(s1 *State) *State { // Use the fact that ab? == a(b|)
s3 := alternate(s1, s2) s3 := alternate(s1, s2)
return s3 return s3
} }
// newState builds a State filled in with the 'default' field values: an empty
// transition table, no assertion, an exception list holding only the zero rune,
// an empty lookaround regex and both group markers cleared.
//
// The output list is seeded with one pointer, taken from the function-local
// value before it is copied out by the return statement.
// NOTE(review): since State is returned by value, that pointer refers to the
// local variable rather than to the caller's copy - confirm this is intended.
func newState() State {
	var ret State
	ret.transitions = make(map[int][]*State)
	ret.assert = NONE
	ret.except = append([]rune{}, 0)
	ret.lookaroundRegex = ""
	ret.groupEnd = false
	ret.groupBegin = false
	// Seed the output list with the address of ret itself.
	ret.output = append(make([]*State, 0), &ret)
	return ret
}

View File

@@ -2,9 +2,14 @@ package main
type NodeType int type NodeType int
// This is a slice containing all escapable characters that have special meaning.
// Eg. \b is word boundary, \w is word character etc.
var escapedChars []rune = []rune("wWdDbBnaftrvsS0")
// This is a list of the possible node types // This is a list of the possible node types
const ( const (
CHARACTER NodeType = iota CHARACTER NodeType = iota
CHARCLASS
PIPE PIPE
CONCATENATE CONCATENATE
KLEENE KLEENE
@@ -29,9 +34,31 @@ type postfixNode struct {
startReps int // Minimum number of times the node should be repeated - used with numeric specifiers startReps int // Minimum number of times the node should be repeated - used with numeric specifiers
endReps int // Maximum number of times the node should be repeated - used with numeric specifiers endReps int // Maximum number of times the node should be repeated - used with numeric specifiers
allChars bool // Whether or not the current node represents all characters (eg. dot metacharacter) allChars bool // Whether or not the current node represents all characters (eg. dot metacharacter)
except []rune // For inverted character classes, we match every unicode character _except_ a few. In this case, allChars is true and the exceptions are placed here. except []postfixNode // For inverted character classes, we match every unicode character _except_ a few. In this case, allChars is true and the exceptions are placed here.
lookaroundSign int // ONLY USED WHEN nodetype == ASSERTION. Whether we have a positive or negative lookaround. lookaroundSign int // ONLY USED WHEN nodetype == ASSERTION. Whether we have a positive or negative lookaround.
lookaroundDir int // Lookbehind or lookahead lookaroundDir int // Lookbehind or lookahead
nodeContents []postfixNode // ONLY USED WHEN nodetype == CHARCLASS. Holds all the nodes inside the given CHARCLASS node.
}
// Converts the given list of postfixNodes to one node of type CHARCLASS.
// Used to convert eg. 'a', 'b' and 'c' to '[abc]'.
// If the character class is negated, it returns a postfixNode of type CHARACTER.
// This node will behave like the dot metacharacter, but it has a longer list of runes that
// it will not match.
func newCharClassNode(nodes []postfixNode, negated bool) postfixNode {
rtv := postfixNode{}
rtv.nodetype = CHARCLASS
rtv.startReps = 1
rtv.endReps = 1
if negated {
rtv.nodetype = CHARACTER
rtv.contents = []rune{ANY_CHAR}
rtv.allChars = true
rtv.except = nodes
} else {
rtv.nodeContents = nodes
}
return rtv
} }
// Creates a new escaped node - the given character is assumed to have been preceded by a backslash // Creates a new escaped node - the given character is assumed to have been preceded by a backslash
@@ -45,25 +72,43 @@ func newEscapedNode(c rune) postfixNode {
toReturn.contents = append(toReturn.contents, whitespaceChars...) toReturn.contents = append(toReturn.contents, whitespaceChars...)
case 'S': // Non-whitespace case 'S': // Non-whitespace
toReturn = newPostfixDotNode() toReturn = newPostfixDotNode()
toReturn.except = append([]rune{}, whitespaceChars...) toReturn.except = append([]postfixNode{}, newPostfixNode(whitespaceChars...))
case 'd': // Digits case 'd': // Digits
toReturn.nodetype = CHARACTER toReturn.nodetype = CHARACTER
toReturn.contents = append(toReturn.contents, digitChars...) toReturn.contents = append(toReturn.contents, digitChars...)
case 'D': // Non-digits case 'D': // Non-digits
toReturn = newPostfixDotNode() toReturn = newPostfixDotNode()
toReturn.except = append([]rune{}, digitChars...) toReturn.except = append([]postfixNode{}, newPostfixNode(digitChars...))
case 'w': // word character case 'w': // word character
toReturn.nodetype = CHARACTER toReturn.nodetype = CHARACTER
toReturn.contents = append(toReturn.contents, wordChars...) toReturn.contents = append(toReturn.contents, wordChars...)
case 'W': // Non-word character case 'W': // Non-word character
toReturn = newPostfixDotNode() toReturn = newPostfixDotNode()
toReturn.except = append([]rune{}, wordChars...) toReturn.except = append([]postfixNode{}, newPostfixNode(wordChars...))
case 'b', 'B': case 'b', 'B':
toReturn.nodetype = ASSERTION toReturn.nodetype = ASSERTION
toReturn.contents = append(toReturn.contents, c) toReturn.contents = append(toReturn.contents, c)
case 'n': // Newline character case 'n': // Newline character
toReturn.nodetype = CHARACTER toReturn.nodetype = CHARACTER
toReturn.contents = append(toReturn.contents, '\n') toReturn.contents = append(toReturn.contents, '\n')
case '0': // NULL character
toReturn.nodetype = CHARACTER
toReturn.contents = append(toReturn.contents, rune(0))
case 'a': // Bell character
toReturn.nodetype = CHARACTER
toReturn.contents = append(toReturn.contents, rune(7))
case 'f': // Form feed character
toReturn.nodetype = CHARACTER
toReturn.contents = append(toReturn.contents, rune(12))
case 't': // Horizontal tab character
toReturn.nodetype = CHARACTER
toReturn.contents = append(toReturn.contents, rune(9))
case 'r': // Carriage return
toReturn.nodetype = CHARACTER
toReturn.contents = append(toReturn.contents, rune(13))
case 'v': // Vertical tab
toReturn.nodetype = CHARACTER
toReturn.contents = append(toReturn.contents, rune(11))
default: // None of the above - append it as a regular character default: // None of the above - append it as a regular character
toReturn.nodetype = CHARACTER toReturn.nodetype = CHARACTER
toReturn.contents = append(toReturn.contents, c) toReturn.contents = append(toReturn.contents, c)

View File

@@ -148,6 +148,7 @@ var reTests = []struct {
{"^((3[7-9])|([4-9][0-9])|([1-9][0-9][0-9])|(1000))$", "040", []Group{}}, {"^((3[7-9])|([4-9][0-9])|([1-9][0-9][0-9])|(1000))$", "040", []Group{}},
{"^((3[7-9])|([4-9][0-9])|([1-9][0-9][0-9])|(1000))$", "400", []Group{{0, 3}}}, {"^((3[7-9])|([4-9][0-9])|([1-9][0-9][0-9])|(1000))$", "400", []Group{{0, 3}}},
{"^((3[7-9])|([4-9][0-9])|([1-9][0-9][0-9])|(1000))$", "4000", []Group{}}, {"^((3[7-9])|([4-9][0-9])|([1-9][0-9][0-9])|(1000))$", "4000", []Group{}},
{"a{1,3}", "aaaaa", []Group{{0, 3}, {3, 5}}},
// Lookaround tests // Lookaround tests
{"(?<=bo)y", "boy", []Group{{2, 3}}}, {"(?<=bo)y", "boy", []Group{{2, 3}}},
@@ -191,7 +192,7 @@ func TestFindAllMatches(t *testing.T) {
if err != nil { if err != nil {
panic(err) panic(err)
} }
matchIndices := findAllMatches(regComp, test.str) matchIndices := FindAllMatches(regComp, test.str)
zeroGroups := make([]Group, len(matchIndices)) zeroGroups := make([]Group, len(matchIndices))
for i, m := range matchIndices { for i, m := range matchIndices {
zeroGroups[i] = m[0] zeroGroups[i] = m[0]
@@ -210,7 +211,7 @@ func TestFindAllGroups(t *testing.T) {
if err != nil { if err != nil {
panic(err) panic(err)
} }
matchIndices := findAllMatches(regComp, test.str) matchIndices := FindAllMatches(regComp, test.str)
for i := range matchIndices { for i := range matchIndices {
for j := range matchIndices[i] { for j := range matchIndices[i] {
if matchIndices[i][j].isValid() { if matchIndices[i][j].isValid() {