Compare commits
8 Commits
435588274c
...
4c96cfa06c
Author | SHA1 | Date | |
---|---|---|---|
4c96cfa06c | |||
bd56c9c7b5 | |||
6cf523b7ea | |||
ed2671849d | |||
2309d35d30 | |||
5afb7dd04a | |||
d5007a3fd5 | |||
5c4d979d7e |
100
compile.go
100
compile.go
@@ -123,9 +123,12 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
|||||||
} else if c == '\\' && i < len(re_runes_orig)-1 && re_runes_orig[i+1] == '\\' { // Escaped backslash
|
} else if c == '\\' && i < len(re_runes_orig)-1 && re_runes_orig[i+1] == '\\' { // Escaped backslash
|
||||||
re_runes = append(re_runes, ESC_BACKSLASH)
|
re_runes = append(re_runes, ESC_BACKSLASH)
|
||||||
i++
|
i++
|
||||||
} else if c == '[' && (i == 0 || re_runes_orig[i-1] != '\\')
|
} else if c == '[' && (i == 0 || re_runes[len(re_runes)-1] != '\\') {
|
||||||
|
re_runes = append(re_runes, LBRACKET)
|
||||||
|
continue
|
||||||
|
} else if c == ']' && (i == 0 || re_runes[len(re_runes)-1] != '\\') {
|
||||||
|
re_runes = append(re_runes, RBRACKET)
|
||||||
|
continue
|
||||||
} else {
|
} else {
|
||||||
re_runes = append(re_runes, c)
|
re_runes = append(re_runes, c)
|
||||||
}
|
}
|
||||||
@@ -145,31 +148,28 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
|||||||
i := 0
|
i := 0
|
||||||
for i < len(re_runes) {
|
for i < len(re_runes) {
|
||||||
re_postfix = append(re_postfix, re_runes[i])
|
re_postfix = append(re_postfix, re_runes[i])
|
||||||
if re_runes[i] == '[' && (i == 0 || re_runes[i-1] != '\\') { // We do not touch things inside brackets, unless they are escaped. Inside this block, the only task is to expand character ranges into their constituent characters.
|
if re_runes[i] == LBRACKET && (i == 0 || re_runes[i-1] != '\\') { // We do not touch things inside brackets, unless they are escaped.
|
||||||
re_postfix[len(re_postfix)-1] = LBRACKET // Replace the '[' character with LBRACKET. This allows for easier parsing of all characters (including opening and closing brackets) within the character class
|
|
||||||
toAppend := make([]rune, 0) // Holds all the runes in the current character class
|
toAppend := make([]rune, 0) // Holds all the runes in the current character class
|
||||||
if i < len(re_runes)-1 && re_runes[i+1] == '^' { // Inverting class - match everything NOT in brackets
|
|
||||||
re_postfix = append(re_postfix, '^')
|
i++ // Skip past LBRACKET, because it was already added
|
||||||
i++ // Skip opening bracket and caret
|
if i >= len(re_runes) { // Sanity check before we start
|
||||||
|
return nil, fmt.Errorf("Opening bracket without closing bracket.")
|
||||||
}
|
}
|
||||||
if i < len(re_runes)-1 && re_runes[i+1] == ']' { // Nothing inside brackets - panic.
|
|
||||||
return nil, fmt.Errorf("Empty character class.")
|
for re_runes[i] != RBRACKET || i == 0 || re_runes[i-1] == '\\' { // Skip all characters inside _unescaped_ brackets (we are _not_ at a closing bracket, or if we are, the previous character is a backslash)
|
||||||
}
|
|
||||||
for re_runes[i] != ']' || i == 0 || re_runes[i-1] == '\\' {
|
|
||||||
i++ // Skip all characters inside _unescaped_ brackets (we are _not_ at a closing bracket, or if we are, the previous character is a backslash)
|
|
||||||
// Make sure we haven't exceeded the length of the string. If we did, then the regex doesn't actually have a closing bracket and we should throw an error.
|
// Make sure we haven't exceeded the length of the string. If we did, then the regex doesn't actually have a closing bracket and we should throw an error.
|
||||||
if i >= len(re_runes) {
|
if i >= len(re_runes) {
|
||||||
return nil, fmt.Errorf("Opening bracket without closing bracket.")
|
return nil, fmt.Errorf("Opening bracket without closing bracket.")
|
||||||
}
|
}
|
||||||
|
|
||||||
if re_runes[i] == '-' && (i > 0 && re_runes[i-1] != '\\') && (i < len(re_runes)-1 && re_runes[i+1] != ']') { // Unescaped hyphen, that has some character (not a RBRACKET) after it - This represents a character range, so we replace with CHAR_RANGE. This metacharacter will be used later on to construct the range
|
if re_runes[i] == '-' && (i > 0 && re_runes[i-1] != '\\') && (i < len(re_runes)-1 && re_runes[i+1] != RBRACKET) { // Unescaped hyphen, that has some character (not a RBRACKET) after it - This represents a character range, so we replace with CHAR_RANGE. This metacharacter will be used later on to construct the range
|
||||||
re_runes[i] = CHAR_RANGE
|
re_runes[i] = CHAR_RANGE
|
||||||
}
|
}
|
||||||
|
|
||||||
toAppend = append(toAppend, re_runes[i])
|
toAppend = append(toAppend, re_runes[i])
|
||||||
|
i++
|
||||||
}
|
}
|
||||||
// Replace the last character (which should have been ']') with RBRACKET
|
// Add in the RBRACKET
|
||||||
toAppend[len(toAppend)-1] = RBRACKET
|
toAppend = append(toAppend, RBRACKET)
|
||||||
re_postfix = append(re_postfix, toAppend...)
|
re_postfix = append(re_postfix, toAppend...)
|
||||||
}
|
}
|
||||||
if i < len(re_runes) && re_runes[i] == '{' && (i > 0 && re_runes[i-1] != '\\') { // We don't touch things inside braces, either
|
if i < len(re_runes) && re_runes[i] == '{' && (i > 0 && re_runes[i-1] != '\\') { // We don't touch things inside braces, either
|
||||||
@@ -284,6 +284,17 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
|||||||
}
|
}
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
// Since every unescaped bracket is replaced by a LBRACKET / RBRACKET, there may
|
||||||
|
// have been false positives. For example, the regex ']' has a closing bracket, but it
|
||||||
|
// isn't denoting a character class; it's just a regular character. Since it's not escaped,
|
||||||
|
// though, I would have converted this into an RBRACKET.
|
||||||
|
// To deal with this, I make the following assertion:
|
||||||
|
// If at any point I see an RBRACKET 'in the wild' (not in a character class), then it must be
|
||||||
|
// a regular character, with no special significance.
|
||||||
|
if c == RBRACKET {
|
||||||
|
outQueue = append(outQueue, newPostfixCharNode(']'))
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
if c == '\\' { // Escape character - invert special and non-special characters eg. \( is treated as a literal parentheses, \b is treated as word boundary
|
if c == '\\' { // Escape character - invert special and non-special characters eg. \( is treated as a literal parentheses, \b is treated as word boundary
|
||||||
if i == len(re_postfix)-1 { // End of string - panic, because backslash is an escape character (something needs to come after it)
|
if i == len(re_postfix)-1 { // End of string - panic, because backslash is an escape character (something needs to come after it)
|
||||||
@@ -419,6 +430,10 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
|||||||
firstCharAdded := false // A character class must have at least 1 character. This flag checks if the first character has been added.
|
firstCharAdded := false // A character class must have at least 1 character. This flag checks if the first character has been added.
|
||||||
endOfRange := false // Set to 'true' when we encounter a CHAR_RANGE metacharacter
|
endOfRange := false // Set to 'true' when we encounter a CHAR_RANGE metacharacter
|
||||||
i++ // Step forward so we can look at the character class
|
i++ // Step forward so we can look at the character class
|
||||||
|
// Oops, there's nothing there to look at
|
||||||
|
if i >= len(re_postfix) {
|
||||||
|
return nil, fmt.Errorf("Opening bracket with no closing bracket.")
|
||||||
|
}
|
||||||
var invertMatch bool
|
var invertMatch bool
|
||||||
if re_postfix[i] == '^' {
|
if re_postfix[i] == '^' {
|
||||||
invertMatch = true
|
invertMatch = true
|
||||||
@@ -643,6 +658,21 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
|||||||
func thompson(re []postfixNode) (Reg, error) {
|
func thompson(re []postfixNode) (Reg, error) {
|
||||||
nfa := make([]*State, 0) // Stack of states
|
nfa := make([]*State, 0) // Stack of states
|
||||||
numGroups := 0 // Number of capturing groups
|
numGroups := 0 // Number of capturing groups
|
||||||
|
|
||||||
|
// If thompson() receives an empty regex, then whatever was given to shuntingYard()
|
||||||
|
// was parsed away. This doesn't mean that the regex itself is empty.
|
||||||
|
// For example, it could have been '(?:)'. This is an empty non-capturing group. Since
|
||||||
|
// shuntingYard() doesn't include non-capturing groups in its output (and the group contains
|
||||||
|
// nothing), the output of shuntingYard() (and the input to thompson()) ends up being empty.
|
||||||
|
// In these cases, we will return an NFA with 1 state, with an assertion that is always true.
|
||||||
|
if len(re) == 0 {
|
||||||
|
start := newState()
|
||||||
|
start.content = newContents(EPSILON)
|
||||||
|
start.isEmpty = true
|
||||||
|
start.assert = ALWAYS_TRUE
|
||||||
|
nfa = append(nfa, &start)
|
||||||
|
}
|
||||||
|
|
||||||
for _, c := range re {
|
for _, c := range re {
|
||||||
if c.nodetype == CHARACTER || c.nodetype == ASSERTION {
|
if c.nodetype == CHARACTER || c.nodetype == ASSERTION {
|
||||||
state := State{}
|
state := State{}
|
||||||
@@ -749,15 +779,36 @@ func thompson(re []postfixNode) (Reg, error) {
|
|||||||
// and then some other node.
|
// and then some other node.
|
||||||
// These three nodes (LPAREN, the middle node and RPAREN) are extracted together, concatenated
|
// These three nodes (LPAREN, the middle node and RPAREN) are extracted together, concatenated
|
||||||
// and added back in.
|
// and added back in.
|
||||||
|
// If the middle node doesn't exist (i.e. something like '()'), that's fine, I just connect the LPAREN
|
||||||
|
// and RPAREN nodes.
|
||||||
|
// If neither node exists, that's a problem so I return an error.
|
||||||
if c.nodetype == RPAREN {
|
if c.nodetype == RPAREN {
|
||||||
s.groupEnd = true
|
s.groupEnd = true
|
||||||
middleNode := mustPop(&nfa)
|
middleNode, err1 := pop(&nfa)
|
||||||
lparenNode := mustPop(&nfa)
|
lparenNode, err2 := pop(&nfa)
|
||||||
|
if err1 != nil && err2 != nil {
|
||||||
|
return Reg{}, fmt.Errorf("Imbalanced parentheses.")
|
||||||
|
} else if err2 != nil { // There was no third node. ie. something like '()'
|
||||||
|
lparenNode = middleNode
|
||||||
|
if lparenNode.groupBegin != true { // There are only two nodes, but the first one isn't an LPAREN.
|
||||||
|
return Reg{}, fmt.Errorf("Imbalanced parentheses.")
|
||||||
|
}
|
||||||
s.groupNum = lparenNode.groupNum
|
s.groupNum = lparenNode.groupNum
|
||||||
|
to_add := concatenate(lparenNode, s)
|
||||||
|
nfa = append(nfa, to_add)
|
||||||
|
} else {
|
||||||
|
// At this point, we assume all three nodes are valid ('lparenNode', 'middleNode' and 's')
|
||||||
|
if lparenNode.groupBegin {
|
||||||
|
s.groupNum = lparenNode.groupNum
|
||||||
|
} else if middleNode.groupBegin { // Something like 'a()'
|
||||||
|
s.groupNum = middleNode.groupNum
|
||||||
|
} else { // A middleNode and lparenNode exist, but neither is actually an LPAREN.
|
||||||
|
return Reg{}, fmt.Errorf("Imbalanced parentheses.")
|
||||||
|
}
|
||||||
tmp := concatenate(lparenNode, middleNode)
|
tmp := concatenate(lparenNode, middleNode)
|
||||||
to_add := concatenate(tmp, s)
|
to_add := concatenate(tmp, s)
|
||||||
nfa = append(nfa, to_add)
|
nfa = append(nfa, to_add)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if c.nodetype == CHARCLASS { // A Character class consists of all the nodes in it, alternated
|
if c.nodetype == CHARCLASS { // A Character class consists of all the nodes in it, alternated
|
||||||
@@ -777,9 +828,16 @@ func thompson(re []postfixNode) (Reg, error) {
|
|||||||
switch c.nodetype {
|
switch c.nodetype {
|
||||||
case CONCATENATE:
|
case CONCATENATE:
|
||||||
s2 := mustPop(&nfa)
|
s2 := mustPop(&nfa)
|
||||||
s1 := mustPop(&nfa)
|
// Relax the requirements for concatenation a little bit - If
|
||||||
|
// the second element is not found ie. the postfixNodes look
|
||||||
|
// like 'a~', then that's fine, we just skip the concatenation.
|
||||||
|
s1, err := pop(&nfa)
|
||||||
|
if err != nil {
|
||||||
|
nfa = append(nfa, s2)
|
||||||
|
} else {
|
||||||
s1 = concatenate(s1, s2)
|
s1 = concatenate(s1, s2)
|
||||||
nfa = append(nfa, s1)
|
nfa = append(nfa, s1)
|
||||||
|
}
|
||||||
case KLEENE: // Create a 0-state, concat the popped state after it, concat the 0-state after the popped state
|
case KLEENE: // Create a 0-state, concat the popped state after it, concat the 0-state after the popped state
|
||||||
s1 := mustPop(&nfa)
|
s1 := mustPop(&nfa)
|
||||||
stateToAdd := kleene(*s1)
|
stateToAdd := kleene(*s1)
|
||||||
|
4
nfa.go
4
nfa.go
@@ -18,6 +18,7 @@ const (
|
|||||||
NLA // Negative lookahead
|
NLA // Negative lookahead
|
||||||
PLB // Positive lookbehind
|
PLB // Positive lookbehind
|
||||||
NLB // Negative lookbehind
|
NLB // Negative lookbehind
|
||||||
|
ALWAYS_TRUE // An assertion that is always true
|
||||||
)
|
)
|
||||||
|
|
||||||
type State struct {
|
type State struct {
|
||||||
@@ -103,6 +104,9 @@ func cloneStateHelper(state *State, cloneMap map[*State]*State) *State {
|
|||||||
// Checks if the given state's assertion is true. Returns true if the given
|
// Checks if the given state's assertion is true. Returns true if the given
|
||||||
// state doesn't have an assertion.
|
// state doesn't have an assertion.
|
||||||
func (s State) checkAssertion(str []rune, idx int) bool {
|
func (s State) checkAssertion(str []rune, idx int) bool {
|
||||||
|
if s.assert == ALWAYS_TRUE {
|
||||||
|
return true
|
||||||
|
}
|
||||||
if s.assert == SOS {
|
if s.assert == SOS {
|
||||||
return idx == 0
|
return idx == 0
|
||||||
}
|
}
|
||||||
|
31
re_test.go
31
re_test.go
@@ -237,6 +237,34 @@ var reTests = []struct {
|
|||||||
{`(abc`, nil, `-`, nil},
|
{`(abc`, nil, `-`, nil},
|
||||||
{`a]`, nil, `a]`, []Group{{0, 2}}},
|
{`a]`, nil, `a]`, []Group{{0, 2}}},
|
||||||
{`a[]]b`, nil, `a]b`, []Group{{0, 3}}},
|
{`a[]]b`, nil, `a]b`, []Group{{0, 3}}},
|
||||||
|
{`a[\]]b`, nil, `a]b`, []Group{{0, 3}}},
|
||||||
|
{`a[^bc]d`, nil, `aed`, []Group{{0, 3}}},
|
||||||
|
{`a[^bc]d`, nil, `abd`, []Group{}},
|
||||||
|
{`a[^-b]c`, nil, `adc`, []Group{{0, 3}}},
|
||||||
|
{`a[^-b]c`, nil, `a-c`, []Group{}},
|
||||||
|
{`a[^]b]c`, nil, `a]c`, []Group{}},
|
||||||
|
{`a[^]b]c`, nil, `adc`, []Group{{0, 3}}},
|
||||||
|
{`\ba\b`, nil, `a-`, []Group{{0, 1}}},
|
||||||
|
{`\ba\b`, nil, `-a`, []Group{{1, 2}}},
|
||||||
|
{`\ba\b`, nil, `-a-`, []Group{{1, 2}}},
|
||||||
|
{`\by\b`, nil, `xy`, []Group{}},
|
||||||
|
{`\by\b`, nil, `yz`, []Group{}},
|
||||||
|
{`\by\b`, nil, `xyz`, []Group{}},
|
||||||
|
{`x\b`, nil, `xyz`, []Group{}},
|
||||||
|
{`x\B`, nil, `xyz`, []Group{{0, 1}}},
|
||||||
|
{`\Bz`, nil, `xyz`, []Group{{2, 3}}},
|
||||||
|
{`z\B`, nil, `xyz`, []Group{}},
|
||||||
|
{`\Bx`, nil, `xyz`, []Group{}},
|
||||||
|
{`\Ba\B`, nil, `a-`, []Group{}},
|
||||||
|
{`\Ba\B`, nil, `-a`, []Group{}},
|
||||||
|
{`\Ba\B`, nil, `-a-`, []Group{}},
|
||||||
|
{`\By\B`, nil, `xy`, []Group{}},
|
||||||
|
{`\By\B`, nil, `yz`, []Group{}},
|
||||||
|
{`\By\b`, nil, `xy`, []Group{{1, 2}}},
|
||||||
|
{`\by\B`, nil, `yz`, []Group{{0, 1}}},
|
||||||
|
{`\By\B`, nil, `xyz`, []Group{{1, 2}}},
|
||||||
|
{`ab|cd`, nil, `abc`, []Group{{0, 2}}},
|
||||||
|
{`ab|cd`, nil, `abcd`, []Group{{0, 2}, {2, 4}}},
|
||||||
|
|
||||||
// Todo - add numeric range tests
|
// Todo - add numeric range tests
|
||||||
}
|
}
|
||||||
@@ -267,6 +295,9 @@ var groupTests = []struct {
|
|||||||
{"(a?)a?", nil, "aa", []Match{[]Group{{0, 2}, {0, 1}}, []Group{{2, 2}, {2, 2}}}},
|
{"(a?)a?", nil, "aa", []Match{[]Group{{0, 2}, {0, 1}}, []Group{{2, 2}, {2, 2}}}},
|
||||||
{"a((b.d){3})", nil, "abfdbhdbid", []Match{[]Group{{0, 10}, {1, 10}, {7, 10}}}},
|
{"a((b.d){3})", nil, "abfdbhdbid", []Match{[]Group{{0, 10}, {1, 10}, {7, 10}}}},
|
||||||
{`(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)(l)\071`, nil, `abcdefghijkl9`, []Match{[]Group{{0, 13}, {0, 1}, {1, 2}, {2, 3}, {3, 4}, {4, 5}, {5, 6}, {6, 7}, {7, 8}, {8, 9}, {9, 10}, {10, 11}, {11, 12}}}},
|
{`(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)(l)\071`, nil, `abcdefghijkl9`, []Match{[]Group{{0, 13}, {0, 1}, {1, 2}, {2, 3}, {3, 4}, {4, 5}, {5, 6}, {6, 7}, {7, 8}, {8, 9}, {9, 10}, {10, 11}, {11, 12}}}},
|
||||||
|
{`()ef`, nil, `def`, []Match{[]Group{{1, 3}, {1, 1}}}},
|
||||||
|
{`(?:)ef`, nil, `def`, []Match{[]Group{{1, 3}}}},
|
||||||
|
{`(?:)`, nil, `def`, []Match{[]Group{{0, 0}}, []Group{{1, 1}}, []Group{{2, 2}}, []Group{{3, 3}}}},
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestFindAllMatches(t *testing.T) {
|
func TestFindAllMatches(t *testing.T) {
|
||||||
|
Reference in New Issue
Block a user