Compare commits
4 Commits
v0.6.1
...
0bd7a87797
Author | SHA1 | Date | |
---|---|---|---|
0bd7a87797 | |||
9cf1c66653 | |||
9edc99d73c | |||
|
6850396bf9 |
57
compile.go
57
compile.go
@@ -154,22 +154,10 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
for re_runes[i] != ']' || i == 0 || re_runes[i-1] == '\\' {
|
||||
i++ // Skip all characters inside _unescaped_ brackets (we are _not_ at a closing bracket, or if we are, the previous character is a backslash)
|
||||
// TODO: Check for escaped characters
|
||||
|
||||
// Check ahead for character range
|
||||
if i < len(re_runes)-2 && re_runes[i+1] == '-' {
|
||||
rangeStart := re_runes[i]
|
||||
rangeEnd := re_runes[i+2]
|
||||
if int(rangeEnd) < int(rangeStart) {
|
||||
return nil, fmt.Errorf("Range is out of order.")
|
||||
}
|
||||
|
||||
for i := rangeStart; i <= rangeEnd; i++ {
|
||||
toAppend = append(toAppend, i)
|
||||
}
|
||||
|
||||
i += 2 // Skip start and hyphen (end will automatically be skipped on next iteration of loop)
|
||||
continue
|
||||
if re_runes[i] == '-' && i > 0 && re_runes[i-1] != '\\' { // Unescaped hyphen - replace with CHAR_RANGE. This metacharacter will be used later on to construct the range
|
||||
re_runes[i] = CHAR_RANGE
|
||||
}
|
||||
|
||||
toAppend = append(toAppend, re_runes[i])
|
||||
}
|
||||
// Replace the last character (which should have been ']') with RBRACKET
|
||||
@@ -420,7 +408,8 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
}
|
||||
}
|
||||
if c == LBRACKET { // Used for character classes
|
||||
i++ // Step forward so we can look at the character class
|
||||
endOfRange := false // Set to 'true' when we encounter a CHAR_RANGE metacharacter
|
||||
i++ // Step forward so we can look at the character class
|
||||
var invertMatch bool
|
||||
if re_postfix[i] == '^' {
|
||||
invertMatch = true
|
||||
@@ -431,6 +420,11 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
if re_postfix[i] == RBRACKET {
|
||||
break
|
||||
}
|
||||
if re_postfix[i] == CHAR_RANGE {
|
||||
endOfRange = true
|
||||
i++
|
||||
continue
|
||||
}
|
||||
if re_postfix[i] == '\\' { // Backslash indicates a character to be escaped
|
||||
if i == len(re_postfix)-1 {
|
||||
return nil, fmt.Errorf("Stray backslash in character class.")
|
||||
@@ -486,10 +480,38 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
|
||||
chars = append(chars, newPostfixCharNode(re_postfix[i]))
|
||||
i++
|
||||
}
|
||||
|
||||
if endOfRange { // The previous character was an unescaped hyphen, which (in the context of a character class) means the character that was last appended is the end of a character range
|
||||
// Things to note:
|
||||
// 1. In PCRE and Go's regex engine, a letter _can_ be surrounded by hyphens in a character class.
|
||||
// Eg. [a-b-c]
|
||||
// While you might think this leads to a syntax error (I thought so), the engine picks 'a-b' as a range,
|
||||
// then treats the second '-' and 'c' as regular characters in the character class.
|
||||
// So this regex becomes "Match a character from 'a' to 'b', a literal hyphen, or 'c' ".
|
||||
// 2. To account for this, the following logic is followed:
|
||||
// a. If the second-to-last postfixNode ie. the start of the range has only one element, then we are in a range.
|
||||
// i. If it has more than one element, then we are actually looking at a literal hyphen, and we will treat it as such.
|
||||
// b. The last postfixNode added to 'chars' _must_ only have one character (because it's the end of the range).
|
||||
endRangePostfixNode := mustPop(&chars)
|
||||
startRangePostfixNode := mustPop(&chars)
|
||||
if len(endRangePostfixNode.contents) != 1 {
|
||||
return nil, fmt.Errorf("Error parsing character range.")
|
||||
} else if len(startRangePostfixNode.contents) != 1 { // This is actually a regular hyphen
|
||||
chars = append(chars, startRangePostfixNode, newPostfixCharNode('-'), endRangePostfixNode)
|
||||
} else {
|
||||
// We have established that they both have a length of 1
|
||||
startRangeRune := startRangePostfixNode.contents[0]
|
||||
endRangeRune := endRangePostfixNode.contents[0]
|
||||
chars = append(chars, newPostfixCharNode(genRange(startRangeRune, endRangeRune+1)...))
|
||||
}
|
||||
|
||||
endOfRange = false // Reset the flag
|
||||
}
|
||||
}
|
||||
if i == len(re_postfix) { // We have reached the end of the string, so we didn't encounter a closing bracket. Return an error.
|
||||
return nil, fmt.Errorf("Opening bracket without closing bracket.")
|
||||
}
|
||||
|
||||
outQueue = append(outQueue, newCharClassNode(chars, invertMatch))
|
||||
continue
|
||||
}
|
||||
@@ -681,8 +703,7 @@ func thompson(re []postfixNode) (Reg, error) {
|
||||
|
||||
// Replace ESC_BACKSLASH with actual backslash, so that we can actually check if we encounter it
|
||||
replaceByValue([]int(state.content), int(ESC_BACKSLASH), '\\')
|
||||
// Uncommenting this seems to make one of the test cases fail. Why?
|
||||
// replaceByValue(state.except, ESC_BACKSLASH, '\\')
|
||||
replaceByValue(state.except, ESC_BACKSLASH, '\\')
|
||||
|
||||
nfa = append(nfa, &state)
|
||||
}
|
||||
|
10
misc.go
10
misc.go
@@ -15,6 +15,12 @@ var LPAREN_CHAR rune = 0xF0004 // Parentheses in regex are concatenated with thi
|
||||
var RPAREN_CHAR rune = 0xF0005
|
||||
var NONCAPLPAREN_CHAR rune = 0xF0006 // Represents a non-capturing group's LPAREN
|
||||
var ESC_BACKSLASH rune = 0xF0007 // Represents an escaped backslash
|
||||
var CHAR_RANGE rune = 0xF0008 // Represents a character range
|
||||
|
||||
// A type constraint covering int and rune (rune is an alias for int32), so generic helpers can accept either
|
||||
type character interface {
|
||||
int | rune
|
||||
}
|
||||
|
||||
// Returns true if str[idx] and str[idx-1] are separated by a word boundary.
|
||||
func isWordBoundary(str []rune, idx int) bool {
|
||||
@@ -109,8 +115,8 @@ func Reduce[T any](slc []T, fn func(T, T) T) T {
|
||||
}
|
||||
|
||||
// Generate numbers in a range - start (inclusive) to end (exclusive)
|
||||
func genRange(start, end int) []int {
|
||||
toRet := make([]int, end-start)
|
||||
func genRange[T character](start, end T) []T {
|
||||
toRet := make([]T, end-start)
|
||||
for i := start; i < end; i++ {
|
||||
toRet[i-start] = i
|
||||
}
|
||||
|
Reference in New Issue
Block a user