From 25cb79f01bb4b832eaff2a32bd0ddc9076f0b9f3 Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Tue, 21 Jan 2025 22:10:07 -0500 Subject: [PATCH] Changed the value of EPSILON, so that we can use the NUL character (which it used to be) in a regex; Also added code to detect escaped backslashes Specifically, I replace an escaped backslash with a metacharacter, then replace it back later on. This prevents problems, like detecting whether the opening bracket is escaped in '\\[a]'. --- compile.go | 12 ++++++++++++ nfa.go | 2 +- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/compile.go b/compile.go index c74cad0..7f43273 100644 --- a/compile.go +++ b/compile.go @@ -81,6 +81,10 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) { // // Also check for non-capturing groups. The LPAREN of a non-capturing group looks like this: '(?:' // I take this out, and put in a special character - NONCAPLPAREN_CHAR. + // + // Finally, check for escaped backslashes. Replace these with the ESC_BACKSLASH metacharacter. Later, in thompson(), + // these will be converted back. This avoids confusion in detecting whether a character is escaped e.g. detecting + // whether '\\[a]' has an escaped opening bracket (it doesn't). for i := 0; i < len(re_runes_orig); i++ { c := re_runes_orig[i] if c == '<' && (i == 0 || (re_runes_orig[i-1] != '\\' && re_runes_orig[i-1] != '?')) { @@ -115,6 +119,9 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) { } else if c == '(' && i < len(re_runes_orig)-2 && re_runes_orig[i+1] == '?' 
&& re_runes_orig[i+2] == ':' { re_runes = append(re_runes, NONCAPLPAREN_CHAR) i += 2 + } else if c == '\\' && i < len(re_runes_orig)-1 && re_runes_orig[i+1] == '\\' { // Escaped backslash + re_runes = append(re_runes, ESC_BACKSLASH) + i++ } else { re_runes = append(re_runes, c) } @@ -671,6 +678,11 @@ func thompson(re []postfixNode) (Reg, error) { } } + + // Replace ESC_BACKSLASH with actual backslash, so that we can actually check if we encounter it + replaceByValue([]int(state.content), int(ESC_BACKSLASH), '\\') + replaceByValue(state.except, ESC_BACKSLASH, '\\') + nfa = append(nfa, &state) } if c.nodetype == LPAREN || c.nodetype == RPAREN { diff --git a/nfa.go b/nfa.go index 9bc9ae9..5d1a4a4 100644 --- a/nfa.go +++ b/nfa.go @@ -4,7 +4,7 @@ import ( "slices" ) -const EPSILON int = 0 +const EPSILON int = 0xF0000 type assertType int