Added lookaround-related fields to State struct, added lookaround support to checkAssertion()

2024-11-22 00:11:51 -05:00
parent 051a8551f3
commit 77d19cd84e
1 changed files with 59 additions and 21 deletions
--- a/nfa.go
+++ b/nfa.go
@@ -1,6 +1,8 @@
 package main

-import "slices"
+import (
+	"slices"
+)

 const EPSILON int = 0

@@ -12,6 +14,10 @@ const (
 	EOS
 	WBOUND
 	NONWBOUND
+	PLA // Positive lookahead
+	NLA // Negative lookahead
+	PLB // Positive lookbehind
+	NLB // Negative lookbehind
 )

 type State struct {
@@ -25,6 +31,7 @@ type State struct {
 	zeroMatchFound  bool             // Whether or not the state has been used for a zero-length match - only relevant for zero states
 	allChars        bool             // Whether or not the state represents all characters (eg. a 'dot' metacharacter). A 'dot' node doesn't store any contents directly, as it would take up too much space
 	except          []rune           // Only valid if allChars is true - match all characters _except_ the ones in this block. Useful for inverting character classes.
+	lookaroundRegex string           // Only for lookaround states - Contents of the regex that the lookaround state holds
 }

 // Clones the NFA starting from the given state.
@@ -53,6 +60,7 @@ func cloneStateHelper(state *State, cloneMap map[*State]*State) *State {
 		zeroMatchFound:  state.zeroMatchFound,
 		allChars:        state.allChars,
 		except:          append([]rune{}, state.except...),
+		lookaroundRegex: state.lookaroundRegex,
 	}
 	cloneMap[state] = clone
 	for i, s := range state.output {
@@ -90,6 +98,36 @@ func (s State) checkAssertion(str []rune, idx int) bool {
 	if s.assert == NONWBOUND {
 		return !isWordBoundary(str, idx)
 	}
+	if s.assert == PLA || s.assert == PLB || s.assert == NLA || s.assert == NLB { // Lookaround
+		// The process here is simple:
+		// 		1. Compile the regex stored in the state's contents.
+		// 		2. Run it on the test string.
+		// 		3. Based on the kind of lookaround (and the indices we get), determine what action to take.
+		regex := s.lookaroundRegex
+		re_postfix := shuntingYard(regex)
+		startState := thompson(re_postfix)
+		matchIndices := findAllMatches(startState, str)
+
+		numMatchesFound := 0
+		for _, matchIdx := range matchIndices {
+			if s.assert == PLA || s.assert == NLA { // Lookahead - return true (or false) if at least one match starts at the current index
+				if matchIdx.startIdx == idx {
+					numMatchesFound++
+				}
+			}
+			if s.assert == PLB || s.assert == NLB { // Lookbehind - return true (or false) if at least one match _ends_ at the current index.
+				if matchIdx.endIdx == idx {
+					numMatchesFound++
+				}
+			}
+		}
+		if s.assert == PLA || s.assert == PLB { // Positive assertions want at least one match
+			return numMatchesFound > 0
+		}
+		if s.assert == NLA || s.assert == NLB { // Negative assertions only want zero matches
+			return numMatchesFound == 0
+		}
+	}
 	return true
 }