Started working on lazy quantifier support

Added code to return lazy quantifier postfixNodes
Added metacharacters for lazy quantifiers
2025-02-13 20:50:30 -05:00 · 2025-02-13 20:50:11 -05:00 · 2025-02-13 20:49:54 -05:00 · 2025-02-13 14:14:00 -05:00 · 2025-02-13 09:51:44 -06:00 · 2025-02-13 10:48:23 -05:00
6 changed files with 175 additions and 16 deletions
--- a/cmd/main.go
+++ b/cmd/main.go
@@ -129,6 +129,8 @@ func main() {
 			matchIndices = regComp.FindAllSubmatch(test_str)
 		}
 		test_str_runes := []rune(test_str) // Converting to runes preserves unicode characters
 		if *printMatchesFlag {
 			// if we are in single line mode, print the line on which
 			// the matches occur
@@ -158,10 +160,10 @@ func main() {
 			oldIndices := indicesToPrint.values()
 			indicesToPrint = new_uniq_arr[int]()
 			// Explanation:
-			// Find all numbers from 0 to len(test_str) that are NOT in oldIndices.
+			// Find all numbers from 0 to len(test_str_runes) that are NOT in oldIndices.
 			// These are the values we want to print, now that we have inverted the match.
 			// Re-initialize indicesToPrint and add all of these values to it.
-			indicesToPrint.add(setDifference(genRange(0, len(test_str)), oldIndices)...)
+			indicesToPrint.add(setDifference(genRange(0, len(test_str_runes)), oldIndices)...)
 		}
 		// If lineFlag is enabled, we should only print something if:
@@ -182,7 +184,7 @@ func main() {
 		//			the corresponding end index.
 		// 		3. If not, just print the character.
 		if substituteFlagEnabled {
-			for i := range test_str {
+			for i := range test_str_runes {
 				inMatchIndex := false
 				for _, m := range matchIndices {
 					if i == m[0].StartIdx {
@@ -193,11 +195,11 @@ func main() {
 					}
 				}
 				if !inMatchIndex {
-					fmt.Fprintf(out, "%c", test_str[i])
+					fmt.Fprintf(out, "%c", test_str_runes[i])
 				}
 			}
 		} else {
-			for i, c := range test_str {
+			for i, c := range test_str_runes {
 				if indicesToPrint.contains(i) {
 					color.New(color.FgRed).Fprintf(out, "%c", c)
 					// Newline after every match - only if -o is enabled and -v is disabled.
--- a/regex/compile.go
+++ b/regex/compile.go
@@ -64,7 +64,7 @@ const (
 )
 func isOperator(c rune) bool {
-	if c == '+' || c == '?' || c == '*' || c == '|' || c == concatRune {
+	if c == '+' || c == '?' || c == '*' || c == '|' || c == concatRune || c == lazyPlusRune || c == lazyKleeneRune || c == lazyQuestionRune {
 		return true
 	}
 	return false
@@ -72,7 +72,7 @@ func isOperator(c rune) bool {
 /*
 priority returns the index of op in the precedence slice built below, as
 reported by slices.Index; an op that is not in the slice yields -1.
 NOTE(review): presumably a later position means the operator binds more
 tightly - confirm against the code that compares these values.
 */
 func priority(op rune) int {
-	precedence := []rune{'|', concatRune, '+', '*', '?'}
+	precedence := []rune{'|', concatRune, '+', lazyPlusRune, '*', lazyKleeneRune, '?', lazyQuestionRune}
 	return slices.Index(precedence, op)
 }
@@ -108,6 +108,48 @@ func getPOSIXClass(str []rune) (bool, string) {
 	return true, rtv
 }
 // isUnicodeCharClassLetter reports whether c is one of the one-letter names
 // ('C', 'L', 'M', 'N', 'P', 'S', 'Z') that stand for a unicode character class.
 func isUnicodeCharClassLetter(c rune) bool {
 	classLetters := [...]rune{'C', 'L', 'M', 'N', 'P', 'S', 'Z'}
 	return slices.Index(classLetters[:], c) >= 0
 }
 // rangeTableToRuneSlice expands rangetable into a slice holding every code point
 // it describes: first the R16 ranges, then the R32 ranges, each one walked from
 // Lo to Hi (inclusive) in steps of Stride. A nil table yields a nil slice.
 //
 // The walk is done with int64 counters rather than the range's own unsigned type.
 // With a uint16 counter, a range ending at 0xFFFF wraps back to 0 on the final
 // step, so the `c <= Hi` condition would never become false.
 func rangeTableToRuneSlice(rangetable *unicode.RangeTable) []rune {
 	if rangetable == nil {
 		return nil
 	}
 	var rtv []rune
 	for _, r := range rangetable.R16 {
 		rtv = appendRuneRange(rtv, int64(r.Lo), int64(r.Hi), int64(r.Stride))
 	}
 	for _, r := range rangetable.R32 {
 		rtv = appendRuneRange(rtv, int64(r.Lo), int64(r.Hi), int64(r.Stride))
 	}
 	return rtv
 }
 
 // appendRuneRange appends lo, lo+stride, lo+2*stride, ... (stopping once hi is
 // passed) to dst and returns the extended slice.
 func appendRuneRange(dst []rune, lo, hi, stride int64) []rune {
 	if stride < 1 {
 		stride = 1 // A zero stride would never advance; treat it as a contiguous range.
 	}
 	for c := lo; c <= hi; c += stride {
 		dst = append(dst, rune(c))
 	}
 	return dst
 }
 // unicodeCharClassToRange converts the given unicode character class name into a list of characters in that class.
 // The name is looked up in unicode.Categories first (eg. 'C' or 'Lu') and then in unicode.Scripts (eg. 'Tamil').
 // Both tables are consulted regardless of the name's length, so that two-letter script names such as 'Yi'
 // are accepted instead of being rejected as unknown categories.
 //
 // An error is returned if the name is empty or is found in neither table. The error text distinguishes
 // short names (one or two bytes) from long ones.
 func unicodeCharClassToRange(class string) ([]rune, error) {
 	if len(class) == 0 {
 		return nil, fmt.Errorf("empty unicode character class")
 	}
 	if rangeTable, ok := unicode.Categories[class]; ok {
 		return rangeTableToRuneSlice(rangeTable), nil
 	}
 	if rangeTable, ok := unicode.Scripts[class]; ok {
 		return rangeTableToRuneSlice(rangeTable), nil
 	}
 	if len(class) <= 2 {
 		return nil, fmt.Errorf("invalid short unicode character class")
 	}
 	return nil, fmt.Errorf("invalid long unicode character class")
 }
 // caseInsensitive stores whether the case-insensitive flag has been enabled.
 // It is package-level state and starts out false (the zero value of bool).
 // NOTE(review): presumably set from the flags handed to shuntingYard - confirm.
 var caseInsensitive bool
@@ -166,9 +208,6 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
 	//		metacharacter. Later, in thompson(), these will be converted back. This avoids
 	//		confusion in detecting whether a character is escaped eg. detecting
 	// 		whether '\\[a]' has an escaped opening bracket (it doesn't).
 	//
 	// 	5. 	Check for non-greedy operators. These are not supported at the moment, so an error
 	// 		must be thrown if the user attempts to use a non-greedy operator.
 	for i := 0; i < len(re_runes_orig); i++ {
 		c := re_runes_orig[i]
 		if c == '<' && (i == 0 || (re_runes_orig[i-1] != '\\' && re_runes_orig[i-1] != '?')) {
@@ -215,8 +254,16 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
 		} else if c == ']' && (i == 0 || re_runes[len(re_runes)-1] != '\\') {
 			re_runes = append(re_runes, rbracketRune)
 			continue
-		} else if slices.Contains([]rune{'+', '*', '?'}, c) && (i < len(re_runes_orig)-1 && re_runes_orig[i+1] == '?') {
+		} else if slices.Contains([]rune{'+', '*', '?'}, c) && (i > 0 && re_runes_orig[i-1] != '\\') && (i < len(re_runes_orig)-1 && re_runes_orig[i+1] == '?') {
-			return nil, fmt.Errorf("non-greedy operators are not supported")
+			switch c {
 			case '+':
 				re_runes = append(re_runes, lazyPlusRune)
 			case '*':
 				re_runes = append(re_runes, lazyKleeneRune)
 			case '?':
 				re_runes = append(re_runes, lazyQuestionRune)
 			}
 			i++
 		} else {
 			re_runes = append(re_runes, c)
 		}
@@ -309,10 +356,30 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
 					}
 				} else if isHex(re_runes[i]) {
 					re_postfix = append(re_postfix, re_runes[i:i+2]...)
-					i += 2
+					i += 1 // I don't skip forward 2 steps, because the second step will happen with the loop increment
 				} else {
 					return nil, fmt.Errorf("invalid hex value in expression")
 				}
 			} else if re_runes[i] == 'p' || re_runes[i] == 'P' { // Unicode character class (P is negated unicode charclass)
 				re_postfix = append(re_postfix, re_runes[i])
 				i++
 				if i >= len(re_runes) {
 					return nil, fmt.Errorf("error parsing unicode character class in expression")
 				}
 				if re_runes[i] == '{' { // Full name charclass
 					for re_runes[i] != '}' {
 						re_postfix = append(re_postfix, re_runes[i])
 						i++
 					}
 					re_postfix = append(re_postfix, re_runes[i])
 					i++
 				} else if isUnicodeCharClassLetter(re_runes[i]) {
 					re_postfix = append(re_postfix, re_runes[i])
 					i++
 				} else {
 					return nil, fmt.Errorf("error parsing unicode character class in expression")
 				}
 				i-- // The loop increment at the top will move us forward
 			} else if re_runes[i] == '0' { // Start of octal value
 				numDigits := 1
 				for i+numDigits < len(re_runes) && numDigits < 4 && isOctal(re_runes[i+numDigits]) { // Skip while we see an octal character (max of 4, starting with 0)
@@ -359,7 +426,7 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
 		}
 		if i < len(re_runes) && (re_runes[i] != '(' && re_runes[i] != nonCapLparenRune && re_runes[i] != '|' && re_runes[i] != '\\') || (i > 0 && re_runes[i-1] == '\\') { // Every character should be concatenated if it is escaped
 			if i < len(re_runes)-1 {
-				if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != '+' && re_runes[i+1] != '?' && re_runes[i+1] != ')' && re_runes[i+1] != '{' {
+				if re_runes[i+1] != '|' && re_runes[i+1] != '*' && re_runes[i+1] != lazyKleeneRune && re_runes[i+1] != '+' && re_runes[i+1] != lazyPlusRune && re_runes[i+1] != '?' && re_runes[i+1] != lazyQuestionRune && re_runes[i+1] != ')' && re_runes[i+1] != '{' {
 					re_postfix = append(re_postfix, concatRune)
 				}
 			}
@@ -429,6 +496,39 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
 				} else {
 					return nil, fmt.Errorf("not enough hex characters found in expression")
 				}
 			} else if re_postfix[i] == 'p' || re_postfix[i] == 'P' {
 				charClassInverted := (re_postfix[i] == 'P')
 				charsInClass := []rune{}
 				i++
 				if isUnicodeCharClassLetter(re_postfix[i]) {
 					var err error
 					charsInClass, err = unicodeCharClassToRange(string(re_postfix[i]))
 					if err != nil {
 						return nil, err
 					}
 				} else if re_postfix[i] == '{' {
 					i++ // Skip opening bracket
 					unicodeCharClassStr := ""
 					for re_postfix[i] != '}' {
 						unicodeCharClassStr += string(re_postfix[i])
 						i++
 					}
 					var err error
 					charsInClass, err = unicodeCharClassToRange(unicodeCharClassStr)
 					if err != nil {
 						return nil, err
 					}
 				} else {
 					return nil, fmt.Errorf("error parsing unicode character class in expression")
 				}
 				var toAppend postfixNode
 				if !charClassInverted { // \p
 					toAppend = newPostfixNode(charsInClass...)
 				} else { // \P
 					toAppend = newPostfixDotNode()
 					toAppend.except = append([]postfixNode{}, newPostfixNode(charsInClass...))
 				}
 				outQueue = append(outQueue, toAppend)
 			} else if re_postfix[i] == '0' { // Octal value
 				var octVal int64
 				var octValStr string
@@ -611,7 +711,40 @@ func shuntingYard(re string, flags ...ReFlag) ([]postfixNode, error) {
 						} else {
 							return nil, fmt.Errorf("not enough hex characters found in character class")
 						}
 					} else if re_postfix[i] == 'p' || re_postfix[i] == 'P' {
 						charClassInverted := (re_postfix[i] == 'P')
 						charsInList := []rune{}
 						i++
 						if isUnicodeCharClassLetter(re_postfix[i]) {
 							var err error
 							charsInList, err = unicodeCharClassToRange(string(re_postfix[i]))
 							if err != nil {
 								return nil, err
 							}
 						} else if re_postfix[i] == '{' {
 							i++ // Skip opening bracket
 							unicodeCharClassStr := ""
 							for re_postfix[i] != '}' {
 								unicodeCharClassStr += string(re_postfix[i])
 								i++
 							}
 							var err error
 							charsInList, err = unicodeCharClassToRange(unicodeCharClassStr)
 							if err != nil {
 								return nil, err
 							}
 						} else {
 							return nil, fmt.Errorf("error parsing unicode character class in expression")
 						}
 						if !charClassInverted {
 							chars = append(chars, newPostfixNode(charsInList...))
 						} else {
 							toAppend := newPostfixDotNode()
 							toAppend.except = append([]postfixNode{}, newPostfixNode(charsInList...))
 							chars = append(chars, toAppend)
 						}
 					} else if re_postfix[i] == '0' { // Octal value
 						var octVal int64
 						var octValStr string
 						numDigitsParsed := 0
--- a/regex/doc.go
+++ b/regex/doc.go
@@ -153,7 +153,7 @@ returns the 0-group.
 The following features from [regexp] are (currently) NOT supported:
 1. Named capturing groups
 2. Non-greedy operators
- 3. Unicode character classes
+ 3. Negated POSIX classes
 4. Embedded flags (flags are instead passed as arguments to [Compile])
 5. Literal text with \Q ... \E
--- a/regex/misc.go
+++ b/regex/misc.go
@@ -16,8 +16,11 @@ var rparenRune rune = 0xF0006
 var nonCapLparenRune rune = 0xF0007 // Represents a non-capturing group's LPAREN
 var escBackslashRune rune = 0xF0008 // Represents an escaped backslash
 var charRangeRune rune = 0xF0009    // Represents a character range
 var lazyKleeneRune rune = 0xF000A   // Represents a lazy kleene star
 var lazyPlusRune rune = 0xF000B     // Represents a lazy plus operator
 var lazyQuestionRune rune = 0xF000C // Represents a lazy question operator
-var specialChars = []rune{'?', '*', '\\', '^', '$', '{', '}', '(', ')', '[', ']', '+', '|', '.', concatRune, '<', '>', lbracketRune, rbracketRune, nonCapLparenRune}
+var specialChars = []rune{'?', lazyQuestionRune, '*', lazyKleeneRune, '\\', '^', '$', '{', '}', '(', ')', '[', ']', '+', lazyPlusRune, '|', '.', concatRune, '<', '>', lbracketRune, rbracketRune, nonCapLparenRune}
 // An interface for int and rune, which are identical
 type character interface {
--- a/regex/postfixNode.go
+++ b/regex/postfixNode.go
@@ -44,6 +44,7 @@ type postfixNode struct {
 	lookaroundDir   int           // Lookbehind or lookahead
 	nodeContents    []postfixNode // ONLY USED WHEN nodetype == CHARCLASS. Holds all the nodes inside the given CHARCLASS node.
 	referencedGroup int           // ONLY USED WHEN nodetype == backreferenceNode. Holds the group which this one refers to. After parsing is done, the expression will be rewritten eg. (a)\1 will become (a)(a). So the return value of ShuntingYard() shouldn't contain a backreferenceNode.
 	isLazy          bool          // ONLY USED WHEN nodetype == kleene or question
 }
 // Converts the given list of postfixNodes to one node of type CHARCLASS.
@@ -162,10 +163,19 @@ func newPostfixNode(contents ...rune) postfixNode {
 		switch contents[0] {
 		case '+':
 			to_return.nodetype = plusNode
 		case lazyPlusRune:
 			to_return.nodetype = plusNode
 			to_return.isLazy = true
 		case '?':
 			to_return.nodetype = questionNode
 		case lazyQuestionRune:
 			to_return.nodetype = questionNode
 			to_return.isLazy = true
 		case '*':
 			to_return.nodetype = kleeneNode
 		case lazyKleeneRune:
 			to_return.nodetype = kleeneNode
 			to_return.isLazy = true
 		case '|':
 			to_return.nodetype = pipeNode
 		case concatRune:
--- a/regex/re_test.go
+++ b/regex/re_test.go
@@ -430,6 +430,7 @@ var reTests = []struct {
 	{`^(.+)?B`, []ReFlag{RE_CASE_INSENSITIVE}, `ab`, []Group{{0, 2}}},
 	{`\0009`, []ReFlag{RE_CASE_INSENSITIVE}, "\x009", []Group{{0, 2}}},
 	{`\0141`, []ReFlag{RE_CASE_INSENSITIVE}, "A", []Group{{0, 1}}},
 	{`\0141\0141`, []ReFlag{RE_CASE_INSENSITIVE}, "AA", []Group{{0, 2}}},
 	{`a[-]?c`, []ReFlag{RE_CASE_INSENSITIVE}, `AC`, []Group{{0, 2}}},
@@ -460,8 +461,10 @@ var reTests = []struct {
 	{`[\D5]+`, nil, `1234abc5678`, []Group{{4, 8}}},
 	{`[\da-fA-F]+`, nil, `123abc`, []Group{{0, 6}}},
 	{`\xff`, nil, "\u00ff", []Group{{0, 1}}},
 	{`\xff+`, nil, "\u00ff\u00ff", []Group{{0, 2}}},
 	{`\xFF`, nil, "\u00ff", []Group{{0, 1}}},
 	{`\x00ff`, nil, "\u00ff", []Group{}},
 	{`\x{0000ff}+`, nil, "\u00ff\u00ff", []Group{{0, 2}}},
 	{`\x{0000ff}`, nil, "\u00ff", []Group{{0, 1}}},
 	{`\x{0000FF}`, nil, "\u00ff", []Group{{0, 1}}},
 	{"\t\n\v\r\f\a", nil, "\t\n\v\r\f\a", []Group{{0, 6}}},
@@ -516,6 +519,14 @@ var reTests = []struct {
 	{`<389-400`, nil, `-`, nil},
 	{`<389-400>`, nil, `391`, []Group{{0, 3}}},
 	{`\b<1-10000>\b`, nil, `America declared independence in 1776.`, []Group{{33, 37}}},
 	{`\p{Tamil}+`, nil, `உயிரெழுத்து`, []Group{{0, 11}}}, // Each letter and matra is counted as a separate rune, so 'u', 'ya', 'e (matra), 'ra', 'e (matra)', 'zha', (oo (matra), 'tha', 'ith', 'tha', 'oo (matra)'.
 	{`\P{Tamil}+`, nil, `vowel=உயிரெழுத்து`, []Group{{0, 6}}},
 	{`\P`, nil, `உயிரெழுத்து`, nil},
 	{`\PM\pM*`, nil, `உயிரெழுத்து`, []Group{{0, 1}, {1, 3}, {3, 5}, {5, 7}, {7, 9}, {9, 11}}},
 	{`\pN+`, nil, `123abc456def`, []Group{{0, 3}, {6, 9}}},
 	{`\PN+`, nil, `123abc456def`, []Group{{3, 6}, {9, 12}}},
 	{`[\p{Greek}\p{Cyrillic}]`, nil, `ΣωШД`, []Group{{0, 1}, {1, 2}, {2, 3}, {3, 4}}},
 }
 var groupTests = []struct {
Author	SHA1	Message	Date
Aadhavan Srinivasan	277cbc0fc5	Started working on lazy quantifier support	2025-02-13 20:50:30 -05:00
Aadhavan Srinivasan	3924502b72	Added code to return lazy quantifier postfixNodes	2025-02-13 20:50:11 -05:00
Aadhavan Srinivasan	36b009747b	Added metacharacters for lazy quantifiers	2025-02-13 20:49:54 -05:00
Aadhavan Srinivasan	6cd0a10a8f	Added more documentation	2025-02-13 14:14:00 -05:00
Aadhavan Srinivasan	69fb96c43d	Merge pull request 'Implement Unicode character classes' (#4 ) from implementUnicodeCharClass into master Reviewed-on: #4	2025-02-13 09:51:44 -06:00
Aadhavan Srinivasan	46bc0c8529	Removed unicode character classes from 'features not supported' list	2025-02-13 10:48:23 -05:00
Aadhavan Srinivasan	1a890a1e75	Refactoring - remove duplicate code	2025-02-13 09:10:40 -05:00
Aadhavan Srinivasan	fde3784e5a	Added unicode charclass support within character classes; Fixed bugs with hex classes and unicode classes	2025-02-13 08:58:02 -05:00
Aadhavan Srinivasan	7045711860	Convert test_str into a rune slice for better unicode compatibility, it also fixed the bug where all unicode characters wouldn't be colored	2025-02-13 08:57:06 -05:00
Aadhavan Srinivasan	d4d606d95b	Added tests for unicode character classes; more tests for hex characters	2025-02-13 08:55:12 -05:00
Aadhavan Srinivasan	9cd330e521	More work on unicode character class support - fix bug where all characters aren't being matched	2025-02-12 23:04:10 -05:00
Aadhavan Srinivasan	44d6a2005c	Started working on unicode character classes	2025-02-12 22:19:30 -05:00
Aadhavan Srinivasan	f76cd6c3d9	Merge pull request 'Implement Backreferences' (#3 ) from implementBackreferences into master Reviewed-on: #3	2025-02-12 21:17:32 -06:00