Factor list line common parsing into a separate function; refactored

OList and UList line parsing to us it
A nested list can be ordered or unordered
2025-05-20 16:48:31 -04:00 · 2025-05-20 16:47:58 -04:00 · 2025-05-20 16:47:31 -04:00 · 2025-05-20 16:47:16 -04:00 · 2025-05-20 16:47:04 -04:00 · 2025-05-20 16:46:52 -04:00
2 changed files with 93 additions and 62 deletions
--- a/src/MdToHTML.hs
+++ b/src/MdToHTML.hs
@@ -6,6 +6,7 @@ module MdToHTML where

 import Control.Applicative
 import Control.Monad
+import Data.Char
 import Data.List
 import Data.Ord (comparing)
 import Debug.Trace
@@ -87,17 +88,6 @@ leftmostLongestParse parser input =
        Nothing -> (mempty, mempty)
        Just x -> x

-- Parse if the string that's left matches the string comparator function
-lookaheadParse :: (String -> Bool) -> ReadP Char
-lookaheadParse stringCmp = do
-  lookahead <- look
-  case stringCmp lookahead of
-    True -> get
-    False -> pfail
-
-lineToList :: MdToken -> [MdToken]
-lineToList (Line tokens) = tokens
-
 specialChars = "\\#*_[\n"

 -- Makes a parser greedy. Instead of returning all possible parses, only the longest one is returned.
@@ -118,25 +108,6 @@ prepend x1 x2 = x1 ++ x2
 append :: [a] -> [a] -> [a]
 append x1 x2 = x2 ++ x1

-- Sequence two parsers, running one after the other and returning the result.
-sequenceParse :: ReadP a -> ReadP a -> ReadP [a]
-sequenceParse p1 p2 = twoElemList <$> p1 <*> p2
-  where
-    twoElemList elem1 elem2 = [elem1, elem2]
-
-- Parses p1 until p2 succeeds, but doesn't actually consume anything from p2.
-- Similar to manyTill, except manyTill's second parser actually consumes characters.
-manyTillLazy :: ReadP a -> ReadP b -> ReadP [a]
-manyTillLazy p1 p2 = do
-  res <- p1
-  remaining <- look
-  let p2res = readP_to_S p2 remaining
-  case p2res of
-    [] -> do
-      res2 <- manyTillLazy p1 p2
-      return (res : res2)
-    _ -> return [res]
-
 -- Parse until EOL or EOF
 parseTillEol :: ReadP String
 parseTillEol = manyTill get (void (char '\n') <++ eof)
@@ -176,9 +147,9 @@ parseBold = parseBoldWith "**" <|> parseBoldWith "__"

 -- Parse italic text
 parseItalic :: ReadP MdToken
-parseItalic = parseBoldWith "*" <|> parseBoldWith "_"
+parseItalic = parseItalicWith "*" <|> parseItalicWith "_"
  where
-    parseBoldWith delim = do
+    parseItalicWith delim = do
      string delim
      inside <- greedyParse1 parseLineToken
      string delim
@@ -316,17 +287,13 @@ parseBlockquote = do
  return (Blockquote parsedQuotedLines)

 -- Parse a nested list item.
-parseUListNested :: ReadP MdToken
-parseUListNested = do
-  --  firstChar <- string "    " <++ string "\t"
-  --  skipSpaces
-  --  restOfLine <- manyTill get (void (char '\n') <++ eof)
-  --  let restOfLineParsed = fst $ leftmostLongestParse parseLine restOfLine
-  --  return restOfLineParsed
+parseListNested :: ReadP MdToken
+parseListNested = do
  let firstCharParser = string "    " <++ string "\t"
  let restOfLineParser = manyTill get (void (char '\n') <++ eof)
  lines <- greedyParse1 (firstCharParser *> restOfLineParser)
-  let linesParsed = fst $ leftmostLongestParse parseUnorderedList (init $ unlines lines)
+  let linesParsed = fst $ leftmostLongestParse (parseUnorderedList <++ parseOrderedList) (init $ unlines lines)
+  when (null (show linesParsed)) pfail
  return linesParsed

 -- Parse an unordered list line item.
@@ -334,36 +301,54 @@ parseUListLineItem :: ReadP MdToken
 parseUListLineItem = do
  firstChar <- choice (map char ['*', '+', '-'])
  char ' ' -- At least one space between list indicator and list text.
+  parseListLineItemCommon
+
+-- Parse an ordered list line item.
+parseOListLineItem :: ReadP MdToken
+parseOListLineItem = do
+  num <- greedyParse1 (satisfy isDigit)
+  char '.'
+  char ' ' -- At least one space between list indicator and list text.
+  parseListLineItemCommon
+
+-- Common code for parsing list line items
+parseListLineItemCommon :: ReadP MdToken
+parseListLineItemCommon = do
  skipSpaces
  restOfLine <- many1 parseListLineToken
  void (char '\n') <++ eof
-  nestedList <- parseUListNested <++ return (Unit "")
+  nestedList <- parseListNested <++ return (Unit "")
  return $ Line [Line restOfLine, nestedList]

-- restOfLine <- manyTill get (void (char '\n') <++ eof)
-- let restOfLineParsed = fst $ leftmostLongestParse parseLine restOfLine
-- nestedList <- parseUListNested <++ return (Unit "")
-- return $ Line [restOfLineParsed, nestedList]
-
 -- Parse an unordered list paragraph item.
-- This is defined as a line item, followed by an empty line, followed by one or more
-- lines indented by a space or tab.
 parseUListParaItem :: ReadP MdToken
 parseUListParaItem = do
  firstLine <- parseUListLineItem
-  char '\n'
-  lines <- greedyParse1 ((string "    " <|> string "\t") *> parseTillEol)
-  let res = fst $ leftmostLongestParse (greedyParse1 parsePara) (init $ unlines lines)
-  char '\n'
+  res <- parseListParaItemCommon
  return $ Document (Para firstLine : res) -- I only wrap this in a document because I want some way of converting [MdToken] to MdToken, without any overhead. There is no other reason to wrap it in a Document.

-- This is hacky as hell
-- parsedParas <- manyTillLazy parsePara (string "\n\n" *> choice (map char "*-+"))
-- return $ Document parsedParas -- I wrap this in a document because I want some way of converting [MdToken] to MdToken, without any overhead. There is no other reason to wrap it in a Document.
+-- Parse an unordered list paragraph item.
+parseOListParaItem :: ReadP MdToken
+parseOListParaItem = do
+  firstLine <- parseOListLineItem
+  res <- parseListParaItemCommon
+  return $ Document (Para firstLine : res) -- I only wrap this in a document because I want some way of converting [MdToken] to MdToken, without any overhead. There is no other reason to wrap it in a Document.
+
+-- Common code for parsing list paragraph items.
+-- A list paragraph item is defined as a line item, followed by an empty line, followed by one or more
+-- lines indented by a space or tab.
+-- A list paragraph item can also be a blockquote.
+parseListParaItemCommon :: ReadP [MdToken]
+parseListParaItemCommon = do
+  char '\n'
+  lines <- greedyParse1 ((string "    " <|> string "\t") *> parseTillEol)
+  let res = fst $ leftmostLongestParse (greedyParse1 parseBlockquote <++ greedyParse1 parsePara) (init $ unlines lines)
+  char '\n'
+  return res -- I only wrap this in a document because I want some way of converting [MdToken] to MdToken, without any overhead. There is no other reason to wrap it in a Document.

 -- Parse an unordered list item, which can be a line item or another list.
 parseUListItem :: ReadP MdToken
-parseUListItem = parseUListParaItem <++ parseUListLineItem <++ parseUListNested
+parseUListItem = parseUListParaItem <++ parseUListLineItem

 -- Parse an unordered list.
 parseUnorderedList :: ReadP MdToken
@@ -372,8 +357,36 @@ parseUnorderedList = do
  void (char '\n') <++ eof -- A list must end in an extra newline or eof
  return $ UnordList lineItems

+-- --------
+
+parseOListItem :: ReadP MdToken
+parseOListItem = parseOListParaItem <++ parseOListLineItem
+
+-- Parses the first element of an ordered list, which must start with '1.'
+parseFirstOListItem :: ReadP MdToken
+parseFirstOListItem = do
+  remaining <- look
+  when (take 2 remaining /= "1.") pfail
+  parseOListLineItem
+
+parseOrderedList :: ReadP MdToken
+parseOrderedList = do
+  firstLine <- parseFirstOListItem
+  lineItems <- greedyParse1 parseOListItem
+  void (char '\n') <++ eof
+  return $ OrdList (firstLine : lineItems)
+
+documentParsers :: [ReadP MdToken]
+documentParsers =
+  [ parseHeader,
+    parseBlockquote,
+    parseUnorderedList,
+    parseOrderedList,
+    parsePara
+  ]
+
 -- Parse a document, which is multiple paragraphs.
 parseDocument :: ReadP MdToken
 parseDocument = do
-  res <- manyTill (parseHeader <++ parseBlockquote <++ parseUnorderedList <++ parsePara) eof
+  res <- manyTill (fallthroughParser documentParsers) eof
  return (Document res)
--- a/src/MdToHtmlTest.hs
+++ b/src/MdToHtmlTest.hs
@@ -66,14 +66,31 @@ blockquoteTests =

 unorderedListTests =
  TestList
-    [ check_equal "Basic ordered list" "<ul><li>Item 1</li><li>Item 2</li><li>Item 3</li></ul>" (convert "* Item 1\n* Item 2\n* Item 3"),
+    [ check_equal "Basic unordered list" "<ul><li>Item 1</li><li>Item 2</li><li>Item 3</li></ul>" (convert "* Item 1\n* Item 2\n* Item 3"),
      check_equal "Mixing list indicators" "<ul><li>Item 1</li><li>Item 2</li><li>Item 3</li></ul>" (convert "* Item 1\n+ Item 2\n- Item 3"),
      check_equal "Formatted lists" "<ul><li><b>Item 1</b></li><li><i>Item 2</i></li><li><b><i>Item 3</i></b></li></ul>" (convert "* __Item 1__\n+ _Item 2_\n- ***Item 3***"),
      check_equal "Nested list" "<ul><li>Item 1</li><li>Item 2</li><li>Item 3<ul><li>Subitem 1</li><li>Subitem 2</li></ul></li></ul>" (convert "* Item 1\n* Item 2\n* Item 3\n    * Subitem 1\n    * Subitem 2"),
      check_equal "Paragraph in list" "<ul><li>Item 1</li><li><p>Item 2</p><p>More stuff</p></li><li>Item 3</li></ul>" (convert "- Item 1\n- Item 2\n\n    More stuff\n\n- Item 3"),
      check_equal "Paragraph before list" "<p>This is a list</p><ul><li>Item 1</li><li>Item 2</li></ul>" (convert "This is a list\n\n* Item 1\n* Item 2"),
      check_equal "Paragraph before list" "<h3>This is a list</h3><ul><li>Item 1</li><li>Item 2</li></ul>" (convert "### This is a list\n\n* Item 1\n* Item 2"),
-      check_equal "Nested list then back" "<ul><li>Item 1</li><li>Item 2<ul><li>Item 3</li><li>Item 4</li></ul></li><li>Item 5</li></ul>" (convert "- Item 1\n- Item 2\n    - Item 3\n    - Item 4\n- Item 5")
+      check_equal "Nested list then back" "<ul><li>Item 1</li><li>Item 2<ul><li>Item 3</li><li>Item 4</li></ul></li><li>Item 5</li></ul>" (convert "- Item 1\n- Item 2\n    - Item 3\n    - Item 4\n- Item 5"),
+      check_equal "Blockquote in list" "<ul><li>Item 1</li><li><p>Item 2</p><blockquote><p>Quote</p></blockquote></li><li>Item 3</li></ul>" (convert "- Item 1\n- Item 2\n\n    > Quote\n\n- Item 3"),
+      check_equal "Ordered list in unordered list" "<ul><li>Item 1</li><li>Item 2<ol><li>Item 1</li><li>Item 2</li></ol></li><li>Item 3</li></ul>" (convert "- Item 1\n- Item 2\n    1. Item 1\n    2. Item 2\n- Item 3")
+    ]
+
+orderedListTests =
+  TestList
+    [ check_equal "Basic ordered list" "<ol><li>Item 1</li><li>Item 2</li><li>Item 3</li></ol>" (convert "1. Item 1\n2. Item 2\n3. Item 3"),
+      check_equal "Mixing list numbering" "<ol><li>Item 1</li><li>Item 2</li><li>Item 3</li></ol>" (convert "1. Item 1\n3. Item 2\n2. Item 3"),
+      check_equal "Should not convert list without number 1" "<p>2. Item 1 1. Item 2</p>" (convert "2. Item 1\n1. Item 2"),
+      check_equal "Formatted lists" "<ol><li><b>Item 1</b></li><li><i>Item 2</i></li><li><b><i>Item 3</i></b></li></ol>" (convert "1. __Item 1__\n2. _Item 2_\n3. ***Item 3***"),
+      check_equal "Nested list" "<ol><li>Item 1</li><li>Item 2</li><li>Item 3<ol><li>Subitem 1</li><li>Subitem 2</li></ol></li></ol>" (convert "1. Item 1\n2. Item 2\n3. Item 3\n    1. Subitem 1\n    2. Subitem 2"),
+      check_equal "Paragraph in list" "<ol><li>Item 1</li><li><p>Item 2</p><p>More stuff</p></li><li>Item 3</li></ol>" (convert "1. Item 1\n2. Item 2\n\n    More stuff\n\n1. Item 3"),
+      check_equal "Paragraph before list" "<p>This is a list</p><ol><li>Item 1</li><li>Item 2</li></ol>" (convert "This is a list\n\n1. Item 1\n1. Item 2"),
+      check_equal "Paragraph before list" "<h3>This is a list</h3><ol><li>Item 1</li><li>Item 2</li></ol>" (convert "### This is a list\n\n1. Item 1\n200. Item 2"),
+      check_equal "Nested list then back" "<ol><li>Item 1</li><li>Item 2<ol><li>Item 3</li><li>Item 4</li></ol></li><li>Item 5</li></ol>" (convert "1. Item 1\n2. Item 2\n    1. Item 3\n    3. Item 4\n5. Item 5"),
+      check_equal "Blockquote in list" "<ol><li>Item 1</li><li><p>Item 2</p><blockquote><p>Quote</p></blockquote></li><li>Item 3</li></ol>" (convert "1. Item 1\n2. Item 2\n\n    > Quote\n\n3. Item 3"),
+      check_equal "Unordered list in ordered list" "<ol><li>Item 1</li><li>Item 2<ul><li>Item 1</li><li>Item 2</li></ul></li><li>Item 3</li></ol>" (convert "1. Item 1\n2. Item 2\n    - Item 1\n    * Item 2\n4. Item 3")
    ]

 integrationTests =
@@ -87,14 +104,14 @@ integrationTests =
      check_equal
        "Integration 7"
        "<h1>Sample Markdown</h1><p>This is some basic, sample markdown.</p><h2>Second \
-        \Heading</h2><ul><li>Unordered lists, and:<ul><li>One</li><li>Two</li><li>\
-        \Three</li></ul></li><li>More</li></ul><blockquote><p>Blockquote</p>\
+        \Heading</h2><ul><li>Unordered lists, and:<ol><li>One</li><li>Two</li><li>\
+        \Three</li></ol></li><li>More</li></ul><blockquote><p>Blockquote</p>\
        \</blockquote><p>And <b>bold</b>, <i>italics</i>, and even <i>italics \
        \and later <b>bold</b></i>. Even <s>strikethrough</s>. \
        \<a href=\"https://markdowntohtml.com\">A link</a> to somewhere.</p>"
        ( convert
            "# Sample Markdown\n\nThis is some basic, sample markdown.\n\n## Second \
-            \Heading\n\n- Unordered lists, and:\n    - One\n    - Two\n    - Three\n\
+            \Heading\n\n- Unordered lists, and:\n    1. One\n    2. Two\n    3. Three\n\
            \- More\n\n> Blockquote\n\nAnd **bold**, *italics*, and even *italics and \
            \later **bold***. Even ~~strikethrough~~. [A link](https://markdowntohtml.com) to somewhere."
        )
@@ -109,6 +126,7 @@ tests =
      escapedCharTests,
      blockquoteTests,
      unorderedListTests,
+      orderedListTests,
      integrationTests
    ]
Author	SHA1	Message	Date
Aadhavan Srinivasan	e051c87f08	Factor list line common parsing into a separate function; refactored OList and UList line parsing to us it	2025-05-20 16:48:31 -04:00
Aadhavan Srinivasan	9b1c51897c	A nested list can be ordered or unordered	2025-05-20 16:47:58 -04:00
Aadhavan Srinivasan	2a3dddc7b0	Rename function	2025-05-20 16:47:31 -04:00
Aadhavan Srinivasan	a8793b5adb	Remove obsolete comments	2025-05-20 16:47:16 -04:00
Aadhavan Srinivasan	eecec764ad	A list can contain a blockquote as well.	2025-05-20 16:47:04 -04:00
Aadhavan Srinivasan	cdca6ea95e	A list item doesn't need to have a nested list item parser, because the line item parser handles the nested list.	2025-05-20 16:46:52 -04:00
Aadhavan Srinivasan	540b5430e5	Use list of document parsers in parseDocument	2025-05-20 16:45:56 -04:00
Aadhavan Srinivasan	00dfba81eb	Created a list of document parsers; implemented ordered list parsing.	2025-05-20 16:45:37 -04:00
Aadhavan Srinivasan	39152c0034	Factor out common code for UList and OList parsing into a separate function. Refactored UList function; wrote OList function.	2025-05-20 16:45:07 -04:00
Aadhavan Srinivasan	41b35be7c9	Rename function	2025-05-20 16:43:35 -04:00
Aadhavan Srinivasan	d2c8565f62	Import Data.Char for isDigit	2025-05-20 16:43:22 -04:00
Aadhavan Srinivasan	62eeef2abb	Removed unused functions	2025-05-20 16:43:10 -04:00
Aadhavan Srinivasan	9c6634cfec	Added ordered list tests	2025-05-20 16:42:40 -04:00
Aadhavan Srinivasan	2a5a68b1de	Fixed test name	2025-05-20 16:42:12 -04:00
Aadhavan Srinivasan	f8e1a98bdf	Remove obsolete comment	2025-05-20 14:03:30 -04:00
Aadhavan Srinivasan	05433c31f1	Remove unused functions	2025-05-20 14:02:50 -04:00