Factor list line common parsing into a separate function; refactored

OList and UList line parsing to us it
A nested list can be ordered or unordered
2025-05-20 16:48:31 -04:00 · 2025-05-20 16:47:58 -04:00 · 2025-05-20 16:47:31 -04:00 · 2025-05-20 16:47:16 -04:00 · 2025-05-20 16:47:04 -04:00 · 2025-05-20 16:46:52 -04:00
2 changed files with 93 additions and 62 deletions
--- a/src/MdToHTML.hs
+++ b/src/MdToHTML.hs
@@ -6,6 +6,7 @@ module MdToHTML where
 import Control.Applicative
 import Control.Monad
 import Data.Char
 import Data.List
 import Data.Ord (comparing)
 import Debug.Trace
@@ -87,17 +88,6 @@ leftmostLongestParse parser input =
        Nothing -> (mempty, mempty)
        Just x -> x
 -- Parse if the string that's left matches the string comparator function
 lookaheadParse :: (String -> Bool) -> ReadP Char
 lookaheadParse stringCmp = do
  lookahead <- look
  case stringCmp lookahead of
    True -> get
    False -> pfail
 lineToList :: MdToken -> [MdToken]
 lineToList (Line tokens) = tokens
 specialChars = "\\#*_[\n"
 -- Makes a parser greedy. Instead of returning all possible parses, only the longest one is returned.
@@ -118,25 +108,6 @@ prepend x1 x2 = x1 ++ x2
 append :: [a] -> [a] -> [a]
 append x1 x2 = x2 ++ x1
 -- Sequence two parsers, running one after the other and returning the result.
 sequenceParse :: ReadP a -> ReadP a -> ReadP [a]
 sequenceParse p1 p2 = twoElemList <$> p1 <*> p2
  where
    twoElemList elem1 elem2 = [elem1, elem2]
 -- Parses p1 until p2 succeeds, but doesn't actually consume anything from p2.
 -- Similar to manyTill, except manyTill's second parser actually consumes characters.
 manyTillLazy :: ReadP a -> ReadP b -> ReadP [a]
 manyTillLazy p1 p2 = do
  res <- p1
  remaining <- look
  let p2res = readP_to_S p2 remaining
  case p2res of
    [] -> do
      res2 <- manyTillLazy p1 p2
      return (res : res2)
    _ -> return [res]
 -- Parse until EOL or EOF
 parseTillEol :: ReadP String
 parseTillEol = manyTill get (void (char '\n') <++ eof)
@@ -176,9 +147,9 @@ parseBold = parseBoldWith "**" <|> parseBoldWith "__"
 -- Parse italic text
 parseItalic :: ReadP MdToken
-parseItalic = parseBoldWith "*" <|> parseBoldWith "_"
+parseItalic = parseItalicWith "*" <|> parseItalicWith "_"
  where
-    parseBoldWith delim = do
+    parseItalicWith delim = do
      string delim
      inside <- greedyParse1 parseLineToken
      string delim
@@ -316,17 +287,13 @@ parseBlockquote = do
  return (Blockquote parsedQuotedLines)
 -- Parse a nested list item.
-parseUListNested :: ReadP MdToken
+parseListNested :: ReadP MdToken
-parseUListNested = do
+parseListNested = do
  --  firstChar <- string "    " <++ string "\t"
  --  skipSpaces
  --  restOfLine <- manyTill get (void (char '\n') <++ eof)
  --  let restOfLineParsed = fst $ leftmostLongestParse parseLine restOfLine
  --  return restOfLineParsed
  let firstCharParser = string "    " <++ string "\t"
  let restOfLineParser = manyTill get (void (char '\n') <++ eof)
  lines <- greedyParse1 (firstCharParser *> restOfLineParser)
-  let linesParsed = fst $ leftmostLongestParse parseUnorderedList (init $ unlines lines)
+  let linesParsed = fst $ leftmostLongestParse (parseUnorderedList <++ parseOrderedList) (init $ unlines lines)
  when (null (show linesParsed)) pfail
  return linesParsed
 -- Parse an unordered list line item.
@@ -334,36 +301,54 @@ parseUListLineItem :: ReadP MdToken
 parseUListLineItem = do
  firstChar <- choice (map char ['*', '+', '-'])
  char ' ' -- At least one space between list indicator and list text.
  parseListLineItemCommon
 -- Parse an ordered list line item.
 parseOListLineItem :: ReadP MdToken
 parseOListLineItem = do
  num <- greedyParse1 (satisfy isDigit)
  char '.'
  char ' ' -- At least one space between list indicator and list text.
  parseListLineItemCommon
 -- Common code for parsing list line items
 parseListLineItemCommon :: ReadP MdToken
 parseListLineItemCommon = do
  skipSpaces
  restOfLine <- many1 parseListLineToken
  void (char '\n') <++ eof
-  nestedList <- parseUListNested <++ return (Unit "")
+  nestedList <- parseListNested <++ return (Unit "")
  return $ Line [Line restOfLine, nestedList]
 -- restOfLine <- manyTill get (void (char '\n') <++ eof)
 -- let restOfLineParsed = fst $ leftmostLongestParse parseLine restOfLine
 -- nestedList <- parseUListNested <++ return (Unit "")
 -- return $ Line [restOfLineParsed, nestedList]
 -- Parse an unordered list paragraph item.
 -- This is defined as a line item, followed by an empty line, followed by one or more
 -- lines indented by a space or tab.
 parseUListParaItem :: ReadP MdToken
 parseUListParaItem = do
  firstLine <- parseUListLineItem
-  char '\n'
+  res <- parseListParaItemCommon
  lines <- greedyParse1 ((string "    " <|> string "\t") *> parseTillEol)
  let res = fst $ leftmostLongestParse (greedyParse1 parsePara) (init $ unlines lines)
  char '\n'
  return $ Document (Para firstLine : res) -- I only wrap this in a document because I want some way of converting [MdToken] to MdToken, without any overhead. There is no other reason to wrap it in a Document.
-- This is hacky as hell
+-- Parse an unordered list paragraph item.
-- parsedParas <- manyTillLazy parsePara (string "\n\n" *> choice (map char "*-+"))
+parseOListParaItem :: ReadP MdToken
-- return $ Document parsedParas -- I wrap this in a document because I want some way of converting [MdToken] to MdToken, without any overhead. There is no other reason to wrap it in a Document.
+parseOListParaItem = do
  firstLine <- parseOListLineItem
  res <- parseListParaItemCommon
  return $ Document (Para firstLine : res) -- I only wrap this in a document because I want some way of converting [MdToken] to MdToken, without any overhead. There is no other reason to wrap it in a Document.
 -- Common code for parsing list paragraph items.
 -- A list paragraph item is defined as a line item, followed by an empty line, followed by one or more
 -- lines indented by a space or tab.
 -- A list paragraph item can also be a blockquote.
 parseListParaItemCommon :: ReadP [MdToken]
 parseListParaItemCommon = do
  char '\n'
  lines <- greedyParse1 ((string "    " <|> string "\t") *> parseTillEol)
  let res = fst $ leftmostLongestParse (greedyParse1 parseBlockquote <++ greedyParse1 parsePara) (init $ unlines lines)
  char '\n'
  return res -- I only wrap this in a document because I want some way of converting [MdToken] to MdToken, without any overhead. There is no other reason to wrap it in a Document.
 -- Parse an unordered list item, which can be a line item or another list.
 parseUListItem :: ReadP MdToken
-parseUListItem = parseUListParaItem <++ parseUListLineItem <++ parseUListNested
+parseUListItem = parseUListParaItem <++ parseUListLineItem
 -- Parse an unordered list.
 parseUnorderedList :: ReadP MdToken
@@ -372,8 +357,36 @@ parseUnorderedList = do
  void (char '\n') <++ eof -- A list must end in an extra newline or eof
  return $ UnordList lineItems
 -- --------
 parseOListItem :: ReadP MdToken
 parseOListItem = parseOListParaItem <++ parseOListLineItem
 -- Parses the first element of an ordered list, which must start with '1.'
 parseFirstOListItem :: ReadP MdToken
 parseFirstOListItem = do
  remaining <- look
  when (take 2 remaining /= "1.") pfail
  parseOListLineItem
 parseOrderedList :: ReadP MdToken
 parseOrderedList = do
  firstLine <- parseFirstOListItem
  lineItems <- greedyParse1 parseOListItem
  void (char '\n') <++ eof
  return $ OrdList (firstLine : lineItems)
 documentParsers :: [ReadP MdToken]
 documentParsers =
  [ parseHeader,
    parseBlockquote,
    parseUnorderedList,
    parseOrderedList,
    parsePara
  ]
 -- Parse a document, which is multiple paragraphs.
 parseDocument :: ReadP MdToken
 parseDocument = do
-  res <- manyTill (parseHeader <++ parseBlockquote <++ parseUnorderedList <++ parsePara) eof
+  res <- manyTill (fallthroughParser documentParsers) eof
  return (Document res)
--- a/src/MdToHtmlTest.hs
+++ b/src/MdToHtmlTest.hs
@@ -66,14 +66,31 @@ blockquoteTests =
 unorderedListTests =
  TestList
-    [ check_equal "Basic ordered list" "<ul><li>Item 1</li><li>Item 2</li><li>Item 3</li></ul>" (convert "* Item 1\n* Item 2\n* Item 3"),
+    [ check_equal "Basic unordered list" "<ul><li>Item 1</li><li>Item 2</li><li>Item 3</li></ul>" (convert "* Item 1\n* Item 2\n* Item 3"),
      check_equal "Mixing list indicators" "<ul><li>Item 1</li><li>Item 2</li><li>Item 3</li></ul>" (convert "* Item 1\n+ Item 2\n- Item 3"),
      check_equal "Formatted lists" "<ul><li><b>Item 1</b></li><li><i>Item 2</i></li><li><b><i>Item 3</i></b></li></ul>" (convert "* __Item 1__\n+ _Item 2_\n- ***Item 3***"),
      check_equal "Nested list" "<ul><li>Item 1</li><li>Item 2</li><li>Item 3<ul><li>Subitem 1</li><li>Subitem 2</li></ul></li></ul>" (convert "* Item 1\n* Item 2\n* Item 3\n    * Subitem 1\n    * Subitem 2"),
      check_equal "Paragraph in list" "<ul><li>Item 1</li><li><p>Item 2</p><p>More stuff</p></li><li>Item 3</li></ul>" (convert "- Item 1\n- Item 2\n\n    More stuff\n\n- Item 3"),
      check_equal "Paragraph before list" "<p>This is a list</p><ul><li>Item 1</li><li>Item 2</li></ul>" (convert "This is a list\n\n* Item 1\n* Item 2"),
      check_equal "Paragraph before list" "<h3>This is a list</h3><ul><li>Item 1</li><li>Item 2</li></ul>" (convert "### This is a list\n\n* Item 1\n* Item 2"),
-      check_equal "Nested list then back" "<ul><li>Item 1</li><li>Item 2<ul><li>Item 3</li><li>Item 4</li></ul></li><li>Item 5</li></ul>" (convert "- Item 1\n- Item 2\n    - Item 3\n    - Item 4\n- Item 5")
+      check_equal "Nested list then back" "<ul><li>Item 1</li><li>Item 2<ul><li>Item 3</li><li>Item 4</li></ul></li><li>Item 5</li></ul>" (convert "- Item 1\n- Item 2\n    - Item 3\n    - Item 4\n- Item 5"),
      check_equal "Blockquote in list" "<ul><li>Item 1</li><li><p>Item 2</p><blockquote><p>Quote</p></blockquote></li><li>Item 3</li></ul>" (convert "- Item 1\n- Item 2\n\n    > Quote\n\n- Item 3"),
      check_equal "Ordered list in unordered list" "<ul><li>Item 1</li><li>Item 2<ol><li>Item 1</li><li>Item 2</li></ol></li><li>Item 3</li></ul>" (convert "- Item 1\n- Item 2\n    1. Item 1\n    2. Item 2\n- Item 3")
    ]
 orderedListTests =
  TestList
    [ check_equal "Basic ordered list" "<ol><li>Item 1</li><li>Item 2</li><li>Item 3</li></ol>" (convert "1. Item 1\n2. Item 2\n3. Item 3"),
      check_equal "Mixing list numbering" "<ol><li>Item 1</li><li>Item 2</li><li>Item 3</li></ol>" (convert "1. Item 1\n3. Item 2\n2. Item 3"),
      check_equal "Should not convert list without number 1" "<p>2. Item 1 1. Item 2</p>" (convert "2. Item 1\n1. Item 2"),
      check_equal "Formatted lists" "<ol><li><b>Item 1</b></li><li><i>Item 2</i></li><li><b><i>Item 3</i></b></li></ol>" (convert "1. __Item 1__\n2. _Item 2_\n3. ***Item 3***"),
      check_equal "Nested list" "<ol><li>Item 1</li><li>Item 2</li><li>Item 3<ol><li>Subitem 1</li><li>Subitem 2</li></ol></li></ol>" (convert "1. Item 1\n2. Item 2\n3. Item 3\n    1. Subitem 1\n    2. Subitem 2"),
      check_equal "Paragraph in list" "<ol><li>Item 1</li><li><p>Item 2</p><p>More stuff</p></li><li>Item 3</li></ol>" (convert "1. Item 1\n2. Item 2\n\n    More stuff\n\n1. Item 3"),
      check_equal "Paragraph before list" "<p>This is a list</p><ol><li>Item 1</li><li>Item 2</li></ol>" (convert "This is a list\n\n1. Item 1\n1. Item 2"),
      check_equal "Paragraph before list" "<h3>This is a list</h3><ol><li>Item 1</li><li>Item 2</li></ol>" (convert "### This is a list\n\n1. Item 1\n200. Item 2"),
      check_equal "Nested list then back" "<ol><li>Item 1</li><li>Item 2<ol><li>Item 3</li><li>Item 4</li></ol></li><li>Item 5</li></ol>" (convert "1. Item 1\n2. Item 2\n    1. Item 3\n    3. Item 4\n5. Item 5"),
      check_equal "Blockquote in list" "<ol><li>Item 1</li><li><p>Item 2</p><blockquote><p>Quote</p></blockquote></li><li>Item 3</li></ol>" (convert "1. Item 1\n2. Item 2\n\n    > Quote\n\n3. Item 3"),
      check_equal "Unordered list in ordered list" "<ol><li>Item 1</li><li>Item 2<ul><li>Item 1</li><li>Item 2</li></ul></li><li>Item 3</li></ol>" (convert "1. Item 1\n2. Item 2\n    - Item 1\n    * Item 2\n4. Item 3")
    ]
 integrationTests =
@@ -87,14 +104,14 @@ integrationTests =
      check_equal
        "Integration 7"
        "<h1>Sample Markdown</h1><p>This is some basic, sample markdown.</p><h2>Second \
-        \Heading</h2><ul><li>Unordered lists, and:<ul><li>One</li><li>Two</li><li>\
+        \Heading</h2><ul><li>Unordered lists, and:<ol><li>One</li><li>Two</li><li>\
-        \Three</li></ul></li><li>More</li></ul><blockquote><p>Blockquote</p>\
+        \Three</li></ol></li><li>More</li></ul><blockquote><p>Blockquote</p>\
        \</blockquote><p>And <b>bold</b>, <i>italics</i>, and even <i>italics \
        \and later <b>bold</b></i>. Even <s>strikethrough</s>. \
        \<a href=\"https://markdowntohtml.com\">A link</a> to somewhere.</p>"
        ( convert
            "# Sample Markdown\n\nThis is some basic, sample markdown.\n\n## Second \
-            \Heading\n\n- Unordered lists, and:\n    - One\n    - Two\n    - Three\n\
+            \Heading\n\n- Unordered lists, and:\n    1. One\n    2. Two\n    3. Three\n\
            \- More\n\n> Blockquote\n\nAnd **bold**, *italics*, and even *italics and \
            \later **bold***. Even ~~strikethrough~~. [A link](https://markdowntohtml.com) to somewhere."
        )
@@ -109,6 +126,7 @@ tests =
      escapedCharTests,
      blockquoteTests,
      unorderedListTests,
      orderedListTests,
      integrationTests
    ]
Author	SHA1	Message	Date
Aadhavan Srinivasan	e051c87f08	Factor list line common parsing into a separate function; refactored OList and UList line parsing to us it	2025-05-20 16:48:31 -04:00
Aadhavan Srinivasan	9b1c51897c	A nested list can be ordered or unordered	2025-05-20 16:47:58 -04:00
Aadhavan Srinivasan	2a3dddc7b0	Rename function	2025-05-20 16:47:31 -04:00
Aadhavan Srinivasan	a8793b5adb	Remove obsolete comments	2025-05-20 16:47:16 -04:00
Aadhavan Srinivasan	eecec764ad	A list can contain a blockquote as well.	2025-05-20 16:47:04 -04:00
Aadhavan Srinivasan	cdca6ea95e	A list item doesn't need to have a nested list item parser, because the line item parser handles the nested list.	2025-05-20 16:46:52 -04:00
Aadhavan Srinivasan	540b5430e5	Use list of document parsers in parseDocument	2025-05-20 16:45:56 -04:00
Aadhavan Srinivasan	00dfba81eb	Created a list of document parsers; implemented ordered list parsing.	2025-05-20 16:45:37 -04:00
Aadhavan Srinivasan	39152c0034	Factor out common code for UList and OList parsing into a separate function. Refactored UList function; wrote OList function.	2025-05-20 16:45:07 -04:00
Aadhavan Srinivasan	41b35be7c9	Rename function	2025-05-20 16:43:35 -04:00
Aadhavan Srinivasan	d2c8565f62	Import Data.Char for isDigit	2025-05-20 16:43:22 -04:00
Aadhavan Srinivasan	62eeef2abb	Removed unused functions	2025-05-20 16:43:10 -04:00
Aadhavan Srinivasan	9c6634cfec	Added ordered list tests	2025-05-20 16:42:40 -04:00
Aadhavan Srinivasan	2a5a68b1de	Fixed test name	2025-05-20 16:42:12 -04:00
Aadhavan Srinivasan	f8e1a98bdf	Remove obsolete comment	2025-05-20 14:03:30 -04:00
Aadhavan Srinivasan	05433c31f1	Remove unused functions	2025-05-20 14:02:50 -04:00