Removed test file

Added unordered list tests and integration tests; added strikethrough
tests to test list
2025-05-20 12:25:07 -05:00 · 2025-05-20 12:24:20 -05:00 · 2025-05-20 12:23:47 -05:00 · 2025-05-20 12:23:27 -05:00 · 2025-05-20 12:23:13 -05:00 · 2025-05-20 12:22:37 -05:00
3 changed files with 201 additions and 37 deletions
--- a/src/MdToHTML.hs
+++ b/src/MdToHTML.hs
@@ -7,15 +7,16 @@ module MdToHTML where
 import Control.Applicative
 import Control.Monad
 import Data.List
+import Data.Ord (comparing)
 import Debug.Trace
 import Text.ParserCombinators.ReadP
 import Text.Printf

 type HeaderLevel = Int

-newtype URL = URL {getUrl :: String}
+newtype URL = URL {getUrl :: String} deriving (Eq)

-newtype ImgPath = ImgPath {getPath :: String}
+newtype ImgPath = ImgPath {getPath :: String} deriving (Eq)

 parseMany :: ReadP a -> ReadP [a]
 parseMany = Text.ParserCombinators.ReadP.many
@@ -28,7 +29,7 @@ data MdToken
  | SingleNewline -- A single newline is rendered as a space.
  | Linebreak
  | HorizontalRule
-  | Blockquote MdToken
+  | Blockquote [MdToken]
  | UnordList [MdToken]
  | OrdList [MdToken]
  | Code String
@@ -39,6 +40,7 @@ data MdToken
  | Italic MdToken
  | Strikethrough MdToken
  | Unit String
+  deriving (Eq)

 -- Deriving Show for MdToken
 instance Show MdToken where
@@ -96,7 +98,54 @@ lookaheadParse stringCmp = do
 lineToList :: MdToken -> [MdToken]
 lineToList (Line tokens) = tokens

-specialChars = "\\#*_[\n "
+specialChars = "\\#*_[\n"
+
+-- Makes a parser greedy. Instead of returning all possible parses, only the longest one is returned.
+greedyParse :: ReadP a -> ReadP [a]
+greedyParse parser = do
+  greedyParse1 parser <++ return []
+
+-- Like greedyParse, but the parser must succeed atleast once.
+greedyParse1 :: ReadP a -> ReadP [a]
+greedyParse1 parser = do
+  parsed1 <- parser
+  parsed2 <- greedyParse1 parser <++ return []
+  return (parsed1 : parsed2)
+
+prepend :: [a] -> [a] -> [a]
+prepend x1 x2 = x1 ++ x2
+
+append :: [a] -> [a] -> [a]
+append x1 x2 = x2 ++ x1
+
+-- Sequence two parsers, running one after the other and returning the result.
+sequenceParse :: ReadP a -> ReadP a -> ReadP [a]
+sequenceParse p1 p2 = twoElemList <$> p1 <*> p2
+  where
+    twoElemList elem1 elem2 = [elem1, elem2]
+
+-- Parses p1 until p2 succeeds, but doesn't actually consume anything from p2.
+-- Similar to manyTill, except manyTill's second parser actually consumes characters.
+manyTillLazy :: ReadP a -> ReadP b -> ReadP [a]
+manyTillLazy p1 p2 = do
+  res <- p1
+  remaining <- look
+  let p2res = readP_to_S p2 remaining
+  case p2res of
+    [] -> do
+      res2 <- manyTillLazy p1 p2
+      return (res : res2)
+    _ -> return [res]
+
+-- Parse until EOL or EOF
+parseTillEol :: ReadP String
+parseTillEol = manyTill get (void (char '\n') <++ eof)
+
+-- Takes a list of parsers. Returns a parser that will try them in
+-- order, moving to the next one only if the current one fails.
+fallthroughParser :: [ReadP a] -> ReadP a
+fallthroughParser [x] = x
+fallthroughParser (x : xs) = x <++ fallthroughParser xs

 -- ---------------

@@ -110,31 +159,38 @@ parseHeader = do
    pfail
  skipSpaces
  text <- munch1 (/= '\n')
-  Text.ParserCombinators.ReadP.optional (char '\n')
+  --  Text.ParserCombinators.ReadP.optional (char '\n')
+  skipSpaces
  let parsedText = fst $ leftmostLongestParse parseLine text
  return (Header (length headers) parsedText)

 -- Parse bold text
 parseBold :: ReadP MdToken
-parseBold = do
-  text <-
-    choice
-      [ between (string "__") (string "__") (many1 (lookaheadParse (/= "__"))),
-        between (string "**") (string "**") (many1 (lookaheadParse (/= "**")))
-      ]
-  let parsedText = fst $ leftmostLongestParse parseLine text
-  return (Bold parsedText)
+parseBold = parseBoldWith "**" <|> parseBoldWith "__"
+  where
+    parseBoldWith delim = do
+      string delim
+      inside <- greedyParse1 parseLineToken
+      string delim
+      return (Bold (Line inside))

 -- Parse italic text
 parseItalic :: ReadP MdToken
-parseItalic = do
-  text <-
-    choice
-      [ between (string "_") (string "_") (munch1 (/= '_')),
-        between (string "*") (string "*") (munch1 (/= '*'))
-      ]
-  let parsedText = fst $ leftmostLongestParse parseLine text
-  return (Italic parsedText)
+parseItalic = parseBoldWith "*" <|> parseBoldWith "_"
+  where
+    parseBoldWith delim = do
+      string delim
+      inside <- greedyParse1 parseLineToken
+      string delim
+      return (Italic (Line inside))
+
+-- Parse strikethrough text
+parseStrikethrough :: ReadP MdToken
+parseStrikethrough = do
+  string "~~"
+  inside <- many1 parseLineToken
+  string "~~"
+  return (Strikethrough (Line inside))

 -- Parse a link
 parseLink :: ReadP MdToken
@@ -164,6 +220,12 @@ parseEscapedChar = do
  escapedChar <- choice (map char specialChars) -- Parse any of the special chars.
  return (Unit [escapedChar])

+-- Parse a character as a Unit.
+parseUnit :: ReadP MdToken
+parseUnit = do
+  text <- satisfy (`notElem` specialChars)
+  return (Unit [text])
+
 -- Parse a regular string as a Unit.
 parseString :: ReadP MdToken
 parseString = do
@@ -178,17 +240,33 @@ lineParsers =
    parseEscapedChar,
    parseBold,
    parseItalic,
+    parseStrikethrough,
    parseLink,
-    parseString
+    parseUnit
  ] -- A 'line' doesn't include a 'header'

+listLineParsers :: [ReadP MdToken]
+listLineParsers =
+  [ parseLinebreak,
+    parseEscapedChar,
+    parseBold,
+    parseItalic,
+    parseStrikethrough,
+    parseLink,
+    parseUnit
+  ] -- A list line cannot contain newlines.
+
 -- List of all parsers
 allParsers :: [ReadP MdToken]
 allParsers = parseHeader : lineParsers

-- Parse any of the above tokens.
+-- Parse any of the line tokens.
 parseLineToken :: ReadP MdToken
-parseLineToken = choice lineParsers
+parseLineToken = fallthroughParser lineParsers
+
+-- Parse any of the list line tokens.
+parseListLineToken :: ReadP MdToken
+parseListLineToken = fallthroughParser listLineParsers

 -- Parse a line, consisting of one or more tokens.
 parseLine :: ReadP MdToken
@@ -230,11 +308,72 @@ parseQuotedLines =
 -- Parse a blockquote, which is a greater-than sign followed by a paragraph.
 parseBlockquote :: ReadP MdToken
 parseBlockquote = do
-  char '>'
-  Blockquote <$> (parseBlockquote <++ parsePara) -- Parse another blockquote or a regular paragraph, wrap it in a blockquote.
+  quotedLines <- parseQuotedLines
+  -- remaining <- look
+  -- let quotedLines = fst $ leftmostLongestParse parseQuotedLines remaining
+  -- string (init $ unlines quotedLines)
+  let parsedQuotedLines = fst $ leftmostLongestParse (many1 (parseBlockquote <++ parsePara)) (init $ unlines quotedLines) -- unlines joins the lines together with a newline, and adds a trailing newline. init removes the trailing newline.
+  return (Blockquote parsedQuotedLines)
+
+-- Parse a nested list item.
+parseUListNested :: ReadP MdToken
+parseUListNested = do
+  --  firstChar <- string "    " <++ string "\t"
+  --  skipSpaces
+  --  restOfLine <- manyTill get (void (char '\n') <++ eof)
+  --  let restOfLineParsed = fst $ leftmostLongestParse parseLine restOfLine
+  --  return restOfLineParsed
+  let firstCharParser = string "    " <++ string "\t"
+  let restOfLineParser = manyTill get (void (char '\n') <++ eof)
+  lines <- greedyParse1 (firstCharParser *> restOfLineParser)
+  let linesParsed = fst $ leftmostLongestParse parseUnorderedList (init $ unlines lines)
+  return linesParsed
+
+-- Parse an unordered list line item.
+parseUListLineItem :: ReadP MdToken
+parseUListLineItem = do
+  firstChar <- choice (map char ['*', '+', '-'])
+  char ' ' -- At least one space between list indicator and list text.
+  skipSpaces
+  restOfLine <- many1 parseListLineToken
+  void (char '\n') <++ eof
+  nestedList <- parseUListNested <++ return (Unit "")
+  return $ Line [Line restOfLine, nestedList]
+
+-- restOfLine <- manyTill get (void (char '\n') <++ eof)
+-- let restOfLineParsed = fst $ leftmostLongestParse parseLine restOfLine
+-- nestedList <- parseUListNested <++ return (Unit "")
+-- return $ Line [restOfLineParsed, nestedList]
+
+-- Parse an unordered list paragraph item.
+-- This is defined as a line item, followed by an empty line, followed by one or more
+-- lines indented by a space or tab.
+parseUListParaItem :: ReadP MdToken
+parseUListParaItem = do
+  firstLine <- parseUListLineItem
+  char '\n'
+  lines <- greedyParse1 ((string "    " <|> string "\t") *> parseTillEol)
+  let res = fst $ leftmostLongestParse (greedyParse1 parsePara) (init $ unlines lines)
+  char '\n'
+  return $ Document (Para firstLine : res) -- I only wrap this in a document because I want some way of converting [MdToken] to MdToken, without any overhead. There is no other reason to wrap it in a Document.
+
+-- This is hacky as hell
+-- parsedParas <- manyTillLazy parsePara (string "\n\n" *> choice (map char "*-+"))
+-- return $ Document parsedParas -- I wrap this in a document because I want some way of converting [MdToken] to MdToken, without any overhead. There is no other reason to wrap it in a Document.
+
+-- Parse an unordered list item, which can be a line item or another list.
+parseUListItem :: ReadP MdToken
+parseUListItem = parseUListParaItem <++ parseUListLineItem <++ parseUListNested
+
+-- Parse an unordered list.
+parseUnorderedList :: ReadP MdToken
+parseUnorderedList = do
+  lineItems <- greedyParse1 parseUListItem
+  void (char '\n') <++ eof -- A list must end in an extra newline or eof
+  return $ UnordList lineItems

 -- Parse a document, which is multiple paragraphs.
 parseDocument :: ReadP MdToken
 parseDocument = do
-  res <- manyTill (parseHeader <++ parseBlockquote <++ parsePara) eof
+  res <- manyTill (parseHeader <++ parseBlockquote <++ parseUnorderedList <++ parsePara) eof
  return (Document res)
--- a/src/MdToHtmlTest.hs
+++ b/src/MdToHtmlTest.hs
@@ -23,7 +23,15 @@ boldTests =
  TestList
    [ check_equal "Should convert bold" "<p><b>Hello</b></p>" (convert "__Hello__"),
      check_equal "Should convert italic" "<p><i>Hello</i></p>" (convert "_Hello_"),
-      check_equal "Should convert bold and italic in a sentence" "<p>It <i>is</i> a <b>wonderful</b> day</p>" (convert "It _is_ a __wonderful__ day")
+      check_equal "Should convert bold and italic in a sentence" "<p>It <i>is</i> a <b>wonderful</b> day</p>" (convert "It _is_ a __wonderful__ day"),
+      check_equal "Should convert nested bold and italic" "<p><b>Bold then <i>Italic</i></b></p>" (convert "**Bold then *Italic***"),
+      check_equal "Should convert nested bold and italic" "<p><i>Italic then <b>Bold</b></i></p>" (convert "*Italic then **Bold***")
+    ]
+
+strikethroughTests =
+  TestList
+    [ check_equal "Should convert strikethrough" "<p><s>Hello</s></p>" (convert "~~Hello~~"),
+      check_equal "Should convert long sentence with tilde" "<p><s>The universe is ~7 days old</s>. The universe is 13 billion years old.</p>" (convert "~~The universe is ~7 days old~~. The universe is 13 billion years old.")
    ]

 linkTests =
@@ -56,9 +64,16 @@ blockquoteTests =
        )
    ]

-orderedListTests =
+unorderedListTests =
  TestList
-    [ check_equal "Basic ordered list" "<ol><li>Item 1</li><li>Item 2</li><li>Item 3</li></ol" (convert "1. Item 1\n2. Item2\n3. Item3")
+    [ check_equal "Basic ordered list" "<ul><li>Item 1</li><li>Item 2</li><li>Item 3</li></ul>" (convert "* Item 1\n* Item 2\n* Item 3"),
+      check_equal "Mixing list indicators" "<ul><li>Item 1</li><li>Item 2</li><li>Item 3</li></ul>" (convert "* Item 1\n+ Item 2\n- Item 3"),
+      check_equal "Formatted lists" "<ul><li><b>Item 1</b></li><li><i>Item 2</i></li><li><b><i>Item 3</i></b></li></ul>" (convert "* __Item 1__\n+ _Item 2_\n- ***Item 3***"),
+      check_equal "Nested list" "<ul><li>Item 1</li><li>Item 2</li><li>Item 3<ul><li>Subitem 1</li><li>Subitem 2</li></ul></li></ul>" (convert "* Item 1\n* Item 2\n* Item 3\n    * Subitem 1\n    * Subitem 2"),
+      check_equal "Paragraph in list" "<ul><li>Item 1</li><li><p>Item 2</p><p>More stuff</p></li><li>Item 3</li></ul>" (convert "- Item 1\n- Item 2\n\n    More stuff\n\n- Item 3"),
+      check_equal "Paragraph before list" "<p>This is a list</p><ul><li>Item 1</li><li>Item 2</li></ul>" (convert "This is a list\n\n* Item 1\n* Item 2"),
+      check_equal "Paragraph before list" "<h3>This is a list</h3><ul><li>Item 1</li><li>Item 2</li></ul>" (convert "### This is a list\n\n* Item 1\n* Item 2"),
+      check_equal "Nested list then back" "<ul><li>Item 1</li><li>Item 2<ul><li>Item 3</li><li>Item 4</li></ul></li><li>Item 5</li></ul>" (convert "- Item 1\n- Item 2\n    - Item 3\n    - Item 4\n- Item 5")
    ]

 integrationTests =
@@ -68,17 +83,32 @@ integrationTests =
      check_equal "Integration 3" "<h1>Hello</h1><p>World</p>" (convert "# Hello\nWorld"),
      check_equal "Integration 4" "<p>a b</p>" (convert "a\nb"),
      check_equal "Integration 5" "<h1>Hello</h1>" (convert "# Hello\n"),
-      check_equal "Integration 6" "<p>First line<br>Second line</p>" (convert "First line  \nSecond line")
+      check_equal "Integration 6" "<p>First line<br>Second line</p>" (convert "First line  \nSecond line"),
+      check_equal
+        "Integration 7"
+        "<h1>Sample Markdown</h1><p>This is some basic, sample markdown.</p><h2>Second \
+        \Heading</h2><ul><li>Unordered lists, and:<ul><li>One</li><li>Two</li><li>\
+        \Three</li></ul></li><li>More</li></ul><blockquote><p>Blockquote</p>\
+        \</blockquote><p>And <b>bold</b>, <i>italics</i>, and even <i>italics \
+        \and later <b>bold</b></i>. Even <s>strikethrough</s>. \
+        \<a href=\"https://markdowntohtml.com\">A link</a> to somewhere.</p>"
+        ( convert
+            "# Sample Markdown\n\nThis is some basic, sample markdown.\n\n## Second \
+            \Heading\n\n- Unordered lists, and:\n    - One\n    - Two\n    - Three\n\
+            \- More\n\n> Blockquote\n\nAnd **bold**, *italics*, and even *italics and \
+            \later **bold***. Even ~~strikethrough~~. [A link](https://markdowntohtml.com) to somewhere."
+        )
    ]

 tests =
  TestList
    [ headerTests,
      boldTests,
+      strikethroughTests,
      linkTests,
      escapedCharTests,
      blockquoteTests,
-      orderedListTests,
+      unorderedListTests,
      integrationTests
    ]

--- a/test.hs
+++ b/test.hs
@@ -1,5 +0,0 @@
-leftmostLongest :: (Foldable t) => [t a] -> t a
-leftmostLongest xs = 
-        let lastElem = (last xs)
-            filteredLst = (filter (\val -> (length val) == (length lastElem)) xs)
-        in head filteredLst
Author	SHA1	Message	Date
Rockingcool	5c871f2b25	Removed test file	2025-05-20 12:25:07 -05:00
Rockingcool	5273c99e6e	Added unordered list tests and integration tests; added strikethrough tests to test list	2025-05-20 12:24:20 -05:00
Rockingcool	50888c9c3d	Added bold and strikethrough tests	2025-05-20 12:23:47 -05:00
Rockingcool	45115c765c	An unordered list must end in a blank line.	2025-05-20 12:23:27 -05:00
Rockingcool	5b0d42fd2d	Use the in-order parsing approach instead of the post-order one.	2025-05-20 12:23:13 -05:00
Rockingcool	2a585d00f2	Enforce at least one space between list indicator and list text.	2025-05-20 12:22:37 -05:00
Rockingcool	11a3b14cb1	Define a parser for list line tokens, update line token parser	2025-05-20 12:22:19 -05:00
Rockingcool	58d3142855	Update comment	2025-05-20 12:21:45 -05:00
Rockingcool	0fb651fffc	Add parseUnit instead of parseString to lineParsers	2025-05-20 12:20:51 -05:00
Rockingcool	bc05dede06	Create a list of parsers that are used for list line items.	2025-05-20 12:20:35 -05:00
Rockingcool	b69e34f823	Parse a single character as a unit	2025-05-20 12:20:10 -05:00
Rockingcool	2514ecdafc	Parse bold, italic and strikethrough in-order, instead of trying to find the end, then parsing everything in the middle. The current approach parses the opening bold (or italic), some text, then the closing bold (or italic), instead of parsing the opening, closing, then everything in between.	2025-05-20 12:19:59 -05:00
Rockingcool	c52d5556a2	Allow for multiple blank lines after header	2025-05-20 12:18:52 -05:00
Rockingcool	5fc1b1122a	Create a function to 'fallthrough parse' ie. try the second parser only if the first one fails.	2025-05-20 12:18:23 -05:00
Rockingcool	83dd0024c4	Space doesn't have to be a reserved character anymore.	2025-05-20 12:17:59 -05:00
Rockingcool	70761649ad	Derive Eq for defined types	2025-05-20 12:17:43 -05:00
Rockingcool	b9c6cc4470	Implemented strikethrough parser	2025-05-16 19:29:39 -05:00
Rockingcool	23691f9cfe	Add strikethrough parser to line parser list	2025-05-16 19:28:41 -05:00
Rockingcool	8c220cc800	A document can consist of unordered lists as well	2025-05-14 21:40:03 -05:00
Rockingcool	ee453c0259	Fixed blockquote parser; implemented unordered list parser and relevant sub-parsers	2025-05-14 21:39:27 -05:00
Rockingcool	c90d23617a	A blockquote must have a list of tokens	2025-05-14 21:38:50 -05:00
Rockingcool	c574699a8a	Added an import	2025-05-14 21:38:38 -05:00
Rockingcool	f55e160e25	Added tests for unordered lists	2025-05-14 21:37:58 -05:00
Rockingcool	dddcca0185	Wrote a lot of helper functions - most importantly greedyParse	2025-05-13 21:40:42 -05:00