Added more tests

Improve document parser to parse extra newlines
Fix bug in fallthrough parser
2025-07-30 15:34:43 -04:00 · 2025-07-30 15:32:35 -04:00 · 2025-07-30 15:29:25 -04:00 · 2025-07-30 15:29:03 -04:00 · 2025-07-30 15:28:35 -04:00 · 2025-07-22 09:56:41 -04:00
4 changed files with 139 additions and 26 deletions
--- a/app/Main.hs
+++ b/app/Main.hs
@@ -24,4 +24,7 @@ main = do
    [] -> getContents
    x : _ -> readFile x
  let res = leftmostLongestParse parseDocument fileContents
-  print res
+  let toPrint = prettyPrint res
+  case reverse toPrint of
+    '\n' : _ -> putStr toPrint
+    _ -> putStrLn toPrint
--- a/md-to-html.cabal
+++ b/md-to-html.cabal
@@ -60,9 +60,12 @@ library
   build-depends:    base ^>=4.19.1.0,
                      HUnit,
                      megaparsec,
-                      text
+                      parser-combinators,
+                      text,
+                      MissingH,
+                      word-wrap

-executable md-to-html-runner
+executable mdtoh
    -- Import common warning flags.
    import:           warnings

--- a/src/MdToHTML.hs
+++ b/src/MdToHTML.hs
@@ -17,6 +17,7 @@ import Debug.Trace
 import Text.Megaparsec
 import Text.Megaparsec.Char
 import Text.Printf
+import Text.Wrap

 type Parser = Parsec Void T.Text

@@ -39,6 +40,7 @@ data MdToken
  | Blockquote [MdToken]
  | UnordList [MdToken]
  | OrdList [MdToken]
+  | Checkbox Bool
  | Code MdToken
  | Table [[MdToken]]
  | Codeblock MdToken
@@ -47,6 +49,8 @@ data MdToken
  | Figure MdToken URL (Maybe [CssClass])
  | Bold MdToken
  | Italic MdToken
+  | Superscript MdToken
+  | Subscript MdToken
  | Strikethrough MdToken
  | Unit String
  deriving (Eq)
@@ -55,25 +59,40 @@ data MdToken
 instance Show MdToken where
  show (Document tokens) = concatMap show tokens
  show (Header level token) = "<h" ++ show level ++ ">" ++ show token ++ "</h" ++ show level ++ ">"
-  show (Para token) = "<p>" ++ show token ++ "</p>\n"
+  show (Para token) = "<p>" ++ show token ++ "</p>"
  show (Line tokens) = concatMap show tokens
-  show Linebreak = "<br>"
+  show Linebreak = "<br />"
  show SingleNewline = " "
  show HorizontalRule = "<hr>"
  show (Blockquote tokens) = "<blockquote>" ++ concatMap show tokens ++ "</blockquote>"
  show (UnordList tokens) = "<ul>" ++ concatMap (prepend "<li>" . append "</li>" . show) tokens ++ "</ul>"
  show (OrdList tokens) = "<ol>" ++ concatMap (prepend "<li>" . append "</li>" . show) tokens ++ "</ol>"
-  show (Code code) = "<code>" ++ show code ++ "</code>"
-  show (Table (thead : tokenGrid)) = "<table>\n<thead>\n<tr>\n" ++ concatMap (\x -> "<th>" ++ rstrip (show x) ++ "</th>\n") thead ++ "</tr>\n</thead>\n" ++ "<tbody>\n" ++ concatMap (\x -> "<tr>\n" ++ concatMap (\y -> "<td>" ++ rstrip (show y) ++ "</td>\n") x ++ "</tr>\n") tokenGrid ++ "</tbody>\n</table>\n"
+  show (Checkbox isChecked) = "<input type=\"checkbox\"" ++ (if isChecked then " checked=\"\"" else "") ++ " />"
+  show (Code code) = "<code>" ++ strip (show code) ++ "</code>"
+  show (Table (thead : tokenGrid)) = "<table><thead><tr>" ++ concatMap (\x -> "<th>" ++ rstrip (show x) ++ "</th>") thead ++ "</tr></thead>" ++ "<tbody>" ++ concatMap (\x -> "<tr>" ++ concatMap (\y -> "<td>" ++ rstrip (show y) ++ "</td>") x ++ "</tr>") tokenGrid ++ "</tbody></table>"
  show (Codeblock code) = "<pre><code>" ++ show code ++ "</code></pre>"
  show (Link txt url) = "<a href=\"" ++ getUrl url ++ "\">" ++ show txt ++ "</a>"
  show (Image txt url cssClasses) = "<img src=\"" ++ getUrl url ++ "\"" ++ " alt=\"" ++ show txt ++ "\"" ++ maybe "" (\classes -> " class=\"" ++ unwords classes ++ "\"") cssClasses ++ "/>"
  show (Figure txt url cssClasses) = "<figure><img src=\"" ++ getUrl url ++ "\" alt=\"" ++ show txt ++ "\"" ++ maybe "" (\classes -> " class=\"" ++ unwords classes ++ "\"") cssClasses ++ "/><figcaption aria-hidden=\"true\">" ++ show txt ++ "</figcaption></figure>"
  show (Bold token) = "<b>" ++ show token ++ "</b>"
  show (Italic token) = "<i>" ++ show token ++ "</i>"
+  show (Superscript token) = "<sup>" ++ show token ++ "</sup>"
+  show (Subscript token) = "<sub>" ++ show token ++ "</sub>"
  show (Strikethrough token) = "<s>" ++ show token ++ "</s>"
  show (Unit unit) = printf "%s" unit

+-- Pretty print the given token into a string.
+-- This is the same as calling 'show' for most tokens, but is different for paragraphs and tables,
+-- which have newlines inserted into them.
+prettyPrint :: MdToken -> String
+prettyPrint (Para token) = "<p>" ++ T.unpack (wrapText defaultWrapSettings 70 (T.pack $ prettyPrint token)) ++ "</p>\n"
+prettyPrint (Table (thead : tokenGrid)) = "<table>\n<thead>\n<tr>\n" ++ concatMap (\x -> "<th>" ++ rstrip (prettyPrint x) ++ "</th>\n") thead ++ "</tr>\n</thead>\n" ++ "<tbody>\n" ++ concatMap (\x -> "<tr>\n" ++ concatMap (\y -> "<td>" ++ rstrip (prettyPrint y) ++ "</td>\n") x ++ "</tr>\n") tokenGrid ++ "</tbody>\n</table>\n"
+prettyPrint Linebreak = "<br />\n"
+prettyPrint HorizontalRule = "<hr>\n"
+prettyPrint (Line tokens) = concatMap prettyPrint tokens
+prettyPrint (Document tokens) = concatMap prettyPrint tokens
+prettyPrint token = show token
+
 instance Semigroup MdToken where
  a <> b = Document [a, b]

@@ -126,7 +145,7 @@ parseTillEol = manyTill anySingle (void (char '\n') <|> eof)
 -- Takes a list of parsers. Returns a parser that will try them in
 -- order, moving to the next one only if the current one fails.
 fallthroughParser :: [Parser a] -> Parser a
-fallthroughParser [x] = x
+fallthroughParser [x] = try x
 fallthroughParser (x : xs) = try x <|> fallthroughParser xs

 escapeChar :: Char -> String
@@ -138,6 +157,19 @@ escapeChar x = [x]
 htmlEscapeChars :: T.Text -> T.Text
 htmlEscapeChars = T.concatMap (T.pack . escapeChar)

+-- -- Wraps a list of words after (at most) the given number of characters, trying to prevent word-breaks
+-- wordwrap :: Int -> String -> String
+-- wordwrap wraplength str = if (length str) < wraplength
+--     then str
+--     else
+--         let spaceIndex = lastgtSpaceIndex 0 (takeRev (length str) - wraplength str)
+--
+--     where
+--         takeRev n = (reverse . take n . reverse)
+--         lastSpaceIndex counter str = case str of
+--             [] -> counter
+--             x:xs -> if (isSpace x) counter else lastSpaceIndex counter+1 xs
+
 -- ---------------

 -- Parse a markdown header, denoted by 1-6 #'s followed by some text, followed by EOL.
@@ -175,6 +207,20 @@ parseItalic = parseItalicWith '*' <|> parseItalicWith '_'
      inside <- someTill parseLineToken (char delim)
      return (Italic (Line inside))

+-- Parse subscript
+parseSubscript :: Parser MdToken
+parseSubscript = do
+  char '~'
+  inside <- someTill parseLineToken (char '~')
+  return (Subscript (Line inside))
+
+-- Parse superscript
+parseSuperscript :: Parser MdToken
+parseSuperscript = do
+  char '^'
+  inside <- someTill parseLineToken (char '^')
+  return (Superscript (Line inside))
+
 -- Parse strikethrough text
 parseStrikethrough :: Parser MdToken
 parseStrikethrough = do
@@ -201,11 +247,17 @@ parseLink = do

 -- Parse a linebreak character
 parseLinebreak :: Parser MdToken
-parseLinebreak = do
-  char ' '
-  some (char ' ')
-  char '\n'
-  return Linebreak
+parseLinebreak = parseLinebreakSpace <|> parseLinebreakBackslash
+  where
+    parseLinebreakSpace = do
+      char ' '
+      some (char ' ')
+      char '\n'
+      return Linebreak
+    parseLinebreakBackslash = try $ do
+      char '\\'
+      char '\n'
+      return Linebreak

 parseTableRow :: Parser [MdToken]
 parseTableRow = do
@@ -294,6 +346,8 @@ lineParsers =
    parseBold,
    parseItalic,
    parseStrikethrough,
+    parseSubscript,
+    parseSuperscript,
    parseLink,
    parseUnit
  ] -- A 'line' doesn't include a 'header'
@@ -306,6 +360,8 @@ lineParsersWithoutNewline =
    parseBold,
    parseItalic,
    parseStrikethrough,
+    parseSubscript,
+    parseSuperscript,
    parseLink,
    parseUnitExceptNewline
  ] -- A list line cannot contain newlines.
@@ -371,13 +427,28 @@ parseBlockquote = do
  let parsedQuotedLines = leftmostLongestParse (some (parseBlockquote <|> parsePara)) (init $ unlines quotedLines) -- unlines joins the lines together with a newline, and adds a trailing newline. init removes the trailing newline.
  return (Blockquote parsedQuotedLines)

+-- Parse a checkbox
+parseCheckbox :: Parser MdToken
+parseCheckbox = do
+  char '['
+  inside <- char ' ' <|> char 'x'
+  char ']'
+  space
+  return (if inside == 'x' then Checkbox True else Checkbox False)
+
 -- Parse a nested list item.
 parseListNested :: Parser MdToken
 parseListNested = do
-  let firstCharParser = string (T.pack "    ") <|> string (T.pack "\t")
+  let firstCharParser = (<>) <$> (string (T.pack "  ") <|> string (T.pack "\t")) <*> (T.pack <$> many (char ' '))
  let restOfLineParser = manyTill anySingle (void (char '\n') <|> eof)
-  lines <- greedyParse1 (firstCharParser *> restOfLineParser)
-  let linesParsed = leftmostLongestParse (parseUnorderedList <|> parseOrderedList) (init $ unlines lines)
+  -- For the first line, I manually run firstCharParser and restOfLineParser. The
+  -- result of firstCharParser is saved. For every subsequent line, I parse exactly
+  -- the same string as firstCharParser.
+  firstLineSpaces <- firstCharParser
+  firstLine <- restOfLineParser
+  lines <- greedyParse (string firstLineSpaces *> restOfLineParser)
+  let allLines = firstLine : lines
+  let linesParsed = leftmostLongestParse (parseUnorderedList <|> parseOrderedList) (init $ unlines allLines)
  when (null (show linesParsed)) empty
  return linesParsed

@@ -400,9 +471,12 @@ parseOListLineItem = do
 parseListLineItemCommon :: Parser MdToken
 parseListLineItemCommon = do
  space
+  checkbox <- optional $ try parseCheckbox
  restOfLine <- manyTill parseListLineToken (void (char '\n') <|> eof)
  nestedList <- try parseListNested <|> return (Unit "")
-  return $ Line [Line restOfLine, nestedList]
+  case checkbox of
+    Just box -> return $ Line [box, Line restOfLine, nestedList]
+    Nothing -> return $ Line [Line restOfLine, nestedList]

 -- Parse an unordered list paragraph item.
 parseUListParaItem :: Parser MdToken
@@ -468,11 +542,15 @@ doubleNewlineText :: T.Text
 doubleNewlineText = T.pack "\n\n"

 parseHorizontalRule :: Parser MdToken
-parseHorizontalRule = string horizontalRuleText *> (void (string doubleNewlineText) <|> eof) *> return HorizontalRule
+parseHorizontalRule = parseHorizontalRuleLine *> (void (string doubleNewlineText) <|> eof) *> return HorizontalRule
+  where
+    parseHorizontalRuleLine = fallthroughParser (map (string . T.pack) ["---", "***", "___", "- - -", "* * *", "_ _ _"])

 parseCodeblock :: Parser MdToken
 parseCodeblock = do
-  string (T.pack "```\n")
+  string (T.pack "```")
+  _ <- many $ satisfy (/= '\n') -- Language name
+  char '\n'
  inside <- someTill anySingle (string (T.pack "\n```"))
  return $ Codeblock (Unit (concatMap escapeChar inside))

@@ -492,5 +570,8 @@ documentParsers =
 -- Parse a document, which is multiple paragraphs.
 parseDocument :: Parser MdToken
 parseDocument = do
-  res <- manyTill (fallthroughParser documentParsers) eof
+  --  res <- manyTill (fallthroughParser documentParsers <|> (char '\n' *> return $ Unit "")) eof
+  res <- sepEndBy (fallthroughParser documentParsers) (many $ char '\n')
+  -- many $ char '\n'
+  eof
  return (Document res)
--- a/src/MdToHtmlTest.hs
+++ b/src/MdToHtmlTest.hs
@@ -4,7 +4,7 @@ import MdToHTML
 import Test.HUnit

 check_equal :: String -> String -> String -> Test
-check_equal desc expected actual = TestCase (assertEqual desc expected (filter (/= '\n') actual))
+check_equal desc expected actual = TestCase (assertEqual desc expected actual)

 convert :: String -> String
 convert md = show $ leftmostLongestParse parseDocument md
@@ -31,7 +31,7 @@ boldTests =
 strikethroughTests =
  TestList
    [ check_equal "Should convert strikethrough" "<p><s>Hello</s></p>" (convert "~~Hello~~"),
-      check_equal "Should convert long sentence with tilde" "<p><s>The universe is ~7 days old</s>. The universe is 13 billion years old.</p>" (convert "~~The universe is ~7 days old~~. The universe is 13 billion years old.")
+      check_equal "Should convert long sentence with tilde" "<p><s>The universe is ~7 days old</s>. The universe is 13 billion years old.</p>" (convert "~~The universe is \\~7 days old~~. The universe is 13 billion years old.")
    ]

 linkTests =
@@ -74,8 +74,17 @@ unorderedListTests =
      check_equal "Paragraph before list" "<p>This is a list</p><ul><li>Item 1</li><li>Item 2</li></ul>" (convert "This is a list\n\n* Item 1\n* Item 2"),
      check_equal "Paragraph before list" "<h3>This is a list</h3><ul><li>Item 1</li><li>Item 2</li></ul>" (convert "### This is a list\n\n* Item 1\n* Item 2"),
      check_equal "Nested list then back" "<ul><li>Item 1</li><li>Item 2<ul><li>Item 3</li><li>Item 4</li></ul></li><li>Item 5</li></ul>" (convert "- Item 1\n- Item 2\n    - Item 3\n    - Item 4\n- Item 5"),
+      check_equal "Triply nested list" "<ul><li>Item 1</li><li>Item 2<ul><li>Item 3<ul><li>Item 4</li></ul></li></ul></li><li>Item 5</li></ul>" (convert "- Item 1\n- Item 2\n    - Item 3\n      - Item 4\n- Item 5"),
      check_equal "Blockquote in list" "<ul><li>Item 1</li><li><p>Item 2</p><blockquote><p>Quote</p></blockquote></li><li>Item 3</li></ul>" (convert "- Item 1\n- Item 2\n\n    > Quote\n\n- Item 3"),
-      check_equal "Ordered list in unordered list" "<ul><li>Item 1</li><li>Item 2<ol><li>Item 1</li><li>Item 2</li></ol></li><li>Item 3</li></ul>" (convert "- Item 1\n- Item 2\n    1. Item 1\n    2. Item 2\n- Item 3")
+      check_equal "Ordered list in unordered list" "<ul><li>Item 1</li><li>Item 2<ol><li>Item 1</li><li>Item 2</li></ol></li><li>Item 3</li></ul>" (convert "- Item 1\n- Item 2\n    1. Item 1\n    2. Item 2\n- Item 3"),
+      check_equal
+        "Checkbox in unordered list"
+        "<ul>\
+        \<li><input type=\"checkbox\" />Not checked</li>\
+        \<li><input type=\"checkbox\" checked=\"\" />Checked</li>\
+        \<li>Normal list item</li></ul>"
+        (convert "- [ ]     Not checked\n- [x]   Checked\n- Normal list item"),
+      check_equal "List with link at the start" "<ul><li><a href=\"b\">a</a></li><li><a href=\"d\">c</a></li></ul>" (convert "- [a](b)\n- [c](d)")
    ]

 orderedListTests =
@@ -91,7 +100,14 @@ orderedListTests =
      check_equal "Nested list then back" "<ol><li>Item 1</li><li>Item 2<ol><li>Item 3</li><li>Item 4</li></ol></li><li>Item 5</li></ol>" (convert "1. Item 1\n2. Item 2\n    1. Item 3\n    3. Item 4\n5. Item 5"),
      check_equal "Blockquote in list" "<ol><li>Item 1</li><li><p>Item 2</p><blockquote><p>Quote</p></blockquote></li><li>Item 3</li></ol>" (convert "1. Item 1\n2. Item 2\n\n    > Quote\n\n3. Item 3"),
      check_equal "Unordered list in ordered list" "<ol><li>Item 1</li><li>Item 2<ul><li>Item 1</li><li>Item 2</li></ul></li><li>Item 3</li></ol>" (convert "1. Item 1\n2. Item 2\n    - Item 1\n    * Item 2\n4. Item 3"),
-      check_equal "List with just 1 item" "<ol><li>Item 1</li></ol>" (convert "1. Item 1")
+      check_equal "List with just 1 item" "<ol><li>Item 1</li></ol>" (convert "1. Item 1"),
+      check_equal
+        "Checkbox in ordered list"
+        "<ol>\
+        \<li><input type=\"checkbox\" />Not checked</li>\
+        \<li><input type=\"checkbox\" checked=\"\" />Checked</li>\
+        \<li>Normal list item</li></ol>"
+        (convert "1. [ ]     Not checked\n2. [x]   Checked\n3. Normal list item")
    ]

 htmlTests =
@@ -102,7 +118,9 @@ codeTests =
  TestList
    [ check_equal "Code by itself" "<p><code>Hello world!</code></p>" (convert "`Hello world!`"),
      check_equal "Code in a paragraph" "<p>The following <code>text</code> is code</p>" (convert "The following `text` is code"),
-      check_equal "Code across paragraphs (shouldn't work)" "<p>`Incomplete</p><p>Code`</p>" (convert "`Incomplete\n\nCode`") -- At the moment, this is just treated as a syntax error, so nothing is rendered.
+      check_equal "Code across paragraphs (shouldn't work)" "<p>`Incomplete</p><p>Code`</p>" (convert "`Incomplete\n\nCode`"), -- At the moment, this is just treated as a syntax error, so nothing is rendered.
+      check_equal "Code block" "<pre><code>Test code block</code></pre>" (convert "```\nTest code block\n```"),
+      check_equal "Multiple code blocks" "<pre><code>Test code block</code></pre><pre><code>Next block</code></pre>" (convert "```\nTest code block\n```\n\n```\nNext block\n```")
    ]

 imageTests =
@@ -120,6 +138,14 @@ horizontalRuleTests =
  TestList
    [check_equal "Horizontal Rule" "<p>a</p><hr><p>b</p>" (convert "a\n\n---\n\nb")]

+subscriptTests =
+  TestList
+    [check_equal "Should convert subscript" "A<sub>b</sub>" (convert "A~b~")]
+
+superscriptTests =
+  TestList
+    [check_equal "Should convert superscript" "A<sup>b</sup>" (convert "A^b^")]
+
 tableTests =
  TestList
    [ check_equal
@@ -138,7 +164,7 @@ integrationTests =
      check_equal "Integration 3" "<h1>Hello</h1><p>World</p>" (convert "# Hello\nWorld"),
      check_equal "Integration 4" "<p>a b</p>" (convert "a\nb"),
      check_equal "Integration 5" "<h1>Hello</h1>" (convert "# Hello\n"),
-      check_equal "Integration 6" "<p>First line<br>Second line</p>" (convert "First line  \nSecond line"),
+      check_equal "Integration 6" "<p>First line<br />Second line</p>" (convert "First line  \nSecond line"),
      check_equal
        "Integration 7"
        "<h1>Sample Markdown</h1><p>This is some basic, sample markdown.</p><h2>Second \
Author	SHA1	Message	Date
Aadhavan Srinivasan	e5795e0d75	Added more tests	2025-07-30 15:34:43 -04:00
Aadhavan Srinivasan	eae897a2d6	Improve document parser to parse extra newlines	2025-07-30 15:32:35 -04:00
Aadhavan Srinivasan	8152b89a23	Fix bug in fallthrough parser	2025-07-30 15:29:25 -04:00
Aadhavan Srinivasan	9d3d656065	Improved parser for horizontal rule and codeblock	2025-07-30 15:29:03 -04:00
Aadhavan Srinivasan	d4a550f6a7	Added token and parser for list checkbox	2025-07-30 15:28:35 -04:00
Aadhavan Srinivasan	2b21aeae89	Added implementation and tests for subscript and superscript; fixed nested list parsing	2025-07-22 09:56:41 -04:00
Aadhavan Srinivasan	ca328a464a	Add prettyPrint definition for horizontal rule	2025-07-15 10:51:10 -04:00
Aadhavan Srinivasan	7d45b1123f	Renamed md-to-html-runner to mdtoh	2025-06-18 15:21:54 -04:00
Aadhavan Srinivasan	9627abcd12	Updated test	2025-06-10 14:14:56 -04:00
Aadhavan Srinivasan	82277e9ea8	Only add newlines for linebreak when pretty printing	2025-06-10 14:14:26 -04:00
Aadhavan Srinivasan	d074b0131c	Parse linebreaks as a backslash before a newline	2025-06-10 14:10:45 -04:00
Aadhavan Srinivasan	57cb3e68fa	Import module for word wrapping; add package to cabal file	2025-06-10 14:02:20 -04:00
Aadhavan Srinivasan	4e9f84c2bb	Add function to pretty print; commented out my word wrap and use a built-in one instead	2025-06-10 14:01:52 -04:00
Aadhavan Srinivasan	e025614324	Print extra newline if output text doesn't include a newline	2025-06-10 09:13:40 -04:00
Aadhavan Srinivasan	e711444066	Add more packages to cabal file	2025-06-10 09:13:15 -04:00