Created a separate parser list for all parsers (except the unit parser

is replaced with the non-newline unit parser); use that parser when parsing list lines
Parse CSS classes in image and figure
2025-06-10 09:12:42 -04:00 · 2025-06-10 09:11:56 -04:00 · 2025-06-10 09:11:34 -04:00 · 2025-06-10 09:11:19 -04:00 · 2025-06-10 09:10:10 -04:00 · 2025-06-10 09:09:53 -04:00
2 changed files with 94 additions and 25 deletions
--- a/src/MdToHTML.hs
+++ b/src/MdToHTML.hs
@@ -6,9 +6,11 @@ module MdToHTML where

 import Control.Applicative hiding (many, some)
 import Control.Monad
+import Control.Monad.Combinators (count)
 import Data.Char
 import Data.List
 import Data.Ord (comparing)
+import Data.String.Utils
 import qualified Data.Text as T
 import Data.Void
 import Debug.Trace
@@ -20,6 +22,8 @@ type Parser = Parsec Void T.Text

 type HeaderLevel = Int

+type CssClass = String
+
 newtype URL = URL {getUrl :: String} deriving (Eq)

 newtype ImgPath = ImgPath {getPath :: String} deriving (Eq)
@@ -36,10 +40,11 @@ data MdToken
  | UnordList [MdToken]
  | OrdList [MdToken]
  | Code MdToken
-  | Codeblock String
+  | Table [[MdToken]]
+  | Codeblock MdToken
  | Link MdToken URL
-  | Image MdToken URL
-  | Figure MdToken URL
+  | Image MdToken URL (Maybe [CssClass])
+  | Figure MdToken URL (Maybe [CssClass])
  | Bold MdToken
  | Italic MdToken
  | Strikethrough MdToken
@@ -59,10 +64,11 @@ instance Show MdToken where
  show (UnordList tokens) = "<ul>" ++ concatMap (prepend "<li>" . append "</li>" . show) tokens ++ "</ul>"
  show (OrdList tokens) = "<ol>" ++ concatMap (prepend "<li>" . append "</li>" . show) tokens ++ "</ol>"
  show (Code code) = "<code>" ++ show code ++ "</code>"
-  show (Codeblock code) = show code
+  show (Table (thead : tokenGrid)) = "<table>\n<thead>\n<tr>\n" ++ concatMap (\x -> "<th>" ++ rstrip (show x) ++ "</th>\n") thead ++ "</tr>\n</thead>\n" ++ "<tbody>\n" ++ concatMap (\x -> "<tr>\n" ++ concatMap (\y -> "<td>" ++ rstrip (show y) ++ "</td>\n") x ++ "</tr>\n") tokenGrid ++ "</tbody>\n</table>\n"
+  show (Codeblock code) = "<pre><code>" ++ show code ++ "</code></pre>"
  show (Link txt url) = "<a href=\"" ++ getUrl url ++ "\">" ++ show txt ++ "</a>"
-  show (Image txt url) = "<img src=\"" ++ getUrl url ++ "\"" ++ " alt=\"" ++ show txt ++ "\" />"
-  show (Figure txt url) = "<figure><img src=\"" ++ getUrl url ++ "\" alt=\"" ++ show txt ++ "\"/><figcaption aria-hidden=\"true\">" ++ show txt ++ "</figcaption></figure>"
+  show (Image txt url cssClasses) = "<img src=\"" ++ getUrl url ++ "\"" ++ " alt=\"" ++ show txt ++ "\"" ++ maybe "" (\classes -> " class=\"" ++ unwords classes ++ "\"") cssClasses ++ "/>"
+  show (Figure txt url cssClasses) = "<figure><img src=\"" ++ getUrl url ++ "\" alt=\"" ++ show txt ++ "\"" ++ maybe "" (\classes -> " class=\"" ++ unwords classes ++ "\"") cssClasses ++ "/><figcaption aria-hidden=\"true\">" ++ show txt ++ "</figcaption></figure>"
  show (Bold token) = "<b>" ++ show token ++ "</b>"
  show (Italic token) = "<i>" ++ show token ++ "</i>"
  show (Strikethrough token) = "<s>" ++ show token ++ "</s>"
@@ -179,15 +185,16 @@ parseStrikethrough = do
 -- Parse code
 parseCode :: Parser MdToken
 parseCode = do
-  char '`'
-  inside <- manyTill (satisfy (/= '\n')) (char '`')
+  opening <- some $ char '`'
+  inside <- someTill (satisfy (/= '\n')) (char '`')
+  closing <- count (length opening - 1) (char '`')
  return (Code (Unit (concatMap escapeChar inside)))

 -- Parse a link
 parseLink :: Parser MdToken
 parseLink = do
  char '['
-  linkText <- someTill parseLineToken (char ']')
+  linkText <- manyTill parseLineToken (char ']')
  char '('
  linkURL <- manyTill anySingle (char ')')
  return $ Link (Line linkText) (URL linkURL)
@@ -200,6 +207,22 @@ parseLinebreak = do
  char '\n'
  return Linebreak

+parseTableRow :: Parser [MdToken]
+parseTableRow = do
+  char '|'
+  row <- some (many (satisfy (\x -> x == ' ' || x == '\t')) *> someTill parseListLineToken (char '|'))
+  return (map Line row)
+
+parseTable :: Parser MdToken
+parseTable = do
+  tableHead <- parseTableRow
+  char '\n'
+  char '|'
+  sepEndBy1 (some (char '-')) (char '|') *> char '\n'
+  tableBody <- sepEndBy parseTableRow (char '\n')
+  many (char '\n') -- Parse trailing newlines, if any
+  return $ Table (tableHead : tableBody)
+
 parseSingleNewline :: Parser MdToken
 parseSingleNewline = do
  char '\n'
@@ -208,19 +231,36 @@ parseSingleNewline = do
    [] -> return $ Unit ""
    _ -> return SingleNewline

+parseCssClasses :: Parser [CssClass]
+parseCssClasses = do
+  char '{'
+  classes <- some parseCssClass
+  char '}'
+  return classes
+  where
+    parseCssClass :: Parser CssClass
+    parseCssClass = do
+      char '.'
+      let firstLetterParser = char '_' <|> char '-' <|> label "letter" (satisfy isAlpha)
+      cssClassFirstLetter <- firstLetterParser
+      cssClass <- many (firstLetterParser <|> label "digit" (satisfy isDigit))
+      space
+      return (cssClassFirstLetter : cssClass)
+
 parseImage :: Parser MdToken
 parseImage = do
  char '!'
  link <- parseLink
+  cssClasses <- optional $ try parseCssClasses
  case link of
-    Link text path -> return $ Image text path
+    Link text path -> return $ Image text path cssClasses
    _ -> empty -- This should never be reached

 parseFigure = do
  img <- parseImage
  void (string doubleNewlineText) <|> eof
  case img of
-    Image text path -> return $ Figure text path
+    Image text path cssClasses -> return $ Figure text path cssClasses
    _ -> return img

 -- Parse an escaped character
@@ -237,6 +277,13 @@ parseUnit = do
  text <- anySingle
  return (Unit [text])

+-- Parse any character except a newline
+parseUnitExceptNewline :: Parser MdToken
+parseUnitExceptNewline = do
+  -- text <- satisfy (`notElem` specialChars)
+  text <- satisfy (/= '\n')
+  return (Unit [text])
+
 lineParsers :: [Parser MdToken]
 lineParsers =
  [ parseLinebreak,
@@ -251,17 +298,16 @@ lineParsers =
    parseUnit
  ] -- A 'line' doesn't include a 'header'

-listLineParsers :: [Parser MdToken]
-listLineParsers =
-  [ parseLinebreak,
-    parseEscapedChar,
+lineParsersWithoutNewline :: [Parser MdToken]
+lineParsersWithoutNewline =
+  [ parseEscapedChar,
    parseCode,
    parseImage,
    parseBold,
    parseItalic,
    parseStrikethrough,
    parseLink,
-    parseUnit
+    parseUnitExceptNewline
  ] -- A list line cannot contain newlines.

 -- List of all parsers
@@ -274,7 +320,7 @@ parseLineToken = fallthroughParser lineParsers

 -- Parse any of the list line tokens.
 parseListLineToken :: Parser MdToken
-parseListLineToken = fallthroughParser listLineParsers
+parseListLineToken = fallthroughParser lineParsersWithoutNewline

 -- Parse a line, consisting of one or more tokens.
 parseLine :: Parser MdToken
@@ -355,7 +401,7 @@ parseListLineItemCommon :: Parser MdToken
 parseListLineItemCommon = do
  space
  restOfLine <- manyTill parseListLineToken (void (char '\n') <|> eof)
-  nestedList <- parseListNested <|> return (Unit "")
+  nestedList <- try parseListNested <|> return (Unit "")
  return $ Line [Line restOfLine, nestedList]

 -- Parse an unordered list paragraph item.
@@ -386,31 +432,32 @@ parseListParaItemCommon = do

 -- Parse an unordered list item, which can be a line item or another list.
 parseUListItem :: Parser MdToken
-parseUListItem = try parseUListParaItem <|> parseUListLineItem
+parseUListItem = space *> (try parseUListParaItem <|> parseUListLineItem)

 -- Parse an unordered list.
 parseUnorderedList :: Parser MdToken
 parseUnorderedList = do
-  lineItems <- some parseUListItem
+  lineItems <- some $ try parseUListItem
  void (char '\n') <|> eof -- A list must end in an extra newline or eof
  return $ UnordList lineItems

 -- --------

 parseOListItem :: Parser MdToken
-parseOListItem = try parseOListParaItem <|> parseOListLineItem
+parseOListItem = space *> (try parseOListParaItem <|> parseOListLineItem)

 -- Parses the first element of an ordered list, which must start with '1.'
 parseFirstOListItem :: Parser MdToken
 parseFirstOListItem = do
+  space
  remaining <- getInput
  when (take 2 (T.unpack remaining) /= "1.") empty
  parseOListLineItem

 parseOrderedList :: Parser MdToken
 parseOrderedList = do
-  firstLine <- parseFirstOListItem
-  lineItems <- some parseOListItem
+  firstLine <- try parseFirstOListItem
+  lineItems <- many $ try parseOListItem
  void (char '\n') <|> eof
  return $ OrdList (firstLine : lineItems)

@@ -423,9 +470,17 @@ doubleNewlineText = T.pack "\n\n"
 parseHorizontalRule :: Parser MdToken
 parseHorizontalRule = string horizontalRuleText *> (void (string doubleNewlineText) <|> eof) *> return HorizontalRule

+parseCodeblock :: Parser MdToken
+parseCodeblock = do
+  string (T.pack "```\n")
+  inside <- someTill anySingle (string (T.pack "\n```"))
+  return $ Codeblock (Unit (concatMap escapeChar inside))
+
 documentParsers :: [Parser MdToken]
 documentParsers =
  [ parseHorizontalRule,
+    parseCodeblock,
+    parseTable,
    parseHeader,
    parseBlockquote,
    parseUnorderedList,
--- a/src/MdToHtmlTest.hs
+++ b/src/MdToHtmlTest.hs
@@ -90,7 +90,8 @@ orderedListTests =
      check_equal "Paragraph before list" "<h3>This is a list</h3><ol><li>Item 1</li><li>Item 2</li></ol>" (convert "### This is a list\n\n1. Item 1\n200. Item 2"),
      check_equal "Nested list then back" "<ol><li>Item 1</li><li>Item 2<ol><li>Item 3</li><li>Item 4</li></ol></li><li>Item 5</li></ol>" (convert "1. Item 1\n2. Item 2\n    1. Item 3\n    3. Item 4\n5. Item 5"),
      check_equal "Blockquote in list" "<ol><li>Item 1</li><li><p>Item 2</p><blockquote><p>Quote</p></blockquote></li><li>Item 3</li></ol>" (convert "1. Item 1\n2. Item 2\n\n    > Quote\n\n3. Item 3"),
-      check_equal "Unordered list in ordered list" "<ol><li>Item 1</li><li>Item 2<ul><li>Item 1</li><li>Item 2</li></ul></li><li>Item 3</li></ol>" (convert "1. Item 1\n2. Item 2\n    - Item 1\n    * Item 2\n4. Item 3")
+      check_equal "Unordered list in ordered list" "<ol><li>Item 1</li><li>Item 2<ul><li>Item 1</li><li>Item 2</li></ul></li><li>Item 3</li></ol>" (convert "1. Item 1\n2. Item 2\n    - Item 1\n    * Item 2\n4. Item 3"),
+      check_equal "List with just 1 item" "<ol><li>Item 1</li></ol>" (convert "1. Item 1")
    ]

 htmlTests =
@@ -106,7 +107,8 @@ codeTests =

 imageTests =
  TestList
-    [ check_equal "Image with text" "<p>This is an image <img src=\"img.png\" alt=\"Image 1\" /></p>" (convert "This is an image ![Image 1](img.png)")
+    [ check_equal "Image with text" "<p>This is an image <img src=\"img.png\" alt=\"Image 1\"/></p>" (convert "This is an image ![Image 1](img.png)"),
+      check_equal "Image with classes" "<p>This is an image <img src=\"img.png\" alt=\"Image 1\" class=\"new-img\"/></p>" (convert "This is an image ![Image 1](img.png){.new-img}")
    ]

 figureTests =
@@ -118,6 +120,17 @@ horizontalRuleTests =
  TestList
    [check_equal "Horizontal Rule" "<p>a</p><hr><p>b</p>" (convert "a\n\n---\n\nb")]

+tableTests =
+  TestList
+    [ check_equal
+        "Basic table"
+        "<table>\
+        \<thead><tr><th>Col 1</th><th>Col 2</th><th>Col 3</th></tr></thead>\
+        \<tbody><tr><td>Data 1</td><td>Data 2</td><td>Data 3</td></tr>\
+        \<tr><td>More Data 1</td><td>More Data 2</td><td>More Data 3</td></tr></tbody></table>"
+        (convert "| Col 1 | Col 2 | Col 3 |\n|---|---|---|\n| Data 1 | Data 2 | Data 3 |\n| More Data 1 | More Data 2 | More Data 3 |")
+    ]
+
 integrationTests =
  TestList
    [ check_equal "Integration 1" "<h1>Sample Markdown</h1><p>This is some basic, sample markdown.</p><h2><b>Second</b> <i>Heading</i></h2>" (convert "# Sample Markdown\n\n This is some basic, sample markdown.\n\n  ## __Second__ _Heading_"),
@@ -157,6 +170,7 @@ tests =
      figureTests,
      codeTests,
      horizontalRuleTests,
+      tableTests,
      integrationTests
    ]
Author	SHA1	Message	Date
Aadhavan Srinivasan	6b99a1835d	Created a separate parser list for all parsers (except the unit parser is replaced with the non-newline unit parser); use that parser when parsing list lines	2025-06-10 09:12:42 -04:00
Aadhavan Srinivasan	04167e0f96	Parse CSS classes in image and figure	2025-06-10 09:11:56 -04:00
Aadhavan Srinivasan	0528e813c5	Parser for CSS classes	2025-06-10 09:11:34 -04:00
Aadhavan Srinivasan	b1b99189c9	Link tect can be empty; inline code cannot be empty and can have nested backticks; created a unit parser for all characters except newline	2025-06-10 09:11:19 -04:00
Aadhavan Srinivasan	ade3768e29	Try and backtrack	2025-06-10 09:10:10 -04:00
Aadhavan Srinivasan	fd6d39ecd6	Parse space at beginning of list	2025-06-10 09:09:53 -04:00
Aadhavan Srinivasan	0f04342867	More trying and backtracking; parse and discard extraneous spaces at beginning of list	2025-06-10 09:09:31 -04:00
Aadhavan Srinivasan	80ef93bbc9	Try parsing an ordered list item, backtrack if not possible	2025-06-10 09:08:52 -04:00
Aadhavan Srinivasan	b73d4131b6	Added support for tables and codeblocks Defined the types, defined 'show', created the parsers, added them to parser list	2025-06-10 09:08:30 -04:00
Aadhavan Srinivasan	c48b8c5ae8	Images and figures now support CSS classes	2025-06-10 09:05:31 -04:00
Aadhavan Srinivasan	cf4282b26e	More imports	2025-06-10 09:04:28 -04:00
Aadhavan Srinivasan	7b40d6fe7c	Imports	2025-06-10 09:04:18 -04:00
Aadhavan Srinivasan	c4255d4578	Added a test for a list with just one item	2025-06-10 09:04:02 -04:00
Aadhavan Srinivasan	dcbbff13cb	Spacing change	2025-06-10 09:03:48 -04:00
Aadhavan Srinivasan	592fad2b46	Added tests for tables	2025-06-10 09:03:32 -04:00