From d771460bb1973f096263876ac0be432ae6782bf7 Mon Sep 17 00:00:00 2001
From: Aadhavan Srinivasan <aadhavan@twomorecents.org>
Date: Sat, 3 May 2025 13:50:36 -0400
Subject: [PATCH] Skip spaces at beginning of line; parse a line and header
 separately; add a Document data constructor for MdToken (consists of the
 entire document)

---
 src/MdToHTML.hs | 37 ++++++++++++++++++++++++++++---------
 1 file changed, 28 insertions(+), 9 deletions(-)
diff --git a/src/MdToHTML.hs b/src/MdToHTML.hs
index 568b6f0..6eff55e 100644
--- a/src/MdToHTML.hs
+++ b/src/MdToHTML.hs
@@ -15,7 +15,8 @@ newtype ImgPath = ImgPath {getPath :: String}
 parseMany :: ReadP a -> ReadP [a]
 parseMany = Text.ParserCombinators.ReadP.many
 
-data MdToken = Header HeaderLevel MdToken
+data MdToken = Document [MdToken]
+             | Header HeaderLevel MdToken
              | Para MdToken
              | Line [MdToken]
              | Linebreak
@@ -34,6 +35,7 @@ data MdToken = Header HeaderLevel MdToken
 
 -- Deriving Show for MdToken
 instance Show MdToken where
+    show (Document tokens) = concat(map show tokens)
     show (Header level token) = "<h" ++ show level ++ ">" ++ show token ++ "</h" ++ show level ++ ">"
     show (Para token) = "<p>" ++ show token ++ "</p>"
     show (Line tokens) =  concat(map show tokens)
@@ -83,6 +85,7 @@ lineToList (Line tokens) = tokens
 -- Parse a markdown header, denoted by 1-6 #'s followed by some text, followed by EOL.
 parseHeader :: ReadP MdToken
 parseHeader = do
+        skipSpaces
         headers <- many1 mustBeHash
         when ((length headers) > 6)
             pfail
@@ -125,25 +128,41 @@ parseString = do
          text <- munch (\x -> not (elem x "#*_[\n"))
          return (Unit (firstChar:text))
 
+lineParsers :: [ReadP MdToken]
+lineParsers = [parseLinebreak, parseBold, parseItalic, parseString] -- A 'line' doesn't include a 'header', so parseHeader is deliberately absent (allParsers adds it).
+
+-- List of all parsers
+allParsers :: [ReadP MdToken]
+allParsers = parseHeader:lineParsers
+
 -- Parse any of the above tokens.
-parseToken :: ReadP MdToken
-parseToken = choice [parseHeader, parseLinebreak, parseBold, parseItalic, parseString]
+parseLineToken :: ReadP MdToken
+parseLineToken = choice lineParsers
 
 -- Parse a line, consisting of one or more tokens.
 parseLine :: ReadP MdToken
 parseLine =  do
+         skipSpaces
+         -- Fail if we have reached the end of the document.
          remaining <- look
          when (null remaining) pfail
-         parsed <- parseMany parseToken
+         parsed <- parseMany parseLineToken
 --         traceM $ show parsed
          return (Line parsed)
 
 -- Parse a paragraph, which is a 'Line' (can span multiple actual lines), separated by double-newlines.
+-- As a weird special case, a 'Paragraph' can also be a 'Header'.
 parsePara :: ReadP MdToken
 parsePara = do
            parseMany (char '\n')
-           text <- many1 (lookaheadParse (\x -> ((length x) < 2) || (take 2 x) /= "\n\n"))
-           string "\n\n"
-           -- I don't consume the ending double-newline, because the next paragraph will consume it as part of its starting double-newline.
-           let parsedText = fst $ leftmostLongestParse parseLine text
-           return (Para parsedText)
+           text <- many1 (lookaheadParse (\x -> ((length x) < 2) || (take 2 x) /= "\n\n")) -- Parse until a double-newline.
+           string "\n\n" <|> (eof >> return "") -- Consume the next double-newline or EOF.
+           let parsedText = fst $ leftmostLongestParse (parseHeader <|> parseLine) text -- Parse either a line or a header.
+           -- If the paragraph is a header, return a Header token. Otherwise return a Para token.
+           case parsedText of
+                Header level token -> return (Header level token)
+                _ -> return (Para parsedText)
+
+-- Parse an entire document: one or more paragraphs, wrapped in a 'Document' token.
+parseDocument :: ReadP MdToken
+parseDocument = Document <$> many1 parsePara