From d771460bb1973f096263876ac0be432ae6782bf7 Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Sat, 3 May 2025 13:50:36 -0400 Subject: [PATCH] Skip spaces at beginning of line; parse a line and header separately; add a Document data constructor for MdToken (consists of the entire document --- src/MdToHTML.hs | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/src/MdToHTML.hs b/src/MdToHTML.hs index 568b6f0..6eff55e 100644 --- a/src/MdToHTML.hs +++ b/src/MdToHTML.hs @@ -15,7 +15,8 @@ newtype ImgPath = ImgPath {getPath :: String} parseMany :: ReadP a -> ReadP [a] parseMany = Text.ParserCombinators.ReadP.many -data MdToken = Header HeaderLevel MdToken +data MdToken = Document [MdToken] + | Header HeaderLevel MdToken | Para MdToken | Line [MdToken] | Linebreak @@ -34,6 +35,7 @@ data MdToken = Header HeaderLevel MdToken -- Deriving Show for MdToken instance Show MdToken where + show (Document tokens) = concat(map show tokens) show (Header level token) = "" ++ show token ++ "" show (Para token) = "

" ++ show token ++ "

" show (Line tokens) = concat(map show tokens) @@ -83,6 +85,7 @@ lineToList (Line tokens) = tokens -- Parse a markdown header, denoted by 1-6 #'s followed by some text, followed by EOL. parseHeader :: ReadP MdToken parseHeader = do + skipSpaces headers <- many1 mustBeHash when ((length headers) > 6) pfail @@ -125,25 +128,41 @@ parseString = do text <- munch (\x -> not (elem x "#*_[\n")) return (Unit (firstChar:text)) +lineParsers :: [ReadP MdToken] +lineParsers = [parseHeader, parseLinebreak, parseBold, parseItalic, parseString] -- A 'line' doesn't include a 'header' + +-- List of all parsers +allParsers :: [ReadP MdToken] +allParsers = parseHeader:lineParsers + -- Parse any of the above tokens. -parseToken :: ReadP MdToken -parseToken = choice [parseHeader, parseLinebreak, parseBold, parseItalic, parseString] +parseLineToken :: ReadP MdToken +parseLineToken = choice lineParsers -- Parse a line, consisting of one or more tokens. parseLine :: ReadP MdToken parseLine = do + skipSpaces + -- Fail if we have reached the end of the document. remaining <- look when (null remaining) pfail - parsed <- parseMany parseToken + parsed <- parseMany parseLineToken -- traceM $ show parsed return (Line parsed) -- Parse a paragraph, which is a 'Line' (can span multiple actual lines), separated by double-newlines. +-- As a weird special case, a 'Paragraph' can also be a 'Header'. parsePara :: ReadP MdToken parsePara = do parseMany (char '\n') - text <- many1 (lookaheadParse (\x -> ((length x) < 2) || (take 2 x) /= "\n\n")) - string "\n\n" - -- I don't consume the ending double-newline, because the next paragraph will consume it as part of its starting double-newline. - let parsedText = fst $ leftmostLongestParse parseLine text - return (Para parsedText) + text <- many1 (lookaheadParse (\x -> ((length x) < 2) || (take 2 x) /= "\n\n")) -- Parse until a double-newline. + string "\n\n" <|> (eof >> return "") -- Consume the next double-newline or EOF. 
+ let parsedText = fst $ leftmostLongestParse (parseHeader <|> parseLine) text -- Parse either a line or a header. + -- If the paragraph is a header, return a Header token. Otherwise return a Para token. + case parsedText of + Header level token -> return (Header level token) + _ -> return (Para parsedText) + +-- Parse a document, which is multiple paragraphs. +parseDocument :: ReadP MdToken +parseDocument = (many1 parsePara) >>= (\res -> return (Document (res)))