From 05e5548aa95fdc60561444fa287f10c6648663bf Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Tue, 3 Jun 2025 11:19:05 -0400 Subject: [PATCH] Huge rewrite - use megaparsec instead of readP --- src/MdToHTML.hs | 279 +++++++++++++++++++++++--------------------- src/MdToHtmlTest.hs | 13 ++- 2 files changed, 156 insertions(+), 136 deletions(-) diff --git a/src/MdToHTML.hs b/src/MdToHTML.hs index 1b2bbb0..e3dc86a 100644 --- a/src/MdToHTML.hs +++ b/src/MdToHTML.hs @@ -4,24 +4,26 @@ module MdToHTML where -import Control.Applicative +import Control.Applicative hiding (many, some) import Control.Monad import Data.Char import Data.List import Data.Ord (comparing) +import qualified Data.Text as T +import Data.Void import Debug.Trace -import Text.ParserCombinators.ReadP +import Text.Megaparsec +import Text.Megaparsec.Char import Text.Printf +type Parser = Parsec Void T.Text + type HeaderLevel = Int newtype URL = URL {getUrl :: String} deriving (Eq) newtype ImgPath = ImgPath {getPath :: String} deriving (Eq) -parseMany :: ReadP a -> ReadP [a] -parseMany = Text.ParserCombinators.ReadP.many - data MdToken = Document [MdToken] | Header HeaderLevel MdToken @@ -36,8 +38,8 @@ data MdToken | Code MdToken | Codeblock String | Link MdToken URL - | Image MdToken ImgPath - | Figure MdToken ImgPath + | Image MdToken URL + | Figure MdToken URL | Bold MdToken | Italic MdToken | Strikethrough MdToken @@ -48,7 +50,7 @@ data MdToken instance Show MdToken where show (Document tokens) = concatMap show tokens show (Header level token) = "" ++ show token ++ "" - show (Para token) = "

" ++ show token ++ "

" + show (Para token) = "

" ++ show token ++ "

\n" show (Line tokens) = concatMap show tokens show Linebreak = "
" show SingleNewline = " " @@ -59,8 +61,8 @@ instance Show MdToken where show (Code code) = "" ++ show code ++ "" show (Codeblock code) = show code show (Link txt url) = "" ++ show txt ++ "" - show (Image txt imgPath) = "\""" - show (Figure txt imgPath) = "
\""
" ++ show txt ++ "
" + show (Image txt url) = "\""" + show (Figure txt url) = "
\""
" ++ show txt ++ "
" show (Bold token) = "" ++ show token ++ "" show (Italic token) = "" ++ show token ++ "" show (Strikethrough token) = "" ++ show token ++ "" @@ -83,27 +85,26 @@ leftmostLongest xs = (x : xs) -> Just x -- Get the first parse returned by readP_to_S that consumed the most input -leftmostLongestParse :: (Monoid a) => ReadP a -> String -> (a, String) +leftmostLongestParse :: (Monoid a) => Parser a -> String -> a leftmostLongestParse parser input = - let res = leftmostLongest $ readP_to_S parser input - in case res of - Nothing -> (mempty, mempty) - Just x -> x + case runParser parser "input" (T.pack input) of + (Left a) -> mempty + (Right a) -> a -specialChars = "\n\\`*_{}[]()<>#+|" +specialChars = ">\n\\`*_{}[]#+|" -escapableChars = "-~!." ++ specialChars +escapableChars = "-~!.$()" ++ specialChars -- Makes a parser greedy. Instead of returning all possible parses, only the longest one is returned. -greedyParse :: ReadP a -> ReadP [a] +greedyParse :: Parser a -> Parser [a] greedyParse parser = do - greedyParse1 parser <++ return [] + greedyParse1 parser <|> return [] -- Like greedyParse, but the parser must succeed atleast once. -greedyParse1 :: ReadP a -> ReadP [a] +greedyParse1 :: Parser a -> Parser [a] greedyParse1 parser = do parsed1 <- parser - parsed2 <- greedyParse1 parser <++ return [] + parsed2 <- greedyParse1 parser <|> return [] return (parsed1 : parsed2) prepend :: [a] -> [a] -> [a] @@ -113,122 +114,130 @@ append :: [a] -> [a] -> [a] append x1 x2 = x2 ++ x1 -- Parse until EOL or EOF -parseTillEol :: ReadP String -parseTillEol = manyTill get (void (char '\n') <++ eof) +parseTillEol :: Parser String +parseTillEol = manyTill anySingle (void (char '\n') <|> eof) -- Takes a list of parsers. Returns a parser that will try them in -- order, moving to the next one only if the current one fails. -fallthroughParser :: [ReadP a] -> ReadP a +fallthroughParser :: [Parser a] -> Parser a fallthroughParser [x] = x -fallthroughParser (x : xs) = x <++ fallthroughParser xs +fallthroughParser (x : xs) = try x <|> fallthroughParser xs + +escapeChar :: Char -> String +escapeChar '>' = ">" +escapeChar '<' = "<" +escapeChar '&' = "&" +escapeChar x = [x] + +htmlEscapeChars :: T.Text -> T.Text +htmlEscapeChars = T.concatMap (T.pack . escapeChar) -- --------------- -- Parse a markdown header, denoted by 1-6 #'s followed by some text, followed by EOL. -parseHeader :: ReadP MdToken +parseHeader :: Parser MdToken parseHeader = do - skipSpaces - headers <- munch1 (== '#') + space + headers <- greedyParse1 (char '#') when (length headers > 6) - pfail - skipSpaces - parsedText <- manyTill parseLineToken (void (char '\n') <++ eof) + empty + space + parsedText <- manyTill parseLineToken (void (char '\n') <|> eof) greedyParse (char '\n') return (Header (length headers) (Line parsedText)) +asteriskBold = T.pack "**" + +underscoreBold = T.pack "__" + -- Parse bold text -parseBold :: ReadP MdToken -parseBold = parseBoldWith "**" <|> parseBoldWith "__" +parseBold :: Parser MdToken +parseBold = parseBoldWith asteriskBold <|> parseBoldWith underscoreBold where parseBoldWith delim = do string delim - inside <- greedyParse1 parseLineToken - string delim + inside <- someTill parseLineToken $ string delim return (Bold (Line inside)) -- Parse italic text -parseItalic :: ReadP MdToken -parseItalic = parseItalicWith "*" <|> parseItalicWith "_" +parseItalic :: Parser MdToken +parseItalic = parseItalicWith '*' <|> parseItalicWith '_' where parseItalicWith delim = do - string delim - inside <- greedyParse1 parseLineToken - string delim + char delim + inside <- someTill parseLineToken (char delim) return (Italic (Line inside)) -- Parse strikethrough text -parseStrikethrough :: ReadP MdToken +parseStrikethrough :: Parser MdToken parseStrikethrough = do - string "~~" - inside <- many1 parseLineToken - string "~~" + string (T.pack "~~") + inside <- someTill parseLineToken $ string (T.pack "~~") return (Strikethrough (Line inside)) -- Parse code -parseCode :: ReadP MdToken +parseCode :: Parser MdToken parseCode = do - string "`" - inside <- many1 get - string "`" - return (Code (Unit inside)) + char '`' + inside <- manyTill (satisfy (/= '\n')) (char '`') + return (Code (Unit (concatMap escapeChar inside))) -- Parse a link -parseLink :: ReadP MdToken +parseLink :: Parser MdToken parseLink = do - linkText <- between (string "[") (string "]") (many1 get) - linkURL <- between (string "(") (string ")") (many1 get) - let parsedLinkText = fst $ leftmostLongestParse parseLine linkText - return $ Link parsedLinkText (URL linkURL) + char '[' + linkText <- someTill parseLineToken (char ']') + char '(' + linkURL <- manyTill anySingle (char ')') + return $ Link (Line linkText) (URL linkURL) -- Parse a linebreak character -parseLinebreak :: ReadP MdToken +parseLinebreak :: Parser MdToken parseLinebreak = do char ' ' - many1 (char ' ') + some (char ' ') char '\n' return Linebreak -parseSingleNewline :: ReadP MdToken +parseSingleNewline :: Parser MdToken parseSingleNewline = do char '\n' - remaining <- look - case remaining of + remaining <- getInput + case T.unpack remaining of [] -> return $ Unit "" _ -> return SingleNewline -parseImage :: ReadP MdToken +parseImage :: Parser MdToken parseImage = do char '!' - char '[' - altText <- many1 (parseEscapedChar <++ parseUnit) - char ']' - char '(' - path <- many1 get - char ')' - return $ Image (Line altText) (ImgPath path) + link <- parseLink + case link of + Link text path -> return $ Image text path + _ -> empty -- This should never be reached parseFigure = do img <- parseImage - void (string "\n\n") <++ eof + void (string doubleNewlineText) <|> eof case img of Image text path -> return $ Figure text path _ -> return img -- Parse an escaped character -parseEscapedChar :: ReadP MdToken +parseEscapedChar :: Parser MdToken parseEscapedChar = do char '\\' escapedChar <- choice (map char escapableChars) -- Parse any of the special chars. return (Unit [escapedChar]) -- Parse a character as a Unit. -parseUnit :: ReadP MdToken +parseUnit :: Parser MdToken parseUnit = do - text <- satisfy (`notElem` specialChars) + -- text <- satisfy (`notElem` specialChars) + text <- anySingle return (Unit [text]) -lineParsers :: [ReadP MdToken] +lineParsers :: [Parser MdToken] lineParsers = [ parseLinebreak, parseSingleNewline, @@ -242,7 +251,7 @@ lineParsers = parseUnit ] -- A 'line' doesn't include a 'header' -listLineParsers :: [ReadP MdToken] +listLineParsers :: [Parser MdToken] listLineParsers = [ parseLinebreak, parseEscapedChar, @@ -256,84 +265,85 @@ listLineParsers = ] -- A list line cannot contain newlines. -- List of all parsers -allParsers :: [ReadP MdToken] +allParsers :: [Parser MdToken] allParsers = parseHeader : lineParsers -- Parse any of the line tokens. -parseLineToken :: ReadP MdToken +parseLineToken :: Parser MdToken parseLineToken = fallthroughParser lineParsers -- Parse any of the list line tokens. -parseListLineToken :: ReadP MdToken +parseListLineToken :: Parser MdToken parseListLineToken = fallthroughParser listLineParsers -- Parse a line, consisting of one or more tokens. -parseLine :: ReadP MdToken +parseLine :: Parser MdToken parseLine = do - skipSpaces + space -- Fail if we have reached the end of the document. parsed <- manyTill parseLineToken eof return (Line parsed) -- Parse a paragraph, which is a 'Line' (can span multiple actual lines), separated by double-newlines. -parsePara :: ReadP MdToken +parsePara :: Parser MdToken parsePara = do - parseMany (char '\n') + space -- text <- many1 (lookaheadParse (\x -> ((length x) < 2) || (take 2 x) /= "\n\n")) -- Parse until a double-newline. -- string "\n\n" <|> (eof >> return "") -- Consume the next double-newline or EOF. - text <- manyTill get (string "\n\n" <|> (eof >> return "")) - when (null text) pfail - let parsedText = fst $ leftmostLongestParse parseLine text -- Parse a line - parseMany (char '\n') - return (Para parsedText) + parsedText <- someTill parseLineToken (try paraEnding) + many (char '\n') + return (Para (Line parsedText)) + where + paraEnding = void (char '\n' *> (char '\n' <|> lookAhead (char '>'))) <|> eof -- Parse a line starting with '>', return the line except for the '>'. -parseQuotedLine :: ReadP String +parseQuotedLine :: Parser String parseQuotedLine = do char '>' - greedyParse (char ' ' +++ char '\t') - restOfLine <- munch (/= '\n') - Text.ParserCombinators.ReadP.optional (char '\n') >> return "" + many (char ' ' <|> char '\t') + restOfLine <- many (satisfy (/= '\n')) + void (char '\n') <|> eof return restOfLine -- Parse many 'quoted lines' until I see a non-quoted line. -parseQuotedLines :: ReadP [String] -parseQuotedLines = - greedyParse1 $ do - look >>= \line -> - case line of - ('>' : _) -> parseQuotedLine - _ -> pfail +parseQuotedLines :: Parser [String] +parseQuotedLines = some parseQuotedLine + +-- some $ do +-- getInput >>= \line -> +-- case T.unpack line of +-- ('>' : _) -> parseQuotedLine +-- _ -> empty -- Parse a blockquote, which is a greater-than sign followed by a paragraph. -parseBlockquote :: ReadP MdToken +parseBlockquote :: Parser MdToken parseBlockquote = do quotedLines <- parseQuotedLines -- remaining <- look -- let quotedLines = fst $ leftmostLongestParse parseQuotedLines remaining -- string (init $ unlines quotedLines) - let parsedQuotedLines = fst $ leftmostLongestParse (many1 (parseBlockquote <++ parsePara)) (init $ unlines quotedLines) -- unlines joins the lines together with a newline, and adds a trailing newline. init removes the trailing newline. + let parsedQuotedLines = leftmostLongestParse (some (parseBlockquote <|> parsePara)) (init $ unlines quotedLines) -- unlines joins the lines together with a newline, and adds a trailing newline. init removes the trailing newline. return (Blockquote parsedQuotedLines) -- Parse a nested list item. -parseListNested :: ReadP MdToken +parseListNested :: Parser MdToken parseListNested = do - let firstCharParser = string " " <++ string "\t" - let restOfLineParser = manyTill get (void (char '\n') <++ eof) + let firstCharParser = string (T.pack " ") <|> string (T.pack "\t") + let restOfLineParser = manyTill anySingle (void (char '\n') <|> eof) lines <- greedyParse1 (firstCharParser *> restOfLineParser) - let linesParsed = fst $ leftmostLongestParse (parseUnorderedList <++ parseOrderedList) (init $ unlines lines) - when (null (show linesParsed)) pfail + let linesParsed = leftmostLongestParse (parseUnorderedList <|> parseOrderedList) (init $ unlines lines) + when (null (show linesParsed)) empty return linesParsed -- Parse an unordered list line item. -parseUListLineItem :: ReadP MdToken +parseUListLineItem :: Parser MdToken parseUListLineItem = do firstChar <- choice (map char ['*', '+', '-']) char ' ' -- At least one space between list indicator and list text. parseListLineItemCommon -- Parse an ordered list line item. -parseOListLineItem :: ReadP MdToken +parseOListLineItem :: Parser MdToken parseOListLineItem = do num <- greedyParse1 (satisfy isDigit) char '.' @@ -341,23 +351,22 @@ parseOListLineItem = do parseListLineItemCommon -- Common code for parsing list line items -parseListLineItemCommon :: ReadP MdToken +parseListLineItemCommon :: Parser MdToken parseListLineItemCommon = do - skipSpaces - restOfLine <- many1 parseListLineToken - void (char '\n') <++ eof - nestedList <- parseListNested <++ return (Unit "") + space + restOfLine <- manyTill parseListLineToken (void (char '\n') <|> eof) + nestedList <- parseListNested <|> return (Unit "") return $ Line [Line restOfLine, nestedList] -- Parse an unordered list paragraph item. -parseUListParaItem :: ReadP MdToken +parseUListParaItem :: Parser MdToken parseUListParaItem = do firstLine <- parseUListLineItem res <- parseListParaItemCommon return $ Document (Para firstLine : res) -- I only wrap this in a document because I want some way of converting [MdToken] to MdToken, without any overhead. There is no other reason to wrap it in a Document. -- Parse an unordered list paragraph item. -parseOListParaItem :: ReadP MdToken +parseOListParaItem :: Parser MdToken parseOListParaItem = do firstLine <- parseOListLineItem res <- parseListParaItemCommon @@ -367,48 +376,54 @@ parseOListParaItem = do -- A list paragraph item is defined as a line item, followed by an empty line, followed by one or more -- lines indented by a space or tab. -- A list paragraph item can also be a blockquote. -parseListParaItemCommon :: ReadP [MdToken] +parseListParaItemCommon :: Parser [MdToken] parseListParaItemCommon = do char '\n' - lines <- greedyParse1 ((string " " <|> string "\t") *> parseTillEol) - let res = fst $ leftmostLongestParse (greedyParse1 parseBlockquote <++ greedyParse1 parsePara) (init $ unlines lines) + lines <- greedyParse1 ((string (T.pack " ") <|> string (T.pack "\t")) *> parseTillEol) + let res = leftmostLongestParse (greedyParse1 parseBlockquote <|> greedyParse1 parsePara) (init $ unlines lines) char '\n' return res -- I only wrap this in a document because I want some way of converting [MdToken] to MdToken, without any overhead. There is no other reason to wrap it in a Document. -- Parse an unordered list item, which can be a line item or another list. -parseUListItem :: ReadP MdToken -parseUListItem = parseUListParaItem <++ parseUListLineItem +parseUListItem :: Parser MdToken +parseUListItem = try parseUListParaItem <|> parseUListLineItem -- Parse an unordered list. -parseUnorderedList :: ReadP MdToken +parseUnorderedList :: Parser MdToken parseUnorderedList = do - lineItems <- greedyParse1 parseUListItem - void (char '\n') <++ eof -- A list must end in an extra newline or eof + lineItems <- some parseUListItem + void (char '\n') <|> eof -- A list must end in an extra newline or eof return $ UnordList lineItems -- -------- -parseOListItem :: ReadP MdToken -parseOListItem = parseOListParaItem <++ parseOListLineItem +parseOListItem :: Parser MdToken +parseOListItem = try parseOListParaItem <|> parseOListLineItem -- Parses the first element of an ordered list, which must start with '1.' -parseFirstOListItem :: ReadP MdToken +parseFirstOListItem :: Parser MdToken parseFirstOListItem = do - remaining <- look - when (take 2 remaining /= "1.") pfail + remaining <- getInput + when (take 2 (T.unpack remaining) /= "1.") empty parseOListLineItem -parseOrderedList :: ReadP MdToken +parseOrderedList :: Parser MdToken parseOrderedList = do firstLine <- parseFirstOListItem - lineItems <- greedyParse1 parseOListItem - void (char '\n') <++ eof + lineItems <- some parseOListItem + void (char '\n') <|> eof return $ OrdList (firstLine : lineItems) -parseHorizontalRule :: ReadP MdToken -parseHorizontalRule = string "---" *> (void (string "\n\n") <++ eof) *> return HorizontalRule +horizontalRuleText :: T.Text +horizontalRuleText = T.pack "---" + +doubleNewlineText :: T.Text +doubleNewlineText = T.pack "\n\n" + +parseHorizontalRule :: Parser MdToken +parseHorizontalRule = string horizontalRuleText *> (void (string doubleNewlineText) <|> eof) *> return HorizontalRule -documentParsers :: [ReadP MdToken] +documentParsers :: [Parser MdToken] documentParsers = [ parseHorizontalRule, parseHeader, @@ -420,7 +435,7 @@ documentParsers = ] -- Parse a document, which is multiple paragraphs. -parseDocument :: ReadP MdToken +parseDocument :: Parser MdToken parseDocument = do res <- manyTill (fallthroughParser documentParsers) eof return (Document res) diff --git a/src/MdToHtmlTest.hs b/src/MdToHtmlTest.hs index 239eb86..a42c6b9 100644 --- a/src/MdToHtmlTest.hs +++ b/src/MdToHtmlTest.hs @@ -7,7 +7,7 @@ check_equal :: String -> String -> String -> Test check_equal desc expected actual = TestCase (assertEqual desc expected actual) convert :: String -> String -convert md = show . fst $ leftmostLongestParse parseDocument md +convert md = show $ leftmostLongestParse parseDocument md headerTests = TestList @@ -24,8 +24,8 @@ boldTests = [ check_equal "Should convert bold" "

Hello

" (convert "__Hello__"), check_equal "Should convert italic" "

Hello

" (convert "_Hello_"), check_equal "Should convert bold and italic in a sentence" "

It is a wonderful day

" (convert "It _is_ a __wonderful__ day"), - check_equal "Should convert nested bold and italic" "

Bold then Italic

" (convert "**Bold then *Italic***"), - check_equal "Should convert nested bold and italic" "

Italic then Bold

" (convert "*Italic then **Bold***") + check_equal "Should convert nested bold and italic" "

Bold then Italic

" (convert "**Bold then _Italic_**"), + check_equal "Should convert nested bold and italic" "

Italic then Bold

" (convert "*Italic then __Bold__*") ] strikethroughTests = @@ -93,11 +93,15 @@ orderedListTests = check_equal "Unordered list in ordered list" "
  1. Item 1
  2. Item 2
    • Item 1
    • Item 2
  3. Item 3
" (convert "1. Item 1\n2. Item 2\n - Item 1\n * Item 2\n4. Item 3") ] +htmlTests = + TestList + [check_equal "Convert HTML element" "

a

" (convert "
a
")] + codeTests = TestList [ check_equal "Code by itself" "

Hello world!

" (convert "`Hello world!`"), check_equal "Code in a paragraph" "

The following text is code

" (convert "The following `text` is code"), - check_equal "Code across paragraphs (shouldn't work" "

" (convert "`Incomplete\n\nCode`") -- At the moment, this is just treated as a syntax error, so nothing is rendered. + check_equal "Code across paragraphs (shouldn't work)" "

`Incomplete

Code`

" (convert "`Incomplete\n\nCode`") -- At the moment, this is just treated as a syntax error, so nothing is rendered. ] imageTests = @@ -149,6 +153,7 @@ tests = unorderedListTests, orderedListTests, imageTests, + htmlTests, figureTests, codeTests, horizontalRuleTests,