{-# OPTIONS_GHC -Wno-unrecognised-pragmas #-} {-# HLINT ignore "Use lambda-case" #-} module MdToHTML where import Control.Applicative hiding (many, some) import Control.Monad import Data.Char import Data.List import Data.Ord (comparing) import qualified Data.Text as T import Data.Void import Debug.Trace import Text.Megaparsec import Text.Megaparsec.Char import Text.Printf type Parser = Parsec Void T.Text type HeaderLevel = Int newtype URL = URL {getUrl :: String} deriving (Eq) newtype ImgPath = ImgPath {getPath :: String} deriving (Eq) data MdToken = Document [MdToken] | Header HeaderLevel MdToken | Para MdToken | Line [MdToken] | SingleNewline -- A single newline is rendered as a space. | Linebreak | HorizontalRule | Blockquote [MdToken] | UnordList [MdToken] | OrdList [MdToken] | Code MdToken | Codeblock String | Link MdToken URL | Image MdToken URL | Figure MdToken URL | Bold MdToken | Italic MdToken | Strikethrough MdToken | Unit String deriving (Eq) -- Deriving Show for MdToken instance Show MdToken where show (Document tokens) = concatMap show tokens show (Header level token) = "" ++ show token ++ "" show (Para token) = "

" ++ show token ++ "

\n" show (Line tokens) = concatMap show tokens show Linebreak = "
" show SingleNewline = " " show HorizontalRule = "

" show (Blockquote tokens) = "

" ++ concatMap show tokens ++ "

" show (UnordList tokens) = "

" . append "

" show (OrdList tokens) = "

" . append "

" show (Code code) = "" ++ show code ++ "" show (Codeblock code) = show code show (Link txt url) = "" ++ show txt ++ "" show (Image txt url) = " $\""$ " show (Figure txt url) = "

" show (Bold token) = "" ++ show token ++ "" show (Italic token) = "" ++ show token ++ "" show (Strikethrough token) = "~~" ++ show token ++ "~~" show (Unit unit) = printf "%s" unit instance Semigroup MdToken where a <> b = Document [a, b] instance Monoid MdToken where mempty = Unit "" -- --------------- -- Helpers leftmostLongest :: (Foldable t) => [(a, t b)] -> Maybe (a, t b) leftmostLongest xs = let lastElem = last xs filteredLst = filter (\val -> length (snd val) == length (snd lastElem)) xs in case filteredLst of [] -> Nothing (x : xs) -> Just x -- Get the first parse returned by readP_to_S that consumed the most input leftmostLongestParse :: (Monoid a) => Parser a -> String -> a leftmostLongestParse parser input = case runParser parser "input" (T.pack input) of (Left a) -> mempty (Right a) -> a specialChars = ">\n\\`*_{}[]#+|" escapableChars = "-~!.$()" ++ specialChars -- Makes a parser greedy. Instead of returning all possible parses, only the longest one is returned. greedyParse :: Parser a -> Parser [a] greedyParse parser = do greedyParse1 parser <|> return [] -- Like greedyParse, but the parser must succeed atleast once. greedyParse1 :: Parser a -> Parser [a] greedyParse1 parser = do parsed1 <- parser parsed2 <- greedyParse1 parser <|> return [] return (parsed1 : parsed2) prepend :: [a] -> [a] -> [a] prepend x1 x2 = x1 ++ x2 append :: [a] -> [a] -> [a] append x1 x2 = x2 ++ x1 -- Parse until EOL or EOF parseTillEol :: Parser String parseTillEol = manyTill anySingle (void (char '\n') <|> eof) -- Takes a list of parsers. Returns a parser that will try them in -- order, moving to the next one only if the current one fails. fallthroughParser :: [Parser a] -> Parser a fallthroughParser [x] = x fallthroughParser (x : xs) = try x <|> fallthroughParser xs escapeChar :: Char -> String escapeChar '>' = ">" escapeChar '<' = "<" escapeChar '&' = "&" escapeChar x = [x] htmlEscapeChars :: T.Text -> T.Text htmlEscapeChars = T.concatMap (T.pack . escapeChar) -- --------------- -- Parse a markdown header, denoted by 1-6 #'s followed by some text, followed by EOL. parseHeader :: Parser MdToken parseHeader = do space headers <- greedyParse1 (char '#') when (length headers > 6) empty space parsedText <- manyTill parseLineToken (void (char '\n') <|> eof) greedyParse (char '\n') return (Header (length headers) (Line parsedText)) asteriskBold = T.pack "**" underscoreBold = T.pack "__" -- Parse bold text parseBold :: Parser MdToken parseBold = parseBoldWith asteriskBold <|> parseBoldWith underscoreBold where parseBoldWith delim = do string delim inside <- someTill parseLineToken $ string delim return (Bold (Line inside)) -- Parse italic text parseItalic :: Parser MdToken parseItalic = parseItalicWith '*' <|> parseItalicWith '_' where parseItalicWith delim = do char delim inside <- someTill parseLineToken (char delim) return (Italic (Line inside)) -- Parse strikethrough text parseStrikethrough :: Parser MdToken parseStrikethrough = do string (T.pack "~~") inside <- someTill parseLineToken $ string (T.pack "~~") return (Strikethrough (Line inside)) -- Parse code parseCode :: Parser MdToken parseCode = do char '`' inside <- manyTill (satisfy (/= '\n')) (char '`') return (Code (Unit (concatMap escapeChar inside))) -- Parse a link parseLink :: Parser MdToken parseLink = do char '[' linkText <- someTill parseLineToken (char ']') char '(' linkURL <- manyTill anySingle (char ')') return $ Link (Line linkText) (URL linkURL) -- Parse a linebreak character parseLinebreak :: Parser MdToken parseLinebreak = do char ' ' some (char ' ') char '\n' return Linebreak parseSingleNewline :: Parser MdToken parseSingleNewline = do char '\n' remaining <- getInput case T.unpack remaining of [] -> return $ Unit "" _ -> return SingleNewline parseImage :: Parser MdToken parseImage = do char '!' link <- parseLink case link of Link text path -> return $ Image text path _ -> empty -- This should never be reached parseFigure = do img <- parseImage void (string doubleNewlineText) <|> eof case img of Image text path -> return $ Figure text path _ -> return img -- Parse an escaped character parseEscapedChar :: Parser MdToken parseEscapedChar = do char '\\' escapedChar <- choice (map char escapableChars) -- Parse any of the special chars. return (Unit [escapedChar]) -- Parse a character as a Unit. parseUnit :: Parser MdToken parseUnit = do -- text <- satisfy (`notElem` specialChars) text <- anySingle return (Unit [text]) lineParsers :: [Parser MdToken] lineParsers = [ parseLinebreak, parseSingleNewline, parseEscapedChar, parseCode, parseImage, parseBold, parseItalic, parseStrikethrough, parseLink, parseUnit ] -- A 'line' doesn't include a 'header' listLineParsers :: [Parser MdToken] listLineParsers = [ parseLinebreak, parseEscapedChar, parseCode, parseImage, parseBold, parseItalic, parseStrikethrough, parseLink, parseUnit ] -- A list line cannot contain newlines. -- List of all parsers allParsers :: [Parser MdToken] allParsers = parseHeader : lineParsers -- Parse any of the line tokens. parseLineToken :: Parser MdToken parseLineToken = fallthroughParser lineParsers -- Parse any of the list line tokens. parseListLineToken :: Parser MdToken parseListLineToken = fallthroughParser listLineParsers -- Parse a line, consisting of one or more tokens. parseLine :: Parser MdToken parseLine = do space -- Fail if we have reached the end of the document. parsed <- manyTill parseLineToken eof return (Line parsed) -- Parse a paragraph, which is a 'Line' (can span multiple actual lines), separated by double-newlines. parsePara :: Parser MdToken parsePara = do space -- text <- many1 (lookaheadParse (\x -> ((length x) < 2) || (take 2 x) /= "\n\n")) -- Parse until a double-newline. -- string "\n\n" <|> (eof >> return "") -- Consume the next double-newline or EOF. parsedText <- someTill parseLineToken (try paraEnding) many (char '\n') return (Para (Line parsedText)) where paraEnding = void (char '\n' *> (char '\n' <|> lookAhead (char '>'))) <|> eof -- Parse a line starting with '>', return the line except for the '>'. parseQuotedLine :: Parser String parseQuotedLine = do char '>' many (char ' ' <|> char '\t') restOfLine <- many (satisfy (/= '\n')) void (char '\n') <|> eof return restOfLine -- Parse many 'quoted lines' until I see a non-quoted line. parseQuotedLines :: Parser [String] parseQuotedLines = some parseQuotedLine -- some $ do -- getInput >>= \line -> -- case T.unpack line of -- ('>' : _) -> parseQuotedLine -- _ -> empty -- Parse a blockquote, which is a greater-than sign followed by a paragraph. parseBlockquote :: Parser MdToken parseBlockquote = do quotedLines <- parseQuotedLines -- remaining <- look -- let quotedLines = fst $ leftmostLongestParse parseQuotedLines remaining -- string (init $ unlines quotedLines) let parsedQuotedLines = leftmostLongestParse (some (parseBlockquote <|> parsePara)) (init $ unlines quotedLines) -- unlines joins the lines together with a newline, and adds a trailing newline. init removes the trailing newline. return (Blockquote parsedQuotedLines) -- Parse a nested list item. parseListNested :: Parser MdToken parseListNested = do let firstCharParser = string (T.pack " ") <|> string (T.pack "\t") let restOfLineParser = manyTill anySingle (void (char '\n') <|> eof) lines <- greedyParse1 (firstCharParser *> restOfLineParser) let linesParsed = leftmostLongestParse (parseUnorderedList <|> parseOrderedList) (init $ unlines lines) when (null (show linesParsed)) empty return linesParsed -- Parse an unordered list line item. parseUListLineItem :: Parser MdToken parseUListLineItem = do firstChar <- choice (map char ['*', '+', '-']) char ' ' -- At least one space between list indicator and list text. parseListLineItemCommon -- Parse an ordered list line item. parseOListLineItem :: Parser MdToken parseOListLineItem = do num <- greedyParse1 (satisfy isDigit) char '.' char ' ' -- At least one space between list indicator and list text. parseListLineItemCommon -- Common code for parsing list line items parseListLineItemCommon :: Parser MdToken parseListLineItemCommon = do space restOfLine <- manyTill parseListLineToken (void (char '\n') <|> eof) nestedList <- parseListNested <|> return (Unit "") return $ Line [Line restOfLine, nestedList] -- Parse an unordered list paragraph item. parseUListParaItem :: Parser MdToken parseUListParaItem = do firstLine <- parseUListLineItem res <- parseListParaItemCommon return $ Document (Para firstLine : res) -- I only wrap this in a document because I want some way of converting [MdToken] to MdToken, without any overhead. There is no other reason to wrap it in a Document. -- Parse an unordered list paragraph item. parseOListParaItem :: Parser MdToken parseOListParaItem = do firstLine <- parseOListLineItem res <- parseListParaItemCommon return $ Document (Para firstLine : res) -- I only wrap this in a document because I want some way of converting [MdToken] to MdToken, without any overhead. There is no other reason to wrap it in a Document. -- Common code for parsing list paragraph items. -- A list paragraph item is defined as a line item, followed by an empty line, followed by one or more -- lines indented by a space or tab. -- A list paragraph item can also be a blockquote. parseListParaItemCommon :: Parser [MdToken] parseListParaItemCommon = do char '\n' lines <- greedyParse1 ((string (T.pack " ") <|> string (T.pack "\t")) *> parseTillEol) let res = leftmostLongestParse (greedyParse1 parseBlockquote <|> greedyParse1 parsePara) (init $ unlines lines) char '\n' return res -- I only wrap this in a document because I want some way of converting [MdToken] to MdToken, without any overhead. There is no other reason to wrap it in a Document. -- Parse an unordered list item, which can be a line item or another list. parseUListItem :: Parser MdToken parseUListItem = try parseUListParaItem <|> parseUListLineItem -- Parse an unordered list. parseUnorderedList :: Parser MdToken parseUnorderedList = do lineItems <- some parseUListItem void (char '\n') <|> eof -- A list must end in an extra newline or eof return $ UnordList lineItems -- -------- parseOListItem :: Parser MdToken parseOListItem = try parseOListParaItem <|> parseOListLineItem -- Parses the first element of an ordered list, which must start with '1.' parseFirstOListItem :: Parser MdToken parseFirstOListItem = do remaining <- getInput when (take 2 (T.unpack remaining) /= "1.") empty parseOListLineItem parseOrderedList :: Parser MdToken parseOrderedList = do firstLine <- parseFirstOListItem lineItems <- some parseOListItem void (char '\n') <|> eof return $ OrdList (firstLine : lineItems) horizontalRuleText :: T.Text horizontalRuleText = T.pack "---" doubleNewlineText :: T.Text doubleNewlineText = T.pack "\n\n" parseHorizontalRule :: Parser MdToken parseHorizontalRule = string horizontalRuleText *> (void (string doubleNewlineText) <|> eof) *> return HorizontalRule documentParsers :: [Parser MdToken] documentParsers = [ parseHorizontalRule, parseHeader, parseBlockquote, parseUnorderedList, parseOrderedList, parseFigure, parsePara ] -- Parse a document, which is multiple paragraphs. parseDocument :: Parser MdToken parseDocument = do res <- manyTill (fallthroughParser documentParsers) eof return (Document res)