{-# OPTIONS_GHC -Wno-unrecognised-pragmas #-} {-# HLINT ignore "Use lambda-case" #-} module MdToHTML where import Control.Applicative import Control.Monad import Data.List import Data.Ord (comparing) import Debug.Trace import Text.ParserCombinators.ReadP import Text.Printf type HeaderLevel = Int newtype URL = URL {getUrl :: String} deriving (Eq) newtype ImgPath = ImgPath {getPath :: String} deriving (Eq) parseMany :: ReadP a -> ReadP [a] parseMany = Text.ParserCombinators.ReadP.many data MdToken = Document [MdToken] | Header HeaderLevel MdToken | Para MdToken | Line [MdToken] | SingleNewline -- A single newline is rendered as a space. | Linebreak | HorizontalRule | Blockquote [MdToken] | UnordList [MdToken] | OrdList [MdToken] | Code String | Codeblock String | Link MdToken URL | Image MdToken ImgPath | Bold MdToken | Italic MdToken | Strikethrough MdToken | Unit String deriving (Eq) -- Deriving Show for MdToken instance Show MdToken where show (Document tokens) = concatMap show tokens show (Header level token) = "" ++ show token ++ "" show (Para token) = "

" ++ show token ++ "

" show (Line tokens) = concatMap show tokens show Linebreak = "
" show SingleNewline = " " show HorizontalRule = "---------" show (Blockquote tokens) = "

" ++ concatMap show tokens ++ "

" show (UnordList tokens) = "

" . append "

" show (OrdList tokens) = "

" . append "

" show (Code code) = show code show (Codeblock code) = show code show (Link txt url) = "" ++ show txt ++ "" show (Image txt imgPath) = "

" ++ show txt ++ "" show (Bold token) = "" ++ show token ++ "" show (Italic token) = "" ++ show token ++ "" show (Strikethrough token) = "~~" ++ show token ++ "~~" show (Unit unit) = printf "%s" unit instance Semigroup MdToken where a <> b = Document [a, b] instance Monoid MdToken where mempty = Unit "" -- --------------- -- Helpers leftmostLongest :: (Foldable t) => [(a, t b)] -> Maybe (a, t b) leftmostLongest xs = let lastElem = last xs filteredLst = filter (\val -> length (snd val) == length (snd lastElem)) xs in case filteredLst of [] -> Nothing (x : xs) -> Just x -- Get the first parse returned by readP_to_S that consumed the most input leftmostLongestParse :: (Monoid a) => ReadP a -> String -> (a, String) leftmostLongestParse parser input = let res = leftmostLongest $ readP_to_S parser input in case res of Nothing -> (mempty, mempty) Just x -> x -- Parse if the string that's left matches the string comparator function lookaheadParse :: (String -> Bool) -> ReadP Char lookaheadParse stringCmp = do lookahead <- look case stringCmp lookahead of True -> get False -> pfail lineToList :: MdToken -> [MdToken] lineToList (Line tokens) = tokens specialChars = "\\#*_[\n" -- Makes a parser greedy. Instead of returning all possible parses, only the longest one is returned. greedyParse :: ReadP a -> ReadP [a] greedyParse parser = do greedyParse1 parser <++ return [] -- Like greedyParse, but the parser must succeed atleast once. greedyParse1 :: ReadP a -> ReadP [a] greedyParse1 parser = do parsed1 <- parser parsed2 <- greedyParse1 parser <++ return [] return (parsed1 : parsed2) prepend :: [a] -> [a] -> [a] prepend x1 x2 = x1 ++ x2 append :: [a] -> [a] -> [a] append x1 x2 = x2 ++ x1 -- Sequence two parsers, running one after the other and returning the result. sequenceParse :: ReadP a -> ReadP a -> ReadP [a] sequenceParse p1 p2 = twoElemList <$> p1 <*> p2 where twoElemList elem1 elem2 = [elem1, elem2] -- Parses p1 until p2 succeeds, but doesn't actually consume anything from p2. -- Similar to manyTill, except manyTill's second parser actually consumes characters. manyTillLazy :: ReadP a -> ReadP b -> ReadP [a] manyTillLazy p1 p2 = do res <- p1 remaining <- look let p2res = readP_to_S p2 remaining case p2res of [] -> do res2 <- manyTillLazy p1 p2 return (res : res2) _ -> return [res] -- Parse until EOL or EOF parseTillEol :: ReadP String parseTillEol = manyTill get (void (char '\n') <++ eof) -- Takes a list of parsers. Returns a parser that will try them in -- order, moving to the next one only if the current one fails. fallthroughParser :: [ReadP a] -> ReadP a fallthroughParser [x] = x fallthroughParser (x : xs) = x <++ fallthroughParser xs -- --------------- -- Parse a markdown header, denoted by 1-6 #'s followed by some text, followed by EOL. parseHeader :: ReadP MdToken parseHeader = do skipSpaces headers <- munch1 (== '#') when (length headers > 6) pfail skipSpaces text <- munch1 (/= '\n') -- Text.ParserCombinators.ReadP.optional (char '\n') skipSpaces let parsedText = fst $ leftmostLongestParse parseLine text return (Header (length headers) parsedText) -- Parse bold text parseBold :: ReadP MdToken parseBold = parseBoldWith "**" <|> parseBoldWith "__" where parseBoldWith delim = do string delim inside <- greedyParse1 parseLineToken string delim return (Bold (Line inside)) -- Parse italic text parseItalic :: ReadP MdToken parseItalic = parseBoldWith "*" <|> parseBoldWith "_" where parseBoldWith delim = do string delim inside <- greedyParse1 parseLineToken string delim return (Italic (Line inside)) -- Parse strikethrough text parseStrikethrough :: ReadP MdToken parseStrikethrough = do string "~~" inside <- many1 parseLineToken string "~~" return (Strikethrough (Line inside)) -- Parse a link parseLink :: ReadP MdToken parseLink = do linkText <- between (string "[") (string "]") (many1 get) linkURL <- between (string "(") (string ")") (many1 get) let parsedLinkText = fst $ leftmostLongestParse parseLine linkText return $ Link parsedLinkText (URL linkURL) -- Parse a linebreak character parseLinebreak :: ReadP MdToken parseLinebreak = do char ' ' many1 (char ' ') char '\n' return Linebreak parseSingleNewline :: ReadP MdToken parseSingleNewline = do char '\n' return SingleNewline -- Parse an escaped character parseEscapedChar :: ReadP MdToken parseEscapedChar = do char '\\' escapedChar <- choice (map char specialChars) -- Parse any of the special chars. return (Unit [escapedChar]) -- Parse a character as a Unit. parseUnit :: ReadP MdToken parseUnit = do text <- satisfy (`notElem` specialChars) return (Unit [text]) -- Parse a regular string as a Unit. parseString :: ReadP MdToken parseString = do firstChar <- satisfy (/= '\n') -- Must parse at least one non-newline character here text <- munch (`notElem` specialChars) return (Unit (firstChar : text)) lineParsers :: [ReadP MdToken] lineParsers = [ parseLinebreak, parseSingleNewline, parseEscapedChar, parseBold, parseItalic, parseStrikethrough, parseLink, parseUnit ] -- A 'line' doesn't include a 'header' listLineParsers :: [ReadP MdToken] listLineParsers = [ parseLinebreak, parseEscapedChar, parseBold, parseItalic, parseStrikethrough, parseLink, parseUnit ] -- A list line cannot contain newlines. -- List of all parsers allParsers :: [ReadP MdToken] allParsers = parseHeader : lineParsers -- Parse any of the line tokens. parseLineToken :: ReadP MdToken parseLineToken = choice lineParsers -- Parse a line, consisting of one or more tokens. parseLine :: ReadP MdToken parseLine = do skipSpaces -- Fail if we have reached the end of the document. parsed <- manyTill parseLineToken eof return (Line parsed) -- Parse a paragraph, which is a 'Line' (can span multiple actual lines), separated by double-newlines. parsePara :: ReadP MdToken parsePara = do parseMany (char '\n') -- text <- many1 (lookaheadParse (\x -> ((length x) < 2) || (take 2 x) /= "\n\n")) -- Parse until a double-newline. -- string "\n\n" <|> (eof >> return "") -- Consume the next double-newline or EOF. text <- manyTill get (string "\n\n" <|> (eof >> return "")) when (null text) pfail let parsedText = fst $ leftmostLongestParse parseLine text -- Parse a line return (Para parsedText) -- Parse a line starting with '>', return the line except for the '>'. parseQuotedLine :: ReadP String parseQuotedLine = do char '>' greedyParse (char ' ' +++ char '\t') restOfLine <- munch (/= '\n') Text.ParserCombinators.ReadP.optional (char '\n') >> return "" return restOfLine -- Parse many 'quoted lines' until I see a non-quoted line. parseQuotedLines :: ReadP [String] parseQuotedLines = greedyParse1 $ do look >>= \line -> case line of ('>' : _) -> parseQuotedLine _ -> pfail -- Parse a blockquote, which is a greater-than sign followed by a paragraph. parseBlockquote :: ReadP MdToken parseBlockquote = do quotedLines <- parseQuotedLines -- remaining <- look -- let quotedLines = fst $ leftmostLongestParse parseQuotedLines remaining -- string (init $ unlines quotedLines) let parsedQuotedLines = fst $ leftmostLongestParse (many1 (parseBlockquote <++ parsePara)) (init $ unlines quotedLines) -- unlines joins the lines together with a newline, and adds a trailing newline. init removes the trailing newline. return (Blockquote parsedQuotedLines) -- Parse a nested list item. parseUListNested :: ReadP MdToken parseUListNested = do -- firstChar <- string " " <++ string "\t" -- skipSpaces -- restOfLine <- manyTill get (void (char '\n') <++ eof) -- let restOfLineParsed = fst $ leftmostLongestParse parseLine restOfLine -- return restOfLineParsed let firstCharParser = string " " <++ string "\t" let restOfLineParser = manyTill get (void (char '\n') <++ eof) lines <- greedyParse1 (firstCharParser *> restOfLineParser) let linesParsed = fst $ leftmostLongestParse parseUnorderedList (init $ unlines lines) return linesParsed -- Parse an unordered list line item. parseUListLineItem :: ReadP MdToken parseUListLineItem = do firstChar <- choice (map char ['*', '+', '-']) skipSpaces restOfLine <- manyTill get (void (char '\n') <++ eof) let restOfLineParsed = fst $ leftmostLongestParse parseLine restOfLine nestedList <- parseUListNested <++ return (Unit "") return $ Line [restOfLineParsed, nestedList] -- Parse an unordered list paragraph item. -- This is defined as a line item, followed by an empty line, followed by one or more -- lines indented by a space or tab. parseUListParaItem :: ReadP MdToken parseUListParaItem = do firstLine <- parseUListLineItem char '\n' lines <- greedyParse1 ((string " " <|> string "\t") *> parseTillEol) let res = fst $ leftmostLongestParse (greedyParse1 parsePara) (init $ unlines lines) char '\n' return $ Document (Para firstLine : res) -- I only wrap this in a document because I want some way of converting [MdToken] to MdToken, without any overhead. There is no other reason to wrap it in a Document. -- This is hacky as hell -- parsedParas <- manyTillLazy parsePara (string "\n\n" *> choice (map char "*-+")) -- return $ Document parsedParas -- I wrap this in a document because I want some way of converting [MdToken] to MdToken, without any overhead. There is no other reason to wrap it in a Document. -- Parse an unordered list item, which can be a line item or another list. parseUListItem :: ReadP MdToken parseUListItem = parseUListParaItem <++ parseUListLineItem <++ parseUListNested -- Parse an unordered list. parseUnorderedList :: ReadP MdToken parseUnorderedList = do lineItems <- greedyParse1 parseUListItem return $ UnordList lineItems -- Parse a document, which is multiple paragraphs. parseDocument :: ReadP MdToken parseDocument = do res <- manyTill (parseHeader <++ parseBlockquote <++ parseUnorderedList <++ parsePara) eof return (Document res)