{-# OPTIONS_GHC -Wno-unrecognised-pragmas #-} {-# HLINT ignore "Use lambda-case" #-} module MdToHTML where import Control.Applicative hiding (many, some) import Control.Monad import Control.Monad.Combinators (count) import Data.Char import Data.List import Data.Ord (comparing) import Data.String.Utils import qualified Data.Text as T import Data.Void import Debug.Trace import Text.Megaparsec import Text.Megaparsec.Char import Text.Printf import Text.Wrap type Parser = Parsec Void T.Text type HeaderLevel = Int type CssClass = String newtype URL = URL {getUrl :: String} deriving (Eq) newtype ImgPath = ImgPath {getPath :: String} deriving (Eq) data MdToken = Document [MdToken] | Header HeaderLevel MdToken | Para MdToken | Line [MdToken] | SingleNewline -- A single newline is rendered as a space. | Linebreak | HorizontalRule | Blockquote [MdToken] | UnordList [MdToken] | OrdList [MdToken] | Code MdToken | Table [[MdToken]] | Codeblock MdToken | Link MdToken URL | Image MdToken URL (Maybe [CssClass]) | Figure MdToken URL (Maybe [CssClass]) | Bold MdToken | Italic MdToken | Strikethrough MdToken | Unit String deriving (Eq) -- Deriving Show for MdToken instance Show MdToken where show (Document tokens) = concatMap show tokens show (Header level token) = "" ++ show token ++ "" show (Para token) = "

" ++ show token ++ "

" show (Line tokens) = concatMap show tokens show Linebreak = "
" show SingleNewline = " " show HorizontalRule = "

" show (Blockquote tokens) = "

" ++ concatMap show tokens ++ "

" show (UnordList tokens) = "

" . append "

" show (OrdList tokens) = "

" . append "

" show (Code code) = "" ++ strip (show code) ++ "" show (Table (thead : tokenGrid)) = "" ++ concatMap (\x -> "") thead ++ "" ++ "" ++ concatMap (\x -> "" ++ concatMap (\y -> "") x ++ "") tokenGrid ++ "

" ++ rstrip (show x) ++ "
" ++ rstrip (show y) ++ "

" show (Codeblock code) = "

" ++ show code ++ "

" show (Link txt url) = "" ++ show txt ++ "" show (Image txt url cssClasses) = " $\""$ " class=\"" ++ unwords classes ++ "\"") cssClasses ++ "/>" show (Figure txt url cssClasses) = "

" show (Bold token) = "" ++ show token ++ "" show (Italic token) = "" ++ show token ++ "" show (Strikethrough token) = "~~" ++ show token ++ "~~" show (Unit unit) = printf "%s" unit -- Pretty print the given token into a string. -- This is the same as calling 'show' for most tokens, but is different for paragraphs and tables, -- which have newlines inserted into them. prettyPrint :: MdToken -> String prettyPrint (Para token) = "

" ++ T.unpack (wrapText defaultWrapSettings 70 (T.pack $ prettyPrint token)) ++ "

\n" prettyPrint (Table (thead : tokenGrid)) = "\n\n\n" ++ concatMap (\x -> "\n") thead ++ "\n\n" ++ "\n" ++ concatMap (\x -> "\n" ++ concatMap (\y -> "\n") x ++ "\n") tokenGrid ++ "\n

" ++ rstrip (prettyPrint x) ++ "
" ++ rstrip (prettyPrint y) ++ "

\n" prettyPrint Linebreak = "
\n" prettyPrint (Line tokens) = concatMap prettyPrint tokens prettyPrint (Document tokens) = concatMap prettyPrint tokens prettyPrint token = show token instance Semigroup MdToken where a <> b = Document [a, b] instance Monoid MdToken where mempty = Unit "" -- --------------- -- Helpers leftmostLongest :: (Foldable t) => [(a, t b)] -> Maybe (a, t b) leftmostLongest xs = let lastElem = last xs filteredLst = filter (\val -> length (snd val) == length (snd lastElem)) xs in case filteredLst of [] -> Nothing (x : xs) -> Just x -- Get the first parse returned by readP_to_S that consumed the most input leftmostLongestParse :: (Monoid a) => Parser a -> String -> a leftmostLongestParse parser input = case runParser parser "input" (T.pack input) of (Left a) -> mempty (Right a) -> a specialChars = ">\n\\`*_{}[]#+|" escapableChars = "-~!.$()" ++ specialChars -- Makes a parser greedy. Instead of returning all possible parses, only the longest one is returned. greedyParse :: Parser a -> Parser [a] greedyParse parser = do greedyParse1 parser <|> return [] -- Like greedyParse, but the parser must succeed atleast once. greedyParse1 :: Parser a -> Parser [a] greedyParse1 parser = do parsed1 <- parser parsed2 <- greedyParse1 parser <|> return [] return (parsed1 : parsed2) prepend :: [a] -> [a] -> [a] prepend x1 x2 = x1 ++ x2 append :: [a] -> [a] -> [a] append x1 x2 = x2 ++ x1 -- Parse until EOL or EOF parseTillEol :: Parser String parseTillEol = manyTill anySingle (void (char '\n') <|> eof) -- Takes a list of parsers. Returns a parser that will try them in -- order, moving to the next one only if the current one fails. fallthroughParser :: [Parser a] -> Parser a fallthroughParser [x] = x fallthroughParser (x : xs) = try x <|> fallthroughParser xs escapeChar :: Char -> String escapeChar '>' = ">" escapeChar '<' = "<" escapeChar '&' = "&" escapeChar x = [x] htmlEscapeChars :: T.Text -> T.Text htmlEscapeChars = T.concatMap (T.pack . escapeChar) -- -- Wraps a list of words after (at most) the given number of characters, trying to prevent word-breaks -- wordwrap :: Int -> String -> String -- wordwrap wraplength str = if (length str) < wraplength -- then str -- else -- let spaceIndex = lastgtSpaceIndex 0 (takeRev (length str) - wraplength str) -- -- where -- takeRev n = (reverse . take n . reverse) -- lastSpaceIndex counter str = case str of -- [] -> counter -- x:xs -> if (isSpace x) counter else lastSpaceIndex counter+1 xs -- --------------- -- Parse a markdown header, denoted by 1-6 #'s followed by some text, followed by EOL. parseHeader :: Parser MdToken parseHeader = do space headers <- greedyParse1 (char '#') when (length headers > 6) empty space parsedText <- manyTill parseLineToken (void (char '\n') <|> eof) greedyParse (char '\n') return (Header (length headers) (Line parsedText)) asteriskBold = T.pack "**" underscoreBold = T.pack "__" -- Parse bold text parseBold :: Parser MdToken parseBold = parseBoldWith asteriskBold <|> parseBoldWith underscoreBold where parseBoldWith delim = do string delim inside <- someTill parseLineToken $ string delim return (Bold (Line inside)) -- Parse italic text parseItalic :: Parser MdToken parseItalic = parseItalicWith '*' <|> parseItalicWith '_' where parseItalicWith delim = do char delim inside <- someTill parseLineToken (char delim) return (Italic (Line inside)) -- Parse strikethrough text parseStrikethrough :: Parser MdToken parseStrikethrough = do string (T.pack "~~") inside <- someTill parseLineToken $ string (T.pack "~~") return (Strikethrough (Line inside)) -- Parse code parseCode :: Parser MdToken parseCode = do opening <- some $ char '`' inside <- someTill (satisfy (/= '\n')) (char '`') closing <- count (length opening - 1) (char '`') return (Code (Unit (concatMap escapeChar inside))) -- Parse a link parseLink :: Parser MdToken parseLink = do char '[' linkText <- manyTill parseLineToken (char ']') char '(' linkURL <- manyTill anySingle (char ')') return $ Link (Line linkText) (URL linkURL) -- Parse a linebreak character parseLinebreak :: Parser MdToken parseLinebreak = parseLinebreakSpace <|> parseLinebreakBackslash where parseLinebreakSpace = do char ' ' some (char ' ') char '\n' return Linebreak parseLinebreakBackslash = try $ do char '\\' char '\n' return Linebreak parseTableRow :: Parser [MdToken] parseTableRow = do char '|' row <- some (many (satisfy (\x -> x == ' ' || x == '\t')) *> someTill parseListLineToken (char '|')) return (map Line row) parseTable :: Parser MdToken parseTable = do tableHead <- parseTableRow char '\n' char '|' sepEndBy1 (some (char '-')) (char '|') *> char '\n' tableBody <- sepEndBy parseTableRow (char '\n') many (char '\n') -- Parse trailing newlines, if any return $ Table (tableHead : tableBody) parseSingleNewline :: Parser MdToken parseSingleNewline = do char '\n' remaining <- getInput case T.unpack remaining of [] -> return $ Unit "" _ -> return SingleNewline parseCssClasses :: Parser [CssClass] parseCssClasses = do char '{' classes <- some parseCssClass char '}' return classes where parseCssClass :: Parser CssClass parseCssClass = do char '.' let firstLetterParser = char '_' <|> char '-' <|> label "letter" (satisfy isAlpha) cssClassFirstLetter <- firstLetterParser cssClass <- many (firstLetterParser <|> label "digit" (satisfy isDigit)) space return (cssClassFirstLetter : cssClass) parseImage :: Parser MdToken parseImage = do char '!' link <- parseLink cssClasses <- optional $ try parseCssClasses case link of Link text path -> return $ Image text path cssClasses _ -> empty -- This should never be reached parseFigure = do img <- parseImage void (string doubleNewlineText) <|> eof case img of Image text path cssClasses -> return $ Figure text path cssClasses _ -> return img -- Parse an escaped character parseEscapedChar :: Parser MdToken parseEscapedChar = do char '\\' escapedChar <- choice (map char escapableChars) -- Parse any of the special chars. return (Unit [escapedChar]) -- Parse a character as a Unit. parseUnit :: Parser MdToken parseUnit = do -- text <- satisfy (`notElem` specialChars) text <- anySingle return (Unit [text]) -- Parse any character except a newline parseUnitExceptNewline :: Parser MdToken parseUnitExceptNewline = do -- text <- satisfy (`notElem` specialChars) text <- satisfy (/= '\n') return (Unit [text]) lineParsers :: [Parser MdToken] lineParsers = [ parseLinebreak, parseSingleNewline, parseEscapedChar, parseCode, parseImage, parseBold, parseItalic, parseStrikethrough, parseLink, parseUnit ] -- A 'line' doesn't include a 'header' lineParsersWithoutNewline :: [Parser MdToken] lineParsersWithoutNewline = [ parseEscapedChar, parseCode, parseImage, parseBold, parseItalic, parseStrikethrough, parseLink, parseUnitExceptNewline ] -- A list line cannot contain newlines. -- List of all parsers allParsers :: [Parser MdToken] allParsers = parseHeader : lineParsers -- Parse any of the line tokens. parseLineToken :: Parser MdToken parseLineToken = fallthroughParser lineParsers -- Parse any of the list line tokens. parseListLineToken :: Parser MdToken parseListLineToken = fallthroughParser lineParsersWithoutNewline -- Parse a line, consisting of one or more tokens. parseLine :: Parser MdToken parseLine = do space -- Fail if we have reached the end of the document. parsed <- manyTill parseLineToken eof return (Line parsed) -- Parse a paragraph, which is a 'Line' (can span multiple actual lines), separated by double-newlines. parsePara :: Parser MdToken parsePara = do space -- text <- many1 (lookaheadParse (\x -> ((length x) < 2) || (take 2 x) /= "\n\n")) -- Parse until a double-newline. -- string "\n\n" <|> (eof >> return "") -- Consume the next double-newline or EOF. parsedText <- someTill parseLineToken (try paraEnding) many (char '\n') return (Para (Line parsedText)) where paraEnding = void (char '\n' *> (char '\n' <|> lookAhead (char '>'))) <|> eof -- Parse a line starting with '>', return the line except for the '>'. parseQuotedLine :: Parser String parseQuotedLine = do char '>' many (char ' ' <|> char '\t') restOfLine <- many (satisfy (/= '\n')) void (char '\n') <|> eof return restOfLine -- Parse many 'quoted lines' until I see a non-quoted line. parseQuotedLines :: Parser [String] parseQuotedLines = some parseQuotedLine -- some $ do -- getInput >>= \line -> -- case T.unpack line of -- ('>' : _) -> parseQuotedLine -- _ -> empty -- Parse a blockquote, which is a greater-than sign followed by a paragraph. parseBlockquote :: Parser MdToken parseBlockquote = do quotedLines <- parseQuotedLines -- remaining <- look -- let quotedLines = fst $ leftmostLongestParse parseQuotedLines remaining -- string (init $ unlines quotedLines) let parsedQuotedLines = leftmostLongestParse (some (parseBlockquote <|> parsePara)) (init $ unlines quotedLines) -- unlines joins the lines together with a newline, and adds a trailing newline. init removes the trailing newline. return (Blockquote parsedQuotedLines) -- Parse a nested list item. parseListNested :: Parser MdToken parseListNested = do let firstCharParser = string (T.pack " ") <|> string (T.pack "\t") let restOfLineParser = manyTill anySingle (void (char '\n') <|> eof) lines <- greedyParse1 (firstCharParser *> restOfLineParser) let linesParsed = leftmostLongestParse (parseUnorderedList <|> parseOrderedList) (init $ unlines lines) when (null (show linesParsed)) empty return linesParsed -- Parse an unordered list line item. parseUListLineItem :: Parser MdToken parseUListLineItem = do firstChar <- choice (map char ['*', '+', '-']) char ' ' -- At least one space between list indicator and list text. parseListLineItemCommon -- Parse an ordered list line item. parseOListLineItem :: Parser MdToken parseOListLineItem = do num <- greedyParse1 (satisfy isDigit) char '.' char ' ' -- At least one space between list indicator and list text. parseListLineItemCommon -- Common code for parsing list line items parseListLineItemCommon :: Parser MdToken parseListLineItemCommon = do space restOfLine <- manyTill parseListLineToken (void (char '\n') <|> eof) nestedList <- try parseListNested <|> return (Unit "") return $ Line [Line restOfLine, nestedList] -- Parse an unordered list paragraph item. parseUListParaItem :: Parser MdToken parseUListParaItem = do firstLine <- parseUListLineItem res <- parseListParaItemCommon return $ Document (Para firstLine : res) -- I only wrap this in a document because I want some way of converting [MdToken] to MdToken, without any overhead. There is no other reason to wrap it in a Document. -- Parse an unordered list paragraph item. parseOListParaItem :: Parser MdToken parseOListParaItem = do firstLine <- parseOListLineItem res <- parseListParaItemCommon return $ Document (Para firstLine : res) -- I only wrap this in a document because I want some way of converting [MdToken] to MdToken, without any overhead. There is no other reason to wrap it in a Document. -- Common code for parsing list paragraph items. -- A list paragraph item is defined as a line item, followed by an empty line, followed by one or more -- lines indented by a space or tab. -- A list paragraph item can also be a blockquote. parseListParaItemCommon :: Parser [MdToken] parseListParaItemCommon = do char '\n' lines <- greedyParse1 ((string (T.pack " ") <|> string (T.pack "\t")) *> parseTillEol) let res = leftmostLongestParse (greedyParse1 parseBlockquote <|> greedyParse1 parsePara) (init $ unlines lines) char '\n' return res -- I only wrap this in a document because I want some way of converting [MdToken] to MdToken, without any overhead. There is no other reason to wrap it in a Document. -- Parse an unordered list item, which can be a line item or another list. parseUListItem :: Parser MdToken parseUListItem = space *> (try parseUListParaItem <|> parseUListLineItem) -- Parse an unordered list. parseUnorderedList :: Parser MdToken parseUnorderedList = do lineItems <- some $ try parseUListItem void (char '\n') <|> eof -- A list must end in an extra newline or eof return $ UnordList lineItems -- -------- parseOListItem :: Parser MdToken parseOListItem = space *> (try parseOListParaItem <|> parseOListLineItem) -- Parses the first element of an ordered list, which must start with '1.' parseFirstOListItem :: Parser MdToken parseFirstOListItem = do space remaining <- getInput when (take 2 (T.unpack remaining) /= "1.") empty parseOListLineItem parseOrderedList :: Parser MdToken parseOrderedList = do firstLine <- try parseFirstOListItem lineItems <- many $ try parseOListItem void (char '\n') <|> eof return $ OrdList (firstLine : lineItems) horizontalRuleText :: T.Text horizontalRuleText = T.pack "---" doubleNewlineText :: T.Text doubleNewlineText = T.pack "\n\n" parseHorizontalRule :: Parser MdToken parseHorizontalRule = string horizontalRuleText *> (void (string doubleNewlineText) <|> eof) *> return HorizontalRule parseCodeblock :: Parser MdToken parseCodeblock = do string (T.pack "```\n") inside <- someTill anySingle (string (T.pack "\n```")) return $ Codeblock (Unit (concatMap escapeChar inside)) documentParsers :: [Parser MdToken] documentParsers = [ parseHorizontalRule, parseCodeblock, parseTable, parseHeader, parseBlockquote, parseUnorderedList, parseOrderedList, parseFigure, parsePara ] -- Parse a document, which is multiple paragraphs. parseDocument :: Parser MdToken parseDocument = do res <- manyTill (fallthroughParser documentParsers) eof return (Document res)