You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
528 lines
18 KiB
Haskell
528 lines
18 KiB
Haskell
{-# OPTIONS_GHC -Wno-unrecognised-pragmas #-}
|
|
|
|
{-# HLINT ignore "Use lambda-case" #-}
|
|
|
|
module MdToHTML where
|
|
|
|
import Control.Applicative hiding (many, some)
|
|
import Control.Monad
|
|
import Control.Monad.Combinators (count)
|
|
import Data.Char
|
|
import Data.List
|
|
import Data.Ord (comparing)
|
|
import Data.String.Utils
|
|
import qualified Data.Text as T
|
|
import Data.Void
|
|
import Debug.Trace
|
|
import Text.Megaparsec
|
|
import Text.Megaparsec.Char
|
|
import Text.Printf
|
|
import Text.Wrap
|
|
|
|
-- | Parser type used throughout this module: Megaparsec over 'T.Text' input,
-- with 'Void' as the custom error component (i.e. no custom errors).
type Parser = Parsec Void T.Text
|
|
|
|
-- | Heading level; rendered as the @N@ in @\<hN\>@ by the 'Show' instance of 'MdToken'.
type HeaderLevel = Int
|
|
|
|
-- | A CSS class name, emitted inside the @class@ attribute of images and figures.
type CssClass = String
|
|
|
|
-- | Target of a link or image; written as-is into @href@ / @src@ attributes when rendering.
newtype URL = URL {getUrl :: String} deriving (Eq)
|
|
|
|
-- | Wrapper around an image path.
-- NOTE(review): not referenced by any definition visible in this module — confirm whether it is still needed.
newtype ImgPath = ImgPath {getPath :: String} deriving (Eq)
|
|
|
|
-- | Syntax tree of a parsed markdown document.
--
-- Block-level and inline nodes share this single type. The 'Show' instance
-- renders a node as HTML.
data MdToken
  = -- | Sequence of tokens rendered back to back.
    Document [MdToken]
  | -- | Heading of the given level.
    Header HeaderLevel MdToken
  | -- | Paragraph, rendered inside @\<p\>@.
    Para MdToken
  | -- | Run of inline tokens rendered back to back.
    Line [MdToken]
  | -- | A single newline is rendered as a space.
    SingleNewline
  | -- | Hard line break, rendered as @\<br /\>@.
    Linebreak
  | -- | Rendered as @\<hr\>@.
    HorizontalRule
  | -- | Rendered inside @\<blockquote\>@.
    Blockquote [MdToken]
  | -- | Unordered list; every element becomes one @\<li\>@.
    UnordList [MdToken]
  | -- | Ordered list; every element becomes one @\<li\>@.
    OrdList [MdToken]
  | -- | Inline code, rendered inside @\<code\>@.
    Code MdToken
  | -- | Table as a grid of cells; the first row is the header row.
    Table [[MdToken]]
  | -- | Code block, rendered inside @\<pre\>\<code\>@.
    Codeblock MdToken
  | -- | Hyperlink: link text and target.
    Link MdToken URL
  | -- | Image: alt text, source and optional CSS classes.
    Image MdToken URL (Maybe [CssClass])
  | -- | Image wrapped in @\<figure\>@; the text doubles as the caption.
    Figure MdToken URL (Maybe [CssClass])
  | -- | Rendered inside @\<b\>@.
    Bold MdToken
  | -- | Rendered inside @\<i\>@.
    Italic MdToken
  | -- | Rendered inside @\<s\>@.
    Strikethrough MdToken
  | -- | Literal text, emitted unchanged.
    Unit String
  deriving (Eq)
|
|
|
|
-- | Render a token tree as compact HTML (no newlines between elements).
--
-- This is a hand-written instance, not a derived one: 'show' yields markup,
-- not Haskell syntax. 'prettyPrint' produces a variant with newlines inserted
-- into paragraphs and tables.
instance Show MdToken where
  show (Document tokens) = concatMap show tokens
  show (Header level token) = "<h" ++ show level ++ ">" ++ show token ++ "</h" ++ show level ++ ">"
  show (Para token) = "<p>" ++ show token ++ "</p>"
  show (Line tokens) = concatMap show tokens
  show Linebreak = "<br />"
  show SingleNewline = " "
  show HorizontalRule = "<hr>"
  show (Blockquote tokens) = "<blockquote>" ++ concatMap show tokens ++ "</blockquote>"
  show (UnordList tokens) = "<ul>" ++ concatMap (prepend "<li>" . append "</li>" . show) tokens ++ "</ul>"
  show (OrdList tokens) = "<ol>" ++ concatMap (prepend "<li>" . append "</li>" . show) tokens ++ "</ol>"
  -- Surrounding whitespace of inline code is removed.
  show (Code code) = "<code>" ++ strip (show code) ++ "</code>"
  -- A grid without rows has no header to render. Emit an empty table rather
  -- than failing with a non-exhaustive pattern match.
  show (Table []) = "<table></table>"
  -- The first row is the header; trailing whitespace of every cell is removed.
  show (Table (thead : tokenGrid)) =
    "<table><thead><tr>"
      ++ concatMap (\x -> "<th>" ++ rstrip (show x) ++ "</th>") thead
      ++ "</tr></thead>"
      ++ "<tbody>"
      ++ concatMap (\x -> "<tr>" ++ concatMap (\y -> "<td>" ++ rstrip (show y) ++ "</td>") x ++ "</tr>") tokenGrid
      ++ "</tbody></table>"
  show (Codeblock code) = "<pre><code>" ++ show code ++ "</code></pre>"
  show (Link txt url) = "<a href=\"" ++ getUrl url ++ "\">" ++ show txt ++ "</a>"
  show (Image txt url cssClasses) =
    "<img src=\""
      ++ getUrl url
      ++ "\""
      ++ " alt=\""
      ++ show txt
      ++ "\""
      ++ maybe "" (\classes -> " class=\"" ++ unwords classes ++ "\"") cssClasses
      ++ "/>"
  -- The text is used twice: as the alt attribute and as the visible caption.
  show (Figure txt url cssClasses) =
    "<figure><img src=\""
      ++ getUrl url
      ++ "\" alt=\""
      ++ show txt
      ++ "\""
      ++ maybe "" (\classes -> " class=\"" ++ unwords classes ++ "\"") cssClasses
      ++ "/><figcaption aria-hidden=\"true\">"
      ++ show txt
      ++ "</figcaption></figure>"
  show (Bold token) = "<b>" ++ show token ++ "</b>"
  show (Italic token) = "<i>" ++ show token ++ "</i>"
  show (Strikethrough token) = "<s>" ++ show token ++ "</s>"
  -- Literal text is emitted unchanged.
  show (Unit unit) = unit
|
|
|
|
-- | Render a token as HTML with newlines inserted for readability.
--
-- Paragraphs are wrapped at 70 columns and followed by a newline, tables get
-- one element per line, and a 'Linebreak' is followed by a newline. 'Line' and
-- 'Document' recurse into their children. Every other token is rendered
-- exactly as 'show' renders it.
prettyPrint :: MdToken -> String
prettyPrint tok = case tok of
  Para inner -> "<p>" ++ wrapped inner ++ "</p>\n"
  Table (headerRow : bodyRows) ->
    "<table>\n<thead>\n<tr>\n"
      ++ concatMap headerCell headerRow
      ++ "</tr>\n</thead>\n"
      ++ "<tbody>\n"
      ++ concatMap bodyRow bodyRows
      ++ "</tbody>\n</table>\n"
  Linebreak -> "<br />\n"
  Line parts -> concatMap prettyPrint parts
  Document parts -> concatMap prettyPrint parts
  other -> show other
  where
    wrapped = T.unpack . wrapText defaultWrapSettings 70 . T.pack . prettyPrint
    headerCell cell = "<th>" ++ rstrip (prettyPrint cell) ++ "</th>\n"
    bodyCell cell = "<td>" ++ rstrip (prettyPrint cell) ++ "</td>\n"
    bodyRow cells = "<tr>\n" ++ concatMap bodyCell cells ++ "</tr>\n"
|
|
|
|
-- | Concatenation of tokens.
--
-- Both operands are flattened into a single 'Document' so that '<>' is
-- associative, and @Unit ""@ is absorbed so that it is a true identity.
-- The rendered output ('show' / 'prettyPrint') of @a <> b@ is still the
-- rendering of @a@ followed by the rendering of @b@.
instance Semigroup MdToken where
  Unit "" <> b = b
  a <> Unit "" = a
  Document as <> Document bs = Document (as ++ bs)
  Document as <> b = Document (as ++ [b])
  a <> Document bs = Document (a : bs)
  a <> b = Document [a, b]

-- | The empty token is the empty text unit, which renders as the empty string.
instance Monoid MdToken where
  mempty = Unit ""
|
|
|
|
-- ---------------
|
|
-- Helpers
|
|
-- | Return the first pair whose second component has the same length as the
-- second component of the last pair, or 'Nothing' for an empty list.
leftmostLongest :: (Foldable t) => [(a, t b)] -> Maybe (a, t b)
leftmostLongest [] = Nothing
leftmostLongest candidates = find matchesLast candidates
  where
    targetLength = length (snd (last candidates))
    matchesLast candidate = length (snd candidate) == targetLength
|
|
|
|
-- | Run the parser over the given string and return its result.
-- If the parse fails, the error is discarded and 'mempty' is returned instead.
leftmostLongestParse :: (Monoid a) => Parser a -> String -> a
leftmostLongestParse parser input =
  either (const mempty) id (runParser parser "input" (T.pack input))
|
|
|
|
-- | Characters that carry markdown meaning.
specialChars :: String
specialChars = ">\n\\`*_{}[]#+|"

-- | Characters that may follow a backslash in an escape sequence
-- (see 'parseEscapedChar'): a few punctuation marks plus all of 'specialChars'.
escapableChars :: String
escapableChars = "-~!.$()" ++ specialChars
|
|
|
|
-- | Apply the parser as many times as it succeeds (possibly zero) and collect the results.
greedyParse :: Parser a -> Parser [a]
greedyParse parser = many parser
|
|
|
|
-- | Like 'greedyParse', but the parser must succeed at least once.
greedyParse1 :: Parser a -> Parser [a]
greedyParse1 parser = some parser
|
|
|
|
-- | @prepend prefix xs@ puts @prefix@ in front of @xs@.
prepend :: [a] -> [a] -> [a]
prepend = (++)
|
|
|
|
-- | @append suffix xs@ puts @suffix@ after @xs@.
append :: [a] -> [a] -> [a]
append = flip (++)
|
|
|
|
-- | Collect characters up to the next newline or the end of input.
-- A terminating newline is consumed but not included in the result.
parseTillEol :: Parser String
parseTillEol = manyTill anySingle lineEnd
  where
    lineEnd = void (char '\n') <|> eof
|
|
|
|
-- | Combine a list of parsers into one that tries them in order, moving on to
-- the next only when the current one fails.
--
-- Every parser except the last is wrapped in 'try', so a failed alternative
-- consumes no input. The last parser runs unwrapped. An empty list yields a
-- parser that always fails, instead of a pattern-match error at runtime.
fallthroughParser :: [Parser a] -> Parser a
fallthroughParser [] = empty
fallthroughParser [p] = p
fallthroughParser (p : ps) = try p <|> fallthroughParser ps
|
|
|
|
-- | Replace a character that is special in HTML text with its entity.
--
-- @>@, @<@ and @&@ become @&gt;@, @&lt;@ and @&amp;@; every other character
-- is returned unchanged as a one-character string.
escapeChar :: Char -> String
escapeChar '>' = "&gt;"
escapeChar '<' = "&lt;"
escapeChar '&' = "&amp;"
escapeChar x = [x]
|
|
|
|
-- | Apply 'escapeChar' to every character of a text.
htmlEscapeChars :: T.Text -> T.Text
htmlEscapeChars txt = T.concatMap escaped txt
  where
    escaped c = T.pack (escapeChar c)
|
|
|
|
-- -- Wraps a list of words after (at most) the given number of characters, trying to prevent word-breaks
|
|
-- wordwrap :: Int -> String -> String
|
|
-- wordwrap wraplength str = if (length str) < wraplength
|
|
-- then str
|
|
-- else
|
|
-- let spaceIndex = lastgtSpaceIndex 0 (takeRev (length str) - wraplength str)
|
|
--
|
|
-- where
|
|
-- takeRev n = (reverse . take n . reverse)
|
|
-- lastSpaceIndex counter str = case str of
|
|
-- [] -> counter
|
|
-- x:xs -> if (isSpace x) counter else lastSpaceIndex counter+1 xs
|
|
|
|
-- ---------------
|
|
|
|
-- | Parse a markdown header: 1-6 @#@ characters followed by inline tokens up
-- to the end of the line (or end of input).
--
-- Whitespace before the @#@ run and between it and the text is skipped, and
-- any newlines directly after the header line are consumed.
parseHeader :: Parser MdToken
parseHeader = do
  space
  level <- length <$> greedyParse1 (char '#')
  -- More than six '#' is not a header.
  guard (level <= 6)
  space
  title <- manyTill parseLineToken (void (char '\n') <|> eof)
  _ <- greedyParse (char '\n')
  pure (Header level (Line title))
|
|
|
|
-- | Bold delimiter made of asterisks.
asteriskBold :: T.Text
asteriskBold = T.pack "**"

-- | Bold delimiter made of underscores.
underscoreBold :: T.Text
underscoreBold = T.pack "__"
|
|
|
|
-- | Parse bold text: one or more inline tokens enclosed in @**@ or @__@.
-- The closing delimiter must be the same as the opening one.
parseBold :: Parser MdToken
parseBold = boldBetween asteriskBold <|> boldBetween underscoreBold
  where
    boldBetween marker =
      Bold . Line <$> (string marker *> someTill parseLineToken (string marker))
|
|
|
|
-- | Parse italic text: one or more inline tokens enclosed in @*@ or @_@.
-- The closing delimiter must be the same as the opening one.
parseItalic :: Parser MdToken
parseItalic = italicBetween '*' <|> italicBetween '_'
  where
    italicBetween marker =
      Italic . Line <$> (char marker *> someTill parseLineToken (char marker))
|
|
|
|
-- | Parse strikethrough text: one or more inline tokens enclosed in @~~@.
parseStrikethrough :: Parser MdToken
parseStrikethrough = Strikethrough . Line <$> (fence *> someTill parseLineToken fence)
  where
    fence = string (T.pack "~~")
|
|
|
|
-- | Parse inline code.
--
-- A run of N backticks opens the span. The content is one or more non-newline
-- characters up to the next backtick, which must be followed by N-1 further
-- backticks. The content is passed through 'escapeChar'.
parseCode :: Parser MdToken
parseCode = do
  ticks <- some (char '`')
  body <- someTill (satisfy (/= '\n')) (char '`')
  -- The first closing backtick was consumed by someTill; require the rest.
  let remainingTicks = length ticks - 1
  _ <- count remainingTicks (char '`')
  pure (Code (Unit (concatMap escapeChar body)))
|
|
|
|
-- | Parse a link of the form @[text](url)@.
-- The text is parsed as inline tokens; the URL is taken verbatim up to the closing parenthesis.
parseLink :: Parser MdToken
parseLink = do
  linkLabel <- char '[' *> manyTill parseLineToken (char ']')
  target <- char '(' *> manyTill anySingle (char ')')
  pure (Link (Line linkLabel) (URL target))
|
|
|
|
-- | Parse a hard line break: either two or more spaces followed by a newline,
-- or a backslash followed by a newline.
parseLinebreak :: Parser MdToken
parseLinebreak = trailingSpaces <|> trailingBackslash
  where
    trailingSpaces = Linebreak <$ (char ' ' *> some (char ' ') *> char '\n')
    trailingBackslash = try (Linebreak <$ (char '\\' *> char '\n'))
|
|
|
|
-- | Parse one table row: a leading @|@ followed by one or more cells, each
-- terminated by @|@. Leading spaces and tabs of a cell are skipped, and every
-- cell must contain at least one token.
parseTableRow :: Parser [MdToken]
parseTableRow = char '|' *> (map Line <$> some cell)
  where
    cell = many (satisfy isBlank) *> someTill parseListLineToken (char '|')
    isBlank c = c == ' ' || c == '\t'
|
|
|
|
-- | Parse a table: a header row, a separator row made of dash runs separated
-- by @|@, then zero or more body rows separated by newlines. Trailing newlines
-- after the table are consumed.
parseTable :: Parser MdToken
parseTable = do
  headerRow <- parseTableRow <* char '\n'
  -- Separator row between header and body, e.g. "|---|---|".
  _ <- char '|' *> sepEndBy1 (some (char '-')) (char '|') *> char '\n'
  bodyRows <- sepEndBy parseTableRow (char '\n')
  _ <- many (char '\n')
  pure (Table (headerRow : bodyRows))
|
|
|
|
-- | Parse a newline. A newline at the very end of the input yields an empty
-- 'Unit'; any other newline yields 'SingleNewline'.
parseSingleNewline :: Parser MdToken
parseSingleNewline = do
  _ <- char '\n'
  rest <- getInput
  pure (if T.null rest then Unit "" else SingleNewline)
|
|
|
|
-- | Parse a CSS class annotation such as @{.wide .rounded}@.
--
-- Each class starts with a dot, then a letter, @_@ or @-@, followed by any
-- number of letters, digits, @_@ or @-@. Whitespace after a class is skipped.
-- At least one class is required.
parseCssClasses :: Parser [CssClass]
parseCssClasses = between (char '{') (char '}') (some cssClass)
  where
    cssClass :: Parser CssClass
    cssClass = char '.' *> ((:) <$> leadChar <*> many restChar) <* space
    leadChar = char '_' <|> char '-' <|> label "letter" (satisfy isAlpha)
    restChar = leadChar <|> label "digit" (satisfy isDigit)
|
|
|
|
-- | Parse an image: @!@ followed by a link, optionally followed by a CSS
-- class annotation (see 'parseCssClasses').
parseImage :: Parser MdToken
parseImage = do
  link <- char '!' *> parseLink
  classes <- optional (try parseCssClasses)
  case link of
    Link altText src -> pure (Image altText src classes)
    -- parseLink only ever produces a Link, so this should never be reached.
    _ -> empty
|
|
|
|
-- | Parse a figure: an image that is directly followed by a blank line
-- (two newlines) or by the end of input.
parseFigure :: Parser MdToken
parseFigure = do
  img <- parseImage
  void (string doubleNewlineText) <|> eof
  pure $ case img of
    Image caption src classes -> Figure caption src classes
    -- parseImage only produces Image; pass anything else through unchanged.
    other -> other
|
|
|
|
-- | Parse a backslash escape: a backslash followed by one of 'escapableChars'.
-- The result is the escaped character itself, without the backslash.
parseEscapedChar :: Parser MdToken
parseEscapedChar = literal <$> (char '\\' *> choice (map char escapableChars))
  where
    literal c = Unit [c]
|
|
|
|
-- | Parse any single character as a one-character 'Unit'.
parseUnit :: Parser MdToken
parseUnit = (\c -> Unit [c]) <$> anySingle
|
|
|
|
-- | Parse any single character other than a newline as a one-character 'Unit'.
parseUnitExceptNewline :: Parser MdToken
parseUnitExceptNewline = (\c -> Unit [c]) <$> satisfy (/= '\n')
|
|
|
|
-- | Inline-level parsers, in the order in which 'parseLineToken' tries them.
-- A line does not include a header.
lineParsers :: [Parser MdToken]
lineParsers =
  [ parseLinebreak
  , parseSingleNewline
  , parseEscapedChar
  , parseCode
  , parseImage
  , parseBold -- before parseItalic, so a doubled delimiter is tried as bold first
  , parseItalic
  , parseStrikethrough
  , parseLink
  , parseUnit -- accepts any single character, so it comes last
  ]
|
|
|
|
-- | Inline-level parsers for contexts that must stay on one line, in the order
-- in which 'parseListLineToken' tries them. There is no line-break or newline
-- parser here, and the final catch-all rejects newlines: a list line cannot
-- contain newlines.
lineParsersWithoutNewline :: [Parser MdToken]
lineParsersWithoutNewline =
  [ parseEscapedChar
  , parseCode
  , parseImage
  , parseBold
  , parseItalic
  , parseStrikethrough
  , parseLink
  , parseUnitExceptNewline
  ]
|
|
|
|
-- | The header parser followed by every inline-level parser.
allParsers :: [Parser MdToken]
allParsers = parseHeader : lineParsers

-- | Parse one inline token: the first of 'lineParsers' that succeeds.
parseLineToken :: Parser MdToken
parseLineToken = fallthroughParser lineParsers

-- | Parse one inline token that is not a newline: the first of
-- 'lineParsersWithoutNewline' that succeeds.
parseListLineToken :: Parser MdToken
parseListLineToken = fallthroughParser lineParsersWithoutNewline
|
|
|
|
-- | Skip leading whitespace, then parse inline tokens until the end of input
-- and wrap them in a 'Line'. The token list may be empty.
parseLine :: Parser MdToken
parseLine = space *> (Line <$> manyTill parseLineToken eof)
|
|
|
|
-- | Parse a paragraph: one or more inline tokens (possibly spanning several
-- lines) up to a paragraph ending.
--
-- A paragraph ends at a double newline, at a newline that is directly followed
-- by @>@ (the @>@ itself is not consumed), or at the end of input. Leading
-- whitespace is skipped and newlines after the ending are consumed.
parsePara :: Parser MdToken
parsePara =
  space
    *> (Para . Line <$> someTill parseLineToken (try paragraphEnd))
    <* many (char '\n')
  where
    paragraphEnd = void (char '\n' *> (char '\n' <|> lookAhead (char '>'))) <|> eof
|
|
|
|
-- | Parse one line that starts with @>@ and return the rest of it.
-- The @>@, the spaces and tabs after it, and the terminating newline are dropped.
parseQuotedLine :: Parser String
parseQuotedLine =
  char '>'
    *> many (char ' ' <|> char '\t')
    *> many (satisfy (/= '\n'))
    <* (void (char '\n') <|> eof)

-- | Parse one or more consecutive quoted lines, stopping at the first line
-- that does not start with @>@.
parseQuotedLines :: Parser [String]
parseQuotedLines = some parseQuotedLine
|
|
|
|
-- some $ do
|
|
-- getInput >>= \line ->
|
|
-- case T.unpack line of
|
|
-- ('>' : _) -> parseQuotedLine
|
|
-- _ -> empty
|
|
|
|
-- | Parse a blockquote: one or more lines that start with @>@.
--
-- The quote markers are removed, the remaining lines are joined with newlines
-- and parsed again as a sequence of nested blockquotes and paragraphs. If that
-- inner parse fails, the blockquote is empty.
parseBlockquote :: Parser MdToken
parseBlockquote = do
  quoted <- parseQuotedLines
  -- Join with newlines, without a trailing newline.
  let body = intercalate "\n" quoted
      blocks = leftmostLongestParse (some (parseBlockquote <|> parsePara)) body
  pure (Blockquote blocks)
|
|
|
|
-- | Parse a nested list: one or more indented lines (space or tab) that, with
-- the indent removed, parse as an unordered or ordered list.
-- Fails if the de-indented lines render to nothing.
parseListNested :: Parser MdToken
parseListNested = do
  indented <- greedyParse1 (indent *> restOfLine)
  let nested =
        leftmostLongestParse
          (parseUnorderedList <|> parseOrderedList)
          (intercalate "\n" indented)
  -- A failed inner parse comes back as the empty token, which renders as "".
  guard (not (null (show nested)))
  pure nested
  where
    indent = string (T.pack " ") <|> string (T.pack "\t")
    restOfLine = manyTill anySingle (void (char '\n') <|> eof)
|
|
|
|
-- | Parse an unordered list line item: a bullet (@*@, @+@ or @-@), at least
-- one space, then the item content.
parseUListLineItem :: Parser MdToken
parseUListLineItem =
  choice (map char ['*', '+', '-'])
    *> char ' '
    *> parseListLineItemCommon
|
|
|
|
-- | Parse an ordered list line item: one or more digits, a dot, at least one
-- space, then the item content. The number itself is discarded.
parseOListLineItem :: Parser MdToken
parseOListLineItem =
  greedyParse1 (satisfy isDigit)
    *> char '.'
    *> char ' '
    *> parseListLineItemCommon
|
|
|
|
-- | Shared tail of list line items: skip whitespace, parse inline tokens up to
-- the end of the line, then an optional nested list.
-- Without a nested list the second element of the result is an empty 'Unit'.
parseListLineItemCommon :: Parser MdToken
parseListLineItemCommon = do
  space
  content <- manyTill parseListLineToken (void (char '\n') <|> eof)
  sublist <- option (Unit "") (try parseListNested)
  pure (Line [Line content, sublist])
|
|
|
|
-- | Parse an unordered list paragraph item: a line item followed by the
-- indented blocks described at 'parseListParaItemCommon'.
--
-- The result is wrapped in a 'Document' only to turn the list of tokens into a
-- single token.
parseUListParaItem :: Parser MdToken
parseUListParaItem = paraItem <$> parseUListLineItem <*> parseListParaItemCommon
  where
    paraItem firstLine followUp = Document (Para firstLine : followUp)
|
|
|
|
-- | Parse an ordered list paragraph item: a line item followed by the
-- indented blocks described at 'parseListParaItemCommon'.
--
-- The result is wrapped in a 'Document' only to turn the list of tokens into a
-- single token.
parseOListParaItem :: Parser MdToken
parseOListParaItem = paraItem <$> parseOListLineItem <*> parseListParaItemCommon
  where
    paraItem firstLine followUp = Document (Para firstLine : followUp)
|
|
|
|
-- | Shared tail of list paragraph items.
--
-- Expects an empty line, then one or more lines indented by a space or a tab,
-- then another newline. The indented lines are de-indented, joined with
-- newlines and parsed as a run of blockquotes or a run of paragraphs. If that
-- inner parse fails, the result is empty.
parseListParaItemCommon :: Parser [MdToken]
parseListParaItemCommon = do
  _ <- char '\n'
  indented <- greedyParse1 (indent *> parseTillEol)
  _ <- char '\n'
  pure (leftmostLongestParse blocks (intercalate "\n" indented))
  where
    indent = string (T.pack " ") <|> string (T.pack "\t")
    blocks = greedyParse1 parseBlockquote <|> greedyParse1 parsePara
|
|
|
|
-- | Parse an unordered list item: skip leading whitespace, then try a
-- paragraph item and fall back to a plain line item.
parseUListItem :: Parser MdToken
parseUListItem = do
  space
  try parseUListParaItem <|> parseUListLineItem
|
|
|
|
-- | Parse an unordered list: one or more items, terminated by an extra
-- newline or the end of input.
parseUnorderedList :: Parser MdToken
parseUnorderedList =
  UnordList <$> some (try parseUListItem) <* (void (char '\n') <|> eof)
|
|
|
|
-- --------
|
|
|
|
-- | Parse an ordered list item: skip leading whitespace, then try a paragraph
-- item and fall back to a plain line item.
parseOListItem :: Parser MdToken
parseOListItem = do
  space
  try parseOListParaItem <|> parseOListLineItem
|
|
|
|
-- | Parse the first item of an ordered list, which must start with @1.@.
-- Leading whitespace is skipped before the check.
parseFirstOListItem :: Parser MdToken
parseFirstOListItem = do
  space
  rest <- getInput
  guard (T.pack "1." `T.isPrefixOf` rest)
  parseOListLineItem
|
|
|
|
-- | Parse an ordered list: a first item starting with @1.@, any number of
-- further items, then an extra newline or the end of input.
parseOrderedList :: Parser MdToken
parseOrderedList =
  OrdList
    <$> ((:) <$> try parseFirstOListItem <*> many (try parseOListItem))
    <* (void (char '\n') <|> eof)
|
|
|
|
-- | Marker text for a horizontal rule (see 'parseHorizontalRule').
horizontalRuleText :: T.Text
horizontalRuleText = T.pack "---"

-- | Two consecutive newlines, i.e. a blank line.
doubleNewlineText :: T.Text
doubleNewlineText = T.pack "\n\n"
|
|
|
|
-- | Parse a horizontal rule: 'horizontalRuleText' followed by a blank line or
-- the end of input.
parseHorizontalRule :: Parser MdToken
parseHorizontalRule = do
  _ <- string horizontalRuleText
  void (string doubleNewlineText) <|> eof
  pure HorizontalRule
|
|
|
|
-- | Parse a fenced code block: a line of three backticks, at least one
-- character of content, and a closing line that starts with three backticks.
-- The content is passed through 'escapeChar'.
parseCodeblock :: Parser MdToken
parseCodeblock =
  Codeblock . Unit . concatMap escapeChar
    <$> (openFence *> someTill anySingle closeFence)
  where
    openFence = string (T.pack "```\n")
    closeFence = string (T.pack "\n```")
|
|
|
|
-- | Block-level parsers, in the order in which 'parseDocument' tries them.
documentParsers :: [Parser MdToken]
documentParsers =
  [ parseHorizontalRule
  , parseCodeblock
  , parseTable
  , parseHeader
  , parseBlockquote
  , parseUnorderedList
  , parseOrderedList
  , parseFigure
  , parsePara -- last resort when no other block parser applies
  ]
|
|
|
|
-- | Parse a whole document: block-level elements (see 'documentParsers') until
-- the end of input.
parseDocument :: Parser MdToken
parseDocument = Document <$> manyTill (fallthroughParser documentParsers) eof
|