You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
md-to-html/src/MdToHTML.hs

442 lines
14 KiB
Haskell

{-# OPTIONS_GHC -Wno-unrecognised-pragmas #-}
{-# HLINT ignore "Use lambda-case" #-}
module MdToHTML where
import Control.Applicative hiding (many, some)
import Control.Monad
import Data.Char
import Data.List
import Data.Ord (comparing)
import qualified Data.Text as T
import Data.Void
import Debug.Trace
import Text.Megaparsec
import Text.Megaparsec.Char
import Text.Printf
type Parser = Parsec Void T.Text
type HeaderLevel = Int
newtype URL = URL {getUrl :: String} deriving (Eq)
newtype ImgPath = ImgPath {getPath :: String} deriving (Eq)
data MdToken
= Document [MdToken]
| Header HeaderLevel MdToken
| Para MdToken
| Line [MdToken]
| SingleNewline -- A single newline is rendered as a space.
| Linebreak
| HorizontalRule
| Blockquote [MdToken]
| UnordList [MdToken]
| OrdList [MdToken]
| Code MdToken
| Codeblock String
| Link MdToken URL
| Image MdToken URL
| Figure MdToken URL
| Bold MdToken
| Italic MdToken
| Strikethrough MdToken
| Unit String
deriving (Eq)
-- Deriving Show for MdToken
instance Show MdToken where
show (Document tokens) = concatMap show tokens
show (Header level token) = "<h" ++ show level ++ ">" ++ show token ++ "</h" ++ show level ++ ">"
show (Para token) = "<p>" ++ show token ++ "</p>\n"
show (Line tokens) = concatMap show tokens
show Linebreak = "<br>"
show SingleNewline = " "
show HorizontalRule = "<hr>"
show (Blockquote tokens) = "<blockquote>" ++ concatMap show tokens ++ "</blockquote>"
show (UnordList tokens) = "<ul>" ++ concatMap (prepend "<li>" . append "</li>" . show) tokens ++ "</ul>"
show (OrdList tokens) = "<ol>" ++ concatMap (prepend "<li>" . append "</li>" . show) tokens ++ "</ol>"
show (Code code) = "<code>" ++ show code ++ "</code>"
show (Codeblock code) = show code
show (Link txt url) = "<a href=\"" ++ getUrl url ++ "\">" ++ show txt ++ "</a>"
show (Image txt url) = "<img src=\"" ++ getUrl url ++ "\"" ++ " alt=\"" ++ show txt ++ "\" />"
show (Figure txt url) = "<figure><img src=\"" ++ getUrl url ++ "\" alt=\"" ++ show txt ++ "\"/><figcaption aria-hidden=\"true\">" ++ show txt ++ "</figcaption></figure>"
show (Bold token) = "<b>" ++ show token ++ "</b>"
show (Italic token) = "<i>" ++ show token ++ "</i>"
show (Strikethrough token) = "<s>" ++ show token ++ "</s>"
show (Unit unit) = printf "%s" unit
instance Semigroup MdToken where
a <> b = Document [a, b]
instance Monoid MdToken where
mempty = Unit ""
-- ---------------
-- Helpers
leftmostLongest :: (Foldable t) => [(a, t b)] -> Maybe (a, t b)
leftmostLongest xs =
let lastElem = last xs
filteredLst = filter (\val -> length (snd val) == length (snd lastElem)) xs
in case filteredLst of
[] -> Nothing
(x : xs) -> Just x
-- Get the first parse returned by readP_to_S that consumed the most input
leftmostLongestParse :: (Monoid a) => Parser a -> String -> a
leftmostLongestParse parser input =
case runParser parser "input" (T.pack input) of
(Left a) -> mempty
(Right a) -> a
specialChars = ">\n\\`*_{}[]#+|"
escapableChars = "-~!.$()" ++ specialChars
-- Makes a parser greedy. Instead of returning all possible parses, only the longest one is returned.
greedyParse :: Parser a -> Parser [a]
greedyParse parser = do
greedyParse1 parser <|> return []
-- Like greedyParse, but the parser must succeed atleast once.
greedyParse1 :: Parser a -> Parser [a]
greedyParse1 parser = do
parsed1 <- parser
parsed2 <- greedyParse1 parser <|> return []
return (parsed1 : parsed2)
prepend :: [a] -> [a] -> [a]
prepend x1 x2 = x1 ++ x2
append :: [a] -> [a] -> [a]
append x1 x2 = x2 ++ x1
-- Parse until EOL or EOF
parseTillEol :: Parser String
parseTillEol = manyTill anySingle (void (char '\n') <|> eof)
-- Takes a list of parsers. Returns a parser that will try them in
-- order, moving to the next one only if the current one fails.
fallthroughParser :: [Parser a] -> Parser a
fallthroughParser [x] = x
fallthroughParser (x : xs) = try x <|> fallthroughParser xs
escapeChar :: Char -> String
escapeChar '>' = "&gt;"
escapeChar '<' = "&lt;"
escapeChar '&' = "&amp;"
escapeChar x = [x]
htmlEscapeChars :: T.Text -> T.Text
htmlEscapeChars = T.concatMap (T.pack . escapeChar)
-- ---------------
-- Parse a markdown header, denoted by 1-6 #'s followed by some text, followed by EOL.
parseHeader :: Parser MdToken
parseHeader = do
space
headers <- greedyParse1 (char '#')
when
(length headers > 6)
empty
space
parsedText <- manyTill parseLineToken (void (char '\n') <|> eof)
greedyParse (char '\n')
return (Header (length headers) (Line parsedText))
asteriskBold = T.pack "**"
underscoreBold = T.pack "__"
-- Parse bold text
parseBold :: Parser MdToken
parseBold = parseBoldWith asteriskBold <|> parseBoldWith underscoreBold
where
parseBoldWith delim = do
string delim
inside <- someTill parseLineToken $ string delim
return (Bold (Line inside))
-- Parse italic text
parseItalic :: Parser MdToken
parseItalic = parseItalicWith '*' <|> parseItalicWith '_'
where
parseItalicWith delim = do
char delim
inside <- someTill parseLineToken (char delim)
return (Italic (Line inside))
-- Parse strikethrough text
parseStrikethrough :: Parser MdToken
parseStrikethrough = do
string (T.pack "~~")
inside <- someTill parseLineToken $ string (T.pack "~~")
return (Strikethrough (Line inside))
-- Parse code
parseCode :: Parser MdToken
parseCode = do
char '`'
inside <- manyTill (satisfy (/= '\n')) (char '`')
return (Code (Unit (concatMap escapeChar inside)))
-- Parse a link
parseLink :: Parser MdToken
parseLink = do
char '['
linkText <- someTill parseLineToken (char ']')
char '('
linkURL <- manyTill anySingle (char ')')
return $ Link (Line linkText) (URL linkURL)
-- Parse a linebreak character
parseLinebreak :: Parser MdToken
parseLinebreak = do
char ' '
some (char ' ')
char '\n'
return Linebreak
parseSingleNewline :: Parser MdToken
parseSingleNewline = do
char '\n'
remaining <- getInput
case T.unpack remaining of
[] -> return $ Unit ""
_ -> return SingleNewline
parseImage :: Parser MdToken
parseImage = do
char '!'
link <- parseLink
case link of
Link text path -> return $ Image text path
_ -> empty -- This should never be reached
parseFigure = do
img <- parseImage
void (string doubleNewlineText) <|> eof
case img of
Image text path -> return $ Figure text path
_ -> return img
-- Parse an escaped character
parseEscapedChar :: Parser MdToken
parseEscapedChar = do
char '\\'
escapedChar <- choice (map char escapableChars) -- Parse any of the special chars.
return (Unit [escapedChar])
-- Parse a character as a Unit.
parseUnit :: Parser MdToken
parseUnit = do
-- text <- satisfy (`notElem` specialChars)
text <- anySingle
return (Unit [text])
lineParsers :: [Parser MdToken]
lineParsers =
[ parseLinebreak,
parseSingleNewline,
parseEscapedChar,
parseCode,
parseImage,
parseBold,
parseItalic,
parseStrikethrough,
parseLink,
parseUnit
] -- A 'line' doesn't include a 'header'
listLineParsers :: [Parser MdToken]
listLineParsers =
[ parseLinebreak,
parseEscapedChar,
parseCode,
parseImage,
parseBold,
parseItalic,
parseStrikethrough,
parseLink,
parseUnit
] -- A list line cannot contain newlines.
-- List of all parsers
allParsers :: [Parser MdToken]
allParsers = parseHeader : lineParsers
-- Parse any of the line tokens.
parseLineToken :: Parser MdToken
parseLineToken = fallthroughParser lineParsers
-- Parse any of the list line tokens.
parseListLineToken :: Parser MdToken
parseListLineToken = fallthroughParser listLineParsers
-- Parse a line, consisting of one or more tokens.
parseLine :: Parser MdToken
parseLine = do
space
-- Fail if we have reached the end of the document.
parsed <- manyTill parseLineToken eof
return (Line parsed)
-- Parse a paragraph, which is a 'Line' (can span multiple actual lines), separated by double-newlines.
parsePara :: Parser MdToken
parsePara = do
space
-- text <- many1 (lookaheadParse (\x -> ((length x) < 2) || (take 2 x) /= "\n\n")) -- Parse until a double-newline.
-- string "\n\n" <|> (eof >> return "") -- Consume the next double-newline or EOF.
parsedText <- someTill parseLineToken (try paraEnding)
many (char '\n')
return (Para (Line parsedText))
where
paraEnding = void (char '\n' *> (char '\n' <|> lookAhead (char '>'))) <|> eof
-- Parse a line starting with '>', return the line except for the '>'.
parseQuotedLine :: Parser String
parseQuotedLine = do
char '>'
many (char ' ' <|> char '\t')
restOfLine <- many (satisfy (/= '\n'))
void (char '\n') <|> eof
return restOfLine
-- Parse many 'quoted lines' until I see a non-quoted line.
parseQuotedLines :: Parser [String]
parseQuotedLines = some parseQuotedLine
-- some $ do
-- getInput >>= \line ->
-- case T.unpack line of
-- ('>' : _) -> parseQuotedLine
-- _ -> empty
-- Parse a blockquote, which is a greater-than sign followed by a paragraph.
parseBlockquote :: Parser MdToken
parseBlockquote = do
quotedLines <- parseQuotedLines
-- remaining <- look
-- let quotedLines = fst $ leftmostLongestParse parseQuotedLines remaining
-- string (init $ unlines quotedLines)
let parsedQuotedLines = leftmostLongestParse (some (parseBlockquote <|> parsePara)) (init $ unlines quotedLines) -- unlines joins the lines together with a newline, and adds a trailing newline. init removes the trailing newline.
return (Blockquote parsedQuotedLines)
-- Parse a nested list item.
parseListNested :: Parser MdToken
parseListNested = do
let firstCharParser = string (T.pack " ") <|> string (T.pack "\t")
let restOfLineParser = manyTill anySingle (void (char '\n') <|> eof)
lines <- greedyParse1 (firstCharParser *> restOfLineParser)
let linesParsed = leftmostLongestParse (parseUnorderedList <|> parseOrderedList) (init $ unlines lines)
when (null (show linesParsed)) empty
return linesParsed
-- Parse an unordered list line item.
parseUListLineItem :: Parser MdToken
parseUListLineItem = do
firstChar <- choice (map char ['*', '+', '-'])
char ' ' -- At least one space between list indicator and list text.
parseListLineItemCommon
-- Parse an ordered list line item.
parseOListLineItem :: Parser MdToken
parseOListLineItem = do
num <- greedyParse1 (satisfy isDigit)
char '.'
char ' ' -- At least one space between list indicator and list text.
parseListLineItemCommon
-- Common code for parsing list line items
parseListLineItemCommon :: Parser MdToken
parseListLineItemCommon = do
space
restOfLine <- manyTill parseListLineToken (void (char '\n') <|> eof)
nestedList <- parseListNested <|> return (Unit "")
return $ Line [Line restOfLine, nestedList]
-- Parse an unordered list paragraph item.
parseUListParaItem :: Parser MdToken
parseUListParaItem = do
firstLine <- parseUListLineItem
res <- parseListParaItemCommon
return $ Document (Para firstLine : res) -- I only wrap this in a document because I want some way of converting [MdToken] to MdToken, without any overhead. There is no other reason to wrap it in a Document.
-- Parse an unordered list paragraph item.
parseOListParaItem :: Parser MdToken
parseOListParaItem = do
firstLine <- parseOListLineItem
res <- parseListParaItemCommon
return $ Document (Para firstLine : res) -- I only wrap this in a document because I want some way of converting [MdToken] to MdToken, without any overhead. There is no other reason to wrap it in a Document.
-- Common code for parsing list paragraph items.
-- A list paragraph item is defined as a line item, followed by an empty line, followed by one or more
-- lines indented by a space or tab.
-- A list paragraph item can also be a blockquote.
parseListParaItemCommon :: Parser [MdToken]
parseListParaItemCommon = do
char '\n'
lines <- greedyParse1 ((string (T.pack " ") <|> string (T.pack "\t")) *> parseTillEol)
let res = leftmostLongestParse (greedyParse1 parseBlockquote <|> greedyParse1 parsePara) (init $ unlines lines)
char '\n'
return res -- I only wrap this in a document because I want some way of converting [MdToken] to MdToken, without any overhead. There is no other reason to wrap it in a Document.
-- Parse an unordered list item, which can be a line item or another list.
parseUListItem :: Parser MdToken
parseUListItem = try parseUListParaItem <|> parseUListLineItem
-- Parse an unordered list.
parseUnorderedList :: Parser MdToken
parseUnorderedList = do
lineItems <- some parseUListItem
void (char '\n') <|> eof -- A list must end in an extra newline or eof
return $ UnordList lineItems
-- --------
parseOListItem :: Parser MdToken
parseOListItem = try parseOListParaItem <|> parseOListLineItem
-- Parses the first element of an ordered list, which must start with '1.'
parseFirstOListItem :: Parser MdToken
parseFirstOListItem = do
remaining <- getInput
when (take 2 (T.unpack remaining) /= "1.") empty
parseOListLineItem
parseOrderedList :: Parser MdToken
parseOrderedList = do
firstLine <- parseFirstOListItem
lineItems <- some parseOListItem
void (char '\n') <|> eof
return $ OrdList (firstLine : lineItems)
horizontalRuleText :: T.Text
horizontalRuleText = T.pack "---"
doubleNewlineText :: T.Text
doubleNewlineText = T.pack "\n\n"
parseHorizontalRule :: Parser MdToken
parseHorizontalRule = string horizontalRuleText *> (void (string doubleNewlineText) <|> eof) *> return HorizontalRule
documentParsers :: [Parser MdToken]
documentParsers =
[ parseHorizontalRule,
parseHeader,
parseBlockquote,
parseUnorderedList,
parseOrderedList,
parseFigure,
parsePara
]
-- Parse a document, which is multiple paragraphs.
parseDocument :: Parser MdToken
parseDocument = do
res <- manyTill (fallthroughParser documentParsers) eof
return (Document res)