You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
369 lines
12 KiB
Haskell
369 lines
12 KiB
Haskell
{-# OPTIONS_GHC -Wno-unrecognised-pragmas #-}
|
|
|
|
{-# HLINT ignore "Use lambda-case" #-}
|
|
|
|
module MdToHTML where
|
|
|
|
import Control.Applicative
|
|
import Control.Monad
|
|
import Data.List
|
|
import Data.Ord (comparing)
|
|
import Debug.Trace
|
|
import Text.ParserCombinators.ReadP
|
|
import Text.Printf
|
|
|
|
type HeaderLevel = Int
|
|
|
|
newtype URL = URL {getUrl :: String} deriving (Eq)
|
|
|
|
newtype ImgPath = ImgPath {getPath :: String} deriving (Eq)
|
|
|
|
parseMany :: ReadP a -> ReadP [a]
|
|
parseMany = Text.ParserCombinators.ReadP.many
|
|
|
|
data MdToken
|
|
= Document [MdToken]
|
|
| Header HeaderLevel MdToken
|
|
| Para MdToken
|
|
| Line [MdToken]
|
|
| SingleNewline -- A single newline is rendered as a space.
|
|
| Linebreak
|
|
| HorizontalRule
|
|
| Blockquote [MdToken]
|
|
| UnordList [MdToken]
|
|
| OrdList [MdToken]
|
|
| Code String
|
|
| Codeblock String
|
|
| Link MdToken URL
|
|
| Image MdToken ImgPath
|
|
| Bold MdToken
|
|
| Italic MdToken
|
|
| Strikethrough MdToken
|
|
| Unit String
|
|
deriving (Eq)
|
|
|
|
-- Deriving Show for MdToken
|
|
instance Show MdToken where
|
|
show (Document tokens) = concatMap show tokens
|
|
show (Header level token) = "<h" ++ show level ++ ">" ++ show token ++ "</h" ++ show level ++ ">"
|
|
show (Para token) = "<p>" ++ show token ++ "</p>"
|
|
show (Line tokens) = concatMap show tokens
|
|
show Linebreak = "<br>"
|
|
show SingleNewline = " "
|
|
show HorizontalRule = "---------"
|
|
show (Blockquote tokens) = "<blockquote>" ++ concatMap show tokens ++ "</blockquote>"
|
|
show (UnordList tokens) = "<ul>" ++ concatMap (prepend "<li>" . append "</li>" . show) tokens ++ "</ul>"
|
|
show (OrdList tokens) = "<ol>" ++ concatMap (prepend "<li>" . append "</li>" . show) tokens ++ "</ol>"
|
|
show (Code code) = show code
|
|
show (Codeblock code) = show code
|
|
show (Link txt url) = "<a href=\"" ++ getUrl url ++ "\">" ++ show txt ++ "</a>"
|
|
show (Image txt imgPath) = "<img src=" ++ getPath imgPath ++ ">" ++ show txt ++ "</img>"
|
|
show (Bold token) = "<b>" ++ show token ++ "</b>"
|
|
show (Italic token) = "<i>" ++ show token ++ "</i>"
|
|
show (Strikethrough token) = "<s>" ++ show token ++ "</s>"
|
|
show (Unit unit) = printf "%s" unit
|
|
|
|
instance Semigroup MdToken where
|
|
a <> b = Document [a, b]
|
|
|
|
instance Monoid MdToken where
|
|
mempty = Unit ""
|
|
|
|
-- ---------------
|
|
-- Helpers
|
|
leftmostLongest :: (Foldable t) => [(a, t b)] -> Maybe (a, t b)
|
|
leftmostLongest xs =
|
|
let lastElem = last xs
|
|
filteredLst = filter (\val -> length (snd val) == length (snd lastElem)) xs
|
|
in case filteredLst of
|
|
[] -> Nothing
|
|
(x : xs) -> Just x
|
|
|
|
-- Get the first parse returned by readP_to_S that consumed the most input
|
|
leftmostLongestParse :: (Monoid a) => ReadP a -> String -> (a, String)
|
|
leftmostLongestParse parser input =
|
|
let res = leftmostLongest $ readP_to_S parser input
|
|
in case res of
|
|
Nothing -> (mempty, mempty)
|
|
Just x -> x
|
|
|
|
-- Parse if the string that's left matches the string comparator function
|
|
lookaheadParse :: (String -> Bool) -> ReadP Char
|
|
lookaheadParse stringCmp = do
|
|
lookahead <- look
|
|
case stringCmp lookahead of
|
|
True -> get
|
|
False -> pfail
|
|
|
|
lineToList :: MdToken -> [MdToken]
|
|
lineToList (Line tokens) = tokens
|
|
|
|
specialChars = "\\#*_[\n"
|
|
|
|
-- Makes a parser greedy. Instead of returning all possible parses, only the longest one is returned.
|
|
greedyParse :: ReadP a -> ReadP [a]
|
|
greedyParse parser = do
|
|
greedyParse1 parser <++ return []
|
|
|
|
-- Like greedyParse, but the parser must succeed atleast once.
|
|
greedyParse1 :: ReadP a -> ReadP [a]
|
|
greedyParse1 parser = do
|
|
parsed1 <- parser
|
|
parsed2 <- greedyParse1 parser <++ return []
|
|
return (parsed1 : parsed2)
|
|
|
|
prepend :: [a] -> [a] -> [a]
|
|
prepend x1 x2 = x1 ++ x2
|
|
|
|
append :: [a] -> [a] -> [a]
|
|
append x1 x2 = x2 ++ x1
|
|
|
|
-- Sequence two parsers, running one after the other and returning the result.
|
|
sequenceParse :: ReadP a -> ReadP a -> ReadP [a]
|
|
sequenceParse p1 p2 = twoElemList <$> p1 <*> p2
|
|
where
|
|
twoElemList elem1 elem2 = [elem1, elem2]
|
|
|
|
-- Parses p1 until p2 succeeds, but doesn't actually consume anything from p2.
|
|
-- Similar to manyTill, except manyTill's second parser actually consumes characters.
|
|
manyTillLazy :: ReadP a -> ReadP b -> ReadP [a]
|
|
manyTillLazy p1 p2 = do
|
|
res <- p1
|
|
remaining <- look
|
|
let p2res = readP_to_S p2 remaining
|
|
case p2res of
|
|
[] -> do
|
|
res2 <- manyTillLazy p1 p2
|
|
return (res : res2)
|
|
_ -> return [res]
|
|
|
|
-- Parse until EOL or EOF
|
|
parseTillEol :: ReadP String
|
|
parseTillEol = manyTill get (void (char '\n') <++ eof)
|
|
|
|
-- Takes a list of parsers. Returns a parser that will try them in
|
|
-- order, moving to the next one only if the current one fails.
|
|
fallthroughParser :: [ReadP a] -> ReadP a
|
|
fallthroughParser [x] = x
|
|
fallthroughParser (x : xs) = x <++ fallthroughParser xs
|
|
|
|
-- ---------------
|
|
|
|
-- Parse a markdown header, denoted by 1-6 #'s followed by some text, followed by EOL.
|
|
parseHeader :: ReadP MdToken
|
|
parseHeader = do
|
|
skipSpaces
|
|
headers <- munch1 (== '#')
|
|
when
|
|
(length headers > 6)
|
|
pfail
|
|
skipSpaces
|
|
text <- munch1 (/= '\n')
|
|
-- Text.ParserCombinators.ReadP.optional (char '\n')
|
|
skipSpaces
|
|
let parsedText = fst $ leftmostLongestParse parseLine text
|
|
return (Header (length headers) parsedText)
|
|
|
|
-- Parse bold text
|
|
parseBold :: ReadP MdToken
|
|
parseBold = parseBoldWith "**" <|> parseBoldWith "__"
|
|
where
|
|
parseBoldWith delim = do
|
|
string delim
|
|
inside <- greedyParse1 parseLineToken
|
|
string delim
|
|
return (Bold (Line inside))
|
|
|
|
-- Parse italic text
|
|
parseItalic :: ReadP MdToken
|
|
parseItalic = parseBoldWith "*" <|> parseBoldWith "_"
|
|
where
|
|
parseBoldWith delim = do
|
|
string delim
|
|
inside <- greedyParse1 parseLineToken
|
|
string delim
|
|
return (Italic (Line inside))
|
|
|
|
-- Parse strikethrough text
|
|
parseStrikethrough :: ReadP MdToken
|
|
parseStrikethrough = do
|
|
string "~~"
|
|
inside <- many1 parseLineToken
|
|
string "~~"
|
|
return (Strikethrough (Line inside))
|
|
|
|
-- Parse a link
|
|
parseLink :: ReadP MdToken
|
|
parseLink = do
|
|
linkText <- between (string "[") (string "]") (many1 get)
|
|
linkURL <- between (string "(") (string ")") (many1 get)
|
|
let parsedLinkText = fst $ leftmostLongestParse parseLine linkText
|
|
return $ Link parsedLinkText (URL linkURL)
|
|
|
|
-- Parse a linebreak character
|
|
parseLinebreak :: ReadP MdToken
|
|
parseLinebreak = do
|
|
char ' '
|
|
many1 (char ' ')
|
|
char '\n'
|
|
return Linebreak
|
|
|
|
parseSingleNewline :: ReadP MdToken
|
|
parseSingleNewline = do
|
|
char '\n'
|
|
return SingleNewline
|
|
|
|
-- Parse an escaped character
|
|
parseEscapedChar :: ReadP MdToken
|
|
parseEscapedChar = do
|
|
char '\\'
|
|
escapedChar <- choice (map char specialChars) -- Parse any of the special chars.
|
|
return (Unit [escapedChar])
|
|
|
|
-- Parse a character as a Unit.
|
|
parseUnit :: ReadP MdToken
|
|
parseUnit = do
|
|
text <- satisfy (`notElem` specialChars)
|
|
return (Unit [text])
|
|
|
|
-- Parse a regular string as a Unit.
|
|
parseString :: ReadP MdToken
|
|
parseString = do
|
|
firstChar <- satisfy (/= '\n') -- Must parse at least one non-newline character here
|
|
text <- munch (`notElem` specialChars)
|
|
return (Unit (firstChar : text))
|
|
|
|
lineParsers :: [ReadP MdToken]
|
|
lineParsers =
|
|
[ parseLinebreak,
|
|
parseSingleNewline,
|
|
parseEscapedChar,
|
|
parseBold,
|
|
parseItalic,
|
|
parseStrikethrough,
|
|
parseLink,
|
|
parseUnit
|
|
] -- A 'line' doesn't include a 'header'
|
|
|
|
listLineParsers :: [ReadP MdToken]
|
|
listLineParsers =
|
|
[ parseLinebreak,
|
|
parseEscapedChar,
|
|
parseBold,
|
|
parseItalic,
|
|
parseStrikethrough,
|
|
parseLink,
|
|
parseUnit
|
|
] -- A list line cannot contain newlines.
|
|
|
|
-- List of all parsers
|
|
allParsers :: [ReadP MdToken]
|
|
allParsers = parseHeader : lineParsers
|
|
|
|
-- Parse any of the line tokens.
|
|
parseLineToken :: ReadP MdToken
|
|
parseLineToken = choice lineParsers
|
|
|
|
-- Parse a line, consisting of one or more tokens.
|
|
parseLine :: ReadP MdToken
|
|
parseLine = do
|
|
skipSpaces
|
|
-- Fail if we have reached the end of the document.
|
|
parsed <- manyTill parseLineToken eof
|
|
return (Line parsed)
|
|
|
|
-- Parse a paragraph, which is a 'Line' (can span multiple actual lines), separated by double-newlines.
|
|
parsePara :: ReadP MdToken
|
|
parsePara = do
|
|
parseMany (char '\n')
|
|
-- text <- many1 (lookaheadParse (\x -> ((length x) < 2) || (take 2 x) /= "\n\n")) -- Parse until a double-newline.
|
|
-- string "\n\n" <|> (eof >> return "") -- Consume the next double-newline or EOF.
|
|
text <- manyTill get (string "\n\n" <|> (eof >> return ""))
|
|
when (null text) pfail
|
|
let parsedText = fst $ leftmostLongestParse parseLine text -- Parse a line
|
|
return (Para parsedText)
|
|
|
|
-- Parse a line starting with '>', return the line except for the '>'.
|
|
parseQuotedLine :: ReadP String
|
|
parseQuotedLine = do
|
|
char '>'
|
|
greedyParse (char ' ' +++ char '\t')
|
|
restOfLine <- munch (/= '\n')
|
|
Text.ParserCombinators.ReadP.optional (char '\n') >> return ""
|
|
return restOfLine
|
|
|
|
-- Parse many 'quoted lines' until I see a non-quoted line.
|
|
parseQuotedLines :: ReadP [String]
|
|
parseQuotedLines =
|
|
greedyParse1 $ do
|
|
look >>= \line ->
|
|
case line of
|
|
('>' : _) -> parseQuotedLine
|
|
_ -> pfail
|
|
|
|
-- Parse a blockquote, which is a greater-than sign followed by a paragraph.
|
|
parseBlockquote :: ReadP MdToken
|
|
parseBlockquote = do
|
|
quotedLines <- parseQuotedLines
|
|
-- remaining <- look
|
|
-- let quotedLines = fst $ leftmostLongestParse parseQuotedLines remaining
|
|
-- string (init $ unlines quotedLines)
|
|
let parsedQuotedLines = fst $ leftmostLongestParse (many1 (parseBlockquote <++ parsePara)) (init $ unlines quotedLines) -- unlines joins the lines together with a newline, and adds a trailing newline. init removes the trailing newline.
|
|
return (Blockquote parsedQuotedLines)
|
|
|
|
-- Parse a nested list item.
|
|
parseUListNested :: ReadP MdToken
|
|
parseUListNested = do
|
|
-- firstChar <- string " " <++ string "\t"
|
|
-- skipSpaces
|
|
-- restOfLine <- manyTill get (void (char '\n') <++ eof)
|
|
-- let restOfLineParsed = fst $ leftmostLongestParse parseLine restOfLine
|
|
-- return restOfLineParsed
|
|
let firstCharParser = string " " <++ string "\t"
|
|
let restOfLineParser = manyTill get (void (char '\n') <++ eof)
|
|
lines <- greedyParse1 (firstCharParser *> restOfLineParser)
|
|
let linesParsed = fst $ leftmostLongestParse parseUnorderedList (init $ unlines lines)
|
|
return linesParsed
|
|
|
|
-- Parse an unordered list line item.
|
|
parseUListLineItem :: ReadP MdToken
|
|
parseUListLineItem = do
|
|
firstChar <- choice (map char ['*', '+', '-'])
|
|
skipSpaces
|
|
restOfLine <- manyTill get (void (char '\n') <++ eof)
|
|
let restOfLineParsed = fst $ leftmostLongestParse parseLine restOfLine
|
|
nestedList <- parseUListNested <++ return (Unit "")
|
|
return $ Line [restOfLineParsed, nestedList]
|
|
|
|
-- Parse an unordered list paragraph item.
|
|
-- This is defined as a line item, followed by an empty line, followed by one or more
|
|
-- lines indented by a space or tab.
|
|
parseUListParaItem :: ReadP MdToken
|
|
parseUListParaItem = do
|
|
firstLine <- parseUListLineItem
|
|
char '\n'
|
|
lines <- greedyParse1 ((string " " <|> string "\t") *> parseTillEol)
|
|
let res = fst $ leftmostLongestParse (greedyParse1 parsePara) (init $ unlines lines)
|
|
char '\n'
|
|
return $ Document (Para firstLine : res) -- I only wrap this in a document because I want some way of converting [MdToken] to MdToken, without any overhead. There is no other reason to wrap it in a Document.
|
|
|
|
-- This is hacky as hell
|
|
-- parsedParas <- manyTillLazy parsePara (string "\n\n" *> choice (map char "*-+"))
|
|
-- return $ Document parsedParas -- I wrap this in a document because I want some way of converting [MdToken] to MdToken, without any overhead. There is no other reason to wrap it in a Document.
|
|
|
|
-- Parse an unordered list item, which can be a line item or another list.
|
|
parseUListItem :: ReadP MdToken
|
|
parseUListItem = parseUListParaItem <++ parseUListLineItem <++ parseUListNested
|
|
|
|
-- Parse an unordered list.
|
|
parseUnorderedList :: ReadP MdToken
|
|
parseUnorderedList = do
|
|
lineItems <- greedyParse1 parseUListItem
|
|
return $ UnordList lineItems
|
|
|
|
-- Parse a document, which is multiple paragraphs.
|
|
parseDocument :: ReadP MdToken
|
|
parseDocument = do
|
|
res <- manyTill (parseHeader <++ parseBlockquote <++ parseUnorderedList <++ parsePara) eof
|
|
return (Document res)
|