@ -4,24 +4,26 @@
module MdToHTML where
import Control.Applicative
import Control.Applicative hiding ( many , some )
import Control.Monad
import Data.Char
import Data.List
import Data.Ord ( comparing )
import qualified Data.Text as T
import Data.Void
import Debug.Trace
import Text.ParserCombinators.ReadP
import Text.Megaparsec
import Text.Megaparsec.Char
import Text.Printf
type Parser = Parsec Void T . Text
type HeaderLevel = Int
newtype URL = URL { getUrl :: String } deriving ( Eq )
newtype ImgPath = ImgPath { getPath :: String } deriving ( Eq )
parseMany :: ReadP a -> ReadP [ a ]
parseMany = Text . ParserCombinators . ReadP . many
data MdToken
= Document [ MdToken ]
| Header HeaderLevel MdToken
@ -36,8 +38,8 @@ data MdToken
| Code MdToken
| Codeblock String
| Link MdToken URL
| Image MdToken ImgPath
| Figure MdToken ImgPath
| Image MdToken URL
| Figure MdToken URL
| Bold MdToken
| Italic MdToken
| Strikethrough MdToken
@ -48,7 +50,7 @@ data MdToken
instance Show MdToken where
show ( Document tokens ) = concatMap show tokens
show ( Header level token ) = " <h " ++ show level ++ " > " ++ show token ++ " </h " ++ show level ++ " > "
show ( Para token ) = " <p> " ++ show token ++ " </p> "
show ( Para token ) = " <p> " ++ show token ++ " </p> \ n "
show ( Line tokens ) = concatMap show tokens
show Linebreak = " <br> "
show SingleNewline = " "
@ -59,8 +61,8 @@ instance Show MdToken where
show ( Code code ) = " <code> " ++ show code ++ " </code> "
show ( Codeblock code ) = show code
show ( Link txt url ) = " <a href= \ " " ++ getUrl url ++ " \ " > " ++ show txt ++ " </a> "
show ( Image txt imgPath ) = " <img src= \ " " ++ get Path imgPath ++ " \ " " ++ " alt= \ " " ++ show txt ++ " \ " /> "
show ( Figure txt imgPath ) = " <figure><img src= \ " " ++ get Path imgPath ++ " \ " alt= \ " " ++ show txt ++ " \ " /><figcaption aria-hidden= \ " true \ " > " ++ show txt ++ " </figcaption></figure> "
show ( Image txt url ) = " <img src= \ " " ++ get Url url ++ " \ " " ++ " alt= \ " " ++ show txt ++ " \ " /> "
show ( Figure txt url ) = " <figure><img src= \ " " ++ get Url url ++ " \ " alt= \ " " ++ show txt ++ " \ " /><figcaption aria-hidden= \ " true \ " > " ++ show txt ++ " </figcaption></figure> "
show ( Bold token ) = " <b> " ++ show token ++ " </b> "
show ( Italic token ) = " <i> " ++ show token ++ " </i> "
show ( Strikethrough token ) = " <s> " ++ show token ++ " </s> "
@ -83,27 +85,26 @@ leftmostLongest xs =
( x : xs ) -> Just x
-- Get the first parse returned by readP_to_S that consumed the most input
leftmostLongestParse :: ( Monoid a ) => ReadP a -> String -> ( a , String )
leftmostLongestParse :: ( Monoid a ) => Parser a -> String -> a
leftmostLongestParse parser input =
let res = leftmostLongest $ readP_to_S parser input
in case res of
Nothing -> ( mempty , mempty )
Just x -> x
case runParser parser " input " ( T . pack input ) of
( Left a ) -> mempty
( Right a ) -> a
specialChars = " \ n \ \ `*_{}[] ()<> #+|"
specialChars = " > \ n \ \ `*_{}[] #+|"
escapableChars = " -~!. " ++ specialChars
escapableChars = " -~!. $() " ++ specialChars
-- Makes a parser greedy. Instead of returning all possible parses, only the longest one is returned.
greedyParse :: ReadP a -> ReadP [ a ]
greedyParse :: Parser a -> Parser [ a ]
greedyParse parser = do
greedyParse1 parser < ++ return []
greedyParse1 parser < |> return []
-- Like greedyParse, but the parser must succeed atleast once.
greedyParse1 :: ReadP a -> ReadP [ a ]
greedyParse1 :: Parser a -> Parser [ a ]
greedyParse1 parser = do
parsed1 <- parser
parsed2 <- greedyParse1 parser < ++ return []
parsed2 <- greedyParse1 parser < |> return []
return ( parsed1 : parsed2 )
prepend :: [ a ] -> [ a ] -> [ a ]
@ -113,124 +114,130 @@ append :: [a] -> [a] -> [a]
append x1 x2 = x2 ++ x1
-- Parse until EOL or EOF
parseTillEol :: ReadP String
parseTillEol = manyTill get ( void ( char ' \ n ' ) < ++ eof )
parseTillEol :: Parser String
parseTillEol = manyTill anySin gl e ( void ( char ' \ n ' ) < |> eof )
-- Takes a list of parsers. Returns a parser that will try them in
-- order, moving to the next one only if the current one fails.
fallthroughParser :: [ ReadP a ] -> ReadP a
fallthroughParser :: [ Parser a ] -> Parser a
fallthroughParser [ x ] = x
fallthroughParser ( x : xs ) = x <++ fallthroughParser xs
fallthroughParser ( x : xs ) = try x <|> fallthroughParser xs
escapeChar :: Char -> String
escapeChar '>' = " > "
escapeChar '<' = " < "
escapeChar '&' = " & "
escapeChar x = [ x ]
htmlEscapeChars :: T . Text -> T . Text
htmlEscapeChars = T . concatMap ( T . pack . escapeChar )
-- ---------------
-- Parse a markdown header, denoted by 1-6 #'s followed by some text, followed by EOL.
parseHeader :: ReadP MdToken
parseHeader :: Parser MdToken
parseHeader = do
skipSpaces
headers <- munch1 ( == '#' )
s pace
headers <- greedyParse1 ( char '#' )
when
( length headers > 6 )
pfail
skipSpaces
text <- munch1 ( /= ' \ n ' )
-- Text.ParserCombinators.ReadP.optional (char '\n')
skipSpaces
let parsedText = fst $ leftmostLongestParse parseLine text
return ( Header ( length headers ) parsedText )
empty
space
parsedText <- manyTill parseLineToken ( void ( char ' \ n ' ) <|> eof )
greedyParse ( char ' \ n ' )
return ( Header ( length headers ) ( Line parsedText ) )
asteriskBold = T . pack " ** "
underscoreBold = T . pack " __ "
-- Parse bold text
parseBold :: ReadP MdToken
parseBold = parseBoldWith " ** " <|> parseBoldWith " __ "
parseBold :: Parser MdToken
parseBold = parseBoldWith asteriskBold <|> parseBoldWith underscoreBold
where
parseBoldWith delim = do
string delim
inside <- greedyParse1 parseLineToken
string delim
inside <- someTill parseLineToken $ string delim
return ( Bold ( Line inside ) )
-- Parse italic text
parseItalic :: ReadP MdToken
parseItalic = parseItalicWith " * " <|> parseItalicWith " _ "
parseItalic :: Parser MdToken
parseItalic = parseItalicWith '*' <|> parseItalicWith '_'
where
parseItalicWith delim = do
string delim
inside <- greedyParse1 parseLineToken
string delim
char delim
inside <- someTill parseLineToken ( char delim )
return ( Italic ( Line inside ) )
-- Parse strikethrough text
parseStrikethrough :: ReadP MdToken
parseStrikethrough :: Parser MdToken
parseStrikethrough = do
string " ~~ "
inside <- many1 parseLineToken
string " ~~ "
string ( T . pack " ~~ " )
inside <- someTill parseLineToken $ string ( T . pack " ~~ " )
return ( Strikethrough ( Line inside ) )
-- Parse code
parseCode :: ReadP MdToken
parseCode :: Parser MdToken
parseCode = do
string " ` "
inside <- many1 get
string " ` "
return ( Code ( Unit inside ) )
char '`'
inside <- manyTill ( satisfy ( /= ' \ n ' ) ) ( char '`' )
return ( Code ( Unit ( concatMap escapeChar inside ) ) )
-- Parse a link
parseLink :: ReadP MdToken
parseLink :: Parser MdToken
parseLink = do
linkText <- between ( string " [ " ) ( string " ] " ) ( many1 get )
linkURL <- between ( string " ( " ) ( string " ) " ) ( many1 get )
let parsedLinkText = fst $ leftmostLongestParse parseLine linkText
return $ Link parsedLinkText ( URL linkURL )
char '['
linkText <- someTill parseLineToken ( char ']' )
char '('
linkURL <- manyTill anySingle ( char ')' )
return $ Link ( Line linkText ) ( URL linkURL )
-- Parse a linebreak character
parseLinebreak :: ReadP MdToken
parseLinebreak :: Parser MdToken
parseLinebreak = do
char ' '
many1 ( char ' ' )
some ( char ' ' )
char ' \ n '
return Linebreak
parseSingleNewline :: ReadP MdToken
parseSingleNewline :: Parser MdToken
parseSingleNewline = do
char ' \ n '
remaining <- look
case remaining of
remaining <- getInput
case T . unpack remaining of
[] -> return $ Unit " "
_ -> return SingleNewline
parseImage :: ReadP MdToken
parseImage :: Parser MdToken
parseImage = do
char '!'
char '['
altText <- many1 ( parseEscapedChar <++ parseUnit )
char ']'
char '('
path <- many1 get
char ')'
return $ Image ( Line altText ) ( ImgPath path )
link <- parseLink
case link of
Link text path -> return $ Image text path
_ -> empty -- This should never be reached
parseFigure = do
img <- parseImage
void ( string " \ n \ n " ) <++ eof
void ( string doubleNewlineText ) <|> eof
case img of
Image text path -> return $ Figure text path
_ -> return img
-- Parse an escaped character
parseEscapedChar :: ReadP MdToken
parseEscapedChar :: Parser MdToken
parseEscapedChar = do
char ' \ \ '
escapedChar <- choice ( map char escapableChars ) -- Parse any of the special chars.
return ( Unit [ escapedChar ] )
-- Parse a character as a Unit.
parseUnit :: ReadP MdToken
parseUnit :: Parser MdToken
parseUnit = do
text <- satisfy ( ` notElem ` specialChars )
-- text <- satisfy (`notElem` specialChars)
text <- anySingle
return ( Unit [ text ] )
lineParsers :: [ ReadP MdToken ]
lineParsers :: [ Parser MdToken ]
lineParsers =
[ parseLinebreak ,
parseSingleNewline ,
@ -244,7 +251,7 @@ lineParsers =
parseUnit
] -- A 'line' doesn't include a 'header'
listLineParsers :: [ ReadP MdToken ]
listLineParsers :: [ Parser MdToken ]
listLineParsers =
[ parseLinebreak ,
parseEscapedChar ,
@ -258,84 +265,85 @@ listLineParsers =
] -- A list line cannot contain newlines.
-- List of all parsers
allParsers :: [ ReadP MdToken ]
allParsers :: [ Parser MdToken ]
allParsers = parseHeader : lineParsers
-- Parse any of the line tokens.
parseLineToken :: ReadP MdToken
parseLineToken :: Parser MdToken
parseLineToken = fallthroughParser lineParsers
-- Parse any of the list line tokens.
parseListLineToken :: ReadP MdToken
parseListLineToken :: Parser MdToken
parseListLineToken = fallthroughParser listLineParsers
-- Parse a line, consisting of one or more tokens.
parseLine :: ReadP MdToken
parseLine :: Parser MdToken
parseLine = do
s kipS paces
s pace
-- Fail if we have reached the end of the document.
parsed <- manyTill parseLineToken eof
return ( Line parsed )
-- Parse a paragraph, which is a 'Line' (can span multiple actual lines), separated by double-newlines.
parsePara :: ReadP MdToken
parsePara :: Parser MdToken
parsePara = do
parseMany ( char ' \ n ' )
space
-- text <- many1 (lookaheadParse (\x -> ((length x) < 2) || (take 2 x) /= "\n\n")) -- Parse until a double-newline.
-- string "\n\n" <|> (eof >> return "") -- Consume the next double-newline or EOF.
text <- manyTill get ( string " \ n \ n " <|> ( eof >> return " " ) )
when ( null text ) pfail
let parsedText = fst $ leftmostLongestParse parseLine text -- Parse a line
parseMany ( char ' \ n ' )
return ( Para parsedText )
parsedText <- someTill parseLineToken ( try paraEnding )
many ( char ' \ n ' )
return ( Para ( Line parsedText ) )
where
paraEnding = void ( char ' \ n ' *> ( char ' \ n ' <|> lookAhead ( char '>' ) ) ) <|> eof
-- Parse a line starting with '>', return the line except for the '>'.
parseQuotedLine :: ReadP String
parseQuotedLine :: Parser String
parseQuotedLine = do
char '>'
greedyParse ( char ' ' +++ char ' \ t ' )
restOfLine <- m unch ( /= ' \ n ' )
Text . ParserCombinators . ReadP . optional ( char ' \ n ' ) >> return " "
many ( char ' ' <|> char ' \ t ' )
restOfLine <- m any ( satisfy ( /= ' \ n ' ) )
void ( char ' \ n ' ) <|> eof
return restOfLine
-- Parse many 'quoted lines' until I see a non-quoted line.
parseQuotedLines :: ReadP [ String ]
parseQuotedLines =
greedyParse1 $ do
look >>= \ line ->
case line of
( '>' : _ ) -> parseQuotedLine
_ -> pfail
parseQuotedLines :: Parser [ String ]
parseQuotedLines = some parseQuotedLine
-- some $ do
-- getInput >>= \line ->
-- case T.unpack line of
-- ('>' : _) -> parseQuotedLine
-- _ -> empty
-- Parse a blockquote, which is a greater-than sign followed by a paragraph.
parseBlockquote :: ReadP MdToken
parseBlockquote :: Parser MdToken
parseBlockquote = do
quotedLines <- parseQuotedLines
-- remaining <- look
-- let quotedLines = fst $ leftmostLongestParse parseQuotedLines remaining
-- string (init $ unlines quotedLines)
let parsedQuotedLines = fst $ leftmostLongestParse ( many1 ( parseBlockquote <++ parsePara ) ) ( init $ unlines quotedLines ) -- unlines joins the lines together with a newline, and adds a trailing newline. init removes the trailing newline.
let parsedQuotedLines = leftmostLongestParse ( some ( parseBlockquote <|> parsePara ) ) ( init $ unlines quotedLines ) -- unlines joins the lines together with a newline, and adds a trailing newline. init removes the trailing newline.
return ( Blockquote parsedQuotedLines )
-- Parse a nested list item.
parseListNested :: ReadP MdToken
parseListNested :: Parser MdToken
parseListNested = do
let firstCharParser = string " " <++ string " \ t "
let restOfLineParser = manyTill get ( void ( char ' \ n ' ) < ++ eof )
let firstCharParser = string ( T . pack " " ) <|> string ( T . pack " \ t " )
let restOfLineParser = manyTill anySin gl e ( void ( char ' \ n ' ) < |> eof )
lines <- greedyParse1 ( firstCharParser *> restOfLineParser )
let linesParsed = fst $ leftmostLongestParse ( parseUnorderedList < ++ parseOrderedList ) ( init $ unlines lines )
when ( null ( show linesParsed ) ) pfail
let linesParsed = leftmostLongestParse ( parseUnorderedList < |> parseOrderedList ) ( init $ unlines lines )
when ( null ( show linesParsed ) ) empty
return linesParsed
-- Parse an unordered list line item.
parseUListLineItem :: ReadP MdToken
parseUListLineItem :: Parser MdToken
parseUListLineItem = do
firstChar <- choice ( map char [ '*' , '+' , '-' ] )
char ' ' -- At least one space between list indicator and list text.
parseListLineItemCommon
-- Parse an ordered list line item.
parseOListLineItem :: ReadP MdToken
parseOListLineItem :: Parser MdToken
parseOListLineItem = do
num <- greedyParse1 ( satisfy isDigit )
char '.'
@ -343,23 +351,22 @@ parseOListLineItem = do
parseListLineItemCommon
-- Common code for parsing list line items
parseListLineItemCommon :: ReadP MdToken
parseListLineItemCommon :: Parser MdToken
parseListLineItemCommon = do
skipSpaces
restOfLine <- many1 parseListLineToken
void ( char ' \ n ' ) <++ eof
nestedList <- parseListNested <++ return ( Unit " " )
space
restOfLine <- manyTill parseListLineToken ( void ( char ' \ n ' ) <|> eof )
nestedList <- parseListNested <|> return ( Unit " " )
return $ Line [ Line restOfLine , nestedList ]
-- Parse an unordered list paragraph item.
parseUListParaItem :: ReadP MdToken
parseUListParaItem :: Parser MdToken
parseUListParaItem = do
firstLine <- parseUListLineItem
res <- parseListParaItemCommon
return $ Document ( Para firstLine : res ) -- I only wrap this in a document because I want some way of converting [MdToken] to MdToken, without any overhead. There is no other reason to wrap it in a Document.
-- Parse an unordered list paragraph item.
parseOListParaItem :: ReadP MdToken
parseOListParaItem :: Parser MdToken
parseOListParaItem = do
firstLine <- parseOListLineItem
res <- parseListParaItemCommon
@ -369,48 +376,54 @@ parseOListParaItem = do
-- A list paragraph item is defined as a line item, followed by an empty line, followed by one or more
-- lines indented by a space or tab.
-- A list paragraph item can also be a blockquote.
parseListParaItemCommon :: ReadP [ MdToken ]
parseListParaItemCommon :: Parser [ MdToken ]
parseListParaItemCommon = do
char ' \ n '
lines <- greedyParse1 ( ( string " " <|> string " \ t " ) *> parseTillEol )
let res = fst $ leftmostLongestParse ( greedyParse1 parseBlockquote < ++ greedyParse1 parsePara ) ( init $ unlines lines )
lines <- greedyParse1 ( ( string ( T . pack " " ) <|> string ( T . pack " \ t " ) ) *> parseTillEol )
let res = leftmostLongestParse ( greedyParse1 parseBlockquote < |> greedyParse1 parsePara ) ( init $ unlines lines )
char ' \ n '
return res -- I only wrap this in a document because I want some way of converting [MdToken] to MdToken, without any overhead. There is no other reason to wrap it in a Document.
-- Parse an unordered list item, which can be a line item or another list.
parseUListItem :: ReadP MdToken
parseUListItem = parseUListParaItem <++ parseUListLineItem
parseUListItem :: Parser MdToken
parseUListItem = try parseUListParaItem <|> parseUListLineItem
-- Parse an unordered list.
parseUnorderedList :: ReadP MdToken
parseUnorderedList :: Parser MdToken
parseUnorderedList = do
lineItems <- greedyPar se1 parseUListItem
void ( char ' \ n ' ) < ++ eof -- A list must end in an extra newline or eof
lineItems <- som e parseUListItem
void ( char ' \ n ' ) < |> eof -- A list must end in an extra newline or eof
return $ UnordList lineItems
-- --------
parseOListItem :: ReadP MdToken
parseOListItem = parseOListParaItem <++ parseOListLineItem
parseOListItem :: Parser MdToken
parseOListItem = try parseOListParaItem <|> parseOListLineItem
-- Parses the first element of an ordered list, which must start with '1.'
parseFirstOListItem :: ReadP MdToken
parseFirstOListItem :: Parser MdToken
parseFirstOListItem = do
remaining <- look
when ( take 2 remaining /= " 1. " ) pfail
remaining <- getInput
when ( take 2 ( T . unpack remaining ) /= " 1. " ) empty
parseOListLineItem
parseOrderedList :: ReadP MdToken
parseOrderedList :: Parser MdToken
parseOrderedList = do
firstLine <- parseFirstOListItem
lineItems <- greedyPar se1 parseOListItem
void ( char ' \ n ' ) < ++ eof
lineItems <- som e parseOListItem
void ( char ' \ n ' ) < |> eof
return $ OrdList ( firstLine : lineItems )
parseHorizontalRule :: ReadP MdToken
parseHorizontalRule = string " --- " *> ( void ( string " \ n \ n " ) <++ eof ) *> return HorizontalRule
horizontalRuleText :: T . Text
horizontalRuleText = T . pack " --- "
doubleNewlineText :: T . Text
doubleNewlineText = T . pack " \ n \ n "
parseHorizontalRule :: Parser MdToken
parseHorizontalRule = string horizontalRuleText *> ( void ( string doubleNewlineText ) <|> eof ) *> return HorizontalRule
documentParsers :: [ ReadP MdToken ]
documentParsers :: [ Parser MdToken ]
documentParsers =
[ parseHorizontalRule ,
parseHeader ,
@ -422,7 +435,7 @@ documentParsers =
]
-- Parse a document, which is multiple paragraphs.
parseDocument :: ReadP MdToken
parseDocument :: Parser MdToken
parseDocument = do
res <- manyTill ( fallthroughParser documentParsers ) eof
return ( Document res )