diff --git a/src/MdToHTML.hs b/src/MdToHTML.hs
index 1b2bbb0..e3dc86a 100644
--- a/src/MdToHTML.hs
+++ b/src/MdToHTML.hs
@@ -4,24 +4,26 @@
module MdToHTML where
-import Control.Applicative
+import Control.Applicative hiding (many, some)
import Control.Monad
import Data.Char
import Data.List
import Data.Ord (comparing)
+import qualified Data.Text as T
+import Data.Void
import Debug.Trace
-import Text.ParserCombinators.ReadP
+import Text.Megaparsec
+import Text.Megaparsec.Char
import Text.Printf
+type Parser = Parsec Void T.Text
+
type HeaderLevel = Int
newtype URL = URL {getUrl :: String} deriving (Eq)
newtype ImgPath = ImgPath {getPath :: String} deriving (Eq)
-parseMany :: ReadP a -> ReadP [a]
-parseMany = Text.ParserCombinators.ReadP.many
-
data MdToken
= Document [MdToken]
| Header HeaderLevel MdToken
@@ -36,8 +38,8 @@ data MdToken
| Code MdToken
| Codeblock String
| Link MdToken URL
- | Image MdToken ImgPath
- | Figure MdToken ImgPath
+ | Image MdToken URL
+ | Figure MdToken URL
| Bold MdToken
| Italic MdToken
| Strikethrough MdToken
@@ -48,7 +50,7 @@ data MdToken
instance Show MdToken where
show (Document tokens) = concatMap show tokens
show (Header level token) = "" ++ show token ++ " "
- show (Para token) = "
" ++ show token ++ "
"
+ show (Para token) = "" ++ show token ++ "
\n"
show (Line tokens) = concatMap show tokens
show Linebreak = " "
show SingleNewline = " "
@@ -59,8 +61,8 @@ instance Show MdToken where
show (Code code) = "" ++ show code ++ "
"
show (Codeblock code) = show code
show (Link txt url) = "" ++ show txt ++ " "
- show (Image txt imgPath) = " "
- show (Figure txt imgPath) = "" ++ show txt ++ " "
+ show (Image txt url) = " "
+ show (Figure txt url) = "" ++ show txt ++ " "
show (Bold token) = "" ++ show token ++ " "
show (Italic token) = "" ++ show token ++ " "
show (Strikethrough token) = "" ++ show token ++ " "
@@ -83,27 +85,26 @@ leftmostLongest xs =
(x : xs) -> Just x
-- Get the first parse returned by readP_to_S that consumed the most input
-leftmostLongestParse :: (Monoid a) => ReadP a -> String -> (a, String)
+leftmostLongestParse :: (Monoid a) => Parser a -> String -> a
leftmostLongestParse parser input =
- let res = leftmostLongest $ readP_to_S parser input
- in case res of
- Nothing -> (mempty, mempty)
- Just x -> x
+ case runParser parser "input" (T.pack input) of
+ (Left a) -> mempty
+ (Right a) -> a
-specialChars = "\n\\`*_{}[]()<>#+|"
+specialChars = ">\n\\`*_{}[]#+|"
-escapableChars = "-~!." ++ specialChars
+escapableChars = "-~!.$()" ++ specialChars
-- Makes a parser greedy. Instead of returning all possible parses, only the longest one is returned.
-greedyParse :: ReadP a -> ReadP [a]
+greedyParse :: Parser a -> Parser [a]
greedyParse parser = do
- greedyParse1 parser <++ return []
+ greedyParse1 parser <|> return []
-- Like greedyParse, but the parser must succeed atleast once.
-greedyParse1 :: ReadP a -> ReadP [a]
+greedyParse1 :: Parser a -> Parser [a]
greedyParse1 parser = do
parsed1 <- parser
- parsed2 <- greedyParse1 parser <++ return []
+ parsed2 <- greedyParse1 parser <|> return []
return (parsed1 : parsed2)
prepend :: [a] -> [a] -> [a]
@@ -113,122 +114,130 @@ append :: [a] -> [a] -> [a]
append x1 x2 = x2 ++ x1
-- Parse until EOL or EOF
-parseTillEol :: ReadP String
-parseTillEol = manyTill get (void (char '\n') <++ eof)
+parseTillEol :: Parser String
+parseTillEol = manyTill anySingle (void (char '\n') <|> eof)
-- Takes a list of parsers. Returns a parser that will try them in
-- order, moving to the next one only if the current one fails.
-fallthroughParser :: [ReadP a] -> ReadP a
+fallthroughParser :: [Parser a] -> Parser a
fallthroughParser [x] = x
-fallthroughParser (x : xs) = x <++ fallthroughParser xs
+fallthroughParser (x : xs) = try x <|> fallthroughParser xs
+
+escapeChar :: Char -> String
+escapeChar '>' = ">"
+escapeChar '<' = "<"
+escapeChar '&' = "&"
+escapeChar x = [x]
+
+htmlEscapeChars :: T.Text -> T.Text
+htmlEscapeChars = T.concatMap (T.pack . escapeChar)
-- ---------------
-- Parse a markdown header, denoted by 1-6 #'s followed by some text, followed by EOL.
-parseHeader :: ReadP MdToken
+parseHeader :: Parser MdToken
parseHeader = do
- skipSpaces
- headers <- munch1 (== '#')
+ space
+ headers <- greedyParse1 (char '#')
when
(length headers > 6)
- pfail
- skipSpaces
- parsedText <- manyTill parseLineToken (void (char '\n') <++ eof)
+ empty
+ space
+ parsedText <- manyTill parseLineToken (void (char '\n') <|> eof)
greedyParse (char '\n')
return (Header (length headers) (Line parsedText))
+asteriskBold = T.pack "**"
+
+underscoreBold = T.pack "__"
+
-- Parse bold text
-parseBold :: ReadP MdToken
-parseBold = parseBoldWith "**" <|> parseBoldWith "__"
+parseBold :: Parser MdToken
+parseBold = parseBoldWith asteriskBold <|> parseBoldWith underscoreBold
where
parseBoldWith delim = do
string delim
- inside <- greedyParse1 parseLineToken
- string delim
+ inside <- someTill parseLineToken $ string delim
return (Bold (Line inside))
-- Parse italic text
-parseItalic :: ReadP MdToken
-parseItalic = parseItalicWith "*" <|> parseItalicWith "_"
+parseItalic :: Parser MdToken
+parseItalic = parseItalicWith '*' <|> parseItalicWith '_'
where
parseItalicWith delim = do
- string delim
- inside <- greedyParse1 parseLineToken
- string delim
+ char delim
+ inside <- someTill parseLineToken (char delim)
return (Italic (Line inside))
-- Parse strikethrough text
-parseStrikethrough :: ReadP MdToken
+parseStrikethrough :: Parser MdToken
parseStrikethrough = do
- string "~~"
- inside <- many1 parseLineToken
- string "~~"
+ string (T.pack "~~")
+ inside <- someTill parseLineToken $ string (T.pack "~~")
return (Strikethrough (Line inside))
-- Parse code
-parseCode :: ReadP MdToken
+parseCode :: Parser MdToken
parseCode = do
- string "`"
- inside <- many1 get
- string "`"
- return (Code (Unit inside))
+ char '`'
+ inside <- manyTill (satisfy (/= '\n')) (char '`')
+ return (Code (Unit (concatMap escapeChar inside)))
-- Parse a link
-parseLink :: ReadP MdToken
+parseLink :: Parser MdToken
parseLink = do
- linkText <- between (string "[") (string "]") (many1 get)
- linkURL <- between (string "(") (string ")") (many1 get)
- let parsedLinkText = fst $ leftmostLongestParse parseLine linkText
- return $ Link parsedLinkText (URL linkURL)
+ char '['
+ linkText <- someTill parseLineToken (char ']')
+ char '('
+ linkURL <- manyTill anySingle (char ')')
+ return $ Link (Line linkText) (URL linkURL)
-- Parse a linebreak character
-parseLinebreak :: ReadP MdToken
+parseLinebreak :: Parser MdToken
parseLinebreak = do
char ' '
- many1 (char ' ')
+ some (char ' ')
char '\n'
return Linebreak
-parseSingleNewline :: ReadP MdToken
+parseSingleNewline :: Parser MdToken
parseSingleNewline = do
char '\n'
- remaining <- look
- case remaining of
+ remaining <- getInput
+ case T.unpack remaining of
[] -> return $ Unit ""
_ -> return SingleNewline
-parseImage :: ReadP MdToken
+parseImage :: Parser MdToken
parseImage = do
char '!'
- char '['
- altText <- many1 (parseEscapedChar <++ parseUnit)
- char ']'
- char '('
- path <- many1 get
- char ')'
- return $ Image (Line altText) (ImgPath path)
+ link <- parseLink
+ case link of
+ Link text path -> return $ Image text path
+ _ -> empty -- This should never be reached
parseFigure = do
img <- parseImage
- void (string "\n\n") <++ eof
+ void (string doubleNewlineText) <|> eof
case img of
Image text path -> return $ Figure text path
_ -> return img
-- Parse an escaped character
-parseEscapedChar :: ReadP MdToken
+parseEscapedChar :: Parser MdToken
parseEscapedChar = do
char '\\'
escapedChar <- choice (map char escapableChars) -- Parse any of the special chars.
return (Unit [escapedChar])
-- Parse a character as a Unit.
-parseUnit :: ReadP MdToken
+parseUnit :: Parser MdToken
parseUnit = do
- text <- satisfy (`notElem` specialChars)
+ -- text <- satisfy (`notElem` specialChars)
+ text <- anySingle
return (Unit [text])
-lineParsers :: [ReadP MdToken]
+lineParsers :: [Parser MdToken]
lineParsers =
[ parseLinebreak,
parseSingleNewline,
@@ -242,7 +251,7 @@ lineParsers =
parseUnit
] -- A 'line' doesn't include a 'header'
-listLineParsers :: [ReadP MdToken]
+listLineParsers :: [Parser MdToken]
listLineParsers =
[ parseLinebreak,
parseEscapedChar,
@@ -256,84 +265,85 @@ listLineParsers =
] -- A list line cannot contain newlines.
-- List of all parsers
-allParsers :: [ReadP MdToken]
+allParsers :: [Parser MdToken]
allParsers = parseHeader : lineParsers
-- Parse any of the line tokens.
-parseLineToken :: ReadP MdToken
+parseLineToken :: Parser MdToken
parseLineToken = fallthroughParser lineParsers
-- Parse any of the list line tokens.
-parseListLineToken :: ReadP MdToken
+parseListLineToken :: Parser MdToken
parseListLineToken = fallthroughParser listLineParsers
-- Parse a line, consisting of one or more tokens.
-parseLine :: ReadP MdToken
+parseLine :: Parser MdToken
parseLine = do
- skipSpaces
+ space
-- Fail if we have reached the end of the document.
parsed <- manyTill parseLineToken eof
return (Line parsed)
-- Parse a paragraph, which is a 'Line' (can span multiple actual lines), separated by double-newlines.
-parsePara :: ReadP MdToken
+parsePara :: Parser MdToken
parsePara = do
- parseMany (char '\n')
+ space
-- text <- many1 (lookaheadParse (\x -> ((length x) < 2) || (take 2 x) /= "\n\n")) -- Parse until a double-newline.
-- string "\n\n" <|> (eof >> return "") -- Consume the next double-newline or EOF.
- text <- manyTill get (string "\n\n" <|> (eof >> return ""))
- when (null text) pfail
- let parsedText = fst $ leftmostLongestParse parseLine text -- Parse a line
- parseMany (char '\n')
- return (Para parsedText)
+ parsedText <- someTill parseLineToken (try paraEnding)
+ many (char '\n')
+ return (Para (Line parsedText))
+ where
+ paraEnding = void (char '\n' *> (char '\n' <|> lookAhead (char '>'))) <|> eof
-- Parse a line starting with '>', return the line except for the '>'.
-parseQuotedLine :: ReadP String
+parseQuotedLine :: Parser String
parseQuotedLine = do
char '>'
- greedyParse (char ' ' +++ char '\t')
- restOfLine <- munch (/= '\n')
- Text.ParserCombinators.ReadP.optional (char '\n') >> return ""
+ many (char ' ' <|> char '\t')
+ restOfLine <- many (satisfy (/= '\n'))
+ void (char '\n') <|> eof
return restOfLine
-- Parse many 'quoted lines' until I see a non-quoted line.
-parseQuotedLines :: ReadP [String]
-parseQuotedLines =
- greedyParse1 $ do
- look >>= \line ->
- case line of
- ('>' : _) -> parseQuotedLine
- _ -> pfail
+parseQuotedLines :: Parser [String]
+parseQuotedLines = some parseQuotedLine
+
+-- some $ do
+-- getInput >>= \line ->
+-- case T.unpack line of
+-- ('>' : _) -> parseQuotedLine
+-- _ -> empty
-- Parse a blockquote, which is a greater-than sign followed by a paragraph.
-parseBlockquote :: ReadP MdToken
+parseBlockquote :: Parser MdToken
parseBlockquote = do
quotedLines <- parseQuotedLines
-- remaining <- look
-- let quotedLines = fst $ leftmostLongestParse parseQuotedLines remaining
-- string (init $ unlines quotedLines)
- let parsedQuotedLines = fst $ leftmostLongestParse (many1 (parseBlockquote <++ parsePara)) (init $ unlines quotedLines) -- unlines joins the lines together with a newline, and adds a trailing newline. init removes the trailing newline.
+ let parsedQuotedLines = leftmostLongestParse (some (parseBlockquote <|> parsePara)) (init $ unlines quotedLines) -- unlines joins the lines together with a newline, and adds a trailing newline. init removes the trailing newline.
return (Blockquote parsedQuotedLines)
-- Parse a nested list item.
-parseListNested :: ReadP MdToken
+parseListNested :: Parser MdToken
parseListNested = do
- let firstCharParser = string " " <++ string "\t"
- let restOfLineParser = manyTill get (void (char '\n') <++ eof)
+ let firstCharParser = string (T.pack " ") <|> string (T.pack "\t")
+ let restOfLineParser = manyTill anySingle (void (char '\n') <|> eof)
lines <- greedyParse1 (firstCharParser *> restOfLineParser)
- let linesParsed = fst $ leftmostLongestParse (parseUnorderedList <++ parseOrderedList) (init $ unlines lines)
- when (null (show linesParsed)) pfail
+ let linesParsed = leftmostLongestParse (parseUnorderedList <|> parseOrderedList) (init $ unlines lines)
+ when (null (show linesParsed)) empty
return linesParsed
-- Parse an unordered list line item.
-parseUListLineItem :: ReadP MdToken
+parseUListLineItem :: Parser MdToken
parseUListLineItem = do
firstChar <- choice (map char ['*', '+', '-'])
char ' ' -- At least one space between list indicator and list text.
parseListLineItemCommon
-- Parse an ordered list line item.
-parseOListLineItem :: ReadP MdToken
+parseOListLineItem :: Parser MdToken
parseOListLineItem = do
num <- greedyParse1 (satisfy isDigit)
char '.'
@@ -341,23 +351,22 @@ parseOListLineItem = do
parseListLineItemCommon
-- Common code for parsing list line items
-parseListLineItemCommon :: ReadP MdToken
+parseListLineItemCommon :: Parser MdToken
parseListLineItemCommon = do
- skipSpaces
- restOfLine <- many1 parseListLineToken
- void (char '\n') <++ eof
- nestedList <- parseListNested <++ return (Unit "")
+ space
+ restOfLine <- manyTill parseListLineToken (void (char '\n') <|> eof)
+ nestedList <- parseListNested <|> return (Unit "")
return $ Line [Line restOfLine, nestedList]
-- Parse an unordered list paragraph item.
-parseUListParaItem :: ReadP MdToken
+parseUListParaItem :: Parser MdToken
parseUListParaItem = do
firstLine <- parseUListLineItem
res <- parseListParaItemCommon
return $ Document (Para firstLine : res) -- I only wrap this in a document because I want some way of converting [MdToken] to MdToken, without any overhead. There is no other reason to wrap it in a Document.
-- Parse an unordered list paragraph item.
-parseOListParaItem :: ReadP MdToken
+parseOListParaItem :: Parser MdToken
parseOListParaItem = do
firstLine <- parseOListLineItem
res <- parseListParaItemCommon
@@ -367,48 +376,54 @@ parseOListParaItem = do
-- A list paragraph item is defined as a line item, followed by an empty line, followed by one or more
-- lines indented by a space or tab.
-- A list paragraph item can also be a blockquote.
-parseListParaItemCommon :: ReadP [MdToken]
+parseListParaItemCommon :: Parser [MdToken]
parseListParaItemCommon = do
char '\n'
- lines <- greedyParse1 ((string " " <|> string "\t") *> parseTillEol)
- let res = fst $ leftmostLongestParse (greedyParse1 parseBlockquote <++ greedyParse1 parsePara) (init $ unlines lines)
+ lines <- greedyParse1 ((string (T.pack " ") <|> string (T.pack "\t")) *> parseTillEol)
+ let res = leftmostLongestParse (greedyParse1 parseBlockquote <|> greedyParse1 parsePara) (init $ unlines lines)
char '\n'
return res -- I only wrap this in a document because I want some way of converting [MdToken] to MdToken, without any overhead. There is no other reason to wrap it in a Document.
-- Parse an unordered list item, which can be a line item or another list.
-parseUListItem :: ReadP MdToken
-parseUListItem = parseUListParaItem <++ parseUListLineItem
+parseUListItem :: Parser MdToken
+parseUListItem = try parseUListParaItem <|> parseUListLineItem
-- Parse an unordered list.
-parseUnorderedList :: ReadP MdToken
+parseUnorderedList :: Parser MdToken
parseUnorderedList = do
- lineItems <- greedyParse1 parseUListItem
- void (char '\n') <++ eof -- A list must end in an extra newline or eof
+ lineItems <- some parseUListItem
+ void (char '\n') <|> eof -- A list must end in an extra newline or eof
return $ UnordList lineItems
-- --------
-parseOListItem :: ReadP MdToken
-parseOListItem = parseOListParaItem <++ parseOListLineItem
+parseOListItem :: Parser MdToken
+parseOListItem = try parseOListParaItem <|> parseOListLineItem
-- Parses the first element of an ordered list, which must start with '1.'
-parseFirstOListItem :: ReadP MdToken
+parseFirstOListItem :: Parser MdToken
parseFirstOListItem = do
- remaining <- look
- when (take 2 remaining /= "1.") pfail
+ remaining <- getInput
+ when (take 2 (T.unpack remaining) /= "1.") empty
parseOListLineItem
-parseOrderedList :: ReadP MdToken
+parseOrderedList :: Parser MdToken
parseOrderedList = do
firstLine <- parseFirstOListItem
- lineItems <- greedyParse1 parseOListItem
- void (char '\n') <++ eof
+ lineItems <- some parseOListItem
+ void (char '\n') <|> eof
return $ OrdList (firstLine : lineItems)
-parseHorizontalRule :: ReadP MdToken
-parseHorizontalRule = string "---" *> (void (string "\n\n") <++ eof) *> return HorizontalRule
+horizontalRuleText :: T.Text
+horizontalRuleText = T.pack "---"
+
+doubleNewlineText :: T.Text
+doubleNewlineText = T.pack "\n\n"
+
+parseHorizontalRule :: Parser MdToken
+parseHorizontalRule = string horizontalRuleText *> (void (string doubleNewlineText) <|> eof) *> return HorizontalRule
-documentParsers :: [ReadP MdToken]
+documentParsers :: [Parser MdToken]
documentParsers =
[ parseHorizontalRule,
parseHeader,
@@ -420,7 +435,7 @@ documentParsers =
]
-- Parse a document, which is multiple paragraphs.
-parseDocument :: ReadP MdToken
+parseDocument :: Parser MdToken
parseDocument = do
res <- manyTill (fallthroughParser documentParsers) eof
return (Document res)
diff --git a/src/MdToHtmlTest.hs b/src/MdToHtmlTest.hs
index 239eb86..a42c6b9 100644
--- a/src/MdToHtmlTest.hs
+++ b/src/MdToHtmlTest.hs
@@ -7,7 +7,7 @@ check_equal :: String -> String -> String -> Test
check_equal desc expected actual = TestCase (assertEqual desc expected actual)
convert :: String -> String
-convert md = show . fst $ leftmostLongestParse parseDocument md
+convert md = show $ leftmostLongestParse parseDocument md
headerTests =
TestList
@@ -24,8 +24,8 @@ boldTests =
[ check_equal "Should convert bold" "Hello
" (convert "__Hello__"),
check_equal "Should convert italic" "Hello
" (convert "_Hello_"),
check_equal "Should convert bold and italic in a sentence" "It is a wonderful day
" (convert "It _is_ a __wonderful__ day"),
- check_equal "Should convert nested bold and italic" "Bold then Italic
" (convert "**Bold then *Italic***"),
- check_equal "Should convert nested bold and italic" "Italic then Bold
" (convert "*Italic then **Bold***")
+ check_equal "Should convert nested bold and italic" "Bold then Italic
" (convert "**Bold then _Italic_**"),
+ check_equal "Should convert nested bold and italic" "Italic then Bold
" (convert "*Italic then __Bold__*")
]
strikethroughTests =
@@ -93,11 +93,15 @@ orderedListTests =
check_equal "Unordered list in ordered list" "Item 1 Item 2 Item 3 " (convert "1. Item 1\n2. Item 2\n - Item 1\n * Item 2\n4. Item 3")
]
+htmlTests =
+ TestList
+ [check_equal "Convert HTML element" "
a " (convert "a ")]
+
codeTests =
TestList
[ check_equal "Code by itself" "Hello world!
" (convert "`Hello world!`"),
check_equal "Code in a paragraph" "The following text
is code
" (convert "The following `text` is code"),
- check_equal "Code across paragraphs (shouldn't work" "
" (convert "`Incomplete\n\nCode`") -- At the moment, this is just treated as a syntax error, so nothing is rendered.
+ check_equal "Code across paragraphs (shouldn't work)" "`Incomplete
Code`
" (convert "`Incomplete\n\nCode`") -- At the moment, this is just treated as a syntax error, so nothing is rendered.
]
imageTests =
@@ -149,6 +153,7 @@ tests =
unorderedListTests,
orderedListTests,
imageTests,
+ htmlTests,
figureTests,
codeTests,
horizontalRuleTests,