3 Commits

4 changed files with 170 additions and 417 deletions

View File

@@ -1,30 +1,8 @@
module Main where module Main where
import MdToHTML import MdToHTML
import System.Environment
import System.IO
readLinesHelper :: [String] -> IO [String]
readLinesHelper xs = do
done <- isEOF
if done
then return xs
else do
line <- getLine
let xs' = line : xs
readLinesHelper xs'
readLines :: IO [String]
readLines = reverse <$> readLinesHelper []
main :: IO () main :: IO ()
main = do main = do
args <- getArgs let res = fst $ leftmostLongestParse parseDocument "# _Hello_\n"
fileContents <- case args of putStrLn (show res)
[] -> getContents
x : _ -> readFile x
let res = leftmostLongestParse parseDocument fileContents
let toPrint = prettyPrint res
case reverse toPrint of
'\n' : _ -> putStr toPrint
_ -> putStrLn toPrint

View File

@@ -58,20 +58,19 @@ library
exposed-modules: MdToHTML exposed-modules: MdToHTML
other-modules: MdToHtmlTest other-modules: MdToHtmlTest
build-depends: base ^>=4.19.1.0, build-depends: base ^>=4.19.1.0,
HUnit, HUnit
megaparsec,
parser-combinators,
text,
MissingH,
word-wrap
executable mdtoh executable md-to-html-runner
-- Import common warning flags. -- Import common warning flags.
import: warnings import: warnings
-- .hs or .lhs file containing the Main module. -- .hs or .lhs file containing the Main module.
main-is: Main.hs main-is: Main.hs
-- Modules included in this executable, other than Main.
other-modules:
MdToHTML
MdToHtmlTest
-- LANGUAGE extensions used by modules in this package. -- LANGUAGE extensions used by modules in this package.
-- other-extensions: -- other-extensions:

View File

@@ -4,31 +4,24 @@
module MdToHTML where module MdToHTML where
import Control.Applicative hiding (many, some) import Control.Applicative
import Control.Monad import Control.Monad
import Control.Monad.Combinators (count)
import Data.Char import Data.Char
import Data.List import Data.List
import Data.Ord (comparing) import Data.Ord (comparing)
import Data.String.Utils
import qualified Data.Text as T
import Data.Void
import Debug.Trace import Debug.Trace
import Text.Megaparsec import Text.ParserCombinators.ReadP
import Text.Megaparsec.Char
import Text.Printf import Text.Printf
import Text.Wrap
type Parser = Parsec Void T.Text
type HeaderLevel = Int type HeaderLevel = Int
type CssClass = String
newtype URL = URL {getUrl :: String} deriving (Eq) newtype URL = URL {getUrl :: String} deriving (Eq)
newtype ImgPath = ImgPath {getPath :: String} deriving (Eq) newtype ImgPath = ImgPath {getPath :: String} deriving (Eq)
parseMany :: ReadP a -> ReadP [a]
parseMany = Text.ParserCombinators.ReadP.many
data MdToken data MdToken
= Document [MdToken] = Document [MdToken]
| Header HeaderLevel MdToken | Header HeaderLevel MdToken
@@ -40,17 +33,12 @@ data MdToken
| Blockquote [MdToken] | Blockquote [MdToken]
| UnordList [MdToken] | UnordList [MdToken]
| OrdList [MdToken] | OrdList [MdToken]
| Checkbox Bool
| Code MdToken | Code MdToken
| Table [[MdToken]] | Codeblock String
| Codeblock MdToken
| Link MdToken URL | Link MdToken URL
| Image MdToken URL (Maybe [CssClass]) | Image MdToken ImgPath
| Figure MdToken URL (Maybe [CssClass])
| Bold MdToken | Bold MdToken
| Italic MdToken | Italic MdToken
| Superscript MdToken
| Subscript MdToken
| Strikethrough MdToken | Strikethrough MdToken
| Unit String | Unit String
deriving (Eq) deriving (Eq)
@@ -61,38 +49,21 @@ instance Show MdToken where
show (Header level token) = "<h" ++ show level ++ ">" ++ show token ++ "</h" ++ show level ++ ">" show (Header level token) = "<h" ++ show level ++ ">" ++ show token ++ "</h" ++ show level ++ ">"
show (Para token) = "<p>" ++ show token ++ "</p>" show (Para token) = "<p>" ++ show token ++ "</p>"
show (Line tokens) = concatMap show tokens show (Line tokens) = concatMap show tokens
show Linebreak = "<br />" show Linebreak = "<br>"
show SingleNewline = " " show SingleNewline = " "
show HorizontalRule = "<hr>" show HorizontalRule = "<hr>"
show (Blockquote tokens) = "<blockquote>" ++ concatMap show tokens ++ "</blockquote>" show (Blockquote tokens) = "<blockquote>" ++ concatMap show tokens ++ "</blockquote>"
show (UnordList tokens) = "<ul>" ++ concatMap (prepend "<li>" . append "</li>" . show) tokens ++ "</ul>" show (UnordList tokens) = "<ul>" ++ concatMap (prepend "<li>" . append "</li>" . show) tokens ++ "</ul>"
show (OrdList tokens) = "<ol>" ++ concatMap (prepend "<li>" . append "</li>" . show) tokens ++ "</ol>" show (OrdList tokens) = "<ol>" ++ concatMap (prepend "<li>" . append "</li>" . show) tokens ++ "</ol>"
show (Checkbox isChecked) = "<input type=\"checkbox\"" ++ (if isChecked then " checked=\"\"" else "") ++ " />" show (Code code) = "<code>" ++ show code ++ "</code>"
show (Code code) = "<code>" ++ strip (show code) ++ "</code>" show (Codeblock code) = show code
show (Table (thead : tokenGrid)) = "<table><thead><tr>" ++ concatMap (\x -> "<th>" ++ rstrip (show x) ++ "</th>") thead ++ "</tr></thead>" ++ "<tbody>" ++ concatMap (\x -> "<tr>" ++ concatMap (\y -> "<td>" ++ rstrip (show y) ++ "</td>") x ++ "</tr>") tokenGrid ++ "</tbody></table>"
show (Codeblock code) = "<pre><code>" ++ show code ++ "</code></pre>"
show (Link txt url) = "<a href=\"" ++ getUrl url ++ "\">" ++ show txt ++ "</a>" show (Link txt url) = "<a href=\"" ++ getUrl url ++ "\">" ++ show txt ++ "</a>"
show (Image txt url cssClasses) = "<img src=\"" ++ getUrl url ++ "\"" ++ " alt=\"" ++ show txt ++ "\"" ++ maybe "" (\classes -> " class=\"" ++ unwords classes ++ "\"") cssClasses ++ "/>" show (Image txt imgPath) = "<img src=" ++ getPath imgPath ++ ">" ++ show txt ++ "</img>"
show (Figure txt url cssClasses) = "<figure><img src=\"" ++ getUrl url ++ "\" alt=\"" ++ show txt ++ "\"" ++ maybe "" (\classes -> " class=\"" ++ unwords classes ++ "\"") cssClasses ++ "/><figcaption aria-hidden=\"true\">" ++ show txt ++ "</figcaption></figure>"
show (Bold token) = "<b>" ++ show token ++ "</b>" show (Bold token) = "<b>" ++ show token ++ "</b>"
show (Italic token) = "<i>" ++ show token ++ "</i>" show (Italic token) = "<i>" ++ show token ++ "</i>"
show (Superscript token) = "<sup>" ++ show token ++ "</sup>"
show (Subscript token) = "<sub>" ++ show token ++ "</sub>"
show (Strikethrough token) = "<s>" ++ show token ++ "</s>" show (Strikethrough token) = "<s>" ++ show token ++ "</s>"
show (Unit unit) = printf "%s" unit show (Unit unit) = printf "%s" unit
-- Pretty print the given token into a string.
-- This is the same as calling 'show' for most tokens, but is different for paragraphs and tables,
-- which have newlines inserted into them.
prettyPrint :: MdToken -> String
prettyPrint (Para token) = "<p>" ++ T.unpack (wrapText defaultWrapSettings 70 (T.pack $ prettyPrint token)) ++ "</p>\n"
prettyPrint (Table (thead : tokenGrid)) = "<table>\n<thead>\n<tr>\n" ++ concatMap (\x -> "<th>" ++ rstrip (prettyPrint x) ++ "</th>\n") thead ++ "</tr>\n</thead>\n" ++ "<tbody>\n" ++ concatMap (\x -> "<tr>\n" ++ concatMap (\y -> "<td>" ++ rstrip (prettyPrint y) ++ "</td>\n") x ++ "</tr>\n") tokenGrid ++ "</tbody>\n</table>\n"
prettyPrint Linebreak = "<br />\n"
prettyPrint HorizontalRule = "<hr>\n"
prettyPrint (Line tokens) = concatMap prettyPrint tokens
prettyPrint (Document tokens) = concatMap prettyPrint tokens
prettyPrint token = show token
instance Semigroup MdToken where instance Semigroup MdToken where
a <> b = Document [a, b] a <> b = Document [a, b]
@@ -110,26 +81,27 @@ leftmostLongest xs =
(x : xs) -> Just x (x : xs) -> Just x
-- Get the first parse returned by readP_to_S that consumed the most input -- Get the first parse returned by readP_to_S that consumed the most input
leftmostLongestParse :: (Monoid a) => Parser a -> String -> a leftmostLongestParse :: (Monoid a) => ReadP a -> String -> (a, String)
leftmostLongestParse parser input = leftmostLongestParse parser input =
case runParser parser "input" (T.pack input) of let res = leftmostLongest $ readP_to_S parser input
(Left a) -> mempty in case res of
(Right a) -> a Nothing -> (mempty, mempty)
Just x -> x
specialChars = ">\n\\`*_{}[]#+|" specialChars = "\\#*_[\n`"
escapableChars = "-~!.$()" ++ specialChars escapableChars = '~' : specialChars
-- Makes a parser greedy. Instead of returning all possible parses, only the longest one is returned. -- Makes a parser greedy. Instead of returning all possible parses, only the longest one is returned.
greedyParse :: Parser a -> Parser [a] greedyParse :: ReadP a -> ReadP [a]
greedyParse parser = do greedyParse parser = do
greedyParse1 parser <|> return [] greedyParse1 parser <++ return []
-- Like greedyParse, but the parser must succeed atleast once. -- Like greedyParse, but the parser must succeed atleast once.
greedyParse1 :: Parser a -> Parser [a] greedyParse1 :: ReadP a -> ReadP [a]
greedyParse1 parser = do greedyParse1 parser = do
parsed1 <- parser parsed1 <- parser
parsed2 <- greedyParse1 parser <|> return [] parsed2 <- greedyParse1 parser <++ return []
return (parsed1 : parsed2) return (parsed1 : parsed2)
prepend :: [a] -> [a] -> [a] prepend :: [a] -> [a] -> [a]
@@ -139,328 +111,222 @@ append :: [a] -> [a] -> [a]
append x1 x2 = x2 ++ x1 append x1 x2 = x2 ++ x1
-- Parse until EOL or EOF -- Parse until EOL or EOF
parseTillEol :: Parser String parseTillEol :: ReadP String
parseTillEol = manyTill anySingle (void (char '\n') <|> eof) parseTillEol = manyTill get (void (char '\n') <++ eof)
-- Takes a list of parsers. Returns a parser that will try them in -- Takes a list of parsers. Returns a parser that will try them in
-- order, moving to the next one only if the current one fails. -- order, moving to the next one only if the current one fails.
fallthroughParser :: [Parser a] -> Parser a fallthroughParser :: [ReadP a] -> ReadP a
fallthroughParser [x] = try x fallthroughParser [x] = x
fallthroughParser (x : xs) = try x <|> fallthroughParser xs fallthroughParser (x : xs) = x <++ fallthroughParser xs
escapeChar :: Char -> String myMany :: (Monoid a) => ReadP a -> ReadP [a]
escapeChar '>' = "&gt;" myMany p = do
escapeChar '<' = "&lt;" remaining <- look
escapeChar '&' = "&amp;" case remaining of
escapeChar x = [x] [] -> return []
_ -> return [] +++ myMany1 p
htmlEscapeChars :: T.Text -> T.Text myMany1 :: (Monoid a) => ReadP a -> ReadP [a]
htmlEscapeChars = T.concatMap (T.pack . escapeChar) myMany1 p = liftM2 (:) p (myMany p)
-- -- Wraps a list of words after (at most) the given number of characters, trying to prevent word-breaks
-- wordwrap :: Int -> String -> String
-- wordwrap wraplength str = if (length str) < wraplength
-- then str
-- else
-- let spaceIndex = lastgtSpaceIndex 0 (takeRev (length str) - wraplength str)
--
-- where
-- takeRev n = (reverse . take n . reverse)
-- lastSpaceIndex counter str = case str of
-- [] -> counter
-- x:xs -> if (isSpace x) counter else lastSpaceIndex counter+1 xs
-- --------------- -- ---------------
-- Parse a markdown header, denoted by 1-6 #'s followed by some text, followed by EOL. -- Parse a markdown header, denoted by 1-6 #'s followed by some text, followed by EOL.
parseHeader :: Parser MdToken parseHeader :: ReadP MdToken
parseHeader = do parseHeader = do
space skipSpaces
headers <- greedyParse1 (char '#') headers <- munch1 (== '#')
when when
(length headers > 6) (length headers > 6)
empty pfail
space skipSpaces
parsedText <- manyTill parseLineToken (void (char '\n') <|> eof) text <- munch1 (/= '\n')
greedyParse (char '\n') -- Text.ParserCombinators.ReadP.optional (char '\n')
return (Header (length headers) (Line parsedText)) skipSpaces
let parsedText = fst $ leftmostLongestParse parseLine text
asteriskBold = T.pack "**" return (Header (length headers) parsedText)
underscoreBold = T.pack "__"
-- Parse bold text -- Parse bold text
parseBold :: Parser MdToken parseBold :: ReadP MdToken
parseBold = parseBoldWith asteriskBold <|> parseBoldWith underscoreBold parseBold = parseBoldWith "**" <|> parseBoldWith "__"
where where
parseBoldWith delim = do parseBoldWith delim = do
string delim string delim
inside <- someTill parseLineToken $ string delim inside <- myMany1 parseLineToken
string delim
return (Bold (Line inside)) return (Bold (Line inside))
-- Parse italic text -- Parse italic text
parseItalic :: Parser MdToken parseItalic :: ReadP MdToken
parseItalic = parseItalicWith '*' <|> parseItalicWith '_' parseItalic = parseItalicWith '*' <|> parseItalicWith '_'
where where
parseItalicWith delim = do parseItalicWith delim = do
char delim exactlyOnce delim
inside <- someTill parseLineToken (char delim) inside <- myMany1 parseLineToken
exactlyOnce delim
return (Italic (Line inside)) return (Italic (Line inside))
exactlyOnce ch = do
-- Parse subscript char ch
parseSubscript :: Parser MdToken remaining <- look
parseSubscript = do case remaining of
char '~' [] -> return ch
inside <- someTill parseLineToken (char '~') x : xs -> if x == ch then pfail else return ch
return (Subscript (Line inside))
-- Parse superscript
parseSuperscript :: Parser MdToken
parseSuperscript = do
char '^'
inside <- someTill parseLineToken (char '^')
return (Superscript (Line inside))
-- Parse strikethrough text -- Parse strikethrough text
parseStrikethrough :: Parser MdToken parseStrikethrough :: ReadP MdToken
parseStrikethrough = do parseStrikethrough = do
string (T.pack "~~") string "~~"
inside <- someTill parseLineToken $ string (T.pack "~~") inside <- many1 parseLineToken
string "~~"
return (Strikethrough (Line inside)) return (Strikethrough (Line inside))
-- Parse code -- Parse code
parseCode :: Parser MdToken parseCode :: ReadP MdToken
parseCode = do parseCode = do
opening <- some $ char '`' string "`"
inside <- someTill (satisfy (/= '\n')) (char '`') inside <- many1 get
closing <- count (length opening - 1) (char '`') string "`"
return (Code (Unit (concatMap escapeChar inside))) return (Code (Unit inside))
-- Parse a link -- Parse a link
parseLink :: Parser MdToken parseLink :: ReadP MdToken
parseLink = do parseLink = do
char '[' linkText <- between (string "[") (string "]") (many1 get)
linkText <- manyTill parseLineToken (char ']') linkURL <- between (string "(") (string ")") (many1 get)
char '(' let parsedLinkText = fst $ leftmostLongestParse parseLine linkText
linkURL <- manyTill anySingle (char ')') return $ Link parsedLinkText (URL linkURL)
return $ Link (Line linkText) (URL linkURL)
-- Parse a linebreak character -- Parse a linebreak character
parseLinebreak :: Parser MdToken parseLinebreak :: ReadP MdToken
parseLinebreak = parseLinebreakSpace <|> parseLinebreakBackslash parseLinebreak = do
where char ' '
parseLinebreakSpace = do many1 (char ' ')
char ' '
some (char ' ')
char '\n'
return Linebreak
parseLinebreakBackslash = try $ do
char '\\'
char '\n'
return Linebreak
parseTableRow :: Parser [MdToken]
parseTableRow = do
char '|'
row <- some (many (satisfy (\x -> x == ' ' || x == '\t')) *> someTill parseListLineToken (char '|'))
return (map Line row)
parseTable :: Parser MdToken
parseTable = do
tableHead <- parseTableRow
char '\n' char '\n'
char '|' return Linebreak
sepEndBy1 (some (char '-')) (char '|') *> char '\n'
tableBody <- sepEndBy parseTableRow (char '\n')
many (char '\n') -- Parse trailing newlines, if any
return $ Table (tableHead : tableBody)
parseSingleNewline :: Parser MdToken parseSingleNewline :: ReadP MdToken
parseSingleNewline = do parseSingleNewline = do
char '\n' char '\n'
remaining <- getInput return SingleNewline
case T.unpack remaining of
[] -> return $ Unit ""
_ -> return SingleNewline
parseCssClasses :: Parser [CssClass]
parseCssClasses = do
char '{'
classes <- some parseCssClass
char '}'
return classes
where
parseCssClass :: Parser CssClass
parseCssClass = do
char '.'
let firstLetterParser = char '_' <|> char '-' <|> label "letter" (satisfy isAlpha)
cssClassFirstLetter <- firstLetterParser
cssClass <- many (firstLetterParser <|> label "digit" (satisfy isDigit))
space
return (cssClassFirstLetter : cssClass)
parseImage :: Parser MdToken
parseImage = do
char '!'
link <- parseLink
cssClasses <- optional $ try parseCssClasses
case link of
Link text path -> return $ Image text path cssClasses
_ -> empty -- This should never be reached
parseFigure = do
img <- parseImage
void (string doubleNewlineText) <|> eof
case img of
Image text path cssClasses -> return $ Figure text path cssClasses
_ -> return img
-- Parse an escaped character -- Parse an escaped character
parseEscapedChar :: Parser MdToken parseEscapedChar :: ReadP MdToken
parseEscapedChar = do parseEscapedChar = do
char '\\' char '\\'
escapedChar <- choice (map char escapableChars) -- Parse any of the special chars. escapedChar <- choice (map char escapableChars) -- Parse any of the special chars.
return (Unit [escapedChar]) return (Unit [escapedChar])
-- Parse a character as a Unit. -- Parse a character as a Unit.
parseUnit :: Parser MdToken parseUnit :: ReadP MdToken
parseUnit = do parseUnit = do
-- text <- satisfy (`notElem` specialChars) -- text <- satisfy (`notElem` specialChars)
text <- anySingle text <- get
return (Unit [text]) return (Unit [text])
-- Parse any character except a newline lineParsers :: [ReadP MdToken]
parseUnitExceptNewline :: Parser MdToken
parseUnitExceptNewline = do
-- text <- satisfy (`notElem` specialChars)
text <- satisfy (/= '\n')
return (Unit [text])
lineParsers :: [Parser MdToken]
lineParsers = lineParsers =
[ parseLinebreak, [ parseLinebreak,
parseSingleNewline, parseSingleNewline,
parseEscapedChar, parseEscapedChar,
parseCode, parseCode,
parseImage,
parseBold, parseBold,
parseItalic, parseItalic,
parseStrikethrough, parseStrikethrough,
parseSubscript,
parseSuperscript,
parseLink, parseLink,
parseUnit parseUnit
] -- A 'line' doesn't include a 'header' ] -- A 'line' doesn't include a 'header'
lineParsersWithoutNewline :: [Parser MdToken] listLineParsers :: [ReadP MdToken]
lineParsersWithoutNewline = listLineParsers =
[ parseEscapedChar, [ parseLinebreak,
parseEscapedChar,
parseCode, parseCode,
parseImage,
parseBold, parseBold,
parseItalic, parseItalic,
parseStrikethrough, parseStrikethrough,
parseSubscript,
parseSuperscript,
parseLink, parseLink,
parseUnitExceptNewline parseUnit
] -- A list line cannot contain newlines. ] -- A list line cannot contain newlines.
-- List of all parsers -- List of all parsers
allParsers :: [Parser MdToken] allParsers :: [ReadP MdToken]
allParsers = parseHeader : lineParsers allParsers = parseHeader : lineParsers
-- Parse any of the line tokens. -- Parse any of the line tokens.
parseLineToken :: Parser MdToken parseLineToken :: ReadP MdToken
parseLineToken = fallthroughParser lineParsers parseLineToken = fallthroughParser lineParsers
-- Parse any of the list line tokens. -- Parse any of the list line tokens.
parseListLineToken :: Parser MdToken parseListLineToken :: ReadP MdToken
parseListLineToken = fallthroughParser lineParsersWithoutNewline parseListLineToken = fallthroughParser listLineParsers
-- Parse a line, consisting of one or more tokens. -- Parse a line, consisting of one or more tokens.
parseLine :: Parser MdToken parseLine :: ReadP MdToken
parseLine = do parseLine = do
space skipSpaces
-- Fail if we have reached the end of the document. -- Fail if we have reached the end of the document.
parsed <- manyTill parseLineToken eof parsed <- myMany1 parseLineToken
return (Line parsed) return (Line parsed)
-- Parse a paragraph, which is a 'Line' (can span multiple actual lines), separated by double-newlines. -- Parse a paragraph, which is a 'Line' (can span multiple actual lines), separated by double-newlines.
parsePara :: Parser MdToken parsePara :: ReadP MdToken
parsePara = do parsePara = do
space parseMany (char '\n')
-- text <- many1 (lookaheadParse (\x -> ((length x) < 2) || (take 2 x) /= "\n\n")) -- Parse until a double-newline. -- text <- many1 (lookaheadParse (\x -> ((length x) < 2) || (take 2 x) /= "\n\n")) -- Parse until a double-newline.
-- string "\n\n" <|> (eof >> return "") -- Consume the next double-newline or EOF. -- string "\n\n" <|> (eof >> return "") -- Consume the next double-newline or EOF.
parsedText <- someTill parseLineToken (try paraEnding) text <- manyTill get (string "\n\n" <|> (eof >> return ""))
many (char '\n') when (null text) pfail
return (Para (Line parsedText)) let parsedText = fst $ leftmostLongestParse parseLine text -- Parse a line
where return (Para parsedText)
paraEnding = void (char '\n' *> (char '\n' <|> lookAhead (char '>'))) <|> eof
-- Parse a line starting with '>', return the line except for the '>'. -- Parse a line starting with '>', return the line except for the '>'.
parseQuotedLine :: Parser String parseQuotedLine :: ReadP String
parseQuotedLine = do parseQuotedLine = do
char '>' char '>'
many (char ' ' <|> char '\t') greedyParse (char ' ' +++ char '\t')
restOfLine <- many (satisfy (/= '\n')) restOfLine <- munch (/= '\n')
void (char '\n') <|> eof Text.ParserCombinators.ReadP.optional (char '\n') >> return ""
return restOfLine return restOfLine
-- Parse many 'quoted lines' until I see a non-quoted line. -- Parse many 'quoted lines' until I see a non-quoted line.
parseQuotedLines :: Parser [String] parseQuotedLines :: ReadP [String]
parseQuotedLines = some parseQuotedLine parseQuotedLines =
greedyParse1 $ do
-- some $ do look >>= \line ->
-- getInput >>= \line -> case line of
-- case T.unpack line of ('>' : _) -> parseQuotedLine
-- ('>' : _) -> parseQuotedLine _ -> pfail
-- _ -> empty
-- Parse a blockquote, which is a greater-than sign followed by a paragraph. -- Parse a blockquote, which is a greater-than sign followed by a paragraph.
parseBlockquote :: Parser MdToken parseBlockquote :: ReadP MdToken
parseBlockquote = do parseBlockquote = do
quotedLines <- parseQuotedLines quotedLines <- parseQuotedLines
-- remaining <- look -- remaining <- look
-- let quotedLines = fst $ leftmostLongestParse parseQuotedLines remaining -- let quotedLines = fst $ leftmostLongestParse parseQuotedLines remaining
-- string (init $ unlines quotedLines) -- string (init $ unlines quotedLines)
let parsedQuotedLines = leftmostLongestParse (some (parseBlockquote <|> parsePara)) (init $ unlines quotedLines) -- unlines joins the lines together with a newline, and adds a trailing newline. init removes the trailing newline. let parsedQuotedLines = fst $ leftmostLongestParse (many1 (parseBlockquote <++ parsePara)) (init $ unlines quotedLines) -- unlines joins the lines together with a newline, and adds a trailing newline. init removes the trailing newline.
return (Blockquote parsedQuotedLines) return (Blockquote parsedQuotedLines)
-- Parse a checkbox
parseCheckbox :: Parser MdToken
parseCheckbox = do
char '['
inside <- char ' ' <|> char 'x'
char ']'
space
return (if inside == 'x' then Checkbox True else Checkbox False)
-- Parse a nested list item. -- Parse a nested list item.
parseListNested :: Parser MdToken parseListNested :: ReadP MdToken
parseListNested = do parseListNested = do
let firstCharParser = (<>) <$> (string (T.pack " ") <|> string (T.pack "\t")) <*> (T.pack <$> many (char ' ')) let firstCharParser = string " " <++ string "\t"
let restOfLineParser = manyTill anySingle (void (char '\n') <|> eof) let restOfLineParser = manyTill get (void (char '\n') <++ eof)
-- For the first line, I manually run firstCharParser and restOfLineParser. The lines <- greedyParse1 (firstCharParser *> restOfLineParser)
-- result of firstCharParser is saved. For every subsequent line, I parse exactly let linesParsed = fst $ leftmostLongestParse (parseUnorderedList <++ parseOrderedList) (init $ unlines lines)
-- the same string as firstCharParser. when (null (show linesParsed)) pfail
firstLineSpaces <- firstCharParser
firstLine <- restOfLineParser
lines <- greedyParse (string firstLineSpaces *> restOfLineParser)
let allLines = firstLine : lines
let linesParsed = leftmostLongestParse (parseUnorderedList <|> parseOrderedList) (init $ unlines allLines)
when (null (show linesParsed)) empty
return linesParsed return linesParsed
-- Parse an unordered list line item. -- Parse an unordered list line item.
parseUListLineItem :: Parser MdToken parseUListLineItem :: ReadP MdToken
parseUListLineItem = do parseUListLineItem = do
firstChar <- choice (map char ['*', '+', '-']) firstChar <- choice (map char ['*', '+', '-'])
char ' ' -- At least one space between list indicator and list text. char ' ' -- At least one space between list indicator and list text.
parseListLineItemCommon parseListLineItemCommon
-- Parse an ordered list line item. -- Parse an ordered list line item.
parseOListLineItem :: Parser MdToken parseOListLineItem :: ReadP MdToken
parseOListLineItem = do parseOListLineItem = do
num <- greedyParse1 (satisfy isDigit) num <- greedyParse1 (satisfy isDigit)
char '.' char '.'
@@ -468,25 +334,23 @@ parseOListLineItem = do
parseListLineItemCommon parseListLineItemCommon
-- Common code for parsing list line items -- Common code for parsing list line items
parseListLineItemCommon :: Parser MdToken parseListLineItemCommon :: ReadP MdToken
parseListLineItemCommon = do parseListLineItemCommon = do
space skipSpaces
checkbox <- optional $ try parseCheckbox restOfLine <- many1 parseListLineToken
restOfLine <- manyTill parseListLineToken (void (char '\n') <|> eof) void (char '\n') <++ eof
nestedList <- try parseListNested <|> return (Unit "") nestedList <- parseListNested <++ return (Unit "")
case checkbox of return $ Line [Line restOfLine, nestedList]
Just box -> return $ Line [box, Line restOfLine, nestedList]
Nothing -> return $ Line [Line restOfLine, nestedList]
-- Parse an unordered list paragraph item. -- Parse an unordered list paragraph item.
parseUListParaItem :: Parser MdToken parseUListParaItem :: ReadP MdToken
parseUListParaItem = do parseUListParaItem = do
firstLine <- parseUListLineItem firstLine <- parseUListLineItem
res <- parseListParaItemCommon res <- parseListParaItemCommon
return $ Document (Para firstLine : res) -- I only wrap this in a document because I want some way of converting [MdToken] to MdToken, without any overhead. There is no other reason to wrap it in a Document. return $ Document (Para firstLine : res) -- I only wrap this in a document because I want some way of converting [MdToken] to MdToken, without any overhead. There is no other reason to wrap it in a Document.
-- Parse an unordered list paragraph item. -- Parse an unordered list paragraph item.
parseOListParaItem :: Parser MdToken parseOListParaItem :: ReadP MdToken
parseOListParaItem = do parseOListParaItem = do
firstLine <- parseOListLineItem firstLine <- parseOListLineItem
res <- parseListParaItemCommon res <- parseListParaItemCommon
@@ -496,82 +360,55 @@ parseOListParaItem = do
-- A list paragraph item is defined as a line item, followed by an empty line, followed by one or more -- A list paragraph item is defined as a line item, followed by an empty line, followed by one or more
-- lines indented by a space or tab. -- lines indented by a space or tab.
-- A list paragraph item can also be a blockquote. -- A list paragraph item can also be a blockquote.
parseListParaItemCommon :: Parser [MdToken] parseListParaItemCommon :: ReadP [MdToken]
parseListParaItemCommon = do parseListParaItemCommon = do
char '\n' char '\n'
lines <- greedyParse1 ((string (T.pack " ") <|> string (T.pack "\t")) *> parseTillEol) lines <- greedyParse1 ((string " " <|> string "\t") *> parseTillEol)
let res = leftmostLongestParse (greedyParse1 parseBlockquote <|> greedyParse1 parsePara) (init $ unlines lines) let res = fst $ leftmostLongestParse (greedyParse1 parseBlockquote <++ greedyParse1 parsePara) (init $ unlines lines)
char '\n' char '\n'
return res -- I only wrap this in a document because I want some way of converting [MdToken] to MdToken, without any overhead. There is no other reason to wrap it in a Document. return res -- I only wrap this in a document because I want some way of converting [MdToken] to MdToken, without any overhead. There is no other reason to wrap it in a Document.
-- Parse an unordered list item, which can be a line item or another list. -- Parse an unordered list item, which can be a line item or another list.
parseUListItem :: Parser MdToken parseUListItem :: ReadP MdToken
parseUListItem = space *> (try parseUListParaItem <|> parseUListLineItem) parseUListItem = parseUListParaItem <++ parseUListLineItem
-- Parse an unordered list. -- Parse an unordered list.
parseUnorderedList :: Parser MdToken parseUnorderedList :: ReadP MdToken
parseUnorderedList = do parseUnorderedList = do
lineItems <- some $ try parseUListItem lineItems <- greedyParse1 parseUListItem
void (char '\n') <|> eof -- A list must end in an extra newline or eof void (char '\n') <++ eof -- A list must end in an extra newline or eof
return $ UnordList lineItems return $ UnordList lineItems
-- -------- -- --------
parseOListItem :: Parser MdToken parseOListItem :: ReadP MdToken
parseOListItem = space *> (try parseOListParaItem <|> parseOListLineItem) parseOListItem = parseOListParaItem <++ parseOListLineItem
-- Parses the first element of an ordered list, which must start with '1.' -- Parses the first element of an ordered list, which must start with '1.'
parseFirstOListItem :: Parser MdToken parseFirstOListItem :: ReadP MdToken
parseFirstOListItem = do parseFirstOListItem = do
space remaining <- look
remaining <- getInput when (take 2 remaining /= "1.") pfail
when (take 2 (T.unpack remaining) /= "1.") empty
parseOListLineItem parseOListLineItem
parseOrderedList :: Parser MdToken parseOrderedList :: ReadP MdToken
parseOrderedList = do parseOrderedList = do
firstLine <- try parseFirstOListItem firstLine <- parseFirstOListItem
lineItems <- many $ try parseOListItem lineItems <- greedyParse1 parseOListItem
void (char '\n') <|> eof void (char '\n') <++ eof
return $ OrdList (firstLine : lineItems) return $ OrdList (firstLine : lineItems)
horizontalRuleText :: T.Text documentParsers :: [ReadP MdToken]
horizontalRuleText = T.pack "---"
doubleNewlineText :: T.Text
doubleNewlineText = T.pack "\n\n"
parseHorizontalRule :: Parser MdToken
parseHorizontalRule = parseHorizontalRuleLine *> (void (string doubleNewlineText) <|> eof) *> return HorizontalRule
where
parseHorizontalRuleLine = fallthroughParser (map (string . T.pack) ["---", "***", "___", "- - -", "* * *", "_ _ _"])
parseCodeblock :: Parser MdToken
parseCodeblock = do
string (T.pack "```")
_ <- many $ satisfy (/= '\n') -- Language name
char '\n'
inside <- someTill anySingle (string (T.pack "\n```"))
return $ Codeblock (Unit (concatMap escapeChar inside))
documentParsers :: [Parser MdToken]
documentParsers = documentParsers =
[ parseHorizontalRule, [ parseHeader,
parseCodeblock,
parseTable,
parseHeader,
parseBlockquote, parseBlockquote,
parseUnorderedList, parseUnorderedList,
parseOrderedList, parseOrderedList,
parseFigure,
parsePara parsePara
] ]
-- Parse a document, which is multiple paragraphs. -- Parse a document, which is multiple paragraphs.
parseDocument :: Parser MdToken parseDocument :: ReadP MdToken
parseDocument = do parseDocument = do
-- res <- manyTill (fallthroughParser documentParsers <|> (char '\n' *> return $ Unit "")) eof res <- manyTill (fallthroughParser documentParsers) eof
res <- sepEndBy (fallthroughParser documentParsers) (many $ char '\n')
-- many $ char '\n'
eof
return (Document res) return (Document res)

View File

@@ -7,7 +7,7 @@ check_equal :: String -> String -> String -> Test
check_equal desc expected actual = TestCase (assertEqual desc expected actual) check_equal desc expected actual = TestCase (assertEqual desc expected actual)
convert :: String -> String convert :: String -> String
convert md = show $ leftmostLongestParse parseDocument md convert md = show . fst $ leftmostLongestParse parseDocument md
headerTests = headerTests =
TestList TestList
@@ -22,16 +22,17 @@ headerTests =
boldTests = boldTests =
TestList TestList
[ check_equal "Should convert bold" "<p><b>Hello</b></p>" (convert "__Hello__"), [ check_equal "Should convert bold" "<p><b>Hello</b></p>" (convert "__Hello__"),
check_equal " Should not convert incomplete bold" "<p>**Hello</p>" (convert "**Hello"),
check_equal "Should convert italic" "<p><i>Hello</i></p>" (convert "_Hello_"), check_equal "Should convert italic" "<p><i>Hello</i></p>" (convert "_Hello_"),
check_equal "Should convert bold and italic in a sentence" "<p>It <i>is</i> a <b>wonderful</b> day</p>" (convert "It _is_ a __wonderful__ day"), check_equal "Should convert bold and italic in a sentence" "<p>It <i>is</i> a <b>wonderful</b> day</p>" (convert "It _is_ a __wonderful__ day"),
check_equal "Should convert nested bold and italic" "<p><b>Bold then <i>Italic</i></b></p>" (convert "**Bold then _Italic_**"), check_equal "Should convert nested bold and italic" "<p><b>Bold then <i>Italic</i></b></p>" (convert "**Bold then *Italic***"),
check_equal "Should convert nested bold and italic" "<p><i>Italic then <b>Bold</b></i></p>" (convert "*Italic then __Bold__*") check_equal "Should convert nested bold and italic" "<p><i>Italic then <b>Bold</b></i></p>" (convert "*Italic then **Bold***")
] ]
strikethroughTests = strikethroughTests =
TestList TestList
[ check_equal "Should convert strikethrough" "<p><s>Hello</s></p>" (convert "~~Hello~~"), [ check_equal "Should convert strikethrough" "<p><s>Hello</s></p>" (convert "~~Hello~~"),
check_equal "Should convert long sentence with tilde" "<p><s>The universe is ~7 days old</s>. The universe is 13 billion years old.</p>" (convert "~~The universe is \\~7 days old~~. The universe is 13 billion years old.") check_equal "Should convert long sentence with tilde" "<p><s>The universe is ~7 days old</s>. The universe is 13 billion years old.</p>" (convert "~~The universe is ~7 days old~~. The universe is 13 billion years old.")
] ]
linkTests = linkTests =
@@ -74,17 +75,8 @@ unorderedListTests =
check_equal "Paragraph before list" "<p>This is a list</p><ul><li>Item 1</li><li>Item 2</li></ul>" (convert "This is a list\n\n* Item 1\n* Item 2"), check_equal "Paragraph before list" "<p>This is a list</p><ul><li>Item 1</li><li>Item 2</li></ul>" (convert "This is a list\n\n* Item 1\n* Item 2"),
check_equal "Paragraph before list" "<h3>This is a list</h3><ul><li>Item 1</li><li>Item 2</li></ul>" (convert "### This is a list\n\n* Item 1\n* Item 2"), check_equal "Paragraph before list" "<h3>This is a list</h3><ul><li>Item 1</li><li>Item 2</li></ul>" (convert "### This is a list\n\n* Item 1\n* Item 2"),
check_equal "Nested list then back" "<ul><li>Item 1</li><li>Item 2<ul><li>Item 3</li><li>Item 4</li></ul></li><li>Item 5</li></ul>" (convert "- Item 1\n- Item 2\n - Item 3\n - Item 4\n- Item 5"), check_equal "Nested list then back" "<ul><li>Item 1</li><li>Item 2<ul><li>Item 3</li><li>Item 4</li></ul></li><li>Item 5</li></ul>" (convert "- Item 1\n- Item 2\n - Item 3\n - Item 4\n- Item 5"),
check_equal "Triply nested list" "<ul><li>Item 1</li><li>Item 2<ul><li>Item 3<ul><li>Item 4</li></ul></li></ul></li><li>Item 5</li></ul>" (convert "- Item 1\n- Item 2\n - Item 3\n - Item 4\n- Item 5"),
check_equal "Blockquote in list" "<ul><li>Item 1</li><li><p>Item 2</p><blockquote><p>Quote</p></blockquote></li><li>Item 3</li></ul>" (convert "- Item 1\n- Item 2\n\n > Quote\n\n- Item 3"), check_equal "Blockquote in list" "<ul><li>Item 1</li><li><p>Item 2</p><blockquote><p>Quote</p></blockquote></li><li>Item 3</li></ul>" (convert "- Item 1\n- Item 2\n\n > Quote\n\n- Item 3"),
check_equal "Ordered list in unordered list" "<ul><li>Item 1</li><li>Item 2<ol><li>Item 1</li><li>Item 2</li></ol></li><li>Item 3</li></ul>" (convert "- Item 1\n- Item 2\n 1. Item 1\n 2. Item 2\n- Item 3"), check_equal "Ordered list in unordered list" "<ul><li>Item 1</li><li>Item 2<ol><li>Item 1</li><li>Item 2</li></ol></li><li>Item 3</li></ul>" (convert "- Item 1\n- Item 2\n 1. Item 1\n 2. Item 2\n- Item 3")
check_equal
"Checkbox in unordered list"
"<ul>\
\<li><input type=\"checkbox\" />Not checked</li>\
\<li><input type=\"checkbox\" checked=\"\" />Checked</li>\
\<li>Normal list item</li></ul>"
(convert "- [ ] Not checked\n- [x] Checked\n- Normal list item"),
check_equal "List with link at the start" "<ul><li><a href=\"b\">a</a></li><li><a href=\"d\">c</a></li></ul>" (convert "- [a](b)\n- [c](d)")
] ]
orderedListTests = orderedListTests =
@@ -99,62 +91,14 @@ orderedListTests =
check_equal "Paragraph before list" "<h3>This is a list</h3><ol><li>Item 1</li><li>Item 2</li></ol>" (convert "### This is a list\n\n1. Item 1\n200. Item 2"), check_equal "Paragraph before list" "<h3>This is a list</h3><ol><li>Item 1</li><li>Item 2</li></ol>" (convert "### This is a list\n\n1. Item 1\n200. Item 2"),
check_equal "Nested list then back" "<ol><li>Item 1</li><li>Item 2<ol><li>Item 3</li><li>Item 4</li></ol></li><li>Item 5</li></ol>" (convert "1. Item 1\n2. Item 2\n 1. Item 3\n 3. Item 4\n5. Item 5"), check_equal "Nested list then back" "<ol><li>Item 1</li><li>Item 2<ol><li>Item 3</li><li>Item 4</li></ol></li><li>Item 5</li></ol>" (convert "1. Item 1\n2. Item 2\n 1. Item 3\n 3. Item 4\n5. Item 5"),
check_equal "Blockquote in list" "<ol><li>Item 1</li><li><p>Item 2</p><blockquote><p>Quote</p></blockquote></li><li>Item 3</li></ol>" (convert "1. Item 1\n2. Item 2\n\n > Quote\n\n3. Item 3"), check_equal "Blockquote in list" "<ol><li>Item 1</li><li><p>Item 2</p><blockquote><p>Quote</p></blockquote></li><li>Item 3</li></ol>" (convert "1. Item 1\n2. Item 2\n\n > Quote\n\n3. Item 3"),
check_equal "Unordered list in ordered list" "<ol><li>Item 1</li><li>Item 2<ul><li>Item 1</li><li>Item 2</li></ul></li><li>Item 3</li></ol>" (convert "1. Item 1\n2. Item 2\n - Item 1\n * Item 2\n4. Item 3"), check_equal "Unordered list in ordered list" "<ol><li>Item 1</li><li>Item 2<ul><li>Item 1</li><li>Item 2</li></ul></li><li>Item 3</li></ol>" (convert "1. Item 1\n2. Item 2\n - Item 1\n * Item 2\n4. Item 3")
check_equal "List with just 1 item" "<ol><li>Item 1</li></ol>" (convert "1. Item 1"),
check_equal
"Checkbox in ordered list"
"<ol>\
\<li><input type=\"checkbox\" />Not checked</li>\
\<li><input type=\"checkbox\" checked=\"\" />Checked</li>\
\<li>Normal list item</li></ol>"
(convert "1. [ ] Not checked\n2. [x] Checked\n3. Normal list item")
] ]
htmlTests =
TestList
[check_equal "Convert HTML element" "<p><center>a</center></p>" (convert "<center>a</center>")]
codeTests = codeTests =
TestList TestList
[ check_equal "Code by itself" "<p><code>Hello world!</code></p>" (convert "`Hello world!`"), [ check_equal "Code by itself" "<p><code>Hello world!</code></p>" (convert "`Hello world!`"),
check_equal "Code in a paragraph" "<p>The following <code>text</code> is code</p>" (convert "The following `text` is code"), check_equal "Code in a paragraph" "<p>The following <code>text</code> is code</p>" (convert "The following `text` is code"),
check_equal "Code across paragraphs (shouldn't work)" "<p>`Incomplete</p><p>Code`</p>" (convert "`Incomplete\n\nCode`"), -- At the moment, this is just treated as a syntax error, so nothing is rendered. check_equal "Code across paragraphs (shouldn't work" "<p>`Incomplete</p><p>Code`</p>" (convert "`Incomplete\n\nCode`")
check_equal "Code block" "<pre><code>Test code block</code></pre>" (convert "```\nTest code block\n```"),
check_equal "Multiple code blocks" "<pre><code>Test code block</code></pre><pre><code>Next block</code></pre>" (convert "```\nTest code block\n```\n\n```\nNext block\n```")
]
imageTests =
TestList
[ check_equal "Image with text" "<p>This is an image <img src=\"img.png\" alt=\"Image 1\"/></p>" (convert "This is an image ![Image 1](img.png)"),
check_equal "Image with classes" "<p>This is an image <img src=\"img.png\" alt=\"Image 1\" class=\"new-img\"/></p>" (convert "This is an image ![Image 1](img.png){.new-img}")
]
figureTests =
TestList
[ check_equal "Image by itself" "<figure><img src=\"img.png\" alt=\"Image 1\"/><figcaption aria-hidden=\"true\">Image 1</figcaption></figure>" (convert "![Image 1](img.png)")
]
horizontalRuleTests =
TestList
[check_equal "Horizontal Rule" "<p>a</p><hr><p>b</p>" (convert "a\n\n---\n\nb")]
subscriptTests =
TestList
[check_equal "Should convert subscript" "A<sub>b</sub>" (convert "A~b~")]
superscriptTests =
TestList
[check_equal "Should convert superscript" "A<sup>b</sup>" (convert "A^b^")]
tableTests =
TestList
[ check_equal
"Basic table"
"<table>\
\<thead><tr><th>Col 1</th><th>Col 2</th><th>Col 3</th></tr></thead>\
\<tbody><tr><td>Data 1</td><td>Data 2</td><td>Data 3</td></tr>\
\<tr><td>More Data 1</td><td>More Data 2</td><td>More Data 3</td></tr></tbody></table>"
(convert "| Col 1 | Col 2 | Col 3 |\n|---|---|---|\n| Data 1 | Data 2 | Data 3 |\n| More Data 1 | More Data 2 | More Data 3 |")
] ]
integrationTests = integrationTests =
@@ -164,7 +108,7 @@ integrationTests =
check_equal "Integration 3" "<h1>Hello</h1><p>World</p>" (convert "# Hello\nWorld"), check_equal "Integration 3" "<h1>Hello</h1><p>World</p>" (convert "# Hello\nWorld"),
check_equal "Integration 4" "<p>a b</p>" (convert "a\nb"), check_equal "Integration 4" "<p>a b</p>" (convert "a\nb"),
check_equal "Integration 5" "<h1>Hello</h1>" (convert "# Hello\n"), check_equal "Integration 5" "<h1>Hello</h1>" (convert "# Hello\n"),
check_equal "Integration 6" "<p>First line<br />Second line</p>" (convert "First line \nSecond line"), check_equal "Integration 6" "<p>First line<br>Second line</p>" (convert "First line \nSecond line"),
check_equal check_equal
"Integration 7" "Integration 7"
"<h1>Sample Markdown</h1><p>This is some basic, sample markdown.</p><h2>Second \ "<h1>Sample Markdown</h1><p>This is some basic, sample markdown.</p><h2>Second \
@@ -177,7 +121,7 @@ integrationTests =
"# Sample Markdown\n\nThis is some basic, sample markdown.\n\n## Second \ "# Sample Markdown\n\nThis is some basic, sample markdown.\n\n## Second \
\Heading\n\n- Unordered lists, and:\n 1. One\n 2. Two\n 3. Three\n\ \Heading\n\n- Unordered lists, and:\n 1. One\n 2. Two\n 3. Three\n\
\- More\n\n> Blockquote\n\nAnd **bold**, *italics*, and even *italics and \ \- More\n\n> Blockquote\n\nAnd **bold**, *italics*, and even *italics and \
\later __bold__*. Even ~~strikethrough~~. [A link](https://markdowntohtml.com) to somewhere." \later **bold***. Even ~~strikethrough~~. [A link](https://markdowntohtml.com) to somewhere."
) )
] ]
@@ -191,12 +135,7 @@ tests =
blockquoteTests, blockquoteTests,
unorderedListTests, unorderedListTests,
orderedListTests, orderedListTests,
imageTests,
htmlTests,
figureTests,
codeTests, codeTests,
horizontalRuleTests,
tableTests,
integrationTests integrationTests
] ]