Compare commits
15 Commits
6b99a1835d
...
usingMegap
Author | SHA1 | Date | |
---|---|---|---|
e5795e0d75 | |||
eae897a2d6 | |||
8152b89a23 | |||
9d3d656065 | |||
d4a550f6a7 | |||
2b21aeae89 | |||
ca328a464a | |||
7d45b1123f | |||
9627abcd12 | |||
82277e9ea8 | |||
d074b0131c | |||
57cb3e68fa | |||
4e9f84c2bb | |||
e025614324 | |||
e711444066 |
@@ -24,4 +24,7 @@ main = do
|
||||
[] -> getContents
|
||||
x : _ -> readFile x
|
||||
let res = leftmostLongestParse parseDocument fileContents
|
||||
print res
|
||||
let toPrint = prettyPrint res
|
||||
case reverse toPrint of
|
||||
'\n' : _ -> putStr toPrint
|
||||
_ -> putStrLn toPrint
|
||||
|
@@ -60,9 +60,12 @@ library
|
||||
build-depends: base ^>=4.19.1.0,
|
||||
HUnit,
|
||||
megaparsec,
|
||||
text
|
||||
parser-combinators,
|
||||
text,
|
||||
MissingH,
|
||||
word-wrap
|
||||
|
||||
executable md-to-html-runner
|
||||
executable mdtoh
|
||||
-- Import common warning flags.
|
||||
import: warnings
|
||||
|
||||
|
115
src/MdToHTML.hs
115
src/MdToHTML.hs
@@ -17,6 +17,7 @@ import Debug.Trace
|
||||
import Text.Megaparsec
|
||||
import Text.Megaparsec.Char
|
||||
import Text.Printf
|
||||
import Text.Wrap
|
||||
|
||||
type Parser = Parsec Void T.Text
|
||||
|
||||
@@ -39,6 +40,7 @@ data MdToken
|
||||
| Blockquote [MdToken]
|
||||
| UnordList [MdToken]
|
||||
| OrdList [MdToken]
|
||||
| Checkbox Bool
|
||||
| Code MdToken
|
||||
| Table [[MdToken]]
|
||||
| Codeblock MdToken
|
||||
@@ -47,6 +49,8 @@ data MdToken
|
||||
| Figure MdToken URL (Maybe [CssClass])
|
||||
| Bold MdToken
|
||||
| Italic MdToken
|
||||
| Superscript MdToken
|
||||
| Subscript MdToken
|
||||
| Strikethrough MdToken
|
||||
| Unit String
|
||||
deriving (Eq)
|
||||
@@ -55,25 +59,40 @@ data MdToken
|
||||
instance Show MdToken where
|
||||
show (Document tokens) = concatMap show tokens
|
||||
show (Header level token) = "<h" ++ show level ++ ">" ++ show token ++ "</h" ++ show level ++ ">"
|
||||
show (Para token) = "<p>" ++ show token ++ "</p>\n"
|
||||
show (Para token) = "<p>" ++ show token ++ "</p>"
|
||||
show (Line tokens) = concatMap show tokens
|
||||
show Linebreak = "<br>"
|
||||
show Linebreak = "<br />"
|
||||
show SingleNewline = " "
|
||||
show HorizontalRule = "<hr>"
|
||||
show (Blockquote tokens) = "<blockquote>" ++ concatMap show tokens ++ "</blockquote>"
|
||||
show (UnordList tokens) = "<ul>" ++ concatMap (prepend "<li>" . append "</li>" . show) tokens ++ "</ul>"
|
||||
show (OrdList tokens) = "<ol>" ++ concatMap (prepend "<li>" . append "</li>" . show) tokens ++ "</ol>"
|
||||
show (Code code) = "<code>" ++ show code ++ "</code>"
|
||||
show (Table (thead : tokenGrid)) = "<table>\n<thead>\n<tr>\n" ++ concatMap (\x -> "<th>" ++ rstrip (show x) ++ "</th>\n") thead ++ "</tr>\n</thead>\n" ++ "<tbody>\n" ++ concatMap (\x -> "<tr>\n" ++ concatMap (\y -> "<td>" ++ rstrip (show y) ++ "</td>\n") x ++ "</tr>\n") tokenGrid ++ "</tbody>\n</table>\n"
|
||||
show (Checkbox isChecked) = "<input type=\"checkbox\"" ++ (if isChecked then " checked=\"\"" else "") ++ " />"
|
||||
show (Code code) = "<code>" ++ strip (show code) ++ "</code>"
|
||||
show (Table (thead : tokenGrid)) = "<table><thead><tr>" ++ concatMap (\x -> "<th>" ++ rstrip (show x) ++ "</th>") thead ++ "</tr></thead>" ++ "<tbody>" ++ concatMap (\x -> "<tr>" ++ concatMap (\y -> "<td>" ++ rstrip (show y) ++ "</td>") x ++ "</tr>") tokenGrid ++ "</tbody></table>"
|
||||
show (Codeblock code) = "<pre><code>" ++ show code ++ "</code></pre>"
|
||||
show (Link txt url) = "<a href=\"" ++ getUrl url ++ "\">" ++ show txt ++ "</a>"
|
||||
show (Image txt url cssClasses) = "<img src=\"" ++ getUrl url ++ "\"" ++ " alt=\"" ++ show txt ++ "\"" ++ maybe "" (\classes -> " class=\"" ++ unwords classes ++ "\"") cssClasses ++ "/>"
|
||||
show (Figure txt url cssClasses) = "<figure><img src=\"" ++ getUrl url ++ "\" alt=\"" ++ show txt ++ "\"" ++ maybe "" (\classes -> " class=\"" ++ unwords classes ++ "\"") cssClasses ++ "/><figcaption aria-hidden=\"true\">" ++ show txt ++ "</figcaption></figure>"
|
||||
show (Bold token) = "<b>" ++ show token ++ "</b>"
|
||||
show (Italic token) = "<i>" ++ show token ++ "</i>"
|
||||
show (Superscript token) = "<sup>" ++ show token ++ "</sup>"
|
||||
show (Subscript token) = "<sub>" ++ show token ++ "</sub>"
|
||||
show (Strikethrough token) = "<s>" ++ show token ++ "</s>"
|
||||
show (Unit unit) = printf "%s" unit
|
||||
|
||||
-- Pretty print the given token into a string.
|
||||
-- This is the same as calling 'show' for most tokens, but is different for paragraphs, tables, line breaks and horizontal rules,
|
||||
-- which have newlines inserted into them.
|
||||
prettyPrint :: MdToken -> String
|
||||
prettyPrint (Para token) = "<p>" ++ T.unpack (wrapText defaultWrapSettings 70 (T.pack $ prettyPrint token)) ++ "</p>\n"
|
||||
prettyPrint (Table (thead : tokenGrid)) = "<table>\n<thead>\n<tr>\n" ++ concatMap (\x -> "<th>" ++ rstrip (prettyPrint x) ++ "</th>\n") thead ++ "</tr>\n</thead>\n" ++ "<tbody>\n" ++ concatMap (\x -> "<tr>\n" ++ concatMap (\y -> "<td>" ++ rstrip (prettyPrint y) ++ "</td>\n") x ++ "</tr>\n") tokenGrid ++ "</tbody>\n</table>\n"
|
||||
prettyPrint Linebreak = "<br />\n"
|
||||
prettyPrint HorizontalRule = "<hr>\n"
|
||||
prettyPrint (Line tokens) = concatMap prettyPrint tokens
|
||||
prettyPrint (Document tokens) = concatMap prettyPrint tokens
|
||||
prettyPrint token = show token
|
||||
|
||||
instance Semigroup MdToken where
|
||||
a <> b = Document [a, b]
|
||||
|
||||
@@ -126,7 +145,7 @@ parseTillEol = manyTill anySingle (void (char '\n') <|> eof)
|
||||
-- Takes a list of parsers. Returns a parser that will try them in
|
||||
-- order, moving to the next one only if the current one fails.
|
||||
fallthroughParser :: [Parser a] -> Parser a
|
||||
fallthroughParser [x] = x
|
||||
fallthroughParser [x] = try x
|
||||
fallthroughParser (x : xs) = try x <|> fallthroughParser xs
|
||||
|
||||
escapeChar :: Char -> String
|
||||
@@ -138,6 +157,19 @@ escapeChar x = [x]
|
||||
htmlEscapeChars :: T.Text -> T.Text
|
||||
htmlEscapeChars = T.concatMap (T.pack . escapeChar)
|
||||
|
||||
-- -- Wraps a list of words after (at most) the given number of characters, trying to prevent word-breaks
|
||||
-- wordwrap :: Int -> String -> String
|
||||
-- wordwrap wraplength str = if (length str) < wraplength
|
||||
-- then str
|
||||
-- else
|
||||
-- let spaceIndex = lastgtSpaceIndex 0 (takeRev (length str) - wraplength str)
|
||||
--
|
||||
-- where
|
||||
-- takeRev n = (reverse . take n . reverse)
|
||||
-- lastSpaceIndex counter str = case str of
|
||||
-- [] -> counter
|
||||
-- x:xs -> if (isSpace x) counter else lastSpaceIndex counter+1 xs
|
||||
|
||||
-- ---------------
|
||||
|
||||
-- Parse a markdown header, denoted by 1-6 #'s followed by some text, followed by EOL.
|
||||
@@ -175,6 +207,20 @@ parseItalic = parseItalicWith '*' <|> parseItalicWith '_'
|
||||
inside <- someTill parseLineToken (char delim)
|
||||
return (Italic (Line inside))
|
||||
|
||||
-- Parse subscript
|
||||
parseSubscript :: Parser MdToken
|
||||
parseSubscript = do
|
||||
char '~'
|
||||
inside <- someTill parseLineToken (char '~')
|
||||
return (Subscript (Line inside))
|
||||
|
||||
-- Parse superscript
|
||||
parseSuperscript :: Parser MdToken
|
||||
parseSuperscript = do
|
||||
char '^'
|
||||
inside <- someTill parseLineToken (char '^')
|
||||
return (Superscript (Line inside))
|
||||
|
||||
-- Parse strikethrough text
|
||||
parseStrikethrough :: Parser MdToken
|
||||
parseStrikethrough = do
|
||||
@@ -201,11 +247,17 @@ parseLink = do
|
||||
|
||||
-- Parse a linebreak character
|
||||
parseLinebreak :: Parser MdToken
|
||||
parseLinebreak = do
|
||||
char ' '
|
||||
some (char ' ')
|
||||
char '\n'
|
||||
return Linebreak
|
||||
parseLinebreak = parseLinebreakSpace <|> parseLinebreakBackslash
|
||||
where
|
||||
parseLinebreakSpace = do
|
||||
char ' '
|
||||
some (char ' ')
|
||||
char '\n'
|
||||
return Linebreak
|
||||
parseLinebreakBackslash = try $ do
|
||||
char '\\'
|
||||
char '\n'
|
||||
return Linebreak
|
||||
|
||||
parseTableRow :: Parser [MdToken]
|
||||
parseTableRow = do
|
||||
@@ -294,6 +346,8 @@ lineParsers =
|
||||
parseBold,
|
||||
parseItalic,
|
||||
parseStrikethrough,
|
||||
parseSubscript,
|
||||
parseSuperscript,
|
||||
parseLink,
|
||||
parseUnit
|
||||
] -- A 'line' doesn't include a 'header'
|
||||
@@ -306,6 +360,8 @@ lineParsersWithoutNewline =
|
||||
parseBold,
|
||||
parseItalic,
|
||||
parseStrikethrough,
|
||||
parseSubscript,
|
||||
parseSuperscript,
|
||||
parseLink,
|
||||
parseUnitExceptNewline
|
||||
] -- A list line cannot contain newlines.
|
||||
@@ -371,13 +427,28 @@ parseBlockquote = do
|
||||
let parsedQuotedLines = leftmostLongestParse (some (parseBlockquote <|> parsePara)) (init $ unlines quotedLines) -- unlines joins the lines together with a newline, and adds a trailing newline. init removes the trailing newline.
|
||||
return (Blockquote parsedQuotedLines)
|
||||
|
||||
-- Parse a checkbox
|
||||
parseCheckbox :: Parser MdToken
|
||||
parseCheckbox = do
|
||||
char '['
|
||||
inside <- char ' ' <|> char 'x'
|
||||
char ']'
|
||||
space
|
||||
return (if inside == 'x' then Checkbox True else Checkbox False)
|
||||
|
||||
-- Parse a nested list item.
|
||||
parseListNested :: Parser MdToken
|
||||
parseListNested = do
|
||||
let firstCharParser = string (T.pack " ") <|> string (T.pack "\t")
|
||||
let firstCharParser = (<>) <$> (string (T.pack " ") <|> string (T.pack "\t")) <*> (T.pack <$> many (char ' '))
|
||||
let restOfLineParser = manyTill anySingle (void (char '\n') <|> eof)
|
||||
lines <- greedyParse1 (firstCharParser *> restOfLineParser)
|
||||
let linesParsed = leftmostLongestParse (parseUnorderedList <|> parseOrderedList) (init $ unlines lines)
|
||||
-- For the first line, I manually run firstCharParser and restOfLineParser. The
|
||||
-- result of firstCharParser is saved. For every subsequent line, I parse exactly
|
||||
-- the same indentation string that firstCharParser matched on the first line.
|
||||
firstLineSpaces <- firstCharParser
|
||||
firstLine <- restOfLineParser
|
||||
lines <- greedyParse (string firstLineSpaces *> restOfLineParser)
|
||||
let allLines = firstLine : lines
|
||||
let linesParsed = leftmostLongestParse (parseUnorderedList <|> parseOrderedList) (init $ unlines allLines)
|
||||
when (null (show linesParsed)) empty
|
||||
return linesParsed
|
||||
|
||||
@@ -400,9 +471,12 @@ parseOListLineItem = do
|
||||
parseListLineItemCommon :: Parser MdToken
|
||||
parseListLineItemCommon = do
|
||||
space
|
||||
checkbox <- optional $ try parseCheckbox
|
||||
restOfLine <- manyTill parseListLineToken (void (char '\n') <|> eof)
|
||||
nestedList <- try parseListNested <|> return (Unit "")
|
||||
return $ Line [Line restOfLine, nestedList]
|
||||
case checkbox of
|
||||
Just box -> return $ Line [box, Line restOfLine, nestedList]
|
||||
Nothing -> return $ Line [Line restOfLine, nestedList]
|
||||
|
||||
-- Parse an unordered list paragraph item.
|
||||
parseUListParaItem :: Parser MdToken
|
||||
@@ -468,11 +542,15 @@ doubleNewlineText :: T.Text
|
||||
doubleNewlineText = T.pack "\n\n"
|
||||
|
||||
parseHorizontalRule :: Parser MdToken
|
||||
parseHorizontalRule = string horizontalRuleText *> (void (string doubleNewlineText) <|> eof) *> return HorizontalRule
|
||||
parseHorizontalRule = parseHorizontalRuleLine *> (void (string doubleNewlineText) <|> eof) *> return HorizontalRule
|
||||
where
|
||||
parseHorizontalRuleLine = fallthroughParser (map (string . T.pack) ["---", "***", "___", "- - -", "* * *", "_ _ _"])
|
||||
|
||||
parseCodeblock :: Parser MdToken
|
||||
parseCodeblock = do
|
||||
string (T.pack "```\n")
|
||||
string (T.pack "```")
|
||||
_ <- many $ satisfy (/= '\n') -- Language name
|
||||
char '\n'
|
||||
inside <- someTill anySingle (string (T.pack "\n```"))
|
||||
return $ Codeblock (Unit (concatMap escapeChar inside))
|
||||
|
||||
@@ -492,5 +570,8 @@ documentParsers =
|
||||
-- Parse a document, which is multiple paragraphs.
|
||||
parseDocument :: Parser MdToken
|
||||
parseDocument = do
|
||||
res <- manyTill (fallthroughParser documentParsers) eof
|
||||
-- res <- manyTill (fallthroughParser documentParsers <|> (char '\n' *> return $ Unit "")) eof
|
||||
res <- sepEndBy (fallthroughParser documentParsers) (many $ char '\n')
|
||||
-- many $ char '\n'
|
||||
eof
|
||||
return (Document res)
|
||||
|
@@ -4,7 +4,7 @@ import MdToHTML
|
||||
import Test.HUnit
|
||||
|
||||
check_equal :: String -> String -> String -> Test
|
||||
check_equal desc expected actual = TestCase (assertEqual desc expected (filter (/= '\n') actual))
|
||||
check_equal desc expected actual = TestCase (assertEqual desc expected actual)
|
||||
|
||||
convert :: String -> String
|
||||
convert md = show $ leftmostLongestParse parseDocument md
|
||||
@@ -31,7 +31,7 @@ boldTests =
|
||||
strikethroughTests =
|
||||
TestList
|
||||
[ check_equal "Should convert strikethrough" "<p><s>Hello</s></p>" (convert "~~Hello~~"),
|
||||
check_equal "Should convert long sentence with tilde" "<p><s>The universe is ~7 days old</s>. The universe is 13 billion years old.</p>" (convert "~~The universe is ~7 days old~~. The universe is 13 billion years old.")
|
||||
check_equal "Should convert long sentence with tilde" "<p><s>The universe is ~7 days old</s>. The universe is 13 billion years old.</p>" (convert "~~The universe is \\~7 days old~~. The universe is 13 billion years old.")
|
||||
]
|
||||
|
||||
linkTests =
|
||||
@@ -74,8 +74,17 @@ unorderedListTests =
|
||||
check_equal "Paragraph before list" "<p>This is a list</p><ul><li>Item 1</li><li>Item 2</li></ul>" (convert "This is a list\n\n* Item 1\n* Item 2"),
|
||||
check_equal "Paragraph before list" "<h3>This is a list</h3><ul><li>Item 1</li><li>Item 2</li></ul>" (convert "### This is a list\n\n* Item 1\n* Item 2"),
|
||||
check_equal "Nested list then back" "<ul><li>Item 1</li><li>Item 2<ul><li>Item 3</li><li>Item 4</li></ul></li><li>Item 5</li></ul>" (convert "- Item 1\n- Item 2\n - Item 3\n - Item 4\n- Item 5"),
|
||||
check_equal "Triply nested list" "<ul><li>Item 1</li><li>Item 2<ul><li>Item 3<ul><li>Item 4</li></ul></li></ul></li><li>Item 5</li></ul>" (convert "- Item 1\n- Item 2\n - Item 3\n - Item 4\n- Item 5"),
|
||||
check_equal "Blockquote in list" "<ul><li>Item 1</li><li><p>Item 2</p><blockquote><p>Quote</p></blockquote></li><li>Item 3</li></ul>" (convert "- Item 1\n- Item 2\n\n > Quote\n\n- Item 3"),
|
||||
check_equal "Ordered list in unordered list" "<ul><li>Item 1</li><li>Item 2<ol><li>Item 1</li><li>Item 2</li></ol></li><li>Item 3</li></ul>" (convert "- Item 1\n- Item 2\n 1. Item 1\n 2. Item 2\n- Item 3")
|
||||
check_equal "Ordered list in unordered list" "<ul><li>Item 1</li><li>Item 2<ol><li>Item 1</li><li>Item 2</li></ol></li><li>Item 3</li></ul>" (convert "- Item 1\n- Item 2\n 1. Item 1\n 2. Item 2\n- Item 3"),
|
||||
check_equal
|
||||
"Checkbox in unordered list"
|
||||
"<ul>\
|
||||
\<li><input type=\"checkbox\" />Not checked</li>\
|
||||
\<li><input type=\"checkbox\" checked=\"\" />Checked</li>\
|
||||
\<li>Normal list item</li></ul>"
|
||||
(convert "- [ ] Not checked\n- [x] Checked\n- Normal list item"),
|
||||
check_equal "List with link at the start" "<ul><li><a href=\"b\">a</a></li><li><a href=\"d\">c</a></li></ul>" (convert "- [a](b)\n- [c](d)")
|
||||
]
|
||||
|
||||
orderedListTests =
|
||||
@@ -91,7 +100,14 @@ orderedListTests =
|
||||
check_equal "Nested list then back" "<ol><li>Item 1</li><li>Item 2<ol><li>Item 3</li><li>Item 4</li></ol></li><li>Item 5</li></ol>" (convert "1. Item 1\n2. Item 2\n 1. Item 3\n 3. Item 4\n5. Item 5"),
|
||||
check_equal "Blockquote in list" "<ol><li>Item 1</li><li><p>Item 2</p><blockquote><p>Quote</p></blockquote></li><li>Item 3</li></ol>" (convert "1. Item 1\n2. Item 2\n\n > Quote\n\n3. Item 3"),
|
||||
check_equal "Unordered list in ordered list" "<ol><li>Item 1</li><li>Item 2<ul><li>Item 1</li><li>Item 2</li></ul></li><li>Item 3</li></ol>" (convert "1. Item 1\n2. Item 2\n - Item 1\n * Item 2\n4. Item 3"),
|
||||
check_equal "List with just 1 item" "<ol><li>Item 1</li></ol>" (convert "1. Item 1")
|
||||
check_equal "List with just 1 item" "<ol><li>Item 1</li></ol>" (convert "1. Item 1"),
|
||||
check_equal
|
||||
"Checkbox in ordered list"
|
||||
"<ol>\
|
||||
\<li><input type=\"checkbox\" />Not checked</li>\
|
||||
\<li><input type=\"checkbox\" checked=\"\" />Checked</li>\
|
||||
\<li>Normal list item</li></ol>"
|
||||
(convert "1. [ ] Not checked\n2. [x] Checked\n3. Normal list item")
|
||||
]
|
||||
|
||||
htmlTests =
|
||||
@@ -102,7 +118,9 @@ codeTests =
|
||||
TestList
|
||||
[ check_equal "Code by itself" "<p><code>Hello world!</code></p>" (convert "`Hello world!`"),
|
||||
check_equal "Code in a paragraph" "<p>The following <code>text</code> is code</p>" (convert "The following `text` is code"),
|
||||
check_equal "Code across paragraphs (shouldn't work)" "<p>`Incomplete</p><p>Code`</p>" (convert "`Incomplete\n\nCode`") -- At the moment, this is just treated as a syntax error, so nothing is rendered.
|
||||
check_equal "Code across paragraphs (shouldn't work)" "<p>`Incomplete</p><p>Code`</p>" (convert "`Incomplete\n\nCode`"), -- At the moment, this is just treated as a syntax error, so nothing is rendered.
|
||||
check_equal "Code block" "<pre><code>Test code block</code></pre>" (convert "```\nTest code block\n```"),
|
||||
check_equal "Multiple code blocks" "<pre><code>Test code block</code></pre><pre><code>Next block</code></pre>" (convert "```\nTest code block\n```\n\n```\nNext block\n```")
|
||||
]
|
||||
|
||||
imageTests =
|
||||
@@ -120,6 +138,14 @@ horizontalRuleTests =
|
||||
TestList
|
||||
[check_equal "Horizontal Rule" "<p>a</p><hr><p>b</p>" (convert "a\n\n---\n\nb")]
|
||||
|
||||
subscriptTests =
|
||||
TestList
|
||||
[check_equal "Should convert subscript" "A<sub>b</sub>" (convert "A~b~")]
|
||||
|
||||
superscriptTests =
|
||||
TestList
|
||||
[check_equal "Should convert superscript" "A<sup>b</sup>" (convert "A^b^")]
|
||||
|
||||
tableTests =
|
||||
TestList
|
||||
[ check_equal
|
||||
@@ -138,7 +164,7 @@ integrationTests =
|
||||
check_equal "Integration 3" "<h1>Hello</h1><p>World</p>" (convert "# Hello\nWorld"),
|
||||
check_equal "Integration 4" "<p>a b</p>" (convert "a\nb"),
|
||||
check_equal "Integration 5" "<h1>Hello</h1>" (convert "# Hello\n"),
|
||||
check_equal "Integration 6" "<p>First line<br>Second line</p>" (convert "First line \nSecond line"),
|
||||
check_equal "Integration 6" "<p>First line<br />Second line</p>" (convert "First line \nSecond line"),
|
||||
check_equal
|
||||
"Integration 7"
|
||||
"<h1>Sample Markdown</h1><p>This is some basic, sample markdown.</p><h2>Second \
|
||||
|
Reference in New Issue
Block a user