25 Commits

Author SHA1 Message Date
09982f4ab1 Made some changes 2025-05-23 08:07:02 -04:00
4101767aff Merge remote-tracking branch 'origin' into fixingIncompleteElements 2025-05-20 22:24:24 -04:00
ed7d2c1ef1 Added code parser to list line parser list 2025-05-20 22:23:41 -04:00
234145bcb3 Started working on parsing changes to allow incomplete elements to be parsed (eg. opening bold, followed by text, without closing bold) 2025-05-20 22:21:59 -04:00
eb20f154a4 Removed unused function 2025-05-20 22:20:36 -04:00
172985131b Fixed HTML rendering for horizontal rule 2025-05-20 22:20:23 -04:00
3781e67ab1 Created a separate list of escapable chars 2025-05-20 22:20:05 -04:00
f2d54edd3f Implemented inline code parsing 2025-05-20 22:19:44 -04:00
5393dc4eb9 Added code tests 2025-05-20 22:18:46 -04:00
e051c87f08 Factor list line common parsing into a separate function; refactored
OList and UList line parsing to us it
2025-05-20 16:48:31 -04:00
9b1c51897c A nested list can be ordered or unordered 2025-05-20 16:47:58 -04:00
2a3dddc7b0 Rename function 2025-05-20 16:47:31 -04:00
a8793b5adb Remove obsolete comments 2025-05-20 16:47:16 -04:00
eecec764ad A list can contain a blockquote as well. 2025-05-20 16:47:04 -04:00
cdca6ea95e A list item doesn't need to have a nested list item parser, because the
line item parser handles the nested list.
2025-05-20 16:46:52 -04:00
540b5430e5 Use list of document parsers in parseDocument 2025-05-20 16:45:56 -04:00
00dfba81eb Created a list of document parsers; implemented ordered list parsing. 2025-05-20 16:45:37 -04:00
39152c0034 Factor out common code for UList and OList parsing into a separate
function. Refactored UList function; wrote OList function.
2025-05-20 16:45:07 -04:00
41b35be7c9 Rename function 2025-05-20 16:43:35 -04:00
d2c8565f62 Import Data.Char for isDigit 2025-05-20 16:43:22 -04:00
62eeef2abb Removed unused functions 2025-05-20 16:43:10 -04:00
9c6634cfec Added ordered list tests 2025-05-20 16:42:40 -04:00
2a5a68b1de Fixed test name 2025-05-20 16:42:12 -04:00
f8e1a98bdf Remove obsolete comment 2025-05-20 14:03:30 -04:00
05433c31f1 Remove unused functions 2025-05-20 14:02:50 -04:00
2 changed files with 141 additions and 79 deletions

View File

@@ -6,6 +6,7 @@ module MdToHTML where
import Control.Applicative
import Control.Monad
import Data.Char
import Data.List
import Data.Ord (comparing)
import Debug.Trace
@@ -32,7 +33,7 @@ data MdToken
| Blockquote [MdToken]
| UnordList [MdToken]
| OrdList [MdToken]
| Code String
| Code MdToken
| Codeblock String
| Link MdToken URL
| Image MdToken ImgPath
@@ -50,11 +51,11 @@ instance Show MdToken where
show (Line tokens) = concatMap show tokens
show Linebreak = "<br>"
show SingleNewline = " "
show HorizontalRule = "---------"
show HorizontalRule = "<hr>"
show (Blockquote tokens) = "<blockquote>" ++ concatMap show tokens ++ "</blockquote>"
show (UnordList tokens) = "<ul>" ++ concatMap (prepend "<li>" . append "</li>" . show) tokens ++ "</ul>"
show (OrdList tokens) = "<ol>" ++ concatMap (prepend "<li>" . append "</li>" . show) tokens ++ "</ol>"
show (Code code) = show code
show (Code code) = "<code>" ++ show code ++ "</code>"
show (Codeblock code) = show code
show (Link txt url) = "<a href=\"" ++ getUrl url ++ "\">" ++ show txt ++ "</a>"
show (Image txt imgPath) = "<img src=" ++ getPath imgPath ++ ">" ++ show txt ++ "</img>"
@@ -87,18 +88,9 @@ leftmostLongestParse parser input =
Nothing -> (mempty, mempty)
Just x -> x
-- Parse if the string that's left matches the string comparator function
lookaheadParse :: (String -> Bool) -> ReadP Char
lookaheadParse stringCmp = do
lookahead <- look
case stringCmp lookahead of
True -> get
False -> pfail
specialChars = "\\#*_[\n`"
lineToList :: MdToken -> [MdToken]
lineToList (Line tokens) = tokens
specialChars = "\\#*_[\n"
escapableChars = '~' : specialChars
-- Makes a parser greedy. Instead of returning all possible parses, only the longest one is returned.
greedyParse :: ReadP a -> ReadP [a]
@@ -118,25 +110,6 @@ prepend x1 x2 = x1 ++ x2
append :: [a] -> [a] -> [a]
append x1 x2 = x2 ++ x1
-- Sequence two parsers, running one after the other and returning the result.
sequenceParse :: ReadP a -> ReadP a -> ReadP [a]
sequenceParse p1 p2 = twoElemList <$> p1 <*> p2
where
twoElemList elem1 elem2 = [elem1, elem2]
-- Parses p1 until p2 succeeds, but doesn't actually consume anything from p2.
-- Similar to manyTill, except manyTill's second parser actually consumes characters.
manyTillLazy :: ReadP a -> ReadP b -> ReadP [a]
manyTillLazy p1 p2 = do
res <- p1
remaining <- look
let p2res = readP_to_S p2 remaining
case p2res of
[] -> do
res2 <- manyTillLazy p1 p2
return (res : res2)
_ -> return [res]
-- Parse until EOL or EOF
parseTillEol :: ReadP String
parseTillEol = manyTill get (void (char '\n') <++ eof)
@@ -147,6 +120,16 @@ fallthroughParser :: [ReadP a] -> ReadP a
fallthroughParser [x] = x
fallthroughParser (x : xs) = x <++ fallthroughParser xs
myMany :: (Monoid a) => ReadP a -> ReadP [a]
myMany p = do
remaining <- look
case remaining of
[] -> return []
_ -> return [] +++ myMany1 p
myMany1 :: (Monoid a) => ReadP a -> ReadP [a]
myMany1 p = liftM2 (:) p (myMany p)
-- ---------------
-- Parse a markdown header, denoted by 1-6 #'s followed by some text, followed by EOL.
@@ -170,19 +153,25 @@ parseBold = parseBoldWith "**" <|> parseBoldWith "__"
where
parseBoldWith delim = do
string delim
inside <- greedyParse1 parseLineToken
inside <- myMany1 parseLineToken
string delim
return (Bold (Line inside))
-- Parse italic text
parseItalic :: ReadP MdToken
parseItalic = parseBoldWith "*" <|> parseBoldWith "_"
parseItalic = parseItalicWith '*' <|> parseItalicWith '_'
where
parseBoldWith delim = do
string delim
inside <- greedyParse1 parseLineToken
string delim
parseItalicWith delim = do
exactlyOnce delim
inside <- myMany1 parseLineToken
exactlyOnce delim
return (Italic (Line inside))
exactlyOnce ch = do
char ch
remaining <- look
case remaining of
[] -> return ch
x : xs -> if x == ch then pfail else return ch
-- Parse strikethrough text
parseStrikethrough :: ReadP MdToken
@@ -192,6 +181,14 @@ parseStrikethrough = do
string "~~"
return (Strikethrough (Line inside))
-- Parse code
parseCode :: ReadP MdToken
parseCode = do
string "`"
inside <- many1 get
string "`"
return (Code (Unit inside))
-- Parse a link
parseLink :: ReadP MdToken
parseLink = do
@@ -217,27 +214,22 @@ parseSingleNewline = do
parseEscapedChar :: ReadP MdToken
parseEscapedChar = do
char '\\'
escapedChar <- choice (map char specialChars) -- Parse any of the special chars.
escapedChar <- choice (map char escapableChars) -- Parse any of the special chars.
return (Unit [escapedChar])
-- Parse a character as a Unit.
parseUnit :: ReadP MdToken
parseUnit = do
text <- satisfy (`notElem` specialChars)
-- text <- satisfy (`notElem` specialChars)
text <- get
return (Unit [text])
-- Parse a regular string as a Unit.
parseString :: ReadP MdToken
parseString = do
firstChar <- satisfy (/= '\n') -- Must parse at least one non-newline character here
text <- munch (`notElem` specialChars)
return (Unit (firstChar : text))
lineParsers :: [ReadP MdToken]
lineParsers =
[ parseLinebreak,
parseSingleNewline,
parseEscapedChar,
parseCode,
parseBold,
parseItalic,
parseStrikethrough,
@@ -249,6 +241,7 @@ listLineParsers :: [ReadP MdToken]
listLineParsers =
[ parseLinebreak,
parseEscapedChar,
parseCode,
parseBold,
parseItalic,
parseStrikethrough,
@@ -273,7 +266,7 @@ parseLine :: ReadP MdToken
parseLine = do
skipSpaces
-- Fail if we have reached the end of the document.
parsed <- manyTill parseLineToken eof
parsed <- myMany1 parseLineToken
return (Line parsed)
-- Parse a paragraph, which is a 'Line' (can span multiple actual lines), separated by double-newlines.
@@ -316,17 +309,13 @@ parseBlockquote = do
return (Blockquote parsedQuotedLines)
-- Parse a nested list item.
parseUListNested :: ReadP MdToken
parseUListNested = do
-- firstChar <- string " " <++ string "\t"
-- skipSpaces
-- restOfLine <- manyTill get (void (char '\n') <++ eof)
-- let restOfLineParsed = fst $ leftmostLongestParse parseLine restOfLine
-- return restOfLineParsed
parseListNested :: ReadP MdToken
parseListNested = do
let firstCharParser = string " " <++ string "\t"
let restOfLineParser = manyTill get (void (char '\n') <++ eof)
lines <- greedyParse1 (firstCharParser *> restOfLineParser)
let linesParsed = fst $ leftmostLongestParse parseUnorderedList (init $ unlines lines)
let linesParsed = fst $ leftmostLongestParse (parseUnorderedList <++ parseOrderedList) (init $ unlines lines)
when (null (show linesParsed)) pfail
return linesParsed
-- Parse an unordered list line item.
@@ -334,36 +323,54 @@ parseUListLineItem :: ReadP MdToken
parseUListLineItem = do
firstChar <- choice (map char ['*', '+', '-'])
char ' ' -- At least one space between list indicator and list text.
parseListLineItemCommon
-- Parse an ordered list line item.
parseOListLineItem :: ReadP MdToken
parseOListLineItem = do
num <- greedyParse1 (satisfy isDigit)
char '.'
char ' ' -- At least one space between list indicator and list text.
parseListLineItemCommon
-- Common code for parsing list line items
parseListLineItemCommon :: ReadP MdToken
parseListLineItemCommon = do
skipSpaces
restOfLine <- many1 parseListLineToken
void (char '\n') <++ eof
nestedList <- parseUListNested <++ return (Unit "")
nestedList <- parseListNested <++ return (Unit "")
return $ Line [Line restOfLine, nestedList]
-- restOfLine <- manyTill get (void (char '\n') <++ eof)
-- let restOfLineParsed = fst $ leftmostLongestParse parseLine restOfLine
-- nestedList <- parseUListNested <++ return (Unit "")
-- return $ Line [restOfLineParsed, nestedList]
-- Parse an unordered list paragraph item.
-- This is defined as a line item, followed by an empty line, followed by one or more
-- lines indented by a space or tab.
parseUListParaItem :: ReadP MdToken
parseUListParaItem = do
firstLine <- parseUListLineItem
char '\n'
lines <- greedyParse1 ((string " " <|> string "\t") *> parseTillEol)
let res = fst $ leftmostLongestParse (greedyParse1 parsePara) (init $ unlines lines)
char '\n'
res <- parseListParaItemCommon
return $ Document (Para firstLine : res) -- I only wrap this in a document because I want some way of converting [MdToken] to MdToken, without any overhead. There is no other reason to wrap it in a Document.
-- This is hacky as hell
-- parsedParas <- manyTillLazy parsePara (string "\n\n" *> choice (map char "*-+"))
-- return $ Document parsedParas -- I wrap this in a document because I want some way of converting [MdToken] to MdToken, without any overhead. There is no other reason to wrap it in a Document.
-- Parse an unordered list paragraph item.
parseOListParaItem :: ReadP MdToken
parseOListParaItem = do
firstLine <- parseOListLineItem
res <- parseListParaItemCommon
return $ Document (Para firstLine : res) -- I only wrap this in a document because I want some way of converting [MdToken] to MdToken, without any overhead. There is no other reason to wrap it in a Document.
-- Common code for parsing list paragraph items.
-- A list paragraph item is defined as a line item, followed by an empty line, followed by one or more
-- lines indented by a space or tab.
-- A list paragraph item can also be a blockquote.
parseListParaItemCommon :: ReadP [MdToken]
parseListParaItemCommon = do
char '\n'
lines <- greedyParse1 ((string " " <|> string "\t") *> parseTillEol)
let res = fst $ leftmostLongestParse (greedyParse1 parseBlockquote <++ greedyParse1 parsePara) (init $ unlines lines)
char '\n'
return res -- I only wrap this in a document because I want some way of converting [MdToken] to MdToken, without any overhead. There is no other reason to wrap it in a Document.
-- Parse an unordered list item, which can be a line item or another list.
parseUListItem :: ReadP MdToken
parseUListItem = parseUListParaItem <++ parseUListLineItem <++ parseUListNested
parseUListItem = parseUListParaItem <++ parseUListLineItem
-- Parse an unordered list.
parseUnorderedList :: ReadP MdToken
@@ -372,8 +379,36 @@ parseUnorderedList = do
void (char '\n') <++ eof -- A list must end in an extra newline or eof
return $ UnordList lineItems
-- --------
parseOListItem :: ReadP MdToken
parseOListItem = parseOListParaItem <++ parseOListLineItem
-- Parses the first element of an ordered list, which must start with '1.'
parseFirstOListItem :: ReadP MdToken
parseFirstOListItem = do
remaining <- look
when (take 2 remaining /= "1.") pfail
parseOListLineItem
parseOrderedList :: ReadP MdToken
parseOrderedList = do
firstLine <- parseFirstOListItem
lineItems <- greedyParse1 parseOListItem
void (char '\n') <++ eof
return $ OrdList (firstLine : lineItems)
documentParsers :: [ReadP MdToken]
documentParsers =
[ parseHeader,
parseBlockquote,
parseUnorderedList,
parseOrderedList,
parsePara
]
-- Parse a document, which is multiple paragraphs.
parseDocument :: ReadP MdToken
parseDocument = do
res <- manyTill (parseHeader <++ parseBlockquote <++ parseUnorderedList <++ parsePara) eof
res <- manyTill (fallthroughParser documentParsers) eof
return (Document res)

View File

@@ -22,6 +22,7 @@ headerTests =
boldTests =
TestList
[ check_equal "Should convert bold" "<p><b>Hello</b></p>" (convert "__Hello__"),
check_equal " Should not convert incomplete bold" "<p>**Hello</p>" (convert "**Hello"),
check_equal "Should convert italic" "<p><i>Hello</i></p>" (convert "_Hello_"),
check_equal "Should convert bold and italic in a sentence" "<p>It <i>is</i> a <b>wonderful</b> day</p>" (convert "It _is_ a __wonderful__ day"),
check_equal "Should convert nested bold and italic" "<p><b>Bold then <i>Italic</i></b></p>" (convert "**Bold then *Italic***"),
@@ -66,14 +67,38 @@ blockquoteTests =
unorderedListTests =
TestList
[ check_equal "Basic ordered list" "<ul><li>Item 1</li><li>Item 2</li><li>Item 3</li></ul>" (convert "* Item 1\n* Item 2\n* Item 3"),
[ check_equal "Basic unordered list" "<ul><li>Item 1</li><li>Item 2</li><li>Item 3</li></ul>" (convert "* Item 1\n* Item 2\n* Item 3"),
check_equal "Mixing list indicators" "<ul><li>Item 1</li><li>Item 2</li><li>Item 3</li></ul>" (convert "* Item 1\n+ Item 2\n- Item 3"),
check_equal "Formatted lists" "<ul><li><b>Item 1</b></li><li><i>Item 2</i></li><li><b><i>Item 3</i></b></li></ul>" (convert "* __Item 1__\n+ _Item 2_\n- ***Item 3***"),
check_equal "Nested list" "<ul><li>Item 1</li><li>Item 2</li><li>Item 3<ul><li>Subitem 1</li><li>Subitem 2</li></ul></li></ul>" (convert "* Item 1\n* Item 2\n* Item 3\n * Subitem 1\n * Subitem 2"),
check_equal "Paragraph in list" "<ul><li>Item 1</li><li><p>Item 2</p><p>More stuff</p></li><li>Item 3</li></ul>" (convert "- Item 1\n- Item 2\n\n More stuff\n\n- Item 3"),
check_equal "Paragraph before list" "<p>This is a list</p><ul><li>Item 1</li><li>Item 2</li></ul>" (convert "This is a list\n\n* Item 1\n* Item 2"),
check_equal "Paragraph before list" "<h3>This is a list</h3><ul><li>Item 1</li><li>Item 2</li></ul>" (convert "### This is a list\n\n* Item 1\n* Item 2"),
check_equal "Nested list then back" "<ul><li>Item 1</li><li>Item 2<ul><li>Item 3</li><li>Item 4</li></ul></li><li>Item 5</li></ul>" (convert "- Item 1\n- Item 2\n - Item 3\n - Item 4\n- Item 5")
check_equal "Nested list then back" "<ul><li>Item 1</li><li>Item 2<ul><li>Item 3</li><li>Item 4</li></ul></li><li>Item 5</li></ul>" (convert "- Item 1\n- Item 2\n - Item 3\n - Item 4\n- Item 5"),
check_equal "Blockquote in list" "<ul><li>Item 1</li><li><p>Item 2</p><blockquote><p>Quote</p></blockquote></li><li>Item 3</li></ul>" (convert "- Item 1\n- Item 2\n\n > Quote\n\n- Item 3"),
check_equal "Ordered list in unordered list" "<ul><li>Item 1</li><li>Item 2<ol><li>Item 1</li><li>Item 2</li></ol></li><li>Item 3</li></ul>" (convert "- Item 1\n- Item 2\n 1. Item 1\n 2. Item 2\n- Item 3")
]
orderedListTests =
TestList
[ check_equal "Basic ordered list" "<ol><li>Item 1</li><li>Item 2</li><li>Item 3</li></ol>" (convert "1. Item 1\n2. Item 2\n3. Item 3"),
check_equal "Mixing list numbering" "<ol><li>Item 1</li><li>Item 2</li><li>Item 3</li></ol>" (convert "1. Item 1\n3. Item 2\n2. Item 3"),
check_equal "Should not convert list without number 1" "<p>2. Item 1 1. Item 2</p>" (convert "2. Item 1\n1. Item 2"),
check_equal "Formatted lists" "<ol><li><b>Item 1</b></li><li><i>Item 2</i></li><li><b><i>Item 3</i></b></li></ol>" (convert "1. __Item 1__\n2. _Item 2_\n3. ***Item 3***"),
check_equal "Nested list" "<ol><li>Item 1</li><li>Item 2</li><li>Item 3<ol><li>Subitem 1</li><li>Subitem 2</li></ol></li></ol>" (convert "1. Item 1\n2. Item 2\n3. Item 3\n 1. Subitem 1\n 2. Subitem 2"),
check_equal "Paragraph in list" "<ol><li>Item 1</li><li><p>Item 2</p><p>More stuff</p></li><li>Item 3</li></ol>" (convert "1. Item 1\n2. Item 2\n\n More stuff\n\n1. Item 3"),
check_equal "Paragraph before list" "<p>This is a list</p><ol><li>Item 1</li><li>Item 2</li></ol>" (convert "This is a list\n\n1. Item 1\n1. Item 2"),
check_equal "Paragraph before list" "<h3>This is a list</h3><ol><li>Item 1</li><li>Item 2</li></ol>" (convert "### This is a list\n\n1. Item 1\n200. Item 2"),
check_equal "Nested list then back" "<ol><li>Item 1</li><li>Item 2<ol><li>Item 3</li><li>Item 4</li></ol></li><li>Item 5</li></ol>" (convert "1. Item 1\n2. Item 2\n 1. Item 3\n 3. Item 4\n5. Item 5"),
check_equal "Blockquote in list" "<ol><li>Item 1</li><li><p>Item 2</p><blockquote><p>Quote</p></blockquote></li><li>Item 3</li></ol>" (convert "1. Item 1\n2. Item 2\n\n > Quote\n\n3. Item 3"),
check_equal "Unordered list in ordered list" "<ol><li>Item 1</li><li>Item 2<ul><li>Item 1</li><li>Item 2</li></ul></li><li>Item 3</li></ol>" (convert "1. Item 1\n2. Item 2\n - Item 1\n * Item 2\n4. Item 3")
]
codeTests =
TestList
[ check_equal "Code by itself" "<p><code>Hello world!</code></p>" (convert "`Hello world!`"),
check_equal "Code in a paragraph" "<p>The following <code>text</code> is code</p>" (convert "The following `text` is code"),
check_equal "Code across paragraphs (shouldn't work" "<p>`Incomplete</p><p>Code`</p>" (convert "`Incomplete\n\nCode`")
]
integrationTests =
@@ -87,14 +112,14 @@ integrationTests =
check_equal
"Integration 7"
"<h1>Sample Markdown</h1><p>This is some basic, sample markdown.</p><h2>Second \
\Heading</h2><ul><li>Unordered lists, and:<ul><li>One</li><li>Two</li><li>\
\Three</li></ul></li><li>More</li></ul><blockquote><p>Blockquote</p>\
\Heading</h2><ul><li>Unordered lists, and:<ol><li>One</li><li>Two</li><li>\
\Three</li></ol></li><li>More</li></ul><blockquote><p>Blockquote</p>\
\</blockquote><p>And <b>bold</b>, <i>italics</i>, and even <i>italics \
\and later <b>bold</b></i>. Even <s>strikethrough</s>. \
\<a href=\"https://markdowntohtml.com\">A link</a> to somewhere.</p>"
( convert
"# Sample Markdown\n\nThis is some basic, sample markdown.\n\n## Second \
\Heading\n\n- Unordered lists, and:\n - One\n - Two\n - Three\n\
\Heading\n\n- Unordered lists, and:\n 1. One\n 2. Two\n 3. Three\n\
\- More\n\n> Blockquote\n\nAnd **bold**, *italics*, and even *italics and \
\later **bold***. Even ~~strikethrough~~. [A link](https://markdowntohtml.com) to somewhere."
)
@@ -109,6 +134,8 @@ tests =
escapedCharTests,
blockquoteTests,
unorderedListTests,
orderedListTests,
codeTests,
integrationTests
]