Compare commits

24 Commits

Author SHA1 Message Date
5c871f2b25 Removed test file 2025-05-20 12:25:07 -05:00
5273c99e6e Added unordered list tests and integration tests; added strikethrough
tests to test list
2025-05-20 12:24:20 -05:00
50888c9c3d Added bold and strikethrough tests 2025-05-20 12:23:47 -05:00
45115c765c An unordered list must end in a blank line. 2025-05-20 12:23:27 -05:00
5b0d42fd2d Use the in-order parsing approach instead of the post-order one. 2025-05-20 12:23:13 -05:00
2a585d00f2 Enforce at least one space between list indicator and list text. 2025-05-20 12:22:37 -05:00
11a3b14cb1 Define a parser for list line tokens, update line token parser 2025-05-20 12:22:19 -05:00
58d3142855 Update comment 2025-05-20 12:21:45 -05:00
0fb651fffc Add parseUnit instead of parseString to lineParsers 2025-05-20 12:20:51 -05:00
bc05dede06 Create a list of parsers that are used for list line items. 2025-05-20 12:20:35 -05:00
b69e34f823 Parse a single character as a unit 2025-05-20 12:20:10 -05:00
2514ecdafc Parse bold, italic and strikethrough in-order, instead of trying to find
the end, then parsing everything in the middle.

The current approach parses the opening bold (or italic), some text,
then the closing bold (or italic), instead of parsing the opening,
closing, then everything in between.
2025-05-20 12:19:59 -05:00
c52d5556a2 Allow for multiple blank lines after header 2025-05-20 12:18:52 -05:00
5fc1b1122a Create a function to 'fallthrough parse' ie. try the second parser only
if the first one fails.
2025-05-20 12:18:23 -05:00
83dd0024c4 Space doesn't have to be a reserved character anymore. 2025-05-20 12:17:59 -05:00
70761649ad Derive Eq for defined types 2025-05-20 12:17:43 -05:00
b9c6cc4470 Implemented strikethrough parser 2025-05-16 19:29:39 -05:00
23691f9cfe Add strikethrough parser to line parser list 2025-05-16 19:28:41 -05:00
8c220cc800 A document can consist of unordered lists as well 2025-05-14 21:40:03 -05:00
ee453c0259 Fixed blockquote parser; implemented unordered list parser and relevant
sub-parsers
2025-05-14 21:39:27 -05:00
c90d23617a A blockquote must have a list of tokens 2025-05-14 21:38:50 -05:00
c574699a8a Added an import 2025-05-14 21:38:38 -05:00
f55e160e25 Added tests for unordered lists 2025-05-14 21:37:58 -05:00
dddcca0185 Wrote a lot of helper functions - most importantly greedyParse 2025-05-13 21:40:42 -05:00
3 changed files with 201 additions and 37 deletions

View File

@@ -7,15 +7,16 @@ module MdToHTML where
import Control.Applicative
import Control.Monad
import Data.List
import Data.Ord (comparing)
import Debug.Trace
import Text.ParserCombinators.ReadP
import Text.Printf
type HeaderLevel = Int
newtype URL = URL {getUrl :: String}
newtype URL = URL {getUrl :: String} deriving (Eq)
newtype ImgPath = ImgPath {getPath :: String}
newtype ImgPath = ImgPath {getPath :: String} deriving (Eq)
parseMany :: ReadP a -> ReadP [a]
parseMany = Text.ParserCombinators.ReadP.many
@@ -28,7 +29,7 @@ data MdToken
| SingleNewline -- A single newline is rendered as a space.
| Linebreak
| HorizontalRule
| Blockquote MdToken
| Blockquote [MdToken]
| UnordList [MdToken]
| OrdList [MdToken]
| Code String
@@ -39,6 +40,7 @@ data MdToken
| Italic MdToken
| Strikethrough MdToken
| Unit String
deriving (Eq)
-- Deriving Show for MdToken
instance Show MdToken where
@@ -96,7 +98,54 @@ lookaheadParse stringCmp = do
lineToList :: MdToken -> [MdToken]
lineToList (Line tokens) = tokens
specialChars = "\\#*_[\n "
specialChars = "\\#*_[\n"
-- Makes a parser greedy. Instead of returning all possible parses, only the longest one is returned.
greedyParse :: ReadP a -> ReadP [a]
greedyParse parser = do
greedyParse1 parser <++ return []
-- Like greedyParse, but the parser must succeed atleast once.
greedyParse1 :: ReadP a -> ReadP [a]
greedyParse1 parser = do
parsed1 <- parser
parsed2 <- greedyParse1 parser <++ return []
return (parsed1 : parsed2)
prepend :: [a] -> [a] -> [a]
prepend x1 x2 = x1 ++ x2
append :: [a] -> [a] -> [a]
append x1 x2 = x2 ++ x1
-- Sequence two parsers, running one after the other and returning the result.
sequenceParse :: ReadP a -> ReadP a -> ReadP [a]
sequenceParse p1 p2 = twoElemList <$> p1 <*> p2
where
twoElemList elem1 elem2 = [elem1, elem2]
-- Parses p1 until p2 succeeds, but doesn't actually consume anything from p2.
-- Similar to manyTill, except manyTill's second parser actually consumes characters.
manyTillLazy :: ReadP a -> ReadP b -> ReadP [a]
manyTillLazy p1 p2 = do
res <- p1
remaining <- look
let p2res = readP_to_S p2 remaining
case p2res of
[] -> do
res2 <- manyTillLazy p1 p2
return (res : res2)
_ -> return [res]
-- Parse until EOL or EOF
parseTillEol :: ReadP String
parseTillEol = manyTill get (void (char '\n') <++ eof)
-- Takes a list of parsers. Returns a parser that will try them in
-- order, moving to the next one only if the current one fails.
fallthroughParser :: [ReadP a] -> ReadP a
fallthroughParser [x] = x
fallthroughParser (x : xs) = x <++ fallthroughParser xs
-- ---------------
@@ -110,31 +159,38 @@ parseHeader = do
pfail
skipSpaces
text <- munch1 (/= '\n')
Text.ParserCombinators.ReadP.optional (char '\n')
-- Text.ParserCombinators.ReadP.optional (char '\n')
skipSpaces
let parsedText = fst $ leftmostLongestParse parseLine text
return (Header (length headers) parsedText)
-- Parse bold text
parseBold :: ReadP MdToken
parseBold = do
text <-
choice
[ between (string "__") (string "__") (many1 (lookaheadParse (/= "__"))),
between (string "**") (string "**") (many1 (lookaheadParse (/= "**")))
]
let parsedText = fst $ leftmostLongestParse parseLine text
return (Bold parsedText)
parseBold = parseBoldWith "**" <|> parseBoldWith "__"
where
parseBoldWith delim = do
string delim
inside <- greedyParse1 parseLineToken
string delim
return (Bold (Line inside))
-- Parse italic text
parseItalic :: ReadP MdToken
parseItalic = do
text <-
choice
[ between (string "_") (string "_") (munch1 (/= '_')),
between (string "*") (string "*") (munch1 (/= '*'))
]
let parsedText = fst $ leftmostLongestParse parseLine text
return (Italic parsedText)
parseItalic = parseBoldWith "*" <|> parseBoldWith "_"
where
parseBoldWith delim = do
string delim
inside <- greedyParse1 parseLineToken
string delim
return (Italic (Line inside))
-- Parse strikethrough text
parseStrikethrough :: ReadP MdToken
parseStrikethrough = do
string "~~"
inside <- many1 parseLineToken
string "~~"
return (Strikethrough (Line inside))
-- Parse a link
parseLink :: ReadP MdToken
@@ -164,6 +220,12 @@ parseEscapedChar = do
escapedChar <- choice (map char specialChars) -- Parse any of the special chars.
return (Unit [escapedChar])
-- Parse a character as a Unit.
parseUnit :: ReadP MdToken
parseUnit = do
text <- satisfy (`notElem` specialChars)
return (Unit [text])
-- Parse a regular string as a Unit.
parseString :: ReadP MdToken
parseString = do
@@ -178,17 +240,33 @@ lineParsers =
parseEscapedChar,
parseBold,
parseItalic,
parseStrikethrough,
parseLink,
parseString
parseUnit
] -- A 'line' doesn't include a 'header'
listLineParsers :: [ReadP MdToken]
listLineParsers =
[ parseLinebreak,
parseEscapedChar,
parseBold,
parseItalic,
parseStrikethrough,
parseLink,
parseUnit
] -- A list line cannot contain newlines.
-- List of all parsers
allParsers :: [ReadP MdToken]
allParsers = parseHeader : lineParsers
-- Parse any of the above tokens.
-- Parse any of the line tokens.
parseLineToken :: ReadP MdToken
parseLineToken = choice lineParsers
parseLineToken = fallthroughParser lineParsers
-- Parse any of the list line tokens.
parseListLineToken :: ReadP MdToken
parseListLineToken = fallthroughParser listLineParsers
-- Parse a line, consisting of one or more tokens.
parseLine :: ReadP MdToken
@@ -230,11 +308,72 @@ parseQuotedLines =
-- Parse a blockquote, which is a greater-than sign followed by a paragraph.
parseBlockquote :: ReadP MdToken
parseBlockquote = do
char '>'
Blockquote <$> (parseBlockquote <++ parsePara) -- Parse another blockquote or a regular paragraph, wrap it in a blockquote.
quotedLines <- parseQuotedLines
-- remaining <- look
-- let quotedLines = fst $ leftmostLongestParse parseQuotedLines remaining
-- string (init $ unlines quotedLines)
let parsedQuotedLines = fst $ leftmostLongestParse (many1 (parseBlockquote <++ parsePara)) (init $ unlines quotedLines) -- unlines joins the lines together with a newline, and adds a trailing newline. init removes the trailing newline.
return (Blockquote parsedQuotedLines)
-- Parse a nested list item.
parseUListNested :: ReadP MdToken
parseUListNested = do
-- firstChar <- string " " <++ string "\t"
-- skipSpaces
-- restOfLine <- manyTill get (void (char '\n') <++ eof)
-- let restOfLineParsed = fst $ leftmostLongestParse parseLine restOfLine
-- return restOfLineParsed
let firstCharParser = string " " <++ string "\t"
let restOfLineParser = manyTill get (void (char '\n') <++ eof)
lines <- greedyParse1 (firstCharParser *> restOfLineParser)
let linesParsed = fst $ leftmostLongestParse parseUnorderedList (init $ unlines lines)
return linesParsed
-- Parse an unordered list line item.
parseUListLineItem :: ReadP MdToken
parseUListLineItem = do
firstChar <- choice (map char ['*', '+', '-'])
char ' ' -- At least one space between list indicator and list text.
skipSpaces
restOfLine <- many1 parseListLineToken
void (char '\n') <++ eof
nestedList <- parseUListNested <++ return (Unit "")
return $ Line [Line restOfLine, nestedList]
-- restOfLine <- manyTill get (void (char '\n') <++ eof)
-- let restOfLineParsed = fst $ leftmostLongestParse parseLine restOfLine
-- nestedList <- parseUListNested <++ return (Unit "")
-- return $ Line [restOfLineParsed, nestedList]
-- Parse an unordered list paragraph item.
-- This is defined as a line item, followed by an empty line, followed by one or more
-- lines indented by a space or tab.
parseUListParaItem :: ReadP MdToken
parseUListParaItem = do
firstLine <- parseUListLineItem
char '\n'
lines <- greedyParse1 ((string " " <|> string "\t") *> parseTillEol)
let res = fst $ leftmostLongestParse (greedyParse1 parsePara) (init $ unlines lines)
char '\n'
return $ Document (Para firstLine : res) -- I only wrap this in a document because I want some way of converting [MdToken] to MdToken, without any overhead. There is no other reason to wrap it in a Document.
-- This is hacky as hell
-- parsedParas <- manyTillLazy parsePara (string "\n\n" *> choice (map char "*-+"))
-- return $ Document parsedParas -- I wrap this in a document because I want some way of converting [MdToken] to MdToken, without any overhead. There is no other reason to wrap it in a Document.
-- Parse an unordered list item, which can be a line item or another list.
parseUListItem :: ReadP MdToken
parseUListItem = parseUListParaItem <++ parseUListLineItem <++ parseUListNested
-- Parse an unordered list.
parseUnorderedList :: ReadP MdToken
parseUnorderedList = do
lineItems <- greedyParse1 parseUListItem
void (char '\n') <++ eof -- A list must end in an extra newline or eof
return $ UnordList lineItems
-- Parse a document, which is multiple paragraphs.
parseDocument :: ReadP MdToken
parseDocument = do
res <- manyTill (parseHeader <++ parseBlockquote <++ parsePara) eof
res <- manyTill (parseHeader <++ parseBlockquote <++ parseUnorderedList <++ parsePara) eof
return (Document res)

View File

@@ -23,7 +23,15 @@ boldTests =
TestList
[ check_equal "Should convert bold" "<p><b>Hello</b></p>" (convert "__Hello__"),
check_equal "Should convert italic" "<p><i>Hello</i></p>" (convert "_Hello_"),
check_equal "Should convert bold and italic in a sentence" "<p>It <i>is</i> a <b>wonderful</b> day</p>" (convert "It _is_ a __wonderful__ day")
check_equal "Should convert bold and italic in a sentence" "<p>It <i>is</i> a <b>wonderful</b> day</p>" (convert "It _is_ a __wonderful__ day"),
check_equal "Should convert nested bold and italic" "<p><b>Bold then <i>Italic</i></b></p>" (convert "**Bold then *Italic***"),
check_equal "Should convert nested bold and italic" "<p><i>Italic then <b>Bold</b></i></p>" (convert "*Italic then **Bold***")
]
strikethroughTests =
TestList
[ check_equal "Should convert strikethrough" "<p><s>Hello</s></p>" (convert "~~Hello~~"),
check_equal "Should convert long sentence with tilde" "<p><s>The universe is ~7 days old</s>. The universe is 13 billion years old.</p>" (convert "~~The universe is ~7 days old~~. The universe is 13 billion years old.")
]
linkTests =
@@ -56,9 +64,16 @@ blockquoteTests =
)
]
orderedListTests =
unorderedListTests =
TestList
[ check_equal "Basic ordered list" "<ol><li>Item 1</li><li>Item 2</li><li>Item 3</li></ol" (convert "1. Item 1\n2. Item2\n3. Item3")
[ check_equal "Basic ordered list" "<ul><li>Item 1</li><li>Item 2</li><li>Item 3</li></ul>" (convert "* Item 1\n* Item 2\n* Item 3"),
check_equal "Mixing list indicators" "<ul><li>Item 1</li><li>Item 2</li><li>Item 3</li></ul>" (convert "* Item 1\n+ Item 2\n- Item 3"),
check_equal "Formatted lists" "<ul><li><b>Item 1</b></li><li><i>Item 2</i></li><li><b><i>Item 3</i></b></li></ul>" (convert "* __Item 1__\n+ _Item 2_\n- ***Item 3***"),
check_equal "Nested list" "<ul><li>Item 1</li><li>Item 2</li><li>Item 3<ul><li>Subitem 1</li><li>Subitem 2</li></ul></li></ul>" (convert "* Item 1\n* Item 2\n* Item 3\n * Subitem 1\n * Subitem 2"),
check_equal "Paragraph in list" "<ul><li>Item 1</li><li><p>Item 2</p><p>More stuff</p></li><li>Item 3</li></ul>" (convert "- Item 1\n- Item 2\n\n More stuff\n\n- Item 3"),
check_equal "Paragraph before list" "<p>This is a list</p><ul><li>Item 1</li><li>Item 2</li></ul>" (convert "This is a list\n\n* Item 1\n* Item 2"),
check_equal "Paragraph before list" "<h3>This is a list</h3><ul><li>Item 1</li><li>Item 2</li></ul>" (convert "### This is a list\n\n* Item 1\n* Item 2"),
check_equal "Nested list then back" "<ul><li>Item 1</li><li>Item 2<ul><li>Item 3</li><li>Item 4</li></ul></li><li>Item 5</li></ul>" (convert "- Item 1\n- Item 2\n - Item 3\n - Item 4\n- Item 5")
]
integrationTests =
@@ -68,17 +83,32 @@ integrationTests =
check_equal "Integration 3" "<h1>Hello</h1><p>World</p>" (convert "# Hello\nWorld"),
check_equal "Integration 4" "<p>a b</p>" (convert "a\nb"),
check_equal "Integration 5" "<h1>Hello</h1>" (convert "# Hello\n"),
check_equal "Integration 6" "<p>First line<br>Second line</p>" (convert "First line \nSecond line")
check_equal "Integration 6" "<p>First line<br>Second line</p>" (convert "First line \nSecond line"),
check_equal
"Integration 7"
"<h1>Sample Markdown</h1><p>This is some basic, sample markdown.</p><h2>Second \
\Heading</h2><ul><li>Unordered lists, and:<ul><li>One</li><li>Two</li><li>\
\Three</li></ul></li><li>More</li></ul><blockquote><p>Blockquote</p>\
\</blockquote><p>And <b>bold</b>, <i>italics</i>, and even <i>italics \
\and later <b>bold</b></i>. Even <s>strikethrough</s>. \
\<a href=\"https://markdowntohtml.com\">A link</a> to somewhere.</p>"
( convert
"# Sample Markdown\n\nThis is some basic, sample markdown.\n\n## Second \
\Heading\n\n- Unordered lists, and:\n - One\n - Two\n - Three\n\
\- More\n\n> Blockquote\n\nAnd **bold**, *italics*, and even *italics and \
\later **bold***. Even ~~strikethrough~~. [A link](https://markdowntohtml.com) to somewhere."
)
]
tests =
TestList
[ headerTests,
boldTests,
strikethroughTests,
linkTests,
escapedCharTests,
blockquoteTests,
orderedListTests,
unorderedListTests,
integrationTests
]

View File

@@ -1,5 +0,0 @@
leftmostLongest :: (Foldable t) => [t a] -> t a
leftmostLongest xs =
let lastElem = (last xs)
filteredLst = (filter (\val -> (length val) == (length lastElem)) xs)
in head filteredLst