Made some changes

Merge remote-tracking branch 'origin' into fixingIncompleteElements
Started working on parsing changes to allow incomplete elements to be parsed (eg. opening bold, followed by text, without closing bold)
2025-05-23 08:07:02 -04:00 · 2025-05-20 22:24:24 -04:00 · 2025-05-20 22:21:59 -04:00
4 changed files with 170 additions and 417 deletions
--- a/app/Main.hs
+++ b/app/Main.hs
@@ -1,30 +1,8 @@
 module Main where
 import MdToHTML
 import System.Environment
 import System.IO
 readLinesHelper :: [String] -> IO [String]
 readLinesHelper xs = do
  done <- isEOF
  if done
    then return xs
    else do
      line <- getLine
      let xs' = line : xs
      readLinesHelper xs'
 readLines :: IO [String]
 readLines = reverse <$> readLinesHelper []
 main :: IO ()
 main = do
-  args <- getArgs
+   let res = fst $ leftmostLongestParse parseDocument "# _Hello_\n"
-  fileContents <- case args of
+   putStrLn (show res)
    [] -> getContents
    x : _ -> readFile x
  let res = leftmostLongestParse parseDocument fileContents
  let toPrint = prettyPrint res
  case reverse toPrint of
    '\n' : _ -> putStr toPrint
    _ -> putStrLn toPrint
--- a/md-to-html.cabal
+++ b/md-to-html.cabal
@@ -58,20 +58,19 @@ library
   exposed-modules: MdToHTML
   other-modules: MdToHtmlTest
   build-depends:    base ^>=4.19.1.0,
-                      HUnit,
+                      HUnit
                      megaparsec,
                      parser-combinators,
                      text,
                      MissingH,
                      word-wrap
-executable mdtoh
+executable md-to-html-runner
    -- Import common warning flags.
    import:           warnings
    -- .hs or .lhs file containing the Main module.
    main-is:          Main.hs
    -- Modules included in this executable, other than Main.
    other-modules:
        MdToHTML
        MdToHtmlTest
    -- LANGUAGE extensions used by modules in this package.
    -- other-extensions:
--- a/src/MdToHTML.hs
+++ b/src/MdToHTML.hs
@@ -4,31 +4,24 @@
 module MdToHTML where
-import Control.Applicative hiding (many, some)
+import Control.Applicative
 import Control.Monad
 import Control.Monad.Combinators (count)
 import Data.Char
 import Data.List
 import Data.Ord (comparing)
 import Data.String.Utils
 import qualified Data.Text as T
 import Data.Void
 import Debug.Trace
-import Text.Megaparsec
+import Text.ParserCombinators.ReadP
 import Text.Megaparsec.Char
 import Text.Printf
 import Text.Wrap
 type Parser = Parsec Void T.Text
 type HeaderLevel = Int
 type CssClass = String
 newtype URL = URL {getUrl :: String} deriving (Eq)
 newtype ImgPath = ImgPath {getPath :: String} deriving (Eq)
 parseMany :: ReadP a -> ReadP [a]
 parseMany = Text.ParserCombinators.ReadP.many
 data MdToken
  = Document [MdToken]
  | Header HeaderLevel MdToken
@@ -40,17 +33,12 @@ data MdToken
  | Blockquote [MdToken]
  | UnordList [MdToken]
  | OrdList [MdToken]
  | Checkbox Bool
  | Code MdToken
-  | Table [[MdToken]]
+  | Codeblock String
  | Codeblock MdToken
  | Link MdToken URL
-  | Image MdToken URL (Maybe [CssClass])
+  | Image MdToken ImgPath
  | Figure MdToken URL (Maybe [CssClass])
  | Bold MdToken
  | Italic MdToken
  | Superscript MdToken
  | Subscript MdToken
  | Strikethrough MdToken
  | Unit String
  deriving (Eq)
@@ -61,38 +49,21 @@ instance Show MdToken where
  show (Header level token) = "<h" ++ show level ++ ">" ++ show token ++ "</h" ++ show level ++ ">"
  show (Para token) = "<p>" ++ show token ++ "</p>"
  show (Line tokens) = concatMap show tokens
-  show Linebreak = "<br />"
+  show Linebreak = "<br>"
  show SingleNewline = " "
  show HorizontalRule = "<hr>"
  show (Blockquote tokens) = "<blockquote>" ++ concatMap show tokens ++ "</blockquote>"
  show (UnordList tokens) = "<ul>" ++ concatMap (prepend "<li>" . append "</li>" . show) tokens ++ "</ul>"
  show (OrdList tokens) = "<ol>" ++ concatMap (prepend "<li>" . append "</li>" . show) tokens ++ "</ol>"
-  show (Checkbox isChecked) = "<input type=\"checkbox\"" ++ (if isChecked then " checked=\"\"" else "") ++ " />"
+  show (Code code) = "<code>" ++ show code ++ "</code>"
-  show (Code code) = "<code>" ++ strip (show code) ++ "</code>"
+  show (Codeblock code) = show code
  show (Table (thead : tokenGrid)) = "<table><thead><tr>" ++ concatMap (\x -> "<th>" ++ rstrip (show x) ++ "</th>") thead ++ "</tr></thead>" ++ "<tbody>" ++ concatMap (\x -> "<tr>" ++ concatMap (\y -> "<td>" ++ rstrip (show y) ++ "</td>") x ++ "</tr>") tokenGrid ++ "</tbody></table>"
  show (Codeblock code) = "<pre><code>" ++ show code ++ "</code></pre>"
  show (Link txt url) = "<a href=\"" ++ getUrl url ++ "\">" ++ show txt ++ "</a>"
-  show (Image txt url cssClasses) = "<img src=\"" ++ getUrl url ++ "\"" ++ " alt=\"" ++ show txt ++ "\"" ++ maybe "" (\classes -> " class=\"" ++ unwords classes ++ "\"") cssClasses ++ "/>"
+  show (Image txt imgPath) = "<img src=" ++ getPath imgPath ++ ">" ++ show txt ++ "</img>"
  show (Figure txt url cssClasses) = "<figure><img src=\"" ++ getUrl url ++ "\" alt=\"" ++ show txt ++ "\"" ++ maybe "" (\classes -> " class=\"" ++ unwords classes ++ "\"") cssClasses ++ "/><figcaption aria-hidden=\"true\">" ++ show txt ++ "</figcaption></figure>"
  show (Bold token) = "<b>" ++ show token ++ "</b>"
  show (Italic token) = "<i>" ++ show token ++ "</i>"
  show (Superscript token) = "<sup>" ++ show token ++ "</sup>"
  show (Subscript token) = "<sub>" ++ show token ++ "</sub>"
  show (Strikethrough token) = "<s>" ++ show token ++ "</s>"
  show (Unit unit) = printf "%s" unit
 -- Pretty print the given token into a string.
 -- This is the same as calling 'show' for most tokens, but is different for paragraphs and tables,
 -- which have newlines inserted into them.
 prettyPrint :: MdToken -> String
 prettyPrint (Para token) = "<p>" ++ T.unpack (wrapText defaultWrapSettings 70 (T.pack $ prettyPrint token)) ++ "</p>\n"
 prettyPrint (Table (thead : tokenGrid)) = "<table>\n<thead>\n<tr>\n" ++ concatMap (\x -> "<th>" ++ rstrip (prettyPrint x) ++ "</th>\n") thead ++ "</tr>\n</thead>\n" ++ "<tbody>\n" ++ concatMap (\x -> "<tr>\n" ++ concatMap (\y -> "<td>" ++ rstrip (prettyPrint y) ++ "</td>\n") x ++ "</tr>\n") tokenGrid ++ "</tbody>\n</table>\n"
 prettyPrint Linebreak = "<br />\n"
 prettyPrint HorizontalRule = "<hr>\n"
 prettyPrint (Line tokens) = concatMap prettyPrint tokens
 prettyPrint (Document tokens) = concatMap prettyPrint tokens
 prettyPrint token = show token
 instance Semigroup MdToken where
  a <> b = Document [a, b]
@@ -110,26 +81,27 @@ leftmostLongest xs =
        (x : xs) -> Just x
 -- Get the first parse returned by readP_to_S that consumed the most input
-leftmostLongestParse :: (Monoid a) => Parser a -> String -> a
+leftmostLongestParse :: (Monoid a) => ReadP a -> String -> (a, String)
 leftmostLongestParse parser input =
-  case runParser parser "input" (T.pack input) of
+  let res = leftmostLongest $ readP_to_S parser input
-    (Left a) -> mempty
+   in case res of
-    (Right a) -> a
+        Nothing -> (mempty, mempty)
        Just x -> x
-specialChars = ">\n\\`*_{}[]#+|"
+specialChars = "\\#*_[\n`"
-escapableChars = "-~!.$()" ++ specialChars
+escapableChars = '~' : specialChars
 -- Makes a parser greedy. Instead of returning all possible parses, only the longest one is returned.
-greedyParse :: Parser a -> Parser [a]
+greedyParse :: ReadP a -> ReadP [a]
 greedyParse parser = do
-  greedyParse1 parser <|> return []
+  greedyParse1 parser <++ return []
 -- Like greedyParse, but the parser must succeed atleast once.
-greedyParse1 :: Parser a -> Parser [a]
+greedyParse1 :: ReadP a -> ReadP [a]
 greedyParse1 parser = do
  parsed1 <- parser
-  parsed2 <- greedyParse1 parser <|> return []
+  parsed2 <- greedyParse1 parser <++ return []
  return (parsed1 : parsed2)
 prepend :: [a] -> [a] -> [a]
@@ -139,328 +111,222 @@ append :: [a] -> [a] -> [a]
 append x1 x2 = x2 ++ x1
 -- Parse until EOL or EOF
-parseTillEol :: Parser String
+parseTillEol :: ReadP String
-parseTillEol = manyTill anySingle (void (char '\n') <|> eof)
+parseTillEol = manyTill get (void (char '\n') <++ eof)
 -- Takes a list of parsers. Returns a parser that will try them in
 -- order, moving to the next one only if the current one fails.
-fallthroughParser :: [Parser a] -> Parser a
+fallthroughParser :: [ReadP a] -> ReadP a
-fallthroughParser [x] = try x
+fallthroughParser [x] = x
-fallthroughParser (x : xs) = try x <|> fallthroughParser xs
+fallthroughParser (x : xs) = x <++ fallthroughParser xs
-escapeChar :: Char -> String
+myMany :: (Monoid a) => ReadP a -> ReadP [a]
-escapeChar '>' = "&gt;"
+myMany p = do
-escapeChar '<' = "&lt;"
+  remaining <- look
-escapeChar '&' = "&amp;"
+  case remaining of
-escapeChar x = [x]
+    [] -> return []
    _ -> return [] +++ myMany1 p
-htmlEscapeChars :: T.Text -> T.Text
+myMany1 :: (Monoid a) => ReadP a -> ReadP [a]
-htmlEscapeChars = T.concatMap (T.pack . escapeChar)
+myMany1 p = liftM2 (:) p (myMany p)
 -- -- Wraps a list of words after (at most) the given number of characters, trying to prevent word-breaks
 -- wordwrap :: Int -> String -> String
 -- wordwrap wraplength str = if (length str) < wraplength
 --     then str
 --     else
 --         let spaceIndex = lastgtSpaceIndex 0 (takeRev (length str) - wraplength str)
 --
 --     where
 --         takeRev n = (reverse . take n . reverse)
 --         lastSpaceIndex counter str = case str of
 --             [] -> counter
 --             x:xs -> if (isSpace x) counter else lastSpaceIndex counter+1 xs
 -- ---------------
 -- Parse a markdown header, denoted by 1-6 #'s followed by some text, followed by EOL.
-parseHeader :: Parser MdToken
+parseHeader :: ReadP MdToken
 parseHeader = do
-  space
+  skipSpaces
-  headers <- greedyParse1 (char '#')
+  headers <- munch1 (== '#')
  when
    (length headers > 6)
-    empty
+    pfail
-  space
+  skipSpaces
-  parsedText <- manyTill parseLineToken (void (char '\n') <|> eof)
+  text <- munch1 (/= '\n')
-  greedyParse (char '\n')
+  --  Text.ParserCombinators.ReadP.optional (char '\n')
-  return (Header (length headers) (Line parsedText))
+  skipSpaces
-
+  let parsedText = fst $ leftmostLongestParse parseLine text
-asteriskBold = T.pack "**"
+  return (Header (length headers) parsedText)
 underscoreBold = T.pack "__"
 -- Parse bold text
-parseBold :: Parser MdToken
+parseBold :: ReadP MdToken
-parseBold = parseBoldWith asteriskBold <|> parseBoldWith underscoreBold
+parseBold = parseBoldWith "**" <|> parseBoldWith "__"
  where
    parseBoldWith delim = do
      string delim
-      inside <- someTill parseLineToken $ string delim
+      inside <- myMany1 parseLineToken
      string delim
      return (Bold (Line inside))
 -- Parse italic text
-parseItalic :: Parser MdToken
+parseItalic :: ReadP MdToken
 parseItalic = parseItalicWith '*' <|> parseItalicWith '_'
  where
    parseItalicWith delim = do
-      char delim
+      exactlyOnce delim
-      inside <- someTill parseLineToken (char delim)
+      inside <- myMany1 parseLineToken
      exactlyOnce delim
      return (Italic (Line inside))
-
+    exactlyOnce ch = do
-- Parse subscript
+      char ch
-parseSubscript :: Parser MdToken
+      remaining <- look
-parseSubscript = do
+      case remaining of
-  char '~'
+        [] -> return ch
-  inside <- someTill parseLineToken (char '~')
+        x : xs -> if x == ch then pfail else return ch
  return (Subscript (Line inside))
 -- Parse superscript
 parseSuperscript :: Parser MdToken
 parseSuperscript = do
  char '^'
  inside <- someTill parseLineToken (char '^')
  return (Superscript (Line inside))
 -- Parse strikethrough text
-parseStrikethrough :: Parser MdToken
+parseStrikethrough :: ReadP MdToken
 parseStrikethrough = do
-  string (T.pack "~~")
+  string "~~"
-  inside <- someTill parseLineToken $ string (T.pack "~~")
+  inside <- many1 parseLineToken
  string "~~"
  return (Strikethrough (Line inside))
 -- Parse code
-parseCode :: Parser MdToken
+parseCode :: ReadP MdToken
 parseCode = do
-  opening <- some $ char '`'
+  string "`"
-  inside <- someTill (satisfy (/= '\n')) (char '`')
+  inside <- many1 get
-  closing <- count (length opening - 1) (char '`')
+  string "`"
-  return (Code (Unit (concatMap escapeChar inside)))
+  return (Code (Unit inside))
 -- Parse a link
-parseLink :: Parser MdToken
+parseLink :: ReadP MdToken
 parseLink = do
-  char '['
+  linkText <- between (string "[") (string "]") (many1 get)
-  linkText <- manyTill parseLineToken (char ']')
+  linkURL <- between (string "(") (string ")") (many1 get)
-  char '('
+  let parsedLinkText = fst $ leftmostLongestParse parseLine linkText
-  linkURL <- manyTill anySingle (char ')')
+  return $ Link parsedLinkText (URL linkURL)
  return $ Link (Line linkText) (URL linkURL)
 -- Parse a linebreak character
-parseLinebreak :: Parser MdToken
+parseLinebreak :: ReadP MdToken
-parseLinebreak = parseLinebreakSpace <|> parseLinebreakBackslash
+parseLinebreak = do
-  where
+  char ' '
-    parseLinebreakSpace = do
+  many1 (char ' ')
      char ' '
      some (char ' ')
      char '\n'
      return Linebreak
    parseLinebreakBackslash = try $ do
      char '\\'
      char '\n'
      return Linebreak
 parseTableRow :: Parser [MdToken]
 parseTableRow = do
  char '|'
  row <- some (many (satisfy (\x -> x == ' ' || x == '\t')) *> someTill parseListLineToken (char '|'))
  return (map Line row)
 parseTable :: Parser MdToken
 parseTable = do
  tableHead <- parseTableRow
  char '\n'
-  char '|'
+  return Linebreak
  sepEndBy1 (some (char '-')) (char '|') *> char '\n'
  tableBody <- sepEndBy parseTableRow (char '\n')
  many (char '\n') -- Parse trailing newlines, if any
  return $ Table (tableHead : tableBody)
-parseSingleNewline :: Parser MdToken
+parseSingleNewline :: ReadP MdToken
 parseSingleNewline = do
  char '\n'
-  remaining <- getInput
+  return SingleNewline
  case T.unpack remaining of
    [] -> return $ Unit ""
    _ -> return SingleNewline
 parseCssClasses :: Parser [CssClass]
 parseCssClasses = do
  char '{'
  classes <- some parseCssClass
  char '}'
  return classes
  where
    parseCssClass :: Parser CssClass
    parseCssClass = do
      char '.'
      let firstLetterParser = char '_' <|> char '-' <|> label "letter" (satisfy isAlpha)
      cssClassFirstLetter <- firstLetterParser
      cssClass <- many (firstLetterParser <|> label "digit" (satisfy isDigit))
      space
      return (cssClassFirstLetter : cssClass)
 parseImage :: Parser MdToken
 parseImage = do
  char '!'
  link <- parseLink
  cssClasses <- optional $ try parseCssClasses
  case link of
    Link text path -> return $ Image text path cssClasses
    _ -> empty -- This should never be reached
 parseFigure = do
  img <- parseImage
  void (string doubleNewlineText) <|> eof
  case img of
    Image text path cssClasses -> return $ Figure text path cssClasses
    _ -> return img
 -- Parse an escaped character
-parseEscapedChar :: Parser MdToken
+parseEscapedChar :: ReadP MdToken
 parseEscapedChar = do
  char '\\'
  escapedChar <- choice (map char escapableChars) -- Parse any of the special chars.
  return (Unit [escapedChar])
 -- Parse a character as a Unit.
-parseUnit :: Parser MdToken
+parseUnit :: ReadP MdToken
 parseUnit = do
  -- text <- satisfy (`notElem` specialChars)
-  text <- anySingle
+  text <- get
  return (Unit [text])
-- Parse any character except a newline
+lineParsers :: [ReadP MdToken]
 parseUnitExceptNewline :: Parser MdToken
 parseUnitExceptNewline = do
  -- text <- satisfy (`notElem` specialChars)
  text <- satisfy (/= '\n')
  return (Unit [text])
 lineParsers :: [Parser MdToken]
 lineParsers =
  [ parseLinebreak,
    parseSingleNewline,
    parseEscapedChar,
    parseCode,
    parseImage,
    parseBold,
    parseItalic,
    parseStrikethrough,
    parseSubscript,
    parseSuperscript,
    parseLink,
    parseUnit
  ] -- A 'line' doesn't include a 'header'
-lineParsersWithoutNewline :: [Parser MdToken]
+listLineParsers :: [ReadP MdToken]
-lineParsersWithoutNewline =
+listLineParsers =
-  [ parseEscapedChar,
+  [ parseLinebreak,
    parseEscapedChar,
    parseCode,
    parseImage,
    parseBold,
    parseItalic,
    parseStrikethrough,
    parseSubscript,
    parseSuperscript,
    parseLink,
-    parseUnitExceptNewline
+    parseUnit
  ] -- A list line cannot contain newlines.
 -- List of all parsers
-allParsers :: [Parser MdToken]
+allParsers :: [ReadP MdToken]
 allParsers = parseHeader : lineParsers
 -- Parse any of the line tokens.
-parseLineToken :: Parser MdToken
+parseLineToken :: ReadP MdToken
 parseLineToken = fallthroughParser lineParsers
 -- Parse any of the list line tokens.
-parseListLineToken :: Parser MdToken
+parseListLineToken :: ReadP MdToken
-parseListLineToken = fallthroughParser lineParsersWithoutNewline
+parseListLineToken = fallthroughParser listLineParsers
 -- Parse a line, consisting of one or more tokens.
-parseLine :: Parser MdToken
+parseLine :: ReadP MdToken
 parseLine = do
-  space
+  skipSpaces
  -- Fail if we have reached the end of the document.
-  parsed <- manyTill parseLineToken eof
+  parsed <- myMany1 parseLineToken
  return (Line parsed)
 -- Parse a paragraph, which is a 'Line' (can span multiple actual lines), separated by double-newlines.
-parsePara :: Parser MdToken
+parsePara :: ReadP MdToken
 parsePara = do
-  space
+  parseMany (char '\n')
  --  text <- many1 (lookaheadParse (\x -> ((length x) < 2) || (take 2 x) /= "\n\n")) -- Parse until a double-newline.
  --  string "\n\n" <|> (eof >> return "") -- Consume the next double-newline or EOF.
-  parsedText <- someTill parseLineToken (try paraEnding)
+  text <- manyTill get (string "\n\n" <|> (eof >> return ""))
-  many (char '\n')
+  when (null text) pfail
-  return (Para (Line parsedText))
+  let parsedText = fst $ leftmostLongestParse parseLine text -- Parse a line
-  where
+  return (Para parsedText)
    paraEnding = void (char '\n' *> (char '\n' <|> lookAhead (char '>'))) <|> eof
 -- Parse a line starting with '>', return the line except for the '>'.
-parseQuotedLine :: Parser String
+parseQuotedLine :: ReadP String
 parseQuotedLine = do
  char '>'
-  many (char ' ' <|> char '\t')
+  greedyParse (char ' ' +++ char '\t')
-  restOfLine <- many (satisfy (/= '\n'))
+  restOfLine <- munch (/= '\n')
-  void (char '\n') <|> eof
+  Text.ParserCombinators.ReadP.optional (char '\n') >> return ""
  return restOfLine
 -- Parse many 'quoted lines' until I see a non-quoted line.
-parseQuotedLines :: Parser [String]
+parseQuotedLines :: ReadP [String]
-parseQuotedLines = some parseQuotedLine
+parseQuotedLines =
-
+  greedyParse1 $ do
-- some $ do
+    look >>= \line ->
--   getInput >>= \line ->
+      case line of
--     case T.unpack line of
+        ('>' : _) -> parseQuotedLine
--       ('>' : _) -> parseQuotedLine
+        _ -> pfail
 --       _ -> empty
 -- Parse a blockquote, which is a greater-than sign followed by a paragraph.
-parseBlockquote :: Parser MdToken
+parseBlockquote :: ReadP MdToken
 parseBlockquote = do
  quotedLines <- parseQuotedLines
  -- remaining <- look
  -- let quotedLines = fst $ leftmostLongestParse parseQuotedLines remaining
  -- string (init $ unlines quotedLines)
-  let parsedQuotedLines = leftmostLongestParse (some (parseBlockquote <|> parsePara)) (init $ unlines quotedLines) -- unlines joins the lines together with a newline, and adds a trailing newline. init removes the trailing newline.
+  let parsedQuotedLines = fst $ leftmostLongestParse (many1 (parseBlockquote <++ parsePara)) (init $ unlines quotedLines) -- unlines joins the lines together with a newline, and adds a trailing newline. init removes the trailing newline.
  return (Blockquote parsedQuotedLines)
 -- Parse a checkbox
 parseCheckbox :: Parser MdToken
 parseCheckbox = do
  char '['
  inside <- char ' ' <|> char 'x'
  char ']'
  space
  return (if inside == 'x' then Checkbox True else Checkbox False)
 -- Parse a nested list item.
-parseListNested :: Parser MdToken
+parseListNested :: ReadP MdToken
 parseListNested = do
-  let firstCharParser = (<>) <$> (string (T.pack "  ") <|> string (T.pack "\t")) <*> (T.pack <$> many (char ' '))
+  let firstCharParser = string "    " <++ string "\t"
-  let restOfLineParser = manyTill anySingle (void (char '\n') <|> eof)
+  let restOfLineParser = manyTill get (void (char '\n') <++ eof)
-  -- For the first line, I manually run firstCharParser and restOfLineParser. The
+  lines <- greedyParse1 (firstCharParser *> restOfLineParser)
-  -- result of firstCharParser is saved. For every subsequent line, I parse exactly
+  let linesParsed = fst $ leftmostLongestParse (parseUnorderedList <++ parseOrderedList) (init $ unlines lines)
-  -- the same string as firstCharParser.
+  when (null (show linesParsed)) pfail
  firstLineSpaces <- firstCharParser
  firstLine <- restOfLineParser
  lines <- greedyParse (string firstLineSpaces *> restOfLineParser)
  let allLines = firstLine : lines
  let linesParsed = leftmostLongestParse (parseUnorderedList <|> parseOrderedList) (init $ unlines allLines)
  when (null (show linesParsed)) empty
  return linesParsed
 -- Parse an unordered list line item.
-parseUListLineItem :: Parser MdToken
+parseUListLineItem :: ReadP MdToken
 parseUListLineItem = do
  firstChar <- choice (map char ['*', '+', '-'])
  char ' ' -- At least one space between list indicator and list text.
  parseListLineItemCommon
 -- Parse an ordered list line item.
-parseOListLineItem :: Parser MdToken
+parseOListLineItem :: ReadP MdToken
 parseOListLineItem = do
  num <- greedyParse1 (satisfy isDigit)
  char '.'
@@ -468,25 +334,23 @@ parseOListLineItem = do
  parseListLineItemCommon
 -- Common code for parsing list line items
-parseListLineItemCommon :: Parser MdToken
+parseListLineItemCommon :: ReadP MdToken
 parseListLineItemCommon = do
-  space
+  skipSpaces
-  checkbox <- optional $ try parseCheckbox
+  restOfLine <- many1 parseListLineToken
-  restOfLine <- manyTill parseListLineToken (void (char '\n') <|> eof)
+  void (char '\n') <++ eof
-  nestedList <- try parseListNested <|> return (Unit "")
+  nestedList <- parseListNested <++ return (Unit "")
-  case checkbox of
+  return $ Line [Line restOfLine, nestedList]
    Just box -> return $ Line [box, Line restOfLine, nestedList]
    Nothing -> return $ Line [Line restOfLine, nestedList]
 -- Parse an unordered list paragraph item.
-parseUListParaItem :: Parser MdToken
+parseUListParaItem :: ReadP MdToken
 parseUListParaItem = do
  firstLine <- parseUListLineItem
  res <- parseListParaItemCommon
  return $ Document (Para firstLine : res) -- I only wrap this in a document because I want some way of converting [MdToken] to MdToken, without any overhead. There is no other reason to wrap it in a Document.
 -- Parse an unordered list paragraph item.
-parseOListParaItem :: Parser MdToken
+parseOListParaItem :: ReadP MdToken
 parseOListParaItem = do
  firstLine <- parseOListLineItem
  res <- parseListParaItemCommon
@@ -496,82 +360,55 @@ parseOListParaItem = do
 -- A list paragraph item is defined as a line item, followed by an empty line, followed by one or more
 -- lines indented by a space or tab.
 -- A list paragraph item can also be a blockquote.
-parseListParaItemCommon :: Parser [MdToken]
+parseListParaItemCommon :: ReadP [MdToken]
 parseListParaItemCommon = do
  char '\n'
-  lines <- greedyParse1 ((string (T.pack "    ") <|> string (T.pack "\t")) *> parseTillEol)
+  lines <- greedyParse1 ((string "    " <|> string "\t") *> parseTillEol)
-  let res = leftmostLongestParse (greedyParse1 parseBlockquote <|> greedyParse1 parsePara) (init $ unlines lines)
+  let res = fst $ leftmostLongestParse (greedyParse1 parseBlockquote <++ greedyParse1 parsePara) (init $ unlines lines)
  char '\n'
  return res -- I only wrap this in a document because I want some way of converting [MdToken] to MdToken, without any overhead. There is no other reason to wrap it in a Document.
 -- Parse an unordered list item, which can be a line item or another list.
-parseUListItem :: Parser MdToken
+parseUListItem :: ReadP MdToken
-parseUListItem = space *> (try parseUListParaItem <|> parseUListLineItem)
+parseUListItem = parseUListParaItem <++ parseUListLineItem
 -- Parse an unordered list.
-parseUnorderedList :: Parser MdToken
+parseUnorderedList :: ReadP MdToken
 parseUnorderedList = do
-  lineItems <- some $ try parseUListItem
+  lineItems <- greedyParse1 parseUListItem
-  void (char '\n') <|> eof -- A list must end in an extra newline or eof
+  void (char '\n') <++ eof -- A list must end in an extra newline or eof
  return $ UnordList lineItems
 -- --------
-parseOListItem :: Parser MdToken
+parseOListItem :: ReadP MdToken
-parseOListItem = space *> (try parseOListParaItem <|> parseOListLineItem)
+parseOListItem = parseOListParaItem <++ parseOListLineItem
 -- Parses the first element of an ordered list, which must start with '1.'
-parseFirstOListItem :: Parser MdToken
+parseFirstOListItem :: ReadP MdToken
 parseFirstOListItem = do
-  space
+  remaining <- look
-  remaining <- getInput
+  when (take 2 remaining /= "1.") pfail
  when (take 2 (T.unpack remaining) /= "1.") empty
  parseOListLineItem
-parseOrderedList :: Parser MdToken
+parseOrderedList :: ReadP MdToken
 parseOrderedList = do
-  firstLine <- try parseFirstOListItem
+  firstLine <- parseFirstOListItem
-  lineItems <- many $ try parseOListItem
+  lineItems <- greedyParse1 parseOListItem
-  void (char '\n') <|> eof
+  void (char '\n') <++ eof
  return $ OrdList (firstLine : lineItems)
-horizontalRuleText :: T.Text
+documentParsers :: [ReadP MdToken]
 horizontalRuleText = T.pack "---"
 doubleNewlineText :: T.Text
 doubleNewlineText = T.pack "\n\n"
 parseHorizontalRule :: Parser MdToken
 parseHorizontalRule = parseHorizontalRuleLine *> (void (string doubleNewlineText) <|> eof) *> return HorizontalRule
  where
    parseHorizontalRuleLine = fallthroughParser (map (string . T.pack) ["---", "***", "___", "- - -", "* * *", "_ _ _"])
 parseCodeblock :: Parser MdToken
 parseCodeblock = do
  string (T.pack "```")
  _ <- many $ satisfy (/= '\n') -- Language name
  char '\n'
  inside <- someTill anySingle (string (T.pack "\n```"))
  return $ Codeblock (Unit (concatMap escapeChar inside))
 documentParsers :: [Parser MdToken]
 documentParsers =
-  [ parseHorizontalRule,
+  [ parseHeader,
    parseCodeblock,
    parseTable,
    parseHeader,
    parseBlockquote,
    parseUnorderedList,
    parseOrderedList,
    parseFigure,
    parsePara
  ]
 -- Parse a document, which is multiple paragraphs.
-parseDocument :: Parser MdToken
+parseDocument :: ReadP MdToken
 parseDocument = do
-  --  res <- manyTill (fallthroughParser documentParsers <|> (char '\n' *> return $ Unit "")) eof
+  res <- manyTill (fallthroughParser documentParsers) eof
  res <- sepEndBy (fallthroughParser documentParsers) (many $ char '\n')
  -- many $ char '\n'
  eof
  return (Document res)
--- a/src/MdToHtmlTest.hs
+++ b/src/MdToHtmlTest.hs
@@ -7,7 +7,7 @@ check_equal :: String -> String -> String -> Test
 check_equal desc expected actual = TestCase (assertEqual desc expected actual)
 convert :: String -> String
-convert md = show $ leftmostLongestParse parseDocument md
+convert md = show . fst $ leftmostLongestParse parseDocument md
 headerTests =
  TestList
@@ -22,16 +22,17 @@ headerTests =
 boldTests =
  TestList
    [ check_equal "Should convert bold" "<p><b>Hello</b></p>" (convert "__Hello__"),
      check_equal " Should not convert incomplete bold" "<p>**Hello</p>" (convert "**Hello"),
      check_equal "Should convert italic" "<p><i>Hello</i></p>" (convert "_Hello_"),
      check_equal "Should convert bold and italic in a sentence" "<p>It <i>is</i> a <b>wonderful</b> day</p>" (convert "It _is_ a __wonderful__ day"),
-      check_equal "Should convert nested bold and italic" "<p><b>Bold then <i>Italic</i></b></p>" (convert "**Bold then _Italic_**"),
+      check_equal "Should convert nested bold and italic" "<p><b>Bold then <i>Italic</i></b></p>" (convert "**Bold then *Italic***"),
-      check_equal "Should convert nested bold and italic" "<p><i>Italic then <b>Bold</b></i></p>" (convert "*Italic then __Bold__*")
+      check_equal "Should convert nested bold and italic" "<p><i>Italic then <b>Bold</b></i></p>" (convert "*Italic then **Bold***")
    ]
 strikethroughTests =
  TestList
    [ check_equal "Should convert strikethrough" "<p><s>Hello</s></p>" (convert "~~Hello~~"),
-      check_equal "Should convert long sentence with tilde" "<p><s>The universe is ~7 days old</s>. The universe is 13 billion years old.</p>" (convert "~~The universe is \\~7 days old~~. The universe is 13 billion years old.")
+      check_equal "Should convert long sentence with tilde" "<p><s>The universe is ~7 days old</s>. The universe is 13 billion years old.</p>" (convert "~~The universe is ~7 days old~~. The universe is 13 billion years old.")
    ]
 linkTests =
@@ -74,17 +75,8 @@ unorderedListTests =
      check_equal "Paragraph before list" "<p>This is a list</p><ul><li>Item 1</li><li>Item 2</li></ul>" (convert "This is a list\n\n* Item 1\n* Item 2"),
      check_equal "Paragraph before list" "<h3>This is a list</h3><ul><li>Item 1</li><li>Item 2</li></ul>" (convert "### This is a list\n\n* Item 1\n* Item 2"),
      check_equal "Nested list then back" "<ul><li>Item 1</li><li>Item 2<ul><li>Item 3</li><li>Item 4</li></ul></li><li>Item 5</li></ul>" (convert "- Item 1\n- Item 2\n    - Item 3\n    - Item 4\n- Item 5"),
      check_equal "Triply nested list" "<ul><li>Item 1</li><li>Item 2<ul><li>Item 3<ul><li>Item 4</li></ul></li></ul></li><li>Item 5</li></ul>" (convert "- Item 1\n- Item 2\n    - Item 3\n      - Item 4\n- Item 5"),
      check_equal "Blockquote in list" "<ul><li>Item 1</li><li><p>Item 2</p><blockquote><p>Quote</p></blockquote></li><li>Item 3</li></ul>" (convert "- Item 1\n- Item 2\n\n    > Quote\n\n- Item 3"),
-      check_equal "Ordered list in unordered list" "<ul><li>Item 1</li><li>Item 2<ol><li>Item 1</li><li>Item 2</li></ol></li><li>Item 3</li></ul>" (convert "- Item 1\n- Item 2\n    1. Item 1\n    2. Item 2\n- Item 3"),
+      check_equal "Ordered list in unordered list" "<ul><li>Item 1</li><li>Item 2<ol><li>Item 1</li><li>Item 2</li></ol></li><li>Item 3</li></ul>" (convert "- Item 1\n- Item 2\n    1. Item 1\n    2. Item 2\n- Item 3")
      check_equal
        "Checkbox in unordered list"
        "<ul>\
        \<li><input type=\"checkbox\" />Not checked</li>\
        \<li><input type=\"checkbox\" checked=\"\" />Checked</li>\
        \<li>Normal list item</li></ul>"
        (convert "- [ ]     Not checked\n- [x]   Checked\n- Normal list item"),
      check_equal "List with link at the start" "<ul><li><a href=\"b\">a</a></li><li><a href=\"d\">c</a></li></ul>" (convert "- [a](b)\n- [c](d)")
    ]
 orderedListTests =
@@ -99,62 +91,14 @@ orderedListTests =
      check_equal "Paragraph before list" "<h3>This is a list</h3><ol><li>Item 1</li><li>Item 2</li></ol>" (convert "### This is a list\n\n1. Item 1\n200. Item 2"),
      check_equal "Nested list then back" "<ol><li>Item 1</li><li>Item 2<ol><li>Item 3</li><li>Item 4</li></ol></li><li>Item 5</li></ol>" (convert "1. Item 1\n2. Item 2\n    1. Item 3\n    3. Item 4\n5. Item 5"),
      check_equal "Blockquote in list" "<ol><li>Item 1</li><li><p>Item 2</p><blockquote><p>Quote</p></blockquote></li><li>Item 3</li></ol>" (convert "1. Item 1\n2. Item 2\n\n    > Quote\n\n3. Item 3"),
-      check_equal "Unordered list in ordered list" "<ol><li>Item 1</li><li>Item 2<ul><li>Item 1</li><li>Item 2</li></ul></li><li>Item 3</li></ol>" (convert "1. Item 1\n2. Item 2\n    - Item 1\n    * Item 2\n4. Item 3"),
+      check_equal "Unordered list in ordered list" "<ol><li>Item 1</li><li>Item 2<ul><li>Item 1</li><li>Item 2</li></ul></li><li>Item 3</li></ol>" (convert "1. Item 1\n2. Item 2\n    - Item 1\n    * Item 2\n4. Item 3")
      check_equal "List with just 1 item" "<ol><li>Item 1</li></ol>" (convert "1. Item 1"),
      check_equal
        "Checkbox in ordered list"
        "<ol>\
        \<li><input type=\"checkbox\" />Not checked</li>\
        \<li><input type=\"checkbox\" checked=\"\" />Checked</li>\
        \<li>Normal list item</li></ol>"
        (convert "1. [ ]     Not checked\n2. [x]   Checked\n3. Normal list item")
    ]
 htmlTests =
  TestList
    [check_equal "Convert HTML element" "<p><center>a</center></p>" (convert "<center>a</center>")]
 codeTests =
  TestList
    [ check_equal "Code by itself" "<p><code>Hello world!</code></p>" (convert "`Hello world!`"),
      check_equal "Code in a paragraph" "<p>The following <code>text</code> is code</p>" (convert "The following `text` is code"),
-      check_equal "Code across paragraphs (shouldn't work)" "<p>`Incomplete</p><p>Code`</p>" (convert "`Incomplete\n\nCode`"), -- At the moment, this is just treated as a syntax error, so nothing is rendered.
+      check_equal "Code across paragraphs (shouldn't work" "<p>`Incomplete</p><p>Code`</p>" (convert "`Incomplete\n\nCode`")
      check_equal "Code block" "<pre><code>Test code block</code></pre>" (convert "```\nTest code block\n```"),
      check_equal "Multiple code blocks" "<pre><code>Test code block</code></pre><pre><code>Next block</code></pre>" (convert "```\nTest code block\n```\n\n```\nNext block\n```")
    ]
 imageTests =
  TestList
    [ check_equal "Image with text" "<p>This is an image <img src=\"img.png\" alt=\"Image 1\"/></p>" (convert "This is an image ![Image 1](img.png)"),
      check_equal "Image with classes" "<p>This is an image <img src=\"img.png\" alt=\"Image 1\" class=\"new-img\"/></p>" (convert "This is an image ![Image 1](img.png){.new-img}")
    ]
 figureTests =
  TestList
    [ check_equal "Image by itself" "<figure><img src=\"img.png\" alt=\"Image 1\"/><figcaption aria-hidden=\"true\">Image 1</figcaption></figure>" (convert "![Image 1](img.png)")
    ]
 horizontalRuleTests =
  TestList
    [check_equal "Horizontal Rule" "<p>a</p><hr><p>b</p>" (convert "a\n\n---\n\nb")]
 subscriptTests =
  TestList
    [check_equal "Should convert subscript" "A<sub>b</sub>" (convert "A~b~")]
 superscriptTests =
  TestList
    [check_equal "Should convert superscript" "A<sup>b</sup>" (convert "A^b^")]
 tableTests =
  TestList
    [ check_equal
        "Basic table"
        "<table>\
        \<thead><tr><th>Col 1</th><th>Col 2</th><th>Col 3</th></tr></thead>\
        \<tbody><tr><td>Data 1</td><td>Data 2</td><td>Data 3</td></tr>\
        \<tr><td>More Data 1</td><td>More Data 2</td><td>More Data 3</td></tr></tbody></table>"
        (convert "| Col 1 | Col 2 | Col 3 |\n|---|---|---|\n| Data 1 | Data 2 | Data 3 |\n| More Data 1 | More Data 2 | More Data 3 |")
    ]
 integrationTests =
@@ -164,7 +108,7 @@ integrationTests =
      check_equal "Integration 3" "<h1>Hello</h1><p>World</p>" (convert "# Hello\nWorld"),
      check_equal "Integration 4" "<p>a b</p>" (convert "a\nb"),
      check_equal "Integration 5" "<h1>Hello</h1>" (convert "# Hello\n"),
-      check_equal "Integration 6" "<p>First line<br />Second line</p>" (convert "First line  \nSecond line"),
+      check_equal "Integration 6" "<p>First line<br>Second line</p>" (convert "First line  \nSecond line"),
      check_equal
        "Integration 7"
        "<h1>Sample Markdown</h1><p>This is some basic, sample markdown.</p><h2>Second \
@@ -177,7 +121,7 @@ integrationTests =
            "# Sample Markdown\n\nThis is some basic, sample markdown.\n\n## Second \
            \Heading\n\n- Unordered lists, and:\n    1. One\n    2. Two\n    3. Three\n\
            \- More\n\n> Blockquote\n\nAnd **bold**, *italics*, and even *italics and \
-            \later __bold__*. Even ~~strikethrough~~. [A link](https://markdowntohtml.com) to somewhere."
+            \later **bold***. Even ~~strikethrough~~. [A link](https://markdowntohtml.com) to somewhere."
        )
    ]
@@ -191,12 +135,7 @@ tests =
      blockquoteTests,
      unorderedListTests,
      orderedListTests,
      imageTests,
      htmlTests,
      figureTests,
      codeTests,
      horizontalRuleTests,
      tableTests,
      integrationTests
    ]
Author	SHA1	Message	Date
Aadhavan Srinivasan	09982f4ab1	Made some changes	2025-05-23 08:07:02 -04:00
Aadhavan Srinivasan	4101767aff	Merge remote-tracking branch 'origin' into fixingIncompleteElements	2025-05-20 22:24:24 -04:00
Aadhavan Srinivasan	234145bcb3	Started working on parsing changes to allow incomplete elements to be parsed (eg. opening bold, followed by text, without closing bold)	2025-05-20 22:21:59 -04:00