Trying to fix a bug where a header with a newline isn't recognized as a header

2025-05-04 01:34:24 -04:00
parent d771460bb1
commit 7fdc1bcbf1
3 changed files with 116 additions and 98 deletions
--- a/app/Main.hs
+++ b/app/Main.hs
@@ -4,5 +4,5 @@ import MdToHTML

 main :: IO ()
 main = do
-   let res = fst $ leftmostLongestParse parseDocument "# _Hello_\n\n # Hello"
+   let res = fst $ leftmostLongestParse parseDocument "# _Hello_\n"
   putStrLn (show res)
--- a/src/MdToHTML.hs
+++ b/src/MdToHTML.hs
@@ -1,59 +1,59 @@
 module MdToHTML where

-import Text.ParserCombinators.ReadP
-import Control.Monad
 import Control.Applicative
-import Text.Printf
-import Debug.Trace
+import Control.Monad
 import Data.List
+import Debug.Trace
+import Text.ParserCombinators.ReadP
+import Text.Printf

 type HeaderLevel = Int

 newtype URL = URL {getUrl :: String}
+
 newtype ImgPath = ImgPath {getPath :: String}

 parseMany :: ReadP a -> ReadP [a]
 parseMany = Text.ParserCombinators.ReadP.many

-data MdToken = Document [MdToken]
-             | Header HeaderLevel MdToken
-             | Para MdToken
-             | Line [MdToken]
-             | Linebreak
-             | HorizontalRule
-             | Blockquote MdToken
-             | UnordList [MdToken]
-             | OrdList [MdToken]
-             | Code String
-             | Codeblock String
-             | Link MdToken URL
-             | Image MdToken ImgPath
-             | Bold MdToken
-             | Italic MdToken
-             | Strikethrough MdToken
-             | Unit String
+data MdToken
+  = Document [MdToken]
+  | Header HeaderLevel MdToken
+  | Para MdToken
+  | Line [MdToken]
+  | Linebreak
+  | HorizontalRule
+  | Blockquote MdToken
+  | UnordList [MdToken]
+  | OrdList [MdToken]
+  | Code String
+  | Codeblock String
+  | Link MdToken URL
+  | Image MdToken ImgPath
+  | Bold MdToken
+  | Italic MdToken
+  | Strikethrough MdToken
+  | Unit String

 -- Deriving Show for MdToken
 instance Show MdToken where
-    show (Document tokens) = concat(map show tokens)
-    show (Header level token) = "<h" ++ show level ++ ">" ++ show token ++ "</h" ++ show level ++ ">"
-    show (Para token) = "<p>" ++ show token ++ "</p>"
-    show (Line tokens) =  concat(map show tokens)
-    show Linebreak = "<br>"
-    show HorizontalRule = "---------"
-    show (Blockquote token) = "BLOCK" ++ show token
-    show (UnordList tokens) = "UNORD" ++ concat(map show tokens)
-    show (OrdList tokens) = "ORD" ++ concat(map show tokens)
-    show (Code code) = show code
-    show (Codeblock code) = show code
-    show (Link txt url) = "<a href=" ++ (getUrl url) ++ ">" ++ show txt ++ "</a>"
-    show (Image txt imgPath) = "<img src=" ++ (getPath imgPath) ++ ">" ++ show txt ++ "</img>"
-    show (Bold token) = "<b>" ++ show token ++ "</b>"
-    show (Italic token) = "<i>" ++ show token ++ "</i>"
-    show (Strikethrough token) = "<s>" ++ show token ++ "</s>"
-    show (Unit unit) = printf "%s" unit
-
-
+  show (Document tokens) = concat (map show tokens)
+  show (Header level token) = "<h" ++ show level ++ ">" ++ show token ++ "</h" ++ show level ++ ">"
+  show (Para token) = "<p>" ++ show token ++ "</p>"
+  show (Line tokens) = concat (map show tokens)
+  show Linebreak = "<br>"
+  show HorizontalRule = "---------"
+  show (Blockquote token) = "BLOCK" ++ show token
+  show (UnordList tokens) = "UNORD" ++ concat (map show tokens)
+  show (OrdList tokens) = "ORD" ++ concat (map show tokens)
+  show (Code code) = show code
+  show (Codeblock code) = show code
+  show (Link txt url) = "<a href=" ++ (getUrl url) ++ ">" ++ show txt ++ "</a>"
+  show (Image txt imgPath) = "<img src=" ++ (getPath imgPath) ++ ">" ++ show txt ++ "</img>"
+  show (Bold token) = "<b>" ++ show token ++ "</b>"
+  show (Italic token) = "<i>" ++ show token ++ "</i>"
+  show (Strikethrough token) = "<s>" ++ show token ++ "</s>"
+  show (Unit unit) = printf "%s" unit

 -- ---------------
 -- Helpers
@@ -61,79 +61,91 @@ mustBeHash :: ReadP Char
 mustBeHash = satisfy (\x -> x == '#')

 leftmostLongest :: (Foldable t) => [(a, t b)] -> (a, t b)
-leftmostLongest xs = 
-        let lastElem = (last xs)
-            filteredLst = (filter (\val -> (length $ snd val) == (length $ snd lastElem)) xs)
-        in head filteredLst
+leftmostLongest xs =
+  let lastElem = (last xs)
+      filteredLst = (filter (\val -> (length $ snd val) == (length $ snd lastElem)) xs)
+   in head filteredLst

 -- Get the first parse returned by readP_to_S that consumed the most input
 leftmostLongestParse :: ReadP a -> String -> (a, String)
-leftmostLongestParse parser input = leftmostLongest $ readP_to_S parser input 
+leftmostLongestParse parser input = leftmostLongest $ readP_to_S parser input

 -- Parse if the string that's left matches the string comparator function
 lookaheadParse :: (String -> Bool) -> ReadP Char
 lookaheadParse stringCmp = do
-               lookahead <- look
-               case stringCmp lookahead of
-                  True -> get
-                  False -> pfail
+  lookahead <- look
+  case stringCmp lookahead of
+    True -> get
+    False -> pfail

 lineToList :: MdToken -> [MdToken]
 lineToList (Line tokens) = tokens
-- --------------- 
+
+-- ---------------

 -- Parse a markdown header, denoted by 1-6 #'s followed by some text, followed by EOL.
 parseHeader :: ReadP MdToken
 parseHeader = do
-        skipSpaces
-        headers <- many1 mustBeHash
-        when ((length headers) > 6)
-            pfail
-        _ <- string " "
-        text <- munch1 (\x -> x /= '\n') -- Parse until EOL
--        traceM text
-        let parsedText = fst  $ leftmostLongestParse parseLine text
-        return (Header (length headers) parsedText)
-                
+  traceM "Reached parseHeader"
+  skipSpaces
+  headers <- many1 mustBeHash
+  when
+    ((length headers) > 6)
+    pfail
+  _ <- string " "
+  text <- munch1 (\x -> x /= '\n') -- Parse until EOL
+  let parsedText = fst $ leftmostLongestParse parseLine text
+  traceM (show parsedText)
+  traceM (show (length headers))
+  return (Header (length headers) parsedText)
+
 -- Parse bold text
 parseBold :: ReadP MdToken
 parseBold = do
-        text <- choice[
-                       (between (string "__") (string "__") (many1 (lookaheadParse (/= "__")))),
-                       (between (string "**") (string "**") (many1 (lookaheadParse (/= "**"))))
-                       ]
-        let parsedText = fst  $ leftmostLongestParse parseLine text
-        return (Bold parsedText)
+  traceM "Reached parseBold"
+  text <-
+    choice
+      [ between (string "__") (string "__") (many1 (lookaheadParse (/= "__"))),
+        between (string "**") (string "**") (many1 (lookaheadParse (/= "**")))
+      ]
+  let parsedText = fst $ leftmostLongestParse parseLine text
+  return (Bold parsedText)

 -- Parse italic text
 parseItalic :: ReadP MdToken
 parseItalic = do
-        text <- choice[
-                       (between (string "_") (string "_") (munch1 (/= '_'))), 
-                       (between (string "*") (string "*") (munch1 (/= '*')))
-                       ]
-        let parsedText = fst $ leftmostLongestParse parseLine text
-        return (Italic parsedText)
+  traceM "Reached parseItalic"
+  text <-
+    choice
+      [ (between (string "_") (string "_") (munch1 (/= '_'))),
+        (between (string "*") (string "*") (munch1 (/= '*')))
+      ]
+  let parsedText = fst $ leftmostLongestParse parseLine text
+  return (Italic parsedText)

 -- Parse a linebreak character
 parseLinebreak :: ReadP MdToken
 parseLinebreak = do
-           char '\n'
-           return Linebreak
+  traceM "Reached parseLinebreak"
+  char ' '
+  many1 (char ' ')
+  char '\n'
+  return Linebreak

 -- Parse a regular string as a Unit.
 parseString :: ReadP MdToken
 parseString = do
-         firstChar <- get -- Must parse at least one character here
-         text <- munch (\x -> not (elem x "#*_[\n"))
-         return (Unit (firstChar:text))
+  traceM "Reached parseString"
+  firstChar <- get -- Must parse at least one character here
+  text <- munch (\x -> not (elem x "#*_[\n "))
+  return (Unit (firstChar : text))

 lineParsers :: [ReadP MdToken]
-lineParsers = [parseHeader, parseLinebreak, parseBold, parseItalic, parseString] -- A 'line' doesn't include a 'header'
+lineParsers = [parseLinebreak, parseBold, parseItalic, parseString] -- A 'line' doesn't include a 'header'

 -- List of all parsers
 allParsers :: [ReadP MdToken]
-allParsers = parseHeader:lineParsers
+allParsers = parseHeader : lineParsers

 -- Parse any of the above tokens.
 parseLineToken :: ReadP MdToken
@@ -141,27 +153,32 @@ parseLineToken = choice lineParsers

 -- Parse a line, consisting of one or more tokens.
 parseLine :: ReadP MdToken
-parseLine =  do
-         skipSpaces
-         -- Fail if we have reached the end of the document.
-         remaining <- look
-         when (null remaining) pfail
-         parsed <- parseMany parseLineToken
--         traceM $ show parsed
-         return (Line parsed)
+parseLine = do
+  traceM "Reached parseLine"
+  skipSpaces
+  -- Fail if we have reached the end of the document.
+  remaining <- look
+  when (null remaining) pfail
+  parsed <- parseMany parseLineToken
+  --         traceM $ show parsed
+  return (Line parsed)

 -- Parse a paragraph, which is a 'Line' (can span multiple actual lines), separated by double-newlines.
 -- As a weird special case, a 'Paragraph' can also be a 'Header'.
 parsePara :: ReadP MdToken
 parsePara = do
-           parseMany (char '\n')
-           text <- many1 (lookaheadParse (\x -> ((length x) < 2) || (take 2 x) /= "\n\n")) -- Parse until a double-newline.
-           string "\n\n" <|> (eof >> return "") -- Consume the next double-newline or EOF.
-           let parsedText = fst $ leftmostLongestParse (parseHeader <|> parseLine) text -- Parse either a line or a header.
-           -- If the paragraph is a header, return a Header token. Otheriwse return a Para token.
-           case parsedText of
-                Header level token -> return (Header level token)
-                _ -> return (Para parsedText)
+  traceM "Reached parsePara"
+  parseMany (char '\n')
+  --  text <- many1 (lookaheadParse (\x -> ((length x) < 2) || (take 2 x) /= "\n\n")) -- Parse until a double-newline.
+  --  string "\n\n" <|> (eof >> return "") -- Consume the next double-newline or EOF.
+  text <- (manyTill get ((string "\n\n") <|> (eof >> return "")))
+  when (null text) pfail
+  let parsedText = fst $ leftmostLongestParse (parseHeader <|> parseLine) text -- Parse either a line or a header.
+  traceM (show parsedText)
+  -- If the paragraph is a header, return a Header token. Otheriwse return a Para token.
+  case parsedText of
+    Header level token -> return (Header level token)
+    _ -> return (Para parsedText)

 -- Parse a document, which is multiple paragraphs.
 parseDocument :: ReadP MdToken
--- a/src/Test.hs
+++ b/src/Test.hs
@@ -28,8 +28,9 @@ boldTests = TestList

 integrationTests = TestList
    [
-        check_equal "Integration 1" "<h1>Sample Markdown</h1><p>This is some basic, sample markdown.</p><h2><b>Second</b> <i>Heading</i></h2>" (convert "# Sample Markdown\n\n This is some basic, sample markdown.\n\n  ## __Second__ _Heading_")
-        -- Add a test for single-newlines.
+        check_equal "Integration 1" "<h1>Sample Markdown</h1><p>This is some basic, sample markdown.</p><h2><b>Second</b> <i>Heading</i></h2>" (convert "# Sample Markdown\n\n This is some basic, sample markdown.\n\n  ## __Second__ _Heading_"),
+        check_equal "Integration 2" "<p><b>Hello</b> <i>World</i></p>" (convert "__Hello__\n_World_"),
+        check_equal "Integration 3" "<h1>Hello</h1><p>WorldM/p>" (convert "# Hello\nWorld")
    ]