Trying to fix a bug where a header with a newline isn't recognized as a header

2025-05-04 01:34:24 -04:00
parent d771460bb1
commit 7fdc1bcbf1
3 changed files with 116 additions and 98 deletions
--- a/app/Main.hs
+++ b/app/Main.hs
@@ -4,5 +4,5 @@ import MdToHTML
 main :: IO ()
 main = do
-   let res = fst $ leftmostLongestParse parseDocument "# _Hello_\n\n # Hello"
+   let res = fst $ leftmostLongestParse parseDocument "# _Hello_\n"
   putStrLn (show res)
--- a/src/MdToHTML.hs
+++ b/src/MdToHTML.hs
@@ -1,21 +1,23 @@
 module MdToHTML where
 import Text.ParserCombinators.ReadP
 import Control.Monad
 import Control.Applicative
-import Text.Printf
+import Control.Monad
 import Debug.Trace
 import Data.List
 import Debug.Trace
 import Text.ParserCombinators.ReadP
 import Text.Printf
 type HeaderLevel = Int
 newtype URL = URL {getUrl :: String}
 newtype ImgPath = ImgPath {getPath :: String}
 parseMany :: ReadP a -> ReadP [a]
 parseMany = Text.ParserCombinators.ReadP.many
-data MdToken = Document [MdToken]
+data MdToken
  = Document [MdToken]
  | Header HeaderLevel MdToken
  | Para MdToken
  | Line [MdToken]
@@ -35,15 +37,15 @@ data MdToken = Document [MdToken]
 -- Deriving Show for MdToken
 instance Show MdToken where
-    show (Document tokens) = concat(map show tokens)
+  show (Document tokens) = concat (map show tokens)
  show (Header level token) = "<h" ++ show level ++ ">" ++ show token ++ "</h" ++ show level ++ ">"
  show (Para token) = "<p>" ++ show token ++ "</p>"
-    show (Line tokens) =  concat(map show tokens)
+  show (Line tokens) = concat (map show tokens)
  show Linebreak = "<br>"
  show HorizontalRule = "---------"
  show (Blockquote token) = "BLOCK" ++ show token
-    show (UnordList tokens) = "UNORD" ++ concat(map show tokens)
+  show (UnordList tokens) = "UNORD" ++ concat (map show tokens)
-    show (OrdList tokens) = "ORD" ++ concat(map show tokens)
+  show (OrdList tokens) = "ORD" ++ concat (map show tokens)
  show (Code code) = show code
  show (Codeblock code) = show code
  show (Link txt url) = "<a href=" ++ (getUrl url) ++ ">" ++ show txt ++ "</a>"
@@ -53,8 +55,6 @@ instance Show MdToken where
  show (Strikethrough token) = "<s>" ++ show token ++ "</s>"
  show (Unit unit) = printf "%s" unit
 -- ---------------
 -- Helpers
 mustBeHash :: ReadP Char
@@ -80,27 +80,33 @@ lookaheadParse stringCmp = do
 lineToList :: MdToken -> [MdToken]
 lineToList (Line tokens) = tokens
 -- ---------------
 -- Parse a markdown header, denoted by 1-6 #'s followed by some text, followed by EOL.
 parseHeader :: ReadP MdToken
 parseHeader = do
  traceM "Reached parseHeader"
  skipSpaces
  headers <- many1 mustBeHash
-        when ((length headers) > 6)
+  when
    ((length headers) > 6)
    pfail
  _ <- string " "
  text <- munch1 (\x -> x /= '\n') -- Parse until EOL
 --        traceM text
  let parsedText = fst $ leftmostLongestParse parseLine text
  traceM (show parsedText)
  traceM (show (length headers))
  return (Header (length headers) parsedText)
 -- Parse bold text
 parseBold :: ReadP MdToken
 parseBold = do
-        text <- choice[
+  traceM "Reached parseBold"
-                       (between (string "__") (string "__") (many1 (lookaheadParse (/= "__")))),
+  text <-
-                       (between (string "**") (string "**") (many1 (lookaheadParse (/= "**"))))
+    choice
      [ between (string "__") (string "__") (many1 (lookaheadParse (/= "__"))),
        between (string "**") (string "**") (many1 (lookaheadParse (/= "**")))
      ]
  let parsedText = fst $ leftmostLongestParse parseLine text
  return (Bold parsedText)
@@ -108,8 +114,10 @@ parseBold = do
 -- Parse italic text
 parseItalic :: ReadP MdToken
 parseItalic = do
-        text <- choice[
+  traceM "Reached parseItalic"
-                       (between (string "_") (string "_") (munch1 (/= '_'))), 
+  text <-
    choice
      [ (between (string "_") (string "_") (munch1 (/= '_'))),
        (between (string "*") (string "*") (munch1 (/= '*')))
      ]
  let parsedText = fst $ leftmostLongestParse parseLine text
@@ -118,22 +126,26 @@ parseItalic = do
 -- Parse a linebreak character
 parseLinebreak :: ReadP MdToken
 parseLinebreak = do
  traceM "Reached parseLinebreak"
  char ' '
  many1 (char ' ')
  char '\n'
  return Linebreak
 -- Parse a regular string as a Unit.
 parseString :: ReadP MdToken
 parseString = do
  traceM "Reached parseString"
  firstChar <- get -- Must parse at least one character here
-         text <- munch (\x -> not (elem x "#*_[\n"))
+  text <- munch (\x -> not (elem x "#*_[\n "))
-         return (Unit (firstChar:text))
+  return (Unit (firstChar : text))
 lineParsers :: [ReadP MdToken]
-lineParsers = [parseHeader, parseLinebreak, parseBold, parseItalic, parseString] -- A 'line' doesn't include a 'header'
+lineParsers = [parseLinebreak, parseBold, parseItalic, parseString] -- A 'line' doesn't include a 'header'
 -- List of all parsers
 allParsers :: [ReadP MdToken]
-allParsers = parseHeader:lineParsers
+allParsers = parseHeader : lineParsers
 -- Parse any of the above tokens.
 parseLineToken :: ReadP MdToken
@@ -142,22 +154,27 @@ parseLineToken = choice lineParsers
 -- Parse a line, consisting of one or more tokens.
 parseLine :: ReadP MdToken
 parseLine = do
  traceM "Reached parseLine"
  skipSpaces
  -- Fail if we have reached the end of the document.
  remaining <- look
  when (null remaining) pfail
  parsed <- parseMany parseLineToken
--         traceM $ show parsed
+  --         traceM $ show parsed
  return (Line parsed)
 -- Parse a paragraph, which is a 'Line' (can span multiple actual lines), separated by double-newlines.
 -- As a weird special case, a 'Paragraph' can also be a 'Header'.
 parsePara :: ReadP MdToken
 parsePara = do
  traceM "Reached parsePara"
  parseMany (char '\n')
-           text <- many1 (lookaheadParse (\x -> ((length x) < 2) || (take 2 x) /= "\n\n")) -- Parse until a double-newline.
+  --  text <- many1 (lookaheadParse (\x -> ((length x) < 2) || (take 2 x) /= "\n\n")) -- Parse until a double-newline.
-           string "\n\n" <|> (eof >> return "") -- Consume the next double-newline or EOF.
+  --  string "\n\n" <|> (eof >> return "") -- Consume the next double-newline or EOF.
  text <- (manyTill get ((string "\n\n") <|> (eof >> return "")))
  when (null text) pfail
  let parsedText = fst $ leftmostLongestParse (parseHeader <|> parseLine) text -- Parse either a line or a header.
  traceM (show parsedText)
  -- If the paragraph is a header, return a Header token. Otheriwse return a Para token.
  case parsedText of
    Header level token -> return (Header level token)
--- a/src/Test.hs
+++ b/src/Test.hs
@@ -28,8 +28,9 @@ boldTests = TestList
 integrationTests = TestList
    [
-        check_equal "Integration 1" "<h1>Sample Markdown</h1><p>This is some basic, sample markdown.</p><h2><b>Second</b> <i>Heading</i></h2>" (convert "# Sample Markdown\n\n This is some basic, sample markdown.\n\n  ## __Second__ _Heading_")
+        check_equal "Integration 1" "<h1>Sample Markdown</h1><p>This is some basic, sample markdown.</p><h2><b>Second</b> <i>Heading</i></h2>" (convert "# Sample Markdown\n\n This is some basic, sample markdown.\n\n  ## __Second__ _Heading_"),
-        -- Add a test for single-newlines.
+        check_equal "Integration 2" "<p><b>Hello</b> <i>World</i></p>" (convert "__Hello__\n_World_"),
        check_equal "Integration 3" "<h1>Hello</h1><p>WorldM/p>" (convert "# Hello\nWorld")
    ]