Trying to fix a bug where a header with a newline isn't recognized as a header

2 months ago · 7fdc1bcbf1
parent d771460bb1
commit 7fdc1bcbf1
3 changed files with 116 additions and 98 deletions
--- a/app/Main.hs
+++ b/app/Main.hs
@ -4,5 +4,5 @@ import MdToHTML
 main :: IO ()
 main = do
-   let res = fst $ leftmostLongestParse parseDocument "# _Hello_\n\n # Hello"
+   let res = fst $ leftmostLongestParse parseDocument "# _Hello_\n"
   putStrLn (show res)
--- a/src/MdToHTML.hs
+++ b/src/MdToHTML.hs
@ -1,59 +1,59 @@
 module MdToHTML where
 import Text.ParserCombinators.ReadP
 import Control.Monad
 import Control.Applicative
-import Text.Printf
+import Control.Monad
 import Debug.Trace
 import Data.List
 import Debug.Trace
 import Text.ParserCombinators.ReadP
 import Text.Printf
 type HeaderLevel = Int
 newtype URL = URL {getUrl :: String}
 newtype ImgPath = ImgPath {getPath :: String}
 parseMany :: ReadP a -> ReadP [a]
 parseMany = Text.ParserCombinators.ReadP.many
-data MdToken = Document [MdToken]
+data MdToken
-             | Header HeaderLevel MdToken
+  = Document [MdToken]
-             | Para MdToken
+  | Header HeaderLevel MdToken
-             | Line [MdToken]
+  | Para MdToken
-             | Linebreak
+  | Line [MdToken]
-             | HorizontalRule
+  | Linebreak
-             | Blockquote MdToken
+  | HorizontalRule
-             | UnordList [MdToken]
+  | Blockquote MdToken
-             | OrdList [MdToken]
+  | UnordList [MdToken]
-             | Code String
+  | OrdList [MdToken]
-             | Codeblock String
+  | Code String
-             | Link MdToken URL
+  | Codeblock String
-             | Image MdToken ImgPath
+  | Link MdToken URL
-             | Bold MdToken
+  | Image MdToken ImgPath
-             | Italic MdToken
+  | Bold MdToken
-             | Strikethrough MdToken
+  | Italic MdToken
-             | Unit String
+  | Strikethrough MdToken
  | Unit String
 -- Deriving Show for MdToken
 instance Show MdToken where
-    show (Document tokens) = concat(map show tokens)
+  show (Document tokens) = concat (map show tokens)
-    show (Header level token) = "<h" ++ show level ++ ">" ++ show token ++ "</h" ++ show level ++ ">"
+  show (Header level token) = "<h" ++ show level ++ ">" ++ show token ++ "</h" ++ show level ++ ">"
-    show (Para token) = "<p>" ++ show token ++ "</p>"
+  show (Para token) = "<p>" ++ show token ++ "</p>"
-    show (Line tokens) =  concat(map show tokens)
+  show (Line tokens) = concat (map show tokens)
-    show Linebreak = "<br>"
+  show Linebreak = "<br>"
-    show HorizontalRule = "---------"
+  show HorizontalRule = "---------"
-    show (Blockquote token) = "BLOCK" ++ show token
+  show (Blockquote token) = "BLOCK" ++ show token
-    show (UnordList tokens) = "UNORD" ++ concat(map show tokens)
+  show (UnordList tokens) = "UNORD" ++ concat (map show tokens)
-    show (OrdList tokens) = "ORD" ++ concat(map show tokens)
+  show (OrdList tokens) = "ORD" ++ concat (map show tokens)
-    show (Code code) = show code
+  show (Code code) = show code
-    show (Codeblock code) = show code
+  show (Codeblock code) = show code
-    show (Link txt url) = "<a href=" ++ (getUrl url) ++ ">" ++ show txt ++ "</a>"
+  show (Link txt url) = "<a href=" ++ (getUrl url) ++ ">" ++ show txt ++ "</a>"
-    show (Image txt imgPath) = "<img src=" ++ (getPath imgPath) ++ ">" ++ show txt ++ "</img>"
+  show (Image txt imgPath) = "<img src=" ++ (getPath imgPath) ++ ">" ++ show txt ++ "</img>"
-    show (Bold token) = "<b>" ++ show token ++ "</b>"
+  show (Bold token) = "<b>" ++ show token ++ "</b>"
-    show (Italic token) = "<i>" ++ show token ++ "</i>"
+  show (Italic token) = "<i>" ++ show token ++ "</i>"
-    show (Strikethrough token) = "<s>" ++ show token ++ "</s>"
+  show (Strikethrough token) = "<s>" ++ show token ++ "</s>"
-    show (Unit unit) = printf "%s" unit
+  show (Unit unit) = printf "%s" unit
 -- ---------------
 -- Helpers
@ -61,79 +61,91 @@ mustBeHash :: ReadP Char
 mustBeHash = satisfy (\x -> x == '#')
 leftmostLongest :: (Foldable t) => [(a, t b)] -> (a, t b)
-leftmostLongest xs = 
+leftmostLongest xs =
-        let lastElem = (last xs)
+  let lastElem = (last xs)
-            filteredLst = (filter (\val -> (length $ snd val) == (length $ snd lastElem)) xs)
+      filteredLst = (filter (\val -> (length $ snd val) == (length $ snd lastElem)) xs)
-        in head filteredLst
+   in head filteredLst
 -- Get the first parse returned by readP_to_S that consumed the most input
 leftmostLongestParse :: ReadP a -> String -> (a, String)
-leftmostLongestParse parser input = leftmostLongest $ readP_to_S parser input 
+leftmostLongestParse parser input = leftmostLongest $ readP_to_S parser input
 -- Parse if the string that's left matches the string comparator function
 lookaheadParse :: (String -> Bool) -> ReadP Char
 lookaheadParse stringCmp = do
-               lookahead <- look
+  lookahead <- look
-               case stringCmp lookahead of
+  case stringCmp lookahead of
-                  True -> get
+    True -> get
-                  False -> pfail
+    False -> pfail
 lineToList :: MdToken -> [MdToken]
 lineToList (Line tokens) = tokens
-- --------------- 
+
 -- ---------------
 -- Parse a markdown header, denoted by 1-6 #'s followed by some text, followed by EOL.
 parseHeader :: ReadP MdToken
 parseHeader = do
-        skipSpaces
+  traceM "Reached parseHeader"
-        headers <- many1 mustBeHash
+  skipSpaces
-        when ((length headers) > 6)
+  headers <- many1 mustBeHash
-            pfail
+  when
-        _ <- string " "
+    ((length headers) > 6)
-        text <- munch1 (\x -> x /= '\n') -- Parse until EOL
+    pfail
--        traceM text
+  _ <- string " "
-        let parsedText = fst  $ leftmostLongestParse parseLine text
+  text <- munch1 (\x -> x /= '\n') -- Parse until EOL
-        return (Header (length headers) parsedText)
+  let parsedText = fst $ leftmostLongestParse parseLine text
-                
+  traceM (show parsedText)
  traceM (show (length headers))
  return (Header (length headers) parsedText)
 -- Parse bold text
 parseBold :: ReadP MdToken
 parseBold = do
-        text <- choice[
+  traceM "Reached parseBold"
-                       (between (string "__") (string "__") (many1 (lookaheadParse (/= "__")))),
+  text <-
-                       (between (string "**") (string "**") (many1 (lookaheadParse (/= "**"))))
+    choice
-                       ]
+      [ between (string "__") (string "__") (many1 (lookaheadParse (/= "__"))),
-        let parsedText = fst  $ leftmostLongestParse parseLine text
+        between (string "**") (string "**") (many1 (lookaheadParse (/= "**")))
-        return (Bold parsedText)
+      ]
  let parsedText = fst $ leftmostLongestParse parseLine text
  return (Bold parsedText)
 -- Parse italic text
 parseItalic :: ReadP MdToken
 parseItalic = do
-        text <- choice[
+  traceM "Reached parseItalic"
-                       (between (string "_") (string "_") (munch1 (/= '_'))), 
+  text <-
-                       (between (string "*") (string "*") (munch1 (/= '*')))
+    choice
-                       ]
+      [ (between (string "_") (string "_") (munch1 (/= '_'))),
-        let parsedText = fst $ leftmostLongestParse parseLine text
+        (between (string "*") (string "*") (munch1 (/= '*')))
-        return (Italic parsedText)
+      ]
  let parsedText = fst $ leftmostLongestParse parseLine text
  return (Italic parsedText)
 -- Parse a linebreak character
 parseLinebreak :: ReadP MdToken
 parseLinebreak = do
-           char '\n'
+  traceM "Reached parseLinebreak"
-           return Linebreak
+  char ' '
  many1 (char ' ')
  char '\n'
  return Linebreak
 -- Parse a regular string as a Unit.
 parseString :: ReadP MdToken
 parseString = do
-         firstChar <- get -- Must parse at least one character here
+  traceM "Reached parseString"
-         text <- munch (\x -> not (elem x "#*_[\n"))
+  firstChar <- get -- Must parse at least one character here
-         return (Unit (firstChar:text))
+  text <- munch (\x -> not (elem x "#*_[\n "))
  return (Unit (firstChar : text))
 lineParsers :: [ReadP MdToken]
-lineParsers = [parseHeader, parseLinebreak, parseBold, parseItalic, parseString] -- A 'line' doesn't include a 'header'
+lineParsers = [parseLinebreak, parseBold, parseItalic, parseString] -- A 'line' doesn't include a 'header'
 -- List of all parsers
 allParsers :: [ReadP MdToken]
-allParsers = parseHeader:lineParsers
+allParsers = parseHeader : lineParsers
 -- Parse any of the above tokens.
 parseLineToken :: ReadP MdToken
@ -141,27 +153,32 @@ parseLineToken = choice lineParsers
 -- Parse a line, consisting of one or more tokens.
 parseLine :: ReadP MdToken
-parseLine =  do
+parseLine = do
-         skipSpaces
+  traceM "Reached parseLine"
-         -- Fail if we have reached the end of the document.
+  skipSpaces
-         remaining <- look
+  -- Fail if we have reached the end of the document.
-         when (null remaining) pfail
+  remaining <- look
-         parsed <- parseMany parseLineToken
+  when (null remaining) pfail
--         traceM $ show parsed
+  parsed <- parseMany parseLineToken
-         return (Line parsed)
+  --         traceM $ show parsed
  return (Line parsed)
 -- Parse a paragraph, which is a 'Line' (can span multiple actual lines), separated by double-newlines.
 -- As a weird special case, a 'Paragraph' can also be a 'Header'.
 parsePara :: ReadP MdToken
 parsePara = do
-           parseMany (char '\n')
+  traceM "Reached parsePara"
-           text <- many1 (lookaheadParse (\x -> ((length x) < 2) || (take 2 x) /= "\n\n")) -- Parse until a double-newline.
+  parseMany (char '\n')
-           string "\n\n" <|> (eof >> return "") -- Consume the next double-newline or EOF.
+  --  text <- many1 (lookaheadParse (\x -> ((length x) < 2) || (take 2 x) /= "\n\n")) -- Parse until a double-newline.
-           let parsedText = fst $ leftmostLongestParse (parseHeader <|> parseLine) text -- Parse either a line or a header.
+  --  string "\n\n" <|> (eof >> return "") -- Consume the next double-newline or EOF.
-           -- If the paragraph is a header, return a Header token. Otheriwse return a Para token.
+  text <- (manyTill get ((string "\n\n") <|> (eof >> return "")))
-           case parsedText of
+  when (null text) pfail
-                Header level token -> return (Header level token)
+  let parsedText = fst $ leftmostLongestParse (parseHeader <|> parseLine) text -- Parse either a line or a header.
-                _ -> return (Para parsedText)
+  traceM (show parsedText)
  -- If the paragraph is a header, return a Header token. Otheriwse return a Para token.
  case parsedText of
    Header level token -> return (Header level token)
    _ -> return (Para parsedText)
 -- Parse a document, which is multiple paragraphs.
 parseDocument :: ReadP MdToken
--- a/src/Test.hs
+++ b/src/Test.hs
@ -28,8 +28,9 @@ boldTests = TestList
 integrationTests = TestList
    [
-        check_equal "Integration 1" "<h1>Sample Markdown</h1><p>This is some basic, sample markdown.</p><h2><b>Second</b> <i>Heading</i></h2>" (convert "# Sample Markdown\n\n This is some basic, sample markdown.\n\n  ## __Second__ _Heading_")
+        check_equal "Integration 1" "<h1>Sample Markdown</h1><p>This is some basic, sample markdown.</p><h2><b>Second</b> <i>Heading</i></h2>" (convert "# Sample Markdown\n\n This is some basic, sample markdown.\n\n  ## __Second__ _Heading_"),
-        -- Add a test for single-newlines.
+        check_equal "Integration 2" "<p><b>Hello</b> <i>World</i></p>" (convert "__Hello__\n_World_"),
        check_equal "Integration 3" "<h1>Hello</h1><p>WorldM/p>" (convert "# Hello\nWorld")
    ]