From 7fdc1bcbf13a1ed51fe15b3fd0082b1f488ade4c Mon Sep 17 00:00:00 2001 From: Aadhavan Srinivasan Date: Sun, 4 May 2025 01:34:24 -0400 Subject: [PATCH] Trying to fix a bug where a header with a newline isn't recognized as a header --- app/Main.hs | 2 +- src/MdToHTML.hs | 207 ++++++++++++++++++++++++++---------------------- src/Test.hs | 5 +- 3 files changed, 116 insertions(+), 98 deletions(-) diff --git a/app/Main.hs b/app/Main.hs index 64cbe5f..1dddb85 100644 --- a/app/Main.hs +++ b/app/Main.hs @@ -4,5 +4,5 @@ import MdToHTML main :: IO () main = do - let res = fst $ leftmostLongestParse parseDocument "# _Hello_\n\n # Hello" + let res = fst $ leftmostLongestParse parseDocument "# _Hello_\n" putStrLn (show res) diff --git a/src/MdToHTML.hs b/src/MdToHTML.hs index 6eff55e..82b6b02 100644 --- a/src/MdToHTML.hs +++ b/src/MdToHTML.hs @@ -1,59 +1,59 @@ module MdToHTML where -import Text.ParserCombinators.ReadP -import Control.Monad import Control.Applicative -import Text.Printf -import Debug.Trace +import Control.Monad import Data.List +import Debug.Trace +import Text.ParserCombinators.ReadP +import Text.Printf type HeaderLevel = Int newtype URL = URL {getUrl :: String} + newtype ImgPath = ImgPath {getPath :: String} parseMany :: ReadP a -> ReadP [a] parseMany = Text.ParserCombinators.ReadP.many -data MdToken = Document [MdToken] - | Header HeaderLevel MdToken - | Para MdToken - | Line [MdToken] - | Linebreak - | HorizontalRule - | Blockquote MdToken - | UnordList [MdToken] - | OrdList [MdToken] - | Code String - | Codeblock String - | Link MdToken URL - | Image MdToken ImgPath - | Bold MdToken - | Italic MdToken - | Strikethrough MdToken - | Unit String +data MdToken + = Document [MdToken] + | Header HeaderLevel MdToken + | Para MdToken + | Line [MdToken] + | Linebreak + | HorizontalRule + | Blockquote MdToken + | UnordList [MdToken] + | OrdList [MdToken] + | Code String + | Codeblock String + | Link MdToken URL + | Image MdToken ImgPath + | Bold MdToken + | Italic MdToken + | Strikethrough MdToken + | Unit String -- Deriving Show for MdToken instance Show MdToken where - show (Document tokens) = concat(map show tokens) - show (Header level token) = "" ++ show token ++ "" - show (Para token) = "

" ++ show token ++ "

" - show (Line tokens) = concat(map show tokens) - show Linebreak = "
" - show HorizontalRule = "---------" - show (Blockquote token) = "BLOCK" ++ show token - show (UnordList tokens) = "UNORD" ++ concat(map show tokens) - show (OrdList tokens) = "ORD" ++ concat(map show tokens) - show (Code code) = show code - show (Codeblock code) = show code - show (Link txt url) = "" ++ show txt ++ "" - show (Image txt imgPath) = "" ++ show txt ++ "" - show (Bold token) = "" ++ show token ++ "" - show (Italic token) = "" ++ show token ++ "" - show (Strikethrough token) = "" ++ show token ++ "" - show (Unit unit) = printf "%s" unit - - + show (Document tokens) = concat (map show tokens) + show (Header level token) = "" ++ show token ++ "" + show (Para token) = "

" ++ show token ++ "

" + show (Line tokens) = concat (map show tokens) + show Linebreak = "
" + show HorizontalRule = "---------" + show (Blockquote token) = "BLOCK" ++ show token + show (UnordList tokens) = "UNORD" ++ concat (map show tokens) + show (OrdList tokens) = "ORD" ++ concat (map show tokens) + show (Code code) = show code + show (Codeblock code) = show code + show (Link txt url) = "" ++ show txt ++ "" + show (Image txt imgPath) = "" ++ show txt ++ "" + show (Bold token) = "" ++ show token ++ "" + show (Italic token) = "" ++ show token ++ "" + show (Strikethrough token) = "" ++ show token ++ "" + show (Unit unit) = printf "%s" unit -- --------------- -- Helpers @@ -61,79 +61,91 @@ mustBeHash :: ReadP Char mustBeHash = satisfy (\x -> x == '#') leftmostLongest :: (Foldable t) => [(a, t b)] -> (a, t b) -leftmostLongest xs = - let lastElem = (last xs) - filteredLst = (filter (\val -> (length $ snd val) == (length $ snd lastElem)) xs) - in head filteredLst +leftmostLongest xs = + let lastElem = (last xs) + filteredLst = (filter (\val -> (length $ snd val) == (length $ snd lastElem)) xs) + in head filteredLst -- Get the first parse returned by readP_to_S that consumed the most input leftmostLongestParse :: ReadP a -> String -> (a, String) -leftmostLongestParse parser input = leftmostLongest $ readP_to_S parser input +leftmostLongestParse parser input = leftmostLongest $ readP_to_S parser input -- Parse if the string that's left matches the string comparator function lookaheadParse :: (String -> Bool) -> ReadP Char lookaheadParse stringCmp = do - lookahead <- look - case stringCmp lookahead of - True -> get - False -> pfail + lookahead <- look + case stringCmp lookahead of + True -> get + False -> pfail lineToList :: MdToken -> [MdToken] lineToList (Line tokens) = tokens --- --------------- + +-- --------------- -- Parse a markdown header, denoted by 1-6 #'s followed by some text, followed by EOL. parseHeader :: ReadP MdToken parseHeader = do - skipSpaces - headers <- many1 mustBeHash - when ((length headers) > 6) - pfail - _ <- string " " - text <- munch1 (\x -> x /= '\n') -- Parse until EOL --- traceM text - let parsedText = fst $ leftmostLongestParse parseLine text - return (Header (length headers) parsedText) - + traceM "Reached parseHeader" + skipSpaces + headers <- many1 mustBeHash + when + ((length headers) > 6) + pfail + _ <- string " " + text <- munch1 (\x -> x /= '\n') -- Parse until EOL + let parsedText = fst $ leftmostLongestParse parseLine text + traceM (show parsedText) + traceM (show (length headers)) + return (Header (length headers) parsedText) + -- Parse bold text parseBold :: ReadP MdToken parseBold = do - text <- choice[ - (between (string "__") (string "__") (many1 (lookaheadParse (/= "__")))), - (between (string "**") (string "**") (many1 (lookaheadParse (/= "**")))) - ] - let parsedText = fst $ leftmostLongestParse parseLine text - return (Bold parsedText) + traceM "Reached parseBold" + text <- + choice + [ between (string "__") (string "__") (many1 (lookaheadParse (/= "__"))), + between (string "**") (string "**") (many1 (lookaheadParse (/= "**"))) + ] + let parsedText = fst $ leftmostLongestParse parseLine text + return (Bold parsedText) -- Parse italic text parseItalic :: ReadP MdToken parseItalic = do - text <- choice[ - (between (string "_") (string "_") (munch1 (/= '_'))), - (between (string "*") (string "*") (munch1 (/= '*'))) - ] - let parsedText = fst $ leftmostLongestParse parseLine text - return (Italic parsedText) + traceM "Reached parseItalic" + text <- + choice + [ (between (string "_") (string "_") (munch1 (/= '_'))), + (between (string "*") (string "*") (munch1 (/= '*'))) + ] + let parsedText = fst $ leftmostLongestParse parseLine text + return (Italic parsedText) -- Parse a linebreak character parseLinebreak :: ReadP MdToken parseLinebreak = do - char '\n' - return Linebreak + traceM "Reached parseLinebreak" + char ' ' + many1 (char ' ') + char '\n' + return Linebreak -- Parse a regular string as a Unit. parseString :: ReadP MdToken parseString = do - firstChar <- get -- Must parse at least one character here - text <- munch (\x -> not (elem x "#*_[\n")) - return (Unit (firstChar:text)) + traceM "Reached parseString" + firstChar <- get -- Must parse at least one character here + text <- munch (\x -> not (elem x "#*_[\n ")) + return (Unit (firstChar : text)) lineParsers :: [ReadP MdToken] -lineParsers = [parseHeader, parseLinebreak, parseBold, parseItalic, parseString] -- A 'line' doesn't include a 'header' +lineParsers = [parseLinebreak, parseBold, parseItalic, parseString] -- A 'line' doesn't include a 'header' -- List of all parsers allParsers :: [ReadP MdToken] -allParsers = parseHeader:lineParsers +allParsers = parseHeader : lineParsers -- Parse any of the above tokens. parseLineToken :: ReadP MdToken @@ -141,27 +153,32 @@ parseLineToken = choice lineParsers -- Parse a line, consisting of one or more tokens. parseLine :: ReadP MdToken -parseLine = do - skipSpaces - -- Fail if we have reached the end of the document. - remaining <- look - when (null remaining) pfail - parsed <- parseMany parseLineToken --- traceM $ show parsed - return (Line parsed) +parseLine = do + traceM "Reached parseLine" + skipSpaces + -- Fail if we have reached the end of the document. + remaining <- look + when (null remaining) pfail + parsed <- parseMany parseLineToken + -- traceM $ show parsed + return (Line parsed) -- Parse a paragraph, which is a 'Line' (can span multiple actual lines), separated by double-newlines. -- As a weird special case, a 'Paragraph' can also be a 'Header'. parsePara :: ReadP MdToken parsePara = do - parseMany (char '\n') - text <- many1 (lookaheadParse (\x -> ((length x) < 2) || (take 2 x) /= "\n\n")) -- Parse until a double-newline. - string "\n\n" <|> (eof >> return "") -- Consume the next double-newline or EOF. - let parsedText = fst $ leftmostLongestParse (parseHeader <|> parseLine) text -- Parse either a line or a header. - -- If the paragraph is a header, return a Header token. Otheriwse return a Para token. - case parsedText of - Header level token -> return (Header level token) - _ -> return (Para parsedText) + traceM "Reached parsePara" + parseMany (char '\n') + -- text <- many1 (lookaheadParse (\x -> ((length x) < 2) || (take 2 x) /= "\n\n")) -- Parse until a double-newline. + -- string "\n\n" <|> (eof >> return "") -- Consume the next double-newline or EOF. + text <- (manyTill get ((string "\n\n") <|> (eof >> return ""))) + when (null text) pfail + let parsedText = fst $ leftmostLongestParse (parseHeader <|> parseLine) text -- Parse either a line or a header. + traceM (show parsedText) + -- If the paragraph is a header, return a Header token. Otheriwse return a Para token. + case parsedText of + Header level token -> return (Header level token) + _ -> return (Para parsedText) -- Parse a document, which is multiple paragraphs. parseDocument :: ReadP MdToken diff --git a/src/Test.hs b/src/Test.hs index 4a72c64..d415adb 100644 --- a/src/Test.hs +++ b/src/Test.hs @@ -28,8 +28,9 @@ boldTests = TestList integrationTests = TestList [ - check_equal "Integration 1" "

Sample Markdown

This is some basic, sample markdown.

Second Heading

" (convert "# Sample Markdown\n\n This is some basic, sample markdown.\n\n ## __Second__ _Heading_") - -- Add a test for single-newlines. + check_equal "Integration 1" "

Sample Markdown

This is some basic, sample markdown.

Second Heading

" (convert "# Sample Markdown\n\n This is some basic, sample markdown.\n\n ## __Second__ _Heading_"), + check_equal "Integration 2" "

Hello World

" (convert "__Hello__\n_World_"), + check_equal "Integration 3" "

Hello

WorldM/p>" (convert "# Hello\nWorld") ]