Trying to fix a bug where a header with a newline isn't recognized as a header

master
Aadhavan Srinivasan 6 days ago
parent d771460bb1
commit 7fdc1bcbf1

@ -4,5 +4,5 @@ import MdToHTML
main :: IO () main :: IO ()
main = do main = do
let res = fst $ leftmostLongestParse parseDocument "# _Hello_\n\n # Hello" let res = fst $ leftmostLongestParse parseDocument "# _Hello_\n"
putStrLn (show res) putStrLn (show res)

@ -1,59 +1,59 @@
module MdToHTML where module MdToHTML where
import Text.ParserCombinators.ReadP
import Control.Monad
import Control.Applicative import Control.Applicative
import Text.Printf import Control.Monad
import Debug.Trace
import Data.List import Data.List
import Debug.Trace
import Text.ParserCombinators.ReadP
import Text.Printf
type HeaderLevel = Int type HeaderLevel = Int
newtype URL = URL {getUrl :: String} newtype URL = URL {getUrl :: String}
newtype ImgPath = ImgPath {getPath :: String} newtype ImgPath = ImgPath {getPath :: String}
parseMany :: ReadP a -> ReadP [a] parseMany :: ReadP a -> ReadP [a]
parseMany = Text.ParserCombinators.ReadP.many parseMany = Text.ParserCombinators.ReadP.many
data MdToken = Document [MdToken] data MdToken
| Header HeaderLevel MdToken = Document [MdToken]
| Para MdToken | Header HeaderLevel MdToken
| Line [MdToken] | Para MdToken
| Linebreak | Line [MdToken]
| HorizontalRule | Linebreak
| Blockquote MdToken | HorizontalRule
| UnordList [MdToken] | Blockquote MdToken
| OrdList [MdToken] | UnordList [MdToken]
| Code String | OrdList [MdToken]
| Codeblock String | Code String
| Link MdToken URL | Codeblock String
| Image MdToken ImgPath | Link MdToken URL
| Bold MdToken | Image MdToken ImgPath
| Italic MdToken | Bold MdToken
| Strikethrough MdToken | Italic MdToken
| Unit String | Strikethrough MdToken
| Unit String
-- Deriving Show for MdToken -- Deriving Show for MdToken
instance Show MdToken where instance Show MdToken where
show (Document tokens) = concat(map show tokens) show (Document tokens) = concat (map show tokens)
show (Header level token) = "<h" ++ show level ++ ">" ++ show token ++ "</h" ++ show level ++ ">" show (Header level token) = "<h" ++ show level ++ ">" ++ show token ++ "</h" ++ show level ++ ">"
show (Para token) = "<p>" ++ show token ++ "</p>" show (Para token) = "<p>" ++ show token ++ "</p>"
show (Line tokens) = concat(map show tokens) show (Line tokens) = concat (map show tokens)
show Linebreak = "<br>" show Linebreak = "<br>"
show HorizontalRule = "---------" show HorizontalRule = "---------"
show (Blockquote token) = "BLOCK" ++ show token show (Blockquote token) = "BLOCK" ++ show token
show (UnordList tokens) = "UNORD" ++ concat(map show tokens) show (UnordList tokens) = "UNORD" ++ concat (map show tokens)
show (OrdList tokens) = "ORD" ++ concat(map show tokens) show (OrdList tokens) = "ORD" ++ concat (map show tokens)
show (Code code) = show code show (Code code) = show code
show (Codeblock code) = show code show (Codeblock code) = show code
show (Link txt url) = "<a href=" ++ (getUrl url) ++ ">" ++ show txt ++ "</a>" show (Link txt url) = "<a href=" ++ (getUrl url) ++ ">" ++ show txt ++ "</a>"
show (Image txt imgPath) = "<img src=" ++ (getPath imgPath) ++ ">" ++ show txt ++ "</img>" show (Image txt imgPath) = "<img src=" ++ (getPath imgPath) ++ ">" ++ show txt ++ "</img>"
show (Bold token) = "<b>" ++ show token ++ "</b>" show (Bold token) = "<b>" ++ show token ++ "</b>"
show (Italic token) = "<i>" ++ show token ++ "</i>" show (Italic token) = "<i>" ++ show token ++ "</i>"
show (Strikethrough token) = "<s>" ++ show token ++ "</s>" show (Strikethrough token) = "<s>" ++ show token ++ "</s>"
show (Unit unit) = printf "%s" unit show (Unit unit) = printf "%s" unit
-- --------------- -- ---------------
-- Helpers -- Helpers
@ -61,79 +61,91 @@ mustBeHash :: ReadP Char
mustBeHash = satisfy (\x -> x == '#') mustBeHash = satisfy (\x -> x == '#')
leftmostLongest :: (Foldable t) => [(a, t b)] -> (a, t b) leftmostLongest :: (Foldable t) => [(a, t b)] -> (a, t b)
leftmostLongest xs = leftmostLongest xs =
let lastElem = (last xs) let lastElem = (last xs)
filteredLst = (filter (\val -> (length $ snd val) == (length $ snd lastElem)) xs) filteredLst = (filter (\val -> (length $ snd val) == (length $ snd lastElem)) xs)
in head filteredLst in head filteredLst
-- Get the first parse returned by readP_to_S that consumed the most input -- Get the first parse returned by readP_to_S that consumed the most input
leftmostLongestParse :: ReadP a -> String -> (a, String) leftmostLongestParse :: ReadP a -> String -> (a, String)
leftmostLongestParse parser input = leftmostLongest $ readP_to_S parser input leftmostLongestParse parser input = leftmostLongest $ readP_to_S parser input
-- Parse if the string that's left matches the string comparator function -- Parse if the string that's left matches the string comparator function
lookaheadParse :: (String -> Bool) -> ReadP Char lookaheadParse :: (String -> Bool) -> ReadP Char
lookaheadParse stringCmp = do lookaheadParse stringCmp = do
lookahead <- look lookahead <- look
case stringCmp lookahead of case stringCmp lookahead of
True -> get True -> get
False -> pfail False -> pfail
lineToList :: MdToken -> [MdToken] lineToList :: MdToken -> [MdToken]
lineToList (Line tokens) = tokens lineToList (Line tokens) = tokens
-- ---------------
-- ---------------
-- Parse a markdown header, denoted by 1-6 #'s followed by some text, followed by EOL. -- Parse a markdown header, denoted by 1-6 #'s followed by some text, followed by EOL.
parseHeader :: ReadP MdToken parseHeader :: ReadP MdToken
parseHeader = do parseHeader = do
skipSpaces traceM "Reached parseHeader"
headers <- many1 mustBeHash skipSpaces
when ((length headers) > 6) headers <- many1 mustBeHash
pfail when
_ <- string " " ((length headers) > 6)
text <- munch1 (\x -> x /= '\n') -- Parse until EOL pfail
-- traceM text _ <- string " "
let parsedText = fst $ leftmostLongestParse parseLine text text <- munch1 (\x -> x /= '\n') -- Parse until EOL
return (Header (length headers) parsedText) let parsedText = fst $ leftmostLongestParse parseLine text
traceM (show parsedText)
traceM (show (length headers))
return (Header (length headers) parsedText)
-- Parse bold text -- Parse bold text
parseBold :: ReadP MdToken parseBold :: ReadP MdToken
parseBold = do parseBold = do
text <- choice[ traceM "Reached parseBold"
(between (string "__") (string "__") (many1 (lookaheadParse (/= "__")))), text <-
(between (string "**") (string "**") (many1 (lookaheadParse (/= "**")))) choice
] [ between (string "__") (string "__") (many1 (lookaheadParse (/= "__"))),
let parsedText = fst $ leftmostLongestParse parseLine text between (string "**") (string "**") (many1 (lookaheadParse (/= "**")))
return (Bold parsedText) ]
let parsedText = fst $ leftmostLongestParse parseLine text
return (Bold parsedText)
-- Parse italic text -- Parse italic text
parseItalic :: ReadP MdToken parseItalic :: ReadP MdToken
parseItalic = do parseItalic = do
text <- choice[ traceM "Reached parseItalic"
(between (string "_") (string "_") (munch1 (/= '_'))), text <-
(between (string "*") (string "*") (munch1 (/= '*'))) choice
] [ (between (string "_") (string "_") (munch1 (/= '_'))),
let parsedText = fst $ leftmostLongestParse parseLine text (between (string "*") (string "*") (munch1 (/= '*')))
return (Italic parsedText) ]
let parsedText = fst $ leftmostLongestParse parseLine text
return (Italic parsedText)
-- Parse a linebreak character -- Parse a linebreak character
parseLinebreak :: ReadP MdToken parseLinebreak :: ReadP MdToken
parseLinebreak = do parseLinebreak = do
char '\n' traceM "Reached parseLinebreak"
return Linebreak char ' '
many1 (char ' ')
char '\n'
return Linebreak
-- Parse a regular string as a Unit. -- Parse a regular string as a Unit.
parseString :: ReadP MdToken parseString :: ReadP MdToken
parseString = do parseString = do
firstChar <- get -- Must parse at least one character here traceM "Reached parseString"
text <- munch (\x -> not (elem x "#*_[\n")) firstChar <- get -- Must parse at least one character here
return (Unit (firstChar:text)) text <- munch (\x -> not (elem x "#*_[\n "))
return (Unit (firstChar : text))
lineParsers :: [ReadP MdToken] lineParsers :: [ReadP MdToken]
lineParsers = [parseHeader, parseLinebreak, parseBold, parseItalic, parseString] -- A 'line' doesn't include a 'header' lineParsers = [parseLinebreak, parseBold, parseItalic, parseString] -- A 'line' doesn't include a 'header'
-- List of all parsers -- List of all parsers
allParsers :: [ReadP MdToken] allParsers :: [ReadP MdToken]
allParsers = parseHeader:lineParsers allParsers = parseHeader : lineParsers
-- Parse any of the above tokens. -- Parse any of the above tokens.
parseLineToken :: ReadP MdToken parseLineToken :: ReadP MdToken
@ -141,27 +153,32 @@ parseLineToken = choice lineParsers
-- Parse a line, consisting of one or more tokens. -- Parse a line, consisting of one or more tokens.
parseLine :: ReadP MdToken parseLine :: ReadP MdToken
parseLine = do parseLine = do
skipSpaces traceM "Reached parseLine"
-- Fail if we have reached the end of the document. skipSpaces
remaining <- look -- Fail if we have reached the end of the document.
when (null remaining) pfail remaining <- look
parsed <- parseMany parseLineToken when (null remaining) pfail
-- traceM $ show parsed parsed <- parseMany parseLineToken
return (Line parsed) -- traceM $ show parsed
return (Line parsed)
-- Parse a paragraph, which is a 'Line' (can span multiple actual lines), separated by double-newlines. -- Parse a paragraph, which is a 'Line' (can span multiple actual lines), separated by double-newlines.
-- As a weird special case, a 'Paragraph' can also be a 'Header'. -- As a weird special case, a 'Paragraph' can also be a 'Header'.
parsePara :: ReadP MdToken parsePara :: ReadP MdToken
parsePara = do parsePara = do
parseMany (char '\n') traceM "Reached parsePara"
text <- many1 (lookaheadParse (\x -> ((length x) < 2) || (take 2 x) /= "\n\n")) -- Parse until a double-newline. parseMany (char '\n')
string "\n\n" <|> (eof >> return "") -- Consume the next double-newline or EOF. -- text <- many1 (lookaheadParse (\x -> ((length x) < 2) || (take 2 x) /= "\n\n")) -- Parse until a double-newline.
let parsedText = fst $ leftmostLongestParse (parseHeader <|> parseLine) text -- Parse either a line or a header. -- string "\n\n" <|> (eof >> return "") -- Consume the next double-newline or EOF.
-- If the paragraph is a header, return a Header token. Otheriwse return a Para token. text <- (manyTill get ((string "\n\n") <|> (eof >> return "")))
case parsedText of when (null text) pfail
Header level token -> return (Header level token) let parsedText = fst $ leftmostLongestParse (parseHeader <|> parseLine) text -- Parse either a line or a header.
_ -> return (Para parsedText) traceM (show parsedText)
-- If the paragraph is a header, return a Header token. Otheriwse return a Para token.
case parsedText of
Header level token -> return (Header level token)
_ -> return (Para parsedText)
-- Parse a document, which is multiple paragraphs. -- Parse a document, which is multiple paragraphs.
parseDocument :: ReadP MdToken parseDocument :: ReadP MdToken

@ -28,8 +28,9 @@ boldTests = TestList
integrationTests = TestList integrationTests = TestList
[ [
check_equal "Integration 1" "<h1>Sample Markdown</h1><p>This is some basic, sample markdown.</p><h2><b>Second</b> <i>Heading</i></h2>" (convert "# Sample Markdown\n\n This is some basic, sample markdown.\n\n ## __Second__ _Heading_") check_equal "Integration 1" "<h1>Sample Markdown</h1><p>This is some basic, sample markdown.</p><h2><b>Second</b> <i>Heading</i></h2>" (convert "# Sample Markdown\n\n This is some basic, sample markdown.\n\n ## __Second__ _Heading_"),
-- Add a test for single-newlines. check_equal "Integration 2" "<p><b>Hello</b> <i>World</i></p>" (convert "__Hello__\n_World_"),
check_equal "Integration 3" "<h1>Hello</h1><p>WorldM/p>" (convert "# Hello\nWorld")
] ]

Loading…
Cancel
Save