Trying to fix a bug where a header with a newline isn't recognized as a header

master
Aadhavan Srinivasan 6 days ago
parent d771460bb1
commit 7fdc1bcbf1

@ -4,5 +4,5 @@ import MdToHTML
main :: IO ()
main = do
let res = fst $ leftmostLongestParse parseDocument "# _Hello_\n\n # Hello"
let res = fst $ leftmostLongestParse parseDocument "# _Hello_\n"
putStrLn (show res)

@ -1,59 +1,59 @@
module MdToHTML where
import Text.ParserCombinators.ReadP
import Control.Monad
import Control.Applicative
import Text.Printf
import Debug.Trace
import Control.Monad
import Data.List
import Debug.Trace
import Text.ParserCombinators.ReadP
import Text.Printf
type HeaderLevel = Int
newtype URL = URL {getUrl :: String}
newtype ImgPath = ImgPath {getPath :: String}
parseMany :: ReadP a -> ReadP [a]
parseMany = Text.ParserCombinators.ReadP.many
data MdToken = Document [MdToken]
| Header HeaderLevel MdToken
| Para MdToken
| Line [MdToken]
| Linebreak
| HorizontalRule
| Blockquote MdToken
| UnordList [MdToken]
| OrdList [MdToken]
| Code String
| Codeblock String
| Link MdToken URL
| Image MdToken ImgPath
| Bold MdToken
| Italic MdToken
| Strikethrough MdToken
| Unit String
data MdToken
= Document [MdToken]
| Header HeaderLevel MdToken
| Para MdToken
| Line [MdToken]
| Linebreak
| HorizontalRule
| Blockquote MdToken
| UnordList [MdToken]
| OrdList [MdToken]
| Code String
| Codeblock String
| Link MdToken URL
| Image MdToken ImgPath
| Bold MdToken
| Italic MdToken
| Strikethrough MdToken
| Unit String
-- Deriving Show for MdToken
instance Show MdToken where
show (Document tokens) = concat(map show tokens)
show (Header level token) = "<h" ++ show level ++ ">" ++ show token ++ "</h" ++ show level ++ ">"
show (Para token) = "<p>" ++ show token ++ "</p>"
show (Line tokens) = concat(map show tokens)
show Linebreak = "<br>"
show HorizontalRule = "---------"
show (Blockquote token) = "BLOCK" ++ show token
show (UnordList tokens) = "UNORD" ++ concat(map show tokens)
show (OrdList tokens) = "ORD" ++ concat(map show tokens)
show (Code code) = show code
show (Codeblock code) = show code
show (Link txt url) = "<a href=" ++ (getUrl url) ++ ">" ++ show txt ++ "</a>"
show (Image txt imgPath) = "<img src=" ++ (getPath imgPath) ++ ">" ++ show txt ++ "</img>"
show (Bold token) = "<b>" ++ show token ++ "</b>"
show (Italic token) = "<i>" ++ show token ++ "</i>"
show (Strikethrough token) = "<s>" ++ show token ++ "</s>"
show (Unit unit) = printf "%s" unit
show (Document tokens) = concat (map show tokens)
show (Header level token) = "<h" ++ show level ++ ">" ++ show token ++ "</h" ++ show level ++ ">"
show (Para token) = "<p>" ++ show token ++ "</p>"
show (Line tokens) = concat (map show tokens)
show Linebreak = "<br>"
show HorizontalRule = "---------"
show (Blockquote token) = "BLOCK" ++ show token
show (UnordList tokens) = "UNORD" ++ concat (map show tokens)
show (OrdList tokens) = "ORD" ++ concat (map show tokens)
show (Code code) = show code
show (Codeblock code) = show code
show (Link txt url) = "<a href=" ++ (getUrl url) ++ ">" ++ show txt ++ "</a>"
show (Image txt imgPath) = "<img src=" ++ (getPath imgPath) ++ ">" ++ show txt ++ "</img>"
show (Bold token) = "<b>" ++ show token ++ "</b>"
show (Italic token) = "<i>" ++ show token ++ "</i>"
show (Strikethrough token) = "<s>" ++ show token ++ "</s>"
show (Unit unit) = printf "%s" unit
-- ---------------
-- Helpers
@ -61,79 +61,91 @@ mustBeHash :: ReadP Char
mustBeHash = satisfy (\x -> x == '#')
leftmostLongest :: (Foldable t) => [(a, t b)] -> (a, t b)
leftmostLongest xs =
let lastElem = (last xs)
filteredLst = (filter (\val -> (length $ snd val) == (length $ snd lastElem)) xs)
in head filteredLst
leftmostLongest xs =
let lastElem = (last xs)
filteredLst = (filter (\val -> (length $ snd val) == (length $ snd lastElem)) xs)
in head filteredLst
-- Get the first parse returned by readP_to_S that consumed the most input
leftmostLongestParse :: ReadP a -> String -> (a, String)
leftmostLongestParse parser input = leftmostLongest $ readP_to_S parser input
leftmostLongestParse parser input = leftmostLongest $ readP_to_S parser input
-- Parse if the string that's left matches the string comparator function
lookaheadParse :: (String -> Bool) -> ReadP Char
lookaheadParse stringCmp = do
lookahead <- look
case stringCmp lookahead of
True -> get
False -> pfail
lookahead <- look
case stringCmp lookahead of
True -> get
False -> pfail
lineToList :: MdToken -> [MdToken]
lineToList (Line tokens) = tokens
-- ---------------
-- ---------------
-- Parse a markdown header, denoted by 1-6 #'s followed by some text, followed by EOL.
parseHeader :: ReadP MdToken
parseHeader = do
skipSpaces
headers <- many1 mustBeHash
when ((length headers) > 6)
pfail
_ <- string " "
text <- munch1 (\x -> x /= '\n') -- Parse until EOL
-- traceM text
let parsedText = fst $ leftmostLongestParse parseLine text
return (Header (length headers) parsedText)
traceM "Reached parseHeader"
skipSpaces
headers <- many1 mustBeHash
when
((length headers) > 6)
pfail
_ <- string " "
text <- munch1 (\x -> x /= '\n') -- Parse until EOL
let parsedText = fst $ leftmostLongestParse parseLine text
traceM (show parsedText)
traceM (show (length headers))
return (Header (length headers) parsedText)
-- Parse bold text
parseBold :: ReadP MdToken
parseBold = do
text <- choice[
(between (string "__") (string "__") (many1 (lookaheadParse (/= "__")))),
(between (string "**") (string "**") (many1 (lookaheadParse (/= "**"))))
]
let parsedText = fst $ leftmostLongestParse parseLine text
return (Bold parsedText)
traceM "Reached parseBold"
text <-
choice
[ between (string "__") (string "__") (many1 (lookaheadParse (/= "__"))),
between (string "**") (string "**") (many1 (lookaheadParse (/= "**")))
]
let parsedText = fst $ leftmostLongestParse parseLine text
return (Bold parsedText)
-- Parse italic text
parseItalic :: ReadP MdToken
parseItalic = do
text <- choice[
(between (string "_") (string "_") (munch1 (/= '_'))),
(between (string "*") (string "*") (munch1 (/= '*')))
]
let parsedText = fst $ leftmostLongestParse parseLine text
return (Italic parsedText)
traceM "Reached parseItalic"
text <-
choice
[ (between (string "_") (string "_") (munch1 (/= '_'))),
(between (string "*") (string "*") (munch1 (/= '*')))
]
let parsedText = fst $ leftmostLongestParse parseLine text
return (Italic parsedText)
-- Parse a linebreak character
parseLinebreak :: ReadP MdToken
parseLinebreak = do
char '\n'
return Linebreak
traceM "Reached parseLinebreak"
char ' '
many1 (char ' ')
char '\n'
return Linebreak
-- Parse a regular string as a Unit.
parseString :: ReadP MdToken
parseString = do
firstChar <- get -- Must parse at least one character here
text <- munch (\x -> not (elem x "#*_[\n"))
return (Unit (firstChar:text))
traceM "Reached parseString"
firstChar <- get -- Must parse at least one character here
text <- munch (\x -> not (elem x "#*_[\n "))
return (Unit (firstChar : text))
lineParsers :: [ReadP MdToken]
lineParsers = [parseHeader, parseLinebreak, parseBold, parseItalic, parseString] -- A 'line' doesn't include a 'header'
lineParsers = [parseLinebreak, parseBold, parseItalic, parseString] -- A 'line' doesn't include a 'header'
-- List of all parsers
allParsers :: [ReadP MdToken]
allParsers = parseHeader:lineParsers
allParsers = parseHeader : lineParsers
-- Parse any of the above tokens.
parseLineToken :: ReadP MdToken
@ -141,27 +153,32 @@ parseLineToken = choice lineParsers
-- Parse a line, consisting of one or more tokens.
parseLine :: ReadP MdToken
parseLine = do
skipSpaces
-- Fail if we have reached the end of the document.
remaining <- look
when (null remaining) pfail
parsed <- parseMany parseLineToken
-- traceM $ show parsed
return (Line parsed)
parseLine = do
traceM "Reached parseLine"
skipSpaces
-- Fail if we have reached the end of the document.
remaining <- look
when (null remaining) pfail
parsed <- parseMany parseLineToken
-- traceM $ show parsed
return (Line parsed)
-- Parse a paragraph, which is a 'Line' (can span multiple actual lines), separated by double-newlines.
-- As a weird special case, a 'Paragraph' can also be a 'Header'.
parsePara :: ReadP MdToken
parsePara = do
parseMany (char '\n')
text <- many1 (lookaheadParse (\x -> ((length x) < 2) || (take 2 x) /= "\n\n")) -- Parse until a double-newline.
string "\n\n" <|> (eof >> return "") -- Consume the next double-newline or EOF.
let parsedText = fst $ leftmostLongestParse (parseHeader <|> parseLine) text -- Parse either a line or a header.
-- If the paragraph is a header, return a Header token. Otheriwse return a Para token.
case parsedText of
Header level token -> return (Header level token)
_ -> return (Para parsedText)
traceM "Reached parsePara"
parseMany (char '\n')
-- text <- many1 (lookaheadParse (\x -> ((length x) < 2) || (take 2 x) /= "\n\n")) -- Parse until a double-newline.
-- string "\n\n" <|> (eof >> return "") -- Consume the next double-newline or EOF.
text <- (manyTill get ((string "\n\n") <|> (eof >> return "")))
when (null text) pfail
let parsedText = fst $ leftmostLongestParse (parseHeader <|> parseLine) text -- Parse either a line or a header.
traceM (show parsedText)
-- If the paragraph is a header, return a Header token. Otheriwse return a Para token.
case parsedText of
Header level token -> return (Header level token)
_ -> return (Para parsedText)
-- Parse a document, which is multiple paragraphs.
parseDocument :: ReadP MdToken

@ -28,8 +28,9 @@ boldTests = TestList
integrationTests = TestList
[
check_equal "Integration 1" "<h1>Sample Markdown</h1><p>This is some basic, sample markdown.</p><h2><b>Second</b> <i>Heading</i></h2>" (convert "# Sample Markdown\n\n This is some basic, sample markdown.\n\n ## __Second__ _Heading_")
-- Add a test for single-newlines.
check_equal "Integration 1" "<h1>Sample Markdown</h1><p>This is some basic, sample markdown.</p><h2><b>Second</b> <i>Heading</i></h2>" (convert "# Sample Markdown\n\n This is some basic, sample markdown.\n\n ## __Second__ _Heading_"),
check_equal "Integration 2" "<p><b>Hello</b> <i>World</i></p>" (convert "__Hello__\n_World_"),
check_equal "Integration 3" "<h1>Hello</h1><p>WorldM/p>" (convert "# Hello\nWorld")
]

Loading…
Cancel
Save