Trying to fix a bug where a header with a newline isn't recognized as a header
This commit is contained in:
@@ -4,5 +4,5 @@ import MdToHTML
|
|||||||
|
|
||||||
main :: IO ()
|
main :: IO ()
|
||||||
main = do
|
main = do
|
||||||
let res = fst $ leftmostLongestParse parseDocument "# _Hello_\n\n # Hello"
|
let res = fst $ leftmostLongestParse parseDocument "# _Hello_\n"
|
||||||
putStrLn (show res)
|
putStrLn (show res)
|
||||||
|
199
src/MdToHTML.hs
199
src/MdToHTML.hs
@@ -1,59 +1,59 @@
|
|||||||
module MdToHTML where
|
module MdToHTML where
|
||||||
|
|
||||||
import Text.ParserCombinators.ReadP
|
|
||||||
import Control.Monad
|
|
||||||
import Control.Applicative
|
import Control.Applicative
|
||||||
import Text.Printf
|
import Control.Monad
|
||||||
import Debug.Trace
|
|
||||||
import Data.List
|
import Data.List
|
||||||
|
import Debug.Trace
|
||||||
|
import Text.ParserCombinators.ReadP
|
||||||
|
import Text.Printf
|
||||||
|
|
||||||
type HeaderLevel = Int
|
type HeaderLevel = Int
|
||||||
|
|
||||||
newtype URL = URL {getUrl :: String}
|
newtype URL = URL {getUrl :: String}
|
||||||
|
|
||||||
newtype ImgPath = ImgPath {getPath :: String}
|
newtype ImgPath = ImgPath {getPath :: String}
|
||||||
|
|
||||||
parseMany :: ReadP a -> ReadP [a]
|
parseMany :: ReadP a -> ReadP [a]
|
||||||
parseMany = Text.ParserCombinators.ReadP.many
|
parseMany = Text.ParserCombinators.ReadP.many
|
||||||
|
|
||||||
data MdToken = Document [MdToken]
|
data MdToken
|
||||||
| Header HeaderLevel MdToken
|
= Document [MdToken]
|
||||||
| Para MdToken
|
| Header HeaderLevel MdToken
|
||||||
| Line [MdToken]
|
| Para MdToken
|
||||||
| Linebreak
|
| Line [MdToken]
|
||||||
| HorizontalRule
|
| Linebreak
|
||||||
| Blockquote MdToken
|
| HorizontalRule
|
||||||
| UnordList [MdToken]
|
| Blockquote MdToken
|
||||||
| OrdList [MdToken]
|
| UnordList [MdToken]
|
||||||
| Code String
|
| OrdList [MdToken]
|
||||||
| Codeblock String
|
| Code String
|
||||||
| Link MdToken URL
|
| Codeblock String
|
||||||
| Image MdToken ImgPath
|
| Link MdToken URL
|
||||||
| Bold MdToken
|
| Image MdToken ImgPath
|
||||||
| Italic MdToken
|
| Bold MdToken
|
||||||
| Strikethrough MdToken
|
| Italic MdToken
|
||||||
| Unit String
|
| Strikethrough MdToken
|
||||||
|
| Unit String
|
||||||
|
|
||||||
-- Deriving Show for MdToken
|
-- Deriving Show for MdToken
|
||||||
instance Show MdToken where
|
instance Show MdToken where
|
||||||
show (Document tokens) = concat(map show tokens)
|
show (Document tokens) = concat (map show tokens)
|
||||||
show (Header level token) = "<h" ++ show level ++ ">" ++ show token ++ "</h" ++ show level ++ ">"
|
show (Header level token) = "<h" ++ show level ++ ">" ++ show token ++ "</h" ++ show level ++ ">"
|
||||||
show (Para token) = "<p>" ++ show token ++ "</p>"
|
show (Para token) = "<p>" ++ show token ++ "</p>"
|
||||||
show (Line tokens) = concat(map show tokens)
|
show (Line tokens) = concat (map show tokens)
|
||||||
show Linebreak = "<br>"
|
show Linebreak = "<br>"
|
||||||
show HorizontalRule = "---------"
|
show HorizontalRule = "---------"
|
||||||
show (Blockquote token) = "BLOCK" ++ show token
|
show (Blockquote token) = "BLOCK" ++ show token
|
||||||
show (UnordList tokens) = "UNORD" ++ concat(map show tokens)
|
show (UnordList tokens) = "UNORD" ++ concat (map show tokens)
|
||||||
show (OrdList tokens) = "ORD" ++ concat(map show tokens)
|
show (OrdList tokens) = "ORD" ++ concat (map show tokens)
|
||||||
show (Code code) = show code
|
show (Code code) = show code
|
||||||
show (Codeblock code) = show code
|
show (Codeblock code) = show code
|
||||||
show (Link txt url) = "<a href=" ++ (getUrl url) ++ ">" ++ show txt ++ "</a>"
|
show (Link txt url) = "<a href=" ++ (getUrl url) ++ ">" ++ show txt ++ "</a>"
|
||||||
show (Image txt imgPath) = "<img src=" ++ (getPath imgPath) ++ ">" ++ show txt ++ "</img>"
|
show (Image txt imgPath) = "<img src=" ++ (getPath imgPath) ++ ">" ++ show txt ++ "</img>"
|
||||||
show (Bold token) = "<b>" ++ show token ++ "</b>"
|
show (Bold token) = "<b>" ++ show token ++ "</b>"
|
||||||
show (Italic token) = "<i>" ++ show token ++ "</i>"
|
show (Italic token) = "<i>" ++ show token ++ "</i>"
|
||||||
show (Strikethrough token) = "<s>" ++ show token ++ "</s>"
|
show (Strikethrough token) = "<s>" ++ show token ++ "</s>"
|
||||||
show (Unit unit) = printf "%s" unit
|
show (Unit unit) = printf "%s" unit
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
-- ---------------
|
-- ---------------
|
||||||
-- Helpers
|
-- Helpers
|
||||||
@@ -62,9 +62,9 @@ mustBeHash = satisfy (\x -> x == '#')
|
|||||||
|
|
||||||
leftmostLongest :: (Foldable t) => [(a, t b)] -> (a, t b)
|
leftmostLongest :: (Foldable t) => [(a, t b)] -> (a, t b)
|
||||||
leftmostLongest xs =
|
leftmostLongest xs =
|
||||||
let lastElem = (last xs)
|
let lastElem = (last xs)
|
||||||
filteredLst = (filter (\val -> (length $ snd val) == (length $ snd lastElem)) xs)
|
filteredLst = (filter (\val -> (length $ snd val) == (length $ snd lastElem)) xs)
|
||||||
in head filteredLst
|
in head filteredLst
|
||||||
|
|
||||||
-- Get the first parse returned by readP_to_S that consumed the most input
|
-- Get the first parse returned by readP_to_S that consumed the most input
|
||||||
leftmostLongestParse :: ReadP a -> String -> (a, String)
|
leftmostLongestParse :: ReadP a -> String -> (a, String)
|
||||||
@@ -73,67 +73,79 @@ leftmostLongestParse parser input = leftmostLongest $ readP_to_S parser input
|
|||||||
-- Parse if the string that's left matches the string comparator function
|
-- Parse if the string that's left matches the string comparator function
|
||||||
lookaheadParse :: (String -> Bool) -> ReadP Char
|
lookaheadParse :: (String -> Bool) -> ReadP Char
|
||||||
lookaheadParse stringCmp = do
|
lookaheadParse stringCmp = do
|
||||||
lookahead <- look
|
lookahead <- look
|
||||||
case stringCmp lookahead of
|
case stringCmp lookahead of
|
||||||
True -> get
|
True -> get
|
||||||
False -> pfail
|
False -> pfail
|
||||||
|
|
||||||
lineToList :: MdToken -> [MdToken]
|
lineToList :: MdToken -> [MdToken]
|
||||||
lineToList (Line tokens) = tokens
|
lineToList (Line tokens) = tokens
|
||||||
|
|
||||||
-- ---------------
|
-- ---------------
|
||||||
|
|
||||||
-- Parse a markdown header, denoted by 1-6 #'s followed by some text, followed by EOL.
|
-- Parse a markdown header, denoted by 1-6 #'s followed by some text, followed by EOL.
|
||||||
parseHeader :: ReadP MdToken
|
parseHeader :: ReadP MdToken
|
||||||
parseHeader = do
|
parseHeader = do
|
||||||
skipSpaces
|
traceM "Reached parseHeader"
|
||||||
headers <- many1 mustBeHash
|
skipSpaces
|
||||||
when ((length headers) > 6)
|
headers <- many1 mustBeHash
|
||||||
pfail
|
when
|
||||||
_ <- string " "
|
((length headers) > 6)
|
||||||
text <- munch1 (\x -> x /= '\n') -- Parse until EOL
|
pfail
|
||||||
-- traceM text
|
_ <- string " "
|
||||||
let parsedText = fst $ leftmostLongestParse parseLine text
|
text <- munch1 (\x -> x /= '\n') -- Parse until EOL
|
||||||
return (Header (length headers) parsedText)
|
let parsedText = fst $ leftmostLongestParse parseLine text
|
||||||
|
traceM (show parsedText)
|
||||||
|
traceM (show (length headers))
|
||||||
|
return (Header (length headers) parsedText)
|
||||||
|
|
||||||
-- Parse bold text
|
-- Parse bold text
|
||||||
parseBold :: ReadP MdToken
|
parseBold :: ReadP MdToken
|
||||||
parseBold = do
|
parseBold = do
|
||||||
text <- choice[
|
traceM "Reached parseBold"
|
||||||
(between (string "__") (string "__") (many1 (lookaheadParse (/= "__")))),
|
text <-
|
||||||
(between (string "**") (string "**") (many1 (lookaheadParse (/= "**"))))
|
choice
|
||||||
]
|
[ between (string "__") (string "__") (many1 (lookaheadParse (/= "__"))),
|
||||||
let parsedText = fst $ leftmostLongestParse parseLine text
|
between (string "**") (string "**") (many1 (lookaheadParse (/= "**")))
|
||||||
return (Bold parsedText)
|
]
|
||||||
|
let parsedText = fst $ leftmostLongestParse parseLine text
|
||||||
|
return (Bold parsedText)
|
||||||
|
|
||||||
-- Parse italic text
|
-- Parse italic text
|
||||||
parseItalic :: ReadP MdToken
|
parseItalic :: ReadP MdToken
|
||||||
parseItalic = do
|
parseItalic = do
|
||||||
text <- choice[
|
traceM "Reached parseItalic"
|
||||||
(between (string "_") (string "_") (munch1 (/= '_'))),
|
text <-
|
||||||
(between (string "*") (string "*") (munch1 (/= '*')))
|
choice
|
||||||
]
|
[ (between (string "_") (string "_") (munch1 (/= '_'))),
|
||||||
let parsedText = fst $ leftmostLongestParse parseLine text
|
(between (string "*") (string "*") (munch1 (/= '*')))
|
||||||
return (Italic parsedText)
|
]
|
||||||
|
let parsedText = fst $ leftmostLongestParse parseLine text
|
||||||
|
return (Italic parsedText)
|
||||||
|
|
||||||
-- Parse a linebreak character
|
-- Parse a linebreak character
|
||||||
parseLinebreak :: ReadP MdToken
|
parseLinebreak :: ReadP MdToken
|
||||||
parseLinebreak = do
|
parseLinebreak = do
|
||||||
char '\n'
|
traceM "Reached parseLinebreak"
|
||||||
return Linebreak
|
char ' '
|
||||||
|
many1 (char ' ')
|
||||||
|
char '\n'
|
||||||
|
return Linebreak
|
||||||
|
|
||||||
-- Parse a regular string as a Unit.
|
-- Parse a regular string as a Unit.
|
||||||
parseString :: ReadP MdToken
|
parseString :: ReadP MdToken
|
||||||
parseString = do
|
parseString = do
|
||||||
firstChar <- get -- Must parse at least one character here
|
traceM "Reached parseString"
|
||||||
text <- munch (\x -> not (elem x "#*_[\n"))
|
firstChar <- get -- Must parse at least one character here
|
||||||
return (Unit (firstChar:text))
|
text <- munch (\x -> not (elem x "#*_[\n "))
|
||||||
|
return (Unit (firstChar : text))
|
||||||
|
|
||||||
lineParsers :: [ReadP MdToken]
|
lineParsers :: [ReadP MdToken]
|
||||||
lineParsers = [parseHeader, parseLinebreak, parseBold, parseItalic, parseString] -- A 'line' doesn't include a 'header'
|
lineParsers = [parseLinebreak, parseBold, parseItalic, parseString] -- A 'line' doesn't include a 'header'
|
||||||
|
|
||||||
-- List of all parsers
|
-- List of all parsers
|
||||||
allParsers :: [ReadP MdToken]
|
allParsers :: [ReadP MdToken]
|
||||||
allParsers = parseHeader:lineParsers
|
allParsers = parseHeader : lineParsers
|
||||||
|
|
||||||
-- Parse any of the above tokens.
|
-- Parse any of the above tokens.
|
||||||
parseLineToken :: ReadP MdToken
|
parseLineToken :: ReadP MdToken
|
||||||
@@ -141,27 +153,32 @@ parseLineToken = choice lineParsers
|
|||||||
|
|
||||||
-- Parse a line, consisting of one or more tokens.
|
-- Parse a line, consisting of one or more tokens.
|
||||||
parseLine :: ReadP MdToken
|
parseLine :: ReadP MdToken
|
||||||
parseLine = do
|
parseLine = do
|
||||||
skipSpaces
|
traceM "Reached parseLine"
|
||||||
-- Fail if we have reached the end of the document.
|
skipSpaces
|
||||||
remaining <- look
|
-- Fail if we have reached the end of the document.
|
||||||
when (null remaining) pfail
|
remaining <- look
|
||||||
parsed <- parseMany parseLineToken
|
when (null remaining) pfail
|
||||||
-- traceM $ show parsed
|
parsed <- parseMany parseLineToken
|
||||||
return (Line parsed)
|
-- traceM $ show parsed
|
||||||
|
return (Line parsed)
|
||||||
|
|
||||||
-- Parse a paragraph, which is a 'Line' (can span multiple actual lines), separated by double-newlines.
|
-- Parse a paragraph, which is a 'Line' (can span multiple actual lines), separated by double-newlines.
|
||||||
-- As a weird special case, a 'Paragraph' can also be a 'Header'.
|
-- As a weird special case, a 'Paragraph' can also be a 'Header'.
|
||||||
parsePara :: ReadP MdToken
|
parsePara :: ReadP MdToken
|
||||||
parsePara = do
|
parsePara = do
|
||||||
parseMany (char '\n')
|
traceM "Reached parsePara"
|
||||||
text <- many1 (lookaheadParse (\x -> ((length x) < 2) || (take 2 x) /= "\n\n")) -- Parse until a double-newline.
|
parseMany (char '\n')
|
||||||
string "\n\n" <|> (eof >> return "") -- Consume the next double-newline or EOF.
|
-- text <- many1 (lookaheadParse (\x -> ((length x) < 2) || (take 2 x) /= "\n\n")) -- Parse until a double-newline.
|
||||||
let parsedText = fst $ leftmostLongestParse (parseHeader <|> parseLine) text -- Parse either a line or a header.
|
-- string "\n\n" <|> (eof >> return "") -- Consume the next double-newline or EOF.
|
||||||
-- If the paragraph is a header, return a Header token. Otheriwse return a Para token.
|
text <- (manyTill get ((string "\n\n") <|> (eof >> return "")))
|
||||||
case parsedText of
|
when (null text) pfail
|
||||||
Header level token -> return (Header level token)
|
let parsedText = fst $ leftmostLongestParse (parseHeader <|> parseLine) text -- Parse either a line or a header.
|
||||||
_ -> return (Para parsedText)
|
traceM (show parsedText)
|
||||||
|
-- If the paragraph is a header, return a Header token. Otheriwse return a Para token.
|
||||||
|
case parsedText of
|
||||||
|
Header level token -> return (Header level token)
|
||||||
|
_ -> return (Para parsedText)
|
||||||
|
|
||||||
-- Parse a document, which is multiple paragraphs.
|
-- Parse a document, which is multiple paragraphs.
|
||||||
parseDocument :: ReadP MdToken
|
parseDocument :: ReadP MdToken
|
||||||
|
@@ -28,8 +28,9 @@ boldTests = TestList
|
|||||||
|
|
||||||
integrationTests = TestList
|
integrationTests = TestList
|
||||||
[
|
[
|
||||||
check_equal "Integration 1" "<h1>Sample Markdown</h1><p>This is some basic, sample markdown.</p><h2><b>Second</b> <i>Heading</i></h2>" (convert "# Sample Markdown\n\n This is some basic, sample markdown.\n\n ## __Second__ _Heading_")
|
check_equal "Integration 1" "<h1>Sample Markdown</h1><p>This is some basic, sample markdown.</p><h2><b>Second</b> <i>Heading</i></h2>" (convert "# Sample Markdown\n\n This is some basic, sample markdown.\n\n ## __Second__ _Heading_"),
|
||||||
-- Add a test for single-newlines.
|
check_equal "Integration 2" "<p><b>Hello</b> <i>World</i></p>" (convert "__Hello__\n_World_"),
|
||||||
|
check_equal "Integration 3" "<h1>Hello</h1><p>WorldM/p>" (convert "# Hello\nWorld")
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user