md-to-html/src/MdToHTML.hs

{-# OPTIONS_GHC -Wno-unrecognised-pragmas #-}

{-# HLINT ignore "Use lambda-case" #-}

module MdToHTML where

import Control.Applicative
import Control.Monad
import Data.List
import Data.Ord (comparing)
import Debug.Trace
import Text.ParserCombinators.ReadP
import Text.Printf

type HeaderLevel = Int

newtype URL = URL {getUrl :: String} deriving (Eq)

newtype ImgPath = ImgPath {getPath :: String} deriving (Eq)

parseMany :: ReadP a -> ReadP [a]
parseMany = Text.ParserCombinators.ReadP.many

data MdToken
  = Document [MdToken]
  | Header HeaderLevel MdToken
  | Para MdToken
  | Line [MdToken]
  | SingleNewline -- A single newline is rendered as a space.
  | Linebreak
  | HorizontalRule
  | Blockquote [MdToken]
  | UnordList [MdToken]
  | OrdList [MdToken]
  | Code String
  | Codeblock String
  | Link MdToken URL
  | Image MdToken ImgPath
  | Bold MdToken
  | Italic MdToken
  | Strikethrough MdToken
  | Unit String
  deriving (Eq)

-- Deriving Show for MdToken
instance Show MdToken where
  show (Document tokens) = concatMap show tokens
  show (Header level token) = "<h" ++ show level ++ ">" ++ show token ++ "</h" ++ show level ++ ">"
  show (Para token) = "<p>" ++ show token ++ "</p>"
  show (Line tokens) = concatMap show tokens
  show Linebreak = "<br>"
  show SingleNewline = " "
  show HorizontalRule = "---------"
  show (Blockquote tokens) = "<blockquote>" ++ concatMap show tokens ++ "</blockquote>"
  show (UnordList tokens) = "<ul>" ++ concatMap (prepend "<li>" . append "</li>" . show) tokens ++ "</ul>"
  show (OrdList tokens) = "<ol>" ++ concatMap (prepend "<li>" . append "</li>" . show) tokens ++ "</ol>"
  show (Code code) = show code
  show (Codeblock code) = show code
  show (Link txt url) = "<a href=\"" ++ getUrl url ++ "\">" ++ show txt ++ "</a>"
  show (Image txt imgPath) = "<img src=" ++ getPath imgPath ++ ">" ++ show txt ++ "</img>"
  show (Bold token) = "<b>" ++ show token ++ "</b>"
  show (Italic token) = "<i>" ++ show token ++ "</i>"
  show (Strikethrough token) = "<s>" ++ show token ++ "</s>"
  show (Unit unit) = printf "%s" unit

instance Semigroup MdToken where
  a <> b = Document [a, b]

instance Monoid MdToken where
  mempty = Unit ""

-- ---------------
-- Helpers
leftmostLongest :: (Foldable t) => [(a, t b)] -> Maybe (a, t b)
leftmostLongest xs =
  let lastElem = last xs
      filteredLst = filter (\val -> length (snd val) == length (snd lastElem)) xs
   in case filteredLst of
        [] -> Nothing
        (x : xs) -> Just x

-- Get the first parse returned by readP_to_S that consumed the most input
leftmostLongestParse :: (Monoid a) => ReadP a -> String -> (a, String)
leftmostLongestParse parser input =
  let res = leftmostLongest $ readP_to_S parser input
   in case res of
        Nothing -> (mempty, mempty)
        Just x -> x

-- Parse if the string that's left matches the string comparator function
lookaheadParse :: (String -> Bool) -> ReadP Char
lookaheadParse stringCmp = do
  lookahead <- look
  case stringCmp lookahead of
    True -> get
    False -> pfail

lineToList :: MdToken -> [MdToken]
lineToList (Line tokens) = tokens

specialChars = "\\#*_[\n"

-- Makes a parser greedy. Instead of returning all possible parses, only the longest one is returned.
greedyParse :: ReadP a -> ReadP [a]
greedyParse parser = do
  greedyParse1 parser <++ return []

-- Like greedyParse, but the parser must succeed atleast once.
greedyParse1 :: ReadP a -> ReadP [a]
greedyParse1 parser = do
  parsed1 <- parser
  parsed2 <- greedyParse1 parser <++ return []
  return (parsed1 : parsed2)

prepend :: [a] -> [a] -> [a]
prepend x1 x2 = x1 ++ x2

append :: [a] -> [a] -> [a]
append x1 x2 = x2 ++ x1

-- Sequence two parsers, running one after the other and returning the result.
sequenceParse :: ReadP a -> ReadP a -> ReadP [a]
sequenceParse p1 p2 = twoElemList <$> p1 <*> p2
  where
    twoElemList elem1 elem2 = [elem1, elem2]

-- Parses p1 until p2 succeeds, but doesn't actually consume anything from p2.
-- Similar to manyTill, except manyTill's second parser actually consumes characters.
manyTillLazy :: ReadP a -> ReadP b -> ReadP [a]
manyTillLazy p1 p2 = do
  res <- p1
  remaining <- look
  let p2res = readP_to_S p2 remaining
  case p2res of
    [] -> do
      res2 <- manyTillLazy p1 p2
      return (res : res2)
    _ -> return [res]

-- Parse until EOL or EOF
parseTillEol :: ReadP String
parseTillEol = manyTill get (void (char '\n') <++ eof)

-- Takes a list of parsers. Returns a parser that will try them in
-- order, moving to the next one only if the current one fails.
fallthroughParser :: [ReadP a] -> ReadP a
fallthroughParser [x] = x
fallthroughParser (x : xs) = x <++ fallthroughParser xs

-- ---------------

-- Parse a markdown header, denoted by 1-6 #'s followed by some text, followed by EOL.
parseHeader :: ReadP MdToken
parseHeader = do
  skipSpaces
  headers <- munch1 (== '#')
  when
    (length headers > 6)
    pfail
  skipSpaces
  text <- munch1 (/= '\n')
  --  Text.ParserCombinators.ReadP.optional (char '\n')
  skipSpaces
  let parsedText = fst $ leftmostLongestParse parseLine text
  return (Header (length headers) parsedText)

-- Parse bold text
parseBold :: ReadP MdToken
parseBold = parseBoldWith "**" <|> parseBoldWith "__"
  where
    parseBoldWith delim = do
      string delim
      inside <- greedyParse1 parseLineToken
      string delim
      return (Bold (Line inside))

-- Parse italic text
parseItalic :: ReadP MdToken
parseItalic = parseBoldWith "*" <|> parseBoldWith "_"
  where
    parseBoldWith delim = do
      string delim
      inside <- greedyParse1 parseLineToken
      string delim
      return (Italic (Line inside))

-- Parse strikethrough text
parseStrikethrough :: ReadP MdToken
parseStrikethrough = do
  string "~~"
  inside <- many1 parseLineToken
  string "~~"
  return (Strikethrough (Line inside))

-- Parse a link
parseLink :: ReadP MdToken
parseLink = do
  linkText <- between (string "[") (string "]") (many1 get)
  linkURL <- between (string "(") (string ")") (many1 get)
  let parsedLinkText = fst $ leftmostLongestParse parseLine linkText
  return $ Link parsedLinkText (URL linkURL)

-- Parse a linebreak character
parseLinebreak :: ReadP MdToken
parseLinebreak = do
  char ' '
  many1 (char ' ')
  char '\n'
  return Linebreak

parseSingleNewline :: ReadP MdToken
parseSingleNewline = do
  char '\n'
  return SingleNewline

-- Parse an escaped character
parseEscapedChar :: ReadP MdToken
parseEscapedChar = do
  char '\\'
  escapedChar <- choice (map char specialChars) -- Parse any of the special chars.
  return (Unit [escapedChar])

-- Parse a character as a Unit.
parseUnit :: ReadP MdToken
parseUnit = do
  text <- satisfy (`notElem` specialChars)
  return (Unit [text])

-- Parse a regular string as a Unit.
parseString :: ReadP MdToken
parseString = do
  firstChar <- satisfy (/= '\n') -- Must parse at least one non-newline character here
  text <- munch (`notElem` specialChars)
  return (Unit (firstChar : text))

lineParsers :: [ReadP MdToken]
lineParsers =
  [ parseLinebreak,
    parseSingleNewline,
    parseEscapedChar,
    parseBold,
    parseItalic,
    parseStrikethrough,
    parseLink,
    parseUnit
  ] -- A 'line' doesn't include a 'header'

listLineParsers :: [ReadP MdToken]
listLineParsers =
  [ parseLinebreak,
    parseEscapedChar,
    parseBold,
    parseItalic,
    parseStrikethrough,
    parseLink,
    parseUnit
  ] -- A list line cannot contain newlines.

-- List of all parsers
allParsers :: [ReadP MdToken]
allParsers = parseHeader : lineParsers

-- Parse any of the line tokens.
parseLineToken :: ReadP MdToken
parseLineToken = choice lineParsers

-- Parse a line, consisting of one or more tokens.
parseLine :: ReadP MdToken
parseLine = do
  skipSpaces
  -- Fail if we have reached the end of the document.
  parsed <- manyTill parseLineToken eof
  return (Line parsed)

-- Parse a paragraph, which is a 'Line' (can span multiple actual lines), separated by double-newlines.
parsePara :: ReadP MdToken
parsePara = do
  parseMany (char '\n')
  --  text <- many1 (lookaheadParse (\x -> ((length x) < 2) || (take 2 x) /= "\n\n")) -- Parse until a double-newline.
  --  string "\n\n" <|> (eof >> return "") -- Consume the next double-newline or EOF.
  text <- manyTill get (string "\n\n" <|> (eof >> return ""))
  when (null text) pfail
  let parsedText = fst $ leftmostLongestParse parseLine text -- Parse a line
  return (Para parsedText)

-- Parse a line starting with '>', return the line except for the '>'.
parseQuotedLine :: ReadP String
parseQuotedLine = do
  char '>'
  greedyParse (char ' ' +++ char '\t')
  restOfLine <- munch (/= '\n')
  Text.ParserCombinators.ReadP.optional (char '\n') >> return ""
  return restOfLine

-- Parse many 'quoted lines' until I see a non-quoted line.
parseQuotedLines :: ReadP [String]
parseQuotedLines =
  greedyParse1 $ do
    look >>= \line ->
      case line of
        ('>' : _) -> parseQuotedLine
        _ -> pfail

-- Parse a blockquote, which is a greater-than sign followed by a paragraph.
parseBlockquote :: ReadP MdToken
parseBlockquote = do
  quotedLines <- parseQuotedLines
  -- remaining <- look
  -- let quotedLines = fst $ leftmostLongestParse parseQuotedLines remaining
  -- string (init $ unlines quotedLines)
  let parsedQuotedLines = fst $ leftmostLongestParse (many1 (parseBlockquote <++ parsePara)) (init $ unlines quotedLines) -- unlines joins the lines together with a newline, and adds a trailing newline. init removes the trailing newline.
  return (Blockquote parsedQuotedLines)

-- Parse a nested list item.
parseUListNested :: ReadP MdToken
parseUListNested = do
  --  firstChar <- string "    " <++ string "\t"
  --  skipSpaces
  --  restOfLine <- manyTill get (void (char '\n') <++ eof)
  --  let restOfLineParsed = fst $ leftmostLongestParse parseLine restOfLine
  --  return restOfLineParsed
  let firstCharParser = string "    " <++ string "\t"
  let restOfLineParser = manyTill get (void (char '\n') <++ eof)
  lines <- greedyParse1 (firstCharParser *> restOfLineParser)
  let linesParsed = fst $ leftmostLongestParse parseUnorderedList (init $ unlines lines)
  return linesParsed

-- Parse an unordered list line item.
parseUListLineItem :: ReadP MdToken
parseUListLineItem = do
  firstChar <- choice (map char ['*', '+', '-'])
  skipSpaces
  restOfLine <- manyTill get (void (char '\n') <++ eof)
  let restOfLineParsed = fst $ leftmostLongestParse parseLine restOfLine
  nestedList <- parseUListNested <++ return (Unit "")
  return $ Line [restOfLineParsed, nestedList]

-- Parse an unordered list paragraph item.
-- This is defined as a line item, followed by an empty line, followed by one or more
-- lines indented by a space or tab.
parseUListParaItem :: ReadP MdToken
parseUListParaItem = do
  firstLine <- parseUListLineItem
  char '\n'
  lines <- greedyParse1 ((string "    " <|> string "\t") *> parseTillEol)
  let res = fst $ leftmostLongestParse (greedyParse1 parsePara) (init $ unlines lines)
  char '\n'
  return $ Document (Para firstLine : res) -- I only wrap this in a document because I want some way of converting [MdToken] to MdToken, without any overhead. There is no other reason to wrap it in a Document.

-- This is hacky as hell
-- parsedParas <- manyTillLazy parsePara (string "\n\n" *> choice (map char "*-+"))
-- return $ Document parsedParas -- I wrap this in a document because I want some way of converting [MdToken] to MdToken, without any overhead. There is no other reason to wrap it in a Document.

-- Parse an unordered list item, which can be a line item or another list.
parseUListItem :: ReadP MdToken
parseUListItem = parseUListParaItem <++ parseUListLineItem <++ parseUListNested

-- Parse an unordered list.
parseUnorderedList :: ReadP MdToken
parseUnorderedList = do
  lineItems <- greedyParse1 parseUListItem
  return $ UnordList lineItems

-- Parse a document, which is multiple paragraphs.
parseDocument :: ReadP MdToken
parseDocument = do
  res <- manyTill (parseHeader <++ parseBlockquote <++ parseUnorderedList <++ parsePara) eof
  return (Document res)