md-to-html/src/MdToHTML.hs

{-# OPTIONS_GHC -Wno-unrecognised-pragmas #-}

{-# HLINT ignore "Use lambda-case" #-}

module MdToHTML where

import Control.Applicative hiding (many, some)
import Control.Monad
import Control.Monad.Combinators (count)
import Data.Char
import Data.List
import Data.Ord (comparing)
import Data.String.Utils
import qualified Data.Text as T
import Data.Void
import Debug.Trace
import Text.Megaparsec
import Text.Megaparsec.Char
import Text.Printf
import Text.Wrap

-- Parser over 'T.Text' input with no custom error component ('Void').
type Parser = Parsec Void T.Text

-- Heading depth: the N that the Show instance renders as <hN>.
type HeaderLevel = Int

-- A single CSS class name. 'parseCssClasses' stores it without the leading dot.
type CssClass = String

-- Target of a link or image; emitted verbatim into href/src attributes.
newtype URL = URL {getUrl :: String} deriving (Eq)

-- String wrapper that, judging by its name, is meant to hold an image path.
-- NOTE(review): not referenced anywhere in the visible code - confirm it is still needed.
newtype ImgPath = ImgPath {getPath :: String} deriving (Eq)

-- Syntax tree of a parsed Markdown document.
-- The 'Show' instance of this type renders a token as HTML.
data MdToken
  = Document [MdToken] -- Sequence of tokens rendered back to back.
  | Header HeaderLevel MdToken -- Heading of the given level with its content.
  | Para MdToken -- Paragraph, rendered inside <p>.
  | Line [MdToken] -- Run of inline tokens, rendered with no wrapper element.
  | SingleNewline -- A single newline is rendered as a space.
  | Linebreak -- Hard line break, rendered as <br />.
  | HorizontalRule -- Rendered as <hr>.
  | Blockquote [MdToken] -- Tokens rendered inside <blockquote>.
  | UnordList [MdToken] -- Each element becomes one <li> of a <ul>.
  | OrdList [MdToken] -- Each element becomes one <li> of an <ol>.
  | Code MdToken -- Inline code, rendered inside <code>.
  | Table [[MdToken]] -- Rows of cells; the first row is rendered as the table head.
  | Codeblock MdToken -- Code block, rendered inside <pre><code>.
  | Link MdToken URL -- Link text and its target.
  | Image MdToken URL (Maybe [CssClass]) -- Alt text, source and optional CSS classes.
  | Figure MdToken URL (Maybe [CssClass]) -- Like 'Image', but rendered inside <figure> with a caption.
  | Bold MdToken -- Rendered inside <b>.
  | Italic MdToken -- Rendered inside <i>.
  | Strikethrough MdToken -- Rendered inside <s>.
  | Unit String -- Raw text, emitted verbatim.
  deriving (Eq)

-- Show instance for MdToken: renders a token tree as compact HTML.
-- No newlines are inserted; see 'prettyPrint' for a line-broken variant.
-- 'Unit' text and URLs are emitted verbatim, so any escaping has to happen
-- before the token is built.
instance Show MdToken where
  show (Document tokens) = concatMap show tokens
  show (Header level token) = "<h" ++ show level ++ ">" ++ show token ++ "</h" ++ show level ++ ">"
  show (Para token) = "<p>" ++ show token ++ "</p>"
  show (Line tokens) = concatMap show tokens
  show Linebreak = "<br />"
  show SingleNewline = " "
  show HorizontalRule = "<hr>"
  show (Blockquote tokens) = "<blockquote>" ++ concatMap show tokens ++ "</blockquote>"
  show (UnordList tokens) = "<ul>" ++ concatMap (prepend "<li>" . append "</li>" . show) tokens ++ "</ul>"
  show (OrdList tokens) = "<ol>" ++ concatMap (prepend "<li>" . append "</li>" . show) tokens ++ "</ol>"
  -- Surrounding whitespace of inline code is dropped.
  show (Code code) = "<code>" ++ strip (show code) ++ "</code>"
  -- A table without even a header row used to hit a pattern-match failure;
  -- render it as an empty element instead.
  show (Table []) = "<table></table>"
  -- The first row is the header; trailing whitespace of every cell is dropped.
  show (Table (thead : tokenGrid)) =
    "<table><thead><tr>"
      ++ concatMap (cell "th") thead
      ++ "</tr></thead>"
      ++ "<tbody>"
      ++ concatMap bodyRow tokenGrid
      ++ "</tbody></table>"
    where
      cell tag token = "<" ++ tag ++ ">" ++ rstrip (show token) ++ "</" ++ tag ++ ">"
      bodyRow row = "<tr>" ++ concatMap (cell "td") row ++ "</tr>"
  show (Codeblock code) = "<pre><code>" ++ show code ++ "</code></pre>"
  show (Link txt url) = "<a href=\"" ++ getUrl url ++ "\">" ++ show txt ++ "</a>"
  show (Image txt url cssClasses) = "<img src=\"" ++ getUrl url ++ "\"" ++ " alt=\"" ++ show txt ++ "\"" ++ maybe "" (\classes -> " class=\"" ++ unwords classes ++ "\"") cssClasses ++ "/>"
  -- The caption repeats the alt text.
  show (Figure txt url cssClasses) = "<figure><img src=\"" ++ getUrl url ++ "\" alt=\"" ++ show txt ++ "\"" ++ maybe "" (\classes -> " class=\"" ++ unwords classes ++ "\"") cssClasses ++ "/><figcaption aria-hidden=\"true\">" ++ show txt ++ "</figcaption></figure>"
  show (Bold token) = "<b>" ++ show token ++ "</b>"
  show (Italic token) = "<i>" ++ show token ++ "</i>"
  show (Strikethrough token) = "<s>" ++ show token ++ "</s>"
  show (Unit unit) = unit

-- Render a token as HTML with line breaks added for readability.
-- Paragraphs are word-wrapped at 70 columns and followed by a newline, tables
-- get one element per line, and a 'Linebreak' is followed by a newline.
-- 'Document' and 'Line' recurse into their children; every other token is
-- rendered exactly as 'show' renders it.
prettyPrint :: MdToken -> String
prettyPrint token = case token of
  Document tokens -> concatMap prettyPrint tokens
  Line tokens -> concatMap prettyPrint tokens
  Linebreak -> "<br />\n"
  Para body -> "<p>" ++ wrapAt70 (prettyPrint body) ++ "</p>\n"
  Table (thead : tokenGrid) ->
    concat
      [ "<table>\n<thead>\n<tr>\n",
        concatMap (cell "th") thead,
        "</tr>\n</thead>\n",
        "<tbody>\n",
        concatMap bodyRow tokenGrid,
        "</tbody>\n</table>\n"
      ]
  _ -> show token
  where
    wrapAt70 = T.unpack . wrapText defaultWrapSettings 70 . T.pack
    -- One header or body cell on its own line, with trailing whitespace removed.
    cell tag content = "<" ++ tag ++ ">" ++ rstrip (prettyPrint content) ++ "</" ++ tag ++ ">\n"
    bodyRow row = "<tr>\n" ++ concatMap (cell "td") row ++ "</tr>\n"

-- Combining two tokens places them side by side in a new 'Document'.
-- NOTE(review): the nesting depends on how the operands are grouped, so this
-- is associative only with respect to the rendered output, not under (==).
instance Semigroup MdToken where
  lhs <> rhs = Document [lhs, rhs]

-- The neutral token is an empty text unit, which renders as the empty string.
-- 'leftmostLongestParse' returns it when a parse fails.
instance Monoid MdToken where
  mempty = Unit []

-- ---------------
-- Helpers
-- Return the first pair whose second component has the same length as the
-- second component of the last pair, or Nothing for an empty list.
leftmostLongest :: (Foldable t) => [(a, t b)] -> Maybe (a, t b)
leftmostLongest [] = Nothing
leftmostLongest candidates = find matchesLast candidates
  where
    targetLength = length (snd (last candidates))
    matchesLast (_, remaining) = length remaining == targetLength

-- Run the parser over the given string and return its result.
-- A failed parse yields 'mempty'; the parse error itself is discarded.
leftmostLongestParse :: (Monoid a) => Parser a -> String -> a
leftmostLongestParse parser input =
  either (const mempty) id (runParser parser "input" (T.pack input))

-- Base set of characters that can be backslash-escaped; extended by
-- 'escapableChars'.
specialChars :: String
specialChars = ">\n\\`*_{}[]#+|"

-- Every character that 'parseEscapedChar' accepts after a backslash.
escapableChars :: String
escapableChars = "-~!.$()" ++ specialChars

-- Apply the parser as many times as it succeeds (possibly zero) and collect
-- the results in order.
greedyParse :: Parser a -> Parser [a]
greedyParse parser = greedyParse1 parser <|> pure []
-- Like greedyParse, but the parser must succeed at least once.
greedyParse1 :: Parser a -> Parser [a]
greedyParse1 parser = (:) <$> parser <*> (greedyParse1 parser <|> pure [])

-- Put the first list in front of the second one.
prepend :: [a] -> [a] -> [a]
prepend = (++)

-- Put the first list behind the second one.
append :: [a] -> [a] -> [a]
append suffix body = body ++ suffix

-- Collect every character up to the next newline or the end of input.
-- The newline, if present, is consumed but not returned.
parseTillEol :: Parser String
parseTillEol = manyTill anySingle lineEnd
  where
    lineEnd :: Parser ()
    lineEnd = (() <$ char '\n') <|> eof

-- Takes a list of parsers. Returns a parser that will try them in
-- order, moving to the next one only if the current one fails.
-- Every parser except the last is wrapped in 'try', so a failed attempt
-- consumes no input before the next alternative runs.
-- An empty list yields a parser that always fails (previously this was an
-- unmatched pattern and crashed at runtime).
fallthroughParser :: [Parser a] -> Parser a
fallthroughParser [] = empty
fallthroughParser [x] = x
fallthroughParser (x : xs) = try x <|> fallthroughParser xs

-- Replace '>', '<' and '&' by their HTML entities; any other character is
-- returned unchanged as a one-character string.
escapeChar :: Char -> String
escapeChar c = case c of
  '&' -> "&amp;"
  '<' -> "&lt;"
  '>' -> "&gt;"
  _ -> [c]

-- Apply 'escapeChar' to every character of a text value.
htmlEscapeChars :: T.Text -> T.Text
htmlEscapeChars text = T.concatMap escaped text
  where
    escaped c = T.pack (escapeChar c)

-- -- Wraps a list of words after (at most) the given number of characters, trying to prevent word-breaks
-- wordwrap :: Int -> String -> String
-- wordwrap wraplength str = if (length str) < wraplength
--     then str
--     else
--         let spaceIndex = lastgtSpaceIndex 0 (takeRev (length str) - wraplength str)
--
--     where
--         takeRev n = (reverse . take n . reverse)
--         lastSpaceIndex counter str = case str of
--             [] -> counter
--             x:xs -> if (isSpace x) counter else lastSpaceIndex counter+1 xs

-- ---------------

-- Parse a markdown header: optional leading whitespace, one to six '#'
-- characters, optional whitespace, then inline tokens up to the end of the
-- line or input. Any newlines directly after the header are consumed too.
parseHeader :: Parser MdToken
parseHeader = do
  space
  level <- length <$> greedyParse1 (char '#')
  -- More than six '#' is not a header.
  guard (level <= 6)
  space
  headerTokens <- manyTill parseLineToken (void (char '\n') <|> eof)
  _ <- greedyParse (char '\n')
  pure (Header level (Line headerTokens))

-- Delimiter for bold text written with asterisks.
asteriskBold :: T.Text
asteriskBold = T.pack "**"

-- Delimiter for bold text written with underscores.
underscoreBold :: T.Text
underscoreBold = T.pack "__"

-- Parse bold text: one or more inline tokens enclosed in a pair of
-- 'asteriskBold' or a pair of 'underscoreBold' delimiters.
parseBold :: Parser MdToken
parseBold = boldBetween asteriskBold <|> boldBetween underscoreBold
  where
    boldBetween marker =
      Bold . Line <$> (string marker *> someTill parseLineToken (string marker))

-- Parse italic text: one or more inline tokens enclosed in a pair of '*' or a
-- pair of '_' characters.
parseItalic :: Parser MdToken
parseItalic = italicBetween '*' <|> italicBetween '_'
  where
    italicBetween marker =
      Italic . Line <$> (char marker *> someTill parseLineToken (char marker))

-- Parse struck-through text: one or more inline tokens enclosed in "~~".
parseStrikethrough :: Parser MdToken
parseStrikethrough =
  Strikethrough . Line <$> (tildes *> someTill parseLineToken tildes)
  where
    tildes :: Parser T.Text
    tildes = string (T.pack "~~")

-- Parse inline code: a run of N backticks, at least one character that is not
-- a newline, and a closing run of N backticks. The content is HTML-escaped
-- with 'escapeChar'.
--
-- The closing fence is matched as a whole, so a span opened with several
-- backticks may contain shorter backtick runs (e.g. ``a`b``). Previously the
-- content scan stopped at the first single backtick, which made such spans
-- fail to parse.
-- NOTE(review): a closing run longer than N is still accepted; only its first
-- N backticks are consumed.
parseCode :: Parser MdToken
parseCode = do
  fenceLength <- length <$> some (char '`')
  -- 'try' lets a partial match of the fence fall back to ordinary content.
  let closingFence = try (count fenceLength (char '`'))
  inside <- someTill (satisfy (/= '\n')) closingFence
  return (Code (Unit (concatMap escapeChar inside)))

-- Parse a link of the form [text](url). The text is a sequence of inline
-- tokens; the URL is every character up to the closing parenthesis.
parseLink :: Parser MdToken
parseLink = do
  linkTokens <- char '[' *> manyTill parseLineToken (char ']')
  target <- char '(' *> manyTill anySingle (char ')')
  pure (Link (Line linkTokens) (URL target))

-- Parse a hard line break: either two or more spaces followed by a newline,
-- or a backslash followed by a newline.
parseLinebreak :: Parser MdToken
parseLinebreak = trailingSpaces <|> trailingBackslash
  where
    trailingSpaces :: Parser MdToken
    trailingSpaces = Linebreak <$ (char ' ' *> some (char ' ') *> char '\n')
    trailingBackslash :: Parser MdToken
    trailingBackslash = try (Linebreak <$ (char '\\' *> char '\n'))

-- Parse one table row: a leading '|' followed by one or more cells, each
-- terminated by '|'. Spaces and tabs at the start of a cell are skipped; the
-- rest of the cell is parsed as newline-free inline tokens.
parseTableRow :: Parser [MdToken]
parseTableRow = map Line <$> (char '|' *> some cell)
  where
    cell :: Parser [MdToken]
    cell = many (satisfy isBlank) *> someTill parseListLineToken (char '|')
    isBlank c = c == ' ' || c == '\t'

-- Parse a table: a header row, a separator line made of '|' and runs of '-',
-- then zero or more body rows separated by newlines. Newlines after the table
-- are consumed.
parseTable :: Parser MdToken
parseTable = do
  headerRow <- parseTableRow <* char '\n'
  -- Separator line, e.g. |---|---|
  _ <- char '|' *> sepEndBy1 (some (char '-')) (char '|') *> char '\n'
  bodyRows <- sepEndBy parseTableRow (char '\n')
  _ <- many (char '\n')
  pure (Table (headerRow : bodyRows))

-- Parse a newline. If it is the last character of the input the result is an
-- empty text unit; otherwise it is 'SingleNewline'.
parseSingleNewline :: Parser MdToken
parseSingleNewline = do
  _ <- char '\n'
  rest <- getInput
  pure (if T.null rest then Unit "" else SingleNewline)

-- Parse a brace-enclosed list of one or more CSS classes, e.g. {.wide .dark}.
-- Each class starts with a dot, then a letter, '_' or '-', followed by any
-- number of letters, digits, '_' or '-'. Whitespace after a class is skipped.
-- The returned names do not include the dot.
parseCssClasses :: Parser [CssClass]
parseCssClasses = between (char '{') (char '}') (some cssClass)
  where
    cssClass :: Parser CssClass
    cssClass = do
      _ <- char '.'
      firstChar <- identStart
      rest <- many (identStart <|> label "digit" (satisfy isDigit))
      space
      pure (firstChar : rest)
    identStart :: Parser Char
    identStart = char '_' <|> char '-' <|> label "letter" (satisfy isAlpha)

-- Parse an image: '!' followed by a link, optionally followed by a CSS class
-- list. The link text becomes the alt text and the link URL the source.
parseImage :: Parser MdToken
parseImage = do
  parsedLink <- char '!' *> parseLink
  classes <- optional (try parseCssClasses)
  case parsedLink of
    Link altText target -> pure (Image altText target classes)
    -- parseLink only ever builds a Link, so this branch should be unreachable.
    _ -> empty

-- Parse a figure: an image that is followed directly by a blank line
-- ('doubleNewlineText') or by the end of input. The image is re-tagged as a
-- 'Figure' so that it is rendered with a caption.
parseFigure :: Parser MdToken
parseFigure = do
  img <- parseImage
  void (string doubleNewlineText) <|> eof
  return $ case img of
    Image text path cssClasses -> Figure text path cssClasses
    -- parseImage only ever builds an Image; anything else is passed through.
    _ -> img

-- Parse a backslash followed by one of 'escapableChars' and return that
-- character (without the backslash) as a text unit.
parseEscapedChar :: Parser MdToken
parseEscapedChar =
  Unit . (: []) <$> (char '\\' *> choice (map char escapableChars))

-- Parse any single character and wrap it in a text unit.
parseUnit :: Parser MdToken
parseUnit = Unit . (: []) <$> anySingle

-- Parse any single character other than a newline and wrap it in a text unit.
parseUnitExceptNewline :: Parser MdToken
parseUnitExceptNewline = Unit . (: []) <$> satisfy (/= '\n')

-- Inline parsers in the order in which 'parseLineToken' tries them.
-- The order matters: parseBold must come before parseItalic because an italic
-- delimiter is a prefix of the matching bold delimiter, and parseUnit comes
-- last because it accepts any character.
lineParsers :: [Parser MdToken]
lineParsers =
  [ parseLinebreak,
    parseSingleNewline,
    parseEscapedChar,
    parseCode,
    parseImage,
    parseBold,
    parseItalic,
    parseStrikethrough,
    parseLink,
    parseUnit
  ] -- A 'line' doesn't include a 'header'

-- Inline parsers in the order in which 'parseListLineToken' tries them.
-- Compared with 'lineParsers' the two newline-related parsers are missing and
-- the final catch-all rejects newlines.
lineParsersWithoutNewline :: [Parser MdToken]
lineParsersWithoutNewline =
  [ parseEscapedChar,
    parseCode,
    parseImage,
    parseBold,
    parseItalic,
    parseStrikethrough,
    parseLink,
    parseUnitExceptNewline
  ] -- A list line cannot contain newlines.

-- The header parser followed by every inline parser.
-- NOTE(review): not referenced in the visible part of this module; the
-- document-level list actually used by 'parseDocument' is 'documentParsers'.
allParsers :: [Parser MdToken]
allParsers = parseHeader : lineParsers

-- Parse a single inline token by trying each of 'lineParsers' in order.
parseLineToken :: Parser MdToken
parseLineToken = fallthroughParser lineParsers

-- Parse a single inline token that cannot be a newline, by trying each of
-- 'lineParsersWithoutNewline' in order.
parseListLineToken :: Parser MdToken
parseListLineToken = fallthroughParser lineParsersWithoutNewline

-- Skip leading whitespace, then parse inline tokens until the end of input.
-- The result may hold no tokens at all if only whitespace remains.
parseLine :: Parser MdToken
parseLine = Line <$> (space *> manyTill parseLineToken eof)

-- Parse a paragraph: leading whitespace is skipped, then one or more inline
-- tokens are read (possibly across several lines) until the paragraph ends.
-- Newlines following the paragraph are consumed.
parsePara :: Parser MdToken
parsePara = do
  space
  body <- someTill parseLineToken (try paragraphEnd)
  _ <- many (char '\n')
  pure (Para (Line body))
  where
    -- A paragraph ends at a blank line, at a newline directly followed by a
    -- '>' (which is left unconsumed), or at the end of input.
    paragraphEnd :: Parser ()
    paragraphEnd = blankLineOrQuote <|> eof
    blankLineOrQuote :: Parser ()
    blankLineOrQuote = void (char '\n' *> (char '\n' <|> lookAhead (char '>')))

-- Parse one line that starts with '>'. The marker and any spaces or tabs
-- after it are dropped; the rest of the line is returned without its
-- terminating newline.
parseQuotedLine :: Parser String
parseQuotedLine = do
  _ <- char '>' *> many (char ' ' <|> char '\t')
  content <- many (satisfy (/= '\n'))
  content <$ (void (char '\n') <|> eof)

-- Parse one or more consecutive quoted lines (see 'parseQuotedLine'),
-- stopping at the first line that does not start with '>'.
parseQuotedLines :: Parser [String]
parseQuotedLines = some parseQuotedLine

-- some $ do
--   getInput >>= \line ->
--     case T.unpack line of
--       ('>' : _) -> parseQuotedLine
--       _ -> empty

-- Parse a blockquote: a run of lines starting with '>'. The markers are
-- stripped and the remaining text is parsed again as a sequence of nested
-- blockquotes and paragraphs. If that inner parse fails the blockquote is
-- empty.
parseBlockquote :: Parser MdToken
parseBlockquote = do
  quoted <- parseQuotedLines
  -- Re-join the stripped lines with newlines between them (no trailing one).
  let innerSource = intercalate "\n" quoted
      innerTokens = leftmostLongestParse (some (parseBlockquote <|> parsePara)) innerSource
  pure (Blockquote innerTokens)

-- Parse a nested list: one or more lines indented by four spaces or a tab.
-- The indentation is stripped and the remaining text is parsed as an
-- unordered or ordered list. Fails if that inner parse renders to nothing.
parseListNested :: Parser MdToken
parseListNested = do
  indentedLines <- greedyParse1 (indent *> restOfLine)
  let nested =
        leftmostLongestParse
          (parseUnorderedList <|> parseOrderedList)
          (intercalate "\n" indentedLines)
  -- A failed inner parse yields the empty token, whose rendering is "".
  guard (not (null (show nested)))
  pure nested
  where
    indent :: Parser T.Text
    indent = string (T.pack "    ") <|> string (T.pack "\t")
    restOfLine :: Parser String
    restOfLine = manyTill anySingle (void (char '\n') <|> eof)

-- Parse an unordered list line item: a '*', '+' or '-' bullet, at least one
-- space, then the item content.
parseUListLineItem :: Parser MdToken
parseUListLineItem =
  choice (map char ['*', '+', '-']) *> char ' ' *> parseListLineItemCommon

-- Parse an ordered list line item: one or more digits, a '.', at least one
-- space, then the item content. The number itself is discarded.
parseOListLineItem :: Parser MdToken
parseOListLineItem =
  greedyParse1 (satisfy isDigit) *> char '.' *> char ' ' *> parseListLineItemCommon

-- Shared tail of both list line item parsers: skip whitespace, read inline
-- tokens up to the end of the line, then pick up an optional nested list.
-- When there is no nested list an empty text unit takes its place.
parseListLineItemCommon :: Parser MdToken
parseListLineItemCommon = do
  space
  itemTokens <- manyTill parseListLineToken (void (char '\n') <|> eof)
  sublist <- option (Unit "") (try parseListNested)
  pure (Line [Line itemTokens, sublist])

-- Parse an unordered list paragraph item: a line item followed by indented
-- continuation blocks. The first line is wrapped in a paragraph. The Document
-- wrapper only serves to turn the token list into a single token.
parseUListParaItem :: Parser MdToken
parseUListParaItem = do
  heading <- parseUListLineItem
  Document . (Para heading :) <$> parseListParaItemCommon

-- Parse an ordered list paragraph item: a line item followed by indented
-- continuation blocks. The first line is wrapped in a paragraph. The Document
-- wrapper only serves to turn the token list into a single token.
parseOListParaItem :: Parser MdToken
parseOListParaItem = do
  heading <- parseOListLineItem
  Document . (Para heading :) <$> parseListParaItemCommon

-- Shared tail of both list paragraph item parsers. Expects an empty line,
-- then one or more lines indented by four spaces or a tab, then another
-- newline. The indentation is stripped and the text is parsed as either a
-- run of blockquotes or a run of paragraphs; a failed inner parse gives [].
parseListParaItemCommon :: Parser [MdToken]
parseListParaItemCommon = do
  _ <- char '\n'
  indented <- greedyParse1 (indent *> parseTillEol)
  _ <- char '\n'
  pure (leftmostLongestParse continuation (intercalate "\n" indented))
  where
    indent :: Parser T.Text
    indent = string (T.pack "    ") <|> string (T.pack "\t")
    continuation :: Parser [MdToken]
    continuation = greedyParse1 parseBlockquote <|> greedyParse1 parsePara

-- Parse an unordered list item after skipping leading whitespace. The
-- paragraph form is tried first; if it fails the input is rewound and the
-- plain line form is used.
parseUListItem :: Parser MdToken
parseUListItem = space *> (try parseUListParaItem <|> parseUListLineItem)

-- Parse an unordered list: one or more items, terminated by an extra newline
-- or the end of input.
parseUnorderedList :: Parser MdToken
parseUnorderedList =
  UnordList <$> some (try parseUListItem) <* (void (char '\n') <|> eof)

-- --------

-- Parse an ordered list item after skipping leading whitespace. The
-- paragraph form is tried first; if it fails the input is rewound and the
-- plain line form is used.
parseOListItem :: Parser MdToken
parseOListItem = space *> (try parseOListParaItem <|> parseOListLineItem)

-- Parse the first item of an ordered list. After leading whitespace the
-- input must begin with "1."; the item is then read as a plain line item.
parseFirstOListItem :: Parser MdToken
parseFirstOListItem = do
  space
  upcoming <- getInput
  guard (T.pack "1." `T.isPrefixOf` upcoming)
  parseOListLineItem

-- Parse an ordered list: a first item starting with "1.", any number of
-- further items, then an extra newline or the end of input.
parseOrderedList :: Parser MdToken
parseOrderedList =
  OrdList <$> items <* (void (char '\n') <|> eof)
  where
    items :: Parser [MdToken]
    items = (:) <$> try parseFirstOListItem <*> many (try parseOListItem)

-- Source text that 'parseHorizontalRule' recognises as a horizontal rule.
horizontalRuleText :: T.Text
horizontalRuleText = T.pack "---"

-- Two consecutive newlines, i.e. the end of a line followed by a blank line.
doubleNewlineText :: T.Text
doubleNewlineText = T.pack "\n\n"

-- Parse a horizontal rule: 'horizontalRuleText' followed by a blank line or
-- the end of input.
parseHorizontalRule :: Parser MdToken
parseHorizontalRule = do
  _ <- string horizontalRuleText
  void (string doubleNewlineText) <|> eof
  pure HorizontalRule

-- Parse a fenced code block: a line consisting of three backticks, at least
-- one character of content, and a closing fence at the start of a line. The
-- content is HTML-escaped with 'escapeChar'.
parseCodeblock :: Parser MdToken
parseCodeblock =
  Codeblock . Unit . concatMap escapeChar <$> (opening *> someTill anySingle closing)
  where
    opening :: Parser T.Text
    opening = string (T.pack "```\n")
    closing :: Parser T.Text
    closing = string (T.pack "\n```")

-- Block-level parsers in the order in which 'parseDocument' tries them.
-- parsePara comes last: it is the fallback for anything the more specific
-- parsers reject.
documentParsers :: [Parser MdToken]
documentParsers =
  [ parseHorizontalRule,
    parseCodeblock,
    parseTable,
    parseHeader,
    parseBlockquote,
    parseUnorderedList,
    parseOrderedList,
    parseFigure,
    parsePara
  ]

-- Parse a whole document: block-level elements, each recognised by the first
-- matching entry of 'documentParsers', repeated until the end of input.
parseDocument :: Parser MdToken
parseDocument = Document <$> manyTill (fallthroughParser documentParsers) eof