md-to-html/src/MdToHTML.hs

module MdToHTML where

import Control.Applicative
import Control.Monad
import Data.List
import Debug.Trace
import Text.ParserCombinators.ReadP
import Text.Printf

type HeaderLevel = Int

newtype URL = URL {getUrl :: String}

newtype ImgPath = ImgPath {getPath :: String}

parseMany :: ReadP a -> ReadP [a]
parseMany = Text.ParserCombinators.ReadP.many

data MdToken
  = Document [MdToken]
  | Header HeaderLevel MdToken
  | Para MdToken
  | Line [MdToken]
  | SingleNewline -- A single newline is rendered as a space.
  | Linebreak
  | HorizontalRule
  | Blockquote MdToken
  | UnordList [MdToken]
  | OrdList [MdToken]
  | Code String
  | Codeblock String
  | Link MdToken URL
  | Image MdToken ImgPath
  | Bold MdToken
  | Italic MdToken
  | Strikethrough MdToken
  | Unit String

-- Deriving Show for MdToken
instance Show MdToken where
  show (Document tokens) = concat (map show tokens)
  show (Header level token) = "<h" ++ show level ++ ">" ++ show token ++ "</h" ++ show level ++ ">"
  show (Para token) = "<p>" ++ show token ++ "</p>"
  show (Line tokens) = concat (map show tokens)
  show Linebreak = "<br>"
  show SingleNewline = " "
  show HorizontalRule = "---------"
  show (Blockquote token) = "BLOCK" ++ show token
  show (UnordList tokens) = "UNORD" ++ concat (map show tokens)
  show (OrdList tokens) = "ORD" ++ concat (map show tokens)
  show (Code code) = show code
  show (Codeblock code) = show code
  show (Link txt url) = "<a href=" ++ (getUrl url) ++ ">" ++ show txt ++ "</a>"
  show (Image txt imgPath) = "<img src=" ++ (getPath imgPath) ++ ">" ++ show txt ++ "</img>"
  show (Bold token) = "<b>" ++ show token ++ "</b>"
  show (Italic token) = "<i>" ++ show token ++ "</i>"
  show (Strikethrough token) = "<s>" ++ show token ++ "</s>"
  show (Unit unit) = printf "%s" unit

-- ---------------
-- Helpers
leftmostLongest :: (Foldable t) => [(a, t b)] -> (a, t b)
leftmostLongest xs =
  let lastElem = (last xs)
      filteredLst = (filter (\val -> (length $ snd val) == (length $ snd lastElem)) xs)
   in head filteredLst

-- Get the first parse returned by readP_to_S that consumed the most input
leftmostLongestParse :: ReadP a -> String -> (a, String)
leftmostLongestParse parser input = leftmostLongest $ readP_to_S parser input

-- Parse if the string that's left matches the string comparator function
lookaheadParse :: (String -> Bool) -> ReadP Char
lookaheadParse stringCmp = do
  lookahead <- look
  case stringCmp lookahead of
    True -> get
    False -> pfail

lineToList :: MdToken -> [MdToken]
lineToList (Line tokens) = tokens

-- ---------------

-- Parse a markdown header, denoted by 1-6 #'s followed by some text, followed by EOL.
parseHeader :: ReadP MdToken
parseHeader = do
  skipSpaces
  headers <- munch1 (== '#')
  when
    ((length headers) > 6)
    pfail
  skipSpaces
  text <- munch1 (/= '\n')
  Text.ParserCombinators.ReadP.optional (char '\n')
  let parsedText = fst $ leftmostLongestParse parseLine text
  return (Header (length headers) parsedText)

-- Parse bold text
parseBold :: ReadP MdToken
parseBold = do
  text <-
    choice
      [ between (string "__") (string "__") (many1 (lookaheadParse (/= "__"))),
        between (string "**") (string "**") (many1 (lookaheadParse (/= "**")))
      ]
  let parsedText = fst $ leftmostLongestParse parseLine text
  return (Bold parsedText)

-- Parse italic text
parseItalic :: ReadP MdToken
parseItalic = do
  text <-
    choice
      [ between (string "_") (string "_") (munch1 (/= '_')),
        between (string "*") (string "*") (munch1 (/= '*'))
      ]
  let parsedText = fst $ leftmostLongestParse parseLine text
  return (Italic parsedText)

-- Parse a linebreak character
parseLinebreak :: ReadP MdToken
parseLinebreak = do
  char ' '
  many1 (char ' ')
  char '\n'
  return Linebreak

parseSingleNewline :: ReadP MdToken
parseSingleNewline = do
  char '\n'
  return SingleNewline

-- Parse a regular string as a Unit.
parseString :: ReadP MdToken
parseString = do
  firstChar <- satisfy (/= '\n') -- Must parse at least one non-newline character here
  text <- munch (\x -> not (elem x "#*_[\n "))
  return (Unit (firstChar : text))

lineParsers :: [ReadP MdToken]
lineParsers = [parseLinebreak, parseSingleNewline, parseBold, parseItalic, parseString] -- A 'line' doesn't include a 'header'
--lineParsers = [parseSingleNewline, parseString]

-- List of all parsers
allParsers :: [ReadP MdToken]
allParsers = parseHeader : lineParsers

-- Parse any of the above tokens.
parseLineToken :: ReadP MdToken
parseLineToken = choice lineParsers

-- Parse a line, consisting of one or more tokens.
parseLine :: ReadP MdToken
parseLine = do
  skipSpaces
  -- Fail if we have reached the end of the document.
  remaining <- look
  when (null remaining) pfail
  parsed <- parseMany parseLineToken
  return (Line parsed)

-- Parse a paragraph, which is a 'Line' (can span multiple actual lines), separated by double-newlines.
-- As a weird special case, a 'Paragraph' can also be a 'Header'.
parsePara :: ReadP MdToken
parsePara = do
  parseMany (char '\n')
  --  text <- many1 (lookaheadParse (\x -> ((length x) < 2) || (take 2 x) /= "\n\n")) -- Parse until a double-newline.
  --  string "\n\n" <|> (eof >> return "") -- Consume the next double-newline or EOF.
  text <- (manyTill get ((string "\n\n") <|> (eof >> return "")))
  when (null text) pfail
  let parsedText = fst $ leftmostLongestParse (parseHeader <|> parseLine) text -- Parse either a line or a header.
  -- If the paragraph is a header, return a Header token. Otheriwse return a Para token.
  case parsedText of
    Header level token -> return (Header level token)
    _ -> return (Para parsedText)

-- Parse a document, which is multiple paragraphs.
parseDocument :: ReadP MdToken
parseDocument = (many1 parsePara) >>= (\res -> return (Document (res)))