pandoc/src/Text/Pandoc/Writers/HTML.hs

{-
Copyright (C) 2006-7 John MacFarlane <jgm@berkeley.edu>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-}

{- |
   Module      : Text.Pandoc.Writers.HTML 
   Copyright   : Copyright (C) 2006-7 John MacFarlane
   License     : GNU GPL, version 2 or above 

   Maintainer  : John MacFarlane <jgm@berkeley.edu>
   Stability   : alpha
   Portability : portable

Conversion of 'Pandoc' documents to HTML.
-}
module Text.Pandoc.Writers.HTML ( writeHtml , writeHtmlString ) where
import Text.Pandoc.Definition
import Text.Pandoc.ASCIIMathML
import Text.Pandoc.CharacterReferences ( decodeCharacterReferences )
import Text.Pandoc.Shared
import Text.Regex ( mkRegex, matchRegex )
import Numeric ( showHex )
import Data.Char ( ord, toLower )
import Data.List ( isPrefixOf, intersperse )
import qualified Data.Set as S
import Control.Monad.State
import Text.XHtml.Transitional

data WriterState = WriterState
    { stNotes            :: [Html]       -- ^ List of notes
    , stIds              :: [String]     -- ^ List of header identifiers
    , stMath             :: Bool         -- ^ Math is used in document
    , stCSS              :: S.Set String -- ^ CSS to include in header
    } deriving Show

defaultWriterState :: WriterState
defaultWriterState = WriterState {stNotes= [], stIds = [], 
                                  stMath = False, stCSS = S.empty}

-- | Convert Pandoc document to Html string.
writeHtmlString :: WriterOptions -> Pandoc -> String
writeHtmlString opts = 
  if writerStandalone opts
     then renderHtml . writeHtml opts
     else renderHtmlFragment . writeHtml opts

-- | Convert Pandoc document to Html structure.
writeHtml :: WriterOptions -> Pandoc -> Html
writeHtml opts (Pandoc (Meta tit authors date) blocks) = 
  let titlePrefix = writerTitlePrefix opts
      topTitle    = evalState (inlineListToHtml opts tit) defaultWriterState
      topTitle'   = if null titlePrefix
                        then topTitle
                        else titlePrefix +++ " - " +++ topTitle
      metadata    = thetitle topTitle' +++ 
                    meta ! [httpequiv "Content-Type", 
                            content "text/html; charset=UTF-8"] +++
                    meta ! [name "generator", content "pandoc"] +++
                    (toHtmlFromList $ 
                    map (\a -> meta ! [name "author", content a]) authors) +++
                    (if null date
                       then noHtml
                       else meta ! [name "date", content date])
      titleHeader = if writerStandalone opts && not (null tit) && 
                    not (writerS5 opts)
                        then h1 ! [theclass "title"] $ topTitle
                        else noHtml
      headerBlocks = filter isHeaderBlock blocks
      ids          = uniqueIdentifiers $ 
                     map (\(Header _ lst) -> lst) headerBlocks
      toc          = if writerTableOfContents opts 
                        then tableOfContents opts headerBlocks ids
                        else noHtml
      (blocks', newstate) = 
                     runState (blockListToHtml opts blocks)
                     (defaultWriterState {stIds = ids})
      cssLines     = stCSS newstate
      css          = if S.null cssLines
                        then noHtml
                        else style ! [thetype "text/css"] $ primHtml $
                             '\n':(unlines $ S.toList cssLines)
      math         = if stMath newstate
                        then case writerASCIIMathMLURL opts of
                                   Just path -> script !  [src path, 
                                                thetype "text/javascript"] $
                                                noHtml
                                   Nothing   -> primHtml asciiMathMLScript
                        else noHtml
      head         = header $ metadata +++ math +++ css +++ 
                              primHtml (writerHeader opts)
      notes        = reverse (stNotes newstate)
      before       = primHtml $ writerIncludeBefore opts
      after        = primHtml $ writerIncludeAfter opts
      thebody      = before +++ titleHeader +++ toc +++ blocks' +++
                     footnoteSection opts notes +++ after
  in  if writerStandalone opts
         then head +++ body thebody
         else thebody

-- | Construct table of contents from list of header blocks and identifiers.
-- Assumes there are as many identifiers as header blocks.
tableOfContents :: WriterOptions -> [Block] -> [String] -> Html
tableOfContents opts headers ids =
  let opts'        = opts { writerIgnoreNotes = True }
      contentsTree = hierarchicalize headers
      contents     = evalState (mapM (elementToListItem opts') contentsTree) 
                     (defaultWriterState {stIds = ids})
  in  thediv ! [identifier "toc"] $ unordList contents

-- | Converts an Element to a list item for a table of contents,
-- retrieving the appropriate identifier from state.
elementToListItem :: WriterOptions -> Element -> State WriterState Html
elementToListItem opts (Blk _) = return noHtml
elementToListItem opts (Sec headerText subsecs) = do
  st <- get
  let ids = stIds st
  let (id, rest) = if null ids
                      then ("", [])
                      else (head ids, tail ids)
  put $ st {stIds = rest}
  txt <- inlineListToHtml opts headerText
  subHeads <- mapM (elementToListItem opts) subsecs
  let subList = if null subHeads
                   then noHtml
                   else unordList subHeads 
  return $ (anchor ! [href ("#" ++ id), identifier ("TOC-" ++ id)] $ txt) +++ 
           subList

-- | Convert list of Note blocks to a footnote <div>.
-- Assumes notes are sorted.
footnoteSection :: WriterOptions -> [Html] -> Html
footnoteSection opts notes =
  if null notes 
     then noHtml
     else thediv ! [theclass "footnotes"] $ hr +++ (olist << notes)

-- | Obfuscate a "mailto:" link using Javascript.
obfuscateLink :: WriterOptions -> String -> String -> Html
obfuscateLink opts text src =
  let emailRegex = mkRegex "^mailto:([^@]*)@(.*)$"
      src'       = map toLower src
  in  case (matchRegex emailRegex src') of
        (Just [name, domain]) ->
          let domain'  = substitute "." " dot " domain
              at'      = obfuscateChar '@'
              (linkText, altText) = 
                 if text == drop 7 src' -- autolink
                    then ("'<code>'+e+'</code>'", name ++ " at " ++ domain')
                    else ("'" ++ text ++ "'", text ++ " (" ++ name ++ " at " ++ 
                          domain' ++ ")")
          in  if writerStrictMarkdown opts
                then -- need to use primHtml or &'s are escaped to &amp; in URL
                     primHtml $ "<a href=\"" ++ (obfuscateString src')
                     ++ "\">" ++ (obfuscateString text) ++ "</a>"
                else (script ! [thetype "text/javascript"] $
                     primHtml ("\n<!--\nh='" ++ 
                     obfuscateString domain ++ "';a='" ++ at' ++ "';n='" ++ 
                     obfuscateString name ++ "';e=n+a+h;\n" ++
                     "document.write('<a h'+'ref'+'=\"ma'+'ilto'+':'+e+'\">'+" ++ 
                     linkText  ++ "+'<\\/'+'a'+'>');\n// -->\n")) +++  
                     noscript (primHtml $ obfuscateString altText)
        _ -> anchor ! [href src] $ primHtml text  -- malformed email

-- | Obfuscate character as entity.
obfuscateChar :: Char -> String
obfuscateChar char = 
  let num    = ord char
      numstr = if even num then show num else "x" ++ showHex num ""
  in  "&#" ++ numstr ++ ";"

-- | Obfuscate string using entities.
obfuscateString :: String -> String
obfuscateString = concatMap obfuscateChar . decodeCharacterReferences

-- | True if character is a punctuation character (unicode).
isPunctuation :: Char -> Bool
isPunctuation c =
  let c' = ord c
  in  if c `elem` "!\"'()*,-./:;<>?[\\]`{|}~" || c' >= 0x2000 && c' <= 0x206F ||
         c' >= 0xE000 && c' <= 0xE0FF
         then True
         else False

-- | Add CSS for document header.
addToCSS :: String -> State WriterState ()
addToCSS item = do
  st <- get
  let current = stCSS st
  put $ st {stCSS = S.insert item current}

-- | Convert Pandoc inline list to plain text identifier.
inlineListToIdentifier :: [Inline] -> String
inlineListToIdentifier [] = ""
inlineListToIdentifier (x:xs) = 
  xAsText ++ inlineListToIdentifier xs
  where xAsText = case x of
          Str s          -> filter (\c -> c == '-' || not (isPunctuation c)) $
                            concat $ intersperse "-" $ words $ map toLower s
          Emph lst       -> inlineListToIdentifier lst
          Strikeout lst  -> inlineListToIdentifier lst
          Superscript lst -> inlineListToIdentifier lst
          Subscript lst  -> inlineListToIdentifier lst
          Strong lst     -> inlineListToIdentifier lst
          Quoted _ lst   -> inlineListToIdentifier lst
          Code s         -> s
          Space          -> "-"
          EmDash         -> "-"
          EnDash         -> "-"
          Apostrophe     -> ""
          Ellipses       -> ""
          LineBreak      -> "-"
          TeX _          -> ""
          HtmlInline _   -> ""
          Link lst _     -> inlineListToIdentifier lst
          Image lst _    -> inlineListToIdentifier lst
          Note _         -> ""

-- | Return unique identifiers for list of inline lists.
uniqueIdentifiers :: [[Inline]] -> [String]
uniqueIdentifiers ls =
  let addIdentifier (nonuniqueIds, uniqueIds) l =
        let new = inlineListToIdentifier l
            matches = length $ filter (== new) nonuniqueIds
            new' = new ++ if matches > 0 then ("-" ++ show matches) else ""
        in  (new:nonuniqueIds, new':uniqueIds)
  in  reverse $ snd $ foldl addIdentifier ([],[]) ls

-- | Convert Pandoc block element to HTML.
blockToHtml :: WriterOptions -> Block -> State WriterState Html
blockToHtml opts Null = return $ noHtml 
blockToHtml opts (Plain lst) = inlineListToHtml opts lst
blockToHtml opts (Para lst) = inlineListToHtml opts lst >>= (return . paragraph)
blockToHtml opts (RawHtml str) = return $ primHtml str
blockToHtml opts (HorizontalRule) = return $ hr
blockToHtml opts (CodeBlock str) = return $ pre $ thecode << (str ++ "\n") 
                       -- the final \n for consistency with Markdown.pl
blockToHtml opts (BlockQuote blocks) =
  -- in S5, treat list in blockquote specially
  -- if default is incremental, make it nonincremental; 
  -- otherwise incremental
  if writerS5 opts
     then let inc = not (writerIncremental opts) in
          case blocks of 
             [BulletList lst]  -> blockToHtml (opts {writerIncremental = inc})
                                  (BulletList lst)
             [OrderedList attribs lst] -> 
                                  blockToHtml (opts {writerIncremental = inc})
                                  (OrderedList attribs lst)
             otherwise         -> blockListToHtml opts blocks >>= 
                                  (return . blockquote)
     else blockListToHtml opts blocks >>= (return . blockquote)
blockToHtml opts (Header level lst) = do 
  contents <- inlineListToHtml opts lst
  st <- get
  let ids = stIds st
  let (id, rest) = if null ids
                      then ("", [])
                      else (head ids, tail ids)
  put $ st {stIds = rest}
  let attribs = if writerStrictMarkdown opts && not (writerTableOfContents opts)
                   then []
                   else [identifier id]
  let headerHtml = case level of
              1 -> h1 contents ! attribs
              2 -> h2 contents ! attribs
              3 -> h3 contents ! attribs
              4 -> h4 contents ! attribs
              5 -> h5 contents ! attribs
              6 -> h6 contents ! attribs
              _ -> paragraph contents ! attribs
  return $ if writerTableOfContents opts
              then anchor ! [href ("#TOC-" ++ id)] $ headerHtml
              else headerHtml
blockToHtml opts (BulletList lst) = do
  contents <- mapM (blockListToHtml opts) lst
  let attribs = if writerIncremental opts
                   then [theclass "incremental"]
                   else []
  return $ unordList ! attribs $ contents
blockToHtml opts (OrderedList (startnum, numstyle, _) lst) = do
  contents <- mapM (blockListToHtml opts) lst
  let numstyle' = camelCaseToHyphenated $ show numstyle
  let attribs = (if writerIncremental opts
                   then [theclass "incremental"]
                   else []) ++
                (if startnum /= 1
                   then [start startnum]
                   else []) ++
                (if numstyle /= DefaultStyle
                   then [theclass numstyle']
                   else [])
  if numstyle /= DefaultStyle
     then addToCSS $ "ol." ++ numstyle' ++ 
                     " { list-style-type: " ++ 
                     numstyle' ++ "; }"
     else return ()
  return $ ordList ! attribs $ contents
blockToHtml opts (DefinitionList lst) = do
  contents <- mapM (\(term, def) -> do term' <- inlineListToHtml opts term
                                       def' <- blockListToHtml opts def
                                       return $ (term', def')) lst
  let attribs = if writerIncremental opts
                   then [theclass "incremental"]
                   else []
  return $ defList ! attribs $ contents
blockToHtml opts (Table capt aligns widths headers rows) = do
  let alignStrings = map alignmentToString aligns
  captionDoc <- if null capt
                   then return noHtml
                   else inlineListToHtml opts capt >>= return . caption
  colHeads <- colHeadsToHtml opts alignStrings 
                             widths headers
  rows' <- mapM (tableRowToHtml opts alignStrings) rows
  return $ table $ captionDoc +++ colHeads +++ rows'

colHeadsToHtml opts alignStrings widths headers = do
  heads <- sequence $ zipWith3 
           (\align width item -> tableItemToHtml opts th align width item) 
           alignStrings widths headers
  return $ tr $ toHtmlFromList heads

alignmentToString alignment = case alignment of
                                 AlignLeft    -> "left"
                                 AlignRight   -> "right"
                                 AlignCenter  -> "center"
                                 AlignDefault -> "left"

tableRowToHtml opts aligns cols = 
  (sequence $ zipWith3 (tableItemToHtml opts td) aligns (repeat 0) cols) >>=
  return . tr . toHtmlFromList

tableItemToHtml opts tag align' width item = do
  contents <- blockListToHtml opts item
  let attrib = [align align'] ++ 
               if width /= 0 
                  then [thestyle ("{width: " ++ show (truncate (100*width)) ++ 
                        "%;}")]
                  else [] 
  return $ tag ! attrib $ contents

blockListToHtml :: WriterOptions -> [Block] -> State WriterState Html
blockListToHtml opts lst = 
  mapM (blockToHtml opts) lst >>= return . toHtmlFromList

-- | Convert list of Pandoc inline elements to HTML.
inlineListToHtml :: WriterOptions -> [Inline] -> State WriterState Html
inlineListToHtml opts lst = 
  mapM (inlineToHtml opts) lst >>= return . toHtmlFromList

-- | Convert Pandoc inline element to HTML.
inlineToHtml :: WriterOptions -> Inline -> State WriterState Html
inlineToHtml opts inline =
  case inline of  
    (Str str)        -> return $ stringToHtml str
    (Space)          -> return $ stringToHtml " "
    (LineBreak)      -> return $ br
    (EmDash)         -> return $ primHtmlChar "mdash"
    (EnDash)         -> return $ primHtmlChar "ndash"
    (Ellipses)       -> return $ primHtmlChar "hellip"
    (Apostrophe)     -> return $ primHtmlChar "rsquo"
    (Emph lst)       -> inlineListToHtml opts lst >>= return . emphasize
    (Strong lst)     -> inlineListToHtml opts lst >>= return . strong
    (Code str)       -> return $ thecode << str
    (Strikeout lst)  -> addToCSS 
                        ".strikeout { text-decoration: line-through; }" >> 
                        inlineListToHtml opts lst >>=
                        return . (thespan ! [theclass "strikeout"])
    (Superscript lst) -> inlineListToHtml opts lst >>= return . sup
    (Subscript lst)   -> inlineListToHtml opts lst >>= return . sub
    (Quoted quoteType lst) ->
                        let (leftQuote, rightQuote) = case quoteType of
                              SingleQuote -> (primHtmlChar "lsquo", 
                                              primHtmlChar "rsquo")
                              DoubleQuote -> (primHtmlChar "ldquo", 
                                              primHtmlChar "rdquo")
                        in  do contents <- inlineListToHtml opts lst
                               return $ leftQuote +++ contents +++ rightQuote
    (TeX str)        -> (if writerUseASCIIMathML opts
                            then modify (\st -> st {stMath = True})
                            else return ()) >> return (stringToHtml str)
    (HtmlInline str) -> return $ primHtml str 
    (Link [Code str] (src,tit)) | "mailto:" `isPrefixOf` src ->
                        return $ obfuscateLink opts str src
    (Link txt (src,tit)) | "mailto:" `isPrefixOf` src -> do
                        linkText <- inlineListToHtml opts txt  
                        return $ obfuscateLink opts (show linkText) src
    (Link txt (src,tit)) -> do
                        linkText <- inlineListToHtml opts txt
                        return $ anchor ! ([href src] ++ 
                                 if null tit then [] else [title tit]) $ 
                                 linkText
    (Image txt (source,tit)) -> do
                        alternate <- inlineListToHtml opts txt
                        let alternate' = renderHtmlFragment alternate
                        let attributes = [src source, title tit] ++ 
                                         if null txt 
                                            then [] 
                                            else [alt alternate']
                        return $ image ! attributes 
                        -- note:  null title included, as in Markdown.pl 
    (Note contents)          -> do 
                        st <- get
                        let notes = stNotes st
                        let number = (length notes) + 1
                        let ref = show number
                        htmlContents <- blockListToNote opts ref contents 
                        -- push contents onto front of notes
                        put $ st {stNotes = (htmlContents:notes)} 
                        return $ anchor ! [href ("#fn" ++ ref),
                                          theclass "footnoteRef",
                                          identifier ("fnref" ++ ref)] << 
                                          sup << ref

blockListToNote :: WriterOptions -> String -> [Block] -> State WriterState Html
blockListToNote opts ref blocks =
  -- If last block is Para or Plain, include the backlink at the end of
  -- that block. Otherwise, insert a new Plain block with the backlink.
  let backlink = [HtmlInline $ " <a href=\"#fnref" ++ ref ++ 
                 "\" class=\"footnoteBackLink\"" ++
                 " title=\"Jump back to footnote " ++ ref ++ "\">&#8617;</a>"]
      blocks'  = if null blocks
                    then []
                    else let lastBlock   = last blocks
                             otherBlocks = init blocks
                         in  case lastBlock of
                                  (Para lst)  -> otherBlocks ++
                                                 [Para (lst ++ backlink)]
                                  (Plain lst) -> otherBlocks ++
                                                 [Plain (lst ++ backlink)]
                                  _           -> otherBlocks ++ [lastBlock,
                                                 Plain backlink]
  in  do contents <- blockListToHtml opts blocks'
         return $ li ! [identifier ("fn" ++ ref)] $ contents