2011-11-19 00:20:00 -08:00
|
|
|
{-# LANGUAGE OverloadedStrings #-}
|
|
|
|
{-
|
|
|
|
Copyright (C) 2011 John MacFarlane <jgm@berkeley.edu>
|
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify
|
|
|
|
it under the terms of the GNU General Public License as published by
|
|
|
|
the Free Software Foundation; either version 2 of the License, or
|
|
|
|
(at your option) any later version.
|
|
|
|
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
GNU General Public License for more details.
|
|
|
|
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
|
|
along with this program; if not, write to the Free Software
|
|
|
|
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
|
|
-}
|
|
|
|
|
|
|
|
{- |
|
2011-11-21 15:09:42 -08:00
|
|
|
Module : Text.Pandoc.SelfContained
|
2011-11-19 00:20:00 -08:00
|
|
|
Copyright : Copyright (C) 2011 John MacFarlane
|
|
|
|
License : GNU GPL, version 2 or above
|
|
|
|
|
|
|
|
Maintainer : John MacFarlane <jgm@berkeley.edu>
|
|
|
|
Stability : alpha
|
|
|
|
Portability : portable
|
|
|
|
|
|
|
|
Functions for converting an HTML file into one that can be viewed
|
|
|
|
offline, by incorporating linked images, CSS, and scripts into
|
|
|
|
the HTML using data URIs.
|
|
|
|
-}
|
2011-11-21 15:09:42 -08:00
|
|
|
module Text.Pandoc.SelfContained ( makeSelfContained ) where
|
2011-11-19 00:20:00 -08:00
|
|
|
import Text.HTML.TagSoup
|
|
|
|
import Network.URI (isAbsoluteURI, parseURI, escapeURIString)
|
|
|
|
import Network.HTTP
|
|
|
|
import Data.ByteString.Base64
|
|
|
|
import qualified Data.ByteString.Char8 as B
|
|
|
|
import Data.ByteString (ByteString)
|
2011-12-04 12:19:35 -08:00
|
|
|
import Data.ByteString.UTF8 (toString, fromString)
|
2011-11-20 12:04:47 -08:00
|
|
|
import System.FilePath (takeExtension, dropExtension, takeDirectory, (</>))
|
2011-11-19 00:20:00 -08:00
|
|
|
import Data.Char (toLower, isAscii, isAlphaNum)
|
|
|
|
import Codec.Compression.GZip as Gzip
|
|
|
|
import qualified Data.ByteString.Lazy as L
|
2011-11-20 12:32:54 -08:00
|
|
|
import Text.Pandoc.Shared (findDataFile)
|
2012-01-14 10:27:29 -08:00
|
|
|
import Text.Pandoc.MIME (getMimeType)
|
2011-11-20 12:32:54 -08:00
|
|
|
import System.Directory (doesFileExist)
|
2011-11-19 00:20:00 -08:00
|
|
|
|
2011-12-04 15:58:31 -08:00
|
|
|
getItem :: Maybe FilePath -> String -> IO (ByteString, Maybe String)
|
2011-11-21 00:41:08 -08:00
|
|
|
getItem userdata f =
|
2011-11-19 00:20:00 -08:00
|
|
|
if isAbsoluteURI f
|
|
|
|
then openURL f
|
2011-11-20 12:32:54 -08:00
|
|
|
else do
|
2012-01-14 10:27:29 -08:00
|
|
|
let mime = case takeExtension f of
|
|
|
|
".gz" -> getMimeType $ dropExtension f
|
|
|
|
x -> getMimeType x
|
2011-11-20 12:32:54 -08:00
|
|
|
exists <- doesFileExist f
|
|
|
|
if exists
|
2011-12-04 15:58:31 -08:00
|
|
|
then do
|
|
|
|
cont <- B.readFile f
|
2012-01-14 10:27:29 -08:00
|
|
|
return (cont, mime)
|
2011-11-20 12:32:54 -08:00
|
|
|
else do
|
2011-11-21 00:41:08 -08:00
|
|
|
res <- findDataFile userdata f
|
2011-11-20 12:32:54 -08:00
|
|
|
exists' <- doesFileExist res
|
|
|
|
if exists'
|
2011-12-04 15:58:31 -08:00
|
|
|
then do
|
|
|
|
cont <- B.readFile res
|
2012-01-14 10:27:29 -08:00
|
|
|
return (cont, mime)
|
2011-11-21 15:09:42 -08:00
|
|
|
else error $ "Could not find `" ++ f ++ "'"
|
2011-11-19 00:20:00 -08:00
|
|
|
|
2011-12-04 15:58:31 -08:00
|
|
|
-- TODO - have this return mime type too - then it can work for google
|
|
|
|
-- chart API, e.g.
|
|
|
|
openURL :: String -> IO (ByteString, Maybe String)
|
|
|
|
openURL u = getBodyAndMimeType =<< simpleHTTP (getReq u)
|
2011-11-20 12:04:47 -08:00
|
|
|
where getReq v = case parseURI v of
|
|
|
|
Nothing -> error $ "Could not parse URI: " ++ v
|
2011-11-19 00:20:00 -08:00
|
|
|
Just u' -> mkRequest GET u'
|
2011-12-04 15:58:31 -08:00
|
|
|
getBodyAndMimeType (Left err) = fail (show err)
|
|
|
|
getBodyAndMimeType (Right r) = return (rspBody r, findHeader HdrContentType r)
|
2011-11-19 00:20:00 -08:00
|
|
|
|
2011-11-20 12:04:47 -08:00
|
|
|
isOk :: Char -> Bool
|
2011-11-19 00:20:00 -08:00
|
|
|
isOk c = isAscii c && isAlphaNum c
|
|
|
|
|
2011-11-21 00:41:08 -08:00
|
|
|
convertTag :: Maybe FilePath -> Tag String -> IO (Tag String)
|
|
|
|
convertTag userdata t@(TagOpen "img" as) =
|
2011-11-19 00:20:00 -08:00
|
|
|
case fromAttrib "src" t of
|
2011-11-19 19:30:27 -08:00
|
|
|
[] -> return t
|
|
|
|
src -> do
|
2011-12-04 12:19:35 -08:00
|
|
|
(raw, mime) <- getRaw userdata (fromAttrib "type" t) src
|
2011-11-19 19:30:27 -08:00
|
|
|
let enc = "data:" ++ mime ++ ";base64," ++ toString (encode raw)
|
2011-11-19 00:20:00 -08:00
|
|
|
return $ TagOpen "img" (("src",enc) : [(x,y) | (x,y) <- as, x /= "src"])
|
2011-11-24 10:04:25 -08:00
|
|
|
convertTag userdata t@(TagOpen "video" as) =
|
|
|
|
case fromAttrib "src" t of
|
|
|
|
[] -> return t
|
|
|
|
src -> do
|
2011-12-04 12:19:35 -08:00
|
|
|
(raw, mime) <- getRaw userdata (fromAttrib "type" t) src
|
2011-11-24 10:04:25 -08:00
|
|
|
let enc = "data:" ++ mime ++ ";base64," ++ toString (encode raw)
|
|
|
|
return $ TagOpen "video" (("src",enc) : [(x,y) | (x,y) <- as, x /= "src"])
|
2011-11-21 00:41:08 -08:00
|
|
|
convertTag userdata t@(TagOpen "script" as) =
|
2011-11-19 00:20:00 -08:00
|
|
|
case fromAttrib "src" t of
|
2011-11-19 19:30:27 -08:00
|
|
|
[] -> return t
|
|
|
|
src -> do
|
2011-12-04 12:19:35 -08:00
|
|
|
(raw, mime) <- getRaw userdata (fromAttrib "type" t) src
|
2011-11-19 19:30:27 -08:00
|
|
|
let enc = "data:" ++ mime ++ "," ++ escapeURIString isOk (toString raw)
|
2011-11-19 00:20:00 -08:00
|
|
|
return $ TagOpen "script" (("src",enc) : [(x,y) | (x,y) <- as, x /= "src"])
|
2011-11-21 00:41:08 -08:00
|
|
|
convertTag userdata t@(TagOpen "link" as) =
|
2011-11-19 00:20:00 -08:00
|
|
|
case fromAttrib "href" t of
|
2011-11-19 19:30:27 -08:00
|
|
|
[] -> return t
|
|
|
|
src -> do
|
2011-12-04 12:19:35 -08:00
|
|
|
(raw, mime) <- getRaw userdata (fromAttrib "type" t) src
|
2011-11-19 19:30:27 -08:00
|
|
|
let enc = "data:" ++ mime ++ "," ++ escapeURIString isOk (toString raw)
|
2011-11-19 00:20:00 -08:00
|
|
|
return $ TagOpen "link" (("href",enc) : [(x,y) | (x,y) <- as, x /= "href"])
|
2011-11-21 00:41:08 -08:00
|
|
|
convertTag _ t = return t
|
2011-11-19 00:20:00 -08:00
|
|
|
|
2011-12-04 12:19:35 -08:00
|
|
|
cssURLs :: Maybe FilePath -> FilePath -> ByteString -> IO ByteString
|
|
|
|
cssURLs userdata d orig =
|
|
|
|
case B.breakSubstring "url(" orig of
|
2011-11-20 12:04:47 -08:00
|
|
|
(x,y) | B.null y -> return orig
|
|
|
|
| otherwise -> do
|
2011-12-04 12:19:35 -08:00
|
|
|
let (u,v) = B.breakSubstring ")" $ B.drop 4 y
|
|
|
|
let url = toString
|
|
|
|
$ case B.take 1 u of
|
|
|
|
"\"" -> B.takeWhile (/='"') $ B.drop 1 u
|
|
|
|
_ -> u
|
|
|
|
(raw, mime) <- getRaw userdata "" (d </> url)
|
|
|
|
rest <- cssURLs userdata d v
|
|
|
|
let enc = "data:" `B.append` fromString mime `B.append`
|
|
|
|
";base64," `B.append` (encode raw)
|
|
|
|
return $ x `B.append` "url(" `B.append` enc `B.append` rest
|
2011-11-20 12:04:47 -08:00
|
|
|
|
2011-12-04 12:19:35 -08:00
|
|
|
getRaw :: Maybe FilePath -> String -> String -> IO (ByteString, String)
|
|
|
|
getRaw userdata mimetype src = do
|
2011-11-19 19:30:27 -08:00
|
|
|
let ext = map toLower $ takeExtension src
|
2011-12-04 15:58:31 -08:00
|
|
|
(raw, respMime) <- getItem userdata src
|
|
|
|
let raw' = if ext == ".gz"
|
|
|
|
then B.concat $ L.toChunks $ Gzip.decompress $ L.fromChunks
|
|
|
|
$ [raw]
|
|
|
|
else raw
|
|
|
|
let mime = case (mimetype, respMime) of
|
|
|
|
("",Nothing) -> error
|
|
|
|
$ "Could not determine mime type for `" ++ src ++ "'"
|
|
|
|
(x, Nothing) -> x
|
|
|
|
(_, Just x ) -> x
|
2011-11-20 12:04:47 -08:00
|
|
|
result <- if mime == "text/css"
|
2011-12-04 15:58:31 -08:00
|
|
|
then cssURLs userdata (takeDirectory src) raw'
|
|
|
|
else return raw'
|
2011-11-20 12:04:47 -08:00
|
|
|
return (result, mime)
|
2011-11-19 00:20:00 -08:00
|
|
|
|
2011-11-21 15:09:42 -08:00
|
|
|
-- | Convert HTML into self-contained HTML, incorporating images,
|
|
|
|
-- scripts, and CSS using data: URIs. Items specified using absolute
|
|
|
|
-- URLs will be downloaded; those specified using relative URLs will
|
|
|
|
-- be sought first relative to the working directory, then relative
|
|
|
|
-- to the user data directory (if the first parameter is 'Just'
|
|
|
|
-- a directory), and finally relative to pandoc's default data
|
|
|
|
-- directory.
|
|
|
|
makeSelfContained :: Maybe FilePath -> String -> IO String
|
|
|
|
makeSelfContained userdata inp = do
|
2011-11-19 00:20:00 -08:00
|
|
|
let tags = parseTags inp
|
2011-11-21 00:41:08 -08:00
|
|
|
out' <- mapM (convertTag userdata) tags
|
2011-11-19 00:20:00 -08:00
|
|
|
return $ renderTagsOptions renderOptions{ optMinimize = (\t -> t == "br"
|
2011-11-20 12:04:47 -08:00
|
|
|
|| t == "img" || t == "meta" || t == "link" ) } out'
|
2011-11-19 00:20:00 -08:00
|
|
|
|