2011-11-19 00:20:00 -08:00
|
|
|
{-# LANGUAGE OverloadedStrings #-}
|
|
|
|
{-
|
2015-04-26 10:18:29 -07:00
|
|
|
Copyright (C) 2011-2015 John MacFarlane <jgm@berkeley.edu>
|
2011-11-19 00:20:00 -08:00
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify
|
|
|
|
it under the terms of the GNU General Public License as published by
|
|
|
|
the Free Software Foundation; either version 2 of the License, or
|
|
|
|
(at your option) any later version.
|
|
|
|
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
GNU General Public License for more details.
|
|
|
|
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
|
|
along with this program; if not, write to the Free Software
|
|
|
|
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
|
|
-}
|
|
|
|
|
|
|
|
{- |
|
2011-11-21 15:09:42 -08:00
|
|
|
Module : Text.Pandoc.SelfContained
|
2015-04-26 10:18:29 -07:00
|
|
|
Copyright : Copyright (C) 2011-2015 John MacFarlane
|
2011-11-19 00:20:00 -08:00
|
|
|
License : GNU GPL, version 2 or above
|
|
|
|
|
|
|
|
Maintainer : John MacFarlane <jgm@berkeley.edu>
|
|
|
|
Stability : alpha
|
|
|
|
Portability : portable
|
|
|
|
|
|
|
|
Functions for converting an HTML file into one that can be viewed
|
|
|
|
offline, by incorporating linked images, CSS, and scripts into
|
|
|
|
the HTML using data URIs.
|
|
|
|
-}
|
2011-11-21 15:09:42 -08:00
|
|
|
module Text.Pandoc.SelfContained ( makeSelfContained ) where
|
2011-11-19 00:20:00 -08:00
|
|
|
import Text.HTML.TagSoup
|
2014-08-02 16:33:06 -07:00
|
|
|
import Network.URI (isURI, escapeURIString, URI(..), parseURI)
|
2011-11-19 00:20:00 -08:00
|
|
|
import Data.ByteString.Base64
|
|
|
|
import qualified Data.ByteString.Char8 as B
|
|
|
|
import Data.ByteString (ByteString)
|
2014-08-02 16:07:19 -07:00
|
|
|
import System.FilePath (takeExtension, takeDirectory, (</>))
|
2011-11-19 00:20:00 -08:00
|
|
|
import Data.Char (toLower, isAscii, isAlphaNum)
|
|
|
|
import Codec.Compression.GZip as Gzip
|
|
|
|
import qualified Data.ByteString.Lazy as L
|
2014-08-02 16:07:19 -07:00
|
|
|
import Text.Pandoc.Shared (renderTags', err, fetchItem')
|
|
|
|
import Text.Pandoc.MediaBag (MediaBag)
|
2014-08-17 20:42:30 +04:00
|
|
|
import Text.Pandoc.MIME (MimeType)
|
2015-05-04 16:00:28 -07:00
|
|
|
import Text.Pandoc.UTF8 (toString)
|
2014-08-02 16:07:19 -07:00
|
|
|
import Text.Pandoc.Options (WriterOptions(..))
|
2015-02-13 21:37:43 -08:00
|
|
|
import Data.List (isPrefixOf)
|
2015-05-04 16:00:28 -07:00
|
|
|
import Control.Applicative
|
|
|
|
import Text.CSS.Parse (parseNestedBlocks, NestedBlock(..))
|
|
|
|
import Text.CSS.Render (renderNestedBlocks)
|
|
|
|
import Data.Text.Encoding (decodeUtf8, encodeUtf8)
|
|
|
|
import qualified Data.Text as T
|
|
|
|
import Data.Text (Text)
|
|
|
|
import Data.Text.Lazy (toStrict)
|
|
|
|
import Data.Text.Lazy.Builder (toLazyText)
|
2011-11-19 00:20:00 -08:00
|
|
|
|
2011-11-20 12:04:47 -08:00
|
|
|
isOk :: Char -> Bool
|
2011-11-19 00:20:00 -08:00
|
|
|
isOk c = isAscii c && isAlphaNum c
|
|
|
|
|
2015-02-13 21:37:43 -08:00
|
|
|
makeDataURI :: String -> ByteString -> String
|
|
|
|
makeDataURI mime raw =
|
|
|
|
if textual
|
|
|
|
then "data:" ++ mime' ++ "," ++ escapeURIString isOk (toString raw)
|
|
|
|
else "data:" ++ mime' ++ ";base64," ++ toString (encode raw)
|
|
|
|
where textual = "text/" `Data.List.isPrefixOf` mime
|
|
|
|
mime' = if textual && ';' `notElem` mime
|
|
|
|
then mime ++ ";charset=utf-8"
|
|
|
|
else mime -- mime type already has charset
|
|
|
|
|
2014-08-02 16:07:19 -07:00
|
|
|
convertTag :: MediaBag -> Maybe String -> Tag String -> IO (Tag String)
|
|
|
|
convertTag media sourceURL t@(TagOpen tagname as)
|
2014-10-04 11:37:27 -07:00
|
|
|
| tagname `elem`
|
|
|
|
["img", "embed", "video", "input", "audio", "source", "track"] = do
|
2014-03-05 09:10:09 -08:00
|
|
|
as' <- mapM processAttribute as
|
|
|
|
return $ TagOpen tagname as'
|
|
|
|
where processAttribute (x,y) =
|
|
|
|
if x == "src" || x == "href" || x == "poster"
|
|
|
|
then do
|
2015-05-04 16:00:28 -07:00
|
|
|
enc <- getDataURI media sourceURL (fromAttrib "type" t) y
|
2014-03-05 09:10:09 -08:00
|
|
|
return (x, enc)
|
|
|
|
else return (x,y)
|
2014-08-02 16:07:19 -07:00
|
|
|
convertTag media sourceURL t@(TagOpen "script" as) =
|
2011-11-19 00:20:00 -08:00
|
|
|
case fromAttrib "src" t of
|
2011-11-19 19:30:27 -08:00
|
|
|
[] -> return t
|
|
|
|
src -> do
|
2015-05-04 16:00:28 -07:00
|
|
|
enc <- getDataURI media sourceURL (fromAttrib "type" t) src
|
2012-07-26 22:32:53 -07:00
|
|
|
return $ TagOpen "script" (("src",enc) : [(x,y) | (x,y) <- as, x /= "src"])
|
2014-08-02 16:07:19 -07:00
|
|
|
convertTag media sourceURL t@(TagOpen "link" as) =
|
2011-11-19 00:20:00 -08:00
|
|
|
case fromAttrib "href" t of
|
2011-11-19 19:30:27 -08:00
|
|
|
[] -> return t
|
|
|
|
src -> do
|
2015-05-04 16:00:28 -07:00
|
|
|
enc <- getDataURI media sourceURL (fromAttrib "type" t) src
|
2012-07-26 22:32:53 -07:00
|
|
|
return $ TagOpen "link" (("href",enc) : [(x,y) | (x,y) <- as, x /= "href"])
|
2014-07-30 15:26:40 -07:00
|
|
|
convertTag _ _ t = return t
|
2011-11-19 00:20:00 -08:00
|
|
|
|
2013-04-10 10:22:00 -07:00
|
|
|
-- NOTE: This is really crude, it doesn't respect CSS comments.
|
2014-08-02 16:07:19 -07:00
|
|
|
cssURLs :: MediaBag -> Maybe String -> FilePath -> ByteString
|
2014-07-30 15:26:40 -07:00
|
|
|
-> IO ByteString
|
2015-05-04 16:00:28 -07:00
|
|
|
cssURLs media sourceURL d orig = do
|
|
|
|
case parseNestedBlocks (decodeUtf8 orig) of
|
|
|
|
Left _err -> return orig
|
|
|
|
Right bs -> (encodeUtf8 . toStrict . toLazyText . renderNestedBlocks)
|
|
|
|
<$> mapM (handleCSSUrls media sourceURL d) bs
|
2011-11-20 12:04:47 -08:00
|
|
|
|
2015-05-04 16:00:28 -07:00
|
|
|
handleCSSUrls :: MediaBag -> Maybe String -> FilePath -> NestedBlock
|
|
|
|
-> IO NestedBlock
|
|
|
|
handleCSSUrls media sourceURL d (NestedBlock t bs) =
|
|
|
|
NestedBlock t <$> mapM (handleCSSUrls media sourceURL d) bs
|
|
|
|
handleCSSUrls media sourceURL d (LeafBlock (selector, attrs)) = do
|
|
|
|
attrs' <- mapM (handleCSSAttr media sourceURL d) attrs
|
|
|
|
return (LeafBlock (selector, attrs'))
|
|
|
|
|
|
|
|
handleCSSAttr :: MediaBag -> Maybe String -> FilePath -> (Text, Text)
|
|
|
|
-> IO (Text, Text)
|
|
|
|
handleCSSAttr media sourceURL d (key, val) =
|
|
|
|
if "url(" `T.isPrefixOf` val
|
|
|
|
then do
|
|
|
|
let url = T.unpack $ dropParens $ T.drop 3 val
|
|
|
|
case url of
|
|
|
|
'#':_ -> return (key, val)
|
|
|
|
'd':'a':'t':'a':':':_ -> return (key, val)
|
|
|
|
_ -> do
|
|
|
|
let url' = if isURI url then url else d </> url
|
|
|
|
enc <- getDataURI media sourceURL "" url'
|
|
|
|
return (key, T.pack enc)
|
|
|
|
else return (key, val)
|
|
|
|
|
|
|
|
dropParens :: Text -> Text
|
|
|
|
dropParens = T.dropAround (`elem` ['(',')','"','\'',' ','\t','\n','\r'])
|
|
|
|
|
|
|
|
getDataURI :: MediaBag -> Maybe String -> MimeType -> String
|
|
|
|
-> IO String
|
|
|
|
getDataURI _ _ _ src@('d':'a':'t':'a':':':_) = return src -- already data: uri
|
|
|
|
getDataURI media sourceURL mimetype src = do
|
2011-11-19 19:30:27 -08:00
|
|
|
let ext = map toLower $ takeExtension src
|
2014-08-02 16:07:19 -07:00
|
|
|
fetchResult <- fetchItem' media sourceURL src
|
|
|
|
(raw, respMime) <- case fetchResult of
|
|
|
|
Left msg -> err 67 $ "Could not fetch " ++ src ++
|
|
|
|
"\n" ++ show msg
|
|
|
|
Right x -> return x
|
2011-12-04 15:58:31 -08:00
|
|
|
let raw' = if ext == ".gz"
|
|
|
|
then B.concat $ L.toChunks $ Gzip.decompress $ L.fromChunks
|
|
|
|
$ [raw]
|
|
|
|
else raw
|
|
|
|
let mime = case (mimetype, respMime) of
|
|
|
|
("",Nothing) -> error
|
|
|
|
$ "Could not determine mime type for `" ++ src ++ "'"
|
|
|
|
(x, Nothing) -> x
|
|
|
|
(_, Just x ) -> x
|
2014-08-02 16:33:06 -07:00
|
|
|
let cssSourceURL = case parseURI src of
|
|
|
|
Just u
|
|
|
|
| uriScheme u `elem` ["http:","https:"] ->
|
|
|
|
Just $ show u{ uriPath = "",
|
|
|
|
uriQuery = "",
|
|
|
|
uriFragment = "" }
|
|
|
|
_ -> Nothing
|
2011-11-20 12:04:47 -08:00
|
|
|
result <- if mime == "text/css"
|
2014-08-02 16:33:06 -07:00
|
|
|
then cssURLs media cssSourceURL (takeDirectory src) raw'
|
2011-12-04 15:58:31 -08:00
|
|
|
else return raw'
|
2015-05-04 16:00:28 -07:00
|
|
|
return $ makeDataURI mime result
|
2011-11-19 00:20:00 -08:00
|
|
|
|
2011-11-21 15:09:42 -08:00
|
|
|
-- | Convert HTML into self-contained HTML, incorporating images,
|
2014-08-02 16:07:19 -07:00
|
|
|
-- scripts, and CSS using data: URIs.
|
|
|
|
makeSelfContained :: WriterOptions -> String -> IO String
|
|
|
|
makeSelfContained opts inp = do
|
2011-11-19 00:20:00 -08:00
|
|
|
let tags = parseTags inp
|
2014-08-02 16:07:19 -07:00
|
|
|
out' <- mapM (convertTag (writerMediaBag opts) (writerSourceURL opts)) tags
|
2012-02-17 10:44:46 -08:00
|
|
|
return $ renderTags' out'
|