{-# LANGUAGE OverloadedStrings #-} {- Copyright (C) 2011-2016 John MacFarlane This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -} {- | Module : Text.Pandoc.SelfContained Copyright : Copyright (C) 2011-2016 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane Stability : alpha Portability : portable Functions for converting an HTML file into one that can be viewed offline, by incorporating linked images, CSS, and scripts into the HTML using data URIs. -} module Text.Pandoc.SelfContained ( makeSelfContained ) where import Text.HTML.TagSoup import Network.URI (isURI, escapeURIString, URI(..), parseURI) import Data.ByteString.Base64 import qualified Data.ByteString.Char8 as B import Data.ByteString (ByteString) import System.FilePath (takeExtension, takeDirectory, ()) import Data.Char (toLower, isAscii, isAlphaNum) import Codec.Compression.GZip as Gzip import qualified Data.ByteString.Lazy as L import Text.Pandoc.Shared (renderTags', trim) import Text.Pandoc.MIME (MimeType) import Text.Pandoc.UTF8 (toString) import Text.Pandoc.Options (WriterOptions(..)) import Data.List (isPrefixOf) import Control.Applicative ((<|>)) import Text.Parsec (runParserT, ParsecT) import qualified Text.Parsec as P import Control.Monad.Except (throwError) import Control.Monad.Trans (lift) import Text.Pandoc.Class (fetchItem, PandocMonad(..), report) import Text.Pandoc.Error import Text.Pandoc.Logging isOk :: Char -> Bool isOk c = isAscii c && isAlphaNum c makeDataURI :: (MimeType, ByteString) -> String makeDataURI (mime, raw) = if textual then "data:" ++ mime' ++ "," ++ escapeURIString isOk (toString raw) else "data:" ++ mime' ++ ";base64," ++ toString (encode raw) where textual = "text/" `Data.List.isPrefixOf` mime mime' = if textual && ';' `notElem` mime then mime ++ ";charset=utf-8" else mime -- mime type already has charset convertTag :: PandocMonad m => Maybe String -> Tag String -> m [Tag String] convertTag sourceURL t@(TagOpen tagname as) | tagname `elem` ["img", "embed", "video", "input", "audio", "source", "track"] = do as' <- mapM processAttribute as return [TagOpen tagname as'] where processAttribute (x,y) = if x == "src" || x == "data-src" || x == "href" || x == "poster" then do enc <- getDataURI sourceURL (fromAttrib "type" t) y return (x, enc) else return (x,y) convertTag sourceURL t@(TagOpen "script" as) = case fromAttrib "src" t of [] -> return [t] src -> do let typeAttr = fromAttrib "type" t res <- getData sourceURL typeAttr src case res of Left dataUri -> return [TagOpen "script" (("src",dataUri) : [(x,y) | (x,y) <- as, x /= "src"])] Right (mime, bs) | (mime == "text/javascript" || mime == "application/javascript" || mime == "application/x-javascript") && not (" return [ TagOpen "script" [("type", typeAttr)|not (null typeAttr)] , TagText (toString bs) , TagClose "script" ] | otherwise -> return [TagOpen "script" (("src",makeDataURI (mime, bs)) : [(x,y) | (x,y) <- as, x /= "src"])] convertTag sourceURL t@(TagOpen "link" as) = case fromAttrib "href" t of [] -> return [t] src -> do res <- getData sourceURL (fromAttrib "type" t) src case res of Left dataUri -> return [TagOpen "link" (("href",dataUri) : [(x,y) | (x,y) <- as, x /= "href"])] Right (mime, bs) | mime == "text/css" && not (" return [ TagOpen "style" [("type", "text/css")] , TagText (toString bs) , TagClose "style" ] | otherwise -> return [TagOpen "link" (("href",makeDataURI (mime, bs)) : [(x,y) | (x,y) <- as, x /= "href"])] convertTag _ t = return [t] cssURLs :: PandocMonad m => Maybe String -> FilePath -> ByteString -> m ByteString cssURLs sourceURL d orig = do res <- runParserT (parseCSSUrls sourceURL d) () "css" orig case res of Left e -> do report $ CouldNotParseCSS (show e) return orig Right bs -> return bs parseCSSUrls :: PandocMonad m => Maybe String -> FilePath -> ParsecT ByteString () m ByteString parseCSSUrls sourceURL d = B.concat <$> P.many (pCSSWhite <|> pCSSComment <|> pCSSUrl sourceURL d <|> pCSSOther) -- Note: some whitespace in CSS is significant, so we can't collapse it! pCSSWhite :: PandocMonad m => ParsecT ByteString () m ByteString pCSSWhite = B.singleton <$> P.space <* P.spaces pCSSComment :: PandocMonad m => ParsecT ByteString () m ByteString pCSSComment = P.try $ do P.string "/*" P.manyTill P.anyChar (P.try (P.string "*/")) return B.empty pCSSOther :: PandocMonad m => ParsecT ByteString () m ByteString pCSSOther = do (B.pack <$> P.many1 (P.noneOf "u/ \n\r\t")) <|> (B.singleton <$> P.char 'u') <|> (B.singleton <$> P.char '/') pCSSUrl :: PandocMonad m => Maybe String -> FilePath -> ParsecT ByteString () m ByteString pCSSUrl sourceURL d = P.try $ do P.string "url(" P.spaces quote <- P.option Nothing (Just <$> P.oneOf "\"'") url <- P.manyTill P.anyChar (maybe (P.lookAhead (P.char ')')) P.char quote) P.spaces P.char ')' let fallback = B.pack ("url(" ++ maybe "" (:[]) quote ++ trim url ++ maybe "" (:[]) quote ++ ")") case trim url of '#':_ -> return fallback 'd':'a':'t':'a':':':_ -> return fallback u -> do let url' = if isURI u then u else d u enc <- lift $ getDataURI sourceURL "" url' return (B.pack $ "url(" ++ enc ++ ")") getDataURI :: PandocMonad m => Maybe String -> MimeType -> String -> m String getDataURI sourceURL mimetype src = do res <- getData sourceURL mimetype src case res of Left uri -> return uri Right x -> return $ makeDataURI x getData :: PandocMonad m => Maybe String -> MimeType -> String -> m (Either String (MimeType, ByteString)) getData _ _ src@('d':'a':'t':'a':':':_) = return $ Left src-- already data: uri getData sourceURL mimetype src = do let ext = map toLower $ takeExtension src (raw, respMime) <- fetchItem sourceURL src let raw' = if ext == ".gz" then B.concat $ L.toChunks $ Gzip.decompress $ L.fromChunks $ [raw] else raw mime <- case (mimetype, respMime) of ("",Nothing) -> throwError $ PandocSomeError $ "Could not determine mime type for `" ++ src ++ "'" (x, Nothing) -> return x (_, Just x ) -> return x let cssSourceURL = case parseURI src of Just u | uriScheme u `elem` ["http:","https:"] -> Just $ show u{ uriPath = "", uriQuery = "", uriFragment = "" } _ -> Nothing result <- if mime == "text/css" then cssURLs cssSourceURL (takeDirectory src) raw' else return raw' return $ Right (mime, result) -- | Convert HTML into self-contained HTML, incorporating images, -- scripts, and CSS using data: URIs. makeSelfContained :: PandocMonad m => WriterOptions -> String -> m String makeSelfContained opts inp = do let tags = parseTags inp out' <- concat <$> mapM (convertTag (writerSourceURL opts)) tags return $ renderTags' out'