pandoc/src/Text/Pandoc/SelfContained.hs

215 lines
8.5 KiB
Haskell
Raw Normal View History

2011-11-19 00:20:00 -08:00
{-# LANGUAGE OverloadedStrings #-}
{-
Copyright (C) 2011-2016 John MacFarlane <jgm@berkeley.edu>
2011-11-19 00:20:00 -08:00
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-}
{- |
Module : Text.Pandoc.SelfContained
Copyright : Copyright (C) 2011-2016 John MacFarlane
2011-11-19 00:20:00 -08:00
License : GNU GPL, version 2 or above
Maintainer : John MacFarlane <jgm@berkeley.edu>
Stability : alpha
Portability : portable
Functions for converting an HTML file into one that can be viewed
offline, by incorporating linked images, CSS, and scripts into
the HTML using data URIs.
-}
module Text.Pandoc.SelfContained ( makeSelfContained ) where
2011-11-19 00:20:00 -08:00
import Text.HTML.TagSoup
import Network.URI (isURI, escapeURIString, URI(..), parseURI)
2011-11-19 00:20:00 -08:00
import Data.ByteString.Base64
import qualified Data.ByteString.Char8 as B
import Data.ByteString (ByteString)
import System.FilePath (takeExtension, takeDirectory, (</>))
2011-11-19 00:20:00 -08:00
import Data.Char (toLower, isAscii, isAlphaNum)
import Codec.Compression.GZip as Gzip
import qualified Data.ByteString.Lazy as L
import Text.Pandoc.Shared (renderTags', trim)
import Text.Pandoc.MIME (MimeType)
import Text.Pandoc.UTF8 (toString)
import Text.Pandoc.Options (WriterOptions(..))
import Data.List (isPrefixOf)
import Control.Applicative ((<|>))
import Text.Parsec (runParserT, ParsecT)
import qualified Text.Parsec as P
import Control.Monad.Except (throwError)
import Control.Monad.Trans (lift)
import Text.Pandoc.Class (fetchItem, PandocMonad(..), report)
import Text.Pandoc.Error
import Text.Pandoc.Logging
2011-11-19 00:20:00 -08:00
isOk :: Char -> Bool
2011-11-19 00:20:00 -08:00
isOk c = isAscii c && isAlphaNum c
makeDataURI :: (MimeType, ByteString) -> String
makeDataURI (mime, raw) =
if textual
then "data:" ++ mime' ++ "," ++ escapeURIString isOk (toString raw)
else "data:" ++ mime' ++ ";base64," ++ toString (encode raw)
where textual = "text/" `Data.List.isPrefixOf` mime
mime' = if textual && ';' `notElem` mime
then mime ++ ";charset=utf-8"
else mime -- mime type already has charset
convertTag :: PandocMonad m => Maybe String -> Tag String -> m [Tag String]
convertTag sourceURL t@(TagOpen tagname as)
| tagname `elem`
["img", "embed", "video", "input", "audio", "source", "track"] = do
as' <- mapM processAttribute as
return [TagOpen tagname as']
where processAttribute (x,y) =
if x == "src" || x == "data-src" || x == "href" || x == "poster"
then do
enc <- getDataURI sourceURL (fromAttrib "type" t) y
return (x, enc)
else return (x,y)
convertTag sourceURL t@(TagOpen "script" as) =
2011-11-19 00:20:00 -08:00
case fromAttrib "src" t of
[] -> return [t]
2011-11-19 19:30:27 -08:00
src -> do
let typeAttr = fromAttrib "type" t
res <- getData sourceURL typeAttr src
case res of
Left dataUri -> return [TagOpen "script"
(("src",dataUri) : [(x,y) | (x,y) <- as, x /= "src"])]
Right (mime, bs)
| (mime == "text/javascript" ||
mime == "application/javascript" ||
mime == "application/x-javascript") &&
not ("</" `B.isInfixOf` bs) ->
return [
TagOpen "script" [("type", typeAttr)|not (null typeAttr)]
, TagText (toString bs)
, TagClose "script" ]
| otherwise -> return [TagOpen "script"
(("src",makeDataURI (mime, bs)) :
[(x,y) | (x,y) <- as, x /= "src"])]
convertTag sourceURL t@(TagOpen "link" as) =
2011-11-19 00:20:00 -08:00
case fromAttrib "href" t of
[] -> return [t]
2011-11-19 19:30:27 -08:00
src -> do
res <- getData sourceURL (fromAttrib "type" t) src
case res of
Left dataUri -> return [TagOpen "link"
(("href",dataUri) : [(x,y) | (x,y) <- as, x /= "href"])]
Right (mime, bs)
| mime == "text/css" && not ("</" `B.isInfixOf` bs) ->
return [
TagOpen "style" [("type", "text/css")]
, TagText (toString bs)
, TagClose "style" ]
| otherwise -> return [TagOpen "link"
(("href",makeDataURI (mime, bs)) :
[(x,y) | (x,y) <- as, x /= "href"])]
convertTag _ t = return [t]
2011-11-19 00:20:00 -08:00
cssURLs :: PandocMonad m
=> Maybe String -> FilePath -> ByteString -> m ByteString
cssURLs sourceURL d orig = do
res <- runParserT (parseCSSUrls sourceURL d) () "css" orig
case res of
Left e -> do
report $ CouldNotParseCSS (show e)
return orig
Right bs -> return bs
parseCSSUrls :: PandocMonad m
=> Maybe String -> FilePath -> ParsecT ByteString () m ByteString
parseCSSUrls sourceURL d = B.concat <$> P.many
(pCSSWhite <|> pCSSComment <|> pCSSUrl sourceURL d <|> pCSSOther)
-- Note: some whitespace in CSS is significant, so we can't collapse it!
pCSSWhite :: PandocMonad m => ParsecT ByteString () m ByteString
pCSSWhite = B.singleton <$> P.space <* P.spaces
pCSSComment :: PandocMonad m => ParsecT ByteString () m ByteString
pCSSComment = P.try $ do
P.string "/*"
P.manyTill P.anyChar (P.try (P.string "*/"))
return B.empty
pCSSOther :: PandocMonad m => ParsecT ByteString () m ByteString
pCSSOther = do
(B.pack <$> P.many1 (P.noneOf "u/ \n\r\t")) <|>
(B.singleton <$> P.char 'u') <|>
(B.singleton <$> P.char '/')
pCSSUrl :: PandocMonad m
=> Maybe String -> FilePath -> ParsecT ByteString () m ByteString
pCSSUrl sourceURL d = P.try $ do
P.string "url("
P.spaces
quote <- P.option Nothing (Just <$> P.oneOf "\"'")
url <- P.manyTill P.anyChar (maybe (P.lookAhead (P.char ')')) P.char quote)
P.spaces
P.char ')'
let fallback = B.pack ("url(" ++ maybe "" (:[]) quote ++ trim url ++
maybe "" (:[]) quote ++ ")")
case trim url of
'#':_ -> return fallback
'd':'a':'t':'a':':':_ -> return fallback
u -> do let url' = if isURI u then u else d </> u
enc <- lift $ getDataURI sourceURL "" url'
return (B.pack $ "url(" ++ enc ++ ")")
getDataURI :: PandocMonad m => Maybe String -> MimeType -> String -> m String
getDataURI sourceURL mimetype src = do
res <- getData sourceURL mimetype src
case res of
Left uri -> return uri
Right x -> return $ makeDataURI x
getData :: PandocMonad m
=> Maybe String -> MimeType -> String
-> m (Either String (MimeType, ByteString))
getData _ _ src@('d':'a':'t':'a':':':_) = return $ Left src-- already data: uri
getData sourceURL mimetype src = do
2011-11-19 19:30:27 -08:00
let ext = map toLower $ takeExtension src
(raw, respMime) <- fetchItem sourceURL src
let raw' = if ext == ".gz"
then B.concat $ L.toChunks $ Gzip.decompress $ L.fromChunks
$ [raw]
else raw
mime <- case (mimetype, respMime) of
("",Nothing) -> throwError $ PandocSomeError
$ "Could not determine mime type for `" ++ src ++ "'"
(x, Nothing) -> return x
(_, Just x ) -> return x
let cssSourceURL = case parseURI src of
Just u
| uriScheme u `elem` ["http:","https:"] ->
Just $ show u{ uriPath = "",
uriQuery = "",
uriFragment = "" }
_ -> Nothing
result <- if mime == "text/css"
then cssURLs cssSourceURL (takeDirectory src) raw'
else return raw'
return $ Right (mime, result)
2011-11-19 00:20:00 -08:00
-- | Convert HTML into self-contained HTML, incorporating images,
-- scripts, and CSS using data: URIs.
makeSelfContained :: PandocMonad m => WriterOptions -> String -> m String
makeSelfContained opts inp = do
2011-11-19 00:20:00 -08:00
let tags = parseTags inp
out' <- concat <$> mapM (convertTag (writerSourceURL opts)) tags
return $ renderTags' out'