2011-11-19 00:20:00 -08:00
|
|
|
{-# LANGUAGE OverloadedStrings #-}
|
|
|
|
{-
|
2016-03-22 17:20:39 -07:00
|
|
|
Copyright (C) 2011-2016 John MacFarlane <jgm@berkeley.edu>
|
2011-11-19 00:20:00 -08:00
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify
|
|
|
|
it under the terms of the GNU General Public License as published by
|
|
|
|
the Free Software Foundation; either version 2 of the License, or
|
|
|
|
(at your option) any later version.
|
|
|
|
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
GNU General Public License for more details.
|
|
|
|
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
|
|
along with this program; if not, write to the Free Software
|
|
|
|
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
|
|
-}
|
|
|
|
|
|
|
|
{- |
|
2011-11-21 15:09:42 -08:00
|
|
|
Module : Text.Pandoc.SelfContained
|
2016-03-22 17:20:39 -07:00
|
|
|
Copyright : Copyright (C) 2011-2016 John MacFarlane
|
2011-11-19 00:20:00 -08:00
|
|
|
License : GNU GPL, version 2 or above
|
|
|
|
|
|
|
|
Maintainer : John MacFarlane <jgm@berkeley.edu>
|
|
|
|
Stability : alpha
|
|
|
|
Portability : portable
|
|
|
|
|
|
|
|
Functions for converting an HTML file into one that can be viewed
|
|
|
|
offline, by incorporating linked images, CSS, and scripts into
|
|
|
|
the HTML using data URIs.
|
|
|
|
-}
|
2017-03-30 16:43:12 +02:00
|
|
|
module Text.Pandoc.SelfContained ( makeDataURI, makeSelfContained ) where
|
2011-11-19 00:20:00 -08:00
|
|
|
import Codec.Compression.GZip as Gzip
|
2015-10-14 09:09:10 -07:00
|
|
|
import Control.Applicative ((<|>))
|
2017-02-23 15:00:00 +01:00
|
|
|
import Control.Monad.Except (throwError)
|
2015-06-28 11:51:35 -07:00
|
|
|
import Control.Monad.Trans (lift)
|
2017-03-04 13:03:41 +01:00
|
|
|
import Data.ByteString (ByteString)
|
|
|
|
import Data.ByteString.Base64
|
|
|
|
import qualified Data.ByteString.Char8 as B
|
|
|
|
import qualified Data.ByteString.Lazy as L
|
|
|
|
import Data.Char (isAlphaNum, isAscii, toLower)
|
|
|
|
import Data.List (isPrefixOf)
|
|
|
|
import Network.URI (URI (..), escapeURIString, isURI, parseURI)
|
|
|
|
import System.FilePath (takeDirectory, takeExtension, (</>))
|
|
|
|
import Text.HTML.TagSoup
|
|
|
|
import Text.Pandoc.Class (PandocMonad (..), fetchItem, report)
|
2017-02-23 15:00:00 +01:00
|
|
|
import Text.Pandoc.Error
|
|
|
|
import Text.Pandoc.Logging
|
2017-03-04 13:03:41 +01:00
|
|
|
import Text.Pandoc.MIME (MimeType)
|
|
|
|
import Text.Pandoc.Options (WriterOptions (..))
|
|
|
|
import Text.Pandoc.Shared (renderTags', trim)
|
|
|
|
import Text.Pandoc.UTF8 (toString)
|
|
|
|
import Text.Parsec (ParsecT, runParserT)
|
|
|
|
import qualified Text.Parsec as P
|
2011-11-19 00:20:00 -08:00
|
|
|
|
2011-11-20 12:04:47 -08:00
|
|
|
isOk :: Char -> Bool
|
2011-11-19 00:20:00 -08:00
|
|
|
isOk c = isAscii c && isAlphaNum c
|
|
|
|
|
2017-02-24 11:27:52 +01:00
|
|
|
makeDataURI :: (MimeType, ByteString) -> String
|
|
|
|
makeDataURI (mime, raw) =
|
2015-02-13 21:37:43 -08:00
|
|
|
if textual
|
|
|
|
then "data:" ++ mime' ++ "," ++ escapeURIString isOk (toString raw)
|
|
|
|
else "data:" ++ mime' ++ ";base64," ++ toString (encode raw)
|
|
|
|
where textual = "text/" `Data.List.isPrefixOf` mime
|
|
|
|
mime' = if textual && ';' `notElem` mime
|
|
|
|
then mime ++ ";charset=utf-8"
|
|
|
|
else mime -- mime type already has charset
|
|
|
|
|
2017-02-24 13:11:29 +01:00
|
|
|
convertTags :: PandocMonad m => Maybe String -> [Tag String] -> m [Tag String]
|
|
|
|
convertTags _ [] = return []
|
2017-02-26 22:48:02 +01:00
|
|
|
convertTags sourceURL (t@TagOpen{}:ts)
|
|
|
|
| fromAttrib "data-external" t == "1" = (t:) <$> convertTags sourceURL ts
|
2017-02-24 13:11:29 +01:00
|
|
|
convertTags sourceURL (t@(TagOpen tagname as):ts)
|
2014-10-04 11:37:27 -07:00
|
|
|
| tagname `elem`
|
|
|
|
["img", "embed", "video", "input", "audio", "source", "track"] = do
|
2014-03-05 09:10:09 -08:00
|
|
|
as' <- mapM processAttribute as
|
2017-02-24 13:11:29 +01:00
|
|
|
rest <- convertTags sourceURL ts
|
|
|
|
return $ TagOpen tagname as' : rest
|
2014-03-05 09:10:09 -08:00
|
|
|
where processAttribute (x,y) =
|
2017-02-20 22:21:20 +01:00
|
|
|
if x == "src" || x == "data-src" || x == "href" || x == "poster"
|
2014-03-05 09:10:09 -08:00
|
|
|
then do
|
2017-02-23 15:00:00 +01:00
|
|
|
enc <- getDataURI sourceURL (fromAttrib "type" t) y
|
2014-03-05 09:10:09 -08:00
|
|
|
return (x, enc)
|
|
|
|
else return (x,y)
|
2017-02-24 13:11:29 +01:00
|
|
|
convertTags sourceURL (t@(TagOpen "script" as):TagClose "script":ts) =
|
2011-11-19 00:20:00 -08:00
|
|
|
case fromAttrib "src" t of
|
2017-02-24 13:11:29 +01:00
|
|
|
[] -> (t:) <$> convertTags sourceURL ts
|
2011-11-19 19:30:27 -08:00
|
|
|
src -> do
|
2017-02-24 11:55:15 +01:00
|
|
|
let typeAttr = fromAttrib "type" t
|
|
|
|
res <- getData sourceURL typeAttr src
|
2017-02-24 13:11:29 +01:00
|
|
|
rest <- convertTags sourceURL ts
|
2017-02-24 11:55:15 +01:00
|
|
|
case res of
|
2017-02-24 13:11:29 +01:00
|
|
|
Left dataUri -> return $ TagOpen "script"
|
|
|
|
(("src",dataUri) : [(x,y) | (x,y) <- as, x /= "src"]) :
|
|
|
|
TagClose "script" : rest
|
2017-02-24 11:55:15 +01:00
|
|
|
Right (mime, bs)
|
|
|
|
| (mime == "text/javascript" ||
|
|
|
|
mime == "application/javascript" ||
|
|
|
|
mime == "application/x-javascript") &&
|
2017-02-24 13:11:29 +01:00
|
|
|
not ("</script" `B.isInfixOf` bs) ->
|
|
|
|
return $
|
2017-02-24 11:55:15 +01:00
|
|
|
TagOpen "script" [("type", typeAttr)|not (null typeAttr)]
|
2017-02-24 13:11:29 +01:00
|
|
|
: TagText (toString bs)
|
|
|
|
: TagClose "script"
|
|
|
|
: rest
|
|
|
|
| otherwise ->
|
|
|
|
return $ TagOpen "script"
|
|
|
|
(("src",makeDataURI (mime, bs)) :
|
|
|
|
[(x,y) | (x,y) <- as, x /= "src"]) :
|
|
|
|
TagClose "script" : rest
|
|
|
|
convertTags sourceURL (t@(TagOpen "link" as):ts) =
|
2011-11-19 00:20:00 -08:00
|
|
|
case fromAttrib "href" t of
|
2017-02-24 13:11:29 +01:00
|
|
|
[] -> (t:) <$> convertTags sourceURL ts
|
2011-11-19 19:30:27 -08:00
|
|
|
src -> do
|
2017-02-24 11:55:15 +01:00
|
|
|
res <- getData sourceURL (fromAttrib "type" t) src
|
|
|
|
case res of
|
2017-02-24 13:11:29 +01:00
|
|
|
Left dataUri -> do
|
|
|
|
rest <- convertTags sourceURL ts
|
|
|
|
return $ TagOpen "link"
|
|
|
|
(("href",dataUri) : [(x,y) | (x,y) <- as, x /= "href"]) :
|
|
|
|
rest
|
2017-02-24 11:55:15 +01:00
|
|
|
Right (mime, bs)
|
2017-02-24 13:11:29 +01:00
|
|
|
| mime == "text/css" && not ("</" `B.isInfixOf` bs) -> do
|
|
|
|
rest <- convertTags sourceURL $
|
|
|
|
dropWhile (==TagClose "link") ts
|
|
|
|
return $
|
2017-02-24 11:55:15 +01:00
|
|
|
TagOpen "style" [("type", "text/css")]
|
2017-02-24 13:11:29 +01:00
|
|
|
: TagText (toString bs)
|
|
|
|
: TagClose "style"
|
|
|
|
: rest
|
|
|
|
| otherwise -> do
|
|
|
|
rest <- convertTags sourceURL ts
|
|
|
|
return $ TagOpen "link"
|
|
|
|
(("href",makeDataURI (mime, bs)) :
|
|
|
|
[(x,y) | (x,y) <- as, x /= "href"]) : rest
|
|
|
|
convertTags sourceURL (t:ts) = (t:) <$> convertTags sourceURL ts
|
2011-11-19 00:20:00 -08:00
|
|
|
|
2017-02-23 15:00:00 +01:00
|
|
|
cssURLs :: PandocMonad m
|
|
|
|
=> Maybe String -> FilePath -> ByteString -> m ByteString
|
|
|
|
cssURLs sourceURL d orig = do
|
|
|
|
res <- runParserT (parseCSSUrls sourceURL d) () "css" orig
|
2015-06-28 11:51:35 -07:00
|
|
|
case res of
|
2017-02-23 15:00:00 +01:00
|
|
|
Left e -> do
|
|
|
|
report $ CouldNotParseCSS (show e)
|
|
|
|
return orig
|
2015-06-28 11:51:35 -07:00
|
|
|
Right bs -> return bs
|
|
|
|
|
2017-02-23 15:00:00 +01:00
|
|
|
parseCSSUrls :: PandocMonad m
|
|
|
|
=> Maybe String -> FilePath -> ParsecT ByteString () m ByteString
|
|
|
|
parseCSSUrls sourceURL d = B.concat <$> P.many
|
2017-05-05 23:23:49 +02:00
|
|
|
(pCSSWhite <|> pCSSComment <|> pCSSUrl sourceURL d <|> pCSSOther)
|
2015-06-28 11:51:35 -07:00
|
|
|
|
2015-07-15 08:15:08 -07:00
|
|
|
-- Note: some whitespace in CSS is significant, so we can't collapse it!
|
2017-02-23 15:00:00 +01:00
|
|
|
pCSSWhite :: PandocMonad m => ParsecT ByteString () m ByteString
|
2015-07-15 08:15:08 -07:00
|
|
|
pCSSWhite = B.singleton <$> P.space <* P.spaces
|
2015-06-28 11:51:35 -07:00
|
|
|
|
2017-02-23 15:00:00 +01:00
|
|
|
pCSSComment :: PandocMonad m => ParsecT ByteString () m ByteString
|
2015-06-28 11:51:35 -07:00
|
|
|
pCSSComment = P.try $ do
|
|
|
|
P.string "/*"
|
|
|
|
P.manyTill P.anyChar (P.try (P.string "*/"))
|
|
|
|
return B.empty
|
|
|
|
|
2017-02-23 15:00:00 +01:00
|
|
|
pCSSOther :: PandocMonad m => ParsecT ByteString () m ByteString
|
2015-06-28 11:51:35 -07:00
|
|
|
pCSSOther = do
|
|
|
|
(B.pack <$> P.many1 (P.noneOf "u/ \n\r\t")) <|>
|
|
|
|
(B.singleton <$> P.char 'u') <|>
|
|
|
|
(B.singleton <$> P.char '/')
|
|
|
|
|
2017-02-23 15:00:00 +01:00
|
|
|
pCSSUrl :: PandocMonad m
|
2017-05-05 23:23:49 +02:00
|
|
|
=> Maybe String -> FilePath -> ParsecT ByteString () m ByteString
|
|
|
|
pCSSUrl sourceURL d = P.try $ do
|
2015-06-28 11:51:35 -07:00
|
|
|
P.string "url("
|
|
|
|
P.spaces
|
|
|
|
quote <- P.option Nothing (Just <$> P.oneOf "\"'")
|
|
|
|
url <- P.manyTill P.anyChar (maybe (P.lookAhead (P.char ')')) P.char quote)
|
|
|
|
P.spaces
|
|
|
|
P.char ')'
|
2017-05-05 23:23:49 +02:00
|
|
|
let fallback = B.pack ("url(" ++ maybe "" (:[]) quote ++ trim url ++
|
|
|
|
maybe "" (:[]) quote ++ ")")
|
2017-05-05 17:03:27 +02:00
|
|
|
-- pipes are used in URLs provided by Google Code fonts
|
|
|
|
-- but parseURI doesn't like them, so we escape them:
|
|
|
|
case escapeURIString (/='|') (trim url) of
|
2015-06-28 11:51:35 -07:00
|
|
|
'#':_ -> return fallback
|
|
|
|
'd':'a':'t':'a':':':_ -> return fallback
|
|
|
|
u -> do let url' = if isURI u then u else d </> u
|
2017-05-05 17:03:27 +02:00
|
|
|
res <- lift $ getData sourceURL "" url'
|
|
|
|
case res of
|
|
|
|
Left uri -> return (B.pack $ "url(" ++ uri ++ ")")
|
|
|
|
Right (mt, raw) -> do
|
|
|
|
-- note that the downloaded content may
|
|
|
|
-- itself contain url(...).
|
|
|
|
raw' <- cssURLs sourceURL d raw
|
|
|
|
let enc = makeDataURI (mt, raw')
|
|
|
|
return (B.pack $ "url(" ++ enc ++ ")")
|
2015-06-28 11:51:35 -07:00
|
|
|
|
2017-02-23 15:00:00 +01:00
|
|
|
getDataURI :: PandocMonad m => Maybe String -> MimeType -> String -> m String
|
|
|
|
getDataURI sourceURL mimetype src = do
|
2017-02-24 11:27:52 +01:00
|
|
|
res <- getData sourceURL mimetype src
|
|
|
|
case res of
|
|
|
|
Left uri -> return uri
|
|
|
|
Right x -> return $ makeDataURI x
|
|
|
|
|
|
|
|
getData :: PandocMonad m
|
|
|
|
=> Maybe String -> MimeType -> String
|
|
|
|
-> m (Either String (MimeType, ByteString))
|
|
|
|
getData _ _ src@('d':'a':'t':'a':':':_) = return $ Left src-- already data: uri
|
|
|
|
getData sourceURL mimetype src = do
|
2011-11-19 19:30:27 -08:00
|
|
|
let ext = map toLower $ takeExtension src
|
2017-02-23 15:00:00 +01:00
|
|
|
(raw, respMime) <- fetchItem sourceURL src
|
2011-12-04 15:58:31 -08:00
|
|
|
let raw' = if ext == ".gz"
|
|
|
|
then B.concat $ L.toChunks $ Gzip.decompress $ L.fromChunks
|
|
|
|
$ [raw]
|
|
|
|
else raw
|
2017-02-23 15:00:00 +01:00
|
|
|
mime <- case (mimetype, respMime) of
|
|
|
|
("",Nothing) -> throwError $ PandocSomeError
|
2011-12-04 15:58:31 -08:00
|
|
|
$ "Could not determine mime type for `" ++ src ++ "'"
|
2017-02-23 15:00:00 +01:00
|
|
|
(x, Nothing) -> return x
|
|
|
|
(_, Just x ) -> return x
|
2014-08-02 16:33:06 -07:00
|
|
|
let cssSourceURL = case parseURI src of
|
|
|
|
Just u
|
|
|
|
| uriScheme u `elem` ["http:","https:"] ->
|
|
|
|
Just $ show u{ uriPath = "",
|
|
|
|
uriQuery = "",
|
|
|
|
uriFragment = "" }
|
|
|
|
_ -> Nothing
|
2011-11-20 12:04:47 -08:00
|
|
|
result <- if mime == "text/css"
|
2017-02-23 15:00:00 +01:00
|
|
|
then cssURLs cssSourceURL (takeDirectory src) raw'
|
2011-12-04 15:58:31 -08:00
|
|
|
else return raw'
|
2017-02-24 11:27:52 +01:00
|
|
|
return $ Right (mime, result)
|
|
|
|
|
|
|
|
|
2011-11-19 00:20:00 -08:00
|
|
|
|
2011-11-21 15:09:42 -08:00
|
|
|
-- | Convert HTML into self-contained HTML, incorporating images,
|
2014-08-02 16:07:19 -07:00
|
|
|
-- scripts, and CSS using data: URIs.
|
2017-02-23 15:00:00 +01:00
|
|
|
makeSelfContained :: PandocMonad m => WriterOptions -> String -> m String
|
|
|
|
makeSelfContained opts inp = do
|
2011-11-19 00:20:00 -08:00
|
|
|
let tags = parseTags inp
|
2017-02-24 13:11:29 +01:00
|
|
|
out' <- convertTags (writerSourceURL opts) tags
|
2012-02-17 10:44:46 -08:00
|
|
|
return $ renderTags' out'
|