pandoc/src/Text/Pandoc/SelfContained.hs

292 lines
12 KiB
Haskell
Raw Normal View History

2011-11-19 00:20:00 -08:00
{-# LANGUAGE OverloadedStrings #-}
{-
Copyright (C) 2011 John MacFarlane <jgm@berkeley.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-}
{- |
Module : Text.Pandoc.SelfContained
2011-11-19 00:20:00 -08:00
Copyright : Copyright (C) 2011 John MacFarlane
License : GNU GPL, version 2 or above
Maintainer : John MacFarlane <jgm@berkeley.edu>
Stability : alpha
Portability : portable
Functions for converting an HTML file into one that can be viewed
offline, by incorporating linked images, CSS, and scripts into
the HTML using data URIs.
-}
module Text.Pandoc.SelfContained ( makeSelfContained ) where
2011-11-19 00:20:00 -08:00
import Text.HTML.TagSoup
import Network.URI (isAbsoluteURI, parseURI, escapeURIString)
import Network.HTTP
import Data.ByteString.Base64
import qualified Data.ByteString.Char8 as B
import Data.ByteString (ByteString)
import Data.ByteString.UTF8 (toString, fromString)
import System.FilePath (takeExtension, dropExtension, takeDirectory, (</>))
2011-11-19 00:20:00 -08:00
import Data.Char (toLower, isAscii, isAlphaNum)
import Codec.Compression.GZip as Gzip
import qualified Data.ByteString.Lazy as L
import Text.Pandoc.Shared (findDataFile)
import System.Directory (doesFileExist)
2011-11-19 00:20:00 -08:00
getItem :: Maybe FilePath -> String -> IO (ByteString, Maybe String)
getItem userdata f =
2011-11-19 00:20:00 -08:00
if isAbsoluteURI f
then openURL f
else do
let ext = case takeExtension f of
".gz" -> takeExtension $ dropExtension f
x -> x
exists <- doesFileExist f
if exists
then do
cont <- B.readFile f
return (cont, mimeTypeFor ext)
else do
res <- findDataFile userdata f
exists' <- doesFileExist res
if exists'
then do
cont <- B.readFile res
return (cont, mimeTypeFor ext)
else error $ "Could not find `" ++ f ++ "'"
2011-11-19 00:20:00 -08:00
-- TODO - have this return mime type too - then it can work for google
-- chart API, e.g.
openURL :: String -> IO (ByteString, Maybe String)
openURL u = getBodyAndMimeType =<< simpleHTTP (getReq u)
where getReq v = case parseURI v of
Nothing -> error $ "Could not parse URI: " ++ v
2011-11-19 00:20:00 -08:00
Just u' -> mkRequest GET u'
getBodyAndMimeType (Left err) = fail (show err)
getBodyAndMimeType (Right r) = return (rspBody r, findHeader HdrContentType r)
2011-11-19 00:20:00 -08:00
mimeTypeFor :: String -> Maybe String
mimeTypeFor s = lookup (map toLower s) mimetypes
2011-11-19 00:20:00 -08:00
where mimetypes = [ -- taken from MissingH
(".a", "application/octet-stream"),
(".ai", "application/postscript"),
(".aif", "audio/x-aiff"),
(".aifc", "audio/x-aiff"),
(".aiff", "audio/x-aiff"),
(".au", "audio/basic"),
(".avi", "video/x-msvideo"),
(".bat", "text/plain"),
(".bcpio", "application/x-bcpio"),
(".bin", "application/octet-stream"),
(".bmp", "image/x-ms-bmp"),
(".c", "text/plain"),
(".cdf", "application/x-netcdf"),
(".cpio", "application/x-cpio"),
(".csh", "application/x-csh"),
(".css", "text/css"),
(".dll", "application/octet-stream"),
(".doc", "application/msword"),
(".dot", "application/msword"),
(".dvi", "application/x-dvi"),
(".eml", "message/rfc822"),
(".eps", "application/postscript"),
(".etx", "text/x-setext"),
(".exe", "application/octet-stream"),
(".gif", "image/gif"),
(".gtar", "application/x-gtar"),
(".h", "text/plain"),
(".hdf", "application/x-hdf"),
(".htm", "text/html"),
(".html", "text/html"),
(".ief", "image/ief"),
(".jpe", "image/jpeg"),
(".jpeg", "image/jpeg"),
(".jpg", "image/jpeg"),
(".js", "application/x-javascript"),
(".ksh", "text/plain"),
(".latex", "application/x-latex"),
(".m1v", "video/mpeg"),
(".man", "application/x-troff-man"),
(".me", "application/x-troff-me"),
(".mht", "message/rfc822"),
(".mhtml", "message/rfc822"),
(".mif", "application/x-mif"),
(".mov", "video/quicktime"),
(".movie", "video/x-sgi-movie"),
(".mp2", "audio/mpeg"),
(".mp3", "audio/mpeg"),
(".mpa", "video/mpeg"),
(".mpe", "video/mpeg"),
(".mpeg", "video/mpeg"),
(".mpg", "video/mpeg"),
(".ms", "application/x-troff-ms"),
(".nc", "application/x-netcdf"),
(".nws", "message/rfc822"),
(".o", "application/octet-stream"),
(".obj", "application/octet-stream"),
(".oda", "application/oda"),
(".p12", "application/x-pkcs12"),
(".p7c", "application/pkcs7-mime"),
(".pbm", "image/x-portable-bitmap"),
(".pdf", "application/pdf"),
(".pfx", "application/x-pkcs12"),
(".pgm", "image/x-portable-graymap"),
(".pl", "text/plain"),
(".png", "image/png"),
(".pnm", "image/x-portable-anymap"),
(".pot", "application/vnd.ms-powerpoint"),
(".ppa", "application/vnd.ms-powerpoint"),
(".ppm", "image/x-portable-pixmap"),
(".pps", "application/vnd.ms-powerpoint"),
(".ppt", "application/vnd.ms-powerpoint"),
(".ps", "application/postscript"),
(".pwz", "application/vnd.ms-powerpoint"),
(".py", "text/x-python"),
(".pyc", "application/x-python-code"),
(".pyo", "application/x-python-code"),
(".qt", "video/quicktime"),
(".ra", "audio/x-pn-realaudio"),
(".ram", "application/x-pn-realaudio"),
(".ras", "image/x-cmu-raster"),
(".rdf", "application/xml"),
(".rgb", "image/x-rgb"),
(".roff", "application/x-troff"),
(".rtx", "text/richtext"),
(".sgm", "text/x-sgml"),
(".sgml", "text/x-sgml"),
(".sh", "application/x-sh"),
(".shar", "application/x-shar"),
(".snd", "audio/basic"),
(".so", "application/octet-stream"),
(".src", "application/x-wais-source"),
(".sv4cpio", "application/x-sv4cpio"),
(".sv4crc", "application/x-sv4crc"),
(".swf", "application/x-shockwave-flash"),
(".t", "application/x-troff"),
(".tar", "application/x-tar"),
(".tcl", "application/x-tcl"),
(".tex", "application/x-tex"),
(".texi", "application/x-texinfo"),
(".texinfo", "application/x-texinfo"),
(".tif", "image/tiff"),
(".tiff", "image/tiff"),
(".tr", "application/x-troff"),
(".tsv", "text/tab-separated-values"),
(".txt", "text/plain"),
(".ustar", "application/x-ustar"),
(".vcf", "text/x-vcard"),
(".wav", "audio/x-wav"),
(".wiz", "application/msword"),
(".xbm", "image/x-xbitmap"),
(".xlb", "application/vnd.ms-excel"),
(".xls", "application/vnd.ms-excel"),
(".xml", "text/xml"),
(".xpm", "image/x-xpixmap"),
(".xsl", "application/xml"),
(".xwd", "image/x-xwindowdump"),
(".zip", "application/zip"),
(".jpg", "image/jpg"),
(".mid", "audio/midi"),
(".midi", "audio/midi"),
(".pct", "image/pict"),
(".pic", "image/pict"),
(".pict", "image/pict"),
(".rtf", "application/rtf"),
(".xul", "text/xul")
]
isOk :: Char -> Bool
2011-11-19 00:20:00 -08:00
isOk c = isAscii c && isAlphaNum c
convertTag :: Maybe FilePath -> Tag String -> IO (Tag String)
convertTag userdata t@(TagOpen "img" as) =
2011-11-19 00:20:00 -08:00
case fromAttrib "src" t of
2011-11-19 19:30:27 -08:00
[] -> return t
src -> do
(raw, mime) <- getRaw userdata (fromAttrib "type" t) src
2011-11-19 19:30:27 -08:00
let enc = "data:" ++ mime ++ ";base64," ++ toString (encode raw)
2011-11-19 00:20:00 -08:00
return $ TagOpen "img" (("src",enc) : [(x,y) | (x,y) <- as, x /= "src"])
convertTag userdata t@(TagOpen "video" as) =
case fromAttrib "src" t of
[] -> return t
src -> do
(raw, mime) <- getRaw userdata (fromAttrib "type" t) src
let enc = "data:" ++ mime ++ ";base64," ++ toString (encode raw)
return $ TagOpen "video" (("src",enc) : [(x,y) | (x,y) <- as, x /= "src"])
convertTag userdata t@(TagOpen "script" as) =
2011-11-19 00:20:00 -08:00
case fromAttrib "src" t of
2011-11-19 19:30:27 -08:00
[] -> return t
src -> do
(raw, mime) <- getRaw userdata (fromAttrib "type" t) src
2011-11-19 19:30:27 -08:00
let enc = "data:" ++ mime ++ "," ++ escapeURIString isOk (toString raw)
2011-11-19 00:20:00 -08:00
return $ TagOpen "script" (("src",enc) : [(x,y) | (x,y) <- as, x /= "src"])
convertTag userdata t@(TagOpen "link" as) =
2011-11-19 00:20:00 -08:00
case fromAttrib "href" t of
2011-11-19 19:30:27 -08:00
[] -> return t
src -> do
(raw, mime) <- getRaw userdata (fromAttrib "type" t) src
2011-11-19 19:30:27 -08:00
let enc = "data:" ++ mime ++ "," ++ escapeURIString isOk (toString raw)
2011-11-19 00:20:00 -08:00
return $ TagOpen "link" (("href",enc) : [(x,y) | (x,y) <- as, x /= "href"])
convertTag _ t = return t
2011-11-19 00:20:00 -08:00
cssURLs :: Maybe FilePath -> FilePath -> ByteString -> IO ByteString
cssURLs userdata d orig =
case B.breakSubstring "url(" orig of
(x,y) | B.null y -> return orig
| otherwise -> do
let (u,v) = B.breakSubstring ")" $ B.drop 4 y
let url = toString
$ case B.take 1 u of
"\"" -> B.takeWhile (/='"') $ B.drop 1 u
_ -> u
(raw, mime) <- getRaw userdata "" (d </> url)
rest <- cssURLs userdata d v
let enc = "data:" `B.append` fromString mime `B.append`
";base64," `B.append` (encode raw)
return $ x `B.append` "url(" `B.append` enc `B.append` rest
getRaw :: Maybe FilePath -> String -> String -> IO (ByteString, String)
getRaw userdata mimetype src = do
2011-11-19 19:30:27 -08:00
let ext = map toLower $ takeExtension src
(raw, respMime) <- getItem userdata src
let raw' = if ext == ".gz"
then B.concat $ L.toChunks $ Gzip.decompress $ L.fromChunks
$ [raw]
else raw
let mime = case (mimetype, respMime) of
("",Nothing) -> error
$ "Could not determine mime type for `" ++ src ++ "'"
(x, Nothing) -> x
(_, Just x ) -> x
result <- if mime == "text/css"
then cssURLs userdata (takeDirectory src) raw'
else return raw'
return (result, mime)
2011-11-19 00:20:00 -08:00
-- | Convert HTML into self-contained HTML, incorporating images,
-- scripts, and CSS using data: URIs. Items specified using absolute
-- URLs will be downloaded; those specified using relative URLs will
-- be sought first relative to the working directory, then relative
-- to the user data directory (if the first parameter is 'Just'
-- a directory), and finally relative to pandoc's default data
-- directory.
makeSelfContained :: Maybe FilePath -> String -> IO String
makeSelfContained userdata inp = do
2011-11-19 00:20:00 -08:00
let tags = parseTags inp
out' <- mapM (convertTag userdata) tags
2011-11-19 00:20:00 -08:00
return $ renderTagsOptions renderOptions{ optMinimize = (\t -> t == "br"
|| t == "img" || t == "meta" || t == "link" ) } out'
2011-11-19 00:20:00 -08:00