2014-07-30 00:47:54 +01:00
|
|
|
{-# LANGUAGE
|
|
|
|
ViewPatterns
|
|
|
|
, StandaloneDeriving
|
2014-07-31 15:35:40 +01:00
|
|
|
, TupleSections
|
2014-07-30 00:47:54 +01:00
|
|
|
, FlexibleContexts #-}
|
|
|
|
|
|
|
|
module Text.Pandoc.Readers.EPUB
|
|
|
|
(readEPUB)
|
|
|
|
where
|
|
|
|
|
|
|
|
import Text.XML.Light
|
|
|
|
import Text.Pandoc.Definition hiding (Attr)
|
2014-07-31 15:35:40 +01:00
|
|
|
import Text.Pandoc.Walk (walk, query)
|
2014-07-30 00:47:54 +01:00
|
|
|
import Text.Pandoc.Generic(bottomUp)
|
|
|
|
import Text.Pandoc.Readers.HTML (readHtml)
|
2014-07-31 15:35:40 +01:00
|
|
|
import Text.Pandoc.Options ( ReaderOptions(..), readerExtensions, Extension(..)
|
|
|
|
, readerTrace)
|
2014-08-08 22:10:32 +01:00
|
|
|
import Text.Pandoc.Shared (escapeURI, collapseFilePath)
|
2014-07-31 15:35:40 +01:00
|
|
|
import Text.Pandoc.MediaBag (MediaBag, insertMedia)
|
2014-07-30 00:47:54 +01:00
|
|
|
import Text.Pandoc.Compat.Except (MonadError, throwError, runExcept, Except)
|
|
|
|
import qualified Text.Pandoc.Builder as B
|
|
|
|
import Codec.Archive.Zip ( Archive (..), toArchive, fromEntry
|
|
|
|
, findEntryByPath, Entry)
|
|
|
|
import qualified Data.ByteString.Lazy as BL (ByteString)
|
2014-08-05 12:51:18 +01:00
|
|
|
import System.FilePath ( takeFileName, (</>), dropFileName, normalise
|
2014-08-08 22:10:32 +01:00
|
|
|
, dropFileName
|
2014-08-05 12:51:18 +01:00
|
|
|
, splitFileName )
|
2014-07-30 00:47:54 +01:00
|
|
|
import qualified Text.Pandoc.UTF8 as UTF8 (toStringLazy)
|
|
|
|
import Control.Applicative ((<$>))
|
2014-07-31 15:35:40 +01:00
|
|
|
import Control.Monad (guard, liftM, when)
|
2014-07-30 00:47:54 +01:00
|
|
|
import Data.Monoid (mempty, (<>))
|
|
|
|
import Data.List (isPrefixOf, isInfixOf)
|
|
|
|
import Data.Maybe (mapMaybe, fromMaybe)
|
2014-07-31 15:35:40 +01:00
|
|
|
import qualified Data.Map as M (Map, lookup, fromList, elems)
|
2014-07-30 00:47:54 +01:00
|
|
|
import qualified Data.Set as S (insert)
|
|
|
|
import Control.DeepSeq.Generics (deepseq, NFData)
|
|
|
|
|
2014-07-31 15:35:40 +01:00
|
|
|
import Debug.Trace (trace)
|
|
|
|
|
2014-07-30 00:47:54 +01:00
|
|
|
type MIME = String
|
|
|
|
|
|
|
|
type Items = M.Map String (FilePath, MIME)
|
|
|
|
|
2014-07-31 15:35:40 +01:00
|
|
|
readEPUB :: ReaderOptions -> BL.ByteString -> (Pandoc, MediaBag)
|
2014-07-30 00:47:54 +01:00
|
|
|
readEPUB opts bytes = runEPUB (archiveToEPUB opts $ toArchive bytes)
|
|
|
|
|
|
|
|
runEPUB :: Except String a -> a
|
|
|
|
runEPUB = either error id . runExcept
|
|
|
|
|
|
|
|
-- Note that internal reference are aggresively normalised so that all ids
|
|
|
|
-- are of the form "filename#id"
|
|
|
|
--
|
2014-07-31 15:35:40 +01:00
|
|
|
archiveToEPUB :: (MonadError String m) => ReaderOptions -> Archive -> m (Pandoc, MediaBag)
|
2014-08-08 19:58:23 +01:00
|
|
|
archiveToEPUB (setEPUBOptions -> os) archive = do
|
2014-07-30 00:47:54 +01:00
|
|
|
(root, content) <- getManifest archive
|
|
|
|
meta <- parseMeta content
|
|
|
|
(cover, items) <- parseManifest content
|
2014-08-05 12:51:18 +01:00
|
|
|
let coverDoc = fromMaybe mempty (imageToPandoc <$> cover)
|
2014-07-30 00:47:54 +01:00
|
|
|
spine <- parseSpine items content
|
|
|
|
let escapedSpine = map (escapeURI . takeFileName . fst) spine
|
|
|
|
Pandoc _ bs <-
|
|
|
|
foldM' (\a b -> ((a <>) . bottomUp (prependHash escapedSpine))
|
|
|
|
`liftM` parseSpineElem root b) mempty spine
|
2014-07-31 15:35:40 +01:00
|
|
|
let ast = coverDoc <> (Pandoc meta bs)
|
|
|
|
let mediaBag = fetchImages (M.elems items) root archive ast
|
|
|
|
return $ (ast, mediaBag)
|
2014-07-30 00:47:54 +01:00
|
|
|
where
|
|
|
|
parseSpineElem :: MonadError String m => FilePath -> (FilePath, MIME) -> m Pandoc
|
2014-08-05 12:51:18 +01:00
|
|
|
parseSpineElem (normalise -> r) (normalise -> path, mime) = do
|
2014-07-31 15:35:40 +01:00
|
|
|
when (readerTrace os) (traceM path)
|
2014-08-05 12:51:18 +01:00
|
|
|
doc <- mimeToReader mime r path
|
2014-07-30 00:47:54 +01:00
|
|
|
let docSpan = B.doc $ B.para $ B.spanWith (takeFileName path, [], []) mempty
|
2014-08-05 12:51:18 +01:00
|
|
|
return $ docSpan <> doc
|
|
|
|
mimeToReader :: MonadError String m => MIME -> FilePath -> FilePath -> m Pandoc
|
|
|
|
mimeToReader "application/xhtml+xml" r path = do
|
|
|
|
fname <- findEntryByPathE (r </> path) archive
|
|
|
|
return $ fixInternalReferences (r </> path) .
|
2014-08-08 19:58:23 +01:00
|
|
|
readHtml os .
|
2014-08-05 12:51:18 +01:00
|
|
|
UTF8.toStringLazy $
|
|
|
|
fromEntry fname
|
|
|
|
mimeToReader s _ path
|
2014-07-30 00:47:54 +01:00
|
|
|
| s `elem` imageMimes = return $ imageToPandoc path
|
|
|
|
| otherwise = return $ mempty
|
|
|
|
|
2014-08-08 19:58:23 +01:00
|
|
|
setEPUBOptions :: ReaderOptions -> ReaderOptions
|
|
|
|
setEPUBOptions os = os''
|
|
|
|
where
|
|
|
|
rs = readerExtensions os
|
|
|
|
os' = os {readerExtensions = foldr S.insert rs [Ext_epub_html_exts]}
|
|
|
|
os'' = os' {readerParseRaw = True}
|
|
|
|
|
2014-07-31 15:35:40 +01:00
|
|
|
fetchImages :: [(FilePath, MIME)]
|
|
|
|
-> FilePath
|
|
|
|
-> Archive
|
|
|
|
-> Pandoc
|
|
|
|
-> MediaBag
|
2014-08-05 12:51:18 +01:00
|
|
|
fetchImages mimes root arc (query iq -> links) =
|
2014-07-31 15:35:40 +01:00
|
|
|
foldr (uncurry3 insertMedia) mempty
|
|
|
|
(mapMaybe getEntry links)
|
|
|
|
where
|
2014-08-05 12:51:18 +01:00
|
|
|
getEntry (normalise -> l) =
|
|
|
|
let mediaPos = normalise (root </> l) in
|
|
|
|
(l , lookup mediaPos mimes, ) . fromEntry
|
|
|
|
<$> findEntryByPath mediaPos arc
|
2014-07-31 15:35:40 +01:00
|
|
|
|
|
|
|
iq :: Inline -> [FilePath]
|
|
|
|
iq (Image _ (url, _)) = [url]
|
|
|
|
iq _ = []
|
|
|
|
|
2014-08-05 12:51:18 +01:00
|
|
|
-- Remove relative paths
|
|
|
|
renameImages :: FilePath -> Inline -> Inline
|
2014-08-08 22:10:32 +01:00
|
|
|
renameImages root (Image a (url, b)) = Image a (collapseFilePath (root </> url), b)
|
2014-08-05 12:51:18 +01:00
|
|
|
renameImages _ x = x
|
|
|
|
|
2014-07-30 00:47:54 +01:00
|
|
|
imageToPandoc :: FilePath -> Pandoc
|
|
|
|
imageToPandoc s = B.doc . B.para $ B.image s "" mempty
|
|
|
|
|
|
|
|
imageMimes :: [String]
|
|
|
|
imageMimes = ["image/gif", "image/jpeg", "image/png"]
|
|
|
|
|
|
|
|
type CoverImage = FilePath
|
|
|
|
|
|
|
|
parseManifest :: (MonadError String m) => Element -> m (Maybe CoverImage, Items)
|
|
|
|
parseManifest content = do
|
|
|
|
manifest <- findElementE (dfName "manifest") content
|
|
|
|
let items = findChildren (dfName "item") manifest
|
|
|
|
r <- mapM parseItem items
|
|
|
|
let cover = findAttr (emptyName "href") =<< filterChild findCover manifest
|
|
|
|
return (cover, (M.fromList r))
|
|
|
|
where
|
|
|
|
findCover e = maybe False (isInfixOf "cover-image")
|
|
|
|
(findAttr (emptyName "properties") e)
|
|
|
|
parseItem e = do
|
|
|
|
uid <- findAttrE (emptyName "id") e
|
|
|
|
href <- findAttrE (emptyName "href") e
|
|
|
|
mime <- findAttrE (emptyName "media-type") e
|
|
|
|
return (uid, (href, mime))
|
|
|
|
|
|
|
|
parseSpine :: MonadError String m => Items -> Element -> m [(FilePath, MIME)]
|
|
|
|
parseSpine is e = do
|
|
|
|
spine <- findElementE (dfName "spine") e
|
|
|
|
let itemRefs = findChildren (dfName "itemref") spine
|
|
|
|
mapM (mkE "parseSpine" . (flip M.lookup is)) $ mapMaybe parseItemRef itemRefs
|
|
|
|
where
|
|
|
|
parseItemRef ref = do
|
|
|
|
let linear = maybe True (== "yes") (findAttr (emptyName "linear") ref)
|
|
|
|
guard linear
|
|
|
|
findAttr (emptyName "idref") ref
|
|
|
|
|
|
|
|
parseMeta :: MonadError String m => Element -> m Meta
|
|
|
|
parseMeta content = do
|
|
|
|
meta <- findElementE (dfName "metadata") content
|
|
|
|
let dcspace (QName _ (Just "http://purl.org/dc/elements/1.1/") (Just "dc")) = True
|
|
|
|
dcspace _ = False
|
|
|
|
let dcs = filterChildrenName dcspace meta
|
|
|
|
let r = foldr parseMetaItem nullMeta dcs
|
|
|
|
return r
|
|
|
|
|
|
|
|
-- http://www.idpf.org/epub/30/spec/epub30-publications.html#sec-metadata-elem
|
|
|
|
parseMetaItem :: Element -> Meta -> Meta
|
|
|
|
parseMetaItem e@(stripNamespace . elName -> field) meta =
|
|
|
|
B.setMeta (renameMeta field) (B.str $ strContent e) meta
|
|
|
|
|
|
|
|
renameMeta :: String -> String
|
|
|
|
renameMeta "creator" = "author"
|
|
|
|
renameMeta s = s
|
|
|
|
|
|
|
|
getManifest :: MonadError String m => Archive -> m (String, Element)
|
|
|
|
getManifest archive = do
|
|
|
|
metaEntry <- findEntryByPathE ("META-INF" </> "container.xml") archive
|
|
|
|
docElem <- (parseXMLDocE . UTF8.toStringLazy . fromEntry) metaEntry
|
|
|
|
let namespaces = mapMaybe attrToNSPair (elAttribs docElem)
|
|
|
|
ns <- mkE "xmlns not in namespaces" (lookup "xmlns" namespaces)
|
|
|
|
as <- liftM ((map attrToPair) . elAttribs)
|
|
|
|
(findElementE (QName "rootfile" (Just ns) Nothing) docElem)
|
|
|
|
root <- mkE "Root not found" (lookup "full-path" as)
|
|
|
|
let rootdir = dropFileName root
|
|
|
|
--mime <- lookup "media-type" as
|
|
|
|
manifest <- findEntryByPathE root archive
|
|
|
|
liftM ((,) rootdir) (parseXMLDocE . UTF8.toStringLazy . fromEntry $ manifest)
|
|
|
|
|
|
|
|
-- Fixup
|
|
|
|
|
2014-08-05 12:51:18 +01:00
|
|
|
fixInternalReferences :: FilePath -> Pandoc -> Pandoc
|
|
|
|
fixInternalReferences pathToFile =
|
|
|
|
(walk $ renameImages root)
|
|
|
|
. (walk normalisePath)
|
|
|
|
. (walk $ fixBlockIRs filename)
|
|
|
|
. (walk $ fixInlineIRs filename)
|
2014-07-30 00:47:54 +01:00
|
|
|
where
|
2014-08-05 12:51:18 +01:00
|
|
|
(root, escapeURI -> filename) = splitFileName pathToFile
|
2014-07-30 00:47:54 +01:00
|
|
|
|
|
|
|
fixInlineIRs :: String -> Inline -> Inline
|
|
|
|
fixInlineIRs s (Span as v) =
|
|
|
|
Span (fixAttrs s as) v
|
|
|
|
fixInlineIRs s (Code as code) =
|
|
|
|
Code (fixAttrs s as) code
|
|
|
|
fixInlineIRs s (Link t ('#':url, tit)) =
|
|
|
|
Link t (addHash s url, tit)
|
|
|
|
fixInlineIRs _ v = v
|
|
|
|
|
|
|
|
normalisePath :: Inline -> Inline
|
|
|
|
normalisePath (Link t (url, tit)) =
|
|
|
|
let (path, uid) = span (/= '#') url in
|
|
|
|
Link t (takeFileName path ++ uid, tit)
|
|
|
|
normalisePath s = s
|
|
|
|
|
|
|
|
prependHash :: [String] -> Inline -> Inline
|
|
|
|
prependHash ps l@(Link is (url, tit))
|
|
|
|
| or [s `isPrefixOf` url | s <- ps] =
|
|
|
|
Link is ('#':url, tit)
|
|
|
|
| otherwise = l
|
|
|
|
prependHash _ i = i
|
|
|
|
|
|
|
|
fixBlockIRs :: String -> Block -> Block
|
|
|
|
fixBlockIRs s (Div as b) =
|
|
|
|
Div (fixAttrs s as) b
|
|
|
|
fixBlockIRs s (Header i as b) =
|
|
|
|
Header i (fixAttrs s as) b
|
|
|
|
fixBlockIRs s (CodeBlock as code) =
|
|
|
|
CodeBlock (fixAttrs s as) code
|
|
|
|
fixBlockIRs _ b = b
|
|
|
|
|
|
|
|
fixAttrs :: FilePath -> B.Attr -> B.Attr
|
|
|
|
fixAttrs s (ident, cs, kvs) = (addHash s ident, filter (not . null) cs, removeEPUBAttrs kvs)
|
|
|
|
|
|
|
|
addHash :: String -> String -> String
|
|
|
|
addHash _ "" = ""
|
|
|
|
addHash s ident = s ++ "#" ++ ident
|
|
|
|
|
|
|
|
removeEPUBAttrs :: [(String, String)] -> [(String, String)]
|
|
|
|
removeEPUBAttrs kvs = filter (not . isEPUBAttr) kvs
|
|
|
|
|
|
|
|
isEPUBAttr :: (String, String) -> Bool
|
|
|
|
isEPUBAttr (k, _) = "epub:" `isPrefixOf` k
|
|
|
|
|
2014-07-31 15:35:40 +01:00
|
|
|
-- Library
|
2014-07-30 00:47:54 +01:00
|
|
|
|
|
|
|
-- Strict version of foldM
|
|
|
|
foldM' :: (Monad m, NFData a) => (a -> b -> m a) -> a -> [b] -> m a
|
|
|
|
foldM' _ z [] = return z
|
|
|
|
foldM' f z (x:xs) = do
|
|
|
|
z' <- f z x
|
|
|
|
z' `deepseq` foldM' f z' xs
|
|
|
|
|
2014-07-31 15:35:40 +01:00
|
|
|
uncurry3 :: (a -> b -> c -> d) -> (a, b, c) -> d
|
|
|
|
uncurry3 f (a, b, c) = f a b c
|
|
|
|
|
2014-08-05 12:51:18 +01:00
|
|
|
traceM :: Monad m => String -> m ()
|
|
|
|
traceM = flip trace (return ())
|
|
|
|
|
2014-07-31 15:35:40 +01:00
|
|
|
-- Utility
|
|
|
|
|
2014-07-30 00:47:54 +01:00
|
|
|
stripNamespace :: QName -> String
|
|
|
|
stripNamespace (QName v _ _) = v
|
|
|
|
|
|
|
|
attrToNSPair :: Attr -> Maybe (String, String)
|
|
|
|
attrToNSPair (Attr (QName "xmlns" _ _) val) = Just ("xmlns", val)
|
|
|
|
attrToNSPair _ = Nothing
|
|
|
|
|
|
|
|
attrToPair :: Attr -> (String, String)
|
|
|
|
attrToPair (Attr (QName name _ _) val) = (name, val)
|
|
|
|
|
|
|
|
defaultNameSpace :: Maybe String
|
|
|
|
defaultNameSpace = Just "http://www.idpf.org/2007/opf"
|
|
|
|
|
|
|
|
dfName :: String -> QName
|
|
|
|
dfName s = QName s defaultNameSpace Nothing
|
|
|
|
|
|
|
|
emptyName :: String -> QName
|
|
|
|
emptyName s = QName s Nothing Nothing
|
|
|
|
|
|
|
|
-- Convert Maybe interface to Either
|
|
|
|
|
|
|
|
findAttrE :: MonadError String m => QName -> Element -> m String
|
|
|
|
findAttrE q e = mkE "findAttr" $ findAttr q e
|
|
|
|
|
|
|
|
findEntryByPathE :: MonadError String m => FilePath -> Archive -> m Entry
|
|
|
|
findEntryByPathE path a = mkE ("No entry on path: " ++ path) $ findEntryByPath path a
|
|
|
|
|
|
|
|
parseXMLDocE :: MonadError String m => String -> m Element
|
|
|
|
parseXMLDocE doc = mkE "Unable to parse XML doc" $ parseXMLDoc doc
|
|
|
|
|
|
|
|
findElementE :: MonadError String m => QName -> Element -> m Element
|
|
|
|
findElementE e x = mkE ("Unable to find element: " ++ show e) $ findElement e x
|
|
|
|
|
|
|
|
mkE :: MonadError String m => String -> Maybe a -> m a
|
|
|
|
mkE s = maybe (throwError s) return
|