pandoc/src/Text/Pandoc/Readers/EPUB.hs

278 lines
10 KiB
Haskell
Raw Normal View History

{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE StandaloneDeriving #-}
{-# LANGUAGE TupleSections #-}
{-# LANGUAGE ViewPatterns #-}
2014-07-30 00:47:54 +01:00
module Text.Pandoc.Readers.EPUB
(readEPUB)
where
import Codec.Archive.Zip (Archive (..), Entry, findEntryByPath, fromEntry,
toArchiveOrFail)
import Control.DeepSeq (NFData, deepseq)
import Control.Monad (guard, liftM)
2016-11-28 17:13:46 -05:00
import Control.Monad.Except (throwError)
2014-07-30 00:47:54 +01:00
import qualified Data.ByteString.Lazy as BL (ByteString)
import qualified Data.Text.Lazy.Encoding as TL
import qualified Data.Text.Lazy as TL
import Data.List (isInfixOf, isPrefixOf)
import qualified Data.Map as M (Map, elems, fromList, lookup)
import Data.Maybe (fromMaybe, mapMaybe)
import Data.Monoid ((<>))
import Network.URI (unEscapeString)
import System.FilePath (dropFileName, dropFileName, normalise, splitFileName,
takeFileName, (</>))
import qualified Text.Pandoc.Builder as B
import Text.Pandoc.Class (PandocMonad, insertMedia)
import Text.Pandoc.Definition hiding (Attr)
import Text.Pandoc.Error
import Text.Pandoc.Extensions (Extension (Ext_raw_html), enableExtension)
import Text.Pandoc.MIME (MimeType)
import Text.Pandoc.Options (ReaderOptions (..))
import Text.Pandoc.Readers.HTML (readHtml)
import Text.Pandoc.Shared (addMetaField, collapseFilePath, escapeURI)
import qualified Text.Pandoc.UTF8 as UTF8 (toStringLazy)
import Text.Pandoc.Walk (query, walk)
import Text.XML.Light
2014-07-30 00:47:54 +01:00
2014-08-17 20:53:46 +04:00
type Items = M.Map String (FilePath, MimeType)
2014-07-30 00:47:54 +01:00
2016-11-28 17:13:46 -05:00
readEPUB :: PandocMonad m => ReaderOptions -> BL.ByteString -> m Pandoc
readEPUB opts bytes = case toArchiveOrFail bytes of
2016-11-28 17:13:46 -05:00
Right archive -> archiveToEPUB opts $ archive
Left _ -> throwError $ PandocParseError "Couldn't extract ePub file"
2014-07-30 00:47:54 +01:00
2016-11-28 17:13:46 -05:00
-- runEPUB :: Except PandocError a -> Either PandocError a
-- runEPUB = runExcept
2014-07-30 00:47:54 +01:00
-- Note that internal reference are aggresively normalised so that all ids
-- are of the form "filename#id"
--
2016-11-28 17:13:46 -05:00
archiveToEPUB :: (PandocMonad m) => ReaderOptions -> Archive -> m Pandoc
2014-07-30 00:47:54 +01:00
archiveToEPUB os archive = do
-- root is path to folder with manifest file in
2014-07-30 00:47:54 +01:00
(root, content) <- getManifest archive
meta <- parseMeta content
(cover, items) <- parseManifest content
-- No need to collapse here as the image path is from the manifest file
let coverDoc = fromMaybe mempty (imageToPandoc <$> cover)
2014-07-30 00:47:54 +01:00
spine <- parseSpine items content
let escapedSpine = map (escapeURI . takeFileName . fst) spine
Pandoc _ bs <-
foldM' (\a b -> ((a <>) . walk (prependHash escapedSpine))
2014-07-30 00:47:54 +01:00
`liftM` parseSpineElem root b) mempty spine
let ast = coverDoc <> (Pandoc meta bs)
fetchImages (M.elems items) root archive ast
2016-11-28 17:13:46 -05:00
return ast
2014-07-30 00:47:54 +01:00
where
os' = os {readerExtensions = enableExtension Ext_raw_html (readerExtensions os)}
2016-11-28 17:13:46 -05:00
parseSpineElem :: PandocMonad m => FilePath -> (FilePath, MimeType) -> m Pandoc
parseSpineElem (normalise -> r) (normalise -> path, mime) = do
doc <- mimeToReader mime r path
2014-07-30 00:47:54 +01:00
let docSpan = B.doc $ B.para $ B.spanWith (takeFileName path, [], []) mempty
return $ docSpan <> doc
2016-11-28 17:13:46 -05:00
mimeToReader :: PandocMonad m => MimeType -> FilePath -> FilePath -> m Pandoc
mimeToReader "application/xhtml+xml" (unEscapeString -> root)
(unEscapeString -> path) = do
fname <- findEntryByPathE (root </> path) archive
html <- readHtml os' . TL.toStrict . TL.decodeUtf8 $ fromEntry fname
2015-02-18 13:02:59 +00:00
return $ fixInternalReferences path html
mimeToReader s _ (unEscapeString -> path)
2014-07-30 00:47:54 +01:00
| s `elem` imageMimes = return $ imageToPandoc path
| otherwise = return $ mempty
-- paths should be absolute when this function is called
-- renameImages should do this
fetchImages :: PandocMonad m
=> [(FilePath, MimeType)]
-> FilePath -- ^ Root
-> Archive
-> Pandoc
-> m ()
fetchImages mimes root arc (query iq -> links) =
mapM_ (uncurry3 insertMedia) (mapMaybe getEntry links)
where
getEntry link =
let abslink = normalise (root </> link) in
(link , lookup link mimes, ) . fromEntry
<$> findEntryByPath abslink arc
iq :: Inline -> [FilePath]
iq (Image _ _ (url, _)) = [url]
iq _ = []
-- Remove relative paths
renameImages :: FilePath -> Inline -> Inline
renameImages root img@(Image attr a (url, b))
| "data:" `isPrefixOf` url = img
| otherwise = Image attr a (collapseFilePath (root </> url), b)
renameImages _ x = x
2014-07-30 00:47:54 +01:00
imageToPandoc :: FilePath -> Pandoc
imageToPandoc s = B.doc . B.para $ B.image s "" mempty
2014-08-17 20:53:46 +04:00
imageMimes :: [MimeType]
2014-07-30 00:47:54 +01:00
imageMimes = ["image/gif", "image/jpeg", "image/png"]
type CoverImage = FilePath
2016-11-28 17:13:46 -05:00
parseManifest :: (PandocMonad m) => Element -> m (Maybe CoverImage, Items)
2014-07-30 00:47:54 +01:00
parseManifest content = do
manifest <- findElementE (dfName "manifest") content
let items = findChildren (dfName "item") manifest
r <- mapM parseItem items
let cover = findAttr (emptyName "href") =<< filterChild findCover manifest
return (cover, (M.fromList r))
where
findCover e = maybe False (isInfixOf "cover-image")
(findAttr (emptyName "properties") e)
parseItem e = do
uid <- findAttrE (emptyName "id") e
href <- findAttrE (emptyName "href") e
mime <- findAttrE (emptyName "media-type") e
return (uid, (href, mime))
2016-11-28 17:13:46 -05:00
parseSpine :: PandocMonad m => Items -> Element -> m [(FilePath, MimeType)]
2014-07-30 00:47:54 +01:00
parseSpine is e = do
spine <- findElementE (dfName "spine") e
let itemRefs = findChildren (dfName "itemref") spine
mapM (mkE "parseSpine" . (flip M.lookup is)) $ mapMaybe parseItemRef itemRefs
where
parseItemRef ref = do
let linear = maybe True (== "yes") (findAttr (emptyName "linear") ref)
guard linear
findAttr (emptyName "idref") ref
2016-11-28 17:13:46 -05:00
parseMeta :: PandocMonad m => Element -> m Meta
2014-07-30 00:47:54 +01:00
parseMeta content = do
meta <- findElementE (dfName "metadata") content
let dcspace (QName _ (Just "http://purl.org/dc/elements/1.1/") (Just "dc")) = True
dcspace _ = False
let dcs = filterChildrenName dcspace meta
let r = foldr parseMetaItem nullMeta dcs
return r
-- http://www.idpf.org/epub/30/spec/epub30-publications.html#sec-metadata-elem
parseMetaItem :: Element -> Meta -> Meta
parseMetaItem e@(stripNamespace . elName -> field) meta =
addMetaField (renameMeta field) (B.str $ strContent e) meta
2014-07-30 00:47:54 +01:00
renameMeta :: String -> String
renameMeta "creator" = "author"
renameMeta s = s
2014-07-30 00:47:54 +01:00
2016-11-28 17:13:46 -05:00
getManifest :: PandocMonad m => Archive -> m (String, Element)
2014-07-30 00:47:54 +01:00
getManifest archive = do
metaEntry <- findEntryByPathE ("META-INF" </> "container.xml") archive
docElem <- (parseXMLDocE . UTF8.toStringLazy . fromEntry) metaEntry
let namespaces = mapMaybe attrToNSPair (elAttribs docElem)
ns <- mkE "xmlns not in namespaces" (lookup "xmlns" namespaces)
as <- liftM ((map attrToPair) . elAttribs)
(findElementE (QName "rootfile" (Just ns) Nothing) docElem)
manifestFile <- mkE "Root not found" (lookup "full-path" as)
let rootdir = dropFileName manifestFile
2014-07-30 00:47:54 +01:00
--mime <- lookup "media-type" as
manifest <- findEntryByPathE manifestFile archive
2014-07-30 00:47:54 +01:00
liftM ((,) rootdir) (parseXMLDocE . UTF8.toStringLazy . fromEntry $ manifest)
-- Fixup
fixInternalReferences :: FilePath -> Pandoc -> Pandoc
fixInternalReferences pathToFile =
(walk $ renameImages root)
. (walk $ fixBlockIRs filename)
. (walk $ fixInlineIRs filename)
2014-07-30 00:47:54 +01:00
where
(root, escapeURI -> filename) = splitFileName pathToFile
2014-07-30 00:47:54 +01:00
fixInlineIRs :: String -> Inline -> Inline
fixInlineIRs s (Span as v) =
Span (fixAttrs s as) v
fixInlineIRs s (Code as code) =
Code (fixAttrs s as) code
2016-05-24 17:42:37 +02:00
fixInlineIRs s (Link as is ('#':url, tit)) =
Link (fixAttrs s as) is (addHash s url, tit)
fixInlineIRs s (Link as is t) =
Link (fixAttrs s as) is t
2014-07-30 00:47:54 +01:00
fixInlineIRs _ v = v
prependHash :: [String] -> Inline -> Inline
prependHash ps l@(Link attr is (url, tit))
2014-07-30 00:47:54 +01:00
| or [s `isPrefixOf` url | s <- ps] =
Link attr is ('#':url, tit)
2014-07-30 00:47:54 +01:00
| otherwise = l
prependHash _ i = i
fixBlockIRs :: String -> Block -> Block
fixBlockIRs s (Div as b) =
Div (fixAttrs s as) b
fixBlockIRs s (Header i as b) =
Header i (fixAttrs s as) b
fixBlockIRs s (CodeBlock as code) =
CodeBlock (fixAttrs s as) code
fixBlockIRs _ b = b
fixAttrs :: FilePath -> B.Attr -> B.Attr
fixAttrs s (ident, cs, kvs) = (addHash s ident, filter (not . null) cs, removeEPUBAttrs kvs)
addHash :: String -> String -> String
addHash _ "" = ""
addHash s ident = takeFileName s ++ "#" ++ ident
2014-07-30 00:47:54 +01:00
removeEPUBAttrs :: [(String, String)] -> [(String, String)]
removeEPUBAttrs kvs = filter (not . isEPUBAttr) kvs
isEPUBAttr :: (String, String) -> Bool
isEPUBAttr (k, _) = "epub:" `isPrefixOf` k
-- Library
2014-07-30 00:47:54 +01:00
-- Strict version of foldM
foldM' :: (Monad m, NFData a) => (a -> b -> m a) -> a -> [b] -> m a
foldM' _ z [] = return z
foldM' f z (x:xs) = do
z' <- f z x
z' `deepseq` foldM' f z' xs
uncurry3 :: (a -> b -> c -> d) -> (a, b, c) -> d
uncurry3 f (a, b, c) = f a b c
-- Utility
2014-07-30 00:47:54 +01:00
stripNamespace :: QName -> String
stripNamespace (QName v _ _) = v
attrToNSPair :: Attr -> Maybe (String, String)
attrToNSPair (Attr (QName "xmlns" _ _) val) = Just ("xmlns", val)
attrToNSPair _ = Nothing
2014-07-30 00:47:54 +01:00
attrToPair :: Attr -> (String, String)
attrToPair (Attr (QName name _ _) val) = (name, val)
defaultNameSpace :: Maybe String
defaultNameSpace = Just "http://www.idpf.org/2007/opf"
dfName :: String -> QName
dfName s = QName s defaultNameSpace Nothing
emptyName :: String -> QName
emptyName s = QName s Nothing Nothing
-- Convert Maybe interface to Either
2016-11-28 17:13:46 -05:00
findAttrE :: PandocMonad m => QName -> Element -> m String
2014-07-30 00:47:54 +01:00
findAttrE q e = mkE "findAttr" $ findAttr q e
2016-11-28 17:13:46 -05:00
findEntryByPathE :: PandocMonad m => FilePath -> Archive -> m Entry
findEntryByPathE (normalise -> path) a =
mkE ("No entry on path: " ++ path) $ findEntryByPath path a
2014-07-30 00:47:54 +01:00
2016-11-28 17:13:46 -05:00
parseXMLDocE :: PandocMonad m => String -> m Element
2014-07-30 00:47:54 +01:00
parseXMLDocE doc = mkE "Unable to parse XML doc" $ parseXMLDoc doc
2016-11-28 17:13:46 -05:00
findElementE :: PandocMonad m => QName -> Element -> m Element
2014-07-30 00:47:54 +01:00
findElementE e x = mkE ("Unable to find element: " ++ show e) $ findElement e x
2016-11-28 17:13:46 -05:00
mkE :: PandocMonad m => String -> Maybe a -> m a
mkE s = maybe (throwError . PandocParseError $ s) return