EPUB Reader: Improved robustness of image extraction

We now maintain the invariant that when fetchImages is called,
all images have absolute paths.

This patch fixes several bugs related to this invariant. There are three
places where images can be introduced:
  (1) During the HTML parse
  (2) As spine elements
  (3) As a cover image

For (1), the paths are corrected by the transformation renameImages.
For (2) and (3), we need to prepend the "root" (the folder containing the
manifest file) to the path we parse from the spine or the manifest.
This commit is contained in:
Matthew Pickering 2014-08-08 23:04:03 +01:00
parent 40ae8efddc
commit cfd8c0214c

View file

@ -52,10 +52,12 @@ runEPUB = either error id . runExcept
-- --
archiveToEPUB :: (MonadError String m) => ReaderOptions -> Archive -> m (Pandoc, MediaBag) archiveToEPUB :: (MonadError String m) => ReaderOptions -> Archive -> m (Pandoc, MediaBag)
archiveToEPUB (setEPUBOptions -> os) archive = do archiveToEPUB (setEPUBOptions -> os) archive = do
-- root is path to folder with manifest file in
(root, content) <- getManifest archive (root, content) <- getManifest archive
meta <- parseMeta content meta <- parseMeta content
(cover, items) <- parseManifest content (cover, items) <- parseManifest content
let coverDoc = fromMaybe mempty (imageToPandoc <$> cover) -- No need to collapse here as the image path is from the manifest file
let coverDoc = fromMaybe mempty (imageToPandoc . (root </>) <$> cover)
spine <- parseSpine items content spine <- parseSpine items content
let escapedSpine = map (escapeURI . takeFileName . fst) spine let escapedSpine = map (escapeURI . takeFileName . fst) spine
Pandoc _ bs <- Pandoc _ bs <-
@ -68,17 +70,17 @@ archiveToEPUB (setEPUBOptions -> os) archive = do
parseSpineElem :: MonadError String m => FilePath -> (FilePath, MIME) -> m Pandoc parseSpineElem :: MonadError String m => FilePath -> (FilePath, MIME) -> m Pandoc
parseSpineElem (normalise -> r) (normalise -> path, mime) = do parseSpineElem (normalise -> r) (normalise -> path, mime) = do
when (readerTrace os) (traceM path) when (readerTrace os) (traceM path)
doc <- mimeToReader mime r path doc <- mimeToReader mime (r </> path)
let docSpan = B.doc $ B.para $ B.spanWith (takeFileName path, [], []) mempty let docSpan = B.doc $ B.para $ B.spanWith (takeFileName path, [], []) mempty
return $ docSpan <> doc return $ docSpan <> doc
mimeToReader :: MonadError String m => MIME -> FilePath -> FilePath -> m Pandoc mimeToReader :: MonadError String m => MIME -> FilePath -> m Pandoc
mimeToReader "application/xhtml+xml" r path = do mimeToReader "application/xhtml+xml" (normalise -> path) = do
fname <- findEntryByPathE (r </> path) archive fname <- findEntryByPathE path archive
return $ fixInternalReferences (r </> path) . return $ fixInternalReferences path .
readHtml os . readHtml os .
UTF8.toStringLazy $ UTF8.toStringLazy $
fromEntry fname fromEntry fname
mimeToReader s _ path mimeToReader s path
| s `elem` imageMimes = return $ imageToPandoc path | s `elem` imageMimes = return $ imageToPandoc path
| otherwise = return $ mempty | otherwise = return $ mempty