Merge pull request #1516 from mpickering/epubmetadata

EPUB improvements
This commit is contained in:
John MacFarlane 2014-08-11 08:14:54 -07:00
commit e690fe4a3e
3 changed files with 72 additions and 26 deletions

18
README
View file

@ -2972,43 +2972,60 @@ The following fields are recognized:
`GTIN-13`, `UPC`, `ISMN-10`, `DOI`, `LCCN`, `GTIN-14`,
`ISBN-13`, `Legal deposit number`, `URN`, `OCLC`,
`ISMN-13`, `ISBN-A`, `JP`, `OLCC`.
`title`
~ Either a string value, or an object with fields `file-as` and
`type`, or a list of such objects. Valid values for `type` are
`main`, `subtitle`, `short`, `collection`, `edition`, `extended`.
`creator`
~ Either a string value, or an object with fields `role`, `file-as`,
and `text`, or a list of such objects. Valid values for `role` are
[marc relators](http://www.loc.gov/marc/relators/relaterm.html), but
pandoc will attempt to translate the human-readable versions
(like "author" and "editor") to the appropriate marc relators.
`contributor`
~ Same format as `creator`.
`date`
~ A string value in `YYYY-MM-DD` format. (Only the year is necessary.)
Pandoc will attempt to convert other common date formats.
`language`
~ A string value in [RFC5646] format. Pandoc will default to the local
language if nothing is specified.
`subject`
~ A string value or a list of such values.
`description`
~ A string value.
`type`
~ A string value.
`format`
~ A string value.
`relation`
~ A string value.
`coverage`
~ A string value.
`rights`
~ A string value.
`cover-image`
~ A string value (path to cover image).
`stylesheet`
~ A string value (path to CSS stylesheet).
`page-progression-direction`
~ Either `ltr` or `rtl`. Specifies the synonymous spine [attribute][EPUBspine].
Literate Haskell support
========================
@ -3126,3 +3143,4 @@ Rosenthal.
[InDesign ICML]: https://www.adobe.com/content/dam/Adobe/en/devnet/indesign/cs55-docs/IDML/idml-specification.pdf
[txt2tags]: http://txt2tags.org/
[EPUB]: http://idpf.org/epub
[EPUBspine]: http://www.idpf.org/epub/301/spec/epub-publications.html#sec-spine-elem

View file

@ -14,7 +14,7 @@ import Text.Pandoc.Walk (walk, query)
import Text.Pandoc.Generic(bottomUp)
import Text.Pandoc.Readers.HTML (readHtml)
import Text.Pandoc.Options ( ReaderOptions(..), readerTrace)
import Text.Pandoc.Shared (escapeURI, collapseFilePath)
import Text.Pandoc.Shared (escapeURI, collapseFilePath, addMetaField)
import Text.Pandoc.MediaBag (MediaBag, insertMedia)
import Text.Pandoc.Compat.Except (MonadError, throwError, runExcept, Except)
import qualified Text.Pandoc.Builder as B
@ -155,7 +155,7 @@ parseMeta content = do
-- http://www.idpf.org/epub/30/spec/epub30-publications.html#sec-metadata-elem
parseMetaItem :: Element -> Meta -> Meta
parseMetaItem e@(stripNamespace . elName -> field) meta =
B.setMeta (renameMeta field) (B.str $ strContent e) meta
addMetaField (renameMeta field) (B.str $ strContent e) meta
renameMeta :: String -> String
renameMeta "creator" = "author"
@ -169,10 +169,10 @@ getManifest archive = do
ns <- mkE "xmlns not in namespaces" (lookup "xmlns" namespaces)
as <- liftM ((map attrToPair) . elAttribs)
(findElementE (QName "rootfile" (Just ns) Nothing) docElem)
root <- mkE "Root not found" (lookup "full-path" as)
let rootdir = dropFileName root
manifestFile <- mkE "Root not found" (lookup "full-path" as)
let rootdir = dropFileName manifestFile
--mime <- lookup "media-type" as
manifest <- findEntryByPathE root archive
manifest <- findEntryByPathE manifestFile archive
liftM ((,) rootdir) (parseXMLDocE . UTF8.toStringLazy . fromEntry $ manifest)
-- Fixup
@ -272,7 +272,8 @@ findAttrE :: MonadError String m => QName -> Element -> m String
findAttrE q e = mkE "findAttr" $ findAttr q e
findEntryByPathE :: MonadError String m => FilePath -> Archive -> m Entry
findEntryByPathE path a = mkE ("No entry on path: " ++ path) $ findEntryByPath path a
findEntryByPathE (normalise -> path) a =
mkE ("No entry on path: " ++ path) $ findEntryByPath path a
parseXMLDocE :: MonadError String m => String -> m Element
parseXMLDocE doc = mkE "Unable to parse XML doc" $ parseXMLDoc doc

View file

@ -1,4 +1,4 @@
{-# LANGUAGE PatternGuards, CPP, ScopedTypeVariables #-}
{-# LANGUAGE PatternGuards, CPP, ScopedTypeVariables, ViewPatterns #-}
{-
Copyright (C) 2010-2014 John MacFarlane <jgm@berkeley.edu>
@ -29,7 +29,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
Conversion of 'Pandoc' documents to EPUB.
-}
module Text.Pandoc.Writers.EPUB ( writeEPUB ) where
import Data.IORef
import Data.IORef ( IORef, newIORef, readIORef, modifyIORef )
import qualified Data.Map as M
import Data.Maybe ( fromMaybe )
import Data.List ( isPrefixOf, isInfixOf, intercalate )
@ -40,27 +40,35 @@ import qualified Data.ByteString.Lazy as B
import qualified Data.ByteString.Lazy.Char8 as B8
import qualified Text.Pandoc.UTF8 as UTF8
import Text.Pandoc.SelfContained ( makeSelfContained )
import Codec.Archive.Zip
import Codec.Archive.Zip ( emptyArchive, addEntryToArchive, eRelativePath, fromEntry , Entry, toEntry, fromArchive)
import Control.Applicative ((<$>))
import Data.Time.Clock.POSIX
import Data.Time
import System.Locale
import Text.Pandoc.Shared hiding ( Element )
import qualified Text.Pandoc.Shared as Shared
import Data.Time.Clock.POSIX ( getPOSIXTime )
import Data.Time (getCurrentTime,UTCTime, formatTime)
import System.Locale ( defaultTimeLocale )
import Text.Pandoc.Shared ( trimr, renderTags', safeRead, uniqueIdent, trim
, normalizeDate, readDataFile, stringify, warn
, hierarchicalize, fetchItem' )
import qualified Text.Pandoc.Shared as S (Element(..))
import Text.Pandoc.Builder (fromList, setMeta)
import Text.Pandoc.Options
import Text.Pandoc.Options ( WriterOptions(..)
, HTMLMathMethod(..)
, EPUBVersion(..)
, ObfuscationMethod(NoObfuscation) )
import Text.Pandoc.Definition
import Text.Pandoc.Walk
import Control.Monad.State
import Text.XML.Light hiding (ppTopElement)
import Text.Pandoc.UUID
import Text.Pandoc.Writers.HTML
import Text.Pandoc.Walk (walk, walkM)
import Control.Monad.State (modify, get, execState, State, put, evalState)
import Control.Monad (foldM, when, mplus, liftM)
import Text.XML.Light ( unode, Element(..), unqual, Attr(..), add_attrs
, strContent, lookupAttr, Node(..), QName(..), parseXML
, onlyElems, node, ppElement)
import Text.Pandoc.UUID (getRandomUUID)
import Text.Pandoc.Writers.HTML (writeHtmlString, writeHtml)
import Data.Char ( toLower, isDigit, isAlphaNum )
import Network.URI ( unEscapeString )
import Text.Pandoc.MIME (getMimeType)
import qualified Control.Exception as E
import Text.Blaze.Html.Renderer.Utf8 (renderHtml)
import Text.HTML.TagSoup
import Text.HTML.TagSoup (Tag(TagOpen), fromAttrib, parseTags)
-- A Chapter includes a list of blocks and maybe a section
-- number offset. Note, some chapters are unnumbered. The section
@ -86,6 +94,7 @@ data EPUBMetadata = EPUBMetadata{
, epubRights :: Maybe String
, epubCoverImage :: Maybe String
, epubStylesheet :: Maybe Stylesheet
, epubPageDirection :: ProgressionDirection
} deriving Show
data Stylesheet = StylesheetPath FilePath
@ -114,6 +123,8 @@ data Title = Title{
, titleType :: Maybe String
} deriving Show
data ProgressionDirection = LTR | RTL | Default deriving Show
dcName :: String -> QName
dcName n = QName n Nothing (Just "dc")
@ -288,6 +299,7 @@ metadataFromMeta opts meta = EPUBMetadata{
, epubRights = rights
, epubCoverImage = coverImage
, epubStylesheet = stylesheet
, epubPageDirection = pageDirection
}
where identifiers = getIdentifier meta
titles = getTitle meta
@ -310,6 +322,14 @@ metadataFromMeta opts meta = EPUBMetadata{
stylesheet = (StylesheetContents <$> writerEpubStylesheet opts) `mplus`
((StylesheetPath . metaValueToString) <$>
lookupMeta "stylesheet" meta)
pageDirection = maybe Default stringToPageDirection
(lookupMeta "page-progression-direction" meta)
stringToPageDirection (metaValueToString -> s) =
case s of
"ltr" -> LTR
"rtl" -> RTL
_ -> Default
-- | Produce an EPUB file from a Pandoc document.
writeEPUB :: WriterOptions -- ^ Writer options
@ -374,6 +394,12 @@ writeEPUB opts doc@(Pandoc meta _) = do
let mkFontEntry f = mkEntry (takeFileName f) `fmap` B.readFile f
fontEntries <- mapM mkFontEntry $ writerEpubFonts opts'
-- set page progression direction
let progressionDirection = case epubPageDirection metadata of
LTR -> "ltr"
RTL -> "rtl"
Default -> "default"
-- body pages
-- add level 1 header to beginning if none there
@ -493,7 +519,8 @@ writeEPUB opts doc@(Pandoc meta _) = do
(pictureNode x)]) ++
map pictureNode picEntries ++
map fontNode fontEntries
, unode "spine" ! [("toc","ncx")] $
, unode "spine" ! [("toc","ncx")
,("page-progression-direction", progressionDirection)] $
case epubCoverImage metadata of
Nothing -> []
Just _ -> [ unode "itemref" !
@ -524,8 +551,8 @@ writeEPUB opts doc@(Pandoc meta _) = do
let tocLevel = writerTOCDepth opts
let navPointNode :: (Int -> String -> String -> [Element] -> Element)
-> Shared.Element -> State Int Element
navPointNode formatter (Sec _ nums (ident,_,_) ils children) = do
-> S.Element -> State Int Element
navPointNode formatter (S.Sec _ nums (ident,_,_) ils children) = do
n <- get
modify (+1)
let showNums :: [Int] -> String
@ -537,12 +564,12 @@ writeEPUB opts doc@(Pandoc meta _) = do
let src = case lookup ident reftable of
Just x -> x
Nothing -> error (ident ++ " not found in reftable")
let isSec (Sec lev _ _ _ _) = lev <= tocLevel
let isSec (S.Sec lev _ _ _ _) = lev <= tocLevel
isSec _ = False
let subsecs = filter isSec children
subs <- mapM (navPointNode formatter) subsecs
return $ formatter n tit src subs
navPointNode _ (Blk _) = error "navPointNode encountered Blk"
navPointNode _ (S.Blk _) = error "navPointNode encountered Blk"
let navMapFormatter :: Int -> String -> String -> [Element] -> Element
navMapFormatter n tit src subs = unode "navPoint" !