EPUB writer: Improved chapter splitting and internal link rewriting.

Closes #1887.
Closes #2163.
Closes #2162.
This commit is contained in:
John MacFarlane 2015-05-27 09:39:05 -07:00
parent 1daa26468c
commit dbf2a63669

View file

@ -56,10 +56,10 @@ import Text.Pandoc.Options ( WriterOptions(..)
, EPUBVersion(..) , EPUBVersion(..)
, ObfuscationMethod(NoObfuscation) ) , ObfuscationMethod(NoObfuscation) )
import Text.Pandoc.Definition import Text.Pandoc.Definition
import Text.Pandoc.Walk (walk, walkM) import Text.Pandoc.Walk (walk, walkM, query)
import Data.Default import Data.Default
import Text.Pandoc.Writers.Markdown (writePlain) import Text.Pandoc.Writers.Markdown (writePlain)
import Control.Monad.State (modify, get, execState, State, put, evalState) import Control.Monad.State (modify, get, State, put, evalState)
import Control.Monad (mplus, liftM, when) import Control.Monad (mplus, liftM, when)
import Text.XML.Light ( unode, Element(..), unqual, Attr(..), add_attrs import Text.XML.Light ( unode, Element(..), unqual, Attr(..), add_attrs
, strContent, lookupAttr, Node(..), QName(..), parseXML , strContent, lookupAttr, Node(..), QName(..), parseXML
@ -408,11 +408,6 @@ writeEPUB opts doc@(Pandoc meta _) = do
(docTitle' meta) : blocks (docTitle' meta) : blocks
let chapterHeaderLevel = writerEpubChapterLevel opts let chapterHeaderLevel = writerEpubChapterLevel opts
-- internal reference IDs change when we chunk the file,
-- so that '#my-header-1' might turn into 'chap004.xhtml#my-header'.
-- the next two lines fix that:
let reftable = correlateRefs chapterHeaderLevel blocks'
let blocks'' = replaceRefs reftable blocks'
let isChapterHeader (Header n _ _) = n <= chapterHeaderLevel let isChapterHeader (Header n _ _) = n <= chapterHeaderLevel
isChapterHeader (Div ("",["references"],[]) (Header n _ _:_)) = isChapterHeader (Div ("",["references"],[]) (Header n _ _:_)) =
@ -443,7 +438,37 @@ writeEPUB opts doc@(Pandoc meta _) = do
let (xs,ys) = break isChapterHeader bs let (xs,ys) = break isChapterHeader bs
(Chapter Nothing (b:xs) :) `fmap` toChapters ys (Chapter Nothing (b:xs) :) `fmap` toChapters ys
let chapters = evalState (toChapters blocks'') [] let chapters' = evalState (toChapters blocks') []
let extractLinkURL' :: Int -> Inline -> [(String, String)]
extractLinkURL' num (Span (ident, _, _) _)
| not (null ident) = [(ident, showChapter num ++ ('#':ident))]
extractLinkURL' _ _ = []
let extractLinkURL :: Int -> Block -> [(String, String)]
extractLinkURL num (Div (ident, _, _) _)
| not (null ident) = [(ident, showChapter num ++ ('#':ident))]
extractLinkURL num (Header _ (ident, _, _) _)
| not (null ident) = [(ident, showChapter num ++ ('#':ident))]
extractLinkURL num b = query (extractLinkURL' num) b
let reftable = concat $ zipWith (\(Chapter _ bs) num ->
query (extractLinkURL num) bs)
chapters' [1..]
let fixInternalReferences :: Inline -> Inline
fixInternalReferences (Link lab ('#':xs, tit)) =
case lookup xs reftable of
Just ys -> Link lab (ys, tit)
Nothing -> Link lab ('#':xs, tit)
fixInternalReferences x = x
-- internal reference IDs change when we chunk the file,
-- so that '#my-header-1' might turn into 'chap004.xhtml#my-header'.
-- this fixes that:
let chapters = map (\(Chapter mbnum bs) ->
Chapter mbnum $ walk fixInternalReferences bs)
chapters'
let chapToEntry :: Int -> Chapter -> Entry let chapToEntry :: Int -> Chapter -> Entry
chapToEntry num (Chapter mbnum bs) = mkEntry (showChapter num) chapToEntry num (Chapter mbnum bs) = mkEntry (showChapter num)
@ -549,7 +574,7 @@ writeEPUB opts doc@(Pandoc meta _) = do
let contentsEntry = mkEntry "content.opf" contentsData let contentsEntry = mkEntry "content.opf" contentsData
-- toc.ncx -- toc.ncx
let secs = hierarchicalize blocks'' let secs = hierarchicalize blocks'
let tocLevel = writerTOCDepth opts let tocLevel = writerTOCDepth opts
@ -889,11 +914,6 @@ mediaTypeOf x =
Just y | any (`isPrefixOf` y) mediaPrefixes -> Just y Just y | any (`isPrefixOf` y) mediaPrefixes -> Just y
_ -> Nothing _ -> Nothing
data IdentState = IdentState{
chapterNumber :: Int,
identTable :: [(String,String)]
} deriving (Read, Show)
-- Returns filename for chapter number. -- Returns filename for chapter number.
showChapter :: Int -> String showChapter :: Int -> String
showChapter = printf "ch%03d.xhtml" showChapter = printf "ch%03d.xhtml"
@ -910,45 +930,6 @@ addIdentifiers bs = evalState (mapM go bs) []
return $ Header n (ident',classes,kvs) ils return $ Header n (ident',classes,kvs) ils
go x = return x go x = return x
-- Go through a block list and construct a table
-- correlating the automatically constructed references
-- that would be used in a normal pandoc document with
-- new URLs to be used in the EPUB. For example, what
-- was "header-1" might turn into "ch006.xhtml#header".
correlateRefs :: Int -> [Block] -> [(String,String)]
correlateRefs chapterHeaderLevel bs =
identTable $ execState (walkM goBlock bs >>= walkM goInline)
IdentState{ chapterNumber = 0
, identTable = [] }
where goBlock :: Block -> State IdentState Block
goBlock x@(Header n (ident,_,_) _) = x <$ addIdentifier (Just n) ident
goBlock x@(Div (ident,_,_) _) = x <$ addIdentifier Nothing ident
goBlock x = return x
goInline :: Inline -> State IdentState Inline
goInline x@(Span (ident,_,_) _) = x <$ addIdentifier Nothing ident
goInline x = return x
addIdentifier mbHeaderLevel ident = do
case mbHeaderLevel of
Just n | n <= chapterHeaderLevel ->
modify $ \s -> s{ chapterNumber = chapterNumber s + 1 }
_ -> return ()
st <- get
let chapterid = showChapter (chapterNumber st) ++
case mbHeaderLevel of
Just n | n <= chapterHeaderLevel -> ""
_ -> '#' : ident
modify $ \s -> s{ identTable = (ident, chapterid) : identTable st }
-- Replace internal link references using the table produced
-- by correlateRefs.
replaceRefs :: [(String,String)] -> [Block] -> [Block]
replaceRefs refTable = walk replaceOneRef
where replaceOneRef x@(Link lab ('#':xs,tit)) =
case lookup xs refTable of
Just url -> Link lab (url,tit)
Nothing -> x
replaceOneRef x = x
-- Variant of normalizeDate that allows partial dates: YYYY, YYYY-MM -- Variant of normalizeDate that allows partial dates: YYYY, YYYY-MM
normalizeDate' :: String -> Maybe String normalizeDate' :: String -> Maybe String
normalizeDate' xs = normalizeDate' xs =