From dbf2a63669bb40e3ecfcd4ae4317d495d50ec0a2 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Wed, 27 May 2015 09:39:05 -0700 Subject: [PATCH] EPUB writer: Improved chapter splitting and internal link rewriting. Closes #1887. Closes #2163. Closes #2162. --- src/Text/Pandoc/Writers/EPUB.hs | 87 +++++++++++++-------------------- 1 file changed, 34 insertions(+), 53 deletions(-) diff --git a/src/Text/Pandoc/Writers/EPUB.hs b/src/Text/Pandoc/Writers/EPUB.hs index 4ce7857ac..cfbec7bfa 100644 --- a/src/Text/Pandoc/Writers/EPUB.hs +++ b/src/Text/Pandoc/Writers/EPUB.hs @@ -56,10 +56,10 @@ import Text.Pandoc.Options ( WriterOptions(..) , EPUBVersion(..) , ObfuscationMethod(NoObfuscation) ) import Text.Pandoc.Definition -import Text.Pandoc.Walk (walk, walkM) +import Text.Pandoc.Walk (walk, walkM, query) import Data.Default import Text.Pandoc.Writers.Markdown (writePlain) -import Control.Monad.State (modify, get, execState, State, put, evalState) +import Control.Monad.State (modify, get, State, put, evalState) import Control.Monad (mplus, liftM, when) import Text.XML.Light ( unode, Element(..), unqual, Attr(..), add_attrs , strContent, lookupAttr, Node(..), QName(..), parseXML @@ -408,11 +408,6 @@ writeEPUB opts doc@(Pandoc meta _) = do (docTitle' meta) : blocks let chapterHeaderLevel = writerEpubChapterLevel opts - -- internal reference IDs change when we chunk the file, - -- so that '#my-header-1' might turn into 'chap004.xhtml#my-header'. - -- the next two lines fix that: - let reftable = correlateRefs chapterHeaderLevel blocks' - let blocks'' = replaceRefs reftable blocks' let isChapterHeader (Header n _ _) = n <= chapterHeaderLevel isChapterHeader (Div ("",["references"],[]) (Header n _ _:_)) = @@ -443,7 +438,37 @@ writeEPUB opts doc@(Pandoc meta _) = do let (xs,ys) = break isChapterHeader bs (Chapter Nothing (b:xs) :) `fmap` toChapters ys - let chapters = evalState (toChapters blocks'') [] + let chapters' = evalState (toChapters blocks') [] + + let extractLinkURL' :: Int -> Inline -> [(String, String)] + extractLinkURL' num (Span (ident, _, _) _) + | not (null ident) = [(ident, showChapter num ++ ('#':ident))] + extractLinkURL' _ _ = [] + + let extractLinkURL :: Int -> Block -> [(String, String)] + extractLinkURL num (Div (ident, _, _) _) + | not (null ident) = [(ident, showChapter num ++ ('#':ident))] + extractLinkURL num (Header _ (ident, _, _) _) + | not (null ident) = [(ident, showChapter num ++ ('#':ident))] + extractLinkURL num b = query (extractLinkURL' num) b + + let reftable = concat $ zipWith (\(Chapter _ bs) num -> + query (extractLinkURL num) bs) + chapters' [1..] + + let fixInternalReferences :: Inline -> Inline + fixInternalReferences (Link lab ('#':xs, tit)) = + case lookup xs reftable of + Just ys -> Link lab (ys, tit) + Nothing -> Link lab ('#':xs, tit) + fixInternalReferences x = x + + -- internal reference IDs change when we chunk the file, + -- so that '#my-header-1' might turn into 'chap004.xhtml#my-header'. + -- this fixes that: + let chapters = map (\(Chapter mbnum bs) -> + Chapter mbnum $ walk fixInternalReferences bs) + chapters' let chapToEntry :: Int -> Chapter -> Entry chapToEntry num (Chapter mbnum bs) = mkEntry (showChapter num) @@ -549,7 +574,7 @@ writeEPUB opts doc@(Pandoc meta _) = do let contentsEntry = mkEntry "content.opf" contentsData -- toc.ncx - let secs = hierarchicalize blocks'' + let secs = hierarchicalize blocks' let tocLevel = writerTOCDepth opts @@ -889,11 +914,6 @@ mediaTypeOf x = Just y | any (`isPrefixOf` y) mediaPrefixes -> Just y _ -> Nothing -data IdentState = IdentState{ - chapterNumber :: Int, - identTable :: [(String,String)] - } deriving (Read, Show) - -- Returns filename for chapter number. showChapter :: Int -> String showChapter = printf "ch%03d.xhtml" @@ -910,45 +930,6 @@ addIdentifiers bs = evalState (mapM go bs) [] return $ Header n (ident',classes,kvs) ils go x = return x --- Go through a block list and construct a table --- correlating the automatically constructed references --- that would be used in a normal pandoc document with --- new URLs to be used in the EPUB. For example, what --- was "header-1" might turn into "ch006.xhtml#header". -correlateRefs :: Int -> [Block] -> [(String,String)] -correlateRefs chapterHeaderLevel bs = - identTable $ execState (walkM goBlock bs >>= walkM goInline) - IdentState{ chapterNumber = 0 - , identTable = [] } - where goBlock :: Block -> State IdentState Block - goBlock x@(Header n (ident,_,_) _) = x <$ addIdentifier (Just n) ident - goBlock x@(Div (ident,_,_) _) = x <$ addIdentifier Nothing ident - goBlock x = return x - goInline :: Inline -> State IdentState Inline - goInline x@(Span (ident,_,_) _) = x <$ addIdentifier Nothing ident - goInline x = return x - addIdentifier mbHeaderLevel ident = do - case mbHeaderLevel of - Just n | n <= chapterHeaderLevel -> - modify $ \s -> s{ chapterNumber = chapterNumber s + 1 } - _ -> return () - st <- get - let chapterid = showChapter (chapterNumber st) ++ - case mbHeaderLevel of - Just n | n <= chapterHeaderLevel -> "" - _ -> '#' : ident - modify $ \s -> s{ identTable = (ident, chapterid) : identTable st } - --- Replace internal link references using the table produced --- by correlateRefs. -replaceRefs :: [(String,String)] -> [Block] -> [Block] -replaceRefs refTable = walk replaceOneRef - where replaceOneRef x@(Link lab ('#':xs,tit)) = - case lookup xs refTable of - Just url -> Link lab (url,tit) - Nothing -> x - replaceOneRef x = x - -- Variant of normalizeDate that allows partial dates: YYYY, YYYY-MM normalizeDate' :: String -> Maybe String normalizeDate' xs =