Docx reader: Remove unused anchors.

Docx produces a lot of anchors with nothing pointing to them -- we now
remove these to produce cleaner output. Note that this has to occur at
the end of the process because it has to follow link/anchor rewriting.

Closes #3679.
This commit is contained in:
Jesse Rosenthal 2017-12-30 22:17:06 -05:00
parent e0cf8e64b5
commit e90c714c73

View file

@ -118,6 +118,7 @@ readDocx _ _ =
throwError $ PandocSomeError "couldn't parse docx file"
data DState = DState { docxAnchorMap :: M.Map String String
, docxAnchorSet :: Set.Set String
, docxMediaBag :: MediaBag
, docxDropCap :: Inlines
, docxWarnings :: [String]
@ -128,6 +129,7 @@ data DState = DState { docxAnchorMap :: M.Map String String
instance Default DState where
def = DState { docxAnchorMap = M.empty
, docxAnchorSet = mempty
, docxMediaBag = mempty
, docxDropCap = mempty
, docxWarnings = []
@ -561,7 +563,7 @@ bodyPartToBlocks (ListItem pPr numId lvl (Just levelInfo) parparts) = do
]
modify $ \st -> st{ docxListState = M.insert (numId, lvl) start listState}
blks <- bodyPartToBlocks (Paragraph pPr parparts)
return $ divWith ("", ["list-item"], kvs) blks
return $ divWith ("", ["list-item"], kvs) blks
bodyPartToBlocks (ListItem pPr _ _ _ parparts) =
let pPr' = pPr {pStyle = "ListParagraph": pStyle pPr}
in
@ -603,21 +605,41 @@ bodyPartToBlocks (OMathPara e) =
rewriteLink' :: PandocMonad m => Inline -> DocxContext m Inline
rewriteLink' l@(Link attr ils ('#':target, title)) = do
anchorMap <- gets docxAnchorMap
return $ case M.lookup target anchorMap of
Just newTarget -> Link attr ils ('#':newTarget, title)
Nothing -> l
case M.lookup target anchorMap of
Just newTarget -> do
modify $ \s -> s{docxAnchorSet = Set.insert newTarget (docxAnchorSet s)}
return $ Link attr ils ('#':newTarget, title)
Nothing -> do
modify $ \s -> s{docxAnchorSet = Set.insert target (docxAnchorSet s)}
return l
rewriteLink' il = return il
rewriteLinks :: PandocMonad m => [Block] -> DocxContext m [Block]
rewriteLinks = mapM (walkM rewriteLink')
removeOrphanAnchors'' :: PandocMonad m => Inline -> DocxContext m [Inline]
removeOrphanAnchors'' s@(Span (ident, classes, _) ils)
| "anchor" `elem` classes = do
anchorSet <- gets docxAnchorSet
return $ if ident `Set.member` anchorSet
then [s]
else ils
removeOrphanAnchors'' il = return [il]
removeOrphanAnchors' :: PandocMonad m => [Inline] -> DocxContext m [Inline]
removeOrphanAnchors' ils = liftM concat $ mapM removeOrphanAnchors'' ils
removeOrphanAnchors :: PandocMonad m => [Block] -> DocxContext m [Block]
removeOrphanAnchors = mapM (walkM removeOrphanAnchors')
bodyToOutput :: PandocMonad m => Body -> DocxContext m (Meta, [Block])
bodyToOutput (Body bps) = do
let (metabps, blkbps) = sepBodyParts bps
meta <- bodyPartsToMeta metabps
blks <- smushBlocks <$> mapM bodyPartToBlocks blkbps
blks' <- rewriteLinks $ blocksToDefinitions $ blocksToBullets $ toList blks
return (meta, blks')
blks'' <- removeOrphanAnchors blks'
return (meta, blks'')
docxToOutput :: PandocMonad m
=> ReaderOptions