From 836153de43933dca3205e4459f55979d467f927e Mon Sep 17 00:00:00 2001
From: Jesse Rosenthal <jrosenthal@jhu.edu>
Date: Sun, 31 Dec 2017 09:09:22 -0500
Subject: [PATCH] Docx Reader: Combine adjacent anchors.

There isn't any reason to have numberous anchors in the same place,
since we can't maintain docx's non-nesting overlapping. So we reduce
to a single anchor, and have all links pointing to one of the
overlapping anchors point to that one. This changes the behavior from
commit e90c714c7 slightly (use the first anchor instead of the last)
so we change the expected test result.

Note that because this produces a state that has to be set after every
invocation of `parPartToInlines`, we make the main function into a
primed subfunction `parPartToInlines'`, and make `parPartToInlines` a
wrapper around that.
---
 src/Text/Pandoc/Readers/Docx.hs | 67 +++++++++++++++++++++++----------
 test/docx/unused_anchors.native |  4 +-
 2 files changed, 49 insertions(+), 22 deletions(-)

diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs
index 248cb0b84..6ca1590a4 100644
--- a/src/Text/Pandoc/Readers/Docx.hs
+++ b/src/Text/Pandoc/Readers/Docx.hs
@@ -82,6 +82,7 @@ import qualified Data.ByteString.Lazy as B
 import Data.Default (Default)
 import Data.List (delete, intersect)
 import qualified Data.Map as M
+import Data.Maybe (isJust)
 import Data.Sequence (ViewL (..), viewl)
 import qualified Data.Sequence as Seq
 import qualified Data.Set as Set
@@ -119,6 +120,7 @@ readDocx _ _ =
 
 data DState = DState { docxAnchorMap :: M.Map String String
                      , docxAnchorSet :: Set.Set String
+                     , docxImmedPrevAnchor :: Maybe String
                      , docxMediaBag  :: MediaBag
                      , docxDropCap   :: Inlines
                      , docxWarnings  :: [String]
@@ -130,6 +132,7 @@ data DState = DState { docxAnchorMap :: M.Map String String
 instance Default DState where
   def = DState { docxAnchorMap = M.empty
                , docxAnchorSet = mempty
+               , docxImmedPrevAnchor = Nothing
                , docxMediaBag  = mempty
                , docxDropCap   = mempty
                , docxWarnings  = []
@@ -341,9 +344,26 @@ blocksToInlinesWarn cmtId blks = do
       "Docx comment " ++ cmtId ++ " will not retain formatting"
   return $ blocksToInlines' blkList
 
+-- The majority of work in this function is done in the primted
+-- subfunction `partPartToInlines'`. We make this wrapper so that we
+-- don't have to modify `docxImmedPrevAnchor` state after every function.
 parPartToInlines :: PandocMonad m => ParPart -> DocxContext m Inlines
-parPartToInlines (PlainRun r) = runToInlines r
-parPartToInlines (Insertion _ author date runs) = do
+parPartToInlines parPart =
+  case parPart of
+    (BookMark _ anchor) | not $ anchor `elem` dummyAnchors -> do
+      inHdrBool <- asks docxInHeaderBlock
+      ils <- parPartToInlines' parPart
+      immedPrevAnchor <- gets docxImmedPrevAnchor
+      unless (isJust immedPrevAnchor || inHdrBool)
+        (modify $ \s -> s{ docxImmedPrevAnchor = Just anchor})
+      return ils
+    _ -> do
+      ils <- parPartToInlines' parPart
+      modify $ \s -> s{ docxImmedPrevAnchor = Nothing}
+      return ils
+parPartToInlines' :: PandocMonad m => ParPart -> DocxContext m Inlines
+parPartToInlines' (PlainRun r) = runToInlines r
+parPartToInlines' (Insertion _ author date runs) = do
   opts <- asks docxOptions
   case readerTrackChanges opts of
     AcceptChanges -> smushInlines <$> mapM runToInlines runs
@@ -352,7 +372,7 @@ parPartToInlines (Insertion _ author date runs) = do
       ils <- smushInlines <$> mapM runToInlines runs
       let attr = ("", ["insertion"], [("author", author), ("date", date)])
       return $ spanWith attr ils
-parPartToInlines (Deletion _ author date runs) = do
+parPartToInlines' (Deletion _ author date runs) = do
   opts <- asks docxOptions
   case readerTrackChanges opts of
     AcceptChanges -> return mempty
@@ -361,7 +381,7 @@ parPartToInlines (Deletion _ author date runs) = do
       ils <- smushInlines <$> mapM runToInlines runs
       let attr = ("", ["deletion"], [("author", author), ("date", date)])
       return $ spanWith attr ils
-parPartToInlines (CommentStart cmtId author date bodyParts) = do
+parPartToInlines' (CommentStart cmtId author date bodyParts) = do
   opts <- asks docxOptions
   case readerTrackChanges opts of
     AllChanges -> do
@@ -370,16 +390,16 @@ parPartToInlines (CommentStart cmtId author date bodyParts) = do
       let attr = ("", ["comment-start"], [("id", cmtId), ("author", author), ("date", date)])
       return $ spanWith attr ils
     _ -> return mempty
-parPartToInlines (CommentEnd cmtId) = do
+parPartToInlines' (CommentEnd cmtId) = do
   opts <- asks docxOptions
   case readerTrackChanges opts of
     AllChanges -> do
       let attr = ("", ["comment-end"], [("id", cmtId)])
       return $ spanWith attr mempty
     _ -> return mempty
-parPartToInlines (BookMark _ anchor) | anchor `elem` dummyAnchors =
+parPartToInlines' (BookMark _ anchor) | anchor `elem` dummyAnchors =
   return mempty
-parPartToInlines (BookMark _ anchor) =
+parPartToInlines' (BookMark _ anchor) =
   -- We record these, so we can make sure not to overwrite
   -- user-defined anchor links with header auto ids.
   do
@@ -395,27 +415,34 @@ parPartToInlines (BookMark _ anchor) =
     -- of rewriting user-defined anchor links. However, since these
     -- are not defined in pandoc, it seems like a necessary evil to
     -- avoid an extra pass.
-    let newAnchor =
-          if not inHdrBool && anchor `elem` M.elems anchorMap
-          then uniqueIdent [Str anchor] (Set.fromList $ M.elems anchorMap)
-          else anchor
-    unless inHdrBool
-      (modify $ \s -> s { docxAnchorMap = M.insert anchor newAnchor anchorMap})
-    return $ spanWith (newAnchor, ["anchor"], []) mempty
-parPartToInlines (Drawing fp title alt bs ext) = do
+    immedPrevAnchor <- gets docxImmedPrevAnchor
+    case immedPrevAnchor of
+      Just prevAnchor -> do
+        unless inHdrBool
+          (modify $ \s -> s { docxAnchorMap = M.insert anchor prevAnchor anchorMap})
+        return mempty
+      Nothing -> do
+        let newAnchor =
+              if not inHdrBool && anchor `elem` M.elems anchorMap
+              then uniqueIdent [Str anchor] (Set.fromList $ M.elems anchorMap)
+              else anchor
+        unless inHdrBool
+          (modify $ \s -> s { docxAnchorMap = M.insert anchor newAnchor anchorMap})
+        return $ spanWith (newAnchor, ["anchor"], []) mempty
+parPartToInlines' (Drawing fp title alt bs ext) = do
   (lift . lift) $ P.insertMedia fp Nothing bs
   return $ imageWith (extentToAttr ext) fp title $ text alt
-parPartToInlines Chart =
+parPartToInlines' Chart =
   return $ spanWith ("", ["chart"], []) $ text "[CHART]"
-parPartToInlines (InternalHyperLink anchor runs) = do
+parPartToInlines' (InternalHyperLink anchor runs) = do
   ils <- smushInlines <$> mapM runToInlines runs
   return $ link ('#' : anchor) "" ils
-parPartToInlines (ExternalHyperLink target runs) = do
+parPartToInlines' (ExternalHyperLink target runs) = do
   ils <- smushInlines <$> mapM runToInlines runs
   return $ link target "" ils
-parPartToInlines (PlainOMath exps) =
+parPartToInlines' (PlainOMath exps) =
   return $ math $ writeTeX exps
-parPartToInlines (SmartTag runs) = do
+parPartToInlines' (SmartTag runs) = do
   smushInlines <$> mapM runToInlines runs
 
 isAnchorSpan :: Inline -> Bool
diff --git a/test/docx/unused_anchors.native b/test/docx/unused_anchors.native
index 334269793..051dfe424 100644
--- a/test/docx/unused_anchors.native
+++ b/test/docx/unused_anchors.native
@@ -1,3 +1,3 @@
 [Header 1 ("my-section",[],[]) [Str "My",Space,Str "Section"]
-,Para [Link ("",[],[]) [Str "Here",Space,Str "is",Space,Str "a",Space,Str "link."] ("#Bar","")]
-,Para [Span ("Bar",["anchor"],[]) [],Str "Here",Space,Str "is",Space,Str "the",Space,Str "target."]]
+,Para [Link ("",[],[]) [Str "Here",Space,Str "is",Space,Str "a",Space,Str "link."] ("#Foo","")]
+,Para [Span ("Foo",["anchor"],[]) [],Str "Here",Space,Str "is",Space,Str "the",Space,Str "target."]]