From 6072bdcec95df9f537b22fb7df4a5f8ea7958189 Mon Sep 17 00:00:00 2001
From: John MacFarlane <jgm@berkeley.edu>
Date: Wed, 24 Nov 2021 11:01:21 -0800
Subject: [PATCH] HTML reader: parse attributes on links and images.

Closes #6970.
---
 src/Text/Pandoc/Readers/HTML.hs         | 14 ++++++--------
 src/Text/Pandoc/Readers/HTML/Parsing.hs |  7 ++++---
 test/command/5986.md                    |  4 ++--
 test/command/6970.md                    | 12 ++++++++++++
 4 files changed, 24 insertions(+), 13 deletions(-)
 create mode 100644 test/command/6970.md

diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index c78faebbd..8aa2646b2 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -643,7 +643,7 @@ pQ = do
   case lookup "cite" attrs of
     Just url -> do
       let uid = fromMaybe mempty $
-                   lookup "name" attrs <> lookup "id" attrs
+                   lookup "name" attrs <|> lookup "id" attrs
       let cls = maybe [] T.words $ lookup "class" attrs
       url' <- canonicalizeUrl url
       makeQuote $ B.spanWith (uid, cls, [("cite", escapeURI url')])
@@ -705,20 +705,18 @@ pLineBreak = do
 
 pLink :: PandocMonad m => TagParser m Inlines
 pLink = try $ do
-  tag <- pSatisfy $ tagOpenLit "a" (const True)
+  tag@(TagOpen _ attr') <- pSatisfy $ tagOpenLit "a" (const True)
   let title = fromAttrib "title" tag
-  -- take id from id attribute if present, otherwise name
-  let uid = fromMaybe (fromAttrib "name" tag) $
-               maybeFromAttrib "id" tag
-  let cls = T.words $ fromAttrib "class" tag
+  let attr = toAttr $ filter (\(k,_) -> k /= "title" && k /= "href") attr'
   lab <- mconcat <$> manyTill inline (pCloses "a")
   -- check for href; if href, then a link, otherwise a span
   case maybeFromAttrib "href" tag of
        Nothing   ->
-         return $ extractSpaces (B.spanWith (uid, cls, [])) lab
+         return $ extractSpaces (B.spanWith attr) lab
        Just url' -> do
          url <- canonicalizeUrl url'
-         return $ extractSpaces (B.linkWith (uid, cls, []) (escapeURI url) title) lab
+         return $ extractSpaces
+                   (B.linkWith attr (escapeURI url) title) lab
 
 pImage :: PandocMonad m => TagParser m Inlines
 pImage = do
diff --git a/src/Text/Pandoc/Readers/HTML/Parsing.hs b/src/Text/Pandoc/Readers/HTML/Parsing.hs
index 27a23aa69..a8cdf1de2 100644
--- a/src/Text/Pandoc/Readers/HTML/Parsing.hs
+++ b/src/Text/Pandoc/Readers/HTML/Parsing.hs
@@ -30,7 +30,7 @@ module Text.Pandoc.Readers.HTML.Parsing
   )
 where
 
-import Control.Monad (void, mzero)
+import Control.Monad (void, mzero, mplus)
 import Data.Maybe (fromMaybe)
 import Data.Text (Text)
 import Text.HTML.TagSoup
@@ -220,9 +220,10 @@ maybeFromAttrib _ _ = Nothing
 
 mkAttr :: [(Text, Text)] -> Attr
 mkAttr attr = (attribsId, attribsClasses, attribsKV)
-  where attribsId = fromMaybe "" $ lookup "id" attr
+  where attribsId = fromMaybe "" $ lookup "id" attr `mplus` lookup "name" attr
         attribsClasses = T.words (fromMaybe "" $ lookup "class" attr) <> epubTypes
-        attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr
+        attribsKV = filter (\(k,_) -> k /= "class" && k /= "id" && k /= "name")
+                           attr
         epubTypes = T.words $ fromMaybe "" $ lookup "epub:type" attr
 
 toAttr :: [(Text, Text)] -> Attr
diff --git a/test/command/5986.md b/test/command/5986.md
index ed8dd30c9..c181e33ad 100644
--- a/test/command/5986.md
+++ b/test/command/5986.md
@@ -6,8 +6,8 @@
 <p><span id="nav.xhtml"></span></p>
 <nav epub:type="landmarks" id="landmarks" hidden="hidden">
 <ol>
-<li><a href="text/title_page.xhtml">Title Page</a></li>
-<li><a href="#nav.xhtml#toc">Table of Contents</a></li>
+<li><a href="text/title_page.xhtml" class="titlepage">Title Page</a></li>
+<li><a href="#nav.xhtml#toc" class="toc">Table of Contents</a></li>
 </ol>
 </nav>
 <p><span id="ch001.xhtml"></span></p>
diff --git a/test/command/6970.md b/test/command/6970.md
new file mode 100644
index 000000000..45d9b76bd
--- /dev/null
+++ b/test/command/6970.md
@@ -0,0 +1,12 @@
+```
+% pandoc -f html -t native
+<a name="foo" class="bar baz" href='https://example.com' target='_blank'>https://example.com<a>
+^D
+[ Plain
+    [ Link
+        ( "foo" , [ "bar" , "baz" ] , [ ( "target" , "_blank" ) ] )
+        [ Str "https://example.com" ]
+        ( "https://example.com" , "" )
+    ]
+]
+```