HTML reader: parse attributes on links and images.

Closes #6970.
This commit is contained in:
John MacFarlane 2021-11-24 11:01:21 -08:00
parent a8638894ab
commit 6072bdcec9
4 changed files with 24 additions and 13 deletions

View file

@ -643,7 +643,7 @@ pQ = do
case lookup "cite" attrs of
Just url -> do
let uid = fromMaybe mempty $
lookup "name" attrs <> lookup "id" attrs
lookup "name" attrs <|> lookup "id" attrs
let cls = maybe [] T.words $ lookup "class" attrs
url' <- canonicalizeUrl url
makeQuote $ B.spanWith (uid, cls, [("cite", escapeURI url')])
@ -705,20 +705,18 @@ pLineBreak = do
pLink :: PandocMonad m => TagParser m Inlines
pLink = try $ do
tag <- pSatisfy $ tagOpenLit "a" (const True)
tag@(TagOpen _ attr') <- pSatisfy $ tagOpenLit "a" (const True)
let title = fromAttrib "title" tag
-- take id from id attribute if present, otherwise name
let uid = fromMaybe (fromAttrib "name" tag) $
maybeFromAttrib "id" tag
let cls = T.words $ fromAttrib "class" tag
let attr = toAttr $ filter (\(k,_) -> k /= "title" && k /= "href") attr'
lab <- mconcat <$> manyTill inline (pCloses "a")
-- check for href; if href, then a link, otherwise a span
case maybeFromAttrib "href" tag of
Nothing ->
return $ extractSpaces (B.spanWith (uid, cls, [])) lab
return $ extractSpaces (B.spanWith attr) lab
Just url' -> do
url <- canonicalizeUrl url'
return $ extractSpaces (B.linkWith (uid, cls, []) (escapeURI url) title) lab
return $ extractSpaces
(B.linkWith attr (escapeURI url) title) lab
pImage :: PandocMonad m => TagParser m Inlines
pImage = do

View file

@ -30,7 +30,7 @@ module Text.Pandoc.Readers.HTML.Parsing
)
where
import Control.Monad (void, mzero)
import Control.Monad (void, mzero, mplus)
import Data.Maybe (fromMaybe)
import Data.Text (Text)
import Text.HTML.TagSoup
@ -220,9 +220,10 @@ maybeFromAttrib _ _ = Nothing
mkAttr :: [(Text, Text)] -> Attr
mkAttr attr = (attribsId, attribsClasses, attribsKV)
where attribsId = fromMaybe "" $ lookup "id" attr
where attribsId = fromMaybe "" $ lookup "id" attr `mplus` lookup "name" attr
attribsClasses = T.words (fromMaybe "" $ lookup "class" attr) <> epubTypes
attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr
attribsKV = filter (\(k,_) -> k /= "class" && k /= "id" && k /= "name")
attr
epubTypes = T.words $ fromMaybe "" $ lookup "epub:type" attr
toAttr :: [(Text, Text)] -> Attr

View file

@ -6,8 +6,8 @@
<p><span id="nav.xhtml"></span></p>
<nav epub:type="landmarks" id="landmarks" hidden="hidden">
<ol>
<li><a href="text/title_page.xhtml">Title Page</a></li>
<li><a href="#nav.xhtml#toc">Table of Contents</a></li>
<li><a href="text/title_page.xhtml" class="titlepage">Title Page</a></li>
<li><a href="#nav.xhtml#toc" class="toc">Table of Contents</a></li>
</ol>
</nav>
<p><span id="ch001.xhtml"></span></p>

12
test/command/6970.md Normal file
View file

@ -0,0 +1,12 @@
```
% pandoc -f html -t native
<a name="foo" class="bar baz" href='https://example.com' target='_blank'>https://example.com<a>
^D
[ Plain
[ Link
( "foo" , [ "bar" , "baz" ] , [ ( "target" , "_blank" ) ] )
[ Str "https://example.com" ]
( "https://example.com" , "" )
]
]
```