HTML reader: retain attribute prefixes and avoid duplicates.

Previously we stripped attribute prefixes, reading
`xml:lang` as `lang` for example. This resulted in
two duplicate `lang` attributes when `xml:lang` and
`lang` were both used.  This commit causes the prefixes
to be retained, and also avoids invalid duplicate
attributes.

Closes #6938.
This commit is contained in:
John MacFarlane 2020-12-10 15:44:10 -08:00
parent 248a2a1db5
commit 0a502e5ff5
4 changed files with 29 additions and 29 deletions

View file

@ -74,7 +74,7 @@ readHtml :: PandocMonad m
-> Text -- ^ String to parse (assumes @'\n'@ line endings)
-> m Pandoc
readHtml opts inp = do
let tags = stripPrefixes . canonicalizeTags $
let tags = stripPrefixes $ canonicalizeTags $
parseTagsOptions parseOptions{ optTagPosition = True }
(crFilter inp)
parseDoc = do
@ -95,6 +95,15 @@ readHtml opts inp = do
Right doc -> return doc
Left err -> throwError $ PandocParseError $ T.pack $ getError err
-- Strip namespace prefixes on tags (not attributes)
stripPrefixes :: [Tag Text] -> [Tag Text]
stripPrefixes = map stripPrefix
stripPrefix :: Tag Text -> Tag Text
stripPrefix (TagOpen s as) = TagOpen (T.takeWhileEnd (/=':') s) as
stripPrefix (TagClose s) = TagClose (T.takeWhileEnd (/=':') s)
stripPrefix x = x
replaceNotes :: PandocMonad m => [Block] -> TagParser m [Block]
replaceNotes bs = do
st <- getState
@ -114,7 +123,7 @@ setInPlain = local (\s -> s {inPlain = True})
pHtml :: PandocMonad m => TagParser m Blocks
pHtml = try $ do
(TagOpen "html" attr) <- lookAhead pAny
for_ (lookup "lang" attr) $
for_ (lookup "lang" attr <|> lookup "xml:lang" attr) $
updateState . B.setMeta "lang" . B.text
pInTags "html" block
@ -1024,21 +1033,6 @@ htmlTag f = try $ do
handleTag tagname
_ -> mzero
-- Strip namespace prefixes
stripPrefixes :: [Tag Text] -> [Tag Text]
stripPrefixes = map stripPrefix
stripPrefix :: Tag Text -> Tag Text
stripPrefix (TagOpen s as) =
TagOpen (stripPrefix' s) (map (first stripPrefix') as)
stripPrefix (TagClose s) = TagClose (stripPrefix' s)
stripPrefix x = x
stripPrefix' :: Text -> Text
stripPrefix' s =
if T.null t then s else T.drop 1 t
where (_, t) = T.span (/= ':') s
-- Utilities
-- | Adjusts a url according to the document's base URL.

View file

@ -193,14 +193,20 @@ t1 `closes` t2 |
_ `closes` _ = False
toStringAttr :: [(Text, Text)] -> [(Text, Text)]
toStringAttr = map go
toStringAttr = foldr go []
where
go (x,y) =
case T.stripPrefix "data-" x of
Just x' | x' `Set.notMember` (html5Attributes <>
html4Attributes <> rdfaAttributes)
-> (x',y)
_ -> (x,y)
go :: (Text, Text) -> [(Text, Text)] -> [(Text, Text)]
-- treat xml:lang as lang
go ("xml:lang",y) ats = go ("lang",y) ats
-- prevent duplicate attributes
go (x,y) ats
| any (\(x',_) -> x == x') ats = ats
| otherwise =
case T.stripPrefix "data-" x of
Just x' | x' `Set.notMember` (html5Attributes <>
html4Attributes <> rdfaAttributes)
-> go (x',y) ats
_ -> (x,y):ats
-- Unlike fromAttrib from tagsoup, this distinguishes
-- between a missing attribute and an attribute with empty content.

View file

@ -4,7 +4,7 @@
^D
<p><span id="title_page.xhtml"></span></p>
<p><span id="nav.xhtml"></span></p>
<nav type="landmarks" id="landmarks" hidden="hidden">
<nav epub:type="landmarks" id="landmarks" hidden="hidden">
<ol>
<li><a href="text/title_page.xhtml">Title Page</a></li>
<li><a href="#nav.xhtml#toc">Table of Contents</a></li>

View file

@ -1,8 +1,8 @@
[Para [Image ("",[],[]) [] ("wasteland-cover.jpg","")]
,Para [Span ("wasteland-content.xhtml",[],[]) []]
,Div ("wasteland-content.xhtml#frontmatter",["section"],[("type","frontmatter")])
,Div ("wasteland-content.xhtml#frontmatter",["section","frontmatter"],[])
[]
,Div ("wasteland-content.xhtml#bodymatter",["section"],[("type","bodymatter")])
,Div ("wasteland-content.xhtml#bodymatter",["section","bodymatter"],[])
[Div ("wasteland-content.xhtml#ch1",["section"],[])
[Header 2 ("",[],[]) [Str "I.",Space,Str "THE",Space,Str "BURIAL",Space,Str "OF",Space,Str "THE",Space,Str "DEAD"]
,Div ("",["linegroup"],[])
@ -922,8 +922,8 @@
[Plain [Str "Datta.",Space,Str "Dayadhvam.",Space,Str "Damyata."]]
,Div ("wasteland-content.xhtml#ln434",["linegroup","indent"],[])
[Plain [Span ("",[],[("lang","sa")]) [Str "Shantih",Space,Str "shantih",Space,Str "shantih",Note [Para [Link ("",[],[]) [Str "434."] ("#wasteland-content.xhtml#ln434",""),Space,Str "Shantih.",Space,Str "Repeated",Space,Str "as",Space,Str "here,",Space,Str "a",Space,Str "formal",Space,Str "ending",Space,Str "to",Space,Str "an",Space,Str "Upanishad.",Space,Str "'The",SoftBreak,Str "Peace",Space,Str "which",Space,Str "passeth",Space,Str "understanding'",Space,Str "is",Space,Str "a",Space,Str "feeble",Space,Str "translation",Space,Str "of",Space,Str "the",SoftBreak,Str "content",Space,Str "of",Space,Str "this",Space,Str "word."]],SoftBreak]]]]]]
,Div ("wasteland-content.xhtml#backmatter",["section"],[("type","backmatter")])
[Div ("wasteland-content.xhtml#rearnotes",["section"],[("type","rearnotes")])
,Div ("wasteland-content.xhtml#backmatter",["section","backmatter"],[])
[Div ("wasteland-content.xhtml#rearnotes",["section","rearnotes"],[])
[Header 2 ("",[],[]) [Str "NOTES",Space,Str "ON",Space,Str "\"THE",Space,Str "WASTE",Space,Str "LAND\""]
,Para [Str "Not",Space,Str "only",Space,Str "the",Space,Str "title,",Space,Str "but",Space,Str "the",Space,Str "plan",Space,Str "and",Space,Str "a",Space,Str "good",Space,Str "deal",Space,Str "of",Space,Str "the",Space,Str "incidental",Space,Str "symbolism",Space,Str "of",SoftBreak,Str "the",Space,Str "poem",Space,Str "were",Space,Str "suggested",Space,Str "by",Space,Str "Miss",Space,Str "Jessie",Space,Str "L.",Space,Str "Weston's",Space,Str "book",Space,Str "on",Space,Str "the",Space,Str "Grail",Space,Str "legend:",SoftBreak,Str "From",Space,Str "Ritual",Space,Str "to",Space,Str "Romance"]
,Para [Str "Indeed,",Space,Str "so",Space,Str "deeply",Space,Str "am",Space,Str "I",Space,Str "indebted,",Space,Str "Miss",Space,Str "Weston's",Space,Str "book",Space,Str "will",Space,Str "elucidate",Space,Str "the",SoftBreak,Str "difficulties",Space,Str "of",Space,Str "the",Space,Str "poem",Space,Str "much",Space,Str "better",Space,Str "than",Space,Str "my",Space,Str "notes",Space,Str "can",Space,Str "do;",Space,Str "and",Space,Str "I",Space,Str "recommend",Space,Str "it",SoftBreak,Str "(apart",Space,Str "from",Space,Str "the",Space,Str "great",Space,Str "interest",Space,Str "of",Space,Str "the",Space,Str "book",Space,Str "itself)",Space,Str "to",Space,Str "any",Space,Str "who",Space,Str "think",Space,Str "such",SoftBreak,Str "elucidation",Space,Str "of",Space,Str "the",Space,Str "poem",Space,Str "worth",Space,Str "the",Space,Str "trouble.",Space,Str "To",Space,Str "another",Space,Str "work",Space,Str "of",Space,Str "anthropology",Space,Str "I",Space,Str "am",SoftBreak,Str "indebted",Space,Str "in",Space,Str "general,",Space,Str "one",Space,Str "which",Space,Str "has",Space,Str "influenced",Space,Str "our",Space,Str "generation",Space,Str "profoundly;",Space,Str "I",Space,Str "mean",SoftBreak,Str "The",Space,Str "Golden",Space,Str "Bough;",Space,Str "I",Space,Str "have",Space,Str "used",Space,Str "especially",Space,Str "the",Space,Str "two",Space,Str "volumes",Space,Str "Adonis,",Space,Str "Attis,",Space,Str "Osiris.",SoftBreak,Str "Anyone",Space,Str "who",Space,Str "is",Space,Str "acquainted",Space,Str "with",Space,Str "these",Space,Str "works",Space,Str "will",Space,Str "immediately",Space,Str "recognise",Space,Str "in",Space,Str "the",Space,Str "poem",SoftBreak,Str "certain",Space,Str "references",Space,Str "to",Space,Str "vegetation",Space,Str "ceremonies."]