HTML reader: retain attribute prefixes and avoid duplicates.

Previously we stripped attribute prefixes, reading
`xml:lang` as `lang` for example. This resulted in
two duplicate `lang` attributes when `xml:lang` and
`lang` were both used.  This commit causes the prefixes
to be retained, and also avoids invalid duplicate
attributes.

Closes #6938.
This commit is contained in:
John MacFarlane 2020-12-10 15:44:10 -08:00
parent 248a2a1db5
commit 0a502e5ff5
4 changed files with 29 additions and 29 deletions

View file

@ -74,7 +74,7 @@ readHtml :: PandocMonad m
-> Text -- ^ String to parse (assumes @'\n'@ line endings) -> Text -- ^ String to parse (assumes @'\n'@ line endings)
-> m Pandoc -> m Pandoc
readHtml opts inp = do readHtml opts inp = do
let tags = stripPrefixes . canonicalizeTags $ let tags = stripPrefixes $ canonicalizeTags $
parseTagsOptions parseOptions{ optTagPosition = True } parseTagsOptions parseOptions{ optTagPosition = True }
(crFilter inp) (crFilter inp)
parseDoc = do parseDoc = do
@ -95,6 +95,15 @@ readHtml opts inp = do
Right doc -> return doc Right doc -> return doc
Left err -> throwError $ PandocParseError $ T.pack $ getError err Left err -> throwError $ PandocParseError $ T.pack $ getError err
-- Strip namespace prefixes on tags (not attributes)
stripPrefixes :: [Tag Text] -> [Tag Text]
stripPrefixes = map stripPrefix
stripPrefix :: Tag Text -> Tag Text
stripPrefix (TagOpen s as) = TagOpen (T.takeWhileEnd (/=':') s) as
stripPrefix (TagClose s) = TagClose (T.takeWhileEnd (/=':') s)
stripPrefix x = x
replaceNotes :: PandocMonad m => [Block] -> TagParser m [Block] replaceNotes :: PandocMonad m => [Block] -> TagParser m [Block]
replaceNotes bs = do replaceNotes bs = do
st <- getState st <- getState
@ -114,7 +123,7 @@ setInPlain = local (\s -> s {inPlain = True})
pHtml :: PandocMonad m => TagParser m Blocks pHtml :: PandocMonad m => TagParser m Blocks
pHtml = try $ do pHtml = try $ do
(TagOpen "html" attr) <- lookAhead pAny (TagOpen "html" attr) <- lookAhead pAny
for_ (lookup "lang" attr) $ for_ (lookup "lang" attr <|> lookup "xml:lang" attr) $
updateState . B.setMeta "lang" . B.text updateState . B.setMeta "lang" . B.text
pInTags "html" block pInTags "html" block
@ -1024,21 +1033,6 @@ htmlTag f = try $ do
handleTag tagname handleTag tagname
_ -> mzero _ -> mzero
-- Strip namespace prefixes
stripPrefixes :: [Tag Text] -> [Tag Text]
stripPrefixes = map stripPrefix
stripPrefix :: Tag Text -> Tag Text
stripPrefix (TagOpen s as) =
TagOpen (stripPrefix' s) (map (first stripPrefix') as)
stripPrefix (TagClose s) = TagClose (stripPrefix' s)
stripPrefix x = x
stripPrefix' :: Text -> Text
stripPrefix' s =
if T.null t then s else T.drop 1 t
where (_, t) = T.span (/= ':') s
-- Utilities -- Utilities
-- | Adjusts a url according to the document's base URL. -- | Adjusts a url according to the document's base URL.

View file

@ -193,14 +193,20 @@ t1 `closes` t2 |
_ `closes` _ = False _ `closes` _ = False
toStringAttr :: [(Text, Text)] -> [(Text, Text)] toStringAttr :: [(Text, Text)] -> [(Text, Text)]
toStringAttr = map go toStringAttr = foldr go []
where where
go (x,y) = go :: (Text, Text) -> [(Text, Text)] -> [(Text, Text)]
case T.stripPrefix "data-" x of -- treat xml:lang as lang
Just x' | x' `Set.notMember` (html5Attributes <> go ("xml:lang",y) ats = go ("lang",y) ats
html4Attributes <> rdfaAttributes) -- prevent duplicate attributes
-> (x',y) go (x,y) ats
_ -> (x,y) | any (\(x',_) -> x == x') ats = ats
| otherwise =
case T.stripPrefix "data-" x of
Just x' | x' `Set.notMember` (html5Attributes <>
html4Attributes <> rdfaAttributes)
-> go (x',y) ats
_ -> (x,y):ats
-- Unlike fromAttrib from tagsoup, this distinguishes -- Unlike fromAttrib from tagsoup, this distinguishes
-- between a missing attribute and an attribute with empty content. -- between a missing attribute and an attribute with empty content.

View file

@ -4,7 +4,7 @@
^D ^D
<p><span id="title_page.xhtml"></span></p> <p><span id="title_page.xhtml"></span></p>
<p><span id="nav.xhtml"></span></p> <p><span id="nav.xhtml"></span></p>
<nav type="landmarks" id="landmarks" hidden="hidden"> <nav epub:type="landmarks" id="landmarks" hidden="hidden">
<ol> <ol>
<li><a href="text/title_page.xhtml">Title Page</a></li> <li><a href="text/title_page.xhtml">Title Page</a></li>
<li><a href="#nav.xhtml#toc">Table of Contents</a></li> <li><a href="#nav.xhtml#toc">Table of Contents</a></li>

View file

@ -1,8 +1,8 @@
[Para [Image ("",[],[]) [] ("wasteland-cover.jpg","")] [Para [Image ("",[],[]) [] ("wasteland-cover.jpg","")]
,Para [Span ("wasteland-content.xhtml",[],[]) []] ,Para [Span ("wasteland-content.xhtml",[],[]) []]
,Div ("wasteland-content.xhtml#frontmatter",["section"],[("type","frontmatter")]) ,Div ("wasteland-content.xhtml#frontmatter",["section","frontmatter"],[])
[] []
,Div ("wasteland-content.xhtml#bodymatter",["section"],[("type","bodymatter")]) ,Div ("wasteland-content.xhtml#bodymatter",["section","bodymatter"],[])
[Div ("wasteland-content.xhtml#ch1",["section"],[]) [Div ("wasteland-content.xhtml#ch1",["section"],[])
[Header 2 ("",[],[]) [Str "I.",Space,Str "THE",Space,Str "BURIAL",Space,Str "OF",Space,Str "THE",Space,Str "DEAD"] [Header 2 ("",[],[]) [Str "I.",Space,Str "THE",Space,Str "BURIAL",Space,Str "OF",Space,Str "THE",Space,Str "DEAD"]
,Div ("",["linegroup"],[]) ,Div ("",["linegroup"],[])
@ -922,8 +922,8 @@
[Plain [Str "Datta.",Space,Str "Dayadhvam.",Space,Str "Damyata."]] [Plain [Str "Datta.",Space,Str "Dayadhvam.",Space,Str "Damyata."]]
,Div ("wasteland-content.xhtml#ln434",["linegroup","indent"],[]) ,Div ("wasteland-content.xhtml#ln434",["linegroup","indent"],[])
[Plain [Span ("",[],[("lang","sa")]) [Str "Shantih",Space,Str "shantih",Space,Str "shantih",Note [Para [Link ("",[],[]) [Str "434."] ("#wasteland-content.xhtml#ln434",""),Space,Str "Shantih.",Space,Str "Repeated",Space,Str "as",Space,Str "here,",Space,Str "a",Space,Str "formal",Space,Str "ending",Space,Str "to",Space,Str "an",Space,Str "Upanishad.",Space,Str "'The",SoftBreak,Str "Peace",Space,Str "which",Space,Str "passeth",Space,Str "understanding'",Space,Str "is",Space,Str "a",Space,Str "feeble",Space,Str "translation",Space,Str "of",Space,Str "the",SoftBreak,Str "content",Space,Str "of",Space,Str "this",Space,Str "word."]],SoftBreak]]]]]] [Plain [Span ("",[],[("lang","sa")]) [Str "Shantih",Space,Str "shantih",Space,Str "shantih",Note [Para [Link ("",[],[]) [Str "434."] ("#wasteland-content.xhtml#ln434",""),Space,Str "Shantih.",Space,Str "Repeated",Space,Str "as",Space,Str "here,",Space,Str "a",Space,Str "formal",Space,Str "ending",Space,Str "to",Space,Str "an",Space,Str "Upanishad.",Space,Str "'The",SoftBreak,Str "Peace",Space,Str "which",Space,Str "passeth",Space,Str "understanding'",Space,Str "is",Space,Str "a",Space,Str "feeble",Space,Str "translation",Space,Str "of",Space,Str "the",SoftBreak,Str "content",Space,Str "of",Space,Str "this",Space,Str "word."]],SoftBreak]]]]]]
,Div ("wasteland-content.xhtml#backmatter",["section"],[("type","backmatter")]) ,Div ("wasteland-content.xhtml#backmatter",["section","backmatter"],[])
[Div ("wasteland-content.xhtml#rearnotes",["section"],[("type","rearnotes")]) [Div ("wasteland-content.xhtml#rearnotes",["section","rearnotes"],[])
[Header 2 ("",[],[]) [Str "NOTES",Space,Str "ON",Space,Str "\"THE",Space,Str "WASTE",Space,Str "LAND\""] [Header 2 ("",[],[]) [Str "NOTES",Space,Str "ON",Space,Str "\"THE",Space,Str "WASTE",Space,Str "LAND\""]
,Para [Str "Not",Space,Str "only",Space,Str "the",Space,Str "title,",Space,Str "but",Space,Str "the",Space,Str "plan",Space,Str "and",Space,Str "a",Space,Str "good",Space,Str "deal",Space,Str "of",Space,Str "the",Space,Str "incidental",Space,Str "symbolism",Space,Str "of",SoftBreak,Str "the",Space,Str "poem",Space,Str "were",Space,Str "suggested",Space,Str "by",Space,Str "Miss",Space,Str "Jessie",Space,Str "L.",Space,Str "Weston's",Space,Str "book",Space,Str "on",Space,Str "the",Space,Str "Grail",Space,Str "legend:",SoftBreak,Str "From",Space,Str "Ritual",Space,Str "to",Space,Str "Romance"] ,Para [Str "Not",Space,Str "only",Space,Str "the",Space,Str "title,",Space,Str "but",Space,Str "the",Space,Str "plan",Space,Str "and",Space,Str "a",Space,Str "good",Space,Str "deal",Space,Str "of",Space,Str "the",Space,Str "incidental",Space,Str "symbolism",Space,Str "of",SoftBreak,Str "the",Space,Str "poem",Space,Str "were",Space,Str "suggested",Space,Str "by",Space,Str "Miss",Space,Str "Jessie",Space,Str "L.",Space,Str "Weston's",Space,Str "book",Space,Str "on",Space,Str "the",Space,Str "Grail",Space,Str "legend:",SoftBreak,Str "From",Space,Str "Ritual",Space,Str "to",Space,Str "Romance"]
,Para [Str "Indeed,",Space,Str "so",Space,Str "deeply",Space,Str "am",Space,Str "I",Space,Str "indebted,",Space,Str "Miss",Space,Str "Weston's",Space,Str "book",Space,Str "will",Space,Str "elucidate",Space,Str "the",SoftBreak,Str "difficulties",Space,Str "of",Space,Str "the",Space,Str "poem",Space,Str "much",Space,Str "better",Space,Str "than",Space,Str "my",Space,Str "notes",Space,Str "can",Space,Str "do;",Space,Str "and",Space,Str "I",Space,Str "recommend",Space,Str "it",SoftBreak,Str "(apart",Space,Str "from",Space,Str "the",Space,Str "great",Space,Str "interest",Space,Str "of",Space,Str "the",Space,Str "book",Space,Str "itself)",Space,Str "to",Space,Str "any",Space,Str "who",Space,Str "think",Space,Str "such",SoftBreak,Str "elucidation",Space,Str "of",Space,Str "the",Space,Str "poem",Space,Str "worth",Space,Str "the",Space,Str "trouble.",Space,Str "To",Space,Str "another",Space,Str "work",Space,Str "of",Space,Str "anthropology",Space,Str "I",Space,Str "am",SoftBreak,Str "indebted",Space,Str "in",Space,Str "general,",Space,Str "one",Space,Str "which",Space,Str "has",Space,Str "influenced",Space,Str "our",Space,Str "generation",Space,Str "profoundly;",Space,Str "I",Space,Str "mean",SoftBreak,Str "The",Space,Str "Golden",Space,Str "Bough;",Space,Str "I",Space,Str "have",Space,Str "used",Space,Str "especially",Space,Str "the",Space,Str "two",Space,Str "volumes",Space,Str "Adonis,",Space,Str "Attis,",Space,Str "Osiris.",SoftBreak,Str "Anyone",Space,Str "who",Space,Str "is",Space,Str "acquainted",Space,Str "with",Space,Str "these",Space,Str "works",Space,Str "will",Space,Str "immediately",Space,Str "recognise",Space,Str "in",Space,Str "the",Space,Str "poem",SoftBreak,Str "certain",Space,Str "references",Space,Str "to",Space,Str "vegetation",Space,Str "ceremonies."] ,Para [Str "Indeed,",Space,Str "so",Space,Str "deeply",Space,Str "am",Space,Str "I",Space,Str "indebted,",Space,Str "Miss",Space,Str "Weston's",Space,Str "book",Space,Str "will",Space,Str 
"elucidate",Space,Str "the",SoftBreak,Str "difficulties",Space,Str "of",Space,Str "the",Space,Str "poem",Space,Str "much",Space,Str "better",Space,Str "than",Space,Str "my",Space,Str "notes",Space,Str "can",Space,Str "do;",Space,Str "and",Space,Str "I",Space,Str "recommend",Space,Str "it",SoftBreak,Str "(apart",Space,Str "from",Space,Str "the",Space,Str "great",Space,Str "interest",Space,Str "of",Space,Str "the",Space,Str "book",Space,Str "itself)",Space,Str "to",Space,Str "any",Space,Str "who",Space,Str "think",Space,Str "such",SoftBreak,Str "elucidation",Space,Str "of",Space,Str "the",Space,Str "poem",Space,Str "worth",Space,Str "the",Space,Str "trouble.",Space,Str "To",Space,Str "another",Space,Str "work",Space,Str "of",Space,Str "anthropology",Space,Str "I",Space,Str "am",SoftBreak,Str "indebted",Space,Str "in",Space,Str "general,",Space,Str "one",Space,Str "which",Space,Str "has",Space,Str "influenced",Space,Str "our",Space,Str "generation",Space,Str "profoundly;",Space,Str "I",Space,Str "mean",SoftBreak,Str "The",Space,Str "Golden",Space,Str "Bough;",Space,Str "I",Space,Str "have",Space,Str "used",Space,Str "especially",Space,Str "the",Space,Str "two",Space,Str "volumes",Space,Str "Adonis,",Space,Str "Attis,",Space,Str "Osiris.",SoftBreak,Str "Anyone",Space,Str "who",Space,Str "is",Space,Str "acquainted",Space,Str "with",Space,Str "these",Space,Str "works",Space,Str "will",Space,Str "immediately",Space,Str "recognise",Space,Str "in",Space,Str "the",Space,Str "poem",SoftBreak,Str "certain",Space,Str "references",Space,Str "to",Space,Str "vegetation",Space,Str "ceremonies."]