HTML reader: retain attribute prefixes and avoid duplicates.
Previously we stripped attribute prefixes, reading `xml:lang` as `lang` for example. This resulted in two duplicate `lang` attributes when `xml:lang` and `lang` were both used. This commit causes the prefixes to be retained, and also avoids invalid duplicate attributes. Closes #6938.
This commit is contained in:
parent
248a2a1db5
commit
0a502e5ff5
4 changed files with 29 additions and 29 deletions
|
@ -74,7 +74,7 @@ readHtml :: PandocMonad m
|
|||
-> Text -- ^ String to parse (assumes @'\n'@ line endings)
|
||||
-> m Pandoc
|
||||
readHtml opts inp = do
|
||||
let tags = stripPrefixes . canonicalizeTags $
|
||||
let tags = stripPrefixes $ canonicalizeTags $
|
||||
parseTagsOptions parseOptions{ optTagPosition = True }
|
||||
(crFilter inp)
|
||||
parseDoc = do
|
||||
|
@ -95,6 +95,15 @@ readHtml opts inp = do
|
|||
Right doc -> return doc
|
||||
Left err -> throwError $ PandocParseError $ T.pack $ getError err
|
||||
|
||||
-- Strip namespace prefixes on tags (not attributes)
|
||||
stripPrefixes :: [Tag Text] -> [Tag Text]
|
||||
stripPrefixes = map stripPrefix
|
||||
|
||||
stripPrefix :: Tag Text -> Tag Text
|
||||
stripPrefix (TagOpen s as) = TagOpen (T.takeWhileEnd (/=':') s) as
|
||||
stripPrefix (TagClose s) = TagClose (T.takeWhileEnd (/=':') s)
|
||||
stripPrefix x = x
|
||||
|
||||
replaceNotes :: PandocMonad m => [Block] -> TagParser m [Block]
|
||||
replaceNotes bs = do
|
||||
st <- getState
|
||||
|
@ -114,7 +123,7 @@ setInPlain = local (\s -> s {inPlain = True})
|
|||
pHtml :: PandocMonad m => TagParser m Blocks
|
||||
pHtml = try $ do
|
||||
(TagOpen "html" attr) <- lookAhead pAny
|
||||
for_ (lookup "lang" attr) $
|
||||
for_ (lookup "lang" attr <|> lookup "xml:lang" attr) $
|
||||
updateState . B.setMeta "lang" . B.text
|
||||
pInTags "html" block
|
||||
|
||||
|
@ -1024,21 +1033,6 @@ htmlTag f = try $ do
|
|||
handleTag tagname
|
||||
_ -> mzero
|
||||
|
||||
-- Strip namespace prefixes
|
||||
stripPrefixes :: [Tag Text] -> [Tag Text]
|
||||
stripPrefixes = map stripPrefix
|
||||
|
||||
stripPrefix :: Tag Text -> Tag Text
|
||||
stripPrefix (TagOpen s as) =
|
||||
TagOpen (stripPrefix' s) (map (first stripPrefix') as)
|
||||
stripPrefix (TagClose s) = TagClose (stripPrefix' s)
|
||||
stripPrefix x = x
|
||||
|
||||
stripPrefix' :: Text -> Text
|
||||
stripPrefix' s =
|
||||
if T.null t then s else T.drop 1 t
|
||||
where (_, t) = T.span (/= ':') s
|
||||
|
||||
-- Utilities
|
||||
|
||||
-- | Adjusts a url according to the document's base URL.
|
||||
|
|
|
@ -193,14 +193,20 @@ t1 `closes` t2 |
|
|||
_ `closes` _ = False
|
||||
|
||||
toStringAttr :: [(Text, Text)] -> [(Text, Text)]
|
||||
toStringAttr = map go
|
||||
toStringAttr = foldr go []
|
||||
where
|
||||
go (x,y) =
|
||||
go :: (Text, Text) -> [(Text, Text)] -> [(Text, Text)]
|
||||
-- treat xml:lang as lang
|
||||
go ("xml:lang",y) ats = go ("lang",y) ats
|
||||
-- prevent duplicate attributes
|
||||
go (x,y) ats
|
||||
| any (\(x',_) -> x == x') ats = ats
|
||||
| otherwise =
|
||||
case T.stripPrefix "data-" x of
|
||||
Just x' | x' `Set.notMember` (html5Attributes <>
|
||||
html4Attributes <> rdfaAttributes)
|
||||
-> (x',y)
|
||||
_ -> (x,y)
|
||||
-> go (x',y) ats
|
||||
_ -> (x,y):ats
|
||||
|
||||
-- Unlike fromAttrib from tagsoup, this distinguishes
|
||||
-- between a missing attribute and an attribute with empty content.
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
^D
|
||||
<p><span id="title_page.xhtml"></span></p>
|
||||
<p><span id="nav.xhtml"></span></p>
|
||||
<nav type="landmarks" id="landmarks" hidden="hidden">
|
||||
<nav epub:type="landmarks" id="landmarks" hidden="hidden">
|
||||
<ol>
|
||||
<li><a href="text/title_page.xhtml">Title Page</a></li>
|
||||
<li><a href="#nav.xhtml#toc">Table of Contents</a></li>
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
[Para [Image ("",[],[]) [] ("wasteland-cover.jpg","")]
|
||||
,Para [Span ("wasteland-content.xhtml",[],[]) []]
|
||||
,Div ("wasteland-content.xhtml#frontmatter",["section"],[("type","frontmatter")])
|
||||
,Div ("wasteland-content.xhtml#frontmatter",["section","frontmatter"],[])
|
||||
[]
|
||||
,Div ("wasteland-content.xhtml#bodymatter",["section"],[("type","bodymatter")])
|
||||
,Div ("wasteland-content.xhtml#bodymatter",["section","bodymatter"],[])
|
||||
[Div ("wasteland-content.xhtml#ch1",["section"],[])
|
||||
[Header 2 ("",[],[]) [Str "I.",Space,Str "THE",Space,Str "BURIAL",Space,Str "OF",Space,Str "THE",Space,Str "DEAD"]
|
||||
,Div ("",["linegroup"],[])
|
||||
|
@ -922,8 +922,8 @@
|
|||
[Plain [Str "Datta.",Space,Str "Dayadhvam.",Space,Str "Damyata."]]
|
||||
,Div ("wasteland-content.xhtml#ln434",["linegroup","indent"],[])
|
||||
[Plain [Span ("",[],[("lang","sa")]) [Str "Shantih",Space,Str "shantih",Space,Str "shantih",Note [Para [Link ("",[],[]) [Str "434."] ("#wasteland-content.xhtml#ln434",""),Space,Str "Shantih.",Space,Str "Repeated",Space,Str "as",Space,Str "here,",Space,Str "a",Space,Str "formal",Space,Str "ending",Space,Str "to",Space,Str "an",Space,Str "Upanishad.",Space,Str "'The",SoftBreak,Str "Peace",Space,Str "which",Space,Str "passeth",Space,Str "understanding'",Space,Str "is",Space,Str "a",Space,Str "feeble",Space,Str "translation",Space,Str "of",Space,Str "the",SoftBreak,Str "content",Space,Str "of",Space,Str "this",Space,Str "word."]],SoftBreak]]]]]]
|
||||
,Div ("wasteland-content.xhtml#backmatter",["section"],[("type","backmatter")])
|
||||
[Div ("wasteland-content.xhtml#rearnotes",["section"],[("type","rearnotes")])
|
||||
,Div ("wasteland-content.xhtml#backmatter",["section","backmatter"],[])
|
||||
[Div ("wasteland-content.xhtml#rearnotes",["section","rearnotes"],[])
|
||||
[Header 2 ("",[],[]) [Str "NOTES",Space,Str "ON",Space,Str "\"THE",Space,Str "WASTE",Space,Str "LAND\""]
|
||||
,Para [Str "Not",Space,Str "only",Space,Str "the",Space,Str "title,",Space,Str "but",Space,Str "the",Space,Str "plan",Space,Str "and",Space,Str "a",Space,Str "good",Space,Str "deal",Space,Str "of",Space,Str "the",Space,Str "incidental",Space,Str "symbolism",Space,Str "of",SoftBreak,Str "the",Space,Str "poem",Space,Str "were",Space,Str "suggested",Space,Str "by",Space,Str "Miss",Space,Str "Jessie",Space,Str "L.",Space,Str "Weston's",Space,Str "book",Space,Str "on",Space,Str "the",Space,Str "Grail",Space,Str "legend:",SoftBreak,Str "From",Space,Str "Ritual",Space,Str "to",Space,Str "Romance"]
|
||||
,Para [Str "Indeed,",Space,Str "so",Space,Str "deeply",Space,Str "am",Space,Str "I",Space,Str "indebted,",Space,Str "Miss",Space,Str "Weston's",Space,Str "book",Space,Str "will",Space,Str "elucidate",Space,Str "the",SoftBreak,Str "difficulties",Space,Str "of",Space,Str "the",Space,Str "poem",Space,Str "much",Space,Str "better",Space,Str "than",Space,Str "my",Space,Str "notes",Space,Str "can",Space,Str "do;",Space,Str "and",Space,Str "I",Space,Str "recommend",Space,Str "it",SoftBreak,Str "(apart",Space,Str "from",Space,Str "the",Space,Str "great",Space,Str "interest",Space,Str "of",Space,Str "the",Space,Str "book",Space,Str "itself)",Space,Str "to",Space,Str "any",Space,Str "who",Space,Str "think",Space,Str "such",SoftBreak,Str "elucidation",Space,Str "of",Space,Str "the",Space,Str "poem",Space,Str "worth",Space,Str "the",Space,Str "trouble.",Space,Str "To",Space,Str "another",Space,Str "work",Space,Str "of",Space,Str "anthropology",Space,Str "I",Space,Str "am",SoftBreak,Str "indebted",Space,Str "in",Space,Str "general,",Space,Str "one",Space,Str "which",Space,Str "has",Space,Str "influenced",Space,Str "our",Space,Str "generation",Space,Str "profoundly;",Space,Str "I",Space,Str "mean",SoftBreak,Str "The",Space,Str "Golden",Space,Str "Bough;",Space,Str "I",Space,Str "have",Space,Str "used",Space,Str "especially",Space,Str "the",Space,Str "two",Space,Str "volumes",Space,Str "Adonis,",Space,Str "Attis,",Space,Str "Osiris.",SoftBreak,Str "Anyone",Space,Str "who",Space,Str "is",Space,Str "acquainted",Space,Str "with",Space,Str "these",Space,Str "works",Space,Str "will",Space,Str "immediately",Space,Str "recognise",Space,Str "in",Space,Str "the",Space,Str "poem",SoftBreak,Str "certain",Space,Str "references",Space,Str "to",Space,Str "vegetation",Space,Str "ceremonies."]
|
||||
|
|
Loading…
Reference in a new issue