Fix parsing of epub footnotes.

Closes #7884.
This commit is contained in:
John MacFarlane 2022-02-08 00:01:17 -08:00
parent 7dc59aa26a
commit 7a888e8603
3 changed files with 61 additions and 23 deletions

View file

@ -89,7 +89,7 @@ readHtml opts inp = do
result <- flip runReaderT def $ result <- flip runReaderT def $
runParserT parseDoc runParserT parseDoc
(HTMLState def{ stateOptions = opts } (HTMLState def{ stateOptions = opts }
[] Nothing Set.empty [] M.empty opts) [] Nothing Set.empty [] M.empty opts False)
"source" tags "source" tags
case result of case result of
Right doc -> return doc Right doc -> return doc
@ -106,8 +106,8 @@ stripPrefix x = x
replaceNotes :: PandocMonad m => [Block] -> TagParser m [Block] replaceNotes :: PandocMonad m => [Block] -> TagParser m [Block]
replaceNotes bs = do replaceNotes bs = do
st <- getState notes <- noteTable <$> getState
walkM (replaceNotes' (noteTable st)) bs walkM (replaceNotes' notes) bs
replaceNotes' :: PandocMonad m replaceNotes' :: PandocMonad m
=> [(Text, Blocks)] -> Inline -> TagParser m Inline => [(Text, Blocks)] -> Inline -> TagParser m Inline
@ -178,6 +178,9 @@ block = ((do
, epubExts , epubExts
, "chapter" `T.isInfixOf` type' , "chapter" `T.isInfixOf` type'
-> eSection -> eSection
_ | epubExts
, type' `elem` ["footnotes", "rearnotes"]
-> eFootnotes
_ | epubExts _ | epubExts
, type' `elem` ["footnote", "rearnote"] , type' `elem` ["footnote", "rearnote"]
-> mempty <$ eFootnote -> mempty <$ eFootnote
@ -256,19 +259,39 @@ eCase = do
Nothing -> Nothing <$ manyTill pAny (pSatisfy (matchTagClose "case")) Nothing -> Nothing <$ manyTill pAny (pSatisfy (matchTagClose "case"))
eFootnote :: PandocMonad m => TagParser m () eFootnote :: PandocMonad m => TagParser m ()
eFootnote = try $ do eFootnote = do
let notes = ["footnote", "rearnote"] guardEnabled Ext_epub_html_exts
TagOpen tag attr' <- lookAhead $ pSatisfy
(\case
TagOpen _ attr'
-> case lookup "type" attr' <|> lookup "epub:type" attr' of
Just "footnote" -> True
Just "rearnote" -> True
_ -> False
_ -> False)
let attr = toStringAttr attr'
let ident = fromMaybe "" (lookup "id" attr)
content <- pInTags tag block
updateState $ \s ->
s {noteTable = (ident, content) : noteTable s}
eFootnotes :: PandocMonad m => TagParser m Blocks
eFootnotes = try $ do
let notes = ["footnotes", "rearnotes"]
guardEnabled Ext_epub_html_exts guardEnabled Ext_epub_html_exts
(TagOpen tag attr') <- lookAhead pAny (TagOpen tag attr') <- lookAhead pAny
let attr = toStringAttr attr' let attr = toStringAttr attr'
guard $ maybe False (`elem` notes) guard $ maybe False (`elem` notes)
(lookup "type" attr <|> lookup "epub:type" attr) (lookup "type" attr <|> lookup "epub:type" attr)
let ident = fromMaybe "" (lookup "id" attr) updateState $ \s -> s{ inFootnotes = True }
content <- pInTags tag block result <- pInTags tag block
addNote ident content updateState $ \s -> s{ inFootnotes = False }
if null result
addNote :: PandocMonad m => Text -> Blocks -> TagParser m () -- if it just contains notes, we don't need the container:
addNote uid cont = updateState (\s -> s {noteTable = (uid, cont) : noteTable s}) then return result
-- but there might be content other than notes, in which case
-- we want a div:
else return $ B.divWith (toAttr attr') result
eNoteref :: PandocMonad m => TagParser m Inlines eNoteref :: PandocMonad m => TagParser m Inlines
eNoteref = try $ do eNoteref = try $ do
@ -337,6 +360,7 @@ parseTypeAttr _ = DefaultStyle
pOrderedList :: PandocMonad m => TagParser m Blocks pOrderedList :: PandocMonad m => TagParser m Blocks
pOrderedList = try $ do pOrderedList = try $ do
TagOpen _ attribs' <- pSatisfy (matchTagOpen "ol" []) TagOpen _ attribs' <- pSatisfy (matchTagOpen "ol" [])
isNoteList <- inFootnotes <$> getState
let attribs = toStringAttr attribs' let attribs = toStringAttr attribs'
let start = fromMaybe 1 $ lookup "start" attribs >>= safeRead let start = fromMaybe 1 $ lookup "start" attribs >>= safeRead
let style = fromMaybe DefaultStyle let style = fromMaybe DefaultStyle
@ -352,8 +376,14 @@ pOrderedList = try $ do
-- note: if they have an <ol> or <ul> not in scope of a <li>, -- note: if they have an <ol> or <ul> not in scope of a <li>,
-- treat it as a list item, though it's not valid xhtml... -- treat it as a list item, though it's not valid xhtml...
skipMany nonItem skipMany nonItem
items <- manyTill (pListItem nonItem) (pCloses "ol") if isNoteList
return $ B.orderedListWith (start, style, DefaultDelim) $ map (fixPlains True) items then do
_ <- manyTill (eFootnote <|> pBlank) (pCloses "ol")
return mempty
else do
items <- manyTill (pListItem nonItem) (pCloses "ol")
return $ B.orderedListWith (start, style, DefaultDelim) $
map (fixPlains True) items
pDefinitionList :: PandocMonad m => TagParser m Blocks pDefinitionList :: PandocMonad m => TagParser m Blocks
pDefinitionList = try $ do pDefinitionList = try $ do
@ -518,7 +548,10 @@ pHeader = try $ do
pHrule :: PandocMonad m => TagParser m Blocks pHrule :: PandocMonad m => TagParser m Blocks
pHrule = do pHrule = do
pSelfClosing (=="hr") (const True) pSelfClosing (=="hr") (const True)
return B.horizontalRule inNotes <- inFootnotes <$> getState
return $ if inNotes
then mempty
else B.horizontalRule
pBlockQuote :: PandocMonad m => TagParser m Blocks pBlockQuote :: PandocMonad m => TagParser m Blocks
pBlockQuote = do pBlockQuote = do
@ -715,14 +748,18 @@ pLink = try $ do
let title = fromAttrib "title" tag let title = fromAttrib "title" tag
let attr = toAttr $ filter (\(k,_) -> k /= "title" && k /= "href") attr' let attr = toAttr $ filter (\(k,_) -> k /= "title" && k /= "href") attr'
lab <- mconcat <$> manyTill inline (pCloses "a") lab <- mconcat <$> manyTill inline (pCloses "a")
-- check for href; if href, then a link, otherwise a span st <- getState
case maybeFromAttrib "href" tag of if inFootnotes st && maybeFromAttrib "role" tag == Just "doc-backlink"
Nothing -> then return mempty
return $ extractSpaces (B.spanWith attr) lab else do
Just url' -> do -- check for href; if href, then a link, otherwise a span
url <- canonicalizeUrl url' case maybeFromAttrib "href" tag of
return $ extractSpaces Nothing ->
(B.linkWith attr (escapeURI url) title) lab return $ extractSpaces (B.spanWith attr) lab
Just url' -> do
url <- canonicalizeUrl url'
return $ extractSpaces
(B.linkWith attr (escapeURI url) title) lab
pImage :: PandocMonad m => TagParser m Inlines pImage :: PandocMonad m => TagParser m Inlines
pImage = do pImage = do

View file

@ -52,6 +52,7 @@ data HTMLState = HTMLState
, logMessages :: [LogMessage] , logMessages :: [LogMessage]
, macros :: Map Text Macro , macros :: Map Text Macro
, readerOpts :: ReaderOptions , readerOpts :: ReaderOptions
, inFootnotes :: Bool
} }
-- | Local HTML parser state -- | Local HTML parser state

View file

@ -4230,7 +4230,7 @@
) )
[ Div [ Div
( "wasteland-content.xhtml#rearnotes" ( "wasteland-content.xhtml#rearnotes"
, [ "section" , "rearnotes" ] , [ "rearnotes" ]
, [] , []
) )
[ Header [ Header