DocBook reader: handle complete set of entities...

as specified at <https://www.w3.org/2003/entities/2007doc/byalpha.html>.

Closes #7938.
This commit is contained in:
John MacFarlane 2022-02-24 15:50:53 -08:00
parent 7dea81f992
commit 5375bd1446
3 changed files with 2262 additions and 2 deletions

2237
data/docbook-entities.txt Normal file

File diff suppressed because it is too large Load diff

View file

@ -93,6 +93,8 @@ data-files:
data/templates/default.markua
-- translations
data/translations/*.yaml
-- entities
data/docbook-entities.txt
-- source files for reference.docx
data/docx/[Content_Types].xml
data/docx/_rels/.rels

View file

@ -1,4 +1,5 @@
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TemplateHaskell #-}
{- |
Module : Text.Pandoc.Readers.DocBook
Copyright : Copyright (C) 2006-2022 John MacFarlane
@ -12,7 +13,9 @@ Conversion of DocBook XML to 'Pandoc' document.
-}
module Text.Pandoc.Readers.DocBook ( readDocBook ) where
import Control.Monad.State.Strict
import Data.Char (isSpace, isLetter)
import Data.ByteString (ByteString)
import Data.FileEmbed
import Data.Char (isSpace, isLetter, chr)
import Data.Default
import Data.Either (rights)
import Data.Foldable (asum)
@ -21,6 +24,8 @@ import Data.List (intersperse,elemIndex)
import Data.List.NonEmpty (nonEmpty)
import Data.Maybe (catMaybes,fromMaybe,mapMaybe,maybeToList)
import Data.Text (Text)
import Data.Text.Read as TR
import Data.Text.Encoding (decodeUtf8)
import qualified Data.Text as T
import qualified Data.Text.Lazy as TL
import Control.Monad.Except (throwError)
@ -33,6 +38,7 @@ import Text.Pandoc.Logging (LogMessage(..))
import Text.Pandoc.Shared (safeRead, extractSpaces)
import Text.Pandoc.Sources (ToSources(..), sourcesToText)
import Text.TeXMath (readMathML, writeTeX)
import qualified Data.Map as M
import Text.Pandoc.XML.Light
{-
@ -548,7 +554,8 @@ readDocBook :: (PandocMonad m, ToSources a)
readDocBook _ inp = do
let sources = toSources inp
tree <- either (throwError . PandocXMLError "") return $
parseXMLContents
parseXMLContentsWithEntities
docbookEntityMap
(TL.fromStrict . handleInstructions . sourcesToText $ sources)
(bs, st') <- flip runStateT (def{ dbContent = tree }) $ mapM parseBlock tree
return $ Pandoc (dbMeta st') (toList . mconcat $ bs)
@ -1335,3 +1342,17 @@ paraToPlain :: Block -> Block
paraToPlain (Para ils) = Plain ils
paraToPlain x = x
docbookEntityMap :: M.Map Text Text
docbookEntityMap = M.fromList
(map lineToPair (T.lines (decodeUtf8 docbookEntities)))
where
lineToPair l =
case T.words l of
(x:ys) -> (x, T.pack (mapMaybe readHex ys))
[] -> ("","")
readHex t = case TR.hexadecimal t of
Left _ -> Nothing
Right (n,_) -> Just (chr n)
docbookEntities :: ByteString
docbookEntities = $(embedFile "data/docbook-entities.txt")