DocBook reader: handle complete set of entities...

as specified at <https://www.w3.org/2003/entities/2007doc/byalpha.html>.

Closes #7938.
This commit is contained in:
John MacFarlane 2022-02-24 15:50:53 -08:00
parent 7dea81f992
commit 5375bd1446
3 changed files with 2262 additions and 2 deletions

2237
data/docbook-entities.txt Normal file

File diff suppressed because it is too large Load diff

View file

@ -93,6 +93,8 @@ data-files:
data/templates/default.markua data/templates/default.markua
-- translations -- translations
data/translations/*.yaml data/translations/*.yaml
-- entities
data/docbook-entities.txt
-- source files for reference.docx -- source files for reference.docx
data/docx/[Content_Types].xml data/docx/[Content_Types].xml
data/docx/_rels/.rels data/docx/_rels/.rels

View file

@ -1,4 +1,5 @@
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TemplateHaskell #-}
{- | {- |
Module : Text.Pandoc.Readers.DocBook Module : Text.Pandoc.Readers.DocBook
Copyright : Copyright (C) 2006-2022 John MacFarlane Copyright : Copyright (C) 2006-2022 John MacFarlane
@ -12,7 +13,9 @@ Conversion of DocBook XML to 'Pandoc' document.
-} -}
module Text.Pandoc.Readers.DocBook ( readDocBook ) where module Text.Pandoc.Readers.DocBook ( readDocBook ) where
import Control.Monad.State.Strict import Control.Monad.State.Strict
import Data.Char (isSpace, isLetter) import Data.ByteString (ByteString)
import Data.FileEmbed
import Data.Char (isSpace, isLetter, chr)
import Data.Default import Data.Default
import Data.Either (rights) import Data.Either (rights)
import Data.Foldable (asum) import Data.Foldable (asum)
@ -21,6 +24,8 @@ import Data.List (intersperse,elemIndex)
import Data.List.NonEmpty (nonEmpty) import Data.List.NonEmpty (nonEmpty)
import Data.Maybe (catMaybes,fromMaybe,mapMaybe,maybeToList) import Data.Maybe (catMaybes,fromMaybe,mapMaybe,maybeToList)
import Data.Text (Text) import Data.Text (Text)
import Data.Text.Read as TR
import Data.Text.Encoding (decodeUtf8)
import qualified Data.Text as T import qualified Data.Text as T
import qualified Data.Text.Lazy as TL import qualified Data.Text.Lazy as TL
import Control.Monad.Except (throwError) import Control.Monad.Except (throwError)
@ -33,6 +38,7 @@ import Text.Pandoc.Logging (LogMessage(..))
import Text.Pandoc.Shared (safeRead, extractSpaces) import Text.Pandoc.Shared (safeRead, extractSpaces)
import Text.Pandoc.Sources (ToSources(..), sourcesToText) import Text.Pandoc.Sources (ToSources(..), sourcesToText)
import Text.TeXMath (readMathML, writeTeX) import Text.TeXMath (readMathML, writeTeX)
import qualified Data.Map as M
import Text.Pandoc.XML.Light import Text.Pandoc.XML.Light
{- {-
@ -548,7 +554,8 @@ readDocBook :: (PandocMonad m, ToSources a)
readDocBook _ inp = do readDocBook _ inp = do
let sources = toSources inp let sources = toSources inp
tree <- either (throwError . PandocXMLError "") return $ tree <- either (throwError . PandocXMLError "") return $
parseXMLContents parseXMLContentsWithEntities
docbookEntityMap
(TL.fromStrict . handleInstructions . sourcesToText $ sources) (TL.fromStrict . handleInstructions . sourcesToText $ sources)
(bs, st') <- flip runStateT (def{ dbContent = tree }) $ mapM parseBlock tree (bs, st') <- flip runStateT (def{ dbContent = tree }) $ mapM parseBlock tree
return $ Pandoc (dbMeta st') (toList . mconcat $ bs) return $ Pandoc (dbMeta st') (toList . mconcat $ bs)
@ -1335,3 +1342,17 @@ paraToPlain :: Block -> Block
paraToPlain (Para ils) = Plain ils paraToPlain (Para ils) = Plain ils
paraToPlain x = x paraToPlain x = x
-- | Table of named entities made available to the XML parser when reading
-- DocBook, keyed by entity name; each value is the replacement text.
--
-- The table is built from the embedded 'docbookEntities' data, decoded as
-- UTF-8 and processed line by line: the first whitespace-separated word of
-- a line is the entity name, and every following word is read as a
-- hexadecimal code point and turned into one character of the replacement.
--
-- Lines containing no words are skipped, so they do not add a spurious
-- empty-named entry to the map. Words that do not begin with a hexadecimal
-- number, or whose value lies outside the range of 'Char', are dropped
-- instead of raising an error from 'chr'.
docbookEntityMap :: M.Map Text Text
docbookEntityMap =
  M.fromList (mapMaybe lineToPair (T.lines (decodeUtf8 docbookEntities)))
  where
    -- One data line becomes one (name, replacement) pair; blank lines
    -- yield nothing.
    lineToPair l =
      case T.words l of
        (name : codes) -> Just (name, T.pack (mapMaybe readHex codes))
        []             -> Nothing
    -- 'TR.hexadecimal' parses a leading run of hex digits; anything left
    -- over after that run is ignored, as before. The range guard keeps
    -- 'chr' total, including when a very long digit run wraps around Int.
    readHex t =
      case TR.hexadecimal t of
        Right (n, _)
          | n >= 0 && n <= maxCodePoint -> Just (chr n)
        _ -> Nothing
    maxCodePoint = fromEnum (maxBound :: Char)
-- | Raw bytes of @data/docbook-entities.txt@, spliced into the module at
-- compile time by 'embedFile' (hence the @TemplateHaskell@ extension), so
-- the entity data does not have to be read from disk at run time.
-- 'docbookEntityMap' decodes these bytes as UTF-8 and parses them.
docbookEntities :: ByteString
docbookEntities = $(embedFile "data/docbook-entities.txt")