T.P.XML Improve fromEntities.

This commit is contained in:
John MacFarlane 2021-02-18 18:11:27 -08:00
parent 0f5c56dfb1
commit ef642e2bbc

View file

@ -1,6 +1,5 @@
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE ViewPatterns #-}
{- |
Module : Text.Pandoc.XML
Copyright : Copyright (C) 2006-2021 John MacFarlane
@ -123,23 +122,20 @@ html5EntityMap = foldr go mempty htmlEntities
-- Unescapes XML entities
fromEntities :: Text -> Text
fromEntities = T.pack . fromEntities'
fromEntities t
= let (x, y) = T.break (== '&') t
in if T.null y
then t
else x <>
let (ent, rest) = T.break (\c -> isSpace c || c == ';') y
rest' = case T.uncons rest of
Just (';',ys) -> ys
_ -> rest
ent' = T.drop 1 ent <> ";"
in case T.pack <$> lookupEntity (T.unpack ent') of
Just c -> c <> fromEntities rest'
Nothing -> ent <> fromEntities rest
fromEntities' :: Text -> String
fromEntities' (T.uncons -> Just ('&', xs)) =
case lookupEntity $ T.unpack ent' of
Just c -> c <> fromEntities' rest
Nothing -> "&" <> fromEntities' xs
where (ent, rest) = case T.break (\c -> isSpace c || c == ';') xs of
(zs,T.uncons -> Just (';',ys)) -> (zs,ys)
(zs, ys) -> (zs,ys)
ent'
| Just ys <- T.stripPrefix "#X" ent = "#x" <> ys -- workaround tagsoup bug
| Just ('#', _) <- T.uncons ent = ent
| otherwise = ent <> ";"
fromEntities' t = case T.uncons t of
Just (x, xs) -> x : fromEntities' xs
Nothing -> ""
html5Attributes :: Set.Set Text
html5Attributes = Set.fromList