Entity handling fixes:

- Text.Pandoc.XML.fromEntities:  handle entities without a
  semicolon. Always look up named character references with
  the trailing ';', even if it wasn't present in the input,
  and never add it when looking up numerical entities.
  (This is what tagsoup seems to require.)
- Text.Pandoc.Parsing.characterReference:  Always look up
  named character references with the trailing ';', and
  leave off the ';' when looking up numerical entities.

This fixes a regression for e.g. `&lang;`.
This commit is contained in:
John MacFarlane 2016-01-08 17:08:01 -08:00
parent 52d95ddde1
commit 12a5bd3c8d
2 changed files with 10 additions and 3 deletions

View file

@ -573,7 +573,10 @@ characterReference :: Stream s m Char => ParserT s st m Char
characterReference = try $ do
char '&'
ent <- many1Till nonspaceChar (char ';')
case lookupEntity ent of
let ent' = case ent of
'#':_ -> ent
_ -> ent ++ ";"
case lookupEntity ent' of
Just c -> return c
Nothing -> fail "entity not found"

View file

@ -100,11 +100,15 @@ toEntities (c:cs)
-- Unescapes XML entities
fromEntities :: String -> String
fromEntities ('&':xs) =
case lookupEntity ent of
case lookupEntity ent' of
Just c -> c : fromEntities rest
Nothing -> '&' : fromEntities xs
where (ent, rest) = case break (\c -> isSpace c || c == ';') xs of
(zs,';':ys) -> (zs,ys)
_ -> ("",xs)
(zs, ys) -> (zs,ys)
ent' = case ent of
'#':_ -> ent
_ -> ent ++ ";"
fromEntities (x:xs) = x : fromEntities xs
fromEntities [] = []