0114f68d21
of entity by character, in Entities.hs. This yields a small performance improvement. git-svn-id: https://pandoc.googlecode.com/svn/trunk@534 788f1e2b-df1e-0410-8736-df70ead52e1b
371 lines
9.4 KiB
Haskell
371 lines
9.4 KiB
Haskell
{-
|
|
Copyright (C) 2006 John MacFarlane <jgm at berkeley dot edu>
|
|
|
|
This program is free software; you can redistribute it and/or modify
|
|
it under the terms of the GNU General Public License as published by
|
|
the Free Software Foundation; either version 2 of the License, or
|
|
(at your option) any later version.
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
along with this program; if not, write to the Free Software
|
|
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
-}
|
|
|
|
{- |
|
|
Module : Text.Pandoc.Entities
|
|
Copyright : Copyright (C) 2006 John MacFarlane
|
|
License : GNU GPL, version 2 or above
|
|
|
|
Maintainer : John MacFarlane <jgm at berkeley dot edu>
|
|
Stability : alpha
|
|
Portability : portable
|
|
|
|
Functions for encoding Unicode characters as entity references,
|
|
and vice versa.
|
|
-}
|
|
module Text.Pandoc.Entities (
|
|
charToEntity,
|
|
charToNumericalEntity,
|
|
decodeEntities,
|
|
escapeSGMLChar,
|
|
escapeSGMLString,
|
|
characterEntity
|
|
) where
|
|
import Data.Char ( chr, ord )
|
|
import Text.ParserCombinators.Parsec
|
|
import Data.Maybe ( fromMaybe )
|
|
import qualified Data.Map as Map
|
|
|
|
-- | Entity reference for a character: the entry stored for it in
-- 'reverseEntityTable' when there is one, otherwise the numerical
-- reference built by 'charToNumericalEntity'.
charToEntity :: Char -> String
charToEntity c =
  fromMaybe (charToNumericalEntity c) (Map.lookup c reverseEntityTable)
|
|
|
|
-- | Decimal numerical entity reference for a character: @&#@, the
-- character's code point in decimal, then @;@.
charToNumericalEntity :: Char -> String
charToNumericalEntity ch = '&' : '#' : shows (ord ch) ";"
|
|
|
|
-- | Parse an SGML character entity.  The named, hexadecimal and decimal
-- forms are tried in that order; a failure is reported as an expected
-- \"SGML entity\".
characterEntity :: GenParser Char st Char
characterEntity =
  choice [namedEntity, hexEntity, decimalEntity] <?> "SGML entity"
|
|
|
|
-- | Parse a named SGML entity: @&@, one or more alphanumeric characters,
-- then @;@.  The reference text is looked up in 'entityTable'; a name
-- with no entry there yields @\'?\'@.
namedEntity :: GenParser Char st Char
namedEntity = try $ do
  name <- between (char '&') (char ';') (many1 alphaNum)
  return $ Map.findWithDefault '?' ("&" ++ name ++ ";") entityTable
|
|
|
|
-- | Parse an SGML hexadecimal entity: @&#@, an @x@ or @X@, one or more
-- hexadecimal digits, then @;@.  Returns the character with that code
-- point.
--
-- The digits are read as an 'Integer' and checked against the largest
-- 'Char' before 'chr' is applied, so a reference beyond the valid range
-- makes the parser fail (without consuming input, because of 'try')
-- instead of raising a runtime error from 'chr'.
hexEntity :: GenParser Char st Char
hexEntity = try $ do
  _ <- string "&#"
  _ <- oneOf "Xx"
  digits <- many1 hexDigit
  _ <- char ';'
  -- The "0x" prefix lets 'read' parse the digits as a hexadecimal literal;
  -- 'many1 hexDigit' guarantees the string is well formed.
  let code = read ('0' : 'x' : digits) :: Integer
  if code <= toInteger (ord maxBound)
    then return (chr (fromInteger code))
    else fail "character reference outside the valid code point range"
|
|
|
|
-- | Parse an SGML decimal entity: @&#@, one or more decimal digits, then
-- @;@.  Returns the character with that code point.
--
-- The digits are read as an 'Integer' and checked against the largest
-- 'Char' before 'chr' is applied, so a reference beyond the valid range
-- makes the parser fail (without consuming input, because of 'try')
-- instead of raising a runtime error from 'chr'.
decimalEntity :: GenParser Char st Char
decimalEntity = try $ do
  _ <- string "&#"
  digits <- many1 digit
  _ <- char ';'
  -- 'many1 digit' guarantees a non-empty run of digits, so 'read' is total.
  let code = read digits :: Integer
  if code <= toInteger (ord maxBound)
    then return (chr (fromInteger code))
    else fail "character reference outside the valid code point range"
|
|
|
|
-- | Escape one character as needed for SGML.  The ampersand, the angle
-- brackets and the double quote are replaced by their named entity
-- references; every other character is returned unchanged as a
-- one-character string.
escapeSGMLChar :: Char -> String
escapeSGMLChar '&' = "&amp;"
escapeSGMLChar '<' = "&lt;"
escapeSGMLChar '>' = "&gt;"
escapeSGMLChar '"' = "&quot;"
escapeSGMLChar c   = [c]
|
|
|
|
-- | True for the four characters that must be escaped: ampersand,
-- less-than, greater-than and double quote.
needsEscaping :: Char -> Bool
needsEscaping c = any (== c) ['&', '<', '>', '"']
|
|
|
|
-- | Escape a string as needed for SGML.  Every character for which
-- 'needsEscaping' holds is passed through 'escapeSGMLChar'; runs of other
-- characters are copied as they are.  Entity references already present
-- in the input are not preserved.
escapeSGMLString :: String -> String
escapeSGMLString = go
  where
    -- Copy the leading run of harmless characters, then deal with the rest.
    go [] = []
    go input = plain ++ escapeRest rest
      where
        (plain, rest) = span (not . needsEscaping) input

    -- The first character here (if any) is one that needs escaping.
    escapeRest [] = []
    escapeRest (x : xs) = escapeSGMLChar x ++ go xs
|
|
|
|
-- | Convert the entities in a string to the characters they denote.
-- The input is parsed as a sequence of items, each either a
-- 'characterEntity' or, failing that, a single character taken as is.
-- The input string is also passed to 'parse' as the source name.  A parse
-- error is raised with 'error'.
decodeEntities :: String -> String
decodeEntities str = either report id (parse decoder str str)
  where
    decoder = many (characterEntity <|> anyChar)
    report err = error $ "\nError: " ++ show err
|
|
|
|
-- | Map from entity reference text to character, built from
-- 'entityTableList'.
entityTable :: Map.Map String Char
entityTable = Map.fromList entityTableList
|
|
|
|
-- | Map from character to entity reference text: 'entityTableList' with
-- the components of each pair swapped.
reverseEntityTable :: Map.Map Char String
reverseEntityTable =
  Map.fromList [ (c, ref) | (ref, c) <- entityTableList ]
|
|
|
|
-- | Association list pairing each named entity reference, written in full
-- with its leading @&@ and trailing @;@, with the character it denotes.
-- The keys use the same form that 'namedEntity' builds for its lookup
-- (@"&" ++ name ++ ";"@).  Entries are in ascending code point order.
entityTableList :: [(String, Char)]
entityTableList = [
  ("&quot;", chr 34),
  ("&amp;", chr 38),
  ("&lt;", chr 60),
  ("&gt;", chr 62),
  ("&nbsp;", chr 160),
  ("&iexcl;", chr 161),
  ("&cent;", chr 162),
  ("&pound;", chr 163),
  ("&curren;", chr 164),
  ("&yen;", chr 165),
  ("&brvbar;", chr 166),
  ("&sect;", chr 167),
  ("&uml;", chr 168),
  ("&copy;", chr 169),
  ("&ordf;", chr 170),
  ("&laquo;", chr 171),
  ("&not;", chr 172),
  ("&shy;", chr 173),
  ("&reg;", chr 174),
  ("&macr;", chr 175),
  ("&deg;", chr 176),
  ("&plusmn;", chr 177),
  ("&sup2;", chr 178),
  ("&sup3;", chr 179),
  ("&acute;", chr 180),
  ("&micro;", chr 181),
  ("&para;", chr 182),
  ("&middot;", chr 183),
  ("&cedil;", chr 184),
  ("&sup1;", chr 185),
  ("&ordm;", chr 186),
  ("&raquo;", chr 187),
  ("&frac14;", chr 188),
  ("&frac12;", chr 189),
  ("&frac34;", chr 190),
  ("&iquest;", chr 191),
  ("&Agrave;", chr 192),
  ("&Aacute;", chr 193),
  ("&Acirc;", chr 194),
  ("&Atilde;", chr 195),
  ("&Auml;", chr 196),
  ("&Aring;", chr 197),
  ("&AElig;", chr 198),
  ("&Ccedil;", chr 199),
  ("&Egrave;", chr 200),
  ("&Eacute;", chr 201),
  ("&Ecirc;", chr 202),
  ("&Euml;", chr 203),
  ("&Igrave;", chr 204),
  ("&Iacute;", chr 205),
  ("&Icirc;", chr 206),
  ("&Iuml;", chr 207),
  ("&ETH;", chr 208),
  ("&Ntilde;", chr 209),
  ("&Ograve;", chr 210),
  ("&Oacute;", chr 211),
  ("&Ocirc;", chr 212),
  ("&Otilde;", chr 213),
  ("&Ouml;", chr 214),
  ("&times;", chr 215),
  ("&Oslash;", chr 216),
  ("&Ugrave;", chr 217),
  ("&Uacute;", chr 218),
  ("&Ucirc;", chr 219),
  ("&Uuml;", chr 220),
  ("&Yacute;", chr 221),
  ("&THORN;", chr 222),
  ("&szlig;", chr 223),
  ("&agrave;", chr 224),
  ("&aacute;", chr 225),
  ("&acirc;", chr 226),
  ("&atilde;", chr 227),
  ("&auml;", chr 228),
  ("&aring;", chr 229),
  ("&aelig;", chr 230),
  ("&ccedil;", chr 231),
  ("&egrave;", chr 232),
  ("&eacute;", chr 233),
  ("&ecirc;", chr 234),
  ("&euml;", chr 235),
  ("&igrave;", chr 236),
  ("&iacute;", chr 237),
  ("&icirc;", chr 238),
  ("&iuml;", chr 239),
  ("&eth;", chr 240),
  ("&ntilde;", chr 241),
  ("&ograve;", chr 242),
  ("&oacute;", chr 243),
  ("&ocirc;", chr 244),
  ("&otilde;", chr 245),
  ("&ouml;", chr 246),
  ("&divide;", chr 247),
  ("&oslash;", chr 248),
  ("&ugrave;", chr 249),
  ("&uacute;", chr 250),
  ("&ucirc;", chr 251),
  ("&uuml;", chr 252),
  ("&yacute;", chr 253),
  ("&thorn;", chr 254),
  ("&yuml;", chr 255),
  ("&OElig;", chr 338),
  ("&oelig;", chr 339),
  ("&Scaron;", chr 352),
  ("&scaron;", chr 353),
  ("&Yuml;", chr 376),
  ("&fnof;", chr 402),
  ("&circ;", chr 710),
  ("&tilde;", chr 732),
  ("&Alpha;", chr 913),
  ("&Beta;", chr 914),
  ("&Gamma;", chr 915),
  ("&Delta;", chr 916),
  ("&Epsilon;", chr 917),
  ("&Zeta;", chr 918),
  ("&Eta;", chr 919),
  ("&Theta;", chr 920),
  ("&Iota;", chr 921),
  ("&Kappa;", chr 922),
  ("&Lambda;", chr 923),
  ("&Mu;", chr 924),
  ("&Nu;", chr 925),
  ("&Xi;", chr 926),
  ("&Omicron;", chr 927),
  ("&Pi;", chr 928),
  ("&Rho;", chr 929),
  ("&Sigma;", chr 931),
  ("&Tau;", chr 932),
  ("&Upsilon;", chr 933),
  ("&Phi;", chr 934),
  ("&Chi;", chr 935),
  ("&Psi;", chr 936),
  ("&Omega;", chr 937),
  ("&alpha;", chr 945),
  ("&beta;", chr 946),
  ("&gamma;", chr 947),
  ("&delta;", chr 948),
  ("&epsilon;", chr 949),
  ("&zeta;", chr 950),
  ("&eta;", chr 951),
  ("&theta;", chr 952),
  ("&iota;", chr 953),
  ("&kappa;", chr 954),
  ("&lambda;", chr 955),
  ("&mu;", chr 956),
  ("&nu;", chr 957),
  ("&xi;", chr 958),
  ("&omicron;", chr 959),
  ("&pi;", chr 960),
  ("&rho;", chr 961),
  ("&sigmaf;", chr 962),
  ("&sigma;", chr 963),
  ("&tau;", chr 964),
  ("&upsilon;", chr 965),
  ("&phi;", chr 966),
  ("&chi;", chr 967),
  ("&psi;", chr 968),
  ("&omega;", chr 969),
  ("&thetasym;", chr 977),
  ("&upsih;", chr 978),
  ("&piv;", chr 982),
  ("&ensp;", chr 8194),
  ("&emsp;", chr 8195),
  ("&thinsp;", chr 8201),
  ("&zwnj;", chr 8204),
  ("&zwj;", chr 8205),
  ("&lrm;", chr 8206),
  ("&rlm;", chr 8207),
  ("&ndash;", chr 8211),
  ("&mdash;", chr 8212),
  ("&lsquo;", chr 8216),
  ("&rsquo;", chr 8217),
  ("&sbquo;", chr 8218),
  ("&ldquo;", chr 8220),
  ("&rdquo;", chr 8221),
  ("&bdquo;", chr 8222),
  ("&dagger;", chr 8224),
  ("&Dagger;", chr 8225),
  ("&bull;", chr 8226),
  ("&hellip;", chr 8230),
  ("&permil;", chr 8240),
  ("&prime;", chr 8242),
  ("&Prime;", chr 8243),
  ("&lsaquo;", chr 8249),
  ("&rsaquo;", chr 8250),
  ("&oline;", chr 8254),
  ("&frasl;", chr 8260),
  ("&euro;", chr 8364),
  ("&image;", chr 8465),
  ("&weierp;", chr 8472),
  ("&real;", chr 8476),
  ("&trade;", chr 8482),
  ("&alefsym;", chr 8501),
  ("&larr;", chr 8592),
  ("&uarr;", chr 8593),
  ("&rarr;", chr 8594),
  ("&darr;", chr 8595),
  ("&harr;", chr 8596),
  ("&crarr;", chr 8629),
  ("&lArr;", chr 8656),
  ("&uArr;", chr 8657),
  ("&rArr;", chr 8658),
  ("&dArr;", chr 8659),
  ("&hArr;", chr 8660),
  ("&forall;", chr 8704),
  ("&part;", chr 8706),
  ("&exist;", chr 8707),
  ("&empty;", chr 8709),
  ("&nabla;", chr 8711),
  ("&isin;", chr 8712),
  ("&notin;", chr 8713),
  ("&ni;", chr 8715),
  ("&prod;", chr 8719),
  ("&sum;", chr 8721),
  ("&minus;", chr 8722),
  ("&lowast;", chr 8727),
  ("&radic;", chr 8730),
  ("&prop;", chr 8733),
  ("&infin;", chr 8734),
  ("&ang;", chr 8736),
  ("&and;", chr 8743),
  ("&or;", chr 8744),
  ("&cap;", chr 8745),
  ("&cup;", chr 8746),
  ("&int;", chr 8747),
  ("&there4;", chr 8756),
  ("&sim;", chr 8764),
  ("&cong;", chr 8773),
  ("&asymp;", chr 8776),
  ("&ne;", chr 8800),
  ("&equiv;", chr 8801),
  ("&le;", chr 8804),
  ("&ge;", chr 8805),
  ("&sub;", chr 8834),
  ("&sup;", chr 8835),
  ("&nsub;", chr 8836),
  ("&sube;", chr 8838),
  ("&supe;", chr 8839),
  ("&oplus;", chr 8853),
  ("&otimes;", chr 8855),
  ("&perp;", chr 8869),
  ("&sdot;", chr 8901),
  ("&lceil;", chr 8968),
  ("&rceil;", chr 8969),
  ("&lfloor;", chr 8970),
  ("&rfloor;", chr 8971),
  ("&lang;", chr 9001),
  ("&rang;", chr 9002),
  ("&loz;", chr 9674),
  ("&spades;", chr 9824),
  ("&clubs;", chr 9827),
  ("&hearts;", chr 9829),
  ("&diams;", chr 9830)
  ]
|