2006-12-20 06:50:14 +00:00
|
|
|
{- |
   Module      : Text.Pandoc.HtmlEntities
   Copyright   : Copyright (C) 2006 John MacFarlane
   License     : GNU GPL, version 2 or above

   Maintainer  : John MacFarlane <jgm at berkeley dot edu>
   Stability   : provisional
   Portability : portable

Functions for encoding Unicode characters as HTML entity references,
and vice versa.
-}
|
2006-10-17 14:22:29 +00:00
|
|
|
module Text.Pandoc.HtmlEntities (
|
|
|
|
htmlEntityToChar,
|
|
|
|
charToHtmlEntity,
|
|
|
|
decodeEntities,
|
|
|
|
encodeEntities
|
|
|
|
) where
|
2006-12-20 06:50:14 +00:00
|
|
|
import Data.Char ( chr, ord )
|
2006-10-17 14:22:29 +00:00
|
|
|
import Text.Regex ( mkRegex, matchRegexAll )
|
|
|
|
import Maybe ( fromMaybe )
|
|
|
|
|
|
|
|
-- Regular expressions for recognizing entity references.
|
|
|
|
-- | Matches a decimal numeric character reference such as @&#169;@.  The
-- single capture group holds the digits between @&#@ and @;@.
decimalCodedEntity = mkRegex "&#([0-9]+);"
|
|
|
|
-- | Matches either a decimal numeric reference (@&#NNN;@) or a named
-- reference consisting of ASCII letters and digits (@&name;@).  It has no
-- capture groups; callers use the whole matched text.
characterEntity = mkRegex "&#[0-9]+;|&[A-Za-z0-9]+;"
|
|
|
|
|
|
|
|
-- | Replace every entity reference in the input with the unicode character
-- it denotes.  References that 'htmlEntityToChar' cannot resolve are copied
-- to the output unchanged.
decodeEntities :: String -> String
decodeEntities input =
  case matchRegexAll characterEntity input of
    -- Emit the text before the reference, then its decoding (or the raw
    -- reference when it is unknown), then carry on with the remainder.
    Just (prefix, ref, suffix, _) ->
      prefix ++ maybe ref (: []) (htmlEntityToChar ref) ++ decodeEntities suffix
    -- No reference left: the rest of the input passes through untouched.
    Nothing -> input
|
|
|
|
|
2006-12-20 06:50:14 +00:00
|
|
|
-- | Replace each character that has a known entity reference with that
-- reference; every other character is kept as it is.
encodeEntities :: String -> String
encodeEntities text =
  concat [fromMaybe [ch] (charToHtmlEntity ch) | ch <- text]
|
|
|
|
|
|
|
|
-- | If the string is exactly one entity reference, returns @Just@ the
-- character it denotes, otherwise @Nothing@.
--
-- Named references are resolved through 'htmlEntityTable'.  A decimal
-- reference (@&#NNN;@) is accepted only when it spans the whole string and
-- its number does not exceed the largest 'Char'; anything else yields
-- @Nothing@ rather than letting 'chr' raise an error.
htmlEntityToChar :: String -> Maybe Char
htmlEntityToChar entity =
  case lookup entity htmlEntityTable of
    Just ch -> Just ch
    Nothing -> case matchRegexAll decimalCodedEntity entity of
      -- Empty prefix and suffix: the reference must be the entire string,
      -- just as the table lookup above requires an exact match.
      Just ("", _, "", [digits]) -> codePointToChar digits
      _                          -> Nothing
  where
    -- 'digits' is non-empty and all-numeric (guaranteed by the regex), so
    -- reading it as an unbounded Integer can neither fail nor overflow.
    codePointToChar digits
      | code <= toInteger (ord maxBound) = Just (chr (fromInteger code))
      | otherwise                        = Nothing
      where code = read digits :: Integer
|
|
|
|
|
|
|
|
-- | If there is an entity reference corresponding to the character, returns
-- @Just@ the entity reference, otherwise @Nothing@.
--
-- When several table rows share the character, the first row wins.
charToHtmlEntity :: Char -> Maybe String
charToHtmlEntity char =
  -- Matching on the lazily built list stops at the first hit instead of
  -- measuring the length of every match, and needs no partial 'head'.
  case [entity | (entity, character) <- htmlEntityTable, character == char] of
    (entity : _) -> Just entity
    []           -> Nothing
|
2006-10-17 14:22:29 +00:00
|
|
|
|
|
|
|
-- | Association list pairing each named entity reference (written in full,
-- including the leading @&@ and trailing @;@) with the character it stands
-- for.  'htmlEntityToChar' looks entries up by reference and
-- 'charToHtmlEntity' by character, so the keys must be the reference text
-- itself, never the decoded character.
htmlEntityTable :: [(String, Char)]
htmlEntityTable = [
  ("&quot;", chr 34),
  ("&amp;", chr 38),
  ("&lt;", chr 60),
  ("&gt;", chr 62),
  ("&nbsp;", chr 160),
  ("&iexcl;", chr 161),
  ("&cent;", chr 162),
  ("&pound;", chr 163),
  ("&curren;", chr 164),
  ("&yen;", chr 165),
  ("&brvbar;", chr 166),
  ("&sect;", chr 167),
  ("&uml;", chr 168),
  ("&copy;", chr 169),
  ("&ordf;", chr 170),
  ("&laquo;", chr 171),
  ("&not;", chr 172),
  ("&shy;", chr 173),
  ("&reg;", chr 174),
  ("&macr;", chr 175),
  ("&deg;", chr 176),
  ("&plusmn;", chr 177),
  ("&sup2;", chr 178),
  ("&sup3;", chr 179),
  ("&acute;", chr 180),
  ("&micro;", chr 181),
  ("&para;", chr 182),
  ("&middot;", chr 183),
  ("&cedil;", chr 184),
  ("&sup1;", chr 185),
  ("&ordm;", chr 186),
  ("&raquo;", chr 187),
  ("&frac14;", chr 188),
  ("&frac12;", chr 189),
  ("&frac34;", chr 190),
  ("&iquest;", chr 191),
  ("&Agrave;", chr 192),
  ("&Aacute;", chr 193),
  ("&Acirc;", chr 194),
  ("&Atilde;", chr 195),
  ("&Auml;", chr 196),
  ("&Aring;", chr 197),
  ("&AElig;", chr 198),
  ("&Ccedil;", chr 199),
  ("&Egrave;", chr 200),
  ("&Eacute;", chr 201),
  ("&Ecirc;", chr 202),
  ("&Euml;", chr 203),
  ("&Igrave;", chr 204),
  ("&Iacute;", chr 205),
  ("&Icirc;", chr 206),
  ("&Iuml;", chr 207),
  ("&ETH;", chr 208),
  ("&Ntilde;", chr 209),
  ("&Ograve;", chr 210),
  ("&Oacute;", chr 211),
  ("&Ocirc;", chr 212),
  ("&Otilde;", chr 213),
  ("&Ouml;", chr 214),
  ("&times;", chr 215),
  ("&Oslash;", chr 216),
  ("&Ugrave;", chr 217),
  ("&Uacute;", chr 218),
  ("&Ucirc;", chr 219),
  ("&Uuml;", chr 220),
  ("&Yacute;", chr 221),
  ("&THORN;", chr 222),
  ("&szlig;", chr 223),
  ("&agrave;", chr 224),
  ("&aacute;", chr 225),
  ("&acirc;", chr 226),
  ("&atilde;", chr 227),
  ("&auml;", chr 228),
  ("&aring;", chr 229),
  ("&aelig;", chr 230),
  ("&ccedil;", chr 231),
  ("&egrave;", chr 232),
  ("&eacute;", chr 233),
  ("&ecirc;", chr 234),
  ("&euml;", chr 235),
  ("&igrave;", chr 236),
  ("&iacute;", chr 237),
  ("&icirc;", chr 238),
  ("&iuml;", chr 239),
  ("&eth;", chr 240),
  ("&ntilde;", chr 241),
  ("&ograve;", chr 242),
  ("&oacute;", chr 243),
  ("&ocirc;", chr 244),
  ("&otilde;", chr 245),
  ("&ouml;", chr 246),
  ("&divide;", chr 247),
  ("&oslash;", chr 248),
  ("&ugrave;", chr 249),
  ("&uacute;", chr 250),
  ("&ucirc;", chr 251),
  ("&uuml;", chr 252),
  ("&yacute;", chr 253),
  ("&thorn;", chr 254),
  ("&yuml;", chr 255),
  ("&OElig;", chr 338),
  ("&oelig;", chr 339),
  ("&Scaron;", chr 352),
  ("&scaron;", chr 353),
  ("&Yuml;", chr 376),
  ("&fnof;", chr 402),
  ("&circ;", chr 710),
  ("&tilde;", chr 732),
  ("&Alpha;", chr 913),
  ("&Beta;", chr 914),
  ("&Gamma;", chr 915),
  ("&Delta;", chr 916),
  ("&Epsilon;", chr 917),
  ("&Zeta;", chr 918),
  ("&Eta;", chr 919),
  ("&Theta;", chr 920),
  ("&Iota;", chr 921),
  ("&Kappa;", chr 922),
  ("&Lambda;", chr 923),
  ("&Mu;", chr 924),
  ("&Nu;", chr 925),
  ("&Xi;", chr 926),
  ("&Omicron;", chr 927),
  ("&Pi;", chr 928),
  ("&Rho;", chr 929),
  ("&Sigma;", chr 931),
  ("&Tau;", chr 932),
  ("&Upsilon;", chr 933),
  ("&Phi;", chr 934),
  ("&Chi;", chr 935),
  ("&Psi;", chr 936),
  ("&Omega;", chr 937),
  ("&alpha;", chr 945),
  ("&beta;", chr 946),
  ("&gamma;", chr 947),
  ("&delta;", chr 948),
  ("&epsilon;", chr 949),
  ("&zeta;", chr 950),
  ("&eta;", chr 951),
  ("&theta;", chr 952),
  ("&iota;", chr 953),
  ("&kappa;", chr 954),
  ("&lambda;", chr 955),
  ("&mu;", chr 956),
  ("&nu;", chr 957),
  ("&xi;", chr 958),
  ("&omicron;", chr 959),
  ("&pi;", chr 960),
  ("&rho;", chr 961),
  ("&sigmaf;", chr 962),
  ("&sigma;", chr 963),
  ("&tau;", chr 964),
  ("&upsilon;", chr 965),
  ("&phi;", chr 966),
  ("&chi;", chr 967),
  ("&psi;", chr 968),
  ("&omega;", chr 969),
  ("&thetasym;", chr 977),
  ("&upsih;", chr 978),
  ("&piv;", chr 982),
  ("&ensp;", chr 8194),
  ("&emsp;", chr 8195),
  ("&thinsp;", chr 8201),
  ("&zwnj;", chr 8204),
  ("&zwj;", chr 8205),
  ("&lrm;", chr 8206),
  ("&rlm;", chr 8207),
  ("&ndash;", chr 8211),
  ("&mdash;", chr 8212),
  ("&lsquo;", chr 8216),
  ("&rsquo;", chr 8217),
  ("&sbquo;", chr 8218),
  ("&ldquo;", chr 8220),
  ("&rdquo;", chr 8221),
  ("&bdquo;", chr 8222),
  ("&dagger;", chr 8224),
  ("&Dagger;", chr 8225),
  ("&bull;", chr 8226),
  ("&hellip;", chr 8230),
  ("&permil;", chr 8240),
  ("&prime;", chr 8242),
  ("&Prime;", chr 8243),
  ("&lsaquo;", chr 8249),
  ("&rsaquo;", chr 8250),
  ("&oline;", chr 8254),
  ("&frasl;", chr 8260),
  ("&euro;", chr 8364),
  ("&image;", chr 8465),
  ("&weierp;", chr 8472),
  ("&real;", chr 8476),
  ("&trade;", chr 8482),
  ("&alefsym;", chr 8501),
  ("&larr;", chr 8592),
  ("&uarr;", chr 8593),
  ("&rarr;", chr 8594),
  ("&darr;", chr 8595),
  ("&harr;", chr 8596),
  ("&crarr;", chr 8629),
  ("&lArr;", chr 8656),
  ("&uArr;", chr 8657),
  ("&rArr;", chr 8658),
  ("&dArr;", chr 8659),
  ("&hArr;", chr 8660),
  ("&forall;", chr 8704),
  ("&part;", chr 8706),
  ("&exist;", chr 8707),
  ("&empty;", chr 8709),
  ("&nabla;", chr 8711),
  ("&isin;", chr 8712),
  ("&notin;", chr 8713),
  ("&ni;", chr 8715),
  ("&prod;", chr 8719),
  ("&sum;", chr 8721),
  ("&minus;", chr 8722),
  ("&lowast;", chr 8727),
  ("&radic;", chr 8730),
  ("&prop;", chr 8733),
  ("&infin;", chr 8734),
  ("&ang;", chr 8736),
  ("&and;", chr 8743),
  ("&or;", chr 8744),
  ("&cap;", chr 8745),
  ("&cup;", chr 8746),
  ("&int;", chr 8747),
  ("&there4;", chr 8756),
  ("&sim;", chr 8764),
  ("&cong;", chr 8773),
  ("&asymp;", chr 8776),
  ("&ne;", chr 8800),
  ("&equiv;", chr 8801),
  ("&le;", chr 8804),
  ("&ge;", chr 8805),
  ("&sub;", chr 8834),
  ("&sup;", chr 8835),
  ("&nsub;", chr 8836),
  ("&sube;", chr 8838),
  ("&supe;", chr 8839),
  ("&oplus;", chr 8853),
  ("&otimes;", chr 8855),
  ("&perp;", chr 8869),
  ("&sdot;", chr 8901),
  ("&lceil;", chr 8968),
  ("&rceil;", chr 8969),
  ("&lfloor;", chr 8970),
  ("&rfloor;", chr 8971),
  ("&lang;", chr 9001),
  ("&rang;", chr 9002),
  ("&loz;", chr 9674),
  ("&spades;", chr 9824),
  ("&clubs;", chr 9827),
  ("&hearts;", chr 9829),
  ("&diams;", chr 9830)
  ]
|