{-
Copyright (C) 2006 John MacFarlane

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-}

{- |
   Module      : Text.Pandoc.HtmlEntities
   Copyright   : Copyright (C) 2006 John MacFarlane
   License     : GNU GPL, version 2 or above

   Maintainer  : John MacFarlane
   Stability   : alpha
   Portability : portable

Functions for encoding unicode characters as HTML entity references,
and vice versa.
-}
module Text.Pandoc.HtmlEntities (
                     htmlEntityToChar,
                     charToHtmlEntity,
                     decodeEntities,
                     encodeEntities
                    ) where
import Data.Char ( chr, ord )
import Text.Regex ( mkRegex, matchRegexAll )
import Maybe ( fromMaybe )

-- Regular expressions used to recognise entity references.

-- Matches a decimal numeric reference and captures its digits.
decimalCodedEntity = mkRegex "&#([0-9]+);"

-- Matches either a decimal numeric reference or a named reference.
characterEntity = mkRegex "&#[0-9]+;|&[A-Za-z0-9]+;"

-- | Decode every entity reference in the string that 'htmlEntityToChar'
-- can resolve.  References it cannot resolve are copied through verbatim.
decodeEntities :: String -> String
decodeEntities str =
  maybe str decodeAround (matchRegexAll characterEntity str)
  where
    -- Keep the text before the match, substitute the match itself, and
    -- carry on with whatever follows it.
    decodeAround (prefix, reference, suffix, _) =
      prefix ++ decodeReference reference ++ decodeEntities suffix
    -- A resolvable reference becomes a one-character string.
    decodeReference reference =
      maybe reference (: []) (htmlEntityToChar reference)

-- | Returns a string with characters replaced with entity references where
-- possible.
encodeEntities :: String -> String
-- Characters whose code point is below 127 are copied through unchanged;
-- every other character is replaced by the reference that
-- 'charToHtmlEntity' produces for it.
encodeEntities = concatMap encodeChar
  where
    encodeChar c
      | ord c < 127 = [c]
      | otherwise   = charToHtmlEntity c

-- | If the string is a valid entity reference, returns @Just@ the character,
-- otherwise @Nothing@.
--
-- Named references are looked up in 'htmlEntityTable'.  A decimal reference
-- (@&#NNN;@) must span the whole string and denote a code point that 'chr'
-- accepts; anything else yields @Nothing@ instead of a runtime error.
htmlEntityToChar :: String -> Maybe Char
htmlEntityToChar entity =
  case lookup entity htmlEntityTable of
    Just ch -> Just ch
    Nothing -> decimalEntityToChar entity

-- | Parse a decimal numeric reference of the exact form @&#NNN;@.
decimalEntityToChar :: String -> Maybe Char
decimalEntityToChar ('&':'#':rest) =
  case span isDecimalDigit rest of
    -- 'read' is safe here: digits is non-empty and contains only 0-9.
    (digits@(_:_), ";") -> codePointToChar (read digits)
    _                   -> Nothing
  where
    isDecimalDigit c = c >= '0' && c <= '9'
decimalEntityToChar _ = Nothing

-- | Convert a non-negative code point to a character, rejecting values
-- beyond the last code point (for which 'chr' would raise an error).
-- 'Integer' is used so that very long digit strings cannot wrap around.
codePointToChar :: Integer -> Maybe Char
codePointToChar n
  | n <= toInteger (ord maxBound) = Just (chr (fromInteger n))
  | otherwise                     = Nothing

-- | Returns a string containing an entity reference for the character.
--
-- The first named reference listed for the character in 'htmlEntityTable'
-- is used; characters without a named reference get a decimal reference.
charToHtmlEntity :: Char -> String
charToHtmlEntity char =
  case [entity | (entity, character) <- htmlEntityTable, character == char] of
    (entity:_) -> entity
    []         -> "&#" ++ show (ord char) ++ ";"

-- | Named entity references paired with the characters they denote.
-- Each key is the complete reference, including the leading @&@ and the
-- trailing @;@, so that it can be compared directly with matched text.
htmlEntityTable :: [(String, Char)]
htmlEntityTable =
  [ ("&quot;", chr 34)
  , ("&amp;", chr 38)
  , ("&lt;", chr 60)
  , ("&gt;", chr 62)
  , ("&nbsp;", chr 160)
  , ("&iexcl;", chr 161)
  , ("&cent;", chr 162)
  , ("&pound;", chr 163)
  , ("&curren;", chr 164)
  , ("&yen;", chr 165)
  , ("&brvbar;", chr 166)
  , ("&sect;", chr 167)
  , ("&uml;", chr 168)
  , ("&copy;", chr 169)
  , ("&ordf;", chr 170)
  , ("&laquo;", chr 171)
  , ("&not;", chr 172)
  , ("&shy;", chr 173)
  , ("&reg;", chr 174)
  , ("&macr;", chr 175)
  , ("&deg;", chr 176)
  , ("&plusmn;", chr 177)
  , ("&sup2;", chr 178)
  , ("&sup3;", chr 179)
  , ("&acute;", chr 180)
  , ("&micro;", chr 181)
  , ("&para;", chr 182)
  , ("&middot;", chr 183)
  , ("&cedil;", chr 184)
  , ("&sup1;", chr 185)
  , ("&ordm;", chr 186)
  , ("&raquo;", chr 187)
  , ("&frac14;", chr 188)
  , ("&frac12;", chr 189)
  , ("&frac34;", chr 190)
  , ("&iquest;", chr 191)
  , ("&Agrave;", chr 192)
  , ("&Aacute;", chr 193)
  , ("&Acirc;", chr 194)
  , ("&Atilde;", chr 195)
  , ("&Auml;", chr 196)
  , ("&Aring;", chr 197)
  , ("&AElig;", chr 198)
  , ("&Ccedil;", chr 199)
  , ("&Egrave;", chr 200)
  , ("&Eacute;", chr 201)
  , ("&Ecirc;", chr 202)
  , ("&Euml;", chr 203)
  , ("&Igrave;", chr 204)
  , ("&Iacute;", chr 205)
  , ("&Icirc;", chr 206)
  , ("&Iuml;", chr 207)
  , ("&ETH;", chr 208)
  , ("&Ntilde;", chr 209)
  , ("&Ograve;", chr 210)
  , ("&Oacute;", chr 211)
  , ("&Ocirc;", chr 212)
  , ("&Otilde;", chr 213)
  , ("&Ouml;", chr 214)
  , ("&times;", chr 215)
  , ("&Oslash;", chr 216)
  , ("&Ugrave;", chr 217)
  , ("&Uacute;", chr 218)
  , ("&Ucirc;", chr 219)
  , ("&Uuml;", chr 220)
  , ("&Yacute;", chr 221)
  , ("&THORN;", chr 222)
  , ("&szlig;", chr 223)
  , ("&agrave;", chr 224)
  , ("&aacute;", chr 225)
  , ("&acirc;", chr 226)
  , ("&atilde;", chr 227)
  , ("&auml;", chr 228)
  , ("&aring;", chr 229)
  , ("&aelig;", chr 230)
  , ("&ccedil;", chr 231)
  , ("&egrave;", chr 232)
  , ("&eacute;", chr 233)
  , ("&ecirc;", chr 234)
  , ("&euml;", chr 235)
  , ("&igrave;", chr 236)
  , ("&iacute;", chr 237)
  , ("&icirc;", chr 238)
  , ("&iuml;", chr 239)
  , ("&eth;", chr 240)
  , ("&ntilde;", chr 241)
  , ("&ograve;", chr 242)
  , ("&oacute;", chr 243)
  , ("&ocirc;", chr 244)
  , ("&otilde;", chr 245)
  , ("&ouml;", chr 246)
  , ("&divide;", chr 247)
  , ("&oslash;", chr 248)
  , ("&ugrave;", chr 249)
  , ("&uacute;", chr 250)
  , ("&ucirc;", chr 251)
  , ("&uuml;", chr 252)
  , ("&yacute;", chr 253)
  , ("&thorn;", chr 254)
  , ("&yuml;", chr 255)
  , ("&OElig;", chr 338)
  , ("&oelig;", chr 339)
  , ("&Scaron;", chr 352)
  , ("&scaron;", chr 353)
  , ("&Yuml;", chr 376)
  , ("&fnof;", chr 402)
  , ("&circ;", chr 710)
  , ("&tilde;", chr 732)
  , ("&Alpha;", chr 913)
  , ("&Beta;", chr 914)
  , ("&Gamma;", chr 915)
  , ("&Delta;", chr 916)
  , ("&Epsilon;", chr 917)
  , ("&Zeta;", chr 918)
  , ("&Eta;", chr 919)
  , ("&Theta;", chr 920)
  , ("&Iota;", chr 921)
  , ("&Kappa;", chr 922)
  , ("&Lambda;", chr 923)
  , ("&Mu;", chr 924)
  , ("&Nu;", chr 925)
  , ("&Xi;", chr 926)
  , ("&Omicron;", chr 927)
  , ("&Pi;", chr 928)
  , ("&Rho;", chr 929)
  , ("&Sigma;", chr 931)
  , ("&Tau;", chr 932)
  , ("&Upsilon;", chr 933)
  , ("&Phi;", chr 934)
  , ("&Chi;", chr 935)
  , ("&Psi;", chr 936)
  , ("&Omega;", chr 937)
  , ("&alpha;", chr 945)
  , ("&beta;", chr 946)
  , ("&gamma;", chr 947)
  , ("&delta;", chr 948)
  , ("&epsilon;", chr 949)
  , ("&zeta;", chr 950)
  , ("&eta;", chr 951)
  , ("&theta;", chr 952)
  , ("&iota;", chr 953)
  , ("&kappa;", chr 954)
  , ("&lambda;", chr 955)
  , ("&mu;", chr 956)
  , ("&nu;", chr 957)
  , ("&xi;", chr 958)
  , ("&omicron;", chr 959)
  , ("&pi;", chr 960)
  , ("&rho;", chr 961)
  , ("&sigmaf;", chr 962)
  , ("&sigma;", chr 963)
  , ("&tau;", chr 964)
  , ("&upsilon;", chr 965)
  , ("&phi;", chr 966)
  , ("&chi;", chr 967)
  , ("&psi;", chr 968)
  , ("&omega;", chr 969)
  , ("&thetasym;", chr 977)
  , ("&upsih;", chr 978)
  , ("&piv;", chr 982)
  , ("&ensp;", chr 8194)
  , ("&emsp;", chr 8195)
  , ("&thinsp;", chr 8201)
  , ("&zwnj;", chr 8204)
  , ("&zwj;", chr 8205)
  , ("&lrm;", chr 8206)
  , ("&rlm;", chr 8207)
  , ("&ndash;", chr 8211)
  , ("&mdash;", chr 8212)
  , ("&lsquo;", chr 8216)
  , ("&rsquo;", chr 8217)
  , ("&sbquo;", chr 8218)
  , ("&ldquo;", chr 8220)
  , ("&rdquo;", chr 8221)
  , ("&bdquo;", chr 8222)
  , ("&dagger;", chr 8224)
  , ("&Dagger;", chr 8225)
  , ("&bull;", chr 8226)
  , ("&hellip;", chr 8230)
  , ("&permil;", chr 8240)
  , ("&prime;", chr 8242)
  , ("&Prime;", chr 8243)
  , ("&lsaquo;", chr 8249)
  , ("&rsaquo;", chr 8250)
  , ("&oline;", chr 8254)
  , ("&frasl;", chr 8260)
  , ("&euro;", chr 8364)
  , ("&image;", chr 8465)
  , ("&weierp;", chr 8472)
  , ("&real;", chr 8476)
  , ("&trade;", chr 8482)
  , ("&alefsym;", chr 8501)
  , ("&larr;", chr 8592)
  , ("&uarr;", chr 8593)
  , ("&rarr;", chr 8594)
  , ("&darr;", chr 8595)
  , ("&harr;", chr 8596)
  , ("&crarr;", chr 8629)
  , ("&lArr;", chr 8656)
  , ("&uArr;", chr 8657)
  , ("&rArr;", chr 8658)
  , ("&dArr;", chr 8659)
  , ("&hArr;", chr 8660)
  , ("&forall;", chr 8704)
  , ("&part;", chr 8706)
  , ("&exist;", chr 8707)
  , ("&empty;", chr 8709)
  , ("&nabla;", chr 8711)
  , ("&isin;", chr 8712)
  , ("&notin;", chr 8713)
  , ("&ni;", chr 8715)
  , ("&prod;", chr 8719)
  , ("&sum;", chr 8721)
  , ("&minus;", chr 8722)
  , ("&lowast;", chr 8727)
  , ("&radic;", chr 8730)
  , ("&prop;", chr 8733)
  , ("&infin;", chr 8734)
  , ("&ang;", chr 8736)
  , ("&and;", chr 8743)
  , ("&or;", chr 8744)
  , ("&cap;", chr 8745)
  , ("&cup;", chr 8746)
  , ("&int;", chr 8747)
  , ("&there4;", chr 8756)
  , ("&sim;", chr 8764)
  , ("&cong;", chr 8773)
  , ("&asymp;", chr 8776)
  , ("&ne;", chr 8800)
  , ("&equiv;", chr 8801)
  , ("&le;", chr 8804)
  , ("&ge;", chr 8805)
  , ("&sub;", chr 8834)
  , ("&sup;", chr 8835)
  , ("&nsub;", chr 8836)
  , ("&sube;", chr 8838)
  , ("&supe;", chr 8839)
  , ("&oplus;", chr 8853)
  , ("&otimes;", chr 8855)
  , ("&perp;", chr 8869)
  , ("&sdot;", chr 8901)
  , ("&lceil;", chr 8968)
  , ("&rceil;", chr 8969)
  , ("&lfloor;", chr 8970)
  , ("&rfloor;", chr 8971)
  , ("&lang;", chr 9001)
  , ("&rang;", chr 9002)
  , ("&loz;", chr 9674)
  , ("&spades;", chr 9824)
  , ("&clubs;", chr 9827)
  , ("&hearts;", chr 9829)
  , ("&diams;", chr 9830)
  ]