Rewrote functions in Text/Pandoc/Shared so as not to use Text.Regex,
which does not support unicode: - escapePreservingRegex removed - stringToSGML rewritten using Parsec parser - new parsers for SGML character entities - escapeSGML rewritten using specialCharToEntity - new function specialCharToEntity git-svn-id: https://pandoc.googlecode.com/svn/trunk@514 788f1e2b-df1e-0410-8736-df70ead52e1b
This commit is contained in:
parent
11456ba5ce
commit
f2de08864e
1 changed files with 51 additions and 28 deletions
|
@ -36,7 +36,6 @@ module Text.Pandoc.Shared (
|
|||
joinWithSep,
|
||||
tabsToSpaces,
|
||||
backslashEscape,
|
||||
escapePreservingRegex,
|
||||
endsWith,
|
||||
stripTrailingNewlines,
|
||||
removeLeadingTrailingSpace,
|
||||
|
@ -74,12 +73,11 @@ module Text.Pandoc.Shared (
|
|||
inTagsIndented
|
||||
) where
|
||||
import Text.Pandoc.Definition
|
||||
import Text.ParserCombinators.Parsec
|
||||
import Text.Pandoc.Entities ( decodeEntities, encodeEntities, characterEntity )
|
||||
import Text.Regex ( matchRegexAll, mkRegex, subRegex, Regex )
|
||||
import Text.PrettyPrint.HughesPJ as PP ( text, char, (<>), ($$), nest, Doc,
|
||||
isEmpty )
|
||||
import Data.Char ( toLower )
|
||||
import Text.ParserCombinators.Parsec as Parsec
|
||||
import Text.Pandoc.Entities ( decodeEntities, charToEntity )
|
||||
import Text.PrettyPrint.HughesPJ as PP ( text, char, (<>),
|
||||
($$), nest, Doc, isEmpty )
|
||||
import Data.Char ( toLower, ord )
|
||||
import Data.List ( find, groupBy, isPrefixOf )
|
||||
|
||||
-- | Parse a string with a given parser and state.
|
||||
|
@ -228,17 +226,6 @@ backslashEscape special (x:xs) = if x `elem` special
|
|||
then '\\':x:(backslashEscape special xs)
|
||||
else x:(backslashEscape special xs)
|
||||
|
||||
-- | Escape string by applying a function, but don't touch anything that matches regex.
|
||||
escapePreservingRegex :: (String -> String) -- ^ Escaping function
|
||||
-> Regex -- ^ Regular expression
|
||||
-> String -- ^ String to be escaped
|
||||
-> String
|
||||
escapePreservingRegex escapeFunction regex str =
|
||||
case (matchRegexAll regex str) of
|
||||
Nothing -> escapeFunction str
|
||||
Just (before, matched, after, _) -> (escapeFunction before) ++
|
||||
matched ++ (escapePreservingRegex escapeFunction regex after)
|
||||
|
||||
-- | Returns @True@ if string ends with given character.
|
||||
endsWith :: Char -> [Char] -> Bool
|
||||
endsWith char [] = False
|
||||
|
@ -536,20 +523,56 @@ replaceRefLinksInline keytable (Quoted t lst) =
|
|||
Quoted t (map (replaceRefLinksInline keytable) lst)
|
||||
replaceRefLinksInline keytable other = other
|
||||
|
||||
-- | Parse SGML character entity.
|
||||
sgmlCharacterEntity :: GenParser Char st [Char]
|
||||
sgmlCharacterEntity = sgmlNamedEntity <|> sgmlDecimalEntity <|>
|
||||
sgmlHexEntity <?> "SGML entity"
|
||||
|
||||
-- | Parse SGML character entity.
|
||||
sgmlNamedEntity :: GenParser Char st [Char]
|
||||
sgmlNamedEntity = try $ do
|
||||
st <- Parsec.char '&'
|
||||
body <- many1 alphaNum
|
||||
end <- Parsec.char ';'
|
||||
return $ (st:body) ++ [end]
|
||||
|
||||
-- | Parse SGML decimal entity.
|
||||
sgmlDecimalEntity :: GenParser Char st [Char]
|
||||
sgmlDecimalEntity = try $ do
|
||||
st <- string "&#"
|
||||
body <- many1 (oneOf "0123456789")
|
||||
end <- Parsec.char ';'
|
||||
return $ st ++ body ++ [end]
|
||||
|
||||
-- | Parse SGML hexadecimal entity.
|
||||
sgmlHexEntity :: GenParser Char st [Char]
|
||||
sgmlHexEntity = try $ do
|
||||
st <- string "&#"
|
||||
hex <- oneOf "Xx"
|
||||
body <- many1 (oneOf "0123456789ABCDEFabcdef")
|
||||
end <- Parsec.char ';'
|
||||
return $ st ++ (hex:body) ++ [end]
|
||||
|
||||
-- | Escape special character to SGML entity.
|
||||
specialCharToEntity :: Char -> [Char]
|
||||
specialCharToEntity c = if (c `elem` "&<>\"") || (ord c > 127)
|
||||
then charToEntity c
|
||||
else [c]
|
||||
|
||||
-- | Escape string, preserving character entities.
|
||||
stringToSGML :: String -> String
|
||||
stringToSGML =
|
||||
encodeEntities . (escapePreservingRegex escapeSGML characterEntity)
|
||||
stringToSGML str =
|
||||
let segment = sgmlCharacterEntity <|>
|
||||
(do{c <- anyChar;
|
||||
return $ specialCharToEntity c})
|
||||
sgmlString = (do{segs <- many segment; return $ concat segs}) in
|
||||
case parse sgmlString str str of
|
||||
Left err -> error $ "\nError:\n" ++ show err
|
||||
Right result -> result
|
||||
|
||||
-- | Escape string as needed for HTML. Entity references are not preserved.
|
||||
-- | Escape string as needed for SGML. Entity references are not preserved.
|
||||
escapeSGML :: String -> String
|
||||
escapeSGML [] = []
|
||||
escapeSGML (x:xs) = case x of
|
||||
'&' -> "&" ++ escapeSGML xs
|
||||
'<' -> "<" ++ escapeSGML xs
|
||||
'>' -> ">" ++ escapeSGML xs
|
||||
'"' -> """ ++ escapeSGML xs
|
||||
_ -> x:(escapeSGML xs)
|
||||
escapeSGML = concatMap specialCharToEntity
|
||||
|
||||
-- | Return a text object with a string of formatted SGML attributes.
|
||||
attributeList :: [(String, String)] -> Doc
|
||||
|
|
Loading…
Reference in a new issue