Fixed treatment of unicode characters in URIs.

* Added stringToURI to Shared.  This is used in the HTML
  writer for all URIs.  It properly URI-encodes high
  characters (> 127), leaving everything else (including
  symbols and spaces) the same.

* Modified unsanitaryURI to allow UTF8 characters in a URI.
  (First, we convert the URI to URI-encoded octets, then we
  pass through parseURIReference.)
  This resolves gitit Issue #99. Previously
  '[abc](http://gitit.net/测试)' would not be rendered as
  a link when --sanitize was selected.
This commit is contained in:
John MacFarlane 2010-03-22 19:29:37 -07:00
parent 7689cacb5d
commit 71eac37ac5
3 changed files with 38 additions and 14 deletions

View file

@ -182,7 +182,7 @@ unsanitaryURI u =
"ldaps:", "magnet:", "mms:", "msnim:", "notes:", "rsync:", "ldaps:", "magnet:", "mms:", "msnim:", "notes:", "rsync:",
"secondlife:", "skype:", "ssh:", "sftp:", "smb:", "sms:", "secondlife:", "skype:", "ssh:", "sftp:", "smb:", "sms:",
"snews:", "webcal:", "ymsgr:"] "snews:", "webcal:", "ymsgr:"]
in case parseURIReference u of in case parseURIReference (stringToURI u) of
Just p -> (map toLower $ uriScheme p) `notElem` safeURISchemes Just p -> (map toLower $ uriScheme p) `notElem` safeURISchemes
Nothing -> True Nothing -> True

View file

@ -43,6 +43,7 @@ module Text.Pandoc.Shared (
stripFirstAndLast, stripFirstAndLast,
camelCaseToHyphenated, camelCaseToHyphenated,
toRomanNumeral, toRomanNumeral,
stringToURI,
wrapped, wrapped,
wrapIfNeeded, wrapIfNeeded,
wrappedTeX, wrappedTeX,
@ -114,7 +115,7 @@ import Text.ParserCombinators.Parsec
import Text.PrettyPrint.HughesPJ ( Doc, fsep, ($$), (<>), empty, isEmpty, text, nest ) import Text.PrettyPrint.HughesPJ ( Doc, fsep, ($$), (<>), empty, isEmpty, text, nest )
import qualified Text.PrettyPrint.HughesPJ as PP import qualified Text.PrettyPrint.HughesPJ as PP
import Text.Pandoc.CharacterReferences ( characterReference ) import Text.Pandoc.CharacterReferences ( characterReference )
import Data.Char ( toLower, toUpper, ord, isLower, isUpper, isAlpha, import Data.Char ( toLower, toUpper, ord, chr, isLower, isUpper, isAlpha,
isPunctuation ) isPunctuation )
import Data.List ( find, isPrefixOf, intercalate ) import Data.List ( find, isPrefixOf, intercalate )
import Network.URI ( parseURI, URI (..), isAllowedInURI ) import Network.URI ( parseURI, URI (..), isAllowedInURI )
@ -130,7 +131,12 @@ import System.IO.UTF8
import Data.Generics import Data.Generics
import qualified Control.Monad.State as S import qualified Control.Monad.State as S
import Control.Monad (join) import Control.Monad (join)
import Data.ByteString (unpack)
import Data.Word (Word8)
import Data.ByteString.UTF8 (fromString)
import Text.Printf (printf)
import Paths_pandoc (getDataFileName) import Paths_pandoc (getDataFileName)
-- --
-- List processing -- List processing
-- --
@ -228,6 +234,16 @@ toRomanNumeral x =
_ | x >= 1 -> "I" ++ toRomanNumeral (x - 1) _ | x >= 1 -> "I" ++ toRomanNumeral (x - 1)
_ -> "" _ -> ""
-- | Escape unicode characters in a URI. This means converting
-- them to UTF-8, then URI-encoding the octets. We leave everything
-- else the same, assuming that the user has already escaped
-- special characters like & and %.
stringToURI :: String -> String
stringToURI = concatMap encodeOctet . unpack . fromString
where encodeOctet :: Word8 -> String
encodeOctet x | x > 127 = printf "%%%2x" x
encodeOctet x = [chr (fromIntegral x)]
-- | Wrap inlines to line length. -- | Wrap inlines to line length.
wrapped :: Monad m => ([Inline] -> m Doc) -> [Inline] -> m Doc wrapped :: Monad m => ([Inline] -> m Doc) -> [Inline] -> m Doc
wrapped listWriter sect = (mapM listWriter $ splitBy Space sect) >>= wrapped listWriter sect = (mapM listWriter $ splitBy Space sect) >>=

View file

@ -67,6 +67,14 @@ renderFragment opts = if writerWrapText opts
stringToHtml :: String -> Html stringToHtml :: String -> Html
stringToHtml = primHtml . escapeStringForXML stringToHtml = primHtml . escapeStringForXML
-- Note: href and src, unmodified, incorrectly escape high
-- characters in URIs using entities. So we use these replacements:
href' :: String -> HtmlAttr
href' = href . stringToURI
src' :: String -> HtmlAttr
src' = src . stringToURI
-- | Convert Pandoc document to Html string. -- | Convert Pandoc document to Html string.
writeHtmlString :: WriterOptions -> Pandoc -> String writeHtmlString :: WriterOptions -> Pandoc -> String
writeHtmlString opts d = writeHtmlString opts d =
@ -112,13 +120,13 @@ pandocToHtml opts (Pandoc (Meta title' authors' date') blocks) = do
then case writerHTMLMathMethod opts of then case writerHTMLMathMethod opts of
LaTeXMathML (Just url) -> LaTeXMathML (Just url) ->
script ! script !
[src url, thetype "text/javascript"] $ noHtml [src' url, thetype "text/javascript"] $ noHtml
MathML (Just url) -> MathML (Just url) ->
script ! script !
[src url, thetype "text/javascript"] $ noHtml [src' url, thetype "text/javascript"] $ noHtml
JsMath (Just url) -> JsMath (Just url) ->
script ! script !
[src url, thetype "text/javascript"] $ noHtml [src' url, thetype "text/javascript"] $ noHtml
_ -> case lookup "mathml-script" (writerVariables opts) of _ -> case lookup "mathml-script" (writerVariables opts) of
Just s -> Just s ->
script ! [thetype "text/javascript"] << script ! [thetype "text/javascript"] <<
@ -188,7 +196,7 @@ elementToListItem opts (Sec _ num id' headerText subsecs) = do
let subList = if null subHeads let subList = if null subHeads
then noHtml then noHtml
else unordList subHeads else unordList subHeads
return $ Just $ (anchor ! [href ("#" ++ writerIdentifierPrefix opts ++ id')] $ txt) +++ subList return $ Just $ (anchor ! [href' ("#" ++ writerIdentifierPrefix opts ++ id')] $ txt) +++ subList
-- | Convert an Element to Html. -- | Convert an Element to Html.
elementToHtml :: WriterOptions -> Element -> State WriterState Html elementToHtml :: WriterOptions -> Element -> State WriterState Html
@ -222,7 +230,7 @@ parseMailto _ = Nothing
-- | Obfuscate a "mailto:" link. -- | Obfuscate a "mailto:" link.
obfuscateLink :: WriterOptions -> String -> String -> Html obfuscateLink :: WriterOptions -> String -> String -> Html
obfuscateLink opts txt s | writerEmailObfuscation opts == NoObfuscation = obfuscateLink opts txt s | writerEmailObfuscation opts == NoObfuscation =
anchor ! [href s] << txt anchor ! [href' s] << txt
obfuscateLink opts txt s = obfuscateLink opts txt s =
let meth = writerEmailObfuscation opts let meth = writerEmailObfuscation opts
s' = map toLower s s' = map toLower s
@ -249,7 +257,7 @@ obfuscateLink opts txt s =
linkText ++ "+'<\\/'+'a'+'>');\n// -->\n")) +++ linkText ++ "+'<\\/'+'a'+'>');\n// -->\n")) +++
noscript (primHtml $ obfuscateString altText) noscript (primHtml $ obfuscateString altText)
_ -> error $ "Unknown obfuscation method: " ++ show meth _ -> error $ "Unknown obfuscation method: " ++ show meth
_ -> anchor ! [href s] $ stringToHtml txt -- malformed email _ -> anchor ! [href' s] $ stringToHtml txt -- malformed email
-- | Obfuscate character as entity. -- | Obfuscate character as entity.
obfuscateChar :: Char -> String obfuscateChar :: Char -> String
@ -312,7 +320,7 @@ blockToHtml opts (Header level lst) = do
stringToHtml " " +++ contents stringToHtml " " +++ contents
else contents else contents
let contents'' = if writerTableOfContents opts let contents'' = if writerTableOfContents opts
then anchor ! [href $ "#" ++ writerIdentifierPrefix opts ++ "TOC"] $ contents' then anchor ! [href' $ "#" ++ writerIdentifierPrefix opts ++ "TOC"] $ contents'
else contents' else contents'
return $ case level of return $ case level of
1 -> h1 contents'' 1 -> h1 contents''
@ -452,7 +460,7 @@ inlineToHtml opts inline =
then thespan ! [theclass "math"] $ primHtml str then thespan ! [theclass "math"] $ primHtml str
else thediv ! [theclass "math"] $ primHtml str else thediv ! [theclass "math"] $ primHtml str
MimeTeX url -> MimeTeX url ->
return $ image ! [src (url ++ "?" ++ str), return $ image ! [src' (url ++ "?" ++ str),
alt str, title str] alt str, title str]
GladTeX -> GladTeX ->
return $ primHtml $ "<EQ>" ++ str ++ "</EQ>" return $ primHtml $ "<EQ>" ++ str ++ "</EQ>"
@ -484,13 +492,13 @@ inlineToHtml opts inline =
return $ obfuscateLink opts (show linkText) s return $ obfuscateLink opts (show linkText) s
(Link txt (s,tit)) -> do (Link txt (s,tit)) -> do
linkText <- inlineListToHtml opts txt linkText <- inlineListToHtml opts txt
return $ anchor ! ([href s] ++ return $ anchor ! ([href' s] ++
if null tit then [] else [title tit]) $ if null tit then [] else [title tit]) $
linkText linkText
(Image txt (s,tit)) -> do (Image txt (s,tit)) -> do
alternate <- inlineListToHtml opts txt alternate <- inlineListToHtml opts txt
let alternate' = renderFragment opts alternate let alternate' = renderFragment opts alternate
let attributes = [src s] ++ let attributes = [src' s] ++
(if null tit (if null tit
then [] then []
else [title tit]) ++ else [title tit]) ++
@ -508,7 +516,7 @@ inlineToHtml opts inline =
-- push contents onto front of notes -- push contents onto front of notes
put $ st {stNotes = (htmlContents:notes)} put $ st {stNotes = (htmlContents:notes)}
return $ sup << return $ sup <<
anchor ! [href ("#" ++ writerIdentifierPrefix opts ++ "fn" ++ ref), anchor ! [href' ("#" ++ writerIdentifierPrefix opts ++ "fn" ++ ref),
theclass "footnoteRef", theclass "footnoteRef",
prefixedId opts ("fnref" ++ ref)] << ref prefixedId opts ("fnref" ++ ref)] << ref
(Cite _ il) -> inlineListToHtml opts il (Cite _ il) -> inlineListToHtml opts il
@ -517,7 +525,7 @@ blockListToNote :: WriterOptions -> String -> [Block] -> State WriterState Html
blockListToNote opts ref blocks = blockListToNote opts ref blocks =
-- If last block is Para or Plain, include the backlink at the end of -- If last block is Para or Plain, include the backlink at the end of
-- that block. Otherwise, insert a new Plain block with the backlink. -- that block. Otherwise, insert a new Plain block with the backlink.
let backlink = [HtmlInline $ " <a href=\"#" ++ writerIdentifierPrefix opts ++ "fnref" ++ ref ++ let backlink = [HtmlInline $ " <a href=\"#" ++ stringToURI (writerIdentifierPrefix opts ++ "fnref" ++ ref) ++
"\" class=\"footnoteBackLink\"" ++ "\" class=\"footnoteBackLink\"" ++
" title=\"Jump back to footnote " ++ ref ++ "\">↩</a>"] " title=\"Jump back to footnote " ++ ref ++ "\">↩</a>"]
blocks' = if null blocks blocks' = if null blocks