Fixed treatment of unicode characters in URIs.
* Added stringToURI to Shared. This is used in the HTML writer for all URIs. It properly URI-encodes high characters (> 127), leaving everything else (including symbols and spaces) the same. * Modified unsanitaryURI to allow UTF8 characters in a URI. (First, we convert the URI to URI-encoded octets, then we pass through parseURIReference.) This resolves gitit Issue #99. Previously '[abc](http://gitit.net/测试)' would not be rendered as a link when --sanitize was selected.
This commit is contained in:
parent
7689cacb5d
commit
71eac37ac5
3 changed files with 38 additions and 14 deletions
|
@ -182,7 +182,7 @@ unsanitaryURI u =
|
||||||
"ldaps:", "magnet:", "mms:", "msnim:", "notes:", "rsync:",
|
"ldaps:", "magnet:", "mms:", "msnim:", "notes:", "rsync:",
|
||||||
"secondlife:", "skype:", "ssh:", "sftp:", "smb:", "sms:",
|
"secondlife:", "skype:", "ssh:", "sftp:", "smb:", "sms:",
|
||||||
"snews:", "webcal:", "ymsgr:"]
|
"snews:", "webcal:", "ymsgr:"]
|
||||||
in case parseURIReference u of
|
in case parseURIReference (stringToURI u) of
|
||||||
Just p -> (map toLower $ uriScheme p) `notElem` safeURISchemes
|
Just p -> (map toLower $ uriScheme p) `notElem` safeURISchemes
|
||||||
Nothing -> True
|
Nothing -> True
|
||||||
|
|
||||||
|
|
|
@ -43,6 +43,7 @@ module Text.Pandoc.Shared (
|
||||||
stripFirstAndLast,
|
stripFirstAndLast,
|
||||||
camelCaseToHyphenated,
|
camelCaseToHyphenated,
|
||||||
toRomanNumeral,
|
toRomanNumeral,
|
||||||
|
stringToURI,
|
||||||
wrapped,
|
wrapped,
|
||||||
wrapIfNeeded,
|
wrapIfNeeded,
|
||||||
wrappedTeX,
|
wrappedTeX,
|
||||||
|
@ -114,7 +115,7 @@ import Text.ParserCombinators.Parsec
|
||||||
import Text.PrettyPrint.HughesPJ ( Doc, fsep, ($$), (<>), empty, isEmpty, text, nest )
|
import Text.PrettyPrint.HughesPJ ( Doc, fsep, ($$), (<>), empty, isEmpty, text, nest )
|
||||||
import qualified Text.PrettyPrint.HughesPJ as PP
|
import qualified Text.PrettyPrint.HughesPJ as PP
|
||||||
import Text.Pandoc.CharacterReferences ( characterReference )
|
import Text.Pandoc.CharacterReferences ( characterReference )
|
||||||
import Data.Char ( toLower, toUpper, ord, isLower, isUpper, isAlpha,
|
import Data.Char ( toLower, toUpper, ord, chr, isLower, isUpper, isAlpha,
|
||||||
isPunctuation )
|
isPunctuation )
|
||||||
import Data.List ( find, isPrefixOf, intercalate )
|
import Data.List ( find, isPrefixOf, intercalate )
|
||||||
import Network.URI ( parseURI, URI (..), isAllowedInURI )
|
import Network.URI ( parseURI, URI (..), isAllowedInURI )
|
||||||
|
@ -130,7 +131,12 @@ import System.IO.UTF8
|
||||||
import Data.Generics
|
import Data.Generics
|
||||||
import qualified Control.Monad.State as S
|
import qualified Control.Monad.State as S
|
||||||
import Control.Monad (join)
|
import Control.Monad (join)
|
||||||
|
import Data.ByteString (unpack)
|
||||||
|
import Data.Word (Word8)
|
||||||
|
import Data.ByteString.UTF8 (fromString)
|
||||||
|
import Text.Printf (printf)
|
||||||
import Paths_pandoc (getDataFileName)
|
import Paths_pandoc (getDataFileName)
|
||||||
|
|
||||||
--
|
--
|
||||||
-- List processing
|
-- List processing
|
||||||
--
|
--
|
||||||
|
@ -228,6 +234,16 @@ toRomanNumeral x =
|
||||||
_ | x >= 1 -> "I" ++ toRomanNumeral (x - 1)
|
_ | x >= 1 -> "I" ++ toRomanNumeral (x - 1)
|
||||||
_ -> ""
|
_ -> ""
|
||||||
|
|
||||||
|
-- | Escape unicode characters in a URI. This means converting
|
||||||
|
-- them to UTF-8, then URI-encoding the octets. We leave everything
|
||||||
|
-- else the same, assuming that the user has already escaped
|
||||||
|
-- special characters like & and %.
|
||||||
|
stringToURI :: String -> String
|
||||||
|
stringToURI = concatMap encodeOctet . unpack . fromString
|
||||||
|
where encodeOctet :: Word8 -> String
|
||||||
|
encodeOctet x | x > 127 = printf "%%%2x" x
|
||||||
|
encodeOctet x = [chr (fromIntegral x)]
|
||||||
|
|
||||||
-- | Wrap inlines to line length.
|
-- | Wrap inlines to line length.
|
||||||
wrapped :: Monad m => ([Inline] -> m Doc) -> [Inline] -> m Doc
|
wrapped :: Monad m => ([Inline] -> m Doc) -> [Inline] -> m Doc
|
||||||
wrapped listWriter sect = (mapM listWriter $ splitBy Space sect) >>=
|
wrapped listWriter sect = (mapM listWriter $ splitBy Space sect) >>=
|
||||||
|
|
|
@ -67,6 +67,14 @@ renderFragment opts = if writerWrapText opts
|
||||||
stringToHtml :: String -> Html
|
stringToHtml :: String -> Html
|
||||||
stringToHtml = primHtml . escapeStringForXML
|
stringToHtml = primHtml . escapeStringForXML
|
||||||
|
|
||||||
|
-- Note: href and src, unmodified, incorrectly escape high
|
||||||
|
-- characters in URIs using entities. So we use these replacements:
|
||||||
|
href' :: String -> HtmlAttr
|
||||||
|
href' = href . stringToURI
|
||||||
|
|
||||||
|
src' :: String -> HtmlAttr
|
||||||
|
src' = src . stringToURI
|
||||||
|
|
||||||
-- | Convert Pandoc document to Html string.
|
-- | Convert Pandoc document to Html string.
|
||||||
writeHtmlString :: WriterOptions -> Pandoc -> String
|
writeHtmlString :: WriterOptions -> Pandoc -> String
|
||||||
writeHtmlString opts d =
|
writeHtmlString opts d =
|
||||||
|
@ -112,13 +120,13 @@ pandocToHtml opts (Pandoc (Meta title' authors' date') blocks) = do
|
||||||
then case writerHTMLMathMethod opts of
|
then case writerHTMLMathMethod opts of
|
||||||
LaTeXMathML (Just url) ->
|
LaTeXMathML (Just url) ->
|
||||||
script !
|
script !
|
||||||
[src url, thetype "text/javascript"] $ noHtml
|
[src' url, thetype "text/javascript"] $ noHtml
|
||||||
MathML (Just url) ->
|
MathML (Just url) ->
|
||||||
script !
|
script !
|
||||||
[src url, thetype "text/javascript"] $ noHtml
|
[src' url, thetype "text/javascript"] $ noHtml
|
||||||
JsMath (Just url) ->
|
JsMath (Just url) ->
|
||||||
script !
|
script !
|
||||||
[src url, thetype "text/javascript"] $ noHtml
|
[src' url, thetype "text/javascript"] $ noHtml
|
||||||
_ -> case lookup "mathml-script" (writerVariables opts) of
|
_ -> case lookup "mathml-script" (writerVariables opts) of
|
||||||
Just s ->
|
Just s ->
|
||||||
script ! [thetype "text/javascript"] <<
|
script ! [thetype "text/javascript"] <<
|
||||||
|
@ -188,7 +196,7 @@ elementToListItem opts (Sec _ num id' headerText subsecs) = do
|
||||||
let subList = if null subHeads
|
let subList = if null subHeads
|
||||||
then noHtml
|
then noHtml
|
||||||
else unordList subHeads
|
else unordList subHeads
|
||||||
return $ Just $ (anchor ! [href ("#" ++ writerIdentifierPrefix opts ++ id')] $ txt) +++ subList
|
return $ Just $ (anchor ! [href' ("#" ++ writerIdentifierPrefix opts ++ id')] $ txt) +++ subList
|
||||||
|
|
||||||
-- | Convert an Element to Html.
|
-- | Convert an Element to Html.
|
||||||
elementToHtml :: WriterOptions -> Element -> State WriterState Html
|
elementToHtml :: WriterOptions -> Element -> State WriterState Html
|
||||||
|
@ -222,7 +230,7 @@ parseMailto _ = Nothing
|
||||||
-- | Obfuscate a "mailto:" link.
|
-- | Obfuscate a "mailto:" link.
|
||||||
obfuscateLink :: WriterOptions -> String -> String -> Html
|
obfuscateLink :: WriterOptions -> String -> String -> Html
|
||||||
obfuscateLink opts txt s | writerEmailObfuscation opts == NoObfuscation =
|
obfuscateLink opts txt s | writerEmailObfuscation opts == NoObfuscation =
|
||||||
anchor ! [href s] << txt
|
anchor ! [href' s] << txt
|
||||||
obfuscateLink opts txt s =
|
obfuscateLink opts txt s =
|
||||||
let meth = writerEmailObfuscation opts
|
let meth = writerEmailObfuscation opts
|
||||||
s' = map toLower s
|
s' = map toLower s
|
||||||
|
@ -249,7 +257,7 @@ obfuscateLink opts txt s =
|
||||||
linkText ++ "+'<\\/'+'a'+'>');\n// -->\n")) +++
|
linkText ++ "+'<\\/'+'a'+'>');\n// -->\n")) +++
|
||||||
noscript (primHtml $ obfuscateString altText)
|
noscript (primHtml $ obfuscateString altText)
|
||||||
_ -> error $ "Unknown obfuscation method: " ++ show meth
|
_ -> error $ "Unknown obfuscation method: " ++ show meth
|
||||||
_ -> anchor ! [href s] $ stringToHtml txt -- malformed email
|
_ -> anchor ! [href' s] $ stringToHtml txt -- malformed email
|
||||||
|
|
||||||
-- | Obfuscate character as entity.
|
-- | Obfuscate character as entity.
|
||||||
obfuscateChar :: Char -> String
|
obfuscateChar :: Char -> String
|
||||||
|
@ -312,7 +320,7 @@ blockToHtml opts (Header level lst) = do
|
||||||
stringToHtml " " +++ contents
|
stringToHtml " " +++ contents
|
||||||
else contents
|
else contents
|
||||||
let contents'' = if writerTableOfContents opts
|
let contents'' = if writerTableOfContents opts
|
||||||
then anchor ! [href $ "#" ++ writerIdentifierPrefix opts ++ "TOC"] $ contents'
|
then anchor ! [href' $ "#" ++ writerIdentifierPrefix opts ++ "TOC"] $ contents'
|
||||||
else contents'
|
else contents'
|
||||||
return $ case level of
|
return $ case level of
|
||||||
1 -> h1 contents''
|
1 -> h1 contents''
|
||||||
|
@ -452,7 +460,7 @@ inlineToHtml opts inline =
|
||||||
then thespan ! [theclass "math"] $ primHtml str
|
then thespan ! [theclass "math"] $ primHtml str
|
||||||
else thediv ! [theclass "math"] $ primHtml str
|
else thediv ! [theclass "math"] $ primHtml str
|
||||||
MimeTeX url ->
|
MimeTeX url ->
|
||||||
return $ image ! [src (url ++ "?" ++ str),
|
return $ image ! [src' (url ++ "?" ++ str),
|
||||||
alt str, title str]
|
alt str, title str]
|
||||||
GladTeX ->
|
GladTeX ->
|
||||||
return $ primHtml $ "<EQ>" ++ str ++ "</EQ>"
|
return $ primHtml $ "<EQ>" ++ str ++ "</EQ>"
|
||||||
|
@ -484,13 +492,13 @@ inlineToHtml opts inline =
|
||||||
return $ obfuscateLink opts (show linkText) s
|
return $ obfuscateLink opts (show linkText) s
|
||||||
(Link txt (s,tit)) -> do
|
(Link txt (s,tit)) -> do
|
||||||
linkText <- inlineListToHtml opts txt
|
linkText <- inlineListToHtml opts txt
|
||||||
return $ anchor ! ([href s] ++
|
return $ anchor ! ([href' s] ++
|
||||||
if null tit then [] else [title tit]) $
|
if null tit then [] else [title tit]) $
|
||||||
linkText
|
linkText
|
||||||
(Image txt (s,tit)) -> do
|
(Image txt (s,tit)) -> do
|
||||||
alternate <- inlineListToHtml opts txt
|
alternate <- inlineListToHtml opts txt
|
||||||
let alternate' = renderFragment opts alternate
|
let alternate' = renderFragment opts alternate
|
||||||
let attributes = [src s] ++
|
let attributes = [src' s] ++
|
||||||
(if null tit
|
(if null tit
|
||||||
then []
|
then []
|
||||||
else [title tit]) ++
|
else [title tit]) ++
|
||||||
|
@ -508,7 +516,7 @@ inlineToHtml opts inline =
|
||||||
-- push contents onto front of notes
|
-- push contents onto front of notes
|
||||||
put $ st {stNotes = (htmlContents:notes)}
|
put $ st {stNotes = (htmlContents:notes)}
|
||||||
return $ sup <<
|
return $ sup <<
|
||||||
anchor ! [href ("#" ++ writerIdentifierPrefix opts ++ "fn" ++ ref),
|
anchor ! [href' ("#" ++ writerIdentifierPrefix opts ++ "fn" ++ ref),
|
||||||
theclass "footnoteRef",
|
theclass "footnoteRef",
|
||||||
prefixedId opts ("fnref" ++ ref)] << ref
|
prefixedId opts ("fnref" ++ ref)] << ref
|
||||||
(Cite _ il) -> inlineListToHtml opts il
|
(Cite _ il) -> inlineListToHtml opts il
|
||||||
|
@ -517,7 +525,7 @@ blockListToNote :: WriterOptions -> String -> [Block] -> State WriterState Html
|
||||||
blockListToNote opts ref blocks =
|
blockListToNote opts ref blocks =
|
||||||
-- If last block is Para or Plain, include the backlink at the end of
|
-- If last block is Para or Plain, include the backlink at the end of
|
||||||
-- that block. Otherwise, insert a new Plain block with the backlink.
|
-- that block. Otherwise, insert a new Plain block with the backlink.
|
||||||
let backlink = [HtmlInline $ " <a href=\"#" ++ writerIdentifierPrefix opts ++ "fnref" ++ ref ++
|
let backlink = [HtmlInline $ " <a href=\"#" ++ stringToURI (writerIdentifierPrefix opts ++ "fnref" ++ ref) ++
|
||||||
"\" class=\"footnoteBackLink\"" ++
|
"\" class=\"footnoteBackLink\"" ++
|
||||||
" title=\"Jump back to footnote " ++ ref ++ "\">↩</a>"]
|
" title=\"Jump back to footnote " ++ ref ++ "\">↩</a>"]
|
||||||
blocks' = if null blocks
|
blocks' = if null blocks
|
||||||
|
|
Loading…
Add table
Reference in a new issue