Shared: Provide custom isURI that rejects unknown schemes [isURI]

We also export the set of known `schemes`.

The new function replaces the function of the same name
from `Network.URI`, as the latter did not check whether a scheme is
well-known.  E.g. MediaWiki wikis frequently feature pages with names
like `User:John`. These links were interpreted as URIs, thus turning
internal links into global links. This is prevented by also checking
whether the scheme of a URI is frequently used (i.e. is IANA registered
or an otherwise well-known scheme).

Fixes: #2713

Update set of well-known URIs from IANA list
All official IANA schemes (as of 2017-05-22) are included in the set of
known schemes.  The four non-official schemes doi, isbn, javascript, and
pmid are kept.
This commit is contained in:
Albert Krewinkel 2017-05-23 09:48:11 +02:00 committed by John MacFarlane
parent 4d1e9b8e41
commit 5debb0da0f
16 changed files with 81 additions and 48 deletions

View file

@ -57,7 +57,7 @@ import Data.Maybe (fromMaybe, isJust, isNothing)
import qualified Data.Text as T
import Data.Yaml (decode)
import qualified Data.Yaml as Yaml
import Network.URI (URI (..), isURI, parseURI)
import Network.URI (URI (..), parseURI)
import Paths_pandoc (getDataDir)
import Skylighting (Style, Syntax (..), defaultSyntaxMap, parseTheme)
import Skylighting.Parser (missingIncludes, parseSyntaxDefinition,
@ -80,7 +80,7 @@ import Text.Pandoc.Lua ( runLuaFilter )
import Text.Pandoc.PDF (makePDF)
import Text.Pandoc.Process (pipeProcess)
import Text.Pandoc.SelfContained (makeSelfContained, makeDataURI)
import Text.Pandoc.Shared (headerShift, openURL, readDataFile,
import Text.Pandoc.Shared (isURI, headerShift, openURL, readDataFile,
readDataFileUTF8, safeRead, tabFilter)
import qualified Text.Pandoc.UTF8 as UTF8
import Text.Pandoc.XML (toEntities)

View file

@ -465,33 +465,8 @@ emailAddress = try $ toResult <$> mailbox <*> (char '@' *> domain)
sepby1 p sep = (:) <$> p <*> (many (try $ sep >> p))
-- Schemes from http://www.iana.org/assignments/uri-schemes.html plus
-- the unofficial schemes coap, doi, javascript, isbn, pmid
schemes :: [String]
schemes = ["coap","doi","javascript","aaa","aaas","about","acap","cap","cid",
"crid","data","dav","dict","dns","file","ftp","geo","go","gopher",
"h323","http","https","iax","icap","im","imap","info","ipp","iris",
"iris.beep","iris.xpc","iris.xpcs","iris.lwz","ldap","mailto","mid",
"msrp","msrps","mtqp","mupdate","news","nfs","ni","nih","nntp",
"opaquelocktoken","pop","pres","rtsp","service","session","shttp","sieve",
"sip","sips","sms","snmp","soap.beep","soap.beeps","tag","tel","telnet",
"tftp","thismessage","tn3270","tip","tv","urn","vemmi","ws","wss","xcon",
"xcon-userid","xmlrpc.beep","xmlrpc.beeps","xmpp","z39.50r","z39.50s",
"adiumxtra","afp","afs","aim","apt","attachment","aw","beshare","bitcoin",
"bolo","callto","chrome","chrome-extension","com-eventbrite-attendee",
"content", "cvs","dlna-playsingle","dlna-playcontainer","dtn","dvb",
"ed2k","facetime","feed","finger","fish","gg","git","gizmoproject",
"gtalk","hcp","icon","ipn","irc","irc6","ircs","itms","jar","jms",
"keyparc","lastfm","ldaps","magnet","maps","market","message","mms",
"ms-help","msnim","mumble","mvn","notes","oid","palm","paparazzi",
"platform","proxy","psyc","query","res","resource","rmi","rsync",
"rtmp","secondlife","sftp","sgn","skype","smb","soldat","spotify",
"ssh","steam","svn","teamspeak","things","udp","unreal","ut2004",
"ventrilo","view-source","webcal","wtai","wyciwyg","xfire","xri",
"ymsgr", "isbn", "pmid"]
uriScheme :: Stream s m Char => ParserT s st m String
uriScheme = oneOfStringsCI schemes
uriScheme = oneOfStringsCI (Set.toList schemes)
-- | Parses a URI. Returns pair of original and URI-escaped version.
uri :: Stream [Char] m Char => ParserT [Char] st m (String, String)

View file

@ -42,7 +42,6 @@ import Text.Pandoc.Definition
import Text.Pandoc.Options
import Text.Pandoc.Parsing hiding (macro, space, spaces, uri)
import Text.Pandoc.Shared (compactify, compactifyDL, escapeURI)
--import Network.URI (isURI) -- Not sure whether to use this function
import Control.Monad (guard, void, when)
import Control.Monad.Reader (Reader, asks, runReader)
import Data.Default

View file

@ -42,7 +42,7 @@ import qualified Data.ByteString.Char8 as B
import qualified Data.ByteString.Lazy as L
import Data.Char (isAlphaNum, isAscii, toLower)
import Data.List (isPrefixOf)
import Network.URI (URI (..), escapeURIString, isURI, parseURI)
import Network.URI (URI (..), escapeURIString, parseURI)
import System.FilePath (takeDirectory, takeExtension, (</>))
import Text.HTML.TagSoup
import Text.Pandoc.Class (PandocMonad (..), fetchItem, report)
@ -50,7 +50,7 @@ import Text.Pandoc.Error
import Text.Pandoc.Logging
import Text.Pandoc.MIME (MimeType)
import Text.Pandoc.Options (WriterOptions (..))
import Text.Pandoc.Shared (renderTags', trim)
import Text.Pandoc.Shared (isURI, renderTags', trim)
import Text.Pandoc.UTF8 (toString)
import Text.Parsec (ParsecT, runParserT)
import qualified Text.Parsec as P

View file

@ -81,6 +81,9 @@ module Text.Pandoc.Shared (
openURL,
collapseFilePath,
filteredFilesFromArchive,
-- * URI handling
schemes,
isURI,
-- * Error handling
mapLeft,
-- * for squashing blocks
@ -104,7 +107,7 @@ import Data.List ( find, stripPrefix, intercalate )
import Data.Maybe (mapMaybe)
import Data.Version ( showVersion )
import qualified Data.Map as M
import Network.URI ( escapeURIString, unEscapeString )
import Network.URI ( URI(uriScheme), escapeURIString, unEscapeString, parseURI )
import qualified Data.Set as Set
import System.Directory
import System.FilePath (splitDirectories, isPathSeparator)
@ -774,6 +777,70 @@ filteredFilesFromArchive zf f =
fileAndBinary :: Archive -> FilePath -> Maybe (FilePath, BL.ByteString)
fileAndBinary a fp = findEntryByPath fp a >>= \e -> Just (fp, fromEntry e)
--
-- IANA URIs
--
-- | Schemes from http://www.iana.org/assignments/uri-schemes.html plus
-- the unofficial schemes doi, javascript, isbn, pmid.
schemes :: Set.Set String
schemes = Set.fromList
-- Official IANA schemes
[ "aaa", "aaas", "about", "acap", "acct", "acr", "adiumxtra", "afp", "afs"
, "aim", "appdata", "apt", "attachment", "aw", "barion", "beshare", "bitcoin"
, "blob", "bolo", "browserext", "callto", "cap", "chrome", "chrome-extension"
, "cid", "coap", "coaps", "com-eventbrite-attendee", "content", "crid", "cvs"
, "data", "dav", "dict", "dis", "dlna-playcontainer", "dlna-playsingle"
, "dns", "dntp", "dtn", "dvb", "ed2k", "example", "facetime", "fax", "feed"
, "feedready", "file", "filesystem", "finger", "fish", "ftp", "geo", "gg"
, "git", "gizmoproject", "go", "gopher", "graph", "gtalk", "h323", "ham"
, "hcp", "http", "https", "hxxp", "hxxps", "hydrazone", "iax", "icap", "icon"
, "im", "imap", "info", "iotdisco", "ipn", "ipp", "ipps", "irc", "irc6"
, "ircs", "iris", "iris.beep", "iris.lwz", "iris.xpc", "iris.xpcs"
, "isostore", "itms", "jabber", "jar", "jms", "keyparc", "lastfm", "ldap"
, "ldaps", "lvlt", "magnet", "mailserver", "mailto", "maps", "market"
, "message", "mid", "mms", "modem", "mongodb", "moz", "ms-access"
, "ms-browser-extension", "ms-drive-to", "ms-enrollment", "ms-excel"
, "ms-gamebarservices", "ms-getoffice", "ms-help", "ms-infopath"
, "ms-media-stream-id", "ms-officeapp", "ms-project", "ms-powerpoint"
, "ms-publisher", "ms-search-repair", "ms-secondary-screen-controller"
, "ms-secondary-screen-setup", "ms-settings", "ms-settings-airplanemode"
, "ms-settings-bluetooth", "ms-settings-camera", "ms-settings-cellular"
, "ms-settings-cloudstorage", "ms-settings-connectabledevices"
, "ms-settings-displays-topology", "ms-settings-emailandaccounts"
, "ms-settings-language", "ms-settings-location", "ms-settings-lock"
, "ms-settings-nfctransactions", "ms-settings-notifications"
, "ms-settings-power", "ms-settings-privacy", "ms-settings-proximity"
, "ms-settings-screenrotation", "ms-settings-wifi", "ms-settings-workplace"
, "ms-spd", "ms-sttoverlay", "ms-transit-to", "ms-virtualtouchpad"
, "ms-visio", "ms-walk-to", "ms-whiteboard", "ms-whiteboard-cmd", "ms-word"
, "msnim", "msrp", "msrps", "mtqp", "mumble", "mupdate", "mvn", "news", "nfs"
, "ni", "nih", "nntp", "notes", "ocf", "oid", "onenote", "onenote-cmd"
, "opaquelocktoken", "pack", "palm", "paparazzi", "pkcs11", "platform", "pop"
, "pres", "prospero", "proxy", "pwid", "psyc", "qb", "query", "redis"
, "rediss", "reload", "res", "resource", "rmi", "rsync", "rtmfp", "rtmp"
, "rtsp", "rtsps", "rtspu", "secondlife", "service", "session", "sftp", "sgn"
, "shttp", "sieve", "sip", "sips", "skype", "smb", "sms", "smtp", "snews"
, "snmp", "soap.beep", "soap.beeps", "soldat", "spotify", "ssh", "steam"
, "stun", "stuns", "submit", "svn", "tag", "teamspeak", "tel", "teliaeid"
, "telnet", "tftp", "things", "thismessage", "tip", "tn3270", "tool", "turn"
, "turns", "tv", "udp", "unreal", "urn", "ut2004", "v-event", "vemmi"
, "ventrilo", "videotex", "vnc", "view-source", "wais", "webcal", "wpid"
, "ws", "wss", "wtai", "wyciwyg", "xcon", "xcon-userid", "xfire"
, "xmlrpc.beep", "xmlrpc.beeps", "xmpp", "xri", "ymsgr", "z39.50", "z39.50r"
, "z39.50s"
-- Inofficial schemes
, "doi", "isbn", "javascript", "pmid"
]
-- | Check if the string is a valid URL with a IANA or frequently used but
-- unofficial scheme (see @schemes@).
isURI :: String -> Bool
isURI = maybe False hasKnownScheme . parseURI
where
hasKnownScheme = (`Set.member` schemes) . filter (/= ':') . uriScheme
---
--- Squash blocks into inlines
---

View file

@ -33,7 +33,7 @@ import Control.Monad.State
import Data.Char (ord)
import Data.List (intercalate, intersperse)
import Data.Maybe (catMaybes)
import Network.URI (isURI, unEscapeString)
import Network.URI (unEscapeString)
import Text.Pandoc.Class (PandocMonad, report)
import Text.Pandoc.Logging
import Text.Pandoc.Definition

View file

@ -44,13 +44,12 @@ import Control.Monad.Reader (ReaderT, ask, local, runReaderT)
import Control.Monad.State (StateT, evalStateT, gets, modify)
import Data.Default (Default (..))
import Data.List (intercalate, intersect, isPrefixOf, transpose)
import Network.URI (isURI)
import Text.Pandoc.Class (PandocMonad, report)
import Text.Pandoc.Logging
import Text.Pandoc.Definition
import Text.Pandoc.ImageSize
import Text.Pandoc.Options (WrapOption (..), WriterOptions (writerTableOfContents, writerTemplate, writerWrapText))
import Text.Pandoc.Shared (camelCaseToHyphenated, escapeURI, linesToPara,
import Text.Pandoc.Shared (camelCaseToHyphenated, escapeURI, isURI, linesToPara,
removeFormatting, substitute, trimr)
import Text.Pandoc.Templates (renderTemplate')
import Text.Pandoc.Writers.Shared (defField, metaToJSON)

View file

@ -46,7 +46,6 @@ import Data.Char (isAscii, isControl, isSpace, toLower)
import Data.Either (lefts, rights)
import Data.List (intercalate, intersperse, isPrefixOf, stripPrefix)
import Network.HTTP (urlEncode)
import Network.URI (isURI)
import Text.XML.Light
import qualified Text.XML.Light as X
import qualified Text.XML.Light.Cursor as XC
@ -57,7 +56,7 @@ import Text.Pandoc.Definition
import Text.Pandoc.Error
import Text.Pandoc.Logging
import Text.Pandoc.Options (HTMLMathMethod (..), WriterOptions (..), def)
import Text.Pandoc.Shared (capitalize, isHeaderBlock, linesToPara,
import Text.Pandoc.Shared (capitalize, isHeaderBlock, isURI, linesToPara,
orderedListMarkers)
-- | Data to be written at the end of the document:

View file

@ -36,7 +36,6 @@ module Text.Pandoc.Writers.Haddock (writeHaddock) where
import Control.Monad.State
import Data.Default
import Data.List (intersperse, transpose)
import Network.URI (isURI)
import Text.Pandoc.Class (PandocMonad, report)
import Text.Pandoc.Definition
import Text.Pandoc.Logging

View file

@ -21,7 +21,6 @@ import Control.Monad.State
import Data.List (intersperse, isInfixOf, isPrefixOf, stripPrefix)
import qualified Data.Set as Set
import Data.Text as Text (breakOnAll, pack)
import Network.URI (isURI)
import Text.Pandoc.Class (PandocMonad, report)
import qualified Text.Pandoc.Class as P
import Text.Pandoc.Definition
@ -29,7 +28,7 @@ import Text.Pandoc.ImageSize
import Text.Pandoc.Logging
import Text.Pandoc.Options
import Text.Pandoc.Pretty
import Text.Pandoc.Shared (linesToPara, splitBy)
import Text.Pandoc.Shared (isURI, linesToPara, splitBy)
import Text.Pandoc.Templates (renderTemplate')
import Text.Pandoc.Writers.Math (texMathToInlines)
import Text.Pandoc.Writers.Shared

View file

@ -43,7 +43,7 @@ import Data.List (foldl', intercalate, intersperse, isInfixOf, nub, nubBy,
stripPrefix, (\\))
import Data.Maybe (catMaybes, fromMaybe, isJust)
import qualified Data.Text as T
import Network.URI (isURI, unEscapeString)
import Network.URI (unEscapeString)
import Text.Pandoc.Class (PandocMonad, report)
import Text.Pandoc.Definition
import Text.Pandoc.Highlighting (formatLaTeXBlock, formatLaTeXInline, highlight,

View file

@ -49,7 +49,6 @@ import qualified Data.Text as T
import qualified Data.Vector as V
import Data.Yaml (Value (Array, Bool, Number, Object, String))
import Network.HTTP (urlEncode)
import Network.URI (isURI)
import Text.HTML.TagSoup (Tag (..), isTagText, parseTags)
import Text.Pandoc.Class (PandocMonad, report)
import Text.Pandoc.Definition

View file

@ -34,7 +34,6 @@ import Control.Monad.Reader
import Control.Monad.State
import Data.List (intercalate)
import qualified Data.Set as Set
import Network.URI (isURI)
import Text.Pandoc.Class (PandocMonad, report)
import Text.Pandoc.Logging
import Text.Pandoc.Definition

View file

@ -35,7 +35,6 @@ import Control.Monad.State
import Data.Char (isSpace, toLower)
import Data.List (isPrefixOf, stripPrefix)
import Data.Maybe (fromMaybe)
import Network.URI (isURI)
import qualified Text.Pandoc.Builder as B
import Text.Pandoc.Class (PandocMonad, report)
import Text.Pandoc.Logging

View file

@ -37,7 +37,7 @@ import Data.Char (chr, ord)
import Data.List (maximumBy, transpose)
import Data.Ord (comparing)
import qualified Data.Set as Set
import Network.URI (isURI, unEscapeString)
import Network.URI (unEscapeString)
import System.FilePath
import Text.Pandoc.Class (PandocMonad, report)
import Text.Pandoc.Definition

View file

@ -38,14 +38,13 @@ import Data.Default (Default (..))
import Data.List (intercalate, isInfixOf, isPrefixOf, transpose)
import qualified Data.Map as Map
import Data.Text (breakOnAll, pack)
import Network.URI (isURI)
import Text.Pandoc.Class (PandocMonad, report)
import Text.Pandoc.Logging
import Text.Pandoc.Definition
import Text.Pandoc.ImageSize
import Text.Pandoc.Options (WrapOption (..), WriterOptions (writerTableOfContents, writerTemplate, writerWrapText))
import Text.Pandoc.Shared (escapeURI, linesToPara, removeFormatting, substitute,
trimr)
import Text.Pandoc.Shared (isURI, escapeURI, linesToPara, removeFormatting,
substitute, trimr)
import Text.Pandoc.Templates (renderTemplate')
import Text.Pandoc.Writers.Shared (defField, metaToJSON)