From 942189056d76cd8dedfe11436fb9a7b6a7b8724c Mon Sep 17 00:00:00 2001 From: Thenaesh Elango Date: Sun, 5 Feb 2017 18:28:39 +0800 Subject: [PATCH] Allow user to specify User-Agent (#3421) This commit enables users to specify the User-Agent header used when pandoc requests a document from a URL. This is done by setting an environment variable. For instance, one can do: USER_AGENT="..." ./pandoc -f html -t markdown http://example.com Signed-off-by: Thenaesh Elango --- MANUAL.txt | 5 +++++ src/Text/Pandoc/Shared.hs | 14 +++++++++++--- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/MANUAL.txt b/MANUAL.txt index 77ca471c5..3d5555a92 100644 --- a/MANUAL.txt +++ b/MANUAL.txt @@ -120,6 +120,11 @@ pandoc will fetch the content using HTTP: pandoc -f html -t markdown http://www.fsf.org +It is possible to supply a custom User-Agent string when requesting a +document from a URL, by setting an environment variable: + + USER_AGENT="Mozilla/5.0" pandoc -f html -t markdown http://www.fsf.org + If multiple input files are given, `pandoc` will concatenate them all (with blank lines between them) before parsing. This feature is disabled for binary input formats such as `EPUB`, `odt`, and `docx`. diff --git a/src/Text/Pandoc/Shared.hs b/src/Text/Pandoc/Shared.hs index 22847931f..86e9a5525 100644 --- a/src/Text/Pandoc/Shared.hs +++ b/src/Text/Pandoc/Shared.hs @@ -146,13 +146,13 @@ import Paths_pandoc (getDataFileName) #endif #ifdef HTTP_CLIENT import Network.HTTP.Client (httpLbs, responseBody, responseHeaders, - Request(port,host)) + Request(port,host,requestHeaders)) import Network.HTTP.Client (parseRequest) import Network.HTTP.Client (newManager) import Network.HTTP.Client.Internal (addProxy) import Network.HTTP.Client.TLS (tlsManagerSettings) import System.Environment (getEnv) -import Network.HTTP.Types.Header ( hContentType) +import Network.HTTP.Types.Header ( hContentType, hUserAgent) import Network (withSocketsDo) #else import Network.URI (parseURI) @@ -742,13 +742,21 @@ openURL u | otherwise = withSocketsDo $ E.try $ do let parseReq = parseRequest (proxy :: Either E.SomeException String) <- E.try $ getEnv "http_proxy" + (useragent :: Either E.SomeException String) <- E.try $ getEnv "USER_AGENT" req <- parseReq u req' <- case proxy of Left _ -> return req Right pr -> (parseReq pr >>= \r -> return $ addProxy (host r) (port r) req) `mplus` return req - resp <- newManager tlsManagerSettings >>= httpLbs req' + req'' <- case useragent of + Left _ -> return req' + Right ua -> do + let headers = requestHeaders req' + let useragentheader = (hUserAgent, B8.pack ua) + let headers' = useragentheader:headers + return $ req' {requestHeaders = headers'} + resp <- newManager tlsManagerSettings >>= httpLbs req'' return (BS.concat $ toChunks $ responseBody resp, UTF8.toString `fmap` lookup hContentType (responseHeaders resp)) #else