Changed URI parser so it doesn't include trailing punctuation.
So, in RST, 'http://google.com.' should be parsed as a link to 'http://google.com' followed by a period. The parser is smart enough to recognize balanced parentheses, as often occur in Wikipedia links: 'http://foo.bar/baz_(bam)'. Also added ']', '(' and ')' to RST specialChars, so '(http://google.com)' will be parsed as a link in parens. Added test cases. Resolves Issue #291.
This commit is contained in:
parent
d1304e8356
commit
6beba76f61
4 changed files with 29 additions and 5 deletions
|
@ -45,5 +45,13 @@ tests = [ "field list" =:
|
||||||
, (str "Parameter i", [para "integer"])
|
, (str "Parameter i", [para "integer"])
|
||||||
, (str "Final", [para "item on two lines"])
|
, (str "Final", [para "item on two lines"])
|
||||||
])
|
])
|
||||||
|
, "URLs with following punctuation" =:
|
||||||
|
("http://google.com, http://yahoo.com; http://foo.bar.baz.\n" ++
|
||||||
|
"http://foo.bar/baz_(bam) (http://foo.bar)") =?>
|
||||||
|
para (link "http://google.com" "" "http://google.com" +++ ", " +++
|
||||||
|
link "http://yahoo.com" "" "http://yahoo.com" +++ "; " +++
|
||||||
|
link "http://foo.bar.baz" "" "http://foo.bar.baz" +++ ". " +++
|
||||||
|
link "http://foo.bar/baz_(bam)" "" "http://foo.bar/baz_(bam)"
|
||||||
|
+++ " (" +++ link "http://foo.bar" "" "http://foo.bar" +++ ")")
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
|
@ -78,7 +78,7 @@ import Text.Pandoc.Generic
|
||||||
import qualified Text.Pandoc.UTF8 as UTF8 (putStrLn)
|
import qualified Text.Pandoc.UTF8 as UTF8 (putStrLn)
|
||||||
import Text.ParserCombinators.Parsec
|
import Text.ParserCombinators.Parsec
|
||||||
import Text.Pandoc.CharacterReferences ( characterReference )
|
import Text.Pandoc.CharacterReferences ( characterReference )
|
||||||
import Data.Char ( toLower, toUpper, ord, isAscii, isAlphaNum, isDigit )
|
import Data.Char ( toLower, toUpper, ord, isAscii, isAlphaNum, isDigit, isPunctuation )
|
||||||
import Data.List ( intercalate, transpose )
|
import Data.List ( intercalate, transpose )
|
||||||
import Network.URI ( parseURI, URI (..), isAllowedInURI )
|
import Network.URI ( parseURI, URI (..), isAllowedInURI )
|
||||||
import Control.Monad ( join, liftM, guard )
|
import Control.Monad ( join, liftM, guard )
|
||||||
|
@ -264,8 +264,24 @@ uri = try $ do
|
||||||
let protocols = [ "http:", "https:", "ftp:", "file:", "mailto:",
|
let protocols = [ "http:", "https:", "ftp:", "file:", "mailto:",
|
||||||
"news:", "telnet:" ]
|
"news:", "telnet:" ]
|
||||||
lookAhead $ oneOfStrings protocols
|
lookAhead $ oneOfStrings protocols
|
||||||
-- scan non-ascii characters and ascii characters allowed in a URI
|
-- Scan non-ascii characters and ascii characters allowed in a URI.
|
||||||
str <- many1 $ satisfy (\c -> not (isAscii c) || isAllowedInURI c)
|
-- We allow punctuation except when followed by a space, since
|
||||||
|
-- we don't want the trailing '.' in 'http://google.com.'
|
||||||
|
let innerPunct = try $ satisfy isPunctuation >>~
|
||||||
|
notFollowedBy (newline <|> spaceChar)
|
||||||
|
let uriChar = innerPunct <|>
|
||||||
|
satisfy (\c -> not (isPunctuation c) &&
|
||||||
|
(not (isAscii c) || isAllowedInURI c))
|
||||||
|
-- We want to allow
|
||||||
|
-- http://en.wikipedia.org/wiki/State_of_emergency_(disambiguation)
|
||||||
|
-- as a URL, while NOT picking up the closing paren in
|
||||||
|
-- (http://wikipedia.org)
|
||||||
|
-- So we include balanced parens in the URL.
|
||||||
|
let inParens = try $ do char '('
|
||||||
|
res <- many uriChar
|
||||||
|
char ')'
|
||||||
|
return $ '(' : res ++ ")"
|
||||||
|
str <- liftM concat $ many1 $ inParens <|> count 1 (innerPunct <|> uriChar)
|
||||||
-- now see if they amount to an absolute URI
|
-- now see if they amount to an absolute URI
|
||||||
case parseURI (escapeURI str) of
|
case parseURI (escapeURI str) of
|
||||||
Just uri' -> if uriScheme uri' `elem` protocols
|
Just uri' -> if uriScheme uri' `elem` protocols
|
||||||
|
|
|
@ -58,7 +58,7 @@ underlineChars = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
|
||||||
|
|
||||||
-- treat these as potentially non-text when parsing inline:
|
-- treat these as potentially non-text when parsing inline:
|
||||||
specialChars :: [Char]
|
specialChars :: [Char]
|
||||||
specialChars = "\\`|*_<>$:[-.\"'\8216\8217\8220\8221"
|
specialChars = "\\`|*_<>$:[]()-.\"'\8216\8217\8220\8221"
|
||||||
|
|
||||||
--
|
--
|
||||||
-- parsing documents
|
-- parsing documents
|
||||||
|
|
|
@ -222,7 +222,7 @@ Pandoc (Meta {docTitle = [Str "Pandoc",Space,Str "Test",Space,Str "Suite",Str ":
|
||||||
,Para [Str "But",Space,Str "not",Space,Str "here",Str ":"]
|
,Para [Str "But",Space,Str "not",Space,Str "here",Str ":"]
|
||||||
,CodeBlock ("",[],[]) "http://example.com/"
|
,CodeBlock ("",[],[]) "http://example.com/"
|
||||||
,Header 1 [Str "Images"]
|
,Header 1 [Str "Images"]
|
||||||
,Para [Str "From",Space,Quoted DoubleQuote [Str "Voyage",Space,Str "dans",Space,Str "la",Space,Str "Lune"],Space,Str "by",Space,Str "Georges",Space,Str "Melies",Space,Str "(1902)",Str ":"]
|
,Para [Str "From",Space,Quoted DoubleQuote [Str "Voyage",Space,Str "dans",Space,Str "la",Space,Str "Lune"],Space,Str "by",Space,Str "Georges",Space,Str "Melies",Space,Str "(",Str "1902",Str ")",Str ":"]
|
||||||
,Plain [Image [Str "image"] ("lalune.jpg","")]
|
,Plain [Image [Str "image"] ("lalune.jpg","")]
|
||||||
,Plain [Image [Str "Voyage dans la Lune"] ("lalune.jpg","")]
|
,Plain [Image [Str "Voyage dans la Lune"] ("lalune.jpg","")]
|
||||||
,Para [Str "Here",Space,Str "is",Space,Str "a",Space,Str "movie",Space,Image [Str "movie"] ("movie.jpg",""),Space,Str "icon",Str "."]
|
,Para [Str "Here",Space,Str "is",Space,Str "a",Space,Str "movie",Space,Image [Str "movie"] ("movie.jpg",""),Space,Str "icon",Str "."]
|
||||||
|
|
Loading…
Reference in a new issue