diff --git a/src/Text/Pandoc/Parsing.hs b/src/Text/Pandoc/Parsing.hs index daf8e867d..110e34c6a 100644 --- a/src/Text/Pandoc/Parsing.hs +++ b/src/Text/Pandoc/Parsing.hs @@ -178,7 +178,7 @@ import qualified Text.Pandoc.UTF8 as UTF8 (putStrLn) import Text.Parsec hiding (token) import Text.Parsec.Pos (newPos) import Data.Char ( toLower, toUpper, ord, chr, isAscii, isAlphaNum, - isHexDigit, isSpace ) + isHexDigit, isSpace, isPunctuation ) import Data.List ( intercalate, transpose, isSuffixOf ) import Text.Pandoc.Shared import qualified Data.Map as M @@ -405,9 +405,18 @@ emailAddress = try $ toResult <$> mailbox <*> (char '@' *> domain) domain = intercalate "." <$> (subdomain `sepby1` dot) dot = char '.' subdomain = many1 $ alphaNum <|> innerPunct - innerPunct = try (satisfy (\c -> isEmailPunct c || c == '@') <* - notFollowedBy space) - emailWord = many1 $ satisfy isEmailChar + -- this excludes some valid email addresses, since an + -- email could contain e.g. '__', but gives better results + -- for our purposes, when combined with markdown parsing: + innerPunct = try (satisfy (\c -> isEmailPunct c || c == '@') + <* notFollowedBy space + <* notFollowedBy (satisfy isPunctuation)) + -- technically an email address could begin with a symbol, + -- but allowing this creates too many problems. + -- See e.g. https://github.com/jgm/pandoc/issues/2940 + emailWord = do x <- satisfy isAlphaNum + xs <- many (satisfy isEmailChar) + return (x:xs) isEmailChar c = isAlphaNum c || isEmailPunct c isEmailPunct c = c `elem` "!\"#$%&'*+-/=?^_{|}~;" -- note: sepBy1 from parsec consumes input when sep diff --git a/tests/Tests/Readers/Markdown.hs b/tests/Tests/Readers/Markdown.hs index 099d75b62..e877b81ca 100644 --- a/tests/Tests/Readers/Markdown.hs +++ b/tests/Tests/Readers/Markdown.hs @@ -185,6 +185,11 @@ tests = [ testGroup "inline code" "<\n\na>" =?> para (text "<") <> para (text "a>") ] + , testGroup "raw email addresses" + [ test markdownGH "issue 2940" $ + "**@user**" =?> + para (strong (text "@user")) + ] , testGroup "emoji" [ test markdownGH "emoji symbols" $ ":smile: and :+1:" =?> para (text "😄 and 👍")