From bf72a482ebf8483028f587fb538d35e2b18dade4 Mon Sep 17 00:00:00 2001
From: John MacFarlane <jgm@berkeley.edu>
Date: Sun, 23 Oct 2016 23:12:36 +0200
Subject: [PATCH] Tighten up parsing of raw email addresses.

Technically `**@user` is a valid email address, but if we
allow things like this, we get bad results in markdown flavors
that autolink raw email addresses. (See #2940.)
So we exclude a few valid email addresses in order to
avoid these more common bad cases.

Closes #2940.
---
 src/Text/Pandoc/Parsing.hs      | 17 +++++++++++++----
 tests/Tests/Readers/Markdown.hs |  5 +++++
 2 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/src/Text/Pandoc/Parsing.hs b/src/Text/Pandoc/Parsing.hs
index daf8e867d..110e34c6a 100644
--- a/src/Text/Pandoc/Parsing.hs
+++ b/src/Text/Pandoc/Parsing.hs
@@ -178,7 +178,7 @@ import qualified Text.Pandoc.UTF8 as UTF8 (putStrLn)
 import Text.Parsec hiding (token)
 import Text.Parsec.Pos (newPos)
 import Data.Char ( toLower, toUpper, ord, chr, isAscii, isAlphaNum,
-                   isHexDigit, isSpace )
+                   isHexDigit, isSpace, isPunctuation )
 import Data.List ( intercalate, transpose, isSuffixOf )
 import Text.Pandoc.Shared
 import qualified Data.Map as M
@@ -405,9 +405,18 @@ emailAddress = try $ toResult <$> mailbox <*> (char '@' *> domain)
        domain            = intercalate "." <$> (subdomain `sepby1` dot)
        dot               = char '.'
        subdomain         = many1 $ alphaNum <|> innerPunct
-       innerPunct        = try (satisfy (\c -> isEmailPunct c || c == '@') <*
-                                 notFollowedBy space)
-       emailWord         = many1 $ satisfy isEmailChar
+       -- this excludes some valid email addresses, since an
+       -- email could contain e.g. '__', but gives better results
+       -- for our purposes, when combined with markdown parsing:
+       innerPunct        = try (satisfy (\c -> isEmailPunct c || c == '@')
+                                 <* notFollowedBy space
+                                 <* notFollowedBy (satisfy isPunctuation))
+       -- technically an email address could begin with a symbol,
+       -- but allowing this creates too many problems.
+       -- See e.g. https://github.com/jgm/pandoc/issues/2940
+       emailWord         = do x <- satisfy isAlphaNum
+                              xs <- many (satisfy isEmailChar)
+                              return (x:xs)
        isEmailChar c     = isAlphaNum c || isEmailPunct c
        isEmailPunct c    = c `elem` "!\"#$%&'*+-/=?^_{|}~;"
        -- note: sepBy1 from parsec consumes input when sep
diff --git a/tests/Tests/Readers/Markdown.hs b/tests/Tests/Readers/Markdown.hs
index 099d75b62..e877b81ca 100644
--- a/tests/Tests/Readers/Markdown.hs
+++ b/tests/Tests/Readers/Markdown.hs
@@ -185,6 +185,11 @@ tests = [ testGroup "inline code"
             "<\n\na>" =?>
             para (text "<") <> para (text "a>")
           ]
+        , testGroup "raw email addresses"
+          [ test markdownGH "issue 2940" $
+            "**@user**" =?>
+            para (strong (text "@user"))
+          ]
         , testGroup "emoji"
           [ test markdownGH "emoji symbols" $
             ":smile: and :+1:" =?> para (text "😄 and 👍")