Added readerAbbreviations to ParserState.

Markdown reader now consults this to determine what is an abbreviation. Eventually it will be possible to specify a custom list (see #256).
2017-03-05 10:24:39 +01:00 · 2017-03-05 10:24:39 +01:00 · 95f2726ee7
commit 95f2726ee7
parent 7fc6919f90
2 changed files with 27 additions and 25 deletions
--- a/src/Text/Pandoc/Options.hs
+++ b/src/Text/Pandoc/Options.hs
@ -47,6 +47,7 @@ module Text.Pandoc.Options ( module Text.Pandoc.Extensions
                           ) where
 import Data.Data (Data)
 import Data.Default
+import qualified Data.Set as Set
 import Data.Typeable (Typeable)
 import GHC.Generics (Generic)
 import Text.Pandoc.Extensions
@ -60,6 +61,7 @@ data ReaderOptions = ReaderOptions{
       , readerApplyMacros           :: Bool -- ^ Apply macros to TeX math
       , readerIndentedCodeClasses   :: [String] -- ^ Default classes for
                                       -- indented code blocks
+       , readerAbbreviations         :: Set.Set String -- ^ Strings to treat as abbreviations
       , readerDefaultImageExtension :: String -- ^ Default extension for images
       , readerTrackChanges          :: TrackChanges
 } deriving (Show, Read, Data, Typeable, Generic)
@ -72,10 +74,19 @@ instance Default ReaderOptions
               , readerTabStop               = 4
               , readerApplyMacros           = True
               , readerIndentedCodeClasses   = []
+               , readerAbbreviations         = defaultAbbrevs
               , readerDefaultImageExtension = ""
               , readerTrackChanges          = AcceptChanges
               }

+defaultAbbrevs :: Set.Set String
+defaultAbbrevs = Set.fromList
+                 [ "Mr.", "Mrs.", "Ms.", "Capt.", "Dr.", "Prof.",
+                   "Gen.", "Gov.", "e.g.", "i.e.", "Sgt.", "St.",
+                   "vol.", "vs.", "Sen.", "Rep.", "Pres.", "Hon.",
+                   "Rev.", "Ph.D.", "M.D.", "M.A.", "p.", "pp.",
+                   "ch.", "sec.", "cf.", "cp."]
+
 --
 -- Writer options
 --
--- a/src/Text/Pandoc/Readers/Markdown.hs
+++ b/src/Text/Pandoc/Readers/Markdown.hs
@ -42,6 +42,7 @@ import Data.Maybe
 import Data.Monoid ((<>))
 import Data.Ord (comparing)
 import Data.Scientific (base10Exponent, coefficient)
+import qualified Data.Set as Set
 import Data.Text (Text)
 import qualified Data.Text as T
 import qualified Data.Vector as V
@ -1688,32 +1689,22 @@ nonEndline = satisfy (/='\n')

 str :: PandocMonad m => MarkdownParser m (F Inlines)
 str = do
-  result <- many1 alphaNum
+  result <- many1 (alphaNum <|> try (char '.' <* notFollowedBy (char '.')))
  updateLastStrPos
-  let spacesToNbr = map (\c -> if c == ' ' then '\160' else c)
-  isSmart <- extensionEnabled Ext_smart <$> getOption readerExtensions
-  if isSmart
-     then case likelyAbbrev result of
-               []        -> return $ return $ B.str result
-               xs        -> choice (map (\x ->
-                               try (string x >> oneOf " \n" >>
-                                    lookAhead alphaNum >>
-                                    return (return $ B.str
-                                                  $ result ++ spacesToNbr x ++ "\160"))) xs)
-                           <|> (return $ return $ B.str result)
-     else return $ return $ B.str result
-
-- | if the string matches the beginning of an abbreviation (before
-- the first period, return strings that would finish the abbreviation.
-likelyAbbrev :: String -> [String]
-likelyAbbrev x =
-  let abbrevs = [ "Mr.", "Mrs.", "Ms.", "Capt.", "Dr.", "Prof.",
-                  "Gen.", "Gov.", "e.g.", "i.e.", "Sgt.", "St.",
-                  "vol.", "vs.", "Sen.", "Rep.", "Pres.", "Hon.",
-                  "Rev.", "Ph.D.", "M.D.", "M.A.", "p.", "pp.",
-                  "ch.", "sec.", "cf.", "cp."]
-      abbrPairs = map (break (=='.')) abbrevs
-  in  map snd $ filter (\(y,_) -> y == x) abbrPairs
+  (do guardEnabled Ext_smart
+      abbrevs <- getOption readerAbbreviations
+      if not (null result) && last result == '.' && result `Set.member` abbrevs
+         then try (do ils <- whitespace <|> endline
+                      lookAhead alphaNum
+                      return $ do
+                        ils' <- ils
+                        if ils' == B.space
+                           then return (B.str result <> B.str "\160")
+                           else -- linebreak or softbreak
+                                return (ils' <> B.str result <> B.str "\160"))
+                <|> return (return (B.str result))
+         else return (return (B.str result)))
+     <|> return (return (B.str result))

 -- an endline character that can be treated as a space, not a structural break
 endline :: PandocMonad m => MarkdownParser m (F Inlines)