Added readerAbbreviations to ParserState.

Markdown reader now consults this to determine what is an
abbreviation.

Eventually it will be possible to specify a custom list
(see #256).
This commit is contained in:
John MacFarlane 2017-03-05 10:24:39 +01:00
parent 7fc6919f90
commit 95f2726ee7
2 changed files with 27 additions and 25 deletions

View file

@ -47,6 +47,7 @@ module Text.Pandoc.Options ( module Text.Pandoc.Extensions
) where
import Data.Data (Data)
import Data.Default
import qualified Data.Set as Set
import Data.Typeable (Typeable)
import GHC.Generics (Generic)
import Text.Pandoc.Extensions
@ -60,6 +61,7 @@ data ReaderOptions = ReaderOptions{
, readerApplyMacros :: Bool -- ^ Apply macros to TeX math
, readerIndentedCodeClasses :: [String] -- ^ Default classes for
-- indented code blocks
, readerAbbreviations :: Set.Set String -- ^ Strings to treat as abbreviations
, readerDefaultImageExtension :: String -- ^ Default extension for images
, readerTrackChanges :: TrackChanges
} deriving (Show, Read, Data, Typeable, Generic)
@ -72,10 +74,19 @@ instance Default ReaderOptions
, readerTabStop = 4
, readerApplyMacros = True
, readerIndentedCodeClasses = []
, readerAbbreviations = defaultAbbrevs
, readerDefaultImageExtension = ""
, readerTrackChanges = AcceptChanges
}
defaultAbbrevs :: Set.Set String
defaultAbbrevs = Set.fromList
[ "Mr.", "Mrs.", "Ms.", "Capt.", "Dr.", "Prof.",
"Gen.", "Gov.", "e.g.", "i.e.", "Sgt.", "St.",
"vol.", "vs.", "Sen.", "Rep.", "Pres.", "Hon.",
"Rev.", "Ph.D.", "M.D.", "M.A.", "p.", "pp.",
"ch.", "sec.", "cf.", "cp."]
--
-- Writer options
--

View file

@ -42,6 +42,7 @@ import Data.Maybe
import Data.Monoid ((<>))
import Data.Ord (comparing)
import Data.Scientific (base10Exponent, coefficient)
import qualified Data.Set as Set
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Vector as V
@ -1688,32 +1689,22 @@ nonEndline = satisfy (/='\n')
str :: PandocMonad m => MarkdownParser m (F Inlines)
str = do
result <- many1 alphaNum
result <- many1 (alphaNum <|> try (char '.' <* notFollowedBy (char '.')))
updateLastStrPos
let spacesToNbr = map (\c -> if c == ' ' then '\160' else c)
isSmart <- extensionEnabled Ext_smart <$> getOption readerExtensions
if isSmart
then case likelyAbbrev result of
[] -> return $ return $ B.str result
xs -> choice (map (\x ->
try (string x >> oneOf " \n" >>
lookAhead alphaNum >>
return (return $ B.str
$ result ++ spacesToNbr x ++ "\160"))) xs)
<|> (return $ return $ B.str result)
else return $ return $ B.str result
-- | if the string matches the beginning of an abbreviation (before
-- the first period, return strings that would finish the abbreviation.
likelyAbbrev :: String -> [String]
likelyAbbrev x =
let abbrevs = [ "Mr.", "Mrs.", "Ms.", "Capt.", "Dr.", "Prof.",
"Gen.", "Gov.", "e.g.", "i.e.", "Sgt.", "St.",
"vol.", "vs.", "Sen.", "Rep.", "Pres.", "Hon.",
"Rev.", "Ph.D.", "M.D.", "M.A.", "p.", "pp.",
"ch.", "sec.", "cf.", "cp."]
abbrPairs = map (break (=='.')) abbrevs
in map snd $ filter (\(y,_) -> y == x) abbrPairs
(do guardEnabled Ext_smart
abbrevs <- getOption readerAbbreviations
if not (null result) && last result == '.' && result `Set.member` abbrevs
then try (do ils <- whitespace <|> endline
lookAhead alphaNum
return $ do
ils' <- ils
if ils' == B.space
then return (B.str result <> B.str "\160")
else -- linebreak or softbreak
return (ils' <> B.str result <> B.str "\160"))
<|> return (return (B.str result))
else return (return (B.str result)))
<|> return (return (B.str result))
-- an endline character that can be treated as a space, not a structural break
endline :: PandocMonad m => MarkdownParser m (F Inlines)