Markdown reader: improved efficiency of abbreviation parsing.

Instead of a separate abbrev parser, we just check for abbreviations
each time we parse a string.  This gives a huge performance boost
with -S.  Resolves Issue #141.

git-svn-id: https://pandoc.googlecode.com/svn/trunk@1570 788f1e2b-df1e-0410-8736-df70ead52e1b
This commit is contained in:
fiddlosopher 2009-04-30 04:39:45 +00:00
parent 490c4304f5
commit ab100f7c5e
3 changed files with 28 additions and 31 deletions

View file

@ -34,7 +34,7 @@ module Text.Pandoc.Readers.Markdown (
import Data.List ( transpose, isPrefixOf, isSuffixOf, lookup, sortBy, findIndex, intercalate )
import Data.Ord ( comparing )
import Data.Char ( isAlphaNum, isAlpha, isLower, isDigit, isUpper )
import Data.Char ( isAlphaNum, isUpper )
import Data.Maybe
import Text.Pandoc.Definition
import Text.Pandoc.Shared
@ -820,8 +820,7 @@ inline :: GenParser Char ParserState Inline
inline = choice inlineParsers <?> "inline"
inlineParsers :: [GenParser Char ParserState Inline]
inlineParsers = [ abbrev
, str
inlineParsers = [ str
, smartPunctuation
, whitespace
, endline
@ -944,30 +943,6 @@ subscript = failIfStrict >> enclosed (char '~') (char '~')
(notFollowedBy spaceChar >> inline) >>= -- may not contain Space
return . Subscript
abbrev :: GenParser Char ParserState Inline
abbrev = failUnlessSmart >>
(assumedAbbrev <|> knownAbbrev) >>= return . Str . (++ ".\160")
-- an string of letters followed by a period that does not end a sentence
-- is assumed to be an abbreviation. It is assumed that sentences don't
-- start with lowercase letters or numerals.
assumedAbbrev :: GenParser Char ParserState [Char]
assumedAbbrev = try $ do
result <- many1 $ satisfy isAlpha
string ". "
lookAhead $ satisfy (\x -> isLower x || isDigit x)
return result
-- these strings are treated as abbreviations even if they are followed
-- by a capital letter (such as a name).
knownAbbrev :: GenParser Char ParserState [Char]
knownAbbrev = try $ do
result <- oneOfStrings [ "Mr", "Mrs", "Ms", "Capt", "Dr", "Prof", "Gen",
"Gov", "e.g", "i.e", "Sgt", "St", "vol", "vs",
"Sen", "Rep", "Pres", "Hon", "Rev" ]
string ". "
return result
smartPunctuation :: GenParser Char ParserState Inline
smartPunctuation = failUnlessSmart >>
choice [ quoted, apostrophe, dash, ellipses ]
@ -1060,8 +1035,30 @@ nonEndline = satisfy (/='\n')
strChar :: GenParser Char st Char
strChar = noneOf (specialChars ++ " \t\n")
str :: GenParser Char st Inline
str = many1 strChar >>= return . Str
str :: GenParser Char ParserState Inline
str = do
result <- many1 strChar
state <- getState
if stateSmart state
then case likelyAbbrev result of
[] -> return $ Str result
xs -> choice (map (\x ->
try (string x >> char ' ' >>
notFollowedBy spaceChar >>
return (Str $ result ++ x ++ "\160"))) xs)
<|> (return $ Str result)
else return $ Str result
-- | if the string matches the beginning of an abbreviation (before
-- the first period, return strings that would finish the abbreviation.
likelyAbbrev :: String -> [String]
likelyAbbrev x =
let abbrevs = [ "Mr.", "Mrs.", "Ms.", "Capt.", "Dr.", "Prof.",
"Gen.", "Gov.", "e.g.", "i.e.", "Sgt.", "St.",
"vol.", "vs.", "Sen.", "Rep.", "Pres.", "Hon.",
"Rev.", "Ph.D.", "M.D.", "M.A." ]
abbrPairs = map (break (=='.')) abbrevs
in map snd $ filter (\(y,_) -> y == x) abbrPairs
-- an endline character that can be treated as a space, not a structural break
endline :: GenParser Char ParserState Inline

View file

@ -165,7 +165,7 @@ Pandoc (Meta [Str "Pandoc",Space,Str "Test",Space,Str "Suite"] ["John MacFarlane
[ [ Plain [Str "Nested",Str "."] ]
] ] ]
, Para [Str "Should",Space,Str "not",Space,Str "be",Space,Str "a",Space,Str "list",Space,Str "item:"]
, Para [Str "M",Str ".",Str "A.\160",Str "2007"]
, Para [Str "M.A.\160",Str "2007"]
, Para [Str "B",Str ".",Space,Str "Williams"]
, HorizontalRule
, Header 1 [Str "Definition",Space,Str "Lists"]

View file

@ -165,7 +165,7 @@ Pandoc (Meta [Str "Pandoc",Space,Str "Test",Space,Str "Suite"] ["John MacFarlane
[ [ Plain [Str "Nested",Str "."] ]
] ] ]
, Para [Str "Should",Space,Str "not",Space,Str "be",Space,Str "a",Space,Str "list",Space,Str "item:"]
, Para [Str "M",Str ".",Str "A.\160",Str "2007"]
, Para [Str "M.A.\160",Str "2007"]
, Para [Str "B",Str ".",Space,Str "Williams"]
, HorizontalRule
, Header 1 [Str "Definition",Space,Str "Lists"]