Markdown reader: improved efficiency of abbreviation parsing.
Instead of a separate abbrev parser, we just check for abbreviations each time we parse a string. This gives a huge performance boost with -S. Resolves Issue #141.
git-svn-id: https://pandoc.googlecode.com/svn/trunk@1570 788f1e2b-df1e-0410-8736-df70ead52e1b
This commit is contained in:
parent
490c4304f5
commit
ab100f7c5e
3 changed files with 28 additions and 31 deletions
|
@ -34,7 +34,7 @@ module Text.Pandoc.Readers.Markdown (
|
||||||
|
|
||||||
import Data.List ( transpose, isPrefixOf, isSuffixOf, lookup, sortBy, findIndex, intercalate )
|
import Data.List ( transpose, isPrefixOf, isSuffixOf, lookup, sortBy, findIndex, intercalate )
|
||||||
import Data.Ord ( comparing )
|
import Data.Ord ( comparing )
|
||||||
import Data.Char ( isAlphaNum, isAlpha, isLower, isDigit, isUpper )
|
import Data.Char ( isAlphaNum, isUpper )
|
||||||
import Data.Maybe
|
import Data.Maybe
|
||||||
import Text.Pandoc.Definition
|
import Text.Pandoc.Definition
|
||||||
import Text.Pandoc.Shared
|
import Text.Pandoc.Shared
|
||||||
|
@ -820,8 +820,7 @@ inline :: GenParser Char ParserState Inline
|
||||||
inline = choice inlineParsers <?> "inline"
|
inline = choice inlineParsers <?> "inline"
|
||||||
|
|
||||||
inlineParsers :: [GenParser Char ParserState Inline]
|
inlineParsers :: [GenParser Char ParserState Inline]
|
||||||
inlineParsers = [ abbrev
|
inlineParsers = [ str
|
||||||
, str
|
|
||||||
, smartPunctuation
|
, smartPunctuation
|
||||||
, whitespace
|
, whitespace
|
||||||
, endline
|
, endline
|
||||||
|
@ -944,30 +943,6 @@ subscript = failIfStrict >> enclosed (char '~') (char '~')
|
||||||
(notFollowedBy spaceChar >> inline) >>= -- may not contain Space
|
(notFollowedBy spaceChar >> inline) >>= -- may not contain Space
|
||||||
return . Subscript
|
return . Subscript
|
||||||
|
|
||||||
abbrev :: GenParser Char ParserState Inline
|
|
||||||
abbrev = failUnlessSmart >>
|
|
||||||
(assumedAbbrev <|> knownAbbrev) >>= return . Str . (++ ".\160")
|
|
||||||
|
|
||||||
-- an string of letters followed by a period that does not end a sentence
|
|
||||||
-- is assumed to be an abbreviation. It is assumed that sentences don't
|
|
||||||
-- start with lowercase letters or numerals.
|
|
||||||
assumedAbbrev :: GenParser Char ParserState [Char]
|
|
||||||
assumedAbbrev = try $ do
|
|
||||||
result <- many1 $ satisfy isAlpha
|
|
||||||
string ". "
|
|
||||||
lookAhead $ satisfy (\x -> isLower x || isDigit x)
|
|
||||||
return result
|
|
||||||
|
|
||||||
-- these strings are treated as abbreviations even if they are followed
|
|
||||||
-- by a capital letter (such as a name).
|
|
||||||
knownAbbrev :: GenParser Char ParserState [Char]
|
|
||||||
knownAbbrev = try $ do
|
|
||||||
result <- oneOfStrings [ "Mr", "Mrs", "Ms", "Capt", "Dr", "Prof", "Gen",
|
|
||||||
"Gov", "e.g", "i.e", "Sgt", "St", "vol", "vs",
|
|
||||||
"Sen", "Rep", "Pres", "Hon", "Rev" ]
|
|
||||||
string ". "
|
|
||||||
return result
|
|
||||||
|
|
||||||
smartPunctuation :: GenParser Char ParserState Inline
|
smartPunctuation :: GenParser Char ParserState Inline
|
||||||
smartPunctuation = failUnlessSmart >>
|
smartPunctuation = failUnlessSmart >>
|
||||||
choice [ quoted, apostrophe, dash, ellipses ]
|
choice [ quoted, apostrophe, dash, ellipses ]
|
||||||
|
@ -1060,8 +1035,30 @@ nonEndline = satisfy (/='\n')
|
||||||
strChar :: GenParser Char st Char
|
strChar :: GenParser Char st Char
|
||||||
strChar = noneOf (specialChars ++ " \t\n")
|
strChar = noneOf (specialChars ++ " \t\n")
|
||||||
|
|
||||||
str :: GenParser Char st Inline
|
str :: GenParser Char ParserState Inline
|
||||||
str = many1 strChar >>= return . Str
|
str = do
|
||||||
|
result <- many1 strChar
|
||||||
|
state <- getState
|
||||||
|
if stateSmart state
|
||||||
|
then case likelyAbbrev result of
|
||||||
|
[] -> return $ Str result
|
||||||
|
xs -> choice (map (\x ->
|
||||||
|
try (string x >> char ' ' >>
|
||||||
|
notFollowedBy spaceChar >>
|
||||||
|
return (Str $ result ++ x ++ "\160"))) xs)
|
||||||
|
<|> (return $ Str result)
|
||||||
|
else return $ Str result
|
||||||
|
|
||||||
|
-- | if the string matches the beginning of an abbreviation (before
|
||||||
|
-- the first period), return strings that would finish the abbreviation.
|
||||||
|
likelyAbbrev :: String -> [String]
|
||||||
|
likelyAbbrev x =
|
||||||
|
let abbrevs = [ "Mr.", "Mrs.", "Ms.", "Capt.", "Dr.", "Prof.",
|
||||||
|
"Gen.", "Gov.", "e.g.", "i.e.", "Sgt.", "St.",
|
||||||
|
"vol.", "vs.", "Sen.", "Rep.", "Pres.", "Hon.",
|
||||||
|
"Rev.", "Ph.D.", "M.D.", "M.A." ]
|
||||||
|
abbrPairs = map (break (=='.')) abbrevs
|
||||||
|
in map snd $ filter (\(y,_) -> y == x) abbrPairs
|
||||||
|
|
||||||
-- an endline character that can be treated as a space, not a structural break
|
-- an endline character that can be treated as a space, not a structural break
|
||||||
endline :: GenParser Char ParserState Inline
|
endline :: GenParser Char ParserState Inline
|
||||||
|
|
|
@ -165,7 +165,7 @@ Pandoc (Meta [Str "Pandoc",Space,Str "Test",Space,Str "Suite"] ["John MacFarlane
|
||||||
[ [ Plain [Str "Nested",Str "."] ]
|
[ [ Plain [Str "Nested",Str "."] ]
|
||||||
] ] ]
|
] ] ]
|
||||||
, Para [Str "Should",Space,Str "not",Space,Str "be",Space,Str "a",Space,Str "list",Space,Str "item:"]
|
, Para [Str "Should",Space,Str "not",Space,Str "be",Space,Str "a",Space,Str "list",Space,Str "item:"]
|
||||||
, Para [Str "M",Str ".",Str "A.\160",Str "2007"]
|
, Para [Str "M.A.\160",Str "2007"]
|
||||||
, Para [Str "B",Str ".",Space,Str "Williams"]
|
, Para [Str "B",Str ".",Space,Str "Williams"]
|
||||||
, HorizontalRule
|
, HorizontalRule
|
||||||
, Header 1 [Str "Definition",Space,Str "Lists"]
|
, Header 1 [Str "Definition",Space,Str "Lists"]
|
||||||
|
|
|
@ -165,7 +165,7 @@ Pandoc (Meta [Str "Pandoc",Space,Str "Test",Space,Str "Suite"] ["John MacFarlane
|
||||||
[ [ Plain [Str "Nested",Str "."] ]
|
[ [ Plain [Str "Nested",Str "."] ]
|
||||||
] ] ]
|
] ] ]
|
||||||
, Para [Str "Should",Space,Str "not",Space,Str "be",Space,Str "a",Space,Str "list",Space,Str "item:"]
|
, Para [Str "Should",Space,Str "not",Space,Str "be",Space,Str "a",Space,Str "list",Space,Str "item:"]
|
||||||
, Para [Str "M",Str ".",Str "A.\160",Str "2007"]
|
, Para [Str "M.A.\160",Str "2007"]
|
||||||
, Para [Str "B",Str ".",Space,Str "Williams"]
|
, Para [Str "B",Str ".",Space,Str "Williams"]
|
||||||
, HorizontalRule
|
, HorizontalRule
|
||||||
, Header 1 [Str "Definition",Space,Str "Lists"]
|
, Header 1 [Str "Definition",Space,Str "Lists"]
|
||||||
|
|
Loading…
Add table
Reference in a new issue