Fix parsing errors forgetting to make sure there's a space after special operator arguments like names and stringObjects

This commit is contained in:
Tissevert 2019-09-28 09:25:59 +02:00
parent 32efdcdd6b
commit b8ca7281aa
1 changed files with 22 additions and 15 deletions

View File

@ -5,7 +5,7 @@ module PDF.Text {-(
)-} where )-} where
import Control.Applicative ((<|>)) import Control.Applicative ((<|>))
import Control.Monad (foldM, join) import Control.Monad (foldM)
import Control.Monad.Reader (ReaderT, runReaderT, asks) import Control.Monad.Reader (ReaderT, runReaderT, asks)
import Control.Monad.State (get, put) import Control.Monad.State (get, put)
import Data.Attoparsec.ByteString.Char8 (choice, count, sepBy) import Data.Attoparsec.ByteString.Char8 (choice, count, sepBy)
@ -18,7 +18,7 @@ import PDF.Object (
DirectObject(..), StringObject(..) DirectObject(..), StringObject(..)
, array, blank, name, parseBytes, regular, stringObject , array, blank, name, parseBytes, regular, stringObject
) )
import PDF.Parser (MonadParser, Parser, evalParser, string, takeAll) import PDF.Parser (MonadParser, (<?>), Parser, evalParser, string, takeAll)
data StateOperator = data StateOperator =
Cm | W | J | J_ | M | D | Ri | I | Gs -- general graphic state Cm | W | J | J_ | M | D | Ri | I | Gs -- general graphic state
@ -40,24 +40,22 @@ stateOperator M = (,) M <$> count 1 argument <* string "M"
stateOperator D = (,) D <$> count 2 argument <* string "d" stateOperator D = (,) D <$> count 2 argument <* string "d"
stateOperator Ri = (,) Ri <$> count 1 argument <* string "ri" stateOperator Ri = (,) Ri <$> count 1 argument <* string "ri"
stateOperator I = (,) I <$> count 1 argument <* string "i" stateOperator I = (,) I <$> count 1 argument <* string "i"
stateOperator Gs = (,) Gs <$> count 1 argument <* string "gs" stateOperator Gs = (,) Gs <$> count 1 nameArg <* string "gs"
textOperator :: MonadParser m => TextOperator -> m (Call TextOperator) textOperator :: MonadParser m => TextOperator -> m (Call TextOperator)
textOperator Td = (,) Td <$> count 2 argument <* string "Td" textOperator Td = (,) Td <$> count 2 argument <* string "Td"
textOperator TD = (,) TD <$> count 2 argument <* string "TD" textOperator TD = (,) TD <$> count 2 argument <* string "TD"
textOperator Tm = (,) Tm <$> count 6 argument <* string "Tm" textOperator Tm = (,) Tm <$> count 6 argument <* string "Tm"
textOperator Tstar = (,) Td <$> return [] <* string "T*" textOperator Tstar = (,) Td <$> return [] <* string "T*"
textOperator TJ = textOperator TJ = (,) TJ <$> count 1 arrayArg <* string "TJ"
(,) TJ <$> sequence [Typed . Array <$> array] <* blank <* string "TJ" textOperator Tj = (,) Tj <$> count 1 stringArg <* string "Tj"
textOperator Tj = textOperator Quote = (,) Quote <$> count 1 stringArg <* string "'"
(,) Tj <$> sequence [Typed . StringObject <$> stringObject] <* blank <* string "Tj" textOperator DQuote = (,) DQuote <$> count 1 stringArg <* string "\""
textOperator Quote = (,) Quote <$> count 1 argument <* string "'"
textOperator DQuote = (,) DQuote <$> count 1 argument <* string "\""
textOperator Tc = (,) Tc <$> count 1 argument <* string "Tc" textOperator Tc = (,) Tc <$> count 1 argument <* string "Tc"
textOperator Tw = (,) Tw <$> count 1 argument <* string "Tw" textOperator Tw = (,) Tw <$> count 1 argument <* string "Tw"
textOperator Tz = (,) Tz <$> count 1 argument <* string "Tz" textOperator Tz = (,) Tz <$> count 1 argument <* string "Tz"
textOperator TL = (,) TL <$> count 1 argument <* string "TL" textOperator TL = (,) TL <$> count 1 argument <* string "TL"
textOperator Tf = (,) Tf <$> sequence [Typed . NameObject <$> name, argument] <* string "Tf" textOperator Tf = (,) Tf <$> sequence [nameArg, argument] <* string "Tf"
textOperator Tr = (,) Tr <$> count 1 argument <* string "Tr" textOperator Tr = (,) Tr <$> count 1 argument <* string "Tr"
textOperator Ts = (,) Ts <$> count 1 argument <* string "Ts" textOperator Ts = (,) Ts <$> count 1 argument <* string "Ts"
@ -67,6 +65,15 @@ a parserGenerator = choice $ parserGenerator <$> [minBound .. maxBound]
argument :: MonadParser m => m Argument argument :: MonadParser m => m Argument
argument = Raw <$> takeAll regular <* blank argument = Raw <$> takeAll regular <* blank
arrayArg :: MonadParser m => m Argument
arrayArg = Typed . Array <$> array <* blank
nameArg :: MonadParser m => m Argument
nameArg = Typed . NameObject <$> name <* blank
stringArg :: MonadParser m => m Argument
stringArg = Typed . StringObject <$> stringObject <* blank
data PageContents = PageContents { data PageContents = PageContents {
chunks :: [ByteString] chunks :: [ByteString]
} }
@ -78,20 +85,20 @@ pageContents font input =
evalParser (runReaderT (PageContents <$> page) font) emptyCMap input evalParser (runReaderT (PageContents <$> page) font) emptyCMap input
page :: ParserWithFont [ByteString] page :: ParserWithFont [ByteString]
page = graphicState <|> text page = graphicState <|> text <?> "Text page contents"
graphicState :: ParserWithFont [ByteString] graphicState :: ParserWithFont [ByteString]
graphicState = graphicState =
string "q" *> blank *> insideQ <* string "Q" (string "q" *> blank *> insideQ <* blank <* string "Q") <?> "Graphic state"
where where
insideQ = join <$> ((command <|> page) `sepBy` blank ) insideQ = concat <$> ((command <|> page) `sepBy` blank )
command = a stateOperator *> return [] command = a stateOperator *> return []
text :: ParserWithFont [ByteString] text :: ParserWithFont [ByteString]
text = text =
string "BT" *> blank *> commands <* blank <* string "ET" string "BT" *> blank *> commands <* blank <* string "ET" <?> "Text operators"
where where
commands = join <$> (a textOperator >>= runOperator) `sepBy` blank commands = concat <$> (a textOperator >>= runOperator) `sepBy` blank
runOperator :: Call TextOperator -> ParserWithFont [ByteString] runOperator :: Call TextOperator -> ParserWithFont [ByteString]
runOperator (Tf, [Typed (NameObject fontName), _]) = runOperator (Tf, [Typed (NameObject fontName), _]) =