Finish implementing reading, still bugs to investigate

This commit is contained in:
Tissevert 2019-09-27 12:21:06 +02:00
parent 1dd22c3889
commit 0374b72920
3 changed files with 33 additions and 22 deletions

View file

@ -6,8 +6,8 @@ import Control.Monad (foldM)
import Control.Monad.RWS (RWST(..), ask, evalRWST, get, modify) import Control.Monad.RWS (RWST(..), ask, evalRWST, get, modify)
import Control.Monad.Writer (tell) import Control.Monad.Writer (tell)
import Data.ByteString (ByteString) import Data.ByteString (ByteString)
import qualified Data.ByteString.Char8 as BS (readFile, putStrLn) import qualified Data.ByteString.Char8 as BS (pack, putStrLn, readFile)
import qualified Data.ByteString.Lazy.Char8 as Lazy (fromStrict, toStrict) import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict)
import Data.Map (Map, (!)) import Data.Map (Map, (!))
import qualified Data.Map as Map (empty, insert, lookup, toList) import qualified Data.Map as Map (empty, insert, lookup, toList)
import PDF (Document(..), parseDocument) import PDF (Document(..), parseDocument)
@ -29,12 +29,16 @@ list l = RWST (\_ s -> fillContext s <$> l)
where where
fillContext s a = (a, s, []) fillContext s a = (a, s, [])
handleError :: a -> String -> T a
handleError defaultValue s =
(tell . replicate 1 . BS.pack $ "Warning: " ++ s) >> return defaultValue
extractText :: Object -> T () extractText :: Object -> T ()
extractText object = do extractText object = do
pageDict <- dict object pageDict <- dict object
cMappers <- loadCMappers =<< getFont pageDict cMappers <- loadCMappers =<< getFont pageDict
contents <- stream =<< follow =<< key "Contents" pageDict contents <- stream =<< follow =<< key "Contents" pageDict
either (return . const ()) (tell . chunks) (pageContents cMappers contents) either (handleError ()) (tell . chunks) (pageContents cMappers contents)
stream :: Object -> T ByteString stream :: Object -> T ByteString
stream (Stream {header, streamContent}) = return $ stream (Stream {header, streamContent}) = return $
@ -68,7 +72,7 @@ loadFont objectId =
>>= key "ToUnicode" >>= key "ToUnicode"
>>= follow >>= follow
>>= stream >>= stream
>>= either (return . const emptyCMap) return . cMap >>= either (handleError emptyCMap) return . cMap
loadCMappers :: Dictionary -> T CMappers loadCMappers :: Dictionary -> T CMappers
loadCMappers = foldM loadCMapper Map.empty . Map.toList loadCMappers = foldM loadCMapper Map.empty . Map.toList

View file

@ -51,7 +51,7 @@ instance MonadParser Atto.Parser where
decNumber = Atto.takeWhile1 (`Set.member` digits) decNumber = Atto.takeWhile1 (`Set.member` digits)
hexNumber = Atto.takeWhile1 (`Set.member` hexDigits) hexNumber = Atto.takeWhile1 (`Set.member` hexDigits)
oneOf charSet = Atto.satisfy (`elem` charSet) oneOf charSet = Atto.satisfy (`elem` charSet)
string = Atto.string string s = Atto.string s <?> show s
takeAll = Atto.takeWhile takeAll = Atto.takeWhile
takeAll1 = Atto.takeWhile1 takeAll1 = Atto.takeWhile1

View file

@ -1,34 +1,31 @@
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
module PDF.Text {-( module PDF.Text {
CMap CMap
, CMappers , CMappers
, PageContents(..) , PageContents(..)
, cMap , cMap
, emptyCMap , emptyCMap
, pageContents , pageContents
)-} where ) where
import Control.Applicative ((<|>), many) import Control.Applicative ((<|>), many)
import Control.Monad (foldM, join) import Control.Monad (foldM, join)
import Control.Monad.Reader (ReaderT, runReaderT, asks) import Control.Monad.Reader (ReaderT, runReaderT, asks)
import Control.Monad.State (put) import Control.Monad.State (get, put)
import Data.Attoparsec.ByteString.Char8 (choice, count, parseOnly, sepBy) import Data.Attoparsec.ByteString.Char8 (choice, count, parseOnly, sepBy)
import qualified Data.Attoparsec.ByteString.Char8 as Atto (Parser) import qualified Data.Attoparsec.ByteString.Char8 as Atto (Parser)
import Data.ByteString (ByteString, pack) import Data.ByteString (ByteString, pack)
import qualified Data.ByteString as BS (head)
import Data.ByteString.Char8 as Char8 (pack)
import qualified Data.ByteString.UTF8 as UTF8 (fromString)
import Data.Map (Map, (!)) import Data.Map (Map, (!))
import qualified Data.Map as Map (empty, fromList) import qualified Data.Map as Map (empty, fromList, lookup)
import Data.Text (Text, snoc) import Data.Text (Text, snoc)
import qualified Data.Text as Text (init, last, null, unpack) import qualified Data.Text as Text (init, last, null, unpack)
import Data.Text.Encoding (decodeUtf16BE) import Data.Text.Encoding (decodeUtf16BE, encodeUtf8)
import PDF.Object ( import PDF.Object (
DirectObject(..), Name, StringObject(..) DirectObject(..), Name, StringObject(..)
, array, blank, directObject, integer, line, name, regular, stringObject , array, blank, directObject, integer, line, name, regular, stringObject
) )
import qualified PDF.EOL as EOL (charset, parser) import qualified PDF.EOL as EOL (charset, parser)
import PDF.Parser (Parser, evalParser, string, takeAll) import PDF.Parser (MonadParser, Parser, evalParser, string, takeAll)
type CMappers = Map Name CMap type CMappers = Map Name CMap
type CMap = Map Int Text type CMap = Map Int Text
@ -106,7 +103,7 @@ data TextOperator =
data Argument = Raw ByteString | Typed DirectObject data Argument = Raw ByteString | Typed DirectObject
type Call a = (a, [Argument]) type Call a = (a, [Argument])
stateOperator :: StateOperator -> ParserWithFont (Call StateOperator) stateOperator :: MonadParser m => StateOperator -> m (Call StateOperator)
stateOperator Cm = (,) Cm <$> count 6 argument <* string "cm" stateOperator Cm = (,) Cm <$> count 6 argument <* string "cm"
stateOperator W = (,) W <$> count 1 argument <* string "w" stateOperator W = (,) W <$> count 1 argument <* string "w"
stateOperator J = (,) J <$> count 1 argument <* string "J" stateOperator J = (,) J <$> count 1 argument <* string "J"
@ -117,15 +114,15 @@ stateOperator Ri = (,) Ri <$> count 1 argument <* string "ri"
stateOperator I = (,) I <$> count 1 argument <* string "i" stateOperator I = (,) I <$> count 1 argument <* string "i"
stateOperator Gs = (,) Gs <$> count 1 argument <* string "gs" stateOperator Gs = (,) Gs <$> count 1 argument <* string "gs"
textOperator :: TextOperator -> ParserWithFont (Call TextOperator) textOperator :: MonadParser m => TextOperator -> m (Call TextOperator)
textOperator Td = (,) Td <$> count 2 argument <* string "Td" textOperator Td = (,) Td <$> count 2 argument <* string "Td"
textOperator TD = (,) TD <$> count 2 argument <* string "TD" textOperator TD = (,) TD <$> count 2 argument <* string "TD"
textOperator Tm = (,) Tm <$> count 6 argument <* string "Tm" textOperator Tm = (,) Tm <$> count 6 argument <* string "Tm"
textOperator Tstar = (,) Td <$> return [] <* string "T*" textOperator Tstar = (,) Td <$> return [] <* string "T*"
textOperator TJ = textOperator TJ =
(,) TJ <$> sequence [Typed . Array <$> array] <* string "TJ" (,) TJ <$> sequence [Typed . Array <$> array] <* blank <* string "TJ"
textOperator Tj = textOperator Tj =
(,) Tj <$> sequence [Typed . StringObject <$> stringObject] <* string "Tj" (,) Tj <$> sequence [Typed . StringObject <$> stringObject] <* blank <* string "Tj"
textOperator Quote = (,) Quote <$> count 1 argument <* string "'" textOperator Quote = (,) Quote <$> count 1 argument <* string "'"
textOperator DQuote = (,) DQuote <$> count 1 argument <* string "\"" textOperator DQuote = (,) DQuote <$> count 1 argument <* string "\""
textOperator Tc = (,) Tc <$> count 1 argument <* string "Tc" textOperator Tc = (,) Tc <$> count 1 argument <* string "Tc"
@ -139,7 +136,7 @@ textOperator Ts = (,) Ts <$> count 1 argument <* string "Ts"
a :: (Bounded o, Enum o) => (o -> ParserWithFont (Call o)) -> ParserWithFont (Call o) a :: (Bounded o, Enum o) => (o -> ParserWithFont (Call o)) -> ParserWithFont (Call o)
a parserGenerator = choice $ parserGenerator <$> [minBound .. maxBound] a parserGenerator = choice $ parserGenerator <$> [minBound .. maxBound]
argument :: ParserWithFont Argument argument :: MonadParser m => m Argument
argument = Raw <$> takeAll regular <* blank argument = Raw <$> takeAll regular <* blank
data PageContents = PageContents { data PageContents = PageContents {
@ -194,7 +191,17 @@ runOperator _ = return []
decodeString :: StringObject -> ParserWithFont ByteString decodeString :: StringObject -> ParserWithFont ByteString
decodeString (Hexadecimal h) = decodeString (Literal (Text.unpack $ parseText h)) decodeString (Hexadecimal h) = decodeString (Literal (Text.unpack $ parseText h))
decodeString (Literal s) = decodeString (Literal litString) = get >>= convertBytes litString
asks
where where
bytes = Char8.pack s convertBytes :: String -> CMap -> ParserWithFont ByteString
convertBytes [] _ = return ""
convertBytes (c:cs) someCMap = do
convertBytesAux (fromEnum c) 1 cs someCMap
convertBytesAux :: Int -> Int -> String -> CMap -> ParserWithFont ByteString
convertBytesAux code size s someCMap
| size > 4 = fail "Could not match any input code smaller than an int"
| otherwise =
case (Map.lookup code someCMap, s) of
(Nothing, (c:cs)) -> convertBytesAux (code * 256 + fromEnum c) (size + 1) cs someCMap
(Nothing, []) -> fail "No character left to read but no code recognized"
(Just outputText, _) -> mappend (encodeUtf8 outputText) <$> convertBytes s someCMap