Finish implementing reading, still bugs to investigate

This commit is contained in:
Tissevert 2019-09-27 12:21:06 +02:00
parent 1dd22c3889
commit 0374b72920
3 changed files with 33 additions and 22 deletions

View file

@ -6,8 +6,8 @@ import Control.Monad (foldM)
import Control.Monad.RWS (RWST(..), ask, evalRWST, get, modify)
import Control.Monad.Writer (tell)
import Data.ByteString (ByteString)
import qualified Data.ByteString.Char8 as BS (readFile, putStrLn)
import qualified Data.ByteString.Lazy.Char8 as Lazy (fromStrict, toStrict)
import qualified Data.ByteString.Char8 as BS (pack, putStrLn, readFile)
import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict)
import Data.Map (Map, (!))
import qualified Data.Map as Map (empty, insert, lookup, toList)
import PDF (Document(..), parseDocument)
@ -29,12 +29,16 @@ list l = RWST (\_ s -> fillContext s <$> l)
where
fillContext s a = (a, s, [])
handleError :: a -> String -> T a
handleError defaultValue s =
(tell . replicate 1 . BS.pack $ "Warning: " ++ s) >> return defaultValue
extractText :: Object -> T ()
extractText object = do
pageDict <- dict object
cMappers <- loadCMappers =<< getFont pageDict
contents <- stream =<< follow =<< key "Contents" pageDict
either (return . const ()) (tell . chunks) (pageContents cMappers contents)
either (handleError ()) (tell . chunks) (pageContents cMappers contents)
stream :: Object -> T ByteString
stream (Stream {header, streamContent}) = return $
@ -68,7 +72,7 @@ loadFont objectId =
>>= key "ToUnicode"
>>= follow
>>= stream
>>= either (return . const emptyCMap) return . cMap
>>= either (handleError emptyCMap) return . cMap
loadCMappers :: Dictionary -> T CMappers
loadCMappers = foldM loadCMapper Map.empty . Map.toList

View file

@ -51,7 +51,7 @@ instance MonadParser Atto.Parser where
decNumber = Atto.takeWhile1 (`Set.member` digits)
hexNumber = Atto.takeWhile1 (`Set.member` hexDigits)
oneOf charSet = Atto.satisfy (`elem` charSet)
string = Atto.string
string s = Atto.string s <?> show s
takeAll = Atto.takeWhile
takeAll1 = Atto.takeWhile1

View file

@ -1,34 +1,31 @@
{-# LANGUAGE OverloadedStrings #-}
module PDF.Text {-(
module PDF.Text {
CMap
, CMappers
, PageContents(..)
, cMap
, emptyCMap
, pageContents
)-} where
) where
import Control.Applicative ((<|>), many)
import Control.Monad (foldM, join)
import Control.Monad.Reader (ReaderT, runReaderT, asks)
import Control.Monad.State (put)
import Control.Monad.State (get, put)
import Data.Attoparsec.ByteString.Char8 (choice, count, parseOnly, sepBy)
import qualified Data.Attoparsec.ByteString.Char8 as Atto (Parser)
import Data.ByteString (ByteString, pack)
import qualified Data.ByteString as BS (head)
import Data.ByteString.Char8 as Char8 (pack)
import qualified Data.ByteString.UTF8 as UTF8 (fromString)
import Data.Map (Map, (!))
import qualified Data.Map as Map (empty, fromList)
import qualified Data.Map as Map (empty, fromList, lookup)
import Data.Text (Text, snoc)
import qualified Data.Text as Text (init, last, null, unpack)
import Data.Text.Encoding (decodeUtf16BE)
import Data.Text.Encoding (decodeUtf16BE, encodeUtf8)
import PDF.Object (
DirectObject(..), Name, StringObject(..)
, array, blank, directObject, integer, line, name, regular, stringObject
)
import qualified PDF.EOL as EOL (charset, parser)
import PDF.Parser (Parser, evalParser, string, takeAll)
import PDF.Parser (MonadParser, Parser, evalParser, string, takeAll)
type CMappers = Map Name CMap
type CMap = Map Int Text
@ -106,7 +103,7 @@ data TextOperator =
data Argument = Raw ByteString | Typed DirectObject
type Call a = (a, [Argument])
stateOperator :: StateOperator -> ParserWithFont (Call StateOperator)
stateOperator :: MonadParser m => StateOperator -> m (Call StateOperator)
stateOperator Cm = (,) Cm <$> count 6 argument <* string "cm"
stateOperator W = (,) W <$> count 1 argument <* string "w"
stateOperator J = (,) J <$> count 1 argument <* string "J"
@ -117,15 +114,15 @@ stateOperator Ri = (,) Ri <$> count 1 argument <* string "ri"
stateOperator I = (,) I <$> count 1 argument <* string "i"
stateOperator Gs = (,) Gs <$> count 1 argument <* string "gs"
textOperator :: TextOperator -> ParserWithFont (Call TextOperator)
textOperator :: MonadParser m => TextOperator -> m (Call TextOperator)
textOperator Td = (,) Td <$> count 2 argument <* string "Td"
textOperator TD = (,) TD <$> count 2 argument <* string "TD"
textOperator Tm = (,) Tm <$> count 6 argument <* string "Tm"
textOperator Tstar = (,) Td <$> return [] <* string "T*"
textOperator TJ =
(,) TJ <$> sequence [Typed . Array <$> array] <* string "TJ"
(,) TJ <$> sequence [Typed . Array <$> array] <* blank <* string "TJ"
textOperator Tj =
(,) Tj <$> sequence [Typed . StringObject <$> stringObject] <* string "Tj"
(,) Tj <$> sequence [Typed . StringObject <$> stringObject] <* blank <* string "Tj"
textOperator Quote = (,) Quote <$> count 1 argument <* string "'"
textOperator DQuote = (,) DQuote <$> count 1 argument <* string "\""
textOperator Tc = (,) Tc <$> count 1 argument <* string "Tc"
@ -139,7 +136,7 @@ textOperator Ts = (,) Ts <$> count 1 argument <* string "Ts"
a :: (Bounded o, Enum o) => (o -> ParserWithFont (Call o)) -> ParserWithFont (Call o)
a parserGenerator = choice $ parserGenerator <$> [minBound .. maxBound]
argument :: ParserWithFont Argument
argument :: MonadParser m => m Argument
argument = Raw <$> takeAll regular <* blank
data PageContents = PageContents {
@ -194,7 +191,17 @@ runOperator _ = return []
decodeString :: StringObject -> ParserWithFont ByteString
decodeString (Hexadecimal h) = decodeString (Literal (Text.unpack $ parseText h))
decodeString (Literal s) =
asks
decodeString (Literal litString) = get >>= convertBytes litString
where
bytes = Char8.pack s
convertBytes :: String -> CMap -> ParserWithFont ByteString
convertBytes [] _ = return ""
convertBytes (c:cs) someCMap = do
convertBytesAux (fromEnum c) 1 cs someCMap
convertBytesAux :: Int -> Int -> String -> CMap -> ParserWithFont ByteString
convertBytesAux code size s someCMap
| size > 4 = fail "Could not match any input code smaller than an int"
| otherwise =
case (Map.lookup code someCMap, s) of
(Nothing, (c:cs)) -> convertBytesAux (code * 256 + fromEnum c) (size + 1) cs someCMap
(Nothing, []) -> fail "No character left to read but no code recognized"
(Just outputText, _) -> mappend (encodeUtf8 outputText) <$> convertBytes s someCMap