From 0374b72920be12a0e2fe4e371e13c3389a295365 Mon Sep 17 00:00:00 2001 From: Tissevert Date: Fri, 27 Sep 2019 12:21:06 +0200 Subject: [PATCH] Finish implementing reading, still bugs to investigate --- examples/getText.hs | 12 ++++++++---- src/PDF/Parser.hs | 2 +- src/PDF/Text.hs | 41 ++++++++++++++++++++++++----------------- 3 files changed, 33 insertions(+), 22 deletions(-) diff --git a/examples/getText.hs b/examples/getText.hs index c0ba6a2..0515104 100755 --- a/examples/getText.hs +++ b/examples/getText.hs @@ -6,8 +6,8 @@ import Control.Monad (foldM) import Control.Monad.RWS (RWST(..), ask, evalRWST, get, modify) import Control.Monad.Writer (tell) import Data.ByteString (ByteString) -import qualified Data.ByteString.Char8 as BS (readFile, putStrLn) -import qualified Data.ByteString.Lazy.Char8 as Lazy (fromStrict, toStrict) +import qualified Data.ByteString.Char8 as BS (pack, putStrLn, readFile) +import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict) import Data.Map (Map, (!)) import qualified Data.Map as Map (empty, insert, lookup, toList) import PDF (Document(..), parseDocument) @@ -29,12 +29,16 @@ list l = RWST (\_ s -> fillContext s <$> l) where fillContext s a = (a, s, []) +handleError :: a -> String -> T a +handleError defaultValue s = + (tell . replicate 1 . BS.pack $ "Warning: " ++ s) >> return defaultValue + extractText :: Object -> T () extractText object = do pageDict <- dict object cMappers <- loadCMappers =<< getFont pageDict contents <- stream =<< follow =<< key "Contents" pageDict - either (return . const ()) (tell . chunks) (pageContents cMappers contents) + either (handleError ()) (tell . chunks) (pageContents cMappers contents) stream :: Object -> T ByteString stream (Stream {header, streamContent}) = return $ @@ -68,7 +72,7 @@ loadFont objectId = >>= key "ToUnicode" >>= follow >>= stream - >>= either (return . const emptyCMap) return . cMap + >>= either (handleError emptyCMap) return . cMap loadCMappers :: Dictionary -> T CMappers loadCMappers = foldM loadCMapper Map.empty . Map.toList diff --git a/src/PDF/Parser.hs b/src/PDF/Parser.hs index 0aba5da..c9c380a 100644 --- a/src/PDF/Parser.hs +++ b/src/PDF/Parser.hs @@ -51,7 +51,7 @@ instance MonadParser Atto.Parser where decNumber = Atto.takeWhile1 (`Set.member` digits) hexNumber = Atto.takeWhile1 (`Set.member` hexDigits) oneOf charSet = Atto.satisfy (`elem` charSet) - string = Atto.string + string s = Atto.string s show s takeAll = Atto.takeWhile takeAll1 = Atto.takeWhile1 diff --git a/src/PDF/Text.hs b/src/PDF/Text.hs index fb80526..bfcbeb5 100644 --- a/src/PDF/Text.hs +++ b/src/PDF/Text.hs @@ -1,34 +1,31 @@ {-# LANGUAGE OverloadedStrings #-} -module PDF.Text {-( +module PDF.Text { CMap , CMappers , PageContents(..) , cMap , emptyCMap , pageContents - )-} where + ) where import Control.Applicative ((<|>), many) import Control.Monad (foldM, join) import Control.Monad.Reader (ReaderT, runReaderT, asks) -import Control.Monad.State (put) +import Control.Monad.State (get, put) import Data.Attoparsec.ByteString.Char8 (choice, count, parseOnly, sepBy) import qualified Data.Attoparsec.ByteString.Char8 as Atto (Parser) import Data.ByteString (ByteString, pack) -import qualified Data.ByteString as BS (head) -import Data.ByteString.Char8 as Char8 (pack) -import qualified Data.ByteString.UTF8 as UTF8 (fromString) import Data.Map (Map, (!)) -import qualified Data.Map as Map (empty, fromList) +import qualified Data.Map as Map (empty, fromList, lookup) import Data.Text (Text, snoc) import qualified Data.Text as Text (init, last, null, unpack) -import Data.Text.Encoding (decodeUtf16BE) +import Data.Text.Encoding (decodeUtf16BE, encodeUtf8) import PDF.Object ( DirectObject(..), Name, StringObject(..) , array, blank, directObject, integer, line, name, regular, stringObject ) import qualified PDF.EOL as EOL (charset, parser) -import PDF.Parser (Parser, evalParser, string, takeAll) +import PDF.Parser (MonadParser, Parser, evalParser, string, takeAll) type CMappers = Map Name CMap type CMap = Map Int Text @@ -106,7 +103,7 @@ data TextOperator = data Argument = Raw ByteString | Typed DirectObject type Call a = (a, [Argument]) -stateOperator :: StateOperator -> ParserWithFont (Call StateOperator) +stateOperator :: MonadParser m => StateOperator -> m (Call StateOperator) stateOperator Cm = (,) Cm <$> count 6 argument <* string "cm" stateOperator W = (,) W <$> count 1 argument <* string "w" stateOperator J = (,) J <$> count 1 argument <* string "J" @@ -117,15 +114,15 @@ stateOperator Ri = (,) Ri <$> count 1 argument <* string "ri" stateOperator I = (,) I <$> count 1 argument <* string "i" stateOperator Gs = (,) Gs <$> count 1 argument <* string "gs" -textOperator :: TextOperator -> ParserWithFont (Call TextOperator) +textOperator :: MonadParser m => TextOperator -> m (Call TextOperator) textOperator Td = (,) Td <$> count 2 argument <* string "Td" textOperator TD = (,) TD <$> count 2 argument <* string "TD" textOperator Tm = (,) Tm <$> count 6 argument <* string "Tm" textOperator Tstar = (,) Td <$> return [] <* string "T*" textOperator TJ = - (,) TJ <$> sequence [Typed . Array <$> array] <* string "TJ" + (,) TJ <$> sequence [Typed . Array <$> array] <* blank <* string "TJ" textOperator Tj = - (,) Tj <$> sequence [Typed . StringObject <$> stringObject] <* string "Tj" + (,) Tj <$> sequence [Typed . StringObject <$> stringObject] <* blank <* string "Tj" textOperator Quote = (,) Quote <$> count 1 argument <* string "'" textOperator DQuote = (,) DQuote <$> count 1 argument <* string "\"" textOperator Tc = (,) Tc <$> count 1 argument <* string "Tc" @@ -139,7 +136,7 @@ textOperator Ts = (,) Ts <$> count 1 argument <* string "Ts" a :: (Bounded o, Enum o) => (o -> ParserWithFont (Call o)) -> ParserWithFont (Call o) a parserGenerator = choice $ parserGenerator <$> [minBound .. maxBound] -argument :: ParserWithFont Argument +argument :: MonadParser m => m Argument argument = Raw <$> takeAll regular <* blank data PageContents = PageContents { @@ -194,7 +191,17 @@ runOperator _ = return [] decodeString :: StringObject -> ParserWithFont ByteString decodeString (Hexadecimal h) = decodeString (Literal (Text.unpack $ parseText h)) -decodeString (Literal s) = - asks +decodeString (Literal litString) = get >>= convertBytes litString where - bytes = Char8.pack s + convertBytes :: String -> CMap -> ParserWithFont ByteString + convertBytes [] _ = return "" + convertBytes (c:cs) someCMap = do + convertBytesAux (fromEnum c) 1 cs someCMap + convertBytesAux :: Int -> Int -> String -> CMap -> ParserWithFont ByteString + convertBytesAux code size s someCMap + | size > 4 = fail "Could not match any input code smaller than an int" + | otherwise = + case (Map.lookup code someCMap, s) of + (Nothing, (c:cs)) -> convertBytesAux (code * 256 + fromEnum c) (size + 1) cs someCMap + (Nothing, []) -> fail "No character left to read but no code recognized" + (Just outputText, _) -> mappend (encodeUtf8 outputText) <$> convertBytes s someCMap