diff --git a/Hufflepdf.cabal b/Hufflepdf.cabal index f406ab5..f4eb761 100644 --- a/Hufflepdf.cabal +++ b/Hufflepdf.cabal @@ -20,6 +20,7 @@ library , PDF.EOL , PDF.Object , PDF.Output + , PDF.Text , PDF.Update other-modules: Data.ByteString.Char8.Util , PDF.Body @@ -51,3 +52,14 @@ executable getObj , zlib ghc-options: -Wall default-language: Haskell2010 + +executable getText + main-is: examples/getText.hs + build-depends: base + , bytestring + , containers + , Hufflepdf + , mtl + , zlib + ghc-options: -Wall + default-language: Haskell2010 diff --git a/examples/getText.hs b/examples/getText.hs new file mode 100755 index 0000000..76c4347 --- /dev/null +++ b/examples/getText.hs @@ -0,0 +1,124 @@ +{-# LANGUAGE NamedFieldPuns #-} +{-# LANGUAGE OverloadedStrings #-} + +import Codec.Compression.Zlib (decompress) +import Control.Monad.Reader (ask) +import Control.Monad.RWS (RWST(..), evalRWST) +import Control.Monad.Writer (tell) +import Data.ByteString.Lazy.Char8 (ByteString) +import qualified Data.ByteString.Char8 as BS (readFile) +import qualified Data.ByteString.Lazy.Char8 as Lazy (concat, fromStrict, pack, putStrLn, toStrict) +import Data.Map ((!)) +import qualified Data.Map as Map (empty, lookup) +import PDF (Document(..), parseDocument) +import qualified PDF.EOL as EOL (Style) +import PDF.Object ( + Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..) + , Object(..), Name(..), Structure(..) + ,) +import PDF.Output (ObjectId(..)) +import qualified PDF.Output as Output (render) +import PDF.Text (CMap, CMappers, PageContents(..), cMap, pageContents) +import PDF.Update (unify) +import System.Environment (getArgs) +import System.IO (hPutStrLn, stderr) + +--type T a = ReaderT Content [] a +type T a = RWST Content [ByteString] CMappers [] a + +list :: [a] -> T a +list l = RWST (\_ s -> fillContext s <$> l) + where + fillContext s a = (a, s, []) + +extractText :: Object -> T () +extractText object = do + pageDict <- dict object + pageContents <- follow =<< get "Contents" pageDict + case pageContents of + (Stream {header, streamContent}) -> do + font <- getFont pageDict + storeDecodedText font . clear header $ Lazy.fromStrict streamContent + _ -> return () + +clear :: Dictionary -> ByteString -> ByteString +clear header streamContent = + case Map.lookup (Name "Filter") header of + Just (NameObject (Name "FlateDecode")) -> decompress streamContent + _ -> streamContent + +storeDecodedText :: Dictionary -> ByteString -> T () +storeDecodedText font page = + tell . chunks =<< pageContents font page + +getFont :: Dictionary -> T Dictionary +getFont pageDict = + get "Resources" pageDict + >>= dict . Direct + >>= get "Font" + >>= follow + >>= dict + +getObject :: ObjectId -> T Object +getObject objectId = do + content <- ask + return (objects content ! objectId) + +get :: String -> Dictionary -> T DirectObject +get key dictionary = + case Map.lookup (Name key) dictionary of + Just obj -> return obj + _ -> list [] + +follow :: DirectObject -> T Object +follow (Reference (IndirectObjCoordinates {objectId})) = getObject objectId +follow _ = list [] +{- + obj <- getObject objectId + case obj of + Direct directObj -> return directObj + _ -> list [] +-} + +{- +key dictionary = + case Map.lookup (Name key) dictionary of + Just (Reference (IndirectObjCoordinates {objectId})) -> getObject objectId + _ -> list [] +-} + +dict :: Object -> T Dictionary +dict (Direct (Dictionary dictionary)) = return dictionary +dict _ = list [] + +pagesList :: T ObjectId +pagesList = do + root <- dict =<< follow =<< get "Root" . trailer . docStructure =<< ask + pages <- dict =<< follow =<< get "Pages" root + case Map.lookup (Name "Kids") pages of + Just (Array kids) -> list $ filterObjectIds kids + _ -> list [] + +filterObjectIds :: [DirectObject] -> [ObjectId] +filterObjectIds objects = do + object <- objects + case object of + Reference (IndirectObjCoordinates {objectId}) -> [objectId] + _ -> [] + +listTextObjects :: Document -> [ByteString] +listTextObjects (Document {updates}) = + snd =<< evalRWST rwsMain (unify updates) Map.empty + where + rwsMain = + --Lazy.pack . show <$> (getObject =<< pagesList) + pagesList >>= getObject >>= extractText + + +main :: IO () +main = do + [inputFile] <- getArgs + result <- parseDocument <$> BS.readFile inputFile + case result of + Left parseError -> hPutStrLn stderr $ show parseError + Right doc -> mapM_ Lazy.putStrLn $ listTextObjects doc diff --git a/src/PDF/Object.hs b/src/PDF/Object.hs index fc116e4..8c69849 100644 --- a/src/PDF/Object.hs +++ b/src/PDF/Object.hs @@ -3,6 +3,7 @@ {-# LANGUAGE FlexibleInstances #-} module PDF.Object ( Content(..) + , Dictionary , DirectObject(..) , Flow(..) , IndexedObjects diff --git a/src/PDF/Text.hs b/src/PDF/Text.hs new file mode 100644 index 0000000..7b2852a --- /dev/null +++ b/src/PDF/Text.hs @@ -0,0 +1,28 @@ +{-# LANGUAGE FlexibleContexts #-} +module PDF.Text ( + CMap + , CMappers + , PageContents(..) + , cMap + , pageContents + ) where + +import Control.Monad.State (MonadState) +import Data.ByteString.Lazy.Char8 (ByteString) +import Data.Map (Map) +import PDF.Object (Dictionary) +import PDF.Output (ObjectId) +import PDF.Parser (Parser) + +type CMappers = Map ObjectId CMap +type CMap = Map Int ByteString + +cMap :: ByteString -> CMap +cMap = undefined + +data PageContents = PageContents { + chunks :: [ByteString] + } + +pageContents :: MonadState CMappers m => Dictionary -> ByteString -> m PageContents +pageContents font page = undefined