{-# LANGUAGE FlexibleContexts #-} import Control.Monad ((>=>)) import Control.Monad.IO.Class (liftIO) import qualified Data.ByteString.Char8 as BS (readFile) import Data.Id (Id(..), mapWithKey) import qualified Data.Map as Map (mapWithKey) import Data.OrderedMap (mapi) import qualified Data.Text as Text (unpack) import PDF (UnifiedLayers(..), parseDocument) import PDF.Box (Box(..)) import PDF.Content.Text (Chunks(..)) import PDF.Layer (Layer) import PDF.Pages ( Contents(..), FontCache, Page(..), PageNumber(..), Pages(..), withFonts , withResources ) import System.Environment (getArgs) import System.Exit (die) import System.IO (BufferMode(..), hSetBuffering, stdout) import Text.Printf (printf) displayPage :: Int -> Page -> FontCache IO () displayPage n = withResources ( r Contents >=> sequence_ . mapi (\objectId -> r Chunks >=> sequence_ . mapWithKey (display objectId) ) ) where display a b v = liftIO . putStrLn $ printf "p#%d obj#%d instr#%d: %s" n (getId a) (getId b) (Text.unpack v) getAll :: Layer -> IO () getAll = withFonts $ r Pages >=> sequence_ . Map.mapWithKey displayPage get :: Int -> Layer -> IO () get n = withFonts $ r (P n) >=> displayPage n onDoc :: FilePath -> (Layer -> IO ()) -> IO () onDoc inputFile f = do (parseDocument <$> BS.readFile inputFile) >>= either die (r UnifiedLayers >=> f) main :: IO () main = do hSetBuffering stdout LineBuffering args <- getArgs case args of [inputFile] -> onDoc inputFile getAll [inputFile, pageNumber] -> onDoc inputFile (get $ read pageNumber) _ -> die "Syntax: getText INPUT_FILE [PAGE_NUMBER]"