2020-03-11 18:55:18 +01:00
|
|
|
{-# LANGUAGE FlexibleContexts #-}
|
2020-03-04 18:19:10 +01:00
|
|
|
import Control.Monad ((>=>))
|
2020-05-28 18:54:15 +02:00
|
|
|
import Control.Monad.Except (ExceptT(..))
|
|
|
|
import Control.Monad.Except.IOH (handle)
|
2020-03-14 16:57:16 +01:00
|
|
|
import Control.Monad.IO.Class (liftIO)
|
2020-02-08 08:15:32 +01:00
|
|
|
import qualified Data.ByteString.Char8 as BS (readFile)
|
2020-03-15 15:13:00 +01:00
|
|
|
import Data.Id (Id(..), mapWithKey)
|
2020-03-17 16:29:46 +01:00
|
|
|
import qualified Data.Map as Map (mapWithKey)
|
2020-03-11 18:55:18 +01:00
|
|
|
import Data.OrderedMap (mapi)
|
2020-03-10 22:57:11 +01:00
|
|
|
import qualified Data.Text as Text (unpack)
|
2020-03-04 18:19:10 +01:00
|
|
|
import PDF (UnifiedLayers(..), parseDocument)
|
|
|
|
import PDF.Box (Box(..))
|
2020-03-10 22:57:11 +01:00
|
|
|
import PDF.Content.Text (Chunks(..))
|
2020-05-28 18:54:15 +02:00
|
|
|
import PDF.Layer (Layer, LayerReader)
|
2020-03-11 18:55:18 +01:00
|
|
|
import PDF.Pages (
|
2020-03-17 16:29:46 +01:00
|
|
|
Contents(..), FontCache, Page(..), PageNumber(..), Pages(..), withFonts
|
|
|
|
, withResources
|
2020-03-11 18:55:18 +01:00
|
|
|
)
|
2019-09-23 18:00:47 +02:00
|
|
|
import System.Environment (getArgs)
|
2019-11-29 11:51:35 +01:00
|
|
|
import System.Exit (die)
|
|
|
|
import System.IO (BufferMode(..), hSetBuffering, stdout)
|
2020-03-10 22:57:11 +01:00
|
|
|
import Text.Printf (printf)
|
2019-09-23 18:00:47 +02:00
|
|
|
|
2020-05-28 18:54:15 +02:00
|
|
|
-- | Print every text chunk found on one page to standard output.
--
-- The page's 'Contents' are read through 'r'; for each entry (visited with
-- 'mapi', which also supplies the entry's key) the 'Chunks' are read in turn,
-- and each chunk is written on its own line as
-- @p#PAGE obj#OBJECT instr#CHUNK: TEXT@.
--
-- The first argument is only used as the page number shown in the output.
displayPage :: Int -> Page -> FontCache (LayerReader (ExceptT String IO)) ()
displayPage n = withResources (\page -> do
    contents <- r Contents page
    sequence_ $ mapi (\objectId content -> do
        chunks <- r Chunks content
        sequence_ $ mapWithKey (printChunk objectId) chunks
      ) contents
  )
  where
    -- Emit a single output line for one chunk, tagged with the page number
    -- and the identifiers of the enclosing object and of the chunk itself.
    printChunk objectId chunkId chunkText =
      liftIO
        ( putStrLn
            ( printf
                "p#%d obj#%d instr#%d: %s"
                n
                (getId objectId)
                (getId chunkId)
                (Text.unpack chunkText)
            )
        )
|
2019-09-23 18:00:47 +02:00
|
|
|
|
2020-05-28 18:54:15 +02:00
|
|
|
-- | Display the text of every page of a layer.
--
-- The layer's 'Pages' are read through 'r', then 'displayPage' is run on each
-- entry with its key passed as the page number.  Everything runs inside
-- 'withFonts'.
getAll :: Layer -> ExceptT String IO ()
getAll = withFonts $ \layer ->
  r Pages layer >>= sequence_ . Map.mapWithKey displayPage
|
2019-09-23 18:00:47 +02:00
|
|
|
|
2020-05-28 18:54:15 +02:00
|
|
|
-- | Display the text of a single page of a layer.
--
-- The page is looked up with @r (P n)@ and handed to 'displayPage' together
-- with the same number @n@, all inside 'withFonts'.
get :: Int -> Layer -> ExceptT String IO ()
get n = withFonts $ \layer ->
  r (P n) layer >>= displayPage n
|
2020-03-04 18:19:10 +01:00
|
|
|
|
2020-05-28 18:54:15 +02:00
|
|
|
-- | Read a file, parse it as a document and run an action on its layers.
--
-- The file is read as a strict 'BS.readFile' byte string and passed to
-- 'parseDocument', whose 'Either' result is lifted into 'ExceptT'.  The
-- 'UnifiedLayers' view of the document is then read through 'r' and given to
-- the supplied action.
onDoc :: FilePath -> (Layer -> ExceptT String IO ()) -> ExceptT String IO ()
onDoc inputFile f = do
  document <- ExceptT (parseDocument <$> BS.readFile inputFile)
  layer <- r UnifiedLayers document
  f layer
|
2019-09-23 18:00:47 +02:00
|
|
|
|
|
|
|
-- | Program entry point.
--
-- Usage: @getText INPUT_FILE [PAGE_NUMBER]@
--
-- * With one argument, the text of every page is displayed ('getAll').
-- * With two arguments, the second one is parsed as a page number and only
--   that page is displayed ('get').
-- * Any other argument count prints the syntax reminder and exits through
--   'die'.
--
-- Standard output is switched to line buffering before anything is printed.
--
-- The page number is parsed with 'reads' instead of the partial 'read', so a
-- malformed number is reported through 'die' before the document is opened
-- rather than surfacing later as a @Prelude.read: no parse@ exception.
--
-- NOTE(review): 'handle' comes from "Control.Monad.Except.IOH"; it is assumed
-- to run the 'ExceptT' action and pass a failure message to 'die' -- confirm
-- against that module.
main :: IO ()
main = do
  hSetBuffering stdout LineBuffering
  args <- getArgs
  case args of
    [inputFile] -> onDoc inputFile getAll `handle` die
    [inputFile, pageNumber] ->
      case reads pageNumber of
        -- Only a complete parse is accepted; trailing whitespace is allowed
        -- because 'read' allowed it too.
        [(n, rest)] | null (words rest) ->
          onDoc inputFile (get n) `handle` die
        _ -> die $ "Invalid page number: " ++ pageNumber
    _ -> die "Syntax: getText INPUT_FILE [PAGE_NUMBER]"
|