Hufflepdf/examples/getText.hs

55 lines
1.8 KiB
Haskell
Raw Permalink Normal View History

{-# LANGUAGE FlexibleContexts #-}
import Control.Monad ((>=>))
import Control.Monad.Except (ExceptT(..))
import Control.Monad.Except.IOH (handle)
import Control.Monad.IO.Class (liftIO)
import qualified Data.ByteString.Char8 as BS (readFile)
2020-03-15 15:13:00 +01:00
import Data.Id (Id(..), mapWithKey)
import qualified Data.Map as Map (mapWithKey)
import Data.OrderedMap (mapi)
import qualified Data.Text as Text (unpack)
import PDF (UnifiedLayers(..), parseDocument)
import PDF.Box (Box(..))
import PDF.Content.Text (Chunks(..))
import PDF.Layer (Layer, LayerReader)
import PDF.Pages (
Contents(..), FontCache, Page(..), PageNumber(..), Pages(..), withFonts
, withResources
)
import System.Environment (getArgs)
import System.Exit (die)
import System.IO (BufferMode(..), hSetBuffering, stdout)
import Text.Printf (printf)
import Text.Read (readMaybe)
-- | Print every text chunk of a page on stdout, one line per chunk, in the
-- form @p#PAGE obj#OBJECT instr#INSTRUCTION: TEXT@.
--
-- The 'Int' argument is only used as the page number shown in each line.
-- The page is read inside 'withResources'; its 'Contents' are walked with
-- 'mapi', and for each entry the 'Chunks' are walked with 'mapWithKey'.
displayPage :: Int -> Page -> FontCache (LayerReader (ExceptT String IO)) ()
displayPage n = withResources $ \page -> do
  contents <- r Contents page
  sequence_ $ mapi
    (\objectId object -> do
        chunks <- r Chunks object
        sequence_ $ mapWithKey (showChunk objectId) chunks)
    contents
  where
    -- Emit one output line for a single chunk, tagged with the page number
    -- and the two identifiers it was found under.
    showChunk objectId chunkId text =
      liftIO $ putStrLn
        (printf "p#%d obj#%d instr#%d: %s"
          n (getId objectId) (getId chunkId) (Text.unpack text))
-- | Display the text of every page of a layer.
--
-- Runs inside 'withFonts': reads 'Pages' from the layer, then calls
-- 'displayPage' on each entry with its map key as the page number.
getAll :: Layer -> ExceptT String IO ()
getAll = withFonts $ \source -> do
  pages <- r Pages source
  sequence_ (Map.mapWithKey displayPage pages)
-- | Display the text of a single page of a layer.
--
-- Runs inside 'withFonts': reads the page selected by @P n@ and hands it to
-- 'displayPage' together with the same number @n@.
get :: Int -> Layer -> ExceptT String IO ()
get n = withFonts $ \source -> r (P n) source >>= displayPage n
-- | Read a file, parse it as a document and run an action on its layer.
--
-- The file is read as a strict 'BS.readFile' and passed to 'parseDocument';
-- the 'UnifiedLayers' view of the result is then given to the callback.
-- Failures of any step surface as the 'String' error of the 'ExceptT'.
onDoc :: FilePath -> (Layer -> ExceptT String IO ()) -> ExceptT String IO ()
onDoc inputFile f = do
  document <- ExceptT (fmap parseDocument (BS.readFile inputFile))
  layer <- r UnifiedLayers document
  f layer
-- | Command-line entry point: @getText INPUT_FILE [PAGE_NUMBER]@.
--
-- With one argument, the text of every page is displayed ('getAll'); with
-- two, only the requested page is displayed ('get'). Any other argument
-- count prints the usage line and exits via 'die'.
--
-- Errors reported through the 'ExceptT' layer are forwarded to 'die' by
-- 'handle'. The page number is validated up front with 'readMaybe': the
-- previous 'read' was partial and, on a non-numeric argument, raised an
-- uninformative @Prelude.read: no parse@ exception.
main :: IO ()
main = do
  -- Line buffering so each chunk appears as soon as its line is complete.
  hSetBuffering stdout LineBuffering
  args <- getArgs
  case args of
    [inputFile] -> onDoc inputFile getAll `handle` die
    [inputFile, pageNumber] ->
      case readMaybe pageNumber of
        Just n -> onDoc inputFile (get n) `handle` die
        Nothing -> die $ "Invalid page number: " ++ pageNumber
    _ -> die "Syntax: getText INPUT_FILE [PAGE_NUMBER]"