54 lines
1.7 KiB
Haskell
54 lines
1.7 KiB
Haskell
{-# LANGUAGE FlexibleContexts #-}
|
|
import Control.Monad ((>=>))
|
|
import Control.Monad.IO.Class (liftIO)
|
|
import Control.Monad.Reader (runReaderT)
|
|
import qualified Data.ByteString.Char8 as BS (readFile)
|
|
import Data.Id (Id(..), mapWithKey)
|
|
import qualified Data.Map as Map (toList)
|
|
import Data.OrderedMap (mapi)
|
|
import qualified Data.Text as Text (unpack)
|
|
import PDF (UnifiedLayers(..), parseDocument)
|
|
import PDF.Box (Box(..))
|
|
import PDF.Content.Text (Chunks(..))
|
|
import PDF.Layer (Layer)
|
|
import PDF.Pages (
|
|
Contents(..), FontCache, Page(..), PageNumber(..), Pages(..), cacheFonts, withResources
|
|
)
|
|
import System.Environment (getArgs)
|
|
import System.Exit (die)
|
|
import System.IO (BufferMode(..), hSetBuffering, stdout)
|
|
import Text.Printf (printf)
|
|
|
|
displayPage :: Page -> FontCache IO ()
|
|
displayPage = withResources (
|
|
r Contents
|
|
>=> sequence_ . mapi (\objectId ->
|
|
r Chunks >=> sequence_ . mapWithKey (display objectId)
|
|
)
|
|
)
|
|
where
|
|
display a b v =
|
|
liftIO . putStrLn $
|
|
printf "%d@%d: %s" (getId a) (getId b) (Text.unpack v)
|
|
|
|
getAll :: Layer -> IO ()
|
|
getAll layer =
|
|
r Pages layer
|
|
>>= flip runReaderT layer . cacheFonts . mapM_ (displayPage . snd) . Map.toList
|
|
|
|
get :: Int -> Layer -> IO ()
|
|
get n layer = r (P n) layer >>= flip runReaderT layer . cacheFonts . displayPage
|
|
|
|
onDoc :: FilePath -> (Layer -> IO ()) -> IO ()
|
|
onDoc inputFile f = do
|
|
(parseDocument <$> BS.readFile inputFile)
|
|
>>= either die (r UnifiedLayers >=> f)
|
|
|
|
main :: IO ()
|
|
main = do
|
|
hSetBuffering stdout LineBuffering
|
|
args <- getArgs
|
|
case args of
|
|
[inputFile] -> onDoc inputFile getAll
|
|
[inputFile, pageNumber] -> onDoc inputFile (get $ read pageNumber)
|
|
_ -> die "Syntax: getText INPUT_FILE [PAGE_NUMBER]"
|