Hufflepdf/examples/getText.hs

67 lines
2.1 KiB
Haskell
Raw Normal View History

{-# LANGUAGE FlexibleContexts #-}
import Control.Monad ((>=>))
import Control.Monad.IO.Class (MonadIO, liftIO)
import Control.Monad.Reader (runReaderT)
import qualified Data.ByteString.Char8 as BS (readFile)
import Data.Map (Map, foldlWithKey, mapWithKey)
import qualified Data.Map as Map (empty, insert, toList, union)
import Data.OrderedMap (mapi)
import qualified Data.Text as Text (unpack)
import qualified Data.Text.IO as Text (putStrLn)
import PDF (UnifiedLayers(..), parseDocument)
import PDF.Box (Box(..))
import PDF.Content.Text (Chunks(..))
import PDF.Layer (Layer)
import PDF.Output (ObjectId(..))
import PDF.Pages (
    Contents(..), FontCache, Page(..), PageNumber(..), Pages(..), Text_(..), cacheFonts, withResources
  )
import System.Environment (getArgs)
import System.Exit (die)
import System.IO (BufferMode(..), hSetBuffering, stdout)
import Text.Printf (printf)
import Text.Read (readMaybe)
-- | Walk over every content of a page and read its text chunks.
--
-- The page's 'Contents' are read through 'withResources'; 'Chunks' is then
-- read from each content in turn (via 'mapi') and the resulting actions are
-- run with 'sequence_', which discards their results. Printing the chunks
-- is still unfinished: 'display' below is only a placeholder.
displayPage :: (MonadIO m, FontCache m) => Page -> m ()
displayPage page = do
  contents <- withResources (r Contents) page
  sequence_ (mapi (\_objectId content -> r Chunks content) contents)
  where
    -- Placeholder for the per-chunk printer; it is not referenced yet.
    display = undefined
    -- Work-in-progress notes kept from the earlier draft:
    --
    --   * each content's chunks were meant to be piped on with
    --     @>=> sequence_ . mapWithKey (display objectId)@
    --   * @r Contents :: ReaderT FontSet m (OrderedMap ObjectId Content)@
    --   * alternatives tried: @sequenceA . mapi@ over the contents,
    --     @sequenceA $ mapWithKey (display objectId) (r Chunks content)@,
    --     and a final @>=> mapM_ Text.putStrLn@
    --   * intended printer:
    --
    --       display a b v =
    --         liftIO . putStrLn $
    --           printf "%d@%s: %s" (getObjectId a) (show b) (Text.unpack v)
-- | Run 'displayPage' on every page of a layer.
--
-- The 'Pages' of the layer are read first; the pages are then visited in
-- the order returned by 'Map.toList' (the keys themselves are ignored).
-- The whole traversal runs inside a single 'cacheFonts' call, with the
-- layer as the reader environment.
getAll :: Layer -> IO ()
getAll layer = do
  pages <- r Pages layer
  let showEveryPage = mapM_ displayPage [page | (_, page) <- Map.toList pages]
  runReaderT (cacheFonts showEveryPage) layer
-- | Run 'displayPage' on a single page of a layer.
--
-- The page is read from the layer with the key @P n@, then displayed under
-- 'cacheFonts' with the layer as the reader environment.
get :: Int -> Layer -> IO ()
get n layer = do
  page <- r (P n) layer
  runReaderT (cacheFonts (displayPage page)) layer
-- | Load a document from a file and run an action on its unified layers.
--
-- The file is read as a strict 'BS.ByteString' and handed to
-- 'parseDocument'. If parsing fails, the program exits through 'die' with
-- the parser's message; otherwise 'UnifiedLayers' is read from the parsed
-- document and the result is passed to the callback.
onDoc :: FilePath -> (Layer -> IO ()) -> IO ()
onDoc inputFile f = do
  parsed <- parseDocument <$> BS.readFile inputFile
  case parsed of
    Left message -> die message
    Right document -> r UnifiedLayers document >>= f
-- | Command-line entry point.
--
-- Usage: @getText INPUT_FILE [PAGE_NUMBER]@
--
-- With only an input file, every page is processed ('getAll'); with an
-- additional page number, only that page is ('get'). Any other argument
-- count exits with the usage message.
--
-- The page number is validated with 'readMaybe' before the document is
-- opened, so a non-numeric argument exits immediately with a clear message
-- instead of crashing later with @Prelude.read: no parse@.
main :: IO ()
main = do
  hSetBuffering stdout LineBuffering
  args <- getArgs
  case args of
    [inputFile] -> onDoc inputFile getAll
    [inputFile, pageNumber] ->
      case readMaybe pageNumber of
        Just n -> onDoc inputFile (get n)
        Nothing ->
          die $ "Invalid page number: " ++ pageNumber ++ "\n" ++ usage
    _ -> die usage
  where
    usage = "Syntax: getText INPUT_FILE [PAGE_NUMBER]"