Hufflepdf/examples/getText.hs

50 lines
1.8 KiB
Haskell
Raw Normal View History

import Control.Monad ((>=>))
import qualified Data.ByteString.Char8 as BS (readFile)
import Data.Map (Map, foldlWithKey, mapWithKey)
import qualified Data.Map as Map (empty, insert, toList, union)
import qualified Data.Text as Text (unpack)
import qualified Data.Text.IO as Text (putStrLn)
import PDF (UnifiedLayers(..), parseDocument)
import PDF.Box (Box(..))
import PDF.Content.Text (Chunks(..))
import PDF.Layer (Layer)
import PDF.Pages (Contents(..), Page(..), PageNumber(..), Pages(..), cacheFonts)
import System.Environment (getArgs)
import System.Exit (die)
import System.IO (BufferMode(..), hSetBuffering, stdout)
import Text.Printf (printf)
import Text.Read (readMaybe)
-- | Print all the text chunks of a page on stdout, one chunk per line.
--
-- Each line has the form @KEY: TEXT@, where @KEY@ is the 'show'n pair made of
-- the identifier attached to a content of the page and the key of the chunk
-- inside that content. Lines are emitted in ascending order of that pair.
displayPage :: Page -> IO ()
displayPage =
      r Contents
  >=> mapM (\(objectId, content) -> mixIn objectId <$> r Chunks content)
  >=> fusion
  >=> mapM_ printChunk . Map.toList
  -- >=> mapM_ Text.putStrLn  -- alternative output: bare texts, without keys
  where
    -- Qualify every key of a content's chunks with the identifier of that
    -- content, so chunks coming from different contents keep distinct keys
    -- once all the maps are merged.
    mixIn :: (Ord k1, Ord k2) => k1 -> Map k2 v -> Map (k1, k2) v
    mixIn prefix =
      foldlWithKey (\m k v -> Map.insert (prefix, k) v m) Map.empty

    -- Merge the per-content maps into a single one. 'mconcat' on 'Map' is a
    -- strict, left-biased union: it yields the same map as a left fold of
    -- 'Map.union' from the empty map, without piling up thunks.
    fusion :: (Monad m, Ord a) => [Map a b] -> m (Map a b)
    fusion = return . mconcat

    -- Print one @KEY: TEXT@ line for a single chunk.
    printChunk (k, v) = putStrLn $ printf "%s: %s" (show k) (Text.unpack v)
-- | Display every page obtained from the given layer, in the order in which
-- 'Map.toList' enumerates them (ascending keys). Pages are fetched through
-- 'cacheFonts' before any of them is displayed.
getAll :: Layer -> IO ()
getAll layer = do
  pages <- cacheFonts (r Pages layer)
  mapM_ (\(_, page) -> displayPage page) (Map.toList pages)
-- | Display the single page designated by @P n@ in the given layer. The page
-- is fetched through 'cacheFonts' and then handed to 'displayPage'.
get :: Int -> Layer -> IO ()
get n layer = displayPage =<< cacheFonts (r (P n) layer)
-- | Read and parse the file at @inputFile@, then run @f@ on the layer
-- obtained from the parsed document through @r UnifiedLayers@.
--
-- When 'parseDocument' returns a 'Left', the program exits through 'die'
-- with that value as its message and @f@ is never run.
onDoc :: FilePath -> (Layer -> IO ()) -> IO ()
onDoc inputFile f = do
  parsed <- parseDocument <$> BS.readFile inputFile
  case parsed of
    Left failure -> die failure
    Right document -> r UnifiedLayers document >>= f
-- | Entry point: @getText INPUT_FILE [PAGE_NUMBER]@.
--
-- With only an input file, the text of every page is printed; with an
-- additional page number, only that page is printed. Any other number of
-- arguments, or a page number that is not a valid 'Int', makes the program
-- exit through 'die' with an explanatory message.
main :: IO ()
main = do
  -- Flush stdout after each line so output appears as pages are processed.
  hSetBuffering stdout LineBuffering
  args <- getArgs
  case args of
    [inputFile] -> onDoc inputFile getAll
    [inputFile, pageNumber] ->
      -- 'read' is partial: on a non-numeric argument it would crash with an
      -- opaque "Prelude.read: no parse", and only after the document has
      -- been read and parsed. Validate the argument up front instead.
      case readMaybe pageNumber of
        Just n -> onDoc inputFile (get n)
        Nothing -> die $ "Invalid page number: " ++ pageNumber
    _ -> die "Syntax: getText INPUT_FILE [PAGE_NUMBER]"