2020-03-04 18:19:10 +01:00
|
|
|
import Control.Monad ((>=>))
|
2020-02-08 08:15:32 +01:00
|
|
|
import qualified Data.ByteString.Char8 as BS (readFile)
|
2020-03-10 22:57:11 +01:00
|
|
|
import Data.Map (Map, foldlWithKey, mapWithKey)
|
|
|
|
import qualified Data.Map as Map (empty, insert, toList, union)
|
|
|
|
import qualified Data.Text as Text (unpack)
|
2020-02-08 08:15:32 +01:00
|
|
|
import qualified Data.Text.IO as Text (putStrLn)
|
2020-03-04 18:19:10 +01:00
|
|
|
import PDF (UnifiedLayers(..), parseDocument)
|
|
|
|
import PDF.Box (Box(..))
|
2020-03-10 22:57:11 +01:00
|
|
|
import PDF.Content.Text (Chunks(..))
|
2020-03-04 18:19:10 +01:00
|
|
|
import PDF.Layer (Layer)
|
2020-03-10 22:57:11 +01:00
|
|
|
import PDF.Pages (Contents(..), Page(..), PageNumber(..), Pages(..), cacheFonts)
|
2019-09-23 18:00:47 +02:00
|
|
|
import System.Environment (getArgs)
|
2019-11-29 11:51:35 +01:00
|
|
|
import System.Exit (die)
|
|
|
|
import System.IO (BufferMode(..), hSetBuffering, stdout)
|
2020-03-10 22:57:11 +01:00
|
|
|
import Text.Printf (printf)
|
2019-09-23 18:00:47 +02:00
|
|
|
|
2019-11-29 11:51:35 +01:00
|
|
|
displayPage :: Page -> IO ()
|
2020-03-10 22:57:11 +01:00
|
|
|
displayPage =
|
|
|
|
r Contents
|
|
|
|
>=> mapM (\(objectId, content) -> r Chunks content >>= return . mixIn objectId)
|
|
|
|
>=> fusion
|
|
|
|
>=> sequence_ . mapWithKey (\k v -> putStrLn $ printf "%s: %s" (show k) (Text.unpack v))
|
|
|
|
-- >=> mapM_ Text.putStrLn
|
|
|
|
where
|
|
|
|
mixIn :: (Ord k1, Ord k2) => k1 -> Map k2 v -> Map (k1, k2) v
|
|
|
|
mixIn prefix =
|
|
|
|
foldlWithKey (\m k v -> Map.insert (prefix, k) v m) Map.empty
|
|
|
|
fusion :: (Monad m, Ord a) => [Map a b] -> m (Map a b)
|
|
|
|
fusion = return . foldl Map.union Map.empty
|
2019-09-23 18:00:47 +02:00
|
|
|
|
2020-03-04 18:19:10 +01:00
|
|
|
getAll :: Layer -> IO ()
|
|
|
|
getAll layer = (cacheFonts $ r Pages layer) >>= mapM_ (displayPage . snd) . Map.toList
|
2019-09-23 18:00:47 +02:00
|
|
|
|
2020-03-04 18:19:10 +01:00
|
|
|
get :: Int -> Layer -> IO ()
|
|
|
|
get n layer = (cacheFonts $ r (P n) layer) >>= displayPage
|
|
|
|
|
|
|
|
onDoc :: FilePath -> (Layer -> IO ()) -> IO ()
|
|
|
|
onDoc inputFile f = do
|
|
|
|
(parseDocument <$> BS.readFile inputFile)
|
|
|
|
>>= either die (r UnifiedLayers >=> f)
|
2019-09-23 18:00:47 +02:00
|
|
|
|
|
|
|
main :: IO ()
|
|
|
|
main = do
|
2019-11-29 11:51:35 +01:00
|
|
|
hSetBuffering stdout LineBuffering
|
|
|
|
args <- getArgs
|
|
|
|
case args of
|
2020-03-04 18:19:10 +01:00
|
|
|
[inputFile] -> onDoc inputFile getAll
|
|
|
|
[inputFile, pageNumber] -> onDoc inputFile (get $ read pageNumber)
|
2019-11-29 11:51:35 +01:00
|
|
|
_ -> die "Syntax: getText INPUT_FILE [PAGE_NUMBER]"
|