{-# LANGUAGE FlexibleContexts #-} module Main where import ALTO (Collection, collection, getWords) import Control.Monad.Except.IOH (handle) import Control.Monad.IO.Class (liftIO) import Control.Monad.Reader (runReaderT) import Control.Monad.State (MonadState(..), evalStateT) import Control.Monad.Trans (lift) import qualified Data.ByteString.Char8 as BS (readFile) import qualified Data.ByteString.Lazy as Lazy (writeFile) import Data.Id (Id(..)) import qualified Data.Map as Map (toList) import Data.OrderedMap (mapi) import PDF (Document, UnifiedLayers(..), parseDocument, render) import PDF.Box (Box(..), at) import PDF.Content.Operator (Instruction) import PDF.Content.Text (Chunks(..)) import PDF.Layer (Layer) import PDF.Object (Object) import PDF.Pages (Contents(..), Pages(..), withFonts, withResources) import System.Environment (getArgs) import System.Exit (die) import Text.Printf (printf) synchronize :: Document -> Collection -> IO [(Id Object, Id Instruction, String)] synchronize pdf alto = (r UnifiedLayers pdf >>= withFonts extractFromLayer) `handle` die where extractFromLayer layer = r Pages layer >>= fmap concat . mapM extractFromPage . Map.toList extractFromPage (pageNumber, page) = do altoWords <- liftIO $ getWords alto pageNumber withResources (flip evalStateT altoWords . extractWithResources) page extractWithResources page = r Contents page >>= fmap concat . sequence . mapi extractFromObject extractFromObject objectId content = lift (r Chunks content) >>= undefined processFiles :: FilePath -> FilePath -> IO () processFiles inputFile altoDir = do pdf <- either die return . parseDocument =<< BS.readFile inputFile alto <- collection altoDir putStrLn "Object,Instruction,ALTO_ID" mapM_ (putStrLn . format) =<< synchronize pdf alto where format (objectId, instructionId, altoId) = printf "%d,%d,%s" (getId objectId) (getId instructionId) altoId main :: IO () main = do args <- getArgs case args of [inputFile, altoDir] -> processFiles inputFile altoDir _ -> die "Syntax: pdfcleaner INPUT_PDF_FILE ALTO_DIR"