{-# LANGUAGE FlexibleContexts #-} module Main where import ALTO (ALTO_ID(..), Collection, Words, collection, getWords) import Control.Monad.Except (MonadError(..), ExceptT(..)) import Control.Monad.Except.IOH (handle) import Control.Monad.IO.Class (MonadIO, liftIO) import Control.Monad.State (MonadState(..), evalStateT) import Control.Monad.Trans (lift) import qualified Data.ByteString.Char8 as BS (readFile) import Data.Id (Id(..)) import qualified Data.Id as Id(toList) import qualified Data.Map as Map (toList) import Data.OrderedMap (mapi) import Data.Text (Text, pack, stripPrefix) import qualified Data.Text as Text (null, strip) import PDF (Document, UnifiedLayers(..), parseDocument) import PDF.Box (Box(..)) import PDF.Content.Text (Chunks(..)) import PDF.Object (Object) import PDF.Pages (Contents(..), Pages(..), withFonts, withResources) import System.Environment (getArgs) import System.Exit (die) import Text.Printf (printf) type Mapping = (Id Object, Id Text, ALTO_ID) format :: Mapping -> String format (objectId, instructionId, ALTO_ID altoId) = printf "%d,%d,%s" (getId objectId) (getId instructionId) altoId zipWithALTO :: (MonadIO m, MonadError String m, MonadState Words m) => Id Object -> [(Id Text, Text)]-> m () zipWithALTO _ [] = return () zipWithALTO objectId chunks@((chunkId, chunk):otherChunks) = popWord >>= zipWord where popWord = state (splitAt 1) zipWord [(word, wordID)] | pack word == chunk = do liftIO . putStrLn $ format (objectId, chunkId, wordID) zipWithALTO objectId otherChunks | otherwise = case stripPrefix (pack word) chunk of Just stripped -> do let editedChunks = (chunkId, stripped):otherChunks liftIO . putStrLn $ format (objectId, chunkId, wordID) zipWithALTO objectId editedChunks Nothing -> zipWithALTO objectId chunks zipWord _ | Text.null $ Text.strip chunk = zipWithALTO objectId otherChunks | otherwise = throwError $ printf "Could not associate an ALTO_ID to all PDF chunks in object #%d. Remaining : %s\n" (getId objectId) (show chunks) synchronize :: Document -> Collection -> ExceptT String IO () synchronize pdf alto = r UnifiedLayers pdf >>= withFonts extractFromLayer where extractFromLayer layer = r Pages layer >>= mapM_ extractFromPage . Map.toList extractFromPage (pageNumber, page) = do altoWords <- liftIO $ getWords alto pageNumber withResources (flip evalStateT altoWords . extractWithResources) page extractWithResources page = r Contents page >>= sequence_ . mapi extractFromObject extractFromObject objectId content = lift (r Chunks content) >>= zipWithALTO objectId . Id.toList processFiles :: FilePath -> FilePath -> IO () processFiles inputFile altoDir = do pdf <- either die return . parseDocument =<< BS.readFile inputFile alto <- collection altoDir putStrLn "Object,Instruction,ALTO_ID" synchronize pdf alto `handle` die main :: IO () main = do args <- getArgs case args of [inputFile, altoDir] -> processFiles inputFile altoDir _ -> die "Syntax: pdfcleaner INPUT_PDF_FILE ALTO_DIR"