pdfcleaner/src/Sync.hs

57 lines
2.1 KiB
Haskell

{-# LANGUAGE FlexibleContexts #-}
module Main where
import ALTO (Collection, collection, getWords)
import Control.Monad.Except.IOH (handle)
import Control.Monad.IO.Class (liftIO)
import Control.Monad.Reader (runReaderT)
import Control.Monad.State (MonadState(..), evalStateT)
import Control.Monad.Trans (lift)
import qualified Data.ByteString.Char8 as BS (readFile)
import qualified Data.ByteString.Lazy as Lazy (writeFile)
import Data.Id (Id(..))
import qualified Data.Map as Map (toList)
import Data.OrderedMap (mapi)
import PDF (Document, UnifiedLayers(..), parseDocument, render)
import PDF.Box (Box(..), at)
import PDF.Content.Operator (Instruction)
import PDF.Content.Text (Chunks(..))
import PDF.Layer (Layer)
import PDF.Object (Object)
import PDF.Pages (Contents(..), Pages(..), withFonts, withResources)
import System.Environment (getArgs)
import System.Exit (die)
import Text.Printf (printf)
synchronize :: Document -> Collection -> IO [(Id Object, Id Instruction, String)]
synchronize pdf alto =
(r UnifiedLayers pdf >>= withFonts extractFromLayer) `handle` die
where
extractFromLayer layer =
r Pages layer >>= fmap concat . mapM extractFromPage . Map.toList
extractFromPage (pageNumber, page) = do
altoWords <- liftIO $ getWords alto pageNumber
withResources (flip evalStateT altoWords . extractWithResources) page
extractWithResources page =
r Contents page >>= fmap concat . sequence . mapi extractFromObject
extractFromObject objectId content =
lift (r Chunks content) >>= undefined
processFiles :: FilePath -> FilePath -> IO ()
processFiles inputFile altoDir = do
pdf <- either die return . parseDocument =<< BS.readFile inputFile
alto <- collection altoDir
putStrLn "Object,Instruction,ALTO_ID"
mapM_ (putStrLn . format) =<< synchronize pdf alto
where
format (objectId, instructionId, altoId) =
printf "%d,%d,%s" (getId objectId) (getId instructionId) altoId
main :: IO ()
main = do
args <- getArgs
case args of
[inputFile, altoDir] ->
processFiles inputFile altoDir
_ -> die "Syntax: pdfcleaner INPUT_PDF_FILE ALTO_DIR"