pdfcleaner/src/Sync.hs

84 lines
3.1 KiB
Haskell

{-# LANGUAGE FlexibleContexts #-}
module Main where
import ALTO (ALTO_ID(..), Collection, Words, collection, getWords)
import Control.Monad.Except (MonadError(..), ExceptT(..))
import Control.Monad.Except.IOH (handle)
import Control.Monad.IO.Class (MonadIO, liftIO)
import Control.Monad.State (MonadState(..), evalStateT)
import Control.Monad.Trans (lift)
import qualified Data.ByteString.Char8 as BS (readFile)
import Data.Id (Id(..))
import qualified Data.Id as Id(toList)
import qualified Data.Map as Map (toList)
import Data.OrderedMap (mapi)
import Data.Text (Text, pack, stripPrefix)
import qualified Data.Text as Text (null, strip)
import PDF (Document, UnifiedLayers(..), parseDocument)
import PDF.Box (Box(..))
import PDF.Content.Text (Chunks(..))
import PDF.Object (Object)
import PDF.Pages (Contents(..), Pages(..), withFonts, withResources)
import System.Environment (getArgs)
import System.Exit (die)
import Text.Printf (printf)
type Mapping = (Id Object, Id Text, ALTO_ID)
format :: Mapping -> String
format (objectId, instructionId, ALTO_ID altoId) =
printf "%d,%d,%s" (getId objectId) (getId instructionId) altoId
zipWithALTO :: (MonadIO m, MonadError String m, MonadState Words m) =>
Id Object -> [(Id Text, Text)]-> m ()
zipWithALTO _ [] = return ()
zipWithALTO objectId chunks@((chunkId, chunk):otherChunks) = popWord >>= zipWord
where
popWord = state (splitAt 1)
zipWord [(word, wordID)]
| pack word == chunk = do
liftIO . putStrLn $ format (objectId, chunkId, wordID)
zipWithALTO objectId otherChunks
| otherwise =
case stripPrefix (pack word) chunk of
Just stripped -> do
let editedChunks = (chunkId, stripped):otherChunks
liftIO . putStrLn $ format (objectId, chunkId, wordID)
zipWithALTO objectId editedChunks
Nothing -> zipWithALTO objectId chunks
zipWord _
| Text.null $ Text.strip chunk = zipWithALTO objectId otherChunks
| otherwise = throwError $
printf "Could not associate an ALTO_ID to all PDF chunks in object #%d. Remaining : %s\n"
(getId objectId)
(show chunks)
synchronize :: Document -> Collection -> ExceptT String IO ()
synchronize pdf alto =
r UnifiedLayers pdf >>= withFonts extractFromLayer
where
extractFromLayer layer =
r Pages layer >>= mapM_ extractFromPage . Map.toList
extractFromPage (pageNumber, page) = do
altoWords <- liftIO $ getWords alto pageNumber
withResources (flip evalStateT altoWords . extractWithResources) page
extractWithResources page =
r Contents page >>= sequence_ . mapi extractFromObject
extractFromObject objectId content =
lift (r Chunks content) >>= zipWithALTO objectId . Id.toList
processFiles :: FilePath -> FilePath -> IO ()
processFiles inputFile altoDir = do
pdf <- either die return . parseDocument =<< BS.readFile inputFile
alto <- collection altoDir
putStrLn "Object,Instruction,ALTO_ID"
synchronize pdf alto `handle` die
main :: IO ()
main = do
args <- getArgs
case args of
[inputFile, altoDir] ->
processFiles inputFile altoDir
_ -> die "Syntax: pdfcleaner INPUT_PDF_FILE ALTO_DIR"