84 lines
3.1 KiB
Haskell
84 lines
3.1 KiB
Haskell
{-# LANGUAGE FlexibleContexts #-}
|
|
module Main where
|
|
|
|
import ALTO (ALTO_ID(..), Collection, Words, collection, getWords)
|
|
import Control.Monad.Except (MonadError(..), ExceptT(..))
|
|
import Control.Monad.Except.IOH (handle)
|
|
import Control.Monad.IO.Class (MonadIO, liftIO)
|
|
import Control.Monad.State (MonadState(..), evalStateT)
|
|
import Control.Monad.Trans (lift)
|
|
import qualified Data.ByteString.Char8 as BS (readFile)
|
|
import Data.Id (Id(..))
|
|
import qualified Data.Id as Id(toList)
|
|
import qualified Data.Map as Map (toList)
|
|
import Data.OrderedMap (mapi)
|
|
import Data.Text (Text, pack, stripPrefix)
|
|
import qualified Data.Text as Text (null, strip)
|
|
import PDF (Document, UnifiedLayers(..), parseDocument)
|
|
import PDF.Box (Box(..))
|
|
import PDF.Content.Text (Chunks(..))
|
|
import PDF.Object (Object)
|
|
import PDF.Pages (Contents(..), Pages(..), withFonts, withResources)
|
|
import System.Environment (getArgs)
|
|
import System.Exit (die)
|
|
import Text.Printf (printf)
|
|
|
|
type Mapping = (Id Object, Id Text, ALTO_ID)
|
|
|
|
format :: Mapping -> String
|
|
format (objectId, instructionId, ALTO_ID altoId) =
|
|
printf "%d,%d,%s" (getId objectId) (getId instructionId) altoId
|
|
|
|
zipWithALTO :: (MonadIO m, MonadError String m, MonadState Words m) =>
|
|
Id Object -> [(Id Text, Text)]-> m ()
|
|
zipWithALTO _ [] = return ()
|
|
zipWithALTO objectId chunks@((chunkId, chunk):otherChunks) = popWord >>= zipWord
|
|
where
|
|
popWord = state (splitAt 1)
|
|
zipWord [(word, wordID)]
|
|
| pack word == chunk = do
|
|
liftIO . putStrLn $ format (objectId, chunkId, wordID)
|
|
zipWithALTO objectId otherChunks
|
|
| otherwise =
|
|
case stripPrefix (pack word) chunk of
|
|
Just stripped -> do
|
|
let editedChunks = (chunkId, stripped):otherChunks
|
|
liftIO . putStrLn $ format (objectId, chunkId, wordID)
|
|
zipWithALTO objectId editedChunks
|
|
Nothing -> zipWithALTO objectId chunks
|
|
zipWord _
|
|
| Text.null $ Text.strip chunk = zipWithALTO objectId otherChunks
|
|
| otherwise = throwError $
|
|
printf "Could not associate an ALTO_ID to all PDF chunks in object #%d. Remaining : %s\n"
|
|
(getId objectId)
|
|
(show chunks)
|
|
|
|
synchronize :: Document -> Collection -> ExceptT String IO ()
|
|
synchronize pdf alto =
|
|
r UnifiedLayers pdf >>= withFonts extractFromLayer
|
|
where
|
|
extractFromLayer layer =
|
|
r Pages layer >>= mapM_ extractFromPage . Map.toList
|
|
extractFromPage (pageNumber, page) = do
|
|
altoWords <- liftIO $ getWords alto pageNumber
|
|
withResources (flip evalStateT altoWords . extractWithResources) page
|
|
extractWithResources page =
|
|
r Contents page >>= sequence_ . mapi extractFromObject
|
|
extractFromObject objectId content =
|
|
lift (r Chunks content) >>= zipWithALTO objectId . Id.toList
|
|
|
|
processFiles :: FilePath -> FilePath -> IO ()
|
|
processFiles inputFile altoDir = do
|
|
pdf <- either die return . parseDocument =<< BS.readFile inputFile
|
|
alto <- collection altoDir
|
|
putStrLn "Object,Instruction,ALTO_ID"
|
|
synchronize pdf alto `handle` die
|
|
|
|
main :: IO ()
|
|
main = do
|
|
args <- getArgs
|
|
case args of
|
|
[inputFile, altoDir] ->
|
|
processFiles inputFile altoDir
|
|
_ -> die "Syntax: pdfcleaner INPUT_PDF_FILE ALTO_DIR"
|