57 lines
2.1 KiB
Haskell
57 lines
2.1 KiB
Haskell
{-# LANGUAGE FlexibleContexts #-}
|
|
module Main where
|
|
|
|
import ALTO (Collection, collection, getWords)
|
|
import Control.Monad.Except.IOH (handle)
|
|
import Control.Monad.IO.Class (liftIO)
|
|
import Control.Monad.Reader (runReaderT)
|
|
import Control.Monad.State (MonadState(..), evalStateT)
|
|
import Control.Monad.Trans (lift)
|
|
import qualified Data.ByteString.Char8 as BS (readFile)
|
|
import qualified Data.ByteString.Lazy as Lazy (writeFile)
|
|
import Data.Id (Id(..))
|
|
import qualified Data.Map as Map (toList)
|
|
import Data.OrderedMap (mapi)
|
|
import PDF (Document, UnifiedLayers(..), parseDocument, render)
|
|
import PDF.Box (Box(..), at)
|
|
import PDF.Content.Operator (Instruction)
|
|
import PDF.Content.Text (Chunks(..))
|
|
import PDF.Layer (Layer)
|
|
import PDF.Object (Object)
|
|
import PDF.Pages (Contents(..), Pages(..), withFonts, withResources)
|
|
import System.Environment (getArgs)
|
|
import System.Exit (die)
|
|
import Text.Printf (printf)
|
|
|
|
synchronize :: Document -> Collection -> IO [(Id Object, Id Instruction, String)]
|
|
synchronize pdf alto =
|
|
(r UnifiedLayers pdf >>= withFonts extractFromLayer) `handle` die
|
|
where
|
|
extractFromLayer layer =
|
|
r Pages layer >>= fmap concat . mapM extractFromPage . Map.toList
|
|
extractFromPage (pageNumber, page) = do
|
|
altoWords <- liftIO $ getWords alto pageNumber
|
|
withResources (flip evalStateT altoWords . extractWithResources) page
|
|
extractWithResources page =
|
|
r Contents page >>= fmap concat . sequence . mapi extractFromObject
|
|
extractFromObject objectId content =
|
|
lift (r Chunks content) >>= undefined
|
|
|
|
processFiles :: FilePath -> FilePath -> IO ()
|
|
processFiles inputFile altoDir = do
|
|
pdf <- either die return . parseDocument =<< BS.readFile inputFile
|
|
alto <- collection altoDir
|
|
putStrLn "Object,Instruction,ALTO_ID"
|
|
mapM_ (putStrLn . format) =<< synchronize pdf alto
|
|
where
|
|
format (objectId, instructionId, altoId) =
|
|
printf "%d,%d,%s" (getId objectId) (getId instructionId) altoId
|
|
|
|
main :: IO ()
|
|
main = do
|
|
args <- getArgs
|
|
case args of
|
|
[inputFile, altoDir] ->
|
|
processFiles inputFile altoDir
|
|
_ -> die "Syntax: pdfcleaner INPUT_PDF_FILE ALTO_DIR"
|