diff --git a/pdfcleaner.cabal b/pdfcleaner.cabal index cf49054..59681e0 100644 --- a/pdfcleaner.cabal +++ b/pdfcleaner.cabal @@ -50,6 +50,7 @@ executable sync , Hufflepdf , mtl , soprano + , text , xml ghc-options: -Wall hs-source-dirs: src diff --git a/src/ALTO.hs b/src/ALTO.hs index b82bdba..9d677f1 100644 --- a/src/ALTO.hs +++ b/src/ALTO.hs @@ -1,6 +1,7 @@ {-# LANGUAGE NamedFieldPuns #-} module ALTO ( Collection + , ALTO_ID(..) , Words , collection , getWords @@ -20,7 +21,10 @@ data Collection = Collection { path :: FilePath , pages :: Map Int FilePath } -type Words = [(String, String)] +newtype ALTO_ID = ALTO_ID { + altoID :: String + } +type Words = [(String, ALTO_ID)] getWords :: Collection -> Int -> IO Words getWords (Collection {path, pages}) pageNumber = do @@ -31,7 +35,7 @@ getWords (Collection {path, pages}) pageNumber = do t<$>["alto", "Layout", "Page", "PrintSpace", "TextBlock", "TextLine", "String"] wordOfElem element = case (getAttr "CONTENT" element, getAttr "ID" element) of - (Just content, Just elemID) -> return (content, elemID) + (Just content, Just elemID) -> return (content, ALTO_ID elemID) (Nothing, _) -> die $ "Missing CONTENT in word " ++ show element _ -> die $ "Missing ID in word " ++ show element diff --git a/src/Sync.hs b/src/Sync.hs index c73f64d..4da042a 100644 --- a/src/Sync.hs +++ b/src/Sync.hs @@ -1,51 +1,78 @@ {-# LANGUAGE FlexibleContexts #-} module Main where -import ALTO (Collection, collection, getWords) +import ALTO (ALTO_ID(..), Collection, Words, collection, getWords) +import Control.Monad.Except (MonadError(..), ExceptT(..)) import Control.Monad.Except.IOH (handle) -import Control.Monad.IO.Class (liftIO) -import Control.Monad.Reader (runReaderT) +import Control.Monad.IO.Class (MonadIO, liftIO) import Control.Monad.State (MonadState(..), evalStateT) import Control.Monad.Trans (lift) import qualified Data.ByteString.Char8 as BS (readFile) -import qualified Data.ByteString.Lazy as Lazy (writeFile) import Data.Id (Id(..)) +import qualified Data.Id as Id(toList) import qualified Data.Map as Map (toList) import Data.OrderedMap (mapi) -import PDF (Document, UnifiedLayers(..), parseDocument, render) -import PDF.Box (Box(..), at) -import PDF.Content.Operator (Instruction) +import Data.Text (Text, pack, stripPrefix) +import qualified Data.Text as Text (null, strip) +import PDF (Document, UnifiedLayers(..), parseDocument) +import PDF.Box (Box(..)) import PDF.Content.Text (Chunks(..)) -import PDF.Layer (Layer) import PDF.Object (Object) import PDF.Pages (Contents(..), Pages(..), withFonts, withResources) import System.Environment (getArgs) import System.Exit (die) import Text.Printf (printf) -synchronize :: Document -> Collection -> IO [(Id Object, Id Instruction, String)] +type Mapping = (Id Object, Id Text, ALTO_ID) + +format :: Mapping -> String +format (objectId, instructionId, ALTO_ID altoId) = + printf "%d,%d,%s" (getId objectId) (getId instructionId) altoId + +zipWithALTO :: (MonadIO m, MonadError String m, MonadState Words m) => + Id Object -> [(Id Text, Text)]-> m () +zipWithALTO _ [] = return () +zipWithALTO objectId chunks@((chunkId, chunk):otherChunks) = popWord >>= zipWord + where + popWord = state (splitAt 1) + zipWord [(word, wordID)] + | pack word == chunk = do + liftIO . putStrLn $ format (objectId, chunkId, wordID) + zipWithALTO objectId otherChunks + | otherwise = + case stripPrefix (pack word) chunk of + Just stripped -> do + let editedChunks = (chunkId, stripped):otherChunks + liftIO . putStrLn $ format (objectId, chunkId, wordID) + zipWithALTO objectId editedChunks + Nothing -> zipWithALTO objectId chunks + zipWord _ + | Text.null $ Text.strip chunk = zipWithALTO objectId otherChunks + | otherwise = throwError $ + printf "Could not associate an ALTO_ID to all PDF chunks in object #%d. Remaining : %s\n" + (getId objectId) + (show chunks) + +synchronize :: Document -> Collection -> ExceptT String IO () synchronize pdf alto = - (r UnifiedLayers pdf >>= withFonts extractFromLayer) `handle` die + r UnifiedLayers pdf >>= withFonts extractFromLayer where extractFromLayer layer = - r Pages layer >>= fmap concat . mapM extractFromPage . Map.toList + r Pages layer >>= mapM_ extractFromPage . Map.toList extractFromPage (pageNumber, page) = do altoWords <- liftIO $ getWords alto pageNumber withResources (flip evalStateT altoWords . extractWithResources) page extractWithResources page = - r Contents page >>= fmap concat . sequence . mapi extractFromObject + r Contents page >>= sequence_ . mapi extractFromObject extractFromObject objectId content = - lift (r Chunks content) >>= undefined + lift (r Chunks content) >>= zipWithALTO objectId . Id.toList processFiles :: FilePath -> FilePath -> IO () processFiles inputFile altoDir = do pdf <- either die return . parseDocument =<< BS.readFile inputFile alto <- collection altoDir putStrLn "Object,Instruction,ALTO_ID" - mapM_ (putStrLn . format) =<< synchronize pdf alto - where - format (objectId, instructionId, altoId) = - printf "%d,%d,%s" (getId objectId) (getId instructionId) altoId + synchronize pdf alto `handle` die main :: IO () main = do