Implement the output of synchronized terms between PDF and ALTO

This commit is contained in:
Tissevert 2020-06-03 15:15:41 +02:00
parent 1907a03b91
commit 7cd61d6451
3 changed files with 51 additions and 19 deletions

View File

@ -50,6 +50,7 @@ executable sync
, Hufflepdf , Hufflepdf
, mtl , mtl
, soprano , soprano
, text
, xml , xml
ghc-options: -Wall ghc-options: -Wall
hs-source-dirs: src hs-source-dirs: src

View File

@ -1,6 +1,7 @@
{-# LANGUAGE NamedFieldPuns #-} {-# LANGUAGE NamedFieldPuns #-}
module ALTO ( module ALTO (
Collection Collection
, ALTO_ID(..)
, Words , Words
, collection , collection
, getWords , getWords
@ -20,7 +21,10 @@ data Collection = Collection {
path :: FilePath path :: FilePath
, pages :: Map Int FilePath , pages :: Map Int FilePath
} }
type Words = [(String, String)] newtype ALTO_ID = ALTO_ID {
altoID :: String
}
type Words = [(String, ALTO_ID)]
getWords :: Collection -> Int -> IO Words getWords :: Collection -> Int -> IO Words
getWords (Collection {path, pages}) pageNumber = do getWords (Collection {path, pages}) pageNumber = do
@ -31,7 +35,7 @@ getWords (Collection {path, pages}) pageNumber = do
t<$>["alto", "Layout", "Page", "PrintSpace", "TextBlock", "TextLine", "String"] t<$>["alto", "Layout", "Page", "PrintSpace", "TextBlock", "TextLine", "String"]
wordOfElem element = wordOfElem element =
case (getAttr "CONTENT" element, getAttr "ID" element) of case (getAttr "CONTENT" element, getAttr "ID" element) of
(Just content, Just elemID) -> return (content, elemID) (Just content, Just elemID) -> return (content, ALTO_ID elemID)
(Nothing, _) -> die $ "Missing CONTENT in word " ++ show element (Nothing, _) -> die $ "Missing CONTENT in word " ++ show element
_ -> die $ "Missing ID in word " ++ show element _ -> die $ "Missing ID in word " ++ show element

View File

@ -1,51 +1,78 @@
{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE FlexibleContexts #-}
module Main where module Main where
import ALTO (Collection, collection, getWords) import ALTO (ALTO_ID(..), Collection, Words, collection, getWords)
import Control.Monad.Except (MonadError(..), ExceptT(..))
import Control.Monad.Except.IOH (handle) import Control.Monad.Except.IOH (handle)
import Control.Monad.IO.Class (liftIO) import Control.Monad.IO.Class (MonadIO, liftIO)
import Control.Monad.Reader (runReaderT)
import Control.Monad.State (MonadState(..), evalStateT) import Control.Monad.State (MonadState(..), evalStateT)
import Control.Monad.Trans (lift) import Control.Monad.Trans (lift)
import qualified Data.ByteString.Char8 as BS (readFile) import qualified Data.ByteString.Char8 as BS (readFile)
import qualified Data.ByteString.Lazy as Lazy (writeFile)
import Data.Id (Id(..)) import Data.Id (Id(..))
import qualified Data.Id as Id(toList)
import qualified Data.Map as Map (toList) import qualified Data.Map as Map (toList)
import Data.OrderedMap (mapi) import Data.OrderedMap (mapi)
import PDF (Document, UnifiedLayers(..), parseDocument, render) import Data.Text (Text, pack, stripPrefix)
import PDF.Box (Box(..), at) import qualified Data.Text as Text (null, strip)
import PDF.Content.Operator (Instruction) import PDF (Document, UnifiedLayers(..), parseDocument)
import PDF.Box (Box(..))
import PDF.Content.Text (Chunks(..)) import PDF.Content.Text (Chunks(..))
import PDF.Layer (Layer)
import PDF.Object (Object) import PDF.Object (Object)
import PDF.Pages (Contents(..), Pages(..), withFonts, withResources) import PDF.Pages (Contents(..), Pages(..), withFonts, withResources)
import System.Environment (getArgs) import System.Environment (getArgs)
import System.Exit (die) import System.Exit (die)
import Text.Printf (printf) import Text.Printf (printf)
synchronize :: Document -> Collection -> IO [(Id Object, Id Instruction, String)] type Mapping = (Id Object, Id Text, ALTO_ID)
format :: Mapping -> String
format (objectId, instructionId, ALTO_ID altoId) =
printf "%d,%d,%s" (getId objectId) (getId instructionId) altoId
zipWithALTO :: (MonadIO m, MonadError String m, MonadState Words m) =>
Id Object -> [(Id Text, Text)]-> m ()
zipWithALTO _ [] = return ()
zipWithALTO objectId chunks@((chunkId, chunk):otherChunks) = popWord >>= zipWord
where
popWord = state (splitAt 1)
zipWord [(word, wordID)]
| pack word == chunk = do
liftIO . putStrLn $ format (objectId, chunkId, wordID)
zipWithALTO objectId otherChunks
| otherwise =
case stripPrefix (pack word) chunk of
Just stripped -> do
let editedChunks = (chunkId, stripped):otherChunks
liftIO . putStrLn $ format (objectId, chunkId, wordID)
zipWithALTO objectId editedChunks
Nothing -> zipWithALTO objectId chunks
zipWord _
| Text.null $ Text.strip chunk = zipWithALTO objectId otherChunks
| otherwise = throwError $
printf "Could not associate an ALTO_ID to all PDF chunks in object #%d. Remaining : %s\n"
(getId objectId)
(show chunks)
synchronize :: Document -> Collection -> ExceptT String IO ()
synchronize pdf alto = synchronize pdf alto =
(r UnifiedLayers pdf >>= withFonts extractFromLayer) `handle` die r UnifiedLayers pdf >>= withFonts extractFromLayer
where where
extractFromLayer layer = extractFromLayer layer =
r Pages layer >>= fmap concat . mapM extractFromPage . Map.toList r Pages layer >>= mapM_ extractFromPage . Map.toList
extractFromPage (pageNumber, page) = do extractFromPage (pageNumber, page) = do
altoWords <- liftIO $ getWords alto pageNumber altoWords <- liftIO $ getWords alto pageNumber
withResources (flip evalStateT altoWords . extractWithResources) page withResources (flip evalStateT altoWords . extractWithResources) page
extractWithResources page = extractWithResources page =
r Contents page >>= fmap concat . sequence . mapi extractFromObject r Contents page >>= sequence_ . mapi extractFromObject
extractFromObject objectId content = extractFromObject objectId content =
lift (r Chunks content) >>= undefined lift (r Chunks content) >>= zipWithALTO objectId . Id.toList
processFiles :: FilePath -> FilePath -> IO () processFiles :: FilePath -> FilePath -> IO ()
processFiles inputFile altoDir = do processFiles inputFile altoDir = do
pdf <- either die return . parseDocument =<< BS.readFile inputFile pdf <- either die return . parseDocument =<< BS.readFile inputFile
alto <- collection altoDir alto <- collection altoDir
putStrLn "Object,Instruction,ALTO_ID" putStrLn "Object,Instruction,ALTO_ID"
mapM_ (putStrLn . format) =<< synchronize pdf alto synchronize pdf alto `handle` die
where
format (objectId, instructionId, altoId) =
printf "%d,%d,%s" (getId objectId) (getId instructionId) altoId
main :: IO () main :: IO ()
main = do main = do