Implement the output of synchronized terms between PDF and ALTO

This commit is contained in:
Tissevert 2020-06-03 15:15:41 +02:00
parent 1907a03b91
commit 7cd61d6451
3 changed files with 51 additions and 19 deletions

View file

@ -50,6 +50,7 @@ executable sync
, Hufflepdf
, mtl
, soprano
, text
, xml
ghc-options: -Wall
hs-source-dirs: src

View file

@ -1,6 +1,7 @@
{-# LANGUAGE NamedFieldPuns #-}
module ALTO (
Collection
, ALTO_ID(..)
, Words
, collection
, getWords
@ -20,7 +21,10 @@ data Collection = Collection {
path :: FilePath
, pages :: Map Int FilePath
}
type Words = [(String, String)]
newtype ALTO_ID = ALTO_ID {
altoID :: String
}
type Words = [(String, ALTO_ID)]
getWords :: Collection -> Int -> IO Words
getWords (Collection {path, pages}) pageNumber = do
@ -31,7 +35,7 @@ getWords (Collection {path, pages}) pageNumber = do
t<$>["alto", "Layout", "Page", "PrintSpace", "TextBlock", "TextLine", "String"]
wordOfElem element =
case (getAttr "CONTENT" element, getAttr "ID" element) of
(Just content, Just elemID) -> return (content, elemID)
(Just content, Just elemID) -> return (content, ALTO_ID elemID)
(Nothing, _) -> die $ "Missing CONTENT in word " ++ show element
_ -> die $ "Missing ID in word " ++ show element

View file

@ -1,51 +1,78 @@
{-# LANGUAGE FlexibleContexts #-}
module Main where
import ALTO (Collection, collection, getWords)
import ALTO (ALTO_ID(..), Collection, Words, collection, getWords)
import Control.Monad.Except (MonadError(..), ExceptT(..))
import Control.Monad.Except.IOH (handle)
import Control.Monad.IO.Class (liftIO)
import Control.Monad.Reader (runReaderT)
import Control.Monad.IO.Class (MonadIO, liftIO)
import Control.Monad.State (MonadState(..), evalStateT)
import Control.Monad.Trans (lift)
import qualified Data.ByteString.Char8 as BS (readFile)
import qualified Data.ByteString.Lazy as Lazy (writeFile)
import Data.Id (Id(..))
import qualified Data.Id as Id(toList)
import qualified Data.Map as Map (toList)
import Data.OrderedMap (mapi)
import PDF (Document, UnifiedLayers(..), parseDocument, render)
import PDF.Box (Box(..), at)
import PDF.Content.Operator (Instruction)
import Data.Text (Text, pack, stripPrefix)
import qualified Data.Text as Text (null, strip)
import PDF (Document, UnifiedLayers(..), parseDocument)
import PDF.Box (Box(..))
import PDF.Content.Text (Chunks(..))
import PDF.Layer (Layer)
import PDF.Object (Object)
import PDF.Pages (Contents(..), Pages(..), withFonts, withResources)
import System.Environment (getArgs)
import System.Exit (die)
import Text.Printf (printf)
synchronize :: Document -> Collection -> IO [(Id Object, Id Instruction, String)]
type Mapping = (Id Object, Id Text, ALTO_ID)
format :: Mapping -> String
format (objectId, instructionId, ALTO_ID altoId) =
printf "%d,%d,%s" (getId objectId) (getId instructionId) altoId
zipWithALTO :: (MonadIO m, MonadError String m, MonadState Words m) =>
Id Object -> [(Id Text, Text)]-> m ()
zipWithALTO _ [] = return ()
zipWithALTO objectId chunks@((chunkId, chunk):otherChunks) = popWord >>= zipWord
where
popWord = state (splitAt 1)
zipWord [(word, wordID)]
| pack word == chunk = do
liftIO . putStrLn $ format (objectId, chunkId, wordID)
zipWithALTO objectId otherChunks
| otherwise =
case stripPrefix (pack word) chunk of
Just stripped -> do
let editedChunks = (chunkId, stripped):otherChunks
liftIO . putStrLn $ format (objectId, chunkId, wordID)
zipWithALTO objectId editedChunks
Nothing -> zipWithALTO objectId chunks
zipWord _
| Text.null $ Text.strip chunk = zipWithALTO objectId otherChunks
| otherwise = throwError $
printf "Could not associate an ALTO_ID to all PDF chunks in object #%d. Remaining : %s\n"
(getId objectId)
(show chunks)
synchronize :: Document -> Collection -> ExceptT String IO ()
synchronize pdf alto =
(r UnifiedLayers pdf >>= withFonts extractFromLayer) `handle` die
r UnifiedLayers pdf >>= withFonts extractFromLayer
where
extractFromLayer layer =
r Pages layer >>= fmap concat . mapM extractFromPage . Map.toList
r Pages layer >>= mapM_ extractFromPage . Map.toList
extractFromPage (pageNumber, page) = do
altoWords <- liftIO $ getWords alto pageNumber
withResources (flip evalStateT altoWords . extractWithResources) page
extractWithResources page =
r Contents page >>= fmap concat . sequence . mapi extractFromObject
r Contents page >>= sequence_ . mapi extractFromObject
extractFromObject objectId content =
lift (r Chunks content) >>= undefined
lift (r Chunks content) >>= zipWithALTO objectId . Id.toList
processFiles :: FilePath -> FilePath -> IO ()
processFiles inputFile altoDir = do
pdf <- either die return . parseDocument =<< BS.readFile inputFile
alto <- collection altoDir
putStrLn "Object,Instruction,ALTO_ID"
mapM_ (putStrLn . format) =<< synchronize pdf alto
where
format (objectId, instructionId, altoId) =
printf "%d,%d,%s" (getId objectId) (getId instructionId) altoId
synchronize pdf alto `handle` die
main :: IO ()
main = do