Implement the output of synchronized terms between PDF and ALTO
This commit is contained in:
parent
1907a03b91
commit
7cd61d6451
3 changed files with 51 additions and 19 deletions
|
@ -50,6 +50,7 @@ executable sync
|
|||
, Hufflepdf
|
||||
, mtl
|
||||
, soprano
|
||||
, text
|
||||
, xml
|
||||
ghc-options: -Wall
|
||||
hs-source-dirs: src
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
{-# LANGUAGE NamedFieldPuns #-}
|
||||
module ALTO (
|
||||
Collection
|
||||
, ALTO_ID(..)
|
||||
, Words
|
||||
, collection
|
||||
, getWords
|
||||
|
@ -20,7 +21,10 @@ data Collection = Collection {
|
|||
path :: FilePath
|
||||
, pages :: Map Int FilePath
|
||||
}
|
||||
type Words = [(String, String)]
|
||||
newtype ALTO_ID = ALTO_ID {
|
||||
altoID :: String
|
||||
}
|
||||
type Words = [(String, ALTO_ID)]
|
||||
|
||||
getWords :: Collection -> Int -> IO Words
|
||||
getWords (Collection {path, pages}) pageNumber = do
|
||||
|
@ -31,7 +35,7 @@ getWords (Collection {path, pages}) pageNumber = do
|
|||
t<$>["alto", "Layout", "Page", "PrintSpace", "TextBlock", "TextLine", "String"]
|
||||
wordOfElem element =
|
||||
case (getAttr "CONTENT" element, getAttr "ID" element) of
|
||||
(Just content, Just elemID) -> return (content, elemID)
|
||||
(Just content, Just elemID) -> return (content, ALTO_ID elemID)
|
||||
(Nothing, _) -> die $ "Missing CONTENT in word " ++ show element
|
||||
_ -> die $ "Missing ID in word " ++ show element
|
||||
|
||||
|
|
61
src/Sync.hs
61
src/Sync.hs
|
@ -1,51 +1,78 @@
|
|||
{-# LANGUAGE FlexibleContexts #-}
|
||||
module Main where
|
||||
|
||||
import ALTO (Collection, collection, getWords)
|
||||
import ALTO (ALTO_ID(..), Collection, Words, collection, getWords)
|
||||
import Control.Monad.Except (MonadError(..), ExceptT(..))
|
||||
import Control.Monad.Except.IOH (handle)
|
||||
import Control.Monad.IO.Class (liftIO)
|
||||
import Control.Monad.Reader (runReaderT)
|
||||
import Control.Monad.IO.Class (MonadIO, liftIO)
|
||||
import Control.Monad.State (MonadState(..), evalStateT)
|
||||
import Control.Monad.Trans (lift)
|
||||
import qualified Data.ByteString.Char8 as BS (readFile)
|
||||
import qualified Data.ByteString.Lazy as Lazy (writeFile)
|
||||
import Data.Id (Id(..))
|
||||
import qualified Data.Id as Id(toList)
|
||||
import qualified Data.Map as Map (toList)
|
||||
import Data.OrderedMap (mapi)
|
||||
import PDF (Document, UnifiedLayers(..), parseDocument, render)
|
||||
import PDF.Box (Box(..), at)
|
||||
import PDF.Content.Operator (Instruction)
|
||||
import Data.Text (Text, pack, stripPrefix)
|
||||
import qualified Data.Text as Text (null, strip)
|
||||
import PDF (Document, UnifiedLayers(..), parseDocument)
|
||||
import PDF.Box (Box(..))
|
||||
import PDF.Content.Text (Chunks(..))
|
||||
import PDF.Layer (Layer)
|
||||
import PDF.Object (Object)
|
||||
import PDF.Pages (Contents(..), Pages(..), withFonts, withResources)
|
||||
import System.Environment (getArgs)
|
||||
import System.Exit (die)
|
||||
import Text.Printf (printf)
|
||||
|
||||
synchronize :: Document -> Collection -> IO [(Id Object, Id Instruction, String)]
|
||||
type Mapping = (Id Object, Id Text, ALTO_ID)
|
||||
|
||||
format :: Mapping -> String
|
||||
format (objectId, instructionId, ALTO_ID altoId) =
|
||||
printf "%d,%d,%s" (getId objectId) (getId instructionId) altoId
|
||||
|
||||
zipWithALTO :: (MonadIO m, MonadError String m, MonadState Words m) =>
|
||||
Id Object -> [(Id Text, Text)]-> m ()
|
||||
zipWithALTO _ [] = return ()
|
||||
zipWithALTO objectId chunks@((chunkId, chunk):otherChunks) = popWord >>= zipWord
|
||||
where
|
||||
popWord = state (splitAt 1)
|
||||
zipWord [(word, wordID)]
|
||||
| pack word == chunk = do
|
||||
liftIO . putStrLn $ format (objectId, chunkId, wordID)
|
||||
zipWithALTO objectId otherChunks
|
||||
| otherwise =
|
||||
case stripPrefix (pack word) chunk of
|
||||
Just stripped -> do
|
||||
let editedChunks = (chunkId, stripped):otherChunks
|
||||
liftIO . putStrLn $ format (objectId, chunkId, wordID)
|
||||
zipWithALTO objectId editedChunks
|
||||
Nothing -> zipWithALTO objectId chunks
|
||||
zipWord _
|
||||
| Text.null $ Text.strip chunk = zipWithALTO objectId otherChunks
|
||||
| otherwise = throwError $
|
||||
printf "Could not associate an ALTO_ID to all PDF chunks in object #%d. Remaining : %s\n"
|
||||
(getId objectId)
|
||||
(show chunks)
|
||||
|
||||
synchronize :: Document -> Collection -> ExceptT String IO ()
|
||||
synchronize pdf alto =
|
||||
(r UnifiedLayers pdf >>= withFonts extractFromLayer) `handle` die
|
||||
r UnifiedLayers pdf >>= withFonts extractFromLayer
|
||||
where
|
||||
extractFromLayer layer =
|
||||
r Pages layer >>= fmap concat . mapM extractFromPage . Map.toList
|
||||
r Pages layer >>= mapM_ extractFromPage . Map.toList
|
||||
extractFromPage (pageNumber, page) = do
|
||||
altoWords <- liftIO $ getWords alto pageNumber
|
||||
withResources (flip evalStateT altoWords . extractWithResources) page
|
||||
extractWithResources page =
|
||||
r Contents page >>= fmap concat . sequence . mapi extractFromObject
|
||||
r Contents page >>= sequence_ . mapi extractFromObject
|
||||
extractFromObject objectId content =
|
||||
lift (r Chunks content) >>= undefined
|
||||
lift (r Chunks content) >>= zipWithALTO objectId . Id.toList
|
||||
|
||||
processFiles :: FilePath -> FilePath -> IO ()
|
||||
processFiles inputFile altoDir = do
|
||||
pdf <- either die return . parseDocument =<< BS.readFile inputFile
|
||||
alto <- collection altoDir
|
||||
putStrLn "Object,Instruction,ALTO_ID"
|
||||
mapM_ (putStrLn . format) =<< synchronize pdf alto
|
||||
where
|
||||
format (objectId, instructionId, altoId) =
|
||||
printf "%d,%d,%s" (getId objectId) (getId instructionId) altoId
|
||||
synchronize pdf alto `handle` die
|
||||
|
||||
main :: IO ()
|
||||
main = do
|
||||
|
|
Loading…
Reference in a new issue