Implement the output of synchronized terms between PDF and ALTO
This commit is contained in:
parent
1907a03b91
commit
7cd61d6451
3 changed files with 51 additions and 19 deletions
|
@ -50,6 +50,7 @@ executable sync
|
||||||
, Hufflepdf
|
, Hufflepdf
|
||||||
, mtl
|
, mtl
|
||||||
, soprano
|
, soprano
|
||||||
|
, text
|
||||||
, xml
|
, xml
|
||||||
ghc-options: -Wall
|
ghc-options: -Wall
|
||||||
hs-source-dirs: src
|
hs-source-dirs: src
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
{-# LANGUAGE NamedFieldPuns #-}
|
{-# LANGUAGE NamedFieldPuns #-}
|
||||||
module ALTO (
|
module ALTO (
|
||||||
Collection
|
Collection
|
||||||
|
, ALTO_ID(..)
|
||||||
, Words
|
, Words
|
||||||
, collection
|
, collection
|
||||||
, getWords
|
, getWords
|
||||||
|
@ -20,7 +21,10 @@ data Collection = Collection {
|
||||||
path :: FilePath
|
path :: FilePath
|
||||||
, pages :: Map Int FilePath
|
, pages :: Map Int FilePath
|
||||||
}
|
}
|
||||||
type Words = [(String, String)]
|
newtype ALTO_ID = ALTO_ID {
|
||||||
|
altoID :: String
|
||||||
|
}
|
||||||
|
type Words = [(String, ALTO_ID)]
|
||||||
|
|
||||||
getWords :: Collection -> Int -> IO Words
|
getWords :: Collection -> Int -> IO Words
|
||||||
getWords (Collection {path, pages}) pageNumber = do
|
getWords (Collection {path, pages}) pageNumber = do
|
||||||
|
@ -31,7 +35,7 @@ getWords (Collection {path, pages}) pageNumber = do
|
||||||
t<$>["alto", "Layout", "Page", "PrintSpace", "TextBlock", "TextLine", "String"]
|
t<$>["alto", "Layout", "Page", "PrintSpace", "TextBlock", "TextLine", "String"]
|
||||||
wordOfElem element =
|
wordOfElem element =
|
||||||
case (getAttr "CONTENT" element, getAttr "ID" element) of
|
case (getAttr "CONTENT" element, getAttr "ID" element) of
|
||||||
(Just content, Just elemID) -> return (content, elemID)
|
(Just content, Just elemID) -> return (content, ALTO_ID elemID)
|
||||||
(Nothing, _) -> die $ "Missing CONTENT in word " ++ show element
|
(Nothing, _) -> die $ "Missing CONTENT in word " ++ show element
|
||||||
_ -> die $ "Missing ID in word " ++ show element
|
_ -> die $ "Missing ID in word " ++ show element
|
||||||
|
|
||||||
|
|
61
src/Sync.hs
61
src/Sync.hs
|
@ -1,51 +1,78 @@
|
||||||
{-# LANGUAGE FlexibleContexts #-}
|
{-# LANGUAGE FlexibleContexts #-}
|
||||||
module Main where
|
module Main where
|
||||||
|
|
||||||
import ALTO (Collection, collection, getWords)
|
import ALTO (ALTO_ID(..), Collection, Words, collection, getWords)
|
||||||
|
import Control.Monad.Except (MonadError(..), ExceptT(..))
|
||||||
import Control.Monad.Except.IOH (handle)
|
import Control.Monad.Except.IOH (handle)
|
||||||
import Control.Monad.IO.Class (liftIO)
|
import Control.Monad.IO.Class (MonadIO, liftIO)
|
||||||
import Control.Monad.Reader (runReaderT)
|
|
||||||
import Control.Monad.State (MonadState(..), evalStateT)
|
import Control.Monad.State (MonadState(..), evalStateT)
|
||||||
import Control.Monad.Trans (lift)
|
import Control.Monad.Trans (lift)
|
||||||
import qualified Data.ByteString.Char8 as BS (readFile)
|
import qualified Data.ByteString.Char8 as BS (readFile)
|
||||||
import qualified Data.ByteString.Lazy as Lazy (writeFile)
|
|
||||||
import Data.Id (Id(..))
|
import Data.Id (Id(..))
|
||||||
|
import qualified Data.Id as Id(toList)
|
||||||
import qualified Data.Map as Map (toList)
|
import qualified Data.Map as Map (toList)
|
||||||
import Data.OrderedMap (mapi)
|
import Data.OrderedMap (mapi)
|
||||||
import PDF (Document, UnifiedLayers(..), parseDocument, render)
|
import Data.Text (Text, pack, stripPrefix)
|
||||||
import PDF.Box (Box(..), at)
|
import qualified Data.Text as Text (null, strip)
|
||||||
import PDF.Content.Operator (Instruction)
|
import PDF (Document, UnifiedLayers(..), parseDocument)
|
||||||
|
import PDF.Box (Box(..))
|
||||||
import PDF.Content.Text (Chunks(..))
|
import PDF.Content.Text (Chunks(..))
|
||||||
import PDF.Layer (Layer)
|
|
||||||
import PDF.Object (Object)
|
import PDF.Object (Object)
|
||||||
import PDF.Pages (Contents(..), Pages(..), withFonts, withResources)
|
import PDF.Pages (Contents(..), Pages(..), withFonts, withResources)
|
||||||
import System.Environment (getArgs)
|
import System.Environment (getArgs)
|
||||||
import System.Exit (die)
|
import System.Exit (die)
|
||||||
import Text.Printf (printf)
|
import Text.Printf (printf)
|
||||||
|
|
||||||
synchronize :: Document -> Collection -> IO [(Id Object, Id Instruction, String)]
|
type Mapping = (Id Object, Id Text, ALTO_ID)
|
||||||
|
|
||||||
|
format :: Mapping -> String
|
||||||
|
format (objectId, instructionId, ALTO_ID altoId) =
|
||||||
|
printf "%d,%d,%s" (getId objectId) (getId instructionId) altoId
|
||||||
|
|
||||||
|
zipWithALTO :: (MonadIO m, MonadError String m, MonadState Words m) =>
|
||||||
|
Id Object -> [(Id Text, Text)]-> m ()
|
||||||
|
zipWithALTO _ [] = return ()
|
||||||
|
zipWithALTO objectId chunks@((chunkId, chunk):otherChunks) = popWord >>= zipWord
|
||||||
|
where
|
||||||
|
popWord = state (splitAt 1)
|
||||||
|
zipWord [(word, wordID)]
|
||||||
|
| pack word == chunk = do
|
||||||
|
liftIO . putStrLn $ format (objectId, chunkId, wordID)
|
||||||
|
zipWithALTO objectId otherChunks
|
||||||
|
| otherwise =
|
||||||
|
case stripPrefix (pack word) chunk of
|
||||||
|
Just stripped -> do
|
||||||
|
let editedChunks = (chunkId, stripped):otherChunks
|
||||||
|
liftIO . putStrLn $ format (objectId, chunkId, wordID)
|
||||||
|
zipWithALTO objectId editedChunks
|
||||||
|
Nothing -> zipWithALTO objectId chunks
|
||||||
|
zipWord _
|
||||||
|
| Text.null $ Text.strip chunk = zipWithALTO objectId otherChunks
|
||||||
|
| otherwise = throwError $
|
||||||
|
printf "Could not associate an ALTO_ID to all PDF chunks in object #%d. Remaining : %s\n"
|
||||||
|
(getId objectId)
|
||||||
|
(show chunks)
|
||||||
|
|
||||||
|
synchronize :: Document -> Collection -> ExceptT String IO ()
|
||||||
synchronize pdf alto =
|
synchronize pdf alto =
|
||||||
(r UnifiedLayers pdf >>= withFonts extractFromLayer) `handle` die
|
r UnifiedLayers pdf >>= withFonts extractFromLayer
|
||||||
where
|
where
|
||||||
extractFromLayer layer =
|
extractFromLayer layer =
|
||||||
r Pages layer >>= fmap concat . mapM extractFromPage . Map.toList
|
r Pages layer >>= mapM_ extractFromPage . Map.toList
|
||||||
extractFromPage (pageNumber, page) = do
|
extractFromPage (pageNumber, page) = do
|
||||||
altoWords <- liftIO $ getWords alto pageNumber
|
altoWords <- liftIO $ getWords alto pageNumber
|
||||||
withResources (flip evalStateT altoWords . extractWithResources) page
|
withResources (flip evalStateT altoWords . extractWithResources) page
|
||||||
extractWithResources page =
|
extractWithResources page =
|
||||||
r Contents page >>= fmap concat . sequence . mapi extractFromObject
|
r Contents page >>= sequence_ . mapi extractFromObject
|
||||||
extractFromObject objectId content =
|
extractFromObject objectId content =
|
||||||
lift (r Chunks content) >>= undefined
|
lift (r Chunks content) >>= zipWithALTO objectId . Id.toList
|
||||||
|
|
||||||
processFiles :: FilePath -> FilePath -> IO ()
|
processFiles :: FilePath -> FilePath -> IO ()
|
||||||
processFiles inputFile altoDir = do
|
processFiles inputFile altoDir = do
|
||||||
pdf <- either die return . parseDocument =<< BS.readFile inputFile
|
pdf <- either die return . parseDocument =<< BS.readFile inputFile
|
||||||
alto <- collection altoDir
|
alto <- collection altoDir
|
||||||
putStrLn "Object,Instruction,ALTO_ID"
|
putStrLn "Object,Instruction,ALTO_ID"
|
||||||
mapM_ (putStrLn . format) =<< synchronize pdf alto
|
synchronize pdf alto `handle` die
|
||||||
where
|
|
||||||
format (objectId, instructionId, altoId) =
|
|
||||||
printf "%d,%d,%s" (getId objectId) (getId instructionId) altoId
|
|
||||||
|
|
||||||
main :: IO ()
|
main :: IO ()
|
||||||
main = do
|
main = do
|
||||||
|
|
Loading…
Reference in a new issue