Initial state: a base structure for a «zipper» that finds the correspondences between PDF coordinates and the ALTO IDs of the text chunks

This commit is contained in:
Tissevert 2020-05-29 22:03:54 +02:00
commit 1907a03b91
9 changed files with 267 additions and 0 deletions

24
.gitignore vendored Normal file
View file

@ -0,0 +1,24 @@
# ---> Haskell
dist
dist-*
cabal-dev
*.o
*.hi
*.chi
*.chs.h
*.dyn_o
*.dyn_hi
.hpc
.hsenv
.cabal-sandbox/
cabal.sandbox.config
*.prof
*.aux
*.hp
*.eventlog
.stack-work/
cabal.project.local
cabal.project.local~
.HTF/
.ghc.environment.*

5
CHANGELOG.md Normal file
View file

@ -0,0 +1,5 @@
# Revision history for pdfcleaner
## 0.1.0.0 -- YYYY-mm-dd
* First version. Released on an unsuspecting world.

30
LICENSE Normal file
View file

@ -0,0 +1,30 @@
Copyright (c) 2020, Tissevert
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following
disclaimer in the documentation and/or other materials provided
with the distribution.
* Neither the name of Tissevert nor the names of other
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

2
Setup.hs Normal file
View file

@ -0,0 +1,2 @@
import Distribution.Simple
-- | Entry point of the Cabal setup script: hands control to 'defaultMain'.
main :: IO ()
main = defaultMain

56
pdfcleaner.cabal Normal file
View file

@ -0,0 +1,56 @@
cabal-version: >=1.10
-- Initial package description 'chaoui-pdf.cabal' generated by 'cabal
-- init'. For further documentation, see
-- http://haskell.org/cabal/users-guide/
name: pdfcleaner
version: 0.1.0.0
synopsis: A program to purge blacklisted words from a PDF
description: A tool to replicate modifications on ALTO files on PDF files
homepage: https://git.marvid.fr/Tissevert/pdfcleaner
-- bug-reports:
license: BSD3
license-file: LICENSE
author: Tissevert
maintainer: tissevert+devel@marvid.fr
-- copyright:
category: Text
build-type: Simple
extra-source-files: CHANGELOG.md
executable pdfcleaner
main-is: Main.hs
other-modules: ALTO
, Scoria
-- other-extensions:
build-depends: base >=4.12 && <4.13
, bytestring
, containers
, directory
, filepath
, ExceptIOH
, Hufflepdf
, mtl
, soprano
, xml
ghc-options: -Wall
hs-source-dirs: src
default-language: Haskell2010
executable sync
main-is: Sync.hs
other-modules: ALTO
-- other-extensions:
build-depends: base >=4.12 && <4.13
, bytestring
, containers
, directory
, filepath
, ExceptIOH
, Hufflepdf
, mtl
, soprano
, xml
ghc-options: -Wall
hs-source-dirs: src
default-language: Haskell2010

45
src/ALTO.hs Normal file
View file

@ -0,0 +1,45 @@
{-# LANGUAGE NamedFieldPuns #-}
module ALTO (
Collection
, Words
, collection
, getWords
) where
import Data.List (sortOn)
import Data.Map (Map, (!), fromList)
import qualified Data.Map as Map (lookup)
import System.Directory (listDirectory)
import System.Exit (die)
import System.FilePath ((</>))
import Text.Read (readMaybe)
import Text.XML.Light (parseXML)
import Text.XML.Light.Extra (getAttr)
import Text.XML.Light.XPath (t, xPath)
-- | A set of per-page files living in one directory.
data Collection = Collection
  { -- | Directory against which the page file names are resolved.
    path :: FilePath
  , -- | File name of each page, keyed by page number.
    pages :: Map Int FilePath
  }
-- | The words read from one page: each pair holds the CONTENT attribute
-- followed by the ID attribute of an ALTO String element.
type Words = [(String, String)]
-- | Read the words of one page of a collection.
--
-- The file registered for @pageNumber@ is read from the collection's
-- directory and parsed as XML. Every element selected by the path
-- alto, Layout, Page, PrintSpace, TextBlock, TextLine, String is then turned
-- into a @(CONTENT, ID)@ pair.
--
-- Terminates the program through 'die' when the collection has no file for
-- the requested page, or when a selected element lacks its CONTENT or ID
-- attribute.
getWords :: Collection -> Int -> IO Words
getWords (Collection {path, pages}) pageNumber =
  -- A total lookup: a page missing from the collection is reported like the
  -- other input errors instead of crashing inside the map indexing operator.
  case Map.lookup pageNumber pages of
    Nothing ->
      die $ "No ALTO file for page " ++ show pageNumber ++ " in " ++ path
    Just file -> do
      alto <- parseXML <$> readFile (path </> file)
      mapM wordOfElem $ xPath stringPath alto
  where
    stringPath =
      t <$> ["alto", "Layout", "Page", "PrintSpace", "TextBlock", "TextLine", "String"]
    wordOfElem element =
      case (getAttr "CONTENT" element, getAttr "ID" element) of
        (Just content, Just elemID) -> return (content, elemID)
        (Nothing, _) -> die $ "Missing CONTENT in word " ++ show element
        _ -> die $ "Missing ID in word " ++ show element
-- | Index the entries of a directory by page number.
--
-- An entry is kept only when its name is a @p@ followed by text that
-- 'readMaybe' accepts as a number, that text being cut at the first dot.
-- Every other entry is ignored.
collection :: FilePath -> IO Collection
collection directory = do
  files <- listDirectory directory
  return . Collection directory . fromList . sortOn snd $ concatMap ranked files
  where
    -- Zero or one (page number, file name) pair for a directory entry.
    -- The pairs are sorted by file name before building the map, so the
    -- outcome does not depend on the order the directory was listed in.
    ranked file = case file of
      'p' : rest ->
        maybe [] (\rank -> [(rank, file)]) (readMaybe (takeWhile (/= '.') rest))
      _ -> []

38
src/Main.hs Normal file
View file

@ -0,0 +1,38 @@
module Main where
import ALTO (Collection, collection)
import Control.Monad.Except.IOH (handle)
import Control.Monad.State (MonadState(..), execStateT)
import qualified Data.ByteString.Char8 as BS (readFile)
import qualified Data.ByteString.Lazy as Lazy (writeFile)
import qualified Data.Map as Map (keys)
import PDF (Document, UnifiedLayers(..), parseDocument, render)
import PDF.Box (Box(..), at)
import PDF.Pages (Pages(..), withFonts)
import Scoria (Scoriae)
import qualified Scoria (fromCSV)
import System.Environment (getArgs)
import System.Exit (die)
-- | Run the cleaning pass over the unified layers of a PDF document.
--
-- Any error raised by the pass is turned into a program exit through 'die'.
--
-- NOTE(review): work in progress. The pass only fetches the 'Pages' of the
-- layer and discards them; @alto@ and @scoriae@ are not used yet, so the
-- document presumably comes back unmodified (confirm against PDF.Box).
clean :: Document -> Collection -> Scoriae -> IO Document
clean pdf alto scoriae =
  at UnifiedLayers (withFonts cleanLayer) pdf `handle` die
  where
    -- The final state of the action is the layer handed back to 'withFonts'.
    cleanLayer = execStateT $ do
      pages <- r Pages =<< get
      pure ()
-- | Clean one PDF file and write the result.
--
-- Reads and parses @inputFile@ (exiting through 'die' on a parse error),
-- indexes the ALTO directory, reads the scoriae from standard input, then
-- renders the cleaned document into @outputFile@.
processFiles :: FilePath -> FilePath -> FilePath -> IO ()
processFiles inputFile altoDir outputFile = do
  parsed <- parseDocument <$> BS.readFile inputFile
  pdf <- either die return parsed
  alto <- collection altoDir
  scoriae <- fmap Scoria.fromCSV getContents
  cleaned <- clean pdf alto scoriae
  Lazy.writeFile outputFile (render cleaned)
-- | Command-line entry point: expects exactly three arguments (input PDF,
-- ALTO directory, output file) and exits with a usage message otherwise.
main :: IO ()
main = getArgs >>= run
  where
    run [inputFile, altoDir, outputFile] =
      processFiles inputFile altoDir outputFile
    run _ = die "Syntax: pdfcleaner INPUT_PDF_FILE ALTO_DIR OUTPUT_FILE"

11
src/Scoria.hs Normal file
View file

@ -0,0 +1,11 @@
module Scoria (
Scoriae
, fromCSV
) where
import Data.Set (Set, fromList)
-- | The set of entries read by 'fromCSV'.
type Scoriae = Set String

-- | Build the set of entries from the text of a CSV file.
--
-- The first line is treated as a header and dropped. Each following line
-- becomes one entry, taken whole: no splitting on separators is done.
--
-- A trailing carriage return is removed from every line, so a file with CRLF
-- line endings yields the same entries as one with LF endings. Without this,
-- each entry would carry an invisible @\\r@ suffix.
fromCSV :: String -> Scoriae
fromCSV = fromList . map dropTrailingCR . drop 1 . lines
  where
    dropTrailingCR line = case reverse line of
      '\r' : reversed -> reverse reversed
      _ -> line

56
src/Sync.hs Normal file
View file

@ -0,0 +1,56 @@
{-# LANGUAGE FlexibleContexts #-}
module Main where
import ALTO (Collection, collection, getWords)
import Control.Monad.Except.IOH (handle)
import Control.Monad.IO.Class (liftIO)
import Control.Monad.Reader (runReaderT)
import Control.Monad.State (MonadState(..), evalStateT)
import Control.Monad.Trans (lift)
import qualified Data.ByteString.Char8 as BS (readFile)
import qualified Data.ByteString.Lazy as Lazy (writeFile)
import Data.Id (Id(..))
import qualified Data.Map as Map (toList)
import Data.OrderedMap (mapi)
import PDF (Document, UnifiedLayers(..), parseDocument, render)
import PDF.Box (Box(..), at)
import PDF.Content.Operator (Instruction)
import PDF.Content.Text (Chunks(..))
import PDF.Layer (Layer)
import PDF.Object (Object)
import PDF.Pages (Contents(..), Pages(..), withFonts, withResources)
import System.Environment (getArgs)
import System.Exit (die)
import Text.Printf (printf)
-- | Walk the pages of a PDF alongside the words of the matching ALTO pages,
-- meant to produce (object id, instruction id, ALTO id) triples.
--
-- For each page of the unified layers, the ALTO words of the same page
-- number are loaded and used as the initial state while the page contents
-- are traversed. Errors raised along the way end the program through 'die'.
--
-- NOTE(review): unfinished. The per-object step still ends in 'undefined',
-- so no triple can be produced yet.
synchronize :: Document -> Collection -> IO [(Id Object, Id Instruction, String)]
synchronize pdf alto =
  (withFonts fromLayer =<< r UnifiedLayers pdf) `handle` die
  where
    fromLayer layer = do
      pages <- r Pages layer
      concat <$> mapM fromPage (Map.toList pages)
    fromPage (number, page) = do
      altoWords <- liftIO (getWords alto number)
      -- The ALTO words of the page are the state consumed by the traversal.
      withResources (\subject -> evalStateT (fromContents subject) altoWords) page
    fromContents page = do
      contents <- r Contents page
      concat <$> sequence (mapi fromObject contents)
    fromObject objectId content =
      lift (r Chunks content) >>= undefined
-- | Print the synchronization table of a PDF file and an ALTO directory.
--
-- Parses @inputFile@ (exiting through 'die' on a parse error), indexes
-- @altoDir@, prints a CSV header line on standard output and then one line
-- per triple returned by 'synchronize'.
processFiles :: FilePath -> FilePath -> IO ()
processFiles inputFile altoDir = do
  parsed <- parseDocument <$> BS.readFile inputFile
  pdf <- either die return parsed
  alto <- collection altoDir
  putStrLn "Object,Instruction,ALTO_ID"
  rows <- synchronize pdf alto
  mapM_ (putStrLn . toRow) rows
  where
    -- One output line: both numeric ids, then the ALTO id as is.
    toRow (objectId, instructionId, altoId) =
      printf "%d,%d,%s" (getId objectId) (getId instructionId) altoId
-- | Command-line entry point of the @sync@ executable: expects exactly two
-- arguments (input PDF, ALTO directory) and exits with a usage message
-- otherwise.
main :: IO ()
main = do
  args <- getArgs
  case args of
    [inputFile, altoDir] ->
      processFiles inputFile altoDir
    -- The usage line names this executable, which the cabal file builds as
    -- "sync", not the sibling "pdfcleaner" program.
    _ -> die "Syntax: sync INPUT_PDF_FILE ALTO_DIR"