Initial state : a base structure for a «zipper» that finds the correspondences between PDF coordinates and ALTO IDs of the text chunks
This commit is contained in:
commit
1907a03b91
9 changed files with 267 additions and 0 deletions
24
.gitignore
vendored
Normal file
24
.gitignore
vendored
Normal file
|
@ -0,0 +1,24 @@
|
||||||
|
# ---> Haskell
|
||||||
|
dist
|
||||||
|
dist-*
|
||||||
|
cabal-dev
|
||||||
|
*.o
|
||||||
|
*.hi
|
||||||
|
*.chi
|
||||||
|
*.chs.h
|
||||||
|
*.dyn_o
|
||||||
|
*.dyn_hi
|
||||||
|
.hpc
|
||||||
|
.hsenv
|
||||||
|
.cabal-sandbox/
|
||||||
|
cabal.sandbox.config
|
||||||
|
*.prof
|
||||||
|
*.aux
|
||||||
|
*.hp
|
||||||
|
*.eventlog
|
||||||
|
.stack-work/
|
||||||
|
cabal.project.local
|
||||||
|
cabal.project.local~
|
||||||
|
.HTF/
|
||||||
|
.ghc.environment.*
|
||||||
|
|
5
CHANGELOG.md
Normal file
5
CHANGELOG.md
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
# Revision history for pdfcleaner
|
||||||
|
|
||||||
|
## 0.1.0.0 -- YYYY-mm-dd
|
||||||
|
|
||||||
|
* First version. Released on an unsuspecting world.
|
30
LICENSE
Normal file
30
LICENSE
Normal file
|
@ -0,0 +1,30 @@
|
||||||
|
Copyright (c) 2020, Tissevert
|
||||||
|
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
* Redistributions in binary form must reproduce the above
|
||||||
|
copyright notice, this list of conditions and the following
|
||||||
|
disclaimer in the documentation and/or other materials provided
|
||||||
|
with the distribution.
|
||||||
|
|
||||||
|
* Neither the name of Tissevert nor the names of other
|
||||||
|
contributors may be used to endorse or promote products derived
|
||||||
|
from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
2
Setup.hs
Normal file
2
Setup.hs
Normal file
|
@ -0,0 +1,2 @@
|
||||||
|
import Distribution.Simple
|
||||||
|
-- | Standard Cabal setup script: delegates everything to 'defaultMain'.
main :: IO ()
main = defaultMain
|
56
pdfcleaner.cabal
Normal file
56
pdfcleaner.cabal
Normal file
|
@ -0,0 +1,56 @@
|
||||||
|
cabal-version: >=1.10
|
||||||
|
-- Initial package description 'chaoui-pdf.cabal' generated by 'cabal
|
||||||
|
-- init'. For further documentation, see
|
||||||
|
-- http://haskell.org/cabal/users-guide/
|
||||||
|
|
||||||
|
name: pdfcleaner
|
||||||
|
version: 0.1.0.0
|
||||||
|
synopsis: A program to purge blacklisted words from a PDF
|
||||||
|
description: A tool to replicate modifications on ALTO files on PDF files
|
||||||
|
homepage: https://git.marvid.fr/Tissevert/pdfcleaner
|
||||||
|
-- bug-reports:
|
||||||
|
license: BSD3
|
||||||
|
license-file: LICENSE
|
||||||
|
author: Tissevert
|
||||||
|
maintainer: tissevert+devel@marvid.fr
|
||||||
|
-- copyright:
|
||||||
|
category: Text
|
||||||
|
build-type: Simple
|
||||||
|
extra-source-files: CHANGELOG.md
|
||||||
|
|
||||||
|
executable pdfcleaner
|
||||||
|
main-is: Main.hs
|
||||||
|
other-modules: ALTO
|
||||||
|
, Scoria
|
||||||
|
-- other-extensions:
|
||||||
|
build-depends: base >=4.12 && <4.13
|
||||||
|
, bytestring
|
||||||
|
, containers
|
||||||
|
, directory
|
||||||
|
, filepath
|
||||||
|
, ExceptIOH
|
||||||
|
, Hufflepdf
|
||||||
|
, mtl
|
||||||
|
, soprano
|
||||||
|
, xml
|
||||||
|
ghc-options: -Wall
|
||||||
|
hs-source-dirs: src
|
||||||
|
default-language: Haskell2010
|
||||||
|
|
||||||
|
executable sync
|
||||||
|
main-is: Sync.hs
|
||||||
|
other-modules: ALTO
|
||||||
|
-- other-extensions:
|
||||||
|
build-depends: base >=4.12 && <4.13
|
||||||
|
, bytestring
|
||||||
|
, containers
|
||||||
|
, directory
|
||||||
|
, filepath
|
||||||
|
, ExceptIOH
|
||||||
|
, Hufflepdf
|
||||||
|
, mtl
|
||||||
|
, soprano
|
||||||
|
, xml
|
||||||
|
ghc-options: -Wall
|
||||||
|
hs-source-dirs: src
|
||||||
|
default-language: Haskell2010
|
45
src/ALTO.hs
Normal file
45
src/ALTO.hs
Normal file
|
@ -0,0 +1,45 @@
|
||||||
|
{-# LANGUAGE NamedFieldPuns #-}
|
||||||
|
module ALTO (
|
||||||
|
Collection
|
||||||
|
, Words
|
||||||
|
, collection
|
||||||
|
, getWords
|
||||||
|
) where
|
||||||
|
|
||||||
|
import Data.List (sortOn)
import Data.Map (Map, (!), fromList)
import qualified Data.Map as Map (lookup)
import System.Directory (listDirectory)
import System.Exit (die)
import System.FilePath ((</>))
import Text.Read (readMaybe)
import Text.XML.Light (parseXML)
import Text.XML.Light.Extra (getAttr)
import Text.XML.Light.XPath (t, xPath)
|
||||||
|
|
||||||
|
-- | A directory of ALTO files, indexed by page number.
data Collection = Collection
  { path :: FilePath
    -- ^ Directory holding the ALTO files.
  , pages :: Map Int FilePath
    -- ^ File name of each page, relative to 'path', keyed by page number.
  }
|
||||||
|
-- | The words of a page, each as a pair of its @CONTENT@ attribute and its
-- @ID@ attribute (in that order).
type Words = [(String, String)]
|
||||||
|
|
||||||
|
-- | Load the words of one page of an ALTO collection.
--
-- Parses the file registered for @pageNumber@ in the collection and returns,
-- for every element selected by the path
-- @alto/Layout/Page/PrintSpace/TextBlock/TextLine/String@, the pair of its
-- @CONTENT@ and @ID@ attributes.
--
-- Exits the program (via 'die') when the collection has no file for the
-- requested page, or when a selected element lacks either attribute.
getWords :: Collection -> Int -> IO Words
getWords (Collection {path, pages}) pageNumber = do
  -- Look the page up explicitly: indexing the map directly would abort with
  -- an opaque "not an element in the map" error for a page without ALTO file.
  file <- maybe missingPage return $ Map.lookup pageNumber pages
  alto <- parseXML <$> readFile (path </> file)
  mapM wordOfElem $ xPath stringPath alto
  where
    missingPage =
      die $ "No ALTO file for page " ++ show pageNumber ++ " in " ++ path
    stringPath =
      t <$> ["alto", "Layout", "Page", "PrintSpace", "TextBlock", "TextLine", "String"]
    wordOfElem element =
      case (getAttr "CONTENT" element, getAttr "ID" element) of
        (Just content, Just elemID) -> return (content, elemID)
        (Nothing, _) -> die $ "Missing CONTENT in word " ++ show element
        _ -> die $ "Missing ID in word " ++ show element
|
||||||
|
|
||||||
|
-- | Index the ALTO files found in a directory.
--
-- Only entries whose name is @p@ followed by something that reads as an
-- 'Int' up to the first dot are kept; that number is the page number.
-- Entries are sorted by file name before the map is built, so when several
-- files claim the same page number the one with the greatest name is kept.
collection :: FilePath -> IO Collection
collection directory = do
  files <- listDirectory directory
  return . Collection directory . fromList . sortOn snd $ concatMap ranked files
  where
    -- A file contributes one (page number, name) pair, or nothing at all.
    ranked file = maybe [] (\rank -> [(rank, file)]) (rankOf file)
    rankOf ('p':digits) = readMaybe (takeWhile (/= '.') digits)
    rankOf _ = Nothing
|
38
src/Main.hs
Normal file
38
src/Main.hs
Normal file
|
@ -0,0 +1,38 @@
|
||||||
|
module Main where
|
||||||
|
|
||||||
|
import ALTO (Collection, collection)
|
||||||
|
import Control.Monad.Except.IOH (handle)
|
||||||
|
import Control.Monad.State (MonadState(..), execStateT)
|
||||||
|
import qualified Data.ByteString.Char8 as BS (readFile)
|
||||||
|
import qualified Data.ByteString.Lazy as Lazy (writeFile)
|
||||||
|
import qualified Data.Map as Map (keys)
|
||||||
|
import PDF (Document, UnifiedLayers(..), parseDocument, render)
|
||||||
|
import PDF.Box (Box(..), at)
|
||||||
|
import PDF.Pages (Pages(..), withFonts)
|
||||||
|
import Scoria (Scoriae)
|
||||||
|
import qualified Scoria (fromCSV)
|
||||||
|
import System.Environment (getArgs)
|
||||||
|
import System.Exit (die)
|
||||||
|
|
||||||
|
-- | Cleaning pass over a PDF document (work in progress).
--
-- Runs a state computation on the document's 'UnifiedLayers' with fonts in
-- scope; any error reported by that computation terminates the program
-- through 'die'. For now the computation only reads the 'Pages' of the layer
-- and discards them: the @alto@ and @scoriae@ arguments are not used yet.
-- NOTE(review): the layer is presumably returned unmodified — confirm against
-- the semantics of 'at' and 'withFonts' in the PDF library.
clean :: Document -> Collection -> Scoriae -> IO Document
clean pdf alto scoriae =
  at UnifiedLayers (withFonts purgeLayer) pdf `handle` die
  where
    purgeLayer = execStateT $ do
      _pages <- r Pages =<< get
      return ()
|
||||||
|
|
||||||
|
-- | Clean one PDF file.
--
-- Reads and parses @inputFile@ (exiting with the parser's message on
-- failure), indexes the ALTO files of @altoDir@, reads the scoriae as CSV
-- from standard input, then writes the rendered result of 'clean' to
-- @outputFile@.
processFiles :: FilePath -> FilePath -> FilePath -> IO ()
processFiles inputFile altoDir outputFile = do
  rawPDF <- BS.readFile inputFile
  pdf <- either die return (parseDocument rawPDF)
  alto <- collection altoDir
  csv <- getContents
  cleaned <- clean pdf alto (Scoria.fromCSV csv)
  Lazy.writeFile outputFile (render cleaned)
|
||||||
|
|
||||||
|
-- | Entry point: expects exactly three arguments (input PDF, ALTO directory,
-- output file) and exits with a usage message otherwise.
main :: IO ()
main = getArgs >>= run
  where
    run [inputFile, altoDir, outputFile] =
      processFiles inputFile altoDir outputFile
    run _ = die "Syntax: pdfcleaner INPUT_PDF_FILE ALTO_DIR OUTPUT_FILE"
|
11
src/Scoria.hs
Normal file
11
src/Scoria.hs
Normal file
|
@ -0,0 +1,11 @@
|
||||||
|
module Scoria (
|
||||||
|
Scoriae
|
||||||
|
, fromCSV
|
||||||
|
) where
|
||||||
|
|
||||||
|
import Data.Set (Set, fromList)
|
||||||
|
|
||||||
|
-- | The set of strings read from the scoriae CSV input.
type Scoriae = Set String

-- | Build the set of scoriae from the text of a CSV file.
--
-- The first line is treated as a header and skipped; each remaining line,
-- taken whole, becomes one element of the set. A trailing carriage return is
-- removed from every line so that a file with CRLF line endings yields the
-- same set as its LF counterpart.
fromCSV :: String -> Scoriae
fromCSV = fromList . map stripCR . drop 1 . lines
  where
    -- 'lines' only splits on '\n', leaving the '\r' of a CRLF ending behind.
    stripCR line = case reverse line of
      '\r' : rest -> reverse rest
      _ -> line
|
56
src/Sync.hs
Normal file
56
src/Sync.hs
Normal file
|
@ -0,0 +1,56 @@
|
||||||
|
{-# LANGUAGE FlexibleContexts #-}
|
||||||
|
module Main where
|
||||||
|
|
||||||
|
import ALTO (Collection, collection, getWords)
|
||||||
|
import Control.Monad.Except.IOH (handle)
|
||||||
|
import Control.Monad.IO.Class (liftIO)
|
||||||
|
import Control.Monad.Reader (runReaderT)
|
||||||
|
import Control.Monad.State (MonadState(..), evalStateT)
|
||||||
|
import Control.Monad.Trans (lift)
|
||||||
|
import qualified Data.ByteString.Char8 as BS (readFile)
|
||||||
|
import qualified Data.ByteString.Lazy as Lazy (writeFile)
|
||||||
|
import Data.Id (Id(..))
|
||||||
|
import qualified Data.Map as Map (toList)
|
||||||
|
import Data.OrderedMap (mapi)
|
||||||
|
import PDF (Document, UnifiedLayers(..), parseDocument, render)
|
||||||
|
import PDF.Box (Box(..), at)
|
||||||
|
import PDF.Content.Operator (Instruction)
|
||||||
|
import PDF.Content.Text (Chunks(..))
|
||||||
|
import PDF.Layer (Layer)
|
||||||
|
import PDF.Object (Object)
|
||||||
|
import PDF.Pages (Contents(..), Pages(..), withFonts, withResources)
|
||||||
|
import System.Environment (getArgs)
|
||||||
|
import System.Exit (die)
|
||||||
|
import Text.Printf (printf)
|
||||||
|
|
||||||
|
-- | Pair the text chunks of a PDF with the words of an ALTO collection
-- (work in progress).
--
-- Walks the 'Pages' of the document's 'UnifiedLayers'; for each page, loads
-- the ALTO words of the same page number and uses them as the initial state
-- while visiting every entry of the page's 'Contents'. The result is meant to
-- be a list of (object id, instruction id, ALTO id) triples.
--
-- Errors reported by the PDF computation terminate the program via 'die'.
-- The matching of chunks against words is not written yet: reaching it
-- aborts with an explicit message.
synchronize :: Document -> Collection -> IO [(Id Object, Id Instruction, String)]
synchronize pdf alto =
  (r UnifiedLayers pdf >>= withFonts extractFromLayer) `handle` die
  where
    extractFromLayer layer =
      r Pages layer >>= fmap concat . mapM extractFromPage . Map.toList
    extractFromPage (pageNumber, page) = do
      altoWords <- liftIO $ getWords alto pageNumber
      withResources (flip evalStateT altoWords . extractWithResources) page
    extractWithResources page =
      r Contents page >>= fmap concat . sequence . mapi extractFromObject
    -- TODO: consume the ALTO words held in the state to label each chunk.
    extractFromObject objectId content =
      lift (r Chunks content)
        >>= error "Sync.synchronize: matching PDF chunks with ALTO words is not implemented yet"
|
||||||
|
|
||||||
|
-- | Print the PDF/ALTO correspondence of one document as CSV.
--
-- Reads and parses @inputFile@ (exiting with the parser's message on
-- failure), indexes the ALTO files of @altoDir@, prints the CSV header and
-- then one line per triple returned by 'synchronize'.
processFiles :: FilePath -> FilePath -> IO ()
processFiles inputFile altoDir = do
  rawPDF <- BS.readFile inputFile
  pdf <- either die return (parseDocument rawPDF)
  alto <- collection altoDir
  putStrLn "Object,Instruction,ALTO_ID"
  rows <- synchronize pdf alto
  mapM_ (putStrLn . toRow) rows
  where
    toRow (objectId, instructionId, altoId) =
      printf "%d,%d,%s" (getId objectId) (getId instructionId) altoId
|
||||||
|
|
||||||
|
-- | Entry point of the @sync@ executable: expects exactly two arguments
-- (input PDF and ALTO directory) and exits with a usage message otherwise.
main :: IO ()
main = do
  args <- getArgs
  case args of
    [inputFile, altoDir] ->
      processFiles inputFile altoDir
    -- The usage line names this executable, not its sibling @pdfcleaner@.
    _ -> die "Syntax: sync INPUT_PDF_FILE ALTO_DIR"
|
Loading…
Reference in a new issue