Initial state : a base structure for a «zipper» that finds the correspondances between PDF coordinates and ALTO IDs of the text chunks
This commit is contained in:
commit
1907a03b91
9 changed files with 267 additions and 0 deletions
24
.gitignore
vendored
Normal file
24
.gitignore
vendored
Normal file
|
@ -0,0 +1,24 @@
|
|||
# ---> Haskell
|
||||
dist
|
||||
dist-*
|
||||
cabal-dev
|
||||
*.o
|
||||
*.hi
|
||||
*.chi
|
||||
*.chs.h
|
||||
*.dyn_o
|
||||
*.dyn_hi
|
||||
.hpc
|
||||
.hsenv
|
||||
.cabal-sandbox/
|
||||
cabal.sandbox.config
|
||||
*.prof
|
||||
*.aux
|
||||
*.hp
|
||||
*.eventlog
|
||||
.stack-work/
|
||||
cabal.project.local
|
||||
cabal.project.local~
|
||||
.HTF/
|
||||
.ghc.environment.*
|
||||
|
5
CHANGELOG.md
Normal file
5
CHANGELOG.md
Normal file
|
@ -0,0 +1,5 @@
|
|||
# Revision history for chaoui-pdf
|
||||
|
||||
## 0.1.0.0 -- YYYY-mm-dd
|
||||
|
||||
* First version. Released on an unsuspecting world.
|
30
LICENSE
Normal file
30
LICENSE
Normal file
|
@ -0,0 +1,30 @@
|
|||
Copyright (c) 2020, Tissevert
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the following
|
||||
disclaimer in the documentation and/or other materials provided
|
||||
with the distribution.
|
||||
|
||||
* Neither the name of Tissevert nor the names of other
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
2
Setup.hs
Normal file
2
Setup.hs
Normal file
|
@ -0,0 +1,2 @@
|
|||
import Distribution.Simple
|
||||
main = defaultMain
|
56
pdfcleaner.cabal
Normal file
56
pdfcleaner.cabal
Normal file
|
@ -0,0 +1,56 @@
|
|||
cabal-version: >=1.10
|
||||
-- Initial package description 'chaoui-pdf.cabal' generated by 'cabal
|
||||
-- init'. For further documentation, see
|
||||
-- http://haskell.org/cabal/users-guide/
|
||||
|
||||
name: pdfcleaner
|
||||
version: 0.1.0.0
|
||||
synopsis: A program to purge blacklisted words from a PDF
|
||||
description: A tool to replicate modifications on ALTO files on PDF files
|
||||
homepage: https://git.marvid.fr/Tissevert/pdfcleaner
|
||||
-- bug-reports:
|
||||
license: BSD3
|
||||
license-file: LICENSE
|
||||
author: Tissevert
|
||||
maintainer: tissevert+devel@marvid.fr
|
||||
-- copyright:
|
||||
category: Text
|
||||
build-type: Simple
|
||||
extra-source-files: CHANGELOG.md
|
||||
|
||||
executable pdfcleaner
|
||||
main-is: Main.hs
|
||||
other-modules: ALTO
|
||||
, Scoria
|
||||
-- other-extensions:
|
||||
build-depends: base >=4.12 && <4.13
|
||||
, bytestring
|
||||
, containers
|
||||
, directory
|
||||
, filepath
|
||||
, ExceptIOH
|
||||
, Hufflepdf
|
||||
, mtl
|
||||
, soprano
|
||||
, xml
|
||||
ghc-options: -Wall
|
||||
hs-source-dirs: src
|
||||
default-language: Haskell2010
|
||||
|
||||
executable sync
|
||||
main-is: Sync.hs
|
||||
other-modules: ALTO
|
||||
-- other-extensions:
|
||||
build-depends: base >=4.12 && <4.13
|
||||
, bytestring
|
||||
, containers
|
||||
, directory
|
||||
, filepath
|
||||
, ExceptIOH
|
||||
, Hufflepdf
|
||||
, mtl
|
||||
, soprano
|
||||
, xml
|
||||
ghc-options: -Wall
|
||||
hs-source-dirs: src
|
||||
default-language: Haskell2010
|
45
src/ALTO.hs
Normal file
45
src/ALTO.hs
Normal file
|
@ -0,0 +1,45 @@
|
|||
{-# LANGUAGE NamedFieldPuns #-}
|
||||
module ALTO (
|
||||
Collection
|
||||
, Words
|
||||
, collection
|
||||
, getWords
|
||||
) where
|
||||
|
||||
import Data.List (sortOn)
|
||||
import Data.Map (Map, (!), fromList)
|
||||
import System.Directory (listDirectory)
|
||||
import System.Exit (die)
|
||||
import System.FilePath ((</>))
|
||||
import Text.Read (readMaybe)
|
||||
import Text.XML.Light (parseXML)
|
||||
import Text.XML.Light.Extra (getAttr)
|
||||
import Text.XML.Light.XPath (t, xPath)
|
||||
|
||||
data Collection = Collection {
|
||||
path :: FilePath
|
||||
, pages :: Map Int FilePath
|
||||
}
|
||||
type Words = [(String, String)]
|
||||
|
||||
getWords :: Collection -> Int -> IO Words
|
||||
getWords (Collection {path, pages}) pageNumber = do
|
||||
alto <- parseXML <$> readFile (path </> pages ! pageNumber)
|
||||
mapM wordOfElem $ xPath stringPath alto
|
||||
where
|
||||
stringPath =
|
||||
t<$>["alto", "Layout", "Page", "PrintSpace", "TextBlock", "TextLine", "String"]
|
||||
wordOfElem element =
|
||||
case (getAttr "CONTENT" element, getAttr "ID" element) of
|
||||
(Just content, Just elemID) -> return (content, elemID)
|
||||
(Nothing, _) -> die $ "Missing CONTENT in word " ++ show element
|
||||
_ -> die $ "Missing ID in word " ++ show element
|
||||
|
||||
collection :: FilePath -> IO Collection
|
||||
collection directory = buildCollection <$> listDirectory directory
|
||||
where
|
||||
rankPages files =
|
||||
[(rank, file) | file <- files, Just rank <- [pageNumber file]]
|
||||
pageNumber ('p':s) = readMaybe $ takeWhile (/= '.') s
|
||||
pageNumber _ = Nothing
|
||||
buildCollection = Collection directory . fromList . sortOn snd . rankPages
|
38
src/Main.hs
Normal file
38
src/Main.hs
Normal file
|
@ -0,0 +1,38 @@
|
|||
module Main where
|
||||
|
||||
import ALTO (Collection, collection)
|
||||
import Control.Monad.Except.IOH (handle)
|
||||
import Control.Monad.State (MonadState(..), execStateT)
|
||||
import qualified Data.ByteString.Char8 as BS (readFile)
|
||||
import qualified Data.ByteString.Lazy as Lazy (writeFile)
|
||||
import qualified Data.Map as Map (keys)
|
||||
import PDF (Document, UnifiedLayers(..), parseDocument, render)
|
||||
import PDF.Box (Box(..), at)
|
||||
import PDF.Pages (Pages(..), withFonts)
|
||||
import Scoria (Scoriae)
|
||||
import qualified Scoria (fromCSV)
|
||||
import System.Environment (getArgs)
|
||||
import System.Exit (die)
|
||||
|
||||
clean :: Document -> Collection -> Scoriae -> IO Document
|
||||
clean pdf alto scoriae =
|
||||
(at UnifiedLayers $ withFonts cleanLayer) pdf `handle` die
|
||||
where
|
||||
cleanLayer = execStateT $ do
|
||||
pages <- get >>= r Pages
|
||||
return ()
|
||||
|
||||
processFiles :: FilePath -> FilePath -> FilePath -> IO ()
|
||||
processFiles inputFile altoDir outputFile = do
|
||||
pdf <- either die return . parseDocument =<< BS.readFile inputFile
|
||||
alto <- collection altoDir
|
||||
scoriae <- Scoria.fromCSV <$> getContents
|
||||
Lazy.writeFile outputFile . render =<< clean pdf alto scoriae
|
||||
|
||||
main :: IO ()
|
||||
main = do
|
||||
args <- getArgs
|
||||
case args of
|
||||
[inputFile, altoDir, outputFile] ->
|
||||
processFiles inputFile altoDir outputFile
|
||||
_ -> die "Syntax: pdfcleaner INPUT_PDF_FILE ALTO_DIR OUTPUT_FILE"
|
11
src/Scoria.hs
Normal file
11
src/Scoria.hs
Normal file
|
@ -0,0 +1,11 @@
|
|||
module Scoria (
|
||||
Scoriae
|
||||
, fromCSV
|
||||
) where
|
||||
|
||||
import Data.Set (Set, fromList)
|
||||
|
||||
type Scoriae = Set String
|
||||
|
||||
fromCSV :: String -> Scoriae
|
||||
fromCSV = fromList . drop 1 . lines
|
56
src/Sync.hs
Normal file
56
src/Sync.hs
Normal file
|
@ -0,0 +1,56 @@
|
|||
{-# LANGUAGE FlexibleContexts #-}
|
||||
module Main where
|
||||
|
||||
import ALTO (Collection, collection, getWords)
|
||||
import Control.Monad.Except.IOH (handle)
|
||||
import Control.Monad.IO.Class (liftIO)
|
||||
import Control.Monad.Reader (runReaderT)
|
||||
import Control.Monad.State (MonadState(..), evalStateT)
|
||||
import Control.Monad.Trans (lift)
|
||||
import qualified Data.ByteString.Char8 as BS (readFile)
|
||||
import qualified Data.ByteString.Lazy as Lazy (writeFile)
|
||||
import Data.Id (Id(..))
|
||||
import qualified Data.Map as Map (toList)
|
||||
import Data.OrderedMap (mapi)
|
||||
import PDF (Document, UnifiedLayers(..), parseDocument, render)
|
||||
import PDF.Box (Box(..), at)
|
||||
import PDF.Content.Operator (Instruction)
|
||||
import PDF.Content.Text (Chunks(..))
|
||||
import PDF.Layer (Layer)
|
||||
import PDF.Object (Object)
|
||||
import PDF.Pages (Contents(..), Pages(..), withFonts, withResources)
|
||||
import System.Environment (getArgs)
|
||||
import System.Exit (die)
|
||||
import Text.Printf (printf)
|
||||
|
||||
synchronize :: Document -> Collection -> IO [(Id Object, Id Instruction, String)]
|
||||
synchronize pdf alto =
|
||||
(r UnifiedLayers pdf >>= withFonts extractFromLayer) `handle` die
|
||||
where
|
||||
extractFromLayer layer =
|
||||
r Pages layer >>= fmap concat . mapM extractFromPage . Map.toList
|
||||
extractFromPage (pageNumber, page) = do
|
||||
altoWords <- liftIO $ getWords alto pageNumber
|
||||
withResources (flip evalStateT altoWords . extractWithResources) page
|
||||
extractWithResources page =
|
||||
r Contents page >>= fmap concat . sequence . mapi extractFromObject
|
||||
extractFromObject objectId content =
|
||||
lift (r Chunks content) >>= undefined
|
||||
|
||||
processFiles :: FilePath -> FilePath -> IO ()
|
||||
processFiles inputFile altoDir = do
|
||||
pdf <- either die return . parseDocument =<< BS.readFile inputFile
|
||||
alto <- collection altoDir
|
||||
putStrLn "Object,Instruction,ALTO_ID"
|
||||
mapM_ (putStrLn . format) =<< synchronize pdf alto
|
||||
where
|
||||
format (objectId, instructionId, altoId) =
|
||||
printf "%d,%d,%s" (getId objectId) (getId instructionId) altoId
|
||||
|
||||
main :: IO ()
|
||||
main = do
|
||||
args <- getArgs
|
||||
case args of
|
||||
[inputFile, altoDir] ->
|
||||
processFiles inputFile altoDir
|
||||
_ -> die "Syntax: pdfcleaner INPUT_PDF_FILE ALTO_DIR"
|
Loading…
Reference in a new issue