commit 1907a03b919cda87f28b11b2bafe600b90393801 Author: Tissevert Date: Fri May 29 22:03:54 2020 +0200 Initial state : a base structure for a «zipper» that finds the correspondances between PDF coordinates and ALTO IDs of the text chunks diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..eea5c71 --- /dev/null +++ b/.gitignore @@ -0,0 +1,24 @@ +# ---> Haskell +dist +dist-* +cabal-dev +*.o +*.hi +*.chi +*.chs.h +*.dyn_o +*.dyn_hi +.hpc +.hsenv +.cabal-sandbox/ +cabal.sandbox.config +*.prof +*.aux +*.hp +*.eventlog +.stack-work/ +cabal.project.local +cabal.project.local~ +.HTF/ +.ghc.environment.* + diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..d61fe59 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,5 @@ +# Revision history for chaoui-pdf + +## 0.1.0.0 -- YYYY-mm-dd + +* First version. Released on an unsuspecting world. diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..a58e302 --- /dev/null +++ b/LICENSE @@ -0,0 +1,30 @@ +Copyright (c) 2020, Tissevert + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of Tissevert nor the names of other + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/Setup.hs b/Setup.hs new file mode 100644 index 0000000..9a994af --- /dev/null +++ b/Setup.hs @@ -0,0 +1,2 @@ +import Distribution.Simple +main = defaultMain diff --git a/pdfcleaner.cabal b/pdfcleaner.cabal new file mode 100644 index 0000000..cf49054 --- /dev/null +++ b/pdfcleaner.cabal @@ -0,0 +1,56 @@ +cabal-version: >=1.10 +-- Initial package description 'chaoui-pdf.cabal' generated by 'cabal +-- init'. For further documentation, see +-- http://haskell.org/cabal/users-guide/ + +name: pdfcleaner +version: 0.1.0.0 +synopsis: A program to purge blacklisted words from a PDF +description: A tool to replicate modifications on ALTO files on PDF files +homepage: https://git.marvid.fr/Tissevert/pdfcleaner +-- bug-reports: +license: BSD3 +license-file: LICENSE +author: Tissevert +maintainer: tissevert+devel@marvid.fr +-- copyright: +category: Text +build-type: Simple +extra-source-files: CHANGELOG.md + +executable pdfcleaner + main-is: Main.hs + other-modules: ALTO + , Scoria + -- other-extensions: + build-depends: base >=4.12 && <4.13 + , bytestring + , containers + , directory + , filepath + , ExceptIOH + , Hufflepdf + , mtl + , soprano + , xml + ghc-options: -Wall + hs-source-dirs: src + default-language: Haskell2010 + +executable sync + main-is: Sync.hs + other-modules: ALTO + -- other-extensions: + build-depends: base >=4.12 && <4.13 + , bytestring + , containers + , directory + , filepath + , ExceptIOH + , Hufflepdf + , mtl + , soprano + , xml + ghc-options: -Wall + hs-source-dirs: src + default-language: Haskell2010 diff --git a/src/ALTO.hs b/src/ALTO.hs new file mode 100644 index 0000000..b82bdba --- /dev/null +++ b/src/ALTO.hs @@ -0,0 +1,45 @@ +{-# LANGUAGE NamedFieldPuns #-} +module ALTO ( + Collection + , Words + , collection + , getWords + ) where + +import Data.List (sortOn) +import Data.Map (Map, (!), fromList) +import System.Directory (listDirectory) +import System.Exit (die) +import System.FilePath (()) +import Text.Read (readMaybe) +import Text.XML.Light (parseXML) +import Text.XML.Light.Extra (getAttr) +import Text.XML.Light.XPath (t, xPath) + +data Collection = Collection { + path :: FilePath + , pages :: Map Int FilePath + } +type Words = [(String, String)] + +getWords :: Collection -> Int -> IO Words +getWords (Collection {path, pages}) pageNumber = do + alto <- parseXML <$> readFile (path pages ! pageNumber) + mapM wordOfElem $ xPath stringPath alto + where + stringPath = + t<$>["alto", "Layout", "Page", "PrintSpace", "TextBlock", "TextLine", "String"] + wordOfElem element = + case (getAttr "CONTENT" element, getAttr "ID" element) of + (Just content, Just elemID) -> return (content, elemID) + (Nothing, _) -> die $ "Missing CONTENT in word " ++ show element + _ -> die $ "Missing ID in word " ++ show element + +collection :: FilePath -> IO Collection +collection directory = buildCollection <$> listDirectory directory + where + rankPages files = + [(rank, file) | file <- files, Just rank <- [pageNumber file]] + pageNumber ('p':s) = readMaybe $ takeWhile (/= '.') s + pageNumber _ = Nothing + buildCollection = Collection directory . fromList . sortOn snd . rankPages diff --git a/src/Main.hs b/src/Main.hs new file mode 100644 index 0000000..dc15229 --- /dev/null +++ b/src/Main.hs @@ -0,0 +1,38 @@ +module Main where + +import ALTO (Collection, collection) +import Control.Monad.Except.IOH (handle) +import Control.Monad.State (MonadState(..), execStateT) +import qualified Data.ByteString.Char8 as BS (readFile) +import qualified Data.ByteString.Lazy as Lazy (writeFile) +import qualified Data.Map as Map (keys) +import PDF (Document, UnifiedLayers(..), parseDocument, render) +import PDF.Box (Box(..), at) +import PDF.Pages (Pages(..), withFonts) +import Scoria (Scoriae) +import qualified Scoria (fromCSV) +import System.Environment (getArgs) +import System.Exit (die) + +clean :: Document -> Collection -> Scoriae -> IO Document +clean pdf alto scoriae = + (at UnifiedLayers $ withFonts cleanLayer) pdf `handle` die + where + cleanLayer = execStateT $ do + pages <- get >>= r Pages + return () + +processFiles :: FilePath -> FilePath -> FilePath -> IO () +processFiles inputFile altoDir outputFile = do + pdf <- either die return . parseDocument =<< BS.readFile inputFile + alto <- collection altoDir + scoriae <- Scoria.fromCSV <$> getContents + Lazy.writeFile outputFile . render =<< clean pdf alto scoriae + +main :: IO () +main = do + args <- getArgs + case args of + [inputFile, altoDir, outputFile] -> + processFiles inputFile altoDir outputFile + _ -> die "Syntax: pdfcleaner INPUT_PDF_FILE ALTO_DIR OUTPUT_FILE" diff --git a/src/Scoria.hs b/src/Scoria.hs new file mode 100644 index 0000000..b6a9056 --- /dev/null +++ b/src/Scoria.hs @@ -0,0 +1,11 @@ +module Scoria ( + Scoriae + , fromCSV + ) where + +import Data.Set (Set, fromList) + +type Scoriae = Set String + +fromCSV :: String -> Scoriae +fromCSV = fromList . drop 1 . lines diff --git a/src/Sync.hs b/src/Sync.hs new file mode 100644 index 0000000..c73f64d --- /dev/null +++ b/src/Sync.hs @@ -0,0 +1,56 @@ +{-# LANGUAGE FlexibleContexts #-} +module Main where + +import ALTO (Collection, collection, getWords) +import Control.Monad.Except.IOH (handle) +import Control.Monad.IO.Class (liftIO) +import Control.Monad.Reader (runReaderT) +import Control.Monad.State (MonadState(..), evalStateT) +import Control.Monad.Trans (lift) +import qualified Data.ByteString.Char8 as BS (readFile) +import qualified Data.ByteString.Lazy as Lazy (writeFile) +import Data.Id (Id(..)) +import qualified Data.Map as Map (toList) +import Data.OrderedMap (mapi) +import PDF (Document, UnifiedLayers(..), parseDocument, render) +import PDF.Box (Box(..), at) +import PDF.Content.Operator (Instruction) +import PDF.Content.Text (Chunks(..)) +import PDF.Layer (Layer) +import PDF.Object (Object) +import PDF.Pages (Contents(..), Pages(..), withFonts, withResources) +import System.Environment (getArgs) +import System.Exit (die) +import Text.Printf (printf) + +synchronize :: Document -> Collection -> IO [(Id Object, Id Instruction, String)] +synchronize pdf alto = + (r UnifiedLayers pdf >>= withFonts extractFromLayer) `handle` die + where + extractFromLayer layer = + r Pages layer >>= fmap concat . mapM extractFromPage . Map.toList + extractFromPage (pageNumber, page) = do + altoWords <- liftIO $ getWords alto pageNumber + withResources (flip evalStateT altoWords . extractWithResources) page + extractWithResources page = + r Contents page >>= fmap concat . sequence . mapi extractFromObject + extractFromObject objectId content = + lift (r Chunks content) >>= undefined + +processFiles :: FilePath -> FilePath -> IO () +processFiles inputFile altoDir = do + pdf <- either die return . parseDocument =<< BS.readFile inputFile + alto <- collection altoDir + putStrLn "Object,Instruction,ALTO_ID" + mapM_ (putStrLn . format) =<< synchronize pdf alto + where + format (objectId, instructionId, altoId) = + printf "%d,%d,%s" (getId objectId) (getId instructionId) altoId + +main :: IO () +main = do + args <- getArgs + case args of + [inputFile, altoDir] -> + processFiles inputFile altoDir + _ -> die "Syntax: pdfcleaner INPUT_PDF_FILE ALTO_DIR"