-- File-viewer metadata: 39 lines, 1.3 KiB, Haskell
module Main where
|
|
|
|
import ALTO (Collection, collection)
|
|
import Control.Monad.Except.IOH (handle)
|
|
import Control.Monad.State (MonadState(..), execStateT)
|
|
import qualified Data.ByteString.Char8 as BS (readFile)
|
|
import qualified Data.ByteString.Lazy as Lazy (writeFile)
|
|
import qualified Data.Map as Map (keys)
|
|
import PDF (Document, UnifiedLayers(..), parseDocument, render)
|
|
import PDF.Box (Box(..), at)
|
|
import PDF.Pages (Pages(..), withFonts)
|
|
import Scoria (Scoriae)
|
|
import qualified Scoria (fromCSV)
|
|
import System.Environment (getArgs)
|
|
import System.Exit (die)
|
|
|
|
clean :: Document -> Collection -> Scoriae -> IO Document
|
|
clean pdf alto scoriae =
|
|
(at UnifiedLayers $ withFonts cleanLayer) pdf `handle` die
|
|
where
|
|
cleanLayer = execStateT $ do
|
|
pages <- get >>= r Pages
|
|
return ()
|
|
|
|
processFiles :: FilePath -> FilePath -> FilePath -> IO ()
|
|
processFiles inputFile altoDir outputFile = do
|
|
pdf <- either die return . parseDocument =<< BS.readFile inputFile
|
|
alto <- collection altoDir
|
|
scoriae <- Scoria.fromCSV <$> getContents
|
|
Lazy.writeFile outputFile . render =<< clean pdf alto scoriae
|
|
|
|
main :: IO ()
|
|
main = do
|
|
args <- getArgs
|
|
case args of
|
|
[inputFile, altoDir, outputFile] ->
|
|
processFiles inputFile altoDir outputFile
|
|
_ -> die "Syntax: pdfcleaner INPUT_PDF_FILE ALTO_DIR OUTPUT_FILE"
|