From d9f69014a033212a415c7048c1c17eee7d940ba2 Mon Sep 17 00:00:00 2001 From: Tissevert Date: Thu, 28 May 2020 18:54:15 +0200 Subject: [PATCH] Make a couple improvements in performance + add an example script to extract pages from a PDF --- Hufflepdf.cabal | 31 +++++- examples/getObj.hs | 22 ++-- examples/getText.hs | 22 ++-- examples/pdfCut.hs | 41 +++++++ src/Data/ByteString/Char8/Util.hs | 4 +- src/Data/Id.hs | 10 +- src/PDF/Box.hs | 3 +- src/PDF/CMap.hs | 15 +-- src/PDF/Content.hs | 67 ++++++++---- src/PDF/Content/Text.hs | 74 ++++++------- src/PDF/Encoding.hs | 7 +- src/PDF/Encoding/MacRoman.hs | 3 +- src/PDF/Layer.hs | 47 +++++--- src/PDF/Object.hs | 38 ++++--- src/PDF/Object/Navigation.hs | 174 ++++++++++++++++++++---------- src/PDF/Output.hs | 5 +- src/PDF/Pages.hs | 143 +++++++++++++----------- 17 files changed, 448 insertions(+), 258 deletions(-) create mode 100644 examples/pdfCut.hs diff --git a/Hufflepdf.cabal b/Hufflepdf.cabal index 49017a0..cb06807 100644 --- a/Hufflepdf.cabal +++ b/Hufflepdf.cabal @@ -59,7 +59,7 @@ executable equivalent build-depends: base , bytestring , Hufflepdf - ghc-options: -Wall + ghc-options: -Wall -rtsopts default-language: Haskell2010 executable getObj @@ -69,7 +69,7 @@ executable getObj , containers , Hufflepdf , mtl - ghc-options: -Wall + ghc-options: -Wall -rtsopts default-language: Haskell2010 executable getText @@ -77,9 +77,34 @@ executable getText build-depends: base , bytestring , containers + , ExceptIOH , Hufflepdf + , mtl , text - ghc-options: -Wall + ghc-options: -Wall -rtsopts + default-language: Haskell2010 + +executable pdfCut + main-is: examples/pdfCut.hs + build-depends: base + , bytestring + , containers + , ExceptIOH + , filepath + , Hufflepdf + , mtl + , text + ghc-options: -Wall -rtsopts + default-language: Haskell2010 + +executable fixMermoz + main-is: examples/fixMermoz.hs + build-depends: base + , bytestring + , ExceptIOH + , Hufflepdf + , mtl + ghc-options: -Wall -rtsopts default-language: Haskell2010 Test-Suite unitTests diff --git a/examples/getObj.hs b/examples/getObj.hs index 56acd6d..4616a3b 100644 --- a/examples/getObj.hs +++ b/examples/getObj.hs @@ -1,8 +1,6 @@ {-# LANGUAGE NamedFieldPuns #-} {-# LANGUAGE FlexibleContexts #-} -{-# LANGUAGE GeneralizedNewtypeDeriving #-} -import Control.Monad.Fail (MonadFail(..)) import Control.Monad.Reader (ReaderT, runReaderT) import Data.ByteString.Lazy.Char8 (ByteString) import qualified Data.ByteString.Char8 as BS (readFile) @@ -13,7 +11,7 @@ import PDF.Box (Box(..)) import PDF.Layer (Layer(..), unify) import PDF.Object (Object(..)) import PDF.Object.Navigation ( - Clear(..), Raw(..), (//), objectById, catalog + Nav(..), PPath(..), StreamContent(..), (//), objectById, catalog ) import PDF.Output (Output) import qualified PDF.Output as Output (render) @@ -23,33 +21,27 @@ import System.Exit (die) import Text.Printf (printf) import Text.Read (readMaybe) -newtype Error a = Error { - runError :: Either String a - } deriving (Functor, Applicative, Monad) -instance MonadFail Error where - fail = Error . Left - decodedStream :: Object -> Object decodedStream object = - maybe object id $ r Clear object >>= flip (w Raw) object + either (const object) id $ r Clear object >>= flip (w Raw) object -display :: Output a => ReaderT Layer Error a -> Document -> Either String ByteString +display :: Functor m => Output a => ReaderT Layer m a -> Document -> m ByteString display getter (Document {eolStyle, layers}) = - Output.render eolStyle <$> runError (runReaderT getter (unify layers)) + Output.render eolStyle <$> runReaderT getter (unify layers) parse :: [String] -> IO (FilePath, Document -> Either String ByteString) -parse [inputFile] = return (inputFile, display catalog) +parse [inputFile] = return (inputFile, display $ value <$> catalog) parse [inputFile, key] = return (inputFile, clear . maybe (byPath key) byId $ readMaybe key) where byId = objectById . Id - byPath path = catalog // (explode path) + byPath path = (catalog // PPath (explode path)) explode "" = [] explode path = case break (== '.') path of (name, "") -> [name] (name, rest) -> name : explode (drop 1 rest) - clear = display . fmap decodedStream + clear = display . fmap (decodedStream . value) parse _ = die . printf "Syntax: %s inputFile [OBJECT_ID | PATH_TO_OBJ]\n" =<< getProgName diff --git a/examples/getText.hs b/examples/getText.hs index 9c44e16..0c5d237 100644 --- a/examples/getText.hs +++ b/examples/getText.hs @@ -1,5 +1,7 @@ {-# LANGUAGE FlexibleContexts #-} import Control.Monad ((>=>)) +import Control.Monad.Except (ExceptT(..)) +import Control.Monad.Except.IOH (handle) import Control.Monad.IO.Class (liftIO) import qualified Data.ByteString.Char8 as BS (readFile) import Data.Id (Id(..), mapWithKey) @@ -9,7 +11,7 @@ import qualified Data.Text as Text (unpack) import PDF (UnifiedLayers(..), parseDocument) import PDF.Box (Box(..)) import PDF.Content.Text (Chunks(..)) -import PDF.Layer (Layer) +import PDF.Layer (Layer, LayerReader) import PDF.Pages ( Contents(..), FontCache, Page(..), PageNumber(..), Pages(..), withFonts , withResources @@ -19,7 +21,7 @@ import System.Exit (die) import System.IO (BufferMode(..), hSetBuffering, stdout) import Text.Printf (printf) -displayPage :: Int -> Page -> FontCache IO () +displayPage :: Int -> Page -> FontCache (LayerReader (ExceptT String IO)) () displayPage n = withResources ( r Contents >=> sequence_ . mapi (\objectId -> @@ -31,22 +33,22 @@ displayPage n = withResources ( liftIO . putStrLn $ printf "p#%d obj#%d instr#%d: %s" n (getId a) (getId b) (Text.unpack v) -getAll :: Layer -> IO () +getAll :: Layer -> ExceptT String IO () getAll = withFonts $ r Pages >=> sequence_ . Map.mapWithKey displayPage -get :: Int -> Layer -> IO () +get :: Int -> Layer -> ExceptT String IO () get n = withFonts $ r (P n) >=> displayPage n -onDoc :: FilePath -> (Layer -> IO ()) -> IO () -onDoc inputFile f = do - (parseDocument <$> BS.readFile inputFile) - >>= either die (r UnifiedLayers >=> f) +onDoc :: FilePath -> (Layer -> ExceptT String IO ()) -> ExceptT String IO () +onDoc inputFile f = + ExceptT (parseDocument <$> BS.readFile inputFile) >>= r UnifiedLayers >>= f main :: IO () main = do hSetBuffering stdout LineBuffering args <- getArgs case args of - [inputFile] -> onDoc inputFile getAll - [inputFile, pageNumber] -> onDoc inputFile (get $ read pageNumber) + [inputFile] -> onDoc inputFile getAll `handle` die + [inputFile, pageNumber] -> + onDoc inputFile (get $ read pageNumber) `handle` die _ -> die "Syntax: getText INPUT_FILE [PAGE_NUMBER]" diff --git a/examples/pdfCut.hs b/examples/pdfCut.hs new file mode 100644 index 0000000..509a163 --- /dev/null +++ b/examples/pdfCut.hs @@ -0,0 +1,41 @@ +import Control.Monad.State (MonadState(..), evalStateT) +import Control.Monad.Except.IOH (handle) +import Control.Monad.Trans (lift) +import qualified Data.ByteString.Char8 as BS (readFile) +import qualified Data.ByteString.Lazy.Char8 as Lazy (writeFile) +import Data.Map ((!), fromSet) +import qualified Data.Set as Set (fromList) +import PDF (Document, UnifiedLayers(..), parseDocument, render) +import PDF.Box (at) +import PDF.Pages (Pages(..)) +import System.Environment (getArgs) +import System.Exit (die) +import System.FilePath (replaceBaseName, takeBaseName) +import Text.Read (readEither) + +parseRange :: String -> Either String (Int, Int) +parseRange = evalStateT $ do + from <- lift . readEither =<< state (break (== '-')) + get >>= makeRange from + where + makeRange from "" = return (from, from) + makeRange from (_:to) = (,) from <$> lift (readEither to) + +cut :: (Int, Int) -> Document -> IO Document +cut (p1, p2) doc = (( + at UnifiedLayers .at Pages $ \pages -> + return $ fromSet (pages!) $ Set.fromList [pMin .. pMax] + ) doc) `handle` die + where + (pMin, pMax) = (min p1 p2, max p1 p2) + +main :: IO () +main = do + [inputPath, inputRange] <- getArgs + range <- catchLeft $ parseRange inputRange + doc <- catchLeft =<< parseDocument <$> BS.readFile inputPath + Lazy.writeFile (outputPath inputPath inputRange) . render =<< cut range doc + where + catchLeft = either die return + outputPath fileName range = + replaceBaseName fileName (takeBaseName fileName ++ "_p" ++ range) diff --git a/src/Data/ByteString/Char8/Util.hs b/src/Data/ByteString/Char8/Util.hs index 03d5b54..f79a2d0 100644 --- a/src/Data/ByteString/Char8/Util.hs +++ b/src/Data/ByteString/Char8/Util.hs @@ -14,7 +14,7 @@ module Data.ByteString.Char8.Util ( ) where import Data.ByteString (ByteString, snoc) -import qualified Data.ByteString as BS (empty, foldl, length, pack, singleton, splitAt) +import qualified Data.ByteString as BS (empty, foldl', length, pack, singleton, splitAt) import qualified Data.ByteString.Char8 as Char8 ( cons, drop, index, splitAt, take, uncons, unpack ) @@ -43,7 +43,7 @@ intToB256 n B256Int $ begining `snoc` (toEnum (n `mod` 0x100)) b256ToInt :: B256Int -> Int -b256ToInt (B256Int n) = BS.foldl (\k w -> 0x100*k + fromEnum w) 0 n +b256ToInt (B256Int n) = BS.foldl' (\k w -> 0x100*k + fromEnum w) 0 n toBytes :: Int -> Int -> ByteString toBytes 0 _ = BS.empty diff --git a/src/Data/Id.hs b/src/Data/Id.hs index b79bd33..59ec8d4 100644 --- a/src/Data/Id.hs +++ b/src/Data/Id.hs @@ -8,6 +8,7 @@ module Data.Id ( , at , delete , empty + , filterWithKey , fromList , insert , keysSet @@ -21,11 +22,11 @@ module Data.Id ( , union ) where -import Control.Monad.State (MonadState, modify, gets) +import Control.Monad.State.Strict (MonadState, modify, gets) import Data.IntMap (IntMap, (!)) import qualified Data.IntMap as IntMap ( - delete, empty, fromList, keysSet, insert, lookup, mapWithKey, maxViewWithKey - , member, minViewWithKey, size, union + delete, empty, filterWithKey, fromList, keysSet, insert, lookup + , mapWithKey, maxViewWithKey, member, minViewWithKey, size, union ) import Data.IntSet (IntSet) import Prelude hiding (lookup) @@ -74,6 +75,9 @@ union (IdMap intMap1) (IdMap intMap2) = mapWithKey :: (Id a -> b -> c) -> IdMap a b -> IdMap a c mapWithKey f (IdMap idMap) = IdMap {intMap = IntMap.mapWithKey (f . Id) idMap} +filterWithKey :: (Id a -> b -> Bool) -> IdMap a b -> IdMap a b +filterWithKey f = IdMap . IntMap.filterWithKey (f . Id) . intMap + fromList :: [(Id a, b)] -> IdMap a b fromList = IdMap . IntMap.fromList . fmap (\(key, b) -> (getId key, b)) diff --git a/src/PDF/Box.hs b/src/PDF/Box.hs index bf129f9..59b64ac 100644 --- a/src/PDF/Box.hs +++ b/src/PDF/Box.hs @@ -1,7 +1,6 @@ {-# LANGUAGE NamedFieldPuns #-} {-# LANGUAGE FunctionalDependencies #-} {-# LANGUAGE FlexibleInstances #-} -{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE UndecidableInstances #-} {-# LANGUAGE ScopedTypeVariables #-} module PDF.Box ( @@ -17,8 +16,8 @@ module PDF.Box ( import Control.Monad.Except (ExceptT(..), runExceptT) import Control.Monad.Fail (MonadFail(..)) -import Control.Monad.State (MonadState(..)) import Control.Monad.Reader (ReaderT, runReaderT) +import Control.Monad.State (MonadState(..)) import Data.Id (Id, IdMap) import qualified Data.Id as Id (insert, lookup) import Data.Map (Map) diff --git a/src/PDF/CMap.hs b/src/PDF/CMap.hs index 97549d6..67323e3 100644 --- a/src/PDF/CMap.hs +++ b/src/PDF/CMap.hs @@ -1,15 +1,17 @@ {-# LANGUAGE NamedFieldPuns #-} {-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE FlexibleContexts #-} module PDF.CMap ( CMap , CMappers , CRange(..) - , cMap , matches + , parse ) where import Control.Applicative ((<|>), many) -import Control.Monad.Fail (MonadFail(..)) +import Control.Monad.Except (MonadError(..)) +import Control.Monad.Fail (fail) import Control.Monad.State (modify) import Data.Attoparsec.ByteString.Char8 (count) import Data.ByteString (ByteString) @@ -18,6 +20,7 @@ import Data.ByteString.Char8 (unpack) import Data.ByteString.Char8.Util ( B16Int(..), b16ToBytes, b16ToInt, toBytes, utf16BEToutf8 ) +import Data.Foldable (foldl', foldr') import Data.Map (Map, mapWithKey, union) import qualified Data.Map as Map ( adjust, empty, fromList, insert, insertWith, lookup, toList @@ -75,7 +78,7 @@ encoder :: Map Size FromUnicode -> Encoder encoder fromUnicodes input | Text.null input = Right "" | otherwise = - foldl (<>) (Left "No encoding found") $ mapWithKey tryOn fromUnicodes + foldr' (<>) (Left "No encoding found") $ mapWithKey tryOn fromUnicodes where tryOn size fromUnicode = let (prefix, end) = Text.splitAt size input in @@ -87,8 +90,8 @@ matches :: ByteString -> CRange -> Bool matches code (CRange {fromSequence, toSequence}) = fromSequence <= code && code <= toSequence -cMap :: MonadFail m => ByteString -> m Font -cMap = either fail (return . toFont . snd) . runParser +parse :: MonadError String m => ByteString -> m Font +parse = either throwError (return . toFont . snd) . runParser (many (codeRanges <|> cMapRange <|> cMapChar <|> ignoredLine)) emptyCMap where @@ -148,7 +151,7 @@ saveToUnicodeBySize assoc@((code, _):_) = Map.adjust insertCRange (BS.length cod ) saveFromUnicodeBySize :: [(Text, ByteString)] -> Map Size FromUnicode -> Map Size FromUnicode -saveFromUnicodeBySize = flip (foldl insertFromUnicode) +saveFromUnicodeBySize = flip (foldl' insertFromUnicode) where insertFromUnicode :: Map Size FromUnicode -> (Text, ByteString) -> Map Size FromUnicode insertFromUnicode tmpFromUnicodeBySize (unicodeSequence, code) = diff --git a/src/PDF/Content.hs b/src/PDF/Content.hs index a3483b4..eacd41f 100644 --- a/src/PDF/Content.hs +++ b/src/PDF/Content.hs @@ -9,54 +9,77 @@ module PDF.Content ( Content(..) , ContentUnit(..) , GraphicContextUnit(..) + , IdContentUnit + , IdGraphicContextUnit + , IdTextContext , Instructions(..) , TextContext , parse ) where import Control.Applicative ((<|>)) -import Control.Monad.Fail (MonadFail) import Control.Monad.Reader (asks, runReader) -import Control.Monad.State (evalStateT, modify) +import Control.Monad.State.Strict (runState, evalStateT, modify) import Data.Attoparsec.ByteString.Char8 (sepBy) +import qualified Data.Attoparsec.ByteString.Char8 as Atto (Parser, parseOnly) import Data.ByteString (ByteString) import Data.Id (Id(..), Indexed, at, empty, register) import PDF.Box (Box(..)) import PDF.Content.Operator (Instruction, operator) import PDF.Object (blank, directObject) import PDF.Output (Output(..), line) -import PDF.Parser (Parser, runParser, string) +import PDF.Parser (string) data Instructions = Instructions -data GraphicContextUnit = - GraphicInstruction (Id Instruction) - | ContentUnit ContentUnit +data GraphicContextUnit a = + GraphicInstruction a + | ContentUnit (ContentUnit a) deriving Show -type TextContext = [Id Instruction] -data ContentUnit = - GraphicContext [GraphicContextUnit] - | TextContext TextContext +type TextContext a = [a] +data ContentUnit a = + GraphicContext [GraphicContextUnit a] + | TextContext (TextContext a) deriving Show data Content = Content { - contentUnits :: [ContentUnit] + contentUnits :: [IdContentUnit] , indexedInstructions :: Indexed Instruction + , firstError :: Maybe String } deriving Show -type InstructionParser = Parser (Indexed Instruction) +type TmpContentUnit = ContentUnit Instruction +type TmpGraphicContextUnit = GraphicContextUnit Instruction +type TmpTextContext = TextContext Instruction + +type IdContentUnit = ContentUnit (Id Instruction) +type IdGraphicContextUnit = GraphicContextUnit (Id Instruction) +type IdTextContext = TextContext (Id Instruction) instance Monad m => Box m Instructions Content (Indexed Instruction) where r Instructions = return . indexedInstructions w Instructions indexedInstructions someContent = return $ someContent {indexedInstructions} -parse :: MonadFail m => ByteString -> m Content -parse = - either fail (return . uncurry Content) . runParser contentUnits empty - where - contentUnits = contentUnit `sepBy` blank +parse :: ByteString -> Content +parse input = + let result = Atto.parseOnly (contentUnit `sepBy` blank) input in + let (contentUnits, indexedInstructions) = either (const ([], empty)) buildContent result in + let firstError = either Just (const Nothing) result in + Content {contentUnits, indexedInstructions, firstError} -contentUnit :: InstructionParser ContentUnit +buildContent :: [TmpContentUnit] -> ([IdContentUnit], Indexed Instruction) +buildContent instructionContentUnits = + runState (mapM registerContentUnit instructionContentUnits) empty + where + registerContentUnit (GraphicContext gc) = + GraphicContext <$> (mapM registerGraphicContext gc) + registerContentUnit (TextContext tc) = TextContext <$> (mapM register tc) + registerGraphicContext (GraphicInstruction gi) = + GraphicInstruction <$> (register gi) + registerGraphicContext (ContentUnit cu) = + ContentUnit <$> (registerContentUnit cu) + +contentUnit :: Atto.Parser TmpContentUnit contentUnit = (GraphicContext <$> graphicContext) <|> (TextContext <$> textContext) @@ -64,18 +87,18 @@ contentUnit = graphicContext = string "q" *> blank *> graphicContextUnit `sepBy` blank <* blank <* string "Q" -graphicContextUnit :: InstructionParser GraphicContextUnit +graphicContextUnit :: Atto.Parser TmpGraphicContextUnit graphicContextUnit = (GraphicInstruction <$> instruction) <|> (ContentUnit <$> contentUnit) -instruction :: InstructionParser (Id Instruction) -instruction = evalStateT stackParser [] >>= register +instruction :: Atto.Parser Instruction +instruction = evalStateT stackParser [] where stackParser = ((directObject <* blank) >>= push) <|> operator push arg = modify (arg:) *> stackParser -textContext :: InstructionParser TextContext +textContext :: Atto.Parser TmpTextContext textContext = string "BT" *> blank *> instruction `sepBy` blank <* blank <* string "ET" diff --git a/src/PDF/Content/Text.hs b/src/PDF/Content/Text.hs index 02246ed..9d3d15d 100644 --- a/src/PDF/Content/Text.hs +++ b/src/PDF/Content/Text.hs @@ -1,24 +1,22 @@ {-# LANGUAGE NamedFieldPuns #-} {-# LANGUAGE OverloadedStrings #-} -{-# LANGUAGE ConstraintKinds #-} {-# LANGUAGE FlexibleContexts #-} -{-# LANGUAGE TypeSynonymInstances #-} {-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE MultiParamTypeClasses #-} module PDF.Content.Text ( Chunks(..) , chunk , format - , renderText ) where import Control.Monad (foldM) import Control.Monad.Fail (MonadFail(..)) -import Control.Monad.Reader (MonadReader(..), ReaderT, asks, runReaderT) +import Control.Monad.Reader (ReaderT, asks, runReaderT) import Control.Monad.State ( MonadState(..), StateT, evalStateT, gets, modify, runStateT ) import Control.Monad.Trans (lift) +import qualified Data.ByteString.Char8 as BS (concatMap, singleton) import Data.Id (Id(..), Indexed, at, empty, singleton) import qualified Data.Id as Id (delete, lookup, register) import Data.Map ((!)) @@ -26,27 +24,20 @@ import Data.Text (Text, breakOn) import qualified Data.Text as Text (drop) import PDF.Box (Box(..)) import PDF.Content ( - Content, ContentUnit(..), GraphicContextUnit(..), contentUnits + Content(..), ContentUnit(..), GraphicContextUnit(..), IdContentUnit + , IdGraphicContextUnit ) -import qualified PDF.Content as Content (Content(..)) import PDF.Content.Operator (Instruction, Operator(..)) import PDF.Content.Operator.Text (Operator(..)) import PDF.Font (Font(..), FontSet, emptyFont) import PDF.Object (DirectObject(..), StringObject(..), toByteString) import Prelude hiding (fail) -data ROContext = ROContext { - indexedInstructions :: Indexed Instruction - , fontSet :: FontSet - } - -type TextContent m = ReaderT ROContext m -type FontContext m = StateT Font (TextContent m) - +type TmpFont = StateT Font +type Renderer m = ReaderT (Indexed Instruction) (ReaderT FontSet m) type Updater m = StateT (Indexed Instruction) (ReaderT (Indexed Text) (ReaderT FontSet m)) -type TextUpdater m = StateT Font (Updater m) -decodeString :: MonadFail m => StringObject -> FontContext m Text +decodeString :: MonadFail m => StringObject -> TmpFont (Renderer m) Text decodeString input = do Font {decode} <- get either fail return . decode $ toByteString input @@ -57,56 +48,53 @@ chunk :: Int -> Id Text chunk = Id instance MonadFail m => Box (ReaderT FontSet m) Chunks Content (Indexed Text) where - r Chunks content = do - fontSet <- ask - renderText fontSet content + r Chunks content = + runReaderT (mconcat <$> renderer) $ indexedInstructions content + where + renderer = mapM renderContentUnit (contentUnits content) - w Chunks indexedText (Content.Content {contentUnits, Content.indexedInstructions}) = - uncurry Content.Content <$> runReaderT ( - runStateT (mapM updateContentUnit contentUnits) indexedInstructions - ) indexedText + w Chunks indexedText content = do + (contentUnits, indexedInstructions) <- runReaderT readerUpdate indexedText + return $ content {contentUnits, indexedInstructions} + where + stateUpdate = mapM updateContentUnit (contentUnits content) + readerUpdate = runStateT stateUpdate (indexedInstructions content) -renderText :: MonadFail m => FontSet -> Content -> m (Indexed Text) -renderText fontSet (Content.Content {contentUnits, Content.indexedInstructions}) = - runReaderT (mconcat <$> mapM renderContentUnit contentUnits) roContext - where - roContext = ROContext {indexedInstructions, fontSet} - -renderContentUnit :: MonadFail m => ContentUnit -> TextContent m (Indexed Text) +renderContentUnit :: MonadFail m => IdContentUnit -> Renderer m (Indexed Text) renderContentUnit (GraphicContext graphicContextUnits) = mconcat <$> mapM renderGraphicContextUnit graphicContextUnits renderContentUnit (TextContext instructionIds) = evalStateT (mconcat <$> mapM renderInstructionId instructionIds) emptyFont -updateContentUnit :: MonadFail m => ContentUnit -> Updater m ContentUnit +updateContentUnit :: MonadFail m => IdContentUnit -> Updater m IdContentUnit updateContentUnit (GraphicContext graphicContextUnits) = GraphicContext <$> mapM updateGraphicContextUnit graphicContextUnits updateContentUnit (TextContext instructionIds) = TextContext . concat <$> evalStateT (mapM updateInstructionId instructionIds) emptyFont -renderGraphicContextUnit :: MonadFail m => GraphicContextUnit -> TextContent m (Indexed Text) +renderGraphicContextUnit :: MonadFail m => IdGraphicContextUnit -> Renderer m (Indexed Text) renderGraphicContextUnit (GraphicInstruction _) = return empty renderGraphicContextUnit (ContentUnit contentUnit) = renderContentUnit contentUnit -updateGraphicContextUnit :: MonadFail m => GraphicContextUnit -> Updater m GraphicContextUnit +updateGraphicContextUnit :: MonadFail m => IdGraphicContextUnit -> Updater m IdGraphicContextUnit updateGraphicContextUnit gI@(GraphicInstruction _) = return gI updateGraphicContextUnit (ContentUnit contentUnit) = ContentUnit <$> updateContentUnit contentUnit -renderInstructionId :: MonadFail m => Id Instruction -> FontContext m (Indexed Text) +renderInstructionId :: MonadFail m => Id Instruction -> TmpFont (Renderer m) (Indexed Text) renderInstructionId instructionId@(Id n) = toMap <$> - (asks ((`at` instructionId) . indexedInstructions) >>= renderInstruction) + (asks ((`at` instructionId)) >>= renderInstruction) where toMap = maybe empty (singleton (Id n)) -updateInstructionId :: MonadFail m => Id Instruction -> TextUpdater m [Id Instruction] +updateInstructionId :: MonadFail m => Id Instruction -> TmpFont (Updater m) [Id Instruction] updateInstructionId instructionId = lift (gets (`at` instructionId)) >>= updateInstruction instructionId -renderInstruction :: MonadFail m => Instruction -> FontContext m (Maybe Text) +renderInstruction :: MonadFail m => Instruction -> TmpFont (Renderer m) (Maybe Text) renderInstruction (Text Tf, [NameObject fontName, _]) = - asks ((! fontName) . fontSet) >>= put >> return Nothing + lift (lift $ asks (! fontName)) >>= put >> return Nothing renderInstruction (Text Tstar, []) = return $ Just "\n" @@ -128,7 +116,7 @@ renderInstruction (Text DQuote, [_, _, StringObject outputString]) = renderInstruction _ = return Nothing -updateInstruction :: MonadFail m => Id Instruction -> Instruction -> TextUpdater m [Id Instruction] +updateInstruction :: MonadFail m => Id Instruction -> Instruction -> TmpFont (Updater m) [Id Instruction] updateInstruction instructionId (Text Tf, [NameObject fontName, _]) = (lift . lift . lift $ asks (!fontName)) >>= put >> return [instructionId] @@ -155,5 +143,9 @@ format input = do (line, left) -> (:) <$> tj line <*> format left where tj t = do - literal <- either fail (return . Literal) =<< gets (($t) . encode) - return (Text Tj, [StringObject literal]) + encoded <- either fail return =<< gets (($t) . encode) + return (Text Tj, [StringObject . Literal $ BS.concatMap escape encoded]) + escape '\\' = "\\\\" + escape '(' = "\\(" + escape ')' = "\\)" + escape c = BS.singleton c diff --git a/src/PDF/Encoding.hs b/src/PDF/Encoding.hs index 8d14e6a..ba65846 100644 --- a/src/PDF/Encoding.hs +++ b/src/PDF/Encoding.hs @@ -1,12 +1,13 @@ +{-# LANGUAGE FlexibleContexts #-} module PDF.Encoding ( encoding ) where -import Control.Monad.Fail (MonadFail(..)) +import Control.Monad.Except (MonadError(..)) import PDF.Encoding.MacRoman (macRomanEncoding) import PDF.Font (Font) import Prelude hiding (fail) -encoding :: MonadFail m => String -> m Font +encoding :: MonadError String m => String -> m Font encoding "MacRomanEncoding" = return macRomanEncoding -encoding s = fail $ "Unknown encoding " ++ s +encoding s = throwError $ "Unknown encoding " ++ s diff --git a/src/PDF/Encoding/MacRoman.hs b/src/PDF/Encoding/MacRoman.hs index 71da279..a0d8e7a 100644 --- a/src/PDF/Encoding/MacRoman.hs +++ b/src/PDF/Encoding/MacRoman.hs @@ -3,6 +3,7 @@ module PDF.Encoding.MacRoman ( ) where import qualified Data.ByteString.Char8 as BS (pack, unpack) +import Data.Foldable (foldl') import Data.Map (Map) import qualified Data.Map as Map (empty, insert, lookup) import qualified Data.Text as Text (pack, unpack) @@ -24,7 +25,7 @@ macRomanEncoding = Font { | otherwise -> Left ("Character '" ++ k :"' unavailable in MacRoman") mappers :: (Mapper, Mapper) -mappers = foldl generateMapers (Map.empty, Map.empty) [ +mappers = foldl' generateMapers (Map.empty, Map.empty) [ ('\x80', '\x00C4') -- LATIN CAPITAL LETTER A WITH DIAERESIS , ('\x81', '\x00C5') -- LATIN CAPITAL LETTER A WITH RING ABOVE , ('\x82', '\x00C7') -- LATIN CAPITAL LETTER C WITH CEDILLA diff --git a/src/PDF/Layer.hs b/src/PDF/Layer.hs index 8daa3a2..37e5af1 100644 --- a/src/PDF/Layer.hs +++ b/src/PDF/Layer.hs @@ -1,21 +1,26 @@ {-# LANGUAGE NamedFieldPuns #-} {-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE MultiParamTypeClasses #-} +{-# LANGUAGE FlexibleContexts #-} module PDF.Layer ( Layer(..) + , LayerReader , Objects(..) , unify ) where +import Control.Monad.Except (MonadError(..)) +import Control.Monad.Reader (ReaderT) +import Data.Foldable (foldl') import Data.Id (Id(..), Indexed, keysSet, mapWithKey, member) -import qualified Data.Id as Id (empty, union) +import qualified Data.Id as Id (empty, insert, lookup, union) import Data.Map (Map, (!)) -import qualified Data.Map as Map (empty, union) +import qualified Data.Map as Map (empty, lookup, union) import qualified Data.IntSet as IntSet (delete, toList) import PDF.Box (Box(..)) import PDF.Object ( - IndirectObjCoordinates(..), Object, Occurrence(..) - , Structure(..), XRefEntry(..), XRefSection, eofMarker, outputBody + IndirectObjCoordinates(..), Object, Occurrence(..), Structure(..) + , XRefEntry(..), XRefSection, eofMarker, outputBody ) import qualified PDF.Output as Output (line) import PDF.Output ( @@ -26,24 +31,28 @@ import Text.Printf (printf) data Layer = Layer { occurrences :: [Occurrence] - , objects :: (Indexed Object) + , objects :: Indexed Object , docStructure :: Structure } deriving Show +type LayerReader m = ReaderT Layer m -updateXrefs :: XRefSection -> Map Resource Offset -> (XRefSection, Offset) -updateXrefs xRef offsets = (mapWithKey updateEntry xRef, offsets ! StartXRef) +updateXRefs :: XRefSection -> Map Resource Offset -> (XRefSection, Offset) +updateXRefs xRef offsets = (mapWithKey updateEntry xRef, offsets ! StartXRef) where - updateEntry objectId e@(InUse {}) = - e {offset = offsets ! (ObjectId $ getId objectId)} + updateEntry objectId e@(InUse {offset}) = + case Map.lookup (ObjectId $ getId objectId) offsets of + Nothing -> Free {nextFree = Id $ getOffset offset, generation = 65535} + Just newOffset -> e {offset = newOffset} updateEntry _ e = e instance Output Layer where - output (Layer {occurrences, objects, docStructure}) = - fmap (updateXrefs xRef) <$> getOffsets (outputBody (occurrences, objects)) - >>= \(body, (xref, startXRef)) -> mconcat [ - body + output (Layer {occurrences, objects, docStructure}) = do + (body, savedOffsets) <- getOffsets (outputBody (occurrences, objects)) + let (newXRef, startXRef) = updateXRefs xRef savedOffsets + mconcat [ + return body , Output.line "xref" - , output xref + , output newXRef , Output.line "trailer" , output trailer, newLine , Output.line "startxref" @@ -76,6 +85,14 @@ instance Monad m => Box m Objects Layer (Indexed Object) where Indirect (IndirectObjCoordinates {objectId, versionNumber = 0}) newOccurrences = (makeOccurrence . Id) <$> IntSet.toList newObjectIds +instance MonadError String m => Box m (Id Object) Layer Object where + r objectId = + maybe (throwError "Unknown key") return . Id.lookup objectId . objects + w objectId a layer@(Layer {objects}) + | member objectId objects = return $ + layer {objects = Id.insert objectId a objects} + | otherwise = throwError "Unknown key" + emptyLayer :: Layer emptyLayer = Layer { docStructure = Structure {xRef = Id.empty, trailer = Map.empty} @@ -84,7 +101,7 @@ emptyLayer = Layer { } unify :: [Layer] -> Layer -unify = foldl complete emptyLayer +unify = foldl' complete emptyLayer where complete tmpLayer older = let mergedObjects = Id.union (objects tmpLayer) (objects older) in diff --git a/src/PDF/Object.hs b/src/PDF/Object.hs index 795c988..cf1ea12 100644 --- a/src/PDF/Object.hs +++ b/src/PDF/Object.hs @@ -34,6 +34,7 @@ module PDF.Object ( ) where import Control.Applicative ((<|>), many) +import Control.Monad.Reader (asks) import Data.Attoparsec.ByteString.Char8 (choice, count, option, sepBy) import Data.ByteString (ByteString) import qualified Data.ByteString as BS (concat) @@ -48,11 +49,11 @@ import qualified Data.Id as Id ( import Data.Map (Map) import qualified Data.Map as Map (fromList, toList) import qualified Data.Set as Set (fromList, member) -import qualified PDF.EOL as EOL (charset, parser) +import qualified PDF.EOL as EOL (Style(..), charset, parser) import qualified PDF.Output as Output (line, string) import PDF.Output ( - OBuilder, Offset(..), Output(..), Resource(..) - , byteString, getOffset, join, newLine, saveOffset + OBuilder, OContext(..), Offset(..), Output(..), Resource(..), byteString + , getOffset, join, newLine, saveOffset ) import PDF.Parser (MonadParser(..), Parser, (), octDigit, oneOf) import Text.Printf (printf) @@ -190,8 +191,8 @@ nullObject = string "null" *> return () "null object" -- Reference -- data IndirectObjCoordinates = IndirectObjCoordinates { - objectId :: (Id Object) - , versionNumber :: Int + objectId :: {-# UNPACK #-} !(Id Object) + , versionNumber :: {-# UNPACK #-} !Int } deriving Show reference :: MonadParser m => m IndirectObjCoordinates @@ -202,14 +203,14 @@ reference = IndirectObjCoordinates -- DirectObject -- data DirectObject = - Boolean Bool - | NumberObject Number - | StringObject StringObject - | NameObject Name - | Array [DirectObject] - | Dictionary Dictionary + Boolean !Bool + | NumberObject !Number + | StringObject !StringObject + | NameObject !Name + | Array ![DirectObject] + | Dictionary !Dictionary | Null - | Reference IndirectObjCoordinates + | Reference !IndirectObjCoordinates deriving Show instance Output DirectObject where @@ -294,10 +295,15 @@ data XRefEntry = InUse { } deriving Show instance Output XRefEntry where - output (InUse {offset, generation}) = - Output.line (printf "%010d %05d n " (getOffset offset) generation) - output (Free {nextFree, generation}) = - Output.line (printf "%010d %05d f " (getId nextFree) generation) + output xRefEntry = Output.string (build xRefEntry) `mappend` endXRefEntryLine + where + build (InUse {offset, generation}) = + printf "%010d %05d n" (getOffset offset) generation + build (Free {nextFree, generation}) = + printf "%010d %05d f" (getId nextFree) generation + endXRefEntryLine = OContext (asks padEOLToTwoBytes) >>= Output.line + padEOLToTwoBytes EOL.CRLF = ("" :: String) + padEOLToTwoBytes _ = " " entry :: Parser u XRefEntry entry = do diff --git a/src/PDF/Object/Navigation.hs b/src/PDF/Object/Navigation.hs index 4eadde8..58ead06 100644 --- a/src/PDF/Object/Navigation.hs +++ b/src/PDF/Object/Navigation.hs @@ -3,98 +3,162 @@ {-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE MultiParamTypeClasses #-} {-# LANGUAGE FlexibleInstances #-} +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE DeriveFunctor #-} module PDF.Object.Navigation ( - Clear(..) - , PDFContent - , Raw(..) + Nav(..) + , PPath(..) + , ROLayer + , RWLayer + , StreamContent(..) , (./) , (//) , (>./) , (>//) , castObject + , catalog , getDictionary , getKey , objectById - , catalog + , save ) where import Codec.Compression.Zlib (compress, decompress) +import Control.Monad.Except (MonadError(..)) import Control.Monad.Reader (MonadReader(..)) -import Control.Monad.Fail (MonadFail(..)) +import Control.Monad.State (MonadState) import Data.ByteString (ByteString) +import qualified Data.ByteString as BS (length) import qualified Data.ByteString.Lazy as Lazy (ByteString, fromStrict, toStrict) -import Data.Id (Id, at) -import qualified Data.Map as Map (lookup) -import PDF.Box (Box(..)) +import Data.Id (Id) +import qualified Data.Id as Id (at) +import qualified Data.Map as Map (adjust, insert, lookup) +import PDF.Box (Box(..), at, edit{-, runRO-}) import PDF.Layer (Layer(..)) import PDF.Object ( Dictionary, DirectObject(..), IndirectObjCoordinates(..) - , Name(..), Object(..), Structure(..) + , Name(..), Number(..), Object(..), Structure(..) ) import Prelude hiding (fail) import Text.Printf (printf) -type PDFContent m = (MonadReader Layer m, MonadFail m) +newtype PPath = PPath [Component] +data DPath = DPath { + root :: Id Object + , offset :: [Component] + } deriving Show + +push :: Component -> DPath -> DPath +push component dPath = dPath {offset = (offset dPath) ++ [component]} + +data Nav a = Nav { + dPath :: DPath + , value :: a + } deriving (Functor) + +instance Show a => Show (Nav a) where + show (Nav {dPath, value}) = "Nav {dPath = " ++ show dPath ++ ", value = " ++ show value ++ "}" + +type ROLayer m = (MonadReader Layer m, MonadError String m) +type RWLayer m = (MonadState Layer m, MonadError String m) type Component = String -getDictionary :: PDFContent m => Object -> m Dictionary -getDictionary (Direct (Dictionary aDict)) = return aDict -getDictionary (Direct (Reference ref)) = - objectById (objectId ref) >>= getDictionary -getDictionary (Stream {header}) = return header -getDictionary obj = expected "dictionary : " obj +getDictionary :: ROLayer m => Nav Object -> m (Nav Dictionary) +getDictionary (Nav {dPath, value}) = + case value of + (Direct (Dictionary aDict)) -> return $ Nav {dPath, value = aDict} + (Direct (Reference ref)) -> objectById (objectId ref) >>= getDictionary + (Stream {header}) -> return $ Nav {dPath, value = header} + obj -> expected "dictionary : " obj -expected :: (MonadFail m, Show a) => String -> a -> m b -expected name = fail . printf "Not a %s: %s" name . show +expected :: (MonadError String m, Show a) => String -> a -> m b +expected name = throwError . printf "Not a %s: %s" name . show -getKey :: PDFContent m => String -> Object -> m DirectObject -getKey key object = getDictionary object >>= catchMaybe . Map.lookup (Name key) +getKey :: ROLayer m => String -> Nav Object -> m (Nav DirectObject) +getKey key navObject = getDictionary navObject >>= f where errorMessage = - printf "Key %s not found in object %s" key (show object) - catchMaybe = maybe (fail errorMessage) return + printf "Key %s not found in object %s" key (show navObject) + f (Nav {dPath, value}) = + case Map.lookup (Name key) value of + Nothing -> throwError errorMessage + Just dObj -> return $ Nav {dPath = push key dPath, value = dObj} -objectById :: PDFContent m => (Id Object) -> m Object +objectById :: ROLayer m => (Id Object) -> m (Nav Object) objectById objectId = do layer <- ask - return (objects layer `at` objectId) + return $ Nav { + dPath = DPath {root = objectId, offset = []} + , value = objects layer `Id.at` objectId + } -(./) :: PDFContent m => m Object -> Component -> m Object -(./) object key = (object >>= getKey key >>= castObject) - -castObject :: PDFContent m => DirectObject -> m Object -castObject (Reference (IndirectObjCoordinates {objectId})) = +castObject :: ROLayer m => Nav DirectObject -> m (Nav Object) +castObject (Nav {value = !(Reference (IndirectObjCoordinates {objectId}))}) = objectById objectId -castObject directObject = return $ Direct directObject +castObject (Nav {dPath, value}) = return $ Nav {dPath, value = Direct value} -(//) :: PDFContent m => m Object -> [Component] -> m Object -(//) object [] = object -(//) object (key:keys) = object ./ key // keys +(./) :: ROLayer m => m (Nav Object) -> Component -> m (Nav Object) +(./) navObject key = (navObject >>= getKey key >>= castObject) -(>./) :: PDFContent m => Object -> Component -> m Object -(>./) object = (return object ./) +(//) :: ROLayer m => m (Nav Object) -> PPath -> m (Nav Object) +(//) navObject (PPath []) = navObject +(//) navObject (PPath (key:keys)) = navObject ./ key // (PPath keys) -(>//) :: PDFContent m => Object -> [Component] -> m Object -(>//) object = (return object //) +(>./) :: ROLayer m => Nav Object -> Component -> m (Nav Object) +(>./) navObject = (return navObject ./) -catalog :: PDFContent m => m Object -catalog = Direct . Dictionary . trailer . docStructure <$> ask +(>//) :: ROLayer m => Nav Object -> PPath -> m (Nav Object) +(>//) navObject = (return navObject //) -data Clear = Clear -data Raw = Raw +catalog :: ROLayer m => m (Nav Object) +catalog = do + value <- Direct . Dictionary . trailer . docStructure <$> ask + return $ Nav {dPath = undefined, value} + +setAt :: [Component] -> DirectObject -> Dictionary -> Dictionary +setAt [] _ dict = dict +setAt [component] directObject dict = + Map.insert (Name component) directObject dict +setAt (component:components) directObject dict = + Map.adjust setDirObj (Name component) dict + where + setDirObj (Dictionary subDict) = + Dictionary $ setAt components directObject subDict + setDirObj x = x + +save :: RWLayer m => Nav Object -> m () +save nav@(Nav {dPath, value = Direct dObj}) = + edit .at (root dPath) $ return . setObj + where + setObj obj@(Stream {header}) = + obj {header = setAt (offset dPath) dObj header} + setObj (Direct (Dictionary dict)) = + Direct . Dictionary $ setAt (offset dPath) dObj dict + setObj _ = value nav +save (Nav {dPath = DPath {root, offset = []}, value}) = edit $ w root value +save _ = throwError "Streams can't be properties of PDF objects" + +data StreamContent = Clear | Raw onLazy :: (Lazy.ByteString -> Lazy.ByteString) -> ByteString -> ByteString onLazy f = Lazy.toStrict . f . Lazy.fromStrict contains :: String -> DirectObject -> Bool -contains needle (NameObject (Name n)) = needle == n -contains needle (Array directObjects) = oneOf directObjects (contains needle) +contains needle !(NameObject (Name n)) = needle == n +contains needle !(Array directObjects) = oneOf directObjects (contains needle) where oneOf [] _ = False oneOf (x:xs) p = p x || oneOf xs p contains _ _ = False -instance MonadFail m => Box m Clear Object ByteString where +instance MonadError String m => Box m StreamContent (Nav Object) ByteString where + r sc = r sc . value + w sc newStreamContent nav = setValue <$> w sc newStreamContent (value nav) + where + setValue value = nav {value} + +instance MonadError String m => Box m StreamContent Object ByteString where + r Raw (Stream {streamContent}) = return streamContent r Clear (Stream {header, streamContent}) = return $ case Map.lookup (Name "Filter") header of Just directObject @@ -102,17 +166,15 @@ instance MonadFail m => Box m Clear Object ByteString where _ -> streamContent r _ obj = expected "stream" obj - w Clear streamContent obj@(Stream {header}) = return $ - case Map.lookup (Name "Filter") header of - Just directObject - | contains "FlateDecode" directObject -> - obj {streamContent = onLazy compress streamContent} - _ -> obj {streamContent} - w _ _ obj = expected "stream" obj - -instance MonadFail m => Box m Raw Object ByteString where - r Raw (Stream {streamContent}) = return streamContent - r _ obj = expected "stream" obj - w Raw streamContent obj@(Stream {}) = return $ obj {streamContent} + w Clear newStreamContent (Stream {header}) = + let streamContent = getStreamContent (Map.lookup (Name "Filter") header) in + return $ Stream {header = fixLength streamContent, streamContent} + where + getStreamContent (Just directObject) + | contains "FlateDecode" directObject = onLazy compress newStreamContent + getStreamContent _ = newStreamContent + fixLength sc = + let newLength = NumberObject . Number . fromIntegral $ BS.length sc in + Map.insert (Name "Length") newLength header w _ _ obj = expected "stream" obj diff --git a/src/PDF/Output.hs b/src/PDF/Output.hs index f349ab7..373f554 100644 --- a/src/PDF/Output.hs +++ b/src/PDF/Output.hs @@ -55,9 +55,8 @@ saveOffset resource = OContext $ lift :: (a -> Builder) -> a -> OBuilder lift f a = return (f a) -getOffsets :: OBuilder -> OContext (OBuilder, Map Resource Offset) -getOffsets (OContext builder) = - OContext (listen builder >>= \(a, w) -> return (return a, w)) +getOffsets :: OBuilder -> OContext (Builder, Map Resource Offset) +getOffsets (OContext builder) = OContext $ listen builder append :: OBuilder -> OBuilder -> OBuilder append (OContext a) (OContext b) = OContext (mappend <$> a <*> b) diff --git a/src/PDF/Pages.hs b/src/PDF/Pages.hs index a78f709..80d9048 100755 --- a/src/PDF/Pages.hs +++ b/src/PDF/Pages.hs @@ -13,63 +13,62 @@ module PDF.Pages ( , withResources ) where -import Control.Applicative (Alternative, (<|>)) +import Control.Applicative (Alternative) +import Control.Applicative ((<|>)) import Control.Monad (foldM) -import Control.Monad.Fail (MonadFail(..)) +import Control.Monad.Except (MonadError(..)) import Control.Monad.Reader (ReaderT, runReaderT) import Control.Monad.State (StateT(..), evalStateT, execStateT, gets, modify) import Control.Monad.Trans (lift) +import Data.ByteString (ByteString) import Data.ByteString.Lazy (toStrict) -import Data.Id (Id, IdMap) +import Data.Id (Id(..), IdMap) import qualified Data.Id as Id (empty, insert, lookup) import Data.Map (Map) -import qualified Data.Map as Map (empty, fromList, insert, toList) +import qualified Data.Map as Map (empty, elems, fromList, insert, toList) +import Data.Maybe (listToMaybe) import Data.OrderedMap (OrderedMap, build, mapi) -import PDF.Box (Box(..), at, edit) -import PDF.CMap (cMap) +import PDF.Box (Box(..), at, edit, runRO) +import qualified PDF.CMap as CMap (parse) import PDF.Content (Content(..)) import qualified PDF.Content as Content (parse) import PDF.Encoding (encoding) import PDF.EOL (Style(..)) import PDF.Font (Font, FontSet) -import PDF.Layer (Layer(..), Objects(..)) +import PDF.Layer (Layer(..), LayerReader) import PDF.Object ( Dictionary, DirectObject(..), IndirectObjCoordinates(..) - , Name(..), Object(..) + , Name(..), Number(..), Object(..) ,) import PDF.Object.Navigation ( - Clear(..), (//), (>./), (>//), getDictionary - , getKey, objectById, origin + Nav(..), PPath(..), ROLayer, RWLayer, StreamContent(..), (//), (>./) + , (>//), catalog, getDictionary, getKey, objectById, save ) import PDF.Output (render) -import Prelude hiding (fail) import Text.Printf (printf) -type Except m = (Alternative m, MonadFail m) -type InLayer m = ReaderT Layer m - type CachedFonts = IdMap Object Font -type FontCache m = StateT CachedFonts (InLayer m) +type FontCache m = StateT CachedFonts m data Page = Page { - contents :: OrderedMap (Id Object) Content + byteContents :: OrderedMap (Id Object) ByteString , resources :: Dictionary , source :: (Id Object) } -loadContents :: Except m => DirectObject -> InLayer m (OrderedMap (Id Object) Content) -loadContents directObject = - sequenceA . build loadContent $ objectIds directObject +loadByteContents :: ROLayer m => DirectObject -> m (OrderedMap (Id Object) ByteString) +loadByteContents directObject = do + objs <- sequence . build objectById $ objectIds directObject + mapM (r Clear) objs where - loadContent :: Except m => (Id Object) -> InLayer m Content - loadContent objectId = objectById objectId >>= r Clear >>= Content.parse objectIds (Array l) = l >>= getReference objectIds dirObj = getReference dirObj -getFontDictionary :: Except m => Object -> InLayer m Dictionary +getFontDictionary :: (Alternative m, ROLayer m) => Nav Object -> m Dictionary getFontDictionary pageObj = - (pageObj >// ["Resources", "Font"] >>= getDictionary) <|> return Map.empty + (pageObj >// PPath ["Resources", "Font"] >>= fmap value . getDictionary) + <|> return Map.empty -cache :: Except m => ((Id Object) -> FontCache m Font) -> (Id Object) -> FontCache m Font +cache :: ROLayer m => (Id Object -> FontCache m Font) -> Id Object -> FontCache m Font cache loader objectId = gets (Id.lookup objectId) >>= maybe load return where @@ -78,81 +77,105 @@ cache loader objectId = modify $ Id.insert objectId value return value -loadFont :: Except m => (Id Object) -> FontCache m Font +loadFont :: (Alternative m, ROLayer m) => Id Object -> FontCache m Font loadFont objectId = lift $ objectById objectId >>= tryMappings where tryMappings object = - (object >./ "ToUnicode" >>= r Clear >>= cMap) - <|> (object >./ "Encoding" >>= loadEncoding) - <|> (fail $ unknownFormat (show objectId) (show object)) + (object >./ "ToUnicode" >>= r Clear >>= CMap.parse) + <|> (object >./ "Encoding" >>= loadEncoding . value) + <|> (throwError $ unknownFormat (show objectId) (show object)) unknownFormat = printf "Unknown font format for object #%s : %s" - loadEncoding :: MonadFail m => Object -> m Font + loadEncoding :: MonadError String m => Object -> m Font loadEncoding (Direct (NameObject (Name name))) = encoding name loadEncoding object = - fail $ printf "Encoding must be a name, not that : %s" $ show object + throwError $ printf "Encoding must be a name, not that : %s" $ show object -loadResources :: Except m => Dictionary -> FontCache m FontSet +loadResources :: (Alternative m, ROLayer m) => Dictionary -> FontCache m FontSet loadResources = foldM addFont Map.empty . Map.toList where addFont output (name, Reference (IndirectObjCoordinates {objectId})) = flip (Map.insert name) output <$> cache loadFont objectId addFont output _ = return output -getReference :: DirectObject -> [(Id Object)] +getReference :: DirectObject -> [Id Object] getReference (Reference (IndirectObjCoordinates {objectId})) = [objectId] getReference _ = [] -loadPage :: Except m => (Id Object) -> InLayer m Page +loadPage :: (Alternative m, ROLayer m) => Id Object -> m Page loadPage source = do page <- objectById source - contents <- getKey "Contents" page >>= loadContents + byteContents <- loadByteContents . value =<< getKey "Contents" page resources <- getFontDictionary page - return $ Page {contents, resources, source} + return $ Page {byteContents, resources, source} -pagesList :: Except m => InLayer m [(Id Object)] +pagesList :: (Alternative m, ROLayer m) => m [Id Object] pagesList = - (origin // ["Root", "Pages"] >>= getKey "Kids" >>= return . getReferences) + (catalog // PPath ["Root", "Pages", "Kids"] >>= getReferences . value) <|> return [] where - getReferences (Array kids) = kids >>= getReference - getReferences _ = fail "Not a pages array" + getReferences (Direct (Array kids)) = return $ getReference =<< kids + getReferences _ = throwError "Not a pages array" -updatePage :: MonadFail m => Page -> StateT Layer m () -updatePage (Page {contents}) = sequence_ $ mapi updateContent contents +editPagesList :: RWLayer m => ([DirectObject] -> [DirectObject]) -> m () +editPagesList f = do + pages <- runRO (catalog // PPath ["Root", "Pages"]) + kids <- runRO (pages >./ "Kids") + count <- runRO (pages >./ "Count") + (newSize, newKids) <- editKids (value kids) + save $ kids {value = newKids} + save $ count {value = Direct $ NumberObject newSize} where - updateContent source content = - edit . at Objects . at source . at Clear $ setContent content - setContent content _ = return . toStrict $ render LF content + editKids (Direct (Array pageRefs)) = + let result = f pageRefs in + return (Number . fromIntegral $ length result, Direct $ Array result) + editKids _ = throwError "Invalid format for Root.Pages.Kids (not an array)" + +updatePage :: RWLayer m => Page -> m () +updatePage (Page {byteContents}) = sequence_ $ mapi updateByteContent byteContents + where + updateByteContent source byteContent = + edit .at source .at Clear $ \_ -> return byteContent data Pages = Pages newtype PageNumber = P Int data Contents = Contents -instance (Alternative m, MonadFail m) => Box m Pages Layer (Map Int Page) where - r Pages layer = runReaderT (numbered <$> (mapM loadPage =<< pagesList)) layer +instance (Alternative m, MonadError String m) => Box m Pages Layer (Map Int Page) where + r Pages = runReaderT (numbered <$> pagesList >>= mapM loadPage) where - numbered :: [Page] -> Map Int Page + numbered :: [Id Object] -> Map Int (Id Object) numbered = Map.fromList . zip [1..] - w Pages pages = execStateT $ mapM_ updatePage pages - -instance (Alternative m, MonadFail m) => Box m PageNumber Layer Page where - r (P i) layer - | i < 1 = fail "Pages start at 1" - | otherwise = runReaderT (drop (i - 1) <$> pagesList >>= firstPage) layer + w Pages pages = execStateT $ do + mapM_ updatePage pages + setPagesList $ Map.elems (source <$> pages) where - firstPage [] = fail "Page is out of bounds" - firstPage (p:_) = loadPage p + setPagesList = + editPagesList . const . fmap (Reference . flip IndirectObjCoordinates 0) - w _ page = execStateT $ updatePage page +instance (Alternative m, MonadError String m) => Box m PageNumber Layer Page where + r (P p) layer + | p < 1 = throwError "Pages start at 1" + | otherwise = runReaderT (drop (p - 1) <$> pagesList >>= firstPage) layer + where + firstPage = + maybe (throwError "Page is out of bounds") loadPage . listToMaybe + + w (P p) page = execStateT $ do + updatePage page + editPagesList $ setPage (Reference $ IndirectObjCoordinates (source page) 0) + where + setPage ref l = take (p-1) l ++ ref : drop p l instance Monad m => Box m Contents Page (OrderedMap (Id Object) Content) where - r Contents = return . contents - w _ contents page = return $ page {contents} + r Contents = return . fmap Content.parse . byteContents + w Contents contents page = return $ page {byteContents} + where + byteContents = toStrict . render LF <$> contents -withFonts :: Monad m => (Layer -> FontCache m a) -> Layer -> m a +withFonts :: MonadError String m => (Layer -> FontCache (LayerReader m) a) -> Layer -> m a withFonts f layer = runReaderT (evalStateT (f layer) Id.empty) layer -withResources :: Except m => (Page -> ReaderT FontSet m b) -> Page -> FontCache m b +withResources :: (Alternative m, MonadError String m) => (Page -> ReaderT FontSet m b) -> Page -> FontCache (LayerReader m) b withResources f p = loadResources (resources p) >>= lift . lift . runReaderT (f p)