Make a couple improvements in performance + add an example script to extract pages from a PDF

This commit is contained in:
Tissevert 2020-05-28 18:54:15 +02:00
parent f6664683c7
commit d9f69014a0
17 changed files with 448 additions and 258 deletions

View file

@ -59,7 +59,7 @@ executable equivalent
build-depends: base build-depends: base
, bytestring , bytestring
, Hufflepdf , Hufflepdf
ghc-options: -Wall ghc-options: -Wall -rtsopts
default-language: Haskell2010 default-language: Haskell2010
executable getObj executable getObj
@ -69,7 +69,7 @@ executable getObj
, containers , containers
, Hufflepdf , Hufflepdf
, mtl , mtl
ghc-options: -Wall ghc-options: -Wall -rtsopts
default-language: Haskell2010 default-language: Haskell2010
executable getText executable getText
@ -77,9 +77,34 @@ executable getText
build-depends: base build-depends: base
, bytestring , bytestring
, containers , containers
, ExceptIOH
, Hufflepdf , Hufflepdf
, mtl
, text , text
ghc-options: -Wall ghc-options: -Wall -rtsopts
default-language: Haskell2010
executable pdfCut
main-is: examples/pdfCut.hs
build-depends: base
, bytestring
, containers
, ExceptIOH
, filepath
, Hufflepdf
, mtl
, text
ghc-options: -Wall -rtsopts
default-language: Haskell2010
executable fixMermoz
main-is: examples/fixMermoz.hs
build-depends: base
, bytestring
, ExceptIOH
, Hufflepdf
, mtl
ghc-options: -Wall -rtsopts
default-language: Haskell2010 default-language: Haskell2010
Test-Suite unitTests Test-Suite unitTests

View file

@ -1,8 +1,6 @@
{-# LANGUAGE NamedFieldPuns #-} {-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
import Control.Monad.Fail (MonadFail(..))
import Control.Monad.Reader (ReaderT, runReaderT) import Control.Monad.Reader (ReaderT, runReaderT)
import Data.ByteString.Lazy.Char8 (ByteString) import Data.ByteString.Lazy.Char8 (ByteString)
import qualified Data.ByteString.Char8 as BS (readFile) import qualified Data.ByteString.Char8 as BS (readFile)
@ -13,7 +11,7 @@ import PDF.Box (Box(..))
import PDF.Layer (Layer(..), unify) import PDF.Layer (Layer(..), unify)
import PDF.Object (Object(..)) import PDF.Object (Object(..))
import PDF.Object.Navigation ( import PDF.Object.Navigation (
Clear(..), Raw(..), (//), objectById, catalog Nav(..), PPath(..), StreamContent(..), (//), objectById, catalog
) )
import PDF.Output (Output) import PDF.Output (Output)
import qualified PDF.Output as Output (render) import qualified PDF.Output as Output (render)
@ -23,33 +21,27 @@ import System.Exit (die)
import Text.Printf (printf) import Text.Printf (printf)
import Text.Read (readMaybe) import Text.Read (readMaybe)
newtype Error a = Error {
runError :: Either String a
} deriving (Functor, Applicative, Monad)
instance MonadFail Error where
fail = Error . Left
decodedStream :: Object -> Object decodedStream :: Object -> Object
decodedStream object = decodedStream object =
maybe object id $ r Clear object >>= flip (w Raw) object either (const object) id $ r Clear object >>= flip (w Raw) object
display :: Output a => ReaderT Layer Error a -> Document -> Either String ByteString display :: Functor m => Output a => ReaderT Layer m a -> Document -> m ByteString
display getter (Document {eolStyle, layers}) = display getter (Document {eolStyle, layers}) =
Output.render eolStyle <$> runError (runReaderT getter (unify layers)) Output.render eolStyle <$> runReaderT getter (unify layers)
parse :: [String] -> IO (FilePath, Document -> Either String ByteString) parse :: [String] -> IO (FilePath, Document -> Either String ByteString)
parse [inputFile] = return (inputFile, display catalog) parse [inputFile] = return (inputFile, display $ value <$> catalog)
parse [inputFile, key] = parse [inputFile, key] =
return (inputFile, clear . maybe (byPath key) byId $ readMaybe key) return (inputFile, clear . maybe (byPath key) byId $ readMaybe key)
where where
byId = objectById . Id byId = objectById . Id
byPath path = catalog // (explode path) byPath path = (catalog // PPath (explode path))
explode "" = [] explode "" = []
explode path = explode path =
case break (== '.') path of case break (== '.') path of
(name, "") -> [name] (name, "") -> [name]
(name, rest) -> name : explode (drop 1 rest) (name, rest) -> name : explode (drop 1 rest)
clear = display . fmap decodedStream clear = display . fmap (decodedStream . value)
parse _ = parse _ =
die . printf "Syntax: %s inputFile [OBJECT_ID | PATH_TO_OBJ]\n" =<< getProgName die . printf "Syntax: %s inputFile [OBJECT_ID | PATH_TO_OBJ]\n" =<< getProgName

View file

@ -1,5 +1,7 @@
{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE FlexibleContexts #-}
import Control.Monad ((>=>)) import Control.Monad ((>=>))
import Control.Monad.Except (ExceptT(..))
import Control.Monad.Except.IOH (handle)
import Control.Monad.IO.Class (liftIO) import Control.Monad.IO.Class (liftIO)
import qualified Data.ByteString.Char8 as BS (readFile) import qualified Data.ByteString.Char8 as BS (readFile)
import Data.Id (Id(..), mapWithKey) import Data.Id (Id(..), mapWithKey)
@ -9,7 +11,7 @@ import qualified Data.Text as Text (unpack)
import PDF (UnifiedLayers(..), parseDocument) import PDF (UnifiedLayers(..), parseDocument)
import PDF.Box (Box(..)) import PDF.Box (Box(..))
import PDF.Content.Text (Chunks(..)) import PDF.Content.Text (Chunks(..))
import PDF.Layer (Layer) import PDF.Layer (Layer, LayerReader)
import PDF.Pages ( import PDF.Pages (
Contents(..), FontCache, Page(..), PageNumber(..), Pages(..), withFonts Contents(..), FontCache, Page(..), PageNumber(..), Pages(..), withFonts
, withResources , withResources
@ -19,7 +21,7 @@ import System.Exit (die)
import System.IO (BufferMode(..), hSetBuffering, stdout) import System.IO (BufferMode(..), hSetBuffering, stdout)
import Text.Printf (printf) import Text.Printf (printf)
displayPage :: Int -> Page -> FontCache IO () displayPage :: Int -> Page -> FontCache (LayerReader (ExceptT String IO)) ()
displayPage n = withResources ( displayPage n = withResources (
r Contents r Contents
>=> sequence_ . mapi (\objectId -> >=> sequence_ . mapi (\objectId ->
@ -31,22 +33,22 @@ displayPage n = withResources (
liftIO . putStrLn $ liftIO . putStrLn $
printf "p#%d obj#%d instr#%d: %s" n (getId a) (getId b) (Text.unpack v) printf "p#%d obj#%d instr#%d: %s" n (getId a) (getId b) (Text.unpack v)
getAll :: Layer -> IO () getAll :: Layer -> ExceptT String IO ()
getAll = withFonts $ r Pages >=> sequence_ . Map.mapWithKey displayPage getAll = withFonts $ r Pages >=> sequence_ . Map.mapWithKey displayPage
get :: Int -> Layer -> IO () get :: Int -> Layer -> ExceptT String IO ()
get n = withFonts $ r (P n) >=> displayPage n get n = withFonts $ r (P n) >=> displayPage n
onDoc :: FilePath -> (Layer -> IO ()) -> IO () onDoc :: FilePath -> (Layer -> ExceptT String IO ()) -> ExceptT String IO ()
onDoc inputFile f = do onDoc inputFile f =
(parseDocument <$> BS.readFile inputFile) ExceptT (parseDocument <$> BS.readFile inputFile) >>= r UnifiedLayers >>= f
>>= either die (r UnifiedLayers >=> f)
main :: IO () main :: IO ()
main = do main = do
hSetBuffering stdout LineBuffering hSetBuffering stdout LineBuffering
args <- getArgs args <- getArgs
case args of case args of
[inputFile] -> onDoc inputFile getAll [inputFile] -> onDoc inputFile getAll `handle` die
[inputFile, pageNumber] -> onDoc inputFile (get $ read pageNumber) [inputFile, pageNumber] ->
onDoc inputFile (get $ read pageNumber) `handle` die
_ -> die "Syntax: getText INPUT_FILE [PAGE_NUMBER]" _ -> die "Syntax: getText INPUT_FILE [PAGE_NUMBER]"

41
examples/pdfCut.hs Normal file
View file

@ -0,0 +1,41 @@
import Control.Monad.State (MonadState(..), evalStateT)
import Control.Monad.Except.IOH (handle)
import Control.Monad.Trans (lift)
import qualified Data.ByteString.Char8 as BS (readFile)
import qualified Data.ByteString.Lazy.Char8 as Lazy (writeFile)
import Data.Map ((!), fromSet)
import qualified Data.Set as Set (fromList)
import PDF (Document, UnifiedLayers(..), parseDocument, render)
import PDF.Box (at)
import PDF.Pages (Pages(..))
import System.Environment (getArgs)
import System.Exit (die)
import System.FilePath (replaceBaseName, takeBaseName)
import Text.Read (readEither)
parseRange :: String -> Either String (Int, Int)
parseRange = evalStateT $ do
from <- lift . readEither =<< state (break (== '-'))
get >>= makeRange from
where
makeRange from "" = return (from, from)
makeRange from (_:to) = (,) from <$> lift (readEither to)
cut :: (Int, Int) -> Document -> IO Document
cut (p1, p2) doc = ((
at UnifiedLayers .at Pages $ \pages ->
return $ fromSet (pages!) $ Set.fromList [pMin .. pMax]
) doc) `handle` die
where
(pMin, pMax) = (min p1 p2, max p1 p2)
main :: IO ()
main = do
[inputPath, inputRange] <- getArgs
range <- catchLeft $ parseRange inputRange
doc <- catchLeft =<< parseDocument <$> BS.readFile inputPath
Lazy.writeFile (outputPath inputPath inputRange) . render =<< cut range doc
where
catchLeft = either die return
outputPath fileName range =
replaceBaseName fileName (takeBaseName fileName ++ "_p" ++ range)

View file

@ -14,7 +14,7 @@ module Data.ByteString.Char8.Util (
) where ) where
import Data.ByteString (ByteString, snoc) import Data.ByteString (ByteString, snoc)
import qualified Data.ByteString as BS (empty, foldl, length, pack, singleton, splitAt) import qualified Data.ByteString as BS (empty, foldl', length, pack, singleton, splitAt)
import qualified Data.ByteString.Char8 as Char8 ( import qualified Data.ByteString.Char8 as Char8 (
cons, drop, index, splitAt, take, uncons, unpack cons, drop, index, splitAt, take, uncons, unpack
) )
@ -43,7 +43,7 @@ intToB256 n
B256Int $ begining `snoc` (toEnum (n `mod` 0x100)) B256Int $ begining `snoc` (toEnum (n `mod` 0x100))
b256ToInt :: B256Int -> Int b256ToInt :: B256Int -> Int
b256ToInt (B256Int n) = BS.foldl (\k w -> 0x100*k + fromEnum w) 0 n b256ToInt (B256Int n) = BS.foldl' (\k w -> 0x100*k + fromEnum w) 0 n
toBytes :: Int -> Int -> ByteString toBytes :: Int -> Int -> ByteString
toBytes 0 _ = BS.empty toBytes 0 _ = BS.empty

View file

@ -8,6 +8,7 @@ module Data.Id (
, at , at
, delete , delete
, empty , empty
, filterWithKey
, fromList , fromList
, insert , insert
, keysSet , keysSet
@ -21,11 +22,11 @@ module Data.Id (
, union , union
) where ) where
import Control.Monad.State (MonadState, modify, gets) import Control.Monad.State.Strict (MonadState, modify, gets)
import Data.IntMap (IntMap, (!)) import Data.IntMap (IntMap, (!))
import qualified Data.IntMap as IntMap ( import qualified Data.IntMap as IntMap (
delete, empty, fromList, keysSet, insert, lookup, mapWithKey, maxViewWithKey delete, empty, filterWithKey, fromList, keysSet, insert, lookup
, member, minViewWithKey, size, union , mapWithKey, maxViewWithKey, member, minViewWithKey, size, union
) )
import Data.IntSet (IntSet) import Data.IntSet (IntSet)
import Prelude hiding (lookup) import Prelude hiding (lookup)
@ -74,6 +75,9 @@ union (IdMap intMap1) (IdMap intMap2) =
mapWithKey :: (Id a -> b -> c) -> IdMap a b -> IdMap a c mapWithKey :: (Id a -> b -> c) -> IdMap a b -> IdMap a c
mapWithKey f (IdMap idMap) = IdMap {intMap = IntMap.mapWithKey (f . Id) idMap} mapWithKey f (IdMap idMap) = IdMap {intMap = IntMap.mapWithKey (f . Id) idMap}
filterWithKey :: (Id a -> b -> Bool) -> IdMap a b -> IdMap a b
filterWithKey f = IdMap . IntMap.filterWithKey (f . Id) . intMap
fromList :: [(Id a, b)] -> IdMap a b fromList :: [(Id a, b)] -> IdMap a b
fromList = IdMap . IntMap.fromList . fmap (\(key, b) -> (getId key, b)) fromList = IdMap . IntMap.fromList . fmap (\(key, b) -> (getId key, b))

View file

@ -1,7 +1,6 @@
{-# LANGUAGE NamedFieldPuns #-} {-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE FunctionalDependencies #-} {-# LANGUAGE FunctionalDependencies #-}
{-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE UndecidableInstances #-} {-# LANGUAGE UndecidableInstances #-}
{-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE ScopedTypeVariables #-}
module PDF.Box ( module PDF.Box (
@ -17,8 +16,8 @@ module PDF.Box (
import Control.Monad.Except (ExceptT(..), runExceptT) import Control.Monad.Except (ExceptT(..), runExceptT)
import Control.Monad.Fail (MonadFail(..)) import Control.Monad.Fail (MonadFail(..))
import Control.Monad.State (MonadState(..))
import Control.Monad.Reader (ReaderT, runReaderT) import Control.Monad.Reader (ReaderT, runReaderT)
import Control.Monad.State (MonadState(..))
import Data.Id (Id, IdMap) import Data.Id (Id, IdMap)
import qualified Data.Id as Id (insert, lookup) import qualified Data.Id as Id (insert, lookup)
import Data.Map (Map) import Data.Map (Map)

View file

@ -1,15 +1,17 @@
{-# LANGUAGE NamedFieldPuns #-} {-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE FlexibleContexts #-}
module PDF.CMap ( module PDF.CMap (
CMap CMap
, CMappers , CMappers
, CRange(..) , CRange(..)
, cMap
, matches , matches
, parse
) where ) where
import Control.Applicative ((<|>), many) import Control.Applicative ((<|>), many)
import Control.Monad.Fail (MonadFail(..)) import Control.Monad.Except (MonadError(..))
import Control.Monad.Fail (fail)
import Control.Monad.State (modify) import Control.Monad.State (modify)
import Data.Attoparsec.ByteString.Char8 (count) import Data.Attoparsec.ByteString.Char8 (count)
import Data.ByteString (ByteString) import Data.ByteString (ByteString)
@ -18,6 +20,7 @@ import Data.ByteString.Char8 (unpack)
import Data.ByteString.Char8.Util ( import Data.ByteString.Char8.Util (
B16Int(..), b16ToBytes, b16ToInt, toBytes, utf16BEToutf8 B16Int(..), b16ToBytes, b16ToInt, toBytes, utf16BEToutf8
) )
import Data.Foldable (foldl', foldr')
import Data.Map (Map, mapWithKey, union) import Data.Map (Map, mapWithKey, union)
import qualified Data.Map as Map ( import qualified Data.Map as Map (
adjust, empty, fromList, insert, insertWith, lookup, toList adjust, empty, fromList, insert, insertWith, lookup, toList
@ -75,7 +78,7 @@ encoder :: Map Size FromUnicode -> Encoder
encoder fromUnicodes input encoder fromUnicodes input
| Text.null input = Right "" | Text.null input = Right ""
| otherwise = | otherwise =
foldl (<>) (Left "No encoding found") $ mapWithKey tryOn fromUnicodes foldr' (<>) (Left "No encoding found") $ mapWithKey tryOn fromUnicodes
where where
tryOn size fromUnicode = tryOn size fromUnicode =
let (prefix, end) = Text.splitAt size input in let (prefix, end) = Text.splitAt size input in
@ -87,8 +90,8 @@ matches :: ByteString -> CRange -> Bool
matches code (CRange {fromSequence, toSequence}) = matches code (CRange {fromSequence, toSequence}) =
fromSequence <= code && code <= toSequence fromSequence <= code && code <= toSequence
cMap :: MonadFail m => ByteString -> m Font parse :: MonadError String m => ByteString -> m Font
cMap = either fail (return . toFont . snd) . runParser parse = either throwError (return . toFont . snd) . runParser
(many (codeRanges <|> cMapRange <|> cMapChar <|> ignoredLine)) (many (codeRanges <|> cMapRange <|> cMapChar <|> ignoredLine))
emptyCMap emptyCMap
where where
@ -148,7 +151,7 @@ saveToUnicodeBySize assoc@((code, _):_) = Map.adjust insertCRange (BS.length cod
) )
saveFromUnicodeBySize :: [(Text, ByteString)] -> Map Size FromUnicode -> Map Size FromUnicode saveFromUnicodeBySize :: [(Text, ByteString)] -> Map Size FromUnicode -> Map Size FromUnicode
saveFromUnicodeBySize = flip (foldl insertFromUnicode) saveFromUnicodeBySize = flip (foldl' insertFromUnicode)
where where
insertFromUnicode :: Map Size FromUnicode -> (Text, ByteString) -> Map Size FromUnicode insertFromUnicode :: Map Size FromUnicode -> (Text, ByteString) -> Map Size FromUnicode
insertFromUnicode tmpFromUnicodeBySize (unicodeSequence, code) = insertFromUnicode tmpFromUnicodeBySize (unicodeSequence, code) =

View file

@ -9,54 +9,77 @@ module PDF.Content (
Content(..) Content(..)
, ContentUnit(..) , ContentUnit(..)
, GraphicContextUnit(..) , GraphicContextUnit(..)
, IdContentUnit
, IdGraphicContextUnit
, IdTextContext
, Instructions(..) , Instructions(..)
, TextContext , TextContext
, parse , parse
) where ) where
import Control.Applicative ((<|>)) import Control.Applicative ((<|>))
import Control.Monad.Fail (MonadFail)
import Control.Monad.Reader (asks, runReader) import Control.Monad.Reader (asks, runReader)
import Control.Monad.State (evalStateT, modify) import Control.Monad.State.Strict (runState, evalStateT, modify)
import Data.Attoparsec.ByteString.Char8 (sepBy) import Data.Attoparsec.ByteString.Char8 (sepBy)
import qualified Data.Attoparsec.ByteString.Char8 as Atto (Parser, parseOnly)
import Data.ByteString (ByteString) import Data.ByteString (ByteString)
import Data.Id (Id(..), Indexed, at, empty, register) import Data.Id (Id(..), Indexed, at, empty, register)
import PDF.Box (Box(..)) import PDF.Box (Box(..))
import PDF.Content.Operator (Instruction, operator) import PDF.Content.Operator (Instruction, operator)
import PDF.Object (blank, directObject) import PDF.Object (blank, directObject)
import PDF.Output (Output(..), line) import PDF.Output (Output(..), line)
import PDF.Parser (Parser, runParser, string) import PDF.Parser (string)
data Instructions = Instructions data Instructions = Instructions
data GraphicContextUnit = data GraphicContextUnit a =
GraphicInstruction (Id Instruction) GraphicInstruction a
| ContentUnit ContentUnit | ContentUnit (ContentUnit a)
deriving Show deriving Show
type TextContext = [Id Instruction] type TextContext a = [a]
data ContentUnit = data ContentUnit a =
GraphicContext [GraphicContextUnit] GraphicContext [GraphicContextUnit a]
| TextContext TextContext | TextContext (TextContext a)
deriving Show deriving Show
data Content = Content { data Content = Content {
contentUnits :: [ContentUnit] contentUnits :: [IdContentUnit]
, indexedInstructions :: Indexed Instruction , indexedInstructions :: Indexed Instruction
, firstError :: Maybe String
} deriving Show } deriving Show
type InstructionParser = Parser (Indexed Instruction) type TmpContentUnit = ContentUnit Instruction
type TmpGraphicContextUnit = GraphicContextUnit Instruction
type TmpTextContext = TextContext Instruction
type IdContentUnit = ContentUnit (Id Instruction)
type IdGraphicContextUnit = GraphicContextUnit (Id Instruction)
type IdTextContext = TextContext (Id Instruction)
instance Monad m => Box m Instructions Content (Indexed Instruction) where instance Monad m => Box m Instructions Content (Indexed Instruction) where
r Instructions = return . indexedInstructions r Instructions = return . indexedInstructions
w Instructions indexedInstructions someContent = w Instructions indexedInstructions someContent =
return $ someContent {indexedInstructions} return $ someContent {indexedInstructions}
parse :: MonadFail m => ByteString -> m Content parse :: ByteString -> Content
parse = parse input =
either fail (return . uncurry Content) . runParser contentUnits empty let result = Atto.parseOnly (contentUnit `sepBy` blank) input in
where let (contentUnits, indexedInstructions) = either (const ([], empty)) buildContent result in
contentUnits = contentUnit `sepBy` blank let firstError = either Just (const Nothing) result in
Content {contentUnits, indexedInstructions, firstError}
contentUnit :: InstructionParser ContentUnit buildContent :: [TmpContentUnit] -> ([IdContentUnit], Indexed Instruction)
buildContent instructionContentUnits =
runState (mapM registerContentUnit instructionContentUnits) empty
where
registerContentUnit (GraphicContext gc) =
GraphicContext <$> (mapM registerGraphicContext gc)
registerContentUnit (TextContext tc) = TextContext <$> (mapM register tc)
registerGraphicContext (GraphicInstruction gi) =
GraphicInstruction <$> (register gi)
registerGraphicContext (ContentUnit cu) =
ContentUnit <$> (registerContentUnit cu)
contentUnit :: Atto.Parser TmpContentUnit
contentUnit = contentUnit =
(GraphicContext <$> graphicContext) (GraphicContext <$> graphicContext)
<|> (TextContext <$> textContext) <|> (TextContext <$> textContext)
@ -64,18 +87,18 @@ contentUnit =
graphicContext = graphicContext =
string "q" *> blank *> graphicContextUnit `sepBy` blank <* blank <* string "Q" string "q" *> blank *> graphicContextUnit `sepBy` blank <* blank <* string "Q"
graphicContextUnit :: InstructionParser GraphicContextUnit graphicContextUnit :: Atto.Parser TmpGraphicContextUnit
graphicContextUnit = graphicContextUnit =
(GraphicInstruction <$> instruction) (GraphicInstruction <$> instruction)
<|> (ContentUnit <$> contentUnit) <|> (ContentUnit <$> contentUnit)
instruction :: InstructionParser (Id Instruction) instruction :: Atto.Parser Instruction
instruction = evalStateT stackParser [] >>= register instruction = evalStateT stackParser []
where where
stackParser = ((directObject <* blank) >>= push) <|> operator stackParser = ((directObject <* blank) >>= push) <|> operator
push arg = modify (arg:) *> stackParser push arg = modify (arg:) *> stackParser
textContext :: InstructionParser TextContext textContext :: Atto.Parser TmpTextContext
textContext = textContext =
string "BT" *> blank *> instruction `sepBy` blank <* blank <* string "ET" string "BT" *> blank *> instruction `sepBy` blank <* blank <* string "ET"

View file

@ -1,24 +1,22 @@
{-# LANGUAGE NamedFieldPuns #-} {-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE TypeSynonymInstances #-}
{-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE MultiParamTypeClasses #-} {-# LANGUAGE MultiParamTypeClasses #-}
module PDF.Content.Text ( module PDF.Content.Text (
Chunks(..) Chunks(..)
, chunk , chunk
, format , format
, renderText
) where ) where
import Control.Monad (foldM) import Control.Monad (foldM)
import Control.Monad.Fail (MonadFail(..)) import Control.Monad.Fail (MonadFail(..))
import Control.Monad.Reader (MonadReader(..), ReaderT, asks, runReaderT) import Control.Monad.Reader (ReaderT, asks, runReaderT)
import Control.Monad.State ( import Control.Monad.State (
MonadState(..), StateT, evalStateT, gets, modify, runStateT MonadState(..), StateT, evalStateT, gets, modify, runStateT
) )
import Control.Monad.Trans (lift) import Control.Monad.Trans (lift)
import qualified Data.ByteString.Char8 as BS (concatMap, singleton)
import Data.Id (Id(..), Indexed, at, empty, singleton) import Data.Id (Id(..), Indexed, at, empty, singleton)
import qualified Data.Id as Id (delete, lookup, register) import qualified Data.Id as Id (delete, lookup, register)
import Data.Map ((!)) import Data.Map ((!))
@ -26,27 +24,20 @@ import Data.Text (Text, breakOn)
import qualified Data.Text as Text (drop) import qualified Data.Text as Text (drop)
import PDF.Box (Box(..)) import PDF.Box (Box(..))
import PDF.Content ( import PDF.Content (
Content, ContentUnit(..), GraphicContextUnit(..), contentUnits Content(..), ContentUnit(..), GraphicContextUnit(..), IdContentUnit
, IdGraphicContextUnit
) )
import qualified PDF.Content as Content (Content(..))
import PDF.Content.Operator (Instruction, Operator(..)) import PDF.Content.Operator (Instruction, Operator(..))
import PDF.Content.Operator.Text (Operator(..)) import PDF.Content.Operator.Text (Operator(..))
import PDF.Font (Font(..), FontSet, emptyFont) import PDF.Font (Font(..), FontSet, emptyFont)
import PDF.Object (DirectObject(..), StringObject(..), toByteString) import PDF.Object (DirectObject(..), StringObject(..), toByteString)
import Prelude hiding (fail) import Prelude hiding (fail)
data ROContext = ROContext { type TmpFont = StateT Font
indexedInstructions :: Indexed Instruction type Renderer m = ReaderT (Indexed Instruction) (ReaderT FontSet m)
, fontSet :: FontSet
}
type TextContent m = ReaderT ROContext m
type FontContext m = StateT Font (TextContent m)
type Updater m = StateT (Indexed Instruction) (ReaderT (Indexed Text) (ReaderT FontSet m)) type Updater m = StateT (Indexed Instruction) (ReaderT (Indexed Text) (ReaderT FontSet m))
type TextUpdater m = StateT Font (Updater m)
decodeString :: MonadFail m => StringObject -> FontContext m Text decodeString :: MonadFail m => StringObject -> TmpFont (Renderer m) Text
decodeString input = do decodeString input = do
Font {decode} <- get Font {decode} <- get
either fail return . decode $ toByteString input either fail return . decode $ toByteString input
@ -57,56 +48,53 @@ chunk :: Int -> Id Text
chunk = Id chunk = Id
instance MonadFail m => Box (ReaderT FontSet m) Chunks Content (Indexed Text) where instance MonadFail m => Box (ReaderT FontSet m) Chunks Content (Indexed Text) where
r Chunks content = do r Chunks content =
fontSet <- ask runReaderT (mconcat <$> renderer) $ indexedInstructions content
renderText fontSet content where
renderer = mapM renderContentUnit (contentUnits content)
w Chunks indexedText (Content.Content {contentUnits, Content.indexedInstructions}) = w Chunks indexedText content = do
uncurry Content.Content <$> runReaderT ( (contentUnits, indexedInstructions) <- runReaderT readerUpdate indexedText
runStateT (mapM updateContentUnit contentUnits) indexedInstructions return $ content {contentUnits, indexedInstructions}
) indexedText where
stateUpdate = mapM updateContentUnit (contentUnits content)
readerUpdate = runStateT stateUpdate (indexedInstructions content)
renderText :: MonadFail m => FontSet -> Content -> m (Indexed Text) renderContentUnit :: MonadFail m => IdContentUnit -> Renderer m (Indexed Text)
renderText fontSet (Content.Content {contentUnits, Content.indexedInstructions}) =
runReaderT (mconcat <$> mapM renderContentUnit contentUnits) roContext
where
roContext = ROContext {indexedInstructions, fontSet}
renderContentUnit :: MonadFail m => ContentUnit -> TextContent m (Indexed Text)
renderContentUnit (GraphicContext graphicContextUnits) = renderContentUnit (GraphicContext graphicContextUnits) =
mconcat <$> mapM renderGraphicContextUnit graphicContextUnits mconcat <$> mapM renderGraphicContextUnit graphicContextUnits
renderContentUnit (TextContext instructionIds) = renderContentUnit (TextContext instructionIds) =
evalStateT (mconcat <$> mapM renderInstructionId instructionIds) emptyFont evalStateT (mconcat <$> mapM renderInstructionId instructionIds) emptyFont
updateContentUnit :: MonadFail m => ContentUnit -> Updater m ContentUnit updateContentUnit :: MonadFail m => IdContentUnit -> Updater m IdContentUnit
updateContentUnit (GraphicContext graphicContextUnits) = GraphicContext <$> updateContentUnit (GraphicContext graphicContextUnits) = GraphicContext <$>
mapM updateGraphicContextUnit graphicContextUnits mapM updateGraphicContextUnit graphicContextUnits
updateContentUnit (TextContext instructionIds) = TextContext . concat <$> updateContentUnit (TextContext instructionIds) = TextContext . concat <$>
evalStateT (mapM updateInstructionId instructionIds) emptyFont evalStateT (mapM updateInstructionId instructionIds) emptyFont
renderGraphicContextUnit :: MonadFail m => GraphicContextUnit -> TextContent m (Indexed Text) renderGraphicContextUnit :: MonadFail m => IdGraphicContextUnit -> Renderer m (Indexed Text)
renderGraphicContextUnit (GraphicInstruction _) = return empty renderGraphicContextUnit (GraphicInstruction _) = return empty
renderGraphicContextUnit (ContentUnit contentUnit) = renderGraphicContextUnit (ContentUnit contentUnit) =
renderContentUnit contentUnit renderContentUnit contentUnit
updateGraphicContextUnit :: MonadFail m => GraphicContextUnit -> Updater m GraphicContextUnit updateGraphicContextUnit :: MonadFail m => IdGraphicContextUnit -> Updater m IdGraphicContextUnit
updateGraphicContextUnit gI@(GraphicInstruction _) = return gI updateGraphicContextUnit gI@(GraphicInstruction _) = return gI
updateGraphicContextUnit (ContentUnit contentUnit) = updateGraphicContextUnit (ContentUnit contentUnit) =
ContentUnit <$> updateContentUnit contentUnit ContentUnit <$> updateContentUnit contentUnit
renderInstructionId :: MonadFail m => Id Instruction -> FontContext m (Indexed Text) renderInstructionId :: MonadFail m => Id Instruction -> TmpFont (Renderer m) (Indexed Text)
renderInstructionId instructionId@(Id n) = toMap <$> renderInstructionId instructionId@(Id n) = toMap <$>
(asks ((`at` instructionId) . indexedInstructions) >>= renderInstruction) (asks ((`at` instructionId)) >>= renderInstruction)
where where
toMap = maybe empty (singleton (Id n)) toMap = maybe empty (singleton (Id n))
updateInstructionId :: MonadFail m => Id Instruction -> TextUpdater m [Id Instruction] updateInstructionId :: MonadFail m => Id Instruction -> TmpFont (Updater m) [Id Instruction]
updateInstructionId instructionId = updateInstructionId instructionId =
lift (gets (`at` instructionId)) >>= updateInstruction instructionId lift (gets (`at` instructionId)) >>= updateInstruction instructionId
renderInstruction :: MonadFail m => Instruction -> FontContext m (Maybe Text) renderInstruction :: MonadFail m => Instruction -> TmpFont (Renderer m) (Maybe Text)
renderInstruction (Text Tf, [NameObject fontName, _]) = renderInstruction (Text Tf, [NameObject fontName, _]) =
asks ((! fontName) . fontSet) >>= put >> return Nothing lift (lift $ asks (! fontName)) >>= put >> return Nothing
renderInstruction (Text Tstar, []) = return $ Just "\n" renderInstruction (Text Tstar, []) = return $ Just "\n"
@ -128,7 +116,7 @@ renderInstruction (Text DQuote, [_, _, StringObject outputString]) =
renderInstruction _ = return Nothing renderInstruction _ = return Nothing
updateInstruction :: MonadFail m => Id Instruction -> Instruction -> TextUpdater m [Id Instruction] updateInstruction :: MonadFail m => Id Instruction -> Instruction -> TmpFont (Updater m) [Id Instruction]
updateInstruction instructionId (Text Tf, [NameObject fontName, _]) = updateInstruction instructionId (Text Tf, [NameObject fontName, _]) =
(lift . lift . lift $ asks (!fontName)) >>= put >> return [instructionId] (lift . lift . lift $ asks (!fontName)) >>= put >> return [instructionId]
@ -155,5 +143,9 @@ format input = do
(line, left) -> (:) <$> tj line <*> format left (line, left) -> (:) <$> tj line <*> format left
where where
tj t = do tj t = do
literal <- either fail (return . Literal) =<< gets (($t) . encode) encoded <- either fail return =<< gets (($t) . encode)
return (Text Tj, [StringObject literal]) return (Text Tj, [StringObject . Literal $ BS.concatMap escape encoded])
escape '\\' = "\\\\"
escape '(' = "\\("
escape ')' = "\\)"
escape c = BS.singleton c

View file

@ -1,12 +1,13 @@
{-# LANGUAGE FlexibleContexts #-}
module PDF.Encoding ( module PDF.Encoding (
encoding encoding
) where ) where
import Control.Monad.Fail (MonadFail(..)) import Control.Monad.Except (MonadError(..))
import PDF.Encoding.MacRoman (macRomanEncoding) import PDF.Encoding.MacRoman (macRomanEncoding)
import PDF.Font (Font) import PDF.Font (Font)
import Prelude hiding (fail) import Prelude hiding (fail)
encoding :: MonadFail m => String -> m Font encoding :: MonadError String m => String -> m Font
encoding "MacRomanEncoding" = return macRomanEncoding encoding "MacRomanEncoding" = return macRomanEncoding
encoding s = fail $ "Unknown encoding " ++ s encoding s = throwError $ "Unknown encoding " ++ s

View file

@ -3,6 +3,7 @@ module PDF.Encoding.MacRoman (
) where ) where
import qualified Data.ByteString.Char8 as BS (pack, unpack) import qualified Data.ByteString.Char8 as BS (pack, unpack)
import Data.Foldable (foldl')
import Data.Map (Map) import Data.Map (Map)
import qualified Data.Map as Map (empty, insert, lookup) import qualified Data.Map as Map (empty, insert, lookup)
import qualified Data.Text as Text (pack, unpack) import qualified Data.Text as Text (pack, unpack)
@ -24,7 +25,7 @@ macRomanEncoding = Font {
| otherwise -> Left ("Character '" ++ k :"' unavailable in MacRoman") | otherwise -> Left ("Character '" ++ k :"' unavailable in MacRoman")
mappers :: (Mapper, Mapper) mappers :: (Mapper, Mapper)
mappers = foldl generateMapers (Map.empty, Map.empty) [ mappers = foldl' generateMapers (Map.empty, Map.empty) [
('\x80', '\x00C4') -- LATIN CAPITAL LETTER A WITH DIAERESIS ('\x80', '\x00C4') -- LATIN CAPITAL LETTER A WITH DIAERESIS
, ('\x81', '\x00C5') -- LATIN CAPITAL LETTER A WITH RING ABOVE , ('\x81', '\x00C5') -- LATIN CAPITAL LETTER A WITH RING ABOVE
, ('\x82', '\x00C7') -- LATIN CAPITAL LETTER C WITH CEDILLA , ('\x82', '\x00C7') -- LATIN CAPITAL LETTER C WITH CEDILLA

View file

@ -1,21 +1,26 @@
{-# LANGUAGE NamedFieldPuns #-} {-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE MultiParamTypeClasses #-} {-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE FlexibleContexts #-}
module PDF.Layer ( module PDF.Layer (
Layer(..) Layer(..)
, LayerReader
, Objects(..) , Objects(..)
, unify , unify
) where ) where
import Control.Monad.Except (MonadError(..))
import Control.Monad.Reader (ReaderT)
import Data.Foldable (foldl')
import Data.Id (Id(..), Indexed, keysSet, mapWithKey, member) import Data.Id (Id(..), Indexed, keysSet, mapWithKey, member)
import qualified Data.Id as Id (empty, union) import qualified Data.Id as Id (empty, insert, lookup, union)
import Data.Map (Map, (!)) import Data.Map (Map, (!))
import qualified Data.Map as Map (empty, union) import qualified Data.Map as Map (empty, lookup, union)
import qualified Data.IntSet as IntSet (delete, toList) import qualified Data.IntSet as IntSet (delete, toList)
import PDF.Box (Box(..)) import PDF.Box (Box(..))
import PDF.Object ( import PDF.Object (
IndirectObjCoordinates(..), Object, Occurrence(..) IndirectObjCoordinates(..), Object, Occurrence(..), Structure(..)
, Structure(..), XRefEntry(..), XRefSection, eofMarker, outputBody , XRefEntry(..), XRefSection, eofMarker, outputBody
) )
import qualified PDF.Output as Output (line) import qualified PDF.Output as Output (line)
import PDF.Output ( import PDF.Output (
@ -26,24 +31,28 @@ import Text.Printf (printf)
data Layer = Layer { data Layer = Layer {
occurrences :: [Occurrence] occurrences :: [Occurrence]
, objects :: (Indexed Object) , objects :: Indexed Object
, docStructure :: Structure , docStructure :: Structure
} deriving Show } deriving Show
type LayerReader m = ReaderT Layer m
updateXrefs :: XRefSection -> Map Resource Offset -> (XRefSection, Offset) updateXRefs :: XRefSection -> Map Resource Offset -> (XRefSection, Offset)
updateXrefs xRef offsets = (mapWithKey updateEntry xRef, offsets ! StartXRef) updateXRefs xRef offsets = (mapWithKey updateEntry xRef, offsets ! StartXRef)
where where
updateEntry objectId e@(InUse {}) = updateEntry objectId e@(InUse {offset}) =
e {offset = offsets ! (ObjectId $ getId objectId)} case Map.lookup (ObjectId $ getId objectId) offsets of
Nothing -> Free {nextFree = Id $ getOffset offset, generation = 65535}
Just newOffset -> e {offset = newOffset}
updateEntry _ e = e updateEntry _ e = e
instance Output Layer where instance Output Layer where
output (Layer {occurrences, objects, docStructure}) = output (Layer {occurrences, objects, docStructure}) = do
fmap (updateXrefs xRef) <$> getOffsets (outputBody (occurrences, objects)) (body, savedOffsets) <- getOffsets (outputBody (occurrences, objects))
>>= \(body, (xref, startXRef)) -> mconcat [ let (newXRef, startXRef) = updateXRefs xRef savedOffsets
body mconcat [
return body
, Output.line "xref" , Output.line "xref"
, output xref , output newXRef
, Output.line "trailer" , Output.line "trailer"
, output trailer, newLine , output trailer, newLine
, Output.line "startxref" , Output.line "startxref"
@ -76,6 +85,14 @@ instance Monad m => Box m Objects Layer (Indexed Object) where
Indirect (IndirectObjCoordinates {objectId, versionNumber = 0}) Indirect (IndirectObjCoordinates {objectId, versionNumber = 0})
newOccurrences = (makeOccurrence . Id) <$> IntSet.toList newObjectIds newOccurrences = (makeOccurrence . Id) <$> IntSet.toList newObjectIds
instance MonadError String m => Box m (Id Object) Layer Object where
r objectId =
maybe (throwError "Unknown key") return . Id.lookup objectId . objects
w objectId a layer@(Layer {objects})
| member objectId objects = return $
layer {objects = Id.insert objectId a objects}
| otherwise = throwError "Unknown key"
emptyLayer :: Layer emptyLayer :: Layer
emptyLayer = Layer { emptyLayer = Layer {
docStructure = Structure {xRef = Id.empty, trailer = Map.empty} docStructure = Structure {xRef = Id.empty, trailer = Map.empty}
@ -84,7 +101,7 @@ emptyLayer = Layer {
} }
unify :: [Layer] -> Layer unify :: [Layer] -> Layer
unify = foldl complete emptyLayer unify = foldl' complete emptyLayer
where where
complete tmpLayer older = complete tmpLayer older =
let mergedObjects = Id.union (objects tmpLayer) (objects older) in let mergedObjects = Id.union (objects tmpLayer) (objects older) in

View file

@ -34,6 +34,7 @@ module PDF.Object (
) where ) where
import Control.Applicative ((<|>), many) import Control.Applicative ((<|>), many)
import Control.Monad.Reader (asks)
import Data.Attoparsec.ByteString.Char8 (choice, count, option, sepBy) import Data.Attoparsec.ByteString.Char8 (choice, count, option, sepBy)
import Data.ByteString (ByteString) import Data.ByteString (ByteString)
import qualified Data.ByteString as BS (concat) import qualified Data.ByteString as BS (concat)
@ -48,11 +49,11 @@ import qualified Data.Id as Id (
import Data.Map (Map) import Data.Map (Map)
import qualified Data.Map as Map (fromList, toList) import qualified Data.Map as Map (fromList, toList)
import qualified Data.Set as Set (fromList, member) import qualified Data.Set as Set (fromList, member)
import qualified PDF.EOL as EOL (charset, parser) import qualified PDF.EOL as EOL (Style(..), charset, parser)
import qualified PDF.Output as Output (line, string) import qualified PDF.Output as Output (line, string)
import PDF.Output ( import PDF.Output (
OBuilder, Offset(..), Output(..), Resource(..) OBuilder, OContext(..), Offset(..), Output(..), Resource(..), byteString
, byteString, getOffset, join, newLine, saveOffset , getOffset, join, newLine, saveOffset
) )
import PDF.Parser (MonadParser(..), Parser, (<?>), octDigit, oneOf) import PDF.Parser (MonadParser(..), Parser, (<?>), octDigit, oneOf)
import Text.Printf (printf) import Text.Printf (printf)
@ -190,8 +191,8 @@ nullObject = string "null" *> return () <?> "null object"
-- Reference -- Reference
-- --
data IndirectObjCoordinates = IndirectObjCoordinates { data IndirectObjCoordinates = IndirectObjCoordinates {
objectId :: (Id Object) objectId :: {-# UNPACK #-} !(Id Object)
, versionNumber :: Int , versionNumber :: {-# UNPACK #-} !Int
} deriving Show } deriving Show
reference :: MonadParser m => m IndirectObjCoordinates reference :: MonadParser m => m IndirectObjCoordinates
@ -202,14 +203,14 @@ reference = IndirectObjCoordinates
-- DirectObject -- DirectObject
-- --
data DirectObject = data DirectObject =
Boolean Bool Boolean !Bool
| NumberObject Number | NumberObject !Number
| StringObject StringObject | StringObject !StringObject
| NameObject Name | NameObject !Name
| Array [DirectObject] | Array ![DirectObject]
| Dictionary Dictionary | Dictionary !Dictionary
| Null | Null
| Reference IndirectObjCoordinates | Reference !IndirectObjCoordinates
deriving Show deriving Show
instance Output DirectObject where instance Output DirectObject where
@ -294,10 +295,15 @@ data XRefEntry = InUse {
} deriving Show } deriving Show
instance Output XRefEntry where instance Output XRefEntry where
output (InUse {offset, generation}) = output xRefEntry = Output.string (build xRefEntry) `mappend` endXRefEntryLine
Output.line (printf "%010d %05d n " (getOffset offset) generation) where
output (Free {nextFree, generation}) = build (InUse {offset, generation}) =
Output.line (printf "%010d %05d f " (getId nextFree) generation) printf "%010d %05d n" (getOffset offset) generation
build (Free {nextFree, generation}) =
printf "%010d %05d f" (getId nextFree) generation
endXRefEntryLine = OContext (asks padEOLToTwoBytes) >>= Output.line
padEOLToTwoBytes EOL.CRLF = ("" :: String)
padEOLToTwoBytes _ = " "
entry :: Parser u XRefEntry entry :: Parser u XRefEntry
entry = do entry = do

View file

@ -3,98 +3,162 @@
{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE MultiParamTypeClasses #-} {-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE DeriveFunctor #-}
module PDF.Object.Navigation ( module PDF.Object.Navigation (
Clear(..) Nav(..)
, PDFContent , PPath(..)
, Raw(..) , ROLayer
, RWLayer
, StreamContent(..)
, (./) , (./)
, (//) , (//)
, (>./) , (>./)
, (>//) , (>//)
, castObject , castObject
, catalog
, getDictionary , getDictionary
, getKey , getKey
, objectById , objectById
, catalog , save
) where ) where
import Codec.Compression.Zlib (compress, decompress) import Codec.Compression.Zlib (compress, decompress)
import Control.Monad.Except (MonadError(..))
import Control.Monad.Reader (MonadReader(..)) import Control.Monad.Reader (MonadReader(..))
import Control.Monad.Fail (MonadFail(..)) import Control.Monad.State (MonadState)
import Data.ByteString (ByteString) import Data.ByteString (ByteString)
import qualified Data.ByteString as BS (length)
import qualified Data.ByteString.Lazy as Lazy (ByteString, fromStrict, toStrict) import qualified Data.ByteString.Lazy as Lazy (ByteString, fromStrict, toStrict)
import Data.Id (Id, at) import Data.Id (Id)
import qualified Data.Map as Map (lookup) import qualified Data.Id as Id (at)
import PDF.Box (Box(..)) import qualified Data.Map as Map (adjust, insert, lookup)
import PDF.Box (Box(..), at, edit{-, runRO-})
import PDF.Layer (Layer(..)) import PDF.Layer (Layer(..))
import PDF.Object ( import PDF.Object (
Dictionary, DirectObject(..), IndirectObjCoordinates(..) Dictionary, DirectObject(..), IndirectObjCoordinates(..)
, Name(..), Object(..), Structure(..) , Name(..), Number(..), Object(..), Structure(..)
) )
import Prelude hiding (fail) import Prelude hiding (fail)
import Text.Printf (printf) import Text.Printf (printf)
type PDFContent m = (MonadReader Layer m, MonadFail m) newtype PPath = PPath [Component]
data DPath = DPath {
root :: Id Object
, offset :: [Component]
} deriving Show
push :: Component -> DPath -> DPath
push component dPath = dPath {offset = (offset dPath) ++ [component]}
data Nav a = Nav {
dPath :: DPath
, value :: a
} deriving (Functor)
instance Show a => Show (Nav a) where
show (Nav {dPath, value}) = "Nav {dPath = " ++ show dPath ++ ", value = " ++ show value ++ "}"
type ROLayer m = (MonadReader Layer m, MonadError String m)
type RWLayer m = (MonadState Layer m, MonadError String m)
type Component = String type Component = String
getDictionary :: PDFContent m => Object -> m Dictionary getDictionary :: ROLayer m => Nav Object -> m (Nav Dictionary)
getDictionary (Direct (Dictionary aDict)) = return aDict getDictionary (Nav {dPath, value}) =
getDictionary (Direct (Reference ref)) = case value of
objectById (objectId ref) >>= getDictionary (Direct (Dictionary aDict)) -> return $ Nav {dPath, value = aDict}
getDictionary (Stream {header}) = return header (Direct (Reference ref)) -> objectById (objectId ref) >>= getDictionary
getDictionary obj = expected "dictionary : " obj (Stream {header}) -> return $ Nav {dPath, value = header}
obj -> expected "dictionary : " obj
expected :: (MonadFail m, Show a) => String -> a -> m b expected :: (MonadError String m, Show a) => String -> a -> m b
expected name = fail . printf "Not a %s: %s" name . show expected name = throwError . printf "Not a %s: %s" name . show
getKey :: PDFContent m => String -> Object -> m DirectObject getKey :: ROLayer m => String -> Nav Object -> m (Nav DirectObject)
getKey key object = getDictionary object >>= catchMaybe . Map.lookup (Name key) getKey key navObject = getDictionary navObject >>= f
where where
errorMessage = errorMessage =
printf "Key %s not found in object %s" key (show object) printf "Key %s not found in object %s" key (show navObject)
catchMaybe = maybe (fail errorMessage) return f (Nav {dPath, value}) =
case Map.lookup (Name key) value of
Nothing -> throwError errorMessage
Just dObj -> return $ Nav {dPath = push key dPath, value = dObj}
objectById :: PDFContent m => (Id Object) -> m Object objectById :: ROLayer m => (Id Object) -> m (Nav Object)
objectById objectId = do objectById objectId = do
layer <- ask layer <- ask
return (objects layer `at` objectId) return $ Nav {
dPath = DPath {root = objectId, offset = []}
, value = objects layer `Id.at` objectId
}
(./) :: PDFContent m => m Object -> Component -> m Object castObject :: ROLayer m => Nav DirectObject -> m (Nav Object)
(./) object key = (object >>= getKey key >>= castObject) castObject (Nav {value = !(Reference (IndirectObjCoordinates {objectId}))}) =
castObject :: PDFContent m => DirectObject -> m Object
castObject (Reference (IndirectObjCoordinates {objectId})) =
objectById objectId objectById objectId
castObject directObject = return $ Direct directObject castObject (Nav {dPath, value}) = return $ Nav {dPath, value = Direct value}
(//) :: PDFContent m => m Object -> [Component] -> m Object (./) :: ROLayer m => m (Nav Object) -> Component -> m (Nav Object)
(//) object [] = object (./) navObject key = (navObject >>= getKey key >>= castObject)
(//) object (key:keys) = object ./ key // keys
(>./) :: PDFContent m => Object -> Component -> m Object (//) :: ROLayer m => m (Nav Object) -> PPath -> m (Nav Object)
(>./) object = (return object ./) (//) navObject (PPath []) = navObject
(//) navObject (PPath (key:keys)) = navObject ./ key // (PPath keys)
(>//) :: PDFContent m => Object -> [Component] -> m Object (>./) :: ROLayer m => Nav Object -> Component -> m (Nav Object)
(>//) object = (return object //) (>./) navObject = (return navObject ./)
catalog :: PDFContent m => m Object (>//) :: ROLayer m => Nav Object -> PPath -> m (Nav Object)
catalog = Direct . Dictionary . trailer . docStructure <$> ask (>//) navObject = (return navObject //)
data Clear = Clear catalog :: ROLayer m => m (Nav Object)
data Raw = Raw catalog = do
value <- Direct . Dictionary . trailer . docStructure <$> ask
return $ Nav {dPath = undefined, value}
setAt :: [Component] -> DirectObject -> Dictionary -> Dictionary
setAt [] _ dict = dict
setAt [component] directObject dict =
Map.insert (Name component) directObject dict
setAt (component:components) directObject dict =
Map.adjust setDirObj (Name component) dict
where
setDirObj (Dictionary subDict) =
Dictionary $ setAt components directObject subDict
setDirObj x = x
save :: RWLayer m => Nav Object -> m ()
save nav@(Nav {dPath, value = Direct dObj}) =
edit .at (root dPath) $ return . setObj
where
setObj obj@(Stream {header}) =
obj {header = setAt (offset dPath) dObj header}
setObj (Direct (Dictionary dict)) =
Direct . Dictionary $ setAt (offset dPath) dObj dict
setObj _ = value nav
save (Nav {dPath = DPath {root, offset = []}, value}) = edit $ w root value
save _ = throwError "Streams can't be properties of PDF objects"
data StreamContent = Clear | Raw
onLazy :: (Lazy.ByteString -> Lazy.ByteString) -> ByteString -> ByteString onLazy :: (Lazy.ByteString -> Lazy.ByteString) -> ByteString -> ByteString
onLazy f = Lazy.toStrict . f . Lazy.fromStrict onLazy f = Lazy.toStrict . f . Lazy.fromStrict
contains :: String -> DirectObject -> Bool contains :: String -> DirectObject -> Bool
contains needle (NameObject (Name n)) = needle == n contains needle !(NameObject (Name n)) = needle == n
contains needle (Array directObjects) = oneOf directObjects (contains needle) contains needle !(Array directObjects) = oneOf directObjects (contains needle)
where where
oneOf [] _ = False oneOf [] _ = False
oneOf (x:xs) p = p x || oneOf xs p oneOf (x:xs) p = p x || oneOf xs p
contains _ _ = False contains _ _ = False
instance MonadFail m => Box m Clear Object ByteString where instance MonadError String m => Box m StreamContent (Nav Object) ByteString where
r sc = r sc . value
w sc newStreamContent nav = setValue <$> w sc newStreamContent (value nav)
where
setValue value = nav {value}
instance MonadError String m => Box m StreamContent Object ByteString where
r Raw (Stream {streamContent}) = return streamContent
r Clear (Stream {header, streamContent}) = return $ r Clear (Stream {header, streamContent}) = return $
case Map.lookup (Name "Filter") header of case Map.lookup (Name "Filter") header of
Just directObject Just directObject
@ -102,17 +166,15 @@ instance MonadFail m => Box m Clear Object ByteString where
_ -> streamContent _ -> streamContent
r _ obj = expected "stream" obj r _ obj = expected "stream" obj
w Clear streamContent obj@(Stream {header}) = return $
case Map.lookup (Name "Filter") header of
Just directObject
| contains "FlateDecode" directObject ->
obj {streamContent = onLazy compress streamContent}
_ -> obj {streamContent}
w _ _ obj = expected "stream" obj
instance MonadFail m => Box m Raw Object ByteString where
r Raw (Stream {streamContent}) = return streamContent
r _ obj = expected "stream" obj
w Raw streamContent obj@(Stream {}) = return $ obj {streamContent} w Raw streamContent obj@(Stream {}) = return $ obj {streamContent}
w Clear newStreamContent (Stream {header}) =
let streamContent = getStreamContent (Map.lookup (Name "Filter") header) in
return $ Stream {header = fixLength streamContent, streamContent}
where
getStreamContent (Just directObject)
| contains "FlateDecode" directObject = onLazy compress newStreamContent
getStreamContent _ = newStreamContent
fixLength sc =
let newLength = NumberObject . Number . fromIntegral $ BS.length sc in
Map.insert (Name "Length") newLength header
w _ _ obj = expected "stream" obj w _ _ obj = expected "stream" obj

View file

@ -55,9 +55,8 @@ saveOffset resource = OContext $
lift :: (a -> Builder) -> a -> OBuilder lift :: (a -> Builder) -> a -> OBuilder
lift f a = return (f a) lift f a = return (f a)
getOffsets :: OBuilder -> OContext (OBuilder, Map Resource Offset) getOffsets :: OBuilder -> OContext (Builder, Map Resource Offset)
getOffsets (OContext builder) = getOffsets (OContext builder) = OContext $ listen builder
OContext (listen builder >>= \(a, w) -> return (return a, w))
append :: OBuilder -> OBuilder -> OBuilder append :: OBuilder -> OBuilder -> OBuilder
append (OContext a) (OContext b) = OContext (mappend <$> a <*> b) append (OContext a) (OContext b) = OContext (mappend <$> a <*> b)

View file

@ -13,63 +13,62 @@ module PDF.Pages (
, withResources , withResources
) where ) where
import Control.Applicative (Alternative, (<|>)) import Control.Applicative (Alternative)
import Control.Applicative ((<|>))
import Control.Monad (foldM) import Control.Monad (foldM)
import Control.Monad.Fail (MonadFail(..)) import Control.Monad.Except (MonadError(..))
import Control.Monad.Reader (ReaderT, runReaderT) import Control.Monad.Reader (ReaderT, runReaderT)
import Control.Monad.State (StateT(..), evalStateT, execStateT, gets, modify) import Control.Monad.State (StateT(..), evalStateT, execStateT, gets, modify)
import Control.Monad.Trans (lift) import Control.Monad.Trans (lift)
import Data.ByteString (ByteString)
import Data.ByteString.Lazy (toStrict) import Data.ByteString.Lazy (toStrict)
import Data.Id (Id, IdMap) import Data.Id (Id(..), IdMap)
import qualified Data.Id as Id (empty, insert, lookup) import qualified Data.Id as Id (empty, insert, lookup)
import Data.Map (Map) import Data.Map (Map)
import qualified Data.Map as Map (empty, fromList, insert, toList) import qualified Data.Map as Map (empty, elems, fromList, insert, toList)
import Data.Maybe (listToMaybe)
import Data.OrderedMap (OrderedMap, build, mapi) import Data.OrderedMap (OrderedMap, build, mapi)
import PDF.Box (Box(..), at, edit) import PDF.Box (Box(..), at, edit, runRO)
import PDF.CMap (cMap) import qualified PDF.CMap as CMap (parse)
import PDF.Content (Content(..)) import PDF.Content (Content(..))
import qualified PDF.Content as Content (parse) import qualified PDF.Content as Content (parse)
import PDF.Encoding (encoding) import PDF.Encoding (encoding)
import PDF.EOL (Style(..)) import PDF.EOL (Style(..))
import PDF.Font (Font, FontSet) import PDF.Font (Font, FontSet)
import PDF.Layer (Layer(..), Objects(..)) import PDF.Layer (Layer(..), LayerReader)
import PDF.Object ( import PDF.Object (
Dictionary, DirectObject(..), IndirectObjCoordinates(..) Dictionary, DirectObject(..), IndirectObjCoordinates(..)
, Name(..), Object(..) , Name(..), Number(..), Object(..)
,) ,)
import PDF.Object.Navigation ( import PDF.Object.Navigation (
Clear(..), (//), (>./), (>//), getDictionary Nav(..), PPath(..), ROLayer, RWLayer, StreamContent(..), (//), (>./)
, getKey, objectById, origin , (>//), catalog, getDictionary, getKey, objectById, save
) )
import PDF.Output (render) import PDF.Output (render)
import Prelude hiding (fail)
import Text.Printf (printf) import Text.Printf (printf)
type Except m = (Alternative m, MonadFail m)
type InLayer m = ReaderT Layer m
type CachedFonts = IdMap Object Font type CachedFonts = IdMap Object Font
type FontCache m = StateT CachedFonts (InLayer m) type FontCache m = StateT CachedFonts m
data Page = Page { data Page = Page {
contents :: OrderedMap (Id Object) Content byteContents :: OrderedMap (Id Object) ByteString
, resources :: Dictionary , resources :: Dictionary
, source :: (Id Object) , source :: (Id Object)
} }
loadContents :: Except m => DirectObject -> InLayer m (OrderedMap (Id Object) Content) loadByteContents :: ROLayer m => DirectObject -> m (OrderedMap (Id Object) ByteString)
loadContents directObject = loadByteContents directObject = do
sequenceA . build loadContent $ objectIds directObject objs <- sequence . build objectById $ objectIds directObject
mapM (r Clear) objs
where where
loadContent :: Except m => (Id Object) -> InLayer m Content
loadContent objectId = objectById objectId >>= r Clear >>= Content.parse
objectIds (Array l) = l >>= getReference objectIds (Array l) = l >>= getReference
objectIds dirObj = getReference dirObj objectIds dirObj = getReference dirObj
getFontDictionary :: Except m => Object -> InLayer m Dictionary getFontDictionary :: (Alternative m, ROLayer m) => Nav Object -> m Dictionary
getFontDictionary pageObj = getFontDictionary pageObj =
(pageObj >// ["Resources", "Font"] >>= getDictionary) <|> return Map.empty (pageObj >// PPath ["Resources", "Font"] >>= fmap value . getDictionary)
<|> return Map.empty
cache :: Except m => ((Id Object) -> FontCache m Font) -> (Id Object) -> FontCache m Font cache :: ROLayer m => (Id Object -> FontCache m Font) -> Id Object -> FontCache m Font
cache loader objectId = cache loader objectId =
gets (Id.lookup objectId) >>= maybe load return gets (Id.lookup objectId) >>= maybe load return
where where
@ -78,81 +77,105 @@ cache loader objectId =
modify $ Id.insert objectId value modify $ Id.insert objectId value
return value return value
loadFont :: Except m => (Id Object) -> FontCache m Font loadFont :: (Alternative m, ROLayer m) => Id Object -> FontCache m Font
loadFont objectId = lift $ objectById objectId >>= tryMappings loadFont objectId = lift $ objectById objectId >>= tryMappings
where where
tryMappings object = tryMappings object =
(object >./ "ToUnicode" >>= r Clear >>= cMap) (object >./ "ToUnicode" >>= r Clear >>= CMap.parse)
<|> (object >./ "Encoding" >>= loadEncoding) <|> (object >./ "Encoding" >>= loadEncoding . value)
<|> (fail $ unknownFormat (show objectId) (show object)) <|> (throwError $ unknownFormat (show objectId) (show object))
unknownFormat = printf "Unknown font format for object #%s : %s" unknownFormat = printf "Unknown font format for object #%s : %s"
loadEncoding :: MonadFail m => Object -> m Font loadEncoding :: MonadError String m => Object -> m Font
loadEncoding (Direct (NameObject (Name name))) = encoding name loadEncoding (Direct (NameObject (Name name))) = encoding name
loadEncoding object = loadEncoding object =
fail $ printf "Encoding must be a name, not that : %s" $ show object throwError $ printf "Encoding must be a name, not that : %s" $ show object
loadResources :: Except m => Dictionary -> FontCache m FontSet loadResources :: (Alternative m, ROLayer m) => Dictionary -> FontCache m FontSet
loadResources = foldM addFont Map.empty . Map.toList loadResources = foldM addFont Map.empty . Map.toList
where where
addFont output (name, Reference (IndirectObjCoordinates {objectId})) = addFont output (name, Reference (IndirectObjCoordinates {objectId})) =
flip (Map.insert name) output <$> cache loadFont objectId flip (Map.insert name) output <$> cache loadFont objectId
addFont output _ = return output addFont output _ = return output
getReference :: DirectObject -> [(Id Object)] getReference :: DirectObject -> [Id Object]
getReference (Reference (IndirectObjCoordinates {objectId})) = [objectId] getReference (Reference (IndirectObjCoordinates {objectId})) = [objectId]
getReference _ = [] getReference _ = []
loadPage :: Except m => (Id Object) -> InLayer m Page loadPage :: (Alternative m, ROLayer m) => Id Object -> m Page
loadPage source = do loadPage source = do
page <- objectById source page <- objectById source
contents <- getKey "Contents" page >>= loadContents byteContents <- loadByteContents . value =<< getKey "Contents" page
resources <- getFontDictionary page resources <- getFontDictionary page
return $ Page {contents, resources, source} return $ Page {byteContents, resources, source}
pagesList :: Except m => InLayer m [(Id Object)] pagesList :: (Alternative m, ROLayer m) => m [Id Object]
pagesList = pagesList =
(origin // ["Root", "Pages"] >>= getKey "Kids" >>= return . getReferences) (catalog // PPath ["Root", "Pages", "Kids"] >>= getReferences . value)
<|> return [] <|> return []
where where
getReferences (Array kids) = kids >>= getReference getReferences (Direct (Array kids)) = return $ getReference =<< kids
getReferences _ = fail "Not a pages array" getReferences _ = throwError "Not a pages array"
updatePage :: MonadFail m => Page -> StateT Layer m () editPagesList :: RWLayer m => ([DirectObject] -> [DirectObject]) -> m ()
updatePage (Page {contents}) = sequence_ $ mapi updateContent contents editPagesList f = do
pages <- runRO (catalog // PPath ["Root", "Pages"])
kids <- runRO (pages >./ "Kids")
count <- runRO (pages >./ "Count")
(newSize, newKids) <- editKids (value kids)
save $ kids {value = newKids}
save $ count {value = Direct $ NumberObject newSize}
where where
updateContent source content = editKids (Direct (Array pageRefs)) =
edit . at Objects . at source . at Clear $ setContent content let result = f pageRefs in
setContent content _ = return . toStrict $ render LF content return (Number . fromIntegral $ length result, Direct $ Array result)
editKids _ = throwError "Invalid format for Root.Pages.Kids (not an array)"
updatePage :: RWLayer m => Page -> m ()
updatePage (Page {byteContents}) = sequence_ $ mapi updateByteContent byteContents
where
updateByteContent source byteContent =
edit .at source .at Clear $ \_ -> return byteContent
data Pages = Pages data Pages = Pages
newtype PageNumber = P Int newtype PageNumber = P Int
data Contents = Contents data Contents = Contents
instance (Alternative m, MonadFail m) => Box m Pages Layer (Map Int Page) where instance (Alternative m, MonadError String m) => Box m Pages Layer (Map Int Page) where
r Pages layer = runReaderT (numbered <$> (mapM loadPage =<< pagesList)) layer r Pages = runReaderT (numbered <$> pagesList >>= mapM loadPage)
where where
numbered :: [Page] -> Map Int Page numbered :: [Id Object] -> Map Int (Id Object)
numbered = Map.fromList . zip [1..] numbered = Map.fromList . zip [1..]
w Pages pages = execStateT $ mapM_ updatePage pages w Pages pages = execStateT $ do
mapM_ updatePage pages
instance (Alternative m, MonadFail m) => Box m PageNumber Layer Page where setPagesList $ Map.elems (source <$> pages)
r (P i) layer
| i < 1 = fail "Pages start at 1"
| otherwise = runReaderT (drop (i - 1) <$> pagesList >>= firstPage) layer
where where
firstPage [] = fail "Page is out of bounds" setPagesList =
firstPage (p:_) = loadPage p editPagesList . const . fmap (Reference . flip IndirectObjCoordinates 0)
w _ page = execStateT $ updatePage page instance (Alternative m, MonadError String m) => Box m PageNumber Layer Page where
r (P p) layer
| p < 1 = throwError "Pages start at 1"
| otherwise = runReaderT (drop (p - 1) <$> pagesList >>= firstPage) layer
where
firstPage =
maybe (throwError "Page is out of bounds") loadPage . listToMaybe
w (P p) page = execStateT $ do
updatePage page
editPagesList $ setPage (Reference $ IndirectObjCoordinates (source page) 0)
where
setPage ref l = take (p-1) l ++ ref : drop p l
instance Monad m => Box m Contents Page (OrderedMap (Id Object) Content) where instance Monad m => Box m Contents Page (OrderedMap (Id Object) Content) where
r Contents = return . contents r Contents = return . fmap Content.parse . byteContents
w _ contents page = return $ page {contents} w Contents contents page = return $ page {byteContents}
where
byteContents = toStrict . render LF <$> contents
withFonts :: Monad m => (Layer -> FontCache m a) -> Layer -> m a withFonts :: MonadError String m => (Layer -> FontCache (LayerReader m) a) -> Layer -> m a
withFonts f layer = runReaderT (evalStateT (f layer) Id.empty) layer withFonts f layer = runReaderT (evalStateT (f layer) Id.empty) layer
withResources :: Except m => (Page -> ReaderT FontSet m b) -> Page -> FontCache m b withResources :: (Alternative m, MonadError String m) => (Page -> ReaderT FontSet m b) -> Page -> FontCache (LayerReader m) b
withResources f p = withResources f p =
loadResources (resources p) >>= lift . lift . runReaderT (f p) loadResources (resources p) >>= lift . lift . runReaderT (f p)