Make a couple improvements in performance + add an example script to extract pages from a PDF
This commit is contained in:
parent
f6664683c7
commit
d9f69014a0
17 changed files with 448 additions and 258 deletions
|
@ -59,7 +59,7 @@ executable equivalent
|
|||
build-depends: base
|
||||
, bytestring
|
||||
, Hufflepdf
|
||||
ghc-options: -Wall
|
||||
ghc-options: -Wall -rtsopts
|
||||
default-language: Haskell2010
|
||||
|
||||
executable getObj
|
||||
|
@ -69,7 +69,7 @@ executable getObj
|
|||
, containers
|
||||
, Hufflepdf
|
||||
, mtl
|
||||
ghc-options: -Wall
|
||||
ghc-options: -Wall -rtsopts
|
||||
default-language: Haskell2010
|
||||
|
||||
executable getText
|
||||
|
@ -77,9 +77,34 @@ executable getText
|
|||
build-depends: base
|
||||
, bytestring
|
||||
, containers
|
||||
, ExceptIOH
|
||||
, Hufflepdf
|
||||
, mtl
|
||||
, text
|
||||
ghc-options: -Wall
|
||||
ghc-options: -Wall -rtsopts
|
||||
default-language: Haskell2010
|
||||
|
||||
executable pdfCut
|
||||
main-is: examples/pdfCut.hs
|
||||
build-depends: base
|
||||
, bytestring
|
||||
, containers
|
||||
, ExceptIOH
|
||||
, filepath
|
||||
, Hufflepdf
|
||||
, mtl
|
||||
, text
|
||||
ghc-options: -Wall -rtsopts
|
||||
default-language: Haskell2010
|
||||
|
||||
executable fixMermoz
|
||||
main-is: examples/fixMermoz.hs
|
||||
build-depends: base
|
||||
, bytestring
|
||||
, ExceptIOH
|
||||
, Hufflepdf
|
||||
, mtl
|
||||
ghc-options: -Wall -rtsopts
|
||||
default-language: Haskell2010
|
||||
|
||||
Test-Suite unitTests
|
||||
|
|
|
@ -1,8 +1,6 @@
|
|||
{-# LANGUAGE NamedFieldPuns #-}
|
||||
{-# LANGUAGE FlexibleContexts #-}
|
||||
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
|
||||
|
||||
import Control.Monad.Fail (MonadFail(..))
|
||||
import Control.Monad.Reader (ReaderT, runReaderT)
|
||||
import Data.ByteString.Lazy.Char8 (ByteString)
|
||||
import qualified Data.ByteString.Char8 as BS (readFile)
|
||||
|
@ -13,7 +11,7 @@ import PDF.Box (Box(..))
|
|||
import PDF.Layer (Layer(..), unify)
|
||||
import PDF.Object (Object(..))
|
||||
import PDF.Object.Navigation (
|
||||
Clear(..), Raw(..), (//), objectById, catalog
|
||||
Nav(..), PPath(..), StreamContent(..), (//), objectById, catalog
|
||||
)
|
||||
import PDF.Output (Output)
|
||||
import qualified PDF.Output as Output (render)
|
||||
|
@ -23,33 +21,27 @@ import System.Exit (die)
|
|||
import Text.Printf (printf)
|
||||
import Text.Read (readMaybe)
|
||||
|
||||
newtype Error a = Error {
|
||||
runError :: Either String a
|
||||
} deriving (Functor, Applicative, Monad)
|
||||
instance MonadFail Error where
|
||||
fail = Error . Left
|
||||
|
||||
decodedStream :: Object -> Object
|
||||
decodedStream object =
|
||||
maybe object id $ r Clear object >>= flip (w Raw) object
|
||||
either (const object) id $ r Clear object >>= flip (w Raw) object
|
||||
|
||||
display :: Output a => ReaderT Layer Error a -> Document -> Either String ByteString
|
||||
display :: Functor m => Output a => ReaderT Layer m a -> Document -> m ByteString
|
||||
display getter (Document {eolStyle, layers}) =
|
||||
Output.render eolStyle <$> runError (runReaderT getter (unify layers))
|
||||
Output.render eolStyle <$> runReaderT getter (unify layers)
|
||||
|
||||
parse :: [String] -> IO (FilePath, Document -> Either String ByteString)
|
||||
parse [inputFile] = return (inputFile, display catalog)
|
||||
parse [inputFile] = return (inputFile, display $ value <$> catalog)
|
||||
parse [inputFile, key] =
|
||||
return (inputFile, clear . maybe (byPath key) byId $ readMaybe key)
|
||||
where
|
||||
byId = objectById . Id
|
||||
byPath path = catalog // (explode path)
|
||||
byPath path = (catalog // PPath (explode path))
|
||||
explode "" = []
|
||||
explode path =
|
||||
case break (== '.') path of
|
||||
(name, "") -> [name]
|
||||
(name, rest) -> name : explode (drop 1 rest)
|
||||
clear = display . fmap decodedStream
|
||||
clear = display . fmap (decodedStream . value)
|
||||
parse _ =
|
||||
die . printf "Syntax: %s inputFile [OBJECT_ID | PATH_TO_OBJ]\n" =<< getProgName
|
||||
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
{-# LANGUAGE FlexibleContexts #-}
|
||||
import Control.Monad ((>=>))
|
||||
import Control.Monad.Except (ExceptT(..))
|
||||
import Control.Monad.Except.IOH (handle)
|
||||
import Control.Monad.IO.Class (liftIO)
|
||||
import qualified Data.ByteString.Char8 as BS (readFile)
|
||||
import Data.Id (Id(..), mapWithKey)
|
||||
|
@ -9,7 +11,7 @@ import qualified Data.Text as Text (unpack)
|
|||
import PDF (UnifiedLayers(..), parseDocument)
|
||||
import PDF.Box (Box(..))
|
||||
import PDF.Content.Text (Chunks(..))
|
||||
import PDF.Layer (Layer)
|
||||
import PDF.Layer (Layer, LayerReader)
|
||||
import PDF.Pages (
|
||||
Contents(..), FontCache, Page(..), PageNumber(..), Pages(..), withFonts
|
||||
, withResources
|
||||
|
@ -19,7 +21,7 @@ import System.Exit (die)
|
|||
import System.IO (BufferMode(..), hSetBuffering, stdout)
|
||||
import Text.Printf (printf)
|
||||
|
||||
displayPage :: Int -> Page -> FontCache IO ()
|
||||
displayPage :: Int -> Page -> FontCache (LayerReader (ExceptT String IO)) ()
|
||||
displayPage n = withResources (
|
||||
r Contents
|
||||
>=> sequence_ . mapi (\objectId ->
|
||||
|
@ -31,22 +33,22 @@ displayPage n = withResources (
|
|||
liftIO . putStrLn $
|
||||
printf "p#%d obj#%d instr#%d: %s" n (getId a) (getId b) (Text.unpack v)
|
||||
|
||||
getAll :: Layer -> IO ()
|
||||
getAll :: Layer -> ExceptT String IO ()
|
||||
getAll = withFonts $ r Pages >=> sequence_ . Map.mapWithKey displayPage
|
||||
|
||||
get :: Int -> Layer -> IO ()
|
||||
get :: Int -> Layer -> ExceptT String IO ()
|
||||
get n = withFonts $ r (P n) >=> displayPage n
|
||||
|
||||
onDoc :: FilePath -> (Layer -> IO ()) -> IO ()
|
||||
onDoc inputFile f = do
|
||||
(parseDocument <$> BS.readFile inputFile)
|
||||
>>= either die (r UnifiedLayers >=> f)
|
||||
onDoc :: FilePath -> (Layer -> ExceptT String IO ()) -> ExceptT String IO ()
|
||||
onDoc inputFile f =
|
||||
ExceptT (parseDocument <$> BS.readFile inputFile) >>= r UnifiedLayers >>= f
|
||||
|
||||
main :: IO ()
|
||||
main = do
|
||||
hSetBuffering stdout LineBuffering
|
||||
args <- getArgs
|
||||
case args of
|
||||
[inputFile] -> onDoc inputFile getAll
|
||||
[inputFile, pageNumber] -> onDoc inputFile (get $ read pageNumber)
|
||||
[inputFile] -> onDoc inputFile getAll `handle` die
|
||||
[inputFile, pageNumber] ->
|
||||
onDoc inputFile (get $ read pageNumber) `handle` die
|
||||
_ -> die "Syntax: getText INPUT_FILE [PAGE_NUMBER]"
|
||||
|
|
41
examples/pdfCut.hs
Normal file
41
examples/pdfCut.hs
Normal file
|
@ -0,0 +1,41 @@
|
|||
import Control.Monad.State (MonadState(..), evalStateT)
|
||||
import Control.Monad.Except.IOH (handle)
|
||||
import Control.Monad.Trans (lift)
|
||||
import qualified Data.ByteString.Char8 as BS (readFile)
|
||||
import qualified Data.ByteString.Lazy.Char8 as Lazy (writeFile)
|
||||
import Data.Map ((!), fromSet)
|
||||
import qualified Data.Set as Set (fromList)
|
||||
import PDF (Document, UnifiedLayers(..), parseDocument, render)
|
||||
import PDF.Box (at)
|
||||
import PDF.Pages (Pages(..))
|
||||
import System.Environment (getArgs)
|
||||
import System.Exit (die)
|
||||
import System.FilePath (replaceBaseName, takeBaseName)
|
||||
import Text.Read (readEither)
|
||||
|
||||
parseRange :: String -> Either String (Int, Int)
|
||||
parseRange = evalStateT $ do
|
||||
from <- lift . readEither =<< state (break (== '-'))
|
||||
get >>= makeRange from
|
||||
where
|
||||
makeRange from "" = return (from, from)
|
||||
makeRange from (_:to) = (,) from <$> lift (readEither to)
|
||||
|
||||
cut :: (Int, Int) -> Document -> IO Document
|
||||
cut (p1, p2) doc = ((
|
||||
at UnifiedLayers .at Pages $ \pages ->
|
||||
return $ fromSet (pages!) $ Set.fromList [pMin .. pMax]
|
||||
) doc) `handle` die
|
||||
where
|
||||
(pMin, pMax) = (min p1 p2, max p1 p2)
|
||||
|
||||
main :: IO ()
|
||||
main = do
|
||||
[inputPath, inputRange] <- getArgs
|
||||
range <- catchLeft $ parseRange inputRange
|
||||
doc <- catchLeft =<< parseDocument <$> BS.readFile inputPath
|
||||
Lazy.writeFile (outputPath inputPath inputRange) . render =<< cut range doc
|
||||
where
|
||||
catchLeft = either die return
|
||||
outputPath fileName range =
|
||||
replaceBaseName fileName (takeBaseName fileName ++ "_p" ++ range)
|
|
@ -14,7 +14,7 @@ module Data.ByteString.Char8.Util (
|
|||
) where
|
||||
|
||||
import Data.ByteString (ByteString, snoc)
|
||||
import qualified Data.ByteString as BS (empty, foldl, length, pack, singleton, splitAt)
|
||||
import qualified Data.ByteString as BS (empty, foldl', length, pack, singleton, splitAt)
|
||||
import qualified Data.ByteString.Char8 as Char8 (
|
||||
cons, drop, index, splitAt, take, uncons, unpack
|
||||
)
|
||||
|
@ -43,7 +43,7 @@ intToB256 n
|
|||
B256Int $ begining `snoc` (toEnum (n `mod` 0x100))
|
||||
|
||||
b256ToInt :: B256Int -> Int
|
||||
b256ToInt (B256Int n) = BS.foldl (\k w -> 0x100*k + fromEnum w) 0 n
|
||||
b256ToInt (B256Int n) = BS.foldl' (\k w -> 0x100*k + fromEnum w) 0 n
|
||||
|
||||
toBytes :: Int -> Int -> ByteString
|
||||
toBytes 0 _ = BS.empty
|
||||
|
|
|
@ -8,6 +8,7 @@ module Data.Id (
|
|||
, at
|
||||
, delete
|
||||
, empty
|
||||
, filterWithKey
|
||||
, fromList
|
||||
, insert
|
||||
, keysSet
|
||||
|
@ -21,11 +22,11 @@ module Data.Id (
|
|||
, union
|
||||
) where
|
||||
|
||||
import Control.Monad.State (MonadState, modify, gets)
|
||||
import Control.Monad.State.Strict (MonadState, modify, gets)
|
||||
import Data.IntMap (IntMap, (!))
|
||||
import qualified Data.IntMap as IntMap (
|
||||
delete, empty, fromList, keysSet, insert, lookup, mapWithKey, maxViewWithKey
|
||||
, member, minViewWithKey, size, union
|
||||
delete, empty, filterWithKey, fromList, keysSet, insert, lookup
|
||||
, mapWithKey, maxViewWithKey, member, minViewWithKey, size, union
|
||||
)
|
||||
import Data.IntSet (IntSet)
|
||||
import Prelude hiding (lookup)
|
||||
|
@ -74,6 +75,9 @@ union (IdMap intMap1) (IdMap intMap2) =
|
|||
mapWithKey :: (Id a -> b -> c) -> IdMap a b -> IdMap a c
|
||||
mapWithKey f (IdMap idMap) = IdMap {intMap = IntMap.mapWithKey (f . Id) idMap}
|
||||
|
||||
filterWithKey :: (Id a -> b -> Bool) -> IdMap a b -> IdMap a b
|
||||
filterWithKey f = IdMap . IntMap.filterWithKey (f . Id) . intMap
|
||||
|
||||
fromList :: [(Id a, b)] -> IdMap a b
|
||||
fromList = IdMap . IntMap.fromList . fmap (\(key, b) -> (getId key, b))
|
||||
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
{-# LANGUAGE NamedFieldPuns #-}
|
||||
{-# LANGUAGE FunctionalDependencies #-}
|
||||
{-# LANGUAGE FlexibleInstances #-}
|
||||
{-# LANGUAGE FlexibleContexts #-}
|
||||
{-# LANGUAGE UndecidableInstances #-}
|
||||
{-# LANGUAGE ScopedTypeVariables #-}
|
||||
module PDF.Box (
|
||||
|
@ -17,8 +16,8 @@ module PDF.Box (
|
|||
|
||||
import Control.Monad.Except (ExceptT(..), runExceptT)
|
||||
import Control.Monad.Fail (MonadFail(..))
|
||||
import Control.Monad.State (MonadState(..))
|
||||
import Control.Monad.Reader (ReaderT, runReaderT)
|
||||
import Control.Monad.State (MonadState(..))
|
||||
import Data.Id (Id, IdMap)
|
||||
import qualified Data.Id as Id (insert, lookup)
|
||||
import Data.Map (Map)
|
||||
|
|
|
@ -1,15 +1,17 @@
|
|||
{-# LANGUAGE NamedFieldPuns #-}
|
||||
{-# LANGUAGE OverloadedStrings #-}
|
||||
{-# LANGUAGE FlexibleContexts #-}
|
||||
module PDF.CMap (
|
||||
CMap
|
||||
, CMappers
|
||||
, CRange(..)
|
||||
, cMap
|
||||
, matches
|
||||
, parse
|
||||
) where
|
||||
|
||||
import Control.Applicative ((<|>), many)
|
||||
import Control.Monad.Fail (MonadFail(..))
|
||||
import Control.Monad.Except (MonadError(..))
|
||||
import Control.Monad.Fail (fail)
|
||||
import Control.Monad.State (modify)
|
||||
import Data.Attoparsec.ByteString.Char8 (count)
|
||||
import Data.ByteString (ByteString)
|
||||
|
@ -18,6 +20,7 @@ import Data.ByteString.Char8 (unpack)
|
|||
import Data.ByteString.Char8.Util (
|
||||
B16Int(..), b16ToBytes, b16ToInt, toBytes, utf16BEToutf8
|
||||
)
|
||||
import Data.Foldable (foldl', foldr')
|
||||
import Data.Map (Map, mapWithKey, union)
|
||||
import qualified Data.Map as Map (
|
||||
adjust, empty, fromList, insert, insertWith, lookup, toList
|
||||
|
@ -75,7 +78,7 @@ encoder :: Map Size FromUnicode -> Encoder
|
|||
encoder fromUnicodes input
|
||||
| Text.null input = Right ""
|
||||
| otherwise =
|
||||
foldl (<>) (Left "No encoding found") $ mapWithKey tryOn fromUnicodes
|
||||
foldr' (<>) (Left "No encoding found") $ mapWithKey tryOn fromUnicodes
|
||||
where
|
||||
tryOn size fromUnicode =
|
||||
let (prefix, end) = Text.splitAt size input in
|
||||
|
@ -87,8 +90,8 @@ matches :: ByteString -> CRange -> Bool
|
|||
matches code (CRange {fromSequence, toSequence}) =
|
||||
fromSequence <= code && code <= toSequence
|
||||
|
||||
cMap :: MonadFail m => ByteString -> m Font
|
||||
cMap = either fail (return . toFont . snd) . runParser
|
||||
parse :: MonadError String m => ByteString -> m Font
|
||||
parse = either throwError (return . toFont . snd) . runParser
|
||||
(many (codeRanges <|> cMapRange <|> cMapChar <|> ignoredLine))
|
||||
emptyCMap
|
||||
where
|
||||
|
@ -148,7 +151,7 @@ saveToUnicodeBySize assoc@((code, _):_) = Map.adjust insertCRange (BS.length cod
|
|||
)
|
||||
|
||||
saveFromUnicodeBySize :: [(Text, ByteString)] -> Map Size FromUnicode -> Map Size FromUnicode
|
||||
saveFromUnicodeBySize = flip (foldl insertFromUnicode)
|
||||
saveFromUnicodeBySize = flip (foldl' insertFromUnicode)
|
||||
where
|
||||
insertFromUnicode :: Map Size FromUnicode -> (Text, ByteString) -> Map Size FromUnicode
|
||||
insertFromUnicode tmpFromUnicodeBySize (unicodeSequence, code) =
|
||||
|
|
|
@ -9,54 +9,77 @@ module PDF.Content (
|
|||
Content(..)
|
||||
, ContentUnit(..)
|
||||
, GraphicContextUnit(..)
|
||||
, IdContentUnit
|
||||
, IdGraphicContextUnit
|
||||
, IdTextContext
|
||||
, Instructions(..)
|
||||
, TextContext
|
||||
, parse
|
||||
) where
|
||||
|
||||
import Control.Applicative ((<|>))
|
||||
import Control.Monad.Fail (MonadFail)
|
||||
import Control.Monad.Reader (asks, runReader)
|
||||
import Control.Monad.State (evalStateT, modify)
|
||||
import Control.Monad.State.Strict (runState, evalStateT, modify)
|
||||
import Data.Attoparsec.ByteString.Char8 (sepBy)
|
||||
import qualified Data.Attoparsec.ByteString.Char8 as Atto (Parser, parseOnly)
|
||||
import Data.ByteString (ByteString)
|
||||
import Data.Id (Id(..), Indexed, at, empty, register)
|
||||
import PDF.Box (Box(..))
|
||||
import PDF.Content.Operator (Instruction, operator)
|
||||
import PDF.Object (blank, directObject)
|
||||
import PDF.Output (Output(..), line)
|
||||
import PDF.Parser (Parser, runParser, string)
|
||||
import PDF.Parser (string)
|
||||
|
||||
data Instructions = Instructions
|
||||
|
||||
data GraphicContextUnit =
|
||||
GraphicInstruction (Id Instruction)
|
||||
| ContentUnit ContentUnit
|
||||
data GraphicContextUnit a =
|
||||
GraphicInstruction a
|
||||
| ContentUnit (ContentUnit a)
|
||||
deriving Show
|
||||
type TextContext = [Id Instruction]
|
||||
data ContentUnit =
|
||||
GraphicContext [GraphicContextUnit]
|
||||
| TextContext TextContext
|
||||
type TextContext a = [a]
|
||||
data ContentUnit a =
|
||||
GraphicContext [GraphicContextUnit a]
|
||||
| TextContext (TextContext a)
|
||||
deriving Show
|
||||
data Content = Content {
|
||||
contentUnits :: [ContentUnit]
|
||||
contentUnits :: [IdContentUnit]
|
||||
, indexedInstructions :: Indexed Instruction
|
||||
, firstError :: Maybe String
|
||||
} deriving Show
|
||||
|
||||
type InstructionParser = Parser (Indexed Instruction)
|
||||
type TmpContentUnit = ContentUnit Instruction
|
||||
type TmpGraphicContextUnit = GraphicContextUnit Instruction
|
||||
type TmpTextContext = TextContext Instruction
|
||||
|
||||
type IdContentUnit = ContentUnit (Id Instruction)
|
||||
type IdGraphicContextUnit = GraphicContextUnit (Id Instruction)
|
||||
type IdTextContext = TextContext (Id Instruction)
|
||||
|
||||
instance Monad m => Box m Instructions Content (Indexed Instruction) where
|
||||
r Instructions = return . indexedInstructions
|
||||
w Instructions indexedInstructions someContent =
|
||||
return $ someContent {indexedInstructions}
|
||||
|
||||
parse :: MonadFail m => ByteString -> m Content
|
||||
parse =
|
||||
either fail (return . uncurry Content) . runParser contentUnits empty
|
||||
where
|
||||
contentUnits = contentUnit `sepBy` blank
|
||||
parse :: ByteString -> Content
|
||||
parse input =
|
||||
let result = Atto.parseOnly (contentUnit `sepBy` blank) input in
|
||||
let (contentUnits, indexedInstructions) = either (const ([], empty)) buildContent result in
|
||||
let firstError = either Just (const Nothing) result in
|
||||
Content {contentUnits, indexedInstructions, firstError}
|
||||
|
||||
contentUnit :: InstructionParser ContentUnit
|
||||
buildContent :: [TmpContentUnit] -> ([IdContentUnit], Indexed Instruction)
|
||||
buildContent instructionContentUnits =
|
||||
runState (mapM registerContentUnit instructionContentUnits) empty
|
||||
where
|
||||
registerContentUnit (GraphicContext gc) =
|
||||
GraphicContext <$> (mapM registerGraphicContext gc)
|
||||
registerContentUnit (TextContext tc) = TextContext <$> (mapM register tc)
|
||||
registerGraphicContext (GraphicInstruction gi) =
|
||||
GraphicInstruction <$> (register gi)
|
||||
registerGraphicContext (ContentUnit cu) =
|
||||
ContentUnit <$> (registerContentUnit cu)
|
||||
|
||||
contentUnit :: Atto.Parser TmpContentUnit
|
||||
contentUnit =
|
||||
(GraphicContext <$> graphicContext)
|
||||
<|> (TextContext <$> textContext)
|
||||
|
@ -64,18 +87,18 @@ contentUnit =
|
|||
graphicContext =
|
||||
string "q" *> blank *> graphicContextUnit `sepBy` blank <* blank <* string "Q"
|
||||
|
||||
graphicContextUnit :: InstructionParser GraphicContextUnit
|
||||
graphicContextUnit :: Atto.Parser TmpGraphicContextUnit
|
||||
graphicContextUnit =
|
||||
(GraphicInstruction <$> instruction)
|
||||
<|> (ContentUnit <$> contentUnit)
|
||||
|
||||
instruction :: InstructionParser (Id Instruction)
|
||||
instruction = evalStateT stackParser [] >>= register
|
||||
instruction :: Atto.Parser Instruction
|
||||
instruction = evalStateT stackParser []
|
||||
where
|
||||
stackParser = ((directObject <* blank) >>= push) <|> operator
|
||||
push arg = modify (arg:) *> stackParser
|
||||
|
||||
textContext :: InstructionParser TextContext
|
||||
textContext :: Atto.Parser TmpTextContext
|
||||
textContext =
|
||||
string "BT" *> blank *> instruction `sepBy` blank <* blank <* string "ET"
|
||||
|
||||
|
|
|
@ -1,24 +1,22 @@
|
|||
{-# LANGUAGE NamedFieldPuns #-}
|
||||
{-# LANGUAGE OverloadedStrings #-}
|
||||
{-# LANGUAGE ConstraintKinds #-}
|
||||
{-# LANGUAGE FlexibleContexts #-}
|
||||
{-# LANGUAGE TypeSynonymInstances #-}
|
||||
{-# LANGUAGE FlexibleInstances #-}
|
||||
{-# LANGUAGE MultiParamTypeClasses #-}
|
||||
module PDF.Content.Text (
|
||||
Chunks(..)
|
||||
, chunk
|
||||
, format
|
||||
, renderText
|
||||
) where
|
||||
|
||||
import Control.Monad (foldM)
|
||||
import Control.Monad.Fail (MonadFail(..))
|
||||
import Control.Monad.Reader (MonadReader(..), ReaderT, asks, runReaderT)
|
||||
import Control.Monad.Reader (ReaderT, asks, runReaderT)
|
||||
import Control.Monad.State (
|
||||
MonadState(..), StateT, evalStateT, gets, modify, runStateT
|
||||
)
|
||||
import Control.Monad.Trans (lift)
|
||||
import qualified Data.ByteString.Char8 as BS (concatMap, singleton)
|
||||
import Data.Id (Id(..), Indexed, at, empty, singleton)
|
||||
import qualified Data.Id as Id (delete, lookup, register)
|
||||
import Data.Map ((!))
|
||||
|
@ -26,27 +24,20 @@ import Data.Text (Text, breakOn)
|
|||
import qualified Data.Text as Text (drop)
|
||||
import PDF.Box (Box(..))
|
||||
import PDF.Content (
|
||||
Content, ContentUnit(..), GraphicContextUnit(..), contentUnits
|
||||
Content(..), ContentUnit(..), GraphicContextUnit(..), IdContentUnit
|
||||
, IdGraphicContextUnit
|
||||
)
|
||||
import qualified PDF.Content as Content (Content(..))
|
||||
import PDF.Content.Operator (Instruction, Operator(..))
|
||||
import PDF.Content.Operator.Text (Operator(..))
|
||||
import PDF.Font (Font(..), FontSet, emptyFont)
|
||||
import PDF.Object (DirectObject(..), StringObject(..), toByteString)
|
||||
import Prelude hiding (fail)
|
||||
|
||||
data ROContext = ROContext {
|
||||
indexedInstructions :: Indexed Instruction
|
||||
, fontSet :: FontSet
|
||||
}
|
||||
|
||||
type TextContent m = ReaderT ROContext m
|
||||
type FontContext m = StateT Font (TextContent m)
|
||||
|
||||
type TmpFont = StateT Font
|
||||
type Renderer m = ReaderT (Indexed Instruction) (ReaderT FontSet m)
|
||||
type Updater m = StateT (Indexed Instruction) (ReaderT (Indexed Text) (ReaderT FontSet m))
|
||||
type TextUpdater m = StateT Font (Updater m)
|
||||
|
||||
decodeString :: MonadFail m => StringObject -> FontContext m Text
|
||||
decodeString :: MonadFail m => StringObject -> TmpFont (Renderer m) Text
|
||||
decodeString input = do
|
||||
Font {decode} <- get
|
||||
either fail return . decode $ toByteString input
|
||||
|
@ -57,56 +48,53 @@ chunk :: Int -> Id Text
|
|||
chunk = Id
|
||||
|
||||
instance MonadFail m => Box (ReaderT FontSet m) Chunks Content (Indexed Text) where
|
||||
r Chunks content = do
|
||||
fontSet <- ask
|
||||
renderText fontSet content
|
||||
r Chunks content =
|
||||
runReaderT (mconcat <$> renderer) $ indexedInstructions content
|
||||
where
|
||||
renderer = mapM renderContentUnit (contentUnits content)
|
||||
|
||||
w Chunks indexedText (Content.Content {contentUnits, Content.indexedInstructions}) =
|
||||
uncurry Content.Content <$> runReaderT (
|
||||
runStateT (mapM updateContentUnit contentUnits) indexedInstructions
|
||||
) indexedText
|
||||
w Chunks indexedText content = do
|
||||
(contentUnits, indexedInstructions) <- runReaderT readerUpdate indexedText
|
||||
return $ content {contentUnits, indexedInstructions}
|
||||
where
|
||||
stateUpdate = mapM updateContentUnit (contentUnits content)
|
||||
readerUpdate = runStateT stateUpdate (indexedInstructions content)
|
||||
|
||||
renderText :: MonadFail m => FontSet -> Content -> m (Indexed Text)
|
||||
renderText fontSet (Content.Content {contentUnits, Content.indexedInstructions}) =
|
||||
runReaderT (mconcat <$> mapM renderContentUnit contentUnits) roContext
|
||||
where
|
||||
roContext = ROContext {indexedInstructions, fontSet}
|
||||
|
||||
renderContentUnit :: MonadFail m => ContentUnit -> TextContent m (Indexed Text)
|
||||
renderContentUnit :: MonadFail m => IdContentUnit -> Renderer m (Indexed Text)
|
||||
renderContentUnit (GraphicContext graphicContextUnits) =
|
||||
mconcat <$> mapM renderGraphicContextUnit graphicContextUnits
|
||||
renderContentUnit (TextContext instructionIds) =
|
||||
evalStateT (mconcat <$> mapM renderInstructionId instructionIds) emptyFont
|
||||
|
||||
updateContentUnit :: MonadFail m => ContentUnit -> Updater m ContentUnit
|
||||
updateContentUnit :: MonadFail m => IdContentUnit -> Updater m IdContentUnit
|
||||
updateContentUnit (GraphicContext graphicContextUnits) = GraphicContext <$>
|
||||
mapM updateGraphicContextUnit graphicContextUnits
|
||||
updateContentUnit (TextContext instructionIds) = TextContext . concat <$>
|
||||
evalStateT (mapM updateInstructionId instructionIds) emptyFont
|
||||
|
||||
renderGraphicContextUnit :: MonadFail m => GraphicContextUnit -> TextContent m (Indexed Text)
|
||||
renderGraphicContextUnit :: MonadFail m => IdGraphicContextUnit -> Renderer m (Indexed Text)
|
||||
renderGraphicContextUnit (GraphicInstruction _) = return empty
|
||||
renderGraphicContextUnit (ContentUnit contentUnit) =
|
||||
renderContentUnit contentUnit
|
||||
|
||||
updateGraphicContextUnit :: MonadFail m => GraphicContextUnit -> Updater m GraphicContextUnit
|
||||
updateGraphicContextUnit :: MonadFail m => IdGraphicContextUnit -> Updater m IdGraphicContextUnit
|
||||
updateGraphicContextUnit gI@(GraphicInstruction _) = return gI
|
||||
updateGraphicContextUnit (ContentUnit contentUnit) =
|
||||
ContentUnit <$> updateContentUnit contentUnit
|
||||
|
||||
renderInstructionId :: MonadFail m => Id Instruction -> FontContext m (Indexed Text)
|
||||
renderInstructionId :: MonadFail m => Id Instruction -> TmpFont (Renderer m) (Indexed Text)
|
||||
renderInstructionId instructionId@(Id n) = toMap <$>
|
||||
(asks ((`at` instructionId) . indexedInstructions) >>= renderInstruction)
|
||||
(asks ((`at` instructionId)) >>= renderInstruction)
|
||||
where
|
||||
toMap = maybe empty (singleton (Id n))
|
||||
|
||||
updateInstructionId :: MonadFail m => Id Instruction -> TextUpdater m [Id Instruction]
|
||||
updateInstructionId :: MonadFail m => Id Instruction -> TmpFont (Updater m) [Id Instruction]
|
||||
updateInstructionId instructionId =
|
||||
lift (gets (`at` instructionId)) >>= updateInstruction instructionId
|
||||
|
||||
renderInstruction :: MonadFail m => Instruction -> FontContext m (Maybe Text)
|
||||
renderInstruction :: MonadFail m => Instruction -> TmpFont (Renderer m) (Maybe Text)
|
||||
renderInstruction (Text Tf, [NameObject fontName, _]) =
|
||||
asks ((! fontName) . fontSet) >>= put >> return Nothing
|
||||
lift (lift $ asks (! fontName)) >>= put >> return Nothing
|
||||
|
||||
renderInstruction (Text Tstar, []) = return $ Just "\n"
|
||||
|
||||
|
@ -128,7 +116,7 @@ renderInstruction (Text DQuote, [_, _, StringObject outputString]) =
|
|||
|
||||
renderInstruction _ = return Nothing
|
||||
|
||||
updateInstruction :: MonadFail m => Id Instruction -> Instruction -> TextUpdater m [Id Instruction]
|
||||
updateInstruction :: MonadFail m => Id Instruction -> Instruction -> TmpFont (Updater m) [Id Instruction]
|
||||
updateInstruction instructionId (Text Tf, [NameObject fontName, _]) =
|
||||
(lift . lift . lift $ asks (!fontName)) >>= put >> return [instructionId]
|
||||
|
||||
|
@ -155,5 +143,9 @@ format input = do
|
|||
(line, left) -> (:) <$> tj line <*> format left
|
||||
where
|
||||
tj t = do
|
||||
literal <- either fail (return . Literal) =<< gets (($t) . encode)
|
||||
return (Text Tj, [StringObject literal])
|
||||
encoded <- either fail return =<< gets (($t) . encode)
|
||||
return (Text Tj, [StringObject . Literal $ BS.concatMap escape encoded])
|
||||
escape '\\' = "\\\\"
|
||||
escape '(' = "\\("
|
||||
escape ')' = "\\)"
|
||||
escape c = BS.singleton c
|
||||
|
|
|
@ -1,12 +1,13 @@
|
|||
{-# LANGUAGE FlexibleContexts #-}
|
||||
module PDF.Encoding (
|
||||
encoding
|
||||
) where
|
||||
|
||||
import Control.Monad.Fail (MonadFail(..))
|
||||
import Control.Monad.Except (MonadError(..))
|
||||
import PDF.Encoding.MacRoman (macRomanEncoding)
|
||||
import PDF.Font (Font)
|
||||
import Prelude hiding (fail)
|
||||
|
||||
encoding :: MonadFail m => String -> m Font
|
||||
encoding :: MonadError String m => String -> m Font
|
||||
encoding "MacRomanEncoding" = return macRomanEncoding
|
||||
encoding s = fail $ "Unknown encoding " ++ s
|
||||
encoding s = throwError $ "Unknown encoding " ++ s
|
||||
|
|
|
@ -3,6 +3,7 @@ module PDF.Encoding.MacRoman (
|
|||
) where
|
||||
|
||||
import qualified Data.ByteString.Char8 as BS (pack, unpack)
|
||||
import Data.Foldable (foldl')
|
||||
import Data.Map (Map)
|
||||
import qualified Data.Map as Map (empty, insert, lookup)
|
||||
import qualified Data.Text as Text (pack, unpack)
|
||||
|
@ -24,7 +25,7 @@ macRomanEncoding = Font {
|
|||
| otherwise -> Left ("Character '" ++ k :"' unavailable in MacRoman")
|
||||
|
||||
mappers :: (Mapper, Mapper)
|
||||
mappers = foldl generateMapers (Map.empty, Map.empty) [
|
||||
mappers = foldl' generateMapers (Map.empty, Map.empty) [
|
||||
('\x80', '\x00C4') -- LATIN CAPITAL LETTER A WITH DIAERESIS
|
||||
, ('\x81', '\x00C5') -- LATIN CAPITAL LETTER A WITH RING ABOVE
|
||||
, ('\x82', '\x00C7') -- LATIN CAPITAL LETTER C WITH CEDILLA
|
||||
|
|
|
@ -1,21 +1,26 @@
|
|||
{-# LANGUAGE NamedFieldPuns #-}
|
||||
{-# LANGUAGE FlexibleInstances #-}
|
||||
{-# LANGUAGE MultiParamTypeClasses #-}
|
||||
{-# LANGUAGE FlexibleContexts #-}
|
||||
module PDF.Layer (
|
||||
Layer(..)
|
||||
, LayerReader
|
||||
, Objects(..)
|
||||
, unify
|
||||
) where
|
||||
|
||||
import Control.Monad.Except (MonadError(..))
|
||||
import Control.Monad.Reader (ReaderT)
|
||||
import Data.Foldable (foldl')
|
||||
import Data.Id (Id(..), Indexed, keysSet, mapWithKey, member)
|
||||
import qualified Data.Id as Id (empty, union)
|
||||
import qualified Data.Id as Id (empty, insert, lookup, union)
|
||||
import Data.Map (Map, (!))
|
||||
import qualified Data.Map as Map (empty, union)
|
||||
import qualified Data.Map as Map (empty, lookup, union)
|
||||
import qualified Data.IntSet as IntSet (delete, toList)
|
||||
import PDF.Box (Box(..))
|
||||
import PDF.Object (
|
||||
IndirectObjCoordinates(..), Object, Occurrence(..)
|
||||
, Structure(..), XRefEntry(..), XRefSection, eofMarker, outputBody
|
||||
IndirectObjCoordinates(..), Object, Occurrence(..), Structure(..)
|
||||
, XRefEntry(..), XRefSection, eofMarker, outputBody
|
||||
)
|
||||
import qualified PDF.Output as Output (line)
|
||||
import PDF.Output (
|
||||
|
@ -26,24 +31,28 @@ import Text.Printf (printf)
|
|||
|
||||
data Layer = Layer {
|
||||
occurrences :: [Occurrence]
|
||||
, objects :: (Indexed Object)
|
||||
, objects :: Indexed Object
|
||||
, docStructure :: Structure
|
||||
} deriving Show
|
||||
type LayerReader m = ReaderT Layer m
|
||||
|
||||
updateXrefs :: XRefSection -> Map Resource Offset -> (XRefSection, Offset)
|
||||
updateXrefs xRef offsets = (mapWithKey updateEntry xRef, offsets ! StartXRef)
|
||||
updateXRefs :: XRefSection -> Map Resource Offset -> (XRefSection, Offset)
|
||||
updateXRefs xRef offsets = (mapWithKey updateEntry xRef, offsets ! StartXRef)
|
||||
where
|
||||
updateEntry objectId e@(InUse {}) =
|
||||
e {offset = offsets ! (ObjectId $ getId objectId)}
|
||||
updateEntry objectId e@(InUse {offset}) =
|
||||
case Map.lookup (ObjectId $ getId objectId) offsets of
|
||||
Nothing -> Free {nextFree = Id $ getOffset offset, generation = 65535}
|
||||
Just newOffset -> e {offset = newOffset}
|
||||
updateEntry _ e = e
|
||||
|
||||
instance Output Layer where
|
||||
output (Layer {occurrences, objects, docStructure}) =
|
||||
fmap (updateXrefs xRef) <$> getOffsets (outputBody (occurrences, objects))
|
||||
>>= \(body, (xref, startXRef)) -> mconcat [
|
||||
body
|
||||
output (Layer {occurrences, objects, docStructure}) = do
|
||||
(body, savedOffsets) <- getOffsets (outputBody (occurrences, objects))
|
||||
let (newXRef, startXRef) = updateXRefs xRef savedOffsets
|
||||
mconcat [
|
||||
return body
|
||||
, Output.line "xref"
|
||||
, output xref
|
||||
, output newXRef
|
||||
, Output.line "trailer"
|
||||
, output trailer, newLine
|
||||
, Output.line "startxref"
|
||||
|
@ -76,6 +85,14 @@ instance Monad m => Box m Objects Layer (Indexed Object) where
|
|||
Indirect (IndirectObjCoordinates {objectId, versionNumber = 0})
|
||||
newOccurrences = (makeOccurrence . Id) <$> IntSet.toList newObjectIds
|
||||
|
||||
instance MonadError String m => Box m (Id Object) Layer Object where
|
||||
r objectId =
|
||||
maybe (throwError "Unknown key") return . Id.lookup objectId . objects
|
||||
w objectId a layer@(Layer {objects})
|
||||
| member objectId objects = return $
|
||||
layer {objects = Id.insert objectId a objects}
|
||||
| otherwise = throwError "Unknown key"
|
||||
|
||||
emptyLayer :: Layer
|
||||
emptyLayer = Layer {
|
||||
docStructure = Structure {xRef = Id.empty, trailer = Map.empty}
|
||||
|
@ -84,7 +101,7 @@ emptyLayer = Layer {
|
|||
}
|
||||
|
||||
unify :: [Layer] -> Layer
|
||||
unify = foldl complete emptyLayer
|
||||
unify = foldl' complete emptyLayer
|
||||
where
|
||||
complete tmpLayer older =
|
||||
let mergedObjects = Id.union (objects tmpLayer) (objects older) in
|
||||
|
|
|
@ -34,6 +34,7 @@ module PDF.Object (
|
|||
) where
|
||||
|
||||
import Control.Applicative ((<|>), many)
|
||||
import Control.Monad.Reader (asks)
|
||||
import Data.Attoparsec.ByteString.Char8 (choice, count, option, sepBy)
|
||||
import Data.ByteString (ByteString)
|
||||
import qualified Data.ByteString as BS (concat)
|
||||
|
@ -48,11 +49,11 @@ import qualified Data.Id as Id (
|
|||
import Data.Map (Map)
|
||||
import qualified Data.Map as Map (fromList, toList)
|
||||
import qualified Data.Set as Set (fromList, member)
|
||||
import qualified PDF.EOL as EOL (charset, parser)
|
||||
import qualified PDF.EOL as EOL (Style(..), charset, parser)
|
||||
import qualified PDF.Output as Output (line, string)
|
||||
import PDF.Output (
|
||||
OBuilder, Offset(..), Output(..), Resource(..)
|
||||
, byteString, getOffset, join, newLine, saveOffset
|
||||
OBuilder, OContext(..), Offset(..), Output(..), Resource(..), byteString
|
||||
, getOffset, join, newLine, saveOffset
|
||||
)
|
||||
import PDF.Parser (MonadParser(..), Parser, (<?>), octDigit, oneOf)
|
||||
import Text.Printf (printf)
|
||||
|
@ -190,8 +191,8 @@ nullObject = string "null" *> return () <?> "null object"
|
|||
-- Reference
|
||||
--
|
||||
data IndirectObjCoordinates = IndirectObjCoordinates {
|
||||
objectId :: (Id Object)
|
||||
, versionNumber :: Int
|
||||
objectId :: {-# UNPACK #-} !(Id Object)
|
||||
, versionNumber :: {-# UNPACK #-} !Int
|
||||
} deriving Show
|
||||
|
||||
reference :: MonadParser m => m IndirectObjCoordinates
|
||||
|
@ -202,14 +203,14 @@ reference = IndirectObjCoordinates
|
|||
-- DirectObject
|
||||
--
|
||||
data DirectObject =
|
||||
Boolean Bool
|
||||
| NumberObject Number
|
||||
| StringObject StringObject
|
||||
| NameObject Name
|
||||
| Array [DirectObject]
|
||||
| Dictionary Dictionary
|
||||
Boolean !Bool
|
||||
| NumberObject !Number
|
||||
| StringObject !StringObject
|
||||
| NameObject !Name
|
||||
| Array ![DirectObject]
|
||||
| Dictionary !Dictionary
|
||||
| Null
|
||||
| Reference IndirectObjCoordinates
|
||||
| Reference !IndirectObjCoordinates
|
||||
deriving Show
|
||||
|
||||
instance Output DirectObject where
|
||||
|
@ -294,10 +295,15 @@ data XRefEntry = InUse {
|
|||
} deriving Show
|
||||
|
||||
instance Output XRefEntry where
|
||||
output (InUse {offset, generation}) =
|
||||
Output.line (printf "%010d %05d n " (getOffset offset) generation)
|
||||
output (Free {nextFree, generation}) =
|
||||
Output.line (printf "%010d %05d f " (getId nextFree) generation)
|
||||
output xRefEntry = Output.string (build xRefEntry) `mappend` endXRefEntryLine
|
||||
where
|
||||
build (InUse {offset, generation}) =
|
||||
printf "%010d %05d n" (getOffset offset) generation
|
||||
build (Free {nextFree, generation}) =
|
||||
printf "%010d %05d f" (getId nextFree) generation
|
||||
endXRefEntryLine = OContext (asks padEOLToTwoBytes) >>= Output.line
|
||||
padEOLToTwoBytes EOL.CRLF = ("" :: String)
|
||||
padEOLToTwoBytes _ = " "
|
||||
|
||||
entry :: Parser u XRefEntry
|
||||
entry = do
|
||||
|
|
|
@ -3,98 +3,162 @@
|
|||
{-# LANGUAGE FlexibleContexts #-}
|
||||
{-# LANGUAGE MultiParamTypeClasses #-}
|
||||
{-# LANGUAGE FlexibleInstances #-}
|
||||
{-# LANGUAGE BangPatterns #-}
|
||||
{-# LANGUAGE DeriveFunctor #-}
|
||||
module PDF.Object.Navigation (
|
||||
Clear(..)
|
||||
, PDFContent
|
||||
, Raw(..)
|
||||
Nav(..)
|
||||
, PPath(..)
|
||||
, ROLayer
|
||||
, RWLayer
|
||||
, StreamContent(..)
|
||||
, (./)
|
||||
, (//)
|
||||
, (>./)
|
||||
, (>//)
|
||||
, castObject
|
||||
, catalog
|
||||
, getDictionary
|
||||
, getKey
|
||||
, objectById
|
||||
, catalog
|
||||
, save
|
||||
) where
|
||||
|
||||
import Codec.Compression.Zlib (compress, decompress)
|
||||
import Control.Monad.Except (MonadError(..))
|
||||
import Control.Monad.Reader (MonadReader(..))
|
||||
import Control.Monad.Fail (MonadFail(..))
|
||||
import Control.Monad.State (MonadState)
|
||||
import Data.ByteString (ByteString)
|
||||
import qualified Data.ByteString as BS (length)
|
||||
import qualified Data.ByteString.Lazy as Lazy (ByteString, fromStrict, toStrict)
|
||||
import Data.Id (Id, at)
|
||||
import qualified Data.Map as Map (lookup)
|
||||
import PDF.Box (Box(..))
|
||||
import Data.Id (Id)
|
||||
import qualified Data.Id as Id (at)
|
||||
import qualified Data.Map as Map (adjust, insert, lookup)
|
||||
import PDF.Box (Box(..), at, edit{-, runRO-})
|
||||
import PDF.Layer (Layer(..))
|
||||
import PDF.Object (
|
||||
Dictionary, DirectObject(..), IndirectObjCoordinates(..)
|
||||
, Name(..), Object(..), Structure(..)
|
||||
, Name(..), Number(..), Object(..), Structure(..)
|
||||
)
|
||||
import Prelude hiding (fail)
|
||||
import Text.Printf (printf)
|
||||
|
||||
type PDFContent m = (MonadReader Layer m, MonadFail m)
|
||||
newtype PPath = PPath [Component]
|
||||
data DPath = DPath {
|
||||
root :: Id Object
|
||||
, offset :: [Component]
|
||||
} deriving Show
|
||||
|
||||
push :: Component -> DPath -> DPath
|
||||
push component dPath = dPath {offset = (offset dPath) ++ [component]}
|
||||
|
||||
data Nav a = Nav {
|
||||
dPath :: DPath
|
||||
, value :: a
|
||||
} deriving (Functor)
|
||||
|
||||
instance Show a => Show (Nav a) where
|
||||
show (Nav {dPath, value}) = "Nav {dPath = " ++ show dPath ++ ", value = " ++ show value ++ "}"
|
||||
|
||||
type ROLayer m = (MonadReader Layer m, MonadError String m)
|
||||
type RWLayer m = (MonadState Layer m, MonadError String m)
|
||||
type Component = String
|
||||
|
||||
getDictionary :: PDFContent m => Object -> m Dictionary
|
||||
getDictionary (Direct (Dictionary aDict)) = return aDict
|
||||
getDictionary (Direct (Reference ref)) =
|
||||
objectById (objectId ref) >>= getDictionary
|
||||
getDictionary (Stream {header}) = return header
|
||||
getDictionary obj = expected "dictionary : " obj
|
||||
getDictionary :: ROLayer m => Nav Object -> m (Nav Dictionary)
|
||||
getDictionary (Nav {dPath, value}) =
|
||||
case value of
|
||||
(Direct (Dictionary aDict)) -> return $ Nav {dPath, value = aDict}
|
||||
(Direct (Reference ref)) -> objectById (objectId ref) >>= getDictionary
|
||||
(Stream {header}) -> return $ Nav {dPath, value = header}
|
||||
obj -> expected "dictionary : " obj
|
||||
|
||||
expected :: (MonadFail m, Show a) => String -> a -> m b
|
||||
expected name = fail . printf "Not a %s: %s" name . show
|
||||
expected :: (MonadError String m, Show a) => String -> a -> m b
|
||||
expected name = throwError . printf "Not a %s: %s" name . show
|
||||
|
||||
getKey :: PDFContent m => String -> Object -> m DirectObject
|
||||
getKey key object = getDictionary object >>= catchMaybe . Map.lookup (Name key)
|
||||
getKey :: ROLayer m => String -> Nav Object -> m (Nav DirectObject)
|
||||
getKey key navObject = getDictionary navObject >>= f
|
||||
where
|
||||
errorMessage =
|
||||
printf "Key %s not found in object %s" key (show object)
|
||||
catchMaybe = maybe (fail errorMessage) return
|
||||
printf "Key %s not found in object %s" key (show navObject)
|
||||
f (Nav {dPath, value}) =
|
||||
case Map.lookup (Name key) value of
|
||||
Nothing -> throwError errorMessage
|
||||
Just dObj -> return $ Nav {dPath = push key dPath, value = dObj}
|
||||
|
||||
objectById :: PDFContent m => (Id Object) -> m Object
|
||||
objectById :: ROLayer m => (Id Object) -> m (Nav Object)
|
||||
objectById objectId = do
|
||||
layer <- ask
|
||||
return (objects layer `at` objectId)
|
||||
return $ Nav {
|
||||
dPath = DPath {root = objectId, offset = []}
|
||||
, value = objects layer `Id.at` objectId
|
||||
}
|
||||
|
||||
(./) :: PDFContent m => m Object -> Component -> m Object
|
||||
(./) object key = (object >>= getKey key >>= castObject)
|
||||
|
||||
castObject :: PDFContent m => DirectObject -> m Object
|
||||
castObject (Reference (IndirectObjCoordinates {objectId})) =
|
||||
castObject :: ROLayer m => Nav DirectObject -> m (Nav Object)
|
||||
castObject (Nav {value = !(Reference (IndirectObjCoordinates {objectId}))}) =
|
||||
objectById objectId
|
||||
castObject directObject = return $ Direct directObject
|
||||
castObject (Nav {dPath, value}) = return $ Nav {dPath, value = Direct value}
|
||||
|
||||
(//) :: PDFContent m => m Object -> [Component] -> m Object
|
||||
(//) object [] = object
|
||||
(//) object (key:keys) = object ./ key // keys
|
||||
(./) :: ROLayer m => m (Nav Object) -> Component -> m (Nav Object)
|
||||
(./) navObject key = (navObject >>= getKey key >>= castObject)
|
||||
|
||||
(>./) :: PDFContent m => Object -> Component -> m Object
|
||||
(>./) object = (return object ./)
|
||||
(//) :: ROLayer m => m (Nav Object) -> PPath -> m (Nav Object)
|
||||
(//) navObject (PPath []) = navObject
|
||||
(//) navObject (PPath (key:keys)) = navObject ./ key // (PPath keys)
|
||||
|
||||
(>//) :: PDFContent m => Object -> [Component] -> m Object
|
||||
(>//) object = (return object //)
|
||||
(>./) :: ROLayer m => Nav Object -> Component -> m (Nav Object)
|
||||
(>./) navObject = (return navObject ./)
|
||||
|
||||
catalog :: PDFContent m => m Object
|
||||
catalog = Direct . Dictionary . trailer . docStructure <$> ask
|
||||
(>//) :: ROLayer m => Nav Object -> PPath -> m (Nav Object)
|
||||
(>//) navObject = (return navObject //)
|
||||
|
||||
data Clear = Clear
|
||||
data Raw = Raw
|
||||
catalog :: ROLayer m => m (Nav Object)
|
||||
catalog = do
|
||||
value <- Direct . Dictionary . trailer . docStructure <$> ask
|
||||
return $ Nav {dPath = undefined, value}
|
||||
|
||||
setAt :: [Component] -> DirectObject -> Dictionary -> Dictionary
|
||||
setAt [] _ dict = dict
|
||||
setAt [component] directObject dict =
|
||||
Map.insert (Name component) directObject dict
|
||||
setAt (component:components) directObject dict =
|
||||
Map.adjust setDirObj (Name component) dict
|
||||
where
|
||||
setDirObj (Dictionary subDict) =
|
||||
Dictionary $ setAt components directObject subDict
|
||||
setDirObj x = x
|
||||
|
||||
save :: RWLayer m => Nav Object -> m ()
|
||||
save nav@(Nav {dPath, value = Direct dObj}) =
|
||||
edit .at (root dPath) $ return . setObj
|
||||
where
|
||||
setObj obj@(Stream {header}) =
|
||||
obj {header = setAt (offset dPath) dObj header}
|
||||
setObj (Direct (Dictionary dict)) =
|
||||
Direct . Dictionary $ setAt (offset dPath) dObj dict
|
||||
setObj _ = value nav
|
||||
save (Nav {dPath = DPath {root, offset = []}, value}) = edit $ w root value
|
||||
save _ = throwError "Streams can't be properties of PDF objects"
|
||||
|
||||
data StreamContent = Clear | Raw
|
||||
|
||||
onLazy :: (Lazy.ByteString -> Lazy.ByteString) -> ByteString -> ByteString
|
||||
onLazy f = Lazy.toStrict . f . Lazy.fromStrict
|
||||
|
||||
contains :: String -> DirectObject -> Bool
|
||||
contains needle (NameObject (Name n)) = needle == n
|
||||
contains needle (Array directObjects) = oneOf directObjects (contains needle)
|
||||
contains needle !(NameObject (Name n)) = needle == n
|
||||
contains needle !(Array directObjects) = oneOf directObjects (contains needle)
|
||||
where
|
||||
oneOf [] _ = False
|
||||
oneOf (x:xs) p = p x || oneOf xs p
|
||||
contains _ _ = False
|
||||
|
||||
instance MonadFail m => Box m Clear Object ByteString where
|
||||
instance MonadError String m => Box m StreamContent (Nav Object) ByteString where
|
||||
r sc = r sc . value
|
||||
w sc newStreamContent nav = setValue <$> w sc newStreamContent (value nav)
|
||||
where
|
||||
setValue value = nav {value}
|
||||
|
||||
instance MonadError String m => Box m StreamContent Object ByteString where
|
||||
r Raw (Stream {streamContent}) = return streamContent
|
||||
r Clear (Stream {header, streamContent}) = return $
|
||||
case Map.lookup (Name "Filter") header of
|
||||
Just directObject
|
||||
|
@ -102,17 +166,15 @@ instance MonadFail m => Box m Clear Object ByteString where
|
|||
_ -> streamContent
|
||||
r _ obj = expected "stream" obj
|
||||
|
||||
w Clear streamContent obj@(Stream {header}) = return $
|
||||
case Map.lookup (Name "Filter") header of
|
||||
Just directObject
|
||||
| contains "FlateDecode" directObject ->
|
||||
obj {streamContent = onLazy compress streamContent}
|
||||
_ -> obj {streamContent}
|
||||
w _ _ obj = expected "stream" obj
|
||||
|
||||
instance MonadFail m => Box m Raw Object ByteString where
|
||||
r Raw (Stream {streamContent}) = return streamContent
|
||||
r _ obj = expected "stream" obj
|
||||
|
||||
w Raw streamContent obj@(Stream {}) = return $ obj {streamContent}
|
||||
w Clear newStreamContent (Stream {header}) =
|
||||
let streamContent = getStreamContent (Map.lookup (Name "Filter") header) in
|
||||
return $ Stream {header = fixLength streamContent, streamContent}
|
||||
where
|
||||
getStreamContent (Just directObject)
|
||||
| contains "FlateDecode" directObject = onLazy compress newStreamContent
|
||||
getStreamContent _ = newStreamContent
|
||||
fixLength sc =
|
||||
let newLength = NumberObject . Number . fromIntegral $ BS.length sc in
|
||||
Map.insert (Name "Length") newLength header
|
||||
w _ _ obj = expected "stream" obj
|
||||
|
|
|
@ -55,9 +55,8 @@ saveOffset resource = OContext $
|
|||
lift :: (a -> Builder) -> a -> OBuilder
|
||||
lift f a = return (f a)
|
||||
|
||||
getOffsets :: OBuilder -> OContext (OBuilder, Map Resource Offset)
|
||||
getOffsets (OContext builder) =
|
||||
OContext (listen builder >>= \(a, w) -> return (return a, w))
|
||||
getOffsets :: OBuilder -> OContext (Builder, Map Resource Offset)
|
||||
getOffsets (OContext builder) = OContext $ listen builder
|
||||
|
||||
append :: OBuilder -> OBuilder -> OBuilder
|
||||
append (OContext a) (OContext b) = OContext (mappend <$> a <*> b)
|
||||
|
|
143
src/PDF/Pages.hs
143
src/PDF/Pages.hs
|
@ -13,63 +13,62 @@ module PDF.Pages (
|
|||
, withResources
|
||||
) where
|
||||
|
||||
import Control.Applicative (Alternative, (<|>))
|
||||
import Control.Applicative (Alternative)
|
||||
import Control.Applicative ((<|>))
|
||||
import Control.Monad (foldM)
|
||||
import Control.Monad.Fail (MonadFail(..))
|
||||
import Control.Monad.Except (MonadError(..))
|
||||
import Control.Monad.Reader (ReaderT, runReaderT)
|
||||
import Control.Monad.State (StateT(..), evalStateT, execStateT, gets, modify)
|
||||
import Control.Monad.Trans (lift)
|
||||
import Data.ByteString (ByteString)
|
||||
import Data.ByteString.Lazy (toStrict)
|
||||
import Data.Id (Id, IdMap)
|
||||
import Data.Id (Id(..), IdMap)
|
||||
import qualified Data.Id as Id (empty, insert, lookup)
|
||||
import Data.Map (Map)
|
||||
import qualified Data.Map as Map (empty, fromList, insert, toList)
|
||||
import qualified Data.Map as Map (empty, elems, fromList, insert, toList)
|
||||
import Data.Maybe (listToMaybe)
|
||||
import Data.OrderedMap (OrderedMap, build, mapi)
|
||||
import PDF.Box (Box(..), at, edit)
|
||||
import PDF.CMap (cMap)
|
||||
import PDF.Box (Box(..), at, edit, runRO)
|
||||
import qualified PDF.CMap as CMap (parse)
|
||||
import PDF.Content (Content(..))
|
||||
import qualified PDF.Content as Content (parse)
|
||||
import PDF.Encoding (encoding)
|
||||
import PDF.EOL (Style(..))
|
||||
import PDF.Font (Font, FontSet)
|
||||
import PDF.Layer (Layer(..), Objects(..))
|
||||
import PDF.Layer (Layer(..), LayerReader)
|
||||
import PDF.Object (
|
||||
Dictionary, DirectObject(..), IndirectObjCoordinates(..)
|
||||
, Name(..), Object(..)
|
||||
, Name(..), Number(..), Object(..)
|
||||
,)
|
||||
import PDF.Object.Navigation (
|
||||
Clear(..), (//), (>./), (>//), getDictionary
|
||||
, getKey, objectById, origin
|
||||
Nav(..), PPath(..), ROLayer, RWLayer, StreamContent(..), (//), (>./)
|
||||
, (>//), catalog, getDictionary, getKey, objectById, save
|
||||
)
|
||||
import PDF.Output (render)
|
||||
import Prelude hiding (fail)
|
||||
import Text.Printf (printf)
|
||||
|
||||
type Except m = (Alternative m, MonadFail m)
|
||||
type InLayer m = ReaderT Layer m
|
||||
|
||||
type CachedFonts = IdMap Object Font
|
||||
type FontCache m = StateT CachedFonts (InLayer m)
|
||||
type FontCache m = StateT CachedFonts m
|
||||
data Page = Page {
|
||||
contents :: OrderedMap (Id Object) Content
|
||||
byteContents :: OrderedMap (Id Object) ByteString
|
||||
, resources :: Dictionary
|
||||
, source :: (Id Object)
|
||||
}
|
||||
|
||||
loadContents :: Except m => DirectObject -> InLayer m (OrderedMap (Id Object) Content)
|
||||
loadContents directObject =
|
||||
sequenceA . build loadContent $ objectIds directObject
|
||||
loadByteContents :: ROLayer m => DirectObject -> m (OrderedMap (Id Object) ByteString)
|
||||
loadByteContents directObject = do
|
||||
objs <- sequence . build objectById $ objectIds directObject
|
||||
mapM (r Clear) objs
|
||||
where
|
||||
loadContent :: Except m => (Id Object) -> InLayer m Content
|
||||
loadContent objectId = objectById objectId >>= r Clear >>= Content.parse
|
||||
objectIds (Array l) = l >>= getReference
|
||||
objectIds dirObj = getReference dirObj
|
||||
|
||||
getFontDictionary :: Except m => Object -> InLayer m Dictionary
|
||||
getFontDictionary :: (Alternative m, ROLayer m) => Nav Object -> m Dictionary
|
||||
getFontDictionary pageObj =
|
||||
(pageObj >// ["Resources", "Font"] >>= getDictionary) <|> return Map.empty
|
||||
(pageObj >// PPath ["Resources", "Font"] >>= fmap value . getDictionary)
|
||||
<|> return Map.empty
|
||||
|
||||
cache :: Except m => ((Id Object) -> FontCache m Font) -> (Id Object) -> FontCache m Font
|
||||
cache :: ROLayer m => (Id Object -> FontCache m Font) -> Id Object -> FontCache m Font
|
||||
cache loader objectId =
|
||||
gets (Id.lookup objectId) >>= maybe load return
|
||||
where
|
||||
|
@ -78,81 +77,105 @@ cache loader objectId =
|
|||
modify $ Id.insert objectId value
|
||||
return value
|
||||
|
||||
loadFont :: Except m => (Id Object) -> FontCache m Font
|
||||
loadFont :: (Alternative m, ROLayer m) => Id Object -> FontCache m Font
|
||||
loadFont objectId = lift $ objectById objectId >>= tryMappings
|
||||
where
|
||||
tryMappings object =
|
||||
(object >./ "ToUnicode" >>= r Clear >>= cMap)
|
||||
<|> (object >./ "Encoding" >>= loadEncoding)
|
||||
<|> (fail $ unknownFormat (show objectId) (show object))
|
||||
(object >./ "ToUnicode" >>= r Clear >>= CMap.parse)
|
||||
<|> (object >./ "Encoding" >>= loadEncoding . value)
|
||||
<|> (throwError $ unknownFormat (show objectId) (show object))
|
||||
unknownFormat = printf "Unknown font format for object #%s : %s"
|
||||
loadEncoding :: MonadFail m => Object -> m Font
|
||||
loadEncoding :: MonadError String m => Object -> m Font
|
||||
loadEncoding (Direct (NameObject (Name name))) = encoding name
|
||||
loadEncoding object =
|
||||
fail $ printf "Encoding must be a name, not that : %s" $ show object
|
||||
throwError $ printf "Encoding must be a name, not that : %s" $ show object
|
||||
|
||||
loadResources :: Except m => Dictionary -> FontCache m FontSet
|
||||
loadResources :: (Alternative m, ROLayer m) => Dictionary -> FontCache m FontSet
|
||||
loadResources = foldM addFont Map.empty . Map.toList
|
||||
where
|
||||
addFont output (name, Reference (IndirectObjCoordinates {objectId})) =
|
||||
flip (Map.insert name) output <$> cache loadFont objectId
|
||||
addFont output _ = return output
|
||||
|
||||
getReference :: DirectObject -> [(Id Object)]
|
||||
getReference :: DirectObject -> [Id Object]
|
||||
getReference (Reference (IndirectObjCoordinates {objectId})) = [objectId]
|
||||
getReference _ = []
|
||||
|
||||
loadPage :: Except m => (Id Object) -> InLayer m Page
|
||||
loadPage :: (Alternative m, ROLayer m) => Id Object -> m Page
|
||||
loadPage source = do
|
||||
page <- objectById source
|
||||
contents <- getKey "Contents" page >>= loadContents
|
||||
byteContents <- loadByteContents . value =<< getKey "Contents" page
|
||||
resources <- getFontDictionary page
|
||||
return $ Page {contents, resources, source}
|
||||
return $ Page {byteContents, resources, source}
|
||||
|
||||
pagesList :: Except m => InLayer m [(Id Object)]
|
||||
pagesList :: (Alternative m, ROLayer m) => m [Id Object]
|
||||
pagesList =
|
||||
(origin // ["Root", "Pages"] >>= getKey "Kids" >>= return . getReferences)
|
||||
(catalog // PPath ["Root", "Pages", "Kids"] >>= getReferences . value)
|
||||
<|> return []
|
||||
where
|
||||
getReferences (Array kids) = kids >>= getReference
|
||||
getReferences _ = fail "Not a pages array"
|
||||
getReferences (Direct (Array kids)) = return $ getReference =<< kids
|
||||
getReferences _ = throwError "Not a pages array"
|
||||
|
||||
updatePage :: MonadFail m => Page -> StateT Layer m ()
|
||||
updatePage (Page {contents}) = sequence_ $ mapi updateContent contents
|
||||
editPagesList :: RWLayer m => ([DirectObject] -> [DirectObject]) -> m ()
|
||||
editPagesList f = do
|
||||
pages <- runRO (catalog // PPath ["Root", "Pages"])
|
||||
kids <- runRO (pages >./ "Kids")
|
||||
count <- runRO (pages >./ "Count")
|
||||
(newSize, newKids) <- editKids (value kids)
|
||||
save $ kids {value = newKids}
|
||||
save $ count {value = Direct $ NumberObject newSize}
|
||||
where
|
||||
updateContent source content =
|
||||
edit . at Objects . at source . at Clear $ setContent content
|
||||
setContent content _ = return . toStrict $ render LF content
|
||||
editKids (Direct (Array pageRefs)) =
|
||||
let result = f pageRefs in
|
||||
return (Number . fromIntegral $ length result, Direct $ Array result)
|
||||
editKids _ = throwError "Invalid format for Root.Pages.Kids (not an array)"
|
||||
|
||||
updatePage :: RWLayer m => Page -> m ()
|
||||
updatePage (Page {byteContents}) = sequence_ $ mapi updateByteContent byteContents
|
||||
where
|
||||
updateByteContent source byteContent =
|
||||
edit .at source .at Clear $ \_ -> return byteContent
|
||||
|
||||
data Pages = Pages
|
||||
newtype PageNumber = P Int
|
||||
data Contents = Contents
|
||||
|
||||
instance (Alternative m, MonadFail m) => Box m Pages Layer (Map Int Page) where
|
||||
r Pages layer = runReaderT (numbered <$> (mapM loadPage =<< pagesList)) layer
|
||||
instance (Alternative m, MonadError String m) => Box m Pages Layer (Map Int Page) where
|
||||
r Pages = runReaderT (numbered <$> pagesList >>= mapM loadPage)
|
||||
where
|
||||
numbered :: [Page] -> Map Int Page
|
||||
numbered :: [Id Object] -> Map Int (Id Object)
|
||||
numbered = Map.fromList . zip [1..]
|
||||
|
||||
w Pages pages = execStateT $ mapM_ updatePage pages
|
||||
|
||||
instance (Alternative m, MonadFail m) => Box m PageNumber Layer Page where
|
||||
r (P i) layer
|
||||
| i < 1 = fail "Pages start at 1"
|
||||
| otherwise = runReaderT (drop (i - 1) <$> pagesList >>= firstPage) layer
|
||||
w Pages pages = execStateT $ do
|
||||
mapM_ updatePage pages
|
||||
setPagesList $ Map.elems (source <$> pages)
|
||||
where
|
||||
firstPage [] = fail "Page is out of bounds"
|
||||
firstPage (p:_) = loadPage p
|
||||
setPagesList =
|
||||
editPagesList . const . fmap (Reference . flip IndirectObjCoordinates 0)
|
||||
|
||||
w _ page = execStateT $ updatePage page
|
||||
instance (Alternative m, MonadError String m) => Box m PageNumber Layer Page where
|
||||
r (P p) layer
|
||||
| p < 1 = throwError "Pages start at 1"
|
||||
| otherwise = runReaderT (drop (p - 1) <$> pagesList >>= firstPage) layer
|
||||
where
|
||||
firstPage =
|
||||
maybe (throwError "Page is out of bounds") loadPage . listToMaybe
|
||||
|
||||
w (P p) page = execStateT $ do
|
||||
updatePage page
|
||||
editPagesList $ setPage (Reference $ IndirectObjCoordinates (source page) 0)
|
||||
where
|
||||
setPage ref l = take (p-1) l ++ ref : drop p l
|
||||
|
||||
instance Monad m => Box m Contents Page (OrderedMap (Id Object) Content) where
|
||||
r Contents = return . contents
|
||||
w _ contents page = return $ page {contents}
|
||||
r Contents = return . fmap Content.parse . byteContents
|
||||
w Contents contents page = return $ page {byteContents}
|
||||
where
|
||||
byteContents = toStrict . render LF <$> contents
|
||||
|
||||
withFonts :: Monad m => (Layer -> FontCache m a) -> Layer -> m a
|
||||
withFonts :: MonadError String m => (Layer -> FontCache (LayerReader m) a) -> Layer -> m a
|
||||
withFonts f layer = runReaderT (evalStateT (f layer) Id.empty) layer
|
||||
|
||||
withResources :: Except m => (Page -> ReaderT FontSet m b) -> Page -> FontCache m b
|
||||
withResources :: (Alternative m, MonadError String m) => (Page -> ReaderT FontSet m b) -> Page -> FontCache (LayerReader m) b
|
||||
withResources f p =
|
||||
loadResources (resources p) >>= lift . lift . runReaderT (f p)
|
||||
|
|
Loading…
Reference in a new issue