Make a couple improvements in performance + add an example script to extract pages from a PDF
This commit is contained in:
parent
f6664683c7
commit
d9f69014a0
17 changed files with 448 additions and 258 deletions
|
@ -59,7 +59,7 @@ executable equivalent
|
||||||
build-depends: base
|
build-depends: base
|
||||||
, bytestring
|
, bytestring
|
||||||
, Hufflepdf
|
, Hufflepdf
|
||||||
ghc-options: -Wall
|
ghc-options: -Wall -rtsopts
|
||||||
default-language: Haskell2010
|
default-language: Haskell2010
|
||||||
|
|
||||||
executable getObj
|
executable getObj
|
||||||
|
@ -69,7 +69,7 @@ executable getObj
|
||||||
, containers
|
, containers
|
||||||
, Hufflepdf
|
, Hufflepdf
|
||||||
, mtl
|
, mtl
|
||||||
ghc-options: -Wall
|
ghc-options: -Wall -rtsopts
|
||||||
default-language: Haskell2010
|
default-language: Haskell2010
|
||||||
|
|
||||||
executable getText
|
executable getText
|
||||||
|
@ -77,9 +77,34 @@ executable getText
|
||||||
build-depends: base
|
build-depends: base
|
||||||
, bytestring
|
, bytestring
|
||||||
, containers
|
, containers
|
||||||
|
, ExceptIOH
|
||||||
, Hufflepdf
|
, Hufflepdf
|
||||||
|
, mtl
|
||||||
, text
|
, text
|
||||||
ghc-options: -Wall
|
ghc-options: -Wall -rtsopts
|
||||||
|
default-language: Haskell2010
|
||||||
|
|
||||||
|
executable pdfCut
|
||||||
|
main-is: examples/pdfCut.hs
|
||||||
|
build-depends: base
|
||||||
|
, bytestring
|
||||||
|
, containers
|
||||||
|
, ExceptIOH
|
||||||
|
, filepath
|
||||||
|
, Hufflepdf
|
||||||
|
, mtl
|
||||||
|
, text
|
||||||
|
ghc-options: -Wall -rtsopts
|
||||||
|
default-language: Haskell2010
|
||||||
|
|
||||||
|
executable fixMermoz
|
||||||
|
main-is: examples/fixMermoz.hs
|
||||||
|
build-depends: base
|
||||||
|
, bytestring
|
||||||
|
, ExceptIOH
|
||||||
|
, Hufflepdf
|
||||||
|
, mtl
|
||||||
|
ghc-options: -Wall -rtsopts
|
||||||
default-language: Haskell2010
|
default-language: Haskell2010
|
||||||
|
|
||||||
Test-Suite unitTests
|
Test-Suite unitTests
|
||||||
|
|
|
@ -1,8 +1,6 @@
|
||||||
{-# LANGUAGE NamedFieldPuns #-}
|
{-# LANGUAGE NamedFieldPuns #-}
|
||||||
{-# LANGUAGE FlexibleContexts #-}
|
{-# LANGUAGE FlexibleContexts #-}
|
||||||
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
|
|
||||||
|
|
||||||
import Control.Monad.Fail (MonadFail(..))
|
|
||||||
import Control.Monad.Reader (ReaderT, runReaderT)
|
import Control.Monad.Reader (ReaderT, runReaderT)
|
||||||
import Data.ByteString.Lazy.Char8 (ByteString)
|
import Data.ByteString.Lazy.Char8 (ByteString)
|
||||||
import qualified Data.ByteString.Char8 as BS (readFile)
|
import qualified Data.ByteString.Char8 as BS (readFile)
|
||||||
|
@ -13,7 +11,7 @@ import PDF.Box (Box(..))
|
||||||
import PDF.Layer (Layer(..), unify)
|
import PDF.Layer (Layer(..), unify)
|
||||||
import PDF.Object (Object(..))
|
import PDF.Object (Object(..))
|
||||||
import PDF.Object.Navigation (
|
import PDF.Object.Navigation (
|
||||||
Clear(..), Raw(..), (//), objectById, catalog
|
Nav(..), PPath(..), StreamContent(..), (//), objectById, catalog
|
||||||
)
|
)
|
||||||
import PDF.Output (Output)
|
import PDF.Output (Output)
|
||||||
import qualified PDF.Output as Output (render)
|
import qualified PDF.Output as Output (render)
|
||||||
|
@ -23,33 +21,27 @@ import System.Exit (die)
|
||||||
import Text.Printf (printf)
|
import Text.Printf (printf)
|
||||||
import Text.Read (readMaybe)
|
import Text.Read (readMaybe)
|
||||||
|
|
||||||
newtype Error a = Error {
|
|
||||||
runError :: Either String a
|
|
||||||
} deriving (Functor, Applicative, Monad)
|
|
||||||
instance MonadFail Error where
|
|
||||||
fail = Error . Left
|
|
||||||
|
|
||||||
decodedStream :: Object -> Object
|
decodedStream :: Object -> Object
|
||||||
decodedStream object =
|
decodedStream object =
|
||||||
maybe object id $ r Clear object >>= flip (w Raw) object
|
either (const object) id $ r Clear object >>= flip (w Raw) object
|
||||||
|
|
||||||
display :: Output a => ReaderT Layer Error a -> Document -> Either String ByteString
|
display :: Functor m => Output a => ReaderT Layer m a -> Document -> m ByteString
|
||||||
display getter (Document {eolStyle, layers}) =
|
display getter (Document {eolStyle, layers}) =
|
||||||
Output.render eolStyle <$> runError (runReaderT getter (unify layers))
|
Output.render eolStyle <$> runReaderT getter (unify layers)
|
||||||
|
|
||||||
parse :: [String] -> IO (FilePath, Document -> Either String ByteString)
|
parse :: [String] -> IO (FilePath, Document -> Either String ByteString)
|
||||||
parse [inputFile] = return (inputFile, display catalog)
|
parse [inputFile] = return (inputFile, display $ value <$> catalog)
|
||||||
parse [inputFile, key] =
|
parse [inputFile, key] =
|
||||||
return (inputFile, clear . maybe (byPath key) byId $ readMaybe key)
|
return (inputFile, clear . maybe (byPath key) byId $ readMaybe key)
|
||||||
where
|
where
|
||||||
byId = objectById . Id
|
byId = objectById . Id
|
||||||
byPath path = catalog // (explode path)
|
byPath path = (catalog // PPath (explode path))
|
||||||
explode "" = []
|
explode "" = []
|
||||||
explode path =
|
explode path =
|
||||||
case break (== '.') path of
|
case break (== '.') path of
|
||||||
(name, "") -> [name]
|
(name, "") -> [name]
|
||||||
(name, rest) -> name : explode (drop 1 rest)
|
(name, rest) -> name : explode (drop 1 rest)
|
||||||
clear = display . fmap decodedStream
|
clear = display . fmap (decodedStream . value)
|
||||||
parse _ =
|
parse _ =
|
||||||
die . printf "Syntax: %s inputFile [OBJECT_ID | PATH_TO_OBJ]\n" =<< getProgName
|
die . printf "Syntax: %s inputFile [OBJECT_ID | PATH_TO_OBJ]\n" =<< getProgName
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,7 @@
|
||||||
{-# LANGUAGE FlexibleContexts #-}
|
{-# LANGUAGE FlexibleContexts #-}
|
||||||
import Control.Monad ((>=>))
|
import Control.Monad ((>=>))
|
||||||
|
import Control.Monad.Except (ExceptT(..))
|
||||||
|
import Control.Monad.Except.IOH (handle)
|
||||||
import Control.Monad.IO.Class (liftIO)
|
import Control.Monad.IO.Class (liftIO)
|
||||||
import qualified Data.ByteString.Char8 as BS (readFile)
|
import qualified Data.ByteString.Char8 as BS (readFile)
|
||||||
import Data.Id (Id(..), mapWithKey)
|
import Data.Id (Id(..), mapWithKey)
|
||||||
|
@ -9,7 +11,7 @@ import qualified Data.Text as Text (unpack)
|
||||||
import PDF (UnifiedLayers(..), parseDocument)
|
import PDF (UnifiedLayers(..), parseDocument)
|
||||||
import PDF.Box (Box(..))
|
import PDF.Box (Box(..))
|
||||||
import PDF.Content.Text (Chunks(..))
|
import PDF.Content.Text (Chunks(..))
|
||||||
import PDF.Layer (Layer)
|
import PDF.Layer (Layer, LayerReader)
|
||||||
import PDF.Pages (
|
import PDF.Pages (
|
||||||
Contents(..), FontCache, Page(..), PageNumber(..), Pages(..), withFonts
|
Contents(..), FontCache, Page(..), PageNumber(..), Pages(..), withFonts
|
||||||
, withResources
|
, withResources
|
||||||
|
@ -19,7 +21,7 @@ import System.Exit (die)
|
||||||
import System.IO (BufferMode(..), hSetBuffering, stdout)
|
import System.IO (BufferMode(..), hSetBuffering, stdout)
|
||||||
import Text.Printf (printf)
|
import Text.Printf (printf)
|
||||||
|
|
||||||
displayPage :: Int -> Page -> FontCache IO ()
|
displayPage :: Int -> Page -> FontCache (LayerReader (ExceptT String IO)) ()
|
||||||
displayPage n = withResources (
|
displayPage n = withResources (
|
||||||
r Contents
|
r Contents
|
||||||
>=> sequence_ . mapi (\objectId ->
|
>=> sequence_ . mapi (\objectId ->
|
||||||
|
@ -31,22 +33,22 @@ displayPage n = withResources (
|
||||||
liftIO . putStrLn $
|
liftIO . putStrLn $
|
||||||
printf "p#%d obj#%d instr#%d: %s" n (getId a) (getId b) (Text.unpack v)
|
printf "p#%d obj#%d instr#%d: %s" n (getId a) (getId b) (Text.unpack v)
|
||||||
|
|
||||||
getAll :: Layer -> IO ()
|
getAll :: Layer -> ExceptT String IO ()
|
||||||
getAll = withFonts $ r Pages >=> sequence_ . Map.mapWithKey displayPage
|
getAll = withFonts $ r Pages >=> sequence_ . Map.mapWithKey displayPage
|
||||||
|
|
||||||
get :: Int -> Layer -> IO ()
|
get :: Int -> Layer -> ExceptT String IO ()
|
||||||
get n = withFonts $ r (P n) >=> displayPage n
|
get n = withFonts $ r (P n) >=> displayPage n
|
||||||
|
|
||||||
onDoc :: FilePath -> (Layer -> IO ()) -> IO ()
|
onDoc :: FilePath -> (Layer -> ExceptT String IO ()) -> ExceptT String IO ()
|
||||||
onDoc inputFile f = do
|
onDoc inputFile f =
|
||||||
(parseDocument <$> BS.readFile inputFile)
|
ExceptT (parseDocument <$> BS.readFile inputFile) >>= r UnifiedLayers >>= f
|
||||||
>>= either die (r UnifiedLayers >=> f)
|
|
||||||
|
|
||||||
main :: IO ()
|
main :: IO ()
|
||||||
main = do
|
main = do
|
||||||
hSetBuffering stdout LineBuffering
|
hSetBuffering stdout LineBuffering
|
||||||
args <- getArgs
|
args <- getArgs
|
||||||
case args of
|
case args of
|
||||||
[inputFile] -> onDoc inputFile getAll
|
[inputFile] -> onDoc inputFile getAll `handle` die
|
||||||
[inputFile, pageNumber] -> onDoc inputFile (get $ read pageNumber)
|
[inputFile, pageNumber] ->
|
||||||
|
onDoc inputFile (get $ read pageNumber) `handle` die
|
||||||
_ -> die "Syntax: getText INPUT_FILE [PAGE_NUMBER]"
|
_ -> die "Syntax: getText INPUT_FILE [PAGE_NUMBER]"
|
||||||
|
|
41
examples/pdfCut.hs
Normal file
41
examples/pdfCut.hs
Normal file
|
@ -0,0 +1,41 @@
|
||||||
|
import Control.Monad.State (MonadState(..), evalStateT)
|
||||||
|
import Control.Monad.Except.IOH (handle)
|
||||||
|
import Control.Monad.Trans (lift)
|
||||||
|
import qualified Data.ByteString.Char8 as BS (readFile)
|
||||||
|
import qualified Data.ByteString.Lazy.Char8 as Lazy (writeFile)
|
||||||
|
import Data.Map ((!), fromSet)
|
||||||
|
import qualified Data.Set as Set (fromList)
|
||||||
|
import PDF (Document, UnifiedLayers(..), parseDocument, render)
|
||||||
|
import PDF.Box (at)
|
||||||
|
import PDF.Pages (Pages(..))
|
||||||
|
import System.Environment (getArgs)
|
||||||
|
import System.Exit (die)
|
||||||
|
import System.FilePath (replaceBaseName, takeBaseName)
|
||||||
|
import Text.Read (readEither)
|
||||||
|
|
||||||
|
parseRange :: String -> Either String (Int, Int)
|
||||||
|
parseRange = evalStateT $ do
|
||||||
|
from <- lift . readEither =<< state (break (== '-'))
|
||||||
|
get >>= makeRange from
|
||||||
|
where
|
||||||
|
makeRange from "" = return (from, from)
|
||||||
|
makeRange from (_:to) = (,) from <$> lift (readEither to)
|
||||||
|
|
||||||
|
cut :: (Int, Int) -> Document -> IO Document
|
||||||
|
cut (p1, p2) doc = ((
|
||||||
|
at UnifiedLayers .at Pages $ \pages ->
|
||||||
|
return $ fromSet (pages!) $ Set.fromList [pMin .. pMax]
|
||||||
|
) doc) `handle` die
|
||||||
|
where
|
||||||
|
(pMin, pMax) = (min p1 p2, max p1 p2)
|
||||||
|
|
||||||
|
main :: IO ()
|
||||||
|
main = do
|
||||||
|
[inputPath, inputRange] <- getArgs
|
||||||
|
range <- catchLeft $ parseRange inputRange
|
||||||
|
doc <- catchLeft =<< parseDocument <$> BS.readFile inputPath
|
||||||
|
Lazy.writeFile (outputPath inputPath inputRange) . render =<< cut range doc
|
||||||
|
where
|
||||||
|
catchLeft = either die return
|
||||||
|
outputPath fileName range =
|
||||||
|
replaceBaseName fileName (takeBaseName fileName ++ "_p" ++ range)
|
|
@ -14,7 +14,7 @@ module Data.ByteString.Char8.Util (
|
||||||
) where
|
) where
|
||||||
|
|
||||||
import Data.ByteString (ByteString, snoc)
|
import Data.ByteString (ByteString, snoc)
|
||||||
import qualified Data.ByteString as BS (empty, foldl, length, pack, singleton, splitAt)
|
import qualified Data.ByteString as BS (empty, foldl', length, pack, singleton, splitAt)
|
||||||
import qualified Data.ByteString.Char8 as Char8 (
|
import qualified Data.ByteString.Char8 as Char8 (
|
||||||
cons, drop, index, splitAt, take, uncons, unpack
|
cons, drop, index, splitAt, take, uncons, unpack
|
||||||
)
|
)
|
||||||
|
@ -43,7 +43,7 @@ intToB256 n
|
||||||
B256Int $ begining `snoc` (toEnum (n `mod` 0x100))
|
B256Int $ begining `snoc` (toEnum (n `mod` 0x100))
|
||||||
|
|
||||||
b256ToInt :: B256Int -> Int
|
b256ToInt :: B256Int -> Int
|
||||||
b256ToInt (B256Int n) = BS.foldl (\k w -> 0x100*k + fromEnum w) 0 n
|
b256ToInt (B256Int n) = BS.foldl' (\k w -> 0x100*k + fromEnum w) 0 n
|
||||||
|
|
||||||
toBytes :: Int -> Int -> ByteString
|
toBytes :: Int -> Int -> ByteString
|
||||||
toBytes 0 _ = BS.empty
|
toBytes 0 _ = BS.empty
|
||||||
|
|
|
@ -8,6 +8,7 @@ module Data.Id (
|
||||||
, at
|
, at
|
||||||
, delete
|
, delete
|
||||||
, empty
|
, empty
|
||||||
|
, filterWithKey
|
||||||
, fromList
|
, fromList
|
||||||
, insert
|
, insert
|
||||||
, keysSet
|
, keysSet
|
||||||
|
@ -21,11 +22,11 @@ module Data.Id (
|
||||||
, union
|
, union
|
||||||
) where
|
) where
|
||||||
|
|
||||||
import Control.Monad.State (MonadState, modify, gets)
|
import Control.Monad.State.Strict (MonadState, modify, gets)
|
||||||
import Data.IntMap (IntMap, (!))
|
import Data.IntMap (IntMap, (!))
|
||||||
import qualified Data.IntMap as IntMap (
|
import qualified Data.IntMap as IntMap (
|
||||||
delete, empty, fromList, keysSet, insert, lookup, mapWithKey, maxViewWithKey
|
delete, empty, filterWithKey, fromList, keysSet, insert, lookup
|
||||||
, member, minViewWithKey, size, union
|
, mapWithKey, maxViewWithKey, member, minViewWithKey, size, union
|
||||||
)
|
)
|
||||||
import Data.IntSet (IntSet)
|
import Data.IntSet (IntSet)
|
||||||
import Prelude hiding (lookup)
|
import Prelude hiding (lookup)
|
||||||
|
@ -74,6 +75,9 @@ union (IdMap intMap1) (IdMap intMap2) =
|
||||||
mapWithKey :: (Id a -> b -> c) -> IdMap a b -> IdMap a c
|
mapWithKey :: (Id a -> b -> c) -> IdMap a b -> IdMap a c
|
||||||
mapWithKey f (IdMap idMap) = IdMap {intMap = IntMap.mapWithKey (f . Id) idMap}
|
mapWithKey f (IdMap idMap) = IdMap {intMap = IntMap.mapWithKey (f . Id) idMap}
|
||||||
|
|
||||||
|
filterWithKey :: (Id a -> b -> Bool) -> IdMap a b -> IdMap a b
|
||||||
|
filterWithKey f = IdMap . IntMap.filterWithKey (f . Id) . intMap
|
||||||
|
|
||||||
fromList :: [(Id a, b)] -> IdMap a b
|
fromList :: [(Id a, b)] -> IdMap a b
|
||||||
fromList = IdMap . IntMap.fromList . fmap (\(key, b) -> (getId key, b))
|
fromList = IdMap . IntMap.fromList . fmap (\(key, b) -> (getId key, b))
|
||||||
|
|
||||||
|
|
|
@ -1,7 +1,6 @@
|
||||||
{-# LANGUAGE NamedFieldPuns #-}
|
{-# LANGUAGE NamedFieldPuns #-}
|
||||||
{-# LANGUAGE FunctionalDependencies #-}
|
{-# LANGUAGE FunctionalDependencies #-}
|
||||||
{-# LANGUAGE FlexibleInstances #-}
|
{-# LANGUAGE FlexibleInstances #-}
|
||||||
{-# LANGUAGE FlexibleContexts #-}
|
|
||||||
{-# LANGUAGE UndecidableInstances #-}
|
{-# LANGUAGE UndecidableInstances #-}
|
||||||
{-# LANGUAGE ScopedTypeVariables #-}
|
{-# LANGUAGE ScopedTypeVariables #-}
|
||||||
module PDF.Box (
|
module PDF.Box (
|
||||||
|
@ -17,8 +16,8 @@ module PDF.Box (
|
||||||
|
|
||||||
import Control.Monad.Except (ExceptT(..), runExceptT)
|
import Control.Monad.Except (ExceptT(..), runExceptT)
|
||||||
import Control.Monad.Fail (MonadFail(..))
|
import Control.Monad.Fail (MonadFail(..))
|
||||||
import Control.Monad.State (MonadState(..))
|
|
||||||
import Control.Monad.Reader (ReaderT, runReaderT)
|
import Control.Monad.Reader (ReaderT, runReaderT)
|
||||||
|
import Control.Monad.State (MonadState(..))
|
||||||
import Data.Id (Id, IdMap)
|
import Data.Id (Id, IdMap)
|
||||||
import qualified Data.Id as Id (insert, lookup)
|
import qualified Data.Id as Id (insert, lookup)
|
||||||
import Data.Map (Map)
|
import Data.Map (Map)
|
||||||
|
|
|
@ -1,15 +1,17 @@
|
||||||
{-# LANGUAGE NamedFieldPuns #-}
|
{-# LANGUAGE NamedFieldPuns #-}
|
||||||
{-# LANGUAGE OverloadedStrings #-}
|
{-# LANGUAGE OverloadedStrings #-}
|
||||||
|
{-# LANGUAGE FlexibleContexts #-}
|
||||||
module PDF.CMap (
|
module PDF.CMap (
|
||||||
CMap
|
CMap
|
||||||
, CMappers
|
, CMappers
|
||||||
, CRange(..)
|
, CRange(..)
|
||||||
, cMap
|
|
||||||
, matches
|
, matches
|
||||||
|
, parse
|
||||||
) where
|
) where
|
||||||
|
|
||||||
import Control.Applicative ((<|>), many)
|
import Control.Applicative ((<|>), many)
|
||||||
import Control.Monad.Fail (MonadFail(..))
|
import Control.Monad.Except (MonadError(..))
|
||||||
|
import Control.Monad.Fail (fail)
|
||||||
import Control.Monad.State (modify)
|
import Control.Monad.State (modify)
|
||||||
import Data.Attoparsec.ByteString.Char8 (count)
|
import Data.Attoparsec.ByteString.Char8 (count)
|
||||||
import Data.ByteString (ByteString)
|
import Data.ByteString (ByteString)
|
||||||
|
@ -18,6 +20,7 @@ import Data.ByteString.Char8 (unpack)
|
||||||
import Data.ByteString.Char8.Util (
|
import Data.ByteString.Char8.Util (
|
||||||
B16Int(..), b16ToBytes, b16ToInt, toBytes, utf16BEToutf8
|
B16Int(..), b16ToBytes, b16ToInt, toBytes, utf16BEToutf8
|
||||||
)
|
)
|
||||||
|
import Data.Foldable (foldl', foldr')
|
||||||
import Data.Map (Map, mapWithKey, union)
|
import Data.Map (Map, mapWithKey, union)
|
||||||
import qualified Data.Map as Map (
|
import qualified Data.Map as Map (
|
||||||
adjust, empty, fromList, insert, insertWith, lookup, toList
|
adjust, empty, fromList, insert, insertWith, lookup, toList
|
||||||
|
@ -75,7 +78,7 @@ encoder :: Map Size FromUnicode -> Encoder
|
||||||
encoder fromUnicodes input
|
encoder fromUnicodes input
|
||||||
| Text.null input = Right ""
|
| Text.null input = Right ""
|
||||||
| otherwise =
|
| otherwise =
|
||||||
foldl (<>) (Left "No encoding found") $ mapWithKey tryOn fromUnicodes
|
foldr' (<>) (Left "No encoding found") $ mapWithKey tryOn fromUnicodes
|
||||||
where
|
where
|
||||||
tryOn size fromUnicode =
|
tryOn size fromUnicode =
|
||||||
let (prefix, end) = Text.splitAt size input in
|
let (prefix, end) = Text.splitAt size input in
|
||||||
|
@ -87,8 +90,8 @@ matches :: ByteString -> CRange -> Bool
|
||||||
matches code (CRange {fromSequence, toSequence}) =
|
matches code (CRange {fromSequence, toSequence}) =
|
||||||
fromSequence <= code && code <= toSequence
|
fromSequence <= code && code <= toSequence
|
||||||
|
|
||||||
cMap :: MonadFail m => ByteString -> m Font
|
parse :: MonadError String m => ByteString -> m Font
|
||||||
cMap = either fail (return . toFont . snd) . runParser
|
parse = either throwError (return . toFont . snd) . runParser
|
||||||
(many (codeRanges <|> cMapRange <|> cMapChar <|> ignoredLine))
|
(many (codeRanges <|> cMapRange <|> cMapChar <|> ignoredLine))
|
||||||
emptyCMap
|
emptyCMap
|
||||||
where
|
where
|
||||||
|
@ -148,7 +151,7 @@ saveToUnicodeBySize assoc@((code, _):_) = Map.adjust insertCRange (BS.length cod
|
||||||
)
|
)
|
||||||
|
|
||||||
saveFromUnicodeBySize :: [(Text, ByteString)] -> Map Size FromUnicode -> Map Size FromUnicode
|
saveFromUnicodeBySize :: [(Text, ByteString)] -> Map Size FromUnicode -> Map Size FromUnicode
|
||||||
saveFromUnicodeBySize = flip (foldl insertFromUnicode)
|
saveFromUnicodeBySize = flip (foldl' insertFromUnicode)
|
||||||
where
|
where
|
||||||
insertFromUnicode :: Map Size FromUnicode -> (Text, ByteString) -> Map Size FromUnicode
|
insertFromUnicode :: Map Size FromUnicode -> (Text, ByteString) -> Map Size FromUnicode
|
||||||
insertFromUnicode tmpFromUnicodeBySize (unicodeSequence, code) =
|
insertFromUnicode tmpFromUnicodeBySize (unicodeSequence, code) =
|
||||||
|
|
|
@ -9,54 +9,77 @@ module PDF.Content (
|
||||||
Content(..)
|
Content(..)
|
||||||
, ContentUnit(..)
|
, ContentUnit(..)
|
||||||
, GraphicContextUnit(..)
|
, GraphicContextUnit(..)
|
||||||
|
, IdContentUnit
|
||||||
|
, IdGraphicContextUnit
|
||||||
|
, IdTextContext
|
||||||
, Instructions(..)
|
, Instructions(..)
|
||||||
, TextContext
|
, TextContext
|
||||||
, parse
|
, parse
|
||||||
) where
|
) where
|
||||||
|
|
||||||
import Control.Applicative ((<|>))
|
import Control.Applicative ((<|>))
|
||||||
import Control.Monad.Fail (MonadFail)
|
|
||||||
import Control.Monad.Reader (asks, runReader)
|
import Control.Monad.Reader (asks, runReader)
|
||||||
import Control.Monad.State (evalStateT, modify)
|
import Control.Monad.State.Strict (runState, evalStateT, modify)
|
||||||
import Data.Attoparsec.ByteString.Char8 (sepBy)
|
import Data.Attoparsec.ByteString.Char8 (sepBy)
|
||||||
|
import qualified Data.Attoparsec.ByteString.Char8 as Atto (Parser, parseOnly)
|
||||||
import Data.ByteString (ByteString)
|
import Data.ByteString (ByteString)
|
||||||
import Data.Id (Id(..), Indexed, at, empty, register)
|
import Data.Id (Id(..), Indexed, at, empty, register)
|
||||||
import PDF.Box (Box(..))
|
import PDF.Box (Box(..))
|
||||||
import PDF.Content.Operator (Instruction, operator)
|
import PDF.Content.Operator (Instruction, operator)
|
||||||
import PDF.Object (blank, directObject)
|
import PDF.Object (blank, directObject)
|
||||||
import PDF.Output (Output(..), line)
|
import PDF.Output (Output(..), line)
|
||||||
import PDF.Parser (Parser, runParser, string)
|
import PDF.Parser (string)
|
||||||
|
|
||||||
data Instructions = Instructions
|
data Instructions = Instructions
|
||||||
|
|
||||||
data GraphicContextUnit =
|
data GraphicContextUnit a =
|
||||||
GraphicInstruction (Id Instruction)
|
GraphicInstruction a
|
||||||
| ContentUnit ContentUnit
|
| ContentUnit (ContentUnit a)
|
||||||
deriving Show
|
deriving Show
|
||||||
type TextContext = [Id Instruction]
|
type TextContext a = [a]
|
||||||
data ContentUnit =
|
data ContentUnit a =
|
||||||
GraphicContext [GraphicContextUnit]
|
GraphicContext [GraphicContextUnit a]
|
||||||
| TextContext TextContext
|
| TextContext (TextContext a)
|
||||||
deriving Show
|
deriving Show
|
||||||
data Content = Content {
|
data Content = Content {
|
||||||
contentUnits :: [ContentUnit]
|
contentUnits :: [IdContentUnit]
|
||||||
, indexedInstructions :: Indexed Instruction
|
, indexedInstructions :: Indexed Instruction
|
||||||
|
, firstError :: Maybe String
|
||||||
} deriving Show
|
} deriving Show
|
||||||
|
|
||||||
type InstructionParser = Parser (Indexed Instruction)
|
type TmpContentUnit = ContentUnit Instruction
|
||||||
|
type TmpGraphicContextUnit = GraphicContextUnit Instruction
|
||||||
|
type TmpTextContext = TextContext Instruction
|
||||||
|
|
||||||
|
type IdContentUnit = ContentUnit (Id Instruction)
|
||||||
|
type IdGraphicContextUnit = GraphicContextUnit (Id Instruction)
|
||||||
|
type IdTextContext = TextContext (Id Instruction)
|
||||||
|
|
||||||
instance Monad m => Box m Instructions Content (Indexed Instruction) where
|
instance Monad m => Box m Instructions Content (Indexed Instruction) where
|
||||||
r Instructions = return . indexedInstructions
|
r Instructions = return . indexedInstructions
|
||||||
w Instructions indexedInstructions someContent =
|
w Instructions indexedInstructions someContent =
|
||||||
return $ someContent {indexedInstructions}
|
return $ someContent {indexedInstructions}
|
||||||
|
|
||||||
parse :: MonadFail m => ByteString -> m Content
|
parse :: ByteString -> Content
|
||||||
parse =
|
parse input =
|
||||||
either fail (return . uncurry Content) . runParser contentUnits empty
|
let result = Atto.parseOnly (contentUnit `sepBy` blank) input in
|
||||||
where
|
let (contentUnits, indexedInstructions) = either (const ([], empty)) buildContent result in
|
||||||
contentUnits = contentUnit `sepBy` blank
|
let firstError = either Just (const Nothing) result in
|
||||||
|
Content {contentUnits, indexedInstructions, firstError}
|
||||||
|
|
||||||
contentUnit :: InstructionParser ContentUnit
|
buildContent :: [TmpContentUnit] -> ([IdContentUnit], Indexed Instruction)
|
||||||
|
buildContent instructionContentUnits =
|
||||||
|
runState (mapM registerContentUnit instructionContentUnits) empty
|
||||||
|
where
|
||||||
|
registerContentUnit (GraphicContext gc) =
|
||||||
|
GraphicContext <$> (mapM registerGraphicContext gc)
|
||||||
|
registerContentUnit (TextContext tc) = TextContext <$> (mapM register tc)
|
||||||
|
registerGraphicContext (GraphicInstruction gi) =
|
||||||
|
GraphicInstruction <$> (register gi)
|
||||||
|
registerGraphicContext (ContentUnit cu) =
|
||||||
|
ContentUnit <$> (registerContentUnit cu)
|
||||||
|
|
||||||
|
contentUnit :: Atto.Parser TmpContentUnit
|
||||||
contentUnit =
|
contentUnit =
|
||||||
(GraphicContext <$> graphicContext)
|
(GraphicContext <$> graphicContext)
|
||||||
<|> (TextContext <$> textContext)
|
<|> (TextContext <$> textContext)
|
||||||
|
@ -64,18 +87,18 @@ contentUnit =
|
||||||
graphicContext =
|
graphicContext =
|
||||||
string "q" *> blank *> graphicContextUnit `sepBy` blank <* blank <* string "Q"
|
string "q" *> blank *> graphicContextUnit `sepBy` blank <* blank <* string "Q"
|
||||||
|
|
||||||
graphicContextUnit :: InstructionParser GraphicContextUnit
|
graphicContextUnit :: Atto.Parser TmpGraphicContextUnit
|
||||||
graphicContextUnit =
|
graphicContextUnit =
|
||||||
(GraphicInstruction <$> instruction)
|
(GraphicInstruction <$> instruction)
|
||||||
<|> (ContentUnit <$> contentUnit)
|
<|> (ContentUnit <$> contentUnit)
|
||||||
|
|
||||||
instruction :: InstructionParser (Id Instruction)
|
instruction :: Atto.Parser Instruction
|
||||||
instruction = evalStateT stackParser [] >>= register
|
instruction = evalStateT stackParser []
|
||||||
where
|
where
|
||||||
stackParser = ((directObject <* blank) >>= push) <|> operator
|
stackParser = ((directObject <* blank) >>= push) <|> operator
|
||||||
push arg = modify (arg:) *> stackParser
|
push arg = modify (arg:) *> stackParser
|
||||||
|
|
||||||
textContext :: InstructionParser TextContext
|
textContext :: Atto.Parser TmpTextContext
|
||||||
textContext =
|
textContext =
|
||||||
string "BT" *> blank *> instruction `sepBy` blank <* blank <* string "ET"
|
string "BT" *> blank *> instruction `sepBy` blank <* blank <* string "ET"
|
||||||
|
|
||||||
|
|
|
@ -1,24 +1,22 @@
|
||||||
{-# LANGUAGE NamedFieldPuns #-}
|
{-# LANGUAGE NamedFieldPuns #-}
|
||||||
{-# LANGUAGE OverloadedStrings #-}
|
{-# LANGUAGE OverloadedStrings #-}
|
||||||
{-# LANGUAGE ConstraintKinds #-}
|
|
||||||
{-# LANGUAGE FlexibleContexts #-}
|
{-# LANGUAGE FlexibleContexts #-}
|
||||||
{-# LANGUAGE TypeSynonymInstances #-}
|
|
||||||
{-# LANGUAGE FlexibleInstances #-}
|
{-# LANGUAGE FlexibleInstances #-}
|
||||||
{-# LANGUAGE MultiParamTypeClasses #-}
|
{-# LANGUAGE MultiParamTypeClasses #-}
|
||||||
module PDF.Content.Text (
|
module PDF.Content.Text (
|
||||||
Chunks(..)
|
Chunks(..)
|
||||||
, chunk
|
, chunk
|
||||||
, format
|
, format
|
||||||
, renderText
|
|
||||||
) where
|
) where
|
||||||
|
|
||||||
import Control.Monad (foldM)
|
import Control.Monad (foldM)
|
||||||
import Control.Monad.Fail (MonadFail(..))
|
import Control.Monad.Fail (MonadFail(..))
|
||||||
import Control.Monad.Reader (MonadReader(..), ReaderT, asks, runReaderT)
|
import Control.Monad.Reader (ReaderT, asks, runReaderT)
|
||||||
import Control.Monad.State (
|
import Control.Monad.State (
|
||||||
MonadState(..), StateT, evalStateT, gets, modify, runStateT
|
MonadState(..), StateT, evalStateT, gets, modify, runStateT
|
||||||
)
|
)
|
||||||
import Control.Monad.Trans (lift)
|
import Control.Monad.Trans (lift)
|
||||||
|
import qualified Data.ByteString.Char8 as BS (concatMap, singleton)
|
||||||
import Data.Id (Id(..), Indexed, at, empty, singleton)
|
import Data.Id (Id(..), Indexed, at, empty, singleton)
|
||||||
import qualified Data.Id as Id (delete, lookup, register)
|
import qualified Data.Id as Id (delete, lookup, register)
|
||||||
import Data.Map ((!))
|
import Data.Map ((!))
|
||||||
|
@ -26,27 +24,20 @@ import Data.Text (Text, breakOn)
|
||||||
import qualified Data.Text as Text (drop)
|
import qualified Data.Text as Text (drop)
|
||||||
import PDF.Box (Box(..))
|
import PDF.Box (Box(..))
|
||||||
import PDF.Content (
|
import PDF.Content (
|
||||||
Content, ContentUnit(..), GraphicContextUnit(..), contentUnits
|
Content(..), ContentUnit(..), GraphicContextUnit(..), IdContentUnit
|
||||||
|
, IdGraphicContextUnit
|
||||||
)
|
)
|
||||||
import qualified PDF.Content as Content (Content(..))
|
|
||||||
import PDF.Content.Operator (Instruction, Operator(..))
|
import PDF.Content.Operator (Instruction, Operator(..))
|
||||||
import PDF.Content.Operator.Text (Operator(..))
|
import PDF.Content.Operator.Text (Operator(..))
|
||||||
import PDF.Font (Font(..), FontSet, emptyFont)
|
import PDF.Font (Font(..), FontSet, emptyFont)
|
||||||
import PDF.Object (DirectObject(..), StringObject(..), toByteString)
|
import PDF.Object (DirectObject(..), StringObject(..), toByteString)
|
||||||
import Prelude hiding (fail)
|
import Prelude hiding (fail)
|
||||||
|
|
||||||
data ROContext = ROContext {
|
type TmpFont = StateT Font
|
||||||
indexedInstructions :: Indexed Instruction
|
type Renderer m = ReaderT (Indexed Instruction) (ReaderT FontSet m)
|
||||||
, fontSet :: FontSet
|
|
||||||
}
|
|
||||||
|
|
||||||
type TextContent m = ReaderT ROContext m
|
|
||||||
type FontContext m = StateT Font (TextContent m)
|
|
||||||
|
|
||||||
type Updater m = StateT (Indexed Instruction) (ReaderT (Indexed Text) (ReaderT FontSet m))
|
type Updater m = StateT (Indexed Instruction) (ReaderT (Indexed Text) (ReaderT FontSet m))
|
||||||
type TextUpdater m = StateT Font (Updater m)
|
|
||||||
|
|
||||||
decodeString :: MonadFail m => StringObject -> FontContext m Text
|
decodeString :: MonadFail m => StringObject -> TmpFont (Renderer m) Text
|
||||||
decodeString input = do
|
decodeString input = do
|
||||||
Font {decode} <- get
|
Font {decode} <- get
|
||||||
either fail return . decode $ toByteString input
|
either fail return . decode $ toByteString input
|
||||||
|
@ -57,56 +48,53 @@ chunk :: Int -> Id Text
|
||||||
chunk = Id
|
chunk = Id
|
||||||
|
|
||||||
instance MonadFail m => Box (ReaderT FontSet m) Chunks Content (Indexed Text) where
|
instance MonadFail m => Box (ReaderT FontSet m) Chunks Content (Indexed Text) where
|
||||||
r Chunks content = do
|
r Chunks content =
|
||||||
fontSet <- ask
|
runReaderT (mconcat <$> renderer) $ indexedInstructions content
|
||||||
renderText fontSet content
|
where
|
||||||
|
renderer = mapM renderContentUnit (contentUnits content)
|
||||||
|
|
||||||
w Chunks indexedText (Content.Content {contentUnits, Content.indexedInstructions}) =
|
w Chunks indexedText content = do
|
||||||
uncurry Content.Content <$> runReaderT (
|
(contentUnits, indexedInstructions) <- runReaderT readerUpdate indexedText
|
||||||
runStateT (mapM updateContentUnit contentUnits) indexedInstructions
|
return $ content {contentUnits, indexedInstructions}
|
||||||
) indexedText
|
where
|
||||||
|
stateUpdate = mapM updateContentUnit (contentUnits content)
|
||||||
|
readerUpdate = runStateT stateUpdate (indexedInstructions content)
|
||||||
|
|
||||||
renderText :: MonadFail m => FontSet -> Content -> m (Indexed Text)
|
renderContentUnit :: MonadFail m => IdContentUnit -> Renderer m (Indexed Text)
|
||||||
renderText fontSet (Content.Content {contentUnits, Content.indexedInstructions}) =
|
|
||||||
runReaderT (mconcat <$> mapM renderContentUnit contentUnits) roContext
|
|
||||||
where
|
|
||||||
roContext = ROContext {indexedInstructions, fontSet}
|
|
||||||
|
|
||||||
renderContentUnit :: MonadFail m => ContentUnit -> TextContent m (Indexed Text)
|
|
||||||
renderContentUnit (GraphicContext graphicContextUnits) =
|
renderContentUnit (GraphicContext graphicContextUnits) =
|
||||||
mconcat <$> mapM renderGraphicContextUnit graphicContextUnits
|
mconcat <$> mapM renderGraphicContextUnit graphicContextUnits
|
||||||
renderContentUnit (TextContext instructionIds) =
|
renderContentUnit (TextContext instructionIds) =
|
||||||
evalStateT (mconcat <$> mapM renderInstructionId instructionIds) emptyFont
|
evalStateT (mconcat <$> mapM renderInstructionId instructionIds) emptyFont
|
||||||
|
|
||||||
updateContentUnit :: MonadFail m => ContentUnit -> Updater m ContentUnit
|
updateContentUnit :: MonadFail m => IdContentUnit -> Updater m IdContentUnit
|
||||||
updateContentUnit (GraphicContext graphicContextUnits) = GraphicContext <$>
|
updateContentUnit (GraphicContext graphicContextUnits) = GraphicContext <$>
|
||||||
mapM updateGraphicContextUnit graphicContextUnits
|
mapM updateGraphicContextUnit graphicContextUnits
|
||||||
updateContentUnit (TextContext instructionIds) = TextContext . concat <$>
|
updateContentUnit (TextContext instructionIds) = TextContext . concat <$>
|
||||||
evalStateT (mapM updateInstructionId instructionIds) emptyFont
|
evalStateT (mapM updateInstructionId instructionIds) emptyFont
|
||||||
|
|
||||||
renderGraphicContextUnit :: MonadFail m => GraphicContextUnit -> TextContent m (Indexed Text)
|
renderGraphicContextUnit :: MonadFail m => IdGraphicContextUnit -> Renderer m (Indexed Text)
|
||||||
renderGraphicContextUnit (GraphicInstruction _) = return empty
|
renderGraphicContextUnit (GraphicInstruction _) = return empty
|
||||||
renderGraphicContextUnit (ContentUnit contentUnit) =
|
renderGraphicContextUnit (ContentUnit contentUnit) =
|
||||||
renderContentUnit contentUnit
|
renderContentUnit contentUnit
|
||||||
|
|
||||||
updateGraphicContextUnit :: MonadFail m => GraphicContextUnit -> Updater m GraphicContextUnit
|
updateGraphicContextUnit :: MonadFail m => IdGraphicContextUnit -> Updater m IdGraphicContextUnit
|
||||||
updateGraphicContextUnit gI@(GraphicInstruction _) = return gI
|
updateGraphicContextUnit gI@(GraphicInstruction _) = return gI
|
||||||
updateGraphicContextUnit (ContentUnit contentUnit) =
|
updateGraphicContextUnit (ContentUnit contentUnit) =
|
||||||
ContentUnit <$> updateContentUnit contentUnit
|
ContentUnit <$> updateContentUnit contentUnit
|
||||||
|
|
||||||
renderInstructionId :: MonadFail m => Id Instruction -> FontContext m (Indexed Text)
|
renderInstructionId :: MonadFail m => Id Instruction -> TmpFont (Renderer m) (Indexed Text)
|
||||||
renderInstructionId instructionId@(Id n) = toMap <$>
|
renderInstructionId instructionId@(Id n) = toMap <$>
|
||||||
(asks ((`at` instructionId) . indexedInstructions) >>= renderInstruction)
|
(asks ((`at` instructionId)) >>= renderInstruction)
|
||||||
where
|
where
|
||||||
toMap = maybe empty (singleton (Id n))
|
toMap = maybe empty (singleton (Id n))
|
||||||
|
|
||||||
updateInstructionId :: MonadFail m => Id Instruction -> TextUpdater m [Id Instruction]
|
updateInstructionId :: MonadFail m => Id Instruction -> TmpFont (Updater m) [Id Instruction]
|
||||||
updateInstructionId instructionId =
|
updateInstructionId instructionId =
|
||||||
lift (gets (`at` instructionId)) >>= updateInstruction instructionId
|
lift (gets (`at` instructionId)) >>= updateInstruction instructionId
|
||||||
|
|
||||||
renderInstruction :: MonadFail m => Instruction -> FontContext m (Maybe Text)
|
renderInstruction :: MonadFail m => Instruction -> TmpFont (Renderer m) (Maybe Text)
|
||||||
renderInstruction (Text Tf, [NameObject fontName, _]) =
|
renderInstruction (Text Tf, [NameObject fontName, _]) =
|
||||||
asks ((! fontName) . fontSet) >>= put >> return Nothing
|
lift (lift $ asks (! fontName)) >>= put >> return Nothing
|
||||||
|
|
||||||
renderInstruction (Text Tstar, []) = return $ Just "\n"
|
renderInstruction (Text Tstar, []) = return $ Just "\n"
|
||||||
|
|
||||||
|
@ -128,7 +116,7 @@ renderInstruction (Text DQuote, [_, _, StringObject outputString]) =
|
||||||
|
|
||||||
renderInstruction _ = return Nothing
|
renderInstruction _ = return Nothing
|
||||||
|
|
||||||
updateInstruction :: MonadFail m => Id Instruction -> Instruction -> TextUpdater m [Id Instruction]
|
updateInstruction :: MonadFail m => Id Instruction -> Instruction -> TmpFont (Updater m) [Id Instruction]
|
||||||
updateInstruction instructionId (Text Tf, [NameObject fontName, _]) =
|
updateInstruction instructionId (Text Tf, [NameObject fontName, _]) =
|
||||||
(lift . lift . lift $ asks (!fontName)) >>= put >> return [instructionId]
|
(lift . lift . lift $ asks (!fontName)) >>= put >> return [instructionId]
|
||||||
|
|
||||||
|
@ -155,5 +143,9 @@ format input = do
|
||||||
(line, left) -> (:) <$> tj line <*> format left
|
(line, left) -> (:) <$> tj line <*> format left
|
||||||
where
|
where
|
||||||
tj t = do
|
tj t = do
|
||||||
literal <- either fail (return . Literal) =<< gets (($t) . encode)
|
encoded <- either fail return =<< gets (($t) . encode)
|
||||||
return (Text Tj, [StringObject literal])
|
return (Text Tj, [StringObject . Literal $ BS.concatMap escape encoded])
|
||||||
|
escape '\\' = "\\\\"
|
||||||
|
escape '(' = "\\("
|
||||||
|
escape ')' = "\\)"
|
||||||
|
escape c = BS.singleton c
|
||||||
|
|
|
@ -1,12 +1,13 @@
|
||||||
|
{-# LANGUAGE FlexibleContexts #-}
|
||||||
module PDF.Encoding (
|
module PDF.Encoding (
|
||||||
encoding
|
encoding
|
||||||
) where
|
) where
|
||||||
|
|
||||||
import Control.Monad.Fail (MonadFail(..))
|
import Control.Monad.Except (MonadError(..))
|
||||||
import PDF.Encoding.MacRoman (macRomanEncoding)
|
import PDF.Encoding.MacRoman (macRomanEncoding)
|
||||||
import PDF.Font (Font)
|
import PDF.Font (Font)
|
||||||
import Prelude hiding (fail)
|
import Prelude hiding (fail)
|
||||||
|
|
||||||
encoding :: MonadFail m => String -> m Font
|
encoding :: MonadError String m => String -> m Font
|
||||||
encoding "MacRomanEncoding" = return macRomanEncoding
|
encoding "MacRomanEncoding" = return macRomanEncoding
|
||||||
encoding s = fail $ "Unknown encoding " ++ s
|
encoding s = throwError $ "Unknown encoding " ++ s
|
||||||
|
|
|
@ -3,6 +3,7 @@ module PDF.Encoding.MacRoman (
|
||||||
) where
|
) where
|
||||||
|
|
||||||
import qualified Data.ByteString.Char8 as BS (pack, unpack)
|
import qualified Data.ByteString.Char8 as BS (pack, unpack)
|
||||||
|
import Data.Foldable (foldl')
|
||||||
import Data.Map (Map)
|
import Data.Map (Map)
|
||||||
import qualified Data.Map as Map (empty, insert, lookup)
|
import qualified Data.Map as Map (empty, insert, lookup)
|
||||||
import qualified Data.Text as Text (pack, unpack)
|
import qualified Data.Text as Text (pack, unpack)
|
||||||
|
@ -24,7 +25,7 @@ macRomanEncoding = Font {
|
||||||
| otherwise -> Left ("Character '" ++ k :"' unavailable in MacRoman")
|
| otherwise -> Left ("Character '" ++ k :"' unavailable in MacRoman")
|
||||||
|
|
||||||
mappers :: (Mapper, Mapper)
|
mappers :: (Mapper, Mapper)
|
||||||
mappers = foldl generateMapers (Map.empty, Map.empty) [
|
mappers = foldl' generateMapers (Map.empty, Map.empty) [
|
||||||
('\x80', '\x00C4') -- LATIN CAPITAL LETTER A WITH DIAERESIS
|
('\x80', '\x00C4') -- LATIN CAPITAL LETTER A WITH DIAERESIS
|
||||||
, ('\x81', '\x00C5') -- LATIN CAPITAL LETTER A WITH RING ABOVE
|
, ('\x81', '\x00C5') -- LATIN CAPITAL LETTER A WITH RING ABOVE
|
||||||
, ('\x82', '\x00C7') -- LATIN CAPITAL LETTER C WITH CEDILLA
|
, ('\x82', '\x00C7') -- LATIN CAPITAL LETTER C WITH CEDILLA
|
||||||
|
|
|
@ -1,21 +1,26 @@
|
||||||
{-# LANGUAGE NamedFieldPuns #-}
|
{-# LANGUAGE NamedFieldPuns #-}
|
||||||
{-# LANGUAGE FlexibleInstances #-}
|
{-# LANGUAGE FlexibleInstances #-}
|
||||||
{-# LANGUAGE MultiParamTypeClasses #-}
|
{-# LANGUAGE MultiParamTypeClasses #-}
|
||||||
|
{-# LANGUAGE FlexibleContexts #-}
|
||||||
module PDF.Layer (
|
module PDF.Layer (
|
||||||
Layer(..)
|
Layer(..)
|
||||||
|
, LayerReader
|
||||||
, Objects(..)
|
, Objects(..)
|
||||||
, unify
|
, unify
|
||||||
) where
|
) where
|
||||||
|
|
||||||
|
import Control.Monad.Except (MonadError(..))
|
||||||
|
import Control.Monad.Reader (ReaderT)
|
||||||
|
import Data.Foldable (foldl')
|
||||||
import Data.Id (Id(..), Indexed, keysSet, mapWithKey, member)
|
import Data.Id (Id(..), Indexed, keysSet, mapWithKey, member)
|
||||||
import qualified Data.Id as Id (empty, union)
|
import qualified Data.Id as Id (empty, insert, lookup, union)
|
||||||
import Data.Map (Map, (!))
|
import Data.Map (Map, (!))
|
||||||
import qualified Data.Map as Map (empty, union)
|
import qualified Data.Map as Map (empty, lookup, union)
|
||||||
import qualified Data.IntSet as IntSet (delete, toList)
|
import qualified Data.IntSet as IntSet (delete, toList)
|
||||||
import PDF.Box (Box(..))
|
import PDF.Box (Box(..))
|
||||||
import PDF.Object (
|
import PDF.Object (
|
||||||
IndirectObjCoordinates(..), Object, Occurrence(..)
|
IndirectObjCoordinates(..), Object, Occurrence(..), Structure(..)
|
||||||
, Structure(..), XRefEntry(..), XRefSection, eofMarker, outputBody
|
, XRefEntry(..), XRefSection, eofMarker, outputBody
|
||||||
)
|
)
|
||||||
import qualified PDF.Output as Output (line)
|
import qualified PDF.Output as Output (line)
|
||||||
import PDF.Output (
|
import PDF.Output (
|
||||||
|
@ -26,24 +31,28 @@ import Text.Printf (printf)
|
||||||
|
|
||||||
data Layer = Layer {
|
data Layer = Layer {
|
||||||
occurrences :: [Occurrence]
|
occurrences :: [Occurrence]
|
||||||
, objects :: (Indexed Object)
|
, objects :: Indexed Object
|
||||||
, docStructure :: Structure
|
, docStructure :: Structure
|
||||||
} deriving Show
|
} deriving Show
|
||||||
|
type LayerReader m = ReaderT Layer m
|
||||||
|
|
||||||
updateXrefs :: XRefSection -> Map Resource Offset -> (XRefSection, Offset)
|
updateXRefs :: XRefSection -> Map Resource Offset -> (XRefSection, Offset)
|
||||||
updateXrefs xRef offsets = (mapWithKey updateEntry xRef, offsets ! StartXRef)
|
updateXRefs xRef offsets = (mapWithKey updateEntry xRef, offsets ! StartXRef)
|
||||||
where
|
where
|
||||||
updateEntry objectId e@(InUse {}) =
|
updateEntry objectId e@(InUse {offset}) =
|
||||||
e {offset = offsets ! (ObjectId $ getId objectId)}
|
case Map.lookup (ObjectId $ getId objectId) offsets of
|
||||||
|
Nothing -> Free {nextFree = Id $ getOffset offset, generation = 65535}
|
||||||
|
Just newOffset -> e {offset = newOffset}
|
||||||
updateEntry _ e = e
|
updateEntry _ e = e
|
||||||
|
|
||||||
instance Output Layer where
|
instance Output Layer where
|
||||||
output (Layer {occurrences, objects, docStructure}) =
|
output (Layer {occurrences, objects, docStructure}) = do
|
||||||
fmap (updateXrefs xRef) <$> getOffsets (outputBody (occurrences, objects))
|
(body, savedOffsets) <- getOffsets (outputBody (occurrences, objects))
|
||||||
>>= \(body, (xref, startXRef)) -> mconcat [
|
let (newXRef, startXRef) = updateXRefs xRef savedOffsets
|
||||||
body
|
mconcat [
|
||||||
|
return body
|
||||||
, Output.line "xref"
|
, Output.line "xref"
|
||||||
, output xref
|
, output newXRef
|
||||||
, Output.line "trailer"
|
, Output.line "trailer"
|
||||||
, output trailer, newLine
|
, output trailer, newLine
|
||||||
, Output.line "startxref"
|
, Output.line "startxref"
|
||||||
|
@ -76,6 +85,14 @@ instance Monad m => Box m Objects Layer (Indexed Object) where
|
||||||
Indirect (IndirectObjCoordinates {objectId, versionNumber = 0})
|
Indirect (IndirectObjCoordinates {objectId, versionNumber = 0})
|
||||||
newOccurrences = (makeOccurrence . Id) <$> IntSet.toList newObjectIds
|
newOccurrences = (makeOccurrence . Id) <$> IntSet.toList newObjectIds
|
||||||
|
|
||||||
|
instance MonadError String m => Box m (Id Object) Layer Object where
|
||||||
|
r objectId =
|
||||||
|
maybe (throwError "Unknown key") return . Id.lookup objectId . objects
|
||||||
|
w objectId a layer@(Layer {objects})
|
||||||
|
| member objectId objects = return $
|
||||||
|
layer {objects = Id.insert objectId a objects}
|
||||||
|
| otherwise = throwError "Unknown key"
|
||||||
|
|
||||||
emptyLayer :: Layer
|
emptyLayer :: Layer
|
||||||
emptyLayer = Layer {
|
emptyLayer = Layer {
|
||||||
docStructure = Structure {xRef = Id.empty, trailer = Map.empty}
|
docStructure = Structure {xRef = Id.empty, trailer = Map.empty}
|
||||||
|
@ -84,7 +101,7 @@ emptyLayer = Layer {
|
||||||
}
|
}
|
||||||
|
|
||||||
unify :: [Layer] -> Layer
|
unify :: [Layer] -> Layer
|
||||||
unify = foldl complete emptyLayer
|
unify = foldl' complete emptyLayer
|
||||||
where
|
where
|
||||||
complete tmpLayer older =
|
complete tmpLayer older =
|
||||||
let mergedObjects = Id.union (objects tmpLayer) (objects older) in
|
let mergedObjects = Id.union (objects tmpLayer) (objects older) in
|
||||||
|
|
|
@ -34,6 +34,7 @@ module PDF.Object (
|
||||||
) where
|
) where
|
||||||
|
|
||||||
import Control.Applicative ((<|>), many)
|
import Control.Applicative ((<|>), many)
|
||||||
|
import Control.Monad.Reader (asks)
|
||||||
import Data.Attoparsec.ByteString.Char8 (choice, count, option, sepBy)
|
import Data.Attoparsec.ByteString.Char8 (choice, count, option, sepBy)
|
||||||
import Data.ByteString (ByteString)
|
import Data.ByteString (ByteString)
|
||||||
import qualified Data.ByteString as BS (concat)
|
import qualified Data.ByteString as BS (concat)
|
||||||
|
@ -48,11 +49,11 @@ import qualified Data.Id as Id (
|
||||||
import Data.Map (Map)
|
import Data.Map (Map)
|
||||||
import qualified Data.Map as Map (fromList, toList)
|
import qualified Data.Map as Map (fromList, toList)
|
||||||
import qualified Data.Set as Set (fromList, member)
|
import qualified Data.Set as Set (fromList, member)
|
||||||
import qualified PDF.EOL as EOL (charset, parser)
|
import qualified PDF.EOL as EOL (Style(..), charset, parser)
|
||||||
import qualified PDF.Output as Output (line, string)
|
import qualified PDF.Output as Output (line, string)
|
||||||
import PDF.Output (
|
import PDF.Output (
|
||||||
OBuilder, Offset(..), Output(..), Resource(..)
|
OBuilder, OContext(..), Offset(..), Output(..), Resource(..), byteString
|
||||||
, byteString, getOffset, join, newLine, saveOffset
|
, getOffset, join, newLine, saveOffset
|
||||||
)
|
)
|
||||||
import PDF.Parser (MonadParser(..), Parser, (<?>), octDigit, oneOf)
|
import PDF.Parser (MonadParser(..), Parser, (<?>), octDigit, oneOf)
|
||||||
import Text.Printf (printf)
|
import Text.Printf (printf)
|
||||||
|
@ -190,8 +191,8 @@ nullObject = string "null" *> return () <?> "null object"
|
||||||
-- Reference
|
-- Reference
|
||||||
--
|
--
|
||||||
data IndirectObjCoordinates = IndirectObjCoordinates {
|
data IndirectObjCoordinates = IndirectObjCoordinates {
|
||||||
objectId :: (Id Object)
|
objectId :: {-# UNPACK #-} !(Id Object)
|
||||||
, versionNumber :: Int
|
, versionNumber :: {-# UNPACK #-} !Int
|
||||||
} deriving Show
|
} deriving Show
|
||||||
|
|
||||||
reference :: MonadParser m => m IndirectObjCoordinates
|
reference :: MonadParser m => m IndirectObjCoordinates
|
||||||
|
@ -202,14 +203,14 @@ reference = IndirectObjCoordinates
|
||||||
-- DirectObject
|
-- DirectObject
|
||||||
--
|
--
|
||||||
data DirectObject =
|
data DirectObject =
|
||||||
Boolean Bool
|
Boolean !Bool
|
||||||
| NumberObject Number
|
| NumberObject !Number
|
||||||
| StringObject StringObject
|
| StringObject !StringObject
|
||||||
| NameObject Name
|
| NameObject !Name
|
||||||
| Array [DirectObject]
|
| Array ![DirectObject]
|
||||||
| Dictionary Dictionary
|
| Dictionary !Dictionary
|
||||||
| Null
|
| Null
|
||||||
| Reference IndirectObjCoordinates
|
| Reference !IndirectObjCoordinates
|
||||||
deriving Show
|
deriving Show
|
||||||
|
|
||||||
instance Output DirectObject where
|
instance Output DirectObject where
|
||||||
|
@ -294,10 +295,15 @@ data XRefEntry = InUse {
|
||||||
} deriving Show
|
} deriving Show
|
||||||
|
|
||||||
instance Output XRefEntry where
|
instance Output XRefEntry where
|
||||||
output (InUse {offset, generation}) =
|
output xRefEntry = Output.string (build xRefEntry) `mappend` endXRefEntryLine
|
||||||
Output.line (printf "%010d %05d n " (getOffset offset) generation)
|
where
|
||||||
output (Free {nextFree, generation}) =
|
build (InUse {offset, generation}) =
|
||||||
Output.line (printf "%010d %05d f " (getId nextFree) generation)
|
printf "%010d %05d n" (getOffset offset) generation
|
||||||
|
build (Free {nextFree, generation}) =
|
||||||
|
printf "%010d %05d f" (getId nextFree) generation
|
||||||
|
endXRefEntryLine = OContext (asks padEOLToTwoBytes) >>= Output.line
|
||||||
|
padEOLToTwoBytes EOL.CRLF = ("" :: String)
|
||||||
|
padEOLToTwoBytes _ = " "
|
||||||
|
|
||||||
entry :: Parser u XRefEntry
|
entry :: Parser u XRefEntry
|
||||||
entry = do
|
entry = do
|
||||||
|
|
|
@ -3,98 +3,162 @@
|
||||||
{-# LANGUAGE FlexibleContexts #-}
|
{-# LANGUAGE FlexibleContexts #-}
|
||||||
{-# LANGUAGE MultiParamTypeClasses #-}
|
{-# LANGUAGE MultiParamTypeClasses #-}
|
||||||
{-# LANGUAGE FlexibleInstances #-}
|
{-# LANGUAGE FlexibleInstances #-}
|
||||||
|
{-# LANGUAGE BangPatterns #-}
|
||||||
|
{-# LANGUAGE DeriveFunctor #-}
|
||||||
module PDF.Object.Navigation (
|
module PDF.Object.Navigation (
|
||||||
Clear(..)
|
Nav(..)
|
||||||
, PDFContent
|
, PPath(..)
|
||||||
, Raw(..)
|
, ROLayer
|
||||||
|
, RWLayer
|
||||||
|
, StreamContent(..)
|
||||||
, (./)
|
, (./)
|
||||||
, (//)
|
, (//)
|
||||||
, (>./)
|
, (>./)
|
||||||
, (>//)
|
, (>//)
|
||||||
, castObject
|
, castObject
|
||||||
|
, catalog
|
||||||
, getDictionary
|
, getDictionary
|
||||||
, getKey
|
, getKey
|
||||||
, objectById
|
, objectById
|
||||||
, catalog
|
, save
|
||||||
) where
|
) where
|
||||||
|
|
||||||
import Codec.Compression.Zlib (compress, decompress)
|
import Codec.Compression.Zlib (compress, decompress)
|
||||||
|
import Control.Monad.Except (MonadError(..))
|
||||||
import Control.Monad.Reader (MonadReader(..))
|
import Control.Monad.Reader (MonadReader(..))
|
||||||
import Control.Monad.Fail (MonadFail(..))
|
import Control.Monad.State (MonadState)
|
||||||
import Data.ByteString (ByteString)
|
import Data.ByteString (ByteString)
|
||||||
|
import qualified Data.ByteString as BS (length)
|
||||||
import qualified Data.ByteString.Lazy as Lazy (ByteString, fromStrict, toStrict)
|
import qualified Data.ByteString.Lazy as Lazy (ByteString, fromStrict, toStrict)
|
||||||
import Data.Id (Id, at)
|
import Data.Id (Id)
|
||||||
import qualified Data.Map as Map (lookup)
|
import qualified Data.Id as Id (at)
|
||||||
import PDF.Box (Box(..))
|
import qualified Data.Map as Map (adjust, insert, lookup)
|
||||||
|
import PDF.Box (Box(..), at, edit{-, runRO-})
|
||||||
import PDF.Layer (Layer(..))
|
import PDF.Layer (Layer(..))
|
||||||
import PDF.Object (
|
import PDF.Object (
|
||||||
Dictionary, DirectObject(..), IndirectObjCoordinates(..)
|
Dictionary, DirectObject(..), IndirectObjCoordinates(..)
|
||||||
, Name(..), Object(..), Structure(..)
|
, Name(..), Number(..), Object(..), Structure(..)
|
||||||
)
|
)
|
||||||
import Prelude hiding (fail)
|
import Prelude hiding (fail)
|
||||||
import Text.Printf (printf)
|
import Text.Printf (printf)
|
||||||
|
|
||||||
type PDFContent m = (MonadReader Layer m, MonadFail m)
|
newtype PPath = PPath [Component]
|
||||||
|
data DPath = DPath {
|
||||||
|
root :: Id Object
|
||||||
|
, offset :: [Component]
|
||||||
|
} deriving Show
|
||||||
|
|
||||||
|
push :: Component -> DPath -> DPath
|
||||||
|
push component dPath = dPath {offset = (offset dPath) ++ [component]}
|
||||||
|
|
||||||
|
data Nav a = Nav {
|
||||||
|
dPath :: DPath
|
||||||
|
, value :: a
|
||||||
|
} deriving (Functor)
|
||||||
|
|
||||||
|
instance Show a => Show (Nav a) where
|
||||||
|
show (Nav {dPath, value}) = "Nav {dPath = " ++ show dPath ++ ", value = " ++ show value ++ "}"
|
||||||
|
|
||||||
|
type ROLayer m = (MonadReader Layer m, MonadError String m)
|
||||||
|
type RWLayer m = (MonadState Layer m, MonadError String m)
|
||||||
type Component = String
|
type Component = String
|
||||||
|
|
||||||
getDictionary :: PDFContent m => Object -> m Dictionary
|
getDictionary :: ROLayer m => Nav Object -> m (Nav Dictionary)
|
||||||
getDictionary (Direct (Dictionary aDict)) = return aDict
|
getDictionary (Nav {dPath, value}) =
|
||||||
getDictionary (Direct (Reference ref)) =
|
case value of
|
||||||
objectById (objectId ref) >>= getDictionary
|
(Direct (Dictionary aDict)) -> return $ Nav {dPath, value = aDict}
|
||||||
getDictionary (Stream {header}) = return header
|
(Direct (Reference ref)) -> objectById (objectId ref) >>= getDictionary
|
||||||
getDictionary obj = expected "dictionary : " obj
|
(Stream {header}) -> return $ Nav {dPath, value = header}
|
||||||
|
obj -> expected "dictionary : " obj
|
||||||
|
|
||||||
expected :: (MonadFail m, Show a) => String -> a -> m b
|
expected :: (MonadError String m, Show a) => String -> a -> m b
|
||||||
expected name = fail . printf "Not a %s: %s" name . show
|
expected name = throwError . printf "Not a %s: %s" name . show
|
||||||
|
|
||||||
getKey :: PDFContent m => String -> Object -> m DirectObject
|
getKey :: ROLayer m => String -> Nav Object -> m (Nav DirectObject)
|
||||||
getKey key object = getDictionary object >>= catchMaybe . Map.lookup (Name key)
|
getKey key navObject = getDictionary navObject >>= f
|
||||||
where
|
where
|
||||||
errorMessage =
|
errorMessage =
|
||||||
printf "Key %s not found in object %s" key (show object)
|
printf "Key %s not found in object %s" key (show navObject)
|
||||||
catchMaybe = maybe (fail errorMessage) return
|
f (Nav {dPath, value}) =
|
||||||
|
case Map.lookup (Name key) value of
|
||||||
|
Nothing -> throwError errorMessage
|
||||||
|
Just dObj -> return $ Nav {dPath = push key dPath, value = dObj}
|
||||||
|
|
||||||
objectById :: PDFContent m => (Id Object) -> m Object
|
objectById :: ROLayer m => (Id Object) -> m (Nav Object)
|
||||||
objectById objectId = do
|
objectById objectId = do
|
||||||
layer <- ask
|
layer <- ask
|
||||||
return (objects layer `at` objectId)
|
return $ Nav {
|
||||||
|
dPath = DPath {root = objectId, offset = []}
|
||||||
|
, value = objects layer `Id.at` objectId
|
||||||
|
}
|
||||||
|
|
||||||
(./) :: PDFContent m => m Object -> Component -> m Object
|
castObject :: ROLayer m => Nav DirectObject -> m (Nav Object)
|
||||||
(./) object key = (object >>= getKey key >>= castObject)
|
castObject (Nav {value = !(Reference (IndirectObjCoordinates {objectId}))}) =
|
||||||
|
|
||||||
castObject :: PDFContent m => DirectObject -> m Object
|
|
||||||
castObject (Reference (IndirectObjCoordinates {objectId})) =
|
|
||||||
objectById objectId
|
objectById objectId
|
||||||
castObject directObject = return $ Direct directObject
|
castObject (Nav {dPath, value}) = return $ Nav {dPath, value = Direct value}
|
||||||
|
|
||||||
(//) :: PDFContent m => m Object -> [Component] -> m Object
|
(./) :: ROLayer m => m (Nav Object) -> Component -> m (Nav Object)
|
||||||
(//) object [] = object
|
(./) navObject key = (navObject >>= getKey key >>= castObject)
|
||||||
(//) object (key:keys) = object ./ key // keys
|
|
||||||
|
|
||||||
(>./) :: PDFContent m => Object -> Component -> m Object
|
(//) :: ROLayer m => m (Nav Object) -> PPath -> m (Nav Object)
|
||||||
(>./) object = (return object ./)
|
(//) navObject (PPath []) = navObject
|
||||||
|
(//) navObject (PPath (key:keys)) = navObject ./ key // (PPath keys)
|
||||||
|
|
||||||
(>//) :: PDFContent m => Object -> [Component] -> m Object
|
(>./) :: ROLayer m => Nav Object -> Component -> m (Nav Object)
|
||||||
(>//) object = (return object //)
|
(>./) navObject = (return navObject ./)
|
||||||
|
|
||||||
catalog :: PDFContent m => m Object
|
(>//) :: ROLayer m => Nav Object -> PPath -> m (Nav Object)
|
||||||
catalog = Direct . Dictionary . trailer . docStructure <$> ask
|
(>//) navObject = (return navObject //)
|
||||||
|
|
||||||
data Clear = Clear
|
catalog :: ROLayer m => m (Nav Object)
|
||||||
data Raw = Raw
|
catalog = do
|
||||||
|
value <- Direct . Dictionary . trailer . docStructure <$> ask
|
||||||
|
return $ Nav {dPath = undefined, value}
|
||||||
|
|
||||||
|
setAt :: [Component] -> DirectObject -> Dictionary -> Dictionary
|
||||||
|
setAt [] _ dict = dict
|
||||||
|
setAt [component] directObject dict =
|
||||||
|
Map.insert (Name component) directObject dict
|
||||||
|
setAt (component:components) directObject dict =
|
||||||
|
Map.adjust setDirObj (Name component) dict
|
||||||
|
where
|
||||||
|
setDirObj (Dictionary subDict) =
|
||||||
|
Dictionary $ setAt components directObject subDict
|
||||||
|
setDirObj x = x
|
||||||
|
|
||||||
|
save :: RWLayer m => Nav Object -> m ()
|
||||||
|
save nav@(Nav {dPath, value = Direct dObj}) =
|
||||||
|
edit .at (root dPath) $ return . setObj
|
||||||
|
where
|
||||||
|
setObj obj@(Stream {header}) =
|
||||||
|
obj {header = setAt (offset dPath) dObj header}
|
||||||
|
setObj (Direct (Dictionary dict)) =
|
||||||
|
Direct . Dictionary $ setAt (offset dPath) dObj dict
|
||||||
|
setObj _ = value nav
|
||||||
|
save (Nav {dPath = DPath {root, offset = []}, value}) = edit $ w root value
|
||||||
|
save _ = throwError "Streams can't be properties of PDF objects"
|
||||||
|
|
||||||
|
data StreamContent = Clear | Raw
|
||||||
|
|
||||||
onLazy :: (Lazy.ByteString -> Lazy.ByteString) -> ByteString -> ByteString
|
onLazy :: (Lazy.ByteString -> Lazy.ByteString) -> ByteString -> ByteString
|
||||||
onLazy f = Lazy.toStrict . f . Lazy.fromStrict
|
onLazy f = Lazy.toStrict . f . Lazy.fromStrict
|
||||||
|
|
||||||
contains :: String -> DirectObject -> Bool
|
contains :: String -> DirectObject -> Bool
|
||||||
contains needle (NameObject (Name n)) = needle == n
|
contains needle !(NameObject (Name n)) = needle == n
|
||||||
contains needle (Array directObjects) = oneOf directObjects (contains needle)
|
contains needle !(Array directObjects) = oneOf directObjects (contains needle)
|
||||||
where
|
where
|
||||||
oneOf [] _ = False
|
oneOf [] _ = False
|
||||||
oneOf (x:xs) p = p x || oneOf xs p
|
oneOf (x:xs) p = p x || oneOf xs p
|
||||||
contains _ _ = False
|
contains _ _ = False
|
||||||
|
|
||||||
instance MonadFail m => Box m Clear Object ByteString where
|
instance MonadError String m => Box m StreamContent (Nav Object) ByteString where
|
||||||
|
r sc = r sc . value
|
||||||
|
w sc newStreamContent nav = setValue <$> w sc newStreamContent (value nav)
|
||||||
|
where
|
||||||
|
setValue value = nav {value}
|
||||||
|
|
||||||
|
instance MonadError String m => Box m StreamContent Object ByteString where
|
||||||
|
r Raw (Stream {streamContent}) = return streamContent
|
||||||
r Clear (Stream {header, streamContent}) = return $
|
r Clear (Stream {header, streamContent}) = return $
|
||||||
case Map.lookup (Name "Filter") header of
|
case Map.lookup (Name "Filter") header of
|
||||||
Just directObject
|
Just directObject
|
||||||
|
@ -102,17 +166,15 @@ instance MonadFail m => Box m Clear Object ByteString where
|
||||||
_ -> streamContent
|
_ -> streamContent
|
||||||
r _ obj = expected "stream" obj
|
r _ obj = expected "stream" obj
|
||||||
|
|
||||||
w Clear streamContent obj@(Stream {header}) = return $
|
|
||||||
case Map.lookup (Name "Filter") header of
|
|
||||||
Just directObject
|
|
||||||
| contains "FlateDecode" directObject ->
|
|
||||||
obj {streamContent = onLazy compress streamContent}
|
|
||||||
_ -> obj {streamContent}
|
|
||||||
w _ _ obj = expected "stream" obj
|
|
||||||
|
|
||||||
instance MonadFail m => Box m Raw Object ByteString where
|
|
||||||
r Raw (Stream {streamContent}) = return streamContent
|
|
||||||
r _ obj = expected "stream" obj
|
|
||||||
|
|
||||||
w Raw streamContent obj@(Stream {}) = return $ obj {streamContent}
|
w Raw streamContent obj@(Stream {}) = return $ obj {streamContent}
|
||||||
|
w Clear newStreamContent (Stream {header}) =
|
||||||
|
let streamContent = getStreamContent (Map.lookup (Name "Filter") header) in
|
||||||
|
return $ Stream {header = fixLength streamContent, streamContent}
|
||||||
|
where
|
||||||
|
getStreamContent (Just directObject)
|
||||||
|
| contains "FlateDecode" directObject = onLazy compress newStreamContent
|
||||||
|
getStreamContent _ = newStreamContent
|
||||||
|
fixLength sc =
|
||||||
|
let newLength = NumberObject . Number . fromIntegral $ BS.length sc in
|
||||||
|
Map.insert (Name "Length") newLength header
|
||||||
w _ _ obj = expected "stream" obj
|
w _ _ obj = expected "stream" obj
|
||||||
|
|
|
@ -55,9 +55,8 @@ saveOffset resource = OContext $
|
||||||
lift :: (a -> Builder) -> a -> OBuilder
|
lift :: (a -> Builder) -> a -> OBuilder
|
||||||
lift f a = return (f a)
|
lift f a = return (f a)
|
||||||
|
|
||||||
getOffsets :: OBuilder -> OContext (OBuilder, Map Resource Offset)
|
getOffsets :: OBuilder -> OContext (Builder, Map Resource Offset)
|
||||||
getOffsets (OContext builder) =
|
getOffsets (OContext builder) = OContext $ listen builder
|
||||||
OContext (listen builder >>= \(a, w) -> return (return a, w))
|
|
||||||
|
|
||||||
append :: OBuilder -> OBuilder -> OBuilder
|
append :: OBuilder -> OBuilder -> OBuilder
|
||||||
append (OContext a) (OContext b) = OContext (mappend <$> a <*> b)
|
append (OContext a) (OContext b) = OContext (mappend <$> a <*> b)
|
||||||
|
|
143
src/PDF/Pages.hs
143
src/PDF/Pages.hs
|
@ -13,63 +13,62 @@ module PDF.Pages (
|
||||||
, withResources
|
, withResources
|
||||||
) where
|
) where
|
||||||
|
|
||||||
import Control.Applicative (Alternative, (<|>))
|
import Control.Applicative (Alternative)
|
||||||
|
import Control.Applicative ((<|>))
|
||||||
import Control.Monad (foldM)
|
import Control.Monad (foldM)
|
||||||
import Control.Monad.Fail (MonadFail(..))
|
import Control.Monad.Except (MonadError(..))
|
||||||
import Control.Monad.Reader (ReaderT, runReaderT)
|
import Control.Monad.Reader (ReaderT, runReaderT)
|
||||||
import Control.Monad.State (StateT(..), evalStateT, execStateT, gets, modify)
|
import Control.Monad.State (StateT(..), evalStateT, execStateT, gets, modify)
|
||||||
import Control.Monad.Trans (lift)
|
import Control.Monad.Trans (lift)
|
||||||
|
import Data.ByteString (ByteString)
|
||||||
import Data.ByteString.Lazy (toStrict)
|
import Data.ByteString.Lazy (toStrict)
|
||||||
import Data.Id (Id, IdMap)
|
import Data.Id (Id(..), IdMap)
|
||||||
import qualified Data.Id as Id (empty, insert, lookup)
|
import qualified Data.Id as Id (empty, insert, lookup)
|
||||||
import Data.Map (Map)
|
import Data.Map (Map)
|
||||||
import qualified Data.Map as Map (empty, fromList, insert, toList)
|
import qualified Data.Map as Map (empty, elems, fromList, insert, toList)
|
||||||
|
import Data.Maybe (listToMaybe)
|
||||||
import Data.OrderedMap (OrderedMap, build, mapi)
|
import Data.OrderedMap (OrderedMap, build, mapi)
|
||||||
import PDF.Box (Box(..), at, edit)
|
import PDF.Box (Box(..), at, edit, runRO)
|
||||||
import PDF.CMap (cMap)
|
import qualified PDF.CMap as CMap (parse)
|
||||||
import PDF.Content (Content(..))
|
import PDF.Content (Content(..))
|
||||||
import qualified PDF.Content as Content (parse)
|
import qualified PDF.Content as Content (parse)
|
||||||
import PDF.Encoding (encoding)
|
import PDF.Encoding (encoding)
|
||||||
import PDF.EOL (Style(..))
|
import PDF.EOL (Style(..))
|
||||||
import PDF.Font (Font, FontSet)
|
import PDF.Font (Font, FontSet)
|
||||||
import PDF.Layer (Layer(..), Objects(..))
|
import PDF.Layer (Layer(..), LayerReader)
|
||||||
import PDF.Object (
|
import PDF.Object (
|
||||||
Dictionary, DirectObject(..), IndirectObjCoordinates(..)
|
Dictionary, DirectObject(..), IndirectObjCoordinates(..)
|
||||||
, Name(..), Object(..)
|
, Name(..), Number(..), Object(..)
|
||||||
,)
|
,)
|
||||||
import PDF.Object.Navigation (
|
import PDF.Object.Navigation (
|
||||||
Clear(..), (//), (>./), (>//), getDictionary
|
Nav(..), PPath(..), ROLayer, RWLayer, StreamContent(..), (//), (>./)
|
||||||
, getKey, objectById, origin
|
, (>//), catalog, getDictionary, getKey, objectById, save
|
||||||
)
|
)
|
||||||
import PDF.Output (render)
|
import PDF.Output (render)
|
||||||
import Prelude hiding (fail)
|
|
||||||
import Text.Printf (printf)
|
import Text.Printf (printf)
|
||||||
|
|
||||||
type Except m = (Alternative m, MonadFail m)
|
|
||||||
type InLayer m = ReaderT Layer m
|
|
||||||
|
|
||||||
type CachedFonts = IdMap Object Font
|
type CachedFonts = IdMap Object Font
|
||||||
type FontCache m = StateT CachedFonts (InLayer m)
|
type FontCache m = StateT CachedFonts m
|
||||||
data Page = Page {
|
data Page = Page {
|
||||||
contents :: OrderedMap (Id Object) Content
|
byteContents :: OrderedMap (Id Object) ByteString
|
||||||
, resources :: Dictionary
|
, resources :: Dictionary
|
||||||
, source :: (Id Object)
|
, source :: (Id Object)
|
||||||
}
|
}
|
||||||
|
|
||||||
loadContents :: Except m => DirectObject -> InLayer m (OrderedMap (Id Object) Content)
|
loadByteContents :: ROLayer m => DirectObject -> m (OrderedMap (Id Object) ByteString)
|
||||||
loadContents directObject =
|
loadByteContents directObject = do
|
||||||
sequenceA . build loadContent $ objectIds directObject
|
objs <- sequence . build objectById $ objectIds directObject
|
||||||
|
mapM (r Clear) objs
|
||||||
where
|
where
|
||||||
loadContent :: Except m => (Id Object) -> InLayer m Content
|
|
||||||
loadContent objectId = objectById objectId >>= r Clear >>= Content.parse
|
|
||||||
objectIds (Array l) = l >>= getReference
|
objectIds (Array l) = l >>= getReference
|
||||||
objectIds dirObj = getReference dirObj
|
objectIds dirObj = getReference dirObj
|
||||||
|
|
||||||
getFontDictionary :: Except m => Object -> InLayer m Dictionary
|
getFontDictionary :: (Alternative m, ROLayer m) => Nav Object -> m Dictionary
|
||||||
getFontDictionary pageObj =
|
getFontDictionary pageObj =
|
||||||
(pageObj >// ["Resources", "Font"] >>= getDictionary) <|> return Map.empty
|
(pageObj >// PPath ["Resources", "Font"] >>= fmap value . getDictionary)
|
||||||
|
<|> return Map.empty
|
||||||
|
|
||||||
cache :: Except m => ((Id Object) -> FontCache m Font) -> (Id Object) -> FontCache m Font
|
cache :: ROLayer m => (Id Object -> FontCache m Font) -> Id Object -> FontCache m Font
|
||||||
cache loader objectId =
|
cache loader objectId =
|
||||||
gets (Id.lookup objectId) >>= maybe load return
|
gets (Id.lookup objectId) >>= maybe load return
|
||||||
where
|
where
|
||||||
|
@ -78,81 +77,105 @@ cache loader objectId =
|
||||||
modify $ Id.insert objectId value
|
modify $ Id.insert objectId value
|
||||||
return value
|
return value
|
||||||
|
|
||||||
loadFont :: Except m => (Id Object) -> FontCache m Font
|
loadFont :: (Alternative m, ROLayer m) => Id Object -> FontCache m Font
|
||||||
loadFont objectId = lift $ objectById objectId >>= tryMappings
|
loadFont objectId = lift $ objectById objectId >>= tryMappings
|
||||||
where
|
where
|
||||||
tryMappings object =
|
tryMappings object =
|
||||||
(object >./ "ToUnicode" >>= r Clear >>= cMap)
|
(object >./ "ToUnicode" >>= r Clear >>= CMap.parse)
|
||||||
<|> (object >./ "Encoding" >>= loadEncoding)
|
<|> (object >./ "Encoding" >>= loadEncoding . value)
|
||||||
<|> (fail $ unknownFormat (show objectId) (show object))
|
<|> (throwError $ unknownFormat (show objectId) (show object))
|
||||||
unknownFormat = printf "Unknown font format for object #%s : %s"
|
unknownFormat = printf "Unknown font format for object #%s : %s"
|
||||||
loadEncoding :: MonadFail m => Object -> m Font
|
loadEncoding :: MonadError String m => Object -> m Font
|
||||||
loadEncoding (Direct (NameObject (Name name))) = encoding name
|
loadEncoding (Direct (NameObject (Name name))) = encoding name
|
||||||
loadEncoding object =
|
loadEncoding object =
|
||||||
fail $ printf "Encoding must be a name, not that : %s" $ show object
|
throwError $ printf "Encoding must be a name, not that : %s" $ show object
|
||||||
|
|
||||||
loadResources :: Except m => Dictionary -> FontCache m FontSet
|
loadResources :: (Alternative m, ROLayer m) => Dictionary -> FontCache m FontSet
|
||||||
loadResources = foldM addFont Map.empty . Map.toList
|
loadResources = foldM addFont Map.empty . Map.toList
|
||||||
where
|
where
|
||||||
addFont output (name, Reference (IndirectObjCoordinates {objectId})) =
|
addFont output (name, Reference (IndirectObjCoordinates {objectId})) =
|
||||||
flip (Map.insert name) output <$> cache loadFont objectId
|
flip (Map.insert name) output <$> cache loadFont objectId
|
||||||
addFont output _ = return output
|
addFont output _ = return output
|
||||||
|
|
||||||
getReference :: DirectObject -> [(Id Object)]
|
getReference :: DirectObject -> [Id Object]
|
||||||
getReference (Reference (IndirectObjCoordinates {objectId})) = [objectId]
|
getReference (Reference (IndirectObjCoordinates {objectId})) = [objectId]
|
||||||
getReference _ = []
|
getReference _ = []
|
||||||
|
|
||||||
loadPage :: Except m => (Id Object) -> InLayer m Page
|
loadPage :: (Alternative m, ROLayer m) => Id Object -> m Page
|
||||||
loadPage source = do
|
loadPage source = do
|
||||||
page <- objectById source
|
page <- objectById source
|
||||||
contents <- getKey "Contents" page >>= loadContents
|
byteContents <- loadByteContents . value =<< getKey "Contents" page
|
||||||
resources <- getFontDictionary page
|
resources <- getFontDictionary page
|
||||||
return $ Page {contents, resources, source}
|
return $ Page {byteContents, resources, source}
|
||||||
|
|
||||||
pagesList :: Except m => InLayer m [(Id Object)]
|
pagesList :: (Alternative m, ROLayer m) => m [Id Object]
|
||||||
pagesList =
|
pagesList =
|
||||||
(origin // ["Root", "Pages"] >>= getKey "Kids" >>= return . getReferences)
|
(catalog // PPath ["Root", "Pages", "Kids"] >>= getReferences . value)
|
||||||
<|> return []
|
<|> return []
|
||||||
where
|
where
|
||||||
getReferences (Array kids) = kids >>= getReference
|
getReferences (Direct (Array kids)) = return $ getReference =<< kids
|
||||||
getReferences _ = fail "Not a pages array"
|
getReferences _ = throwError "Not a pages array"
|
||||||
|
|
||||||
updatePage :: MonadFail m => Page -> StateT Layer m ()
|
editPagesList :: RWLayer m => ([DirectObject] -> [DirectObject]) -> m ()
|
||||||
updatePage (Page {contents}) = sequence_ $ mapi updateContent contents
|
editPagesList f = do
|
||||||
|
pages <- runRO (catalog // PPath ["Root", "Pages"])
|
||||||
|
kids <- runRO (pages >./ "Kids")
|
||||||
|
count <- runRO (pages >./ "Count")
|
||||||
|
(newSize, newKids) <- editKids (value kids)
|
||||||
|
save $ kids {value = newKids}
|
||||||
|
save $ count {value = Direct $ NumberObject newSize}
|
||||||
where
|
where
|
||||||
updateContent source content =
|
editKids (Direct (Array pageRefs)) =
|
||||||
edit . at Objects . at source . at Clear $ setContent content
|
let result = f pageRefs in
|
||||||
setContent content _ = return . toStrict $ render LF content
|
return (Number . fromIntegral $ length result, Direct $ Array result)
|
||||||
|
editKids _ = throwError "Invalid format for Root.Pages.Kids (not an array)"
|
||||||
|
|
||||||
|
updatePage :: RWLayer m => Page -> m ()
|
||||||
|
updatePage (Page {byteContents}) = sequence_ $ mapi updateByteContent byteContents
|
||||||
|
where
|
||||||
|
updateByteContent source byteContent =
|
||||||
|
edit .at source .at Clear $ \_ -> return byteContent
|
||||||
|
|
||||||
data Pages = Pages
|
data Pages = Pages
|
||||||
newtype PageNumber = P Int
|
newtype PageNumber = P Int
|
||||||
data Contents = Contents
|
data Contents = Contents
|
||||||
|
|
||||||
instance (Alternative m, MonadFail m) => Box m Pages Layer (Map Int Page) where
|
instance (Alternative m, MonadError String m) => Box m Pages Layer (Map Int Page) where
|
||||||
r Pages layer = runReaderT (numbered <$> (mapM loadPage =<< pagesList)) layer
|
r Pages = runReaderT (numbered <$> pagesList >>= mapM loadPage)
|
||||||
where
|
where
|
||||||
numbered :: [Page] -> Map Int Page
|
numbered :: [Id Object] -> Map Int (Id Object)
|
||||||
numbered = Map.fromList . zip [1..]
|
numbered = Map.fromList . zip [1..]
|
||||||
|
|
||||||
w Pages pages = execStateT $ mapM_ updatePage pages
|
w Pages pages = execStateT $ do
|
||||||
|
mapM_ updatePage pages
|
||||||
instance (Alternative m, MonadFail m) => Box m PageNumber Layer Page where
|
setPagesList $ Map.elems (source <$> pages)
|
||||||
r (P i) layer
|
|
||||||
| i < 1 = fail "Pages start at 1"
|
|
||||||
| otherwise = runReaderT (drop (i - 1) <$> pagesList >>= firstPage) layer
|
|
||||||
where
|
where
|
||||||
firstPage [] = fail "Page is out of bounds"
|
setPagesList =
|
||||||
firstPage (p:_) = loadPage p
|
editPagesList . const . fmap (Reference . flip IndirectObjCoordinates 0)
|
||||||
|
|
||||||
w _ page = execStateT $ updatePage page
|
instance (Alternative m, MonadError String m) => Box m PageNumber Layer Page where
|
||||||
|
r (P p) layer
|
||||||
|
| p < 1 = throwError "Pages start at 1"
|
||||||
|
| otherwise = runReaderT (drop (p - 1) <$> pagesList >>= firstPage) layer
|
||||||
|
where
|
||||||
|
firstPage =
|
||||||
|
maybe (throwError "Page is out of bounds") loadPage . listToMaybe
|
||||||
|
|
||||||
|
w (P p) page = execStateT $ do
|
||||||
|
updatePage page
|
||||||
|
editPagesList $ setPage (Reference $ IndirectObjCoordinates (source page) 0)
|
||||||
|
where
|
||||||
|
setPage ref l = take (p-1) l ++ ref : drop p l
|
||||||
|
|
||||||
instance Monad m => Box m Contents Page (OrderedMap (Id Object) Content) where
|
instance Monad m => Box m Contents Page (OrderedMap (Id Object) Content) where
|
||||||
r Contents = return . contents
|
r Contents = return . fmap Content.parse . byteContents
|
||||||
w _ contents page = return $ page {contents}
|
w Contents contents page = return $ page {byteContents}
|
||||||
|
where
|
||||||
|
byteContents = toStrict . render LF <$> contents
|
||||||
|
|
||||||
withFonts :: Monad m => (Layer -> FontCache m a) -> Layer -> m a
|
withFonts :: MonadError String m => (Layer -> FontCache (LayerReader m) a) -> Layer -> m a
|
||||||
withFonts f layer = runReaderT (evalStateT (f layer) Id.empty) layer
|
withFonts f layer = runReaderT (evalStateT (f layer) Id.empty) layer
|
||||||
|
|
||||||
withResources :: Except m => (Page -> ReaderT FontSet m b) -> Page -> FontCache m b
|
withResources :: (Alternative m, MonadError String m) => (Page -> ReaderT FontSet m b) -> Page -> FontCache (LayerReader m) b
|
||||||
withResources f p =
|
withResources f p =
|
||||||
loadResources (resources p) >>= lift . lift . runReaderT (f p)
|
loadResources (resources p) >>= lift . lift . runReaderT (f p)
|
||||||
|
|
Loading…
Reference in a new issue