Compare commits

..

No commits in common. "extract-text" and "main" have entirely different histories.

34 changed files with 334 additions and 2224 deletions

1
.gitignore vendored
View file

@ -1,3 +1,2 @@
dist*/
.ghc.environment.*
cabal.project.local

View file

@ -16,40 +16,20 @@ extra-source-files: ChangeLog.md
cabal-version: >=1.10
library
exposed-modules: Data.OrderedMap
, Data.Id
, PDF
, PDF.Box
, PDF.CMap
, PDF.Content
, PDF.Content.Operator
, PDF.Content.Operator.Color
, PDF.Content.Operator.Common
, PDF.Content.Operator.GraphicState
, PDF.Content.Operator.Path
, PDF.Content.Operator.Text
, PDF.Content.Text
exposed-modules: PDF
, PDF.EOL
, PDF.Layer
, PDF.Object
, PDF.Object.Navigation
, PDF.Output
, PDF.Parser
, PDF.Pages
, PDF.Update
other-modules: Data.ByteString.Char8.Util
, PDF.Body
, PDF.Encoding
, PDF.Encoding.MacRoman
, PDF.Font
, PDF.Parser
-- other-extensions:
build-depends: attoparsec
, base >=4.9 && <4.13
, bytestring
, containers
, mtl
, text
, utf8-string
, zlib
hs-source-dirs: src
ghc-options: -Wall
default-language: Haskell2010
@ -59,7 +39,7 @@ executable equivalent
build-depends: base
, bytestring
, Hufflepdf
ghc-options: -Wall -rtsopts
ghc-options: -Wall
default-language: Haskell2010
executable getObj
@ -68,63 +48,6 @@ executable getObj
, bytestring
, containers
, Hufflepdf
, mtl
ghc-options: -Wall -rtsopts
default-language: Haskell2010
executable getText
main-is: examples/getText.hs
build-depends: base
, bytestring
, containers
, ExceptIOH
, Hufflepdf
, mtl
, text
ghc-options: -Wall -rtsopts
default-language: Haskell2010
executable pdfCut
main-is: examples/pdfCut.hs
build-depends: base
, bytestring
, containers
, ExceptIOH
, filepath
, Hufflepdf
, mtl
, text
ghc-options: -Wall -rtsopts
default-language: Haskell2010
executable fixMermoz
main-is: examples/fixMermoz.hs
build-depends: base
, bytestring
, ExceptIOH
, Hufflepdf
, mtl
ghc-options: -Wall -rtsopts
default-language: Haskell2010
Test-Suite unitTests
type: exitcode-stdio-1.0
main-is: Main.hs
other-modules: Object
, Data.ByteString.Char8.Util
, Data.Id
, PDF.EOL
, PDF.Parser
, PDF.Object
, PDF.Output
hs-source-dirs: test
, src
build-depends: attoparsec
, base >=4.9 && <4.13
, bytestring
, containers
, Hufflepdf
, HUnit
, mtl
, text
, zlib
ghc-options: -Wall
default-language: Haskell2010

View file

@ -1,11 +0,0 @@
main :: IO ()
main = do
hSetBuffering stdout LineBuffering
args <- getArgs
case args of
--[inputFile] -> wholeDoc inputFile
[inputFile, pageNumber] -> do
content <- fmap (unify . updates) . parseDocument <$> BS.readFile inputFile
get content (read pageNumber)
singlePage inputFile (read pageNumber)
_ -> die "Syntax: getContent INPUT_FILE PAGE_NUMBER"

View file

@ -1,52 +1,53 @@
{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE FlexibleContexts #-}
import Control.Monad.Reader (ReaderT, runReaderT)
import Codec.Compression.Zlib (decompress)
import Data.ByteString.Lazy.Char8 (ByteString)
import qualified Data.ByteString.Char8 as BS (readFile)
import qualified Data.ByteString.Lazy.Char8 as Lazy (putStrLn)
import Data.Id (Id(..))
import qualified Data.ByteString.Lazy.Char8 as Lazy (fromStrict, putStr, toStrict)
import Data.Map ((!?))
import qualified Data.Map as Map (keys, lookup)
import PDF (Document(..), parseDocument)
import PDF.Box (Box(..))
import PDF.Layer (Layer(..), unify)
import PDF.Object (Object(..))
import PDF.Object.Navigation (
Nav(..), PPath(..), StreamContent(..), (//), objectById, catalog
)
import PDF.Output (Output)
import qualified PDF.EOL as EOL (Style)
import PDF.Object (Content(..), DirectObject(..), Object(..), Name(..))
import PDF.Output (ObjectId(..))
import qualified PDF.Output as Output (render)
import Prelude hiding (fail)
import PDF.Update (unify)
import System.Environment (getArgs, getProgName)
import System.Exit (die)
import Text.Printf (printf)
import Text.Read (readMaybe)
decodedStream :: Object -> Object
decodedStream object =
either (const object) id $ r Clear object >>= flip (w Raw) object
display :: EOL.Style -> Object -> ByteString
display eolStyle d@(Direct _) = Output.render eolStyle d
display eolStyle s@(Stream {header, streamContent}) = Output.render eolStyle $
case Map.lookup (Name "Filter") header of
Just (NameObject (Name "FlateDecode")) -> Stream {
header
, streamContent = Lazy.toStrict . decompress $ Lazy.fromStrict streamContent
}
_ -> s
display :: Functor m => Output a => ReaderT Layer m a -> Document -> m ByteString
display getter (Document {eolStyle, layers}) =
Output.render eolStyle <$> runReaderT getter (unify layers)
parse :: [String] -> IO (FilePath, Document -> Either String ByteString)
parse [inputFile] = return (inputFile, display $ value <$> catalog)
parse [inputFile, key] =
return (inputFile, clear . maybe (byPath key) byId $ readMaybe key)
extractObject :: ObjectId -> Document -> Either String ByteString
extractObject objectId (Document {eolStyle, updates}) =
case objects content !? objectId of
Nothing -> Left $ "No object has ID " ++ show (getObjectId objectId)
Just o -> Right $ display eolStyle o
where
byId = objectById . Id
byPath path = (catalog // PPath (explode path))
explode "" = []
explode path =
case break (== '.') path of
(name, "") -> [name]
(name, rest) -> name : explode (drop 1 rest)
clear = display . fmap (decodedStream . value)
parse _ =
die . printf "Syntax: %s inputFile [OBJECT_ID | PATH_TO_OBJ]\n" =<< getProgName
content = unify updates
listObjectIds :: Document -> Either String [String]
listObjectIds =
Right . prependTitle . toString . Map.keys . objects . unify . updates
where
toString = fmap (show . getObjectId)
prependTitle = ("ObjectIds defined in this PDF:":)
main :: IO ()
main = do
(inputFile, getData) <- parse =<< getArgs
input <- BS.readFile inputFile
either die Lazy.putStrLn $ (parseDocument input >>= getData)
either die id $ (parseDocument input >>= getData)
where
parse [inputFile] = return (inputFile, fmap (mapM_ putStrLn) . listObjectIds)
parse [inputFile, objectId] = return
(inputFile, fmap Lazy.putStr . extractObject (ObjectId (read objectId)))
parse _ = die . printf "Syntax: %s inputFile [OBJECT_ID]\n" =<< getProgName

View file

@ -1,54 +0,0 @@
{-# LANGUAGE FlexibleContexts #-}
import Control.Monad ((>=>))
import Control.Monad.Except (ExceptT(..))
import Control.Monad.Except.IOH (handle)
import Control.Monad.IO.Class (liftIO)
import qualified Data.ByteString.Char8 as BS (readFile)
import Data.Id (Id(..), mapWithKey)
import qualified Data.Map as Map (mapWithKey)
import Data.OrderedMap (mapi)
import qualified Data.Text as Text (unpack)
import PDF (UnifiedLayers(..), parseDocument)
import PDF.Box (Box(..))
import PDF.Content.Text (Chunks(..))
import PDF.Layer (Layer, LayerReader)
import PDF.Pages (
Contents(..), FontCache, Page(..), PageNumber(..), Pages(..), withFonts
, withResources
)
import System.Environment (getArgs)
import System.Exit (die)
import System.IO (BufferMode(..), hSetBuffering, stdout)
import Text.Printf (printf)
displayPage :: Int -> Page -> FontCache (LayerReader (ExceptT String IO)) ()
displayPage n = withResources (
r Contents
>=> sequence_ . mapi (\objectId ->
r Chunks >=> sequence_ . mapWithKey (display objectId)
)
)
where
display a b v =
liftIO . putStrLn $
printf "p#%d obj#%d instr#%d: %s" n (getId a) (getId b) (Text.unpack v)
getAll :: Layer -> ExceptT String IO ()
getAll = withFonts $ r Pages >=> sequence_ . Map.mapWithKey displayPage
get :: Int -> Layer -> ExceptT String IO ()
get n = withFonts $ r (P n) >=> displayPage n
onDoc :: FilePath -> (Layer -> ExceptT String IO ()) -> ExceptT String IO ()
onDoc inputFile f =
ExceptT (parseDocument <$> BS.readFile inputFile) >>= r UnifiedLayers >>= f
main :: IO ()
main = do
hSetBuffering stdout LineBuffering
args <- getArgs
case args of
[inputFile] -> onDoc inputFile getAll `handle` die
[inputFile, pageNumber] ->
onDoc inputFile (get $ read pageNumber) `handle` die
_ -> die "Syntax: getText INPUT_FILE [PAGE_NUMBER]"

View file

@ -1,41 +0,0 @@
import Control.Monad.State (MonadState(..), evalStateT)
import Control.Monad.Except.IOH (handle)
import Control.Monad.Trans (lift)
import qualified Data.ByteString.Char8 as BS (readFile)
import qualified Data.ByteString.Lazy.Char8 as Lazy (writeFile)
import Data.Map ((!), fromSet)
import qualified Data.Set as Set (fromList)
import PDF (Document, UnifiedLayers(..), parseDocument, render)
import PDF.Box (at)
import PDF.Pages (Pages(..))
import System.Environment (getArgs)
import System.Exit (die)
import System.FilePath (replaceBaseName, takeBaseName)
import Text.Read (readEither)
parseRange :: String -> Either String (Int, Int)
parseRange = evalStateT $ do
from <- lift . readEither =<< state (break (== '-'))
get >>= makeRange from
where
makeRange from "" = return (from, from)
makeRange from (_:to) = (,) from <$> lift (readEither to)
cut :: (Int, Int) -> Document -> IO Document
cut (p1, p2) doc = ((
at UnifiedLayers .at Pages $ \pages ->
return $ fromSet (pages!) $ Set.fromList [pMin .. pMax]
) doc) `handle` die
where
(pMin, pMax) = (min p1 p2, max p1 p2)
main :: IO ()
main = do
[inputPath, inputRange] <- getArgs
range <- catchLeft $ parseRange inputRange
doc <- catchLeft =<< parseDocument <$> BS.readFile inputPath
Lazy.writeFile (outputPath inputPath inputRange) . render =<< cut range doc
where
catchLeft = either die return
outputPath fileName range =
replaceBaseName fileName (takeBaseName fileName ++ "_p" ++ range)

View file

@ -1,91 +1,16 @@
module Data.ByteString.Char8.Util (
B16Int(..)
, B256Int(..)
, b8ToInt
, b16ToBytes
, b16ToInt
, b256ToInt
, intToB256
, previous
previous
, subBS
, toBytes
, unescape
, utf16BEToutf8
) where
import Data.ByteString (ByteString, snoc)
import qualified Data.ByteString as BS (empty, foldl', length, pack, singleton, splitAt)
import qualified Data.ByteString.Char8 as Char8 (
cons, drop, index, splitAt, take, uncons, unpack
)
import Data.Text (Text)
import Data.Text.Encoding (decodeUtf16BE)
import Data.ByteString.Char8 (ByteString)
import qualified Data.ByteString.Char8 as BS (drop, index, take)
import Prelude hiding (length)
import Text.Printf (printf)
newtype B8Int = B8Int ByteString deriving (Eq, Show)
newtype B16Int = B16Int ByteString deriving (Eq, Show)
newtype B256Int = B256Int ByteString deriving (Eq, Show)
previous :: Char -> Int -> ByteString -> Int
previous char position byteString
| Char8.index byteString position == char = position
| BS.index byteString position == char = position
| otherwise = previous char (position - 1) byteString
subBS :: Int -> Int -> ByteString -> ByteString
subBS offset length = Char8.take length . Char8.drop offset
intToB256 :: Int -> B256Int
intToB256 n
| n < 0x100 = B256Int . BS.singleton $ toEnum n
| otherwise =
let B256Int begining = intToB256 (n `div` 0x100) in
B256Int $ begining `snoc` (toEnum (n `mod` 0x100))
b256ToInt :: B256Int -> Int
b256ToInt (B256Int n) = BS.foldl' (\k w -> 0x100*k + fromEnum w) 0 n
toBytes :: Int -> Int -> ByteString
toBytes 0 _ = BS.empty
toBytes size n =
(toBytes (size - 1) (n `div` 0x100)) `snoc` (toEnum (n `mod` 0x100))
b16ToBytes :: B16Int -> ByteString
b16ToBytes (B16Int n) = BS.pack . fmap b16ToInt $ pairDigits n
where
pairDigits s =
case BS.length s of
0 -> []
1 -> [B16Int s]
_ ->
let (twoHexDigits, rest) = BS.splitAt 2 s in
(B16Int $ twoHexDigits):(pairDigits rest)
fromBase :: (Num a, Read a) => Char -> ByteString -> a
fromBase b = read . printf "0%c%s" b . Char8.unpack
b16ToInt :: (Num a, Read a) => B16Int -> a
b16ToInt (B16Int n) = fromBase 'x' n
b8ToInt :: (Num a, Read a) => B8Int -> a
b8ToInt (B8Int n) = fromBase 'o' n
unescape :: ByteString -> ByteString
unescape escapedBS =
case Char8.uncons escapedBS of
Nothing -> BS.empty
Just ('\\', s) -> unescapeChar s
Just (c, s) -> Char8.cons c (unescape s)
where
unescapeChar s =
case Char8.uncons s of
Nothing -> BS.empty
Just (c, s')
| c `elem` "()" -> Char8.cons c (unescape s')
| c `elem` "nrtbf" -> Char8.cons (read (printf "'\\%c'" c)) (unescape s')
| c `elem` ['0'..'7'] -> fromOctal (Char8.splitAt 3 s)
| otherwise -> Char8.cons c (unescape s')
fromOctal (code, s) = Char8.cons (toEnum $ b8ToInt (B8Int code)) (unescape s)
utf16BEToutf8 :: ByteString -> Text
utf16BEToutf8 = decodeUtf16BE
subBS offset length = BS.take length . BS.drop offset

View file

@ -1,95 +0,0 @@
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE DeriveTraversable #-}
{-# LANGUAGE FlexibleContexts #-}
module Data.Id (
Id(..)
, IdMap
, Indexed
, at
, delete
, empty
, filterWithKey
, fromList
, insert
, keysSet
, lookup
, mapWithKey
, member
, minViewWithKey
, register
, singleton
, size
, toList
, union
) where
import Control.Monad.State.Strict (MonadState, modify, gets)
import Data.IntMap (IntMap, (!))
import qualified Data.IntMap as IntMap (
delete, empty, filterWithKey, fromList, keysSet, insert, lookup, mapWithKey
, maxViewWithKey, member, minViewWithKey, size, toList, union
)
import Data.IntSet (IntSet)
import Prelude hiding (lookup)
newtype Id a = Id {
getId :: Int
} deriving (Eq, Enum, Ord, Show)
newtype IdMap a b = IdMap {
intMap :: IntMap b
} deriving (Show, Functor, Semigroup, Monoid, Foldable, Traversable)
type Indexed a = IdMap a a
at :: IdMap a b -> Id a -> b
at (IdMap idMap) = (idMap !) . getId
lookup :: Id a -> IdMap a b -> Maybe b
lookup (Id a) (IdMap idMap) = IntMap.lookup a idMap
size :: IdMap a b -> Int
size = IntMap.size . intMap
member :: Id a -> IdMap a b -> Bool
member (Id a) (IdMap idMap) = IntMap.member a idMap
empty :: IdMap a b
empty = IdMap {intMap = IntMap.empty}
singleton :: Id a -> b -> IdMap a b
singleton a b = fromList [(a, b)]
insert :: Id a -> b -> IdMap a b -> IdMap a b
insert (Id a) b (IdMap idMap) = IdMap {intMap = IntMap.insert a b idMap}
delete :: Id a -> IdMap a b -> IdMap a b
delete (Id a) (IdMap idMap) = IdMap {intMap = IntMap.delete a idMap}
minViewWithKey :: IdMap a b -> Maybe ((Id a, b), IdMap a b)
minViewWithKey = fmap wrap . IntMap.minViewWithKey . intMap
where
wrap ((key, b), idMap) = ((Id key, b), IdMap idMap)
union :: IdMap a b -> IdMap a b -> IdMap a b
union (IdMap intMap1) (IdMap intMap2) =
IdMap {intMap = IntMap.union intMap1 intMap2}
mapWithKey :: (Id a -> b -> c) -> IdMap a b -> IdMap a c
mapWithKey f (IdMap idMap) = IdMap {intMap = IntMap.mapWithKey (f . Id) idMap}
filterWithKey :: (Id a -> b -> Bool) -> IdMap a b -> IdMap a b
filterWithKey f = IdMap . IntMap.filterWithKey (f . Id) . intMap
fromList :: [(Id a, b)] -> IdMap a b
fromList = IdMap . IntMap.fromList . fmap (\(key, b) -> (getId key, b))
toList :: IdMap a b -> [(Id a, b)]
toList = fmap (\(key, b) -> (Id key, b)) . IntMap.toList . intMap
keysSet :: IdMap a b -> IntSet
keysSet = IntMap.keysSet . intMap
register :: MonadState (IdMap a b) m => b -> m (Id a)
register b = do
newId <- gets (Id . maybe 0 ((+1) . fst . fst) . IntMap.maxViewWithKey . intMap)
modify (insert newId b)
return newId

View file

@ -1,79 +0,0 @@
{-# LANGUAGE NamedFieldPuns #-}
module Data.OrderedMap (
OrderedMap
, build
, elems
, fromList
, get
, keys
, lookup
, mapi
, set
, toList
) where
import Data.Map (Map, (!), mapWithKey)
import qualified Data.Map as Map (fromList, insert, lookup, member)
import Prelude hiding (lookup)
data OrderedMap k a = OrderedMap {
assoc :: Map k a
, keys :: [k]
}
instance (Ord k, Show k, Show a) => Show (OrderedMap k a) where
show = show . toList
instance Functor (OrderedMap k) where
fmap f orderedMap = orderedMap {assoc = fmap f (assoc orderedMap)}
instance Ord k => Foldable (OrderedMap k) where
foldMap f (OrderedMap {assoc, keys}) = foldMap f $ (assoc !) <$> keys
instance Ord k => Traversable (OrderedMap k) where
sequenceA (OrderedMap {assoc, keys}) =
(flip OrderedMap keys) <$> sequenceA assoc
elems :: Ord k => OrderedMap k a -> [a]
elems (OrderedMap {assoc, keys}) = (assoc !) <$> keys
toList :: Ord k => OrderedMap k a -> [(k, a)]
toList (OrderedMap {assoc, keys}) = (\k -> (k, assoc ! k)) <$> keys
fromList :: Ord k => [(k, a)] -> OrderedMap k a
fromList keyValueList = OrderedMap {
assoc = Map.fromList keyValueList
, keys = fst <$> keyValueList
}
build :: Ord k => (k -> a) -> [k] -> OrderedMap k a
build f keys = OrderedMap {
assoc = Map.fromList $ (\k -> (k, f k)) <$> keys
, keys
}
get :: Ord k => k -> OrderedMap k a -> a
get k = (! k) . assoc
lookup :: Ord k => k -> OrderedMap k a -> Maybe a
lookup k = (Map.lookup k) . assoc
set :: Ord k => k -> a -> OrderedMap k a -> OrderedMap k a
set k v orderedMap@(OrderedMap {assoc})
| Map.member k assoc = orderedMap {assoc = Map.insert k v assoc}
| otherwise = orderedMap
mapi :: Ord k => (k -> a -> b) -> OrderedMap k a -> OrderedMap k b
mapi f orderedMap = orderedMap {
assoc = mapWithKey f $ assoc orderedMap
}
{-
cons :: Ord k => k -> a -> OrderedMap k a -> OrderedMap k a
cons k a orderedMap =
snoc :: Ord k => k -> a -> OrderedMap k a -> OrderedMap k a
alter :: Ord k => (Maybe a -> Maybe a) -> k -> OrderedMap k a -> OrderedMap k a
alter
-}

View file

@ -1,11 +1,6 @@
{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE MultiParamTypeClasses #-}
module PDF (
Document(..)
, EOLStyle(..)
, Layers(..)
, UnifiedLayers(..)
, parseDocument
, render
) where
@ -18,45 +13,27 @@ import Data.ByteString.Char8.Util (previous, subBS)
import qualified Data.ByteString.Lazy as Lazy (ByteString)
import qualified Data.Map as Map (lookup)
import PDF.Body (populate)
import PDF.Box (Box(..))
import qualified PDF.EOL as EOL (Style(..), charset, parser)
import PDF.Layer (Layer, unify)
import PDF.Object (
DirectObject(..), InputStructure(..), Name(..), Number(..)
Content(..), DirectObject(..), InputStructure(..), Name(..), Number(..)
, Structure(..)
, eofMarker, magicNumber, structure
)
import qualified PDF.Output as Output (render, line)
import PDF.Output (Output(..))
import PDF.Parser (Parser, evalParser, string, takeAll)
import PDF.Parser (Parser, runParser, string, takeAll)
import Text.Printf (printf)
data Document = Document {
pdfVersion :: String
, eolStyle :: EOL.Style
, layers :: [Layer]
, updates :: [Content]
} deriving Show
instance Output Document where
output (Document {pdfVersion, layers}) =
output (Document {pdfVersion, updates}) =
Output.line (printf "%%PDF-%s" pdfVersion)
`mappend` output layers
data EOLStyle = EOLStyle
data Layers = Layers
data UnifiedLayers = UnifiedLayers
instance Monad m => Box m EOLStyle Document EOL.Style where
r EOLStyle = return . eolStyle
w EOLStyle eolStyle document = return $ document {eolStyle}
instance Monad m => Box m UnifiedLayers Document Layer where
r UnifiedLayers = return . unify . layers
w UnifiedLayers layer = w Layers [layer]
instance Monad m => Box m Layers Document [Layer] where
r Layers = return . layers
w Layers layers document = return $ document {layers = layers}
`mappend` output updates
render :: Document -> Lazy.ByteString
render document@(Document {eolStyle}) = Output.render eolStyle document
@ -106,7 +83,7 @@ findNextSection offset input =
readStructures :: Int -> ByteString -> Either String [InputStructure]
readStructures startXref input =
evalParser structure () (BS.drop startXref input) >>= stopOrFollow
runParser structure () (BS.drop startXref input) >>= stopOrFollow
where
stopOrFollow s@(Structure {trailer}) =
case Map.lookup (Name "Prev") trailer of
@ -119,8 +96,8 @@ readStructures startXref input =
parseDocument :: ByteString -> Either String Document
parseDocument input = do
(pdfVersion, eolStyle) <- evalParser ((,) <$> version <*> EOL.parser) () input
(pdfVersion, eolStyle) <- runParser ((,) <$> version <*> EOL.parser) () input
startXref <- readStartXref eolStyle input
structuresRead <- readStructures startXref input
let layers = populate input <$> structuresRead
return $ Document {pdfVersion, eolStyle, layers}
let updates = populate input <$> structuresRead
return $ Document {pdfVersion, eolStyle, updates}

View file

@ -6,22 +6,19 @@ module PDF.Body (
import Control.Applicative ((<|>))
import Control.Monad.State (get, gets, modify)
import Data.Attoparsec.ByteString.Char8 (option)
import Data.ByteString.Char8 (ByteString)
import qualified Data.ByteString.Char8 as BS (cons, drop, unpack)
import Data.Id (Id(..), at, empty)
import qualified Data.Id as Id (insert, lookup)
import qualified Data.Map as Map (lookup)
import Data.Map ((!))
import qualified Data.Map as Map (empty, insert, lookup)
import qualified PDF.EOL as EOL (charset, parser)
import PDF.Layer (Layer(..))
import PDF.Object (
DirectObject(..), Flow(..), IndirectObjCoordinates(..)
Content(..), DirectObject(..), Flow(..), IndirectObjCoordinates(..)
, InputStructure(..), Name(..), Number(..), Object(..), Occurrence(..)
, Structure(..), XRefEntry(..), XRefSection
, blank, dictionary, directObject, integer, line
)
import PDF.Output (Offset(..))
import PDF.Parser (MonadParser(..), Parser, (<?>), evalParser, on)
import PDF.Output (ObjectId(..), Offset(..))
import PDF.Parser (Parser, (<?>), block, char, on, option, runParser, takeAll)
data UserState = UserState {
input :: ByteString
@ -34,9 +31,9 @@ type SParser = Parser UserState
modifyFlow :: (Flow -> Flow) -> SParser ()
modifyFlow f = modify $ \state -> state {flow = f $ flow state}
addObject :: (Id Object) -> Object -> SParser ()
addObject :: ObjectId -> Object -> SParser ()
addObject objectId newObject = modifyFlow $ \flow -> flow {
tmpObjects = Id.insert objectId newObject $ tmpObjects flow
tmpObjects = Map.insert objectId newObject $ tmpObjects flow
}
pushOccurrence :: Occurrence -> SParser ()
@ -49,10 +46,10 @@ comment = BS.unpack <$> (option "" afterPercent <* EOL.parser)
where
afterPercent = BS.cons <$> char '%' <*> takeAll (not . (`elem` EOL.charset))
lookupOffset :: (Id Object) -> SParser Offset
lookupOffset :: ObjectId -> SParser Offset
lookupOffset objectId = do
table <- gets xreferences
case Id.lookup objectId table >>= entryOffset of
case Map.lookup objectId table >>= entryOffset of
Nothing -> fail $
"obj " ++ show objectId ++ " is referenced but missing in XRef table"
Just offset -> return offset
@ -60,12 +57,12 @@ lookupOffset objectId = do
entryOffset (InUse {offset}) = Just offset
entryOffset _ = Nothing
loadNumber :: (Id Object) -> SParser Double
loadNumber :: ObjectId -> SParser Double
loadNumber objectId = do
offset <- getOffset <$> lookupOffset objectId
objectStart <- BS.drop offset <$> gets input
indirectObjCoordinates `on` (objectStart :: ByteString) >> return ()
objectValue <- (`at` objectId) . tmpObjects <$> gets flow
objectValue <- (!objectId) . tmpObjects <$> gets flow
case objectValue of
Direct (NumberObject (Number n)) -> return n
obj -> fail $ "Expected obj@" ++ show offset ++ " (" ++ show obj ++ ") to be a Number"
@ -78,7 +75,7 @@ getSize Nothing = fail "Missing '/Length' key on stream"
getSize (Just (NumberObject (Number size))) = return size
getSize (Just (Reference (IndirectObjCoordinates {objectId}))) = do
Flow {tmpObjects} <- gets flow
case Id.lookup objectId tmpObjects of
case Map.lookup objectId tmpObjects of
Nothing -> loadNumber objectId
Just (Direct (NumberObject (Number size))) -> return size
Just v -> fail $
@ -99,7 +96,7 @@ object = streamObject <|> Direct <$> directObject
indirectObjCoordinates :: SParser IndirectObjCoordinates
indirectObjCoordinates = do
objectId <- Id <$> integer
objectId <- ObjectId <$> integer
coordinates <- IndirectObjCoordinates objectId <$> integer
objectValue <- line "obj" *> object <* blank <* line "endobj"
addObject objectId objectValue
@ -109,14 +106,14 @@ occurrence :: SParser Occurrence
occurrence =
Comment <$> comment <|> Indirect <$> indirectObjCoordinates <?> "comment or object"
populate :: ByteString -> InputStructure -> Layer
populate :: ByteString -> InputStructure -> Content
populate input structure =
let bodyInput = BS.drop (startOffset structure) input in
case evalParser recurseOnOccurrences initialState bodyInput of
Left _ -> Layer {occurrences = [], objects = empty, docStructure}
case runParser recurseOnOccurrences initialState bodyInput of
Left _ -> Content {occurrences = [], objects = Map.empty, docStructure}
Right finalState ->
let Flow {occurrencesStack, tmpObjects} = flow finalState in
Layer {
Content {
occurrences = reverse occurrencesStack, objects = tmpObjects, docStructure
}
where
@ -124,7 +121,7 @@ populate input structure =
xreferences = xRef docStructure
initialState = UserState {
input, xreferences, flow = Flow {
occurrencesStack = [], tmpObjects = empty
occurrencesStack = [], tmpObjects = Map.empty
}
}

View file

@ -1,80 +0,0 @@
{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE FunctionalDependencies #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE UndecidableInstances #-}
{-# LANGUAGE ScopedTypeVariables #-}
module PDF.Box (
Box(..)
, Index(..)
, Maybe_(..)
, Either_(..)
, at
, atAll
, edit
, runRO
) where
import Control.Monad.Except (ExceptT(..), runExceptT)
import Control.Monad.Fail (MonadFail(..))
import Control.Monad.Reader (ReaderT, runReaderT)
import Control.Monad.State (MonadState(..))
import Data.Id (Id, IdMap)
import qualified Data.Id as Id (insert, lookup)
import Data.Map (Map)
import qualified Data.Map as Map (insert, lookup)
import Data.OrderedMap (OrderedMap)
import qualified Data.OrderedMap as OrderedMap (lookup, set)
import Prelude hiding (fail)
runRO :: MonadState s m => ReaderT s m a -> m a
runRO ro = get >>= runReaderT ro
newtype Index = Index Int
newtype Maybe_ x = Maybe_ x
newtype Either_ b x = Either_ x
class Monad m => Box m i a b | m a i -> b where
r :: i -> a -> m b
w :: i -> b -> a -> m a
at :: Box m i a b => i -> (b -> m b) -> a -> m a
at i f a = r i a >>= f >>= flip (w i) a
atAll :: (Traversable t, Monad m, Box m i a (t b)) => i -> (b -> m b) -> a -> m a
atAll i f = at i $ (mapM f)
edit :: MonadState a m => (a -> m a) -> m ()
edit f = get >>= f >>= put
instance MonadFail m => Box m Index [a] a where
r (Index i) [] = fail $ "Index out of bounds " ++ show i
r (Index 0) (x:_) = return x
r (Index i) (_:xs) = r (Index (i-1)) xs
w (Index i) _ [] = fail $ "Index out of bounds " ++ show i
w (Index 0) newX (_:xs) = return (newX:xs)
w (Index i) newX (x:xs) = (x:) <$> w (Index (i-1)) newX xs
instance (Ord k, MonadFail m) => Box m k (Map k a) a where
r k = maybe (fail "Unknown key") return . Map.lookup k
w k a = return . Map.insert k a
instance (Ord k, MonadFail m) => Box m k (OrderedMap k a) a where
r k = maybe (fail "Unknown key") return . OrderedMap.lookup k
w k a orderedMap = r k orderedMap >> return (OrderedMap.set k a orderedMap)
instance MonadFail m => Box m (Id k) (IdMap k a) a where
r k = maybe (fail "Unknown key") return . Id.lookup k
w k a = return . Id.insert k a
instance (Monad m, Box Maybe i a b) => Box m (Maybe_ i) a (Maybe b) where
r (Maybe_ i) = return . r i
w (Maybe_ i) (Just b) a =
return . maybe a id $ w i b a
w _ _ obj = return obj
instance (Monad m, Box (ExceptT e m) i a b) => Box m (Either_ e i) a (Either e b) where
r (Either_ i) a = runExceptT (r i a)
w (Either_ i) (Right b) a =
either (const a) id <$> runExceptT (w i b a :: ExceptT e m a)
w _ _ a = return a

View file

@ -1,196 +0,0 @@
{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE FlexibleContexts #-}
module PDF.CMap (
CMap
, CMappers
, CRange(..)
, matches
, parse
) where
import Control.Applicative ((<|>), many)
import Control.Monad.Except (MonadError(..))
import Control.Monad.Fail (fail)
import Control.Monad.State (modify)
import Data.Attoparsec.ByteString.Char8 (count)
import Data.ByteString (ByteString)
import qualified Data.ByteString as BS (drop, length, null, take)
import Data.ByteString.Char8 (unpack)
import Data.ByteString.Char8.Util (
B16Int(..), b16ToBytes, b16ToInt, toBytes, utf16BEToutf8
)
import Data.Foldable (foldl', foldr')
import Data.Map (Map, mapWithKey, union)
import qualified Data.Map as Map (
adjust, empty, fromList, insert, insertWith, lookup, toList
)
import Data.Text (Text)
import qualified Data.Text as Text (length, null, splitAt)
import qualified PDF.EOL as EOL (charset, parser)
import PDF.Font (Decoder, Encoder, Font(..))
import PDF.Object (
DirectObject(..), Name, StringObject(..)
, blank, directObject, integer, line, stringObject
)
import PDF.Parser (MonadParser, Parser, runParser, takeAll)
import Prelude hiding (fail)
type CMappers = Map Name CMap
type ToUnicode = Map ByteString Text
type FromUnicode = Map Text ByteString
data CRange = CRange {
fromSequence :: ByteString
, toSequence :: ByteString
, toUnicode :: ToUnicode
} deriving Show
type Size = Int
data CMap = CMap {
toUnicodeBySize :: Map Size [CRange]
, fromUnicodeBySize :: Map Size FromUnicode
}
toFont :: CMap -> Font
toFont (CMap {toUnicodeBySize, fromUnicodeBySize}) =
Font {decode = decoder toUnicodeBySize, encode = encoder fromUnicodeBySize}
decoder :: Map Size [CRange] -> Decoder
decoder rangesBySize input
| BS.null input = Right ""
| otherwise = do
(output, remainingInput) <- trySizes input $ Map.toList rangesBySize
mappend output <$> decoder rangesBySize remainingInput
where
trySizes s [] = Left $ "No matching code found in font for " ++ unpack s
trySizes s ((size, cRanges):others) =
let prefix = BS.take size s in
case tryRanges prefix cRanges of
Nothing -> trySizes s others
Just outputSequence -> Right (outputSequence, BS.drop size s)
tryRanges :: ByteString -> [CRange] -> Maybe Text
tryRanges _ [] = Nothing
tryRanges prefix ((CRange {toUnicode}):cRanges) =
case Map.lookup prefix toUnicode of
Nothing -> tryRanges prefix cRanges
outputSequence -> outputSequence
encoder :: Map Size FromUnicode -> Encoder
encoder fromUnicodes input
| Text.null input = Right ""
| otherwise =
foldr' (<>) (Left "No encoding found") $ mapWithKey tryOn fromUnicodes
where
tryOn size fromUnicode =
let (prefix, end) = Text.splitAt size input in
case Map.lookup prefix fromUnicode of
Nothing -> Left ""
Just code -> (code <>) <$> encoder fromUnicodes end
matches :: ByteString -> CRange -> Bool
matches code (CRange {fromSequence, toSequence}) =
fromSequence <= code && code <= toSequence
parse :: MonadError String m => ByteString -> m Font
parse = either throwError (return . toFont . snd) . runParser
(many (codeRanges <|> cMapRange <|> cMapChar <|> ignoredLine))
emptyCMap
where
ignoredLine =
takeAll (not . (`elem` EOL.charset)) *> EOL.parser *> return ()
emptyCMap = CMap {toUnicodeBySize = Map.empty, fromUnicodeBySize = Map.empty}
codeRanges :: Parser CMap ()
codeRanges = do
size <- integer <* line "begincodespacerange"
mapM_ createMapping =<< count size codeRange
line "endcodespacerange"
where
codeRange =
(,) <$> stringObject <* blank <*> stringObject <* EOL.parser
createMapping :: (StringObject, StringObject) -> Parser CMap ()
createMapping (Hexadecimal from, Hexadecimal to) = modify $ \cmap -> cmap {
toUnicodeBySize = Map.insertWith (++) size [cRange] (toUnicodeBySize cmap)
}
where
fromSequence = b16ToBytes from
size = BS.length fromSequence
toSequence = b16ToBytes to
cRange = CRange {fromSequence, toSequence, toUnicode = Map.empty}
createMapping _ = return ()
cMapRange :: Parser CMap ()
cMapRange = do
size <- integer <* line "beginbfrange"
mapM_ saveMapping =<< count size rangeMapping
line "endbfrange"
where
rangeMapping = do
from <- (stringObject <* blank)
to <- (stringObject <* blank)
mapFromTo from to =<< directObject <* EOL.parser
saveMapping :: [(ByteString, Text)] -> Parser CMap ()
saveMapping assoc =
modify $ \(CMap {toUnicodeBySize, fromUnicodeBySize}) -> CMap {
toUnicodeBySize = saveToUnicodeBySize assoc toUnicodeBySize
, fromUnicodeBySize = saveFromUnicodeBySize reversed fromUnicodeBySize
}
where
reversed = (\(a, b) -> (b, a)) <$> assoc
saveToUnicodeBySize :: [(ByteString, Text)] -> Map Size [CRange] -> Map Size [CRange]
saveToUnicodeBySize [] = id
saveToUnicodeBySize assoc@((code, _):_) = Map.adjust insertCRange (BS.length code)
where
newMapping = Map.fromList assoc
appendMapping cRange =
cRange {toUnicode = toUnicode cRange `union` newMapping}
insertCRange = fmap (\cRange ->
if code `matches` cRange then appendMapping cRange else cRange
)
saveFromUnicodeBySize :: [(Text, ByteString)] -> Map Size FromUnicode -> Map Size FromUnicode
saveFromUnicodeBySize = flip (foldl' insertFromUnicode)
where
insertFromUnicode :: Map Size FromUnicode -> (Text, ByteString) -> Map Size FromUnicode
insertFromUnicode tmpFromUnicodeBySize (unicodeSequence, code) =
let size = Text.length unicodeSequence in
Map.adjust (Map.insert unicodeSequence code) size tmpFromUnicodeBySize
cMapChar :: Parser CMap ()
cMapChar = do
size <- integer <* line "beginbfchar"
saveMapping =<< count size charMapping <* line "endbfchar"
where
charMapping = do
from <- stringObject <* blank
pairMapping from =<< stringObject <* EOL.parser
between :: B16Int -> B16Int -> [ByteString]
between from@(B16Int s) to =
let size = BS.length s `div` 2 in
toBytes size <$> [b16ToInt from .. b16ToInt to]
startFrom :: B16Int -> [ByteString]
startFrom from@(B16Int s) =
let size = BS.length s `div` 2 in
toBytes size <$> [b16ToInt from .. ]
mapFromTo :: MonadParser m => StringObject -> StringObject -> DirectObject -> m [(ByteString, Text)]
mapFromTo (Hexadecimal from) (Hexadecimal to) (StringObject (Hexadecimal dstFrom)) =
return $ zip (between from to) (utf16BEToutf8 <$> startFrom dstFrom)
mapFromTo (Hexadecimal from) (Hexadecimal to) (Array dstPoints) =
zip (between from to) <$> (mapM dstByteString dstPoints)
where
dstByteString (StringObject (Hexadecimal dst)) =
return . utf16BEToutf8 $ b16ToBytes dst
dstByteString _ = fail "Invalid for a replacement string"
mapFromTo _ _ _ = fail "invalid range mapping found"
pairMapping :: MonadParser m => StringObject -> StringObject -> m (ByteString, Text)
pairMapping (Hexadecimal from) (Hexadecimal to) =
return (b16ToBytes from, utf16BEToutf8 $ b16ToBytes to)
pairMapping _ _ = fail "invalid pair mapping found"

View file

@ -1,117 +0,0 @@
{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE TypeSynonymInstances #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE MultiParamTypeClasses #-}
module PDF.Content (
Content(..)
, ContentUnit(..)
, GraphicContextUnit(..)
, IdContentUnit
, IdGraphicContextUnit
, IdTextContext
, Instructions(..)
, TextContext
, parse
) where
import Control.Applicative ((<|>))
import Control.Monad.Reader (asks, runReader)
import Control.Monad.State.Strict (runState, evalStateT, modify)
import Data.Attoparsec.ByteString.Char8 (sepBy)
import qualified Data.Attoparsec.ByteString.Char8 as Atto (Parser, parseOnly)
import Data.ByteString (ByteString)
import Data.Id (Id(..), Indexed, at, empty, register)
import PDF.Box (Box(..))
import PDF.Content.Operator (Instruction, operator)
import PDF.Object (blank, directObject)
import PDF.Output (Output(..), line)
import PDF.Parser (string)
data Instructions = Instructions
data GraphicContextUnit a =
GraphicInstruction a
| ContentUnit (ContentUnit a)
deriving Show
type TextContext a = [a]
data ContentUnit a =
GraphicContext [GraphicContextUnit a]
| TextContext (TextContext a)
deriving Show
data Content = Content {
contentUnits :: [IdContentUnit]
, indexedInstructions :: Indexed Instruction
, firstError :: Maybe String
} deriving Show
type TmpContentUnit = ContentUnit Instruction
type TmpGraphicContextUnit = GraphicContextUnit Instruction
type TmpTextContext = TextContext Instruction
type IdContentUnit = ContentUnit (Id Instruction)
type IdGraphicContextUnit = GraphicContextUnit (Id Instruction)
type IdTextContext = TextContext (Id Instruction)
instance Monad m => Box m Instructions Content (Indexed Instruction) where
r Instructions = return . indexedInstructions
w Instructions indexedInstructions someContent =
return $ someContent {indexedInstructions}
parse :: ByteString -> Content
parse input =
let result = Atto.parseOnly (contentUnit `sepBy` blank) input in
let (contentUnits, indexedInstructions) = either (const ([], empty)) buildContent result in
let firstError = either Just (const Nothing) result in
Content {contentUnits, indexedInstructions, firstError}
buildContent :: [TmpContentUnit] -> ([IdContentUnit], Indexed Instruction)
buildContent instructionContentUnits =
runState (mapM registerContentUnit instructionContentUnits) empty
where
registerContentUnit (GraphicContext gc) =
GraphicContext <$> (mapM registerGraphicContext gc)
registerContentUnit (TextContext tc) = TextContext <$> (mapM register tc)
registerGraphicContext (GraphicInstruction gi) =
GraphicInstruction <$> (register gi)
registerGraphicContext (ContentUnit cu) =
ContentUnit <$> (registerContentUnit cu)
contentUnit :: Atto.Parser TmpContentUnit
contentUnit =
(GraphicContext <$> graphicContext)
<|> (TextContext <$> textContext)
where
graphicContext =
string "q" *> blank *> graphicContextUnit `sepBy` blank <* blank <* string "Q"
graphicContextUnit :: Atto.Parser TmpGraphicContextUnit
graphicContextUnit =
(GraphicInstruction <$> instruction)
<|> (ContentUnit <$> contentUnit)
instruction :: Atto.Parser Instruction
instruction = evalStateT stackParser []
where
stackParser = ((directObject <* blank) >>= push) <|> operator
push arg = modify (arg:) *> stackParser
textContext :: Atto.Parser TmpTextContext
textContext =
string "BT" *> blank *> instruction `sepBy` blank <* blank <* string "ET"
instance Output Content where
output (Content {contentUnits, indexedInstructions}) =
runReader (mconcat <$> mapM outputCU contentUnits) indexedInstructions
where
outputCU (GraphicContext gc) = do
inside <- mconcat <$> mapM outputGCU gc
return (line "q" `mappend` inside `mappend` line "Q")
outputCU (TextContext tc) = do
inside <- mconcat <$> mapM outputIId tc
return (line "BT" `mappend` inside `mappend` line "ET")
outputGCU (GraphicInstruction gi) = outputIId gi
outputGCU (ContentUnit cu) = outputCU cu
outputIId instructionId = asks (output . (`at` instructionId))

View file

@ -1,76 +0,0 @@
{-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE TypeSynonymInstances #-}
{-# LANGUAGE FlexibleInstances #-}
module PDF.Content.Operator (
Instruction
, Operator(..)
, operator
) where
import Control.Monad.Fail (MonadFail(..))
import Control.Monad.State (MonadState(..))
import Data.ByteString.Char8 (ByteString, pack, unpack)
import Data.Char (toLower)
import Data.Map (Map)
import qualified Data.Map as Map (fromList, lookup)
import qualified PDF.Content.Operator.Color as Color (Operator, signature)
import PDF.Content.Operator.Common (Signature)
import qualified PDF.Content.Operator.GraphicState as GraphicState (Operator, signature)
import qualified PDF.Content.Operator.Path as Path (Operator, signature)
import qualified PDF.Content.Operator.Text as Text (Operator, signature)
import PDF.Object (DirectObject, blank, regular)
import PDF.Output (Output(..), join, line)
import PDF.Parser (MonadParser, takeAll1)
import Prelude hiding (fail)
import Text.Printf (printf)
data Operator =
GraphicState GraphicState.Operator
| Path Path.Operator
| Color Color.Operator
| Text Text.Operator
instance Show Operator where
show (GraphicState gcOp) = code gcOp
show (Path pOp) = code pOp
show (Color cOp) = code cOp
show (Text tOp) = code tOp
type Instruction = (Operator, [DirectObject])
instance Output Instruction where
output (op, args) = join " " ((output <$> args) ++ [line (show op)])
operatorsTable :: Map ByteString (Signature Operator)
operatorsTable = Map.fromList (
(prepare GraphicState <$> GraphicState.signature)
++ (prepare Path <$> Path.signature)
++ (prepare Color <$> Color.signature)
++ (prepare Text <$> Text.signature)
)
where
prepare constructor (op, sig) = (pack $ code op, (constructor op, sig))
code :: Show a => a -> String
code = expand . show
where
expand "" = ""
expand (c:'_':s) = toLower c : expand s
expand ('s':'t':'a':'r':s) = '*' : expand s
expand ('Q':'u':'o':'t':'e':s) = '\'' : expand s
expand ('D':'Q':'u':'o':'t':'e':s) = '"' : expand s
expand (c:s) = c : expand s
type StackParser m = (MonadState [DirectObject] m, MonadParser m)
operator :: StackParser m => m Instruction
operator = do
chunk <- takeAll1 regular <* blank
args <- reverse <$> get
case Map.lookup chunk operatorsTable of
Just (op, sig)
| sig args -> return (op, args)
| otherwise ->
get >>= fail . printf "Operator %s with stack %s" (show op) . show
_ -> fail ("Unknown chunk " ++ unpack chunk)

View file

@ -1,27 +0,0 @@
module PDF.Content.Operator.Color (
Operator(..)
, signature
) where
import PDF.Content.Operator.Common (Signature)
import PDF.Object (DirectObject(..))
data Operator =
CS | C_s | SC | SCN | S_c | S_cn | G | G_ | RG | R_g | K | K_
deriving (Bounded, Enum, Show)
signature :: [Signature Operator]
signature = [
(CS, \l -> case l of [NameObject _] -> True ; _ -> False)
, (C_s, \l -> case l of [NameObject _] -> True ; _ -> False)
, (SC, \_ -> True)
, (SCN, \_ -> True)
, (S_c, \_ -> True)
, (S_cn, \_ -> True)
, (G, \l -> case l of [_] -> True ; _ -> False)
, (G_, \l -> case l of [_] -> True ; _ -> False)
, (RG, \l -> case l of [_, _, _] -> True ; _ -> False)
, (R_g, \l -> case l of [_, _, _] -> True ; _ -> False)
, (K, \l -> case l of [_, _, _, _] -> True ; _ -> False)
, (K_, \l -> case l of [_, _, _, _] -> True ; _ -> False)
]

View file

@ -1,7 +0,0 @@
module PDF.Content.Operator.Common (
Signature
) where
import PDF.Object (DirectObject)
type Signature a = (a, [DirectObject] -> Bool)

View file

@ -1,24 +0,0 @@
module PDF.Content.Operator.GraphicState (
Operator(..)
, signature
) where
import PDF.Content.Operator.Common (Signature)
import PDF.Object (DirectObject(..))
data Operator =
C_m | W_ | J | J_ | M | D_ | R_i | I_ | G_s -- general graphic state
deriving (Bounded, Enum, Show)
signature :: [Signature Operator]
signature = [
(C_m, \l -> case l of [_, _, _, _, _, _] -> True ; _ -> False)
, (W_, \l -> case l of [_] -> True ; _ -> False)
, (J, \l -> case l of [_] -> True ; _ -> False)
, (J_, \l -> case l of [_] -> True ; _ -> False)
, (M, \l -> case l of [_] -> True ; _ -> False)
, (D_, \l -> case l of [_, _] -> True ; _ -> False)
, (R_i, \l -> case l of [_] -> True ; _ -> False)
, (I_, \l -> case l of [_] -> True ; _ -> False)
, (G_s, \l -> case l of [NameObject _] -> True ; _ -> False)
]

View file

@ -1,35 +0,0 @@
module PDF.Content.Operator.Path (
Operator(..)
, signature
) where
import PDF.Content.Operator.Common (Signature)
data Operator =
M_ | L_ | C_ | V_ | Y_ | H_ | R_e -- path construction
| S | S_ | F_ | F | Fstar | B | Bstar | B_ | B_star | N_ -- path painting
| W | Wstar -- clipping path
deriving (Bounded, Enum, Show)
signature :: [Signature Operator]
signature = [
(M_, \l -> case l of [_, _] -> True ; _ -> False)
, (L_, \l -> case l of [_, _] -> True ; _ -> False)
, (C_, \l -> case l of [_, _, _, _, _, _] -> True ; _ -> False)
, (V_, \l -> case l of [_, _, _, _] -> True ; _ -> False)
, (Y_, \l -> case l of [_, _, _, _] -> True ; _ -> False)
, (H_, \l -> case l of [] -> True ; _ -> False)
, (R_e, \l -> case l of [_, _, _, _] -> True ; _ -> False)
, (S, \l -> case l of [] -> True ; _ -> False)
, (S_, \l -> case l of [] -> True ; _ -> False)
, (F_, \l -> case l of [] -> True ; _ -> False)
, (F, \l -> case l of [] -> True ; _ -> False)
, (Fstar, \l -> case l of [] -> True ; _ -> False)
, (B, \l -> case l of [] -> True ; _ -> False)
, (Bstar, \l -> case l of [] -> True ; _ -> False)
, (B_, \l -> case l of [] -> True ; _ -> False)
, (B_star, \l -> case l of [] -> True ; _ -> False)
, (N_, \l -> case l of [] -> True ; _ -> False)
, (W, \l -> case l of [] -> True ; _ -> False)
, (Wstar, \l -> case l of [] -> True ; _ -> False)
]

View file

@ -1,32 +0,0 @@
module PDF.Content.Operator.Text (
Operator(..)
, signature
) where
import PDF.Content.Operator.Common (Signature)
import PDF.Object (DirectObject(..))
data Operator =
Td | TD | Tm | Tstar -- text positioning
| TJ | Tj | Quote | DQuote -- text showing
| Tc | Tw | Tz | TL | Tf | Tr | Ts -- text state
deriving (Bounded, Enum, Show)
signature :: [Signature Operator]
signature = [
(Td, \l -> case l of [_, _] -> True ; _ -> False)
, (TD, \l -> case l of [_, _] -> True ; _ -> False)
, (Tm, \l -> case l of [_, _, _, _, _, _] -> True ; _ -> False)
, (Tstar, \l -> case l of [] -> True ; _ -> False)
, (TJ, \l -> case l of [Array _] -> True ; _ -> False)
, (Tj, \l -> case l of [StringObject _] -> True ; _ -> False)
, (Quote, \l -> case l of [StringObject _] -> True ; _ -> False)
, (DQuote, \l -> case l of [_, _, StringObject _] -> True ; _ -> False)
, (Tc, \l -> case l of [_] -> True ; _ -> False)
, (Tw, \l -> case l of [_] -> True ; _ -> False)
, (Tz, \l -> case l of [_] -> True ; _ -> False)
, (TL, \l -> case l of [_] -> True ; _ -> False)
, (Tf, \l -> case l of [NameObject _, _] -> True ; _ -> False)
, (Tr, \l -> case l of [_] -> True ; _ -> False)
, (Ts, \l -> case l of [_] -> True ; _ -> False)
]

View file

@ -1,151 +0,0 @@
{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE MultiParamTypeClasses #-}
module PDF.Content.Text (
Chunks(..)
, chunk
, format
) where
import Control.Monad (foldM)
import Control.Monad.Fail (MonadFail(..))
import Control.Monad.Reader (ReaderT, asks, runReaderT)
import Control.Monad.State (
MonadState(..), StateT, evalStateT, gets, modify, runStateT
)
import Control.Monad.Trans (lift)
import qualified Data.ByteString.Char8 as BS (concatMap, singleton)
import Data.Id (Id(..), Indexed, at, empty, singleton)
import qualified Data.Id as Id (delete, lookup, register)
import Data.Map ((!))
import Data.Text (Text, breakOn)
import qualified Data.Text as Text (drop)
import PDF.Box (Box(..))
import PDF.Content (
Content(..), ContentUnit(..), GraphicContextUnit(..), IdContentUnit
, IdGraphicContextUnit
)
import PDF.Content.Operator (Instruction, Operator(..))
import PDF.Content.Operator.Text (Operator(..))
import PDF.Font (Font(..), FontSet, emptyFont)
import PDF.Object (DirectObject(..), StringObject(..), toByteString)
import Prelude hiding (fail)
type TmpFont = StateT Font
type Renderer m = ReaderT (Indexed Instruction) (ReaderT FontSet m)
type Updater m = StateT (Indexed Instruction) (ReaderT (Indexed Text) (ReaderT FontSet m))
decodeString :: MonadFail m => StringObject -> TmpFont (Renderer m) Text
decodeString input = do
Font {decode} <- get
either fail return . decode $ toByteString input
data Chunks = Chunks
chunk :: Int -> Id Text
chunk = Id
instance MonadFail m => Box (ReaderT FontSet m) Chunks Content (Indexed Text) where
r Chunks content =
runReaderT (mconcat <$> renderer) $ indexedInstructions content
where
renderer = mapM renderContentUnit (contentUnits content)
w Chunks indexedText content = do
(contentUnits, indexedInstructions) <- runReaderT readerUpdate indexedText
return $ content {contentUnits, indexedInstructions}
where
stateUpdate = mapM updateContentUnit (contentUnits content)
readerUpdate = runStateT stateUpdate (indexedInstructions content)
renderContentUnit :: MonadFail m => IdContentUnit -> Renderer m (Indexed Text)
renderContentUnit (GraphicContext graphicContextUnits) =
mconcat <$> mapM renderGraphicContextUnit graphicContextUnits
renderContentUnit (TextContext instructionIds) =
evalStateT (mconcat <$> mapM renderInstructionId instructionIds) emptyFont
updateContentUnit :: MonadFail m => IdContentUnit -> Updater m IdContentUnit
updateContentUnit (GraphicContext graphicContextUnits) = GraphicContext <$>
mapM updateGraphicContextUnit graphicContextUnits
updateContentUnit (TextContext instructionIds) = TextContext . concat <$>
evalStateT (mapM updateInstructionId instructionIds) emptyFont
renderGraphicContextUnit :: MonadFail m => IdGraphicContextUnit -> Renderer m (Indexed Text)
renderGraphicContextUnit (GraphicInstruction _) = return empty
renderGraphicContextUnit (ContentUnit contentUnit) =
renderContentUnit contentUnit
updateGraphicContextUnit :: MonadFail m => IdGraphicContextUnit -> Updater m IdGraphicContextUnit
updateGraphicContextUnit gI@(GraphicInstruction _) = return gI
updateGraphicContextUnit (ContentUnit contentUnit) =
ContentUnit <$> updateContentUnit contentUnit
renderInstructionId :: MonadFail m => Id Instruction -> TmpFont (Renderer m) (Indexed Text)
renderInstructionId instructionId@(Id n) = toMap <$>
(asks ((`at` instructionId)) >>= renderInstruction)
where
toMap = maybe empty (singleton (Id n))
updateInstructionId :: MonadFail m => Id Instruction -> TmpFont (Updater m) [Id Instruction]
updateInstructionId instructionId =
lift (gets (`at` instructionId)) >>= updateInstruction instructionId
renderInstruction :: MonadFail m => Instruction -> TmpFont (Renderer m) (Maybe Text)
renderInstruction (Text Tf, [NameObject fontName, _]) =
lift (lift $ asks (! fontName)) >>= put >> return Nothing
renderInstruction (Text Tstar, []) = return $ Just "\n"
renderInstruction (Text TJ, [Array arrayObject]) =
Just <$> foldM appendText "" arrayObject
where
appendText t (StringObject outputString) =
mappend t <$> decodeString outputString
appendText t _ = return t
renderInstruction (Text Tj, [StringObject outputString]) =
Just <$> decodeString outputString
renderInstruction (Text Quote, [StringObject outputString]) =
(Just . mappend "\n") <$> decodeString outputString
renderInstruction (Text DQuote, [_, _, StringObject outputString]) =
(Just . mappend "\n") <$> decodeString outputString
renderInstruction _ = return Nothing
updateInstruction :: MonadFail m => Id Instruction -> Instruction -> TmpFont (Updater m) [Id Instruction]
updateInstruction instructionId (Text Tf, [NameObject fontName, _]) =
(lift . lift . lift $ asks (!fontName)) >>= put >> return [instructionId]
updateInstruction instructionId@(Id n) instruction = do
if emitsText $ fst instruction
then asks (Id.lookup (Id n)) >>= replaceText
else return [instructionId]
where
emitsText (Text Tstar) = True
emitsText (Text TJ) = True
emitsText (Text Tj) = True
emitsText (Text Quote) = True
emitsText (Text DQuote) = True
emitsText _ = False
replaceText = maybe (return []) $ \text -> do
lift $ modify (Id.delete instructionId)
format text >>= mapM (lift . Id.register)
format :: MonadFail m => Text -> StateT Font m [Instruction]
format input = do
case breakOn "\n" input of
("", "") -> return []
("", left) -> ((Text Tstar, []) :) <$> format (Text.drop 1 left)
(line, left) -> (:) <$> tj line <*> format left
where
tj t = do
encoded <- either fail return =<< gets (($t) . encode)
return (Text Tj, [StringObject . Literal $ BS.concatMap escape encoded])
escape '\\' = "\\\\"
escape '(' = "\\("
escape ')' = "\\)"
escape c = BS.singleton c

View file

@ -6,14 +6,14 @@ module PDF.EOL (
) where
import Control.Applicative ((<|>))
import PDF.Parser (MonadParser, string)
import PDF.Parser (Parser, string)
data Style = CR | LF | CRLF deriving Show
charset :: String
charset = "\r\n"
parser :: MonadParser m => m Style
parser :: Parser s Style
parser =
(string "\r\n" >> return CRLF)
<|> (string "\r" >> return CR)

View file

@ -1,13 +0,0 @@
{-# LANGUAGE FlexibleContexts #-}
module PDF.Encoding (
encoding
) where
import Control.Monad.Except (MonadError(..))
import PDF.Encoding.MacRoman (macRomanEncoding)
import PDF.Font (Font)
import Prelude hiding (fail)
encoding :: MonadError String m => String -> m Font
encoding "MacRomanEncoding" = return macRomanEncoding
encoding s = throwError $ "Unknown encoding " ++ s

View file

@ -1,162 +0,0 @@
module PDF.Encoding.MacRoman (
macRomanEncoding
) where
import qualified Data.ByteString.Char8 as BS (pack, unpack)
import Data.Foldable (foldl')
import Data.Map (Map)
import qualified Data.Map as Map (empty, insert, lookup)
import qualified Data.Text as Text (pack, unpack)
import PDF.Font (Font(..))
type Mapper = Map Char Char
macRomanEncoding :: Font
macRomanEncoding = Font {
decode = Right . Text.pack . fmap decodeChar . BS.unpack
, encode = fmap BS.pack . mapM encodeChar . Text.unpack
}
where
decodeChar k = maybe k id $ Map.lookup k (fst mappers)
encodeChar k =
case Map.lookup k (snd mappers) of
Just v -> Right v
Nothing
| k < '\x80' -> Right k
| otherwise -> Left ("Character '" ++ k :"' unavailable in MacRoman")
mappers :: (Mapper, Mapper)
mappers = foldl' generateMapers (Map.empty, Map.empty) [
('\x80', '\x00C4') -- LATIN CAPITAL LETTER A WITH DIAERESIS
, ('\x81', '\x00C5') -- LATIN CAPITAL LETTER A WITH RING ABOVE
, ('\x82', '\x00C7') -- LATIN CAPITAL LETTER C WITH CEDILLA
, ('\x83', '\x00C9') -- LATIN CAPITAL LETTER E WITH ACUTE
, ('\x84', '\x00D1') -- LATIN CAPITAL LETTER N WITH TILDE
, ('\x85', '\x00D6') -- LATIN CAPITAL LETTER O WITH DIAERESIS
, ('\x86', '\x00DC') -- LATIN CAPITAL LETTER U WITH DIAERESIS
, ('\x87', '\x00E1') -- LATIN SMALL LETTER A WITH ACUTE
, ('\x88', '\x00E0') -- LATIN SMALL LETTER A WITH GRAVE
, ('\x89', '\x00E2') -- LATIN SMALL LETTER A WITH CIRCUMFLEX
, ('\x8A', '\x00E4') -- LATIN SMALL LETTER A WITH DIAERESIS
, ('\x8B', '\x00E3') -- LATIN SMALL LETTER A WITH TILDE
, ('\x8C', '\x00E5') -- LATIN SMALL LETTER A WITH RING ABOVE
, ('\x8D', '\x00E7') -- LATIN SMALL LETTER C WITH CEDILLA
, ('\x8E', '\x00E9') -- LATIN SMALL LETTER E WITH ACUTE
, ('\x8F', '\x00E8') -- LATIN SMALL LETTER E WITH GRAVE
, ('\x90', '\x00EA') -- LATIN SMALL LETTER E WITH CIRCUMFLEX
, ('\x91', '\x00EB') -- LATIN SMALL LETTER E WITH DIAERESIS
, ('\x92', '\x00ED') -- LATIN SMALL LETTER I WITH ACUTE
, ('\x93', '\x00EC') -- LATIN SMALL LETTER I WITH GRAVE
, ('\x94', '\x00EE') -- LATIN SMALL LETTER I WITH CIRCUMFLEX
, ('\x95', '\x00EF') -- LATIN SMALL LETTER I WITH DIAERESIS
, ('\x96', '\x00F1') -- LATIN SMALL LETTER N WITH TILDE
, ('\x97', '\x00F3') -- LATIN SMALL LETTER O WITH ACUTE
, ('\x98', '\x00F2') -- LATIN SMALL LETTER O WITH GRAVE
, ('\x99', '\x00F4') -- LATIN SMALL LETTER O WITH CIRCUMFLEX
, ('\x9A', '\x00F6') -- LATIN SMALL LETTER O WITH DIAERESIS
, ('\x9B', '\x00F5') -- LATIN SMALL LETTER O WITH TILDE
, ('\x9C', '\x00FA') -- LATIN SMALL LETTER U WITH ACUTE
, ('\x9D', '\x00F9') -- LATIN SMALL LETTER U WITH GRAVE
, ('\x9E', '\x00FB') -- LATIN SMALL LETTER U WITH CIRCUMFLEX
, ('\x9F', '\x00FC') -- LATIN SMALL LETTER U WITH DIAERESIS
, ('\xA0', '\x2020') -- DAGGER
, ('\xA1', '\x00B0') -- DEGREE SIGN
, ('\xA2', '\x00A2') -- CENT SIGN
, ('\xA3', '\x00A3') -- POUND SIGN
, ('\xA4', '\x00A7') -- SECTION SIGN
, ('\xA5', '\x2022') -- BULLET
, ('\xA6', '\x00B6') -- PILCROW SIGN
, ('\xA7', '\x00DF') -- LATIN SMALL LETTER SHARP S
, ('\xA8', '\x00AE') -- REGISTERED SIGN
, ('\xA9', '\x00A9') -- COPYRIGHT SIGN
, ('\xAA', '\x2122') -- TRADE MARK SIGN
, ('\xAB', '\x00B4') -- ACUTE ACCENT
, ('\xAC', '\x00A8') -- DIAERESIS
, ('\xAD', '\x2260') -- NOT EQUAL TO
, ('\xAE', '\x00C6') -- LATIN CAPITAL LETTER AE
, ('\xAF', '\x00D8') -- LATIN CAPITAL LETTER O WITH STROKE
, ('\xB0', '\x221E') -- INFINITY
, ('\xB1', '\x00B1') -- PLUS-MINUS SIGN
, ('\xB2', '\x2264') -- LESS-THAN OR EQUAL TO
, ('\xB3', '\x2265') -- GREATER-THAN OR EQUAL TO
, ('\xB4', '\x00A5') -- YEN SIGN
, ('\xB5', '\x00B5') -- MICRO SIGN
, ('\xB6', '\x2202') -- PARTIAL DIFFERENTIAL
, ('\xB7', '\x2211') -- N-ARY SUMMATION
, ('\xB8', '\x220F') -- N-ARY PRODUCT
, ('\xB9', '\x03C0') -- GREEK SMALL LETTER PI
, ('\xBA', '\x222B') -- INTEGRAL
, ('\xBB', '\x00AA') -- FEMININE ORDINAL INDICATOR
, ('\xBC', '\x00BA') -- MASCULINE ORDINAL INDICATOR
, ('\xBD', '\x03A9') -- GREEK CAPITAL LETTER OMEGA
, ('\xBE', '\x00E6') -- LATIN SMALL LETTER AE
, ('\xBF', '\x00F8') -- LATIN SMALL LETTER O WITH STROKE
, ('\xC0', '\x00BF') -- INVERTED QUESTION MARK
, ('\xC1', '\x00A1') -- INVERTED EXCLAMATION MARK
, ('\xC2', '\x00AC') -- NOT SIGN
, ('\xC3', '\x221A') -- SQUARE ROOT
, ('\xC4', '\x0192') -- LATIN SMALL LETTER F WITH HOOK
, ('\xC5', '\x2248') -- ALMOST EQUAL TO
, ('\xC6', '\x2206') -- INCREMENT
, ('\xC7', '\x00AB') -- LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
, ('\xC8', '\x00BB') -- RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
, ('\xC9', '\x2026') -- HORIZONTAL ELLIPSIS
, ('\xCA', '\x00A0') -- NO-BREAK SPACE
, ('\xCB', '\x00C0') -- LATIN CAPITAL LETTER A WITH GRAVE
, ('\xCC', '\x00C3') -- LATIN CAPITAL LETTER A WITH TILDE
, ('\xCD', '\x00D5') -- LATIN CAPITAL LETTER O WITH TILDE
, ('\xCE', '\x0152') -- LATIN CAPITAL LIGATURE OE
, ('\xCF', '\x0153') -- LATIN SMALL LIGATURE OE
, ('\xD0', '\x2013') -- EN DASH
, ('\xD1', '\x2014') -- EM DASH
, ('\xD2', '\x201C') -- LEFT DOUBLE QUOTATION MARK
, ('\xD3', '\x201D') -- RIGHT DOUBLE QUOTATION MARK
, ('\xD4', '\x2018') -- LEFT SINGLE QUOTATION MARK
, ('\xD5', '\x2019') -- RIGHT SINGLE QUOTATION MARK
, ('\xD6', '\x00F7') -- DIVISION SIGN
, ('\xD7', '\x25CA') -- LOZENGE
, ('\xD8', '\x00FF') -- LATIN SMALL LETTER Y WITH DIAERESIS
, ('\xD9', '\x0178') -- LATIN CAPITAL LETTER Y WITH DIAERESIS
, ('\xDA', '\x2044') -- FRACTION SLASH
, ('\xDB', '\x20AC') -- EURO SIGN
, ('\xDC', '\x2039') -- SINGLE LEFT-POINTING ANGLE QUOTATION MARK
, ('\xDD', '\x203A') -- SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
, ('\xDE', '\xFB01') -- LATIN SMALL LIGATURE FI
, ('\xDF', '\xFB02') -- LATIN SMALL LIGATURE FL
, ('\xE0', '\x2021') -- DOUBLE DAGGER
, ('\xE1', '\x00B7') -- MIDDLE DOT
, ('\xE2', '\x201A') -- SINGLE LOW-9 QUOTATION MARK
, ('\xE3', '\x201E') -- DOUBLE LOW-9 QUOTATION MARK
, ('\xE4', '\x2030') -- PER MILLE SIGN
, ('\xE5', '\x00C2') -- LATIN CAPITAL LETTER A WITH CIRCUMFLEX
, ('\xE6', '\x00CA') -- LATIN CAPITAL LETTER E WITH CIRCUMFLEX
, ('\xE7', '\x00C1') -- LATIN CAPITAL LETTER A WITH ACUTE
, ('\xE8', '\x00CB') -- LATIN CAPITAL LETTER E WITH DIAERESIS
, ('\xE9', '\x00C8') -- LATIN CAPITAL LETTER E WITH GRAVE
, ('\xEA', '\x00CD') -- LATIN CAPITAL LETTER I WITH ACUTE
, ('\xEB', '\x00CE') -- LATIN CAPITAL LETTER I WITH CIRCUMFLEX
, ('\xEC', '\x00CF') -- LATIN CAPITAL LETTER I WITH DIAERESIS
, ('\xED', '\x00CC') -- LATIN CAPITAL LETTER I WITH GRAVE
, ('\xEE', '\x00D3') -- LATIN CAPITAL LETTER O WITH ACUTE
, ('\xEF', '\x00D4') -- LATIN CAPITAL LETTER O WITH CIRCUMFLEX
, ('\xF0', '\xF8FF') -- Apple logo
, ('\xF1', '\x00D2') -- LATIN CAPITAL LETTER O WITH GRAVE
, ('\xF2', '\x00DA') -- LATIN CAPITAL LETTER U WITH ACUTE
, ('\xF3', '\x00DB') -- LATIN CAPITAL LETTER U WITH CIRCUMFLEX
, ('\xF4', '\x00D9') -- LATIN CAPITAL LETTER U WITH GRAVE
, ('\xF5', '\x0131') -- LATIN SMALL LETTER DOTLESS I
, ('\xF6', '\x02C6') -- MODIFIER LETTER CIRCUMFLEX ACCENT
, ('\xF7', '\x02DC') -- SMALL TILDE
, ('\xF8', '\x00AF') -- MACRON
, ('\xF9', '\x02D8') -- BREVE
, ('\xFA', '\x02D9') -- DOT ABOVE
, ('\xFB', '\x02DA') -- RING ABOVE
, ('\xFC', '\x00B8') -- CEDILLA
, ('\xFD', '\x02DD') -- DOUBLE ACUTE ACCENT
, ('\xFE', '\x02DB') -- OGONEK
, ('\xFF', '\x02C7') -- CARON
]
where
generateMapers (tmpDecoder, tmpEncoder) (macChar, utf8Char) = (
Map.insert macChar utf8Char tmpDecoder
, Map.insert utf8Char macChar tmpEncoder
)

View file

@ -1,27 +0,0 @@
module PDF.Font (
Decoder
, Encoder
, Font(..)
, FontSet
, emptyFont
) where
import Data.ByteString (ByteString)
import Data.Map (Map)
import Data.Text (Text)
import PDF.Object (Name)
type Decoder = ByteString -> Either String Text
type Encoder = Text -> Either String ByteString
data Font = Font {
decode :: Decoder
, encode :: Encoder
}
type FontSet = Map Name Font
emptyFont :: Font
emptyFont = Font {
decode = \_ -> Left "No fond loaded"
, encode = \_ -> Left "No fond loaded"
}

View file

@ -1,129 +0,0 @@
{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE FlexibleContexts #-}
module PDF.Layer (
Layer(..)
, LayerReader
, Objects(..)
, unify
) where
import Control.Monad.Except (MonadError(..))
import Control.Monad.Reader (ReaderT)
import Data.Foldable (foldl')
import Data.Id (Id(..), Indexed, keysSet, mapWithKey, member)
import qualified Data.Id as Id (empty, insert, lookup, union)
import Data.Map (Map, (!))
import qualified Data.Map as Map (empty, lookup, union)
import qualified Data.IntSet as IntSet (delete, toList)
import PDF.Box (Box(..))
import PDF.Object (
IndirectObjCoordinates(..), Object, Occurrence(..), Structure(..)
, XRefEntry(..), XRefSection, eofMarker, outputBody
)
import qualified PDF.Output as Output (line)
import PDF.Output (
Offset(..), Output(..), Resource(..), byteString, getOffset
, getOffsets, newLine
)
import Text.Printf (printf)
data Layer = Layer {
occurrences :: [Occurrence]
, objects :: Indexed Object
, docStructure :: Structure
} deriving Show
type LayerReader m = ReaderT Layer m
updateXRefs :: XRefSection -> Map Resource Offset -> (XRefSection, Offset)
updateXRefs xRef offsets = (mapWithKey updateEntry xRef, offsets ! StartXRef)
where
updateEntry objectId e@(InUse {offset}) =
case Map.lookup (ObjectId $ getId objectId) offsets of
Nothing -> Free {nextFree = Id $ getOffset offset, generation = 65535}
Just newOffset -> e {offset = newOffset}
updateEntry _ e = e
instance Output Layer where
output (Layer {occurrences, objects, docStructure}) = do
(body, savedOffsets) <- getOffsets (outputBody (occurrences, objects))
let (newXRef, startXRef) = updateXRefs xRef savedOffsets
mconcat [
return body
, Output.line "xref"
, output newXRef
, Output.line "trailer"
, output trailer, newLine
, Output.line "startxref"
, Output.line (printf "%d" (getOffset startXRef))
, byteString eofMarker
]
where
Structure {xRef, trailer} = docStructure
data Objects = Objects
instance Monad m => Box m Objects Layer (Indexed Object) where
r Objects = return . objects
w Objects newObjects layer@(Layer {occurrences, docStructure}) =
return $ layer {
occurrences = keptOccurrences ++ newOccurrences
, objects = newObjects
, docStructure = docStructure {
xRef = (const $ InUse (Offset 0) 0) <$> newObjects
}
}
where
filterOccurrences c@(Comment _) (occ, newObjIds) = (c:occ, newObjIds)
filterOccurrences i@(Indirect (IndirectObjCoordinates {objectId})) (occ, newObjIds)
| member objectId newObjects = (i:occ, IntSet.delete (getId objectId) newObjIds)
| otherwise = (occ, newObjIds)
(keptOccurrences, newObjectIds) =
foldr filterOccurrences ([], keysSet newObjects) occurrences
makeOccurrence objectId =
Indirect (IndirectObjCoordinates {objectId, versionNumber = 0})
newOccurrences = (makeOccurrence . Id) <$> IntSet.toList newObjectIds
instance MonadError String m => Box m (Id Object) Layer Object where
r objectId =
maybe (throwError "Unknown key") return . Id.lookup objectId . objects
w objectId a layer@(Layer {objects})
| member objectId objects = return $
layer {objects = Id.insert objectId a objects}
| otherwise = throwError "Unknown key"
emptyLayer :: Layer
emptyLayer = Layer {
docStructure = Structure {xRef = Id.empty, trailer = Map.empty}
, objects = Id.empty
, occurrences = []
}
unify :: [Layer] -> Layer
unify = foldl' complete emptyLayer
where
complete tmpLayer older =
let mergedObjects = Id.union (objects tmpLayer) (objects older) in
Layer {
docStructure =
unifyDocStructure (docStructure tmpLayer) (docStructure older)
, objects = mergedObjects
, occurrences =
unifyOccurrences mergedObjects (occurrences tmpLayer) (occurrences older)
}
unifyDocStructure :: Structure -> Structure -> Structure
unifyDocStructure new old = Structure {
xRef = Id.union (xRef new) (xRef old)
, trailer = Map.union (trailer new) (trailer old)
}
unifyOccurrences :: (Indexed Object) -> [Occurrence] -> [Occurrence] -> [Occurrence]
unifyOccurrences objects new = foldr addOlder new
where
addOlder occurrence@(Comment _) existing = occurrence : existing
addOlder occurrence@(Indirect indirect) existing =
if objectId indirect `member` objects
then occurrence : existing
else existing

View file

@ -2,20 +2,19 @@
{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE FlexibleInstances #-}
module PDF.Object (
Dictionary
Content(..)
, DirectObject(..)
, Flow(..)
, IndexedObjects
, IndirectObjCoordinates(..)
, InputStructure(..)
, Name(..)
, Number(..)
, Object(..)
, Occurrence(..)
, StringObject(..)
, Structure(..)
, XRefEntry(..)
, XRefSection
, array
, blank
, dictionary
, directObject
@ -23,43 +22,34 @@ module PDF.Object (
, integer
, line
, magicNumber
, name
, number
, object
, outputBody
, regular
, stringObject
, structure
, toByteString
) where
import Control.Applicative ((<|>), many)
import Control.Monad.Reader (asks)
import Data.Attoparsec.ByteString.Char8 (choice, count, option, sepBy)
import Data.ByteString (ByteString)
import qualified Data.ByteString as BS (concat)
import qualified Data.ByteString.Char8 as Char8 (
cons, length, pack, singleton, snoc, unpack
import Control.Applicative ((<|>))
import Data.ByteString.Char8 (ByteString)
import qualified Data.ByteString.Char8 as BS (
concat, cons, pack, singleton, unpack
)
import Data.ByteString.Char8.Util (B16Int(..), b16ToBytes, unescape)
import Data.Id (Id(..), IdMap, Indexed)
import qualified Data.Id as Id (
at, delete, empty, fromList, lookup, minViewWithKey, union
import Data.Map (Map, (!), mapWithKey)
import qualified Data.Map as Map (
delete, empty, fromList, lookup, minViewWithKey, toList, union
)
import Data.Map (Map)
import qualified Data.Map as Map (fromList, toList)
import qualified Data.Set as Set (fromList, member)
import qualified PDF.EOL as EOL (Style(..), charset, parser)
import qualified PDF.Output as Output (line, string)
import qualified PDF.EOL as EOL (charset, parser)
import qualified PDF.Output as Output (concat, line, string)
import PDF.Output (
OBuilder, OContext(..), Offset(..), Output(..), Resource(..), byteString
, getOffset, join, newLine, saveOffset
OBuilder, ObjectId(..), Offset(..), Output(..), Resource(..)
, byteString, getObjectId, getOffset, getOffsets, join, newLine
, saveOffset
)
import PDF.Parser (
Parser, (<?>)
, char, choice, count, decNumber, hexNumber, many, octDigit, oneOf, option
, sepBy, string, takeAll, takeAll1
)
import PDF.Parser (MonadParser(..), Parser, (<?>), octDigit, oneOf)
import Text.Printf (printf)
line :: MonadParser m => String -> m ()
line l = (string (Char8.pack l) *> blank *> return ()) <?> printf "line «%s»" l
line :: String -> Parser u ()
line l = (string (BS.pack l) *> blank *> return ()) <?> printf "line «%s»" l
magicNumber :: ByteString
magicNumber = "%PDF-"
@ -70,8 +60,8 @@ eofMarker = "%%EOF"
whiteSpaceCharset :: String
whiteSpaceCharset = "\0\t\12 "
blank :: MonadParser m => m ()
blank = takeAll (`elem` (EOL.charset ++ whiteSpaceCharset)) *> pure ()
blank :: Parser u ()
blank = takeAll (`elem` (EOL.charset ++ whiteSpaceCharset)) *> return ()
delimiterCharset :: String
delimiterCharset = "()<>[]{}/%"
@ -79,24 +69,26 @@ delimiterCharset = "()<>[]{}/%"
regular :: Char -> Bool
regular = not . (`elem` (EOL.charset ++ whiteSpaceCharset ++ delimiterCharset))
integer :: MonadParser m => m Int
integer = decNumber <* blank <?> "decimal integer"
integer :: (Read a, Num a) => Parser u a
integer = read . BS.unpack <$> decNumber <* blank <?> "decimal integer"
-------------------------------------
-- OBJECTS
-------------------------------------
type IndexedObjects = Map ObjectId Object
--
-- Boolean
--
boolean :: MonadParser m => m Bool
boolean :: Parser u Bool
boolean =
(string "true" *> return True) <|> (string "false" *> return False) <?> "boolean"
--
-- Number
--
newtype Number = Number Double deriving (Eq, Show)
newtype Number = Number Double deriving Show
instance Output Number where
output (Number f) = Output.string $
@ -104,44 +96,38 @@ instance Output Number where
(n, 0) -> printf "%d" (n :: Int)
_ -> printf "%f" f
number :: MonadParser m => m Number
number = Number <$> (sign <*> value) <?> "number"
number :: Parser u Number
number = Number . read . BS.unpack <$>
(mappend <$> sign <*> (integerPart <|> BS.cons '0' <$> floatPart))
<?> "number"
where
sign = (string "-" *> return negate) <|> option id (char '+' >> return id)
value = floatNumber <|> (char '.' *> afterPoint)
afterPoint = read . ("0." ++) . Char8.unpack <$> takeAll (`Set.member` digits)
digits = Set.fromList ['0' .. '9']
sign = string "-" <|> option "" (char '+' >> return "")
integerPart = mappend <$> decNumber <*> option "" floatPart
floatPart = BS.cons <$> char '.' <*> (option "0" $ decNumber)
--
-- StringObject
--
data StringObject = Literal ByteString | Hexadecimal B16Int deriving (Eq, Show)
data StringObject = Literal String | Hexadecimal String deriving Show
instance Output StringObject where
output (Literal s) = Output.string (printf "(%s)" (Char8.unpack s))
output (Hexadecimal (B16Int n)) = Output.string (printf "<%s>" (Char8.unpack n))
output (Literal s) = Output.string (printf "(%s)" s)
output (Hexadecimal s) = Output.string (printf "<%s>" s)
stringObject :: MonadParser m => m StringObject
stringObject :: Parser u StringObject
stringObject =
Literal <$> (char '(' *> (BS.concat <$> literalString) <* char ')')
<|> Hexadecimal . roundBytes <$> (char '<' *> hexNumber <* char '>')
Literal . BS.unpack <$> (char '(' *> (BS.concat <$> literalString) <* char ')')
<|> Hexadecimal . BS.unpack <$> (char '<' *> hexNumber <* char '>')
<?> "string object (literal or hexadecimal)"
where
literalString = many literalStringBlock
literalStringBlock = takeAll1 normalChar <|> matchingParenthesis <|> escapedChar
normalChar = not . (`elem` ("\\()" :: String))
matchingParenthesis =
mappend <$> (Char8.cons <$> char '(' <*> literalStringBlock) <*> string ")"
mappend <$> (BS.cons <$> char '(' <*> literalStringBlock) <*> string ")"
escapedChar =
Char8.cons <$> char '\\' <*> (Char8.singleton <$> oneOf "nrtbf()\\\n" <|> octalCode)
octalCode = choice $ (\n -> Char8.pack <$> count n octDigit) <$> [1..3]
roundBytes (B16Int bs)
| Char8.length bs `mod` 2 == 1 = B16Int (bs `Char8.snoc` '0')
| otherwise = B16Int bs
toByteString :: StringObject -> ByteString
toByteString (Hexadecimal h) = b16ToBytes h
toByteString (Literal s) = unescape s
BS.cons <$> char '\\' <*> (BS.singleton <$> oneOf "nrtbf()\\" <|> octalCode)
octalCode = choice $ (\n -> BS.pack <$> count n octDigit) <$> [1..3]
--
-- Name
@ -151,13 +137,13 @@ newtype Name = Name String deriving (Eq, Ord, Show)
instance Output Name where
output (Name n) = Output.string ('/':n)
name :: MonadParser m => m Name
name = Name . Char8.unpack <$> (char '/' *> takeAll regular) <?> "name"
name :: Parser u Name
name = Name . BS.unpack <$> (char '/' *> takeAll regular) <?> "name"
--
-- Array
--
array :: MonadParser m => m [DirectObject]
array :: Parser u [DirectObject]
array =
char '[' *> blank *> directObject `sepBy` blank <* blank <* char ']' <?> "array"
@ -167,14 +153,14 @@ array =
type Dictionary = Map Name DirectObject
instance Output Dictionary where
output aDictionary =
output dict =
"<<" `mappend` keyValues `mappend` ">>"
where
keyValues = join " " $ outputKeyVal <$> Map.toList aDictionary
keyValues = join " " $ outputKeyVal <$> Map.toList dict
outputKeyVal :: (Name, DirectObject) -> OBuilder
outputKeyVal (key, val) = mconcat [output key, " ", output val]
outputKeyVal (key, val) = Output.concat [output key, " ", output val]
dictionary :: MonadParser m => m Dictionary
dictionary :: Parser u Dictionary
dictionary =
string "<<" *> blank *> keyValPairs <* string ">>" <?> "dictionary"
where
@ -184,33 +170,33 @@ dictionary =
--
-- Null
--
nullObject :: MonadParser m => m ()
nullObject :: Parser u ()
nullObject = string "null" *> return () <?> "null object"
--
-- Reference
--
data IndirectObjCoordinates = IndirectObjCoordinates {
objectId :: {-# UNPACK #-} !(Id Object)
, versionNumber :: {-# UNPACK #-} !Int
objectId :: ObjectId
, versionNumber :: Int
} deriving Show
reference :: MonadParser m => m IndirectObjCoordinates
reference :: Parser u IndirectObjCoordinates
reference = IndirectObjCoordinates
<$> (fmap Id integer) <*> integer <* char 'R' <?> "reference to an object"
<$> (fmap ObjectId integer) <*> integer <* char 'R' <?> "reference to an object"
--
-- DirectObject
--
data DirectObject =
Boolean !Bool
| NumberObject !Number
| StringObject !StringObject
| NameObject !Name
| Array ![DirectObject]
| Dictionary !Dictionary
Boolean Bool
| NumberObject Number
| StringObject StringObject
| NameObject Name
| Array [DirectObject]
| Dictionary Dictionary
| Null
| Reference !IndirectObjCoordinates
| Reference IndirectObjCoordinates
deriving Show
instance Output DirectObject where
@ -218,25 +204,23 @@ instance Output DirectObject where
output (NumberObject n) = output n
output (StringObject s) = output s
output (NameObject n) = output n
output (Array a) = mconcat ["[", join " " a, "]"]
output (Array a) = Output.concat ["[", join " " a, "]"]
output (Dictionary d) = output d
output (Null) = "null"
output (Reference (IndirectObjCoordinates {objectId, versionNumber})) =
Output.string (printf "%d %d R" (getId objectId) versionNumber)
Output.string (printf "%d %d R" (getObjectId objectId) versionNumber)
directObject :: MonadParser m => m DirectObject
directObject = (peek >>= dispatch) <?> "direct object"
where
dispatch 't' = Boolean <$> boolean
dispatch 'f' = Boolean <$> boolean
dispatch '(' = StringObject <$> stringObject
dispatch '<' = StringObject <$> stringObject <|> Dictionary <$> dictionary
dispatch '/' = NameObject <$> name
dispatch '[' = Array <$> array
dispatch 'n' = nullObject *> return Null
dispatch _ =
Reference <$> reference {- defined before Number because Number is a prefix of it -}
<|> NumberObject <$> number
directObject :: Parser u DirectObject
directObject =
Boolean <$> boolean
<|> Reference <$> reference {- defined before Number because Number is a prefix of it -}
<|> NumberObject <$> number
<|> StringObject <$> stringObject
<|> NameObject <$> name
<|> Array <$> array
<|> Dictionary <$> dictionary
<|> const Null <$> nullObject
<?> "direct object"
--
-- Object
@ -251,34 +235,27 @@ data Object =
instance Output Object where
output (Direct d) = output d
output (Stream {header, streamContent}) = mconcat [
output (Stream {header, streamContent}) = Output.concat [
output header, newLine
, Output.line "stream"
, byteString streamContent
, "endstream"
]
object :: Int -> Id Object
object = Id
--
-- Occurrence
--
data Occurrence = Comment String | Indirect IndirectObjCoordinates deriving Show
outputOccurrence :: (Indexed Object) -> Occurrence -> OBuilder
outputOccurrence :: IndexedObjects -> Occurrence -> OBuilder
outputOccurrence _ (Comment c) = Output.line c
outputOccurrence objects (Indirect (IndirectObjCoordinates {objectId, versionNumber})) =
saveOffset (ObjectId $ getId objectId) >> mconcat [
Output.line (printf "%d %d obj" (getId objectId) versionNumber)
, output (objects `Id.at` objectId), newLine
saveOffset (Object objectId) >> Output.concat [
Output.line (printf "%d %d obj" (getObjectId objectId) versionNumber)
, output (objects ! objectId), newLine
, Output.line "endobj"
]
outputBody :: ([Occurrence], (Indexed Object)) -> OBuilder
outputBody (occurrences, objects) =
output (outputOccurrence objects <$> occurrences) <* saveOffset StartXRef
-------------------------------------
-- XREF TABLE
-------------------------------------
@ -290,20 +267,15 @@ data XRefEntry = InUse {
offset :: Offset
, generation :: Int
} | Free {
nextFree :: (Id Object)
nextFree :: ObjectId
, generation :: Int
} deriving Show
instance Output XRefEntry where
output xRefEntry = Output.string (build xRefEntry) `mappend` endXRefEntryLine
where
build (InUse {offset, generation}) =
printf "%010d %05d n" (getOffset offset) generation
build (Free {nextFree, generation}) =
printf "%010d %05d f" (getId nextFree) generation
endXRefEntryLine = OContext (asks padEOLToTwoBytes) >>= Output.line
padEOLToTwoBytes EOL.CRLF = ("" :: String)
padEOLToTwoBytes _ = " "
output (InUse {offset, generation}) =
Output.line (printf "%010d %05d n " (getOffset offset) generation)
output (Free {nextFree, generation}) =
Output.line (printf "%010d %05d f " (getObjectId nextFree) generation)
entry :: Parser u XRefEntry
entry = do
@ -315,51 +287,52 @@ entry = do
char 'n' *> return (InUse {offset = Offset big, generation})
free :: Int -> Int -> Parser u XRefEntry
free big generation =
char 'f' *> return (Free {nextFree = Id big, generation})
char 'f' *> return (Free {nextFree = ObjectId big, generation})
--
-- XRefSubSection
--
data XRefSubSection = XRefSubSection {
firstObjectId :: (Id Object)
firstObjectId :: ObjectId
, entries :: [XRefEntry]
} deriving Show
instance Output XRefSubSection where
output (XRefSubSection {firstObjectId, entries}) =
Output.line (printf "%d %d" (getId firstObjectId) (length entries))
Output.line (printf "%d %d" (getObjectId firstObjectId) (length entries))
`mappend` output entries
xRefSubSection :: Parser u XRefSubSection
xRefSubSection = do
(firstId, entriesNumber) <- (,) <$> integer <*> integer <?> "XRef subsection"
entries <- count entriesNumber entry
return $ XRefSubSection {firstObjectId = Id firstId, entries}
return $ XRefSubSection {firstObjectId = ObjectId firstId, entries}
type XRefSection = IdMap Object XRefEntry
type XRefSection = Map ObjectId XRefEntry
instance Output XRefSection where
output = output . sections
where
sections tmp =
case Id.minViewWithKey tmp of
case Map.minViewWithKey tmp of
Nothing -> []
Just ((objectId@(Id value), firstEntry), rest) ->
Just ((objectId@(ObjectId value), firstEntry), rest) ->
let (subSection, sndRest) = section objectId [firstEntry] (value + 1) rest in
subSection : sections sndRest
section firstObjectId stack nextValue tmp =
let nextId = (Id nextValue :: Id Object) in
case Id.lookup nextId tmp of
let nextId = ObjectId nextValue in
case Map.lookup nextId tmp of
Nothing -> (XRefSubSection {firstObjectId, entries = reverse stack}, tmp)
Just nextEntry ->
section firstObjectId (nextEntry:stack) (nextValue + 1) (Id.delete nextId tmp)
section firstObjectId (nextEntry:stack) (nextValue + 1) (Map.delete nextId tmp)
xRefSection :: Parser u XRefSection
xRefSection = foldr addSubsection Id.empty <$>
xRefSection = foldr addSubsection Map.empty <$>
(line "xref" *> xRefSubSection `sepBy` many EOL.parser)
where
addSubsection (XRefSubSection {firstObjectId, entries}) =
Id.union . Id.fromList $ zip ([firstObjectId..]) entries
Map.union . Map.fromList $ zip ([firstObjectId..]) entries
--
-- Structure
@ -380,10 +353,45 @@ structure =
<$> xRefSection
<*> (string "trailer" *> blank *> dictionary <* EOL.parser)
updateXrefs :: XRefSection -> Map Resource Offset -> (XRefSection, Offset)
updateXrefs xRef offsets = (mapWithKey updateEntry xRef, offsets ! StartXRef)
where
updateEntry objectId e@(InUse {}) = e {offset = offsets ! (Object objectId)}
updateEntry _ e = e
--
-- Flow
--
data Flow = Flow {
occurrencesStack :: [Occurrence]
, tmpObjects :: (Indexed Object)
, tmpObjects :: IndexedObjects
} deriving Show
--
-- Content
--
data Content = Content {
occurrences :: [Occurrence]
, objects :: IndexedObjects
, docStructure :: Structure
} deriving Show
outputBody :: ([Occurrence], IndexedObjects) -> OBuilder
outputBody (occurrences, objects) =
output (outputOccurrence objects <$> occurrences) <* saveOffset StartXRef
instance Output Content where
output (Content {occurrences, objects, docStructure}) =
fmap (updateXrefs xRef) <$> getOffsets (outputBody (occurrences, objects))
>>= \(body, (xref, startXRef)) -> Output.concat [
body
, Output.line "xref"
, output xref
, Output.line "trailer"
, output trailer, newLine
, Output.line "startxref"
, Output.line (printf "%d" (getOffset startXRef))
, byteString eofMarker
]
where
Structure {xRef, trailer} = docStructure

View file

@ -1,180 +0,0 @@
{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE DeriveFunctor #-}
module PDF.Object.Navigation (
Nav(..)
, PPath(..)
, ROLayer
, RWLayer
, StreamContent(..)
, (./)
, (//)
, (>./)
, (>//)
, castObject
, catalog
, getDictionary
, getKey
, objectById
, save
) where
import Codec.Compression.Zlib (compress, decompress)
import Control.Monad.Except (MonadError(..))
import Control.Monad.Reader (MonadReader(..))
import Control.Monad.State (MonadState)
import Data.ByteString (ByteString)
import qualified Data.ByteString as BS (length)
import qualified Data.ByteString.Lazy as Lazy (ByteString, fromStrict, toStrict)
import Data.Id (Id)
import qualified Data.Id as Id (at)
import qualified Data.Map as Map (adjust, insert, lookup)
import PDF.Box (Box(..), at, edit{-, runRO-})
import PDF.Layer (Layer(..))
import PDF.Object (
Dictionary, DirectObject(..), IndirectObjCoordinates(..)
, Name(..), Number(..), Object(..), Structure(..)
)
import Prelude hiding (fail)
import Text.Printf (printf)
newtype PPath = PPath [Component]
data DPath = DPath {
root :: Id Object
, offset :: [Component]
} deriving Show
push :: Component -> DPath -> DPath
push component dPath = dPath {offset = (offset dPath) ++ [component]}
data Nav a = Nav {
dPath :: DPath
, value :: a
} deriving (Functor)
instance Show a => Show (Nav a) where
show (Nav {dPath, value}) = "Nav {dPath = " ++ show dPath ++ ", value = " ++ show value ++ "}"
type ROLayer m = (MonadReader Layer m, MonadError String m)
type RWLayer m = (MonadState Layer m, MonadError String m)
type Component = String
getDictionary :: ROLayer m => Nav Object -> m (Nav Dictionary)
getDictionary (Nav {dPath, value}) =
case value of
(Direct (Dictionary aDict)) -> return $ Nav {dPath, value = aDict}
(Direct (Reference ref)) -> objectById (objectId ref) >>= getDictionary
(Stream {header}) -> return $ Nav {dPath, value = header}
obj -> expected "dictionary : " obj
expected :: (MonadError String m, Show a) => String -> a -> m b
expected name = throwError . printf "Not a %s: %s" name . show
getKey :: ROLayer m => String -> Nav Object -> m (Nav DirectObject)
getKey key navObject = getDictionary navObject >>= f
where
errorMessage =
printf "Key %s not found in object %s" key (show navObject)
f (Nav {dPath, value}) =
case Map.lookup (Name key) value of
Nothing -> throwError errorMessage
Just dObj -> return $ Nav {dPath = push key dPath, value = dObj}
objectById :: ROLayer m => (Id Object) -> m (Nav Object)
objectById objectId = do
layer <- ask
return $ Nav {
dPath = DPath {root = objectId, offset = []}
, value = objects layer `Id.at` objectId
}
castObject :: ROLayer m => Nav DirectObject -> m (Nav Object)
castObject (Nav {value = !(Reference (IndirectObjCoordinates {objectId}))}) =
objectById objectId
castObject (Nav {dPath, value}) = return $ Nav {dPath, value = Direct value}
(./) :: ROLayer m => m (Nav Object) -> Component -> m (Nav Object)
(./) navObject key = (navObject >>= getKey key >>= castObject)
(//) :: ROLayer m => m (Nav Object) -> PPath -> m (Nav Object)
(//) navObject (PPath []) = navObject
(//) navObject (PPath (key:keys)) = navObject ./ key // (PPath keys)
(>./) :: ROLayer m => Nav Object -> Component -> m (Nav Object)
(>./) navObject = (return navObject ./)
(>//) :: ROLayer m => Nav Object -> PPath -> m (Nav Object)
(>//) navObject = (return navObject //)
catalog :: ROLayer m => m (Nav Object)
catalog = do
value <- Direct . Dictionary . trailer . docStructure <$> ask
return $ Nav {dPath = undefined, value}
setAt :: [Component] -> DirectObject -> Dictionary -> Dictionary
setAt [] _ dict = dict
setAt [component] directObject dict =
Map.insert (Name component) directObject dict
setAt (component:components) directObject dict =
Map.adjust setDirObj (Name component) dict
where
setDirObj (Dictionary subDict) =
Dictionary $ setAt components directObject subDict
setDirObj x = x
save :: RWLayer m => Nav Object -> m ()
save nav@(Nav {dPath, value = Direct dObj}) =
edit .at (root dPath) $ return . setObj
where
setObj obj@(Stream {header}) =
obj {header = setAt (offset dPath) dObj header}
setObj (Direct (Dictionary dict)) =
Direct . Dictionary $ setAt (offset dPath) dObj dict
setObj _ = value nav
save (Nav {dPath = DPath {root, offset = []}, value}) = edit $ w root value
save _ = throwError "Streams can't be properties of PDF objects"
data StreamContent = Clear | Raw
onLazy :: (Lazy.ByteString -> Lazy.ByteString) -> ByteString -> ByteString
onLazy f = Lazy.toStrict . f . Lazy.fromStrict
contains :: String -> DirectObject -> Bool
contains needle !(NameObject (Name n)) = needle == n
contains needle !(Array directObjects) = oneOf directObjects (contains needle)
where
oneOf [] _ = False
oneOf (x:xs) p = p x || oneOf xs p
contains _ _ = False
instance MonadError String m => Box m StreamContent (Nav Object) ByteString where
r sc = r sc . value
w sc newStreamContent nav = setValue <$> w sc newStreamContent (value nav)
where
setValue value = nav {value}
instance MonadError String m => Box m StreamContent Object ByteString where
r Raw (Stream {streamContent}) = return streamContent
r Clear (Stream {header, streamContent}) = return $
case Map.lookup (Name "Filter") header of
Just directObject
| contains "FlateDecode" directObject -> onLazy decompress streamContent
_ -> streamContent
r _ obj = expected "stream" obj
w Raw streamContent obj@(Stream {}) = return $ obj {streamContent}
w Clear newStreamContent (Stream {header}) =
let streamContent = getStreamContent (Map.lookup (Name "Filter") header) in
return $ Stream {header = fixLength streamContent, streamContent}
where
getStreamContent (Just directObject)
| contains "FlateDecode" directObject = onLazy compress newStreamContent
getStreamContent _ = newStreamContent
fixLength sc =
let newLength = NumberObject . Number . fromIntegral $ BS.length sc in
Map.insert (Name "Length") newLength header
w _ _ obj = expected "stream" obj

View file

@ -5,12 +5,14 @@
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
module PDF.Output (
OBuilder
, ObjectId(..)
, OContext(..)
, Offset(..)
, Output(..)
, Resource(..)
, byteString
, char
, concat
, getOffsets
, join
, line
@ -30,10 +32,12 @@ import qualified Data.Map as Map (singleton)
import Data.String (IsString(..))
import Control.Monad.RWS (RWS, runRWS, ask, get, listen, modify, tell)
import qualified PDF.EOL as EOL (Style(..))
import Prelude hiding (concat)
newtype ObjectId = ObjectId {getObjectId :: Int} deriving (Enum, Eq, Ord, Show)
newtype Offset = Offset {getOffset :: Int} deriving (Show)
data Resource = StartXRef | ObjectId Int deriving (Eq, Ord)
data Resource = StartXRef | Object ObjectId deriving (Eq, Ord)
newtype OContext a = OContext (RWS EOL.Style (Map Resource Offset) Offset a)
type OBuilder = OContext Builder
@ -55,12 +59,16 @@ saveOffset resource = OContext $
lift :: (a -> Builder) -> a -> OBuilder
lift f a = return (f a)
getOffsets :: OBuilder -> OContext (Builder, Map Resource Offset)
getOffsets (OContext builder) = OContext $ listen builder
getOffsets :: OBuilder -> OContext (OBuilder, Map Resource Offset)
getOffsets (OContext builder) =
OContext (listen builder >>= \(a, w) -> return (return a, w))
append :: OBuilder -> OBuilder -> OBuilder
append (OContext a) (OContext b) = OContext (mappend <$> a <*> b)
concat :: [OBuilder] -> OBuilder
concat = foldl mappend mempty
#if MIN_VERSION_base(4,11,0)
instance Semigroup OBuilder where
(<>) = append
@ -86,7 +94,7 @@ instance Output Bool where
output True = string "true"
instance Output a => Output [a] where
output = mconcat . fmap output
output = concat . fmap output
join :: Output a => String -> [a] -> OBuilder
join _ [] = mempty
@ -108,7 +116,7 @@ char :: Char -> OBuilder
char c = lift char8 c <* offset (+1)
string :: String -> OBuilder
string s = lift string8 s <* offset (+ length s)
string s = lift string8 s <* offset (+ toEnum (length s))
line :: String -> OBuilder
line l = string l `mappend` newLine

View file

@ -1,181 +0,0 @@
{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE FlexibleContexts #-}
module PDF.Pages (
Contents(..)
, FontCache
, Page(..)
, PageNumber(..)
, Pages(..)
, withFonts
, withResources
) where
import Control.Applicative (Alternative)
import Control.Applicative ((<|>))
import Control.Monad (foldM)
import Control.Monad.Except (MonadError(..))
import Control.Monad.Reader (ReaderT, runReaderT)
import Control.Monad.State (StateT(..), evalStateT, execStateT, gets, modify)
import Control.Monad.Trans (lift)
import Data.ByteString (ByteString)
import Data.ByteString.Lazy (toStrict)
import Data.Id (Id(..), IdMap)
import qualified Data.Id as Id (empty, insert, lookup)
import Data.Map (Map)
import qualified Data.Map as Map (empty, elems, fromList, insert, toList)
import Data.Maybe (listToMaybe)
import Data.OrderedMap (OrderedMap, build, mapi)
import PDF.Box (Box(..), at, edit, runRO)
import qualified PDF.CMap as CMap (parse)
import PDF.Content (Content(..))
import qualified PDF.Content as Content (parse)
import PDF.Encoding (encoding)
import PDF.EOL (Style(..))
import PDF.Font (Font, FontSet)
import PDF.Layer (Layer(..), LayerReader)
import PDF.Object (
Dictionary, DirectObject(..), IndirectObjCoordinates(..)
, Name(..), Number(..), Object(..)
,)
import PDF.Object.Navigation (
Nav(..), PPath(..), ROLayer, RWLayer, StreamContent(..), (//), (>./)
, (>//), catalog, getDictionary, getKey, objectById, save
)
import PDF.Output (render)
import Text.Printf (printf)
type CachedFonts = IdMap Object Font
type FontCache m = StateT CachedFonts m
data Page = Page {
byteContents :: OrderedMap (Id Object) ByteString
, resources :: Dictionary
, source :: (Id Object)
}
loadByteContents :: ROLayer m => DirectObject -> m (OrderedMap (Id Object) ByteString)
loadByteContents directObject = do
objs <- sequence . build objectById $ objectIds directObject
mapM (r Clear) objs
where
objectIds (Array l) = l >>= getReference
objectIds dirObj = getReference dirObj
getFontDictionary :: (Alternative m, ROLayer m) => Nav Object -> m Dictionary
getFontDictionary pageObj =
(pageObj >// PPath ["Resources", "Font"] >>= fmap value . getDictionary)
<|> return Map.empty
cache :: ROLayer m => (Id Object -> FontCache m Font) -> Id Object -> FontCache m Font
cache loader objectId =
gets (Id.lookup objectId) >>= maybe load return
where
load = do
value <- loader objectId
modify $ Id.insert objectId value
return value
loadFont :: (Alternative m, ROLayer m) => Id Object -> FontCache m Font
loadFont objectId = lift $ objectById objectId >>= tryMappings
where
tryMappings object =
(object >./ "ToUnicode" >>= r Clear >>= CMap.parse)
<|> (object >./ "Encoding" >>= loadEncoding . value)
<|> (throwError $ unknownFormat (show objectId) (show object))
unknownFormat = printf "Unknown font format for object #%s : %s"
loadEncoding :: MonadError String m => Object -> m Font
loadEncoding (Direct (NameObject (Name name))) = encoding name
loadEncoding object =
throwError $ printf "Encoding must be a name, not that : %s" $ show object
loadResources :: (Alternative m, ROLayer m) => Dictionary -> FontCache m FontSet
loadResources = foldM addFont Map.empty . Map.toList
where
addFont output (name, Reference (IndirectObjCoordinates {objectId})) =
flip (Map.insert name) output <$> cache loadFont objectId
addFont output _ = return output
getReference :: DirectObject -> [Id Object]
getReference (Reference (IndirectObjCoordinates {objectId})) = [objectId]
getReference _ = []
loadPage :: (Alternative m, ROLayer m) => Id Object -> m Page
loadPage source = do
page <- objectById source
byteContents <- loadByteContents . value =<< getKey "Contents" page
resources <- getFontDictionary page
return $ Page {byteContents, resources, source}
pagesList :: (Alternative m, ROLayer m) => m [Id Object]
pagesList =
(catalog // PPath ["Root", "Pages", "Kids"] >>= getReferences . value)
<|> return []
where
getReferences (Direct (Array kids)) = return $ getReference =<< kids
getReferences _ = throwError "Not a pages array"
editPagesList :: RWLayer m => ([DirectObject] -> [DirectObject]) -> m ()
editPagesList f = do
pages <- runRO (catalog // PPath ["Root", "Pages"])
kids <- runRO (pages >./ "Kids")
count <- runRO (pages >./ "Count")
(newSize, newKids) <- editKids (value kids)
save $ kids {value = newKids}
save $ count {value = Direct $ NumberObject newSize}
where
editKids (Direct (Array pageRefs)) =
let result = f pageRefs in
return (Number . fromIntegral $ length result, Direct $ Array result)
editKids _ = throwError "Invalid format for Root.Pages.Kids (not an array)"
updatePage :: RWLayer m => Page -> m ()
updatePage (Page {byteContents}) = sequence_ $ mapi updateByteContent byteContents
where
updateByteContent source byteContent =
edit .at source .at Clear $ \_ -> return byteContent
data Pages = Pages
newtype PageNumber = P Int
data Contents = Contents
instance (Alternative m, MonadError String m) => Box m Pages Layer (Map Int Page) where
r Pages = runReaderT (numbered <$> pagesList >>= mapM loadPage)
where
numbered :: [Id Object] -> Map Int (Id Object)
numbered = Map.fromList . zip [1..]
w Pages pages = execStateT $ do
mapM_ updatePage pages
setPagesList $ Map.elems (source <$> pages)
where
setPagesList =
editPagesList . const . fmap (Reference . flip IndirectObjCoordinates 0)
instance (Alternative m, MonadError String m) => Box m PageNumber Layer Page where
r (P p) layer
| p < 1 = throwError "Pages start at 1"
| otherwise = runReaderT (drop (p - 1) <$> pagesList >>= firstPage) layer
where
firstPage =
maybe (throwError "Page is out of bounds") loadPage . listToMaybe
w (P p) page = execStateT $ do
updatePage page
editPagesList $ setPage (Reference $ IndirectObjCoordinates (source page) 0)
where
setPage ref l = take (p-1) l ++ ref : drop p l
instance Monad m => Box m Contents Page (OrderedMap (Id Object) Content) where
r Contents = return . fmap Content.parse . byteContents
w Contents contents page = return $ page {byteContents}
where
byteContents = toStrict . render LF <$> contents
withFonts :: MonadError String m => (Layer -> FontCache (LayerReader m) a) -> Layer -> m a
withFonts f layer = runReaderT (evalStateT (f layer) Id.empty) layer
withResources :: (Alternative m, MonadError String m) => (Page -> ReaderT FontSet m b) -> Page -> FontCache (LayerReader m) b
withResources f p =
loadResources (resources p) >>= lift . lift . runReaderT (f p)

View file

@ -1,78 +1,56 @@
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE UndecidableInstances #-}
module PDF.Parser (
MonadParser(..)
, Parser
Parser
, (<?>)
, block
, char
, choice
, count
, decNumber
, hexNumber
, many
, octDigit
, on
, oneOf
, option
, runParser
, evalParser
, sepBy
, string
, takeAll
, takeAll1
) where
import Control.Applicative (Alternative, (<|>))
import Control.Monad (MonadPlus)
import Control.Monad.Fail (MonadFail(..))
import Control.Applicative ((<|>), empty)
import Control.Monad.State (StateT(..), evalStateT)
import Control.Monad.Trans (MonadTrans(..))
import Control.Monad.Trans (lift)
import qualified Data.Attoparsec.ByteString.Char8 as Atto (
Parser, char, decimal, double, endOfInput, parseOnly, peekChar', satisfy, string, take
, takeWhile, takeWhile1
Parser, char, parseOnly, satisfy, string, take, takeWhile, takeWhile1
)
import Data.ByteString (ByteString)
import Data.ByteString.Char8.Util (B16Int(..))
import Data.Char (toLower)
import Data.Set (Set)
import qualified Data.Set as Set (fromList, member, unions)
import Prelude hiding (fail)
type MonadDeps m = (MonadFail m, MonadPlus m)
class MonadDeps m => MonadParser m where
block :: Int -> m ByteString
char :: Char -> m Char
decNumber :: m Int
floatNumber :: m Double
endOfInput :: m ()
hexNumber :: m B16Int
oneOf :: String -> m Char
peek :: m Char
string :: ByteString -> m ByteString
takeAll :: (Char -> Bool) -> m ByteString
takeAll1 :: (Char -> Bool) -> m ByteString
instance MonadParser Atto.Parser where
block = Atto.take
char = Atto.char
endOfInput = Atto.endOfInput
decNumber = Atto.decimal
floatNumber = Atto.double
hexNumber = B16Int <$> Atto.takeWhile1 (`Set.member` hexDigits)
oneOf charSet = Atto.satisfy (`elem` charSet)
peek = Atto.peekChar'
string s = Atto.string s <?> show s
takeAll = Atto.takeWhile
takeAll1 = Atto.takeWhile1
instance (MonadParser m, MonadTrans t, MonadDeps (t m)) => MonadParser (t m) where
block = lift . block
char = lift . char
endOfInput = lift $ endOfInput
decNumber = lift $ decNumber
floatNumber = lift $ floatNumber
hexNumber = lift $ hexNumber
oneOf = lift . oneOf
peek = lift $ peek
string = lift . string
takeAll = lift . takeAll
takeAll1 = lift . takeAll1
type Parser s = StateT s Atto.Parser
(<?>) :: (Alternative m, MonadFail m) => m a -> String -> m a
(<?>) :: Parser s a -> String -> Parser s a
(<?>) parser debugMessage = parser <|> fail debugMessage
block :: Int -> Parser s ByteString
block = lift . Atto.take
char :: Char -> Parser s Char
char = lift . Atto.char
choice :: [Parser s a] -> Parser s a
choice = foldr (<|>) empty
count :: Int -> Parser s a -> Parser s [a]
count 0 _ = return []
count n p = (:) <$> p <*> count (n-1) p
decNumber :: Parser s ByteString
decNumber = lift $ Atto.takeWhile1 (`Set.member` digits)
digits :: Set Char
digits = Set.fromList ['0'..'9']
@ -81,7 +59,13 @@ hexDigits = Set.unions [digits, Set.fromList af, Set.fromList $ toLower <$> af]
where
af = ['A'..'F']
octDigit :: MonadParser m => m Char
hexNumber :: Parser s ByteString
hexNumber = lift $ Atto.takeWhile1 (`Set.member` hexDigits)
many :: Parser s a -> Parser s [a]
many parser = (:) <$> parser <*> many parser <|> return []
octDigit :: Parser s Char
octDigit = oneOf ['0'..'7']
on :: Parser s a -> ByteString -> Parser s (Either String a)
@ -90,8 +74,25 @@ on (StateT parserF) input = StateT $ \state ->
Left errorMsg -> return (Left errorMsg, state)
Right (result, newState) -> return (Right result, newState)
runParser :: Parser s a -> s -> ByteString -> Either String (a, s)
runParser parser initState = Atto.parseOnly (runStateT parser initState)
oneOf :: String -> Parser s Char
oneOf charSet = lift $ Atto.satisfy (`elem` charSet)
evalParser :: Parser s a -> s -> ByteString -> Either String a
evalParser parser initState = Atto.parseOnly (evalStateT parser initState)
option :: a -> Parser s a -> Parser s a
option defaultValue p = p <|> pure defaultValue
runParser :: Parser s a -> s -> ByteString -> Either String a
runParser parser initState =
Atto.parseOnly (evalStateT parser initState)
sepBy :: Parser s a -> Parser s b -> Parser s [a]
sepBy parser separator =
option [] $ (:) <$> parser <*> many (separator *> parser)
string :: ByteString -> Parser s ByteString
string = lift . Atto.string
takeAll :: (Char -> Bool) -> Parser s ByteString
takeAll = lift . Atto.takeWhile
takeAll1 :: (Char -> Bool) -> Parser s ByteString
takeAll1 = lift . Atto.takeWhile1

47
src/PDF/Update.hs Normal file
View file

@ -0,0 +1,47 @@
{-# LANGUAGE NamedFieldPuns #-}
module PDF.Update (
unify
) where
import Data.Map (member)
import qualified Data.Map as Map (empty, union)
import PDF.Object (
Content(..), IndexedObjects, IndirectObjCoordinates(..), Occurrence(..)
, Structure(..)
)
emptyContent :: Content
emptyContent = Content {
docStructure = Structure {xRef = Map.empty, trailer = Map.empty}
, objects = Map.empty
, occurrences = []
}
unify :: [Content] -> Content
unify = foldl complete emptyContent
where
complete tmpContent older =
let mergedObjects = Map.union (objects tmpContent) (objects older) in
Content {
docStructure =
unifyDocStructure (docStructure tmpContent) (docStructure older)
, objects = mergedObjects
, occurrences =
unifyOccurrences mergedObjects (occurrences tmpContent) (occurrences older)
}
unifyDocStructure :: Structure -> Structure -> Structure
unifyDocStructure update original = Structure {
xRef = Map.union (xRef update) (xRef original)
, trailer = Map.union (trailer update) (trailer original)
}
unifyOccurrences :: IndexedObjects -> [Occurrence] -> [Occurrence] -> [Occurrence]
unifyOccurrences objects update = foldr addOlder update
where
addOlder occurrence@(Comment _) existing = occurrence : existing
addOlder occurrence@(Indirect indirect) existing =
if objectId indirect `member` objects
then occurrence : existing
else existing

View file

@ -1,9 +0,0 @@
{-# LANGUAGE OverloadedStrings #-}
import Test.HUnit
import Object (testNumber, testString)
main :: IO ()
main = runTestTT (TestList [
testNumber
, testString
]) *> return ()

View file

@ -1,50 +0,0 @@
{-# LANGUAGE OverloadedStrings #-}
module Object (
testNumber
, testString
) where
import Data.ByteString.Char8 (pack)
import Data.ByteString.Char8.Util (B16Int(..))
import PDF.Object (Number(..), StringObject(..), name, number, stringObject)
import PDF.Parser (MonadParser, Parser, evalParser)
import Test.HUnit
check :: (Eq a, Show a) => Parser () a -> (a, String) -> Test
check parser (aim, input) =
TestCase $ assertEqual message (Right aim) (parse input)
where
message = "parse: " ++ input
parse = evalParser parser () . pack
testName :: Test
testName = TestLabel "Name" . TestList $ check name <$> [
]
testNumber :: Test
testNumber = TestLabel "Number" . TestList $ check number <$> [
(Number 123, "123")
, (Number 43445, "43445")
, (Number 17, "+17")
, (Number (-98), "-98")
, (Number 0, "0")
, (Number 34.5, "34.5")
, (Number (-3.62), "-3.62")
, (Number 123.6, "+123.6")
, (Number 4, "4.")
, (Number (-0.002), "-.002")
, (Number 0, "0.0")
]
testString :: Test
testString = TestLabel "StringObject" . TestList $ check stringObject <$> [
(Literal "This is a string", "(This is a string)")
, (Literal "Strings may contain newlines\nand such .", "(Strings may contain newlines\nand such .)")
, (Literal "Strings may contain balanced parentheses ( ) and\nspecial characters ( * ! & } ^ % and so on ) .", "(Strings may contain balanced parentheses ( ) and\nspecial characters ( * ! & } ^ % and so on ) .)")
, (Literal "The following is an empty string .", "(The following is an empty string .)")
, (Literal "", "()")
, (Literal "It has zero ( 0 ) length .", "(It has zero ( 0 ) length .)")
, (Hexadecimal $ B16Int "4E6F762073686D6F7A206B6120706F702E", "<4E6F762073686D6F7A206B6120706F702E>")
, (Hexadecimal $ B16Int "901FA3", "<901FA3>")
, (Hexadecimal $ B16Int "901FA0", "<901FA>")
]