Compare commits
119 commits
main
...
extract-te
Author | SHA1 | Date | |
---|---|---|---|
6a7e9e9595 | |||
d9f69014a0 | |||
f6664683c7 | |||
c491e8a70c | |||
09bd706748 | |||
729e312f90 | |||
6d265633e4 | |||
1eb1c23053 | |||
44125f75a6 | |||
c8a5e2b191 | |||
11640c8465 | |||
e94a09b3ec | |||
ba7dd6a690 | |||
d21e14f9a4 | |||
a1c2fbf110 | |||
24630a04a1 | |||
ee5e7500a8 | |||
d8aec5bf80 | |||
25e2823c75 | |||
5027b079eb | |||
5722dd1a04 | |||
f31e9eb38b | |||
0f857c457d | |||
40475a3093 | |||
a9d3e5d326 | |||
f2a99e1fd2 | |||
5bf2b08fa9 | |||
5b8d951516 | |||
d3f1b97f3a | |||
c4c3e35e09 | |||
10f8c711da | |||
b6c1f670ef | |||
3b1a5152e4 | |||
a04adff1d2 | |||
103037ffb2 | |||
dce10ae63a | |||
f2986da96d | |||
673321bf0a | |||
0ade9cc2f5 | |||
457f1755e6 | |||
ca40d2df76 | |||
44bc898ed3 | |||
1ec47c5d07 | |||
6e245189fd | |||
90348c57d6 | |||
50ac0692b2 | |||
2b9abc24b6 | |||
309f6ed461 | |||
93c9863426 | |||
7cef65d799 | |||
d288ecf0ac | |||
3b3eeef218 | |||
2c02e44adf | |||
9ce1a48030 | |||
4969c6442e | |||
cb257fc07e | |||
d90eaf6f1c | |||
99014ff30d | |||
f4df4aab22 | |||
bdbc5f7351 | |||
30fece6537 | |||
1a70f2972b | |||
67faa06ea2 | |||
83a63d4b02 | |||
85ee8519c4 | |||
e607f9cd37 | |||
a9252b129a | |||
71e62ee732 | |||
160999a7d7 | |||
36b1782464 | |||
bcf2e05bfb | |||
6096a1a237 | |||
23186100a8 | |||
b916ab5206 | |||
4a6dbda7d3 | |||
923d1800b0 | |||
1c457d71d8 | |||
a72d76e229 | |||
919f640443 | |||
ae938acc02 | |||
32f9866106 | |||
eb4d76002c | |||
af994cb50c | |||
704d7a7fcf | |||
11647eb4eb | |||
aed7af376a | |||
e77bbbcda9 | |||
195446e653 | |||
9f1b1afafe | |||
20466c4f13 | |||
325250383a | |||
c48ab22808 | |||
a2b66ac6d6 | |||
cefb08ee50 | |||
afbbcbffc5 | |||
8373bd1ea0 | |||
bac08446dd | |||
f9f799c59b | |||
08a9717b3a | |||
42a02808c1 | |||
c9f050e64b | |||
3a3e1533b4 | |||
a96e36ec5a | |||
d07c286f8e | |||
7a15113285 | |||
36d7f9b819 | |||
b8ca7281aa | |||
32efdcdd6b | |||
3b59fd0c61 | |||
0374b72920 | |||
1dd22c3889 | |||
98d029c4d4 | |||
c349d9b4c2 | |||
e7484ef536 | |||
f9e5683bf4 | |||
b8eb9e6856 | |||
66d315b7fe | |||
51db57ec67 | |||
6f3c159ea7 |
34 changed files with 2224 additions and 334 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -1,2 +1,3 @@
|
||||||
dist*/
|
dist*/
|
||||||
.ghc.environment.*
|
.ghc.environment.*
|
||||||
|
cabal.project.local
|
||||||
|
|
|
@ -16,20 +16,40 @@ extra-source-files: ChangeLog.md
|
||||||
cabal-version: >=1.10
|
cabal-version: >=1.10
|
||||||
|
|
||||||
library
|
library
|
||||||
exposed-modules: PDF
|
exposed-modules: Data.OrderedMap
|
||||||
|
, Data.Id
|
||||||
|
, PDF
|
||||||
|
, PDF.Box
|
||||||
|
, PDF.CMap
|
||||||
|
, PDF.Content
|
||||||
|
, PDF.Content.Operator
|
||||||
|
, PDF.Content.Operator.Color
|
||||||
|
, PDF.Content.Operator.Common
|
||||||
|
, PDF.Content.Operator.GraphicState
|
||||||
|
, PDF.Content.Operator.Path
|
||||||
|
, PDF.Content.Operator.Text
|
||||||
|
, PDF.Content.Text
|
||||||
, PDF.EOL
|
, PDF.EOL
|
||||||
|
, PDF.Layer
|
||||||
, PDF.Object
|
, PDF.Object
|
||||||
|
, PDF.Object.Navigation
|
||||||
, PDF.Output
|
, PDF.Output
|
||||||
, PDF.Update
|
, PDF.Parser
|
||||||
|
, PDF.Pages
|
||||||
other-modules: Data.ByteString.Char8.Util
|
other-modules: Data.ByteString.Char8.Util
|
||||||
, PDF.Body
|
, PDF.Body
|
||||||
, PDF.Parser
|
, PDF.Encoding
|
||||||
|
, PDF.Encoding.MacRoman
|
||||||
|
, PDF.Font
|
||||||
-- other-extensions:
|
-- other-extensions:
|
||||||
build-depends: attoparsec
|
build-depends: attoparsec
|
||||||
, base >=4.9 && <4.13
|
, base >=4.9 && <4.13
|
||||||
, bytestring
|
, bytestring
|
||||||
, containers
|
, containers
|
||||||
, mtl
|
, mtl
|
||||||
|
, text
|
||||||
|
, utf8-string
|
||||||
|
, zlib
|
||||||
hs-source-dirs: src
|
hs-source-dirs: src
|
||||||
ghc-options: -Wall
|
ghc-options: -Wall
|
||||||
default-language: Haskell2010
|
default-language: Haskell2010
|
||||||
|
@ -39,7 +59,7 @@ executable equivalent
|
||||||
build-depends: base
|
build-depends: base
|
||||||
, bytestring
|
, bytestring
|
||||||
, Hufflepdf
|
, Hufflepdf
|
||||||
ghc-options: -Wall
|
ghc-options: -Wall -rtsopts
|
||||||
default-language: Haskell2010
|
default-language: Haskell2010
|
||||||
|
|
||||||
executable getObj
|
executable getObj
|
||||||
|
@ -48,6 +68,63 @@ executable getObj
|
||||||
, bytestring
|
, bytestring
|
||||||
, containers
|
, containers
|
||||||
, Hufflepdf
|
, Hufflepdf
|
||||||
, zlib
|
, mtl
|
||||||
ghc-options: -Wall
|
ghc-options: -Wall -rtsopts
|
||||||
|
default-language: Haskell2010
|
||||||
|
|
||||||
|
executable getText
|
||||||
|
main-is: examples/getText.hs
|
||||||
|
build-depends: base
|
||||||
|
, bytestring
|
||||||
|
, containers
|
||||||
|
, ExceptIOH
|
||||||
|
, Hufflepdf
|
||||||
|
, mtl
|
||||||
|
, text
|
||||||
|
ghc-options: -Wall -rtsopts
|
||||||
|
default-language: Haskell2010
|
||||||
|
|
||||||
|
executable pdfCut
|
||||||
|
main-is: examples/pdfCut.hs
|
||||||
|
build-depends: base
|
||||||
|
, bytestring
|
||||||
|
, containers
|
||||||
|
, ExceptIOH
|
||||||
|
, filepath
|
||||||
|
, Hufflepdf
|
||||||
|
, mtl
|
||||||
|
, text
|
||||||
|
ghc-options: -Wall -rtsopts
|
||||||
|
default-language: Haskell2010
|
||||||
|
|
||||||
|
executable fixMermoz
|
||||||
|
main-is: examples/fixMermoz.hs
|
||||||
|
build-depends: base
|
||||||
|
, bytestring
|
||||||
|
, ExceptIOH
|
||||||
|
, Hufflepdf
|
||||||
|
, mtl
|
||||||
|
ghc-options: -Wall -rtsopts
|
||||||
|
default-language: Haskell2010
|
||||||
|
|
||||||
|
Test-Suite unitTests
|
||||||
|
type: exitcode-stdio-1.0
|
||||||
|
main-is: Main.hs
|
||||||
|
other-modules: Object
|
||||||
|
, Data.ByteString.Char8.Util
|
||||||
|
, Data.Id
|
||||||
|
, PDF.EOL
|
||||||
|
, PDF.Parser
|
||||||
|
, PDF.Object
|
||||||
|
, PDF.Output
|
||||||
|
hs-source-dirs: test
|
||||||
|
, src
|
||||||
|
build-depends: attoparsec
|
||||||
|
, base >=4.9 && <4.13
|
||||||
|
, bytestring
|
||||||
|
, containers
|
||||||
|
, Hufflepdf
|
||||||
|
, HUnit
|
||||||
|
, mtl
|
||||||
|
, text
|
||||||
default-language: Haskell2010
|
default-language: Haskell2010
|
||||||
|
|
11
examples/getContent.hs
Normal file
11
examples/getContent.hs
Normal file
|
@ -0,0 +1,11 @@
|
||||||
|
main :: IO ()
|
||||||
|
main = do
|
||||||
|
hSetBuffering stdout LineBuffering
|
||||||
|
args <- getArgs
|
||||||
|
case args of
|
||||||
|
--[inputFile] -> wholeDoc inputFile
|
||||||
|
[inputFile, pageNumber] -> do
|
||||||
|
content <- fmap (unify . updates) . parseDocument <$> BS.readFile inputFile
|
||||||
|
get content (read pageNumber)
|
||||||
|
singlePage inputFile (read pageNumber)
|
||||||
|
_ -> die "Syntax: getContent INPUT_FILE PAGE_NUMBER"
|
|
@ -1,53 +1,52 @@
|
||||||
{-# LANGUAGE NamedFieldPuns #-}
|
{-# LANGUAGE NamedFieldPuns #-}
|
||||||
|
{-# LANGUAGE FlexibleContexts #-}
|
||||||
|
|
||||||
import Codec.Compression.Zlib (decompress)
|
import Control.Monad.Reader (ReaderT, runReaderT)
|
||||||
import Data.ByteString.Lazy.Char8 (ByteString)
|
import Data.ByteString.Lazy.Char8 (ByteString)
|
||||||
import qualified Data.ByteString.Char8 as BS (readFile)
|
import qualified Data.ByteString.Char8 as BS (readFile)
|
||||||
import qualified Data.ByteString.Lazy.Char8 as Lazy (fromStrict, putStr, toStrict)
|
import qualified Data.ByteString.Lazy.Char8 as Lazy (putStrLn)
|
||||||
import Data.Map ((!?))
|
import Data.Id (Id(..))
|
||||||
import qualified Data.Map as Map (keys, lookup)
|
|
||||||
import PDF (Document(..), parseDocument)
|
import PDF (Document(..), parseDocument)
|
||||||
import qualified PDF.EOL as EOL (Style)
|
import PDF.Box (Box(..))
|
||||||
import PDF.Object (Content(..), DirectObject(..), Object(..), Name(..))
|
import PDF.Layer (Layer(..), unify)
|
||||||
import PDF.Output (ObjectId(..))
|
import PDF.Object (Object(..))
|
||||||
|
import PDF.Object.Navigation (
|
||||||
|
Nav(..), PPath(..), StreamContent(..), (//), objectById, catalog
|
||||||
|
)
|
||||||
|
import PDF.Output (Output)
|
||||||
import qualified PDF.Output as Output (render)
|
import qualified PDF.Output as Output (render)
|
||||||
import PDF.Update (unify)
|
import Prelude hiding (fail)
|
||||||
import System.Environment (getArgs, getProgName)
|
import System.Environment (getArgs, getProgName)
|
||||||
import System.Exit (die)
|
import System.Exit (die)
|
||||||
import Text.Printf (printf)
|
import Text.Printf (printf)
|
||||||
|
import Text.Read (readMaybe)
|
||||||
|
|
||||||
display :: EOL.Style -> Object -> ByteString
|
decodedStream :: Object -> Object
|
||||||
display eolStyle d@(Direct _) = Output.render eolStyle d
|
decodedStream object =
|
||||||
display eolStyle s@(Stream {header, streamContent}) = Output.render eolStyle $
|
either (const object) id $ r Clear object >>= flip (w Raw) object
|
||||||
case Map.lookup (Name "Filter") header of
|
|
||||||
Just (NameObject (Name "FlateDecode")) -> Stream {
|
|
||||||
header
|
|
||||||
, streamContent = Lazy.toStrict . decompress $ Lazy.fromStrict streamContent
|
|
||||||
}
|
|
||||||
_ -> s
|
|
||||||
|
|
||||||
extractObject :: ObjectId -> Document -> Either String ByteString
|
display :: Functor m => Output a => ReaderT Layer m a -> Document -> m ByteString
|
||||||
extractObject objectId (Document {eolStyle, updates}) =
|
display getter (Document {eolStyle, layers}) =
|
||||||
case objects content !? objectId of
|
Output.render eolStyle <$> runReaderT getter (unify layers)
|
||||||
Nothing -> Left $ "No object has ID " ++ show (getObjectId objectId)
|
|
||||||
Just o -> Right $ display eolStyle o
|
parse :: [String] -> IO (FilePath, Document -> Either String ByteString)
|
||||||
|
parse [inputFile] = return (inputFile, display $ value <$> catalog)
|
||||||
|
parse [inputFile, key] =
|
||||||
|
return (inputFile, clear . maybe (byPath key) byId $ readMaybe key)
|
||||||
where
|
where
|
||||||
content = unify updates
|
byId = objectById . Id
|
||||||
|
byPath path = (catalog // PPath (explode path))
|
||||||
listObjectIds :: Document -> Either String [String]
|
explode "" = []
|
||||||
listObjectIds =
|
explode path =
|
||||||
Right . prependTitle . toString . Map.keys . objects . unify . updates
|
case break (== '.') path of
|
||||||
where
|
(name, "") -> [name]
|
||||||
toString = fmap (show . getObjectId)
|
(name, rest) -> name : explode (drop 1 rest)
|
||||||
prependTitle = ("ObjectIds defined in this PDF:":)
|
clear = display . fmap (decodedStream . value)
|
||||||
|
parse _ =
|
||||||
|
die . printf "Syntax: %s inputFile [OBJECT_ID | PATH_TO_OBJ]\n" =<< getProgName
|
||||||
|
|
||||||
main :: IO ()
|
main :: IO ()
|
||||||
main = do
|
main = do
|
||||||
(inputFile, getData) <- parse =<< getArgs
|
(inputFile, getData) <- parse =<< getArgs
|
||||||
input <- BS.readFile inputFile
|
input <- BS.readFile inputFile
|
||||||
either die id $ (parseDocument input >>= getData)
|
either die Lazy.putStrLn $ (parseDocument input >>= getData)
|
||||||
where
|
|
||||||
parse [inputFile] = return (inputFile, fmap (mapM_ putStrLn) . listObjectIds)
|
|
||||||
parse [inputFile, objectId] = return
|
|
||||||
(inputFile, fmap Lazy.putStr . extractObject (ObjectId (read objectId)))
|
|
||||||
parse _ = die . printf "Syntax: %s inputFile [OBJECT_ID]\n" =<< getProgName
|
|
||||||
|
|
54
examples/getText.hs
Normal file
54
examples/getText.hs
Normal file
|
@ -0,0 +1,54 @@
|
||||||
|
{-# LANGUAGE FlexibleContexts #-}
|
||||||
|
import Control.Monad ((>=>))
|
||||||
|
import Control.Monad.Except (ExceptT(..))
|
||||||
|
import Control.Monad.Except.IOH (handle)
|
||||||
|
import Control.Monad.IO.Class (liftIO)
|
||||||
|
import qualified Data.ByteString.Char8 as BS (readFile)
|
||||||
|
import Data.Id (Id(..), mapWithKey)
|
||||||
|
import qualified Data.Map as Map (mapWithKey)
|
||||||
|
import Data.OrderedMap (mapi)
|
||||||
|
import qualified Data.Text as Text (unpack)
|
||||||
|
import PDF (UnifiedLayers(..), parseDocument)
|
||||||
|
import PDF.Box (Box(..))
|
||||||
|
import PDF.Content.Text (Chunks(..))
|
||||||
|
import PDF.Layer (Layer, LayerReader)
|
||||||
|
import PDF.Pages (
|
||||||
|
Contents(..), FontCache, Page(..), PageNumber(..), Pages(..), withFonts
|
||||||
|
, withResources
|
||||||
|
)
|
||||||
|
import System.Environment (getArgs)
|
||||||
|
import System.Exit (die)
|
||||||
|
import System.IO (BufferMode(..), hSetBuffering, stdout)
|
||||||
|
import Text.Printf (printf)
|
||||||
|
|
||||||
|
displayPage :: Int -> Page -> FontCache (LayerReader (ExceptT String IO)) ()
|
||||||
|
displayPage n = withResources (
|
||||||
|
r Contents
|
||||||
|
>=> sequence_ . mapi (\objectId ->
|
||||||
|
r Chunks >=> sequence_ . mapWithKey (display objectId)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
where
|
||||||
|
display a b v =
|
||||||
|
liftIO . putStrLn $
|
||||||
|
printf "p#%d obj#%d instr#%d: %s" n (getId a) (getId b) (Text.unpack v)
|
||||||
|
|
||||||
|
getAll :: Layer -> ExceptT String IO ()
|
||||||
|
getAll = withFonts $ r Pages >=> sequence_ . Map.mapWithKey displayPage
|
||||||
|
|
||||||
|
get :: Int -> Layer -> ExceptT String IO ()
|
||||||
|
get n = withFonts $ r (P n) >=> displayPage n
|
||||||
|
|
||||||
|
onDoc :: FilePath -> (Layer -> ExceptT String IO ()) -> ExceptT String IO ()
|
||||||
|
onDoc inputFile f =
|
||||||
|
ExceptT (parseDocument <$> BS.readFile inputFile) >>= r UnifiedLayers >>= f
|
||||||
|
|
||||||
|
main :: IO ()
|
||||||
|
main = do
|
||||||
|
hSetBuffering stdout LineBuffering
|
||||||
|
args <- getArgs
|
||||||
|
case args of
|
||||||
|
[inputFile] -> onDoc inputFile getAll `handle` die
|
||||||
|
[inputFile, pageNumber] ->
|
||||||
|
onDoc inputFile (get $ read pageNumber) `handle` die
|
||||||
|
_ -> die "Syntax: getText INPUT_FILE [PAGE_NUMBER]"
|
41
examples/pdfCut.hs
Normal file
41
examples/pdfCut.hs
Normal file
|
@ -0,0 +1,41 @@
|
||||||
|
import Control.Monad.State (MonadState(..), evalStateT)
|
||||||
|
import Control.Monad.Except.IOH (handle)
|
||||||
|
import Control.Monad.Trans (lift)
|
||||||
|
import qualified Data.ByteString.Char8 as BS (readFile)
|
||||||
|
import qualified Data.ByteString.Lazy.Char8 as Lazy (writeFile)
|
||||||
|
import Data.Map ((!), fromSet)
|
||||||
|
import qualified Data.Set as Set (fromList)
|
||||||
|
import PDF (Document, UnifiedLayers(..), parseDocument, render)
|
||||||
|
import PDF.Box (at)
|
||||||
|
import PDF.Pages (Pages(..))
|
||||||
|
import System.Environment (getArgs)
|
||||||
|
import System.Exit (die)
|
||||||
|
import System.FilePath (replaceBaseName, takeBaseName)
|
||||||
|
import Text.Read (readEither)
|
||||||
|
|
||||||
|
parseRange :: String -> Either String (Int, Int)
|
||||||
|
parseRange = evalStateT $ do
|
||||||
|
from <- lift . readEither =<< state (break (== '-'))
|
||||||
|
get >>= makeRange from
|
||||||
|
where
|
||||||
|
makeRange from "" = return (from, from)
|
||||||
|
makeRange from (_:to) = (,) from <$> lift (readEither to)
|
||||||
|
|
||||||
|
cut :: (Int, Int) -> Document -> IO Document
|
||||||
|
cut (p1, p2) doc = ((
|
||||||
|
at UnifiedLayers .at Pages $ \pages ->
|
||||||
|
return $ fromSet (pages!) $ Set.fromList [pMin .. pMax]
|
||||||
|
) doc) `handle` die
|
||||||
|
where
|
||||||
|
(pMin, pMax) = (min p1 p2, max p1 p2)
|
||||||
|
|
||||||
|
main :: IO ()
|
||||||
|
main = do
|
||||||
|
[inputPath, inputRange] <- getArgs
|
||||||
|
range <- catchLeft $ parseRange inputRange
|
||||||
|
doc <- catchLeft =<< parseDocument <$> BS.readFile inputPath
|
||||||
|
Lazy.writeFile (outputPath inputPath inputRange) . render =<< cut range doc
|
||||||
|
where
|
||||||
|
catchLeft = either die return
|
||||||
|
outputPath fileName range =
|
||||||
|
replaceBaseName fileName (takeBaseName fileName ++ "_p" ++ range)
|
|
@ -1,16 +1,91 @@
|
||||||
module Data.ByteString.Char8.Util (
|
module Data.ByteString.Char8.Util (
|
||||||
previous
|
B16Int(..)
|
||||||
|
, B256Int(..)
|
||||||
|
, b8ToInt
|
||||||
|
, b16ToBytes
|
||||||
|
, b16ToInt
|
||||||
|
, b256ToInt
|
||||||
|
, intToB256
|
||||||
|
, previous
|
||||||
, subBS
|
, subBS
|
||||||
|
, toBytes
|
||||||
|
, unescape
|
||||||
|
, utf16BEToutf8
|
||||||
) where
|
) where
|
||||||
|
|
||||||
import Data.ByteString.Char8 (ByteString)
|
import Data.ByteString (ByteString, snoc)
|
||||||
import qualified Data.ByteString.Char8 as BS (drop, index, take)
|
import qualified Data.ByteString as BS (empty, foldl', length, pack, singleton, splitAt)
|
||||||
|
import qualified Data.ByteString.Char8 as Char8 (
|
||||||
|
cons, drop, index, splitAt, take, uncons, unpack
|
||||||
|
)
|
||||||
|
import Data.Text (Text)
|
||||||
|
import Data.Text.Encoding (decodeUtf16BE)
|
||||||
import Prelude hiding (length)
|
import Prelude hiding (length)
|
||||||
|
import Text.Printf (printf)
|
||||||
|
|
||||||
|
newtype B8Int = B8Int ByteString deriving (Eq, Show)
|
||||||
|
newtype B16Int = B16Int ByteString deriving (Eq, Show)
|
||||||
|
newtype B256Int = B256Int ByteString deriving (Eq, Show)
|
||||||
|
|
||||||
previous :: Char -> Int -> ByteString -> Int
|
previous :: Char -> Int -> ByteString -> Int
|
||||||
previous char position byteString
|
previous char position byteString
|
||||||
| BS.index byteString position == char = position
|
| Char8.index byteString position == char = position
|
||||||
| otherwise = previous char (position - 1) byteString
|
| otherwise = previous char (position - 1) byteString
|
||||||
|
|
||||||
subBS :: Int -> Int -> ByteString -> ByteString
|
subBS :: Int -> Int -> ByteString -> ByteString
|
||||||
subBS offset length = BS.take length . BS.drop offset
|
subBS offset length = Char8.take length . Char8.drop offset
|
||||||
|
|
||||||
|
intToB256 :: Int -> B256Int
|
||||||
|
intToB256 n
|
||||||
|
| n < 0x100 = B256Int . BS.singleton $ toEnum n
|
||||||
|
| otherwise =
|
||||||
|
let B256Int begining = intToB256 (n `div` 0x100) in
|
||||||
|
B256Int $ begining `snoc` (toEnum (n `mod` 0x100))
|
||||||
|
|
||||||
|
b256ToInt :: B256Int -> Int
|
||||||
|
b256ToInt (B256Int n) = BS.foldl' (\k w -> 0x100*k + fromEnum w) 0 n
|
||||||
|
|
||||||
|
toBytes :: Int -> Int -> ByteString
|
||||||
|
toBytes 0 _ = BS.empty
|
||||||
|
toBytes size n =
|
||||||
|
(toBytes (size - 1) (n `div` 0x100)) `snoc` (toEnum (n `mod` 0x100))
|
||||||
|
|
||||||
|
b16ToBytes :: B16Int -> ByteString
|
||||||
|
b16ToBytes (B16Int n) = BS.pack . fmap b16ToInt $ pairDigits n
|
||||||
|
where
|
||||||
|
pairDigits s =
|
||||||
|
case BS.length s of
|
||||||
|
0 -> []
|
||||||
|
1 -> [B16Int s]
|
||||||
|
_ ->
|
||||||
|
let (twoHexDigits, rest) = BS.splitAt 2 s in
|
||||||
|
(B16Int $ twoHexDigits):(pairDigits rest)
|
||||||
|
|
||||||
|
fromBase :: (Num a, Read a) => Char -> ByteString -> a
|
||||||
|
fromBase b = read . printf "0%c%s" b . Char8.unpack
|
||||||
|
|
||||||
|
b16ToInt :: (Num a, Read a) => B16Int -> a
|
||||||
|
b16ToInt (B16Int n) = fromBase 'x' n
|
||||||
|
|
||||||
|
b8ToInt :: (Num a, Read a) => B8Int -> a
|
||||||
|
b8ToInt (B8Int n) = fromBase 'o' n
|
||||||
|
|
||||||
|
unescape :: ByteString -> ByteString
|
||||||
|
unescape escapedBS =
|
||||||
|
case Char8.uncons escapedBS of
|
||||||
|
Nothing -> BS.empty
|
||||||
|
Just ('\\', s) -> unescapeChar s
|
||||||
|
Just (c, s) -> Char8.cons c (unescape s)
|
||||||
|
where
|
||||||
|
unescapeChar s =
|
||||||
|
case Char8.uncons s of
|
||||||
|
Nothing -> BS.empty
|
||||||
|
Just (c, s')
|
||||||
|
| c `elem` "()" -> Char8.cons c (unescape s')
|
||||||
|
| c `elem` "nrtbf" -> Char8.cons (read (printf "'\\%c'" c)) (unescape s')
|
||||||
|
| c `elem` ['0'..'7'] -> fromOctal (Char8.splitAt 3 s)
|
||||||
|
| otherwise -> Char8.cons c (unescape s')
|
||||||
|
fromOctal (code, s) = Char8.cons (toEnum $ b8ToInt (B8Int code)) (unescape s)
|
||||||
|
|
||||||
|
utf16BEToutf8 :: ByteString -> Text
|
||||||
|
utf16BEToutf8 = decodeUtf16BE
|
||||||
|
|
95
src/Data/Id.hs
Normal file
95
src/Data/Id.hs
Normal file
|
@ -0,0 +1,95 @@
|
||||||
|
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
|
||||||
|
{-# LANGUAGE DeriveTraversable #-}
|
||||||
|
{-# LANGUAGE FlexibleContexts #-}
|
||||||
|
module Data.Id (
|
||||||
|
Id(..)
|
||||||
|
, IdMap
|
||||||
|
, Indexed
|
||||||
|
, at
|
||||||
|
, delete
|
||||||
|
, empty
|
||||||
|
, filterWithKey
|
||||||
|
, fromList
|
||||||
|
, insert
|
||||||
|
, keysSet
|
||||||
|
, lookup
|
||||||
|
, mapWithKey
|
||||||
|
, member
|
||||||
|
, minViewWithKey
|
||||||
|
, register
|
||||||
|
, singleton
|
||||||
|
, size
|
||||||
|
, toList
|
||||||
|
, union
|
||||||
|
) where
|
||||||
|
|
||||||
|
import Control.Monad.State.Strict (MonadState, modify, gets)
|
||||||
|
import Data.IntMap (IntMap, (!))
|
||||||
|
import qualified Data.IntMap as IntMap (
|
||||||
|
delete, empty, filterWithKey, fromList, keysSet, insert, lookup, mapWithKey
|
||||||
|
, maxViewWithKey, member, minViewWithKey, size, toList, union
|
||||||
|
)
|
||||||
|
import Data.IntSet (IntSet)
|
||||||
|
import Prelude hiding (lookup)
|
||||||
|
|
||||||
|
newtype Id a = Id {
|
||||||
|
getId :: Int
|
||||||
|
} deriving (Eq, Enum, Ord, Show)
|
||||||
|
newtype IdMap a b = IdMap {
|
||||||
|
intMap :: IntMap b
|
||||||
|
} deriving (Show, Functor, Semigroup, Monoid, Foldable, Traversable)
|
||||||
|
type Indexed a = IdMap a a
|
||||||
|
|
||||||
|
at :: IdMap a b -> Id a -> b
|
||||||
|
at (IdMap idMap) = (idMap !) . getId
|
||||||
|
|
||||||
|
lookup :: Id a -> IdMap a b -> Maybe b
|
||||||
|
lookup (Id a) (IdMap idMap) = IntMap.lookup a idMap
|
||||||
|
|
||||||
|
size :: IdMap a b -> Int
|
||||||
|
size = IntMap.size . intMap
|
||||||
|
|
||||||
|
member :: Id a -> IdMap a b -> Bool
|
||||||
|
member (Id a) (IdMap idMap) = IntMap.member a idMap
|
||||||
|
|
||||||
|
empty :: IdMap a b
|
||||||
|
empty = IdMap {intMap = IntMap.empty}
|
||||||
|
|
||||||
|
singleton :: Id a -> b -> IdMap a b
|
||||||
|
singleton a b = fromList [(a, b)]
|
||||||
|
|
||||||
|
insert :: Id a -> b -> IdMap a b -> IdMap a b
|
||||||
|
insert (Id a) b (IdMap idMap) = IdMap {intMap = IntMap.insert a b idMap}
|
||||||
|
|
||||||
|
delete :: Id a -> IdMap a b -> IdMap a b
|
||||||
|
delete (Id a) (IdMap idMap) = IdMap {intMap = IntMap.delete a idMap}
|
||||||
|
|
||||||
|
minViewWithKey :: IdMap a b -> Maybe ((Id a, b), IdMap a b)
|
||||||
|
minViewWithKey = fmap wrap . IntMap.minViewWithKey . intMap
|
||||||
|
where
|
||||||
|
wrap ((key, b), idMap) = ((Id key, b), IdMap idMap)
|
||||||
|
|
||||||
|
union :: IdMap a b -> IdMap a b -> IdMap a b
|
||||||
|
union (IdMap intMap1) (IdMap intMap2) =
|
||||||
|
IdMap {intMap = IntMap.union intMap1 intMap2}
|
||||||
|
|
||||||
|
mapWithKey :: (Id a -> b -> c) -> IdMap a b -> IdMap a c
|
||||||
|
mapWithKey f (IdMap idMap) = IdMap {intMap = IntMap.mapWithKey (f . Id) idMap}
|
||||||
|
|
||||||
|
filterWithKey :: (Id a -> b -> Bool) -> IdMap a b -> IdMap a b
|
||||||
|
filterWithKey f = IdMap . IntMap.filterWithKey (f . Id) . intMap
|
||||||
|
|
||||||
|
fromList :: [(Id a, b)] -> IdMap a b
|
||||||
|
fromList = IdMap . IntMap.fromList . fmap (\(key, b) -> (getId key, b))
|
||||||
|
|
||||||
|
toList :: IdMap a b -> [(Id a, b)]
|
||||||
|
toList = fmap (\(key, b) -> (Id key, b)) . IntMap.toList . intMap
|
||||||
|
|
||||||
|
keysSet :: IdMap a b -> IntSet
|
||||||
|
keysSet = IntMap.keysSet . intMap
|
||||||
|
|
||||||
|
register :: MonadState (IdMap a b) m => b -> m (Id a)
|
||||||
|
register b = do
|
||||||
|
newId <- gets (Id . maybe 0 ((+1) . fst . fst) . IntMap.maxViewWithKey . intMap)
|
||||||
|
modify (insert newId b)
|
||||||
|
return newId
|
79
src/Data/OrderedMap.hs
Normal file
79
src/Data/OrderedMap.hs
Normal file
|
@ -0,0 +1,79 @@
|
||||||
|
{-# LANGUAGE NamedFieldPuns #-}
|
||||||
|
module Data.OrderedMap (
|
||||||
|
OrderedMap
|
||||||
|
, build
|
||||||
|
, elems
|
||||||
|
, fromList
|
||||||
|
, get
|
||||||
|
, keys
|
||||||
|
, lookup
|
||||||
|
, mapi
|
||||||
|
, set
|
||||||
|
, toList
|
||||||
|
) where
|
||||||
|
|
||||||
|
import Data.Map (Map, (!), mapWithKey)
|
||||||
|
import qualified Data.Map as Map (fromList, insert, lookup, member)
|
||||||
|
import Prelude hiding (lookup)
|
||||||
|
|
||||||
|
data OrderedMap k a = OrderedMap {
|
||||||
|
assoc :: Map k a
|
||||||
|
, keys :: [k]
|
||||||
|
}
|
||||||
|
|
||||||
|
instance (Ord k, Show k, Show a) => Show (OrderedMap k a) where
|
||||||
|
show = show . toList
|
||||||
|
|
||||||
|
instance Functor (OrderedMap k) where
|
||||||
|
fmap f orderedMap = orderedMap {assoc = fmap f (assoc orderedMap)}
|
||||||
|
|
||||||
|
instance Ord k => Foldable (OrderedMap k) where
|
||||||
|
foldMap f (OrderedMap {assoc, keys}) = foldMap f $ (assoc !) <$> keys
|
||||||
|
|
||||||
|
instance Ord k => Traversable (OrderedMap k) where
|
||||||
|
sequenceA (OrderedMap {assoc, keys}) =
|
||||||
|
(flip OrderedMap keys) <$> sequenceA assoc
|
||||||
|
|
||||||
|
elems :: Ord k => OrderedMap k a -> [a]
|
||||||
|
elems (OrderedMap {assoc, keys}) = (assoc !) <$> keys
|
||||||
|
|
||||||
|
toList :: Ord k => OrderedMap k a -> [(k, a)]
|
||||||
|
toList (OrderedMap {assoc, keys}) = (\k -> (k, assoc ! k)) <$> keys
|
||||||
|
|
||||||
|
fromList :: Ord k => [(k, a)] -> OrderedMap k a
|
||||||
|
fromList keyValueList = OrderedMap {
|
||||||
|
assoc = Map.fromList keyValueList
|
||||||
|
, keys = fst <$> keyValueList
|
||||||
|
}
|
||||||
|
|
||||||
|
build :: Ord k => (k -> a) -> [k] -> OrderedMap k a
|
||||||
|
build f keys = OrderedMap {
|
||||||
|
assoc = Map.fromList $ (\k -> (k, f k)) <$> keys
|
||||||
|
, keys
|
||||||
|
}
|
||||||
|
|
||||||
|
get :: Ord k => k -> OrderedMap k a -> a
|
||||||
|
get k = (! k) . assoc
|
||||||
|
|
||||||
|
lookup :: Ord k => k -> OrderedMap k a -> Maybe a
|
||||||
|
lookup k = (Map.lookup k) . assoc
|
||||||
|
|
||||||
|
set :: Ord k => k -> a -> OrderedMap k a -> OrderedMap k a
|
||||||
|
set k v orderedMap@(OrderedMap {assoc})
|
||||||
|
| Map.member k assoc = orderedMap {assoc = Map.insert k v assoc}
|
||||||
|
| otherwise = orderedMap
|
||||||
|
|
||||||
|
mapi :: Ord k => (k -> a -> b) -> OrderedMap k a -> OrderedMap k b
|
||||||
|
mapi f orderedMap = orderedMap {
|
||||||
|
assoc = mapWithKey f $ assoc orderedMap
|
||||||
|
}
|
||||||
|
|
||||||
|
{-
|
||||||
|
cons :: Ord k => k -> a -> OrderedMap k a -> OrderedMap k a
|
||||||
|
cons k a orderedMap =
|
||||||
|
|
||||||
|
snoc :: Ord k => k -> a -> OrderedMap k a -> OrderedMap k a
|
||||||
|
|
||||||
|
alter :: Ord k => (Maybe a -> Maybe a) -> k -> OrderedMap k a -> OrderedMap k a
|
||||||
|
alter
|
||||||
|
-}
|
41
src/PDF.hs
41
src/PDF.hs
|
@ -1,6 +1,11 @@
|
||||||
{-# LANGUAGE NamedFieldPuns #-}
|
{-# LANGUAGE NamedFieldPuns #-}
|
||||||
|
{-# LANGUAGE FlexibleInstances #-}
|
||||||
|
{-# LANGUAGE MultiParamTypeClasses #-}
|
||||||
module PDF (
|
module PDF (
|
||||||
Document(..)
|
Document(..)
|
||||||
|
, EOLStyle(..)
|
||||||
|
, Layers(..)
|
||||||
|
, UnifiedLayers(..)
|
||||||
, parseDocument
|
, parseDocument
|
||||||
, render
|
, render
|
||||||
) where
|
) where
|
||||||
|
@ -13,27 +18,45 @@ import Data.ByteString.Char8.Util (previous, subBS)
|
||||||
import qualified Data.ByteString.Lazy as Lazy (ByteString)
|
import qualified Data.ByteString.Lazy as Lazy (ByteString)
|
||||||
import qualified Data.Map as Map (lookup)
|
import qualified Data.Map as Map (lookup)
|
||||||
import PDF.Body (populate)
|
import PDF.Body (populate)
|
||||||
|
import PDF.Box (Box(..))
|
||||||
import qualified PDF.EOL as EOL (Style(..), charset, parser)
|
import qualified PDF.EOL as EOL (Style(..), charset, parser)
|
||||||
|
import PDF.Layer (Layer, unify)
|
||||||
import PDF.Object (
|
import PDF.Object (
|
||||||
Content(..), DirectObject(..), InputStructure(..), Name(..), Number(..)
|
DirectObject(..), InputStructure(..), Name(..), Number(..)
|
||||||
, Structure(..)
|
, Structure(..)
|
||||||
, eofMarker, magicNumber, structure
|
, eofMarker, magicNumber, structure
|
||||||
)
|
)
|
||||||
import qualified PDF.Output as Output (render, line)
|
import qualified PDF.Output as Output (render, line)
|
||||||
import PDF.Output (Output(..))
|
import PDF.Output (Output(..))
|
||||||
import PDF.Parser (Parser, runParser, string, takeAll)
|
import PDF.Parser (Parser, evalParser, string, takeAll)
|
||||||
import Text.Printf (printf)
|
import Text.Printf (printf)
|
||||||
|
|
||||||
data Document = Document {
|
data Document = Document {
|
||||||
pdfVersion :: String
|
pdfVersion :: String
|
||||||
, eolStyle :: EOL.Style
|
, eolStyle :: EOL.Style
|
||||||
, updates :: [Content]
|
, layers :: [Layer]
|
||||||
} deriving Show
|
} deriving Show
|
||||||
|
|
||||||
instance Output Document where
|
instance Output Document where
|
||||||
output (Document {pdfVersion, updates}) =
|
output (Document {pdfVersion, layers}) =
|
||||||
Output.line (printf "%%PDF-%s" pdfVersion)
|
Output.line (printf "%%PDF-%s" pdfVersion)
|
||||||
`mappend` output updates
|
`mappend` output layers
|
||||||
|
|
||||||
|
data EOLStyle = EOLStyle
|
||||||
|
data Layers = Layers
|
||||||
|
data UnifiedLayers = UnifiedLayers
|
||||||
|
|
||||||
|
instance Monad m => Box m EOLStyle Document EOL.Style where
|
||||||
|
r EOLStyle = return . eolStyle
|
||||||
|
w EOLStyle eolStyle document = return $ document {eolStyle}
|
||||||
|
|
||||||
|
instance Monad m => Box m UnifiedLayers Document Layer where
|
||||||
|
r UnifiedLayers = return . unify . layers
|
||||||
|
w UnifiedLayers layer = w Layers [layer]
|
||||||
|
|
||||||
|
instance Monad m => Box m Layers Document [Layer] where
|
||||||
|
r Layers = return . layers
|
||||||
|
w Layers layers document = return $ document {layers = layers}
|
||||||
|
|
||||||
render :: Document -> Lazy.ByteString
|
render :: Document -> Lazy.ByteString
|
||||||
render document@(Document {eolStyle}) = Output.render eolStyle document
|
render document@(Document {eolStyle}) = Output.render eolStyle document
|
||||||
|
@ -83,7 +106,7 @@ findNextSection offset input =
|
||||||
|
|
||||||
readStructures :: Int -> ByteString -> Either String [InputStructure]
|
readStructures :: Int -> ByteString -> Either String [InputStructure]
|
||||||
readStructures startXref input =
|
readStructures startXref input =
|
||||||
runParser structure () (BS.drop startXref input) >>= stopOrFollow
|
evalParser structure () (BS.drop startXref input) >>= stopOrFollow
|
||||||
where
|
where
|
||||||
stopOrFollow s@(Structure {trailer}) =
|
stopOrFollow s@(Structure {trailer}) =
|
||||||
case Map.lookup (Name "Prev") trailer of
|
case Map.lookup (Name "Prev") trailer of
|
||||||
|
@ -96,8 +119,8 @@ readStructures startXref input =
|
||||||
|
|
||||||
parseDocument :: ByteString -> Either String Document
|
parseDocument :: ByteString -> Either String Document
|
||||||
parseDocument input = do
|
parseDocument input = do
|
||||||
(pdfVersion, eolStyle) <- runParser ((,) <$> version <*> EOL.parser) () input
|
(pdfVersion, eolStyle) <- evalParser ((,) <$> version <*> EOL.parser) () input
|
||||||
startXref <- readStartXref eolStyle input
|
startXref <- readStartXref eolStyle input
|
||||||
structuresRead <- readStructures startXref input
|
structuresRead <- readStructures startXref input
|
||||||
let updates = populate input <$> structuresRead
|
let layers = populate input <$> structuresRead
|
||||||
return $ Document {pdfVersion, eolStyle, updates}
|
return $ Document {pdfVersion, eolStyle, layers}
|
||||||
|
|
|
@ -6,19 +6,22 @@ module PDF.Body (
|
||||||
|
|
||||||
import Control.Applicative ((<|>))
|
import Control.Applicative ((<|>))
|
||||||
import Control.Monad.State (get, gets, modify)
|
import Control.Monad.State (get, gets, modify)
|
||||||
|
import Data.Attoparsec.ByteString.Char8 (option)
|
||||||
import Data.ByteString.Char8 (ByteString)
|
import Data.ByteString.Char8 (ByteString)
|
||||||
import qualified Data.ByteString.Char8 as BS (cons, drop, unpack)
|
import qualified Data.ByteString.Char8 as BS (cons, drop, unpack)
|
||||||
import Data.Map ((!))
|
import Data.Id (Id(..), at, empty)
|
||||||
import qualified Data.Map as Map (empty, insert, lookup)
|
import qualified Data.Id as Id (insert, lookup)
|
||||||
|
import qualified Data.Map as Map (lookup)
|
||||||
import qualified PDF.EOL as EOL (charset, parser)
|
import qualified PDF.EOL as EOL (charset, parser)
|
||||||
|
import PDF.Layer (Layer(..))
|
||||||
import PDF.Object (
|
import PDF.Object (
|
||||||
Content(..), DirectObject(..), Flow(..), IndirectObjCoordinates(..)
|
DirectObject(..), Flow(..), IndirectObjCoordinates(..)
|
||||||
, InputStructure(..), Name(..), Number(..), Object(..), Occurrence(..)
|
, InputStructure(..), Name(..), Number(..), Object(..), Occurrence(..)
|
||||||
, Structure(..), XRefEntry(..), XRefSection
|
, Structure(..), XRefEntry(..), XRefSection
|
||||||
, blank, dictionary, directObject, integer, line
|
, blank, dictionary, directObject, integer, line
|
||||||
)
|
)
|
||||||
import PDF.Output (ObjectId(..), Offset(..))
|
import PDF.Output (Offset(..))
|
||||||
import PDF.Parser (Parser, (<?>), block, char, on, option, runParser, takeAll)
|
import PDF.Parser (MonadParser(..), Parser, (<?>), evalParser, on)
|
||||||
|
|
||||||
data UserState = UserState {
|
data UserState = UserState {
|
||||||
input :: ByteString
|
input :: ByteString
|
||||||
|
@ -31,9 +34,9 @@ type SParser = Parser UserState
|
||||||
modifyFlow :: (Flow -> Flow) -> SParser ()
|
modifyFlow :: (Flow -> Flow) -> SParser ()
|
||||||
modifyFlow f = modify $ \state -> state {flow = f $ flow state}
|
modifyFlow f = modify $ \state -> state {flow = f $ flow state}
|
||||||
|
|
||||||
addObject :: ObjectId -> Object -> SParser ()
|
addObject :: (Id Object) -> Object -> SParser ()
|
||||||
addObject objectId newObject = modifyFlow $ \flow -> flow {
|
addObject objectId newObject = modifyFlow $ \flow -> flow {
|
||||||
tmpObjects = Map.insert objectId newObject $ tmpObjects flow
|
tmpObjects = Id.insert objectId newObject $ tmpObjects flow
|
||||||
}
|
}
|
||||||
|
|
||||||
pushOccurrence :: Occurrence -> SParser ()
|
pushOccurrence :: Occurrence -> SParser ()
|
||||||
|
@ -46,10 +49,10 @@ comment = BS.unpack <$> (option "" afterPercent <* EOL.parser)
|
||||||
where
|
where
|
||||||
afterPercent = BS.cons <$> char '%' <*> takeAll (not . (`elem` EOL.charset))
|
afterPercent = BS.cons <$> char '%' <*> takeAll (not . (`elem` EOL.charset))
|
||||||
|
|
||||||
lookupOffset :: ObjectId -> SParser Offset
|
lookupOffset :: (Id Object) -> SParser Offset
|
||||||
lookupOffset objectId = do
|
lookupOffset objectId = do
|
||||||
table <- gets xreferences
|
table <- gets xreferences
|
||||||
case Map.lookup objectId table >>= entryOffset of
|
case Id.lookup objectId table >>= entryOffset of
|
||||||
Nothing -> fail $
|
Nothing -> fail $
|
||||||
"obj " ++ show objectId ++ " is referenced but missing in XRef table"
|
"obj " ++ show objectId ++ " is referenced but missing in XRef table"
|
||||||
Just offset -> return offset
|
Just offset -> return offset
|
||||||
|
@ -57,12 +60,12 @@ lookupOffset objectId = do
|
||||||
entryOffset (InUse {offset}) = Just offset
|
entryOffset (InUse {offset}) = Just offset
|
||||||
entryOffset _ = Nothing
|
entryOffset _ = Nothing
|
||||||
|
|
||||||
loadNumber :: ObjectId -> SParser Double
|
loadNumber :: (Id Object) -> SParser Double
|
||||||
loadNumber objectId = do
|
loadNumber objectId = do
|
||||||
offset <- getOffset <$> lookupOffset objectId
|
offset <- getOffset <$> lookupOffset objectId
|
||||||
objectStart <- BS.drop offset <$> gets input
|
objectStart <- BS.drop offset <$> gets input
|
||||||
indirectObjCoordinates `on` (objectStart :: ByteString) >> return ()
|
indirectObjCoordinates `on` (objectStart :: ByteString) >> return ()
|
||||||
objectValue <- (!objectId) . tmpObjects <$> gets flow
|
objectValue <- (`at` objectId) . tmpObjects <$> gets flow
|
||||||
case objectValue of
|
case objectValue of
|
||||||
Direct (NumberObject (Number n)) -> return n
|
Direct (NumberObject (Number n)) -> return n
|
||||||
obj -> fail $ "Expected obj@" ++ show offset ++ " (" ++ show obj ++ ") to be a Number"
|
obj -> fail $ "Expected obj@" ++ show offset ++ " (" ++ show obj ++ ") to be a Number"
|
||||||
|
@ -75,7 +78,7 @@ getSize Nothing = fail "Missing '/Length' key on stream"
|
||||||
getSize (Just (NumberObject (Number size))) = return size
|
getSize (Just (NumberObject (Number size))) = return size
|
||||||
getSize (Just (Reference (IndirectObjCoordinates {objectId}))) = do
|
getSize (Just (Reference (IndirectObjCoordinates {objectId}))) = do
|
||||||
Flow {tmpObjects} <- gets flow
|
Flow {tmpObjects} <- gets flow
|
||||||
case Map.lookup objectId tmpObjects of
|
case Id.lookup objectId tmpObjects of
|
||||||
Nothing -> loadNumber objectId
|
Nothing -> loadNumber objectId
|
||||||
Just (Direct (NumberObject (Number size))) -> return size
|
Just (Direct (NumberObject (Number size))) -> return size
|
||||||
Just v -> fail $
|
Just v -> fail $
|
||||||
|
@ -96,7 +99,7 @@ object = streamObject <|> Direct <$> directObject
|
||||||
|
|
||||||
indirectObjCoordinates :: SParser IndirectObjCoordinates
|
indirectObjCoordinates :: SParser IndirectObjCoordinates
|
||||||
indirectObjCoordinates = do
|
indirectObjCoordinates = do
|
||||||
objectId <- ObjectId <$> integer
|
objectId <- Id <$> integer
|
||||||
coordinates <- IndirectObjCoordinates objectId <$> integer
|
coordinates <- IndirectObjCoordinates objectId <$> integer
|
||||||
objectValue <- line "obj" *> object <* blank <* line "endobj"
|
objectValue <- line "obj" *> object <* blank <* line "endobj"
|
||||||
addObject objectId objectValue
|
addObject objectId objectValue
|
||||||
|
@ -106,14 +109,14 @@ occurrence :: SParser Occurrence
|
||||||
occurrence =
|
occurrence =
|
||||||
Comment <$> comment <|> Indirect <$> indirectObjCoordinates <?> "comment or object"
|
Comment <$> comment <|> Indirect <$> indirectObjCoordinates <?> "comment or object"
|
||||||
|
|
||||||
populate :: ByteString -> InputStructure -> Content
|
populate :: ByteString -> InputStructure -> Layer
|
||||||
populate input structure =
|
populate input structure =
|
||||||
let bodyInput = BS.drop (startOffset structure) input in
|
let bodyInput = BS.drop (startOffset structure) input in
|
||||||
case runParser recurseOnOccurrences initialState bodyInput of
|
case evalParser recurseOnOccurrences initialState bodyInput of
|
||||||
Left _ -> Content {occurrences = [], objects = Map.empty, docStructure}
|
Left _ -> Layer {occurrences = [], objects = empty, docStructure}
|
||||||
Right finalState ->
|
Right finalState ->
|
||||||
let Flow {occurrencesStack, tmpObjects} = flow finalState in
|
let Flow {occurrencesStack, tmpObjects} = flow finalState in
|
||||||
Content {
|
Layer {
|
||||||
occurrences = reverse occurrencesStack, objects = tmpObjects, docStructure
|
occurrences = reverse occurrencesStack, objects = tmpObjects, docStructure
|
||||||
}
|
}
|
||||||
where
|
where
|
||||||
|
@ -121,7 +124,7 @@ populate input structure =
|
||||||
xreferences = xRef docStructure
|
xreferences = xRef docStructure
|
||||||
initialState = UserState {
|
initialState = UserState {
|
||||||
input, xreferences, flow = Flow {
|
input, xreferences, flow = Flow {
|
||||||
occurrencesStack = [], tmpObjects = Map.empty
|
occurrencesStack = [], tmpObjects = empty
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
80
src/PDF/Box.hs
Normal file
80
src/PDF/Box.hs
Normal file
|
@ -0,0 +1,80 @@
|
||||||
|
{-# LANGUAGE NamedFieldPuns #-}
|
||||||
|
{-# LANGUAGE FunctionalDependencies #-}
|
||||||
|
{-# LANGUAGE FlexibleInstances #-}
|
||||||
|
{-# LANGUAGE UndecidableInstances #-}
|
||||||
|
{-# LANGUAGE ScopedTypeVariables #-}
|
||||||
|
module PDF.Box (
|
||||||
|
Box(..)
|
||||||
|
, Index(..)
|
||||||
|
, Maybe_(..)
|
||||||
|
, Either_(..)
|
||||||
|
, at
|
||||||
|
, atAll
|
||||||
|
, edit
|
||||||
|
, runRO
|
||||||
|
) where
|
||||||
|
|
||||||
|
import Control.Monad.Except (ExceptT(..), runExceptT)
|
||||||
|
import Control.Monad.Fail (MonadFail(..))
|
||||||
|
import Control.Monad.Reader (ReaderT, runReaderT)
|
||||||
|
import Control.Monad.State (MonadState(..))
|
||||||
|
import Data.Id (Id, IdMap)
|
||||||
|
import qualified Data.Id as Id (insert, lookup)
|
||||||
|
import Data.Map (Map)
|
||||||
|
import qualified Data.Map as Map (insert, lookup)
|
||||||
|
import Data.OrderedMap (OrderedMap)
|
||||||
|
import qualified Data.OrderedMap as OrderedMap (lookup, set)
|
||||||
|
import Prelude hiding (fail)
|
||||||
|
|
||||||
|
runRO :: MonadState s m => ReaderT s m a -> m a
|
||||||
|
runRO ro = get >>= runReaderT ro
|
||||||
|
|
||||||
|
newtype Index = Index Int
|
||||||
|
newtype Maybe_ x = Maybe_ x
|
||||||
|
newtype Either_ b x = Either_ x
|
||||||
|
|
||||||
|
class Monad m => Box m i a b | m a i -> b where
|
||||||
|
r :: i -> a -> m b
|
||||||
|
w :: i -> b -> a -> m a
|
||||||
|
|
||||||
|
at :: Box m i a b => i -> (b -> m b) -> a -> m a
|
||||||
|
at i f a = r i a >>= f >>= flip (w i) a
|
||||||
|
|
||||||
|
atAll :: (Traversable t, Monad m, Box m i a (t b)) => i -> (b -> m b) -> a -> m a
|
||||||
|
atAll i f = at i $ (mapM f)
|
||||||
|
|
||||||
|
edit :: MonadState a m => (a -> m a) -> m ()
|
||||||
|
edit f = get >>= f >>= put
|
||||||
|
|
||||||
|
instance MonadFail m => Box m Index [a] a where
|
||||||
|
r (Index i) [] = fail $ "Index out of bounds " ++ show i
|
||||||
|
r (Index 0) (x:_) = return x
|
||||||
|
r (Index i) (_:xs) = r (Index (i-1)) xs
|
||||||
|
|
||||||
|
w (Index i) _ [] = fail $ "Index out of bounds " ++ show i
|
||||||
|
w (Index 0) newX (_:xs) = return (newX:xs)
|
||||||
|
w (Index i) newX (x:xs) = (x:) <$> w (Index (i-1)) newX xs
|
||||||
|
|
||||||
|
instance (Ord k, MonadFail m) => Box m k (Map k a) a where
|
||||||
|
r k = maybe (fail "Unknown key") return . Map.lookup k
|
||||||
|
w k a = return . Map.insert k a
|
||||||
|
|
||||||
|
instance (Ord k, MonadFail m) => Box m k (OrderedMap k a) a where
|
||||||
|
r k = maybe (fail "Unknown key") return . OrderedMap.lookup k
|
||||||
|
w k a orderedMap = r k orderedMap >> return (OrderedMap.set k a orderedMap)
|
||||||
|
|
||||||
|
instance MonadFail m => Box m (Id k) (IdMap k a) a where
|
||||||
|
r k = maybe (fail "Unknown key") return . Id.lookup k
|
||||||
|
w k a = return . Id.insert k a
|
||||||
|
|
||||||
|
instance (Monad m, Box Maybe i a b) => Box m (Maybe_ i) a (Maybe b) where
|
||||||
|
r (Maybe_ i) = return . r i
|
||||||
|
w (Maybe_ i) (Just b) a =
|
||||||
|
return . maybe a id $ w i b a
|
||||||
|
w _ _ obj = return obj
|
||||||
|
|
||||||
|
instance (Monad m, Box (ExceptT e m) i a b) => Box m (Either_ e i) a (Either e b) where
|
||||||
|
r (Either_ i) a = runExceptT (r i a)
|
||||||
|
w (Either_ i) (Right b) a =
|
||||||
|
either (const a) id <$> runExceptT (w i b a :: ExceptT e m a)
|
||||||
|
w _ _ a = return a
|
196
src/PDF/CMap.hs
Normal file
196
src/PDF/CMap.hs
Normal file
|
@ -0,0 +1,196 @@
|
||||||
|
{-# LANGUAGE NamedFieldPuns #-}
|
||||||
|
{-# LANGUAGE OverloadedStrings #-}
|
||||||
|
{-# LANGUAGE FlexibleContexts #-}
|
||||||
|
module PDF.CMap (
|
||||||
|
CMap
|
||||||
|
, CMappers
|
||||||
|
, CRange(..)
|
||||||
|
, matches
|
||||||
|
, parse
|
||||||
|
) where
|
||||||
|
|
||||||
|
import Control.Applicative ((<|>), many)
|
||||||
|
import Control.Monad.Except (MonadError(..))
|
||||||
|
import Control.Monad.Fail (fail)
|
||||||
|
import Control.Monad.State (modify)
|
||||||
|
import Data.Attoparsec.ByteString.Char8 (count)
|
||||||
|
import Data.ByteString (ByteString)
|
||||||
|
import qualified Data.ByteString as BS (drop, length, null, take)
|
||||||
|
import Data.ByteString.Char8 (unpack)
|
||||||
|
import Data.ByteString.Char8.Util (
|
||||||
|
B16Int(..), b16ToBytes, b16ToInt, toBytes, utf16BEToutf8
|
||||||
|
)
|
||||||
|
import Data.Foldable (foldl', foldr')
|
||||||
|
import Data.Map (Map, mapWithKey, union)
|
||||||
|
import qualified Data.Map as Map (
|
||||||
|
adjust, empty, fromList, insert, insertWith, lookup, toList
|
||||||
|
)
|
||||||
|
import Data.Text (Text)
|
||||||
|
import qualified Data.Text as Text (length, null, splitAt)
|
||||||
|
import qualified PDF.EOL as EOL (charset, parser)
|
||||||
|
import PDF.Font (Decoder, Encoder, Font(..))
|
||||||
|
import PDF.Object (
|
||||||
|
DirectObject(..), Name, StringObject(..)
|
||||||
|
, blank, directObject, integer, line, stringObject
|
||||||
|
)
|
||||||
|
import PDF.Parser (MonadParser, Parser, runParser, takeAll)
|
||||||
|
import Prelude hiding (fail)
|
||||||
|
|
||||||
|
type CMappers = Map Name CMap
|
||||||
|
type ToUnicode = Map ByteString Text
|
||||||
|
type FromUnicode = Map Text ByteString
|
||||||
|
data CRange = CRange {
|
||||||
|
fromSequence :: ByteString
|
||||||
|
, toSequence :: ByteString
|
||||||
|
, toUnicode :: ToUnicode
|
||||||
|
} deriving Show
|
||||||
|
type Size = Int
|
||||||
|
data CMap = CMap {
|
||||||
|
toUnicodeBySize :: Map Size [CRange]
|
||||||
|
, fromUnicodeBySize :: Map Size FromUnicode
|
||||||
|
}
|
||||||
|
|
||||||
|
toFont :: CMap -> Font
|
||||||
|
toFont (CMap {toUnicodeBySize, fromUnicodeBySize}) =
|
||||||
|
Font {decode = decoder toUnicodeBySize, encode = encoder fromUnicodeBySize}
|
||||||
|
|
||||||
|
decoder :: Map Size [CRange] -> Decoder
|
||||||
|
decoder rangesBySize input
|
||||||
|
| BS.null input = Right ""
|
||||||
|
| otherwise = do
|
||||||
|
(output, remainingInput) <- trySizes input $ Map.toList rangesBySize
|
||||||
|
mappend output <$> decoder rangesBySize remainingInput
|
||||||
|
where
|
||||||
|
trySizes s [] = Left $ "No matching code found in font for " ++ unpack s
|
||||||
|
trySizes s ((size, cRanges):others) =
|
||||||
|
let prefix = BS.take size s in
|
||||||
|
case tryRanges prefix cRanges of
|
||||||
|
Nothing -> trySizes s others
|
||||||
|
Just outputSequence -> Right (outputSequence, BS.drop size s)
|
||||||
|
tryRanges :: ByteString -> [CRange] -> Maybe Text
|
||||||
|
tryRanges _ [] = Nothing
|
||||||
|
tryRanges prefix ((CRange {toUnicode}):cRanges) =
|
||||||
|
case Map.lookup prefix toUnicode of
|
||||||
|
Nothing -> tryRanges prefix cRanges
|
||||||
|
outputSequence -> outputSequence
|
||||||
|
|
||||||
|
encoder :: Map Size FromUnicode -> Encoder
|
||||||
|
encoder fromUnicodes input
|
||||||
|
| Text.null input = Right ""
|
||||||
|
| otherwise =
|
||||||
|
foldr' (<>) (Left "No encoding found") $ mapWithKey tryOn fromUnicodes
|
||||||
|
where
|
||||||
|
tryOn size fromUnicode =
|
||||||
|
let (prefix, end) = Text.splitAt size input in
|
||||||
|
case Map.lookup prefix fromUnicode of
|
||||||
|
Nothing -> Left ""
|
||||||
|
Just code -> (code <>) <$> encoder fromUnicodes end
|
||||||
|
|
||||||
|
matches :: ByteString -> CRange -> Bool
|
||||||
|
matches code (CRange {fromSequence, toSequence}) =
|
||||||
|
fromSequence <= code && code <= toSequence
|
||||||
|
|
||||||
|
parse :: MonadError String m => ByteString -> m Font
|
||||||
|
parse = either throwError (return . toFont . snd) . runParser
|
||||||
|
(many (codeRanges <|> cMapRange <|> cMapChar <|> ignoredLine))
|
||||||
|
emptyCMap
|
||||||
|
where
|
||||||
|
ignoredLine =
|
||||||
|
takeAll (not . (`elem` EOL.charset)) *> EOL.parser *> return ()
|
||||||
|
emptyCMap = CMap {toUnicodeBySize = Map.empty, fromUnicodeBySize = Map.empty}
|
||||||
|
|
||||||
|
codeRanges :: Parser CMap ()
|
||||||
|
codeRanges = do
|
||||||
|
size <- integer <* line "begincodespacerange"
|
||||||
|
mapM_ createMapping =<< count size codeRange
|
||||||
|
line "endcodespacerange"
|
||||||
|
where
|
||||||
|
codeRange =
|
||||||
|
(,) <$> stringObject <* blank <*> stringObject <* EOL.parser
|
||||||
|
|
||||||
|
createMapping :: (StringObject, StringObject) -> Parser CMap ()
|
||||||
|
createMapping (Hexadecimal from, Hexadecimal to) = modify $ \cmap -> cmap {
|
||||||
|
toUnicodeBySize = Map.insertWith (++) size [cRange] (toUnicodeBySize cmap)
|
||||||
|
}
|
||||||
|
where
|
||||||
|
fromSequence = b16ToBytes from
|
||||||
|
size = BS.length fromSequence
|
||||||
|
toSequence = b16ToBytes to
|
||||||
|
cRange = CRange {fromSequence, toSequence, toUnicode = Map.empty}
|
||||||
|
createMapping _ = return ()
|
||||||
|
|
||||||
|
cMapRange :: Parser CMap ()
|
||||||
|
cMapRange = do
|
||||||
|
size <- integer <* line "beginbfrange"
|
||||||
|
mapM_ saveMapping =<< count size rangeMapping
|
||||||
|
line "endbfrange"
|
||||||
|
where
|
||||||
|
rangeMapping = do
|
||||||
|
from <- (stringObject <* blank)
|
||||||
|
to <- (stringObject <* blank)
|
||||||
|
mapFromTo from to =<< directObject <* EOL.parser
|
||||||
|
|
||||||
|
saveMapping :: [(ByteString, Text)] -> Parser CMap ()
|
||||||
|
saveMapping assoc =
|
||||||
|
modify $ \(CMap {toUnicodeBySize, fromUnicodeBySize}) -> CMap {
|
||||||
|
toUnicodeBySize = saveToUnicodeBySize assoc toUnicodeBySize
|
||||||
|
, fromUnicodeBySize = saveFromUnicodeBySize reversed fromUnicodeBySize
|
||||||
|
}
|
||||||
|
where
|
||||||
|
reversed = (\(a, b) -> (b, a)) <$> assoc
|
||||||
|
|
||||||
|
saveToUnicodeBySize :: [(ByteString, Text)] -> Map Size [CRange] -> Map Size [CRange]
|
||||||
|
saveToUnicodeBySize [] = id
|
||||||
|
saveToUnicodeBySize assoc@((code, _):_) = Map.adjust insertCRange (BS.length code)
|
||||||
|
where
|
||||||
|
newMapping = Map.fromList assoc
|
||||||
|
appendMapping cRange =
|
||||||
|
cRange {toUnicode = toUnicode cRange `union` newMapping}
|
||||||
|
insertCRange = fmap (\cRange ->
|
||||||
|
if code `matches` cRange then appendMapping cRange else cRange
|
||||||
|
)
|
||||||
|
|
||||||
|
saveFromUnicodeBySize :: [(Text, ByteString)] -> Map Size FromUnicode -> Map Size FromUnicode
|
||||||
|
saveFromUnicodeBySize = flip (foldl' insertFromUnicode)
|
||||||
|
where
|
||||||
|
insertFromUnicode :: Map Size FromUnicode -> (Text, ByteString) -> Map Size FromUnicode
|
||||||
|
insertFromUnicode tmpFromUnicodeBySize (unicodeSequence, code) =
|
||||||
|
let size = Text.length unicodeSequence in
|
||||||
|
Map.adjust (Map.insert unicodeSequence code) size tmpFromUnicodeBySize
|
||||||
|
|
||||||
|
cMapChar :: Parser CMap ()
|
||||||
|
cMapChar = do
|
||||||
|
size <- integer <* line "beginbfchar"
|
||||||
|
saveMapping =<< count size charMapping <* line "endbfchar"
|
||||||
|
where
|
||||||
|
charMapping = do
|
||||||
|
from <- stringObject <* blank
|
||||||
|
pairMapping from =<< stringObject <* EOL.parser
|
||||||
|
|
||||||
|
between :: B16Int -> B16Int -> [ByteString]
|
||||||
|
between from@(B16Int s) to =
|
||||||
|
let size = BS.length s `div` 2 in
|
||||||
|
toBytes size <$> [b16ToInt from .. b16ToInt to]
|
||||||
|
|
||||||
|
startFrom :: B16Int -> [ByteString]
|
||||||
|
startFrom from@(B16Int s) =
|
||||||
|
let size = BS.length s `div` 2 in
|
||||||
|
toBytes size <$> [b16ToInt from .. ]
|
||||||
|
|
||||||
|
mapFromTo :: MonadParser m => StringObject -> StringObject -> DirectObject -> m [(ByteString, Text)]
|
||||||
|
mapFromTo (Hexadecimal from) (Hexadecimal to) (StringObject (Hexadecimal dstFrom)) =
|
||||||
|
return $ zip (between from to) (utf16BEToutf8 <$> startFrom dstFrom)
|
||||||
|
|
||||||
|
mapFromTo (Hexadecimal from) (Hexadecimal to) (Array dstPoints) =
|
||||||
|
zip (between from to) <$> (mapM dstByteString dstPoints)
|
||||||
|
where
|
||||||
|
dstByteString (StringObject (Hexadecimal dst)) =
|
||||||
|
return . utf16BEToutf8 $ b16ToBytes dst
|
||||||
|
dstByteString _ = fail "Invalid for a replacement string"
|
||||||
|
|
||||||
|
mapFromTo _ _ _ = fail "invalid range mapping found"
|
||||||
|
|
||||||
|
pairMapping :: MonadParser m => StringObject -> StringObject -> m (ByteString, Text)
|
||||||
|
pairMapping (Hexadecimal from) (Hexadecimal to) =
|
||||||
|
return (b16ToBytes from, utf16BEToutf8 $ b16ToBytes to)
|
||||||
|
pairMapping _ _ = fail "invalid pair mapping found"
|
117
src/PDF/Content.hs
Normal file
117
src/PDF/Content.hs
Normal file
|
@ -0,0 +1,117 @@
|
||||||
|
{-# LANGUAGE NamedFieldPuns #-}
|
||||||
|
{-# LANGUAGE OverloadedStrings #-}
|
||||||
|
{-# LANGUAGE ConstraintKinds #-}
|
||||||
|
{-# LANGUAGE FlexibleContexts #-}
|
||||||
|
{-# LANGUAGE TypeSynonymInstances #-}
|
||||||
|
{-# LANGUAGE FlexibleInstances #-}
|
||||||
|
{-# LANGUAGE MultiParamTypeClasses #-}
|
||||||
|
module PDF.Content (
|
||||||
|
Content(..)
|
||||||
|
, ContentUnit(..)
|
||||||
|
, GraphicContextUnit(..)
|
||||||
|
, IdContentUnit
|
||||||
|
, IdGraphicContextUnit
|
||||||
|
, IdTextContext
|
||||||
|
, Instructions(..)
|
||||||
|
, TextContext
|
||||||
|
, parse
|
||||||
|
) where
|
||||||
|
|
||||||
|
import Control.Applicative ((<|>))
|
||||||
|
import Control.Monad.Reader (asks, runReader)
|
||||||
|
import Control.Monad.State.Strict (runState, evalStateT, modify)
|
||||||
|
import Data.Attoparsec.ByteString.Char8 (sepBy)
|
||||||
|
import qualified Data.Attoparsec.ByteString.Char8 as Atto (Parser, parseOnly)
|
||||||
|
import Data.ByteString (ByteString)
|
||||||
|
import Data.Id (Id(..), Indexed, at, empty, register)
|
||||||
|
import PDF.Box (Box(..))
|
||||||
|
import PDF.Content.Operator (Instruction, operator)
|
||||||
|
import PDF.Object (blank, directObject)
|
||||||
|
import PDF.Output (Output(..), line)
|
||||||
|
import PDF.Parser (string)
|
||||||
|
|
||||||
|
data Instructions = Instructions
|
||||||
|
|
||||||
|
data GraphicContextUnit a =
|
||||||
|
GraphicInstruction a
|
||||||
|
| ContentUnit (ContentUnit a)
|
||||||
|
deriving Show
|
||||||
|
type TextContext a = [a]
|
||||||
|
data ContentUnit a =
|
||||||
|
GraphicContext [GraphicContextUnit a]
|
||||||
|
| TextContext (TextContext a)
|
||||||
|
deriving Show
|
||||||
|
data Content = Content {
|
||||||
|
contentUnits :: [IdContentUnit]
|
||||||
|
, indexedInstructions :: Indexed Instruction
|
||||||
|
, firstError :: Maybe String
|
||||||
|
} deriving Show
|
||||||
|
|
||||||
|
type TmpContentUnit = ContentUnit Instruction
|
||||||
|
type TmpGraphicContextUnit = GraphicContextUnit Instruction
|
||||||
|
type TmpTextContext = TextContext Instruction
|
||||||
|
|
||||||
|
type IdContentUnit = ContentUnit (Id Instruction)
|
||||||
|
type IdGraphicContextUnit = GraphicContextUnit (Id Instruction)
|
||||||
|
type IdTextContext = TextContext (Id Instruction)
|
||||||
|
|
||||||
|
instance Monad m => Box m Instructions Content (Indexed Instruction) where
|
||||||
|
r Instructions = return . indexedInstructions
|
||||||
|
w Instructions indexedInstructions someContent =
|
||||||
|
return $ someContent {indexedInstructions}
|
||||||
|
|
||||||
|
parse :: ByteString -> Content
|
||||||
|
parse input =
|
||||||
|
let result = Atto.parseOnly (contentUnit `sepBy` blank) input in
|
||||||
|
let (contentUnits, indexedInstructions) = either (const ([], empty)) buildContent result in
|
||||||
|
let firstError = either Just (const Nothing) result in
|
||||||
|
Content {contentUnits, indexedInstructions, firstError}
|
||||||
|
|
||||||
|
buildContent :: [TmpContentUnit] -> ([IdContentUnit], Indexed Instruction)
|
||||||
|
buildContent instructionContentUnits =
|
||||||
|
runState (mapM registerContentUnit instructionContentUnits) empty
|
||||||
|
where
|
||||||
|
registerContentUnit (GraphicContext gc) =
|
||||||
|
GraphicContext <$> (mapM registerGraphicContext gc)
|
||||||
|
registerContentUnit (TextContext tc) = TextContext <$> (mapM register tc)
|
||||||
|
registerGraphicContext (GraphicInstruction gi) =
|
||||||
|
GraphicInstruction <$> (register gi)
|
||||||
|
registerGraphicContext (ContentUnit cu) =
|
||||||
|
ContentUnit <$> (registerContentUnit cu)
|
||||||
|
|
||||||
|
contentUnit :: Atto.Parser TmpContentUnit
|
||||||
|
contentUnit =
|
||||||
|
(GraphicContext <$> graphicContext)
|
||||||
|
<|> (TextContext <$> textContext)
|
||||||
|
where
|
||||||
|
graphicContext =
|
||||||
|
string "q" *> blank *> graphicContextUnit `sepBy` blank <* blank <* string "Q"
|
||||||
|
|
||||||
|
graphicContextUnit :: Atto.Parser TmpGraphicContextUnit
|
||||||
|
graphicContextUnit =
|
||||||
|
(GraphicInstruction <$> instruction)
|
||||||
|
<|> (ContentUnit <$> contentUnit)
|
||||||
|
|
||||||
|
instruction :: Atto.Parser Instruction
|
||||||
|
instruction = evalStateT stackParser []
|
||||||
|
where
|
||||||
|
stackParser = ((directObject <* blank) >>= push) <|> operator
|
||||||
|
push arg = modify (arg:) *> stackParser
|
||||||
|
|
||||||
|
textContext :: Atto.Parser TmpTextContext
|
||||||
|
textContext =
|
||||||
|
string "BT" *> blank *> instruction `sepBy` blank <* blank <* string "ET"
|
||||||
|
|
||||||
|
instance Output Content where
|
||||||
|
output (Content {contentUnits, indexedInstructions}) =
|
||||||
|
runReader (mconcat <$> mapM outputCU contentUnits) indexedInstructions
|
||||||
|
where
|
||||||
|
outputCU (GraphicContext gc) = do
|
||||||
|
inside <- mconcat <$> mapM outputGCU gc
|
||||||
|
return (line "q" `mappend` inside `mappend` line "Q")
|
||||||
|
outputCU (TextContext tc) = do
|
||||||
|
inside <- mconcat <$> mapM outputIId tc
|
||||||
|
return (line "BT" `mappend` inside `mappend` line "ET")
|
||||||
|
outputGCU (GraphicInstruction gi) = outputIId gi
|
||||||
|
outputGCU (ContentUnit cu) = outputCU cu
|
||||||
|
outputIId instructionId = asks (output . (`at` instructionId))
|
76
src/PDF/Content/Operator.hs
Normal file
76
src/PDF/Content/Operator.hs
Normal file
|
@ -0,0 +1,76 @@
|
||||||
|
{-# LANGUAGE ConstraintKinds #-}
|
||||||
|
{-# LANGUAGE FlexibleContexts #-}
|
||||||
|
{-# LANGUAGE TypeSynonymInstances #-}
|
||||||
|
{-# LANGUAGE FlexibleInstances #-}
|
||||||
|
module PDF.Content.Operator (
|
||||||
|
Instruction
|
||||||
|
, Operator(..)
|
||||||
|
, operator
|
||||||
|
) where
|
||||||
|
|
||||||
|
import Control.Monad.Fail (MonadFail(..))
|
||||||
|
import Control.Monad.State (MonadState(..))
|
||||||
|
import Data.ByteString.Char8 (ByteString, pack, unpack)
|
||||||
|
import Data.Char (toLower)
|
||||||
|
import Data.Map (Map)
|
||||||
|
import qualified Data.Map as Map (fromList, lookup)
|
||||||
|
import qualified PDF.Content.Operator.Color as Color (Operator, signature)
|
||||||
|
import PDF.Content.Operator.Common (Signature)
|
||||||
|
import qualified PDF.Content.Operator.GraphicState as GraphicState (Operator, signature)
|
||||||
|
import qualified PDF.Content.Operator.Path as Path (Operator, signature)
|
||||||
|
import qualified PDF.Content.Operator.Text as Text (Operator, signature)
|
||||||
|
import PDF.Object (DirectObject, blank, regular)
|
||||||
|
import PDF.Output (Output(..), join, line)
|
||||||
|
import PDF.Parser (MonadParser, takeAll1)
|
||||||
|
import Prelude hiding (fail)
|
||||||
|
import Text.Printf (printf)
|
||||||
|
|
||||||
|
data Operator =
|
||||||
|
GraphicState GraphicState.Operator
|
||||||
|
| Path Path.Operator
|
||||||
|
| Color Color.Operator
|
||||||
|
| Text Text.Operator
|
||||||
|
|
||||||
|
instance Show Operator where
|
||||||
|
show (GraphicState gcOp) = code gcOp
|
||||||
|
show (Path pOp) = code pOp
|
||||||
|
show (Color cOp) = code cOp
|
||||||
|
show (Text tOp) = code tOp
|
||||||
|
|
||||||
|
type Instruction = (Operator, [DirectObject])
|
||||||
|
|
||||||
|
instance Output Instruction where
|
||||||
|
output (op, args) = join " " ((output <$> args) ++ [line (show op)])
|
||||||
|
|
||||||
|
operatorsTable :: Map ByteString (Signature Operator)
|
||||||
|
operatorsTable = Map.fromList (
|
||||||
|
(prepare GraphicState <$> GraphicState.signature)
|
||||||
|
++ (prepare Path <$> Path.signature)
|
||||||
|
++ (prepare Color <$> Color.signature)
|
||||||
|
++ (prepare Text <$> Text.signature)
|
||||||
|
)
|
||||||
|
where
|
||||||
|
prepare constructor (op, sig) = (pack $ code op, (constructor op, sig))
|
||||||
|
|
||||||
|
code :: Show a => a -> String
|
||||||
|
code = expand . show
|
||||||
|
where
|
||||||
|
expand "" = ""
|
||||||
|
expand (c:'_':s) = toLower c : expand s
|
||||||
|
expand ('s':'t':'a':'r':s) = '*' : expand s
|
||||||
|
expand ('Q':'u':'o':'t':'e':s) = '\'' : expand s
|
||||||
|
expand ('D':'Q':'u':'o':'t':'e':s) = '"' : expand s
|
||||||
|
expand (c:s) = c : expand s
|
||||||
|
|
||||||
|
type StackParser m = (MonadState [DirectObject] m, MonadParser m)
|
||||||
|
|
||||||
|
operator :: StackParser m => m Instruction
|
||||||
|
operator = do
|
||||||
|
chunk <- takeAll1 regular <* blank
|
||||||
|
args <- reverse <$> get
|
||||||
|
case Map.lookup chunk operatorsTable of
|
||||||
|
Just (op, sig)
|
||||||
|
| sig args -> return (op, args)
|
||||||
|
| otherwise ->
|
||||||
|
get >>= fail . printf "Operator %s with stack %s" (show op) . show
|
||||||
|
_ -> fail ("Unknown chunk " ++ unpack chunk)
|
27
src/PDF/Content/Operator/Color.hs
Normal file
27
src/PDF/Content/Operator/Color.hs
Normal file
|
@ -0,0 +1,27 @@
|
||||||
|
module PDF.Content.Operator.Color (
|
||||||
|
Operator(..)
|
||||||
|
, signature
|
||||||
|
) where
|
||||||
|
|
||||||
|
import PDF.Content.Operator.Common (Signature)
|
||||||
|
import PDF.Object (DirectObject(..))
|
||||||
|
|
||||||
|
data Operator =
|
||||||
|
CS | C_s | SC | SCN | S_c | S_cn | G | G_ | RG | R_g | K | K_
|
||||||
|
deriving (Bounded, Enum, Show)
|
||||||
|
|
||||||
|
signature :: [Signature Operator]
|
||||||
|
signature = [
|
||||||
|
(CS, \l -> case l of [NameObject _] -> True ; _ -> False)
|
||||||
|
, (C_s, \l -> case l of [NameObject _] -> True ; _ -> False)
|
||||||
|
, (SC, \_ -> True)
|
||||||
|
, (SCN, \_ -> True)
|
||||||
|
, (S_c, \_ -> True)
|
||||||
|
, (S_cn, \_ -> True)
|
||||||
|
, (G, \l -> case l of [_] -> True ; _ -> False)
|
||||||
|
, (G_, \l -> case l of [_] -> True ; _ -> False)
|
||||||
|
, (RG, \l -> case l of [_, _, _] -> True ; _ -> False)
|
||||||
|
, (R_g, \l -> case l of [_, _, _] -> True ; _ -> False)
|
||||||
|
, (K, \l -> case l of [_, _, _, _] -> True ; _ -> False)
|
||||||
|
, (K_, \l -> case l of [_, _, _, _] -> True ; _ -> False)
|
||||||
|
]
|
7
src/PDF/Content/Operator/Common.hs
Normal file
7
src/PDF/Content/Operator/Common.hs
Normal file
|
@ -0,0 +1,7 @@
|
||||||
|
module PDF.Content.Operator.Common (
|
||||||
|
Signature
|
||||||
|
) where
|
||||||
|
|
||||||
|
import PDF.Object (DirectObject)
|
||||||
|
|
||||||
|
type Signature a = (a, [DirectObject] -> Bool)
|
24
src/PDF/Content/Operator/GraphicState.hs
Normal file
24
src/PDF/Content/Operator/GraphicState.hs
Normal file
|
@ -0,0 +1,24 @@
|
||||||
|
module PDF.Content.Operator.GraphicState (
|
||||||
|
Operator(..)
|
||||||
|
, signature
|
||||||
|
) where
|
||||||
|
|
||||||
|
import PDF.Content.Operator.Common (Signature)
|
||||||
|
import PDF.Object (DirectObject(..))
|
||||||
|
|
||||||
|
data Operator =
|
||||||
|
C_m | W_ | J | J_ | M | D_ | R_i | I_ | G_s -- general graphic state
|
||||||
|
deriving (Bounded, Enum, Show)
|
||||||
|
|
||||||
|
signature :: [Signature Operator]
|
||||||
|
signature = [
|
||||||
|
(C_m, \l -> case l of [_, _, _, _, _, _] -> True ; _ -> False)
|
||||||
|
, (W_, \l -> case l of [_] -> True ; _ -> False)
|
||||||
|
, (J, \l -> case l of [_] -> True ; _ -> False)
|
||||||
|
, (J_, \l -> case l of [_] -> True ; _ -> False)
|
||||||
|
, (M, \l -> case l of [_] -> True ; _ -> False)
|
||||||
|
, (D_, \l -> case l of [_, _] -> True ; _ -> False)
|
||||||
|
, (R_i, \l -> case l of [_] -> True ; _ -> False)
|
||||||
|
, (I_, \l -> case l of [_] -> True ; _ -> False)
|
||||||
|
, (G_s, \l -> case l of [NameObject _] -> True ; _ -> False)
|
||||||
|
]
|
35
src/PDF/Content/Operator/Path.hs
Normal file
35
src/PDF/Content/Operator/Path.hs
Normal file
|
@ -0,0 +1,35 @@
|
||||||
|
module PDF.Content.Operator.Path (
|
||||||
|
Operator(..)
|
||||||
|
, signature
|
||||||
|
) where
|
||||||
|
|
||||||
|
import PDF.Content.Operator.Common (Signature)
|
||||||
|
|
||||||
|
data Operator =
|
||||||
|
M_ | L_ | C_ | V_ | Y_ | H_ | R_e -- path construction
|
||||||
|
| S | S_ | F_ | F | Fstar | B | Bstar | B_ | B_star | N_ -- path painting
|
||||||
|
| W | Wstar -- clipping path
|
||||||
|
deriving (Bounded, Enum, Show)
|
||||||
|
|
||||||
|
signature :: [Signature Operator]
|
||||||
|
signature = [
|
||||||
|
(M_, \l -> case l of [_, _] -> True ; _ -> False)
|
||||||
|
, (L_, \l -> case l of [_, _] -> True ; _ -> False)
|
||||||
|
, (C_, \l -> case l of [_, _, _, _, _, _] -> True ; _ -> False)
|
||||||
|
, (V_, \l -> case l of [_, _, _, _] -> True ; _ -> False)
|
||||||
|
, (Y_, \l -> case l of [_, _, _, _] -> True ; _ -> False)
|
||||||
|
, (H_, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (R_e, \l -> case l of [_, _, _, _] -> True ; _ -> False)
|
||||||
|
, (S, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (S_, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (F_, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (F, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (Fstar, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (B, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (Bstar, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (B_, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (B_star, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (N_, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (W, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (Wstar, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
]
|
32
src/PDF/Content/Operator/Text.hs
Normal file
32
src/PDF/Content/Operator/Text.hs
Normal file
|
@ -0,0 +1,32 @@
|
||||||
|
module PDF.Content.Operator.Text (
|
||||||
|
Operator(..)
|
||||||
|
, signature
|
||||||
|
) where
|
||||||
|
|
||||||
|
import PDF.Content.Operator.Common (Signature)
|
||||||
|
import PDF.Object (DirectObject(..))
|
||||||
|
|
||||||
|
data Operator =
|
||||||
|
Td | TD | Tm | Tstar -- text positioning
|
||||||
|
| TJ | Tj | Quote | DQuote -- text showing
|
||||||
|
| Tc | Tw | Tz | TL | Tf | Tr | Ts -- text state
|
||||||
|
deriving (Bounded, Enum, Show)
|
||||||
|
|
||||||
|
signature :: [Signature Operator]
|
||||||
|
signature = [
|
||||||
|
(Td, \l -> case l of [_, _] -> True ; _ -> False)
|
||||||
|
, (TD, \l -> case l of [_, _] -> True ; _ -> False)
|
||||||
|
, (Tm, \l -> case l of [_, _, _, _, _, _] -> True ; _ -> False)
|
||||||
|
, (Tstar, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (TJ, \l -> case l of [Array _] -> True ; _ -> False)
|
||||||
|
, (Tj, \l -> case l of [StringObject _] -> True ; _ -> False)
|
||||||
|
, (Quote, \l -> case l of [StringObject _] -> True ; _ -> False)
|
||||||
|
, (DQuote, \l -> case l of [_, _, StringObject _] -> True ; _ -> False)
|
||||||
|
, (Tc, \l -> case l of [_] -> True ; _ -> False)
|
||||||
|
, (Tw, \l -> case l of [_] -> True ; _ -> False)
|
||||||
|
, (Tz, \l -> case l of [_] -> True ; _ -> False)
|
||||||
|
, (TL, \l -> case l of [_] -> True ; _ -> False)
|
||||||
|
, (Tf, \l -> case l of [NameObject _, _] -> True ; _ -> False)
|
||||||
|
, (Tr, \l -> case l of [_] -> True ; _ -> False)
|
||||||
|
, (Ts, \l -> case l of [_] -> True ; _ -> False)
|
||||||
|
]
|
151
src/PDF/Content/Text.hs
Normal file
151
src/PDF/Content/Text.hs
Normal file
|
@ -0,0 +1,151 @@
|
||||||
|
{-# LANGUAGE NamedFieldPuns #-}
|
||||||
|
{-# LANGUAGE OverloadedStrings #-}
|
||||||
|
{-# LANGUAGE FlexibleContexts #-}
|
||||||
|
{-# LANGUAGE FlexibleInstances #-}
|
||||||
|
{-# LANGUAGE MultiParamTypeClasses #-}
|
||||||
|
module PDF.Content.Text (
|
||||||
|
Chunks(..)
|
||||||
|
, chunk
|
||||||
|
, format
|
||||||
|
) where
|
||||||
|
|
||||||
|
import Control.Monad (foldM)
|
||||||
|
import Control.Monad.Fail (MonadFail(..))
|
||||||
|
import Control.Monad.Reader (ReaderT, asks, runReaderT)
|
||||||
|
import Control.Monad.State (
|
||||||
|
MonadState(..), StateT, evalStateT, gets, modify, runStateT
|
||||||
|
)
|
||||||
|
import Control.Monad.Trans (lift)
|
||||||
|
import qualified Data.ByteString.Char8 as BS (concatMap, singleton)
|
||||||
|
import Data.Id (Id(..), Indexed, at, empty, singleton)
|
||||||
|
import qualified Data.Id as Id (delete, lookup, register)
|
||||||
|
import Data.Map ((!))
|
||||||
|
import Data.Text (Text, breakOn)
|
||||||
|
import qualified Data.Text as Text (drop)
|
||||||
|
import PDF.Box (Box(..))
|
||||||
|
import PDF.Content (
|
||||||
|
Content(..), ContentUnit(..), GraphicContextUnit(..), IdContentUnit
|
||||||
|
, IdGraphicContextUnit
|
||||||
|
)
|
||||||
|
import PDF.Content.Operator (Instruction, Operator(..))
|
||||||
|
import PDF.Content.Operator.Text (Operator(..))
|
||||||
|
import PDF.Font (Font(..), FontSet, emptyFont)
|
||||||
|
import PDF.Object (DirectObject(..), StringObject(..), toByteString)
|
||||||
|
import Prelude hiding (fail)
|
||||||
|
|
||||||
|
type TmpFont = StateT Font
|
||||||
|
type Renderer m = ReaderT (Indexed Instruction) (ReaderT FontSet m)
|
||||||
|
type Updater m = StateT (Indexed Instruction) (ReaderT (Indexed Text) (ReaderT FontSet m))
|
||||||
|
|
||||||
|
decodeString :: MonadFail m => StringObject -> TmpFont (Renderer m) Text
|
||||||
|
decodeString input = do
|
||||||
|
Font {decode} <- get
|
||||||
|
either fail return . decode $ toByteString input
|
||||||
|
|
||||||
|
data Chunks = Chunks
|
||||||
|
|
||||||
|
chunk :: Int -> Id Text
|
||||||
|
chunk = Id
|
||||||
|
|
||||||
|
instance MonadFail m => Box (ReaderT FontSet m) Chunks Content (Indexed Text) where
|
||||||
|
r Chunks content =
|
||||||
|
runReaderT (mconcat <$> renderer) $ indexedInstructions content
|
||||||
|
where
|
||||||
|
renderer = mapM renderContentUnit (contentUnits content)
|
||||||
|
|
||||||
|
w Chunks indexedText content = do
|
||||||
|
(contentUnits, indexedInstructions) <- runReaderT readerUpdate indexedText
|
||||||
|
return $ content {contentUnits, indexedInstructions}
|
||||||
|
where
|
||||||
|
stateUpdate = mapM updateContentUnit (contentUnits content)
|
||||||
|
readerUpdate = runStateT stateUpdate (indexedInstructions content)
|
||||||
|
|
||||||
|
renderContentUnit :: MonadFail m => IdContentUnit -> Renderer m (Indexed Text)
|
||||||
|
renderContentUnit (GraphicContext graphicContextUnits) =
|
||||||
|
mconcat <$> mapM renderGraphicContextUnit graphicContextUnits
|
||||||
|
renderContentUnit (TextContext instructionIds) =
|
||||||
|
evalStateT (mconcat <$> mapM renderInstructionId instructionIds) emptyFont
|
||||||
|
|
||||||
|
updateContentUnit :: MonadFail m => IdContentUnit -> Updater m IdContentUnit
|
||||||
|
updateContentUnit (GraphicContext graphicContextUnits) = GraphicContext <$>
|
||||||
|
mapM updateGraphicContextUnit graphicContextUnits
|
||||||
|
updateContentUnit (TextContext instructionIds) = TextContext . concat <$>
|
||||||
|
evalStateT (mapM updateInstructionId instructionIds) emptyFont
|
||||||
|
|
||||||
|
renderGraphicContextUnit :: MonadFail m => IdGraphicContextUnit -> Renderer m (Indexed Text)
|
||||||
|
renderGraphicContextUnit (GraphicInstruction _) = return empty
|
||||||
|
renderGraphicContextUnit (ContentUnit contentUnit) =
|
||||||
|
renderContentUnit contentUnit
|
||||||
|
|
||||||
|
updateGraphicContextUnit :: MonadFail m => IdGraphicContextUnit -> Updater m IdGraphicContextUnit
|
||||||
|
updateGraphicContextUnit gI@(GraphicInstruction _) = return gI
|
||||||
|
updateGraphicContextUnit (ContentUnit contentUnit) =
|
||||||
|
ContentUnit <$> updateContentUnit contentUnit
|
||||||
|
|
||||||
|
renderInstructionId :: MonadFail m => Id Instruction -> TmpFont (Renderer m) (Indexed Text)
|
||||||
|
renderInstructionId instructionId@(Id n) = toMap <$>
|
||||||
|
(asks ((`at` instructionId)) >>= renderInstruction)
|
||||||
|
where
|
||||||
|
toMap = maybe empty (singleton (Id n))
|
||||||
|
|
||||||
|
updateInstructionId :: MonadFail m => Id Instruction -> TmpFont (Updater m) [Id Instruction]
|
||||||
|
updateInstructionId instructionId =
|
||||||
|
lift (gets (`at` instructionId)) >>= updateInstruction instructionId
|
||||||
|
|
||||||
|
renderInstruction :: MonadFail m => Instruction -> TmpFont (Renderer m) (Maybe Text)
|
||||||
|
renderInstruction (Text Tf, [NameObject fontName, _]) =
|
||||||
|
lift (lift $ asks (! fontName)) >>= put >> return Nothing
|
||||||
|
|
||||||
|
renderInstruction (Text Tstar, []) = return $ Just "\n"
|
||||||
|
|
||||||
|
renderInstruction (Text TJ, [Array arrayObject]) =
|
||||||
|
Just <$> foldM appendText "" arrayObject
|
||||||
|
where
|
||||||
|
appendText t (StringObject outputString) =
|
||||||
|
mappend t <$> decodeString outputString
|
||||||
|
appendText t _ = return t
|
||||||
|
|
||||||
|
renderInstruction (Text Tj, [StringObject outputString]) =
|
||||||
|
Just <$> decodeString outputString
|
||||||
|
|
||||||
|
renderInstruction (Text Quote, [StringObject outputString]) =
|
||||||
|
(Just . mappend "\n") <$> decodeString outputString
|
||||||
|
|
||||||
|
renderInstruction (Text DQuote, [_, _, StringObject outputString]) =
|
||||||
|
(Just . mappend "\n") <$> decodeString outputString
|
||||||
|
|
||||||
|
renderInstruction _ = return Nothing
|
||||||
|
|
||||||
|
updateInstruction :: MonadFail m => Id Instruction -> Instruction -> TmpFont (Updater m) [Id Instruction]
|
||||||
|
updateInstruction instructionId (Text Tf, [NameObject fontName, _]) =
|
||||||
|
(lift . lift . lift $ asks (!fontName)) >>= put >> return [instructionId]
|
||||||
|
|
||||||
|
updateInstruction instructionId@(Id n) instruction = do
|
||||||
|
if emitsText $ fst instruction
|
||||||
|
then asks (Id.lookup (Id n)) >>= replaceText
|
||||||
|
else return [instructionId]
|
||||||
|
where
|
||||||
|
emitsText (Text Tstar) = True
|
||||||
|
emitsText (Text TJ) = True
|
||||||
|
emitsText (Text Tj) = True
|
||||||
|
emitsText (Text Quote) = True
|
||||||
|
emitsText (Text DQuote) = True
|
||||||
|
emitsText _ = False
|
||||||
|
replaceText = maybe (return []) $ \text -> do
|
||||||
|
lift $ modify (Id.delete instructionId)
|
||||||
|
format text >>= mapM (lift . Id.register)
|
||||||
|
|
||||||
|
format :: MonadFail m => Text -> StateT Font m [Instruction]
|
||||||
|
format input = do
|
||||||
|
case breakOn "\n" input of
|
||||||
|
("", "") -> return []
|
||||||
|
("", left) -> ((Text Tstar, []) :) <$> format (Text.drop 1 left)
|
||||||
|
(line, left) -> (:) <$> tj line <*> format left
|
||||||
|
where
|
||||||
|
tj t = do
|
||||||
|
encoded <- either fail return =<< gets (($t) . encode)
|
||||||
|
return (Text Tj, [StringObject . Literal $ BS.concatMap escape encoded])
|
||||||
|
escape '\\' = "\\\\"
|
||||||
|
escape '(' = "\\("
|
||||||
|
escape ')' = "\\)"
|
||||||
|
escape c = BS.singleton c
|
|
@ -6,14 +6,14 @@ module PDF.EOL (
|
||||||
) where
|
) where
|
||||||
|
|
||||||
import Control.Applicative ((<|>))
|
import Control.Applicative ((<|>))
|
||||||
import PDF.Parser (Parser, string)
|
import PDF.Parser (MonadParser, string)
|
||||||
|
|
||||||
data Style = CR | LF | CRLF deriving Show
|
data Style = CR | LF | CRLF deriving Show
|
||||||
|
|
||||||
charset :: String
|
charset :: String
|
||||||
charset = "\r\n"
|
charset = "\r\n"
|
||||||
|
|
||||||
parser :: Parser s Style
|
parser :: MonadParser m => m Style
|
||||||
parser =
|
parser =
|
||||||
(string "\r\n" >> return CRLF)
|
(string "\r\n" >> return CRLF)
|
||||||
<|> (string "\r" >> return CR)
|
<|> (string "\r" >> return CR)
|
||||||
|
|
13
src/PDF/Encoding.hs
Normal file
13
src/PDF/Encoding.hs
Normal file
|
@ -0,0 +1,13 @@
|
||||||
|
{-# LANGUAGE FlexibleContexts #-}
|
||||||
|
module PDF.Encoding (
|
||||||
|
encoding
|
||||||
|
) where
|
||||||
|
|
||||||
|
import Control.Monad.Except (MonadError(..))
|
||||||
|
import PDF.Encoding.MacRoman (macRomanEncoding)
|
||||||
|
import PDF.Font (Font)
|
||||||
|
import Prelude hiding (fail)
|
||||||
|
|
||||||
|
encoding :: MonadError String m => String -> m Font
|
||||||
|
encoding "MacRomanEncoding" = return macRomanEncoding
|
||||||
|
encoding s = throwError $ "Unknown encoding " ++ s
|
162
src/PDF/Encoding/MacRoman.hs
Normal file
162
src/PDF/Encoding/MacRoman.hs
Normal file
|
@ -0,0 +1,162 @@
|
||||||
|
module PDF.Encoding.MacRoman (
|
||||||
|
macRomanEncoding
|
||||||
|
) where
|
||||||
|
|
||||||
|
import qualified Data.ByteString.Char8 as BS (pack, unpack)
|
||||||
|
import Data.Foldable (foldl')
|
||||||
|
import Data.Map (Map)
|
||||||
|
import qualified Data.Map as Map (empty, insert, lookup)
|
||||||
|
import qualified Data.Text as Text (pack, unpack)
|
||||||
|
import PDF.Font (Font(..))
|
||||||
|
|
||||||
|
type Mapper = Map Char Char
|
||||||
|
macRomanEncoding :: Font
|
||||||
|
macRomanEncoding = Font {
|
||||||
|
decode = Right . Text.pack . fmap decodeChar . BS.unpack
|
||||||
|
, encode = fmap BS.pack . mapM encodeChar . Text.unpack
|
||||||
|
}
|
||||||
|
where
|
||||||
|
decodeChar k = maybe k id $ Map.lookup k (fst mappers)
|
||||||
|
encodeChar k =
|
||||||
|
case Map.lookup k (snd mappers) of
|
||||||
|
Just v -> Right v
|
||||||
|
Nothing
|
||||||
|
| k < '\x80' -> Right k
|
||||||
|
| otherwise -> Left ("Character '" ++ k :"' unavailable in MacRoman")
|
||||||
|
|
||||||
|
mappers :: (Mapper, Mapper)
|
||||||
|
mappers = foldl' generateMapers (Map.empty, Map.empty) [
|
||||||
|
('\x80', '\x00C4') -- LATIN CAPITAL LETTER A WITH DIAERESIS
|
||||||
|
, ('\x81', '\x00C5') -- LATIN CAPITAL LETTER A WITH RING ABOVE
|
||||||
|
, ('\x82', '\x00C7') -- LATIN CAPITAL LETTER C WITH CEDILLA
|
||||||
|
, ('\x83', '\x00C9') -- LATIN CAPITAL LETTER E WITH ACUTE
|
||||||
|
, ('\x84', '\x00D1') -- LATIN CAPITAL LETTER N WITH TILDE
|
||||||
|
, ('\x85', '\x00D6') -- LATIN CAPITAL LETTER O WITH DIAERESIS
|
||||||
|
, ('\x86', '\x00DC') -- LATIN CAPITAL LETTER U WITH DIAERESIS
|
||||||
|
, ('\x87', '\x00E1') -- LATIN SMALL LETTER A WITH ACUTE
|
||||||
|
, ('\x88', '\x00E0') -- LATIN SMALL LETTER A WITH GRAVE
|
||||||
|
, ('\x89', '\x00E2') -- LATIN SMALL LETTER A WITH CIRCUMFLEX
|
||||||
|
, ('\x8A', '\x00E4') -- LATIN SMALL LETTER A WITH DIAERESIS
|
||||||
|
, ('\x8B', '\x00E3') -- LATIN SMALL LETTER A WITH TILDE
|
||||||
|
, ('\x8C', '\x00E5') -- LATIN SMALL LETTER A WITH RING ABOVE
|
||||||
|
, ('\x8D', '\x00E7') -- LATIN SMALL LETTER C WITH CEDILLA
|
||||||
|
, ('\x8E', '\x00E9') -- LATIN SMALL LETTER E WITH ACUTE
|
||||||
|
, ('\x8F', '\x00E8') -- LATIN SMALL LETTER E WITH GRAVE
|
||||||
|
, ('\x90', '\x00EA') -- LATIN SMALL LETTER E WITH CIRCUMFLEX
|
||||||
|
, ('\x91', '\x00EB') -- LATIN SMALL LETTER E WITH DIAERESIS
|
||||||
|
, ('\x92', '\x00ED') -- LATIN SMALL LETTER I WITH ACUTE
|
||||||
|
, ('\x93', '\x00EC') -- LATIN SMALL LETTER I WITH GRAVE
|
||||||
|
, ('\x94', '\x00EE') -- LATIN SMALL LETTER I WITH CIRCUMFLEX
|
||||||
|
, ('\x95', '\x00EF') -- LATIN SMALL LETTER I WITH DIAERESIS
|
||||||
|
, ('\x96', '\x00F1') -- LATIN SMALL LETTER N WITH TILDE
|
||||||
|
, ('\x97', '\x00F3') -- LATIN SMALL LETTER O WITH ACUTE
|
||||||
|
, ('\x98', '\x00F2') -- LATIN SMALL LETTER O WITH GRAVE
|
||||||
|
, ('\x99', '\x00F4') -- LATIN SMALL LETTER O WITH CIRCUMFLEX
|
||||||
|
, ('\x9A', '\x00F6') -- LATIN SMALL LETTER O WITH DIAERESIS
|
||||||
|
, ('\x9B', '\x00F5') -- LATIN SMALL LETTER O WITH TILDE
|
||||||
|
, ('\x9C', '\x00FA') -- LATIN SMALL LETTER U WITH ACUTE
|
||||||
|
, ('\x9D', '\x00F9') -- LATIN SMALL LETTER U WITH GRAVE
|
||||||
|
, ('\x9E', '\x00FB') -- LATIN SMALL LETTER U WITH CIRCUMFLEX
|
||||||
|
, ('\x9F', '\x00FC') -- LATIN SMALL LETTER U WITH DIAERESIS
|
||||||
|
, ('\xA0', '\x2020') -- DAGGER
|
||||||
|
, ('\xA1', '\x00B0') -- DEGREE SIGN
|
||||||
|
, ('\xA2', '\x00A2') -- CENT SIGN
|
||||||
|
, ('\xA3', '\x00A3') -- POUND SIGN
|
||||||
|
, ('\xA4', '\x00A7') -- SECTION SIGN
|
||||||
|
, ('\xA5', '\x2022') -- BULLET
|
||||||
|
, ('\xA6', '\x00B6') -- PILCROW SIGN
|
||||||
|
, ('\xA7', '\x00DF') -- LATIN SMALL LETTER SHARP S
|
||||||
|
, ('\xA8', '\x00AE') -- REGISTERED SIGN
|
||||||
|
, ('\xA9', '\x00A9') -- COPYRIGHT SIGN
|
||||||
|
, ('\xAA', '\x2122') -- TRADE MARK SIGN
|
||||||
|
, ('\xAB', '\x00B4') -- ACUTE ACCENT
|
||||||
|
, ('\xAC', '\x00A8') -- DIAERESIS
|
||||||
|
, ('\xAD', '\x2260') -- NOT EQUAL TO
|
||||||
|
, ('\xAE', '\x00C6') -- LATIN CAPITAL LETTER AE
|
||||||
|
, ('\xAF', '\x00D8') -- LATIN CAPITAL LETTER O WITH STROKE
|
||||||
|
, ('\xB0', '\x221E') -- INFINITY
|
||||||
|
, ('\xB1', '\x00B1') -- PLUS-MINUS SIGN
|
||||||
|
, ('\xB2', '\x2264') -- LESS-THAN OR EQUAL TO
|
||||||
|
, ('\xB3', '\x2265') -- GREATER-THAN OR EQUAL TO
|
||||||
|
, ('\xB4', '\x00A5') -- YEN SIGN
|
||||||
|
, ('\xB5', '\x00B5') -- MICRO SIGN
|
||||||
|
, ('\xB6', '\x2202') -- PARTIAL DIFFERENTIAL
|
||||||
|
, ('\xB7', '\x2211') -- N-ARY SUMMATION
|
||||||
|
, ('\xB8', '\x220F') -- N-ARY PRODUCT
|
||||||
|
, ('\xB9', '\x03C0') -- GREEK SMALL LETTER PI
|
||||||
|
, ('\xBA', '\x222B') -- INTEGRAL
|
||||||
|
, ('\xBB', '\x00AA') -- FEMININE ORDINAL INDICATOR
|
||||||
|
, ('\xBC', '\x00BA') -- MASCULINE ORDINAL INDICATOR
|
||||||
|
, ('\xBD', '\x03A9') -- GREEK CAPITAL LETTER OMEGA
|
||||||
|
, ('\xBE', '\x00E6') -- LATIN SMALL LETTER AE
|
||||||
|
, ('\xBF', '\x00F8') -- LATIN SMALL LETTER O WITH STROKE
|
||||||
|
, ('\xC0', '\x00BF') -- INVERTED QUESTION MARK
|
||||||
|
, ('\xC1', '\x00A1') -- INVERTED EXCLAMATION MARK
|
||||||
|
, ('\xC2', '\x00AC') -- NOT SIGN
|
||||||
|
, ('\xC3', '\x221A') -- SQUARE ROOT
|
||||||
|
, ('\xC4', '\x0192') -- LATIN SMALL LETTER F WITH HOOK
|
||||||
|
, ('\xC5', '\x2248') -- ALMOST EQUAL TO
|
||||||
|
, ('\xC6', '\x2206') -- INCREMENT
|
||||||
|
, ('\xC7', '\x00AB') -- LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
|
||||||
|
, ('\xC8', '\x00BB') -- RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
|
||||||
|
, ('\xC9', '\x2026') -- HORIZONTAL ELLIPSIS
|
||||||
|
, ('\xCA', '\x00A0') -- NO-BREAK SPACE
|
||||||
|
, ('\xCB', '\x00C0') -- LATIN CAPITAL LETTER A WITH GRAVE
|
||||||
|
, ('\xCC', '\x00C3') -- LATIN CAPITAL LETTER A WITH TILDE
|
||||||
|
, ('\xCD', '\x00D5') -- LATIN CAPITAL LETTER O WITH TILDE
|
||||||
|
, ('\xCE', '\x0152') -- LATIN CAPITAL LIGATURE OE
|
||||||
|
, ('\xCF', '\x0153') -- LATIN SMALL LIGATURE OE
|
||||||
|
, ('\xD0', '\x2013') -- EN DASH
|
||||||
|
, ('\xD1', '\x2014') -- EM DASH
|
||||||
|
, ('\xD2', '\x201C') -- LEFT DOUBLE QUOTATION MARK
|
||||||
|
, ('\xD3', '\x201D') -- RIGHT DOUBLE QUOTATION MARK
|
||||||
|
, ('\xD4', '\x2018') -- LEFT SINGLE QUOTATION MARK
|
||||||
|
, ('\xD5', '\x2019') -- RIGHT SINGLE QUOTATION MARK
|
||||||
|
, ('\xD6', '\x00F7') -- DIVISION SIGN
|
||||||
|
, ('\xD7', '\x25CA') -- LOZENGE
|
||||||
|
, ('\xD8', '\x00FF') -- LATIN SMALL LETTER Y WITH DIAERESIS
|
||||||
|
, ('\xD9', '\x0178') -- LATIN CAPITAL LETTER Y WITH DIAERESIS
|
||||||
|
, ('\xDA', '\x2044') -- FRACTION SLASH
|
||||||
|
, ('\xDB', '\x20AC') -- EURO SIGN
|
||||||
|
, ('\xDC', '\x2039') -- SINGLE LEFT-POINTING ANGLE QUOTATION MARK
|
||||||
|
, ('\xDD', '\x203A') -- SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
|
||||||
|
, ('\xDE', '\xFB01') -- LATIN SMALL LIGATURE FI
|
||||||
|
, ('\xDF', '\xFB02') -- LATIN SMALL LIGATURE FL
|
||||||
|
, ('\xE0', '\x2021') -- DOUBLE DAGGER
|
||||||
|
, ('\xE1', '\x00B7') -- MIDDLE DOT
|
||||||
|
, ('\xE2', '\x201A') -- SINGLE LOW-9 QUOTATION MARK
|
||||||
|
, ('\xE3', '\x201E') -- DOUBLE LOW-9 QUOTATION MARK
|
||||||
|
, ('\xE4', '\x2030') -- PER MILLE SIGN
|
||||||
|
, ('\xE5', '\x00C2') -- LATIN CAPITAL LETTER A WITH CIRCUMFLEX
|
||||||
|
, ('\xE6', '\x00CA') -- LATIN CAPITAL LETTER E WITH CIRCUMFLEX
|
||||||
|
, ('\xE7', '\x00C1') -- LATIN CAPITAL LETTER A WITH ACUTE
|
||||||
|
, ('\xE8', '\x00CB') -- LATIN CAPITAL LETTER E WITH DIAERESIS
|
||||||
|
, ('\xE9', '\x00C8') -- LATIN CAPITAL LETTER E WITH GRAVE
|
||||||
|
, ('\xEA', '\x00CD') -- LATIN CAPITAL LETTER I WITH ACUTE
|
||||||
|
, ('\xEB', '\x00CE') -- LATIN CAPITAL LETTER I WITH CIRCUMFLEX
|
||||||
|
, ('\xEC', '\x00CF') -- LATIN CAPITAL LETTER I WITH DIAERESIS
|
||||||
|
, ('\xED', '\x00CC') -- LATIN CAPITAL LETTER I WITH GRAVE
|
||||||
|
, ('\xEE', '\x00D3') -- LATIN CAPITAL LETTER O WITH ACUTE
|
||||||
|
, ('\xEF', '\x00D4') -- LATIN CAPITAL LETTER O WITH CIRCUMFLEX
|
||||||
|
, ('\xF0', '\xF8FF') -- Apple logo
|
||||||
|
, ('\xF1', '\x00D2') -- LATIN CAPITAL LETTER O WITH GRAVE
|
||||||
|
, ('\xF2', '\x00DA') -- LATIN CAPITAL LETTER U WITH ACUTE
|
||||||
|
, ('\xF3', '\x00DB') -- LATIN CAPITAL LETTER U WITH CIRCUMFLEX
|
||||||
|
, ('\xF4', '\x00D9') -- LATIN CAPITAL LETTER U WITH GRAVE
|
||||||
|
, ('\xF5', '\x0131') -- LATIN SMALL LETTER DOTLESS I
|
||||||
|
, ('\xF6', '\x02C6') -- MODIFIER LETTER CIRCUMFLEX ACCENT
|
||||||
|
, ('\xF7', '\x02DC') -- SMALL TILDE
|
||||||
|
, ('\xF8', '\x00AF') -- MACRON
|
||||||
|
, ('\xF9', '\x02D8') -- BREVE
|
||||||
|
, ('\xFA', '\x02D9') -- DOT ABOVE
|
||||||
|
, ('\xFB', '\x02DA') -- RING ABOVE
|
||||||
|
, ('\xFC', '\x00B8') -- CEDILLA
|
||||||
|
, ('\xFD', '\x02DD') -- DOUBLE ACUTE ACCENT
|
||||||
|
, ('\xFE', '\x02DB') -- OGONEK
|
||||||
|
, ('\xFF', '\x02C7') -- CARON
|
||||||
|
]
|
||||||
|
where
|
||||||
|
generateMapers (tmpDecoder, tmpEncoder) (macChar, utf8Char) = (
|
||||||
|
Map.insert macChar utf8Char tmpDecoder
|
||||||
|
, Map.insert utf8Char macChar tmpEncoder
|
||||||
|
)
|
27
src/PDF/Font.hs
Normal file
27
src/PDF/Font.hs
Normal file
|
@ -0,0 +1,27 @@
|
||||||
|
module PDF.Font (
|
||||||
|
Decoder
|
||||||
|
, Encoder
|
||||||
|
, Font(..)
|
||||||
|
, FontSet
|
||||||
|
, emptyFont
|
||||||
|
) where
|
||||||
|
|
||||||
|
import Data.ByteString (ByteString)
|
||||||
|
import Data.Map (Map)
|
||||||
|
import Data.Text (Text)
|
||||||
|
import PDF.Object (Name)
|
||||||
|
|
||||||
|
type Decoder = ByteString -> Either String Text
|
||||||
|
type Encoder = Text -> Either String ByteString
|
||||||
|
data Font = Font {
|
||||||
|
decode :: Decoder
|
||||||
|
, encode :: Encoder
|
||||||
|
}
|
||||||
|
|
||||||
|
type FontSet = Map Name Font
|
||||||
|
|
||||||
|
emptyFont :: Font
|
||||||
|
emptyFont = Font {
|
||||||
|
decode = \_ -> Left "No fond loaded"
|
||||||
|
, encode = \_ -> Left "No fond loaded"
|
||||||
|
}
|
129
src/PDF/Layer.hs
Normal file
129
src/PDF/Layer.hs
Normal file
|
@ -0,0 +1,129 @@
|
||||||
|
{-# LANGUAGE NamedFieldPuns #-}
|
||||||
|
{-# LANGUAGE FlexibleInstances #-}
|
||||||
|
{-# LANGUAGE MultiParamTypeClasses #-}
|
||||||
|
{-# LANGUAGE FlexibleContexts #-}
|
||||||
|
module PDF.Layer (
|
||||||
|
Layer(..)
|
||||||
|
, LayerReader
|
||||||
|
, Objects(..)
|
||||||
|
, unify
|
||||||
|
) where
|
||||||
|
|
||||||
|
import Control.Monad.Except (MonadError(..))
|
||||||
|
import Control.Monad.Reader (ReaderT)
|
||||||
|
import Data.Foldable (foldl')
|
||||||
|
import Data.Id (Id(..), Indexed, keysSet, mapWithKey, member)
|
||||||
|
import qualified Data.Id as Id (empty, insert, lookup, union)
|
||||||
|
import Data.Map (Map, (!))
|
||||||
|
import qualified Data.Map as Map (empty, lookup, union)
|
||||||
|
import qualified Data.IntSet as IntSet (delete, toList)
|
||||||
|
import PDF.Box (Box(..))
|
||||||
|
import PDF.Object (
|
||||||
|
IndirectObjCoordinates(..), Object, Occurrence(..), Structure(..)
|
||||||
|
, XRefEntry(..), XRefSection, eofMarker, outputBody
|
||||||
|
)
|
||||||
|
import qualified PDF.Output as Output (line)
|
||||||
|
import PDF.Output (
|
||||||
|
Offset(..), Output(..), Resource(..), byteString, getOffset
|
||||||
|
, getOffsets, newLine
|
||||||
|
)
|
||||||
|
import Text.Printf (printf)
|
||||||
|
|
||||||
|
data Layer = Layer {
|
||||||
|
occurrences :: [Occurrence]
|
||||||
|
, objects :: Indexed Object
|
||||||
|
, docStructure :: Structure
|
||||||
|
} deriving Show
|
||||||
|
type LayerReader m = ReaderT Layer m
|
||||||
|
|
||||||
|
updateXRefs :: XRefSection -> Map Resource Offset -> (XRefSection, Offset)
|
||||||
|
updateXRefs xRef offsets = (mapWithKey updateEntry xRef, offsets ! StartXRef)
|
||||||
|
where
|
||||||
|
updateEntry objectId e@(InUse {offset}) =
|
||||||
|
case Map.lookup (ObjectId $ getId objectId) offsets of
|
||||||
|
Nothing -> Free {nextFree = Id $ getOffset offset, generation = 65535}
|
||||||
|
Just newOffset -> e {offset = newOffset}
|
||||||
|
updateEntry _ e = e
|
||||||
|
|
||||||
|
instance Output Layer where
|
||||||
|
output (Layer {occurrences, objects, docStructure}) = do
|
||||||
|
(body, savedOffsets) <- getOffsets (outputBody (occurrences, objects))
|
||||||
|
let (newXRef, startXRef) = updateXRefs xRef savedOffsets
|
||||||
|
mconcat [
|
||||||
|
return body
|
||||||
|
, Output.line "xref"
|
||||||
|
, output newXRef
|
||||||
|
, Output.line "trailer"
|
||||||
|
, output trailer, newLine
|
||||||
|
, Output.line "startxref"
|
||||||
|
, Output.line (printf "%d" (getOffset startXRef))
|
||||||
|
, byteString eofMarker
|
||||||
|
]
|
||||||
|
where
|
||||||
|
Structure {xRef, trailer} = docStructure
|
||||||
|
|
||||||
|
data Objects = Objects
|
||||||
|
|
||||||
|
instance Monad m => Box m Objects Layer (Indexed Object) where
|
||||||
|
r Objects = return . objects
|
||||||
|
w Objects newObjects layer@(Layer {occurrences, docStructure}) =
|
||||||
|
return $ layer {
|
||||||
|
occurrences = keptOccurrences ++ newOccurrences
|
||||||
|
, objects = newObjects
|
||||||
|
, docStructure = docStructure {
|
||||||
|
xRef = (const $ InUse (Offset 0) 0) <$> newObjects
|
||||||
|
}
|
||||||
|
}
|
||||||
|
where
|
||||||
|
filterOccurrences c@(Comment _) (occ, newObjIds) = (c:occ, newObjIds)
|
||||||
|
filterOccurrences i@(Indirect (IndirectObjCoordinates {objectId})) (occ, newObjIds)
|
||||||
|
| member objectId newObjects = (i:occ, IntSet.delete (getId objectId) newObjIds)
|
||||||
|
| otherwise = (occ, newObjIds)
|
||||||
|
(keptOccurrences, newObjectIds) =
|
||||||
|
foldr filterOccurrences ([], keysSet newObjects) occurrences
|
||||||
|
makeOccurrence objectId =
|
||||||
|
Indirect (IndirectObjCoordinates {objectId, versionNumber = 0})
|
||||||
|
newOccurrences = (makeOccurrence . Id) <$> IntSet.toList newObjectIds
|
||||||
|
|
||||||
|
instance MonadError String m => Box m (Id Object) Layer Object where
|
||||||
|
r objectId =
|
||||||
|
maybe (throwError "Unknown key") return . Id.lookup objectId . objects
|
||||||
|
w objectId a layer@(Layer {objects})
|
||||||
|
| member objectId objects = return $
|
||||||
|
layer {objects = Id.insert objectId a objects}
|
||||||
|
| otherwise = throwError "Unknown key"
|
||||||
|
|
||||||
|
emptyLayer :: Layer
|
||||||
|
emptyLayer = Layer {
|
||||||
|
docStructure = Structure {xRef = Id.empty, trailer = Map.empty}
|
||||||
|
, objects = Id.empty
|
||||||
|
, occurrences = []
|
||||||
|
}
|
||||||
|
|
||||||
|
unify :: [Layer] -> Layer
|
||||||
|
unify = foldl' complete emptyLayer
|
||||||
|
where
|
||||||
|
complete tmpLayer older =
|
||||||
|
let mergedObjects = Id.union (objects tmpLayer) (objects older) in
|
||||||
|
Layer {
|
||||||
|
docStructure =
|
||||||
|
unifyDocStructure (docStructure tmpLayer) (docStructure older)
|
||||||
|
, objects = mergedObjects
|
||||||
|
, occurrences =
|
||||||
|
unifyOccurrences mergedObjects (occurrences tmpLayer) (occurrences older)
|
||||||
|
}
|
||||||
|
|
||||||
|
unifyDocStructure :: Structure -> Structure -> Structure
|
||||||
|
unifyDocStructure new old = Structure {
|
||||||
|
xRef = Id.union (xRef new) (xRef old)
|
||||||
|
, trailer = Map.union (trailer new) (trailer old)
|
||||||
|
}
|
||||||
|
|
||||||
|
unifyOccurrences :: (Indexed Object) -> [Occurrence] -> [Occurrence] -> [Occurrence]
|
||||||
|
unifyOccurrences objects new = foldr addOlder new
|
||||||
|
where
|
||||||
|
addOlder occurrence@(Comment _) existing = occurrence : existing
|
||||||
|
addOlder occurrence@(Indirect indirect) existing =
|
||||||
|
if objectId indirect `member` objects
|
||||||
|
then occurrence : existing
|
||||||
|
else existing
|
|
@ -2,19 +2,20 @@
|
||||||
{-# LANGUAGE NamedFieldPuns #-}
|
{-# LANGUAGE NamedFieldPuns #-}
|
||||||
{-# LANGUAGE FlexibleInstances #-}
|
{-# LANGUAGE FlexibleInstances #-}
|
||||||
module PDF.Object (
|
module PDF.Object (
|
||||||
Content(..)
|
Dictionary
|
||||||
, DirectObject(..)
|
, DirectObject(..)
|
||||||
, Flow(..)
|
, Flow(..)
|
||||||
, IndexedObjects
|
|
||||||
, IndirectObjCoordinates(..)
|
, IndirectObjCoordinates(..)
|
||||||
, InputStructure(..)
|
, InputStructure(..)
|
||||||
, Name(..)
|
, Name(..)
|
||||||
, Number(..)
|
, Number(..)
|
||||||
, Object(..)
|
, Object(..)
|
||||||
, Occurrence(..)
|
, Occurrence(..)
|
||||||
|
, StringObject(..)
|
||||||
, Structure(..)
|
, Structure(..)
|
||||||
, XRefEntry(..)
|
, XRefEntry(..)
|
||||||
, XRefSection
|
, XRefSection
|
||||||
|
, array
|
||||||
, blank
|
, blank
|
||||||
, dictionary
|
, dictionary
|
||||||
, directObject
|
, directObject
|
||||||
|
@ -22,34 +23,43 @@ module PDF.Object (
|
||||||
, integer
|
, integer
|
||||||
, line
|
, line
|
||||||
, magicNumber
|
, magicNumber
|
||||||
|
, name
|
||||||
|
, number
|
||||||
|
, object
|
||||||
|
, outputBody
|
||||||
|
, regular
|
||||||
|
, stringObject
|
||||||
, structure
|
, structure
|
||||||
|
, toByteString
|
||||||
) where
|
) where
|
||||||
|
|
||||||
import Control.Applicative ((<|>))
|
import Control.Applicative ((<|>), many)
|
||||||
import Data.ByteString.Char8 (ByteString)
|
import Control.Monad.Reader (asks)
|
||||||
import qualified Data.ByteString.Char8 as BS (
|
import Data.Attoparsec.ByteString.Char8 (choice, count, option, sepBy)
|
||||||
concat, cons, pack, singleton, unpack
|
import Data.ByteString (ByteString)
|
||||||
|
import qualified Data.ByteString as BS (concat)
|
||||||
|
import qualified Data.ByteString.Char8 as Char8 (
|
||||||
|
cons, length, pack, singleton, snoc, unpack
|
||||||
)
|
)
|
||||||
import Data.Map (Map, (!), mapWithKey)
|
import Data.ByteString.Char8.Util (B16Int(..), b16ToBytes, unescape)
|
||||||
import qualified Data.Map as Map (
|
import Data.Id (Id(..), IdMap, Indexed)
|
||||||
delete, empty, fromList, lookup, minViewWithKey, toList, union
|
import qualified Data.Id as Id (
|
||||||
|
at, delete, empty, fromList, lookup, minViewWithKey, union
|
||||||
)
|
)
|
||||||
import qualified PDF.EOL as EOL (charset, parser)
|
import Data.Map (Map)
|
||||||
import qualified PDF.Output as Output (concat, line, string)
|
import qualified Data.Map as Map (fromList, toList)
|
||||||
|
import qualified Data.Set as Set (fromList, member)
|
||||||
|
import qualified PDF.EOL as EOL (Style(..), charset, parser)
|
||||||
|
import qualified PDF.Output as Output (line, string)
|
||||||
import PDF.Output (
|
import PDF.Output (
|
||||||
OBuilder, ObjectId(..), Offset(..), Output(..), Resource(..)
|
OBuilder, OContext(..), Offset(..), Output(..), Resource(..), byteString
|
||||||
, byteString, getObjectId, getOffset, getOffsets, join, newLine
|
, getOffset, join, newLine, saveOffset
|
||||||
, saveOffset
|
|
||||||
)
|
|
||||||
import PDF.Parser (
|
|
||||||
Parser, (<?>)
|
|
||||||
, char, choice, count, decNumber, hexNumber, many, octDigit, oneOf, option
|
|
||||||
, sepBy, string, takeAll, takeAll1
|
|
||||||
)
|
)
|
||||||
|
import PDF.Parser (MonadParser(..), Parser, (<?>), octDigit, oneOf)
|
||||||
import Text.Printf (printf)
|
import Text.Printf (printf)
|
||||||
|
|
||||||
line :: String -> Parser u ()
|
line :: MonadParser m => String -> m ()
|
||||||
line l = (string (BS.pack l) *> blank *> return ()) <?> printf "line «%s»" l
|
line l = (string (Char8.pack l) *> blank *> return ()) <?> printf "line «%s»" l
|
||||||
|
|
||||||
magicNumber :: ByteString
|
magicNumber :: ByteString
|
||||||
magicNumber = "%PDF-"
|
magicNumber = "%PDF-"
|
||||||
|
@ -60,8 +70,8 @@ eofMarker = "%%EOF"
|
||||||
whiteSpaceCharset :: String
|
whiteSpaceCharset :: String
|
||||||
whiteSpaceCharset = "\0\t\12 "
|
whiteSpaceCharset = "\0\t\12 "
|
||||||
|
|
||||||
blank :: Parser u ()
|
blank :: MonadParser m => m ()
|
||||||
blank = takeAll (`elem` (EOL.charset ++ whiteSpaceCharset)) *> return ()
|
blank = takeAll (`elem` (EOL.charset ++ whiteSpaceCharset)) *> pure ()
|
||||||
|
|
||||||
delimiterCharset :: String
|
delimiterCharset :: String
|
||||||
delimiterCharset = "()<>[]{}/%"
|
delimiterCharset = "()<>[]{}/%"
|
||||||
|
@ -69,26 +79,24 @@ delimiterCharset = "()<>[]{}/%"
|
||||||
regular :: Char -> Bool
|
regular :: Char -> Bool
|
||||||
regular = not . (`elem` (EOL.charset ++ whiteSpaceCharset ++ delimiterCharset))
|
regular = not . (`elem` (EOL.charset ++ whiteSpaceCharset ++ delimiterCharset))
|
||||||
|
|
||||||
integer :: (Read a, Num a) => Parser u a
|
integer :: MonadParser m => m Int
|
||||||
integer = read . BS.unpack <$> decNumber <* blank <?> "decimal integer"
|
integer = decNumber <* blank <?> "decimal integer"
|
||||||
|
|
||||||
-------------------------------------
|
-------------------------------------
|
||||||
-- OBJECTS
|
-- OBJECTS
|
||||||
-------------------------------------
|
-------------------------------------
|
||||||
|
|
||||||
type IndexedObjects = Map ObjectId Object
|
|
||||||
|
|
||||||
--
|
--
|
||||||
-- Boolean
|
-- Boolean
|
||||||
--
|
--
|
||||||
boolean :: Parser u Bool
|
boolean :: MonadParser m => m Bool
|
||||||
boolean =
|
boolean =
|
||||||
(string "true" *> return True) <|> (string "false" *> return False) <?> "boolean"
|
(string "true" *> return True) <|> (string "false" *> return False) <?> "boolean"
|
||||||
|
|
||||||
--
|
--
|
||||||
-- Number
|
-- Number
|
||||||
--
|
--
|
||||||
newtype Number = Number Double deriving Show
|
newtype Number = Number Double deriving (Eq, Show)
|
||||||
|
|
||||||
instance Output Number where
|
instance Output Number where
|
||||||
output (Number f) = Output.string $
|
output (Number f) = Output.string $
|
||||||
|
@ -96,38 +104,44 @@ instance Output Number where
|
||||||
(n, 0) -> printf "%d" (n :: Int)
|
(n, 0) -> printf "%d" (n :: Int)
|
||||||
_ -> printf "%f" f
|
_ -> printf "%f" f
|
||||||
|
|
||||||
number :: Parser u Number
|
number :: MonadParser m => m Number
|
||||||
number = Number . read . BS.unpack <$>
|
number = Number <$> (sign <*> value) <?> "number"
|
||||||
(mappend <$> sign <*> (integerPart <|> BS.cons '0' <$> floatPart))
|
|
||||||
<?> "number"
|
|
||||||
where
|
where
|
||||||
sign = string "-" <|> option "" (char '+' >> return "")
|
sign = (string "-" *> return negate) <|> option id (char '+' >> return id)
|
||||||
integerPart = mappend <$> decNumber <*> option "" floatPart
|
value = floatNumber <|> (char '.' *> afterPoint)
|
||||||
floatPart = BS.cons <$> char '.' <*> (option "0" $ decNumber)
|
afterPoint = read . ("0." ++) . Char8.unpack <$> takeAll (`Set.member` digits)
|
||||||
|
digits = Set.fromList ['0' .. '9']
|
||||||
|
|
||||||
--
|
--
|
||||||
-- StringObject
|
-- StringObject
|
||||||
--
|
--
|
||||||
data StringObject = Literal String | Hexadecimal String deriving Show
|
data StringObject = Literal ByteString | Hexadecimal B16Int deriving (Eq, Show)
|
||||||
|
|
||||||
instance Output StringObject where
|
instance Output StringObject where
|
||||||
output (Literal s) = Output.string (printf "(%s)" s)
|
output (Literal s) = Output.string (printf "(%s)" (Char8.unpack s))
|
||||||
output (Hexadecimal s) = Output.string (printf "<%s>" s)
|
output (Hexadecimal (B16Int n)) = Output.string (printf "<%s>" (Char8.unpack n))
|
||||||
|
|
||||||
stringObject :: Parser u StringObject
|
stringObject :: MonadParser m => m StringObject
|
||||||
stringObject =
|
stringObject =
|
||||||
Literal . BS.unpack <$> (char '(' *> (BS.concat <$> literalString) <* char ')')
|
Literal <$> (char '(' *> (BS.concat <$> literalString) <* char ')')
|
||||||
<|> Hexadecimal . BS.unpack <$> (char '<' *> hexNumber <* char '>')
|
<|> Hexadecimal . roundBytes <$> (char '<' *> hexNumber <* char '>')
|
||||||
<?> "string object (literal or hexadecimal)"
|
<?> "string object (literal or hexadecimal)"
|
||||||
where
|
where
|
||||||
literalString = many literalStringBlock
|
literalString = many literalStringBlock
|
||||||
literalStringBlock = takeAll1 normalChar <|> matchingParenthesis <|> escapedChar
|
literalStringBlock = takeAll1 normalChar <|> matchingParenthesis <|> escapedChar
|
||||||
normalChar = not . (`elem` ("\\()" :: String))
|
normalChar = not . (`elem` ("\\()" :: String))
|
||||||
matchingParenthesis =
|
matchingParenthesis =
|
||||||
mappend <$> (BS.cons <$> char '(' <*> literalStringBlock) <*> string ")"
|
mappend <$> (Char8.cons <$> char '(' <*> literalStringBlock) <*> string ")"
|
||||||
escapedChar =
|
escapedChar =
|
||||||
BS.cons <$> char '\\' <*> (BS.singleton <$> oneOf "nrtbf()\\" <|> octalCode)
|
Char8.cons <$> char '\\' <*> (Char8.singleton <$> oneOf "nrtbf()\\\n" <|> octalCode)
|
||||||
octalCode = choice $ (\n -> BS.pack <$> count n octDigit) <$> [1..3]
|
octalCode = choice $ (\n -> Char8.pack <$> count n octDigit) <$> [1..3]
|
||||||
|
roundBytes (B16Int bs)
|
||||||
|
| Char8.length bs `mod` 2 == 1 = B16Int (bs `Char8.snoc` '0')
|
||||||
|
| otherwise = B16Int bs
|
||||||
|
|
||||||
|
toByteString :: StringObject -> ByteString
|
||||||
|
toByteString (Hexadecimal h) = b16ToBytes h
|
||||||
|
toByteString (Literal s) = unescape s
|
||||||
|
|
||||||
--
|
--
|
||||||
-- Name
|
-- Name
|
||||||
|
@ -137,13 +151,13 @@ newtype Name = Name String deriving (Eq, Ord, Show)
|
||||||
instance Output Name where
|
instance Output Name where
|
||||||
output (Name n) = Output.string ('/':n)
|
output (Name n) = Output.string ('/':n)
|
||||||
|
|
||||||
name :: Parser u Name
|
name :: MonadParser m => m Name
|
||||||
name = Name . BS.unpack <$> (char '/' *> takeAll regular) <?> "name"
|
name = Name . Char8.unpack <$> (char '/' *> takeAll regular) <?> "name"
|
||||||
|
|
||||||
--
|
--
|
||||||
-- Array
|
-- Array
|
||||||
--
|
--
|
||||||
array :: Parser u [DirectObject]
|
array :: MonadParser m => m [DirectObject]
|
||||||
array =
|
array =
|
||||||
char '[' *> blank *> directObject `sepBy` blank <* blank <* char ']' <?> "array"
|
char '[' *> blank *> directObject `sepBy` blank <* blank <* char ']' <?> "array"
|
||||||
|
|
||||||
|
@ -153,14 +167,14 @@ array =
|
||||||
type Dictionary = Map Name DirectObject
|
type Dictionary = Map Name DirectObject
|
||||||
|
|
||||||
instance Output Dictionary where
|
instance Output Dictionary where
|
||||||
output dict =
|
output aDictionary =
|
||||||
"<<" `mappend` keyValues `mappend` ">>"
|
"<<" `mappend` keyValues `mappend` ">>"
|
||||||
where
|
where
|
||||||
keyValues = join " " $ outputKeyVal <$> Map.toList dict
|
keyValues = join " " $ outputKeyVal <$> Map.toList aDictionary
|
||||||
outputKeyVal :: (Name, DirectObject) -> OBuilder
|
outputKeyVal :: (Name, DirectObject) -> OBuilder
|
||||||
outputKeyVal (key, val) = Output.concat [output key, " ", output val]
|
outputKeyVal (key, val) = mconcat [output key, " ", output val]
|
||||||
|
|
||||||
dictionary :: Parser u Dictionary
|
dictionary :: MonadParser m => m Dictionary
|
||||||
dictionary =
|
dictionary =
|
||||||
string "<<" *> blank *> keyValPairs <* string ">>" <?> "dictionary"
|
string "<<" *> blank *> keyValPairs <* string ">>" <?> "dictionary"
|
||||||
where
|
where
|
||||||
|
@ -170,33 +184,33 @@ dictionary =
|
||||||
--
|
--
|
||||||
-- Null
|
-- Null
|
||||||
--
|
--
|
||||||
nullObject :: Parser u ()
|
nullObject :: MonadParser m => m ()
|
||||||
nullObject = string "null" *> return () <?> "null object"
|
nullObject = string "null" *> return () <?> "null object"
|
||||||
|
|
||||||
--
|
--
|
||||||
-- Reference
|
-- Reference
|
||||||
--
|
--
|
||||||
data IndirectObjCoordinates = IndirectObjCoordinates {
|
data IndirectObjCoordinates = IndirectObjCoordinates {
|
||||||
objectId :: ObjectId
|
objectId :: {-# UNPACK #-} !(Id Object)
|
||||||
, versionNumber :: Int
|
, versionNumber :: {-# UNPACK #-} !Int
|
||||||
} deriving Show
|
} deriving Show
|
||||||
|
|
||||||
reference :: Parser u IndirectObjCoordinates
|
reference :: MonadParser m => m IndirectObjCoordinates
|
||||||
reference = IndirectObjCoordinates
|
reference = IndirectObjCoordinates
|
||||||
<$> (fmap ObjectId integer) <*> integer <* char 'R' <?> "reference to an object"
|
<$> (fmap Id integer) <*> integer <* char 'R' <?> "reference to an object"
|
||||||
|
|
||||||
--
|
--
|
||||||
-- DirectObject
|
-- DirectObject
|
||||||
--
|
--
|
||||||
data DirectObject =
|
data DirectObject =
|
||||||
Boolean Bool
|
Boolean !Bool
|
||||||
| NumberObject Number
|
| NumberObject !Number
|
||||||
| StringObject StringObject
|
| StringObject !StringObject
|
||||||
| NameObject Name
|
| NameObject !Name
|
||||||
| Array [DirectObject]
|
| Array ![DirectObject]
|
||||||
| Dictionary Dictionary
|
| Dictionary !Dictionary
|
||||||
| Null
|
| Null
|
||||||
| Reference IndirectObjCoordinates
|
| Reference !IndirectObjCoordinates
|
||||||
deriving Show
|
deriving Show
|
||||||
|
|
||||||
instance Output DirectObject where
|
instance Output DirectObject where
|
||||||
|
@ -204,23 +218,25 @@ instance Output DirectObject where
|
||||||
output (NumberObject n) = output n
|
output (NumberObject n) = output n
|
||||||
output (StringObject s) = output s
|
output (StringObject s) = output s
|
||||||
output (NameObject n) = output n
|
output (NameObject n) = output n
|
||||||
output (Array a) = Output.concat ["[", join " " a, "]"]
|
output (Array a) = mconcat ["[", join " " a, "]"]
|
||||||
output (Dictionary d) = output d
|
output (Dictionary d) = output d
|
||||||
output (Null) = "null"
|
output (Null) = "null"
|
||||||
output (Reference (IndirectObjCoordinates {objectId, versionNumber})) =
|
output (Reference (IndirectObjCoordinates {objectId, versionNumber})) =
|
||||||
Output.string (printf "%d %d R" (getObjectId objectId) versionNumber)
|
Output.string (printf "%d %d R" (getId objectId) versionNumber)
|
||||||
|
|
||||||
directObject :: Parser u DirectObject
|
directObject :: MonadParser m => m DirectObject
|
||||||
directObject =
|
directObject = (peek >>= dispatch) <?> "direct object"
|
||||||
Boolean <$> boolean
|
where
|
||||||
<|> Reference <$> reference {- defined before Number because Number is a prefix of it -}
|
dispatch 't' = Boolean <$> boolean
|
||||||
|
dispatch 'f' = Boolean <$> boolean
|
||||||
|
dispatch '(' = StringObject <$> stringObject
|
||||||
|
dispatch '<' = StringObject <$> stringObject <|> Dictionary <$> dictionary
|
||||||
|
dispatch '/' = NameObject <$> name
|
||||||
|
dispatch '[' = Array <$> array
|
||||||
|
dispatch 'n' = nullObject *> return Null
|
||||||
|
dispatch _ =
|
||||||
|
Reference <$> reference {- defined before Number because Number is a prefix of it -}
|
||||||
<|> NumberObject <$> number
|
<|> NumberObject <$> number
|
||||||
<|> StringObject <$> stringObject
|
|
||||||
<|> NameObject <$> name
|
|
||||||
<|> Array <$> array
|
|
||||||
<|> Dictionary <$> dictionary
|
|
||||||
<|> const Null <$> nullObject
|
|
||||||
<?> "direct object"
|
|
||||||
|
|
||||||
--
|
--
|
||||||
-- Object
|
-- Object
|
||||||
|
@ -235,27 +251,34 @@ data Object =
|
||||||
|
|
||||||
instance Output Object where
|
instance Output Object where
|
||||||
output (Direct d) = output d
|
output (Direct d) = output d
|
||||||
output (Stream {header, streamContent}) = Output.concat [
|
output (Stream {header, streamContent}) = mconcat [
|
||||||
output header, newLine
|
output header, newLine
|
||||||
, Output.line "stream"
|
, Output.line "stream"
|
||||||
, byteString streamContent
|
, byteString streamContent
|
||||||
, "endstream"
|
, "endstream"
|
||||||
]
|
]
|
||||||
|
|
||||||
|
object :: Int -> Id Object
|
||||||
|
object = Id
|
||||||
|
|
||||||
--
|
--
|
||||||
-- Occurrence
|
-- Occurrence
|
||||||
--
|
--
|
||||||
data Occurrence = Comment String | Indirect IndirectObjCoordinates deriving Show
|
data Occurrence = Comment String | Indirect IndirectObjCoordinates deriving Show
|
||||||
|
|
||||||
outputOccurrence :: IndexedObjects -> Occurrence -> OBuilder
|
outputOccurrence :: (Indexed Object) -> Occurrence -> OBuilder
|
||||||
outputOccurrence _ (Comment c) = Output.line c
|
outputOccurrence _ (Comment c) = Output.line c
|
||||||
outputOccurrence objects (Indirect (IndirectObjCoordinates {objectId, versionNumber})) =
|
outputOccurrence objects (Indirect (IndirectObjCoordinates {objectId, versionNumber})) =
|
||||||
saveOffset (Object objectId) >> Output.concat [
|
saveOffset (ObjectId $ getId objectId) >> mconcat [
|
||||||
Output.line (printf "%d %d obj" (getObjectId objectId) versionNumber)
|
Output.line (printf "%d %d obj" (getId objectId) versionNumber)
|
||||||
, output (objects ! objectId), newLine
|
, output (objects `Id.at` objectId), newLine
|
||||||
, Output.line "endobj"
|
, Output.line "endobj"
|
||||||
]
|
]
|
||||||
|
|
||||||
|
outputBody :: ([Occurrence], (Indexed Object)) -> OBuilder
|
||||||
|
outputBody (occurrences, objects) =
|
||||||
|
output (outputOccurrence objects <$> occurrences) <* saveOffset StartXRef
|
||||||
|
|
||||||
-------------------------------------
|
-------------------------------------
|
||||||
-- XREF TABLE
|
-- XREF TABLE
|
||||||
-------------------------------------
|
-------------------------------------
|
||||||
|
@ -267,15 +290,20 @@ data XRefEntry = InUse {
|
||||||
offset :: Offset
|
offset :: Offset
|
||||||
, generation :: Int
|
, generation :: Int
|
||||||
} | Free {
|
} | Free {
|
||||||
nextFree :: ObjectId
|
nextFree :: (Id Object)
|
||||||
, generation :: Int
|
, generation :: Int
|
||||||
} deriving Show
|
} deriving Show
|
||||||
|
|
||||||
instance Output XRefEntry where
|
instance Output XRefEntry where
|
||||||
output (InUse {offset, generation}) =
|
output xRefEntry = Output.string (build xRefEntry) `mappend` endXRefEntryLine
|
||||||
Output.line (printf "%010d %05d n " (getOffset offset) generation)
|
where
|
||||||
output (Free {nextFree, generation}) =
|
build (InUse {offset, generation}) =
|
||||||
Output.line (printf "%010d %05d f " (getObjectId nextFree) generation)
|
printf "%010d %05d n" (getOffset offset) generation
|
||||||
|
build (Free {nextFree, generation}) =
|
||||||
|
printf "%010d %05d f" (getId nextFree) generation
|
||||||
|
endXRefEntryLine = OContext (asks padEOLToTwoBytes) >>= Output.line
|
||||||
|
padEOLToTwoBytes EOL.CRLF = ("" :: String)
|
||||||
|
padEOLToTwoBytes _ = " "
|
||||||
|
|
||||||
entry :: Parser u XRefEntry
|
entry :: Parser u XRefEntry
|
||||||
entry = do
|
entry = do
|
||||||
|
@ -287,52 +315,51 @@ entry = do
|
||||||
char 'n' *> return (InUse {offset = Offset big, generation})
|
char 'n' *> return (InUse {offset = Offset big, generation})
|
||||||
free :: Int -> Int -> Parser u XRefEntry
|
free :: Int -> Int -> Parser u XRefEntry
|
||||||
free big generation =
|
free big generation =
|
||||||
char 'f' *> return (Free {nextFree = ObjectId big, generation})
|
char 'f' *> return (Free {nextFree = Id big, generation})
|
||||||
|
|
||||||
--
|
--
|
||||||
-- XRefSubSection
|
-- XRefSubSection
|
||||||
--
|
--
|
||||||
data XRefSubSection = XRefSubSection {
|
data XRefSubSection = XRefSubSection {
|
||||||
firstObjectId :: ObjectId
|
firstObjectId :: (Id Object)
|
||||||
, entries :: [XRefEntry]
|
, entries :: [XRefEntry]
|
||||||
} deriving Show
|
} deriving Show
|
||||||
|
|
||||||
instance Output XRefSubSection where
|
instance Output XRefSubSection where
|
||||||
output (XRefSubSection {firstObjectId, entries}) =
|
output (XRefSubSection {firstObjectId, entries}) =
|
||||||
Output.line (printf "%d %d" (getObjectId firstObjectId) (length entries))
|
Output.line (printf "%d %d" (getId firstObjectId) (length entries))
|
||||||
`mappend` output entries
|
`mappend` output entries
|
||||||
|
|
||||||
xRefSubSection :: Parser u XRefSubSection
|
xRefSubSection :: Parser u XRefSubSection
|
||||||
xRefSubSection = do
|
xRefSubSection = do
|
||||||
(firstId, entriesNumber) <- (,) <$> integer <*> integer <?> "XRef subsection"
|
(firstId, entriesNumber) <- (,) <$> integer <*> integer <?> "XRef subsection"
|
||||||
entries <- count entriesNumber entry
|
entries <- count entriesNumber entry
|
||||||
return $ XRefSubSection {firstObjectId = ObjectId firstId, entries}
|
return $ XRefSubSection {firstObjectId = Id firstId, entries}
|
||||||
|
|
||||||
type XRefSection = Map ObjectId XRefEntry
|
type XRefSection = IdMap Object XRefEntry
|
||||||
|
|
||||||
instance Output XRefSection where
|
instance Output XRefSection where
|
||||||
output = output . sections
|
output = output . sections
|
||||||
where
|
where
|
||||||
sections tmp =
|
sections tmp =
|
||||||
case Map.minViewWithKey tmp of
|
case Id.minViewWithKey tmp of
|
||||||
Nothing -> []
|
Nothing -> []
|
||||||
Just ((objectId@(ObjectId value), firstEntry), rest) ->
|
Just ((objectId@(Id value), firstEntry), rest) ->
|
||||||
let (subSection, sndRest) = section objectId [firstEntry] (value + 1) rest in
|
let (subSection, sndRest) = section objectId [firstEntry] (value + 1) rest in
|
||||||
subSection : sections sndRest
|
subSection : sections sndRest
|
||||||
section firstObjectId stack nextValue tmp =
|
section firstObjectId stack nextValue tmp =
|
||||||
let nextId = ObjectId nextValue in
|
let nextId = (Id nextValue :: Id Object) in
|
||||||
case Map.lookup nextId tmp of
|
case Id.lookup nextId tmp of
|
||||||
Nothing -> (XRefSubSection {firstObjectId, entries = reverse stack}, tmp)
|
Nothing -> (XRefSubSection {firstObjectId, entries = reverse stack}, tmp)
|
||||||
Just nextEntry ->
|
Just nextEntry ->
|
||||||
section firstObjectId (nextEntry:stack) (nextValue + 1) (Map.delete nextId tmp)
|
section firstObjectId (nextEntry:stack) (nextValue + 1) (Id.delete nextId tmp)
|
||||||
|
|
||||||
|
|
||||||
xRefSection :: Parser u XRefSection
|
xRefSection :: Parser u XRefSection
|
||||||
xRefSection = foldr addSubsection Map.empty <$>
|
xRefSection = foldr addSubsection Id.empty <$>
|
||||||
(line "xref" *> xRefSubSection `sepBy` many EOL.parser)
|
(line "xref" *> xRefSubSection `sepBy` many EOL.parser)
|
||||||
where
|
where
|
||||||
addSubsection (XRefSubSection {firstObjectId, entries}) =
|
addSubsection (XRefSubSection {firstObjectId, entries}) =
|
||||||
Map.union . Map.fromList $ zip ([firstObjectId..]) entries
|
Id.union . Id.fromList $ zip ([firstObjectId..]) entries
|
||||||
|
|
||||||
--
|
--
|
||||||
-- Structure
|
-- Structure
|
||||||
|
@ -353,45 +380,10 @@ structure =
|
||||||
<$> xRefSection
|
<$> xRefSection
|
||||||
<*> (string "trailer" *> blank *> dictionary <* EOL.parser)
|
<*> (string "trailer" *> blank *> dictionary <* EOL.parser)
|
||||||
|
|
||||||
updateXrefs :: XRefSection -> Map Resource Offset -> (XRefSection, Offset)
|
|
||||||
updateXrefs xRef offsets = (mapWithKey updateEntry xRef, offsets ! StartXRef)
|
|
||||||
where
|
|
||||||
updateEntry objectId e@(InUse {}) = e {offset = offsets ! (Object objectId)}
|
|
||||||
updateEntry _ e = e
|
|
||||||
|
|
||||||
--
|
--
|
||||||
-- Flow
|
-- Flow
|
||||||
--
|
--
|
||||||
data Flow = Flow {
|
data Flow = Flow {
|
||||||
occurrencesStack :: [Occurrence]
|
occurrencesStack :: [Occurrence]
|
||||||
, tmpObjects :: IndexedObjects
|
, tmpObjects :: (Indexed Object)
|
||||||
} deriving Show
|
} deriving Show
|
||||||
|
|
||||||
--
|
|
||||||
-- Content
|
|
||||||
--
|
|
||||||
data Content = Content {
|
|
||||||
occurrences :: [Occurrence]
|
|
||||||
, objects :: IndexedObjects
|
|
||||||
, docStructure :: Structure
|
|
||||||
} deriving Show
|
|
||||||
|
|
||||||
outputBody :: ([Occurrence], IndexedObjects) -> OBuilder
|
|
||||||
outputBody (occurrences, objects) =
|
|
||||||
output (outputOccurrence objects <$> occurrences) <* saveOffset StartXRef
|
|
||||||
|
|
||||||
instance Output Content where
|
|
||||||
output (Content {occurrences, objects, docStructure}) =
|
|
||||||
fmap (updateXrefs xRef) <$> getOffsets (outputBody (occurrences, objects))
|
|
||||||
>>= \(body, (xref, startXRef)) -> Output.concat [
|
|
||||||
body
|
|
||||||
, Output.line "xref"
|
|
||||||
, output xref
|
|
||||||
, Output.line "trailer"
|
|
||||||
, output trailer, newLine
|
|
||||||
, Output.line "startxref"
|
|
||||||
, Output.line (printf "%d" (getOffset startXRef))
|
|
||||||
, byteString eofMarker
|
|
||||||
]
|
|
||||||
where
|
|
||||||
Structure {xRef, trailer} = docStructure
|
|
||||||
|
|
180
src/PDF/Object/Navigation.hs
Normal file
180
src/PDF/Object/Navigation.hs
Normal file
|
@ -0,0 +1,180 @@
|
||||||
|
{-# LANGUAGE NamedFieldPuns #-}
|
||||||
|
{-# LANGUAGE ConstraintKinds #-}
|
||||||
|
{-# LANGUAGE FlexibleContexts #-}
|
||||||
|
{-# LANGUAGE MultiParamTypeClasses #-}
|
||||||
|
{-# LANGUAGE FlexibleInstances #-}
|
||||||
|
{-# LANGUAGE BangPatterns #-}
|
||||||
|
{-# LANGUAGE DeriveFunctor #-}
|
||||||
|
module PDF.Object.Navigation (
|
||||||
|
Nav(..)
|
||||||
|
, PPath(..)
|
||||||
|
, ROLayer
|
||||||
|
, RWLayer
|
||||||
|
, StreamContent(..)
|
||||||
|
, (./)
|
||||||
|
, (//)
|
||||||
|
, (>./)
|
||||||
|
, (>//)
|
||||||
|
, castObject
|
||||||
|
, catalog
|
||||||
|
, getDictionary
|
||||||
|
, getKey
|
||||||
|
, objectById
|
||||||
|
, save
|
||||||
|
) where
|
||||||
|
|
||||||
|
import Codec.Compression.Zlib (compress, decompress)
|
||||||
|
import Control.Monad.Except (MonadError(..))
|
||||||
|
import Control.Monad.Reader (MonadReader(..))
|
||||||
|
import Control.Monad.State (MonadState)
|
||||||
|
import Data.ByteString (ByteString)
|
||||||
|
import qualified Data.ByteString as BS (length)
|
||||||
|
import qualified Data.ByteString.Lazy as Lazy (ByteString, fromStrict, toStrict)
|
||||||
|
import Data.Id (Id)
|
||||||
|
import qualified Data.Id as Id (at)
|
||||||
|
import qualified Data.Map as Map (adjust, insert, lookup)
|
||||||
|
import PDF.Box (Box(..), at, edit{-, runRO-})
|
||||||
|
import PDF.Layer (Layer(..))
|
||||||
|
import PDF.Object (
|
||||||
|
Dictionary, DirectObject(..), IndirectObjCoordinates(..)
|
||||||
|
, Name(..), Number(..), Object(..), Structure(..)
|
||||||
|
)
|
||||||
|
import Prelude hiding (fail)
|
||||||
|
import Text.Printf (printf)
|
||||||
|
|
||||||
|
newtype PPath = PPath [Component]
|
||||||
|
data DPath = DPath {
|
||||||
|
root :: Id Object
|
||||||
|
, offset :: [Component]
|
||||||
|
} deriving Show
|
||||||
|
|
||||||
|
push :: Component -> DPath -> DPath
|
||||||
|
push component dPath = dPath {offset = (offset dPath) ++ [component]}
|
||||||
|
|
||||||
|
data Nav a = Nav {
|
||||||
|
dPath :: DPath
|
||||||
|
, value :: a
|
||||||
|
} deriving (Functor)
|
||||||
|
|
||||||
|
instance Show a => Show (Nav a) where
|
||||||
|
show (Nav {dPath, value}) = "Nav {dPath = " ++ show dPath ++ ", value = " ++ show value ++ "}"
|
||||||
|
|
||||||
|
type ROLayer m = (MonadReader Layer m, MonadError String m)
|
||||||
|
type RWLayer m = (MonadState Layer m, MonadError String m)
|
||||||
|
type Component = String
|
||||||
|
|
||||||
|
getDictionary :: ROLayer m => Nav Object -> m (Nav Dictionary)
|
||||||
|
getDictionary (Nav {dPath, value}) =
|
||||||
|
case value of
|
||||||
|
(Direct (Dictionary aDict)) -> return $ Nav {dPath, value = aDict}
|
||||||
|
(Direct (Reference ref)) -> objectById (objectId ref) >>= getDictionary
|
||||||
|
(Stream {header}) -> return $ Nav {dPath, value = header}
|
||||||
|
obj -> expected "dictionary : " obj
|
||||||
|
|
||||||
|
expected :: (MonadError String m, Show a) => String -> a -> m b
|
||||||
|
expected name = throwError . printf "Not a %s: %s" name . show
|
||||||
|
|
||||||
|
getKey :: ROLayer m => String -> Nav Object -> m (Nav DirectObject)
|
||||||
|
getKey key navObject = getDictionary navObject >>= f
|
||||||
|
where
|
||||||
|
errorMessage =
|
||||||
|
printf "Key %s not found in object %s" key (show navObject)
|
||||||
|
f (Nav {dPath, value}) =
|
||||||
|
case Map.lookup (Name key) value of
|
||||||
|
Nothing -> throwError errorMessage
|
||||||
|
Just dObj -> return $ Nav {dPath = push key dPath, value = dObj}
|
||||||
|
|
||||||
|
objectById :: ROLayer m => (Id Object) -> m (Nav Object)
|
||||||
|
objectById objectId = do
|
||||||
|
layer <- ask
|
||||||
|
return $ Nav {
|
||||||
|
dPath = DPath {root = objectId, offset = []}
|
||||||
|
, value = objects layer `Id.at` objectId
|
||||||
|
}
|
||||||
|
|
||||||
|
castObject :: ROLayer m => Nav DirectObject -> m (Nav Object)
|
||||||
|
castObject (Nav {value = !(Reference (IndirectObjCoordinates {objectId}))}) =
|
||||||
|
objectById objectId
|
||||||
|
castObject (Nav {dPath, value}) = return $ Nav {dPath, value = Direct value}
|
||||||
|
|
||||||
|
(./) :: ROLayer m => m (Nav Object) -> Component -> m (Nav Object)
|
||||||
|
(./) navObject key = (navObject >>= getKey key >>= castObject)
|
||||||
|
|
||||||
|
(//) :: ROLayer m => m (Nav Object) -> PPath -> m (Nav Object)
|
||||||
|
(//) navObject (PPath []) = navObject
|
||||||
|
(//) navObject (PPath (key:keys)) = navObject ./ key // (PPath keys)
|
||||||
|
|
||||||
|
(>./) :: ROLayer m => Nav Object -> Component -> m (Nav Object)
|
||||||
|
(>./) navObject = (return navObject ./)
|
||||||
|
|
||||||
|
(>//) :: ROLayer m => Nav Object -> PPath -> m (Nav Object)
|
||||||
|
(>//) navObject = (return navObject //)
|
||||||
|
|
||||||
|
catalog :: ROLayer m => m (Nav Object)
|
||||||
|
catalog = do
|
||||||
|
value <- Direct . Dictionary . trailer . docStructure <$> ask
|
||||||
|
return $ Nav {dPath = undefined, value}
|
||||||
|
|
||||||
|
setAt :: [Component] -> DirectObject -> Dictionary -> Dictionary
|
||||||
|
setAt [] _ dict = dict
|
||||||
|
setAt [component] directObject dict =
|
||||||
|
Map.insert (Name component) directObject dict
|
||||||
|
setAt (component:components) directObject dict =
|
||||||
|
Map.adjust setDirObj (Name component) dict
|
||||||
|
where
|
||||||
|
setDirObj (Dictionary subDict) =
|
||||||
|
Dictionary $ setAt components directObject subDict
|
||||||
|
setDirObj x = x
|
||||||
|
|
||||||
|
save :: RWLayer m => Nav Object -> m ()
|
||||||
|
save nav@(Nav {dPath, value = Direct dObj}) =
|
||||||
|
edit .at (root dPath) $ return . setObj
|
||||||
|
where
|
||||||
|
setObj obj@(Stream {header}) =
|
||||||
|
obj {header = setAt (offset dPath) dObj header}
|
||||||
|
setObj (Direct (Dictionary dict)) =
|
||||||
|
Direct . Dictionary $ setAt (offset dPath) dObj dict
|
||||||
|
setObj _ = value nav
|
||||||
|
save (Nav {dPath = DPath {root, offset = []}, value}) = edit $ w root value
|
||||||
|
save _ = throwError "Streams can't be properties of PDF objects"
|
||||||
|
|
||||||
|
data StreamContent = Clear | Raw
|
||||||
|
|
||||||
|
onLazy :: (Lazy.ByteString -> Lazy.ByteString) -> ByteString -> ByteString
|
||||||
|
onLazy f = Lazy.toStrict . f . Lazy.fromStrict
|
||||||
|
|
||||||
|
contains :: String -> DirectObject -> Bool
|
||||||
|
contains needle !(NameObject (Name n)) = needle == n
|
||||||
|
contains needle !(Array directObjects) = oneOf directObjects (contains needle)
|
||||||
|
where
|
||||||
|
oneOf [] _ = False
|
||||||
|
oneOf (x:xs) p = p x || oneOf xs p
|
||||||
|
contains _ _ = False
|
||||||
|
|
||||||
|
instance MonadError String m => Box m StreamContent (Nav Object) ByteString where
|
||||||
|
r sc = r sc . value
|
||||||
|
w sc newStreamContent nav = setValue <$> w sc newStreamContent (value nav)
|
||||||
|
where
|
||||||
|
setValue value = nav {value}
|
||||||
|
|
||||||
|
instance MonadError String m => Box m StreamContent Object ByteString where
|
||||||
|
r Raw (Stream {streamContent}) = return streamContent
|
||||||
|
r Clear (Stream {header, streamContent}) = return $
|
||||||
|
case Map.lookup (Name "Filter") header of
|
||||||
|
Just directObject
|
||||||
|
| contains "FlateDecode" directObject -> onLazy decompress streamContent
|
||||||
|
_ -> streamContent
|
||||||
|
r _ obj = expected "stream" obj
|
||||||
|
|
||||||
|
w Raw streamContent obj@(Stream {}) = return $ obj {streamContent}
|
||||||
|
w Clear newStreamContent (Stream {header}) =
|
||||||
|
let streamContent = getStreamContent (Map.lookup (Name "Filter") header) in
|
||||||
|
return $ Stream {header = fixLength streamContent, streamContent}
|
||||||
|
where
|
||||||
|
getStreamContent (Just directObject)
|
||||||
|
| contains "FlateDecode" directObject = onLazy compress newStreamContent
|
||||||
|
getStreamContent _ = newStreamContent
|
||||||
|
fixLength sc =
|
||||||
|
let newLength = NumberObject . Number . fromIntegral $ BS.length sc in
|
||||||
|
Map.insert (Name "Length") newLength header
|
||||||
|
w _ _ obj = expected "stream" obj
|
|
@ -5,14 +5,12 @@
|
||||||
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
|
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
|
||||||
module PDF.Output (
|
module PDF.Output (
|
||||||
OBuilder
|
OBuilder
|
||||||
, ObjectId(..)
|
|
||||||
, OContext(..)
|
, OContext(..)
|
||||||
, Offset(..)
|
, Offset(..)
|
||||||
, Output(..)
|
, Output(..)
|
||||||
, Resource(..)
|
, Resource(..)
|
||||||
, byteString
|
, byteString
|
||||||
, char
|
, char
|
||||||
, concat
|
|
||||||
, getOffsets
|
, getOffsets
|
||||||
, join
|
, join
|
||||||
, line
|
, line
|
||||||
|
@ -32,12 +30,10 @@ import qualified Data.Map as Map (singleton)
|
||||||
import Data.String (IsString(..))
|
import Data.String (IsString(..))
|
||||||
import Control.Monad.RWS (RWS, runRWS, ask, get, listen, modify, tell)
|
import Control.Monad.RWS (RWS, runRWS, ask, get, listen, modify, tell)
|
||||||
import qualified PDF.EOL as EOL (Style(..))
|
import qualified PDF.EOL as EOL (Style(..))
|
||||||
import Prelude hiding (concat)
|
|
||||||
|
|
||||||
newtype ObjectId = ObjectId {getObjectId :: Int} deriving (Enum, Eq, Ord, Show)
|
|
||||||
newtype Offset = Offset {getOffset :: Int} deriving (Show)
|
newtype Offset = Offset {getOffset :: Int} deriving (Show)
|
||||||
|
|
||||||
data Resource = StartXRef | Object ObjectId deriving (Eq, Ord)
|
data Resource = StartXRef | ObjectId Int deriving (Eq, Ord)
|
||||||
|
|
||||||
newtype OContext a = OContext (RWS EOL.Style (Map Resource Offset) Offset a)
|
newtype OContext a = OContext (RWS EOL.Style (Map Resource Offset) Offset a)
|
||||||
type OBuilder = OContext Builder
|
type OBuilder = OContext Builder
|
||||||
|
@ -59,16 +55,12 @@ saveOffset resource = OContext $
|
||||||
lift :: (a -> Builder) -> a -> OBuilder
|
lift :: (a -> Builder) -> a -> OBuilder
|
||||||
lift f a = return (f a)
|
lift f a = return (f a)
|
||||||
|
|
||||||
getOffsets :: OBuilder -> OContext (OBuilder, Map Resource Offset)
|
getOffsets :: OBuilder -> OContext (Builder, Map Resource Offset)
|
||||||
getOffsets (OContext builder) =
|
getOffsets (OContext builder) = OContext $ listen builder
|
||||||
OContext (listen builder >>= \(a, w) -> return (return a, w))
|
|
||||||
|
|
||||||
append :: OBuilder -> OBuilder -> OBuilder
|
append :: OBuilder -> OBuilder -> OBuilder
|
||||||
append (OContext a) (OContext b) = OContext (mappend <$> a <*> b)
|
append (OContext a) (OContext b) = OContext (mappend <$> a <*> b)
|
||||||
|
|
||||||
concat :: [OBuilder] -> OBuilder
|
|
||||||
concat = foldl mappend mempty
|
|
||||||
|
|
||||||
#if MIN_VERSION_base(4,11,0)
|
#if MIN_VERSION_base(4,11,0)
|
||||||
instance Semigroup OBuilder where
|
instance Semigroup OBuilder where
|
||||||
(<>) = append
|
(<>) = append
|
||||||
|
@ -94,7 +86,7 @@ instance Output Bool where
|
||||||
output True = string "true"
|
output True = string "true"
|
||||||
|
|
||||||
instance Output a => Output [a] where
|
instance Output a => Output [a] where
|
||||||
output = concat . fmap output
|
output = mconcat . fmap output
|
||||||
|
|
||||||
join :: Output a => String -> [a] -> OBuilder
|
join :: Output a => String -> [a] -> OBuilder
|
||||||
join _ [] = mempty
|
join _ [] = mempty
|
||||||
|
@ -116,7 +108,7 @@ char :: Char -> OBuilder
|
||||||
char c = lift char8 c <* offset (+1)
|
char c = lift char8 c <* offset (+1)
|
||||||
|
|
||||||
string :: String -> OBuilder
|
string :: String -> OBuilder
|
||||||
string s = lift string8 s <* offset (+ toEnum (length s))
|
string s = lift string8 s <* offset (+ length s)
|
||||||
|
|
||||||
line :: String -> OBuilder
|
line :: String -> OBuilder
|
||||||
line l = string l `mappend` newLine
|
line l = string l `mappend` newLine
|
||||||
|
|
181
src/PDF/Pages.hs
Executable file
181
src/PDF/Pages.hs
Executable file
|
@ -0,0 +1,181 @@
|
||||||
|
{-# LANGUAGE NamedFieldPuns #-}
|
||||||
|
{-# LANGUAGE FlexibleInstances #-}
|
||||||
|
{-# LANGUAGE MultiParamTypeClasses #-}
|
||||||
|
{-# LANGUAGE ConstraintKinds #-}
|
||||||
|
{-# LANGUAGE FlexibleContexts #-}
|
||||||
|
module PDF.Pages (
|
||||||
|
Contents(..)
|
||||||
|
, FontCache
|
||||||
|
, Page(..)
|
||||||
|
, PageNumber(..)
|
||||||
|
, Pages(..)
|
||||||
|
, withFonts
|
||||||
|
, withResources
|
||||||
|
) where
|
||||||
|
|
||||||
|
import Control.Applicative (Alternative)
|
||||||
|
import Control.Applicative ((<|>))
|
||||||
|
import Control.Monad (foldM)
|
||||||
|
import Control.Monad.Except (MonadError(..))
|
||||||
|
import Control.Monad.Reader (ReaderT, runReaderT)
|
||||||
|
import Control.Monad.State (StateT(..), evalStateT, execStateT, gets, modify)
|
||||||
|
import Control.Monad.Trans (lift)
|
||||||
|
import Data.ByteString (ByteString)
|
||||||
|
import Data.ByteString.Lazy (toStrict)
|
||||||
|
import Data.Id (Id(..), IdMap)
|
||||||
|
import qualified Data.Id as Id (empty, insert, lookup)
|
||||||
|
import Data.Map (Map)
|
||||||
|
import qualified Data.Map as Map (empty, elems, fromList, insert, toList)
|
||||||
|
import Data.Maybe (listToMaybe)
|
||||||
|
import Data.OrderedMap (OrderedMap, build, mapi)
|
||||||
|
import PDF.Box (Box(..), at, edit, runRO)
|
||||||
|
import qualified PDF.CMap as CMap (parse)
|
||||||
|
import PDF.Content (Content(..))
|
||||||
|
import qualified PDF.Content as Content (parse)
|
||||||
|
import PDF.Encoding (encoding)
|
||||||
|
import PDF.EOL (Style(..))
|
||||||
|
import PDF.Font (Font, FontSet)
|
||||||
|
import PDF.Layer (Layer(..), LayerReader)
|
||||||
|
import PDF.Object (
|
||||||
|
Dictionary, DirectObject(..), IndirectObjCoordinates(..)
|
||||||
|
, Name(..), Number(..), Object(..)
|
||||||
|
,)
|
||||||
|
import PDF.Object.Navigation (
|
||||||
|
Nav(..), PPath(..), ROLayer, RWLayer, StreamContent(..), (//), (>./)
|
||||||
|
, (>//), catalog, getDictionary, getKey, objectById, save
|
||||||
|
)
|
||||||
|
import PDF.Output (render)
|
||||||
|
import Text.Printf (printf)
|
||||||
|
|
||||||
|
type CachedFonts = IdMap Object Font
|
||||||
|
type FontCache m = StateT CachedFonts m
|
||||||
|
data Page = Page {
|
||||||
|
byteContents :: OrderedMap (Id Object) ByteString
|
||||||
|
, resources :: Dictionary
|
||||||
|
, source :: (Id Object)
|
||||||
|
}
|
||||||
|
|
||||||
|
loadByteContents :: ROLayer m => DirectObject -> m (OrderedMap (Id Object) ByteString)
|
||||||
|
loadByteContents directObject = do
|
||||||
|
objs <- sequence . build objectById $ objectIds directObject
|
||||||
|
mapM (r Clear) objs
|
||||||
|
where
|
||||||
|
objectIds (Array l) = l >>= getReference
|
||||||
|
objectIds dirObj = getReference dirObj
|
||||||
|
|
||||||
|
getFontDictionary :: (Alternative m, ROLayer m) => Nav Object -> m Dictionary
|
||||||
|
getFontDictionary pageObj =
|
||||||
|
(pageObj >// PPath ["Resources", "Font"] >>= fmap value . getDictionary)
|
||||||
|
<|> return Map.empty
|
||||||
|
|
||||||
|
cache :: ROLayer m => (Id Object -> FontCache m Font) -> Id Object -> FontCache m Font
|
||||||
|
cache loader objectId =
|
||||||
|
gets (Id.lookup objectId) >>= maybe load return
|
||||||
|
where
|
||||||
|
load = do
|
||||||
|
value <- loader objectId
|
||||||
|
modify $ Id.insert objectId value
|
||||||
|
return value
|
||||||
|
|
||||||
|
loadFont :: (Alternative m, ROLayer m) => Id Object -> FontCache m Font
|
||||||
|
loadFont objectId = lift $ objectById objectId >>= tryMappings
|
||||||
|
where
|
||||||
|
tryMappings object =
|
||||||
|
(object >./ "ToUnicode" >>= r Clear >>= CMap.parse)
|
||||||
|
<|> (object >./ "Encoding" >>= loadEncoding . value)
|
||||||
|
<|> (throwError $ unknownFormat (show objectId) (show object))
|
||||||
|
unknownFormat = printf "Unknown font format for object #%s : %s"
|
||||||
|
loadEncoding :: MonadError String m => Object -> m Font
|
||||||
|
loadEncoding (Direct (NameObject (Name name))) = encoding name
|
||||||
|
loadEncoding object =
|
||||||
|
throwError $ printf "Encoding must be a name, not that : %s" $ show object
|
||||||
|
|
||||||
|
loadResources :: (Alternative m, ROLayer m) => Dictionary -> FontCache m FontSet
|
||||||
|
loadResources = foldM addFont Map.empty . Map.toList
|
||||||
|
where
|
||||||
|
addFont output (name, Reference (IndirectObjCoordinates {objectId})) =
|
||||||
|
flip (Map.insert name) output <$> cache loadFont objectId
|
||||||
|
addFont output _ = return output
|
||||||
|
|
||||||
|
getReference :: DirectObject -> [Id Object]
|
||||||
|
getReference (Reference (IndirectObjCoordinates {objectId})) = [objectId]
|
||||||
|
getReference _ = []
|
||||||
|
|
||||||
|
loadPage :: (Alternative m, ROLayer m) => Id Object -> m Page
|
||||||
|
loadPage source = do
|
||||||
|
page <- objectById source
|
||||||
|
byteContents <- loadByteContents . value =<< getKey "Contents" page
|
||||||
|
resources <- getFontDictionary page
|
||||||
|
return $ Page {byteContents, resources, source}
|
||||||
|
|
||||||
|
pagesList :: (Alternative m, ROLayer m) => m [Id Object]
|
||||||
|
pagesList =
|
||||||
|
(catalog // PPath ["Root", "Pages", "Kids"] >>= getReferences . value)
|
||||||
|
<|> return []
|
||||||
|
where
|
||||||
|
getReferences (Direct (Array kids)) = return $ getReference =<< kids
|
||||||
|
getReferences _ = throwError "Not a pages array"
|
||||||
|
|
||||||
|
editPagesList :: RWLayer m => ([DirectObject] -> [DirectObject]) -> m ()
|
||||||
|
editPagesList f = do
|
||||||
|
pages <- runRO (catalog // PPath ["Root", "Pages"])
|
||||||
|
kids <- runRO (pages >./ "Kids")
|
||||||
|
count <- runRO (pages >./ "Count")
|
||||||
|
(newSize, newKids) <- editKids (value kids)
|
||||||
|
save $ kids {value = newKids}
|
||||||
|
save $ count {value = Direct $ NumberObject newSize}
|
||||||
|
where
|
||||||
|
editKids (Direct (Array pageRefs)) =
|
||||||
|
let result = f pageRefs in
|
||||||
|
return (Number . fromIntegral $ length result, Direct $ Array result)
|
||||||
|
editKids _ = throwError "Invalid format for Root.Pages.Kids (not an array)"
|
||||||
|
|
||||||
|
updatePage :: RWLayer m => Page -> m ()
|
||||||
|
updatePage (Page {byteContents}) = sequence_ $ mapi updateByteContent byteContents
|
||||||
|
where
|
||||||
|
updateByteContent source byteContent =
|
||||||
|
edit .at source .at Clear $ \_ -> return byteContent
|
||||||
|
|
||||||
|
data Pages = Pages
|
||||||
|
newtype PageNumber = P Int
|
||||||
|
data Contents = Contents
|
||||||
|
|
||||||
|
instance (Alternative m, MonadError String m) => Box m Pages Layer (Map Int Page) where
|
||||||
|
r Pages = runReaderT (numbered <$> pagesList >>= mapM loadPage)
|
||||||
|
where
|
||||||
|
numbered :: [Id Object] -> Map Int (Id Object)
|
||||||
|
numbered = Map.fromList . zip [1..]
|
||||||
|
|
||||||
|
w Pages pages = execStateT $ do
|
||||||
|
mapM_ updatePage pages
|
||||||
|
setPagesList $ Map.elems (source <$> pages)
|
||||||
|
where
|
||||||
|
setPagesList =
|
||||||
|
editPagesList . const . fmap (Reference . flip IndirectObjCoordinates 0)
|
||||||
|
|
||||||
|
instance (Alternative m, MonadError String m) => Box m PageNumber Layer Page where
|
||||||
|
r (P p) layer
|
||||||
|
| p < 1 = throwError "Pages start at 1"
|
||||||
|
| otherwise = runReaderT (drop (p - 1) <$> pagesList >>= firstPage) layer
|
||||||
|
where
|
||||||
|
firstPage =
|
||||||
|
maybe (throwError "Page is out of bounds") loadPage . listToMaybe
|
||||||
|
|
||||||
|
w (P p) page = execStateT $ do
|
||||||
|
updatePage page
|
||||||
|
editPagesList $ setPage (Reference $ IndirectObjCoordinates (source page) 0)
|
||||||
|
where
|
||||||
|
setPage ref l = take (p-1) l ++ ref : drop p l
|
||||||
|
|
||||||
|
instance Monad m => Box m Contents Page (OrderedMap (Id Object) Content) where
|
||||||
|
r Contents = return . fmap Content.parse . byteContents
|
||||||
|
w Contents contents page = return $ page {byteContents}
|
||||||
|
where
|
||||||
|
byteContents = toStrict . render LF <$> contents
|
||||||
|
|
||||||
|
withFonts :: MonadError String m => (Layer -> FontCache (LayerReader m) a) -> Layer -> m a
|
||||||
|
withFonts f layer = runReaderT (evalStateT (f layer) Id.empty) layer
|
||||||
|
|
||||||
|
withResources :: (Alternative m, MonadError String m) => (Page -> ReaderT FontSet m b) -> Page -> FontCache (LayerReader m) b
|
||||||
|
withResources f p =
|
||||||
|
loadResources (resources p) >>= lift . lift . runReaderT (f p)
|
|
@ -1,56 +1,78 @@
|
||||||
|
{-# LANGUAGE FlexibleInstances #-}
|
||||||
|
{-# LANGUAGE ConstraintKinds #-}
|
||||||
|
{-# LANGUAGE UndecidableInstances #-}
|
||||||
module PDF.Parser (
|
module PDF.Parser (
|
||||||
Parser
|
MonadParser(..)
|
||||||
|
, Parser
|
||||||
, (<?>)
|
, (<?>)
|
||||||
, block
|
|
||||||
, char
|
|
||||||
, choice
|
|
||||||
, count
|
|
||||||
, decNumber
|
|
||||||
, hexNumber
|
|
||||||
, many
|
|
||||||
, octDigit
|
, octDigit
|
||||||
, on
|
, on
|
||||||
, oneOf
|
|
||||||
, option
|
|
||||||
, runParser
|
, runParser
|
||||||
, sepBy
|
, evalParser
|
||||||
, string
|
|
||||||
, takeAll
|
|
||||||
, takeAll1
|
|
||||||
) where
|
) where
|
||||||
|
|
||||||
import Control.Applicative ((<|>), empty)
|
import Control.Applicative (Alternative, (<|>))
|
||||||
|
import Control.Monad (MonadPlus)
|
||||||
|
import Control.Monad.Fail (MonadFail(..))
|
||||||
import Control.Monad.State (StateT(..), evalStateT)
|
import Control.Monad.State (StateT(..), evalStateT)
|
||||||
import Control.Monad.Trans (lift)
|
import Control.Monad.Trans (MonadTrans(..))
|
||||||
import qualified Data.Attoparsec.ByteString.Char8 as Atto (
|
import qualified Data.Attoparsec.ByteString.Char8 as Atto (
|
||||||
Parser, char, parseOnly, satisfy, string, take, takeWhile, takeWhile1
|
Parser, char, decimal, double, endOfInput, parseOnly, peekChar', satisfy, string, take
|
||||||
|
, takeWhile, takeWhile1
|
||||||
)
|
)
|
||||||
import Data.ByteString (ByteString)
|
import Data.ByteString (ByteString)
|
||||||
|
import Data.ByteString.Char8.Util (B16Int(..))
|
||||||
import Data.Char (toLower)
|
import Data.Char (toLower)
|
||||||
import Data.Set (Set)
|
import Data.Set (Set)
|
||||||
import qualified Data.Set as Set (fromList, member, unions)
|
import qualified Data.Set as Set (fromList, member, unions)
|
||||||
|
import Prelude hiding (fail)
|
||||||
|
|
||||||
|
type MonadDeps m = (MonadFail m, MonadPlus m)
|
||||||
|
|
||||||
|
class MonadDeps m => MonadParser m where
|
||||||
|
block :: Int -> m ByteString
|
||||||
|
char :: Char -> m Char
|
||||||
|
decNumber :: m Int
|
||||||
|
floatNumber :: m Double
|
||||||
|
endOfInput :: m ()
|
||||||
|
hexNumber :: m B16Int
|
||||||
|
oneOf :: String -> m Char
|
||||||
|
peek :: m Char
|
||||||
|
string :: ByteString -> m ByteString
|
||||||
|
takeAll :: (Char -> Bool) -> m ByteString
|
||||||
|
takeAll1 :: (Char -> Bool) -> m ByteString
|
||||||
|
|
||||||
|
instance MonadParser Atto.Parser where
|
||||||
|
block = Atto.take
|
||||||
|
char = Atto.char
|
||||||
|
endOfInput = Atto.endOfInput
|
||||||
|
decNumber = Atto.decimal
|
||||||
|
floatNumber = Atto.double
|
||||||
|
hexNumber = B16Int <$> Atto.takeWhile1 (`Set.member` hexDigits)
|
||||||
|
oneOf charSet = Atto.satisfy (`elem` charSet)
|
||||||
|
peek = Atto.peekChar'
|
||||||
|
string s = Atto.string s <?> show s
|
||||||
|
takeAll = Atto.takeWhile
|
||||||
|
takeAll1 = Atto.takeWhile1
|
||||||
|
|
||||||
|
instance (MonadParser m, MonadTrans t, MonadDeps (t m)) => MonadParser (t m) where
|
||||||
|
block = lift . block
|
||||||
|
char = lift . char
|
||||||
|
endOfInput = lift $ endOfInput
|
||||||
|
decNumber = lift $ decNumber
|
||||||
|
floatNumber = lift $ floatNumber
|
||||||
|
hexNumber = lift $ hexNumber
|
||||||
|
oneOf = lift . oneOf
|
||||||
|
peek = lift $ peek
|
||||||
|
string = lift . string
|
||||||
|
takeAll = lift . takeAll
|
||||||
|
takeAll1 = lift . takeAll1
|
||||||
|
|
||||||
type Parser s = StateT s Atto.Parser
|
type Parser s = StateT s Atto.Parser
|
||||||
|
|
||||||
(<?>) :: Parser s a -> String -> Parser s a
|
(<?>) :: (Alternative m, MonadFail m) => m a -> String -> m a
|
||||||
(<?>) parser debugMessage = parser <|> fail debugMessage
|
(<?>) parser debugMessage = parser <|> fail debugMessage
|
||||||
|
|
||||||
block :: Int -> Parser s ByteString
|
|
||||||
block = lift . Atto.take
|
|
||||||
|
|
||||||
char :: Char -> Parser s Char
|
|
||||||
char = lift . Atto.char
|
|
||||||
|
|
||||||
choice :: [Parser s a] -> Parser s a
|
|
||||||
choice = foldr (<|>) empty
|
|
||||||
|
|
||||||
count :: Int -> Parser s a -> Parser s [a]
|
|
||||||
count 0 _ = return []
|
|
||||||
count n p = (:) <$> p <*> count (n-1) p
|
|
||||||
|
|
||||||
decNumber :: Parser s ByteString
|
|
||||||
decNumber = lift $ Atto.takeWhile1 (`Set.member` digits)
|
|
||||||
|
|
||||||
digits :: Set Char
|
digits :: Set Char
|
||||||
digits = Set.fromList ['0'..'9']
|
digits = Set.fromList ['0'..'9']
|
||||||
|
|
||||||
|
@ -59,13 +81,7 @@ hexDigits = Set.unions [digits, Set.fromList af, Set.fromList $ toLower <$> af]
|
||||||
where
|
where
|
||||||
af = ['A'..'F']
|
af = ['A'..'F']
|
||||||
|
|
||||||
hexNumber :: Parser s ByteString
|
octDigit :: MonadParser m => m Char
|
||||||
hexNumber = lift $ Atto.takeWhile1 (`Set.member` hexDigits)
|
|
||||||
|
|
||||||
many :: Parser s a -> Parser s [a]
|
|
||||||
many parser = (:) <$> parser <*> many parser <|> return []
|
|
||||||
|
|
||||||
octDigit :: Parser s Char
|
|
||||||
octDigit = oneOf ['0'..'7']
|
octDigit = oneOf ['0'..'7']
|
||||||
|
|
||||||
on :: Parser s a -> ByteString -> Parser s (Either String a)
|
on :: Parser s a -> ByteString -> Parser s (Either String a)
|
||||||
|
@ -74,25 +90,8 @@ on (StateT parserF) input = StateT $ \state ->
|
||||||
Left errorMsg -> return (Left errorMsg, state)
|
Left errorMsg -> return (Left errorMsg, state)
|
||||||
Right (result, newState) -> return (Right result, newState)
|
Right (result, newState) -> return (Right result, newState)
|
||||||
|
|
||||||
oneOf :: String -> Parser s Char
|
runParser :: Parser s a -> s -> ByteString -> Either String (a, s)
|
||||||
oneOf charSet = lift $ Atto.satisfy (`elem` charSet)
|
runParser parser initState = Atto.parseOnly (runStateT parser initState)
|
||||||
|
|
||||||
option :: a -> Parser s a -> Parser s a
|
evalParser :: Parser s a -> s -> ByteString -> Either String a
|
||||||
option defaultValue p = p <|> pure defaultValue
|
evalParser parser initState = Atto.parseOnly (evalStateT parser initState)
|
||||||
|
|
||||||
runParser :: Parser s a -> s -> ByteString -> Either String a
|
|
||||||
runParser parser initState =
|
|
||||||
Atto.parseOnly (evalStateT parser initState)
|
|
||||||
|
|
||||||
sepBy :: Parser s a -> Parser s b -> Parser s [a]
|
|
||||||
sepBy parser separator =
|
|
||||||
option [] $ (:) <$> parser <*> many (separator *> parser)
|
|
||||||
|
|
||||||
string :: ByteString -> Parser s ByteString
|
|
||||||
string = lift . Atto.string
|
|
||||||
|
|
||||||
takeAll :: (Char -> Bool) -> Parser s ByteString
|
|
||||||
takeAll = lift . Atto.takeWhile
|
|
||||||
|
|
||||||
takeAll1 :: (Char -> Bool) -> Parser s ByteString
|
|
||||||
takeAll1 = lift . Atto.takeWhile1
|
|
||||||
|
|
|
@ -1,47 +0,0 @@
|
||||||
{-# LANGUAGE NamedFieldPuns #-}
|
|
||||||
module PDF.Update (
|
|
||||||
unify
|
|
||||||
) where
|
|
||||||
|
|
||||||
import Data.Map (member)
|
|
||||||
import qualified Data.Map as Map (empty, union)
|
|
||||||
import PDF.Object (
|
|
||||||
Content(..), IndexedObjects, IndirectObjCoordinates(..), Occurrence(..)
|
|
||||||
, Structure(..)
|
|
||||||
)
|
|
||||||
|
|
||||||
emptyContent :: Content
|
|
||||||
emptyContent = Content {
|
|
||||||
docStructure = Structure {xRef = Map.empty, trailer = Map.empty}
|
|
||||||
, objects = Map.empty
|
|
||||||
, occurrences = []
|
|
||||||
}
|
|
||||||
|
|
||||||
unify :: [Content] -> Content
|
|
||||||
unify = foldl complete emptyContent
|
|
||||||
where
|
|
||||||
complete tmpContent older =
|
|
||||||
let mergedObjects = Map.union (objects tmpContent) (objects older) in
|
|
||||||
Content {
|
|
||||||
docStructure =
|
|
||||||
unifyDocStructure (docStructure tmpContent) (docStructure older)
|
|
||||||
, objects = mergedObjects
|
|
||||||
, occurrences =
|
|
||||||
unifyOccurrences mergedObjects (occurrences tmpContent) (occurrences older)
|
|
||||||
}
|
|
||||||
|
|
||||||
unifyDocStructure :: Structure -> Structure -> Structure
|
|
||||||
unifyDocStructure update original = Structure {
|
|
||||||
xRef = Map.union (xRef update) (xRef original)
|
|
||||||
, trailer = Map.union (trailer update) (trailer original)
|
|
||||||
}
|
|
||||||
|
|
||||||
unifyOccurrences :: IndexedObjects -> [Occurrence] -> [Occurrence] -> [Occurrence]
|
|
||||||
unifyOccurrences objects update = foldr addOlder update
|
|
||||||
where
|
|
||||||
addOlder occurrence@(Comment _) existing = occurrence : existing
|
|
||||||
addOlder occurrence@(Indirect indirect) existing =
|
|
||||||
if objectId indirect `member` objects
|
|
||||||
then occurrence : existing
|
|
||||||
else existing
|
|
||||||
|
|
9
test/Main.hs
Normal file
9
test/Main.hs
Normal file
|
@ -0,0 +1,9 @@
|
||||||
|
{-# LANGUAGE OverloadedStrings #-}
|
||||||
|
import Test.HUnit
|
||||||
|
import Object (testNumber, testString)
|
||||||
|
|
||||||
|
main :: IO ()
|
||||||
|
main = runTestTT (TestList [
|
||||||
|
testNumber
|
||||||
|
, testString
|
||||||
|
]) *> return ()
|
50
test/Object.hs
Normal file
50
test/Object.hs
Normal file
|
@ -0,0 +1,50 @@
|
||||||
|
{-# LANGUAGE OverloadedStrings #-}
|
||||||
|
module Object (
|
||||||
|
testNumber
|
||||||
|
, testString
|
||||||
|
) where
|
||||||
|
|
||||||
|
import Data.ByteString.Char8 (pack)
|
||||||
|
import Data.ByteString.Char8.Util (B16Int(..))
|
||||||
|
import PDF.Object (Number(..), StringObject(..), name, number, stringObject)
|
||||||
|
import PDF.Parser (MonadParser, Parser, evalParser)
|
||||||
|
import Test.HUnit
|
||||||
|
|
||||||
|
check :: (Eq a, Show a) => Parser () a -> (a, String) -> Test
|
||||||
|
check parser (aim, input) =
|
||||||
|
TestCase $ assertEqual message (Right aim) (parse input)
|
||||||
|
where
|
||||||
|
message = "parse: " ++ input
|
||||||
|
parse = evalParser parser () . pack
|
||||||
|
|
||||||
|
testName :: Test
|
||||||
|
testName = TestLabel "Name" . TestList $ check name <$> [
|
||||||
|
]
|
||||||
|
|
||||||
|
testNumber :: Test
|
||||||
|
testNumber = TestLabel "Number" . TestList $ check number <$> [
|
||||||
|
(Number 123, "123")
|
||||||
|
, (Number 43445, "43445")
|
||||||
|
, (Number 17, "+17")
|
||||||
|
, (Number (-98), "-98")
|
||||||
|
, (Number 0, "0")
|
||||||
|
, (Number 34.5, "34.5")
|
||||||
|
, (Number (-3.62), "-3.62")
|
||||||
|
, (Number 123.6, "+123.6")
|
||||||
|
, (Number 4, "4.")
|
||||||
|
, (Number (-0.002), "-.002")
|
||||||
|
, (Number 0, "0.0")
|
||||||
|
]
|
||||||
|
|
||||||
|
testString :: Test
|
||||||
|
testString = TestLabel "StringObject" . TestList $ check stringObject <$> [
|
||||||
|
(Literal "This is a string", "(This is a string)")
|
||||||
|
, (Literal "Strings may contain newlines\nand such .", "(Strings may contain newlines\nand such .)")
|
||||||
|
, (Literal "Strings may contain balanced parentheses ( ) and\nspecial characters ( * ! & } ^ % and so on ) .", "(Strings may contain balanced parentheses ( ) and\nspecial characters ( * ! & } ^ % and so on ) .)")
|
||||||
|
, (Literal "The following is an empty string .", "(The following is an empty string .)")
|
||||||
|
, (Literal "", "()")
|
||||||
|
, (Literal "It has zero ( 0 ) length .", "(It has zero ( 0 ) length .)")
|
||||||
|
, (Hexadecimal $ B16Int "4E6F762073686D6F7A206B6120706F702E", "<4E6F762073686D6F7A206B6120706F702E>")
|
||||||
|
, (Hexadecimal $ B16Int "901FA3", "<901FA3>")
|
||||||
|
, (Hexadecimal $ B16Int "901FA0", "<901FA>")
|
||||||
|
]
|
Loading…
Reference in a new issue