diff --git a/Hufflepdf.cabal b/Hufflepdf.cabal index e457753..5e2657e 100644 --- a/Hufflepdf.cabal +++ b/Hufflepdf.cabal @@ -21,6 +21,7 @@ library , PDF.Content , PDF.EOL , PDF.Object + , PDF.Object.Navigation , PDF.Output , PDF.Parser , PDF.Pages diff --git a/src/PDF/CMap.hs b/src/PDF/CMap.hs index b4845cb..8f55b22 100644 --- a/src/PDF/CMap.hs +++ b/src/PDF/CMap.hs @@ -12,6 +12,7 @@ module PDF.CMap ( ) where import Control.Applicative ((<|>), many) +import Control.Monad.Fail (MonadFail(..)) import Control.Monad.State (modify) import Data.Attoparsec.ByteString.Char8 (count) import Data.ByteString (ByteString) @@ -32,6 +33,7 @@ import PDF.Object ( , blank, directObject, integer, line, stringObject ) import PDF.Parser (MonadParser, Parser, runParser, takeAll) +import Prelude hiding (fail) type CMappers = Map Name CMap type Mapping = Map ByteString Text @@ -70,8 +72,8 @@ matches :: ByteString -> CRange -> Bool matches code (CRange {fromSequence, toSequence}) = fromSequence <= code && code <= toSequence -cMap :: ByteString -> Either String Font -cMap = fmap (toFont . snd) <$> runParser +cMap :: MonadFail m => ByteString -> m Font +cMap = either fail (return . toFont . snd) . runParser (many (codeRanges <|> cMapRange <|> cMapChar <|> ignoredLine)) emptyCMap where diff --git a/src/PDF/Object.hs b/src/PDF/Object.hs index c368e1c..87081d3 100644 --- a/src/PDF/Object.hs +++ b/src/PDF/Object.hs @@ -1,8 +1,6 @@ {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE NamedFieldPuns #-} {-# LANGUAGE FlexibleInstances #-} -{-# LANGUAGE ConstraintKinds #-} -{-# LANGUAGE FlexibleContexts #-} module PDF.Object ( Content(..) , Dictionary @@ -19,7 +17,6 @@ module PDF.Object ( , Structure(..) , XRefEntry(..) , XRefSection - , (//) , array , blank , dictionary @@ -37,8 +34,6 @@ module PDF.Object ( ) where import Control.Applicative ((<|>), many) -import Control.Monad.Reader (MonadReader(..)) -import Control.Monad.Fail (MonadFail(..)) import Data.Attoparsec.ByteString.Char8 (choice, count, option, sepBy) import Data.ByteString (ByteString) import qualified Data.ByteString as BS (concat) @@ -56,7 +51,6 @@ import PDF.Output ( , saveOffset ) import PDF.Parser (MonadParser(..), Parser, (), octDigit, oneOf) -import Prelude hiding (fail) import Text.Printf (printf) line :: MonadParser m => String -> m () @@ -168,10 +162,10 @@ array = type Dictionary = Map Name DirectObject instance Output Dictionary where - output dict = + output aDictionary = "<<" `mappend` keyValues `mappend` ">>" where - keyValues = join " " $ outputKeyVal <$> Map.toList dict + keyValues = join " " $ outputKeyVal <$> Map.toList aDictionary outputKeyVal :: (Name, DirectObject) -> OBuilder outputKeyVal (key, val) = Output.concat [output key, " ", output val] @@ -410,36 +404,3 @@ instance Output Content where ] where Structure {xRef, trailer} = docStructure - -type PDFContent m = (MonadReader Content m, MonadFail m) - --- --- Navigation --- -key :: MonadFail m => String -> Dictionary -> m DirectObject -key keyName dictionary = - maybe (fail errorMessage) return (Map.lookup (Name keyName) dictionary) - where - errorMessage = - printf "Key %s not found in dictionary %s" keyName (show dictionary) - -dict :: PDFContent m => Object -> m Dictionary -dict (Direct (Dictionary aDict)) = return aDict -dict obj = fail $ "Not a dictionary : " ++ show obj - -getObject :: PDFContent m => ObjectId -> m Object -getObject objectId = do - content <- ask - return (objects content ! objectId) - -getResource :: PDFContent m => DirectObject -> m Dictionary -getResource (Dictionary aDict) = return aDict -getResource (Reference (IndirectObjCoordinates {objectId})) = - getObject objectId >>= dict -getResource directObject = - fail $ "Not a resource (dictionary or reference) : " ++ show directObject - -(//) :: PDFContent m => Dictionary -> [String] -> m DirectObject -(//) aDict [] = return $ Dictionary aDict -(//) aDict [field] = key field aDict -(//) aDict (field:fields) = key field aDict >>= getResource >>= (// fields) diff --git a/src/PDF/Object/Navigation.hs b/src/PDF/Object/Navigation.hs new file mode 100644 index 0000000..bd43bc4 --- /dev/null +++ b/src/PDF/Object/Navigation.hs @@ -0,0 +1,70 @@ +{-# LANGUAGE NamedFieldPuns #-} +{-# LANGUAGE ConstraintKinds #-} +{-# LANGUAGE FlexibleContexts #-} +module PDF.Object.Navigation ( + (//) + , getDictionary + , getField + , getObject + , openStream + , origin + ) where + +import Codec.Compression.Zlib (decompress) +import Control.Monad.Reader (MonadReader(..)) +import Control.Monad.Fail (MonadFail(..)) +import Data.ByteString (ByteString) +import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict) +import Data.Map ((!)) +import qualified Data.Map as Map (lookup) +import PDF.Object ( + Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..) + , Name(..), Object(..), Structure(..) + ) +import PDF.Output (ObjectId) +import Prelude hiding (fail) +import Text.Printf (printf) + +type PDFContent m = (MonadReader Content m, MonadFail m) + +expected :: (PDFContent m, Show a) => String -> a -> m b +expected name = fail . printf "Not a %s: %s" name . show + +getField :: MonadFail m => String -> Dictionary -> m DirectObject +getField key aDictionary = + maybe (fail errorMessage) return (Map.lookup (Name key) aDictionary) + where + errorMessage = + printf "Key %s not found in dictionary %s" key (show aDictionary) + +castDictionary :: PDFContent m => Object -> m Dictionary +castDictionary (Direct (Dictionary aDict)) = return aDict +castDictionary obj = expected "dictionary : " obj + +getObject :: PDFContent m => ObjectId -> m Object +getObject objectId = do + content <- ask + return (objects content ! objectId) + +getDictionary :: PDFContent m => DirectObject -> m Dictionary +getDictionary (Dictionary aDictionary) = return aDictionary +getDictionary (Reference (IndirectObjCoordinates {objectId})) = + getObject objectId >>= castDictionary +getDictionary aDirectObject = + expected "resource (dictionary or reference)" aDirectObject + +(//) :: PDFContent m => Dictionary -> [String] -> m DirectObject +(//) aDict [] = return $ Dictionary aDict +(//) aDict [key] = getField key aDict +(//) aDict (key:keys) = getField key aDict >>= getDictionary >>= (// keys) + +origin :: PDFContent m => m Dictionary +origin = trailer . docStructure <$> ask + +openStream :: PDFContent m => Object -> m ByteString +openStream (Stream {header, streamContent}) = return $ + case Map.lookup (Name "Filter") header of + Just (NameObject (Name "FlateDecode")) -> + Lazy.toStrict . decompress . Lazy.fromStrict $ streamContent + _ -> streamContent +openStream obj = expected "stream" obj diff --git a/src/PDF/Pages.hs b/src/PDF/Pages.hs index 0500e97..b8f09cb 100755 --- a/src/PDF/Pages.hs +++ b/src/PDF/Pages.hs @@ -5,13 +5,13 @@ module PDF.Pages ( , getAll ) where -import Codec.Compression.Zlib (decompress) import Control.Applicative ((<|>)) import Control.Monad (foldM) +import Control.Monad.Except (Except, runExcept) +import Control.Monad.Fail (MonadFail(..)) import Control.Monad.RWS (RWST(..), ask, evalRWST, lift, modify) import qualified Control.Monad.RWS as RWS (get) import Data.ByteString (ByteString) -import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict) import Data.Map (Map, (!)) import qualified Data.Map as Map (empty, fromList, insert, lookup, toList) import Data.Text (Text) @@ -24,11 +24,13 @@ import PDF.Object ( Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..) , Object(..), Name(..), Structure(..) ,) +import PDF.Object.Navigation ((//), getDictionary, origin, stream) import PDF.Output (ObjectId(..)) +import Prelude hiding (fail) import Text.Printf (printf) type CachedFonts = Map ObjectId Font -type T = RWST Content () CachedFonts (Either String) +type T = RWST Content () CachedFonts (Except String) data Page = Page { contents :: [Text] , source :: ObjectId @@ -42,17 +44,6 @@ infixl 1 //= (//=) :: Either String a -> (a -> T b) -> T b x //= f = lift x >>= f -expected :: Show a => String -> a -> Either String b -expected name = Left . printf "Not a %s: %s" name . show - -stream :: Object -> Either String ByteString -stream (Stream {header, streamContent}) = Right $ - case Map.lookup (Name "Filter") header of - Just (NameObject (Name "FlateDecode")) -> - Lazy.toStrict . decompress . Lazy.fromStrict $ streamContent - _ -> streamContent -stream obj = expected "stream" obj - getResource :: DirectObject -> T Dictionary getResource (Dictionary dictionary) = return dictionary getResource (Reference (IndirectObjCoordinates {objectId})) = @@ -81,10 +72,10 @@ loadFont objectId = getObject objectId \\= dict >>= tryMappings tryMappings dictionary = loadCMap dictionary <|> lift (key "Encoding" dictionary >>= loadEncoding) - <|> lift (Left $ unknownFormat (show objectId) (show dictionary)) + <|> (fail $ unknownFormat (show objectId) (show dictionary)) unknownFormat = printf "Unknown font format for object #%s : %s" loadCMap dictionary = - key "ToUnicode" dictionary //= follow \\= stream \\= cMap + getField "ToUnicode" dictionary >>= follow \\= stream >>= cMap loadEncoding (NameObject (Name name)) = encoding name loadEncoding directObject = Left . printf "Encoding must be a name, not that : %s" $ show directObject @@ -129,8 +120,7 @@ dictObject keyName dictionary = key keyName dictionary //= follow \\= dict pagesList :: T [ObjectId] pagesList = do - root <- dictObject "Root" . trailer . docStructure =<< ask - pages <- dictObject "Pages" root + pages <- origin >>= (// ["Root", "Pages"]) >>= getDictionary case Map.lookup (Name "Kids") pages of Just (Array kids) -> return $ getReferences kids _ -> return [] @@ -158,7 +148,7 @@ loadPage source = (\contents -> Page {contents, source}) <$> (extractText =<< getObject source) getAll :: Content -> Either String (Map Int Page) -getAll content = fst <$> evalRWST getPages content Map.empty +getAll content = runExcept $ fst <$> evalRWST getPages content Map.empty where numbered = Map.fromList . zip [1..] getPages = numbered <$> (mapM loadPage =<< pagesList) @@ -166,8 +156,8 @@ getAll content = fst <$> evalRWST getPages content Map.empty get :: Content -> Int -> Either String Page get content pageNumber | pageNumber < 1 = Left "Pages start at 1" - | otherwise = fst <$> evalRWST getPage content Map.empty + | otherwise = runExcept $ fst <$> evalRWST getPage content Map.empty where - firstPage [] = lift $ Left "Page is out of bounds" + firstPage [] = fail "Page is out of bounds" firstPage (p:_) = loadPage p getPage = drop (pageNumber - 1) <$> pagesList >>= firstPage