Try connecting all the Box instance in the getText demo, try to encode pages contents with a simple assoc list
This commit is contained in:
parent
a04adff1d2
commit
3b1a5152e4
3 changed files with 34 additions and 10 deletions
|
@ -20,6 +20,7 @@ library
|
||||||
, PDF.Box
|
, PDF.Box
|
||||||
, PDF.CMap
|
, PDF.CMap
|
||||||
, PDF.Content
|
, PDF.Content
|
||||||
|
, PDF.Content.Text
|
||||||
, PDF.EOL
|
, PDF.EOL
|
||||||
, PDF.Layer
|
, PDF.Layer
|
||||||
, PDF.Object
|
, PDF.Object
|
||||||
|
@ -35,7 +36,6 @@ library
|
||||||
, PDF.Content.Operator.GraphicState
|
, PDF.Content.Operator.GraphicState
|
||||||
, PDF.Content.Operator.Path
|
, PDF.Content.Operator.Path
|
||||||
, PDF.Content.Operator.Text
|
, PDF.Content.Operator.Text
|
||||||
, PDF.Content.Text
|
|
||||||
, PDF.Encoding
|
, PDF.Encoding
|
||||||
, PDF.Encoding.MacRoman
|
, PDF.Encoding.MacRoman
|
||||||
, PDF.Body
|
, PDF.Body
|
||||||
|
|
|
@ -1,17 +1,32 @@
|
||||||
import Control.Monad ((>=>))
|
import Control.Monad ((>=>))
|
||||||
import qualified Data.ByteString.Char8 as BS (readFile)
|
import qualified Data.ByteString.Char8 as BS (readFile)
|
||||||
import qualified Data.Map as Map (toList)
|
import Data.Map (Map, foldlWithKey, mapWithKey)
|
||||||
|
import qualified Data.Map as Map (empty, insert, toList, union)
|
||||||
|
import qualified Data.Text as Text (unpack)
|
||||||
import qualified Data.Text.IO as Text (putStrLn)
|
import qualified Data.Text.IO as Text (putStrLn)
|
||||||
import PDF (UnifiedLayers(..), parseDocument)
|
import PDF (UnifiedLayers(..), parseDocument)
|
||||||
import PDF.Box (Box(..))
|
import PDF.Box (Box(..))
|
||||||
|
import PDF.Content.Text (Chunks(..))
|
||||||
import PDF.Layer (Layer)
|
import PDF.Layer (Layer)
|
||||||
import PDF.Pages (Page(..), PageNumber(..), Pages(..), cacheFonts)
|
import PDF.Pages (Contents(..), Page(..), PageNumber(..), Pages(..), cacheFonts)
|
||||||
import System.Environment (getArgs)
|
import System.Environment (getArgs)
|
||||||
import System.Exit (die)
|
import System.Exit (die)
|
||||||
import System.IO (BufferMode(..), hSetBuffering, stdout)
|
import System.IO (BufferMode(..), hSetBuffering, stdout)
|
||||||
|
import Text.Printf (printf)
|
||||||
|
|
||||||
displayPage :: Page -> IO ()
|
displayPage :: Page -> IO ()
|
||||||
displayPage = mapM_ Text.putStrLn . contents
|
displayPage =
|
||||||
|
r Contents
|
||||||
|
>=> mapM (\(objectId, content) -> r Chunks content >>= return . mixIn objectId)
|
||||||
|
>=> fusion
|
||||||
|
>=> sequence_ . mapWithKey (\k v -> putStrLn $ printf "%s: %s" (show k) (Text.unpack v))
|
||||||
|
-- >=> mapM_ Text.putStrLn
|
||||||
|
where
|
||||||
|
mixIn :: (Ord k1, Ord k2) => k1 -> Map k2 v -> Map (k1, k2) v
|
||||||
|
mixIn prefix =
|
||||||
|
foldlWithKey (\m k v -> Map.insert (prefix, k) v m) Map.empty
|
||||||
|
fusion :: (Monad m, Ord a) => [Map a b] -> m (Map a b)
|
||||||
|
fusion = return . foldl Map.union Map.empty
|
||||||
|
|
||||||
getAll :: Layer -> IO ()
|
getAll :: Layer -> IO ()
|
||||||
getAll layer = (cacheFonts $ r Pages layer) >>= mapM_ (displayPage . snd) . Map.toList
|
getAll layer = (cacheFonts $ r Pages layer) >>= mapM_ (displayPage . snd) . Map.toList
|
||||||
|
|
|
@ -6,7 +6,8 @@
|
||||||
{-# LANGUAGE FlexibleContexts #-}
|
{-# LANGUAGE FlexibleContexts #-}
|
||||||
{-# LANGUAGE UndecidableInstances #-}
|
{-# LANGUAGE UndecidableInstances #-}
|
||||||
module PDF.Pages (
|
module PDF.Pages (
|
||||||
Page(..)
|
Contents(..)
|
||||||
|
, Page(..)
|
||||||
, PageNumber(..)
|
, PageNumber(..)
|
||||||
, Pages(..)
|
, Pages(..)
|
||||||
, cacheFonts
|
, cacheFonts
|
||||||
|
@ -18,13 +19,14 @@ import Control.Monad.Fail (MonadFail(..))
|
||||||
import Control.Monad.Reader (runReaderT)
|
import Control.Monad.Reader (runReaderT)
|
||||||
import Control.Monad.State (MonadState, StateT(..), evalStateT, modify)
|
import Control.Monad.State (MonadState, StateT(..), evalStateT, modify)
|
||||||
import qualified Control.Monad.RWS as RWS (get)
|
import qualified Control.Monad.RWS as RWS (get)
|
||||||
import Data.OrderedMap (OrderedMap, build)
|
--import Data.OrderedMap (OrderedMap, build)
|
||||||
import Data.Map (Map)
|
import Data.Map (Map)
|
||||||
import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
|
import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
|
||||||
import PDF.Box (Box(..))
|
import PDF.Box (Box(..))
|
||||||
import PDF.CMap (cMap)
|
import PDF.CMap (cMap)
|
||||||
import PDF.Content (Content(..))
|
import PDF.Content (Content(..))
|
||||||
import qualified PDF.Content as Content (parse)
|
import qualified PDF.Content as Content (parse)
|
||||||
|
import PDF.Content.Text (Chunks(..))
|
||||||
import PDF.Encoding (encoding)
|
import PDF.Encoding (encoding)
|
||||||
import PDF.Font (Font, FontSet)
|
import PDF.Font (Font, FontSet)
|
||||||
import PDF.Layer (Layer(..))
|
import PDF.Layer (Layer(..))
|
||||||
|
@ -43,12 +45,14 @@ import Text.Printf (printf)
|
||||||
type CachedFonts = Map ObjectId Font
|
type CachedFonts = Map ObjectId Font
|
||||||
type FontCache m = (MonadState CachedFonts m, PDFContent m)
|
type FontCache m = (MonadState CachedFonts m, PDFContent m)
|
||||||
data Page = Page {
|
data Page = Page {
|
||||||
contents :: OrderedMap ObjectId Content
|
contents :: [(ObjectId, Content)]
|
||||||
|
-- contents :: OrderedMap ObjectId Content
|
||||||
, resources :: Dictionary
|
, resources :: Dictionary
|
||||||
, source :: ObjectId
|
, source :: ObjectId
|
||||||
}
|
}
|
||||||
|
|
||||||
loadContents :: PDFContent m => DirectObject -> m (OrderedMap ObjectId Content)
|
loadContents :: PDFContent m => DirectObject -> m [(ObjectId, Content)]
|
||||||
|
--loadContents :: PDFContent m => DirectObject -> m (OrderedMap ObjectId Content)
|
||||||
loadContents directObject =
|
loadContents directObject =
|
||||||
sequenceA . build loadContent $ objectIds directObject
|
sequenceA . build loadContent $ objectIds directObject
|
||||||
where
|
where
|
||||||
|
@ -56,12 +60,12 @@ loadContents directObject =
|
||||||
loadContent objectId = objectById objectId >>= r Clear >>= Content.parse
|
loadContent objectId = objectById objectId >>= r Clear >>= Content.parse
|
||||||
objectIds (Array l) = l >>= getReference
|
objectIds (Array l) = l >>= getReference
|
||||||
objectIds dirObj = getReference dirObj
|
objectIds dirObj = getReference dirObj
|
||||||
|
build f = fmap (\k -> f k >>= \v -> return (k, v))
|
||||||
|
|
||||||
getFontDictionary :: PDFContent m => Object -> m Dictionary
|
getFontDictionary :: PDFContent m => Object -> m Dictionary
|
||||||
getFontDictionary pageObj =
|
getFontDictionary pageObj =
|
||||||
(pageObj >// ["Resources", "Font"] >>= getDictionary) <|> return Map.empty
|
(pageObj >// ["Resources", "Font"] >>= getDictionary) <|> return Map.empty
|
||||||
|
|
||||||
{-
|
|
||||||
cache :: FontCache m => (ObjectId -> m Font) -> ObjectId -> m Font
|
cache :: FontCache m => (ObjectId -> m Font) -> ObjectId -> m Font
|
||||||
cache loader objectId =
|
cache loader objectId =
|
||||||
(maybe load return . Map.lookup objectId) =<< RWS.get
|
(maybe load return . Map.lookup objectId) =<< RWS.get
|
||||||
|
@ -90,7 +94,6 @@ loadResources = foldM addFont Map.empty . Map.toList
|
||||||
addFont output (name, Reference (IndirectObjCoordinates {objectId})) =
|
addFont output (name, Reference (IndirectObjCoordinates {objectId})) =
|
||||||
flip (Map.insert name) output <$> cache loadFont objectId
|
flip (Map.insert name) output <$> cache loadFont objectId
|
||||||
addFont output _ = return output
|
addFont output _ = return output
|
||||||
-}
|
|
||||||
|
|
||||||
getReference :: DirectObject -> [ObjectId]
|
getReference :: DirectObject -> [ObjectId]
|
||||||
getReference (Reference (IndirectObjCoordinates {objectId})) = [objectId]
|
getReference (Reference (IndirectObjCoordinates {objectId})) = [objectId]
|
||||||
|
@ -114,6 +117,7 @@ pagesList =
|
||||||
data Pages = Pages
|
data Pages = Pages
|
||||||
newtype PageNumber = P Int
|
newtype PageNumber = P Int
|
||||||
data Contents = Contents
|
data Contents = Contents
|
||||||
|
data Text_ = Text_
|
||||||
|
|
||||||
instance (MonadFail m, Alternative m, MonadState CachedFonts m) => Box m Pages Layer (Map Int Page) where
|
instance (MonadFail m, Alternative m, MonadState CachedFonts m) => Box m Pages Layer (Map Int Page) where
|
||||||
r Pages layer = runReaderT (numbered <$> (mapM loadPage =<< pagesList)) layer
|
r Pages layer = runReaderT (numbered <$> (mapM loadPage =<< pagesList)) layer
|
||||||
|
@ -129,5 +133,10 @@ instance (MonadFail m, Alternative m, MonadState CachedFonts m) => Box m PageNum
|
||||||
firstPage [] = fail "Page is out of bounds"
|
firstPage [] = fail "Page is out of bounds"
|
||||||
firstPage (p:_) = loadPage p
|
firstPage (p:_) = loadPage p
|
||||||
|
|
||||||
|
instance Monad m => Box m Contents Page [(ObjectId, Content)] where
|
||||||
|
--instance Monad m => Box m Contents Page (OrderedMap ObjectId Content) where
|
||||||
|
r Contents = return . contents
|
||||||
|
|
||||||
|
|
||||||
cacheFonts :: Monad m => StateT CachedFonts m a -> m a
|
cacheFonts :: Monad m => StateT CachedFonts m a -> m a
|
||||||
cacheFonts = flip evalStateT Map.empty
|
cacheFonts = flip evalStateT Map.empty
|
||||||
|
|
Loading…
Reference in a new issue