Try connecting all the Box instance in the getText demo, try to encode pages contents with a simple assoc list
This commit is contained in:
parent
a04adff1d2
commit
3b1a5152e4
3 changed files with 34 additions and 10 deletions
|
@ -20,6 +20,7 @@ library
|
|||
, PDF.Box
|
||||
, PDF.CMap
|
||||
, PDF.Content
|
||||
, PDF.Content.Text
|
||||
, PDF.EOL
|
||||
, PDF.Layer
|
||||
, PDF.Object
|
||||
|
@ -35,7 +36,6 @@ library
|
|||
, PDF.Content.Operator.GraphicState
|
||||
, PDF.Content.Operator.Path
|
||||
, PDF.Content.Operator.Text
|
||||
, PDF.Content.Text
|
||||
, PDF.Encoding
|
||||
, PDF.Encoding.MacRoman
|
||||
, PDF.Body
|
||||
|
|
|
@ -1,17 +1,32 @@
|
|||
import Control.Monad ((>=>))
|
||||
import qualified Data.ByteString.Char8 as BS (readFile)
|
||||
import qualified Data.Map as Map (toList)
|
||||
import Data.Map (Map, foldlWithKey, mapWithKey)
|
||||
import qualified Data.Map as Map (empty, insert, toList, union)
|
||||
import qualified Data.Text as Text (unpack)
|
||||
import qualified Data.Text.IO as Text (putStrLn)
|
||||
import PDF (UnifiedLayers(..), parseDocument)
|
||||
import PDF.Box (Box(..))
|
||||
import PDF.Content.Text (Chunks(..))
|
||||
import PDF.Layer (Layer)
|
||||
import PDF.Pages (Page(..), PageNumber(..), Pages(..), cacheFonts)
|
||||
import PDF.Pages (Contents(..), Page(..), PageNumber(..), Pages(..), cacheFonts)
|
||||
import System.Environment (getArgs)
|
||||
import System.Exit (die)
|
||||
import System.IO (BufferMode(..), hSetBuffering, stdout)
|
||||
import Text.Printf (printf)
|
||||
|
||||
displayPage :: Page -> IO ()
|
||||
displayPage = mapM_ Text.putStrLn . contents
|
||||
displayPage =
|
||||
r Contents
|
||||
>=> mapM (\(objectId, content) -> r Chunks content >>= return . mixIn objectId)
|
||||
>=> fusion
|
||||
>=> sequence_ . mapWithKey (\k v -> putStrLn $ printf "%s: %s" (show k) (Text.unpack v))
|
||||
-- >=> mapM_ Text.putStrLn
|
||||
where
|
||||
mixIn :: (Ord k1, Ord k2) => k1 -> Map k2 v -> Map (k1, k2) v
|
||||
mixIn prefix =
|
||||
foldlWithKey (\m k v -> Map.insert (prefix, k) v m) Map.empty
|
||||
fusion :: (Monad m, Ord a) => [Map a b] -> m (Map a b)
|
||||
fusion = return . foldl Map.union Map.empty
|
||||
|
||||
getAll :: Layer -> IO ()
|
||||
getAll layer = (cacheFonts $ r Pages layer) >>= mapM_ (displayPage . snd) . Map.toList
|
||||
|
|
|
@ -6,7 +6,8 @@
|
|||
{-# LANGUAGE FlexibleContexts #-}
|
||||
{-# LANGUAGE UndecidableInstances #-}
|
||||
module PDF.Pages (
|
||||
Page(..)
|
||||
Contents(..)
|
||||
, Page(..)
|
||||
, PageNumber(..)
|
||||
, Pages(..)
|
||||
, cacheFonts
|
||||
|
@ -18,13 +19,14 @@ import Control.Monad.Fail (MonadFail(..))
|
|||
import Control.Monad.Reader (runReaderT)
|
||||
import Control.Monad.State (MonadState, StateT(..), evalStateT, modify)
|
||||
import qualified Control.Monad.RWS as RWS (get)
|
||||
import Data.OrderedMap (OrderedMap, build)
|
||||
--import Data.OrderedMap (OrderedMap, build)
|
||||
import Data.Map (Map)
|
||||
import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
|
||||
import PDF.Box (Box(..))
|
||||
import PDF.CMap (cMap)
|
||||
import PDF.Content (Content(..))
|
||||
import qualified PDF.Content as Content (parse)
|
||||
import PDF.Content.Text (Chunks(..))
|
||||
import PDF.Encoding (encoding)
|
||||
import PDF.Font (Font, FontSet)
|
||||
import PDF.Layer (Layer(..))
|
||||
|
@ -43,12 +45,14 @@ import Text.Printf (printf)
|
|||
type CachedFonts = Map ObjectId Font
|
||||
type FontCache m = (MonadState CachedFonts m, PDFContent m)
|
||||
data Page = Page {
|
||||
contents :: OrderedMap ObjectId Content
|
||||
contents :: [(ObjectId, Content)]
|
||||
-- contents :: OrderedMap ObjectId Content
|
||||
, resources :: Dictionary
|
||||
, source :: ObjectId
|
||||
}
|
||||
|
||||
loadContents :: PDFContent m => DirectObject -> m (OrderedMap ObjectId Content)
|
||||
loadContents :: PDFContent m => DirectObject -> m [(ObjectId, Content)]
|
||||
--loadContents :: PDFContent m => DirectObject -> m (OrderedMap ObjectId Content)
|
||||
loadContents directObject =
|
||||
sequenceA . build loadContent $ objectIds directObject
|
||||
where
|
||||
|
@ -56,12 +60,12 @@ loadContents directObject =
|
|||
loadContent objectId = objectById objectId >>= r Clear >>= Content.parse
|
||||
objectIds (Array l) = l >>= getReference
|
||||
objectIds dirObj = getReference dirObj
|
||||
build f = fmap (\k -> f k >>= \v -> return (k, v))
|
||||
|
||||
getFontDictionary :: PDFContent m => Object -> m Dictionary
|
||||
getFontDictionary pageObj =
|
||||
(pageObj >// ["Resources", "Font"] >>= getDictionary) <|> return Map.empty
|
||||
|
||||
{-
|
||||
cache :: FontCache m => (ObjectId -> m Font) -> ObjectId -> m Font
|
||||
cache loader objectId =
|
||||
(maybe load return . Map.lookup objectId) =<< RWS.get
|
||||
|
@ -90,7 +94,6 @@ loadResources = foldM addFont Map.empty . Map.toList
|
|||
addFont output (name, Reference (IndirectObjCoordinates {objectId})) =
|
||||
flip (Map.insert name) output <$> cache loadFont objectId
|
||||
addFont output _ = return output
|
||||
-}
|
||||
|
||||
getReference :: DirectObject -> [ObjectId]
|
||||
getReference (Reference (IndirectObjCoordinates {objectId})) = [objectId]
|
||||
|
@ -114,6 +117,7 @@ pagesList =
|
|||
data Pages = Pages
|
||||
newtype PageNumber = P Int
|
||||
data Contents = Contents
|
||||
data Text_ = Text_
|
||||
|
||||
instance (MonadFail m, Alternative m, MonadState CachedFonts m) => Box m Pages Layer (Map Int Page) where
|
||||
r Pages layer = runReaderT (numbered <$> (mapM loadPage =<< pagesList)) layer
|
||||
|
@ -129,5 +133,10 @@ instance (MonadFail m, Alternative m, MonadState CachedFonts m) => Box m PageNum
|
|||
firstPage [] = fail "Page is out of bounds"
|
||||
firstPage (p:_) = loadPage p
|
||||
|
||||
instance Monad m => Box m Contents Page [(ObjectId, Content)] where
|
||||
--instance Monad m => Box m Contents Page (OrderedMap ObjectId Content) where
|
||||
r Contents = return . contents
|
||||
|
||||
|
||||
cacheFonts :: Monad m => StateT CachedFonts m a -> m a
|
||||
cacheFonts = flip evalStateT Map.empty
|
||||
|
|
Loading…
Reference in a new issue