Try connecting all the Box instance in the getText demo, try to encode pages contents with a simple assoc list

This commit is contained in:
Tissevert 2020-03-10 22:57:11 +01:00
parent a04adff1d2
commit 3b1a5152e4
3 changed files with 34 additions and 10 deletions

View file

@ -20,6 +20,7 @@ library
, PDF.Box
, PDF.CMap
, PDF.Content
, PDF.Content.Text
, PDF.EOL
, PDF.Layer
, PDF.Object
@ -35,7 +36,6 @@ library
, PDF.Content.Operator.GraphicState
, PDF.Content.Operator.Path
, PDF.Content.Operator.Text
, PDF.Content.Text
, PDF.Encoding
, PDF.Encoding.MacRoman
, PDF.Body

View file

@ -1,17 +1,32 @@
import Control.Monad ((>=>))
import qualified Data.ByteString.Char8 as BS (readFile)
import qualified Data.Map as Map (toList)
import Data.Map (Map, foldlWithKey, mapWithKey)
import qualified Data.Map as Map (empty, insert, toList, union)
import qualified Data.Text as Text (unpack)
import qualified Data.Text.IO as Text (putStrLn)
import PDF (UnifiedLayers(..), parseDocument)
import PDF.Box (Box(..))
import PDF.Content.Text (Chunks(..))
import PDF.Layer (Layer)
import PDF.Pages (Page(..), PageNumber(..), Pages(..), cacheFonts)
import PDF.Pages (Contents(..), Page(..), PageNumber(..), Pages(..), cacheFonts)
import System.Environment (getArgs)
import System.Exit (die)
import System.IO (BufferMode(..), hSetBuffering, stdout)
import Text.Printf (printf)
displayPage :: Page -> IO ()
displayPage = mapM_ Text.putStrLn . contents
displayPage =
r Contents
>=> mapM (\(objectId, content) -> r Chunks content >>= return . mixIn objectId)
>=> fusion
>=> sequence_ . mapWithKey (\k v -> putStrLn $ printf "%s: %s" (show k) (Text.unpack v))
-- >=> mapM_ Text.putStrLn
where
mixIn :: (Ord k1, Ord k2) => k1 -> Map k2 v -> Map (k1, k2) v
mixIn prefix =
foldlWithKey (\m k v -> Map.insert (prefix, k) v m) Map.empty
fusion :: (Monad m, Ord a) => [Map a b] -> m (Map a b)
fusion = return . foldl Map.union Map.empty
getAll :: Layer -> IO ()
getAll layer = (cacheFonts $ r Pages layer) >>= mapM_ (displayPage . snd) . Map.toList

View file

@ -6,7 +6,8 @@
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE UndecidableInstances #-}
module PDF.Pages (
Page(..)
Contents(..)
, Page(..)
, PageNumber(..)
, Pages(..)
, cacheFonts
@ -18,13 +19,14 @@ import Control.Monad.Fail (MonadFail(..))
import Control.Monad.Reader (runReaderT)
import Control.Monad.State (MonadState, StateT(..), evalStateT, modify)
import qualified Control.Monad.RWS as RWS (get)
import Data.OrderedMap (OrderedMap, build)
--import Data.OrderedMap (OrderedMap, build)
import Data.Map (Map)
import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
import PDF.Box (Box(..))
import PDF.CMap (cMap)
import PDF.Content (Content(..))
import qualified PDF.Content as Content (parse)
import PDF.Content.Text (Chunks(..))
import PDF.Encoding (encoding)
import PDF.Font (Font, FontSet)
import PDF.Layer (Layer(..))
@ -43,12 +45,14 @@ import Text.Printf (printf)
type CachedFonts = Map ObjectId Font
type FontCache m = (MonadState CachedFonts m, PDFContent m)
data Page = Page {
contents :: OrderedMap ObjectId Content
contents :: [(ObjectId, Content)]
-- contents :: OrderedMap ObjectId Content
, resources :: Dictionary
, source :: ObjectId
}
loadContents :: PDFContent m => DirectObject -> m (OrderedMap ObjectId Content)
loadContents :: PDFContent m => DirectObject -> m [(ObjectId, Content)]
--loadContents :: PDFContent m => DirectObject -> m (OrderedMap ObjectId Content)
loadContents directObject =
sequenceA . build loadContent $ objectIds directObject
where
@ -56,12 +60,12 @@ loadContents directObject =
loadContent objectId = objectById objectId >>= r Clear >>= Content.parse
objectIds (Array l) = l >>= getReference
objectIds dirObj = getReference dirObj
build f = fmap (\k -> f k >>= \v -> return (k, v))
getFontDictionary :: PDFContent m => Object -> m Dictionary
getFontDictionary pageObj =
(pageObj >// ["Resources", "Font"] >>= getDictionary) <|> return Map.empty
{-
cache :: FontCache m => (ObjectId -> m Font) -> ObjectId -> m Font
cache loader objectId =
(maybe load return . Map.lookup objectId) =<< RWS.get
@ -90,7 +94,6 @@ loadResources = foldM addFont Map.empty . Map.toList
addFont output (name, Reference (IndirectObjCoordinates {objectId})) =
flip (Map.insert name) output <$> cache loadFont objectId
addFont output _ = return output
-}
getReference :: DirectObject -> [ObjectId]
getReference (Reference (IndirectObjCoordinates {objectId})) = [objectId]
@ -114,6 +117,7 @@ pagesList =
data Pages = Pages
newtype PageNumber = P Int
data Contents = Contents
data Text_ = Text_
instance (MonadFail m, Alternative m, MonadState CachedFonts m) => Box m Pages Layer (Map Int Page) where
r Pages layer = runReaderT (numbered <$> (mapM loadPage =<< pagesList)) layer
@ -129,5 +133,10 @@ instance (MonadFail m, Alternative m, MonadState CachedFonts m) => Box m PageNum
firstPage [] = fail "Page is out of bounds"
firstPage (p:_) = loadPage p
instance Monad m => Box m Contents Page [(ObjectId, Content)] where
--instance Monad m => Box m Contents Page (OrderedMap ObjectId Content) where
r Contents = return . contents
cacheFonts :: Monad m => StateT CachedFonts m a -> m a
cacheFonts = flip evalStateT Map.empty