Disable text rendering and font loading from the Page abstraction, this code will have to be moved into a separate Box instance

This commit is contained in:
Tissevert 2020-03-05 17:40:58 +01:00
parent 50ac0692b2
commit 90348c57d6

View file

@ -23,6 +23,7 @@ import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
import Data.Text (Text) import Data.Text (Text)
import PDF.Box (Box(..)) import PDF.Box (Box(..))
import PDF.CMap (cMap) import PDF.CMap (cMap)
import PDF.Content (Content(..))
import qualified PDF.Content as Content (parse) import qualified PDF.Content as Content (parse)
import PDF.Content.Text (renderText) import PDF.Content.Text (renderText)
import PDF.Encoding (encoding) import PDF.Encoding (encoding)
@ -43,10 +44,11 @@ import Text.Printf (printf)
type CachedFonts = Map ObjectId Font type CachedFonts = Map ObjectId Font
type FontCache m = (MonadState CachedFonts m, PDFContent m) type FontCache m = (MonadState CachedFonts m, PDFContent m)
data Page = Page { data Page = Page {
contents :: [Text] contents :: [Content]
, source :: ObjectId , source :: ObjectId
} }
{-
getFontDictionary :: PDFContent m => Object -> m Dictionary getFontDictionary :: PDFContent m => Object -> m Dictionary
getFontDictionary pageObj = getFontDictionary pageObj =
(pageObj >// ["Resources", "Font"] >>= getDictionary) (pageObj >// ["Resources", "Font"] >>= getDictionary)
@ -80,10 +82,7 @@ loadFonts = foldM addFont Map.empty . Map.toList
addFont output (name, Reference (IndirectObjCoordinates {objectId})) = addFont output (name, Reference (IndirectObjCoordinates {objectId})) =
flip (Map.insert name) output <$> cache loadFont objectId flip (Map.insert name) output <$> cache loadFont objectId
addFont output _ = return output addFont output _ = return output
-}
several :: PDFContent m => Object -> m [Object]
several (Direct (Array l)) = mapM castObject l
several object = return [object]
pagesList :: PDFContent m => m [ObjectId] pagesList :: PDFContent m => m [ObjectId]
pagesList = do pagesList = do
@ -99,20 +98,19 @@ getReferences objects = do
Reference (IndirectObjCoordinates {objectId}) -> [objectId] Reference (IndirectObjCoordinates {objectId}) -> [objectId]
_ -> [] _ -> []
extractText :: FontCache m => Object -> m [Text] extractContent :: FontCache m => Object -> m [Content]
extractText pageObj = do extractContent pageObj = do
fonts <- loadFonts =<< getFontDictionary pageObj --fonts <- loadFonts =<< getFontDictionary pageObj
objects <- pageObj >./ "Contents" >>= several pageObj >./ "Contents" >>= several >>= mapM loadContent
concat <$> mapM (loadContent fonts) objects
where where
loadContent fonts object = loadContent object = r Clear object >>= (either fail return . Content.parse)
r Clear object several (Direct (Array l)) = mapM castObject l
>>= (either fail return . Content.parse) several object = return [object]
>>= renderText fonts -- >>= renderText fonts
loadPage :: FontCache m => ObjectId -> m Page loadPage :: FontCache m => ObjectId -> m Page
loadPage source = do loadPage source = do
contents <- extractText =<< objectById source contents <- extractContent =<< objectById source
return $ Page {contents, source} return $ Page {contents, source}
data Pages = Pages data Pages = Pages