From f9e5683bf473a88789f3af663e6ef49f23882d3d Mon Sep 17 00:00:00 2001 From: Tissevert Date: Tue, 24 Sep 2019 18:38:12 +0200 Subject: [PATCH] WIP: Use previous changes to start implementing font caching and text parsing (still very broken, doesn't compile) --- examples/getText.hs | 91 ++++++++++++++++++++++++++------------------- src/PDF/Text.hs | 54 ++++++++++++++++++++------- 2 files changed, 93 insertions(+), 52 deletions(-) diff --git a/examples/getText.hs b/examples/getText.hs index 76c4347..d8a82bc 100755 --- a/examples/getText.hs +++ b/examples/getText.hs @@ -2,29 +2,27 @@ {-# LANGUAGE OverloadedStrings #-} import Codec.Compression.Zlib (decompress) -import Control.Monad.Reader (ask) -import Control.Monad.RWS (RWST(..), evalRWST) +import Control.Monad (foldM) +import Control.Monad.RWS (RWST(..), ask, evalRWST, get, modify) import Control.Monad.Writer (tell) -import Data.ByteString.Lazy.Char8 (ByteString) -import qualified Data.ByteString.Char8 as BS (readFile) -import qualified Data.ByteString.Lazy.Char8 as Lazy (concat, fromStrict, pack, putStrLn, toStrict) -import Data.Map ((!)) -import qualified Data.Map as Map (empty, lookup) +import Data.ByteString (ByteString) +import qualified Data.ByteString.Char8 as BS (readFile, putStrLn) +import qualified Data.ByteString.Lazy.Char8 as Lazy (fromStrict, toStrict) +import Data.Map (Map, (!)) +import qualified Data.Map as Map (empty, insert, lookup, toList) import PDF (Document(..), parseDocument) -import qualified PDF.EOL as EOL (Style) import PDF.Object ( Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..) , Object(..), Name(..), Structure(..) ,) -import PDF.Output (ObjectId(..)) -import qualified PDF.Output as Output (render) +import PDF.Output (ObjectId) import PDF.Text (CMap, CMappers, PageContents(..), cMap, pageContents) import PDF.Update (unify) import System.Environment (getArgs) import System.IO (hPutStrLn, stderr) ---type T a = ReaderT Content [] a -type T a = RWST Content [ByteString] CMappers [] a +type CachedCMaps = Map ObjectId CMap +type T a = RWST Content [ByteString] CachedCMaps [] a list :: [a] -> T a list l = RWST (\_ s -> fillContext s <$> l) @@ -34,58 +32,73 @@ list l = RWST (\_ s -> fillContext s <$> l) extractText :: Object -> T () extractText object = do pageDict <- dict object - pageContents <- follow =<< get "Contents" pageDict - case pageContents of + contents <- follow =<< key "Contents" pageDict + case contents of (Stream {header, streamContent}) -> do font <- getFont pageDict - storeDecodedText font . clear header $ Lazy.fromStrict streamContent + cMappers <- loadCMappers font + storeDecodedText cMappers $ clear header streamContent _ -> return () clear :: Dictionary -> ByteString -> ByteString clear header streamContent = case Map.lookup (Name "Filter") header of - Just (NameObject (Name "FlateDecode")) -> decompress streamContent + Just (NameObject (Name "FlateDecode")) -> + Lazy.toStrict . decompress . Lazy.fromStrict $ streamContent _ -> streamContent -storeDecodedText :: Dictionary -> ByteString -> T () +storeDecodedText :: CMappers -> ByteString -> T () storeDecodedText font page = - tell . chunks =<< pageContents font page + case pageContents font page of + Left _ -> return () + Right (PageContents {chunks}) -> tell chunks getFont :: Dictionary -> T Dictionary getFont pageDict = - get "Resources" pageDict + key "Resources" pageDict >>= dict . Direct - >>= get "Font" + >>= key "Font" >>= follow >>= dict +cache :: (ObjectId -> T CMap) -> ObjectId -> T CMap +cache loader objectId = do + loaded <- get + case Map.lookup objectId loaded of + Just value -> return value + Nothing -> do + value <- loader objectId + modify (Map.insert objectId value) >> return value + +loadFont :: ObjectId -> T CMap +loadFont objectId = + getObject objectId + >>= dict + >>= key "ToUnicode" + >>= follow + +loadCMappers :: Dictionary -> T CMappers +loadCMappers = foldM loadCMapper Map.empty . Map.toList + where + loadCMapper :: CMappers -> (Name, DirectObject) -> T CMappers + loadCMapper output (name, Reference (IndirectObjCoordinates {objectId})) = do + flip (Map.insert name) output <$> cache loadFont objectId + loadCMapper output _ = return output + getObject :: ObjectId -> T Object getObject objectId = do content <- ask return (objects content ! objectId) -get :: String -> Dictionary -> T DirectObject -get key dictionary = - case Map.lookup (Name key) dictionary of +key :: String -> Dictionary -> T DirectObject +key keyName dictionary = + case Map.lookup (Name keyName) dictionary of Just obj -> return obj _ -> list [] follow :: DirectObject -> T Object follow (Reference (IndirectObjCoordinates {objectId})) = getObject objectId follow _ = list [] -{- - obj <- getObject objectId - case obj of - Direct directObj -> return directObj - _ -> list [] --} - -{- -key dictionary = - case Map.lookup (Name key) dictionary of - Just (Reference (IndirectObjCoordinates {objectId})) -> getObject objectId - _ -> list [] --} dict :: Object -> T Dictionary dict (Direct (Dictionary dictionary)) = return dictionary @@ -93,8 +106,8 @@ dict _ = list [] pagesList :: T ObjectId pagesList = do - root <- dict =<< follow =<< get "Root" . trailer . docStructure =<< ask - pages <- dict =<< follow =<< get "Pages" root + root <- dict =<< follow =<< key "Root" . trailer . docStructure =<< ask + pages <- dict =<< follow =<< key "Pages" root case Map.lookup (Name "Kids") pages of Just (Array kids) -> list $ filterObjectIds kids _ -> list [] @@ -121,4 +134,4 @@ main = do result <- parseDocument <$> BS.readFile inputFile case result of Left parseError -> hPutStrLn stderr $ show parseError - Right doc -> mapM_ Lazy.putStrLn $ listTextObjects doc + Right doc -> mapM_ BS.putStrLn $ listTextObjects doc diff --git a/src/PDF/Text.hs b/src/PDF/Text.hs index 022e628..ef635d6 100644 --- a/src/PDF/Text.hs +++ b/src/PDF/Text.hs @@ -1,4 +1,3 @@ -{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE OverloadedStrings #-} module PDF.Text ( CMap @@ -9,16 +8,23 @@ module PDF.Text ( ) where import Control.Applicative ((<|>)) -import Control.Monad.State (MonadState) -import Data.ByteString.Lazy.Char8 (ByteString) +import Control.Monad (join) +import Control.Monad.Reader (ReaderT, runReaderT) +import Control.Monad.State (state) +import Data.Attoparsec.ByteString.Char8 (count, sepBy) +import Data.ByteString.Char8 (ByteString) import Data.Map (Map) -import PDF.Object (Dictionary, blank, name, regular) +import qualified Data.Map as Map (empty) +import PDF.Object (Content, Name, blank, name, regular) import PDF.Output (ObjectId) -import PDF.Parser (Parser, count, runParser, sepBy, string, takeAll) +import PDF.Parser (Parser, evalParser, string, takeAll) -type CMappers = Map ObjectId CMap +type CMappers = Map Name CMap type CMap = Map Int ByteString +emptyCMap :: CMap +emptyCMap = Map.empty + data TextOperator = TJ | Tj | Tf | Other cMap :: ByteString -> CMap @@ -28,18 +34,40 @@ data PageContents = PageContents { chunks :: [ByteString] } -pageContents :: MonadState CMappers m => Dictionary -> ByteString -> m (Either String PageContents) -pageContents font = runParser page +type ParserWithFont = ReaderT CMappers (Parser CMap) -page :: Parser u PageContents -page = PageContents <$> (graphicState <|> text) +{- +data FontContext = FontContext { + cMappers :: CMappers + , currentFont :: CMap + } +initFontContext cMappers = FontContext { + cMappers + , currentFont = emptyCMap + } +-} + +pageContents :: CMappers -> ByteString -> Either String PageContents +pageContents font input = + evalParser (runReaderT (PageContents <$> page) font) emptyCMap input + +page :: ParserWithFont [ByteString] +page = graphicState <|> text + +graphicState :: ParserWithFont [ByteString] graphicState = - string "q" *> blank *> (command <|> page) `sepBy` blank <* string "Q" + string "q" *> blank *> insideQ <* string "Q" where + insideQ = join <$> (command <|> page `sepBy` blank ) command = - count 6 argument *> string "cm" - <|> name *> blank *> string "gs" + count 6 argument *> string "cm" *> return [] + <|> name *> blank *> string "gs" *> return [] argument = takeAll regular <* blank +text :: ParserWithFont [ByteString] +text = undefined + +textOperator :: ParserWithFont TextOperator +textOperator = undefined