WIP: Use previous changes to start implementing font caching and text parsing (still very broken, doesn't compile)

This commit is contained in:
Tissevert 2019-09-24 18:38:12 +02:00
parent b8eb9e6856
commit f9e5683bf4
2 changed files with 93 additions and 52 deletions

View file

@ -2,29 +2,27 @@
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
import Codec.Compression.Zlib (decompress) import Codec.Compression.Zlib (decompress)
import Control.Monad.Reader (ask) import Control.Monad (foldM)
import Control.Monad.RWS (RWST(..), evalRWST) import Control.Monad.RWS (RWST(..), ask, evalRWST, get, modify)
import Control.Monad.Writer (tell) import Control.Monad.Writer (tell)
import Data.ByteString.Lazy.Char8 (ByteString) import Data.ByteString (ByteString)
import qualified Data.ByteString.Char8 as BS (readFile) import qualified Data.ByteString.Char8 as BS (readFile, putStrLn)
import qualified Data.ByteString.Lazy.Char8 as Lazy (concat, fromStrict, pack, putStrLn, toStrict) import qualified Data.ByteString.Lazy.Char8 as Lazy (fromStrict, toStrict)
import Data.Map ((!)) import Data.Map (Map, (!))
import qualified Data.Map as Map (empty, lookup) import qualified Data.Map as Map (empty, insert, lookup, toList)
import PDF (Document(..), parseDocument) import PDF (Document(..), parseDocument)
import qualified PDF.EOL as EOL (Style)
import PDF.Object ( import PDF.Object (
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..) Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
, Object(..), Name(..), Structure(..) , Object(..), Name(..), Structure(..)
,) ,)
import PDF.Output (ObjectId(..)) import PDF.Output (ObjectId)
import qualified PDF.Output as Output (render)
import PDF.Text (CMap, CMappers, PageContents(..), cMap, pageContents) import PDF.Text (CMap, CMappers, PageContents(..), cMap, pageContents)
import PDF.Update (unify) import PDF.Update (unify)
import System.Environment (getArgs) import System.Environment (getArgs)
import System.IO (hPutStrLn, stderr) import System.IO (hPutStrLn, stderr)
--type T a = ReaderT Content [] a type CachedCMaps = Map ObjectId CMap
type T a = RWST Content [ByteString] CMappers [] a type T a = RWST Content [ByteString] CachedCMaps [] a
list :: [a] -> T a list :: [a] -> T a
list l = RWST (\_ s -> fillContext s <$> l) list l = RWST (\_ s -> fillContext s <$> l)
@ -34,58 +32,73 @@ list l = RWST (\_ s -> fillContext s <$> l)
extractText :: Object -> T () extractText :: Object -> T ()
extractText object = do extractText object = do
pageDict <- dict object pageDict <- dict object
pageContents <- follow =<< get "Contents" pageDict contents <- follow =<< key "Contents" pageDict
case pageContents of case contents of
(Stream {header, streamContent}) -> do (Stream {header, streamContent}) -> do
font <- getFont pageDict font <- getFont pageDict
storeDecodedText font . clear header $ Lazy.fromStrict streamContent cMappers <- loadCMappers font
storeDecodedText cMappers $ clear header streamContent
_ -> return () _ -> return ()
clear :: Dictionary -> ByteString -> ByteString clear :: Dictionary -> ByteString -> ByteString
clear header streamContent = clear header streamContent =
case Map.lookup (Name "Filter") header of case Map.lookup (Name "Filter") header of
Just (NameObject (Name "FlateDecode")) -> decompress streamContent Just (NameObject (Name "FlateDecode")) ->
Lazy.toStrict . decompress . Lazy.fromStrict $ streamContent
_ -> streamContent _ -> streamContent
storeDecodedText :: Dictionary -> ByteString -> T () storeDecodedText :: CMappers -> ByteString -> T ()
storeDecodedText font page = storeDecodedText font page =
tell . chunks =<< pageContents font page case pageContents font page of
Left _ -> return ()
Right (PageContents {chunks}) -> tell chunks
getFont :: Dictionary -> T Dictionary getFont :: Dictionary -> T Dictionary
getFont pageDict = getFont pageDict =
get "Resources" pageDict key "Resources" pageDict
>>= dict . Direct >>= dict . Direct
>>= get "Font" >>= key "Font"
>>= follow >>= follow
>>= dict >>= dict
cache :: (ObjectId -> T CMap) -> ObjectId -> T CMap
cache loader objectId = do
loaded <- get
case Map.lookup objectId loaded of
Just value -> return value
Nothing -> do
value <- loader objectId
modify (Map.insert objectId value) >> return value
loadFont :: ObjectId -> T CMap
loadFont objectId =
getObject objectId
>>= dict
>>= key "ToUnicode"
>>= follow
loadCMappers :: Dictionary -> T CMappers
loadCMappers = foldM loadCMapper Map.empty . Map.toList
where
loadCMapper :: CMappers -> (Name, DirectObject) -> T CMappers
loadCMapper output (name, Reference (IndirectObjCoordinates {objectId})) = do
flip (Map.insert name) output <$> cache loadFont objectId
loadCMapper output _ = return output
getObject :: ObjectId -> T Object getObject :: ObjectId -> T Object
getObject objectId = do getObject objectId = do
content <- ask content <- ask
return (objects content ! objectId) return (objects content ! objectId)
get :: String -> Dictionary -> T DirectObject key :: String -> Dictionary -> T DirectObject
get key dictionary = key keyName dictionary =
case Map.lookup (Name key) dictionary of case Map.lookup (Name keyName) dictionary of
Just obj -> return obj Just obj -> return obj
_ -> list [] _ -> list []
follow :: DirectObject -> T Object follow :: DirectObject -> T Object
follow (Reference (IndirectObjCoordinates {objectId})) = getObject objectId follow (Reference (IndirectObjCoordinates {objectId})) = getObject objectId
follow _ = list [] follow _ = list []
{-
obj <- getObject objectId
case obj of
Direct directObj -> return directObj
_ -> list []
-}
{-
key dictionary =
case Map.lookup (Name key) dictionary of
Just (Reference (IndirectObjCoordinates {objectId})) -> getObject objectId
_ -> list []
-}
dict :: Object -> T Dictionary dict :: Object -> T Dictionary
dict (Direct (Dictionary dictionary)) = return dictionary dict (Direct (Dictionary dictionary)) = return dictionary
@ -93,8 +106,8 @@ dict _ = list []
pagesList :: T ObjectId pagesList :: T ObjectId
pagesList = do pagesList = do
root <- dict =<< follow =<< get "Root" . trailer . docStructure =<< ask root <- dict =<< follow =<< key "Root" . trailer . docStructure =<< ask
pages <- dict =<< follow =<< get "Pages" root pages <- dict =<< follow =<< key "Pages" root
case Map.lookup (Name "Kids") pages of case Map.lookup (Name "Kids") pages of
Just (Array kids) -> list $ filterObjectIds kids Just (Array kids) -> list $ filterObjectIds kids
_ -> list [] _ -> list []
@ -121,4 +134,4 @@ main = do
result <- parseDocument <$> BS.readFile inputFile result <- parseDocument <$> BS.readFile inputFile
case result of case result of
Left parseError -> hPutStrLn stderr $ show parseError Left parseError -> hPutStrLn stderr $ show parseError
Right doc -> mapM_ Lazy.putStrLn $ listTextObjects doc Right doc -> mapM_ BS.putStrLn $ listTextObjects doc

View file

@ -1,4 +1,3 @@
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
module PDF.Text ( module PDF.Text (
CMap CMap
@ -9,16 +8,23 @@ module PDF.Text (
) where ) where
import Control.Applicative ((<|>)) import Control.Applicative ((<|>))
import Control.Monad.State (MonadState) import Control.Monad (join)
import Data.ByteString.Lazy.Char8 (ByteString) import Control.Monad.Reader (ReaderT, runReaderT)
import Control.Monad.State (state)
import Data.Attoparsec.ByteString.Char8 (count, sepBy)
import Data.ByteString.Char8 (ByteString)
import Data.Map (Map) import Data.Map (Map)
import PDF.Object (Dictionary, blank, name, regular) import qualified Data.Map as Map (empty)
import PDF.Object (Content, Name, blank, name, regular)
import PDF.Output (ObjectId) import PDF.Output (ObjectId)
import PDF.Parser (Parser, count, runParser, sepBy, string, takeAll) import PDF.Parser (Parser, evalParser, string, takeAll)
type CMappers = Map ObjectId CMap type CMappers = Map Name CMap
type CMap = Map Int ByteString type CMap = Map Int ByteString
emptyCMap :: CMap
emptyCMap = Map.empty
data TextOperator = TJ | Tj | Tf | Other data TextOperator = TJ | Tj | Tf | Other
cMap :: ByteString -> CMap cMap :: ByteString -> CMap
@ -28,18 +34,40 @@ data PageContents = PageContents {
chunks :: [ByteString] chunks :: [ByteString]
} }
pageContents :: MonadState CMappers m => Dictionary -> ByteString -> m (Either String PageContents) type ParserWithFont = ReaderT CMappers (Parser CMap)
pageContents font = runParser page
page :: Parser u PageContents {-
page = PageContents <$> (graphicState <|> text) data FontContext = FontContext {
cMappers :: CMappers
, currentFont :: CMap
}
initFontContext cMappers = FontContext {
cMappers
, currentFont = emptyCMap
}
-}
pageContents :: CMappers -> ByteString -> Either String PageContents
pageContents font input =
evalParser (runReaderT (PageContents <$> page) font) emptyCMap input
page :: ParserWithFont [ByteString]
page = graphicState <|> text
graphicState :: ParserWithFont [ByteString]
graphicState = graphicState =
string "q" *> blank *> (command <|> page) `sepBy` blank <* string "Q" string "q" *> blank *> insideQ <* string "Q"
where where
insideQ = join <$> (command <|> page `sepBy` blank )
command = command =
count 6 argument *> string "cm" count 6 argument *> string "cm" *> return []
<|> name *> blank *> string "gs" <|> name *> blank *> string "gs" *> return []
argument = takeAll regular <* blank argument = takeAll regular <* blank
text :: ParserWithFont [ByteString]
text = undefined
textOperator :: ParserWithFont TextOperator
textOperator = undefined