WIP: Use previous changes to start implementing font caching and text parsing (still very broken, doesn't compile)

This commit is contained in:
Tissevert 2019-09-24 18:38:12 +02:00
parent b8eb9e6856
commit f9e5683bf4
2 changed files with 93 additions and 52 deletions

View file

@ -2,29 +2,27 @@
{-# LANGUAGE OverloadedStrings #-}
import Codec.Compression.Zlib (decompress)
import Control.Monad.Reader (ask)
import Control.Monad.RWS (RWST(..), evalRWST)
import Control.Monad (foldM)
import Control.Monad.RWS (RWST(..), ask, evalRWST, get, modify)
import Control.Monad.Writer (tell)
import Data.ByteString.Lazy.Char8 (ByteString)
import qualified Data.ByteString.Char8 as BS (readFile)
import qualified Data.ByteString.Lazy.Char8 as Lazy (concat, fromStrict, pack, putStrLn, toStrict)
import Data.Map ((!))
import qualified Data.Map as Map (empty, lookup)
import Data.ByteString (ByteString)
import qualified Data.ByteString.Char8 as BS (readFile, putStrLn)
import qualified Data.ByteString.Lazy.Char8 as Lazy (fromStrict, toStrict)
import Data.Map (Map, (!))
import qualified Data.Map as Map (empty, insert, lookup, toList)
import PDF (Document(..), parseDocument)
import qualified PDF.EOL as EOL (Style)
import PDF.Object (
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
, Object(..), Name(..), Structure(..)
,)
import PDF.Output (ObjectId(..))
import qualified PDF.Output as Output (render)
import PDF.Output (ObjectId)
import PDF.Text (CMap, CMappers, PageContents(..), cMap, pageContents)
import PDF.Update (unify)
import System.Environment (getArgs)
import System.IO (hPutStrLn, stderr)
--type T a = ReaderT Content [] a
type T a = RWST Content [ByteString] CMappers [] a
type CachedCMaps = Map ObjectId CMap
type T a = RWST Content [ByteString] CachedCMaps [] a
list :: [a] -> T a
list l = RWST (\_ s -> fillContext s <$> l)
@ -34,58 +32,73 @@ list l = RWST (\_ s -> fillContext s <$> l)
extractText :: Object -> T ()
extractText object = do
pageDict <- dict object
pageContents <- follow =<< get "Contents" pageDict
case pageContents of
contents <- follow =<< key "Contents" pageDict
case contents of
(Stream {header, streamContent}) -> do
font <- getFont pageDict
storeDecodedText font . clear header $ Lazy.fromStrict streamContent
cMappers <- loadCMappers font
storeDecodedText cMappers $ clear header streamContent
_ -> return ()
clear :: Dictionary -> ByteString -> ByteString
clear header streamContent =
case Map.lookup (Name "Filter") header of
Just (NameObject (Name "FlateDecode")) -> decompress streamContent
Just (NameObject (Name "FlateDecode")) ->
Lazy.toStrict . decompress . Lazy.fromStrict $ streamContent
_ -> streamContent
storeDecodedText :: Dictionary -> ByteString -> T ()
storeDecodedText :: CMappers -> ByteString -> T ()
storeDecodedText font page =
tell . chunks =<< pageContents font page
case pageContents font page of
Left _ -> return ()
Right (PageContents {chunks}) -> tell chunks
getFont :: Dictionary -> T Dictionary
getFont pageDict =
get "Resources" pageDict
key "Resources" pageDict
>>= dict . Direct
>>= get "Font"
>>= key "Font"
>>= follow
>>= dict
cache :: (ObjectId -> T CMap) -> ObjectId -> T CMap
cache loader objectId = do
loaded <- get
case Map.lookup objectId loaded of
Just value -> return value
Nothing -> do
value <- loader objectId
modify (Map.insert objectId value) >> return value
loadFont :: ObjectId -> T CMap
loadFont objectId =
getObject objectId
>>= dict
>>= key "ToUnicode"
>>= follow
loadCMappers :: Dictionary -> T CMappers
loadCMappers = foldM loadCMapper Map.empty . Map.toList
where
loadCMapper :: CMappers -> (Name, DirectObject) -> T CMappers
loadCMapper output (name, Reference (IndirectObjCoordinates {objectId})) = do
flip (Map.insert name) output <$> cache loadFont objectId
loadCMapper output _ = return output
getObject :: ObjectId -> T Object
getObject objectId = do
content <- ask
return (objects content ! objectId)
get :: String -> Dictionary -> T DirectObject
get key dictionary =
case Map.lookup (Name key) dictionary of
key :: String -> Dictionary -> T DirectObject
key keyName dictionary =
case Map.lookup (Name keyName) dictionary of
Just obj -> return obj
_ -> list []
follow :: DirectObject -> T Object
follow (Reference (IndirectObjCoordinates {objectId})) = getObject objectId
follow _ = list []
{-
obj <- getObject objectId
case obj of
Direct directObj -> return directObj
_ -> list []
-}
{-
key dictionary =
case Map.lookup (Name key) dictionary of
Just (Reference (IndirectObjCoordinates {objectId})) -> getObject objectId
_ -> list []
-}
dict :: Object -> T Dictionary
dict (Direct (Dictionary dictionary)) = return dictionary
@ -93,8 +106,8 @@ dict _ = list []
pagesList :: T ObjectId
pagesList = do
root <- dict =<< follow =<< get "Root" . trailer . docStructure =<< ask
pages <- dict =<< follow =<< get "Pages" root
root <- dict =<< follow =<< key "Root" . trailer . docStructure =<< ask
pages <- dict =<< follow =<< key "Pages" root
case Map.lookup (Name "Kids") pages of
Just (Array kids) -> list $ filterObjectIds kids
_ -> list []
@ -121,4 +134,4 @@ main = do
result <- parseDocument <$> BS.readFile inputFile
case result of
Left parseError -> hPutStrLn stderr $ show parseError
Right doc -> mapM_ Lazy.putStrLn $ listTextObjects doc
Right doc -> mapM_ BS.putStrLn $ listTextObjects doc

View file

@ -1,4 +1,3 @@
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE OverloadedStrings #-}
module PDF.Text (
CMap
@ -9,16 +8,23 @@ module PDF.Text (
) where
import Control.Applicative ((<|>))
import Control.Monad.State (MonadState)
import Data.ByteString.Lazy.Char8 (ByteString)
import Control.Monad (join)
import Control.Monad.Reader (ReaderT, runReaderT)
import Control.Monad.State (state)
import Data.Attoparsec.ByteString.Char8 (count, sepBy)
import Data.ByteString.Char8 (ByteString)
import Data.Map (Map)
import PDF.Object (Dictionary, blank, name, regular)
import qualified Data.Map as Map (empty)
import PDF.Object (Content, Name, blank, name, regular)
import PDF.Output (ObjectId)
import PDF.Parser (Parser, count, runParser, sepBy, string, takeAll)
import PDF.Parser (Parser, evalParser, string, takeAll)
type CMappers = Map ObjectId CMap
type CMappers = Map Name CMap
type CMap = Map Int ByteString
emptyCMap :: CMap
emptyCMap = Map.empty
data TextOperator = TJ | Tj | Tf | Other
cMap :: ByteString -> CMap
@ -28,18 +34,40 @@ data PageContents = PageContents {
chunks :: [ByteString]
}
pageContents :: MonadState CMappers m => Dictionary -> ByteString -> m (Either String PageContents)
pageContents font = runParser page
type ParserWithFont = ReaderT CMappers (Parser CMap)
page :: Parser u PageContents
page = PageContents <$> (graphicState <|> text)
{-
data FontContext = FontContext {
cMappers :: CMappers
, currentFont :: CMap
}
initFontContext cMappers = FontContext {
cMappers
, currentFont = emptyCMap
}
-}
pageContents :: CMappers -> ByteString -> Either String PageContents
pageContents font input =
evalParser (runReaderT (PageContents <$> page) font) emptyCMap input
page :: ParserWithFont [ByteString]
page = graphicState <|> text
graphicState :: ParserWithFont [ByteString]
graphicState =
string "q" *> blank *> (command <|> page) `sepBy` blank <* string "Q"
string "q" *> blank *> insideQ <* string "Q"
where
insideQ = join <$> (command <|> page `sepBy` blank )
command =
count 6 argument *> string "cm"
<|> name *> blank *> string "gs"
count 6 argument *> string "cm" *> return []
<|> name *> blank *> string "gs" *> return []
argument = takeAll regular <* blank
text :: ParserWithFont [ByteString]
text = undefined
textOperator :: ParserWithFont TextOperator
textOperator = undefined