WIP: Use previous changes to start implementing font caching and text parsing (still very broken, doesn't compile)
This commit is contained in:
parent
b8eb9e6856
commit
f9e5683bf4
2 changed files with 93 additions and 52 deletions
|
@ -2,29 +2,27 @@
|
|||
{-# LANGUAGE OverloadedStrings #-}
|
||||
|
||||
import Codec.Compression.Zlib (decompress)
|
||||
import Control.Monad.Reader (ask)
|
||||
import Control.Monad.RWS (RWST(..), evalRWST)
|
||||
import Control.Monad (foldM)
|
||||
import Control.Monad.RWS (RWST(..), ask, evalRWST, get, modify)
|
||||
import Control.Monad.Writer (tell)
|
||||
import Data.ByteString.Lazy.Char8 (ByteString)
|
||||
import qualified Data.ByteString.Char8 as BS (readFile)
|
||||
import qualified Data.ByteString.Lazy.Char8 as Lazy (concat, fromStrict, pack, putStrLn, toStrict)
|
||||
import Data.Map ((!))
|
||||
import qualified Data.Map as Map (empty, lookup)
|
||||
import Data.ByteString (ByteString)
|
||||
import qualified Data.ByteString.Char8 as BS (readFile, putStrLn)
|
||||
import qualified Data.ByteString.Lazy.Char8 as Lazy (fromStrict, toStrict)
|
||||
import Data.Map (Map, (!))
|
||||
import qualified Data.Map as Map (empty, insert, lookup, toList)
|
||||
import PDF (Document(..), parseDocument)
|
||||
import qualified PDF.EOL as EOL (Style)
|
||||
import PDF.Object (
|
||||
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
|
||||
, Object(..), Name(..), Structure(..)
|
||||
,)
|
||||
import PDF.Output (ObjectId(..))
|
||||
import qualified PDF.Output as Output (render)
|
||||
import PDF.Output (ObjectId)
|
||||
import PDF.Text (CMap, CMappers, PageContents(..), cMap, pageContents)
|
||||
import PDF.Update (unify)
|
||||
import System.Environment (getArgs)
|
||||
import System.IO (hPutStrLn, stderr)
|
||||
|
||||
--type T a = ReaderT Content [] a
|
||||
type T a = RWST Content [ByteString] CMappers [] a
|
||||
type CachedCMaps = Map ObjectId CMap
|
||||
type T a = RWST Content [ByteString] CachedCMaps [] a
|
||||
|
||||
list :: [a] -> T a
|
||||
list l = RWST (\_ s -> fillContext s <$> l)
|
||||
|
@ -34,58 +32,73 @@ list l = RWST (\_ s -> fillContext s <$> l)
|
|||
extractText :: Object -> T ()
|
||||
extractText object = do
|
||||
pageDict <- dict object
|
||||
pageContents <- follow =<< get "Contents" pageDict
|
||||
case pageContents of
|
||||
contents <- follow =<< key "Contents" pageDict
|
||||
case contents of
|
||||
(Stream {header, streamContent}) -> do
|
||||
font <- getFont pageDict
|
||||
storeDecodedText font . clear header $ Lazy.fromStrict streamContent
|
||||
cMappers <- loadCMappers font
|
||||
storeDecodedText cMappers $ clear header streamContent
|
||||
_ -> return ()
|
||||
|
||||
clear :: Dictionary -> ByteString -> ByteString
|
||||
clear header streamContent =
|
||||
case Map.lookup (Name "Filter") header of
|
||||
Just (NameObject (Name "FlateDecode")) -> decompress streamContent
|
||||
Just (NameObject (Name "FlateDecode")) ->
|
||||
Lazy.toStrict . decompress . Lazy.fromStrict $ streamContent
|
||||
_ -> streamContent
|
||||
|
||||
storeDecodedText :: Dictionary -> ByteString -> T ()
|
||||
storeDecodedText :: CMappers -> ByteString -> T ()
|
||||
storeDecodedText font page =
|
||||
tell . chunks =<< pageContents font page
|
||||
case pageContents font page of
|
||||
Left _ -> return ()
|
||||
Right (PageContents {chunks}) -> tell chunks
|
||||
|
||||
getFont :: Dictionary -> T Dictionary
|
||||
getFont pageDict =
|
||||
get "Resources" pageDict
|
||||
key "Resources" pageDict
|
||||
>>= dict . Direct
|
||||
>>= get "Font"
|
||||
>>= key "Font"
|
||||
>>= follow
|
||||
>>= dict
|
||||
|
||||
cache :: (ObjectId -> T CMap) -> ObjectId -> T CMap
|
||||
cache loader objectId = do
|
||||
loaded <- get
|
||||
case Map.lookup objectId loaded of
|
||||
Just value -> return value
|
||||
Nothing -> do
|
||||
value <- loader objectId
|
||||
modify (Map.insert objectId value) >> return value
|
||||
|
||||
loadFont :: ObjectId -> T CMap
|
||||
loadFont objectId =
|
||||
getObject objectId
|
||||
>>= dict
|
||||
>>= key "ToUnicode"
|
||||
>>= follow
|
||||
|
||||
loadCMappers :: Dictionary -> T CMappers
|
||||
loadCMappers = foldM loadCMapper Map.empty . Map.toList
|
||||
where
|
||||
loadCMapper :: CMappers -> (Name, DirectObject) -> T CMappers
|
||||
loadCMapper output (name, Reference (IndirectObjCoordinates {objectId})) = do
|
||||
flip (Map.insert name) output <$> cache loadFont objectId
|
||||
loadCMapper output _ = return output
|
||||
|
||||
getObject :: ObjectId -> T Object
|
||||
getObject objectId = do
|
||||
content <- ask
|
||||
return (objects content ! objectId)
|
||||
|
||||
get :: String -> Dictionary -> T DirectObject
|
||||
get key dictionary =
|
||||
case Map.lookup (Name key) dictionary of
|
||||
key :: String -> Dictionary -> T DirectObject
|
||||
key keyName dictionary =
|
||||
case Map.lookup (Name keyName) dictionary of
|
||||
Just obj -> return obj
|
||||
_ -> list []
|
||||
|
||||
follow :: DirectObject -> T Object
|
||||
follow (Reference (IndirectObjCoordinates {objectId})) = getObject objectId
|
||||
follow _ = list []
|
||||
{-
|
||||
obj <- getObject objectId
|
||||
case obj of
|
||||
Direct directObj -> return directObj
|
||||
_ -> list []
|
||||
-}
|
||||
|
||||
{-
|
||||
key dictionary =
|
||||
case Map.lookup (Name key) dictionary of
|
||||
Just (Reference (IndirectObjCoordinates {objectId})) -> getObject objectId
|
||||
_ -> list []
|
||||
-}
|
||||
|
||||
dict :: Object -> T Dictionary
|
||||
dict (Direct (Dictionary dictionary)) = return dictionary
|
||||
|
@ -93,8 +106,8 @@ dict _ = list []
|
|||
|
||||
pagesList :: T ObjectId
|
||||
pagesList = do
|
||||
root <- dict =<< follow =<< get "Root" . trailer . docStructure =<< ask
|
||||
pages <- dict =<< follow =<< get "Pages" root
|
||||
root <- dict =<< follow =<< key "Root" . trailer . docStructure =<< ask
|
||||
pages <- dict =<< follow =<< key "Pages" root
|
||||
case Map.lookup (Name "Kids") pages of
|
||||
Just (Array kids) -> list $ filterObjectIds kids
|
||||
_ -> list []
|
||||
|
@ -121,4 +134,4 @@ main = do
|
|||
result <- parseDocument <$> BS.readFile inputFile
|
||||
case result of
|
||||
Left parseError -> hPutStrLn stderr $ show parseError
|
||||
Right doc -> mapM_ Lazy.putStrLn $ listTextObjects doc
|
||||
Right doc -> mapM_ BS.putStrLn $ listTextObjects doc
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
{-# LANGUAGE FlexibleContexts #-}
|
||||
{-# LANGUAGE OverloadedStrings #-}
|
||||
module PDF.Text (
|
||||
CMap
|
||||
|
@ -9,16 +8,23 @@ module PDF.Text (
|
|||
) where
|
||||
|
||||
import Control.Applicative ((<|>))
|
||||
import Control.Monad.State (MonadState)
|
||||
import Data.ByteString.Lazy.Char8 (ByteString)
|
||||
import Control.Monad (join)
|
||||
import Control.Monad.Reader (ReaderT, runReaderT)
|
||||
import Control.Monad.State (state)
|
||||
import Data.Attoparsec.ByteString.Char8 (count, sepBy)
|
||||
import Data.ByteString.Char8 (ByteString)
|
||||
import Data.Map (Map)
|
||||
import PDF.Object (Dictionary, blank, name, regular)
|
||||
import qualified Data.Map as Map (empty)
|
||||
import PDF.Object (Content, Name, blank, name, regular)
|
||||
import PDF.Output (ObjectId)
|
||||
import PDF.Parser (Parser, count, runParser, sepBy, string, takeAll)
|
||||
import PDF.Parser (Parser, evalParser, string, takeAll)
|
||||
|
||||
type CMappers = Map ObjectId CMap
|
||||
type CMappers = Map Name CMap
|
||||
type CMap = Map Int ByteString
|
||||
|
||||
emptyCMap :: CMap
|
||||
emptyCMap = Map.empty
|
||||
|
||||
data TextOperator = TJ | Tj | Tf | Other
|
||||
|
||||
cMap :: ByteString -> CMap
|
||||
|
@ -28,18 +34,40 @@ data PageContents = PageContents {
|
|||
chunks :: [ByteString]
|
||||
}
|
||||
|
||||
pageContents :: MonadState CMappers m => Dictionary -> ByteString -> m (Either String PageContents)
|
||||
pageContents font = runParser page
|
||||
type ParserWithFont = ReaderT CMappers (Parser CMap)
|
||||
|
||||
page :: Parser u PageContents
|
||||
page = PageContents <$> (graphicState <|> text)
|
||||
{-
|
||||
data FontContext = FontContext {
|
||||
cMappers :: CMappers
|
||||
, currentFont :: CMap
|
||||
}
|
||||
|
||||
initFontContext cMappers = FontContext {
|
||||
cMappers
|
||||
, currentFont = emptyCMap
|
||||
}
|
||||
-}
|
||||
|
||||
pageContents :: CMappers -> ByteString -> Either String PageContents
|
||||
pageContents font input =
|
||||
evalParser (runReaderT (PageContents <$> page) font) emptyCMap input
|
||||
|
||||
page :: ParserWithFont [ByteString]
|
||||
page = graphicState <|> text
|
||||
|
||||
graphicState :: ParserWithFont [ByteString]
|
||||
graphicState =
|
||||
string "q" *> blank *> (command <|> page) `sepBy` blank <* string "Q"
|
||||
string "q" *> blank *> insideQ <* string "Q"
|
||||
where
|
||||
insideQ = join <$> (command <|> page `sepBy` blank )
|
||||
command =
|
||||
count 6 argument *> string "cm"
|
||||
<|> name *> blank *> string "gs"
|
||||
count 6 argument *> string "cm" *> return []
|
||||
<|> name *> blank *> string "gs" *> return []
|
||||
argument = takeAll regular <* blank
|
||||
|
||||
text :: ParserWithFont [ByteString]
|
||||
text = undefined
|
||||
|
||||
textOperator :: ParserWithFont TextOperator
|
||||
textOperator = undefined
|
||||
|
||||
|
|
Loading…
Reference in a new issue