WIP: Use previous changes to start implementing font caching and text parsing (still very broken, doesn't compile)
This commit is contained in:
parent
b8eb9e6856
commit
f9e5683bf4
2 changed files with 93 additions and 52 deletions
|
@ -2,29 +2,27 @@
|
||||||
{-# LANGUAGE OverloadedStrings #-}
|
{-# LANGUAGE OverloadedStrings #-}
|
||||||
|
|
||||||
import Codec.Compression.Zlib (decompress)
|
import Codec.Compression.Zlib (decompress)
|
||||||
import Control.Monad.Reader (ask)
|
import Control.Monad (foldM)
|
||||||
import Control.Monad.RWS (RWST(..), evalRWST)
|
import Control.Monad.RWS (RWST(..), ask, evalRWST, get, modify)
|
||||||
import Control.Monad.Writer (tell)
|
import Control.Monad.Writer (tell)
|
||||||
import Data.ByteString.Lazy.Char8 (ByteString)
|
import Data.ByteString (ByteString)
|
||||||
import qualified Data.ByteString.Char8 as BS (readFile)
|
import qualified Data.ByteString.Char8 as BS (readFile, putStrLn)
|
||||||
import qualified Data.ByteString.Lazy.Char8 as Lazy (concat, fromStrict, pack, putStrLn, toStrict)
|
import qualified Data.ByteString.Lazy.Char8 as Lazy (fromStrict, toStrict)
|
||||||
import Data.Map ((!))
|
import Data.Map (Map, (!))
|
||||||
import qualified Data.Map as Map (empty, lookup)
|
import qualified Data.Map as Map (empty, insert, lookup, toList)
|
||||||
import PDF (Document(..), parseDocument)
|
import PDF (Document(..), parseDocument)
|
||||||
import qualified PDF.EOL as EOL (Style)
|
|
||||||
import PDF.Object (
|
import PDF.Object (
|
||||||
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
|
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
|
||||||
, Object(..), Name(..), Structure(..)
|
, Object(..), Name(..), Structure(..)
|
||||||
,)
|
,)
|
||||||
import PDF.Output (ObjectId(..))
|
import PDF.Output (ObjectId)
|
||||||
import qualified PDF.Output as Output (render)
|
|
||||||
import PDF.Text (CMap, CMappers, PageContents(..), cMap, pageContents)
|
import PDF.Text (CMap, CMappers, PageContents(..), cMap, pageContents)
|
||||||
import PDF.Update (unify)
|
import PDF.Update (unify)
|
||||||
import System.Environment (getArgs)
|
import System.Environment (getArgs)
|
||||||
import System.IO (hPutStrLn, stderr)
|
import System.IO (hPutStrLn, stderr)
|
||||||
|
|
||||||
--type T a = ReaderT Content [] a
|
type CachedCMaps = Map ObjectId CMap
|
||||||
type T a = RWST Content [ByteString] CMappers [] a
|
type T a = RWST Content [ByteString] CachedCMaps [] a
|
||||||
|
|
||||||
list :: [a] -> T a
|
list :: [a] -> T a
|
||||||
list l = RWST (\_ s -> fillContext s <$> l)
|
list l = RWST (\_ s -> fillContext s <$> l)
|
||||||
|
@ -34,58 +32,73 @@ list l = RWST (\_ s -> fillContext s <$> l)
|
||||||
extractText :: Object -> T ()
|
extractText :: Object -> T ()
|
||||||
extractText object = do
|
extractText object = do
|
||||||
pageDict <- dict object
|
pageDict <- dict object
|
||||||
pageContents <- follow =<< get "Contents" pageDict
|
contents <- follow =<< key "Contents" pageDict
|
||||||
case pageContents of
|
case contents of
|
||||||
(Stream {header, streamContent}) -> do
|
(Stream {header, streamContent}) -> do
|
||||||
font <- getFont pageDict
|
font <- getFont pageDict
|
||||||
storeDecodedText font . clear header $ Lazy.fromStrict streamContent
|
cMappers <- loadCMappers font
|
||||||
|
storeDecodedText cMappers $ clear header streamContent
|
||||||
_ -> return ()
|
_ -> return ()
|
||||||
|
|
||||||
clear :: Dictionary -> ByteString -> ByteString
|
clear :: Dictionary -> ByteString -> ByteString
|
||||||
clear header streamContent =
|
clear header streamContent =
|
||||||
case Map.lookup (Name "Filter") header of
|
case Map.lookup (Name "Filter") header of
|
||||||
Just (NameObject (Name "FlateDecode")) -> decompress streamContent
|
Just (NameObject (Name "FlateDecode")) ->
|
||||||
|
Lazy.toStrict . decompress . Lazy.fromStrict $ streamContent
|
||||||
_ -> streamContent
|
_ -> streamContent
|
||||||
|
|
||||||
storeDecodedText :: Dictionary -> ByteString -> T ()
|
storeDecodedText :: CMappers -> ByteString -> T ()
|
||||||
storeDecodedText font page =
|
storeDecodedText font page =
|
||||||
tell . chunks =<< pageContents font page
|
case pageContents font page of
|
||||||
|
Left _ -> return ()
|
||||||
|
Right (PageContents {chunks}) -> tell chunks
|
||||||
|
|
||||||
getFont :: Dictionary -> T Dictionary
|
getFont :: Dictionary -> T Dictionary
|
||||||
getFont pageDict =
|
getFont pageDict =
|
||||||
get "Resources" pageDict
|
key "Resources" pageDict
|
||||||
>>= dict . Direct
|
>>= dict . Direct
|
||||||
>>= get "Font"
|
>>= key "Font"
|
||||||
>>= follow
|
>>= follow
|
||||||
>>= dict
|
>>= dict
|
||||||
|
|
||||||
|
cache :: (ObjectId -> T CMap) -> ObjectId -> T CMap
|
||||||
|
cache loader objectId = do
|
||||||
|
loaded <- get
|
||||||
|
case Map.lookup objectId loaded of
|
||||||
|
Just value -> return value
|
||||||
|
Nothing -> do
|
||||||
|
value <- loader objectId
|
||||||
|
modify (Map.insert objectId value) >> return value
|
||||||
|
|
||||||
|
loadFont :: ObjectId -> T CMap
|
||||||
|
loadFont objectId =
|
||||||
|
getObject objectId
|
||||||
|
>>= dict
|
||||||
|
>>= key "ToUnicode"
|
||||||
|
>>= follow
|
||||||
|
|
||||||
|
loadCMappers :: Dictionary -> T CMappers
|
||||||
|
loadCMappers = foldM loadCMapper Map.empty . Map.toList
|
||||||
|
where
|
||||||
|
loadCMapper :: CMappers -> (Name, DirectObject) -> T CMappers
|
||||||
|
loadCMapper output (name, Reference (IndirectObjCoordinates {objectId})) = do
|
||||||
|
flip (Map.insert name) output <$> cache loadFont objectId
|
||||||
|
loadCMapper output _ = return output
|
||||||
|
|
||||||
getObject :: ObjectId -> T Object
|
getObject :: ObjectId -> T Object
|
||||||
getObject objectId = do
|
getObject objectId = do
|
||||||
content <- ask
|
content <- ask
|
||||||
return (objects content ! objectId)
|
return (objects content ! objectId)
|
||||||
|
|
||||||
get :: String -> Dictionary -> T DirectObject
|
key :: String -> Dictionary -> T DirectObject
|
||||||
get key dictionary =
|
key keyName dictionary =
|
||||||
case Map.lookup (Name key) dictionary of
|
case Map.lookup (Name keyName) dictionary of
|
||||||
Just obj -> return obj
|
Just obj -> return obj
|
||||||
_ -> list []
|
_ -> list []
|
||||||
|
|
||||||
follow :: DirectObject -> T Object
|
follow :: DirectObject -> T Object
|
||||||
follow (Reference (IndirectObjCoordinates {objectId})) = getObject objectId
|
follow (Reference (IndirectObjCoordinates {objectId})) = getObject objectId
|
||||||
follow _ = list []
|
follow _ = list []
|
||||||
{-
|
|
||||||
obj <- getObject objectId
|
|
||||||
case obj of
|
|
||||||
Direct directObj -> return directObj
|
|
||||||
_ -> list []
|
|
||||||
-}
|
|
||||||
|
|
||||||
{-
|
|
||||||
key dictionary =
|
|
||||||
case Map.lookup (Name key) dictionary of
|
|
||||||
Just (Reference (IndirectObjCoordinates {objectId})) -> getObject objectId
|
|
||||||
_ -> list []
|
|
||||||
-}
|
|
||||||
|
|
||||||
dict :: Object -> T Dictionary
|
dict :: Object -> T Dictionary
|
||||||
dict (Direct (Dictionary dictionary)) = return dictionary
|
dict (Direct (Dictionary dictionary)) = return dictionary
|
||||||
|
@ -93,8 +106,8 @@ dict _ = list []
|
||||||
|
|
||||||
pagesList :: T ObjectId
|
pagesList :: T ObjectId
|
||||||
pagesList = do
|
pagesList = do
|
||||||
root <- dict =<< follow =<< get "Root" . trailer . docStructure =<< ask
|
root <- dict =<< follow =<< key "Root" . trailer . docStructure =<< ask
|
||||||
pages <- dict =<< follow =<< get "Pages" root
|
pages <- dict =<< follow =<< key "Pages" root
|
||||||
case Map.lookup (Name "Kids") pages of
|
case Map.lookup (Name "Kids") pages of
|
||||||
Just (Array kids) -> list $ filterObjectIds kids
|
Just (Array kids) -> list $ filterObjectIds kids
|
||||||
_ -> list []
|
_ -> list []
|
||||||
|
@ -121,4 +134,4 @@ main = do
|
||||||
result <- parseDocument <$> BS.readFile inputFile
|
result <- parseDocument <$> BS.readFile inputFile
|
||||||
case result of
|
case result of
|
||||||
Left parseError -> hPutStrLn stderr $ show parseError
|
Left parseError -> hPutStrLn stderr $ show parseError
|
||||||
Right doc -> mapM_ Lazy.putStrLn $ listTextObjects doc
|
Right doc -> mapM_ BS.putStrLn $ listTextObjects doc
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
{-# LANGUAGE FlexibleContexts #-}
|
|
||||||
{-# LANGUAGE OverloadedStrings #-}
|
{-# LANGUAGE OverloadedStrings #-}
|
||||||
module PDF.Text (
|
module PDF.Text (
|
||||||
CMap
|
CMap
|
||||||
|
@ -9,16 +8,23 @@ module PDF.Text (
|
||||||
) where
|
) where
|
||||||
|
|
||||||
import Control.Applicative ((<|>))
|
import Control.Applicative ((<|>))
|
||||||
import Control.Monad.State (MonadState)
|
import Control.Monad (join)
|
||||||
import Data.ByteString.Lazy.Char8 (ByteString)
|
import Control.Monad.Reader (ReaderT, runReaderT)
|
||||||
|
import Control.Monad.State (state)
|
||||||
|
import Data.Attoparsec.ByteString.Char8 (count, sepBy)
|
||||||
|
import Data.ByteString.Char8 (ByteString)
|
||||||
import Data.Map (Map)
|
import Data.Map (Map)
|
||||||
import PDF.Object (Dictionary, blank, name, regular)
|
import qualified Data.Map as Map (empty)
|
||||||
|
import PDF.Object (Content, Name, blank, name, regular)
|
||||||
import PDF.Output (ObjectId)
|
import PDF.Output (ObjectId)
|
||||||
import PDF.Parser (Parser, count, runParser, sepBy, string, takeAll)
|
import PDF.Parser (Parser, evalParser, string, takeAll)
|
||||||
|
|
||||||
type CMappers = Map ObjectId CMap
|
type CMappers = Map Name CMap
|
||||||
type CMap = Map Int ByteString
|
type CMap = Map Int ByteString
|
||||||
|
|
||||||
|
emptyCMap :: CMap
|
||||||
|
emptyCMap = Map.empty
|
||||||
|
|
||||||
data TextOperator = TJ | Tj | Tf | Other
|
data TextOperator = TJ | Tj | Tf | Other
|
||||||
|
|
||||||
cMap :: ByteString -> CMap
|
cMap :: ByteString -> CMap
|
||||||
|
@ -28,18 +34,40 @@ data PageContents = PageContents {
|
||||||
chunks :: [ByteString]
|
chunks :: [ByteString]
|
||||||
}
|
}
|
||||||
|
|
||||||
pageContents :: MonadState CMappers m => Dictionary -> ByteString -> m (Either String PageContents)
|
type ParserWithFont = ReaderT CMappers (Parser CMap)
|
||||||
pageContents font = runParser page
|
|
||||||
|
|
||||||
page :: Parser u PageContents
|
{-
|
||||||
page = PageContents <$> (graphicState <|> text)
|
data FontContext = FontContext {
|
||||||
|
cMappers :: CMappers
|
||||||
|
, currentFont :: CMap
|
||||||
|
}
|
||||||
|
|
||||||
|
initFontContext cMappers = FontContext {
|
||||||
|
cMappers
|
||||||
|
, currentFont = emptyCMap
|
||||||
|
}
|
||||||
|
-}
|
||||||
|
|
||||||
|
pageContents :: CMappers -> ByteString -> Either String PageContents
|
||||||
|
pageContents font input =
|
||||||
|
evalParser (runReaderT (PageContents <$> page) font) emptyCMap input
|
||||||
|
|
||||||
|
page :: ParserWithFont [ByteString]
|
||||||
|
page = graphicState <|> text
|
||||||
|
|
||||||
|
graphicState :: ParserWithFont [ByteString]
|
||||||
graphicState =
|
graphicState =
|
||||||
string "q" *> blank *> (command <|> page) `sepBy` blank <* string "Q"
|
string "q" *> blank *> insideQ <* string "Q"
|
||||||
where
|
where
|
||||||
|
insideQ = join <$> (command <|> page `sepBy` blank )
|
||||||
command =
|
command =
|
||||||
count 6 argument *> string "cm"
|
count 6 argument *> string "cm" *> return []
|
||||||
<|> name *> blank *> string "gs"
|
<|> name *> blank *> string "gs" *> return []
|
||||||
argument = takeAll regular <* blank
|
argument = takeAll regular <* blank
|
||||||
|
|
||||||
|
text :: ParserWithFont [ByteString]
|
||||||
|
text = undefined
|
||||||
|
|
||||||
|
textOperator :: ParserWithFont TextOperator
|
||||||
|
textOperator = undefined
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue