Merge branch 'extract-text' into navigation

This commit is contained in:
Tissevert 2020-02-12 17:35:56 +01:00
commit 919f640443
3 changed files with 41 additions and 23 deletions

View File

@ -4,16 +4,17 @@ import Codec.Compression.Zlib (decompress)
import Data.ByteString.Lazy.Char8 (ByteString) import Data.ByteString.Lazy.Char8 (ByteString)
import qualified Data.ByteString.Char8 as BS (readFile) import qualified Data.ByteString.Char8 as BS (readFile)
import qualified Data.ByteString.Lazy.Char8 as Lazy (fromStrict, putStr, toStrict) import qualified Data.ByteString.Lazy.Char8 as Lazy (fromStrict, putStr, toStrict)
import Data.Map ((!)) import Data.Map ((!?))
import qualified Data.Map as Map (lookup) import qualified Data.Map as Map (keys, lookup)
import PDF (Document(..), parseDocument) import PDF (Document(..), parseDocument)
import qualified PDF.EOL as EOL (Style) import qualified PDF.EOL as EOL (Style)
import PDF.Object (Content(..), DirectObject(..), Object(..), Name(..)) import PDF.Object (Content(..), DirectObject(..), Object(..), Name(..))
import PDF.Output (ObjectId(..)) import PDF.Output (ObjectId(..))
import qualified PDF.Output as Output (render) import qualified PDF.Output as Output (render)
import PDF.Update (unify) import PDF.Update (unify)
import System.Environment (getArgs) import System.Environment (getArgs, getProgName)
import System.IO (hPutStrLn, stderr) import System.Exit (die)
import Text.Printf (printf)
display :: EOL.Style -> Object -> ByteString display :: EOL.Style -> Object -> ByteString
display eolStyle d@(Direct _) = Output.render eolStyle d display eolStyle d@(Direct _) = Output.render eolStyle d
@ -25,16 +26,28 @@ display eolStyle s@(Stream {header, streamContent}) = Output.render eolStyle $
} }
_ -> s _ -> s
extractObject :: ObjectId -> Document -> ByteString extractObject :: ObjectId -> Document -> Either String ByteString
extractObject objectId (Document {eolStyle, updates}) = extractObject objectId (Document {eolStyle, updates}) =
display eolStyle . (!objectId) $ objects content case objects content !? objectId of
Nothing -> Left $ "No object has ID " ++ show (getObjectId objectId)
Just o -> Right $ display eolStyle o
where where
content = unify updates content = unify updates
listObjectIds :: Document -> Either String [String]
listObjectIds =
Right . prependTitle . toString . Map.keys . objects . unify . updates
where
toString = fmap (show . getObjectId)
prependTitle = ("ObjectIds defined in this PDF:":)
main :: IO () main :: IO ()
main = do main = do
[inputFile, objectId] <- getArgs (inputFile, getData) <- parse =<< getArgs
result <- parseDocument <$> BS.readFile inputFile input <- BS.readFile inputFile
case result of either die id $ (parseDocument input >>= getData)
Left parseError -> hPutStrLn stderr parseError where
Right doc -> Lazy.putStr $ extractObject (ObjectId (read objectId)) doc parse [inputFile] = return (inputFile, fmap (mapM_ putStrLn) . listObjectIds)
parse [inputFile, objectId] = return
(inputFile, fmap Lazy.putStr . extractObject (ObjectId (read objectId)))
parse _ = die . printf "Syntax: %s inputFile [OBJECT_ID]\n" =<< getProgName

View File

@ -220,16 +220,18 @@ instance Output DirectObject where
Output.string (printf "%d %d R" (getObjectId objectId) versionNumber) Output.string (printf "%d %d R" (getObjectId objectId) versionNumber)
directObject :: MonadParser m => m DirectObject directObject :: MonadParser m => m DirectObject
directObject = directObject = (peek >>= dispatch) <?> "direct object"
Boolean <$> boolean where
<|> Reference <$> reference {- defined before Number because Number is a prefix of it -} dispatch 't' = Boolean <$> boolean
<|> NumberObject <$> number dispatch 'f' = Boolean <$> boolean
<|> StringObject <$> stringObject dispatch '(' = StringObject <$> stringObject
<|> NameObject <$> name dispatch '<' = StringObject <$> stringObject <|> Dictionary <$> dictionary
<|> Array <$> array dispatch '/' = NameObject <$> name
<|> Dictionary <$> dictionary dispatch '[' = Array <$> array
<|> const Null <$> nullObject dispatch 'n' = nullObject *> return Null
<?> "direct object" dispatch _ =
Reference <$> reference {- defined before Number because Number is a prefix of it -}
<|> NumberObject <$> number
-- --
-- Object -- Object

View File

@ -17,8 +17,8 @@ import Control.Monad.Fail (MonadFail(..))
import Control.Monad.State (StateT(..), evalStateT) import Control.Monad.State (StateT(..), evalStateT)
import Control.Monad.Trans (MonadTrans(..)) import Control.Monad.Trans (MonadTrans(..))
import qualified Data.Attoparsec.ByteString.Char8 as Atto ( import qualified Data.Attoparsec.ByteString.Char8 as Atto (
Parser, char, endOfInput, parseOnly, satisfy, string, take, takeWhile Parser, char, endOfInput, parseOnly, peekChar', satisfy, string, take
, takeWhile1 , takeWhile, takeWhile1
) )
import Data.ByteString (ByteString) import Data.ByteString (ByteString)
import Data.ByteString.Char8.Util (B16Int(..)) import Data.ByteString.Char8.Util (B16Int(..))
@ -36,6 +36,7 @@ class MonadDeps m => MonadParser m where
endOfInput :: m () endOfInput :: m ()
hexNumber :: m B16Int hexNumber :: m B16Int
oneOf :: String -> m Char oneOf :: String -> m Char
peek :: m Char
string :: ByteString -> m ByteString string :: ByteString -> m ByteString
takeAll :: (Char -> Bool) -> m ByteString takeAll :: (Char -> Bool) -> m ByteString
takeAll1 :: (Char -> Bool) -> m ByteString takeAll1 :: (Char -> Bool) -> m ByteString
@ -47,6 +48,7 @@ instance MonadParser Atto.Parser where
decNumber = Atto.takeWhile1 (`Set.member` digits) decNumber = Atto.takeWhile1 (`Set.member` digits)
hexNumber = B16Int <$> Atto.takeWhile1 (`Set.member` hexDigits) hexNumber = B16Int <$> Atto.takeWhile1 (`Set.member` hexDigits)
oneOf charSet = Atto.satisfy (`elem` charSet) oneOf charSet = Atto.satisfy (`elem` charSet)
peek = Atto.peekChar'
string s = Atto.string s <?> show s string s = Atto.string s <?> show s
takeAll = Atto.takeWhile takeAll = Atto.takeWhile
takeAll1 = Atto.takeWhile1 takeAll1 = Atto.takeWhile1
@ -58,6 +60,7 @@ instance (MonadParser m, MonadTrans t, MonadDeps (t m)) => MonadParser (t m) whe
decNumber = lift $ decNumber decNumber = lift $ decNumber
hexNumber = lift $ hexNumber hexNumber = lift $ hexNumber
oneOf = lift . oneOf oneOf = lift . oneOf
peek = lift $ peek
string = lift . string string = lift . string
takeAll = lift . takeAll takeAll = lift . takeAll
takeAll1 = lift . takeAll1 takeAll1 = lift . takeAll1