Merge branch 'extract-text' into navigation

This commit is contained in:
Tissevert 2020-02-12 17:35:56 +01:00
commit 919f640443
3 changed files with 41 additions and 23 deletions

View File

@ -4,16 +4,17 @@ import Codec.Compression.Zlib (decompress)
import Data.ByteString.Lazy.Char8 (ByteString)
import qualified Data.ByteString.Char8 as BS (readFile)
import qualified Data.ByteString.Lazy.Char8 as Lazy (fromStrict, putStr, toStrict)
import Data.Map ((!))
import qualified Data.Map as Map (lookup)
import Data.Map ((!?))
import qualified Data.Map as Map (keys, lookup)
import PDF (Document(..), parseDocument)
import qualified PDF.EOL as EOL (Style)
import PDF.Object (Content(..), DirectObject(..), Object(..), Name(..))
import PDF.Output (ObjectId(..))
import qualified PDF.Output as Output (render)
import PDF.Update (unify)
import System.Environment (getArgs)
import System.IO (hPutStrLn, stderr)
import System.Environment (getArgs, getProgName)
import System.Exit (die)
import Text.Printf (printf)
display :: EOL.Style -> Object -> ByteString
display eolStyle d@(Direct _) = Output.render eolStyle d
@ -25,16 +26,28 @@ display eolStyle s@(Stream {header, streamContent}) = Output.render eolStyle $
}
_ -> s
extractObject :: ObjectId -> Document -> ByteString
extractObject :: ObjectId -> Document -> Either String ByteString
extractObject objectId (Document {eolStyle, updates}) =
display eolStyle . (!objectId) $ objects content
case objects content !? objectId of
Nothing -> Left $ "No object has ID " ++ show (getObjectId objectId)
Just o -> Right $ display eolStyle o
where
content = unify updates
listObjectIds :: Document -> Either String [String]
listObjectIds =
Right . prependTitle . toString . Map.keys . objects . unify . updates
where
toString = fmap (show . getObjectId)
prependTitle = ("ObjectIds defined in this PDF:":)
main :: IO ()
main = do
[inputFile, objectId] <- getArgs
result <- parseDocument <$> BS.readFile inputFile
case result of
Left parseError -> hPutStrLn stderr parseError
Right doc -> Lazy.putStr $ extractObject (ObjectId (read objectId)) doc
(inputFile, getData) <- parse =<< getArgs
input <- BS.readFile inputFile
either die id $ (parseDocument input >>= getData)
where
parse [inputFile] = return (inputFile, fmap (mapM_ putStrLn) . listObjectIds)
parse [inputFile, objectId] = return
(inputFile, fmap Lazy.putStr . extractObject (ObjectId (read objectId)))
parse _ = die . printf "Syntax: %s inputFile [OBJECT_ID]\n" =<< getProgName

View File

@ -220,16 +220,18 @@ instance Output DirectObject where
Output.string (printf "%d %d R" (getObjectId objectId) versionNumber)
directObject :: MonadParser m => m DirectObject
directObject =
Boolean <$> boolean
<|> Reference <$> reference {- defined before Number because Number is a prefix of it -}
<|> NumberObject <$> number
<|> StringObject <$> stringObject
<|> NameObject <$> name
<|> Array <$> array
<|> Dictionary <$> dictionary
<|> const Null <$> nullObject
<?> "direct object"
directObject = (peek >>= dispatch) <?> "direct object"
where
dispatch 't' = Boolean <$> boolean
dispatch 'f' = Boolean <$> boolean
dispatch '(' = StringObject <$> stringObject
dispatch '<' = StringObject <$> stringObject <|> Dictionary <$> dictionary
dispatch '/' = NameObject <$> name
dispatch '[' = Array <$> array
dispatch 'n' = nullObject *> return Null
dispatch _ =
Reference <$> reference {- defined before Number because Number is a prefix of it -}
<|> NumberObject <$> number
--
-- Object

View File

@ -17,8 +17,8 @@ import Control.Monad.Fail (MonadFail(..))
import Control.Monad.State (StateT(..), evalStateT)
import Control.Monad.Trans (MonadTrans(..))
import qualified Data.Attoparsec.ByteString.Char8 as Atto (
Parser, char, endOfInput, parseOnly, satisfy, string, take, takeWhile
, takeWhile1
Parser, char, endOfInput, parseOnly, peekChar', satisfy, string, take
, takeWhile, takeWhile1
)
import Data.ByteString (ByteString)
import Data.ByteString.Char8.Util (B16Int(..))
@ -36,6 +36,7 @@ class MonadDeps m => MonadParser m where
endOfInput :: m ()
hexNumber :: m B16Int
oneOf :: String -> m Char
peek :: m Char
string :: ByteString -> m ByteString
takeAll :: (Char -> Bool) -> m ByteString
takeAll1 :: (Char -> Bool) -> m ByteString
@ -47,6 +48,7 @@ instance MonadParser Atto.Parser where
decNumber = Atto.takeWhile1 (`Set.member` digits)
hexNumber = B16Int <$> Atto.takeWhile1 (`Set.member` hexDigits)
oneOf charSet = Atto.satisfy (`elem` charSet)
peek = Atto.peekChar'
string s = Atto.string s <?> show s
takeAll = Atto.takeWhile
takeAll1 = Atto.takeWhile1
@ -58,6 +60,7 @@ instance (MonadParser m, MonadTrans t, MonadDeps (t m)) => MonadParser (t m) whe
decNumber = lift $ decNumber
hexNumber = lift $ hexNumber
oneOf = lift . oneOf
peek = lift $ peek
string = lift . string
takeAll = lift . takeAll
takeAll1 = lift . takeAll1