Merge branch 'extract-text' into navigation
This commit is contained in:
commit
919f640443
|
@ -4,16 +4,17 @@ import Codec.Compression.Zlib (decompress)
|
|||
import Data.ByteString.Lazy.Char8 (ByteString)
|
||||
import qualified Data.ByteString.Char8 as BS (readFile)
|
||||
import qualified Data.ByteString.Lazy.Char8 as Lazy (fromStrict, putStr, toStrict)
|
||||
import Data.Map ((!))
|
||||
import qualified Data.Map as Map (lookup)
|
||||
import Data.Map ((!?))
|
||||
import qualified Data.Map as Map (keys, lookup)
|
||||
import PDF (Document(..), parseDocument)
|
||||
import qualified PDF.EOL as EOL (Style)
|
||||
import PDF.Object (Content(..), DirectObject(..), Object(..), Name(..))
|
||||
import PDF.Output (ObjectId(..))
|
||||
import qualified PDF.Output as Output (render)
|
||||
import PDF.Update (unify)
|
||||
import System.Environment (getArgs)
|
||||
import System.IO (hPutStrLn, stderr)
|
||||
import System.Environment (getArgs, getProgName)
|
||||
import System.Exit (die)
|
||||
import Text.Printf (printf)
|
||||
|
||||
display :: EOL.Style -> Object -> ByteString
|
||||
display eolStyle d@(Direct _) = Output.render eolStyle d
|
||||
|
@ -25,16 +26,28 @@ display eolStyle s@(Stream {header, streamContent}) = Output.render eolStyle $
|
|||
}
|
||||
_ -> s
|
||||
|
||||
extractObject :: ObjectId -> Document -> ByteString
|
||||
extractObject :: ObjectId -> Document -> Either String ByteString
|
||||
extractObject objectId (Document {eolStyle, updates}) =
|
||||
display eolStyle . (!objectId) $ objects content
|
||||
case objects content !? objectId of
|
||||
Nothing -> Left $ "No object has ID " ++ show (getObjectId objectId)
|
||||
Just o -> Right $ display eolStyle o
|
||||
where
|
||||
content = unify updates
|
||||
|
||||
listObjectIds :: Document -> Either String [String]
|
||||
listObjectIds =
|
||||
Right . prependTitle . toString . Map.keys . objects . unify . updates
|
||||
where
|
||||
toString = fmap (show . getObjectId)
|
||||
prependTitle = ("ObjectIds defined in this PDF:":)
|
||||
|
||||
main :: IO ()
|
||||
main = do
|
||||
[inputFile, objectId] <- getArgs
|
||||
result <- parseDocument <$> BS.readFile inputFile
|
||||
case result of
|
||||
Left parseError -> hPutStrLn stderr parseError
|
||||
Right doc -> Lazy.putStr $ extractObject (ObjectId (read objectId)) doc
|
||||
(inputFile, getData) <- parse =<< getArgs
|
||||
input <- BS.readFile inputFile
|
||||
either die id $ (parseDocument input >>= getData)
|
||||
where
|
||||
parse [inputFile] = return (inputFile, fmap (mapM_ putStrLn) . listObjectIds)
|
||||
parse [inputFile, objectId] = return
|
||||
(inputFile, fmap Lazy.putStr . extractObject (ObjectId (read objectId)))
|
||||
parse _ = die . printf "Syntax: %s inputFile [OBJECT_ID]\n" =<< getProgName
|
||||
|
|
|
@ -220,16 +220,18 @@ instance Output DirectObject where
|
|||
Output.string (printf "%d %d R" (getObjectId objectId) versionNumber)
|
||||
|
||||
directObject :: MonadParser m => m DirectObject
|
||||
directObject =
|
||||
Boolean <$> boolean
|
||||
<|> Reference <$> reference {- defined before Number because Number is a prefix of it -}
|
||||
<|> NumberObject <$> number
|
||||
<|> StringObject <$> stringObject
|
||||
<|> NameObject <$> name
|
||||
<|> Array <$> array
|
||||
<|> Dictionary <$> dictionary
|
||||
<|> const Null <$> nullObject
|
||||
<?> "direct object"
|
||||
directObject = (peek >>= dispatch) <?> "direct object"
|
||||
where
|
||||
dispatch 't' = Boolean <$> boolean
|
||||
dispatch 'f' = Boolean <$> boolean
|
||||
dispatch '(' = StringObject <$> stringObject
|
||||
dispatch '<' = StringObject <$> stringObject <|> Dictionary <$> dictionary
|
||||
dispatch '/' = NameObject <$> name
|
||||
dispatch '[' = Array <$> array
|
||||
dispatch 'n' = nullObject *> return Null
|
||||
dispatch _ =
|
||||
Reference <$> reference {- defined before Number because Number is a prefix of it -}
|
||||
<|> NumberObject <$> number
|
||||
|
||||
--
|
||||
-- Object
|
||||
|
|
|
@ -17,8 +17,8 @@ import Control.Monad.Fail (MonadFail(..))
|
|||
import Control.Monad.State (StateT(..), evalStateT)
|
||||
import Control.Monad.Trans (MonadTrans(..))
|
||||
import qualified Data.Attoparsec.ByteString.Char8 as Atto (
|
||||
Parser, char, endOfInput, parseOnly, satisfy, string, take, takeWhile
|
||||
, takeWhile1
|
||||
Parser, char, endOfInput, parseOnly, peekChar', satisfy, string, take
|
||||
, takeWhile, takeWhile1
|
||||
)
|
||||
import Data.ByteString (ByteString)
|
||||
import Data.ByteString.Char8.Util (B16Int(..))
|
||||
|
@ -36,6 +36,7 @@ class MonadDeps m => MonadParser m where
|
|||
endOfInput :: m ()
|
||||
hexNumber :: m B16Int
|
||||
oneOf :: String -> m Char
|
||||
peek :: m Char
|
||||
string :: ByteString -> m ByteString
|
||||
takeAll :: (Char -> Bool) -> m ByteString
|
||||
takeAll1 :: (Char -> Bool) -> m ByteString
|
||||
|
@ -47,6 +48,7 @@ instance MonadParser Atto.Parser where
|
|||
decNumber = Atto.takeWhile1 (`Set.member` digits)
|
||||
hexNumber = B16Int <$> Atto.takeWhile1 (`Set.member` hexDigits)
|
||||
oneOf charSet = Atto.satisfy (`elem` charSet)
|
||||
peek = Atto.peekChar'
|
||||
string s = Atto.string s <?> show s
|
||||
takeAll = Atto.takeWhile
|
||||
takeAll1 = Atto.takeWhile1
|
||||
|
@ -58,6 +60,7 @@ instance (MonadParser m, MonadTrans t, MonadDeps (t m)) => MonadParser (t m) whe
|
|||
decNumber = lift $ decNumber
|
||||
hexNumber = lift $ hexNumber
|
||||
oneOf = lift . oneOf
|
||||
peek = lift $ peek
|
||||
string = lift . string
|
||||
takeAll = lift . takeAll
|
||||
takeAll1 = lift . takeAll1
|
||||
|
|
Loading…
Reference in New Issue