Merge branch 'extract-text' into navigation
This commit is contained in:
commit
919f640443
|
@ -4,16 +4,17 @@ import Codec.Compression.Zlib (decompress)
|
||||||
import Data.ByteString.Lazy.Char8 (ByteString)
|
import Data.ByteString.Lazy.Char8 (ByteString)
|
||||||
import qualified Data.ByteString.Char8 as BS (readFile)
|
import qualified Data.ByteString.Char8 as BS (readFile)
|
||||||
import qualified Data.ByteString.Lazy.Char8 as Lazy (fromStrict, putStr, toStrict)
|
import qualified Data.ByteString.Lazy.Char8 as Lazy (fromStrict, putStr, toStrict)
|
||||||
import Data.Map ((!))
|
import Data.Map ((!?))
|
||||||
import qualified Data.Map as Map (lookup)
|
import qualified Data.Map as Map (keys, lookup)
|
||||||
import PDF (Document(..), parseDocument)
|
import PDF (Document(..), parseDocument)
|
||||||
import qualified PDF.EOL as EOL (Style)
|
import qualified PDF.EOL as EOL (Style)
|
||||||
import PDF.Object (Content(..), DirectObject(..), Object(..), Name(..))
|
import PDF.Object (Content(..), DirectObject(..), Object(..), Name(..))
|
||||||
import PDF.Output (ObjectId(..))
|
import PDF.Output (ObjectId(..))
|
||||||
import qualified PDF.Output as Output (render)
|
import qualified PDF.Output as Output (render)
|
||||||
import PDF.Update (unify)
|
import PDF.Update (unify)
|
||||||
import System.Environment (getArgs)
|
import System.Environment (getArgs, getProgName)
|
||||||
import System.IO (hPutStrLn, stderr)
|
import System.Exit (die)
|
||||||
|
import Text.Printf (printf)
|
||||||
|
|
||||||
display :: EOL.Style -> Object -> ByteString
|
display :: EOL.Style -> Object -> ByteString
|
||||||
display eolStyle d@(Direct _) = Output.render eolStyle d
|
display eolStyle d@(Direct _) = Output.render eolStyle d
|
||||||
|
@ -25,16 +26,28 @@ display eolStyle s@(Stream {header, streamContent}) = Output.render eolStyle $
|
||||||
}
|
}
|
||||||
_ -> s
|
_ -> s
|
||||||
|
|
||||||
extractObject :: ObjectId -> Document -> ByteString
|
extractObject :: ObjectId -> Document -> Either String ByteString
|
||||||
extractObject objectId (Document {eolStyle, updates}) =
|
extractObject objectId (Document {eolStyle, updates}) =
|
||||||
display eolStyle . (!objectId) $ objects content
|
case objects content !? objectId of
|
||||||
|
Nothing -> Left $ "No object has ID " ++ show (getObjectId objectId)
|
||||||
|
Just o -> Right $ display eolStyle o
|
||||||
where
|
where
|
||||||
content = unify updates
|
content = unify updates
|
||||||
|
|
||||||
|
listObjectIds :: Document -> Either String [String]
|
||||||
|
listObjectIds =
|
||||||
|
Right . prependTitle . toString . Map.keys . objects . unify . updates
|
||||||
|
where
|
||||||
|
toString = fmap (show . getObjectId)
|
||||||
|
prependTitle = ("ObjectIds defined in this PDF:":)
|
||||||
|
|
||||||
main :: IO ()
|
main :: IO ()
|
||||||
main = do
|
main = do
|
||||||
[inputFile, objectId] <- getArgs
|
(inputFile, getData) <- parse =<< getArgs
|
||||||
result <- parseDocument <$> BS.readFile inputFile
|
input <- BS.readFile inputFile
|
||||||
case result of
|
either die id $ (parseDocument input >>= getData)
|
||||||
Left parseError -> hPutStrLn stderr parseError
|
where
|
||||||
Right doc -> Lazy.putStr $ extractObject (ObjectId (read objectId)) doc
|
parse [inputFile] = return (inputFile, fmap (mapM_ putStrLn) . listObjectIds)
|
||||||
|
parse [inputFile, objectId] = return
|
||||||
|
(inputFile, fmap Lazy.putStr . extractObject (ObjectId (read objectId)))
|
||||||
|
parse _ = die . printf "Syntax: %s inputFile [OBJECT_ID]\n" =<< getProgName
|
||||||
|
|
|
@ -220,16 +220,18 @@ instance Output DirectObject where
|
||||||
Output.string (printf "%d %d R" (getObjectId objectId) versionNumber)
|
Output.string (printf "%d %d R" (getObjectId objectId) versionNumber)
|
||||||
|
|
||||||
directObject :: MonadParser m => m DirectObject
|
directObject :: MonadParser m => m DirectObject
|
||||||
directObject =
|
directObject = (peek >>= dispatch) <?> "direct object"
|
||||||
Boolean <$> boolean
|
where
|
||||||
<|> Reference <$> reference {- defined before Number because Number is a prefix of it -}
|
dispatch 't' = Boolean <$> boolean
|
||||||
<|> NumberObject <$> number
|
dispatch 'f' = Boolean <$> boolean
|
||||||
<|> StringObject <$> stringObject
|
dispatch '(' = StringObject <$> stringObject
|
||||||
<|> NameObject <$> name
|
dispatch '<' = StringObject <$> stringObject <|> Dictionary <$> dictionary
|
||||||
<|> Array <$> array
|
dispatch '/' = NameObject <$> name
|
||||||
<|> Dictionary <$> dictionary
|
dispatch '[' = Array <$> array
|
||||||
<|> const Null <$> nullObject
|
dispatch 'n' = nullObject *> return Null
|
||||||
<?> "direct object"
|
dispatch _ =
|
||||||
|
Reference <$> reference {- defined before Number because Number is a prefix of it -}
|
||||||
|
<|> NumberObject <$> number
|
||||||
|
|
||||||
--
|
--
|
||||||
-- Object
|
-- Object
|
||||||
|
|
|
@ -17,8 +17,8 @@ import Control.Monad.Fail (MonadFail(..))
|
||||||
import Control.Monad.State (StateT(..), evalStateT)
|
import Control.Monad.State (StateT(..), evalStateT)
|
||||||
import Control.Monad.Trans (MonadTrans(..))
|
import Control.Monad.Trans (MonadTrans(..))
|
||||||
import qualified Data.Attoparsec.ByteString.Char8 as Atto (
|
import qualified Data.Attoparsec.ByteString.Char8 as Atto (
|
||||||
Parser, char, endOfInput, parseOnly, satisfy, string, take, takeWhile
|
Parser, char, endOfInput, parseOnly, peekChar', satisfy, string, take
|
||||||
, takeWhile1
|
, takeWhile, takeWhile1
|
||||||
)
|
)
|
||||||
import Data.ByteString (ByteString)
|
import Data.ByteString (ByteString)
|
||||||
import Data.ByteString.Char8.Util (B16Int(..))
|
import Data.ByteString.Char8.Util (B16Int(..))
|
||||||
|
@ -36,6 +36,7 @@ class MonadDeps m => MonadParser m where
|
||||||
endOfInput :: m ()
|
endOfInput :: m ()
|
||||||
hexNumber :: m B16Int
|
hexNumber :: m B16Int
|
||||||
oneOf :: String -> m Char
|
oneOf :: String -> m Char
|
||||||
|
peek :: m Char
|
||||||
string :: ByteString -> m ByteString
|
string :: ByteString -> m ByteString
|
||||||
takeAll :: (Char -> Bool) -> m ByteString
|
takeAll :: (Char -> Bool) -> m ByteString
|
||||||
takeAll1 :: (Char -> Bool) -> m ByteString
|
takeAll1 :: (Char -> Bool) -> m ByteString
|
||||||
|
@ -47,6 +48,7 @@ instance MonadParser Atto.Parser where
|
||||||
decNumber = Atto.takeWhile1 (`Set.member` digits)
|
decNumber = Atto.takeWhile1 (`Set.member` digits)
|
||||||
hexNumber = B16Int <$> Atto.takeWhile1 (`Set.member` hexDigits)
|
hexNumber = B16Int <$> Atto.takeWhile1 (`Set.member` hexDigits)
|
||||||
oneOf charSet = Atto.satisfy (`elem` charSet)
|
oneOf charSet = Atto.satisfy (`elem` charSet)
|
||||||
|
peek = Atto.peekChar'
|
||||||
string s = Atto.string s <?> show s
|
string s = Atto.string s <?> show s
|
||||||
takeAll = Atto.takeWhile
|
takeAll = Atto.takeWhile
|
||||||
takeAll1 = Atto.takeWhile1
|
takeAll1 = Atto.takeWhile1
|
||||||
|
@ -58,6 +60,7 @@ instance (MonadParser m, MonadTrans t, MonadDeps (t m)) => MonadParser (t m) whe
|
||||||
decNumber = lift $ decNumber
|
decNumber = lift $ decNumber
|
||||||
hexNumber = lift $ hexNumber
|
hexNumber = lift $ hexNumber
|
||||||
oneOf = lift . oneOf
|
oneOf = lift . oneOf
|
||||||
|
peek = lift $ peek
|
||||||
string = lift . string
|
string = lift . string
|
||||||
takeAll = lift . takeAll
|
takeAll = lift . takeAll
|
||||||
takeAll1 = lift . takeAll1
|
takeAll1 = lift . takeAll1
|
||||||
|
|
Loading…
Reference in New Issue