Finish implementing reading, still bugs to investigate
This commit is contained in:
parent
1dd22c3889
commit
0374b72920
3 changed files with 33 additions and 22 deletions
|
@ -6,8 +6,8 @@ import Control.Monad (foldM)
|
||||||
import Control.Monad.RWS (RWST(..), ask, evalRWST, get, modify)
|
import Control.Monad.RWS (RWST(..), ask, evalRWST, get, modify)
|
||||||
import Control.Monad.Writer (tell)
|
import Control.Monad.Writer (tell)
|
||||||
import Data.ByteString (ByteString)
|
import Data.ByteString (ByteString)
|
||||||
import qualified Data.ByteString.Char8 as BS (readFile, putStrLn)
|
import qualified Data.ByteString.Char8 as BS (pack, putStrLn, readFile)
|
||||||
import qualified Data.ByteString.Lazy.Char8 as Lazy (fromStrict, toStrict)
|
import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict)
|
||||||
import Data.Map (Map, (!))
|
import Data.Map (Map, (!))
|
||||||
import qualified Data.Map as Map (empty, insert, lookup, toList)
|
import qualified Data.Map as Map (empty, insert, lookup, toList)
|
||||||
import PDF (Document(..), parseDocument)
|
import PDF (Document(..), parseDocument)
|
||||||
|
@ -29,12 +29,16 @@ list l = RWST (\_ s -> fillContext s <$> l)
|
||||||
where
|
where
|
||||||
fillContext s a = (a, s, [])
|
fillContext s a = (a, s, [])
|
||||||
|
|
||||||
|
handleError :: a -> String -> T a
|
||||||
|
handleError defaultValue s =
|
||||||
|
(tell . replicate 1 . BS.pack $ "Warning: " ++ s) >> return defaultValue
|
||||||
|
|
||||||
extractText :: Object -> T ()
|
extractText :: Object -> T ()
|
||||||
extractText object = do
|
extractText object = do
|
||||||
pageDict <- dict object
|
pageDict <- dict object
|
||||||
cMappers <- loadCMappers =<< getFont pageDict
|
cMappers <- loadCMappers =<< getFont pageDict
|
||||||
contents <- stream =<< follow =<< key "Contents" pageDict
|
contents <- stream =<< follow =<< key "Contents" pageDict
|
||||||
either (return . const ()) (tell . chunks) (pageContents cMappers contents)
|
either (handleError ()) (tell . chunks) (pageContents cMappers contents)
|
||||||
|
|
||||||
stream :: Object -> T ByteString
|
stream :: Object -> T ByteString
|
||||||
stream (Stream {header, streamContent}) = return $
|
stream (Stream {header, streamContent}) = return $
|
||||||
|
@ -68,7 +72,7 @@ loadFont objectId =
|
||||||
>>= key "ToUnicode"
|
>>= key "ToUnicode"
|
||||||
>>= follow
|
>>= follow
|
||||||
>>= stream
|
>>= stream
|
||||||
>>= either (return . const emptyCMap) return . cMap
|
>>= either (handleError emptyCMap) return . cMap
|
||||||
|
|
||||||
loadCMappers :: Dictionary -> T CMappers
|
loadCMappers :: Dictionary -> T CMappers
|
||||||
loadCMappers = foldM loadCMapper Map.empty . Map.toList
|
loadCMappers = foldM loadCMapper Map.empty . Map.toList
|
||||||
|
|
|
@ -51,7 +51,7 @@ instance MonadParser Atto.Parser where
|
||||||
decNumber = Atto.takeWhile1 (`Set.member` digits)
|
decNumber = Atto.takeWhile1 (`Set.member` digits)
|
||||||
hexNumber = Atto.takeWhile1 (`Set.member` hexDigits)
|
hexNumber = Atto.takeWhile1 (`Set.member` hexDigits)
|
||||||
oneOf charSet = Atto.satisfy (`elem` charSet)
|
oneOf charSet = Atto.satisfy (`elem` charSet)
|
||||||
string = Atto.string
|
string s = Atto.string s <?> show s
|
||||||
takeAll = Atto.takeWhile
|
takeAll = Atto.takeWhile
|
||||||
takeAll1 = Atto.takeWhile1
|
takeAll1 = Atto.takeWhile1
|
||||||
|
|
||||||
|
|
|
@ -1,34 +1,31 @@
|
||||||
{-# LANGUAGE OverloadedStrings #-}
|
{-# LANGUAGE OverloadedStrings #-}
|
||||||
module PDF.Text {-(
|
module PDF.Text {
|
||||||
CMap
|
CMap
|
||||||
, CMappers
|
, CMappers
|
||||||
, PageContents(..)
|
, PageContents(..)
|
||||||
, cMap
|
, cMap
|
||||||
, emptyCMap
|
, emptyCMap
|
||||||
, pageContents
|
, pageContents
|
||||||
)-} where
|
) where
|
||||||
|
|
||||||
import Control.Applicative ((<|>), many)
|
import Control.Applicative ((<|>), many)
|
||||||
import Control.Monad (foldM, join)
|
import Control.Monad (foldM, join)
|
||||||
import Control.Monad.Reader (ReaderT, runReaderT, asks)
|
import Control.Monad.Reader (ReaderT, runReaderT, asks)
|
||||||
import Control.Monad.State (put)
|
import Control.Monad.State (get, put)
|
||||||
import Data.Attoparsec.ByteString.Char8 (choice, count, parseOnly, sepBy)
|
import Data.Attoparsec.ByteString.Char8 (choice, count, parseOnly, sepBy)
|
||||||
import qualified Data.Attoparsec.ByteString.Char8 as Atto (Parser)
|
import qualified Data.Attoparsec.ByteString.Char8 as Atto (Parser)
|
||||||
import Data.ByteString (ByteString, pack)
|
import Data.ByteString (ByteString, pack)
|
||||||
import qualified Data.ByteString as BS (head)
|
|
||||||
import Data.ByteString.Char8 as Char8 (pack)
|
|
||||||
import qualified Data.ByteString.UTF8 as UTF8 (fromString)
|
|
||||||
import Data.Map (Map, (!))
|
import Data.Map (Map, (!))
|
||||||
import qualified Data.Map as Map (empty, fromList)
|
import qualified Data.Map as Map (empty, fromList, lookup)
|
||||||
import Data.Text (Text, snoc)
|
import Data.Text (Text, snoc)
|
||||||
import qualified Data.Text as Text (init, last, null, unpack)
|
import qualified Data.Text as Text (init, last, null, unpack)
|
||||||
import Data.Text.Encoding (decodeUtf16BE)
|
import Data.Text.Encoding (decodeUtf16BE, encodeUtf8)
|
||||||
import PDF.Object (
|
import PDF.Object (
|
||||||
DirectObject(..), Name, StringObject(..)
|
DirectObject(..), Name, StringObject(..)
|
||||||
, array, blank, directObject, integer, line, name, regular, stringObject
|
, array, blank, directObject, integer, line, name, regular, stringObject
|
||||||
)
|
)
|
||||||
import qualified PDF.EOL as EOL (charset, parser)
|
import qualified PDF.EOL as EOL (charset, parser)
|
||||||
import PDF.Parser (Parser, evalParser, string, takeAll)
|
import PDF.Parser (MonadParser, Parser, evalParser, string, takeAll)
|
||||||
|
|
||||||
type CMappers = Map Name CMap
|
type CMappers = Map Name CMap
|
||||||
type CMap = Map Int Text
|
type CMap = Map Int Text
|
||||||
|
@ -106,7 +103,7 @@ data TextOperator =
|
||||||
data Argument = Raw ByteString | Typed DirectObject
|
data Argument = Raw ByteString | Typed DirectObject
|
||||||
type Call a = (a, [Argument])
|
type Call a = (a, [Argument])
|
||||||
|
|
||||||
stateOperator :: StateOperator -> ParserWithFont (Call StateOperator)
|
stateOperator :: MonadParser m => StateOperator -> m (Call StateOperator)
|
||||||
stateOperator Cm = (,) Cm <$> count 6 argument <* string "cm"
|
stateOperator Cm = (,) Cm <$> count 6 argument <* string "cm"
|
||||||
stateOperator W = (,) W <$> count 1 argument <* string "w"
|
stateOperator W = (,) W <$> count 1 argument <* string "w"
|
||||||
stateOperator J = (,) J <$> count 1 argument <* string "J"
|
stateOperator J = (,) J <$> count 1 argument <* string "J"
|
||||||
|
@ -117,15 +114,15 @@ stateOperator Ri = (,) Ri <$> count 1 argument <* string "ri"
|
||||||
stateOperator I = (,) I <$> count 1 argument <* string "i"
|
stateOperator I = (,) I <$> count 1 argument <* string "i"
|
||||||
stateOperator Gs = (,) Gs <$> count 1 argument <* string "gs"
|
stateOperator Gs = (,) Gs <$> count 1 argument <* string "gs"
|
||||||
|
|
||||||
textOperator :: TextOperator -> ParserWithFont (Call TextOperator)
|
textOperator :: MonadParser m => TextOperator -> m (Call TextOperator)
|
||||||
textOperator Td = (,) Td <$> count 2 argument <* string "Td"
|
textOperator Td = (,) Td <$> count 2 argument <* string "Td"
|
||||||
textOperator TD = (,) TD <$> count 2 argument <* string "TD"
|
textOperator TD = (,) TD <$> count 2 argument <* string "TD"
|
||||||
textOperator Tm = (,) Tm <$> count 6 argument <* string "Tm"
|
textOperator Tm = (,) Tm <$> count 6 argument <* string "Tm"
|
||||||
textOperator Tstar = (,) Td <$> return [] <* string "T*"
|
textOperator Tstar = (,) Td <$> return [] <* string "T*"
|
||||||
textOperator TJ =
|
textOperator TJ =
|
||||||
(,) TJ <$> sequence [Typed . Array <$> array] <* string "TJ"
|
(,) TJ <$> sequence [Typed . Array <$> array] <* blank <* string "TJ"
|
||||||
textOperator Tj =
|
textOperator Tj =
|
||||||
(,) Tj <$> sequence [Typed . StringObject <$> stringObject] <* string "Tj"
|
(,) Tj <$> sequence [Typed . StringObject <$> stringObject] <* blank <* string "Tj"
|
||||||
textOperator Quote = (,) Quote <$> count 1 argument <* string "'"
|
textOperator Quote = (,) Quote <$> count 1 argument <* string "'"
|
||||||
textOperator DQuote = (,) DQuote <$> count 1 argument <* string "\""
|
textOperator DQuote = (,) DQuote <$> count 1 argument <* string "\""
|
||||||
textOperator Tc = (,) Tc <$> count 1 argument <* string "Tc"
|
textOperator Tc = (,) Tc <$> count 1 argument <* string "Tc"
|
||||||
|
@ -139,7 +136,7 @@ textOperator Ts = (,) Ts <$> count 1 argument <* string "Ts"
|
||||||
a :: (Bounded o, Enum o) => (o -> ParserWithFont (Call o)) -> ParserWithFont (Call o)
|
a :: (Bounded o, Enum o) => (o -> ParserWithFont (Call o)) -> ParserWithFont (Call o)
|
||||||
a parserGenerator = choice $ parserGenerator <$> [minBound .. maxBound]
|
a parserGenerator = choice $ parserGenerator <$> [minBound .. maxBound]
|
||||||
|
|
||||||
argument :: ParserWithFont Argument
|
argument :: MonadParser m => m Argument
|
||||||
argument = Raw <$> takeAll regular <* blank
|
argument = Raw <$> takeAll regular <* blank
|
||||||
|
|
||||||
data PageContents = PageContents {
|
data PageContents = PageContents {
|
||||||
|
@ -194,7 +191,17 @@ runOperator _ = return []
|
||||||
|
|
||||||
decodeString :: StringObject -> ParserWithFont ByteString
|
decodeString :: StringObject -> ParserWithFont ByteString
|
||||||
decodeString (Hexadecimal h) = decodeString (Literal (Text.unpack $ parseText h))
|
decodeString (Hexadecimal h) = decodeString (Literal (Text.unpack $ parseText h))
|
||||||
decodeString (Literal s) =
|
decodeString (Literal litString) = get >>= convertBytes litString
|
||||||
asks
|
|
||||||
where
|
where
|
||||||
bytes = Char8.pack s
|
convertBytes :: String -> CMap -> ParserWithFont ByteString
|
||||||
|
convertBytes [] _ = return ""
|
||||||
|
convertBytes (c:cs) someCMap = do
|
||||||
|
convertBytesAux (fromEnum c) 1 cs someCMap
|
||||||
|
convertBytesAux :: Int -> Int -> String -> CMap -> ParserWithFont ByteString
|
||||||
|
convertBytesAux code size s someCMap
|
||||||
|
| size > 4 = fail "Could not match any input code smaller than an int"
|
||||||
|
| otherwise =
|
||||||
|
case (Map.lookup code someCMap, s) of
|
||||||
|
(Nothing, (c:cs)) -> convertBytesAux (code * 256 + fromEnum c) (size + 1) cs someCMap
|
||||||
|
(Nothing, []) -> fail "No character left to read but no code recognized"
|
||||||
|
(Just outputText, _) -> mappend (encodeUtf8 outputText) <$> convertBytes s someCMap
|
||||||
|
|
Loading…
Reference in a new issue