Finish implementing reading, still bugs to investigate
This commit is contained in:
parent
1dd22c3889
commit
0374b72920
3 changed files with 33 additions and 22 deletions
|
@ -6,8 +6,8 @@ import Control.Monad (foldM)
|
|||
import Control.Monad.RWS (RWST(..), ask, evalRWST, get, modify)
|
||||
import Control.Monad.Writer (tell)
|
||||
import Data.ByteString (ByteString)
|
||||
import qualified Data.ByteString.Char8 as BS (readFile, putStrLn)
|
||||
import qualified Data.ByteString.Lazy.Char8 as Lazy (fromStrict, toStrict)
|
||||
import qualified Data.ByteString.Char8 as BS (pack, putStrLn, readFile)
|
||||
import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict)
|
||||
import Data.Map (Map, (!))
|
||||
import qualified Data.Map as Map (empty, insert, lookup, toList)
|
||||
import PDF (Document(..), parseDocument)
|
||||
|
@ -29,12 +29,16 @@ list l = RWST (\_ s -> fillContext s <$> l)
|
|||
where
|
||||
fillContext s a = (a, s, [])
|
||||
|
||||
handleError :: a -> String -> T a
|
||||
handleError defaultValue s =
|
||||
(tell . replicate 1 . BS.pack $ "Warning: " ++ s) >> return defaultValue
|
||||
|
||||
extractText :: Object -> T ()
|
||||
extractText object = do
|
||||
pageDict <- dict object
|
||||
cMappers <- loadCMappers =<< getFont pageDict
|
||||
contents <- stream =<< follow =<< key "Contents" pageDict
|
||||
either (return . const ()) (tell . chunks) (pageContents cMappers contents)
|
||||
either (handleError ()) (tell . chunks) (pageContents cMappers contents)
|
||||
|
||||
stream :: Object -> T ByteString
|
||||
stream (Stream {header, streamContent}) = return $
|
||||
|
@ -68,7 +72,7 @@ loadFont objectId =
|
|||
>>= key "ToUnicode"
|
||||
>>= follow
|
||||
>>= stream
|
||||
>>= either (return . const emptyCMap) return . cMap
|
||||
>>= either (handleError emptyCMap) return . cMap
|
||||
|
||||
loadCMappers :: Dictionary -> T CMappers
|
||||
loadCMappers = foldM loadCMapper Map.empty . Map.toList
|
||||
|
|
|
@ -51,7 +51,7 @@ instance MonadParser Atto.Parser where
|
|||
decNumber = Atto.takeWhile1 (`Set.member` digits)
|
||||
hexNumber = Atto.takeWhile1 (`Set.member` hexDigits)
|
||||
oneOf charSet = Atto.satisfy (`elem` charSet)
|
||||
string = Atto.string
|
||||
string s = Atto.string s <?> show s
|
||||
takeAll = Atto.takeWhile
|
||||
takeAll1 = Atto.takeWhile1
|
||||
|
||||
|
|
|
@ -1,34 +1,31 @@
|
|||
{-# LANGUAGE OverloadedStrings #-}
|
||||
module PDF.Text {-(
|
||||
module PDF.Text {
|
||||
CMap
|
||||
, CMappers
|
||||
, PageContents(..)
|
||||
, cMap
|
||||
, emptyCMap
|
||||
, pageContents
|
||||
)-} where
|
||||
) where
|
||||
|
||||
import Control.Applicative ((<|>), many)
|
||||
import Control.Monad (foldM, join)
|
||||
import Control.Monad.Reader (ReaderT, runReaderT, asks)
|
||||
import Control.Monad.State (put)
|
||||
import Control.Monad.State (get, put)
|
||||
import Data.Attoparsec.ByteString.Char8 (choice, count, parseOnly, sepBy)
|
||||
import qualified Data.Attoparsec.ByteString.Char8 as Atto (Parser)
|
||||
import Data.ByteString (ByteString, pack)
|
||||
import qualified Data.ByteString as BS (head)
|
||||
import Data.ByteString.Char8 as Char8 (pack)
|
||||
import qualified Data.ByteString.UTF8 as UTF8 (fromString)
|
||||
import Data.Map (Map, (!))
|
||||
import qualified Data.Map as Map (empty, fromList)
|
||||
import qualified Data.Map as Map (empty, fromList, lookup)
|
||||
import Data.Text (Text, snoc)
|
||||
import qualified Data.Text as Text (init, last, null, unpack)
|
||||
import Data.Text.Encoding (decodeUtf16BE)
|
||||
import Data.Text.Encoding (decodeUtf16BE, encodeUtf8)
|
||||
import PDF.Object (
|
||||
DirectObject(..), Name, StringObject(..)
|
||||
, array, blank, directObject, integer, line, name, regular, stringObject
|
||||
)
|
||||
import qualified PDF.EOL as EOL (charset, parser)
|
||||
import PDF.Parser (Parser, evalParser, string, takeAll)
|
||||
import PDF.Parser (MonadParser, Parser, evalParser, string, takeAll)
|
||||
|
||||
type CMappers = Map Name CMap
|
||||
type CMap = Map Int Text
|
||||
|
@ -106,7 +103,7 @@ data TextOperator =
|
|||
data Argument = Raw ByteString | Typed DirectObject
|
||||
type Call a = (a, [Argument])
|
||||
|
||||
stateOperator :: StateOperator -> ParserWithFont (Call StateOperator)
|
||||
stateOperator :: MonadParser m => StateOperator -> m (Call StateOperator)
|
||||
stateOperator Cm = (,) Cm <$> count 6 argument <* string "cm"
|
||||
stateOperator W = (,) W <$> count 1 argument <* string "w"
|
||||
stateOperator J = (,) J <$> count 1 argument <* string "J"
|
||||
|
@ -117,15 +114,15 @@ stateOperator Ri = (,) Ri <$> count 1 argument <* string "ri"
|
|||
stateOperator I = (,) I <$> count 1 argument <* string "i"
|
||||
stateOperator Gs = (,) Gs <$> count 1 argument <* string "gs"
|
||||
|
||||
textOperator :: TextOperator -> ParserWithFont (Call TextOperator)
|
||||
textOperator :: MonadParser m => TextOperator -> m (Call TextOperator)
|
||||
textOperator Td = (,) Td <$> count 2 argument <* string "Td"
|
||||
textOperator TD = (,) TD <$> count 2 argument <* string "TD"
|
||||
textOperator Tm = (,) Tm <$> count 6 argument <* string "Tm"
|
||||
textOperator Tstar = (,) Td <$> return [] <* string "T*"
|
||||
textOperator TJ =
|
||||
(,) TJ <$> sequence [Typed . Array <$> array] <* string "TJ"
|
||||
(,) TJ <$> sequence [Typed . Array <$> array] <* blank <* string "TJ"
|
||||
textOperator Tj =
|
||||
(,) Tj <$> sequence [Typed . StringObject <$> stringObject] <* string "Tj"
|
||||
(,) Tj <$> sequence [Typed . StringObject <$> stringObject] <* blank <* string "Tj"
|
||||
textOperator Quote = (,) Quote <$> count 1 argument <* string "'"
|
||||
textOperator DQuote = (,) DQuote <$> count 1 argument <* string "\""
|
||||
textOperator Tc = (,) Tc <$> count 1 argument <* string "Tc"
|
||||
|
@ -139,7 +136,7 @@ textOperator Ts = (,) Ts <$> count 1 argument <* string "Ts"
|
|||
a :: (Bounded o, Enum o) => (o -> ParserWithFont (Call o)) -> ParserWithFont (Call o)
|
||||
a parserGenerator = choice $ parserGenerator <$> [minBound .. maxBound]
|
||||
|
||||
argument :: ParserWithFont Argument
|
||||
argument :: MonadParser m => m Argument
|
||||
argument = Raw <$> takeAll regular <* blank
|
||||
|
||||
data PageContents = PageContents {
|
||||
|
@ -194,7 +191,17 @@ runOperator _ = return []
|
|||
|
||||
decodeString :: StringObject -> ParserWithFont ByteString
|
||||
decodeString (Hexadecimal h) = decodeString (Literal (Text.unpack $ parseText h))
|
||||
decodeString (Literal s) =
|
||||
asks
|
||||
decodeString (Literal litString) = get >>= convertBytes litString
|
||||
where
|
||||
bytes = Char8.pack s
|
||||
convertBytes :: String -> CMap -> ParserWithFont ByteString
|
||||
convertBytes [] _ = return ""
|
||||
convertBytes (c:cs) someCMap = do
|
||||
convertBytesAux (fromEnum c) 1 cs someCMap
|
||||
convertBytesAux :: Int -> Int -> String -> CMap -> ParserWithFont ByteString
|
||||
convertBytesAux code size s someCMap
|
||||
| size > 4 = fail "Could not match any input code smaller than an int"
|
||||
| otherwise =
|
||||
case (Map.lookup code someCMap, s) of
|
||||
(Nothing, (c:cs)) -> convertBytesAux (code * 256 + fromEnum c) (size + 1) cs someCMap
|
||||
(Nothing, []) -> fail "No character left to read but no code recognized"
|
||||
(Just outputText, _) -> mappend (encodeUtf8 outputText) <$> convertBytes s someCMap
|
||||
|
|
Loading…
Reference in a new issue