Completely lost, the same old Char8 / Word8 again, implemented all the text reading, still needing a couple details to parse CMaps

This commit is contained in:
Tissevert 2019-09-25 18:42:34 +02:00
parent f9e5683bf4
commit e7484ef536
5 changed files with 158 additions and 62 deletions

View file

@ -30,6 +30,7 @@ library
, base >=4.9 && <4.13 , base >=4.9 && <4.13
, bytestring , bytestring
, containers , containers
, data-serializer
, mtl , mtl
hs-source-dirs: src hs-source-dirs: src
ghc-options: -Wall ghc-options: -Wall

View file

@ -16,7 +16,7 @@ import PDF.Object (
, Object(..), Name(..), Structure(..) , Object(..), Name(..), Structure(..)
,) ,)
import PDF.Output (ObjectId) import PDF.Output (ObjectId)
import PDF.Text (CMap, CMappers, PageContents(..), cMap, pageContents) import PDF.Text (CMap, CMappers, PageContents(..), cMap, emptyCMap, pageContents)
import PDF.Update (unify) import PDF.Update (unify)
import System.Environment (getArgs) import System.Environment (getArgs)
import System.IO (hPutStrLn, stderr) import System.IO (hPutStrLn, stderr)
@ -32,26 +32,17 @@ list l = RWST (\_ s -> fillContext s <$> l)
extractText :: Object -> T () extractText :: Object -> T ()
extractText object = do extractText object = do
pageDict <- dict object pageDict <- dict object
contents <- follow =<< key "Contents" pageDict cMappers <- loadCMappers =<< getFont pageDict
case contents of contents <- stream =<< follow =<< key "Contents" pageDict
(Stream {header, streamContent}) -> do either (return . const ()) (tell . chunks) (pageContents cMappers contents)
font <- getFont pageDict
cMappers <- loadCMappers font
storeDecodedText cMappers $ clear header streamContent
_ -> return ()
clear :: Dictionary -> ByteString -> ByteString stream :: Object -> T ByteString
clear header streamContent = stream (Stream {header, streamContent}) = return $
case Map.lookup (Name "Filter") header of case Map.lookup (Name "Filter") header of
Just (NameObject (Name "FlateDecode")) -> Just (NameObject (Name "FlateDecode")) ->
Lazy.toStrict . decompress . Lazy.fromStrict $ streamContent Lazy.toStrict . decompress . Lazy.fromStrict $ streamContent
_ -> streamContent _ -> streamContent
stream _ = list []
storeDecodedText :: CMappers -> ByteString -> T ()
storeDecodedText font page =
case pageContents font page of
Left _ -> return ()
Right (PageContents {chunks}) -> tell chunks
getFont :: Dictionary -> T Dictionary getFont :: Dictionary -> T Dictionary
getFont pageDict = getFont pageDict =
@ -76,6 +67,8 @@ loadFont objectId =
>>= dict >>= dict
>>= key "ToUnicode" >>= key "ToUnicode"
>>= follow >>= follow
>>= stream
>>= either (return . const emptyCMap) return . cMap
loadCMappers :: Dictionary -> T CMappers loadCMappers :: Dictionary -> T CMappers
loadCMappers = foldM loadCMapper Map.empty . Map.toList loadCMappers = foldM loadCMapper Map.empty . Map.toList

View file

@ -6,14 +6,14 @@ module PDF.EOL (
) where ) where
import Control.Applicative ((<|>)) import Control.Applicative ((<|>))
import PDF.Parser (Parser, string) import PDF.Parser (MonadParser, string)
data Style = CR | LF | CRLF deriving Show data Style = CR | LF | CRLF deriving Show
charset :: String charset :: String
charset = "\r\n" charset = "\r\n"
parser :: Parser s Style parser :: MonadParser m => m Style
parser = parser =
(string "\r\n" >> return CRLF) (string "\r\n" >> return CRLF)
<|> (string "\r" >> return CR) <|> (string "\r" >> return CR)

View file

@ -13,9 +13,11 @@ module PDF.Object (
, Number(..) , Number(..)
, Object(..) , Object(..)
, Occurrence(..) , Occurrence(..)
, StringObject(..)
, Structure(..) , Structure(..)
, XRefEntry(..) , XRefEntry(..)
, XRefSection , XRefSection
, array
, blank , blank
, dictionary , dictionary
, directObject , directObject
@ -24,14 +26,16 @@ module PDF.Object (
, line , line
, magicNumber , magicNumber
, name , name
, number
, regular , regular
, stringObject
, structure , structure
) where ) where
import Control.Applicative ((<|>), many) import Control.Applicative ((<|>), many)
import Data.Attoparsec.ByteString.Char8 (choice, count, option, sepBy) import Data.Attoparsec.ByteString.Char8 (choice, count, option, sepBy)
import Data.ByteString.Char8 (ByteString) import Data.ByteString (ByteString)
import qualified Data.ByteString.Char8 as BS ( import qualified Data.ByteString as BS (
concat, cons, pack, singleton, unpack concat, cons, pack, singleton, unpack
) )
import Data.Map (Map, (!), mapWithKey) import Data.Map (Map, (!), mapWithKey)
@ -48,7 +52,7 @@ import PDF.Output (
import PDF.Parser (MonadParser(..), Parser, (<?>), octDigit, oneOf) import PDF.Parser (MonadParser(..), Parser, (<?>), octDigit, oneOf)
import Text.Printf (printf) import Text.Printf (printf)
line :: String -> Parser u () line :: MonadParser m => String -> m ()
line l = (string (BS.pack l) *> EOL.parser *> return ()) <?> printf "line «%s»" l line l = (string (BS.pack l) *> EOL.parser *> return ()) <?> printf "line «%s»" l
magicNumber :: ByteString magicNumber :: ByteString
@ -69,7 +73,7 @@ delimiterCharset = "()<>[]{}/%"
regular :: Char -> Bool regular :: Char -> Bool
regular = not . (`elem` (EOL.charset ++ whiteSpaceCharset ++ delimiterCharset)) regular = not . (`elem` (EOL.charset ++ whiteSpaceCharset ++ delimiterCharset))
integer :: (Read a, Num a) => Parser u a integer :: (Read a, Num a, MonadParser m) => m a
integer = read . BS.unpack <$> decNumber <* blank <?> "decimal integer" integer = read . BS.unpack <$> decNumber <* blank <?> "decimal integer"
------------------------------------- -------------------------------------
@ -81,7 +85,7 @@ type IndexedObjects = Map ObjectId Object
-- --
-- Boolean -- Boolean
-- --
boolean :: Parser u Bool boolean :: MonadParser m => m Bool
boolean = boolean =
(string "true" *> return True) <|> (string "false" *> return False) <?> "boolean" (string "true" *> return True) <|> (string "false" *> return False) <?> "boolean"
@ -96,7 +100,7 @@ instance Output Number where
(n, 0) -> printf "%d" (n :: Int) (n, 0) -> printf "%d" (n :: Int)
_ -> printf "%f" f _ -> printf "%f" f
number :: Parser u Number number :: MonadParser m => m Number
number = Number . read . BS.unpack <$> number = Number . read . BS.unpack <$>
(mappend <$> sign <*> (integerPart <|> BS.cons '0' <$> floatPart)) (mappend <$> sign <*> (integerPart <|> BS.cons '0' <$> floatPart))
<?> "number" <?> "number"
@ -114,7 +118,7 @@ instance Output StringObject where
output (Literal s) = Output.string (printf "(%s)" s) output (Literal s) = Output.string (printf "(%s)" s)
output (Hexadecimal s) = Output.string (printf "<%s>" s) output (Hexadecimal s) = Output.string (printf "<%s>" s)
stringObject :: Parser u StringObject stringObject :: MonadParser m => m StringObject
stringObject = stringObject =
Literal . BS.unpack <$> (char '(' *> (BS.concat <$> literalString) <* char ')') Literal . BS.unpack <$> (char '(' *> (BS.concat <$> literalString) <* char ')')
<|> Hexadecimal . BS.unpack <$> (char '<' *> hexNumber <* char '>') <|> Hexadecimal . BS.unpack <$> (char '<' *> hexNumber <* char '>')
@ -143,7 +147,7 @@ name = Name . BS.unpack <$> (char '/' *> takeAll regular) <?> "name"
-- --
-- Array -- Array
-- --
array :: Parser u [DirectObject] array :: MonadParser m => m [DirectObject]
array = array =
char '[' *> blank *> directObject `sepBy` blank <* blank <* char ']' <?> "array" char '[' *> blank *> directObject `sepBy` blank <* blank <* char ']' <?> "array"
@ -160,7 +164,7 @@ instance Output Dictionary where
outputKeyVal :: (Name, DirectObject) -> OBuilder outputKeyVal :: (Name, DirectObject) -> OBuilder
outputKeyVal (key, val) = Output.concat [output key, " ", output val] outputKeyVal (key, val) = Output.concat [output key, " ", output val]
dictionary :: Parser u Dictionary dictionary :: MonadParser m => m Dictionary
dictionary = dictionary =
string "<<" *> blank *> keyValPairs <* string ">>" <?> "dictionary" string "<<" *> blank *> keyValPairs <* string ">>" <?> "dictionary"
where where
@ -170,7 +174,7 @@ dictionary =
-- --
-- Null -- Null
-- --
nullObject :: Parser u () nullObject :: MonadParser m => m ()
nullObject = string "null" *> return () <?> "null object" nullObject = string "null" *> return () <?> "null object"
-- --
@ -181,7 +185,7 @@ data IndirectObjCoordinates = IndirectObjCoordinates {
, versionNumber :: Int , versionNumber :: Int
} deriving Show } deriving Show
reference :: Parser u IndirectObjCoordinates reference :: MonadParser m => m IndirectObjCoordinates
reference = IndirectObjCoordinates reference = IndirectObjCoordinates
<$> (fmap ObjectId integer) <*> integer <* char 'R' <?> "reference to an object" <$> (fmap ObjectId integer) <*> integer <* char 'R' <?> "reference to an object"
@ -210,7 +214,7 @@ instance Output DirectObject where
output (Reference (IndirectObjCoordinates {objectId, versionNumber})) = output (Reference (IndirectObjCoordinates {objectId, versionNumber})) =
Output.string (printf "%d %d R" (getObjectId objectId) versionNumber) Output.string (printf "%d %d R" (getObjectId objectId) versionNumber)
directObject :: Parser u DirectObject directObject :: MonadParser m => m DirectObject
directObject = directObject =
Boolean <$> boolean Boolean <$> boolean
<|> Reference <$> reference {- defined before Number because Number is a prefix of it -} <|> Reference <$> reference {- defined before Number because Number is a prefix of it -}

View file

@ -4,19 +4,24 @@ module PDF.Text (
, CMappers , CMappers
, PageContents(..) , PageContents(..)
, cMap , cMap
, emptyCMap
, pageContents , pageContents
) where ) where
import Control.Applicative ((<|>)) import Control.Applicative ((<|>), many)
import Control.Monad (join) import Control.Monad (foldM, join)
import Control.Monad.Reader (ReaderT, runReaderT) import Control.Monad.Reader (ReaderT, runReaderT, asks)
import Control.Monad.State (state) import Control.Monad.State (put)
import Data.Attoparsec.ByteString.Char8 (count, sepBy) import Data.Attoparsec.ByteString.Char8 (choice, count, parseOnly, sepBy)
import Data.ByteString.Char8 (ByteString) import qualified Data.Attoparsec.ByteString.Char8 as Atto (Parser)
import Data.Map (Map) import Data.ByteString.Char8 (ByteString, pack)
import qualified Data.Map as Map (empty) import Data.Map (Map, (!))
import PDF.Object (Content, Name, blank, name, regular) import qualified Data.Map as Map (empty, fromList)
import PDF.Output (ObjectId) import PDF.Object (
DirectObject(..), Name, StringObject(..)
, array, blank, directObject, integer, line, name, regular, stringObject
)
import qualified PDF.EOL as EOL (charset, parser)
import PDF.Parser (Parser, evalParser, string, takeAll) import PDF.Parser (Parser, evalParser, string, takeAll)
type CMappers = Map Name CMap type CMappers = Map Name CMap
@ -25,10 +30,92 @@ type CMap = Map Int ByteString
emptyCMap :: CMap emptyCMap :: CMap
emptyCMap = Map.empty emptyCMap = Map.empty
data TextOperator = TJ | Tj | Tf | Other cMap :: ByteString -> Either String CMap
cMap = parseOnly $ mconcat <$> many (cMapRange <|> cMapChar <|> ignoredLine)
where
ignoredLine =
takeAll (not . (`elem` EOL.charset)) *> EOL.parser *> return Map.empty
cMap :: ByteString -> CMap cMapRange :: Atto.Parser CMap
cMap = undefined cMapRange = do
size <- integer <* line "beginbfrange"
mconcat <$> count size rangeMapping <* line "endbfrange"
where
rangeMapping = mapFromTo
<$> (stringObject <* blank)
<*> (stringObject <* blank)
<*> directObject <* EOL.parser
cMapChar :: Atto.Parser CMap
cMapChar = do
size <- integer <* line "beginbfchar"
Map.fromList <$> count size charMapping <* line "endbfchar"
where
charMapping = pairMapping
<$> (stringObject <* blank) <*> (stringObject <* blank) <* EOL.parser
mapFromTo (Hexadecimal from) (Hexadecimal to) (StringObject (Hexadecimal dstFrom)) =
undefined
mapFromTo (Hexadecimal from) (Hexadecimal to) (Array dstPoints) = undefined
mapFromTo _ _ _ = Map.empty
pairMapping :: StringObject -> StringObject -> (Int, ByteString)
pairMapping (Hexadecimal from) (Hexadecimal to) =
(read $ "0x" ++ from,
fromBytes :: String -> ByteString
fromBytes s
where
toBaseWord8 n
| n < 0xff = [
to
data StateOperator =
Cm | W | J | J_ | M | D | Ri | I | Gs -- general graphic state
deriving (Bounded, Enum)
data TextOperator =
Td | TD | Tm | Tstar -- text positioning
| TJ | Tj | Quote | DQuote -- text showing
| Tc | Tw | Tz | TL | Tf | Tr | Ts -- text state
deriving (Bounded, Enum)
data Argument = Raw ByteString | Typed DirectObject
type Call a = (a, [Argument])
stateOperator :: StateOperator -> ParserWithFont (Call StateOperator)
stateOperator Cm = (,) Cm <$> count 6 argument <* string "cm"
stateOperator W = (,) W <$> count 1 argument <* string "w"
stateOperator J = (,) J <$> count 1 argument <* string "J"
stateOperator J_ = (,) J_ <$> count 1 argument <* string "j"
stateOperator M = (,) M <$> count 1 argument <* string "M"
stateOperator D = (,) D <$> count 2 argument <* string "d"
stateOperator Ri = (,) Ri <$> count 1 argument <* string "ri"
stateOperator I = (,) I <$> count 1 argument <* string "i"
stateOperator Gs = (,) Gs <$> count 1 argument <* string "gs"
textOperator :: TextOperator -> ParserWithFont (Call TextOperator)
textOperator Td = (,) Td <$> count 2 argument <* string "Td"
textOperator TD = (,) TD <$> count 2 argument <* string "TD"
textOperator Tm = (,) Tm <$> count 6 argument <* string "Tm"
textOperator Tstar = (,) Td <$> return [] <* string "T*"
textOperator TJ =
(,) TJ <$> sequence [Typed . Array <$> array] <* string "TJ"
textOperator Tj =
(,) Tj <$> sequence [Typed . StringObject <$> stringObject] <* string "Tj"
textOperator Quote = (,) Quote <$> count 1 argument <* string "'"
textOperator DQuote = (,) DQuote <$> count 1 argument <* string "\""
textOperator Tc = (,) Tc <$> count 1 argument <* string "Tc"
textOperator Tw = (,) Tw <$> count 1 argument <* string "Tw"
textOperator Tz = (,) Tz <$> count 1 argument <* string "Tz"
textOperator TL = (,) TL <$> count 1 argument <* string "TL"
textOperator Tf = (,) Tf <$> sequence [Typed . NameObject <$> name, argument] <* string "Tf"
textOperator Tr = (,) Tr <$> count 1 argument <* string "Tr"
textOperator Ts = (,) Ts <$> count 1 argument <* string "Ts"
a :: (Bounded o, Enum o) => (o -> ParserWithFont (Call o)) -> ParserWithFont (Call o)
a parserGenerator = choice $ parserGenerator <$> [minBound .. maxBound]
argument :: ParserWithFont Argument
argument = Raw <$> takeAll regular <* blank
data PageContents = PageContents { data PageContents = PageContents {
chunks :: [ByteString] chunks :: [ByteString]
@ -36,18 +123,6 @@ data PageContents = PageContents {
type ParserWithFont = ReaderT CMappers (Parser CMap) type ParserWithFont = ReaderT CMappers (Parser CMap)
{-
data FontContext = FontContext {
cMappers :: CMappers
, currentFont :: CMap
}
initFontContext cMappers = FontContext {
cMappers
, currentFont = emptyCMap
}
-}
pageContents :: CMappers -> ByteString -> Either String PageContents pageContents :: CMappers -> ByteString -> Either String PageContents
pageContents font input = pageContents font input =
evalParser (runReaderT (PageContents <$> page) font) emptyCMap input evalParser (runReaderT (PageContents <$> page) font) emptyCMap input
@ -60,14 +135,37 @@ graphicState =
string "q" *> blank *> insideQ <* string "Q" string "q" *> blank *> insideQ <* string "Q"
where where
insideQ = join <$> (command <|> page `sepBy` blank ) insideQ = join <$> (command <|> page `sepBy` blank )
command = command = a stateOperator *> return []
count 6 argument *> string "cm" *> return []
<|> name *> blank *> string "gs" *> return []
argument = takeAll regular <* blank
text :: ParserWithFont [ByteString] text :: ParserWithFont [ByteString]
text = undefined text =
string "BT" *> blank *> commands <* blank <* string "ET"
where
commands = join <$> (a textOperator >>= runOperator) `sepBy` blank
textOperator :: ParserWithFont TextOperator runOperator :: Call TextOperator -> ParserWithFont [ByteString]
textOperator = undefined runOperator (Tf, [Typed (NameObject fontName), _]) =
asks (! fontName) >>= put >> return []
runOperator (Tstar, []) = return ["\n"]
runOperator (TJ, [Typed (Array arrayObject)]) =
replicate 1 <$> foldM appendText "" arrayObject
where
appendText bs (StringObject outputString) =
mappend bs <$> decodeString outputString
appendText bs _ = return bs
runOperator (Tj, [Typed (StringObject outputString)]) =
replicate 1 <$> decodeString outputString
runOperator (Quote, [Typed (StringObject outputString)]) =
(\bs -> ["\n", bs]) <$> decodeString outputString
runOperator (DQuote, [Typed (StringObject outputString)]) =
(\bs -> ["\n", bs]) <$> decodeString outputString
runOperator _ = return []
decodeString :: StringObject -> ParserWithFont ByteString
decodeString = undefined