Still debugging, broke pretty much everything and finally implementing a proper coderange parsing for CMap because apparently that's necessary

This commit is contained in:
Tissevert 2019-09-30 14:13:12 +02:00
parent b8ca7281aa
commit 36d7f9b819
7 changed files with 187 additions and 64 deletions

View file

@ -23,9 +23,9 @@ library
, PDF.Output , PDF.Output
, PDF.Text , PDF.Text
, PDF.Update , PDF.Update
, PDF.Parser
other-modules: Data.ByteString.Char8.Util other-modules: Data.ByteString.Char8.Util
, PDF.Body , PDF.Body
, PDF.Parser
-- other-extensions: -- other-extensions:
build-depends: attoparsec build-depends: attoparsec
, base >=4.9 && <4.13 , base >=4.9 && <4.13
@ -56,6 +56,16 @@ executable getObj
ghc-options: -Wall ghc-options: -Wall
default-language: Haskell2010 default-language: Haskell2010
executable debug
main-is: examples/debug.hs
build-depends: base
, bytestring
, containers
, Hufflepdf
, mtl
ghc-options: -Wall
default-language: Haskell2010
executable getText executable getText
main-is: examples/getText.hs main-is: examples/getText.hs
build-depends: base build-depends: base

26
examples/debug.hs Normal file
View file

@ -0,0 +1,26 @@
module Main where
import Control.Monad.Reader
import qualified Data.ByteString as BS
import qualified Data.ByteString.Char8 as C8
import qualified Data.Map as Map
import PDF.Object (Name(..), array)
import PDF.CMap (CMappers, cMap, emptyCMap)
import PDF.Parser (evalParser)
import PDF.Text
test :: CMappers -> ParserWithFont a -> BS.ByteString -> Either String a
test fonts parser =
evalParser (runReaderT parser fonts) emptyCMap
main :: IO ()
main = do
--input <- BS.readFile "20.stream"
input <- BS.readFile "array.bin"
Right font <- cMap <$> BS.readFile "6300.stream"
--mapM_ (\(k, v) -> putStr (show k) >> putStr " -> " >> BS.putStrLn v) $ Map.toList font
--case pageContents (Map.singleton (Name "R9") font) input of
--case test (Map.singleton (Name "R9") font) array input of
case test (Map.singleton (Name "R9") font) (a textOperator) input of
Left e -> putStrLn e
Right l -> putStrLn . show $ l

View file

@ -16,11 +16,12 @@ import PDF.Object (
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..) Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
, Object(..), Name(..), Structure(..) , Object(..), Name(..), Structure(..)
,) ,)
import PDF.Output (ObjectId) import PDF.Output (ObjectId(..))
import PDF.Text (PageContents(..), pageContents) import PDF.Text (PageContents(..), pageContents)
import PDF.Update (unify) import PDF.Update (unify)
import System.Environment (getArgs) import System.Environment (getArgs)
import System.IO (hPutStrLn, stderr) import System.IO (hPutStrLn, stderr)
import Text.Printf (printf)
type CachedCMaps = Map ObjectId CMap type CachedCMaps = Map ObjectId CMap
type T a = RWST Content [ByteString] CachedCMaps [] a type T a = RWST Content [ByteString] CachedCMaps [] a
@ -30,16 +31,19 @@ list l = RWST (\_ s -> fillContext s <$> l)
where where
fillContext s a = (a, s, []) fillContext s a = (a, s, [])
handleError :: a -> String -> T a handleError :: ObjectId -> a -> String -> T a
handleError defaultValue s = handleError objectId defaultValue s =
(tell . replicate 1 . BS.pack $ "Warning: " ++ s) >> return defaultValue (tell . replicate 1 $ BS.pack message) >> return defaultValue
where
message = printf "Object #%d : %s" (getObjectId objectId) s
extractText :: Object -> T () extractText :: Object -> T ()
extractText object = do extractText object = do
pageDict <- dict object pageDict <- dict object
cMappers <- loadCMappers =<< getFont pageDict cMappers <- loadCMappers =<< getFont pageDict
contents <- stream =<< follow =<< key "Contents" pageDict contentsId <- target =<< key "Contents" pageDict
either (handleError ()) (tell . chunks) (pageContents cMappers contents) contents <- stream =<< getObject contentsId
either (handleError contentsId ()) (tell . chunks) (pageContents cMappers contents)
stream :: Object -> T ByteString stream :: Object -> T ByteString
stream (Stream {header, streamContent}) = return $ stream (Stream {header, streamContent}) = return $
@ -73,7 +77,7 @@ loadFont objectId =
>>= key "ToUnicode" >>= key "ToUnicode"
>>= follow >>= follow
>>= stream >>= stream
>>= either (handleError emptyCMap) return . cMap >>= either (handleError objectId emptyCMap) return . cMap
loadCMappers :: Dictionary -> T CMappers loadCMappers :: Dictionary -> T CMappers
loadCMappers = foldM loadCMapper Map.empty . Map.toList loadCMappers = foldM loadCMapper Map.empty . Map.toList
@ -94,9 +98,12 @@ key keyName dictionary =
Just obj -> return obj Just obj -> return obj
_ -> list [] _ -> list []
target :: DirectObject -> T ObjectId
target (Reference (IndirectObjCoordinates {objectId})) = return objectId
target _ = list []
follow :: DirectObject -> T Object follow :: DirectObject -> T Object
follow (Reference (IndirectObjCoordinates {objectId})) = getObject objectId follow directObject = target directObject >>= getObject
follow _ = list []
dict :: Object -> T Dictionary dict :: Object -> T Dictionary
dict (Direct (Dictionary dictionary)) = return dictionary dict (Direct (Dictionary dictionary)) = return dictionary
@ -122,7 +129,6 @@ listTextObjects (Document {updates}) =
snd =<< evalRWST rwsMain (unify updates) Map.empty snd =<< evalRWST rwsMain (unify updates) Map.empty
where where
rwsMain = rwsMain =
--Lazy.pack . show <$> (getObject =<< pagesList)
pagesList >>= getObject >>= extractText pagesList >>= getObject >>= extractText

View file

@ -1,16 +1,53 @@
module Data.ByteString.Char8.Util ( module Data.ByteString.Char8.Util (
previous decodeHex
, fromInt
, hexString
, parseBytes
, previous
, subBS , subBS
, toInt
, utf16BEToutf8
) where ) where
import Data.ByteString.Char8 (ByteString) import Data.ByteString (ByteString, snoc)
import qualified Data.ByteString.Char8 as BS (drop, index, take) import qualified Data.ByteString as BS (foldl, pack, singleton)
import qualified Data.ByteString.Char8 as Char8 (drop, index, take, unpack)
import Data.Text.Encoding (encodeUtf8, decodeUtf16BE)
import Prelude hiding (length) import Prelude hiding (length)
previous :: Char -> Int -> ByteString -> Int previous :: Char -> Int -> ByteString -> Int
previous char position byteString previous char position byteString
| BS.index byteString position == char = position | Char8.index byteString position == char = position
| otherwise = previous char (position - 1) byteString | otherwise = previous char (position - 1) byteString
subBS :: Int -> Int -> ByteString -> ByteString subBS :: Int -> Int -> ByteString -> ByteString
subBS offset length = BS.take length . BS.drop offset subBS offset length = Char8.take length . Char8.drop offset
hexString :: (Num a, Read a) => String -> a
hexString s = read $ "0x" ++ s
fromInt :: Int -> ByteString
fromInt n
| n < 0x100 = BS.singleton $ toEnum n
| otherwise = fromInt (n `div` 0x100) `snoc` (toEnum (n `mod` 0x100))
toInt :: ByteString -> Int
toInt = BS.foldl (\n w -> 0x100*n + fromEnum w) 0
{-
encodeHex :: ByteString -> ByteString
encodeHex =
-}
decodeHex :: ByteString -> ByteString
decodeHex = parseBytes . Char8.unpack
parseBytes :: String -> ByteString
parseBytes = BS.pack . fmap hexString . pairDigits
where
pairDigits "" = []
pairDigits [c] = [[c]]
pairDigits (firstChar:secondChar:end) = (firstChar:[secondChar]):pairDigits end
utf16BEToutf8 :: ByteString -> ByteString
utf16BEToutf8 = encodeUtf8 . decodeUtf16BE

View file

@ -1,3 +1,4 @@
{-# LANGUAGE NamedFieldPuns #-}
module PDF.CMap ( module PDF.CMap (
CMap CMap
, CMappers , CMappers
@ -6,35 +7,66 @@ module PDF.CMap (
) where ) where
import Control.Applicative ((<|>), many) import Control.Applicative ((<|>), many)
import Data.Attoparsec.ByteString.Char8 (count, parseOnly) import Control.Monad.State (modify)
import qualified Data.Attoparsec.ByteString.Char8 as Atto (Parser) import Data.Attoparsec.ByteString.Char8 (count)
import Data.ByteString (ByteString, snoc) import Data.ByteString (ByteString)
import qualified Data.ByteString as BS (init, last, null) import qualified Data.ByteString as BS (length)
import Data.Map (Map) import Data.ByteString.Char8.Util (
import qualified Data.Map as Map (empty, fromList) decodeHex, fromInt, toInt, utf16BEToutf8
)
import Data.Map (Map, union)
import qualified Data.Map as Map (adjust, empty, fromList, insertWith)
import qualified PDF.EOL as EOL (charset, parser) import qualified PDF.EOL as EOL (charset, parser)
import PDF.Object ( import PDF.Object (
DirectObject(..), Name, StringObject(..) DirectObject(..), Name, StringObject(..)
, blank, directObject, hexString, integer, line, parseBytes, stringObject , blank, directObject, integer, line, stringObject
) )
import PDF.Parser (takeAll) import PDF.Parser (MonadParser, Parser, runParser, takeAll)
type CMappers = Map Name CMap type CMappers = Map Name CMap
type CMap = Map Int ByteString type Mapping = Map ByteString ByteString
data CRange = CRange {
fromSequence :: ByteString
, toSequence :: ByteString
, mapping :: Mapping
}
type RangeSize = Int
type CMap = Map RangeSize [CRange]
emptyCMap :: CMap emptyCMap :: CMap
emptyCMap = Map.empty emptyCMap = Map.empty
cMap :: ByteString -> Either String CMap cMap :: ByteString -> Either String CMap
cMap = parseOnly $ mconcat <$> many (cMapRange <|> cMapChar <|> ignoredLine) cMap = fmap snd <$> runParser
(many (codeRanges <|> cMapRange <|> cMapChar <|> ignoredLine))
emptyCMap
where where
ignoredLine = ignoredLine =
takeAll (not . (`elem` EOL.charset)) *> EOL.parser *> return Map.empty takeAll (not . (`elem` EOL.charset)) *> EOL.parser *> return ()
cMapRange :: Atto.Parser CMap codeRanges :: Parser CMap ()
codeRanges = do
size <- integer <* line "begincodespacerange"
count size (createMapping <$> codeRange) *> return ()
line "endcodespacerange"
where
codeRange =
(,) <$> (stringObject <* blank) <*> (stringObject <* blank) <* EOL.parser
createMapping :: (StringObject, StringObject) -> Parser CMap ()
createMapping (Hexadecimal from, Hexadecimal to) = modify $
Map.insertWith (++) size [CRange {fromSequence, toSequence, mapping}]
where
fromSequence = decodeHex from
size = BS.length fromSequence
toSequence = decodeHex to
mapping = Map.empty
createMapping _ = return ()
cMapRange :: Parser CMap ()
cMapRange = do cMapRange = do
size <- integer <* line "beginbfrange" size <- integer <* line "beginbfrange"
mconcat <$> count size (Map.fromList <$> rangeMapping) <* line "endbfrange" mapM_ saveMapping =<< count size rangeMapping <* line "endbfrange"
where where
rangeMapping = (,,) rangeMapping = (,,)
<$> (stringObject <* blank) <$> (stringObject <* blank)
@ -42,33 +74,52 @@ cMapRange = do
<*> directObject <* EOL.parser <*> directObject <* EOL.parser
>>= mapFromTo >>= mapFromTo
cMapChar :: Atto.Parser CMap saveMapping :: [(ByteString, ByteString)] -> Parser CMap ()
saveMapping [] = return ()
saveMapping assoc@((code, _):_) = modify $ Map.adjust insertCRange mappingSize
where
newMapping = Map.fromList assoc
mappingSize = BS.length code
matchingRange (CRange {fromSequence, toSequence}) =
fromSequence <= code && code <= toSequence
appendMapping cRange =
cRange {mapping = mapping cRange `union` newMapping}
insertCRange = fmap (\cRange ->
if matchingRange cRange then appendMapping cRange else cRange
)
cMapChar :: Parser CMap ()
cMapChar = do cMapChar = do
size <- integer <* line "beginbfchar" size <- integer <* line "beginbfchar"
Map.fromList <$> count size charMapping <* line "endbfchar" saveMapping =<< count size charMapping <* line "endbfchar"
where where
charMapping = charMapping =
(,) <$> (stringObject <* blank) <*> (stringObject <* blank) <* EOL.parser (,) <$> (stringObject <* blank) <*> (stringObject <* blank) <* EOL.parser
>>= pairMapping >>= pairMapping
mapFromTo :: (StringObject, StringObject, DirectObject) -> Atto.Parser [(Int, ByteString)] between :: ByteString -> ByteString -> [ByteString]
between from to = fromInt <$> [toInt from .. toInt to]
startFrom :: ByteString -> [ByteString]
startFrom from = fromInt <$> [toInt from .. ]
mapFromTo :: MonadParser m => (StringObject, StringObject, DirectObject) -> m [(ByteString, ByteString)]
mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) = mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) =
let dstString = parseBytes dstFrom in return $ zip (between fromBS toBS) (startFrom dstBS)
return $ zip [hexString from .. hexString to] (textsFrom dstString)
where where
textsFrom t (fromBS, toBS, dstBS) = (decodeHex from, decodeHex to, decodeHex dstFrom)
| BS.null t = [t]
| otherwise = (BS.init t `snoc`) <$> [BS.last t ..]
mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) = mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) =
zip [hexString from .. hexString to] <$> (mapM dstString dstPoints) zip (between fromBS toBS) <$> (mapM dstByteString dstPoints)
where where
dstString (StringObject (Hexadecimal dstPoint)) = return $ parseBytes dstPoint (fromBS, toBS) = (decodeHex from, decodeHex to)
dstString _ = fail "Invalid for a replacement string" dstByteString (StringObject (Hexadecimal dst)) =
return . utf16BEToutf8 $ decodeHex dst
dstByteString _ = fail "Invalid for a replacement string"
mapFromTo _ = fail "invalid range mapping found" mapFromTo _ = fail "invalid range mapping found"
pairMapping :: (StringObject, StringObject) -> Atto.Parser (Int, ByteString) pairMapping :: MonadParser m => (StringObject, StringObject) -> m (ByteString, ByteString)
pairMapping (Hexadecimal from, Hexadecimal to) = pairMapping (Hexadecimal from, Hexadecimal to) =
return (hexString from, parseBytes to) return (decodeHex from, utf16BEToutf8 $ decodeHex to)
pairMapping _ = fail "invalid pair mapping found" pairMapping _ = fail "invalid pair mapping found"

View file

@ -22,7 +22,6 @@ module PDF.Object (
, dictionary , dictionary
, directObject , directObject
, eofMarker , eofMarker
, hexString
, integer , integer
, line , line
, magicNumber , magicNumber
@ -39,6 +38,7 @@ import Data.Attoparsec.ByteString.Char8 (choice, count, option, sepBy)
import Data.ByteString (ByteString) import Data.ByteString (ByteString)
import qualified Data.ByteString as BS (concat, pack) import qualified Data.ByteString as BS (concat, pack)
import qualified Data.ByteString.Char8 as Char8 (cons, pack, singleton, unpack) import qualified Data.ByteString.Char8 as Char8 (cons, pack, singleton, unpack)
import Data.ByteString.Char8.Util (hexString, parseBytes)
import Data.Map (Map, (!), mapWithKey) import Data.Map (Map, (!), mapWithKey)
import qualified Data.Map as Map ( import qualified Data.Map as Map (
delete, empty, fromList, lookup, minViewWithKey, toList, union delete, empty, fromList, lookup, minViewWithKey, toList, union
@ -114,16 +114,16 @@ number = Number . read . Char8.unpack <$>
-- --
-- StringObject -- StringObject
-- --
data StringObject = Literal String | Hexadecimal String deriving Show data StringObject = Literal ByteString | Hexadecimal ByteString deriving Show
instance Output StringObject where instance Output StringObject where
output (Literal s) = Output.string (printf "(%s)" s) output (Literal s) = Output.string (printf "(%s)" (Char8.unpack s))
output (Hexadecimal s) = Output.string (printf "<%s>" s) output (Hexadecimal s) = Output.string (printf "<%s>" (Char8.unpack s))
stringObject :: MonadParser m => m StringObject stringObject :: MonadParser m => m StringObject
stringObject = stringObject =
Literal . Char8.unpack <$> (char '(' *> (BS.concat <$> literalString) <* char ')') Literal <$> (char '(' *> (BS.concat <$> literalString) <* char ')')
<|> Hexadecimal . Char8.unpack <$> (char '<' *> hexNumber <* char '>') <|> Hexadecimal <$> (char '<' *> hexNumber <* char '>')
<?> "string object (literal or hexadecimal)" <?> "string object (literal or hexadecimal)"
where where
literalString = many literalStringBlock literalString = many literalStringBlock
@ -135,16 +135,6 @@ stringObject =
Char8.cons <$> char '\\' <*> (Char8.singleton <$> oneOf "nrtbf()\\" <|> octalCode) Char8.cons <$> char '\\' <*> (Char8.singleton <$> oneOf "nrtbf()\\" <|> octalCode)
octalCode = choice $ (\n -> Char8.pack <$> count n octDigit) <$> [1..3] octalCode = choice $ (\n -> Char8.pack <$> count n octDigit) <$> [1..3]
hexString :: (Num a, Read a) => String -> a
hexString s = read $ "0x" ++ s
parseBytes :: String -> ByteString
parseBytes = encodeUtf8 . decodeUtf16BE . BS.pack . fmap hexString . pairDigits
where
pairDigits "" = []
pairDigits [c] = [[c]]
pairDigits (firstChar:secondChar:end) = (firstChar:[secondChar]):pairDigits end
-- --
-- Name -- Name
-- --

View file

@ -11,6 +11,7 @@ import Control.Monad.State (get, put)
import Data.Attoparsec.ByteString.Char8 (choice, count, sepBy) import Data.Attoparsec.ByteString.Char8 (choice, count, sepBy)
import Data.ByteString (ByteString) import Data.ByteString (ByteString)
import qualified Data.ByteString.Char8 as Char8 (unpack) import qualified Data.ByteString.Char8 as Char8 (unpack)
import Data.ByteString.Char8.Util (decodeHex)
import Data.Map ((!)) import Data.Map ((!))
import qualified Data.Map as Map (lookup) import qualified Data.Map as Map (lookup)
import PDF.CMap (CMappers, CMap, emptyCMap) import PDF.CMap (CMappers, CMap, emptyCMap)
@ -22,13 +23,13 @@ import PDF.Parser (MonadParser, (<?>), Parser, evalParser, string, takeAll)
data StateOperator = data StateOperator =
Cm | W | J | J_ | M | D | Ri | I | Gs -- general graphic state Cm | W | J | J_ | M | D | Ri | I | Gs -- general graphic state
deriving (Bounded, Enum) deriving (Bounded, Enum, Show)
data TextOperator = data TextOperator =
Td | TD | Tm | Tstar -- text positioning Td | TD | Tm | Tstar -- text positioning
| TJ | Tj | Quote | DQuote -- text showing | TJ | Tj | Quote | DQuote -- text showing
| Tc | Tw | Tz | TL | Tf | Tr | Ts -- text state | Tc | Tw | Tz | TL | Tf | Tr | Ts -- text state
deriving (Bounded, Enum) deriving (Bounded, Enum, Show)
data Argument = Raw ByteString | Typed DirectObject data Argument = Raw ByteString | Typed DirectObject deriving Show
type Call a = (a, [Argument]) type Call a = (a, [Argument])
stateOperator :: MonadParser m => StateOperator -> m (Call StateOperator) stateOperator :: MonadParser m => StateOperator -> m (Call StateOperator)
@ -112,7 +113,7 @@ runOperator (TJ, [Typed (Array arrayObject)]) =
appendText bs (StringObject outputString) = appendText bs (StringObject outputString) =
mappend bs <$> decodeString outputString mappend bs <$> decodeString outputString
appendText bs _ = return bs appendText bs _ = return bs
runOperator (Tj, [Typed (StringObject outputString)]) = runOperator (Tj, [Typed (StringObject outputString)]) =
replicate 1 <$> decodeString outputString replicate 1 <$> decodeString outputString
@ -125,8 +126,10 @@ runOperator (DQuote, [Typed (StringObject outputString)]) =
runOperator _ = return [] runOperator _ = return []
decodeString :: StringObject -> ParserWithFont ByteString decodeString :: StringObject -> ParserWithFont ByteString
decodeString (Hexadecimal h) = decodeString (Literal (Char8.unpack $ parseBytes h)) decodeString (Hexadecimal h) = decodeString (Literal (decodeHex h))
decodeString (Literal litString) = get >>= convertBytes litString decodeString (Literal litString) =
get >>= convertBytes litString
where where
convertBytes :: String -> CMap -> ParserWithFont ByteString convertBytes :: String -> CMap -> ParserWithFont ByteString
convertBytes [] _ = return "" convertBytes [] _ = return ""