Still debugging, broke pretty much everything and finally implementing a proper coderange parsing for CMap because apparently that's necessary

This commit is contained in:
Tissevert 2019-09-30 14:13:12 +02:00
parent b8ca7281aa
commit 36d7f9b819
7 changed files with 187 additions and 64 deletions

View file

@ -23,9 +23,9 @@ library
, PDF.Output
, PDF.Text
, PDF.Update
, PDF.Parser
other-modules: Data.ByteString.Char8.Util
, PDF.Body
, PDF.Parser
-- other-extensions:
build-depends: attoparsec
, base >=4.9 && <4.13
@ -56,6 +56,16 @@ executable getObj
ghc-options: -Wall
default-language: Haskell2010
executable debug
main-is: examples/debug.hs
build-depends: base
, bytestring
, containers
, Hufflepdf
, mtl
ghc-options: -Wall
default-language: Haskell2010
executable getText
main-is: examples/getText.hs
build-depends: base

26
examples/debug.hs Normal file
View file

@ -0,0 +1,26 @@
module Main where
import Control.Monad.Reader
import qualified Data.ByteString as BS
import qualified Data.ByteString.Char8 as C8
import qualified Data.Map as Map
import PDF.Object (Name(..), array)
import PDF.CMap (CMappers, cMap, emptyCMap)
import PDF.Parser (evalParser)
import PDF.Text
test :: CMappers -> ParserWithFont a -> BS.ByteString -> Either String a
test fonts parser =
evalParser (runReaderT parser fonts) emptyCMap
main :: IO ()
main = do
--input <- BS.readFile "20.stream"
input <- BS.readFile "array.bin"
Right font <- cMap <$> BS.readFile "6300.stream"
--mapM_ (\(k, v) -> putStr (show k) >> putStr " -> " >> BS.putStrLn v) $ Map.toList font
--case pageContents (Map.singleton (Name "R9") font) input of
--case test (Map.singleton (Name "R9") font) array input of
case test (Map.singleton (Name "R9") font) (a textOperator) input of
Left e -> putStrLn e
Right l -> putStrLn . show $ l

View file

@ -16,11 +16,12 @@ import PDF.Object (
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
, Object(..), Name(..), Structure(..)
,)
import PDF.Output (ObjectId)
import PDF.Output (ObjectId(..))
import PDF.Text (PageContents(..), pageContents)
import PDF.Update (unify)
import System.Environment (getArgs)
import System.IO (hPutStrLn, stderr)
import Text.Printf (printf)
type CachedCMaps = Map ObjectId CMap
type T a = RWST Content [ByteString] CachedCMaps [] a
@ -30,16 +31,19 @@ list l = RWST (\_ s -> fillContext s <$> l)
where
fillContext s a = (a, s, [])
handleError :: a -> String -> T a
handleError defaultValue s =
(tell . replicate 1 . BS.pack $ "Warning: " ++ s) >> return defaultValue
handleError :: ObjectId -> a -> String -> T a
handleError objectId defaultValue s =
(tell . replicate 1 $ BS.pack message) >> return defaultValue
where
message = printf "Object #%d : %s" (getObjectId objectId) s
extractText :: Object -> T ()
extractText object = do
pageDict <- dict object
cMappers <- loadCMappers =<< getFont pageDict
contents <- stream =<< follow =<< key "Contents" pageDict
either (handleError ()) (tell . chunks) (pageContents cMappers contents)
contentsId <- target =<< key "Contents" pageDict
contents <- stream =<< getObject contentsId
either (handleError contentsId ()) (tell . chunks) (pageContents cMappers contents)
stream :: Object -> T ByteString
stream (Stream {header, streamContent}) = return $
@ -73,7 +77,7 @@ loadFont objectId =
>>= key "ToUnicode"
>>= follow
>>= stream
>>= either (handleError emptyCMap) return . cMap
>>= either (handleError objectId emptyCMap) return . cMap
loadCMappers :: Dictionary -> T CMappers
loadCMappers = foldM loadCMapper Map.empty . Map.toList
@ -94,9 +98,12 @@ key keyName dictionary =
Just obj -> return obj
_ -> list []
target :: DirectObject -> T ObjectId
target (Reference (IndirectObjCoordinates {objectId})) = return objectId
target _ = list []
follow :: DirectObject -> T Object
follow (Reference (IndirectObjCoordinates {objectId})) = getObject objectId
follow _ = list []
follow directObject = target directObject >>= getObject
dict :: Object -> T Dictionary
dict (Direct (Dictionary dictionary)) = return dictionary
@ -122,7 +129,6 @@ listTextObjects (Document {updates}) =
snd =<< evalRWST rwsMain (unify updates) Map.empty
where
rwsMain =
--Lazy.pack . show <$> (getObject =<< pagesList)
pagesList >>= getObject >>= extractText

View file

@ -1,16 +1,53 @@
module Data.ByteString.Char8.Util (
previous
decodeHex
, fromInt
, hexString
, parseBytes
, previous
, subBS
, toInt
, utf16BEToutf8
) where
import Data.ByteString.Char8 (ByteString)
import qualified Data.ByteString.Char8 as BS (drop, index, take)
import Data.ByteString (ByteString, snoc)
import qualified Data.ByteString as BS (foldl, pack, singleton)
import qualified Data.ByteString.Char8 as Char8 (drop, index, take, unpack)
import Data.Text.Encoding (encodeUtf8, decodeUtf16BE)
import Prelude hiding (length)
previous :: Char -> Int -> ByteString -> Int
previous char position byteString
| BS.index byteString position == char = position
| Char8.index byteString position == char = position
| otherwise = previous char (position - 1) byteString
subBS :: Int -> Int -> ByteString -> ByteString
subBS offset length = BS.take length . BS.drop offset
subBS offset length = Char8.take length . Char8.drop offset
hexString :: (Num a, Read a) => String -> a
hexString s = read $ "0x" ++ s
fromInt :: Int -> ByteString
fromInt n
| n < 0x100 = BS.singleton $ toEnum n
| otherwise = fromInt (n `div` 0x100) `snoc` (toEnum (n `mod` 0x100))
toInt :: ByteString -> Int
toInt = BS.foldl (\n w -> 0x100*n + fromEnum w) 0
{-
encodeHex :: ByteString -> ByteString
encodeHex =
-}
decodeHex :: ByteString -> ByteString
decodeHex = parseBytes . Char8.unpack
parseBytes :: String -> ByteString
parseBytes = BS.pack . fmap hexString . pairDigits
where
pairDigits "" = []
pairDigits [c] = [[c]]
pairDigits (firstChar:secondChar:end) = (firstChar:[secondChar]):pairDigits end
utf16BEToutf8 :: ByteString -> ByteString
utf16BEToutf8 = encodeUtf8 . decodeUtf16BE

View file

@ -1,3 +1,4 @@
{-# LANGUAGE NamedFieldPuns #-}
module PDF.CMap (
CMap
, CMappers
@ -6,35 +7,66 @@ module PDF.CMap (
) where
import Control.Applicative ((<|>), many)
import Data.Attoparsec.ByteString.Char8 (count, parseOnly)
import qualified Data.Attoparsec.ByteString.Char8 as Atto (Parser)
import Data.ByteString (ByteString, snoc)
import qualified Data.ByteString as BS (init, last, null)
import Data.Map (Map)
import qualified Data.Map as Map (empty, fromList)
import Control.Monad.State (modify)
import Data.Attoparsec.ByteString.Char8 (count)
import Data.ByteString (ByteString)
import qualified Data.ByteString as BS (length)
import Data.ByteString.Char8.Util (
decodeHex, fromInt, toInt, utf16BEToutf8
)
import Data.Map (Map, union)
import qualified Data.Map as Map (adjust, empty, fromList, insertWith)
import qualified PDF.EOL as EOL (charset, parser)
import PDF.Object (
DirectObject(..), Name, StringObject(..)
, blank, directObject, hexString, integer, line, parseBytes, stringObject
, blank, directObject, integer, line, stringObject
)
import PDF.Parser (takeAll)
import PDF.Parser (MonadParser, Parser, runParser, takeAll)
type CMappers = Map Name CMap
type CMap = Map Int ByteString
type Mapping = Map ByteString ByteString
data CRange = CRange {
fromSequence :: ByteString
, toSequence :: ByteString
, mapping :: Mapping
}
type RangeSize = Int
type CMap = Map RangeSize [CRange]
emptyCMap :: CMap
emptyCMap = Map.empty
cMap :: ByteString -> Either String CMap
cMap = parseOnly $ mconcat <$> many (cMapRange <|> cMapChar <|> ignoredLine)
cMap = fmap snd <$> runParser
(many (codeRanges <|> cMapRange <|> cMapChar <|> ignoredLine))
emptyCMap
where
ignoredLine =
takeAll (not . (`elem` EOL.charset)) *> EOL.parser *> return Map.empty
takeAll (not . (`elem` EOL.charset)) *> EOL.parser *> return ()
cMapRange :: Atto.Parser CMap
codeRanges :: Parser CMap ()
codeRanges = do
size <- integer <* line "begincodespacerange"
count size (createMapping <$> codeRange) *> return ()
line "endcodespacerange"
where
codeRange =
(,) <$> (stringObject <* blank) <*> (stringObject <* blank) <* EOL.parser
createMapping :: (StringObject, StringObject) -> Parser CMap ()
createMapping (Hexadecimal from, Hexadecimal to) = modify $
Map.insertWith (++) size [CRange {fromSequence, toSequence, mapping}]
where
fromSequence = decodeHex from
size = BS.length fromSequence
toSequence = decodeHex to
mapping = Map.empty
createMapping _ = return ()
cMapRange :: Parser CMap ()
cMapRange = do
size <- integer <* line "beginbfrange"
mconcat <$> count size (Map.fromList <$> rangeMapping) <* line "endbfrange"
mapM_ saveMapping =<< count size rangeMapping <* line "endbfrange"
where
rangeMapping = (,,)
<$> (stringObject <* blank)
@ -42,33 +74,52 @@ cMapRange = do
<*> directObject <* EOL.parser
>>= mapFromTo
cMapChar :: Atto.Parser CMap
saveMapping :: [(ByteString, ByteString)] -> Parser CMap ()
saveMapping [] = return ()
saveMapping assoc@((code, _):_) = modify $ Map.adjust insertCRange mappingSize
where
newMapping = Map.fromList assoc
mappingSize = BS.length code
matchingRange (CRange {fromSequence, toSequence}) =
fromSequence <= code && code <= toSequence
appendMapping cRange =
cRange {mapping = mapping cRange `union` newMapping}
insertCRange = fmap (\cRange ->
if matchingRange cRange then appendMapping cRange else cRange
)
cMapChar :: Parser CMap ()
cMapChar = do
size <- integer <* line "beginbfchar"
Map.fromList <$> count size charMapping <* line "endbfchar"
saveMapping =<< count size charMapping <* line "endbfchar"
where
charMapping =
(,) <$> (stringObject <* blank) <*> (stringObject <* blank) <* EOL.parser
>>= pairMapping
mapFromTo :: (StringObject, StringObject, DirectObject) -> Atto.Parser [(Int, ByteString)]
between :: ByteString -> ByteString -> [ByteString]
between from to = fromInt <$> [toInt from .. toInt to]
startFrom :: ByteString -> [ByteString]
startFrom from = fromInt <$> [toInt from .. ]
mapFromTo :: MonadParser m => (StringObject, StringObject, DirectObject) -> m [(ByteString, ByteString)]
mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) =
let dstString = parseBytes dstFrom in
return $ zip [hexString from .. hexString to] (textsFrom dstString)
return $ zip (between fromBS toBS) (startFrom dstBS)
where
textsFrom t
| BS.null t = [t]
| otherwise = (BS.init t `snoc`) <$> [BS.last t ..]
(fromBS, toBS, dstBS) = (decodeHex from, decodeHex to, decodeHex dstFrom)
mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) =
zip [hexString from .. hexString to] <$> (mapM dstString dstPoints)
zip (between fromBS toBS) <$> (mapM dstByteString dstPoints)
where
dstString (StringObject (Hexadecimal dstPoint)) = return $ parseBytes dstPoint
dstString _ = fail "Invalid for a replacement string"
(fromBS, toBS) = (decodeHex from, decodeHex to)
dstByteString (StringObject (Hexadecimal dst)) =
return . utf16BEToutf8 $ decodeHex dst
dstByteString _ = fail "Invalid for a replacement string"
mapFromTo _ = fail "invalid range mapping found"
pairMapping :: (StringObject, StringObject) -> Atto.Parser (Int, ByteString)
pairMapping :: MonadParser m => (StringObject, StringObject) -> m (ByteString, ByteString)
pairMapping (Hexadecimal from, Hexadecimal to) =
return (hexString from, parseBytes to)
return (decodeHex from, utf16BEToutf8 $ decodeHex to)
pairMapping _ = fail "invalid pair mapping found"

View file

@ -22,7 +22,6 @@ module PDF.Object (
, dictionary
, directObject
, eofMarker
, hexString
, integer
, line
, magicNumber
@ -39,6 +38,7 @@ import Data.Attoparsec.ByteString.Char8 (choice, count, option, sepBy)
import Data.ByteString (ByteString)
import qualified Data.ByteString as BS (concat, pack)
import qualified Data.ByteString.Char8 as Char8 (cons, pack, singleton, unpack)
import Data.ByteString.Char8.Util (hexString, parseBytes)
import Data.Map (Map, (!), mapWithKey)
import qualified Data.Map as Map (
delete, empty, fromList, lookup, minViewWithKey, toList, union
@ -114,16 +114,16 @@ number = Number . read . Char8.unpack <$>
--
-- StringObject
--
data StringObject = Literal String | Hexadecimal String deriving Show
data StringObject = Literal ByteString | Hexadecimal ByteString deriving Show
instance Output StringObject where
output (Literal s) = Output.string (printf "(%s)" s)
output (Hexadecimal s) = Output.string (printf "<%s>" s)
output (Literal s) = Output.string (printf "(%s)" (Char8.unpack s))
output (Hexadecimal s) = Output.string (printf "<%s>" (Char8.unpack s))
stringObject :: MonadParser m => m StringObject
stringObject =
Literal . Char8.unpack <$> (char '(' *> (BS.concat <$> literalString) <* char ')')
<|> Hexadecimal . Char8.unpack <$> (char '<' *> hexNumber <* char '>')
Literal <$> (char '(' *> (BS.concat <$> literalString) <* char ')')
<|> Hexadecimal <$> (char '<' *> hexNumber <* char '>')
<?> "string object (literal or hexadecimal)"
where
literalString = many literalStringBlock
@ -135,16 +135,6 @@ stringObject =
Char8.cons <$> char '\\' <*> (Char8.singleton <$> oneOf "nrtbf()\\" <|> octalCode)
octalCode = choice $ (\n -> Char8.pack <$> count n octDigit) <$> [1..3]
hexString :: (Num a, Read a) => String -> a
hexString s = read $ "0x" ++ s
parseBytes :: String -> ByteString
parseBytes = encodeUtf8 . decodeUtf16BE . BS.pack . fmap hexString . pairDigits
where
pairDigits "" = []
pairDigits [c] = [[c]]
pairDigits (firstChar:secondChar:end) = (firstChar:[secondChar]):pairDigits end
--
-- Name
--

View file

@ -11,6 +11,7 @@ import Control.Monad.State (get, put)
import Data.Attoparsec.ByteString.Char8 (choice, count, sepBy)
import Data.ByteString (ByteString)
import qualified Data.ByteString.Char8 as Char8 (unpack)
import Data.ByteString.Char8.Util (decodeHex)
import Data.Map ((!))
import qualified Data.Map as Map (lookup)
import PDF.CMap (CMappers, CMap, emptyCMap)
@ -22,13 +23,13 @@ import PDF.Parser (MonadParser, (<?>), Parser, evalParser, string, takeAll)
data StateOperator =
Cm | W | J | J_ | M | D | Ri | I | Gs -- general graphic state
deriving (Bounded, Enum)
deriving (Bounded, Enum, Show)
data TextOperator =
Td | TD | Tm | Tstar -- text positioning
| TJ | Tj | Quote | DQuote -- text showing
| Tc | Tw | Tz | TL | Tf | Tr | Ts -- text state
deriving (Bounded, Enum)
data Argument = Raw ByteString | Typed DirectObject
deriving (Bounded, Enum, Show)
data Argument = Raw ByteString | Typed DirectObject deriving Show
type Call a = (a, [Argument])
stateOperator :: MonadParser m => StateOperator -> m (Call StateOperator)
@ -112,7 +113,7 @@ runOperator (TJ, [Typed (Array arrayObject)]) =
appendText bs (StringObject outputString) =
mappend bs <$> decodeString outputString
appendText bs _ = return bs
runOperator (Tj, [Typed (StringObject outputString)]) =
replicate 1 <$> decodeString outputString
@ -125,8 +126,10 @@ runOperator (DQuote, [Typed (StringObject outputString)]) =
runOperator _ = return []
decodeString :: StringObject -> ParserWithFont ByteString
decodeString (Hexadecimal h) = decodeString (Literal (Char8.unpack $ parseBytes h))
decodeString (Literal litString) = get >>= convertBytes litString
decodeString (Hexadecimal h) = decodeString (Literal (decodeHex h))
decodeString (Literal litString) =
get >>= convertBytes litString
where
convertBytes :: String -> CMap -> ParserWithFont ByteString
convertBytes [] _ = return ""