Still debugging, broke pretty much everything and finally implementing a proper coderange parsing for CMap because apparently that's necessary

2019-09-30 14:13:12 +02:00 · 2019-09-30 14:13:12 +02:00 · 36d7f9b819
commit 36d7f9b819
parent b8ca7281aa
7 changed files with 187 additions and 64 deletions
--- a/Hufflepdf.cabal
+++ b/Hufflepdf.cabal
@ -23,9 +23,9 @@ library
                     , PDF.Output
                     , PDF.Text
                     , PDF.Update
                     , PDF.Parser
  other-modules:       Data.ByteString.Char8.Util
                     , PDF.Body
                     , PDF.Parser
  -- other-extensions:    
  build-depends:       attoparsec
                     , base >=4.9 && <4.13
@ -56,6 +56,16 @@ executable             getObj
  ghc-options:         -Wall
  default-language:    Haskell2010
 executable             debug
  main-is:             examples/debug.hs
  build-depends:       base
                     , bytestring
                     , containers
                     , Hufflepdf
                     , mtl
  ghc-options:         -Wall
  default-language:    Haskell2010
 executable             getText
  main-is:             examples/getText.hs
  build-depends:       base
--- a/examples/debug.hs
+++ b/examples/debug.hs
@ -0,0 +1,26 @@
 module Main where
 import Control.Monad.Reader
 import qualified Data.ByteString as BS
 import qualified Data.ByteString.Char8 as C8
 import qualified Data.Map as Map
 import PDF.Object (Name(..), array)
 import PDF.CMap (CMappers, cMap, emptyCMap)
 import PDF.Parser (evalParser)
 import PDF.Text
 test :: CMappers -> ParserWithFont a -> BS.ByteString -> Either String a
 test fonts parser =
  evalParser (runReaderT parser fonts) emptyCMap
 main :: IO ()
 main = do
  --input <- BS.readFile "20.stream"
  input <- BS.readFile "array.bin"
  Right font <- cMap <$> BS.readFile "6300.stream"
  --mapM_ (\(k, v) -> putStr (show k) >> putStr " -> " >> BS.putStrLn v) $ Map.toList font
  --case pageContents (Map.singleton (Name "R9") font) input of
  --case test (Map.singleton (Name "R9") font) array input of
  case test (Map.singleton (Name "R9") font) (a textOperator) input of
    Left e -> putStrLn e
    Right l -> putStrLn . show $ l
--- a/examples/getText.hs
+++ b/examples/getText.hs
@ -16,11 +16,12 @@ import PDF.Object (
      Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
    , Object(..), Name(..), Structure(..)
  ,)
-import PDF.Output (ObjectId)
+import PDF.Output (ObjectId(..))
 import PDF.Text (PageContents(..), pageContents)
 import PDF.Update (unify)
 import System.Environment (getArgs)
 import System.IO (hPutStrLn, stderr)
 import Text.Printf (printf)
 type CachedCMaps = Map ObjectId CMap
 type T a = RWST Content [ByteString] CachedCMaps [] a
@ -30,16 +31,19 @@ list l = RWST (\_ s -> fillContext s <$> l)
  where
    fillContext s a = (a, s, [])
-handleError :: a -> String -> T a
+handleError :: ObjectId -> a -> String -> T a
-handleError defaultValue s =
+handleError objectId defaultValue s =
-  (tell . replicate 1 . BS.pack $ "Warning: " ++ s) >> return defaultValue
+  (tell . replicate 1 $ BS.pack message) >> return defaultValue
  where
    message = printf "Object #%d : %s" (getObjectId objectId) s
 extractText :: Object -> T ()
 extractText object = do
  pageDict <- dict object
  cMappers <- loadCMappers =<< getFont pageDict
-  contents <- stream =<< follow =<< key "Contents" pageDict
+  contentsId <- target =<< key "Contents" pageDict
-  either (handleError ()) (tell . chunks) (pageContents cMappers contents)
+  contents <- stream =<< getObject contentsId
  either (handleError contentsId ()) (tell . chunks) (pageContents cMappers contents)
 stream :: Object -> T ByteString
 stream (Stream {header, streamContent}) = return $
@ -73,7 +77,7 @@ loadFont objectId =
  >>= key "ToUnicode"
  >>= follow
  >>= stream
-  >>= either (handleError emptyCMap) return . cMap
+  >>= either (handleError objectId emptyCMap) return . cMap
 loadCMappers :: Dictionary -> T CMappers
 loadCMappers = foldM loadCMapper Map.empty . Map.toList
@ -94,9 +98,12 @@ key keyName dictionary =
    Just obj -> return obj
    _ -> list []
 target :: DirectObject -> T ObjectId
 target (Reference (IndirectObjCoordinates {objectId})) = return objectId
 target _ = list []
 follow :: DirectObject -> T Object
-follow (Reference (IndirectObjCoordinates {objectId})) = getObject objectId
+follow directObject = target directObject >>= getObject
 follow _ = list []
 dict :: Object -> T Dictionary
 dict (Direct (Dictionary dictionary)) = return dictionary
@ -122,7 +129,6 @@ listTextObjects (Document {updates}) =
  snd =<< evalRWST rwsMain (unify updates) Map.empty
  where
    rwsMain =
      --Lazy.pack . show <$> (getObject =<< pagesList)
      pagesList >>= getObject >>= extractText
--- a/src/Data/ByteString/Char8/Util.hs
+++ b/src/Data/ByteString/Char8/Util.hs
@ -1,16 +1,53 @@
 module Data.ByteString.Char8.Util (
-      previous
+      decodeHex
    , fromInt
    , hexString
    , parseBytes
    , previous
    , subBS
    , toInt
    , utf16BEToutf8
  ) where
-import Data.ByteString.Char8 (ByteString)
+import Data.ByteString (ByteString, snoc)
-import qualified Data.ByteString.Char8 as BS (drop, index, take)
+import qualified Data.ByteString as BS (foldl, pack, singleton)
 import qualified Data.ByteString.Char8 as Char8 (drop, index, take, unpack)
 import Data.Text.Encoding (encodeUtf8, decodeUtf16BE)
 import Prelude hiding (length)
 previous :: Char -> Int -> ByteString -> Int
 previous char position byteString
-  | BS.index byteString position == char = position
+  | Char8.index byteString position == char = position
  | otherwise = previous char (position - 1) byteString
 subBS :: Int -> Int -> ByteString -> ByteString
-subBS offset length = BS.take length . BS.drop offset
+subBS offset length = Char8.take length . Char8.drop offset
 hexString :: (Num a, Read a) => String -> a
 hexString s = read $ "0x" ++ s
 fromInt :: Int -> ByteString
 fromInt n
  | n < 0x100 = BS.singleton $ toEnum n
  | otherwise = fromInt (n `div` 0x100) `snoc` (toEnum (n `mod` 0x100))
 toInt :: ByteString -> Int
 toInt = BS.foldl (\n w -> 0x100*n + fromEnum w) 0
 {-
 encodeHex :: ByteString -> ByteString
 encodeHex = 
 -}
 decodeHex :: ByteString -> ByteString
 decodeHex = parseBytes . Char8.unpack
 parseBytes :: String -> ByteString
 parseBytes = BS.pack . fmap hexString . pairDigits
  where
    pairDigits "" = []
    pairDigits [c] = [[c]]
    pairDigits (firstChar:secondChar:end) = (firstChar:[secondChar]):pairDigits end
 utf16BEToutf8 :: ByteString -> ByteString
 utf16BEToutf8 = encodeUtf8 . decodeUtf16BE
--- a/src/PDF/CMap.hs
+++ b/src/PDF/CMap.hs
@ -1,3 +1,4 @@
 {-# LANGUAGE NamedFieldPuns #-}
 module PDF.CMap (
      CMap
    , CMappers
@ -6,35 +7,66 @@ module PDF.CMap (
  ) where
 import Control.Applicative ((<|>), many)
-import Data.Attoparsec.ByteString.Char8 (count, parseOnly)
+import Control.Monad.State (modify)
-import qualified Data.Attoparsec.ByteString.Char8 as Atto (Parser)
+import Data.Attoparsec.ByteString.Char8 (count)
-import Data.ByteString (ByteString, snoc)
+import Data.ByteString (ByteString)
-import qualified Data.ByteString as BS (init, last, null)
+import qualified Data.ByteString as BS (length)
-import Data.Map (Map)
+import Data.ByteString.Char8.Util (
-import qualified Data.Map as Map (empty, fromList)
+    decodeHex, fromInt, toInt, utf16BEToutf8
  )
 import Data.Map (Map, union)
 import qualified Data.Map as Map (adjust, empty, fromList, insertWith)
 import qualified PDF.EOL as EOL (charset, parser)
 import PDF.Object (
      DirectObject(..), Name, StringObject(..)
-    , blank, directObject, hexString, integer, line, parseBytes, stringObject
+    , blank, directObject, integer, line, stringObject
  )
-import PDF.Parser (takeAll)
+import PDF.Parser (MonadParser, Parser, runParser, takeAll)
 type CMappers = Map Name CMap
-type CMap = Map Int ByteString
+type Mapping = Map ByteString ByteString
 data CRange = CRange {
      fromSequence :: ByteString
    , toSequence :: ByteString
    , mapping :: Mapping
  }
 type RangeSize = Int
 type CMap = Map RangeSize [CRange]
 emptyCMap :: CMap
 emptyCMap = Map.empty
 cMap :: ByteString -> Either String CMap
-cMap = parseOnly $ mconcat <$> many (cMapRange <|> cMapChar <|> ignoredLine)
+cMap = fmap snd <$> runParser
  (many (codeRanges <|> cMapRange <|> cMapChar <|> ignoredLine))
  emptyCMap
  where
    ignoredLine =
-      takeAll (not . (`elem` EOL.charset)) *> EOL.parser *> return Map.empty
+      takeAll (not . (`elem` EOL.charset)) *> EOL.parser *> return ()
-cMapRange :: Atto.Parser CMap
+codeRanges :: Parser CMap ()
 codeRanges = do
  size <- integer <* line "begincodespacerange"
  count size (createMapping <$> codeRange) *> return ()
  line "endcodespacerange"
  where
    codeRange =
      (,) <$> (stringObject <* blank) <*> (stringObject <* blank) <* EOL.parser
 createMapping :: (StringObject, StringObject) -> Parser CMap ()
 createMapping (Hexadecimal from, Hexadecimal to) = modify $
  Map.insertWith (++) size [CRange {fromSequence, toSequence, mapping}]
  where
    fromSequence = decodeHex from
    size = BS.length fromSequence
    toSequence = decodeHex to
    mapping = Map.empty
 createMapping _ = return ()
 cMapRange :: Parser CMap ()
 cMapRange = do
  size <- integer <* line "beginbfrange"
-  mconcat <$> count size (Map.fromList <$> rangeMapping) <* line "endbfrange"
+  mapM_ saveMapping =<< count size rangeMapping <* line "endbfrange"
  where
    rangeMapping = (,,) 
      <$> (stringObject <* blank)
@ -42,33 +74,52 @@ cMapRange = do
      <*> directObject <* EOL.parser
      >>= mapFromTo
-cMapChar :: Atto.Parser CMap
+saveMapping :: [(ByteString, ByteString)] -> Parser CMap ()
 saveMapping [] = return ()
 saveMapping assoc@((code, _):_) = modify $ Map.adjust insertCRange mappingSize
  where
    newMapping = Map.fromList assoc
    mappingSize = BS.length code
    matchingRange (CRange {fromSequence, toSequence}) =
      fromSequence <= code && code <= toSequence
    appendMapping cRange =
      cRange {mapping = mapping cRange `union` newMapping}
    insertCRange = fmap (\cRange ->
        if matchingRange cRange then appendMapping cRange else cRange
      )
 cMapChar :: Parser CMap ()
 cMapChar = do
  size <- integer <* line "beginbfchar"
-  Map.fromList <$> count size charMapping <* line "endbfchar"
+  saveMapping =<< count size charMapping <* line "endbfchar"
  where
    charMapping =
      (,) <$> (stringObject <* blank) <*> (stringObject <* blank) <* EOL.parser
      >>= pairMapping
-mapFromTo :: (StringObject, StringObject, DirectObject) -> Atto.Parser [(Int, ByteString)]
+between :: ByteString -> ByteString -> [ByteString]
 between from to = fromInt <$> [toInt from .. toInt to]
 startFrom :: ByteString -> [ByteString]
 startFrom from = fromInt <$> [toInt from .. ]
 mapFromTo :: MonadParser m => (StringObject, StringObject, DirectObject) -> m [(ByteString, ByteString)]
 mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) =
-  let dstString = parseBytes dstFrom in
+  return $ zip (between fromBS toBS) (startFrom dstBS)
  return $ zip [hexString from .. hexString to] (textsFrom dstString)
  where
-    textsFrom t
+    (fromBS, toBS, dstBS) = (decodeHex from, decodeHex to, decodeHex dstFrom)
      | BS.null t = [t]
      | otherwise = (BS.init t `snoc`) <$> [BS.last t ..]
 mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) =
-  zip [hexString from .. hexString to] <$> (mapM dstString dstPoints)
+  zip (between fromBS toBS) <$> (mapM dstByteString dstPoints)
  where
-    dstString (StringObject (Hexadecimal dstPoint)) = return $ parseBytes dstPoint
+    (fromBS, toBS) = (decodeHex from, decodeHex to)
-    dstString _ = fail "Invalid for a replacement string"
+    dstByteString (StringObject (Hexadecimal dst)) =
      return . utf16BEToutf8 $ decodeHex dst
    dstByteString _ = fail "Invalid for a replacement string"
 mapFromTo _ = fail "invalid range mapping found"
-pairMapping :: (StringObject, StringObject) -> Atto.Parser (Int, ByteString)
+pairMapping :: MonadParser m => (StringObject, StringObject) -> m (ByteString, ByteString)
 pairMapping (Hexadecimal from, Hexadecimal to) =
-  return (hexString from, parseBytes to)
+  return (decodeHex from, utf16BEToutf8 $ decodeHex to)
 pairMapping _ = fail "invalid pair mapping found"
--- a/src/PDF/Object.hs
+++ b/src/PDF/Object.hs
@ -22,7 +22,6 @@ module PDF.Object (
    , dictionary
    , directObject
    , eofMarker
    , hexString
    , integer
    , line
    , magicNumber
@ -39,6 +38,7 @@ import Data.Attoparsec.ByteString.Char8 (choice, count, option, sepBy)
 import Data.ByteString (ByteString)
 import qualified Data.ByteString as BS (concat, pack)
 import qualified Data.ByteString.Char8 as Char8 (cons, pack, singleton, unpack)
 import Data.ByteString.Char8.Util (hexString, parseBytes)
 import Data.Map (Map, (!), mapWithKey)
 import qualified Data.Map as Map (
    delete, empty, fromList, lookup, minViewWithKey, toList, union
@ -114,16 +114,16 @@ number = Number . read . Char8.unpack <$>
 --
 -- StringObject
 --
-data StringObject = Literal String | Hexadecimal String deriving Show
+data StringObject = Literal ByteString | Hexadecimal ByteString deriving Show
 instance Output StringObject where
-  output (Literal s) = Output.string (printf "(%s)" s)
+  output (Literal s) = Output.string (printf "(%s)" (Char8.unpack s))
-  output (Hexadecimal s) = Output.string (printf "<%s>" s)
+  output (Hexadecimal s) = Output.string (printf "<%s>" (Char8.unpack s))
 stringObject :: MonadParser m => m StringObject
 stringObject =
-    Literal . Char8.unpack <$> (char '(' *> (BS.concat <$> literalString) <* char ')')
+    Literal <$> (char '(' *> (BS.concat <$> literalString) <* char ')')
-  <|> Hexadecimal . Char8.unpack <$> (char '<' *> hexNumber <* char '>')
+  <|> Hexadecimal <$> (char '<' *> hexNumber <* char '>')
  <?> "string object (literal or hexadecimal)"
  where
    literalString = many literalStringBlock
@ -135,16 +135,6 @@ stringObject =
      Char8.cons <$> char '\\' <*> (Char8.singleton <$> oneOf "nrtbf()\\" <|> octalCode)
    octalCode = choice $ (\n -> Char8.pack <$> count n octDigit) <$> [1..3]
 hexString :: (Num a, Read a) => String -> a
 hexString s = read $ "0x" ++ s
 parseBytes :: String -> ByteString
 parseBytes = encodeUtf8 . decodeUtf16BE . BS.pack . fmap hexString . pairDigits
  where
    pairDigits "" = []
    pairDigits [c] = [[c]]
    pairDigits (firstChar:secondChar:end) = (firstChar:[secondChar]):pairDigits end
 --
 -- Name
 --
--- a/src/PDF/Text.hs
+++ b/src/PDF/Text.hs
@ -11,6 +11,7 @@ import Control.Monad.State (get, put)
 import Data.Attoparsec.ByteString.Char8 (choice, count, sepBy)
 import Data.ByteString (ByteString)
 import qualified Data.ByteString.Char8 as Char8 (unpack)
 import Data.ByteString.Char8.Util (decodeHex)
 import Data.Map ((!))
 import qualified Data.Map as Map (lookup)
 import PDF.CMap (CMappers, CMap, emptyCMap)
@ -22,13 +23,13 @@ import PDF.Parser (MonadParser, (<?>), Parser, evalParser, string, takeAll)
 data StateOperator =
  Cm | W | J | J_ | M | D | Ri | I | Gs -- general graphic state
-  deriving (Bounded, Enum)
+  deriving (Bounded, Enum, Show)
 data TextOperator =
    Td | TD | Tm | Tstar -- text positioning
  | TJ | Tj | Quote | DQuote -- text showing
  | Tc | Tw | Tz | TL | Tf | Tr | Ts -- text state
-  deriving (Bounded, Enum)
+  deriving (Bounded, Enum, Show)
-data Argument = Raw ByteString | Typed DirectObject
+data Argument = Raw ByteString | Typed DirectObject deriving Show
 type Call a = (a, [Argument])
 stateOperator :: MonadParser m => StateOperator -> m (Call StateOperator)
@ -112,7 +113,7 @@ runOperator (TJ, [Typed (Array arrayObject)]) =
    appendText bs (StringObject outputString) =
      mappend bs <$> decodeString outputString
    appendText bs _ = return bs
-      
+
 runOperator (Tj, [Typed (StringObject outputString)]) =
  replicate 1 <$> decodeString outputString
@ -125,8 +126,10 @@ runOperator (DQuote, [Typed (StringObject outputString)]) =
 runOperator _ = return []
 decodeString :: StringObject -> ParserWithFont ByteString
-decodeString (Hexadecimal h) = decodeString (Literal (Char8.unpack $ parseBytes h))
+decodeString (Hexadecimal h) = decodeString (Literal (decodeHex h))
-decodeString (Literal litString) = get >>= convertBytes litString
+decodeString (Literal litString) =
 get >>= convertBytes litString
  where
    convertBytes :: String -> CMap -> ParserWithFont ByteString
    convertBytes [] _ = return ""