Still debugging, broke pretty much everything and finally implementing a proper coderange parsing for CMap because apparently that's necessary

2019-09-30 14:13:12 +02:00 · 2019-09-30 14:13:12 +02:00 · 36d7f9b819
commit 36d7f9b819
parent b8ca7281aa
7 changed files with 187 additions and 64 deletions
--- a/Hufflepdf.cabal
+++ b/Hufflepdf.cabal
@ -23,9 +23,9 @@ library
                     , PDF.Output
                     , PDF.Text
                     , PDF.Update
+                     , PDF.Parser
  other-modules:       Data.ByteString.Char8.Util
                     , PDF.Body
-                     , PDF.Parser
  -- other-extensions:    
  build-depends:       attoparsec
                     , base >=4.9 && <4.13
@ -56,6 +56,16 @@ executable             getObj
  ghc-options:         -Wall
  default-language:    Haskell2010

+executable             debug
+  main-is:             examples/debug.hs
+  build-depends:       base
+                     , bytestring
+                     , containers
+                     , Hufflepdf
+                     , mtl
+  ghc-options:         -Wall
+  default-language:    Haskell2010
+
 executable             getText
  main-is:             examples/getText.hs
  build-depends:       base
--- a/examples/debug.hs
+++ b/examples/debug.hs
@ -0,0 +1,26 @@
+module Main where
+
+import Control.Monad.Reader
+import qualified Data.ByteString as BS
+import qualified Data.ByteString.Char8 as C8
+import qualified Data.Map as Map
+import PDF.Object (Name(..), array)
+import PDF.CMap (CMappers, cMap, emptyCMap)
+import PDF.Parser (evalParser)
+import PDF.Text
+
+test :: CMappers -> ParserWithFont a -> BS.ByteString -> Either String a
+test fonts parser =
+  evalParser (runReaderT parser fonts) emptyCMap
+
+main :: IO ()
+main = do
+  --input <- BS.readFile "20.stream"
+  input <- BS.readFile "array.bin"
+  Right font <- cMap <$> BS.readFile "6300.stream"
+  --mapM_ (\(k, v) -> putStr (show k) >> putStr " -> " >> BS.putStrLn v) $ Map.toList font
+  --case pageContents (Map.singleton (Name "R9") font) input of
+  --case test (Map.singleton (Name "R9") font) array input of
+  case test (Map.singleton (Name "R9") font) (a textOperator) input of
+    Left e -> putStrLn e
+    Right l -> putStrLn . show $ l
--- a/examples/getText.hs
+++ b/examples/getText.hs
@ -16,11 +16,12 @@ import PDF.Object (
      Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
    , Object(..), Name(..), Structure(..)
  ,)
-import PDF.Output (ObjectId)
+import PDF.Output (ObjectId(..))
 import PDF.Text (PageContents(..), pageContents)
 import PDF.Update (unify)
 import System.Environment (getArgs)
 import System.IO (hPutStrLn, stderr)
+import Text.Printf (printf)

 type CachedCMaps = Map ObjectId CMap
 type T a = RWST Content [ByteString] CachedCMaps [] a
@ -30,16 +31,19 @@ list l = RWST (\_ s -> fillContext s <$> l)
  where
    fillContext s a = (a, s, [])

-handleError :: a -> String -> T a
-handleError defaultValue s =
-  (tell . replicate 1 . BS.pack $ "Warning: " ++ s) >> return defaultValue
+handleError :: ObjectId -> a -> String -> T a
+handleError objectId defaultValue s =
+  (tell . replicate 1 $ BS.pack message) >> return defaultValue
+  where
+    message = printf "Object #%d : %s" (getObjectId objectId) s

 extractText :: Object -> T ()
 extractText object = do
  pageDict <- dict object
  cMappers <- loadCMappers =<< getFont pageDict
-  contents <- stream =<< follow =<< key "Contents" pageDict
-  either (handleError ()) (tell . chunks) (pageContents cMappers contents)
+  contentsId <- target =<< key "Contents" pageDict
+  contents <- stream =<< getObject contentsId
+  either (handleError contentsId ()) (tell . chunks) (pageContents cMappers contents)

 stream :: Object -> T ByteString
 stream (Stream {header, streamContent}) = return $
@ -73,7 +77,7 @@ loadFont objectId =
  >>= key "ToUnicode"
  >>= follow
  >>= stream
-  >>= either (handleError emptyCMap) return . cMap
+  >>= either (handleError objectId emptyCMap) return . cMap

 loadCMappers :: Dictionary -> T CMappers
 loadCMappers = foldM loadCMapper Map.empty . Map.toList
@ -94,9 +98,12 @@ key keyName dictionary =
    Just obj -> return obj
    _ -> list []

+target :: DirectObject -> T ObjectId
+target (Reference (IndirectObjCoordinates {objectId})) = return objectId
+target _ = list []
+
 follow :: DirectObject -> T Object
-follow (Reference (IndirectObjCoordinates {objectId})) = getObject objectId
-follow _ = list []
+follow directObject = target directObject >>= getObject

 dict :: Object -> T Dictionary
 dict (Direct (Dictionary dictionary)) = return dictionary
@ -122,7 +129,6 @@ listTextObjects (Document {updates}) =
  snd =<< evalRWST rwsMain (unify updates) Map.empty
  where
    rwsMain =
-      --Lazy.pack . show <$> (getObject =<< pagesList)
      pagesList >>= getObject >>= extractText
  

--- a/src/Data/ByteString/Char8/Util.hs
+++ b/src/Data/ByteString/Char8/Util.hs
@ -1,16 +1,53 @@
 module Data.ByteString.Char8.Util (
-      previous
+      decodeHex
+    , fromInt
+    , hexString
+    , parseBytes
+    , previous
    , subBS
+    , toInt
+    , utf16BEToutf8
  ) where

-import Data.ByteString.Char8 (ByteString)
-import qualified Data.ByteString.Char8 as BS (drop, index, take)
+import Data.ByteString (ByteString, snoc)
+import qualified Data.ByteString as BS (foldl, pack, singleton)
+import qualified Data.ByteString.Char8 as Char8 (drop, index, take, unpack)
+import Data.Text.Encoding (encodeUtf8, decodeUtf16BE)
 import Prelude hiding (length)

 previous :: Char -> Int -> ByteString -> Int
 previous char position byteString
-  | BS.index byteString position == char = position
+  | Char8.index byteString position == char = position
  | otherwise = previous char (position - 1) byteString

 subBS :: Int -> Int -> ByteString -> ByteString
-subBS offset length = BS.take length . BS.drop offset
+subBS offset length = Char8.take length . Char8.drop offset
+
+hexString :: (Num a, Read a) => String -> a
+hexString s = read $ "0x" ++ s
+
+fromInt :: Int -> ByteString
+fromInt n
+  | n < 0x100 = BS.singleton $ toEnum n
+  | otherwise = fromInt (n `div` 0x100) `snoc` (toEnum (n `mod` 0x100))
+
+toInt :: ByteString -> Int
+toInt = BS.foldl (\n w -> 0x100*n + fromEnum w) 0
+
+{-
+encodeHex :: ByteString -> ByteString
+encodeHex = 
+-}
+
+decodeHex :: ByteString -> ByteString
+decodeHex = parseBytes . Char8.unpack
+
+parseBytes :: String -> ByteString
+parseBytes = BS.pack . fmap hexString . pairDigits
+  where
+    pairDigits "" = []
+    pairDigits [c] = [[c]]
+    pairDigits (firstChar:secondChar:end) = (firstChar:[secondChar]):pairDigits end
+
+utf16BEToutf8 :: ByteString -> ByteString
+utf16BEToutf8 = encodeUtf8 . decodeUtf16BE
--- a/src/PDF/CMap.hs
+++ b/src/PDF/CMap.hs
@ -1,3 +1,4 @@
+{-# LANGUAGE NamedFieldPuns #-}
 module PDF.CMap (
      CMap
    , CMappers
@ -6,35 +7,66 @@ module PDF.CMap (
  ) where

 import Control.Applicative ((<|>), many)
-import Data.Attoparsec.ByteString.Char8 (count, parseOnly)
-import qualified Data.Attoparsec.ByteString.Char8 as Atto (Parser)
-import Data.ByteString (ByteString, snoc)
-import qualified Data.ByteString as BS (init, last, null)
-import Data.Map (Map)
-import qualified Data.Map as Map (empty, fromList)
+import Control.Monad.State (modify)
+import Data.Attoparsec.ByteString.Char8 (count)
+import Data.ByteString (ByteString)
+import qualified Data.ByteString as BS (length)
+import Data.ByteString.Char8.Util (
+    decodeHex, fromInt, toInt, utf16BEToutf8
+  )
+import Data.Map (Map, union)
+import qualified Data.Map as Map (adjust, empty, fromList, insertWith)
 import qualified PDF.EOL as EOL (charset, parser)
 import PDF.Object (
      DirectObject(..), Name, StringObject(..)
-    , blank, directObject, hexString, integer, line, parseBytes, stringObject
+    , blank, directObject, integer, line, stringObject
  )
-import PDF.Parser (takeAll)
+import PDF.Parser (MonadParser, Parser, runParser, takeAll)

 type CMappers = Map Name CMap
-type CMap = Map Int ByteString
+type Mapping = Map ByteString ByteString
+data CRange = CRange {
+      fromSequence :: ByteString
+    , toSequence :: ByteString
+    , mapping :: Mapping
+  }
+type RangeSize = Int
+type CMap = Map RangeSize [CRange]

 emptyCMap :: CMap
 emptyCMap = Map.empty

 cMap :: ByteString -> Either String CMap
-cMap = parseOnly $ mconcat <$> many (cMapRange <|> cMapChar <|> ignoredLine)
+cMap = fmap snd <$> runParser
+  (many (codeRanges <|> cMapRange <|> cMapChar <|> ignoredLine))
+  emptyCMap
  where
    ignoredLine =
-      takeAll (not . (`elem` EOL.charset)) *> EOL.parser *> return Map.empty
+      takeAll (not . (`elem` EOL.charset)) *> EOL.parser *> return ()

-cMapRange :: Atto.Parser CMap
+codeRanges :: Parser CMap ()
+codeRanges = do
+  size <- integer <* line "begincodespacerange"
+  count size (createMapping <$> codeRange) *> return ()
+  line "endcodespacerange"
+  where
+    codeRange =
+      (,) <$> (stringObject <* blank) <*> (stringObject <* blank) <* EOL.parser
+
+createMapping :: (StringObject, StringObject) -> Parser CMap ()
+createMapping (Hexadecimal from, Hexadecimal to) = modify $
+  Map.insertWith (++) size [CRange {fromSequence, toSequence, mapping}]
+  where
+    fromSequence = decodeHex from
+    size = BS.length fromSequence
+    toSequence = decodeHex to
+    mapping = Map.empty
+createMapping _ = return ()
+
+cMapRange :: Parser CMap ()
 cMapRange = do
  size <- integer <* line "beginbfrange"
-  mconcat <$> count size (Map.fromList <$> rangeMapping) <* line "endbfrange"
+  mapM_ saveMapping =<< count size rangeMapping <* line "endbfrange"
  where
    rangeMapping = (,,) 
      <$> (stringObject <* blank)
@ -42,33 +74,52 @@ cMapRange = do
      <*> directObject <* EOL.parser
      >>= mapFromTo

-cMapChar :: Atto.Parser CMap
+saveMapping :: [(ByteString, ByteString)] -> Parser CMap ()
+saveMapping [] = return ()
+saveMapping assoc@((code, _):_) = modify $ Map.adjust insertCRange mappingSize
+  where
+    newMapping = Map.fromList assoc
+    mappingSize = BS.length code
+    matchingRange (CRange {fromSequence, toSequence}) =
+      fromSequence <= code && code <= toSequence
+    appendMapping cRange =
+      cRange {mapping = mapping cRange `union` newMapping}
+    insertCRange = fmap (\cRange ->
+        if matchingRange cRange then appendMapping cRange else cRange
+      )
+
+cMapChar :: Parser CMap ()
 cMapChar = do
  size <- integer <* line "beginbfchar"
-  Map.fromList <$> count size charMapping <* line "endbfchar"
+  saveMapping =<< count size charMapping <* line "endbfchar"
  where
    charMapping =
      (,) <$> (stringObject <* blank) <*> (stringObject <* blank) <* EOL.parser
      >>= pairMapping

-mapFromTo :: (StringObject, StringObject, DirectObject) -> Atto.Parser [(Int, ByteString)]
+between :: ByteString -> ByteString -> [ByteString]
+between from to = fromInt <$> [toInt from .. toInt to]
+
+startFrom :: ByteString -> [ByteString]
+startFrom from = fromInt <$> [toInt from .. ]
+
+mapFromTo :: MonadParser m => (StringObject, StringObject, DirectObject) -> m [(ByteString, ByteString)]
 mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) =
-  let dstString = parseBytes dstFrom in
-  return $ zip [hexString from .. hexString to] (textsFrom dstString)
+  return $ zip (between fromBS toBS) (startFrom dstBS)
  where
-    textsFrom t
-      | BS.null t = [t]
-      | otherwise = (BS.init t `snoc`) <$> [BS.last t ..]
+    (fromBS, toBS, dstBS) = (decodeHex from, decodeHex to, decodeHex dstFrom)

 mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) =
-  zip [hexString from .. hexString to] <$> (mapM dstString dstPoints)
+  zip (between fromBS toBS) <$> (mapM dstByteString dstPoints)
  where
-    dstString (StringObject (Hexadecimal dstPoint)) = return $ parseBytes dstPoint
-    dstString _ = fail "Invalid for a replacement string"
+    (fromBS, toBS) = (decodeHex from, decodeHex to)
+    dstByteString (StringObject (Hexadecimal dst)) =
+      return . utf16BEToutf8 $ decodeHex dst
+    dstByteString _ = fail "Invalid for a replacement string"

 mapFromTo _ = fail "invalid range mapping found"

-pairMapping :: (StringObject, StringObject) -> Atto.Parser (Int, ByteString)
+pairMapping :: MonadParser m => (StringObject, StringObject) -> m (ByteString, ByteString)
 pairMapping (Hexadecimal from, Hexadecimal to) =
-  return (hexString from, parseBytes to)
+  return (decodeHex from, utf16BEToutf8 $ decodeHex to)
 pairMapping _ = fail "invalid pair mapping found"
--- a/src/PDF/Object.hs
+++ b/src/PDF/Object.hs
@ -22,7 +22,6 @@ module PDF.Object (
    , dictionary
    , directObject
    , eofMarker
-    , hexString
    , integer
    , line
    , magicNumber
@ -39,6 +38,7 @@ import Data.Attoparsec.ByteString.Char8 (choice, count, option, sepBy)
 import Data.ByteString (ByteString)
 import qualified Data.ByteString as BS (concat, pack)
 import qualified Data.ByteString.Char8 as Char8 (cons, pack, singleton, unpack)
+import Data.ByteString.Char8.Util (hexString, parseBytes)
 import Data.Map (Map, (!), mapWithKey)
 import qualified Data.Map as Map (
    delete, empty, fromList, lookup, minViewWithKey, toList, union
@ -114,16 +114,16 @@ number = Number . read . Char8.unpack <$>
 --
 -- StringObject
 --
-data StringObject = Literal String | Hexadecimal String deriving Show
+data StringObject = Literal ByteString | Hexadecimal ByteString deriving Show

 instance Output StringObject where
-  output (Literal s) = Output.string (printf "(%s)" s)
-  output (Hexadecimal s) = Output.string (printf "<%s>" s)
+  output (Literal s) = Output.string (printf "(%s)" (Char8.unpack s))
+  output (Hexadecimal s) = Output.string (printf "<%s>" (Char8.unpack s))

 stringObject :: MonadParser m => m StringObject
 stringObject =
-    Literal . Char8.unpack <$> (char '(' *> (BS.concat <$> literalString) <* char ')')
-  <|> Hexadecimal . Char8.unpack <$> (char '<' *> hexNumber <* char '>')
+    Literal <$> (char '(' *> (BS.concat <$> literalString) <* char ')')
+  <|> Hexadecimal <$> (char '<' *> hexNumber <* char '>')
  <?> "string object (literal or hexadecimal)"
  where
    literalString = many literalStringBlock
@ -135,16 +135,6 @@ stringObject =
      Char8.cons <$> char '\\' <*> (Char8.singleton <$> oneOf "nrtbf()\\" <|> octalCode)
    octalCode = choice $ (\n -> Char8.pack <$> count n octDigit) <$> [1..3]

-hexString :: (Num a, Read a) => String -> a
-hexString s = read $ "0x" ++ s
-
-parseBytes :: String -> ByteString
-parseBytes = encodeUtf8 . decodeUtf16BE . BS.pack . fmap hexString . pairDigits
-  where
-    pairDigits "" = []
-    pairDigits [c] = [[c]]
-    pairDigits (firstChar:secondChar:end) = (firstChar:[secondChar]):pairDigits end
-
 --
 -- Name
 --
--- a/src/PDF/Text.hs
+++ b/src/PDF/Text.hs
@ -11,6 +11,7 @@ import Control.Monad.State (get, put)
 import Data.Attoparsec.ByteString.Char8 (choice, count, sepBy)
 import Data.ByteString (ByteString)
 import qualified Data.ByteString.Char8 as Char8 (unpack)
+import Data.ByteString.Char8.Util (decodeHex)
 import Data.Map ((!))
 import qualified Data.Map as Map (lookup)
 import PDF.CMap (CMappers, CMap, emptyCMap)
@ -22,13 +23,13 @@ import PDF.Parser (MonadParser, (<?>), Parser, evalParser, string, takeAll)

 data StateOperator =
  Cm | W | J | J_ | M | D | Ri | I | Gs -- general graphic state
-  deriving (Bounded, Enum)
+  deriving (Bounded, Enum, Show)
 data TextOperator =
    Td | TD | Tm | Tstar -- text positioning
  | TJ | Tj | Quote | DQuote -- text showing
  | Tc | Tw | Tz | TL | Tf | Tr | Ts -- text state
-  deriving (Bounded, Enum)
-data Argument = Raw ByteString | Typed DirectObject
+  deriving (Bounded, Enum, Show)
+data Argument = Raw ByteString | Typed DirectObject deriving Show
 type Call a = (a, [Argument])

 stateOperator :: MonadParser m => StateOperator -> m (Call StateOperator)
@ -112,7 +113,7 @@ runOperator (TJ, [Typed (Array arrayObject)]) =
    appendText bs (StringObject outputString) =
      mappend bs <$> decodeString outputString
    appendText bs _ = return bs
-      
+
 runOperator (Tj, [Typed (StringObject outputString)]) =
  replicate 1 <$> decodeString outputString

@ -125,8 +126,10 @@ runOperator (DQuote, [Typed (StringObject outputString)]) =
 runOperator _ = return []

 decodeString :: StringObject -> ParserWithFont ByteString
-decodeString (Hexadecimal h) = decodeString (Literal (Char8.unpack $ parseBytes h))
-decodeString (Literal litString) = get >>= convertBytes litString
+decodeString (Hexadecimal h) = decodeString (Literal (decodeHex h))
+decodeString (Literal litString) =
+
+get >>= convertBytes litString
  where
    convertBytes :: String -> CMap -> ParserWithFont ByteString
    convertBytes [] _ = return ""