Clean ByteString types to identify when a ByteString contains the representation of an integer in a given base and fix the last remaining PDF string (un)escaping issue

2019-10-04 18:46:07 +02:00 · 2019-10-04 18:46:07 +02:00 · 3a3e1533b4
commit 3a3e1533b4
parent a96e36ec5a
6 changed files with 119 additions and 87 deletions
--- a/examples/debug.hs
+++ b/examples/debug.hs
@ -1,11 +1,12 @@
+{-# LANGUAGE NamedFieldPuns #-}
 module Main where

 import Control.Monad.Reader
 import qualified Data.ByteString as BS
 import qualified Data.ByteString.Char8 as C8
 import qualified Data.Map as Map
-import PDF.Object (Name(..), array)
-import PDF.CMap (CMappers, cMap, emptyCMap)
+import PDF.Object (Name(..), StringObject(..), array)
+import PDF.CMap (CMappers, CRange(..), cMap, emptyCMap)
 import PDF.Parser (evalParser)
 import PDF.Text

@ -15,12 +16,20 @@ test fonts parser =

 main :: IO ()
 main = do
-  --input <- BS.readFile "20.stream"
-  input <- BS.readFile "array.bin"
+  input <- BS.readFile "20.stream"
+  --input <- BS.readFile "array.bin"
+  --let input = C8.pack "\f\xb5\0I"
  Right font <- cMap <$> BS.readFile "6300.stream"
-  --mapM_ (\(k, v) -> putStr (show k) >> putStr " -> " >> BS.putStrLn v) $ Map.toList font
+  --mapM_ (\(size, m) -> putStrLn ("taille " ++ show size) >> mapM_ showCRange m) $ Map.toList font
  --case pageContents (Map.singleton (Name "R9") font) input of
  --case test (Map.singleton (Name "R9") font) array input of
-  case test (Map.singleton (Name "R9") font) (a textOperator) input of
+  --case test (Map.singleton (Name "R9") font) (a textOperator) input of
+  case test (Map.singleton (Name "R9") font) page input of
    Left e -> putStrLn e
    Right l -> putStrLn . show $ l
+  where
+    showMapping = mapM_ (\(k, v) -> putStr (show k) >> putStr " -> " >> C8.putStrLn v) . Map.toList
+    showCRange :: CRange -> IO ()
+    showCRange (CRange {fromSequence, toSequence, mapping}) = do
+      putStrLn $ "from " ++ C8.unpack fromSequence ++ " to " ++ C8.unpack toSequence
+      showMapping mapping
--- a/src/Data/ByteString/Char8/Util.hs
+++ b/src/Data/ByteString/Char8/Util.hs
@ -1,19 +1,30 @@
 module Data.ByteString.Char8.Util (
-      decodeHex
-    , fromInt
-    , hexString
+      B16Int(..)
+    , B256Int(..)
+    , b8ToInt
+    , b16ToBytes
+    , b16ToInt
+    , b256ToInt
+    , intToB256
    , previous
    , subBS
-    , toInt
    , toBytes
+    , unescape
    , utf16BEToutf8
  ) where

 import Data.ByteString (ByteString, snoc)
-import qualified Data.ByteString as BS (empty, foldl, pack, singleton)
-import qualified Data.ByteString.Char8 as Char8 (drop, index, take, unpack)
+import qualified Data.ByteString as BS (empty, foldl, length, pack, singleton, splitAt)
+import qualified Data.ByteString.Char8 as Char8 (
+    cons, drop, index, splitAt, take, uncons, unpack
+  )
 import Data.Text.Encoding (encodeUtf8, decodeUtf16BE)
 import Prelude hiding (length)
+import Text.Printf (printf)
+
+newtype B8Int = B8Int ByteString deriving Show
+newtype B16Int = B16Int ByteString deriving Show
+newtype B256Int = B256Int ByteString deriving Show

 previous :: Char -> Int -> ByteString -> Int
 previous char position byteString
@ -23,27 +34,57 @@ previous char position byteString
 subBS :: Int -> Int -> ByteString -> ByteString
 subBS offset length = Char8.take length . Char8.drop offset

-hexString :: (Num a, Read a) => String -> a
-hexString s = read $ "0x" ++ s
+intToB256 :: Int -> B256Int
+intToB256 n
+  | n < 0x100 = B256Int . BS.singleton $ toEnum n
+  | otherwise =
+    let B256Int begining = intToB256 (n `div` 0x100) in
+    B256Int $ begining `snoc` (toEnum (n `mod` 0x100))

-fromInt :: Int -> ByteString
-fromInt n
-  | n < 0x100 = BS.singleton $ toEnum n
-  | otherwise = fromInt (n `div` 0x100) `snoc` (toEnum (n `mod` 0x100))
+b256ToInt :: B256Int -> Int
+b256ToInt (B256Int n) = BS.foldl (\k w -> 0x100*k + fromEnum w) 0 n

 toBytes :: Int -> Int -> ByteString
 toBytes 0 _ = BS.empty
-toBytes size n = toBytes (size - 1) (n `div` 0x100) `snoc` (toEnum (n `mod` 0x100))
+toBytes size n =
+  (toBytes (size - 1) (n `div` 0x100)) `snoc` (toEnum (n `mod` 0x100))

-toInt :: ByteString -> Int
-toInt = BS.foldl (\n w -> 0x100*n + fromEnum w) 0
-
-decodeHex :: ByteString -> ByteString
-decodeHex = BS.pack . fmap hexString . pairDigits . Char8.unpack
+b16ToBytes :: B16Int -> ByteString
+b16ToBytes (B16Int n) = BS.pack . fmap b16ToInt $ pairDigits n
  where
-    pairDigits "" = []
-    pairDigits [c] = [[c]]
-    pairDigits (firstChar:secondChar:end) = (firstChar:[secondChar]):pairDigits end
+    pairDigits s =
+      case BS.length s of
+        0 -> []
+        1 -> [B16Int s]
+        _ ->
+          let (twoHexDigits, rest) = BS.splitAt 2 s in
+          (B16Int $ twoHexDigits):(pairDigits rest)
+
+fromBase :: (Num a, Read a) => Char -> ByteString -> a
+fromBase b = read . printf "0%c%s" b . Char8.unpack
+
+b16ToInt :: (Num a, Read a) => B16Int -> a
+b16ToInt (B16Int n) = fromBase 'x' n
+
+b8ToInt :: (Num a, Read a) => B8Int -> a
+b8ToInt (B8Int n) = fromBase 'o' n
+
+unescape :: ByteString -> ByteString
+unescape escapedBS =
+  case Char8.uncons escapedBS of
+    Nothing -> BS.empty
+    Just ('\\', s) -> unescapeChar s
+    Just (c, s) -> Char8.cons c (unescape s)
+  where
+    unescapeChar s =
+      case Char8.uncons s of
+        Nothing -> BS.empty
+        Just (c, s')
+          | c `elem` "()" -> Char8.cons c (unescape s')
+          | c `elem` "nrtbf" -> Char8.cons (read (printf "'\\%c'" c)) (unescape s')
+          | c `elem` ['0'..'7'] -> fromOctal (Char8.splitAt 3 s)
+          | otherwise -> Char8.cons c (unescape s')
+    fromOctal (code, s) = Char8.cons (toEnum $ b8ToInt (B8Int code)) (unescape s)

 utf16BEToutf8 :: ByteString -> ByteString
 utf16BEToutf8 = encodeUtf8 . decodeUtf16BE
--- a/src/PDF/CMap.hs
+++ b/src/PDF/CMap.hs
@ -14,7 +14,7 @@ import Data.Attoparsec.ByteString.Char8 (count)
 import Data.ByteString (ByteString)
 import qualified Data.ByteString as BS (length)
 import Data.ByteString.Char8.Util (
-    decodeHex, toBytes, toInt, utf16BEToutf8
+    B16Int(..), b16ToBytes, b16ToInt, toBytes, utf16BEToutf8
  )
 import Data.Map (Map, union)
 import qualified Data.Map as Map (adjust, empty, fromList, insertWith)
@ -63,9 +63,9 @@ createMapping :: (StringObject, StringObject) -> Parser CMap ()
 createMapping (Hexadecimal from, Hexadecimal to) = modify $
  Map.insertWith (++) size [CRange {fromSequence, toSequence, mapping}]
  where
-    fromSequence = decodeHex from
+    fromSequence = b16ToBytes from
    size = BS.length fromSequence
-    toSequence = decodeHex to
+    toSequence = b16ToBytes to
    mapping = Map.empty
 createMapping _ = return ()

@ -102,33 +102,30 @@ cMapChar = do
      (,) <$> stringObject <* blank <*> stringObject <* EOL.parser
      >>= pairMapping

-between :: ByteString -> ByteString -> [ByteString]
-between from to =
-  let size = BS.length from in
-  toBytes size <$> [toInt from .. toInt to]
+between :: B16Int -> B16Int -> [ByteString]
+between from@(B16Int s) to =
+  let size = BS.length s `div` 2 in
+  toBytes size <$> [b16ToInt from .. b16ToInt to]

-startFrom :: ByteString -> [ByteString]
-startFrom from =
-  let size = BS.length from in
-  toBytes size <$> [toInt from .. ]
+startFrom :: B16Int -> [ByteString]
+startFrom from@(B16Int s) =
+  let size = BS.length s `div` 2 in
+  toBytes size <$> [b16ToInt from .. ]

 mapFromTo :: MonadParser m => (StringObject, StringObject, DirectObject) -> m [(ByteString, ByteString)]
 mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) =
-  return $ zip (between fromBS toBS) (utf16BEToutf8 <$> startFrom dstBS)
-  where
-    (fromBS, toBS, dstBS) = (decodeHex from, decodeHex to, decodeHex dstFrom)
+  return $ zip (between from to) (utf16BEToutf8 <$> startFrom dstFrom)

 mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) =
-  zip (between fromBS toBS) <$> (mapM dstByteString dstPoints)
+  zip (between from to) <$> (mapM dstByteString dstPoints)
  where
-    (fromBS, toBS) = (decodeHex from, decodeHex to)
    dstByteString (StringObject (Hexadecimal dst)) =
-      return . utf16BEToutf8 $ decodeHex dst
+      return . utf16BEToutf8 $ b16ToBytes dst
    dstByteString _ = fail "Invalid for a replacement string"

 mapFromTo _ = fail "invalid range mapping found"

 pairMapping :: MonadParser m => (StringObject, StringObject) -> m (ByteString, ByteString)
 pairMapping (Hexadecimal from, Hexadecimal to) =
-  return (decodeHex from, utf16BEToutf8 $ decodeHex to)
+  return (b16ToBytes  from, utf16BEToutf8 $ b16ToBytes  to)
 pairMapping _ = fail "invalid pair mapping found"
--- a/src/PDF/Object.hs
+++ b/src/PDF/Object.hs
@ -30,6 +30,7 @@ module PDF.Object (
    , regular
    , stringObject
    , structure
+    , toByteString
  ) where

 import Control.Applicative ((<|>), many)
@ -37,6 +38,7 @@ import Data.Attoparsec.ByteString.Char8 (choice, count, option, sepBy)
 import Data.ByteString (ByteString)
 import qualified Data.ByteString as BS (concat)
 import qualified Data.ByteString.Char8 as Char8 (cons, pack, singleton, unpack)
+import Data.ByteString.Char8.Util (B16Int(..), b16ToBytes, unescape)
 import Data.Map (Map, (!), mapWithKey)
 import qualified Data.Map as Map (
    delete, empty, fromList, lookup, minViewWithKey, toList, union
@ -111,11 +113,11 @@ number = Number . read . Char8.unpack <$>
 --
 -- StringObject
 --
-data StringObject = Literal ByteString | Hexadecimal ByteString deriving Show
+data StringObject = Literal ByteString | Hexadecimal B16Int deriving Show

 instance Output StringObject where
  output (Literal s) = Output.string (printf "(%s)" (Char8.unpack s))
-  output (Hexadecimal s) = Output.string (printf "<%s>" (Char8.unpack s))
+  output (Hexadecimal (B16Int n)) = Output.string (printf "<%s>" (Char8.unpack n))

 stringObject :: MonadParser m => m StringObject
 stringObject =
@ -129,9 +131,13 @@ stringObject =
    matchingParenthesis =
      mappend <$> (Char8.cons <$> char '(' <*> literalStringBlock) <*> string ")"
    escapedChar =
-      Char8.cons <$> char '\\' <*> (Char8.singleton <$> oneOf "nrtbf()\\" <|> octalCode)
+      Char8.cons <$> char '\\' <*> (Char8.singleton <$> oneOf "nrtbf()\\\n" <|> octalCode)
    octalCode = choice $ (\n -> Char8.pack <$> count n octDigit) <$> [1..3]

+toByteString :: StringObject -> ByteString
+toByteString (Hexadecimal h) = b16ToBytes h
+toByteString (Literal s) = unescape s
+
 --
 -- Name
 --
--- a/src/PDF/Parser.hs
+++ b/src/PDF/Parser.hs
@ -28,6 +28,7 @@ import qualified Data.Attoparsec.ByteString.Char8 as Atto (
    Parser, char, parseOnly, satisfy, string, take, takeWhile, takeWhile1
  )
 import Data.ByteString (ByteString)
+import Data.ByteString.Char8.Util (B16Int(..))
 import Data.Char (toLower)
 import Data.Set (Set)
 import qualified Data.Set as Set (fromList, member, unions)
@ -39,7 +40,7 @@ class MonadDeps m => MonadParser m where
  block :: Int -> m ByteString
  char :: Char -> m Char
  decNumber :: m ByteString
-  hexNumber :: m ByteString
+  hexNumber :: m B16Int
  oneOf :: String -> m Char
  string :: ByteString -> m ByteString
  takeAll :: (Char -> Bool) -> m ByteString
@ -49,7 +50,7 @@ instance MonadParser Atto.Parser where
  block = Atto.take
  char = Atto.char
  decNumber = Atto.takeWhile1 (`Set.member` digits)
-  hexNumber = Atto.takeWhile1 (`Set.member` hexDigits)
+  hexNumber = B16Int <$> Atto.takeWhile1 (`Set.member` hexDigits)
  oneOf charSet = Atto.satisfy (`elem` charSet)
  string s = Atto.string s <?> show s
  takeAll = Atto.takeWhile
--- a/src/PDF/Text.hs
+++ b/src/PDF/Text.hs
@ -12,14 +12,13 @@ import Control.Monad.State (get, put)
 import Data.Attoparsec.ByteString.Char8 (choice, count, sepBy)
 import Data.ByteString (ByteString)
 import qualified Data.ByteString as BS (drop, null, take)
-import Data.ByteString.Char8.Util (decodeHex)
-import Data.List (find)
+import Data.ByteString.Char8 (unpack)
 import Data.Map ((!))
 import qualified Data.Map as Map (lookup, toList)
 import PDF.CMap (CMappers, CMap, CRange(..), emptyCMap)
 import PDF.Object (
      DirectObject(..), StringObject(..)
-    , array, blank, name, parseBytes, regular, stringObject
+    , array, blank, name, regular, stringObject, toByteString
  )
 import PDF.Parser (MonadParser, (<?>), Parser, evalParser, string, takeAll)

@ -128,44 +127,23 @@ runOperator (DQuote, [Typed (StringObject outputString)]) =
 runOperator _ = return []

 decodeString :: StringObject -> ParserWithFont ByteString
-decodeString (Hexadecimal h) = decodeString (Literal (decodeHex h))
-decodeString (Literal litString) = do
-  cRangesBySize <- Map.toList <$> get
-  f cRangesBySize litString
+decodeString = decode . toByteString
  where
-    f :: [(Int, [CRange])] -> ByteString -> ParserWithFont ByteString
-    f cRangesBySize input
+    decode input
      | BS.null input = return ""
      | otherwise = do
-        (output, newInput) <- g cRangesBySize input
-        mappend output <$> f cRangesBySize newInput
-    g :: [(Int, [CRange])] -> ByteString -> ParserWithFont (ByteString, ByteString)
-    g [] _ = fail "No matching code found in font"
-    g ((size, cRanges):others) s =
+        (output, remainingInput) <- trySizes input =<< Map.toList <$> get
+        mappend output <$> decode remainingInput
+    trySizes :: ByteString -> [(Int, [CRange])] -> ParserWithFont (ByteString, ByteString)
+    trySizes s [] = fail $ "No matching code found in font for " ++ unpack s
+    trySizes s ((size, cRanges):others) =
      let prefix = BS.take size s in
-      case h prefix cRanges of
-        Nothing -> g others s
+      case tryRanges prefix cRanges of
+        Nothing -> trySizes s others
        Just outputSequence -> return (outputSequence, BS.drop size s)
-    h :: ByteString -> [CRange] -> Maybe ByteString
-    h prefix [] = Nothing
-    h prefix ((CRange {mapping}):cRanges) =
+    tryRanges :: ByteString -> [CRange] -> Maybe ByteString
+    tryRanges _ [] = Nothing
+    tryRanges prefix ((CRange {mapping}):cRanges) =
      case Map.lookup prefix mapping of
-        Nothing -> h prefix cRanges
+        Nothing -> tryRanges prefix cRanges
        outputSequence -> outputSequence
-
-{-
-get >>= convertBytes litString
-  where
-    convertBytes :: String -> CMap -> ParserWithFont ByteString
-    convertBytes [] _ = return ""
-    convertBytes (c:cs) someCMap = do
-      convertBytesAux (fromEnum c) 1 cs someCMap
-    convertBytesAux :: Int -> Int -> String -> CMap -> ParserWithFont ByteString
-    convertBytesAux code size s someCMap
-      | size > 4 = fail "Could not match any input code smaller than an int"
-      | otherwise =
-        case (Map.lookup code someCMap, s) of
-          (Nothing, (c:cs)) -> convertBytesAux (code * 256 + fromEnum c) (size + 1) cs someCMap
-          (Nothing, []) -> fail "No character left to read but no code recognized"
-          (Just outputText, _) -> mappend outputText <$> convertBytes s someCMap
-}