Clean ByteString types to identify when a ByteString contains the representation of an integer in a given base and fix the last remaining PDF string (un)escaping issue

This commit is contained in:
Tissevert 2019-10-04 18:46:07 +02:00
parent a96e36ec5a
commit 3a3e1533b4
6 changed files with 119 additions and 87 deletions

View file

@ -1,11 +1,12 @@
{-# LANGUAGE NamedFieldPuns #-}
module Main where module Main where
import Control.Monad.Reader import Control.Monad.Reader
import qualified Data.ByteString as BS import qualified Data.ByteString as BS
import qualified Data.ByteString.Char8 as C8 import qualified Data.ByteString.Char8 as C8
import qualified Data.Map as Map import qualified Data.Map as Map
import PDF.Object (Name(..), array) import PDF.Object (Name(..), StringObject(..), array)
import PDF.CMap (CMappers, cMap, emptyCMap) import PDF.CMap (CMappers, CRange(..), cMap, emptyCMap)
import PDF.Parser (evalParser) import PDF.Parser (evalParser)
import PDF.Text import PDF.Text
@ -15,12 +16,20 @@ test fonts parser =
main :: IO () main :: IO ()
main = do main = do
--input <- BS.readFile "20.stream" input <- BS.readFile "20.stream"
input <- BS.readFile "array.bin" --input <- BS.readFile "array.bin"
--let input = C8.pack "\f\xb5\0I"
Right font <- cMap <$> BS.readFile "6300.stream" Right font <- cMap <$> BS.readFile "6300.stream"
--mapM_ (\(k, v) -> putStr (show k) >> putStr " -> " >> BS.putStrLn v) $ Map.toList font --mapM_ (\(size, m) -> putStrLn ("taille " ++ show size) >> mapM_ showCRange m) $ Map.toList font
--case pageContents (Map.singleton (Name "R9") font) input of --case pageContents (Map.singleton (Name "R9") font) input of
--case test (Map.singleton (Name "R9") font) array input of --case test (Map.singleton (Name "R9") font) array input of
case test (Map.singleton (Name "R9") font) (a textOperator) input of --case test (Map.singleton (Name "R9") font) (a textOperator) input of
case test (Map.singleton (Name "R9") font) page input of
Left e -> putStrLn e Left e -> putStrLn e
Right l -> putStrLn . show $ l Right l -> putStrLn . show $ l
where
showMapping = mapM_ (\(k, v) -> putStr (show k) >> putStr " -> " >> C8.putStrLn v) . Map.toList
showCRange :: CRange -> IO ()
showCRange (CRange {fromSequence, toSequence, mapping}) = do
putStrLn $ "from " ++ C8.unpack fromSequence ++ " to " ++ C8.unpack toSequence
showMapping mapping

View file

@ -1,19 +1,30 @@
module Data.ByteString.Char8.Util ( module Data.ByteString.Char8.Util (
decodeHex B16Int(..)
, fromInt , B256Int(..)
, hexString , b8ToInt
, b16ToBytes
, b16ToInt
, b256ToInt
, intToB256
, previous , previous
, subBS , subBS
, toInt
, toBytes , toBytes
, unescape
, utf16BEToutf8 , utf16BEToutf8
) where ) where
import Data.ByteString (ByteString, snoc) import Data.ByteString (ByteString, snoc)
import qualified Data.ByteString as BS (empty, foldl, pack, singleton) import qualified Data.ByteString as BS (empty, foldl, length, pack, singleton, splitAt)
import qualified Data.ByteString.Char8 as Char8 (drop, index, take, unpack) import qualified Data.ByteString.Char8 as Char8 (
cons, drop, index, splitAt, take, uncons, unpack
)
import Data.Text.Encoding (encodeUtf8, decodeUtf16BE) import Data.Text.Encoding (encodeUtf8, decodeUtf16BE)
import Prelude hiding (length) import Prelude hiding (length)
import Text.Printf (printf)
newtype B8Int = B8Int ByteString deriving Show
newtype B16Int = B16Int ByteString deriving Show
newtype B256Int = B256Int ByteString deriving Show
previous :: Char -> Int -> ByteString -> Int previous :: Char -> Int -> ByteString -> Int
previous char position byteString previous char position byteString
@ -23,27 +34,57 @@ previous char position byteString
subBS :: Int -> Int -> ByteString -> ByteString subBS :: Int -> Int -> ByteString -> ByteString
subBS offset length = Char8.take length . Char8.drop offset subBS offset length = Char8.take length . Char8.drop offset
hexString :: (Num a, Read a) => String -> a intToB256 :: Int -> B256Int
hexString s = read $ "0x" ++ s intToB256 n
| n < 0x100 = B256Int . BS.singleton $ toEnum n
| otherwise =
let B256Int begining = intToB256 (n `div` 0x100) in
B256Int $ begining `snoc` (toEnum (n `mod` 0x100))
fromInt :: Int -> ByteString b256ToInt :: B256Int -> Int
fromInt n b256ToInt (B256Int n) = BS.foldl (\k w -> 0x100*k + fromEnum w) 0 n
| n < 0x100 = BS.singleton $ toEnum n
| otherwise = fromInt (n `div` 0x100) `snoc` (toEnum (n `mod` 0x100))
toBytes :: Int -> Int -> ByteString toBytes :: Int -> Int -> ByteString
toBytes 0 _ = BS.empty toBytes 0 _ = BS.empty
toBytes size n = toBytes (size - 1) (n `div` 0x100) `snoc` (toEnum (n `mod` 0x100)) toBytes size n =
(toBytes (size - 1) (n `div` 0x100)) `snoc` (toEnum (n `mod` 0x100))
toInt :: ByteString -> Int b16ToBytes :: B16Int -> ByteString
toInt = BS.foldl (\n w -> 0x100*n + fromEnum w) 0 b16ToBytes (B16Int n) = BS.pack . fmap b16ToInt $ pairDigits n
decodeHex :: ByteString -> ByteString
decodeHex = BS.pack . fmap hexString . pairDigits . Char8.unpack
where where
pairDigits "" = [] pairDigits s =
pairDigits [c] = [[c]] case BS.length s of
pairDigits (firstChar:secondChar:end) = (firstChar:[secondChar]):pairDigits end 0 -> []
1 -> [B16Int s]
_ ->
let (twoHexDigits, rest) = BS.splitAt 2 s in
(B16Int $ twoHexDigits):(pairDigits rest)
fromBase :: (Num a, Read a) => Char -> ByteString -> a
fromBase b = read . printf "0%c%s" b . Char8.unpack
b16ToInt :: (Num a, Read a) => B16Int -> a
b16ToInt (B16Int n) = fromBase 'x' n
b8ToInt :: (Num a, Read a) => B8Int -> a
b8ToInt (B8Int n) = fromBase 'o' n
unescape :: ByteString -> ByteString
unescape escapedBS =
case Char8.uncons escapedBS of
Nothing -> BS.empty
Just ('\\', s) -> unescapeChar s
Just (c, s) -> Char8.cons c (unescape s)
where
unescapeChar s =
case Char8.uncons s of
Nothing -> BS.empty
Just (c, s')
| c `elem` "()" -> Char8.cons c (unescape s')
| c `elem` "nrtbf" -> Char8.cons (read (printf "'\\%c'" c)) (unescape s')
| c `elem` ['0'..'7'] -> fromOctal (Char8.splitAt 3 s)
| otherwise -> Char8.cons c (unescape s')
fromOctal (code, s) = Char8.cons (toEnum $ b8ToInt (B8Int code)) (unescape s)
utf16BEToutf8 :: ByteString -> ByteString utf16BEToutf8 :: ByteString -> ByteString
utf16BEToutf8 = encodeUtf8 . decodeUtf16BE utf16BEToutf8 = encodeUtf8 . decodeUtf16BE

View file

@ -14,7 +14,7 @@ import Data.Attoparsec.ByteString.Char8 (count)
import Data.ByteString (ByteString) import Data.ByteString (ByteString)
import qualified Data.ByteString as BS (length) import qualified Data.ByteString as BS (length)
import Data.ByteString.Char8.Util ( import Data.ByteString.Char8.Util (
decodeHex, toBytes, toInt, utf16BEToutf8 B16Int(..), b16ToBytes, b16ToInt, toBytes, utf16BEToutf8
) )
import Data.Map (Map, union) import Data.Map (Map, union)
import qualified Data.Map as Map (adjust, empty, fromList, insertWith) import qualified Data.Map as Map (adjust, empty, fromList, insertWith)
@ -63,9 +63,9 @@ createMapping :: (StringObject, StringObject) -> Parser CMap ()
createMapping (Hexadecimal from, Hexadecimal to) = modify $ createMapping (Hexadecimal from, Hexadecimal to) = modify $
Map.insertWith (++) size [CRange {fromSequence, toSequence, mapping}] Map.insertWith (++) size [CRange {fromSequence, toSequence, mapping}]
where where
fromSequence = decodeHex from fromSequence = b16ToBytes from
size = BS.length fromSequence size = BS.length fromSequence
toSequence = decodeHex to toSequence = b16ToBytes to
mapping = Map.empty mapping = Map.empty
createMapping _ = return () createMapping _ = return ()
@ -102,33 +102,30 @@ cMapChar = do
(,) <$> stringObject <* blank <*> stringObject <* EOL.parser (,) <$> stringObject <* blank <*> stringObject <* EOL.parser
>>= pairMapping >>= pairMapping
between :: ByteString -> ByteString -> [ByteString] between :: B16Int -> B16Int -> [ByteString]
between from to = between from@(B16Int s) to =
let size = BS.length from in let size = BS.length s `div` 2 in
toBytes size <$> [toInt from .. toInt to] toBytes size <$> [b16ToInt from .. b16ToInt to]
startFrom :: ByteString -> [ByteString] startFrom :: B16Int -> [ByteString]
startFrom from = startFrom from@(B16Int s) =
let size = BS.length from in let size = BS.length s `div` 2 in
toBytes size <$> [toInt from .. ] toBytes size <$> [b16ToInt from .. ]
mapFromTo :: MonadParser m => (StringObject, StringObject, DirectObject) -> m [(ByteString, ByteString)] mapFromTo :: MonadParser m => (StringObject, StringObject, DirectObject) -> m [(ByteString, ByteString)]
mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) = mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) =
return $ zip (between fromBS toBS) (utf16BEToutf8 <$> startFrom dstBS) return $ zip (between from to) (utf16BEToutf8 <$> startFrom dstFrom)
where
(fromBS, toBS, dstBS) = (decodeHex from, decodeHex to, decodeHex dstFrom)
mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) = mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) =
zip (between fromBS toBS) <$> (mapM dstByteString dstPoints) zip (between from to) <$> (mapM dstByteString dstPoints)
where where
(fromBS, toBS) = (decodeHex from, decodeHex to)
dstByteString (StringObject (Hexadecimal dst)) = dstByteString (StringObject (Hexadecimal dst)) =
return . utf16BEToutf8 $ decodeHex dst return . utf16BEToutf8 $ b16ToBytes dst
dstByteString _ = fail "Invalid for a replacement string" dstByteString _ = fail "Invalid for a replacement string"
mapFromTo _ = fail "invalid range mapping found" mapFromTo _ = fail "invalid range mapping found"
pairMapping :: MonadParser m => (StringObject, StringObject) -> m (ByteString, ByteString) pairMapping :: MonadParser m => (StringObject, StringObject) -> m (ByteString, ByteString)
pairMapping (Hexadecimal from, Hexadecimal to) = pairMapping (Hexadecimal from, Hexadecimal to) =
return (decodeHex from, utf16BEToutf8 $ decodeHex to) return (b16ToBytes from, utf16BEToutf8 $ b16ToBytes to)
pairMapping _ = fail "invalid pair mapping found" pairMapping _ = fail "invalid pair mapping found"

View file

@ -30,6 +30,7 @@ module PDF.Object (
, regular , regular
, stringObject , stringObject
, structure , structure
, toByteString
) where ) where
import Control.Applicative ((<|>), many) import Control.Applicative ((<|>), many)
@ -37,6 +38,7 @@ import Data.Attoparsec.ByteString.Char8 (choice, count, option, sepBy)
import Data.ByteString (ByteString) import Data.ByteString (ByteString)
import qualified Data.ByteString as BS (concat) import qualified Data.ByteString as BS (concat)
import qualified Data.ByteString.Char8 as Char8 (cons, pack, singleton, unpack) import qualified Data.ByteString.Char8 as Char8 (cons, pack, singleton, unpack)
import Data.ByteString.Char8.Util (B16Int(..), b16ToBytes, unescape)
import Data.Map (Map, (!), mapWithKey) import Data.Map (Map, (!), mapWithKey)
import qualified Data.Map as Map ( import qualified Data.Map as Map (
delete, empty, fromList, lookup, minViewWithKey, toList, union delete, empty, fromList, lookup, minViewWithKey, toList, union
@ -111,11 +113,11 @@ number = Number . read . Char8.unpack <$>
-- --
-- StringObject -- StringObject
-- --
data StringObject = Literal ByteString | Hexadecimal ByteString deriving Show data StringObject = Literal ByteString | Hexadecimal B16Int deriving Show
instance Output StringObject where instance Output StringObject where
output (Literal s) = Output.string (printf "(%s)" (Char8.unpack s)) output (Literal s) = Output.string (printf "(%s)" (Char8.unpack s))
output (Hexadecimal s) = Output.string (printf "<%s>" (Char8.unpack s)) output (Hexadecimal (B16Int n)) = Output.string (printf "<%s>" (Char8.unpack n))
stringObject :: MonadParser m => m StringObject stringObject :: MonadParser m => m StringObject
stringObject = stringObject =
@ -129,9 +131,13 @@ stringObject =
matchingParenthesis = matchingParenthesis =
mappend <$> (Char8.cons <$> char '(' <*> literalStringBlock) <*> string ")" mappend <$> (Char8.cons <$> char '(' <*> literalStringBlock) <*> string ")"
escapedChar = escapedChar =
Char8.cons <$> char '\\' <*> (Char8.singleton <$> oneOf "nrtbf()\\" <|> octalCode) Char8.cons <$> char '\\' <*> (Char8.singleton <$> oneOf "nrtbf()\\\n" <|> octalCode)
octalCode = choice $ (\n -> Char8.pack <$> count n octDigit) <$> [1..3] octalCode = choice $ (\n -> Char8.pack <$> count n octDigit) <$> [1..3]
toByteString :: StringObject -> ByteString
toByteString (Hexadecimal h) = b16ToBytes h
toByteString (Literal s) = unescape s
-- --
-- Name -- Name
-- --

View file

@ -28,6 +28,7 @@ import qualified Data.Attoparsec.ByteString.Char8 as Atto (
Parser, char, parseOnly, satisfy, string, take, takeWhile, takeWhile1 Parser, char, parseOnly, satisfy, string, take, takeWhile, takeWhile1
) )
import Data.ByteString (ByteString) import Data.ByteString (ByteString)
import Data.ByteString.Char8.Util (B16Int(..))
import Data.Char (toLower) import Data.Char (toLower)
import Data.Set (Set) import Data.Set (Set)
import qualified Data.Set as Set (fromList, member, unions) import qualified Data.Set as Set (fromList, member, unions)
@ -39,7 +40,7 @@ class MonadDeps m => MonadParser m where
block :: Int -> m ByteString block :: Int -> m ByteString
char :: Char -> m Char char :: Char -> m Char
decNumber :: m ByteString decNumber :: m ByteString
hexNumber :: m ByteString hexNumber :: m B16Int
oneOf :: String -> m Char oneOf :: String -> m Char
string :: ByteString -> m ByteString string :: ByteString -> m ByteString
takeAll :: (Char -> Bool) -> m ByteString takeAll :: (Char -> Bool) -> m ByteString
@ -49,7 +50,7 @@ instance MonadParser Atto.Parser where
block = Atto.take block = Atto.take
char = Atto.char char = Atto.char
decNumber = Atto.takeWhile1 (`Set.member` digits) decNumber = Atto.takeWhile1 (`Set.member` digits)
hexNumber = Atto.takeWhile1 (`Set.member` hexDigits) hexNumber = B16Int <$> Atto.takeWhile1 (`Set.member` hexDigits)
oneOf charSet = Atto.satisfy (`elem` charSet) oneOf charSet = Atto.satisfy (`elem` charSet)
string s = Atto.string s <?> show s string s = Atto.string s <?> show s
takeAll = Atto.takeWhile takeAll = Atto.takeWhile

View file

@ -12,14 +12,13 @@ import Control.Monad.State (get, put)
import Data.Attoparsec.ByteString.Char8 (choice, count, sepBy) import Data.Attoparsec.ByteString.Char8 (choice, count, sepBy)
import Data.ByteString (ByteString) import Data.ByteString (ByteString)
import qualified Data.ByteString as BS (drop, null, take) import qualified Data.ByteString as BS (drop, null, take)
import Data.ByteString.Char8.Util (decodeHex) import Data.ByteString.Char8 (unpack)
import Data.List (find)
import Data.Map ((!)) import Data.Map ((!))
import qualified Data.Map as Map (lookup, toList) import qualified Data.Map as Map (lookup, toList)
import PDF.CMap (CMappers, CMap, CRange(..), emptyCMap) import PDF.CMap (CMappers, CMap, CRange(..), emptyCMap)
import PDF.Object ( import PDF.Object (
DirectObject(..), StringObject(..) DirectObject(..), StringObject(..)
, array, blank, name, parseBytes, regular, stringObject , array, blank, name, regular, stringObject, toByteString
) )
import PDF.Parser (MonadParser, (<?>), Parser, evalParser, string, takeAll) import PDF.Parser (MonadParser, (<?>), Parser, evalParser, string, takeAll)
@ -128,44 +127,23 @@ runOperator (DQuote, [Typed (StringObject outputString)]) =
runOperator _ = return [] runOperator _ = return []
decodeString :: StringObject -> ParserWithFont ByteString decodeString :: StringObject -> ParserWithFont ByteString
decodeString (Hexadecimal h) = decodeString (Literal (decodeHex h)) decodeString = decode . toByteString
decodeString (Literal litString) = do
cRangesBySize <- Map.toList <$> get
f cRangesBySize litString
where where
f :: [(Int, [CRange])] -> ByteString -> ParserWithFont ByteString decode input
f cRangesBySize input
| BS.null input = return "" | BS.null input = return ""
| otherwise = do | otherwise = do
(output, newInput) <- g cRangesBySize input (output, remainingInput) <- trySizes input =<< Map.toList <$> get
mappend output <$> f cRangesBySize newInput mappend output <$> decode remainingInput
g :: [(Int, [CRange])] -> ByteString -> ParserWithFont (ByteString, ByteString) trySizes :: ByteString -> [(Int, [CRange])] -> ParserWithFont (ByteString, ByteString)
g [] _ = fail "No matching code found in font" trySizes s [] = fail $ "No matching code found in font for " ++ unpack s
g ((size, cRanges):others) s = trySizes s ((size, cRanges):others) =
let prefix = BS.take size s in let prefix = BS.take size s in
case h prefix cRanges of case tryRanges prefix cRanges of
Nothing -> g others s Nothing -> trySizes s others
Just outputSequence -> return (outputSequence, BS.drop size s) Just outputSequence -> return (outputSequence, BS.drop size s)
h :: ByteString -> [CRange] -> Maybe ByteString tryRanges :: ByteString -> [CRange] -> Maybe ByteString
h prefix [] = Nothing tryRanges _ [] = Nothing
h prefix ((CRange {mapping}):cRanges) = tryRanges prefix ((CRange {mapping}):cRanges) =
case Map.lookup prefix mapping of case Map.lookup prefix mapping of
Nothing -> h prefix cRanges Nothing -> tryRanges prefix cRanges
outputSequence -> outputSequence outputSequence -> outputSequence
{-
get >>= convertBytes litString
where
convertBytes :: String -> CMap -> ParserWithFont ByteString
convertBytes [] _ = return ""
convertBytes (c:cs) someCMap = do
convertBytesAux (fromEnum c) 1 cs someCMap
convertBytesAux :: Int -> Int -> String -> CMap -> ParserWithFont ByteString
convertBytesAux code size s someCMap
| size > 4 = fail "Could not match any input code smaller than an int"
| otherwise =
case (Map.lookup code someCMap, s) of
(Nothing, (c:cs)) -> convertBytesAux (code * 256 + fromEnum c) (size + 1) cs someCMap
(Nothing, []) -> fail "No character left to read but no code recognized"
(Just outputText, _) -> mappend outputText <$> convertBytes s someCMap
-}