Clean ByteString types to identify when a ByteString contains the representation of an integer in a given base and fix the last remaining PDF string (un)escaping issue

This commit is contained in:
Tissevert 2019-10-04 18:46:07 +02:00
parent a96e36ec5a
commit 3a3e1533b4
6 changed files with 119 additions and 87 deletions

View file

@ -1,11 +1,12 @@
{-# LANGUAGE NamedFieldPuns #-}
module Main where
import Control.Monad.Reader
import qualified Data.ByteString as BS
import qualified Data.ByteString.Char8 as C8
import qualified Data.Map as Map
import PDF.Object (Name(..), array)
import PDF.CMap (CMappers, cMap, emptyCMap)
import PDF.Object (Name(..), StringObject(..), array)
import PDF.CMap (CMappers, CRange(..), cMap, emptyCMap)
import PDF.Parser (evalParser)
import PDF.Text
@ -15,12 +16,20 @@ test fonts parser =
main :: IO ()
main = do
--input <- BS.readFile "20.stream"
input <- BS.readFile "array.bin"
input <- BS.readFile "20.stream"
--input <- BS.readFile "array.bin"
--let input = C8.pack "\f\xb5\0I"
Right font <- cMap <$> BS.readFile "6300.stream"
--mapM_ (\(k, v) -> putStr (show k) >> putStr " -> " >> BS.putStrLn v) $ Map.toList font
--mapM_ (\(size, m) -> putStrLn ("taille " ++ show size) >> mapM_ showCRange m) $ Map.toList font
--case pageContents (Map.singleton (Name "R9") font) input of
--case test (Map.singleton (Name "R9") font) array input of
case test (Map.singleton (Name "R9") font) (a textOperator) input of
--case test (Map.singleton (Name "R9") font) (a textOperator) input of
case test (Map.singleton (Name "R9") font) page input of
Left e -> putStrLn e
Right l -> putStrLn . show $ l
where
showMapping = mapM_ (\(k, v) -> putStr (show k) >> putStr " -> " >> C8.putStrLn v) . Map.toList
showCRange :: CRange -> IO ()
showCRange (CRange {fromSequence, toSequence, mapping}) = do
putStrLn $ "from " ++ C8.unpack fromSequence ++ " to " ++ C8.unpack toSequence
showMapping mapping

View file

@ -1,19 +1,30 @@
module Data.ByteString.Char8.Util (
decodeHex
, fromInt
, hexString
B16Int(..)
, B256Int(..)
, b8ToInt
, b16ToBytes
, b16ToInt
, b256ToInt
, intToB256
, previous
, subBS
, toInt
, toBytes
, unescape
, utf16BEToutf8
) where
import Data.ByteString (ByteString, snoc)
import qualified Data.ByteString as BS (empty, foldl, pack, singleton)
import qualified Data.ByteString.Char8 as Char8 (drop, index, take, unpack)
import qualified Data.ByteString as BS (empty, foldl, length, pack, singleton, splitAt)
import qualified Data.ByteString.Char8 as Char8 (
cons, drop, index, splitAt, take, uncons, unpack
)
import Data.Text.Encoding (encodeUtf8, decodeUtf16BE)
import Prelude hiding (length)
import Text.Printf (printf)
newtype B8Int = B8Int ByteString deriving Show
newtype B16Int = B16Int ByteString deriving Show
newtype B256Int = B256Int ByteString deriving Show
previous :: Char -> Int -> ByteString -> Int
previous char position byteString
@ -23,27 +34,57 @@ previous char position byteString
subBS :: Int -> Int -> ByteString -> ByteString
subBS offset length = Char8.take length . Char8.drop offset
hexString :: (Num a, Read a) => String -> a
hexString s = read $ "0x" ++ s
intToB256 :: Int -> B256Int
intToB256 n
| n < 0x100 = B256Int . BS.singleton $ toEnum n
| otherwise =
let B256Int begining = intToB256 (n `div` 0x100) in
B256Int $ begining `snoc` (toEnum (n `mod` 0x100))
fromInt :: Int -> ByteString
fromInt n
| n < 0x100 = BS.singleton $ toEnum n
| otherwise = fromInt (n `div` 0x100) `snoc` (toEnum (n `mod` 0x100))
b256ToInt :: B256Int -> Int
b256ToInt (B256Int n) = BS.foldl (\k w -> 0x100*k + fromEnum w) 0 n
toBytes :: Int -> Int -> ByteString
toBytes 0 _ = BS.empty
toBytes size n = toBytes (size - 1) (n `div` 0x100) `snoc` (toEnum (n `mod` 0x100))
toBytes size n =
(toBytes (size - 1) (n `div` 0x100)) `snoc` (toEnum (n `mod` 0x100))
toInt :: ByteString -> Int
toInt = BS.foldl (\n w -> 0x100*n + fromEnum w) 0
decodeHex :: ByteString -> ByteString
decodeHex = BS.pack . fmap hexString . pairDigits . Char8.unpack
b16ToBytes :: B16Int -> ByteString
b16ToBytes (B16Int n) = BS.pack . fmap b16ToInt $ pairDigits n
where
pairDigits "" = []
pairDigits [c] = [[c]]
pairDigits (firstChar:secondChar:end) = (firstChar:[secondChar]):pairDigits end
pairDigits s =
case BS.length s of
0 -> []
1 -> [B16Int s]
_ ->
let (twoHexDigits, rest) = BS.splitAt 2 s in
(B16Int $ twoHexDigits):(pairDigits rest)
fromBase :: (Num a, Read a) => Char -> ByteString -> a
fromBase b = read . printf "0%c%s" b . Char8.unpack
b16ToInt :: (Num a, Read a) => B16Int -> a
b16ToInt (B16Int n) = fromBase 'x' n
b8ToInt :: (Num a, Read a) => B8Int -> a
b8ToInt (B8Int n) = fromBase 'o' n
unescape :: ByteString -> ByteString
unescape escapedBS =
case Char8.uncons escapedBS of
Nothing -> BS.empty
Just ('\\', s) -> unescapeChar s
Just (c, s) -> Char8.cons c (unescape s)
where
unescapeChar s =
case Char8.uncons s of
Nothing -> BS.empty
Just (c, s')
| c `elem` "()" -> Char8.cons c (unescape s')
| c `elem` "nrtbf" -> Char8.cons (read (printf "'\\%c'" c)) (unescape s')
| c `elem` ['0'..'7'] -> fromOctal (Char8.splitAt 3 s)
| otherwise -> Char8.cons c (unescape s')
fromOctal (code, s) = Char8.cons (toEnum $ b8ToInt (B8Int code)) (unescape s)
utf16BEToutf8 :: ByteString -> ByteString
utf16BEToutf8 = encodeUtf8 . decodeUtf16BE

View file

@ -14,7 +14,7 @@ import Data.Attoparsec.ByteString.Char8 (count)
import Data.ByteString (ByteString)
import qualified Data.ByteString as BS (length)
import Data.ByteString.Char8.Util (
decodeHex, toBytes, toInt, utf16BEToutf8
B16Int(..), b16ToBytes, b16ToInt, toBytes, utf16BEToutf8
)
import Data.Map (Map, union)
import qualified Data.Map as Map (adjust, empty, fromList, insertWith)
@ -63,9 +63,9 @@ createMapping :: (StringObject, StringObject) -> Parser CMap ()
createMapping (Hexadecimal from, Hexadecimal to) = modify $
Map.insertWith (++) size [CRange {fromSequence, toSequence, mapping}]
where
fromSequence = decodeHex from
fromSequence = b16ToBytes from
size = BS.length fromSequence
toSequence = decodeHex to
toSequence = b16ToBytes to
mapping = Map.empty
createMapping _ = return ()
@ -102,33 +102,30 @@ cMapChar = do
(,) <$> stringObject <* blank <*> stringObject <* EOL.parser
>>= pairMapping
between :: ByteString -> ByteString -> [ByteString]
between from to =
let size = BS.length from in
toBytes size <$> [toInt from .. toInt to]
between :: B16Int -> B16Int -> [ByteString]
between from@(B16Int s) to =
let size = BS.length s `div` 2 in
toBytes size <$> [b16ToInt from .. b16ToInt to]
startFrom :: ByteString -> [ByteString]
startFrom from =
let size = BS.length from in
toBytes size <$> [toInt from .. ]
startFrom :: B16Int -> [ByteString]
startFrom from@(B16Int s) =
let size = BS.length s `div` 2 in
toBytes size <$> [b16ToInt from .. ]
mapFromTo :: MonadParser m => (StringObject, StringObject, DirectObject) -> m [(ByteString, ByteString)]
mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) =
return $ zip (between fromBS toBS) (utf16BEToutf8 <$> startFrom dstBS)
where
(fromBS, toBS, dstBS) = (decodeHex from, decodeHex to, decodeHex dstFrom)
return $ zip (between from to) (utf16BEToutf8 <$> startFrom dstFrom)
mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) =
zip (between fromBS toBS) <$> (mapM dstByteString dstPoints)
zip (between from to) <$> (mapM dstByteString dstPoints)
where
(fromBS, toBS) = (decodeHex from, decodeHex to)
dstByteString (StringObject (Hexadecimal dst)) =
return . utf16BEToutf8 $ decodeHex dst
return . utf16BEToutf8 $ b16ToBytes dst
dstByteString _ = fail "Invalid for a replacement string"
mapFromTo _ = fail "invalid range mapping found"
pairMapping :: MonadParser m => (StringObject, StringObject) -> m (ByteString, ByteString)
pairMapping (Hexadecimal from, Hexadecimal to) =
return (decodeHex from, utf16BEToutf8 $ decodeHex to)
return (b16ToBytes from, utf16BEToutf8 $ b16ToBytes to)
pairMapping _ = fail "invalid pair mapping found"

View file

@ -30,6 +30,7 @@ module PDF.Object (
, regular
, stringObject
, structure
, toByteString
) where
import Control.Applicative ((<|>), many)
@ -37,6 +38,7 @@ import Data.Attoparsec.ByteString.Char8 (choice, count, option, sepBy)
import Data.ByteString (ByteString)
import qualified Data.ByteString as BS (concat)
import qualified Data.ByteString.Char8 as Char8 (cons, pack, singleton, unpack)
import Data.ByteString.Char8.Util (B16Int(..), b16ToBytes, unescape)
import Data.Map (Map, (!), mapWithKey)
import qualified Data.Map as Map (
delete, empty, fromList, lookup, minViewWithKey, toList, union
@ -111,11 +113,11 @@ number = Number . read . Char8.unpack <$>
--
-- StringObject
--
data StringObject = Literal ByteString | Hexadecimal ByteString deriving Show
data StringObject = Literal ByteString | Hexadecimal B16Int deriving Show
instance Output StringObject where
output (Literal s) = Output.string (printf "(%s)" (Char8.unpack s))
output (Hexadecimal s) = Output.string (printf "<%s>" (Char8.unpack s))
output (Hexadecimal (B16Int n)) = Output.string (printf "<%s>" (Char8.unpack n))
stringObject :: MonadParser m => m StringObject
stringObject =
@ -129,9 +131,13 @@ stringObject =
matchingParenthesis =
mappend <$> (Char8.cons <$> char '(' <*> literalStringBlock) <*> string ")"
escapedChar =
Char8.cons <$> char '\\' <*> (Char8.singleton <$> oneOf "nrtbf()\\" <|> octalCode)
Char8.cons <$> char '\\' <*> (Char8.singleton <$> oneOf "nrtbf()\\\n" <|> octalCode)
octalCode = choice $ (\n -> Char8.pack <$> count n octDigit) <$> [1..3]
toByteString :: StringObject -> ByteString
toByteString (Hexadecimal h) = b16ToBytes h
toByteString (Literal s) = unescape s
--
-- Name
--

View file

@ -28,6 +28,7 @@ import qualified Data.Attoparsec.ByteString.Char8 as Atto (
Parser, char, parseOnly, satisfy, string, take, takeWhile, takeWhile1
)
import Data.ByteString (ByteString)
import Data.ByteString.Char8.Util (B16Int(..))
import Data.Char (toLower)
import Data.Set (Set)
import qualified Data.Set as Set (fromList, member, unions)
@ -39,7 +40,7 @@ class MonadDeps m => MonadParser m where
block :: Int -> m ByteString
char :: Char -> m Char
decNumber :: m ByteString
hexNumber :: m ByteString
hexNumber :: m B16Int
oneOf :: String -> m Char
string :: ByteString -> m ByteString
takeAll :: (Char -> Bool) -> m ByteString
@ -49,7 +50,7 @@ instance MonadParser Atto.Parser where
block = Atto.take
char = Atto.char
decNumber = Atto.takeWhile1 (`Set.member` digits)
hexNumber = Atto.takeWhile1 (`Set.member` hexDigits)
hexNumber = B16Int <$> Atto.takeWhile1 (`Set.member` hexDigits)
oneOf charSet = Atto.satisfy (`elem` charSet)
string s = Atto.string s <?> show s
takeAll = Atto.takeWhile

View file

@ -12,14 +12,13 @@ import Control.Monad.State (get, put)
import Data.Attoparsec.ByteString.Char8 (choice, count, sepBy)
import Data.ByteString (ByteString)
import qualified Data.ByteString as BS (drop, null, take)
import Data.ByteString.Char8.Util (decodeHex)
import Data.List (find)
import Data.ByteString.Char8 (unpack)
import Data.Map ((!))
import qualified Data.Map as Map (lookup, toList)
import PDF.CMap (CMappers, CMap, CRange(..), emptyCMap)
import PDF.Object (
DirectObject(..), StringObject(..)
, array, blank, name, parseBytes, regular, stringObject
, array, blank, name, regular, stringObject, toByteString
)
import PDF.Parser (MonadParser, (<?>), Parser, evalParser, string, takeAll)
@ -128,44 +127,23 @@ runOperator (DQuote, [Typed (StringObject outputString)]) =
runOperator _ = return []
decodeString :: StringObject -> ParserWithFont ByteString
decodeString (Hexadecimal h) = decodeString (Literal (decodeHex h))
decodeString (Literal litString) = do
cRangesBySize <- Map.toList <$> get
f cRangesBySize litString
decodeString = decode . toByteString
where
f :: [(Int, [CRange])] -> ByteString -> ParserWithFont ByteString
f cRangesBySize input
decode input
| BS.null input = return ""
| otherwise = do
(output, newInput) <- g cRangesBySize input
mappend output <$> f cRangesBySize newInput
g :: [(Int, [CRange])] -> ByteString -> ParserWithFont (ByteString, ByteString)
g [] _ = fail "No matching code found in font"
g ((size, cRanges):others) s =
(output, remainingInput) <- trySizes input =<< Map.toList <$> get
mappend output <$> decode remainingInput
trySizes :: ByteString -> [(Int, [CRange])] -> ParserWithFont (ByteString, ByteString)
trySizes s [] = fail $ "No matching code found in font for " ++ unpack s
trySizes s ((size, cRanges):others) =
let prefix = BS.take size s in
case h prefix cRanges of
Nothing -> g others s
case tryRanges prefix cRanges of
Nothing -> trySizes s others
Just outputSequence -> return (outputSequence, BS.drop size s)
h :: ByteString -> [CRange] -> Maybe ByteString
h prefix [] = Nothing
h prefix ((CRange {mapping}):cRanges) =
tryRanges :: ByteString -> [CRange] -> Maybe ByteString
tryRanges _ [] = Nothing
tryRanges prefix ((CRange {mapping}):cRanges) =
case Map.lookup prefix mapping of
Nothing -> h prefix cRanges
Nothing -> tryRanges prefix cRanges
outputSequence -> outputSequence
{-
get >>= convertBytes litString
where
convertBytes :: String -> CMap -> ParserWithFont ByteString
convertBytes [] _ = return ""
convertBytes (c:cs) someCMap = do
convertBytesAux (fromEnum c) 1 cs someCMap
convertBytesAux :: Int -> Int -> String -> CMap -> ParserWithFont ByteString
convertBytesAux code size s someCMap
| size > 4 = fail "Could not match any input code smaller than an int"
| otherwise =
case (Map.lookup code someCMap, s) of
(Nothing, (c:cs)) -> convertBytesAux (code * 256 + fromEnum c) (size + 1) cs someCMap
(Nothing, []) -> fail "No character left to read but no code recognized"
(Just outputText, _) -> mappend outputText <$> convertBytes s someCMap
-}