Clean ByteString types to identify when a ByteString contains the representation of an integer in a given base and fix the last remaining PDF string (un)escaping issue
This commit is contained in:
parent
a96e36ec5a
commit
3a3e1533b4
6 changed files with 119 additions and 87 deletions
|
@ -1,11 +1,12 @@
|
|||
{-# LANGUAGE NamedFieldPuns #-}
|
||||
module Main where
|
||||
|
||||
import Control.Monad.Reader
|
||||
import qualified Data.ByteString as BS
|
||||
import qualified Data.ByteString.Char8 as C8
|
||||
import qualified Data.Map as Map
|
||||
import PDF.Object (Name(..), array)
|
||||
import PDF.CMap (CMappers, cMap, emptyCMap)
|
||||
import PDF.Object (Name(..), StringObject(..), array)
|
||||
import PDF.CMap (CMappers, CRange(..), cMap, emptyCMap)
|
||||
import PDF.Parser (evalParser)
|
||||
import PDF.Text
|
||||
|
||||
|
@ -15,12 +16,20 @@ test fonts parser =
|
|||
|
||||
main :: IO ()
|
||||
main = do
|
||||
--input <- BS.readFile "20.stream"
|
||||
input <- BS.readFile "array.bin"
|
||||
input <- BS.readFile "20.stream"
|
||||
--input <- BS.readFile "array.bin"
|
||||
--let input = C8.pack "\f\xb5\0I"
|
||||
Right font <- cMap <$> BS.readFile "6300.stream"
|
||||
--mapM_ (\(k, v) -> putStr (show k) >> putStr " -> " >> BS.putStrLn v) $ Map.toList font
|
||||
--mapM_ (\(size, m) -> putStrLn ("taille " ++ show size) >> mapM_ showCRange m) $ Map.toList font
|
||||
--case pageContents (Map.singleton (Name "R9") font) input of
|
||||
--case test (Map.singleton (Name "R9") font) array input of
|
||||
case test (Map.singleton (Name "R9") font) (a textOperator) input of
|
||||
--case test (Map.singleton (Name "R9") font) (a textOperator) input of
|
||||
case test (Map.singleton (Name "R9") font) page input of
|
||||
Left e -> putStrLn e
|
||||
Right l -> putStrLn . show $ l
|
||||
where
|
||||
showMapping = mapM_ (\(k, v) -> putStr (show k) >> putStr " -> " >> C8.putStrLn v) . Map.toList
|
||||
showCRange :: CRange -> IO ()
|
||||
showCRange (CRange {fromSequence, toSequence, mapping}) = do
|
||||
putStrLn $ "from " ++ C8.unpack fromSequence ++ " to " ++ C8.unpack toSequence
|
||||
showMapping mapping
|
||||
|
|
|
@ -1,19 +1,30 @@
|
|||
module Data.ByteString.Char8.Util (
|
||||
decodeHex
|
||||
, fromInt
|
||||
, hexString
|
||||
B16Int(..)
|
||||
, B256Int(..)
|
||||
, b8ToInt
|
||||
, b16ToBytes
|
||||
, b16ToInt
|
||||
, b256ToInt
|
||||
, intToB256
|
||||
, previous
|
||||
, subBS
|
||||
, toInt
|
||||
, toBytes
|
||||
, unescape
|
||||
, utf16BEToutf8
|
||||
) where
|
||||
|
||||
import Data.ByteString (ByteString, snoc)
|
||||
import qualified Data.ByteString as BS (empty, foldl, pack, singleton)
|
||||
import qualified Data.ByteString.Char8 as Char8 (drop, index, take, unpack)
|
||||
import qualified Data.ByteString as BS (empty, foldl, length, pack, singleton, splitAt)
|
||||
import qualified Data.ByteString.Char8 as Char8 (
|
||||
cons, drop, index, splitAt, take, uncons, unpack
|
||||
)
|
||||
import Data.Text.Encoding (encodeUtf8, decodeUtf16BE)
|
||||
import Prelude hiding (length)
|
||||
import Text.Printf (printf)
|
||||
|
||||
newtype B8Int = B8Int ByteString deriving Show
|
||||
newtype B16Int = B16Int ByteString deriving Show
|
||||
newtype B256Int = B256Int ByteString deriving Show
|
||||
|
||||
previous :: Char -> Int -> ByteString -> Int
|
||||
previous char position byteString
|
||||
|
@ -23,27 +34,57 @@ previous char position byteString
|
|||
subBS :: Int -> Int -> ByteString -> ByteString
|
||||
subBS offset length = Char8.take length . Char8.drop offset
|
||||
|
||||
hexString :: (Num a, Read a) => String -> a
|
||||
hexString s = read $ "0x" ++ s
|
||||
intToB256 :: Int -> B256Int
|
||||
intToB256 n
|
||||
| n < 0x100 = B256Int . BS.singleton $ toEnum n
|
||||
| otherwise =
|
||||
let B256Int begining = intToB256 (n `div` 0x100) in
|
||||
B256Int $ begining `snoc` (toEnum (n `mod` 0x100))
|
||||
|
||||
fromInt :: Int -> ByteString
|
||||
fromInt n
|
||||
| n < 0x100 = BS.singleton $ toEnum n
|
||||
| otherwise = fromInt (n `div` 0x100) `snoc` (toEnum (n `mod` 0x100))
|
||||
b256ToInt :: B256Int -> Int
|
||||
b256ToInt (B256Int n) = BS.foldl (\k w -> 0x100*k + fromEnum w) 0 n
|
||||
|
||||
toBytes :: Int -> Int -> ByteString
|
||||
toBytes 0 _ = BS.empty
|
||||
toBytes size n = toBytes (size - 1) (n `div` 0x100) `snoc` (toEnum (n `mod` 0x100))
|
||||
toBytes size n =
|
||||
(toBytes (size - 1) (n `div` 0x100)) `snoc` (toEnum (n `mod` 0x100))
|
||||
|
||||
toInt :: ByteString -> Int
|
||||
toInt = BS.foldl (\n w -> 0x100*n + fromEnum w) 0
|
||||
|
||||
decodeHex :: ByteString -> ByteString
|
||||
decodeHex = BS.pack . fmap hexString . pairDigits . Char8.unpack
|
||||
b16ToBytes :: B16Int -> ByteString
|
||||
b16ToBytes (B16Int n) = BS.pack . fmap b16ToInt $ pairDigits n
|
||||
where
|
||||
pairDigits "" = []
|
||||
pairDigits [c] = [[c]]
|
||||
pairDigits (firstChar:secondChar:end) = (firstChar:[secondChar]):pairDigits end
|
||||
pairDigits s =
|
||||
case BS.length s of
|
||||
0 -> []
|
||||
1 -> [B16Int s]
|
||||
_ ->
|
||||
let (twoHexDigits, rest) = BS.splitAt 2 s in
|
||||
(B16Int $ twoHexDigits):(pairDigits rest)
|
||||
|
||||
fromBase :: (Num a, Read a) => Char -> ByteString -> a
|
||||
fromBase b = read . printf "0%c%s" b . Char8.unpack
|
||||
|
||||
b16ToInt :: (Num a, Read a) => B16Int -> a
|
||||
b16ToInt (B16Int n) = fromBase 'x' n
|
||||
|
||||
b8ToInt :: (Num a, Read a) => B8Int -> a
|
||||
b8ToInt (B8Int n) = fromBase 'o' n
|
||||
|
||||
unescape :: ByteString -> ByteString
|
||||
unescape escapedBS =
|
||||
case Char8.uncons escapedBS of
|
||||
Nothing -> BS.empty
|
||||
Just ('\\', s) -> unescapeChar s
|
||||
Just (c, s) -> Char8.cons c (unescape s)
|
||||
where
|
||||
unescapeChar s =
|
||||
case Char8.uncons s of
|
||||
Nothing -> BS.empty
|
||||
Just (c, s')
|
||||
| c `elem` "()" -> Char8.cons c (unescape s')
|
||||
| c `elem` "nrtbf" -> Char8.cons (read (printf "'\\%c'" c)) (unescape s')
|
||||
| c `elem` ['0'..'7'] -> fromOctal (Char8.splitAt 3 s)
|
||||
| otherwise -> Char8.cons c (unescape s')
|
||||
fromOctal (code, s) = Char8.cons (toEnum $ b8ToInt (B8Int code)) (unescape s)
|
||||
|
||||
utf16BEToutf8 :: ByteString -> ByteString
|
||||
utf16BEToutf8 = encodeUtf8 . decodeUtf16BE
|
||||
|
|
|
@ -14,7 +14,7 @@ import Data.Attoparsec.ByteString.Char8 (count)
|
|||
import Data.ByteString (ByteString)
|
||||
import qualified Data.ByteString as BS (length)
|
||||
import Data.ByteString.Char8.Util (
|
||||
decodeHex, toBytes, toInt, utf16BEToutf8
|
||||
B16Int(..), b16ToBytes, b16ToInt, toBytes, utf16BEToutf8
|
||||
)
|
||||
import Data.Map (Map, union)
|
||||
import qualified Data.Map as Map (adjust, empty, fromList, insertWith)
|
||||
|
@ -63,9 +63,9 @@ createMapping :: (StringObject, StringObject) -> Parser CMap ()
|
|||
createMapping (Hexadecimal from, Hexadecimal to) = modify $
|
||||
Map.insertWith (++) size [CRange {fromSequence, toSequence, mapping}]
|
||||
where
|
||||
fromSequence = decodeHex from
|
||||
fromSequence = b16ToBytes from
|
||||
size = BS.length fromSequence
|
||||
toSequence = decodeHex to
|
||||
toSequence = b16ToBytes to
|
||||
mapping = Map.empty
|
||||
createMapping _ = return ()
|
||||
|
||||
|
@ -102,33 +102,30 @@ cMapChar = do
|
|||
(,) <$> stringObject <* blank <*> stringObject <* EOL.parser
|
||||
>>= pairMapping
|
||||
|
||||
between :: ByteString -> ByteString -> [ByteString]
|
||||
between from to =
|
||||
let size = BS.length from in
|
||||
toBytes size <$> [toInt from .. toInt to]
|
||||
between :: B16Int -> B16Int -> [ByteString]
|
||||
between from@(B16Int s) to =
|
||||
let size = BS.length s `div` 2 in
|
||||
toBytes size <$> [b16ToInt from .. b16ToInt to]
|
||||
|
||||
startFrom :: ByteString -> [ByteString]
|
||||
startFrom from =
|
||||
let size = BS.length from in
|
||||
toBytes size <$> [toInt from .. ]
|
||||
startFrom :: B16Int -> [ByteString]
|
||||
startFrom from@(B16Int s) =
|
||||
let size = BS.length s `div` 2 in
|
||||
toBytes size <$> [b16ToInt from .. ]
|
||||
|
||||
mapFromTo :: MonadParser m => (StringObject, StringObject, DirectObject) -> m [(ByteString, ByteString)]
|
||||
mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) =
|
||||
return $ zip (between fromBS toBS) (utf16BEToutf8 <$> startFrom dstBS)
|
||||
where
|
||||
(fromBS, toBS, dstBS) = (decodeHex from, decodeHex to, decodeHex dstFrom)
|
||||
return $ zip (between from to) (utf16BEToutf8 <$> startFrom dstFrom)
|
||||
|
||||
mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) =
|
||||
zip (between fromBS toBS) <$> (mapM dstByteString dstPoints)
|
||||
zip (between from to) <$> (mapM dstByteString dstPoints)
|
||||
where
|
||||
(fromBS, toBS) = (decodeHex from, decodeHex to)
|
||||
dstByteString (StringObject (Hexadecimal dst)) =
|
||||
return . utf16BEToutf8 $ decodeHex dst
|
||||
return . utf16BEToutf8 $ b16ToBytes dst
|
||||
dstByteString _ = fail "Invalid for a replacement string"
|
||||
|
||||
mapFromTo _ = fail "invalid range mapping found"
|
||||
|
||||
pairMapping :: MonadParser m => (StringObject, StringObject) -> m (ByteString, ByteString)
|
||||
pairMapping (Hexadecimal from, Hexadecimal to) =
|
||||
return (decodeHex from, utf16BEToutf8 $ decodeHex to)
|
||||
return (b16ToBytes from, utf16BEToutf8 $ b16ToBytes to)
|
||||
pairMapping _ = fail "invalid pair mapping found"
|
||||
|
|
|
@ -30,6 +30,7 @@ module PDF.Object (
|
|||
, regular
|
||||
, stringObject
|
||||
, structure
|
||||
, toByteString
|
||||
) where
|
||||
|
||||
import Control.Applicative ((<|>), many)
|
||||
|
@ -37,6 +38,7 @@ import Data.Attoparsec.ByteString.Char8 (choice, count, option, sepBy)
|
|||
import Data.ByteString (ByteString)
|
||||
import qualified Data.ByteString as BS (concat)
|
||||
import qualified Data.ByteString.Char8 as Char8 (cons, pack, singleton, unpack)
|
||||
import Data.ByteString.Char8.Util (B16Int(..), b16ToBytes, unescape)
|
||||
import Data.Map (Map, (!), mapWithKey)
|
||||
import qualified Data.Map as Map (
|
||||
delete, empty, fromList, lookup, minViewWithKey, toList, union
|
||||
|
@ -111,11 +113,11 @@ number = Number . read . Char8.unpack <$>
|
|||
--
|
||||
-- StringObject
|
||||
--
|
||||
data StringObject = Literal ByteString | Hexadecimal ByteString deriving Show
|
||||
data StringObject = Literal ByteString | Hexadecimal B16Int deriving Show
|
||||
|
||||
instance Output StringObject where
|
||||
output (Literal s) = Output.string (printf "(%s)" (Char8.unpack s))
|
||||
output (Hexadecimal s) = Output.string (printf "<%s>" (Char8.unpack s))
|
||||
output (Hexadecimal (B16Int n)) = Output.string (printf "<%s>" (Char8.unpack n))
|
||||
|
||||
stringObject :: MonadParser m => m StringObject
|
||||
stringObject =
|
||||
|
@ -129,9 +131,13 @@ stringObject =
|
|||
matchingParenthesis =
|
||||
mappend <$> (Char8.cons <$> char '(' <*> literalStringBlock) <*> string ")"
|
||||
escapedChar =
|
||||
Char8.cons <$> char '\\' <*> (Char8.singleton <$> oneOf "nrtbf()\\" <|> octalCode)
|
||||
Char8.cons <$> char '\\' <*> (Char8.singleton <$> oneOf "nrtbf()\\\n" <|> octalCode)
|
||||
octalCode = choice $ (\n -> Char8.pack <$> count n octDigit) <$> [1..3]
|
||||
|
||||
toByteString :: StringObject -> ByteString
|
||||
toByteString (Hexadecimal h) = b16ToBytes h
|
||||
toByteString (Literal s) = unescape s
|
||||
|
||||
--
|
||||
-- Name
|
||||
--
|
||||
|
|
|
@ -28,6 +28,7 @@ import qualified Data.Attoparsec.ByteString.Char8 as Atto (
|
|||
Parser, char, parseOnly, satisfy, string, take, takeWhile, takeWhile1
|
||||
)
|
||||
import Data.ByteString (ByteString)
|
||||
import Data.ByteString.Char8.Util (B16Int(..))
|
||||
import Data.Char (toLower)
|
||||
import Data.Set (Set)
|
||||
import qualified Data.Set as Set (fromList, member, unions)
|
||||
|
@ -39,7 +40,7 @@ class MonadDeps m => MonadParser m where
|
|||
block :: Int -> m ByteString
|
||||
char :: Char -> m Char
|
||||
decNumber :: m ByteString
|
||||
hexNumber :: m ByteString
|
||||
hexNumber :: m B16Int
|
||||
oneOf :: String -> m Char
|
||||
string :: ByteString -> m ByteString
|
||||
takeAll :: (Char -> Bool) -> m ByteString
|
||||
|
@ -49,7 +50,7 @@ instance MonadParser Atto.Parser where
|
|||
block = Atto.take
|
||||
char = Atto.char
|
||||
decNumber = Atto.takeWhile1 (`Set.member` digits)
|
||||
hexNumber = Atto.takeWhile1 (`Set.member` hexDigits)
|
||||
hexNumber = B16Int <$> Atto.takeWhile1 (`Set.member` hexDigits)
|
||||
oneOf charSet = Atto.satisfy (`elem` charSet)
|
||||
string s = Atto.string s <?> show s
|
||||
takeAll = Atto.takeWhile
|
||||
|
|
|
@ -12,14 +12,13 @@ import Control.Monad.State (get, put)
|
|||
import Data.Attoparsec.ByteString.Char8 (choice, count, sepBy)
|
||||
import Data.ByteString (ByteString)
|
||||
import qualified Data.ByteString as BS (drop, null, take)
|
||||
import Data.ByteString.Char8.Util (decodeHex)
|
||||
import Data.List (find)
|
||||
import Data.ByteString.Char8 (unpack)
|
||||
import Data.Map ((!))
|
||||
import qualified Data.Map as Map (lookup, toList)
|
||||
import PDF.CMap (CMappers, CMap, CRange(..), emptyCMap)
|
||||
import PDF.Object (
|
||||
DirectObject(..), StringObject(..)
|
||||
, array, blank, name, parseBytes, regular, stringObject
|
||||
, array, blank, name, regular, stringObject, toByteString
|
||||
)
|
||||
import PDF.Parser (MonadParser, (<?>), Parser, evalParser, string, takeAll)
|
||||
|
||||
|
@ -128,44 +127,23 @@ runOperator (DQuote, [Typed (StringObject outputString)]) =
|
|||
runOperator _ = return []
|
||||
|
||||
decodeString :: StringObject -> ParserWithFont ByteString
|
||||
decodeString (Hexadecimal h) = decodeString (Literal (decodeHex h))
|
||||
decodeString (Literal litString) = do
|
||||
cRangesBySize <- Map.toList <$> get
|
||||
f cRangesBySize litString
|
||||
decodeString = decode . toByteString
|
||||
where
|
||||
f :: [(Int, [CRange])] -> ByteString -> ParserWithFont ByteString
|
||||
f cRangesBySize input
|
||||
decode input
|
||||
| BS.null input = return ""
|
||||
| otherwise = do
|
||||
(output, newInput) <- g cRangesBySize input
|
||||
mappend output <$> f cRangesBySize newInput
|
||||
g :: [(Int, [CRange])] -> ByteString -> ParserWithFont (ByteString, ByteString)
|
||||
g [] _ = fail "No matching code found in font"
|
||||
g ((size, cRanges):others) s =
|
||||
(output, remainingInput) <- trySizes input =<< Map.toList <$> get
|
||||
mappend output <$> decode remainingInput
|
||||
trySizes :: ByteString -> [(Int, [CRange])] -> ParserWithFont (ByteString, ByteString)
|
||||
trySizes s [] = fail $ "No matching code found in font for " ++ unpack s
|
||||
trySizes s ((size, cRanges):others) =
|
||||
let prefix = BS.take size s in
|
||||
case h prefix cRanges of
|
||||
Nothing -> g others s
|
||||
case tryRanges prefix cRanges of
|
||||
Nothing -> trySizes s others
|
||||
Just outputSequence -> return (outputSequence, BS.drop size s)
|
||||
h :: ByteString -> [CRange] -> Maybe ByteString
|
||||
h prefix [] = Nothing
|
||||
h prefix ((CRange {mapping}):cRanges) =
|
||||
tryRanges :: ByteString -> [CRange] -> Maybe ByteString
|
||||
tryRanges _ [] = Nothing
|
||||
tryRanges prefix ((CRange {mapping}):cRanges) =
|
||||
case Map.lookup prefix mapping of
|
||||
Nothing -> h prefix cRanges
|
||||
Nothing -> tryRanges prefix cRanges
|
||||
outputSequence -> outputSequence
|
||||
|
||||
{-
|
||||
get >>= convertBytes litString
|
||||
where
|
||||
convertBytes :: String -> CMap -> ParserWithFont ByteString
|
||||
convertBytes [] _ = return ""
|
||||
convertBytes (c:cs) someCMap = do
|
||||
convertBytesAux (fromEnum c) 1 cs someCMap
|
||||
convertBytesAux :: Int -> Int -> String -> CMap -> ParserWithFont ByteString
|
||||
convertBytesAux code size s someCMap
|
||||
| size > 4 = fail "Could not match any input code smaller than an int"
|
||||
| otherwise =
|
||||
case (Map.lookup code someCMap, s) of
|
||||
(Nothing, (c:cs)) -> convertBytesAux (code * 256 + fromEnum c) (size + 1) cs someCMap
|
||||
(Nothing, []) -> fail "No character left to read but no code recognized"
|
||||
(Just outputText, _) -> mappend outputText <$> convertBytes s someCMap
|
||||
-}
|
||||
|
|
Loading…
Reference in a new issue