diff --git a/examples/debug.hs b/examples/debug.hs index 13bc965..ef9ef4a 100644 --- a/examples/debug.hs +++ b/examples/debug.hs @@ -1,11 +1,12 @@ +{-# LANGUAGE NamedFieldPuns #-} module Main where import Control.Monad.Reader import qualified Data.ByteString as BS import qualified Data.ByteString.Char8 as C8 import qualified Data.Map as Map -import PDF.Object (Name(..), array) -import PDF.CMap (CMappers, cMap, emptyCMap) +import PDF.Object (Name(..), StringObject(..), array) +import PDF.CMap (CMappers, CRange(..), cMap, emptyCMap) import PDF.Parser (evalParser) import PDF.Text @@ -15,12 +16,20 @@ test fonts parser = main :: IO () main = do - --input <- BS.readFile "20.stream" - input <- BS.readFile "array.bin" + input <- BS.readFile "20.stream" + --input <- BS.readFile "array.bin" + --let input = C8.pack "\f\xb5\0I" Right font <- cMap <$> BS.readFile "6300.stream" - --mapM_ (\(k, v) -> putStr (show k) >> putStr " -> " >> BS.putStrLn v) $ Map.toList font + --mapM_ (\(size, m) -> putStrLn ("taille " ++ show size) >> mapM_ showCRange m) $ Map.toList font --case pageContents (Map.singleton (Name "R9") font) input of --case test (Map.singleton (Name "R9") font) array input of - case test (Map.singleton (Name "R9") font) (a textOperator) input of + --case test (Map.singleton (Name "R9") font) (a textOperator) input of + case test (Map.singleton (Name "R9") font) page input of Left e -> putStrLn e Right l -> putStrLn . show $ l + where + showMapping = mapM_ (\(k, v) -> putStr (show k) >> putStr " -> " >> C8.putStrLn v) . Map.toList + showCRange :: CRange -> IO () + showCRange (CRange {fromSequence, toSequence, mapping}) = do + putStrLn $ "from " ++ C8.unpack fromSequence ++ " to " ++ C8.unpack toSequence + showMapping mapping diff --git a/src/Data/ByteString/Char8/Util.hs b/src/Data/ByteString/Char8/Util.hs index 6a04ea0..d2471b4 100644 --- a/src/Data/ByteString/Char8/Util.hs +++ b/src/Data/ByteString/Char8/Util.hs @@ -1,19 +1,30 @@ module Data.ByteString.Char8.Util ( - decodeHex - , fromInt - , hexString + B16Int(..) + , B256Int(..) + , b8ToInt + , b16ToBytes + , b16ToInt + , b256ToInt + , intToB256 , previous , subBS - , toInt , toBytes + , unescape , utf16BEToutf8 ) where import Data.ByteString (ByteString, snoc) -import qualified Data.ByteString as BS (empty, foldl, pack, singleton) -import qualified Data.ByteString.Char8 as Char8 (drop, index, take, unpack) +import qualified Data.ByteString as BS (empty, foldl, length, pack, singleton, splitAt) +import qualified Data.ByteString.Char8 as Char8 ( + cons, drop, index, splitAt, take, uncons, unpack + ) import Data.Text.Encoding (encodeUtf8, decodeUtf16BE) import Prelude hiding (length) +import Text.Printf (printf) + +newtype B8Int = B8Int ByteString deriving Show +newtype B16Int = B16Int ByteString deriving Show +newtype B256Int = B256Int ByteString deriving Show previous :: Char -> Int -> ByteString -> Int previous char position byteString @@ -23,27 +34,57 @@ previous char position byteString subBS :: Int -> Int -> ByteString -> ByteString subBS offset length = Char8.take length . Char8.drop offset -hexString :: (Num a, Read a) => String -> a -hexString s = read $ "0x" ++ s +intToB256 :: Int -> B256Int +intToB256 n + | n < 0x100 = B256Int . BS.singleton $ toEnum n + | otherwise = + let B256Int begining = intToB256 (n `div` 0x100) in + B256Int $ begining `snoc` (toEnum (n `mod` 0x100)) -fromInt :: Int -> ByteString -fromInt n - | n < 0x100 = BS.singleton $ toEnum n - | otherwise = fromInt (n `div` 0x100) `snoc` (toEnum (n `mod` 0x100)) +b256ToInt :: B256Int -> Int +b256ToInt (B256Int n) = BS.foldl (\k w -> 0x100*k + fromEnum w) 0 n toBytes :: Int -> Int -> ByteString toBytes 0 _ = BS.empty -toBytes size n = toBytes (size - 1) (n `div` 0x100) `snoc` (toEnum (n `mod` 0x100)) +toBytes size n = + (toBytes (size - 1) (n `div` 0x100)) `snoc` (toEnum (n `mod` 0x100)) -toInt :: ByteString -> Int -toInt = BS.foldl (\n w -> 0x100*n + fromEnum w) 0 - -decodeHex :: ByteString -> ByteString -decodeHex = BS.pack . fmap hexString . pairDigits . Char8.unpack +b16ToBytes :: B16Int -> ByteString +b16ToBytes (B16Int n) = BS.pack . fmap b16ToInt $ pairDigits n where - pairDigits "" = [] - pairDigits [c] = [[c]] - pairDigits (firstChar:secondChar:end) = (firstChar:[secondChar]):pairDigits end + pairDigits s = + case BS.length s of + 0 -> [] + 1 -> [B16Int s] + _ -> + let (twoHexDigits, rest) = BS.splitAt 2 s in + (B16Int $ twoHexDigits):(pairDigits rest) + +fromBase :: (Num a, Read a) => Char -> ByteString -> a +fromBase b = read . printf "0%c%s" b . Char8.unpack + +b16ToInt :: (Num a, Read a) => B16Int -> a +b16ToInt (B16Int n) = fromBase 'x' n + +b8ToInt :: (Num a, Read a) => B8Int -> a +b8ToInt (B8Int n) = fromBase 'o' n + +unescape :: ByteString -> ByteString +unescape escapedBS = + case Char8.uncons escapedBS of + Nothing -> BS.empty + Just ('\\', s) -> unescapeChar s + Just (c, s) -> Char8.cons c (unescape s) + where + unescapeChar s = + case Char8.uncons s of + Nothing -> BS.empty + Just (c, s') + | c `elem` "()" -> Char8.cons c (unescape s') + | c `elem` "nrtbf" -> Char8.cons (read (printf "'\\%c'" c)) (unescape s') + | c `elem` ['0'..'7'] -> fromOctal (Char8.splitAt 3 s) + | otherwise -> Char8.cons c (unescape s') + fromOctal (code, s) = Char8.cons (toEnum $ b8ToInt (B8Int code)) (unescape s) utf16BEToutf8 :: ByteString -> ByteString utf16BEToutf8 = encodeUtf8 . decodeUtf16BE diff --git a/src/PDF/CMap.hs b/src/PDF/CMap.hs index 4d5b553..51a48db 100644 --- a/src/PDF/CMap.hs +++ b/src/PDF/CMap.hs @@ -14,7 +14,7 @@ import Data.Attoparsec.ByteString.Char8 (count) import Data.ByteString (ByteString) import qualified Data.ByteString as BS (length) import Data.ByteString.Char8.Util ( - decodeHex, toBytes, toInt, utf16BEToutf8 + B16Int(..), b16ToBytes, b16ToInt, toBytes, utf16BEToutf8 ) import Data.Map (Map, union) import qualified Data.Map as Map (adjust, empty, fromList, insertWith) @@ -63,9 +63,9 @@ createMapping :: (StringObject, StringObject) -> Parser CMap () createMapping (Hexadecimal from, Hexadecimal to) = modify $ Map.insertWith (++) size [CRange {fromSequence, toSequence, mapping}] where - fromSequence = decodeHex from + fromSequence = b16ToBytes from size = BS.length fromSequence - toSequence = decodeHex to + toSequence = b16ToBytes to mapping = Map.empty createMapping _ = return () @@ -102,33 +102,30 @@ cMapChar = do (,) <$> stringObject <* blank <*> stringObject <* EOL.parser >>= pairMapping -between :: ByteString -> ByteString -> [ByteString] -between from to = - let size = BS.length from in - toBytes size <$> [toInt from .. toInt to] +between :: B16Int -> B16Int -> [ByteString] +between from@(B16Int s) to = + let size = BS.length s `div` 2 in + toBytes size <$> [b16ToInt from .. b16ToInt to] -startFrom :: ByteString -> [ByteString] -startFrom from = - let size = BS.length from in - toBytes size <$> [toInt from .. ] +startFrom :: B16Int -> [ByteString] +startFrom from@(B16Int s) = + let size = BS.length s `div` 2 in + toBytes size <$> [b16ToInt from .. ] mapFromTo :: MonadParser m => (StringObject, StringObject, DirectObject) -> m [(ByteString, ByteString)] mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) = - return $ zip (between fromBS toBS) (utf16BEToutf8 <$> startFrom dstBS) - where - (fromBS, toBS, dstBS) = (decodeHex from, decodeHex to, decodeHex dstFrom) + return $ zip (between from to) (utf16BEToutf8 <$> startFrom dstFrom) mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) = - zip (between fromBS toBS) <$> (mapM dstByteString dstPoints) + zip (between from to) <$> (mapM dstByteString dstPoints) where - (fromBS, toBS) = (decodeHex from, decodeHex to) dstByteString (StringObject (Hexadecimal dst)) = - return . utf16BEToutf8 $ decodeHex dst + return . utf16BEToutf8 $ b16ToBytes dst dstByteString _ = fail "Invalid for a replacement string" mapFromTo _ = fail "invalid range mapping found" pairMapping :: MonadParser m => (StringObject, StringObject) -> m (ByteString, ByteString) pairMapping (Hexadecimal from, Hexadecimal to) = - return (decodeHex from, utf16BEToutf8 $ decodeHex to) + return (b16ToBytes from, utf16BEToutf8 $ b16ToBytes to) pairMapping _ = fail "invalid pair mapping found" diff --git a/src/PDF/Object.hs b/src/PDF/Object.hs index 35bf0d8..9e990f4 100644 --- a/src/PDF/Object.hs +++ b/src/PDF/Object.hs @@ -30,6 +30,7 @@ module PDF.Object ( , regular , stringObject , structure + , toByteString ) where import Control.Applicative ((<|>), many) @@ -37,6 +38,7 @@ import Data.Attoparsec.ByteString.Char8 (choice, count, option, sepBy) import Data.ByteString (ByteString) import qualified Data.ByteString as BS (concat) import qualified Data.ByteString.Char8 as Char8 (cons, pack, singleton, unpack) +import Data.ByteString.Char8.Util (B16Int(..), b16ToBytes, unescape) import Data.Map (Map, (!), mapWithKey) import qualified Data.Map as Map ( delete, empty, fromList, lookup, minViewWithKey, toList, union @@ -111,11 +113,11 @@ number = Number . read . Char8.unpack <$> -- -- StringObject -- -data StringObject = Literal ByteString | Hexadecimal ByteString deriving Show +data StringObject = Literal ByteString | Hexadecimal B16Int deriving Show instance Output StringObject where output (Literal s) = Output.string (printf "(%s)" (Char8.unpack s)) - output (Hexadecimal s) = Output.string (printf "<%s>" (Char8.unpack s)) + output (Hexadecimal (B16Int n)) = Output.string (printf "<%s>" (Char8.unpack n)) stringObject :: MonadParser m => m StringObject stringObject = @@ -129,9 +131,13 @@ stringObject = matchingParenthesis = mappend <$> (Char8.cons <$> char '(' <*> literalStringBlock) <*> string ")" escapedChar = - Char8.cons <$> char '\\' <*> (Char8.singleton <$> oneOf "nrtbf()\\" <|> octalCode) + Char8.cons <$> char '\\' <*> (Char8.singleton <$> oneOf "nrtbf()\\\n" <|> octalCode) octalCode = choice $ (\n -> Char8.pack <$> count n octDigit) <$> [1..3] +toByteString :: StringObject -> ByteString +toByteString (Hexadecimal h) = b16ToBytes h +toByteString (Literal s) = unescape s + -- -- Name -- diff --git a/src/PDF/Parser.hs b/src/PDF/Parser.hs index c9c380a..c1b0ec9 100644 --- a/src/PDF/Parser.hs +++ b/src/PDF/Parser.hs @@ -28,6 +28,7 @@ import qualified Data.Attoparsec.ByteString.Char8 as Atto ( Parser, char, parseOnly, satisfy, string, take, takeWhile, takeWhile1 ) import Data.ByteString (ByteString) +import Data.ByteString.Char8.Util (B16Int(..)) import Data.Char (toLower) import Data.Set (Set) import qualified Data.Set as Set (fromList, member, unions) @@ -39,7 +40,7 @@ class MonadDeps m => MonadParser m where block :: Int -> m ByteString char :: Char -> m Char decNumber :: m ByteString - hexNumber :: m ByteString + hexNumber :: m B16Int oneOf :: String -> m Char string :: ByteString -> m ByteString takeAll :: (Char -> Bool) -> m ByteString @@ -49,7 +50,7 @@ instance MonadParser Atto.Parser where block = Atto.take char = Atto.char decNumber = Atto.takeWhile1 (`Set.member` digits) - hexNumber = Atto.takeWhile1 (`Set.member` hexDigits) + hexNumber = B16Int <$> Atto.takeWhile1 (`Set.member` hexDigits) oneOf charSet = Atto.satisfy (`elem` charSet) string s = Atto.string s show s takeAll = Atto.takeWhile diff --git a/src/PDF/Text.hs b/src/PDF/Text.hs index 7cb4783..dfa926c 100644 --- a/src/PDF/Text.hs +++ b/src/PDF/Text.hs @@ -12,14 +12,13 @@ import Control.Monad.State (get, put) import Data.Attoparsec.ByteString.Char8 (choice, count, sepBy) import Data.ByteString (ByteString) import qualified Data.ByteString as BS (drop, null, take) -import Data.ByteString.Char8.Util (decodeHex) -import Data.List (find) +import Data.ByteString.Char8 (unpack) import Data.Map ((!)) import qualified Data.Map as Map (lookup, toList) import PDF.CMap (CMappers, CMap, CRange(..), emptyCMap) import PDF.Object ( DirectObject(..), StringObject(..) - , array, blank, name, parseBytes, regular, stringObject + , array, blank, name, regular, stringObject, toByteString ) import PDF.Parser (MonadParser, (), Parser, evalParser, string, takeAll) @@ -128,44 +127,23 @@ runOperator (DQuote, [Typed (StringObject outputString)]) = runOperator _ = return [] decodeString :: StringObject -> ParserWithFont ByteString -decodeString (Hexadecimal h) = decodeString (Literal (decodeHex h)) -decodeString (Literal litString) = do - cRangesBySize <- Map.toList <$> get - f cRangesBySize litString +decodeString = decode . toByteString where - f :: [(Int, [CRange])] -> ByteString -> ParserWithFont ByteString - f cRangesBySize input + decode input | BS.null input = return "" | otherwise = do - (output, newInput) <- g cRangesBySize input - mappend output <$> f cRangesBySize newInput - g :: [(Int, [CRange])] -> ByteString -> ParserWithFont (ByteString, ByteString) - g [] _ = fail "No matching code found in font" - g ((size, cRanges):others) s = + (output, remainingInput) <- trySizes input =<< Map.toList <$> get + mappend output <$> decode remainingInput + trySizes :: ByteString -> [(Int, [CRange])] -> ParserWithFont (ByteString, ByteString) + trySizes s [] = fail $ "No matching code found in font for " ++ unpack s + trySizes s ((size, cRanges):others) = let prefix = BS.take size s in - case h prefix cRanges of - Nothing -> g others s + case tryRanges prefix cRanges of + Nothing -> trySizes s others Just outputSequence -> return (outputSequence, BS.drop size s) - h :: ByteString -> [CRange] -> Maybe ByteString - h prefix [] = Nothing - h prefix ((CRange {mapping}):cRanges) = + tryRanges :: ByteString -> [CRange] -> Maybe ByteString + tryRanges _ [] = Nothing + tryRanges prefix ((CRange {mapping}):cRanges) = case Map.lookup prefix mapping of - Nothing -> h prefix cRanges + Nothing -> tryRanges prefix cRanges outputSequence -> outputSequence - -{- -get >>= convertBytes litString - where - convertBytes :: String -> CMap -> ParserWithFont ByteString - convertBytes [] _ = return "" - convertBytes (c:cs) someCMap = do - convertBytesAux (fromEnum c) 1 cs someCMap - convertBytesAux :: Int -> Int -> String -> CMap -> ParserWithFont ByteString - convertBytesAux code size s someCMap - | size > 4 = fail "Could not match any input code smaller than an int" - | otherwise = - case (Map.lookup code someCMap, s) of - (Nothing, (c:cs)) -> convertBytesAux (code * 256 + fromEnum c) (size + 1) cs someCMap - (Nothing, []) -> fail "No character left to read but no code recognized" - (Just outputText, _) -> mappend outputText <$> convertBytes s someCMap --}