diff --git a/Hufflepdf.cabal b/Hufflepdf.cabal index 69a94a5..0c485f7 100644 --- a/Hufflepdf.cabal +++ b/Hufflepdf.cabal @@ -23,9 +23,9 @@ library , PDF.Output , PDF.Text , PDF.Update + , PDF.Parser other-modules: Data.ByteString.Char8.Util , PDF.Body - , PDF.Parser -- other-extensions: build-depends: attoparsec , base >=4.9 && <4.13 @@ -56,6 +56,16 @@ executable getObj ghc-options: -Wall default-language: Haskell2010 +executable debug + main-is: examples/debug.hs + build-depends: base + , bytestring + , containers + , Hufflepdf + , mtl + ghc-options: -Wall + default-language: Haskell2010 + executable getText main-is: examples/getText.hs build-depends: base diff --git a/examples/debug.hs b/examples/debug.hs new file mode 100644 index 0000000..13bc965 --- /dev/null +++ b/examples/debug.hs @@ -0,0 +1,26 @@ +module Main where + +import Control.Monad.Reader +import qualified Data.ByteString as BS +import qualified Data.ByteString.Char8 as C8 +import qualified Data.Map as Map +import PDF.Object (Name(..), array) +import PDF.CMap (CMappers, cMap, emptyCMap) +import PDF.Parser (evalParser) +import PDF.Text + +test :: CMappers -> ParserWithFont a -> BS.ByteString -> Either String a +test fonts parser = + evalParser (runReaderT parser fonts) emptyCMap + +main :: IO () +main = do + --input <- BS.readFile "20.stream" + input <- BS.readFile "array.bin" + Right font <- cMap <$> BS.readFile "6300.stream" + --mapM_ (\(k, v) -> putStr (show k) >> putStr " -> " >> BS.putStrLn v) $ Map.toList font + --case pageContents (Map.singleton (Name "R9") font) input of + --case test (Map.singleton (Name "R9") font) array input of + case test (Map.singleton (Name "R9") font) (a textOperator) input of + Left e -> putStrLn e + Right l -> putStrLn . show $ l diff --git a/examples/getText.hs b/examples/getText.hs index e3f563b..8a47496 100755 --- a/examples/getText.hs +++ b/examples/getText.hs @@ -16,11 +16,12 @@ import PDF.Object ( Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..) , Object(..), Name(..), Structure(..) ,) -import PDF.Output (ObjectId) +import PDF.Output (ObjectId(..)) import PDF.Text (PageContents(..), pageContents) import PDF.Update (unify) import System.Environment (getArgs) import System.IO (hPutStrLn, stderr) +import Text.Printf (printf) type CachedCMaps = Map ObjectId CMap type T a = RWST Content [ByteString] CachedCMaps [] a @@ -30,16 +31,19 @@ list l = RWST (\_ s -> fillContext s <$> l) where fillContext s a = (a, s, []) -handleError :: a -> String -> T a -handleError defaultValue s = - (tell . replicate 1 . BS.pack $ "Warning: " ++ s) >> return defaultValue +handleError :: ObjectId -> a -> String -> T a +handleError objectId defaultValue s = + (tell . replicate 1 $ BS.pack message) >> return defaultValue + where + message = printf "Object #%d : %s" (getObjectId objectId) s extractText :: Object -> T () extractText object = do pageDict <- dict object cMappers <- loadCMappers =<< getFont pageDict - contents <- stream =<< follow =<< key "Contents" pageDict - either (handleError ()) (tell . chunks) (pageContents cMappers contents) + contentsId <- target =<< key "Contents" pageDict + contents <- stream =<< getObject contentsId + either (handleError contentsId ()) (tell . chunks) (pageContents cMappers contents) stream :: Object -> T ByteString stream (Stream {header, streamContent}) = return $ @@ -73,7 +77,7 @@ loadFont objectId = >>= key "ToUnicode" >>= follow >>= stream - >>= either (handleError emptyCMap) return . cMap + >>= either (handleError objectId emptyCMap) return . cMap loadCMappers :: Dictionary -> T CMappers loadCMappers = foldM loadCMapper Map.empty . Map.toList @@ -94,9 +98,12 @@ key keyName dictionary = Just obj -> return obj _ -> list [] +target :: DirectObject -> T ObjectId +target (Reference (IndirectObjCoordinates {objectId})) = return objectId +target _ = list [] + follow :: DirectObject -> T Object -follow (Reference (IndirectObjCoordinates {objectId})) = getObject objectId -follow _ = list [] +follow directObject = target directObject >>= getObject dict :: Object -> T Dictionary dict (Direct (Dictionary dictionary)) = return dictionary @@ -122,7 +129,6 @@ listTextObjects (Document {updates}) = snd =<< evalRWST rwsMain (unify updates) Map.empty where rwsMain = - --Lazy.pack . show <$> (getObject =<< pagesList) pagesList >>= getObject >>= extractText diff --git a/src/Data/ByteString/Char8/Util.hs b/src/Data/ByteString/Char8/Util.hs index c091351..4f9d905 100644 --- a/src/Data/ByteString/Char8/Util.hs +++ b/src/Data/ByteString/Char8/Util.hs @@ -1,16 +1,53 @@ module Data.ByteString.Char8.Util ( - previous + decodeHex + , fromInt + , hexString + , parseBytes + , previous , subBS + , toInt + , utf16BEToutf8 ) where -import Data.ByteString.Char8 (ByteString) -import qualified Data.ByteString.Char8 as BS (drop, index, take) +import Data.ByteString (ByteString, snoc) +import qualified Data.ByteString as BS (foldl, pack, singleton) +import qualified Data.ByteString.Char8 as Char8 (drop, index, take, unpack) +import Data.Text.Encoding (encodeUtf8, decodeUtf16BE) import Prelude hiding (length) previous :: Char -> Int -> ByteString -> Int previous char position byteString - | BS.index byteString position == char = position + | Char8.index byteString position == char = position | otherwise = previous char (position - 1) byteString subBS :: Int -> Int -> ByteString -> ByteString -subBS offset length = BS.take length . BS.drop offset +subBS offset length = Char8.take length . Char8.drop offset + +hexString :: (Num a, Read a) => String -> a +hexString s = read $ "0x" ++ s + +fromInt :: Int -> ByteString +fromInt n + | n < 0x100 = BS.singleton $ toEnum n + | otherwise = fromInt (n `div` 0x100) `snoc` (toEnum (n `mod` 0x100)) + +toInt :: ByteString -> Int +toInt = BS.foldl (\n w -> 0x100*n + fromEnum w) 0 + +{- +encodeHex :: ByteString -> ByteString +encodeHex = +-} + +decodeHex :: ByteString -> ByteString +decodeHex = parseBytes . Char8.unpack + +parseBytes :: String -> ByteString +parseBytes = BS.pack . fmap hexString . pairDigits + where + pairDigits "" = [] + pairDigits [c] = [[c]] + pairDigits (firstChar:secondChar:end) = (firstChar:[secondChar]):pairDigits end + +utf16BEToutf8 :: ByteString -> ByteString +utf16BEToutf8 = encodeUtf8 . decodeUtf16BE diff --git a/src/PDF/CMap.hs b/src/PDF/CMap.hs index 6fee0d4..6b9395c 100644 --- a/src/PDF/CMap.hs +++ b/src/PDF/CMap.hs @@ -1,3 +1,4 @@ +{-# LANGUAGE NamedFieldPuns #-} module PDF.CMap ( CMap , CMappers @@ -6,35 +7,66 @@ module PDF.CMap ( ) where import Control.Applicative ((<|>), many) -import Data.Attoparsec.ByteString.Char8 (count, parseOnly) -import qualified Data.Attoparsec.ByteString.Char8 as Atto (Parser) -import Data.ByteString (ByteString, snoc) -import qualified Data.ByteString as BS (init, last, null) -import Data.Map (Map) -import qualified Data.Map as Map (empty, fromList) +import Control.Monad.State (modify) +import Data.Attoparsec.ByteString.Char8 (count) +import Data.ByteString (ByteString) +import qualified Data.ByteString as BS (length) +import Data.ByteString.Char8.Util ( + decodeHex, fromInt, toInt, utf16BEToutf8 + ) +import Data.Map (Map, union) +import qualified Data.Map as Map (adjust, empty, fromList, insertWith) import qualified PDF.EOL as EOL (charset, parser) import PDF.Object ( DirectObject(..), Name, StringObject(..) - , blank, directObject, hexString, integer, line, parseBytes, stringObject + , blank, directObject, integer, line, stringObject ) -import PDF.Parser (takeAll) +import PDF.Parser (MonadParser, Parser, runParser, takeAll) type CMappers = Map Name CMap -type CMap = Map Int ByteString +type Mapping = Map ByteString ByteString +data CRange = CRange { + fromSequence :: ByteString + , toSequence :: ByteString + , mapping :: Mapping + } +type RangeSize = Int +type CMap = Map RangeSize [CRange] emptyCMap :: CMap emptyCMap = Map.empty cMap :: ByteString -> Either String CMap -cMap = parseOnly $ mconcat <$> many (cMapRange <|> cMapChar <|> ignoredLine) +cMap = fmap snd <$> runParser + (many (codeRanges <|> cMapRange <|> cMapChar <|> ignoredLine)) + emptyCMap where ignoredLine = - takeAll (not . (`elem` EOL.charset)) *> EOL.parser *> return Map.empty + takeAll (not . (`elem` EOL.charset)) *> EOL.parser *> return () -cMapRange :: Atto.Parser CMap +codeRanges :: Parser CMap () +codeRanges = do + size <- integer <* line "begincodespacerange" + count size (createMapping <$> codeRange) *> return () + line "endcodespacerange" + where + codeRange = + (,) <$> (stringObject <* blank) <*> (stringObject <* blank) <* EOL.parser + +createMapping :: (StringObject, StringObject) -> Parser CMap () +createMapping (Hexadecimal from, Hexadecimal to) = modify $ + Map.insertWith (++) size [CRange {fromSequence, toSequence, mapping}] + where + fromSequence = decodeHex from + size = BS.length fromSequence + toSequence = decodeHex to + mapping = Map.empty +createMapping _ = return () + +cMapRange :: Parser CMap () cMapRange = do size <- integer <* line "beginbfrange" - mconcat <$> count size (Map.fromList <$> rangeMapping) <* line "endbfrange" + mapM_ saveMapping =<< count size rangeMapping <* line "endbfrange" where rangeMapping = (,,) <$> (stringObject <* blank) @@ -42,33 +74,52 @@ cMapRange = do <*> directObject <* EOL.parser >>= mapFromTo -cMapChar :: Atto.Parser CMap +saveMapping :: [(ByteString, ByteString)] -> Parser CMap () +saveMapping [] = return () +saveMapping assoc@((code, _):_) = modify $ Map.adjust insertCRange mappingSize + where + newMapping = Map.fromList assoc + mappingSize = BS.length code + matchingRange (CRange {fromSequence, toSequence}) = + fromSequence <= code && code <= toSequence + appendMapping cRange = + cRange {mapping = mapping cRange `union` newMapping} + insertCRange = fmap (\cRange -> + if matchingRange cRange then appendMapping cRange else cRange + ) + +cMapChar :: Parser CMap () cMapChar = do size <- integer <* line "beginbfchar" - Map.fromList <$> count size charMapping <* line "endbfchar" + saveMapping =<< count size charMapping <* line "endbfchar" where charMapping = (,) <$> (stringObject <* blank) <*> (stringObject <* blank) <* EOL.parser >>= pairMapping -mapFromTo :: (StringObject, StringObject, DirectObject) -> Atto.Parser [(Int, ByteString)] +between :: ByteString -> ByteString -> [ByteString] +between from to = fromInt <$> [toInt from .. toInt to] + +startFrom :: ByteString -> [ByteString] +startFrom from = fromInt <$> [toInt from .. ] + +mapFromTo :: MonadParser m => (StringObject, StringObject, DirectObject) -> m [(ByteString, ByteString)] mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) = - let dstString = parseBytes dstFrom in - return $ zip [hexString from .. hexString to] (textsFrom dstString) + return $ zip (between fromBS toBS) (startFrom dstBS) where - textsFrom t - | BS.null t = [t] - | otherwise = (BS.init t `snoc`) <$> [BS.last t ..] + (fromBS, toBS, dstBS) = (decodeHex from, decodeHex to, decodeHex dstFrom) mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) = - zip [hexString from .. hexString to] <$> (mapM dstString dstPoints) + zip (between fromBS toBS) <$> (mapM dstByteString dstPoints) where - dstString (StringObject (Hexadecimal dstPoint)) = return $ parseBytes dstPoint - dstString _ = fail "Invalid for a replacement string" + (fromBS, toBS) = (decodeHex from, decodeHex to) + dstByteString (StringObject (Hexadecimal dst)) = + return . utf16BEToutf8 $ decodeHex dst + dstByteString _ = fail "Invalid for a replacement string" mapFromTo _ = fail "invalid range mapping found" -pairMapping :: (StringObject, StringObject) -> Atto.Parser (Int, ByteString) +pairMapping :: MonadParser m => (StringObject, StringObject) -> m (ByteString, ByteString) pairMapping (Hexadecimal from, Hexadecimal to) = - return (hexString from, parseBytes to) + return (decodeHex from, utf16BEToutf8 $ decodeHex to) pairMapping _ = fail "invalid pair mapping found" diff --git a/src/PDF/Object.hs b/src/PDF/Object.hs index d9d2457..542d00c 100644 --- a/src/PDF/Object.hs +++ b/src/PDF/Object.hs @@ -22,7 +22,6 @@ module PDF.Object ( , dictionary , directObject , eofMarker - , hexString , integer , line , magicNumber @@ -39,6 +38,7 @@ import Data.Attoparsec.ByteString.Char8 (choice, count, option, sepBy) import Data.ByteString (ByteString) import qualified Data.ByteString as BS (concat, pack) import qualified Data.ByteString.Char8 as Char8 (cons, pack, singleton, unpack) +import Data.ByteString.Char8.Util (hexString, parseBytes) import Data.Map (Map, (!), mapWithKey) import qualified Data.Map as Map ( delete, empty, fromList, lookup, minViewWithKey, toList, union @@ -114,16 +114,16 @@ number = Number . read . Char8.unpack <$> -- -- StringObject -- -data StringObject = Literal String | Hexadecimal String deriving Show +data StringObject = Literal ByteString | Hexadecimal ByteString deriving Show instance Output StringObject where - output (Literal s) = Output.string (printf "(%s)" s) - output (Hexadecimal s) = Output.string (printf "<%s>" s) + output (Literal s) = Output.string (printf "(%s)" (Char8.unpack s)) + output (Hexadecimal s) = Output.string (printf "<%s>" (Char8.unpack s)) stringObject :: MonadParser m => m StringObject stringObject = - Literal . Char8.unpack <$> (char '(' *> (BS.concat <$> literalString) <* char ')') - <|> Hexadecimal . Char8.unpack <$> (char '<' *> hexNumber <* char '>') + Literal <$> (char '(' *> (BS.concat <$> literalString) <* char ')') + <|> Hexadecimal <$> (char '<' *> hexNumber <* char '>') "string object (literal or hexadecimal)" where literalString = many literalStringBlock @@ -135,16 +135,6 @@ stringObject = Char8.cons <$> char '\\' <*> (Char8.singleton <$> oneOf "nrtbf()\\" <|> octalCode) octalCode = choice $ (\n -> Char8.pack <$> count n octDigit) <$> [1..3] -hexString :: (Num a, Read a) => String -> a -hexString s = read $ "0x" ++ s - -parseBytes :: String -> ByteString -parseBytes = encodeUtf8 . decodeUtf16BE . BS.pack . fmap hexString . pairDigits - where - pairDigits "" = [] - pairDigits [c] = [[c]] - pairDigits (firstChar:secondChar:end) = (firstChar:[secondChar]):pairDigits end - -- -- Name -- diff --git a/src/PDF/Text.hs b/src/PDF/Text.hs index 150edc2..30ac6d0 100644 --- a/src/PDF/Text.hs +++ b/src/PDF/Text.hs @@ -11,6 +11,7 @@ import Control.Monad.State (get, put) import Data.Attoparsec.ByteString.Char8 (choice, count, sepBy) import Data.ByteString (ByteString) import qualified Data.ByteString.Char8 as Char8 (unpack) +import Data.ByteString.Char8.Util (decodeHex) import Data.Map ((!)) import qualified Data.Map as Map (lookup) import PDF.CMap (CMappers, CMap, emptyCMap) @@ -22,13 +23,13 @@ import PDF.Parser (MonadParser, (), Parser, evalParser, string, takeAll) data StateOperator = Cm | W | J | J_ | M | D | Ri | I | Gs -- general graphic state - deriving (Bounded, Enum) + deriving (Bounded, Enum, Show) data TextOperator = Td | TD | Tm | Tstar -- text positioning | TJ | Tj | Quote | DQuote -- text showing | Tc | Tw | Tz | TL | Tf | Tr | Ts -- text state - deriving (Bounded, Enum) -data Argument = Raw ByteString | Typed DirectObject + deriving (Bounded, Enum, Show) +data Argument = Raw ByteString | Typed DirectObject deriving Show type Call a = (a, [Argument]) stateOperator :: MonadParser m => StateOperator -> m (Call StateOperator) @@ -112,7 +113,7 @@ runOperator (TJ, [Typed (Array arrayObject)]) = appendText bs (StringObject outputString) = mappend bs <$> decodeString outputString appendText bs _ = return bs - + runOperator (Tj, [Typed (StringObject outputString)]) = replicate 1 <$> decodeString outputString @@ -125,8 +126,10 @@ runOperator (DQuote, [Typed (StringObject outputString)]) = runOperator _ = return [] decodeString :: StringObject -> ParserWithFont ByteString -decodeString (Hexadecimal h) = decodeString (Literal (Char8.unpack $ parseBytes h)) -decodeString (Literal litString) = get >>= convertBytes litString +decodeString (Hexadecimal h) = decodeString (Literal (decodeHex h)) +decodeString (Literal litString) = + +get >>= convertBytes litString where convertBytes :: String -> CMap -> ParserWithFont ByteString convertBytes [] _ = return ""