diff --git a/Hufflepdf.cabal b/Hufflepdf.cabal index f4eb761..5238663 100644 --- a/Hufflepdf.cabal +++ b/Hufflepdf.cabal @@ -30,6 +30,7 @@ library , base >=4.9 && <4.13 , bytestring , containers + , data-serializer , mtl hs-source-dirs: src ghc-options: -Wall diff --git a/examples/getText.hs b/examples/getText.hs index d8a82bc..c0ba6a2 100755 --- a/examples/getText.hs +++ b/examples/getText.hs @@ -16,7 +16,7 @@ import PDF.Object ( , Object(..), Name(..), Structure(..) ,) import PDF.Output (ObjectId) -import PDF.Text (CMap, CMappers, PageContents(..), cMap, pageContents) +import PDF.Text (CMap, CMappers, PageContents(..), cMap, emptyCMap, pageContents) import PDF.Update (unify) import System.Environment (getArgs) import System.IO (hPutStrLn, stderr) @@ -32,26 +32,17 @@ list l = RWST (\_ s -> fillContext s <$> l) extractText :: Object -> T () extractText object = do pageDict <- dict object - contents <- follow =<< key "Contents" pageDict - case contents of - (Stream {header, streamContent}) -> do - font <- getFont pageDict - cMappers <- loadCMappers font - storeDecodedText cMappers $ clear header streamContent - _ -> return () + cMappers <- loadCMappers =<< getFont pageDict + contents <- stream =<< follow =<< key "Contents" pageDict + either (return . const ()) (tell . chunks) (pageContents cMappers contents) -clear :: Dictionary -> ByteString -> ByteString -clear header streamContent = +stream :: Object -> T ByteString +stream (Stream {header, streamContent}) = return $ case Map.lookup (Name "Filter") header of Just (NameObject (Name "FlateDecode")) -> Lazy.toStrict . decompress . Lazy.fromStrict $ streamContent _ -> streamContent - -storeDecodedText :: CMappers -> ByteString -> T () -storeDecodedText font page = - case pageContents font page of - Left _ -> return () - Right (PageContents {chunks}) -> tell chunks +stream _ = list [] getFont :: Dictionary -> T Dictionary getFont pageDict = @@ -76,6 +67,8 @@ loadFont objectId = >>= dict >>= key "ToUnicode" >>= follow + >>= stream + >>= either (return . const emptyCMap) return . cMap loadCMappers :: Dictionary -> T CMappers loadCMappers = foldM loadCMapper Map.empty . Map.toList diff --git a/src/PDF/EOL.hs b/src/PDF/EOL.hs index 0493ea0..45253fc 100644 --- a/src/PDF/EOL.hs +++ b/src/PDF/EOL.hs @@ -6,14 +6,14 @@ module PDF.EOL ( ) where import Control.Applicative ((<|>)) -import PDF.Parser (Parser, string) +import PDF.Parser (MonadParser, string) data Style = CR | LF | CRLF deriving Show charset :: String charset = "\r\n" -parser :: Parser s Style +parser :: MonadParser m => m Style parser = (string "\r\n" >> return CRLF) <|> (string "\r" >> return CR) diff --git a/src/PDF/Object.hs b/src/PDF/Object.hs index 1a4ed06..495f2a3 100644 --- a/src/PDF/Object.hs +++ b/src/PDF/Object.hs @@ -13,9 +13,11 @@ module PDF.Object ( , Number(..) , Object(..) , Occurrence(..) + , StringObject(..) , Structure(..) , XRefEntry(..) , XRefSection + , array , blank , dictionary , directObject @@ -24,14 +26,16 @@ module PDF.Object ( , line , magicNumber , name + , number , regular + , stringObject , structure ) where import Control.Applicative ((<|>), many) import Data.Attoparsec.ByteString.Char8 (choice, count, option, sepBy) -import Data.ByteString.Char8 (ByteString) -import qualified Data.ByteString.Char8 as BS ( +import Data.ByteString (ByteString) +import qualified Data.ByteString as BS ( concat, cons, pack, singleton, unpack ) import Data.Map (Map, (!), mapWithKey) @@ -48,7 +52,7 @@ import PDF.Output ( import PDF.Parser (MonadParser(..), Parser, (), octDigit, oneOf) import Text.Printf (printf) -line :: String -> Parser u () +line :: MonadParser m => String -> m () line l = (string (BS.pack l) *> EOL.parser *> return ()) printf "line «%s»" l magicNumber :: ByteString @@ -69,7 +73,7 @@ delimiterCharset = "()<>[]{}/%" regular :: Char -> Bool regular = not . (`elem` (EOL.charset ++ whiteSpaceCharset ++ delimiterCharset)) -integer :: (Read a, Num a) => Parser u a +integer :: (Read a, Num a, MonadParser m) => m a integer = read . BS.unpack <$> decNumber <* blank "decimal integer" ------------------------------------- @@ -81,7 +85,7 @@ type IndexedObjects = Map ObjectId Object -- -- Boolean -- -boolean :: Parser u Bool +boolean :: MonadParser m => m Bool boolean = (string "true" *> return True) <|> (string "false" *> return False) "boolean" @@ -96,7 +100,7 @@ instance Output Number where (n, 0) -> printf "%d" (n :: Int) _ -> printf "%f" f -number :: Parser u Number +number :: MonadParser m => m Number number = Number . read . BS.unpack <$> (mappend <$> sign <*> (integerPart <|> BS.cons '0' <$> floatPart)) "number" @@ -114,7 +118,7 @@ instance Output StringObject where output (Literal s) = Output.string (printf "(%s)" s) output (Hexadecimal s) = Output.string (printf "<%s>" s) -stringObject :: Parser u StringObject +stringObject :: MonadParser m => m StringObject stringObject = Literal . BS.unpack <$> (char '(' *> (BS.concat <$> literalString) <* char ')') <|> Hexadecimal . BS.unpack <$> (char '<' *> hexNumber <* char '>') @@ -143,7 +147,7 @@ name = Name . BS.unpack <$> (char '/' *> takeAll regular) "name" -- -- Array -- -array :: Parser u [DirectObject] +array :: MonadParser m => m [DirectObject] array = char '[' *> blank *> directObject `sepBy` blank <* blank <* char ']' "array" @@ -160,7 +164,7 @@ instance Output Dictionary where outputKeyVal :: (Name, DirectObject) -> OBuilder outputKeyVal (key, val) = Output.concat [output key, " ", output val] -dictionary :: Parser u Dictionary +dictionary :: MonadParser m => m Dictionary dictionary = string "<<" *> blank *> keyValPairs <* string ">>" "dictionary" where @@ -170,7 +174,7 @@ dictionary = -- -- Null -- -nullObject :: Parser u () +nullObject :: MonadParser m => m () nullObject = string "null" *> return () "null object" -- @@ -181,7 +185,7 @@ data IndirectObjCoordinates = IndirectObjCoordinates { , versionNumber :: Int } deriving Show -reference :: Parser u IndirectObjCoordinates +reference :: MonadParser m => m IndirectObjCoordinates reference = IndirectObjCoordinates <$> (fmap ObjectId integer) <*> integer <* char 'R' "reference to an object" @@ -210,7 +214,7 @@ instance Output DirectObject where output (Reference (IndirectObjCoordinates {objectId, versionNumber})) = Output.string (printf "%d %d R" (getObjectId objectId) versionNumber) -directObject :: Parser u DirectObject +directObject :: MonadParser m => m DirectObject directObject = Boolean <$> boolean <|> Reference <$> reference {- defined before Number because Number is a prefix of it -} diff --git a/src/PDF/Text.hs b/src/PDF/Text.hs index ef635d6..35d787f 100644 --- a/src/PDF/Text.hs +++ b/src/PDF/Text.hs @@ -4,19 +4,24 @@ module PDF.Text ( , CMappers , PageContents(..) , cMap + , emptyCMap , pageContents ) where -import Control.Applicative ((<|>)) -import Control.Monad (join) -import Control.Monad.Reader (ReaderT, runReaderT) -import Control.Monad.State (state) -import Data.Attoparsec.ByteString.Char8 (count, sepBy) -import Data.ByteString.Char8 (ByteString) -import Data.Map (Map) -import qualified Data.Map as Map (empty) -import PDF.Object (Content, Name, blank, name, regular) -import PDF.Output (ObjectId) +import Control.Applicative ((<|>), many) +import Control.Monad (foldM, join) +import Control.Monad.Reader (ReaderT, runReaderT, asks) +import Control.Monad.State (put) +import Data.Attoparsec.ByteString.Char8 (choice, count, parseOnly, sepBy) +import qualified Data.Attoparsec.ByteString.Char8 as Atto (Parser) +import Data.ByteString.Char8 (ByteString, pack) +import Data.Map (Map, (!)) +import qualified Data.Map as Map (empty, fromList) +import PDF.Object ( + DirectObject(..), Name, StringObject(..) + , array, blank, directObject, integer, line, name, regular, stringObject + ) +import qualified PDF.EOL as EOL (charset, parser) import PDF.Parser (Parser, evalParser, string, takeAll) type CMappers = Map Name CMap @@ -25,10 +30,92 @@ type CMap = Map Int ByteString emptyCMap :: CMap emptyCMap = Map.empty -data TextOperator = TJ | Tj | Tf | Other +cMap :: ByteString -> Either String CMap +cMap = parseOnly $ mconcat <$> many (cMapRange <|> cMapChar <|> ignoredLine) + where + ignoredLine = + takeAll (not . (`elem` EOL.charset)) *> EOL.parser *> return Map.empty -cMap :: ByteString -> CMap -cMap = undefined +cMapRange :: Atto.Parser CMap +cMapRange = do + size <- integer <* line "beginbfrange" + mconcat <$> count size rangeMapping <* line "endbfrange" + where + rangeMapping = mapFromTo + <$> (stringObject <* blank) + <*> (stringObject <* blank) + <*> directObject <* EOL.parser + +cMapChar :: Atto.Parser CMap +cMapChar = do + size <- integer <* line "beginbfchar" + Map.fromList <$> count size charMapping <* line "endbfchar" + where + charMapping = pairMapping + <$> (stringObject <* blank) <*> (stringObject <* blank) <* EOL.parser + +mapFromTo (Hexadecimal from) (Hexadecimal to) (StringObject (Hexadecimal dstFrom)) = + undefined +mapFromTo (Hexadecimal from) (Hexadecimal to) (Array dstPoints) = undefined +mapFromTo _ _ _ = Map.empty + +pairMapping :: StringObject -> StringObject -> (Int, ByteString) +pairMapping (Hexadecimal from) (Hexadecimal to) = + (read $ "0x" ++ from, + +fromBytes :: String -> ByteString +fromBytes s + where + toBaseWord8 n + | n < 0xff = [ + to + +data StateOperator = + Cm | W | J | J_ | M | D | Ri | I | Gs -- general graphic state + deriving (Bounded, Enum) +data TextOperator = + Td | TD | Tm | Tstar -- text positioning + | TJ | Tj | Quote | DQuote -- text showing + | Tc | Tw | Tz | TL | Tf | Tr | Ts -- text state + deriving (Bounded, Enum) +data Argument = Raw ByteString | Typed DirectObject +type Call a = (a, [Argument]) + +stateOperator :: StateOperator -> ParserWithFont (Call StateOperator) +stateOperator Cm = (,) Cm <$> count 6 argument <* string "cm" +stateOperator W = (,) W <$> count 1 argument <* string "w" +stateOperator J = (,) J <$> count 1 argument <* string "J" +stateOperator J_ = (,) J_ <$> count 1 argument <* string "j" +stateOperator M = (,) M <$> count 1 argument <* string "M" +stateOperator D = (,) D <$> count 2 argument <* string "d" +stateOperator Ri = (,) Ri <$> count 1 argument <* string "ri" +stateOperator I = (,) I <$> count 1 argument <* string "i" +stateOperator Gs = (,) Gs <$> count 1 argument <* string "gs" + +textOperator :: TextOperator -> ParserWithFont (Call TextOperator) +textOperator Td = (,) Td <$> count 2 argument <* string "Td" +textOperator TD = (,) TD <$> count 2 argument <* string "TD" +textOperator Tm = (,) Tm <$> count 6 argument <* string "Tm" +textOperator Tstar = (,) Td <$> return [] <* string "T*" +textOperator TJ = + (,) TJ <$> sequence [Typed . Array <$> array] <* string "TJ" +textOperator Tj = + (,) Tj <$> sequence [Typed . StringObject <$> stringObject] <* string "Tj" +textOperator Quote = (,) Quote <$> count 1 argument <* string "'" +textOperator DQuote = (,) DQuote <$> count 1 argument <* string "\"" +textOperator Tc = (,) Tc <$> count 1 argument <* string "Tc" +textOperator Tw = (,) Tw <$> count 1 argument <* string "Tw" +textOperator Tz = (,) Tz <$> count 1 argument <* string "Tz" +textOperator TL = (,) TL <$> count 1 argument <* string "TL" +textOperator Tf = (,) Tf <$> sequence [Typed . NameObject <$> name, argument] <* string "Tf" +textOperator Tr = (,) Tr <$> count 1 argument <* string "Tr" +textOperator Ts = (,) Ts <$> count 1 argument <* string "Ts" + +a :: (Bounded o, Enum o) => (o -> ParserWithFont (Call o)) -> ParserWithFont (Call o) +a parserGenerator = choice $ parserGenerator <$> [minBound .. maxBound] + +argument :: ParserWithFont Argument +argument = Raw <$> takeAll regular <* blank data PageContents = PageContents { chunks :: [ByteString] @@ -36,18 +123,6 @@ data PageContents = PageContents { type ParserWithFont = ReaderT CMappers (Parser CMap) -{- -data FontContext = FontContext { - cMappers :: CMappers - , currentFont :: CMap - } - -initFontContext cMappers = FontContext { - cMappers - , currentFont = emptyCMap - } --} - pageContents :: CMappers -> ByteString -> Either String PageContents pageContents font input = evalParser (runReaderT (PageContents <$> page) font) emptyCMap input @@ -60,14 +135,37 @@ graphicState = string "q" *> blank *> insideQ <* string "Q" where insideQ = join <$> (command <|> page `sepBy` blank ) - command = - count 6 argument *> string "cm" *> return [] - <|> name *> blank *> string "gs" *> return [] - argument = takeAll regular <* blank + command = a stateOperator *> return [] text :: ParserWithFont [ByteString] -text = undefined +text = + string "BT" *> blank *> commands <* blank <* string "ET" + where + commands = join <$> (a textOperator >>= runOperator) `sepBy` blank -textOperator :: ParserWithFont TextOperator -textOperator = undefined +runOperator :: Call TextOperator -> ParserWithFont [ByteString] +runOperator (Tf, [Typed (NameObject fontName), _]) = + asks (! fontName) >>= put >> return [] +runOperator (Tstar, []) = return ["\n"] + +runOperator (TJ, [Typed (Array arrayObject)]) = + replicate 1 <$> foldM appendText "" arrayObject + where + appendText bs (StringObject outputString) = + mappend bs <$> decodeString outputString + appendText bs _ = return bs + +runOperator (Tj, [Typed (StringObject outputString)]) = + replicate 1 <$> decodeString outputString + +runOperator (Quote, [Typed (StringObject outputString)]) = + (\bs -> ["\n", bs]) <$> decodeString outputString + +runOperator (DQuote, [Typed (StringObject outputString)]) = + (\bs -> ["\n", bs]) <$> decodeString outputString + +runOperator _ = return [] + +decodeString :: StringObject -> ParserWithFont ByteString +decodeString = undefined