{-# LANGUAGE OverloadedStrings #-} module PDF.Text {-( CMap , CMappers , PageContents(..) , cMap , emptyCMap , pageContents )-} where import Control.Applicative ((<|>), many) import Control.Monad (foldM, join) import Control.Monad.Reader (ReaderT, runReaderT, asks) import Control.Monad.State (put) import Data.Attoparsec.ByteString.Char8 (choice, count, parseOnly, sepBy) import qualified Data.Attoparsec.ByteString.Char8 as Atto (Parser) import Data.ByteString (ByteString, pack) import qualified Data.ByteString.UTF8 as UTF8 (fromString, toString) import Data.Map (Map, (!)) import qualified Data.Map as Map (empty, fromList) import PDF.Object ( DirectObject(..), Name, StringObject(..) , array, blank, directObject, integer, line, name, regular, stringObject ) import qualified PDF.EOL as EOL (charset, parser) import PDF.Parser (Parser, evalParser, string, takeAll) type CMappers = Map Name CMap type CMap = Map Int String emptyCMap :: CMap emptyCMap = Map.empty cMap :: ByteString -> Either String CMap cMap = parseOnly $ mconcat <$> many (cMapRange <|> cMapChar <|> ignoredLine) where ignoredLine = takeAll (not . (`elem` EOL.charset)) *> EOL.parser *> return Map.empty cMapRange :: Atto.Parser CMap cMapRange = do size <- integer <* line "beginbfrange" mconcat <$> count size (Map.fromList <$> rangeMapping) <* line "endbfrange" where rangeMapping = (,,) <$> (stringObject <* blank) <*> (stringObject <* blank) <*> directObject <* EOL.parser >>= mapFromTo cMapChar :: Atto.Parser CMap cMapChar = do size <- integer <* line "beginbfchar" Map.fromList <$> count size charMapping <* line "endbfchar" where charMapping = (,) <$> (stringObject <* blank) <*> (stringObject <* blank) <* EOL.parser >>= pairMapping mapFromTo :: (StringObject, StringObject, DirectObject) -> Atto.Parser [(Int, String)] mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) = let dstString = utf8Decode dstFrom in return $ zip [hexString from .. hexString to] (stringsFrom dstString) where stringsFrom [] = [[]] stringsFrom [x] = (:[]) <$> [x..] stringsFrom (x:xs) = (x:) <$> stringsFrom xs mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) = zip [hexString from .. hexString to] <$> (mapM dstString dstPoints) where dstString (StringObject (Hexadecimal dstPoint)) = return $ utf8Decode dstPoint dstString _ = fail "Invalid for a replacement string" mapFromTo _ = fail "invalid range mapping found" pairMapping :: (StringObject, StringObject) -> Atto.Parser (Int, String) pairMapping (Hexadecimal from, Hexadecimal to) = return (hexString from, utf8Decode to) pairMapping _ = fail "invalid pair mapping found" hexString :: (Num a, Read a) => String -> a hexString s = read $ "0x" ++ s pairDigits :: String -> [String] pairDigits "" = [] pairDigits [c] = [[c]] pairDigits (a:b:end) = (a:[b]):pairDigits end toBytes :: String -> ByteString toBytes = pack . fmap hexString . pairDigits utf8Decode :: String -> String utf8Decode = UTF8.toString . pack . toBaseWord8 [] . (hexString :: String -> Integer) where toBaseWord8 digits n | n < 0xff = (fromIntegral n):digits | otherwise = let newDigits = (fromIntegral (n `mod` 0x100)):digits in case n `div` 0x100 of 0 -> newDigits k -> toBaseWord8 newDigits k data StateOperator = Cm | W | J | J_ | M | D | Ri | I | Gs -- general graphic state deriving (Bounded, Enum) data TextOperator = Td | TD | Tm | Tstar -- text positioning | TJ | Tj | Quote | DQuote -- text showing | Tc | Tw | Tz | TL | Tf | Tr | Ts -- text state deriving (Bounded, Enum) data Argument = Raw ByteString | Typed DirectObject type Call a = (a, [Argument]) stateOperator :: StateOperator -> ParserWithFont (Call StateOperator) stateOperator Cm = (,) Cm <$> count 6 argument <* string "cm" stateOperator W = (,) W <$> count 1 argument <* string "w" stateOperator J = (,) J <$> count 1 argument <* string "J" stateOperator J_ = (,) J_ <$> count 1 argument <* string "j" stateOperator M = (,) M <$> count 1 argument <* string "M" stateOperator D = (,) D <$> count 2 argument <* string "d" stateOperator Ri = (,) Ri <$> count 1 argument <* string "ri" stateOperator I = (,) I <$> count 1 argument <* string "i" stateOperator Gs = (,) Gs <$> count 1 argument <* string "gs" textOperator :: TextOperator -> ParserWithFont (Call TextOperator) textOperator Td = (,) Td <$> count 2 argument <* string "Td" textOperator TD = (,) TD <$> count 2 argument <* string "TD" textOperator Tm = (,) Tm <$> count 6 argument <* string "Tm" textOperator Tstar = (,) Td <$> return [] <* string "T*" textOperator TJ = (,) TJ <$> sequence [Typed . Array <$> array] <* string "TJ" textOperator Tj = (,) Tj <$> sequence [Typed . StringObject <$> stringObject] <* string "Tj" textOperator Quote = (,) Quote <$> count 1 argument <* string "'" textOperator DQuote = (,) DQuote <$> count 1 argument <* string "\"" textOperator Tc = (,) Tc <$> count 1 argument <* string "Tc" textOperator Tw = (,) Tw <$> count 1 argument <* string "Tw" textOperator Tz = (,) Tz <$> count 1 argument <* string "Tz" textOperator TL = (,) TL <$> count 1 argument <* string "TL" textOperator Tf = (,) Tf <$> sequence [Typed . NameObject <$> name, argument] <* string "Tf" textOperator Tr = (,) Tr <$> count 1 argument <* string "Tr" textOperator Ts = (,) Ts <$> count 1 argument <* string "Ts" a :: (Bounded o, Enum o) => (o -> ParserWithFont (Call o)) -> ParserWithFont (Call o) a parserGenerator = choice $ parserGenerator <$> [minBound .. maxBound] argument :: ParserWithFont Argument argument = Raw <$> takeAll regular <* blank data PageContents = PageContents { chunks :: [ByteString] } type ParserWithFont = ReaderT CMappers (Parser CMap) pageContents :: CMappers -> ByteString -> Either String PageContents pageContents font input = evalParser (runReaderT (PageContents <$> page) font) emptyCMap input page :: ParserWithFont [ByteString] page = graphicState <|> text graphicState :: ParserWithFont [ByteString] graphicState = string "q" *> blank *> insideQ <* string "Q" where insideQ = join <$> (command <|> page `sepBy` blank ) command = a stateOperator *> return [] text :: ParserWithFont [ByteString] text = string "BT" *> blank *> commands <* blank <* string "ET" where commands = join <$> (a textOperator >>= runOperator) `sepBy` blank runOperator :: Call TextOperator -> ParserWithFont [ByteString] runOperator (Tf, [Typed (NameObject fontName), _]) = asks (! fontName) >>= put >> return [] runOperator (Tstar, []) = return ["\n"] runOperator (TJ, [Typed (Array arrayObject)]) = replicate 1 <$> foldM appendText "" arrayObject where appendText bs (StringObject outputString) = mappend bs <$> decodeString outputString appendText bs _ = return bs runOperator (Tj, [Typed (StringObject outputString)]) = replicate 1 <$> decodeString outputString runOperator (Quote, [Typed (StringObject outputString)]) = (\bs -> ["\n", bs]) <$> decodeString outputString runOperator (DQuote, [Typed (StringObject outputString)]) = (\bs -> ["\n", bs]) <$> decodeString outputString runOperator _ = return [] decodeString :: StringObject -> ParserWithFont ByteString decodeString (Hexadecimal h) = decodeString (Literal (utf8Decode h)) decodeString (Literal s) = undefined where bytes = UTF8.fromString s