{-# LANGUAGE OverloadedStrings #-} module PDF.Text {-( CMap , CMappers , PageContents(..) , cMap , emptyCMap , pageContents )-} where import Control.Applicative ((<|>), many) import Control.Monad (foldM, join) import Control.Monad.Reader (ReaderT, runReaderT, asks) import Control.Monad.State (put) import Data.Attoparsec.ByteString.Char8 (choice, count, parseOnly, sepBy) import qualified Data.Attoparsec.ByteString.Char8 as Atto (Parser) import Data.ByteString (ByteString, pack) import qualified Data.ByteString as BS (head) import Data.ByteString.Char8 as Char8 (pack) import qualified Data.ByteString.UTF8 as UTF8 (fromString) import Data.Map (Map, (!)) import qualified Data.Map as Map (empty, fromList) import Data.Text (Text, snoc) import qualified Data.Text as Text (init, last, null, unpack) import Data.Text.Encoding (decodeUtf16BE) import PDF.Object ( DirectObject(..), Name, StringObject(..) , array, blank, directObject, integer, line, name, regular, stringObject ) import qualified PDF.EOL as EOL (charset, parser) import PDF.Parser (Parser, evalParser, string, takeAll) type CMappers = Map Name CMap type CMap = Map Int Text emptyCMap :: CMap emptyCMap = Map.empty cMap :: ByteString -> Either String CMap cMap = parseOnly $ mconcat <$> many (cMapRange <|> cMapChar <|> ignoredLine) where ignoredLine = takeAll (not . (`elem` EOL.charset)) *> EOL.parser *> return Map.empty cMapRange :: Atto.Parser CMap cMapRange = do size <- integer <* line "beginbfrange" mconcat <$> count size (Map.fromList <$> rangeMapping) <* line "endbfrange" where rangeMapping = (,,) <$> (stringObject <* blank) <*> (stringObject <* blank) <*> directObject <* EOL.parser >>= mapFromTo cMapChar :: Atto.Parser CMap cMapChar = do size <- integer <* line "beginbfchar" Map.fromList <$> count size charMapping <* line "endbfchar" where charMapping = (,) <$> (stringObject <* blank) <*> (stringObject <* blank) <* EOL.parser >>= pairMapping mapFromTo :: (StringObject, StringObject, DirectObject) -> Atto.Parser [(Int, Text)] mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) = let dstString = parseText dstFrom in return $ zip [hexString from .. hexString to] (textsFrom dstString) where textsFrom t | Text.null t = [t] | otherwise = (Text.init t `snoc`) <$> [Text.last t ..] mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) = zip [hexString from .. hexString to] <$> (mapM dstString dstPoints) where dstString (StringObject (Hexadecimal dstPoint)) = return $ parseText dstPoint dstString _ = fail "Invalid for a replacement string" mapFromTo _ = fail "invalid range mapping found" pairMapping :: (StringObject, StringObject) -> Atto.Parser (Int, Text) pairMapping (Hexadecimal from, Hexadecimal to) = return (hexString from, parseText to) pairMapping _ = fail "invalid pair mapping found" hexString :: (Num a, Read a) => String -> a hexString s = read $ "0x" ++ s pairDigits :: String -> [String] pairDigits "" = [] pairDigits [c] = [[c]] pairDigits (firstChar:secondChar:end) = (firstChar:[secondChar]):pairDigits end parseText :: String -> Text parseText = decodeUtf16BE . pack . fmap hexString . pairDigits data StateOperator = Cm | W | J | J_ | M | D | Ri | I | Gs -- general graphic state deriving (Bounded, Enum) data TextOperator = Td | TD | Tm | Tstar -- text positioning | TJ | Tj | Quote | DQuote -- text showing | Tc | Tw | Tz | TL | Tf | Tr | Ts -- text state deriving (Bounded, Enum) data Argument = Raw ByteString | Typed DirectObject type Call a = (a, [Argument]) stateOperator :: StateOperator -> ParserWithFont (Call StateOperator) stateOperator Cm = (,) Cm <$> count 6 argument <* string "cm" stateOperator W = (,) W <$> count 1 argument <* string "w" stateOperator J = (,) J <$> count 1 argument <* string "J" stateOperator J_ = (,) J_ <$> count 1 argument <* string "j" stateOperator M = (,) M <$> count 1 argument <* string "M" stateOperator D = (,) D <$> count 2 argument <* string "d" stateOperator Ri = (,) Ri <$> count 1 argument <* string "ri" stateOperator I = (,) I <$> count 1 argument <* string "i" stateOperator Gs = (,) Gs <$> count 1 argument <* string "gs" textOperator :: TextOperator -> ParserWithFont (Call TextOperator) textOperator Td = (,) Td <$> count 2 argument <* string "Td" textOperator TD = (,) TD <$> count 2 argument <* string "TD" textOperator Tm = (,) Tm <$> count 6 argument <* string "Tm" textOperator Tstar = (,) Td <$> return [] <* string "T*" textOperator TJ = (,) TJ <$> sequence [Typed . Array <$> array] <* string "TJ" textOperator Tj = (,) Tj <$> sequence [Typed . StringObject <$> stringObject] <* string "Tj" textOperator Quote = (,) Quote <$> count 1 argument <* string "'" textOperator DQuote = (,) DQuote <$> count 1 argument <* string "\"" textOperator Tc = (,) Tc <$> count 1 argument <* string "Tc" textOperator Tw = (,) Tw <$> count 1 argument <* string "Tw" textOperator Tz = (,) Tz <$> count 1 argument <* string "Tz" textOperator TL = (,) TL <$> count 1 argument <* string "TL" textOperator Tf = (,) Tf <$> sequence [Typed . NameObject <$> name, argument] <* string "Tf" textOperator Tr = (,) Tr <$> count 1 argument <* string "Tr" textOperator Ts = (,) Ts <$> count 1 argument <* string "Ts" a :: (Bounded o, Enum o) => (o -> ParserWithFont (Call o)) -> ParserWithFont (Call o) a parserGenerator = choice $ parserGenerator <$> [minBound .. maxBound] argument :: ParserWithFont Argument argument = Raw <$> takeAll regular <* blank data PageContents = PageContents { chunks :: [ByteString] } type ParserWithFont = ReaderT CMappers (Parser CMap) pageContents :: CMappers -> ByteString -> Either String PageContents pageContents font input = evalParser (runReaderT (PageContents <$> page) font) emptyCMap input page :: ParserWithFont [ByteString] page = graphicState <|> text graphicState :: ParserWithFont [ByteString] graphicState = string "q" *> blank *> insideQ <* string "Q" where insideQ = join <$> (command <|> page `sepBy` blank ) command = a stateOperator *> return [] text :: ParserWithFont [ByteString] text = string "BT" *> blank *> commands <* blank <* string "ET" where commands = join <$> (a textOperator >>= runOperator) `sepBy` blank runOperator :: Call TextOperator -> ParserWithFont [ByteString] runOperator (Tf, [Typed (NameObject fontName), _]) = asks (! fontName) >>= put >> return [] runOperator (Tstar, []) = return ["\n"] runOperator (TJ, [Typed (Array arrayObject)]) = replicate 1 <$> foldM appendText "" arrayObject where appendText bs (StringObject outputString) = mappend bs <$> decodeString outputString appendText bs _ = return bs runOperator (Tj, [Typed (StringObject outputString)]) = replicate 1 <$> decodeString outputString runOperator (Quote, [Typed (StringObject outputString)]) = (\bs -> ["\n", bs]) <$> decodeString outputString runOperator (DQuote, [Typed (StringObject outputString)]) = (\bs -> ["\n", bs]) <$> decodeString outputString runOperator _ = return [] decodeString :: StringObject -> ParserWithFont ByteString decodeString (Hexadecimal h) = decodeString (Literal (Text.unpack $ parseText h)) decodeString (Literal s) = asks where bytes = Char8.pack s