Hufflepdf/src/PDF/Text.hs

207 lines
7.3 KiB
Haskell

{-# LANGUAGE OverloadedStrings #-}
module PDF.Text {-(
CMap
, CMappers
, PageContents(..)
, cMap
, emptyCMap
, pageContents
)-} where
import Control.Applicative ((<|>), many)
import Control.Monad (foldM, join)
import Control.Monad.Reader (ReaderT, runReaderT, asks)
import Control.Monad.State (put)
import Data.Attoparsec.ByteString.Char8 (choice, count, parseOnly, sepBy)
import qualified Data.Attoparsec.ByteString.Char8 as Atto (Parser)
import Data.ByteString (ByteString, pack)
import qualified Data.ByteString.UTF8 as UTF8 (fromString, toString)
import Data.Map (Map, (!))
import qualified Data.Map as Map (empty, fromList)
import PDF.Object (
DirectObject(..), Name, StringObject(..)
, array, blank, directObject, integer, line, name, regular, stringObject
)
import qualified PDF.EOL as EOL (charset, parser)
import PDF.Parser (Parser, evalParser, string, takeAll)
type CMappers = Map Name CMap
type CMap = Map Int String
emptyCMap :: CMap
emptyCMap = Map.empty
cMap :: ByteString -> Either String CMap
cMap = parseOnly $ mconcat <$> many (cMapRange <|> cMapChar <|> ignoredLine)
where
ignoredLine =
takeAll (not . (`elem` EOL.charset)) *> EOL.parser *> return Map.empty
cMapRange :: Atto.Parser CMap
cMapRange = do
size <- integer <* line "beginbfrange"
mconcat <$> count size (Map.fromList <$> rangeMapping) <* line "endbfrange"
where
rangeMapping = (,,)
<$> (stringObject <* blank)
<*> (stringObject <* blank)
<*> directObject <* EOL.parser
>>= mapFromTo
cMapChar :: Atto.Parser CMap
cMapChar = do
size <- integer <* line "beginbfchar"
Map.fromList <$> count size charMapping <* line "endbfchar"
where
charMapping =
(,) <$> (stringObject <* blank) <*> (stringObject <* blank) <* EOL.parser
>>= pairMapping
mapFromTo :: (StringObject, StringObject, DirectObject) -> Atto.Parser [(Int, String)]
mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) =
let dstString = utf8Decode dstFrom in
return $ zip [hexString from .. hexString to] (stringsFrom dstString)
where
stringsFrom [] = [[]]
stringsFrom [x] = (:[]) <$> [x..]
stringsFrom (x:xs) = (x:) <$> stringsFrom xs
mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) =
zip [hexString from .. hexString to] <$> (mapM dstString dstPoints)
where
dstString (StringObject (Hexadecimal dstPoint)) = return $ utf8Decode dstPoint
dstString _ = fail "Invalid for a replacement string"
mapFromTo _ = fail "invalid range mapping found"
pairMapping :: (StringObject, StringObject) -> Atto.Parser (Int, String)
pairMapping (Hexadecimal from, Hexadecimal to) =
return (hexString from, utf8Decode to)
pairMapping _ = fail "invalid pair mapping found"
hexString :: (Num a, Read a) => String -> a
hexString s = read $ "0x" ++ s
pairDigits :: String -> [String]
pairDigits "" = []
pairDigits [c] = [[c]]
pairDigits (a:b:end) = (a:[b]):pairDigits end
toBytes :: String -> ByteString
toBytes = pack . fmap hexString . pairDigits
utf8Decode :: String -> String
utf8Decode = UTF8.toString . pack . toBaseWord8 [] . (hexString :: String -> Integer)
where
toBaseWord8 digits n
| n < 0xff = (fromIntegral n):digits
| otherwise =
let newDigits = (fromIntegral (n `mod` 0x100)):digits in
case n `div` 0x100 of
0 -> newDigits
k -> toBaseWord8 newDigits k
data StateOperator =
Cm | W | J | J_ | M | D | Ri | I | Gs -- general graphic state
deriving (Bounded, Enum)
data TextOperator =
Td | TD | Tm | Tstar -- text positioning
| TJ | Tj | Quote | DQuote -- text showing
| Tc | Tw | Tz | TL | Tf | Tr | Ts -- text state
deriving (Bounded, Enum)
data Argument = Raw ByteString | Typed DirectObject
type Call a = (a, [Argument])
stateOperator :: StateOperator -> ParserWithFont (Call StateOperator)
stateOperator Cm = (,) Cm <$> count 6 argument <* string "cm"
stateOperator W = (,) W <$> count 1 argument <* string "w"
stateOperator J = (,) J <$> count 1 argument <* string "J"
stateOperator J_ = (,) J_ <$> count 1 argument <* string "j"
stateOperator M = (,) M <$> count 1 argument <* string "M"
stateOperator D = (,) D <$> count 2 argument <* string "d"
stateOperator Ri = (,) Ri <$> count 1 argument <* string "ri"
stateOperator I = (,) I <$> count 1 argument <* string "i"
stateOperator Gs = (,) Gs <$> count 1 argument <* string "gs"
textOperator :: TextOperator -> ParserWithFont (Call TextOperator)
textOperator Td = (,) Td <$> count 2 argument <* string "Td"
textOperator TD = (,) TD <$> count 2 argument <* string "TD"
textOperator Tm = (,) Tm <$> count 6 argument <* string "Tm"
textOperator Tstar = (,) Td <$> return [] <* string "T*"
textOperator TJ =
(,) TJ <$> sequence [Typed . Array <$> array] <* string "TJ"
textOperator Tj =
(,) Tj <$> sequence [Typed . StringObject <$> stringObject] <* string "Tj"
textOperator Quote = (,) Quote <$> count 1 argument <* string "'"
textOperator DQuote = (,) DQuote <$> count 1 argument <* string "\""
textOperator Tc = (,) Tc <$> count 1 argument <* string "Tc"
textOperator Tw = (,) Tw <$> count 1 argument <* string "Tw"
textOperator Tz = (,) Tz <$> count 1 argument <* string "Tz"
textOperator TL = (,) TL <$> count 1 argument <* string "TL"
textOperator Tf = (,) Tf <$> sequence [Typed . NameObject <$> name, argument] <* string "Tf"
textOperator Tr = (,) Tr <$> count 1 argument <* string "Tr"
textOperator Ts = (,) Ts <$> count 1 argument <* string "Ts"
a :: (Bounded o, Enum o) => (o -> ParserWithFont (Call o)) -> ParserWithFont (Call o)
a parserGenerator = choice $ parserGenerator <$> [minBound .. maxBound]
argument :: ParserWithFont Argument
argument = Raw <$> takeAll regular <* blank
data PageContents = PageContents {
chunks :: [ByteString]
}
type ParserWithFont = ReaderT CMappers (Parser CMap)
pageContents :: CMappers -> ByteString -> Either String PageContents
pageContents font input =
evalParser (runReaderT (PageContents <$> page) font) emptyCMap input
page :: ParserWithFont [ByteString]
page = graphicState <|> text
graphicState :: ParserWithFont [ByteString]
graphicState =
string "q" *> blank *> insideQ <* string "Q"
where
insideQ = join <$> (command <|> page `sepBy` blank )
command = a stateOperator *> return []
text :: ParserWithFont [ByteString]
text =
string "BT" *> blank *> commands <* blank <* string "ET"
where
commands = join <$> (a textOperator >>= runOperator) `sepBy` blank
runOperator :: Call TextOperator -> ParserWithFont [ByteString]
runOperator (Tf, [Typed (NameObject fontName), _]) =
asks (! fontName) >>= put >> return []
runOperator (Tstar, []) = return ["\n"]
runOperator (TJ, [Typed (Array arrayObject)]) =
replicate 1 <$> foldM appendText "" arrayObject
where
appendText bs (StringObject outputString) =
mappend bs <$> decodeString outputString
appendText bs _ = return bs
runOperator (Tj, [Typed (StringObject outputString)]) =
replicate 1 <$> decodeString outputString
runOperator (Quote, [Typed (StringObject outputString)]) =
(\bs -> ["\n", bs]) <$> decodeString outputString
runOperator (DQuote, [Typed (StringObject outputString)]) =
(\bs -> ["\n", bs]) <$> decodeString outputString
runOperator _ = return []
decodeString :: StringObject -> ParserWithFont ByteString
decodeString (Hexadecimal h) = decodeString (Literal (utf8Decode h))
decodeString (Literal s) =
undefined
where
bytes = UTF8.fromString s