Hufflepdf/src/PDF/Text.hs

172 lines
6.7 KiB
Haskell

{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE OverloadedStrings #-}
module PDF.Text {-(
PageContents(..)
, pageContents
)-} where
import Control.Applicative ((<|>))
import Control.Monad (foldM)
import Control.Monad.Reader (ReaderT, runReaderT, asks)
import Control.Monad.State (get, put)
import Data.Attoparsec.ByteString.Char8 (choice, count, sepBy)
import Data.ByteString (ByteString)
import qualified Data.ByteString as BS (drop, null, take)
import Data.ByteString.Char8.Util (decodeHex)
import Data.List (find)
import Data.Map ((!))
import qualified Data.Map as Map (lookup, toList)
import PDF.CMap (CMappers, CMap, CRange(..), emptyCMap)
import PDF.Object (
DirectObject(..), StringObject(..)
, array, blank, name, parseBytes, regular, stringObject
)
import PDF.Parser (MonadParser, (<?>), Parser, evalParser, string, takeAll)
data StateOperator =
Cm | W | J | J_ | M | D | Ri | I | Gs -- general graphic state
deriving (Bounded, Enum, Show)
data TextOperator =
Td | TD | Tm | Tstar -- text positioning
| TJ | Tj | Quote | DQuote -- text showing
| Tc | Tw | Tz | TL | Tf | Tr | Ts -- text state
deriving (Bounded, Enum, Show)
data Argument = Raw ByteString | Typed DirectObject deriving Show
type Call a = (a, [Argument])
stateOperator :: MonadParser m => StateOperator -> m (Call StateOperator)
stateOperator Cm = (,) Cm <$> count 6 argument <* string "cm"
stateOperator W = (,) W <$> count 1 argument <* string "w"
stateOperator J = (,) J <$> count 1 argument <* string "J"
stateOperator J_ = (,) J_ <$> count 1 argument <* string "j"
stateOperator M = (,) M <$> count 1 argument <* string "M"
stateOperator D = (,) D <$> count 2 argument <* string "d"
stateOperator Ri = (,) Ri <$> count 1 argument <* string "ri"
stateOperator I = (,) I <$> count 1 argument <* string "i"
stateOperator Gs = (,) Gs <$> count 1 nameArg <* string "gs"
textOperator :: MonadParser m => TextOperator -> m (Call TextOperator)
textOperator Td = (,) Td <$> count 2 argument <* string "Td"
textOperator TD = (,) TD <$> count 2 argument <* string "TD"
textOperator Tm = (,) Tm <$> count 6 argument <* string "Tm"
textOperator Tstar = (,) Td <$> return [] <* string "T*"
textOperator TJ = (,) TJ <$> count 1 arrayArg <* string "TJ"
textOperator Tj = (,) Tj <$> count 1 stringArg <* string "Tj"
textOperator Quote = (,) Quote <$> count 1 stringArg <* string "'"
textOperator DQuote = (,) DQuote <$> count 1 stringArg <* string "\""
textOperator Tc = (,) Tc <$> count 1 argument <* string "Tc"
textOperator Tw = (,) Tw <$> count 1 argument <* string "Tw"
textOperator Tz = (,) Tz <$> count 1 argument <* string "Tz"
textOperator TL = (,) TL <$> count 1 argument <* string "TL"
textOperator Tf = (,) Tf <$> sequence [nameArg, argument] <* string "Tf"
textOperator Tr = (,) Tr <$> count 1 argument <* string "Tr"
textOperator Ts = (,) Ts <$> count 1 argument <* string "Ts"
a :: (Bounded o, Enum o, MonadParser m) => (o -> m (Call o)) -> m (Call o)
a parserGenerator = choice $ parserGenerator <$> [minBound .. maxBound]
argument :: MonadParser m => m Argument
argument = Raw <$> takeAll regular <* blank
arrayArg :: MonadParser m => m Argument
arrayArg = Typed . Array <$> array <* blank
nameArg :: MonadParser m => m Argument
nameArg = Typed . NameObject <$> name <* blank
stringArg :: MonadParser m => m Argument
stringArg = Typed . StringObject <$> stringObject <* blank
data PageContents = PageContents {
chunks :: [ByteString]
}
type ParserWithFont = ReaderT CMappers (Parser CMap)
pageContents :: CMappers -> ByteString -> Either String PageContents
pageContents font input =
evalParser (runReaderT (PageContents <$> page) font) emptyCMap input
page :: ParserWithFont [ByteString]
page = graphicState <|> text <?> "Text page contents"
graphicState :: ParserWithFont [ByteString]
graphicState =
(string "q" *> blank *> insideQ <* blank <* string "Q") <?> "Graphic state"
where
insideQ = concat <$> ((command <|> page) `sepBy` blank )
command = a stateOperator *> return []
text :: ParserWithFont [ByteString]
text =
string "BT" *> blank *> commands <* blank <* string "ET" <?> "Text operators"
where
commands = concat <$> (a textOperator >>= runOperator) `sepBy` blank
runOperator :: Call TextOperator -> ParserWithFont [ByteString]
runOperator (Tf, [Typed (NameObject fontName), _]) =
asks (! fontName) >>= put >> return []
runOperator (Tstar, []) = return ["\n"]
runOperator (TJ, [Typed (Array arrayObject)]) =
replicate 1 <$> foldM appendText "" arrayObject
where
appendText bs (StringObject outputString) =
mappend bs <$> decodeString outputString
appendText bs _ = return bs
runOperator (Tj, [Typed (StringObject outputString)]) =
replicate 1 <$> decodeString outputString
runOperator (Quote, [Typed (StringObject outputString)]) =
(\bs -> ["\n", bs]) <$> decodeString outputString
runOperator (DQuote, [Typed (StringObject outputString)]) =
(\bs -> ["\n", bs]) <$> decodeString outputString
runOperator _ = return []
decodeString :: StringObject -> ParserWithFont ByteString
decodeString (Hexadecimal h) = decodeString (Literal (decodeHex h))
decodeString (Literal litString) = do
cRangesBySize <- Map.toList <$> get
f cRangesBySize litString
where
f :: [(Int, [CRange])] -> ByteString -> ParserWithFont ByteString
f cRangesBySize input
| BS.null input = return ""
| otherwise = do
(output, newInput) <- g cRangesBySize input
mappend output <$> f cRangesBySize newInput
g :: [(Int, [CRange])] -> ByteString -> ParserWithFont (ByteString, ByteString)
g [] _ = fail "No matching code found in font"
g ((size, cRanges):others) s =
let prefix = BS.take size s in
case h prefix cRanges of
Nothing -> g others s
Just outputSequence -> return (outputSequence, BS.drop size s)
h :: ByteString -> [CRange] -> Maybe ByteString
h prefix [] = Nothing
h prefix ((CRange {mapping}):cRanges) =
case Map.lookup prefix mapping of
Nothing -> h prefix cRanges
outputSequence -> outputSequence
{-
get >>= convertBytes litString
where
convertBytes :: String -> CMap -> ParserWithFont ByteString
convertBytes [] _ = return ""
convertBytes (c:cs) someCMap = do
convertBytesAux (fromEnum c) 1 cs someCMap
convertBytesAux :: Int -> Int -> String -> CMap -> ParserWithFont ByteString
convertBytesAux code size s someCMap
| size > 4 = fail "Could not match any input code smaller than an int"
| otherwise =
case (Map.lookup code someCMap, s) of
(Nothing, (c:cs)) -> convertBytesAux (code * 256 + fromEnum c) (size + 1) cs someCMap
(Nothing, []) -> fail "No character left to read but no code recognized"
(Just outputText, _) -> mappend outputText <$> convertBytes s someCMap
-}