Hufflepdf/src/PDF/Text.hs

198 lines
6.6 KiB
Haskell

{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE FlexibleContexts #-}
module PDF.Text {-(
pageContents
)-} where
import Control.Applicative ((<|>))
import Control.Monad (foldM)
import Control.Monad.Reader (ReaderT, runReaderT, asks)
import Control.Monad.State (MonadState, evalStateT, get, modify, put)
import Data.Attoparsec.ByteString.Char8 (choice, sepBy)
import Data.ByteString (ByteString)
import qualified Data.ByteString as BS (drop, null, take)
import Data.ByteString.Char8 (pack, unpack)
import Data.Map ((!))
import qualified Data.Map as Map (lookup, toList)
import PDF.CMap (CMappers, CMap, CRange(..), emptyCMap)
import PDF.Object (
DirectObject(..), StringObject(..)
, array, blank, name, regular, stringObject, toByteString
)
import PDF.Parser (MonadParser, (<?>), Parser, evalParser, string, takeAll1)
data StateOperator =
Cm | W | J | J_ | M | D | Ri | I | Gs -- general graphic state
deriving (Bounded, Enum)
data TextOperator =
Td | TD | Tm | Tstar -- text positioning
| TJ | Tj | Quote | DQuote -- text showing
| Tc | Tw | Tz | TL | Tf | Tr | Ts -- text state
deriving (Bounded, Enum)
data Argument = Raw ByteString | Typed DirectObject deriving Show
type Call a = (a, [Argument])
instance Show StateOperator where
show Cm = "cm"
show W = "w"
show J = "J"
show J_ = "j"
show M = "M"
show D = "d"
show Ri = "ri"
show I = "i"
show Gs = "gs"
instance Show TextOperator where
show Td = "Td"
show TD = "TD"
show Tm = "Tm"
show Tstar = "T*"
show TJ = "TJ"
show Tj = "Tj"
show Quote = "'"
show DQuote = "\""
show Tc = "Tc"
show Tw = "Tw"
show Tz = "Tz"
show TL = "TL"
show Tf = "Tf"
show Tr = "Tr"
show Ts = "Ts"
stateOperator :: TypeChecker StateOperator
stateOperator (Cm, [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _]) = True
stateOperator (W, [Raw _]) = True
stateOperator (J, [Raw _]) = True
stateOperator (J_, [Raw _]) = True
stateOperator (M, [Raw _]) = True
stateOperator (D, [Raw _, Raw _]) = True
stateOperator (Ri, [Raw _]) = True
stateOperator (I, [Raw _]) = True
stateOperator (Gs, [Typed (NameObject _)]) = True
stateOperator _ = False
textOperator :: TypeChecker TextOperator
textOperator (Td, [Raw _, Raw _]) = True
textOperator (TD, [Raw _, Raw _]) = True
textOperator (Tm, [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _]) = True
textOperator (Tstar, []) = True
textOperator (TJ, [Typed (Array _)]) = True
textOperator (Tj, [Typed (StringObject _)]) = True
textOperator (Quote, [Typed (StringObject _)]) = True
textOperator (DQuote, [Typed (StringObject _)]) = True
textOperator (Tc, [Raw _]) = True
textOperator (Tw, [Raw _]) = True
textOperator (Tz, [Raw _]) = True
textOperator (TL, [Raw _]) = True
textOperator (Tf, [Typed (NameObject _), Raw _]) = True
textOperator (Tr, [Raw _]) = True
textOperator (Ts, [Raw _]) = True
textOperator _ = False
type ArgumentStackParser m = (MonadState [Argument] m, MonadParser m)
type Operator a = (Bounded a, Enum a, Show a)
type TypeChecker a = Call a -> Bool
parseShowable :: (Show a, MonadParser m) => a -> m a
parseShowable textOp = string (pack $ show textOp) *> return textOp
callChunk :: (MonadParser m, Operator a) => m (Either a Argument)
callChunk =
Left <$> choice (parseShowable <$> [minBound .. maxBound])
<|> Right <$> choice [stringArg, nameArg, arrayArg, argument]
<?> "call chunk"
stackParser :: (ArgumentStackParser m, Operator a) => TypeChecker a -> m (Call a)
stackParser typeChecker = either popCall push =<< callChunk
where
push arg = modify (arg:) >> stackParser typeChecker
popCall operator = do
call <- (,) operator . reverse <$> get
if typeChecker call then return call else fail (show call)
a :: (Operator a, MonadParser m) => TypeChecker a -> m (Call a)
a typeChecker = evalStateT (stackParser typeChecker) []
argument :: MonadParser m => m Argument
argument = Raw <$> takeAll1 regular <* blank
arrayArg :: MonadParser m => m Argument
arrayArg = Typed . Array <$> array <* blank
nameArg :: MonadParser m => m Argument
nameArg = Typed . NameObject <$> name <* blank
stringArg :: MonadParser m => m Argument
stringArg = Typed . StringObject <$> stringObject <* blank
type ParserWithFont = ReaderT CMappers (Parser CMap)
pageContents :: CMappers -> ByteString -> Either String [ByteString]
pageContents font input =
evalParser (runReaderT page font) emptyCMap input
page :: ParserWithFont [ByteString]
page = graphicState <|> text <?> "Text page contents"
graphicState :: ParserWithFont [ByteString]
graphicState =
(string "q" *> blank *> insideQ <* blank <* string "Q") <?> "Graphic state"
where
insideQ = concat <$> ((command <|> page) `sepBy` blank )
command = a stateOperator *> return []
text :: ParserWithFont [ByteString]
text =
string "BT" *> blank *> commands <* blank <* string "ET" <?> "Text operators"
where
commands = concat <$> (a textOperator >>= runOperator) `sepBy` blank
runOperator :: Call TextOperator -> ParserWithFont [ByteString]
runOperator (Tf, [Typed (NameObject fontName), _]) =
asks (! fontName) >>= put >> return []
runOperator (Tstar, []) = return ["\n"]
runOperator (TJ, [Typed (Array arrayObject)]) =
replicate 1 <$> foldM appendText "" arrayObject
where
appendText bs (StringObject outputString) =
mappend bs <$> decodeString outputString
appendText bs _ = return bs
runOperator (Tj, [Typed (StringObject outputString)]) =
replicate 1 <$> decodeString outputString
runOperator (Quote, [Typed (StringObject outputString)]) =
(\bs -> ["\n", bs]) <$> decodeString outputString
runOperator (DQuote, [Typed (StringObject outputString)]) =
(\bs -> ["\n", bs]) <$> decodeString outputString
runOperator _ = return []
decodeString :: StringObject -> ParserWithFont ByteString
decodeString = decode . toByteString
where
decode input
| BS.null input = return ""
| otherwise = do
(output, remainingInput) <- trySizes input =<< Map.toList <$> get
mappend output <$> decode remainingInput
trySizes :: ByteString -> [(Int, [CRange])] -> ParserWithFont (ByteString, ByteString)
trySizes s [] = fail $ "No matching code found in font for " ++ unpack s
trySizes s ((size, cRanges):others) =
let prefix = BS.take size s in
case tryRanges prefix cRanges of
Nothing -> trySizes s others
Just outputSequence -> return (outputSequence, BS.drop size s)
tryRanges :: ByteString -> [CRange] -> Maybe ByteString
tryRanges _ [] = Nothing
tryRanges prefix ((CRange {mapping}):cRanges) =
case Map.lookup prefix mapping of
Nothing -> tryRanges prefix cRanges
outputSequence -> outputSequence