diff --git a/Hufflepdf.cabal b/Hufflepdf.cabal index c5ac77e..5ff0159 100644 --- a/Hufflepdf.cabal +++ b/Hufflepdf.cabal @@ -18,14 +18,19 @@ cabal-version: >=1.10 library exposed-modules: PDF , PDF.CMap + , PDF.Content , PDF.EOL , PDF.Object , PDF.Output - , PDF.Pages , PDF.Parser - , PDF.Text , PDF.Update other-modules: Data.ByteString.Char8.Util + , PDF.Content.Operator + , PDF.Content.Operator.Color + , PDF.Content.Operator.Common + , PDF.Content.Operator.GraphicState + , PDF.Content.Operator.Path + , PDF.Content.Operator.Text , PDF.Encoding , PDF.Encoding.MacRoman , PDF.Body diff --git a/src/PDF/Content.hs b/src/PDF/Content.hs new file mode 100644 index 0000000..03f8c33 --- /dev/null +++ b/src/PDF/Content.hs @@ -0,0 +1,57 @@ +{-# LANGUAGE OverloadedStrings #-} +module PDF.Content ( + Content(..) + , ContentUnit(..) + , GraphicContextUnit(..) + , TextContext + , content + , parse + ) where + +import Control.Applicative ((<|>)) +import Control.Monad.State (evalStateT, modify) +import Data.Attoparsec.ByteString.Char8 (sepBy) +import Data.ByteString (ByteString) +import PDF.Object (blank, directObject) +import PDF.Parser (MonadParser, (), evalParser, string) +import PDF.Content.Operator (Instruction, operator) + +data GraphicContextUnit = + GraphicInstruction Instruction + | ContentUnit ContentUnit + deriving Show +type TextContext = [Instruction] +data ContentUnit = + GraphicContext [GraphicContextUnit] + | TextContext TextContext + deriving Show +newtype Content = Content [ContentUnit] deriving Show + +content :: MonadParser m => m Content +content = Content <$> contentUnit `sepBy` blank "content" + +contentUnit :: MonadParser m => m ContentUnit +contentUnit = + (GraphicContext <$> graphicContext) + <|> (TextContext <$> textContext) + where + graphicContext = + string "q" *> blank *> graphicContextUnit `sepBy` blank <* blank <* string "Q" + +graphicContextUnit :: MonadParser m => m GraphicContextUnit +graphicContextUnit = + (GraphicInstruction <$> instruction) + <|> (ContentUnit <$> contentUnit) + +instruction :: MonadParser m => m Instruction +instruction = evalStateT stackParser [] + where + stackParser = ((directObject <* blank) >>= push) <|> operator + push arg = modify (arg:) *> stackParser + +parse :: ByteString -> Either String Content +parse = evalParser content () + +textContext :: MonadParser m => m TextContext +textContext = + string "BT" *> blank *> instruction `sepBy` blank <* blank <* string "ET" diff --git a/src/PDF/Content/Operator.hs b/src/PDF/Content/Operator.hs new file mode 100644 index 0000000..c5cc524 --- /dev/null +++ b/src/PDF/Content/Operator.hs @@ -0,0 +1,65 @@ +{-# LANGUAGE ConstraintKinds #-} +{-# LANGUAGE FlexibleContexts #-} +module PDF.Content.Operator ( + Instruction + , Operator + , operator + ) where + +import Control.Monad.Fail (MonadFail(..)) +import Control.Monad.State (MonadState(..)) +import Data.ByteString.Char8 (ByteString, pack, unpack) +import Data.Char (toLower) +import Data.Map (Map, (!?)) +import qualified Data.Map as Map (fromList) +import qualified PDF.Content.Operator.Color as Color (Operator, signature) +import PDF.Content.Operator.Common (Signature) +import qualified PDF.Content.Operator.GraphicState as GraphicState (Operator, signature) +import qualified PDF.Content.Operator.Path as Path (Operator, signature) +import qualified PDF.Content.Operator.Text as Text (Operator, signature) +import PDF.Object (DirectObject, blank, regular) +import PDF.Parser (MonadParser, takeAll1) +import Prelude hiding (fail) +import Text.Printf (printf) + +data Operator = + GraphicState GraphicState.Operator + | Path Path.Operator + | Color Color.Operator + | Text Text.Operator + deriving Show + +type Instruction = (Operator, [DirectObject]) + +operatorsTable :: Map ByteString (Signature Operator) +operatorsTable = Map.fromList ( + (prepare GraphicState <$> GraphicState.signature) + ++ (prepare Path <$> Path.signature) + ++ (prepare Color <$> Color.signature) + ++ (prepare Text <$> Text.signature) + ) + where + prepare constructor (op, sig) = (code op, (constructor op, sig)) + +code :: Show a => a -> ByteString +code = pack . expand . show + where + expand "" = "" + expand (c:'_':s) = toLower c : expand s + expand ('s':'t':'a':'r':s) = '*' : expand s + expand ('Q':'u':'o':'t':'e':s) = '\'' : expand s + expand ('D':'Q':'u':'o':'t':'e':s) = '"' : expand s + expand (c:s) = c : expand s + +type StackParser m = (MonadState [DirectObject] m, MonadParser m) + +operator :: StackParser m => m Instruction +operator = do + chunk <- takeAll1 regular <* blank + args <- reverse <$> get + case operatorsTable !? chunk of + Just (op, sig) + | sig args -> return (op, args) + | otherwise -> + get >>= fail . printf "Operator %s with stack %s" (show op) . show + _ -> fail ("Unknown chunk " ++ unpack chunk) diff --git a/src/PDF/Content/Operator/Color.hs b/src/PDF/Content/Operator/Color.hs new file mode 100644 index 0000000..7f86387 --- /dev/null +++ b/src/PDF/Content/Operator/Color.hs @@ -0,0 +1,27 @@ +module PDF.Content.Operator.Color ( + Operator + , signature + ) where + +import PDF.Content.Operator.Common (Signature) +import PDF.Object (DirectObject(..)) + +data Operator = + CS | C_s | SC | SCN | S_c | S_cn | G | G_ | RG | R_g | K | K_ + deriving (Bounded, Enum, Show) + +signature :: [Signature Operator] +signature = [ + (CS, \l -> case l of [NameObject _] -> True ; _ -> False) + , (C_s, \l -> case l of [NameObject _] -> True ; _ -> False) + , (SC, \_ -> True) + , (SCN, \_ -> True) + , (S_c, \_ -> True) + , (S_cn, \_ -> True) + , (G, \l -> case l of [_] -> True ; _ -> False) + , (G_, \l -> case l of [_] -> True ; _ -> False) + , (RG, \l -> case l of [_, _, _] -> True ; _ -> False) + , (R_g, \l -> case l of [_, _, _] -> True ; _ -> False) + , (K, \l -> case l of [_, _, _, _] -> True ; _ -> False) + , (K_, \l -> case l of [_, _, _, _] -> True ; _ -> False) + ] diff --git a/src/PDF/Content/Operator/Common.hs b/src/PDF/Content/Operator/Common.hs new file mode 100644 index 0000000..a8717e4 --- /dev/null +++ b/src/PDF/Content/Operator/Common.hs @@ -0,0 +1,7 @@ +module PDF.Content.Operator.Common ( + Signature + ) where + +import PDF.Object (DirectObject) + +type Signature a = (a, [DirectObject] -> Bool) diff --git a/src/PDF/Content/Operator/GraphicState.hs b/src/PDF/Content/Operator/GraphicState.hs new file mode 100644 index 0000000..c6ec5de --- /dev/null +++ b/src/PDF/Content/Operator/GraphicState.hs @@ -0,0 +1,24 @@ +module PDF.Content.Operator.GraphicState ( + Operator(..) + , signature + ) where + +import PDF.Content.Operator.Common (Signature) +import PDF.Object (DirectObject(..)) + +data Operator = + C_m | W_ | J | J_ | M | D_ | R_i | I_ | G_s -- general graphic state + deriving (Bounded, Enum, Show) + +signature :: [Signature Operator] +signature = [ + (C_m, \l -> case l of [_, _, _, _, _, _] -> True ; _ -> False) + , (W_, \l -> case l of [_] -> True ; _ -> False) + , (J, \l -> case l of [_] -> True ; _ -> False) + , (J_, \l -> case l of [_] -> True ; _ -> False) + , (M, \l -> case l of [_] -> True ; _ -> False) + , (D_, \l -> case l of [_, _] -> True ; _ -> False) + , (R_i, \l -> case l of [_] -> True ; _ -> False) + , (I_, \l -> case l of [_] -> True ; _ -> False) + , (G_s, \l -> case l of [NameObject _] -> True ; _ -> False) + ] diff --git a/src/PDF/Content/Operator/Path.hs b/src/PDF/Content/Operator/Path.hs new file mode 100644 index 0000000..c9c7921 --- /dev/null +++ b/src/PDF/Content/Operator/Path.hs @@ -0,0 +1,35 @@ +module PDF.Content.Operator.Path ( + Operator + , signature + ) where + +import PDF.Content.Operator.Common (Signature) + +data Operator = + M_ | L_ | C_ | V_ | Y_ | H_ | R_e -- path construction + | S | S_ | F_ | F | Fstar | B | Bstar | B_ | B_star | N_ -- path painting + | W | Wstar -- clipping path + deriving (Bounded, Enum, Show) + +signature :: [Signature Operator] +signature = [ + (M_, \l -> case l of [_, _] -> True ; _ -> False) + , (L_, \l -> case l of [_, _] -> True ; _ -> False) + , (C_, \l -> case l of [_, _, _, _, _, _] -> True ; _ -> False) + , (V_, \l -> case l of [_, _, _, _] -> True ; _ -> False) + , (Y_, \l -> case l of [_, _, _, _] -> True ; _ -> False) + , (H_, \l -> case l of [] -> True ; _ -> False) + , (R_e, \l -> case l of [_, _, _, _] -> True ; _ -> False) + , (S, \l -> case l of [] -> True ; _ -> False) + , (S_, \l -> case l of [] -> True ; _ -> False) + , (F_, \l -> case l of [] -> True ; _ -> False) + , (F, \l -> case l of [] -> True ; _ -> False) + , (Fstar, \l -> case l of [] -> True ; _ -> False) + , (B, \l -> case l of [] -> True ; _ -> False) + , (Bstar, \l -> case l of [] -> True ; _ -> False) + , (B_, \l -> case l of [] -> True ; _ -> False) + , (B_star, \l -> case l of [] -> True ; _ -> False) + , (N_, \l -> case l of [] -> True ; _ -> False) + , (W, \l -> case l of [] -> True ; _ -> False) + , (Wstar, \l -> case l of [] -> True ; _ -> False) + ] diff --git a/src/PDF/Content/Operator/Text.hs b/src/PDF/Content/Operator/Text.hs new file mode 100644 index 0000000..2e6800f --- /dev/null +++ b/src/PDF/Content/Operator/Text.hs @@ -0,0 +1,32 @@ +module PDF.Content.Operator.Text ( + Operator + , signature + ) where + +import PDF.Content.Operator.Common (Signature) +import PDF.Object (DirectObject(..)) + +data Operator = + Td | TD | Tm | Tstar -- text positioning + | TJ | Tj | Quote | DQuote -- text showing + | Tc | Tw | Tz | TL | Tf | Tr | Ts -- text state + deriving (Bounded, Enum, Show) + +signature :: [Signature Operator] +signature = [ + (Td, \l -> case l of [_, _] -> True ; _ -> False) + , (TD, \l -> case l of [_, _] -> True ; _ -> False) + , (Tm, \l -> case l of [_, _, _, _, _, _] -> True ; _ -> False) + , (Tstar, \l -> case l of [] -> True ; _ -> False) + , (TJ, \l -> case l of [Array _] -> True ; _ -> False) + , (Tj, \l -> case l of [StringObject _] -> True ; _ -> False) + , (Quote, \l -> case l of [StringObject _] -> True ; _ -> False) + , (DQuote, \l -> case l of [StringObject _] -> True ; _ -> False) + , (Tc, \l -> case l of [_] -> True ; _ -> False) + , (Tw, \l -> case l of [_] -> True ; _ -> False) + , (Tz, \l -> case l of [_] -> True ; _ -> False) + , (TL, \l -> case l of [_] -> True ; _ -> False) + , (Tf, \l -> case l of [NameObject _, _] -> True ; _ -> False) + , (Tr, \l -> case l of [_] -> True ; _ -> False) + , (Ts, \l -> case l of [_] -> True ; _ -> False) + ] diff --git a/src/PDF/Content/Text.hs b/src/PDF/Content/Text.hs new file mode 100644 index 0000000..e69de29