WIP: Clean code parsing «pages» (now Content), separated from text rendering (will be reimplemented as an upper layer, also providing modification as stream filters) — Page is also forgotten for now, will need a big improvement in Object navigation
This commit is contained in:
parent
325250383a
commit
20466c4f13
9 changed files with 254 additions and 2 deletions
|
@ -18,14 +18,19 @@ cabal-version: >=1.10
|
|||
library
|
||||
exposed-modules: PDF
|
||||
, PDF.CMap
|
||||
, PDF.Content
|
||||
, PDF.EOL
|
||||
, PDF.Object
|
||||
, PDF.Output
|
||||
, PDF.Pages
|
||||
, PDF.Parser
|
||||
, PDF.Text
|
||||
, PDF.Update
|
||||
other-modules: Data.ByteString.Char8.Util
|
||||
, PDF.Content.Operator
|
||||
, PDF.Content.Operator.Color
|
||||
, PDF.Content.Operator.Common
|
||||
, PDF.Content.Operator.GraphicState
|
||||
, PDF.Content.Operator.Path
|
||||
, PDF.Content.Operator.Text
|
||||
, PDF.Encoding
|
||||
, PDF.Encoding.MacRoman
|
||||
, PDF.Body
|
||||
|
|
57
src/PDF/Content.hs
Normal file
57
src/PDF/Content.hs
Normal file
|
@ -0,0 +1,57 @@
|
|||
{-# LANGUAGE OverloadedStrings #-}
|
||||
module PDF.Content (
|
||||
Content(..)
|
||||
, ContentUnit(..)
|
||||
, GraphicContextUnit(..)
|
||||
, TextContext
|
||||
, content
|
||||
, parse
|
||||
) where
|
||||
|
||||
import Control.Applicative ((<|>))
|
||||
import Control.Monad.State (evalStateT, modify)
|
||||
import Data.Attoparsec.ByteString.Char8 (sepBy)
|
||||
import Data.ByteString (ByteString)
|
||||
import PDF.Object (blank, directObject)
|
||||
import PDF.Parser (MonadParser, (<?>), evalParser, string)
|
||||
import PDF.Content.Operator (Instruction, operator)
|
||||
|
||||
data GraphicContextUnit =
|
||||
GraphicInstruction Instruction
|
||||
| ContentUnit ContentUnit
|
||||
deriving Show
|
||||
type TextContext = [Instruction]
|
||||
data ContentUnit =
|
||||
GraphicContext [GraphicContextUnit]
|
||||
| TextContext TextContext
|
||||
deriving Show
|
||||
newtype Content = Content [ContentUnit] deriving Show
|
||||
|
||||
content :: MonadParser m => m Content
|
||||
content = Content <$> contentUnit `sepBy` blank <?> "content"
|
||||
|
||||
contentUnit :: MonadParser m => m ContentUnit
|
||||
contentUnit =
|
||||
(GraphicContext <$> graphicContext)
|
||||
<|> (TextContext <$> textContext)
|
||||
where
|
||||
graphicContext =
|
||||
string "q" *> blank *> graphicContextUnit `sepBy` blank <* blank <* string "Q"
|
||||
|
||||
graphicContextUnit :: MonadParser m => m GraphicContextUnit
|
||||
graphicContextUnit =
|
||||
(GraphicInstruction <$> instruction)
|
||||
<|> (ContentUnit <$> contentUnit)
|
||||
|
||||
instruction :: MonadParser m => m Instruction
|
||||
instruction = evalStateT stackParser []
|
||||
where
|
||||
stackParser = ((directObject <* blank) >>= push) <|> operator
|
||||
push arg = modify (arg:) *> stackParser
|
||||
|
||||
parse :: ByteString -> Either String Content
|
||||
parse = evalParser content ()
|
||||
|
||||
textContext :: MonadParser m => m TextContext
|
||||
textContext =
|
||||
string "BT" *> blank *> instruction `sepBy` blank <* blank <* string "ET"
|
65
src/PDF/Content/Operator.hs
Normal file
65
src/PDF/Content/Operator.hs
Normal file
|
@ -0,0 +1,65 @@
|
|||
{-# LANGUAGE ConstraintKinds #-}
|
||||
{-# LANGUAGE FlexibleContexts #-}
|
||||
module PDF.Content.Operator (
|
||||
Instruction
|
||||
, Operator
|
||||
, operator
|
||||
) where
|
||||
|
||||
import Control.Monad.Fail (MonadFail(..))
|
||||
import Control.Monad.State (MonadState(..))
|
||||
import Data.ByteString.Char8 (ByteString, pack, unpack)
|
||||
import Data.Char (toLower)
|
||||
import Data.Map (Map, (!?))
|
||||
import qualified Data.Map as Map (fromList)
|
||||
import qualified PDF.Content.Operator.Color as Color (Operator, signature)
|
||||
import PDF.Content.Operator.Common (Signature)
|
||||
import qualified PDF.Content.Operator.GraphicState as GraphicState (Operator, signature)
|
||||
import qualified PDF.Content.Operator.Path as Path (Operator, signature)
|
||||
import qualified PDF.Content.Operator.Text as Text (Operator, signature)
|
||||
import PDF.Object (DirectObject, blank, regular)
|
||||
import PDF.Parser (MonadParser, takeAll1)
|
||||
import Prelude hiding (fail)
|
||||
import Text.Printf (printf)
|
||||
|
||||
data Operator =
|
||||
GraphicState GraphicState.Operator
|
||||
| Path Path.Operator
|
||||
| Color Color.Operator
|
||||
| Text Text.Operator
|
||||
deriving Show
|
||||
|
||||
type Instruction = (Operator, [DirectObject])
|
||||
|
||||
operatorsTable :: Map ByteString (Signature Operator)
|
||||
operatorsTable = Map.fromList (
|
||||
(prepare GraphicState <$> GraphicState.signature)
|
||||
++ (prepare Path <$> Path.signature)
|
||||
++ (prepare Color <$> Color.signature)
|
||||
++ (prepare Text <$> Text.signature)
|
||||
)
|
||||
where
|
||||
prepare constructor (op, sig) = (code op, (constructor op, sig))
|
||||
|
||||
code :: Show a => a -> ByteString
|
||||
code = pack . expand . show
|
||||
where
|
||||
expand "" = ""
|
||||
expand (c:'_':s) = toLower c : expand s
|
||||
expand ('s':'t':'a':'r':s) = '*' : expand s
|
||||
expand ('Q':'u':'o':'t':'e':s) = '\'' : expand s
|
||||
expand ('D':'Q':'u':'o':'t':'e':s) = '"' : expand s
|
||||
expand (c:s) = c : expand s
|
||||
|
||||
type StackParser m = (MonadState [DirectObject] m, MonadParser m)
|
||||
|
||||
operator :: StackParser m => m Instruction
|
||||
operator = do
|
||||
chunk <- takeAll1 regular <* blank
|
||||
args <- reverse <$> get
|
||||
case operatorsTable !? chunk of
|
||||
Just (op, sig)
|
||||
| sig args -> return (op, args)
|
||||
| otherwise ->
|
||||
get >>= fail . printf "Operator %s with stack %s" (show op) . show
|
||||
_ -> fail ("Unknown chunk " ++ unpack chunk)
|
27
src/PDF/Content/Operator/Color.hs
Normal file
27
src/PDF/Content/Operator/Color.hs
Normal file
|
@ -0,0 +1,27 @@
|
|||
module PDF.Content.Operator.Color (
|
||||
Operator
|
||||
, signature
|
||||
) where
|
||||
|
||||
import PDF.Content.Operator.Common (Signature)
|
||||
import PDF.Object (DirectObject(..))
|
||||
|
||||
data Operator =
|
||||
CS | C_s | SC | SCN | S_c | S_cn | G | G_ | RG | R_g | K | K_
|
||||
deriving (Bounded, Enum, Show)
|
||||
|
||||
signature :: [Signature Operator]
|
||||
signature = [
|
||||
(CS, \l -> case l of [NameObject _] -> True ; _ -> False)
|
||||
, (C_s, \l -> case l of [NameObject _] -> True ; _ -> False)
|
||||
, (SC, \_ -> True)
|
||||
, (SCN, \_ -> True)
|
||||
, (S_c, \_ -> True)
|
||||
, (S_cn, \_ -> True)
|
||||
, (G, \l -> case l of [_] -> True ; _ -> False)
|
||||
, (G_, \l -> case l of [_] -> True ; _ -> False)
|
||||
, (RG, \l -> case l of [_, _, _] -> True ; _ -> False)
|
||||
, (R_g, \l -> case l of [_, _, _] -> True ; _ -> False)
|
||||
, (K, \l -> case l of [_, _, _, _] -> True ; _ -> False)
|
||||
, (K_, \l -> case l of [_, _, _, _] -> True ; _ -> False)
|
||||
]
|
7
src/PDF/Content/Operator/Common.hs
Normal file
7
src/PDF/Content/Operator/Common.hs
Normal file
|
@ -0,0 +1,7 @@
|
|||
module PDF.Content.Operator.Common (
|
||||
Signature
|
||||
) where
|
||||
|
||||
import PDF.Object (DirectObject)
|
||||
|
||||
type Signature a = (a, [DirectObject] -> Bool)
|
24
src/PDF/Content/Operator/GraphicState.hs
Normal file
24
src/PDF/Content/Operator/GraphicState.hs
Normal file
|
@ -0,0 +1,24 @@
|
|||
module PDF.Content.Operator.GraphicState (
|
||||
Operator(..)
|
||||
, signature
|
||||
) where
|
||||
|
||||
import PDF.Content.Operator.Common (Signature)
|
||||
import PDF.Object (DirectObject(..))
|
||||
|
||||
data Operator =
|
||||
C_m | W_ | J | J_ | M | D_ | R_i | I_ | G_s -- general graphic state
|
||||
deriving (Bounded, Enum, Show)
|
||||
|
||||
signature :: [Signature Operator]
|
||||
signature = [
|
||||
(C_m, \l -> case l of [_, _, _, _, _, _] -> True ; _ -> False)
|
||||
, (W_, \l -> case l of [_] -> True ; _ -> False)
|
||||
, (J, \l -> case l of [_] -> True ; _ -> False)
|
||||
, (J_, \l -> case l of [_] -> True ; _ -> False)
|
||||
, (M, \l -> case l of [_] -> True ; _ -> False)
|
||||
, (D_, \l -> case l of [_, _] -> True ; _ -> False)
|
||||
, (R_i, \l -> case l of [_] -> True ; _ -> False)
|
||||
, (I_, \l -> case l of [_] -> True ; _ -> False)
|
||||
, (G_s, \l -> case l of [NameObject _] -> True ; _ -> False)
|
||||
]
|
35
src/PDF/Content/Operator/Path.hs
Normal file
35
src/PDF/Content/Operator/Path.hs
Normal file
|
@ -0,0 +1,35 @@
|
|||
module PDF.Content.Operator.Path (
|
||||
Operator
|
||||
, signature
|
||||
) where
|
||||
|
||||
import PDF.Content.Operator.Common (Signature)
|
||||
|
||||
data Operator =
|
||||
M_ | L_ | C_ | V_ | Y_ | H_ | R_e -- path construction
|
||||
| S | S_ | F_ | F | Fstar | B | Bstar | B_ | B_star | N_ -- path painting
|
||||
| W | Wstar -- clipping path
|
||||
deriving (Bounded, Enum, Show)
|
||||
|
||||
signature :: [Signature Operator]
|
||||
signature = [
|
||||
(M_, \l -> case l of [_, _] -> True ; _ -> False)
|
||||
, (L_, \l -> case l of [_, _] -> True ; _ -> False)
|
||||
, (C_, \l -> case l of [_, _, _, _, _, _] -> True ; _ -> False)
|
||||
, (V_, \l -> case l of [_, _, _, _] -> True ; _ -> False)
|
||||
, (Y_, \l -> case l of [_, _, _, _] -> True ; _ -> False)
|
||||
, (H_, \l -> case l of [] -> True ; _ -> False)
|
||||
, (R_e, \l -> case l of [_, _, _, _] -> True ; _ -> False)
|
||||
, (S, \l -> case l of [] -> True ; _ -> False)
|
||||
, (S_, \l -> case l of [] -> True ; _ -> False)
|
||||
, (F_, \l -> case l of [] -> True ; _ -> False)
|
||||
, (F, \l -> case l of [] -> True ; _ -> False)
|
||||
, (Fstar, \l -> case l of [] -> True ; _ -> False)
|
||||
, (B, \l -> case l of [] -> True ; _ -> False)
|
||||
, (Bstar, \l -> case l of [] -> True ; _ -> False)
|
||||
, (B_, \l -> case l of [] -> True ; _ -> False)
|
||||
, (B_star, \l -> case l of [] -> True ; _ -> False)
|
||||
, (N_, \l -> case l of [] -> True ; _ -> False)
|
||||
, (W, \l -> case l of [] -> True ; _ -> False)
|
||||
, (Wstar, \l -> case l of [] -> True ; _ -> False)
|
||||
]
|
32
src/PDF/Content/Operator/Text.hs
Normal file
32
src/PDF/Content/Operator/Text.hs
Normal file
|
@ -0,0 +1,32 @@
|
|||
module PDF.Content.Operator.Text (
|
||||
Operator
|
||||
, signature
|
||||
) where
|
||||
|
||||
import PDF.Content.Operator.Common (Signature)
|
||||
import PDF.Object (DirectObject(..))
|
||||
|
||||
data Operator =
|
||||
Td | TD | Tm | Tstar -- text positioning
|
||||
| TJ | Tj | Quote | DQuote -- text showing
|
||||
| Tc | Tw | Tz | TL | Tf | Tr | Ts -- text state
|
||||
deriving (Bounded, Enum, Show)
|
||||
|
||||
signature :: [Signature Operator]
|
||||
signature = [
|
||||
(Td, \l -> case l of [_, _] -> True ; _ -> False)
|
||||
, (TD, \l -> case l of [_, _] -> True ; _ -> False)
|
||||
, (Tm, \l -> case l of [_, _, _, _, _, _] -> True ; _ -> False)
|
||||
, (Tstar, \l -> case l of [] -> True ; _ -> False)
|
||||
, (TJ, \l -> case l of [Array _] -> True ; _ -> False)
|
||||
, (Tj, \l -> case l of [StringObject _] -> True ; _ -> False)
|
||||
, (Quote, \l -> case l of [StringObject _] -> True ; _ -> False)
|
||||
, (DQuote, \l -> case l of [StringObject _] -> True ; _ -> False)
|
||||
, (Tc, \l -> case l of [_] -> True ; _ -> False)
|
||||
, (Tw, \l -> case l of [_] -> True ; _ -> False)
|
||||
, (Tz, \l -> case l of [_] -> True ; _ -> False)
|
||||
, (TL, \l -> case l of [_] -> True ; _ -> False)
|
||||
, (Tf, \l -> case l of [NameObject _, _] -> True ; _ -> False)
|
||||
, (Tr, \l -> case l of [_] -> True ; _ -> False)
|
||||
, (Ts, \l -> case l of [_] -> True ; _ -> False)
|
||||
]
|
0
src/PDF/Content/Text.hs
Normal file
0
src/PDF/Content/Text.hs
Normal file
Loading…
Reference in a new issue