WIP: Clean code parsing «pages» (now Content), separated from text rendering (will be reimplemented as an upper layer, also providing modification as stream filters) — Page is also forgotten for now, will need a big improvement in Object navigation
This commit is contained in:
parent
325250383a
commit
20466c4f13
9 changed files with 254 additions and 2 deletions
|
@ -18,14 +18,19 @@ cabal-version: >=1.10
|
||||||
library
|
library
|
||||||
exposed-modules: PDF
|
exposed-modules: PDF
|
||||||
, PDF.CMap
|
, PDF.CMap
|
||||||
|
, PDF.Content
|
||||||
, PDF.EOL
|
, PDF.EOL
|
||||||
, PDF.Object
|
, PDF.Object
|
||||||
, PDF.Output
|
, PDF.Output
|
||||||
, PDF.Pages
|
|
||||||
, PDF.Parser
|
, PDF.Parser
|
||||||
, PDF.Text
|
|
||||||
, PDF.Update
|
, PDF.Update
|
||||||
other-modules: Data.ByteString.Char8.Util
|
other-modules: Data.ByteString.Char8.Util
|
||||||
|
, PDF.Content.Operator
|
||||||
|
, PDF.Content.Operator.Color
|
||||||
|
, PDF.Content.Operator.Common
|
||||||
|
, PDF.Content.Operator.GraphicState
|
||||||
|
, PDF.Content.Operator.Path
|
||||||
|
, PDF.Content.Operator.Text
|
||||||
, PDF.Encoding
|
, PDF.Encoding
|
||||||
, PDF.Encoding.MacRoman
|
, PDF.Encoding.MacRoman
|
||||||
, PDF.Body
|
, PDF.Body
|
||||||
|
|
57
src/PDF/Content.hs
Normal file
57
src/PDF/Content.hs
Normal file
|
@ -0,0 +1,57 @@
|
||||||
|
{-# LANGUAGE OverloadedStrings #-}
|
||||||
|
module PDF.Content (
|
||||||
|
Content(..)
|
||||||
|
, ContentUnit(..)
|
||||||
|
, GraphicContextUnit(..)
|
||||||
|
, TextContext
|
||||||
|
, content
|
||||||
|
, parse
|
||||||
|
) where
|
||||||
|
|
||||||
|
import Control.Applicative ((<|>))
|
||||||
|
import Control.Monad.State (evalStateT, modify)
|
||||||
|
import Data.Attoparsec.ByteString.Char8 (sepBy)
|
||||||
|
import Data.ByteString (ByteString)
|
||||||
|
import PDF.Object (blank, directObject)
|
||||||
|
import PDF.Parser (MonadParser, (<?>), evalParser, string)
|
||||||
|
import PDF.Content.Operator (Instruction, operator)
|
||||||
|
|
||||||
|
data GraphicContextUnit =
|
||||||
|
GraphicInstruction Instruction
|
||||||
|
| ContentUnit ContentUnit
|
||||||
|
deriving Show
|
||||||
|
type TextContext = [Instruction]
|
||||||
|
data ContentUnit =
|
||||||
|
GraphicContext [GraphicContextUnit]
|
||||||
|
| TextContext TextContext
|
||||||
|
deriving Show
|
||||||
|
newtype Content = Content [ContentUnit] deriving Show
|
||||||
|
|
||||||
|
content :: MonadParser m => m Content
|
||||||
|
content = Content <$> contentUnit `sepBy` blank <?> "content"
|
||||||
|
|
||||||
|
contentUnit :: MonadParser m => m ContentUnit
|
||||||
|
contentUnit =
|
||||||
|
(GraphicContext <$> graphicContext)
|
||||||
|
<|> (TextContext <$> textContext)
|
||||||
|
where
|
||||||
|
graphicContext =
|
||||||
|
string "q" *> blank *> graphicContextUnit `sepBy` blank <* blank <* string "Q"
|
||||||
|
|
||||||
|
graphicContextUnit :: MonadParser m => m GraphicContextUnit
|
||||||
|
graphicContextUnit =
|
||||||
|
(GraphicInstruction <$> instruction)
|
||||||
|
<|> (ContentUnit <$> contentUnit)
|
||||||
|
|
||||||
|
instruction :: MonadParser m => m Instruction
|
||||||
|
instruction = evalStateT stackParser []
|
||||||
|
where
|
||||||
|
stackParser = ((directObject <* blank) >>= push) <|> operator
|
||||||
|
push arg = modify (arg:) *> stackParser
|
||||||
|
|
||||||
|
parse :: ByteString -> Either String Content
|
||||||
|
parse = evalParser content ()
|
||||||
|
|
||||||
|
textContext :: MonadParser m => m TextContext
|
||||||
|
textContext =
|
||||||
|
string "BT" *> blank *> instruction `sepBy` blank <* blank <* string "ET"
|
65
src/PDF/Content/Operator.hs
Normal file
65
src/PDF/Content/Operator.hs
Normal file
|
@ -0,0 +1,65 @@
|
||||||
|
{-# LANGUAGE ConstraintKinds #-}
|
||||||
|
{-# LANGUAGE FlexibleContexts #-}
|
||||||
|
module PDF.Content.Operator (
|
||||||
|
Instruction
|
||||||
|
, Operator
|
||||||
|
, operator
|
||||||
|
) where
|
||||||
|
|
||||||
|
import Control.Monad.Fail (MonadFail(..))
|
||||||
|
import Control.Monad.State (MonadState(..))
|
||||||
|
import Data.ByteString.Char8 (ByteString, pack, unpack)
|
||||||
|
import Data.Char (toLower)
|
||||||
|
import Data.Map (Map, (!?))
|
||||||
|
import qualified Data.Map as Map (fromList)
|
||||||
|
import qualified PDF.Content.Operator.Color as Color (Operator, signature)
|
||||||
|
import PDF.Content.Operator.Common (Signature)
|
||||||
|
import qualified PDF.Content.Operator.GraphicState as GraphicState (Operator, signature)
|
||||||
|
import qualified PDF.Content.Operator.Path as Path (Operator, signature)
|
||||||
|
import qualified PDF.Content.Operator.Text as Text (Operator, signature)
|
||||||
|
import PDF.Object (DirectObject, blank, regular)
|
||||||
|
import PDF.Parser (MonadParser, takeAll1)
|
||||||
|
import Prelude hiding (fail)
|
||||||
|
import Text.Printf (printf)
|
||||||
|
|
||||||
|
data Operator =
|
||||||
|
GraphicState GraphicState.Operator
|
||||||
|
| Path Path.Operator
|
||||||
|
| Color Color.Operator
|
||||||
|
| Text Text.Operator
|
||||||
|
deriving Show
|
||||||
|
|
||||||
|
type Instruction = (Operator, [DirectObject])
|
||||||
|
|
||||||
|
operatorsTable :: Map ByteString (Signature Operator)
|
||||||
|
operatorsTable = Map.fromList (
|
||||||
|
(prepare GraphicState <$> GraphicState.signature)
|
||||||
|
++ (prepare Path <$> Path.signature)
|
||||||
|
++ (prepare Color <$> Color.signature)
|
||||||
|
++ (prepare Text <$> Text.signature)
|
||||||
|
)
|
||||||
|
where
|
||||||
|
prepare constructor (op, sig) = (code op, (constructor op, sig))
|
||||||
|
|
||||||
|
code :: Show a => a -> ByteString
|
||||||
|
code = pack . expand . show
|
||||||
|
where
|
||||||
|
expand "" = ""
|
||||||
|
expand (c:'_':s) = toLower c : expand s
|
||||||
|
expand ('s':'t':'a':'r':s) = '*' : expand s
|
||||||
|
expand ('Q':'u':'o':'t':'e':s) = '\'' : expand s
|
||||||
|
expand ('D':'Q':'u':'o':'t':'e':s) = '"' : expand s
|
||||||
|
expand (c:s) = c : expand s
|
||||||
|
|
||||||
|
type StackParser m = (MonadState [DirectObject] m, MonadParser m)
|
||||||
|
|
||||||
|
operator :: StackParser m => m Instruction
|
||||||
|
operator = do
|
||||||
|
chunk <- takeAll1 regular <* blank
|
||||||
|
args <- reverse <$> get
|
||||||
|
case operatorsTable !? chunk of
|
||||||
|
Just (op, sig)
|
||||||
|
| sig args -> return (op, args)
|
||||||
|
| otherwise ->
|
||||||
|
get >>= fail . printf "Operator %s with stack %s" (show op) . show
|
||||||
|
_ -> fail ("Unknown chunk " ++ unpack chunk)
|
27
src/PDF/Content/Operator/Color.hs
Normal file
27
src/PDF/Content/Operator/Color.hs
Normal file
|
@ -0,0 +1,27 @@
|
||||||
|
module PDF.Content.Operator.Color (
|
||||||
|
Operator
|
||||||
|
, signature
|
||||||
|
) where
|
||||||
|
|
||||||
|
import PDF.Content.Operator.Common (Signature)
|
||||||
|
import PDF.Object (DirectObject(..))
|
||||||
|
|
||||||
|
data Operator =
|
||||||
|
CS | C_s | SC | SCN | S_c | S_cn | G | G_ | RG | R_g | K | K_
|
||||||
|
deriving (Bounded, Enum, Show)
|
||||||
|
|
||||||
|
signature :: [Signature Operator]
|
||||||
|
signature = [
|
||||||
|
(CS, \l -> case l of [NameObject _] -> True ; _ -> False)
|
||||||
|
, (C_s, \l -> case l of [NameObject _] -> True ; _ -> False)
|
||||||
|
, (SC, \_ -> True)
|
||||||
|
, (SCN, \_ -> True)
|
||||||
|
, (S_c, \_ -> True)
|
||||||
|
, (S_cn, \_ -> True)
|
||||||
|
, (G, \l -> case l of [_] -> True ; _ -> False)
|
||||||
|
, (G_, \l -> case l of [_] -> True ; _ -> False)
|
||||||
|
, (RG, \l -> case l of [_, _, _] -> True ; _ -> False)
|
||||||
|
, (R_g, \l -> case l of [_, _, _] -> True ; _ -> False)
|
||||||
|
, (K, \l -> case l of [_, _, _, _] -> True ; _ -> False)
|
||||||
|
, (K_, \l -> case l of [_, _, _, _] -> True ; _ -> False)
|
||||||
|
]
|
7
src/PDF/Content/Operator/Common.hs
Normal file
7
src/PDF/Content/Operator/Common.hs
Normal file
|
@ -0,0 +1,7 @@
|
||||||
|
module PDF.Content.Operator.Common (
|
||||||
|
Signature
|
||||||
|
) where
|
||||||
|
|
||||||
|
import PDF.Object (DirectObject)
|
||||||
|
|
||||||
|
type Signature a = (a, [DirectObject] -> Bool)
|
24
src/PDF/Content/Operator/GraphicState.hs
Normal file
24
src/PDF/Content/Operator/GraphicState.hs
Normal file
|
@ -0,0 +1,24 @@
|
||||||
|
module PDF.Content.Operator.GraphicState (
|
||||||
|
Operator(..)
|
||||||
|
, signature
|
||||||
|
) where
|
||||||
|
|
||||||
|
import PDF.Content.Operator.Common (Signature)
|
||||||
|
import PDF.Object (DirectObject(..))
|
||||||
|
|
||||||
|
data Operator =
|
||||||
|
C_m | W_ | J | J_ | M | D_ | R_i | I_ | G_s -- general graphic state
|
||||||
|
deriving (Bounded, Enum, Show)
|
||||||
|
|
||||||
|
signature :: [Signature Operator]
|
||||||
|
signature = [
|
||||||
|
(C_m, \l -> case l of [_, _, _, _, _, _] -> True ; _ -> False)
|
||||||
|
, (W_, \l -> case l of [_] -> True ; _ -> False)
|
||||||
|
, (J, \l -> case l of [_] -> True ; _ -> False)
|
||||||
|
, (J_, \l -> case l of [_] -> True ; _ -> False)
|
||||||
|
, (M, \l -> case l of [_] -> True ; _ -> False)
|
||||||
|
, (D_, \l -> case l of [_, _] -> True ; _ -> False)
|
||||||
|
, (R_i, \l -> case l of [_] -> True ; _ -> False)
|
||||||
|
, (I_, \l -> case l of [_] -> True ; _ -> False)
|
||||||
|
, (G_s, \l -> case l of [NameObject _] -> True ; _ -> False)
|
||||||
|
]
|
35
src/PDF/Content/Operator/Path.hs
Normal file
35
src/PDF/Content/Operator/Path.hs
Normal file
|
@ -0,0 +1,35 @@
|
||||||
|
module PDF.Content.Operator.Path (
|
||||||
|
Operator
|
||||||
|
, signature
|
||||||
|
) where
|
||||||
|
|
||||||
|
import PDF.Content.Operator.Common (Signature)
|
||||||
|
|
||||||
|
data Operator =
|
||||||
|
M_ | L_ | C_ | V_ | Y_ | H_ | R_e -- path construction
|
||||||
|
| S | S_ | F_ | F | Fstar | B | Bstar | B_ | B_star | N_ -- path painting
|
||||||
|
| W | Wstar -- clipping path
|
||||||
|
deriving (Bounded, Enum, Show)
|
||||||
|
|
||||||
|
signature :: [Signature Operator]
|
||||||
|
signature = [
|
||||||
|
(M_, \l -> case l of [_, _] -> True ; _ -> False)
|
||||||
|
, (L_, \l -> case l of [_, _] -> True ; _ -> False)
|
||||||
|
, (C_, \l -> case l of [_, _, _, _, _, _] -> True ; _ -> False)
|
||||||
|
, (V_, \l -> case l of [_, _, _, _] -> True ; _ -> False)
|
||||||
|
, (Y_, \l -> case l of [_, _, _, _] -> True ; _ -> False)
|
||||||
|
, (H_, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (R_e, \l -> case l of [_, _, _, _] -> True ; _ -> False)
|
||||||
|
, (S, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (S_, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (F_, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (F, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (Fstar, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (B, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (Bstar, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (B_, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (B_star, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (N_, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (W, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (Wstar, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
]
|
32
src/PDF/Content/Operator/Text.hs
Normal file
32
src/PDF/Content/Operator/Text.hs
Normal file
|
@ -0,0 +1,32 @@
|
||||||
|
module PDF.Content.Operator.Text (
|
||||||
|
Operator
|
||||||
|
, signature
|
||||||
|
) where
|
||||||
|
|
||||||
|
import PDF.Content.Operator.Common (Signature)
|
||||||
|
import PDF.Object (DirectObject(..))
|
||||||
|
|
||||||
|
data Operator =
|
||||||
|
Td | TD | Tm | Tstar -- text positioning
|
||||||
|
| TJ | Tj | Quote | DQuote -- text showing
|
||||||
|
| Tc | Tw | Tz | TL | Tf | Tr | Ts -- text state
|
||||||
|
deriving (Bounded, Enum, Show)
|
||||||
|
|
||||||
|
signature :: [Signature Operator]
|
||||||
|
signature = [
|
||||||
|
(Td, \l -> case l of [_, _] -> True ; _ -> False)
|
||||||
|
, (TD, \l -> case l of [_, _] -> True ; _ -> False)
|
||||||
|
, (Tm, \l -> case l of [_, _, _, _, _, _] -> True ; _ -> False)
|
||||||
|
, (Tstar, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (TJ, \l -> case l of [Array _] -> True ; _ -> False)
|
||||||
|
, (Tj, \l -> case l of [StringObject _] -> True ; _ -> False)
|
||||||
|
, (Quote, \l -> case l of [StringObject _] -> True ; _ -> False)
|
||||||
|
, (DQuote, \l -> case l of [StringObject _] -> True ; _ -> False)
|
||||||
|
, (Tc, \l -> case l of [_] -> True ; _ -> False)
|
||||||
|
, (Tw, \l -> case l of [_] -> True ; _ -> False)
|
||||||
|
, (Tz, \l -> case l of [_] -> True ; _ -> False)
|
||||||
|
, (TL, \l -> case l of [_] -> True ; _ -> False)
|
||||||
|
, (Tf, \l -> case l of [NameObject _, _] -> True ; _ -> False)
|
||||||
|
, (Tr, \l -> case l of [_] -> True ; _ -> False)
|
||||||
|
, (Ts, \l -> case l of [_] -> True ; _ -> False)
|
||||||
|
]
|
0
src/PDF/Content/Text.hs
Normal file
0
src/PDF/Content/Text.hs
Normal file
Loading…
Reference in a new issue