Implement Text rendering from parsed Content
This commit is contained in:
parent
20466c4f13
commit
9f1b1afafe
|
@ -23,6 +23,7 @@ library
|
|||
, PDF.Object
|
||||
, PDF.Output
|
||||
, PDF.Parser
|
||||
, PDF.Pages
|
||||
, PDF.Update
|
||||
other-modules: Data.ByteString.Char8.Util
|
||||
, PDF.Content.Operator
|
||||
|
@ -31,6 +32,7 @@ library
|
|||
, PDF.Content.Operator.GraphicState
|
||||
, PDF.Content.Operator.Path
|
||||
, PDF.Content.Operator.Text
|
||||
, PDF.Content.Text
|
||||
, PDF.Encoding
|
||||
, PDF.Encoding.MacRoman
|
||||
, PDF.Body
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
{-# LANGUAGE FlexibleContexts #-}
|
||||
module PDF.Content.Operator (
|
||||
Instruction
|
||||
, Operator
|
||||
, Operator(..)
|
||||
, operator
|
||||
) where
|
||||
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
module PDF.Content.Operator.Color (
|
||||
Operator
|
||||
Operator(..)
|
||||
, signature
|
||||
) where
|
||||
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
module PDF.Content.Operator.Path (
|
||||
Operator
|
||||
Operator(..)
|
||||
, signature
|
||||
) where
|
||||
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
module PDF.Content.Operator.Text (
|
||||
Operator
|
||||
Operator(..)
|
||||
, signature
|
||||
) where
|
||||
|
||||
|
|
|
@ -0,0 +1,62 @@
|
|||
{-# LANGUAGE OverloadedStrings #-}
|
||||
module PDF.Content.Text (
|
||||
renderText
|
||||
) where
|
||||
|
||||
import Control.Monad (foldM)
|
||||
import Control.Monad.Reader (ReaderT, asks, runReaderT)
|
||||
import Control.Monad.State (MonadState(..), StateT, evalStateT)
|
||||
import Data.Map ((!))
|
||||
import Data.Text (Text)
|
||||
import PDF.Content (Content(..), ContentUnit(..), GraphicContextUnit(..))
|
||||
import PDF.Content.Operator (Instruction, Operator(..))
|
||||
import PDF.Content.Operator.Text (Operator(..))
|
||||
import PDF.Font (Font, FontSet, emptyFont)
|
||||
import PDF.Object (DirectObject(..), StringObject, toByteString)
|
||||
|
||||
type RenderingContext = ReaderT FontSet (Either String)
|
||||
type TextRenderingContext = StateT Font RenderingContext
|
||||
|
||||
decodeString :: StringObject -> TextRenderingContext Text
|
||||
decodeString input = do
|
||||
font <- get
|
||||
either fail return . font $ toByteString input
|
||||
|
||||
renderText :: FontSet -> Content -> Either String [Text]
|
||||
renderText fontSet (Content contentUnits) =
|
||||
runReaderT (concat <$> mapM renderContentUnit contentUnits) fontSet
|
||||
|
||||
renderContentUnit :: ContentUnit -> RenderingContext [Text]
|
||||
renderContentUnit (GraphicContext graphicContextUnits) =
|
||||
concat <$> mapM renderGraphicContextUnit graphicContextUnits
|
||||
renderContentUnit (TextContext instructions) =
|
||||
evalStateT (concat <$> mapM renderInstruction instructions) emptyFont
|
||||
|
||||
renderGraphicContextUnit :: GraphicContextUnit -> RenderingContext [Text]
|
||||
renderGraphicContextUnit (GraphicInstruction _) = return []
|
||||
renderGraphicContextUnit (ContentUnit contentUnit) =
|
||||
renderContentUnit contentUnit
|
||||
|
||||
renderInstruction :: Instruction -> StateT Font RenderingContext [Text]
|
||||
renderInstruction (Text Tf, [NameObject fontName, _]) =
|
||||
asks (! fontName) >>= put >> return []
|
||||
|
||||
renderInstruction (Text Tstar, []) = return ["\n"]
|
||||
|
||||
renderInstruction (Text TJ, [Array arrayObject]) =
|
||||
replicate 1 <$> foldM appendText "" arrayObject
|
||||
where
|
||||
appendText t (StringObject outputString) =
|
||||
mappend t <$> decodeString outputString
|
||||
appendText t _ = return t
|
||||
|
||||
renderInstruction (Text Tj, [StringObject outputString]) =
|
||||
replicate 1 <$> decodeString outputString
|
||||
|
||||
renderInstruction (Text Quote, [StringObject outputString]) =
|
||||
(\t -> ["\n", t]) <$> decodeString outputString
|
||||
|
||||
renderInstruction (Text DQuote, [StringObject outputString]) =
|
||||
(\t -> ["\n", t]) <$> decodeString outputString
|
||||
|
||||
renderInstruction _ = return []
|
|
@ -16,6 +16,8 @@ import Data.Map (Map, (!))
|
|||
import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
|
||||
import Data.Text (Text)
|
||||
import PDF.CMap (cMap)
|
||||
import qualified PDF.Content as Content (parse)
|
||||
import PDF.Content.Text (renderText)
|
||||
import PDF.Encoding (encoding)
|
||||
import PDF.Font (Font, FontSet)
|
||||
import PDF.Object (
|
||||
|
@ -23,7 +25,6 @@ import PDF.Object (
|
|||
, Object(..), Name(..), Structure(..)
|
||||
,)
|
||||
import PDF.Output (ObjectId(..))
|
||||
import PDF.Text (pageContents)
|
||||
import Text.Printf (printf)
|
||||
|
||||
type CachedFonts = Map ObjectId Font
|
||||
|
@ -151,7 +152,7 @@ extractText object = do
|
|||
where
|
||||
loadContent :: FontSet -> DirectObject -> T [Text]
|
||||
loadContent fonts directObject =
|
||||
follow directObject \\= stream \\= pageContents fonts
|
||||
follow directObject \\= stream \\= Content.parse \\= renderText fonts
|
||||
|
||||
loadPage :: ObjectId -> T Page
|
||||
loadPage source =
|
||||
|
|
Loading…
Reference in New Issue