Implement Text rendering from parsed Content

This commit is contained in:
Tissevert 2020-02-10 10:54:44 +01:00
parent 20466c4f13
commit 9f1b1afafe
7 changed files with 71 additions and 6 deletions

View File

@ -23,6 +23,7 @@ library
, PDF.Object
, PDF.Output
, PDF.Parser
, PDF.Pages
, PDF.Update
other-modules: Data.ByteString.Char8.Util
, PDF.Content.Operator
@ -31,6 +32,7 @@ library
, PDF.Content.Operator.GraphicState
, PDF.Content.Operator.Path
, PDF.Content.Operator.Text
, PDF.Content.Text
, PDF.Encoding
, PDF.Encoding.MacRoman
, PDF.Body

View File

@ -2,7 +2,7 @@
{-# LANGUAGE FlexibleContexts #-}
module PDF.Content.Operator (
Instruction
, Operator
, Operator(..)
, operator
) where

View File

@ -1,5 +1,5 @@
module PDF.Content.Operator.Color (
Operator
Operator(..)
, signature
) where

View File

@ -1,5 +1,5 @@
module PDF.Content.Operator.Path (
Operator
Operator(..)
, signature
) where

View File

@ -1,5 +1,5 @@
module PDF.Content.Operator.Text (
Operator
Operator(..)
, signature
) where

View File

@ -0,0 +1,62 @@
{-# LANGUAGE OverloadedStrings #-}
module PDF.Content.Text (
renderText
) where
import Control.Monad (foldM)
import Control.Monad.Reader (ReaderT, asks, runReaderT)
import Control.Monad.State (MonadState(..), StateT, evalStateT)
import Data.Map ((!))
import Data.Text (Text)
import PDF.Content (Content(..), ContentUnit(..), GraphicContextUnit(..))
import PDF.Content.Operator (Instruction, Operator(..))
import PDF.Content.Operator.Text (Operator(..))
import PDF.Font (Font, FontSet, emptyFont)
import PDF.Object (DirectObject(..), StringObject, toByteString)
type RenderingContext = ReaderT FontSet (Either String)
type TextRenderingContext = StateT Font RenderingContext
decodeString :: StringObject -> TextRenderingContext Text
decodeString input = do
font <- get
either fail return . font $ toByteString input
renderText :: FontSet -> Content -> Either String [Text]
renderText fontSet (Content contentUnits) =
runReaderT (concat <$> mapM renderContentUnit contentUnits) fontSet
renderContentUnit :: ContentUnit -> RenderingContext [Text]
renderContentUnit (GraphicContext graphicContextUnits) =
concat <$> mapM renderGraphicContextUnit graphicContextUnits
renderContentUnit (TextContext instructions) =
evalStateT (concat <$> mapM renderInstruction instructions) emptyFont
renderGraphicContextUnit :: GraphicContextUnit -> RenderingContext [Text]
renderGraphicContextUnit (GraphicInstruction _) = return []
renderGraphicContextUnit (ContentUnit contentUnit) =
renderContentUnit contentUnit
renderInstruction :: Instruction -> StateT Font RenderingContext [Text]
renderInstruction (Text Tf, [NameObject fontName, _]) =
asks (! fontName) >>= put >> return []
renderInstruction (Text Tstar, []) = return ["\n"]
renderInstruction (Text TJ, [Array arrayObject]) =
replicate 1 <$> foldM appendText "" arrayObject
where
appendText t (StringObject outputString) =
mappend t <$> decodeString outputString
appendText t _ = return t
renderInstruction (Text Tj, [StringObject outputString]) =
replicate 1 <$> decodeString outputString
renderInstruction (Text Quote, [StringObject outputString]) =
(\t -> ["\n", t]) <$> decodeString outputString
renderInstruction (Text DQuote, [StringObject outputString]) =
(\t -> ["\n", t]) <$> decodeString outputString
renderInstruction _ = return []

View File

@ -16,6 +16,8 @@ import Data.Map (Map, (!))
import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
import Data.Text (Text)
import PDF.CMap (cMap)
import qualified PDF.Content as Content (parse)
import PDF.Content.Text (renderText)
import PDF.Encoding (encoding)
import PDF.Font (Font, FontSet)
import PDF.Object (
@ -23,7 +25,6 @@ import PDF.Object (
, Object(..), Name(..), Structure(..)
,)
import PDF.Output (ObjectId(..))
import PDF.Text (pageContents)
import Text.Printf (printf)
type CachedFonts = Map ObjectId Font
@ -151,7 +152,7 @@ extractText object = do
where
loadContent :: FontSet -> DirectObject -> T [Text]
loadContent fonts directObject =
follow directObject \\= stream \\= pageContents fonts
follow directObject \\= stream \\= Content.parse \\= renderText fonts
loadPage :: ObjectId -> T Page
loadPage source =