Finish the split of Navigation out of Page, generalize the use of MonadFail with a custom Error monad (~= Either String)

This commit is contained in:
Tissevert 2020-02-11 22:41:46 +01:00
parent af994cb50c
commit eb4d76002c
5 changed files with 47 additions and 60 deletions

View file

@ -1,7 +1,5 @@
{-# LANGUAGE NamedFieldPuns #-} {-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE TypeSynonymInstances #-}
module PDF.CMap ( module PDF.CMap (
CMap CMap
, CMappers , CMappers

View file

@ -1,12 +1,15 @@
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE FlexibleContexts #-}
module PDF.Content.Text ( module PDF.Content.Text (
format format
, renderText , renderText
) where ) where
import Control.Monad (foldM) import Control.Monad (foldM)
import Control.Monad.Reader (ReaderT, asks, runReaderT) import Control.Monad.Fail (MonadFail(..))
import Control.Monad.State (MonadState(..), StateT, evalStateT) import Control.Monad.Reader (MonadReader(..), asks, runReaderT)
import Control.Monad.State (MonadState(..), evalStateT)
import Data.ByteString.Char8 (pack) import Data.ByteString.Char8 (pack)
import Data.Map ((!)) import Data.Map ((!))
import Data.Text (Text) import Data.Text (Text)
@ -15,31 +18,32 @@ import PDF.Content.Operator (Instruction, Operator(..))
import PDF.Content.Operator.Text (Operator(..)) import PDF.Content.Operator.Text (Operator(..))
import PDF.Font (Font, FontSet, emptyFont) import PDF.Font (Font, FontSet, emptyFont)
import PDF.Object (DirectObject(..), StringObject(..), toByteString) import PDF.Object (DirectObject(..), StringObject(..), toByteString)
import Prelude hiding (fail)
type RenderingContext = ReaderT FontSet (Either String) type TextContent m = (MonadReader FontSet m, MonadFail m)
type TextRenderingContext = StateT Font RenderingContext type FontContext m = (MonadState Font m, TextContent m)
decodeString :: StringObject -> TextRenderingContext Text decodeString :: FontContext m => StringObject -> m Text
decodeString input = do decodeString input = do
font <- get font <- get
either fail return . font $ toByteString input either fail return . font $ toByteString input
renderText :: FontSet -> Content -> Either String [Text] renderText :: MonadFail m => FontSet -> Content -> m [Text]
renderText fontSet (Content contentUnits) = renderText fontSet (Content contentUnits) =
runReaderT (concat <$> mapM renderContentUnit contentUnits) fontSet runReaderT (concat <$> mapM renderContentUnit contentUnits) fontSet
renderContentUnit :: ContentUnit -> RenderingContext [Text] renderContentUnit :: TextContent m => ContentUnit -> m [Text]
renderContentUnit (GraphicContext graphicContextUnits) = renderContentUnit (GraphicContext graphicContextUnits) =
concat <$> mapM renderGraphicContextUnit graphicContextUnits concat <$> mapM renderGraphicContextUnit graphicContextUnits
renderContentUnit (TextContext instructions) = renderContentUnit (TextContext instructions) =
evalStateT (concat <$> mapM renderInstruction instructions) emptyFont evalStateT (concat <$> mapM renderInstruction instructions) emptyFont
renderGraphicContextUnit :: GraphicContextUnit -> RenderingContext [Text] renderGraphicContextUnit :: TextContent m => GraphicContextUnit -> m [Text]
renderGraphicContextUnit (GraphicInstruction _) = return [] renderGraphicContextUnit (GraphicInstruction _) = return []
renderGraphicContextUnit (ContentUnit contentUnit) = renderGraphicContextUnit (ContentUnit contentUnit) =
renderContentUnit contentUnit renderContentUnit contentUnit
renderInstruction :: Instruction -> StateT Font RenderingContext [Text] renderInstruction :: FontContext m => Instruction -> m [Text]
renderInstruction (Text Tf, [NameObject fontName, _]) = renderInstruction (Text Tf, [NameObject fontName, _]) =
asks (! fontName) >>= put >> return [] asks (! fontName) >>= put >> return []

View file

@ -2,9 +2,11 @@ module PDF.Encoding (
encoding encoding
) where ) where
import Control.Monad.Fail (MonadFail(..))
import PDF.Encoding.MacRoman (macRomanEncoding) import PDF.Encoding.MacRoman (macRomanEncoding)
import PDF.Font (Font) import PDF.Font (Font)
import Prelude hiding (fail)
encoding :: String -> Either String Font encoding :: MonadFail m => String -> m Font
encoding "MacRomanEncoding" = Right macRomanEncoding encoding "MacRomanEncoding" = return macRomanEncoding
encoding s = Left $ "Unknown encoding " ++ s encoding s = fail $ "Unknown encoding " ++ s

View file

@ -43,16 +43,13 @@ dictionaryById objectId = objectById objectId >>= castDictionary
expected :: (MonadFail m, Show a) => String -> a -> m b expected :: (MonadFail m, Show a) => String -> a -> m b
expected name = fail . printf "Not a %s: %s" name . show expected name = fail . printf "Not a %s: %s" name . show
lookupField :: String -> Dictionary -> Either String DirectObject getField :: MonadFail m => String -> Dictionary -> m DirectObject
lookupField key aDictionary = getField key aDictionary =
maybe (Left errorMessage) Right (Map.lookup (Name key) aDictionary) maybe (fail errorMessage) return (Map.lookup (Name key) aDictionary)
where where
errorMessage = errorMessage =
printf "Key %s not found in dictionary %s" key (show aDictionary) printf "Key %s not found in dictionary %s" key (show aDictionary)
getField :: MonadFail m => String -> Dictionary -> m DirectObject
getField key = either fail return . lookupField key
follow :: PDFContent m => DirectObject -> m Object follow :: PDFContent m => DirectObject -> m Object
follow directObject = castObjectId directObject >>= objectById follow directObject = castObjectId directObject >>= objectById

View file

@ -1,18 +1,17 @@
{-# LANGUAGE NamedFieldPuns #-} {-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
module PDF.Pages ( module PDF.Pages (
Page(..) Page(..)
, get , get
, getAll , getAll
) where ) where
import Control.Applicative ((<|>)) import Control.Applicative (Alternative(..), (<|>))
import Control.Monad (foldM) import Control.Monad (MonadPlus(..), foldM)
import Control.Monad.Except (Except, runExcept)
import Control.Monad.Fail (MonadFail(..)) import Control.Monad.Fail (MonadFail(..))
import Control.Monad.RWS (RWST(..), ask, evalRWST, lift, modify) import Control.Monad.RWS (RWST(..), evalRWST, modify)
import qualified Control.Monad.RWS as RWS (get) import qualified Control.Monad.RWS as RWS (get)
import Data.ByteString (ByteString) import Data.Map (Map)
import Data.Map (Map, (!))
import qualified Data.Map as Map (empty, fromList, insert, lookup, toList) import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
import Data.Text (Text) import Data.Text (Text)
import PDF.CMap (cMap) import PDF.CMap (cMap)
@ -22,7 +21,7 @@ import PDF.Encoding (encoding)
import PDF.Font (Font, FontSet) import PDF.Font (Font, FontSet)
import PDF.Object ( import PDF.Object (
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..) Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
, Object(..), Name(..), Structure(..) , Name(..)
,) ,)
import PDF.Object.Navigation ( import PDF.Object.Navigation (
(//), dictionaryById, getDictionary, getField, follow, openStream, origin (//), dictionaryById, getDictionary, getField, follow, openStream, origin
@ -32,12 +31,18 @@ import Prelude hiding (fail)
import Text.Printf (printf) import Text.Printf (printf)
type CachedFonts = Map ObjectId Font type CachedFonts = Map ObjectId Font
type T = RWST Content () CachedFonts (Except String) newtype Error a = Error {
runError :: Either String a
} deriving (Alternative, Functor, Applicative, Monad, MonadPlus)
type T = RWST Content () CachedFonts Error
data Page = Page { data Page = Page {
contents :: [Text] contents :: [Text]
, source :: ObjectId , source :: ObjectId
} }
instance MonadFail Error where
fail = Error . Left
getFontDictionary :: Dictionary -> T Dictionary getFontDictionary :: Dictionary -> T Dictionary
getFontDictionary pageDict = getFontDictionary pageDict =
((pageDict // ["Resources", "Font"]) >>= getDictionary) ((pageDict // ["Resources", "Font"]) >>= getDictionary)
@ -60,11 +65,13 @@ loadFont objectId = dictionaryById objectId >>= tryMappings
<|> (getField "Encoding" dictionary >>= loadEncoding) <|> (getField "Encoding" dictionary >>= loadEncoding)
<|> (fail $ unknownFormat (show objectId) (show dictionary)) <|> (fail $ unknownFormat (show objectId) (show dictionary))
unknownFormat = printf "Unknown font format for object #%s : %s" unknownFormat = printf "Unknown font format for object #%s : %s"
loadCMap :: Dictionary -> T Font
loadCMap dictionary = loadCMap dictionary =
getField "ToUnicode" dictionary >>= follow >>= openStream >>= cMap getField "ToUnicode" dictionary >>= follow >>= openStream >>= cMap
loadEncoding (NameObject (Name name)) = return $ encoding name loadEncoding :: DirectObject -> T Font
loadEncoding (NameObject (Name name)) = encoding name
loadEncoding directObject = loadEncoding directObject =
fail $ "Encoding must be a name, not that : %s" $ show directObject fail $ printf "Encoding must be a name, not that : %s" $ show directObject
loadFonts :: Dictionary -> T FontSet loadFonts :: Dictionary -> T FontSet
loadFonts = foldM addFont Map.empty . Map.toList loadFonts = foldM addFont Map.empty . Map.toList
@ -74,34 +81,13 @@ loadFonts = foldM addFont Map.empty . Map.toList
flip (Map.insert name) output <$> cache loadFont objectId flip (Map.insert name) output <$> cache loadFont objectId
addFont output _ = return output addFont output _ = return output
getObject :: ObjectId -> T Object several :: DirectObject -> [DirectObject]
getObject objectId = do several (Array l) = l
content <- ask several directObject = [directObject]
return (objects content ! objectId)
key :: String -> Dictionary -> Either String DirectObject
key keyName dictionary =
maybe (Left errorMessage) Right (Map.lookup (Name keyName) dictionary)
where
errorMessage =
printf "Key %s not found in dictionary %s" keyName (show dictionary)
many :: DirectObject -> [DirectObject]
many (Array l) = l
many directObject = [directObject]
{-
dict :: Object -> Either String Dictionary
dict (Direct (Dictionary dictionary)) = Right dictionary
dict obj = expected "dictionary" obj
dictObject :: String -> Dictionary -> T Dictionary
dictObject keyName dictionary = key keyName dictionary //= follow \\= dict
-}
pagesList :: T [ObjectId] pagesList :: T [ObjectId]
pagesList = do pagesList = do
pages <- origin >>= (// ["Root", "Pages"]) >>= getDictionary pages <- (origin :: T Dictionary) >>= (// ["Root", "Pages"]) >>= getDictionary
case Map.lookup (Name "Kids") pages of case Map.lookup (Name "Kids") pages of
Just (Array kids) -> return $ getReferences kids Just (Array kids) -> return $ getReferences kids
_ -> return [] _ -> return []
@ -116,14 +102,14 @@ getReferences objects = do
extractText :: Dictionary -> T [Text] extractText :: Dictionary -> T [Text]
extractText pageDict = do extractText pageDict = do
fonts <- loadFonts =<< getFontDictionary pageDict fonts <- loadFonts =<< getFontDictionary pageDict
let objects = ((many <$> (key "Contents" pageDict)) :: Either String [DirectObject]) objects <- several <$> getField "Contents" pageDict
concat <$> (objects //= (mapM $ loadContent fonts)) concat <$> mapM (loadContent fonts) objects
where where
loadContent :: FontSet -> DirectObject -> T [Text] loadContent :: FontSet -> DirectObject -> T [Text]
loadContent fonts directObject = loadContent fonts directObject =
follow directObject follow directObject
>>= openStream >>= openStream
>>= either fail return . Content.parse >>= (either fail return . Content.parse)
>>= renderText fonts >>= renderText fonts
loadPage :: ObjectId -> T Page loadPage :: ObjectId -> T Page
@ -132,7 +118,7 @@ loadPage source = do
return $ Page {contents, source} return $ Page {contents, source}
getAll :: Content -> Either String (Map Int Page) getAll :: Content -> Either String (Map Int Page)
getAll content = runExcept $ fst <$> evalRWST getPages content Map.empty getAll content = runError $ fst <$> evalRWST getPages content Map.empty
where where
numbered = Map.fromList . zip [1..] numbered = Map.fromList . zip [1..]
getPages = numbered <$> (mapM loadPage =<< pagesList) getPages = numbered <$> (mapM loadPage =<< pagesList)
@ -140,7 +126,7 @@ getAll content = runExcept $ fst <$> evalRWST getPages content Map.empty
get :: Content -> Int -> Either String Page get :: Content -> Int -> Either String Page
get content pageNumber get content pageNumber
| pageNumber < 1 = Left "Pages start at 1" | pageNumber < 1 = Left "Pages start at 1"
| otherwise = runExcept $ fst <$> evalRWST getPage content Map.empty | otherwise = runError $ fst <$> evalRWST getPage content Map.empty
where where
firstPage [] = fail "Page is out of bounds" firstPage [] = fail "Page is out of bounds"
firstPage (p:_) = loadPage p firstPage (p:_) = loadPage p