Finish the split of Navigation out of Page, generalize the use of MonadFail with a custom Error monad (~= Either String)
This commit is contained in:
parent
af994cb50c
commit
eb4d76002c
5 changed files with 47 additions and 60 deletions
|
@ -1,7 +1,5 @@
|
||||||
{-# LANGUAGE NamedFieldPuns #-}
|
{-# LANGUAGE NamedFieldPuns #-}
|
||||||
{-# LANGUAGE OverloadedStrings #-}
|
{-# LANGUAGE OverloadedStrings #-}
|
||||||
{-# LANGUAGE FlexibleInstances #-}
|
|
||||||
{-# LANGUAGE TypeSynonymInstances #-}
|
|
||||||
module PDF.CMap (
|
module PDF.CMap (
|
||||||
CMap
|
CMap
|
||||||
, CMappers
|
, CMappers
|
||||||
|
|
|
@ -1,12 +1,15 @@
|
||||||
{-# LANGUAGE OverloadedStrings #-}
|
{-# LANGUAGE OverloadedStrings #-}
|
||||||
|
{-# LANGUAGE ConstraintKinds #-}
|
||||||
|
{-# LANGUAGE FlexibleContexts #-}
|
||||||
module PDF.Content.Text (
|
module PDF.Content.Text (
|
||||||
format
|
format
|
||||||
, renderText
|
, renderText
|
||||||
) where
|
) where
|
||||||
|
|
||||||
import Control.Monad (foldM)
|
import Control.Monad (foldM)
|
||||||
import Control.Monad.Reader (ReaderT, asks, runReaderT)
|
import Control.Monad.Fail (MonadFail(..))
|
||||||
import Control.Monad.State (MonadState(..), StateT, evalStateT)
|
import Control.Monad.Reader (MonadReader(..), asks, runReaderT)
|
||||||
|
import Control.Monad.State (MonadState(..), evalStateT)
|
||||||
import Data.ByteString.Char8 (pack)
|
import Data.ByteString.Char8 (pack)
|
||||||
import Data.Map ((!))
|
import Data.Map ((!))
|
||||||
import Data.Text (Text)
|
import Data.Text (Text)
|
||||||
|
@ -15,31 +18,32 @@ import PDF.Content.Operator (Instruction, Operator(..))
|
||||||
import PDF.Content.Operator.Text (Operator(..))
|
import PDF.Content.Operator.Text (Operator(..))
|
||||||
import PDF.Font (Font, FontSet, emptyFont)
|
import PDF.Font (Font, FontSet, emptyFont)
|
||||||
import PDF.Object (DirectObject(..), StringObject(..), toByteString)
|
import PDF.Object (DirectObject(..), StringObject(..), toByteString)
|
||||||
|
import Prelude hiding (fail)
|
||||||
|
|
||||||
type RenderingContext = ReaderT FontSet (Either String)
|
type TextContent m = (MonadReader FontSet m, MonadFail m)
|
||||||
type TextRenderingContext = StateT Font RenderingContext
|
type FontContext m = (MonadState Font m, TextContent m)
|
||||||
|
|
||||||
decodeString :: StringObject -> TextRenderingContext Text
|
decodeString :: FontContext m => StringObject -> m Text
|
||||||
decodeString input = do
|
decodeString input = do
|
||||||
font <- get
|
font <- get
|
||||||
either fail return . font $ toByteString input
|
either fail return . font $ toByteString input
|
||||||
|
|
||||||
renderText :: FontSet -> Content -> Either String [Text]
|
renderText :: MonadFail m => FontSet -> Content -> m [Text]
|
||||||
renderText fontSet (Content contentUnits) =
|
renderText fontSet (Content contentUnits) =
|
||||||
runReaderT (concat <$> mapM renderContentUnit contentUnits) fontSet
|
runReaderT (concat <$> mapM renderContentUnit contentUnits) fontSet
|
||||||
|
|
||||||
renderContentUnit :: ContentUnit -> RenderingContext [Text]
|
renderContentUnit :: TextContent m => ContentUnit -> m [Text]
|
||||||
renderContentUnit (GraphicContext graphicContextUnits) =
|
renderContentUnit (GraphicContext graphicContextUnits) =
|
||||||
concat <$> mapM renderGraphicContextUnit graphicContextUnits
|
concat <$> mapM renderGraphicContextUnit graphicContextUnits
|
||||||
renderContentUnit (TextContext instructions) =
|
renderContentUnit (TextContext instructions) =
|
||||||
evalStateT (concat <$> mapM renderInstruction instructions) emptyFont
|
evalStateT (concat <$> mapM renderInstruction instructions) emptyFont
|
||||||
|
|
||||||
renderGraphicContextUnit :: GraphicContextUnit -> RenderingContext [Text]
|
renderGraphicContextUnit :: TextContent m => GraphicContextUnit -> m [Text]
|
||||||
renderGraphicContextUnit (GraphicInstruction _) = return []
|
renderGraphicContextUnit (GraphicInstruction _) = return []
|
||||||
renderGraphicContextUnit (ContentUnit contentUnit) =
|
renderGraphicContextUnit (ContentUnit contentUnit) =
|
||||||
renderContentUnit contentUnit
|
renderContentUnit contentUnit
|
||||||
|
|
||||||
renderInstruction :: Instruction -> StateT Font RenderingContext [Text]
|
renderInstruction :: FontContext m => Instruction -> m [Text]
|
||||||
renderInstruction (Text Tf, [NameObject fontName, _]) =
|
renderInstruction (Text Tf, [NameObject fontName, _]) =
|
||||||
asks (! fontName) >>= put >> return []
|
asks (! fontName) >>= put >> return []
|
||||||
|
|
||||||
|
|
|
@ -2,9 +2,11 @@ module PDF.Encoding (
|
||||||
encoding
|
encoding
|
||||||
) where
|
) where
|
||||||
|
|
||||||
|
import Control.Monad.Fail (MonadFail(..))
|
||||||
import PDF.Encoding.MacRoman (macRomanEncoding)
|
import PDF.Encoding.MacRoman (macRomanEncoding)
|
||||||
import PDF.Font (Font)
|
import PDF.Font (Font)
|
||||||
|
import Prelude hiding (fail)
|
||||||
|
|
||||||
encoding :: String -> Either String Font
|
encoding :: MonadFail m => String -> m Font
|
||||||
encoding "MacRomanEncoding" = Right macRomanEncoding
|
encoding "MacRomanEncoding" = return macRomanEncoding
|
||||||
encoding s = Left $ "Unknown encoding " ++ s
|
encoding s = fail $ "Unknown encoding " ++ s
|
||||||
|
|
|
@ -43,16 +43,13 @@ dictionaryById objectId = objectById objectId >>= castDictionary
|
||||||
expected :: (MonadFail m, Show a) => String -> a -> m b
|
expected :: (MonadFail m, Show a) => String -> a -> m b
|
||||||
expected name = fail . printf "Not a %s: %s" name . show
|
expected name = fail . printf "Not a %s: %s" name . show
|
||||||
|
|
||||||
lookupField :: String -> Dictionary -> Either String DirectObject
|
getField :: MonadFail m => String -> Dictionary -> m DirectObject
|
||||||
lookupField key aDictionary =
|
getField key aDictionary =
|
||||||
maybe (Left errorMessage) Right (Map.lookup (Name key) aDictionary)
|
maybe (fail errorMessage) return (Map.lookup (Name key) aDictionary)
|
||||||
where
|
where
|
||||||
errorMessage =
|
errorMessage =
|
||||||
printf "Key %s not found in dictionary %s" key (show aDictionary)
|
printf "Key %s not found in dictionary %s" key (show aDictionary)
|
||||||
|
|
||||||
getField :: MonadFail m => String -> Dictionary -> m DirectObject
|
|
||||||
getField key = either fail return . lookupField key
|
|
||||||
|
|
||||||
follow :: PDFContent m => DirectObject -> m Object
|
follow :: PDFContent m => DirectObject -> m Object
|
||||||
follow directObject = castObjectId directObject >>= objectById
|
follow directObject = castObjectId directObject >>= objectById
|
||||||
|
|
||||||
|
|
|
@ -1,18 +1,17 @@
|
||||||
{-# LANGUAGE NamedFieldPuns #-}
|
{-# LANGUAGE NamedFieldPuns #-}
|
||||||
|
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
|
||||||
module PDF.Pages (
|
module PDF.Pages (
|
||||||
Page(..)
|
Page(..)
|
||||||
, get
|
, get
|
||||||
, getAll
|
, getAll
|
||||||
) where
|
) where
|
||||||
|
|
||||||
import Control.Applicative ((<|>))
|
import Control.Applicative (Alternative(..), (<|>))
|
||||||
import Control.Monad (foldM)
|
import Control.Monad (MonadPlus(..), foldM)
|
||||||
import Control.Monad.Except (Except, runExcept)
|
|
||||||
import Control.Monad.Fail (MonadFail(..))
|
import Control.Monad.Fail (MonadFail(..))
|
||||||
import Control.Monad.RWS (RWST(..), ask, evalRWST, lift, modify)
|
import Control.Monad.RWS (RWST(..), evalRWST, modify)
|
||||||
import qualified Control.Monad.RWS as RWS (get)
|
import qualified Control.Monad.RWS as RWS (get)
|
||||||
import Data.ByteString (ByteString)
|
import Data.Map (Map)
|
||||||
import Data.Map (Map, (!))
|
|
||||||
import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
|
import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
|
||||||
import Data.Text (Text)
|
import Data.Text (Text)
|
||||||
import PDF.CMap (cMap)
|
import PDF.CMap (cMap)
|
||||||
|
@ -22,7 +21,7 @@ import PDF.Encoding (encoding)
|
||||||
import PDF.Font (Font, FontSet)
|
import PDF.Font (Font, FontSet)
|
||||||
import PDF.Object (
|
import PDF.Object (
|
||||||
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
|
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
|
||||||
, Object(..), Name(..), Structure(..)
|
, Name(..)
|
||||||
,)
|
,)
|
||||||
import PDF.Object.Navigation (
|
import PDF.Object.Navigation (
|
||||||
(//), dictionaryById, getDictionary, getField, follow, openStream, origin
|
(//), dictionaryById, getDictionary, getField, follow, openStream, origin
|
||||||
|
@ -32,12 +31,18 @@ import Prelude hiding (fail)
|
||||||
import Text.Printf (printf)
|
import Text.Printf (printf)
|
||||||
|
|
||||||
type CachedFonts = Map ObjectId Font
|
type CachedFonts = Map ObjectId Font
|
||||||
type T = RWST Content () CachedFonts (Except String)
|
newtype Error a = Error {
|
||||||
|
runError :: Either String a
|
||||||
|
} deriving (Alternative, Functor, Applicative, Monad, MonadPlus)
|
||||||
|
type T = RWST Content () CachedFonts Error
|
||||||
data Page = Page {
|
data Page = Page {
|
||||||
contents :: [Text]
|
contents :: [Text]
|
||||||
, source :: ObjectId
|
, source :: ObjectId
|
||||||
}
|
}
|
||||||
|
|
||||||
|
instance MonadFail Error where
|
||||||
|
fail = Error . Left
|
||||||
|
|
||||||
getFontDictionary :: Dictionary -> T Dictionary
|
getFontDictionary :: Dictionary -> T Dictionary
|
||||||
getFontDictionary pageDict =
|
getFontDictionary pageDict =
|
||||||
((pageDict // ["Resources", "Font"]) >>= getDictionary)
|
((pageDict // ["Resources", "Font"]) >>= getDictionary)
|
||||||
|
@ -60,11 +65,13 @@ loadFont objectId = dictionaryById objectId >>= tryMappings
|
||||||
<|> (getField "Encoding" dictionary >>= loadEncoding)
|
<|> (getField "Encoding" dictionary >>= loadEncoding)
|
||||||
<|> (fail $ unknownFormat (show objectId) (show dictionary))
|
<|> (fail $ unknownFormat (show objectId) (show dictionary))
|
||||||
unknownFormat = printf "Unknown font format for object #%s : %s"
|
unknownFormat = printf "Unknown font format for object #%s : %s"
|
||||||
|
loadCMap :: Dictionary -> T Font
|
||||||
loadCMap dictionary =
|
loadCMap dictionary =
|
||||||
getField "ToUnicode" dictionary >>= follow >>= openStream >>= cMap
|
getField "ToUnicode" dictionary >>= follow >>= openStream >>= cMap
|
||||||
loadEncoding (NameObject (Name name)) = return $ encoding name
|
loadEncoding :: DirectObject -> T Font
|
||||||
|
loadEncoding (NameObject (Name name)) = encoding name
|
||||||
loadEncoding directObject =
|
loadEncoding directObject =
|
||||||
fail $ "Encoding must be a name, not that : %s" $ show directObject
|
fail $ printf "Encoding must be a name, not that : %s" $ show directObject
|
||||||
|
|
||||||
loadFonts :: Dictionary -> T FontSet
|
loadFonts :: Dictionary -> T FontSet
|
||||||
loadFonts = foldM addFont Map.empty . Map.toList
|
loadFonts = foldM addFont Map.empty . Map.toList
|
||||||
|
@ -74,34 +81,13 @@ loadFonts = foldM addFont Map.empty . Map.toList
|
||||||
flip (Map.insert name) output <$> cache loadFont objectId
|
flip (Map.insert name) output <$> cache loadFont objectId
|
||||||
addFont output _ = return output
|
addFont output _ = return output
|
||||||
|
|
||||||
getObject :: ObjectId -> T Object
|
several :: DirectObject -> [DirectObject]
|
||||||
getObject objectId = do
|
several (Array l) = l
|
||||||
content <- ask
|
several directObject = [directObject]
|
||||||
return (objects content ! objectId)
|
|
||||||
|
|
||||||
key :: String -> Dictionary -> Either String DirectObject
|
|
||||||
key keyName dictionary =
|
|
||||||
maybe (Left errorMessage) Right (Map.lookup (Name keyName) dictionary)
|
|
||||||
where
|
|
||||||
errorMessage =
|
|
||||||
printf "Key %s not found in dictionary %s" keyName (show dictionary)
|
|
||||||
|
|
||||||
many :: DirectObject -> [DirectObject]
|
|
||||||
many (Array l) = l
|
|
||||||
many directObject = [directObject]
|
|
||||||
|
|
||||||
{-
|
|
||||||
dict :: Object -> Either String Dictionary
|
|
||||||
dict (Direct (Dictionary dictionary)) = Right dictionary
|
|
||||||
dict obj = expected "dictionary" obj
|
|
||||||
|
|
||||||
dictObject :: String -> Dictionary -> T Dictionary
|
|
||||||
dictObject keyName dictionary = key keyName dictionary //= follow \\= dict
|
|
||||||
-}
|
|
||||||
|
|
||||||
pagesList :: T [ObjectId]
|
pagesList :: T [ObjectId]
|
||||||
pagesList = do
|
pagesList = do
|
||||||
pages <- origin >>= (// ["Root", "Pages"]) >>= getDictionary
|
pages <- (origin :: T Dictionary) >>= (// ["Root", "Pages"]) >>= getDictionary
|
||||||
case Map.lookup (Name "Kids") pages of
|
case Map.lookup (Name "Kids") pages of
|
||||||
Just (Array kids) -> return $ getReferences kids
|
Just (Array kids) -> return $ getReferences kids
|
||||||
_ -> return []
|
_ -> return []
|
||||||
|
@ -116,14 +102,14 @@ getReferences objects = do
|
||||||
extractText :: Dictionary -> T [Text]
|
extractText :: Dictionary -> T [Text]
|
||||||
extractText pageDict = do
|
extractText pageDict = do
|
||||||
fonts <- loadFonts =<< getFontDictionary pageDict
|
fonts <- loadFonts =<< getFontDictionary pageDict
|
||||||
let objects = ((many <$> (key "Contents" pageDict)) :: Either String [DirectObject])
|
objects <- several <$> getField "Contents" pageDict
|
||||||
concat <$> (objects //= (mapM $ loadContent fonts))
|
concat <$> mapM (loadContent fonts) objects
|
||||||
where
|
where
|
||||||
loadContent :: FontSet -> DirectObject -> T [Text]
|
loadContent :: FontSet -> DirectObject -> T [Text]
|
||||||
loadContent fonts directObject =
|
loadContent fonts directObject =
|
||||||
follow directObject
|
follow directObject
|
||||||
>>= openStream
|
>>= openStream
|
||||||
>>= either fail return . Content.parse
|
>>= (either fail return . Content.parse)
|
||||||
>>= renderText fonts
|
>>= renderText fonts
|
||||||
|
|
||||||
loadPage :: ObjectId -> T Page
|
loadPage :: ObjectId -> T Page
|
||||||
|
@ -132,7 +118,7 @@ loadPage source = do
|
||||||
return $ Page {contents, source}
|
return $ Page {contents, source}
|
||||||
|
|
||||||
getAll :: Content -> Either String (Map Int Page)
|
getAll :: Content -> Either String (Map Int Page)
|
||||||
getAll content = runExcept $ fst <$> evalRWST getPages content Map.empty
|
getAll content = runError $ fst <$> evalRWST getPages content Map.empty
|
||||||
where
|
where
|
||||||
numbered = Map.fromList . zip [1..]
|
numbered = Map.fromList . zip [1..]
|
||||||
getPages = numbered <$> (mapM loadPage =<< pagesList)
|
getPages = numbered <$> (mapM loadPage =<< pagesList)
|
||||||
|
@ -140,7 +126,7 @@ getAll content = runExcept $ fst <$> evalRWST getPages content Map.empty
|
||||||
get :: Content -> Int -> Either String Page
|
get :: Content -> Int -> Either String Page
|
||||||
get content pageNumber
|
get content pageNumber
|
||||||
| pageNumber < 1 = Left "Pages start at 1"
|
| pageNumber < 1 = Left "Pages start at 1"
|
||||||
| otherwise = runExcept $ fst <$> evalRWST getPage content Map.empty
|
| otherwise = runError $ fst <$> evalRWST getPage content Map.empty
|
||||||
where
|
where
|
||||||
firstPage [] = fail "Page is out of bounds"
|
firstPage [] = fail "Page is out of bounds"
|
||||||
firstPage (p:_) = loadPage p
|
firstPage (p:_) = loadPage p
|
||||||
|
|
Loading…
Reference in a new issue