Reimplement cMap as a type of Font and make the code ready for other Fonts

This commit is contained in:
Tissevert 2020-02-05 17:42:17 +01:00
parent 22cde37025
commit 6ed57d66e8
4 changed files with 67 additions and 56 deletions

View file

@ -1,4 +1,5 @@
{-# LANGUAGE NamedFieldPuns #-} {-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE TypeSynonymInstances #-} {-# LANGUAGE TypeSynonymInstances #-}
module PDF.CMap ( module PDF.CMap (
@ -14,14 +15,17 @@ import Control.Applicative ((<|>), many)
import Control.Monad.State (modify) import Control.Monad.State (modify)
import Data.Attoparsec.ByteString.Char8 (count) import Data.Attoparsec.ByteString.Char8 (count)
import Data.ByteString (ByteString) import Data.ByteString (ByteString)
import qualified Data.ByteString as BS (length) import qualified Data.ByteString as BS (drop, length, null, take)
import Data.ByteString.Char8 (unpack)
import Data.ByteString.Char8.Util ( import Data.ByteString.Char8.Util (
B16Int(..), b16ToBytes, b16ToInt, toBytes, utf16BEToutf8 B16Int(..), b16ToBytes, b16ToInt, toBytes, utf16BEToutf8
) )
import Data.Map (Map, union) import Data.Map (Map, union)
import qualified Data.Map as Map (adjust, empty, fromList, insertWith) import qualified Data.Map as Map (
adjust, empty, fromList, insertWith, lookup, toList
)
import qualified PDF.EOL as EOL (charset, parser) import qualified PDF.EOL as EOL (charset, parser)
import PDF.Font (Font(..)) import PDF.Font (Font)
import PDF.Object ( import PDF.Object (
DirectObject(..), Name, StringObject(..) DirectObject(..), Name, StringObject(..)
, blank, directObject, integer, line, stringObject , blank, directObject, integer, line, stringObject
@ -38,8 +42,25 @@ data CRange = CRange {
type RangeSize = Int type RangeSize = Int
type CMap = Map RangeSize [CRange] type CMap = Map RangeSize [CRange]
instance Font CMap where toFont :: CMap -> Font
decode = undefined toFont aCMap input
| BS.null input = Right ""
| otherwise = do
(output, remainingInput) <- trySizes input $ Map.toList aCMap
mappend output <$> toFont aCMap remainingInput
where
trySizes s [] = Left $ "No matching code found in font for " ++ unpack s
trySizes s ((size, cRanges):others) =
let prefix = BS.take size s in
case tryRanges prefix cRanges of
Nothing -> trySizes s others
Just outputSequence -> Right (outputSequence, BS.drop size s)
tryRanges :: ByteString -> [CRange] -> Maybe ByteString
tryRanges _ [] = Nothing
tryRanges prefix ((CRange {mapping}):cRanges) =
case Map.lookup prefix mapping of
Nothing -> tryRanges prefix cRanges
outputSequence -> outputSequence
emptyCMap :: CMap emptyCMap :: CMap
emptyCMap = Map.empty emptyCMap = Map.empty
@ -48,8 +69,8 @@ matches :: ByteString -> CRange -> Bool
matches code (CRange {fromSequence, toSequence}) = matches code (CRange {fromSequence, toSequence}) =
fromSequence <= code && code <= toSequence fromSequence <= code && code <= toSequence
cMap :: ByteString -> Either String CMap cMap :: ByteString -> Either String Font
cMap = fmap snd <$> runParser cMap = fmap (toFont . snd) <$> runParser
(many (codeRanges <|> cMapRange <|> cMapChar <|> ignoredLine)) (many (codeRanges <|> cMapRange <|> cMapChar <|> ignoredLine))
emptyCMap emptyCMap
where where

View file

@ -1,9 +1,15 @@
module PDF.Font ( module PDF.Font (
Font(..) Font
, FontSet
, emptyFont
) where ) where
import Data.ByteString (ByteString) import Data.ByteString (ByteString)
import PDF.Object (StringObject) import Data.Map (Map)
import PDF.Object (Name)
class Font a where type Font = ByteString -> Either String ByteString
decode :: a -> StringObject -> ByteString type FontSet = Map Name Font
emptyFont :: Font
emptyFont _ = Left "No fond loaded"

View file

@ -13,7 +13,8 @@ import Data.ByteString (ByteString)
import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict) import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict)
import Data.Map (Map, (!)) import Data.Map (Map, (!))
import qualified Data.Map as Map (empty, fromList, insert, lookup, toList) import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
import PDF.CMap (CMap, CMappers, cMap) import PDF.CMap (cMap)
import PDF.Font (Font, FontSet)
import PDF.Object ( import PDF.Object (
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..) Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
, Object(..), Name(..), Structure(..) , Object(..), Name(..), Structure(..)
@ -23,8 +24,8 @@ import PDF.Text (pageContents)
import Prelude hiding (fail) import Prelude hiding (fail)
import Text.Printf (printf) import Text.Printf (printf)
type CachedCMaps = Map ObjectId CMap type CachedFonts = Map ObjectId Font
type T = RWST Content () CachedCMaps (Either String) type T = RWST Content () CachedFonts (Either String)
data Page = Page { data Page = Page {
contents :: [ByteString] contents :: [ByteString]
, source :: ObjectId , source :: ObjectId
@ -60,14 +61,14 @@ getResource (Reference (IndirectObjCoordinates {objectId})) =
getResource directObject = getResource directObject =
lift $ expected "resource (dictionary or reference)" directObject lift $ expected "resource (dictionary or reference)" directObject
getFont :: Dictionary -> T Dictionary getFontDictionary :: Dictionary -> T Dictionary
getFont pageDict = getFontDictionary pageDict =
key "Resources" pageDict key "Resources" pageDict
//= getResource //= getResource
\\= key "Font" \\= key "Font"
>>= getResource >>= getResource
cache :: (ObjectId -> T CMap) -> ObjectId -> T CMap cache :: (ObjectId -> T Font) -> ObjectId -> T Font
cache loader objectId = cache loader objectId =
(maybe load return . Map.lookup objectId) =<< RWS.get (maybe load return . Map.lookup objectId) =<< RWS.get
where where
@ -76,7 +77,7 @@ cache loader objectId =
modify $ Map.insert objectId value modify $ Map.insert objectId value
return value return value
loadFont :: ObjectId -> T CMap loadFont :: ObjectId -> T Font
loadFont objectId = loadFont objectId =
getObject objectId getObject objectId
\\= dict \\= dict
@ -85,14 +86,14 @@ loadFont objectId =
\\= stream \\= stream
\\= cMap \\= cMap
loadCMappers :: Dictionary -> T CMappers loadFonts :: Dictionary -> T FontSet
loadCMappers = foldM loadCMapper Map.empty . Map.toList loadFonts = foldM addFont Map.empty . Map.toList
where where
loadCMapper :: CMappers -> (Name, DirectObject) -> T CMappers addFont :: FontSet -> (Name, DirectObject) -> T FontSet
loadCMapper output (name, Reference (IndirectObjCoordinates {objectId})) = addFont output (name, Reference (IndirectObjCoordinates {objectId})) =
flip (Map.insert name) output <$> cache loadFont objectId flip (Map.insert name) output <$> cache loadFont objectId
--maybe output (flip (Map.insert name) output) <$> cache loadFont objectId --maybe output (flip (Map.insert name) output) <$> cache loadFont objectId
loadCMapper output _ = return output addFont output _ = return output
getObject :: ObjectId -> T Object getObject :: ObjectId -> T Object
getObject objectId = do getObject objectId = do
@ -142,13 +143,13 @@ getReferences objects = do
extractText :: Object -> T [ByteString] extractText :: Object -> T [ByteString]
extractText object = do extractText object = do
pageDict <- lift $ dict object pageDict <- lift $ dict object
cMappers <- loadCMappers =<< getFont pageDict fonts <- loadFonts =<< getFontDictionary pageDict
let objects = ((many <$> (key "Contents" pageDict)) :: Either String [DirectObject]) let objects = ((many <$> (key "Contents" pageDict)) :: Either String [DirectObject])
concat <$> (objects //= (mapM $ loadContent cMappers)) concat <$> (objects //= (mapM $ loadContent fonts))
where where
loadContent :: CMappers -> DirectObject -> T [ByteString] loadContent :: FontSet -> DirectObject -> T [ByteString]
loadContent cMappers directObject = loadContent fonts directObject =
follow directObject \\= stream \\= pageContents cMappers follow directObject \\= stream \\= pageContents fonts
loadPage :: ObjectId -> T Page loadPage :: ObjectId -> T Page
loadPage source = loadPage source =

View file

@ -8,15 +8,15 @@ module PDF.Text {-(
import Control.Applicative ((<|>)) import Control.Applicative ((<|>))
import Control.Monad (foldM) import Control.Monad (foldM)
import Control.Monad.Fail (MonadFail)
import Control.Monad.Reader (ReaderT, runReaderT, asks) import Control.Monad.Reader (ReaderT, runReaderT, asks)
import Control.Monad.State (MonadState, evalStateT, get, modify, put) import Control.Monad.State (MonadState, evalStateT, get, modify, put)
import Data.Attoparsec.ByteString.Char8 (choice, sepBy) import Data.Attoparsec.ByteString.Char8 (choice, sepBy)
import Data.ByteString (ByteString) import Data.ByteString (ByteString)
import qualified Data.ByteString as BS (drop, null, take) import Data.ByteString.Char8 (pack)
import Data.ByteString.Char8 (pack, unpack)
import Data.Map ((!), (!?), Map) import Data.Map ((!), (!?), Map)
import qualified Data.Map as Map (fromList, lookup, toList) import qualified Data.Map as Map (fromList)
import PDF.CMap (CMappers, CMap, CRange(..), emptyCMap) import PDF.Font (Font, FontSet, emptyFont)
import PDF.Object ( import PDF.Object (
DirectObject(..), StringObject(..) DirectObject(..), StringObject(..)
, array, blank, name, regular, stringObject, toByteString , array, blank, name, regular, stringObject, toByteString
@ -170,11 +170,11 @@ nameArg = Typed . NameObject <$> name <* blank
stringArg :: MonadParser m => m Argument stringArg :: MonadParser m => m Argument
stringArg = Typed . StringObject <$> stringObject <* blank stringArg = Typed . StringObject <$> stringObject <* blank
type ParserWithFont = ReaderT CMappers (Parser CMap) type ParserWithFont = ReaderT FontSet (Parser Font)
pageContents :: CMappers -> ByteString -> Either String [ByteString] pageContents :: FontSet -> ByteString -> Either String [ByteString]
pageContents font input = pageContents font input =
evalParser (runReaderT page font) emptyCMap input evalParser (runReaderT page font) emptyFont input
page :: ParserWithFont [ByteString] page :: ParserWithFont [ByteString]
page = graphicState <|> text <?> "Text page contents" page = graphicState <|> text <?> "Text page contents"
@ -216,24 +216,7 @@ runOperator (DQuote, [Typed (StringObject outputString)]) =
runOperator _ = return [] runOperator _ = return []
decodeString :: StringObject -> ParserWithFont ByteString decodeString :: (MonadFail m, MonadState Font m) => StringObject -> m ByteString
decodeString = decode . toByteString decodeString input = do
where font <- get
decode input either fail return . font $ toByteString input
| BS.null input = return ""
| otherwise = do
(output, remainingInput) <- trySizes input =<< Map.toList <$> get
mappend output <$> decode remainingInput
trySizes :: ByteString -> [(Int, [CRange])] -> ParserWithFont (ByteString, ByteString)
trySizes s [] = fail $ "No matching code found in font for " ++ unpack s
trySizes s ((size, cRanges):others) =
let prefix = BS.take size s in
case tryRanges prefix cRanges of
Nothing -> trySizes s others
Just outputSequence -> return (outputSequence, BS.drop size s)
tryRanges :: ByteString -> [CRange] -> Maybe ByteString
tryRanges _ [] = Nothing
tryRanges prefix ((CRange {mapping}):cRanges) =
case Map.lookup prefix mapping of
Nothing -> tryRanges prefix cRanges
outputSequence -> outputSequence