Reimplement cMap as a type of Font and make the code ready for other Fonts

This commit is contained in:
Tissevert 2020-02-05 17:42:17 +01:00
parent 22cde37025
commit 6ed57d66e8
4 changed files with 67 additions and 56 deletions

View file

@ -1,4 +1,5 @@
{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE TypeSynonymInstances #-}
module PDF.CMap (
@ -14,14 +15,17 @@ import Control.Applicative ((<|>), many)
import Control.Monad.State (modify)
import Data.Attoparsec.ByteString.Char8 (count)
import Data.ByteString (ByteString)
import qualified Data.ByteString as BS (length)
import qualified Data.ByteString as BS (drop, length, null, take)
import Data.ByteString.Char8 (unpack)
import Data.ByteString.Char8.Util (
B16Int(..), b16ToBytes, b16ToInt, toBytes, utf16BEToutf8
)
import Data.Map (Map, union)
import qualified Data.Map as Map (adjust, empty, fromList, insertWith)
import qualified Data.Map as Map (
adjust, empty, fromList, insertWith, lookup, toList
)
import qualified PDF.EOL as EOL (charset, parser)
import PDF.Font (Font(..))
import PDF.Font (Font)
import PDF.Object (
DirectObject(..), Name, StringObject(..)
, blank, directObject, integer, line, stringObject
@ -38,8 +42,25 @@ data CRange = CRange {
type RangeSize = Int
type CMap = Map RangeSize [CRange]
instance Font CMap where
decode = undefined
toFont :: CMap -> Font
toFont aCMap input
| BS.null input = Right ""
| otherwise = do
(output, remainingInput) <- trySizes input $ Map.toList aCMap
mappend output <$> toFont aCMap remainingInput
where
trySizes s [] = Left $ "No matching code found in font for " ++ unpack s
trySizes s ((size, cRanges):others) =
let prefix = BS.take size s in
case tryRanges prefix cRanges of
Nothing -> trySizes s others
Just outputSequence -> Right (outputSequence, BS.drop size s)
tryRanges :: ByteString -> [CRange] -> Maybe ByteString
tryRanges _ [] = Nothing
tryRanges prefix ((CRange {mapping}):cRanges) =
case Map.lookup prefix mapping of
Nothing -> tryRanges prefix cRanges
outputSequence -> outputSequence
emptyCMap :: CMap
emptyCMap = Map.empty
@ -48,8 +69,8 @@ matches :: ByteString -> CRange -> Bool
matches code (CRange {fromSequence, toSequence}) =
fromSequence <= code && code <= toSequence
cMap :: ByteString -> Either String CMap
cMap = fmap snd <$> runParser
cMap :: ByteString -> Either String Font
cMap = fmap (toFont . snd) <$> runParser
(many (codeRanges <|> cMapRange <|> cMapChar <|> ignoredLine))
emptyCMap
where

View file

@ -1,9 +1,15 @@
module PDF.Font (
Font(..)
Font
, FontSet
, emptyFont
) where
import Data.ByteString (ByteString)
import PDF.Object (StringObject)
import Data.Map (Map)
import PDF.Object (Name)
class Font a where
decode :: a -> StringObject -> ByteString
type Font = ByteString -> Either String ByteString
type FontSet = Map Name Font
emptyFont :: Font
emptyFont _ = Left "No fond loaded"

View file

@ -13,7 +13,8 @@ import Data.ByteString (ByteString)
import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict)
import Data.Map (Map, (!))
import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
import PDF.CMap (CMap, CMappers, cMap)
import PDF.CMap (cMap)
import PDF.Font (Font, FontSet)
import PDF.Object (
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
, Object(..), Name(..), Structure(..)
@ -23,8 +24,8 @@ import PDF.Text (pageContents)
import Prelude hiding (fail)
import Text.Printf (printf)
type CachedCMaps = Map ObjectId CMap
type T = RWST Content () CachedCMaps (Either String)
type CachedFonts = Map ObjectId Font
type T = RWST Content () CachedFonts (Either String)
data Page = Page {
contents :: [ByteString]
, source :: ObjectId
@ -60,14 +61,14 @@ getResource (Reference (IndirectObjCoordinates {objectId})) =
getResource directObject =
lift $ expected "resource (dictionary or reference)" directObject
getFont :: Dictionary -> T Dictionary
getFont pageDict =
getFontDictionary :: Dictionary -> T Dictionary
getFontDictionary pageDict =
key "Resources" pageDict
//= getResource
\\= key "Font"
>>= getResource
cache :: (ObjectId -> T CMap) -> ObjectId -> T CMap
cache :: (ObjectId -> T Font) -> ObjectId -> T Font
cache loader objectId =
(maybe load return . Map.lookup objectId) =<< RWS.get
where
@ -76,7 +77,7 @@ cache loader objectId =
modify $ Map.insert objectId value
return value
loadFont :: ObjectId -> T CMap
loadFont :: ObjectId -> T Font
loadFont objectId =
getObject objectId
\\= dict
@ -85,14 +86,14 @@ loadFont objectId =
\\= stream
\\= cMap
loadCMappers :: Dictionary -> T CMappers
loadCMappers = foldM loadCMapper Map.empty . Map.toList
loadFonts :: Dictionary -> T FontSet
loadFonts = foldM addFont Map.empty . Map.toList
where
loadCMapper :: CMappers -> (Name, DirectObject) -> T CMappers
loadCMapper output (name, Reference (IndirectObjCoordinates {objectId})) =
addFont :: FontSet -> (Name, DirectObject) -> T FontSet
addFont output (name, Reference (IndirectObjCoordinates {objectId})) =
flip (Map.insert name) output <$> cache loadFont objectId
--maybe output (flip (Map.insert name) output) <$> cache loadFont objectId
loadCMapper output _ = return output
addFont output _ = return output
getObject :: ObjectId -> T Object
getObject objectId = do
@ -142,13 +143,13 @@ getReferences objects = do
extractText :: Object -> T [ByteString]
extractText object = do
pageDict <- lift $ dict object
cMappers <- loadCMappers =<< getFont pageDict
fonts <- loadFonts =<< getFontDictionary pageDict
let objects = ((many <$> (key "Contents" pageDict)) :: Either String [DirectObject])
concat <$> (objects //= (mapM $ loadContent cMappers))
concat <$> (objects //= (mapM $ loadContent fonts))
where
loadContent :: CMappers -> DirectObject -> T [ByteString]
loadContent cMappers directObject =
follow directObject \\= stream \\= pageContents cMappers
loadContent :: FontSet -> DirectObject -> T [ByteString]
loadContent fonts directObject =
follow directObject \\= stream \\= pageContents fonts
loadPage :: ObjectId -> T Page
loadPage source =

View file

@ -8,15 +8,15 @@ module PDF.Text {-(
import Control.Applicative ((<|>))
import Control.Monad (foldM)
import Control.Monad.Fail (MonadFail)
import Control.Monad.Reader (ReaderT, runReaderT, asks)
import Control.Monad.State (MonadState, evalStateT, get, modify, put)
import Data.Attoparsec.ByteString.Char8 (choice, sepBy)
import Data.ByteString (ByteString)
import qualified Data.ByteString as BS (drop, null, take)
import Data.ByteString.Char8 (pack, unpack)
import Data.ByteString.Char8 (pack)
import Data.Map ((!), (!?), Map)
import qualified Data.Map as Map (fromList, lookup, toList)
import PDF.CMap (CMappers, CMap, CRange(..), emptyCMap)
import qualified Data.Map as Map (fromList)
import PDF.Font (Font, FontSet, emptyFont)
import PDF.Object (
DirectObject(..), StringObject(..)
, array, blank, name, regular, stringObject, toByteString
@ -170,11 +170,11 @@ nameArg = Typed . NameObject <$> name <* blank
stringArg :: MonadParser m => m Argument
stringArg = Typed . StringObject <$> stringObject <* blank
type ParserWithFont = ReaderT CMappers (Parser CMap)
type ParserWithFont = ReaderT FontSet (Parser Font)
pageContents :: CMappers -> ByteString -> Either String [ByteString]
pageContents :: FontSet -> ByteString -> Either String [ByteString]
pageContents font input =
evalParser (runReaderT page font) emptyCMap input
evalParser (runReaderT page font) emptyFont input
page :: ParserWithFont [ByteString]
page = graphicState <|> text <?> "Text page contents"
@ -216,24 +216,7 @@ runOperator (DQuote, [Typed (StringObject outputString)]) =
runOperator _ = return []
decodeString :: StringObject -> ParserWithFont ByteString
decodeString = decode . toByteString
where
decode input
| BS.null input = return ""
| otherwise = do
(output, remainingInput) <- trySizes input =<< Map.toList <$> get
mappend output <$> decode remainingInput
trySizes :: ByteString -> [(Int, [CRange])] -> ParserWithFont (ByteString, ByteString)
trySizes s [] = fail $ "No matching code found in font for " ++ unpack s
trySizes s ((size, cRanges):others) =
let prefix = BS.take size s in
case tryRanges prefix cRanges of
Nothing -> trySizes s others
Just outputSequence -> return (outputSequence, BS.drop size s)
tryRanges :: ByteString -> [CRange] -> Maybe ByteString
tryRanges _ [] = Nothing
tryRanges prefix ((CRange {mapping}):cRanges) =
case Map.lookup prefix mapping of
Nothing -> tryRanges prefix cRanges
outputSequence -> outputSequence
decodeString :: (MonadFail m, MonadState Font m) => StringObject -> m ByteString
decodeString input = do
font <- get
either fail return . font $ toByteString input