From 6ed57d66e8f86963c3db42d20f1582a72cf4cee7 Mon Sep 17 00:00:00 2001 From: Tissevert Date: Wed, 5 Feb 2020 17:42:17 +0100 Subject: [PATCH] Reimplement cMap as a type of Font and make the code ready for other Fonts --- src/PDF/CMap.hs | 35 ++++++++++++++++++++++++++++------- src/PDF/Font.hs | 14 ++++++++++---- src/PDF/Pages.hs | 35 ++++++++++++++++++----------------- src/PDF/Text.hs | 39 +++++++++++---------------------------- 4 files changed, 67 insertions(+), 56 deletions(-) diff --git a/src/PDF/CMap.hs b/src/PDF/CMap.hs index 4a99f00..712c990 100644 --- a/src/PDF/CMap.hs +++ b/src/PDF/CMap.hs @@ -1,4 +1,5 @@ {-# LANGUAGE NamedFieldPuns #-} +{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE TypeSynonymInstances #-} module PDF.CMap ( @@ -14,14 +15,17 @@ import Control.Applicative ((<|>), many) import Control.Monad.State (modify) import Data.Attoparsec.ByteString.Char8 (count) import Data.ByteString (ByteString) -import qualified Data.ByteString as BS (length) +import qualified Data.ByteString as BS (drop, length, null, take) +import Data.ByteString.Char8 (unpack) import Data.ByteString.Char8.Util ( B16Int(..), b16ToBytes, b16ToInt, toBytes, utf16BEToutf8 ) import Data.Map (Map, union) -import qualified Data.Map as Map (adjust, empty, fromList, insertWith) +import qualified Data.Map as Map ( + adjust, empty, fromList, insertWith, lookup, toList + ) import qualified PDF.EOL as EOL (charset, parser) -import PDF.Font (Font(..)) +import PDF.Font (Font) import PDF.Object ( DirectObject(..), Name, StringObject(..) , blank, directObject, integer, line, stringObject @@ -38,8 +42,25 @@ data CRange = CRange { type RangeSize = Int type CMap = Map RangeSize [CRange] -instance Font CMap where - decode = undefined +toFont :: CMap -> Font +toFont aCMap input + | BS.null input = Right "" + | otherwise = do + (output, remainingInput) <- trySizes input $ Map.toList aCMap + mappend output <$> toFont aCMap remainingInput + where + trySizes s [] = Left $ "No matching code found in font for " ++ unpack s + trySizes s ((size, cRanges):others) = + let prefix = BS.take size s in + case tryRanges prefix cRanges of + Nothing -> trySizes s others + Just outputSequence -> Right (outputSequence, BS.drop size s) + tryRanges :: ByteString -> [CRange] -> Maybe ByteString + tryRanges _ [] = Nothing + tryRanges prefix ((CRange {mapping}):cRanges) = + case Map.lookup prefix mapping of + Nothing -> tryRanges prefix cRanges + outputSequence -> outputSequence emptyCMap :: CMap emptyCMap = Map.empty @@ -48,8 +69,8 @@ matches :: ByteString -> CRange -> Bool matches code (CRange {fromSequence, toSequence}) = fromSequence <= code && code <= toSequence -cMap :: ByteString -> Either String CMap -cMap = fmap snd <$> runParser +cMap :: ByteString -> Either String Font +cMap = fmap (toFont . snd) <$> runParser (many (codeRanges <|> cMapRange <|> cMapChar <|> ignoredLine)) emptyCMap where diff --git a/src/PDF/Font.hs b/src/PDF/Font.hs index 0f23c7c..6d47503 100644 --- a/src/PDF/Font.hs +++ b/src/PDF/Font.hs @@ -1,9 +1,15 @@ module PDF.Font ( - Font(..) + Font + , FontSet + , emptyFont ) where import Data.ByteString (ByteString) -import PDF.Object (StringObject) +import Data.Map (Map) +import PDF.Object (Name) -class Font a where - decode :: a -> StringObject -> ByteString +type Font = ByteString -> Either String ByteString +type FontSet = Map Name Font + +emptyFont :: Font +emptyFont _ = Left "No fond loaded" diff --git a/src/PDF/Pages.hs b/src/PDF/Pages.hs index 035ce43..9fa6812 100755 --- a/src/PDF/Pages.hs +++ b/src/PDF/Pages.hs @@ -13,7 +13,8 @@ import Data.ByteString (ByteString) import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict) import Data.Map (Map, (!)) import qualified Data.Map as Map (empty, fromList, insert, lookup, toList) -import PDF.CMap (CMap, CMappers, cMap) +import PDF.CMap (cMap) +import PDF.Font (Font, FontSet) import PDF.Object ( Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..) , Object(..), Name(..), Structure(..) @@ -23,8 +24,8 @@ import PDF.Text (pageContents) import Prelude hiding (fail) import Text.Printf (printf) -type CachedCMaps = Map ObjectId CMap -type T = RWST Content () CachedCMaps (Either String) +type CachedFonts = Map ObjectId Font +type T = RWST Content () CachedFonts (Either String) data Page = Page { contents :: [ByteString] , source :: ObjectId @@ -60,14 +61,14 @@ getResource (Reference (IndirectObjCoordinates {objectId})) = getResource directObject = lift $ expected "resource (dictionary or reference)" directObject -getFont :: Dictionary -> T Dictionary -getFont pageDict = +getFontDictionary :: Dictionary -> T Dictionary +getFontDictionary pageDict = key "Resources" pageDict //= getResource \\= key "Font" >>= getResource -cache :: (ObjectId -> T CMap) -> ObjectId -> T CMap +cache :: (ObjectId -> T Font) -> ObjectId -> T Font cache loader objectId = (maybe load return . Map.lookup objectId) =<< RWS.get where @@ -76,7 +77,7 @@ cache loader objectId = modify $ Map.insert objectId value return value -loadFont :: ObjectId -> T CMap +loadFont :: ObjectId -> T Font loadFont objectId = getObject objectId \\= dict @@ -85,14 +86,14 @@ loadFont objectId = \\= stream \\= cMap -loadCMappers :: Dictionary -> T CMappers -loadCMappers = foldM loadCMapper Map.empty . Map.toList +loadFonts :: Dictionary -> T FontSet +loadFonts = foldM addFont Map.empty . Map.toList where - loadCMapper :: CMappers -> (Name, DirectObject) -> T CMappers - loadCMapper output (name, Reference (IndirectObjCoordinates {objectId})) = + addFont :: FontSet -> (Name, DirectObject) -> T FontSet + addFont output (name, Reference (IndirectObjCoordinates {objectId})) = flip (Map.insert name) output <$> cache loadFont objectId --maybe output (flip (Map.insert name) output) <$> cache loadFont objectId - loadCMapper output _ = return output + addFont output _ = return output getObject :: ObjectId -> T Object getObject objectId = do @@ -142,13 +143,13 @@ getReferences objects = do extractText :: Object -> T [ByteString] extractText object = do pageDict <- lift $ dict object - cMappers <- loadCMappers =<< getFont pageDict + fonts <- loadFonts =<< getFontDictionary pageDict let objects = ((many <$> (key "Contents" pageDict)) :: Either String [DirectObject]) - concat <$> (objects //= (mapM $ loadContent cMappers)) + concat <$> (objects //= (mapM $ loadContent fonts)) where - loadContent :: CMappers -> DirectObject -> T [ByteString] - loadContent cMappers directObject = - follow directObject \\= stream \\= pageContents cMappers + loadContent :: FontSet -> DirectObject -> T [ByteString] + loadContent fonts directObject = + follow directObject \\= stream \\= pageContents fonts loadPage :: ObjectId -> T Page loadPage source = diff --git a/src/PDF/Text.hs b/src/PDF/Text.hs index a2c7d45..bdceb64 100644 --- a/src/PDF/Text.hs +++ b/src/PDF/Text.hs @@ -8,15 +8,15 @@ module PDF.Text {-( import Control.Applicative ((<|>)) import Control.Monad (foldM) +import Control.Monad.Fail (MonadFail) import Control.Monad.Reader (ReaderT, runReaderT, asks) import Control.Monad.State (MonadState, evalStateT, get, modify, put) import Data.Attoparsec.ByteString.Char8 (choice, sepBy) import Data.ByteString (ByteString) -import qualified Data.ByteString as BS (drop, null, take) -import Data.ByteString.Char8 (pack, unpack) +import Data.ByteString.Char8 (pack) import Data.Map ((!), (!?), Map) -import qualified Data.Map as Map (fromList, lookup, toList) -import PDF.CMap (CMappers, CMap, CRange(..), emptyCMap) +import qualified Data.Map as Map (fromList) +import PDF.Font (Font, FontSet, emptyFont) import PDF.Object ( DirectObject(..), StringObject(..) , array, blank, name, regular, stringObject, toByteString @@ -170,11 +170,11 @@ nameArg = Typed . NameObject <$> name <* blank stringArg :: MonadParser m => m Argument stringArg = Typed . StringObject <$> stringObject <* blank -type ParserWithFont = ReaderT CMappers (Parser CMap) +type ParserWithFont = ReaderT FontSet (Parser Font) -pageContents :: CMappers -> ByteString -> Either String [ByteString] +pageContents :: FontSet -> ByteString -> Either String [ByteString] pageContents font input = - evalParser (runReaderT page font) emptyCMap input + evalParser (runReaderT page font) emptyFont input page :: ParserWithFont [ByteString] page = graphicState <|> text "Text page contents" @@ -216,24 +216,7 @@ runOperator (DQuote, [Typed (StringObject outputString)]) = runOperator _ = return [] -decodeString :: StringObject -> ParserWithFont ByteString -decodeString = decode . toByteString - where - decode input - | BS.null input = return "" - | otherwise = do - (output, remainingInput) <- trySizes input =<< Map.toList <$> get - mappend output <$> decode remainingInput - trySizes :: ByteString -> [(Int, [CRange])] -> ParserWithFont (ByteString, ByteString) - trySizes s [] = fail $ "No matching code found in font for " ++ unpack s - trySizes s ((size, cRanges):others) = - let prefix = BS.take size s in - case tryRanges prefix cRanges of - Nothing -> trySizes s others - Just outputSequence -> return (outputSequence, BS.drop size s) - tryRanges :: ByteString -> [CRange] -> Maybe ByteString - tryRanges _ [] = Nothing - tryRanges prefix ((CRange {mapping}):cRanges) = - case Map.lookup prefix mapping of - Nothing -> tryRanges prefix cRanges - outputSequence -> outputSequence +decodeString :: (MonadFail m, MonadState Font m) => StringObject -> m ByteString +decodeString input = do + font <- get + either fail return . font $ toByteString input