Reimplement cMap as a type of Font and make the code ready for other Fonts
This commit is contained in:
parent
22cde37025
commit
6ed57d66e8
4 changed files with 67 additions and 56 deletions
|
@ -1,4 +1,5 @@
|
|||
{-# LANGUAGE NamedFieldPuns #-}
|
||||
{-# LANGUAGE OverloadedStrings #-}
|
||||
{-# LANGUAGE FlexibleInstances #-}
|
||||
{-# LANGUAGE TypeSynonymInstances #-}
|
||||
module PDF.CMap (
|
||||
|
@ -14,14 +15,17 @@ import Control.Applicative ((<|>), many)
|
|||
import Control.Monad.State (modify)
|
||||
import Data.Attoparsec.ByteString.Char8 (count)
|
||||
import Data.ByteString (ByteString)
|
||||
import qualified Data.ByteString as BS (length)
|
||||
import qualified Data.ByteString as BS (drop, length, null, take)
|
||||
import Data.ByteString.Char8 (unpack)
|
||||
import Data.ByteString.Char8.Util (
|
||||
B16Int(..), b16ToBytes, b16ToInt, toBytes, utf16BEToutf8
|
||||
)
|
||||
import Data.Map (Map, union)
|
||||
import qualified Data.Map as Map (adjust, empty, fromList, insertWith)
|
||||
import qualified Data.Map as Map (
|
||||
adjust, empty, fromList, insertWith, lookup, toList
|
||||
)
|
||||
import qualified PDF.EOL as EOL (charset, parser)
|
||||
import PDF.Font (Font(..))
|
||||
import PDF.Font (Font)
|
||||
import PDF.Object (
|
||||
DirectObject(..), Name, StringObject(..)
|
||||
, blank, directObject, integer, line, stringObject
|
||||
|
@ -38,8 +42,25 @@ data CRange = CRange {
|
|||
type RangeSize = Int
|
||||
type CMap = Map RangeSize [CRange]
|
||||
|
||||
instance Font CMap where
|
||||
decode = undefined
|
||||
toFont :: CMap -> Font
|
||||
toFont aCMap input
|
||||
| BS.null input = Right ""
|
||||
| otherwise = do
|
||||
(output, remainingInput) <- trySizes input $ Map.toList aCMap
|
||||
mappend output <$> toFont aCMap remainingInput
|
||||
where
|
||||
trySizes s [] = Left $ "No matching code found in font for " ++ unpack s
|
||||
trySizes s ((size, cRanges):others) =
|
||||
let prefix = BS.take size s in
|
||||
case tryRanges prefix cRanges of
|
||||
Nothing -> trySizes s others
|
||||
Just outputSequence -> Right (outputSequence, BS.drop size s)
|
||||
tryRanges :: ByteString -> [CRange] -> Maybe ByteString
|
||||
tryRanges _ [] = Nothing
|
||||
tryRanges prefix ((CRange {mapping}):cRanges) =
|
||||
case Map.lookup prefix mapping of
|
||||
Nothing -> tryRanges prefix cRanges
|
||||
outputSequence -> outputSequence
|
||||
|
||||
emptyCMap :: CMap
|
||||
emptyCMap = Map.empty
|
||||
|
@ -48,8 +69,8 @@ matches :: ByteString -> CRange -> Bool
|
|||
matches code (CRange {fromSequence, toSequence}) =
|
||||
fromSequence <= code && code <= toSequence
|
||||
|
||||
cMap :: ByteString -> Either String CMap
|
||||
cMap = fmap snd <$> runParser
|
||||
cMap :: ByteString -> Either String Font
|
||||
cMap = fmap (toFont . snd) <$> runParser
|
||||
(many (codeRanges <|> cMapRange <|> cMapChar <|> ignoredLine))
|
||||
emptyCMap
|
||||
where
|
||||
|
|
|
@ -1,9 +1,15 @@
|
|||
module PDF.Font (
|
||||
Font(..)
|
||||
Font
|
||||
, FontSet
|
||||
, emptyFont
|
||||
) where
|
||||
|
||||
import Data.ByteString (ByteString)
|
||||
import PDF.Object (StringObject)
|
||||
import Data.Map (Map)
|
||||
import PDF.Object (Name)
|
||||
|
||||
class Font a where
|
||||
decode :: a -> StringObject -> ByteString
|
||||
type Font = ByteString -> Either String ByteString
|
||||
type FontSet = Map Name Font
|
||||
|
||||
emptyFont :: Font
|
||||
emptyFont _ = Left "No fond loaded"
|
||||
|
|
|
@ -13,7 +13,8 @@ import Data.ByteString (ByteString)
|
|||
import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict)
|
||||
import Data.Map (Map, (!))
|
||||
import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
|
||||
import PDF.CMap (CMap, CMappers, cMap)
|
||||
import PDF.CMap (cMap)
|
||||
import PDF.Font (Font, FontSet)
|
||||
import PDF.Object (
|
||||
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
|
||||
, Object(..), Name(..), Structure(..)
|
||||
|
@ -23,8 +24,8 @@ import PDF.Text (pageContents)
|
|||
import Prelude hiding (fail)
|
||||
import Text.Printf (printf)
|
||||
|
||||
type CachedCMaps = Map ObjectId CMap
|
||||
type T = RWST Content () CachedCMaps (Either String)
|
||||
type CachedFonts = Map ObjectId Font
|
||||
type T = RWST Content () CachedFonts (Either String)
|
||||
data Page = Page {
|
||||
contents :: [ByteString]
|
||||
, source :: ObjectId
|
||||
|
@ -60,14 +61,14 @@ getResource (Reference (IndirectObjCoordinates {objectId})) =
|
|||
getResource directObject =
|
||||
lift $ expected "resource (dictionary or reference)" directObject
|
||||
|
||||
getFont :: Dictionary -> T Dictionary
|
||||
getFont pageDict =
|
||||
getFontDictionary :: Dictionary -> T Dictionary
|
||||
getFontDictionary pageDict =
|
||||
key "Resources" pageDict
|
||||
//= getResource
|
||||
\\= key "Font"
|
||||
>>= getResource
|
||||
|
||||
cache :: (ObjectId -> T CMap) -> ObjectId -> T CMap
|
||||
cache :: (ObjectId -> T Font) -> ObjectId -> T Font
|
||||
cache loader objectId =
|
||||
(maybe load return . Map.lookup objectId) =<< RWS.get
|
||||
where
|
||||
|
@ -76,7 +77,7 @@ cache loader objectId =
|
|||
modify $ Map.insert objectId value
|
||||
return value
|
||||
|
||||
loadFont :: ObjectId -> T CMap
|
||||
loadFont :: ObjectId -> T Font
|
||||
loadFont objectId =
|
||||
getObject objectId
|
||||
\\= dict
|
||||
|
@ -85,14 +86,14 @@ loadFont objectId =
|
|||
\\= stream
|
||||
\\= cMap
|
||||
|
||||
loadCMappers :: Dictionary -> T CMappers
|
||||
loadCMappers = foldM loadCMapper Map.empty . Map.toList
|
||||
loadFonts :: Dictionary -> T FontSet
|
||||
loadFonts = foldM addFont Map.empty . Map.toList
|
||||
where
|
||||
loadCMapper :: CMappers -> (Name, DirectObject) -> T CMappers
|
||||
loadCMapper output (name, Reference (IndirectObjCoordinates {objectId})) =
|
||||
addFont :: FontSet -> (Name, DirectObject) -> T FontSet
|
||||
addFont output (name, Reference (IndirectObjCoordinates {objectId})) =
|
||||
flip (Map.insert name) output <$> cache loadFont objectId
|
||||
--maybe output (flip (Map.insert name) output) <$> cache loadFont objectId
|
||||
loadCMapper output _ = return output
|
||||
addFont output _ = return output
|
||||
|
||||
getObject :: ObjectId -> T Object
|
||||
getObject objectId = do
|
||||
|
@ -142,13 +143,13 @@ getReferences objects = do
|
|||
extractText :: Object -> T [ByteString]
|
||||
extractText object = do
|
||||
pageDict <- lift $ dict object
|
||||
cMappers <- loadCMappers =<< getFont pageDict
|
||||
fonts <- loadFonts =<< getFontDictionary pageDict
|
||||
let objects = ((many <$> (key "Contents" pageDict)) :: Either String [DirectObject])
|
||||
concat <$> (objects //= (mapM $ loadContent cMappers))
|
||||
concat <$> (objects //= (mapM $ loadContent fonts))
|
||||
where
|
||||
loadContent :: CMappers -> DirectObject -> T [ByteString]
|
||||
loadContent cMappers directObject =
|
||||
follow directObject \\= stream \\= pageContents cMappers
|
||||
loadContent :: FontSet -> DirectObject -> T [ByteString]
|
||||
loadContent fonts directObject =
|
||||
follow directObject \\= stream \\= pageContents fonts
|
||||
|
||||
loadPage :: ObjectId -> T Page
|
||||
loadPage source =
|
||||
|
|
|
@ -8,15 +8,15 @@ module PDF.Text {-(
|
|||
|
||||
import Control.Applicative ((<|>))
|
||||
import Control.Monad (foldM)
|
||||
import Control.Monad.Fail (MonadFail)
|
||||
import Control.Monad.Reader (ReaderT, runReaderT, asks)
|
||||
import Control.Monad.State (MonadState, evalStateT, get, modify, put)
|
||||
import Data.Attoparsec.ByteString.Char8 (choice, sepBy)
|
||||
import Data.ByteString (ByteString)
|
||||
import qualified Data.ByteString as BS (drop, null, take)
|
||||
import Data.ByteString.Char8 (pack, unpack)
|
||||
import Data.ByteString.Char8 (pack)
|
||||
import Data.Map ((!), (!?), Map)
|
||||
import qualified Data.Map as Map (fromList, lookup, toList)
|
||||
import PDF.CMap (CMappers, CMap, CRange(..), emptyCMap)
|
||||
import qualified Data.Map as Map (fromList)
|
||||
import PDF.Font (Font, FontSet, emptyFont)
|
||||
import PDF.Object (
|
||||
DirectObject(..), StringObject(..)
|
||||
, array, blank, name, regular, stringObject, toByteString
|
||||
|
@ -170,11 +170,11 @@ nameArg = Typed . NameObject <$> name <* blank
|
|||
stringArg :: MonadParser m => m Argument
|
||||
stringArg = Typed . StringObject <$> stringObject <* blank
|
||||
|
||||
type ParserWithFont = ReaderT CMappers (Parser CMap)
|
||||
type ParserWithFont = ReaderT FontSet (Parser Font)
|
||||
|
||||
pageContents :: CMappers -> ByteString -> Either String [ByteString]
|
||||
pageContents :: FontSet -> ByteString -> Either String [ByteString]
|
||||
pageContents font input =
|
||||
evalParser (runReaderT page font) emptyCMap input
|
||||
evalParser (runReaderT page font) emptyFont input
|
||||
|
||||
page :: ParserWithFont [ByteString]
|
||||
page = graphicState <|> text <?> "Text page contents"
|
||||
|
@ -216,24 +216,7 @@ runOperator (DQuote, [Typed (StringObject outputString)]) =
|
|||
|
||||
runOperator _ = return []
|
||||
|
||||
decodeString :: StringObject -> ParserWithFont ByteString
|
||||
decodeString = decode . toByteString
|
||||
where
|
||||
decode input
|
||||
| BS.null input = return ""
|
||||
| otherwise = do
|
||||
(output, remainingInput) <- trySizes input =<< Map.toList <$> get
|
||||
mappend output <$> decode remainingInput
|
||||
trySizes :: ByteString -> [(Int, [CRange])] -> ParserWithFont (ByteString, ByteString)
|
||||
trySizes s [] = fail $ "No matching code found in font for " ++ unpack s
|
||||
trySizes s ((size, cRanges):others) =
|
||||
let prefix = BS.take size s in
|
||||
case tryRanges prefix cRanges of
|
||||
Nothing -> trySizes s others
|
||||
Just outputSequence -> return (outputSequence, BS.drop size s)
|
||||
tryRanges :: ByteString -> [CRange] -> Maybe ByteString
|
||||
tryRanges _ [] = Nothing
|
||||
tryRanges prefix ((CRange {mapping}):cRanges) =
|
||||
case Map.lookup prefix mapping of
|
||||
Nothing -> tryRanges prefix cRanges
|
||||
outputSequence -> outputSequence
|
||||
decodeString :: (MonadFail m, MonadState Font m) => StringObject -> m ByteString
|
||||
decodeString input = do
|
||||
font <- get
|
||||
either fail return . font $ toByteString input
|
||||
|
|
Loading…
Reference in a new issue