Reimplement cMap as a type of Font and make the code ready for other Fonts
This commit is contained in:
parent
22cde37025
commit
6ed57d66e8
4 changed files with 67 additions and 56 deletions
|
@ -1,4 +1,5 @@
|
||||||
{-# LANGUAGE NamedFieldPuns #-}
|
{-# LANGUAGE NamedFieldPuns #-}
|
||||||
|
{-# LANGUAGE OverloadedStrings #-}
|
||||||
{-# LANGUAGE FlexibleInstances #-}
|
{-# LANGUAGE FlexibleInstances #-}
|
||||||
{-# LANGUAGE TypeSynonymInstances #-}
|
{-# LANGUAGE TypeSynonymInstances #-}
|
||||||
module PDF.CMap (
|
module PDF.CMap (
|
||||||
|
@ -14,14 +15,17 @@ import Control.Applicative ((<|>), many)
|
||||||
import Control.Monad.State (modify)
|
import Control.Monad.State (modify)
|
||||||
import Data.Attoparsec.ByteString.Char8 (count)
|
import Data.Attoparsec.ByteString.Char8 (count)
|
||||||
import Data.ByteString (ByteString)
|
import Data.ByteString (ByteString)
|
||||||
import qualified Data.ByteString as BS (length)
|
import qualified Data.ByteString as BS (drop, length, null, take)
|
||||||
|
import Data.ByteString.Char8 (unpack)
|
||||||
import Data.ByteString.Char8.Util (
|
import Data.ByteString.Char8.Util (
|
||||||
B16Int(..), b16ToBytes, b16ToInt, toBytes, utf16BEToutf8
|
B16Int(..), b16ToBytes, b16ToInt, toBytes, utf16BEToutf8
|
||||||
)
|
)
|
||||||
import Data.Map (Map, union)
|
import Data.Map (Map, union)
|
||||||
import qualified Data.Map as Map (adjust, empty, fromList, insertWith)
|
import qualified Data.Map as Map (
|
||||||
|
adjust, empty, fromList, insertWith, lookup, toList
|
||||||
|
)
|
||||||
import qualified PDF.EOL as EOL (charset, parser)
|
import qualified PDF.EOL as EOL (charset, parser)
|
||||||
import PDF.Font (Font(..))
|
import PDF.Font (Font)
|
||||||
import PDF.Object (
|
import PDF.Object (
|
||||||
DirectObject(..), Name, StringObject(..)
|
DirectObject(..), Name, StringObject(..)
|
||||||
, blank, directObject, integer, line, stringObject
|
, blank, directObject, integer, line, stringObject
|
||||||
|
@ -38,8 +42,25 @@ data CRange = CRange {
|
||||||
type RangeSize = Int
|
type RangeSize = Int
|
||||||
type CMap = Map RangeSize [CRange]
|
type CMap = Map RangeSize [CRange]
|
||||||
|
|
||||||
instance Font CMap where
|
toFont :: CMap -> Font
|
||||||
decode = undefined
|
toFont aCMap input
|
||||||
|
| BS.null input = Right ""
|
||||||
|
| otherwise = do
|
||||||
|
(output, remainingInput) <- trySizes input $ Map.toList aCMap
|
||||||
|
mappend output <$> toFont aCMap remainingInput
|
||||||
|
where
|
||||||
|
trySizes s [] = Left $ "No matching code found in font for " ++ unpack s
|
||||||
|
trySizes s ((size, cRanges):others) =
|
||||||
|
let prefix = BS.take size s in
|
||||||
|
case tryRanges prefix cRanges of
|
||||||
|
Nothing -> trySizes s others
|
||||||
|
Just outputSequence -> Right (outputSequence, BS.drop size s)
|
||||||
|
tryRanges :: ByteString -> [CRange] -> Maybe ByteString
|
||||||
|
tryRanges _ [] = Nothing
|
||||||
|
tryRanges prefix ((CRange {mapping}):cRanges) =
|
||||||
|
case Map.lookup prefix mapping of
|
||||||
|
Nothing -> tryRanges prefix cRanges
|
||||||
|
outputSequence -> outputSequence
|
||||||
|
|
||||||
emptyCMap :: CMap
|
emptyCMap :: CMap
|
||||||
emptyCMap = Map.empty
|
emptyCMap = Map.empty
|
||||||
|
@ -48,8 +69,8 @@ matches :: ByteString -> CRange -> Bool
|
||||||
matches code (CRange {fromSequence, toSequence}) =
|
matches code (CRange {fromSequence, toSequence}) =
|
||||||
fromSequence <= code && code <= toSequence
|
fromSequence <= code && code <= toSequence
|
||||||
|
|
||||||
cMap :: ByteString -> Either String CMap
|
cMap :: ByteString -> Either String Font
|
||||||
cMap = fmap snd <$> runParser
|
cMap = fmap (toFont . snd) <$> runParser
|
||||||
(many (codeRanges <|> cMapRange <|> cMapChar <|> ignoredLine))
|
(many (codeRanges <|> cMapRange <|> cMapChar <|> ignoredLine))
|
||||||
emptyCMap
|
emptyCMap
|
||||||
where
|
where
|
||||||
|
|
|
@ -1,9 +1,15 @@
|
||||||
module PDF.Font (
|
module PDF.Font (
|
||||||
Font(..)
|
Font
|
||||||
|
, FontSet
|
||||||
|
, emptyFont
|
||||||
) where
|
) where
|
||||||
|
|
||||||
import Data.ByteString (ByteString)
|
import Data.ByteString (ByteString)
|
||||||
import PDF.Object (StringObject)
|
import Data.Map (Map)
|
||||||
|
import PDF.Object (Name)
|
||||||
|
|
||||||
class Font a where
|
type Font = ByteString -> Either String ByteString
|
||||||
decode :: a -> StringObject -> ByteString
|
type FontSet = Map Name Font
|
||||||
|
|
||||||
|
emptyFont :: Font
|
||||||
|
emptyFont _ = Left "No fond loaded"
|
||||||
|
|
|
@ -13,7 +13,8 @@ import Data.ByteString (ByteString)
|
||||||
import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict)
|
import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict)
|
||||||
import Data.Map (Map, (!))
|
import Data.Map (Map, (!))
|
||||||
import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
|
import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
|
||||||
import PDF.CMap (CMap, CMappers, cMap)
|
import PDF.CMap (cMap)
|
||||||
|
import PDF.Font (Font, FontSet)
|
||||||
import PDF.Object (
|
import PDF.Object (
|
||||||
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
|
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
|
||||||
, Object(..), Name(..), Structure(..)
|
, Object(..), Name(..), Structure(..)
|
||||||
|
@ -23,8 +24,8 @@ import PDF.Text (pageContents)
|
||||||
import Prelude hiding (fail)
|
import Prelude hiding (fail)
|
||||||
import Text.Printf (printf)
|
import Text.Printf (printf)
|
||||||
|
|
||||||
type CachedCMaps = Map ObjectId CMap
|
type CachedFonts = Map ObjectId Font
|
||||||
type T = RWST Content () CachedCMaps (Either String)
|
type T = RWST Content () CachedFonts (Either String)
|
||||||
data Page = Page {
|
data Page = Page {
|
||||||
contents :: [ByteString]
|
contents :: [ByteString]
|
||||||
, source :: ObjectId
|
, source :: ObjectId
|
||||||
|
@ -60,14 +61,14 @@ getResource (Reference (IndirectObjCoordinates {objectId})) =
|
||||||
getResource directObject =
|
getResource directObject =
|
||||||
lift $ expected "resource (dictionary or reference)" directObject
|
lift $ expected "resource (dictionary or reference)" directObject
|
||||||
|
|
||||||
getFont :: Dictionary -> T Dictionary
|
getFontDictionary :: Dictionary -> T Dictionary
|
||||||
getFont pageDict =
|
getFontDictionary pageDict =
|
||||||
key "Resources" pageDict
|
key "Resources" pageDict
|
||||||
//= getResource
|
//= getResource
|
||||||
\\= key "Font"
|
\\= key "Font"
|
||||||
>>= getResource
|
>>= getResource
|
||||||
|
|
||||||
cache :: (ObjectId -> T CMap) -> ObjectId -> T CMap
|
cache :: (ObjectId -> T Font) -> ObjectId -> T Font
|
||||||
cache loader objectId =
|
cache loader objectId =
|
||||||
(maybe load return . Map.lookup objectId) =<< RWS.get
|
(maybe load return . Map.lookup objectId) =<< RWS.get
|
||||||
where
|
where
|
||||||
|
@ -76,7 +77,7 @@ cache loader objectId =
|
||||||
modify $ Map.insert objectId value
|
modify $ Map.insert objectId value
|
||||||
return value
|
return value
|
||||||
|
|
||||||
loadFont :: ObjectId -> T CMap
|
loadFont :: ObjectId -> T Font
|
||||||
loadFont objectId =
|
loadFont objectId =
|
||||||
getObject objectId
|
getObject objectId
|
||||||
\\= dict
|
\\= dict
|
||||||
|
@ -85,14 +86,14 @@ loadFont objectId =
|
||||||
\\= stream
|
\\= stream
|
||||||
\\= cMap
|
\\= cMap
|
||||||
|
|
||||||
loadCMappers :: Dictionary -> T CMappers
|
loadFonts :: Dictionary -> T FontSet
|
||||||
loadCMappers = foldM loadCMapper Map.empty . Map.toList
|
loadFonts = foldM addFont Map.empty . Map.toList
|
||||||
where
|
where
|
||||||
loadCMapper :: CMappers -> (Name, DirectObject) -> T CMappers
|
addFont :: FontSet -> (Name, DirectObject) -> T FontSet
|
||||||
loadCMapper output (name, Reference (IndirectObjCoordinates {objectId})) =
|
addFont output (name, Reference (IndirectObjCoordinates {objectId})) =
|
||||||
flip (Map.insert name) output <$> cache loadFont objectId
|
flip (Map.insert name) output <$> cache loadFont objectId
|
||||||
--maybe output (flip (Map.insert name) output) <$> cache loadFont objectId
|
--maybe output (flip (Map.insert name) output) <$> cache loadFont objectId
|
||||||
loadCMapper output _ = return output
|
addFont output _ = return output
|
||||||
|
|
||||||
getObject :: ObjectId -> T Object
|
getObject :: ObjectId -> T Object
|
||||||
getObject objectId = do
|
getObject objectId = do
|
||||||
|
@ -142,13 +143,13 @@ getReferences objects = do
|
||||||
extractText :: Object -> T [ByteString]
|
extractText :: Object -> T [ByteString]
|
||||||
extractText object = do
|
extractText object = do
|
||||||
pageDict <- lift $ dict object
|
pageDict <- lift $ dict object
|
||||||
cMappers <- loadCMappers =<< getFont pageDict
|
fonts <- loadFonts =<< getFontDictionary pageDict
|
||||||
let objects = ((many <$> (key "Contents" pageDict)) :: Either String [DirectObject])
|
let objects = ((many <$> (key "Contents" pageDict)) :: Either String [DirectObject])
|
||||||
concat <$> (objects //= (mapM $ loadContent cMappers))
|
concat <$> (objects //= (mapM $ loadContent fonts))
|
||||||
where
|
where
|
||||||
loadContent :: CMappers -> DirectObject -> T [ByteString]
|
loadContent :: FontSet -> DirectObject -> T [ByteString]
|
||||||
loadContent cMappers directObject =
|
loadContent fonts directObject =
|
||||||
follow directObject \\= stream \\= pageContents cMappers
|
follow directObject \\= stream \\= pageContents fonts
|
||||||
|
|
||||||
loadPage :: ObjectId -> T Page
|
loadPage :: ObjectId -> T Page
|
||||||
loadPage source =
|
loadPage source =
|
||||||
|
|
|
@ -8,15 +8,15 @@ module PDF.Text {-(
|
||||||
|
|
||||||
import Control.Applicative ((<|>))
|
import Control.Applicative ((<|>))
|
||||||
import Control.Monad (foldM)
|
import Control.Monad (foldM)
|
||||||
|
import Control.Monad.Fail (MonadFail)
|
||||||
import Control.Monad.Reader (ReaderT, runReaderT, asks)
|
import Control.Monad.Reader (ReaderT, runReaderT, asks)
|
||||||
import Control.Monad.State (MonadState, evalStateT, get, modify, put)
|
import Control.Monad.State (MonadState, evalStateT, get, modify, put)
|
||||||
import Data.Attoparsec.ByteString.Char8 (choice, sepBy)
|
import Data.Attoparsec.ByteString.Char8 (choice, sepBy)
|
||||||
import Data.ByteString (ByteString)
|
import Data.ByteString (ByteString)
|
||||||
import qualified Data.ByteString as BS (drop, null, take)
|
import Data.ByteString.Char8 (pack)
|
||||||
import Data.ByteString.Char8 (pack, unpack)
|
|
||||||
import Data.Map ((!), (!?), Map)
|
import Data.Map ((!), (!?), Map)
|
||||||
import qualified Data.Map as Map (fromList, lookup, toList)
|
import qualified Data.Map as Map (fromList)
|
||||||
import PDF.CMap (CMappers, CMap, CRange(..), emptyCMap)
|
import PDF.Font (Font, FontSet, emptyFont)
|
||||||
import PDF.Object (
|
import PDF.Object (
|
||||||
DirectObject(..), StringObject(..)
|
DirectObject(..), StringObject(..)
|
||||||
, array, blank, name, regular, stringObject, toByteString
|
, array, blank, name, regular, stringObject, toByteString
|
||||||
|
@ -170,11 +170,11 @@ nameArg = Typed . NameObject <$> name <* blank
|
||||||
stringArg :: MonadParser m => m Argument
|
stringArg :: MonadParser m => m Argument
|
||||||
stringArg = Typed . StringObject <$> stringObject <* blank
|
stringArg = Typed . StringObject <$> stringObject <* blank
|
||||||
|
|
||||||
type ParserWithFont = ReaderT CMappers (Parser CMap)
|
type ParserWithFont = ReaderT FontSet (Parser Font)
|
||||||
|
|
||||||
pageContents :: CMappers -> ByteString -> Either String [ByteString]
|
pageContents :: FontSet -> ByteString -> Either String [ByteString]
|
||||||
pageContents font input =
|
pageContents font input =
|
||||||
evalParser (runReaderT page font) emptyCMap input
|
evalParser (runReaderT page font) emptyFont input
|
||||||
|
|
||||||
page :: ParserWithFont [ByteString]
|
page :: ParserWithFont [ByteString]
|
||||||
page = graphicState <|> text <?> "Text page contents"
|
page = graphicState <|> text <?> "Text page contents"
|
||||||
|
@ -216,24 +216,7 @@ runOperator (DQuote, [Typed (StringObject outputString)]) =
|
||||||
|
|
||||||
runOperator _ = return []
|
runOperator _ = return []
|
||||||
|
|
||||||
decodeString :: StringObject -> ParserWithFont ByteString
|
decodeString :: (MonadFail m, MonadState Font m) => StringObject -> m ByteString
|
||||||
decodeString = decode . toByteString
|
decodeString input = do
|
||||||
where
|
font <- get
|
||||||
decode input
|
either fail return . font $ toByteString input
|
||||||
| BS.null input = return ""
|
|
||||||
| otherwise = do
|
|
||||||
(output, remainingInput) <- trySizes input =<< Map.toList <$> get
|
|
||||||
mappend output <$> decode remainingInput
|
|
||||||
trySizes :: ByteString -> [(Int, [CRange])] -> ParserWithFont (ByteString, ByteString)
|
|
||||||
trySizes s [] = fail $ "No matching code found in font for " ++ unpack s
|
|
||||||
trySizes s ((size, cRanges):others) =
|
|
||||||
let prefix = BS.take size s in
|
|
||||||
case tryRanges prefix cRanges of
|
|
||||||
Nothing -> trySizes s others
|
|
||||||
Just outputSequence -> return (outputSequence, BS.drop size s)
|
|
||||||
tryRanges :: ByteString -> [CRange] -> Maybe ByteString
|
|
||||||
tryRanges _ [] = Nothing
|
|
||||||
tryRanges prefix ((CRange {mapping}):cRanges) =
|
|
||||||
case Map.lookup prefix mapping of
|
|
||||||
Nothing -> tryRanges prefix cRanges
|
|
||||||
outputSequence -> outputSequence
|
|
||||||
|
|
Loading…
Reference in a new issue