Prepare storing the reverse mapping for CMaps, divided by length to be able to implement encoding with a reasonable complexity

This commit is contained in:
Tissevert 2020-03-08 00:02:24 +01:00
parent ca40d2df76
commit 457f1755e6
1 changed files with 58 additions and 42 deletions

View File

@ -5,7 +5,6 @@ module PDF.CMap (
, CMappers , CMappers
, CRange(..) , CRange(..)
, cMap , cMap
, emptyCMap
, matches , matches
) where ) where
@ -21,9 +20,10 @@ import Data.ByteString.Char8.Util (
) )
import Data.Map (Map, union) import Data.Map (Map, union)
import qualified Data.Map as Map ( import qualified Data.Map as Map (
adjust, empty, fromList, insertWith, lookup, toList adjust, empty, fromList, insert, insertWith, lookup, toList
) )
import Data.Text (Text) import Data.Text (Text)
import qualified Data.Text as Text (length)
import qualified PDF.EOL as EOL (charset, parser) import qualified PDF.EOL as EOL (charset, parser)
import PDF.Font (Decoder, Encoder, Font(..)) import PDF.Font (Decoder, Encoder, Font(..))
import PDF.Object ( import PDF.Object (
@ -34,27 +34,29 @@ import PDF.Parser (MonadParser, Parser, runParser, takeAll)
import Prelude hiding (fail) import Prelude hiding (fail)
type CMappers = Map Name CMap type CMappers = Map Name CMap
data Mapping = Mapping { type ToUnicode = Map ByteString Text
bsToT :: Map ByteString Text type FromUnicode = Map Text ByteString
, tToBS :: Map Text ByteString
}
data CRange = CRange { data CRange = CRange {
fromSequence :: ByteString fromSequence :: ByteString
, toSequence :: ByteString , toSequence :: ByteString
, mapping :: Mapping , toUnicode :: ToUnicode
} deriving Show } deriving Show
type RangeSize = Int type Size = Int
type CMap = Map RangeSize [CRange] data CMap = CMap {
toUnicodeBySize :: Map Size [CRange]
, fromUnicodeBySize :: Map Size FromUnicode
}
toFont :: CMap -> Font toFont :: CMap -> Font
toFont aCMap = Font {decode = decoder aCMap, encode = encoder aCMap} toFont (CMap {toUnicodeBySize, fromUnicodeBySize}) =
Font {decode = decoder toUnicodeBySize, encode = encoder fromUnicodeBySize}
decoder :: CMap -> Decoder decoder :: Map Size [CRange] -> Decoder
decoder aCMap input decoder rangesBySize input
| BS.null input = Right "" | BS.null input = Right ""
| otherwise = do | otherwise = do
(output, remainingInput) <- trySizes input $ Map.toList aCMap (output, remainingInput) <- trySizes input $ Map.toList rangesBySize
mappend output <$> decoder aCMap remainingInput mappend output <$> decoder rangesBySize remainingInput
where where
trySizes s [] = Left $ "No matching code found in font for " ++ unpack s trySizes s [] = Left $ "No matching code found in font for " ++ unpack s
trySizes s ((size, cRanges):others) = trySizes s ((size, cRanges):others) =
@ -64,17 +66,14 @@ decoder aCMap input
Just outputSequence -> Right (outputSequence, BS.drop size s) Just outputSequence -> Right (outputSequence, BS.drop size s)
tryRanges :: ByteString -> [CRange] -> Maybe Text tryRanges :: ByteString -> [CRange] -> Maybe Text
tryRanges _ [] = Nothing tryRanges _ [] = Nothing
tryRanges prefix ((CRange {mapping}):cRanges) = tryRanges prefix ((CRange {toUnicode}):cRanges) =
case Map.lookup prefix mapping of case Map.lookup prefix toUnicode of
Nothing -> tryRanges prefix cRanges Nothing -> tryRanges prefix cRanges
outputSequence -> outputSequence outputSequence -> outputSequence
encoder :: CMap -> Encoder encoder :: Map Size FromUnicode -> Encoder
encoder = undefined encoder = undefined
emptyCMap :: CMap
emptyCMap = Map.empty
matches :: ByteString -> CRange -> Bool matches :: ByteString -> CRange -> Bool
matches code (CRange {fromSequence, toSequence}) = matches code (CRange {fromSequence, toSequence}) =
fromSequence <= code && code <= toSequence fromSequence <= code && code <= toSequence
@ -86,6 +85,7 @@ cMap = either fail (return . toFont . snd) . runParser
where where
ignoredLine = ignoredLine =
takeAll (not . (`elem` EOL.charset)) *> EOL.parser *> return () takeAll (not . (`elem` EOL.charset)) *> EOL.parser *> return ()
emptyCMap = CMap {toUnicodeBySize = Map.empty, fromUnicodeBySize = Map.empty}
codeRanges :: Parser CMap () codeRanges :: Parser CMap ()
codeRanges = do codeRanges = do
@ -97,13 +97,14 @@ codeRanges = do
(,) <$> stringObject <* blank <*> stringObject <* EOL.parser (,) <$> stringObject <* blank <*> stringObject <* EOL.parser
createMapping :: (StringObject, StringObject) -> Parser CMap () createMapping :: (StringObject, StringObject) -> Parser CMap ()
createMapping (Hexadecimal from, Hexadecimal to) = modify $ createMapping (Hexadecimal from, Hexadecimal to) = modify $ \cmap -> cmap {
Map.insertWith (++) size [CRange {fromSequence, toSequence, mapping}] toUnicodeBySize = Map.insertWith (++) size [cRange] (toUnicodeBySize cmap)
}
where where
fromSequence = b16ToBytes from fromSequence = b16ToBytes from
size = BS.length fromSequence size = BS.length fromSequence
toSequence = b16ToBytes to toSequence = b16ToBytes to
mapping = Map.empty cRange = CRange {fromSequence, toSequence, toUnicode = Map.empty}
createMapping _ = return () createMapping _ = return ()
cMapRange :: Parser CMap () cMapRange :: Parser CMap ()
@ -112,32 +113,47 @@ cMapRange = do
mapM_ saveMapping =<< count size rangeMapping mapM_ saveMapping =<< count size rangeMapping
line "endbfrange" line "endbfrange"
where where
rangeMapping = (,,) rangeMapping = do
<$> (stringObject <* blank) from <- (stringObject <* blank)
<*> (stringObject <* blank) to <- (stringObject <* blank)
<*> directObject <* EOL.parser mapFromTo from to =<< directObject <* EOL.parser
>>= mapFromTo
saveMapping :: [(ByteString, Text)] -> Parser CMap () saveMapping :: [(ByteString, Text)] -> Parser CMap ()
saveMapping [] = return () saveMapping assoc =
saveMapping assoc@((code, _):_) = modify $ Map.adjust insertCRange mappingSize modify $ \(CMap {toUnicodeBySize, fromUnicodeBySize}) -> CMap {
toUnicodeBySize = saveToUnicodeBySize assoc toUnicodeBySize
, fromUnicodeBySize = saveFromUnicodeBySize reversed fromUnicodeBySize
}
where
reversed = (\(a, b) -> (b, a)) <$> assoc
saveToUnicodeBySize :: [(ByteString, Text)] -> Map Size [CRange] -> Map Size [CRange]
saveToUnicodeBySize [] = id
saveToUnicodeBySize assoc@((code, _):_) = Map.adjust insertCRange (BS.length code)
where where
newMapping = Map.fromList assoc newMapping = Map.fromList assoc
mappingSize = BS.length code
appendMapping cRange = appendMapping cRange =
cRange {mapping = mapping cRange `union` newMapping} cRange {toUnicode = toUnicode cRange `union` newMapping}
insertCRange = fmap (\cRange -> insertCRange = fmap (\cRange ->
if code `matches` cRange then appendMapping cRange else cRange if code `matches` cRange then appendMapping cRange else cRange
) )
saveFromUnicodeBySize :: [(Text, ByteString)] -> Map Size FromUnicode -> Map Size FromUnicode
saveFromUnicodeBySize = flip (foldl insertFromUnicode)
where
insertFromUnicode :: Map Size FromUnicode -> (Text, ByteString) -> Map Size FromUnicode
insertFromUnicode tmpFromUnicodeBySize (unicodeSequence, code) =
let size = Text.length unicodeSequence in
Map.adjust (Map.insert unicodeSequence code) size tmpFromUnicodeBySize
cMapChar :: Parser CMap () cMapChar :: Parser CMap ()
cMapChar = do cMapChar = do
size <- integer <* line "beginbfchar" size <- integer <* line "beginbfchar"
saveMapping =<< count size charMapping <* line "endbfchar" saveMapping =<< count size charMapping <* line "endbfchar"
where where
charMapping = charMapping = do
(,) <$> stringObject <* blank <*> stringObject <* EOL.parser from <- stringObject <* blank
>>= pairMapping pairMapping from =<< stringObject <* EOL.parser
between :: B16Int -> B16Int -> [ByteString] between :: B16Int -> B16Int -> [ByteString]
between from@(B16Int s) to = between from@(B16Int s) to =
@ -149,20 +165,20 @@ startFrom from@(B16Int s) =
let size = BS.length s `div` 2 in let size = BS.length s `div` 2 in
toBytes size <$> [b16ToInt from .. ] toBytes size <$> [b16ToInt from .. ]
mapFromTo :: MonadParser m => (StringObject, StringObject, DirectObject) -> m [(ByteString, Text)] mapFromTo :: MonadParser m => StringObject -> StringObject -> DirectObject -> m [(ByteString, Text)]
mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) = mapFromTo (Hexadecimal from) (Hexadecimal to) (StringObject (Hexadecimal dstFrom)) =
return $ zip (between from to) (utf16BEToutf8 <$> startFrom dstFrom) return $ zip (between from to) (utf16BEToutf8 <$> startFrom dstFrom)
mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) = mapFromTo (Hexadecimal from) (Hexadecimal to) (Array dstPoints) =
zip (between from to) <$> (mapM dstByteString dstPoints) zip (between from to) <$> (mapM dstByteString dstPoints)
where where
dstByteString (StringObject (Hexadecimal dst)) = dstByteString (StringObject (Hexadecimal dst)) =
return . utf16BEToutf8 $ b16ToBytes dst return . utf16BEToutf8 $ b16ToBytes dst
dstByteString _ = fail "Invalid for a replacement string" dstByteString _ = fail "Invalid for a replacement string"
mapFromTo _ = fail "invalid range mapping found" mapFromTo _ _ _ = fail "invalid range mapping found"
pairMapping :: MonadParser m => (StringObject, StringObject) -> m (ByteString, Text) pairMapping :: MonadParser m => StringObject -> StringObject -> m (ByteString, Text)
pairMapping (Hexadecimal from, Hexadecimal to) = pairMapping (Hexadecimal from) (Hexadecimal to) =
return (b16ToBytes from, utf16BEToutf8 $ b16ToBytes to) return (b16ToBytes from, utf16BEToutf8 $ b16ToBytes to)
pairMapping _ = fail "invalid pair mapping found" pairMapping _ _ = fail "invalid pair mapping found"