Prepare storing the reverse mapping for CMaps, divided by length to be able to implement encoding with a reasonable complexity

This commit is contained in:
Tissevert 2020-03-08 00:02:24 +01:00
parent ca40d2df76
commit 457f1755e6
1 changed files with 58 additions and 42 deletions

View File

@ -5,7 +5,6 @@ module PDF.CMap (
, CMappers
, CRange(..)
, cMap
, emptyCMap
, matches
) where
@ -21,9 +20,10 @@ import Data.ByteString.Char8.Util (
)
import Data.Map (Map, union)
import qualified Data.Map as Map (
adjust, empty, fromList, insertWith, lookup, toList
adjust, empty, fromList, insert, insertWith, lookup, toList
)
import Data.Text (Text)
import qualified Data.Text as Text (length)
import qualified PDF.EOL as EOL (charset, parser)
import PDF.Font (Decoder, Encoder, Font(..))
import PDF.Object (
@ -34,27 +34,29 @@ import PDF.Parser (MonadParser, Parser, runParser, takeAll)
import Prelude hiding (fail)
type CMappers = Map Name CMap
data Mapping = Mapping {
bsToT :: Map ByteString Text
, tToBS :: Map Text ByteString
}
type ToUnicode = Map ByteString Text
type FromUnicode = Map Text ByteString
data CRange = CRange {
fromSequence :: ByteString
, toSequence :: ByteString
, mapping :: Mapping
, toUnicode :: ToUnicode
} deriving Show
type RangeSize = Int
type CMap = Map RangeSize [CRange]
type Size = Int
data CMap = CMap {
toUnicodeBySize :: Map Size [CRange]
, fromUnicodeBySize :: Map Size FromUnicode
}
toFont :: CMap -> Font
toFont aCMap = Font {decode = decoder aCMap, encode = encoder aCMap}
toFont (CMap {toUnicodeBySize, fromUnicodeBySize}) =
Font {decode = decoder toUnicodeBySize, encode = encoder fromUnicodeBySize}
decoder :: CMap -> Decoder
decoder aCMap input
decoder :: Map Size [CRange] -> Decoder
decoder rangesBySize input
| BS.null input = Right ""
| otherwise = do
(output, remainingInput) <- trySizes input $ Map.toList aCMap
mappend output <$> decoder aCMap remainingInput
(output, remainingInput) <- trySizes input $ Map.toList rangesBySize
mappend output <$> decoder rangesBySize remainingInput
where
trySizes s [] = Left $ "No matching code found in font for " ++ unpack s
trySizes s ((size, cRanges):others) =
@ -64,17 +66,14 @@ decoder aCMap input
Just outputSequence -> Right (outputSequence, BS.drop size s)
tryRanges :: ByteString -> [CRange] -> Maybe Text
tryRanges _ [] = Nothing
tryRanges prefix ((CRange {mapping}):cRanges) =
case Map.lookup prefix mapping of
tryRanges prefix ((CRange {toUnicode}):cRanges) =
case Map.lookup prefix toUnicode of
Nothing -> tryRanges prefix cRanges
outputSequence -> outputSequence
encoder :: CMap -> Encoder
encoder :: Map Size FromUnicode -> Encoder
encoder = undefined
emptyCMap :: CMap
emptyCMap = Map.empty
matches :: ByteString -> CRange -> Bool
matches code (CRange {fromSequence, toSequence}) =
fromSequence <= code && code <= toSequence
@ -86,6 +85,7 @@ cMap = either fail (return . toFont . snd) . runParser
where
ignoredLine =
takeAll (not . (`elem` EOL.charset)) *> EOL.parser *> return ()
emptyCMap = CMap {toUnicodeBySize = Map.empty, fromUnicodeBySize = Map.empty}
codeRanges :: Parser CMap ()
codeRanges = do
@ -97,13 +97,14 @@ codeRanges = do
(,) <$> stringObject <* blank <*> stringObject <* EOL.parser
createMapping :: (StringObject, StringObject) -> Parser CMap ()
createMapping (Hexadecimal from, Hexadecimal to) = modify $
Map.insertWith (++) size [CRange {fromSequence, toSequence, mapping}]
createMapping (Hexadecimal from, Hexadecimal to) = modify $ \cmap -> cmap {
toUnicodeBySize = Map.insertWith (++) size [cRange] (toUnicodeBySize cmap)
}
where
fromSequence = b16ToBytes from
size = BS.length fromSequence
toSequence = b16ToBytes to
mapping = Map.empty
cRange = CRange {fromSequence, toSequence, toUnicode = Map.empty}
createMapping _ = return ()
cMapRange :: Parser CMap ()
@ -112,32 +113,47 @@ cMapRange = do
mapM_ saveMapping =<< count size rangeMapping
line "endbfrange"
where
rangeMapping = (,,)
<$> (stringObject <* blank)
<*> (stringObject <* blank)
<*> directObject <* EOL.parser
>>= mapFromTo
rangeMapping = do
from <- (stringObject <* blank)
to <- (stringObject <* blank)
mapFromTo from to =<< directObject <* EOL.parser
saveMapping :: [(ByteString, Text)] -> Parser CMap ()
saveMapping [] = return ()
saveMapping assoc@((code, _):_) = modify $ Map.adjust insertCRange mappingSize
saveMapping assoc =
modify $ \(CMap {toUnicodeBySize, fromUnicodeBySize}) -> CMap {
toUnicodeBySize = saveToUnicodeBySize assoc toUnicodeBySize
, fromUnicodeBySize = saveFromUnicodeBySize reversed fromUnicodeBySize
}
where
reversed = (\(a, b) -> (b, a)) <$> assoc
saveToUnicodeBySize :: [(ByteString, Text)] -> Map Size [CRange] -> Map Size [CRange]
saveToUnicodeBySize [] = id
saveToUnicodeBySize assoc@((code, _):_) = Map.adjust insertCRange (BS.length code)
where
newMapping = Map.fromList assoc
mappingSize = BS.length code
appendMapping cRange =
cRange {mapping = mapping cRange `union` newMapping}
cRange {toUnicode = toUnicode cRange `union` newMapping}
insertCRange = fmap (\cRange ->
if code `matches` cRange then appendMapping cRange else cRange
)
saveFromUnicodeBySize :: [(Text, ByteString)] -> Map Size FromUnicode -> Map Size FromUnicode
saveFromUnicodeBySize = flip (foldl insertFromUnicode)
where
insertFromUnicode :: Map Size FromUnicode -> (Text, ByteString) -> Map Size FromUnicode
insertFromUnicode tmpFromUnicodeBySize (unicodeSequence, code) =
let size = Text.length unicodeSequence in
Map.adjust (Map.insert unicodeSequence code) size tmpFromUnicodeBySize
cMapChar :: Parser CMap ()
cMapChar = do
size <- integer <* line "beginbfchar"
saveMapping =<< count size charMapping <* line "endbfchar"
where
charMapping =
(,) <$> stringObject <* blank <*> stringObject <* EOL.parser
>>= pairMapping
charMapping = do
from <- stringObject <* blank
pairMapping from =<< stringObject <* EOL.parser
between :: B16Int -> B16Int -> [ByteString]
between from@(B16Int s) to =
@ -149,20 +165,20 @@ startFrom from@(B16Int s) =
let size = BS.length s `div` 2 in
toBytes size <$> [b16ToInt from .. ]
mapFromTo :: MonadParser m => (StringObject, StringObject, DirectObject) -> m [(ByteString, Text)]
mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) =
mapFromTo :: MonadParser m => StringObject -> StringObject -> DirectObject -> m [(ByteString, Text)]
mapFromTo (Hexadecimal from) (Hexadecimal to) (StringObject (Hexadecimal dstFrom)) =
return $ zip (between from to) (utf16BEToutf8 <$> startFrom dstFrom)
mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) =
mapFromTo (Hexadecimal from) (Hexadecimal to) (Array dstPoints) =
zip (between from to) <$> (mapM dstByteString dstPoints)
where
dstByteString (StringObject (Hexadecimal dst)) =
return . utf16BEToutf8 $ b16ToBytes dst
dstByteString _ = fail "Invalid for a replacement string"
mapFromTo _ = fail "invalid range mapping found"
mapFromTo _ _ _ = fail "invalid range mapping found"
pairMapping :: MonadParser m => (StringObject, StringObject) -> m (ByteString, Text)
pairMapping (Hexadecimal from, Hexadecimal to) =
pairMapping :: MonadParser m => StringObject -> StringObject -> m (ByteString, Text)
pairMapping (Hexadecimal from) (Hexadecimal to) =
return (b16ToBytes from, utf16BEToutf8 $ b16ToBytes to)
pairMapping _ = fail "invalid pair mapping found"
pairMapping _ _ = fail "invalid pair mapping found"