Prepare storing the reverse mapping for CMaps, divided by length to be able to implement encoding with a reasonable complexity
This commit is contained in:
parent
ca40d2df76
commit
457f1755e6
1 changed files with 58 additions and 42 deletions
100
src/PDF/CMap.hs
100
src/PDF/CMap.hs
|
@ -5,7 +5,6 @@ module PDF.CMap (
|
||||||
, CMappers
|
, CMappers
|
||||||
, CRange(..)
|
, CRange(..)
|
||||||
, cMap
|
, cMap
|
||||||
, emptyCMap
|
|
||||||
, matches
|
, matches
|
||||||
) where
|
) where
|
||||||
|
|
||||||
|
@ -21,9 +20,10 @@ import Data.ByteString.Char8.Util (
|
||||||
)
|
)
|
||||||
import Data.Map (Map, union)
|
import Data.Map (Map, union)
|
||||||
import qualified Data.Map as Map (
|
import qualified Data.Map as Map (
|
||||||
adjust, empty, fromList, insertWith, lookup, toList
|
adjust, empty, fromList, insert, insertWith, lookup, toList
|
||||||
)
|
)
|
||||||
import Data.Text (Text)
|
import Data.Text (Text)
|
||||||
|
import qualified Data.Text as Text (length)
|
||||||
import qualified PDF.EOL as EOL (charset, parser)
|
import qualified PDF.EOL as EOL (charset, parser)
|
||||||
import PDF.Font (Decoder, Encoder, Font(..))
|
import PDF.Font (Decoder, Encoder, Font(..))
|
||||||
import PDF.Object (
|
import PDF.Object (
|
||||||
|
@ -34,27 +34,29 @@ import PDF.Parser (MonadParser, Parser, runParser, takeAll)
|
||||||
import Prelude hiding (fail)
|
import Prelude hiding (fail)
|
||||||
|
|
||||||
type CMappers = Map Name CMap
|
type CMappers = Map Name CMap
|
||||||
data Mapping = Mapping {
|
type ToUnicode = Map ByteString Text
|
||||||
bsToT :: Map ByteString Text
|
type FromUnicode = Map Text ByteString
|
||||||
, tToBS :: Map Text ByteString
|
|
||||||
}
|
|
||||||
data CRange = CRange {
|
data CRange = CRange {
|
||||||
fromSequence :: ByteString
|
fromSequence :: ByteString
|
||||||
, toSequence :: ByteString
|
, toSequence :: ByteString
|
||||||
, mapping :: Mapping
|
, toUnicode :: ToUnicode
|
||||||
} deriving Show
|
} deriving Show
|
||||||
type RangeSize = Int
|
type Size = Int
|
||||||
type CMap = Map RangeSize [CRange]
|
data CMap = CMap {
|
||||||
|
toUnicodeBySize :: Map Size [CRange]
|
||||||
|
, fromUnicodeBySize :: Map Size FromUnicode
|
||||||
|
}
|
||||||
|
|
||||||
toFont :: CMap -> Font
|
toFont :: CMap -> Font
|
||||||
toFont aCMap = Font {decode = decoder aCMap, encode = encoder aCMap}
|
toFont (CMap {toUnicodeBySize, fromUnicodeBySize}) =
|
||||||
|
Font {decode = decoder toUnicodeBySize, encode = encoder fromUnicodeBySize}
|
||||||
|
|
||||||
decoder :: CMap -> Decoder
|
decoder :: Map Size [CRange] -> Decoder
|
||||||
decoder aCMap input
|
decoder rangesBySize input
|
||||||
| BS.null input = Right ""
|
| BS.null input = Right ""
|
||||||
| otherwise = do
|
| otherwise = do
|
||||||
(output, remainingInput) <- trySizes input $ Map.toList aCMap
|
(output, remainingInput) <- trySizes input $ Map.toList rangesBySize
|
||||||
mappend output <$> decoder aCMap remainingInput
|
mappend output <$> decoder rangesBySize remainingInput
|
||||||
where
|
where
|
||||||
trySizes s [] = Left $ "No matching code found in font for " ++ unpack s
|
trySizes s [] = Left $ "No matching code found in font for " ++ unpack s
|
||||||
trySizes s ((size, cRanges):others) =
|
trySizes s ((size, cRanges):others) =
|
||||||
|
@ -64,17 +66,14 @@ decoder aCMap input
|
||||||
Just outputSequence -> Right (outputSequence, BS.drop size s)
|
Just outputSequence -> Right (outputSequence, BS.drop size s)
|
||||||
tryRanges :: ByteString -> [CRange] -> Maybe Text
|
tryRanges :: ByteString -> [CRange] -> Maybe Text
|
||||||
tryRanges _ [] = Nothing
|
tryRanges _ [] = Nothing
|
||||||
tryRanges prefix ((CRange {mapping}):cRanges) =
|
tryRanges prefix ((CRange {toUnicode}):cRanges) =
|
||||||
case Map.lookup prefix mapping of
|
case Map.lookup prefix toUnicode of
|
||||||
Nothing -> tryRanges prefix cRanges
|
Nothing -> tryRanges prefix cRanges
|
||||||
outputSequence -> outputSequence
|
outputSequence -> outputSequence
|
||||||
|
|
||||||
encoder :: CMap -> Encoder
|
encoder :: Map Size FromUnicode -> Encoder
|
||||||
encoder = undefined
|
encoder = undefined
|
||||||
|
|
||||||
emptyCMap :: CMap
|
|
||||||
emptyCMap = Map.empty
|
|
||||||
|
|
||||||
matches :: ByteString -> CRange -> Bool
|
matches :: ByteString -> CRange -> Bool
|
||||||
matches code (CRange {fromSequence, toSequence}) =
|
matches code (CRange {fromSequence, toSequence}) =
|
||||||
fromSequence <= code && code <= toSequence
|
fromSequence <= code && code <= toSequence
|
||||||
|
@ -86,6 +85,7 @@ cMap = either fail (return . toFont . snd) . runParser
|
||||||
where
|
where
|
||||||
ignoredLine =
|
ignoredLine =
|
||||||
takeAll (not . (`elem` EOL.charset)) *> EOL.parser *> return ()
|
takeAll (not . (`elem` EOL.charset)) *> EOL.parser *> return ()
|
||||||
|
emptyCMap = CMap {toUnicodeBySize = Map.empty, fromUnicodeBySize = Map.empty}
|
||||||
|
|
||||||
codeRanges :: Parser CMap ()
|
codeRanges :: Parser CMap ()
|
||||||
codeRanges = do
|
codeRanges = do
|
||||||
|
@ -97,13 +97,14 @@ codeRanges = do
|
||||||
(,) <$> stringObject <* blank <*> stringObject <* EOL.parser
|
(,) <$> stringObject <* blank <*> stringObject <* EOL.parser
|
||||||
|
|
||||||
createMapping :: (StringObject, StringObject) -> Parser CMap ()
|
createMapping :: (StringObject, StringObject) -> Parser CMap ()
|
||||||
createMapping (Hexadecimal from, Hexadecimal to) = modify $
|
createMapping (Hexadecimal from, Hexadecimal to) = modify $ \cmap -> cmap {
|
||||||
Map.insertWith (++) size [CRange {fromSequence, toSequence, mapping}]
|
toUnicodeBySize = Map.insertWith (++) size [cRange] (toUnicodeBySize cmap)
|
||||||
|
}
|
||||||
where
|
where
|
||||||
fromSequence = b16ToBytes from
|
fromSequence = b16ToBytes from
|
||||||
size = BS.length fromSequence
|
size = BS.length fromSequence
|
||||||
toSequence = b16ToBytes to
|
toSequence = b16ToBytes to
|
||||||
mapping = Map.empty
|
cRange = CRange {fromSequence, toSequence, toUnicode = Map.empty}
|
||||||
createMapping _ = return ()
|
createMapping _ = return ()
|
||||||
|
|
||||||
cMapRange :: Parser CMap ()
|
cMapRange :: Parser CMap ()
|
||||||
|
@ -112,32 +113,47 @@ cMapRange = do
|
||||||
mapM_ saveMapping =<< count size rangeMapping
|
mapM_ saveMapping =<< count size rangeMapping
|
||||||
line "endbfrange"
|
line "endbfrange"
|
||||||
where
|
where
|
||||||
rangeMapping = (,,)
|
rangeMapping = do
|
||||||
<$> (stringObject <* blank)
|
from <- (stringObject <* blank)
|
||||||
<*> (stringObject <* blank)
|
to <- (stringObject <* blank)
|
||||||
<*> directObject <* EOL.parser
|
mapFromTo from to =<< directObject <* EOL.parser
|
||||||
>>= mapFromTo
|
|
||||||
|
|
||||||
saveMapping :: [(ByteString, Text)] -> Parser CMap ()
|
saveMapping :: [(ByteString, Text)] -> Parser CMap ()
|
||||||
saveMapping [] = return ()
|
saveMapping assoc =
|
||||||
saveMapping assoc@((code, _):_) = modify $ Map.adjust insertCRange mappingSize
|
modify $ \(CMap {toUnicodeBySize, fromUnicodeBySize}) -> CMap {
|
||||||
|
toUnicodeBySize = saveToUnicodeBySize assoc toUnicodeBySize
|
||||||
|
, fromUnicodeBySize = saveFromUnicodeBySize reversed fromUnicodeBySize
|
||||||
|
}
|
||||||
|
where
|
||||||
|
reversed = (\(a, b) -> (b, a)) <$> assoc
|
||||||
|
|
||||||
|
saveToUnicodeBySize :: [(ByteString, Text)] -> Map Size [CRange] -> Map Size [CRange]
|
||||||
|
saveToUnicodeBySize [] = id
|
||||||
|
saveToUnicodeBySize assoc@((code, _):_) = Map.adjust insertCRange (BS.length code)
|
||||||
where
|
where
|
||||||
newMapping = Map.fromList assoc
|
newMapping = Map.fromList assoc
|
||||||
mappingSize = BS.length code
|
|
||||||
appendMapping cRange =
|
appendMapping cRange =
|
||||||
cRange {mapping = mapping cRange `union` newMapping}
|
cRange {toUnicode = toUnicode cRange `union` newMapping}
|
||||||
insertCRange = fmap (\cRange ->
|
insertCRange = fmap (\cRange ->
|
||||||
if code `matches` cRange then appendMapping cRange else cRange
|
if code `matches` cRange then appendMapping cRange else cRange
|
||||||
)
|
)
|
||||||
|
|
||||||
|
saveFromUnicodeBySize :: [(Text, ByteString)] -> Map Size FromUnicode -> Map Size FromUnicode
|
||||||
|
saveFromUnicodeBySize = flip (foldl insertFromUnicode)
|
||||||
|
where
|
||||||
|
insertFromUnicode :: Map Size FromUnicode -> (Text, ByteString) -> Map Size FromUnicode
|
||||||
|
insertFromUnicode tmpFromUnicodeBySize (unicodeSequence, code) =
|
||||||
|
let size = Text.length unicodeSequence in
|
||||||
|
Map.adjust (Map.insert unicodeSequence code) size tmpFromUnicodeBySize
|
||||||
|
|
||||||
cMapChar :: Parser CMap ()
|
cMapChar :: Parser CMap ()
|
||||||
cMapChar = do
|
cMapChar = do
|
||||||
size <- integer <* line "beginbfchar"
|
size <- integer <* line "beginbfchar"
|
||||||
saveMapping =<< count size charMapping <* line "endbfchar"
|
saveMapping =<< count size charMapping <* line "endbfchar"
|
||||||
where
|
where
|
||||||
charMapping =
|
charMapping = do
|
||||||
(,) <$> stringObject <* blank <*> stringObject <* EOL.parser
|
from <- stringObject <* blank
|
||||||
>>= pairMapping
|
pairMapping from =<< stringObject <* EOL.parser
|
||||||
|
|
||||||
between :: B16Int -> B16Int -> [ByteString]
|
between :: B16Int -> B16Int -> [ByteString]
|
||||||
between from@(B16Int s) to =
|
between from@(B16Int s) to =
|
||||||
|
@ -149,20 +165,20 @@ startFrom from@(B16Int s) =
|
||||||
let size = BS.length s `div` 2 in
|
let size = BS.length s `div` 2 in
|
||||||
toBytes size <$> [b16ToInt from .. ]
|
toBytes size <$> [b16ToInt from .. ]
|
||||||
|
|
||||||
mapFromTo :: MonadParser m => (StringObject, StringObject, DirectObject) -> m [(ByteString, Text)]
|
mapFromTo :: MonadParser m => StringObject -> StringObject -> DirectObject -> m [(ByteString, Text)]
|
||||||
mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) =
|
mapFromTo (Hexadecimal from) (Hexadecimal to) (StringObject (Hexadecimal dstFrom)) =
|
||||||
return $ zip (between from to) (utf16BEToutf8 <$> startFrom dstFrom)
|
return $ zip (between from to) (utf16BEToutf8 <$> startFrom dstFrom)
|
||||||
|
|
||||||
mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) =
|
mapFromTo (Hexadecimal from) (Hexadecimal to) (Array dstPoints) =
|
||||||
zip (between from to) <$> (mapM dstByteString dstPoints)
|
zip (between from to) <$> (mapM dstByteString dstPoints)
|
||||||
where
|
where
|
||||||
dstByteString (StringObject (Hexadecimal dst)) =
|
dstByteString (StringObject (Hexadecimal dst)) =
|
||||||
return . utf16BEToutf8 $ b16ToBytes dst
|
return . utf16BEToutf8 $ b16ToBytes dst
|
||||||
dstByteString _ = fail "Invalid for a replacement string"
|
dstByteString _ = fail "Invalid for a replacement string"
|
||||||
|
|
||||||
mapFromTo _ = fail "invalid range mapping found"
|
mapFromTo _ _ _ = fail "invalid range mapping found"
|
||||||
|
|
||||||
pairMapping :: MonadParser m => (StringObject, StringObject) -> m (ByteString, Text)
|
pairMapping :: MonadParser m => StringObject -> StringObject -> m (ByteString, Text)
|
||||||
pairMapping (Hexadecimal from, Hexadecimal to) =
|
pairMapping (Hexadecimal from) (Hexadecimal to) =
|
||||||
return (b16ToBytes from, utf16BEToutf8 $ b16ToBytes to)
|
return (b16ToBytes from, utf16BEToutf8 $ b16ToBytes to)
|
||||||
pairMapping _ = fail "invalid pair mapping found"
|
pairMapping _ _ = fail "invalid pair mapping found"
|
||||||
|
|
Loading…
Reference in a new issue