Prepare storing the reverse mapping for CMaps, divided by length to be able to implement encoding with a reasonable complexity
This commit is contained in:
parent
ca40d2df76
commit
457f1755e6
1 changed files with 58 additions and 42 deletions
100
src/PDF/CMap.hs
100
src/PDF/CMap.hs
|
@ -5,7 +5,6 @@ module PDF.CMap (
|
|||
, CMappers
|
||||
, CRange(..)
|
||||
, cMap
|
||||
, emptyCMap
|
||||
, matches
|
||||
) where
|
||||
|
||||
|
@ -21,9 +20,10 @@ import Data.ByteString.Char8.Util (
|
|||
)
|
||||
import Data.Map (Map, union)
|
||||
import qualified Data.Map as Map (
|
||||
adjust, empty, fromList, insertWith, lookup, toList
|
||||
adjust, empty, fromList, insert, insertWith, lookup, toList
|
||||
)
|
||||
import Data.Text (Text)
|
||||
import qualified Data.Text as Text (length)
|
||||
import qualified PDF.EOL as EOL (charset, parser)
|
||||
import PDF.Font (Decoder, Encoder, Font(..))
|
||||
import PDF.Object (
|
||||
|
@ -34,27 +34,29 @@ import PDF.Parser (MonadParser, Parser, runParser, takeAll)
|
|||
import Prelude hiding (fail)
|
||||
|
||||
type CMappers = Map Name CMap
|
||||
data Mapping = Mapping {
|
||||
bsToT :: Map ByteString Text
|
||||
, tToBS :: Map Text ByteString
|
||||
}
|
||||
type ToUnicode = Map ByteString Text
|
||||
type FromUnicode = Map Text ByteString
|
||||
data CRange = CRange {
|
||||
fromSequence :: ByteString
|
||||
, toSequence :: ByteString
|
||||
, mapping :: Mapping
|
||||
, toUnicode :: ToUnicode
|
||||
} deriving Show
|
||||
type RangeSize = Int
|
||||
type CMap = Map RangeSize [CRange]
|
||||
type Size = Int
|
||||
data CMap = CMap {
|
||||
toUnicodeBySize :: Map Size [CRange]
|
||||
, fromUnicodeBySize :: Map Size FromUnicode
|
||||
}
|
||||
|
||||
toFont :: CMap -> Font
|
||||
toFont aCMap = Font {decode = decoder aCMap, encode = encoder aCMap}
|
||||
toFont (CMap {toUnicodeBySize, fromUnicodeBySize}) =
|
||||
Font {decode = decoder toUnicodeBySize, encode = encoder fromUnicodeBySize}
|
||||
|
||||
decoder :: CMap -> Decoder
|
||||
decoder aCMap input
|
||||
decoder :: Map Size [CRange] -> Decoder
|
||||
decoder rangesBySize input
|
||||
| BS.null input = Right ""
|
||||
| otherwise = do
|
||||
(output, remainingInput) <- trySizes input $ Map.toList aCMap
|
||||
mappend output <$> decoder aCMap remainingInput
|
||||
(output, remainingInput) <- trySizes input $ Map.toList rangesBySize
|
||||
mappend output <$> decoder rangesBySize remainingInput
|
||||
where
|
||||
trySizes s [] = Left $ "No matching code found in font for " ++ unpack s
|
||||
trySizes s ((size, cRanges):others) =
|
||||
|
@ -64,17 +66,14 @@ decoder aCMap input
|
|||
Just outputSequence -> Right (outputSequence, BS.drop size s)
|
||||
tryRanges :: ByteString -> [CRange] -> Maybe Text
|
||||
tryRanges _ [] = Nothing
|
||||
tryRanges prefix ((CRange {mapping}):cRanges) =
|
||||
case Map.lookup prefix mapping of
|
||||
tryRanges prefix ((CRange {toUnicode}):cRanges) =
|
||||
case Map.lookup prefix toUnicode of
|
||||
Nothing -> tryRanges prefix cRanges
|
||||
outputSequence -> outputSequence
|
||||
|
||||
encoder :: CMap -> Encoder
|
||||
encoder :: Map Size FromUnicode -> Encoder
|
||||
encoder = undefined
|
||||
|
||||
emptyCMap :: CMap
|
||||
emptyCMap = Map.empty
|
||||
|
||||
matches :: ByteString -> CRange -> Bool
|
||||
matches code (CRange {fromSequence, toSequence}) =
|
||||
fromSequence <= code && code <= toSequence
|
||||
|
@ -86,6 +85,7 @@ cMap = either fail (return . toFont . snd) . runParser
|
|||
where
|
||||
ignoredLine =
|
||||
takeAll (not . (`elem` EOL.charset)) *> EOL.parser *> return ()
|
||||
emptyCMap = CMap {toUnicodeBySize = Map.empty, fromUnicodeBySize = Map.empty}
|
||||
|
||||
codeRanges :: Parser CMap ()
|
||||
codeRanges = do
|
||||
|
@ -97,13 +97,14 @@ codeRanges = do
|
|||
(,) <$> stringObject <* blank <*> stringObject <* EOL.parser
|
||||
|
||||
createMapping :: (StringObject, StringObject) -> Parser CMap ()
|
||||
createMapping (Hexadecimal from, Hexadecimal to) = modify $
|
||||
Map.insertWith (++) size [CRange {fromSequence, toSequence, mapping}]
|
||||
createMapping (Hexadecimal from, Hexadecimal to) = modify $ \cmap -> cmap {
|
||||
toUnicodeBySize = Map.insertWith (++) size [cRange] (toUnicodeBySize cmap)
|
||||
}
|
||||
where
|
||||
fromSequence = b16ToBytes from
|
||||
size = BS.length fromSequence
|
||||
toSequence = b16ToBytes to
|
||||
mapping = Map.empty
|
||||
cRange = CRange {fromSequence, toSequence, toUnicode = Map.empty}
|
||||
createMapping _ = return ()
|
||||
|
||||
cMapRange :: Parser CMap ()
|
||||
|
@ -112,32 +113,47 @@ cMapRange = do
|
|||
mapM_ saveMapping =<< count size rangeMapping
|
||||
line "endbfrange"
|
||||
where
|
||||
rangeMapping = (,,)
|
||||
<$> (stringObject <* blank)
|
||||
<*> (stringObject <* blank)
|
||||
<*> directObject <* EOL.parser
|
||||
>>= mapFromTo
|
||||
rangeMapping = do
|
||||
from <- (stringObject <* blank)
|
||||
to <- (stringObject <* blank)
|
||||
mapFromTo from to =<< directObject <* EOL.parser
|
||||
|
||||
saveMapping :: [(ByteString, Text)] -> Parser CMap ()
|
||||
saveMapping [] = return ()
|
||||
saveMapping assoc@((code, _):_) = modify $ Map.adjust insertCRange mappingSize
|
||||
saveMapping assoc =
|
||||
modify $ \(CMap {toUnicodeBySize, fromUnicodeBySize}) -> CMap {
|
||||
toUnicodeBySize = saveToUnicodeBySize assoc toUnicodeBySize
|
||||
, fromUnicodeBySize = saveFromUnicodeBySize reversed fromUnicodeBySize
|
||||
}
|
||||
where
|
||||
reversed = (\(a, b) -> (b, a)) <$> assoc
|
||||
|
||||
saveToUnicodeBySize :: [(ByteString, Text)] -> Map Size [CRange] -> Map Size [CRange]
|
||||
saveToUnicodeBySize [] = id
|
||||
saveToUnicodeBySize assoc@((code, _):_) = Map.adjust insertCRange (BS.length code)
|
||||
where
|
||||
newMapping = Map.fromList assoc
|
||||
mappingSize = BS.length code
|
||||
appendMapping cRange =
|
||||
cRange {mapping = mapping cRange `union` newMapping}
|
||||
cRange {toUnicode = toUnicode cRange `union` newMapping}
|
||||
insertCRange = fmap (\cRange ->
|
||||
if code `matches` cRange then appendMapping cRange else cRange
|
||||
)
|
||||
|
||||
saveFromUnicodeBySize :: [(Text, ByteString)] -> Map Size FromUnicode -> Map Size FromUnicode
|
||||
saveFromUnicodeBySize = flip (foldl insertFromUnicode)
|
||||
where
|
||||
insertFromUnicode :: Map Size FromUnicode -> (Text, ByteString) -> Map Size FromUnicode
|
||||
insertFromUnicode tmpFromUnicodeBySize (unicodeSequence, code) =
|
||||
let size = Text.length unicodeSequence in
|
||||
Map.adjust (Map.insert unicodeSequence code) size tmpFromUnicodeBySize
|
||||
|
||||
cMapChar :: Parser CMap ()
|
||||
cMapChar = do
|
||||
size <- integer <* line "beginbfchar"
|
||||
saveMapping =<< count size charMapping <* line "endbfchar"
|
||||
where
|
||||
charMapping =
|
||||
(,) <$> stringObject <* blank <*> stringObject <* EOL.parser
|
||||
>>= pairMapping
|
||||
charMapping = do
|
||||
from <- stringObject <* blank
|
||||
pairMapping from =<< stringObject <* EOL.parser
|
||||
|
||||
between :: B16Int -> B16Int -> [ByteString]
|
||||
between from@(B16Int s) to =
|
||||
|
@ -149,20 +165,20 @@ startFrom from@(B16Int s) =
|
|||
let size = BS.length s `div` 2 in
|
||||
toBytes size <$> [b16ToInt from .. ]
|
||||
|
||||
mapFromTo :: MonadParser m => (StringObject, StringObject, DirectObject) -> m [(ByteString, Text)]
|
||||
mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) =
|
||||
mapFromTo :: MonadParser m => StringObject -> StringObject -> DirectObject -> m [(ByteString, Text)]
|
||||
mapFromTo (Hexadecimal from) (Hexadecimal to) (StringObject (Hexadecimal dstFrom)) =
|
||||
return $ zip (between from to) (utf16BEToutf8 <$> startFrom dstFrom)
|
||||
|
||||
mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) =
|
||||
mapFromTo (Hexadecimal from) (Hexadecimal to) (Array dstPoints) =
|
||||
zip (between from to) <$> (mapM dstByteString dstPoints)
|
||||
where
|
||||
dstByteString (StringObject (Hexadecimal dst)) =
|
||||
return . utf16BEToutf8 $ b16ToBytes dst
|
||||
dstByteString _ = fail "Invalid for a replacement string"
|
||||
|
||||
mapFromTo _ = fail "invalid range mapping found"
|
||||
mapFromTo _ _ _ = fail "invalid range mapping found"
|
||||
|
||||
pairMapping :: MonadParser m => (StringObject, StringObject) -> m (ByteString, Text)
|
||||
pairMapping (Hexadecimal from, Hexadecimal to) =
|
||||
pairMapping :: MonadParser m => StringObject -> StringObject -> m (ByteString, Text)
|
||||
pairMapping (Hexadecimal from) (Hexadecimal to) =
|
||||
return (b16ToBytes from, utf16BEToutf8 $ b16ToBytes to)
|
||||
pairMapping _ = fail "invalid pair mapping found"
|
||||
pairMapping _ _ = fail "invalid pair mapping found"
|
||||
|
|
Loading…
Reference in a new issue