diff --git a/src/PDF/CMap.hs b/src/PDF/CMap.hs index 93a2ce2..729ff1d 100644 --- a/src/PDF/CMap.hs +++ b/src/PDF/CMap.hs @@ -5,7 +5,6 @@ module PDF.CMap ( , CMappers , CRange(..) , cMap - , emptyCMap , matches ) where @@ -21,9 +20,10 @@ import Data.ByteString.Char8.Util ( ) import Data.Map (Map, union) import qualified Data.Map as Map ( - adjust, empty, fromList, insertWith, lookup, toList + adjust, empty, fromList, insert, insertWith, lookup, toList ) import Data.Text (Text) +import qualified Data.Text as Text (length) import qualified PDF.EOL as EOL (charset, parser) import PDF.Font (Decoder, Encoder, Font(..)) import PDF.Object ( @@ -34,27 +34,29 @@ import PDF.Parser (MonadParser, Parser, runParser, takeAll) import Prelude hiding (fail) type CMappers = Map Name CMap -data Mapping = Mapping { - bsToT :: Map ByteString Text - , tToBS :: Map Text ByteString - } +type ToUnicode = Map ByteString Text +type FromUnicode = Map Text ByteString data CRange = CRange { fromSequence :: ByteString , toSequence :: ByteString - , mapping :: Mapping + , toUnicode :: ToUnicode } deriving Show -type RangeSize = Int -type CMap = Map RangeSize [CRange] +type Size = Int +data CMap = CMap { + toUnicodeBySize :: Map Size [CRange] + , fromUnicodeBySize :: Map Size FromUnicode + } toFont :: CMap -> Font -toFont aCMap = Font {decode = decoder aCMap, encode = encoder aCMap} +toFont (CMap {toUnicodeBySize, fromUnicodeBySize}) = + Font {decode = decoder toUnicodeBySize, encode = encoder fromUnicodeBySize} -decoder :: CMap -> Decoder -decoder aCMap input +decoder :: Map Size [CRange] -> Decoder +decoder rangesBySize input | BS.null input = Right "" | otherwise = do - (output, remainingInput) <- trySizes input $ Map.toList aCMap - mappend output <$> decoder aCMap remainingInput + (output, remainingInput) <- trySizes input $ Map.toList rangesBySize + mappend output <$> decoder rangesBySize remainingInput where trySizes s [] = Left $ "No matching code found in font for " ++ unpack s trySizes s ((size, cRanges):others) = @@ -64,17 +66,14 @@ decoder aCMap input Just outputSequence -> Right (outputSequence, BS.drop size s) tryRanges :: ByteString -> [CRange] -> Maybe Text tryRanges _ [] = Nothing - tryRanges prefix ((CRange {mapping}):cRanges) = - case Map.lookup prefix mapping of + tryRanges prefix ((CRange {toUnicode}):cRanges) = + case Map.lookup prefix toUnicode of Nothing -> tryRanges prefix cRanges outputSequence -> outputSequence -encoder :: CMap -> Encoder +encoder :: Map Size FromUnicode -> Encoder encoder = undefined -emptyCMap :: CMap -emptyCMap = Map.empty - matches :: ByteString -> CRange -> Bool matches code (CRange {fromSequence, toSequence}) = fromSequence <= code && code <= toSequence @@ -86,6 +85,7 @@ cMap = either fail (return . toFont . snd) . runParser where ignoredLine = takeAll (not . (`elem` EOL.charset)) *> EOL.parser *> return () + emptyCMap = CMap {toUnicodeBySize = Map.empty, fromUnicodeBySize = Map.empty} codeRanges :: Parser CMap () codeRanges = do @@ -97,13 +97,14 @@ codeRanges = do (,) <$> stringObject <* blank <*> stringObject <* EOL.parser createMapping :: (StringObject, StringObject) -> Parser CMap () -createMapping (Hexadecimal from, Hexadecimal to) = modify $ - Map.insertWith (++) size [CRange {fromSequence, toSequence, mapping}] +createMapping (Hexadecimal from, Hexadecimal to) = modify $ \cmap -> cmap { + toUnicodeBySize = Map.insertWith (++) size [cRange] (toUnicodeBySize cmap) + } where fromSequence = b16ToBytes from size = BS.length fromSequence toSequence = b16ToBytes to - mapping = Map.empty + cRange = CRange {fromSequence, toSequence, toUnicode = Map.empty} createMapping _ = return () cMapRange :: Parser CMap () @@ -112,32 +113,47 @@ cMapRange = do mapM_ saveMapping =<< count size rangeMapping line "endbfrange" where - rangeMapping = (,,) - <$> (stringObject <* blank) - <*> (stringObject <* blank) - <*> directObject <* EOL.parser - >>= mapFromTo + rangeMapping = do + from <- (stringObject <* blank) + to <- (stringObject <* blank) + mapFromTo from to =<< directObject <* EOL.parser saveMapping :: [(ByteString, Text)] -> Parser CMap () -saveMapping [] = return () -saveMapping assoc@((code, _):_) = modify $ Map.adjust insertCRange mappingSize +saveMapping assoc = + modify $ \(CMap {toUnicodeBySize, fromUnicodeBySize}) -> CMap { + toUnicodeBySize = saveToUnicodeBySize assoc toUnicodeBySize + , fromUnicodeBySize = saveFromUnicodeBySize reversed fromUnicodeBySize + } + where + reversed = (\(a, b) -> (b, a)) <$> assoc + +saveToUnicodeBySize :: [(ByteString, Text)] -> Map Size [CRange] -> Map Size [CRange] +saveToUnicodeBySize [] = id +saveToUnicodeBySize assoc@((code, _):_) = Map.adjust insertCRange (BS.length code) where newMapping = Map.fromList assoc - mappingSize = BS.length code appendMapping cRange = - cRange {mapping = mapping cRange `union` newMapping} + cRange {toUnicode = toUnicode cRange `union` newMapping} insertCRange = fmap (\cRange -> if code `matches` cRange then appendMapping cRange else cRange ) +saveFromUnicodeBySize :: [(Text, ByteString)] -> Map Size FromUnicode -> Map Size FromUnicode +saveFromUnicodeBySize = flip (foldl insertFromUnicode) + where + insertFromUnicode :: Map Size FromUnicode -> (Text, ByteString) -> Map Size FromUnicode + insertFromUnicode tmpFromUnicodeBySize (unicodeSequence, code) = + let size = Text.length unicodeSequence in + Map.adjust (Map.insert unicodeSequence code) size tmpFromUnicodeBySize + cMapChar :: Parser CMap () cMapChar = do size <- integer <* line "beginbfchar" saveMapping =<< count size charMapping <* line "endbfchar" where - charMapping = - (,) <$> stringObject <* blank <*> stringObject <* EOL.parser - >>= pairMapping + charMapping = do + from <- stringObject <* blank + pairMapping from =<< stringObject <* EOL.parser between :: B16Int -> B16Int -> [ByteString] between from@(B16Int s) to = @@ -149,20 +165,20 @@ startFrom from@(B16Int s) = let size = BS.length s `div` 2 in toBytes size <$> [b16ToInt from .. ] -mapFromTo :: MonadParser m => (StringObject, StringObject, DirectObject) -> m [(ByteString, Text)] -mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) = +mapFromTo :: MonadParser m => StringObject -> StringObject -> DirectObject -> m [(ByteString, Text)] +mapFromTo (Hexadecimal from) (Hexadecimal to) (StringObject (Hexadecimal dstFrom)) = return $ zip (between from to) (utf16BEToutf8 <$> startFrom dstFrom) -mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) = +mapFromTo (Hexadecimal from) (Hexadecimal to) (Array dstPoints) = zip (between from to) <$> (mapM dstByteString dstPoints) where dstByteString (StringObject (Hexadecimal dst)) = return . utf16BEToutf8 $ b16ToBytes dst dstByteString _ = fail "Invalid for a replacement string" -mapFromTo _ = fail "invalid range mapping found" +mapFromTo _ _ _ = fail "invalid range mapping found" -pairMapping :: MonadParser m => (StringObject, StringObject) -> m (ByteString, Text) -pairMapping (Hexadecimal from, Hexadecimal to) = +pairMapping :: MonadParser m => StringObject -> StringObject -> m (ByteString, Text) +pairMapping (Hexadecimal from) (Hexadecimal to) = return (b16ToBytes from, utf16BEToutf8 $ b16ToBytes to) -pairMapping _ = fail "invalid pair mapping found" +pairMapping _ _ = fail "invalid pair mapping found"