{-# LANGUAGE NamedFieldPuns #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE FlexibleContexts #-} module PDF.CMap ( CMap , CMappers , CRange(..) , matches , parse ) where import Control.Applicative ((<|>), many) import Control.Monad.Except (MonadError(..)) import Control.Monad.Fail (fail) import Control.Monad.State (modify) import Data.Attoparsec.ByteString.Char8 (count) import Data.ByteString (ByteString) import qualified Data.ByteString as BS (drop, length, null, take) import Data.ByteString.Char8 (unpack) import Data.ByteString.Char8.Util ( B16Int(..), b16ToBytes, b16ToInt, toBytes, utf16BEToutf8 ) import Data.Foldable (foldl', foldr') import Data.Map (Map, mapWithKey, union) import qualified Data.Map as Map ( adjust, empty, fromList, insert, insertWith, lookup, toList ) import Data.Text (Text) import qualified Data.Text as Text (length, null, splitAt) import qualified PDF.EOL as EOL (charset, parser) import PDF.Font (Decoder, Encoder, Font(..)) import PDF.Object ( DirectObject(..), Name, StringObject(..) , blank, directObject, integer, line, stringObject ) import PDF.Parser (MonadParser, Parser, runParser, takeAll) import Prelude hiding (fail) type CMappers = Map Name CMap type ToUnicode = Map ByteString Text type FromUnicode = Map Text ByteString data CRange = CRange { fromSequence :: ByteString , toSequence :: ByteString , toUnicode :: ToUnicode } deriving Show type Size = Int data CMap = CMap { toUnicodeBySize :: Map Size [CRange] , fromUnicodeBySize :: Map Size FromUnicode } toFont :: CMap -> Font toFont (CMap {toUnicodeBySize, fromUnicodeBySize}) = Font {decode = decoder toUnicodeBySize, encode = encoder fromUnicodeBySize} decoder :: Map Size [CRange] -> Decoder decoder rangesBySize input | BS.null input = Right "" | otherwise = do (output, remainingInput) <- trySizes input $ Map.toList rangesBySize mappend output <$> decoder rangesBySize remainingInput where trySizes s [] = Left $ "No matching code found in font for " ++ unpack s trySizes s ((size, cRanges):others) = let prefix = BS.take size s in case tryRanges prefix cRanges of Nothing -> trySizes s others Just outputSequence -> Right (outputSequence, BS.drop size s) tryRanges :: ByteString -> [CRange] -> Maybe Text tryRanges _ [] = Nothing tryRanges prefix ((CRange {toUnicode}):cRanges) = case Map.lookup prefix toUnicode of Nothing -> tryRanges prefix cRanges outputSequence -> outputSequence encoder :: Map Size FromUnicode -> Encoder encoder fromUnicodes input | Text.null input = Right "" | otherwise = foldr' (<>) (Left "No encoding found") $ mapWithKey tryOn fromUnicodes where tryOn size fromUnicode = let (prefix, end) = Text.splitAt size input in case Map.lookup prefix fromUnicode of Nothing -> Left "" Just code -> (code <>) <$> encoder fromUnicodes end matches :: ByteString -> CRange -> Bool matches code (CRange {fromSequence, toSequence}) = fromSequence <= code && code <= toSequence parse :: MonadError String m => ByteString -> m Font parse = either throwError (return . toFont . snd) . runParser (many (codeRanges <|> cMapRange <|> cMapChar <|> ignoredLine)) emptyCMap where ignoredLine = takeAll (not . (`elem` EOL.charset)) *> EOL.parser *> return () emptyCMap = CMap {toUnicodeBySize = Map.empty, fromUnicodeBySize = Map.empty} codeRanges :: Parser CMap () codeRanges = do size <- integer <* line "begincodespacerange" mapM_ createMapping =<< count size codeRange line "endcodespacerange" where codeRange = (,) <$> stringObject <* blank <*> stringObject <* EOL.parser createMapping :: (StringObject, StringObject) -> Parser CMap () createMapping (Hexadecimal from, Hexadecimal to) = modify $ \cmap -> cmap { toUnicodeBySize = Map.insertWith (++) size [cRange] (toUnicodeBySize cmap) } where fromSequence = b16ToBytes from size = BS.length fromSequence toSequence = b16ToBytes to cRange = CRange {fromSequence, toSequence, toUnicode = Map.empty} createMapping _ = return () cMapRange :: Parser CMap () cMapRange = do size <- integer <* line "beginbfrange" mapM_ saveMapping =<< count size rangeMapping line "endbfrange" where rangeMapping = do from <- (stringObject <* blank) to <- (stringObject <* blank) mapFromTo from to =<< directObject <* EOL.parser saveMapping :: [(ByteString, Text)] -> Parser CMap () saveMapping assoc = modify $ \(CMap {toUnicodeBySize, fromUnicodeBySize}) -> CMap { toUnicodeBySize = saveToUnicodeBySize assoc toUnicodeBySize , fromUnicodeBySize = saveFromUnicodeBySize reversed fromUnicodeBySize } where reversed = (\(a, b) -> (b, a)) <$> assoc saveToUnicodeBySize :: [(ByteString, Text)] -> Map Size [CRange] -> Map Size [CRange] saveToUnicodeBySize [] = id saveToUnicodeBySize assoc@((code, _):_) = Map.adjust insertCRange (BS.length code) where newMapping = Map.fromList assoc appendMapping cRange = cRange {toUnicode = toUnicode cRange `union` newMapping} insertCRange = fmap (\cRange -> if code `matches` cRange then appendMapping cRange else cRange ) saveFromUnicodeBySize :: [(Text, ByteString)] -> Map Size FromUnicode -> Map Size FromUnicode saveFromUnicodeBySize = flip (foldl' insertFromUnicode) where insertFromUnicode :: Map Size FromUnicode -> (Text, ByteString) -> Map Size FromUnicode insertFromUnicode tmpFromUnicodeBySize (unicodeSequence, code) = let size = Text.length unicodeSequence in Map.adjust (Map.insert unicodeSequence code) size tmpFromUnicodeBySize cMapChar :: Parser CMap () cMapChar = do size <- integer <* line "beginbfchar" saveMapping =<< count size charMapping <* line "endbfchar" where charMapping = do from <- stringObject <* blank pairMapping from =<< stringObject <* EOL.parser between :: B16Int -> B16Int -> [ByteString] between from@(B16Int s) to = let size = BS.length s `div` 2 in toBytes size <$> [b16ToInt from .. b16ToInt to] startFrom :: B16Int -> [ByteString] startFrom from@(B16Int s) = let size = BS.length s `div` 2 in toBytes size <$> [b16ToInt from .. ] mapFromTo :: MonadParser m => StringObject -> StringObject -> DirectObject -> m [(ByteString, Text)] mapFromTo (Hexadecimal from) (Hexadecimal to) (StringObject (Hexadecimal dstFrom)) = return $ zip (between from to) (utf16BEToutf8 <$> startFrom dstFrom) mapFromTo (Hexadecimal from) (Hexadecimal to) (Array dstPoints) = zip (between from to) <$> (mapM dstByteString dstPoints) where dstByteString (StringObject (Hexadecimal dst)) = return . utf16BEToutf8 $ b16ToBytes dst dstByteString _ = fail "Invalid for a replacement string" mapFromTo _ _ _ = fail "invalid range mapping found" pairMapping :: MonadParser m => StringObject -> StringObject -> m (ByteString, Text) pairMapping (Hexadecimal from) (Hexadecimal to) = return (b16ToBytes from, utf16BEToutf8 $ b16ToBytes to) pairMapping _ _ = fail "invalid pair mapping found"