{-# LANGUAGE NamedFieldPuns #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE TypeSynonymInstances #-} module PDF.CMap ( CMap , CMappers , CRange(..) , cMap , emptyCMap , matches ) where import Control.Applicative ((<|>), many) import Control.Monad.State (modify) import Data.Attoparsec.ByteString.Char8 (count) import Data.ByteString (ByteString) import qualified Data.ByteString as BS (drop, length, null, take) import Data.ByteString.Char8 (unpack) import Data.ByteString.Char8.Util ( B16Int(..), b16ToBytes, b16ToInt, toBytes, utf16BEToutf8 ) import Data.Map (Map, union) import qualified Data.Map as Map ( adjust, empty, fromList, insertWith, lookup, toList ) import Data.Text (Text) import qualified PDF.EOL as EOL (charset, parser) import PDF.Font (Font) import PDF.Object ( DirectObject(..), Name, StringObject(..) , blank, directObject, integer, line, stringObject ) import PDF.Parser (MonadParser, Parser, runParser, takeAll) type CMappers = Map Name CMap type Mapping = Map ByteString Text data CRange = CRange { fromSequence :: ByteString , toSequence :: ByteString , mapping :: Mapping } deriving Show type RangeSize = Int type CMap = Map RangeSize [CRange] toFont :: CMap -> Font toFont aCMap input | BS.null input = Right "" | otherwise = do (output, remainingInput) <- trySizes input $ Map.toList aCMap mappend output <$> toFont aCMap remainingInput where trySizes s [] = Left $ "No matching code found in font for " ++ unpack s trySizes s ((size, cRanges):others) = let prefix = BS.take size s in case tryRanges prefix cRanges of Nothing -> trySizes s others Just outputSequence -> Right (outputSequence, BS.drop size s) tryRanges :: ByteString -> [CRange] -> Maybe Text tryRanges _ [] = Nothing tryRanges prefix ((CRange {mapping}):cRanges) = case Map.lookup prefix mapping of Nothing -> tryRanges prefix cRanges outputSequence -> outputSequence emptyCMap :: CMap emptyCMap = Map.empty matches :: ByteString -> CRange -> Bool matches code (CRange {fromSequence, toSequence}) = fromSequence <= code && code <= toSequence cMap :: ByteString -> Either String Font cMap = fmap (toFont . snd) <$> runParser (many (codeRanges <|> cMapRange <|> cMapChar <|> ignoredLine)) emptyCMap where ignoredLine = takeAll (not . (`elem` EOL.charset)) *> EOL.parser *> return () codeRanges :: Parser CMap () codeRanges = do size <- integer <* line "begincodespacerange" mapM_ createMapping =<< count size codeRange line "endcodespacerange" where codeRange = (,) <$> stringObject <* blank <*> stringObject <* EOL.parser createMapping :: (StringObject, StringObject) -> Parser CMap () createMapping (Hexadecimal from, Hexadecimal to) = modify $ Map.insertWith (++) size [CRange {fromSequence, toSequence, mapping}] where fromSequence = b16ToBytes from size = BS.length fromSequence toSequence = b16ToBytes to mapping = Map.empty createMapping _ = return () cMapRange :: Parser CMap () cMapRange = do size <- integer <* line "beginbfrange" mapM_ saveMapping =<< count size rangeMapping line "endbfrange" where rangeMapping = (,,) <$> (stringObject <* blank) <*> (stringObject <* blank) <*> directObject <* EOL.parser >>= mapFromTo saveMapping :: [(ByteString, Text)] -> Parser CMap () saveMapping [] = return () saveMapping assoc@((code, _):_) = modify $ Map.adjust insertCRange mappingSize where newMapping = Map.fromList assoc mappingSize = BS.length code appendMapping cRange = cRange {mapping = mapping cRange `union` newMapping} insertCRange = fmap (\cRange -> if code `matches` cRange then appendMapping cRange else cRange ) cMapChar :: Parser CMap () cMapChar = do size <- integer <* line "beginbfchar" saveMapping =<< count size charMapping <* line "endbfchar" where charMapping = (,) <$> stringObject <* blank <*> stringObject <* EOL.parser >>= pairMapping between :: B16Int -> B16Int -> [ByteString] between from@(B16Int s) to = let size = BS.length s `div` 2 in toBytes size <$> [b16ToInt from .. b16ToInt to] startFrom :: B16Int -> [ByteString] startFrom from@(B16Int s) = let size = BS.length s `div` 2 in toBytes size <$> [b16ToInt from .. ] mapFromTo :: MonadParser m => (StringObject, StringObject, DirectObject) -> m [(ByteString, Text)] mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) = return $ zip (between from to) (utf16BEToutf8 <$> startFrom dstFrom) mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) = zip (between from to) <$> (mapM dstByteString dstPoints) where dstByteString (StringObject (Hexadecimal dst)) = return . utf16BEToutf8 $ b16ToBytes dst dstByteString _ = fail "Invalid for a replacement string" mapFromTo _ = fail "invalid range mapping found" pairMapping :: MonadParser m => (StringObject, StringObject) -> m (ByteString, Text) pairMapping (Hexadecimal from, Hexadecimal to) = return (b16ToBytes from, utf16BEToutf8 $ b16ToBytes to) pairMapping _ = fail "invalid pair mapping found"