2019-09-30 14:13:12 +02:00
|
|
|
{-# LANGUAGE NamedFieldPuns #-}
|
2020-02-05 14:42:51 +01:00
|
|
|
{-# LANGUAGE FlexibleInstances #-}
|
|
|
|
{-# LANGUAGE TypeSynonymInstances #-}
|
2019-09-27 18:16:12 +02:00
|
|
|
module PDF.CMap (
|
|
|
|
CMap
|
|
|
|
, CMappers
|
2019-10-03 07:59:09 +02:00
|
|
|
, CRange(..)
|
2019-09-27 18:16:12 +02:00
|
|
|
, cMap
|
|
|
|
, emptyCMap
|
2019-10-03 07:59:09 +02:00
|
|
|
, matches
|
2019-09-27 18:16:12 +02:00
|
|
|
) where
|
|
|
|
|
|
|
|
import Control.Applicative ((<|>), many)
|
2019-09-30 14:13:12 +02:00
|
|
|
import Control.Monad.State (modify)
|
|
|
|
import Data.Attoparsec.ByteString.Char8 (count)
|
|
|
|
import Data.ByteString (ByteString)
|
|
|
|
import qualified Data.ByteString as BS (length)
|
|
|
|
import Data.ByteString.Char8.Util (
|
2019-10-04 18:46:07 +02:00
|
|
|
B16Int(..), b16ToBytes, b16ToInt, toBytes, utf16BEToutf8
|
2019-09-30 14:13:12 +02:00
|
|
|
)
|
|
|
|
import Data.Map (Map, union)
|
|
|
|
import qualified Data.Map as Map (adjust, empty, fromList, insertWith)
|
2019-09-27 18:16:12 +02:00
|
|
|
import qualified PDF.EOL as EOL (charset, parser)
|
2020-02-05 14:42:51 +01:00
|
|
|
import PDF.Font (Font(..))
|
2019-09-27 18:16:12 +02:00
|
|
|
import PDF.Object (
|
|
|
|
DirectObject(..), Name, StringObject(..)
|
2019-09-30 14:13:12 +02:00
|
|
|
, blank, directObject, integer, line, stringObject
|
2019-09-27 18:16:12 +02:00
|
|
|
)
|
2019-09-30 14:13:12 +02:00
|
|
|
import PDF.Parser (MonadParser, Parser, runParser, takeAll)
|
2019-09-27 18:16:12 +02:00
|
|
|
|
|
|
|
type CMappers = Map Name CMap
|
2019-09-30 14:13:12 +02:00
|
|
|
type Mapping = Map ByteString ByteString
|
|
|
|
data CRange = CRange {
|
|
|
|
fromSequence :: ByteString
|
|
|
|
, toSequence :: ByteString
|
|
|
|
, mapping :: Mapping
|
2019-10-03 14:59:06 +02:00
|
|
|
} deriving Show
|
2019-09-30 14:13:12 +02:00
|
|
|
type RangeSize = Int
|
|
|
|
type CMap = Map RangeSize [CRange]
|
2019-09-27 18:16:12 +02:00
|
|
|
|
2020-02-05 14:42:51 +01:00
|
|
|
instance Font CMap where
|
|
|
|
decode = undefined
|
|
|
|
|
2019-09-27 18:16:12 +02:00
|
|
|
emptyCMap :: CMap
|
|
|
|
emptyCMap = Map.empty
|
|
|
|
|
2019-10-03 07:59:09 +02:00
|
|
|
matches :: ByteString -> CRange -> Bool
|
|
|
|
matches code (CRange {fromSequence, toSequence}) =
|
|
|
|
fromSequence <= code && code <= toSequence
|
|
|
|
|
2019-09-27 18:16:12 +02:00
|
|
|
cMap :: ByteString -> Either String CMap
|
2019-09-30 14:13:12 +02:00
|
|
|
cMap = fmap snd <$> runParser
|
|
|
|
(many (codeRanges <|> cMapRange <|> cMapChar <|> ignoredLine))
|
|
|
|
emptyCMap
|
2019-09-27 18:16:12 +02:00
|
|
|
where
|
|
|
|
ignoredLine =
|
2019-09-30 14:13:12 +02:00
|
|
|
takeAll (not . (`elem` EOL.charset)) *> EOL.parser *> return ()
|
|
|
|
|
|
|
|
codeRanges :: Parser CMap ()
|
|
|
|
codeRanges = do
|
|
|
|
size <- integer <* line "begincodespacerange"
|
2019-10-03 14:59:06 +02:00
|
|
|
mapM_ createMapping =<< count size codeRange
|
2019-09-30 14:13:12 +02:00
|
|
|
line "endcodespacerange"
|
|
|
|
where
|
|
|
|
codeRange =
|
2019-10-03 14:59:06 +02:00
|
|
|
(,) <$> stringObject <* blank <*> stringObject <* EOL.parser
|
2019-09-30 14:13:12 +02:00
|
|
|
|
|
|
|
createMapping :: (StringObject, StringObject) -> Parser CMap ()
|
|
|
|
createMapping (Hexadecimal from, Hexadecimal to) = modify $
|
|
|
|
Map.insertWith (++) size [CRange {fromSequence, toSequence, mapping}]
|
|
|
|
where
|
2019-10-04 18:46:07 +02:00
|
|
|
fromSequence = b16ToBytes from
|
2019-09-30 14:13:12 +02:00
|
|
|
size = BS.length fromSequence
|
2019-10-04 18:46:07 +02:00
|
|
|
toSequence = b16ToBytes to
|
2019-09-30 14:13:12 +02:00
|
|
|
mapping = Map.empty
|
|
|
|
createMapping _ = return ()
|
2019-09-27 18:16:12 +02:00
|
|
|
|
2019-09-30 14:13:12 +02:00
|
|
|
cMapRange :: Parser CMap ()
|
2019-09-27 18:16:12 +02:00
|
|
|
cMapRange = do
|
|
|
|
size <- integer <* line "beginbfrange"
|
2019-10-03 14:59:06 +02:00
|
|
|
mapM_ saveMapping =<< count size rangeMapping
|
|
|
|
line "endbfrange"
|
2019-09-27 18:16:12 +02:00
|
|
|
where
|
|
|
|
rangeMapping = (,,)
|
|
|
|
<$> (stringObject <* blank)
|
|
|
|
<*> (stringObject <* blank)
|
|
|
|
<*> directObject <* EOL.parser
|
|
|
|
>>= mapFromTo
|
|
|
|
|
2019-09-30 14:13:12 +02:00
|
|
|
saveMapping :: [(ByteString, ByteString)] -> Parser CMap ()
|
|
|
|
saveMapping [] = return ()
|
|
|
|
saveMapping assoc@((code, _):_) = modify $ Map.adjust insertCRange mappingSize
|
|
|
|
where
|
|
|
|
newMapping = Map.fromList assoc
|
|
|
|
mappingSize = BS.length code
|
|
|
|
appendMapping cRange =
|
|
|
|
cRange {mapping = mapping cRange `union` newMapping}
|
|
|
|
insertCRange = fmap (\cRange ->
|
2019-10-03 07:59:09 +02:00
|
|
|
if code `matches` cRange then appendMapping cRange else cRange
|
2019-09-30 14:13:12 +02:00
|
|
|
)
|
|
|
|
|
|
|
|
cMapChar :: Parser CMap ()
|
2019-09-27 18:16:12 +02:00
|
|
|
cMapChar = do
|
|
|
|
size <- integer <* line "beginbfchar"
|
2019-09-30 14:13:12 +02:00
|
|
|
saveMapping =<< count size charMapping <* line "endbfchar"
|
2019-09-27 18:16:12 +02:00
|
|
|
where
|
|
|
|
charMapping =
|
2019-10-03 14:59:06 +02:00
|
|
|
(,) <$> stringObject <* blank <*> stringObject <* EOL.parser
|
2019-09-27 18:16:12 +02:00
|
|
|
>>= pairMapping
|
|
|
|
|
2019-10-04 18:46:07 +02:00
|
|
|
between :: B16Int -> B16Int -> [ByteString]
|
|
|
|
between from@(B16Int s) to =
|
|
|
|
let size = BS.length s `div` 2 in
|
|
|
|
toBytes size <$> [b16ToInt from .. b16ToInt to]
|
2019-09-30 14:13:12 +02:00
|
|
|
|
2019-10-04 18:46:07 +02:00
|
|
|
startFrom :: B16Int -> [ByteString]
|
|
|
|
startFrom from@(B16Int s) =
|
|
|
|
let size = BS.length s `div` 2 in
|
|
|
|
toBytes size <$> [b16ToInt from .. ]
|
2019-09-30 14:13:12 +02:00
|
|
|
|
|
|
|
mapFromTo :: MonadParser m => (StringObject, StringObject, DirectObject) -> m [(ByteString, ByteString)]
|
2019-09-27 18:16:12 +02:00
|
|
|
mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) =
|
2019-10-04 18:46:07 +02:00
|
|
|
return $ zip (between from to) (utf16BEToutf8 <$> startFrom dstFrom)
|
2019-09-27 18:16:12 +02:00
|
|
|
|
|
|
|
mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) =
|
2019-10-04 18:46:07 +02:00
|
|
|
zip (between from to) <$> (mapM dstByteString dstPoints)
|
2019-09-27 18:16:12 +02:00
|
|
|
where
|
2019-09-30 14:13:12 +02:00
|
|
|
dstByteString (StringObject (Hexadecimal dst)) =
|
2019-10-04 18:46:07 +02:00
|
|
|
return . utf16BEToutf8 $ b16ToBytes dst
|
2019-09-30 14:13:12 +02:00
|
|
|
dstByteString _ = fail "Invalid for a replacement string"
|
2019-09-27 18:16:12 +02:00
|
|
|
|
|
|
|
mapFromTo _ = fail "invalid range mapping found"
|
|
|
|
|
2019-09-30 14:13:12 +02:00
|
|
|
pairMapping :: MonadParser m => (StringObject, StringObject) -> m (ByteString, ByteString)
|
2019-09-27 18:16:12 +02:00
|
|
|
pairMapping (Hexadecimal from, Hexadecimal to) =
|
2019-10-04 18:46:07 +02:00
|
|
|
return (b16ToBytes from, utf16BEToutf8 $ b16ToBytes to)
|
2019-09-27 18:16:12 +02:00
|
|
|
pairMapping _ = fail "invalid pair mapping found"
|