2019-09-30 14:13:12 +02:00
|
|
|
{-# LANGUAGE NamedFieldPuns #-}
|
2020-02-08 08:15:32 +01:00
|
|
|
{-# LANGUAGE OverloadedStrings #-}
|
2019-09-27 18:16:12 +02:00
|
|
|
module PDF.CMap (
|
|
|
|
CMap
|
|
|
|
, CMappers
|
2019-10-03 07:59:09 +02:00
|
|
|
, CRange(..)
|
2019-09-27 18:16:12 +02:00
|
|
|
, cMap
|
2019-10-03 07:59:09 +02:00
|
|
|
, matches
|
2019-09-27 18:16:12 +02:00
|
|
|
) where
|
|
|
|
|
|
|
|
import Control.Applicative ((<|>), many)
|
2020-02-11 08:29:08 +01:00
|
|
|
import Control.Monad.Fail (MonadFail(..))
|
2019-09-30 14:13:12 +02:00
|
|
|
import Control.Monad.State (modify)
|
|
|
|
import Data.Attoparsec.ByteString.Char8 (count)
|
|
|
|
import Data.ByteString (ByteString)
|
2020-02-08 08:15:32 +01:00
|
|
|
import qualified Data.ByteString as BS (drop, length, null, take)
|
|
|
|
import Data.ByteString.Char8 (unpack)
|
2019-09-30 14:13:12 +02:00
|
|
|
import Data.ByteString.Char8.Util (
|
2019-10-04 18:46:07 +02:00
|
|
|
B16Int(..), b16ToBytes, b16ToInt, toBytes, utf16BEToutf8
|
2019-09-30 14:13:12 +02:00
|
|
|
)
|
2020-03-08 22:14:36 +01:00
|
|
|
import Data.Map (Map, mapWithKey, union)
|
2020-02-08 08:15:32 +01:00
|
|
|
import qualified Data.Map as Map (
|
2020-03-08 00:02:24 +01:00
|
|
|
adjust, empty, fromList, insert, insertWith, lookup, toList
|
2020-02-08 08:15:32 +01:00
|
|
|
)
|
|
|
|
import Data.Text (Text)
|
2020-03-08 22:14:36 +01:00
|
|
|
import qualified Data.Text as Text (length, null, splitAt)
|
2019-09-27 18:16:12 +02:00
|
|
|
import qualified PDF.EOL as EOL (charset, parser)
|
2020-03-06 19:19:53 +01:00
|
|
|
import PDF.Font (Decoder, Encoder, Font(..))
|
2019-09-27 18:16:12 +02:00
|
|
|
import PDF.Object (
|
|
|
|
DirectObject(..), Name, StringObject(..)
|
2019-09-30 14:13:12 +02:00
|
|
|
, blank, directObject, integer, line, stringObject
|
2019-09-27 18:16:12 +02:00
|
|
|
)
|
2019-09-30 14:13:12 +02:00
|
|
|
import PDF.Parser (MonadParser, Parser, runParser, takeAll)
|
2020-02-11 08:29:08 +01:00
|
|
|
import Prelude hiding (fail)
|
2019-09-27 18:16:12 +02:00
|
|
|
|
|
|
|
type CMappers = Map Name CMap
|
2020-03-08 00:02:24 +01:00
|
|
|
type ToUnicode = Map ByteString Text
|
|
|
|
type FromUnicode = Map Text ByteString
|
2019-09-30 14:13:12 +02:00
|
|
|
data CRange = CRange {
|
|
|
|
fromSequence :: ByteString
|
|
|
|
, toSequence :: ByteString
|
2020-03-08 00:02:24 +01:00
|
|
|
, toUnicode :: ToUnicode
|
2019-10-03 14:59:06 +02:00
|
|
|
} deriving Show
|
2020-03-08 00:02:24 +01:00
|
|
|
type Size = Int
|
|
|
|
data CMap = CMap {
|
|
|
|
toUnicodeBySize :: Map Size [CRange]
|
|
|
|
, fromUnicodeBySize :: Map Size FromUnicode
|
|
|
|
}
|
2019-09-27 18:16:12 +02:00
|
|
|
|
2020-02-08 08:15:32 +01:00
|
|
|
toFont :: CMap -> Font
|
2020-03-08 00:02:24 +01:00
|
|
|
toFont (CMap {toUnicodeBySize, fromUnicodeBySize}) =
|
|
|
|
Font {decode = decoder toUnicodeBySize, encode = encoder fromUnicodeBySize}
|
2020-03-06 19:19:53 +01:00
|
|
|
|
2020-03-08 00:02:24 +01:00
|
|
|
decoder :: Map Size [CRange] -> Decoder
|
|
|
|
decoder rangesBySize input
|
2020-02-08 08:15:32 +01:00
|
|
|
| BS.null input = Right ""
|
|
|
|
| otherwise = do
|
2020-03-08 00:02:24 +01:00
|
|
|
(output, remainingInput) <- trySizes input $ Map.toList rangesBySize
|
|
|
|
mappend output <$> decoder rangesBySize remainingInput
|
2020-02-08 08:15:32 +01:00
|
|
|
where
|
|
|
|
trySizes s [] = Left $ "No matching code found in font for " ++ unpack s
|
|
|
|
trySizes s ((size, cRanges):others) =
|
|
|
|
let prefix = BS.take size s in
|
|
|
|
case tryRanges prefix cRanges of
|
|
|
|
Nothing -> trySizes s others
|
|
|
|
Just outputSequence -> Right (outputSequence, BS.drop size s)
|
|
|
|
tryRanges :: ByteString -> [CRange] -> Maybe Text
|
|
|
|
tryRanges _ [] = Nothing
|
2020-03-08 00:02:24 +01:00
|
|
|
tryRanges prefix ((CRange {toUnicode}):cRanges) =
|
|
|
|
case Map.lookup prefix toUnicode of
|
2020-02-08 08:15:32 +01:00
|
|
|
Nothing -> tryRanges prefix cRanges
|
|
|
|
outputSequence -> outputSequence
|
|
|
|
|
2020-03-08 00:02:24 +01:00
|
|
|
encoder :: Map Size FromUnicode -> Encoder
|
2020-03-08 22:14:36 +01:00
|
|
|
encoder fromUnicodes input
|
|
|
|
| Text.null input = Right ""
|
|
|
|
| otherwise =
|
|
|
|
foldl (<>) (Left "No encoding found") $ mapWithKey tryOn fromUnicodes
|
|
|
|
where
|
|
|
|
tryOn size fromUnicode =
|
|
|
|
let (prefix, end) = Text.splitAt size input in
|
|
|
|
case Map.lookup prefix fromUnicode of
|
|
|
|
Nothing -> Left ""
|
|
|
|
Just code -> (code <>) <$> encoder fromUnicodes end
|
2020-03-06 19:19:53 +01:00
|
|
|
|
2019-10-03 07:59:09 +02:00
|
|
|
matches :: ByteString -> CRange -> Bool
|
|
|
|
matches code (CRange {fromSequence, toSequence}) =
|
|
|
|
fromSequence <= code && code <= toSequence
|
|
|
|
|
2020-02-11 08:29:08 +01:00
|
|
|
cMap :: MonadFail m => ByteString -> m Font
|
|
|
|
cMap = either fail (return . toFont . snd) . runParser
|
2019-09-30 14:13:12 +02:00
|
|
|
(many (codeRanges <|> cMapRange <|> cMapChar <|> ignoredLine))
|
|
|
|
emptyCMap
|
2019-09-27 18:16:12 +02:00
|
|
|
where
|
|
|
|
ignoredLine =
|
2019-09-30 14:13:12 +02:00
|
|
|
takeAll (not . (`elem` EOL.charset)) *> EOL.parser *> return ()
|
2020-03-08 00:02:24 +01:00
|
|
|
emptyCMap = CMap {toUnicodeBySize = Map.empty, fromUnicodeBySize = Map.empty}
|
2019-09-30 14:13:12 +02:00
|
|
|
|
|
|
|
codeRanges :: Parser CMap ()
|
|
|
|
codeRanges = do
|
|
|
|
size <- integer <* line "begincodespacerange"
|
2019-10-03 14:59:06 +02:00
|
|
|
mapM_ createMapping =<< count size codeRange
|
2019-09-30 14:13:12 +02:00
|
|
|
line "endcodespacerange"
|
|
|
|
where
|
|
|
|
codeRange =
|
2019-10-03 14:59:06 +02:00
|
|
|
(,) <$> stringObject <* blank <*> stringObject <* EOL.parser
|
2019-09-30 14:13:12 +02:00
|
|
|
|
|
|
|
createMapping :: (StringObject, StringObject) -> Parser CMap ()
|
2020-03-08 00:02:24 +01:00
|
|
|
createMapping (Hexadecimal from, Hexadecimal to) = modify $ \cmap -> cmap {
|
|
|
|
toUnicodeBySize = Map.insertWith (++) size [cRange] (toUnicodeBySize cmap)
|
|
|
|
}
|
2019-09-30 14:13:12 +02:00
|
|
|
where
|
2019-10-04 18:46:07 +02:00
|
|
|
fromSequence = b16ToBytes from
|
2019-09-30 14:13:12 +02:00
|
|
|
size = BS.length fromSequence
|
2019-10-04 18:46:07 +02:00
|
|
|
toSequence = b16ToBytes to
|
2020-03-08 00:02:24 +01:00
|
|
|
cRange = CRange {fromSequence, toSequence, toUnicode = Map.empty}
|
2019-09-30 14:13:12 +02:00
|
|
|
createMapping _ = return ()
|
2019-09-27 18:16:12 +02:00
|
|
|
|
2019-09-30 14:13:12 +02:00
|
|
|
cMapRange :: Parser CMap ()
|
2019-09-27 18:16:12 +02:00
|
|
|
cMapRange = do
|
|
|
|
size <- integer <* line "beginbfrange"
|
2019-10-03 14:59:06 +02:00
|
|
|
mapM_ saveMapping =<< count size rangeMapping
|
|
|
|
line "endbfrange"
|
2019-09-27 18:16:12 +02:00
|
|
|
where
|
2020-03-08 00:02:24 +01:00
|
|
|
rangeMapping = do
|
|
|
|
from <- (stringObject <* blank)
|
|
|
|
to <- (stringObject <* blank)
|
|
|
|
mapFromTo from to =<< directObject <* EOL.parser
|
2019-09-27 18:16:12 +02:00
|
|
|
|
2020-02-08 08:15:32 +01:00
|
|
|
saveMapping :: [(ByteString, Text)] -> Parser CMap ()
|
2020-03-08 00:02:24 +01:00
|
|
|
saveMapping assoc =
|
|
|
|
modify $ \(CMap {toUnicodeBySize, fromUnicodeBySize}) -> CMap {
|
|
|
|
toUnicodeBySize = saveToUnicodeBySize assoc toUnicodeBySize
|
|
|
|
, fromUnicodeBySize = saveFromUnicodeBySize reversed fromUnicodeBySize
|
|
|
|
}
|
|
|
|
where
|
|
|
|
reversed = (\(a, b) -> (b, a)) <$> assoc
|
|
|
|
|
|
|
|
saveToUnicodeBySize :: [(ByteString, Text)] -> Map Size [CRange] -> Map Size [CRange]
|
|
|
|
saveToUnicodeBySize [] = id
|
|
|
|
saveToUnicodeBySize assoc@((code, _):_) = Map.adjust insertCRange (BS.length code)
|
2019-09-30 14:13:12 +02:00
|
|
|
where
|
|
|
|
newMapping = Map.fromList assoc
|
|
|
|
appendMapping cRange =
|
2020-03-08 00:02:24 +01:00
|
|
|
cRange {toUnicode = toUnicode cRange `union` newMapping}
|
2019-09-30 14:13:12 +02:00
|
|
|
insertCRange = fmap (\cRange ->
|
2019-10-03 07:59:09 +02:00
|
|
|
if code `matches` cRange then appendMapping cRange else cRange
|
2019-09-30 14:13:12 +02:00
|
|
|
)
|
|
|
|
|
2020-03-08 00:02:24 +01:00
|
|
|
saveFromUnicodeBySize :: [(Text, ByteString)] -> Map Size FromUnicode -> Map Size FromUnicode
|
|
|
|
saveFromUnicodeBySize = flip (foldl insertFromUnicode)
|
|
|
|
where
|
|
|
|
insertFromUnicode :: Map Size FromUnicode -> (Text, ByteString) -> Map Size FromUnicode
|
|
|
|
insertFromUnicode tmpFromUnicodeBySize (unicodeSequence, code) =
|
|
|
|
let size = Text.length unicodeSequence in
|
|
|
|
Map.adjust (Map.insert unicodeSequence code) size tmpFromUnicodeBySize
|
|
|
|
|
2019-09-30 14:13:12 +02:00
|
|
|
cMapChar :: Parser CMap ()
|
2019-09-27 18:16:12 +02:00
|
|
|
cMapChar = do
|
|
|
|
size <- integer <* line "beginbfchar"
|
2019-09-30 14:13:12 +02:00
|
|
|
saveMapping =<< count size charMapping <* line "endbfchar"
|
2019-09-27 18:16:12 +02:00
|
|
|
where
|
2020-03-08 00:02:24 +01:00
|
|
|
charMapping = do
|
|
|
|
from <- stringObject <* blank
|
|
|
|
pairMapping from =<< stringObject <* EOL.parser
|
2019-09-27 18:16:12 +02:00
|
|
|
|
2019-10-04 18:46:07 +02:00
|
|
|
between :: B16Int -> B16Int -> [ByteString]
|
|
|
|
between from@(B16Int s) to =
|
|
|
|
let size = BS.length s `div` 2 in
|
|
|
|
toBytes size <$> [b16ToInt from .. b16ToInt to]
|
2019-09-30 14:13:12 +02:00
|
|
|
|
2019-10-04 18:46:07 +02:00
|
|
|
startFrom :: B16Int -> [ByteString]
|
|
|
|
startFrom from@(B16Int s) =
|
|
|
|
let size = BS.length s `div` 2 in
|
|
|
|
toBytes size <$> [b16ToInt from .. ]
|
2019-09-30 14:13:12 +02:00
|
|
|
|
2020-03-08 00:02:24 +01:00
|
|
|
mapFromTo :: MonadParser m => StringObject -> StringObject -> DirectObject -> m [(ByteString, Text)]
|
|
|
|
mapFromTo (Hexadecimal from) (Hexadecimal to) (StringObject (Hexadecimal dstFrom)) =
|
2019-10-04 18:46:07 +02:00
|
|
|
return $ zip (between from to) (utf16BEToutf8 <$> startFrom dstFrom)
|
2019-09-27 18:16:12 +02:00
|
|
|
|
2020-03-08 00:02:24 +01:00
|
|
|
mapFromTo (Hexadecimal from) (Hexadecimal to) (Array dstPoints) =
|
2019-10-04 18:46:07 +02:00
|
|
|
zip (between from to) <$> (mapM dstByteString dstPoints)
|
2019-09-27 18:16:12 +02:00
|
|
|
where
|
2019-09-30 14:13:12 +02:00
|
|
|
dstByteString (StringObject (Hexadecimal dst)) =
|
2019-10-04 18:46:07 +02:00
|
|
|
return . utf16BEToutf8 $ b16ToBytes dst
|
2019-09-30 14:13:12 +02:00
|
|
|
dstByteString _ = fail "Invalid for a replacement string"
|
2019-09-27 18:16:12 +02:00
|
|
|
|
2020-03-08 00:02:24 +01:00
|
|
|
mapFromTo _ _ _ = fail "invalid range mapping found"
|
2019-09-27 18:16:12 +02:00
|
|
|
|
2020-03-08 00:02:24 +01:00
|
|
|
pairMapping :: MonadParser m => StringObject -> StringObject -> m (ByteString, Text)
|
|
|
|
pairMapping (Hexadecimal from) (Hexadecimal to) =
|
2019-10-04 18:46:07 +02:00
|
|
|
return (b16ToBytes from, utf16BEToutf8 $ b16ToBytes to)
|
2020-03-08 00:02:24 +01:00
|
|
|
pairMapping _ _ = fail "invalid pair mapping found"
|