Hufflepdf/src/PDF/CMap.hs

197 lines
7.0 KiB
Haskell

{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE FlexibleContexts #-}
module PDF.CMap (
CMap
, CMappers
, CRange(..)
, matches
, parse
) where
import Control.Applicative ((<|>), many)
import Control.Monad.Except (MonadError(..))
import Control.Monad.Fail (fail)
import Control.Monad.State (modify)
import Data.Attoparsec.ByteString.Char8 (count)
import Data.ByteString (ByteString)
import qualified Data.ByteString as BS (drop, length, null, take)
import Data.ByteString.Char8 (unpack)
import Data.ByteString.Char8.Util (
B16Int(..), b16ToBytes, b16ToInt, toBytes, utf16BEToutf8
)
import Data.Foldable (foldl', foldr')
import Data.Map (Map, mapWithKey, union)
import qualified Data.Map as Map (
adjust, empty, fromList, insert, insertWith, lookup, toList
)
import Data.Text (Text)
import qualified Data.Text as Text (length, null, splitAt)
import qualified PDF.EOL as EOL (charset, parser)
import PDF.Font (Decoder, Encoder, Font(..))
import PDF.Object (
DirectObject(..), Name, StringObject(..)
, blank, directObject, integer, line, stringObject
)
import PDF.Parser (MonadParser, Parser, runParser, takeAll)
import Prelude hiding (fail)
type CMappers = Map Name CMap
type ToUnicode = Map ByteString Text
type FromUnicode = Map Text ByteString
data CRange = CRange {
fromSequence :: ByteString
, toSequence :: ByteString
, toUnicode :: ToUnicode
} deriving Show
type Size = Int
data CMap = CMap {
toUnicodeBySize :: Map Size [CRange]
, fromUnicodeBySize :: Map Size FromUnicode
}
toFont :: CMap -> Font
toFont (CMap {toUnicodeBySize, fromUnicodeBySize}) =
Font {decode = decoder toUnicodeBySize, encode = encoder fromUnicodeBySize}
decoder :: Map Size [CRange] -> Decoder
decoder rangesBySize input
| BS.null input = Right ""
| otherwise = do
(output, remainingInput) <- trySizes input $ Map.toList rangesBySize
mappend output <$> decoder rangesBySize remainingInput
where
trySizes s [] = Left $ "No matching code found in font for " ++ unpack s
trySizes s ((size, cRanges):others) =
let prefix = BS.take size s in
case tryRanges prefix cRanges of
Nothing -> trySizes s others
Just outputSequence -> Right (outputSequence, BS.drop size s)
tryRanges :: ByteString -> [CRange] -> Maybe Text
tryRanges _ [] = Nothing
tryRanges prefix ((CRange {toUnicode}):cRanges) =
case Map.lookup prefix toUnicode of
Nothing -> tryRanges prefix cRanges
outputSequence -> outputSequence
encoder :: Map Size FromUnicode -> Encoder
encoder fromUnicodes input
| Text.null input = Right ""
| otherwise =
foldr' (<>) (Left "No encoding found") $ mapWithKey tryOn fromUnicodes
where
tryOn size fromUnicode =
let (prefix, end) = Text.splitAt size input in
case Map.lookup prefix fromUnicode of
Nothing -> Left ""
Just code -> (code <>) <$> encoder fromUnicodes end
matches :: ByteString -> CRange -> Bool
matches code (CRange {fromSequence, toSequence}) =
fromSequence <= code && code <= toSequence
parse :: MonadError String m => ByteString -> m Font
parse = either throwError (return . toFont . snd) . runParser
(many (codeRanges <|> cMapRange <|> cMapChar <|> ignoredLine))
emptyCMap
where
ignoredLine =
takeAll (not . (`elem` EOL.charset)) *> EOL.parser *> return ()
emptyCMap = CMap {toUnicodeBySize = Map.empty, fromUnicodeBySize = Map.empty}
codeRanges :: Parser CMap ()
codeRanges = do
size <- integer <* line "begincodespacerange"
mapM_ createMapping =<< count size codeRange
line "endcodespacerange"
where
codeRange =
(,) <$> stringObject <* blank <*> stringObject <* EOL.parser
createMapping :: (StringObject, StringObject) -> Parser CMap ()
createMapping (Hexadecimal from, Hexadecimal to) = modify $ \cmap -> cmap {
toUnicodeBySize = Map.insertWith (++) size [cRange] (toUnicodeBySize cmap)
}
where
fromSequence = b16ToBytes from
size = BS.length fromSequence
toSequence = b16ToBytes to
cRange = CRange {fromSequence, toSequence, toUnicode = Map.empty}
createMapping _ = return ()
cMapRange :: Parser CMap ()
cMapRange = do
size <- integer <* line "beginbfrange"
mapM_ saveMapping =<< count size rangeMapping
line "endbfrange"
where
rangeMapping = do
from <- (stringObject <* blank)
to <- (stringObject <* blank)
mapFromTo from to =<< directObject <* EOL.parser
saveMapping :: [(ByteString, Text)] -> Parser CMap ()
saveMapping assoc =
modify $ \(CMap {toUnicodeBySize, fromUnicodeBySize}) -> CMap {
toUnicodeBySize = saveToUnicodeBySize assoc toUnicodeBySize
, fromUnicodeBySize = saveFromUnicodeBySize reversed fromUnicodeBySize
}
where
reversed = (\(a, b) -> (b, a)) <$> assoc
saveToUnicodeBySize :: [(ByteString, Text)] -> Map Size [CRange] -> Map Size [CRange]
saveToUnicodeBySize [] = id
saveToUnicodeBySize assoc@((code, _):_) = Map.adjust insertCRange (BS.length code)
where
newMapping = Map.fromList assoc
appendMapping cRange =
cRange {toUnicode = toUnicode cRange `union` newMapping}
insertCRange = fmap (\cRange ->
if code `matches` cRange then appendMapping cRange else cRange
)
saveFromUnicodeBySize :: [(Text, ByteString)] -> Map Size FromUnicode -> Map Size FromUnicode
saveFromUnicodeBySize = flip (foldl' insertFromUnicode)
where
insertFromUnicode :: Map Size FromUnicode -> (Text, ByteString) -> Map Size FromUnicode
insertFromUnicode tmpFromUnicodeBySize (unicodeSequence, code) =
let size = Text.length unicodeSequence in
Map.adjust (Map.insert unicodeSequence code) size tmpFromUnicodeBySize
cMapChar :: Parser CMap ()
cMapChar = do
size <- integer <* line "beginbfchar"
saveMapping =<< count size charMapping <* line "endbfchar"
where
charMapping = do
from <- stringObject <* blank
pairMapping from =<< stringObject <* EOL.parser
between :: B16Int -> B16Int -> [ByteString]
between from@(B16Int s) to =
let size = BS.length s `div` 2 in
toBytes size <$> [b16ToInt from .. b16ToInt to]
startFrom :: B16Int -> [ByteString]
startFrom from@(B16Int s) =
let size = BS.length s `div` 2 in
toBytes size <$> [b16ToInt from .. ]
mapFromTo :: MonadParser m => StringObject -> StringObject -> DirectObject -> m [(ByteString, Text)]
mapFromTo (Hexadecimal from) (Hexadecimal to) (StringObject (Hexadecimal dstFrom)) =
return $ zip (between from to) (utf16BEToutf8 <$> startFrom dstFrom)
mapFromTo (Hexadecimal from) (Hexadecimal to) (Array dstPoints) =
zip (between from to) <$> (mapM dstByteString dstPoints)
where
dstByteString (StringObject (Hexadecimal dst)) =
return . utf16BEToutf8 $ b16ToBytes dst
dstByteString _ = fail "Invalid for a replacement string"
mapFromTo _ _ _ = fail "invalid range mapping found"
pairMapping :: MonadParser m => StringObject -> StringObject -> m (ByteString, Text)
pairMapping (Hexadecimal from) (Hexadecimal to) =
return (b16ToBytes from, utf16BEToutf8 $ b16ToBytes to)
pairMapping _ _ = fail "invalid pair mapping found"