In complete debug, more or less implemented CMap parsing but apparently it uses UTF16 ?!

This commit is contained in:
Tissevert 2019-09-26 15:51:41 +02:00
parent c349d9b4c2
commit 98d029c4d4
1 changed files with 49 additions and 23 deletions

View File

@ -1,12 +1,12 @@
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
module PDF.Text ( module PDF.Text {-(
CMap CMap
, CMappers , CMappers
, PageContents(..) , PageContents(..)
, cMap , cMap
, emptyCMap , emptyCMap
, pageContents , pageContents
) where )-} where
import Control.Applicative ((<|>), many) import Control.Applicative ((<|>), many)
import Control.Monad (foldM, join) import Control.Monad (foldM, join)
@ -14,8 +14,8 @@ import Control.Monad.Reader (ReaderT, runReaderT, asks)
import Control.Monad.State (put) import Control.Monad.State (put)
import Data.Attoparsec.ByteString.Char8 (choice, count, parseOnly, sepBy) import Data.Attoparsec.ByteString.Char8 (choice, count, parseOnly, sepBy)
import qualified Data.Attoparsec.ByteString.Char8 as Atto (Parser) import qualified Data.Attoparsec.ByteString.Char8 as Atto (Parser)
import Data.ByteString.Char8 (ByteString, pack) import Data.ByteString (ByteString, pack)
import qualified Data.ByteString.UTF8 as UTF8 (toString) import qualified Data.ByteString.UTF8 as UTF8 (fromString, toString)
import Data.Map (Map, (!)) import Data.Map (Map, (!))
import qualified Data.Map as Map (empty, fromList) import qualified Data.Map as Map (empty, fromList)
import PDF.Object ( import PDF.Object (
@ -26,7 +26,7 @@ import qualified PDF.EOL as EOL (charset, parser)
import PDF.Parser (Parser, evalParser, string, takeAll) import PDF.Parser (Parser, evalParser, string, takeAll)
type CMappers = Map Name CMap type CMappers = Map Name CMap
type CMap = Map Int ByteString type CMap = Map Int String
emptyCMap :: CMap emptyCMap :: CMap
emptyCMap = Map.empty emptyCMap = Map.empty
@ -40,42 +40,64 @@ cMap = parseOnly $ mconcat <$> many (cMapRange <|> cMapChar <|> ignoredLine)
cMapRange :: Atto.Parser CMap cMapRange :: Atto.Parser CMap
cMapRange = do cMapRange = do
size <- integer <* line "beginbfrange" size <- integer <* line "beginbfrange"
mconcat <$> count size rangeMapping <* line "endbfrange" mconcat <$> count size (Map.fromList <$> rangeMapping) <* line "endbfrange"
where where
rangeMapping = mapFromTo rangeMapping = (,,)
<$> (stringObject <* blank) <$> (stringObject <* blank)
<*> (stringObject <* blank) <*> (stringObject <* blank)
<*> directObject <* EOL.parser <*> directObject <* EOL.parser
>>= mapFromTo
cMapChar :: Atto.Parser CMap cMapChar :: Atto.Parser CMap
cMapChar = do cMapChar = do
size <- integer <* line "beginbfchar" size <- integer <* line "beginbfchar"
Map.fromList <$> count size charMapping <* line "endbfchar" Map.fromList <$> count size charMapping <* line "endbfchar"
where where
charMapping = pairMapping charMapping =
<$> (stringObject <* blank) <*> (stringObject <* blank) <* EOL.parser (,) <$> (stringObject <* blank) <*> (stringObject <* blank) <* EOL.parser
>>= pairMapping
mapFromTo (Hexadecimal from) (Hexadecimal to) (StringObject (Hexadecimal dstFrom)) = mapFromTo :: (StringObject, StringObject, DirectObject) -> Atto.Parser [(Int, String)]
let dstFrom mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) =
mapFromTo (Hexadecimal from) (Hexadecimal to) (Array dstPoints) = undefined let dstString = utf8Decode dstFrom in
mapFromTo _ _ _ = fail "invalid range mapping found" return $ zip [hexString from .. hexString to] (stringsFrom dstString)
where
stringsFrom [] = [[]]
stringsFrom [x] = (:[]) <$> [x..]
stringsFrom (x:xs) = (x:) <$> stringsFrom xs
pairMapping :: StringObject -> StringObject -> (Int, ByteString) mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) =
pairMapping (Hexadecimal from) (Hexadecimal to) = zip [hexString from .. hexString to] <$> (mapM dstString dstPoints)
(hexString from, toByteString to) where
pairMapping = fail "invalid pair mapping found" dstString (StringObject (Hexadecimal dstPoint)) = return $ utf8Decode dstPoint
dstString _ = fail "Invalid for a replacement string"
hexString :: Num a => String -> a mapFromTo _ = fail "invalid range mapping found"
pairMapping :: (StringObject, StringObject) -> Atto.Parser (Int, String)
pairMapping (Hexadecimal from, Hexadecimal to) =
return (hexString from, utf8Decode to)
pairMapping _ = fail "invalid pair mapping found"
hexString :: (Num a, Read a) => String -> a
hexString s = read $ "0x" ++ s hexString s = read $ "0x" ++ s
toByteString :: String -> [Word8] pairDigits :: String -> [String]
toByteString = pack . toBaseWord8 [] . hexString pairDigits "" = []
pairDigits [c] = [[c]]
pairDigits (a:b:end) = (a:[b]):pairDigits end
toBytes :: String -> ByteString
toBytes = pack . fmap hexString . pairDigits
utf8Decode :: String -> String
utf8Decode = UTF8.toString . pack . toBaseWord8 [] . (hexString :: String -> Integer)
where where
toBaseWord8 digits n toBaseWord8 digits n
| n < 0xff = (fromIntegral n):digits | n < 0xff = (fromIntegral n):digits
| otherwise = | otherwise =
let newDigits = (fromIntegral (n `mod` 0xff)):digits in let newDigits = (fromIntegral (n `mod` 0x100)):digits in
case n `div` 0xff of case n `div` 0x100 of
0 -> newDigits 0 -> newDigits
k -> toBaseWord8 newDigits k k -> toBaseWord8 newDigits k
@ -177,4 +199,8 @@ runOperator (DQuote, [Typed (StringObject outputString)]) =
runOperator _ = return [] runOperator _ = return []
decodeString :: StringObject -> ParserWithFont ByteString decodeString :: StringObject -> ParserWithFont ByteString
decodeString = undefined decodeString (Hexadecimal h) = decodeString (Literal (utf8Decode h))
decodeString (Literal s) =
undefined
where
bytes = UTF8.fromString s