In complete debug, more or less implemented CMap parsing but apparently it uses UTF16 ?!

This commit is contained in:
Tissevert 2019-09-26 15:51:41 +02:00
parent c349d9b4c2
commit 98d029c4d4
1 changed files with 49 additions and 23 deletions

View File

@ -1,12 +1,12 @@
{-# LANGUAGE OverloadedStrings #-}
module PDF.Text (
module PDF.Text {-(
CMap
, CMappers
, PageContents(..)
, cMap
, emptyCMap
, pageContents
) where
)-} where
import Control.Applicative ((<|>), many)
import Control.Monad (foldM, join)
@ -14,8 +14,8 @@ import Control.Monad.Reader (ReaderT, runReaderT, asks)
import Control.Monad.State (put)
import Data.Attoparsec.ByteString.Char8 (choice, count, parseOnly, sepBy)
import qualified Data.Attoparsec.ByteString.Char8 as Atto (Parser)
import Data.ByteString.Char8 (ByteString, pack)
import qualified Data.ByteString.UTF8 as UTF8 (toString)
import Data.ByteString (ByteString, pack)
import qualified Data.ByteString.UTF8 as UTF8 (fromString, toString)
import Data.Map (Map, (!))
import qualified Data.Map as Map (empty, fromList)
import PDF.Object (
@ -26,7 +26,7 @@ import qualified PDF.EOL as EOL (charset, parser)
import PDF.Parser (Parser, evalParser, string, takeAll)
type CMappers = Map Name CMap
type CMap = Map Int ByteString
type CMap = Map Int String
emptyCMap :: CMap
emptyCMap = Map.empty
@ -40,42 +40,64 @@ cMap = parseOnly $ mconcat <$> many (cMapRange <|> cMapChar <|> ignoredLine)
cMapRange :: Atto.Parser CMap
cMapRange = do
size <- integer <* line "beginbfrange"
mconcat <$> count size rangeMapping <* line "endbfrange"
mconcat <$> count size (Map.fromList <$> rangeMapping) <* line "endbfrange"
where
rangeMapping = mapFromTo
rangeMapping = (,,)
<$> (stringObject <* blank)
<*> (stringObject <* blank)
<*> directObject <* EOL.parser
>>= mapFromTo
cMapChar :: Atto.Parser CMap
cMapChar = do
size <- integer <* line "beginbfchar"
Map.fromList <$> count size charMapping <* line "endbfchar"
where
charMapping = pairMapping
<$> (stringObject <* blank) <*> (stringObject <* blank) <* EOL.parser
charMapping =
(,) <$> (stringObject <* blank) <*> (stringObject <* blank) <* EOL.parser
>>= pairMapping
mapFromTo (Hexadecimal from) (Hexadecimal to) (StringObject (Hexadecimal dstFrom)) =
let dstFrom
mapFromTo (Hexadecimal from) (Hexadecimal to) (Array dstPoints) = undefined
mapFromTo _ _ _ = fail "invalid range mapping found"
mapFromTo :: (StringObject, StringObject, DirectObject) -> Atto.Parser [(Int, String)]
mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) =
let dstString = utf8Decode dstFrom in
return $ zip [hexString from .. hexString to] (stringsFrom dstString)
where
stringsFrom [] = [[]]
stringsFrom [x] = (:[]) <$> [x..]
stringsFrom (x:xs) = (x:) <$> stringsFrom xs
pairMapping :: StringObject -> StringObject -> (Int, ByteString)
pairMapping (Hexadecimal from) (Hexadecimal to) =
(hexString from, toByteString to)
pairMapping = fail "invalid pair mapping found"
mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) =
zip [hexString from .. hexString to] <$> (mapM dstString dstPoints)
where
dstString (StringObject (Hexadecimal dstPoint)) = return $ utf8Decode dstPoint
dstString _ = fail "Invalid for a replacement string"
hexString :: Num a => String -> a
mapFromTo _ = fail "invalid range mapping found"
pairMapping :: (StringObject, StringObject) -> Atto.Parser (Int, String)
pairMapping (Hexadecimal from, Hexadecimal to) =
return (hexString from, utf8Decode to)
pairMapping _ = fail "invalid pair mapping found"
hexString :: (Num a, Read a) => String -> a
hexString s = read $ "0x" ++ s
toByteString :: String -> [Word8]
toByteString = pack . toBaseWord8 [] . hexString
pairDigits :: String -> [String]
pairDigits "" = []
pairDigits [c] = [[c]]
pairDigits (a:b:end) = (a:[b]):pairDigits end
toBytes :: String -> ByteString
toBytes = pack . fmap hexString . pairDigits
utf8Decode :: String -> String
utf8Decode = UTF8.toString . pack . toBaseWord8 [] . (hexString :: String -> Integer)
where
toBaseWord8 digits n
| n < 0xff = (fromIntegral n):digits
| otherwise =
let newDigits = (fromIntegral (n `mod` 0xff)):digits in
case n `div` 0xff of
let newDigits = (fromIntegral (n `mod` 0x100)):digits in
case n `div` 0x100 of
0 -> newDigits
k -> toBaseWord8 newDigits k
@ -177,4 +199,8 @@ runOperator (DQuote, [Typed (StringObject outputString)]) =
runOperator _ = return []
decodeString :: StringObject -> ParserWithFont ByteString
decodeString = undefined
decodeString (Hexadecimal h) = decodeString (Literal (utf8Decode h))
decodeString (Literal s) =
undefined
where
bytes = UTF8.fromString s