In complete debug, more or less implemented CMap parsing but apparently it uses UTF16 ?!
This commit is contained in:
parent
c349d9b4c2
commit
98d029c4d4
|
@ -1,12 +1,12 @@
|
||||||
{-# LANGUAGE OverloadedStrings #-}
|
{-# LANGUAGE OverloadedStrings #-}
|
||||||
module PDF.Text (
|
module PDF.Text {-(
|
||||||
CMap
|
CMap
|
||||||
, CMappers
|
, CMappers
|
||||||
, PageContents(..)
|
, PageContents(..)
|
||||||
, cMap
|
, cMap
|
||||||
, emptyCMap
|
, emptyCMap
|
||||||
, pageContents
|
, pageContents
|
||||||
) where
|
)-} where
|
||||||
|
|
||||||
import Control.Applicative ((<|>), many)
|
import Control.Applicative ((<|>), many)
|
||||||
import Control.Monad (foldM, join)
|
import Control.Monad (foldM, join)
|
||||||
|
@ -14,8 +14,8 @@ import Control.Monad.Reader (ReaderT, runReaderT, asks)
|
||||||
import Control.Monad.State (put)
|
import Control.Monad.State (put)
|
||||||
import Data.Attoparsec.ByteString.Char8 (choice, count, parseOnly, sepBy)
|
import Data.Attoparsec.ByteString.Char8 (choice, count, parseOnly, sepBy)
|
||||||
import qualified Data.Attoparsec.ByteString.Char8 as Atto (Parser)
|
import qualified Data.Attoparsec.ByteString.Char8 as Atto (Parser)
|
||||||
import Data.ByteString.Char8 (ByteString, pack)
|
import Data.ByteString (ByteString, pack)
|
||||||
import qualified Data.ByteString.UTF8 as UTF8 (toString)
|
import qualified Data.ByteString.UTF8 as UTF8 (fromString, toString)
|
||||||
import Data.Map (Map, (!))
|
import Data.Map (Map, (!))
|
||||||
import qualified Data.Map as Map (empty, fromList)
|
import qualified Data.Map as Map (empty, fromList)
|
||||||
import PDF.Object (
|
import PDF.Object (
|
||||||
|
@ -26,7 +26,7 @@ import qualified PDF.EOL as EOL (charset, parser)
|
||||||
import PDF.Parser (Parser, evalParser, string, takeAll)
|
import PDF.Parser (Parser, evalParser, string, takeAll)
|
||||||
|
|
||||||
type CMappers = Map Name CMap
|
type CMappers = Map Name CMap
|
||||||
type CMap = Map Int ByteString
|
type CMap = Map Int String
|
||||||
|
|
||||||
emptyCMap :: CMap
|
emptyCMap :: CMap
|
||||||
emptyCMap = Map.empty
|
emptyCMap = Map.empty
|
||||||
|
@ -40,42 +40,64 @@ cMap = parseOnly $ mconcat <$> many (cMapRange <|> cMapChar <|> ignoredLine)
|
||||||
cMapRange :: Atto.Parser CMap
|
cMapRange :: Atto.Parser CMap
|
||||||
cMapRange = do
|
cMapRange = do
|
||||||
size <- integer <* line "beginbfrange"
|
size <- integer <* line "beginbfrange"
|
||||||
mconcat <$> count size rangeMapping <* line "endbfrange"
|
mconcat <$> count size (Map.fromList <$> rangeMapping) <* line "endbfrange"
|
||||||
where
|
where
|
||||||
rangeMapping = mapFromTo
|
rangeMapping = (,,)
|
||||||
<$> (stringObject <* blank)
|
<$> (stringObject <* blank)
|
||||||
<*> (stringObject <* blank)
|
<*> (stringObject <* blank)
|
||||||
<*> directObject <* EOL.parser
|
<*> directObject <* EOL.parser
|
||||||
|
>>= mapFromTo
|
||||||
|
|
||||||
cMapChar :: Atto.Parser CMap
|
cMapChar :: Atto.Parser CMap
|
||||||
cMapChar = do
|
cMapChar = do
|
||||||
size <- integer <* line "beginbfchar"
|
size <- integer <* line "beginbfchar"
|
||||||
Map.fromList <$> count size charMapping <* line "endbfchar"
|
Map.fromList <$> count size charMapping <* line "endbfchar"
|
||||||
where
|
where
|
||||||
charMapping = pairMapping
|
charMapping =
|
||||||
<$> (stringObject <* blank) <*> (stringObject <* blank) <* EOL.parser
|
(,) <$> (stringObject <* blank) <*> (stringObject <* blank) <* EOL.parser
|
||||||
|
>>= pairMapping
|
||||||
|
|
||||||
mapFromTo (Hexadecimal from) (Hexadecimal to) (StringObject (Hexadecimal dstFrom)) =
|
mapFromTo :: (StringObject, StringObject, DirectObject) -> Atto.Parser [(Int, String)]
|
||||||
let dstFrom
|
mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) =
|
||||||
mapFromTo (Hexadecimal from) (Hexadecimal to) (Array dstPoints) = undefined
|
let dstString = utf8Decode dstFrom in
|
||||||
mapFromTo _ _ _ = fail "invalid range mapping found"
|
return $ zip [hexString from .. hexString to] (stringsFrom dstString)
|
||||||
|
where
|
||||||
|
stringsFrom [] = [[]]
|
||||||
|
stringsFrom [x] = (:[]) <$> [x..]
|
||||||
|
stringsFrom (x:xs) = (x:) <$> stringsFrom xs
|
||||||
|
|
||||||
pairMapping :: StringObject -> StringObject -> (Int, ByteString)
|
mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) =
|
||||||
pairMapping (Hexadecimal from) (Hexadecimal to) =
|
zip [hexString from .. hexString to] <$> (mapM dstString dstPoints)
|
||||||
(hexString from, toByteString to)
|
where
|
||||||
pairMapping = fail "invalid pair mapping found"
|
dstString (StringObject (Hexadecimal dstPoint)) = return $ utf8Decode dstPoint
|
||||||
|
dstString _ = fail "Invalid for a replacement string"
|
||||||
|
|
||||||
hexString :: Num a => String -> a
|
mapFromTo _ = fail "invalid range mapping found"
|
||||||
|
|
||||||
|
pairMapping :: (StringObject, StringObject) -> Atto.Parser (Int, String)
|
||||||
|
pairMapping (Hexadecimal from, Hexadecimal to) =
|
||||||
|
return (hexString from, utf8Decode to)
|
||||||
|
pairMapping _ = fail "invalid pair mapping found"
|
||||||
|
|
||||||
|
hexString :: (Num a, Read a) => String -> a
|
||||||
hexString s = read $ "0x" ++ s
|
hexString s = read $ "0x" ++ s
|
||||||
|
|
||||||
toByteString :: String -> [Word8]
|
pairDigits :: String -> [String]
|
||||||
toByteString = pack . toBaseWord8 [] . hexString
|
pairDigits "" = []
|
||||||
|
pairDigits [c] = [[c]]
|
||||||
|
pairDigits (a:b:end) = (a:[b]):pairDigits end
|
||||||
|
|
||||||
|
toBytes :: String -> ByteString
|
||||||
|
toBytes = pack . fmap hexString . pairDigits
|
||||||
|
|
||||||
|
utf8Decode :: String -> String
|
||||||
|
utf8Decode = UTF8.toString . pack . toBaseWord8 [] . (hexString :: String -> Integer)
|
||||||
where
|
where
|
||||||
toBaseWord8 digits n
|
toBaseWord8 digits n
|
||||||
| n < 0xff = (fromIntegral n):digits
|
| n < 0xff = (fromIntegral n):digits
|
||||||
| otherwise =
|
| otherwise =
|
||||||
let newDigits = (fromIntegral (n `mod` 0xff)):digits in
|
let newDigits = (fromIntegral (n `mod` 0x100)):digits in
|
||||||
case n `div` 0xff of
|
case n `div` 0x100 of
|
||||||
0 -> newDigits
|
0 -> newDigits
|
||||||
k -> toBaseWord8 newDigits k
|
k -> toBaseWord8 newDigits k
|
||||||
|
|
||||||
|
@ -177,4 +199,8 @@ runOperator (DQuote, [Typed (StringObject outputString)]) =
|
||||||
runOperator _ = return []
|
runOperator _ = return []
|
||||||
|
|
||||||
decodeString :: StringObject -> ParserWithFont ByteString
|
decodeString :: StringObject -> ParserWithFont ByteString
|
||||||
decodeString = undefined
|
decodeString (Hexadecimal h) = decodeString (Literal (utf8Decode h))
|
||||||
|
decodeString (Literal s) =
|
||||||
|
undefined
|
||||||
|
where
|
||||||
|
bytes = UTF8.fromString s
|
||||||
|
|
Loading…
Reference in New Issue