In complete debug, more or less implemented CMap parsing but apparently it uses UTF16 ?!
This commit is contained in:
parent
c349d9b4c2
commit
98d029c4d4
1 changed files with 49 additions and 23 deletions
|
@ -1,12 +1,12 @@
|
|||
{-# LANGUAGE OverloadedStrings #-}
|
||||
module PDF.Text (
|
||||
module PDF.Text {-(
|
||||
CMap
|
||||
, CMappers
|
||||
, PageContents(..)
|
||||
, cMap
|
||||
, emptyCMap
|
||||
, pageContents
|
||||
) where
|
||||
)-} where
|
||||
|
||||
import Control.Applicative ((<|>), many)
|
||||
import Control.Monad (foldM, join)
|
||||
|
@ -14,8 +14,8 @@ import Control.Monad.Reader (ReaderT, runReaderT, asks)
|
|||
import Control.Monad.State (put)
|
||||
import Data.Attoparsec.ByteString.Char8 (choice, count, parseOnly, sepBy)
|
||||
import qualified Data.Attoparsec.ByteString.Char8 as Atto (Parser)
|
||||
import Data.ByteString.Char8 (ByteString, pack)
|
||||
import qualified Data.ByteString.UTF8 as UTF8 (toString)
|
||||
import Data.ByteString (ByteString, pack)
|
||||
import qualified Data.ByteString.UTF8 as UTF8 (fromString, toString)
|
||||
import Data.Map (Map, (!))
|
||||
import qualified Data.Map as Map (empty, fromList)
|
||||
import PDF.Object (
|
||||
|
@ -26,7 +26,7 @@ import qualified PDF.EOL as EOL (charset, parser)
|
|||
import PDF.Parser (Parser, evalParser, string, takeAll)
|
||||
|
||||
type CMappers = Map Name CMap
|
||||
type CMap = Map Int ByteString
|
||||
type CMap = Map Int String
|
||||
|
||||
emptyCMap :: CMap
|
||||
emptyCMap = Map.empty
|
||||
|
@ -40,42 +40,64 @@ cMap = parseOnly $ mconcat <$> many (cMapRange <|> cMapChar <|> ignoredLine)
|
|||
cMapRange :: Atto.Parser CMap
|
||||
cMapRange = do
|
||||
size <- integer <* line "beginbfrange"
|
||||
mconcat <$> count size rangeMapping <* line "endbfrange"
|
||||
mconcat <$> count size (Map.fromList <$> rangeMapping) <* line "endbfrange"
|
||||
where
|
||||
rangeMapping = mapFromTo
|
||||
rangeMapping = (,,)
|
||||
<$> (stringObject <* blank)
|
||||
<*> (stringObject <* blank)
|
||||
<*> directObject <* EOL.parser
|
||||
>>= mapFromTo
|
||||
|
||||
cMapChar :: Atto.Parser CMap
|
||||
cMapChar = do
|
||||
size <- integer <* line "beginbfchar"
|
||||
Map.fromList <$> count size charMapping <* line "endbfchar"
|
||||
where
|
||||
charMapping = pairMapping
|
||||
<$> (stringObject <* blank) <*> (stringObject <* blank) <* EOL.parser
|
||||
charMapping =
|
||||
(,) <$> (stringObject <* blank) <*> (stringObject <* blank) <* EOL.parser
|
||||
>>= pairMapping
|
||||
|
||||
mapFromTo (Hexadecimal from) (Hexadecimal to) (StringObject (Hexadecimal dstFrom)) =
|
||||
let dstFrom
|
||||
mapFromTo (Hexadecimal from) (Hexadecimal to) (Array dstPoints) = undefined
|
||||
mapFromTo _ _ _ = fail "invalid range mapping found"
|
||||
mapFromTo :: (StringObject, StringObject, DirectObject) -> Atto.Parser [(Int, String)]
|
||||
mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) =
|
||||
let dstString = utf8Decode dstFrom in
|
||||
return $ zip [hexString from .. hexString to] (stringsFrom dstString)
|
||||
where
|
||||
stringsFrom [] = [[]]
|
||||
stringsFrom [x] = (:[]) <$> [x..]
|
||||
stringsFrom (x:xs) = (x:) <$> stringsFrom xs
|
||||
|
||||
pairMapping :: StringObject -> StringObject -> (Int, ByteString)
|
||||
pairMapping (Hexadecimal from) (Hexadecimal to) =
|
||||
(hexString from, toByteString to)
|
||||
pairMapping = fail "invalid pair mapping found"
|
||||
mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) =
|
||||
zip [hexString from .. hexString to] <$> (mapM dstString dstPoints)
|
||||
where
|
||||
dstString (StringObject (Hexadecimal dstPoint)) = return $ utf8Decode dstPoint
|
||||
dstString _ = fail "Invalid for a replacement string"
|
||||
|
||||
hexString :: Num a => String -> a
|
||||
mapFromTo _ = fail "invalid range mapping found"
|
||||
|
||||
pairMapping :: (StringObject, StringObject) -> Atto.Parser (Int, String)
|
||||
pairMapping (Hexadecimal from, Hexadecimal to) =
|
||||
return (hexString from, utf8Decode to)
|
||||
pairMapping _ = fail "invalid pair mapping found"
|
||||
|
||||
hexString :: (Num a, Read a) => String -> a
|
||||
hexString s = read $ "0x" ++ s
|
||||
|
||||
toByteString :: String -> [Word8]
|
||||
toByteString = pack . toBaseWord8 [] . hexString
|
||||
pairDigits :: String -> [String]
|
||||
pairDigits "" = []
|
||||
pairDigits [c] = [[c]]
|
||||
pairDigits (a:b:end) = (a:[b]):pairDigits end
|
||||
|
||||
toBytes :: String -> ByteString
|
||||
toBytes = pack . fmap hexString . pairDigits
|
||||
|
||||
utf8Decode :: String -> String
|
||||
utf8Decode = UTF8.toString . pack . toBaseWord8 [] . (hexString :: String -> Integer)
|
||||
where
|
||||
toBaseWord8 digits n
|
||||
| n < 0xff = (fromIntegral n):digits
|
||||
| otherwise =
|
||||
let newDigits = (fromIntegral (n `mod` 0xff)):digits in
|
||||
case n `div` 0xff of
|
||||
let newDigits = (fromIntegral (n `mod` 0x100)):digits in
|
||||
case n `div` 0x100 of
|
||||
0 -> newDigits
|
||||
k -> toBaseWord8 newDigits k
|
||||
|
||||
|
@ -177,4 +199,8 @@ runOperator (DQuote, [Typed (StringObject outputString)]) =
|
|||
runOperator _ = return []
|
||||
|
||||
decodeString :: StringObject -> ParserWithFont ByteString
|
||||
decodeString = undefined
|
||||
decodeString (Hexadecimal h) = decodeString (Literal (utf8Decode h))
|
||||
decodeString (Literal s) =
|
||||
undefined
|
||||
where
|
||||
bytes = UTF8.fromString s
|
||||
|
|
Loading…
Reference in a new issue