Going to try with Text, naturally handling UTF-16 but will still have to parse «int codes» manually from strings
This commit is contained in:
parent
98d029c4d4
commit
1dd22c3889
|
@ -31,6 +31,7 @@ library
|
|||
, bytestring
|
||||
, containers
|
||||
, mtl
|
||||
, text
|
||||
, utf8-string
|
||||
hs-source-dirs: src
|
||||
ghc-options: -Wall
|
||||
|
|
|
@ -15,9 +15,14 @@ import Control.Monad.State (put)
|
|||
import Data.Attoparsec.ByteString.Char8 (choice, count, parseOnly, sepBy)
|
||||
import qualified Data.Attoparsec.ByteString.Char8 as Atto (Parser)
|
||||
import Data.ByteString (ByteString, pack)
|
||||
import qualified Data.ByteString.UTF8 as UTF8 (fromString, toString)
|
||||
import qualified Data.ByteString as BS (head)
|
||||
import Data.ByteString.Char8 as Char8 (pack)
|
||||
import qualified Data.ByteString.UTF8 as UTF8 (fromString)
|
||||
import Data.Map (Map, (!))
|
||||
import qualified Data.Map as Map (empty, fromList)
|
||||
import Data.Text (Text, snoc)
|
||||
import qualified Data.Text as Text (init, last, null, unpack)
|
||||
import Data.Text.Encoding (decodeUtf16BE)
|
||||
import PDF.Object (
|
||||
DirectObject(..), Name, StringObject(..)
|
||||
, array, blank, directObject, integer, line, name, regular, stringObject
|
||||
|
@ -26,7 +31,7 @@ import qualified PDF.EOL as EOL (charset, parser)
|
|||
import PDF.Parser (Parser, evalParser, string, takeAll)
|
||||
|
||||
type CMappers = Map Name CMap
|
||||
type CMap = Map Int String
|
||||
type CMap = Map Int Text
|
||||
|
||||
emptyCMap :: CMap
|
||||
emptyCMap = Map.empty
|
||||
|
@ -57,26 +62,26 @@ cMapChar = do
|
|||
(,) <$> (stringObject <* blank) <*> (stringObject <* blank) <* EOL.parser
|
||||
>>= pairMapping
|
||||
|
||||
mapFromTo :: (StringObject, StringObject, DirectObject) -> Atto.Parser [(Int, String)]
|
||||
mapFromTo :: (StringObject, StringObject, DirectObject) -> Atto.Parser [(Int, Text)]
|
||||
mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) =
|
||||
let dstString = utf8Decode dstFrom in
|
||||
return $ zip [hexString from .. hexString to] (stringsFrom dstString)
|
||||
let dstString = parseText dstFrom in
|
||||
return $ zip [hexString from .. hexString to] (textsFrom dstString)
|
||||
where
|
||||
stringsFrom [] = [[]]
|
||||
stringsFrom [x] = (:[]) <$> [x..]
|
||||
stringsFrom (x:xs) = (x:) <$> stringsFrom xs
|
||||
textsFrom t
|
||||
| Text.null t = [t]
|
||||
| otherwise = (Text.init t `snoc`) <$> [Text.last t ..]
|
||||
|
||||
mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) =
|
||||
zip [hexString from .. hexString to] <$> (mapM dstString dstPoints)
|
||||
where
|
||||
dstString (StringObject (Hexadecimal dstPoint)) = return $ utf8Decode dstPoint
|
||||
dstString (StringObject (Hexadecimal dstPoint)) = return $ parseText dstPoint
|
||||
dstString _ = fail "Invalid for a replacement string"
|
||||
|
||||
mapFromTo _ = fail "invalid range mapping found"
|
||||
|
||||
pairMapping :: (StringObject, StringObject) -> Atto.Parser (Int, String)
|
||||
pairMapping :: (StringObject, StringObject) -> Atto.Parser (Int, Text)
|
||||
pairMapping (Hexadecimal from, Hexadecimal to) =
|
||||
return (hexString from, utf8Decode to)
|
||||
return (hexString from, parseText to)
|
||||
pairMapping _ = fail "invalid pair mapping found"
|
||||
|
||||
hexString :: (Num a, Read a) => String -> a
|
||||
|
@ -85,21 +90,10 @@ hexString s = read $ "0x" ++ s
|
|||
pairDigits :: String -> [String]
|
||||
pairDigits "" = []
|
||||
pairDigits [c] = [[c]]
|
||||
pairDigits (a:b:end) = (a:[b]):pairDigits end
|
||||
pairDigits (firstChar:secondChar:end) = (firstChar:[secondChar]):pairDigits end
|
||||
|
||||
toBytes :: String -> ByteString
|
||||
toBytes = pack . fmap hexString . pairDigits
|
||||
|
||||
utf8Decode :: String -> String
|
||||
utf8Decode = UTF8.toString . pack . toBaseWord8 [] . (hexString :: String -> Integer)
|
||||
where
|
||||
toBaseWord8 digits n
|
||||
| n < 0xff = (fromIntegral n):digits
|
||||
| otherwise =
|
||||
let newDigits = (fromIntegral (n `mod` 0x100)):digits in
|
||||
case n `div` 0x100 of
|
||||
0 -> newDigits
|
||||
k -> toBaseWord8 newDigits k
|
||||
parseText :: String -> Text
|
||||
parseText = decodeUtf16BE . pack . fmap hexString . pairDigits
|
||||
|
||||
data StateOperator =
|
||||
Cm | W | J | J_ | M | D | Ri | I | Gs -- general graphic state
|
||||
|
@ -199,8 +193,8 @@ runOperator (DQuote, [Typed (StringObject outputString)]) =
|
|||
runOperator _ = return []
|
||||
|
||||
decodeString :: StringObject -> ParserWithFont ByteString
|
||||
decodeString (Hexadecimal h) = decodeString (Literal (utf8Decode h))
|
||||
decodeString (Hexadecimal h) = decodeString (Literal (Text.unpack $ parseText h))
|
||||
decodeString (Literal s) =
|
||||
undefined
|
||||
asks
|
||||
where
|
||||
bytes = UTF8.fromString s
|
||||
bytes = Char8.pack s
|
||||
|
|
Loading…
Reference in New Issue