Going to try with Text, naturally handling UTF-16 but will still have to parse «int codes» manually from strings

This commit is contained in:
Tissevert 2019-09-26 16:56:13 +02:00
parent 98d029c4d4
commit 1dd22c3889
2 changed files with 23 additions and 28 deletions

View File

@ -31,6 +31,7 @@ library
, bytestring
, containers
, mtl
, text
, utf8-string
hs-source-dirs: src
ghc-options: -Wall

View File

@ -15,9 +15,14 @@ import Control.Monad.State (put)
import Data.Attoparsec.ByteString.Char8 (choice, count, parseOnly, sepBy)
import qualified Data.Attoparsec.ByteString.Char8 as Atto (Parser)
import Data.ByteString (ByteString, pack)
import qualified Data.ByteString.UTF8 as UTF8 (fromString, toString)
import qualified Data.ByteString as BS (head)
import Data.ByteString.Char8 as Char8 (pack)
import qualified Data.ByteString.UTF8 as UTF8 (fromString)
import Data.Map (Map, (!))
import qualified Data.Map as Map (empty, fromList)
import Data.Text (Text, snoc)
import qualified Data.Text as Text (init, last, null, unpack)
import Data.Text.Encoding (decodeUtf16BE)
import PDF.Object (
DirectObject(..), Name, StringObject(..)
, array, blank, directObject, integer, line, name, regular, stringObject
@ -26,7 +31,7 @@ import qualified PDF.EOL as EOL (charset, parser)
import PDF.Parser (Parser, evalParser, string, takeAll)
type CMappers = Map Name CMap
type CMap = Map Int String
type CMap = Map Int Text
emptyCMap :: CMap
emptyCMap = Map.empty
@ -57,26 +62,26 @@ cMapChar = do
(,) <$> (stringObject <* blank) <*> (stringObject <* blank) <* EOL.parser
>>= pairMapping
mapFromTo :: (StringObject, StringObject, DirectObject) -> Atto.Parser [(Int, String)]
mapFromTo :: (StringObject, StringObject, DirectObject) -> Atto.Parser [(Int, Text)]
mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) =
let dstString = utf8Decode dstFrom in
return $ zip [hexString from .. hexString to] (stringsFrom dstString)
let dstString = parseText dstFrom in
return $ zip [hexString from .. hexString to] (textsFrom dstString)
where
stringsFrom [] = [[]]
stringsFrom [x] = (:[]) <$> [x..]
stringsFrom (x:xs) = (x:) <$> stringsFrom xs
textsFrom t
| Text.null t = [t]
| otherwise = (Text.init t `snoc`) <$> [Text.last t ..]
mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) =
zip [hexString from .. hexString to] <$> (mapM dstString dstPoints)
where
dstString (StringObject (Hexadecimal dstPoint)) = return $ utf8Decode dstPoint
dstString (StringObject (Hexadecimal dstPoint)) = return $ parseText dstPoint
dstString _ = fail "Invalid for a replacement string"
mapFromTo _ = fail "invalid range mapping found"
pairMapping :: (StringObject, StringObject) -> Atto.Parser (Int, String)
pairMapping :: (StringObject, StringObject) -> Atto.Parser (Int, Text)
pairMapping (Hexadecimal from, Hexadecimal to) =
return (hexString from, utf8Decode to)
return (hexString from, parseText to)
pairMapping _ = fail "invalid pair mapping found"
hexString :: (Num a, Read a) => String -> a
@ -85,21 +90,10 @@ hexString s = read $ "0x" ++ s
pairDigits :: String -> [String]
pairDigits "" = []
pairDigits [c] = [[c]]
pairDigits (a:b:end) = (a:[b]):pairDigits end
pairDigits (firstChar:secondChar:end) = (firstChar:[secondChar]):pairDigits end
toBytes :: String -> ByteString
toBytes = pack . fmap hexString . pairDigits
utf8Decode :: String -> String
utf8Decode = UTF8.toString . pack . toBaseWord8 [] . (hexString :: String -> Integer)
where
toBaseWord8 digits n
| n < 0xff = (fromIntegral n):digits
| otherwise =
let newDigits = (fromIntegral (n `mod` 0x100)):digits in
case n `div` 0x100 of
0 -> newDigits
k -> toBaseWord8 newDigits k
parseText :: String -> Text
parseText = decodeUtf16BE . pack . fmap hexString . pairDigits
data StateOperator =
Cm | W | J | J_ | M | D | Ri | I | Gs -- general graphic state
@ -199,8 +193,8 @@ runOperator (DQuote, [Typed (StringObject outputString)]) =
runOperator _ = return []
decodeString :: StringObject -> ParserWithFont ByteString
decodeString (Hexadecimal h) = decodeString (Literal (utf8Decode h))
decodeString (Hexadecimal h) = decodeString (Literal (Text.unpack $ parseText h))
decodeString (Literal s) =
undefined
asks
where
bytes = UTF8.fromString s
bytes = Char8.pack s