diff --git a/src/PDF/Text.hs b/src/PDF/Text.hs index e7c71a6..fc5e67a 100644 --- a/src/PDF/Text.hs +++ b/src/PDF/Text.hs @@ -1,12 +1,12 @@ {-# LANGUAGE OverloadedStrings #-} -module PDF.Text ( +module PDF.Text {-( CMap , CMappers , PageContents(..) , cMap , emptyCMap , pageContents - ) where + )-} where import Control.Applicative ((<|>), many) import Control.Monad (foldM, join) @@ -14,8 +14,8 @@ import Control.Monad.Reader (ReaderT, runReaderT, asks) import Control.Monad.State (put) import Data.Attoparsec.ByteString.Char8 (choice, count, parseOnly, sepBy) import qualified Data.Attoparsec.ByteString.Char8 as Atto (Parser) -import Data.ByteString.Char8 (ByteString, pack) -import qualified Data.ByteString.UTF8 as UTF8 (toString) +import Data.ByteString (ByteString, pack) +import qualified Data.ByteString.UTF8 as UTF8 (fromString, toString) import Data.Map (Map, (!)) import qualified Data.Map as Map (empty, fromList) import PDF.Object ( @@ -26,7 +26,7 @@ import qualified PDF.EOL as EOL (charset, parser) import PDF.Parser (Parser, evalParser, string, takeAll) type CMappers = Map Name CMap -type CMap = Map Int ByteString +type CMap = Map Int String emptyCMap :: CMap emptyCMap = Map.empty @@ -40,42 +40,64 @@ cMap = parseOnly $ mconcat <$> many (cMapRange <|> cMapChar <|> ignoredLine) cMapRange :: Atto.Parser CMap cMapRange = do size <- integer <* line "beginbfrange" - mconcat <$> count size rangeMapping <* line "endbfrange" + mconcat <$> count size (Map.fromList <$> rangeMapping) <* line "endbfrange" where - rangeMapping = mapFromTo + rangeMapping = (,,) <$> (stringObject <* blank) <*> (stringObject <* blank) <*> directObject <* EOL.parser + >>= mapFromTo cMapChar :: Atto.Parser CMap cMapChar = do size <- integer <* line "beginbfchar" Map.fromList <$> count size charMapping <* line "endbfchar" where - charMapping = pairMapping - <$> (stringObject <* blank) <*> (stringObject <* blank) <* EOL.parser + charMapping = + (,) <$> (stringObject <* blank) <*> (stringObject <* blank) <* EOL.parser + >>= pairMapping -mapFromTo (Hexadecimal from) (Hexadecimal to) (StringObject (Hexadecimal dstFrom)) = - let dstFrom -mapFromTo (Hexadecimal from) (Hexadecimal to) (Array dstPoints) = undefined -mapFromTo _ _ _ = fail "invalid range mapping found" +mapFromTo :: (StringObject, StringObject, DirectObject) -> Atto.Parser [(Int, String)] +mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) = + let dstString = utf8Decode dstFrom in + return $ zip [hexString from .. hexString to] (stringsFrom dstString) + where + stringsFrom [] = [[]] + stringsFrom [x] = (:[]) <$> [x..] + stringsFrom (x:xs) = (x:) <$> stringsFrom xs -pairMapping :: StringObject -> StringObject -> (Int, ByteString) -pairMapping (Hexadecimal from) (Hexadecimal to) = - (hexString from, toByteString to) -pairMapping = fail "invalid pair mapping found" +mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) = + zip [hexString from .. hexString to] <$> (mapM dstString dstPoints) + where + dstString (StringObject (Hexadecimal dstPoint)) = return $ utf8Decode dstPoint + dstString _ = fail "Invalid for a replacement string" -hexString :: Num a => String -> a +mapFromTo _ = fail "invalid range mapping found" + +pairMapping :: (StringObject, StringObject) -> Atto.Parser (Int, String) +pairMapping (Hexadecimal from, Hexadecimal to) = + return (hexString from, utf8Decode to) +pairMapping _ = fail "invalid pair mapping found" + +hexString :: (Num a, Read a) => String -> a hexString s = read $ "0x" ++ s -toByteString :: String -> [Word8] -toByteString = pack . toBaseWord8 [] . hexString +pairDigits :: String -> [String] +pairDigits "" = [] +pairDigits [c] = [[c]] +pairDigits (a:b:end) = (a:[b]):pairDigits end + +toBytes :: String -> ByteString +toBytes = pack . fmap hexString . pairDigits + +utf8Decode :: String -> String +utf8Decode = UTF8.toString . pack . toBaseWord8 [] . (hexString :: String -> Integer) where toBaseWord8 digits n | n < 0xff = (fromIntegral n):digits | otherwise = - let newDigits = (fromIntegral (n `mod` 0xff)):digits in - case n `div` 0xff of + let newDigits = (fromIntegral (n `mod` 0x100)):digits in + case n `div` 0x100 of 0 -> newDigits k -> toBaseWord8 newDigits k @@ -177,4 +199,8 @@ runOperator (DQuote, [Typed (StringObject outputString)]) = runOperator _ = return [] decodeString :: StringObject -> ParserWithFont ByteString -decodeString = undefined +decodeString (Hexadecimal h) = decodeString (Literal (utf8Decode h)) +decodeString (Literal s) = + undefined + where + bytes = UTF8.fromString s