diff --git a/Hufflepdf.cabal b/Hufflepdf.cabal index 649b1b4..69a94a5 100644 --- a/Hufflepdf.cabal +++ b/Hufflepdf.cabal @@ -17,6 +17,7 @@ cabal-version: >=1.10 library exposed-modules: PDF + , PDF.CMap , PDF.EOL , PDF.Object , PDF.Output diff --git a/examples/getText.hs b/examples/getText.hs index 0515104..e3f563b 100755 --- a/examples/getText.hs +++ b/examples/getText.hs @@ -11,12 +11,13 @@ import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict) import Data.Map (Map, (!)) import qualified Data.Map as Map (empty, insert, lookup, toList) import PDF (Document(..), parseDocument) +import PDF.CMap (CMap, CMappers, cMap, emptyCMap) import PDF.Object ( Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..) , Object(..), Name(..), Structure(..) ,) import PDF.Output (ObjectId) -import PDF.Text (CMap, CMappers, PageContents(..), cMap, emptyCMap, pageContents) +import PDF.Text (PageContents(..), pageContents) import PDF.Update (unify) import System.Environment (getArgs) import System.IO (hPutStrLn, stderr) diff --git a/src/PDF/CMap.hs b/src/PDF/CMap.hs new file mode 100644 index 0000000..6fee0d4 --- /dev/null +++ b/src/PDF/CMap.hs @@ -0,0 +1,74 @@ +module PDF.CMap ( + CMap + , CMappers + , cMap + , emptyCMap + ) where + +import Control.Applicative ((<|>), many) +import Data.Attoparsec.ByteString.Char8 (count, parseOnly) +import qualified Data.Attoparsec.ByteString.Char8 as Atto (Parser) +import Data.ByteString (ByteString, snoc) +import qualified Data.ByteString as BS (init, last, null) +import Data.Map (Map) +import qualified Data.Map as Map (empty, fromList) +import qualified PDF.EOL as EOL (charset, parser) +import PDF.Object ( + DirectObject(..), Name, StringObject(..) + , blank, directObject, hexString, integer, line, parseBytes, stringObject + ) +import PDF.Parser (takeAll) + +type CMappers = Map Name CMap +type CMap = Map Int ByteString + +emptyCMap :: CMap +emptyCMap = Map.empty + +cMap :: ByteString -> Either String CMap +cMap = parseOnly $ mconcat <$> many (cMapRange <|> cMapChar <|> ignoredLine) + where + ignoredLine = + takeAll (not . (`elem` EOL.charset)) *> EOL.parser *> return Map.empty + +cMapRange :: Atto.Parser CMap +cMapRange = do + size <- integer <* line "beginbfrange" + mconcat <$> count size (Map.fromList <$> rangeMapping) <* line "endbfrange" + where + rangeMapping = (,,) + <$> (stringObject <* blank) + <*> (stringObject <* blank) + <*> directObject <* EOL.parser + >>= mapFromTo + +cMapChar :: Atto.Parser CMap +cMapChar = do + size <- integer <* line "beginbfchar" + Map.fromList <$> count size charMapping <* line "endbfchar" + where + charMapping = + (,) <$> (stringObject <* blank) <*> (stringObject <* blank) <* EOL.parser + >>= pairMapping + +mapFromTo :: (StringObject, StringObject, DirectObject) -> Atto.Parser [(Int, ByteString)] +mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) = + let dstString = parseBytes dstFrom in + return $ zip [hexString from .. hexString to] (textsFrom dstString) + where + textsFrom t + | BS.null t = [t] + | otherwise = (BS.init t `snoc`) <$> [BS.last t ..] + +mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) = + zip [hexString from .. hexString to] <$> (mapM dstString dstPoints) + where + dstString (StringObject (Hexadecimal dstPoint)) = return $ parseBytes dstPoint + dstString _ = fail "Invalid for a replacement string" + +mapFromTo _ = fail "invalid range mapping found" + +pairMapping :: (StringObject, StringObject) -> Atto.Parser (Int, ByteString) +pairMapping (Hexadecimal from, Hexadecimal to) = + return (hexString from, parseBytes to) +pairMapping _ = fail "invalid pair mapping found" diff --git a/src/PDF/Object.hs b/src/PDF/Object.hs index a2c4b71..d9d2457 100644 --- a/src/PDF/Object.hs +++ b/src/PDF/Object.hs @@ -22,11 +22,13 @@ module PDF.Object ( , dictionary , directObject , eofMarker + , hexString , integer , line , magicNumber , name , number + , parseBytes , regular , stringObject , structure @@ -34,14 +36,14 @@ module PDF.Object ( import Control.Applicative ((<|>), many) import Data.Attoparsec.ByteString.Char8 (choice, count, option, sepBy) -import Data.ByteString.Char8 (ByteString) -import qualified Data.ByteString.Char8 as BS ( - concat, cons, pack, singleton, unpack - ) +import Data.ByteString (ByteString) +import qualified Data.ByteString as BS (concat, pack) +import qualified Data.ByteString.Char8 as Char8 (cons, pack, singleton, unpack) import Data.Map (Map, (!), mapWithKey) import qualified Data.Map as Map ( delete, empty, fromList, lookup, minViewWithKey, toList, union ) +import Data.Text.Encoding (decodeUtf16BE, encodeUtf8) import qualified PDF.EOL as EOL (charset, parser) import qualified PDF.Output as Output (concat, line, string) import PDF.Output ( @@ -53,7 +55,7 @@ import PDF.Parser (MonadParser(..), Parser, (), octDigit, oneOf) import Text.Printf (printf) line :: MonadParser m => String -> m () -line l = (string (BS.pack l) *> EOL.parser *> return ()) printf "line «%s»" l +line l = (string (Char8.pack l) *> EOL.parser *> return ()) printf "line «%s»" l magicNumber :: ByteString magicNumber = "%PDF-" @@ -74,7 +76,7 @@ regular :: Char -> Bool regular = not . (`elem` (EOL.charset ++ whiteSpaceCharset ++ delimiterCharset)) integer :: (Read a, Num a, MonadParser m) => m a -integer = read . BS.unpack <$> decNumber <* blank "decimal integer" +integer = read . Char8.unpack <$> decNumber <* blank "decimal integer" ------------------------------------- -- OBJECTS @@ -101,13 +103,13 @@ instance Output Number where _ -> printf "%f" f number :: MonadParser m => m Number -number = Number . read . BS.unpack <$> - (mappend <$> sign <*> (integerPart <|> BS.cons '0' <$> floatPart)) +number = Number . read . Char8.unpack <$> + (mappend <$> sign <*> (integerPart <|> Char8.cons '0' <$> floatPart)) "number" where sign = string "-" <|> option "" (char '+' >> return "") integerPart = mappend <$> decNumber <*> option "" floatPart - floatPart = BS.cons <$> char '.' <*> (option "0" $ decNumber) + floatPart = Char8.cons <$> char '.' <*> (option "0" $ decNumber) -- -- StringObject @@ -120,18 +122,28 @@ instance Output StringObject where stringObject :: MonadParser m => m StringObject stringObject = - Literal . BS.unpack <$> (char '(' *> (BS.concat <$> literalString) <* char ')') - <|> Hexadecimal . BS.unpack <$> (char '<' *> hexNumber <* char '>') + Literal . Char8.unpack <$> (char '(' *> (BS.concat <$> literalString) <* char ')') + <|> Hexadecimal . Char8.unpack <$> (char '<' *> hexNumber <* char '>') "string object (literal or hexadecimal)" where literalString = many literalStringBlock literalStringBlock = takeAll1 normalChar <|> matchingParenthesis <|> escapedChar normalChar = not . (`elem` ("\\()" :: String)) matchingParenthesis = - mappend <$> (BS.cons <$> char '(' <*> literalStringBlock) <*> string ")" + mappend <$> (Char8.cons <$> char '(' <*> literalStringBlock) <*> string ")" escapedChar = - BS.cons <$> char '\\' <*> (BS.singleton <$> oneOf "nrtbf()\\" <|> octalCode) - octalCode = choice $ (\n -> BS.pack <$> count n octDigit) <$> [1..3] + Char8.cons <$> char '\\' <*> (Char8.singleton <$> oneOf "nrtbf()\\" <|> octalCode) + octalCode = choice $ (\n -> Char8.pack <$> count n octDigit) <$> [1..3] + +hexString :: (Num a, Read a) => String -> a +hexString s = read $ "0x" ++ s + +parseBytes :: String -> ByteString +parseBytes = encodeUtf8 . decodeUtf16BE . BS.pack . fmap hexString . pairDigits + where + pairDigits "" = [] + pairDigits [c] = [[c]] + pairDigits (firstChar:secondChar:end) = (firstChar:[secondChar]):pairDigits end -- -- Name @@ -142,7 +154,7 @@ instance Output Name where output (Name n) = Output.string ('/':n) name :: MonadParser m => m Name -name = Name . BS.unpack <$> (char '/' *> takeAll regular) "name" +name = Name . Char8.unpack <$> (char '/' *> takeAll regular) "name" -- -- Array diff --git a/src/PDF/Text.hs b/src/PDF/Text.hs index bfcbeb5..7927561 100644 --- a/src/PDF/Text.hs +++ b/src/PDF/Text.hs @@ -1,97 +1,25 @@ {-# LANGUAGE OverloadedStrings #-} -module PDF.Text { - CMap - , CMappers - , PageContents(..) - , cMap - , emptyCMap +module PDF.Text ( + PageContents(..) , pageContents ) where -import Control.Applicative ((<|>), many) +import Control.Applicative ((<|>)) import Control.Monad (foldM, join) import Control.Monad.Reader (ReaderT, runReaderT, asks) import Control.Monad.State (get, put) -import Data.Attoparsec.ByteString.Char8 (choice, count, parseOnly, sepBy) -import qualified Data.Attoparsec.ByteString.Char8 as Atto (Parser) -import Data.ByteString (ByteString, pack) -import Data.Map (Map, (!)) -import qualified Data.Map as Map (empty, fromList, lookup) -import Data.Text (Text, snoc) -import qualified Data.Text as Text (init, last, null, unpack) -import Data.Text.Encoding (decodeUtf16BE, encodeUtf8) +import Data.Attoparsec.ByteString.Char8 (choice, count, sepBy) +import Data.ByteString (ByteString) +import qualified Data.ByteString.Char8 as Char8 (unpack) +import Data.Map ((!)) +import qualified Data.Map as Map (lookup) +import PDF.CMap (CMappers, CMap, emptyCMap) import PDF.Object ( - DirectObject(..), Name, StringObject(..) - , array, blank, directObject, integer, line, name, regular, stringObject + DirectObject(..), StringObject(..) + , array, blank, name, parseBytes, regular, stringObject ) -import qualified PDF.EOL as EOL (charset, parser) import PDF.Parser (MonadParser, Parser, evalParser, string, takeAll) -type CMappers = Map Name CMap -type CMap = Map Int Text - -emptyCMap :: CMap -emptyCMap = Map.empty - -cMap :: ByteString -> Either String CMap -cMap = parseOnly $ mconcat <$> many (cMapRange <|> cMapChar <|> ignoredLine) - where - ignoredLine = - takeAll (not . (`elem` EOL.charset)) *> EOL.parser *> return Map.empty - -cMapRange :: Atto.Parser CMap -cMapRange = do - size <- integer <* line "beginbfrange" - mconcat <$> count size (Map.fromList <$> rangeMapping) <* line "endbfrange" - where - rangeMapping = (,,) - <$> (stringObject <* blank) - <*> (stringObject <* blank) - <*> directObject <* EOL.parser - >>= mapFromTo - -cMapChar :: Atto.Parser CMap -cMapChar = do - size <- integer <* line "beginbfchar" - Map.fromList <$> count size charMapping <* line "endbfchar" - where - charMapping = - (,) <$> (stringObject <* blank) <*> (stringObject <* blank) <* EOL.parser - >>= pairMapping - -mapFromTo :: (StringObject, StringObject, DirectObject) -> Atto.Parser [(Int, Text)] -mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) = - let dstString = parseText dstFrom in - return $ zip [hexString from .. hexString to] (textsFrom dstString) - where - textsFrom t - | Text.null t = [t] - | otherwise = (Text.init t `snoc`) <$> [Text.last t ..] - -mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) = - zip [hexString from .. hexString to] <$> (mapM dstString dstPoints) - where - dstString (StringObject (Hexadecimal dstPoint)) = return $ parseText dstPoint - dstString _ = fail "Invalid for a replacement string" - -mapFromTo _ = fail "invalid range mapping found" - -pairMapping :: (StringObject, StringObject) -> Atto.Parser (Int, Text) -pairMapping (Hexadecimal from, Hexadecimal to) = - return (hexString from, parseText to) -pairMapping _ = fail "invalid pair mapping found" - -hexString :: (Num a, Read a) => String -> a -hexString s = read $ "0x" ++ s - -pairDigits :: String -> [String] -pairDigits "" = [] -pairDigits [c] = [[c]] -pairDigits (firstChar:secondChar:end) = (firstChar:[secondChar]):pairDigits end - -parseText :: String -> Text -parseText = decodeUtf16BE . pack . fmap hexString . pairDigits - data StateOperator = Cm | W | J | J_ | M | D | Ri | I | Gs -- general graphic state deriving (Bounded, Enum) @@ -190,7 +118,7 @@ runOperator (DQuote, [Typed (StringObject outputString)]) = runOperator _ = return [] decodeString :: StringObject -> ParserWithFont ByteString -decodeString (Hexadecimal h) = decodeString (Literal (Text.unpack $ parseText h)) +decodeString (Hexadecimal h) = decodeString (Literal (Char8.unpack $ parseBytes h)) decodeString (Literal litString) = get >>= convertBytes litString where convertBytes :: String -> CMap -> ParserWithFont ByteString @@ -204,4 +132,4 @@ decodeString (Literal litString) = get >>= convertBytes litString case (Map.lookup code someCMap, s) of (Nothing, (c:cs)) -> convertBytesAux (code * 256 + fromEnum c) (size + 1) cs someCMap (Nothing, []) -> fail "No character left to read but no code recognized" - (Just outputText, _) -> mappend (encodeUtf8 outputText) <$> convertBytes s someCMap + (Just outputText, _) -> mappend outputText <$> convertBytes s someCMap