Separate CMap and Text in two distinct modules

2019-09-27 18:16:12 +02:00 · 2019-09-27 18:16:12 +02:00 · 3b59fd0c61
parent 0374b72920
commit 3b59fd0c61
5 changed files with 117 additions and 101 deletions
--- a/Hufflepdf.cabal
+++ b/Hufflepdf.cabal
@ -17,6 +17,7 @@ cabal-version:       >=1.10
 library
  exposed-modules:     PDF
                     , PDF.CMap
                     , PDF.EOL
                     , PDF.Object
                     , PDF.Output
--- a/examples/getText.hs
+++ b/examples/getText.hs
@ -11,12 +11,13 @@ import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict)
 import Data.Map (Map, (!))
 import qualified Data.Map as Map (empty, insert, lookup, toList)
 import PDF (Document(..), parseDocument)
 import PDF.CMap (CMap, CMappers, cMap, emptyCMap)
 import PDF.Object (
      Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
    , Object(..), Name(..), Structure(..)
  ,)
 import PDF.Output (ObjectId)
-import PDF.Text (CMap, CMappers, PageContents(..), cMap, emptyCMap, pageContents)
+import PDF.Text (PageContents(..), pageContents)
 import PDF.Update (unify)
 import System.Environment (getArgs)
 import System.IO (hPutStrLn, stderr)
--- a/src/PDF/CMap.hs
+++ b/src/PDF/CMap.hs
@ -0,0 +1,74 @@
 module PDF.CMap (
      CMap
    , CMappers
    , cMap
    , emptyCMap
  ) where
 import Control.Applicative ((<|>), many)
 import Data.Attoparsec.ByteString.Char8 (count, parseOnly)
 import qualified Data.Attoparsec.ByteString.Char8 as Atto (Parser)
 import Data.ByteString (ByteString, snoc)
 import qualified Data.ByteString as BS (init, last, null)
 import Data.Map (Map)
 import qualified Data.Map as Map (empty, fromList)
 import qualified PDF.EOL as EOL (charset, parser)
 import PDF.Object (
      DirectObject(..), Name, StringObject(..)
    , blank, directObject, hexString, integer, line, parseBytes, stringObject
  )
 import PDF.Parser (takeAll)
 type CMappers = Map Name CMap
 type CMap = Map Int ByteString
 emptyCMap :: CMap
 emptyCMap = Map.empty
 cMap :: ByteString -> Either String CMap
 cMap = parseOnly $ mconcat <$> many (cMapRange <|> cMapChar <|> ignoredLine)
  where
    ignoredLine =
      takeAll (not . (`elem` EOL.charset)) *> EOL.parser *> return Map.empty
 cMapRange :: Atto.Parser CMap
 cMapRange = do
  size <- integer <* line "beginbfrange"
  mconcat <$> count size (Map.fromList <$> rangeMapping) <* line "endbfrange"
  where
    rangeMapping = (,,) 
      <$> (stringObject <* blank)
      <*> (stringObject <* blank)
      <*> directObject <* EOL.parser
      >>= mapFromTo
 cMapChar :: Atto.Parser CMap
 cMapChar = do
  size <- integer <* line "beginbfchar"
  Map.fromList <$> count size charMapping <* line "endbfchar"
  where
    charMapping =
      (,) <$> (stringObject <* blank) <*> (stringObject <* blank) <* EOL.parser
      >>= pairMapping
 mapFromTo :: (StringObject, StringObject, DirectObject) -> Atto.Parser [(Int, ByteString)]
 mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) =
  let dstString = parseBytes dstFrom in
  return $ zip [hexString from .. hexString to] (textsFrom dstString)
  where
    textsFrom t
      | BS.null t = [t]
      | otherwise = (BS.init t `snoc`) <$> [BS.last t ..]
 mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) =
  zip [hexString from .. hexString to] <$> (mapM dstString dstPoints)
  where
    dstString (StringObject (Hexadecimal dstPoint)) = return $ parseBytes dstPoint
    dstString _ = fail "Invalid for a replacement string"
 mapFromTo _ = fail "invalid range mapping found"
 pairMapping :: (StringObject, StringObject) -> Atto.Parser (Int, ByteString)
 pairMapping (Hexadecimal from, Hexadecimal to) =
  return (hexString from, parseBytes to)
 pairMapping _ = fail "invalid pair mapping found"
--- a/src/PDF/Object.hs
+++ b/src/PDF/Object.hs
@ -22,11 +22,13 @@ module PDF.Object (
    , dictionary
    , directObject
    , eofMarker
    , hexString
    , integer
    , line
    , magicNumber
    , name
    , number
    , parseBytes
    , regular
    , stringObject
    , structure
@ -34,14 +36,14 @@ module PDF.Object (
 import Control.Applicative ((<|>), many)
 import Data.Attoparsec.ByteString.Char8 (choice, count, option, sepBy)
-import Data.ByteString.Char8 (ByteString)
+import Data.ByteString (ByteString)
-import qualified Data.ByteString.Char8 as BS (
+import qualified Data.ByteString as BS (concat, pack)
-    concat, cons, pack, singleton, unpack
+import qualified Data.ByteString.Char8 as Char8 (cons, pack, singleton, unpack)
  )
 import Data.Map (Map, (!), mapWithKey)
 import qualified Data.Map as Map (
    delete, empty, fromList, lookup, minViewWithKey, toList, union
  )
 import Data.Text.Encoding (decodeUtf16BE, encodeUtf8)
 import qualified PDF.EOL as EOL (charset, parser)
 import qualified PDF.Output as Output (concat, line, string)
 import PDF.Output (
@ -53,7 +55,7 @@ import PDF.Parser (MonadParser(..), Parser, (<?>), octDigit, oneOf)
 import Text.Printf (printf)
 line :: MonadParser m => String -> m ()
-line l = (string (BS.pack l) *> EOL.parser *> return ()) <?> printf "line «%s»" l
+line l = (string (Char8.pack l) *> EOL.parser *> return ()) <?> printf "line «%s»" l
 magicNumber :: ByteString
 magicNumber = "%PDF-"
@ -74,7 +76,7 @@ regular :: Char -> Bool
 regular = not . (`elem` (EOL.charset ++ whiteSpaceCharset ++ delimiterCharset))
 integer :: (Read a, Num a, MonadParser m) => m a
-integer = read . BS.unpack <$> decNumber <* blank <?> "decimal integer"
+integer = read . Char8.unpack <$> decNumber <* blank <?> "decimal integer"
 -------------------------------------
 --          OBJECTS
@ -101,13 +103,13 @@ instance Output Number where
      _ -> printf "%f" f
 number :: MonadParser m => m Number
-number = Number . read . BS.unpack <$>
+number = Number . read . Char8.unpack <$>
-  (mappend <$> sign <*> (integerPart <|> BS.cons '0' <$> floatPart))
+  (mappend <$> sign <*> (integerPart <|> Char8.cons '0' <$> floatPart))
  <?> "number"
  where
    sign = string "-" <|> option "" (char '+' >> return "")
    integerPart = mappend <$> decNumber <*> option "" floatPart
-    floatPart = BS.cons <$> char '.' <*> (option "0" $ decNumber)
+    floatPart = Char8.cons <$> char '.' <*> (option "0" $ decNumber)
 --
 -- StringObject
@ -120,18 +122,28 @@ instance Output StringObject where
 stringObject :: MonadParser m => m StringObject
 stringObject =
-    Literal . BS.unpack <$> (char '(' *> (BS.concat <$> literalString) <* char ')')
+    Literal . Char8.unpack <$> (char '(' *> (BS.concat <$> literalString) <* char ')')
-  <|> Hexadecimal . BS.unpack <$> (char '<' *> hexNumber <* char '>')
+  <|> Hexadecimal . Char8.unpack <$> (char '<' *> hexNumber <* char '>')
  <?> "string object (literal or hexadecimal)"
  where
    literalString = many literalStringBlock
    literalStringBlock = takeAll1 normalChar <|> matchingParenthesis <|> escapedChar
    normalChar = not . (`elem` ("\\()" :: String))
    matchingParenthesis =
-      mappend <$> (BS.cons <$> char '(' <*> literalStringBlock) <*> string ")"
+      mappend <$> (Char8.cons <$> char '(' <*> literalStringBlock) <*> string ")"
    escapedChar =
-      BS.cons <$> char '\\' <*> (BS.singleton <$> oneOf "nrtbf()\\" <|> octalCode)
+      Char8.cons <$> char '\\' <*> (Char8.singleton <$> oneOf "nrtbf()\\" <|> octalCode)
-    octalCode = choice $ (\n -> BS.pack <$> count n octDigit) <$> [1..3]
+    octalCode = choice $ (\n -> Char8.pack <$> count n octDigit) <$> [1..3]
 hexString :: (Num a, Read a) => String -> a
 hexString s = read $ "0x" ++ s
 parseBytes :: String -> ByteString
 parseBytes = encodeUtf8 . decodeUtf16BE . BS.pack . fmap hexString . pairDigits
  where
    pairDigits "" = []
    pairDigits [c] = [[c]]
    pairDigits (firstChar:secondChar:end) = (firstChar:[secondChar]):pairDigits end
 --
 -- Name
@ -142,7 +154,7 @@ instance Output Name where
  output (Name n) = Output.string ('/':n)
 name :: MonadParser m => m Name
-name = Name . BS.unpack <$> (char '/' *> takeAll regular) <?> "name"
+name = Name . Char8.unpack <$> (char '/' *> takeAll regular) <?> "name"
 --
 -- Array
--- a/src/PDF/Text.hs
+++ b/src/PDF/Text.hs
@ -1,97 +1,25 @@
 {-# LANGUAGE OverloadedStrings #-}
-module PDF.Text {
+module PDF.Text (
-      CMap
+      PageContents(..)
    , CMappers
    , PageContents(..)
    , cMap
    , emptyCMap
    , pageContents
  ) where
-import Control.Applicative ((<|>), many)
+import Control.Applicative ((<|>))
 import Control.Monad (foldM, join)
 import Control.Monad.Reader (ReaderT, runReaderT, asks)
 import Control.Monad.State (get, put)
-import Data.Attoparsec.ByteString.Char8 (choice, count, parseOnly, sepBy)
+import Data.Attoparsec.ByteString.Char8 (choice, count, sepBy)
-import qualified Data.Attoparsec.ByteString.Char8 as Atto (Parser)
+import Data.ByteString (ByteString)
-import Data.ByteString (ByteString, pack)
+import qualified Data.ByteString.Char8 as Char8 (unpack)
-import Data.Map (Map, (!))
+import Data.Map ((!))
-import qualified Data.Map as Map (empty, fromList, lookup)
+import qualified Data.Map as Map (lookup)
-import Data.Text (Text, snoc)
+import PDF.CMap (CMappers, CMap, emptyCMap)
 import qualified Data.Text as Text (init, last, null, unpack)
 import Data.Text.Encoding (decodeUtf16BE, encodeUtf8)
 import PDF.Object (
-      DirectObject(..), Name, StringObject(..)
+      DirectObject(..), StringObject(..)
-    , array, blank, directObject, integer, line, name, regular, stringObject
+    , array, blank, name, parseBytes, regular, stringObject
  )
 import qualified PDF.EOL as EOL (charset, parser)
 import PDF.Parser (MonadParser, Parser, evalParser, string, takeAll)
 type CMappers = Map Name CMap
 type CMap = Map Int Text
 emptyCMap :: CMap
 emptyCMap = Map.empty
 cMap :: ByteString -> Either String CMap
 cMap = parseOnly $ mconcat <$> many (cMapRange <|> cMapChar <|> ignoredLine)
  where
    ignoredLine =
      takeAll (not . (`elem` EOL.charset)) *> EOL.parser *> return Map.empty
 cMapRange :: Atto.Parser CMap
 cMapRange = do
  size <- integer <* line "beginbfrange"
  mconcat <$> count size (Map.fromList <$> rangeMapping) <* line "endbfrange"
  where
    rangeMapping = (,,) 
      <$> (stringObject <* blank)
      <*> (stringObject <* blank)
      <*> directObject <* EOL.parser
      >>= mapFromTo
 cMapChar :: Atto.Parser CMap
 cMapChar = do
  size <- integer <* line "beginbfchar"
  Map.fromList <$> count size charMapping <* line "endbfchar"
  where
    charMapping =
      (,) <$> (stringObject <* blank) <*> (stringObject <* blank) <* EOL.parser
      >>= pairMapping
 mapFromTo :: (StringObject, StringObject, DirectObject) -> Atto.Parser [(Int, Text)]
 mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) =
  let dstString = parseText dstFrom in
  return $ zip [hexString from .. hexString to] (textsFrom dstString)
  where
    textsFrom t
      | Text.null t = [t]
      | otherwise = (Text.init t `snoc`) <$> [Text.last t ..]
 mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) =
  zip [hexString from .. hexString to] <$> (mapM dstString dstPoints)
  where
    dstString (StringObject (Hexadecimal dstPoint)) = return $ parseText dstPoint
    dstString _ = fail "Invalid for a replacement string"
 mapFromTo _ = fail "invalid range mapping found"
 pairMapping :: (StringObject, StringObject) -> Atto.Parser (Int, Text)
 pairMapping (Hexadecimal from, Hexadecimal to) =
  return (hexString from, parseText to)
 pairMapping _ = fail "invalid pair mapping found"
 hexString :: (Num a, Read a) => String -> a
 hexString s = read $ "0x" ++ s
 pairDigits :: String -> [String]
 pairDigits "" = []
 pairDigits [c] = [[c]]
 pairDigits (firstChar:secondChar:end) = (firstChar:[secondChar]):pairDigits end
 parseText :: String -> Text
 parseText = decodeUtf16BE . pack . fmap hexString . pairDigits
 data StateOperator =
  Cm | W | J | J_ | M | D | Ri | I | Gs -- general graphic state
  deriving (Bounded, Enum)
@ -190,7 +118,7 @@ runOperator (DQuote, [Typed (StringObject outputString)]) =
 runOperator _ = return []
 decodeString :: StringObject -> ParserWithFont ByteString
-decodeString (Hexadecimal h) = decodeString (Literal (Text.unpack $ parseText h))
+decodeString (Hexadecimal h) = decodeString (Literal (Char8.unpack $ parseBytes h))
 decodeString (Literal litString) = get >>= convertBytes litString
  where
    convertBytes :: String -> CMap -> ParserWithFont ByteString
@ -204,4 +132,4 @@ decodeString (Literal litString) = get >>= convertBytes litString
        case (Map.lookup code someCMap, s) of
          (Nothing, (c:cs)) -> convertBytesAux (code * 256 + fromEnum c) (size + 1) cs someCMap
          (Nothing, []) -> fail "No character left to read but no code recognized"
-          (Just outputText, _) -> mappend (encodeUtf8 outputText) <$> convertBytes s someCMap
+          (Just outputText, _) -> mappend outputText <$> convertBytes s someCMap