Separate CMap and Text in two distinct modules

2019-09-27 18:16:12 +02:00 · 2019-09-27 18:16:12 +02:00 · 3b59fd0c61
commit 3b59fd0c61
parent 0374b72920
5 changed files with 117 additions and 101 deletions
--- a/Hufflepdf.cabal
+++ b/Hufflepdf.cabal
@ -17,6 +17,7 @@ cabal-version:       >=1.10

 library
  exposed-modules:     PDF
+                     , PDF.CMap
                     , PDF.EOL
                     , PDF.Object
                     , PDF.Output
--- a/examples/getText.hs
+++ b/examples/getText.hs
@ -11,12 +11,13 @@ import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict)
 import Data.Map (Map, (!))
 import qualified Data.Map as Map (empty, insert, lookup, toList)
 import PDF (Document(..), parseDocument)
+import PDF.CMap (CMap, CMappers, cMap, emptyCMap)
 import PDF.Object (
      Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
    , Object(..), Name(..), Structure(..)
  ,)
 import PDF.Output (ObjectId)
-import PDF.Text (CMap, CMappers, PageContents(..), cMap, emptyCMap, pageContents)
+import PDF.Text (PageContents(..), pageContents)
 import PDF.Update (unify)
 import System.Environment (getArgs)
 import System.IO (hPutStrLn, stderr)
--- a/src/PDF/CMap.hs
+++ b/src/PDF/CMap.hs
@ -0,0 +1,74 @@
+module PDF.CMap (
+      CMap
+    , CMappers
+    , cMap
+    , emptyCMap
+  ) where
+
+import Control.Applicative ((<|>), many)
+import Data.Attoparsec.ByteString.Char8 (count, parseOnly)
+import qualified Data.Attoparsec.ByteString.Char8 as Atto (Parser)
+import Data.ByteString (ByteString, snoc)
+import qualified Data.ByteString as BS (init, last, null)
+import Data.Map (Map)
+import qualified Data.Map as Map (empty, fromList)
+import qualified PDF.EOL as EOL (charset, parser)
+import PDF.Object (
+      DirectObject(..), Name, StringObject(..)
+    , blank, directObject, hexString, integer, line, parseBytes, stringObject
+  )
+import PDF.Parser (takeAll)
+
+type CMappers = Map Name CMap
+type CMap = Map Int ByteString
+
+emptyCMap :: CMap
+emptyCMap = Map.empty
+
+cMap :: ByteString -> Either String CMap
+cMap = parseOnly $ mconcat <$> many (cMapRange <|> cMapChar <|> ignoredLine)
+  where
+    ignoredLine =
+      takeAll (not . (`elem` EOL.charset)) *> EOL.parser *> return Map.empty
+
+cMapRange :: Atto.Parser CMap
+cMapRange = do
+  size <- integer <* line "beginbfrange"
+  mconcat <$> count size (Map.fromList <$> rangeMapping) <* line "endbfrange"
+  where
+    rangeMapping = (,,) 
+      <$> (stringObject <* blank)
+      <*> (stringObject <* blank)
+      <*> directObject <* EOL.parser
+      >>= mapFromTo
+
+cMapChar :: Atto.Parser CMap
+cMapChar = do
+  size <- integer <* line "beginbfchar"
+  Map.fromList <$> count size charMapping <* line "endbfchar"
+  where
+    charMapping =
+      (,) <$> (stringObject <* blank) <*> (stringObject <* blank) <* EOL.parser
+      >>= pairMapping
+
+mapFromTo :: (StringObject, StringObject, DirectObject) -> Atto.Parser [(Int, ByteString)]
+mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) =
+  let dstString = parseBytes dstFrom in
+  return $ zip [hexString from .. hexString to] (textsFrom dstString)
+  where
+    textsFrom t
+      | BS.null t = [t]
+      | otherwise = (BS.init t `snoc`) <$> [BS.last t ..]
+
+mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) =
+  zip [hexString from .. hexString to] <$> (mapM dstString dstPoints)
+  where
+    dstString (StringObject (Hexadecimal dstPoint)) = return $ parseBytes dstPoint
+    dstString _ = fail "Invalid for a replacement string"
+
+mapFromTo _ = fail "invalid range mapping found"
+
+pairMapping :: (StringObject, StringObject) -> Atto.Parser (Int, ByteString)
+pairMapping (Hexadecimal from, Hexadecimal to) =
+  return (hexString from, parseBytes to)
+pairMapping _ = fail "invalid pair mapping found"
--- a/src/PDF/Object.hs
+++ b/src/PDF/Object.hs
@ -22,11 +22,13 @@ module PDF.Object (
    , dictionary
    , directObject
    , eofMarker
+    , hexString
    , integer
    , line
    , magicNumber
    , name
    , number
+    , parseBytes
    , regular
    , stringObject
    , structure
@ -34,14 +36,14 @@ module PDF.Object (

 import Control.Applicative ((<|>), many)
 import Data.Attoparsec.ByteString.Char8 (choice, count, option, sepBy)
-import Data.ByteString.Char8 (ByteString)
-import qualified Data.ByteString.Char8 as BS (
-    concat, cons, pack, singleton, unpack
-  )
+import Data.ByteString (ByteString)
+import qualified Data.ByteString as BS (concat, pack)
+import qualified Data.ByteString.Char8 as Char8 (cons, pack, singleton, unpack)
 import Data.Map (Map, (!), mapWithKey)
 import qualified Data.Map as Map (
    delete, empty, fromList, lookup, minViewWithKey, toList, union
  )
+import Data.Text.Encoding (decodeUtf16BE, encodeUtf8)
 import qualified PDF.EOL as EOL (charset, parser)
 import qualified PDF.Output as Output (concat, line, string)
 import PDF.Output (
@ -53,7 +55,7 @@ import PDF.Parser (MonadParser(..), Parser, (<?>), octDigit, oneOf)
 import Text.Printf (printf)

 line :: MonadParser m => String -> m ()
-line l = (string (BS.pack l) *> EOL.parser *> return ()) <?> printf "line «%s»" l
+line l = (string (Char8.pack l) *> EOL.parser *> return ()) <?> printf "line «%s»" l

 magicNumber :: ByteString
 magicNumber = "%PDF-"
@ -74,7 +76,7 @@ regular :: Char -> Bool
 regular = not . (`elem` (EOL.charset ++ whiteSpaceCharset ++ delimiterCharset))

 integer :: (Read a, Num a, MonadParser m) => m a
-integer = read . BS.unpack <$> decNumber <* blank <?> "decimal integer"
+integer = read . Char8.unpack <$> decNumber <* blank <?> "decimal integer"

 -------------------------------------
 --          OBJECTS
@ -101,13 +103,13 @@ instance Output Number where
      _ -> printf "%f" f

 number :: MonadParser m => m Number
-number = Number . read . BS.unpack <$>
-  (mappend <$> sign <*> (integerPart <|> BS.cons '0' <$> floatPart))
+number = Number . read . Char8.unpack <$>
+  (mappend <$> sign <*> (integerPart <|> Char8.cons '0' <$> floatPart))
  <?> "number"
  where
    sign = string "-" <|> option "" (char '+' >> return "")
    integerPart = mappend <$> decNumber <*> option "" floatPart
-    floatPart = BS.cons <$> char '.' <*> (option "0" $ decNumber)
+    floatPart = Char8.cons <$> char '.' <*> (option "0" $ decNumber)

 --
 -- StringObject
@ -120,18 +122,28 @@ instance Output StringObject where

 stringObject :: MonadParser m => m StringObject
 stringObject =
-    Literal . BS.unpack <$> (char '(' *> (BS.concat <$> literalString) <* char ')')
-  <|> Hexadecimal . BS.unpack <$> (char '<' *> hexNumber <* char '>')
+    Literal . Char8.unpack <$> (char '(' *> (BS.concat <$> literalString) <* char ')')
+  <|> Hexadecimal . Char8.unpack <$> (char '<' *> hexNumber <* char '>')
  <?> "string object (literal or hexadecimal)"
  where
    literalString = many literalStringBlock
    literalStringBlock = takeAll1 normalChar <|> matchingParenthesis <|> escapedChar
    normalChar = not . (`elem` ("\\()" :: String))
    matchingParenthesis =
-      mappend <$> (BS.cons <$> char '(' <*> literalStringBlock) <*> string ")"
+      mappend <$> (Char8.cons <$> char '(' <*> literalStringBlock) <*> string ")"
    escapedChar =
-      BS.cons <$> char '\\' <*> (BS.singleton <$> oneOf "nrtbf()\\" <|> octalCode)
-    octalCode = choice $ (\n -> BS.pack <$> count n octDigit) <$> [1..3]
+      Char8.cons <$> char '\\' <*> (Char8.singleton <$> oneOf "nrtbf()\\" <|> octalCode)
+    octalCode = choice $ (\n -> Char8.pack <$> count n octDigit) <$> [1..3]
+
+hexString :: (Num a, Read a) => String -> a
+hexString s = read $ "0x" ++ s
+
+parseBytes :: String -> ByteString
+parseBytes = encodeUtf8 . decodeUtf16BE . BS.pack . fmap hexString . pairDigits
+  where
+    pairDigits "" = []
+    pairDigits [c] = [[c]]
+    pairDigits (firstChar:secondChar:end) = (firstChar:[secondChar]):pairDigits end

 --
 -- Name
@ -142,7 +154,7 @@ instance Output Name where
  output (Name n) = Output.string ('/':n)

 name :: MonadParser m => m Name
-name = Name . BS.unpack <$> (char '/' *> takeAll regular) <?> "name"
+name = Name . Char8.unpack <$> (char '/' *> takeAll regular) <?> "name"

 --
 -- Array
--- a/src/PDF/Text.hs
+++ b/src/PDF/Text.hs
@ -1,97 +1,25 @@
 {-# LANGUAGE OverloadedStrings #-}
-module PDF.Text {
-      CMap
-    , CMappers
-    , PageContents(..)
-    , cMap
-    , emptyCMap
+module PDF.Text (
+      PageContents(..)
    , pageContents
  ) where

-import Control.Applicative ((<|>), many)
+import Control.Applicative ((<|>))
 import Control.Monad (foldM, join)
 import Control.Monad.Reader (ReaderT, runReaderT, asks)
 import Control.Monad.State (get, put)
-import Data.Attoparsec.ByteString.Char8 (choice, count, parseOnly, sepBy)
-import qualified Data.Attoparsec.ByteString.Char8 as Atto (Parser)
-import Data.ByteString (ByteString, pack)
-import Data.Map (Map, (!))
-import qualified Data.Map as Map (empty, fromList, lookup)
-import Data.Text (Text, snoc)
-import qualified Data.Text as Text (init, last, null, unpack)
-import Data.Text.Encoding (decodeUtf16BE, encodeUtf8)
+import Data.Attoparsec.ByteString.Char8 (choice, count, sepBy)
+import Data.ByteString (ByteString)
+import qualified Data.ByteString.Char8 as Char8 (unpack)
+import Data.Map ((!))
+import qualified Data.Map as Map (lookup)
+import PDF.CMap (CMappers, CMap, emptyCMap)
 import PDF.Object (
-      DirectObject(..), Name, StringObject(..)
-    , array, blank, directObject, integer, line, name, regular, stringObject
+      DirectObject(..), StringObject(..)
+    , array, blank, name, parseBytes, regular, stringObject
  )
-import qualified PDF.EOL as EOL (charset, parser)
 import PDF.Parser (MonadParser, Parser, evalParser, string, takeAll)

-type CMappers = Map Name CMap
-type CMap = Map Int Text
-
-emptyCMap :: CMap
-emptyCMap = Map.empty
-
-cMap :: ByteString -> Either String CMap
-cMap = parseOnly $ mconcat <$> many (cMapRange <|> cMapChar <|> ignoredLine)
-  where
-    ignoredLine =
-      takeAll (not . (`elem` EOL.charset)) *> EOL.parser *> return Map.empty
-
-cMapRange :: Atto.Parser CMap
-cMapRange = do
-  size <- integer <* line "beginbfrange"
-  mconcat <$> count size (Map.fromList <$> rangeMapping) <* line "endbfrange"
-  where
-    rangeMapping = (,,) 
-      <$> (stringObject <* blank)
-      <*> (stringObject <* blank)
-      <*> directObject <* EOL.parser
-      >>= mapFromTo
-
-cMapChar :: Atto.Parser CMap
-cMapChar = do
-  size <- integer <* line "beginbfchar"
-  Map.fromList <$> count size charMapping <* line "endbfchar"
-  where
-    charMapping =
-      (,) <$> (stringObject <* blank) <*> (stringObject <* blank) <* EOL.parser
-      >>= pairMapping
-
-mapFromTo :: (StringObject, StringObject, DirectObject) -> Atto.Parser [(Int, Text)]
-mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) =
-  let dstString = parseText dstFrom in
-  return $ zip [hexString from .. hexString to] (textsFrom dstString)
-  where
-    textsFrom t
-      | Text.null t = [t]
-      | otherwise = (Text.init t `snoc`) <$> [Text.last t ..]
-
-mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) =
-  zip [hexString from .. hexString to] <$> (mapM dstString dstPoints)
-  where
-    dstString (StringObject (Hexadecimal dstPoint)) = return $ parseText dstPoint
-    dstString _ = fail "Invalid for a replacement string"
-
-mapFromTo _ = fail "invalid range mapping found"
-
-pairMapping :: (StringObject, StringObject) -> Atto.Parser (Int, Text)
-pairMapping (Hexadecimal from, Hexadecimal to) =
-  return (hexString from, parseText to)
-pairMapping _ = fail "invalid pair mapping found"
-
-hexString :: (Num a, Read a) => String -> a
-hexString s = read $ "0x" ++ s
-
-pairDigits :: String -> [String]
-pairDigits "" = []
-pairDigits [c] = [[c]]
-pairDigits (firstChar:secondChar:end) = (firstChar:[secondChar]):pairDigits end
-
-parseText :: String -> Text
-parseText = decodeUtf16BE . pack . fmap hexString . pairDigits
-
 data StateOperator =
  Cm | W | J | J_ | M | D | Ri | I | Gs -- general graphic state
  deriving (Bounded, Enum)
@ -190,7 +118,7 @@ runOperator (DQuote, [Typed (StringObject outputString)]) =
 runOperator _ = return []

 decodeString :: StringObject -> ParserWithFont ByteString
-decodeString (Hexadecimal h) = decodeString (Literal (Text.unpack $ parseText h))
+decodeString (Hexadecimal h) = decodeString (Literal (Char8.unpack $ parseBytes h))
 decodeString (Literal litString) = get >>= convertBytes litString
  where
    convertBytes :: String -> CMap -> ParserWithFont ByteString
@ -204,4 +132,4 @@ decodeString (Literal litString) = get >>= convertBytes litString
        case (Map.lookup code someCMap, s) of
          (Nothing, (c:cs)) -> convertBytesAux (code * 256 + fromEnum c) (size + 1) cs someCMap
          (Nothing, []) -> fail "No character left to read but no code recognized"
-          (Just outputText, _) -> mappend (encodeUtf8 outputText) <$> convertBytes s someCMap
+          (Just outputText, _) -> mappend outputText <$> convertBytes s someCMap