Separate CMap and Text in two distinct modules

This commit is contained in:
Tissevert 2019-09-27 18:16:12 +02:00
parent 0374b72920
commit 3b59fd0c61
5 changed files with 117 additions and 101 deletions

View file

@ -17,6 +17,7 @@ cabal-version: >=1.10
library library
exposed-modules: PDF exposed-modules: PDF
, PDF.CMap
, PDF.EOL , PDF.EOL
, PDF.Object , PDF.Object
, PDF.Output , PDF.Output

View file

@ -11,12 +11,13 @@ import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict)
import Data.Map (Map, (!)) import Data.Map (Map, (!))
import qualified Data.Map as Map (empty, insert, lookup, toList) import qualified Data.Map as Map (empty, insert, lookup, toList)
import PDF (Document(..), parseDocument) import PDF (Document(..), parseDocument)
import PDF.CMap (CMap, CMappers, cMap, emptyCMap)
import PDF.Object ( import PDF.Object (
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..) Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
, Object(..), Name(..), Structure(..) , Object(..), Name(..), Structure(..)
,) ,)
import PDF.Output (ObjectId) import PDF.Output (ObjectId)
import PDF.Text (CMap, CMappers, PageContents(..), cMap, emptyCMap, pageContents) import PDF.Text (PageContents(..), pageContents)
import PDF.Update (unify) import PDF.Update (unify)
import System.Environment (getArgs) import System.Environment (getArgs)
import System.IO (hPutStrLn, stderr) import System.IO (hPutStrLn, stderr)

74
src/PDF/CMap.hs Normal file
View file

@ -0,0 +1,74 @@
module PDF.CMap (
CMap
, CMappers
, cMap
, emptyCMap
) where
import Control.Applicative ((<|>), many)
import Data.Attoparsec.ByteString.Char8 (count, parseOnly)
import qualified Data.Attoparsec.ByteString.Char8 as Atto (Parser)
import Data.ByteString (ByteString, snoc)
import qualified Data.ByteString as BS (init, last, null)
import Data.Map (Map)
import qualified Data.Map as Map (empty, fromList)
import qualified PDF.EOL as EOL (charset, parser)
import PDF.Object (
DirectObject(..), Name, StringObject(..)
, blank, directObject, hexString, integer, line, parseBytes, stringObject
)
import PDF.Parser (takeAll)
type CMappers = Map Name CMap
type CMap = Map Int ByteString
emptyCMap :: CMap
emptyCMap = Map.empty
cMap :: ByteString -> Either String CMap
cMap = parseOnly $ mconcat <$> many (cMapRange <|> cMapChar <|> ignoredLine)
where
ignoredLine =
takeAll (not . (`elem` EOL.charset)) *> EOL.parser *> return Map.empty
cMapRange :: Atto.Parser CMap
cMapRange = do
size <- integer <* line "beginbfrange"
mconcat <$> count size (Map.fromList <$> rangeMapping) <* line "endbfrange"
where
rangeMapping = (,,)
<$> (stringObject <* blank)
<*> (stringObject <* blank)
<*> directObject <* EOL.parser
>>= mapFromTo
cMapChar :: Atto.Parser CMap
cMapChar = do
size <- integer <* line "beginbfchar"
Map.fromList <$> count size charMapping <* line "endbfchar"
where
charMapping =
(,) <$> (stringObject <* blank) <*> (stringObject <* blank) <* EOL.parser
>>= pairMapping
mapFromTo :: (StringObject, StringObject, DirectObject) -> Atto.Parser [(Int, ByteString)]
mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) =
let dstString = parseBytes dstFrom in
return $ zip [hexString from .. hexString to] (textsFrom dstString)
where
textsFrom t
| BS.null t = [t]
| otherwise = (BS.init t `snoc`) <$> [BS.last t ..]
mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) =
zip [hexString from .. hexString to] <$> (mapM dstString dstPoints)
where
dstString (StringObject (Hexadecimal dstPoint)) = return $ parseBytes dstPoint
dstString _ = fail "Invalid for a replacement string"
mapFromTo _ = fail "invalid range mapping found"
pairMapping :: (StringObject, StringObject) -> Atto.Parser (Int, ByteString)
pairMapping (Hexadecimal from, Hexadecimal to) =
return (hexString from, parseBytes to)
pairMapping _ = fail "invalid pair mapping found"

View file

@ -22,11 +22,13 @@ module PDF.Object (
, dictionary , dictionary
, directObject , directObject
, eofMarker , eofMarker
, hexString
, integer , integer
, line , line
, magicNumber , magicNumber
, name , name
, number , number
, parseBytes
, regular , regular
, stringObject , stringObject
, structure , structure
@ -34,14 +36,14 @@ module PDF.Object (
import Control.Applicative ((<|>), many) import Control.Applicative ((<|>), many)
import Data.Attoparsec.ByteString.Char8 (choice, count, option, sepBy) import Data.Attoparsec.ByteString.Char8 (choice, count, option, sepBy)
import Data.ByteString.Char8 (ByteString) import Data.ByteString (ByteString)
import qualified Data.ByteString.Char8 as BS ( import qualified Data.ByteString as BS (concat, pack)
concat, cons, pack, singleton, unpack import qualified Data.ByteString.Char8 as Char8 (cons, pack, singleton, unpack)
)
import Data.Map (Map, (!), mapWithKey) import Data.Map (Map, (!), mapWithKey)
import qualified Data.Map as Map ( import qualified Data.Map as Map (
delete, empty, fromList, lookup, minViewWithKey, toList, union delete, empty, fromList, lookup, minViewWithKey, toList, union
) )
import Data.Text.Encoding (decodeUtf16BE, encodeUtf8)
import qualified PDF.EOL as EOL (charset, parser) import qualified PDF.EOL as EOL (charset, parser)
import qualified PDF.Output as Output (concat, line, string) import qualified PDF.Output as Output (concat, line, string)
import PDF.Output ( import PDF.Output (
@ -53,7 +55,7 @@ import PDF.Parser (MonadParser(..), Parser, (<?>), octDigit, oneOf)
import Text.Printf (printf) import Text.Printf (printf)
line :: MonadParser m => String -> m () line :: MonadParser m => String -> m ()
line l = (string (BS.pack l) *> EOL.parser *> return ()) <?> printf "line «%s»" l line l = (string (Char8.pack l) *> EOL.parser *> return ()) <?> printf "line «%s»" l
magicNumber :: ByteString magicNumber :: ByteString
magicNumber = "%PDF-" magicNumber = "%PDF-"
@ -74,7 +76,7 @@ regular :: Char -> Bool
regular = not . (`elem` (EOL.charset ++ whiteSpaceCharset ++ delimiterCharset)) regular = not . (`elem` (EOL.charset ++ whiteSpaceCharset ++ delimiterCharset))
integer :: (Read a, Num a, MonadParser m) => m a integer :: (Read a, Num a, MonadParser m) => m a
integer = read . BS.unpack <$> decNumber <* blank <?> "decimal integer" integer = read . Char8.unpack <$> decNumber <* blank <?> "decimal integer"
------------------------------------- -------------------------------------
-- OBJECTS -- OBJECTS
@ -101,13 +103,13 @@ instance Output Number where
_ -> printf "%f" f _ -> printf "%f" f
number :: MonadParser m => m Number number :: MonadParser m => m Number
number = Number . read . BS.unpack <$> number = Number . read . Char8.unpack <$>
(mappend <$> sign <*> (integerPart <|> BS.cons '0' <$> floatPart)) (mappend <$> sign <*> (integerPart <|> Char8.cons '0' <$> floatPart))
<?> "number" <?> "number"
where where
sign = string "-" <|> option "" (char '+' >> return "") sign = string "-" <|> option "" (char '+' >> return "")
integerPart = mappend <$> decNumber <*> option "" floatPart integerPart = mappend <$> decNumber <*> option "" floatPart
floatPart = BS.cons <$> char '.' <*> (option "0" $ decNumber) floatPart = Char8.cons <$> char '.' <*> (option "0" $ decNumber)
-- --
-- StringObject -- StringObject
@ -120,18 +122,28 @@ instance Output StringObject where
stringObject :: MonadParser m => m StringObject stringObject :: MonadParser m => m StringObject
stringObject = stringObject =
Literal . BS.unpack <$> (char '(' *> (BS.concat <$> literalString) <* char ')') Literal . Char8.unpack <$> (char '(' *> (BS.concat <$> literalString) <* char ')')
<|> Hexadecimal . BS.unpack <$> (char '<' *> hexNumber <* char '>') <|> Hexadecimal . Char8.unpack <$> (char '<' *> hexNumber <* char '>')
<?> "string object (literal or hexadecimal)" <?> "string object (literal or hexadecimal)"
where where
literalString = many literalStringBlock literalString = many literalStringBlock
literalStringBlock = takeAll1 normalChar <|> matchingParenthesis <|> escapedChar literalStringBlock = takeAll1 normalChar <|> matchingParenthesis <|> escapedChar
normalChar = not . (`elem` ("\\()" :: String)) normalChar = not . (`elem` ("\\()" :: String))
matchingParenthesis = matchingParenthesis =
mappend <$> (BS.cons <$> char '(' <*> literalStringBlock) <*> string ")" mappend <$> (Char8.cons <$> char '(' <*> literalStringBlock) <*> string ")"
escapedChar = escapedChar =
BS.cons <$> char '\\' <*> (BS.singleton <$> oneOf "nrtbf()\\" <|> octalCode) Char8.cons <$> char '\\' <*> (Char8.singleton <$> oneOf "nrtbf()\\" <|> octalCode)
octalCode = choice $ (\n -> BS.pack <$> count n octDigit) <$> [1..3] octalCode = choice $ (\n -> Char8.pack <$> count n octDigit) <$> [1..3]
hexString :: (Num a, Read a) => String -> a
hexString s = read $ "0x" ++ s
parseBytes :: String -> ByteString
parseBytes = encodeUtf8 . decodeUtf16BE . BS.pack . fmap hexString . pairDigits
where
pairDigits "" = []
pairDigits [c] = [[c]]
pairDigits (firstChar:secondChar:end) = (firstChar:[secondChar]):pairDigits end
-- --
-- Name -- Name
@ -142,7 +154,7 @@ instance Output Name where
output (Name n) = Output.string ('/':n) output (Name n) = Output.string ('/':n)
name :: MonadParser m => m Name name :: MonadParser m => m Name
name = Name . BS.unpack <$> (char '/' *> takeAll regular) <?> "name" name = Name . Char8.unpack <$> (char '/' *> takeAll regular) <?> "name"
-- --
-- Array -- Array

View file

@ -1,97 +1,25 @@
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
module PDF.Text { module PDF.Text (
CMap PageContents(..)
, CMappers
, PageContents(..)
, cMap
, emptyCMap
, pageContents , pageContents
) where ) where
import Control.Applicative ((<|>), many) import Control.Applicative ((<|>))
import Control.Monad (foldM, join) import Control.Monad (foldM, join)
import Control.Monad.Reader (ReaderT, runReaderT, asks) import Control.Monad.Reader (ReaderT, runReaderT, asks)
import Control.Monad.State (get, put) import Control.Monad.State (get, put)
import Data.Attoparsec.ByteString.Char8 (choice, count, parseOnly, sepBy) import Data.Attoparsec.ByteString.Char8 (choice, count, sepBy)
import qualified Data.Attoparsec.ByteString.Char8 as Atto (Parser) import Data.ByteString (ByteString)
import Data.ByteString (ByteString, pack) import qualified Data.ByteString.Char8 as Char8 (unpack)
import Data.Map (Map, (!)) import Data.Map ((!))
import qualified Data.Map as Map (empty, fromList, lookup) import qualified Data.Map as Map (lookup)
import Data.Text (Text, snoc) import PDF.CMap (CMappers, CMap, emptyCMap)
import qualified Data.Text as Text (init, last, null, unpack)
import Data.Text.Encoding (decodeUtf16BE, encodeUtf8)
import PDF.Object ( import PDF.Object (
DirectObject(..), Name, StringObject(..) DirectObject(..), StringObject(..)
, array, blank, directObject, integer, line, name, regular, stringObject , array, blank, name, parseBytes, regular, stringObject
) )
import qualified PDF.EOL as EOL (charset, parser)
import PDF.Parser (MonadParser, Parser, evalParser, string, takeAll) import PDF.Parser (MonadParser, Parser, evalParser, string, takeAll)
type CMappers = Map Name CMap
type CMap = Map Int Text
emptyCMap :: CMap
emptyCMap = Map.empty
cMap :: ByteString -> Either String CMap
cMap = parseOnly $ mconcat <$> many (cMapRange <|> cMapChar <|> ignoredLine)
where
ignoredLine =
takeAll (not . (`elem` EOL.charset)) *> EOL.parser *> return Map.empty
cMapRange :: Atto.Parser CMap
cMapRange = do
size <- integer <* line "beginbfrange"
mconcat <$> count size (Map.fromList <$> rangeMapping) <* line "endbfrange"
where
rangeMapping = (,,)
<$> (stringObject <* blank)
<*> (stringObject <* blank)
<*> directObject <* EOL.parser
>>= mapFromTo
cMapChar :: Atto.Parser CMap
cMapChar = do
size <- integer <* line "beginbfchar"
Map.fromList <$> count size charMapping <* line "endbfchar"
where
charMapping =
(,) <$> (stringObject <* blank) <*> (stringObject <* blank) <* EOL.parser
>>= pairMapping
mapFromTo :: (StringObject, StringObject, DirectObject) -> Atto.Parser [(Int, Text)]
mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) =
let dstString = parseText dstFrom in
return $ zip [hexString from .. hexString to] (textsFrom dstString)
where
textsFrom t
| Text.null t = [t]
| otherwise = (Text.init t `snoc`) <$> [Text.last t ..]
mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) =
zip [hexString from .. hexString to] <$> (mapM dstString dstPoints)
where
dstString (StringObject (Hexadecimal dstPoint)) = return $ parseText dstPoint
dstString _ = fail "Invalid for a replacement string"
mapFromTo _ = fail "invalid range mapping found"
pairMapping :: (StringObject, StringObject) -> Atto.Parser (Int, Text)
pairMapping (Hexadecimal from, Hexadecimal to) =
return (hexString from, parseText to)
pairMapping _ = fail "invalid pair mapping found"
hexString :: (Num a, Read a) => String -> a
hexString s = read $ "0x" ++ s
pairDigits :: String -> [String]
pairDigits "" = []
pairDigits [c] = [[c]]
pairDigits (firstChar:secondChar:end) = (firstChar:[secondChar]):pairDigits end
parseText :: String -> Text
parseText = decodeUtf16BE . pack . fmap hexString . pairDigits
data StateOperator = data StateOperator =
Cm | W | J | J_ | M | D | Ri | I | Gs -- general graphic state Cm | W | J | J_ | M | D | Ri | I | Gs -- general graphic state
deriving (Bounded, Enum) deriving (Bounded, Enum)
@ -190,7 +118,7 @@ runOperator (DQuote, [Typed (StringObject outputString)]) =
runOperator _ = return [] runOperator _ = return []
decodeString :: StringObject -> ParserWithFont ByteString decodeString :: StringObject -> ParserWithFont ByteString
decodeString (Hexadecimal h) = decodeString (Literal (Text.unpack $ parseText h)) decodeString (Hexadecimal h) = decodeString (Literal (Char8.unpack $ parseBytes h))
decodeString (Literal litString) = get >>= convertBytes litString decodeString (Literal litString) = get >>= convertBytes litString
where where
convertBytes :: String -> CMap -> ParserWithFont ByteString convertBytes :: String -> CMap -> ParserWithFont ByteString
@ -204,4 +132,4 @@ decodeString (Literal litString) = get >>= convertBytes litString
case (Map.lookup code someCMap, s) of case (Map.lookup code someCMap, s) of
(Nothing, (c:cs)) -> convertBytesAux (code * 256 + fromEnum c) (size + 1) cs someCMap (Nothing, (c:cs)) -> convertBytesAux (code * 256 + fromEnum c) (size + 1) cs someCMap
(Nothing, []) -> fail "No character left to read but no code recognized" (Nothing, []) -> fail "No character left to read but no code recognized"
(Just outputText, _) -> mappend (encodeUtf8 outputText) <$> convertBytes s someCMap (Just outputText, _) -> mappend outputText <$> convertBytes s someCMap