Separate CMap and Text in two distinct modules

This commit is contained in:
Tissevert 2019-09-27 18:16:12 +02:00
parent 0374b72920
commit 3b59fd0c61
5 changed files with 117 additions and 101 deletions

View File

@ -17,6 +17,7 @@ cabal-version: >=1.10
library
exposed-modules: PDF
, PDF.CMap
, PDF.EOL
, PDF.Object
, PDF.Output

View File

@ -11,12 +11,13 @@ import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict)
import Data.Map (Map, (!))
import qualified Data.Map as Map (empty, insert, lookup, toList)
import PDF (Document(..), parseDocument)
import PDF.CMap (CMap, CMappers, cMap, emptyCMap)
import PDF.Object (
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
, Object(..), Name(..), Structure(..)
,)
import PDF.Output (ObjectId)
import PDF.Text (CMap, CMappers, PageContents(..), cMap, emptyCMap, pageContents)
import PDF.Text (PageContents(..), pageContents)
import PDF.Update (unify)
import System.Environment (getArgs)
import System.IO (hPutStrLn, stderr)

74
src/PDF/CMap.hs Normal file
View File

@ -0,0 +1,74 @@
module PDF.CMap (
CMap
, CMappers
, cMap
, emptyCMap
) where
import Control.Applicative ((<|>), many)
import Data.Attoparsec.ByteString.Char8 (count, parseOnly)
import qualified Data.Attoparsec.ByteString.Char8 as Atto (Parser)
import Data.ByteString (ByteString, snoc)
import qualified Data.ByteString as BS (init, last, null)
import Data.Map (Map)
import qualified Data.Map as Map (empty, fromList)
import qualified PDF.EOL as EOL (charset, parser)
import PDF.Object (
DirectObject(..), Name, StringObject(..)
, blank, directObject, hexString, integer, line, parseBytes, stringObject
)
import PDF.Parser (takeAll)
type CMappers = Map Name CMap
type CMap = Map Int ByteString
emptyCMap :: CMap
emptyCMap = Map.empty
cMap :: ByteString -> Either String CMap
cMap = parseOnly $ mconcat <$> many (cMapRange <|> cMapChar <|> ignoredLine)
where
ignoredLine =
takeAll (not . (`elem` EOL.charset)) *> EOL.parser *> return Map.empty
cMapRange :: Atto.Parser CMap
cMapRange = do
size <- integer <* line "beginbfrange"
mconcat <$> count size (Map.fromList <$> rangeMapping) <* line "endbfrange"
where
rangeMapping = (,,)
<$> (stringObject <* blank)
<*> (stringObject <* blank)
<*> directObject <* EOL.parser
>>= mapFromTo
cMapChar :: Atto.Parser CMap
cMapChar = do
size <- integer <* line "beginbfchar"
Map.fromList <$> count size charMapping <* line "endbfchar"
where
charMapping =
(,) <$> (stringObject <* blank) <*> (stringObject <* blank) <* EOL.parser
>>= pairMapping
mapFromTo :: (StringObject, StringObject, DirectObject) -> Atto.Parser [(Int, ByteString)]
mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) =
let dstString = parseBytes dstFrom in
return $ zip [hexString from .. hexString to] (textsFrom dstString)
where
textsFrom t
| BS.null t = [t]
| otherwise = (BS.init t `snoc`) <$> [BS.last t ..]
mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) =
zip [hexString from .. hexString to] <$> (mapM dstString dstPoints)
where
dstString (StringObject (Hexadecimal dstPoint)) = return $ parseBytes dstPoint
dstString _ = fail "Invalid for a replacement string"
mapFromTo _ = fail "invalid range mapping found"
pairMapping :: (StringObject, StringObject) -> Atto.Parser (Int, ByteString)
pairMapping (Hexadecimal from, Hexadecimal to) =
return (hexString from, parseBytes to)
pairMapping _ = fail "invalid pair mapping found"

View File

@ -22,11 +22,13 @@ module PDF.Object (
, dictionary
, directObject
, eofMarker
, hexString
, integer
, line
, magicNumber
, name
, number
, parseBytes
, regular
, stringObject
, structure
@ -34,14 +36,14 @@ module PDF.Object (
import Control.Applicative ((<|>), many)
import Data.Attoparsec.ByteString.Char8 (choice, count, option, sepBy)
import Data.ByteString.Char8 (ByteString)
import qualified Data.ByteString.Char8 as BS (
concat, cons, pack, singleton, unpack
)
import Data.ByteString (ByteString)
import qualified Data.ByteString as BS (concat, pack)
import qualified Data.ByteString.Char8 as Char8 (cons, pack, singleton, unpack)
import Data.Map (Map, (!), mapWithKey)
import qualified Data.Map as Map (
delete, empty, fromList, lookup, minViewWithKey, toList, union
)
import Data.Text.Encoding (decodeUtf16BE, encodeUtf8)
import qualified PDF.EOL as EOL (charset, parser)
import qualified PDF.Output as Output (concat, line, string)
import PDF.Output (
@ -53,7 +55,7 @@ import PDF.Parser (MonadParser(..), Parser, (<?>), octDigit, oneOf)
import Text.Printf (printf)
line :: MonadParser m => String -> m ()
line l = (string (BS.pack l) *> EOL.parser *> return ()) <?> printf "line «%s»" l
line l = (string (Char8.pack l) *> EOL.parser *> return ()) <?> printf "line «%s»" l
magicNumber :: ByteString
magicNumber = "%PDF-"
@ -74,7 +76,7 @@ regular :: Char -> Bool
regular = not . (`elem` (EOL.charset ++ whiteSpaceCharset ++ delimiterCharset))
integer :: (Read a, Num a, MonadParser m) => m a
integer = read . BS.unpack <$> decNumber <* blank <?> "decimal integer"
integer = read . Char8.unpack <$> decNumber <* blank <?> "decimal integer"
-------------------------------------
-- OBJECTS
@ -101,13 +103,13 @@ instance Output Number where
_ -> printf "%f" f
number :: MonadParser m => m Number
number = Number . read . BS.unpack <$>
(mappend <$> sign <*> (integerPart <|> BS.cons '0' <$> floatPart))
number = Number . read . Char8.unpack <$>
(mappend <$> sign <*> (integerPart <|> Char8.cons '0' <$> floatPart))
<?> "number"
where
sign = string "-" <|> option "" (char '+' >> return "")
integerPart = mappend <$> decNumber <*> option "" floatPart
floatPart = BS.cons <$> char '.' <*> (option "0" $ decNumber)
floatPart = Char8.cons <$> char '.' <*> (option "0" $ decNumber)
--
-- StringObject
@ -120,18 +122,28 @@ instance Output StringObject where
stringObject :: MonadParser m => m StringObject
stringObject =
Literal . BS.unpack <$> (char '(' *> (BS.concat <$> literalString) <* char ')')
<|> Hexadecimal . BS.unpack <$> (char '<' *> hexNumber <* char '>')
Literal . Char8.unpack <$> (char '(' *> (BS.concat <$> literalString) <* char ')')
<|> Hexadecimal . Char8.unpack <$> (char '<' *> hexNumber <* char '>')
<?> "string object (literal or hexadecimal)"
where
literalString = many literalStringBlock
literalStringBlock = takeAll1 normalChar <|> matchingParenthesis <|> escapedChar
normalChar = not . (`elem` ("\\()" :: String))
matchingParenthesis =
mappend <$> (BS.cons <$> char '(' <*> literalStringBlock) <*> string ")"
mappend <$> (Char8.cons <$> char '(' <*> literalStringBlock) <*> string ")"
escapedChar =
BS.cons <$> char '\\' <*> (BS.singleton <$> oneOf "nrtbf()\\" <|> octalCode)
octalCode = choice $ (\n -> BS.pack <$> count n octDigit) <$> [1..3]
Char8.cons <$> char '\\' <*> (Char8.singleton <$> oneOf "nrtbf()\\" <|> octalCode)
octalCode = choice $ (\n -> Char8.pack <$> count n octDigit) <$> [1..3]
hexString :: (Num a, Read a) => String -> a
hexString s = read $ "0x" ++ s
parseBytes :: String -> ByteString
parseBytes = encodeUtf8 . decodeUtf16BE . BS.pack . fmap hexString . pairDigits
where
pairDigits "" = []
pairDigits [c] = [[c]]
pairDigits (firstChar:secondChar:end) = (firstChar:[secondChar]):pairDigits end
--
-- Name
@ -142,7 +154,7 @@ instance Output Name where
output (Name n) = Output.string ('/':n)
name :: MonadParser m => m Name
name = Name . BS.unpack <$> (char '/' *> takeAll regular) <?> "name"
name = Name . Char8.unpack <$> (char '/' *> takeAll regular) <?> "name"
--
-- Array

View File

@ -1,97 +1,25 @@
{-# LANGUAGE OverloadedStrings #-}
module PDF.Text {
CMap
, CMappers
, PageContents(..)
, cMap
, emptyCMap
module PDF.Text (
PageContents(..)
, pageContents
) where
import Control.Applicative ((<|>), many)
import Control.Applicative ((<|>))
import Control.Monad (foldM, join)
import Control.Monad.Reader (ReaderT, runReaderT, asks)
import Control.Monad.State (get, put)
import Data.Attoparsec.ByteString.Char8 (choice, count, parseOnly, sepBy)
import qualified Data.Attoparsec.ByteString.Char8 as Atto (Parser)
import Data.ByteString (ByteString, pack)
import Data.Map (Map, (!))
import qualified Data.Map as Map (empty, fromList, lookup)
import Data.Text (Text, snoc)
import qualified Data.Text as Text (init, last, null, unpack)
import Data.Text.Encoding (decodeUtf16BE, encodeUtf8)
import Data.Attoparsec.ByteString.Char8 (choice, count, sepBy)
import Data.ByteString (ByteString)
import qualified Data.ByteString.Char8 as Char8 (unpack)
import Data.Map ((!))
import qualified Data.Map as Map (lookup)
import PDF.CMap (CMappers, CMap, emptyCMap)
import PDF.Object (
DirectObject(..), Name, StringObject(..)
, array, blank, directObject, integer, line, name, regular, stringObject
DirectObject(..), StringObject(..)
, array, blank, name, parseBytes, regular, stringObject
)
import qualified PDF.EOL as EOL (charset, parser)
import PDF.Parser (MonadParser, Parser, evalParser, string, takeAll)
type CMappers = Map Name CMap
type CMap = Map Int Text
emptyCMap :: CMap
emptyCMap = Map.empty
cMap :: ByteString -> Either String CMap
cMap = parseOnly $ mconcat <$> many (cMapRange <|> cMapChar <|> ignoredLine)
where
ignoredLine =
takeAll (not . (`elem` EOL.charset)) *> EOL.parser *> return Map.empty
cMapRange :: Atto.Parser CMap
cMapRange = do
size <- integer <* line "beginbfrange"
mconcat <$> count size (Map.fromList <$> rangeMapping) <* line "endbfrange"
where
rangeMapping = (,,)
<$> (stringObject <* blank)
<*> (stringObject <* blank)
<*> directObject <* EOL.parser
>>= mapFromTo
cMapChar :: Atto.Parser CMap
cMapChar = do
size <- integer <* line "beginbfchar"
Map.fromList <$> count size charMapping <* line "endbfchar"
where
charMapping =
(,) <$> (stringObject <* blank) <*> (stringObject <* blank) <* EOL.parser
>>= pairMapping
mapFromTo :: (StringObject, StringObject, DirectObject) -> Atto.Parser [(Int, Text)]
mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) =
let dstString = parseText dstFrom in
return $ zip [hexString from .. hexString to] (textsFrom dstString)
where
textsFrom t
| Text.null t = [t]
| otherwise = (Text.init t `snoc`) <$> [Text.last t ..]
mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) =
zip [hexString from .. hexString to] <$> (mapM dstString dstPoints)
where
dstString (StringObject (Hexadecimal dstPoint)) = return $ parseText dstPoint
dstString _ = fail "Invalid for a replacement string"
mapFromTo _ = fail "invalid range mapping found"
pairMapping :: (StringObject, StringObject) -> Atto.Parser (Int, Text)
pairMapping (Hexadecimal from, Hexadecimal to) =
return (hexString from, parseText to)
pairMapping _ = fail "invalid pair mapping found"
hexString :: (Num a, Read a) => String -> a
hexString s = read $ "0x" ++ s
pairDigits :: String -> [String]
pairDigits "" = []
pairDigits [c] = [[c]]
pairDigits (firstChar:secondChar:end) = (firstChar:[secondChar]):pairDigits end
parseText :: String -> Text
parseText = decodeUtf16BE . pack . fmap hexString . pairDigits
data StateOperator =
Cm | W | J | J_ | M | D | Ri | I | Gs -- general graphic state
deriving (Bounded, Enum)
@ -190,7 +118,7 @@ runOperator (DQuote, [Typed (StringObject outputString)]) =
runOperator _ = return []
decodeString :: StringObject -> ParserWithFont ByteString
decodeString (Hexadecimal h) = decodeString (Literal (Text.unpack $ parseText h))
decodeString (Hexadecimal h) = decodeString (Literal (Char8.unpack $ parseBytes h))
decodeString (Literal litString) = get >>= convertBytes litString
where
convertBytes :: String -> CMap -> ParserWithFont ByteString
@ -204,4 +132,4 @@ decodeString (Literal litString) = get >>= convertBytes litString
case (Map.lookup code someCMap, s) of
(Nothing, (c:cs)) -> convertBytesAux (code * 256 + fromEnum c) (size + 1) cs someCMap
(Nothing, []) -> fail "No character left to read but no code recognized"
(Just outputText, _) -> mappend (encodeUtf8 outputText) <$> convertBytes s someCMap
(Just outputText, _) -> mappend outputText <$> convertBytes s someCMap