Separate CMap and Text in two distinct modules
This commit is contained in:
parent
0374b72920
commit
3b59fd0c61
5 changed files with 117 additions and 101 deletions
|
@ -17,6 +17,7 @@ cabal-version: >=1.10
|
||||||
|
|
||||||
library
|
library
|
||||||
exposed-modules: PDF
|
exposed-modules: PDF
|
||||||
|
, PDF.CMap
|
||||||
, PDF.EOL
|
, PDF.EOL
|
||||||
, PDF.Object
|
, PDF.Object
|
||||||
, PDF.Output
|
, PDF.Output
|
||||||
|
|
|
@ -11,12 +11,13 @@ import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict)
|
||||||
import Data.Map (Map, (!))
|
import Data.Map (Map, (!))
|
||||||
import qualified Data.Map as Map (empty, insert, lookup, toList)
|
import qualified Data.Map as Map (empty, insert, lookup, toList)
|
||||||
import PDF (Document(..), parseDocument)
|
import PDF (Document(..), parseDocument)
|
||||||
|
import PDF.CMap (CMap, CMappers, cMap, emptyCMap)
|
||||||
import PDF.Object (
|
import PDF.Object (
|
||||||
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
|
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
|
||||||
, Object(..), Name(..), Structure(..)
|
, Object(..), Name(..), Structure(..)
|
||||||
,)
|
,)
|
||||||
import PDF.Output (ObjectId)
|
import PDF.Output (ObjectId)
|
||||||
import PDF.Text (CMap, CMappers, PageContents(..), cMap, emptyCMap, pageContents)
|
import PDF.Text (PageContents(..), pageContents)
|
||||||
import PDF.Update (unify)
|
import PDF.Update (unify)
|
||||||
import System.Environment (getArgs)
|
import System.Environment (getArgs)
|
||||||
import System.IO (hPutStrLn, stderr)
|
import System.IO (hPutStrLn, stderr)
|
||||||
|
|
74
src/PDF/CMap.hs
Normal file
74
src/PDF/CMap.hs
Normal file
|
@ -0,0 +1,74 @@
|
||||||
|
module PDF.CMap (
|
||||||
|
CMap
|
||||||
|
, CMappers
|
||||||
|
, cMap
|
||||||
|
, emptyCMap
|
||||||
|
) where
|
||||||
|
|
||||||
|
import Control.Applicative ((<|>), many)
|
||||||
|
import Data.Attoparsec.ByteString.Char8 (count, parseOnly)
|
||||||
|
import qualified Data.Attoparsec.ByteString.Char8 as Atto (Parser)
|
||||||
|
import Data.ByteString (ByteString, snoc)
|
||||||
|
import qualified Data.ByteString as BS (init, last, null)
|
||||||
|
import Data.Map (Map)
|
||||||
|
import qualified Data.Map as Map (empty, fromList)
|
||||||
|
import qualified PDF.EOL as EOL (charset, parser)
|
||||||
|
import PDF.Object (
|
||||||
|
DirectObject(..), Name, StringObject(..)
|
||||||
|
, blank, directObject, hexString, integer, line, parseBytes, stringObject
|
||||||
|
)
|
||||||
|
import PDF.Parser (takeAll)
|
||||||
|
|
||||||
|
type CMappers = Map Name CMap
|
||||||
|
type CMap = Map Int ByteString
|
||||||
|
|
||||||
|
emptyCMap :: CMap
|
||||||
|
emptyCMap = Map.empty
|
||||||
|
|
||||||
|
cMap :: ByteString -> Either String CMap
|
||||||
|
cMap = parseOnly $ mconcat <$> many (cMapRange <|> cMapChar <|> ignoredLine)
|
||||||
|
where
|
||||||
|
ignoredLine =
|
||||||
|
takeAll (not . (`elem` EOL.charset)) *> EOL.parser *> return Map.empty
|
||||||
|
|
||||||
|
cMapRange :: Atto.Parser CMap
|
||||||
|
cMapRange = do
|
||||||
|
size <- integer <* line "beginbfrange"
|
||||||
|
mconcat <$> count size (Map.fromList <$> rangeMapping) <* line "endbfrange"
|
||||||
|
where
|
||||||
|
rangeMapping = (,,)
|
||||||
|
<$> (stringObject <* blank)
|
||||||
|
<*> (stringObject <* blank)
|
||||||
|
<*> directObject <* EOL.parser
|
||||||
|
>>= mapFromTo
|
||||||
|
|
||||||
|
cMapChar :: Atto.Parser CMap
|
||||||
|
cMapChar = do
|
||||||
|
size <- integer <* line "beginbfchar"
|
||||||
|
Map.fromList <$> count size charMapping <* line "endbfchar"
|
||||||
|
where
|
||||||
|
charMapping =
|
||||||
|
(,) <$> (stringObject <* blank) <*> (stringObject <* blank) <* EOL.parser
|
||||||
|
>>= pairMapping
|
||||||
|
|
||||||
|
mapFromTo :: (StringObject, StringObject, DirectObject) -> Atto.Parser [(Int, ByteString)]
|
||||||
|
mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) =
|
||||||
|
let dstString = parseBytes dstFrom in
|
||||||
|
return $ zip [hexString from .. hexString to] (textsFrom dstString)
|
||||||
|
where
|
||||||
|
textsFrom t
|
||||||
|
| BS.null t = [t]
|
||||||
|
| otherwise = (BS.init t `snoc`) <$> [BS.last t ..]
|
||||||
|
|
||||||
|
mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) =
|
||||||
|
zip [hexString from .. hexString to] <$> (mapM dstString dstPoints)
|
||||||
|
where
|
||||||
|
dstString (StringObject (Hexadecimal dstPoint)) = return $ parseBytes dstPoint
|
||||||
|
dstString _ = fail "Invalid for a replacement string"
|
||||||
|
|
||||||
|
mapFromTo _ = fail "invalid range mapping found"
|
||||||
|
|
||||||
|
pairMapping :: (StringObject, StringObject) -> Atto.Parser (Int, ByteString)
|
||||||
|
pairMapping (Hexadecimal from, Hexadecimal to) =
|
||||||
|
return (hexString from, parseBytes to)
|
||||||
|
pairMapping _ = fail "invalid pair mapping found"
|
|
@ -22,11 +22,13 @@ module PDF.Object (
|
||||||
, dictionary
|
, dictionary
|
||||||
, directObject
|
, directObject
|
||||||
, eofMarker
|
, eofMarker
|
||||||
|
, hexString
|
||||||
, integer
|
, integer
|
||||||
, line
|
, line
|
||||||
, magicNumber
|
, magicNumber
|
||||||
, name
|
, name
|
||||||
, number
|
, number
|
||||||
|
, parseBytes
|
||||||
, regular
|
, regular
|
||||||
, stringObject
|
, stringObject
|
||||||
, structure
|
, structure
|
||||||
|
@ -34,14 +36,14 @@ module PDF.Object (
|
||||||
|
|
||||||
import Control.Applicative ((<|>), many)
|
import Control.Applicative ((<|>), many)
|
||||||
import Data.Attoparsec.ByteString.Char8 (choice, count, option, sepBy)
|
import Data.Attoparsec.ByteString.Char8 (choice, count, option, sepBy)
|
||||||
import Data.ByteString.Char8 (ByteString)
|
import Data.ByteString (ByteString)
|
||||||
import qualified Data.ByteString.Char8 as BS (
|
import qualified Data.ByteString as BS (concat, pack)
|
||||||
concat, cons, pack, singleton, unpack
|
import qualified Data.ByteString.Char8 as Char8 (cons, pack, singleton, unpack)
|
||||||
)
|
|
||||||
import Data.Map (Map, (!), mapWithKey)
|
import Data.Map (Map, (!), mapWithKey)
|
||||||
import qualified Data.Map as Map (
|
import qualified Data.Map as Map (
|
||||||
delete, empty, fromList, lookup, minViewWithKey, toList, union
|
delete, empty, fromList, lookup, minViewWithKey, toList, union
|
||||||
)
|
)
|
||||||
|
import Data.Text.Encoding (decodeUtf16BE, encodeUtf8)
|
||||||
import qualified PDF.EOL as EOL (charset, parser)
|
import qualified PDF.EOL as EOL (charset, parser)
|
||||||
import qualified PDF.Output as Output (concat, line, string)
|
import qualified PDF.Output as Output (concat, line, string)
|
||||||
import PDF.Output (
|
import PDF.Output (
|
||||||
|
@ -53,7 +55,7 @@ import PDF.Parser (MonadParser(..), Parser, (<?>), octDigit, oneOf)
|
||||||
import Text.Printf (printf)
|
import Text.Printf (printf)
|
||||||
|
|
||||||
line :: MonadParser m => String -> m ()
|
line :: MonadParser m => String -> m ()
|
||||||
line l = (string (BS.pack l) *> EOL.parser *> return ()) <?> printf "line «%s»" l
|
line l = (string (Char8.pack l) *> EOL.parser *> return ()) <?> printf "line «%s»" l
|
||||||
|
|
||||||
magicNumber :: ByteString
|
magicNumber :: ByteString
|
||||||
magicNumber = "%PDF-"
|
magicNumber = "%PDF-"
|
||||||
|
@ -74,7 +76,7 @@ regular :: Char -> Bool
|
||||||
regular = not . (`elem` (EOL.charset ++ whiteSpaceCharset ++ delimiterCharset))
|
regular = not . (`elem` (EOL.charset ++ whiteSpaceCharset ++ delimiterCharset))
|
||||||
|
|
||||||
integer :: (Read a, Num a, MonadParser m) => m a
|
integer :: (Read a, Num a, MonadParser m) => m a
|
||||||
integer = read . BS.unpack <$> decNumber <* blank <?> "decimal integer"
|
integer = read . Char8.unpack <$> decNumber <* blank <?> "decimal integer"
|
||||||
|
|
||||||
-------------------------------------
|
-------------------------------------
|
||||||
-- OBJECTS
|
-- OBJECTS
|
||||||
|
@ -101,13 +103,13 @@ instance Output Number where
|
||||||
_ -> printf "%f" f
|
_ -> printf "%f" f
|
||||||
|
|
||||||
number :: MonadParser m => m Number
|
number :: MonadParser m => m Number
|
||||||
number = Number . read . BS.unpack <$>
|
number = Number . read . Char8.unpack <$>
|
||||||
(mappend <$> sign <*> (integerPart <|> BS.cons '0' <$> floatPart))
|
(mappend <$> sign <*> (integerPart <|> Char8.cons '0' <$> floatPart))
|
||||||
<?> "number"
|
<?> "number"
|
||||||
where
|
where
|
||||||
sign = string "-" <|> option "" (char '+' >> return "")
|
sign = string "-" <|> option "" (char '+' >> return "")
|
||||||
integerPart = mappend <$> decNumber <*> option "" floatPart
|
integerPart = mappend <$> decNumber <*> option "" floatPart
|
||||||
floatPart = BS.cons <$> char '.' <*> (option "0" $ decNumber)
|
floatPart = Char8.cons <$> char '.' <*> (option "0" $ decNumber)
|
||||||
|
|
||||||
--
|
--
|
||||||
-- StringObject
|
-- StringObject
|
||||||
|
@ -120,18 +122,28 @@ instance Output StringObject where
|
||||||
|
|
||||||
stringObject :: MonadParser m => m StringObject
|
stringObject :: MonadParser m => m StringObject
|
||||||
stringObject =
|
stringObject =
|
||||||
Literal . BS.unpack <$> (char '(' *> (BS.concat <$> literalString) <* char ')')
|
Literal . Char8.unpack <$> (char '(' *> (BS.concat <$> literalString) <* char ')')
|
||||||
<|> Hexadecimal . BS.unpack <$> (char '<' *> hexNumber <* char '>')
|
<|> Hexadecimal . Char8.unpack <$> (char '<' *> hexNumber <* char '>')
|
||||||
<?> "string object (literal or hexadecimal)"
|
<?> "string object (literal or hexadecimal)"
|
||||||
where
|
where
|
||||||
literalString = many literalStringBlock
|
literalString = many literalStringBlock
|
||||||
literalStringBlock = takeAll1 normalChar <|> matchingParenthesis <|> escapedChar
|
literalStringBlock = takeAll1 normalChar <|> matchingParenthesis <|> escapedChar
|
||||||
normalChar = not . (`elem` ("\\()" :: String))
|
normalChar = not . (`elem` ("\\()" :: String))
|
||||||
matchingParenthesis =
|
matchingParenthesis =
|
||||||
mappend <$> (BS.cons <$> char '(' <*> literalStringBlock) <*> string ")"
|
mappend <$> (Char8.cons <$> char '(' <*> literalStringBlock) <*> string ")"
|
||||||
escapedChar =
|
escapedChar =
|
||||||
BS.cons <$> char '\\' <*> (BS.singleton <$> oneOf "nrtbf()\\" <|> octalCode)
|
Char8.cons <$> char '\\' <*> (Char8.singleton <$> oneOf "nrtbf()\\" <|> octalCode)
|
||||||
octalCode = choice $ (\n -> BS.pack <$> count n octDigit) <$> [1..3]
|
octalCode = choice $ (\n -> Char8.pack <$> count n octDigit) <$> [1..3]
|
||||||
|
|
||||||
|
hexString :: (Num a, Read a) => String -> a
|
||||||
|
hexString s = read $ "0x" ++ s
|
||||||
|
|
||||||
|
parseBytes :: String -> ByteString
|
||||||
|
parseBytes = encodeUtf8 . decodeUtf16BE . BS.pack . fmap hexString . pairDigits
|
||||||
|
where
|
||||||
|
pairDigits "" = []
|
||||||
|
pairDigits [c] = [[c]]
|
||||||
|
pairDigits (firstChar:secondChar:end) = (firstChar:[secondChar]):pairDigits end
|
||||||
|
|
||||||
--
|
--
|
||||||
-- Name
|
-- Name
|
||||||
|
@ -142,7 +154,7 @@ instance Output Name where
|
||||||
output (Name n) = Output.string ('/':n)
|
output (Name n) = Output.string ('/':n)
|
||||||
|
|
||||||
name :: MonadParser m => m Name
|
name :: MonadParser m => m Name
|
||||||
name = Name . BS.unpack <$> (char '/' *> takeAll regular) <?> "name"
|
name = Name . Char8.unpack <$> (char '/' *> takeAll regular) <?> "name"
|
||||||
|
|
||||||
--
|
--
|
||||||
-- Array
|
-- Array
|
||||||
|
|
|
@ -1,97 +1,25 @@
|
||||||
{-# LANGUAGE OverloadedStrings #-}
|
{-# LANGUAGE OverloadedStrings #-}
|
||||||
module PDF.Text {
|
module PDF.Text (
|
||||||
CMap
|
PageContents(..)
|
||||||
, CMappers
|
|
||||||
, PageContents(..)
|
|
||||||
, cMap
|
|
||||||
, emptyCMap
|
|
||||||
, pageContents
|
, pageContents
|
||||||
) where
|
) where
|
||||||
|
|
||||||
import Control.Applicative ((<|>), many)
|
import Control.Applicative ((<|>))
|
||||||
import Control.Monad (foldM, join)
|
import Control.Monad (foldM, join)
|
||||||
import Control.Monad.Reader (ReaderT, runReaderT, asks)
|
import Control.Monad.Reader (ReaderT, runReaderT, asks)
|
||||||
import Control.Monad.State (get, put)
|
import Control.Monad.State (get, put)
|
||||||
import Data.Attoparsec.ByteString.Char8 (choice, count, parseOnly, sepBy)
|
import Data.Attoparsec.ByteString.Char8 (choice, count, sepBy)
|
||||||
import qualified Data.Attoparsec.ByteString.Char8 as Atto (Parser)
|
import Data.ByteString (ByteString)
|
||||||
import Data.ByteString (ByteString, pack)
|
import qualified Data.ByteString.Char8 as Char8 (unpack)
|
||||||
import Data.Map (Map, (!))
|
import Data.Map ((!))
|
||||||
import qualified Data.Map as Map (empty, fromList, lookup)
|
import qualified Data.Map as Map (lookup)
|
||||||
import Data.Text (Text, snoc)
|
import PDF.CMap (CMappers, CMap, emptyCMap)
|
||||||
import qualified Data.Text as Text (init, last, null, unpack)
|
|
||||||
import Data.Text.Encoding (decodeUtf16BE, encodeUtf8)
|
|
||||||
import PDF.Object (
|
import PDF.Object (
|
||||||
DirectObject(..), Name, StringObject(..)
|
DirectObject(..), StringObject(..)
|
||||||
, array, blank, directObject, integer, line, name, regular, stringObject
|
, array, blank, name, parseBytes, regular, stringObject
|
||||||
)
|
)
|
||||||
import qualified PDF.EOL as EOL (charset, parser)
|
|
||||||
import PDF.Parser (MonadParser, Parser, evalParser, string, takeAll)
|
import PDF.Parser (MonadParser, Parser, evalParser, string, takeAll)
|
||||||
|
|
||||||
type CMappers = Map Name CMap
|
|
||||||
type CMap = Map Int Text
|
|
||||||
|
|
||||||
emptyCMap :: CMap
|
|
||||||
emptyCMap = Map.empty
|
|
||||||
|
|
||||||
cMap :: ByteString -> Either String CMap
|
|
||||||
cMap = parseOnly $ mconcat <$> many (cMapRange <|> cMapChar <|> ignoredLine)
|
|
||||||
where
|
|
||||||
ignoredLine =
|
|
||||||
takeAll (not . (`elem` EOL.charset)) *> EOL.parser *> return Map.empty
|
|
||||||
|
|
||||||
cMapRange :: Atto.Parser CMap
|
|
||||||
cMapRange = do
|
|
||||||
size <- integer <* line "beginbfrange"
|
|
||||||
mconcat <$> count size (Map.fromList <$> rangeMapping) <* line "endbfrange"
|
|
||||||
where
|
|
||||||
rangeMapping = (,,)
|
|
||||||
<$> (stringObject <* blank)
|
|
||||||
<*> (stringObject <* blank)
|
|
||||||
<*> directObject <* EOL.parser
|
|
||||||
>>= mapFromTo
|
|
||||||
|
|
||||||
cMapChar :: Atto.Parser CMap
|
|
||||||
cMapChar = do
|
|
||||||
size <- integer <* line "beginbfchar"
|
|
||||||
Map.fromList <$> count size charMapping <* line "endbfchar"
|
|
||||||
where
|
|
||||||
charMapping =
|
|
||||||
(,) <$> (stringObject <* blank) <*> (stringObject <* blank) <* EOL.parser
|
|
||||||
>>= pairMapping
|
|
||||||
|
|
||||||
mapFromTo :: (StringObject, StringObject, DirectObject) -> Atto.Parser [(Int, Text)]
|
|
||||||
mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) =
|
|
||||||
let dstString = parseText dstFrom in
|
|
||||||
return $ zip [hexString from .. hexString to] (textsFrom dstString)
|
|
||||||
where
|
|
||||||
textsFrom t
|
|
||||||
| Text.null t = [t]
|
|
||||||
| otherwise = (Text.init t `snoc`) <$> [Text.last t ..]
|
|
||||||
|
|
||||||
mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) =
|
|
||||||
zip [hexString from .. hexString to] <$> (mapM dstString dstPoints)
|
|
||||||
where
|
|
||||||
dstString (StringObject (Hexadecimal dstPoint)) = return $ parseText dstPoint
|
|
||||||
dstString _ = fail "Invalid for a replacement string"
|
|
||||||
|
|
||||||
mapFromTo _ = fail "invalid range mapping found"
|
|
||||||
|
|
||||||
pairMapping :: (StringObject, StringObject) -> Atto.Parser (Int, Text)
|
|
||||||
pairMapping (Hexadecimal from, Hexadecimal to) =
|
|
||||||
return (hexString from, parseText to)
|
|
||||||
pairMapping _ = fail "invalid pair mapping found"
|
|
||||||
|
|
||||||
hexString :: (Num a, Read a) => String -> a
|
|
||||||
hexString s = read $ "0x" ++ s
|
|
||||||
|
|
||||||
pairDigits :: String -> [String]
|
|
||||||
pairDigits "" = []
|
|
||||||
pairDigits [c] = [[c]]
|
|
||||||
pairDigits (firstChar:secondChar:end) = (firstChar:[secondChar]):pairDigits end
|
|
||||||
|
|
||||||
parseText :: String -> Text
|
|
||||||
parseText = decodeUtf16BE . pack . fmap hexString . pairDigits
|
|
||||||
|
|
||||||
data StateOperator =
|
data StateOperator =
|
||||||
Cm | W | J | J_ | M | D | Ri | I | Gs -- general graphic state
|
Cm | W | J | J_ | M | D | Ri | I | Gs -- general graphic state
|
||||||
deriving (Bounded, Enum)
|
deriving (Bounded, Enum)
|
||||||
|
@ -190,7 +118,7 @@ runOperator (DQuote, [Typed (StringObject outputString)]) =
|
||||||
runOperator _ = return []
|
runOperator _ = return []
|
||||||
|
|
||||||
decodeString :: StringObject -> ParserWithFont ByteString
|
decodeString :: StringObject -> ParserWithFont ByteString
|
||||||
decodeString (Hexadecimal h) = decodeString (Literal (Text.unpack $ parseText h))
|
decodeString (Hexadecimal h) = decodeString (Literal (Char8.unpack $ parseBytes h))
|
||||||
decodeString (Literal litString) = get >>= convertBytes litString
|
decodeString (Literal litString) = get >>= convertBytes litString
|
||||||
where
|
where
|
||||||
convertBytes :: String -> CMap -> ParserWithFont ByteString
|
convertBytes :: String -> CMap -> ParserWithFont ByteString
|
||||||
|
@ -204,4 +132,4 @@ decodeString (Literal litString) = get >>= convertBytes litString
|
||||||
case (Map.lookup code someCMap, s) of
|
case (Map.lookup code someCMap, s) of
|
||||||
(Nothing, (c:cs)) -> convertBytesAux (code * 256 + fromEnum c) (size + 1) cs someCMap
|
(Nothing, (c:cs)) -> convertBytesAux (code * 256 + fromEnum c) (size + 1) cs someCMap
|
||||||
(Nothing, []) -> fail "No character left to read but no code recognized"
|
(Nothing, []) -> fail "No character left to read but no code recognized"
|
||||||
(Just outputText, _) -> mappend (encodeUtf8 outputText) <$> convertBytes s someCMap
|
(Just outputText, _) -> mappend outputText <$> convertBytes s someCMap
|
||||||
|
|
Loading…
Reference in a new issue