Separate CMap and Text in two distinct modules
This commit is contained in:
parent
0374b72920
commit
3b59fd0c61
5 changed files with 117 additions and 101 deletions
|
@ -17,6 +17,7 @@ cabal-version: >=1.10
|
|||
|
||||
library
|
||||
exposed-modules: PDF
|
||||
, PDF.CMap
|
||||
, PDF.EOL
|
||||
, PDF.Object
|
||||
, PDF.Output
|
||||
|
|
|
@ -11,12 +11,13 @@ import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict)
|
|||
import Data.Map (Map, (!))
|
||||
import qualified Data.Map as Map (empty, insert, lookup, toList)
|
||||
import PDF (Document(..), parseDocument)
|
||||
import PDF.CMap (CMap, CMappers, cMap, emptyCMap)
|
||||
import PDF.Object (
|
||||
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
|
||||
, Object(..), Name(..), Structure(..)
|
||||
,)
|
||||
import PDF.Output (ObjectId)
|
||||
import PDF.Text (CMap, CMappers, PageContents(..), cMap, emptyCMap, pageContents)
|
||||
import PDF.Text (PageContents(..), pageContents)
|
||||
import PDF.Update (unify)
|
||||
import System.Environment (getArgs)
|
||||
import System.IO (hPutStrLn, stderr)
|
||||
|
|
74
src/PDF/CMap.hs
Normal file
74
src/PDF/CMap.hs
Normal file
|
@ -0,0 +1,74 @@
|
|||
module PDF.CMap (
|
||||
CMap
|
||||
, CMappers
|
||||
, cMap
|
||||
, emptyCMap
|
||||
) where
|
||||
|
||||
import Control.Applicative ((<|>), many)
|
||||
import Data.Attoparsec.ByteString.Char8 (count, parseOnly)
|
||||
import qualified Data.Attoparsec.ByteString.Char8 as Atto (Parser)
|
||||
import Data.ByteString (ByteString, snoc)
|
||||
import qualified Data.ByteString as BS (init, last, null)
|
||||
import Data.Map (Map)
|
||||
import qualified Data.Map as Map (empty, fromList)
|
||||
import qualified PDF.EOL as EOL (charset, parser)
|
||||
import PDF.Object (
|
||||
DirectObject(..), Name, StringObject(..)
|
||||
, blank, directObject, hexString, integer, line, parseBytes, stringObject
|
||||
)
|
||||
import PDF.Parser (takeAll)
|
||||
|
||||
type CMappers = Map Name CMap
|
||||
type CMap = Map Int ByteString
|
||||
|
||||
emptyCMap :: CMap
|
||||
emptyCMap = Map.empty
|
||||
|
||||
cMap :: ByteString -> Either String CMap
|
||||
cMap = parseOnly $ mconcat <$> many (cMapRange <|> cMapChar <|> ignoredLine)
|
||||
where
|
||||
ignoredLine =
|
||||
takeAll (not . (`elem` EOL.charset)) *> EOL.parser *> return Map.empty
|
||||
|
||||
cMapRange :: Atto.Parser CMap
|
||||
cMapRange = do
|
||||
size <- integer <* line "beginbfrange"
|
||||
mconcat <$> count size (Map.fromList <$> rangeMapping) <* line "endbfrange"
|
||||
where
|
||||
rangeMapping = (,,)
|
||||
<$> (stringObject <* blank)
|
||||
<*> (stringObject <* blank)
|
||||
<*> directObject <* EOL.parser
|
||||
>>= mapFromTo
|
||||
|
||||
cMapChar :: Atto.Parser CMap
|
||||
cMapChar = do
|
||||
size <- integer <* line "beginbfchar"
|
||||
Map.fromList <$> count size charMapping <* line "endbfchar"
|
||||
where
|
||||
charMapping =
|
||||
(,) <$> (stringObject <* blank) <*> (stringObject <* blank) <* EOL.parser
|
||||
>>= pairMapping
|
||||
|
||||
mapFromTo :: (StringObject, StringObject, DirectObject) -> Atto.Parser [(Int, ByteString)]
|
||||
mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) =
|
||||
let dstString = parseBytes dstFrom in
|
||||
return $ zip [hexString from .. hexString to] (textsFrom dstString)
|
||||
where
|
||||
textsFrom t
|
||||
| BS.null t = [t]
|
||||
| otherwise = (BS.init t `snoc`) <$> [BS.last t ..]
|
||||
|
||||
mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) =
|
||||
zip [hexString from .. hexString to] <$> (mapM dstString dstPoints)
|
||||
where
|
||||
dstString (StringObject (Hexadecimal dstPoint)) = return $ parseBytes dstPoint
|
||||
dstString _ = fail "Invalid for a replacement string"
|
||||
|
||||
mapFromTo _ = fail "invalid range mapping found"
|
||||
|
||||
pairMapping :: (StringObject, StringObject) -> Atto.Parser (Int, ByteString)
|
||||
pairMapping (Hexadecimal from, Hexadecimal to) =
|
||||
return (hexString from, parseBytes to)
|
||||
pairMapping _ = fail "invalid pair mapping found"
|
|
@ -22,11 +22,13 @@ module PDF.Object (
|
|||
, dictionary
|
||||
, directObject
|
||||
, eofMarker
|
||||
, hexString
|
||||
, integer
|
||||
, line
|
||||
, magicNumber
|
||||
, name
|
||||
, number
|
||||
, parseBytes
|
||||
, regular
|
||||
, stringObject
|
||||
, structure
|
||||
|
@ -34,14 +36,14 @@ module PDF.Object (
|
|||
|
||||
import Control.Applicative ((<|>), many)
|
||||
import Data.Attoparsec.ByteString.Char8 (choice, count, option, sepBy)
|
||||
import Data.ByteString.Char8 (ByteString)
|
||||
import qualified Data.ByteString.Char8 as BS (
|
||||
concat, cons, pack, singleton, unpack
|
||||
)
|
||||
import Data.ByteString (ByteString)
|
||||
import qualified Data.ByteString as BS (concat, pack)
|
||||
import qualified Data.ByteString.Char8 as Char8 (cons, pack, singleton, unpack)
|
||||
import Data.Map (Map, (!), mapWithKey)
|
||||
import qualified Data.Map as Map (
|
||||
delete, empty, fromList, lookup, minViewWithKey, toList, union
|
||||
)
|
||||
import Data.Text.Encoding (decodeUtf16BE, encodeUtf8)
|
||||
import qualified PDF.EOL as EOL (charset, parser)
|
||||
import qualified PDF.Output as Output (concat, line, string)
|
||||
import PDF.Output (
|
||||
|
@ -53,7 +55,7 @@ import PDF.Parser (MonadParser(..), Parser, (<?>), octDigit, oneOf)
|
|||
import Text.Printf (printf)
|
||||
|
||||
line :: MonadParser m => String -> m ()
|
||||
line l = (string (BS.pack l) *> EOL.parser *> return ()) <?> printf "line «%s»" l
|
||||
line l = (string (Char8.pack l) *> EOL.parser *> return ()) <?> printf "line «%s»" l
|
||||
|
||||
magicNumber :: ByteString
|
||||
magicNumber = "%PDF-"
|
||||
|
@ -74,7 +76,7 @@ regular :: Char -> Bool
|
|||
regular = not . (`elem` (EOL.charset ++ whiteSpaceCharset ++ delimiterCharset))
|
||||
|
||||
integer :: (Read a, Num a, MonadParser m) => m a
|
||||
integer = read . BS.unpack <$> decNumber <* blank <?> "decimal integer"
|
||||
integer = read . Char8.unpack <$> decNumber <* blank <?> "decimal integer"
|
||||
|
||||
-------------------------------------
|
||||
-- OBJECTS
|
||||
|
@ -101,13 +103,13 @@ instance Output Number where
|
|||
_ -> printf "%f" f
|
||||
|
||||
number :: MonadParser m => m Number
|
||||
number = Number . read . BS.unpack <$>
|
||||
(mappend <$> sign <*> (integerPart <|> BS.cons '0' <$> floatPart))
|
||||
number = Number . read . Char8.unpack <$>
|
||||
(mappend <$> sign <*> (integerPart <|> Char8.cons '0' <$> floatPart))
|
||||
<?> "number"
|
||||
where
|
||||
sign = string "-" <|> option "" (char '+' >> return "")
|
||||
integerPart = mappend <$> decNumber <*> option "" floatPart
|
||||
floatPart = BS.cons <$> char '.' <*> (option "0" $ decNumber)
|
||||
floatPart = Char8.cons <$> char '.' <*> (option "0" $ decNumber)
|
||||
|
||||
--
|
||||
-- StringObject
|
||||
|
@ -120,18 +122,28 @@ instance Output StringObject where
|
|||
|
||||
stringObject :: MonadParser m => m StringObject
|
||||
stringObject =
|
||||
Literal . BS.unpack <$> (char '(' *> (BS.concat <$> literalString) <* char ')')
|
||||
<|> Hexadecimal . BS.unpack <$> (char '<' *> hexNumber <* char '>')
|
||||
Literal . Char8.unpack <$> (char '(' *> (BS.concat <$> literalString) <* char ')')
|
||||
<|> Hexadecimal . Char8.unpack <$> (char '<' *> hexNumber <* char '>')
|
||||
<?> "string object (literal or hexadecimal)"
|
||||
where
|
||||
literalString = many literalStringBlock
|
||||
literalStringBlock = takeAll1 normalChar <|> matchingParenthesis <|> escapedChar
|
||||
normalChar = not . (`elem` ("\\()" :: String))
|
||||
matchingParenthesis =
|
||||
mappend <$> (BS.cons <$> char '(' <*> literalStringBlock) <*> string ")"
|
||||
mappend <$> (Char8.cons <$> char '(' <*> literalStringBlock) <*> string ")"
|
||||
escapedChar =
|
||||
BS.cons <$> char '\\' <*> (BS.singleton <$> oneOf "nrtbf()\\" <|> octalCode)
|
||||
octalCode = choice $ (\n -> BS.pack <$> count n octDigit) <$> [1..3]
|
||||
Char8.cons <$> char '\\' <*> (Char8.singleton <$> oneOf "nrtbf()\\" <|> octalCode)
|
||||
octalCode = choice $ (\n -> Char8.pack <$> count n octDigit) <$> [1..3]
|
||||
|
||||
hexString :: (Num a, Read a) => String -> a
|
||||
hexString s = read $ "0x" ++ s
|
||||
|
||||
parseBytes :: String -> ByteString
|
||||
parseBytes = encodeUtf8 . decodeUtf16BE . BS.pack . fmap hexString . pairDigits
|
||||
where
|
||||
pairDigits "" = []
|
||||
pairDigits [c] = [[c]]
|
||||
pairDigits (firstChar:secondChar:end) = (firstChar:[secondChar]):pairDigits end
|
||||
|
||||
--
|
||||
-- Name
|
||||
|
@ -142,7 +154,7 @@ instance Output Name where
|
|||
output (Name n) = Output.string ('/':n)
|
||||
|
||||
name :: MonadParser m => m Name
|
||||
name = Name . BS.unpack <$> (char '/' *> takeAll regular) <?> "name"
|
||||
name = Name . Char8.unpack <$> (char '/' *> takeAll regular) <?> "name"
|
||||
|
||||
--
|
||||
-- Array
|
||||
|
|
|
@ -1,97 +1,25 @@
|
|||
{-# LANGUAGE OverloadedStrings #-}
|
||||
module PDF.Text {
|
||||
CMap
|
||||
, CMappers
|
||||
, PageContents(..)
|
||||
, cMap
|
||||
, emptyCMap
|
||||
module PDF.Text (
|
||||
PageContents(..)
|
||||
, pageContents
|
||||
) where
|
||||
|
||||
import Control.Applicative ((<|>), many)
|
||||
import Control.Applicative ((<|>))
|
||||
import Control.Monad (foldM, join)
|
||||
import Control.Monad.Reader (ReaderT, runReaderT, asks)
|
||||
import Control.Monad.State (get, put)
|
||||
import Data.Attoparsec.ByteString.Char8 (choice, count, parseOnly, sepBy)
|
||||
import qualified Data.Attoparsec.ByteString.Char8 as Atto (Parser)
|
||||
import Data.ByteString (ByteString, pack)
|
||||
import Data.Map (Map, (!))
|
||||
import qualified Data.Map as Map (empty, fromList, lookup)
|
||||
import Data.Text (Text, snoc)
|
||||
import qualified Data.Text as Text (init, last, null, unpack)
|
||||
import Data.Text.Encoding (decodeUtf16BE, encodeUtf8)
|
||||
import Data.Attoparsec.ByteString.Char8 (choice, count, sepBy)
|
||||
import Data.ByteString (ByteString)
|
||||
import qualified Data.ByteString.Char8 as Char8 (unpack)
|
||||
import Data.Map ((!))
|
||||
import qualified Data.Map as Map (lookup)
|
||||
import PDF.CMap (CMappers, CMap, emptyCMap)
|
||||
import PDF.Object (
|
||||
DirectObject(..), Name, StringObject(..)
|
||||
, array, blank, directObject, integer, line, name, regular, stringObject
|
||||
DirectObject(..), StringObject(..)
|
||||
, array, blank, name, parseBytes, regular, stringObject
|
||||
)
|
||||
import qualified PDF.EOL as EOL (charset, parser)
|
||||
import PDF.Parser (MonadParser, Parser, evalParser, string, takeAll)
|
||||
|
||||
type CMappers = Map Name CMap
|
||||
type CMap = Map Int Text
|
||||
|
||||
emptyCMap :: CMap
|
||||
emptyCMap = Map.empty
|
||||
|
||||
cMap :: ByteString -> Either String CMap
|
||||
cMap = parseOnly $ mconcat <$> many (cMapRange <|> cMapChar <|> ignoredLine)
|
||||
where
|
||||
ignoredLine =
|
||||
takeAll (not . (`elem` EOL.charset)) *> EOL.parser *> return Map.empty
|
||||
|
||||
cMapRange :: Atto.Parser CMap
|
||||
cMapRange = do
|
||||
size <- integer <* line "beginbfrange"
|
||||
mconcat <$> count size (Map.fromList <$> rangeMapping) <* line "endbfrange"
|
||||
where
|
||||
rangeMapping = (,,)
|
||||
<$> (stringObject <* blank)
|
||||
<*> (stringObject <* blank)
|
||||
<*> directObject <* EOL.parser
|
||||
>>= mapFromTo
|
||||
|
||||
cMapChar :: Atto.Parser CMap
|
||||
cMapChar = do
|
||||
size <- integer <* line "beginbfchar"
|
||||
Map.fromList <$> count size charMapping <* line "endbfchar"
|
||||
where
|
||||
charMapping =
|
||||
(,) <$> (stringObject <* blank) <*> (stringObject <* blank) <* EOL.parser
|
||||
>>= pairMapping
|
||||
|
||||
mapFromTo :: (StringObject, StringObject, DirectObject) -> Atto.Parser [(Int, Text)]
|
||||
mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) =
|
||||
let dstString = parseText dstFrom in
|
||||
return $ zip [hexString from .. hexString to] (textsFrom dstString)
|
||||
where
|
||||
textsFrom t
|
||||
| Text.null t = [t]
|
||||
| otherwise = (Text.init t `snoc`) <$> [Text.last t ..]
|
||||
|
||||
mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) =
|
||||
zip [hexString from .. hexString to] <$> (mapM dstString dstPoints)
|
||||
where
|
||||
dstString (StringObject (Hexadecimal dstPoint)) = return $ parseText dstPoint
|
||||
dstString _ = fail "Invalid for a replacement string"
|
||||
|
||||
mapFromTo _ = fail "invalid range mapping found"
|
||||
|
||||
pairMapping :: (StringObject, StringObject) -> Atto.Parser (Int, Text)
|
||||
pairMapping (Hexadecimal from, Hexadecimal to) =
|
||||
return (hexString from, parseText to)
|
||||
pairMapping _ = fail "invalid pair mapping found"
|
||||
|
||||
hexString :: (Num a, Read a) => String -> a
|
||||
hexString s = read $ "0x" ++ s
|
||||
|
||||
pairDigits :: String -> [String]
|
||||
pairDigits "" = []
|
||||
pairDigits [c] = [[c]]
|
||||
pairDigits (firstChar:secondChar:end) = (firstChar:[secondChar]):pairDigits end
|
||||
|
||||
parseText :: String -> Text
|
||||
parseText = decodeUtf16BE . pack . fmap hexString . pairDigits
|
||||
|
||||
data StateOperator =
|
||||
Cm | W | J | J_ | M | D | Ri | I | Gs -- general graphic state
|
||||
deriving (Bounded, Enum)
|
||||
|
@ -190,7 +118,7 @@ runOperator (DQuote, [Typed (StringObject outputString)]) =
|
|||
runOperator _ = return []
|
||||
|
||||
decodeString :: StringObject -> ParserWithFont ByteString
|
||||
decodeString (Hexadecimal h) = decodeString (Literal (Text.unpack $ parseText h))
|
||||
decodeString (Hexadecimal h) = decodeString (Literal (Char8.unpack $ parseBytes h))
|
||||
decodeString (Literal litString) = get >>= convertBytes litString
|
||||
where
|
||||
convertBytes :: String -> CMap -> ParserWithFont ByteString
|
||||
|
@ -204,4 +132,4 @@ decodeString (Literal litString) = get >>= convertBytes litString
|
|||
case (Map.lookup code someCMap, s) of
|
||||
(Nothing, (c:cs)) -> convertBytesAux (code * 256 + fromEnum c) (size + 1) cs someCMap
|
||||
(Nothing, []) -> fail "No character left to read but no code recognized"
|
||||
(Just outputText, _) -> mappend (encodeUtf8 outputText) <$> convertBytes s someCMap
|
||||
(Just outputText, _) -> mappend outputText <$> convertBytes s someCMap
|
||||
|
|
Loading…
Reference in a new issue