Still debugging, broke pretty much everything and finally implementing a proper coderange parsing for CMap because apparently that's necessary
This commit is contained in:
parent
b8ca7281aa
commit
36d7f9b819
7 changed files with 187 additions and 64 deletions
|
@ -23,9 +23,9 @@ library
|
||||||
, PDF.Output
|
, PDF.Output
|
||||||
, PDF.Text
|
, PDF.Text
|
||||||
, PDF.Update
|
, PDF.Update
|
||||||
|
, PDF.Parser
|
||||||
other-modules: Data.ByteString.Char8.Util
|
other-modules: Data.ByteString.Char8.Util
|
||||||
, PDF.Body
|
, PDF.Body
|
||||||
, PDF.Parser
|
|
||||||
-- other-extensions:
|
-- other-extensions:
|
||||||
build-depends: attoparsec
|
build-depends: attoparsec
|
||||||
, base >=4.9 && <4.13
|
, base >=4.9 && <4.13
|
||||||
|
@ -56,6 +56,16 @@ executable getObj
|
||||||
ghc-options: -Wall
|
ghc-options: -Wall
|
||||||
default-language: Haskell2010
|
default-language: Haskell2010
|
||||||
|
|
||||||
|
executable debug
|
||||||
|
main-is: examples/debug.hs
|
||||||
|
build-depends: base
|
||||||
|
, bytestring
|
||||||
|
, containers
|
||||||
|
, Hufflepdf
|
||||||
|
, mtl
|
||||||
|
ghc-options: -Wall
|
||||||
|
default-language: Haskell2010
|
||||||
|
|
||||||
executable getText
|
executable getText
|
||||||
main-is: examples/getText.hs
|
main-is: examples/getText.hs
|
||||||
build-depends: base
|
build-depends: base
|
||||||
|
|
26
examples/debug.hs
Normal file
26
examples/debug.hs
Normal file
|
@ -0,0 +1,26 @@
|
||||||
|
module Main where
|
||||||
|
|
||||||
|
import Control.Monad.Reader
|
||||||
|
import qualified Data.ByteString as BS
|
||||||
|
import qualified Data.ByteString.Char8 as C8
|
||||||
|
import qualified Data.Map as Map
|
||||||
|
import PDF.Object (Name(..), array)
|
||||||
|
import PDF.CMap (CMappers, cMap, emptyCMap)
|
||||||
|
import PDF.Parser (evalParser)
|
||||||
|
import PDF.Text
|
||||||
|
|
||||||
|
test :: CMappers -> ParserWithFont a -> BS.ByteString -> Either String a
|
||||||
|
test fonts parser =
|
||||||
|
evalParser (runReaderT parser fonts) emptyCMap
|
||||||
|
|
||||||
|
main :: IO ()
|
||||||
|
main = do
|
||||||
|
--input <- BS.readFile "20.stream"
|
||||||
|
input <- BS.readFile "array.bin"
|
||||||
|
Right font <- cMap <$> BS.readFile "6300.stream"
|
||||||
|
--mapM_ (\(k, v) -> putStr (show k) >> putStr " -> " >> BS.putStrLn v) $ Map.toList font
|
||||||
|
--case pageContents (Map.singleton (Name "R9") font) input of
|
||||||
|
--case test (Map.singleton (Name "R9") font) array input of
|
||||||
|
case test (Map.singleton (Name "R9") font) (a textOperator) input of
|
||||||
|
Left e -> putStrLn e
|
||||||
|
Right l -> putStrLn . show $ l
|
|
@ -16,11 +16,12 @@ import PDF.Object (
|
||||||
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
|
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
|
||||||
, Object(..), Name(..), Structure(..)
|
, Object(..), Name(..), Structure(..)
|
||||||
,)
|
,)
|
||||||
import PDF.Output (ObjectId)
|
import PDF.Output (ObjectId(..))
|
||||||
import PDF.Text (PageContents(..), pageContents)
|
import PDF.Text (PageContents(..), pageContents)
|
||||||
import PDF.Update (unify)
|
import PDF.Update (unify)
|
||||||
import System.Environment (getArgs)
|
import System.Environment (getArgs)
|
||||||
import System.IO (hPutStrLn, stderr)
|
import System.IO (hPutStrLn, stderr)
|
||||||
|
import Text.Printf (printf)
|
||||||
|
|
||||||
type CachedCMaps = Map ObjectId CMap
|
type CachedCMaps = Map ObjectId CMap
|
||||||
type T a = RWST Content [ByteString] CachedCMaps [] a
|
type T a = RWST Content [ByteString] CachedCMaps [] a
|
||||||
|
@ -30,16 +31,19 @@ list l = RWST (\_ s -> fillContext s <$> l)
|
||||||
where
|
where
|
||||||
fillContext s a = (a, s, [])
|
fillContext s a = (a, s, [])
|
||||||
|
|
||||||
handleError :: a -> String -> T a
|
handleError :: ObjectId -> a -> String -> T a
|
||||||
handleError defaultValue s =
|
handleError objectId defaultValue s =
|
||||||
(tell . replicate 1 . BS.pack $ "Warning: " ++ s) >> return defaultValue
|
(tell . replicate 1 $ BS.pack message) >> return defaultValue
|
||||||
|
where
|
||||||
|
message = printf "Object #%d : %s" (getObjectId objectId) s
|
||||||
|
|
||||||
extractText :: Object -> T ()
|
extractText :: Object -> T ()
|
||||||
extractText object = do
|
extractText object = do
|
||||||
pageDict <- dict object
|
pageDict <- dict object
|
||||||
cMappers <- loadCMappers =<< getFont pageDict
|
cMappers <- loadCMappers =<< getFont pageDict
|
||||||
contents <- stream =<< follow =<< key "Contents" pageDict
|
contentsId <- target =<< key "Contents" pageDict
|
||||||
either (handleError ()) (tell . chunks) (pageContents cMappers contents)
|
contents <- stream =<< getObject contentsId
|
||||||
|
either (handleError contentsId ()) (tell . chunks) (pageContents cMappers contents)
|
||||||
|
|
||||||
stream :: Object -> T ByteString
|
stream :: Object -> T ByteString
|
||||||
stream (Stream {header, streamContent}) = return $
|
stream (Stream {header, streamContent}) = return $
|
||||||
|
@ -73,7 +77,7 @@ loadFont objectId =
|
||||||
>>= key "ToUnicode"
|
>>= key "ToUnicode"
|
||||||
>>= follow
|
>>= follow
|
||||||
>>= stream
|
>>= stream
|
||||||
>>= either (handleError emptyCMap) return . cMap
|
>>= either (handleError objectId emptyCMap) return . cMap
|
||||||
|
|
||||||
loadCMappers :: Dictionary -> T CMappers
|
loadCMappers :: Dictionary -> T CMappers
|
||||||
loadCMappers = foldM loadCMapper Map.empty . Map.toList
|
loadCMappers = foldM loadCMapper Map.empty . Map.toList
|
||||||
|
@ -94,9 +98,12 @@ key keyName dictionary =
|
||||||
Just obj -> return obj
|
Just obj -> return obj
|
||||||
_ -> list []
|
_ -> list []
|
||||||
|
|
||||||
|
target :: DirectObject -> T ObjectId
|
||||||
|
target (Reference (IndirectObjCoordinates {objectId})) = return objectId
|
||||||
|
target _ = list []
|
||||||
|
|
||||||
follow :: DirectObject -> T Object
|
follow :: DirectObject -> T Object
|
||||||
follow (Reference (IndirectObjCoordinates {objectId})) = getObject objectId
|
follow directObject = target directObject >>= getObject
|
||||||
follow _ = list []
|
|
||||||
|
|
||||||
dict :: Object -> T Dictionary
|
dict :: Object -> T Dictionary
|
||||||
dict (Direct (Dictionary dictionary)) = return dictionary
|
dict (Direct (Dictionary dictionary)) = return dictionary
|
||||||
|
@ -122,7 +129,6 @@ listTextObjects (Document {updates}) =
|
||||||
snd =<< evalRWST rwsMain (unify updates) Map.empty
|
snd =<< evalRWST rwsMain (unify updates) Map.empty
|
||||||
where
|
where
|
||||||
rwsMain =
|
rwsMain =
|
||||||
--Lazy.pack . show <$> (getObject =<< pagesList)
|
|
||||||
pagesList >>= getObject >>= extractText
|
pagesList >>= getObject >>= extractText
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,16 +1,53 @@
|
||||||
module Data.ByteString.Char8.Util (
|
module Data.ByteString.Char8.Util (
|
||||||
previous
|
decodeHex
|
||||||
|
, fromInt
|
||||||
|
, hexString
|
||||||
|
, parseBytes
|
||||||
|
, previous
|
||||||
, subBS
|
, subBS
|
||||||
|
, toInt
|
||||||
|
, utf16BEToutf8
|
||||||
) where
|
) where
|
||||||
|
|
||||||
import Data.ByteString.Char8 (ByteString)
|
import Data.ByteString (ByteString, snoc)
|
||||||
import qualified Data.ByteString.Char8 as BS (drop, index, take)
|
import qualified Data.ByteString as BS (foldl, pack, singleton)
|
||||||
|
import qualified Data.ByteString.Char8 as Char8 (drop, index, take, unpack)
|
||||||
|
import Data.Text.Encoding (encodeUtf8, decodeUtf16BE)
|
||||||
import Prelude hiding (length)
|
import Prelude hiding (length)
|
||||||
|
|
||||||
previous :: Char -> Int -> ByteString -> Int
|
previous :: Char -> Int -> ByteString -> Int
|
||||||
previous char position byteString
|
previous char position byteString
|
||||||
| BS.index byteString position == char = position
|
| Char8.index byteString position == char = position
|
||||||
| otherwise = previous char (position - 1) byteString
|
| otherwise = previous char (position - 1) byteString
|
||||||
|
|
||||||
subBS :: Int -> Int -> ByteString -> ByteString
|
subBS :: Int -> Int -> ByteString -> ByteString
|
||||||
subBS offset length = BS.take length . BS.drop offset
|
subBS offset length = Char8.take length . Char8.drop offset
|
||||||
|
|
||||||
|
hexString :: (Num a, Read a) => String -> a
|
||||||
|
hexString s = read $ "0x" ++ s
|
||||||
|
|
||||||
|
fromInt :: Int -> ByteString
|
||||||
|
fromInt n
|
||||||
|
| n < 0x100 = BS.singleton $ toEnum n
|
||||||
|
| otherwise = fromInt (n `div` 0x100) `snoc` (toEnum (n `mod` 0x100))
|
||||||
|
|
||||||
|
toInt :: ByteString -> Int
|
||||||
|
toInt = BS.foldl (\n w -> 0x100*n + fromEnum w) 0
|
||||||
|
|
||||||
|
{-
|
||||||
|
encodeHex :: ByteString -> ByteString
|
||||||
|
encodeHex =
|
||||||
|
-}
|
||||||
|
|
||||||
|
decodeHex :: ByteString -> ByteString
|
||||||
|
decodeHex = parseBytes . Char8.unpack
|
||||||
|
|
||||||
|
parseBytes :: String -> ByteString
|
||||||
|
parseBytes = BS.pack . fmap hexString . pairDigits
|
||||||
|
where
|
||||||
|
pairDigits "" = []
|
||||||
|
pairDigits [c] = [[c]]
|
||||||
|
pairDigits (firstChar:secondChar:end) = (firstChar:[secondChar]):pairDigits end
|
||||||
|
|
||||||
|
utf16BEToutf8 :: ByteString -> ByteString
|
||||||
|
utf16BEToutf8 = encodeUtf8 . decodeUtf16BE
|
||||||
|
|
103
src/PDF/CMap.hs
103
src/PDF/CMap.hs
|
@ -1,3 +1,4 @@
|
||||||
|
{-# LANGUAGE NamedFieldPuns #-}
|
||||||
module PDF.CMap (
|
module PDF.CMap (
|
||||||
CMap
|
CMap
|
||||||
, CMappers
|
, CMappers
|
||||||
|
@ -6,35 +7,66 @@ module PDF.CMap (
|
||||||
) where
|
) where
|
||||||
|
|
||||||
import Control.Applicative ((<|>), many)
|
import Control.Applicative ((<|>), many)
|
||||||
import Data.Attoparsec.ByteString.Char8 (count, parseOnly)
|
import Control.Monad.State (modify)
|
||||||
import qualified Data.Attoparsec.ByteString.Char8 as Atto (Parser)
|
import Data.Attoparsec.ByteString.Char8 (count)
|
||||||
import Data.ByteString (ByteString, snoc)
|
import Data.ByteString (ByteString)
|
||||||
import qualified Data.ByteString as BS (init, last, null)
|
import qualified Data.ByteString as BS (length)
|
||||||
import Data.Map (Map)
|
import Data.ByteString.Char8.Util (
|
||||||
import qualified Data.Map as Map (empty, fromList)
|
decodeHex, fromInt, toInt, utf16BEToutf8
|
||||||
|
)
|
||||||
|
import Data.Map (Map, union)
|
||||||
|
import qualified Data.Map as Map (adjust, empty, fromList, insertWith)
|
||||||
import qualified PDF.EOL as EOL (charset, parser)
|
import qualified PDF.EOL as EOL (charset, parser)
|
||||||
import PDF.Object (
|
import PDF.Object (
|
||||||
DirectObject(..), Name, StringObject(..)
|
DirectObject(..), Name, StringObject(..)
|
||||||
, blank, directObject, hexString, integer, line, parseBytes, stringObject
|
, blank, directObject, integer, line, stringObject
|
||||||
)
|
)
|
||||||
import PDF.Parser (takeAll)
|
import PDF.Parser (MonadParser, Parser, runParser, takeAll)
|
||||||
|
|
||||||
type CMappers = Map Name CMap
|
type CMappers = Map Name CMap
|
||||||
type CMap = Map Int ByteString
|
type Mapping = Map ByteString ByteString
|
||||||
|
data CRange = CRange {
|
||||||
|
fromSequence :: ByteString
|
||||||
|
, toSequence :: ByteString
|
||||||
|
, mapping :: Mapping
|
||||||
|
}
|
||||||
|
type RangeSize = Int
|
||||||
|
type CMap = Map RangeSize [CRange]
|
||||||
|
|
||||||
emptyCMap :: CMap
|
emptyCMap :: CMap
|
||||||
emptyCMap = Map.empty
|
emptyCMap = Map.empty
|
||||||
|
|
||||||
cMap :: ByteString -> Either String CMap
|
cMap :: ByteString -> Either String CMap
|
||||||
cMap = parseOnly $ mconcat <$> many (cMapRange <|> cMapChar <|> ignoredLine)
|
cMap = fmap snd <$> runParser
|
||||||
|
(many (codeRanges <|> cMapRange <|> cMapChar <|> ignoredLine))
|
||||||
|
emptyCMap
|
||||||
where
|
where
|
||||||
ignoredLine =
|
ignoredLine =
|
||||||
takeAll (not . (`elem` EOL.charset)) *> EOL.parser *> return Map.empty
|
takeAll (not . (`elem` EOL.charset)) *> EOL.parser *> return ()
|
||||||
|
|
||||||
cMapRange :: Atto.Parser CMap
|
codeRanges :: Parser CMap ()
|
||||||
|
codeRanges = do
|
||||||
|
size <- integer <* line "begincodespacerange"
|
||||||
|
count size (createMapping <$> codeRange) *> return ()
|
||||||
|
line "endcodespacerange"
|
||||||
|
where
|
||||||
|
codeRange =
|
||||||
|
(,) <$> (stringObject <* blank) <*> (stringObject <* blank) <* EOL.parser
|
||||||
|
|
||||||
|
createMapping :: (StringObject, StringObject) -> Parser CMap ()
|
||||||
|
createMapping (Hexadecimal from, Hexadecimal to) = modify $
|
||||||
|
Map.insertWith (++) size [CRange {fromSequence, toSequence, mapping}]
|
||||||
|
where
|
||||||
|
fromSequence = decodeHex from
|
||||||
|
size = BS.length fromSequence
|
||||||
|
toSequence = decodeHex to
|
||||||
|
mapping = Map.empty
|
||||||
|
createMapping _ = return ()
|
||||||
|
|
||||||
|
cMapRange :: Parser CMap ()
|
||||||
cMapRange = do
|
cMapRange = do
|
||||||
size <- integer <* line "beginbfrange"
|
size <- integer <* line "beginbfrange"
|
||||||
mconcat <$> count size (Map.fromList <$> rangeMapping) <* line "endbfrange"
|
mapM_ saveMapping =<< count size rangeMapping <* line "endbfrange"
|
||||||
where
|
where
|
||||||
rangeMapping = (,,)
|
rangeMapping = (,,)
|
||||||
<$> (stringObject <* blank)
|
<$> (stringObject <* blank)
|
||||||
|
@ -42,33 +74,52 @@ cMapRange = do
|
||||||
<*> directObject <* EOL.parser
|
<*> directObject <* EOL.parser
|
||||||
>>= mapFromTo
|
>>= mapFromTo
|
||||||
|
|
||||||
cMapChar :: Atto.Parser CMap
|
saveMapping :: [(ByteString, ByteString)] -> Parser CMap ()
|
||||||
|
saveMapping [] = return ()
|
||||||
|
saveMapping assoc@((code, _):_) = modify $ Map.adjust insertCRange mappingSize
|
||||||
|
where
|
||||||
|
newMapping = Map.fromList assoc
|
||||||
|
mappingSize = BS.length code
|
||||||
|
matchingRange (CRange {fromSequence, toSequence}) =
|
||||||
|
fromSequence <= code && code <= toSequence
|
||||||
|
appendMapping cRange =
|
||||||
|
cRange {mapping = mapping cRange `union` newMapping}
|
||||||
|
insertCRange = fmap (\cRange ->
|
||||||
|
if matchingRange cRange then appendMapping cRange else cRange
|
||||||
|
)
|
||||||
|
|
||||||
|
cMapChar :: Parser CMap ()
|
||||||
cMapChar = do
|
cMapChar = do
|
||||||
size <- integer <* line "beginbfchar"
|
size <- integer <* line "beginbfchar"
|
||||||
Map.fromList <$> count size charMapping <* line "endbfchar"
|
saveMapping =<< count size charMapping <* line "endbfchar"
|
||||||
where
|
where
|
||||||
charMapping =
|
charMapping =
|
||||||
(,) <$> (stringObject <* blank) <*> (stringObject <* blank) <* EOL.parser
|
(,) <$> (stringObject <* blank) <*> (stringObject <* blank) <* EOL.parser
|
||||||
>>= pairMapping
|
>>= pairMapping
|
||||||
|
|
||||||
mapFromTo :: (StringObject, StringObject, DirectObject) -> Atto.Parser [(Int, ByteString)]
|
between :: ByteString -> ByteString -> [ByteString]
|
||||||
|
between from to = fromInt <$> [toInt from .. toInt to]
|
||||||
|
|
||||||
|
startFrom :: ByteString -> [ByteString]
|
||||||
|
startFrom from = fromInt <$> [toInt from .. ]
|
||||||
|
|
||||||
|
mapFromTo :: MonadParser m => (StringObject, StringObject, DirectObject) -> m [(ByteString, ByteString)]
|
||||||
mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) =
|
mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) =
|
||||||
let dstString = parseBytes dstFrom in
|
return $ zip (between fromBS toBS) (startFrom dstBS)
|
||||||
return $ zip [hexString from .. hexString to] (textsFrom dstString)
|
|
||||||
where
|
where
|
||||||
textsFrom t
|
(fromBS, toBS, dstBS) = (decodeHex from, decodeHex to, decodeHex dstFrom)
|
||||||
| BS.null t = [t]
|
|
||||||
| otherwise = (BS.init t `snoc`) <$> [BS.last t ..]
|
|
||||||
|
|
||||||
mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) =
|
mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) =
|
||||||
zip [hexString from .. hexString to] <$> (mapM dstString dstPoints)
|
zip (between fromBS toBS) <$> (mapM dstByteString dstPoints)
|
||||||
where
|
where
|
||||||
dstString (StringObject (Hexadecimal dstPoint)) = return $ parseBytes dstPoint
|
(fromBS, toBS) = (decodeHex from, decodeHex to)
|
||||||
dstString _ = fail "Invalid for a replacement string"
|
dstByteString (StringObject (Hexadecimal dst)) =
|
||||||
|
return . utf16BEToutf8 $ decodeHex dst
|
||||||
|
dstByteString _ = fail "Invalid for a replacement string"
|
||||||
|
|
||||||
mapFromTo _ = fail "invalid range mapping found"
|
mapFromTo _ = fail "invalid range mapping found"
|
||||||
|
|
||||||
pairMapping :: (StringObject, StringObject) -> Atto.Parser (Int, ByteString)
|
pairMapping :: MonadParser m => (StringObject, StringObject) -> m (ByteString, ByteString)
|
||||||
pairMapping (Hexadecimal from, Hexadecimal to) =
|
pairMapping (Hexadecimal from, Hexadecimal to) =
|
||||||
return (hexString from, parseBytes to)
|
return (decodeHex from, utf16BEToutf8 $ decodeHex to)
|
||||||
pairMapping _ = fail "invalid pair mapping found"
|
pairMapping _ = fail "invalid pair mapping found"
|
||||||
|
|
|
@ -22,7 +22,6 @@ module PDF.Object (
|
||||||
, dictionary
|
, dictionary
|
||||||
, directObject
|
, directObject
|
||||||
, eofMarker
|
, eofMarker
|
||||||
, hexString
|
|
||||||
, integer
|
, integer
|
||||||
, line
|
, line
|
||||||
, magicNumber
|
, magicNumber
|
||||||
|
@ -39,6 +38,7 @@ import Data.Attoparsec.ByteString.Char8 (choice, count, option, sepBy)
|
||||||
import Data.ByteString (ByteString)
|
import Data.ByteString (ByteString)
|
||||||
import qualified Data.ByteString as BS (concat, pack)
|
import qualified Data.ByteString as BS (concat, pack)
|
||||||
import qualified Data.ByteString.Char8 as Char8 (cons, pack, singleton, unpack)
|
import qualified Data.ByteString.Char8 as Char8 (cons, pack, singleton, unpack)
|
||||||
|
import Data.ByteString.Char8.Util (hexString, parseBytes)
|
||||||
import Data.Map (Map, (!), mapWithKey)
|
import Data.Map (Map, (!), mapWithKey)
|
||||||
import qualified Data.Map as Map (
|
import qualified Data.Map as Map (
|
||||||
delete, empty, fromList, lookup, minViewWithKey, toList, union
|
delete, empty, fromList, lookup, minViewWithKey, toList, union
|
||||||
|
@ -114,16 +114,16 @@ number = Number . read . Char8.unpack <$>
|
||||||
--
|
--
|
||||||
-- StringObject
|
-- StringObject
|
||||||
--
|
--
|
||||||
data StringObject = Literal String | Hexadecimal String deriving Show
|
data StringObject = Literal ByteString | Hexadecimal ByteString deriving Show
|
||||||
|
|
||||||
instance Output StringObject where
|
instance Output StringObject where
|
||||||
output (Literal s) = Output.string (printf "(%s)" s)
|
output (Literal s) = Output.string (printf "(%s)" (Char8.unpack s))
|
||||||
output (Hexadecimal s) = Output.string (printf "<%s>" s)
|
output (Hexadecimal s) = Output.string (printf "<%s>" (Char8.unpack s))
|
||||||
|
|
||||||
stringObject :: MonadParser m => m StringObject
|
stringObject :: MonadParser m => m StringObject
|
||||||
stringObject =
|
stringObject =
|
||||||
Literal . Char8.unpack <$> (char '(' *> (BS.concat <$> literalString) <* char ')')
|
Literal <$> (char '(' *> (BS.concat <$> literalString) <* char ')')
|
||||||
<|> Hexadecimal . Char8.unpack <$> (char '<' *> hexNumber <* char '>')
|
<|> Hexadecimal <$> (char '<' *> hexNumber <* char '>')
|
||||||
<?> "string object (literal or hexadecimal)"
|
<?> "string object (literal or hexadecimal)"
|
||||||
where
|
where
|
||||||
literalString = many literalStringBlock
|
literalString = many literalStringBlock
|
||||||
|
@ -135,16 +135,6 @@ stringObject =
|
||||||
Char8.cons <$> char '\\' <*> (Char8.singleton <$> oneOf "nrtbf()\\" <|> octalCode)
|
Char8.cons <$> char '\\' <*> (Char8.singleton <$> oneOf "nrtbf()\\" <|> octalCode)
|
||||||
octalCode = choice $ (\n -> Char8.pack <$> count n octDigit) <$> [1..3]
|
octalCode = choice $ (\n -> Char8.pack <$> count n octDigit) <$> [1..3]
|
||||||
|
|
||||||
hexString :: (Num a, Read a) => String -> a
|
|
||||||
hexString s = read $ "0x" ++ s
|
|
||||||
|
|
||||||
parseBytes :: String -> ByteString
|
|
||||||
parseBytes = encodeUtf8 . decodeUtf16BE . BS.pack . fmap hexString . pairDigits
|
|
||||||
where
|
|
||||||
pairDigits "" = []
|
|
||||||
pairDigits [c] = [[c]]
|
|
||||||
pairDigits (firstChar:secondChar:end) = (firstChar:[secondChar]):pairDigits end
|
|
||||||
|
|
||||||
--
|
--
|
||||||
-- Name
|
-- Name
|
||||||
--
|
--
|
||||||
|
|
|
@ -11,6 +11,7 @@ import Control.Monad.State (get, put)
|
||||||
import Data.Attoparsec.ByteString.Char8 (choice, count, sepBy)
|
import Data.Attoparsec.ByteString.Char8 (choice, count, sepBy)
|
||||||
import Data.ByteString (ByteString)
|
import Data.ByteString (ByteString)
|
||||||
import qualified Data.ByteString.Char8 as Char8 (unpack)
|
import qualified Data.ByteString.Char8 as Char8 (unpack)
|
||||||
|
import Data.ByteString.Char8.Util (decodeHex)
|
||||||
import Data.Map ((!))
|
import Data.Map ((!))
|
||||||
import qualified Data.Map as Map (lookup)
|
import qualified Data.Map as Map (lookup)
|
||||||
import PDF.CMap (CMappers, CMap, emptyCMap)
|
import PDF.CMap (CMappers, CMap, emptyCMap)
|
||||||
|
@ -22,13 +23,13 @@ import PDF.Parser (MonadParser, (<?>), Parser, evalParser, string, takeAll)
|
||||||
|
|
||||||
data StateOperator =
|
data StateOperator =
|
||||||
Cm | W | J | J_ | M | D | Ri | I | Gs -- general graphic state
|
Cm | W | J | J_ | M | D | Ri | I | Gs -- general graphic state
|
||||||
deriving (Bounded, Enum)
|
deriving (Bounded, Enum, Show)
|
||||||
data TextOperator =
|
data TextOperator =
|
||||||
Td | TD | Tm | Tstar -- text positioning
|
Td | TD | Tm | Tstar -- text positioning
|
||||||
| TJ | Tj | Quote | DQuote -- text showing
|
| TJ | Tj | Quote | DQuote -- text showing
|
||||||
| Tc | Tw | Tz | TL | Tf | Tr | Ts -- text state
|
| Tc | Tw | Tz | TL | Tf | Tr | Ts -- text state
|
||||||
deriving (Bounded, Enum)
|
deriving (Bounded, Enum, Show)
|
||||||
data Argument = Raw ByteString | Typed DirectObject
|
data Argument = Raw ByteString | Typed DirectObject deriving Show
|
||||||
type Call a = (a, [Argument])
|
type Call a = (a, [Argument])
|
||||||
|
|
||||||
stateOperator :: MonadParser m => StateOperator -> m (Call StateOperator)
|
stateOperator :: MonadParser m => StateOperator -> m (Call StateOperator)
|
||||||
|
@ -112,7 +113,7 @@ runOperator (TJ, [Typed (Array arrayObject)]) =
|
||||||
appendText bs (StringObject outputString) =
|
appendText bs (StringObject outputString) =
|
||||||
mappend bs <$> decodeString outputString
|
mappend bs <$> decodeString outputString
|
||||||
appendText bs _ = return bs
|
appendText bs _ = return bs
|
||||||
|
|
||||||
runOperator (Tj, [Typed (StringObject outputString)]) =
|
runOperator (Tj, [Typed (StringObject outputString)]) =
|
||||||
replicate 1 <$> decodeString outputString
|
replicate 1 <$> decodeString outputString
|
||||||
|
|
||||||
|
@ -125,8 +126,10 @@ runOperator (DQuote, [Typed (StringObject outputString)]) =
|
||||||
runOperator _ = return []
|
runOperator _ = return []
|
||||||
|
|
||||||
decodeString :: StringObject -> ParserWithFont ByteString
|
decodeString :: StringObject -> ParserWithFont ByteString
|
||||||
decodeString (Hexadecimal h) = decodeString (Literal (Char8.unpack $ parseBytes h))
|
decodeString (Hexadecimal h) = decodeString (Literal (decodeHex h))
|
||||||
decodeString (Literal litString) = get >>= convertBytes litString
|
decodeString (Literal litString) =
|
||||||
|
|
||||||
|
get >>= convertBytes litString
|
||||||
where
|
where
|
||||||
convertBytes :: String -> CMap -> ParserWithFont ByteString
|
convertBytes :: String -> CMap -> ParserWithFont ByteString
|
||||||
convertBytes [] _ = return ""
|
convertBytes [] _ = return ""
|
||||||
|
|
Loading…
Reference in a new issue