Completely lost, the same old Char8 / Word8 again, implemented all the text reading, still needing a couple details to parse CMaps

This commit is contained in:
Tissevert 2019-09-25 18:42:34 +02:00
parent f9e5683bf4
commit e7484ef536
5 changed files with 158 additions and 62 deletions

View file

@ -30,6 +30,7 @@ library
, base >=4.9 && <4.13
, bytestring
, containers
, data-serializer
, mtl
hs-source-dirs: src
ghc-options: -Wall

View file

@ -16,7 +16,7 @@ import PDF.Object (
, Object(..), Name(..), Structure(..)
,)
import PDF.Output (ObjectId)
import PDF.Text (CMap, CMappers, PageContents(..), cMap, pageContents)
import PDF.Text (CMap, CMappers, PageContents(..), cMap, emptyCMap, pageContents)
import PDF.Update (unify)
import System.Environment (getArgs)
import System.IO (hPutStrLn, stderr)
@ -32,26 +32,17 @@ list l = RWST (\_ s -> fillContext s <$> l)
extractText :: Object -> T ()
extractText object = do
pageDict <- dict object
contents <- follow =<< key "Contents" pageDict
case contents of
(Stream {header, streamContent}) -> do
font <- getFont pageDict
cMappers <- loadCMappers font
storeDecodedText cMappers $ clear header streamContent
_ -> return ()
cMappers <- loadCMappers =<< getFont pageDict
contents <- stream =<< follow =<< key "Contents" pageDict
either (return . const ()) (tell . chunks) (pageContents cMappers contents)
clear :: Dictionary -> ByteString -> ByteString
clear header streamContent =
stream :: Object -> T ByteString
stream (Stream {header, streamContent}) = return $
case Map.lookup (Name "Filter") header of
Just (NameObject (Name "FlateDecode")) ->
Lazy.toStrict . decompress . Lazy.fromStrict $ streamContent
_ -> streamContent
storeDecodedText :: CMappers -> ByteString -> T ()
storeDecodedText font page =
case pageContents font page of
Left _ -> return ()
Right (PageContents {chunks}) -> tell chunks
stream _ = list []
getFont :: Dictionary -> T Dictionary
getFont pageDict =
@ -76,6 +67,8 @@ loadFont objectId =
>>= dict
>>= key "ToUnicode"
>>= follow
>>= stream
>>= either (return . const emptyCMap) return . cMap
loadCMappers :: Dictionary -> T CMappers
loadCMappers = foldM loadCMapper Map.empty . Map.toList

View file

@ -6,14 +6,14 @@ module PDF.EOL (
) where
import Control.Applicative ((<|>))
import PDF.Parser (Parser, string)
import PDF.Parser (MonadParser, string)
data Style = CR | LF | CRLF deriving Show
charset :: String
charset = "\r\n"
parser :: Parser s Style
parser :: MonadParser m => m Style
parser =
(string "\r\n" >> return CRLF)
<|> (string "\r" >> return CR)

View file

@ -13,9 +13,11 @@ module PDF.Object (
, Number(..)
, Object(..)
, Occurrence(..)
, StringObject(..)
, Structure(..)
, XRefEntry(..)
, XRefSection
, array
, blank
, dictionary
, directObject
@ -24,14 +26,16 @@ module PDF.Object (
, line
, magicNumber
, name
, number
, regular
, stringObject
, structure
) where
import Control.Applicative ((<|>), many)
import Data.Attoparsec.ByteString.Char8 (choice, count, option, sepBy)
import Data.ByteString.Char8 (ByteString)
import qualified Data.ByteString.Char8 as BS (
import Data.ByteString (ByteString)
import qualified Data.ByteString as BS (
concat, cons, pack, singleton, unpack
)
import Data.Map (Map, (!), mapWithKey)
@ -48,7 +52,7 @@ import PDF.Output (
import PDF.Parser (MonadParser(..), Parser, (<?>), octDigit, oneOf)
import Text.Printf (printf)
line :: String -> Parser u ()
line :: MonadParser m => String -> m ()
line l = (string (BS.pack l) *> EOL.parser *> return ()) <?> printf "line «%s»" l
magicNumber :: ByteString
@ -69,7 +73,7 @@ delimiterCharset = "()<>[]{}/%"
regular :: Char -> Bool
regular = not . (`elem` (EOL.charset ++ whiteSpaceCharset ++ delimiterCharset))
integer :: (Read a, Num a) => Parser u a
integer :: (Read a, Num a, MonadParser m) => m a
integer = read . BS.unpack <$> decNumber <* blank <?> "decimal integer"
-------------------------------------
@ -81,7 +85,7 @@ type IndexedObjects = Map ObjectId Object
--
-- Boolean
--
boolean :: Parser u Bool
boolean :: MonadParser m => m Bool
boolean =
(string "true" *> return True) <|> (string "false" *> return False) <?> "boolean"
@ -96,7 +100,7 @@ instance Output Number where
(n, 0) -> printf "%d" (n :: Int)
_ -> printf "%f" f
number :: Parser u Number
number :: MonadParser m => m Number
number = Number . read . BS.unpack <$>
(mappend <$> sign <*> (integerPart <|> BS.cons '0' <$> floatPart))
<?> "number"
@ -114,7 +118,7 @@ instance Output StringObject where
output (Literal s) = Output.string (printf "(%s)" s)
output (Hexadecimal s) = Output.string (printf "<%s>" s)
stringObject :: Parser u StringObject
stringObject :: MonadParser m => m StringObject
stringObject =
Literal . BS.unpack <$> (char '(' *> (BS.concat <$> literalString) <* char ')')
<|> Hexadecimal . BS.unpack <$> (char '<' *> hexNumber <* char '>')
@ -143,7 +147,7 @@ name = Name . BS.unpack <$> (char '/' *> takeAll regular) <?> "name"
--
-- Array
--
array :: Parser u [DirectObject]
array :: MonadParser m => m [DirectObject]
array =
char '[' *> blank *> directObject `sepBy` blank <* blank <* char ']' <?> "array"
@ -160,7 +164,7 @@ instance Output Dictionary where
outputKeyVal :: (Name, DirectObject) -> OBuilder
outputKeyVal (key, val) = Output.concat [output key, " ", output val]
dictionary :: Parser u Dictionary
dictionary :: MonadParser m => m Dictionary
dictionary =
string "<<" *> blank *> keyValPairs <* string ">>" <?> "dictionary"
where
@ -170,7 +174,7 @@ dictionary =
--
-- Null
--
nullObject :: Parser u ()
nullObject :: MonadParser m => m ()
nullObject = string "null" *> return () <?> "null object"
--
@ -181,7 +185,7 @@ data IndirectObjCoordinates = IndirectObjCoordinates {
, versionNumber :: Int
} deriving Show
reference :: Parser u IndirectObjCoordinates
reference :: MonadParser m => m IndirectObjCoordinates
reference = IndirectObjCoordinates
<$> (fmap ObjectId integer) <*> integer <* char 'R' <?> "reference to an object"
@ -210,7 +214,7 @@ instance Output DirectObject where
output (Reference (IndirectObjCoordinates {objectId, versionNumber})) =
Output.string (printf "%d %d R" (getObjectId objectId) versionNumber)
directObject :: Parser u DirectObject
directObject :: MonadParser m => m DirectObject
directObject =
Boolean <$> boolean
<|> Reference <$> reference {- defined before Number because Number is a prefix of it -}

View file

@ -4,19 +4,24 @@ module PDF.Text (
, CMappers
, PageContents(..)
, cMap
, emptyCMap
, pageContents
) where
import Control.Applicative ((<|>))
import Control.Monad (join)
import Control.Monad.Reader (ReaderT, runReaderT)
import Control.Monad.State (state)
import Data.Attoparsec.ByteString.Char8 (count, sepBy)
import Data.ByteString.Char8 (ByteString)
import Data.Map (Map)
import qualified Data.Map as Map (empty)
import PDF.Object (Content, Name, blank, name, regular)
import PDF.Output (ObjectId)
import Control.Applicative ((<|>), many)
import Control.Monad (foldM, join)
import Control.Monad.Reader (ReaderT, runReaderT, asks)
import Control.Monad.State (put)
import Data.Attoparsec.ByteString.Char8 (choice, count, parseOnly, sepBy)
import qualified Data.Attoparsec.ByteString.Char8 as Atto (Parser)
import Data.ByteString.Char8 (ByteString, pack)
import Data.Map (Map, (!))
import qualified Data.Map as Map (empty, fromList)
import PDF.Object (
DirectObject(..), Name, StringObject(..)
, array, blank, directObject, integer, line, name, regular, stringObject
)
import qualified PDF.EOL as EOL (charset, parser)
import PDF.Parser (Parser, evalParser, string, takeAll)
type CMappers = Map Name CMap
@ -25,10 +30,92 @@ type CMap = Map Int ByteString
emptyCMap :: CMap
emptyCMap = Map.empty
data TextOperator = TJ | Tj | Tf | Other
cMap :: ByteString -> Either String CMap
cMap = parseOnly $ mconcat <$> many (cMapRange <|> cMapChar <|> ignoredLine)
where
ignoredLine =
takeAll (not . (`elem` EOL.charset)) *> EOL.parser *> return Map.empty
cMap :: ByteString -> CMap
cMap = undefined
cMapRange :: Atto.Parser CMap
cMapRange = do
size <- integer <* line "beginbfrange"
mconcat <$> count size rangeMapping <* line "endbfrange"
where
rangeMapping = mapFromTo
<$> (stringObject <* blank)
<*> (stringObject <* blank)
<*> directObject <* EOL.parser
cMapChar :: Atto.Parser CMap
cMapChar = do
size <- integer <* line "beginbfchar"
Map.fromList <$> count size charMapping <* line "endbfchar"
where
charMapping = pairMapping
<$> (stringObject <* blank) <*> (stringObject <* blank) <* EOL.parser
mapFromTo (Hexadecimal from) (Hexadecimal to) (StringObject (Hexadecimal dstFrom)) =
undefined
mapFromTo (Hexadecimal from) (Hexadecimal to) (Array dstPoints) = undefined
mapFromTo _ _ _ = Map.empty
pairMapping :: StringObject -> StringObject -> (Int, ByteString)
pairMapping (Hexadecimal from) (Hexadecimal to) =
(read $ "0x" ++ from,
fromBytes :: String -> ByteString
fromBytes s
where
toBaseWord8 n
| n < 0xff = [
to
data StateOperator =
Cm | W | J | J_ | M | D | Ri | I | Gs -- general graphic state
deriving (Bounded, Enum)
data TextOperator =
Td | TD | Tm | Tstar -- text positioning
| TJ | Tj | Quote | DQuote -- text showing
| Tc | Tw | Tz | TL | Tf | Tr | Ts -- text state
deriving (Bounded, Enum)
data Argument = Raw ByteString | Typed DirectObject
type Call a = (a, [Argument])
stateOperator :: StateOperator -> ParserWithFont (Call StateOperator)
stateOperator Cm = (,) Cm <$> count 6 argument <* string "cm"
stateOperator W = (,) W <$> count 1 argument <* string "w"
stateOperator J = (,) J <$> count 1 argument <* string "J"
stateOperator J_ = (,) J_ <$> count 1 argument <* string "j"
stateOperator M = (,) M <$> count 1 argument <* string "M"
stateOperator D = (,) D <$> count 2 argument <* string "d"
stateOperator Ri = (,) Ri <$> count 1 argument <* string "ri"
stateOperator I = (,) I <$> count 1 argument <* string "i"
stateOperator Gs = (,) Gs <$> count 1 argument <* string "gs"
textOperator :: TextOperator -> ParserWithFont (Call TextOperator)
textOperator Td = (,) Td <$> count 2 argument <* string "Td"
textOperator TD = (,) TD <$> count 2 argument <* string "TD"
textOperator Tm = (,) Tm <$> count 6 argument <* string "Tm"
textOperator Tstar = (,) Td <$> return [] <* string "T*"
textOperator TJ =
(,) TJ <$> sequence [Typed . Array <$> array] <* string "TJ"
textOperator Tj =
(,) Tj <$> sequence [Typed . StringObject <$> stringObject] <* string "Tj"
textOperator Quote = (,) Quote <$> count 1 argument <* string "'"
textOperator DQuote = (,) DQuote <$> count 1 argument <* string "\""
textOperator Tc = (,) Tc <$> count 1 argument <* string "Tc"
textOperator Tw = (,) Tw <$> count 1 argument <* string "Tw"
textOperator Tz = (,) Tz <$> count 1 argument <* string "Tz"
textOperator TL = (,) TL <$> count 1 argument <* string "TL"
textOperator Tf = (,) Tf <$> sequence [Typed . NameObject <$> name, argument] <* string "Tf"
textOperator Tr = (,) Tr <$> count 1 argument <* string "Tr"
textOperator Ts = (,) Ts <$> count 1 argument <* string "Ts"
a :: (Bounded o, Enum o) => (o -> ParserWithFont (Call o)) -> ParserWithFont (Call o)
a parserGenerator = choice $ parserGenerator <$> [minBound .. maxBound]
argument :: ParserWithFont Argument
argument = Raw <$> takeAll regular <* blank
data PageContents = PageContents {
chunks :: [ByteString]
@ -36,18 +123,6 @@ data PageContents = PageContents {
type ParserWithFont = ReaderT CMappers (Parser CMap)
{-
data FontContext = FontContext {
cMappers :: CMappers
, currentFont :: CMap
}
initFontContext cMappers = FontContext {
cMappers
, currentFont = emptyCMap
}
-}
pageContents :: CMappers -> ByteString -> Either String PageContents
pageContents font input =
evalParser (runReaderT (PageContents <$> page) font) emptyCMap input
@ -60,14 +135,37 @@ graphicState =
string "q" *> blank *> insideQ <* string "Q"
where
insideQ = join <$> (command <|> page `sepBy` blank )
command =
count 6 argument *> string "cm" *> return []
<|> name *> blank *> string "gs" *> return []
argument = takeAll regular <* blank
command = a stateOperator *> return []
text :: ParserWithFont [ByteString]
text = undefined
text =
string "BT" *> blank *> commands <* blank <* string "ET"
where
commands = join <$> (a textOperator >>= runOperator) `sepBy` blank
textOperator :: ParserWithFont TextOperator
textOperator = undefined
runOperator :: Call TextOperator -> ParserWithFont [ByteString]
runOperator (Tf, [Typed (NameObject fontName), _]) =
asks (! fontName) >>= put >> return []
runOperator (Tstar, []) = return ["\n"]
runOperator (TJ, [Typed (Array arrayObject)]) =
replicate 1 <$> foldM appendText "" arrayObject
where
appendText bs (StringObject outputString) =
mappend bs <$> decodeString outputString
appendText bs _ = return bs
runOperator (Tj, [Typed (StringObject outputString)]) =
replicate 1 <$> decodeString outputString
runOperator (Quote, [Typed (StringObject outputString)]) =
(\bs -> ["\n", bs]) <$> decodeString outputString
runOperator (DQuote, [Typed (StringObject outputString)]) =
(\bs -> ["\n", bs]) <$> decodeString outputString
runOperator _ = return []
decodeString :: StringObject -> ParserWithFont ByteString
decodeString = undefined