Completely lost, the same old Char8 / Word8 again, implemented all the text reading, still needing a couple details to parse CMaps
This commit is contained in:
parent
f9e5683bf4
commit
e7484ef536
5 changed files with 158 additions and 62 deletions
|
@ -30,6 +30,7 @@ library
|
|||
, base >=4.9 && <4.13
|
||||
, bytestring
|
||||
, containers
|
||||
, data-serializer
|
||||
, mtl
|
||||
hs-source-dirs: src
|
||||
ghc-options: -Wall
|
||||
|
|
|
@ -16,7 +16,7 @@ import PDF.Object (
|
|||
, Object(..), Name(..), Structure(..)
|
||||
,)
|
||||
import PDF.Output (ObjectId)
|
||||
import PDF.Text (CMap, CMappers, PageContents(..), cMap, pageContents)
|
||||
import PDF.Text (CMap, CMappers, PageContents(..), cMap, emptyCMap, pageContents)
|
||||
import PDF.Update (unify)
|
||||
import System.Environment (getArgs)
|
||||
import System.IO (hPutStrLn, stderr)
|
||||
|
@ -32,26 +32,17 @@ list l = RWST (\_ s -> fillContext s <$> l)
|
|||
extractText :: Object -> T ()
|
||||
extractText object = do
|
||||
pageDict <- dict object
|
||||
contents <- follow =<< key "Contents" pageDict
|
||||
case contents of
|
||||
(Stream {header, streamContent}) -> do
|
||||
font <- getFont pageDict
|
||||
cMappers <- loadCMappers font
|
||||
storeDecodedText cMappers $ clear header streamContent
|
||||
_ -> return ()
|
||||
cMappers <- loadCMappers =<< getFont pageDict
|
||||
contents <- stream =<< follow =<< key "Contents" pageDict
|
||||
either (return . const ()) (tell . chunks) (pageContents cMappers contents)
|
||||
|
||||
clear :: Dictionary -> ByteString -> ByteString
|
||||
clear header streamContent =
|
||||
stream :: Object -> T ByteString
|
||||
stream (Stream {header, streamContent}) = return $
|
||||
case Map.lookup (Name "Filter") header of
|
||||
Just (NameObject (Name "FlateDecode")) ->
|
||||
Lazy.toStrict . decompress . Lazy.fromStrict $ streamContent
|
||||
_ -> streamContent
|
||||
|
||||
storeDecodedText :: CMappers -> ByteString -> T ()
|
||||
storeDecodedText font page =
|
||||
case pageContents font page of
|
||||
Left _ -> return ()
|
||||
Right (PageContents {chunks}) -> tell chunks
|
||||
stream _ = list []
|
||||
|
||||
getFont :: Dictionary -> T Dictionary
|
||||
getFont pageDict =
|
||||
|
@ -76,6 +67,8 @@ loadFont objectId =
|
|||
>>= dict
|
||||
>>= key "ToUnicode"
|
||||
>>= follow
|
||||
>>= stream
|
||||
>>= either (return . const emptyCMap) return . cMap
|
||||
|
||||
loadCMappers :: Dictionary -> T CMappers
|
||||
loadCMappers = foldM loadCMapper Map.empty . Map.toList
|
||||
|
|
|
@ -6,14 +6,14 @@ module PDF.EOL (
|
|||
) where
|
||||
|
||||
import Control.Applicative ((<|>))
|
||||
import PDF.Parser (Parser, string)
|
||||
import PDF.Parser (MonadParser, string)
|
||||
|
||||
data Style = CR | LF | CRLF deriving Show
|
||||
|
||||
charset :: String
|
||||
charset = "\r\n"
|
||||
|
||||
parser :: Parser s Style
|
||||
parser :: MonadParser m => m Style
|
||||
parser =
|
||||
(string "\r\n" >> return CRLF)
|
||||
<|> (string "\r" >> return CR)
|
||||
|
|
|
@ -13,9 +13,11 @@ module PDF.Object (
|
|||
, Number(..)
|
||||
, Object(..)
|
||||
, Occurrence(..)
|
||||
, StringObject(..)
|
||||
, Structure(..)
|
||||
, XRefEntry(..)
|
||||
, XRefSection
|
||||
, array
|
||||
, blank
|
||||
, dictionary
|
||||
, directObject
|
||||
|
@ -24,14 +26,16 @@ module PDF.Object (
|
|||
, line
|
||||
, magicNumber
|
||||
, name
|
||||
, number
|
||||
, regular
|
||||
, stringObject
|
||||
, structure
|
||||
) where
|
||||
|
||||
import Control.Applicative ((<|>), many)
|
||||
import Data.Attoparsec.ByteString.Char8 (choice, count, option, sepBy)
|
||||
import Data.ByteString.Char8 (ByteString)
|
||||
import qualified Data.ByteString.Char8 as BS (
|
||||
import Data.ByteString (ByteString)
|
||||
import qualified Data.ByteString as BS (
|
||||
concat, cons, pack, singleton, unpack
|
||||
)
|
||||
import Data.Map (Map, (!), mapWithKey)
|
||||
|
@ -48,7 +52,7 @@ import PDF.Output (
|
|||
import PDF.Parser (MonadParser(..), Parser, (<?>), octDigit, oneOf)
|
||||
import Text.Printf (printf)
|
||||
|
||||
line :: String -> Parser u ()
|
||||
line :: MonadParser m => String -> m ()
|
||||
line l = (string (BS.pack l) *> EOL.parser *> return ()) <?> printf "line «%s»" l
|
||||
|
||||
magicNumber :: ByteString
|
||||
|
@ -69,7 +73,7 @@ delimiterCharset = "()<>[]{}/%"
|
|||
regular :: Char -> Bool
|
||||
regular = not . (`elem` (EOL.charset ++ whiteSpaceCharset ++ delimiterCharset))
|
||||
|
||||
integer :: (Read a, Num a) => Parser u a
|
||||
integer :: (Read a, Num a, MonadParser m) => m a
|
||||
integer = read . BS.unpack <$> decNumber <* blank <?> "decimal integer"
|
||||
|
||||
-------------------------------------
|
||||
|
@ -81,7 +85,7 @@ type IndexedObjects = Map ObjectId Object
|
|||
--
|
||||
-- Boolean
|
||||
--
|
||||
boolean :: Parser u Bool
|
||||
boolean :: MonadParser m => m Bool
|
||||
boolean =
|
||||
(string "true" *> return True) <|> (string "false" *> return False) <?> "boolean"
|
||||
|
||||
|
@ -96,7 +100,7 @@ instance Output Number where
|
|||
(n, 0) -> printf "%d" (n :: Int)
|
||||
_ -> printf "%f" f
|
||||
|
||||
number :: Parser u Number
|
||||
number :: MonadParser m => m Number
|
||||
number = Number . read . BS.unpack <$>
|
||||
(mappend <$> sign <*> (integerPart <|> BS.cons '0' <$> floatPart))
|
||||
<?> "number"
|
||||
|
@ -114,7 +118,7 @@ instance Output StringObject where
|
|||
output (Literal s) = Output.string (printf "(%s)" s)
|
||||
output (Hexadecimal s) = Output.string (printf "<%s>" s)
|
||||
|
||||
stringObject :: Parser u StringObject
|
||||
stringObject :: MonadParser m => m StringObject
|
||||
stringObject =
|
||||
Literal . BS.unpack <$> (char '(' *> (BS.concat <$> literalString) <* char ')')
|
||||
<|> Hexadecimal . BS.unpack <$> (char '<' *> hexNumber <* char '>')
|
||||
|
@ -143,7 +147,7 @@ name = Name . BS.unpack <$> (char '/' *> takeAll regular) <?> "name"
|
|||
--
|
||||
-- Array
|
||||
--
|
||||
array :: Parser u [DirectObject]
|
||||
array :: MonadParser m => m [DirectObject]
|
||||
array =
|
||||
char '[' *> blank *> directObject `sepBy` blank <* blank <* char ']' <?> "array"
|
||||
|
||||
|
@ -160,7 +164,7 @@ instance Output Dictionary where
|
|||
outputKeyVal :: (Name, DirectObject) -> OBuilder
|
||||
outputKeyVal (key, val) = Output.concat [output key, " ", output val]
|
||||
|
||||
dictionary :: Parser u Dictionary
|
||||
dictionary :: MonadParser m => m Dictionary
|
||||
dictionary =
|
||||
string "<<" *> blank *> keyValPairs <* string ">>" <?> "dictionary"
|
||||
where
|
||||
|
@ -170,7 +174,7 @@ dictionary =
|
|||
--
|
||||
-- Null
|
||||
--
|
||||
nullObject :: Parser u ()
|
||||
nullObject :: MonadParser m => m ()
|
||||
nullObject = string "null" *> return () <?> "null object"
|
||||
|
||||
--
|
||||
|
@ -181,7 +185,7 @@ data IndirectObjCoordinates = IndirectObjCoordinates {
|
|||
, versionNumber :: Int
|
||||
} deriving Show
|
||||
|
||||
reference :: Parser u IndirectObjCoordinates
|
||||
reference :: MonadParser m => m IndirectObjCoordinates
|
||||
reference = IndirectObjCoordinates
|
||||
<$> (fmap ObjectId integer) <*> integer <* char 'R' <?> "reference to an object"
|
||||
|
||||
|
@ -210,7 +214,7 @@ instance Output DirectObject where
|
|||
output (Reference (IndirectObjCoordinates {objectId, versionNumber})) =
|
||||
Output.string (printf "%d %d R" (getObjectId objectId) versionNumber)
|
||||
|
||||
directObject :: Parser u DirectObject
|
||||
directObject :: MonadParser m => m DirectObject
|
||||
directObject =
|
||||
Boolean <$> boolean
|
||||
<|> Reference <$> reference {- defined before Number because Number is a prefix of it -}
|
||||
|
|
162
src/PDF/Text.hs
162
src/PDF/Text.hs
|
@ -4,19 +4,24 @@ module PDF.Text (
|
|||
, CMappers
|
||||
, PageContents(..)
|
||||
, cMap
|
||||
, emptyCMap
|
||||
, pageContents
|
||||
) where
|
||||
|
||||
import Control.Applicative ((<|>))
|
||||
import Control.Monad (join)
|
||||
import Control.Monad.Reader (ReaderT, runReaderT)
|
||||
import Control.Monad.State (state)
|
||||
import Data.Attoparsec.ByteString.Char8 (count, sepBy)
|
||||
import Data.ByteString.Char8 (ByteString)
|
||||
import Data.Map (Map)
|
||||
import qualified Data.Map as Map (empty)
|
||||
import PDF.Object (Content, Name, blank, name, regular)
|
||||
import PDF.Output (ObjectId)
|
||||
import Control.Applicative ((<|>), many)
|
||||
import Control.Monad (foldM, join)
|
||||
import Control.Monad.Reader (ReaderT, runReaderT, asks)
|
||||
import Control.Monad.State (put)
|
||||
import Data.Attoparsec.ByteString.Char8 (choice, count, parseOnly, sepBy)
|
||||
import qualified Data.Attoparsec.ByteString.Char8 as Atto (Parser)
|
||||
import Data.ByteString.Char8 (ByteString, pack)
|
||||
import Data.Map (Map, (!))
|
||||
import qualified Data.Map as Map (empty, fromList)
|
||||
import PDF.Object (
|
||||
DirectObject(..), Name, StringObject(..)
|
||||
, array, blank, directObject, integer, line, name, regular, stringObject
|
||||
)
|
||||
import qualified PDF.EOL as EOL (charset, parser)
|
||||
import PDF.Parser (Parser, evalParser, string, takeAll)
|
||||
|
||||
type CMappers = Map Name CMap
|
||||
|
@ -25,10 +30,92 @@ type CMap = Map Int ByteString
|
|||
emptyCMap :: CMap
|
||||
emptyCMap = Map.empty
|
||||
|
||||
data TextOperator = TJ | Tj | Tf | Other
|
||||
cMap :: ByteString -> Either String CMap
|
||||
cMap = parseOnly $ mconcat <$> many (cMapRange <|> cMapChar <|> ignoredLine)
|
||||
where
|
||||
ignoredLine =
|
||||
takeAll (not . (`elem` EOL.charset)) *> EOL.parser *> return Map.empty
|
||||
|
||||
cMap :: ByteString -> CMap
|
||||
cMap = undefined
|
||||
cMapRange :: Atto.Parser CMap
|
||||
cMapRange = do
|
||||
size <- integer <* line "beginbfrange"
|
||||
mconcat <$> count size rangeMapping <* line "endbfrange"
|
||||
where
|
||||
rangeMapping = mapFromTo
|
||||
<$> (stringObject <* blank)
|
||||
<*> (stringObject <* blank)
|
||||
<*> directObject <* EOL.parser
|
||||
|
||||
cMapChar :: Atto.Parser CMap
|
||||
cMapChar = do
|
||||
size <- integer <* line "beginbfchar"
|
||||
Map.fromList <$> count size charMapping <* line "endbfchar"
|
||||
where
|
||||
charMapping = pairMapping
|
||||
<$> (stringObject <* blank) <*> (stringObject <* blank) <* EOL.parser
|
||||
|
||||
mapFromTo (Hexadecimal from) (Hexadecimal to) (StringObject (Hexadecimal dstFrom)) =
|
||||
undefined
|
||||
mapFromTo (Hexadecimal from) (Hexadecimal to) (Array dstPoints) = undefined
|
||||
mapFromTo _ _ _ = Map.empty
|
||||
|
||||
pairMapping :: StringObject -> StringObject -> (Int, ByteString)
|
||||
pairMapping (Hexadecimal from) (Hexadecimal to) =
|
||||
(read $ "0x" ++ from,
|
||||
|
||||
fromBytes :: String -> ByteString
|
||||
fromBytes s
|
||||
where
|
||||
toBaseWord8 n
|
||||
| n < 0xff = [
|
||||
to
|
||||
|
||||
data StateOperator =
|
||||
Cm | W | J | J_ | M | D | Ri | I | Gs -- general graphic state
|
||||
deriving (Bounded, Enum)
|
||||
data TextOperator =
|
||||
Td | TD | Tm | Tstar -- text positioning
|
||||
| TJ | Tj | Quote | DQuote -- text showing
|
||||
| Tc | Tw | Tz | TL | Tf | Tr | Ts -- text state
|
||||
deriving (Bounded, Enum)
|
||||
data Argument = Raw ByteString | Typed DirectObject
|
||||
type Call a = (a, [Argument])
|
||||
|
||||
stateOperator :: StateOperator -> ParserWithFont (Call StateOperator)
|
||||
stateOperator Cm = (,) Cm <$> count 6 argument <* string "cm"
|
||||
stateOperator W = (,) W <$> count 1 argument <* string "w"
|
||||
stateOperator J = (,) J <$> count 1 argument <* string "J"
|
||||
stateOperator J_ = (,) J_ <$> count 1 argument <* string "j"
|
||||
stateOperator M = (,) M <$> count 1 argument <* string "M"
|
||||
stateOperator D = (,) D <$> count 2 argument <* string "d"
|
||||
stateOperator Ri = (,) Ri <$> count 1 argument <* string "ri"
|
||||
stateOperator I = (,) I <$> count 1 argument <* string "i"
|
||||
stateOperator Gs = (,) Gs <$> count 1 argument <* string "gs"
|
||||
|
||||
textOperator :: TextOperator -> ParserWithFont (Call TextOperator)
|
||||
textOperator Td = (,) Td <$> count 2 argument <* string "Td"
|
||||
textOperator TD = (,) TD <$> count 2 argument <* string "TD"
|
||||
textOperator Tm = (,) Tm <$> count 6 argument <* string "Tm"
|
||||
textOperator Tstar = (,) Td <$> return [] <* string "T*"
|
||||
textOperator TJ =
|
||||
(,) TJ <$> sequence [Typed . Array <$> array] <* string "TJ"
|
||||
textOperator Tj =
|
||||
(,) Tj <$> sequence [Typed . StringObject <$> stringObject] <* string "Tj"
|
||||
textOperator Quote = (,) Quote <$> count 1 argument <* string "'"
|
||||
textOperator DQuote = (,) DQuote <$> count 1 argument <* string "\""
|
||||
textOperator Tc = (,) Tc <$> count 1 argument <* string "Tc"
|
||||
textOperator Tw = (,) Tw <$> count 1 argument <* string "Tw"
|
||||
textOperator Tz = (,) Tz <$> count 1 argument <* string "Tz"
|
||||
textOperator TL = (,) TL <$> count 1 argument <* string "TL"
|
||||
textOperator Tf = (,) Tf <$> sequence [Typed . NameObject <$> name, argument] <* string "Tf"
|
||||
textOperator Tr = (,) Tr <$> count 1 argument <* string "Tr"
|
||||
textOperator Ts = (,) Ts <$> count 1 argument <* string "Ts"
|
||||
|
||||
a :: (Bounded o, Enum o) => (o -> ParserWithFont (Call o)) -> ParserWithFont (Call o)
|
||||
a parserGenerator = choice $ parserGenerator <$> [minBound .. maxBound]
|
||||
|
||||
argument :: ParserWithFont Argument
|
||||
argument = Raw <$> takeAll regular <* blank
|
||||
|
||||
data PageContents = PageContents {
|
||||
chunks :: [ByteString]
|
||||
|
@ -36,18 +123,6 @@ data PageContents = PageContents {
|
|||
|
||||
type ParserWithFont = ReaderT CMappers (Parser CMap)
|
||||
|
||||
{-
|
||||
data FontContext = FontContext {
|
||||
cMappers :: CMappers
|
||||
, currentFont :: CMap
|
||||
}
|
||||
|
||||
initFontContext cMappers = FontContext {
|
||||
cMappers
|
||||
, currentFont = emptyCMap
|
||||
}
|
||||
-}
|
||||
|
||||
pageContents :: CMappers -> ByteString -> Either String PageContents
|
||||
pageContents font input =
|
||||
evalParser (runReaderT (PageContents <$> page) font) emptyCMap input
|
||||
|
@ -60,14 +135,37 @@ graphicState =
|
|||
string "q" *> blank *> insideQ <* string "Q"
|
||||
where
|
||||
insideQ = join <$> (command <|> page `sepBy` blank )
|
||||
command =
|
||||
count 6 argument *> string "cm" *> return []
|
||||
<|> name *> blank *> string "gs" *> return []
|
||||
argument = takeAll regular <* blank
|
||||
command = a stateOperator *> return []
|
||||
|
||||
text :: ParserWithFont [ByteString]
|
||||
text = undefined
|
||||
text =
|
||||
string "BT" *> blank *> commands <* blank <* string "ET"
|
||||
where
|
||||
commands = join <$> (a textOperator >>= runOperator) `sepBy` blank
|
||||
|
||||
textOperator :: ParserWithFont TextOperator
|
||||
textOperator = undefined
|
||||
runOperator :: Call TextOperator -> ParserWithFont [ByteString]
|
||||
runOperator (Tf, [Typed (NameObject fontName), _]) =
|
||||
asks (! fontName) >>= put >> return []
|
||||
|
||||
runOperator (Tstar, []) = return ["\n"]
|
||||
|
||||
runOperator (TJ, [Typed (Array arrayObject)]) =
|
||||
replicate 1 <$> foldM appendText "" arrayObject
|
||||
where
|
||||
appendText bs (StringObject outputString) =
|
||||
mappend bs <$> decodeString outputString
|
||||
appendText bs _ = return bs
|
||||
|
||||
runOperator (Tj, [Typed (StringObject outputString)]) =
|
||||
replicate 1 <$> decodeString outputString
|
||||
|
||||
runOperator (Quote, [Typed (StringObject outputString)]) =
|
||||
(\bs -> ["\n", bs]) <$> decodeString outputString
|
||||
|
||||
runOperator (DQuote, [Typed (StringObject outputString)]) =
|
||||
(\bs -> ["\n", bs]) <$> decodeString outputString
|
||||
|
||||
runOperator _ = return []
|
||||
|
||||
decodeString :: StringObject -> ParserWithFont ByteString
|
||||
decodeString = undefined
|
||||
|
|
Loading…
Reference in a new issue