2019-09-23 23:19:27 +02:00
|
|
|
{-# LANGUAGE OverloadedStrings #-}
|
2019-09-26 15:51:41 +02:00
|
|
|
module PDF.Text {-(
|
2019-09-23 18:00:47 +02:00
|
|
|
CMap
|
|
|
|
, CMappers
|
|
|
|
, PageContents(..)
|
|
|
|
, cMap
|
2019-09-25 18:42:34 +02:00
|
|
|
, emptyCMap
|
2019-09-23 18:00:47 +02:00
|
|
|
, pageContents
|
2019-09-26 15:51:41 +02:00
|
|
|
)-} where
|
2019-09-23 18:00:47 +02:00
|
|
|
|
2019-09-25 18:42:34 +02:00
|
|
|
import Control.Applicative ((<|>), many)
|
|
|
|
import Control.Monad (foldM, join)
|
|
|
|
import Control.Monad.Reader (ReaderT, runReaderT, asks)
|
|
|
|
import Control.Monad.State (put)
|
|
|
|
import Data.Attoparsec.ByteString.Char8 (choice, count, parseOnly, sepBy)
|
|
|
|
import qualified Data.Attoparsec.ByteString.Char8 as Atto (Parser)
|
2019-09-26 15:51:41 +02:00
|
|
|
import Data.ByteString (ByteString, pack)
|
2019-09-26 16:56:13 +02:00
|
|
|
import qualified Data.ByteString as BS (head)
|
|
|
|
import Data.ByteString.Char8 as Char8 (pack)
|
|
|
|
import qualified Data.ByteString.UTF8 as UTF8 (fromString)
|
2019-09-25 18:42:34 +02:00
|
|
|
import Data.Map (Map, (!))
|
|
|
|
import qualified Data.Map as Map (empty, fromList)
|
2019-09-26 16:56:13 +02:00
|
|
|
import Data.Text (Text, snoc)
|
|
|
|
import qualified Data.Text as Text (init, last, null, unpack)
|
|
|
|
import Data.Text.Encoding (decodeUtf16BE)
|
2019-09-25 18:42:34 +02:00
|
|
|
import PDF.Object (
|
|
|
|
DirectObject(..), Name, StringObject(..)
|
|
|
|
, array, blank, directObject, integer, line, name, regular, stringObject
|
|
|
|
)
|
|
|
|
import qualified PDF.EOL as EOL (charset, parser)
|
2019-09-24 18:38:12 +02:00
|
|
|
import PDF.Parser (Parser, evalParser, string, takeAll)
|
2019-09-23 18:00:47 +02:00
|
|
|
|
2019-09-24 18:38:12 +02:00
|
|
|
type CMappers = Map Name CMap
|
2019-09-26 16:56:13 +02:00
|
|
|
type CMap = Map Int Text
|
2019-09-23 18:00:47 +02:00
|
|
|
|
2019-09-24 18:38:12 +02:00
|
|
|
emptyCMap :: CMap
|
|
|
|
emptyCMap = Map.empty
|
|
|
|
|
2019-09-25 18:42:34 +02:00
|
|
|
cMap :: ByteString -> Either String CMap
|
|
|
|
cMap = parseOnly $ mconcat <$> many (cMapRange <|> cMapChar <|> ignoredLine)
|
|
|
|
where
|
|
|
|
ignoredLine =
|
|
|
|
takeAll (not . (`elem` EOL.charset)) *> EOL.parser *> return Map.empty
|
|
|
|
|
|
|
|
cMapRange :: Atto.Parser CMap
|
|
|
|
cMapRange = do
|
|
|
|
size <- integer <* line "beginbfrange"
|
2019-09-26 15:51:41 +02:00
|
|
|
mconcat <$> count size (Map.fromList <$> rangeMapping) <* line "endbfrange"
|
2019-09-25 18:42:34 +02:00
|
|
|
where
|
2019-09-26 15:51:41 +02:00
|
|
|
rangeMapping = (,,)
|
2019-09-25 18:42:34 +02:00
|
|
|
<$> (stringObject <* blank)
|
|
|
|
<*> (stringObject <* blank)
|
|
|
|
<*> directObject <* EOL.parser
|
2019-09-26 15:51:41 +02:00
|
|
|
>>= mapFromTo
|
2019-09-25 18:42:34 +02:00
|
|
|
|
|
|
|
cMapChar :: Atto.Parser CMap
|
|
|
|
cMapChar = do
|
|
|
|
size <- integer <* line "beginbfchar"
|
|
|
|
Map.fromList <$> count size charMapping <* line "endbfchar"
|
|
|
|
where
|
2019-09-26 15:51:41 +02:00
|
|
|
charMapping =
|
|
|
|
(,) <$> (stringObject <* blank) <*> (stringObject <* blank) <* EOL.parser
|
|
|
|
>>= pairMapping
|
|
|
|
|
2019-09-26 16:56:13 +02:00
|
|
|
mapFromTo :: (StringObject, StringObject, DirectObject) -> Atto.Parser [(Int, Text)]
|
2019-09-26 15:51:41 +02:00
|
|
|
mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) =
|
2019-09-26 16:56:13 +02:00
|
|
|
let dstString = parseText dstFrom in
|
|
|
|
return $ zip [hexString from .. hexString to] (textsFrom dstString)
|
2019-09-26 15:51:41 +02:00
|
|
|
where
|
2019-09-26 16:56:13 +02:00
|
|
|
textsFrom t
|
|
|
|
| Text.null t = [t]
|
|
|
|
| otherwise = (Text.init t `snoc`) <$> [Text.last t ..]
|
2019-09-26 15:51:41 +02:00
|
|
|
|
|
|
|
mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) =
|
|
|
|
zip [hexString from .. hexString to] <$> (mapM dstString dstPoints)
|
|
|
|
where
|
2019-09-26 16:56:13 +02:00
|
|
|
dstString (StringObject (Hexadecimal dstPoint)) = return $ parseText dstPoint
|
2019-09-26 15:51:41 +02:00
|
|
|
dstString _ = fail "Invalid for a replacement string"
|
2019-09-25 18:42:34 +02:00
|
|
|
|
2019-09-26 15:51:41 +02:00
|
|
|
mapFromTo _ = fail "invalid range mapping found"
|
2019-09-25 18:42:34 +02:00
|
|
|
|
2019-09-26 16:56:13 +02:00
|
|
|
pairMapping :: (StringObject, StringObject) -> Atto.Parser (Int, Text)
|
2019-09-26 15:51:41 +02:00
|
|
|
pairMapping (Hexadecimal from, Hexadecimal to) =
|
2019-09-26 16:56:13 +02:00
|
|
|
return (hexString from, parseText to)
|
2019-09-26 15:51:41 +02:00
|
|
|
pairMapping _ = fail "invalid pair mapping found"
|
2019-09-25 18:42:34 +02:00
|
|
|
|
2019-09-26 15:51:41 +02:00
|
|
|
hexString :: (Num a, Read a) => String -> a
|
2019-09-25 23:46:24 +02:00
|
|
|
hexString s = read $ "0x" ++ s
|
|
|
|
|
2019-09-26 15:51:41 +02:00
|
|
|
pairDigits :: String -> [String]
|
|
|
|
pairDigits "" = []
|
|
|
|
pairDigits [c] = [[c]]
|
2019-09-26 16:56:13 +02:00
|
|
|
pairDigits (firstChar:secondChar:end) = (firstChar:[secondChar]):pairDigits end
|
2019-09-26 15:51:41 +02:00
|
|
|
|
2019-09-26 16:56:13 +02:00
|
|
|
parseText :: String -> Text
|
|
|
|
parseText = decodeUtf16BE . pack . fmap hexString . pairDigits
|
2019-09-25 18:42:34 +02:00
|
|
|
|
|
|
|
data StateOperator =
|
|
|
|
Cm | W | J | J_ | M | D | Ri | I | Gs -- general graphic state
|
|
|
|
deriving (Bounded, Enum)
|
|
|
|
data TextOperator =
|
|
|
|
Td | TD | Tm | Tstar -- text positioning
|
|
|
|
| TJ | Tj | Quote | DQuote -- text showing
|
|
|
|
| Tc | Tw | Tz | TL | Tf | Tr | Ts -- text state
|
|
|
|
deriving (Bounded, Enum)
|
|
|
|
data Argument = Raw ByteString | Typed DirectObject
|
|
|
|
type Call a = (a, [Argument])
|
|
|
|
|
|
|
|
stateOperator :: StateOperator -> ParserWithFont (Call StateOperator)
|
|
|
|
stateOperator Cm = (,) Cm <$> count 6 argument <* string "cm"
|
|
|
|
stateOperator W = (,) W <$> count 1 argument <* string "w"
|
|
|
|
stateOperator J = (,) J <$> count 1 argument <* string "J"
|
|
|
|
stateOperator J_ = (,) J_ <$> count 1 argument <* string "j"
|
|
|
|
stateOperator M = (,) M <$> count 1 argument <* string "M"
|
|
|
|
stateOperator D = (,) D <$> count 2 argument <* string "d"
|
|
|
|
stateOperator Ri = (,) Ri <$> count 1 argument <* string "ri"
|
|
|
|
stateOperator I = (,) I <$> count 1 argument <* string "i"
|
|
|
|
stateOperator Gs = (,) Gs <$> count 1 argument <* string "gs"
|
|
|
|
|
|
|
|
textOperator :: TextOperator -> ParserWithFont (Call TextOperator)
|
|
|
|
textOperator Td = (,) Td <$> count 2 argument <* string "Td"
|
|
|
|
textOperator TD = (,) TD <$> count 2 argument <* string "TD"
|
|
|
|
textOperator Tm = (,) Tm <$> count 6 argument <* string "Tm"
|
|
|
|
textOperator Tstar = (,) Td <$> return [] <* string "T*"
|
|
|
|
textOperator TJ =
|
|
|
|
(,) TJ <$> sequence [Typed . Array <$> array] <* string "TJ"
|
|
|
|
textOperator Tj =
|
|
|
|
(,) Tj <$> sequence [Typed . StringObject <$> stringObject] <* string "Tj"
|
|
|
|
textOperator Quote = (,) Quote <$> count 1 argument <* string "'"
|
|
|
|
textOperator DQuote = (,) DQuote <$> count 1 argument <* string "\""
|
|
|
|
textOperator Tc = (,) Tc <$> count 1 argument <* string "Tc"
|
|
|
|
textOperator Tw = (,) Tw <$> count 1 argument <* string "Tw"
|
|
|
|
textOperator Tz = (,) Tz <$> count 1 argument <* string "Tz"
|
|
|
|
textOperator TL = (,) TL <$> count 1 argument <* string "TL"
|
|
|
|
textOperator Tf = (,) Tf <$> sequence [Typed . NameObject <$> name, argument] <* string "Tf"
|
|
|
|
textOperator Tr = (,) Tr <$> count 1 argument <* string "Tr"
|
|
|
|
textOperator Ts = (,) Ts <$> count 1 argument <* string "Ts"
|
2019-09-23 23:19:27 +02:00
|
|
|
|
2019-09-25 18:42:34 +02:00
|
|
|
a :: (Bounded o, Enum o) => (o -> ParserWithFont (Call o)) -> ParserWithFont (Call o)
|
|
|
|
a parserGenerator = choice $ parserGenerator <$> [minBound .. maxBound]
|
|
|
|
|
|
|
|
argument :: ParserWithFont Argument
|
|
|
|
argument = Raw <$> takeAll regular <* blank
|
2019-09-23 18:00:47 +02:00
|
|
|
|
|
|
|
data PageContents = PageContents {
|
|
|
|
chunks :: [ByteString]
|
|
|
|
}
|
|
|
|
|
2019-09-24 18:38:12 +02:00
|
|
|
type ParserWithFont = ReaderT CMappers (Parser CMap)
|
|
|
|
|
|
|
|
pageContents :: CMappers -> ByteString -> Either String PageContents
|
|
|
|
pageContents font input =
|
|
|
|
evalParser (runReaderT (PageContents <$> page) font) emptyCMap input
|
|
|
|
|
|
|
|
page :: ParserWithFont [ByteString]
|
|
|
|
page = graphicState <|> text
|
|
|
|
|
|
|
|
graphicState :: ParserWithFont [ByteString]
|
2019-09-23 23:19:27 +02:00
|
|
|
graphicState =
|
2019-09-24 18:38:12 +02:00
|
|
|
string "q" *> blank *> insideQ <* string "Q"
|
2019-09-23 23:19:27 +02:00
|
|
|
where
|
2019-09-24 18:38:12 +02:00
|
|
|
insideQ = join <$> (command <|> page `sepBy` blank )
|
2019-09-25 18:42:34 +02:00
|
|
|
command = a stateOperator *> return []
|
2019-09-23 23:19:27 +02:00
|
|
|
|
2019-09-24 18:38:12 +02:00
|
|
|
text :: ParserWithFont [ByteString]
|
2019-09-25 18:42:34 +02:00
|
|
|
text =
|
|
|
|
string "BT" *> blank *> commands <* blank <* string "ET"
|
|
|
|
where
|
|
|
|
commands = join <$> (a textOperator >>= runOperator) `sepBy` blank
|
|
|
|
|
|
|
|
runOperator :: Call TextOperator -> ParserWithFont [ByteString]
|
|
|
|
runOperator (Tf, [Typed (NameObject fontName), _]) =
|
|
|
|
asks (! fontName) >>= put >> return []
|
|
|
|
|
|
|
|
runOperator (Tstar, []) = return ["\n"]
|
|
|
|
|
|
|
|
runOperator (TJ, [Typed (Array arrayObject)]) =
|
|
|
|
replicate 1 <$> foldM appendText "" arrayObject
|
|
|
|
where
|
|
|
|
appendText bs (StringObject outputString) =
|
|
|
|
mappend bs <$> decodeString outputString
|
|
|
|
appendText bs _ = return bs
|
|
|
|
|
|
|
|
runOperator (Tj, [Typed (StringObject outputString)]) =
|
|
|
|
replicate 1 <$> decodeString outputString
|
|
|
|
|
|
|
|
runOperator (Quote, [Typed (StringObject outputString)]) =
|
|
|
|
(\bs -> ["\n", bs]) <$> decodeString outputString
|
|
|
|
|
|
|
|
runOperator (DQuote, [Typed (StringObject outputString)]) =
|
|
|
|
(\bs -> ["\n", bs]) <$> decodeString outputString
|
2019-09-24 18:38:12 +02:00
|
|
|
|
2019-09-25 18:42:34 +02:00
|
|
|
runOperator _ = return []
|
2019-09-23 23:19:27 +02:00
|
|
|
|
2019-09-25 18:42:34 +02:00
|
|
|
decodeString :: StringObject -> ParserWithFont ByteString
|
2019-09-26 16:56:13 +02:00
|
|
|
decodeString (Hexadecimal h) = decodeString (Literal (Text.unpack $ parseText h))
|
2019-09-26 15:51:41 +02:00
|
|
|
decodeString (Literal s) =
|
2019-09-26 16:56:13 +02:00
|
|
|
asks
|
2019-09-26 15:51:41 +02:00
|
|
|
where
|
2019-09-26 16:56:13 +02:00
|
|
|
bytes = Char8.pack s
|