Compare commits

..

1 commit
fonts ... main

16 changed files with 137 additions and 1079 deletions

View file

@ -17,28 +17,19 @@ cabal-version: >=1.10
library
exposed-modules: PDF
, PDF.CMap
, PDF.EOL
, PDF.Object
, PDF.Output
, PDF.Pages
, PDF.Parser
, PDF.Text
, PDF.Update
other-modules: Data.ByteString.Char8.Util
, PDF.Encoding
, PDF.Encoding.MacRoman
, PDF.Body
, PDF.Font
, PDF.Parser
-- other-extensions:
build-depends: attoparsec
, base >=4.9 && <4.13
, bytestring
, containers
, mtl
, text
, utf8-string
, zlib
hs-source-dirs: src
ghc-options: -Wall
default-language: Haskell2010
@ -60,15 +51,3 @@ executable getObj
, zlib
ghc-options: -Wall
default-language: Haskell2010
executable getText
main-is: examples/getText.hs
build-depends: base
, bytestring
, containers
, Hufflepdf
, mtl
, text
, zlib
ghc-options: -Wall
default-language: Haskell2010

View file

@ -4,16 +4,17 @@ import Codec.Compression.Zlib (decompress)
import Data.ByteString.Lazy.Char8 (ByteString)
import qualified Data.ByteString.Char8 as BS (readFile)
import qualified Data.ByteString.Lazy.Char8 as Lazy (fromStrict, putStr, toStrict)
import Data.Map ((!))
import qualified Data.Map as Map (lookup)
import Data.Map ((!?))
import qualified Data.Map as Map (keys, lookup)
import PDF (Document(..), parseDocument)
import qualified PDF.EOL as EOL (Style)
import PDF.Object (Content(..), DirectObject(..), Object(..), Name(..))
import PDF.Output (ObjectId(..))
import qualified PDF.Output as Output (render)
import PDF.Update (unify)
import System.Environment (getArgs)
import System.IO (hPutStrLn, stderr)
import System.Environment (getArgs, getProgName)
import System.Exit (die)
import Text.Printf (printf)
display :: EOL.Style -> Object -> ByteString
display eolStyle d@(Direct _) = Output.render eolStyle d
@ -25,16 +26,28 @@ display eolStyle s@(Stream {header, streamContent}) = Output.render eolStyle $
}
_ -> s
extractObject :: ObjectId -> Document -> ByteString
extractObject :: ObjectId -> Document -> Either String ByteString
extractObject objectId (Document {eolStyle, updates}) =
display eolStyle . (!objectId) $ objects content
case objects content !? objectId of
Nothing -> Left $ "No object has ID " ++ show (getObjectId objectId)
Just o -> Right $ display eolStyle o
where
content = unify updates
listObjectIds :: Document -> Either String [String]
listObjectIds =
Right . prependTitle . toString . Map.keys . objects . unify . updates
where
toString = fmap (show . getObjectId)
prependTitle = ("ObjectIds defined in this PDF:":)
main :: IO ()
main = do
[inputFile, objectId] <- getArgs
result <- parseDocument <$> BS.readFile inputFile
case result of
Left parseError -> hPutStrLn stderr parseError
Right doc -> Lazy.putStr $ extractObject (ObjectId (read objectId)) doc
(inputFile, getData) <- parse =<< getArgs
input <- BS.readFile inputFile
either die id $ (parseDocument input >>= getData)
where
parse [inputFile] = return (inputFile, fmap (mapM_ putStrLn) . listObjectIds)
parse [inputFile, objectId] = return
(inputFile, fmap Lazy.putStr . extractObject (ObjectId (read objectId)))
parse _ = die . printf "Syntax: %s inputFile [OBJECT_ID]\n" =<< getProgName

View file

@ -1,38 +0,0 @@
import qualified Data.ByteString.Char8 as BS (readFile)
import qualified Data.Map as Map (toList)
import qualified Data.Text.IO as Text (putStrLn)
import PDF (Document(..), parseDocument)
import PDF.Object (Content)
import PDF.Pages (Page(..), get, getAll)
import PDF.Update (unify)
import System.Environment (getArgs)
import System.Exit (die)
import System.IO (BufferMode(..), hSetBuffering, stdout)
onDoc :: FilePath -> (Content -> Either String a) -> IO a
onDoc inputFile f = do
content <- fmap (unify . updates) . parseDocument <$> BS.readFile inputFile
case content >>= f of
Left someError -> die someError
Right value -> return value
displayPage :: Page -> IO ()
displayPage = mapM_ Text.putStrLn . contents
wholeDoc :: FilePath -> IO ()
wholeDoc inputFile = do
pages <- onDoc inputFile getAll
mapM_ (displayPage . snd) $ Map.toList pages
singlePage :: FilePath -> Int -> IO ()
singlePage inputFile pageNumber =
onDoc inputFile (`get` pageNumber) >>= displayPage
main :: IO ()
main = do
hSetBuffering stdout LineBuffering
args <- getArgs
case args of
[inputFile] -> wholeDoc inputFile
[inputFile, pageNumber] -> singlePage inputFile (read pageNumber)
_ -> die "Syntax: getText INPUT_FILE [PAGE_NUMBER]"

View file

@ -1,91 +1,16 @@
module Data.ByteString.Char8.Util (
B16Int(..)
, B256Int(..)
, b8ToInt
, b16ToBytes
, b16ToInt
, b256ToInt
, intToB256
, previous
previous
, subBS
, toBytes
, unescape
, utf16BEToutf8
) where
import Data.ByteString (ByteString, snoc)
import qualified Data.ByteString as BS (empty, foldl, length, pack, singleton, splitAt)
import qualified Data.ByteString.Char8 as Char8 (
cons, drop, index, splitAt, take, uncons, unpack
)
import Data.Text (Text)
import Data.Text.Encoding (decodeUtf16BE)
import Data.ByteString.Char8 (ByteString)
import qualified Data.ByteString.Char8 as BS (drop, index, take)
import Prelude hiding (length)
import Text.Printf (printf)
newtype B8Int = B8Int ByteString deriving Show
newtype B16Int = B16Int ByteString deriving Show
newtype B256Int = B256Int ByteString deriving Show
previous :: Char -> Int -> ByteString -> Int
previous char position byteString
| Char8.index byteString position == char = position
| BS.index byteString position == char = position
| otherwise = previous char (position - 1) byteString
subBS :: Int -> Int -> ByteString -> ByteString
subBS offset length = Char8.take length . Char8.drop offset
intToB256 :: Int -> B256Int
intToB256 n
| n < 0x100 = B256Int . BS.singleton $ toEnum n
| otherwise =
let B256Int begining = intToB256 (n `div` 0x100) in
B256Int $ begining `snoc` (toEnum (n `mod` 0x100))
b256ToInt :: B256Int -> Int
b256ToInt (B256Int n) = BS.foldl (\k w -> 0x100*k + fromEnum w) 0 n
toBytes :: Int -> Int -> ByteString
toBytes 0 _ = BS.empty
toBytes size n =
(toBytes (size - 1) (n `div` 0x100)) `snoc` (toEnum (n `mod` 0x100))
b16ToBytes :: B16Int -> ByteString
b16ToBytes (B16Int n) = BS.pack . fmap b16ToInt $ pairDigits n
where
pairDigits s =
case BS.length s of
0 -> []
1 -> [B16Int s]
_ ->
let (twoHexDigits, rest) = BS.splitAt 2 s in
(B16Int $ twoHexDigits):(pairDigits rest)
fromBase :: (Num a, Read a) => Char -> ByteString -> a
fromBase b = read . printf "0%c%s" b . Char8.unpack
b16ToInt :: (Num a, Read a) => B16Int -> a
b16ToInt (B16Int n) = fromBase 'x' n
b8ToInt :: (Num a, Read a) => B8Int -> a
b8ToInt (B8Int n) = fromBase 'o' n
unescape :: ByteString -> ByteString
unescape escapedBS =
case Char8.uncons escapedBS of
Nothing -> BS.empty
Just ('\\', s) -> unescapeChar s
Just (c, s) -> Char8.cons c (unescape s)
where
unescapeChar s =
case Char8.uncons s of
Nothing -> BS.empty
Just (c, s')
| c `elem` "()" -> Char8.cons c (unescape s')
| c `elem` "nrtbf" -> Char8.cons (read (printf "'\\%c'" c)) (unescape s')
| c `elem` ['0'..'7'] -> fromOctal (Char8.splitAt 3 s)
| otherwise -> Char8.cons c (unescape s')
fromOctal (code, s) = Char8.cons (toEnum $ b8ToInt (B8Int code)) (unescape s)
utf16BEToutf8 :: ByteString -> Text
utf16BEToutf8 = decodeUtf16BE
subBS offset length = BS.take length . BS.drop offset

View file

@ -21,7 +21,7 @@ import PDF.Object (
)
import qualified PDF.Output as Output (render, line)
import PDF.Output (Output(..))
import PDF.Parser (Parser, evalParser, string, takeAll)
import PDF.Parser (Parser, runParser, string, takeAll)
import Text.Printf (printf)
data Document = Document {
@ -83,7 +83,7 @@ findNextSection offset input =
readStructures :: Int -> ByteString -> Either String [InputStructure]
readStructures startXref input =
evalParser structure () (BS.drop startXref input) >>= stopOrFollow
runParser structure () (BS.drop startXref input) >>= stopOrFollow
where
stopOrFollow s@(Structure {trailer}) =
case Map.lookup (Name "Prev") trailer of
@ -96,7 +96,7 @@ readStructures startXref input =
parseDocument :: ByteString -> Either String Document
parseDocument input = do
(pdfVersion, eolStyle) <- evalParser ((,) <$> version <*> EOL.parser) () input
(pdfVersion, eolStyle) <- runParser ((,) <$> version <*> EOL.parser) () input
startXref <- readStartXref eolStyle input
structuresRead <- readStructures startXref input
let updates = populate input <$> structuresRead

View file

@ -6,7 +6,6 @@ module PDF.Body (
import Control.Applicative ((<|>))
import Control.Monad.State (get, gets, modify)
import Data.Attoparsec.ByteString.Char8 (option)
import Data.ByteString.Char8 (ByteString)
import qualified Data.ByteString.Char8 as BS (cons, drop, unpack)
import Data.Map ((!))
@ -19,7 +18,7 @@ import PDF.Object (
, blank, dictionary, directObject, integer, line
)
import PDF.Output (ObjectId(..), Offset(..))
import PDF.Parser (Parser, (<?>), block, char, evalParser, on, takeAll)
import PDF.Parser (Parser, (<?>), block, char, on, option, runParser, takeAll)
data UserState = UserState {
input :: ByteString
@ -110,7 +109,7 @@ occurrence =
populate :: ByteString -> InputStructure -> Content
populate input structure =
let bodyInput = BS.drop (startOffset structure) input in
case evalParser recurseOnOccurrences initialState bodyInput of
case runParser recurseOnOccurrences initialState bodyInput of
Left _ -> Content {occurrences = [], objects = Map.empty, docStructure}
Right finalState ->
let Flow {occurrencesStack, tmpObjects} = flow finalState in

View file

@ -1,159 +0,0 @@
{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE TypeSynonymInstances #-}
module PDF.CMap (
CMap
, CMappers
, CRange(..)
, cMap
, emptyCMap
, matches
) where
import Control.Applicative ((<|>), many)
import Control.Monad.State (modify)
import Data.Attoparsec.ByteString.Char8 (count)
import Data.ByteString (ByteString)
import qualified Data.ByteString as BS (drop, length, null, take)
import Data.ByteString.Char8 (unpack)
import Data.ByteString.Char8.Util (
B16Int(..), b16ToBytes, b16ToInt, toBytes, utf16BEToutf8
)
import Data.Map (Map, union)
import qualified Data.Map as Map (
adjust, empty, fromList, insertWith, lookup, toList
)
import Data.Text (Text)
import qualified PDF.EOL as EOL (charset, parser)
import PDF.Font (Font)
import PDF.Object (
DirectObject(..), Name, StringObject(..)
, blank, directObject, integer, line, stringObject
)
import PDF.Parser (MonadParser, Parser, runParser, takeAll)
type CMappers = Map Name CMap
type Mapping = Map ByteString Text
data CRange = CRange {
fromSequence :: ByteString
, toSequence :: ByteString
, mapping :: Mapping
} deriving Show
type RangeSize = Int
type CMap = Map RangeSize [CRange]
toFont :: CMap -> Font
toFont aCMap input
| BS.null input = Right ""
| otherwise = do
(output, remainingInput) <- trySizes input $ Map.toList aCMap
mappend output <$> toFont aCMap remainingInput
where
trySizes s [] = Left $ "No matching code found in font for " ++ unpack s
trySizes s ((size, cRanges):others) =
let prefix = BS.take size s in
case tryRanges prefix cRanges of
Nothing -> trySizes s others
Just outputSequence -> Right (outputSequence, BS.drop size s)
tryRanges :: ByteString -> [CRange] -> Maybe Text
tryRanges _ [] = Nothing
tryRanges prefix ((CRange {mapping}):cRanges) =
case Map.lookup prefix mapping of
Nothing -> tryRanges prefix cRanges
outputSequence -> outputSequence
emptyCMap :: CMap
emptyCMap = Map.empty
matches :: ByteString -> CRange -> Bool
matches code (CRange {fromSequence, toSequence}) =
fromSequence <= code && code <= toSequence
cMap :: ByteString -> Either String Font
cMap = fmap (toFont . snd) <$> runParser
(many (codeRanges <|> cMapRange <|> cMapChar <|> ignoredLine))
emptyCMap
where
ignoredLine =
takeAll (not . (`elem` EOL.charset)) *> EOL.parser *> return ()
codeRanges :: Parser CMap ()
codeRanges = do
size <- integer <* line "begincodespacerange"
mapM_ createMapping =<< count size codeRange
line "endcodespacerange"
where
codeRange =
(,) <$> stringObject <* blank <*> stringObject <* EOL.parser
createMapping :: (StringObject, StringObject) -> Parser CMap ()
createMapping (Hexadecimal from, Hexadecimal to) = modify $
Map.insertWith (++) size [CRange {fromSequence, toSequence, mapping}]
where
fromSequence = b16ToBytes from
size = BS.length fromSequence
toSequence = b16ToBytes to
mapping = Map.empty
createMapping _ = return ()
cMapRange :: Parser CMap ()
cMapRange = do
size <- integer <* line "beginbfrange"
mapM_ saveMapping =<< count size rangeMapping
line "endbfrange"
where
rangeMapping = (,,)
<$> (stringObject <* blank)
<*> (stringObject <* blank)
<*> directObject <* EOL.parser
>>= mapFromTo
saveMapping :: [(ByteString, Text)] -> Parser CMap ()
saveMapping [] = return ()
saveMapping assoc@((code, _):_) = modify $ Map.adjust insertCRange mappingSize
where
newMapping = Map.fromList assoc
mappingSize = BS.length code
appendMapping cRange =
cRange {mapping = mapping cRange `union` newMapping}
insertCRange = fmap (\cRange ->
if code `matches` cRange then appendMapping cRange else cRange
)
cMapChar :: Parser CMap ()
cMapChar = do
size <- integer <* line "beginbfchar"
saveMapping =<< count size charMapping <* line "endbfchar"
where
charMapping =
(,) <$> stringObject <* blank <*> stringObject <* EOL.parser
>>= pairMapping
between :: B16Int -> B16Int -> [ByteString]
between from@(B16Int s) to =
let size = BS.length s `div` 2 in
toBytes size <$> [b16ToInt from .. b16ToInt to]
startFrom :: B16Int -> [ByteString]
startFrom from@(B16Int s) =
let size = BS.length s `div` 2 in
toBytes size <$> [b16ToInt from .. ]
mapFromTo :: MonadParser m => (StringObject, StringObject, DirectObject) -> m [(ByteString, Text)]
mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) =
return $ zip (between from to) (utf16BEToutf8 <$> startFrom dstFrom)
mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) =
zip (between from to) <$> (mapM dstByteString dstPoints)
where
dstByteString (StringObject (Hexadecimal dst)) =
return . utf16BEToutf8 $ b16ToBytes dst
dstByteString _ = fail "Invalid for a replacement string"
mapFromTo _ = fail "invalid range mapping found"
pairMapping :: MonadParser m => (StringObject, StringObject) -> m (ByteString, Text)
pairMapping (Hexadecimal from, Hexadecimal to) =
return (b16ToBytes from, utf16BEToutf8 $ b16ToBytes to)
pairMapping _ = fail "invalid pair mapping found"

View file

@ -6,14 +6,14 @@ module PDF.EOL (
) where
import Control.Applicative ((<|>))
import PDF.Parser (MonadParser, string)
import PDF.Parser (Parser, string)
data Style = CR | LF | CRLF deriving Show
charset :: String
charset = "\r\n"
parser :: MonadParser m => m Style
parser :: Parser s Style
parser =
(string "\r\n" >> return CRLF)
<|> (string "\r" >> return CR)

View file

@ -1,10 +0,0 @@
module PDF.Encoding (
encoding
) where
import PDF.Encoding.MacRoman (macRomanEncoding)
import PDF.Font (Font)
encoding :: String -> Either String Font
encoding "MacRomanEncoding" = Right macRomanEncoding
encoding s = Left $ "Unknown encoding " ++ s

View file

@ -1,141 +0,0 @@
module PDF.Encoding.MacRoman (
macRomanEncoding
) where
import Data.ByteString.Char8 (unpack)
import Data.Text (pack)
import PDF.Font (Font)
macRomanEncoding :: Font
macRomanEncoding = Right . pack . fmap decode . unpack
decode :: Char -> Char
decode '\x80' = '\x00C4' -- LATIN CAPITAL LETTER A WITH DIAERESIS
decode '\x81' = '\x00C5' -- LATIN CAPITAL LETTER A WITH RING ABOVE
decode '\x82' = '\x00C7' -- LATIN CAPITAL LETTER C WITH CEDILLA
decode '\x83' = '\x00C9' -- LATIN CAPITAL LETTER E WITH ACUTE
decode '\x84' = '\x00D1' -- LATIN CAPITAL LETTER N WITH TILDE
decode '\x85' = '\x00D6' -- LATIN CAPITAL LETTER O WITH DIAERESIS
decode '\x86' = '\x00DC' -- LATIN CAPITAL LETTER U WITH DIAERESIS
decode '\x87' = '\x00E1' -- LATIN SMALL LETTER A WITH ACUTE
decode '\x88' = '\x00E0' -- LATIN SMALL LETTER A WITH GRAVE
decode '\x89' = '\x00E2' -- LATIN SMALL LETTER A WITH CIRCUMFLEX
decode '\x8A' = '\x00E4' -- LATIN SMALL LETTER A WITH DIAERESIS
decode '\x8B' = '\x00E3' -- LATIN SMALL LETTER A WITH TILDE
decode '\x8C' = '\x00E5' -- LATIN SMALL LETTER A WITH RING ABOVE
decode '\x8D' = '\x00E7' -- LATIN SMALL LETTER C WITH CEDILLA
decode '\x8E' = '\x00E9' -- LATIN SMALL LETTER E WITH ACUTE
decode '\x8F' = '\x00E8' -- LATIN SMALL LETTER E WITH GRAVE
decode '\x90' = '\x00EA' -- LATIN SMALL LETTER E WITH CIRCUMFLEX
decode '\x91' = '\x00EB' -- LATIN SMALL LETTER E WITH DIAERESIS
decode '\x92' = '\x00ED' -- LATIN SMALL LETTER I WITH ACUTE
decode '\x93' = '\x00EC' -- LATIN SMALL LETTER I WITH GRAVE
decode '\x94' = '\x00EE' -- LATIN SMALL LETTER I WITH CIRCUMFLEX
decode '\x95' = '\x00EF' -- LATIN SMALL LETTER I WITH DIAERESIS
decode '\x96' = '\x00F1' -- LATIN SMALL LETTER N WITH TILDE
decode '\x97' = '\x00F3' -- LATIN SMALL LETTER O WITH ACUTE
decode '\x98' = '\x00F2' -- LATIN SMALL LETTER O WITH GRAVE
decode '\x99' = '\x00F4' -- LATIN SMALL LETTER O WITH CIRCUMFLEX
decode '\x9A' = '\x00F6' -- LATIN SMALL LETTER O WITH DIAERESIS
decode '\x9B' = '\x00F5' -- LATIN SMALL LETTER O WITH TILDE
decode '\x9C' = '\x00FA' -- LATIN SMALL LETTER U WITH ACUTE
decode '\x9D' = '\x00F9' -- LATIN SMALL LETTER U WITH GRAVE
decode '\x9E' = '\x00FB' -- LATIN SMALL LETTER U WITH CIRCUMFLEX
decode '\x9F' = '\x00FC' -- LATIN SMALL LETTER U WITH DIAERESIS
decode '\xA0' = '\x2020' -- DAGGER
decode '\xA1' = '\x00B0' -- DEGREE SIGN
decode '\xA2' = '\x00A2' -- CENT SIGN
decode '\xA3' = '\x00A3' -- POUND SIGN
decode '\xA4' = '\x00A7' -- SECTION SIGN
decode '\xA5' = '\x2022' -- BULLET
decode '\xA6' = '\x00B6' -- PILCROW SIGN
decode '\xA7' = '\x00DF' -- LATIN SMALL LETTER SHARP S
decode '\xA8' = '\x00AE' -- REGISTERED SIGN
decode '\xA9' = '\x00A9' -- COPYRIGHT SIGN
decode '\xAA' = '\x2122' -- TRADE MARK SIGN
decode '\xAB' = '\x00B4' -- ACUTE ACCENT
decode '\xAC' = '\x00A8' -- DIAERESIS
decode '\xAD' = '\x2260' -- NOT EQUAL TO
decode '\xAE' = '\x00C6' -- LATIN CAPITAL LETTER AE
decode '\xAF' = '\x00D8' -- LATIN CAPITAL LETTER O WITH STROKE
decode '\xB0' = '\x221E' -- INFINITY
decode '\xB1' = '\x00B1' -- PLUS-MINUS SIGN
decode '\xB2' = '\x2264' -- LESS-THAN OR EQUAL TO
decode '\xB3' = '\x2265' -- GREATER-THAN OR EQUAL TO
decode '\xB4' = '\x00A5' -- YEN SIGN
decode '\xB5' = '\x00B5' -- MICRO SIGN
decode '\xB6' = '\x2202' -- PARTIAL DIFFERENTIAL
decode '\xB7' = '\x2211' -- N-ARY SUMMATION
decode '\xB8' = '\x220F' -- N-ARY PRODUCT
decode '\xB9' = '\x03C0' -- GREEK SMALL LETTER PI
decode '\xBA' = '\x222B' -- INTEGRAL
decode '\xBB' = '\x00AA' -- FEMININE ORDINAL INDICATOR
decode '\xBC' = '\x00BA' -- MASCULINE ORDINAL INDICATOR
decode '\xBD' = '\x03A9' -- GREEK CAPITAL LETTER OMEGA
decode '\xBE' = '\x00E6' -- LATIN SMALL LETTER AE
decode '\xBF' = '\x00F8' -- LATIN SMALL LETTER O WITH STROKE
decode '\xC0' = '\x00BF' -- INVERTED QUESTION MARK
decode '\xC1' = '\x00A1' -- INVERTED EXCLAMATION MARK
decode '\xC2' = '\x00AC' -- NOT SIGN
decode '\xC3' = '\x221A' -- SQUARE ROOT
decode '\xC4' = '\x0192' -- LATIN SMALL LETTER F WITH HOOK
decode '\xC5' = '\x2248' -- ALMOST EQUAL TO
decode '\xC6' = '\x2206' -- INCREMENT
decode '\xC7' = '\x00AB' -- LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
decode '\xC8' = '\x00BB' -- RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
decode '\xC9' = '\x2026' -- HORIZONTAL ELLIPSIS
decode '\xCA' = '\x00A0' -- NO-BREAK SPACE
decode '\xCB' = '\x00C0' -- LATIN CAPITAL LETTER A WITH GRAVE
decode '\xCC' = '\x00C3' -- LATIN CAPITAL LETTER A WITH TILDE
decode '\xCD' = '\x00D5' -- LATIN CAPITAL LETTER O WITH TILDE
decode '\xCE' = '\x0152' -- LATIN CAPITAL LIGATURE OE
decode '\xCF' = '\x0153' -- LATIN SMALL LIGATURE OE
decode '\xD0' = '\x2013' -- EN DASH
decode '\xD1' = '\x2014' -- EM DASH
decode '\xD2' = '\x201C' -- LEFT DOUBLE QUOTATION MARK
decode '\xD3' = '\x201D' -- RIGHT DOUBLE QUOTATION MARK
decode '\xD4' = '\x2018' -- LEFT SINGLE QUOTATION MARK
decode '\xD5' = '\x2019' -- RIGHT SINGLE QUOTATION MARK
decode '\xD6' = '\x00F7' -- DIVISION SIGN
decode '\xD7' = '\x25CA' -- LOZENGE
decode '\xD8' = '\x00FF' -- LATIN SMALL LETTER Y WITH DIAERESIS
decode '\xD9' = '\x0178' -- LATIN CAPITAL LETTER Y WITH DIAERESIS
decode '\xDA' = '\x2044' -- FRACTION SLASH
decode '\xDB' = '\x20AC' -- EURO SIGN
decode '\xDC' = '\x2039' -- SINGLE LEFT-POINTING ANGLE QUOTATION MARK
decode '\xDD' = '\x203A' -- SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
decode '\xDE' = '\xFB01' -- LATIN SMALL LIGATURE FI
decode '\xDF' = '\xFB02' -- LATIN SMALL LIGATURE FL
decode '\xE0' = '\x2021' -- DOUBLE DAGGER
decode '\xE1' = '\x00B7' -- MIDDLE DOT
decode '\xE2' = '\x201A' -- SINGLE LOW-9 QUOTATION MARK
decode '\xE3' = '\x201E' -- DOUBLE LOW-9 QUOTATION MARK
decode '\xE4' = '\x2030' -- PER MILLE SIGN
decode '\xE5' = '\x00C2' -- LATIN CAPITAL LETTER A WITH CIRCUMFLEX
decode '\xE6' = '\x00CA' -- LATIN CAPITAL LETTER E WITH CIRCUMFLEX
decode '\xE7' = '\x00C1' -- LATIN CAPITAL LETTER A WITH ACUTE
decode '\xE8' = '\x00CB' -- LATIN CAPITAL LETTER E WITH DIAERESIS
decode '\xE9' = '\x00C8' -- LATIN CAPITAL LETTER E WITH GRAVE
decode '\xEA' = '\x00CD' -- LATIN CAPITAL LETTER I WITH ACUTE
decode '\xEB' = '\x00CE' -- LATIN CAPITAL LETTER I WITH CIRCUMFLEX
decode '\xEC' = '\x00CF' -- LATIN CAPITAL LETTER I WITH DIAERESIS
decode '\xED' = '\x00CC' -- LATIN CAPITAL LETTER I WITH GRAVE
decode '\xEE' = '\x00D3' -- LATIN CAPITAL LETTER O WITH ACUTE
decode '\xEF' = '\x00D4' -- LATIN CAPITAL LETTER O WITH CIRCUMFLEX
decode '\xF0' = '\xF8FF' -- Apple logo
decode '\xF1' = '\x00D2' -- LATIN CAPITAL LETTER O WITH GRAVE
decode '\xF2' = '\x00DA' -- LATIN CAPITAL LETTER U WITH ACUTE
decode '\xF3' = '\x00DB' -- LATIN CAPITAL LETTER U WITH CIRCUMFLEX
decode '\xF4' = '\x00D9' -- LATIN CAPITAL LETTER U WITH GRAVE
decode '\xF5' = '\x0131' -- LATIN SMALL LETTER DOTLESS I
decode '\xF6' = '\x02C6' -- MODIFIER LETTER CIRCUMFLEX ACCENT
decode '\xF7' = '\x02DC' -- SMALL TILDE
decode '\xF8' = '\x00AF' -- MACRON
decode '\xF9' = '\x02D8' -- BREVE
decode '\xFA' = '\x02D9' -- DOT ABOVE
decode '\xFB' = '\x02DA' -- RING ABOVE
decode '\xFC' = '\x00B8' -- CEDILLA
decode '\xFD' = '\x02DD' -- DOUBLE ACUTE ACCENT
decode '\xFE' = '\x02DB' -- OGONEK
decode '\xFF' = '\x02C7' -- CARON
decode c = c -- The rest is ASCII

View file

@ -1,16 +0,0 @@
module PDF.Font (
Font
, FontSet
, emptyFont
) where
import Data.ByteString (ByteString)
import Data.Map (Map)
import Data.Text (Text)
import PDF.Object (Name)
type Font = ByteString -> Either String Text
type FontSet = Map Name Font
emptyFont :: Font
emptyFont _ = Left "No fond loaded"

View file

@ -3,7 +3,6 @@
{-# LANGUAGE FlexibleInstances #-}
module PDF.Object (
Content(..)
, Dictionary
, DirectObject(..)
, Flow(..)
, IndexedObjects
@ -13,11 +12,9 @@ module PDF.Object (
, Number(..)
, Object(..)
, Occurrence(..)
, StringObject(..)
, Structure(..)
, XRefEntry(..)
, XRefSection
, array
, blank
, dictionary
, directObject
@ -25,20 +22,14 @@ module PDF.Object (
, integer
, line
, magicNumber
, name
, number
, regular
, stringObject
, structure
, toByteString
) where
import Control.Applicative ((<|>), many)
import Data.Attoparsec.ByteString.Char8 (choice, count, option, sepBy)
import Data.ByteString (ByteString)
import qualified Data.ByteString as BS (concat)
import qualified Data.ByteString.Char8 as Char8 (cons, pack, singleton, unpack)
import Data.ByteString.Char8.Util (B16Int(..), b16ToBytes, unescape)
import Control.Applicative ((<|>))
import Data.ByteString.Char8 (ByteString)
import qualified Data.ByteString.Char8 as BS (
concat, cons, pack, singleton, unpack
)
import Data.Map (Map, (!), mapWithKey)
import qualified Data.Map as Map (
delete, empty, fromList, lookup, minViewWithKey, toList, union
@ -50,11 +41,15 @@ import PDF.Output (
, byteString, getObjectId, getOffset, getOffsets, join, newLine
, saveOffset
)
import PDF.Parser (MonadParser(..), Parser, (<?>), octDigit, oneOf)
import PDF.Parser (
Parser, (<?>)
, char, choice, count, decNumber, hexNumber, many, octDigit, oneOf, option
, sepBy, string, takeAll, takeAll1
)
import Text.Printf (printf)
line :: MonadParser m => String -> m ()
line l = (string (Char8.pack l) *> blank *> return ()) <?> printf "line «%s»" l
line :: String -> Parser u ()
line l = (string (BS.pack l) *> blank *> return ()) <?> printf "line «%s»" l
magicNumber :: ByteString
magicNumber = "%PDF-"
@ -65,8 +60,8 @@ eofMarker = "%%EOF"
whiteSpaceCharset :: String
whiteSpaceCharset = "\0\t\12 "
blank :: MonadParser m => m ()
blank = takeAll (`elem` (EOL.charset ++ whiteSpaceCharset)) *> pure ()
blank :: Parser u ()
blank = takeAll (`elem` (EOL.charset ++ whiteSpaceCharset)) *> return ()
delimiterCharset :: String
delimiterCharset = "()<>[]{}/%"
@ -74,8 +69,8 @@ delimiterCharset = "()<>[]{}/%"
regular :: Char -> Bool
regular = not . (`elem` (EOL.charset ++ whiteSpaceCharset ++ delimiterCharset))
integer :: (Read a, Num a, MonadParser m) => m a
integer = read . Char8.unpack <$> decNumber <* blank <?> "decimal integer"
integer :: (Read a, Num a) => Parser u a
integer = read . BS.unpack <$> decNumber <* blank <?> "decimal integer"
-------------------------------------
-- OBJECTS
@ -86,7 +81,7 @@ type IndexedObjects = Map ObjectId Object
--
-- Boolean
--
boolean :: MonadParser m => m Bool
boolean :: Parser u Bool
boolean =
(string "true" *> return True) <|> (string "false" *> return False) <?> "boolean"
@ -101,42 +96,38 @@ instance Output Number where
(n, 0) -> printf "%d" (n :: Int)
_ -> printf "%f" f
number :: MonadParser m => m Number
number = Number . read . Char8.unpack <$>
(mappend <$> sign <*> (integerPart <|> Char8.cons '0' <$> floatPart))
number :: Parser u Number
number = Number . read . BS.unpack <$>
(mappend <$> sign <*> (integerPart <|> BS.cons '0' <$> floatPart))
<?> "number"
where
sign = string "-" <|> option "" (char '+' >> return "")
integerPart = mappend <$> decNumber <*> option "" floatPart
floatPart = Char8.cons <$> char '.' <*> (option "0" $ decNumber)
floatPart = BS.cons <$> char '.' <*> (option "0" $ decNumber)
--
-- StringObject
--
data StringObject = Literal ByteString | Hexadecimal B16Int deriving Show
data StringObject = Literal String | Hexadecimal String deriving Show
instance Output StringObject where
output (Literal s) = Output.string (printf "(%s)" (Char8.unpack s))
output (Hexadecimal (B16Int n)) = Output.string (printf "<%s>" (Char8.unpack n))
output (Literal s) = Output.string (printf "(%s)" s)
output (Hexadecimal s) = Output.string (printf "<%s>" s)
stringObject :: MonadParser m => m StringObject
stringObject :: Parser u StringObject
stringObject =
Literal <$> (char '(' *> (BS.concat <$> literalString) <* char ')')
<|> Hexadecimal <$> (char '<' *> hexNumber <* char '>')
Literal . BS.unpack <$> (char '(' *> (BS.concat <$> literalString) <* char ')')
<|> Hexadecimal . BS.unpack <$> (char '<' *> hexNumber <* char '>')
<?> "string object (literal or hexadecimal)"
where
literalString = many literalStringBlock
literalStringBlock = takeAll1 normalChar <|> matchingParenthesis <|> escapedChar
normalChar = not . (`elem` ("\\()" :: String))
matchingParenthesis =
mappend <$> (Char8.cons <$> char '(' <*> literalStringBlock) <*> string ")"
mappend <$> (BS.cons <$> char '(' <*> literalStringBlock) <*> string ")"
escapedChar =
Char8.cons <$> char '\\' <*> (Char8.singleton <$> oneOf "nrtbf()\\\n" <|> octalCode)
octalCode = choice $ (\n -> Char8.pack <$> count n octDigit) <$> [1..3]
toByteString :: StringObject -> ByteString
toByteString (Hexadecimal h) = b16ToBytes h
toByteString (Literal s) = unescape s
BS.cons <$> char '\\' <*> (BS.singleton <$> oneOf "nrtbf()\\" <|> octalCode)
octalCode = choice $ (\n -> BS.pack <$> count n octDigit) <$> [1..3]
--
-- Name
@ -146,13 +137,13 @@ newtype Name = Name String deriving (Eq, Ord, Show)
instance Output Name where
output (Name n) = Output.string ('/':n)
name :: MonadParser m => m Name
name = Name . Char8.unpack <$> (char '/' *> takeAll regular) <?> "name"
name :: Parser u Name
name = Name . BS.unpack <$> (char '/' *> takeAll regular) <?> "name"
--
-- Array
--
array :: MonadParser m => m [DirectObject]
array :: Parser u [DirectObject]
array =
char '[' *> blank *> directObject `sepBy` blank <* blank <* char ']' <?> "array"
@ -169,7 +160,7 @@ instance Output Dictionary where
outputKeyVal :: (Name, DirectObject) -> OBuilder
outputKeyVal (key, val) = Output.concat [output key, " ", output val]
dictionary :: MonadParser m => m Dictionary
dictionary :: Parser u Dictionary
dictionary =
string "<<" *> blank *> keyValPairs <* string ">>" <?> "dictionary"
where
@ -179,7 +170,7 @@ dictionary =
--
-- Null
--
nullObject :: MonadParser m => m ()
nullObject :: Parser u ()
nullObject = string "null" *> return () <?> "null object"
--
@ -190,7 +181,7 @@ data IndirectObjCoordinates = IndirectObjCoordinates {
, versionNumber :: Int
} deriving Show
reference :: MonadParser m => m IndirectObjCoordinates
reference :: Parser u IndirectObjCoordinates
reference = IndirectObjCoordinates
<$> (fmap ObjectId integer) <*> integer <* char 'R' <?> "reference to an object"
@ -219,7 +210,7 @@ instance Output DirectObject where
output (Reference (IndirectObjCoordinates {objectId, versionNumber})) =
Output.string (printf "%d %d R" (getObjectId objectId) versionNumber)
directObject :: MonadParser m => m DirectObject
directObject :: Parser u DirectObject
directObject =
Boolean <$> boolean
<|> Reference <$> reference {- defined before Number because Number is a prefix of it -}

View file

@ -116,7 +116,7 @@ char :: Char -> OBuilder
char c = lift char8 c <* offset (+1)
string :: String -> OBuilder
string s = lift string8 s <* offset (+ length s)
string s = lift string8 s <* offset (+ toEnum (length s))
line :: String -> OBuilder
line l = string l `mappend` newLine

View file

@ -1,173 +0,0 @@
{-# LANGUAGE NamedFieldPuns #-}
module PDF.Pages (
Page(..)
, get
, getAll
) where
import Codec.Compression.Zlib (decompress)
import Control.Applicative ((<|>))
import Control.Monad (foldM)
import Control.Monad.RWS (RWST(..), ask, evalRWST, lift, modify)
import qualified Control.Monad.RWS as RWS (get)
import Data.ByteString (ByteString)
import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict)
import Data.Map (Map, (!))
import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
import Data.Text (Text)
import PDF.CMap (cMap)
import PDF.Encoding (encoding)
import PDF.Font (Font, FontSet)
import PDF.Object (
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
, Object(..), Name(..), Structure(..)
,)
import PDF.Output (ObjectId(..))
import PDF.Text (pageContents)
import Text.Printf (printf)
type CachedFonts = Map ObjectId Font
type T = RWST Content () CachedFonts (Either String)
data Page = Page {
contents :: [Text]
, source :: ObjectId
}
infixl 1 \\=
(\\=) :: T a -> (a -> Either String b) -> T b
x \\= f = x >>= lift . f
infixl 1 //=
(//=) :: Either String a -> (a -> T b) -> T b
x //= f = lift x >>= f
expected :: Show a => String -> a -> Either String b
expected name = Left . printf "Not a %s: %s" name . show
stream :: Object -> Either String ByteString
stream (Stream {header, streamContent}) = Right $
case Map.lookup (Name "Filter") header of
Just (NameObject (Name "FlateDecode")) ->
Lazy.toStrict . decompress . Lazy.fromStrict $ streamContent
_ -> streamContent
stream obj = expected "stream" obj
getResource :: DirectObject -> T Dictionary
getResource (Dictionary dictionary) = return dictionary
getResource (Reference (IndirectObjCoordinates {objectId})) =
getObject objectId \\= dict
getResource directObject =
lift $ expected "resource (dictionary or reference)" directObject
getFontDictionary :: Dictionary -> T Dictionary
getFontDictionary pageDict =
key "Resources" pageDict
//= getResource
\\= key "Font"
>>= getResource
cache :: (ObjectId -> T Font) -> ObjectId -> T Font
cache loader objectId =
(maybe load return . Map.lookup objectId) =<< RWS.get
where
load = do
value <- loader objectId
modify $ Map.insert objectId value
return value
loadFont :: ObjectId -> T Font
loadFont objectId = getObject objectId \\= dict >>= tryMappings
where
tryMappings dictionary =
loadCMap dictionary
<|> lift (key "Encoding" dictionary >>= loadEncoding)
<|> lift (Left $ unknownFormat (show objectId) (show dictionary))
unknownFormat = printf "Unknown font format for object #%s : %s"
loadCMap dictionary =
key "ToUnicode" dictionary //= follow \\= stream \\= cMap
loadEncoding (NameObject (Name name)) = encoding name
loadEncoding directObject =
Left . printf "Encoding must be a name, not that : %s" $ show directObject
loadFonts :: Dictionary -> T FontSet
loadFonts = foldM addFont Map.empty . Map.toList
where
addFont :: FontSet -> (Name, DirectObject) -> T FontSet
addFont output (name, Reference (IndirectObjCoordinates {objectId})) =
flip (Map.insert name) output <$> cache loadFont objectId
addFont output _ = return output
getObject :: ObjectId -> T Object
getObject objectId = do
content <- ask
return (objects content ! objectId)
key :: String -> Dictionary -> Either String DirectObject
key keyName dictionary =
maybe (Left errorMessage) Right (Map.lookup (Name keyName) dictionary)
where
errorMessage =
printf "Key %s not found in dictionary %s" keyName (show dictionary)
target :: DirectObject -> Either String ObjectId
target (Reference (IndirectObjCoordinates {objectId})) = Right objectId
target directObject = expected "reference" directObject
many :: DirectObject -> [DirectObject]
many (Array l) = l
many directObject = [directObject]
follow :: DirectObject -> T Object
follow directObject = target directObject //= getObject
dict :: Object -> Either String Dictionary
dict (Direct (Dictionary dictionary)) = Right dictionary
dict obj = expected "dictionary" obj
dictObject :: String -> Dictionary -> T Dictionary
dictObject keyName dictionary = key keyName dictionary //= follow \\= dict
pagesList :: T [ObjectId]
pagesList = do
root <- dictObject "Root" . trailer . docStructure =<< ask
pages <- dictObject "Pages" root
case Map.lookup (Name "Kids") pages of
Just (Array kids) -> return $ getReferences kids
_ -> return []
getReferences :: [DirectObject] -> [ObjectId]
getReferences objects = do
object <- objects
case object of
Reference (IndirectObjCoordinates {objectId}) -> [objectId]
_ -> []
extractText :: Object -> T [Text]
extractText object = do
pageDict <- lift $ dict object
fonts <- loadFonts =<< getFontDictionary pageDict
let objects = ((many <$> (key "Contents" pageDict)) :: Either String [DirectObject])
concat <$> (objects //= (mapM $ loadContent fonts))
where
loadContent :: FontSet -> DirectObject -> T [Text]
loadContent fonts directObject =
follow directObject \\= stream \\= pageContents fonts
loadPage :: ObjectId -> T Page
loadPage source =
(\contents -> Page {contents, source}) <$> (extractText =<< getObject source)
getAll :: Content -> Either String (Map Int Page)
getAll content = fst <$> evalRWST getPages content Map.empty
where
numbered = Map.fromList . zip [1..]
getPages = numbered <$> (mapM loadPage =<< pagesList)
get :: Content -> Int -> Either String Page
get content pageNumber
| pageNumber < 1 = Left "Pages start at 1"
| otherwise = fst <$> evalRWST getPage content Map.empty
where
firstPage [] = lift $ Left "Page is out of bounds"
firstPage (p:_) = loadPage p
getPage = drop (pageNumber - 1) <$> pagesList >>= firstPage

View file

@ -1,72 +1,56 @@
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE UndecidableInstances #-}
module PDF.Parser (
MonadParser(..)
, Parser
Parser
, (<?>)
, block
, char
, choice
, count
, decNumber
, hexNumber
, many
, octDigit
, on
, oneOf
, option
, runParser
, evalParser
, sepBy
, string
, takeAll
, takeAll1
) where
import Control.Applicative (Alternative, (<|>))
import Control.Monad (MonadPlus)
import Control.Monad.Fail (MonadFail(..))
import Control.Applicative ((<|>), empty)
import Control.Monad.State (StateT(..), evalStateT)
import Control.Monad.Trans (MonadTrans(..))
import Control.Monad.Trans (lift)
import qualified Data.Attoparsec.ByteString.Char8 as Atto (
Parser, char, endOfInput, parseOnly, satisfy, string, take, takeWhile
, takeWhile1
Parser, char, parseOnly, satisfy, string, take, takeWhile, takeWhile1
)
import Data.ByteString (ByteString)
import Data.ByteString.Char8.Util (B16Int(..))
import Data.Char (toLower)
import Data.Set (Set)
import qualified Data.Set as Set (fromList, member, unions)
import Prelude hiding (fail)
type MonadDeps m = (MonadFail m, MonadPlus m)
class MonadDeps m => MonadParser m where
block :: Int -> m ByteString
char :: Char -> m Char
decNumber :: m ByteString
endOfInput :: m ()
hexNumber :: m B16Int
oneOf :: String -> m Char
string :: ByteString -> m ByteString
takeAll :: (Char -> Bool) -> m ByteString
takeAll1 :: (Char -> Bool) -> m ByteString
instance MonadParser Atto.Parser where
block = Atto.take
char = Atto.char
endOfInput = Atto.endOfInput
decNumber = Atto.takeWhile1 (`Set.member` digits)
hexNumber = B16Int <$> Atto.takeWhile1 (`Set.member` hexDigits)
oneOf charSet = Atto.satisfy (`elem` charSet)
string s = Atto.string s <?> show s
takeAll = Atto.takeWhile
takeAll1 = Atto.takeWhile1
instance (MonadParser m, MonadTrans t, MonadDeps (t m)) => MonadParser (t m) where
block = lift . block
char = lift . char
endOfInput = lift $ endOfInput
decNumber = lift $ decNumber
hexNumber = lift $ hexNumber
oneOf = lift . oneOf
string = lift . string
takeAll = lift . takeAll
takeAll1 = lift . takeAll1
type Parser s = StateT s Atto.Parser
(<?>) :: (Alternative m, MonadFail m) => m a -> String -> m a
(<?>) :: Parser s a -> String -> Parser s a
(<?>) parser debugMessage = parser <|> fail debugMessage
block :: Int -> Parser s ByteString
block = lift . Atto.take
char :: Char -> Parser s Char
char = lift . Atto.char
choice :: [Parser s a] -> Parser s a
choice = foldr (<|>) empty
count :: Int -> Parser s a -> Parser s [a]
count 0 _ = return []
count n p = (:) <$> p <*> count (n-1) p
decNumber :: Parser s ByteString
decNumber = lift $ Atto.takeWhile1 (`Set.member` digits)
digits :: Set Char
digits = Set.fromList ['0'..'9']
@ -75,7 +59,13 @@ hexDigits = Set.unions [digits, Set.fromList af, Set.fromList $ toLower <$> af]
where
af = ['A'..'F']
octDigit :: MonadParser m => m Char
hexNumber :: Parser s ByteString
hexNumber = lift $ Atto.takeWhile1 (`Set.member` hexDigits)
many :: Parser s a -> Parser s [a]
many parser = (:) <$> parser <*> many parser <|> return []
octDigit :: Parser s Char
octDigit = oneOf ['0'..'7']
on :: Parser s a -> ByteString -> Parser s (Either String a)
@ -84,8 +74,25 @@ on (StateT parserF) input = StateT $ \state ->
Left errorMsg -> return (Left errorMsg, state)
Right (result, newState) -> return (Right result, newState)
runParser :: Parser s a -> s -> ByteString -> Either String (a, s)
runParser parser initState = Atto.parseOnly (runStateT parser initState)
oneOf :: String -> Parser s Char
oneOf charSet = lift $ Atto.satisfy (`elem` charSet)
evalParser :: Parser s a -> s -> ByteString -> Either String a
evalParser parser initState = Atto.parseOnly (evalStateT parser initState)
option :: a -> Parser s a -> Parser s a
option defaultValue p = p <|> pure defaultValue
runParser :: Parser s a -> s -> ByteString -> Either String a
runParser parser initState =
Atto.parseOnly (evalStateT parser initState)
sepBy :: Parser s a -> Parser s b -> Parser s [a]
sepBy parser separator =
option [] $ (:) <$> parser <*> many (separator *> parser)
string :: ByteString -> Parser s ByteString
string = lift . Atto.string
takeAll :: (Char -> Bool) -> Parser s ByteString
takeAll = lift . Atto.takeWhile
takeAll1 :: (Char -> Bool) -> Parser s ByteString
takeAll1 = lift . Atto.takeWhile1

View file

@ -1,319 +0,0 @@
{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE FlexibleContexts #-}
module PDF.Text {-(
pageContents
)-} where
import Control.Applicative ((<|>))
import Control.Monad (foldM)
import Control.Monad.Fail (MonadFail)
import Control.Monad.Reader (ReaderT, runReaderT, asks)
import Control.Monad.State (MonadState, evalStateT, get, modify, put)
import Data.Attoparsec.ByteString.Char8 (choice, sepBy)
import Data.ByteString (ByteString)
import Data.ByteString.Char8 (pack, unpack)
import Data.Char (toLower)
import Data.Map ((!), (!?), Map)
import qualified Data.Map as Map (fromList)
import Data.Text (Text)
import PDF.Font (Font, FontSet, emptyFont)
import PDF.Object (
DirectObject(..), StringObject(..)
, array, blank, name, regular, stringObject, toByteString
)
import PDF.Parser (MonadParser(..), (<?>), Parser, evalParser)
data StateOperator =
C_m | W_ | J | J_ | M | D_ | R_i | I_ | G_s -- general graphic state
deriving (Bounded, Enum, Show)
data PathOperator =
M_ | L_ | C_ | V_ | Y_ | H_ | R_e -- path construction
| S | S_ | F_ | F | Fstar | B | Bstar | B_ | B_star | N_ -- path painting
| W | Wstar -- clipping path
deriving (Bounded, Enum, Show)
data ColorOperator =
CS | C_s | SC | SCN | S_c | S_cn | G | G_ | RG | R_g | K | K_
deriving (Bounded, Enum, Show)
data TextOperator =
Td | TD | Tm | Tstar -- text positioning
| TJ | Tj | Quote | DQuote -- text showing
| Tc | Tw | Tz | TL | Tf | Tr | Ts -- text state
deriving (Bounded, Enum, Show)
data Argument = Raw ByteString | Typed DirectObject deriving Show
type Call a = (a, [Argument])
type Operator a = (Bounded a, Enum a, Show a)
code :: Operator a => a -> ByteString
code = pack . expand . show
where
expand "" = ""
expand (c:'_':s) = toLower c : expand s
expand ('s':'t':'a':'r':s) = '*' : expand s
expand ('Q':'u':'o':'t':'e':s) = '\'' : expand s
expand ('D':'Q':'u':'o':'t':'e':s) = '"' : expand s
expand (c:s) = c : expand s
{-
instance Show StateOperator where
show Cm = "cm"
show W_ = "w"
show J = "J"
show J_ = "j"
show M = "M"
show D = "d"
show Ri = "ri"
show I = "i"
show Gs = "gs"
instance Show PathOperator where
show M_ = "m"
show L_ = "l"
show C_ = "c"
show V_ = "v"
show Y_ = "y"
show H_
("m", (M_, \l -> case l of [Raw _, Raw _] -> True ; _ -> False))
, ("l", (L_, \l -> case l of [Raw _, Raw _] -> True ; _ -> False))
, ("c", (L_, \l -> case l of [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False))
, ("v", (L_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False))
, ("y", (L_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False))
, ("h", (L_, \l -> case l of [] -> True ; _ -> False))
, ("re", (L_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False))
, ("S", (L_, \l -> case l of [] -> True ; _ -> False))
, ("s", (L_, \l -> case l of [] -> True ; _ -> False))
, ("f", (L_, \l -> case l of [] -> True ; _ -> False))
, ("F", (L_, \l -> case l of [] -> True ; _ -> False))
, ("F*", (L_, \l -> case l of [] -> True ; _ -> False))
, ("B", (L_, \l -> case l of [] -> True ; _ -> False))
, ("B*", (L_, \l -> case l of [] -> True ; _ -> False))
, ("b", (L_, \l -> case l of [] -> True ; _ -> False))
, ("b*", (L_, \l -> case l of [] -> True ; _ -> False))
, ("n", (L_, \l -> case l of [] -> True ; _ -> False))
, ("W", (L_, \l -> case l of [] -> True ; _ -> False))
, ("W*", (L_, \l -> case l of [] -> True ; _ -> False))
instance Show ColorOperator where
instance Show TextOperator where
show Td = "Td"
show TD = "TD"
show Tm = "Tm"
show Tstar = "T*"
show TJ = "TJ"
show Tj = "Tj"
show Quote = "'"
show DQuote = "\""
show Tc = "Tc"
show Tw = "Tw"
show Tz = "Tz"
show TL = "TL"
show Tf = "Tf"
show Tr = "Tr"
show Ts = "Ts"
-}
stateOperator :: OperatorTable StateOperator
stateOperator = Map.fromList $ (\(op, checker) -> (code op, (op, checker))) <$> [
(C_m, \l -> case l of [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
, (W_, \l -> case l of [Raw _] -> True ; _ -> False)
, (J, \l -> case l of [Raw _] -> True ; _ -> False)
, (J_, \l -> case l of [Raw _] -> True ; _ -> False)
, (M, \l -> case l of [Raw _] -> True ; _ -> False)
, (D_, \l -> case l of [Raw _, Raw _] -> True ; _ -> False)
, (R_i, \l -> case l of [Raw _] -> True ; _ -> False)
, (I_, \l -> case l of [Raw _] -> True ; _ -> False)
, (G_s, \l -> case l of [Typed (NameObject _)] -> True ; _ -> False)
]
pathOperator :: OperatorTable PathOperator
pathOperator = Map.fromList $ (\(op, checker) -> (code op, (op, checker))) <$> [
(M_, \l -> case l of [Raw _, Raw _] -> True ; _ -> False)
, (L_, \l -> case l of [Raw _, Raw _] -> True ; _ -> False)
, (C_, \l -> case l of [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
, (V_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
, (Y_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
, (H_, \l -> case l of [] -> True ; _ -> False)
, (R_e, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
, (S, \l -> case l of [] -> True ; _ -> False)
, (S_, \l -> case l of [] -> True ; _ -> False)
, (F_, \l -> case l of [] -> True ; _ -> False)
, (F, \l -> case l of [] -> True ; _ -> False)
, (Fstar, \l -> case l of [] -> True ; _ -> False)
, (B, \l -> case l of [] -> True ; _ -> False)
, (Bstar, \l -> case l of [] -> True ; _ -> False)
, (B_, \l -> case l of [] -> True ; _ -> False)
, (B_star, \l -> case l of [] -> True ; _ -> False)
, (N_, \l -> case l of [] -> True ; _ -> False)
, (W, \l -> case l of [] -> True ; _ -> False)
, (Wstar, \l -> case l of [] -> True ; _ -> False)
]
colorOperator :: OperatorTable ColorOperator
colorOperator = Map.fromList $ (\(op, checker) -> (code op, (op, checker))) <$> [
(CS, \l -> case l of [Typed (NameObject _)] -> True ; _ -> False)
, (C_s, \l -> case l of [Typed (NameObject _)] -> True ; _ -> False)
, (SC, \_ -> True)
, (SCN, \_ -> True)
, (S_c, \_ -> True)
, (S_cn, \_ -> True)
, (G, \l -> case l of [Raw _] -> True ; _ -> False)
, (G_, \l -> case l of [Raw _] -> True ; _ -> False)
, (RG, \l -> case l of [Raw _, Raw _, Raw _] -> True ; _ -> False)
, (R_g, \l -> case l of [Raw _, Raw _, Raw _] -> True ; _ -> False)
, (K, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
, (K_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
]
{-
stateOperator (Cm, [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _]) = True
stateOperator (W, [Raw _]) = True
stateOperator (J, [Raw _]) = True
stateOperator (J_, [Raw _]) = True
stateOperator (M, [Raw _]) = True
stateOperator (D, [Raw _, Raw _]) = True
stateOperator (Ri, [Raw _]) = True
stateOperator (I, [Raw _]) = True
stateOperator (Gs, [Typed (NameObject _)]) = True
stateOperator _ = False
-}
textOperator :: OperatorTable TextOperator
textOperator = Map.fromList $ (\(op, checker) -> (code op, (op, checker))) <$> [
(Td, \l -> case l of [Raw _, Raw _] -> True ; _ -> False)
, (TD, \l -> case l of [Raw _, Raw _] -> True ; _ -> False)
, (Tm, \l -> case l of [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
, (Tstar, \l -> case l of [] -> True ; _ -> False)
, (TJ, \l -> case l of [Typed (Array _)] -> True ; _ -> False)
, (Tj, \l -> case l of [Typed (StringObject _)] -> True ; _ -> False)
, (Quote, \l -> case l of [Typed (StringObject _)] -> True ; _ -> False)
, (DQuote, \l -> case l of [Typed (StringObject _)] -> True ; _ -> False)
, (Tc, \l -> case l of [Raw _] -> True ; _ -> False)
, (Tw, \l -> case l of [Raw _] -> True ; _ -> False)
, (Tz, \l -> case l of [Raw _] -> True ; _ -> False)
, (TL, \l -> case l of [Raw _] -> True ; _ -> False)
, (Tf, \l -> case l of [Typed (NameObject _), Raw _] -> True ; _ -> False)
, (Tr, \l -> case l of [Raw _] -> True ; _ -> False)
, (Ts, \l -> case l of [Raw _] -> True ; _ -> False)
]
{-
textOperator (Td, [Raw _, Raw _]) = True
textOperator (TD, [Raw _, Raw _]) = True
textOperator (Tm, [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _]) = True
textOperator (Tstar, []) = True
textOperator (TJ, [Typed (Array _)]) = True
textOperator (Tj, [Typed (StringObject _)]) = True
textOperator (Quote, [Typed (StringObject _)]) = True
textOperator (DQuote, [Typed (StringObject _)]) = True
textOperator (Tc, [Raw _]) = True
textOperator (Tw, [Raw _]) = True
textOperator (Tz, [Raw _]) = True
textOperator (TL, [Raw _]) = True
textOperator (Tf, [Typed (NameObject _), Raw _]) = True
textOperator (Tr, [Raw _]) = True
textOperator (Ts, [Raw _]) = True
textOperator _ = False
-}
type ArgumentStackParser m = (MonadState [Argument] m, MonadParser m)
--type Operator a = (Bounded a, Enum a, Show a)
type OperatorTable a = Map ByteString (TypeChecker a)
type TypeChecker a = (a, [Argument] -> Bool)
parseShowable :: (Show a, MonadParser m) => a -> m a
parseShowable textOp = string (pack $ show textOp) *> return textOp
callChunk :: MonadParser m => OperatorTable a -> m (Either (TypeChecker a) Argument)
callChunk table =
(Right <$> choice [stringArg, nameArg, arrayArg])
<|> operatorOrRawArg
<?> "call chunk"
where
operatorOrRawArg = do
chunk <- takeAll1 regular <* blank
case table !? chunk of
Nothing -> return . Right $ Raw chunk
Just typeChecker -> return $ Left typeChecker
stackParser :: (ArgumentStackParser m, Operator a) => OperatorTable a -> m (Call a)
stackParser table = either popCall push =<< (callChunk table)
where
push arg = modify (arg:) >> stackParser table
popCall (operator, predicate) = do
arguments <- reverse <$> get
let call = (operator, arguments)
if predicate arguments then return call else fail (unpack $ code operator)
a :: (Operator a, MonadParser m) => OperatorTable a -> m (Call a)
a table = evalStateT (stackParser table) []
argument :: MonadParser m => m Argument
argument = Raw <$> takeAll1 regular <* blank
arrayArg :: MonadParser m => m Argument
arrayArg = Typed . Array <$> array <* blank
nameArg :: MonadParser m => m Argument
nameArg = Typed . NameObject <$> name <* blank
stringArg :: MonadParser m => m Argument
stringArg = Typed . StringObject <$> stringObject <* blank
type ParserWithFont = ReaderT FontSet (Parser Font)
pageContents :: FontSet -> ByteString -> Either String [Text]
pageContents fontSet input =
evalParser (runReaderT (page) fontSet) emptyFont input
several :: MonadParser m => m [a] -> m [a]
several p = concat <$> (p `sepBy` blank)
page :: ParserWithFont [Text]
page = several (graphicState <|> text) <* blank <* endOfInput <?> "Text page contents"
graphicState :: ParserWithFont [Text]
graphicState =
string "q" *> blank *> insideQ <* blank <* string "Q" <?> "Graphic state"
where
insideQ = several (command <|> graphicState <|> text)
ignore x = a x *> return []
command =
ignore stateOperator <|> ignore pathOperator <|> ignore colorOperator
text :: ParserWithFont [Text]
text =
string "BT" *> blank *> commands <* blank <* string "ET" <?> "Text operators"
where
commands = several (a textOperator >>= runOperator)
runOperator :: Call TextOperator -> ParserWithFont [Text]
runOperator (Tf, [Typed (NameObject fontName), _]) =
asks (! fontName) >>= put >> return []
runOperator (Tstar, []) = return ["\n"]
runOperator (TJ, [Typed (Array arrayObject)]) =
replicate 1 <$> foldM appendText "" arrayObject
where
appendText bs (StringObject outputString) =
mappend bs <$> decodeString outputString
appendText bs _ = return bs
runOperator (Tj, [Typed (StringObject outputString)]) =
replicate 1 <$> decodeString outputString
runOperator (Quote, [Typed (StringObject outputString)]) =
(\bs -> ["\n", bs]) <$> decodeString outputString
runOperator (DQuote, [Typed (StringObject outputString)]) =
(\bs -> ["\n", bs]) <$> decodeString outputString
runOperator _ = return []
decodeString :: (MonadFail m, MonadState Font m) => StringObject -> m Text
decodeString input = do
font <- get
either fail return . font $ toByteString input