Compare commits
42 commits
Author | SHA1 | Date | |
---|---|---|---|
1a25307c8c | |||
919f640443 | |||
ae938acc02 | |||
32f9866106 | |||
eb4d76002c | |||
af994cb50c | |||
704d7a7fcf | |||
11647eb4eb | |||
aed7af376a | |||
e77bbbcda9 | |||
195446e653 | |||
9f1b1afafe | |||
20466c4f13 | |||
325250383a | |||
c48ab22808 | |||
a2b66ac6d6 | |||
cefb08ee50 | |||
afbbcbffc5 | |||
8373bd1ea0 | |||
bac08446dd | |||
f9f799c59b | |||
08a9717b3a | |||
42a02808c1 | |||
c9f050e64b | |||
3a3e1533b4 | |||
a96e36ec5a | |||
d07c286f8e | |||
7a15113285 | |||
36d7f9b819 | |||
b8ca7281aa | |||
32efdcdd6b | |||
3b59fd0c61 | |||
0374b72920 | |||
1dd22c3889 | |||
98d029c4d4 | |||
c349d9b4c2 | |||
e7484ef536 | |||
f9e5683bf4 | |||
b8eb9e6856 | |||
66d315b7fe | |||
51db57ec67 | |||
6f3c159ea7 |
24 changed files with 1495 additions and 150 deletions
|
@ -17,19 +17,36 @@ cabal-version: >=1.10
|
||||||
|
|
||||||
library
|
library
|
||||||
exposed-modules: PDF
|
exposed-modules: PDF
|
||||||
|
, PDF.CMap
|
||||||
|
, PDF.Content
|
||||||
, PDF.EOL
|
, PDF.EOL
|
||||||
, PDF.Object
|
, PDF.Object
|
||||||
|
, PDF.Object.Navigation
|
||||||
, PDF.Output
|
, PDF.Output
|
||||||
|
, PDF.Parser
|
||||||
|
, PDF.Pages
|
||||||
, PDF.Update
|
, PDF.Update
|
||||||
other-modules: Data.ByteString.Char8.Util
|
other-modules: Data.ByteString.Char8.Util
|
||||||
|
, PDF.Content.Operator
|
||||||
|
, PDF.Content.Operator.Color
|
||||||
|
, PDF.Content.Operator.Common
|
||||||
|
, PDF.Content.Operator.GraphicState
|
||||||
|
, PDF.Content.Operator.Path
|
||||||
|
, PDF.Content.Operator.Text
|
||||||
|
, PDF.Content.Text
|
||||||
|
, PDF.Encoding
|
||||||
|
, PDF.Encoding.MacRoman
|
||||||
, PDF.Body
|
, PDF.Body
|
||||||
, PDF.Parser
|
, PDF.Font
|
||||||
-- other-extensions:
|
-- other-extensions:
|
||||||
build-depends: attoparsec
|
build-depends: attoparsec
|
||||||
, base >=4.9 && <4.13
|
, base >=4.9 && <4.13
|
||||||
, bytestring
|
, bytestring
|
||||||
, containers
|
, containers
|
||||||
, mtl
|
, mtl
|
||||||
|
, text
|
||||||
|
, utf8-string
|
||||||
|
, zlib
|
||||||
hs-source-dirs: src
|
hs-source-dirs: src
|
||||||
ghc-options: -Wall
|
ghc-options: -Wall
|
||||||
default-language: Haskell2010
|
default-language: Haskell2010
|
||||||
|
@ -51,3 +68,15 @@ executable getObj
|
||||||
, zlib
|
, zlib
|
||||||
ghc-options: -Wall
|
ghc-options: -Wall
|
||||||
default-language: Haskell2010
|
default-language: Haskell2010
|
||||||
|
|
||||||
|
executable getText
|
||||||
|
main-is: examples/getText.hs
|
||||||
|
build-depends: base
|
||||||
|
, bytestring
|
||||||
|
, containers
|
||||||
|
, Hufflepdf
|
||||||
|
, mtl
|
||||||
|
, text
|
||||||
|
, zlib
|
||||||
|
ghc-options: -Wall
|
||||||
|
default-language: Haskell2010
|
||||||
|
|
38
examples/getText.hs
Normal file
38
examples/getText.hs
Normal file
|
@ -0,0 +1,38 @@
|
||||||
|
import qualified Data.ByteString.Char8 as BS (readFile)
|
||||||
|
import qualified Data.Map as Map (toList)
|
||||||
|
import qualified Data.Text.IO as Text (putStrLn)
|
||||||
|
import PDF (Document(..), parseDocument)
|
||||||
|
import PDF.Object (Content)
|
||||||
|
import PDF.Pages (Page(..), get, getAll)
|
||||||
|
import PDF.Update (unify)
|
||||||
|
import System.Environment (getArgs)
|
||||||
|
import System.Exit (die)
|
||||||
|
import System.IO (BufferMode(..), hSetBuffering, stdout)
|
||||||
|
|
||||||
|
onDoc :: FilePath -> (Content -> Either String a) -> IO a
|
||||||
|
onDoc inputFile f = do
|
||||||
|
content <- fmap (unify . updates) . parseDocument <$> BS.readFile inputFile
|
||||||
|
case content >>= f of
|
||||||
|
Left someError -> die someError
|
||||||
|
Right value -> return value
|
||||||
|
|
||||||
|
displayPage :: Page -> IO ()
|
||||||
|
displayPage = mapM_ Text.putStrLn . contents
|
||||||
|
|
||||||
|
wholeDoc :: FilePath -> IO ()
|
||||||
|
wholeDoc inputFile = do
|
||||||
|
pages <- onDoc inputFile getAll
|
||||||
|
mapM_ (displayPage . snd) $ Map.toList pages
|
||||||
|
|
||||||
|
singlePage :: FilePath -> Int -> IO ()
|
||||||
|
singlePage inputFile pageNumber =
|
||||||
|
onDoc inputFile (`get` pageNumber) >>= displayPage
|
||||||
|
|
||||||
|
main :: IO ()
|
||||||
|
main = do
|
||||||
|
hSetBuffering stdout LineBuffering
|
||||||
|
args <- getArgs
|
||||||
|
case args of
|
||||||
|
[inputFile] -> wholeDoc inputFile
|
||||||
|
[inputFile, pageNumber] -> singlePage inputFile (read pageNumber)
|
||||||
|
_ -> die "Syntax: getText INPUT_FILE [PAGE_NUMBER]"
|
|
@ -1,16 +1,91 @@
|
||||||
module Data.ByteString.Char8.Util (
|
module Data.ByteString.Char8.Util (
|
||||||
previous
|
B16Int(..)
|
||||||
|
, B256Int(..)
|
||||||
|
, b8ToInt
|
||||||
|
, b16ToBytes
|
||||||
|
, b16ToInt
|
||||||
|
, b256ToInt
|
||||||
|
, intToB256
|
||||||
|
, previous
|
||||||
, subBS
|
, subBS
|
||||||
|
, toBytes
|
||||||
|
, unescape
|
||||||
|
, utf16BEToutf8
|
||||||
) where
|
) where
|
||||||
|
|
||||||
import Data.ByteString.Char8 (ByteString)
|
import Data.ByteString (ByteString, snoc)
|
||||||
import qualified Data.ByteString.Char8 as BS (drop, index, take)
|
import qualified Data.ByteString as BS (empty, foldl, length, pack, singleton, splitAt)
|
||||||
|
import qualified Data.ByteString.Char8 as Char8 (
|
||||||
|
cons, drop, index, splitAt, take, uncons, unpack
|
||||||
|
)
|
||||||
|
import Data.Text (Text)
|
||||||
|
import Data.Text.Encoding (decodeUtf16BE)
|
||||||
import Prelude hiding (length)
|
import Prelude hiding (length)
|
||||||
|
import Text.Printf (printf)
|
||||||
|
|
||||||
|
newtype B8Int = B8Int ByteString deriving Show
|
||||||
|
newtype B16Int = B16Int ByteString deriving Show
|
||||||
|
newtype B256Int = B256Int ByteString deriving Show
|
||||||
|
|
||||||
previous :: Char -> Int -> ByteString -> Int
|
previous :: Char -> Int -> ByteString -> Int
|
||||||
previous char position byteString
|
previous char position byteString
|
||||||
| BS.index byteString position == char = position
|
| Char8.index byteString position == char = position
|
||||||
| otherwise = previous char (position - 1) byteString
|
| otherwise = previous char (position - 1) byteString
|
||||||
|
|
||||||
subBS :: Int -> Int -> ByteString -> ByteString
|
subBS :: Int -> Int -> ByteString -> ByteString
|
||||||
subBS offset length = BS.take length . BS.drop offset
|
subBS offset length = Char8.take length . Char8.drop offset
|
||||||
|
|
||||||
|
intToB256 :: Int -> B256Int
|
||||||
|
intToB256 n
|
||||||
|
| n < 0x100 = B256Int . BS.singleton $ toEnum n
|
||||||
|
| otherwise =
|
||||||
|
let B256Int begining = intToB256 (n `div` 0x100) in
|
||||||
|
B256Int $ begining `snoc` (toEnum (n `mod` 0x100))
|
||||||
|
|
||||||
|
b256ToInt :: B256Int -> Int
|
||||||
|
b256ToInt (B256Int n) = BS.foldl (\k w -> 0x100*k + fromEnum w) 0 n
|
||||||
|
|
||||||
|
toBytes :: Int -> Int -> ByteString
|
||||||
|
toBytes 0 _ = BS.empty
|
||||||
|
toBytes size n =
|
||||||
|
(toBytes (size - 1) (n `div` 0x100)) `snoc` (toEnum (n `mod` 0x100))
|
||||||
|
|
||||||
|
b16ToBytes :: B16Int -> ByteString
|
||||||
|
b16ToBytes (B16Int n) = BS.pack . fmap b16ToInt $ pairDigits n
|
||||||
|
where
|
||||||
|
pairDigits s =
|
||||||
|
case BS.length s of
|
||||||
|
0 -> []
|
||||||
|
1 -> [B16Int s]
|
||||||
|
_ ->
|
||||||
|
let (twoHexDigits, rest) = BS.splitAt 2 s in
|
||||||
|
(B16Int $ twoHexDigits):(pairDigits rest)
|
||||||
|
|
||||||
|
fromBase :: (Num a, Read a) => Char -> ByteString -> a
|
||||||
|
fromBase b = read . printf "0%c%s" b . Char8.unpack
|
||||||
|
|
||||||
|
b16ToInt :: (Num a, Read a) => B16Int -> a
|
||||||
|
b16ToInt (B16Int n) = fromBase 'x' n
|
||||||
|
|
||||||
|
b8ToInt :: (Num a, Read a) => B8Int -> a
|
||||||
|
b8ToInt (B8Int n) = fromBase 'o' n
|
||||||
|
|
||||||
|
unescape :: ByteString -> ByteString
|
||||||
|
unescape escapedBS =
|
||||||
|
case Char8.uncons escapedBS of
|
||||||
|
Nothing -> BS.empty
|
||||||
|
Just ('\\', s) -> unescapeChar s
|
||||||
|
Just (c, s) -> Char8.cons c (unescape s)
|
||||||
|
where
|
||||||
|
unescapeChar s =
|
||||||
|
case Char8.uncons s of
|
||||||
|
Nothing -> BS.empty
|
||||||
|
Just (c, s')
|
||||||
|
| c `elem` "()" -> Char8.cons c (unescape s')
|
||||||
|
| c `elem` "nrtbf" -> Char8.cons (read (printf "'\\%c'" c)) (unescape s')
|
||||||
|
| c `elem` ['0'..'7'] -> fromOctal (Char8.splitAt 3 s)
|
||||||
|
| otherwise -> Char8.cons c (unescape s')
|
||||||
|
fromOctal (code, s) = Char8.cons (toEnum $ b8ToInt (B8Int code)) (unescape s)
|
||||||
|
|
||||||
|
utf16BEToutf8 :: ByteString -> Text
|
||||||
|
utf16BEToutf8 = decodeUtf16BE
|
||||||
|
|
|
@ -21,7 +21,7 @@ import PDF.Object (
|
||||||
)
|
)
|
||||||
import qualified PDF.Output as Output (render, line)
|
import qualified PDF.Output as Output (render, line)
|
||||||
import PDF.Output (Output(..))
|
import PDF.Output (Output(..))
|
||||||
import PDF.Parser (Parser, runParser, string, takeAll)
|
import PDF.Parser (Parser, evalParser, string, takeAll)
|
||||||
import Text.Printf (printf)
|
import Text.Printf (printf)
|
||||||
|
|
||||||
data Document = Document {
|
data Document = Document {
|
||||||
|
@ -83,7 +83,7 @@ findNextSection offset input =
|
||||||
|
|
||||||
readStructures :: Int -> ByteString -> Either String [InputStructure]
|
readStructures :: Int -> ByteString -> Either String [InputStructure]
|
||||||
readStructures startXref input =
|
readStructures startXref input =
|
||||||
runParser structure () (BS.drop startXref input) >>= stopOrFollow
|
evalParser structure () (BS.drop startXref input) >>= stopOrFollow
|
||||||
where
|
where
|
||||||
stopOrFollow s@(Structure {trailer}) =
|
stopOrFollow s@(Structure {trailer}) =
|
||||||
case Map.lookup (Name "Prev") trailer of
|
case Map.lookup (Name "Prev") trailer of
|
||||||
|
@ -96,7 +96,7 @@ readStructures startXref input =
|
||||||
|
|
||||||
parseDocument :: ByteString -> Either String Document
|
parseDocument :: ByteString -> Either String Document
|
||||||
parseDocument input = do
|
parseDocument input = do
|
||||||
(pdfVersion, eolStyle) <- runParser ((,) <$> version <*> EOL.parser) () input
|
(pdfVersion, eolStyle) <- evalParser ((,) <$> version <*> EOL.parser) () input
|
||||||
startXref <- readStartXref eolStyle input
|
startXref <- readStartXref eolStyle input
|
||||||
structuresRead <- readStructures startXref input
|
structuresRead <- readStructures startXref input
|
||||||
let updates = populate input <$> structuresRead
|
let updates = populate input <$> structuresRead
|
||||||
|
|
|
@ -6,6 +6,7 @@ module PDF.Body (
|
||||||
|
|
||||||
import Control.Applicative ((<|>))
|
import Control.Applicative ((<|>))
|
||||||
import Control.Monad.State (get, gets, modify)
|
import Control.Monad.State (get, gets, modify)
|
||||||
|
import Data.Attoparsec.ByteString.Char8 (option)
|
||||||
import Data.ByteString.Char8 (ByteString)
|
import Data.ByteString.Char8 (ByteString)
|
||||||
import qualified Data.ByteString.Char8 as BS (cons, drop, unpack)
|
import qualified Data.ByteString.Char8 as BS (cons, drop, unpack)
|
||||||
import Data.Map ((!))
|
import Data.Map ((!))
|
||||||
|
@ -18,7 +19,7 @@ import PDF.Object (
|
||||||
, blank, dictionary, directObject, integer, line
|
, blank, dictionary, directObject, integer, line
|
||||||
)
|
)
|
||||||
import PDF.Output (ObjectId(..), Offset(..))
|
import PDF.Output (ObjectId(..), Offset(..))
|
||||||
import PDF.Parser (Parser, (<?>), block, char, on, option, runParser, takeAll)
|
import PDF.Parser (Parser, block, char, evalParser, on, takeAll)
|
||||||
|
|
||||||
data UserState = UserState {
|
data UserState = UserState {
|
||||||
input :: ByteString
|
input :: ByteString
|
||||||
|
@ -104,12 +105,12 @@ indirectObjCoordinates = do
|
||||||
|
|
||||||
occurrence :: SParser Occurrence
|
occurrence :: SParser Occurrence
|
||||||
occurrence =
|
occurrence =
|
||||||
Comment <$> comment <|> Indirect <$> indirectObjCoordinates <?> "comment or object"
|
Comment <$> comment <|> Indirect <$> indirectObjCoordinates
|
||||||
|
|
||||||
populate :: ByteString -> InputStructure -> Content
|
populate :: ByteString -> InputStructure -> Content
|
||||||
populate input structure =
|
populate input structure =
|
||||||
let bodyInput = BS.drop (startOffset structure) input in
|
let bodyInput = BS.drop (startOffset structure) input in
|
||||||
case runParser recurseOnOccurrences initialState bodyInput of
|
case evalParser recurseOnOccurrences initialState bodyInput of
|
||||||
Left _ -> Content {occurrences = [], objects = Map.empty, docStructure}
|
Left _ -> Content {occurrences = [], objects = Map.empty, docStructure}
|
||||||
Right finalState ->
|
Right finalState ->
|
||||||
let Flow {occurrencesStack, tmpObjects} = flow finalState in
|
let Flow {occurrencesStack, tmpObjects} = flow finalState in
|
||||||
|
|
159
src/PDF/CMap.hs
Normal file
159
src/PDF/CMap.hs
Normal file
|
@ -0,0 +1,159 @@
|
||||||
|
{-# LANGUAGE NamedFieldPuns #-}
|
||||||
|
{-# LANGUAGE OverloadedStrings #-}
|
||||||
|
module PDF.CMap (
|
||||||
|
CMap
|
||||||
|
, CMappers
|
||||||
|
, CRange(..)
|
||||||
|
, cMap
|
||||||
|
, emptyCMap
|
||||||
|
, matches
|
||||||
|
) where
|
||||||
|
|
||||||
|
import Control.Applicative ((<|>), many)
|
||||||
|
import Control.Monad.Fail (MonadFail(..))
|
||||||
|
import Control.Monad.State (modify)
|
||||||
|
import Data.Attoparsec.ByteString.Char8 (count)
|
||||||
|
import Data.ByteString (ByteString)
|
||||||
|
import qualified Data.ByteString as BS (drop, length, null, take)
|
||||||
|
import Data.ByteString.Char8 (unpack)
|
||||||
|
import Data.ByteString.Char8.Util (
|
||||||
|
B16Int(..), b16ToBytes, b16ToInt, toBytes, utf16BEToutf8
|
||||||
|
)
|
||||||
|
import Data.Map (Map, union)
|
||||||
|
import qualified Data.Map as Map (
|
||||||
|
adjust, empty, fromList, insertWith, lookup, toList
|
||||||
|
)
|
||||||
|
import Data.Text (Text)
|
||||||
|
import qualified PDF.EOL as EOL (charset, parser)
|
||||||
|
import PDF.Font (Font)
|
||||||
|
import PDF.Object (
|
||||||
|
DirectObject(..), Name, StringObject(..)
|
||||||
|
, blank, directObject, integer, line, stringObject
|
||||||
|
)
|
||||||
|
import PDF.Parser (MonadParser, Parser, runParser, takeAll)
|
||||||
|
import Prelude hiding (fail)
|
||||||
|
|
||||||
|
type CMappers = Map Name CMap
|
||||||
|
type Mapping = Map ByteString Text
|
||||||
|
data CRange = CRange {
|
||||||
|
fromSequence :: ByteString
|
||||||
|
, toSequence :: ByteString
|
||||||
|
, mapping :: Mapping
|
||||||
|
} deriving Show
|
||||||
|
type RangeSize = Int
|
||||||
|
type CMap = Map RangeSize [CRange]
|
||||||
|
|
||||||
|
toFont :: CMap -> Font
|
||||||
|
toFont aCMap input
|
||||||
|
| BS.null input = Right ""
|
||||||
|
| otherwise = do
|
||||||
|
(output, remainingInput) <- trySizes input $ Map.toList aCMap
|
||||||
|
mappend output <$> toFont aCMap remainingInput
|
||||||
|
where
|
||||||
|
trySizes s [] = Left $ "No matching code found in font for " ++ unpack s
|
||||||
|
trySizes s ((size, cRanges):others) =
|
||||||
|
let prefix = BS.take size s in
|
||||||
|
case tryRanges prefix cRanges of
|
||||||
|
Nothing -> trySizes s others
|
||||||
|
Just outputSequence -> Right (outputSequence, BS.drop size s)
|
||||||
|
tryRanges :: ByteString -> [CRange] -> Maybe Text
|
||||||
|
tryRanges _ [] = Nothing
|
||||||
|
tryRanges prefix ((CRange {mapping}):cRanges) =
|
||||||
|
case Map.lookup prefix mapping of
|
||||||
|
Nothing -> tryRanges prefix cRanges
|
||||||
|
outputSequence -> outputSequence
|
||||||
|
|
||||||
|
emptyCMap :: CMap
|
||||||
|
emptyCMap = Map.empty
|
||||||
|
|
||||||
|
matches :: ByteString -> CRange -> Bool
|
||||||
|
matches code (CRange {fromSequence, toSequence}) =
|
||||||
|
fromSequence <= code && code <= toSequence
|
||||||
|
|
||||||
|
cMap :: MonadFail m => ByteString -> m Font
|
||||||
|
cMap = either fail (return . toFont . snd) . runParser
|
||||||
|
(many (codeRanges <|> cMapRange <|> cMapChar <|> ignoredLine))
|
||||||
|
emptyCMap
|
||||||
|
where
|
||||||
|
ignoredLine =
|
||||||
|
takeAll (not . (`elem` EOL.charset)) *> EOL.parser *> return ()
|
||||||
|
|
||||||
|
codeRanges :: Parser CMap ()
|
||||||
|
codeRanges = do
|
||||||
|
size <- integer <* line "begincodespacerange"
|
||||||
|
mapM_ createMapping =<< count size codeRange
|
||||||
|
line "endcodespacerange"
|
||||||
|
where
|
||||||
|
codeRange =
|
||||||
|
(,) <$> stringObject <* blank <*> stringObject <* EOL.parser
|
||||||
|
|
||||||
|
createMapping :: (StringObject, StringObject) -> Parser CMap ()
|
||||||
|
createMapping (Hexadecimal from, Hexadecimal to) = modify $
|
||||||
|
Map.insertWith (++) size [CRange {fromSequence, toSequence, mapping}]
|
||||||
|
where
|
||||||
|
fromSequence = b16ToBytes from
|
||||||
|
size = BS.length fromSequence
|
||||||
|
toSequence = b16ToBytes to
|
||||||
|
mapping = Map.empty
|
||||||
|
createMapping _ = return ()
|
||||||
|
|
||||||
|
cMapRange :: Parser CMap ()
|
||||||
|
cMapRange = do
|
||||||
|
size <- integer <* line "beginbfrange"
|
||||||
|
mapM_ saveMapping =<< count size rangeMapping
|
||||||
|
line "endbfrange"
|
||||||
|
where
|
||||||
|
rangeMapping = (,,)
|
||||||
|
<$> (stringObject <* blank)
|
||||||
|
<*> (stringObject <* blank)
|
||||||
|
<*> directObject <* EOL.parser
|
||||||
|
>>= mapFromTo
|
||||||
|
|
||||||
|
saveMapping :: [(ByteString, Text)] -> Parser CMap ()
|
||||||
|
saveMapping [] = return ()
|
||||||
|
saveMapping assoc@((code, _):_) = modify $ Map.adjust insertCRange mappingSize
|
||||||
|
where
|
||||||
|
newMapping = Map.fromList assoc
|
||||||
|
mappingSize = BS.length code
|
||||||
|
appendMapping cRange =
|
||||||
|
cRange {mapping = mapping cRange `union` newMapping}
|
||||||
|
insertCRange = fmap (\cRange ->
|
||||||
|
if code `matches` cRange then appendMapping cRange else cRange
|
||||||
|
)
|
||||||
|
|
||||||
|
cMapChar :: Parser CMap ()
|
||||||
|
cMapChar = do
|
||||||
|
size <- integer <* line "beginbfchar"
|
||||||
|
saveMapping =<< count size charMapping <* line "endbfchar"
|
||||||
|
where
|
||||||
|
charMapping =
|
||||||
|
(,) <$> stringObject <* blank <*> stringObject <* EOL.parser
|
||||||
|
>>= pairMapping
|
||||||
|
|
||||||
|
between :: B16Int -> B16Int -> [ByteString]
|
||||||
|
between from@(B16Int s) to =
|
||||||
|
let size = BS.length s `div` 2 in
|
||||||
|
toBytes size <$> [b16ToInt from .. b16ToInt to]
|
||||||
|
|
||||||
|
startFrom :: B16Int -> [ByteString]
|
||||||
|
startFrom from@(B16Int s) =
|
||||||
|
let size = BS.length s `div` 2 in
|
||||||
|
toBytes size <$> [b16ToInt from .. ]
|
||||||
|
|
||||||
|
mapFromTo :: MonadParser m => (StringObject, StringObject, DirectObject) -> m [(ByteString, Text)]
|
||||||
|
mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) =
|
||||||
|
return $ zip (between from to) (utf16BEToutf8 <$> startFrom dstFrom)
|
||||||
|
|
||||||
|
mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) =
|
||||||
|
zip (between from to) <$> (mapM dstByteString dstPoints)
|
||||||
|
where
|
||||||
|
dstByteString (StringObject (Hexadecimal dst)) =
|
||||||
|
return . utf16BEToutf8 $ b16ToBytes dst
|
||||||
|
dstByteString _ = fail "Invalid for a replacement string"
|
||||||
|
|
||||||
|
mapFromTo _ = fail "invalid range mapping found"
|
||||||
|
|
||||||
|
pairMapping :: MonadParser m => (StringObject, StringObject) -> m (ByteString, Text)
|
||||||
|
pairMapping (Hexadecimal from, Hexadecimal to) =
|
||||||
|
return (b16ToBytes from, utf16BEToutf8 $ b16ToBytes to)
|
||||||
|
pairMapping _ = fail "invalid pair mapping found"
|
69
src/PDF/Content.hs
Normal file
69
src/PDF/Content.hs
Normal file
|
@ -0,0 +1,69 @@
|
||||||
|
{-# LANGUAGE OverloadedStrings #-}
|
||||||
|
module PDF.Content (
|
||||||
|
Content(..)
|
||||||
|
, ContentUnit(..)
|
||||||
|
, GraphicContextUnit(..)
|
||||||
|
, TextContext
|
||||||
|
, content
|
||||||
|
, parse
|
||||||
|
) where
|
||||||
|
|
||||||
|
import Control.Applicative ((<|>))
|
||||||
|
import Control.Monad.State (evalStateT, modify)
|
||||||
|
import Data.Attoparsec.ByteString.Char8 (sepBy)
|
||||||
|
import Data.ByteString (ByteString)
|
||||||
|
import PDF.Content.Operator (Instruction, operator)
|
||||||
|
import PDF.Object (blank, directObject)
|
||||||
|
import PDF.Output (Output(..), line)
|
||||||
|
import PDF.Parser (MonadParser, evalParser, string)
|
||||||
|
|
||||||
|
data GraphicContextUnit =
|
||||||
|
GraphicInstruction Instruction
|
||||||
|
| ContentUnit ContentUnit
|
||||||
|
deriving Show
|
||||||
|
type TextContext = [Instruction]
|
||||||
|
data ContentUnit =
|
||||||
|
GraphicContext [GraphicContextUnit]
|
||||||
|
| TextContext TextContext
|
||||||
|
deriving Show
|
||||||
|
newtype Content = Content [ContentUnit] deriving Show
|
||||||
|
|
||||||
|
content :: MonadParser m => m Content
|
||||||
|
content = Content <$> contentUnit `sepBy` blank
|
||||||
|
|
||||||
|
contentUnit :: MonadParser m => m ContentUnit
|
||||||
|
contentUnit =
|
||||||
|
(GraphicContext <$> graphicContext)
|
||||||
|
<|> (TextContext <$> textContext)
|
||||||
|
where
|
||||||
|
graphicContext =
|
||||||
|
string "q" *> blank *> graphicContextUnit `sepBy` blank <* blank <* string "Q"
|
||||||
|
|
||||||
|
graphicContextUnit :: MonadParser m => m GraphicContextUnit
|
||||||
|
graphicContextUnit =
|
||||||
|
(GraphicInstruction <$> instruction)
|
||||||
|
<|> (ContentUnit <$> contentUnit)
|
||||||
|
|
||||||
|
instruction :: MonadParser m => m Instruction
|
||||||
|
instruction = evalStateT stackParser []
|
||||||
|
where
|
||||||
|
stackParser = ((directObject <* blank) >>= push) <|> operator
|
||||||
|
push arg = modify (arg:) *> stackParser
|
||||||
|
|
||||||
|
parse :: ByteString -> Either String Content
|
||||||
|
parse = evalParser content ()
|
||||||
|
|
||||||
|
textContext :: MonadParser m => m TextContext
|
||||||
|
textContext =
|
||||||
|
string "BT" *> blank *> instruction `sepBy` blank <* blank <* string "ET"
|
||||||
|
|
||||||
|
instance Output Content where
|
||||||
|
output (Content contentUnits) = output contentUnits
|
||||||
|
|
||||||
|
instance Output ContentUnit where
|
||||||
|
output (GraphicContext gc) = line "q" `mappend` output gc `mappend` line "Q"
|
||||||
|
output (TextContext tc) = line "BT" `mappend` output tc `mappend` line "ET"
|
||||||
|
|
||||||
|
instance Output GraphicContextUnit where
|
||||||
|
output (GraphicInstruction gi) = output gi
|
||||||
|
output (ContentUnit cu) = output cu
|
76
src/PDF/Content/Operator.hs
Normal file
76
src/PDF/Content/Operator.hs
Normal file
|
@ -0,0 +1,76 @@
|
||||||
|
{-# LANGUAGE ConstraintKinds #-}
|
||||||
|
{-# LANGUAGE FlexibleContexts #-}
|
||||||
|
{-# LANGUAGE TypeSynonymInstances #-}
|
||||||
|
{-# LANGUAGE FlexibleInstances #-}
|
||||||
|
module PDF.Content.Operator (
|
||||||
|
Instruction
|
||||||
|
, Operator(..)
|
||||||
|
, operator
|
||||||
|
) where
|
||||||
|
|
||||||
|
import Control.Monad.Fail (MonadFail(..))
|
||||||
|
import Control.Monad.State (MonadState(..))
|
||||||
|
import Data.ByteString.Char8 (ByteString, pack, unpack)
|
||||||
|
import Data.Char (toLower)
|
||||||
|
import Data.Map (Map, (!?))
|
||||||
|
import qualified Data.Map as Map (fromList)
|
||||||
|
import qualified PDF.Content.Operator.Color as Color (Operator, signature)
|
||||||
|
import PDF.Content.Operator.Common (Signature)
|
||||||
|
import qualified PDF.Content.Operator.GraphicState as GraphicState (Operator, signature)
|
||||||
|
import qualified PDF.Content.Operator.Path as Path (Operator, signature)
|
||||||
|
import qualified PDF.Content.Operator.Text as Text (Operator, signature)
|
||||||
|
import PDF.Object (DirectObject, blank, regular)
|
||||||
|
import PDF.Output (Output(..), join, line)
|
||||||
|
import PDF.Parser (MonadParser, takeAll1)
|
||||||
|
import Prelude hiding (fail)
|
||||||
|
import Text.Printf (printf)
|
||||||
|
|
||||||
|
data Operator =
|
||||||
|
GraphicState GraphicState.Operator
|
||||||
|
| Path Path.Operator
|
||||||
|
| Color Color.Operator
|
||||||
|
| Text Text.Operator
|
||||||
|
|
||||||
|
instance Show Operator where
|
||||||
|
show (GraphicState gcOp) = code gcOp
|
||||||
|
show (Path pOp) = code pOp
|
||||||
|
show (Color cOp) = code cOp
|
||||||
|
show (Text tOp) = code tOp
|
||||||
|
|
||||||
|
type Instruction = (Operator, [DirectObject])
|
||||||
|
|
||||||
|
instance Output Instruction where
|
||||||
|
output (op, args) = join " " ((output <$> args) ++ [line (show op)])
|
||||||
|
|
||||||
|
operatorsTable :: Map ByteString (Signature Operator)
|
||||||
|
operatorsTable = Map.fromList (
|
||||||
|
(prepare GraphicState <$> GraphicState.signature)
|
||||||
|
++ (prepare Path <$> Path.signature)
|
||||||
|
++ (prepare Color <$> Color.signature)
|
||||||
|
++ (prepare Text <$> Text.signature)
|
||||||
|
)
|
||||||
|
where
|
||||||
|
prepare constructor (op, sig) = (pack $ code op, (constructor op, sig))
|
||||||
|
|
||||||
|
code :: Show a => a -> String
|
||||||
|
code = expand . show
|
||||||
|
where
|
||||||
|
expand "" = ""
|
||||||
|
expand (c:'_':s) = toLower c : expand s
|
||||||
|
expand ('s':'t':'a':'r':s) = '*' : expand s
|
||||||
|
expand ('Q':'u':'o':'t':'e':s) = '\'' : expand s
|
||||||
|
expand ('D':'Q':'u':'o':'t':'e':s) = '"' : expand s
|
||||||
|
expand (c:s) = c : expand s
|
||||||
|
|
||||||
|
type StackParser m = (MonadState [DirectObject] m, MonadParser m)
|
||||||
|
|
||||||
|
operator :: StackParser m => m Instruction
|
||||||
|
operator = do
|
||||||
|
chunk <- takeAll1 regular <* blank
|
||||||
|
args <- reverse <$> get
|
||||||
|
case operatorsTable !? chunk of
|
||||||
|
Just (op, sig)
|
||||||
|
| sig args -> return (op, args)
|
||||||
|
| otherwise ->
|
||||||
|
get >>= fail . printf "Operator %s with stack %s" (show op) . show
|
||||||
|
_ -> fail ("Unknown chunk " ++ unpack chunk)
|
27
src/PDF/Content/Operator/Color.hs
Normal file
27
src/PDF/Content/Operator/Color.hs
Normal file
|
@ -0,0 +1,27 @@
|
||||||
|
module PDF.Content.Operator.Color (
|
||||||
|
Operator(..)
|
||||||
|
, signature
|
||||||
|
) where
|
||||||
|
|
||||||
|
import PDF.Content.Operator.Common (Signature)
|
||||||
|
import PDF.Object (DirectObject(..))
|
||||||
|
|
||||||
|
data Operator =
|
||||||
|
CS | C_s | SC | SCN | S_c | S_cn | G | G_ | RG | R_g | K | K_
|
||||||
|
deriving (Bounded, Enum, Show)
|
||||||
|
|
||||||
|
signature :: [Signature Operator]
|
||||||
|
signature = [
|
||||||
|
(CS, \l -> case l of [NameObject _] -> True ; _ -> False)
|
||||||
|
, (C_s, \l -> case l of [NameObject _] -> True ; _ -> False)
|
||||||
|
, (SC, \_ -> True)
|
||||||
|
, (SCN, \_ -> True)
|
||||||
|
, (S_c, \_ -> True)
|
||||||
|
, (S_cn, \_ -> True)
|
||||||
|
, (G, \l -> case l of [_] -> True ; _ -> False)
|
||||||
|
, (G_, \l -> case l of [_] -> True ; _ -> False)
|
||||||
|
, (RG, \l -> case l of [_, _, _] -> True ; _ -> False)
|
||||||
|
, (R_g, \l -> case l of [_, _, _] -> True ; _ -> False)
|
||||||
|
, (K, \l -> case l of [_, _, _, _] -> True ; _ -> False)
|
||||||
|
, (K_, \l -> case l of [_, _, _, _] -> True ; _ -> False)
|
||||||
|
]
|
7
src/PDF/Content/Operator/Common.hs
Normal file
7
src/PDF/Content/Operator/Common.hs
Normal file
|
@ -0,0 +1,7 @@
|
||||||
|
module PDF.Content.Operator.Common (
|
||||||
|
Signature
|
||||||
|
) where
|
||||||
|
|
||||||
|
import PDF.Object (DirectObject)
|
||||||
|
|
||||||
|
type Signature a = (a, [DirectObject] -> Bool)
|
24
src/PDF/Content/Operator/GraphicState.hs
Normal file
24
src/PDF/Content/Operator/GraphicState.hs
Normal file
|
@ -0,0 +1,24 @@
|
||||||
|
module PDF.Content.Operator.GraphicState (
|
||||||
|
Operator(..)
|
||||||
|
, signature
|
||||||
|
) where
|
||||||
|
|
||||||
|
import PDF.Content.Operator.Common (Signature)
|
||||||
|
import PDF.Object (DirectObject(..))
|
||||||
|
|
||||||
|
data Operator =
|
||||||
|
C_m | W_ | J | J_ | M | D_ | R_i | I_ | G_s -- general graphic state
|
||||||
|
deriving (Bounded, Enum, Show)
|
||||||
|
|
||||||
|
signature :: [Signature Operator]
|
||||||
|
signature = [
|
||||||
|
(C_m, \l -> case l of [_, _, _, _, _, _] -> True ; _ -> False)
|
||||||
|
, (W_, \l -> case l of [_] -> True ; _ -> False)
|
||||||
|
, (J, \l -> case l of [_] -> True ; _ -> False)
|
||||||
|
, (J_, \l -> case l of [_] -> True ; _ -> False)
|
||||||
|
, (M, \l -> case l of [_] -> True ; _ -> False)
|
||||||
|
, (D_, \l -> case l of [_, _] -> True ; _ -> False)
|
||||||
|
, (R_i, \l -> case l of [_] -> True ; _ -> False)
|
||||||
|
, (I_, \l -> case l of [_] -> True ; _ -> False)
|
||||||
|
, (G_s, \l -> case l of [NameObject _] -> True ; _ -> False)
|
||||||
|
]
|
35
src/PDF/Content/Operator/Path.hs
Normal file
35
src/PDF/Content/Operator/Path.hs
Normal file
|
@ -0,0 +1,35 @@
|
||||||
|
module PDF.Content.Operator.Path (
|
||||||
|
Operator(..)
|
||||||
|
, signature
|
||||||
|
) where
|
||||||
|
|
||||||
|
import PDF.Content.Operator.Common (Signature)
|
||||||
|
|
||||||
|
data Operator =
|
||||||
|
M_ | L_ | C_ | V_ | Y_ | H_ | R_e -- path construction
|
||||||
|
| S | S_ | F_ | F | Fstar | B | Bstar | B_ | B_star | N_ -- path painting
|
||||||
|
| W | Wstar -- clipping path
|
||||||
|
deriving (Bounded, Enum, Show)
|
||||||
|
|
||||||
|
signature :: [Signature Operator]
|
||||||
|
signature = [
|
||||||
|
(M_, \l -> case l of [_, _] -> True ; _ -> False)
|
||||||
|
, (L_, \l -> case l of [_, _] -> True ; _ -> False)
|
||||||
|
, (C_, \l -> case l of [_, _, _, _, _, _] -> True ; _ -> False)
|
||||||
|
, (V_, \l -> case l of [_, _, _, _] -> True ; _ -> False)
|
||||||
|
, (Y_, \l -> case l of [_, _, _, _] -> True ; _ -> False)
|
||||||
|
, (H_, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (R_e, \l -> case l of [_, _, _, _] -> True ; _ -> False)
|
||||||
|
, (S, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (S_, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (F_, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (F, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (Fstar, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (B, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (Bstar, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (B_, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (B_star, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (N_, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (W, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (Wstar, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
]
|
32
src/PDF/Content/Operator/Text.hs
Normal file
32
src/PDF/Content/Operator/Text.hs
Normal file
|
@ -0,0 +1,32 @@
|
||||||
|
module PDF.Content.Operator.Text (
|
||||||
|
Operator(..)
|
||||||
|
, signature
|
||||||
|
) where
|
||||||
|
|
||||||
|
import PDF.Content.Operator.Common (Signature)
|
||||||
|
import PDF.Object (DirectObject(..))
|
||||||
|
|
||||||
|
data Operator =
|
||||||
|
Td | TD | Tm | Tstar -- text positioning
|
||||||
|
| TJ | Tj | Quote | DQuote -- text showing
|
||||||
|
| Tc | Tw | Tz | TL | Tf | Tr | Ts -- text state
|
||||||
|
deriving (Bounded, Enum, Show)
|
||||||
|
|
||||||
|
signature :: [Signature Operator]
|
||||||
|
signature = [
|
||||||
|
(Td, \l -> case l of [_, _] -> True ; _ -> False)
|
||||||
|
, (TD, \l -> case l of [_, _] -> True ; _ -> False)
|
||||||
|
, (Tm, \l -> case l of [_, _, _, _, _, _] -> True ; _ -> False)
|
||||||
|
, (Tstar, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (TJ, \l -> case l of [Array _] -> True ; _ -> False)
|
||||||
|
, (Tj, \l -> case l of [StringObject _] -> True ; _ -> False)
|
||||||
|
, (Quote, \l -> case l of [StringObject _] -> True ; _ -> False)
|
||||||
|
, (DQuote, \l -> case l of [StringObject _] -> True ; _ -> False)
|
||||||
|
, (Tc, \l -> case l of [_] -> True ; _ -> False)
|
||||||
|
, (Tw, \l -> case l of [_] -> True ; _ -> False)
|
||||||
|
, (Tz, \l -> case l of [_] -> True ; _ -> False)
|
||||||
|
, (TL, \l -> case l of [_] -> True ; _ -> False)
|
||||||
|
, (Tf, \l -> case l of [NameObject _, _] -> True ; _ -> False)
|
||||||
|
, (Tr, \l -> case l of [_] -> True ; _ -> False)
|
||||||
|
, (Ts, \l -> case l of [_] -> True ; _ -> False)
|
||||||
|
]
|
75
src/PDF/Content/Text.hs
Normal file
75
src/PDF/Content/Text.hs
Normal file
|
@ -0,0 +1,75 @@
|
||||||
|
{-# LANGUAGE OverloadedStrings #-}
|
||||||
|
{-# LANGUAGE ConstraintKinds #-}
|
||||||
|
{-# LANGUAGE FlexibleContexts #-}
|
||||||
|
module PDF.Content.Text (
|
||||||
|
format
|
||||||
|
, renderText
|
||||||
|
) where
|
||||||
|
|
||||||
|
import Control.Monad (foldM)
|
||||||
|
import Control.Monad.Fail (MonadFail(..))
|
||||||
|
import Control.Monad.Reader (MonadReader(..), asks, runReaderT)
|
||||||
|
import Control.Monad.State (MonadState(..), evalStateT)
|
||||||
|
import Data.ByteString.Char8 (pack)
|
||||||
|
import Data.Map ((!))
|
||||||
|
import Data.Text (Text)
|
||||||
|
import PDF.Content (Content(..), ContentUnit(..), GraphicContextUnit(..))
|
||||||
|
import PDF.Content.Operator (Instruction, Operator(..))
|
||||||
|
import PDF.Content.Operator.Text (Operator(..))
|
||||||
|
import PDF.Font (Font, FontSet, emptyFont)
|
||||||
|
import PDF.Object (DirectObject(..), StringObject(..), toByteString)
|
||||||
|
import Prelude hiding (fail)
|
||||||
|
|
||||||
|
type TextContent m = (MonadReader FontSet m, MonadFail m)
|
||||||
|
type FontContext m = (MonadState Font m, TextContent m)
|
||||||
|
|
||||||
|
decodeString :: FontContext m => StringObject -> m Text
|
||||||
|
decodeString input = do
|
||||||
|
font <- get
|
||||||
|
either fail return . font $ toByteString input
|
||||||
|
|
||||||
|
renderText :: MonadFail m => FontSet -> Content -> m [Text]
|
||||||
|
renderText fontSet (Content contentUnits) =
|
||||||
|
runReaderT (concat <$> mapM renderContentUnit contentUnits) fontSet
|
||||||
|
|
||||||
|
renderContentUnit :: TextContent m => ContentUnit -> m [Text]
|
||||||
|
renderContentUnit (GraphicContext graphicContextUnits) =
|
||||||
|
concat <$> mapM renderGraphicContextUnit graphicContextUnits
|
||||||
|
renderContentUnit (TextContext instructions) =
|
||||||
|
evalStateT (concat <$> mapM renderInstruction instructions) emptyFont
|
||||||
|
|
||||||
|
renderGraphicContextUnit :: TextContent m => GraphicContextUnit -> m [Text]
|
||||||
|
renderGraphicContextUnit (GraphicInstruction _) = return []
|
||||||
|
renderGraphicContextUnit (ContentUnit contentUnit) =
|
||||||
|
renderContentUnit contentUnit
|
||||||
|
|
||||||
|
renderInstruction :: FontContext m => Instruction -> m [Text]
|
||||||
|
renderInstruction (Text Tf, [NameObject fontName, _]) =
|
||||||
|
asks (! fontName) >>= put >> return []
|
||||||
|
|
||||||
|
renderInstruction (Text Tstar, []) = return ["\n"]
|
||||||
|
|
||||||
|
renderInstruction (Text TJ, [Array arrayObject]) =
|
||||||
|
replicate 1 <$> foldM appendText "" arrayObject
|
||||||
|
where
|
||||||
|
appendText t (StringObject outputString) =
|
||||||
|
mappend t <$> decodeString outputString
|
||||||
|
appendText t _ = return t
|
||||||
|
|
||||||
|
renderInstruction (Text Tj, [StringObject outputString]) =
|
||||||
|
replicate 1 <$> decodeString outputString
|
||||||
|
|
||||||
|
renderInstruction (Text Quote, [StringObject outputString]) =
|
||||||
|
(\t -> ["\n", t]) <$> decodeString outputString
|
||||||
|
|
||||||
|
renderInstruction (Text DQuote, [StringObject outputString]) =
|
||||||
|
(\t -> ["\n", t]) <$> decodeString outputString
|
||||||
|
|
||||||
|
renderInstruction _ = return []
|
||||||
|
|
||||||
|
format :: String -> [Instruction]
|
||||||
|
format s =
|
||||||
|
case break (== '\n') s of
|
||||||
|
("", "") -> []
|
||||||
|
("", left) -> (Text Tstar, []) : format (drop 1 left)
|
||||||
|
(line, left) -> (Text Tj, [StringObject . Literal $ pack line]) : format left
|
|
@ -6,14 +6,14 @@ module PDF.EOL (
|
||||||
) where
|
) where
|
||||||
|
|
||||||
import Control.Applicative ((<|>))
|
import Control.Applicative ((<|>))
|
||||||
import PDF.Parser (Parser, string)
|
import PDF.Parser (MonadParser, string)
|
||||||
|
|
||||||
data Style = CR | LF | CRLF deriving Show
|
data Style = CR | LF | CRLF deriving Show
|
||||||
|
|
||||||
charset :: String
|
charset :: String
|
||||||
charset = "\r\n"
|
charset = "\r\n"
|
||||||
|
|
||||||
parser :: Parser s Style
|
parser :: MonadParser m => m Style
|
||||||
parser =
|
parser =
|
||||||
(string "\r\n" >> return CRLF)
|
(string "\r\n" >> return CRLF)
|
||||||
<|> (string "\r" >> return CR)
|
<|> (string "\r" >> return CR)
|
||||||
|
|
12
src/PDF/Encoding.hs
Normal file
12
src/PDF/Encoding.hs
Normal file
|
@ -0,0 +1,12 @@
|
||||||
|
module PDF.Encoding (
|
||||||
|
encoding
|
||||||
|
) where
|
||||||
|
|
||||||
|
import Control.Monad.Fail (MonadFail(..))
|
||||||
|
import PDF.Encoding.MacRoman (macRomanEncoding)
|
||||||
|
import PDF.Font (Font)
|
||||||
|
import Prelude hiding (fail)
|
||||||
|
|
||||||
|
encoding :: MonadFail m => String -> m Font
|
||||||
|
encoding "MacRomanEncoding" = return macRomanEncoding
|
||||||
|
encoding s = fail $ "Unknown encoding " ++ s
|
141
src/PDF/Encoding/MacRoman.hs
Normal file
141
src/PDF/Encoding/MacRoman.hs
Normal file
|
@ -0,0 +1,141 @@
|
||||||
|
module PDF.Encoding.MacRoman (
|
||||||
|
macRomanEncoding
|
||||||
|
) where
|
||||||
|
|
||||||
|
import Data.ByteString.Char8 (unpack)
|
||||||
|
import Data.Text (pack)
|
||||||
|
import PDF.Font (Font)
|
||||||
|
|
||||||
|
macRomanEncoding :: Font
|
||||||
|
macRomanEncoding = Right . pack . fmap decode . unpack
|
||||||
|
|
||||||
|
decode :: Char -> Char
|
||||||
|
decode '\x80' = '\x00C4' -- LATIN CAPITAL LETTER A WITH DIAERESIS
|
||||||
|
decode '\x81' = '\x00C5' -- LATIN CAPITAL LETTER A WITH RING ABOVE
|
||||||
|
decode '\x82' = '\x00C7' -- LATIN CAPITAL LETTER C WITH CEDILLA
|
||||||
|
decode '\x83' = '\x00C9' -- LATIN CAPITAL LETTER E WITH ACUTE
|
||||||
|
decode '\x84' = '\x00D1' -- LATIN CAPITAL LETTER N WITH TILDE
|
||||||
|
decode '\x85' = '\x00D6' -- LATIN CAPITAL LETTER O WITH DIAERESIS
|
||||||
|
decode '\x86' = '\x00DC' -- LATIN CAPITAL LETTER U WITH DIAERESIS
|
||||||
|
decode '\x87' = '\x00E1' -- LATIN SMALL LETTER A WITH ACUTE
|
||||||
|
decode '\x88' = '\x00E0' -- LATIN SMALL LETTER A WITH GRAVE
|
||||||
|
decode '\x89' = '\x00E2' -- LATIN SMALL LETTER A WITH CIRCUMFLEX
|
||||||
|
decode '\x8A' = '\x00E4' -- LATIN SMALL LETTER A WITH DIAERESIS
|
||||||
|
decode '\x8B' = '\x00E3' -- LATIN SMALL LETTER A WITH TILDE
|
||||||
|
decode '\x8C' = '\x00E5' -- LATIN SMALL LETTER A WITH RING ABOVE
|
||||||
|
decode '\x8D' = '\x00E7' -- LATIN SMALL LETTER C WITH CEDILLA
|
||||||
|
decode '\x8E' = '\x00E9' -- LATIN SMALL LETTER E WITH ACUTE
|
||||||
|
decode '\x8F' = '\x00E8' -- LATIN SMALL LETTER E WITH GRAVE
|
||||||
|
decode '\x90' = '\x00EA' -- LATIN SMALL LETTER E WITH CIRCUMFLEX
|
||||||
|
decode '\x91' = '\x00EB' -- LATIN SMALL LETTER E WITH DIAERESIS
|
||||||
|
decode '\x92' = '\x00ED' -- LATIN SMALL LETTER I WITH ACUTE
|
||||||
|
decode '\x93' = '\x00EC' -- LATIN SMALL LETTER I WITH GRAVE
|
||||||
|
decode '\x94' = '\x00EE' -- LATIN SMALL LETTER I WITH CIRCUMFLEX
|
||||||
|
decode '\x95' = '\x00EF' -- LATIN SMALL LETTER I WITH DIAERESIS
|
||||||
|
decode '\x96' = '\x00F1' -- LATIN SMALL LETTER N WITH TILDE
|
||||||
|
decode '\x97' = '\x00F3' -- LATIN SMALL LETTER O WITH ACUTE
|
||||||
|
decode '\x98' = '\x00F2' -- LATIN SMALL LETTER O WITH GRAVE
|
||||||
|
decode '\x99' = '\x00F4' -- LATIN SMALL LETTER O WITH CIRCUMFLEX
|
||||||
|
decode '\x9A' = '\x00F6' -- LATIN SMALL LETTER O WITH DIAERESIS
|
||||||
|
decode '\x9B' = '\x00F5' -- LATIN SMALL LETTER O WITH TILDE
|
||||||
|
decode '\x9C' = '\x00FA' -- LATIN SMALL LETTER U WITH ACUTE
|
||||||
|
decode '\x9D' = '\x00F9' -- LATIN SMALL LETTER U WITH GRAVE
|
||||||
|
decode '\x9E' = '\x00FB' -- LATIN SMALL LETTER U WITH CIRCUMFLEX
|
||||||
|
decode '\x9F' = '\x00FC' -- LATIN SMALL LETTER U WITH DIAERESIS
|
||||||
|
decode '\xA0' = '\x2020' -- DAGGER
|
||||||
|
decode '\xA1' = '\x00B0' -- DEGREE SIGN
|
||||||
|
decode '\xA2' = '\x00A2' -- CENT SIGN
|
||||||
|
decode '\xA3' = '\x00A3' -- POUND SIGN
|
||||||
|
decode '\xA4' = '\x00A7' -- SECTION SIGN
|
||||||
|
decode '\xA5' = '\x2022' -- BULLET
|
||||||
|
decode '\xA6' = '\x00B6' -- PILCROW SIGN
|
||||||
|
decode '\xA7' = '\x00DF' -- LATIN SMALL LETTER SHARP S
|
||||||
|
decode '\xA8' = '\x00AE' -- REGISTERED SIGN
|
||||||
|
decode '\xA9' = '\x00A9' -- COPYRIGHT SIGN
|
||||||
|
decode '\xAA' = '\x2122' -- TRADE MARK SIGN
|
||||||
|
decode '\xAB' = '\x00B4' -- ACUTE ACCENT
|
||||||
|
decode '\xAC' = '\x00A8' -- DIAERESIS
|
||||||
|
decode '\xAD' = '\x2260' -- NOT EQUAL TO
|
||||||
|
decode '\xAE' = '\x00C6' -- LATIN CAPITAL LETTER AE
|
||||||
|
decode '\xAF' = '\x00D8' -- LATIN CAPITAL LETTER O WITH STROKE
|
||||||
|
decode '\xB0' = '\x221E' -- INFINITY
|
||||||
|
decode '\xB1' = '\x00B1' -- PLUS-MINUS SIGN
|
||||||
|
decode '\xB2' = '\x2264' -- LESS-THAN OR EQUAL TO
|
||||||
|
decode '\xB3' = '\x2265' -- GREATER-THAN OR EQUAL TO
|
||||||
|
decode '\xB4' = '\x00A5' -- YEN SIGN
|
||||||
|
decode '\xB5' = '\x00B5' -- MICRO SIGN
|
||||||
|
decode '\xB6' = '\x2202' -- PARTIAL DIFFERENTIAL
|
||||||
|
decode '\xB7' = '\x2211' -- N-ARY SUMMATION
|
||||||
|
decode '\xB8' = '\x220F' -- N-ARY PRODUCT
|
||||||
|
decode '\xB9' = '\x03C0' -- GREEK SMALL LETTER PI
|
||||||
|
decode '\xBA' = '\x222B' -- INTEGRAL
|
||||||
|
decode '\xBB' = '\x00AA' -- FEMININE ORDINAL INDICATOR
|
||||||
|
decode '\xBC' = '\x00BA' -- MASCULINE ORDINAL INDICATOR
|
||||||
|
decode '\xBD' = '\x03A9' -- GREEK CAPITAL LETTER OMEGA
|
||||||
|
decode '\xBE' = '\x00E6' -- LATIN SMALL LETTER AE
|
||||||
|
decode '\xBF' = '\x00F8' -- LATIN SMALL LETTER O WITH STROKE
|
||||||
|
decode '\xC0' = '\x00BF' -- INVERTED QUESTION MARK
|
||||||
|
decode '\xC1' = '\x00A1' -- INVERTED EXCLAMATION MARK
|
||||||
|
decode '\xC2' = '\x00AC' -- NOT SIGN
|
||||||
|
decode '\xC3' = '\x221A' -- SQUARE ROOT
|
||||||
|
decode '\xC4' = '\x0192' -- LATIN SMALL LETTER F WITH HOOK
|
||||||
|
decode '\xC5' = '\x2248' -- ALMOST EQUAL TO
|
||||||
|
decode '\xC6' = '\x2206' -- INCREMENT
|
||||||
|
decode '\xC7' = '\x00AB' -- LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
|
||||||
|
decode '\xC8' = '\x00BB' -- RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
|
||||||
|
decode '\xC9' = '\x2026' -- HORIZONTAL ELLIPSIS
|
||||||
|
decode '\xCA' = '\x00A0' -- NO-BREAK SPACE
|
||||||
|
decode '\xCB' = '\x00C0' -- LATIN CAPITAL LETTER A WITH GRAVE
|
||||||
|
decode '\xCC' = '\x00C3' -- LATIN CAPITAL LETTER A WITH TILDE
|
||||||
|
decode '\xCD' = '\x00D5' -- LATIN CAPITAL LETTER O WITH TILDE
|
||||||
|
decode '\xCE' = '\x0152' -- LATIN CAPITAL LIGATURE OE
|
||||||
|
decode '\xCF' = '\x0153' -- LATIN SMALL LIGATURE OE
|
||||||
|
decode '\xD0' = '\x2013' -- EN DASH
|
||||||
|
decode '\xD1' = '\x2014' -- EM DASH
|
||||||
|
decode '\xD2' = '\x201C' -- LEFT DOUBLE QUOTATION MARK
|
||||||
|
decode '\xD3' = '\x201D' -- RIGHT DOUBLE QUOTATION MARK
|
||||||
|
decode '\xD4' = '\x2018' -- LEFT SINGLE QUOTATION MARK
|
||||||
|
decode '\xD5' = '\x2019' -- RIGHT SINGLE QUOTATION MARK
|
||||||
|
decode '\xD6' = '\x00F7' -- DIVISION SIGN
|
||||||
|
decode '\xD7' = '\x25CA' -- LOZENGE
|
||||||
|
decode '\xD8' = '\x00FF' -- LATIN SMALL LETTER Y WITH DIAERESIS
|
||||||
|
decode '\xD9' = '\x0178' -- LATIN CAPITAL LETTER Y WITH DIAERESIS
|
||||||
|
decode '\xDA' = '\x2044' -- FRACTION SLASH
|
||||||
|
decode '\xDB' = '\x20AC' -- EURO SIGN
|
||||||
|
decode '\xDC' = '\x2039' -- SINGLE LEFT-POINTING ANGLE QUOTATION MARK
|
||||||
|
decode '\xDD' = '\x203A' -- SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
|
||||||
|
decode '\xDE' = '\xFB01' -- LATIN SMALL LIGATURE FI
|
||||||
|
decode '\xDF' = '\xFB02' -- LATIN SMALL LIGATURE FL
|
||||||
|
decode '\xE0' = '\x2021' -- DOUBLE DAGGER
|
||||||
|
decode '\xE1' = '\x00B7' -- MIDDLE DOT
|
||||||
|
decode '\xE2' = '\x201A' -- SINGLE LOW-9 QUOTATION MARK
|
||||||
|
decode '\xE3' = '\x201E' -- DOUBLE LOW-9 QUOTATION MARK
|
||||||
|
decode '\xE4' = '\x2030' -- PER MILLE SIGN
|
||||||
|
decode '\xE5' = '\x00C2' -- LATIN CAPITAL LETTER A WITH CIRCUMFLEX
|
||||||
|
decode '\xE6' = '\x00CA' -- LATIN CAPITAL LETTER E WITH CIRCUMFLEX
|
||||||
|
decode '\xE7' = '\x00C1' -- LATIN CAPITAL LETTER A WITH ACUTE
|
||||||
|
decode '\xE8' = '\x00CB' -- LATIN CAPITAL LETTER E WITH DIAERESIS
|
||||||
|
decode '\xE9' = '\x00C8' -- LATIN CAPITAL LETTER E WITH GRAVE
|
||||||
|
decode '\xEA' = '\x00CD' -- LATIN CAPITAL LETTER I WITH ACUTE
|
||||||
|
decode '\xEB' = '\x00CE' -- LATIN CAPITAL LETTER I WITH CIRCUMFLEX
|
||||||
|
decode '\xEC' = '\x00CF' -- LATIN CAPITAL LETTER I WITH DIAERESIS
|
||||||
|
decode '\xED' = '\x00CC' -- LATIN CAPITAL LETTER I WITH GRAVE
|
||||||
|
decode '\xEE' = '\x00D3' -- LATIN CAPITAL LETTER O WITH ACUTE
|
||||||
|
decode '\xEF' = '\x00D4' -- LATIN CAPITAL LETTER O WITH CIRCUMFLEX
|
||||||
|
decode '\xF0' = '\xF8FF' -- Apple logo
|
||||||
|
decode '\xF1' = '\x00D2' -- LATIN CAPITAL LETTER O WITH GRAVE
|
||||||
|
decode '\xF2' = '\x00DA' -- LATIN CAPITAL LETTER U WITH ACUTE
|
||||||
|
decode '\xF3' = '\x00DB' -- LATIN CAPITAL LETTER U WITH CIRCUMFLEX
|
||||||
|
decode '\xF4' = '\x00D9' -- LATIN CAPITAL LETTER U WITH GRAVE
|
||||||
|
decode '\xF5' = '\x0131' -- LATIN SMALL LETTER DOTLESS I
|
||||||
|
decode '\xF6' = '\x02C6' -- MODIFIER LETTER CIRCUMFLEX ACCENT
|
||||||
|
decode '\xF7' = '\x02DC' -- SMALL TILDE
|
||||||
|
decode '\xF8' = '\x00AF' -- MACRON
|
||||||
|
decode '\xF9' = '\x02D8' -- BREVE
|
||||||
|
decode '\xFA' = '\x02D9' -- DOT ABOVE
|
||||||
|
decode '\xFB' = '\x02DA' -- RING ABOVE
|
||||||
|
decode '\xFC' = '\x00B8' -- CEDILLA
|
||||||
|
decode '\xFD' = '\x02DD' -- DOUBLE ACUTE ACCENT
|
||||||
|
decode '\xFE' = '\x02DB' -- OGONEK
|
||||||
|
decode '\xFF' = '\x02C7' -- CARON
|
||||||
|
decode c = c -- The rest is ASCII
|
16
src/PDF/Font.hs
Normal file
16
src/PDF/Font.hs
Normal file
|
@ -0,0 +1,16 @@
|
||||||
|
module PDF.Font (
|
||||||
|
Font
|
||||||
|
, FontSet
|
||||||
|
, emptyFont
|
||||||
|
) where
|
||||||
|
|
||||||
|
import Data.ByteString (ByteString)
|
||||||
|
import Data.Map (Map)
|
||||||
|
import Data.Text (Text)
|
||||||
|
import PDF.Object (Name)
|
||||||
|
|
||||||
|
type Font = ByteString -> Either String Text
|
||||||
|
type FontSet = Map Name Font
|
||||||
|
|
||||||
|
emptyFont :: Font
|
||||||
|
emptyFont _ = Left "No fond loaded"
|
|
@ -3,6 +3,7 @@
|
||||||
{-# LANGUAGE FlexibleInstances #-}
|
{-# LANGUAGE FlexibleInstances #-}
|
||||||
module PDF.Object (
|
module PDF.Object (
|
||||||
Content(..)
|
Content(..)
|
||||||
|
, Dictionary
|
||||||
, DirectObject(..)
|
, DirectObject(..)
|
||||||
, Flow(..)
|
, Flow(..)
|
||||||
, IndexedObjects
|
, IndexedObjects
|
||||||
|
@ -12,9 +13,11 @@ module PDF.Object (
|
||||||
, Number(..)
|
, Number(..)
|
||||||
, Object(..)
|
, Object(..)
|
||||||
, Occurrence(..)
|
, Occurrence(..)
|
||||||
|
, StringObject(..)
|
||||||
, Structure(..)
|
, Structure(..)
|
||||||
, XRefEntry(..)
|
, XRefEntry(..)
|
||||||
, XRefSection
|
, XRefSection
|
||||||
|
, array
|
||||||
, blank
|
, blank
|
||||||
, dictionary
|
, dictionary
|
||||||
, directObject
|
, directObject
|
||||||
|
@ -22,34 +25,36 @@ module PDF.Object (
|
||||||
, integer
|
, integer
|
||||||
, line
|
, line
|
||||||
, magicNumber
|
, magicNumber
|
||||||
|
, name
|
||||||
|
, number
|
||||||
|
, regular
|
||||||
|
, stringObject
|
||||||
, structure
|
, structure
|
||||||
|
, toByteString
|
||||||
) where
|
) where
|
||||||
|
|
||||||
import Control.Applicative ((<|>))
|
import Control.Applicative ((<|>), many)
|
||||||
import Data.ByteString.Char8 (ByteString)
|
import Data.Attoparsec.ByteString.Char8 (choice, count, option, sepBy)
|
||||||
import qualified Data.ByteString.Char8 as BS (
|
import Data.ByteString (ByteString)
|
||||||
concat, cons, pack, singleton, unpack
|
import qualified Data.ByteString as BS (concat)
|
||||||
)
|
import qualified Data.ByteString.Char8 as Char8 (cons, pack, singleton, unpack)
|
||||||
|
import Data.ByteString.Char8.Util (B16Int(..), b16ToBytes, unescape)
|
||||||
import Data.Map (Map, (!), mapWithKey)
|
import Data.Map (Map, (!), mapWithKey)
|
||||||
import qualified Data.Map as Map (
|
import qualified Data.Map as Map (
|
||||||
delete, empty, fromList, lookup, minViewWithKey, toList, union
|
delete, empty, fromList, lookup, minViewWithKey, toList, union
|
||||||
)
|
)
|
||||||
import qualified PDF.EOL as EOL (charset, parser)
|
import qualified PDF.EOL as EOL (charset, parser)
|
||||||
import qualified PDF.Output as Output (concat, line, string)
|
import qualified PDF.Output as Output (line, string)
|
||||||
import PDF.Output (
|
import PDF.Output (
|
||||||
OBuilder, ObjectId(..), Offset(..), Output(..), Resource(..)
|
OBuilder, ObjectId(..), Offset(..), Output(..), Resource(..)
|
||||||
, byteString, getObjectId, getOffset, getOffsets, join, newLine
|
, byteString, getObjectId, getOffset, getOffsets, join, newLine
|
||||||
, saveOffset
|
, saveOffset
|
||||||
)
|
)
|
||||||
import PDF.Parser (
|
import PDF.Parser (MonadParser(..), Parser, octDigit, oneOf)
|
||||||
Parser, (<?>)
|
|
||||||
, char, choice, count, decNumber, hexNumber, many, octDigit, oneOf, option
|
|
||||||
, sepBy, string, takeAll, takeAll1
|
|
||||||
)
|
|
||||||
import Text.Printf (printf)
|
import Text.Printf (printf)
|
||||||
|
|
||||||
line :: String -> Parser u ()
|
line :: MonadParser m => String -> m ()
|
||||||
line l = (string (BS.pack l) *> blank *> return ()) <?> printf "line «%s»" l
|
line l = (string (Char8.pack l) *> blank *> return ())
|
||||||
|
|
||||||
magicNumber :: ByteString
|
magicNumber :: ByteString
|
||||||
magicNumber = "%PDF-"
|
magicNumber = "%PDF-"
|
||||||
|
@ -60,8 +65,8 @@ eofMarker = "%%EOF"
|
||||||
whiteSpaceCharset :: String
|
whiteSpaceCharset :: String
|
||||||
whiteSpaceCharset = "\0\t\12 "
|
whiteSpaceCharset = "\0\t\12 "
|
||||||
|
|
||||||
blank :: Parser u ()
|
blank :: MonadParser m => m ()
|
||||||
blank = takeAll (`elem` (EOL.charset ++ whiteSpaceCharset)) *> return ()
|
blank = takeAll (`elem` (EOL.charset ++ whiteSpaceCharset)) *> pure ()
|
||||||
|
|
||||||
delimiterCharset :: String
|
delimiterCharset :: String
|
||||||
delimiterCharset = "()<>[]{}/%"
|
delimiterCharset = "()<>[]{}/%"
|
||||||
|
@ -69,8 +74,8 @@ delimiterCharset = "()<>[]{}/%"
|
||||||
regular :: Char -> Bool
|
regular :: Char -> Bool
|
||||||
regular = not . (`elem` (EOL.charset ++ whiteSpaceCharset ++ delimiterCharset))
|
regular = not . (`elem` (EOL.charset ++ whiteSpaceCharset ++ delimiterCharset))
|
||||||
|
|
||||||
integer :: (Read a, Num a) => Parser u a
|
integer :: (Read a, Num a, MonadParser m) => m a
|
||||||
integer = read . BS.unpack <$> decNumber <* blank <?> "decimal integer"
|
integer = read . Char8.unpack <$> decNumber <* blank
|
||||||
|
|
||||||
-------------------------------------
|
-------------------------------------
|
||||||
-- OBJECTS
|
-- OBJECTS
|
||||||
|
@ -81,9 +86,9 @@ type IndexedObjects = Map ObjectId Object
|
||||||
--
|
--
|
||||||
-- Boolean
|
-- Boolean
|
||||||
--
|
--
|
||||||
boolean :: Parser u Bool
|
boolean :: MonadParser m => m Bool
|
||||||
boolean =
|
boolean =
|
||||||
(string "true" *> return True) <|> (string "false" *> return False) <?> "boolean"
|
(string "true" *> return True) <|> (string "false" *> return False)
|
||||||
|
|
||||||
--
|
--
|
||||||
-- Number
|
-- Number
|
||||||
|
@ -96,38 +101,40 @@ instance Output Number where
|
||||||
(n, 0) -> printf "%d" (n :: Int)
|
(n, 0) -> printf "%d" (n :: Int)
|
||||||
_ -> printf "%f" f
|
_ -> printf "%f" f
|
||||||
|
|
||||||
number :: Parser u Number
|
number :: MonadParser m => m Number
|
||||||
number = Number . read . BS.unpack <$>
|
number = Number . read . Char8.unpack <$>
|
||||||
(mappend <$> sign <*> (integerPart <|> BS.cons '0' <$> floatPart))
|
(mappend <$> sign <*> (integerPart <|> Char8.cons '0' <$> floatPart))
|
||||||
<?> "number"
|
|
||||||
where
|
where
|
||||||
sign = string "-" <|> option "" (char '+' >> return "")
|
sign = string "-" <|> option "" (char '+' >> return "")
|
||||||
integerPart = mappend <$> decNumber <*> option "" floatPart
|
integerPart = mappend <$> decNumber <*> option "" floatPart
|
||||||
floatPart = BS.cons <$> char '.' <*> (option "0" $ decNumber)
|
floatPart = Char8.cons <$> char '.' <*> (option "0" $ decNumber)
|
||||||
|
|
||||||
--
|
--
|
||||||
-- StringObject
|
-- StringObject
|
||||||
--
|
--
|
||||||
data StringObject = Literal String | Hexadecimal String deriving Show
|
data StringObject = Literal ByteString | Hexadecimal B16Int deriving Show
|
||||||
|
|
||||||
instance Output StringObject where
|
instance Output StringObject where
|
||||||
output (Literal s) = Output.string (printf "(%s)" s)
|
output (Literal s) = Output.string (printf "(%s)" (Char8.unpack s))
|
||||||
output (Hexadecimal s) = Output.string (printf "<%s>" s)
|
output (Hexadecimal (B16Int n)) = Output.string (printf "<%s>" (Char8.unpack n))
|
||||||
|
|
||||||
stringObject :: Parser u StringObject
|
stringObject :: MonadParser m => m StringObject
|
||||||
stringObject =
|
stringObject =
|
||||||
Literal . BS.unpack <$> (char '(' *> (BS.concat <$> literalString) <* char ')')
|
Literal <$> (char '(' *> (BS.concat <$> literalString) <* char ')')
|
||||||
<|> Hexadecimal . BS.unpack <$> (char '<' *> hexNumber <* char '>')
|
<|> Hexadecimal <$> (char '<' *> hexNumber <* char '>')
|
||||||
<?> "string object (literal or hexadecimal)"
|
|
||||||
where
|
where
|
||||||
literalString = many literalStringBlock
|
literalString = many literalStringBlock
|
||||||
literalStringBlock = takeAll1 normalChar <|> matchingParenthesis <|> escapedChar
|
literalStringBlock = takeAll1 normalChar <|> matchingParenthesis <|> escapedChar
|
||||||
normalChar = not . (`elem` ("\\()" :: String))
|
normalChar = not . (`elem` ("\\()" :: String))
|
||||||
matchingParenthesis =
|
matchingParenthesis =
|
||||||
mappend <$> (BS.cons <$> char '(' <*> literalStringBlock) <*> string ")"
|
mappend <$> (Char8.cons <$> char '(' <*> literalStringBlock) <*> string ")"
|
||||||
escapedChar =
|
escapedChar =
|
||||||
BS.cons <$> char '\\' <*> (BS.singleton <$> oneOf "nrtbf()\\" <|> octalCode)
|
Char8.cons <$> char '\\' <*> (Char8.singleton <$> oneOf "nrtbf()\\\n" <|> octalCode)
|
||||||
octalCode = choice $ (\n -> BS.pack <$> count n octDigit) <$> [1..3]
|
octalCode = choice $ (\n -> Char8.pack <$> count n octDigit) <$> [1..3]
|
||||||
|
|
||||||
|
toByteString :: StringObject -> ByteString
|
||||||
|
toByteString (Hexadecimal h) = b16ToBytes h
|
||||||
|
toByteString (Literal s) = unescape s
|
||||||
|
|
||||||
--
|
--
|
||||||
-- Name
|
-- Name
|
||||||
|
@ -137,15 +144,15 @@ newtype Name = Name String deriving (Eq, Ord, Show)
|
||||||
instance Output Name where
|
instance Output Name where
|
||||||
output (Name n) = Output.string ('/':n)
|
output (Name n) = Output.string ('/':n)
|
||||||
|
|
||||||
name :: Parser u Name
|
name :: MonadParser m => m Name
|
||||||
name = Name . BS.unpack <$> (char '/' *> takeAll regular) <?> "name"
|
name = Name . Char8.unpack <$> (char '/' *> takeAll regular)
|
||||||
|
|
||||||
--
|
--
|
||||||
-- Array
|
-- Array
|
||||||
--
|
--
|
||||||
array :: Parser u [DirectObject]
|
array :: MonadParser m => m [DirectObject]
|
||||||
array =
|
array =
|
||||||
char '[' *> blank *> directObject `sepBy` blank <* blank <* char ']' <?> "array"
|
char '[' *> blank *> directObject `sepBy` blank <* blank <* char ']'
|
||||||
|
|
||||||
--
|
--
|
||||||
-- Dictionary
|
-- Dictionary
|
||||||
|
@ -153,16 +160,16 @@ array =
|
||||||
type Dictionary = Map Name DirectObject
|
type Dictionary = Map Name DirectObject
|
||||||
|
|
||||||
instance Output Dictionary where
|
instance Output Dictionary where
|
||||||
output dict =
|
output aDictionary =
|
||||||
"<<" `mappend` keyValues `mappend` ">>"
|
"<<" `mappend` keyValues `mappend` ">>"
|
||||||
where
|
where
|
||||||
keyValues = join " " $ outputKeyVal <$> Map.toList dict
|
keyValues = join " " $ outputKeyVal <$> Map.toList aDictionary
|
||||||
outputKeyVal :: (Name, DirectObject) -> OBuilder
|
outputKeyVal :: (Name, DirectObject) -> OBuilder
|
||||||
outputKeyVal (key, val) = Output.concat [output key, " ", output val]
|
outputKeyVal (key, val) = mconcat [output key, " ", output val]
|
||||||
|
|
||||||
dictionary :: Parser u Dictionary
|
dictionary :: MonadParser m => m Dictionary
|
||||||
dictionary =
|
dictionary =
|
||||||
string "<<" *> blank *> keyValPairs <* string ">>" <?> "dictionary"
|
string "<<" *> blank *> keyValPairs <* string ">>"
|
||||||
where
|
where
|
||||||
keyVal = (,) <$> name <* blank <*> directObject
|
keyVal = (,) <$> name <* blank <*> directObject
|
||||||
keyValPairs = Map.fromList <$> keyVal `sepBy` blank <* blank
|
keyValPairs = Map.fromList <$> keyVal `sepBy` blank <* blank
|
||||||
|
@ -170,8 +177,8 @@ dictionary =
|
||||||
--
|
--
|
||||||
-- Null
|
-- Null
|
||||||
--
|
--
|
||||||
nullObject :: Parser u ()
|
nullObject :: MonadParser m => m ()
|
||||||
nullObject = string "null" *> return () <?> "null object"
|
nullObject = string "null" *> return ()
|
||||||
|
|
||||||
--
|
--
|
||||||
-- Reference
|
-- Reference
|
||||||
|
@ -181,9 +188,9 @@ data IndirectObjCoordinates = IndirectObjCoordinates {
|
||||||
, versionNumber :: Int
|
, versionNumber :: Int
|
||||||
} deriving Show
|
} deriving Show
|
||||||
|
|
||||||
reference :: Parser u IndirectObjCoordinates
|
reference :: MonadParser m => m IndirectObjCoordinates
|
||||||
reference = IndirectObjCoordinates
|
reference = IndirectObjCoordinates
|
||||||
<$> (fmap ObjectId integer) <*> integer <* char 'R' <?> "reference to an object"
|
<$> (fmap ObjectId integer) <*> integer <* char 'R'
|
||||||
|
|
||||||
--
|
--
|
||||||
-- DirectObject
|
-- DirectObject
|
||||||
|
@ -204,23 +211,25 @@ instance Output DirectObject where
|
||||||
output (NumberObject n) = output n
|
output (NumberObject n) = output n
|
||||||
output (StringObject s) = output s
|
output (StringObject s) = output s
|
||||||
output (NameObject n) = output n
|
output (NameObject n) = output n
|
||||||
output (Array a) = Output.concat ["[", join " " a, "]"]
|
output (Array a) = mconcat ["[", join " " a, "]"]
|
||||||
output (Dictionary d) = output d
|
output (Dictionary d) = output d
|
||||||
output (Null) = "null"
|
output (Null) = "null"
|
||||||
output (Reference (IndirectObjCoordinates {objectId, versionNumber})) =
|
output (Reference (IndirectObjCoordinates {objectId, versionNumber})) =
|
||||||
Output.string (printf "%d %d R" (getObjectId objectId) versionNumber)
|
Output.string (printf "%d %d R" (getObjectId objectId) versionNumber)
|
||||||
|
|
||||||
directObject :: Parser u DirectObject
|
directObject :: MonadParser m => m DirectObject
|
||||||
directObject =
|
directObject = peek >>= dispatch
|
||||||
Boolean <$> boolean
|
where
|
||||||
<|> Reference <$> reference {- defined before Number because Number is a prefix of it -}
|
dispatch 't' = Boolean <$> boolean
|
||||||
<|> NumberObject <$> number
|
dispatch 'f' = Boolean <$> boolean
|
||||||
<|> StringObject <$> stringObject
|
dispatch '(' = StringObject <$> stringObject
|
||||||
<|> NameObject <$> name
|
dispatch '<' = StringObject <$> stringObject <|> Dictionary <$> dictionary
|
||||||
<|> Array <$> array
|
dispatch '/' = NameObject <$> name
|
||||||
<|> Dictionary <$> dictionary
|
dispatch '[' = Array <$> array
|
||||||
<|> const Null <$> nullObject
|
dispatch 'n' = nullObject *> return Null
|
||||||
<?> "direct object"
|
dispatch _ =
|
||||||
|
Reference <$> reference {- defined before Number because Number is a prefix of it -}
|
||||||
|
<|> NumberObject <$> number
|
||||||
|
|
||||||
--
|
--
|
||||||
-- Object
|
-- Object
|
||||||
|
@ -235,7 +244,7 @@ data Object =
|
||||||
|
|
||||||
instance Output Object where
|
instance Output Object where
|
||||||
output (Direct d) = output d
|
output (Direct d) = output d
|
||||||
output (Stream {header, streamContent}) = Output.concat [
|
output (Stream {header, streamContent}) = mconcat [
|
||||||
output header, newLine
|
output header, newLine
|
||||||
, Output.line "stream"
|
, Output.line "stream"
|
||||||
, byteString streamContent
|
, byteString streamContent
|
||||||
|
@ -250,7 +259,7 @@ data Occurrence = Comment String | Indirect IndirectObjCoordinates deriving Show
|
||||||
outputOccurrence :: IndexedObjects -> Occurrence -> OBuilder
|
outputOccurrence :: IndexedObjects -> Occurrence -> OBuilder
|
||||||
outputOccurrence _ (Comment c) = Output.line c
|
outputOccurrence _ (Comment c) = Output.line c
|
||||||
outputOccurrence objects (Indirect (IndirectObjCoordinates {objectId, versionNumber})) =
|
outputOccurrence objects (Indirect (IndirectObjCoordinates {objectId, versionNumber})) =
|
||||||
saveOffset (Object objectId) >> Output.concat [
|
saveOffset (Object objectId) >> mconcat [
|
||||||
Output.line (printf "%d %d obj" (getObjectId objectId) versionNumber)
|
Output.line (printf "%d %d obj" (getObjectId objectId) versionNumber)
|
||||||
, output (objects ! objectId), newLine
|
, output (objects ! objectId), newLine
|
||||||
, Output.line "endobj"
|
, Output.line "endobj"
|
||||||
|
@ -280,7 +289,7 @@ instance Output XRefEntry where
|
||||||
entry :: Parser u XRefEntry
|
entry :: Parser u XRefEntry
|
||||||
entry = do
|
entry = do
|
||||||
(big, small) <- (,) <$> integer <*> integer
|
(big, small) <- (,) <$> integer <*> integer
|
||||||
(inUse big small <|> free big small <?> "XRef entry") <* blank
|
(inUse big small <|> free big small) <* blank
|
||||||
where
|
where
|
||||||
inUse :: Int -> Int -> Parser u XRefEntry
|
inUse :: Int -> Int -> Parser u XRefEntry
|
||||||
inUse big generation =
|
inUse big generation =
|
||||||
|
@ -304,7 +313,7 @@ instance Output XRefSubSection where
|
||||||
|
|
||||||
xRefSubSection :: Parser u XRefSubSection
|
xRefSubSection :: Parser u XRefSubSection
|
||||||
xRefSubSection = do
|
xRefSubSection = do
|
||||||
(firstId, entriesNumber) <- (,) <$> integer <*> integer <?> "XRef subsection"
|
(firstId, entriesNumber) <- (,) <$> integer <*> integer
|
||||||
entries <- count entriesNumber entry
|
entries <- count entriesNumber entry
|
||||||
return $ XRefSubSection {firstObjectId = ObjectId firstId, entries}
|
return $ XRefSubSection {firstObjectId = ObjectId firstId, entries}
|
||||||
|
|
||||||
|
@ -383,7 +392,7 @@ outputBody (occurrences, objects) =
|
||||||
instance Output Content where
|
instance Output Content where
|
||||||
output (Content {occurrences, objects, docStructure}) =
|
output (Content {occurrences, objects, docStructure}) =
|
||||||
fmap (updateXrefs xRef) <$> getOffsets (outputBody (occurrences, objects))
|
fmap (updateXrefs xRef) <$> getOffsets (outputBody (occurrences, objects))
|
||||||
>>= \(body, (xref, startXRef)) -> Output.concat [
|
>>= \(body, (xref, startXRef)) -> mconcat [
|
||||||
body
|
body
|
||||||
, Output.line "xref"
|
, Output.line "xref"
|
||||||
, output xref
|
, output xref
|
||||||
|
|
82
src/PDF/Object/Navigation.hs
Normal file
82
src/PDF/Object/Navigation.hs
Normal file
|
@ -0,0 +1,82 @@
|
||||||
|
{-# LANGUAGE NamedFieldPuns #-}
|
||||||
|
{-# LANGUAGE ConstraintKinds #-}
|
||||||
|
{-# LANGUAGE FlexibleContexts #-}
|
||||||
|
module PDF.Object.Navigation (
|
||||||
|
(//)
|
||||||
|
, dictionaryById
|
||||||
|
, getDictionary
|
||||||
|
, getField
|
||||||
|
, follow
|
||||||
|
, objectById
|
||||||
|
, openStream
|
||||||
|
, origin
|
||||||
|
) where
|
||||||
|
|
||||||
|
import Codec.Compression.Zlib (decompress)
|
||||||
|
import Control.Monad.Reader (MonadReader(..))
|
||||||
|
import Control.Monad.Fail (MonadFail(..))
|
||||||
|
import Data.ByteString (ByteString)
|
||||||
|
import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict)
|
||||||
|
import Data.Map ((!))
|
||||||
|
import qualified Data.Map as Map (lookup)
|
||||||
|
import PDF.Object (
|
||||||
|
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
|
||||||
|
, Name(..), Object(..), Structure(..)
|
||||||
|
)
|
||||||
|
import PDF.Output (ObjectId)
|
||||||
|
import Prelude hiding (fail)
|
||||||
|
import Text.Printf (printf)
|
||||||
|
|
||||||
|
type PDFContent m = (MonadReader Content m, MonadFail m)
|
||||||
|
|
||||||
|
castDictionary :: MonadFail m => Object -> m Dictionary
|
||||||
|
castDictionary (Direct (Dictionary aDict)) = return aDict
|
||||||
|
castDictionary obj = expected "dictionary : " obj
|
||||||
|
|
||||||
|
castObjectId :: MonadFail m => DirectObject -> m ObjectId
|
||||||
|
castObjectId (Reference (IndirectObjCoordinates {objectId})) = return objectId
|
||||||
|
castObjectId directObject = expected "reference" directObject
|
||||||
|
|
||||||
|
dictionaryById :: PDFContent m => ObjectId -> m Dictionary
|
||||||
|
dictionaryById objectId = objectById objectId >>= castDictionary
|
||||||
|
|
||||||
|
expected :: (MonadFail m, Show a) => String -> a -> m b
|
||||||
|
expected name = fail . printf "Not a %s: %s" name . show
|
||||||
|
|
||||||
|
getField :: MonadFail m => String -> Dictionary -> m DirectObject
|
||||||
|
getField key aDictionary =
|
||||||
|
maybe (fail errorMessage) return (Map.lookup (Name key) aDictionary)
|
||||||
|
where
|
||||||
|
errorMessage =
|
||||||
|
printf "Key %s not found in dictionary %s" key (show aDictionary)
|
||||||
|
|
||||||
|
follow :: PDFContent m => DirectObject -> m Object
|
||||||
|
follow directObject = castObjectId directObject >>= objectById
|
||||||
|
|
||||||
|
objectById :: PDFContent m => ObjectId -> m Object
|
||||||
|
objectById objectId = do
|
||||||
|
content <- ask
|
||||||
|
return (objects content ! objectId)
|
||||||
|
|
||||||
|
getDictionary :: PDFContent m => DirectObject -> m Dictionary
|
||||||
|
getDictionary (Dictionary aDictionary) = return aDictionary
|
||||||
|
getDictionary (Reference (IndirectObjCoordinates {objectId})) =
|
||||||
|
objectById objectId >>= castDictionary
|
||||||
|
getDictionary aDirectObject =
|
||||||
|
expected "resource (dictionary or reference)" aDirectObject
|
||||||
|
|
||||||
|
(//) :: PDFContent m => Dictionary -> [String] -> m DirectObject
|
||||||
|
(//) aDict [] = return $ Dictionary aDict
|
||||||
|
(//) aDict [key] = getField key aDict
|
||||||
|
(//) aDict (key:keys) = getField key aDict >>= getDictionary >>= (// keys)
|
||||||
|
|
||||||
|
origin :: PDFContent m => m Dictionary
|
||||||
|
origin = trailer . docStructure <$> ask
|
||||||
|
|
||||||
|
openStream :: MonadFail m => Object -> m ByteString
|
||||||
|
openStream (Stream {header, streamContent}) = return $
|
||||||
|
case Map.lookup (Name "Filter") header of
|
||||||
|
Just (NameObject (Name "FlateDecode")) ->
|
||||||
|
Lazy.toStrict . decompress . Lazy.fromStrict $ streamContent
|
||||||
|
_ -> streamContent
|
||||||
|
openStream obj = expected "stream" obj
|
|
@ -12,7 +12,6 @@ module PDF.Output (
|
||||||
, Resource(..)
|
, Resource(..)
|
||||||
, byteString
|
, byteString
|
||||||
, char
|
, char
|
||||||
, concat
|
|
||||||
, getOffsets
|
, getOffsets
|
||||||
, join
|
, join
|
||||||
, line
|
, line
|
||||||
|
@ -32,7 +31,6 @@ import qualified Data.Map as Map (singleton)
|
||||||
import Data.String (IsString(..))
|
import Data.String (IsString(..))
|
||||||
import Control.Monad.RWS (RWS, runRWS, ask, get, listen, modify, tell)
|
import Control.Monad.RWS (RWS, runRWS, ask, get, listen, modify, tell)
|
||||||
import qualified PDF.EOL as EOL (Style(..))
|
import qualified PDF.EOL as EOL (Style(..))
|
||||||
import Prelude hiding (concat)
|
|
||||||
|
|
||||||
newtype ObjectId = ObjectId {getObjectId :: Int} deriving (Enum, Eq, Ord, Show)
|
newtype ObjectId = ObjectId {getObjectId :: Int} deriving (Enum, Eq, Ord, Show)
|
||||||
newtype Offset = Offset {getOffset :: Int} deriving (Show)
|
newtype Offset = Offset {getOffset :: Int} deriving (Show)
|
||||||
|
@ -66,9 +64,6 @@ getOffsets (OContext builder) =
|
||||||
append :: OBuilder -> OBuilder -> OBuilder
|
append :: OBuilder -> OBuilder -> OBuilder
|
||||||
append (OContext a) (OContext b) = OContext (mappend <$> a <*> b)
|
append (OContext a) (OContext b) = OContext (mappend <$> a <*> b)
|
||||||
|
|
||||||
concat :: [OBuilder] -> OBuilder
|
|
||||||
concat = foldl mappend mempty
|
|
||||||
|
|
||||||
#if MIN_VERSION_base(4,11,0)
|
#if MIN_VERSION_base(4,11,0)
|
||||||
instance Semigroup OBuilder where
|
instance Semigroup OBuilder where
|
||||||
(<>) = append
|
(<>) = append
|
||||||
|
@ -94,7 +89,7 @@ instance Output Bool where
|
||||||
output True = string "true"
|
output True = string "true"
|
||||||
|
|
||||||
instance Output a => Output [a] where
|
instance Output a => Output [a] where
|
||||||
output = concat . fmap output
|
output = mconcat . fmap output
|
||||||
|
|
||||||
join :: Output a => String -> [a] -> OBuilder
|
join :: Output a => String -> [a] -> OBuilder
|
||||||
join _ [] = mempty
|
join _ [] = mempty
|
||||||
|
@ -116,7 +111,7 @@ char :: Char -> OBuilder
|
||||||
char c = lift char8 c <* offset (+1)
|
char c = lift char8 c <* offset (+1)
|
||||||
|
|
||||||
string :: String -> OBuilder
|
string :: String -> OBuilder
|
||||||
string s = lift string8 s <* offset (+ toEnum (length s))
|
string s = lift string8 s <* offset (+ length s)
|
||||||
|
|
||||||
line :: String -> OBuilder
|
line :: String -> OBuilder
|
||||||
line l = string l `mappend` newLine
|
line l = string l `mappend` newLine
|
||||||
|
|
133
src/PDF/Pages.hs
Executable file
133
src/PDF/Pages.hs
Executable file
|
@ -0,0 +1,133 @@
|
||||||
|
{-# LANGUAGE NamedFieldPuns #-}
|
||||||
|
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
|
||||||
|
module PDF.Pages (
|
||||||
|
Page(..)
|
||||||
|
, get
|
||||||
|
, getAll
|
||||||
|
) where
|
||||||
|
|
||||||
|
import Control.Applicative (Alternative(..), (<|>))
|
||||||
|
import Control.Monad (MonadPlus(..), foldM)
|
||||||
|
import Control.Monad.Fail (MonadFail(..))
|
||||||
|
import Control.Monad.RWS (RWST(..), evalRWST, modify)
|
||||||
|
import qualified Control.Monad.RWS as RWS (get)
|
||||||
|
import Data.Map (Map)
|
||||||
|
import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
|
||||||
|
import Data.Text (Text)
|
||||||
|
import PDF.CMap (cMap)
|
||||||
|
import qualified PDF.Content as Content (parse)
|
||||||
|
import PDF.Content.Text (renderText)
|
||||||
|
import PDF.Encoding (encoding)
|
||||||
|
import PDF.Font (Font, FontSet)
|
||||||
|
import PDF.Object (
|
||||||
|
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
|
||||||
|
, Name(..)
|
||||||
|
,)
|
||||||
|
import PDF.Object.Navigation (
|
||||||
|
(//), dictionaryById, getDictionary, getField, follow, openStream, origin
|
||||||
|
)
|
||||||
|
import PDF.Output (ObjectId(..))
|
||||||
|
import Prelude hiding (fail)
|
||||||
|
import Text.Printf (printf)
|
||||||
|
|
||||||
|
type CachedFonts = Map ObjectId Font
|
||||||
|
newtype Error a = Error {
|
||||||
|
runError :: Either String a
|
||||||
|
} deriving (Alternative, Functor, Applicative, Monad, MonadPlus)
|
||||||
|
type T = RWST Content () CachedFonts Error
|
||||||
|
data Page = Page {
|
||||||
|
contents :: [Text]
|
||||||
|
, source :: ObjectId
|
||||||
|
}
|
||||||
|
|
||||||
|
instance MonadFail Error where
|
||||||
|
fail = Error . Left
|
||||||
|
|
||||||
|
getFontDictionary :: Dictionary -> T Dictionary
|
||||||
|
getFontDictionary pageDict =
|
||||||
|
((pageDict // ["Resources", "Font"]) >>= getDictionary)
|
||||||
|
<|> return Map.empty
|
||||||
|
|
||||||
|
cache :: (ObjectId -> T Font) -> ObjectId -> T Font
|
||||||
|
cache loader objectId =
|
||||||
|
(maybe load return . Map.lookup objectId) =<< RWS.get
|
||||||
|
where
|
||||||
|
load = do
|
||||||
|
value <- loader objectId
|
||||||
|
modify $ Map.insert objectId value
|
||||||
|
return value
|
||||||
|
|
||||||
|
loadFont :: ObjectId -> T Font
|
||||||
|
loadFont objectId = dictionaryById objectId >>= tryMappings
|
||||||
|
where
|
||||||
|
tryMappings dictionary =
|
||||||
|
loadCMap dictionary
|
||||||
|
<|> (getField "Encoding" dictionary >>= loadEncoding)
|
||||||
|
<|> (fail $ unknownFormat (show objectId) (show dictionary))
|
||||||
|
unknownFormat = printf "Unknown font format for object #%s : %s"
|
||||||
|
loadCMap :: Dictionary -> T Font
|
||||||
|
loadCMap dictionary =
|
||||||
|
getField "ToUnicode" dictionary >>= follow >>= openStream >>= cMap
|
||||||
|
loadEncoding :: DirectObject -> T Font
|
||||||
|
loadEncoding (NameObject (Name name)) = encoding name
|
||||||
|
loadEncoding directObject =
|
||||||
|
fail $ printf "Encoding must be a name, not that : %s" $ show directObject
|
||||||
|
|
||||||
|
loadFonts :: Dictionary -> T FontSet
|
||||||
|
loadFonts = foldM addFont Map.empty . Map.toList
|
||||||
|
where
|
||||||
|
addFont :: FontSet -> (Name, DirectObject) -> T FontSet
|
||||||
|
addFont output (name, Reference (IndirectObjCoordinates {objectId})) =
|
||||||
|
flip (Map.insert name) output <$> cache loadFont objectId
|
||||||
|
addFont output _ = return output
|
||||||
|
|
||||||
|
several :: DirectObject -> [DirectObject]
|
||||||
|
several (Array l) = l
|
||||||
|
several directObject = [directObject]
|
||||||
|
|
||||||
|
pagesList :: T [ObjectId]
|
||||||
|
pagesList = do
|
||||||
|
pages <- (origin :: T Dictionary) >>= (// ["Root", "Pages"]) >>= getDictionary
|
||||||
|
case Map.lookup (Name "Kids") pages of
|
||||||
|
Just (Array kids) -> return $ getReferences kids
|
||||||
|
_ -> return []
|
||||||
|
|
||||||
|
getReferences :: [DirectObject] -> [ObjectId]
|
||||||
|
getReferences objects = do
|
||||||
|
object <- objects
|
||||||
|
case object of
|
||||||
|
Reference (IndirectObjCoordinates {objectId}) -> [objectId]
|
||||||
|
_ -> []
|
||||||
|
|
||||||
|
extractText :: Dictionary -> T [Text]
|
||||||
|
extractText pageDict = do
|
||||||
|
fonts <- loadFonts =<< getFontDictionary pageDict
|
||||||
|
objects <- several <$> getField "Contents" pageDict
|
||||||
|
concat <$> mapM (loadContent fonts) objects
|
||||||
|
where
|
||||||
|
loadContent :: FontSet -> DirectObject -> T [Text]
|
||||||
|
loadContent fonts directObject =
|
||||||
|
follow directObject
|
||||||
|
>>= openStream
|
||||||
|
>>= (either fail return . Content.parse)
|
||||||
|
>>= renderText fonts
|
||||||
|
|
||||||
|
loadPage :: ObjectId -> T Page
|
||||||
|
loadPage source = do
|
||||||
|
contents <- extractText =<< dictionaryById source
|
||||||
|
return $ Page {contents, source}
|
||||||
|
|
||||||
|
getAll :: Content -> Either String (Map Int Page)
|
||||||
|
getAll content = runError $ fst <$> evalRWST getPages content Map.empty
|
||||||
|
where
|
||||||
|
numbered = Map.fromList . zip [1..]
|
||||||
|
getPages = numbered <$> (mapM loadPage =<< pagesList)
|
||||||
|
|
||||||
|
get :: Content -> Int -> Either String Page
|
||||||
|
get content pageNumber
|
||||||
|
| pageNumber < 1 = Left "Pages start at 1"
|
||||||
|
| otherwise = runError $ fst <$> evalRWST getPage content Map.empty
|
||||||
|
where
|
||||||
|
firstPage [] = fail "Page is out of bounds"
|
||||||
|
firstPage (p:_) = loadPage p
|
||||||
|
getPage = drop (pageNumber - 1) <$> pagesList >>= firstPage
|
|
@ -1,56 +1,70 @@
|
||||||
|
{-# LANGUAGE FlexibleInstances #-}
|
||||||
|
{-# LANGUAGE ConstraintKinds #-}
|
||||||
|
{-# LANGUAGE UndecidableInstances #-}
|
||||||
module PDF.Parser (
|
module PDF.Parser (
|
||||||
Parser
|
MonadParser(..)
|
||||||
, (<?>)
|
, Parser
|
||||||
, block
|
|
||||||
, char
|
|
||||||
, choice
|
|
||||||
, count
|
|
||||||
, decNumber
|
|
||||||
, hexNumber
|
|
||||||
, many
|
|
||||||
, octDigit
|
, octDigit
|
||||||
, on
|
, on
|
||||||
, oneOf
|
|
||||||
, option
|
|
||||||
, runParser
|
, runParser
|
||||||
, sepBy
|
, evalParser
|
||||||
, string
|
|
||||||
, takeAll
|
|
||||||
, takeAll1
|
|
||||||
) where
|
) where
|
||||||
|
|
||||||
import Control.Applicative ((<|>), empty)
|
import Control.Monad (MonadPlus)
|
||||||
|
import Control.Monad.Fail (MonadFail(..))
|
||||||
import Control.Monad.State (StateT(..), evalStateT)
|
import Control.Monad.State (StateT(..), evalStateT)
|
||||||
import Control.Monad.Trans (lift)
|
import Control.Monad.Trans (MonadTrans(..))
|
||||||
import qualified Data.Attoparsec.ByteString.Char8 as Atto (
|
import qualified Data.Attoparsec.ByteString.Char8 as Atto (
|
||||||
Parser, char, parseOnly, satisfy, string, take, takeWhile, takeWhile1
|
Parser, char, endOfInput, parseOnly, peekChar', satisfy, string, take
|
||||||
|
, takeWhile, takeWhile1
|
||||||
)
|
)
|
||||||
import Data.ByteString (ByteString)
|
import Data.ByteString (ByteString)
|
||||||
|
import Data.ByteString.Char8.Util (B16Int(..))
|
||||||
import Data.Char (toLower)
|
import Data.Char (toLower)
|
||||||
import Data.Set (Set)
|
import Data.Set (Set)
|
||||||
import qualified Data.Set as Set (fromList, member, unions)
|
import qualified Data.Set as Set (fromList, member, unions)
|
||||||
|
import Prelude hiding (fail)
|
||||||
|
|
||||||
|
type MonadDeps m = (MonadFail m, MonadPlus m)
|
||||||
|
|
||||||
|
class MonadDeps m => MonadParser m where
|
||||||
|
block :: Int -> m ByteString
|
||||||
|
char :: Char -> m Char
|
||||||
|
decNumber :: m ByteString
|
||||||
|
endOfInput :: m ()
|
||||||
|
hexNumber :: m B16Int
|
||||||
|
oneOf :: String -> m Char
|
||||||
|
peek :: m Char
|
||||||
|
string :: ByteString -> m ByteString
|
||||||
|
takeAll :: (Char -> Bool) -> m ByteString
|
||||||
|
takeAll1 :: (Char -> Bool) -> m ByteString
|
||||||
|
|
||||||
|
instance MonadParser Atto.Parser where
|
||||||
|
block = Atto.take
|
||||||
|
char = Atto.char
|
||||||
|
endOfInput = Atto.endOfInput
|
||||||
|
decNumber = Atto.takeWhile1 (`Set.member` digits)
|
||||||
|
hexNumber = B16Int <$> Atto.takeWhile1 (`Set.member` hexDigits)
|
||||||
|
oneOf charSet = Atto.satisfy (`elem` charSet)
|
||||||
|
peek = Atto.peekChar'
|
||||||
|
string s = Atto.string s
|
||||||
|
takeAll = Atto.takeWhile
|
||||||
|
takeAll1 = Atto.takeWhile1
|
||||||
|
|
||||||
|
instance (MonadParser m, MonadTrans t, MonadDeps (t m)) => MonadParser (t m) where
|
||||||
|
block = lift . block
|
||||||
|
char = lift . char
|
||||||
|
endOfInput = lift $ endOfInput
|
||||||
|
decNumber = lift $ decNumber
|
||||||
|
hexNumber = lift $ hexNumber
|
||||||
|
oneOf = lift . oneOf
|
||||||
|
peek = lift $ peek
|
||||||
|
string = lift . string
|
||||||
|
takeAll = lift . takeAll
|
||||||
|
takeAll1 = lift . takeAll1
|
||||||
|
|
||||||
type Parser s = StateT s Atto.Parser
|
type Parser s = StateT s Atto.Parser
|
||||||
|
|
||||||
(<?>) :: Parser s a -> String -> Parser s a
|
|
||||||
(<?>) parser debugMessage = parser <|> fail debugMessage
|
|
||||||
|
|
||||||
block :: Int -> Parser s ByteString
|
|
||||||
block = lift . Atto.take
|
|
||||||
|
|
||||||
char :: Char -> Parser s Char
|
|
||||||
char = lift . Atto.char
|
|
||||||
|
|
||||||
choice :: [Parser s a] -> Parser s a
|
|
||||||
choice = foldr (<|>) empty
|
|
||||||
|
|
||||||
count :: Int -> Parser s a -> Parser s [a]
|
|
||||||
count 0 _ = return []
|
|
||||||
count n p = (:) <$> p <*> count (n-1) p
|
|
||||||
|
|
||||||
decNumber :: Parser s ByteString
|
|
||||||
decNumber = lift $ Atto.takeWhile1 (`Set.member` digits)
|
|
||||||
|
|
||||||
digits :: Set Char
|
digits :: Set Char
|
||||||
digits = Set.fromList ['0'..'9']
|
digits = Set.fromList ['0'..'9']
|
||||||
|
|
||||||
|
@ -59,13 +73,7 @@ hexDigits = Set.unions [digits, Set.fromList af, Set.fromList $ toLower <$> af]
|
||||||
where
|
where
|
||||||
af = ['A'..'F']
|
af = ['A'..'F']
|
||||||
|
|
||||||
hexNumber :: Parser s ByteString
|
octDigit :: MonadParser m => m Char
|
||||||
hexNumber = lift $ Atto.takeWhile1 (`Set.member` hexDigits)
|
|
||||||
|
|
||||||
many :: Parser s a -> Parser s [a]
|
|
||||||
many parser = (:) <$> parser <*> many parser <|> return []
|
|
||||||
|
|
||||||
octDigit :: Parser s Char
|
|
||||||
octDigit = oneOf ['0'..'7']
|
octDigit = oneOf ['0'..'7']
|
||||||
|
|
||||||
on :: Parser s a -> ByteString -> Parser s (Either String a)
|
on :: Parser s a -> ByteString -> Parser s (Either String a)
|
||||||
|
@ -74,25 +82,8 @@ on (StateT parserF) input = StateT $ \state ->
|
||||||
Left errorMsg -> return (Left errorMsg, state)
|
Left errorMsg -> return (Left errorMsg, state)
|
||||||
Right (result, newState) -> return (Right result, newState)
|
Right (result, newState) -> return (Right result, newState)
|
||||||
|
|
||||||
oneOf :: String -> Parser s Char
|
runParser :: Parser s a -> s -> ByteString -> Either String (a, s)
|
||||||
oneOf charSet = lift $ Atto.satisfy (`elem` charSet)
|
runParser parser initState = Atto.parseOnly (runStateT parser initState)
|
||||||
|
|
||||||
option :: a -> Parser s a -> Parser s a
|
evalParser :: Parser s a -> s -> ByteString -> Either String a
|
||||||
option defaultValue p = p <|> pure defaultValue
|
evalParser parser initState = Atto.parseOnly (evalStateT parser initState)
|
||||||
|
|
||||||
runParser :: Parser s a -> s -> ByteString -> Either String a
|
|
||||||
runParser parser initState =
|
|
||||||
Atto.parseOnly (evalStateT parser initState)
|
|
||||||
|
|
||||||
sepBy :: Parser s a -> Parser s b -> Parser s [a]
|
|
||||||
sepBy parser separator =
|
|
||||||
option [] $ (:) <$> parser <*> many (separator *> parser)
|
|
||||||
|
|
||||||
string :: ByteString -> Parser s ByteString
|
|
||||||
string = lift . Atto.string
|
|
||||||
|
|
||||||
takeAll :: (Char -> Bool) -> Parser s ByteString
|
|
||||||
takeAll = lift . Atto.takeWhile
|
|
||||||
|
|
||||||
takeAll1 :: (Char -> Bool) -> Parser s ByteString
|
|
||||||
takeAll1 = lift . Atto.takeWhile1
|
|
||||||
|
|
319
src/PDF/Text.hs
Normal file
319
src/PDF/Text.hs
Normal file
|
@ -0,0 +1,319 @@
|
||||||
|
{-# LANGUAGE NamedFieldPuns #-}
|
||||||
|
{-# LANGUAGE OverloadedStrings #-}
|
||||||
|
{-# LANGUAGE ConstraintKinds #-}
|
||||||
|
{-# LANGUAGE FlexibleContexts #-}
|
||||||
|
module PDF.Text {-(
|
||||||
|
pageContents
|
||||||
|
)-} where
|
||||||
|
|
||||||
|
import Control.Applicative ((<|>))
|
||||||
|
import Control.Monad (foldM)
|
||||||
|
import Control.Monad.Fail (MonadFail)
|
||||||
|
import Control.Monad.Reader (ReaderT, runReaderT, asks)
|
||||||
|
import Control.Monad.State (MonadState, evalStateT, get, modify, put)
|
||||||
|
import Data.Attoparsec.ByteString.Char8 (choice, sepBy)
|
||||||
|
import Data.ByteString (ByteString)
|
||||||
|
import Data.ByteString.Char8 (pack, unpack)
|
||||||
|
import Data.Char (toLower)
|
||||||
|
import Data.Map ((!), (!?), Map)
|
||||||
|
import qualified Data.Map as Map (fromList)
|
||||||
|
import Data.Text (Text)
|
||||||
|
import PDF.Font (Font, FontSet, emptyFont)
|
||||||
|
import PDF.Object (
|
||||||
|
DirectObject(..), StringObject(..)
|
||||||
|
, array, blank, name, regular, stringObject, toByteString
|
||||||
|
)
|
||||||
|
import PDF.Parser (MonadParser(..), (<?>), Parser, evalParser)
|
||||||
|
|
||||||
|
data StateOperator =
|
||||||
|
C_m | W_ | J | J_ | M | D_ | R_i | I_ | G_s -- general graphic state
|
||||||
|
deriving (Bounded, Enum, Show)
|
||||||
|
data PathOperator =
|
||||||
|
M_ | L_ | C_ | V_ | Y_ | H_ | R_e -- path construction
|
||||||
|
| S | S_ | F_ | F | Fstar | B | Bstar | B_ | B_star | N_ -- path painting
|
||||||
|
| W | Wstar -- clipping path
|
||||||
|
deriving (Bounded, Enum, Show)
|
||||||
|
data ColorOperator =
|
||||||
|
CS | C_s | SC | SCN | S_c | S_cn | G | G_ | RG | R_g | K | K_
|
||||||
|
deriving (Bounded, Enum, Show)
|
||||||
|
data TextOperator =
|
||||||
|
Td | TD | Tm | Tstar -- text positioning
|
||||||
|
| TJ | Tj | Quote | DQuote -- text showing
|
||||||
|
| Tc | Tw | Tz | TL | Tf | Tr | Ts -- text state
|
||||||
|
deriving (Bounded, Enum, Show)
|
||||||
|
data Argument = Raw ByteString | Typed DirectObject deriving Show
|
||||||
|
type Call a = (a, [Argument])
|
||||||
|
|
||||||
|
type Operator a = (Bounded a, Enum a, Show a)
|
||||||
|
|
||||||
|
code :: Operator a => a -> ByteString
|
||||||
|
code = pack . expand . show
|
||||||
|
where
|
||||||
|
expand "" = ""
|
||||||
|
expand (c:'_':s) = toLower c : expand s
|
||||||
|
expand ('s':'t':'a':'r':s) = '*' : expand s
|
||||||
|
expand ('Q':'u':'o':'t':'e':s) = '\'' : expand s
|
||||||
|
expand ('D':'Q':'u':'o':'t':'e':s) = '"' : expand s
|
||||||
|
expand (c:s) = c : expand s
|
||||||
|
|
||||||
|
{-
|
||||||
|
instance Show StateOperator where
|
||||||
|
show Cm = "cm"
|
||||||
|
show W_ = "w"
|
||||||
|
show J = "J"
|
||||||
|
show J_ = "j"
|
||||||
|
show M = "M"
|
||||||
|
show D = "d"
|
||||||
|
show Ri = "ri"
|
||||||
|
show I = "i"
|
||||||
|
show Gs = "gs"
|
||||||
|
|
||||||
|
instance Show PathOperator where
|
||||||
|
show M_ = "m"
|
||||||
|
show L_ = "l"
|
||||||
|
show C_ = "c"
|
||||||
|
show V_ = "v"
|
||||||
|
show Y_ = "y"
|
||||||
|
show H_
|
||||||
|
("m", (M_, \l -> case l of [Raw _, Raw _] -> True ; _ -> False))
|
||||||
|
, ("l", (L_, \l -> case l of [Raw _, Raw _] -> True ; _ -> False))
|
||||||
|
, ("c", (L_, \l -> case l of [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False))
|
||||||
|
, ("v", (L_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False))
|
||||||
|
, ("y", (L_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False))
|
||||||
|
, ("h", (L_, \l -> case l of [] -> True ; _ -> False))
|
||||||
|
, ("re", (L_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False))
|
||||||
|
, ("S", (L_, \l -> case l of [] -> True ; _ -> False))
|
||||||
|
, ("s", (L_, \l -> case l of [] -> True ; _ -> False))
|
||||||
|
, ("f", (L_, \l -> case l of [] -> True ; _ -> False))
|
||||||
|
, ("F", (L_, \l -> case l of [] -> True ; _ -> False))
|
||||||
|
, ("F*", (L_, \l -> case l of [] -> True ; _ -> False))
|
||||||
|
, ("B", (L_, \l -> case l of [] -> True ; _ -> False))
|
||||||
|
, ("B*", (L_, \l -> case l of [] -> True ; _ -> False))
|
||||||
|
, ("b", (L_, \l -> case l of [] -> True ; _ -> False))
|
||||||
|
, ("b*", (L_, \l -> case l of [] -> True ; _ -> False))
|
||||||
|
, ("n", (L_, \l -> case l of [] -> True ; _ -> False))
|
||||||
|
, ("W", (L_, \l -> case l of [] -> True ; _ -> False))
|
||||||
|
, ("W*", (L_, \l -> case l of [] -> True ; _ -> False))
|
||||||
|
|
||||||
|
instance Show ColorOperator where
|
||||||
|
|
||||||
|
instance Show TextOperator where
|
||||||
|
show Td = "Td"
|
||||||
|
show TD = "TD"
|
||||||
|
show Tm = "Tm"
|
||||||
|
show Tstar = "T*"
|
||||||
|
show TJ = "TJ"
|
||||||
|
show Tj = "Tj"
|
||||||
|
show Quote = "'"
|
||||||
|
show DQuote = "\""
|
||||||
|
show Tc = "Tc"
|
||||||
|
show Tw = "Tw"
|
||||||
|
show Tz = "Tz"
|
||||||
|
show TL = "TL"
|
||||||
|
show Tf = "Tf"
|
||||||
|
show Tr = "Tr"
|
||||||
|
show Ts = "Ts"
|
||||||
|
-}
|
||||||
|
|
||||||
|
stateOperator :: OperatorTable StateOperator
|
||||||
|
stateOperator = Map.fromList $ (\(op, checker) -> (code op, (op, checker))) <$> [
|
||||||
|
(C_m, \l -> case l of [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
|
||||||
|
, (W_, \l -> case l of [Raw _] -> True ; _ -> False)
|
||||||
|
, (J, \l -> case l of [Raw _] -> True ; _ -> False)
|
||||||
|
, (J_, \l -> case l of [Raw _] -> True ; _ -> False)
|
||||||
|
, (M, \l -> case l of [Raw _] -> True ; _ -> False)
|
||||||
|
, (D_, \l -> case l of [Raw _, Raw _] -> True ; _ -> False)
|
||||||
|
, (R_i, \l -> case l of [Raw _] -> True ; _ -> False)
|
||||||
|
, (I_, \l -> case l of [Raw _] -> True ; _ -> False)
|
||||||
|
, (G_s, \l -> case l of [Typed (NameObject _)] -> True ; _ -> False)
|
||||||
|
]
|
||||||
|
|
||||||
|
pathOperator :: OperatorTable PathOperator
|
||||||
|
pathOperator = Map.fromList $ (\(op, checker) -> (code op, (op, checker))) <$> [
|
||||||
|
(M_, \l -> case l of [Raw _, Raw _] -> True ; _ -> False)
|
||||||
|
, (L_, \l -> case l of [Raw _, Raw _] -> True ; _ -> False)
|
||||||
|
, (C_, \l -> case l of [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
|
||||||
|
, (V_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
|
||||||
|
, (Y_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
|
||||||
|
, (H_, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (R_e, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
|
||||||
|
, (S, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (S_, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (F_, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (F, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (Fstar, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (B, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (Bstar, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (B_, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (B_star, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (N_, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (W, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (Wstar, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
]
|
||||||
|
|
||||||
|
colorOperator :: OperatorTable ColorOperator
|
||||||
|
colorOperator = Map.fromList $ (\(op, checker) -> (code op, (op, checker))) <$> [
|
||||||
|
(CS, \l -> case l of [Typed (NameObject _)] -> True ; _ -> False)
|
||||||
|
, (C_s, \l -> case l of [Typed (NameObject _)] -> True ; _ -> False)
|
||||||
|
, (SC, \_ -> True)
|
||||||
|
, (SCN, \_ -> True)
|
||||||
|
, (S_c, \_ -> True)
|
||||||
|
, (S_cn, \_ -> True)
|
||||||
|
, (G, \l -> case l of [Raw _] -> True ; _ -> False)
|
||||||
|
, (G_, \l -> case l of [Raw _] -> True ; _ -> False)
|
||||||
|
, (RG, \l -> case l of [Raw _, Raw _, Raw _] -> True ; _ -> False)
|
||||||
|
, (R_g, \l -> case l of [Raw _, Raw _, Raw _] -> True ; _ -> False)
|
||||||
|
, (K, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
|
||||||
|
, (K_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
|
||||||
|
]
|
||||||
|
|
||||||
|
{-
|
||||||
|
stateOperator (Cm, [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _]) = True
|
||||||
|
stateOperator (W, [Raw _]) = True
|
||||||
|
stateOperator (J, [Raw _]) = True
|
||||||
|
stateOperator (J_, [Raw _]) = True
|
||||||
|
stateOperator (M, [Raw _]) = True
|
||||||
|
stateOperator (D, [Raw _, Raw _]) = True
|
||||||
|
stateOperator (Ri, [Raw _]) = True
|
||||||
|
stateOperator (I, [Raw _]) = True
|
||||||
|
stateOperator (Gs, [Typed (NameObject _)]) = True
|
||||||
|
stateOperator _ = False
|
||||||
|
-}
|
||||||
|
|
||||||
|
textOperator :: OperatorTable TextOperator
|
||||||
|
textOperator = Map.fromList $ (\(op, checker) -> (code op, (op, checker))) <$> [
|
||||||
|
(Td, \l -> case l of [Raw _, Raw _] -> True ; _ -> False)
|
||||||
|
, (TD, \l -> case l of [Raw _, Raw _] -> True ; _ -> False)
|
||||||
|
, (Tm, \l -> case l of [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
|
||||||
|
, (Tstar, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (TJ, \l -> case l of [Typed (Array _)] -> True ; _ -> False)
|
||||||
|
, (Tj, \l -> case l of [Typed (StringObject _)] -> True ; _ -> False)
|
||||||
|
, (Quote, \l -> case l of [Typed (StringObject _)] -> True ; _ -> False)
|
||||||
|
, (DQuote, \l -> case l of [Typed (StringObject _)] -> True ; _ -> False)
|
||||||
|
, (Tc, \l -> case l of [Raw _] -> True ; _ -> False)
|
||||||
|
, (Tw, \l -> case l of [Raw _] -> True ; _ -> False)
|
||||||
|
, (Tz, \l -> case l of [Raw _] -> True ; _ -> False)
|
||||||
|
, (TL, \l -> case l of [Raw _] -> True ; _ -> False)
|
||||||
|
, (Tf, \l -> case l of [Typed (NameObject _), Raw _] -> True ; _ -> False)
|
||||||
|
, (Tr, \l -> case l of [Raw _] -> True ; _ -> False)
|
||||||
|
, (Ts, \l -> case l of [Raw _] -> True ; _ -> False)
|
||||||
|
]
|
||||||
|
|
||||||
|
{-
|
||||||
|
textOperator (Td, [Raw _, Raw _]) = True
|
||||||
|
textOperator (TD, [Raw _, Raw _]) = True
|
||||||
|
textOperator (Tm, [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _]) = True
|
||||||
|
textOperator (Tstar, []) = True
|
||||||
|
textOperator (TJ, [Typed (Array _)]) = True
|
||||||
|
textOperator (Tj, [Typed (StringObject _)]) = True
|
||||||
|
textOperator (Quote, [Typed (StringObject _)]) = True
|
||||||
|
textOperator (DQuote, [Typed (StringObject _)]) = True
|
||||||
|
textOperator (Tc, [Raw _]) = True
|
||||||
|
textOperator (Tw, [Raw _]) = True
|
||||||
|
textOperator (Tz, [Raw _]) = True
|
||||||
|
textOperator (TL, [Raw _]) = True
|
||||||
|
textOperator (Tf, [Typed (NameObject _), Raw _]) = True
|
||||||
|
textOperator (Tr, [Raw _]) = True
|
||||||
|
textOperator (Ts, [Raw _]) = True
|
||||||
|
textOperator _ = False
|
||||||
|
-}
|
||||||
|
|
||||||
|
type ArgumentStackParser m = (MonadState [Argument] m, MonadParser m)
|
||||||
|
--type Operator a = (Bounded a, Enum a, Show a)
|
||||||
|
type OperatorTable a = Map ByteString (TypeChecker a)
|
||||||
|
type TypeChecker a = (a, [Argument] -> Bool)
|
||||||
|
|
||||||
|
parseShowable :: (Show a, MonadParser m) => a -> m a
|
||||||
|
parseShowable textOp = string (pack $ show textOp) *> return textOp
|
||||||
|
|
||||||
|
callChunk :: MonadParser m => OperatorTable a -> m (Either (TypeChecker a) Argument)
|
||||||
|
callChunk table =
|
||||||
|
(Right <$> choice [stringArg, nameArg, arrayArg])
|
||||||
|
<|> operatorOrRawArg
|
||||||
|
<?> "call chunk"
|
||||||
|
where
|
||||||
|
operatorOrRawArg = do
|
||||||
|
chunk <- takeAll1 regular <* blank
|
||||||
|
case table !? chunk of
|
||||||
|
Nothing -> return . Right $ Raw chunk
|
||||||
|
Just typeChecker -> return $ Left typeChecker
|
||||||
|
|
||||||
|
stackParser :: (ArgumentStackParser m, Operator a) => OperatorTable a -> m (Call a)
|
||||||
|
stackParser table = either popCall push =<< (callChunk table)
|
||||||
|
where
|
||||||
|
push arg = modify (arg:) >> stackParser table
|
||||||
|
popCall (operator, predicate) = do
|
||||||
|
arguments <- reverse <$> get
|
||||||
|
let call = (operator, arguments)
|
||||||
|
if predicate arguments then return call else fail (unpack $ code operator)
|
||||||
|
|
||||||
|
a :: (Operator a, MonadParser m) => OperatorTable a -> m (Call a)
|
||||||
|
a table = evalStateT (stackParser table) []
|
||||||
|
|
||||||
|
argument :: MonadParser m => m Argument
|
||||||
|
argument = Raw <$> takeAll1 regular <* blank
|
||||||
|
|
||||||
|
arrayArg :: MonadParser m => m Argument
|
||||||
|
arrayArg = Typed . Array <$> array <* blank
|
||||||
|
|
||||||
|
nameArg :: MonadParser m => m Argument
|
||||||
|
nameArg = Typed . NameObject <$> name <* blank
|
||||||
|
|
||||||
|
stringArg :: MonadParser m => m Argument
|
||||||
|
stringArg = Typed . StringObject <$> stringObject <* blank
|
||||||
|
|
||||||
|
type ParserWithFont = ReaderT FontSet (Parser Font)
|
||||||
|
|
||||||
|
pageContents :: FontSet -> ByteString -> Either String [Text]
|
||||||
|
pageContents fontSet input =
|
||||||
|
evalParser (runReaderT (page) fontSet) emptyFont input
|
||||||
|
|
||||||
|
several :: MonadParser m => m [a] -> m [a]
|
||||||
|
several p = concat <$> (p `sepBy` blank)
|
||||||
|
|
||||||
|
page :: ParserWithFont [Text]
|
||||||
|
page = several (graphicState <|> text) <* blank <* endOfInput <?> "Text page contents"
|
||||||
|
|
||||||
|
graphicState :: ParserWithFont [Text]
|
||||||
|
graphicState =
|
||||||
|
string "q" *> blank *> insideQ <* blank <* string "Q" <?> "Graphic state"
|
||||||
|
where
|
||||||
|
insideQ = several (command <|> graphicState <|> text)
|
||||||
|
ignore x = a x *> return []
|
||||||
|
command =
|
||||||
|
ignore stateOperator <|> ignore pathOperator <|> ignore colorOperator
|
||||||
|
|
||||||
|
text :: ParserWithFont [Text]
|
||||||
|
text =
|
||||||
|
string "BT" *> blank *> commands <* blank <* string "ET" <?> "Text operators"
|
||||||
|
where
|
||||||
|
commands = several (a textOperator >>= runOperator)
|
||||||
|
|
||||||
|
runOperator :: Call TextOperator -> ParserWithFont [Text]
|
||||||
|
runOperator (Tf, [Typed (NameObject fontName), _]) =
|
||||||
|
asks (! fontName) >>= put >> return []
|
||||||
|
|
||||||
|
runOperator (Tstar, []) = return ["\n"]
|
||||||
|
|
||||||
|
runOperator (TJ, [Typed (Array arrayObject)]) =
|
||||||
|
replicate 1 <$> foldM appendText "" arrayObject
|
||||||
|
where
|
||||||
|
appendText bs (StringObject outputString) =
|
||||||
|
mappend bs <$> decodeString outputString
|
||||||
|
appendText bs _ = return bs
|
||||||
|
|
||||||
|
runOperator (Tj, [Typed (StringObject outputString)]) =
|
||||||
|
replicate 1 <$> decodeString outputString
|
||||||
|
|
||||||
|
runOperator (Quote, [Typed (StringObject outputString)]) =
|
||||||
|
(\bs -> ["\n", bs]) <$> decodeString outputString
|
||||||
|
|
||||||
|
runOperator (DQuote, [Typed (StringObject outputString)]) =
|
||||||
|
(\bs -> ["\n", bs]) <$> decodeString outputString
|
||||||
|
|
||||||
|
runOperator _ = return []
|
||||||
|
|
||||||
|
decodeString :: (MonadFail m, MonadState Font m) => StringObject -> m Text
|
||||||
|
decodeString input = do
|
||||||
|
font <- get
|
||||||
|
either fail return . font $ toByteString input
|
Loading…
Reference in a new issue