Add support for fonts and implement MacRomanEncoding

This commit is contained in:
Tissevert 2020-02-08 08:15:32 +01:00
parent c48ab22808
commit 325250383a
11 changed files with 396 additions and 116 deletions

View File

@ -26,7 +26,10 @@ library
, PDF.Text , PDF.Text
, PDF.Update , PDF.Update
other-modules: Data.ByteString.Char8.Util other-modules: Data.ByteString.Char8.Util
, PDF.Encoding
, PDF.Encoding.MacRoman
, PDF.Body , PDF.Body
, PDF.Font
-- other-extensions: -- other-extensions:
build-depends: attoparsec build-depends: attoparsec
, base >=4.9 && <4.13 , base >=4.9 && <4.13
@ -65,6 +68,7 @@ executable getText
, containers , containers
, Hufflepdf , Hufflepdf
, mtl , mtl
, text
, zlib , zlib
ghc-options: -Wall ghc-options: -Wall
default-language: Haskell2010 default-language: Haskell2010

View File

@ -1,5 +1,6 @@
import qualified Data.ByteString.Char8 as BS (putStrLn, readFile) import qualified Data.ByteString.Char8 as BS (readFile)
import qualified Data.Map as Map (toList) import qualified Data.Map as Map (toList)
import qualified Data.Text.IO as Text (putStrLn)
import PDF (Document(..), parseDocument) import PDF (Document(..), parseDocument)
import PDF.Object (Content) import PDF.Object (Content)
import PDF.Pages (Page(..), get, getAll) import PDF.Pages (Page(..), get, getAll)
@ -16,7 +17,7 @@ onDoc inputFile f = do
Right value -> return value Right value -> return value
displayPage :: Page -> IO () displayPage :: Page -> IO ()
displayPage = mapM_ BS.putStrLn . contents displayPage = mapM_ Text.putStrLn . contents
wholeDoc :: FilePath -> IO () wholeDoc :: FilePath -> IO ()
wholeDoc inputFile = do wholeDoc inputFile = do

View File

@ -18,7 +18,8 @@ import qualified Data.ByteString as BS (empty, foldl, length, pack, singleton, s
import qualified Data.ByteString.Char8 as Char8 ( import qualified Data.ByteString.Char8 as Char8 (
cons, drop, index, splitAt, take, uncons, unpack cons, drop, index, splitAt, take, uncons, unpack
) )
import Data.Text.Encoding (encodeUtf8, decodeUtf16BE) import Data.Text (Text)
import Data.Text.Encoding (decodeUtf16BE)
import Prelude hiding (length) import Prelude hiding (length)
import Text.Printf (printf) import Text.Printf (printf)
@ -86,5 +87,5 @@ unescape escapedBS =
| otherwise -> Char8.cons c (unescape s') | otherwise -> Char8.cons c (unescape s')
fromOctal (code, s) = Char8.cons (toEnum $ b8ToInt (B8Int code)) (unescape s) fromOctal (code, s) = Char8.cons (toEnum $ b8ToInt (B8Int code)) (unescape s)
utf16BEToutf8 :: ByteString -> ByteString utf16BEToutf8 :: ByteString -> Text
utf16BEToutf8 = encodeUtf8 . decodeUtf16BE utf16BEToutf8 = decodeUtf16BE

View File

@ -1,4 +1,7 @@
{-# LANGUAGE NamedFieldPuns #-} {-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE TypeSynonymInstances #-}
module PDF.CMap ( module PDF.CMap (
CMap CMap
, CMappers , CMappers
@ -12,13 +15,18 @@ import Control.Applicative ((<|>), many)
import Control.Monad.State (modify) import Control.Monad.State (modify)
import Data.Attoparsec.ByteString.Char8 (count) import Data.Attoparsec.ByteString.Char8 (count)
import Data.ByteString (ByteString) import Data.ByteString (ByteString)
import qualified Data.ByteString as BS (length) import qualified Data.ByteString as BS (drop, length, null, take)
import Data.ByteString.Char8 (unpack)
import Data.ByteString.Char8.Util ( import Data.ByteString.Char8.Util (
B16Int(..), b16ToBytes, b16ToInt, toBytes, utf16BEToutf8 B16Int(..), b16ToBytes, b16ToInt, toBytes, utf16BEToutf8
) )
import Data.Map (Map, union) import Data.Map (Map, union)
import qualified Data.Map as Map (adjust, empty, fromList, insertWith) import qualified Data.Map as Map (
adjust, empty, fromList, insertWith, lookup, toList
)
import Data.Text (Text)
import qualified PDF.EOL as EOL (charset, parser) import qualified PDF.EOL as EOL (charset, parser)
import PDF.Font (Font)
import PDF.Object ( import PDF.Object (
DirectObject(..), Name, StringObject(..) DirectObject(..), Name, StringObject(..)
, blank, directObject, integer, line, stringObject , blank, directObject, integer, line, stringObject
@ -26,7 +34,7 @@ import PDF.Object (
import PDF.Parser (MonadParser, Parser, runParser, takeAll) import PDF.Parser (MonadParser, Parser, runParser, takeAll)
type CMappers = Map Name CMap type CMappers = Map Name CMap
type Mapping = Map ByteString ByteString type Mapping = Map ByteString Text
data CRange = CRange { data CRange = CRange {
fromSequence :: ByteString fromSequence :: ByteString
, toSequence :: ByteString , toSequence :: ByteString
@ -35,6 +43,26 @@ data CRange = CRange {
type RangeSize = Int type RangeSize = Int
type CMap = Map RangeSize [CRange] type CMap = Map RangeSize [CRange]
toFont :: CMap -> Font
toFont aCMap input
| BS.null input = Right ""
| otherwise = do
(output, remainingInput) <- trySizes input $ Map.toList aCMap
mappend output <$> toFont aCMap remainingInput
where
trySizes s [] = Left $ "No matching code found in font for " ++ unpack s
trySizes s ((size, cRanges):others) =
let prefix = BS.take size s in
case tryRanges prefix cRanges of
Nothing -> trySizes s others
Just outputSequence -> Right (outputSequence, BS.drop size s)
tryRanges :: ByteString -> [CRange] -> Maybe Text
tryRanges _ [] = Nothing
tryRanges prefix ((CRange {mapping}):cRanges) =
case Map.lookup prefix mapping of
Nothing -> tryRanges prefix cRanges
outputSequence -> outputSequence
emptyCMap :: CMap emptyCMap :: CMap
emptyCMap = Map.empty emptyCMap = Map.empty
@ -42,8 +70,8 @@ matches :: ByteString -> CRange -> Bool
matches code (CRange {fromSequence, toSequence}) = matches code (CRange {fromSequence, toSequence}) =
fromSequence <= code && code <= toSequence fromSequence <= code && code <= toSequence
cMap :: ByteString -> Either String CMap cMap :: ByteString -> Either String Font
cMap = fmap snd <$> runParser cMap = fmap (toFont . snd) <$> runParser
(many (codeRanges <|> cMapRange <|> cMapChar <|> ignoredLine)) (many (codeRanges <|> cMapRange <|> cMapChar <|> ignoredLine))
emptyCMap emptyCMap
where where
@ -81,7 +109,7 @@ cMapRange = do
<*> directObject <* EOL.parser <*> directObject <* EOL.parser
>>= mapFromTo >>= mapFromTo
saveMapping :: [(ByteString, ByteString)] -> Parser CMap () saveMapping :: [(ByteString, Text)] -> Parser CMap ()
saveMapping [] = return () saveMapping [] = return ()
saveMapping assoc@((code, _):_) = modify $ Map.adjust insertCRange mappingSize saveMapping assoc@((code, _):_) = modify $ Map.adjust insertCRange mappingSize
where where
@ -112,7 +140,7 @@ startFrom from@(B16Int s) =
let size = BS.length s `div` 2 in let size = BS.length s `div` 2 in
toBytes size <$> [b16ToInt from .. ] toBytes size <$> [b16ToInt from .. ]
mapFromTo :: MonadParser m => (StringObject, StringObject, DirectObject) -> m [(ByteString, ByteString)] mapFromTo :: MonadParser m => (StringObject, StringObject, DirectObject) -> m [(ByteString, Text)]
mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) = mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) =
return $ zip (between from to) (utf16BEToutf8 <$> startFrom dstFrom) return $ zip (between from to) (utf16BEToutf8 <$> startFrom dstFrom)
@ -125,7 +153,7 @@ mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) =
mapFromTo _ = fail "invalid range mapping found" mapFromTo _ = fail "invalid range mapping found"
pairMapping :: MonadParser m => (StringObject, StringObject) -> m (ByteString, ByteString) pairMapping :: MonadParser m => (StringObject, StringObject) -> m (ByteString, Text)
pairMapping (Hexadecimal from, Hexadecimal to) = pairMapping (Hexadecimal from, Hexadecimal to) =
return (b16ToBytes from, utf16BEToutf8 $ b16ToBytes to) return (b16ToBytes from, utf16BEToutf8 $ b16ToBytes to)
pairMapping _ = fail "invalid pair mapping found" pairMapping _ = fail "invalid pair mapping found"

10
src/PDF/Encoding.hs Normal file
View File

@ -0,0 +1,10 @@
module PDF.Encoding (
encoding
) where
import PDF.Encoding.MacRoman (macRomanEncoding)
import PDF.Font (Font)
encoding :: String -> Either String Font
encoding "MacRomanEncoding" = Right macRomanEncoding
encoding s = Left $ "Unknown encoding " ++ s

View File

@ -0,0 +1,141 @@
module PDF.Encoding.MacRoman (
macRomanEncoding
) where
import Data.ByteString.Char8 (unpack)
import Data.Text (pack)
import PDF.Font (Font)
macRomanEncoding :: Font
macRomanEncoding = Right . pack . fmap decode . unpack
decode :: Char -> Char
decode '\x80' = '\x00C4' -- LATIN CAPITAL LETTER A WITH DIAERESIS
decode '\x81' = '\x00C5' -- LATIN CAPITAL LETTER A WITH RING ABOVE
decode '\x82' = '\x00C7' -- LATIN CAPITAL LETTER C WITH CEDILLA
decode '\x83' = '\x00C9' -- LATIN CAPITAL LETTER E WITH ACUTE
decode '\x84' = '\x00D1' -- LATIN CAPITAL LETTER N WITH TILDE
decode '\x85' = '\x00D6' -- LATIN CAPITAL LETTER O WITH DIAERESIS
decode '\x86' = '\x00DC' -- LATIN CAPITAL LETTER U WITH DIAERESIS
decode '\x87' = '\x00E1' -- LATIN SMALL LETTER A WITH ACUTE
decode '\x88' = '\x00E0' -- LATIN SMALL LETTER A WITH GRAVE
decode '\x89' = '\x00E2' -- LATIN SMALL LETTER A WITH CIRCUMFLEX
decode '\x8A' = '\x00E4' -- LATIN SMALL LETTER A WITH DIAERESIS
decode '\x8B' = '\x00E3' -- LATIN SMALL LETTER A WITH TILDE
decode '\x8C' = '\x00E5' -- LATIN SMALL LETTER A WITH RING ABOVE
decode '\x8D' = '\x00E7' -- LATIN SMALL LETTER C WITH CEDILLA
decode '\x8E' = '\x00E9' -- LATIN SMALL LETTER E WITH ACUTE
decode '\x8F' = '\x00E8' -- LATIN SMALL LETTER E WITH GRAVE
decode '\x90' = '\x00EA' -- LATIN SMALL LETTER E WITH CIRCUMFLEX
decode '\x91' = '\x00EB' -- LATIN SMALL LETTER E WITH DIAERESIS
decode '\x92' = '\x00ED' -- LATIN SMALL LETTER I WITH ACUTE
decode '\x93' = '\x00EC' -- LATIN SMALL LETTER I WITH GRAVE
decode '\x94' = '\x00EE' -- LATIN SMALL LETTER I WITH CIRCUMFLEX
decode '\x95' = '\x00EF' -- LATIN SMALL LETTER I WITH DIAERESIS
decode '\x96' = '\x00F1' -- LATIN SMALL LETTER N WITH TILDE
decode '\x97' = '\x00F3' -- LATIN SMALL LETTER O WITH ACUTE
decode '\x98' = '\x00F2' -- LATIN SMALL LETTER O WITH GRAVE
decode '\x99' = '\x00F4' -- LATIN SMALL LETTER O WITH CIRCUMFLEX
decode '\x9A' = '\x00F6' -- LATIN SMALL LETTER O WITH DIAERESIS
decode '\x9B' = '\x00F5' -- LATIN SMALL LETTER O WITH TILDE
decode '\x9C' = '\x00FA' -- LATIN SMALL LETTER U WITH ACUTE
decode '\x9D' = '\x00F9' -- LATIN SMALL LETTER U WITH GRAVE
decode '\x9E' = '\x00FB' -- LATIN SMALL LETTER U WITH CIRCUMFLEX
decode '\x9F' = '\x00FC' -- LATIN SMALL LETTER U WITH DIAERESIS
decode '\xA0' = '\x2020' -- DAGGER
decode '\xA1' = '\x00B0' -- DEGREE SIGN
decode '\xA2' = '\x00A2' -- CENT SIGN
decode '\xA3' = '\x00A3' -- POUND SIGN
decode '\xA4' = '\x00A7' -- SECTION SIGN
decode '\xA5' = '\x2022' -- BULLET
decode '\xA6' = '\x00B6' -- PILCROW SIGN
decode '\xA7' = '\x00DF' -- LATIN SMALL LETTER SHARP S
decode '\xA8' = '\x00AE' -- REGISTERED SIGN
decode '\xA9' = '\x00A9' -- COPYRIGHT SIGN
decode '\xAA' = '\x2122' -- TRADE MARK SIGN
decode '\xAB' = '\x00B4' -- ACUTE ACCENT
decode '\xAC' = '\x00A8' -- DIAERESIS
decode '\xAD' = '\x2260' -- NOT EQUAL TO
decode '\xAE' = '\x00C6' -- LATIN CAPITAL LETTER AE
decode '\xAF' = '\x00D8' -- LATIN CAPITAL LETTER O WITH STROKE
decode '\xB0' = '\x221E' -- INFINITY
decode '\xB1' = '\x00B1' -- PLUS-MINUS SIGN
decode '\xB2' = '\x2264' -- LESS-THAN OR EQUAL TO
decode '\xB3' = '\x2265' -- GREATER-THAN OR EQUAL TO
decode '\xB4' = '\x00A5' -- YEN SIGN
decode '\xB5' = '\x00B5' -- MICRO SIGN
decode '\xB6' = '\x2202' -- PARTIAL DIFFERENTIAL
decode '\xB7' = '\x2211' -- N-ARY SUMMATION
decode '\xB8' = '\x220F' -- N-ARY PRODUCT
decode '\xB9' = '\x03C0' -- GREEK SMALL LETTER PI
decode '\xBA' = '\x222B' -- INTEGRAL
decode '\xBB' = '\x00AA' -- FEMININE ORDINAL INDICATOR
decode '\xBC' = '\x00BA' -- MASCULINE ORDINAL INDICATOR
decode '\xBD' = '\x03A9' -- GREEK CAPITAL LETTER OMEGA
decode '\xBE' = '\x00E6' -- LATIN SMALL LETTER AE
decode '\xBF' = '\x00F8' -- LATIN SMALL LETTER O WITH STROKE
decode '\xC0' = '\x00BF' -- INVERTED QUESTION MARK
decode '\xC1' = '\x00A1' -- INVERTED EXCLAMATION MARK
decode '\xC2' = '\x00AC' -- NOT SIGN
decode '\xC3' = '\x221A' -- SQUARE ROOT
decode '\xC4' = '\x0192' -- LATIN SMALL LETTER F WITH HOOK
decode '\xC5' = '\x2248' -- ALMOST EQUAL TO
decode '\xC6' = '\x2206' -- INCREMENT
decode '\xC7' = '\x00AB' -- LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
decode '\xC8' = '\x00BB' -- RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
decode '\xC9' = '\x2026' -- HORIZONTAL ELLIPSIS
decode '\xCA' = '\x00A0' -- NO-BREAK SPACE
decode '\xCB' = '\x00C0' -- LATIN CAPITAL LETTER A WITH GRAVE
decode '\xCC' = '\x00C3' -- LATIN CAPITAL LETTER A WITH TILDE
decode '\xCD' = '\x00D5' -- LATIN CAPITAL LETTER O WITH TILDE
decode '\xCE' = '\x0152' -- LATIN CAPITAL LIGATURE OE
decode '\xCF' = '\x0153' -- LATIN SMALL LIGATURE OE
decode '\xD0' = '\x2013' -- EN DASH
decode '\xD1' = '\x2014' -- EM DASH
decode '\xD2' = '\x201C' -- LEFT DOUBLE QUOTATION MARK
decode '\xD3' = '\x201D' -- RIGHT DOUBLE QUOTATION MARK
decode '\xD4' = '\x2018' -- LEFT SINGLE QUOTATION MARK
decode '\xD5' = '\x2019' -- RIGHT SINGLE QUOTATION MARK
decode '\xD6' = '\x00F7' -- DIVISION SIGN
decode '\xD7' = '\x25CA' -- LOZENGE
decode '\xD8' = '\x00FF' -- LATIN SMALL LETTER Y WITH DIAERESIS
decode '\xD9' = '\x0178' -- LATIN CAPITAL LETTER Y WITH DIAERESIS
decode '\xDA' = '\x2044' -- FRACTION SLASH
decode '\xDB' = '\x20AC' -- EURO SIGN
decode '\xDC' = '\x2039' -- SINGLE LEFT-POINTING ANGLE QUOTATION MARK
decode '\xDD' = '\x203A' -- SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
decode '\xDE' = '\xFB01' -- LATIN SMALL LIGATURE FI
decode '\xDF' = '\xFB02' -- LATIN SMALL LIGATURE FL
decode '\xE0' = '\x2021' -- DOUBLE DAGGER
decode '\xE1' = '\x00B7' -- MIDDLE DOT
decode '\xE2' = '\x201A' -- SINGLE LOW-9 QUOTATION MARK
decode '\xE3' = '\x201E' -- DOUBLE LOW-9 QUOTATION MARK
decode '\xE4' = '\x2030' -- PER MILLE SIGN
decode '\xE5' = '\x00C2' -- LATIN CAPITAL LETTER A WITH CIRCUMFLEX
decode '\xE6' = '\x00CA' -- LATIN CAPITAL LETTER E WITH CIRCUMFLEX
decode '\xE7' = '\x00C1' -- LATIN CAPITAL LETTER A WITH ACUTE
decode '\xE8' = '\x00CB' -- LATIN CAPITAL LETTER E WITH DIAERESIS
decode '\xE9' = '\x00C8' -- LATIN CAPITAL LETTER E WITH GRAVE
decode '\xEA' = '\x00CD' -- LATIN CAPITAL LETTER I WITH ACUTE
decode '\xEB' = '\x00CE' -- LATIN CAPITAL LETTER I WITH CIRCUMFLEX
decode '\xEC' = '\x00CF' -- LATIN CAPITAL LETTER I WITH DIAERESIS
decode '\xED' = '\x00CC' -- LATIN CAPITAL LETTER I WITH GRAVE
decode '\xEE' = '\x00D3' -- LATIN CAPITAL LETTER O WITH ACUTE
decode '\xEF' = '\x00D4' -- LATIN CAPITAL LETTER O WITH CIRCUMFLEX
decode '\xF0' = '\xF8FF' -- Apple logo
decode '\xF1' = '\x00D2' -- LATIN CAPITAL LETTER O WITH GRAVE
decode '\xF2' = '\x00DA' -- LATIN CAPITAL LETTER U WITH ACUTE
decode '\xF3' = '\x00DB' -- LATIN CAPITAL LETTER U WITH CIRCUMFLEX
decode '\xF4' = '\x00D9' -- LATIN CAPITAL LETTER U WITH GRAVE
decode '\xF5' = '\x0131' -- LATIN SMALL LETTER DOTLESS I
decode '\xF6' = '\x02C6' -- MODIFIER LETTER CIRCUMFLEX ACCENT
decode '\xF7' = '\x02DC' -- SMALL TILDE
decode '\xF8' = '\x00AF' -- MACRON
decode '\xF9' = '\x02D8' -- BREVE
decode '\xFA' = '\x02D9' -- DOT ABOVE
decode '\xFB' = '\x02DA' -- RING ABOVE
decode '\xFC' = '\x00B8' -- CEDILLA
decode '\xFD' = '\x02DD' -- DOUBLE ACUTE ACCENT
decode '\xFE' = '\x02DB' -- OGONEK
decode '\xFF' = '\x02C7' -- CARON
decode c = c -- The rest is ASCII

16
src/PDF/Font.hs Normal file
View File

@ -0,0 +1,16 @@
module PDF.Font (
Font
, FontSet
, emptyFont
) where
import Data.ByteString (ByteString)
import Data.Map (Map)
import Data.Text (Text)
import PDF.Object (Name)
type Font = ByteString -> Either String Text
type FontSet = Map Name Font
emptyFont :: Font
emptyFont _ = Left "No fond loaded"

View File

@ -116,7 +116,7 @@ char :: Char -> OBuilder
char c = lift char8 c <* offset (+1) char c = lift char8 c <* offset (+1)
string :: String -> OBuilder string :: String -> OBuilder
string s = lift string8 s <* offset (+ toEnum (length s)) string s = lift string8 s <* offset (+ length s)
line :: String -> OBuilder line :: String -> OBuilder
line l = string l `mappend` newLine line l = string l `mappend` newLine

View File

@ -6,41 +6,40 @@ module PDF.Pages (
) where ) where
import Codec.Compression.Zlib (decompress) import Codec.Compression.Zlib (decompress)
import Control.Applicative ((<|>))
import Control.Monad (foldM) import Control.Monad (foldM)
import Control.Monad.RWS (RWST(..), ask, evalRWST, mapRWST, modify) import Control.Monad.RWS (RWST(..), ask, evalRWST, lift, modify)
import qualified Control.Monad.RWS as RWS (get) import qualified Control.Monad.RWS as RWS (get)
import Data.ByteString (ByteString) import Data.ByteString (ByteString)
import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict) import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict)
import Data.Map (Map, (!)) import Data.Map (Map, (!))
import qualified Data.Map as Map (empty, fromList, insert, lookup, toList) import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
import PDF.CMap (CMap, CMappers, cMap) import Data.Text (Text)
import PDF.CMap (cMap)
import PDF.Encoding (encoding)
import PDF.Font (Font, FontSet)
import PDF.Object ( import PDF.Object (
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..) Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
, Object(..), Name(..), Structure(..) , Object(..), Name(..), Structure(..)
,) ,)
import PDF.Output (ObjectId(..)) import PDF.Output (ObjectId(..))
import PDF.Text (pageContents) import PDF.Text (pageContents)
import Prelude hiding (fail)
import Text.Printf (printf) import Text.Printf (printf)
type CachedCMaps = Map ObjectId CMap type CachedFonts = Map ObjectId Font
type T = RWST Content () CachedCMaps (Either String) type T = RWST Content () CachedFonts (Either String)
data Page = Page { data Page = Page {
contents :: [ByteString] contents :: [Text]
, source :: ObjectId , source :: ObjectId
} }
infixl 1 \\= infixl 1 \\=
(\\=) :: T a -> (a -> Either String b) -> T b (\\=) :: T a -> (a -> Either String b) -> T b
x \\= f = mapRWST ((\(a, s, w) -> (\b -> (b, s, w)) <$> f a) =<<) x x \\= f = x >>= lift . f
infixl 1 //= infixl 1 //=
(//=) :: Either String a -> (a -> T b) -> T b (//=) :: Either String a -> (a -> T b) -> T b
(//=) (Left e) _ = RWST (\_ _ -> Left e) x //= f = lift x >>= f
(//=) (Right a) f = f a
lift :: Either String a -> T a
lift x = x //= return
expected :: Show a => String -> a -> Either String b expected :: Show a => String -> a -> Either String b
expected name = Left . printf "Not a %s: %s" name . show expected name = Left . printf "Not a %s: %s" name . show
@ -60,14 +59,14 @@ getResource (Reference (IndirectObjCoordinates {objectId})) =
getResource directObject = getResource directObject =
lift $ expected "resource (dictionary or reference)" directObject lift $ expected "resource (dictionary or reference)" directObject
getFont :: Dictionary -> T Dictionary getFontDictionary :: Dictionary -> T Dictionary
getFont pageDict = getFontDictionary pageDict =
key "Resources" pageDict key "Resources" pageDict
//= getResource //= getResource
\\= key "Font" \\= key "Font"
>>= getResource >>= getResource
cache :: (ObjectId -> T CMap) -> ObjectId -> T CMap cache :: (ObjectId -> T Font) -> ObjectId -> T Font
cache loader objectId = cache loader objectId =
(maybe load return . Map.lookup objectId) =<< RWS.get (maybe load return . Map.lookup objectId) =<< RWS.get
where where
@ -76,23 +75,27 @@ cache loader objectId =
modify $ Map.insert objectId value modify $ Map.insert objectId value
return value return value
loadFont :: ObjectId -> T CMap loadFont :: ObjectId -> T Font
loadFont objectId = loadFont objectId = getObject objectId \\= dict >>= tryMappings
getObject objectId
\\= dict
\\= key "ToUnicode"
>>= follow
\\= stream
\\= cMap
loadCMappers :: Dictionary -> T CMappers
loadCMappers = foldM loadCMapper Map.empty . Map.toList
where where
loadCMapper :: CMappers -> (Name, DirectObject) -> T CMappers tryMappings dictionary =
loadCMapper output (name, Reference (IndirectObjCoordinates {objectId})) = loadCMap dictionary
<|> lift (key "Encoding" dictionary >>= loadEncoding)
<|> lift (Left $ unknownFormat (show objectId) (show dictionary))
unknownFormat = printf "Unknown font format for object #%s : %s"
loadCMap dictionary =
key "ToUnicode" dictionary //= follow \\= stream \\= cMap
loadEncoding (NameObject (Name name)) = encoding name
loadEncoding directObject =
Left . printf "Encoding must be a name, not that : %s" $ show directObject
loadFonts :: Dictionary -> T FontSet
loadFonts = foldM addFont Map.empty . Map.toList
where
addFont :: FontSet -> (Name, DirectObject) -> T FontSet
addFont output (name, Reference (IndirectObjCoordinates {objectId})) =
flip (Map.insert name) output <$> cache loadFont objectId flip (Map.insert name) output <$> cache loadFont objectId
--maybe output (flip (Map.insert name) output) <$> cache loadFont objectId addFont output _ = return output
loadCMapper output _ = return output
getObject :: ObjectId -> T Object getObject :: ObjectId -> T Object
getObject objectId = do getObject objectId = do
@ -139,16 +142,16 @@ getReferences objects = do
Reference (IndirectObjCoordinates {objectId}) -> [objectId] Reference (IndirectObjCoordinates {objectId}) -> [objectId]
_ -> [] _ -> []
extractText :: Object -> T [ByteString] extractText :: Object -> T [Text]
extractText object = do extractText object = do
pageDict <- lift $ dict object pageDict <- lift $ dict object
cMappers <- loadCMappers =<< getFont pageDict fonts <- loadFonts =<< getFontDictionary pageDict
let objects = ((many <$> (key "Contents" pageDict)) :: Either String [DirectObject]) let objects = ((many <$> (key "Contents" pageDict)) :: Either String [DirectObject])
concat <$> (objects //= (mapM $ loadContent cMappers)) concat <$> (objects //= (mapM $ loadContent fonts))
where where
loadContent :: CMappers -> DirectObject -> T [ByteString] loadContent :: FontSet -> DirectObject -> T [Text]
loadContent cMappers directObject = loadContent fonts directObject =
follow directObject \\= stream \\= pageContents cMappers follow directObject \\= stream \\= pageContents fonts
loadPage :: ObjectId -> T Page loadPage :: ObjectId -> T Page
loadPage source = loadPage source =

View File

@ -2,21 +2,13 @@
{-# LANGUAGE ConstraintKinds #-} {-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE UndecidableInstances #-} {-# LANGUAGE UndecidableInstances #-}
module PDF.Parser ( module PDF.Parser (
MonadParser MonadParser(..)
, Parser , Parser
, (<?>) , (<?>)
, block
, char
, decNumber
, hexNumber
, octDigit , octDigit
, on , on
, oneOf
, runParser , runParser
, evalParser , evalParser
, string
, takeAll
, takeAll1
) where ) where
import Control.Applicative (Alternative, (<|>)) import Control.Applicative (Alternative, (<|>))
@ -25,7 +17,8 @@ import Control.Monad.Fail (MonadFail(..))
import Control.Monad.State (StateT(..), evalStateT) import Control.Monad.State (StateT(..), evalStateT)
import Control.Monad.Trans (MonadTrans(..)) import Control.Monad.Trans (MonadTrans(..))
import qualified Data.Attoparsec.ByteString.Char8 as Atto ( import qualified Data.Attoparsec.ByteString.Char8 as Atto (
Parser, char, parseOnly, satisfy, string, take, takeWhile, takeWhile1 Parser, char, endOfInput, parseOnly, satisfy, string, take, takeWhile
, takeWhile1
) )
import Data.ByteString (ByteString) import Data.ByteString (ByteString)
import Data.ByteString.Char8.Util (B16Int(..)) import Data.ByteString.Char8.Util (B16Int(..))
@ -40,6 +33,7 @@ class MonadDeps m => MonadParser m where
block :: Int -> m ByteString block :: Int -> m ByteString
char :: Char -> m Char char :: Char -> m Char
decNumber :: m ByteString decNumber :: m ByteString
endOfInput :: m ()
hexNumber :: m B16Int hexNumber :: m B16Int
oneOf :: String -> m Char oneOf :: String -> m Char
string :: ByteString -> m ByteString string :: ByteString -> m ByteString
@ -49,6 +43,7 @@ class MonadDeps m => MonadParser m where
instance MonadParser Atto.Parser where instance MonadParser Atto.Parser where
block = Atto.take block = Atto.take
char = Atto.char char = Atto.char
endOfInput = Atto.endOfInput
decNumber = Atto.takeWhile1 (`Set.member` digits) decNumber = Atto.takeWhile1 (`Set.member` digits)
hexNumber = B16Int <$> Atto.takeWhile1 (`Set.member` hexDigits) hexNumber = B16Int <$> Atto.takeWhile1 (`Set.member` hexDigits)
oneOf charSet = Atto.satisfy (`elem` charSet) oneOf charSet = Atto.satisfy (`elem` charSet)
@ -59,6 +54,7 @@ instance MonadParser Atto.Parser where
instance (MonadParser m, MonadTrans t, MonadDeps (t m)) => MonadParser (t m) where instance (MonadParser m, MonadTrans t, MonadDeps (t m)) => MonadParser (t m) where
block = lift . block block = lift . block
char = lift . char char = lift . char
endOfInput = lift $ endOfInput
decNumber = lift $ decNumber decNumber = lift $ decNumber
hexNumber = lift $ hexNumber hexNumber = lift $ hexNumber
oneOf = lift . oneOf oneOf = lift . oneOf

View File

@ -8,35 +8,58 @@ module PDF.Text {-(
import Control.Applicative ((<|>)) import Control.Applicative ((<|>))
import Control.Monad (foldM) import Control.Monad (foldM)
import Control.Monad.Fail (MonadFail)
import Control.Monad.Reader (ReaderT, runReaderT, asks) import Control.Monad.Reader (ReaderT, runReaderT, asks)
import Control.Monad.State (MonadState, evalStateT, get, modify, put) import Control.Monad.State (MonadState, evalStateT, get, modify, put)
import Data.Attoparsec.ByteString.Char8 (choice, sepBy) import Data.Attoparsec.ByteString.Char8 (choice, sepBy)
import Data.ByteString (ByteString) import Data.ByteString (ByteString)
import qualified Data.ByteString as BS (drop, null, take)
import Data.ByteString.Char8 (pack, unpack) import Data.ByteString.Char8 (pack, unpack)
import Data.Char (toLower)
import Data.Map ((!), (!?), Map) import Data.Map ((!), (!?), Map)
import qualified Data.Map as Map (fromList, lookup, toList) import qualified Data.Map as Map (fromList)
import PDF.CMap (CMappers, CMap, CRange(..), emptyCMap) import Data.Text (Text)
import PDF.Font (Font, FontSet, emptyFont)
import PDF.Object ( import PDF.Object (
DirectObject(..), StringObject(..) DirectObject(..), StringObject(..)
, array, blank, name, regular, stringObject, toByteString , array, blank, name, regular, stringObject, toByteString
) )
import PDF.Parser (MonadParser, (<?>), Parser, evalParser, string, takeAll1) import PDF.Parser (MonadParser(..), (<?>), Parser, evalParser)
data StateOperator = data StateOperator =
Cm | W | J | J_ | M | D | Ri | I | Gs -- general graphic state C_m | W_ | J | J_ | M | D_ | R_i | I_ | G_s -- general graphic state
deriving (Bounded, Enum) deriving (Bounded, Enum, Show)
data PathOperator =
M_ | L_ | C_ | V_ | Y_ | H_ | R_e -- path construction
| S | S_ | F_ | F | Fstar | B | Bstar | B_ | B_star | N_ -- path painting
| W | Wstar -- clipping path
deriving (Bounded, Enum, Show)
data ColorOperator =
CS | C_s | SC | SCN | S_c | S_cn | G | G_ | RG | R_g | K | K_
deriving (Bounded, Enum, Show)
data TextOperator = data TextOperator =
Td | TD | Tm | Tstar -- text positioning Td | TD | Tm | Tstar -- text positioning
| TJ | Tj | Quote | DQuote -- text showing | TJ | Tj | Quote | DQuote -- text showing
| Tc | Tw | Tz | TL | Tf | Tr | Ts -- text state | Tc | Tw | Tz | TL | Tf | Tr | Ts -- text state
deriving (Bounded, Enum) deriving (Bounded, Enum, Show)
data Argument = Raw ByteString | Typed DirectObject deriving Show data Argument = Raw ByteString | Typed DirectObject deriving Show
type Call a = (a, [Argument]) type Call a = (a, [Argument])
type Operator a = (Bounded a, Enum a, Show a)
code :: Operator a => a -> ByteString
code = pack . expand . show
where
expand "" = ""
expand (c:'_':s) = toLower c : expand s
expand ('s':'t':'a':'r':s) = '*' : expand s
expand ('Q':'u':'o':'t':'e':s) = '\'' : expand s
expand ('D':'Q':'u':'o':'t':'e':s) = '"' : expand s
expand (c:s) = c : expand s
{-
instance Show StateOperator where instance Show StateOperator where
show Cm = "cm" show Cm = "cm"
show W = "w" show W_ = "w"
show J = "J" show J = "J"
show J_ = "j" show J_ = "j"
show M = "M" show M = "M"
@ -45,6 +68,35 @@ instance Show StateOperator where
show I = "i" show I = "i"
show Gs = "gs" show Gs = "gs"
instance Show PathOperator where
show M_ = "m"
show L_ = "l"
show C_ = "c"
show V_ = "v"
show Y_ = "y"
show H_
("m", (M_, \l -> case l of [Raw _, Raw _] -> True ; _ -> False))
, ("l", (L_, \l -> case l of [Raw _, Raw _] -> True ; _ -> False))
, ("c", (L_, \l -> case l of [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False))
, ("v", (L_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False))
, ("y", (L_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False))
, ("h", (L_, \l -> case l of [] -> True ; _ -> False))
, ("re", (L_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False))
, ("S", (L_, \l -> case l of [] -> True ; _ -> False))
, ("s", (L_, \l -> case l of [] -> True ; _ -> False))
, ("f", (L_, \l -> case l of [] -> True ; _ -> False))
, ("F", (L_, \l -> case l of [] -> True ; _ -> False))
, ("F*", (L_, \l -> case l of [] -> True ; _ -> False))
, ("B", (L_, \l -> case l of [] -> True ; _ -> False))
, ("B*", (L_, \l -> case l of [] -> True ; _ -> False))
, ("b", (L_, \l -> case l of [] -> True ; _ -> False))
, ("b*", (L_, \l -> case l of [] -> True ; _ -> False))
, ("n", (L_, \l -> case l of [] -> True ; _ -> False))
, ("W", (L_, \l -> case l of [] -> True ; _ -> False))
, ("W*", (L_, \l -> case l of [] -> True ; _ -> False))
instance Show ColorOperator where
instance Show TextOperator where instance Show TextOperator where
show Td = "Td" show Td = "Td"
show TD = "TD" show TD = "TD"
@ -61,18 +113,58 @@ instance Show TextOperator where
show Tf = "Tf" show Tf = "Tf"
show Tr = "Tr" show Tr = "Tr"
show Ts = "Ts" show Ts = "Ts"
-}
stateOperator :: OperatorTable StateOperator stateOperator :: OperatorTable StateOperator
stateOperator = Map.fromList [ stateOperator = Map.fromList $ (\(op, checker) -> (code op, (op, checker))) <$> [
("cm", (Cm, \l -> case l of [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)) (C_m, \l -> case l of [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
, ("w", (W, \l -> case l of [Raw _] -> True ; _ -> False)) , (W_, \l -> case l of [Raw _] -> True ; _ -> False)
, ("J", (J, \l -> case l of [Raw _] -> True ; _ -> False)) , (J, \l -> case l of [Raw _] -> True ; _ -> False)
, ("j", (J_, \l -> case l of [Raw _] -> True ; _ -> False)) , (J_, \l -> case l of [Raw _] -> True ; _ -> False)
, ("M", (M, \l -> case l of [Raw _] -> True ; _ -> False)) , (M, \l -> case l of [Raw _] -> True ; _ -> False)
, ("d", (D, \l -> case l of [Raw _, Raw _] -> True ; _ -> False)) , (D_, \l -> case l of [Raw _, Raw _] -> True ; _ -> False)
, ("ri", (Ri, \l -> case l of [Raw _] -> True ; _ -> False)) , (R_i, \l -> case l of [Raw _] -> True ; _ -> False)
, ("i", (I, \l -> case l of [Raw _] -> True ; _ -> False)) , (I_, \l -> case l of [Raw _] -> True ; _ -> False)
, ("gs", (Gs, \l -> case l of [Typed (NameObject _)] -> True ; _ -> False)) , (G_s, \l -> case l of [Typed (NameObject _)] -> True ; _ -> False)
]
pathOperator :: OperatorTable PathOperator
pathOperator = Map.fromList $ (\(op, checker) -> (code op, (op, checker))) <$> [
(M_, \l -> case l of [Raw _, Raw _] -> True ; _ -> False)
, (L_, \l -> case l of [Raw _, Raw _] -> True ; _ -> False)
, (C_, \l -> case l of [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
, (V_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
, (Y_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
, (H_, \l -> case l of [] -> True ; _ -> False)
, (R_e, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
, (S, \l -> case l of [] -> True ; _ -> False)
, (S_, \l -> case l of [] -> True ; _ -> False)
, (F_, \l -> case l of [] -> True ; _ -> False)
, (F, \l -> case l of [] -> True ; _ -> False)
, (Fstar, \l -> case l of [] -> True ; _ -> False)
, (B, \l -> case l of [] -> True ; _ -> False)
, (Bstar, \l -> case l of [] -> True ; _ -> False)
, (B_, \l -> case l of [] -> True ; _ -> False)
, (B_star, \l -> case l of [] -> True ; _ -> False)
, (N_, \l -> case l of [] -> True ; _ -> False)
, (W, \l -> case l of [] -> True ; _ -> False)
, (Wstar, \l -> case l of [] -> True ; _ -> False)
]
colorOperator :: OperatorTable ColorOperator
colorOperator = Map.fromList $ (\(op, checker) -> (code op, (op, checker))) <$> [
(CS, \l -> case l of [Typed (NameObject _)] -> True ; _ -> False)
, (C_s, \l -> case l of [Typed (NameObject _)] -> True ; _ -> False)
, (SC, \_ -> True)
, (SCN, \_ -> True)
, (S_c, \_ -> True)
, (S_cn, \_ -> True)
, (G, \l -> case l of [Raw _] -> True ; _ -> False)
, (G_, \l -> case l of [Raw _] -> True ; _ -> False)
, (RG, \l -> case l of [Raw _, Raw _, Raw _] -> True ; _ -> False)
, (R_g, \l -> case l of [Raw _, Raw _, Raw _] -> True ; _ -> False)
, (K, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
, (K_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
] ]
{- {-
@ -89,7 +181,7 @@ stateOperator _ = False
-} -}
textOperator :: OperatorTable TextOperator textOperator :: OperatorTable TextOperator
textOperator = Map.fromList $ (\(op, checker) -> (pack $ show op, (op, checker))) <$> [ textOperator = Map.fromList $ (\(op, checker) -> (code op, (op, checker))) <$> [
(Td, \l -> case l of [Raw _, Raw _] -> True ; _ -> False) (Td, \l -> case l of [Raw _, Raw _] -> True ; _ -> False)
, (TD, \l -> case l of [Raw _, Raw _] -> True ; _ -> False) , (TD, \l -> case l of [Raw _, Raw _] -> True ; _ -> False)
, (Tm, \l -> case l of [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False) , (Tm, \l -> case l of [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
@ -127,7 +219,7 @@ textOperator _ = False
-} -}
type ArgumentStackParser m = (MonadState [Argument] m, MonadParser m) type ArgumentStackParser m = (MonadState [Argument] m, MonadParser m)
type Operator a = (Bounded a, Enum a, Show a) --type Operator a = (Bounded a, Enum a, Show a)
type OperatorTable a = Map ByteString (TypeChecker a) type OperatorTable a = Map ByteString (TypeChecker a)
type TypeChecker a = (a, [Argument] -> Bool) type TypeChecker a = (a, [Argument] -> Bool)
@ -146,14 +238,14 @@ callChunk table =
Nothing -> return . Right $ Raw chunk Nothing -> return . Right $ Raw chunk
Just typeChecker -> return $ Left typeChecker Just typeChecker -> return $ Left typeChecker
stackParser :: (ArgumentStackParser m, Show a) => OperatorTable a -> m (Call a) stackParser :: (ArgumentStackParser m, Operator a) => OperatorTable a -> m (Call a)
stackParser table = either popCall push =<< (callChunk table) stackParser table = either popCall push =<< (callChunk table)
where where
push arg = modify (arg:) >> stackParser table push arg = modify (arg:) >> stackParser table
popCall (operator, predicate) = do popCall (operator, predicate) = do
arguments <- reverse <$> get arguments <- reverse <$> get
let call = (operator, arguments) let call = (operator, arguments)
if predicate arguments then return call else fail (show call) if predicate arguments then return call else fail (unpack $ code operator)
a :: (Operator a, MonadParser m) => OperatorTable a -> m (Call a) a :: (Operator a, MonadParser m) => OperatorTable a -> m (Call a)
a table = evalStateT (stackParser table) [] a table = evalStateT (stackParser table) []
@ -170,29 +262,34 @@ nameArg = Typed . NameObject <$> name <* blank
stringArg :: MonadParser m => m Argument stringArg :: MonadParser m => m Argument
stringArg = Typed . StringObject <$> stringObject <* blank stringArg = Typed . StringObject <$> stringObject <* blank
type ParserWithFont = ReaderT CMappers (Parser CMap) type ParserWithFont = ReaderT FontSet (Parser Font)
pageContents :: CMappers -> ByteString -> Either String [ByteString] pageContents :: FontSet -> ByteString -> Either String [Text]
pageContents font input = pageContents fontSet input =
evalParser (runReaderT page font) emptyCMap input evalParser (runReaderT (page) fontSet) emptyFont input
page :: ParserWithFont [ByteString] several :: MonadParser m => m [a] -> m [a]
page = graphicState <|> text <?> "Text page contents" several p = concat <$> (p `sepBy` blank)
graphicState :: ParserWithFont [ByteString] page :: ParserWithFont [Text]
page = several (graphicState <|> text) <* blank <* endOfInput <?> "Text page contents"
graphicState :: ParserWithFont [Text]
graphicState = graphicState =
(string "q" *> blank *> insideQ <* blank <* string "Q") <?> "Graphic state" string "q" *> blank *> insideQ <* blank <* string "Q" <?> "Graphic state"
where where
insideQ = concat <$> ((command <|> page) `sepBy` blank ) insideQ = several (command <|> graphicState <|> text)
command = a stateOperator *> return [] ignore x = a x *> return []
command =
ignore stateOperator <|> ignore pathOperator <|> ignore colorOperator
text :: ParserWithFont [ByteString] text :: ParserWithFont [Text]
text = text =
string "BT" *> blank *> commands <* blank <* string "ET" <?> "Text operators" string "BT" *> blank *> commands <* blank <* string "ET" <?> "Text operators"
where where
commands = concat <$> (a textOperator >>= runOperator) `sepBy` blank commands = several (a textOperator >>= runOperator)
runOperator :: Call TextOperator -> ParserWithFont [ByteString] runOperator :: Call TextOperator -> ParserWithFont [Text]
runOperator (Tf, [Typed (NameObject fontName), _]) = runOperator (Tf, [Typed (NameObject fontName), _]) =
asks (! fontName) >>= put >> return [] asks (! fontName) >>= put >> return []
@ -216,24 +313,7 @@ runOperator (DQuote, [Typed (StringObject outputString)]) =
runOperator _ = return [] runOperator _ = return []
decodeString :: StringObject -> ParserWithFont ByteString decodeString :: (MonadFail m, MonadState Font m) => StringObject -> m Text
decodeString = decode . toByteString decodeString input = do
where font <- get
decode input either fail return . font $ toByteString input
| BS.null input = return ""
| otherwise = do
(output, remainingInput) <- trySizes input =<< Map.toList <$> get
mappend output <$> decode remainingInput
trySizes :: ByteString -> [(Int, [CRange])] -> ParserWithFont (ByteString, ByteString)
trySizes s [] = fail $ "No matching code found in font for " ++ unpack s
trySizes s ((size, cRanges):others) =
let prefix = BS.take size s in
case tryRanges prefix cRanges of
Nothing -> trySizes s others
Just outputSequence -> return (outputSequence, BS.drop size s)
tryRanges :: ByteString -> [CRange] -> Maybe ByteString
tryRanges _ [] = Nothing
tryRanges prefix ((CRange {mapping}):cRanges) =
case Map.lookup prefix mapping of
Nothing -> tryRanges prefix cRanges
outputSequence -> outputSequence