Improve getObj example to catch non-existent ObjectIds and default to listing the existing ObjectIds when none is provided

2019-11-29 11:53:08 +01:00
16 changed files with 137 additions and 1079 deletions
--- a/Hufflepdf.cabal
+++ b/Hufflepdf.cabal
@ -17,28 +17,19 @@ cabal-version:       >=1.10
 library
  exposed-modules:     PDF
                     , PDF.CMap
                     , PDF.EOL
                     , PDF.Object
                     , PDF.Output
                     , PDF.Pages
                     , PDF.Parser
                     , PDF.Text
                     , PDF.Update
  other-modules:       Data.ByteString.Char8.Util
                     , PDF.Encoding
                     , PDF.Encoding.MacRoman
                     , PDF.Body
-                     , PDF.Font
+                     , PDF.Parser
  -- other-extensions:    
  build-depends:       attoparsec
                     , base >=4.9 && <4.13
                     , bytestring
                     , containers
                     , mtl
                     , text
                     , utf8-string
                     , zlib
  hs-source-dirs:      src
  ghc-options:         -Wall
  default-language:    Haskell2010
@ -60,15 +51,3 @@ executable             getObj
                     , zlib
  ghc-options:         -Wall
  default-language:    Haskell2010
 executable             getText
  main-is:             examples/getText.hs
  build-depends:       base
                     , bytestring
                     , containers
                     , Hufflepdf
                     , mtl
                     , text
                     , zlib
  ghc-options:         -Wall
  default-language:    Haskell2010
--- a/examples/getObj.hs
+++ b/examples/getObj.hs
@ -4,16 +4,17 @@ import Codec.Compression.Zlib (decompress)
 import Data.ByteString.Lazy.Char8 (ByteString)
 import qualified Data.ByteString.Char8 as BS (readFile)
 import qualified Data.ByteString.Lazy.Char8 as Lazy (fromStrict, putStr, toStrict)
-import Data.Map ((!))
+import Data.Map ((!?))
-import qualified Data.Map as Map (lookup)
+import qualified Data.Map as Map (keys, lookup)
 import PDF (Document(..), parseDocument)
 import qualified PDF.EOL as EOL (Style)
 import PDF.Object (Content(..), DirectObject(..), Object(..), Name(..))
 import PDF.Output (ObjectId(..))
 import qualified PDF.Output as Output (render)
 import PDF.Update (unify)
-import System.Environment (getArgs)
+import System.Environment (getArgs, getProgName)
-import System.IO (hPutStrLn, stderr)
+import System.Exit (die)
 import Text.Printf (printf)
 display :: EOL.Style -> Object -> ByteString
 display eolStyle d@(Direct _) = Output.render eolStyle d
@ -25,16 +26,28 @@ display eolStyle s@(Stream {header, streamContent}) = Output.render eolStyle $
      }
    _ -> s
-extractObject :: ObjectId -> Document -> ByteString
+extractObject :: ObjectId -> Document -> Either String ByteString
 extractObject objectId (Document {eolStyle, updates}) =
-  display eolStyle . (!objectId) $ objects  content
+  case objects content !? objectId of
    Nothing -> Left $ "No object has ID " ++ show (getObjectId objectId)
    Just o -> Right $ display eolStyle o
  where
    content = unify updates
 -- | List the identifier of every object defined once the updates of the
 -- document have been unified, preceded by a title line. The result is always
 -- a 'Right'.
 listObjectIds :: Document -> Either String [String]
 listObjectIds document = Right (title : identifiers)
  where
    title = "ObjectIds defined in this PDF:"
    definedObjects = objects . unify $ updates document
    identifiers = show . getObjectId <$> Map.keys definedObjects
 main :: IO ()
 main = do
-  [inputFile, objectId] <- getArgs
+  (inputFile, getData) <- parse =<< getArgs
-  result <- parseDocument <$> BS.readFile inputFile
+  input <- BS.readFile inputFile
-  case result of
+  either die id $ (parseDocument input >>= getData)
-    Left parseError -> hPutStrLn stderr parseError
+  where
-    Right doc -> Lazy.putStr $ extractObject (ObjectId (read objectId)) doc
+    parse [inputFile] = return (inputFile, fmap (mapM_ putStrLn) . listObjectIds)
    parse [inputFile, objectId] = return
      (inputFile, fmap Lazy.putStr . extractObject (ObjectId (read objectId)))
    parse _ = die . printf "Syntax: %s inputFile [OBJECT_ID]\n" =<< getProgName
--- a/examples/getText.hs
+++ b/examples/getText.hs
@ -1,38 +0,0 @@
 import qualified Data.ByteString.Char8 as BS (readFile)
 import qualified Data.Map as Map (toList)
 import qualified Data.Text.IO as Text (putStrLn)
 import PDF (Document(..), parseDocument)
 import PDF.Object (Content)
 import PDF.Pages (Page(..), get, getAll)
 import PDF.Update (unify)
 import System.Environment (getArgs)
 import System.Exit (die)
 import System.IO (BufferMode(..), hSetBuffering, stdout)
 -- | Read the PDF stored in @inputFile@, unify its updates into a single
 -- 'Content' and run @f@ on it. Any error, whether it comes from parsing the
 -- document or from @f@ itself, is handed to 'die'.
 onDoc :: FilePath -> (Content -> Either String a) -> IO a
 onDoc inputFile f = do
  rawInput <- BS.readFile inputFile
  either die return $ parseDocument rawInput >>= f . unify . updates
 -- | Print each 'Text' held in the contents of a page on its own line.
 displayPage :: Page -> IO ()
 displayPage page = mapM_ Text.putStrLn (contents page)
 -- | Print the text of every page of the PDF stored in @inputFile@, in the
 -- order of the keys of the map returned by 'getAll'.
 wholeDoc :: FilePath -> IO ()
 wholeDoc inputFile =
  onDoc inputFile getAll >>= mapM_ (displayPage . snd) . Map.toList
 -- | Print the text of the page numbered @pageNumber@ in the PDF stored in
 -- @inputFile@.
 singlePage :: FilePath -> Int -> IO ()
 singlePage inputFile pageNumber = do
  page <- onDoc inputFile $ \content -> get content pageNumber
  displayPage page
 -- | Entry point of the getText example.
 --
 -- With a single argument, prints the text of the whole document; with a
 -- second argument, prints only the page bearing that number. Any other
 -- command line, including a page number that is not an integer, ends the
 -- program with the syntax reminder.
 main :: IO ()
 main = do
  hSetBuffering stdout LineBuffering
  args <- getArgs
  let syntax = "Syntax: getText INPUT_FILE [PAGE_NUMBER]"
  case args of
    [inputFile] -> wholeDoc inputFile
    [inputFile, pageNumber] ->
      -- 'reads' rather than 'read': an invalid number must produce the usage
      -- message, not a "Prelude.read: no parse" exception. Trailing blanks
      -- are still accepted, as 'read' did.
      case reads pageNumber of
        [(number, rest)] | null (words rest) -> singlePage inputFile number
        _ -> die syntax
    _ -> die syntax
--- a/src/Data/ByteString/Char8/Util.hs
+++ b/src/Data/ByteString/Char8/Util.hs
@ -1,91 +1,16 @@
 module Data.ByteString.Char8.Util (
-      B16Int(..)
+      previous
    , B256Int(..)
    , b8ToInt
    , b16ToBytes
    , b16ToInt
    , b256ToInt
    , intToB256
    , previous
    , subBS
    , toBytes
    , unescape
    , utf16BEToutf8
  ) where
-import Data.ByteString (ByteString, snoc)
+import Data.ByteString.Char8 (ByteString)
-import qualified Data.ByteString as BS (empty, foldl, length, pack, singleton, splitAt)
+import qualified Data.ByteString.Char8 as BS (drop, index, take)
 import qualified Data.ByteString.Char8 as Char8 (
    cons, drop, index, splitAt, take, uncons, unpack
  )
 import Data.Text (Text)
 import Data.Text.Encoding (decodeUtf16BE)
 import Prelude hiding (length)
 import Text.Printf (printf)
 -- | A number written as a string of octal digits (see 'b8ToInt').
 newtype B8Int = B8Int ByteString deriving Show
 -- | A number written as a string of hexadecimal digits (see 'b16ToInt').
 newtype B16Int = B16Int ByteString deriving Show
 -- | A number written in base 256, one digit per byte, most significant byte
 -- first (see 'intToB256' and 'b256ToInt').
 newtype B256Int = B256Int ByteString deriving Show
 previous :: Char -> Int -> ByteString -> Int
 previous char position byteString
-  | Char8.index byteString position == char = position
+  | BS.index byteString position == char = position
  | otherwise = previous char (position - 1) byteString
 subBS :: Int -> Int -> ByteString -> ByteString
-subBS offset length = Char8.take length . Char8.drop offset
+subBS offset length = BS.take length . BS.drop offset
 -- | Write an integer in base 256, most significant byte first, using as few
 -- bytes as possible (a single byte for any value below 0x100).
 intToB256 :: Int -> B256Int
 intToB256 n
  | n < 0x100 = B256Int (BS.singleton (toEnum n))
  | otherwise = B256Int (higherBytes `snoc` toEnum lowestByte)
  where
    (quotient, lowestByte) = n `divMod` 0x100
    B256Int higherBytes = intToB256 quotient
 -- | Read back the integer written in base 256, most significant byte first.
 b256ToInt :: B256Int -> Int
 b256ToInt (B256Int digits) = BS.foldl shiftAndAdd 0 digits
  where
    shiftAndAdd total byte = 0x100 * total + fromEnum byte
 -- | Write the @size@ least significant bytes of @n@, most significant byte
 -- first (higher-order bytes of @n@ that do not fit are dropped).
 --
 -- A @size@ of zero or less yields the empty string; the guard covers
 -- negative sizes too, which previously never reached the base case.
 toBytes :: Int -> Int -> ByteString
 toBytes size n
  | size <= 0 = BS.empty
  | otherwise =
    toBytes (size - 1) (n `div` 0x100) `snoc` toEnum (n `mod` 0x100)
 -- | Decode a string of hexadecimal digits into the bytes it denotes, two
 -- digits per byte.
 --
 -- A trailing unpaired digit is completed with a @0@, as the PDF
 -- specification requires for hexadecimal strings: @ABC@ denotes the bytes
 -- @AB C0@. It used to be read as a byte of its own (@0C@).
 b16ToBytes :: B16Int -> ByteString
 b16ToBytes (B16Int n) = BS.pack . fmap b16ToInt $ pairDigits n
  where
    -- the byte of the ASCII character '0'
    asciiZero = 0x30
    pairDigits s =
      case BS.length s of
        0 -> []
        1 -> [B16Int (s `snoc` asciiZero)]
        _ ->
          let (twoHexDigits, rest) = BS.splitAt 2 s in
          B16Int twoHexDigits : pairDigits rest
 -- | Read digits in the base selected by @b@, by prefixing them with @0@ and
 -- that letter (@0x…@, @0o…@) and handing the result to 'read'. Like 'read',
 -- it raises an error when the digits are not valid.
 fromBase :: (Num a, Read a) => Char -> ByteString -> a
 fromBase b digits = read prefixedLiteral
  where
    prefixedLiteral = printf "0%c%s" b (Char8.unpack digits) :: String
 -- | Value of a number written in hexadecimal digits. Relies on 'fromBase',
 -- hence on 'read': invalid digits raise an error.
 b16ToInt :: (Num a, Read a) => B16Int -> a
 b16ToInt (B16Int n) = fromBase 'x' n
 -- | Value of a number written in octal digits. Relies on 'fromBase', hence
 -- on 'read': invalid digits raise an error.
 b8ToInt :: (Num a, Read a) => B8Int -> a
 b8ToInt (B8Int n) = fromBase 'o' n
 -- | Resolve the backslash escapes of a literal string.
 --
 -- * @\\n@, @\\r@, @\\t@, @\\b@ and @\\f@ become the matching control
 --   character;
 -- * a backslash followed by one to three octal digits becomes the character
 --   with that code (only the low 8 bits are kept);
 -- * a backslash followed by any other character yields that character;
 -- * a lone backslash at the very end of the input is dropped.
 --
 -- Octal escapes stop at the first non-octal character. The previous
 -- implementation always took three characters after the backslash and
 -- crashed in 'read' on inputs such as @\\5)@.
 unescape :: ByteString -> ByteString
 unescape escapedBS =
  case Char8.uncons escapedBS of
    Nothing -> BS.empty
    Just ('\\', s) -> unescapeChar s
    Just (c, s) -> Char8.cons c (unescape s)
  where
    controlChars = zip "nrtbf" "\n\r\t\b\f"
    isOctal c = c `elem` ['0'..'7']
    unescapeChar s =
      case Char8.uncons s of
        Nothing -> BS.empty
        Just (c, s')
          | Just control <- lookup c controlChars ->
              Char8.cons control (unescape s')
          -- restart from s so that the first digit is accumulated too
          | isOctal c -> fromOctal (3 :: Int) 0 s
          | otherwise -> Char8.cons c (unescape s')
    -- accumulate at most `remaining` octal digits, then emit the character
    fromOctal remaining code s =
      case Char8.uncons s of
        Just (c, s') | remaining > 0 && isOctal c ->
          fromOctal (remaining - 1) (8 * code + fromEnum c - fromEnum '0') s'
        _ -> Char8.cons (toEnum (code `mod` 0x100)) (unescape s)
 -- | Decode big-endian UTF-16 bytes into a 'Text'; a plain alias of
 -- 'decodeUtf16BE'.
 -- NOTE(review): despite the name, no UTF-8 bytes are produced here. Also,
 -- 'decodeUtf16BE' presumably raises an exception on malformed input —
 -- confirm against the documentation of the text package.
 utf16BEToutf8 :: ByteString -> Text
 utf16BEToutf8 = decodeUtf16BE
--- a/src/PDF.hs
+++ b/src/PDF.hs
@ -21,7 +21,7 @@ import PDF.Object (
  )
 import qualified PDF.Output as Output (render, line)
 import PDF.Output (Output(..))
-import PDF.Parser (Parser, evalParser, string, takeAll)
+import PDF.Parser (Parser, runParser, string, takeAll)
 import Text.Printf (printf)
 data Document = Document {
@ -83,7 +83,7 @@ findNextSection offset input =
 readStructures :: Int -> ByteString -> Either String [InputStructure]
 readStructures startXref input =
-  evalParser structure () (BS.drop startXref input) >>= stopOrFollow
+  runParser structure () (BS.drop startXref input) >>= stopOrFollow
  where
    stopOrFollow s@(Structure {trailer}) =
      case Map.lookup (Name "Prev") trailer of
@ -96,7 +96,7 @@ readStructures startXref input =
 parseDocument :: ByteString -> Either String Document
 parseDocument input = do
-  (pdfVersion, eolStyle) <- evalParser ((,) <$> version <*> EOL.parser) () input
+  (pdfVersion, eolStyle) <- runParser ((,) <$> version <*> EOL.parser) () input
  startXref <- readStartXref eolStyle input
  structuresRead <- readStructures startXref input
  let updates = populate input <$> structuresRead
--- a/src/PDF/Body.hs
+++ b/src/PDF/Body.hs
@ -6,7 +6,6 @@ module PDF.Body (
 import Control.Applicative ((<|>))
 import Control.Monad.State (get, gets, modify)
 import Data.Attoparsec.ByteString.Char8 (option)
 import Data.ByteString.Char8 (ByteString)
 import qualified Data.ByteString.Char8 as BS (cons, drop, unpack)
 import Data.Map ((!))
@ -19,7 +18,7 @@ import PDF.Object (
    , blank, dictionary, directObject, integer, line
  )
 import PDF.Output (ObjectId(..), Offset(..))
-import PDF.Parser (Parser, (<?>), block, char, evalParser, on, takeAll)
+import PDF.Parser (Parser, (<?>), block, char, on, option, runParser, takeAll)
 data UserState = UserState {
      input :: ByteString
@ -110,7 +109,7 @@ occurrence =
 populate :: ByteString -> InputStructure -> Content
 populate input structure =
  let bodyInput = BS.drop (startOffset structure) input in
-  case evalParser recurseOnOccurrences initialState bodyInput of
+  case runParser recurseOnOccurrences initialState bodyInput of
    Left _ -> Content {occurrences = [], objects = Map.empty, docStructure}
    Right finalState ->
      let Flow {occurrencesStack, tmpObjects} = flow finalState in
--- a/src/PDF/CMap.hs
+++ b/src/PDF/CMap.hs
@ -1,159 +0,0 @@
 {-# LANGUAGE NamedFieldPuns #-}
 {-# LANGUAGE OverloadedStrings #-}
 {-# LANGUAGE FlexibleInstances #-}
 {-# LANGUAGE TypeSynonymInstances #-}
 module PDF.CMap (
      CMap
    , CMappers
    , CRange(..)
    , cMap
    , emptyCMap
    , matches
  ) where
 import Control.Applicative ((<|>), many)
 import Control.Monad.State (modify)
 import Data.Attoparsec.ByteString.Char8 (count)
 import Data.ByteString (ByteString)
 import qualified Data.ByteString as BS (drop, length, null, take)
 import Data.ByteString.Char8 (unpack)
 import Data.ByteString.Char8.Util (
    B16Int(..), b16ToBytes, b16ToInt, toBytes, utf16BEToutf8
  )
 import Data.Map (Map, union)
 import qualified Data.Map as Map (
    adjust, empty, fromList, insertWith, lookup, toList
  )
 import Data.Text (Text)
 import qualified PDF.EOL as EOL (charset, parser)
 import PDF.Font (Font)
 import PDF.Object (
      DirectObject(..), Name, StringObject(..)
    , blank, directObject, integer, line, stringObject
  )
 import PDF.Parser (MonadParser, Parser, runParser, takeAll)
 -- | Character maps indexed by 'Name'.
 type CMappers = Map Name CMap
 -- | Association of character codes to the text they stand for.
 type Mapping = Map ByteString Text
 -- | A range of character codes sharing the same size, with the translations
 -- known for codes of that range.
 data CRange = CRange {
      -- | lowest code of the range (inclusive, see 'matches')
      fromSequence :: ByteString
      -- | highest code of the range (inclusive, see 'matches')
    , toSequence :: ByteString
      -- | translations registered so far for codes of the range
    , mapping :: Mapping
  } deriving Show
 -- | Size in bytes of the codes of a range.
 type RangeSize = Int
 -- | The code ranges of a character map, grouped by code size.
 type CMap = Map RangeSize [CRange]
 -- | Build the 'Font' that decodes byte strings with a given 'CMap'.
 --
 -- The input is consumed from left to right: for each code size of the 'CMap'
 -- (in the order of 'Map.toList') the prefix of that size is looked up in the
 -- ranges of that size, and the first translation found is emitted before
 -- carrying on with the rest of the input. Decoding stops with a 'Left' as
 -- soon as no prefix of the remaining input has a translation.
 toFont :: CMap -> Font
 toFont aCMap input
  | BS.null input = Right ""
  | otherwise =
    decodePrefix (Map.toList aCMap) >>= \(decoded, leftover) ->
      mappend decoded <$> toFont aCMap leftover
  where
    decodePrefix [] =
      Left $ "No matching code found in font for " ++ unpack input
    decodePrefix ((size, cRanges):otherSizes) =
      case translate (BS.take size input) cRanges of
        Just decoded -> Right (decoded, BS.drop size input)
        Nothing -> decodePrefix otherSizes
    -- first translation of the prefix found among the ranges, if any
    translate :: ByteString -> [CRange] -> Maybe Text
    translate prefix =
      foldr (\cRange others -> Map.lookup prefix (mapping cRange) <|> others) Nothing
 -- | A character map without any code range; the initial state used by 'cMap'.
 emptyCMap :: CMap
 emptyCMap = Map.empty
 -- | Whether a code lies within the bounds of a range, both bounds included
 -- (codes are compared with the ordering of 'ByteString').
 matches :: ByteString -> CRange -> Bool
 matches code cRange = lowerBound <= code && code <= upperBound
  where
    lowerBound = fromSequence cRange
    upperBound = toSequence cRange
 -- | Parse the source of a character map and turn it into a 'Font'.
 --
 -- The input is read as a sequence of codespace range sections, bfrange
 -- sections, bfchar sections and ignored lines. They accumulate into a 'CMap'
 -- starting from 'emptyCMap', and the final 'CMap' (second component of the
 -- parser's result) is converted by 'toFont'. The outer '<$>' maps over the
 -- function taking the input, the inner 'fmap' over the 'Either'.
 cMap :: ByteString -> Either String Font
 cMap = fmap (toFont . snd) <$> runParser
  (many (codeRanges <|> cMapRange <|> cMapChar <|> ignoredLine))
  emptyCMap
  where
    -- any line not recognized by the other parsers: the characters before the
    -- next end-of-line marker, then the marker itself
    ignoredLine =
      takeAll (not . (`elem` EOL.charset)) *> EOL.parser *> return ()
 -- | Parse a @begincodespacerange@ ... @endcodespacerange@ section: an
 -- integer announcing how many ranges follow, then that many pairs of string
 -- objects, each one registered in the state through 'createMapping'.
 codeRanges :: Parser CMap ()
 codeRanges = do
  size <- integer <* line "begincodespacerange"
  mapM_ createMapping =<< count size codeRange
  line "endcodespacerange"
  where
    -- both bounds of a range, separated by blanks and ended by an end of line
    codeRange =
      (,) <$> stringObject <* blank <*> stringObject <* EOL.parser
 -- | Register a new, still empty, code range in the 'CMap' held by the parser
 -- state, under the size in bytes of its lower bound. Ranges already known
 -- for that size are kept after the new one. Pairs that are not made of two
 -- hexadecimal strings are silently ignored.
 createMapping :: (StringObject, StringObject) -> Parser CMap ()
 createMapping (Hexadecimal from, Hexadecimal to) =
  let lowerBound = b16ToBytes from
      newRange = CRange
        { fromSequence = lowerBound
        , toSequence = b16ToBytes to
        , mapping = Map.empty
        }
  in modify $ Map.insertWith (++) (BS.length lowerBound) [newRange]
 createMapping _ = return ()
 -- | Parse a @beginbfrange@ ... @endbfrange@ section: an integer announcing
 -- how many lines follow, then that many range mappings, each one expanded by
 -- 'mapFromTo' and stored with 'saveMapping'.
 cMapRange :: Parser CMap ()
 cMapRange = do
  size <- integer <* line "beginbfrange"
  mapM_ saveMapping =<< count size rangeMapping
  line "endbfrange"
  where
    -- two string objects (bounds of the source range) and a direct object
    -- (the destination), turned into code/text pairs by 'mapFromTo'
    rangeMapping = (,,) 
      <$> (stringObject <* blank)
      <*> (stringObject <* blank)
      <*> directObject <* EOL.parser
      >>= mapFromTo
 -- | Merge a list of code/text associations into the 'CMap' held by the
 -- parser state. An empty list changes nothing.
 --
 -- The size and value of the first code alone decide where the whole list
 -- goes: it is merged into every range stored under that size whose bounds
 -- contain that first code. Translations already present win over new ones
 -- ('union' is left-biased).
 --
 -- NOTE(review): 'Map.adjust' does nothing when no range was declared for
 -- that size, so the associations are then silently dropped; and codes of the
 -- list that differ in size or range from the first one are not checked —
 -- confirm this is intended, in particular for 'cMapChar'.
 saveMapping :: [(ByteString, Text)] -> Parser CMap ()
 saveMapping [] = return ()
 saveMapping assoc@((code, _):_) = modify $ Map.adjust insertCRange mappingSize
  where
    newMapping = Map.fromList assoc
    mappingSize = BS.length code
    appendMapping cRange =
      cRange {mapping = mapping cRange `union` newMapping}
    -- only the ranges containing the first code receive the new associations
    insertCRange = fmap (\cRange ->
        if code `matches` cRange then appendMapping cRange else cRange
      )
 -- | Parse a @beginbfchar@ ... @endbfchar@ section: an integer announcing how
 -- many lines follow, then that many pairs of string objects converted by
 -- 'pairMapping'. All the pairs of the section are stored by a single call to
 -- 'saveMapping'.
 cMapChar :: Parser CMap ()
 cMapChar = do
  size <- integer <* line "beginbfchar"
  saveMapping =<< count size charMapping <* line "endbfchar"
  where
    -- a source code and its replacement, on one line
    charMapping =
      (,) <$> stringObject <* blank <*> stringObject <* EOL.parser
      >>= pairMapping
 -- | Every code from one hexadecimal bound to the other (both included), each
 -- written on as many bytes as half the number of digits of the lower bound.
 between :: B16Int -> B16Int -> [ByteString]
 between from@(B16Int s) to =
  toBytes size <$> enumFromTo (b16ToInt from) (b16ToInt to)
  where
    size = BS.length s `div` 2
 -- | The unbounded enumeration of codes starting at a hexadecimal value, each
 -- written on as many bytes as half the number of digits of that value. It is
 -- meant to be consumed lazily (see 'mapFromTo', which zips it).
 startFrom :: B16Int -> [ByteString]
 startFrom from@(B16Int s) = toBytes size <$> enumFrom (b16ToInt from)
  where
    size = BS.length s `div` 2
 -- | Expand one range mapping into the list of code/text pairs it denotes.
 --
 -- The source codes are all the codes 'between' the two hexadecimal bounds.
 -- The destination is either a hexadecimal string, in which case successive
 -- codes are paired with successive values starting from it (each decoded as
 -- UTF-16BE), or an array giving one hexadecimal string per code. Anything
 -- else makes the parser 'fail'.
 mapFromTo :: MonadParser m => (StringObject, StringObject, DirectObject) -> m [(ByteString, Text)]
 mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) =
  return $ zip (between from to) (utf16BEToutf8 <$> startFrom dstFrom)
 mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) =
  -- 'zip' stops at the shorter list: surplus codes or surplus destinations
  -- are dropped without error
  zip (between from to) <$> (mapM dstByteString dstPoints)
  where
    dstByteString (StringObject (Hexadecimal dst)) =
      return . utf16BEToutf8 $ b16ToBytes dst
    dstByteString _ = fail "Invalid for a replacement string"
 mapFromTo _ = fail "invalid range mapping found"
 -- | Convert a pair of hexadecimal strings into a source code (raw bytes) and
 -- its replacement text (the bytes decoded as UTF-16BE). Any other kind of
 -- pair makes the parser 'fail'.
 pairMapping :: MonadParser m => (StringObject, StringObject) -> m (ByteString, Text)
 pairMapping (Hexadecimal from, Hexadecimal to) = return (source, replacement)
  where
    source = b16ToBytes from
    replacement = utf16BEToutf8 (b16ToBytes to)
 pairMapping _ = fail "invalid pair mapping found"
--- a/src/PDF/EOL.hs
+++ b/src/PDF/EOL.hs
@ -6,14 +6,14 @@ module PDF.EOL (
  ) where
 import Control.Applicative ((<|>))
-import PDF.Parser (MonadParser, string)
+import PDF.Parser (Parser, string)
 data Style = CR | LF | CRLF deriving Show
 charset :: String
 charset = "\r\n"
-parser :: MonadParser m => m Style
+parser :: Parser s Style
 parser =
    (string "\r\n" >> return CRLF)
  <|> (string "\r" >> return CR)
--- a/src/PDF/Encoding.hs
+++ b/src/PDF/Encoding.hs
@ -1,10 +0,0 @@
 module PDF.Encoding (
    encoding
  ) where
 import PDF.Encoding.MacRoman (macRomanEncoding)
 import PDF.Font (Font)
 -- | Find the 'Font' implementing a predefined encoding from its name. Only
 -- @MacRomanEncoding@ is known; any other name is reported in a 'Left'.
 encoding :: String -> Either String Font
 encoding encodingName =
  case encodingName of
    "MacRomanEncoding" -> Right macRomanEncoding
    unknown -> Left ("Unknown encoding " ++ unknown)
--- a/src/PDF/Encoding/MacRoman.hs
+++ b/src/PDF/Encoding/MacRoman.hs
@ -1,141 +0,0 @@
 module PDF.Encoding.MacRoman (
    macRomanEncoding
  ) where
 import Data.ByteString.Char8 (unpack)
 import Data.Text (pack)
 import PDF.Font (Font)
 -- | The 'Font' decoding MacRoman-encoded bytes: every byte is translated
 -- with 'decode'. It never fails.
 macRomanEncoding :: Font
 macRomanEncoding bytes = Right (pack (map decode (unpack bytes)))
 -- | Translate one character of a MacRoman-encoded string into the Unicode
 -- character it stands for. Codes from 0x80 to 0xFF are looked up in the
 -- table below; every other character is returned unchanged.
 decode :: Char -> Char
 decode '\x80' = '\x00C4' -- LATIN CAPITAL LETTER A WITH DIAERESIS
 decode '\x81' = '\x00C5' -- LATIN CAPITAL LETTER A WITH RING ABOVE
 decode '\x82' = '\x00C7' -- LATIN CAPITAL LETTER C WITH CEDILLA
 decode '\x83' = '\x00C9' -- LATIN CAPITAL LETTER E WITH ACUTE
 decode '\x84' = '\x00D1' -- LATIN CAPITAL LETTER N WITH TILDE
 decode '\x85' = '\x00D6' -- LATIN CAPITAL LETTER O WITH DIAERESIS
 decode '\x86' = '\x00DC' -- LATIN CAPITAL LETTER U WITH DIAERESIS
 decode '\x87' = '\x00E1' -- LATIN SMALL LETTER A WITH ACUTE
 decode '\x88' = '\x00E0' -- LATIN SMALL LETTER A WITH GRAVE
 decode '\x89' = '\x00E2' -- LATIN SMALL LETTER A WITH CIRCUMFLEX
 decode '\x8A' = '\x00E4' -- LATIN SMALL LETTER A WITH DIAERESIS
 decode '\x8B' = '\x00E3' -- LATIN SMALL LETTER A WITH TILDE
 decode '\x8C' = '\x00E5' -- LATIN SMALL LETTER A WITH RING ABOVE
 decode '\x8D' = '\x00E7' -- LATIN SMALL LETTER C WITH CEDILLA
 decode '\x8E' = '\x00E9' -- LATIN SMALL LETTER E WITH ACUTE
 decode '\x8F' = '\x00E8' -- LATIN SMALL LETTER E WITH GRAVE
 decode '\x90' = '\x00EA' -- LATIN SMALL LETTER E WITH CIRCUMFLEX
 decode '\x91' = '\x00EB' -- LATIN SMALL LETTER E WITH DIAERESIS
 decode '\x92' = '\x00ED' -- LATIN SMALL LETTER I WITH ACUTE
 decode '\x93' = '\x00EC' -- LATIN SMALL LETTER I WITH GRAVE
 decode '\x94' = '\x00EE' -- LATIN SMALL LETTER I WITH CIRCUMFLEX
 decode '\x95' = '\x00EF' -- LATIN SMALL LETTER I WITH DIAERESIS
 decode '\x96' = '\x00F1' -- LATIN SMALL LETTER N WITH TILDE
 decode '\x97' = '\x00F3' -- LATIN SMALL LETTER O WITH ACUTE
 decode '\x98' = '\x00F2' -- LATIN SMALL LETTER O WITH GRAVE
 decode '\x99' = '\x00F4' -- LATIN SMALL LETTER O WITH CIRCUMFLEX
 decode '\x9A' = '\x00F6' -- LATIN SMALL LETTER O WITH DIAERESIS
 decode '\x9B' = '\x00F5' -- LATIN SMALL LETTER O WITH TILDE
 decode '\x9C' = '\x00FA' -- LATIN SMALL LETTER U WITH ACUTE
 decode '\x9D' = '\x00F9' -- LATIN SMALL LETTER U WITH GRAVE
 decode '\x9E' = '\x00FB' -- LATIN SMALL LETTER U WITH CIRCUMFLEX
 decode '\x9F' = '\x00FC' -- LATIN SMALL LETTER U WITH DIAERESIS
 decode '\xA0' = '\x2020' -- DAGGER
 decode '\xA1' = '\x00B0' -- DEGREE SIGN
 decode '\xA2' = '\x00A2' -- CENT SIGN
 decode '\xA3' = '\x00A3' -- POUND SIGN
 decode '\xA4' = '\x00A7' -- SECTION SIGN
 decode '\xA5' = '\x2022' -- BULLET
 decode '\xA6' = '\x00B6' -- PILCROW SIGN
 decode '\xA7' = '\x00DF' -- LATIN SMALL LETTER SHARP S
 decode '\xA8' = '\x00AE' -- REGISTERED SIGN
 decode '\xA9' = '\x00A9' -- COPYRIGHT SIGN
 decode '\xAA' = '\x2122' -- TRADE MARK SIGN
 decode '\xAB' = '\x00B4' -- ACUTE ACCENT
 decode '\xAC' = '\x00A8' -- DIAERESIS
 decode '\xAD' = '\x2260' -- NOT EQUAL TO
 decode '\xAE' = '\x00C6' -- LATIN CAPITAL LETTER AE
 decode '\xAF' = '\x00D8' -- LATIN CAPITAL LETTER O WITH STROKE
 decode '\xB0' = '\x221E' -- INFINITY
 decode '\xB1' = '\x00B1' -- PLUS-MINUS SIGN
 decode '\xB2' = '\x2264' -- LESS-THAN OR EQUAL TO
 decode '\xB3' = '\x2265' -- GREATER-THAN OR EQUAL TO
 decode '\xB4' = '\x00A5' -- YEN SIGN
 decode '\xB5' = '\x00B5' -- MICRO SIGN
 decode '\xB6' = '\x2202' -- PARTIAL DIFFERENTIAL
 decode '\xB7' = '\x2211' -- N-ARY SUMMATION
 decode '\xB8' = '\x220F' -- N-ARY PRODUCT
 decode '\xB9' = '\x03C0' -- GREEK SMALL LETTER PI
 decode '\xBA' = '\x222B' -- INTEGRAL
 decode '\xBB' = '\x00AA' -- FEMININE ORDINAL INDICATOR
 decode '\xBC' = '\x00BA' -- MASCULINE ORDINAL INDICATOR
 decode '\xBD' = '\x03A9' -- GREEK CAPITAL LETTER OMEGA
 decode '\xBE' = '\x00E6' -- LATIN SMALL LETTER AE
 decode '\xBF' = '\x00F8' -- LATIN SMALL LETTER O WITH STROKE
 decode '\xC0' = '\x00BF' -- INVERTED QUESTION MARK
 decode '\xC1' = '\x00A1' -- INVERTED EXCLAMATION MARK
 decode '\xC2' = '\x00AC' -- NOT SIGN
 decode '\xC3' = '\x221A' -- SQUARE ROOT
 decode '\xC4' = '\x0192' -- LATIN SMALL LETTER F WITH HOOK
 decode '\xC5' = '\x2248' -- ALMOST EQUAL TO
 decode '\xC6' = '\x2206' -- INCREMENT
 decode '\xC7' = '\x00AB' -- LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
 decode '\xC8' = '\x00BB' -- RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
 decode '\xC9' = '\x2026' -- HORIZONTAL ELLIPSIS
 decode '\xCA' = '\x00A0' -- NO-BREAK SPACE
 decode '\xCB' = '\x00C0' -- LATIN CAPITAL LETTER A WITH GRAVE
 decode '\xCC' = '\x00C3' -- LATIN CAPITAL LETTER A WITH TILDE
 decode '\xCD' = '\x00D5' -- LATIN CAPITAL LETTER O WITH TILDE
 decode '\xCE' = '\x0152' -- LATIN CAPITAL LIGATURE OE
 decode '\xCF' = '\x0153' -- LATIN SMALL LIGATURE OE
 decode '\xD0' = '\x2013' -- EN DASH
 decode '\xD1' = '\x2014' -- EM DASH
 decode '\xD2' = '\x201C' -- LEFT DOUBLE QUOTATION MARK
 decode '\xD3' = '\x201D' -- RIGHT DOUBLE QUOTATION MARK
 decode '\xD4' = '\x2018' -- LEFT SINGLE QUOTATION MARK
 decode '\xD5' = '\x2019' -- RIGHT SINGLE QUOTATION MARK
 decode '\xD6' = '\x00F7' -- DIVISION SIGN
 decode '\xD7' = '\x25CA' -- LOZENGE
 decode '\xD8' = '\x00FF' -- LATIN SMALL LETTER Y WITH DIAERESIS
 decode '\xD9' = '\x0178' -- LATIN CAPITAL LETTER Y WITH DIAERESIS
 decode '\xDA' = '\x2044' -- FRACTION SLASH
 decode '\xDB' = '\x20AC' -- EURO SIGN
 decode '\xDC' = '\x2039' -- SINGLE LEFT-POINTING ANGLE QUOTATION MARK
 decode '\xDD' = '\x203A' -- SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
 decode '\xDE' = '\xFB01' -- LATIN SMALL LIGATURE FI
 decode '\xDF' = '\xFB02' -- LATIN SMALL LIGATURE FL
 decode '\xE0' = '\x2021' -- DOUBLE DAGGER
 decode '\xE1' = '\x00B7' -- MIDDLE DOT
 decode '\xE2' = '\x201A' -- SINGLE LOW-9 QUOTATION MARK
 decode '\xE3' = '\x201E' -- DOUBLE LOW-9 QUOTATION MARK
 decode '\xE4' = '\x2030' -- PER MILLE SIGN
 decode '\xE5' = '\x00C2' -- LATIN CAPITAL LETTER A WITH CIRCUMFLEX
 decode '\xE6' = '\x00CA' -- LATIN CAPITAL LETTER E WITH CIRCUMFLEX
 decode '\xE7' = '\x00C1' -- LATIN CAPITAL LETTER A WITH ACUTE
 decode '\xE8' = '\x00CB' -- LATIN CAPITAL LETTER E WITH DIAERESIS
 decode '\xE9' = '\x00C8' -- LATIN CAPITAL LETTER E WITH GRAVE
 decode '\xEA' = '\x00CD' -- LATIN CAPITAL LETTER I WITH ACUTE
 decode '\xEB' = '\x00CE' -- LATIN CAPITAL LETTER I WITH CIRCUMFLEX
 decode '\xEC' = '\x00CF' -- LATIN CAPITAL LETTER I WITH DIAERESIS
 decode '\xED' = '\x00CC' -- LATIN CAPITAL LETTER I WITH GRAVE
 decode '\xEE' = '\x00D3' -- LATIN CAPITAL LETTER O WITH ACUTE
 decode '\xEF' = '\x00D4' -- LATIN CAPITAL LETTER O WITH CIRCUMFLEX
 decode '\xF0' = '\xF8FF' -- Apple logo
 decode '\xF1' = '\x00D2' -- LATIN CAPITAL LETTER O WITH GRAVE
 decode '\xF2' = '\x00DA' -- LATIN CAPITAL LETTER U WITH ACUTE
 decode '\xF3' = '\x00DB' -- LATIN CAPITAL LETTER U WITH CIRCUMFLEX
 decode '\xF4' = '\x00D9' -- LATIN CAPITAL LETTER U WITH GRAVE
 decode '\xF5' = '\x0131' -- LATIN SMALL LETTER DOTLESS I
 decode '\xF6' = '\x02C6' -- MODIFIER LETTER CIRCUMFLEX ACCENT
 decode '\xF7' = '\x02DC' -- SMALL TILDE
 decode '\xF8' = '\x00AF' -- MACRON
 decode '\xF9' = '\x02D8' -- BREVE
 decode '\xFA' = '\x02D9' -- DOT ABOVE
 decode '\xFB' = '\x02DA' -- RING ABOVE
 decode '\xFC' = '\x00B8' -- CEDILLA
 decode '\xFD' = '\x02DD' -- DOUBLE ACUTE ACCENT
 decode '\xFE' = '\x02DB' -- OGONEK
 decode '\xFF' = '\x02C7' -- CARON
 decode c = c             -- The rest is ASCII
--- a/src/PDF/Font.hs
+++ b/src/PDF/Font.hs
@ -1,16 +0,0 @@
 module PDF.Font (
      Font
    , FontSet
    , emptyFont
  ) where
 import Data.ByteString (ByteString)
 import Data.Map (Map)
 import Data.Text (Text)
 import PDF.Object (Name)
 -- | A font, seen as the function that translates bytes into text or fails
 -- with an error message.
 type Font = ByteString -> Either String Text
 -- | Fonts indexed by 'Name'.
 type FontSet = Map Name Font
 -- | The font standing for the absence of any font: decoding with it always
 -- fails, whatever the input.
 emptyFont :: Font
 emptyFont _ = Left "No font loaded"
--- a/src/PDF/Object.hs
+++ b/src/PDF/Object.hs
@ -3,7 +3,6 @@
 {-# LANGUAGE FlexibleInstances #-}
 module PDF.Object (
      Content(..)
    , Dictionary
    , DirectObject(..)
    , Flow(..)
    , IndexedObjects
@ -13,11 +12,9 @@ module PDF.Object (
    , Number(..)
    , Object(..)
    , Occurrence(..)
    , StringObject(..)
    , Structure(..)
    , XRefEntry(..)
    , XRefSection
    , array
    , blank
    , dictionary
    , directObject
@ -25,20 +22,14 @@ module PDF.Object (
    , integer
    , line
    , magicNumber
    , name
    , number
    , regular
    , stringObject
    , structure
    , toByteString
  ) where
-import Control.Applicative ((<|>), many)
+import Control.Applicative ((<|>))
-import Data.Attoparsec.ByteString.Char8 (choice, count, option, sepBy)
+import Data.ByteString.Char8 (ByteString)
-import Data.ByteString (ByteString)
+import qualified Data.ByteString.Char8 as BS (
-import qualified Data.ByteString as BS (concat)
+    concat, cons, pack, singleton, unpack
-import qualified Data.ByteString.Char8 as Char8 (cons, pack, singleton, unpack)
+  )
 import Data.ByteString.Char8.Util (B16Int(..), b16ToBytes, unescape)
 import Data.Map (Map, (!), mapWithKey)
 import qualified Data.Map as Map (
    delete, empty, fromList, lookup, minViewWithKey, toList, union
@ -50,11 +41,15 @@ import PDF.Output (
    , byteString, getObjectId, getOffset, getOffsets, join, newLine
    , saveOffset
  )
-import PDF.Parser (MonadParser(..), Parser, (<?>), octDigit, oneOf)
+import PDF.Parser (
      Parser, (<?>)
    , char, choice, count, decNumber, hexNumber, many, octDigit, oneOf, option
    , sepBy, string, takeAll, takeAll1
  )
 import Text.Printf (printf)
-line :: MonadParser m => String -> m ()
+line :: String -> Parser u ()
-line l = (string (Char8.pack l) *> blank *> return ()) <?> printf "line «%s»" l
+line l = (string (BS.pack l) *> blank *> return ()) <?> printf "line «%s»" l
 magicNumber :: ByteString
 magicNumber = "%PDF-"
@ -65,8 +60,8 @@ eofMarker = "%%EOF"
 whiteSpaceCharset :: String
 whiteSpaceCharset = "\0\t\12 "
-blank :: MonadParser m => m ()
+blank :: Parser u ()
-blank = takeAll (`elem` (EOL.charset ++ whiteSpaceCharset)) *> pure ()
+blank = takeAll (`elem` (EOL.charset ++ whiteSpaceCharset)) *> return ()
 delimiterCharset :: String
 delimiterCharset = "()<>[]{}/%"
@ -74,8 +69,8 @@ delimiterCharset = "()<>[]{}/%"
 regular :: Char -> Bool
 regular = not . (`elem` (EOL.charset ++ whiteSpaceCharset ++ delimiterCharset))
-integer :: (Read a, Num a, MonadParser m) => m a
+integer :: (Read a, Num a) => Parser u a
-integer = read . Char8.unpack <$> decNumber <* blank <?> "decimal integer"
+integer = read . BS.unpack <$> decNumber <* blank <?> "decimal integer"
 -------------------------------------
 --          OBJECTS
@ -86,7 +81,7 @@ type IndexedObjects = Map ObjectId Object
 --
 -- Boolean
 --
-boolean :: MonadParser m => m Bool
+boolean :: Parser u Bool
 boolean =
  (string "true" *> return True) <|> (string "false" *> return False) <?> "boolean"
@ -101,42 +96,38 @@ instance Output Number where
      (n, 0) -> printf "%d" (n :: Int)
      _ -> printf "%f" f
-number :: MonadParser m => m Number
+number :: Parser u Number
-number = Number . read . Char8.unpack <$>
+number = Number . read . BS.unpack <$>
-  (mappend <$> sign <*> (integerPart <|> Char8.cons '0' <$> floatPart))
+  (mappend <$> sign <*> (integerPart <|> BS.cons '0' <$> floatPart))
  <?> "number"
  where
    sign = string "-" <|> option "" (char '+' >> return "")
    integerPart = mappend <$> decNumber <*> option "" floatPart
-    floatPart = Char8.cons <$> char '.' <*> (option "0" $ decNumber)
+    floatPart = BS.cons <$> char '.' <*> (option "0" $ decNumber)
 --
 -- StringObject
 --
-data StringObject = Literal ByteString | Hexadecimal B16Int deriving Show
+data StringObject = Literal String | Hexadecimal String deriving Show
 instance Output StringObject where
-  output (Literal s) = Output.string (printf "(%s)" (Char8.unpack s))
+  output (Literal s) = Output.string (printf "(%s)" s)
-  output (Hexadecimal (B16Int n)) = Output.string (printf "<%s>" (Char8.unpack n))
+  output (Hexadecimal s) = Output.string (printf "<%s>" s)
-stringObject :: MonadParser m => m StringObject
+stringObject :: Parser u StringObject
 stringObject =
-    Literal <$> (char '(' *> (BS.concat <$> literalString) <* char ')')
+    Literal . BS.unpack <$> (char '(' *> (BS.concat <$> literalString) <* char ')')
-  <|> Hexadecimal <$> (char '<' *> hexNumber <* char '>')
+  <|> Hexadecimal . BS.unpack <$> (char '<' *> hexNumber <* char '>')
  <?> "string object (literal or hexadecimal)"
  where
    literalString = many literalStringBlock
    literalStringBlock = takeAll1 normalChar <|> matchingParenthesis <|> escapedChar
    normalChar = not . (`elem` ("\\()" :: String))
    matchingParenthesis =
-      mappend <$> (Char8.cons <$> char '(' <*> literalStringBlock) <*> string ")"
+      mappend <$> (BS.cons <$> char '(' <*> literalStringBlock) <*> string ")"
    escapedChar =
-      Char8.cons <$> char '\\' <*> (Char8.singleton <$> oneOf "nrtbf()\\\n" <|> octalCode)
+      BS.cons <$> char '\\' <*> (BS.singleton <$> oneOf "nrtbf()\\" <|> octalCode)
-    octalCode = choice $ (\n -> Char8.pack <$> count n octDigit) <$> [1..3]
+    octalCode = choice $ (\n -> BS.pack <$> count n octDigit) <$> [1..3]
 toByteString :: StringObject -> ByteString
 toByteString (Hexadecimal h) = b16ToBytes h
 toByteString (Literal s) = unescape s
 --
 -- Name
@ -146,13 +137,13 @@ newtype Name = Name String deriving (Eq, Ord, Show)
 instance Output Name where
  output (Name n) = Output.string ('/':n)
-name :: MonadParser m => m Name
+name :: Parser u Name
-name = Name . Char8.unpack <$> (char '/' *> takeAll regular) <?> "name"
+name = Name . BS.unpack <$> (char '/' *> takeAll regular) <?> "name"
 --
 -- Array
 --
-array :: MonadParser m => m [DirectObject]
+array :: Parser u [DirectObject]
 array =
  char '[' *> blank *> directObject `sepBy` blank <* blank <* char ']' <?> "array"
@ -169,7 +160,7 @@ instance Output Dictionary where
      outputKeyVal :: (Name, DirectObject) -> OBuilder
      outputKeyVal (key, val) = Output.concat [output key, " ", output val]
-dictionary :: MonadParser m => m Dictionary
+dictionary :: Parser u Dictionary
 dictionary =
  string "<<" *> blank *> keyValPairs <* string ">>" <?> "dictionary"
  where
@ -179,7 +170,7 @@ dictionary =
 --
 -- Null
 --
-nullObject :: MonadParser m => m ()
+nullObject :: Parser u ()
 nullObject = string "null" *> return () <?> "null object"
 --
@ -190,7 +181,7 @@ data IndirectObjCoordinates = IndirectObjCoordinates {
    , versionNumber :: Int
  } deriving Show
-reference :: MonadParser m => m IndirectObjCoordinates
+reference :: Parser u IndirectObjCoordinates
 reference = IndirectObjCoordinates
  <$> (fmap ObjectId integer) <*> integer <* char 'R' <?> "reference to an object"
@ -219,7 +210,7 @@ instance Output DirectObject where
  output (Reference (IndirectObjCoordinates {objectId, versionNumber})) =
    Output.string (printf "%d %d R" (getObjectId objectId) versionNumber)
-directObject :: MonadParser m => m DirectObject
+directObject :: Parser u DirectObject
 directObject =
    Boolean <$> boolean
  <|> Reference <$> reference {- defined before Number because Number is a prefix of it -}
--- a/src/PDF/Output.hs
+++ b/src/PDF/Output.hs
@ -116,7 +116,7 @@ char :: Char -> OBuilder
 char c = lift char8 c <* offset (+1)
 string :: String -> OBuilder
-string s = lift string8 s <* offset (+ length s)
+string s = lift string8 s <* offset (+ toEnum (length s))
 -- | Output a string immediately followed by 'newLine'.
 line :: String -> OBuilder
 line str = mappend (string str) newLine
--- a/src/PDF/Pages.hs
+++ b/src/PDF/Pages.hs
@ -1,173 +0,0 @@
 {-# LANGUAGE NamedFieldPuns #-}
 module PDF.Pages (
      Page(..)
    , get
    , getAll
  ) where
 import Codec.Compression.Zlib (decompress)
 import Control.Applicative ((<|>))
 import Control.Monad (foldM)
 import Control.Monad.RWS (RWST(..), ask, evalRWST, lift, modify)
 import qualified Control.Monad.RWS as RWS (get)
 import Data.ByteString (ByteString)
 import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict)
 import Data.Map (Map, (!))
 import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
 import Data.Text (Text)
 import PDF.CMap (cMap)
 import PDF.Encoding (encoding)
 import PDF.Font (Font, FontSet)
 import PDF.Object (
      Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
    , Object(..), Name(..), Structure(..)
  ,)
 import PDF.Output (ObjectId(..))
 import PDF.Text (pageContents)
 import Text.Printf (printf)
 -- | Fonts loaded so far, indexed by the id of the object they were loaded
 -- from (see 'cache').
 type CachedFonts = Map ObjectId Font
 -- | Monad used to walk the document: the 'Content' is the read-only
 -- environment, 'CachedFonts' is the state, nothing is written, and errors
 -- are 'String's carried by 'Either'.
 type T = RWST Content () CachedFonts (Either String)
 -- | A page as built by 'loadPage'.
 data Page = Page {
      contents :: [Text] -- ^ text chunks returned by 'extractText'
    , source :: ObjectId -- ^ id of the object the page was loaded from
  }
 infixl 1 \\=
 -- | Run a 'T' action, then feed its result to a pure 'Either' step lifted
 -- into 'T'.
 (\\=) :: T a -> (a -> Either String b) -> T b
 action \\= step = do
  value <- action
  lift (step value)
 infixl 1 //=
 -- | Lift a pure 'Either' result into 'T', then feed it to a 'T'
 -- continuation.
 (//=) :: Either String a -> (a -> T b) -> T b
 result //= continuation = do
  value <- lift result
  continuation value
 -- | Build the 'Left' error reporting that a value is not of the kind named
 -- by the first argument; the offending value is rendered with 'show'.
 expected :: Show a => String -> a -> Either String b
 expected name value = Left (printf "Not a %s: %s" name (show value))
 -- | Get the payload of a 'Stream' object. When the header's @Filter@ entry
 -- is the name @FlateDecode@ the content is decompressed, otherwise it is
 -- returned untouched. Any other kind of object is an error.
 stream :: Object -> Either String ByteString
 stream (Stream {header, streamContent}) = Right payload
  where
    payload
      | deflated = Lazy.toStrict (decompress (Lazy.fromStrict streamContent))
      | otherwise = streamContent
    deflated =
      case Map.lookup (Name "Filter") header of
        Just (NameObject (Name "FlateDecode")) -> True
        _ -> False
 stream obj = expected "stream" obj
 -- | Resolve a resource to a dictionary: an inline dictionary is returned
 -- as is, a reference is followed and must point to a dictionary, anything
 -- else is an error.
 getResource :: DirectObject -> T Dictionary
 getResource resource =
  case resource of
    Dictionary dictionary -> return dictionary
    Reference (IndirectObjCoordinates {objectId}) -> getObject objectId \\= dict
    other -> lift $ expected "resource (dictionary or reference)" other
 -- | From a page dictionary, resolve its @Resources@ entry, then resolve the
 -- @Font@ entry found inside it.
 getFontDictionary :: Dictionary -> T Dictionary
 getFontDictionary pageDict = do
  resources <- getResource =<< lift (key "Resources" pageDict)
  fontEntry <- lift (key "Font" resources)
  getResource fontEntry
 -- | Memoize a font loader in the 'CachedFonts' state: a font already stored
 -- for this id is returned directly, otherwise the loader runs and its
 -- result is stored before being returned.
 cache :: (ObjectId -> T Font) -> ObjectId -> T Font
 cache loader objectId = do
  known <- RWS.get
  case Map.lookup objectId known of
    Just font -> return font
    Nothing -> do
      font <- loader objectId
      modify (Map.insert objectId font)
      return font
 -- | Load the font described by the given object, which must be a
 -- dictionary. Alternatives are tried in order: the CMap behind its
 -- @ToUnicode@ entry, then the encoding named by its @Encoding@ entry, and
 -- finally an "Unknown font format" error.
 loadFont :: ObjectId -> T Font
 loadFont objectId = getObject objectId \\= dict >>= tryMappings
  where
    tryMappings dictionary =
        loadCMap dictionary
      <|> lift (key "Encoding" dictionary >>= loadEncoding)
      <|> lift (Left $ unknownFormat (show objectId) (show dictionary))
    unknownFormat = printf "Unknown font format for object #%s : %s"
    -- @ToUnicode@ must be a reference to a stream, which is parsed by 'cMap'
    loadCMap dictionary =
      key "ToUnicode" dictionary //= follow \\= stream \\= cMap
    -- only encodings given as a name are supported
    loadEncoding (NameObject (Name name)) = encoding name
    loadEncoding directObject =
      Left . printf "Encoding must be a name, not that : %s" $ show directObject
 -- | Load every font of a font dictionary into a 'FontSet' keyed by the
 -- entry's name. Only entries that are references are loaded (through
 -- 'cache'); other entries are ignored.
 loadFonts :: Dictionary -> T FontSet
 loadFonts fontDict = foldM addFont Map.empty (Map.toList fontDict)
  where
    addFont :: FontSet -> (Name, DirectObject) -> T FontSet
    addFont loaded (name, Reference (IndirectObjCoordinates {objectId})) = do
      font <- cache loadFont objectId
      return (Map.insert name font loaded)
    addFont loaded _ = return loaded
 -- | Fetch an object of the document by its id.
 --
 -- A missing id is reported as a 'Left' error in 'T' rather than through the
 -- exception the partial 'Data.Map.!' throws, so that a dangling reference
 -- in a document surfaces as a regular failure of 'get' / 'getAll'.
 getObject :: ObjectId -> T Object
 getObject objectId = do
  content <- ask
  case Map.lookup objectId (objects content) of
    Just object -> return object
    Nothing -> lift . Left $ printf "Object #%s not found" (show objectId)
 -- | Look a key up in a dictionary, producing an error message naming both
 -- the key and the dictionary when it is absent.
 key :: String -> Dictionary -> Either String DirectObject
 key keyName dictionary =
  case Map.lookup (Name keyName) dictionary of
    Just value -> Right value
    Nothing ->
      Left $ printf "Key %s not found in dictionary %s" keyName (show dictionary)
 -- | Get the id a reference points to; anything but a reference is an error.
 target :: DirectObject -> Either String ObjectId
 target directObject =
  case directObject of
    Reference (IndirectObjCoordinates {objectId}) -> Right objectId
    _ -> expected "reference" directObject
 -- | View a direct object as a list: the elements of an 'Array', or a
 -- singleton list for any other object.
 many :: DirectObject -> [DirectObject]
 many directObject =
  case directObject of
    Array elements -> elements
    _ -> [directObject]
 -- | Fetch the object a reference points to (see 'target' and 'getObject').
 follow :: DirectObject -> T Object
 follow directObject = lift (target directObject) >>= getObject
 -- | Get the dictionary held by a direct object; any other object is an
 -- error.
 dict :: Object -> Either String Dictionary
 dict obj =
  case obj of
    Direct (Dictionary dictionary) -> Right dictionary
    _ -> expected "dictionary" obj
 -- | Read a key of a dictionary, follow the reference found there and
 -- require the referenced object to be a dictionary.
 dictObject :: String -> Dictionary -> T Dictionary
 dictObject keyName dictionary = do
  reference <- lift (key keyName dictionary)
  object <- follow reference
  lift (dict object)
 -- | List the ids referenced by the @Kids@ array of the dictionary reached
 -- through the trailer's @Root@ then @Pages@ entries. The list is empty when
 -- @Kids@ is absent or is not an array.
 pagesList :: T [ObjectId]
 pagesList = do
  content <- ask
  root <- dictObject "Root" (trailer (docStructure content))
  pages <- dictObject "Pages" root
  return $ case Map.lookup (Name "Kids") pages of
    Just (Array kids) -> getReferences kids
    _ -> []
 -- | Keep the target ids of the references found in a list of direct
 -- objects, dropping every other object.
 getReferences :: [DirectObject] -> [ObjectId]
 getReferences objects =
  [objectId | Reference (IndirectObjCoordinates {objectId}) <- objects]
 -- | Extract the text of a page object: the object must be a dictionary,
 -- its fonts are loaded first, then each element of its @Contents@ entry is
 -- followed, decoded as a stream and parsed by 'pageContents'.
 extractText :: Object -> T [Text]
 extractText object = do
  pageDict <- lift $ dict object
  fonts <- loadFonts =<< getFontDictionary pageDict
  -- @Contents@ may be a single object or an array of them (see 'many')
  let objects = ((many <$> (key "Contents" pageDict)) :: Either String [DirectObject])
  concat <$> (objects //= (mapM $ loadContent fonts))
  where
    loadContent :: FontSet -> DirectObject -> T [Text]
    loadContent fonts directObject =
      follow directObject \\= stream \\= pageContents fonts
 -- | Build the 'Page' stored in the object of the given id.
 loadPage :: ObjectId -> T Page
 loadPage source = do
  contents <- extractText =<< getObject source
  return Page {contents, source}
 -- | Load every page listed by 'pagesList', numbered from 1 in listing
 -- order. Runs with an empty font cache.
 getAll :: Content -> Either String (Map Int Page)
 getAll content = fst <$> evalRWST numberedPages content Map.empty
  where
    numberedPages = do
      ids <- pagesList
      pages <- mapM loadPage ids
      return . Map.fromList $ zip [1..] pages
 -- | Load a single page by its 1-based number. Numbers below 1 and numbers
 -- past the end of 'pagesList' are reported as errors. Runs with an empty
 -- font cache.
 get :: Content -> Int -> Either String Page
 get content pageNumber
  | pageNumber < 1 = Left "Pages start at 1"
  | otherwise = fst <$> evalRWST getPage content Map.empty
  where
    getPage = do
      remaining <- drop (pageNumber - 1) <$> pagesList
      case remaining of
        [] -> lift $ Left "Page is out of bounds"
        (pageId:_) -> loadPage pageId
--- a/src/PDF/Parser.hs
+++ b/src/PDF/Parser.hs
@ -1,72 +1,56 @@
 {-# LANGUAGE FlexibleInstances #-}
 {-# LANGUAGE ConstraintKinds #-}
 {-# LANGUAGE UndecidableInstances #-}
 module PDF.Parser (
-      MonadParser(..)
+      Parser
    , Parser
    , (<?>)
    , block
    , char
    , choice
    , count
    , decNumber
    , hexNumber
    , many
    , octDigit
    , on
    , oneOf
    , option
    , runParser
-    , evalParser
+    , sepBy
    , string
    , takeAll
    , takeAll1
  ) where
-import Control.Applicative (Alternative, (<|>))
+import Control.Applicative ((<|>), empty)
 import Control.Monad (MonadPlus)
 import Control.Monad.Fail (MonadFail(..))
 import Control.Monad.State (StateT(..), evalStateT)
-import Control.Monad.Trans (MonadTrans(..))
+import Control.Monad.Trans (lift)
 import qualified Data.Attoparsec.ByteString.Char8 as Atto (
-      Parser, char, endOfInput, parseOnly, satisfy, string, take, takeWhile
+    Parser, char, parseOnly, satisfy, string, take, takeWhile, takeWhile1
    , takeWhile1
  )
 import Data.ByteString (ByteString)
 import Data.ByteString.Char8.Util (B16Int(..))
 import Data.Char (toLower)
 import Data.Set (Set)
 import qualified Data.Set as Set (fromList, member, unions)
 import Prelude hiding (fail)
 -- | What a monad must provide before it can be a 'MonadParser': failure
 -- with a message and choice between alternatives.
 type MonadDeps m = (MonadFail m, MonadPlus m)
 -- | Primitive parsers over a 'ByteString' input. See the instance for
 -- 'Atto.Parser' for the meaning given to each method.
 class MonadDeps m => MonadParser m where
  block :: Int -> m ByteString
  char :: Char -> m Char
  decNumber :: m ByteString
  endOfInput :: m ()
  hexNumber :: m B16Int
  oneOf :: String -> m Char
  string :: ByteString -> m ByteString
  takeAll :: (Char -> Bool) -> m ByteString
  takeAll1 :: (Char -> Bool) -> m ByteString
 -- | Base instance: every primitive maps onto its attoparsec counterpart.
 instance MonadParser Atto.Parser where
  block = Atto.take
  char = Atto.char
  endOfInput = Atto.endOfInput
  -- one or more characters taken from 'digits'
  decNumber = Atto.takeWhile1 (`Set.member` digits)
  -- one or more characters taken from 'hexDigits', wrapped in 'B16Int'
  hexNumber = B16Int <$> Atto.takeWhile1 (`Set.member` hexDigits)
  oneOf charSet = Atto.satisfy (`elem` charSet)
  -- the expected string itself is used as the failure message
  string s = Atto.string s <?> show s
  takeAll = Atto.takeWhile
  takeAll1 = Atto.takeWhile1
 -- | Any monad transformer stacked over a 'MonadParser' is a 'MonadParser'
 -- too: each primitive is simply lifted from the underlying monad.
 instance (MonadParser m, MonadTrans t, MonadDeps (t m)) => MonadParser (t m) where
  block = lift . block
  char = lift . char
  endOfInput = lift $ endOfInput
  decNumber = lift $ decNumber
  hexNumber = lift $ hexNumber
  oneOf = lift . oneOf
  string = lift . string
  takeAll = lift . takeAll
  takeAll1 = lift . takeAll1
 -- | An attoparsec parser threading a user state of type @s@.
 type Parser s = StateT s Atto.Parser
-(<?>) :: (Alternative m, MonadFail m) => m a -> String -> m a
+(<?>) :: Parser s a -> String -> Parser s a
 (<?>) parser debugMessage = parser <|> fail debugMessage
 -- | Consume a block of the given size, using attoparsec's @take@.
 block :: Int -> Parser s ByteString
 block size = lift (Atto.take size)
 -- | Match exactly the given character.
 char :: Char -> Parser s Char
 char c = lift (Atto.char c)
 -- | Try each parser in turn, keeping the first success; an empty list of
 -- parsers always fails.
 choice :: [Parser s a] -> Parser s a
 choice [] = empty
 choice (parser:others) = parser <|> choice others
 -- | Run a parser @n@ times in a row and collect the results in order.
 --
 -- Any @n <= 0@ yields the empty list without running the parser. Matching
 -- only on a literal @0@ would make a negative count recurse forever, since
 -- decrementing never reaches the base case.
 count :: Int -> Parser s a -> Parser s [a]
 count n p
  | n <= 0 = return []
  | otherwise = (:) <$> p <*> count (n - 1) p
 -- | Take one or more characters belonging to 'digits'.
 decNumber :: Parser s ByteString
 decNumber = lift (Atto.takeWhile1 isDecimal)
  where
    isDecimal c = Set.member c digits
 -- | The ten decimal digits.
 digits :: Set Char
 digits = Set.fromList "0123456789"
@ -75,7 +59,13 @@ hexDigits = Set.unions [digits, Set.fromList af, Set.fromList $ toLower <$> af]
  where
    af = ['A'..'F']
-octDigit :: MonadParser m => m Char
+hexNumber :: Parser s ByteString
 hexNumber = lift $ Atto.takeWhile1 (`Set.member` hexDigits)
 -- | Apply a parser as many times as it succeeds (possibly never) and
 -- collect the results.
 many :: Parser s a -> Parser s [a]
 many parser = oneOrMore <|> return []
  where
    oneOrMore = do
      first <- parser
      rest <- many parser
      return (first : rest)
 -- | Match a single character between @0@ and @7@.
 octDigit :: Parser s Char
 octDigit = oneOf "01234567"
 on :: Parser s a -> ByteString -> Parser s (Either String a)
@ -84,8 +74,25 @@ on (StateT parserF) input = StateT $ \state ->
    Left errorMsg -> return (Left errorMsg, state)
    Right (result, newState) -> return (Right result, newState)
-runParser :: Parser s a -> s -> ByteString -> Either String (a, s)
+oneOf :: String -> Parser s Char
-runParser parser initState = Atto.parseOnly (runStateT parser initState)
+oneOf charSet = lift $ Atto.satisfy (`elem` charSet)
-evalParser :: Parser s a -> s -> ByteString -> Either String a
+option :: a -> Parser s a -> Parser s a
-evalParser parser initState = Atto.parseOnly (evalStateT parser initState)
+option defaultValue p = p <|> pure defaultValue
 -- | Run a parser on an input from an initial state, keeping only the
 -- parsed value (the final state is dropped).
 runParser :: Parser s a -> s -> ByteString -> Either String a
 runParser parser initState input =
  Atto.parseOnly statelessParser input
  where
    statelessParser = evalStateT parser initState
 -- | Parse zero or more occurrences of a parser, separated by a separator
 -- whose results are dropped.
 sepBy :: Parser s a -> Parser s b -> Parser s [a]
 sepBy parser separator = atLeastOne <|> pure []
  where
    atLeastOne = (:) <$> parser <*> many (separator *> parser)
 -- | Match exactly the given bytes.
 string :: ByteString -> Parser s ByteString
 string expectedBytes = lift (Atto.string expectedBytes)
 -- | Take characters for as long as the predicate holds, using attoparsec's
 -- @takeWhile@.
 takeAll :: (Char -> Bool) -> Parser s ByteString
 takeAll predicate = lift (Atto.takeWhile predicate)
 -- | Take characters for as long as the predicate holds, using attoparsec's
 -- @takeWhile1@.
 takeAll1 :: (Char -> Bool) -> Parser s ByteString
 takeAll1 predicate = lift (Atto.takeWhile1 predicate)
--- a/src/PDF/Text.hs
+++ b/src/PDF/Text.hs
@ -1,319 +0,0 @@
 {-# LANGUAGE NamedFieldPuns #-}
 {-# LANGUAGE OverloadedStrings #-}
 {-# LANGUAGE ConstraintKinds #-}
 {-# LANGUAGE FlexibleContexts #-}
 module PDF.Text {-(
    pageContents
  )-} where
 import Control.Applicative ((<|>))
 import Control.Monad (foldM)
 import Control.Monad.Fail (MonadFail)
 import Control.Monad.Reader (ReaderT, runReaderT, asks)
 import Control.Monad.State (MonadState, evalStateT, get, modify, put)
 import Data.Attoparsec.ByteString.Char8 (choice, sepBy)
 import Data.ByteString (ByteString)
 import Data.ByteString.Char8 (pack, unpack)
 import Data.Char (toLower)
 import Data.Map ((!), (!?), Map)
 import qualified Data.Map as Map (fromList)
 import Data.Text (Text)
 import PDF.Font (Font, FontSet, emptyFont)
 import PDF.Object (
      DirectObject(..), StringObject(..)
    , array, blank, name, regular, stringObject, toByteString
  )
 import PDF.Parser (MonadParser(..), (<?>), Parser, evalParser)
 -- | Operators of the general graphics state. The keyword of each
 -- constructor in a content stream is derived from its name by 'code'.
 data StateOperator =
  C_m | W_ | J | J_ | M | D_ | R_i | I_ | G_s -- general graphic state
  deriving (Bounded, Enum, Show)
 -- | Operators building, painting and clipping paths. The keyword of each
 -- constructor in a content stream is derived from its name by 'code'.
 data PathOperator =
    M_ | L_ | C_ | V_ | Y_ | H_ | R_e -- path construction
  | S | S_ | F_ | F | Fstar | B | Bstar | B_ | B_star | N_ -- path painting
  | W | Wstar -- clipping path
  deriving (Bounded, Enum, Show)
 -- | Color operators. The keyword of each constructor in a content stream
 -- is derived from its name by 'code'.
 data ColorOperator =
    CS | C_s | SC | SCN | S_c | S_cn | G | G_ | RG | R_g | K | K_
  deriving (Bounded, Enum, Show)
 -- | Text operators, the only ones interpreted (by 'runOperator'). The
 -- keyword of each constructor in a content stream is derived from its name
 -- by 'code'.
 data TextOperator =
    Td | TD | Tm | Tstar -- text positioning
  | TJ | Tj | Quote | DQuote -- text showing
  | Tc | Tw | Tz | TL | Tf | Tr | Ts -- text state
  deriving (Bounded, Enum, Show)
 -- | An operand of an operator: either an uninterpreted chunk of input
 -- ('Raw') or an object that was fully parsed ('Typed').
 data Argument = Raw ByteString | Typed DirectObject deriving Show
 -- | An operator together with the operands it was called with.
 type Call a = (a, [Argument])
 -- | Constraints expected from a type enumerating operators.
 type Operator a = (Bounded a, Enum a, Show a)
 -- | Keyword of an operator in a content stream, derived from the name of
 -- its constructor (as given by 'show') with the rewriting rules below.
 -- Clauses are tried in order, at every position of the name.
 code :: Operator a => a -> ByteString
 code = pack . expand . show
  where
    expand "" = ""
    -- a character followed by an underscore stands for its lowercase form
    expand (c:'_':s) = toLower c : expand s
    -- "star" stands for an asterisk
    expand ('s':'t':'a':'r':s) = '*' : expand s
    -- "Quote" stands for a single quote, "DQuote" for a double quote
    expand ('Q':'u':'o':'t':'e':s) = '\'' : expand s
    expand ('D':'Q':'u':'o':'t':'e':s) = '"' : expand s
    -- any other character is kept as is
    expand (c:s) = c : expand s
 {-
 instance Show StateOperator where
  show Cm = "cm"
  show W_ = "w"
  show J = "J"
  show J_ = "j"
  show M = "M"
  show D = "d"
  show Ri = "ri"
  show I = "i"
  show Gs = "gs"
 instance Show PathOperator where
  show M_ = "m"
  show L_ = "l"
  show C_ = "c"
  show V_ = "v"
  show Y_ = "y"
  show H_
      ("m", (M_, \l -> case l of [Raw _, Raw _] -> True ; _ -> False))
    , ("l", (L_, \l -> case l of [Raw _, Raw _] -> True ; _ -> False))
    , ("c", (L_, \l -> case l of [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False))
    , ("v", (L_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False))
    , ("y", (L_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False))
    , ("h", (L_, \l -> case l of [] -> True ; _ -> False))
    , ("re", (L_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False))
    , ("S", (L_, \l -> case l of [] -> True ; _ -> False))
    , ("s", (L_, \l -> case l of [] -> True ; _ -> False))
    , ("f", (L_, \l -> case l of [] -> True ; _ -> False))
    , ("F", (L_, \l -> case l of [] -> True ; _ -> False))
    , ("F*", (L_, \l -> case l of [] -> True ; _ -> False))
    , ("B", (L_, \l -> case l of [] -> True ; _ -> False))
    , ("B*", (L_, \l -> case l of [] -> True ; _ -> False))
    , ("b", (L_, \l -> case l of [] -> True ; _ -> False))
    , ("b*", (L_, \l -> case l of [] -> True ; _ -> False))
    , ("n", (L_, \l -> case l of [] -> True ; _ -> False))
    , ("W", (L_, \l -> case l of [] -> True ; _ -> False))
    , ("W*", (L_, \l -> case l of [] -> True ; _ -> False))
 instance Show ColorOperator where
 instance Show TextOperator where
  show Td = "Td"
  show TD = "TD"
  show Tm = "Tm"
  show Tstar = "T*"
  show TJ = "TJ"
  show Tj = "Tj"
  show Quote = "'"
  show DQuote = "\""
  show Tc = "Tc"
  show Tw = "Tw"
  show Tz = "Tz"
  show TL = "TL"
  show Tf = "Tf"
  show Tr = "Tr"
  show Ts = "Ts"
 -}
 -- | Graphics-state operators indexed by their keyword (see 'code'), each
 -- paired with a predicate checking the shape of the operands it received.
 --
 -- Operands reach the predicate already lexed by 'callChunk': arrays, names
 -- and strings arrive as 'Typed' values, everything else as 'Raw' chunks.
 stateOperator :: OperatorTable StateOperator
 stateOperator = Map.fromList $ (\(op, checker) -> (code op, (op, checker))) <$> [
      (C_m, \l -> case l of [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
    , (W_, \l -> case l of [Raw _] -> True ; _ -> False)
    , (J, \l -> case l of [Raw _] -> True ; _ -> False)
    , (J_, \l -> case l of [Raw _] -> True ; _ -> False)
    , (M, \l -> case l of [Raw _] -> True ; _ -> False)
    -- "d" takes a dash array then a dash phase; the array is lexed as a
    -- 'Typed' 'Array', so requiring two 'Raw' chunks rejected every dash
    -- pattern. The former shape is still accepted.
    , (D_, \l -> case l of [Typed (Array _), Raw _] -> True ; [Raw _, Raw _] -> True ; _ -> False)
    -- "ri" takes the name of a rendering intent, lexed as a 'Typed'
    -- 'NameObject'. The former shape is still accepted.
    , (R_i, \l -> case l of [Typed (NameObject _)] -> True ; [Raw _] -> True ; _ -> False)
    , (I_, \l -> case l of [Raw _] -> True ; _ -> False)
    , (G_s, \l -> case l of [Typed (NameObject _)] -> True ; _ -> False)
  ]
 -- | Path operators indexed by their keyword (see 'code'), each paired with
 -- a predicate checking the shape of the operands it received. Every path
 -- operator expects a fixed number of 'Raw' operands.
 pathOperator :: OperatorTable PathOperator
 pathOperator = Map.fromList $ entry <$> [
      (M_, raws 2)
    , (L_, raws 2)
    , (C_, raws 6)
    , (V_, raws 4)
    , (Y_, raws 4)
    , (H_, raws 0)
    , (R_e, raws 4)
    , (S, raws 0)
    , (S_, raws 0)
    , (F_, raws 0)
    , (F, raws 0)
    , (Fstar, raws 0)
    , (B, raws 0)
    , (Bstar, raws 0)
    , (B_, raws 0)
    , (B_star, raws 0)
    , (N_, raws 0)
    , (W, raws 0)
    , (Wstar, raws 0)
  ]
  where
    entry (op, checker) = (code op, (op, checker))
    -- exactly n operands, all of them 'Raw'
    raws :: Int -> [Argument] -> Bool
    raws n args = length args == n && all isRaw args
    isRaw (Raw _) = True
    isRaw _ = False
 -- | Color operators indexed by their keyword (see 'code'), each paired
 -- with a predicate checking the shape of the operands it received.
 colorOperator :: OperatorTable ColorOperator
 colorOperator = Map.fromList $ entry <$> [
      (CS, oneName)
    , (C_s, oneName)
    , (SC, anything)
    , (SCN, anything)
    , (S_c, anything)
    , (S_cn, anything)
    , (G, raws 1)
    , (G_, raws 1)
    , (RG, raws 3)
    , (R_g, raws 3)
    , (K, raws 4)
    , (K_, raws 4)
  ]
  where
    entry (op, checker) = (code op, (op, checker))
    -- operands are not checked at all
    anything = const True
    -- a single operand which is a name
    oneName [Typed (NameObject _)] = True
    oneName _ = False
    -- exactly n operands, all of them 'Raw'
    raws :: Int -> [Argument] -> Bool
    raws n args = length args == n && all isRaw args
    isRaw (Raw _) = True
    isRaw _ = False
 {-
 stateOperator (Cm, [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _]) = True
 stateOperator (W, [Raw _]) = True
 stateOperator (J, [Raw _]) = True
 stateOperator (J_, [Raw _]) = True
 stateOperator (M, [Raw _]) = True
 stateOperator (D, [Raw _, Raw _]) = True
 stateOperator (Ri, [Raw _]) = True
 stateOperator (I, [Raw _]) = True
 stateOperator (Gs, [Typed (NameObject _)]) = True
 stateOperator _ = False
 -}
 -- | Text operators indexed by their keyword (see 'code'), each paired with
 -- a predicate checking the shape of the operands it received.
 textOperator :: OperatorTable TextOperator
 textOperator = Map.fromList $ entry <$> [
      (Td, raws 2)
    , (TD, raws 2)
    , (Tm, raws 6)
    , (Tstar, raws 0)
    , (TJ, oneArray)
    , (Tj, oneString)
    , (Quote, oneString)
    , (DQuote, oneString)
    , (Tc, raws 1)
    , (Tw, raws 1)
    , (Tz, raws 1)
    , (TL, raws 1)
    , (Tf, nameThenRaw)
    , (Tr, raws 1)
    , (Ts, raws 1)
  ]
  where
    entry (op, checker) = (code op, (op, checker))
    -- a single operand which is an array
    oneArray [Typed (Array _)] = True
    oneArray _ = False
    -- a single operand which is a string
    oneString [Typed (StringObject _)] = True
    oneString _ = False
    -- a name followed by one 'Raw' operand
    nameThenRaw [Typed (NameObject _), Raw _] = True
    nameThenRaw _ = False
    -- exactly n operands, all of them 'Raw'
    raws :: Int -> [Argument] -> Bool
    raws n args = length args == n && all isRaw args
    isRaw (Raw _) = True
    isRaw _ = False
  {-
 textOperator (Td, [Raw _, Raw _]) = True
 textOperator (TD, [Raw _, Raw _]) = True
 textOperator (Tm, [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _]) = True
 textOperator (Tstar, []) = True
 textOperator (TJ, [Typed (Array _)]) = True
 textOperator (Tj, [Typed (StringObject _)]) = True
 textOperator (Quote, [Typed (StringObject _)]) = True
 textOperator (DQuote, [Typed (StringObject _)]) = True
 textOperator (Tc, [Raw _]) = True
 textOperator (Tw, [Raw _]) = True
 textOperator (Tz, [Raw _]) = True
 textOperator (TL, [Raw _]) = True
 textOperator (Tf, [Typed (NameObject _), Raw _]) = True
 textOperator (Tr, [Raw _]) = True
 textOperator (Ts, [Raw _]) = True
 textOperator _ = False
 -}
 -- | A parser which also keeps a stack of operands as its state.
 type ArgumentStackParser m = (MonadState [Argument] m, MonadParser m)
 --type Operator a = (Bounded a, Enum a, Show a)
 -- | Operators of one family, indexed by their keyword.
 type OperatorTable a = Map ByteString (TypeChecker a)
 -- | An operator and the predicate validating the operands it is given.
 type TypeChecker a = (a, [Argument] -> Bool)
 -- | Match the textual form of a value (as given by 'show') and return that
 -- value.
 parseShowable :: (Show a, MonadParser m) => a -> m a
 parseShowable textOp = textOp <$ string (pack (show textOp))
 -- | Read the next element of an operator call: either an operand ('Right')
 -- or the operator ending the call ('Left'). Strings, names and arrays are
 -- tried first; otherwise a chunk of regular characters is read and looked
 -- up in the table, becoming a 'Raw' operand when it is not a keyword.
 callChunk :: MonadParser m => OperatorTable a -> m (Either (TypeChecker a) Argument)
 callChunk table =
    (Right <$> choice [stringArg, nameArg, arrayArg])
  <|> operatorOrRawArg
  <?> "call chunk"
  where
    operatorOrRawArg = do
      chunk <- takeAll1 regular <* blank
      return $ maybe (Right (Raw chunk)) Left (table !? chunk)
 -- | Parse a whole operator call: operands are pushed on the stack until an
 -- operator of the table is met, then the operands (in reading order) are
 -- checked with the operator's predicate. The parser fails with the
 -- operator's keyword when the check does not pass.
 stackParser :: (ArgumentStackParser m, Operator a) => OperatorTable a -> m (Call a)
 stackParser table = callChunk table >>= either popCall push
  where
    push arg = do
      modify (arg:)
      stackParser table
    popCall (operator, predicate) = do
      arguments <- reverse <$> get
      if predicate arguments
        then return (operator, arguments)
        else fail (unpack $ code operator)
 -- | Parse one call to an operator of the given table, starting from an
 -- empty operand stack.
 a :: (Operator a, MonadParser m) => OperatorTable a -> m (Call a)
 a table = stackParser table `evalStateT` []
 -- | A chunk of regular characters kept as a 'Raw' operand, followed by
 -- blanks.
 argument :: MonadParser m => m Argument
 argument = fmap Raw (takeAll1 regular) <* blank
 -- | An array operand, followed by blanks.
 arrayArg :: MonadParser m => m Argument
 arrayArg = fmap (Typed . Array) array <* blank
 -- | A name operand, followed by blanks.
 nameArg :: MonadParser m => m Argument
 nameArg = fmap (Typed . NameObject) name <* blank
 -- | A string operand, followed by blanks.
 stringArg :: MonadParser m => m Argument
 stringArg = fmap (Typed . StringObject) stringObject <* blank
 -- | Parser of page contents: the available fonts are the read-only
 -- environment and the parser state is a single 'Font', replaced by the
 -- @Tf@ operator (see 'runOperator').
 type ParserWithFont = ReaderT FontSet (Parser Font)
 -- | Extract the text chunks of a page's content stream, given the fonts
 -- available to that page. Parsing starts with 'emptyFont' as the font.
 pageContents :: FontSet -> ByteString -> Either String [Text]
 pageContents fontSet input = evalParser withFonts emptyFont input
  where
    withFonts = runReaderT page fontSet
 -- | Repeat a list-producing parser, separated by blanks, and concatenate
 -- all the lists obtained.
 several :: MonadParser m => m [a] -> m [a]
 several p = fmap concat (sepBy p blank)
 -- | A whole content stream: graphic-state sections and text objects
 -- separated by blanks (see 'several'), then optional blanks up to the end
 -- of the input.
 page :: ParserWithFont [Text]
 page = several (graphicState <|> text) <* blank <* endOfInput <?> "Text page contents"
 -- | A section enclosed between the @q@ and @Q@ keywords. It may contain
 -- graphics commands, nested sections and text objects; only the latter
 -- two contribute text.
 graphicState :: ParserWithFont [Text]
 graphicState =
  string "q" *> blank *> insideQ <* blank <* string "Q" <?> "Graphic state"
  where
    insideQ = several (command <|> graphicState <|> text)
    -- parse a call to an operator of the table and produce no text for it
    ignore x = a x *> return []
    command =
      ignore stateOperator <|> ignore pathOperator <|> ignore colorOperator
 -- | A text object enclosed between the @BT@ and @ET@ keywords: each call
 -- to a text operator found inside is interpreted by 'runOperator'.
 text :: ParserWithFont [Text]
 text =
  string "BT" *> blank *> commands <* blank <* string "ET" <?> "Text operators"
  where
    commands = several (a textOperator >>= runOperator)
 -- | Interpret a call to a text operator and return the text it produces.
 --
 -- * @Tf@ makes the named font the current one and produces nothing. A
 --   name absent from the 'FontSet' makes the parser fail, where the
 --   partial 'Data.Map.!' would throw an exception escaping the 'Either'
 --   returned by 'pageContents'.
 -- * @T*@ produces a newline.
 -- * @TJ@ decodes the strings of its array with the current font and
 --   joins them in one chunk; other elements of the array are skipped.
 -- * @Tj@ decodes its string; the quote operators do the same after a
 --   newline.
 -- * Any other call produces nothing.
 runOperator :: Call TextOperator -> ParserWithFont [Text]
 runOperator (Tf, [Typed (NameObject fontName), _]) =
  asks (!? fontName) >>= maybe unknownFont (\font -> put font >> return [])
  where
    unknownFont = fail $ "Unknown font " ++ show fontName
 runOperator (Tstar, []) = return ["\n"]
 runOperator (TJ, [Typed (Array arrayObject)]) =
  replicate 1 <$> foldM appendText "" arrayObject
  where
    appendText bs (StringObject outputString) =
      mappend bs <$> decodeString outputString
    appendText bs _ = return bs
 runOperator (Tj, [Typed (StringObject outputString)]) =
  replicate 1 <$> decodeString outputString
 runOperator (Quote, [Typed (StringObject outputString)]) =
  (\bs -> ["\n", bs]) <$> decodeString outputString
 runOperator (DQuote, [Typed (StringObject outputString)]) =
  (\bs -> ["\n", bs]) <$> decodeString outputString
 runOperator _ = return []
 -- | Decode a string object with the font currently held in the state; a
 -- decoding error becomes a failure of the monad.
 decodeString :: (MonadFail m, MonadState Font m) => StringObject -> m Text
 decodeString input = do
  font <- get
  case font (toByteString input) of
    Left message -> fail message
    Right decoded -> return decoded