Minor but strict improvement : remove the general implementation of <?> for Alternative

Merge branch 'extract-text' into navigation
Merge branch 'main' into extract-text
2020-02-12 17:48:26 +01:00 · 2020-02-12 17:35:56 +01:00 · 2020-02-12 17:34:56 +01:00 · 2020-02-12 17:34:27 +01:00 · 2020-02-11 22:41:46 +01:00 · 2020-02-11 17:59:15 +01:00
24 changed files with 1495 additions and 150 deletions
--- a/Hufflepdf.cabal
+++ b/Hufflepdf.cabal
@ -17,19 +17,36 @@ cabal-version:       >=1.10
 library
  exposed-modules:     PDF
                     , PDF.CMap
                     , PDF.Content
                     , PDF.EOL
                     , PDF.Object
                     , PDF.Object.Navigation
                     , PDF.Output
                     , PDF.Parser
                     , PDF.Pages
                     , PDF.Update
  other-modules:       Data.ByteString.Char8.Util
                     , PDF.Content.Operator
                     , PDF.Content.Operator.Color
                     , PDF.Content.Operator.Common
                     , PDF.Content.Operator.GraphicState
                     , PDF.Content.Operator.Path
                     , PDF.Content.Operator.Text
                     , PDF.Content.Text
                     , PDF.Encoding
                     , PDF.Encoding.MacRoman
                     , PDF.Body
-                     , PDF.Parser
+                     , PDF.Font
  -- other-extensions:    
  build-depends:       attoparsec
                     , base >=4.9 && <4.13
                     , bytestring
                     , containers
                     , mtl
                     , text
                     , utf8-string
                     , zlib
  hs-source-dirs:      src
  ghc-options:         -Wall
  default-language:    Haskell2010
@ -51,3 +68,15 @@ executable             getObj
                     , zlib
  ghc-options:         -Wall
  default-language:    Haskell2010
 executable             getText
  main-is:             examples/getText.hs
  build-depends:       base
                     , bytestring
                     , containers
                     , Hufflepdf
                     , mtl
                     , text
                     , zlib
  ghc-options:         -Wall
  default-language:    Haskell2010
--- a/examples/getText.hs
+++ b/examples/getText.hs
@ -0,0 +1,38 @@
 import qualified Data.ByteString.Char8 as BS (readFile)
 import qualified Data.Map as Map (toList)
 import qualified Data.Text.IO as Text (putStrLn)
 import PDF (Document(..), parseDocument)
 import PDF.Object (Content)
 import PDF.Pages (Page(..), get, getAll)
 import PDF.Update (unify)
 import System.Environment (getArgs)
 import System.Exit (die)
 import System.IO (BufferMode(..), hSetBuffering, stdout)
 onDoc :: FilePath -> (Content -> Either String a) -> IO a
 onDoc inputFile f = do
  content <- fmap (unify . updates) . parseDocument <$> BS.readFile inputFile
  case content >>= f of
    Left someError -> die someError
    Right value -> return value
 displayPage :: Page -> IO ()
 displayPage = mapM_ Text.putStrLn . contents
 wholeDoc :: FilePath -> IO ()
 wholeDoc inputFile = do
  pages <- onDoc inputFile getAll
  mapM_ (displayPage . snd) $ Map.toList pages
 singlePage :: FilePath -> Int -> IO ()
 singlePage inputFile pageNumber =
  onDoc inputFile (`get` pageNumber) >>= displayPage
 main :: IO ()
 main = do
  hSetBuffering stdout LineBuffering
  args <- getArgs
  case args of
    [inputFile] -> wholeDoc inputFile
    [inputFile, pageNumber] -> singlePage inputFile (read pageNumber)
    _ -> die "Syntax: getText INPUT_FILE [PAGE_NUMBER]"
--- a/src/Data/ByteString/Char8/Util.hs
+++ b/src/Data/ByteString/Char8/Util.hs
@ -1,16 +1,91 @@
 module Data.ByteString.Char8.Util (
-      previous
+      B16Int(..)
    , B256Int(..)
    , b8ToInt
    , b16ToBytes
    , b16ToInt
    , b256ToInt
    , intToB256
    , previous
    , subBS
    , toBytes
    , unescape
    , utf16BEToutf8
  ) where
-import Data.ByteString.Char8 (ByteString)
+import Data.ByteString (ByteString, snoc)
-import qualified Data.ByteString.Char8 as BS (drop, index, take)
+import qualified Data.ByteString as BS (empty, foldl, length, pack, singleton, splitAt)
 import qualified Data.ByteString.Char8 as Char8 (
    cons, drop, index, splitAt, take, uncons, unpack
  )
 import Data.Text (Text)
 import Data.Text.Encoding (decodeUtf16BE)
 import Prelude hiding (length)
 import Text.Printf (printf)
 newtype B8Int = B8Int ByteString deriving Show
 newtype B16Int = B16Int ByteString deriving Show
 newtype B256Int = B256Int ByteString deriving Show
 previous :: Char -> Int -> ByteString -> Int
 previous char position byteString
-  | BS.index byteString position == char = position
+  | Char8.index byteString position == char = position
  | otherwise = previous char (position - 1) byteString
 subBS :: Int -> Int -> ByteString -> ByteString
-subBS offset length = BS.take length . BS.drop offset
+subBS offset length = Char8.take length . Char8.drop offset
 intToB256 :: Int -> B256Int
 intToB256 n
  | n < 0x100 = B256Int . BS.singleton $ toEnum n
  | otherwise =
    let B256Int begining = intToB256 (n `div` 0x100) in
    B256Int $ begining `snoc` (toEnum (n `mod` 0x100))
 b256ToInt :: B256Int -> Int
 b256ToInt (B256Int n) = BS.foldl (\k w -> 0x100*k + fromEnum w) 0 n
 toBytes :: Int -> Int -> ByteString
 toBytes 0 _ = BS.empty
 toBytes size n =
  (toBytes (size - 1) (n `div` 0x100)) `snoc` (toEnum (n `mod` 0x100))
 b16ToBytes :: B16Int -> ByteString
 b16ToBytes (B16Int n) = BS.pack . fmap b16ToInt $ pairDigits n
  where
    pairDigits s =
      case BS.length s of
        0 -> []
        1 -> [B16Int s]
        _ ->
          let (twoHexDigits, rest) = BS.splitAt 2 s in
          (B16Int $ twoHexDigits):(pairDigits rest)
 fromBase :: (Num a, Read a) => Char -> ByteString -> a
 fromBase b = read . printf "0%c%s" b . Char8.unpack
 b16ToInt :: (Num a, Read a) => B16Int -> a
 b16ToInt (B16Int n) = fromBase 'x' n
 b8ToInt :: (Num a, Read a) => B8Int -> a
 b8ToInt (B8Int n) = fromBase 'o' n
 unescape :: ByteString -> ByteString
 unescape escapedBS =
  case Char8.uncons escapedBS of
    Nothing -> BS.empty
    Just ('\\', s) -> unescapeChar s
    Just (c, s) -> Char8.cons c (unescape s)
  where
    unescapeChar s =
      case Char8.uncons s of
        Nothing -> BS.empty
        Just (c, s')
          | c `elem` "()" -> Char8.cons c (unescape s')
          | c `elem` "nrtbf" -> Char8.cons (read (printf "'\\%c'" c)) (unescape s')
          | c `elem` ['0'..'7'] -> fromOctal (Char8.splitAt 3 s)
          | otherwise -> Char8.cons c (unescape s')
    fromOctal (code, s) = Char8.cons (toEnum $ b8ToInt (B8Int code)) (unescape s)
 utf16BEToutf8 :: ByteString -> Text
 utf16BEToutf8 = decodeUtf16BE
--- a/src/PDF.hs
+++ b/src/PDF.hs
@ -21,7 +21,7 @@ import PDF.Object (
  )
 import qualified PDF.Output as Output (render, line)
 import PDF.Output (Output(..))
-import PDF.Parser (Parser, runParser, string, takeAll)
+import PDF.Parser (Parser, evalParser, string, takeAll)
 import Text.Printf (printf)
 data Document = Document {
@ -83,7 +83,7 @@ findNextSection offset input =
 readStructures :: Int -> ByteString -> Either String [InputStructure]
 readStructures startXref input =
-  runParser structure () (BS.drop startXref input) >>= stopOrFollow
+  evalParser structure () (BS.drop startXref input) >>= stopOrFollow
  where
    stopOrFollow s@(Structure {trailer}) =
      case Map.lookup (Name "Prev") trailer of
@ -96,7 +96,7 @@ readStructures startXref input =
 parseDocument :: ByteString -> Either String Document
 parseDocument input = do
-  (pdfVersion, eolStyle) <- runParser ((,) <$> version <*> EOL.parser) () input
+  (pdfVersion, eolStyle) <- evalParser ((,) <$> version <*> EOL.parser) () input
  startXref <- readStartXref eolStyle input
  structuresRead <- readStructures startXref input
  let updates = populate input <$> structuresRead
--- a/src/PDF/Body.hs
+++ b/src/PDF/Body.hs
@ -6,6 +6,7 @@ module PDF.Body (
 import Control.Applicative ((<|>))
 import Control.Monad.State (get, gets, modify)
 import Data.Attoparsec.ByteString.Char8 (option)
 import Data.ByteString.Char8 (ByteString)
 import qualified Data.ByteString.Char8 as BS (cons, drop, unpack)
 import Data.Map ((!))
@ -18,7 +19,7 @@ import PDF.Object (
    , blank, dictionary, directObject, integer, line
  )
 import PDF.Output (ObjectId(..), Offset(..))
-import PDF.Parser (Parser, (<?>), block, char, on, option, runParser, takeAll)
+import PDF.Parser (Parser, block, char, evalParser, on, takeAll)
 data UserState = UserState {
      input :: ByteString
@ -104,12 +105,12 @@ indirectObjCoordinates = do
 occurrence :: SParser Occurrence
 occurrence =
-  Comment <$> comment <|> Indirect <$> indirectObjCoordinates <?> "comment or object"
+  Comment <$> comment <|> Indirect <$> indirectObjCoordinates
 populate :: ByteString -> InputStructure -> Content
 populate input structure =
  let bodyInput = BS.drop (startOffset structure) input in
-  case runParser recurseOnOccurrences initialState bodyInput of
+  case evalParser recurseOnOccurrences initialState bodyInput of
    Left _ -> Content {occurrences = [], objects = Map.empty, docStructure}
    Right finalState ->
      let Flow {occurrencesStack, tmpObjects} = flow finalState in
--- a/src/PDF/CMap.hs
+++ b/src/PDF/CMap.hs
@ -0,0 +1,159 @@
 {-# LANGUAGE NamedFieldPuns #-}
 {-# LANGUAGE OverloadedStrings #-}
 module PDF.CMap (
      CMap
    , CMappers
    , CRange(..)
    , cMap
    , emptyCMap
    , matches
  ) where
 import Control.Applicative ((<|>), many)
 import Control.Monad.Fail (MonadFail(..))
 import Control.Monad.State (modify)
 import Data.Attoparsec.ByteString.Char8 (count)
 import Data.ByteString (ByteString)
 import qualified Data.ByteString as BS (drop, length, null, take)
 import Data.ByteString.Char8 (unpack)
 import Data.ByteString.Char8.Util (
    B16Int(..), b16ToBytes, b16ToInt, toBytes, utf16BEToutf8
  )
 import Data.Map (Map, union)
 import qualified Data.Map as Map (
    adjust, empty, fromList, insertWith, lookup, toList
  )
 import Data.Text (Text)
 import qualified PDF.EOL as EOL (charset, parser)
 import PDF.Font (Font)
 import PDF.Object (
      DirectObject(..), Name, StringObject(..)
    , blank, directObject, integer, line, stringObject
  )
 import PDF.Parser (MonadParser, Parser, runParser, takeAll)
 import Prelude hiding (fail)
 type CMappers = Map Name CMap
 type Mapping = Map ByteString Text
 data CRange = CRange {
      fromSequence :: ByteString
    , toSequence :: ByteString
    , mapping :: Mapping
  } deriving Show
 type RangeSize = Int
 type CMap = Map RangeSize [CRange]
 toFont :: CMap -> Font
 toFont aCMap input
  | BS.null input = Right ""
  | otherwise = do
    (output, remainingInput) <- trySizes input $ Map.toList aCMap
    mappend output <$> toFont aCMap remainingInput
  where
    trySizes s [] = Left $ "No matching code found in font for " ++ unpack s
    trySizes s ((size, cRanges):others) =
      let prefix = BS.take size s in
      case tryRanges prefix cRanges of
        Nothing -> trySizes s others
        Just outputSequence -> Right (outputSequence, BS.drop size s)
    tryRanges :: ByteString -> [CRange] -> Maybe Text
    tryRanges _ [] = Nothing
    tryRanges prefix ((CRange {mapping}):cRanges) =
      case Map.lookup prefix mapping of
        Nothing -> tryRanges prefix cRanges
        outputSequence -> outputSequence
 emptyCMap :: CMap
 emptyCMap = Map.empty
 matches :: ByteString -> CRange -> Bool
 matches code (CRange {fromSequence, toSequence}) =
  fromSequence <= code && code <= toSequence
 cMap :: MonadFail m => ByteString -> m Font
 cMap = either fail (return . toFont . snd) . runParser
  (many (codeRanges <|> cMapRange <|> cMapChar <|> ignoredLine))
  emptyCMap
  where
    ignoredLine =
      takeAll (not . (`elem` EOL.charset)) *> EOL.parser *> return ()
 codeRanges :: Parser CMap ()
 codeRanges = do
  size <- integer <* line "begincodespacerange"
  mapM_ createMapping =<< count size codeRange
  line "endcodespacerange"
  where
    codeRange =
      (,) <$> stringObject <* blank <*> stringObject <* EOL.parser
 createMapping :: (StringObject, StringObject) -> Parser CMap ()
 createMapping (Hexadecimal from, Hexadecimal to) = modify $
  Map.insertWith (++) size [CRange {fromSequence, toSequence, mapping}]
  where
    fromSequence = b16ToBytes from
    size = BS.length fromSequence
    toSequence = b16ToBytes to
    mapping = Map.empty
 createMapping _ = return ()
 cMapRange :: Parser CMap ()
 cMapRange = do
  size <- integer <* line "beginbfrange"
  mapM_ saveMapping =<< count size rangeMapping
  line "endbfrange"
  where
    rangeMapping = (,,) 
      <$> (stringObject <* blank)
      <*> (stringObject <* blank)
      <*> directObject <* EOL.parser
      >>= mapFromTo
 saveMapping :: [(ByteString, Text)] -> Parser CMap ()
 saveMapping [] = return ()
 saveMapping assoc@((code, _):_) = modify $ Map.adjust insertCRange mappingSize
  where
    newMapping = Map.fromList assoc
    mappingSize = BS.length code
    appendMapping cRange =
      cRange {mapping = mapping cRange `union` newMapping}
    insertCRange = fmap (\cRange ->
        if code `matches` cRange then appendMapping cRange else cRange
      )
 cMapChar :: Parser CMap ()
 cMapChar = do
  size <- integer <* line "beginbfchar"
  saveMapping =<< count size charMapping <* line "endbfchar"
  where
    charMapping =
      (,) <$> stringObject <* blank <*> stringObject <* EOL.parser
      >>= pairMapping
 between :: B16Int -> B16Int -> [ByteString]
 between from@(B16Int s) to =
  let size = BS.length s `div` 2 in
  toBytes size <$> [b16ToInt from .. b16ToInt to]
 startFrom :: B16Int -> [ByteString]
 startFrom from@(B16Int s) =
  let size = BS.length s `div` 2 in
  toBytes size <$> [b16ToInt from .. ]
 mapFromTo :: MonadParser m => (StringObject, StringObject, DirectObject) -> m [(ByteString, Text)]
 mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) =
  return $ zip (between from to) (utf16BEToutf8 <$> startFrom dstFrom)
 mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) =
  zip (between from to) <$> (mapM dstByteString dstPoints)
  where
    dstByteString (StringObject (Hexadecimal dst)) =
      return . utf16BEToutf8 $ b16ToBytes dst
    dstByteString _ = fail "Invalid for a replacement string"
 mapFromTo _ = fail "invalid range mapping found"
 pairMapping :: MonadParser m => (StringObject, StringObject) -> m (ByteString, Text)
 pairMapping (Hexadecimal from, Hexadecimal to) =
  return (b16ToBytes  from, utf16BEToutf8 $ b16ToBytes  to)
 pairMapping _ = fail "invalid pair mapping found"
--- a/src/PDF/Content.hs
+++ b/src/PDF/Content.hs
@ -0,0 +1,69 @@
 {-# LANGUAGE OverloadedStrings #-}
 module PDF.Content (
      Content(..)
    , ContentUnit(..)
    , GraphicContextUnit(..)
    , TextContext
    , content
    , parse
  ) where
 import Control.Applicative ((<|>))
 import Control.Monad.State (evalStateT, modify)
 import Data.Attoparsec.ByteString.Char8 (sepBy)
 import Data.ByteString (ByteString)
 import PDF.Content.Operator (Instruction, operator)
 import PDF.Object (blank, directObject)
 import PDF.Output (Output(..), line)
 import PDF.Parser (MonadParser, evalParser, string)
 data GraphicContextUnit =
    GraphicInstruction Instruction
  | ContentUnit ContentUnit
  deriving Show
 type TextContext = [Instruction]
 data ContentUnit =
    GraphicContext [GraphicContextUnit]
  | TextContext TextContext
  deriving Show
 newtype Content = Content [ContentUnit] deriving Show
 content :: MonadParser m => m Content
 content = Content <$> contentUnit `sepBy` blank
 contentUnit :: MonadParser m => m ContentUnit
 contentUnit =
    (GraphicContext <$> graphicContext)
  <|> (TextContext <$> textContext)
  where
    graphicContext =
      string "q" *> blank *> graphicContextUnit `sepBy` blank <* blank <* string "Q"
 graphicContextUnit :: MonadParser m => m GraphicContextUnit
 graphicContextUnit =
    (GraphicInstruction <$> instruction)
  <|> (ContentUnit <$> contentUnit)
 instruction :: MonadParser m => m Instruction
 instruction = evalStateT stackParser []
  where
    stackParser = ((directObject <* blank) >>= push) <|> operator
    push arg = modify (arg:) *> stackParser
 parse :: ByteString -> Either String Content
 parse = evalParser content ()
 textContext :: MonadParser m => m TextContext
 textContext =
  string "BT" *> blank *> instruction `sepBy` blank <* blank <* string "ET"
 instance Output Content where
  output (Content contentUnits) = output contentUnits
 instance Output ContentUnit where
  output (GraphicContext gc) = line "q" `mappend` output gc `mappend` line "Q"
  output (TextContext tc) = line "BT" `mappend` output tc `mappend` line "ET"
 instance Output GraphicContextUnit where
  output (GraphicInstruction gi) = output gi
  output (ContentUnit cu) = output cu
--- a/src/PDF/Content/Operator.hs
+++ b/src/PDF/Content/Operator.hs
@ -0,0 +1,76 @@
 {-# LANGUAGE ConstraintKinds #-}
 {-# LANGUAGE FlexibleContexts #-}
 {-# LANGUAGE TypeSynonymInstances #-}
 {-# LANGUAGE FlexibleInstances #-}
 module PDF.Content.Operator (
      Instruction
    , Operator(..)
    , operator
  ) where
 import Control.Monad.Fail (MonadFail(..))
 import Control.Monad.State (MonadState(..))
 import Data.ByteString.Char8 (ByteString, pack, unpack)
 import Data.Char (toLower)
 import Data.Map (Map, (!?))
 import qualified Data.Map as Map (fromList)
 import qualified PDF.Content.Operator.Color as Color (Operator, signature)
 import PDF.Content.Operator.Common (Signature)
 import qualified PDF.Content.Operator.GraphicState as GraphicState (Operator, signature)
 import qualified PDF.Content.Operator.Path as Path (Operator, signature)
 import qualified PDF.Content.Operator.Text as Text (Operator, signature)
 import PDF.Object (DirectObject, blank, regular)
 import PDF.Output (Output(..), join, line)
 import PDF.Parser (MonadParser, takeAll1)
 import Prelude hiding (fail)
 import Text.Printf (printf)
 data Operator =
    GraphicState GraphicState.Operator
  | Path Path.Operator
  | Color Color.Operator
  | Text Text.Operator 
 instance Show Operator where
  show (GraphicState gcOp) = code gcOp
  show (Path pOp) = code pOp
  show (Color cOp) = code cOp
  show (Text tOp) = code tOp
 type Instruction = (Operator, [DirectObject])
 instance Output Instruction where
  output (op, args) = join " " ((output <$> args) ++ [line (show op)])
 operatorsTable :: Map ByteString (Signature Operator)
 operatorsTable = Map.fromList (
      (prepare GraphicState <$> GraphicState.signature)
    ++ (prepare Path <$> Path.signature)
    ++ (prepare Color <$> Color.signature)
    ++ (prepare Text <$> Text.signature)
  )
  where
    prepare constructor (op, sig) = (pack $ code op, (constructor op, sig))
 code :: Show a => a -> String
 code = expand . show
  where
    expand "" = ""
    expand (c:'_':s) = toLower c : expand s
    expand ('s':'t':'a':'r':s) = '*' : expand s
    expand ('Q':'u':'o':'t':'e':s) = '\'' : expand s
    expand ('D':'Q':'u':'o':'t':'e':s) = '"' : expand s
    expand (c:s) = c : expand s
 type StackParser m = (MonadState [DirectObject] m, MonadParser m)
 operator :: StackParser m => m Instruction
 operator = do
  chunk <- takeAll1 regular <* blank
  args <- reverse <$> get
  case operatorsTable !? chunk of
    Just (op, sig)
      | sig args -> return (op, args)
      | otherwise -> 
        get >>= fail . printf "Operator %s with stack %s" (show op) . show
    _ -> fail ("Unknown chunk " ++ unpack chunk)
--- a/src/PDF/Content/Operator/Color.hs
+++ b/src/PDF/Content/Operator/Color.hs
@ -0,0 +1,27 @@
 module PDF.Content.Operator.Color (
      Operator(..)
    , signature
  ) where
 import PDF.Content.Operator.Common (Signature)
 import PDF.Object (DirectObject(..))
 data Operator =
    CS | C_s | SC | SCN | S_c | S_cn | G | G_ | RG | R_g | K | K_
  deriving (Bounded, Enum, Show)
 signature :: [Signature Operator]
 signature = [
      (CS, \l -> case l of [NameObject _] -> True ; _ -> False)
    , (C_s, \l -> case l of [NameObject _] -> True ; _ -> False)
    , (SC, \_ -> True)
    , (SCN, \_ -> True)
    , (S_c, \_ -> True)
    , (S_cn, \_ -> True)
    , (G, \l -> case l of [_] -> True ; _ -> False)
    , (G_, \l -> case l of [_] -> True ; _ -> False)
    , (RG, \l -> case l of [_, _, _] -> True ; _ -> False)
    , (R_g, \l -> case l of [_, _, _] -> True ; _ -> False)
    , (K, \l -> case l of [_, _, _, _] -> True ; _ -> False)
    , (K_, \l -> case l of [_, _, _, _] -> True ; _ -> False)
  ]
--- a/src/PDF/Content/Operator/Common.hs
+++ b/src/PDF/Content/Operator/Common.hs
@ -0,0 +1,7 @@
 module PDF.Content.Operator.Common (
    Signature
  ) where
 import PDF.Object (DirectObject)
 type Signature a = (a, [DirectObject] -> Bool)
--- a/src/PDF/Content/Operator/GraphicState.hs
+++ b/src/PDF/Content/Operator/GraphicState.hs
@ -0,0 +1,24 @@
 module PDF.Content.Operator.GraphicState (
      Operator(..)
    , signature
  ) where
 import PDF.Content.Operator.Common (Signature)
 import PDF.Object (DirectObject(..))
 data Operator =
  C_m | W_ | J | J_ | M | D_ | R_i | I_ | G_s -- general graphic state
  deriving (Bounded, Enum, Show)
 signature :: [Signature Operator]
 signature = [
      (C_m, \l -> case l of [_, _, _, _, _, _] -> True ; _ -> False)
    , (W_, \l -> case l of [_] -> True ; _ -> False)
    , (J, \l -> case l of [_] -> True ; _ -> False)
    , (J_, \l -> case l of [_] -> True ; _ -> False)
    , (M, \l -> case l of [_] -> True ; _ -> False)
    , (D_, \l -> case l of [_, _] -> True ; _ -> False)
    , (R_i, \l -> case l of [_] -> True ; _ -> False)
    , (I_, \l -> case l of [_] -> True ; _ -> False)
    , (G_s, \l -> case l of [NameObject _] -> True ; _ -> False)
  ]
--- a/src/PDF/Content/Operator/Path.hs
+++ b/src/PDF/Content/Operator/Path.hs
@ -0,0 +1,35 @@
 module PDF.Content.Operator.Path (
      Operator(..)
    , signature
  ) where
 import PDF.Content.Operator.Common (Signature)
 data Operator =
    M_ | L_ | C_ | V_ | Y_ | H_ | R_e -- path construction
  | S | S_ | F_ | F | Fstar | B | Bstar | B_ | B_star | N_ -- path painting
  | W | Wstar -- clipping path
  deriving (Bounded, Enum, Show)
 signature :: [Signature Operator]
 signature = [
      (M_, \l -> case l of [_, _] -> True ; _ -> False)
    , (L_, \l -> case l of [_, _] -> True ; _ -> False)
    , (C_, \l -> case l of [_, _, _, _, _, _] -> True ; _ -> False)
    , (V_, \l -> case l of [_, _, _, _] -> True ; _ -> False)
    , (Y_, \l -> case l of [_, _, _, _] -> True ; _ -> False)
    , (H_, \l -> case l of [] -> True ; _ -> False)
    , (R_e, \l -> case l of [_, _, _, _] -> True ; _ -> False)
    , (S, \l -> case l of [] -> True ; _ -> False)
    , (S_, \l -> case l of [] -> True ; _ -> False)
    , (F_, \l -> case l of [] -> True ; _ -> False)
    , (F, \l -> case l of [] -> True ; _ -> False)
    , (Fstar, \l -> case l of [] -> True ; _ -> False)
    , (B, \l -> case l of [] -> True ; _ -> False)
    , (Bstar, \l -> case l of [] -> True ; _ -> False)
    , (B_, \l -> case l of [] -> True ; _ -> False)
    , (B_star, \l -> case l of [] -> True ; _ -> False)
    , (N_, \l -> case l of [] -> True ; _ -> False)
    , (W, \l -> case l of [] -> True ; _ -> False)
    , (Wstar, \l -> case l of [] -> True ; _ -> False)
  ]
--- a/src/PDF/Content/Operator/Text.hs
+++ b/src/PDF/Content/Operator/Text.hs
@ -0,0 +1,32 @@
 module PDF.Content.Operator.Text (
      Operator(..)
    , signature
  ) where
 import PDF.Content.Operator.Common (Signature)
 import PDF.Object (DirectObject(..))
 data Operator =
    Td | TD | Tm | Tstar -- text positioning
  | TJ | Tj | Quote | DQuote -- text showing
  | Tc | Tw | Tz | TL | Tf | Tr | Ts -- text state
  deriving (Bounded, Enum, Show)
 signature :: [Signature Operator]
 signature = [
      (Td, \l -> case l of [_, _] -> True ; _ -> False)
    , (TD, \l -> case l of [_, _] -> True ; _ -> False)
    , (Tm, \l -> case l of [_, _, _, _, _, _] -> True ; _ -> False)
    , (Tstar, \l -> case l of [] -> True ; _ -> False)
    , (TJ, \l -> case l of [Array _] -> True ; _ -> False)
    , (Tj, \l -> case l of [StringObject _] -> True ; _ -> False)
    , (Quote, \l -> case l of [StringObject _] -> True ; _ -> False)
    , (DQuote, \l -> case l of [StringObject _] -> True ; _ -> False)
    , (Tc, \l -> case l of [_] -> True ; _ -> False)
    , (Tw, \l -> case l of [_] -> True ; _ -> False)
    , (Tz, \l -> case l of [_] -> True ; _ -> False)
    , (TL, \l -> case l of [_] -> True ; _ -> False)
    , (Tf, \l -> case l of [NameObject _, _] -> True ; _ -> False)
    , (Tr, \l -> case l of [_] -> True ; _ -> False)
    , (Ts, \l -> case l of [_] -> True ; _ -> False)
  ]
--- a/src/PDF/Content/Text.hs
+++ b/src/PDF/Content/Text.hs
@ -0,0 +1,75 @@
 {-# LANGUAGE OverloadedStrings #-}
 {-# LANGUAGE ConstraintKinds #-}
 {-# LANGUAGE FlexibleContexts #-}
 module PDF.Content.Text (
      format
    , renderText
  ) where
 import Control.Monad (foldM)
 import Control.Monad.Fail (MonadFail(..))
 import Control.Monad.Reader (MonadReader(..), asks, runReaderT)
 import Control.Monad.State (MonadState(..), evalStateT)
 import Data.ByteString.Char8 (pack)
 import Data.Map ((!))
 import Data.Text (Text)
 import PDF.Content (Content(..), ContentUnit(..), GraphicContextUnit(..))
 import PDF.Content.Operator (Instruction, Operator(..))
 import PDF.Content.Operator.Text (Operator(..))
 import PDF.Font (Font, FontSet, emptyFont)
 import PDF.Object (DirectObject(..), StringObject(..), toByteString)
 import Prelude hiding (fail)
 type TextContent m = (MonadReader FontSet m, MonadFail m)
 type FontContext m = (MonadState Font m, TextContent m)
 decodeString :: FontContext m => StringObject -> m Text
 decodeString input = do
  font <- get
  either fail return . font $ toByteString input
 renderText :: MonadFail m => FontSet -> Content -> m [Text]
 renderText fontSet (Content contentUnits) =
  runReaderT (concat <$> mapM renderContentUnit contentUnits) fontSet
 renderContentUnit :: TextContent m => ContentUnit -> m [Text]
 renderContentUnit (GraphicContext graphicContextUnits) =
  concat <$> mapM renderGraphicContextUnit graphicContextUnits
 renderContentUnit (TextContext instructions) =
  evalStateT (concat <$> mapM renderInstruction instructions) emptyFont
 renderGraphicContextUnit :: TextContent m => GraphicContextUnit -> m [Text]
 renderGraphicContextUnit (GraphicInstruction _) = return []
 renderGraphicContextUnit (ContentUnit contentUnit) =
  renderContentUnit contentUnit
 renderInstruction :: FontContext m => Instruction -> m [Text]
 renderInstruction (Text Tf, [NameObject fontName, _]) =
  asks (! fontName) >>= put >> return []
 renderInstruction (Text Tstar, []) = return ["\n"]
 renderInstruction (Text TJ, [Array arrayObject]) =
  replicate 1 <$> foldM appendText "" arrayObject
  where
    appendText t (StringObject outputString) =
      mappend t <$> decodeString outputString
    appendText t _ = return t
 renderInstruction (Text Tj, [StringObject outputString]) =
  replicate 1 <$> decodeString outputString
 renderInstruction (Text Quote, [StringObject outputString]) =
  (\t -> ["\n", t]) <$> decodeString outputString
 renderInstruction (Text DQuote, [StringObject outputString]) =
  (\t -> ["\n", t]) <$> decodeString outputString
 renderInstruction _ = return []
 format :: String -> [Instruction]
 format s =
  case break (== '\n') s of
    ("", "") -> []
    ("", left) -> (Text Tstar, []) : format (drop 1 left)
    (line, left) -> (Text Tj, [StringObject . Literal $ pack line]) : format left
--- a/src/PDF/EOL.hs
+++ b/src/PDF/EOL.hs
@ -6,14 +6,14 @@ module PDF.EOL (
  ) where
 import Control.Applicative ((<|>))
-import PDF.Parser (Parser, string)
+import PDF.Parser (MonadParser, string)
 data Style = CR | LF | CRLF deriving Show
 charset :: String
 charset = "\r\n"
-parser :: Parser s Style
+parser :: MonadParser m => m Style
 parser =
    (string "\r\n" >> return CRLF)
  <|> (string "\r" >> return CR)
--- a/src/PDF/Encoding.hs
+++ b/src/PDF/Encoding.hs
@ -0,0 +1,12 @@
 module PDF.Encoding (
    encoding
  ) where
 import Control.Monad.Fail (MonadFail(..))
 import PDF.Encoding.MacRoman (macRomanEncoding)
 import PDF.Font (Font)
 import Prelude hiding (fail)
 encoding :: MonadFail m => String -> m Font
 encoding "MacRomanEncoding" = return macRomanEncoding
 encoding s = fail $ "Unknown encoding " ++ s
--- a/src/PDF/Encoding/MacRoman.hs
+++ b/src/PDF/Encoding/MacRoman.hs
@ -0,0 +1,141 @@
 module PDF.Encoding.MacRoman (
    macRomanEncoding
  ) where
 import Data.ByteString.Char8 (unpack)
 import Data.Text (pack)
 import PDF.Font (Font)
 macRomanEncoding :: Font
 macRomanEncoding = Right . pack . fmap decode . unpack
 decode :: Char -> Char
 decode '\x80' = '\x00C4' -- LATIN CAPITAL LETTER A WITH DIAERESIS
 decode '\x81' = '\x00C5' -- LATIN CAPITAL LETTER A WITH RING ABOVE
 decode '\x82' = '\x00C7' -- LATIN CAPITAL LETTER C WITH CEDILLA
 decode '\x83' = '\x00C9' -- LATIN CAPITAL LETTER E WITH ACUTE
 decode '\x84' = '\x00D1' -- LATIN CAPITAL LETTER N WITH TILDE
 decode '\x85' = '\x00D6' -- LATIN CAPITAL LETTER O WITH DIAERESIS
 decode '\x86' = '\x00DC' -- LATIN CAPITAL LETTER U WITH DIAERESIS
 decode '\x87' = '\x00E1' -- LATIN SMALL LETTER A WITH ACUTE
 decode '\x88' = '\x00E0' -- LATIN SMALL LETTER A WITH GRAVE
 decode '\x89' = '\x00E2' -- LATIN SMALL LETTER A WITH CIRCUMFLEX
 decode '\x8A' = '\x00E4' -- LATIN SMALL LETTER A WITH DIAERESIS
 decode '\x8B' = '\x00E3' -- LATIN SMALL LETTER A WITH TILDE
 decode '\x8C' = '\x00E5' -- LATIN SMALL LETTER A WITH RING ABOVE
 decode '\x8D' = '\x00E7' -- LATIN SMALL LETTER C WITH CEDILLA
 decode '\x8E' = '\x00E9' -- LATIN SMALL LETTER E WITH ACUTE
 decode '\x8F' = '\x00E8' -- LATIN SMALL LETTER E WITH GRAVE
 decode '\x90' = '\x00EA' -- LATIN SMALL LETTER E WITH CIRCUMFLEX
 decode '\x91' = '\x00EB' -- LATIN SMALL LETTER E WITH DIAERESIS
 decode '\x92' = '\x00ED' -- LATIN SMALL LETTER I WITH ACUTE
 decode '\x93' = '\x00EC' -- LATIN SMALL LETTER I WITH GRAVE
 decode '\x94' = '\x00EE' -- LATIN SMALL LETTER I WITH CIRCUMFLEX
 decode '\x95' = '\x00EF' -- LATIN SMALL LETTER I WITH DIAERESIS
 decode '\x96' = '\x00F1' -- LATIN SMALL LETTER N WITH TILDE
 decode '\x97' = '\x00F3' -- LATIN SMALL LETTER O WITH ACUTE
 decode '\x98' = '\x00F2' -- LATIN SMALL LETTER O WITH GRAVE
 decode '\x99' = '\x00F4' -- LATIN SMALL LETTER O WITH CIRCUMFLEX
 decode '\x9A' = '\x00F6' -- LATIN SMALL LETTER O WITH DIAERESIS
 decode '\x9B' = '\x00F5' -- LATIN SMALL LETTER O WITH TILDE
 decode '\x9C' = '\x00FA' -- LATIN SMALL LETTER U WITH ACUTE
 decode '\x9D' = '\x00F9' -- LATIN SMALL LETTER U WITH GRAVE
 decode '\x9E' = '\x00FB' -- LATIN SMALL LETTER U WITH CIRCUMFLEX
 decode '\x9F' = '\x00FC' -- LATIN SMALL LETTER U WITH DIAERESIS
 decode '\xA0' = '\x2020' -- DAGGER
 decode '\xA1' = '\x00B0' -- DEGREE SIGN
 decode '\xA2' = '\x00A2' -- CENT SIGN
 decode '\xA3' = '\x00A3' -- POUND SIGN
 decode '\xA4' = '\x00A7' -- SECTION SIGN
 decode '\xA5' = '\x2022' -- BULLET
 decode '\xA6' = '\x00B6' -- PILCROW SIGN
 decode '\xA7' = '\x00DF' -- LATIN SMALL LETTER SHARP S
 decode '\xA8' = '\x00AE' -- REGISTERED SIGN
 decode '\xA9' = '\x00A9' -- COPYRIGHT SIGN
 decode '\xAA' = '\x2122' -- TRADE MARK SIGN
 decode '\xAB' = '\x00B4' -- ACUTE ACCENT
 decode '\xAC' = '\x00A8' -- DIAERESIS
 decode '\xAD' = '\x2260' -- NOT EQUAL TO
 decode '\xAE' = '\x00C6' -- LATIN CAPITAL LETTER AE
 decode '\xAF' = '\x00D8' -- LATIN CAPITAL LETTER O WITH STROKE
 decode '\xB0' = '\x221E' -- INFINITY
 decode '\xB1' = '\x00B1' -- PLUS-MINUS SIGN
 decode '\xB2' = '\x2264' -- LESS-THAN OR EQUAL TO
 decode '\xB3' = '\x2265' -- GREATER-THAN OR EQUAL TO
 decode '\xB4' = '\x00A5' -- YEN SIGN
 decode '\xB5' = '\x00B5' -- MICRO SIGN
 decode '\xB6' = '\x2202' -- PARTIAL DIFFERENTIAL
 decode '\xB7' = '\x2211' -- N-ARY SUMMATION
 decode '\xB8' = '\x220F' -- N-ARY PRODUCT
 decode '\xB9' = '\x03C0' -- GREEK SMALL LETTER PI
 decode '\xBA' = '\x222B' -- INTEGRAL
 decode '\xBB' = '\x00AA' -- FEMININE ORDINAL INDICATOR
 decode '\xBC' = '\x00BA' -- MASCULINE ORDINAL INDICATOR
 decode '\xBD' = '\x03A9' -- GREEK CAPITAL LETTER OMEGA
 decode '\xBE' = '\x00E6' -- LATIN SMALL LETTER AE
 decode '\xBF' = '\x00F8' -- LATIN SMALL LETTER O WITH STROKE
 decode '\xC0' = '\x00BF' -- INVERTED QUESTION MARK
 decode '\xC1' = '\x00A1' -- INVERTED EXCLAMATION MARK
 decode '\xC2' = '\x00AC' -- NOT SIGN
 decode '\xC3' = '\x221A' -- SQUARE ROOT
 decode '\xC4' = '\x0192' -- LATIN SMALL LETTER F WITH HOOK
 decode '\xC5' = '\x2248' -- ALMOST EQUAL TO
 decode '\xC6' = '\x2206' -- INCREMENT
 decode '\xC7' = '\x00AB' -- LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
 decode '\xC8' = '\x00BB' -- RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
 decode '\xC9' = '\x2026' -- HORIZONTAL ELLIPSIS
 decode '\xCA' = '\x00A0' -- NO-BREAK SPACE
 decode '\xCB' = '\x00C0' -- LATIN CAPITAL LETTER A WITH GRAVE
 decode '\xCC' = '\x00C3' -- LATIN CAPITAL LETTER A WITH TILDE
 decode '\xCD' = '\x00D5' -- LATIN CAPITAL LETTER O WITH TILDE
 decode '\xCE' = '\x0152' -- LATIN CAPITAL LIGATURE OE
 decode '\xCF' = '\x0153' -- LATIN SMALL LIGATURE OE
 decode '\xD0' = '\x2013' -- EN DASH
 decode '\xD1' = '\x2014' -- EM DASH
 decode '\xD2' = '\x201C' -- LEFT DOUBLE QUOTATION MARK
 decode '\xD3' = '\x201D' -- RIGHT DOUBLE QUOTATION MARK
 decode '\xD4' = '\x2018' -- LEFT SINGLE QUOTATION MARK
 decode '\xD5' = '\x2019' -- RIGHT SINGLE QUOTATION MARK
 decode '\xD6' = '\x00F7' -- DIVISION SIGN
 decode '\xD7' = '\x25CA' -- LOZENGE
 decode '\xD8' = '\x00FF' -- LATIN SMALL LETTER Y WITH DIAERESIS
 decode '\xD9' = '\x0178' -- LATIN CAPITAL LETTER Y WITH DIAERESIS
 decode '\xDA' = '\x2044' -- FRACTION SLASH
 decode '\xDB' = '\x20AC' -- EURO SIGN
 decode '\xDC' = '\x2039' -- SINGLE LEFT-POINTING ANGLE QUOTATION MARK
 decode '\xDD' = '\x203A' -- SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
 decode '\xDE' = '\xFB01' -- LATIN SMALL LIGATURE FI
 decode '\xDF' = '\xFB02' -- LATIN SMALL LIGATURE FL
 decode '\xE0' = '\x2021' -- DOUBLE DAGGER
 decode '\xE1' = '\x00B7' -- MIDDLE DOT
 decode '\xE2' = '\x201A' -- SINGLE LOW-9 QUOTATION MARK
 decode '\xE3' = '\x201E' -- DOUBLE LOW-9 QUOTATION MARK
 decode '\xE4' = '\x2030' -- PER MILLE SIGN
 decode '\xE5' = '\x00C2' -- LATIN CAPITAL LETTER A WITH CIRCUMFLEX
 decode '\xE6' = '\x00CA' -- LATIN CAPITAL LETTER E WITH CIRCUMFLEX
 decode '\xE7' = '\x00C1' -- LATIN CAPITAL LETTER A WITH ACUTE
 decode '\xE8' = '\x00CB' -- LATIN CAPITAL LETTER E WITH DIAERESIS
 decode '\xE9' = '\x00C8' -- LATIN CAPITAL LETTER E WITH GRAVE
 decode '\xEA' = '\x00CD' -- LATIN CAPITAL LETTER I WITH ACUTE
 decode '\xEB' = '\x00CE' -- LATIN CAPITAL LETTER I WITH CIRCUMFLEX
 decode '\xEC' = '\x00CF' -- LATIN CAPITAL LETTER I WITH DIAERESIS
 decode '\xED' = '\x00CC' -- LATIN CAPITAL LETTER I WITH GRAVE
 decode '\xEE' = '\x00D3' -- LATIN CAPITAL LETTER O WITH ACUTE
 decode '\xEF' = '\x00D4' -- LATIN CAPITAL LETTER O WITH CIRCUMFLEX
 decode '\xF0' = '\xF8FF' -- Apple logo
 decode '\xF1' = '\x00D2' -- LATIN CAPITAL LETTER O WITH GRAVE
 decode '\xF2' = '\x00DA' -- LATIN CAPITAL LETTER U WITH ACUTE
 decode '\xF3' = '\x00DB' -- LATIN CAPITAL LETTER U WITH CIRCUMFLEX
 decode '\xF4' = '\x00D9' -- LATIN CAPITAL LETTER U WITH GRAVE
 decode '\xF5' = '\x0131' -- LATIN SMALL LETTER DOTLESS I
 decode '\xF6' = '\x02C6' -- MODIFIER LETTER CIRCUMFLEX ACCENT
 decode '\xF7' = '\x02DC' -- SMALL TILDE
 decode '\xF8' = '\x00AF' -- MACRON
 decode '\xF9' = '\x02D8' -- BREVE
 decode '\xFA' = '\x02D9' -- DOT ABOVE
 decode '\xFB' = '\x02DA' -- RING ABOVE
 decode '\xFC' = '\x00B8' -- CEDILLA
 decode '\xFD' = '\x02DD' -- DOUBLE ACUTE ACCENT
 decode '\xFE' = '\x02DB' -- OGONEK
 decode '\xFF' = '\x02C7' -- CARON
 decode c = c             -- The rest is ASCII
--- a/src/PDF/Font.hs
+++ b/src/PDF/Font.hs
@ -0,0 +1,16 @@
 module PDF.Font (
      Font
    , FontSet
    , emptyFont
  ) where
 import Data.ByteString (ByteString)
 import Data.Map (Map)
 import Data.Text (Text)
 import PDF.Object (Name)
 type Font = ByteString -> Either String Text
 type FontSet = Map Name Font
 emptyFont :: Font
 emptyFont _ = Left "No fond loaded"
--- a/src/PDF/Object.hs
+++ b/src/PDF/Object.hs
@ -3,6 +3,7 @@
 {-# LANGUAGE FlexibleInstances #-}
 module PDF.Object (
      Content(..)
    , Dictionary
    , DirectObject(..)
    , Flow(..)
    , IndexedObjects
@ -12,9 +13,11 @@ module PDF.Object (
    , Number(..)
    , Object(..)
    , Occurrence(..)
    , StringObject(..)
    , Structure(..)
    , XRefEntry(..)
    , XRefSection
    , array
    , blank
    , dictionary
    , directObject
@ -22,34 +25,36 @@ module PDF.Object (
    , integer
    , line
    , magicNumber
    , name
    , number
    , regular
    , stringObject
    , structure
    , toByteString
  ) where
-import Control.Applicative ((<|>))
+import Control.Applicative ((<|>), many)
-import Data.ByteString.Char8 (ByteString)
+import Data.Attoparsec.ByteString.Char8 (choice, count, option, sepBy)
-import qualified Data.ByteString.Char8 as BS (
+import Data.ByteString (ByteString)
-    concat, cons, pack, singleton, unpack
+import qualified Data.ByteString as BS (concat)
-  )
+import qualified Data.ByteString.Char8 as Char8 (cons, pack, singleton, unpack)
 import Data.ByteString.Char8.Util (B16Int(..), b16ToBytes, unescape)
 import Data.Map (Map, (!), mapWithKey)
 import qualified Data.Map as Map (
    delete, empty, fromList, lookup, minViewWithKey, toList, union
  )
 import qualified PDF.EOL as EOL (charset, parser)
-import qualified PDF.Output as Output (concat, line, string)
+import qualified PDF.Output as Output (line, string)
 import PDF.Output (
      OBuilder, ObjectId(..), Offset(..), Output(..), Resource(..)
    , byteString, getObjectId, getOffset, getOffsets, join, newLine
    , saveOffset
  )
-import PDF.Parser (
+import PDF.Parser (MonadParser(..), Parser, octDigit, oneOf)
      Parser, (<?>)
    , char, choice, count, decNumber, hexNumber, many, octDigit, oneOf, option
    , sepBy, string, takeAll, takeAll1
  )
 import Text.Printf (printf)
-line :: String -> Parser u ()
+line :: MonadParser m => String -> m ()
-line l = (string (BS.pack l) *> blank *> return ()) <?> printf "line «%s»" l
+line l = (string (Char8.pack l) *> blank *> return ())
 magicNumber :: ByteString
 magicNumber = "%PDF-"
@ -60,8 +65,8 @@ eofMarker = "%%EOF"
 whiteSpaceCharset :: String
 whiteSpaceCharset = "\0\t\12 "
-blank :: Parser u ()
+blank :: MonadParser m => m ()
-blank = takeAll (`elem` (EOL.charset ++ whiteSpaceCharset)) *> return ()
+blank = takeAll (`elem` (EOL.charset ++ whiteSpaceCharset)) *> pure ()
 delimiterCharset :: String
 delimiterCharset = "()<>[]{}/%"
@ -69,8 +74,8 @@ delimiterCharset = "()<>[]{}/%"
 regular :: Char -> Bool
 regular = not . (`elem` (EOL.charset ++ whiteSpaceCharset ++ delimiterCharset))
-integer :: (Read a, Num a) => Parser u a
+integer :: (Read a, Num a, MonadParser m) => m a
-integer = read . BS.unpack <$> decNumber <* blank <?> "decimal integer"
+integer = read . Char8.unpack <$> decNumber <* blank
 -------------------------------------
 --          OBJECTS
@ -81,9 +86,9 @@ type IndexedObjects = Map ObjectId Object
 --
 -- Boolean
 --
-boolean :: Parser u Bool
+boolean :: MonadParser m => m Bool
 boolean =
-  (string "true" *> return True) <|> (string "false" *> return False) <?> "boolean"
+  (string "true" *> return True) <|> (string "false" *> return False)
 --
 -- Number
@ -96,38 +101,40 @@ instance Output Number where
      (n, 0) -> printf "%d" (n :: Int)
      _ -> printf "%f" f
-number :: Parser u Number
+number :: MonadParser m => m Number
-number = Number . read . BS.unpack <$>
+number = Number . read . Char8.unpack <$>
-  (mappend <$> sign <*> (integerPart <|> BS.cons '0' <$> floatPart))
+  (mappend <$> sign <*> (integerPart <|> Char8.cons '0' <$> floatPart))
  <?> "number"
  where
    sign = string "-" <|> option "" (char '+' >> return "")
    integerPart = mappend <$> decNumber <*> option "" floatPart
-    floatPart = BS.cons <$> char '.' <*> (option "0" $ decNumber)
+    floatPart = Char8.cons <$> char '.' <*> (option "0" $ decNumber)
 --
 -- StringObject
 --
-data StringObject = Literal String | Hexadecimal String deriving Show
+data StringObject = Literal ByteString | Hexadecimal B16Int deriving Show
 instance Output StringObject where
-  output (Literal s) = Output.string (printf "(%s)" s)
+  output (Literal s) = Output.string (printf "(%s)" (Char8.unpack s))
-  output (Hexadecimal s) = Output.string (printf "<%s>" s)
+  output (Hexadecimal (B16Int n)) = Output.string (printf "<%s>" (Char8.unpack n))
-stringObject :: Parser u StringObject
+stringObject :: MonadParser m => m StringObject
 stringObject =
-    Literal . BS.unpack <$> (char '(' *> (BS.concat <$> literalString) <* char ')')
+    Literal <$> (char '(' *> (BS.concat <$> literalString) <* char ')')
-  <|> Hexadecimal . BS.unpack <$> (char '<' *> hexNumber <* char '>')
+  <|> Hexadecimal <$> (char '<' *> hexNumber <* char '>')
  <?> "string object (literal or hexadecimal)"
  where
    literalString = many literalStringBlock
    literalStringBlock = takeAll1 normalChar <|> matchingParenthesis <|> escapedChar
    normalChar = not . (`elem` ("\\()" :: String))
    matchingParenthesis =
-      mappend <$> (BS.cons <$> char '(' <*> literalStringBlock) <*> string ")"
+      mappend <$> (Char8.cons <$> char '(' <*> literalStringBlock) <*> string ")"
    escapedChar =
-      BS.cons <$> char '\\' <*> (BS.singleton <$> oneOf "nrtbf()\\" <|> octalCode)
+      Char8.cons <$> char '\\' <*> (Char8.singleton <$> oneOf "nrtbf()\\\n" <|> octalCode)
-    octalCode = choice $ (\n -> BS.pack <$> count n octDigit) <$> [1..3]
+    octalCode = choice $ (\n -> Char8.pack <$> count n octDigit) <$> [1..3]
 toByteString :: StringObject -> ByteString
 toByteString (Hexadecimal h) = b16ToBytes h
 toByteString (Literal s) = unescape s
 --
 -- Name
@ -137,15 +144,15 @@ newtype Name = Name String deriving (Eq, Ord, Show)
 instance Output Name where
  output (Name n) = Output.string ('/':n)
-name :: Parser u Name
+name :: MonadParser m => m Name
-name = Name . BS.unpack <$> (char '/' *> takeAll regular) <?> "name"
+name = Name . Char8.unpack <$> (char '/' *> takeAll regular)
 --
 -- Array
 --
-array :: Parser u [DirectObject]
+array :: MonadParser m => m [DirectObject]
 array =
-  char '[' *> blank *> directObject `sepBy` blank <* blank <* char ']' <?> "array"
+  char '[' *> blank *> directObject `sepBy` blank <* blank <* char ']'
 --
 -- Dictionary
@ -153,16 +160,16 @@ array =
 type Dictionary = Map Name DirectObject
 instance Output Dictionary where
-  output dict =
+  output aDictionary =
    "<<" `mappend` keyValues `mappend` ">>"
    where
-      keyValues = join " " $ outputKeyVal <$> Map.toList dict
+      keyValues = join " " $ outputKeyVal <$> Map.toList aDictionary
      outputKeyVal :: (Name, DirectObject) -> OBuilder
-      outputKeyVal (key, val) = Output.concat [output key, " ", output val]
+      outputKeyVal (key, val) = mconcat [output key, " ", output val]
-dictionary :: Parser u Dictionary
+dictionary :: MonadParser m => m Dictionary
 dictionary =
-  string "<<" *> blank *> keyValPairs <* string ">>" <?> "dictionary"
+  string "<<" *> blank *> keyValPairs <* string ">>"
  where
    keyVal = (,) <$> name <* blank <*> directObject
    keyValPairs = Map.fromList <$> keyVal `sepBy` blank <* blank
@ -170,8 +177,8 @@ dictionary =
 --
 -- Null
 --
-nullObject :: Parser u ()
+nullObject :: MonadParser m => m ()
-nullObject = string "null" *> return () <?> "null object"
+nullObject = string "null" *> return ()
 --
 -- Reference
@ -181,9 +188,9 @@ data IndirectObjCoordinates = IndirectObjCoordinates {
    , versionNumber :: Int
  } deriving Show
-reference :: Parser u IndirectObjCoordinates
+reference :: MonadParser m => m IndirectObjCoordinates
 reference = IndirectObjCoordinates
-  <$> (fmap ObjectId integer) <*> integer <* char 'R' <?> "reference to an object"
+  <$> (fmap ObjectId integer) <*> integer <* char 'R'
 --
 -- DirectObject
@ -204,23 +211,25 @@ instance Output DirectObject where
  output (NumberObject n) = output n
  output (StringObject s) = output s
  output (NameObject n) = output n
-  output (Array a) = Output.concat ["[", join " " a, "]"]
+  output (Array a) = mconcat ["[", join " " a, "]"]
  output (Dictionary d) = output d
  output (Null) = "null"
  output (Reference (IndirectObjCoordinates {objectId, versionNumber})) =
    Output.string (printf "%d %d R" (getObjectId objectId) versionNumber)
-directObject :: Parser u DirectObject
+directObject :: MonadParser m => m DirectObject
-directObject =
+directObject = peek >>= dispatch
-    Boolean <$> boolean
+  where
-  <|> Reference <$> reference {- defined before Number because Number is a prefix of it -}
+    dispatch 't' = Boolean <$> boolean
-  <|> NumberObject <$> number
+    dispatch 'f' = Boolean <$> boolean
-  <|> StringObject <$> stringObject
+    dispatch '(' = StringObject <$> stringObject
-  <|> NameObject <$> name
+    dispatch '<' = StringObject <$> stringObject <|> Dictionary <$> dictionary
-  <|> Array <$> array
+    dispatch '/' = NameObject <$> name
-  <|> Dictionary <$> dictionary
+    dispatch '[' = Array <$> array
-  <|> const Null <$> nullObject
+    dispatch 'n' = nullObject *> return Null
-  <?> "direct object"
+    dispatch _ =
        Reference <$> reference {- defined before Number because Number is a prefix of it -}
      <|> NumberObject <$> number
 --
 -- Object
@ -235,7 +244,7 @@ data Object =
 instance Output Object where
  output (Direct d) = output d
-  output (Stream {header, streamContent}) = Output.concat [
+  output (Stream {header, streamContent}) = mconcat [
        output header, newLine
      , Output.line "stream"
      , byteString streamContent
@ -250,7 +259,7 @@ data Occurrence = Comment String | Indirect IndirectObjCoordinates deriving Show
 outputOccurrence :: IndexedObjects -> Occurrence -> OBuilder
 outputOccurrence _ (Comment c) = Output.line c
 outputOccurrence objects (Indirect (IndirectObjCoordinates {objectId, versionNumber})) =
-  saveOffset (Object objectId) >> Output.concat [
+  saveOffset (Object objectId) >> mconcat [
        Output.line (printf "%d %d obj" (getObjectId objectId) versionNumber)
      , output (objects ! objectId), newLine
      , Output.line "endobj"
@ -280,7 +289,7 @@ instance Output XRefEntry where
 entry :: Parser u XRefEntry
 entry = do
  (big, small) <- (,) <$> integer <*> integer
-  (inUse big small <|> free big small <?> "XRef entry") <* blank
+  (inUse big small <|> free big small) <* blank
  where
    inUse :: Int -> Int -> Parser u XRefEntry
    inUse big generation =
@ -304,7 +313,7 @@ instance Output XRefSubSection where
 xRefSubSection :: Parser u XRefSubSection
 xRefSubSection = do
-  (firstId, entriesNumber) <- (,) <$> integer <*> integer <?> "XRef subsection"
+  (firstId, entriesNumber) <- (,) <$> integer <*> integer
  entries <- count entriesNumber entry
  return $ XRefSubSection {firstObjectId = ObjectId firstId, entries}
@ -383,7 +392,7 @@ outputBody (occurrences, objects) =
 instance Output Content where
  output (Content {occurrences, objects, docStructure}) =
    fmap (updateXrefs xRef) <$> getOffsets (outputBody (occurrences, objects))
-    >>= \(body, (xref, startXRef)) -> Output.concat [
+    >>= \(body, (xref, startXRef)) -> mconcat [
          body
        , Output.line "xref"
        , output xref
--- a/src/PDF/Object/Navigation.hs
+++ b/src/PDF/Object/Navigation.hs
@ -0,0 +1,82 @@
 {-# LANGUAGE NamedFieldPuns #-}
 {-# LANGUAGE ConstraintKinds #-}
 {-# LANGUAGE FlexibleContexts #-}
 module PDF.Object.Navigation (
      (//)
    , dictionaryById
    , getDictionary
    , getField
    , follow
    , objectById
    , openStream
    , origin
  ) where
 import Codec.Compression.Zlib (decompress)
 import Control.Monad.Reader (MonadReader(..))
 import Control.Monad.Fail (MonadFail(..))
 import Data.ByteString (ByteString)
 import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict)
 import Data.Map ((!))
 import qualified Data.Map as Map (lookup)
 import PDF.Object (
      Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
    , Name(..), Object(..), Structure(..)
  )
 import PDF.Output (ObjectId)
 import Prelude hiding (fail)
 import Text.Printf (printf)
 type PDFContent m = (MonadReader Content m, MonadFail m)
 castDictionary :: MonadFail m => Object -> m Dictionary
 castDictionary (Direct (Dictionary aDict)) = return aDict
 castDictionary obj = expected "dictionary : " obj
 castObjectId :: MonadFail m => DirectObject -> m ObjectId
 castObjectId (Reference (IndirectObjCoordinates {objectId})) = return objectId
 castObjectId directObject = expected "reference" directObject
 dictionaryById :: PDFContent m => ObjectId -> m Dictionary
 dictionaryById objectId = objectById objectId >>= castDictionary
 expected :: (MonadFail m, Show a) => String -> a -> m b
 expected name = fail . printf "Not a %s: %s" name . show
 getField :: MonadFail m => String -> Dictionary -> m DirectObject
 getField key aDictionary =
  maybe (fail errorMessage) return (Map.lookup (Name key) aDictionary)
  where
    errorMessage =
      printf "Key %s not found in dictionary %s" key (show aDictionary)
 follow :: PDFContent m => DirectObject -> m Object
 follow directObject = castObjectId directObject >>= objectById
 objectById :: PDFContent m => ObjectId -> m Object
 objectById objectId = do
  content <- ask
  return (objects content ! objectId)
 getDictionary :: PDFContent m => DirectObject -> m Dictionary
 getDictionary (Dictionary aDictionary) = return aDictionary
 getDictionary (Reference (IndirectObjCoordinates {objectId})) =
  objectById objectId >>= castDictionary
 getDictionary aDirectObject =
  expected "resource (dictionary or reference)" aDirectObject
 (//) :: PDFContent m => Dictionary -> [String] -> m DirectObject
 (//) aDict [] = return $ Dictionary aDict
 (//) aDict [key] = getField key aDict
 (//) aDict (key:keys) = getField key aDict >>= getDictionary >>= (// keys)
 origin :: PDFContent m => m Dictionary
 origin = trailer . docStructure <$> ask
 openStream :: MonadFail m => Object -> m ByteString
 openStream (Stream {header, streamContent}) = return $
  case Map.lookup (Name "Filter") header of
    Just (NameObject (Name "FlateDecode")) ->
      Lazy.toStrict . decompress . Lazy.fromStrict $ streamContent
    _ -> streamContent
 openStream obj = expected "stream" obj
--- a/src/PDF/Output.hs
+++ b/src/PDF/Output.hs
@ -12,7 +12,6 @@ module PDF.Output (
    , Resource(..)
    , byteString
    , char
    , concat
    , getOffsets
    , join
    , line
@ -32,7 +31,6 @@ import qualified Data.Map as Map (singleton)
 import Data.String (IsString(..))
 import Control.Monad.RWS (RWS, runRWS, ask, get, listen, modify, tell)
 import qualified PDF.EOL as EOL (Style(..))
 import Prelude hiding (concat)
 newtype ObjectId = ObjectId {getObjectId :: Int} deriving (Enum, Eq, Ord, Show)
 newtype Offset = Offset {getOffset :: Int} deriving (Show)
@ -66,9 +64,6 @@ getOffsets (OContext builder) =
 append :: OBuilder -> OBuilder -> OBuilder
 append (OContext a) (OContext b) = OContext (mappend <$> a <*> b)
 concat :: [OBuilder] -> OBuilder
 concat = foldl mappend mempty
 #if MIN_VERSION_base(4,11,0)
 instance Semigroup OBuilder where
  (<>) = append
@ -94,7 +89,7 @@ instance Output Bool where
  output True = string "true"
 instance Output a => Output [a] where
-  output = concat . fmap output
+  output = mconcat . fmap output
 join :: Output a => String -> [a] -> OBuilder
 join _ [] = mempty
@ -116,7 +111,7 @@ char :: Char -> OBuilder
 char c = lift char8 c <* offset (+1)
 string :: String -> OBuilder
-string s = lift string8 s <* offset (+ toEnum (length s))
+string s = lift string8 s <* offset (+ length s)
 line :: String -> OBuilder
 line l = string l `mappend` newLine
--- a/src/PDF/Pages.hs
+++ b/src/PDF/Pages.hs
@ -0,0 +1,133 @@
 {-# LANGUAGE NamedFieldPuns #-}
 {-# LANGUAGE GeneralizedNewtypeDeriving #-}
 module PDF.Pages (
      Page(..)
    , get
    , getAll
  ) where
 import Control.Applicative (Alternative(..), (<|>))
 import Control.Monad (MonadPlus(..), foldM)
 import Control.Monad.Fail (MonadFail(..))
 import Control.Monad.RWS (RWST(..), evalRWST, modify)
 import qualified Control.Monad.RWS as RWS (get)
 import Data.Map (Map)
 import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
 import Data.Text (Text)
 import PDF.CMap (cMap)
 import qualified PDF.Content as Content (parse)
 import PDF.Content.Text (renderText)
 import PDF.Encoding (encoding)
 import PDF.Font (Font, FontSet)
 import PDF.Object (
      Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
    , Name(..)
  ,)
 import PDF.Object.Navigation (
    (//), dictionaryById, getDictionary, getField, follow, openStream, origin
  )
 import PDF.Output (ObjectId(..))
 import Prelude hiding (fail)
 import Text.Printf (printf)
 type CachedFonts = Map ObjectId Font
 newtype Error a = Error {
    runError :: Either String a
  } deriving (Alternative, Functor, Applicative, Monad, MonadPlus)
 type T = RWST Content () CachedFonts Error
 data Page = Page {
      contents :: [Text]
    , source :: ObjectId
  }
 instance MonadFail Error where
  fail = Error . Left
 getFontDictionary :: Dictionary -> T Dictionary
 getFontDictionary pageDict =
    ((pageDict // ["Resources", "Font"]) >>= getDictionary)
  <|> return Map.empty
 cache :: (ObjectId -> T Font) -> ObjectId -> T Font
 cache loader objectId =
  (maybe load return . Map.lookup objectId) =<< RWS.get
  where
    load = do
      value <- loader objectId
      modify $ Map.insert objectId value
      return value
 loadFont :: ObjectId -> T Font
 loadFont objectId = dictionaryById objectId >>= tryMappings
  where
    tryMappings dictionary =
        loadCMap dictionary
      <|> (getField "Encoding" dictionary >>= loadEncoding)
      <|> (fail $ unknownFormat (show objectId) (show dictionary))
    unknownFormat = printf "Unknown font format for object #%s : %s"
    loadCMap :: Dictionary -> T Font
    loadCMap dictionary =
      getField "ToUnicode" dictionary >>= follow >>= openStream >>= cMap
    loadEncoding :: DirectObject -> T Font
    loadEncoding (NameObject (Name name)) = encoding name
    loadEncoding directObject =
      fail $ printf "Encoding must be a name, not that : %s" $ show directObject
 loadFonts :: Dictionary -> T FontSet
 loadFonts = foldM addFont Map.empty . Map.toList
  where
    addFont :: FontSet -> (Name, DirectObject) -> T FontSet
    addFont output (name, Reference (IndirectObjCoordinates {objectId})) =
      flip (Map.insert name) output <$> cache loadFont objectId
    addFont output _ = return output
 several :: DirectObject -> [DirectObject]
 several (Array l) =  l
 several directObject = [directObject]
 pagesList :: T [ObjectId]
 pagesList = do
  pages <- (origin :: T Dictionary) >>= (// ["Root", "Pages"]) >>= getDictionary
  case Map.lookup (Name "Kids") pages of
    Just (Array kids) -> return $ getReferences kids
    _ -> return []
 getReferences :: [DirectObject] -> [ObjectId]
 getReferences objects = do
  object <- objects
  case object of
    Reference (IndirectObjCoordinates {objectId}) -> [objectId]
    _ -> []
 extractText :: Dictionary -> T [Text]
 extractText pageDict = do
  fonts <- loadFonts =<< getFontDictionary pageDict
  objects <- several <$> getField "Contents" pageDict
  concat <$> mapM (loadContent fonts) objects
  where
    loadContent :: FontSet -> DirectObject -> T [Text]
    loadContent fonts directObject =
        follow directObject
      >>= openStream
      >>= (either fail return . Content.parse)
      >>= renderText fonts
 loadPage :: ObjectId -> T Page
 loadPage source = do
  contents <- extractText =<< dictionaryById source
  return $ Page {contents, source}
 getAll :: Content -> Either String (Map Int Page)
 getAll content = runError $ fst <$> evalRWST getPages content Map.empty
  where
    numbered = Map.fromList . zip [1..]
    getPages = numbered <$> (mapM loadPage =<< pagesList)
 get :: Content -> Int -> Either String Page
 get content pageNumber
  | pageNumber < 1 = Left "Pages start at 1"
  | otherwise = runError $ fst <$> evalRWST getPage content Map.empty
  where
    firstPage [] = fail "Page is out of bounds"
    firstPage (p:_) = loadPage p
    getPage = drop (pageNumber - 1) <$> pagesList >>= firstPage
--- a/src/PDF/Parser.hs
+++ b/src/PDF/Parser.hs
@ -1,56 +1,70 @@
 {-# LANGUAGE FlexibleInstances #-}
 {-# LANGUAGE ConstraintKinds #-}
 {-# LANGUAGE UndecidableInstances #-}
 module PDF.Parser (
-      Parser
+      MonadParser(..)
-    , (<?>)
+    , Parser
    , block
    , char
    , choice
    , count
    , decNumber
    , hexNumber
    , many
    , octDigit
    , on
    , oneOf
    , option
    , runParser
-    , sepBy
+    , evalParser
    , string
    , takeAll
    , takeAll1
  ) where
-import Control.Applicative ((<|>), empty)
+import Control.Monad (MonadPlus)
 import Control.Monad.Fail (MonadFail(..))
 import Control.Monad.State (StateT(..), evalStateT)
-import Control.Monad.Trans (lift)
+import Control.Monad.Trans (MonadTrans(..))
 import qualified Data.Attoparsec.ByteString.Char8 as Atto (
-    Parser, char, parseOnly, satisfy, string, take, takeWhile, takeWhile1
+      Parser, char, endOfInput, parseOnly, peekChar', satisfy, string, take
    , takeWhile, takeWhile1
  )
 import Data.ByteString (ByteString)
 import Data.ByteString.Char8.Util (B16Int(..))
 import Data.Char (toLower)
 import Data.Set (Set)
 import qualified Data.Set as Set (fromList, member, unions)
 import Prelude hiding (fail)
 type MonadDeps m = (MonadFail m, MonadPlus m)
 class MonadDeps m => MonadParser m where
  block :: Int -> m ByteString
  char :: Char -> m Char
  decNumber :: m ByteString
  endOfInput :: m ()
  hexNumber :: m B16Int
  oneOf :: String -> m Char
  peek :: m Char
  string :: ByteString -> m ByteString
  takeAll :: (Char -> Bool) -> m ByteString
  takeAll1 :: (Char -> Bool) -> m ByteString
 instance MonadParser Atto.Parser where
  block = Atto.take
  char = Atto.char
  endOfInput = Atto.endOfInput
  decNumber = Atto.takeWhile1 (`Set.member` digits)
  hexNumber = B16Int <$> Atto.takeWhile1 (`Set.member` hexDigits)
  oneOf charSet = Atto.satisfy (`elem` charSet)
  peek = Atto.peekChar'
  string s = Atto.string s
  takeAll = Atto.takeWhile
  takeAll1 = Atto.takeWhile1
 instance (MonadParser m, MonadTrans t, MonadDeps (t m)) => MonadParser (t m) where
  block = lift . block
  char = lift . char
  endOfInput = lift $ endOfInput
  decNumber = lift $ decNumber
  hexNumber = lift $ hexNumber
  oneOf = lift . oneOf
  peek = lift $ peek
  string = lift . string
  takeAll = lift . takeAll
  takeAll1 = lift . takeAll1
 type Parser s = StateT s Atto.Parser
 (<?>) :: Parser s a -> String -> Parser s a
 (<?>) parser debugMessage = parser <|> fail debugMessage
 block :: Int -> Parser s ByteString
 block = lift . Atto.take
 char :: Char -> Parser s Char
 char = lift . Atto.char
 choice :: [Parser s a] -> Parser s a
 choice = foldr (<|>) empty
 count :: Int -> Parser s a -> Parser s [a]
 count 0 _ = return []
 count n p = (:) <$> p <*> count (n-1) p
 decNumber :: Parser s ByteString
 decNumber = lift $ Atto.takeWhile1 (`Set.member` digits)
 digits :: Set Char
 digits = Set.fromList ['0'..'9']
@ -59,13 +73,7 @@ hexDigits = Set.unions [digits, Set.fromList af, Set.fromList $ toLower <$> af]
  where
    af = ['A'..'F']
-hexNumber :: Parser s ByteString
+octDigit :: MonadParser m => m Char
 hexNumber = lift $ Atto.takeWhile1 (`Set.member` hexDigits)
 many :: Parser s a -> Parser s [a]
 many parser = (:) <$> parser <*> many parser <|> return []
 octDigit :: Parser s Char
 octDigit = oneOf ['0'..'7']
 on :: Parser s a -> ByteString -> Parser s (Either String a)
@ -74,25 +82,8 @@ on (StateT parserF) input = StateT $ \state ->
    Left errorMsg -> return (Left errorMsg, state)
    Right (result, newState) -> return (Right result, newState)
-oneOf :: String -> Parser s Char
+runParser :: Parser s a -> s -> ByteString -> Either String (a, s)
-oneOf charSet = lift $ Atto.satisfy (`elem` charSet)
+runParser parser initState = Atto.parseOnly (runStateT parser initState)
-option :: a -> Parser s a -> Parser s a
+evalParser :: Parser s a -> s -> ByteString -> Either String a
-option defaultValue p = p <|> pure defaultValue
+evalParser parser initState = Atto.parseOnly (evalStateT parser initState)
 runParser :: Parser s a -> s -> ByteString -> Either String a
 runParser parser initState =
  Atto.parseOnly (evalStateT parser initState)
 sepBy :: Parser s a -> Parser s b -> Parser s [a]
 sepBy parser separator =
  option [] $ (:) <$> parser <*> many (separator *> parser)
 string :: ByteString -> Parser s ByteString
 string = lift . Atto.string
 takeAll :: (Char -> Bool) -> Parser s ByteString
 takeAll = lift . Atto.takeWhile
 takeAll1 :: (Char -> Bool) -> Parser s ByteString
 takeAll1 = lift . Atto.takeWhile1
--- a/src/PDF/Text.hs
+++ b/src/PDF/Text.hs
@ -0,0 +1,319 @@
 {-# LANGUAGE NamedFieldPuns #-}
 {-# LANGUAGE OverloadedStrings #-}
 {-# LANGUAGE ConstraintKinds #-}
 {-# LANGUAGE FlexibleContexts #-}
 module PDF.Text {-(
    pageContents
  )-} where
 import Control.Applicative ((<|>))
 import Control.Monad (foldM)
 import Control.Monad.Fail (MonadFail)
 import Control.Monad.Reader (ReaderT, runReaderT, asks)
 import Control.Monad.State (MonadState, evalStateT, get, modify, put)
 import Data.Attoparsec.ByteString.Char8 (choice, sepBy)
 import Data.ByteString (ByteString)
 import Data.ByteString.Char8 (pack, unpack)
 import Data.Char (toLower)
 import Data.Map ((!), (!?), Map)
 import qualified Data.Map as Map (fromList)
 import Data.Text (Text)
 import PDF.Font (Font, FontSet, emptyFont)
 import PDF.Object (
      DirectObject(..), StringObject(..)
    , array, blank, name, regular, stringObject, toByteString
  )
 import PDF.Parser (MonadParser(..), (<?>), Parser, evalParser)
 data StateOperator =
  C_m | W_ | J | J_ | M | D_ | R_i | I_ | G_s -- general graphic state
  deriving (Bounded, Enum, Show)
 data PathOperator =
    M_ | L_ | C_ | V_ | Y_ | H_ | R_e -- path construction
  | S | S_ | F_ | F | Fstar | B | Bstar | B_ | B_star | N_ -- path painting
  | W | Wstar -- clipping path
  deriving (Bounded, Enum, Show)
 data ColorOperator =
    CS | C_s | SC | SCN | S_c | S_cn | G | G_ | RG | R_g | K | K_
  deriving (Bounded, Enum, Show)
 data TextOperator =
    Td | TD | Tm | Tstar -- text positioning
  | TJ | Tj | Quote | DQuote -- text showing
  | Tc | Tw | Tz | TL | Tf | Tr | Ts -- text state
  deriving (Bounded, Enum, Show)
 data Argument = Raw ByteString | Typed DirectObject deriving Show
 type Call a = (a, [Argument])
 type Operator a = (Bounded a, Enum a, Show a)
 code :: Operator a => a -> ByteString
 code = pack . expand . show
  where
    expand "" = ""
    expand (c:'_':s) = toLower c : expand s
    expand ('s':'t':'a':'r':s) = '*' : expand s
    expand ('Q':'u':'o':'t':'e':s) = '\'' : expand s
    expand ('D':'Q':'u':'o':'t':'e':s) = '"' : expand s
    expand (c:s) = c : expand s
 {-
 instance Show StateOperator where
  show Cm = "cm"
  show W_ = "w"
  show J = "J"
  show J_ = "j"
  show M = "M"
  show D = "d"
  show Ri = "ri"
  show I = "i"
  show Gs = "gs"
 instance Show PathOperator where
  show M_ = "m"
  show L_ = "l"
  show C_ = "c"
  show V_ = "v"
  show Y_ = "y"
  show H_
      ("m", (M_, \l -> case l of [Raw _, Raw _] -> True ; _ -> False))
    , ("l", (L_, \l -> case l of [Raw _, Raw _] -> True ; _ -> False))
    , ("c", (L_, \l -> case l of [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False))
    , ("v", (L_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False))
    , ("y", (L_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False))
    , ("h", (L_, \l -> case l of [] -> True ; _ -> False))
    , ("re", (L_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False))
    , ("S", (L_, \l -> case l of [] -> True ; _ -> False))
    , ("s", (L_, \l -> case l of [] -> True ; _ -> False))
    , ("f", (L_, \l -> case l of [] -> True ; _ -> False))
    , ("F", (L_, \l -> case l of [] -> True ; _ -> False))
    , ("F*", (L_, \l -> case l of [] -> True ; _ -> False))
    , ("B", (L_, \l -> case l of [] -> True ; _ -> False))
    , ("B*", (L_, \l -> case l of [] -> True ; _ -> False))
    , ("b", (L_, \l -> case l of [] -> True ; _ -> False))
    , ("b*", (L_, \l -> case l of [] -> True ; _ -> False))
    , ("n", (L_, \l -> case l of [] -> True ; _ -> False))
    , ("W", (L_, \l -> case l of [] -> True ; _ -> False))
    , ("W*", (L_, \l -> case l of [] -> True ; _ -> False))
 instance Show ColorOperator where
 instance Show TextOperator where
  show Td = "Td"
  show TD = "TD"
  show Tm = "Tm"
  show Tstar = "T*"
  show TJ = "TJ"
  show Tj = "Tj"
  show Quote = "'"
  show DQuote = "\""
  show Tc = "Tc"
  show Tw = "Tw"
  show Tz = "Tz"
  show TL = "TL"
  show Tf = "Tf"
  show Tr = "Tr"
  show Ts = "Ts"
 -}
 stateOperator :: OperatorTable StateOperator
 stateOperator = Map.fromList $ (\(op, checker) -> (code op, (op, checker))) <$> [
      (C_m, \l -> case l of [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
    , (W_, \l -> case l of [Raw _] -> True ; _ -> False)
    , (J, \l -> case l of [Raw _] -> True ; _ -> False)
    , (J_, \l -> case l of [Raw _] -> True ; _ -> False)
    , (M, \l -> case l of [Raw _] -> True ; _ -> False)
    , (D_, \l -> case l of [Raw _, Raw _] -> True ; _ -> False)
    , (R_i, \l -> case l of [Raw _] -> True ; _ -> False)
    , (I_, \l -> case l of [Raw _] -> True ; _ -> False)
    , (G_s, \l -> case l of [Typed (NameObject _)] -> True ; _ -> False)
  ]
 pathOperator :: OperatorTable PathOperator
 pathOperator = Map.fromList $ (\(op, checker) -> (code op, (op, checker))) <$> [
      (M_, \l -> case l of [Raw _, Raw _] -> True ; _ -> False)
    , (L_, \l -> case l of [Raw _, Raw _] -> True ; _ -> False)
    , (C_, \l -> case l of [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
    , (V_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
    , (Y_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
    , (H_, \l -> case l of [] -> True ; _ -> False)
    , (R_e, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
    , (S, \l -> case l of [] -> True ; _ -> False)
    , (S_, \l -> case l of [] -> True ; _ -> False)
    , (F_, \l -> case l of [] -> True ; _ -> False)
    , (F, \l -> case l of [] -> True ; _ -> False)
    , (Fstar, \l -> case l of [] -> True ; _ -> False)
    , (B, \l -> case l of [] -> True ; _ -> False)
    , (Bstar, \l -> case l of [] -> True ; _ -> False)
    , (B_, \l -> case l of [] -> True ; _ -> False)
    , (B_star, \l -> case l of [] -> True ; _ -> False)
    , (N_, \l -> case l of [] -> True ; _ -> False)
    , (W, \l -> case l of [] -> True ; _ -> False)
    , (Wstar, \l -> case l of [] -> True ; _ -> False)
  ]
 colorOperator :: OperatorTable ColorOperator
 colorOperator = Map.fromList $ (\(op, checker) -> (code op, (op, checker))) <$> [
      (CS, \l -> case l of [Typed (NameObject _)] -> True ; _ -> False)
    , (C_s, \l -> case l of [Typed (NameObject _)] -> True ; _ -> False)
    , (SC, \_ -> True)
    , (SCN, \_ -> True)
    , (S_c, \_ -> True)
    , (S_cn, \_ -> True)
    , (G, \l -> case l of [Raw _] -> True ; _ -> False)
    , (G_, \l -> case l of [Raw _] -> True ; _ -> False)
    , (RG, \l -> case l of [Raw _, Raw _, Raw _] -> True ; _ -> False)
    , (R_g, \l -> case l of [Raw _, Raw _, Raw _] -> True ; _ -> False)
    , (K, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
    , (K_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
  ]
 {-
 stateOperator (Cm, [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _]) = True
 stateOperator (W, [Raw _]) = True
 stateOperator (J, [Raw _]) = True
 stateOperator (J_, [Raw _]) = True
 stateOperator (M, [Raw _]) = True
 stateOperator (D, [Raw _, Raw _]) = True
 stateOperator (Ri, [Raw _]) = True
 stateOperator (I, [Raw _]) = True
 stateOperator (Gs, [Typed (NameObject _)]) = True
 stateOperator _ = False
 -}
 textOperator :: OperatorTable TextOperator
 textOperator = Map.fromList $ (\(op, checker) -> (code op, (op, checker))) <$> [
      (Td, \l -> case l of [Raw _, Raw _] -> True ; _ -> False)
    , (TD, \l -> case l of [Raw _, Raw _] -> True ; _ -> False)
    , (Tm, \l -> case l of [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
    , (Tstar, \l -> case l of [] -> True ; _ -> False)
    , (TJ, \l -> case l of [Typed (Array _)] -> True ; _ -> False)
    , (Tj, \l -> case l of [Typed (StringObject _)] -> True ; _ -> False)
    , (Quote, \l -> case l of [Typed (StringObject _)] -> True ; _ -> False)
    , (DQuote, \l -> case l of [Typed (StringObject _)] -> True ; _ -> False)
    , (Tc, \l -> case l of [Raw _] -> True ; _ -> False)
    , (Tw, \l -> case l of [Raw _] -> True ; _ -> False)
    , (Tz, \l -> case l of [Raw _] -> True ; _ -> False)
    , (TL, \l -> case l of [Raw _] -> True ; _ -> False)
    , (Tf, \l -> case l of [Typed (NameObject _), Raw _] -> True ; _ -> False)
    , (Tr, \l -> case l of [Raw _] -> True ; _ -> False)
    , (Ts, \l -> case l of [Raw _] -> True ; _ -> False)
  ]
  {-
 textOperator (Td, [Raw _, Raw _]) = True
 textOperator (TD, [Raw _, Raw _]) = True
 textOperator (Tm, [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _]) = True
 textOperator (Tstar, []) = True
 textOperator (TJ, [Typed (Array _)]) = True
 textOperator (Tj, [Typed (StringObject _)]) = True
 textOperator (Quote, [Typed (StringObject _)]) = True
 textOperator (DQuote, [Typed (StringObject _)]) = True
 textOperator (Tc, [Raw _]) = True
 textOperator (Tw, [Raw _]) = True
 textOperator (Tz, [Raw _]) = True
 textOperator (TL, [Raw _]) = True
 textOperator (Tf, [Typed (NameObject _), Raw _]) = True
 textOperator (Tr, [Raw _]) = True
 textOperator (Ts, [Raw _]) = True
 textOperator _ = False
 -}
 type ArgumentStackParser m = (MonadState [Argument] m, MonadParser m)
 --type Operator a = (Bounded a, Enum a, Show a)
 type OperatorTable a = Map ByteString (TypeChecker a)
 type TypeChecker a = (a, [Argument] -> Bool)
 parseShowable :: (Show a, MonadParser m) => a -> m a
 parseShowable textOp = string (pack $ show textOp) *> return textOp
 callChunk :: MonadParser m => OperatorTable a -> m (Either (TypeChecker a) Argument)
 callChunk table =
    (Right <$> choice [stringArg, nameArg, arrayArg])
  <|> operatorOrRawArg
  <?> "call chunk"
  where
    operatorOrRawArg = do
      chunk <- takeAll1 regular <* blank
      case table !? chunk of
        Nothing -> return . Right $ Raw chunk
        Just typeChecker -> return $ Left typeChecker
 stackParser :: (ArgumentStackParser m, Operator a) => OperatorTable a -> m (Call a)
 stackParser table = either popCall push =<< (callChunk table)
  where
    push arg = modify (arg:) >> stackParser table
    popCall (operator, predicate) = do
      arguments <- reverse <$> get
      let call = (operator, arguments)
      if predicate arguments then return call else fail (unpack $ code operator)
 a :: (Operator a, MonadParser m) => OperatorTable a -> m (Call a)
 a table = evalStateT (stackParser table) []
 argument :: MonadParser m => m Argument
 argument = Raw <$> takeAll1 regular <* blank
 arrayArg :: MonadParser m => m Argument
 arrayArg = Typed . Array <$> array <* blank
 nameArg :: MonadParser m => m Argument
 nameArg = Typed . NameObject <$> name <* blank
 stringArg :: MonadParser m => m Argument
 stringArg = Typed . StringObject <$> stringObject <* blank
 type ParserWithFont = ReaderT FontSet (Parser Font)
 pageContents :: FontSet -> ByteString -> Either String [Text]
 pageContents fontSet input =
  evalParser (runReaderT (page) fontSet) emptyFont input
 several :: MonadParser m => m [a] -> m [a]
 several p = concat <$> (p `sepBy` blank)
 page :: ParserWithFont [Text]
 page = several (graphicState <|> text) <* blank <* endOfInput <?> "Text page contents"
 graphicState :: ParserWithFont [Text]
 graphicState =
  string "q" *> blank *> insideQ <* blank <* string "Q" <?> "Graphic state"
  where
    insideQ = several (command <|> graphicState <|> text)
    ignore x = a x *> return []
    command =
      ignore stateOperator <|> ignore pathOperator <|> ignore colorOperator
 text :: ParserWithFont [Text]
 text =
  string "BT" *> blank *> commands <* blank <* string "ET" <?> "Text operators"
  where
    commands = several (a textOperator >>= runOperator)
 runOperator :: Call TextOperator -> ParserWithFont [Text]
 runOperator (Tf, [Typed (NameObject fontName), _]) =
  asks (! fontName) >>= put >> return []
 runOperator (Tstar, []) = return ["\n"]
 runOperator (TJ, [Typed (Array arrayObject)]) =
  replicate 1 <$> foldM appendText "" arrayObject
  where
    appendText bs (StringObject outputString) =
      mappend bs <$> decodeString outputString
    appendText bs _ = return bs
 runOperator (Tj, [Typed (StringObject outputString)]) =
  replicate 1 <$> decodeString outputString
 runOperator (Quote, [Typed (StringObject outputString)]) =
  (\bs -> ["\n", bs]) <$> decodeString outputString
 runOperator (DQuote, [Typed (StringObject outputString)]) =
  (\bs -> ["\n", bs]) <$> decodeString outputString
 runOperator _ = return []
 decodeString :: (MonadFail m, MonadState Font m) => StringObject -> m Text
 decodeString input = do
  font <- get
  either fail return . font $ toByteString input
Author	SHA1	Message	Date
Tissevert	1a25307c8c	Minor but strict improvement : remove the general implementation of <?> for Alternative	2020-02-12 17:48:26 +01:00
Tissevert	919f640443	Merge branch 'extract-text' into navigation	2020-02-12 17:35:56 +01:00
Tissevert	ae938acc02	Merge branch 'main' into extract-text	2020-02-12 17:34:56 +01:00
Tissevert	32f9866106	Use peek to improve directObject parser avoiding a large <\|> disjunction	2020-02-12 17:34:27 +01:00
Tissevert	eb4d76002c	Finish the split of Navigation out of Page, generalize the use of MonadFail with a custom Error monad (~= Either String)	2020-02-11 22:41:46 +01:00
Tissevert	af994cb50c	WIP: in the process of migrating to Object.Navigation in Pages, still unsure how to manage simple Content parsing and efficient font loading (+ giving a way to edit Contents)	2020-02-11 17:59:15 +01:00
Tissevert	704d7a7fcf	It turns out Output.concat wasn't necessary, OBuilder seems already is a Monoid so mconcat works (that fact was used in the very implementation of concat…)	2020-02-11 17:36:29 +01:00
Tissevert	11647eb4eb	Implement output for Content streams	2020-02-11 17:26:47 +01:00
Tissevert	aed7af376a	WIP: still trying to figure things out, moved to a separate submodule for Navigation, proper naming is hell	2020-02-11 08:29:08 +01:00
Tissevert	e77bbbcda9	WIP: start moving some navigation-related routines from Pages into Object directly and generalize them to multi-component to allow easier browsing	2020-02-10 17:43:04 +01:00
Tissevert	195446e653	Allow resources with no /Font field, they won't cause any problem as long as no call to Tf (to load a font) is made	2020-02-10 17:41:44 +01:00
Tissevert	9f1b1afafe	Implement Text rendering from parsed Content	2020-02-10 10:54:44 +01:00
Tissevert	20466c4f13	WIP: Clean code parsing «pages» (now Content), separated from text rendering (will be reimplemented as an upper layer, also providing modification as stream filters) — Page is also forgotten for now, will need a big improvement in Object navigation	2020-02-09 22:42:57 +01:00
Tissevert	325250383a	Add support for fonts and implement MacRomanEncoding	2020-02-08 08:15:32 +01:00
Tissevert	c48ab22808	Forgot some useless parentheses when playing with operator precedences	2020-02-04 17:05:15 +01:00
Tissevert	a2b66ac6d6	Generalize the getFont function because some /Resources have a direct dictionary as value for their /Font property	2020-02-04 17:04:42 +01:00
Tissevert	cefb08ee50	Going a step further in «optimization» (slowing it even more…) by replacing choice by a search in a Map	2019-11-30 21:46:22 +01:00
Tissevert	afbbcbffc5	Finish implementing the new stack-based call parser	2019-11-30 12:39:40 +01:00
Tissevert	8373bd1ea0	Removing +x permission on getText source that shouldn't ever have been set	2019-11-29 19:07:54 +01:00
Tissevert	bac08446dd	WIP: starting to fix this criminally inefficient parser for PDF's postfix-operator instructions	2019-11-29 17:42:57 +01:00
Tissevert	f9f799c59b	Take the dirty code of «getText» and turn it into a relatively clean module exposing pages, that can be retrieved all at once or by page number (numbered human-style, starting from 1)	2019-11-29 11:51:35 +01:00
Tissevert	08a9717b3a	Get rid of wrapper PageContents structure returned by PageContent in the PDF.Text module (and return directly [ByteString] instead)	2019-11-29 11:48:28 +01:00
Tissevert	42a02808c1	Merge branch 'main' into extract-text	2019-11-27 18:05:47 +01:00
Tissevert	c9f050e64b	Remove deprecated debug script and forgotten comments to bypass the selective export of Text module	2019-10-14 10:17:15 +02:00
Tissevert	3a3e1533b4	Clean ByteString types to identify when a ByteString contains the representation of an integer in a given base and fix the last remaining PDF string (un)escaping issue	2019-10-14 10:17:15 +02:00
Tissevert	a96e36ec5a	Fix error silently discarding code ranges, make sure ByteString intervals are created with the correct byte length and decode utf16BE encoded values in single-value ranges	2019-10-14 10:17:15 +02:00
Tissevert	d07c286f8e	Clean exported ByteString custom functions	2019-10-14 10:17:15 +02:00
Tissevert	7a15113285	Try and re-implement string decoding — compiles but now fails to decode any string	2019-10-14 10:17:15 +02:00
Tissevert	36d7f9b819	Still debugging, broke pretty much everything and finally implementing a proper coderange parsing for CMap because apparently that's necessary	2019-10-14 10:17:15 +02:00
Tissevert	b8ca7281aa	Fix parsing errors forgetting to make sure there's a space after special operator arguments like names and stringObjects	2019-10-14 10:17:15 +02:00
Tissevert	32efdcdd6b	Try and fix stuff by generalizing a signature to ease debugging and add parenthesis which I think should have been here all along	2019-10-14 10:17:15 +02:00
Tissevert	3b59fd0c61	Separate CMap and Text in two distinct modules	2019-10-14 10:17:15 +02:00
Tissevert	0374b72920	Finish implementing reading, still bugs to investigate	2019-10-14 10:17:15 +02:00
Tissevert	1dd22c3889	Going to try with Text, naturally handling UTF-16 but will still have to parse «int codes» manually from strings	2019-10-14 10:17:15 +02:00
Tissevert	98d029c4d4	In complete debug, more or less implemented CMap parsing but apparently it uses UTF16 ?!	2019-10-14 10:17:15 +02:00
Tissevert	c349d9b4c2	Don't trust serializer, they have nothing todo with a reasonable binary encoding	2019-10-14 10:17:15 +02:00
Tissevert	e7484ef536	Completely lost, the same old Char8 / Word8 again, implemented all the text reading, still needing a couple details to parse CMaps	2019-10-14 10:17:15 +02:00
Tissevert	f9e5683bf4	WIP: Use previous changes to start implementing font caching and text parsing (still very broken, doesn't compile)	2019-10-14 10:17:15 +02:00
Tissevert	b8eb9e6856	Generalize the Parser type into a MonadParser class to use with MonadTrans and remove redundant code already defined in Applicative or Attoparsec	2019-10-14 10:17:15 +02:00
Tissevert	66d315b7fe	Reflect the distinction between eval and run from State monad into the Parser module	2019-10-14 10:17:15 +02:00
Tissevert	51db57ec67	Ugly commit, breaks everything, still trying to figure a grammar for text	2019-10-14 10:17:15 +02:00
Tissevert	6f3c159ea7	Adding a module to implement text reading and a demo program to go with it	2019-10-14 10:17:15 +02:00