Using toEnum to convert from Int to Int ? Surely a left-over from some time when it was a different type

Why did I implement this overly complicated lift by hand again ?
Implement MacRomanEncoding for real following their own vendor file https://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/ROMAN.TXT
2020-02-08 08:10:32 +01:00 · 2020-02-07 13:08:10 +01:00 · 2020-02-07 10:59:28 +01:00 · 2020-02-07 10:49:16 +01:00 · 2020-02-06 16:54:27 +01:00 · 2020-02-06 16:53:06 +01:00
15 changed files with 1068 additions and 113 deletions
--- a/Hufflepdf.cabal
+++ b/Hufflepdf.cabal
@ -17,19 +17,28 @@ cabal-version:       >=1.10
 library
  exposed-modules:     PDF
                     , PDF.CMap
                     , PDF.EOL
                     , PDF.Object
                     , PDF.Output
                     , PDF.Pages
                     , PDF.Parser
                     , PDF.Text
                     , PDF.Update
  other-modules:       Data.ByteString.Char8.Util
                     , PDF.Encoding
                     , PDF.Encoding.MacRoman
                     , PDF.Body
-                     , PDF.Parser
+                     , PDF.Font
  -- other-extensions:    
  build-depends:       attoparsec
                     , base >=4.9 && <4.13
                     , bytestring
                     , containers
                     , mtl
                     , text
                     , utf8-string
                     , zlib
  hs-source-dirs:      src
  ghc-options:         -Wall
  default-language:    Haskell2010
@ -51,3 +60,15 @@ executable             getObj
                     , zlib
  ghc-options:         -Wall
  default-language:    Haskell2010
 executable             getText
  main-is:             examples/getText.hs
  build-depends:       base
                     , bytestring
                     , containers
                     , Hufflepdf
                     , mtl
                     , text
                     , zlib
  ghc-options:         -Wall
  default-language:    Haskell2010
--- a/examples/getText.hs
+++ b/examples/getText.hs
@ -0,0 +1,38 @@
 import qualified Data.ByteString.Char8 as BS (readFile)
 import qualified Data.Map as Map (toList)
 import qualified Data.Text.IO as Text (putStrLn)
 import PDF (Document(..), parseDocument)
 import PDF.Object (Content)
 import PDF.Pages (Page(..), get, getAll)
 import PDF.Update (unify)
 import System.Environment (getArgs)
 import System.Exit (die)
 import System.IO (BufferMode(..), hSetBuffering, stdout)
 onDoc :: FilePath -> (Content -> Either String a) -> IO a
 onDoc inputFile f = do
  content <- fmap (unify . updates) . parseDocument <$> BS.readFile inputFile
  case content >>= f of
    Left someError -> die someError
    Right value -> return value
 displayPage :: Page -> IO ()
 displayPage = mapM_ Text.putStrLn . contents
 wholeDoc :: FilePath -> IO ()
 wholeDoc inputFile = do
  pages <- onDoc inputFile getAll
  mapM_ (displayPage . snd) $ Map.toList pages
 singlePage :: FilePath -> Int -> IO ()
 singlePage inputFile pageNumber =
  onDoc inputFile (`get` pageNumber) >>= displayPage
 main :: IO ()
 main = do
  hSetBuffering stdout LineBuffering
  args <- getArgs
  case args of
    [inputFile] -> wholeDoc inputFile
    [inputFile, pageNumber] -> singlePage inputFile (read pageNumber)
    _ -> die "Syntax: getText INPUT_FILE [PAGE_NUMBER]"
--- a/src/Data/ByteString/Char8/Util.hs
+++ b/src/Data/ByteString/Char8/Util.hs
@ -1,16 +1,91 @@
 module Data.ByteString.Char8.Util (
-      previous
+      B16Int(..)
    , B256Int(..)
    , b8ToInt
    , b16ToBytes
    , b16ToInt
    , b256ToInt
    , intToB256
    , previous
    , subBS
    , toBytes
    , unescape
    , utf16BEToutf8
  ) where
-import Data.ByteString.Char8 (ByteString)
+import Data.ByteString (ByteString, snoc)
-import qualified Data.ByteString.Char8 as BS (drop, index, take)
+import qualified Data.ByteString as BS (empty, foldl, length, pack, singleton, splitAt)
 import qualified Data.ByteString.Char8 as Char8 (
    cons, drop, index, splitAt, take, uncons, unpack
  )
 import Data.Text (Text)
 import Data.Text.Encoding (decodeUtf16BE)
 import Prelude hiding (length)
 import Text.Printf (printf)
 newtype B8Int = B8Int ByteString deriving Show
 newtype B16Int = B16Int ByteString deriving Show
 newtype B256Int = B256Int ByteString deriving Show
 previous :: Char -> Int -> ByteString -> Int
 previous char position byteString
-  | BS.index byteString position == char = position
+  | Char8.index byteString position == char = position
  | otherwise = previous char (position - 1) byteString
 subBS :: Int -> Int -> ByteString -> ByteString
-subBS offset length = BS.take length . BS.drop offset
+subBS offset length = Char8.take length . Char8.drop offset
 intToB256 :: Int -> B256Int
 intToB256 n
  | n < 0x100 = B256Int . BS.singleton $ toEnum n
  | otherwise =
    let B256Int begining = intToB256 (n `div` 0x100) in
    B256Int $ begining `snoc` (toEnum (n `mod` 0x100))
 b256ToInt :: B256Int -> Int
 b256ToInt (B256Int n) = BS.foldl (\k w -> 0x100*k + fromEnum w) 0 n
 toBytes :: Int -> Int -> ByteString
 toBytes 0 _ = BS.empty
 toBytes size n =
  (toBytes (size - 1) (n `div` 0x100)) `snoc` (toEnum (n `mod` 0x100))
 b16ToBytes :: B16Int -> ByteString
 b16ToBytes (B16Int n) = BS.pack . fmap b16ToInt $ pairDigits n
  where
    pairDigits s =
      case BS.length s of
        0 -> []
        1 -> [B16Int s]
        _ ->
          let (twoHexDigits, rest) = BS.splitAt 2 s in
          (B16Int $ twoHexDigits):(pairDigits rest)
 fromBase :: (Num a, Read a) => Char -> ByteString -> a
 fromBase b = read . printf "0%c%s" b . Char8.unpack
 b16ToInt :: (Num a, Read a) => B16Int -> a
 b16ToInt (B16Int n) = fromBase 'x' n
 b8ToInt :: (Num a, Read a) => B8Int -> a
 b8ToInt (B8Int n) = fromBase 'o' n
 unescape :: ByteString -> ByteString
 unescape escapedBS =
  case Char8.uncons escapedBS of
    Nothing -> BS.empty
    Just ('\\', s) -> unescapeChar s
    Just (c, s) -> Char8.cons c (unescape s)
  where
    unescapeChar s =
      case Char8.uncons s of
        Nothing -> BS.empty
        Just (c, s')
          | c `elem` "()" -> Char8.cons c (unescape s')
          | c `elem` "nrtbf" -> Char8.cons (read (printf "'\\%c'" c)) (unescape s')
          | c `elem` ['0'..'7'] -> fromOctal (Char8.splitAt 3 s)
          | otherwise -> Char8.cons c (unescape s')
    fromOctal (code, s) = Char8.cons (toEnum $ b8ToInt (B8Int code)) (unescape s)
 utf16BEToutf8 :: ByteString -> Text
 utf16BEToutf8 = decodeUtf16BE
--- a/src/PDF.hs
+++ b/src/PDF.hs
@ -21,7 +21,7 @@ import PDF.Object (
  )
 import qualified PDF.Output as Output (render, line)
 import PDF.Output (Output(..))
-import PDF.Parser (Parser, runParser, string, takeAll)
+import PDF.Parser (Parser, evalParser, string, takeAll)
 import Text.Printf (printf)
 data Document = Document {
@ -83,7 +83,7 @@ findNextSection offset input =
 readStructures :: Int -> ByteString -> Either String [InputStructure]
 readStructures startXref input =
-  runParser structure () (BS.drop startXref input) >>= stopOrFollow
+  evalParser structure () (BS.drop startXref input) >>= stopOrFollow
  where
    stopOrFollow s@(Structure {trailer}) =
      case Map.lookup (Name "Prev") trailer of
@ -96,7 +96,7 @@ readStructures startXref input =
 parseDocument :: ByteString -> Either String Document
 parseDocument input = do
-  (pdfVersion, eolStyle) <- runParser ((,) <$> version <*> EOL.parser) () input
+  (pdfVersion, eolStyle) <- evalParser ((,) <$> version <*> EOL.parser) () input
  startXref <- readStartXref eolStyle input
  structuresRead <- readStructures startXref input
  let updates = populate input <$> structuresRead
--- a/src/PDF/Body.hs
+++ b/src/PDF/Body.hs
@ -6,6 +6,7 @@ module PDF.Body (
 import Control.Applicative ((<|>))
 import Control.Monad.State (get, gets, modify)
 import Data.Attoparsec.ByteString.Char8 (option)
 import Data.ByteString.Char8 (ByteString)
 import qualified Data.ByteString.Char8 as BS (cons, drop, unpack)
 import Data.Map ((!))
@ -18,7 +19,7 @@ import PDF.Object (
    , blank, dictionary, directObject, integer, line
  )
 import PDF.Output (ObjectId(..), Offset(..))
-import PDF.Parser (Parser, (<?>), block, char, on, option, runParser, takeAll)
+import PDF.Parser (Parser, (<?>), block, char, evalParser, on, takeAll)
 data UserState = UserState {
      input :: ByteString
@ -109,7 +110,7 @@ occurrence =
 populate :: ByteString -> InputStructure -> Content
 populate input structure =
  let bodyInput = BS.drop (startOffset structure) input in
-  case runParser recurseOnOccurrences initialState bodyInput of
+  case evalParser recurseOnOccurrences initialState bodyInput of
    Left _ -> Content {occurrences = [], objects = Map.empty, docStructure}
    Right finalState ->
      let Flow {occurrencesStack, tmpObjects} = flow finalState in
--- a/src/PDF/CMap.hs
+++ b/src/PDF/CMap.hs
@ -0,0 +1,159 @@
 {-# LANGUAGE NamedFieldPuns #-}
 {-# LANGUAGE OverloadedStrings #-}
 {-# LANGUAGE FlexibleInstances #-}
 {-# LANGUAGE TypeSynonymInstances #-}
 module PDF.CMap (
      CMap
    , CMappers
    , CRange(..)
    , cMap
    , emptyCMap
    , matches
  ) where
 import Control.Applicative ((<|>), many)
 import Control.Monad.State (modify)
 import Data.Attoparsec.ByteString.Char8 (count)
 import Data.ByteString (ByteString)
 import qualified Data.ByteString as BS (drop, length, null, take)
 import Data.ByteString.Char8 (unpack)
 import Data.ByteString.Char8.Util (
    B16Int(..), b16ToBytes, b16ToInt, toBytes, utf16BEToutf8
  )
 import Data.Map (Map, union)
 import qualified Data.Map as Map (
    adjust, empty, fromList, insertWith, lookup, toList
  )
 import Data.Text (Text)
 import qualified PDF.EOL as EOL (charset, parser)
 import PDF.Font (Font)
 import PDF.Object (
      DirectObject(..), Name, StringObject(..)
    , blank, directObject, integer, line, stringObject
  )
 import PDF.Parser (MonadParser, Parser, runParser, takeAll)
 type CMappers = Map Name CMap
 type Mapping = Map ByteString Text
 data CRange = CRange {
      fromSequence :: ByteString
    , toSequence :: ByteString
    , mapping :: Mapping
  } deriving Show
 type RangeSize = Int
 type CMap = Map RangeSize [CRange]
 toFont :: CMap -> Font
 toFont aCMap input
  | BS.null input = Right ""
  | otherwise = do
    (output, remainingInput) <- trySizes input $ Map.toList aCMap
    mappend output <$> toFont aCMap remainingInput
  where
    trySizes s [] = Left $ "No matching code found in font for " ++ unpack s
    trySizes s ((size, cRanges):others) =
      let prefix = BS.take size s in
      case tryRanges prefix cRanges of
        Nothing -> trySizes s others
        Just outputSequence -> Right (outputSequence, BS.drop size s)
    tryRanges :: ByteString -> [CRange] -> Maybe Text
    tryRanges _ [] = Nothing
    tryRanges prefix ((CRange {mapping}):cRanges) =
      case Map.lookup prefix mapping of
        Nothing -> tryRanges prefix cRanges
        outputSequence -> outputSequence
 emptyCMap :: CMap
 emptyCMap = Map.empty
 matches :: ByteString -> CRange -> Bool
 matches code (CRange {fromSequence, toSequence}) =
  fromSequence <= code && code <= toSequence
 cMap :: ByteString -> Either String Font
 cMap = fmap (toFont . snd) <$> runParser
  (many (codeRanges <|> cMapRange <|> cMapChar <|> ignoredLine))
  emptyCMap
  where
    ignoredLine =
      takeAll (not . (`elem` EOL.charset)) *> EOL.parser *> return ()
 codeRanges :: Parser CMap ()
 codeRanges = do
  size <- integer <* line "begincodespacerange"
  mapM_ createMapping =<< count size codeRange
  line "endcodespacerange"
  where
    codeRange =
      (,) <$> stringObject <* blank <*> stringObject <* EOL.parser
 createMapping :: (StringObject, StringObject) -> Parser CMap ()
 createMapping (Hexadecimal from, Hexadecimal to) = modify $
  Map.insertWith (++) size [CRange {fromSequence, toSequence, mapping}]
  where
    fromSequence = b16ToBytes from
    size = BS.length fromSequence
    toSequence = b16ToBytes to
    mapping = Map.empty
 createMapping _ = return ()
 cMapRange :: Parser CMap ()
 cMapRange = do
  size <- integer <* line "beginbfrange"
  mapM_ saveMapping =<< count size rangeMapping
  line "endbfrange"
  where
    rangeMapping = (,,) 
      <$> (stringObject <* blank)
      <*> (stringObject <* blank)
      <*> directObject <* EOL.parser
      >>= mapFromTo
 saveMapping :: [(ByteString, Text)] -> Parser CMap ()
 saveMapping [] = return ()
 saveMapping assoc@((code, _):_) = modify $ Map.adjust insertCRange mappingSize
  where
    newMapping = Map.fromList assoc
    mappingSize = BS.length code
    appendMapping cRange =
      cRange {mapping = mapping cRange `union` newMapping}
    insertCRange = fmap (\cRange ->
        if code `matches` cRange then appendMapping cRange else cRange
      )
 cMapChar :: Parser CMap ()
 cMapChar = do
  size <- integer <* line "beginbfchar"
  saveMapping =<< count size charMapping <* line "endbfchar"
  where
    charMapping =
      (,) <$> stringObject <* blank <*> stringObject <* EOL.parser
      >>= pairMapping
 between :: B16Int -> B16Int -> [ByteString]
 between from@(B16Int s) to =
  let size = BS.length s `div` 2 in
  toBytes size <$> [b16ToInt from .. b16ToInt to]
 startFrom :: B16Int -> [ByteString]
 startFrom from@(B16Int s) =
  let size = BS.length s `div` 2 in
  toBytes size <$> [b16ToInt from .. ]
 mapFromTo :: MonadParser m => (StringObject, StringObject, DirectObject) -> m [(ByteString, Text)]
 mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) =
  return $ zip (between from to) (utf16BEToutf8 <$> startFrom dstFrom)
 mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) =
  zip (between from to) <$> (mapM dstByteString dstPoints)
  where
    dstByteString (StringObject (Hexadecimal dst)) =
      return . utf16BEToutf8 $ b16ToBytes dst
    dstByteString _ = fail "Invalid for a replacement string"
 mapFromTo _ = fail "invalid range mapping found"
 pairMapping :: MonadParser m => (StringObject, StringObject) -> m (ByteString, Text)
 pairMapping (Hexadecimal from, Hexadecimal to) =
  return (b16ToBytes  from, utf16BEToutf8 $ b16ToBytes  to)
 pairMapping _ = fail "invalid pair mapping found"
--- a/src/PDF/EOL.hs
+++ b/src/PDF/EOL.hs
@ -6,14 +6,14 @@ module PDF.EOL (
  ) where
 import Control.Applicative ((<|>))
-import PDF.Parser (Parser, string)
+import PDF.Parser (MonadParser, string)
 data Style = CR | LF | CRLF deriving Show
 charset :: String
 charset = "\r\n"
-parser :: Parser s Style
+parser :: MonadParser m => m Style
 parser =
    (string "\r\n" >> return CRLF)
  <|> (string "\r" >> return CR)
--- a/src/PDF/Encoding.hs
+++ b/src/PDF/Encoding.hs
@ -0,0 +1,10 @@
 module PDF.Encoding (
    encoding
  ) where
 import PDF.Encoding.MacRoman (macRomanEncoding)
 import PDF.Font (Font)
 encoding :: String -> Either String Font
 encoding "MacRomanEncoding" = Right macRomanEncoding
 encoding s = Left $ "Unknown encoding " ++ s
--- a/src/PDF/Encoding/MacRoman.hs
+++ b/src/PDF/Encoding/MacRoman.hs
@ -0,0 +1,141 @@
 module PDF.Encoding.MacRoman (
    macRomanEncoding
  ) where
 import Data.ByteString.Char8 (unpack)
 import Data.Text (pack)
 import PDF.Font (Font)
 macRomanEncoding :: Font
 macRomanEncoding = Right . pack . fmap decode . unpack
 decode :: Char -> Char
 decode '\x80' = '\x00C4' -- LATIN CAPITAL LETTER A WITH DIAERESIS
 decode '\x81' = '\x00C5' -- LATIN CAPITAL LETTER A WITH RING ABOVE
 decode '\x82' = '\x00C7' -- LATIN CAPITAL LETTER C WITH CEDILLA
 decode '\x83' = '\x00C9' -- LATIN CAPITAL LETTER E WITH ACUTE
 decode '\x84' = '\x00D1' -- LATIN CAPITAL LETTER N WITH TILDE
 decode '\x85' = '\x00D6' -- LATIN CAPITAL LETTER O WITH DIAERESIS
 decode '\x86' = '\x00DC' -- LATIN CAPITAL LETTER U WITH DIAERESIS
 decode '\x87' = '\x00E1' -- LATIN SMALL LETTER A WITH ACUTE
 decode '\x88' = '\x00E0' -- LATIN SMALL LETTER A WITH GRAVE
 decode '\x89' = '\x00E2' -- LATIN SMALL LETTER A WITH CIRCUMFLEX
 decode '\x8A' = '\x00E4' -- LATIN SMALL LETTER A WITH DIAERESIS
 decode '\x8B' = '\x00E3' -- LATIN SMALL LETTER A WITH TILDE
 decode '\x8C' = '\x00E5' -- LATIN SMALL LETTER A WITH RING ABOVE
 decode '\x8D' = '\x00E7' -- LATIN SMALL LETTER C WITH CEDILLA
 decode '\x8E' = '\x00E9' -- LATIN SMALL LETTER E WITH ACUTE
 decode '\x8F' = '\x00E8' -- LATIN SMALL LETTER E WITH GRAVE
 decode '\x90' = '\x00EA' -- LATIN SMALL LETTER E WITH CIRCUMFLEX
 decode '\x91' = '\x00EB' -- LATIN SMALL LETTER E WITH DIAERESIS
 decode '\x92' = '\x00ED' -- LATIN SMALL LETTER I WITH ACUTE
 decode '\x93' = '\x00EC' -- LATIN SMALL LETTER I WITH GRAVE
 decode '\x94' = '\x00EE' -- LATIN SMALL LETTER I WITH CIRCUMFLEX
 decode '\x95' = '\x00EF' -- LATIN SMALL LETTER I WITH DIAERESIS
 decode '\x96' = '\x00F1' -- LATIN SMALL LETTER N WITH TILDE
 decode '\x97' = '\x00F3' -- LATIN SMALL LETTER O WITH ACUTE
 decode '\x98' = '\x00F2' -- LATIN SMALL LETTER O WITH GRAVE
 decode '\x99' = '\x00F4' -- LATIN SMALL LETTER O WITH CIRCUMFLEX
 decode '\x9A' = '\x00F6' -- LATIN SMALL LETTER O WITH DIAERESIS
 decode '\x9B' = '\x00F5' -- LATIN SMALL LETTER O WITH TILDE
 decode '\x9C' = '\x00FA' -- LATIN SMALL LETTER U WITH ACUTE
 decode '\x9D' = '\x00F9' -- LATIN SMALL LETTER U WITH GRAVE
 decode '\x9E' = '\x00FB' -- LATIN SMALL LETTER U WITH CIRCUMFLEX
 decode '\x9F' = '\x00FC' -- LATIN SMALL LETTER U WITH DIAERESIS
 decode '\xA0' = '\x2020' -- DAGGER
 decode '\xA1' = '\x00B0' -- DEGREE SIGN
 decode '\xA2' = '\x00A2' -- CENT SIGN
 decode '\xA3' = '\x00A3' -- POUND SIGN
 decode '\xA4' = '\x00A7' -- SECTION SIGN
 decode '\xA5' = '\x2022' -- BULLET
 decode '\xA6' = '\x00B6' -- PILCROW SIGN
 decode '\xA7' = '\x00DF' -- LATIN SMALL LETTER SHARP S
 decode '\xA8' = '\x00AE' -- REGISTERED SIGN
 decode '\xA9' = '\x00A9' -- COPYRIGHT SIGN
 decode '\xAA' = '\x2122' -- TRADE MARK SIGN
 decode '\xAB' = '\x00B4' -- ACUTE ACCENT
 decode '\xAC' = '\x00A8' -- DIAERESIS
 decode '\xAD' = '\x2260' -- NOT EQUAL TO
 decode '\xAE' = '\x00C6' -- LATIN CAPITAL LETTER AE
 decode '\xAF' = '\x00D8' -- LATIN CAPITAL LETTER O WITH STROKE
 decode '\xB0' = '\x221E' -- INFINITY
 decode '\xB1' = '\x00B1' -- PLUS-MINUS SIGN
 decode '\xB2' = '\x2264' -- LESS-THAN OR EQUAL TO
 decode '\xB3' = '\x2265' -- GREATER-THAN OR EQUAL TO
 decode '\xB4' = '\x00A5' -- YEN SIGN
 decode '\xB5' = '\x00B5' -- MICRO SIGN
 decode '\xB6' = '\x2202' -- PARTIAL DIFFERENTIAL
 decode '\xB7' = '\x2211' -- N-ARY SUMMATION
 decode '\xB8' = '\x220F' -- N-ARY PRODUCT
 decode '\xB9' = '\x03C0' -- GREEK SMALL LETTER PI
 decode '\xBA' = '\x222B' -- INTEGRAL
 decode '\xBB' = '\x00AA' -- FEMININE ORDINAL INDICATOR
 decode '\xBC' = '\x00BA' -- MASCULINE ORDINAL INDICATOR
 decode '\xBD' = '\x03A9' -- GREEK CAPITAL LETTER OMEGA
 decode '\xBE' = '\x00E6' -- LATIN SMALL LETTER AE
 decode '\xBF' = '\x00F8' -- LATIN SMALL LETTER O WITH STROKE
 decode '\xC0' = '\x00BF' -- INVERTED QUESTION MARK
 decode '\xC1' = '\x00A1' -- INVERTED EXCLAMATION MARK
 decode '\xC2' = '\x00AC' -- NOT SIGN
 decode '\xC3' = '\x221A' -- SQUARE ROOT
 decode '\xC4' = '\x0192' -- LATIN SMALL LETTER F WITH HOOK
 decode '\xC5' = '\x2248' -- ALMOST EQUAL TO
 decode '\xC6' = '\x2206' -- INCREMENT
 decode '\xC7' = '\x00AB' -- LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
 decode '\xC8' = '\x00BB' -- RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
 decode '\xC9' = '\x2026' -- HORIZONTAL ELLIPSIS
 decode '\xCA' = '\x00A0' -- NO-BREAK SPACE
 decode '\xCB' = '\x00C0' -- LATIN CAPITAL LETTER A WITH GRAVE
 decode '\xCC' = '\x00C3' -- LATIN CAPITAL LETTER A WITH TILDE
 decode '\xCD' = '\x00D5' -- LATIN CAPITAL LETTER O WITH TILDE
 decode '\xCE' = '\x0152' -- LATIN CAPITAL LIGATURE OE
 decode '\xCF' = '\x0153' -- LATIN SMALL LIGATURE OE
 decode '\xD0' = '\x2013' -- EN DASH
 decode '\xD1' = '\x2014' -- EM DASH
 decode '\xD2' = '\x201C' -- LEFT DOUBLE QUOTATION MARK
 decode '\xD3' = '\x201D' -- RIGHT DOUBLE QUOTATION MARK
 decode '\xD4' = '\x2018' -- LEFT SINGLE QUOTATION MARK
 decode '\xD5' = '\x2019' -- RIGHT SINGLE QUOTATION MARK
 decode '\xD6' = '\x00F7' -- DIVISION SIGN
 decode '\xD7' = '\x25CA' -- LOZENGE
 decode '\xD8' = '\x00FF' -- LATIN SMALL LETTER Y WITH DIAERESIS
 decode '\xD9' = '\x0178' -- LATIN CAPITAL LETTER Y WITH DIAERESIS
 decode '\xDA' = '\x2044' -- FRACTION SLASH
 decode '\xDB' = '\x20AC' -- EURO SIGN
 decode '\xDC' = '\x2039' -- SINGLE LEFT-POINTING ANGLE QUOTATION MARK
 decode '\xDD' = '\x203A' -- SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
 decode '\xDE' = '\xFB01' -- LATIN SMALL LIGATURE FI
 decode '\xDF' = '\xFB02' -- LATIN SMALL LIGATURE FL
 decode '\xE0' = '\x2021' -- DOUBLE DAGGER
 decode '\xE1' = '\x00B7' -- MIDDLE DOT
 decode '\xE2' = '\x201A' -- SINGLE LOW-9 QUOTATION MARK
 decode '\xE3' = '\x201E' -- DOUBLE LOW-9 QUOTATION MARK
 decode '\xE4' = '\x2030' -- PER MILLE SIGN
 decode '\xE5' = '\x00C2' -- LATIN CAPITAL LETTER A WITH CIRCUMFLEX
 decode '\xE6' = '\x00CA' -- LATIN CAPITAL LETTER E WITH CIRCUMFLEX
 decode '\xE7' = '\x00C1' -- LATIN CAPITAL LETTER A WITH ACUTE
 decode '\xE8' = '\x00CB' -- LATIN CAPITAL LETTER E WITH DIAERESIS
 decode '\xE9' = '\x00C8' -- LATIN CAPITAL LETTER E WITH GRAVE
 decode '\xEA' = '\x00CD' -- LATIN CAPITAL LETTER I WITH ACUTE
 decode '\xEB' = '\x00CE' -- LATIN CAPITAL LETTER I WITH CIRCUMFLEX
 decode '\xEC' = '\x00CF' -- LATIN CAPITAL LETTER I WITH DIAERESIS
 decode '\xED' = '\x00CC' -- LATIN CAPITAL LETTER I WITH GRAVE
 decode '\xEE' = '\x00D3' -- LATIN CAPITAL LETTER O WITH ACUTE
 decode '\xEF' = '\x00D4' -- LATIN CAPITAL LETTER O WITH CIRCUMFLEX
 decode '\xF0' = '\xF8FF' -- Apple logo
 decode '\xF1' = '\x00D2' -- LATIN CAPITAL LETTER O WITH GRAVE
 decode '\xF2' = '\x00DA' -- LATIN CAPITAL LETTER U WITH ACUTE
 decode '\xF3' = '\x00DB' -- LATIN CAPITAL LETTER U WITH CIRCUMFLEX
 decode '\xF4' = '\x00D9' -- LATIN CAPITAL LETTER U WITH GRAVE
 decode '\xF5' = '\x0131' -- LATIN SMALL LETTER DOTLESS I
 decode '\xF6' = '\x02C6' -- MODIFIER LETTER CIRCUMFLEX ACCENT
 decode '\xF7' = '\x02DC' -- SMALL TILDE
 decode '\xF8' = '\x00AF' -- MACRON
 decode '\xF9' = '\x02D8' -- BREVE
 decode '\xFA' = '\x02D9' -- DOT ABOVE
 decode '\xFB' = '\x02DA' -- RING ABOVE
 decode '\xFC' = '\x00B8' -- CEDILLA
 decode '\xFD' = '\x02DD' -- DOUBLE ACUTE ACCENT
 decode '\xFE' = '\x02DB' -- OGONEK
 decode '\xFF' = '\x02C7' -- CARON
 decode c = c             -- The rest is ASCII
--- a/src/PDF/Font.hs
+++ b/src/PDF/Font.hs
@ -0,0 +1,16 @@
 module PDF.Font (
      Font
    , FontSet
    , emptyFont
  ) where
 import Data.ByteString (ByteString)
 import Data.Map (Map)
 import Data.Text (Text)
 import PDF.Object (Name)
 type Font = ByteString -> Either String Text
 type FontSet = Map Name Font
 emptyFont :: Font
 emptyFont _ = Left "No fond loaded"
--- a/src/PDF/Object.hs
+++ b/src/PDF/Object.hs
@ -3,6 +3,7 @@
 {-# LANGUAGE FlexibleInstances #-}
 module PDF.Object (
      Content(..)
    , Dictionary
    , DirectObject(..)
    , Flow(..)
    , IndexedObjects
@ -12,9 +13,11 @@ module PDF.Object (
    , Number(..)
    , Object(..)
    , Occurrence(..)
    , StringObject(..)
    , Structure(..)
    , XRefEntry(..)
    , XRefSection
    , array
    , blank
    , dictionary
    , directObject
@ -22,14 +25,20 @@ module PDF.Object (
    , integer
    , line
    , magicNumber
    , name
    , number
    , regular
    , stringObject
    , structure
    , toByteString
  ) where
-import Control.Applicative ((<|>))
+import Control.Applicative ((<|>), many)
-import Data.ByteString.Char8 (ByteString)
+import Data.Attoparsec.ByteString.Char8 (choice, count, option, sepBy)
-import qualified Data.ByteString.Char8 as BS (
+import Data.ByteString (ByteString)
-    concat, cons, pack, singleton, unpack
+import qualified Data.ByteString as BS (concat)
-  )
+import qualified Data.ByteString.Char8 as Char8 (cons, pack, singleton, unpack)
 import Data.ByteString.Char8.Util (B16Int(..), b16ToBytes, unescape)
 import Data.Map (Map, (!), mapWithKey)
 import qualified Data.Map as Map (
    delete, empty, fromList, lookup, minViewWithKey, toList, union
@ -41,15 +50,11 @@ import PDF.Output (
    , byteString, getObjectId, getOffset, getOffsets, join, newLine
    , saveOffset
  )
-import PDF.Parser (
+import PDF.Parser (MonadParser(..), Parser, (<?>), octDigit, oneOf)
      Parser, (<?>)
    , char, choice, count, decNumber, hexNumber, many, octDigit, oneOf, option
    , sepBy, string, takeAll, takeAll1
  )
 import Text.Printf (printf)
-line :: String -> Parser u ()
+line :: MonadParser m => String -> m ()
-line l = (string (BS.pack l) *> blank *> return ()) <?> printf "line «%s»" l
+line l = (string (Char8.pack l) *> blank *> return ()) <?> printf "line «%s»" l
 magicNumber :: ByteString
 magicNumber = "%PDF-"
@ -60,8 +65,8 @@ eofMarker = "%%EOF"
 whiteSpaceCharset :: String
 whiteSpaceCharset = "\0\t\12 "
-blank :: Parser u ()
+blank :: MonadParser m => m ()
-blank = takeAll (`elem` (EOL.charset ++ whiteSpaceCharset)) *> return ()
+blank = takeAll (`elem` (EOL.charset ++ whiteSpaceCharset)) *> pure ()
 delimiterCharset :: String
 delimiterCharset = "()<>[]{}/%"
@ -69,8 +74,8 @@ delimiterCharset = "()<>[]{}/%"
 regular :: Char -> Bool
 regular = not . (`elem` (EOL.charset ++ whiteSpaceCharset ++ delimiterCharset))
-integer :: (Read a, Num a) => Parser u a
+integer :: (Read a, Num a, MonadParser m) => m a
-integer = read . BS.unpack <$> decNumber <* blank <?> "decimal integer"
+integer = read . Char8.unpack <$> decNumber <* blank <?> "decimal integer"
 -------------------------------------
 --          OBJECTS
@ -81,7 +86,7 @@ type IndexedObjects = Map ObjectId Object
 --
 -- Boolean
 --
-boolean :: Parser u Bool
+boolean :: MonadParser m => m Bool
 boolean =
  (string "true" *> return True) <|> (string "false" *> return False) <?> "boolean"
@ -96,38 +101,42 @@ instance Output Number where
      (n, 0) -> printf "%d" (n :: Int)
      _ -> printf "%f" f
-number :: Parser u Number
+number :: MonadParser m => m Number
-number = Number . read . BS.unpack <$>
+number = Number . read . Char8.unpack <$>
-  (mappend <$> sign <*> (integerPart <|> BS.cons '0' <$> floatPart))
+  (mappend <$> sign <*> (integerPart <|> Char8.cons '0' <$> floatPart))
  <?> "number"
  where
    sign = string "-" <|> option "" (char '+' >> return "")
    integerPart = mappend <$> decNumber <*> option "" floatPart
-    floatPart = BS.cons <$> char '.' <*> (option "0" $ decNumber)
+    floatPart = Char8.cons <$> char '.' <*> (option "0" $ decNumber)
 --
 -- StringObject
 --
-data StringObject = Literal String | Hexadecimal String deriving Show
+data StringObject = Literal ByteString | Hexadecimal B16Int deriving Show
 instance Output StringObject where
-  output (Literal s) = Output.string (printf "(%s)" s)
+  output (Literal s) = Output.string (printf "(%s)" (Char8.unpack s))
-  output (Hexadecimal s) = Output.string (printf "<%s>" s)
+  output (Hexadecimal (B16Int n)) = Output.string (printf "<%s>" (Char8.unpack n))
-stringObject :: Parser u StringObject
+stringObject :: MonadParser m => m StringObject
 stringObject =
-    Literal . BS.unpack <$> (char '(' *> (BS.concat <$> literalString) <* char ')')
+    Literal <$> (char '(' *> (BS.concat <$> literalString) <* char ')')
-  <|> Hexadecimal . BS.unpack <$> (char '<' *> hexNumber <* char '>')
+  <|> Hexadecimal <$> (char '<' *> hexNumber <* char '>')
  <?> "string object (literal or hexadecimal)"
  where
    literalString = many literalStringBlock
    literalStringBlock = takeAll1 normalChar <|> matchingParenthesis <|> escapedChar
    normalChar = not . (`elem` ("\\()" :: String))
    matchingParenthesis =
-      mappend <$> (BS.cons <$> char '(' <*> literalStringBlock) <*> string ")"
+      mappend <$> (Char8.cons <$> char '(' <*> literalStringBlock) <*> string ")"
    escapedChar =
-      BS.cons <$> char '\\' <*> (BS.singleton <$> oneOf "nrtbf()\\" <|> octalCode)
+      Char8.cons <$> char '\\' <*> (Char8.singleton <$> oneOf "nrtbf()\\\n" <|> octalCode)
-    octalCode = choice $ (\n -> BS.pack <$> count n octDigit) <$> [1..3]
+    octalCode = choice $ (\n -> Char8.pack <$> count n octDigit) <$> [1..3]
 toByteString :: StringObject -> ByteString
 toByteString (Hexadecimal h) = b16ToBytes h
 toByteString (Literal s) = unescape s
 --
 -- Name
@ -137,13 +146,13 @@ newtype Name = Name String deriving (Eq, Ord, Show)
 instance Output Name where
  output (Name n) = Output.string ('/':n)
-name :: Parser u Name
+name :: MonadParser m => m Name
-name = Name . BS.unpack <$> (char '/' *> takeAll regular) <?> "name"
+name = Name . Char8.unpack <$> (char '/' *> takeAll regular) <?> "name"
 --
 -- Array
 --
-array :: Parser u [DirectObject]
+array :: MonadParser m => m [DirectObject]
 array =
  char '[' *> blank *> directObject `sepBy` blank <* blank <* char ']' <?> "array"
@ -160,7 +169,7 @@ instance Output Dictionary where
      outputKeyVal :: (Name, DirectObject) -> OBuilder
      outputKeyVal (key, val) = Output.concat [output key, " ", output val]
-dictionary :: Parser u Dictionary
+dictionary :: MonadParser m => m Dictionary
 dictionary =
  string "<<" *> blank *> keyValPairs <* string ">>" <?> "dictionary"
  where
@ -170,7 +179,7 @@ dictionary =
 --
 -- Null
 --
-nullObject :: Parser u ()
+nullObject :: MonadParser m => m ()
 nullObject = string "null" *> return () <?> "null object"
 --
@ -181,7 +190,7 @@ data IndirectObjCoordinates = IndirectObjCoordinates {
    , versionNumber :: Int
  } deriving Show
-reference :: Parser u IndirectObjCoordinates
+reference :: MonadParser m => m IndirectObjCoordinates
 reference = IndirectObjCoordinates
  <$> (fmap ObjectId integer) <*> integer <* char 'R' <?> "reference to an object"
@ -210,7 +219,7 @@ instance Output DirectObject where
  output (Reference (IndirectObjCoordinates {objectId, versionNumber})) =
    Output.string (printf "%d %d R" (getObjectId objectId) versionNumber)
-directObject :: Parser u DirectObject
+directObject :: MonadParser m => m DirectObject
 directObject =
    Boolean <$> boolean
  <|> Reference <$> reference {- defined before Number because Number is a prefix of it -}
--- a/src/PDF/Output.hs
+++ b/src/PDF/Output.hs
@ -116,7 +116,7 @@ char :: Char -> OBuilder
 char c = lift char8 c <* offset (+1)
 string :: String -> OBuilder
-string s = lift string8 s <* offset (+ toEnum (length s))
+string s = lift string8 s <* offset (+ length s)
 line :: String -> OBuilder
 line l = string l `mappend` newLine
--- a/src/PDF/Pages.hs
+++ b/src/PDF/Pages.hs
@ -0,0 +1,173 @@
 {-# LANGUAGE NamedFieldPuns #-}
 module PDF.Pages (
      Page(..)
    , get
    , getAll
  ) where
 import Codec.Compression.Zlib (decompress)
 import Control.Applicative ((<|>))
 import Control.Monad (foldM)
 import Control.Monad.RWS (RWST(..), ask, evalRWST, lift, modify)
 import qualified Control.Monad.RWS as RWS (get)
 import Data.ByteString (ByteString)
 import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict)
 import Data.Map (Map, (!))
 import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
 import Data.Text (Text)
 import PDF.CMap (cMap)
 import PDF.Encoding (encoding)
 import PDF.Font (Font, FontSet)
 import PDF.Object (
      Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
    , Object(..), Name(..), Structure(..)
  ,)
 import PDF.Output (ObjectId(..))
 import PDF.Text (pageContents)
 import Text.Printf (printf)
 type CachedFonts = Map ObjectId Font
 type T = RWST Content () CachedFonts (Either String)
 data Page = Page {
      contents :: [Text]
    , source :: ObjectId
  }
 infixl 1 \\=
 (\\=) :: T a -> (a -> Either String b) -> T b
 x \\= f = x >>= lift . f
 infixl 1 //=
 (//=) :: Either String a -> (a -> T b) -> T b
 x //= f = lift x >>= f
 expected :: Show a => String -> a -> Either String b
 expected name = Left . printf "Not a %s: %s" name . show
 stream :: Object -> Either String ByteString
 stream (Stream {header, streamContent}) = Right $
  case Map.lookup (Name "Filter") header of
    Just (NameObject (Name "FlateDecode")) ->
      Lazy.toStrict . decompress . Lazy.fromStrict $ streamContent
    _ -> streamContent
 stream obj = expected "stream" obj
 getResource :: DirectObject -> T Dictionary
 getResource (Dictionary dictionary) = return dictionary
 getResource (Reference (IndirectObjCoordinates {objectId})) =
  getObject objectId \\= dict
 getResource directObject =
  lift $ expected "resource (dictionary or reference)" directObject
 getFontDictionary :: Dictionary -> T Dictionary
 getFontDictionary pageDict =
  key "Resources" pageDict
  //= getResource
  \\= key "Font"
  >>= getResource
 cache :: (ObjectId -> T Font) -> ObjectId -> T Font
 cache loader objectId =
  (maybe load return . Map.lookup objectId) =<< RWS.get
  where
    load = do
      value <- loader objectId
      modify $ Map.insert objectId value
      return value
 loadFont :: ObjectId -> T Font
 loadFont objectId = getObject objectId \\= dict >>= tryMappings
  where
    tryMappings dictionary =
        loadCMap dictionary
      <|> lift (key "Encoding" dictionary >>= loadEncoding)
      <|> lift (Left $ unknownFormat (show objectId) (show dictionary))
    unknownFormat = printf "Unknown font format for object #%s : %s"
    loadCMap dictionary =
      key "ToUnicode" dictionary //= follow \\= stream \\= cMap
    loadEncoding (NameObject (Name name)) = encoding name
    loadEncoding directObject =
      Left . printf "Encoding must be a name, not that : %s" $ show directObject
 loadFonts :: Dictionary -> T FontSet
 loadFonts = foldM addFont Map.empty . Map.toList
  where
    addFont :: FontSet -> (Name, DirectObject) -> T FontSet
    addFont output (name, Reference (IndirectObjCoordinates {objectId})) =
      flip (Map.insert name) output <$> cache loadFont objectId
    addFont output _ = return output
 getObject :: ObjectId -> T Object
 getObject objectId = do
  content <- ask
  return (objects content ! objectId)
 key :: String -> Dictionary -> Either String DirectObject
 key keyName dictionary =
  maybe (Left errorMessage) Right (Map.lookup (Name keyName) dictionary)
  where
    errorMessage =
      printf "Key %s not found in dictionary %s" keyName (show dictionary)
 target :: DirectObject -> Either String ObjectId
 target (Reference (IndirectObjCoordinates {objectId})) = Right objectId
 target directObject = expected "reference" directObject
 many :: DirectObject -> [DirectObject]
 many (Array l) =  l
 many directObject = [directObject]
 follow :: DirectObject -> T Object
 follow directObject = target directObject //= getObject
 dict :: Object -> Either String Dictionary
 dict (Direct (Dictionary dictionary)) = Right dictionary
 dict obj = expected "dictionary" obj
 dictObject :: String -> Dictionary -> T Dictionary
 dictObject keyName dictionary = key keyName dictionary //= follow \\= dict
 pagesList :: T [ObjectId]
 pagesList = do
  root <- dictObject "Root" . trailer . docStructure =<< ask
  pages <- dictObject "Pages" root
  case Map.lookup (Name "Kids") pages of
    Just (Array kids) -> return $ getReferences kids
    _ -> return []
 getReferences :: [DirectObject] -> [ObjectId]
 getReferences objects = do
  object <- objects
  case object of
    Reference (IndirectObjCoordinates {objectId}) -> [objectId]
    _ -> []
 extractText :: Object -> T [Text]
 extractText object = do
  pageDict <- lift $ dict object
  fonts <- loadFonts =<< getFontDictionary pageDict
  let objects = ((many <$> (key "Contents" pageDict)) :: Either String [DirectObject])
  concat <$> (objects //= (mapM $ loadContent fonts))
  where
    loadContent :: FontSet -> DirectObject -> T [Text]
    loadContent fonts directObject =
      follow directObject \\= stream \\= pageContents fonts
 loadPage :: ObjectId -> T Page
 loadPage source =
  (\contents -> Page {contents, source}) <$> (extractText =<< getObject source)
 getAll :: Content -> Either String (Map Int Page)
 getAll content = fst <$> evalRWST getPages content Map.empty
  where
    numbered = Map.fromList . zip [1..]
    getPages = numbered <$> (mapM loadPage =<< pagesList)
 get :: Content -> Int -> Either String Page
 get content pageNumber
  | pageNumber < 1 = Left "Pages start at 1"
  | otherwise = fst <$> evalRWST getPage content Map.empty
  where
    firstPage [] = lift $ Left "Page is out of bounds"
    firstPage (p:_) = loadPage p
    getPage = drop (pageNumber - 1) <$> pagesList >>= firstPage
--- a/src/PDF/Parser.hs
+++ b/src/PDF/Parser.hs
@ -1,56 +1,72 @@
 {-# LANGUAGE FlexibleInstances #-}
 {-# LANGUAGE ConstraintKinds #-}
 {-# LANGUAGE UndecidableInstances #-}
 module PDF.Parser (
-      Parser
+      MonadParser(..)
    , Parser
    , (<?>)
    , block
    , char
    , choice
    , count
    , decNumber
    , hexNumber
    , many
    , octDigit
    , on
    , oneOf
    , option
    , runParser
-    , sepBy
+    , evalParser
    , string
    , takeAll
    , takeAll1
  ) where
-import Control.Applicative ((<|>), empty)
+import Control.Applicative (Alternative, (<|>))
 import Control.Monad (MonadPlus)
 import Control.Monad.Fail (MonadFail(..))
 import Control.Monad.State (StateT(..), evalStateT)
-import Control.Monad.Trans (lift)
+import Control.Monad.Trans (MonadTrans(..))
 import qualified Data.Attoparsec.ByteString.Char8 as Atto (
-    Parser, char, parseOnly, satisfy, string, take, takeWhile, takeWhile1
+      Parser, char, endOfInput, parseOnly, satisfy, string, take, takeWhile
    , takeWhile1
  )
 import Data.ByteString (ByteString)
 import Data.ByteString.Char8.Util (B16Int(..))
 import Data.Char (toLower)
 import Data.Set (Set)
 import qualified Data.Set as Set (fromList, member, unions)
 import Prelude hiding (fail)
 type MonadDeps m = (MonadFail m, MonadPlus m)
 class MonadDeps m => MonadParser m where
  block :: Int -> m ByteString
  char :: Char -> m Char
  decNumber :: m ByteString
  endOfInput :: m ()
  hexNumber :: m B16Int
  oneOf :: String -> m Char
  string :: ByteString -> m ByteString
  takeAll :: (Char -> Bool) -> m ByteString
  takeAll1 :: (Char -> Bool) -> m ByteString
 instance MonadParser Atto.Parser where
  block = Atto.take
  char = Atto.char
  endOfInput = Atto.endOfInput
  decNumber = Atto.takeWhile1 (`Set.member` digits)
  hexNumber = B16Int <$> Atto.takeWhile1 (`Set.member` hexDigits)
  oneOf charSet = Atto.satisfy (`elem` charSet)
  string s = Atto.string s <?> show s
  takeAll = Atto.takeWhile
  takeAll1 = Atto.takeWhile1
 instance (MonadParser m, MonadTrans t, MonadDeps (t m)) => MonadParser (t m) where
  block = lift . block
  char = lift . char
  endOfInput = lift $ endOfInput
  decNumber = lift $ decNumber
  hexNumber = lift $ hexNumber
  oneOf = lift . oneOf
  string = lift . string
  takeAll = lift . takeAll
  takeAll1 = lift . takeAll1
 type Parser s = StateT s Atto.Parser
-(<?>) :: Parser s a -> String -> Parser s a
+(<?>) :: (Alternative m, MonadFail m) => m a -> String -> m a
 (<?>) parser debugMessage = parser <|> fail debugMessage
 block :: Int -> Parser s ByteString
 block = lift . Atto.take
 char :: Char -> Parser s Char
 char = lift . Atto.char
 choice :: [Parser s a] -> Parser s a
 choice = foldr (<|>) empty
 count :: Int -> Parser s a -> Parser s [a]
 count 0 _ = return []
 count n p = (:) <$> p <*> count (n-1) p
 decNumber :: Parser s ByteString
 decNumber = lift $ Atto.takeWhile1 (`Set.member` digits)
 digits :: Set Char
 digits = Set.fromList ['0'..'9']
@ -59,13 +75,7 @@ hexDigits = Set.unions [digits, Set.fromList af, Set.fromList $ toLower <$> af]
  where
    af = ['A'..'F']
-hexNumber :: Parser s ByteString
+octDigit :: MonadParser m => m Char
 hexNumber = lift $ Atto.takeWhile1 (`Set.member` hexDigits)
 many :: Parser s a -> Parser s [a]
 many parser = (:) <$> parser <*> many parser <|> return []
 octDigit :: Parser s Char
 octDigit = oneOf ['0'..'7']
 on :: Parser s a -> ByteString -> Parser s (Either String a)
@ -74,25 +84,8 @@ on (StateT parserF) input = StateT $ \state ->
    Left errorMsg -> return (Left errorMsg, state)
    Right (result, newState) -> return (Right result, newState)
-oneOf :: String -> Parser s Char
+runParser :: Parser s a -> s -> ByteString -> Either String (a, s)
-oneOf charSet = lift $ Atto.satisfy (`elem` charSet)
+runParser parser initState = Atto.parseOnly (runStateT parser initState)
-option :: a -> Parser s a -> Parser s a
+evalParser :: Parser s a -> s -> ByteString -> Either String a
-option defaultValue p = p <|> pure defaultValue
+evalParser parser initState = Atto.parseOnly (evalStateT parser initState)
 runParser :: Parser s a -> s -> ByteString -> Either String a
 runParser parser initState =
  Atto.parseOnly (evalStateT parser initState)
 sepBy :: Parser s a -> Parser s b -> Parser s [a]
 sepBy parser separator =
  option [] $ (:) <$> parser <*> many (separator *> parser)
 string :: ByteString -> Parser s ByteString
 string = lift . Atto.string
 takeAll :: (Char -> Bool) -> Parser s ByteString
 takeAll = lift . Atto.takeWhile
 takeAll1 :: (Char -> Bool) -> Parser s ByteString
 takeAll1 = lift . Atto.takeWhile1
--- a/src/PDF/Text.hs
+++ b/src/PDF/Text.hs
@ -0,0 +1,319 @@
 {-# LANGUAGE NamedFieldPuns #-}
 {-# LANGUAGE OverloadedStrings #-}
 {-# LANGUAGE ConstraintKinds #-}
 {-# LANGUAGE FlexibleContexts #-}
 module PDF.Text {-(
    pageContents
  )-} where
 import Control.Applicative ((<|>))
 import Control.Monad (foldM)
 import Control.Monad.Fail (MonadFail)
 import Control.Monad.Reader (ReaderT, runReaderT, asks)
 import Control.Monad.State (MonadState, evalStateT, get, modify, put)
 import Data.Attoparsec.ByteString.Char8 (choice, sepBy)
 import Data.ByteString (ByteString)
 import Data.ByteString.Char8 (pack, unpack)
 import Data.Char (toLower)
 import Data.Map ((!), (!?), Map)
 import qualified Data.Map as Map (fromList)
 import Data.Text (Text)
 import PDF.Font (Font, FontSet, emptyFont)
 import PDF.Object (
      DirectObject(..), StringObject(..)
    , array, blank, name, regular, stringObject, toByteString
  )
 import PDF.Parser (MonadParser(..), (<?>), Parser, evalParser)
 data StateOperator =
  C_m | W_ | J | J_ | M | D_ | R_i | I_ | G_s -- general graphic state
  deriving (Bounded, Enum, Show)
 data PathOperator =
    M_ | L_ | C_ | V_ | Y_ | H_ | R_e -- path construction
  | S | S_ | F_ | F | Fstar | B | Bstar | B_ | B_star | N_ -- path painting
  | W | Wstar -- clipping path
  deriving (Bounded, Enum, Show)
 data ColorOperator =
    CS | C_s | SC | SCN | S_c | S_cn | G | G_ | RG | R_g | K | K_
  deriving (Bounded, Enum, Show)
 data TextOperator =
    Td | TD | Tm | Tstar -- text positioning
  | TJ | Tj | Quote | DQuote -- text showing
  | Tc | Tw | Tz | TL | Tf | Tr | Ts -- text state
  deriving (Bounded, Enum, Show)
 data Argument = Raw ByteString | Typed DirectObject deriving Show
 type Call a = (a, [Argument])
 type Operator a = (Bounded a, Enum a, Show a)
 code :: Operator a => a -> ByteString
 code = pack . expand . show
  where
    expand "" = ""
    expand (c:'_':s) = toLower c : expand s
    expand ('s':'t':'a':'r':s) = '*' : expand s
    expand ('Q':'u':'o':'t':'e':s) = '\'' : expand s
    expand ('D':'Q':'u':'o':'t':'e':s) = '"' : expand s
    expand (c:s) = c : expand s
 {-
 instance Show StateOperator where
  show Cm = "cm"
  show W_ = "w"
  show J = "J"
  show J_ = "j"
  show M = "M"
  show D = "d"
  show Ri = "ri"
  show I = "i"
  show Gs = "gs"
 instance Show PathOperator where
  show M_ = "m"
  show L_ = "l"
  show C_ = "c"
  show V_ = "v"
  show Y_ = "y"
  show H_
      ("m", (M_, \l -> case l of [Raw _, Raw _] -> True ; _ -> False))
    , ("l", (L_, \l -> case l of [Raw _, Raw _] -> True ; _ -> False))
    , ("c", (L_, \l -> case l of [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False))
    , ("v", (L_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False))
    , ("y", (L_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False))
    , ("h", (L_, \l -> case l of [] -> True ; _ -> False))
    , ("re", (L_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False))
    , ("S", (L_, \l -> case l of [] -> True ; _ -> False))
    , ("s", (L_, \l -> case l of [] -> True ; _ -> False))
    , ("f", (L_, \l -> case l of [] -> True ; _ -> False))
    , ("F", (L_, \l -> case l of [] -> True ; _ -> False))
    , ("F*", (L_, \l -> case l of [] -> True ; _ -> False))
    , ("B", (L_, \l -> case l of [] -> True ; _ -> False))
    , ("B*", (L_, \l -> case l of [] -> True ; _ -> False))
    , ("b", (L_, \l -> case l of [] -> True ; _ -> False))
    , ("b*", (L_, \l -> case l of [] -> True ; _ -> False))
    , ("n", (L_, \l -> case l of [] -> True ; _ -> False))
    , ("W", (L_, \l -> case l of [] -> True ; _ -> False))
    , ("W*", (L_, \l -> case l of [] -> True ; _ -> False))
 instance Show ColorOperator where
 instance Show TextOperator where
  show Td = "Td"
  show TD = "TD"
  show Tm = "Tm"
  show Tstar = "T*"
  show TJ = "TJ"
  show Tj = "Tj"
  show Quote = "'"
  show DQuote = "\""
  show Tc = "Tc"
  show Tw = "Tw"
  show Tz = "Tz"
  show TL = "TL"
  show Tf = "Tf"
  show Tr = "Tr"
  show Ts = "Ts"
 -}
 stateOperator :: OperatorTable StateOperator
 stateOperator = Map.fromList $ (\(op, checker) -> (code op, (op, checker))) <$> [
      (C_m, \l -> case l of [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
    , (W_, \l -> case l of [Raw _] -> True ; _ -> False)
    , (J, \l -> case l of [Raw _] -> True ; _ -> False)
    , (J_, \l -> case l of [Raw _] -> True ; _ -> False)
    , (M, \l -> case l of [Raw _] -> True ; _ -> False)
    , (D_, \l -> case l of [Raw _, Raw _] -> True ; _ -> False)
    , (R_i, \l -> case l of [Raw _] -> True ; _ -> False)
    , (I_, \l -> case l of [Raw _] -> True ; _ -> False)
    , (G_s, \l -> case l of [Typed (NameObject _)] -> True ; _ -> False)
  ]
 pathOperator :: OperatorTable PathOperator
 pathOperator = Map.fromList $ (\(op, checker) -> (code op, (op, checker))) <$> [
      (M_, \l -> case l of [Raw _, Raw _] -> True ; _ -> False)
    , (L_, \l -> case l of [Raw _, Raw _] -> True ; _ -> False)
    , (C_, \l -> case l of [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
    , (V_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
    , (Y_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
    , (H_, \l -> case l of [] -> True ; _ -> False)
    , (R_e, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
    , (S, \l -> case l of [] -> True ; _ -> False)
    , (S_, \l -> case l of [] -> True ; _ -> False)
    , (F_, \l -> case l of [] -> True ; _ -> False)
    , (F, \l -> case l of [] -> True ; _ -> False)
    , (Fstar, \l -> case l of [] -> True ; _ -> False)
    , (B, \l -> case l of [] -> True ; _ -> False)
    , (Bstar, \l -> case l of [] -> True ; _ -> False)
    , (B_, \l -> case l of [] -> True ; _ -> False)
    , (B_star, \l -> case l of [] -> True ; _ -> False)
    , (N_, \l -> case l of [] -> True ; _ -> False)
    , (W, \l -> case l of [] -> True ; _ -> False)
    , (Wstar, \l -> case l of [] -> True ; _ -> False)
  ]
 colorOperator :: OperatorTable ColorOperator
 colorOperator = Map.fromList $ (\(op, checker) -> (code op, (op, checker))) <$> [
      (CS, \l -> case l of [Typed (NameObject _)] -> True ; _ -> False)
    , (C_s, \l -> case l of [Typed (NameObject _)] -> True ; _ -> False)
    , (SC, \_ -> True)
    , (SCN, \_ -> True)
    , (S_c, \_ -> True)
    , (S_cn, \_ -> True)
    , (G, \l -> case l of [Raw _] -> True ; _ -> False)
    , (G_, \l -> case l of [Raw _] -> True ; _ -> False)
    , (RG, \l -> case l of [Raw _, Raw _, Raw _] -> True ; _ -> False)
    , (R_g, \l -> case l of [Raw _, Raw _, Raw _] -> True ; _ -> False)
    , (K, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
    , (K_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
  ]
 {-
 stateOperator (Cm, [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _]) = True
 stateOperator (W, [Raw _]) = True
 stateOperator (J, [Raw _]) = True
 stateOperator (J_, [Raw _]) = True
 stateOperator (M, [Raw _]) = True
 stateOperator (D, [Raw _, Raw _]) = True
 stateOperator (Ri, [Raw _]) = True
 stateOperator (I, [Raw _]) = True
 stateOperator (Gs, [Typed (NameObject _)]) = True
 stateOperator _ = False
 -}
 textOperator :: OperatorTable TextOperator
 textOperator = Map.fromList $ (\(op, checker) -> (code op, (op, checker))) <$> [
      (Td, \l -> case l of [Raw _, Raw _] -> True ; _ -> False)
    , (TD, \l -> case l of [Raw _, Raw _] -> True ; _ -> False)
    , (Tm, \l -> case l of [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
    , (Tstar, \l -> case l of [] -> True ; _ -> False)
    , (TJ, \l -> case l of [Typed (Array _)] -> True ; _ -> False)
    , (Tj, \l -> case l of [Typed (StringObject _)] -> True ; _ -> False)
    , (Quote, \l -> case l of [Typed (StringObject _)] -> True ; _ -> False)
    , (DQuote, \l -> case l of [Typed (StringObject _)] -> True ; _ -> False)
    , (Tc, \l -> case l of [Raw _] -> True ; _ -> False)
    , (Tw, \l -> case l of [Raw _] -> True ; _ -> False)
    , (Tz, \l -> case l of [Raw _] -> True ; _ -> False)
    , (TL, \l -> case l of [Raw _] -> True ; _ -> False)
    , (Tf, \l -> case l of [Typed (NameObject _), Raw _] -> True ; _ -> False)
    , (Tr, \l -> case l of [Raw _] -> True ; _ -> False)
    , (Ts, \l -> case l of [Raw _] -> True ; _ -> False)
  ]
  {-
 textOperator (Td, [Raw _, Raw _]) = True
 textOperator (TD, [Raw _, Raw _]) = True
 textOperator (Tm, [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _]) = True
 textOperator (Tstar, []) = True
 textOperator (TJ, [Typed (Array _)]) = True
 textOperator (Tj, [Typed (StringObject _)]) = True
 textOperator (Quote, [Typed (StringObject _)]) = True
 textOperator (DQuote, [Typed (StringObject _)]) = True
 textOperator (Tc, [Raw _]) = True
 textOperator (Tw, [Raw _]) = True
 textOperator (Tz, [Raw _]) = True
 textOperator (TL, [Raw _]) = True
 textOperator (Tf, [Typed (NameObject _), Raw _]) = True
 textOperator (Tr, [Raw _]) = True
 textOperator (Ts, [Raw _]) = True
 textOperator _ = False
 -}
 type ArgumentStackParser m = (MonadState [Argument] m, MonadParser m)
 --type Operator a = (Bounded a, Enum a, Show a)
 type OperatorTable a = Map ByteString (TypeChecker a)
 type TypeChecker a = (a, [Argument] -> Bool)
 parseShowable :: (Show a, MonadParser m) => a -> m a
 parseShowable textOp = string (pack $ show textOp) *> return textOp
 callChunk :: MonadParser m => OperatorTable a -> m (Either (TypeChecker a) Argument)
 callChunk table =
    (Right <$> choice [stringArg, nameArg, arrayArg])
  <|> operatorOrRawArg
  <?> "call chunk"
  where
    operatorOrRawArg = do
      chunk <- takeAll1 regular <* blank
      case table !? chunk of
        Nothing -> return . Right $ Raw chunk
        Just typeChecker -> return $ Left typeChecker
 stackParser :: (ArgumentStackParser m, Operator a) => OperatorTable a -> m (Call a)
 stackParser table = either popCall push =<< (callChunk table)
  where
    push arg = modify (arg:) >> stackParser table
    popCall (operator, predicate) = do
      arguments <- reverse <$> get
      let call = (operator, arguments)
      if predicate arguments then return call else fail (unpack $ code operator)
 a :: (Operator a, MonadParser m) => OperatorTable a -> m (Call a)
 a table = evalStateT (stackParser table) []
 argument :: MonadParser m => m Argument
 argument = Raw <$> takeAll1 regular <* blank
 arrayArg :: MonadParser m => m Argument
 arrayArg = Typed . Array <$> array <* blank
 nameArg :: MonadParser m => m Argument
 nameArg = Typed . NameObject <$> name <* blank
 stringArg :: MonadParser m => m Argument
 stringArg = Typed . StringObject <$> stringObject <* blank
 type ParserWithFont = ReaderT FontSet (Parser Font)
 pageContents :: FontSet -> ByteString -> Either String [Text]
 pageContents fontSet input =
  evalParser (runReaderT (page) fontSet) emptyFont input
 several :: MonadParser m => m [a] -> m [a]
 several p = concat <$> (p `sepBy` blank)
 page :: ParserWithFont [Text]
 page = several (graphicState <|> text) <* blank <* endOfInput <?> "Text page contents"
 graphicState :: ParserWithFont [Text]
 graphicState =
  string "q" *> blank *> insideQ <* blank <* string "Q" <?> "Graphic state"
  where
    insideQ = several (command <|> graphicState <|> text)
    ignore x = a x *> return []
    command =
      ignore stateOperator <|> ignore pathOperator <|> ignore colorOperator
 text :: ParserWithFont [Text]
 text =
  string "BT" *> blank *> commands <* blank <* string "ET" <?> "Text operators"
  where
    commands = several (a textOperator >>= runOperator)
 runOperator :: Call TextOperator -> ParserWithFont [Text]
 runOperator (Tf, [Typed (NameObject fontName), _]) =
  asks (! fontName) >>= put >> return []
 runOperator (Tstar, []) = return ["\n"]
 runOperator (TJ, [Typed (Array arrayObject)]) =
  replicate 1 <$> foldM appendText "" arrayObject
  where
    appendText bs (StringObject outputString) =
      mappend bs <$> decodeString outputString
    appendText bs _ = return bs
 runOperator (Tj, [Typed (StringObject outputString)]) =
  replicate 1 <$> decodeString outputString
 runOperator (Quote, [Typed (StringObject outputString)]) =
  (\bs -> ["\n", bs]) <$> decodeString outputString
 runOperator (DQuote, [Typed (StringObject outputString)]) =
  (\bs -> ["\n", bs]) <$> decodeString outputString
 runOperator _ = return []
 decodeString :: (MonadFail m, MonadState Font m) => StringObject -> m Text
 decodeString input = do
  font <- get
  either fail return . font $ toByteString input
Author	SHA1	Message	Date
Tissevert	dd6bfd90bd	Using toEnum to convert from Int to Int ? Surely a left-over from some time when it was a different type	2020-02-08 08:10:32 +01:00
Tissevert	03fbbc3a96	Why did I implement this overly complicated lift by hand again ?	2020-02-07 13:08:10 +01:00
Tissevert	95f9ab35b1	Implement MacRomanEncoding for real following their own vendor file https://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/ROMAN.TXT	2020-02-07 10:59:28 +01:00
Tissevert	fe055150a3	Migrate to Text to represent page contents and get rid of encoding concerns to early	2020-02-07 10:49:16 +01:00
Tissevert	57996749c6	Fix loose parser not making sure endOfInput is reached; add two families of operators and simplify the «Show» instance with a dedicated function to allow deleting lines of uninteresting code	2020-02-06 16:54:27 +01:00
Tissevert	3f6b0651f3	Expose the endOfLine parser through MonadParser to allow enforcing reaching the end of input in page parser	2020-02-06 16:53:06 +01:00
Tissevert	ecfd682b34	Simplify functions exposed (all part of the MonadParser class	2020-02-06 16:52:22 +01:00
Tissevert	5fa32e35db	Implement Font retrieving for simple fonts with an /Encoding and no ToUnicode	2020-02-05 22:15:18 +01:00
Tissevert	b5a15a692b	Forgot to remove commented-out dead code	2020-02-05 19:49:03 +01:00
Tissevert	b859338a57	Start implementing the MacRomanEncoding	2020-02-05 18:03:44 +01:00
Tissevert	764e2c6a4f	Removing deprecated hidding for «fail»	2020-02-05 18:02:52 +01:00
Tissevert	6ed57d66e8	Reimplement cMap as a type of Font and make the code ready for other Fonts	2020-02-05 17:42:17 +01:00
Tissevert	22cde37025	Add a Font class type to allow text rendition schemes other than CMaps	2020-02-05 14:42:51 +01:00
Tissevert	c48ab22808	Forgot some useless parentheses when playing with operator precedences	2020-02-04 17:05:15 +01:00
Tissevert	a2b66ac6d6	Generalize the getFont function because some /Resources have a direct dictionary as value for their /Font property	2020-02-04 17:04:42 +01:00
Tissevert	cefb08ee50	Going a step further in «optimization» (slowing it even more…) by replacing choice by a search in a Map	2019-11-30 21:46:22 +01:00
Tissevert	afbbcbffc5	Finish implementing the new stack-based call parser	2019-11-30 12:39:40 +01:00
Tissevert	8373bd1ea0	Removing +x permission on getText source that shouldn't ever have been set	2019-11-29 19:07:54 +01:00
Tissevert	bac08446dd	WIP: starting to fix this criminally inefficient parser for PDF's postfix-operator instructions	2019-11-29 17:42:57 +01:00
Tissevert	f9f799c59b	Take the dirty code of «getText» and turn it into a relatively clean module exposing pages, that can be retrieved all at once or by page number (numbered human-style, starting from 1)	2019-11-29 11:51:35 +01:00
Tissevert	08a9717b3a	Get rid of wrapper PageContents structure returned by PageContent in the PDF.Text module (and return directly [ByteString] instead)	2019-11-29 11:48:28 +01:00
Tissevert	42a02808c1	Merge branch 'main' into extract-text	2019-11-27 18:05:47 +01:00
Tissevert	c9f050e64b	Remove deprecated debug script and forgotten comments to bypass the selective export of Text module	2019-10-14 10:17:15 +02:00
Tissevert	3a3e1533b4	Clean ByteString types to identify when a ByteString contains the representation of an integer in a given base and fix the last remaining PDF string (un)escaping issue	2019-10-14 10:17:15 +02:00
Tissevert	a96e36ec5a	Fix error silently discarding code ranges, make sure ByteString intervals are created with the correct byte length and decode utf16BE encoded values in single-value ranges	2019-10-14 10:17:15 +02:00
Tissevert	d07c286f8e	Clean exported ByteString custom functions	2019-10-14 10:17:15 +02:00
Tissevert	7a15113285	Try and re-implement string decoding — compiles but now fails to decode any string	2019-10-14 10:17:15 +02:00
Tissevert	36d7f9b819	Still debugging, broke pretty much everything and finally implementing a proper coderange parsing for CMap because apparently that's necessary	2019-10-14 10:17:15 +02:00
Tissevert	b8ca7281aa	Fix parsing errors forgetting to make sure there's a space after special operator arguments like names and stringObjects	2019-10-14 10:17:15 +02:00
Tissevert	32efdcdd6b	Try and fix stuff by generalizing a signature to ease debugging and add parenthesis which I think should have been here all along	2019-10-14 10:17:15 +02:00
Tissevert	3b59fd0c61	Separate CMap and Text in two distinct modules	2019-10-14 10:17:15 +02:00
Tissevert	0374b72920	Finish implementing reading, still bugs to investigate	2019-10-14 10:17:15 +02:00
Tissevert	1dd22c3889	Going to try with Text, naturally handling UTF-16 but will still have to parse «int codes» manually from strings	2019-10-14 10:17:15 +02:00
Tissevert	98d029c4d4	In complete debug, more or less implemented CMap parsing but apparently it uses UTF16 ?!	2019-10-14 10:17:15 +02:00
Tissevert	c349d9b4c2	Don't trust serializer, they have nothing todo with a reasonable binary encoding	2019-10-14 10:17:15 +02:00
Tissevert	e7484ef536	Completely lost, the same old Char8 / Word8 again, implemented all the text reading, still needing a couple details to parse CMaps	2019-10-14 10:17:15 +02:00
Tissevert	f9e5683bf4	WIP: Use previous changes to start implementing font caching and text parsing (still very broken, doesn't compile)	2019-10-14 10:17:15 +02:00
Tissevert	b8eb9e6856	Generalize the Parser type into a MonadParser class to use with MonadTrans and remove redundant code already defined in Applicative or Attoparsec	2019-10-14 10:17:15 +02:00
Tissevert	66d315b7fe	Reflect the distinction between eval and run from State monad into the Parser module	2019-10-14 10:17:15 +02:00
Tissevert	51db57ec67	Ugly commit, breaks everything, still trying to figure a grammar for text	2019-10-14 10:17:15 +02:00
Tissevert	6f3c159ea7	Adding a module to implement text reading and a demo program to go with it	2019-10-14 10:17:15 +02:00