Using toEnum to convert from Int to Int ? Surely a left-over from some time when it was a different type

Why did I implement this overly complicated lift by hand again ?
Implement MacRomanEncoding for real following their own vendor file https://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/ROMAN.TXT
2020-02-08 08:10:32 +01:00 · 2020-02-07 13:08:10 +01:00 · 2020-02-07 10:59:28 +01:00 · 2020-02-07 10:49:16 +01:00 · 2020-02-06 16:54:27 +01:00 · 2020-02-06 16:53:06 +01:00
15 changed files with 1068 additions and 113 deletions
--- a/Hufflepdf.cabal
+++ b/Hufflepdf.cabal
@ -17,19 +17,28 @@ cabal-version:       >=1.10

 library
  exposed-modules:     PDF
+                     , PDF.CMap
                     , PDF.EOL
                     , PDF.Object
                     , PDF.Output
+                     , PDF.Pages
+                     , PDF.Parser
+                     , PDF.Text
                     , PDF.Update
  other-modules:       Data.ByteString.Char8.Util
+                     , PDF.Encoding
+                     , PDF.Encoding.MacRoman
                     , PDF.Body
-                     , PDF.Parser
+                     , PDF.Font
  -- other-extensions:    
  build-depends:       attoparsec
                     , base >=4.9 && <4.13
                     , bytestring
                     , containers
                     , mtl
+                     , text
+                     , utf8-string
+                     , zlib
  hs-source-dirs:      src
  ghc-options:         -Wall
  default-language:    Haskell2010
@ -51,3 +60,15 @@ executable             getObj
                     , zlib
  ghc-options:         -Wall
  default-language:    Haskell2010
+
+executable             getText
+  main-is:             examples/getText.hs
+  build-depends:       base
+                     , bytestring
+                     , containers
+                     , Hufflepdf
+                     , mtl
+                     , text
+                     , zlib
+  ghc-options:         -Wall
+  default-language:    Haskell2010
--- a/examples/getText.hs
+++ b/examples/getText.hs
@ -0,0 +1,38 @@
+import qualified Data.ByteString.Char8 as BS (readFile)
+import qualified Data.Map as Map (toList)
+import qualified Data.Text.IO as Text (putStrLn)
+import PDF (Document(..), parseDocument)
+import PDF.Object (Content)
+import PDF.Pages (Page(..), get, getAll)
+import PDF.Update (unify)
+import System.Environment (getArgs)
+import System.Exit (die)
+import System.IO (BufferMode(..), hSetBuffering, stdout)
+
+onDoc :: FilePath -> (Content -> Either String a) -> IO a
+onDoc inputFile f = do
+  content <- fmap (unify . updates) . parseDocument <$> BS.readFile inputFile
+  case content >>= f of
+    Left someError -> die someError
+    Right value -> return value
+
+displayPage :: Page -> IO ()
+displayPage = mapM_ Text.putStrLn . contents
+
+wholeDoc :: FilePath -> IO ()
+wholeDoc inputFile = do
+  pages <- onDoc inputFile getAll
+  mapM_ (displayPage . snd) $ Map.toList pages
+
+singlePage :: FilePath -> Int -> IO ()
+singlePage inputFile pageNumber =
+  onDoc inputFile (`get` pageNumber) >>= displayPage
+
+main :: IO ()
+main = do
+  hSetBuffering stdout LineBuffering
+  args <- getArgs
+  case args of
+    [inputFile] -> wholeDoc inputFile
+    [inputFile, pageNumber] -> singlePage inputFile (read pageNumber)
+    _ -> die "Syntax: getText INPUT_FILE [PAGE_NUMBER]"
--- a/src/Data/ByteString/Char8/Util.hs
+++ b/src/Data/ByteString/Char8/Util.hs
@ -1,16 +1,91 @@
 module Data.ByteString.Char8.Util (
-      previous
+      B16Int(..)
+    , B256Int(..)
+    , b8ToInt
+    , b16ToBytes
+    , b16ToInt
+    , b256ToInt
+    , intToB256
+    , previous
    , subBS
+    , toBytes
+    , unescape
+    , utf16BEToutf8
  ) where

-import Data.ByteString.Char8 (ByteString)
-import qualified Data.ByteString.Char8 as BS (drop, index, take)
+import Data.ByteString (ByteString, snoc)
+import qualified Data.ByteString as BS (empty, foldl, length, pack, singleton, splitAt)
+import qualified Data.ByteString.Char8 as Char8 (
+    cons, drop, index, splitAt, take, uncons, unpack
+  )
+import Data.Text (Text)
+import Data.Text.Encoding (decodeUtf16BE)
 import Prelude hiding (length)
+import Text.Printf (printf)
+
+newtype B8Int = B8Int ByteString deriving Show
+newtype B16Int = B16Int ByteString deriving Show
+newtype B256Int = B256Int ByteString deriving Show

 previous :: Char -> Int -> ByteString -> Int
 previous char position byteString
-  | BS.index byteString position == char = position
+  | Char8.index byteString position == char = position
  | otherwise = previous char (position - 1) byteString

 subBS :: Int -> Int -> ByteString -> ByteString
-subBS offset length = BS.take length . BS.drop offset
+subBS offset length = Char8.take length . Char8.drop offset
+
+intToB256 :: Int -> B256Int
+intToB256 n
+  | n < 0x100 = B256Int . BS.singleton $ toEnum n
+  | otherwise =
+    let B256Int begining = intToB256 (n `div` 0x100) in
+    B256Int $ begining `snoc` (toEnum (n `mod` 0x100))
+
+b256ToInt :: B256Int -> Int
+b256ToInt (B256Int n) = BS.foldl (\k w -> 0x100*k + fromEnum w) 0 n
+
+toBytes :: Int -> Int -> ByteString
+toBytes 0 _ = BS.empty
+toBytes size n =
+  (toBytes (size - 1) (n `div` 0x100)) `snoc` (toEnum (n `mod` 0x100))
+
+b16ToBytes :: B16Int -> ByteString
+b16ToBytes (B16Int n) = BS.pack . fmap b16ToInt $ pairDigits n
+  where
+    pairDigits s =
+      case BS.length s of
+        0 -> []
+        1 -> [B16Int s]
+        _ ->
+          let (twoHexDigits, rest) = BS.splitAt 2 s in
+          (B16Int $ twoHexDigits):(pairDigits rest)
+
+fromBase :: (Num a, Read a) => Char -> ByteString -> a
+fromBase b = read . printf "0%c%s" b . Char8.unpack
+
+b16ToInt :: (Num a, Read a) => B16Int -> a
+b16ToInt (B16Int n) = fromBase 'x' n
+
+b8ToInt :: (Num a, Read a) => B8Int -> a
+b8ToInt (B8Int n) = fromBase 'o' n
+
+unescape :: ByteString -> ByteString
+unescape escapedBS =
+  case Char8.uncons escapedBS of
+    Nothing -> BS.empty
+    Just ('\\', s) -> unescapeChar s
+    Just (c, s) -> Char8.cons c (unescape s)
+  where
+    unescapeChar s =
+      case Char8.uncons s of
+        Nothing -> BS.empty
+        Just (c, s')
+          | c `elem` "()" -> Char8.cons c (unescape s')
+          | c `elem` "nrtbf" -> Char8.cons (read (printf "'\\%c'" c)) (unescape s')
+          | c `elem` ['0'..'7'] -> fromOctal (Char8.splitAt 3 s)
+          | otherwise -> Char8.cons c (unescape s')
+    fromOctal (code, s) = Char8.cons (toEnum $ b8ToInt (B8Int code)) (unescape s)
+
+utf16BEToutf8 :: ByteString -> Text
+utf16BEToutf8 = decodeUtf16BE
--- a/src/PDF.hs
+++ b/src/PDF.hs
@ -21,7 +21,7 @@ import PDF.Object (
  )
 import qualified PDF.Output as Output (render, line)
 import PDF.Output (Output(..))
-import PDF.Parser (Parser, runParser, string, takeAll)
+import PDF.Parser (Parser, evalParser, string, takeAll)
 import Text.Printf (printf)

 data Document = Document {
@ -83,7 +83,7 @@ findNextSection offset input =

 readStructures :: Int -> ByteString -> Either String [InputStructure]
 readStructures startXref input =
-  runParser structure () (BS.drop startXref input) >>= stopOrFollow
+  evalParser structure () (BS.drop startXref input) >>= stopOrFollow
  where
    stopOrFollow s@(Structure {trailer}) =
      case Map.lookup (Name "Prev") trailer of
@ -96,7 +96,7 @@ readStructures startXref input =

 parseDocument :: ByteString -> Either String Document
 parseDocument input = do
-  (pdfVersion, eolStyle) <- runParser ((,) <$> version <*> EOL.parser) () input
+  (pdfVersion, eolStyle) <- evalParser ((,) <$> version <*> EOL.parser) () input
  startXref <- readStartXref eolStyle input
  structuresRead <- readStructures startXref input
  let updates = populate input <$> structuresRead
--- a/src/PDF/Body.hs
+++ b/src/PDF/Body.hs
@ -6,6 +6,7 @@ module PDF.Body (

 import Control.Applicative ((<|>))
 import Control.Monad.State (get, gets, modify)
+import Data.Attoparsec.ByteString.Char8 (option)
 import Data.ByteString.Char8 (ByteString)
 import qualified Data.ByteString.Char8 as BS (cons, drop, unpack)
 import Data.Map ((!))
@ -18,7 +19,7 @@ import PDF.Object (
    , blank, dictionary, directObject, integer, line
  )
 import PDF.Output (ObjectId(..), Offset(..))
-import PDF.Parser (Parser, (<?>), block, char, on, option, runParser, takeAll)
+import PDF.Parser (Parser, (<?>), block, char, evalParser, on, takeAll)

 data UserState = UserState {
      input :: ByteString
@ -109,7 +110,7 @@ occurrence =
 populate :: ByteString -> InputStructure -> Content
 populate input structure =
  let bodyInput = BS.drop (startOffset structure) input in
-  case runParser recurseOnOccurrences initialState bodyInput of
+  case evalParser recurseOnOccurrences initialState bodyInput of
    Left _ -> Content {occurrences = [], objects = Map.empty, docStructure}
    Right finalState ->
      let Flow {occurrencesStack, tmpObjects} = flow finalState in
--- a/src/PDF/CMap.hs
+++ b/src/PDF/CMap.hs
@ -0,0 +1,159 @@
+{-# LANGUAGE NamedFieldPuns #-}
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE FlexibleInstances #-}
+{-# LANGUAGE TypeSynonymInstances #-}
+module PDF.CMap (
+      CMap
+    , CMappers
+    , CRange(..)
+    , cMap
+    , emptyCMap
+    , matches
+  ) where
+
+import Control.Applicative ((<|>), many)
+import Control.Monad.State (modify)
+import Data.Attoparsec.ByteString.Char8 (count)
+import Data.ByteString (ByteString)
+import qualified Data.ByteString as BS (drop, length, null, take)
+import Data.ByteString.Char8 (unpack)
+import Data.ByteString.Char8.Util (
+    B16Int(..), b16ToBytes, b16ToInt, toBytes, utf16BEToutf8
+  )
+import Data.Map (Map, union)
+import qualified Data.Map as Map (
+    adjust, empty, fromList, insertWith, lookup, toList
+  )
+import Data.Text (Text)
+import qualified PDF.EOL as EOL (charset, parser)
+import PDF.Font (Font)
+import PDF.Object (
+      DirectObject(..), Name, StringObject(..)
+    , blank, directObject, integer, line, stringObject
+  )
+import PDF.Parser (MonadParser, Parser, runParser, takeAll)
+
+type CMappers = Map Name CMap
+type Mapping = Map ByteString Text
+data CRange = CRange {
+      fromSequence :: ByteString
+    , toSequence :: ByteString
+    , mapping :: Mapping
+  } deriving Show
+type RangeSize = Int
+type CMap = Map RangeSize [CRange]
+
+toFont :: CMap -> Font
+toFont aCMap input
+  | BS.null input = Right ""
+  | otherwise = do
+    (output, remainingInput) <- trySizes input $ Map.toList aCMap
+    mappend output <$> toFont aCMap remainingInput
+  where
+    trySizes s [] = Left $ "No matching code found in font for " ++ unpack s
+    trySizes s ((size, cRanges):others) =
+      let prefix = BS.take size s in
+      case tryRanges prefix cRanges of
+        Nothing -> trySizes s others
+        Just outputSequence -> Right (outputSequence, BS.drop size s)
+    tryRanges :: ByteString -> [CRange] -> Maybe Text
+    tryRanges _ [] = Nothing
+    tryRanges prefix ((CRange {mapping}):cRanges) =
+      case Map.lookup prefix mapping of
+        Nothing -> tryRanges prefix cRanges
+        outputSequence -> outputSequence
+
+emptyCMap :: CMap
+emptyCMap = Map.empty
+
+matches :: ByteString -> CRange -> Bool
+matches code (CRange {fromSequence, toSequence}) =
+  fromSequence <= code && code <= toSequence
+
+cMap :: ByteString -> Either String Font
+cMap = fmap (toFont . snd) <$> runParser
+  (many (codeRanges <|> cMapRange <|> cMapChar <|> ignoredLine))
+  emptyCMap
+  where
+    ignoredLine =
+      takeAll (not . (`elem` EOL.charset)) *> EOL.parser *> return ()
+
+codeRanges :: Parser CMap ()
+codeRanges = do
+  size <- integer <* line "begincodespacerange"
+  mapM_ createMapping =<< count size codeRange
+  line "endcodespacerange"
+  where
+    codeRange =
+      (,) <$> stringObject <* blank <*> stringObject <* EOL.parser
+
+createMapping :: (StringObject, StringObject) -> Parser CMap ()
+createMapping (Hexadecimal from, Hexadecimal to) = modify $
+  Map.insertWith (++) size [CRange {fromSequence, toSequence, mapping}]
+  where
+    fromSequence = b16ToBytes from
+    size = BS.length fromSequence
+    toSequence = b16ToBytes to
+    mapping = Map.empty
+createMapping _ = return ()
+
+cMapRange :: Parser CMap ()
+cMapRange = do
+  size <- integer <* line "beginbfrange"
+  mapM_ saveMapping =<< count size rangeMapping
+  line "endbfrange"
+  where
+    rangeMapping = (,,) 
+      <$> (stringObject <* blank)
+      <*> (stringObject <* blank)
+      <*> directObject <* EOL.parser
+      >>= mapFromTo
+
+saveMapping :: [(ByteString, Text)] -> Parser CMap ()
+saveMapping [] = return ()
+saveMapping assoc@((code, _):_) = modify $ Map.adjust insertCRange mappingSize
+  where
+    newMapping = Map.fromList assoc
+    mappingSize = BS.length code
+    appendMapping cRange =
+      cRange {mapping = mapping cRange `union` newMapping}
+    insertCRange = fmap (\cRange ->
+        if code `matches` cRange then appendMapping cRange else cRange
+      )
+
+cMapChar :: Parser CMap ()
+cMapChar = do
+  size <- integer <* line "beginbfchar"
+  saveMapping =<< count size charMapping <* line "endbfchar"
+  where
+    charMapping =
+      (,) <$> stringObject <* blank <*> stringObject <* EOL.parser
+      >>= pairMapping
+
+between :: B16Int -> B16Int -> [ByteString]
+between from@(B16Int s) to =
+  let size = BS.length s `div` 2 in
+  toBytes size <$> [b16ToInt from .. b16ToInt to]
+
+startFrom :: B16Int -> [ByteString]
+startFrom from@(B16Int s) =
+  let size = BS.length s `div` 2 in
+  toBytes size <$> [b16ToInt from .. ]
+
+mapFromTo :: MonadParser m => (StringObject, StringObject, DirectObject) -> m [(ByteString, Text)]
+mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) =
+  return $ zip (between from to) (utf16BEToutf8 <$> startFrom dstFrom)
+
+mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) =
+  zip (between from to) <$> (mapM dstByteString dstPoints)
+  where
+    dstByteString (StringObject (Hexadecimal dst)) =
+      return . utf16BEToutf8 $ b16ToBytes dst
+    dstByteString _ = fail "Invalid for a replacement string"
+
+mapFromTo _ = fail "invalid range mapping found"
+
+pairMapping :: MonadParser m => (StringObject, StringObject) -> m (ByteString, Text)
+pairMapping (Hexadecimal from, Hexadecimal to) =
+  return (b16ToBytes  from, utf16BEToutf8 $ b16ToBytes  to)
+pairMapping _ = fail "invalid pair mapping found"
--- a/src/PDF/EOL.hs
+++ b/src/PDF/EOL.hs
@ -6,14 +6,14 @@ module PDF.EOL (
  ) where

 import Control.Applicative ((<|>))
-import PDF.Parser (Parser, string)
+import PDF.Parser (MonadParser, string)

 data Style = CR | LF | CRLF deriving Show

 charset :: String
 charset = "\r\n"

-parser :: Parser s Style
+parser :: MonadParser m => m Style
 parser =
    (string "\r\n" >> return CRLF)
  <|> (string "\r" >> return CR)
--- a/src/PDF/Encoding.hs
+++ b/src/PDF/Encoding.hs
@ -0,0 +1,10 @@
+module PDF.Encoding (
+    encoding
+  ) where
+
+import PDF.Encoding.MacRoman (macRomanEncoding)
+import PDF.Font (Font)
+
+encoding :: String -> Either String Font
+encoding "MacRomanEncoding" = Right macRomanEncoding
+encoding s = Left $ "Unknown encoding " ++ s
--- a/src/PDF/Encoding/MacRoman.hs
+++ b/src/PDF/Encoding/MacRoman.hs
@ -0,0 +1,141 @@
+module PDF.Encoding.MacRoman (
+    macRomanEncoding
+  ) where
+
+import Data.ByteString.Char8 (unpack)
+import Data.Text (pack)
+import PDF.Font (Font)
+
+macRomanEncoding :: Font
+macRomanEncoding = Right . pack . fmap decode . unpack
+
+decode :: Char -> Char
+decode '\x80' = '\x00C4' -- LATIN CAPITAL LETTER A WITH DIAERESIS
+decode '\x81' = '\x00C5' -- LATIN CAPITAL LETTER A WITH RING ABOVE
+decode '\x82' = '\x00C7' -- LATIN CAPITAL LETTER C WITH CEDILLA
+decode '\x83' = '\x00C9' -- LATIN CAPITAL LETTER E WITH ACUTE
+decode '\x84' = '\x00D1' -- LATIN CAPITAL LETTER N WITH TILDE
+decode '\x85' = '\x00D6' -- LATIN CAPITAL LETTER O WITH DIAERESIS
+decode '\x86' = '\x00DC' -- LATIN CAPITAL LETTER U WITH DIAERESIS
+decode '\x87' = '\x00E1' -- LATIN SMALL LETTER A WITH ACUTE
+decode '\x88' = '\x00E0' -- LATIN SMALL LETTER A WITH GRAVE
+decode '\x89' = '\x00E2' -- LATIN SMALL LETTER A WITH CIRCUMFLEX
+decode '\x8A' = '\x00E4' -- LATIN SMALL LETTER A WITH DIAERESIS
+decode '\x8B' = '\x00E3' -- LATIN SMALL LETTER A WITH TILDE
+decode '\x8C' = '\x00E5' -- LATIN SMALL LETTER A WITH RING ABOVE
+decode '\x8D' = '\x00E7' -- LATIN SMALL LETTER C WITH CEDILLA
+decode '\x8E' = '\x00E9' -- LATIN SMALL LETTER E WITH ACUTE
+decode '\x8F' = '\x00E8' -- LATIN SMALL LETTER E WITH GRAVE
+decode '\x90' = '\x00EA' -- LATIN SMALL LETTER E WITH CIRCUMFLEX
+decode '\x91' = '\x00EB' -- LATIN SMALL LETTER E WITH DIAERESIS
+decode '\x92' = '\x00ED' -- LATIN SMALL LETTER I WITH ACUTE
+decode '\x93' = '\x00EC' -- LATIN SMALL LETTER I WITH GRAVE
+decode '\x94' = '\x00EE' -- LATIN SMALL LETTER I WITH CIRCUMFLEX
+decode '\x95' = '\x00EF' -- LATIN SMALL LETTER I WITH DIAERESIS
+decode '\x96' = '\x00F1' -- LATIN SMALL LETTER N WITH TILDE
+decode '\x97' = '\x00F3' -- LATIN SMALL LETTER O WITH ACUTE
+decode '\x98' = '\x00F2' -- LATIN SMALL LETTER O WITH GRAVE
+decode '\x99' = '\x00F4' -- LATIN SMALL LETTER O WITH CIRCUMFLEX
+decode '\x9A' = '\x00F6' -- LATIN SMALL LETTER O WITH DIAERESIS
+decode '\x9B' = '\x00F5' -- LATIN SMALL LETTER O WITH TILDE
+decode '\x9C' = '\x00FA' -- LATIN SMALL LETTER U WITH ACUTE
+decode '\x9D' = '\x00F9' -- LATIN SMALL LETTER U WITH GRAVE
+decode '\x9E' = '\x00FB' -- LATIN SMALL LETTER U WITH CIRCUMFLEX
+decode '\x9F' = '\x00FC' -- LATIN SMALL LETTER U WITH DIAERESIS
+decode '\xA0' = '\x2020' -- DAGGER
+decode '\xA1' = '\x00B0' -- DEGREE SIGN
+decode '\xA2' = '\x00A2' -- CENT SIGN
+decode '\xA3' = '\x00A3' -- POUND SIGN
+decode '\xA4' = '\x00A7' -- SECTION SIGN
+decode '\xA5' = '\x2022' -- BULLET
+decode '\xA6' = '\x00B6' -- PILCROW SIGN
+decode '\xA7' = '\x00DF' -- LATIN SMALL LETTER SHARP S
+decode '\xA8' = '\x00AE' -- REGISTERED SIGN
+decode '\xA9' = '\x00A9' -- COPYRIGHT SIGN
+decode '\xAA' = '\x2122' -- TRADE MARK SIGN
+decode '\xAB' = '\x00B4' -- ACUTE ACCENT
+decode '\xAC' = '\x00A8' -- DIAERESIS
+decode '\xAD' = '\x2260' -- NOT EQUAL TO
+decode '\xAE' = '\x00C6' -- LATIN CAPITAL LETTER AE
+decode '\xAF' = '\x00D8' -- LATIN CAPITAL LETTER O WITH STROKE
+decode '\xB0' = '\x221E' -- INFINITY
+decode '\xB1' = '\x00B1' -- PLUS-MINUS SIGN
+decode '\xB2' = '\x2264' -- LESS-THAN OR EQUAL TO
+decode '\xB3' = '\x2265' -- GREATER-THAN OR EQUAL TO
+decode '\xB4' = '\x00A5' -- YEN SIGN
+decode '\xB5' = '\x00B5' -- MICRO SIGN
+decode '\xB6' = '\x2202' -- PARTIAL DIFFERENTIAL
+decode '\xB7' = '\x2211' -- N-ARY SUMMATION
+decode '\xB8' = '\x220F' -- N-ARY PRODUCT
+decode '\xB9' = '\x03C0' -- GREEK SMALL LETTER PI
+decode '\xBA' = '\x222B' -- INTEGRAL
+decode '\xBB' = '\x00AA' -- FEMININE ORDINAL INDICATOR
+decode '\xBC' = '\x00BA' -- MASCULINE ORDINAL INDICATOR
+decode '\xBD' = '\x03A9' -- GREEK CAPITAL LETTER OMEGA
+decode '\xBE' = '\x00E6' -- LATIN SMALL LETTER AE
+decode '\xBF' = '\x00F8' -- LATIN SMALL LETTER O WITH STROKE
+decode '\xC0' = '\x00BF' -- INVERTED QUESTION MARK
+decode '\xC1' = '\x00A1' -- INVERTED EXCLAMATION MARK
+decode '\xC2' = '\x00AC' -- NOT SIGN
+decode '\xC3' = '\x221A' -- SQUARE ROOT
+decode '\xC4' = '\x0192' -- LATIN SMALL LETTER F WITH HOOK
+decode '\xC5' = '\x2248' -- ALMOST EQUAL TO
+decode '\xC6' = '\x2206' -- INCREMENT
+decode '\xC7' = '\x00AB' -- LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
+decode '\xC8' = '\x00BB' -- RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
+decode '\xC9' = '\x2026' -- HORIZONTAL ELLIPSIS
+decode '\xCA' = '\x00A0' -- NO-BREAK SPACE
+decode '\xCB' = '\x00C0' -- LATIN CAPITAL LETTER A WITH GRAVE
+decode '\xCC' = '\x00C3' -- LATIN CAPITAL LETTER A WITH TILDE
+decode '\xCD' = '\x00D5' -- LATIN CAPITAL LETTER O WITH TILDE
+decode '\xCE' = '\x0152' -- LATIN CAPITAL LIGATURE OE
+decode '\xCF' = '\x0153' -- LATIN SMALL LIGATURE OE
+decode '\xD0' = '\x2013' -- EN DASH
+decode '\xD1' = '\x2014' -- EM DASH
+decode '\xD2' = '\x201C' -- LEFT DOUBLE QUOTATION MARK
+decode '\xD3' = '\x201D' -- RIGHT DOUBLE QUOTATION MARK
+decode '\xD4' = '\x2018' -- LEFT SINGLE QUOTATION MARK
+decode '\xD5' = '\x2019' -- RIGHT SINGLE QUOTATION MARK
+decode '\xD6' = '\x00F7' -- DIVISION SIGN
+decode '\xD7' = '\x25CA' -- LOZENGE
+decode '\xD8' = '\x00FF' -- LATIN SMALL LETTER Y WITH DIAERESIS
+decode '\xD9' = '\x0178' -- LATIN CAPITAL LETTER Y WITH DIAERESIS
+decode '\xDA' = '\x2044' -- FRACTION SLASH
+decode '\xDB' = '\x20AC' -- EURO SIGN
+decode '\xDC' = '\x2039' -- SINGLE LEFT-POINTING ANGLE QUOTATION MARK
+decode '\xDD' = '\x203A' -- SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
+decode '\xDE' = '\xFB01' -- LATIN SMALL LIGATURE FI
+decode '\xDF' = '\xFB02' -- LATIN SMALL LIGATURE FL
+decode '\xE0' = '\x2021' -- DOUBLE DAGGER
+decode '\xE1' = '\x00B7' -- MIDDLE DOT
+decode '\xE2' = '\x201A' -- SINGLE LOW-9 QUOTATION MARK
+decode '\xE3' = '\x201E' -- DOUBLE LOW-9 QUOTATION MARK
+decode '\xE4' = '\x2030' -- PER MILLE SIGN
+decode '\xE5' = '\x00C2' -- LATIN CAPITAL LETTER A WITH CIRCUMFLEX
+decode '\xE6' = '\x00CA' -- LATIN CAPITAL LETTER E WITH CIRCUMFLEX
+decode '\xE7' = '\x00C1' -- LATIN CAPITAL LETTER A WITH ACUTE
+decode '\xE8' = '\x00CB' -- LATIN CAPITAL LETTER E WITH DIAERESIS
+decode '\xE9' = '\x00C8' -- LATIN CAPITAL LETTER E WITH GRAVE
+decode '\xEA' = '\x00CD' -- LATIN CAPITAL LETTER I WITH ACUTE
+decode '\xEB' = '\x00CE' -- LATIN CAPITAL LETTER I WITH CIRCUMFLEX
+decode '\xEC' = '\x00CF' -- LATIN CAPITAL LETTER I WITH DIAERESIS
+decode '\xED' = '\x00CC' -- LATIN CAPITAL LETTER I WITH GRAVE
+decode '\xEE' = '\x00D3' -- LATIN CAPITAL LETTER O WITH ACUTE
+decode '\xEF' = '\x00D4' -- LATIN CAPITAL LETTER O WITH CIRCUMFLEX
+decode '\xF0' = '\xF8FF' -- Apple logo
+decode '\xF1' = '\x00D2' -- LATIN CAPITAL LETTER O WITH GRAVE
+decode '\xF2' = '\x00DA' -- LATIN CAPITAL LETTER U WITH ACUTE
+decode '\xF3' = '\x00DB' -- LATIN CAPITAL LETTER U WITH CIRCUMFLEX
+decode '\xF4' = '\x00D9' -- LATIN CAPITAL LETTER U WITH GRAVE
+decode '\xF5' = '\x0131' -- LATIN SMALL LETTER DOTLESS I
+decode '\xF6' = '\x02C6' -- MODIFIER LETTER CIRCUMFLEX ACCENT
+decode '\xF7' = '\x02DC' -- SMALL TILDE
+decode '\xF8' = '\x00AF' -- MACRON
+decode '\xF9' = '\x02D8' -- BREVE
+decode '\xFA' = '\x02D9' -- DOT ABOVE
+decode '\xFB' = '\x02DA' -- RING ABOVE
+decode '\xFC' = '\x00B8' -- CEDILLA
+decode '\xFD' = '\x02DD' -- DOUBLE ACUTE ACCENT
+decode '\xFE' = '\x02DB' -- OGONEK
+decode '\xFF' = '\x02C7' -- CARON
+decode c = c             -- The rest is ASCII
--- a/src/PDF/Font.hs
+++ b/src/PDF/Font.hs
@ -0,0 +1,16 @@
+module PDF.Font (
+      Font
+    , FontSet
+    , emptyFont
+  ) where
+
+import Data.ByteString (ByteString)
+import Data.Map (Map)
+import Data.Text (Text)
+import PDF.Object (Name)
+
+type Font = ByteString -> Either String Text
+type FontSet = Map Name Font
+
+emptyFont :: Font
+emptyFont _ = Left "No fond loaded"
--- a/src/PDF/Object.hs
+++ b/src/PDF/Object.hs
@ -3,6 +3,7 @@
 {-# LANGUAGE FlexibleInstances #-}
 module PDF.Object (
      Content(..)
+    , Dictionary
    , DirectObject(..)
    , Flow(..)
    , IndexedObjects
@ -12,9 +13,11 @@ module PDF.Object (
    , Number(..)
    , Object(..)
    , Occurrence(..)
+    , StringObject(..)
    , Structure(..)
    , XRefEntry(..)
    , XRefSection
+    , array
    , blank
    , dictionary
    , directObject
@ -22,14 +25,20 @@ module PDF.Object (
    , integer
    , line
    , magicNumber
+    , name
+    , number
+    , regular
+    , stringObject
    , structure
+    , toByteString
  ) where

-import Control.Applicative ((<|>))
-import Data.ByteString.Char8 (ByteString)
-import qualified Data.ByteString.Char8 as BS (
-    concat, cons, pack, singleton, unpack
-  )
+import Control.Applicative ((<|>), many)
+import Data.Attoparsec.ByteString.Char8 (choice, count, option, sepBy)
+import Data.ByteString (ByteString)
+import qualified Data.ByteString as BS (concat)
+import qualified Data.ByteString.Char8 as Char8 (cons, pack, singleton, unpack)
+import Data.ByteString.Char8.Util (B16Int(..), b16ToBytes, unescape)
 import Data.Map (Map, (!), mapWithKey)
 import qualified Data.Map as Map (
    delete, empty, fromList, lookup, minViewWithKey, toList, union
@ -41,15 +50,11 @@ import PDF.Output (
    , byteString, getObjectId, getOffset, getOffsets, join, newLine
    , saveOffset
  )
-import PDF.Parser (
-      Parser, (<?>)
-    , char, choice, count, decNumber, hexNumber, many, octDigit, oneOf, option
-    , sepBy, string, takeAll, takeAll1
-  )
+import PDF.Parser (MonadParser(..), Parser, (<?>), octDigit, oneOf)
 import Text.Printf (printf)

-line :: String -> Parser u ()
-line l = (string (BS.pack l) *> blank *> return ()) <?> printf "line «%s»" l
+line :: MonadParser m => String -> m ()
+line l = (string (Char8.pack l) *> blank *> return ()) <?> printf "line «%s»" l

 magicNumber :: ByteString
 magicNumber = "%PDF-"
@ -60,8 +65,8 @@ eofMarker = "%%EOF"
 whiteSpaceCharset :: String
 whiteSpaceCharset = "\0\t\12 "

-blank :: Parser u ()
-blank = takeAll (`elem` (EOL.charset ++ whiteSpaceCharset)) *> return ()
+blank :: MonadParser m => m ()
+blank = takeAll (`elem` (EOL.charset ++ whiteSpaceCharset)) *> pure ()

 delimiterCharset :: String
 delimiterCharset = "()<>[]{}/%"
@ -69,8 +74,8 @@ delimiterCharset = "()<>[]{}/%"
 regular :: Char -> Bool
 regular = not . (`elem` (EOL.charset ++ whiteSpaceCharset ++ delimiterCharset))

-integer :: (Read a, Num a) => Parser u a
-integer = read . BS.unpack <$> decNumber <* blank <?> "decimal integer"
+integer :: (Read a, Num a, MonadParser m) => m a
+integer = read . Char8.unpack <$> decNumber <* blank <?> "decimal integer"

 -------------------------------------
 --          OBJECTS
@ -81,7 +86,7 @@ type IndexedObjects = Map ObjectId Object
 --
 -- Boolean
 --
-boolean :: Parser u Bool
+boolean :: MonadParser m => m Bool
 boolean =
  (string "true" *> return True) <|> (string "false" *> return False) <?> "boolean"

@ -96,38 +101,42 @@ instance Output Number where
      (n, 0) -> printf "%d" (n :: Int)
      _ -> printf "%f" f

-number :: Parser u Number
-number = Number . read . BS.unpack <$>
-  (mappend <$> sign <*> (integerPart <|> BS.cons '0' <$> floatPart))
+number :: MonadParser m => m Number
+number = Number . read . Char8.unpack <$>
+  (mappend <$> sign <*> (integerPart <|> Char8.cons '0' <$> floatPart))
  <?> "number"
  where
    sign = string "-" <|> option "" (char '+' >> return "")
    integerPart = mappend <$> decNumber <*> option "" floatPart
-    floatPart = BS.cons <$> char '.' <*> (option "0" $ decNumber)
+    floatPart = Char8.cons <$> char '.' <*> (option "0" $ decNumber)

 --
 -- StringObject
 --
-data StringObject = Literal String | Hexadecimal String deriving Show
+data StringObject = Literal ByteString | Hexadecimal B16Int deriving Show

 instance Output StringObject where
-  output (Literal s) = Output.string (printf "(%s)" s)
-  output (Hexadecimal s) = Output.string (printf "<%s>" s)
+  output (Literal s) = Output.string (printf "(%s)" (Char8.unpack s))
+  output (Hexadecimal (B16Int n)) = Output.string (printf "<%s>" (Char8.unpack n))

-stringObject :: Parser u StringObject
+stringObject :: MonadParser m => m StringObject
 stringObject =
-    Literal . BS.unpack <$> (char '(' *> (BS.concat <$> literalString) <* char ')')
-  <|> Hexadecimal . BS.unpack <$> (char '<' *> hexNumber <* char '>')
+    Literal <$> (char '(' *> (BS.concat <$> literalString) <* char ')')
+  <|> Hexadecimal <$> (char '<' *> hexNumber <* char '>')
  <?> "string object (literal or hexadecimal)"
  where
    literalString = many literalStringBlock
    literalStringBlock = takeAll1 normalChar <|> matchingParenthesis <|> escapedChar
    normalChar = not . (`elem` ("\\()" :: String))
    matchingParenthesis =
-      mappend <$> (BS.cons <$> char '(' <*> literalStringBlock) <*> string ")"
+      mappend <$> (Char8.cons <$> char '(' <*> literalStringBlock) <*> string ")"
    escapedChar =
-      BS.cons <$> char '\\' <*> (BS.singleton <$> oneOf "nrtbf()\\" <|> octalCode)
-    octalCode = choice $ (\n -> BS.pack <$> count n octDigit) <$> [1..3]
+      Char8.cons <$> char '\\' <*> (Char8.singleton <$> oneOf "nrtbf()\\\n" <|> octalCode)
+    octalCode = choice $ (\n -> Char8.pack <$> count n octDigit) <$> [1..3]
+
+toByteString :: StringObject -> ByteString
+toByteString (Hexadecimal h) = b16ToBytes h
+toByteString (Literal s) = unescape s

 --
 -- Name
@ -137,13 +146,13 @@ newtype Name = Name String deriving (Eq, Ord, Show)
 instance Output Name where
  output (Name n) = Output.string ('/':n)

-name :: Parser u Name
-name = Name . BS.unpack <$> (char '/' *> takeAll regular) <?> "name"
+name :: MonadParser m => m Name
+name = Name . Char8.unpack <$> (char '/' *> takeAll regular) <?> "name"

 --
 -- Array
 --
-array :: Parser u [DirectObject]
+array :: MonadParser m => m [DirectObject]
 array =
  char '[' *> blank *> directObject `sepBy` blank <* blank <* char ']' <?> "array"

@ -160,7 +169,7 @@ instance Output Dictionary where
      outputKeyVal :: (Name, DirectObject) -> OBuilder
      outputKeyVal (key, val) = Output.concat [output key, " ", output val]

-dictionary :: Parser u Dictionary
+dictionary :: MonadParser m => m Dictionary
 dictionary =
  string "<<" *> blank *> keyValPairs <* string ">>" <?> "dictionary"
  where
@ -170,7 +179,7 @@ dictionary =
 --
 -- Null
 --
-nullObject :: Parser u ()
+nullObject :: MonadParser m => m ()
 nullObject = string "null" *> return () <?> "null object"

 --
@ -181,7 +190,7 @@ data IndirectObjCoordinates = IndirectObjCoordinates {
    , versionNumber :: Int
  } deriving Show

-reference :: Parser u IndirectObjCoordinates
+reference :: MonadParser m => m IndirectObjCoordinates
 reference = IndirectObjCoordinates
  <$> (fmap ObjectId integer) <*> integer <* char 'R' <?> "reference to an object"

@ -210,7 +219,7 @@ instance Output DirectObject where
  output (Reference (IndirectObjCoordinates {objectId, versionNumber})) =
    Output.string (printf "%d %d R" (getObjectId objectId) versionNumber)

-directObject :: Parser u DirectObject
+directObject :: MonadParser m => m DirectObject
 directObject =
    Boolean <$> boolean
  <|> Reference <$> reference {- defined before Number because Number is a prefix of it -}
--- a/src/PDF/Output.hs
+++ b/src/PDF/Output.hs
@ -116,7 +116,7 @@ char :: Char -> OBuilder
 char c = lift char8 c <* offset (+1)

 string :: String -> OBuilder
-string s = lift string8 s <* offset (+ toEnum (length s))
+string s = lift string8 s <* offset (+ length s)

 line :: String -> OBuilder
 line l = string l `mappend` newLine
--- a/src/PDF/Pages.hs
+++ b/src/PDF/Pages.hs
@ -0,0 +1,173 @@
+{-# LANGUAGE NamedFieldPuns #-}
+module PDF.Pages (
+      Page(..)
+    , get
+    , getAll
+  ) where
+
+import Codec.Compression.Zlib (decompress)
+import Control.Applicative ((<|>))
+import Control.Monad (foldM)
+import Control.Monad.RWS (RWST(..), ask, evalRWST, lift, modify)
+import qualified Control.Monad.RWS as RWS (get)
+import Data.ByteString (ByteString)
+import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict)
+import Data.Map (Map, (!))
+import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
+import Data.Text (Text)
+import PDF.CMap (cMap)
+import PDF.Encoding (encoding)
+import PDF.Font (Font, FontSet)
+import PDF.Object (
+      Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
+    , Object(..), Name(..), Structure(..)
+  ,)
+import PDF.Output (ObjectId(..))
+import PDF.Text (pageContents)
+import Text.Printf (printf)
+
+type CachedFonts = Map ObjectId Font
+type T = RWST Content () CachedFonts (Either String)
+data Page = Page {
+      contents :: [Text]
+    , source :: ObjectId
+  }
+
+infixl 1 \\=
+(\\=) :: T a -> (a -> Either String b) -> T b
+x \\= f = x >>= lift . f
+
+infixl 1 //=
+(//=) :: Either String a -> (a -> T b) -> T b
+x //= f = lift x >>= f
+
+expected :: Show a => String -> a -> Either String b
+expected name = Left . printf "Not a %s: %s" name . show
+
+stream :: Object -> Either String ByteString
+stream (Stream {header, streamContent}) = Right $
+  case Map.lookup (Name "Filter") header of
+    Just (NameObject (Name "FlateDecode")) ->
+      Lazy.toStrict . decompress . Lazy.fromStrict $ streamContent
+    _ -> streamContent
+stream obj = expected "stream" obj
+
+getResource :: DirectObject -> T Dictionary
+getResource (Dictionary dictionary) = return dictionary
+getResource (Reference (IndirectObjCoordinates {objectId})) =
+  getObject objectId \\= dict
+getResource directObject =
+  lift $ expected "resource (dictionary or reference)" directObject
+
+getFontDictionary :: Dictionary -> T Dictionary
+getFontDictionary pageDict =
+  key "Resources" pageDict
+  //= getResource
+  \\= key "Font"
+  >>= getResource
+
+cache :: (ObjectId -> T Font) -> ObjectId -> T Font
+cache loader objectId =
+  (maybe load return . Map.lookup objectId) =<< RWS.get
+  where
+    load = do
+      value <- loader objectId
+      modify $ Map.insert objectId value
+      return value
+
+loadFont :: ObjectId -> T Font
+loadFont objectId = getObject objectId \\= dict >>= tryMappings
+  where
+    tryMappings dictionary =
+        loadCMap dictionary
+      <|> lift (key "Encoding" dictionary >>= loadEncoding)
+      <|> lift (Left $ unknownFormat (show objectId) (show dictionary))
+    unknownFormat = printf "Unknown font format for object #%s : %s"
+    loadCMap dictionary =
+      key "ToUnicode" dictionary //= follow \\= stream \\= cMap
+    loadEncoding (NameObject (Name name)) = encoding name
+    loadEncoding directObject =
+      Left . printf "Encoding must be a name, not that : %s" $ show directObject
+
+loadFonts :: Dictionary -> T FontSet
+loadFonts = foldM addFont Map.empty . Map.toList
+  where
+    addFont :: FontSet -> (Name, DirectObject) -> T FontSet
+    addFont output (name, Reference (IndirectObjCoordinates {objectId})) =
+      flip (Map.insert name) output <$> cache loadFont objectId
+    addFont output _ = return output
+
+getObject :: ObjectId -> T Object
+getObject objectId = do
+  content <- ask
+  return (objects content ! objectId)
+
+key :: String -> Dictionary -> Either String DirectObject
+key keyName dictionary =
+  maybe (Left errorMessage) Right (Map.lookup (Name keyName) dictionary)
+  where
+    errorMessage =
+      printf "Key %s not found in dictionary %s" keyName (show dictionary)
+
+target :: DirectObject -> Either String ObjectId
+target (Reference (IndirectObjCoordinates {objectId})) = Right objectId
+target directObject = expected "reference" directObject
+
+many :: DirectObject -> [DirectObject]
+many (Array l) =  l
+many directObject = [directObject]
+
+follow :: DirectObject -> T Object
+follow directObject = target directObject //= getObject
+
+dict :: Object -> Either String Dictionary
+dict (Direct (Dictionary dictionary)) = Right dictionary
+dict obj = expected "dictionary" obj
+
+dictObject :: String -> Dictionary -> T Dictionary
+dictObject keyName dictionary = key keyName dictionary //= follow \\= dict
+
+pagesList :: T [ObjectId]
+pagesList = do
+  root <- dictObject "Root" . trailer . docStructure =<< ask
+  pages <- dictObject "Pages" root
+  case Map.lookup (Name "Kids") pages of
+    Just (Array kids) -> return $ getReferences kids
+    _ -> return []
+
+getReferences :: [DirectObject] -> [ObjectId]
+getReferences objects = do
+  object <- objects
+  case object of
+    Reference (IndirectObjCoordinates {objectId}) -> [objectId]
+    _ -> []
+
+extractText :: Object -> T [Text]
+extractText object = do
+  pageDict <- lift $ dict object
+  fonts <- loadFonts =<< getFontDictionary pageDict
+  let objects = ((many <$> (key "Contents" pageDict)) :: Either String [DirectObject])
+  concat <$> (objects //= (mapM $ loadContent fonts))
+  where
+    loadContent :: FontSet -> DirectObject -> T [Text]
+    loadContent fonts directObject =
+      follow directObject \\= stream \\= pageContents fonts
+
+loadPage :: ObjectId -> T Page
+loadPage source =
+  (\contents -> Page {contents, source}) <$> (extractText =<< getObject source)
+
+getAll :: Content -> Either String (Map Int Page)
+getAll content = fst <$> evalRWST getPages content Map.empty
+  where
+    numbered = Map.fromList . zip [1..]
+    getPages = numbered <$> (mapM loadPage =<< pagesList)
+
+get :: Content -> Int -> Either String Page
+get content pageNumber
+  | pageNumber < 1 = Left "Pages start at 1"
+  | otherwise = fst <$> evalRWST getPage content Map.empty
+  where
+    firstPage [] = lift $ Left "Page is out of bounds"
+    firstPage (p:_) = loadPage p
+    getPage = drop (pageNumber - 1) <$> pagesList >>= firstPage
--- a/src/PDF/Parser.hs
+++ b/src/PDF/Parser.hs
@ -1,56 +1,72 @@
+{-# LANGUAGE FlexibleInstances #-}
+{-# LANGUAGE ConstraintKinds #-}
+{-# LANGUAGE UndecidableInstances #-}
 module PDF.Parser (
-      Parser
+      MonadParser(..)
+    , Parser
    , (<?>)
-    , block
-    , char
-    , choice
-    , count
-    , decNumber
-    , hexNumber
-    , many
    , octDigit
    , on
-    , oneOf
-    , option
    , runParser
-    , sepBy
-    , string
-    , takeAll
-    , takeAll1
+    , evalParser
  ) where

-import Control.Applicative ((<|>), empty)
+import Control.Applicative (Alternative, (<|>))
+import Control.Monad (MonadPlus)
+import Control.Monad.Fail (MonadFail(..))
 import Control.Monad.State (StateT(..), evalStateT)
-import Control.Monad.Trans (lift)
+import Control.Monad.Trans (MonadTrans(..))
 import qualified Data.Attoparsec.ByteString.Char8 as Atto (
-    Parser, char, parseOnly, satisfy, string, take, takeWhile, takeWhile1
+      Parser, char, endOfInput, parseOnly, satisfy, string, take, takeWhile
+    , takeWhile1
  )
 import Data.ByteString (ByteString)
+import Data.ByteString.Char8.Util (B16Int(..))
 import Data.Char (toLower)
 import Data.Set (Set)
 import qualified Data.Set as Set (fromList, member, unions)
+import Prelude hiding (fail)
+
+type MonadDeps m = (MonadFail m, MonadPlus m)
+
+class MonadDeps m => MonadParser m where
+  block :: Int -> m ByteString
+  char :: Char -> m Char
+  decNumber :: m ByteString
+  endOfInput :: m ()
+  hexNumber :: m B16Int
+  oneOf :: String -> m Char
+  string :: ByteString -> m ByteString
+  takeAll :: (Char -> Bool) -> m ByteString
+  takeAll1 :: (Char -> Bool) -> m ByteString
+
+instance MonadParser Atto.Parser where
+  block = Atto.take
+  char = Atto.char
+  endOfInput = Atto.endOfInput
+  decNumber = Atto.takeWhile1 (`Set.member` digits)
+  hexNumber = B16Int <$> Atto.takeWhile1 (`Set.member` hexDigits)
+  oneOf charSet = Atto.satisfy (`elem` charSet)
+  string s = Atto.string s <?> show s
+  takeAll = Atto.takeWhile
+  takeAll1 = Atto.takeWhile1
+
+instance (MonadParser m, MonadTrans t, MonadDeps (t m)) => MonadParser (t m) where
+  block = lift . block
+  char = lift . char
+  endOfInput = lift $ endOfInput
+  decNumber = lift $ decNumber
+  hexNumber = lift $ hexNumber
+  oneOf = lift . oneOf
+  string = lift . string
+  takeAll = lift . takeAll
+  takeAll1 = lift . takeAll1

 type Parser s = StateT s Atto.Parser

-(<?>) :: Parser s a -> String -> Parser s a
+(<?>) :: (Alternative m, MonadFail m) => m a -> String -> m a
 (<?>) parser debugMessage = parser <|> fail debugMessage

-block :: Int -> Parser s ByteString
-block = lift . Atto.take
-
-char :: Char -> Parser s Char
-char = lift . Atto.char
-
-choice :: [Parser s a] -> Parser s a
-choice = foldr (<|>) empty
-
-count :: Int -> Parser s a -> Parser s [a]
-count 0 _ = return []
-count n p = (:) <$> p <*> count (n-1) p
-
-decNumber :: Parser s ByteString
-decNumber = lift $ Atto.takeWhile1 (`Set.member` digits)
-
 digits :: Set Char
 digits = Set.fromList ['0'..'9']

@ -59,13 +75,7 @@ hexDigits = Set.unions [digits, Set.fromList af, Set.fromList $ toLower <$> af]
  where
    af = ['A'..'F']

-hexNumber :: Parser s ByteString
-hexNumber = lift $ Atto.takeWhile1 (`Set.member` hexDigits)
-
-many :: Parser s a -> Parser s [a]
-many parser = (:) <$> parser <*> many parser <|> return []
-
-octDigit :: Parser s Char
+octDigit :: MonadParser m => m Char
 octDigit = oneOf ['0'..'7']

 on :: Parser s a -> ByteString -> Parser s (Either String a)
@ -74,25 +84,8 @@ on (StateT parserF) input = StateT $ \state ->
    Left errorMsg -> return (Left errorMsg, state)
    Right (result, newState) -> return (Right result, newState)

-oneOf :: String -> Parser s Char
-oneOf charSet = lift $ Atto.satisfy (`elem` charSet)
+runParser :: Parser s a -> s -> ByteString -> Either String (a, s)
+runParser parser initState = Atto.parseOnly (runStateT parser initState)

-option :: a -> Parser s a -> Parser s a
-option defaultValue p = p <|> pure defaultValue
-
-runParser :: Parser s a -> s -> ByteString -> Either String a
-runParser parser initState =
-  Atto.parseOnly (evalStateT parser initState)
-
-sepBy :: Parser s a -> Parser s b -> Parser s [a]
-sepBy parser separator =
-  option [] $ (:) <$> parser <*> many (separator *> parser)
-
-string :: ByteString -> Parser s ByteString
-string = lift . Atto.string
-
-takeAll :: (Char -> Bool) -> Parser s ByteString
-takeAll = lift . Atto.takeWhile
-
-takeAll1 :: (Char -> Bool) -> Parser s ByteString
-takeAll1 = lift . Atto.takeWhile1
+evalParser :: Parser s a -> s -> ByteString -> Either String a
+evalParser parser initState = Atto.parseOnly (evalStateT parser initState)
--- a/src/PDF/Text.hs
+++ b/src/PDF/Text.hs
@ -0,0 +1,319 @@
+{-# LANGUAGE NamedFieldPuns #-}
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE ConstraintKinds #-}
+{-# LANGUAGE FlexibleContexts #-}
+module PDF.Text {-(
+    pageContents
+  )-} where
+
+import Control.Applicative ((<|>))
+import Control.Monad (foldM)
+import Control.Monad.Fail (MonadFail)
+import Control.Monad.Reader (ReaderT, runReaderT, asks)
+import Control.Monad.State (MonadState, evalStateT, get, modify, put)
+import Data.Attoparsec.ByteString.Char8 (choice, sepBy)
+import Data.ByteString (ByteString)
+import Data.ByteString.Char8 (pack, unpack)
+import Data.Char (toLower)
+import Data.Map ((!), (!?), Map)
+import qualified Data.Map as Map (fromList)
+import Data.Text (Text)
+import PDF.Font (Font, FontSet, emptyFont)
+import PDF.Object (
+      DirectObject(..), StringObject(..)
+    , array, blank, name, regular, stringObject, toByteString
+  )
+import PDF.Parser (MonadParser(..), (<?>), Parser, evalParser)
+
+data StateOperator =
+  C_m | W_ | J | J_ | M | D_ | R_i | I_ | G_s -- general graphic state
+  deriving (Bounded, Enum, Show)
+data PathOperator =
+    M_ | L_ | C_ | V_ | Y_ | H_ | R_e -- path construction
+  | S | S_ | F_ | F | Fstar | B | Bstar | B_ | B_star | N_ -- path painting
+  | W | Wstar -- clipping path
+  deriving (Bounded, Enum, Show)
+data ColorOperator =
+    CS | C_s | SC | SCN | S_c | S_cn | G | G_ | RG | R_g | K | K_
+  deriving (Bounded, Enum, Show)
+data TextOperator =
+    Td | TD | Tm | Tstar -- text positioning
+  | TJ | Tj | Quote | DQuote -- text showing
+  | Tc | Tw | Tz | TL | Tf | Tr | Ts -- text state
+  deriving (Bounded, Enum, Show)
+data Argument = Raw ByteString | Typed DirectObject deriving Show
+type Call a = (a, [Argument])
+
+type Operator a = (Bounded a, Enum a, Show a)
+
+code :: Operator a => a -> ByteString
+code = pack . expand . show
+  where
+    expand "" = ""
+    expand (c:'_':s) = toLower c : expand s
+    expand ('s':'t':'a':'r':s) = '*' : expand s
+    expand ('Q':'u':'o':'t':'e':s) = '\'' : expand s
+    expand ('D':'Q':'u':'o':'t':'e':s) = '"' : expand s
+    expand (c:s) = c : expand s
+
+{-
+instance Show StateOperator where
+  show Cm = "cm"
+  show W_ = "w"
+  show J = "J"
+  show J_ = "j"
+  show M = "M"
+  show D = "d"
+  show Ri = "ri"
+  show I = "i"
+  show Gs = "gs"
+
+instance Show PathOperator where
+  show M_ = "m"
+  show L_ = "l"
+  show C_ = "c"
+  show V_ = "v"
+  show Y_ = "y"
+  show H_
+      ("m", (M_, \l -> case l of [Raw _, Raw _] -> True ; _ -> False))
+    , ("l", (L_, \l -> case l of [Raw _, Raw _] -> True ; _ -> False))
+    , ("c", (L_, \l -> case l of [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False))
+    , ("v", (L_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False))
+    , ("y", (L_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False))
+    , ("h", (L_, \l -> case l of [] -> True ; _ -> False))
+    , ("re", (L_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False))
+    , ("S", (L_, \l -> case l of [] -> True ; _ -> False))
+    , ("s", (L_, \l -> case l of [] -> True ; _ -> False))
+    , ("f", (L_, \l -> case l of [] -> True ; _ -> False))
+    , ("F", (L_, \l -> case l of [] -> True ; _ -> False))
+    , ("F*", (L_, \l -> case l of [] -> True ; _ -> False))
+    , ("B", (L_, \l -> case l of [] -> True ; _ -> False))
+    , ("B*", (L_, \l -> case l of [] -> True ; _ -> False))
+    , ("b", (L_, \l -> case l of [] -> True ; _ -> False))
+    , ("b*", (L_, \l -> case l of [] -> True ; _ -> False))
+    , ("n", (L_, \l -> case l of [] -> True ; _ -> False))
+    , ("W", (L_, \l -> case l of [] -> True ; _ -> False))
+    , ("W*", (L_, \l -> case l of [] -> True ; _ -> False))
+
+instance Show ColorOperator where
+
+instance Show TextOperator where
+  show Td = "Td"
+  show TD = "TD"
+  show Tm = "Tm"
+  show Tstar = "T*"
+  show TJ = "TJ"
+  show Tj = "Tj"
+  show Quote = "'"
+  show DQuote = "\""
+  show Tc = "Tc"
+  show Tw = "Tw"
+  show Tz = "Tz"
+  show TL = "TL"
+  show Tf = "Tf"
+  show Tr = "Tr"
+  show Ts = "Ts"
+-}
+
+stateOperator :: OperatorTable StateOperator
+stateOperator = Map.fromList $ (\(op, checker) -> (code op, (op, checker))) <$> [
+      (C_m, \l -> case l of [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
+    , (W_, \l -> case l of [Raw _] -> True ; _ -> False)
+    , (J, \l -> case l of [Raw _] -> True ; _ -> False)
+    , (J_, \l -> case l of [Raw _] -> True ; _ -> False)
+    , (M, \l -> case l of [Raw _] -> True ; _ -> False)
+    , (D_, \l -> case l of [Raw _, Raw _] -> True ; _ -> False)
+    , (R_i, \l -> case l of [Raw _] -> True ; _ -> False)
+    , (I_, \l -> case l of [Raw _] -> True ; _ -> False)
+    , (G_s, \l -> case l of [Typed (NameObject _)] -> True ; _ -> False)
+  ]
+
+pathOperator :: OperatorTable PathOperator
+pathOperator = Map.fromList $ (\(op, checker) -> (code op, (op, checker))) <$> [
+      (M_, \l -> case l of [Raw _, Raw _] -> True ; _ -> False)
+    , (L_, \l -> case l of [Raw _, Raw _] -> True ; _ -> False)
+    , (C_, \l -> case l of [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
+    , (V_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
+    , (Y_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
+    , (H_, \l -> case l of [] -> True ; _ -> False)
+    , (R_e, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
+    , (S, \l -> case l of [] -> True ; _ -> False)
+    , (S_, \l -> case l of [] -> True ; _ -> False)
+    , (F_, \l -> case l of [] -> True ; _ -> False)
+    , (F, \l -> case l of [] -> True ; _ -> False)
+    , (Fstar, \l -> case l of [] -> True ; _ -> False)
+    , (B, \l -> case l of [] -> True ; _ -> False)
+    , (Bstar, \l -> case l of [] -> True ; _ -> False)
+    , (B_, \l -> case l of [] -> True ; _ -> False)
+    , (B_star, \l -> case l of [] -> True ; _ -> False)
+    , (N_, \l -> case l of [] -> True ; _ -> False)
+    , (W, \l -> case l of [] -> True ; _ -> False)
+    , (Wstar, \l -> case l of [] -> True ; _ -> False)
+  ]
+
+colorOperator :: OperatorTable ColorOperator
+colorOperator = Map.fromList $ (\(op, checker) -> (code op, (op, checker))) <$> [
+      (CS, \l -> case l of [Typed (NameObject _)] -> True ; _ -> False)
+    , (C_s, \l -> case l of [Typed (NameObject _)] -> True ; _ -> False)
+    , (SC, \_ -> True)
+    , (SCN, \_ -> True)
+    , (S_c, \_ -> True)
+    , (S_cn, \_ -> True)
+    , (G, \l -> case l of [Raw _] -> True ; _ -> False)
+    , (G_, \l -> case l of [Raw _] -> True ; _ -> False)
+    , (RG, \l -> case l of [Raw _, Raw _, Raw _] -> True ; _ -> False)
+    , (R_g, \l -> case l of [Raw _, Raw _, Raw _] -> True ; _ -> False)
+    , (K, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
+    , (K_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
+  ]
+
+{-
+stateOperator (Cm, [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _]) = True
+stateOperator (W, [Raw _]) = True
+stateOperator (J, [Raw _]) = True
+stateOperator (J_, [Raw _]) = True
+stateOperator (M, [Raw _]) = True
+stateOperator (D, [Raw _, Raw _]) = True
+stateOperator (Ri, [Raw _]) = True
+stateOperator (I, [Raw _]) = True
+stateOperator (Gs, [Typed (NameObject _)]) = True
+stateOperator _ = False
+-}
+
+textOperator :: OperatorTable TextOperator
+textOperator = Map.fromList $ (\(op, checker) -> (code op, (op, checker))) <$> [
+      (Td, \l -> case l of [Raw _, Raw _] -> True ; _ -> False)
+    , (TD, \l -> case l of [Raw _, Raw _] -> True ; _ -> False)
+    , (Tm, \l -> case l of [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
+    , (Tstar, \l -> case l of [] -> True ; _ -> False)
+    , (TJ, \l -> case l of [Typed (Array _)] -> True ; _ -> False)
+    , (Tj, \l -> case l of [Typed (StringObject _)] -> True ; _ -> False)
+    , (Quote, \l -> case l of [Typed (StringObject _)] -> True ; _ -> False)
+    , (DQuote, \l -> case l of [Typed (StringObject _)] -> True ; _ -> False)
+    , (Tc, \l -> case l of [Raw _] -> True ; _ -> False)
+    , (Tw, \l -> case l of [Raw _] -> True ; _ -> False)
+    , (Tz, \l -> case l of [Raw _] -> True ; _ -> False)
+    , (TL, \l -> case l of [Raw _] -> True ; _ -> False)
+    , (Tf, \l -> case l of [Typed (NameObject _), Raw _] -> True ; _ -> False)
+    , (Tr, \l -> case l of [Raw _] -> True ; _ -> False)
+    , (Ts, \l -> case l of [Raw _] -> True ; _ -> False)
+  ]
+
+  {-
+textOperator (Td, [Raw _, Raw _]) = True
+textOperator (TD, [Raw _, Raw _]) = True
+textOperator (Tm, [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _]) = True
+textOperator (Tstar, []) = True
+textOperator (TJ, [Typed (Array _)]) = True
+textOperator (Tj, [Typed (StringObject _)]) = True
+textOperator (Quote, [Typed (StringObject _)]) = True
+textOperator (DQuote, [Typed (StringObject _)]) = True
+textOperator (Tc, [Raw _]) = True
+textOperator (Tw, [Raw _]) = True
+textOperator (Tz, [Raw _]) = True
+textOperator (TL, [Raw _]) = True
+textOperator (Tf, [Typed (NameObject _), Raw _]) = True
+textOperator (Tr, [Raw _]) = True
+textOperator (Ts, [Raw _]) = True
+textOperator _ = False
+-}
+
+type ArgumentStackParser m = (MonadState [Argument] m, MonadParser m)
+--type Operator a = (Bounded a, Enum a, Show a)
+type OperatorTable a = Map ByteString (TypeChecker a)
+type TypeChecker a = (a, [Argument] -> Bool)
+
+parseShowable :: (Show a, MonadParser m) => a -> m a
+parseShowable textOp = string (pack $ show textOp) *> return textOp
+
+callChunk :: MonadParser m => OperatorTable a -> m (Either (TypeChecker a) Argument)
+callChunk table =
+    (Right <$> choice [stringArg, nameArg, arrayArg])
+  <|> operatorOrRawArg
+  <?> "call chunk"
+  where
+    operatorOrRawArg = do
+      chunk <- takeAll1 regular <* blank
+      case table !? chunk of
+        Nothing -> return . Right $ Raw chunk
+        Just typeChecker -> return $ Left typeChecker
+
+stackParser :: (ArgumentStackParser m, Operator a) => OperatorTable a -> m (Call a)
+stackParser table = either popCall push =<< (callChunk table)
+  where
+    push arg = modify (arg:) >> stackParser table
+    popCall (operator, predicate) = do
+      arguments <- reverse <$> get
+      let call = (operator, arguments)
+      if predicate arguments then return call else fail (unpack $ code operator)
+
+a :: (Operator a, MonadParser m) => OperatorTable a -> m (Call a)
+a table = evalStateT (stackParser table) []
+
+argument :: MonadParser m => m Argument
+argument = Raw <$> takeAll1 regular <* blank
+
+arrayArg :: MonadParser m => m Argument
+arrayArg = Typed . Array <$> array <* blank
+
+nameArg :: MonadParser m => m Argument
+nameArg = Typed . NameObject <$> name <* blank
+
+stringArg :: MonadParser m => m Argument
+stringArg = Typed . StringObject <$> stringObject <* blank
+
+type ParserWithFont = ReaderT FontSet (Parser Font)
+
+pageContents :: FontSet -> ByteString -> Either String [Text]
+pageContents fontSet input =
+  evalParser (runReaderT (page) fontSet) emptyFont input
+
+several :: MonadParser m => m [a] -> m [a]
+several p = concat <$> (p `sepBy` blank)
+
+page :: ParserWithFont [Text]
+page = several (graphicState <|> text) <* blank <* endOfInput <?> "Text page contents"
+
+graphicState :: ParserWithFont [Text]
+graphicState =
+  string "q" *> blank *> insideQ <* blank <* string "Q" <?> "Graphic state"
+  where
+    insideQ = several (command <|> graphicState <|> text)
+    ignore x = a x *> return []
+    command =
+      ignore stateOperator <|> ignore pathOperator <|> ignore colorOperator
+
+text :: ParserWithFont [Text]
+text =
+  string "BT" *> blank *> commands <* blank <* string "ET" <?> "Text operators"
+  where
+    commands = several (a textOperator >>= runOperator)
+
+runOperator :: Call TextOperator -> ParserWithFont [Text]
+runOperator (Tf, [Typed (NameObject fontName), _]) =
+  asks (! fontName) >>= put >> return []
+
+runOperator (Tstar, []) = return ["\n"]
+
+runOperator (TJ, [Typed (Array arrayObject)]) =
+  replicate 1 <$> foldM appendText "" arrayObject
+  where
+    appendText bs (StringObject outputString) =
+      mappend bs <$> decodeString outputString
+    appendText bs _ = return bs
+
+runOperator (Tj, [Typed (StringObject outputString)]) =
+  replicate 1 <$> decodeString outputString
+
+runOperator (Quote, [Typed (StringObject outputString)]) =
+  (\bs -> ["\n", bs]) <$> decodeString outputString
+
+runOperator (DQuote, [Typed (StringObject outputString)]) =
+  (\bs -> ["\n", bs]) <$> decodeString outputString
+
+runOperator _ = return []
+
+decodeString :: (MonadFail m, MonadState Font m) => StringObject -> m Text
+decodeString input = do
+  font <- get
+  either fail return . font $ toByteString input
Author	SHA1	Message	Date
Tissevert	dd6bfd90bd	Using toEnum to convert from Int to Int ? Surely a left-over from some time when it was a different type	2020-02-08 08:10:32 +01:00
Tissevert	03fbbc3a96	Why did I implement this overly complicated lift by hand again ?	2020-02-07 13:08:10 +01:00
Tissevert	95f9ab35b1	Implement MacRomanEncoding for real following their own vendor file https://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/ROMAN.TXT	2020-02-07 10:59:28 +01:00
Tissevert	fe055150a3	Migrate to Text to represent page contents and get rid of encoding concerns to early	2020-02-07 10:49:16 +01:00
Tissevert	57996749c6	Fix loose parser not making sure endOfInput is reached; add two families of operators and simplify the «Show» instance with a dedicated function to allow deleting lines of uninteresting code	2020-02-06 16:54:27 +01:00
Tissevert	3f6b0651f3	Expose the endOfLine parser through MonadParser to allow enforcing reaching the end of input in page parser	2020-02-06 16:53:06 +01:00
Tissevert	ecfd682b34	Simplify functions exposed (all part of the MonadParser class	2020-02-06 16:52:22 +01:00
Tissevert	5fa32e35db	Implement Font retrieving for simple fonts with an /Encoding and no ToUnicode	2020-02-05 22:15:18 +01:00
Tissevert	b5a15a692b	Forgot to remove commented-out dead code	2020-02-05 19:49:03 +01:00
Tissevert	b859338a57	Start implementing the MacRomanEncoding	2020-02-05 18:03:44 +01:00
Tissevert	764e2c6a4f	Removing deprecated hidding for «fail»	2020-02-05 18:02:52 +01:00
Tissevert	6ed57d66e8	Reimplement cMap as a type of Font and make the code ready for other Fonts	2020-02-05 17:42:17 +01:00
Tissevert	22cde37025	Add a Font class type to allow text rendition schemes other than CMaps	2020-02-05 14:42:51 +01:00
Tissevert	c48ab22808	Forgot some useless parentheses when playing with operator precedences	2020-02-04 17:05:15 +01:00
Tissevert	a2b66ac6d6	Generalize the getFont function because some /Resources have a direct dictionary as value for their /Font property	2020-02-04 17:04:42 +01:00
Tissevert	cefb08ee50	Going a step further in «optimization» (slowing it even more…) by replacing choice by a search in a Map	2019-11-30 21:46:22 +01:00
Tissevert	afbbcbffc5	Finish implementing the new stack-based call parser	2019-11-30 12:39:40 +01:00
Tissevert	8373bd1ea0	Removing +x permission on getText source that shouldn't ever have been set	2019-11-29 19:07:54 +01:00
Tissevert	bac08446dd	WIP: starting to fix this criminally inefficient parser for PDF's postfix-operator instructions	2019-11-29 17:42:57 +01:00
Tissevert	f9f799c59b	Take the dirty code of «getText» and turn it into a relatively clean module exposing pages, that can be retrieved all at once or by page number (numbered human-style, starting from 1)	2019-11-29 11:51:35 +01:00
Tissevert	08a9717b3a	Get rid of wrapper PageContents structure returned by PageContent in the PDF.Text module (and return directly [ByteString] instead)	2019-11-29 11:48:28 +01:00
Tissevert	42a02808c1	Merge branch 'main' into extract-text	2019-11-27 18:05:47 +01:00
Tissevert	c9f050e64b	Remove deprecated debug script and forgotten comments to bypass the selective export of Text module	2019-10-14 10:17:15 +02:00
Tissevert	3a3e1533b4	Clean ByteString types to identify when a ByteString contains the representation of an integer in a given base and fix the last remaining PDF string (un)escaping issue	2019-10-14 10:17:15 +02:00
Tissevert	a96e36ec5a	Fix error silently discarding code ranges, make sure ByteString intervals are created with the correct byte length and decode utf16BE encoded values in single-value ranges	2019-10-14 10:17:15 +02:00
Tissevert	d07c286f8e	Clean exported ByteString custom functions	2019-10-14 10:17:15 +02:00
Tissevert	7a15113285	Try and re-implement string decoding — compiles but now fails to decode any string	2019-10-14 10:17:15 +02:00
Tissevert	36d7f9b819	Still debugging, broke pretty much everything and finally implementing a proper coderange parsing for CMap because apparently that's necessary	2019-10-14 10:17:15 +02:00
Tissevert	b8ca7281aa	Fix parsing errors forgetting to make sure there's a space after special operator arguments like names and stringObjects	2019-10-14 10:17:15 +02:00
Tissevert	32efdcdd6b	Try and fix stuff by generalizing a signature to ease debugging and add parenthesis which I think should have been here all along	2019-10-14 10:17:15 +02:00
Tissevert	3b59fd0c61	Separate CMap and Text in two distinct modules	2019-10-14 10:17:15 +02:00
Tissevert	0374b72920	Finish implementing reading, still bugs to investigate	2019-10-14 10:17:15 +02:00
Tissevert	1dd22c3889	Going to try with Text, naturally handling UTF-16 but will still have to parse «int codes» manually from strings	2019-10-14 10:17:15 +02:00
Tissevert	98d029c4d4	In complete debug, more or less implemented CMap parsing but apparently it uses UTF16 ?!	2019-10-14 10:17:15 +02:00
Tissevert	c349d9b4c2	Don't trust serializer, they have nothing todo with a reasonable binary encoding	2019-10-14 10:17:15 +02:00
Tissevert	e7484ef536	Completely lost, the same old Char8 / Word8 again, implemented all the text reading, still needing a couple details to parse CMaps	2019-10-14 10:17:15 +02:00
Tissevert	f9e5683bf4	WIP: Use previous changes to start implementing font caching and text parsing (still very broken, doesn't compile)	2019-10-14 10:17:15 +02:00
Tissevert	b8eb9e6856	Generalize the Parser type into a MonadParser class to use with MonadTrans and remove redundant code already defined in Applicative or Attoparsec	2019-10-14 10:17:15 +02:00
Tissevert	66d315b7fe	Reflect the distinction between eval and run from State monad into the Parser module	2019-10-14 10:17:15 +02:00
Tissevert	51db57ec67	Ugly commit, breaks everything, still trying to figure a grammar for text	2019-10-14 10:17:15 +02:00
Tissevert	6f3c159ea7	Adding a module to implement text reading and a demo program to go with it	2019-10-14 10:17:15 +02:00