diff --git a/src/PDF/CMap.hs b/src/PDF/CMap.hs index 6b9395c..8fb02b2 100644 --- a/src/PDF/CMap.hs +++ b/src/PDF/CMap.hs @@ -2,8 +2,10 @@ module PDF.CMap ( CMap , CMappers + , CRange(..) , cMap , emptyCMap + , matches ) where import Control.Applicative ((<|>), many) @@ -36,6 +38,10 @@ type CMap = Map RangeSize [CRange] emptyCMap :: CMap emptyCMap = Map.empty +matches :: ByteString -> CRange -> Bool +matches code (CRange {fromSequence, toSequence}) = + fromSequence <= code && code <= toSequence + cMap :: ByteString -> Either String CMap cMap = fmap snd <$> runParser (many (codeRanges <|> cMapRange <|> cMapChar <|> ignoredLine)) @@ -80,12 +86,10 @@ saveMapping assoc@((code, _):_) = modify $ Map.adjust insertCRange mappingSize where newMapping = Map.fromList assoc mappingSize = BS.length code - matchingRange (CRange {fromSequence, toSequence}) = - fromSequence <= code && code <= toSequence appendMapping cRange = cRange {mapping = mapping cRange `union` newMapping} insertCRange = fmap (\cRange -> - if matchingRange cRange then appendMapping cRange else cRange + if code `matches` cRange then appendMapping cRange else cRange ) cMapChar :: Parser CMap () diff --git a/src/PDF/Text.hs b/src/PDF/Text.hs index 30ac6d0..7cb4783 100644 --- a/src/PDF/Text.hs +++ b/src/PDF/Text.hs @@ -1,3 +1,4 @@ +{-# LANGUAGE NamedFieldPuns #-} {-# LANGUAGE OverloadedStrings #-} module PDF.Text {-( PageContents(..) @@ -10,11 +11,12 @@ import Control.Monad.Reader (ReaderT, runReaderT, asks) import Control.Monad.State (get, put) import Data.Attoparsec.ByteString.Char8 (choice, count, sepBy) import Data.ByteString (ByteString) -import qualified Data.ByteString.Char8 as Char8 (unpack) +import qualified Data.ByteString as BS (drop, null, take) import Data.ByteString.Char8.Util (decodeHex) +import Data.List (find) import Data.Map ((!)) -import qualified Data.Map as Map (lookup) -import PDF.CMap (CMappers, CMap, emptyCMap) +import qualified Data.Map as Map (lookup, toList) +import PDF.CMap (CMappers, CMap, CRange(..), emptyCMap) import PDF.Object ( DirectObject(..), StringObject(..) , array, blank, name, parseBytes, regular, stringObject @@ -127,8 +129,31 @@ runOperator _ = return [] decodeString :: StringObject -> ParserWithFont ByteString decodeString (Hexadecimal h) = decodeString (Literal (decodeHex h)) -decodeString (Literal litString) = +decodeString (Literal litString) = do + cRangesBySize <- Map.toList <$> get + f cRangesBySize litString + where + f :: [(Int, [CRange])] -> ByteString -> ParserWithFont ByteString + f cRangesBySize input + | BS.null input = return "" + | otherwise = do + (output, newInput) <- g cRangesBySize input + mappend output <$> f cRangesBySize newInput + g :: [(Int, [CRange])] -> ByteString -> ParserWithFont (ByteString, ByteString) + g [] _ = fail "No matching code found in font" + g ((size, cRanges):others) s = + let prefix = BS.take size s in + case h prefix cRanges of + Nothing -> g others s + Just outputSequence -> return (outputSequence, BS.drop size s) + h :: ByteString -> [CRange] -> Maybe ByteString + h prefix [] = Nothing + h prefix ((CRange {mapping}):cRanges) = + case Map.lookup prefix mapping of + Nothing -> h prefix cRanges + outputSequence -> outputSequence +{- get >>= convertBytes litString where convertBytes :: String -> CMap -> ParserWithFont ByteString @@ -143,3 +168,4 @@ get >>= convertBytes litString (Nothing, (c:cs)) -> convertBytesAux (code * 256 + fromEnum c) (size + 1) cs someCMap (Nothing, []) -> fail "No character left to read but no code recognized" (Just outputText, _) -> mappend outputText <$> convertBytes s someCMap +-}