From fe055150a33bbe63e7b2ba8f6b915cc6cf694eaa Mon Sep 17 00:00:00 2001 From: Tissevert Date: Fri, 7 Feb 2020 10:03:17 +0100 Subject: [PATCH] Migrate to Text to represent page contents and get rid of encoding concerns to early --- Hufflepdf.cabal | 1 + examples/getText.hs | 5 +++-- src/Data/ByteString/Char8/Util.hs | 7 ++++--- src/PDF/CMap.hs | 11 ++++++----- src/PDF/Font.hs | 3 ++- src/PDF/Pages.hs | 7 ++++--- src/PDF/Text.hs | 13 +++++++------ 7 files changed, 27 insertions(+), 20 deletions(-) diff --git a/Hufflepdf.cabal b/Hufflepdf.cabal index d12c6b4..c5ac77e 100644 --- a/Hufflepdf.cabal +++ b/Hufflepdf.cabal @@ -68,6 +68,7 @@ executable getText , containers , Hufflepdf , mtl + , text , zlib ghc-options: -Wall default-language: Haskell2010 diff --git a/examples/getText.hs b/examples/getText.hs index c93441e..51a2a9f 100644 --- a/examples/getText.hs +++ b/examples/getText.hs @@ -1,5 +1,6 @@ -import qualified Data.ByteString.Char8 as BS (putStrLn, readFile) +import qualified Data.ByteString.Char8 as BS (readFile) import qualified Data.Map as Map (toList) +import qualified Data.Text.IO as Text (putStrLn) import PDF (Document(..), parseDocument) import PDF.Object (Content) import PDF.Pages (Page(..), get, getAll) @@ -16,7 +17,7 @@ onDoc inputFile f = do Right value -> return value displayPage :: Page -> IO () -displayPage = mapM_ BS.putStrLn . contents +displayPage = mapM_ Text.putStrLn . contents wholeDoc :: FilePath -> IO () wholeDoc inputFile = do diff --git a/src/Data/ByteString/Char8/Util.hs b/src/Data/ByteString/Char8/Util.hs index d2471b4..b6753fe 100644 --- a/src/Data/ByteString/Char8/Util.hs +++ b/src/Data/ByteString/Char8/Util.hs @@ -18,7 +18,8 @@ import qualified Data.ByteString as BS (empty, foldl, length, pack, singleton, s import qualified Data.ByteString.Char8 as Char8 ( cons, drop, index, splitAt, take, uncons, unpack ) -import Data.Text.Encoding (encodeUtf8, decodeUtf16BE) +import Data.Text (Text) +import Data.Text.Encoding (decodeUtf16BE) import Prelude hiding (length) import Text.Printf (printf) @@ -86,5 +87,5 @@ unescape escapedBS = | otherwise -> Char8.cons c (unescape s') fromOctal (code, s) = Char8.cons (toEnum $ b8ToInt (B8Int code)) (unescape s) -utf16BEToutf8 :: ByteString -> ByteString -utf16BEToutf8 = encodeUtf8 . decodeUtf16BE +utf16BEToutf8 :: ByteString -> Text +utf16BEToutf8 = decodeUtf16BE diff --git a/src/PDF/CMap.hs b/src/PDF/CMap.hs index 712c990..b4845cb 100644 --- a/src/PDF/CMap.hs +++ b/src/PDF/CMap.hs @@ -24,6 +24,7 @@ import Data.Map (Map, union) import qualified Data.Map as Map ( adjust, empty, fromList, insertWith, lookup, toList ) +import Data.Text (Text) import qualified PDF.EOL as EOL (charset, parser) import PDF.Font (Font) import PDF.Object ( @@ -33,7 +34,7 @@ import PDF.Object ( import PDF.Parser (MonadParser, Parser, runParser, takeAll) type CMappers = Map Name CMap -type Mapping = Map ByteString ByteString +type Mapping = Map ByteString Text data CRange = CRange { fromSequence :: ByteString , toSequence :: ByteString @@ -55,7 +56,7 @@ toFont aCMap input case tryRanges prefix cRanges of Nothing -> trySizes s others Just outputSequence -> Right (outputSequence, BS.drop size s) - tryRanges :: ByteString -> [CRange] -> Maybe ByteString + tryRanges :: ByteString -> [CRange] -> Maybe Text tryRanges _ [] = Nothing tryRanges prefix ((CRange {mapping}):cRanges) = case Map.lookup prefix mapping of @@ -108,7 +109,7 @@ cMapRange = do <*> directObject <* EOL.parser >>= mapFromTo -saveMapping :: [(ByteString, ByteString)] -> Parser CMap () +saveMapping :: [(ByteString, Text)] -> Parser CMap () saveMapping [] = return () saveMapping assoc@((code, _):_) = modify $ Map.adjust insertCRange mappingSize where @@ -139,7 +140,7 @@ startFrom from@(B16Int s) = let size = BS.length s `div` 2 in toBytes size <$> [b16ToInt from .. ] -mapFromTo :: MonadParser m => (StringObject, StringObject, DirectObject) -> m [(ByteString, ByteString)] +mapFromTo :: MonadParser m => (StringObject, StringObject, DirectObject) -> m [(ByteString, Text)] mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) = return $ zip (between from to) (utf16BEToutf8 <$> startFrom dstFrom) @@ -152,7 +153,7 @@ mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) = mapFromTo _ = fail "invalid range mapping found" -pairMapping :: MonadParser m => (StringObject, StringObject) -> m (ByteString, ByteString) +pairMapping :: MonadParser m => (StringObject, StringObject) -> m (ByteString, Text) pairMapping (Hexadecimal from, Hexadecimal to) = return (b16ToBytes from, utf16BEToutf8 $ b16ToBytes to) pairMapping _ = fail "invalid pair mapping found" diff --git a/src/PDF/Font.hs b/src/PDF/Font.hs index 6d47503..f5fe7e9 100644 --- a/src/PDF/Font.hs +++ b/src/PDF/Font.hs @@ -6,9 +6,10 @@ module PDF.Font ( import Data.ByteString (ByteString) import Data.Map (Map) +import Data.Text (Text) import PDF.Object (Name) -type Font = ByteString -> Either String ByteString +type Font = ByteString -> Either String Text type FontSet = Map Name Font emptyFont :: Font diff --git a/src/PDF/Pages.hs b/src/PDF/Pages.hs index e4af7df..31cdb6d 100755 --- a/src/PDF/Pages.hs +++ b/src/PDF/Pages.hs @@ -14,6 +14,7 @@ import Data.ByteString (ByteString) import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict) import Data.Map (Map, (!)) import qualified Data.Map as Map (empty, fromList, insert, lookup, toList) +import Data.Text (Text) import PDF.CMap (cMap) import PDF.Encoding (encoding) import PDF.Font (Font, FontSet) @@ -28,7 +29,7 @@ import Text.Printf (printf) type CachedFonts = Map ObjectId Font type T = RWST Content () CachedFonts (Either String) data Page = Page { - contents :: [ByteString] + contents :: [Text] , source :: ObjectId } @@ -145,14 +146,14 @@ getReferences objects = do Reference (IndirectObjCoordinates {objectId}) -> [objectId] _ -> [] -extractText :: Object -> T [ByteString] +extractText :: Object -> T [Text] extractText object = do pageDict <- lift $ dict object fonts <- loadFonts =<< getFontDictionary pageDict let objects = ((many <$> (key "Contents" pageDict)) :: Either String [DirectObject]) concat <$> (objects //= (mapM $ loadContent fonts)) where - loadContent :: FontSet -> DirectObject -> T [ByteString] + loadContent :: FontSet -> DirectObject -> T [Text] loadContent fonts directObject = follow directObject \\= stream \\= pageContents fonts diff --git a/src/PDF/Text.hs b/src/PDF/Text.hs index 9b34b13..e956b4e 100644 --- a/src/PDF/Text.hs +++ b/src/PDF/Text.hs @@ -17,6 +17,7 @@ import Data.ByteString.Char8 (pack, unpack) import Data.Char (toLower) import Data.Map ((!), (!?), Map) import qualified Data.Map as Map (fromList) +import Data.Text (Text) import PDF.Font (Font, FontSet, emptyFont) import PDF.Object ( DirectObject(..), StringObject(..) @@ -263,17 +264,17 @@ stringArg = Typed . StringObject <$> stringObject <* blank type ParserWithFont = ReaderT FontSet (Parser Font) -pageContents :: FontSet -> ByteString -> Either String [ByteString] +pageContents :: FontSet -> ByteString -> Either String [Text] pageContents fontSet input = evalParser (runReaderT (page) fontSet) emptyFont input several :: MonadParser m => m [a] -> m [a] several p = concat <$> (p `sepBy` blank) -page :: ParserWithFont [ByteString] +page :: ParserWithFont [Text] page = several (graphicState <|> text) <* blank <* endOfInput "Text page contents" -graphicState :: ParserWithFont [ByteString] +graphicState :: ParserWithFont [Text] graphicState = string "q" *> blank *> insideQ <* blank <* string "Q" "Graphic state" where @@ -282,13 +283,13 @@ graphicState = command = ignore stateOperator <|> ignore pathOperator <|> ignore colorOperator -text :: ParserWithFont [ByteString] +text :: ParserWithFont [Text] text = string "BT" *> blank *> commands <* blank <* string "ET" "Text operators" where commands = several (a textOperator >>= runOperator) -runOperator :: Call TextOperator -> ParserWithFont [ByteString] +runOperator :: Call TextOperator -> ParserWithFont [Text] runOperator (Tf, [Typed (NameObject fontName), _]) = asks (! fontName) >>= put >> return [] @@ -312,7 +313,7 @@ runOperator (DQuote, [Typed (StringObject outputString)]) = runOperator _ = return [] -decodeString :: (MonadFail m, MonadState Font m) => StringObject -> m ByteString +decodeString :: (MonadFail m, MonadState Font m) => StringObject -> m Text decodeString input = do font <- get either fail return . font $ toByteString input