Migrate to Text to represent page contents and get rid of encoding concerns to early

This commit is contained in:
Tissevert 2020-02-07 10:03:17 +01:00
parent 57996749c6
commit fe055150a3
7 changed files with 27 additions and 20 deletions

View file

@ -68,6 +68,7 @@ executable getText
, containers
, Hufflepdf
, mtl
, text
, zlib
ghc-options: -Wall
default-language: Haskell2010

View file

@ -1,5 +1,6 @@
import qualified Data.ByteString.Char8 as BS (putStrLn, readFile)
import qualified Data.ByteString.Char8 as BS (readFile)
import qualified Data.Map as Map (toList)
import qualified Data.Text.IO as Text (putStrLn)
import PDF (Document(..), parseDocument)
import PDF.Object (Content)
import PDF.Pages (Page(..), get, getAll)
@ -16,7 +17,7 @@ onDoc inputFile f = do
Right value -> return value
displayPage :: Page -> IO ()
displayPage = mapM_ BS.putStrLn . contents
displayPage = mapM_ Text.putStrLn . contents
wholeDoc :: FilePath -> IO ()
wholeDoc inputFile = do

View file

@ -18,7 +18,8 @@ import qualified Data.ByteString as BS (empty, foldl, length, pack, singleton, s
import qualified Data.ByteString.Char8 as Char8 (
cons, drop, index, splitAt, take, uncons, unpack
)
import Data.Text.Encoding (encodeUtf8, decodeUtf16BE)
import Data.Text (Text)
import Data.Text.Encoding (decodeUtf16BE)
import Prelude hiding (length)
import Text.Printf (printf)
@ -86,5 +87,5 @@ unescape escapedBS =
| otherwise -> Char8.cons c (unescape s')
fromOctal (code, s) = Char8.cons (toEnum $ b8ToInt (B8Int code)) (unescape s)
utf16BEToutf8 :: ByteString -> ByteString
utf16BEToutf8 = encodeUtf8 . decodeUtf16BE
utf16BEToutf8 :: ByteString -> Text
utf16BEToutf8 = decodeUtf16BE

View file

@ -24,6 +24,7 @@ import Data.Map (Map, union)
import qualified Data.Map as Map (
adjust, empty, fromList, insertWith, lookup, toList
)
import Data.Text (Text)
import qualified PDF.EOL as EOL (charset, parser)
import PDF.Font (Font)
import PDF.Object (
@ -33,7 +34,7 @@ import PDF.Object (
import PDF.Parser (MonadParser, Parser, runParser, takeAll)
type CMappers = Map Name CMap
type Mapping = Map ByteString ByteString
type Mapping = Map ByteString Text
data CRange = CRange {
fromSequence :: ByteString
, toSequence :: ByteString
@ -55,7 +56,7 @@ toFont aCMap input
case tryRanges prefix cRanges of
Nothing -> trySizes s others
Just outputSequence -> Right (outputSequence, BS.drop size s)
tryRanges :: ByteString -> [CRange] -> Maybe ByteString
tryRanges :: ByteString -> [CRange] -> Maybe Text
tryRanges _ [] = Nothing
tryRanges prefix ((CRange {mapping}):cRanges) =
case Map.lookup prefix mapping of
@ -108,7 +109,7 @@ cMapRange = do
<*> directObject <* EOL.parser
>>= mapFromTo
saveMapping :: [(ByteString, ByteString)] -> Parser CMap ()
saveMapping :: [(ByteString, Text)] -> Parser CMap ()
saveMapping [] = return ()
saveMapping assoc@((code, _):_) = modify $ Map.adjust insertCRange mappingSize
where
@ -139,7 +140,7 @@ startFrom from@(B16Int s) =
let size = BS.length s `div` 2 in
toBytes size <$> [b16ToInt from .. ]
mapFromTo :: MonadParser m => (StringObject, StringObject, DirectObject) -> m [(ByteString, ByteString)]
mapFromTo :: MonadParser m => (StringObject, StringObject, DirectObject) -> m [(ByteString, Text)]
mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) =
return $ zip (between from to) (utf16BEToutf8 <$> startFrom dstFrom)
@ -152,7 +153,7 @@ mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) =
mapFromTo _ = fail "invalid range mapping found"
pairMapping :: MonadParser m => (StringObject, StringObject) -> m (ByteString, ByteString)
pairMapping :: MonadParser m => (StringObject, StringObject) -> m (ByteString, Text)
pairMapping (Hexadecimal from, Hexadecimal to) =
return (b16ToBytes from, utf16BEToutf8 $ b16ToBytes to)
pairMapping _ = fail "invalid pair mapping found"

View file

@ -6,9 +6,10 @@ module PDF.Font (
import Data.ByteString (ByteString)
import Data.Map (Map)
import Data.Text (Text)
import PDF.Object (Name)
type Font = ByteString -> Either String ByteString
type Font = ByteString -> Either String Text
type FontSet = Map Name Font
emptyFont :: Font

View file

@ -14,6 +14,7 @@ import Data.ByteString (ByteString)
import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict)
import Data.Map (Map, (!))
import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
import Data.Text (Text)
import PDF.CMap (cMap)
import PDF.Encoding (encoding)
import PDF.Font (Font, FontSet)
@ -28,7 +29,7 @@ import Text.Printf (printf)
type CachedFonts = Map ObjectId Font
type T = RWST Content () CachedFonts (Either String)
data Page = Page {
contents :: [ByteString]
contents :: [Text]
, source :: ObjectId
}
@ -145,14 +146,14 @@ getReferences objects = do
Reference (IndirectObjCoordinates {objectId}) -> [objectId]
_ -> []
extractText :: Object -> T [ByteString]
extractText :: Object -> T [Text]
extractText object = do
pageDict <- lift $ dict object
fonts <- loadFonts =<< getFontDictionary pageDict
let objects = ((many <$> (key "Contents" pageDict)) :: Either String [DirectObject])
concat <$> (objects //= (mapM $ loadContent fonts))
where
loadContent :: FontSet -> DirectObject -> T [ByteString]
loadContent :: FontSet -> DirectObject -> T [Text]
loadContent fonts directObject =
follow directObject \\= stream \\= pageContents fonts

View file

@ -17,6 +17,7 @@ import Data.ByteString.Char8 (pack, unpack)
import Data.Char (toLower)
import Data.Map ((!), (!?), Map)
import qualified Data.Map as Map (fromList)
import Data.Text (Text)
import PDF.Font (Font, FontSet, emptyFont)
import PDF.Object (
DirectObject(..), StringObject(..)
@ -263,17 +264,17 @@ stringArg = Typed . StringObject <$> stringObject <* blank
type ParserWithFont = ReaderT FontSet (Parser Font)
pageContents :: FontSet -> ByteString -> Either String [ByteString]
pageContents :: FontSet -> ByteString -> Either String [Text]
pageContents fontSet input =
evalParser (runReaderT (page) fontSet) emptyFont input
several :: MonadParser m => m [a] -> m [a]
several p = concat <$> (p `sepBy` blank)
page :: ParserWithFont [ByteString]
page :: ParserWithFont [Text]
page = several (graphicState <|> text) <* blank <* endOfInput <?> "Text page contents"
graphicState :: ParserWithFont [ByteString]
graphicState :: ParserWithFont [Text]
graphicState =
string "q" *> blank *> insideQ <* blank <* string "Q" <?> "Graphic state"
where
@ -282,13 +283,13 @@ graphicState =
command =
ignore stateOperator <|> ignore pathOperator <|> ignore colorOperator
text :: ParserWithFont [ByteString]
text :: ParserWithFont [Text]
text =
string "BT" *> blank *> commands <* blank <* string "ET" <?> "Text operators"
where
commands = several (a textOperator >>= runOperator)
runOperator :: Call TextOperator -> ParserWithFont [ByteString]
runOperator :: Call TextOperator -> ParserWithFont [Text]
runOperator (Tf, [Typed (NameObject fontName), _]) =
asks (! fontName) >>= put >> return []
@ -312,7 +313,7 @@ runOperator (DQuote, [Typed (StringObject outputString)]) =
runOperator _ = return []
decodeString :: (MonadFail m, MonadState Font m) => StringObject -> m ByteString
decodeString :: (MonadFail m, MonadState Font m) => StringObject -> m Text
decodeString input = do
font <- get
either fail return . font $ toByteString input