Migrate to Text to represent page contents and get rid of encoding concerns to early

This commit is contained in:
Tissevert 2020-02-07 10:03:17 +01:00
parent 57996749c6
commit fe055150a3
7 changed files with 27 additions and 20 deletions

View file

@ -68,6 +68,7 @@ executable getText
, containers , containers
, Hufflepdf , Hufflepdf
, mtl , mtl
, text
, zlib , zlib
ghc-options: -Wall ghc-options: -Wall
default-language: Haskell2010 default-language: Haskell2010

View file

@ -1,5 +1,6 @@
import qualified Data.ByteString.Char8 as BS (putStrLn, readFile) import qualified Data.ByteString.Char8 as BS (readFile)
import qualified Data.Map as Map (toList) import qualified Data.Map as Map (toList)
import qualified Data.Text.IO as Text (putStrLn)
import PDF (Document(..), parseDocument) import PDF (Document(..), parseDocument)
import PDF.Object (Content) import PDF.Object (Content)
import PDF.Pages (Page(..), get, getAll) import PDF.Pages (Page(..), get, getAll)
@ -16,7 +17,7 @@ onDoc inputFile f = do
Right value -> return value Right value -> return value
displayPage :: Page -> IO () displayPage :: Page -> IO ()
displayPage = mapM_ BS.putStrLn . contents displayPage = mapM_ Text.putStrLn . contents
wholeDoc :: FilePath -> IO () wholeDoc :: FilePath -> IO ()
wholeDoc inputFile = do wholeDoc inputFile = do

View file

@ -18,7 +18,8 @@ import qualified Data.ByteString as BS (empty, foldl, length, pack, singleton, s
import qualified Data.ByteString.Char8 as Char8 ( import qualified Data.ByteString.Char8 as Char8 (
cons, drop, index, splitAt, take, uncons, unpack cons, drop, index, splitAt, take, uncons, unpack
) )
import Data.Text.Encoding (encodeUtf8, decodeUtf16BE) import Data.Text (Text)
import Data.Text.Encoding (decodeUtf16BE)
import Prelude hiding (length) import Prelude hiding (length)
import Text.Printf (printf) import Text.Printf (printf)
@ -86,5 +87,5 @@ unescape escapedBS =
| otherwise -> Char8.cons c (unescape s') | otherwise -> Char8.cons c (unescape s')
fromOctal (code, s) = Char8.cons (toEnum $ b8ToInt (B8Int code)) (unescape s) fromOctal (code, s) = Char8.cons (toEnum $ b8ToInt (B8Int code)) (unescape s)
utf16BEToutf8 :: ByteString -> ByteString utf16BEToutf8 :: ByteString -> Text
utf16BEToutf8 = encodeUtf8 . decodeUtf16BE utf16BEToutf8 = decodeUtf16BE

View file

@ -24,6 +24,7 @@ import Data.Map (Map, union)
import qualified Data.Map as Map ( import qualified Data.Map as Map (
adjust, empty, fromList, insertWith, lookup, toList adjust, empty, fromList, insertWith, lookup, toList
) )
import Data.Text (Text)
import qualified PDF.EOL as EOL (charset, parser) import qualified PDF.EOL as EOL (charset, parser)
import PDF.Font (Font) import PDF.Font (Font)
import PDF.Object ( import PDF.Object (
@ -33,7 +34,7 @@ import PDF.Object (
import PDF.Parser (MonadParser, Parser, runParser, takeAll) import PDF.Parser (MonadParser, Parser, runParser, takeAll)
type CMappers = Map Name CMap type CMappers = Map Name CMap
type Mapping = Map ByteString ByteString type Mapping = Map ByteString Text
data CRange = CRange { data CRange = CRange {
fromSequence :: ByteString fromSequence :: ByteString
, toSequence :: ByteString , toSequence :: ByteString
@ -55,7 +56,7 @@ toFont aCMap input
case tryRanges prefix cRanges of case tryRanges prefix cRanges of
Nothing -> trySizes s others Nothing -> trySizes s others
Just outputSequence -> Right (outputSequence, BS.drop size s) Just outputSequence -> Right (outputSequence, BS.drop size s)
tryRanges :: ByteString -> [CRange] -> Maybe ByteString tryRanges :: ByteString -> [CRange] -> Maybe Text
tryRanges _ [] = Nothing tryRanges _ [] = Nothing
tryRanges prefix ((CRange {mapping}):cRanges) = tryRanges prefix ((CRange {mapping}):cRanges) =
case Map.lookup prefix mapping of case Map.lookup prefix mapping of
@ -108,7 +109,7 @@ cMapRange = do
<*> directObject <* EOL.parser <*> directObject <* EOL.parser
>>= mapFromTo >>= mapFromTo
saveMapping :: [(ByteString, ByteString)] -> Parser CMap () saveMapping :: [(ByteString, Text)] -> Parser CMap ()
saveMapping [] = return () saveMapping [] = return ()
saveMapping assoc@((code, _):_) = modify $ Map.adjust insertCRange mappingSize saveMapping assoc@((code, _):_) = modify $ Map.adjust insertCRange mappingSize
where where
@ -139,7 +140,7 @@ startFrom from@(B16Int s) =
let size = BS.length s `div` 2 in let size = BS.length s `div` 2 in
toBytes size <$> [b16ToInt from .. ] toBytes size <$> [b16ToInt from .. ]
mapFromTo :: MonadParser m => (StringObject, StringObject, DirectObject) -> m [(ByteString, ByteString)] mapFromTo :: MonadParser m => (StringObject, StringObject, DirectObject) -> m [(ByteString, Text)]
mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) = mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) =
return $ zip (between from to) (utf16BEToutf8 <$> startFrom dstFrom) return $ zip (between from to) (utf16BEToutf8 <$> startFrom dstFrom)
@ -152,7 +153,7 @@ mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) =
mapFromTo _ = fail "invalid range mapping found" mapFromTo _ = fail "invalid range mapping found"
pairMapping :: MonadParser m => (StringObject, StringObject) -> m (ByteString, ByteString) pairMapping :: MonadParser m => (StringObject, StringObject) -> m (ByteString, Text)
pairMapping (Hexadecimal from, Hexadecimal to) = pairMapping (Hexadecimal from, Hexadecimal to) =
return (b16ToBytes from, utf16BEToutf8 $ b16ToBytes to) return (b16ToBytes from, utf16BEToutf8 $ b16ToBytes to)
pairMapping _ = fail "invalid pair mapping found" pairMapping _ = fail "invalid pair mapping found"

View file

@ -6,9 +6,10 @@ module PDF.Font (
import Data.ByteString (ByteString) import Data.ByteString (ByteString)
import Data.Map (Map) import Data.Map (Map)
import Data.Text (Text)
import PDF.Object (Name) import PDF.Object (Name)
type Font = ByteString -> Either String ByteString type Font = ByteString -> Either String Text
type FontSet = Map Name Font type FontSet = Map Name Font
emptyFont :: Font emptyFont :: Font

View file

@ -14,6 +14,7 @@ import Data.ByteString (ByteString)
import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict) import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict)
import Data.Map (Map, (!)) import Data.Map (Map, (!))
import qualified Data.Map as Map (empty, fromList, insert, lookup, toList) import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
import Data.Text (Text)
import PDF.CMap (cMap) import PDF.CMap (cMap)
import PDF.Encoding (encoding) import PDF.Encoding (encoding)
import PDF.Font (Font, FontSet) import PDF.Font (Font, FontSet)
@ -28,7 +29,7 @@ import Text.Printf (printf)
type CachedFonts = Map ObjectId Font type CachedFonts = Map ObjectId Font
type T = RWST Content () CachedFonts (Either String) type T = RWST Content () CachedFonts (Either String)
data Page = Page { data Page = Page {
contents :: [ByteString] contents :: [Text]
, source :: ObjectId , source :: ObjectId
} }
@ -145,14 +146,14 @@ getReferences objects = do
Reference (IndirectObjCoordinates {objectId}) -> [objectId] Reference (IndirectObjCoordinates {objectId}) -> [objectId]
_ -> [] _ -> []
extractText :: Object -> T [ByteString] extractText :: Object -> T [Text]
extractText object = do extractText object = do
pageDict <- lift $ dict object pageDict <- lift $ dict object
fonts <- loadFonts =<< getFontDictionary pageDict fonts <- loadFonts =<< getFontDictionary pageDict
let objects = ((many <$> (key "Contents" pageDict)) :: Either String [DirectObject]) let objects = ((many <$> (key "Contents" pageDict)) :: Either String [DirectObject])
concat <$> (objects //= (mapM $ loadContent fonts)) concat <$> (objects //= (mapM $ loadContent fonts))
where where
loadContent :: FontSet -> DirectObject -> T [ByteString] loadContent :: FontSet -> DirectObject -> T [Text]
loadContent fonts directObject = loadContent fonts directObject =
follow directObject \\= stream \\= pageContents fonts follow directObject \\= stream \\= pageContents fonts

View file

@ -17,6 +17,7 @@ import Data.ByteString.Char8 (pack, unpack)
import Data.Char (toLower) import Data.Char (toLower)
import Data.Map ((!), (!?), Map) import Data.Map ((!), (!?), Map)
import qualified Data.Map as Map (fromList) import qualified Data.Map as Map (fromList)
import Data.Text (Text)
import PDF.Font (Font, FontSet, emptyFont) import PDF.Font (Font, FontSet, emptyFont)
import PDF.Object ( import PDF.Object (
DirectObject(..), StringObject(..) DirectObject(..), StringObject(..)
@ -263,17 +264,17 @@ stringArg = Typed . StringObject <$> stringObject <* blank
type ParserWithFont = ReaderT FontSet (Parser Font) type ParserWithFont = ReaderT FontSet (Parser Font)
pageContents :: FontSet -> ByteString -> Either String [ByteString] pageContents :: FontSet -> ByteString -> Either String [Text]
pageContents fontSet input = pageContents fontSet input =
evalParser (runReaderT (page) fontSet) emptyFont input evalParser (runReaderT (page) fontSet) emptyFont input
several :: MonadParser m => m [a] -> m [a] several :: MonadParser m => m [a] -> m [a]
several p = concat <$> (p `sepBy` blank) several p = concat <$> (p `sepBy` blank)
page :: ParserWithFont [ByteString] page :: ParserWithFont [Text]
page = several (graphicState <|> text) <* blank <* endOfInput <?> "Text page contents" page = several (graphicState <|> text) <* blank <* endOfInput <?> "Text page contents"
graphicState :: ParserWithFont [ByteString] graphicState :: ParserWithFont [Text]
graphicState = graphicState =
string "q" *> blank *> insideQ <* blank <* string "Q" <?> "Graphic state" string "q" *> blank *> insideQ <* blank <* string "Q" <?> "Graphic state"
where where
@ -282,13 +283,13 @@ graphicState =
command = command =
ignore stateOperator <|> ignore pathOperator <|> ignore colorOperator ignore stateOperator <|> ignore pathOperator <|> ignore colorOperator
text :: ParserWithFont [ByteString] text :: ParserWithFont [Text]
text = text =
string "BT" *> blank *> commands <* blank <* string "ET" <?> "Text operators" string "BT" *> blank *> commands <* blank <* string "ET" <?> "Text operators"
where where
commands = several (a textOperator >>= runOperator) commands = several (a textOperator >>= runOperator)
runOperator :: Call TextOperator -> ParserWithFont [ByteString] runOperator :: Call TextOperator -> ParserWithFont [Text]
runOperator (Tf, [Typed (NameObject fontName), _]) = runOperator (Tf, [Typed (NameObject fontName), _]) =
asks (! fontName) >>= put >> return [] asks (! fontName) >>= put >> return []
@ -312,7 +313,7 @@ runOperator (DQuote, [Typed (StringObject outputString)]) =
runOperator _ = return [] runOperator _ = return []
decodeString :: (MonadFail m, MonadState Font m) => StringObject -> m ByteString decodeString :: (MonadFail m, MonadState Font m) => StringObject -> m Text
decodeString input = do decodeString input = do
font <- get font <- get
either fail return . font $ toByteString input either fail return . font $ toByteString input