Migrate to Text to represent page contents and get rid of encoding concerns to early
This commit is contained in:
parent
57996749c6
commit
fe055150a3
7 changed files with 27 additions and 20 deletions
|
@ -68,6 +68,7 @@ executable getText
|
||||||
, containers
|
, containers
|
||||||
, Hufflepdf
|
, Hufflepdf
|
||||||
, mtl
|
, mtl
|
||||||
|
, text
|
||||||
, zlib
|
, zlib
|
||||||
ghc-options: -Wall
|
ghc-options: -Wall
|
||||||
default-language: Haskell2010
|
default-language: Haskell2010
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
import qualified Data.ByteString.Char8 as BS (putStrLn, readFile)
|
import qualified Data.ByteString.Char8 as BS (readFile)
|
||||||
import qualified Data.Map as Map (toList)
|
import qualified Data.Map as Map (toList)
|
||||||
|
import qualified Data.Text.IO as Text (putStrLn)
|
||||||
import PDF (Document(..), parseDocument)
|
import PDF (Document(..), parseDocument)
|
||||||
import PDF.Object (Content)
|
import PDF.Object (Content)
|
||||||
import PDF.Pages (Page(..), get, getAll)
|
import PDF.Pages (Page(..), get, getAll)
|
||||||
|
@ -16,7 +17,7 @@ onDoc inputFile f = do
|
||||||
Right value -> return value
|
Right value -> return value
|
||||||
|
|
||||||
displayPage :: Page -> IO ()
|
displayPage :: Page -> IO ()
|
||||||
displayPage = mapM_ BS.putStrLn . contents
|
displayPage = mapM_ Text.putStrLn . contents
|
||||||
|
|
||||||
wholeDoc :: FilePath -> IO ()
|
wholeDoc :: FilePath -> IO ()
|
||||||
wholeDoc inputFile = do
|
wholeDoc inputFile = do
|
||||||
|
|
|
@ -18,7 +18,8 @@ import qualified Data.ByteString as BS (empty, foldl, length, pack, singleton, s
|
||||||
import qualified Data.ByteString.Char8 as Char8 (
|
import qualified Data.ByteString.Char8 as Char8 (
|
||||||
cons, drop, index, splitAt, take, uncons, unpack
|
cons, drop, index, splitAt, take, uncons, unpack
|
||||||
)
|
)
|
||||||
import Data.Text.Encoding (encodeUtf8, decodeUtf16BE)
|
import Data.Text (Text)
|
||||||
|
import Data.Text.Encoding (decodeUtf16BE)
|
||||||
import Prelude hiding (length)
|
import Prelude hiding (length)
|
||||||
import Text.Printf (printf)
|
import Text.Printf (printf)
|
||||||
|
|
||||||
|
@ -86,5 +87,5 @@ unescape escapedBS =
|
||||||
| otherwise -> Char8.cons c (unescape s')
|
| otherwise -> Char8.cons c (unescape s')
|
||||||
fromOctal (code, s) = Char8.cons (toEnum $ b8ToInt (B8Int code)) (unescape s)
|
fromOctal (code, s) = Char8.cons (toEnum $ b8ToInt (B8Int code)) (unescape s)
|
||||||
|
|
||||||
utf16BEToutf8 :: ByteString -> ByteString
|
utf16BEToutf8 :: ByteString -> Text
|
||||||
utf16BEToutf8 = encodeUtf8 . decodeUtf16BE
|
utf16BEToutf8 = decodeUtf16BE
|
||||||
|
|
|
@ -24,6 +24,7 @@ import Data.Map (Map, union)
|
||||||
import qualified Data.Map as Map (
|
import qualified Data.Map as Map (
|
||||||
adjust, empty, fromList, insertWith, lookup, toList
|
adjust, empty, fromList, insertWith, lookup, toList
|
||||||
)
|
)
|
||||||
|
import Data.Text (Text)
|
||||||
import qualified PDF.EOL as EOL (charset, parser)
|
import qualified PDF.EOL as EOL (charset, parser)
|
||||||
import PDF.Font (Font)
|
import PDF.Font (Font)
|
||||||
import PDF.Object (
|
import PDF.Object (
|
||||||
|
@ -33,7 +34,7 @@ import PDF.Object (
|
||||||
import PDF.Parser (MonadParser, Parser, runParser, takeAll)
|
import PDF.Parser (MonadParser, Parser, runParser, takeAll)
|
||||||
|
|
||||||
type CMappers = Map Name CMap
|
type CMappers = Map Name CMap
|
||||||
type Mapping = Map ByteString ByteString
|
type Mapping = Map ByteString Text
|
||||||
data CRange = CRange {
|
data CRange = CRange {
|
||||||
fromSequence :: ByteString
|
fromSequence :: ByteString
|
||||||
, toSequence :: ByteString
|
, toSequence :: ByteString
|
||||||
|
@ -55,7 +56,7 @@ toFont aCMap input
|
||||||
case tryRanges prefix cRanges of
|
case tryRanges prefix cRanges of
|
||||||
Nothing -> trySizes s others
|
Nothing -> trySizes s others
|
||||||
Just outputSequence -> Right (outputSequence, BS.drop size s)
|
Just outputSequence -> Right (outputSequence, BS.drop size s)
|
||||||
tryRanges :: ByteString -> [CRange] -> Maybe ByteString
|
tryRanges :: ByteString -> [CRange] -> Maybe Text
|
||||||
tryRanges _ [] = Nothing
|
tryRanges _ [] = Nothing
|
||||||
tryRanges prefix ((CRange {mapping}):cRanges) =
|
tryRanges prefix ((CRange {mapping}):cRanges) =
|
||||||
case Map.lookup prefix mapping of
|
case Map.lookup prefix mapping of
|
||||||
|
@ -108,7 +109,7 @@ cMapRange = do
|
||||||
<*> directObject <* EOL.parser
|
<*> directObject <* EOL.parser
|
||||||
>>= mapFromTo
|
>>= mapFromTo
|
||||||
|
|
||||||
saveMapping :: [(ByteString, ByteString)] -> Parser CMap ()
|
saveMapping :: [(ByteString, Text)] -> Parser CMap ()
|
||||||
saveMapping [] = return ()
|
saveMapping [] = return ()
|
||||||
saveMapping assoc@((code, _):_) = modify $ Map.adjust insertCRange mappingSize
|
saveMapping assoc@((code, _):_) = modify $ Map.adjust insertCRange mappingSize
|
||||||
where
|
where
|
||||||
|
@ -139,7 +140,7 @@ startFrom from@(B16Int s) =
|
||||||
let size = BS.length s `div` 2 in
|
let size = BS.length s `div` 2 in
|
||||||
toBytes size <$> [b16ToInt from .. ]
|
toBytes size <$> [b16ToInt from .. ]
|
||||||
|
|
||||||
mapFromTo :: MonadParser m => (StringObject, StringObject, DirectObject) -> m [(ByteString, ByteString)]
|
mapFromTo :: MonadParser m => (StringObject, StringObject, DirectObject) -> m [(ByteString, Text)]
|
||||||
mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) =
|
mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) =
|
||||||
return $ zip (between from to) (utf16BEToutf8 <$> startFrom dstFrom)
|
return $ zip (between from to) (utf16BEToutf8 <$> startFrom dstFrom)
|
||||||
|
|
||||||
|
@ -152,7 +153,7 @@ mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) =
|
||||||
|
|
||||||
mapFromTo _ = fail "invalid range mapping found"
|
mapFromTo _ = fail "invalid range mapping found"
|
||||||
|
|
||||||
pairMapping :: MonadParser m => (StringObject, StringObject) -> m (ByteString, ByteString)
|
pairMapping :: MonadParser m => (StringObject, StringObject) -> m (ByteString, Text)
|
||||||
pairMapping (Hexadecimal from, Hexadecimal to) =
|
pairMapping (Hexadecimal from, Hexadecimal to) =
|
||||||
return (b16ToBytes from, utf16BEToutf8 $ b16ToBytes to)
|
return (b16ToBytes from, utf16BEToutf8 $ b16ToBytes to)
|
||||||
pairMapping _ = fail "invalid pair mapping found"
|
pairMapping _ = fail "invalid pair mapping found"
|
||||||
|
|
|
@ -6,9 +6,10 @@ module PDF.Font (
|
||||||
|
|
||||||
import Data.ByteString (ByteString)
|
import Data.ByteString (ByteString)
|
||||||
import Data.Map (Map)
|
import Data.Map (Map)
|
||||||
|
import Data.Text (Text)
|
||||||
import PDF.Object (Name)
|
import PDF.Object (Name)
|
||||||
|
|
||||||
type Font = ByteString -> Either String ByteString
|
type Font = ByteString -> Either String Text
|
||||||
type FontSet = Map Name Font
|
type FontSet = Map Name Font
|
||||||
|
|
||||||
emptyFont :: Font
|
emptyFont :: Font
|
||||||
|
|
|
@ -14,6 +14,7 @@ import Data.ByteString (ByteString)
|
||||||
import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict)
|
import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict)
|
||||||
import Data.Map (Map, (!))
|
import Data.Map (Map, (!))
|
||||||
import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
|
import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
|
||||||
|
import Data.Text (Text)
|
||||||
import PDF.CMap (cMap)
|
import PDF.CMap (cMap)
|
||||||
import PDF.Encoding (encoding)
|
import PDF.Encoding (encoding)
|
||||||
import PDF.Font (Font, FontSet)
|
import PDF.Font (Font, FontSet)
|
||||||
|
@ -28,7 +29,7 @@ import Text.Printf (printf)
|
||||||
type CachedFonts = Map ObjectId Font
|
type CachedFonts = Map ObjectId Font
|
||||||
type T = RWST Content () CachedFonts (Either String)
|
type T = RWST Content () CachedFonts (Either String)
|
||||||
data Page = Page {
|
data Page = Page {
|
||||||
contents :: [ByteString]
|
contents :: [Text]
|
||||||
, source :: ObjectId
|
, source :: ObjectId
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -145,14 +146,14 @@ getReferences objects = do
|
||||||
Reference (IndirectObjCoordinates {objectId}) -> [objectId]
|
Reference (IndirectObjCoordinates {objectId}) -> [objectId]
|
||||||
_ -> []
|
_ -> []
|
||||||
|
|
||||||
extractText :: Object -> T [ByteString]
|
extractText :: Object -> T [Text]
|
||||||
extractText object = do
|
extractText object = do
|
||||||
pageDict <- lift $ dict object
|
pageDict <- lift $ dict object
|
||||||
fonts <- loadFonts =<< getFontDictionary pageDict
|
fonts <- loadFonts =<< getFontDictionary pageDict
|
||||||
let objects = ((many <$> (key "Contents" pageDict)) :: Either String [DirectObject])
|
let objects = ((many <$> (key "Contents" pageDict)) :: Either String [DirectObject])
|
||||||
concat <$> (objects //= (mapM $ loadContent fonts))
|
concat <$> (objects //= (mapM $ loadContent fonts))
|
||||||
where
|
where
|
||||||
loadContent :: FontSet -> DirectObject -> T [ByteString]
|
loadContent :: FontSet -> DirectObject -> T [Text]
|
||||||
loadContent fonts directObject =
|
loadContent fonts directObject =
|
||||||
follow directObject \\= stream \\= pageContents fonts
|
follow directObject \\= stream \\= pageContents fonts
|
||||||
|
|
||||||
|
|
|
@ -17,6 +17,7 @@ import Data.ByteString.Char8 (pack, unpack)
|
||||||
import Data.Char (toLower)
|
import Data.Char (toLower)
|
||||||
import Data.Map ((!), (!?), Map)
|
import Data.Map ((!), (!?), Map)
|
||||||
import qualified Data.Map as Map (fromList)
|
import qualified Data.Map as Map (fromList)
|
||||||
|
import Data.Text (Text)
|
||||||
import PDF.Font (Font, FontSet, emptyFont)
|
import PDF.Font (Font, FontSet, emptyFont)
|
||||||
import PDF.Object (
|
import PDF.Object (
|
||||||
DirectObject(..), StringObject(..)
|
DirectObject(..), StringObject(..)
|
||||||
|
@ -263,17 +264,17 @@ stringArg = Typed . StringObject <$> stringObject <* blank
|
||||||
|
|
||||||
type ParserWithFont = ReaderT FontSet (Parser Font)
|
type ParserWithFont = ReaderT FontSet (Parser Font)
|
||||||
|
|
||||||
pageContents :: FontSet -> ByteString -> Either String [ByteString]
|
pageContents :: FontSet -> ByteString -> Either String [Text]
|
||||||
pageContents fontSet input =
|
pageContents fontSet input =
|
||||||
evalParser (runReaderT (page) fontSet) emptyFont input
|
evalParser (runReaderT (page) fontSet) emptyFont input
|
||||||
|
|
||||||
several :: MonadParser m => m [a] -> m [a]
|
several :: MonadParser m => m [a] -> m [a]
|
||||||
several p = concat <$> (p `sepBy` blank)
|
several p = concat <$> (p `sepBy` blank)
|
||||||
|
|
||||||
page :: ParserWithFont [ByteString]
|
page :: ParserWithFont [Text]
|
||||||
page = several (graphicState <|> text) <* blank <* endOfInput <?> "Text page contents"
|
page = several (graphicState <|> text) <* blank <* endOfInput <?> "Text page contents"
|
||||||
|
|
||||||
graphicState :: ParserWithFont [ByteString]
|
graphicState :: ParserWithFont [Text]
|
||||||
graphicState =
|
graphicState =
|
||||||
string "q" *> blank *> insideQ <* blank <* string "Q" <?> "Graphic state"
|
string "q" *> blank *> insideQ <* blank <* string "Q" <?> "Graphic state"
|
||||||
where
|
where
|
||||||
|
@ -282,13 +283,13 @@ graphicState =
|
||||||
command =
|
command =
|
||||||
ignore stateOperator <|> ignore pathOperator <|> ignore colorOperator
|
ignore stateOperator <|> ignore pathOperator <|> ignore colorOperator
|
||||||
|
|
||||||
text :: ParserWithFont [ByteString]
|
text :: ParserWithFont [Text]
|
||||||
text =
|
text =
|
||||||
string "BT" *> blank *> commands <* blank <* string "ET" <?> "Text operators"
|
string "BT" *> blank *> commands <* blank <* string "ET" <?> "Text operators"
|
||||||
where
|
where
|
||||||
commands = several (a textOperator >>= runOperator)
|
commands = several (a textOperator >>= runOperator)
|
||||||
|
|
||||||
runOperator :: Call TextOperator -> ParserWithFont [ByteString]
|
runOperator :: Call TextOperator -> ParserWithFont [Text]
|
||||||
runOperator (Tf, [Typed (NameObject fontName), _]) =
|
runOperator (Tf, [Typed (NameObject fontName), _]) =
|
||||||
asks (! fontName) >>= put >> return []
|
asks (! fontName) >>= put >> return []
|
||||||
|
|
||||||
|
@ -312,7 +313,7 @@ runOperator (DQuote, [Typed (StringObject outputString)]) =
|
||||||
|
|
||||||
runOperator _ = return []
|
runOperator _ = return []
|
||||||
|
|
||||||
decodeString :: (MonadFail m, MonadState Font m) => StringObject -> m ByteString
|
decodeString :: (MonadFail m, MonadState Font m) => StringObject -> m Text
|
||||||
decodeString input = do
|
decodeString input = do
|
||||||
font <- get
|
font <- get
|
||||||
either fail return . font $ toByteString input
|
either fail return . font $ toByteString input
|
||||||
|
|
Loading…
Reference in a new issue