Migrate to Text to represent page contents and get rid of encoding concerns to early
This commit is contained in:
parent
57996749c6
commit
fe055150a3
7 changed files with 27 additions and 20 deletions
|
@ -68,6 +68,7 @@ executable getText
|
|||
, containers
|
||||
, Hufflepdf
|
||||
, mtl
|
||||
, text
|
||||
, zlib
|
||||
ghc-options: -Wall
|
||||
default-language: Haskell2010
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
import qualified Data.ByteString.Char8 as BS (putStrLn, readFile)
|
||||
import qualified Data.ByteString.Char8 as BS (readFile)
|
||||
import qualified Data.Map as Map (toList)
|
||||
import qualified Data.Text.IO as Text (putStrLn)
|
||||
import PDF (Document(..), parseDocument)
|
||||
import PDF.Object (Content)
|
||||
import PDF.Pages (Page(..), get, getAll)
|
||||
|
@ -16,7 +17,7 @@ onDoc inputFile f = do
|
|||
Right value -> return value
|
||||
|
||||
displayPage :: Page -> IO ()
|
||||
displayPage = mapM_ BS.putStrLn . contents
|
||||
displayPage = mapM_ Text.putStrLn . contents
|
||||
|
||||
wholeDoc :: FilePath -> IO ()
|
||||
wholeDoc inputFile = do
|
||||
|
|
|
@ -18,7 +18,8 @@ import qualified Data.ByteString as BS (empty, foldl, length, pack, singleton, s
|
|||
import qualified Data.ByteString.Char8 as Char8 (
|
||||
cons, drop, index, splitAt, take, uncons, unpack
|
||||
)
|
||||
import Data.Text.Encoding (encodeUtf8, decodeUtf16BE)
|
||||
import Data.Text (Text)
|
||||
import Data.Text.Encoding (decodeUtf16BE)
|
||||
import Prelude hiding (length)
|
||||
import Text.Printf (printf)
|
||||
|
||||
|
@ -86,5 +87,5 @@ unescape escapedBS =
|
|||
| otherwise -> Char8.cons c (unescape s')
|
||||
fromOctal (code, s) = Char8.cons (toEnum $ b8ToInt (B8Int code)) (unescape s)
|
||||
|
||||
utf16BEToutf8 :: ByteString -> ByteString
|
||||
utf16BEToutf8 = encodeUtf8 . decodeUtf16BE
|
||||
utf16BEToutf8 :: ByteString -> Text
|
||||
utf16BEToutf8 = decodeUtf16BE
|
||||
|
|
|
@ -24,6 +24,7 @@ import Data.Map (Map, union)
|
|||
import qualified Data.Map as Map (
|
||||
adjust, empty, fromList, insertWith, lookup, toList
|
||||
)
|
||||
import Data.Text (Text)
|
||||
import qualified PDF.EOL as EOL (charset, parser)
|
||||
import PDF.Font (Font)
|
||||
import PDF.Object (
|
||||
|
@ -33,7 +34,7 @@ import PDF.Object (
|
|||
import PDF.Parser (MonadParser, Parser, runParser, takeAll)
|
||||
|
||||
type CMappers = Map Name CMap
|
||||
type Mapping = Map ByteString ByteString
|
||||
type Mapping = Map ByteString Text
|
||||
data CRange = CRange {
|
||||
fromSequence :: ByteString
|
||||
, toSequence :: ByteString
|
||||
|
@ -55,7 +56,7 @@ toFont aCMap input
|
|||
case tryRanges prefix cRanges of
|
||||
Nothing -> trySizes s others
|
||||
Just outputSequence -> Right (outputSequence, BS.drop size s)
|
||||
tryRanges :: ByteString -> [CRange] -> Maybe ByteString
|
||||
tryRanges :: ByteString -> [CRange] -> Maybe Text
|
||||
tryRanges _ [] = Nothing
|
||||
tryRanges prefix ((CRange {mapping}):cRanges) =
|
||||
case Map.lookup prefix mapping of
|
||||
|
@ -108,7 +109,7 @@ cMapRange = do
|
|||
<*> directObject <* EOL.parser
|
||||
>>= mapFromTo
|
||||
|
||||
saveMapping :: [(ByteString, ByteString)] -> Parser CMap ()
|
||||
saveMapping :: [(ByteString, Text)] -> Parser CMap ()
|
||||
saveMapping [] = return ()
|
||||
saveMapping assoc@((code, _):_) = modify $ Map.adjust insertCRange mappingSize
|
||||
where
|
||||
|
@ -139,7 +140,7 @@ startFrom from@(B16Int s) =
|
|||
let size = BS.length s `div` 2 in
|
||||
toBytes size <$> [b16ToInt from .. ]
|
||||
|
||||
mapFromTo :: MonadParser m => (StringObject, StringObject, DirectObject) -> m [(ByteString, ByteString)]
|
||||
mapFromTo :: MonadParser m => (StringObject, StringObject, DirectObject) -> m [(ByteString, Text)]
|
||||
mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) =
|
||||
return $ zip (between from to) (utf16BEToutf8 <$> startFrom dstFrom)
|
||||
|
||||
|
@ -152,7 +153,7 @@ mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) =
|
|||
|
||||
mapFromTo _ = fail "invalid range mapping found"
|
||||
|
||||
pairMapping :: MonadParser m => (StringObject, StringObject) -> m (ByteString, ByteString)
|
||||
pairMapping :: MonadParser m => (StringObject, StringObject) -> m (ByteString, Text)
|
||||
pairMapping (Hexadecimal from, Hexadecimal to) =
|
||||
return (b16ToBytes from, utf16BEToutf8 $ b16ToBytes to)
|
||||
pairMapping _ = fail "invalid pair mapping found"
|
||||
|
|
|
@ -6,9 +6,10 @@ module PDF.Font (
|
|||
|
||||
import Data.ByteString (ByteString)
|
||||
import Data.Map (Map)
|
||||
import Data.Text (Text)
|
||||
import PDF.Object (Name)
|
||||
|
||||
type Font = ByteString -> Either String ByteString
|
||||
type Font = ByteString -> Either String Text
|
||||
type FontSet = Map Name Font
|
||||
|
||||
emptyFont :: Font
|
||||
|
|
|
@ -14,6 +14,7 @@ import Data.ByteString (ByteString)
|
|||
import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict)
|
||||
import Data.Map (Map, (!))
|
||||
import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
|
||||
import Data.Text (Text)
|
||||
import PDF.CMap (cMap)
|
||||
import PDF.Encoding (encoding)
|
||||
import PDF.Font (Font, FontSet)
|
||||
|
@ -28,7 +29,7 @@ import Text.Printf (printf)
|
|||
type CachedFonts = Map ObjectId Font
|
||||
type T = RWST Content () CachedFonts (Either String)
|
||||
data Page = Page {
|
||||
contents :: [ByteString]
|
||||
contents :: [Text]
|
||||
, source :: ObjectId
|
||||
}
|
||||
|
||||
|
@ -145,14 +146,14 @@ getReferences objects = do
|
|||
Reference (IndirectObjCoordinates {objectId}) -> [objectId]
|
||||
_ -> []
|
||||
|
||||
extractText :: Object -> T [ByteString]
|
||||
extractText :: Object -> T [Text]
|
||||
extractText object = do
|
||||
pageDict <- lift $ dict object
|
||||
fonts <- loadFonts =<< getFontDictionary pageDict
|
||||
let objects = ((many <$> (key "Contents" pageDict)) :: Either String [DirectObject])
|
||||
concat <$> (objects //= (mapM $ loadContent fonts))
|
||||
where
|
||||
loadContent :: FontSet -> DirectObject -> T [ByteString]
|
||||
loadContent :: FontSet -> DirectObject -> T [Text]
|
||||
loadContent fonts directObject =
|
||||
follow directObject \\= stream \\= pageContents fonts
|
||||
|
||||
|
|
|
@ -17,6 +17,7 @@ import Data.ByteString.Char8 (pack, unpack)
|
|||
import Data.Char (toLower)
|
||||
import Data.Map ((!), (!?), Map)
|
||||
import qualified Data.Map as Map (fromList)
|
||||
import Data.Text (Text)
|
||||
import PDF.Font (Font, FontSet, emptyFont)
|
||||
import PDF.Object (
|
||||
DirectObject(..), StringObject(..)
|
||||
|
@ -263,17 +264,17 @@ stringArg = Typed . StringObject <$> stringObject <* blank
|
|||
|
||||
type ParserWithFont = ReaderT FontSet (Parser Font)
|
||||
|
||||
pageContents :: FontSet -> ByteString -> Either String [ByteString]
|
||||
pageContents :: FontSet -> ByteString -> Either String [Text]
|
||||
pageContents fontSet input =
|
||||
evalParser (runReaderT (page) fontSet) emptyFont input
|
||||
|
||||
several :: MonadParser m => m [a] -> m [a]
|
||||
several p = concat <$> (p `sepBy` blank)
|
||||
|
||||
page :: ParserWithFont [ByteString]
|
||||
page :: ParserWithFont [Text]
|
||||
page = several (graphicState <|> text) <* blank <* endOfInput <?> "Text page contents"
|
||||
|
||||
graphicState :: ParserWithFont [ByteString]
|
||||
graphicState :: ParserWithFont [Text]
|
||||
graphicState =
|
||||
string "q" *> blank *> insideQ <* blank <* string "Q" <?> "Graphic state"
|
||||
where
|
||||
|
@ -282,13 +283,13 @@ graphicState =
|
|||
command =
|
||||
ignore stateOperator <|> ignore pathOperator <|> ignore colorOperator
|
||||
|
||||
text :: ParserWithFont [ByteString]
|
||||
text :: ParserWithFont [Text]
|
||||
text =
|
||||
string "BT" *> blank *> commands <* blank <* string "ET" <?> "Text operators"
|
||||
where
|
||||
commands = several (a textOperator >>= runOperator)
|
||||
|
||||
runOperator :: Call TextOperator -> ParserWithFont [ByteString]
|
||||
runOperator :: Call TextOperator -> ParserWithFont [Text]
|
||||
runOperator (Tf, [Typed (NameObject fontName), _]) =
|
||||
asks (! fontName) >>= put >> return []
|
||||
|
||||
|
@ -312,7 +313,7 @@ runOperator (DQuote, [Typed (StringObject outputString)]) =
|
|||
|
||||
runOperator _ = return []
|
||||
|
||||
decodeString :: (MonadFail m, MonadState Font m) => StringObject -> m ByteString
|
||||
decodeString :: (MonadFail m, MonadState Font m) => StringObject -> m Text
|
||||
decodeString input = do
|
||||
font <- get
|
||||
either fail return . font $ toByteString input
|
||||
|
|
Loading…
Reference in a new issue