Add support for fonts and implement MacRomanEncoding
This commit is contained in:
parent
c48ab22808
commit
325250383a
|
@ -26,7 +26,10 @@ library
|
|||
, PDF.Text
|
||||
, PDF.Update
|
||||
other-modules: Data.ByteString.Char8.Util
|
||||
, PDF.Encoding
|
||||
, PDF.Encoding.MacRoman
|
||||
, PDF.Body
|
||||
, PDF.Font
|
||||
-- other-extensions:
|
||||
build-depends: attoparsec
|
||||
, base >=4.9 && <4.13
|
||||
|
@ -65,6 +68,7 @@ executable getText
|
|||
, containers
|
||||
, Hufflepdf
|
||||
, mtl
|
||||
, text
|
||||
, zlib
|
||||
ghc-options: -Wall
|
||||
default-language: Haskell2010
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
import qualified Data.ByteString.Char8 as BS (putStrLn, readFile)
|
||||
import qualified Data.ByteString.Char8 as BS (readFile)
|
||||
import qualified Data.Map as Map (toList)
|
||||
import qualified Data.Text.IO as Text (putStrLn)
|
||||
import PDF (Document(..), parseDocument)
|
||||
import PDF.Object (Content)
|
||||
import PDF.Pages (Page(..), get, getAll)
|
||||
|
@ -16,7 +17,7 @@ onDoc inputFile f = do
|
|||
Right value -> return value
|
||||
|
||||
displayPage :: Page -> IO ()
|
||||
displayPage = mapM_ BS.putStrLn . contents
|
||||
displayPage = mapM_ Text.putStrLn . contents
|
||||
|
||||
wholeDoc :: FilePath -> IO ()
|
||||
wholeDoc inputFile = do
|
||||
|
|
|
@ -18,7 +18,8 @@ import qualified Data.ByteString as BS (empty, foldl, length, pack, singleton, s
|
|||
import qualified Data.ByteString.Char8 as Char8 (
|
||||
cons, drop, index, splitAt, take, uncons, unpack
|
||||
)
|
||||
import Data.Text.Encoding (encodeUtf8, decodeUtf16BE)
|
||||
import Data.Text (Text)
|
||||
import Data.Text.Encoding (decodeUtf16BE)
|
||||
import Prelude hiding (length)
|
||||
import Text.Printf (printf)
|
||||
|
||||
|
@ -86,5 +87,5 @@ unescape escapedBS =
|
|||
| otherwise -> Char8.cons c (unescape s')
|
||||
fromOctal (code, s) = Char8.cons (toEnum $ b8ToInt (B8Int code)) (unescape s)
|
||||
|
||||
utf16BEToutf8 :: ByteString -> ByteString
|
||||
utf16BEToutf8 = encodeUtf8 . decodeUtf16BE
|
||||
utf16BEToutf8 :: ByteString -> Text
|
||||
utf16BEToutf8 = decodeUtf16BE
|
||||
|
|
|
@ -1,4 +1,7 @@
|
|||
{-# LANGUAGE NamedFieldPuns #-}
|
||||
{-# LANGUAGE OverloadedStrings #-}
|
||||
{-# LANGUAGE FlexibleInstances #-}
|
||||
{-# LANGUAGE TypeSynonymInstances #-}
|
||||
module PDF.CMap (
|
||||
CMap
|
||||
, CMappers
|
||||
|
@ -12,13 +15,18 @@ import Control.Applicative ((<|>), many)
|
|||
import Control.Monad.State (modify)
|
||||
import Data.Attoparsec.ByteString.Char8 (count)
|
||||
import Data.ByteString (ByteString)
|
||||
import qualified Data.ByteString as BS (length)
|
||||
import qualified Data.ByteString as BS (drop, length, null, take)
|
||||
import Data.ByteString.Char8 (unpack)
|
||||
import Data.ByteString.Char8.Util (
|
||||
B16Int(..), b16ToBytes, b16ToInt, toBytes, utf16BEToutf8
|
||||
)
|
||||
import Data.Map (Map, union)
|
||||
import qualified Data.Map as Map (adjust, empty, fromList, insertWith)
|
||||
import qualified Data.Map as Map (
|
||||
adjust, empty, fromList, insertWith, lookup, toList
|
||||
)
|
||||
import Data.Text (Text)
|
||||
import qualified PDF.EOL as EOL (charset, parser)
|
||||
import PDF.Font (Font)
|
||||
import PDF.Object (
|
||||
DirectObject(..), Name, StringObject(..)
|
||||
, blank, directObject, integer, line, stringObject
|
||||
|
@ -26,7 +34,7 @@ import PDF.Object (
|
|||
import PDF.Parser (MonadParser, Parser, runParser, takeAll)
|
||||
|
||||
type CMappers = Map Name CMap
|
||||
type Mapping = Map ByteString ByteString
|
||||
type Mapping = Map ByteString Text
|
||||
data CRange = CRange {
|
||||
fromSequence :: ByteString
|
||||
, toSequence :: ByteString
|
||||
|
@ -35,6 +43,26 @@ data CRange = CRange {
|
|||
type RangeSize = Int
|
||||
type CMap = Map RangeSize [CRange]
|
||||
|
||||
toFont :: CMap -> Font
|
||||
toFont aCMap input
|
||||
| BS.null input = Right ""
|
||||
| otherwise = do
|
||||
(output, remainingInput) <- trySizes input $ Map.toList aCMap
|
||||
mappend output <$> toFont aCMap remainingInput
|
||||
where
|
||||
trySizes s [] = Left $ "No matching code found in font for " ++ unpack s
|
||||
trySizes s ((size, cRanges):others) =
|
||||
let prefix = BS.take size s in
|
||||
case tryRanges prefix cRanges of
|
||||
Nothing -> trySizes s others
|
||||
Just outputSequence -> Right (outputSequence, BS.drop size s)
|
||||
tryRanges :: ByteString -> [CRange] -> Maybe Text
|
||||
tryRanges _ [] = Nothing
|
||||
tryRanges prefix ((CRange {mapping}):cRanges) =
|
||||
case Map.lookup prefix mapping of
|
||||
Nothing -> tryRanges prefix cRanges
|
||||
outputSequence -> outputSequence
|
||||
|
||||
emptyCMap :: CMap
|
||||
emptyCMap = Map.empty
|
||||
|
||||
|
@ -42,8 +70,8 @@ matches :: ByteString -> CRange -> Bool
|
|||
matches code (CRange {fromSequence, toSequence}) =
|
||||
fromSequence <= code && code <= toSequence
|
||||
|
||||
cMap :: ByteString -> Either String CMap
|
||||
cMap = fmap snd <$> runParser
|
||||
cMap :: ByteString -> Either String Font
|
||||
cMap = fmap (toFont . snd) <$> runParser
|
||||
(many (codeRanges <|> cMapRange <|> cMapChar <|> ignoredLine))
|
||||
emptyCMap
|
||||
where
|
||||
|
@ -81,7 +109,7 @@ cMapRange = do
|
|||
<*> directObject <* EOL.parser
|
||||
>>= mapFromTo
|
||||
|
||||
saveMapping :: [(ByteString, ByteString)] -> Parser CMap ()
|
||||
saveMapping :: [(ByteString, Text)] -> Parser CMap ()
|
||||
saveMapping [] = return ()
|
||||
saveMapping assoc@((code, _):_) = modify $ Map.adjust insertCRange mappingSize
|
||||
where
|
||||
|
@ -112,7 +140,7 @@ startFrom from@(B16Int s) =
|
|||
let size = BS.length s `div` 2 in
|
||||
toBytes size <$> [b16ToInt from .. ]
|
||||
|
||||
mapFromTo :: MonadParser m => (StringObject, StringObject, DirectObject) -> m [(ByteString, ByteString)]
|
||||
mapFromTo :: MonadParser m => (StringObject, StringObject, DirectObject) -> m [(ByteString, Text)]
|
||||
mapFromTo (Hexadecimal from, Hexadecimal to, StringObject (Hexadecimal dstFrom)) =
|
||||
return $ zip (between from to) (utf16BEToutf8 <$> startFrom dstFrom)
|
||||
|
||||
|
@ -125,7 +153,7 @@ mapFromTo (Hexadecimal from, Hexadecimal to, Array dstPoints) =
|
|||
|
||||
mapFromTo _ = fail "invalid range mapping found"
|
||||
|
||||
pairMapping :: MonadParser m => (StringObject, StringObject) -> m (ByteString, ByteString)
|
||||
pairMapping :: MonadParser m => (StringObject, StringObject) -> m (ByteString, Text)
|
||||
pairMapping (Hexadecimal from, Hexadecimal to) =
|
||||
return (b16ToBytes from, utf16BEToutf8 $ b16ToBytes to)
|
||||
pairMapping _ = fail "invalid pair mapping found"
|
||||
|
|
|
@ -0,0 +1,10 @@
|
|||
module PDF.Encoding (
|
||||
encoding
|
||||
) where
|
||||
|
||||
import PDF.Encoding.MacRoman (macRomanEncoding)
|
||||
import PDF.Font (Font)
|
||||
|
||||
encoding :: String -> Either String Font
|
||||
encoding "MacRomanEncoding" = Right macRomanEncoding
|
||||
encoding s = Left $ "Unknown encoding " ++ s
|
|
@ -0,0 +1,141 @@
|
|||
module PDF.Encoding.MacRoman (
|
||||
macRomanEncoding
|
||||
) where
|
||||
|
||||
import Data.ByteString.Char8 (unpack)
|
||||
import Data.Text (pack)
|
||||
import PDF.Font (Font)
|
||||
|
||||
macRomanEncoding :: Font
|
||||
macRomanEncoding = Right . pack . fmap decode . unpack
|
||||
|
||||
decode :: Char -> Char
|
||||
decode '\x80' = '\x00C4' -- LATIN CAPITAL LETTER A WITH DIAERESIS
|
||||
decode '\x81' = '\x00C5' -- LATIN CAPITAL LETTER A WITH RING ABOVE
|
||||
decode '\x82' = '\x00C7' -- LATIN CAPITAL LETTER C WITH CEDILLA
|
||||
decode '\x83' = '\x00C9' -- LATIN CAPITAL LETTER E WITH ACUTE
|
||||
decode '\x84' = '\x00D1' -- LATIN CAPITAL LETTER N WITH TILDE
|
||||
decode '\x85' = '\x00D6' -- LATIN CAPITAL LETTER O WITH DIAERESIS
|
||||
decode '\x86' = '\x00DC' -- LATIN CAPITAL LETTER U WITH DIAERESIS
|
||||
decode '\x87' = '\x00E1' -- LATIN SMALL LETTER A WITH ACUTE
|
||||
decode '\x88' = '\x00E0' -- LATIN SMALL LETTER A WITH GRAVE
|
||||
decode '\x89' = '\x00E2' -- LATIN SMALL LETTER A WITH CIRCUMFLEX
|
||||
decode '\x8A' = '\x00E4' -- LATIN SMALL LETTER A WITH DIAERESIS
|
||||
decode '\x8B' = '\x00E3' -- LATIN SMALL LETTER A WITH TILDE
|
||||
decode '\x8C' = '\x00E5' -- LATIN SMALL LETTER A WITH RING ABOVE
|
||||
decode '\x8D' = '\x00E7' -- LATIN SMALL LETTER C WITH CEDILLA
|
||||
decode '\x8E' = '\x00E9' -- LATIN SMALL LETTER E WITH ACUTE
|
||||
decode '\x8F' = '\x00E8' -- LATIN SMALL LETTER E WITH GRAVE
|
||||
decode '\x90' = '\x00EA' -- LATIN SMALL LETTER E WITH CIRCUMFLEX
|
||||
decode '\x91' = '\x00EB' -- LATIN SMALL LETTER E WITH DIAERESIS
|
||||
decode '\x92' = '\x00ED' -- LATIN SMALL LETTER I WITH ACUTE
|
||||
decode '\x93' = '\x00EC' -- LATIN SMALL LETTER I WITH GRAVE
|
||||
decode '\x94' = '\x00EE' -- LATIN SMALL LETTER I WITH CIRCUMFLEX
|
||||
decode '\x95' = '\x00EF' -- LATIN SMALL LETTER I WITH DIAERESIS
|
||||
decode '\x96' = '\x00F1' -- LATIN SMALL LETTER N WITH TILDE
|
||||
decode '\x97' = '\x00F3' -- LATIN SMALL LETTER O WITH ACUTE
|
||||
decode '\x98' = '\x00F2' -- LATIN SMALL LETTER O WITH GRAVE
|
||||
decode '\x99' = '\x00F4' -- LATIN SMALL LETTER O WITH CIRCUMFLEX
|
||||
decode '\x9A' = '\x00F6' -- LATIN SMALL LETTER O WITH DIAERESIS
|
||||
decode '\x9B' = '\x00F5' -- LATIN SMALL LETTER O WITH TILDE
|
||||
decode '\x9C' = '\x00FA' -- LATIN SMALL LETTER U WITH ACUTE
|
||||
decode '\x9D' = '\x00F9' -- LATIN SMALL LETTER U WITH GRAVE
|
||||
decode '\x9E' = '\x00FB' -- LATIN SMALL LETTER U WITH CIRCUMFLEX
|
||||
decode '\x9F' = '\x00FC' -- LATIN SMALL LETTER U WITH DIAERESIS
|
||||
decode '\xA0' = '\x2020' -- DAGGER
|
||||
decode '\xA1' = '\x00B0' -- DEGREE SIGN
|
||||
decode '\xA2' = '\x00A2' -- CENT SIGN
|
||||
decode '\xA3' = '\x00A3' -- POUND SIGN
|
||||
decode '\xA4' = '\x00A7' -- SECTION SIGN
|
||||
decode '\xA5' = '\x2022' -- BULLET
|
||||
decode '\xA6' = '\x00B6' -- PILCROW SIGN
|
||||
decode '\xA7' = '\x00DF' -- LATIN SMALL LETTER SHARP S
|
||||
decode '\xA8' = '\x00AE' -- REGISTERED SIGN
|
||||
decode '\xA9' = '\x00A9' -- COPYRIGHT SIGN
|
||||
decode '\xAA' = '\x2122' -- TRADE MARK SIGN
|
||||
decode '\xAB' = '\x00B4' -- ACUTE ACCENT
|
||||
decode '\xAC' = '\x00A8' -- DIAERESIS
|
||||
decode '\xAD' = '\x2260' -- NOT EQUAL TO
|
||||
decode '\xAE' = '\x00C6' -- LATIN CAPITAL LETTER AE
|
||||
decode '\xAF' = '\x00D8' -- LATIN CAPITAL LETTER O WITH STROKE
|
||||
decode '\xB0' = '\x221E' -- INFINITY
|
||||
decode '\xB1' = '\x00B1' -- PLUS-MINUS SIGN
|
||||
decode '\xB2' = '\x2264' -- LESS-THAN OR EQUAL TO
|
||||
decode '\xB3' = '\x2265' -- GREATER-THAN OR EQUAL TO
|
||||
decode '\xB4' = '\x00A5' -- YEN SIGN
|
||||
decode '\xB5' = '\x00B5' -- MICRO SIGN
|
||||
decode '\xB6' = '\x2202' -- PARTIAL DIFFERENTIAL
|
||||
decode '\xB7' = '\x2211' -- N-ARY SUMMATION
|
||||
decode '\xB8' = '\x220F' -- N-ARY PRODUCT
|
||||
decode '\xB9' = '\x03C0' -- GREEK SMALL LETTER PI
|
||||
decode '\xBA' = '\x222B' -- INTEGRAL
|
||||
decode '\xBB' = '\x00AA' -- FEMININE ORDINAL INDICATOR
|
||||
decode '\xBC' = '\x00BA' -- MASCULINE ORDINAL INDICATOR
|
||||
decode '\xBD' = '\x03A9' -- GREEK CAPITAL LETTER OMEGA
|
||||
decode '\xBE' = '\x00E6' -- LATIN SMALL LETTER AE
|
||||
decode '\xBF' = '\x00F8' -- LATIN SMALL LETTER O WITH STROKE
|
||||
decode '\xC0' = '\x00BF' -- INVERTED QUESTION MARK
|
||||
decode '\xC1' = '\x00A1' -- INVERTED EXCLAMATION MARK
|
||||
decode '\xC2' = '\x00AC' -- NOT SIGN
|
||||
decode '\xC3' = '\x221A' -- SQUARE ROOT
|
||||
decode '\xC4' = '\x0192' -- LATIN SMALL LETTER F WITH HOOK
|
||||
decode '\xC5' = '\x2248' -- ALMOST EQUAL TO
|
||||
decode '\xC6' = '\x2206' -- INCREMENT
|
||||
decode '\xC7' = '\x00AB' -- LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
|
||||
decode '\xC8' = '\x00BB' -- RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
|
||||
decode '\xC9' = '\x2026' -- HORIZONTAL ELLIPSIS
|
||||
decode '\xCA' = '\x00A0' -- NO-BREAK SPACE
|
||||
decode '\xCB' = '\x00C0' -- LATIN CAPITAL LETTER A WITH GRAVE
|
||||
decode '\xCC' = '\x00C3' -- LATIN CAPITAL LETTER A WITH TILDE
|
||||
decode '\xCD' = '\x00D5' -- LATIN CAPITAL LETTER O WITH TILDE
|
||||
decode '\xCE' = '\x0152' -- LATIN CAPITAL LIGATURE OE
|
||||
decode '\xCF' = '\x0153' -- LATIN SMALL LIGATURE OE
|
||||
decode '\xD0' = '\x2013' -- EN DASH
|
||||
decode '\xD1' = '\x2014' -- EM DASH
|
||||
decode '\xD2' = '\x201C' -- LEFT DOUBLE QUOTATION MARK
|
||||
decode '\xD3' = '\x201D' -- RIGHT DOUBLE QUOTATION MARK
|
||||
decode '\xD4' = '\x2018' -- LEFT SINGLE QUOTATION MARK
|
||||
decode '\xD5' = '\x2019' -- RIGHT SINGLE QUOTATION MARK
|
||||
decode '\xD6' = '\x00F7' -- DIVISION SIGN
|
||||
decode '\xD7' = '\x25CA' -- LOZENGE
|
||||
decode '\xD8' = '\x00FF' -- LATIN SMALL LETTER Y WITH DIAERESIS
|
||||
decode '\xD9' = '\x0178' -- LATIN CAPITAL LETTER Y WITH DIAERESIS
|
||||
decode '\xDA' = '\x2044' -- FRACTION SLASH
|
||||
decode '\xDB' = '\x20AC' -- EURO SIGN
|
||||
decode '\xDC' = '\x2039' -- SINGLE LEFT-POINTING ANGLE QUOTATION MARK
|
||||
decode '\xDD' = '\x203A' -- SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
|
||||
decode '\xDE' = '\xFB01' -- LATIN SMALL LIGATURE FI
|
||||
decode '\xDF' = '\xFB02' -- LATIN SMALL LIGATURE FL
|
||||
decode '\xE0' = '\x2021' -- DOUBLE DAGGER
|
||||
decode '\xE1' = '\x00B7' -- MIDDLE DOT
|
||||
decode '\xE2' = '\x201A' -- SINGLE LOW-9 QUOTATION MARK
|
||||
decode '\xE3' = '\x201E' -- DOUBLE LOW-9 QUOTATION MARK
|
||||
decode '\xE4' = '\x2030' -- PER MILLE SIGN
|
||||
decode '\xE5' = '\x00C2' -- LATIN CAPITAL LETTER A WITH CIRCUMFLEX
|
||||
decode '\xE6' = '\x00CA' -- LATIN CAPITAL LETTER E WITH CIRCUMFLEX
|
||||
decode '\xE7' = '\x00C1' -- LATIN CAPITAL LETTER A WITH ACUTE
|
||||
decode '\xE8' = '\x00CB' -- LATIN CAPITAL LETTER E WITH DIAERESIS
|
||||
decode '\xE9' = '\x00C8' -- LATIN CAPITAL LETTER E WITH GRAVE
|
||||
decode '\xEA' = '\x00CD' -- LATIN CAPITAL LETTER I WITH ACUTE
|
||||
decode '\xEB' = '\x00CE' -- LATIN CAPITAL LETTER I WITH CIRCUMFLEX
|
||||
decode '\xEC' = '\x00CF' -- LATIN CAPITAL LETTER I WITH DIAERESIS
|
||||
decode '\xED' = '\x00CC' -- LATIN CAPITAL LETTER I WITH GRAVE
|
||||
decode '\xEE' = '\x00D3' -- LATIN CAPITAL LETTER O WITH ACUTE
|
||||
decode '\xEF' = '\x00D4' -- LATIN CAPITAL LETTER O WITH CIRCUMFLEX
|
||||
decode '\xF0' = '\xF8FF' -- Apple logo
|
||||
decode '\xF1' = '\x00D2' -- LATIN CAPITAL LETTER O WITH GRAVE
|
||||
decode '\xF2' = '\x00DA' -- LATIN CAPITAL LETTER U WITH ACUTE
|
||||
decode '\xF3' = '\x00DB' -- LATIN CAPITAL LETTER U WITH CIRCUMFLEX
|
||||
decode '\xF4' = '\x00D9' -- LATIN CAPITAL LETTER U WITH GRAVE
|
||||
decode '\xF5' = '\x0131' -- LATIN SMALL LETTER DOTLESS I
|
||||
decode '\xF6' = '\x02C6' -- MODIFIER LETTER CIRCUMFLEX ACCENT
|
||||
decode '\xF7' = '\x02DC' -- SMALL TILDE
|
||||
decode '\xF8' = '\x00AF' -- MACRON
|
||||
decode '\xF9' = '\x02D8' -- BREVE
|
||||
decode '\xFA' = '\x02D9' -- DOT ABOVE
|
||||
decode '\xFB' = '\x02DA' -- RING ABOVE
|
||||
decode '\xFC' = '\x00B8' -- CEDILLA
|
||||
decode '\xFD' = '\x02DD' -- DOUBLE ACUTE ACCENT
|
||||
decode '\xFE' = '\x02DB' -- OGONEK
|
||||
decode '\xFF' = '\x02C7' -- CARON
|
||||
decode c = c -- The rest is ASCII
|
|
@ -0,0 +1,16 @@
|
|||
module PDF.Font (
|
||||
Font
|
||||
, FontSet
|
||||
, emptyFont
|
||||
) where
|
||||
|
||||
import Data.ByteString (ByteString)
|
||||
import Data.Map (Map)
|
||||
import Data.Text (Text)
|
||||
import PDF.Object (Name)
|
||||
|
||||
type Font = ByteString -> Either String Text
|
||||
type FontSet = Map Name Font
|
||||
|
||||
emptyFont :: Font
|
||||
emptyFont _ = Left "No fond loaded"
|
|
@ -116,7 +116,7 @@ char :: Char -> OBuilder
|
|||
char c = lift char8 c <* offset (+1)
|
||||
|
||||
string :: String -> OBuilder
|
||||
string s = lift string8 s <* offset (+ toEnum (length s))
|
||||
string s = lift string8 s <* offset (+ length s)
|
||||
|
||||
line :: String -> OBuilder
|
||||
line l = string l `mappend` newLine
|
||||
|
|
|
@ -6,41 +6,40 @@ module PDF.Pages (
|
|||
) where
|
||||
|
||||
import Codec.Compression.Zlib (decompress)
|
||||
import Control.Applicative ((<|>))
|
||||
import Control.Monad (foldM)
|
||||
import Control.Monad.RWS (RWST(..), ask, evalRWST, mapRWST, modify)
|
||||
import Control.Monad.RWS (RWST(..), ask, evalRWST, lift, modify)
|
||||
import qualified Control.Monad.RWS as RWS (get)
|
||||
import Data.ByteString (ByteString)
|
||||
import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict)
|
||||
import Data.Map (Map, (!))
|
||||
import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
|
||||
import PDF.CMap (CMap, CMappers, cMap)
|
||||
import Data.Text (Text)
|
||||
import PDF.CMap (cMap)
|
||||
import PDF.Encoding (encoding)
|
||||
import PDF.Font (Font, FontSet)
|
||||
import PDF.Object (
|
||||
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
|
||||
, Object(..), Name(..), Structure(..)
|
||||
,)
|
||||
import PDF.Output (ObjectId(..))
|
||||
import PDF.Text (pageContents)
|
||||
import Prelude hiding (fail)
|
||||
import Text.Printf (printf)
|
||||
|
||||
type CachedCMaps = Map ObjectId CMap
|
||||
type T = RWST Content () CachedCMaps (Either String)
|
||||
type CachedFonts = Map ObjectId Font
|
||||
type T = RWST Content () CachedFonts (Either String)
|
||||
data Page = Page {
|
||||
contents :: [ByteString]
|
||||
contents :: [Text]
|
||||
, source :: ObjectId
|
||||
}
|
||||
|
||||
infixl 1 \\=
|
||||
(\\=) :: T a -> (a -> Either String b) -> T b
|
||||
x \\= f = mapRWST ((\(a, s, w) -> (\b -> (b, s, w)) <$> f a) =<<) x
|
||||
x \\= f = x >>= lift . f
|
||||
|
||||
infixl 1 //=
|
||||
(//=) :: Either String a -> (a -> T b) -> T b
|
||||
(//=) (Left e) _ = RWST (\_ _ -> Left e)
|
||||
(//=) (Right a) f = f a
|
||||
|
||||
lift :: Either String a -> T a
|
||||
lift x = x //= return
|
||||
x //= f = lift x >>= f
|
||||
|
||||
expected :: Show a => String -> a -> Either String b
|
||||
expected name = Left . printf "Not a %s: %s" name . show
|
||||
|
@ -60,14 +59,14 @@ getResource (Reference (IndirectObjCoordinates {objectId})) =
|
|||
getResource directObject =
|
||||
lift $ expected "resource (dictionary or reference)" directObject
|
||||
|
||||
getFont :: Dictionary -> T Dictionary
|
||||
getFont pageDict =
|
||||
getFontDictionary :: Dictionary -> T Dictionary
|
||||
getFontDictionary pageDict =
|
||||
key "Resources" pageDict
|
||||
//= getResource
|
||||
\\= key "Font"
|
||||
>>= getResource
|
||||
|
||||
cache :: (ObjectId -> T CMap) -> ObjectId -> T CMap
|
||||
cache :: (ObjectId -> T Font) -> ObjectId -> T Font
|
||||
cache loader objectId =
|
||||
(maybe load return . Map.lookup objectId) =<< RWS.get
|
||||
where
|
||||
|
@ -76,23 +75,27 @@ cache loader objectId =
|
|||
modify $ Map.insert objectId value
|
||||
return value
|
||||
|
||||
loadFont :: ObjectId -> T CMap
|
||||
loadFont objectId =
|
||||
getObject objectId
|
||||
\\= dict
|
||||
\\= key "ToUnicode"
|
||||
>>= follow
|
||||
\\= stream
|
||||
\\= cMap
|
||||
|
||||
loadCMappers :: Dictionary -> T CMappers
|
||||
loadCMappers = foldM loadCMapper Map.empty . Map.toList
|
||||
loadFont :: ObjectId -> T Font
|
||||
loadFont objectId = getObject objectId \\= dict >>= tryMappings
|
||||
where
|
||||
loadCMapper :: CMappers -> (Name, DirectObject) -> T CMappers
|
||||
loadCMapper output (name, Reference (IndirectObjCoordinates {objectId})) =
|
||||
tryMappings dictionary =
|
||||
loadCMap dictionary
|
||||
<|> lift (key "Encoding" dictionary >>= loadEncoding)
|
||||
<|> lift (Left $ unknownFormat (show objectId) (show dictionary))
|
||||
unknownFormat = printf "Unknown font format for object #%s : %s"
|
||||
loadCMap dictionary =
|
||||
key "ToUnicode" dictionary //= follow \\= stream \\= cMap
|
||||
loadEncoding (NameObject (Name name)) = encoding name
|
||||
loadEncoding directObject =
|
||||
Left . printf "Encoding must be a name, not that : %s" $ show directObject
|
||||
|
||||
loadFonts :: Dictionary -> T FontSet
|
||||
loadFonts = foldM addFont Map.empty . Map.toList
|
||||
where
|
||||
addFont :: FontSet -> (Name, DirectObject) -> T FontSet
|
||||
addFont output (name, Reference (IndirectObjCoordinates {objectId})) =
|
||||
flip (Map.insert name) output <$> cache loadFont objectId
|
||||
--maybe output (flip (Map.insert name) output) <$> cache loadFont objectId
|
||||
loadCMapper output _ = return output
|
||||
addFont output _ = return output
|
||||
|
||||
getObject :: ObjectId -> T Object
|
||||
getObject objectId = do
|
||||
|
@ -139,16 +142,16 @@ getReferences objects = do
|
|||
Reference (IndirectObjCoordinates {objectId}) -> [objectId]
|
||||
_ -> []
|
||||
|
||||
extractText :: Object -> T [ByteString]
|
||||
extractText :: Object -> T [Text]
|
||||
extractText object = do
|
||||
pageDict <- lift $ dict object
|
||||
cMappers <- loadCMappers =<< getFont pageDict
|
||||
fonts <- loadFonts =<< getFontDictionary pageDict
|
||||
let objects = ((many <$> (key "Contents" pageDict)) :: Either String [DirectObject])
|
||||
concat <$> (objects //= (mapM $ loadContent cMappers))
|
||||
concat <$> (objects //= (mapM $ loadContent fonts))
|
||||
where
|
||||
loadContent :: CMappers -> DirectObject -> T [ByteString]
|
||||
loadContent cMappers directObject =
|
||||
follow directObject \\= stream \\= pageContents cMappers
|
||||
loadContent :: FontSet -> DirectObject -> T [Text]
|
||||
loadContent fonts directObject =
|
||||
follow directObject \\= stream \\= pageContents fonts
|
||||
|
||||
loadPage :: ObjectId -> T Page
|
||||
loadPage source =
|
||||
|
|
|
@ -2,21 +2,13 @@
|
|||
{-# LANGUAGE ConstraintKinds #-}
|
||||
{-# LANGUAGE UndecidableInstances #-}
|
||||
module PDF.Parser (
|
||||
MonadParser
|
||||
MonadParser(..)
|
||||
, Parser
|
||||
, (<?>)
|
||||
, block
|
||||
, char
|
||||
, decNumber
|
||||
, hexNumber
|
||||
, octDigit
|
||||
, on
|
||||
, oneOf
|
||||
, runParser
|
||||
, evalParser
|
||||
, string
|
||||
, takeAll
|
||||
, takeAll1
|
||||
) where
|
||||
|
||||
import Control.Applicative (Alternative, (<|>))
|
||||
|
@ -25,7 +17,8 @@ import Control.Monad.Fail (MonadFail(..))
|
|||
import Control.Monad.State (StateT(..), evalStateT)
|
||||
import Control.Monad.Trans (MonadTrans(..))
|
||||
import qualified Data.Attoparsec.ByteString.Char8 as Atto (
|
||||
Parser, char, parseOnly, satisfy, string, take, takeWhile, takeWhile1
|
||||
Parser, char, endOfInput, parseOnly, satisfy, string, take, takeWhile
|
||||
, takeWhile1
|
||||
)
|
||||
import Data.ByteString (ByteString)
|
||||
import Data.ByteString.Char8.Util (B16Int(..))
|
||||
|
@ -40,6 +33,7 @@ class MonadDeps m => MonadParser m where
|
|||
block :: Int -> m ByteString
|
||||
char :: Char -> m Char
|
||||
decNumber :: m ByteString
|
||||
endOfInput :: m ()
|
||||
hexNumber :: m B16Int
|
||||
oneOf :: String -> m Char
|
||||
string :: ByteString -> m ByteString
|
||||
|
@ -49,6 +43,7 @@ class MonadDeps m => MonadParser m where
|
|||
instance MonadParser Atto.Parser where
|
||||
block = Atto.take
|
||||
char = Atto.char
|
||||
endOfInput = Atto.endOfInput
|
||||
decNumber = Atto.takeWhile1 (`Set.member` digits)
|
||||
hexNumber = B16Int <$> Atto.takeWhile1 (`Set.member` hexDigits)
|
||||
oneOf charSet = Atto.satisfy (`elem` charSet)
|
||||
|
@ -59,6 +54,7 @@ instance MonadParser Atto.Parser where
|
|||
instance (MonadParser m, MonadTrans t, MonadDeps (t m)) => MonadParser (t m) where
|
||||
block = lift . block
|
||||
char = lift . char
|
||||
endOfInput = lift $ endOfInput
|
||||
decNumber = lift $ decNumber
|
||||
hexNumber = lift $ hexNumber
|
||||
oneOf = lift . oneOf
|
||||
|
|
192
src/PDF/Text.hs
192
src/PDF/Text.hs
|
@ -8,35 +8,58 @@ module PDF.Text {-(
|
|||
|
||||
import Control.Applicative ((<|>))
|
||||
import Control.Monad (foldM)
|
||||
import Control.Monad.Fail (MonadFail)
|
||||
import Control.Monad.Reader (ReaderT, runReaderT, asks)
|
||||
import Control.Monad.State (MonadState, evalStateT, get, modify, put)
|
||||
import Data.Attoparsec.ByteString.Char8 (choice, sepBy)
|
||||
import Data.ByteString (ByteString)
|
||||
import qualified Data.ByteString as BS (drop, null, take)
|
||||
import Data.ByteString.Char8 (pack, unpack)
|
||||
import Data.Char (toLower)
|
||||
import Data.Map ((!), (!?), Map)
|
||||
import qualified Data.Map as Map (fromList, lookup, toList)
|
||||
import PDF.CMap (CMappers, CMap, CRange(..), emptyCMap)
|
||||
import qualified Data.Map as Map (fromList)
|
||||
import Data.Text (Text)
|
||||
import PDF.Font (Font, FontSet, emptyFont)
|
||||
import PDF.Object (
|
||||
DirectObject(..), StringObject(..)
|
||||
, array, blank, name, regular, stringObject, toByteString
|
||||
)
|
||||
import PDF.Parser (MonadParser, (<?>), Parser, evalParser, string, takeAll1)
|
||||
import PDF.Parser (MonadParser(..), (<?>), Parser, evalParser)
|
||||
|
||||
data StateOperator =
|
||||
Cm | W | J | J_ | M | D | Ri | I | Gs -- general graphic state
|
||||
deriving (Bounded, Enum)
|
||||
C_m | W_ | J | J_ | M | D_ | R_i | I_ | G_s -- general graphic state
|
||||
deriving (Bounded, Enum, Show)
|
||||
data PathOperator =
|
||||
M_ | L_ | C_ | V_ | Y_ | H_ | R_e -- path construction
|
||||
| S | S_ | F_ | F | Fstar | B | Bstar | B_ | B_star | N_ -- path painting
|
||||
| W | Wstar -- clipping path
|
||||
deriving (Bounded, Enum, Show)
|
||||
data ColorOperator =
|
||||
CS | C_s | SC | SCN | S_c | S_cn | G | G_ | RG | R_g | K | K_
|
||||
deriving (Bounded, Enum, Show)
|
||||
data TextOperator =
|
||||
Td | TD | Tm | Tstar -- text positioning
|
||||
| TJ | Tj | Quote | DQuote -- text showing
|
||||
| Tc | Tw | Tz | TL | Tf | Tr | Ts -- text state
|
||||
deriving (Bounded, Enum)
|
||||
deriving (Bounded, Enum, Show)
|
||||
data Argument = Raw ByteString | Typed DirectObject deriving Show
|
||||
type Call a = (a, [Argument])
|
||||
|
||||
type Operator a = (Bounded a, Enum a, Show a)
|
||||
|
||||
code :: Operator a => a -> ByteString
|
||||
code = pack . expand . show
|
||||
where
|
||||
expand "" = ""
|
||||
expand (c:'_':s) = toLower c : expand s
|
||||
expand ('s':'t':'a':'r':s) = '*' : expand s
|
||||
expand ('Q':'u':'o':'t':'e':s) = '\'' : expand s
|
||||
expand ('D':'Q':'u':'o':'t':'e':s) = '"' : expand s
|
||||
expand (c:s) = c : expand s
|
||||
|
||||
{-
|
||||
instance Show StateOperator where
|
||||
show Cm = "cm"
|
||||
show W = "w"
|
||||
show W_ = "w"
|
||||
show J = "J"
|
||||
show J_ = "j"
|
||||
show M = "M"
|
||||
|
@ -45,6 +68,35 @@ instance Show StateOperator where
|
|||
show I = "i"
|
||||
show Gs = "gs"
|
||||
|
||||
instance Show PathOperator where
|
||||
show M_ = "m"
|
||||
show L_ = "l"
|
||||
show C_ = "c"
|
||||
show V_ = "v"
|
||||
show Y_ = "y"
|
||||
show H_
|
||||
("m", (M_, \l -> case l of [Raw _, Raw _] -> True ; _ -> False))
|
||||
, ("l", (L_, \l -> case l of [Raw _, Raw _] -> True ; _ -> False))
|
||||
, ("c", (L_, \l -> case l of [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False))
|
||||
, ("v", (L_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False))
|
||||
, ("y", (L_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False))
|
||||
, ("h", (L_, \l -> case l of [] -> True ; _ -> False))
|
||||
, ("re", (L_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False))
|
||||
, ("S", (L_, \l -> case l of [] -> True ; _ -> False))
|
||||
, ("s", (L_, \l -> case l of [] -> True ; _ -> False))
|
||||
, ("f", (L_, \l -> case l of [] -> True ; _ -> False))
|
||||
, ("F", (L_, \l -> case l of [] -> True ; _ -> False))
|
||||
, ("F*", (L_, \l -> case l of [] -> True ; _ -> False))
|
||||
, ("B", (L_, \l -> case l of [] -> True ; _ -> False))
|
||||
, ("B*", (L_, \l -> case l of [] -> True ; _ -> False))
|
||||
, ("b", (L_, \l -> case l of [] -> True ; _ -> False))
|
||||
, ("b*", (L_, \l -> case l of [] -> True ; _ -> False))
|
||||
, ("n", (L_, \l -> case l of [] -> True ; _ -> False))
|
||||
, ("W", (L_, \l -> case l of [] -> True ; _ -> False))
|
||||
, ("W*", (L_, \l -> case l of [] -> True ; _ -> False))
|
||||
|
||||
instance Show ColorOperator where
|
||||
|
||||
instance Show TextOperator where
|
||||
show Td = "Td"
|
||||
show TD = "TD"
|
||||
|
@ -61,18 +113,58 @@ instance Show TextOperator where
|
|||
show Tf = "Tf"
|
||||
show Tr = "Tr"
|
||||
show Ts = "Ts"
|
||||
-}
|
||||
|
||||
stateOperator :: OperatorTable StateOperator
|
||||
stateOperator = Map.fromList [
|
||||
("cm", (Cm, \l -> case l of [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False))
|
||||
, ("w", (W, \l -> case l of [Raw _] -> True ; _ -> False))
|
||||
, ("J", (J, \l -> case l of [Raw _] -> True ; _ -> False))
|
||||
, ("j", (J_, \l -> case l of [Raw _] -> True ; _ -> False))
|
||||
, ("M", (M, \l -> case l of [Raw _] -> True ; _ -> False))
|
||||
, ("d", (D, \l -> case l of [Raw _, Raw _] -> True ; _ -> False))
|
||||
, ("ri", (Ri, \l -> case l of [Raw _] -> True ; _ -> False))
|
||||
, ("i", (I, \l -> case l of [Raw _] -> True ; _ -> False))
|
||||
, ("gs", (Gs, \l -> case l of [Typed (NameObject _)] -> True ; _ -> False))
|
||||
stateOperator = Map.fromList $ (\(op, checker) -> (code op, (op, checker))) <$> [
|
||||
(C_m, \l -> case l of [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
|
||||
, (W_, \l -> case l of [Raw _] -> True ; _ -> False)
|
||||
, (J, \l -> case l of [Raw _] -> True ; _ -> False)
|
||||
, (J_, \l -> case l of [Raw _] -> True ; _ -> False)
|
||||
, (M, \l -> case l of [Raw _] -> True ; _ -> False)
|
||||
, (D_, \l -> case l of [Raw _, Raw _] -> True ; _ -> False)
|
||||
, (R_i, \l -> case l of [Raw _] -> True ; _ -> False)
|
||||
, (I_, \l -> case l of [Raw _] -> True ; _ -> False)
|
||||
, (G_s, \l -> case l of [Typed (NameObject _)] -> True ; _ -> False)
|
||||
]
|
||||
|
||||
pathOperator :: OperatorTable PathOperator
|
||||
pathOperator = Map.fromList $ (\(op, checker) -> (code op, (op, checker))) <$> [
|
||||
(M_, \l -> case l of [Raw _, Raw _] -> True ; _ -> False)
|
||||
, (L_, \l -> case l of [Raw _, Raw _] -> True ; _ -> False)
|
||||
, (C_, \l -> case l of [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
|
||||
, (V_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
|
||||
, (Y_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
|
||||
, (H_, \l -> case l of [] -> True ; _ -> False)
|
||||
, (R_e, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
|
||||
, (S, \l -> case l of [] -> True ; _ -> False)
|
||||
, (S_, \l -> case l of [] -> True ; _ -> False)
|
||||
, (F_, \l -> case l of [] -> True ; _ -> False)
|
||||
, (F, \l -> case l of [] -> True ; _ -> False)
|
||||
, (Fstar, \l -> case l of [] -> True ; _ -> False)
|
||||
, (B, \l -> case l of [] -> True ; _ -> False)
|
||||
, (Bstar, \l -> case l of [] -> True ; _ -> False)
|
||||
, (B_, \l -> case l of [] -> True ; _ -> False)
|
||||
, (B_star, \l -> case l of [] -> True ; _ -> False)
|
||||
, (N_, \l -> case l of [] -> True ; _ -> False)
|
||||
, (W, \l -> case l of [] -> True ; _ -> False)
|
||||
, (Wstar, \l -> case l of [] -> True ; _ -> False)
|
||||
]
|
||||
|
||||
colorOperator :: OperatorTable ColorOperator
|
||||
colorOperator = Map.fromList $ (\(op, checker) -> (code op, (op, checker))) <$> [
|
||||
(CS, \l -> case l of [Typed (NameObject _)] -> True ; _ -> False)
|
||||
, (C_s, \l -> case l of [Typed (NameObject _)] -> True ; _ -> False)
|
||||
, (SC, \_ -> True)
|
||||
, (SCN, \_ -> True)
|
||||
, (S_c, \_ -> True)
|
||||
, (S_cn, \_ -> True)
|
||||
, (G, \l -> case l of [Raw _] -> True ; _ -> False)
|
||||
, (G_, \l -> case l of [Raw _] -> True ; _ -> False)
|
||||
, (RG, \l -> case l of [Raw _, Raw _, Raw _] -> True ; _ -> False)
|
||||
, (R_g, \l -> case l of [Raw _, Raw _, Raw _] -> True ; _ -> False)
|
||||
, (K, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
|
||||
, (K_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
|
||||
]
|
||||
|
||||
{-
|
||||
|
@ -89,7 +181,7 @@ stateOperator _ = False
|
|||
-}
|
||||
|
||||
textOperator :: OperatorTable TextOperator
|
||||
textOperator = Map.fromList $ (\(op, checker) -> (pack $ show op, (op, checker))) <$> [
|
||||
textOperator = Map.fromList $ (\(op, checker) -> (code op, (op, checker))) <$> [
|
||||
(Td, \l -> case l of [Raw _, Raw _] -> True ; _ -> False)
|
||||
, (TD, \l -> case l of [Raw _, Raw _] -> True ; _ -> False)
|
||||
, (Tm, \l -> case l of [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
|
||||
|
@ -127,7 +219,7 @@ textOperator _ = False
|
|||
-}
|
||||
|
||||
type ArgumentStackParser m = (MonadState [Argument] m, MonadParser m)
|
||||
type Operator a = (Bounded a, Enum a, Show a)
|
||||
--type Operator a = (Bounded a, Enum a, Show a)
|
||||
type OperatorTable a = Map ByteString (TypeChecker a)
|
||||
type TypeChecker a = (a, [Argument] -> Bool)
|
||||
|
||||
|
@ -146,14 +238,14 @@ callChunk table =
|
|||
Nothing -> return . Right $ Raw chunk
|
||||
Just typeChecker -> return $ Left typeChecker
|
||||
|
||||
stackParser :: (ArgumentStackParser m, Show a) => OperatorTable a -> m (Call a)
|
||||
stackParser :: (ArgumentStackParser m, Operator a) => OperatorTable a -> m (Call a)
|
||||
stackParser table = either popCall push =<< (callChunk table)
|
||||
where
|
||||
push arg = modify (arg:) >> stackParser table
|
||||
popCall (operator, predicate) = do
|
||||
arguments <- reverse <$> get
|
||||
let call = (operator, arguments)
|
||||
if predicate arguments then return call else fail (show call)
|
||||
if predicate arguments then return call else fail (unpack $ code operator)
|
||||
|
||||
a :: (Operator a, MonadParser m) => OperatorTable a -> m (Call a)
|
||||
a table = evalStateT (stackParser table) []
|
||||
|
@ -170,29 +262,34 @@ nameArg = Typed . NameObject <$> name <* blank
|
|||
stringArg :: MonadParser m => m Argument
|
||||
stringArg = Typed . StringObject <$> stringObject <* blank
|
||||
|
||||
type ParserWithFont = ReaderT CMappers (Parser CMap)
|
||||
type ParserWithFont = ReaderT FontSet (Parser Font)
|
||||
|
||||
pageContents :: CMappers -> ByteString -> Either String [ByteString]
|
||||
pageContents font input =
|
||||
evalParser (runReaderT page font) emptyCMap input
|
||||
pageContents :: FontSet -> ByteString -> Either String [Text]
|
||||
pageContents fontSet input =
|
||||
evalParser (runReaderT (page) fontSet) emptyFont input
|
||||
|
||||
page :: ParserWithFont [ByteString]
|
||||
page = graphicState <|> text <?> "Text page contents"
|
||||
several :: MonadParser m => m [a] -> m [a]
|
||||
several p = concat <$> (p `sepBy` blank)
|
||||
|
||||
graphicState :: ParserWithFont [ByteString]
|
||||
page :: ParserWithFont [Text]
|
||||
page = several (graphicState <|> text) <* blank <* endOfInput <?> "Text page contents"
|
||||
|
||||
graphicState :: ParserWithFont [Text]
|
||||
graphicState =
|
||||
(string "q" *> blank *> insideQ <* blank <* string "Q") <?> "Graphic state"
|
||||
string "q" *> blank *> insideQ <* blank <* string "Q" <?> "Graphic state"
|
||||
where
|
||||
insideQ = concat <$> ((command <|> page) `sepBy` blank )
|
||||
command = a stateOperator *> return []
|
||||
insideQ = several (command <|> graphicState <|> text)
|
||||
ignore x = a x *> return []
|
||||
command =
|
||||
ignore stateOperator <|> ignore pathOperator <|> ignore colorOperator
|
||||
|
||||
text :: ParserWithFont [ByteString]
|
||||
text :: ParserWithFont [Text]
|
||||
text =
|
||||
string "BT" *> blank *> commands <* blank <* string "ET" <?> "Text operators"
|
||||
where
|
||||
commands = concat <$> (a textOperator >>= runOperator) `sepBy` blank
|
||||
commands = several (a textOperator >>= runOperator)
|
||||
|
||||
runOperator :: Call TextOperator -> ParserWithFont [ByteString]
|
||||
runOperator :: Call TextOperator -> ParserWithFont [Text]
|
||||
runOperator (Tf, [Typed (NameObject fontName), _]) =
|
||||
asks (! fontName) >>= put >> return []
|
||||
|
||||
|
@ -216,24 +313,7 @@ runOperator (DQuote, [Typed (StringObject outputString)]) =
|
|||
|
||||
runOperator _ = return []
|
||||
|
||||
decodeString :: StringObject -> ParserWithFont ByteString
|
||||
decodeString = decode . toByteString
|
||||
where
|
||||
decode input
|
||||
| BS.null input = return ""
|
||||
| otherwise = do
|
||||
(output, remainingInput) <- trySizes input =<< Map.toList <$> get
|
||||
mappend output <$> decode remainingInput
|
||||
trySizes :: ByteString -> [(Int, [CRange])] -> ParserWithFont (ByteString, ByteString)
|
||||
trySizes s [] = fail $ "No matching code found in font for " ++ unpack s
|
||||
trySizes s ((size, cRanges):others) =
|
||||
let prefix = BS.take size s in
|
||||
case tryRanges prefix cRanges of
|
||||
Nothing -> trySizes s others
|
||||
Just outputSequence -> return (outputSequence, BS.drop size s)
|
||||
tryRanges :: ByteString -> [CRange] -> Maybe ByteString
|
||||
tryRanges _ [] = Nothing
|
||||
tryRanges prefix ((CRange {mapping}):cRanges) =
|
||||
case Map.lookup prefix mapping of
|
||||
Nothing -> tryRanges prefix cRanges
|
||||
outputSequence -> outputSequence
|
||||
decodeString :: (MonadFail m, MonadState Font m) => StringObject -> m Text
|
||||
decodeString input = do
|
||||
font <- get
|
||||
either fail return . font $ toByteString input
|
||||
|
|
Loading…
Reference in New Issue