From fc41f815a3981887a7bebe55fb708b07061db45a Mon Sep 17 00:00:00 2001 From: Tissevert Date: Wed, 15 May 2019 19:12:38 +0200 Subject: [PATCH] Broken state : trying to implement Boyer-Moore for fast-forwarding to the end of a section --- src/Data/ByteString/Lazy/Char8/Util.hs | 45 +++++++++++++++++++++++++- src/PDF.hs | 3 ++ src/PDF/Body.hs | 4 ++- src/PDF/Object.hs | 3 +- 4 files changed, 52 insertions(+), 3 deletions(-) diff --git a/src/Data/ByteString/Lazy/Char8/Util.hs b/src/Data/ByteString/Lazy/Char8/Util.hs index 831be25..b392cb4 100644 --- a/src/Data/ByteString/Lazy/Char8/Util.hs +++ b/src/Data/ByteString/Lazy/Char8/Util.hs @@ -1,11 +1,15 @@ +{-# LANGUAGE NamedFieldPuns #-} module Data.ByteString.Lazy.Char8.Util ( previous + , search , subBS ) where import Data.ByteString.Lazy.Char8 (ByteString) -import qualified Data.ByteString.Lazy.Char8 as BS (drop, index, take) +import qualified Data.ByteString.Lazy.Char8 as BS (drop, index, isPrefixOf, length, tails, take, uncons) import Data.Int (Int64) +import Data.Map (Map) +import qualified Data.Map as Map (empty, fromList, insert, lookup, member) import Prelude hiding (length) previous :: Char -> Int64 -> ByteString -> Int64 @@ -13,5 +17,44 @@ previous char position byteString | BS.index byteString position == char = position | otherwise = previous char (position - 1) byteString +data BMTable = BMTable { + length :: Int64 + , offsets :: Map Int64 (Map Char Int64) + } deriving Show + +prepare :: ByteString -> BMTable +prepare needle = + let length = BS.length needle in + let offsets = Map.fromList $ generateSuffixOffsets <$> [0..length - 1] in + BMTable {length, offsets} + where + generateSuffixOffsets l = + let suffix = BS.drop l needle in + let prefixes = fmap (l -) <$> zip (BS.tails needle) [0..l] in + (l, foldl (addOffset suffix) Map.empty prefixes) + addOffset suffix tmpMap ((aSuffix, delta)) = maybe tmpMap id $ do + (initial, rest) <- BS.uncons aSuffix + if BS.isPrefixOf suffix rest && not (Map.member initial tmpMap) + then return (Map.insert initial delta tmpMap) + else Nothing + +jump :: BMTable -> Int64 -> Char -> Int64 +jump (BMTable {length, offsets}) index char = + maybe length (maybe length id . Map.lookup char) (Map.lookup index offsets) + +search :: ByteString -> ByteString -> Maybe Int64 +search needle = boyerMoore 0 + where + table = prepare needle + tryMatch offset haystack n + | n < 0 = Just offset + | BS.index haystack n == BS.index needle n = tryMatch offset haystack (n-1) + | otherwise = + let delta = jump table n (BS.index haystack n) in + boyerMoore (offset + delta) (BS.drop delta haystack) + boyerMoore offset haystack + | BS.length haystack < 1 = Nothing + | otherwise = tryMatch offset haystack (BS.length needle - 1) + subBS :: Int64 -> Int64 -> ByteString -> ByteString subBS offset length = BS.take length . BS.drop offset diff --git a/src/PDF.hs b/src/PDF.hs index 57c90a3..2e056bf 100644 --- a/src/PDF.hs +++ b/src/PDF.hs @@ -65,6 +65,9 @@ parseDocument input = do let contents = populate input <$> structures return $ Document {pdfVersion, contents} +findNextContentSection :: Int64 -> ByteString -> Int64 +findNextContentSection startXref input = + iterateContents :: Int64 -> ByteString -> Either ParseError [Content] iterateContents startXref input = parse content ('@':show startXref) (BS.drop startXref input) >>= stopOrFollow diff --git a/src/PDF/Body.hs b/src/PDF/Body.hs index 7e43763..d0f6554 100644 --- a/src/PDF/Body.hs +++ b/src/PDF/Body.hs @@ -1,5 +1,7 @@ {-# LANGUAGE NamedFieldPuns #-} -module PDF.Body where +module PDF.Body ( + populate + ) where import Data.ByteString.Lazy.Char8 (ByteString) import qualified Data.ByteString.Lazy.Char8 as BS (drop, pack) diff --git a/src/PDF/Object.hs b/src/PDF/Object.hs index bc96161..549881d 100644 --- a/src/PDF/Object.hs +++ b/src/PDF/Object.hs @@ -78,7 +78,8 @@ data XRefSubSection = XRefSubSection { type XRefSection = [XRefSubSection] data Content = Content { - body :: [Occurrence] + startOffset :: Int64 + , body :: [Occurrence] , objects :: Map Int Object , xrefSection :: XRefSection , trailer :: Dictionary