Broken state : trying to implement Boyer-Moore for fast-forwarding to the end of a section

This commit is contained in:
Tissevert 2019-05-15 19:12:38 +02:00
parent 379a821550
commit fc41f815a3
4 changed files with 52 additions and 3 deletions

View file

@ -1,11 +1,15 @@
{-# LANGUAGE NamedFieldPuns #-}
module Data.ByteString.Lazy.Char8.Util ( module Data.ByteString.Lazy.Char8.Util (
previous previous
, search
, subBS , subBS
) where ) where
import Data.ByteString.Lazy.Char8 (ByteString) import Data.ByteString.Lazy.Char8 (ByteString)
import qualified Data.ByteString.Lazy.Char8 as BS (drop, index, take) import qualified Data.ByteString.Lazy.Char8 as BS (drop, index, isPrefixOf, length, tails, take, uncons)
import Data.Int (Int64) import Data.Int (Int64)
import Data.Map (Map)
import qualified Data.Map as Map (empty, fromList, insert, lookup, member)
import Prelude hiding (length) import Prelude hiding (length)
previous :: Char -> Int64 -> ByteString -> Int64 previous :: Char -> Int64 -> ByteString -> Int64
@ -13,5 +17,44 @@ previous char position byteString
| BS.index byteString position == char = position | BS.index byteString position == char = position
| otherwise = previous char (position - 1) byteString | otherwise = previous char (position - 1) byteString
data BMTable = BMTable {
length :: Int64
, offsets :: Map Int64 (Map Char Int64)
} deriving Show
prepare :: ByteString -> BMTable
prepare needle =
let length = BS.length needle in
let offsets = Map.fromList $ generateSuffixOffsets <$> [0..length - 1] in
BMTable {length, offsets}
where
generateSuffixOffsets l =
let suffix = BS.drop l needle in
let prefixes = fmap (l -) <$> zip (BS.tails needle) [0..l] in
(l, foldl (addOffset suffix) Map.empty prefixes)
addOffset suffix tmpMap ((aSuffix, delta)) = maybe tmpMap id $ do
(initial, rest) <- BS.uncons aSuffix
if BS.isPrefixOf suffix rest && not (Map.member initial tmpMap)
then return (Map.insert initial delta tmpMap)
else Nothing
jump :: BMTable -> Int64 -> Char -> Int64
jump (BMTable {length, offsets}) index char =
maybe length (maybe length id . Map.lookup char) (Map.lookup index offsets)
search :: ByteString -> ByteString -> Maybe Int64
search needle = boyerMoore 0
where
table = prepare needle
tryMatch offset haystack n
| n < 0 = Just offset
| BS.index haystack n == BS.index needle n = tryMatch offset haystack (n-1)
| otherwise =
let delta = jump table n (BS.index haystack n) in
boyerMoore (offset + delta) (BS.drop delta haystack)
boyerMoore offset haystack
| BS.length haystack < 1 = Nothing
| otherwise = tryMatch offset haystack (BS.length needle - 1)
subBS :: Int64 -> Int64 -> ByteString -> ByteString subBS :: Int64 -> Int64 -> ByteString -> ByteString
subBS offset length = BS.take length . BS.drop offset subBS offset length = BS.take length . BS.drop offset

View file

@ -65,6 +65,9 @@ parseDocument input = do
let contents = populate input <$> structures let contents = populate input <$> structures
return $ Document {pdfVersion, contents} return $ Document {pdfVersion, contents}
findNextContentSection :: Int64 -> ByteString -> Int64
findNextContentSection startXref input =
iterateContents :: Int64 -> ByteString -> Either ParseError [Content] iterateContents :: Int64 -> ByteString -> Either ParseError [Content]
iterateContents startXref input = iterateContents startXref input =
parse content ('@':show startXref) (BS.drop startXref input) >>= stopOrFollow parse content ('@':show startXref) (BS.drop startXref input) >>= stopOrFollow

View file

@ -1,5 +1,7 @@
{-# LANGUAGE NamedFieldPuns #-} {-# LANGUAGE NamedFieldPuns #-}
module PDF.Body where module PDF.Body (
populate
) where
import Data.ByteString.Lazy.Char8 (ByteString) import Data.ByteString.Lazy.Char8 (ByteString)
import qualified Data.ByteString.Lazy.Char8 as BS (drop, pack) import qualified Data.ByteString.Lazy.Char8 as BS (drop, pack)

View file

@ -78,7 +78,8 @@ data XRefSubSection = XRefSubSection {
type XRefSection = [XRefSubSection] type XRefSection = [XRefSubSection]
data Content = Content { data Content = Content {
body :: [Occurrence] startOffset :: Int64
, body :: [Occurrence]
, objects :: Map Int Object , objects :: Map Int Object
, xrefSection :: XRefSection , xrefSection :: XRefSection
, trailer :: Dictionary , trailer :: Dictionary