Broken state : trying to implement Boyer-Moore for fast-forwarding to the end of a section
This commit is contained in:
parent
379a821550
commit
fc41f815a3
4 changed files with 52 additions and 3 deletions
|
@ -1,11 +1,15 @@
|
|||
{-# LANGUAGE NamedFieldPuns #-}
|
||||
module Data.ByteString.Lazy.Char8.Util (
|
||||
previous
|
||||
, search
|
||||
, subBS
|
||||
) where
|
||||
|
||||
import Data.ByteString.Lazy.Char8 (ByteString)
|
||||
import qualified Data.ByteString.Lazy.Char8 as BS (drop, index, take)
|
||||
import qualified Data.ByteString.Lazy.Char8 as BS (drop, index, isPrefixOf, length, tails, take, uncons)
|
||||
import Data.Int (Int64)
|
||||
import Data.Map (Map)
|
||||
import qualified Data.Map as Map (empty, fromList, insert, lookup, member)
|
||||
import Prelude hiding (length)
|
||||
|
||||
previous :: Char -> Int64 -> ByteString -> Int64
|
||||
|
@ -13,5 +17,44 @@ previous char position byteString
|
|||
| BS.index byteString position == char = position
|
||||
| otherwise = previous char (position - 1) byteString
|
||||
|
||||
data BMTable = BMTable {
|
||||
length :: Int64
|
||||
, offsets :: Map Int64 (Map Char Int64)
|
||||
} deriving Show
|
||||
|
||||
prepare :: ByteString -> BMTable
|
||||
prepare needle =
|
||||
let length = BS.length needle in
|
||||
let offsets = Map.fromList $ generateSuffixOffsets <$> [0..length - 1] in
|
||||
BMTable {length, offsets}
|
||||
where
|
||||
generateSuffixOffsets l =
|
||||
let suffix = BS.drop l needle in
|
||||
let prefixes = fmap (l -) <$> zip (BS.tails needle) [0..l] in
|
||||
(l, foldl (addOffset suffix) Map.empty prefixes)
|
||||
addOffset suffix tmpMap ((aSuffix, delta)) = maybe tmpMap id $ do
|
||||
(initial, rest) <- BS.uncons aSuffix
|
||||
if BS.isPrefixOf suffix rest && not (Map.member initial tmpMap)
|
||||
then return (Map.insert initial delta tmpMap)
|
||||
else Nothing
|
||||
|
||||
jump :: BMTable -> Int64 -> Char -> Int64
|
||||
jump (BMTable {length, offsets}) index char =
|
||||
maybe length (maybe length id . Map.lookup char) (Map.lookup index offsets)
|
||||
|
||||
search :: ByteString -> ByteString -> Maybe Int64
|
||||
search needle = boyerMoore 0
|
||||
where
|
||||
table = prepare needle
|
||||
tryMatch offset haystack n
|
||||
| n < 0 = Just offset
|
||||
| BS.index haystack n == BS.index needle n = tryMatch offset haystack (n-1)
|
||||
| otherwise =
|
||||
let delta = jump table n (BS.index haystack n) in
|
||||
boyerMoore (offset + delta) (BS.drop delta haystack)
|
||||
boyerMoore offset haystack
|
||||
| BS.length haystack < 1 = Nothing
|
||||
| otherwise = tryMatch offset haystack (BS.length needle - 1)
|
||||
|
||||
subBS :: Int64 -> Int64 -> ByteString -> ByteString
|
||||
subBS offset length = BS.take length . BS.drop offset
|
||||
|
|
|
@ -65,6 +65,9 @@ parseDocument input = do
|
|||
let contents = populate input <$> structures
|
||||
return $ Document {pdfVersion, contents}
|
||||
|
||||
findNextContentSection :: Int64 -> ByteString -> Int64
|
||||
findNextContentSection startXref input =
|
||||
|
||||
iterateContents :: Int64 -> ByteString -> Either ParseError [Content]
|
||||
iterateContents startXref input =
|
||||
parse content ('@':show startXref) (BS.drop startXref input) >>= stopOrFollow
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
{-# LANGUAGE NamedFieldPuns #-}
|
||||
module PDF.Body where
|
||||
module PDF.Body (
|
||||
populate
|
||||
) where
|
||||
|
||||
import Data.ByteString.Lazy.Char8 (ByteString)
|
||||
import qualified Data.ByteString.Lazy.Char8 as BS (drop, pack)
|
||||
|
|
|
@ -78,7 +78,8 @@ data XRefSubSection = XRefSubSection {
|
|||
type XRefSection = [XRefSubSection]
|
||||
|
||||
data Content = Content {
|
||||
body :: [Occurrence]
|
||||
startOffset :: Int64
|
||||
, body :: [Occurrence]
|
||||
, objects :: Map Int Object
|
||||
, xrefSection :: XRefSection
|
||||
, trailer :: Dictionary
|
||||
|
|
Loading…
Reference in a new issue