Broken state : trying to implement Boyer-Moore for fast-forwarding to the end of a section
This commit is contained in:
parent
379a821550
commit
fc41f815a3
4 changed files with 52 additions and 3 deletions
|
@ -1,11 +1,15 @@
|
||||||
|
{-# LANGUAGE NamedFieldPuns #-}
|
||||||
module Data.ByteString.Lazy.Char8.Util (
|
module Data.ByteString.Lazy.Char8.Util (
|
||||||
previous
|
previous
|
||||||
|
, search
|
||||||
, subBS
|
, subBS
|
||||||
) where
|
) where
|
||||||
|
|
||||||
import Data.ByteString.Lazy.Char8 (ByteString)
|
import Data.ByteString.Lazy.Char8 (ByteString)
|
||||||
import qualified Data.ByteString.Lazy.Char8 as BS (drop, index, take)
|
import qualified Data.ByteString.Lazy.Char8 as BS (drop, index, isPrefixOf, length, tails, take, uncons)
|
||||||
import Data.Int (Int64)
|
import Data.Int (Int64)
|
||||||
|
import Data.Map (Map)
|
||||||
|
import qualified Data.Map as Map (empty, fromList, insert, lookup, member)
|
||||||
import Prelude hiding (length)
|
import Prelude hiding (length)
|
||||||
|
|
||||||
previous :: Char -> Int64 -> ByteString -> Int64
|
previous :: Char -> Int64 -> ByteString -> Int64
|
||||||
|
@ -13,5 +17,44 @@ previous char position byteString
|
||||||
| BS.index byteString position == char = position
|
| BS.index byteString position == char = position
|
||||||
| otherwise = previous char (position - 1) byteString
|
| otherwise = previous char (position - 1) byteString
|
||||||
|
|
||||||
|
data BMTable = BMTable {
|
||||||
|
length :: Int64
|
||||||
|
, offsets :: Map Int64 (Map Char Int64)
|
||||||
|
} deriving Show
|
||||||
|
|
||||||
|
prepare :: ByteString -> BMTable
|
||||||
|
prepare needle =
|
||||||
|
let length = BS.length needle in
|
||||||
|
let offsets = Map.fromList $ generateSuffixOffsets <$> [0..length - 1] in
|
||||||
|
BMTable {length, offsets}
|
||||||
|
where
|
||||||
|
generateSuffixOffsets l =
|
||||||
|
let suffix = BS.drop l needle in
|
||||||
|
let prefixes = fmap (l -) <$> zip (BS.tails needle) [0..l] in
|
||||||
|
(l, foldl (addOffset suffix) Map.empty prefixes)
|
||||||
|
addOffset suffix tmpMap ((aSuffix, delta)) = maybe tmpMap id $ do
|
||||||
|
(initial, rest) <- BS.uncons aSuffix
|
||||||
|
if BS.isPrefixOf suffix rest && not (Map.member initial tmpMap)
|
||||||
|
then return (Map.insert initial delta tmpMap)
|
||||||
|
else Nothing
|
||||||
|
|
||||||
|
jump :: BMTable -> Int64 -> Char -> Int64
|
||||||
|
jump (BMTable {length, offsets}) index char =
|
||||||
|
maybe length (maybe length id . Map.lookup char) (Map.lookup index offsets)
|
||||||
|
|
||||||
|
search :: ByteString -> ByteString -> Maybe Int64
|
||||||
|
search needle = boyerMoore 0
|
||||||
|
where
|
||||||
|
table = prepare needle
|
||||||
|
tryMatch offset haystack n
|
||||||
|
| n < 0 = Just offset
|
||||||
|
| BS.index haystack n == BS.index needle n = tryMatch offset haystack (n-1)
|
||||||
|
| otherwise =
|
||||||
|
let delta = jump table n (BS.index haystack n) in
|
||||||
|
boyerMoore (offset + delta) (BS.drop delta haystack)
|
||||||
|
boyerMoore offset haystack
|
||||||
|
| BS.length haystack < 1 = Nothing
|
||||||
|
| otherwise = tryMatch offset haystack (BS.length needle - 1)
|
||||||
|
|
||||||
subBS :: Int64 -> Int64 -> ByteString -> ByteString
|
subBS :: Int64 -> Int64 -> ByteString -> ByteString
|
||||||
subBS offset length = BS.take length . BS.drop offset
|
subBS offset length = BS.take length . BS.drop offset
|
||||||
|
|
|
@ -65,6 +65,9 @@ parseDocument input = do
|
||||||
let contents = populate input <$> structures
|
let contents = populate input <$> structures
|
||||||
return $ Document {pdfVersion, contents}
|
return $ Document {pdfVersion, contents}
|
||||||
|
|
||||||
|
findNextContentSection :: Int64 -> ByteString -> Int64
|
||||||
|
findNextContentSection startXref input =
|
||||||
|
|
||||||
iterateContents :: Int64 -> ByteString -> Either ParseError [Content]
|
iterateContents :: Int64 -> ByteString -> Either ParseError [Content]
|
||||||
iterateContents startXref input =
|
iterateContents startXref input =
|
||||||
parse content ('@':show startXref) (BS.drop startXref input) >>= stopOrFollow
|
parse content ('@':show startXref) (BS.drop startXref input) >>= stopOrFollow
|
||||||
|
|
|
@ -1,5 +1,7 @@
|
||||||
{-# LANGUAGE NamedFieldPuns #-}
|
{-# LANGUAGE NamedFieldPuns #-}
|
||||||
module PDF.Body where
|
module PDF.Body (
|
||||||
|
populate
|
||||||
|
) where
|
||||||
|
|
||||||
import Data.ByteString.Lazy.Char8 (ByteString)
|
import Data.ByteString.Lazy.Char8 (ByteString)
|
||||||
import qualified Data.ByteString.Lazy.Char8 as BS (drop, pack)
|
import qualified Data.ByteString.Lazy.Char8 as BS (drop, pack)
|
||||||
|
|
|
@ -78,7 +78,8 @@ data XRefSubSection = XRefSubSection {
|
||||||
type XRefSection = [XRefSubSection]
|
type XRefSection = [XRefSubSection]
|
||||||
|
|
||||||
data Content = Content {
|
data Content = Content {
|
||||||
body :: [Occurrence]
|
startOffset :: Int64
|
||||||
|
, body :: [Occurrence]
|
||||||
, objects :: Map Int Object
|
, objects :: Map Int Object
|
||||||
, xrefSection :: XRefSection
|
, xrefSection :: XRefSection
|
||||||
, trailer :: Dictionary
|
, trailer :: Dictionary
|
||||||
|
|
Loading…
Reference in a new issue