From 9b2f89022722061920d73a0e55be837518e287da Mon Sep 17 00:00:00 2001 From: Tissevert Date: Thu, 16 May 2019 11:01:50 +0200 Subject: [PATCH] Boyer-Moore is canceled, implement the rest of parsing with naive search --- src/Data/ByteString/Lazy/Char8/Util.hs | 44 +------------------------- src/PDF.hs | 44 +++++++++++++++++++------- src/PDF/Body.hs | 7 ++-- src/PDF/Object.hs | 3 +- 4 files changed, 39 insertions(+), 59 deletions(-) diff --git a/src/Data/ByteString/Lazy/Char8/Util.hs b/src/Data/ByteString/Lazy/Char8/Util.hs index b392cb4..5686bf1 100644 --- a/src/Data/ByteString/Lazy/Char8/Util.hs +++ b/src/Data/ByteString/Lazy/Char8/Util.hs @@ -1,15 +1,12 @@ {-# LANGUAGE NamedFieldPuns #-} module Data.ByteString.Lazy.Char8.Util ( previous - , search , subBS ) where import Data.ByteString.Lazy.Char8 (ByteString) -import qualified Data.ByteString.Lazy.Char8 as BS (drop, index, isPrefixOf, length, tails, take, uncons) +import qualified Data.ByteString.Lazy.Char8 as BS (drop, index, take) import Data.Int (Int64) -import Data.Map (Map) -import qualified Data.Map as Map (empty, fromList, insert, lookup, member) import Prelude hiding (length) previous :: Char -> Int64 -> ByteString -> Int64 @@ -17,44 +14,5 @@ previous char position byteString | BS.index byteString position == char = position | otherwise = previous char (position - 1) byteString -data BMTable = BMTable { - length :: Int64 - , offsets :: Map Int64 (Map Char Int64) - } deriving Show - -prepare :: ByteString -> BMTable -prepare needle = - let length = BS.length needle in - let offsets = Map.fromList $ generateSuffixOffsets <$> [0..length - 1] in - BMTable {length, offsets} - where - generateSuffixOffsets l = - let suffix = BS.drop l needle in - let prefixes = fmap (l -) <$> zip (BS.tails needle) [0..l] in - (l, foldl (addOffset suffix) Map.empty prefixes) - addOffset suffix tmpMap ((aSuffix, delta)) = maybe tmpMap id $ do - (initial, rest) <- BS.uncons aSuffix - if BS.isPrefixOf suffix rest && not (Map.member initial tmpMap) - then return (Map.insert initial delta tmpMap) - else Nothing - -jump :: BMTable -> Int64 -> Char -> Int64 -jump (BMTable {length, offsets}) index char = - maybe length (maybe length id . Map.lookup char) (Map.lookup index offsets) - -search :: ByteString -> ByteString -> Maybe Int64 -search needle = boyerMoore 0 - where - table = prepare needle - tryMatch offset haystack n - | n < 0 = Just offset - | BS.index haystack n == BS.index needle n = tryMatch offset haystack (n-1) - | otherwise = - let delta = jump table n (BS.index haystack n) in - boyerMoore (offset + delta) (BS.drop delta haystack) - boyerMoore offset haystack - | BS.length haystack < 1 = Nothing - | otherwise = tryMatch offset haystack (BS.length needle - 1) - subBS :: Int64 -> Int64 -> ByteString -> ByteString subBS offset length = BS.take length . BS.drop offset diff --git a/src/PDF.hs b/src/PDF.hs index 2e056bf..ec68971 100644 --- a/src/PDF.hs +++ b/src/PDF.hs @@ -8,7 +8,9 @@ module PDF ( ) where import Data.ByteString.Lazy.Char8 (ByteString) -import qualified Data.ByteString.Lazy.Char8 as BS (drop, isPrefixOf, last, length, unpack) +import qualified Data.ByteString.Lazy.Char8 as BS ( + drop, findIndex, head, isPrefixOf, last, length, span, unpack + ) import Data.ByteString.Lazy.Char8.Util (previous, subBS) import Data.Int (Int64) import qualified Data.Map as Map (lookup) @@ -57,16 +59,24 @@ readStartXref eolStyle input = previous eolLastByte (eofMarkerPosition - eolOffset - 1) input + 1 startXrefLength = eofMarkerPosition - eolOffset - startXrefPosition -parseDocument :: ByteString -> Either ParseError Document -parseDocument input = do - (pdfVersion, eolStyle) <- parse ((,) <$> version <*> eol) "" input - startXref <- readStartXref eolStyle input - structures <- iterateContents startXref input - let contents = populate input <$> structures - return $ Document {pdfVersion, contents} +nextLine :: ByteString -> Int64 +nextLine input = + let (line, eolPrefixed) = BS.span notInEol input in + let nextNotInEol = BS.findIndex notInEol eolPrefixed in + BS.length line + (maybe (BS.length eolPrefixed) id nextNotInEol) + where + notInEol = not . (`elem` eolCharset) -findNextContentSection :: Int64 -> ByteString -> Int64 -findNextContentSection startXref input = +nextSection :: Int64 -> ByteString -> Int64 +nextSection offset input = + case BS.findIndex (== BS.head eofMarker) input of + Nothing -> 0 + Just delta -> + let newInput = BS.drop delta input in + let newOffset = offset + delta in + if BS.isPrefixOf eofMarker newInput + then newOffset + nextLine newInput + else nextSection (newOffset + 1) (BS.drop 1 newInput) iterateContents :: Int64 -> ByteString -> Either ParseError [Content] iterateContents startXref input = @@ -74,7 +84,17 @@ iterateContents startXref input = where stopOrFollow c@(Content {trailer}) = case Map.lookup "Prev" trailer of - Nothing -> Right [c] - Just (Number f) -> (c:) <$> (iterateContents (truncate f) input) + Nothing -> Right [c {startOffset = nextLine input}] + Just (Number newStartXref) -> + let offset = truncate newStartXref in + let startOffset = nextSection offset (BS.drop offset input) in + (c {startOffset}:) <$> (iterateContents offset input) Just v -> parseError $ "Bad value for Prev entry in trailer: " ++ show v +parseDocument :: ByteString -> Either ParseError Document +parseDocument input = do + (pdfVersion, eolStyle) <- parse ((,) <$> version <*> eol) "" input + startXref <- readStartXref eolStyle input + structures <- iterateContents startXref input + let contents = populate input <$> structures + return $ Document {pdfVersion, contents} diff --git a/src/PDF/Body.hs b/src/PDF/Body.hs index d0f6554..cf8b836 100644 --- a/src/PDF/Body.hs +++ b/src/PDF/Body.hs @@ -113,9 +113,12 @@ occurrence = Comment <$> comment <|> Indirect <$> indirectObjCoordinates populate :: ByteString -> Content -> Content populate input initialContent = - case runParser recurseOnOccurrences initialState "" input of + let bodyInput = BS.drop (startOffset initialContent) input in + case runParser recurseOnOccurrences initialState "" bodyInput of Left _ -> initialContent - Right finalState -> content finalState + Right finalState -> + let finalContent = content finalState in + finalContent {body = reverse (body finalContent)} where initialState = UserState {input, content = initialContent} diff --git a/src/PDF/Object.hs b/src/PDF/Object.hs index 549881d..5d5d5f8 100644 --- a/src/PDF/Object.hs +++ b/src/PDF/Object.hs @@ -25,7 +25,6 @@ import Data.Int (Int64) import Data.Map (Map) import qualified Data.Map as Map (empty, fromList) import Text.Parsec ---import Text.Parsec.ByteString.Lazy (Parser) type Parser u = Parsec ByteString u @@ -185,7 +184,7 @@ xrefSubSection = do content :: Parser u Content content = - Content [] Map.empty + Content 0 [] Map.empty <$> (line "xref" *> xrefSubSection `sepBy` eol) <*> (line "trailer" *> dictionary <* eol) <*> (line "startxref" *> integer)