Boyer-Moore is canceled, implement the rest of parsing with naive search

2019-05-16 11:01:50 +02:00 · 2019-05-16 11:01:50 +02:00 · 9b2f890227
commit 9b2f890227
parent fc41f815a3
4 changed files with 39 additions and 59 deletions
--- a/src/Data/ByteString/Lazy/Char8/Util.hs
+++ b/src/Data/ByteString/Lazy/Char8/Util.hs
@ -1,15 +1,12 @@
 {-# LANGUAGE NamedFieldPuns #-}
 module Data.ByteString.Lazy.Char8.Util (
      previous
    , search
    , subBS
  ) where
 import Data.ByteString.Lazy.Char8 (ByteString)
-import qualified Data.ByteString.Lazy.Char8 as BS (drop, index, isPrefixOf, length, tails, take, uncons)
+import qualified Data.ByteString.Lazy.Char8 as BS (drop, index, take)
 import Data.Int (Int64)
 import Data.Map (Map)
 import qualified Data.Map as Map (empty, fromList, insert, lookup, member)
 import Prelude hiding (length)
 previous :: Char -> Int64 -> ByteString -> Int64
@ -17,44 +14,5 @@ previous char position byteString
  | BS.index byteString position == char = position
  | otherwise = previous char (position - 1) byteString
 -- | Precomputed shift table for a search needle, as built by 'prepare'.
 data BMTable = BMTable
  { length :: Int64
    -- ^ Number of characters in the needle; 'jump' falls back to it when
    -- no specific shift is recorded.
  , offsets :: Map Int64 (Map Char Int64)
    -- ^ For each needle index, the shift recorded for each character.
  }
  deriving Show
 -- | Build the shift table for a needle.
 --
 -- For every index @l@ of the needle, the table maps a character @c@ to
 -- @l - i@, where @i@ is the first position (scanning from the start, up to
 -- @l@) such that the needle holds @c@ at @i@ and the text following
 -- position @i@ starts with the needle's suffix beginning at @l@.
 prepare :: ByteString -> BMTable
 prepare needle = BMTable {length = needleLength, offsets = Map.fromList rows}
  where
    needleLength = BS.length needle
    -- One row per needle index.
    rows = [(l, offsetsAt l) | l <- [0 .. needleLength - 1]]
    offsetsAt l =
      let suffix = BS.drop l needle
          candidates =
            [(start, l - i) | (start, i) <- zip (BS.tails needle) [0 .. l]]
      in foldl (register suffix) Map.empty candidates
    -- Only the first candidate seen for a given character is kept.
    register suffix known (candidate, delta) =
      case BS.uncons candidate of
        Just (initial, rest)
          | suffix `BS.isPrefixOf` rest && not (Map.member initial known) ->
            Map.insert initial delta known
        _ -> known
 -- | Shift recorded in the table for a character at a given needle index.
 --
 -- Falls back to the needle length stored in the table when either the
 -- index or the character has no entry.
 jump :: BMTable -> Int64 -> Char -> Int64
 jump table index char =
  case Map.lookup index (offsets table) >>= Map.lookup char of
    Just delta -> delta
    Nothing -> length table
 -- | Find the first occurrence of a needle in a haystack.
 --
 -- Returns @Just offset@ with the position (counted from the start of the
 -- haystack) of the first match, or @Nothing@ when the needle does not
 -- occur. An empty needle matches at offset 0.
 --
 -- This is a plain left-to-right scan. The previous table-driven version
 -- indexed the haystack at @length needle - 1@ without checking that enough
 -- input remained (crashing on a short tail) and could shift past a real
 -- match, e.g. looking for @"abc"@ in @"xxabc"@.
 search :: ByteString -> ByteString -> Maybe Int64
 search needle = scan 0
  where
    scan offset haystack
      | needle `BS.isPrefixOf` haystack = Just offset
      | otherwise =
        case BS.uncons haystack of
          -- Haystack exhausted without a match.
          Nothing -> Nothing
          -- Force the counter so long scans do not pile up thunks.
          Just (_, rest) -> (scan $! offset + 1) rest
 -- | Extract the chunk of at most @length@ characters that starts @offset@
 -- characters into the given string.
 subBS :: Int64 -> Int64 -> ByteString -> ByteString
 subBS offset length byteString = BS.take length (BS.drop offset byteString)
--- a/src/PDF.hs
+++ b/src/PDF.hs
@ -8,7 +8,9 @@ module PDF (
  ) where
 import Data.ByteString.Lazy.Char8 (ByteString)
-import qualified Data.ByteString.Lazy.Char8 as BS (drop, isPrefixOf, last, length, unpack)
+import qualified Data.ByteString.Lazy.Char8 as BS (
    drop, findIndex, head, isPrefixOf, last, length, span, unpack
  )
 import Data.ByteString.Lazy.Char8.Util (previous, subBS)
 import Data.Int (Int64)
 import qualified Data.Map as Map (lookup)
@ -57,16 +59,24 @@ readStartXref eolStyle input =
      previous eolLastByte (eofMarkerPosition - eolOffset - 1) input + 1
    startXrefLength = eofMarkerPosition - eolOffset - startXrefPosition
-parseDocument :: ByteString -> Either ParseError Document
+nextLine :: ByteString -> Int64
-parseDocument input = do
+nextLine input =
-  (pdfVersion, eolStyle) <- parse ((,) <$> version <*> eol) "" input
+  let (line, eolPrefixed) = BS.span notInEol input in
-  startXref <- readStartXref eolStyle input
+  let nextNotInEol = BS.findIndex notInEol eolPrefixed in
-  structures <- iterateContents startXref input
+  BS.length line + (maybe (BS.length eolPrefixed) id nextNotInEol)
-  let contents = populate input <$> structures
+  where
-  return $ Document {pdfVersion, contents}
+    notInEol = not . (`elem` eolCharset)
-findNextContentSection :: Int64 -> ByteString -> Int64
+nextSection :: Int64 -> ByteString -> Int64
-findNextContentSection startXref input =
+nextSection offset input =
  case BS.findIndex (== BS.head eofMarker) input of
    Nothing -> 0
    Just delta ->
      let newInput = BS.drop delta input in
      let newOffset = offset + delta in
      if BS.isPrefixOf eofMarker newInput
      then newOffset + nextLine newInput
      else nextSection (newOffset + 1) (BS.drop 1 newInput)
 iterateContents :: Int64 -> ByteString -> Either ParseError [Content]
 iterateContents startXref input =
@ -74,7 +84,17 @@ iterateContents startXref input =
  where
    stopOrFollow c@(Content {trailer}) =
      case Map.lookup "Prev" trailer of
-        Nothing -> Right [c]
+        Nothing -> Right [c {startOffset = nextLine input}]
-        Just (Number f) -> (c:) <$> (iterateContents (truncate f) input)
+        Just (Number newStartXref) ->
          let offset = truncate newStartXref in
          let startOffset = nextSection offset (BS.drop offset input) in
          (c {startOffset}:) <$> (iterateContents offset input)
        Just v -> parseError $ "Bad value for Prev entry in trailer: " ++ show v
 -- | Parse a whole document from its raw bytes.
 --
 -- Reads the version and end-of-line style from the start of the input,
 -- resolves the @startxref@ position with 'readStartXref', collects the
 -- content structures with 'iterateContents' and fills each of them in
 -- with 'populate'. Any failing step aborts with its 'ParseError'.
 parseDocument :: ByteString -> Either ParseError Document
 parseDocument input = do
  (pdfVersion, eolStyle) <- parse versionAndEol "" input
  structures <- readStartXref eolStyle input >>= \startXref ->
    iterateContents startXref input
  return $ Document {pdfVersion, contents = map (populate input) structures}
  where
    versionAndEol = (,) <$> version <*> eol
--- a/src/PDF/Body.hs
+++ b/src/PDF/Body.hs
@ -113,9 +113,12 @@ occurrence = Comment <$> comment <|> Indirect <$> indirectObjCoordinates
 populate :: ByteString -> Content -> Content
 populate input initialContent =
-  case runParser recurseOnOccurrences initialState "" input of
+  let bodyInput = BS.drop (startOffset initialContent) input in
  case runParser recurseOnOccurrences initialState "" bodyInput of
    Left _ -> initialContent
-    Right finalState -> content finalState
+    Right finalState ->
      let finalContent = content finalState in
      finalContent {body = reverse (body finalContent)}
  where
    initialState = UserState {input, content = initialContent}
--- a/src/PDF/Object.hs
+++ b/src/PDF/Object.hs
@ -25,7 +25,6 @@ import Data.Int (Int64)
 import Data.Map (Map)
 import qualified Data.Map as Map (empty, fromList)
 import Text.Parsec
 --import Text.Parsec.ByteString.Lazy (Parser)
 type Parser u = Parsec ByteString u
@ -185,7 +184,7 @@ xrefSubSection = do
 content :: Parser u Content
 content =
-    Content [] Map.empty
+    Content 0 [] Map.empty
  <$> (line "xref" *> xrefSubSection `sepBy` eol)
  <*> (line "trailer" *> dictionary <* eol)
  <*> (line "startxref" *> integer)