Boyer-Moore is canceled, implement the rest of parsing with naive search

This commit is contained in:
Tissevert 2019-05-16 11:01:50 +02:00
parent fc41f815a3
commit 9b2f890227
4 changed files with 39 additions and 59 deletions

View file

@ -1,15 +1,12 @@
{-# LANGUAGE NamedFieldPuns #-}
module Data.ByteString.Lazy.Char8.Util (
previous
, search
, subBS
) where
import Data.ByteString.Lazy.Char8 (ByteString)
import qualified Data.ByteString.Lazy.Char8 as BS (drop, index, isPrefixOf, length, tails, take, uncons)
import qualified Data.ByteString.Lazy.Char8 as BS (drop, index, take)
import Data.Int (Int64)
import Data.Map (Map)
import qualified Data.Map as Map (empty, fromList, insert, lookup, member)
import Prelude hiding (length)
previous :: Char -> Int64 -> ByteString -> Int64
@ -17,44 +14,5 @@ previous char position byteString
| BS.index byteString position == char = position
| otherwise = previous char (position - 1) byteString
data BMTable = BMTable {
length :: Int64
, offsets :: Map Int64 (Map Char Int64)
} deriving Show
prepare :: ByteString -> BMTable
prepare needle =
let length = BS.length needle in
let offsets = Map.fromList $ generateSuffixOffsets <$> [0..length - 1] in
BMTable {length, offsets}
where
generateSuffixOffsets l =
let suffix = BS.drop l needle in
let prefixes = fmap (l -) <$> zip (BS.tails needle) [0..l] in
(l, foldl (addOffset suffix) Map.empty prefixes)
addOffset suffix tmpMap ((aSuffix, delta)) = maybe tmpMap id $ do
(initial, rest) <- BS.uncons aSuffix
if BS.isPrefixOf suffix rest && not (Map.member initial tmpMap)
then return (Map.insert initial delta tmpMap)
else Nothing
jump :: BMTable -> Int64 -> Char -> Int64
jump (BMTable {length, offsets}) index char =
maybe length (maybe length id . Map.lookup char) (Map.lookup index offsets)
search :: ByteString -> ByteString -> Maybe Int64
search needle = boyerMoore 0
where
table = prepare needle
tryMatch offset haystack n
| n < 0 = Just offset
| BS.index haystack n == BS.index needle n = tryMatch offset haystack (n-1)
| otherwise =
let delta = jump table n (BS.index haystack n) in
boyerMoore (offset + delta) (BS.drop delta haystack)
boyerMoore offset haystack
| BS.length haystack < 1 = Nothing
| otherwise = tryMatch offset haystack (BS.length needle - 1)
subBS :: Int64 -> Int64 -> ByteString -> ByteString
subBS offset length = BS.take length . BS.drop offset

View file

@ -8,7 +8,9 @@ module PDF (
) where
import Data.ByteString.Lazy.Char8 (ByteString)
import qualified Data.ByteString.Lazy.Char8 as BS (drop, isPrefixOf, last, length, unpack)
import qualified Data.ByteString.Lazy.Char8 as BS (
drop, findIndex, head, isPrefixOf, last, length, span, unpack
)
import Data.ByteString.Lazy.Char8.Util (previous, subBS)
import Data.Int (Int64)
import qualified Data.Map as Map (lookup)
@ -57,16 +59,24 @@ readStartXref eolStyle input =
previous eolLastByte (eofMarkerPosition - eolOffset - 1) input + 1
startXrefLength = eofMarkerPosition - eolOffset - startXrefPosition
parseDocument :: ByteString -> Either ParseError Document
parseDocument input = do
(pdfVersion, eolStyle) <- parse ((,) <$> version <*> eol) "" input
startXref <- readStartXref eolStyle input
structures <- iterateContents startXref input
let contents = populate input <$> structures
return $ Document {pdfVersion, contents}
nextLine :: ByteString -> Int64
nextLine input =
let (line, eolPrefixed) = BS.span notInEol input in
let nextNotInEol = BS.findIndex notInEol eolPrefixed in
BS.length line + (maybe (BS.length eolPrefixed) id nextNotInEol)
where
notInEol = not . (`elem` eolCharset)
findNextContentSection :: Int64 -> ByteString -> Int64
findNextContentSection startXref input =
nextSection :: Int64 -> ByteString -> Int64
nextSection offset input =
case BS.findIndex (== BS.head eofMarker) input of
Nothing -> 0
Just delta ->
let newInput = BS.drop delta input in
let newOffset = offset + delta in
if BS.isPrefixOf eofMarker newInput
then newOffset + nextLine newInput
else nextSection (newOffset + 1) (BS.drop 1 newInput)
iterateContents :: Int64 -> ByteString -> Either ParseError [Content]
iterateContents startXref input =
@ -74,7 +84,17 @@ iterateContents startXref input =
where
stopOrFollow c@(Content {trailer}) =
case Map.lookup "Prev" trailer of
Nothing -> Right [c]
Just (Number f) -> (c:) <$> (iterateContents (truncate f) input)
Nothing -> Right [c {startOffset = nextLine input}]
Just (Number newStartXref) ->
let offset = truncate newStartXref in
let startOffset = nextSection offset (BS.drop offset input) in
(c {startOffset}:) <$> (iterateContents offset input)
Just v -> parseError $ "Bad value for Prev entry in trailer: " ++ show v
parseDocument :: ByteString -> Either ParseError Document
parseDocument input = do
(pdfVersion, eolStyle) <- parse ((,) <$> version <*> eol) "" input
startXref <- readStartXref eolStyle input
structures <- iterateContents startXref input
let contents = populate input <$> structures
return $ Document {pdfVersion, contents}

View file

@ -113,9 +113,12 @@ occurrence = Comment <$> comment <|> Indirect <$> indirectObjCoordinates
populate :: ByteString -> Content -> Content
populate input initialContent =
case runParser recurseOnOccurrences initialState "" input of
let bodyInput = BS.drop (startOffset initialContent) input in
case runParser recurseOnOccurrences initialState "" bodyInput of
Left _ -> initialContent
Right finalState -> content finalState
Right finalState ->
let finalContent = content finalState in
finalContent {body = reverse (body finalContent)}
where
initialState = UserState {input, content = initialContent}

View file

@ -25,7 +25,6 @@ import Data.Int (Int64)
import Data.Map (Map)
import qualified Data.Map as Map (empty, fromList)
import Text.Parsec
--import Text.Parsec.ByteString.Lazy (Parser)
type Parser u = Parsec ByteString u
@ -185,7 +184,7 @@ xrefSubSection = do
content :: Parser u Content
content =
Content [] Map.empty
Content 0 [] Map.empty
<$> (line "xref" *> xrefSubSection `sepBy` eol)
<*> (line "trailer" *> dictionary <* eol)
<*> (line "startxref" *> integer)