Boyer-Moore is canceled, implement the rest of parsing with naive search

This commit is contained in:
Tissevert 2019-05-16 11:01:50 +02:00
parent fc41f815a3
commit 9b2f890227
4 changed files with 39 additions and 59 deletions

View file

@ -1,15 +1,12 @@
{-# LANGUAGE NamedFieldPuns #-} {-# LANGUAGE NamedFieldPuns #-}
module Data.ByteString.Lazy.Char8.Util ( module Data.ByteString.Lazy.Char8.Util (
previous previous
, search
, subBS , subBS
) where ) where
import Data.ByteString.Lazy.Char8 (ByteString) import Data.ByteString.Lazy.Char8 (ByteString)
import qualified Data.ByteString.Lazy.Char8 as BS (drop, index, isPrefixOf, length, tails, take, uncons) import qualified Data.ByteString.Lazy.Char8 as BS (drop, index, take)
import Data.Int (Int64) import Data.Int (Int64)
import Data.Map (Map)
import qualified Data.Map as Map (empty, fromList, insert, lookup, member)
import Prelude hiding (length) import Prelude hiding (length)
previous :: Char -> Int64 -> ByteString -> Int64 previous :: Char -> Int64 -> ByteString -> Int64
@ -17,44 +14,5 @@ previous char position byteString
| BS.index byteString position == char = position | BS.index byteString position == char = position
| otherwise = previous char (position - 1) byteString | otherwise = previous char (position - 1) byteString
data BMTable = BMTable {
length :: Int64
, offsets :: Map Int64 (Map Char Int64)
} deriving Show
prepare :: ByteString -> BMTable
prepare needle =
let length = BS.length needle in
let offsets = Map.fromList $ generateSuffixOffsets <$> [0..length - 1] in
BMTable {length, offsets}
where
generateSuffixOffsets l =
let suffix = BS.drop l needle in
let prefixes = fmap (l -) <$> zip (BS.tails needle) [0..l] in
(l, foldl (addOffset suffix) Map.empty prefixes)
addOffset suffix tmpMap ((aSuffix, delta)) = maybe tmpMap id $ do
(initial, rest) <- BS.uncons aSuffix
if BS.isPrefixOf suffix rest && not (Map.member initial tmpMap)
then return (Map.insert initial delta tmpMap)
else Nothing
jump :: BMTable -> Int64 -> Char -> Int64
jump (BMTable {length, offsets}) index char =
maybe length (maybe length id . Map.lookup char) (Map.lookup index offsets)
search :: ByteString -> ByteString -> Maybe Int64
search needle = boyerMoore 0
where
table = prepare needle
tryMatch offset haystack n
| n < 0 = Just offset
| BS.index haystack n == BS.index needle n = tryMatch offset haystack (n-1)
| otherwise =
let delta = jump table n (BS.index haystack n) in
boyerMoore (offset + delta) (BS.drop delta haystack)
boyerMoore offset haystack
| BS.length haystack < 1 = Nothing
| otherwise = tryMatch offset haystack (BS.length needle - 1)
subBS :: Int64 -> Int64 -> ByteString -> ByteString subBS :: Int64 -> Int64 -> ByteString -> ByteString
subBS offset length = BS.take length . BS.drop offset subBS offset length = BS.take length . BS.drop offset

View file

@ -8,7 +8,9 @@ module PDF (
) where ) where
import Data.ByteString.Lazy.Char8 (ByteString) import Data.ByteString.Lazy.Char8 (ByteString)
import qualified Data.ByteString.Lazy.Char8 as BS (drop, isPrefixOf, last, length, unpack) import qualified Data.ByteString.Lazy.Char8 as BS (
drop, findIndex, head, isPrefixOf, last, length, span, unpack
)
import Data.ByteString.Lazy.Char8.Util (previous, subBS) import Data.ByteString.Lazy.Char8.Util (previous, subBS)
import Data.Int (Int64) import Data.Int (Int64)
import qualified Data.Map as Map (lookup) import qualified Data.Map as Map (lookup)
@ -57,16 +59,24 @@ readStartXref eolStyle input =
previous eolLastByte (eofMarkerPosition - eolOffset - 1) input + 1 previous eolLastByte (eofMarkerPosition - eolOffset - 1) input + 1
startXrefLength = eofMarkerPosition - eolOffset - startXrefPosition startXrefLength = eofMarkerPosition - eolOffset - startXrefPosition
parseDocument :: ByteString -> Either ParseError Document nextLine :: ByteString -> Int64
parseDocument input = do nextLine input =
(pdfVersion, eolStyle) <- parse ((,) <$> version <*> eol) "" input let (line, eolPrefixed) = BS.span notInEol input in
startXref <- readStartXref eolStyle input let nextNotInEol = BS.findIndex notInEol eolPrefixed in
structures <- iterateContents startXref input BS.length line + (maybe (BS.length eolPrefixed) id nextNotInEol)
let contents = populate input <$> structures where
return $ Document {pdfVersion, contents} notInEol = not . (`elem` eolCharset)
findNextContentSection :: Int64 -> ByteString -> Int64 nextSection :: Int64 -> ByteString -> Int64
findNextContentSection startXref input = nextSection offset input =
case BS.findIndex (== BS.head eofMarker) input of
Nothing -> 0
Just delta ->
let newInput = BS.drop delta input in
let newOffset = offset + delta in
if BS.isPrefixOf eofMarker newInput
then newOffset + nextLine newInput
else nextSection (newOffset + 1) (BS.drop 1 newInput)
iterateContents :: Int64 -> ByteString -> Either ParseError [Content] iterateContents :: Int64 -> ByteString -> Either ParseError [Content]
iterateContents startXref input = iterateContents startXref input =
@ -74,7 +84,17 @@ iterateContents startXref input =
where where
stopOrFollow c@(Content {trailer}) = stopOrFollow c@(Content {trailer}) =
case Map.lookup "Prev" trailer of case Map.lookup "Prev" trailer of
Nothing -> Right [c] Nothing -> Right [c {startOffset = nextLine input}]
Just (Number f) -> (c:) <$> (iterateContents (truncate f) input) Just (Number newStartXref) ->
let offset = truncate newStartXref in
let startOffset = nextSection offset (BS.drop offset input) in
(c {startOffset}:) <$> (iterateContents offset input)
Just v -> parseError $ "Bad value for Prev entry in trailer: " ++ show v Just v -> parseError $ "Bad value for Prev entry in trailer: " ++ show v
parseDocument :: ByteString -> Either ParseError Document
parseDocument input = do
(pdfVersion, eolStyle) <- parse ((,) <$> version <*> eol) "" input
startXref <- readStartXref eolStyle input
structures <- iterateContents startXref input
let contents = populate input <$> structures
return $ Document {pdfVersion, contents}

View file

@ -113,9 +113,12 @@ occurrence = Comment <$> comment <|> Indirect <$> indirectObjCoordinates
populate :: ByteString -> Content -> Content populate :: ByteString -> Content -> Content
populate input initialContent = populate input initialContent =
case runParser recurseOnOccurrences initialState "" input of let bodyInput = BS.drop (startOffset initialContent) input in
case runParser recurseOnOccurrences initialState "" bodyInput of
Left _ -> initialContent Left _ -> initialContent
Right finalState -> content finalState Right finalState ->
let finalContent = content finalState in
finalContent {body = reverse (body finalContent)}
where where
initialState = UserState {input, content = initialContent} initialState = UserState {input, content = initialContent}

View file

@ -25,7 +25,6 @@ import Data.Int (Int64)
import Data.Map (Map) import Data.Map (Map)
import qualified Data.Map as Map (empty, fromList) import qualified Data.Map as Map (empty, fromList)
import Text.Parsec import Text.Parsec
--import Text.Parsec.ByteString.Lazy (Parser)
type Parser u = Parsec ByteString u type Parser u = Parsec ByteString u
@ -185,7 +184,7 @@ xrefSubSection = do
content :: Parser u Content content :: Parser u Content
content = content =
Content [] Map.empty Content 0 [] Map.empty
<$> (line "xref" *> xrefSubSection `sepBy` eol) <$> (line "xref" *> xrefSubSection `sepBy` eol)
<*> (line "trailer" *> dictionary <* eol) <*> (line "trailer" *> dictionary <* eol)
<*> (line "startxref" *> integer) <*> (line "startxref" *> integer)