Boyer-Moore is canceled, implement the rest of parsing with naive search
This commit is contained in:
parent
fc41f815a3
commit
9b2f890227
4 changed files with 39 additions and 59 deletions
|
@ -1,15 +1,12 @@
|
|||
{-# LANGUAGE NamedFieldPuns #-}
|
||||
module Data.ByteString.Lazy.Char8.Util (
|
||||
previous
|
||||
, search
|
||||
, subBS
|
||||
) where
|
||||
|
||||
import Data.ByteString.Lazy.Char8 (ByteString)
|
||||
import qualified Data.ByteString.Lazy.Char8 as BS (drop, index, isPrefixOf, length, tails, take, uncons)
|
||||
import qualified Data.ByteString.Lazy.Char8 as BS (drop, index, take)
|
||||
import Data.Int (Int64)
|
||||
import Data.Map (Map)
|
||||
import qualified Data.Map as Map (empty, fromList, insert, lookup, member)
|
||||
import Prelude hiding (length)
|
||||
|
||||
previous :: Char -> Int64 -> ByteString -> Int64
|
||||
|
@ -17,44 +14,5 @@ previous char position byteString
|
|||
| BS.index byteString position == char = position
|
||||
| otherwise = previous char (position - 1) byteString
|
||||
|
||||
data BMTable = BMTable {
|
||||
length :: Int64
|
||||
, offsets :: Map Int64 (Map Char Int64)
|
||||
} deriving Show
|
||||
|
||||
prepare :: ByteString -> BMTable
|
||||
prepare needle =
|
||||
let length = BS.length needle in
|
||||
let offsets = Map.fromList $ generateSuffixOffsets <$> [0..length - 1] in
|
||||
BMTable {length, offsets}
|
||||
where
|
||||
generateSuffixOffsets l =
|
||||
let suffix = BS.drop l needle in
|
||||
let prefixes = fmap (l -) <$> zip (BS.tails needle) [0..l] in
|
||||
(l, foldl (addOffset suffix) Map.empty prefixes)
|
||||
addOffset suffix tmpMap ((aSuffix, delta)) = maybe tmpMap id $ do
|
||||
(initial, rest) <- BS.uncons aSuffix
|
||||
if BS.isPrefixOf suffix rest && not (Map.member initial tmpMap)
|
||||
then return (Map.insert initial delta tmpMap)
|
||||
else Nothing
|
||||
|
||||
jump :: BMTable -> Int64 -> Char -> Int64
|
||||
jump (BMTable {length, offsets}) index char =
|
||||
maybe length (maybe length id . Map.lookup char) (Map.lookup index offsets)
|
||||
|
||||
search :: ByteString -> ByteString -> Maybe Int64
|
||||
search needle = boyerMoore 0
|
||||
where
|
||||
table = prepare needle
|
||||
tryMatch offset haystack n
|
||||
| n < 0 = Just offset
|
||||
| BS.index haystack n == BS.index needle n = tryMatch offset haystack (n-1)
|
||||
| otherwise =
|
||||
let delta = jump table n (BS.index haystack n) in
|
||||
boyerMoore (offset + delta) (BS.drop delta haystack)
|
||||
boyerMoore offset haystack
|
||||
| BS.length haystack < 1 = Nothing
|
||||
| otherwise = tryMatch offset haystack (BS.length needle - 1)
|
||||
|
||||
subBS :: Int64 -> Int64 -> ByteString -> ByteString
|
||||
subBS offset length = BS.take length . BS.drop offset
|
||||
|
|
44
src/PDF.hs
44
src/PDF.hs
|
@ -8,7 +8,9 @@ module PDF (
|
|||
) where
|
||||
|
||||
import Data.ByteString.Lazy.Char8 (ByteString)
|
||||
import qualified Data.ByteString.Lazy.Char8 as BS (drop, isPrefixOf, last, length, unpack)
|
||||
import qualified Data.ByteString.Lazy.Char8 as BS (
|
||||
drop, findIndex, head, isPrefixOf, last, length, span, unpack
|
||||
)
|
||||
import Data.ByteString.Lazy.Char8.Util (previous, subBS)
|
||||
import Data.Int (Int64)
|
||||
import qualified Data.Map as Map (lookup)
|
||||
|
@ -57,16 +59,24 @@ readStartXref eolStyle input =
|
|||
previous eolLastByte (eofMarkerPosition - eolOffset - 1) input + 1
|
||||
startXrefLength = eofMarkerPosition - eolOffset - startXrefPosition
|
||||
|
||||
parseDocument :: ByteString -> Either ParseError Document
|
||||
parseDocument input = do
|
||||
(pdfVersion, eolStyle) <- parse ((,) <$> version <*> eol) "" input
|
||||
startXref <- readStartXref eolStyle input
|
||||
structures <- iterateContents startXref input
|
||||
let contents = populate input <$> structures
|
||||
return $ Document {pdfVersion, contents}
|
||||
nextLine :: ByteString -> Int64
|
||||
nextLine input =
|
||||
let (line, eolPrefixed) = BS.span notInEol input in
|
||||
let nextNotInEol = BS.findIndex notInEol eolPrefixed in
|
||||
BS.length line + (maybe (BS.length eolPrefixed) id nextNotInEol)
|
||||
where
|
||||
notInEol = not . (`elem` eolCharset)
|
||||
|
||||
findNextContentSection :: Int64 -> ByteString -> Int64
|
||||
findNextContentSection startXref input =
|
||||
nextSection :: Int64 -> ByteString -> Int64
|
||||
nextSection offset input =
|
||||
case BS.findIndex (== BS.head eofMarker) input of
|
||||
Nothing -> 0
|
||||
Just delta ->
|
||||
let newInput = BS.drop delta input in
|
||||
let newOffset = offset + delta in
|
||||
if BS.isPrefixOf eofMarker newInput
|
||||
then newOffset + nextLine newInput
|
||||
else nextSection (newOffset + 1) (BS.drop 1 newInput)
|
||||
|
||||
iterateContents :: Int64 -> ByteString -> Either ParseError [Content]
|
||||
iterateContents startXref input =
|
||||
|
@ -74,7 +84,17 @@ iterateContents startXref input =
|
|||
where
|
||||
stopOrFollow c@(Content {trailer}) =
|
||||
case Map.lookup "Prev" trailer of
|
||||
Nothing -> Right [c]
|
||||
Just (Number f) -> (c:) <$> (iterateContents (truncate f) input)
|
||||
Nothing -> Right [c {startOffset = nextLine input}]
|
||||
Just (Number newStartXref) ->
|
||||
let offset = truncate newStartXref in
|
||||
let startOffset = nextSection offset (BS.drop offset input) in
|
||||
(c {startOffset}:) <$> (iterateContents offset input)
|
||||
Just v -> parseError $ "Bad value for Prev entry in trailer: " ++ show v
|
||||
|
||||
parseDocument :: ByteString -> Either ParseError Document
|
||||
parseDocument input = do
|
||||
(pdfVersion, eolStyle) <- parse ((,) <$> version <*> eol) "" input
|
||||
startXref <- readStartXref eolStyle input
|
||||
structures <- iterateContents startXref input
|
||||
let contents = populate input <$> structures
|
||||
return $ Document {pdfVersion, contents}
|
||||
|
|
|
@ -113,9 +113,12 @@ occurrence = Comment <$> comment <|> Indirect <$> indirectObjCoordinates
|
|||
|
||||
populate :: ByteString -> Content -> Content
|
||||
populate input initialContent =
|
||||
case runParser recurseOnOccurrences initialState "" input of
|
||||
let bodyInput = BS.drop (startOffset initialContent) input in
|
||||
case runParser recurseOnOccurrences initialState "" bodyInput of
|
||||
Left _ -> initialContent
|
||||
Right finalState -> content finalState
|
||||
Right finalState ->
|
||||
let finalContent = content finalState in
|
||||
finalContent {body = reverse (body finalContent)}
|
||||
where
|
||||
initialState = UserState {input, content = initialContent}
|
||||
|
||||
|
|
|
@ -25,7 +25,6 @@ import Data.Int (Int64)
|
|||
import Data.Map (Map)
|
||||
import qualified Data.Map as Map (empty, fromList)
|
||||
import Text.Parsec
|
||||
--import Text.Parsec.ByteString.Lazy (Parser)
|
||||
|
||||
type Parser u = Parsec ByteString u
|
||||
|
||||
|
@ -185,7 +184,7 @@ xrefSubSection = do
|
|||
|
||||
content :: Parser u Content
|
||||
content =
|
||||
Content [] Map.empty
|
||||
Content 0 [] Map.empty
|
||||
<$> (line "xref" *> xrefSubSection `sepBy` eol)
|
||||
<*> (line "trailer" *> dictionary <* eol)
|
||||
<*> (line "startxref" *> integer)
|
||||
|
|
Loading…
Reference in a new issue