Boyer-Moore is canceled, implement the rest of parsing with naive search
This commit is contained in:
parent
fc41f815a3
commit
9b2f890227
4 changed files with 39 additions and 59 deletions
|
@ -1,15 +1,12 @@
|
||||||
{-# LANGUAGE NamedFieldPuns #-}
|
{-# LANGUAGE NamedFieldPuns #-}
|
||||||
module Data.ByteString.Lazy.Char8.Util (
|
module Data.ByteString.Lazy.Char8.Util (
|
||||||
previous
|
previous
|
||||||
, search
|
|
||||||
, subBS
|
, subBS
|
||||||
) where
|
) where
|
||||||
|
|
||||||
import Data.ByteString.Lazy.Char8 (ByteString)
|
import Data.ByteString.Lazy.Char8 (ByteString)
|
||||||
import qualified Data.ByteString.Lazy.Char8 as BS (drop, index, isPrefixOf, length, tails, take, uncons)
|
import qualified Data.ByteString.Lazy.Char8 as BS (drop, index, take)
|
||||||
import Data.Int (Int64)
|
import Data.Int (Int64)
|
||||||
import Data.Map (Map)
|
|
||||||
import qualified Data.Map as Map (empty, fromList, insert, lookup, member)
|
|
||||||
import Prelude hiding (length)
|
import Prelude hiding (length)
|
||||||
|
|
||||||
previous :: Char -> Int64 -> ByteString -> Int64
|
previous :: Char -> Int64 -> ByteString -> Int64
|
||||||
|
@ -17,44 +14,5 @@ previous char position byteString
|
||||||
| BS.index byteString position == char = position
|
| BS.index byteString position == char = position
|
||||||
| otherwise = previous char (position - 1) byteString
|
| otherwise = previous char (position - 1) byteString
|
||||||
|
|
||||||
data BMTable = BMTable {
|
|
||||||
length :: Int64
|
|
||||||
, offsets :: Map Int64 (Map Char Int64)
|
|
||||||
} deriving Show
|
|
||||||
|
|
||||||
prepare :: ByteString -> BMTable
|
|
||||||
prepare needle =
|
|
||||||
let length = BS.length needle in
|
|
||||||
let offsets = Map.fromList $ generateSuffixOffsets <$> [0..length - 1] in
|
|
||||||
BMTable {length, offsets}
|
|
||||||
where
|
|
||||||
generateSuffixOffsets l =
|
|
||||||
let suffix = BS.drop l needle in
|
|
||||||
let prefixes = fmap (l -) <$> zip (BS.tails needle) [0..l] in
|
|
||||||
(l, foldl (addOffset suffix) Map.empty prefixes)
|
|
||||||
addOffset suffix tmpMap ((aSuffix, delta)) = maybe tmpMap id $ do
|
|
||||||
(initial, rest) <- BS.uncons aSuffix
|
|
||||||
if BS.isPrefixOf suffix rest && not (Map.member initial tmpMap)
|
|
||||||
then return (Map.insert initial delta tmpMap)
|
|
||||||
else Nothing
|
|
||||||
|
|
||||||
jump :: BMTable -> Int64 -> Char -> Int64
|
|
||||||
jump (BMTable {length, offsets}) index char =
|
|
||||||
maybe length (maybe length id . Map.lookup char) (Map.lookup index offsets)
|
|
||||||
|
|
||||||
search :: ByteString -> ByteString -> Maybe Int64
|
|
||||||
search needle = boyerMoore 0
|
|
||||||
where
|
|
||||||
table = prepare needle
|
|
||||||
tryMatch offset haystack n
|
|
||||||
| n < 0 = Just offset
|
|
||||||
| BS.index haystack n == BS.index needle n = tryMatch offset haystack (n-1)
|
|
||||||
| otherwise =
|
|
||||||
let delta = jump table n (BS.index haystack n) in
|
|
||||||
boyerMoore (offset + delta) (BS.drop delta haystack)
|
|
||||||
boyerMoore offset haystack
|
|
||||||
| BS.length haystack < 1 = Nothing
|
|
||||||
| otherwise = tryMatch offset haystack (BS.length needle - 1)
|
|
||||||
|
|
||||||
subBS :: Int64 -> Int64 -> ByteString -> ByteString
|
subBS :: Int64 -> Int64 -> ByteString -> ByteString
|
||||||
subBS offset length = BS.take length . BS.drop offset
|
subBS offset length = BS.take length . BS.drop offset
|
||||||
|
|
44
src/PDF.hs
44
src/PDF.hs
|
@ -8,7 +8,9 @@ module PDF (
|
||||||
) where
|
) where
|
||||||
|
|
||||||
import Data.ByteString.Lazy.Char8 (ByteString)
|
import Data.ByteString.Lazy.Char8 (ByteString)
|
||||||
import qualified Data.ByteString.Lazy.Char8 as BS (drop, isPrefixOf, last, length, unpack)
|
import qualified Data.ByteString.Lazy.Char8 as BS (
|
||||||
|
drop, findIndex, head, isPrefixOf, last, length, span, unpack
|
||||||
|
)
|
||||||
import Data.ByteString.Lazy.Char8.Util (previous, subBS)
|
import Data.ByteString.Lazy.Char8.Util (previous, subBS)
|
||||||
import Data.Int (Int64)
|
import Data.Int (Int64)
|
||||||
import qualified Data.Map as Map (lookup)
|
import qualified Data.Map as Map (lookup)
|
||||||
|
@ -57,16 +59,24 @@ readStartXref eolStyle input =
|
||||||
previous eolLastByte (eofMarkerPosition - eolOffset - 1) input + 1
|
previous eolLastByte (eofMarkerPosition - eolOffset - 1) input + 1
|
||||||
startXrefLength = eofMarkerPosition - eolOffset - startXrefPosition
|
startXrefLength = eofMarkerPosition - eolOffset - startXrefPosition
|
||||||
|
|
||||||
parseDocument :: ByteString -> Either ParseError Document
|
nextLine :: ByteString -> Int64
|
||||||
parseDocument input = do
|
nextLine input =
|
||||||
(pdfVersion, eolStyle) <- parse ((,) <$> version <*> eol) "" input
|
let (line, eolPrefixed) = BS.span notInEol input in
|
||||||
startXref <- readStartXref eolStyle input
|
let nextNotInEol = BS.findIndex notInEol eolPrefixed in
|
||||||
structures <- iterateContents startXref input
|
BS.length line + (maybe (BS.length eolPrefixed) id nextNotInEol)
|
||||||
let contents = populate input <$> structures
|
where
|
||||||
return $ Document {pdfVersion, contents}
|
notInEol = not . (`elem` eolCharset)
|
||||||
|
|
||||||
findNextContentSection :: Int64 -> ByteString -> Int64
|
nextSection :: Int64 -> ByteString -> Int64
|
||||||
findNextContentSection startXref input =
|
nextSection offset input =
|
||||||
|
case BS.findIndex (== BS.head eofMarker) input of
|
||||||
|
Nothing -> 0
|
||||||
|
Just delta ->
|
||||||
|
let newInput = BS.drop delta input in
|
||||||
|
let newOffset = offset + delta in
|
||||||
|
if BS.isPrefixOf eofMarker newInput
|
||||||
|
then newOffset + nextLine newInput
|
||||||
|
else nextSection (newOffset + 1) (BS.drop 1 newInput)
|
||||||
|
|
||||||
iterateContents :: Int64 -> ByteString -> Either ParseError [Content]
|
iterateContents :: Int64 -> ByteString -> Either ParseError [Content]
|
||||||
iterateContents startXref input =
|
iterateContents startXref input =
|
||||||
|
@ -74,7 +84,17 @@ iterateContents startXref input =
|
||||||
where
|
where
|
||||||
stopOrFollow c@(Content {trailer}) =
|
stopOrFollow c@(Content {trailer}) =
|
||||||
case Map.lookup "Prev" trailer of
|
case Map.lookup "Prev" trailer of
|
||||||
Nothing -> Right [c]
|
Nothing -> Right [c {startOffset = nextLine input}]
|
||||||
Just (Number f) -> (c:) <$> (iterateContents (truncate f) input)
|
Just (Number newStartXref) ->
|
||||||
|
let offset = truncate newStartXref in
|
||||||
|
let startOffset = nextSection offset (BS.drop offset input) in
|
||||||
|
(c {startOffset}:) <$> (iterateContents offset input)
|
||||||
Just v -> parseError $ "Bad value for Prev entry in trailer: " ++ show v
|
Just v -> parseError $ "Bad value for Prev entry in trailer: " ++ show v
|
||||||
|
|
||||||
|
parseDocument :: ByteString -> Either ParseError Document
|
||||||
|
parseDocument input = do
|
||||||
|
(pdfVersion, eolStyle) <- parse ((,) <$> version <*> eol) "" input
|
||||||
|
startXref <- readStartXref eolStyle input
|
||||||
|
structures <- iterateContents startXref input
|
||||||
|
let contents = populate input <$> structures
|
||||||
|
return $ Document {pdfVersion, contents}
|
||||||
|
|
|
@ -113,9 +113,12 @@ occurrence = Comment <$> comment <|> Indirect <$> indirectObjCoordinates
|
||||||
|
|
||||||
populate :: ByteString -> Content -> Content
|
populate :: ByteString -> Content -> Content
|
||||||
populate input initialContent =
|
populate input initialContent =
|
||||||
case runParser recurseOnOccurrences initialState "" input of
|
let bodyInput = BS.drop (startOffset initialContent) input in
|
||||||
|
case runParser recurseOnOccurrences initialState "" bodyInput of
|
||||||
Left _ -> initialContent
|
Left _ -> initialContent
|
||||||
Right finalState -> content finalState
|
Right finalState ->
|
||||||
|
let finalContent = content finalState in
|
||||||
|
finalContent {body = reverse (body finalContent)}
|
||||||
where
|
where
|
||||||
initialState = UserState {input, content = initialContent}
|
initialState = UserState {input, content = initialContent}
|
||||||
|
|
||||||
|
|
|
@ -25,7 +25,6 @@ import Data.Int (Int64)
|
||||||
import Data.Map (Map)
|
import Data.Map (Map)
|
||||||
import qualified Data.Map as Map (empty, fromList)
|
import qualified Data.Map as Map (empty, fromList)
|
||||||
import Text.Parsec
|
import Text.Parsec
|
||||||
--import Text.Parsec.ByteString.Lazy (Parser)
|
|
||||||
|
|
||||||
type Parser u = Parsec ByteString u
|
type Parser u = Parsec ByteString u
|
||||||
|
|
||||||
|
@ -185,7 +184,7 @@ xrefSubSection = do
|
||||||
|
|
||||||
content :: Parser u Content
|
content :: Parser u Content
|
||||||
content =
|
content =
|
||||||
Content [] Map.empty
|
Content 0 [] Map.empty
|
||||||
<$> (line "xref" *> xrefSubSection `sepBy` eol)
|
<$> (line "xref" *> xrefSubSection `sepBy` eol)
|
||||||
<*> (line "trailer" *> dictionary <* eol)
|
<*> (line "trailer" *> dictionary <* eol)
|
||||||
<*> (line "startxref" *> integer)
|
<*> (line "startxref" *> integer)
|
||||||
|
|
Loading…
Reference in a new issue