Fixes a concurrency bug wherein a thread (running beyond the maximum number of

pluckers) would register a CQ notification via
grpc_server_request_registered_call prior to obtaining pluck permission, block
on acquiring such permission, and thus miss notification of the server call.

The bug can could be witnessed with particular interleavings by ensuring that
GRPC_MAX_COMPLETION_QUEUE_PLUCKERS server threads are blocked on their own
server call notifications, and then using the
(GRPC_MAX_COMPLETION_QUEUE_PLUCKERS+1)th server thread to attempt to block on a
notification for an additional server call.

In order to fix this, we simply extended the pluck permission held window for
serverRequestCall, and created a variant of pluck (called pluck') which assumes
permission to pluck is already held by the calling thread.
This commit is contained in:
Joel Stanley 2016-07-10 17:04:31 -05:00
parent 96d12c1e6c
commit 72160cea89
5 changed files with 136 additions and 111 deletions

View file

@ -5,21 +5,17 @@
import Control.Monad import Control.Monad
import Network.GRPC.LowLevel import Network.GRPC.LowLevel
import Network.GRPC.LowLevel.Call
import qualified Network.GRPC.LowLevel.Client.Unregistered as U import qualified Network.GRPC.LowLevel.Client.Unregistered as U
import System.Environment
echoMethod = MethodName "/echo.Echo/DoEcho" echoMethod = MethodName "/echo.Echo/DoEcho"
unregistered c = do _unregistered c = U.clientRequest c echoMethod 1 "hi" mempty
U.clientRequest c echoMethod 1 "hi" mempty
registered c = do main = withGRPC $ \g ->
meth <- clientRegisterMethod c echoMethod Normal withClient g (ClientConfig "localhost" 50051 []) $ \c -> do
clientRequest c meth 1 "hi" mempty rm <- clientRegisterMethod c echoMethod Normal
replicateM_ 100000 $ clientRequest c rm 5 "hi" mempty >>= \case
run f = withGRPC $ \g -> withClient g (ClientConfig "localhost" 50051 []) $ \c ->
f c >>= \case
Left e -> error $ "Got client error: " ++ show e Left e -> error $ "Got client error: " ++ show e
_ -> return () _ -> return ()
main = replicateM_ 100 $ run $
registered

View file

@ -5,10 +5,12 @@
{-# OPTIONS_GHC -fno-warn-missing-signatures #-} {-# OPTIONS_GHC -fno-warn-missing-signatures #-}
{-# OPTIONS_GHC -fno-warn-unused-binds #-} {-# OPTIONS_GHC -fno-warn-unused-binds #-}
import Control.Concurrent.Async (async, wait) import Control.Concurrent
import Control.Monad (forever) import Control.Concurrent.Async
import Control.Monad
import Data.ByteString (ByteString) import Data.ByteString (ByteString)
import Network.GRPC.LowLevel import Network.GRPC.LowLevel
import Network.GRPC.LowLevel.Call
import qualified Network.GRPC.LowLevel.Server.Unregistered as U import qualified Network.GRPC.LowLevel.Server.Unregistered as U
import qualified Network.GRPC.LowLevel.Call.Unregistered as U import qualified Network.GRPC.LowLevel.Call.Unregistered as U
@ -44,14 +46,18 @@ regMain = withGRPC $ \grpc -> do
Left x -> putStrLn $ "registered call result error: " ++ show x Left x -> putStrLn $ "registered call result error: " ++ show x
Right _ -> return () Right _ -> return ()
-- | loop to fork n times tputStrLn x = do
tid <- myThreadId
putStrLn $ "[" ++ show tid ++ "]: " ++ x
regLoop :: Server -> RegisteredMethod 'Normal -> IO () regLoop :: Server -> RegisteredMethod 'Normal -> IO ()
regLoop server method = forever $ do regLoop server method = forever $ do
-- tputStrLn "about to block on call handler"
result <- serverHandleNormalCall server method serverMeta $ result <- serverHandleNormalCall server method serverMeta $
\_call reqBody _reqMeta -> return (reqBody, serverMeta, StatusOk, \_call reqBody _reqMeta ->
StatusDetails "") return (reqBody, serverMeta, StatusOk, StatusDetails "")
case result of case result of
Left x -> putStrLn $ "registered call result error: " ++ show x Left x -> error $! "registered call result error: " ++ show x
Right _ -> return () Right _ -> return ()
regMainThreaded :: IO () regMainThreaded :: IO ()
@ -60,11 +66,10 @@ regMainThreaded = do
let methods = [(MethodName "/echo.Echo/DoEcho", Normal)] let methods = [(MethodName "/echo.Echo/DoEcho", Normal)]
withServer grpc (ServerConfig "localhost" 50051 methods []) $ \server -> do withServer grpc (ServerConfig "localhost" 50051 methods []) $ \server -> do
let method = head (normalMethods server) let method = head (normalMethods server)
tid1 <- async $ regLoop server method tids <- replicateM 7 $ async $ do tputStrLn "starting handler"
tid2 <- async $ regLoop server method regLoop server method
wait tid1 waitAnyCancel tids
wait tid2 tputStrLn "finishing"
return ()
main :: IO () main :: IO ()
main = regMainThreaded main = regMainThreaded

View file

@ -12,6 +12,7 @@
{-# LANGUAGE LambdaCase #-} {-# LANGUAGE LambdaCase #-}
{-# LANGUAGE MultiWayIf #-} {-# LANGUAGE MultiWayIf #-}
{-# LANGUAGE PolyKinds #-}
{-# LANGUAGE RecordWildCards #-} {-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TupleSections #-} {-# LANGUAGE TupleSections #-}
@ -33,14 +34,16 @@ module Network.GRPC.LowLevel.CompletionQueue
) )
where where
import Control.Concurrent.STM (atomically, check) import Control.Concurrent.STM (atomically,
import Control.Concurrent.STM.TVar (newTVarIO, readTVar, check)
import Control.Concurrent.STM.TVar (newTVarIO,
readTVar,
writeTVar) writeTVar)
import Control.Exception (bracket) import Control.Exception (bracket)
import Control.Monad.Trans.Class (MonadTrans(lift))
import Control.Monad.Trans.Except
import Control.Monad (liftM2) import Control.Monad (liftM2)
import Control.Monad.Managed import Control.Monad.Managed
import Control.Monad.Trans.Class (MonadTrans (lift))
import Control.Monad.Trans.Except
import Data.IORef (newIORef) import Data.IORef (newIORef)
import Data.List (intersperse) import Data.List (intersperse)
import Foreign.Marshal.Alloc (free, malloc) import Foreign.Marshal.Alloc (free, malloc)
@ -51,12 +54,13 @@ import qualified Network.GRPC.Unsafe.Constants as C
import qualified Network.GRPC.Unsafe.Metadata as C import qualified Network.GRPC.Unsafe.Metadata as C
import qualified Network.GRPC.Unsafe.Op as C import qualified Network.GRPC.Unsafe.Op as C
import qualified Network.GRPC.Unsafe.Time as C import qualified Network.GRPC.Unsafe.Time as C
import System.Clock (getTime, Clock(..)) import System.Clock (Clock (..),
getTime)
import System.Timeout (timeout) import System.Timeout (timeout)
import Network.GRPC.LowLevel.Call import Network.GRPC.LowLevel.Call
import Network.GRPC.LowLevel.GRPC
import Network.GRPC.LowLevel.CompletionQueue.Internal import Network.GRPC.LowLevel.CompletionQueue.Internal
import Network.GRPC.LowLevel.GRPC
import qualified Network.GRPC.Unsafe.ByteBuffer as C import qualified Network.GRPC.Unsafe.ByteBuffer as C
withCompletionQueue :: GRPC -> (CompletionQueue -> IO a) -> IO a withCompletionQueue :: GRPC -> (CompletionQueue -> IO a) -> IO a
@ -81,7 +85,9 @@ startBatch cq@CompletionQueue{..} call opArray opArraySize tag =
withPermission Push cq $ fmap throwIfCallError $ do withPermission Push cq $ fmap throwIfCallError $ do
grpcDebug $ "startBatch: calling grpc_call_start_batch with pointers: " grpcDebug $ "startBatch: calling grpc_call_start_batch with pointers: "
++ show call ++ " " ++ show opArray ++ show call ++ " " ++ show opArray
grpcDebug "About to enter grpc_call_start_batch"
res <- C.grpcCallStartBatch call opArray opArraySize tag C.reserved res <- C.grpcCallStartBatch call opArray opArraySize tag C.reserved
grpcDebug "Returned from grpc_call_start_batch"
grpcDebug "startBatch: grpc_call_start_batch call returned." grpcDebug "startBatch: grpc_call_start_batch call returned."
return res return res
@ -93,8 +99,9 @@ startBatch cq@CompletionQueue{..} call opArray opArraySize tag =
shutdownCompletionQueue :: CompletionQueue -> IO (Either GRPCIOError ()) shutdownCompletionQueue :: CompletionQueue -> IO (Either GRPCIOError ())
shutdownCompletionQueue CompletionQueue{..} = do shutdownCompletionQueue CompletionQueue{..} = do
atomically $ writeTVar shuttingDown True atomically $ writeTVar shuttingDown True
atomically $ readTVar currentPushers >>= \x -> check (x == 0) atomically $ do
atomically $ readTVar currentPluckers >>= \x -> check (x == 0) readTVar currentPushers >>= check . (==0)
readTVar currentPluckers >>= check . (==0)
--drain the queue --drain the queue
C.grpcCompletionQueueShutdown unsafeCQ C.grpcCompletionQueueShutdown unsafeCQ
loopRes <- timeout (5*10^(6::Int)) drainLoop loopRes <- timeout (5*10^(6::Int)) drainLoop
@ -139,22 +146,22 @@ serverRequestCall :: C.Server
-> CompletionQueue -> CompletionQueue
-> RegisteredMethod mt -> RegisteredMethod mt
-> IO (Either GRPCIOError ServerCall) -> IO (Either GRPCIOError ServerCall)
serverRequestCall s cq@CompletionQueue{.. } RegisteredMethod{..} = serverRequestCall s cq@CompletionQueue{.. } rm =
-- NB: The method type dictates whether or not a payload is present, according -- NB: The method type dictates whether or not a payload is present, according
-- to the payloadHandling function. We do not allocate a buffer for the -- to the payloadHandling function. We do not allocate a buffer for the
-- payload when it is not present. -- payload when it is not present.
withPermission Push cq . with allocs $ \(dead, call, pay, meta) -> do withPermission Push cq . with allocs $ \(dead, call, pay, meta) -> do
dbug "pre-pluck block"
withPermission Pluck cq $ do
md <- peek meta md <- peek meta
tag <- newTag cq tag <- newTag cq
dbug $ "tag is " ++ show tag dbug $ "got pluck permission, registering call for tag=" ++ show tag
ce <- C.grpcServerRequestRegisteredCall s methodHandle call ce <- C.grpcServerRequestRegisteredCall s (methodHandle rm) call dead md pay unsafeCQ unsafeCQ tag
dead md pay unsafeCQ unsafeCQ tag
dbug $ "callError: " ++ show ce
runExceptT $ case ce of runExceptT $ case ce of
C.CallOk -> do C.CallOk -> do
ExceptT $ do ExceptT $ do
r <- pluck cq tag Nothing r <- pluck' cq tag Nothing
dbug $ "pluck finished:" ++ show r dbug $ "pluck' finished:" ++ show r
return r return r
lift $ lift $
ServerCall ServerCall
@ -162,9 +169,11 @@ serverRequestCall s cq@CompletionQueue{.. } RegisteredMethod{..} =
<*> C.getAllMetadataArray md <*> C.getAllMetadataArray md
<*> (if havePay then toBS pay else return Nothing) <*> (if havePay then toBS pay else return Nothing)
<*> liftM2 (+) (getTime Monotonic) (C.timeSpec <$> peek dead) <*> liftM2 (+) (getTime Monotonic) (C.timeSpec <$> peek dead)
-- gRPC gives us a deadline that is just a delta, so we convert it -- gRPC gives us a deadline that is just a delta, so we convert
-- to a proper deadline. -- it to a proper deadline.
_ -> throwE (GRPCIOCallError ce) _ -> do
lift $ dbug $ "Throwing callError: " ++ show ce
throwE (GRPCIOCallError ce)
where where
allocs = (,,,) <$> ptr <*> ptr <*> pay <*> md allocs = (,,,) <$> ptr <*> ptr <*> pay <*> md
where where
@ -173,7 +182,7 @@ serverRequestCall s cq@CompletionQueue{.. } RegisteredMethod{..} =
ptr :: forall a. Storable a => Managed (Ptr a) ptr :: forall a. Storable a => Managed (Ptr a)
ptr = managed (bracket malloc free) ptr = managed (bracket malloc free)
dbug = grpcDebug . ("serverRequestCall(R): " ++) dbug = grpcDebug . ("serverRequestCall(R): " ++)
havePay = payloadHandling methodType /= C.SrmPayloadNone havePay = payloadHandling (methodType rm) /= C.SrmPayloadNone
toBS p = peek p >>= \bb@(C.ByteBuffer rawPtr) -> toBS p = peek p >>= \bb@(C.ByteBuffer rawPtr) ->
if | rawPtr == nullPtr -> return Nothing if | rawPtr == nullPtr -> return Nothing
| otherwise -> Just <$> C.copyByteBufferToByteString bb | otherwise -> Just <$> C.copyByteBufferToByteString bb

View file

@ -3,8 +3,11 @@
module Network.GRPC.LowLevel.CompletionQueue.Internal where module Network.GRPC.LowLevel.CompletionQueue.Internal where
import Control.Concurrent.STM (atomically, retry) import Control.Concurrent.STM (atomically, retry)
import Control.Concurrent.STM.TVar (TVar, modifyTVar', readTVar) import Control.Concurrent.STM.TVar (TVar, modifyTVar', readTVar,
writeTVar)
import Control.Monad.IO.Class
import Control.Exception (bracket) import Control.Exception (bracket)
import Control.Monad
import Data.IORef (IORef, atomicModifyIORef') import Data.IORef (IORef, atomicModifyIORef')
import Foreign.Ptr (nullPtr, plusPtr) import Foreign.Ptr (nullPtr, plusPtr)
import Network.GRPC.LowLevel.GRPC import Network.GRPC.LowLevel.GRPC
@ -12,6 +15,8 @@ import qualified Network.GRPC.Unsafe as C
import qualified Network.GRPC.Unsafe.Constants as C import qualified Network.GRPC.Unsafe.Constants as C
import qualified Network.GRPC.Unsafe.Time as C import qualified Network.GRPC.Unsafe.Time as C
import Debug.Trace
-- NOTE: the concurrency requirements for a CompletionQueue are a little -- NOTE: the concurrency requirements for a CompletionQueue are a little
-- complicated. There are two read operations: next and pluck. We can either -- complicated. There are two read operations: next and pluck. We can either
-- call next on a CQ or call pluck up to 'maxCompletionQueuePluckers' times -- call next on a CQ or call pluck up to 'maxCompletionQueuePluckers' times
@ -77,23 +82,24 @@ withPermission :: CQOpType
-> CompletionQueue -> CompletionQueue
-> IO (Either GRPCIOError a) -> IO (Either GRPCIOError a)
-> IO (Either GRPCIOError a) -> IO (Either GRPCIOError a)
withPermission op cq f = withPermission op cq act = bracket acquire release doOp
bracket acquire release doOp where
where acquire = atomically $ do acquire = atomically $ do
isShuttingDown <- readTVar (shuttingDown cq) isShuttingDown <- readTVar (shuttingDown cq)
if isShuttingDown if isShuttingDown
then return False then return False
else do currCount <- readTVar $ getCount op cq else do currCount <- readTVar (getCount op cq)
if currCount < getLimit op if currCount < getLimit op
then modifyTVar' (getCount op cq) (+1) >> return True then do
writeTVar (getCount op cq) (currCount+1)
return True
else retry else retry
doOp gotResource = if gotResource
then f doOp gotResource =
else return $ Left GRPCIOShutdown if gotResource then act else return (Left GRPCIOShutdown)
release gotResource =
if gotResource release gotResource = when gotResource $
then atomically $ modifyTVar' (getCount op cq) (subtract 1) atomically $ modifyTVar' (getCount op cq) (subtract 1)
else return ()
-- | Waits for the given number of seconds for the given tag to appear on the -- | Waits for the given number of seconds for the given tag to appear on the
-- completion queue. Throws 'GRPCIOShutdown' if the completion queue is shutting -- completion queue. Throws 'GRPCIOShutdown' if the completion queue is shutting
@ -103,16 +109,20 @@ withPermission op cq f =
-- 'serverRequestCall', this will block forever unless a timeout is given. -- 'serverRequestCall', this will block forever unless a timeout is given.
pluck :: CompletionQueue -> C.Tag -> Maybe TimeoutSeconds pluck :: CompletionQueue -> C.Tag -> Maybe TimeoutSeconds
-> IO (Either GRPCIOError ()) -> IO (Either GRPCIOError ())
pluck cq@CompletionQueue{..} tag waitSeconds = do pluck cq@CompletionQueue{..} tag mwait = do
grpcDebug $ "pluck: called with tag: " ++ show tag grpcDebug $ "pluck: called with tag=" ++ show tag ++ ",mwait=" ++ show mwait
++ " and wait: " ++ show waitSeconds withPermission Pluck cq $ pluck' cq tag mwait
withPermission Pluck cq $
case waitSeconds of -- Variant of pluck' which assumes pluck permission has been granted.
Nothing -> C.withInfiniteDeadline go pluck' :: CompletionQueue
Just seconds -> C.withDeadlineSeconds seconds go -> C.Tag
where go deadline = do -> Maybe TimeoutSeconds
ev <- C.grpcCompletionQueuePluck unsafeCQ tag deadline C.reserved -> IO (Either GRPCIOError ())
grpcDebug $ "pluck: finished. Event: " ++ show ev pluck' CompletionQueue{..} tag mwait =
maybe C.withInfiniteDeadline C.withDeadlineSeconds mwait $ \dead -> do
grpcDebug $ "pluck: blocking on grpc_completion_queue_pluck for tag=" ++ show tag
ev <- C.grpcCompletionQueuePluck unsafeCQ tag dead C.reserved
grpcDebug $ "pluck finished: " ++ show ev
return $ if isEventSuccessful ev then Right () else eventToError ev return $ if isEventSuccessful ev then Right () else eventToError ev
-- | Translate 'C.Event' to an error. The caller is responsible for ensuring -- | Translate 'C.Event' to an error. The caller is responsible for ensuring

View file

@ -4,7 +4,7 @@
module Network.GRPC.LowLevel.GRPC where module Network.GRPC.LowLevel.GRPC where
import Control.Concurrent (threadDelay) import Control.Concurrent (threadDelay, myThreadId)
import Control.Exception import Control.Exception
import Data.String (IsString) import Data.String (IsString)
import qualified Data.ByteString as B import qualified Data.ByteString as B
@ -59,12 +59,17 @@ throwIfCallError x = Left $ GRPCIOCallError x
grpcDebug :: String -> IO () grpcDebug :: String -> IO ()
{-# INLINE grpcDebug #-} {-# INLINE grpcDebug #-}
#ifdef DEBUG #ifdef DEBUG
grpcDebug str = do tid <- myThreadId grpcDebug = grpcDebug'
putStrLn $ (show tid) ++ ": " ++ str
#else #else
grpcDebug _ = return () grpcDebug _ = return ()
#endif #endif
grpcDebug' :: String -> IO ()
{-# INLINE grpcDebug' #-}
grpcDebug' str = do
tid <- myThreadId
putStrLn $ "[" ++ show tid ++ "]: " ++ str
threadDelaySecs :: Int -> IO () threadDelaySecs :: Int -> IO ()
threadDelaySecs = threadDelay . (* 10^(6::Int)) threadDelaySecs = threadDelay . (* 10^(6::Int))