Fix async race causing crash on server shutdown (#43)

* clean up forked threads before shutting down server * get rid of sleep, wait until all forked threads are dead to shutdown server * add todo
2024-11-23 03:29:42 +01:00 · 2016-07-18 09:04:01 -07:00 · 2016-07-18 09:04:01 -07:00 · 65026eb79c
commit 65026eb79c
parent 99e6f0652d
3 changed files with 82 additions and 11 deletions
--- a/grpc-haskell.cabal
+++ b/grpc-haskell.cabal
@ -80,7 +80,7 @@ library
    , grpc/impl/codegen/slice.h
  build-tools:          c2hs
  default-language:     Haskell2010
-  ghc-options:          -Wall -fwarn-incomplete-patterns -fno-warn-unused-do-bind
+  ghc-options:          -Wall -fwarn-incomplete-patterns -fno-warn-unused-do-bind -threaded
  include-dirs:         include
  hs-source-dirs:       src
  default-extensions:   CPP
--- a/src/Network/GRPC/LowLevel/Server.hs
+++ b/src/Network/GRPC/LowLevel/Server.hs
@ -14,10 +14,24 @@
 -- `Network.GRPC.LowLevel.Server.Unregistered`.
 module Network.GRPC.LowLevel.Server where

+import           Control.Concurrent                    (ThreadId
+                                                        , forkFinally
+                                                        , myThreadId
+                                                        , killThread)
+import           Control.Concurrent.STM                (atomically
+                                                        , check)
+import           Control.Concurrent.STM.TVar           (TVar
+                                                        , modifyTVar'
+                                                        , readTVar
+                                                        , writeTVar
+                                                        , readTVarIO
+                                                        , newTVarIO)
 import           Control.Exception                     (bracket, finally)
-import           Control.Monad
+import           Control.Monad hiding (mapM_)
 import           Control.Monad.Trans.Except
 import           Data.ByteString                       (ByteString)
+import           Data.Foldable                         (mapM_)
+import qualified Data.Set as S
 import           Network.GRPC.LowLevel.Call
 import           Network.GRPC.LowLevel.CompletionQueue (CompletionQueue,
                                                        createCompletionQueue,
@ -42,8 +56,41 @@ data Server = Server
  , cstreamingMethods    :: [RegisteredMethod 'ClientStreaming]
  , bidiStreamingMethods :: [RegisteredMethod 'BiDiStreaming]
  , serverConfig         :: ServerConfig
+  , outstandingForks     :: TVar (S.Set ThreadId)
+  , serverShuttingDown   :: TVar Bool
  }

+-- TODO: should we make a forkGRPC function instead? I am not sure if it would
+-- be safe to let the call handlers threads keep running after the server stops,
+-- so I'm taking the more conservative route of ensuring the server will
+-- stay alive. Experiment more when time permits.
+-- | Fork a thread from the server (presumably for a handler) with the guarantee
+-- that the server won't shut down while the thread is alive.
+-- Returns true if the fork happens successfully, and false if the server is
+-- already shutting down (in which case the function to fork is never executed).
+-- If the thread stays alive too long while the server is trying to shut down,
+-- the thread will be killed with 'killThread'.
+-- The purpose of this is to prevent memory access
+-- errors at the C level of the library, not to ensure that application layer
+-- operations in user code complete successfully.
+forkServer :: Server -> IO () -> IO Bool
+forkServer Server{..} f = do
+  shutdown <- readTVarIO serverShuttingDown
+  case shutdown of
+    True -> return False
+    False -> do
+      tid <- forkFinally f cleanup
+      atomically $ modifyTVar' outstandingForks (S.insert tid)
+      #ifdef DEBUG
+      currSet <- readTVarIO outstandingForks
+      grpcDebug $ "forkServer: number of outstandingForks is "
+                  ++ show (S.size currSet)
+      #endif
+      return True
+      where cleanup _ = do
+              tid <- myThreadId
+              atomically $ modifyTVar' outstandingForks (S.delete tid)
+
 -- | Configuration needed to start a server.
 data ServerConfig = ServerConfig
  { host              :: Host
@ -90,15 +137,18 @@ startServer grpc conf@ServerConfig{..} =
    bs <- mapM (\nm -> serverRegisterMethodBiDiStreaming server nm e)
               methodsToRegisterBiDiStreaming
    C.grpcServerStart server
-    return $ Server grpc server cq ns ss cs bs conf
+    forks <- newTVarIO S.empty
+    shutdown <- newTVarIO False
+    return $ Server grpc server cq ns ss cs bs conf forks shutdown

 stopServer :: Server -> IO ()
 -- TODO: Do method handles need to be freed?
-stopServer Server{ unsafeServer = s, serverCQ = scq } = do
+stopServer Server{ unsafeServer = s, serverCQ = scq, .. } = do
  grpcDebug "stopServer: calling shutdownNotify."
  shutdownNotify
  grpcDebug "stopServer: cancelling all calls."
  C.grpcServerCancelAllCalls s
+  cleanupForks
  grpcDebug "stopServer: call grpc_server_destroy."
  C.grpcServerDestroy s
  grpcDebug "stopServer: shutting down CQ."
@ -122,6 +172,14 @@ stopServer Server{ unsafeServer = s, serverCQ = scq } = do
            (Left GRPCIOShutdown) -> error "Called stopServer twice!"
            (Left _) -> error "Failed to stop server."
            (Right _) -> return ()
+        cleanupForks = do
+          atomically $ writeTVar serverShuttingDown True
+          liveForks <- readTVarIO outstandingForks
+          grpcDebug $ "Server shutdown: killing threads: " ++ show liveForks
+          mapM_ killThread liveForks
+          --wait for threads to shut down.
+          atomically $ readTVar outstandingForks >>= (check . (==0) . S.size)
+          grpcDebug "Server shutdown: All forks cleaned up."

 -- Uses 'bracket' to safely start and stop a server, even if exceptions occur.
 withServer :: GRPC -> ServerConfig -> (Server -> IO a) -> IO a
--- a/src/Network/GRPC/LowLevel/Server/Unregistered.hs
+++ b/src/Network/GRPC/LowLevel/Server/Unregistered.hs
@ -3,9 +3,7 @@

 module Network.GRPC.LowLevel.Server.Unregistered where

-import           Control.Concurrent                                 (forkIO)
 import           Control.Exception                                  (finally)
-import           Control.Monad
 import           Control.Monad.Trans.Except
 import           Data.ByteString                                    (ByteString)
 import           Network.GRPC.LowLevel.Call.Unregistered
@ -25,7 +23,8 @@ import           Network.GRPC.LowLevel.Op                           (Op (..)
 import           Network.GRPC.LowLevel.Server                       (Server (..)
                                                                     , ServerReaderHandlerLL
                                                                     , ServerWriterHandlerLL
-                                                                     , ServerRWHandlerLL)
+                                                                     , ServerRWHandlerLL
+                                                                     , forkServer)
 import qualified Network.GRPC.Unsafe.Op                             as C

 serverCreateCall :: Server
@ -54,10 +53,24 @@ withServerCallAsync :: Server
                       -> IO ()
 withServerCallAsync s f =
  serverCreateCall s >>= \case
-    Left _ -> return ()
-    Right c -> void $ forkIO (f c `finally` do
-      grpcDebug "withServerCallAsync: destroying."
-      destroyServerCall c)
+    Left e -> do grpcDebug $ "withServerCallAsync: call error: " ++ show e
+                 return ()
+    Right c -> do wasForkSuccess <- forkServer s handler
+                  if wasForkSuccess
+                     then return ()
+                     else destroy
+                where handler = f c `finally` destroy
+                      --TODO: We sometimes never finish cleanup if the server
+                      -- is shutting down and calls killThread. This causes
+                      -- gRPC core to complain about leaks.
+                      -- I think the cause of this is that killThread gets
+                      -- called after we are already in destroyServerCall,
+                      -- and wrapping uninterruptibleMask doesn't seem to help.
+                      -- Doesn't crash, but does emit annoying log messages.
+                      destroy = do
+                        grpcDebug "withServerCallAsync: destroying."
+                        destroyServerCall c
+                        grpcDebug "withServerCallAsync: cleanup finished."

 -- | Sequence of 'Op's needed to receive a normal (non-streaming) call.
 -- TODO: We have to put 'OpRecvCloseOnServer' in the response ops, or else the