Fixing a race condition in the Crash Generation Server which has to

do with simultaneous handling of dump requests and client process termination events. http://breakpad.appspot.com/430002/ git-svn-id: http://google-breakpad.googlecode.com/svn/trunk@1013 4c0a9323-5329-0410-9bdc-e9ce6186880e
author: ivan.penkov@gmail.com <ivan.penkov@gmail.com@4c0a9323-5329-0410-9bdc-e9ce6186880e> 2012-08-15 22:09:42 +0000
committer: ivan.penkov@gmail.com <ivan.penkov@gmail.com@4c0a9323-5329-0410-9bdc-e9ce6186880e> 2012-08-15 22:09:42 +0000
commit: d7de392b05e015f669012db65395f26daaa53f14 (patch)
tree: cb4e3b3c63a4fb71c26b5611bd8db89e984968a3
parent: Fix narrowing conversion from -1 to size_t (diff)
download: breakpad-d7de392b05e015f669012db65395f26daaa53f14.tar.xz
4 files changed, 179 insertions, 147 deletions
diff --git a/src/client/windows/crash_generation/client_info.cc b/src/client/windows/crash_generation/client_info.cc
index 60bbac82..ca10caa4 100644
--- a/src/client/windows/crash_generation/client_info.cc
+++ b/src/client/windows/crash_generation/client_info.cc
@@ -85,16 +85,38 @@ bool ClientInfo::Initialize() {
   return dump_generated_handle_ != NULL;
 }
 
-ClientInfo::~ClientInfo() {
+void ClientInfo::UnregisterDumpRequestWaitAndBlockUntilNoPending() {
   if (dump_request_wait_handle_) {
     // Wait for callbacks that might already be running to finish.
     UnregisterWaitEx(dump_request_wait_handle_, INVALID_HANDLE_VALUE);
+    dump_request_wait_handle_ = NULL;
   }
+}
 
+void ClientInfo::UnregisterProcessExitWait(bool block_until_no_pending) {
   if (process_exit_wait_handle_) {
-    // Wait for the callback that might already be running to finish.
-    UnregisterWaitEx(process_exit_wait_handle_, INVALID_HANDLE_VALUE);
+    if (block_until_no_pending) {
+      // Wait for the callback that might already be running to finish.
+      UnregisterWaitEx(process_exit_wait_handle_, INVALID_HANDLE_VALUE);
+    } else {
+      UnregisterWait(process_exit_wait_handle_);
+    }
+    process_exit_wait_handle_ = NULL;
   }
+}
+
+ClientInfo::~ClientInfo() {
+  // Waiting for the callback to finish here is safe because ClientInfo's are
+  // never destroyed from the dump request handling callback.
+  UnregisterDumpRequestWaitAndBlockUntilNoPending();
+
+  // This is a little tricky because ClientInfo's may be destroyed by the same
+  // callback (OnClientEnd) and waiting for it to finish will cause a deadlock.
+  // Regardless of this complication, wait for any running callbacks to finish
+  // so that the common case is properly handled.  In order to avoid deadlocks,
+  // the OnClientEnd callback must call UnregisterProcessExitWait(false)
+  // before deleting the ClientInfo.
+  UnregisterProcessExitWait(true);
 
   if (process_handle_) {
     CloseHandle(process_handle_);
@@ -109,18 +131,6 @@ ClientInfo::~ClientInfo() {
   }
 }
 
-void ClientInfo::UnregisterWaits() {
-  if (dump_request_wait_handle_) {
-    UnregisterWait(dump_request_wait_handle_);
-    dump_request_wait_handle_ = NULL;
-  }
-
-  if (process_exit_wait_handle_) {
-    UnregisterWait(process_exit_wait_handle_);
-    process_exit_wait_handle_ = NULL;
-  }
-}
-
 bool ClientInfo::GetClientExceptionInfo(EXCEPTION_POINTERS** ex_info) const {
   SIZE_T bytes_count = 0;
   if (!ReadProcessMemory(process_handle_,
diff --git a/src/client/windows/crash_generation/client_info.h b/src/client/windows/crash_generation/client_info.h
index 999e6678..a24a82e7 100644
--- a/src/client/windows/crash_generation/client_info.h
+++ b/src/client/windows/crash_generation/client_info.h
@@ -67,24 +67,22 @@ class ClientInfo {
   HANDLE dump_generated_handle() const { return dump_generated_handle_; }
   DWORD crash_id() const { return crash_id_; }
 
-  HANDLE dump_request_wait_handle() const {
-    return dump_request_wait_handle_;
-  }
-
   void set_dump_request_wait_handle(HANDLE value) {
     dump_request_wait_handle_ = value;
   }
 
-  HANDLE process_exit_wait_handle() const {
-    return process_exit_wait_handle_;
-  }
-
   void set_process_exit_wait_handle(HANDLE value) {
     process_exit_wait_handle_ = value;
   }
 
-  // Unregister all waits for the client.
-  void UnregisterWaits();
+  // Unregister the dump request wait operation and wait for all callbacks
+  // that might already be running to complete before returning.
+  void UnregisterDumpRequestWaitAndBlockUntilNoPending();
+
+  // Unregister the process exit wait operation.  If block_until_no_pending is
+  // true, wait for all callbacks that might already be running to complete
+  // before returning.
+  void UnregisterProcessExitWait(bool block_until_no_pending);
 
   bool Initialize();
   bool GetClientExceptionInfo(EXCEPTION_POINTERS** ex_info) const;
diff --git a/src/client/windows/crash_generation/crash_generation_server.cc b/src/client/windows/crash_generation/crash_generation_server.cc
index 477973c1..e67025eb 100644
--- a/src/client/windows/crash_generation/crash_generation_server.cc
+++ b/src/client/windows/crash_generation/crash_generation_server.cc
@@ -76,13 +76,6 @@ static const ULONG kPipeIOThreadFlags = WT_EXECUTEINWAITTHREAD;
 static const ULONG kDumpRequestThreadFlags = WT_EXECUTEINWAITTHREAD |
                                              WT_EXECUTELONGFUNCTION;
 
-// Maximum delay during server shutdown if some work items
-// are still executing.
-static const int kShutdownDelayMs = 10000;
-
-// Interval for each sleep during server shutdown.
-static const int kShutdownSleepIntervalMs = 5;
-
 static bool IsClientRequestValid(const ProtocolMessage& msg) {
   return msg.tag == MESSAGE_TAG_UPLOAD_REQUEST ||
          (msg.tag == MESSAGE_TAG_REGISTRATION_REQUEST &&
@@ -123,8 +116,7 @@ CrashGenerationServer::CrashGenerationServer(
       server_state_(IPC_SERVER_STATE_UNINITIALIZED),
       shutting_down_(false),
       overlapped_(),
-      client_info_(NULL),
-      cleanup_item_count_(0) {
+      client_info_(NULL) {
   InitializeCriticalSection(&sync_);
 
   if (dump_path) {
@@ -132,91 +124,85 @@ CrashGenerationServer::CrashGenerationServer(
   }
 }
 
+// This should never be called from the OnPipeConnected callback.
+// Otherwise the UnregisterWaitEx call below will cause a deadlock.
 CrashGenerationServer::~CrashGenerationServer() {
   // New scope to release the lock automatically.
   {
+    // Make sure no clients are added or removed beyond this point.
+    // Before adding or removing any clients, the critical section
+    // must be entered and the shutting_down_ flag checked. The
+    // critical section is then exited only after the clients_ list
+    // modifications are done and the list is in a consistent state.
     AutoCriticalSection lock(&sync_);
 
     // Indicate to existing threads that server is shutting down.
     shutting_down_ = true;
-
-    // Even if there are no current worker threads running, it is possible that
-    // an I/O request is pending on the pipe right now but not yet done.
-    // In fact, it's very likely this is the case unless we are in an ERROR
-    // state. If we don't wait for the pending I/O to be done, then when the I/O
-    // completes, it may write to invalid memory. AppVerifier will flag this
-    // problem too. So we disconnect from the pipe and then wait for the server
-    // to get into error state so that the pending I/O will fail and get
-    // cleared.
-    DisconnectNamedPipe(pipe_);
-    int num_tries = 100;
-    while (num_tries-- && server_state_ != IPC_SERVER_STATE_ERROR) {
-      lock.Release();
-      Sleep(10);
-      lock.Acquire();
-    }
-
-    // Unregister wait on the pipe.
-    if (pipe_wait_handle_) {
-      // Wait for already executing callbacks to finish.
-      UnregisterWaitEx(pipe_wait_handle_, INVALID_HANDLE_VALUE);
-    }
-
-    // Close the pipe to avoid further client connections.
-    if (pipe_) {
-      CloseHandle(pipe_);
-    }
-
-    // Request all ClientInfo objects to unregister all waits.
-    std::list<ClientInfo*>::iterator iter;
-    for (iter = clients_.begin(); iter != clients_.end(); ++iter) {
-      ClientInfo* client_info = *iter;
-      client_info->UnregisterWaits();
-    }
   }
+  // No one will modify the clients_ list beyond this point -
+  // not even from another thread.
 
-  // Now that all waits have been unregistered, wait for some time
-  // for all pending work items to finish.
-  int total_wait = 0;
-  while (cleanup_item_count_ > 0) {
-    Sleep(kShutdownSleepIntervalMs);
-
-    total_wait += kShutdownSleepIntervalMs;
+  // Even if there are no current worker threads running, it is possible that
+  // an I/O request is pending on the pipe right now but not yet done.
+  // In fact, it's very likely this is the case unless we are in an ERROR
+  // state. If we don't wait for the pending I/O to be done, then when the I/O
+  // completes, it may write to invalid memory. AppVerifier will flag this
+  // problem too. So we disconnect from the pipe and then wait for the server
+  // to get into error state so that the pending I/O will fail and get
+  // cleared.
+  DisconnectNamedPipe(pipe_);
+  int num_tries = 100;
+  while (num_tries-- && server_state_ != IPC_SERVER_STATE_ERROR) {
+    Sleep(10);
+  }
 
-    if (total_wait >= kShutdownDelayMs) {
-      break;
-    }
+  // Unregister wait on the pipe.
+  if (pipe_wait_handle_) {
+    // Wait for already executing callbacks to finish.
+    UnregisterWaitEx(pipe_wait_handle_, INVALID_HANDLE_VALUE);
   }
 
-  // New scope to hold the lock for the shortest time.
-  {
-    AutoCriticalSection lock(&sync_);
+  // Close the pipe to avoid further client connections.
+  if (pipe_) {
+    CloseHandle(pipe_);
+  }
 
-    // Clean up all the ClientInfo objects.
-    std::list<ClientInfo*>::iterator iter;
-    for (iter = clients_.begin(); iter != clients_.end(); ++iter) {
-      ClientInfo* client_info = *iter;
-      delete client_info;
-    }
+  // Request all ClientInfo objects to unregister all waits.
+  // No need to enter the critical section because no one is allowed to modify
+  // the clients_ list once the shutting_down_ flag is set.
+  std::list<ClientInfo*>::iterator iter;
+  for (iter = clients_.begin(); iter != clients_.end(); ++iter) {
+    ClientInfo* client_info = *iter;
+    // Unregister waits. Wait for already executing callbacks to finish.
+    // Unregister the client process exit wait first and only then unregister
+    // the dump request wait.  The reason is that the OnClientExit callback
+    // also unregisters the dump request wait and such a race (doing the same
+    // unregistration from two threads) is undesirable.
+    client_info->UnregisterProcessExitWait(true);
+    client_info->UnregisterDumpRequestWaitAndBlockUntilNoPending();
 
-    if (server_alive_handle_) {
-      // Release the mutex before closing the handle so that clients requesting
-      // dumps wait for a long time for the server to generate a dump.
-      ReleaseMutex(server_alive_handle_);
-      CloseHandle(server_alive_handle_);
-    }
+    // Destroying the ClientInfo here is safe because all wait operations for
+    // this ClientInfo were unregistered and no pending or running callbacks
+    // for this ClientInfo can possible exist (block_until_no_pending option
+    // was used).
+    delete client_info;
+  }
 
-    if (overlapped_.hEvent) {
-      CloseHandle(overlapped_.hEvent);
-    }
+  if (server_alive_handle_) {
+    // Release the mutex before closing the handle so that clients requesting
+    // dumps wait for a long time for the server to generate a dump.
+    ReleaseMutex(server_alive_handle_);
+    CloseHandle(server_alive_handle_);
   }
 
+  if (overlapped_.hEvent) {
+    CloseHandle(overlapped_.hEvent);
+  }
+  
   DeleteCriticalSection(&sync_);
 }
 
 bool CrashGenerationServer::Start() {
-  AutoCriticalSection lock(&sync_);
-
   if (server_state_ != IPC_SERVER_STATE_UNINITIALIZED) {
     return false;
   }
@@ -455,6 +441,7 @@ void CrashGenerationServer::HandleReadDoneState() {
     return;
   }
 
+  // This is only valid as long as it can be found in the clients_ list
   client_info_ = client_info.release();
 
   // Note that the asynchronous write issued by RespondToClient function
@@ -532,7 +519,32 @@ void CrashGenerationServer::HandleReadingAckState() {
     // The connection handshake with the client is now complete; perform
     // the callback.
     if (connect_callback_) {
-      connect_callback_(connect_context_, client_info_);
+      // Note that there is only a single copy of the ClientInfo of the
+      // currently connected client.  However it is being referenced from
+      // two different places:
+      //  - the client_info_ member
+      //  - the clients_ list
+      // The lifetime of this ClientInfo depends on the lifetime of the
+      // client process - basically it can go away at any time.
+      // However, as long as it is referenced by the clients_ list it
+      // is guaranteed to be valid. Enter the critical section and check
+      // to see whether the client_info_ can be found in the list.
+      // If found, execute the callback and only then leave the critical
+      // section.
+      AutoCriticalSection lock(&sync_);
+
+      bool client_is_still_alive = false;
+      std::list<ClientInfo*>::iterator iter;
+      for (iter = clients_.begin(); iter != clients_.end(); ++iter) {
+        if (client_info_ == *iter) {
+          client_is_still_alive = true;
+          break;
+        }
+      }
+
+      if (client_is_still_alive) {
+        connect_callback_(connect_context_, client_info_);
+      }
     }
   } else {
     // We should never get an I/O incomplete since we should not execute this
@@ -605,16 +617,39 @@ bool CrashGenerationServer::PrepareReply(const ClientInfo& client_info,
     return true;
   }
 
+  // Closing of remote handles (belonging to a different process) can
+  // only be done through DuplicateHandle.
   if (reply->dump_request_handle) {
-    CloseHandle(reply->dump_request_handle);
+    DuplicateHandle(client_info.process_handle(),  // hSourceProcessHandle
+                    reply->dump_request_handle,    // hSourceHandle
+                    NULL,                          // hTargetProcessHandle
+                    0,                             // lpTargetHandle
+                    0,                             // dwDesiredAccess
+                    FALSE,                         // bInheritHandle
+                    DUPLICATE_CLOSE_SOURCE);       // dwOptions
+    reply->dump_request_handle = NULL;
   }
 
   if (reply->dump_generated_handle) {
-    CloseHandle(reply->dump_generated_handle);
+    DuplicateHandle(client_info.process_handle(),  // hSourceProcessHandle
+                    reply->dump_generated_handle,  // hSourceHandle
+                    NULL,                          // hTargetProcessHandle
+                    0,                             // lpTargetHandle
+                    0,                             // dwDesiredAccess
+                    FALSE,                         // bInheritHandle
+                    DUPLICATE_CLOSE_SOURCE);       // dwOptions
+    reply->dump_generated_handle = NULL;
   }
 
   if (reply->server_alive_handle) {
-    CloseHandle(reply->server_alive_handle);
+    DuplicateHandle(client_info.process_handle(),  // hSourceProcessHandle
+                    reply->server_alive_handle,    // hSourceHandle
+                    NULL,                          // hTargetProcessHandle
+                    0,                             // lpTargetHandle
+                    0,                             // dwDesiredAccess
+                    FALSE,                         // bInheritHandle
+                    DUPLICATE_CLOSE_SOURCE);       // dwOptions
+    reply->server_alive_handle = NULL;
   }
 
   return false;
@@ -676,21 +711,15 @@ bool CrashGenerationServer::RespondToClient(ClientInfo* client_info) {
 
   // Takes over ownership of client_info. We MUST return true if AddClient
   // succeeds.
-  if (!AddClient(client_info)) {
-    return false;
-  }
-
-  return true;
+  return AddClient(client_info);
 }
 
 // The server thread servicing the clients runs this method. The method
 // implements the state machine described in ReadMe.txt along with the
 // helper methods HandleXXXState.
 void CrashGenerationServer::HandleConnectionRequest() {
-  AutoCriticalSection lock(&sync_);
-
-  // If we are shutting doen then get into ERROR state, reset the event so more
-  // workers don't run and return immediately.
+  // If the server is shutting down, get into ERROR state, reset the event so
+  // more workers don't run and return immediately.
   if (shutting_down_) {
     server_state_ = IPC_SERVER_STATE_ERROR;
     ResetEvent(overlapped_.hEvent);
@@ -776,6 +805,10 @@ bool CrashGenerationServer::AddClient(ClientInfo* client_info) {
   // New scope to hold the lock for the shortest time.
   {
     AutoCriticalSection lock(&sync_);
+    if (shutting_down_) {
+      // If server is shutting down, don't add new clients
+      return false;
+    }
     clients_.push_back(client_info);
   }
 
@@ -812,56 +845,55 @@ void CALLBACK CrashGenerationServer::OnClientEnd(void* context, BOOLEAN) {
   CrashGenerationServer* crash_server = client_info->crash_server();
   assert(crash_server);
 
-  client_info->UnregisterWaits();
-  InterlockedIncrement(&crash_server->cleanup_item_count_);
-
-  if (!QueueUserWorkItem(CleanupClient, context, WT_EXECUTEDEFAULT)) {
-    InterlockedDecrement(&crash_server->cleanup_item_count_);
-  }
+  crash_server->HandleClientProcessExit(client_info);
 }
 
-// static
-DWORD WINAPI CrashGenerationServer::CleanupClient(void* context) {
-  assert(context);
-  ClientInfo* client_info = reinterpret_cast<ClientInfo*>(context);
+void CrashGenerationServer::HandleClientProcessExit(ClientInfo* client_info) {
+  assert(client_info);
 
-  CrashGenerationServer* crash_server = client_info->crash_server();
-  assert(crash_server);
+  // Must unregister the dump request wait operation and wait for any
+  // dump requests that might be pending to finish before proceeding
+  // with the client_info cleanup.
+  client_info->UnregisterDumpRequestWaitAndBlockUntilNoPending();
 
-  if (crash_server->exit_callback_) {
-    crash_server->exit_callback_(crash_server->exit_context_, client_info);
+  if (exit_callback_) {
+    exit_callback_(exit_context_, client_info);
   }
 
-  crash_server->DoCleanup(client_info);
-
-  InterlockedDecrement(&crash_server->cleanup_item_count_);
-  return 0;
-}
-
-void CrashGenerationServer::DoCleanup(ClientInfo* client_info) {
-  assert(client_info);
-
   // Start a new scope to release lock automatically.
   {
     AutoCriticalSection lock(&sync_);
+    if (shutting_down_) {
+      // The crash generation server is shutting down and as part of the
+      // shutdown process it will delete all clients from the clients_ list.
+      return;
+    }
     clients_.remove(client_info);
   }
 
+  // Explicitly unregister the process exit wait using the non-blocking method.
+  // Otherwise, the destructor will attempt to unregister it using the blocking
+  // method which will lead to a deadlock because it is being called from the
+  // callback of the same wait operation
+  client_info->UnregisterProcessExitWait(false);
+
   delete client_info;
 }
 
 void CrashGenerationServer::HandleDumpRequest(const ClientInfo& client_info) {
+  bool execute_callback = true;
   // Generate the dump only if it's explicitly requested by the
   // server application; otherwise the server might want to generate
   // dump in the callback.
   std::wstring dump_path;
   if (generate_dumps_) {
     if (!GenerateDump(client_info, &dump_path)) {
-      return;
+      // client proccess terminated or some other error
+      execute_callback = false;
     }
   }
 
-  if (dump_callback_) {
+  if (dump_callback_ && execute_callback) {
     std::wstring* ptr_dump_path = (dump_path == L"") ? NULL : &dump_path;
     dump_callback_(dump_context_, &client_info, ptr_dump_path);
   }
diff --git a/src/client/windows/crash_generation/crash_generation_server.h b/src/client/windows/crash_generation/crash_generation_server.h
index ea3776fb..37278484 100644
--- a/src/client/windows/crash_generation/crash_generation_server.h
+++ b/src/client/windows/crash_generation/crash_generation_server.h
@@ -189,11 +189,8 @@ class CrashGenerationServer {
   // Callback for client process exit event.
   static void CALLBACK OnClientEnd(void* context, BOOLEAN timer_or_wait);
 
-  // Releases resources for a client.
-  static DWORD WINAPI CleanupClient(void* context);
-
-  // Cleans up for the given client.
-  void DoCleanup(ClientInfo* client_info);
+  // Handles client process exit.
+  void HandleClientProcessExit(ClientInfo* client_info);
 
   // Adds the given client to the list of registered clients.
   bool AddClient(ClientInfo* client_info);
@@ -216,8 +213,7 @@ class CrashGenerationServer {
   // asynchronous IO operation.
   void EnterStateWhenSignaled(IPCServerState state);
 
-  // Sync object for thread-safe access to the shared list of clients and
-  // the server's state.
+  // Sync object for thread-safe access to the shared list of clients.
   CRITICAL_SECTION sync_;
 
   // List of clients.
@@ -286,10 +282,6 @@ class CrashGenerationServer {
   // Client Info for the client that's connecting to the server.
   ClientInfo* client_info_;
 
-  // Count of clean-up work items that are currently running or are
-  // already queued to run.
-  volatile LONG cleanup_item_count_;
-
   // Disable copy ctor and operator=.
   CrashGenerationServer(const CrashGenerationServer& crash_server);
   CrashGenerationServer& operator=(const CrashGenerationServer& crash_server);
author	ivan.penkov@gmail.com <ivan.penkov@gmail.com@4c0a9323-5329-0410-9bdc-e9ce6186880e>	2012-08-15 22:09:42 +0000
committer	ivan.penkov@gmail.com <ivan.penkov@gmail.com@4c0a9323-5329-0410-9bdc-e9ce6186880e>	2012-08-15 22:09:42 +0000
commit	d7de392b05e015f669012db65395f26daaa53f14 (patch)
tree	cb4e3b3c63a4fb71c26b5611bd8db89e984968a3
parent	Fix narrowing conversion from -1 to size_t (diff)
download	breakpad-d7de392b05e015f669012db65395f26daaa53f14.tar.xz