From 0ac94ba6174d3d5e3725da69d092373d51370181 Mon Sep 17 00:00:00 2001 From: "vapier@chromium.org" Date: Wed, 2 Apr 2014 22:55:12 +0000 Subject: fix races in CrashGenerator::CreateChildCrash The current CreateChildCrash logic is racy when it comes to creating a crash dump for two reasons: The main thread that calls kill() on a different thread is guaranteed the signal will be *queued* when it returns, but not *delivered*. If the kernel doesn't automatically schedule the receiving thread, but instead lets the main thread run to the exit() call, then the signal never triggers a coredump and the whole process simply exits. The main thread is using kill() to try to deliver a signal to a specific thread, but that function is for sending signals to a process. That means the kernel is free to deliver the signal to any thread in the process and not just the one requested. This manifests itself as the pr_pid in the coredump not being the one expected. Instead, we must use tkill() with the tid (which we already took care of gathering) to deliver to a specific thread. These are a lot easier to see on a UMP system as contention is heavier. BUG=chromium:207918 TEST=`dumper_unittest` still passes, and doesn't flake out in a UMP system TEST=`linux_client_unittest` still passes R=benchan@chromium.org Review URL: https://breakpad.appspot.com/1304005 git-svn-id: http://google-breakpad.googlecode.com/svn/trunk@1299 4c0a9323-5329-0410-9bdc-e9ce6186880e --- src/common/linux/tests/crash_generator.cc | 41 ++++++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 6 deletions(-) (limited to 'src/common/linux/tests') diff --git a/src/common/linux/tests/crash_generator.cc b/src/common/linux/tests/crash_generator.cc index 9262c323..f25086dc 100644 --- a/src/common/linux/tests/crash_generator.cc +++ b/src/common/linux/tests/crash_generator.cc @@ -65,15 +65,26 @@ const char* const kProcFilesToCopy[] = { const size_t kNumProcFilesToCopy = sizeof(kProcFilesToCopy) / sizeof(kProcFilesToCopy[0]); +int gettid() { + // Glibc does not provide a wrapper for this. + return syscall(__NR_gettid); +} + +int tkill(pid_t tid, int sig) { + // Glibc does not provide a wrapper for this. + return syscall(__NR_tkill, tid, sig); +} + // Core file size limit set to 1 MB, which is big enough for test purposes. const rlim_t kCoreSizeLimit = 1024 * 1024; void *thread_function(void *data) { ThreadData* thread_data = reinterpret_cast(data); - volatile pid_t thread_id = syscall(__NR_gettid); + volatile pid_t thread_id = gettid(); *(thread_data->thread_id_ptr) = thread_id; int result = pthread_barrier_wait(thread_data->barrier); if (result != 0 && result != PTHREAD_BARRIER_SERIAL_THREAD) { + perror("Failed to wait for sync barrier"); exit(1); } while (true) { @@ -160,11 +171,16 @@ bool CrashGenerator::SetCoreFileSizeLimit(rlim_t limit) const { bool CrashGenerator::CreateChildCrash( unsigned num_threads, unsigned crash_thread, int crash_signal, pid_t* child_pid) { - if (num_threads == 0 || crash_thread >= num_threads) + if (num_threads == 0 || crash_thread >= num_threads) { + fprintf(stderr, "CrashGenerator: Invalid thread counts; num_threads=%u" + " crash_thread=%u\n", num_threads, crash_thread); return false; + } - if (!MapSharedMemory(num_threads * sizeof(pid_t))) + if (!MapSharedMemory(num_threads * sizeof(pid_t))) { + perror("CrashGenerator: Unable to map shared memory"); return false; + } pid_t pid = fork(); if (pid == 0) { @@ -183,9 +199,22 @@ bool CrashGenerator::CreateChildCrash( fprintf(stderr, "CrashGenerator: Failed to copy proc files\n"); exit(1); } - if (kill(*GetThreadIdPointer(crash_thread), crash_signal) == -1) { + if (tkill(*GetThreadIdPointer(crash_thread), crash_signal) == -1) { perror("CrashGenerator: Failed to kill thread by signal"); + } else { + // At this point, we've queued the signal for delivery, but there's no + // guarantee when it'll be delivered. We don't want the main thread to + // race and exit before the thread we signaled is processed. So sleep + // long enough that we won't flake even under fairly high load. + // TODO: See if we can't be a bit more deterministic. There doesn't + // seem to be an API to check on signal delivery status, so we can't + // really poll and wait for the kernel to declare the signal has been + // delivered. If it has, and things worked, we'd be killed, so the + // sleep length doesn't really matter. + sleep(10 * 60); } + } else { + perror("CrashGenerator: Failed to set core limit"); } exit(1); } else if (pid == -1) { @@ -200,8 +229,8 @@ bool CrashGenerator::CreateChildCrash( } if (!WIFSIGNALED(status) || WTERMSIG(status) != crash_signal) { fprintf(stderr, "CrashGenerator: Child process not killed by the expected signal\n" - " exit status=0x%x signaled=%s sig=%d expected=%d\n", - status, WIFSIGNALED(status) ? "true" : "false", + " exit status=0x%x pid=%u signaled=%s sig=%d expected=%d\n", + status, pid, WIFSIGNALED(status) ? "true" : "false", WTERMSIG(status), crash_signal); return false; } -- cgit v1.2.1