15 changes: 7 additions & 8 deletions ddprof-lib/src/main/cpp/guards.cpp
@@ -22,11 +22,11 @@
// Static bitmap storage for fallback cases
uint64_t CriticalSection::_fallback_bitmap[CriticalSection::FALLBACK_BITMAP_WORDS] = {};

CriticalSection::CriticalSection() : _entered(false), _using_fallback(false), _word_index(0), _bit_mask(0) {
ProfiledThread* current = ProfiledThread::currentSignalSafe();
if (current != nullptr) {
CriticalSection::CriticalSection() : _entered(false), _using_fallback(false), _word_index(0), _bit_mask(0), _thread_ptr(nullptr) {
_thread_ptr = ProfiledThread::currentSignalSafe();
if (_thread_ptr != nullptr) {
// Primary path: Use ProfiledThread storage (fast and memory-efficient)
_entered = current->tryEnterCriticalSection();
_entered = _thread_ptr->tryEnterCriticalSection();
} else {
// Fallback path: Use hash-based bitmap for stress tests and edge cases
_using_fallback = true;
@@ -51,10 +51,9 @@ CriticalSection::~CriticalSection() {
// Use RELEASE ordering to ensure protected data writes are visible before releasing
__atomic_fetch_and(&_fallback_bitmap[_word_index], ~_bit_mask, __ATOMIC_RELEASE);
} else {
// Release ProfiledThread flag
ProfiledThread* current = ProfiledThread::currentSignalSafe();
if (current != nullptr) {
current->exitCriticalSection();
// Release ProfiledThread flag using the pointer captured at construction
if (_thread_ptr != nullptr) {
_thread_ptr->exitCriticalSection();
}
}
}
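For readers skimming the diff: the whole fix is the RAII rule of capturing the owner at construction instead of re-resolving it at destruction. A minimal self-contained sketch of the pattern (names like `Thread` and `g_tls` are illustrative stand-ins, not this library's API):

```cpp
#include <cstdio>

// Illustrative stand-in for ProfiledThread and its critical-section flag.
struct Thread {
    bool in_critical_section = false;
    bool tryEnter() {
        if (in_critical_section) return false;
        in_critical_section = true;
        return true;
    }
    void exit() { in_critical_section = false; }
};

static thread_local Thread* g_tls = nullptr;  // stand-in for pthread TLS

class Guard {
    Thread* _owner;        // captured once, at construction
    bool _entered = false;
public:
    Guard() : _owner(g_tls) {
        if (_owner != nullptr) _entered = _owner->tryEnter();
    }
    ~Guard() {
        // The old bug: re-reading g_tls here returns nullptr if TLS was
        // cleared inside the guarded scope, so exit() was silently skipped
        // and the flag stayed stuck. The captured pointer avoids that.
        if (_entered && _owner != nullptr) _owner->exit();
    }
};

int main() {
    Thread t;
    g_tls = &t;
    {
        Guard g;
        g_tls = nullptr;   // simulate release() clearing TLS mid-scope
    }                      // dtor still exits via the captured pointer
    g_tls = &t;
    std::printf("re-enter ok: %d\n", t.tryEnter());  // prints 1 with the fix
    return 0;
}
```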
3 changes: 3 additions & 0 deletions ddprof-lib/src/main/cpp/guards.h
@@ -22,6 +22,8 @@
#include <signal.h>
#include <pthread.h>

class ProfiledThread;

/**
* Race-free critical section using atomic compare-and-swap.
*
@@ -67,6 +69,7 @@ class CriticalSection {
bool _using_fallback; // Track which storage mechanism we're using
uint32_t _word_index; // For fallback bitmap cleanup
uint64_t _bit_mask; // For fallback bitmap cleanup
ProfiledThread* _thread_ptr; // ProfiledThread captured at construction

public:
CriticalSection();
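The fallback path referenced above comes down to atomic bit operations on a static bitmap. A hedged sketch of what enter/exit might look like under that scheme; the word/bit hashing and the bitmap capacity are assumptions for illustration (only the RELEASE-ordered `__atomic_fetch_and` on exit is visible in this diff):

```cpp
#include <cstdint>

static constexpr uint32_t FALLBACK_BITMAP_WORDS = 64;   // capacity assumption
static uint64_t fallback_bitmap[FALLBACK_BITMAP_WORDS] = {};

// Hypothetical hash of the OS thread id onto one bit of the bitmap.
static void slotFor(uint64_t tid, uint32_t& word, uint64_t& mask) {
    uint64_t h = tid * 0x9E3779B97F4A7C15ULL;            // Fibonacci hashing
    word = (uint32_t)(h % FALLBACK_BITMAP_WORDS);
    mask = 1ULL << ((h >> 8) & 63);
}

static bool tryEnterFallback(uint32_t word, uint64_t mask) {
    // Test-and-set the bit; ACQUIRE so guarded reads cannot float above entry.
    uint64_t prev = __atomic_fetch_or(&fallback_bitmap[word], mask, __ATOMIC_ACQUIRE);
    return (prev & mask) == 0;   // false: re-entry or a hash collision
}

static void exitFallback(uint32_t word, uint64_t mask) {
    // RELEASE so writes inside the section are visible before the bit clears;
    // this matches the __atomic_fetch_and in the destructor above.
    __atomic_fetch_and(&fallback_bitmap[word], ~mask, __ATOMIC_RELEASE);
}
```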
6 changes: 3 additions & 3 deletions ddprof-lib/src/main/cpp/otel_context.h
@@ -69,9 +69,9 @@ DLLEXPORT extern thread_local OtelThreadContextRecord* otel_thread_ctx_v1;
* Each thread gets a pre-allocated OtelThreadContextRecord cached in
* ProfiledThread. The TLS pointer otel_thread_ctx_v1 is set permanently
* to the record during thread initialization; detach/attach (context writes)
* never touch it. It is nulled on thread exit (in releaseFromBuffer) to
* prevent external profilers from dereferencing a recycled record.
* Context activity is indicated solely by the valid flag in the record.
* never touch it. Readers must not assume the TLS pointer is cleared during
* teardown; record liveness is determined by the owning thread lifetime and
* the valid flag in the record.
*
* Signal safety: signal handlers must never access
* otel_thread_ctx_v1 directly (TLS lazy init can deadlock
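The new wording moves the liveness contract onto readers. A sketch of a reader honoring that contract; the record layout (`valid`, `span_id`) is assumed for illustration, since the real OtelThreadContextRecord is defined outside this diff:

```cpp
#include <cstdint>

// Assumed record layout, for illustration only.
struct OtelThreadContextRecordSketch {
    uint64_t valid;     // nonzero only while a context is active
    uint64_t span_id;
};

// A reader that honors the documented contract: liveness comes from the owning
// thread still being alive plus the valid flag, never from the TLS pointer
// having been nulled at teardown. (Signal handlers additionally must not read
// otel_thread_ctx_v1 directly, per the comment above.)
bool readSpanId(const OtelThreadContextRecordSketch* rec, uint64_t& out) {
    if (rec == nullptr) return false;
    // ACQUIRE pairs with the writer's RELEASE on valid, so the payload read
    // below cannot be reordered before the flag check.
    if (__atomic_load_n(&rec->valid, __ATOMIC_ACQUIRE) == 0) return false;
    out = rec->span_id;
    return true;
}
```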
2 changes: 1 addition & 1 deletion ddprof-lib/src/main/cpp/profiler.cpp
@@ -86,7 +86,7 @@ void Profiler::onThreadStart(jvmtiEnv *jvmti, JNIEnv *jni, jthread thread) {
}

void Profiler::onThreadEnd(jvmtiEnv *jvmti, JNIEnv *jni, jthread thread) {
ProfiledThread *current = ProfiledThread::current();
ProfiledThread *current = ProfiledThread::currentSignalSafe();
int tid = -1;

if (current != nullptr) {
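The swap to currentSignalSafe() matters because onThreadEnd runs during thread teardown. A small model of the two lookups: currentSignalSafe()'s body mirrors the one shown in thread.cpp below, while current()'s lazy-init branch is an assumption for illustration (its real body is not part of this diff):

```cpp
#include <pthread.h>

static pthread_key_t tls_key;
static bool tls_key_initialized = false;  // set true after pthread_key_create

struct ProfiledThreadModel { int tid; };

ProfiledThreadModel* currentSignalSafe() {
    // Never allocates: a guarded pthread_getspecific, nothing else. Safe in
    // signal handlers and in end-of-thread callbacks where (re)initializing
    // TLS would be unsafe or pointless.
    return __atomic_load_n(&tls_key_initialized, __ATOMIC_ACQUIRE)
               ? (ProfiledThreadModel*)pthread_getspecific(tls_key)
               : nullptr;
}

ProfiledThreadModel* current() {
    ProfiledThreadModel* t = currentSignalSafe();
    if (t == nullptr && __atomic_load_n(&tls_key_initialized, __ATOMIC_ACQUIRE)) {
        // Assumed lazy-init branch: exactly what a dying thread must avoid,
        // since it would plant fresh TLS after the destructor already ran.
        t = new ProfiledThreadModel{0};
        pthread_setspecific(tls_key, t);
    }
    return t;
}
```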
122 changes: 5 additions & 117 deletions ddprof-lib/src/main/cpp/thread.cpp
@@ -5,18 +5,14 @@

#include "thread.h"
#include "context_api.h"
#include "guards.h"
#include "otel_context.h"
#include "os.h"
#include <cstring>
#include <time.h>

pthread_key_t ProfiledThread::_tls_key;
bool ProfiledThread::_tls_key_initialized = false;
int ProfiledThread::_buffer_size = 0;
volatile int ProfiledThread::_running_buffer_pos = 0;
ProfiledThread** ProfiledThread::_buffer = nullptr;
volatile int ProfiledThread::_free_stack_top = -1;
int* ProfiledThread::_free_slots = nullptr;

void ProfiledThread::initTLSKey() {
static pthread_once_t tls_initialized = PTHREAD_ONCE_INIT;
@@ -35,16 +31,8 @@ void ProfiledThread::doInitTLSKey() {
inline void ProfiledThread::freeKey(void *key) {
ProfiledThread *tls_ref = (ProfiledThread *)(key);
if (tls_ref != NULL) {
// Check if this is a buffer-allocated thread (has valid buffer_pos)
bool is_buffer_allocated = (tls_ref->_buffer_pos >= 0);

if (is_buffer_allocated) {
// Buffer-allocated: reset and return to buffer for reuse
tls_ref->releaseFromBuffer();
} else {
// Non-buffer (JVMTI-allocated): delete the instance
delete tls_ref;
}
SignalBlocker blocker;
delete tls_ref;
}
}

@@ … @@ void ProfiledThread::release() {
pthread_key_t key = _tls_key;
ProfiledThread *tls = (ProfiledThread *)pthread_getspecific(key);
if (tls != NULL) {
SignalBlocker blocker;
pthread_setspecific(key, NULL);

// Check if this is a buffer-allocated thread (has valid buffer_pos)
bool is_buffer_allocated = (tls->_buffer_pos >= 0);

tls->releaseFromBuffer();

// Only delete non-buffer threads (e.g., created via forTid())
if (!is_buffer_allocated) {
pthread_setspecific(key, NULL);
delete tls;
}
// Buffer-allocated threads are kept for reuse and will be deleted in cleanupBuffer()
delete tls;
}
}

void ProfiledThread::releaseFromBuffer() {
if (_buffer_pos >= 0 && _buffer != nullptr && _buffer_pos < _buffer_size) {
// Reset the thread object for reuse (clear thread-specific data)
_tid = 0;
_pc = 0;
_sp = 0;
_span_id = 0;
_crash_depth = 0;
_cpu_epoch = 0;
_wall_epoch = 0;
_call_trace_id = 0;
_recording_epoch = 0;
_filter_slot_id = -1;
_init_window = 0;
_unwind_failures.clear();

// Null the TLS pointer so external profilers that dereference the pointer
// (rather than just checking the valid flag) don't access a recycled record.
// This is distinct from the valid flag: valid guards the OTEP write protocol
// between the Java writer and native reader, but does not protect recycling.
__atomic_store_n(&otel_thread_ctx_v1, (OtelThreadContextRecord*)nullptr, __ATOMIC_RELEASE);
// Mark uninitialized BEFORE zeroing the record, so that our own signal handlers
// short-circuit before reading partially-zeroed data during the memset below.
// (The valid flag is zeroed by memset too, but _otel_ctx_initialized guards
// the isContextInitialized() check which runs before any record access.)
// Use __ATOMIC_RELEASE so the compiler cannot reorder this store after the
// memset on ARM with aggressive optimizations.
__atomic_store_n(&_otel_ctx_initialized, false, __ATOMIC_RELEASE);
clearOtelSidecar();
memset(&_otel_ctx_record, 0, sizeof(_otel_ctx_record));

// Put this ProfiledThread object back in the buffer for reuse
_buffer[_buffer_pos] = this;

// Push this slot back to the free list for reuse
pushFreeSlot(_buffer_pos);

_buffer_pos = -1;
}
}

int ProfiledThread::currentTid() {
ProfiledThread *tls = current();
@@ -155,56 +93,6 @@ ProfiledThread *ProfiledThread::currentSignalSafe() {
return __atomic_load_n(&_tls_key_initialized, __ATOMIC_ACQUIRE) ? (ProfiledThread *)pthread_getspecific(_tls_key) : nullptr;
}

int ProfiledThread::popFreeSlot() {
int current_top;
int new_top;

do {
current_top = __atomic_load_n(&_free_stack_top, __ATOMIC_ACQUIRE);
if (current_top == -1) {
return -1; // Stack is empty
}
new_top = _free_slots[current_top];
} while (!__atomic_compare_exchange_n(&_free_stack_top, &current_top, new_top,
true, __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE));

return current_top;
}

void ProfiledThread::pushFreeSlot(int slot_index) {
if (slot_index < 0 || slot_index >= _buffer_size || _free_slots == nullptr) {
return; // Invalid slot index
}

int current_top;
do {
current_top = __atomic_load_n(&_free_stack_top, __ATOMIC_ACQUIRE);
_free_slots[slot_index] = current_top;
} while (!__atomic_compare_exchange_n(&_free_stack_top, &current_top, slot_index,
true, __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE));
}

void ProfiledThread::cleanupBuffer() {
if (_buffer != nullptr) {
for (int i = 0; i < _buffer_size; i++) {
if (_buffer[i] != nullptr) {
delete _buffer[i];
_buffer[i] = nullptr;
}
}
free(_buffer);
_buffer = nullptr;
}

if (_free_slots != nullptr) {
free(_free_slots);
_free_slots = nullptr;
}

_buffer_size = 0;
_running_buffer_pos = 0;
_free_stack_top = -1;
}

Context ProfiledThread::snapshotContext(size_t numAttrs) {
Context ctx = {};
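With the slot-recycling machinery deleted, teardown reduces to: block signals, detach the TLS slot, delete the record. A sketch of that protocol, assuming SignalBlocker is an RAII wrapper around pthread_sigmask (it lives in os.h, which this diff does not show, so the shape here is a guess):

```cpp
#include <pthread.h>
#include <signal.h>

// Assumed shape of SignalBlocker; the real class may differ in detail.
class SignalBlockerSketch {
    sigset_t _saved;
public:
    SignalBlockerSketch() {
        sigset_t all;
        sigfillset(&all);
        pthread_sigmask(SIG_SETMASK, &all, &_saved);    // block everything
    }
    ~SignalBlockerSketch() {
        pthread_sigmask(SIG_SETMASK, &_saved, nullptr); // restore
    }
};

struct ThreadRecord { int tid; };
static pthread_key_t tls_key;

// Simplified teardown: no free-slot stack, no reuse; just delete under blocked
// signals. Blocking first closes the window where a profiling signal could
// land mid-destruction and observe a dying record.
static void destroyRecord(void* p) {
    if (p != nullptr) {
        SignalBlockerSketch blocker;
        delete (ThreadRecord*)p;
    }
}

static void initKey() {
    // destroyRecord runs automatically at thread exit for non-null values.
    pthread_key_create(&tls_key, destroyRecord);
}
```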
33 changes: 11 additions & 22 deletions ddprof-lib/src/main/cpp/thread.h
@@ -38,30 +38,15 @@ class ProfiledThread : public ThreadLocalData {
static constexpr u32 CRASH_HANDLER_NESTING_LIMIT = 5;
static pthread_key_t _tls_key;
static bool _tls_key_initialized;
static int _buffer_size;
static volatile int _running_buffer_pos;
static ProfiledThread** _buffer;

// Free slot recycling - lock-free stack of available buffer slots
// Note: Using plain int with GCC atomic builtins instead of std::atomic
// because std::atomic is not guaranteed async-signal-safe (may use mutexes)
static volatile int _free_stack_top;
static int* _free_slots; // Array to store free slot indices

static void initTLSKey();
static void doInitTLSKey();
static inline void freeKey(void *key);
static void cleanupBuffer();

// Free slot management - lock-free operations
static int popFreeSlot(); // Returns -1 if no free slots
static void pushFreeSlot(int slot_index);

u64 _pc;
u64 _sp;
u64 _span_id; // Wall-clock collapsing cache: last-seen span ID (not a context store — read from _otel_ctx_record on each signal, cached here to detect "same as last time")
volatile u32 _crash_depth;
int _buffer_pos;
int _tid;
u32 _cpu_epoch;
u32 _wall_epoch;
@@ -85,22 +70,26 @@
alignas(8) u32 _otel_tag_encodings[DD_TAGS_CAPACITY];
u64 _otel_local_root_span_id;

ProfiledThread(int buffer_pos, int tid)
: ThreadLocalData(), _pc(0), _sp(0), _span_id(0), _crash_depth(0), _buffer_pos(buffer_pos), _tid(tid), _cpu_epoch(0),
ProfiledThread(int tid)
: ThreadLocalData(), _pc(0), _sp(0), _span_id(0), _crash_depth(0), _tid(tid), _cpu_epoch(0),
_wall_epoch(0), _call_trace_id(0), _recording_epoch(0), _misc_flags(0), _filter_slot_id(-1), _init_window(0),
_otel_ctx_initialized(false), _crash_protection_active(false),
_otel_ctx_record{}, _otel_tag_encodings{}, _otel_local_root_span_id(0) {};

virtual ~ProfiledThread() { }
void releaseFromBuffer();
public:
static ProfiledThread *forTid(int tid) { return new ProfiledThread(-1, tid); }
static ProfiledThread *inBuffer(int buffer_pos) {
return new ProfiledThread(buffer_pos, 0);
}
static ProfiledThread *forTid(int tid) { return new ProfiledThread(tid); }

static void initCurrentThread();
static void release();
// Clears TLS without deleting the ProfiledThread. For unit tests only:
// simulates the moment inside release() after pthread_setspecific(NULL) but
// before delete, which is the race window the _thread_ptr fix covers.
static void clearCurrentThreadTLS() {
if (__atomic_load_n(&_tls_key_initialized, __ATOMIC_ACQUIRE)) {
pthread_setspecific(_tls_key, nullptr);
}
}

static ProfiledThread *current();
static ProfiledThread *currentSignalSafe(); // Signal-safe version that never allocates
58 changes: 56 additions & 2 deletions ddprof-lib/src/test/cpp/ddprof_ut.cpp
@@ -4,8 +4,10 @@
#include "buffers.h"
#include "context.h"
#include "counters.h"
#include "guards.h"
#include "mutex.h"
#include "os.h"
#include "thread.h"
#include "unwindStats.h"
#include "threadFilter.h"
#include "threadInfo.h"
Expand All @@ -16,8 +18,8 @@
#include <thread>
#include <vector>
#include <algorithm> // For std::sort
#include <thread>
#include <atomic>
#include <sys/wait.h>
#include <unistd.h>

// Test name for crash handler
static constexpr char DDPROF_TEST_NAME[] = "DdprofTest";
@@ -357,6 +359,58 @@ static DdprofGlobalSetup ddprof_global_setup;
EXPECT_TRUE(globalCount > 0);
}

// Deterministic regression for the CriticalSection::_thread_ptr capture fix.
//
// Bug: the old destructor re-fetched currentSignalSafe() at destruction time.
// If TLS was cleared between the ctor and dtor (e.g. release() called inside
// the CS scope as it was in the old onThreadEnd), the re-fetch returned nullptr
// and exitCriticalSection() was silently skipped, leaving _in_critical_section
// stuck true so no subsequent CS could enter on that ProfiledThread.
//
// Fix: the ctor captures _thread_ptr once; the dtor uses that pointer regardless
// of TLS state at destruction time.
//
// This test exercises the exact race window by calling clearCurrentThreadTLS()
// inside a live CriticalSection scope, then verifying the flag is cleared.
// Without the fix tryEnterCriticalSection() returns false (exit 5).
TEST(ProfiledThreadTeardown, CriticalSectionExitsEvenAfterTLSCleared) {
pid_t pid = fork();
ASSERT_NE(-1, pid);

if (pid == 0) {
// ---- child process (fork isolates TLS from other tests) ----
ProfiledThread::initCurrentThread();
ProfiledThread* pt = ProfiledThread::currentSignalSafe();
if (pt == nullptr) _exit(2);

// Baseline: entering critical section works.
if (!pt->tryEnterCriticalSection()) _exit(3);
pt->exitCriticalSection();

// Simulate the race: CriticalSection is constructed while TLS is valid
// (so _thread_ptr is captured), then TLS is cleared before the dtor runs.
{
CriticalSection cs;
if (!cs.entered()) _exit(4);
// Mimics the moment inside release() after pthread_setspecific(NULL).
ProfiledThread::clearCurrentThreadTLS();
} // dtor: old code → re-fetch nullptr → skip exit → _in_critical_section stuck
// new code → _thread_ptr captured at ctor → exitCriticalSection called

// _in_critical_section must be false; if the bug is present this fails.
if (!pt->tryEnterCriticalSection()) _exit(5);
pt->exitCriticalSection();

_exit(0); // destructor is private; OS reclaims memory on exit.
}

// ---- parent: reap child and check exit code ----
int status = 0;
ASSERT_NE(-1, waitpid(pid, &status, 0));
ASSERT_TRUE(WIFEXITED(status)) << "child crashed (signal " << WTERMSIG(status) << ")";
ASSERT_EQ(0, WEXITSTATUS(status)) << "child exited with code " << WEXITSTATUS(status);
}

int main(int argc, char **argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();