From 3d85ed8641c0876e4d2e97c1a6d3ab5ae2269020 Mon Sep 17 00:00:00 2001 From: Jeff McJunkin Date: Thu, 11 Jun 2026 18:10:38 -0700 Subject: [PATCH 1/2] kernel32: make FLS values per-thread FlsAlloc/FlsGetValue/FlsSetValue stored values in a single process-global array, so every thread shared one cell per FLS index. Fiber-local storage without fibers is thread-local storage on Windows: each thread is exactly one fiber, and FlsGetValue must return the value the calling thread set. The shared cell breaks msvcr120.dll (VS2013 CRT) thread creation. The CRT stores its per-thread data block (_ptd) -- which carries the _beginthreadex entry point and argument -- via FlsSetValue, and the new thread's _threadstartex/_callthreadstartex re-read it via FlsGetValue. With a process-global cell, two concurrently-starting threads overwrite each other's _ptd and can both start with the same argument. Observed with VS2013 cl.exe: c2.dll's parallel codegen pool creates four worker threads back-to-back; ~1-2% of compiles deadlocked forever. An API trace of a hung process shows two workers waiting on the same per-worker dispatch event while another worker's event has a signal and no waiter: CreateEvent h=7c / 88 / 94 / a0 (per-worker "go" events) t251103 WaitForSingleObject h=88 <- worker 1 t251102 WaitForSingleObject h=88 <- different thread, SAME event boss SetEvent 7c, 88, 94, a0 <- 94 never gets a waiter boss WaitForMultipleObjects({done events}, bWaitAll, INFINITE) -> hangs forever: the orphaned worker never runs its work item, so its "done" event is never set. Fix: keep the index allocation map process-wide (FLS indices are process-wide on Windows) but store the values in a thread_local array; wibo maps guest threads 1:1 onto host threads, so thread_local is exactly per-guest-thread. New threads observe zero-initialized values, matching Windows. Index alloc/free is guarded by a mutex. Caveat (unchanged behavior): FLS destructor callbacks are still never invoked, and freeing then reallocating an index does not clear other threads' stale values for it. The VS2013 CRT allocates its index once per process, so neither is reachable for it. With this fix the cl.exe hang rate dropped from ~1-2% to ~0.1-0.3% over 1000-run stress batches; the remainder was a separate CRITICAL_SECTION issue fixed in the following commit. Co-Authored-By: Claude Fable 5 --- dll/kernel32/fibersapi.cpp | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/dll/kernel32/fibersapi.cpp b/dll/kernel32/fibersapi.cpp index c834bd6..59a97e9 100644 --- a/dll/kernel32/fibersapi.cpp +++ b/dll/kernel32/fibersapi.cpp @@ -5,11 +5,21 @@ #include "errors.h" #include "internal.h" +#include + namespace { +// FLS without fibers is THREAD-local storage on +// Windows (every thread is exactly one fiber); only the index allocation map +// is process-wide. The previous process-global value array made all threads +// share one cell per index, which clobbers msvcr120's per-thread data (_ptd) -- +// including the _beginthreadex entry/argument that _threadstartex re-reads +// through FlsGetValue -- so concurrently-starting threads could duplicate or +// swap their start arguments. constexpr DWORD kMaxFlsValues = 0x100; +std::mutex g_flsMutex; bool g_flsValuesUsed[kMaxFlsValues] = {false}; -LPVOID g_flsValues[kMaxFlsValues] = {nullptr}; +thread_local LPVOID t_flsValues[kMaxFlsValues] = {nullptr}; } // namespace @@ -19,10 +29,11 @@ DWORD WINAPI FlsAlloc(PFLS_CALLBACK_FUNCTION lpCallback) { HOST_CONTEXT_GUARD(); DEBUG_LOG("FlsAlloc(%p)", lpCallback); // If the function succeeds, the return value is an FLS index initialized to zero. + std::lock_guard lk(g_flsMutex); for (DWORD i = 0; i < kMaxFlsValues; i++) { if (g_flsValuesUsed[i] == false) { g_flsValuesUsed[i] = true; - g_flsValues[i] = nullptr; + t_flsValues[i] = nullptr; DEBUG_LOG(" -> %d\n", i); return i; } @@ -35,6 +46,7 @@ DWORD WINAPI FlsAlloc(PFLS_CALLBACK_FUNCTION lpCallback) { BOOL WINAPI FlsFree(DWORD dwFlsIndex) { HOST_CONTEXT_GUARD(); DEBUG_LOG("FlsFree(%u)\n", dwFlsIndex); + std::lock_guard lk(g_flsMutex); if (dwFlsIndex < kMaxFlsValues && g_flsValuesUsed[dwFlsIndex]) { g_flsValuesUsed[dwFlsIndex] = false; return TRUE; @@ -49,7 +61,7 @@ PVOID WINAPI FlsGetValue(DWORD dwFlsIndex) { VERBOSE_LOG("FlsGetValue(%u)\n", dwFlsIndex); PVOID result = nullptr; if (dwFlsIndex < kMaxFlsValues && g_flsValuesUsed[dwFlsIndex]) { - result = g_flsValues[dwFlsIndex]; + result = t_flsValues[dwFlsIndex]; // See https://learn.microsoft.com/en-us/windows/win32/api/fibersapi/nf-fibersapi-flsgetvalue setLastError(ERROR_SUCCESS); } else { @@ -63,7 +75,7 @@ BOOL WINAPI FlsSetValue(DWORD dwFlsIndex, PVOID lpFlsData) { HOST_CONTEXT_GUARD(); VERBOSE_LOG("FlsSetValue(%u, %p)\n", dwFlsIndex, lpFlsData); if (dwFlsIndex < kMaxFlsValues && g_flsValuesUsed[dwFlsIndex]) { - g_flsValues[dwFlsIndex] = lpFlsData; + t_flsValues[dwFlsIndex] = lpFlsData; return TRUE; } else { setLastError(ERROR_INVALID_PARAMETER); From 27adf092e6f8e2880184c073b5ccdd0f40cd7b0f Mon Sep 17 00:00:00 2001 From: Jeff McJunkin Date: Thu, 11 Jun 2026 18:10:38 -0700 Subject: [PATCH 2/2] kernel32: make CRITICAL_SECTION handoff and Leave faithful to Windows Two divergences from the real Windows state machine, both observed breaking VS2013 cl.exe (c2.dll multithreaded codegen): 1. Contended EnterCriticalSection waited until OwningThread was observed to be 0 and then claimed the section with a plain store. Two waiters can both observe 0 after a single Leave and both enter the critical section simultaneously. The fingerprint -- a free section left with LockCount == 0 instead of -1, created when the second "owner"'s Leave failed the ownership check and returned without decrementing -- was captured with gdb in hung c2.dll worker pools. 2. LeaveCriticalSection bailed out when the calling thread did not match OwningThread. Real Windows Leave performs no caller validation at all: it unconditionally decrements RecursionCount/LockCount and releases exactly one waiter; mutual exclusion is carried entirely by the LockCount/semaphore state machine. Guest lock usage that Windows tolerates therefore silently strands wibo's lock state instead. Instrumented runs caught exactly this on c2.dll's work-queue section: leave-not-owner cs= tid=B owner=A lock=1 rec=1 after which the queue's "active worker" counter and the LockCount were each stranded one too high, the queue's drain condition became unsatisfiable, and the compiler deadlocked (~0.1-0.3% of compiles even after the FLS fix). Fix, mirroring the Windows protocol: - Model LockSemaphore as a ticket count: every contended Leave posts exactly one ticket (InterlockedIncrement + WakeByAddressSingle); every blocked Enter consumes exactly one ticket (CAS decrement, WaitOnAddress otherwise) and then owns the section by construction. There is no claim race on OwningThread. TryEnterCriticalSection cannot steal while waiters exist because each waiter's LockCount increment persists until it owns and leaves. - Leave validates nothing, like Windows: if --RecursionCount != 0, decrement LockCount; otherwise clear the owner, decrement, and release one waiter if the result is >= 0. Measured with VS2013 cl.exe under stress (18KB C unit, 25s timeout = hang): ~1-2% hangs before, 0 hangs in 5,100 runs with this commit plus the FLS fix (2x1000 + 2000 at 24-way parallelism, 300 sequential, 800 on two other source units). Compiler output stays byte-identical to native Windows. Side effect: heavily contended compiles got ~6x faster (1000 parallel compiles: 26s -> 4s wall) because the old wait loop woke every waiter to race on each release. Co-Authored-By: Claude Fable 5 --- dll/kernel32/synchapi.cpp | 44 +++++++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/dll/kernel32/synchapi.cpp b/dll/kernel32/synchapi.cpp index c349faa..e418f91 100644 --- a/dll/kernel32/synchapi.cpp +++ b/dll/kernel32/synchapi.cpp @@ -455,18 +455,29 @@ inline void setOwningThread(LPCRITICAL_SECTION crit, DWORD threadId) { } void waitForCriticalSection(LPCRITICAL_SECTION cs) { - auto *sequence = reinterpret_cast(&cs->LockSemaphore); - LONG observed = __atomic_load_n(sequence, __ATOMIC_ACQUIRE); - while (owningThreadId(cs) != 0) { - kernel32::WaitOnAddress(sequence, &observed, sizeof(observed), INFINITE); - observed = __atomic_load_n(sequence, __ATOMIC_ACQUIRE); + // Ticket handoff: Windows hands a contended section to exactly one + // waiter per Leave through a counted semaphore (LockSemaphore). Model the + // semaphore as a ticket count on the LockSemaphore word: Leave posts one + // ticket, each blocked Enter consumes one. The woken waiter owns the + // section by construction (no claim race on OwningThread). + auto *tickets = reinterpret_cast(&cs->LockSemaphore); + for (;;) { + LONG available = __atomic_load_n(tickets, __ATOMIC_ACQUIRE); + if (available > 0) { + if (__atomic_compare_exchange_n(tickets, &available, available - 1, false, __ATOMIC_ACQ_REL, + __ATOMIC_ACQUIRE)) { + return; + } + continue; + } + kernel32::WaitOnAddress(tickets, &available, sizeof(available), INFINITE); } } void signalCriticalSection(LPCRITICAL_SECTION cs) { - auto *sequence = reinterpret_cast(&cs->LockSemaphore); - kernel32::InterlockedIncrement(const_cast(sequence)); - kernel32::WakeByAddressSingle(sequence); + auto *tickets = reinterpret_cast(&cs->LockSemaphore); + kernel32::InterlockedIncrement(const_cast(tickets)); + kernel32::WakeByAddressSingle(tickets); } inline bool trySpinAcquireCriticalSection(LPCRITICAL_SECTION cs, DWORD threadId) { @@ -1086,7 +1097,7 @@ void WINAPI EnterCriticalSection(LPCRITICAL_SECTION lpCriticalSection) { lpCriticalSection->RecursionCount++; return; } - waitForCriticalSection(lpCriticalSection); + waitForCriticalSection(lpCriticalSection); // ticket handoff: we own the section now } setOwningThread(lpCriticalSection, threadId); lpCriticalSection->RecursionCount = 1; @@ -1100,16 +1111,13 @@ void WINAPI LeaveCriticalSection(LPCRITICAL_SECTION lpCriticalSection) { return; } - const DWORD threadId = GetCurrentThreadId(); - if (owningThreadId(lpCriticalSection) != threadId || lpCriticalSection->RecursionCount <= 0) { - DEBUG_LOG("LeaveCriticalSection: thread %u does not own %p (owner=%u, recursion=%ld)\n", threadId, - lpCriticalSection, owningThreadId(lpCriticalSection), - static_cast(lpCriticalSection->RecursionCount)); - return; - } - + // Windows LeaveCriticalSection performs NO + // caller/ownership validation; it unconditionally decrements + // RecursionCount/LockCount and releases one waiter. Guest patterns that + // rely on that (cross-thread leave, reinitialized sections) must behave + // identically here. auto *lockCount = const_cast(&lpCriticalSection->LockCount); - if (--lpCriticalSection->RecursionCount > 0) { + if (--lpCriticalSection->RecursionCount != 0) { kernel32::InterlockedDecrement(lockCount); return; }