diff --git a/dll/kernel32/fibersapi.cpp b/dll/kernel32/fibersapi.cpp index c834bd6..59a97e9 100644 --- a/dll/kernel32/fibersapi.cpp +++ b/dll/kernel32/fibersapi.cpp @@ -5,11 +5,21 @@ #include "errors.h" #include "internal.h" +#include + namespace { +// FLS without fibers is THREAD-local storage on +// Windows (every thread is exactly one fiber); only the index allocation map +// is process-wide. The previous process-global value array made all threads +// share one cell per index, which clobbers msvcr120's per-thread data (_ptd) -- +// including the _beginthreadex entry/argument that _threadstartex re-reads +// through FlsGetValue -- so concurrently-starting threads could duplicate or +// swap their start arguments. constexpr DWORD kMaxFlsValues = 0x100; +std::mutex g_flsMutex; bool g_flsValuesUsed[kMaxFlsValues] = {false}; -LPVOID g_flsValues[kMaxFlsValues] = {nullptr}; +thread_local LPVOID t_flsValues[kMaxFlsValues] = {nullptr}; } // namespace @@ -19,10 +29,11 @@ DWORD WINAPI FlsAlloc(PFLS_CALLBACK_FUNCTION lpCallback) { HOST_CONTEXT_GUARD(); DEBUG_LOG("FlsAlloc(%p)", lpCallback); // If the function succeeds, the return value is an FLS index initialized to zero. + std::lock_guard lk(g_flsMutex); for (DWORD i = 0; i < kMaxFlsValues; i++) { if (g_flsValuesUsed[i] == false) { g_flsValuesUsed[i] = true; - g_flsValues[i] = nullptr; + t_flsValues[i] = nullptr; DEBUG_LOG(" -> %d\n", i); return i; } @@ -35,6 +46,7 @@ DWORD WINAPI FlsAlloc(PFLS_CALLBACK_FUNCTION lpCallback) { BOOL WINAPI FlsFree(DWORD dwFlsIndex) { HOST_CONTEXT_GUARD(); DEBUG_LOG("FlsFree(%u)\n", dwFlsIndex); + std::lock_guard lk(g_flsMutex); if (dwFlsIndex < kMaxFlsValues && g_flsValuesUsed[dwFlsIndex]) { g_flsValuesUsed[dwFlsIndex] = false; return TRUE; @@ -49,7 +61,7 @@ PVOID WINAPI FlsGetValue(DWORD dwFlsIndex) { VERBOSE_LOG("FlsGetValue(%u)\n", dwFlsIndex); PVOID result = nullptr; if (dwFlsIndex < kMaxFlsValues && g_flsValuesUsed[dwFlsIndex]) { - result = g_flsValues[dwFlsIndex]; + result = t_flsValues[dwFlsIndex]; // See https://learn.microsoft.com/en-us/windows/win32/api/fibersapi/nf-fibersapi-flsgetvalue setLastError(ERROR_SUCCESS); } else { @@ -63,7 +75,7 @@ BOOL WINAPI FlsSetValue(DWORD dwFlsIndex, PVOID lpFlsData) { HOST_CONTEXT_GUARD(); VERBOSE_LOG("FlsSetValue(%u, %p)\n", dwFlsIndex, lpFlsData); if (dwFlsIndex < kMaxFlsValues && g_flsValuesUsed[dwFlsIndex]) { - g_flsValues[dwFlsIndex] = lpFlsData; + t_flsValues[dwFlsIndex] = lpFlsData; return TRUE; } else { setLastError(ERROR_INVALID_PARAMETER); diff --git a/dll/kernel32/synchapi.cpp b/dll/kernel32/synchapi.cpp index c349faa..e418f91 100644 --- a/dll/kernel32/synchapi.cpp +++ b/dll/kernel32/synchapi.cpp @@ -455,18 +455,29 @@ inline void setOwningThread(LPCRITICAL_SECTION crit, DWORD threadId) { } void waitForCriticalSection(LPCRITICAL_SECTION cs) { - auto *sequence = reinterpret_cast(&cs->LockSemaphore); - LONG observed = __atomic_load_n(sequence, __ATOMIC_ACQUIRE); - while (owningThreadId(cs) != 0) { - kernel32::WaitOnAddress(sequence, &observed, sizeof(observed), INFINITE); - observed = __atomic_load_n(sequence, __ATOMIC_ACQUIRE); + // Ticket handoff: Windows hands a contended section to exactly one + // waiter per Leave through a counted semaphore (LockSemaphore). Model the + // semaphore as a ticket count on the LockSemaphore word: Leave posts one + // ticket, each blocked Enter consumes one. The woken waiter owns the + // section by construction (no claim race on OwningThread). + auto *tickets = reinterpret_cast(&cs->LockSemaphore); + for (;;) { + LONG available = __atomic_load_n(tickets, __ATOMIC_ACQUIRE); + if (available > 0) { + if (__atomic_compare_exchange_n(tickets, &available, available - 1, false, __ATOMIC_ACQ_REL, + __ATOMIC_ACQUIRE)) { + return; + } + continue; + } + kernel32::WaitOnAddress(tickets, &available, sizeof(available), INFINITE); } } void signalCriticalSection(LPCRITICAL_SECTION cs) { - auto *sequence = reinterpret_cast(&cs->LockSemaphore); - kernel32::InterlockedIncrement(const_cast(sequence)); - kernel32::WakeByAddressSingle(sequence); + auto *tickets = reinterpret_cast(&cs->LockSemaphore); + kernel32::InterlockedIncrement(const_cast(tickets)); + kernel32::WakeByAddressSingle(tickets); } inline bool trySpinAcquireCriticalSection(LPCRITICAL_SECTION cs, DWORD threadId) { @@ -1086,7 +1097,7 @@ void WINAPI EnterCriticalSection(LPCRITICAL_SECTION lpCriticalSection) { lpCriticalSection->RecursionCount++; return; } - waitForCriticalSection(lpCriticalSection); + waitForCriticalSection(lpCriticalSection); // ticket handoff: we own the section now } setOwningThread(lpCriticalSection, threadId); lpCriticalSection->RecursionCount = 1; @@ -1100,16 +1111,13 @@ void WINAPI LeaveCriticalSection(LPCRITICAL_SECTION lpCriticalSection) { return; } - const DWORD threadId = GetCurrentThreadId(); - if (owningThreadId(lpCriticalSection) != threadId || lpCriticalSection->RecursionCount <= 0) { - DEBUG_LOG("LeaveCriticalSection: thread %u does not own %p (owner=%u, recursion=%ld)\n", threadId, - lpCriticalSection, owningThreadId(lpCriticalSection), - static_cast(lpCriticalSection->RecursionCount)); - return; - } - + // Windows LeaveCriticalSection performs NO + // caller/ownership validation; it unconditionally decrements + // RecursionCount/LockCount and releases one waiter. Guest patterns that + // rely on that (cross-thread leave, reinitialized sections) must behave + // identically here. auto *lockCount = const_cast(&lpCriticalSection->LockCount); - if (--lpCriticalSection->RecursionCount > 0) { + if (--lpCriticalSection->RecursionCount != 0) { kernel32::InterlockedDecrement(lockCount); return; }