-
Notifications
You must be signed in to change notification settings - Fork 16
[Bug] Abort kernel execution on assertion failure instead of segfaulting #419
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
705c43d
72b23e3
8f51ba5
f83fca9
41cb051
0006e75
661c9fe
bc92118
0294af7
cda9a94
5b99545
0a30ca2
cae334b
41db2f3
aef2072
d754c78
982c3dc
867998c
87eea2b
fcee8dd
75ccc48
eafc800
58683b4
099498b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1142,7 +1142,11 @@ | |
| auto arguments = create_entry_block_alloca(argument_buffer_size); | ||
|
|
||
| std::vector<llvm::Value *> args; | ||
| args.emplace_back(get_runtime()); | ||
| // On CPU, use the context-aware variant that returns non-zero on failure | ||
| // so we can emit an early return and avoid the subsequent out-of-bounds | ||
| // memory access. On GPU, asm("exit;") kills the thread directly. | ||
| bool use_ctx_variant = arch_is_cpu(current_arch()); | ||
| args.emplace_back(use_ctx_variant ? get_context() : get_runtime()); | ||
| args.emplace_back(builder->CreateIsNotNull(llvm_val[stmt->cond])); | ||
| args.emplace_back(builder->CreateGlobalStringPtr(stmt->text)); | ||
|
|
||
|
|
@@ -1167,7 +1171,17 @@ | |
| args.emplace_back( | ||
| builder->CreateGEP(argument_buffer_size, arguments, {tlctx->get_constant(0), tlctx->get_constant(0)})); | ||
|
|
||
| llvm_val[stmt] = call("quadrants_assert_format", std::move(args)); | ||
| llvm_val[stmt] = call(use_ctx_variant ? "quadrants_assert_format_ctx" : "quadrants_assert_format", std::move(args)); | ||
|
|
||
| if (use_ctx_variant) { | ||
| auto *assert_abort = llvm::BasicBlock::Create(*llvm_context, "assert_abort", func); | ||
| auto *assert_cont = llvm::BasicBlock::Create(*llvm_context, "assert_cont", func); | ||
| auto *failed = builder->CreateICmpNE(llvm_val[stmt], tlctx->get_constant(0)); | ||
| builder->CreateCondBr(failed, assert_abort, assert_cont); | ||
| builder->SetInsertPoint(assert_abort); | ||
| builder->CreateRetVoid(); | ||
| builder->SetInsertPoint(assert_cont); | ||
| } | ||
|
Check warning on line 1184 in quadrants/codegen/llvm/codegen_llvm.cpp
|
||
|
Comment on lines
+1180
to
+1184
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🟡 nit: In visit(AssertStmt) at codegen_llvm.cpp:1182, the assert_abort block emits — Extended reasoning... What the bug is and how it manifests: In builder->SetInsertPoint(assert_abort);
builder->CreateRetVoid(); // bypasses final_blockThis bypasses the task function's The specific code path
Why existing code does not prevent it No invariant enforces that all function exits go through Scope (addressing the refutations) The refutations are correct that the real-world impact is narrow. The triggering conditions are: CPU + How to fix it Replace line 1182: builder->CreateRetVoid();with: builder->CreateBr(final_block);(Or rely on finalize_offloaded_task_function as ReturnStmt does.) Step-by-step proof with a concrete example Configuration:
|
||
| } | ||
|
|
||
| void TaskCodeGenLLVM::visit(SNodeOpStmt *stmt) { | ||
|
|
@@ -2493,6 +2507,12 @@ | |
| current_callable = old_callable; | ||
| } | ||
| llvm::Function *llvm_func = func_map[stmt->func]; | ||
| // FIXME: when cpu_assert_failed fires inside a @qd.real_func callee, the | ||
| // flag is set on new_ctx but never propagated back to the caller's context. | ||
| // Regular @qd.func is AST-inlined so assertions are handled by the caller's | ||
| // visit(AssertStmt) directly. real_func needs: (1) zero-init new_ctx's | ||
| // cpu_assert_failed before the call, (2) post-call check + propagate to | ||
| // get_context(), (3) emit ret void on failure. | ||
| auto *new_ctx = create_entry_block_alloca(get_runtime_type("RuntimeContext")); | ||
| call("RuntimeContext_set_runtime", new_ctx, get_runtime()); | ||
| if (!stmt->func->parameter_list.empty()) { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -5,16 +5,19 @@ namespace quadrants::lang { | |
| namespace cpu { | ||
|
|
||
| void KernelLauncher::launch_offloaded_tasks(LaunchContextBuilder &ctx, const std::vector<TaskFunc> &task_funcs) { | ||
| ctx.get_context().cpu_assert_failed = 0; | ||
| for (auto task : task_funcs) { | ||
| task(&ctx.get_context()); | ||
| if (ctx.get_context().cpu_assert_failed) | ||
| break; | ||
|
Comment on lines
+11
to
+12
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Stopping Useful? React with 👍 / 👎. |
||
| } | ||
|
claude[bot] marked this conversation as resolved.
|
||
| } | ||
|
claude[bot] marked this conversation as resolved.
|
||
|
|
||
| void KernelLauncher::launch_offloaded_tasks_with_do_while(LaunchContextBuilder &ctx, | ||
| const std::vector<TaskFunc> &task_funcs) { | ||
| do { | ||
| launch_offloaded_tasks(ctx, task_funcs); | ||
| } while (*static_cast<int32_t *>(ctx.graph_do_while_flag_dev_ptr) != 0); | ||
| } while (ctx.get_context().cpu_assert_failed == 0 && *static_cast<int32_t *>(ctx.graph_do_while_flag_dev_ptr) != 0); | ||
| } | ||
|
|
||
| void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx) { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -294,6 +294,7 @@ STRUCT_FIELD_ARRAY(PhysicalCoordinates, val); | |
|
|
||
| STRUCT_FIELD(RuntimeContext, runtime); | ||
| STRUCT_FIELD(RuntimeContext, result_buffer) | ||
| STRUCT_FIELD(RuntimeContext, cpu_assert_failed) | ||
|
|
||
| #include "quadrants/runtime/llvm/runtime_module/atomic.h" | ||
|
|
||
|
|
@@ -795,6 +796,25 @@ void quadrants_assert_format(LLVMRuntime *runtime, u1 test, const char *format, | |
| #endif | ||
| } | ||
|
|
||
| // Context-aware variant called by bounds-check assertions in JIT'd code. | ||
| // Returns 1 when the assertion failed (so the codegen can emit an early | ||
| // return), 0 otherwise. This replaces a previous setjmp/longjmp approach | ||
| // that crashed on Windows because JIT'd frames lack SEH unwind tables. | ||
| i32 quadrants_assert_format_ctx(RuntimeContext *context, | ||
| u1 test, | ||
| const char *format, | ||
| int num_arguments, | ||
| uint64 *arguments) { | ||
| quadrants_assert_format(context->runtime, test, format, num_arguments, arguments); | ||
| #if !ARCH_cuda && !ARCH_amdgpu | ||
| if (enable_assert && test == 0) { | ||
| context->cpu_assert_failed = 1; | ||
| return 1; | ||
| } | ||
| #endif | ||
| return 0; | ||
| } | ||
|
|
||
| void quadrants_assert_runtime(LLVMRuntime *runtime, u1 test, const char *msg) { | ||
| quadrants_assert_format(runtime, test, msg, 0, nullptr); | ||
| } | ||
|
|
@@ -1505,9 +1525,13 @@ void cpu_struct_for_block_helper(void *ctx_, int thread_id, int i) { | |
|
|
||
| RuntimeContext this_thread_context = *ctx->context; | ||
| this_thread_context.cpu_thread_id = thread_id; | ||
| this_thread_context.cpu_assert_failed = 0; | ||
|
|
||
| if (lower < upper) { | ||
| (*ctx->task)(&this_thread_context, tls_buffer, &ctx->list->get<Element>(element_id), lower, upper); | ||
| } | ||
| if (this_thread_context.cpu_assert_failed) | ||
| ctx->context->cpu_assert_failed = 1; | ||
| } | ||
|
|
||
| void parallel_struct_for(RuntimeContext *context, | ||
|
Comment on lines
1531
to
1537
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🟡 In — Extended reasoning... What the bug is and how it manifests: The PR adds if (this_thread_context.cpu_assert_failed)
ctx->context->cpu_assert_failed = 1; // or ctx.context->...When multiple parallel workers each observe a failure in their private copy, they can all reach this store simultaneously. Per the C++11 memory model, two or more threads concurrently writing to the same non-atomic memory location is undefined behaviour, even when every writer stores the same value (1). The three affected sites are runtime.cpp:1522 ( The specific code path that triggers it Any CPU kernel where more than one parallel worker encounters an assertion failure during the same dispatch will trigger the race. For example, a Why existing code does not prevent it
Addressing the refutation The refutation correctly notes two mitigating factors: (1) all writers store the same value (1), so there is no torn-write data corruption risk, and (2) the Impact and fix The practical impact is limited to TSan noise and theoretical undefined behaviour on non-x86/non-AArch64 targets. The correct fix is to change |
||
|
|
@@ -1578,26 +1602,41 @@ void cpu_parallel_range_for_task(void *range_context, int thread_id, int task_id | |
| alignas(8) char tls_buffer[ctx.tls_size]; | ||
| #pragma clang diagnostic pop | ||
| auto tls_ptr = &tls_buffer[0]; | ||
| if (ctx.prologue) | ||
| ctx.prologue(ctx.context, tls_ptr); | ||
|
|
||
| RuntimeContext this_thread_context = *ctx.context; | ||
| this_thread_context.cpu_thread_id = thread_id; | ||
| this_thread_context.cpu_assert_failed = 0; | ||
|
|
||
|
Comment on lines
1606
to
+1609
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Parallel helpers copy Useful? React with 👍 / 👎. |
||
| if (ctx.prologue) { | ||
| ctx.prologue(&this_thread_context, tls_ptr); | ||
| if (this_thread_context.cpu_assert_failed) { | ||
| ctx.context->cpu_assert_failed = 1; | ||
| return; | ||
| } | ||
| } | ||
|
|
||
| if (ctx.step == 1) { | ||
|
claude[bot] marked this conversation as resolved.
|
||
| int block_start = ctx.begin + task_id * ctx.block_size; | ||
| int block_end = std::min(block_start + ctx.block_size, ctx.end); | ||
| for (int i = block_start; i < block_end; i++) { | ||
| ctx.body(&this_thread_context, tls_ptr, i); | ||
| if (this_thread_context.cpu_assert_failed) | ||
| break; | ||
| } | ||
| } else if (ctx.step == -1) { | ||
| int block_start = ctx.end - task_id * ctx.block_size; | ||
| int block_end = std::max(ctx.begin, block_start * ctx.block_size); | ||
| int block_end = std::max(ctx.begin, block_start - ctx.block_size); | ||
| for (int i = block_start - 1; i >= block_end; i--) { | ||
|
claude[bot] marked this conversation as resolved.
|
||
| ctx.body(&this_thread_context, tls_ptr, i); | ||
| if (this_thread_context.cpu_assert_failed) | ||
| break; | ||
| } | ||
| } | ||
| if (ctx.epilogue) | ||
| ctx.epilogue(ctx.context, tls_ptr); | ||
|
|
||
| if (!this_thread_context.cpu_assert_failed && ctx.epilogue) | ||
| ctx.epilogue(&this_thread_context, tls_ptr); | ||
| if (this_thread_context.cpu_assert_failed) | ||
| ctx.context->cpu_assert_failed = 1; | ||
| } | ||
|
|
||
| void cpu_parallel_range_for(RuntimeContext *context, | ||
|
|
@@ -1678,17 +1717,28 @@ void cpu_parallel_mesh_for_task(void *range_context, int thread_id, int task_id) | |
|
|
||
| RuntimeContext this_thread_context = *ctx.context; | ||
| this_thread_context.cpu_thread_id = thread_id; | ||
| this_thread_context.cpu_assert_failed = 0; | ||
|
|
||
| int block_start = task_id * ctx.block_size; | ||
| int block_end = std::min(block_start + ctx.block_size, ctx.num_patches); | ||
|
|
||
| for (int idx = block_start; idx < block_end; idx++) { | ||
| if (ctx.prologue) | ||
| ctx.prologue(ctx.context, tls_ptr, idx); | ||
| if (ctx.prologue) { | ||
| ctx.prologue(&this_thread_context, tls_ptr, idx); | ||
| if (this_thread_context.cpu_assert_failed) | ||
| break; | ||
| } | ||
| ctx.body(&this_thread_context, tls_ptr, idx); | ||
| if (ctx.epilogue) | ||
| ctx.epilogue(ctx.context, tls_ptr, idx); | ||
| if (this_thread_context.cpu_assert_failed) | ||
| break; | ||
| if (ctx.epilogue) { | ||
| ctx.epilogue(&this_thread_context, tls_ptr, idx); | ||
| if (this_thread_context.cpu_assert_failed) | ||
| break; | ||
| } | ||
| } | ||
| if (this_thread_context.cpu_assert_failed) | ||
| ctx.context->cpu_assert_failed = 1; | ||
| } | ||
|
|
||
| void cpu_parallel_mesh_for(RuntimeContext *context, | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -135,3 +135,128 @@ def func(): | |
| x[3, 7] = 2 | ||
|
|
||
| func() | ||
|
|
||
|
|
||
| @test_utils.test( | ||
| arch=[qd.cpu], | ||
| require=qd.extension.assertion, | ||
| debug=True, | ||
| check_out_of_bound=True, | ||
| gdb_trigger=False, | ||
| ) | ||
| def test_ndarray_oob_cpu_raises_not_segfaults(): | ||
| """Out-of-bounds ndarray access in a parallel kernel on CPU should raise | ||
| QuadrantsAssertionError instead of segfaulting.""" | ||
| arr = qd.ndarray(dtype=qd.f32, shape=(4,)) | ||
|
|
||
| @qd.kernel | ||
| def write_oob(a: qd.types.ndarray(dtype=qd.f32, ndim=1)): | ||
| for i in range(10): | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't think this test is valid, because this is a parallel loop? We should make it serial, I think? |
||
| a[i] = 1.0 | ||
|
|
||
| with pytest.raises(AssertionError, match=r"Out of bound access"): | ||
| write_oob(arr) | ||
|
|
||
|
|
||
| @test_utils.test( | ||
| arch=[qd.cpu], | ||
| require=qd.extension.assertion, | ||
| debug=True, | ||
| check_out_of_bound=True, | ||
| gdb_trigger=False, | ||
| ) | ||
| def test_ndarray_oob_cpu_small_array(): | ||
| """Reproduces the pattern from the temperature-sensor segfault: a kernel | ||
| accesses a very small (shape-1) array with an index that goes out of | ||
| bounds. Before the cpu_assert_failed fix this would SIGSEGV on CPU in debug mode.""" | ||
| small = qd.ndarray(dtype=qd.f32, shape=(1,)) | ||
| small.fill(42.0) | ||
|
|
||
| @qd.kernel | ||
| def read_oob(a: qd.types.ndarray(dtype=qd.f32, ndim=1)) -> qd.f32: | ||
| return a[5] | ||
|
|
||
| with pytest.raises(AssertionError, match=r"Out of bound access"): | ||
| read_oob(small) | ||
|
|
||
|
|
||
| @test_utils.test( | ||
| arch=[qd.cpu], | ||
| require=qd.extension.assertion, | ||
| debug=True, | ||
| check_out_of_bound=True, | ||
| gdb_trigger=False, | ||
| ) | ||
| def test_ndarray_oob_cpu_2d(): | ||
| """2D ndarray out-of-bounds on CPU should produce a clear error.""" | ||
| arr = qd.ndarray(dtype=qd.f32, shape=(3, 4)) | ||
|
|
||
| @qd.kernel | ||
| def write_oob_2d(a: qd.types.ndarray(dtype=qd.f32, ndim=2)): | ||
| for i in range(1): | ||
| a[10, 0] = 1.0 | ||
|
|
||
| with pytest.raises(AssertionError, match=r"Out of bound access"): | ||
| write_oob_2d(arr) | ||
|
|
||
|
|
||
| @test_utils.test( | ||
| arch=[qd.cpu], | ||
| require=qd.extension.assertion, | ||
| debug=True, | ||
| check_out_of_bound=True, | ||
| gdb_trigger=False, | ||
| ) | ||
| def test_ndarray_inbounds_cpu_still_works(): | ||
| """Verify that the cpu_assert_failed mechanism does not break normal | ||
| in-bounds ndarray access.""" | ||
| n = 8 | ||
| arr = qd.ndarray(dtype=qd.f32, shape=(n,)) | ||
|
|
||
| @qd.kernel | ||
| def fill(a: qd.types.ndarray(dtype=qd.f32, ndim=1)): | ||
| for i in range(n): | ||
| a[i] = qd.cast(i * 10, qd.f32) | ||
|
|
||
| fill(arr) | ||
| result = arr.to_numpy() | ||
| for i in range(n): | ||
| assert result[i] == pytest.approx(i * 10) | ||
|
claude[bot] marked this conversation as resolved.
|
||
|
|
||
|
|
||
| @test_utils.test( | ||
| arch=[qd.cpu], | ||
| require=qd.extension.assertion, | ||
| debug=True, | ||
| check_out_of_bound=True, | ||
| gdb_trigger=False, | ||
| ) | ||
| def test_do_while_oob_does_not_loop_forever(): | ||
| """An OOB assertion inside a do-while kernel must break the outer loop. | ||
|
|
||
| Without the cpu_assert_failed check in the do-while condition, the | ||
| flag-clearing task is skipped (inner break), the outer loop sees | ||
| flag != 0, re-enters launch_offloaded_tasks which resets | ||
| cpu_assert_failed = 0, and re-runs tasks on corrupted data forever. | ||
| """ | ||
| import numpy as np | ||
|
|
||
| arr = qd.ndarray(dtype=qd.f32, shape=(4,)) | ||
| counter = qd.ndarray(dtype=qd.i32, shape=()) | ||
| counter.from_numpy(np.array(10, dtype=np.int32)) | ||
|
|
||
| @qd.kernel(graph=True) | ||
| def oob_in_do_while( | ||
| a: qd.types.ndarray(dtype=qd.f32, ndim=1), | ||
| c: qd.types.ndarray(dtype=qd.i32, ndim=0), | ||
| ): | ||
| while qd.graph_do_while(c): | ||
| # Serial loop so the OOB fires on the shared context immediately, | ||
| # guaranteeing the do-while condition sees it on the same iteration. | ||
| for i in qd.static(range(10)): | ||
| a[i] = 1.0 | ||
| for i in range(1): | ||
| c[()] = c[()] - 1 | ||
|
|
||
| with pytest.raises(AssertionError, match=r"Out of bound access"): | ||
|
claude[bot] marked this conversation as resolved.
|
||
| oob_in_do_while(arr, counter) | ||
|
claude[bot] marked this conversation as resolved.
|
||
Uh oh!
There was an error while loading. Please reload this page.