Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
705c43d
[CPU] Abort kernel execution on assertion failure instead of segfaulting
hughperkins Mar 16, 2026
72b23e3
Merge branch 'main' into hp/cpu-longjmp-after-assert
hughperkins Mar 16, 2026
8f51ba5
Fix pre-commit lint: clang-format and unused import
hughperkins Mar 16, 2026
f83fca9
Replace setjmp/longjmp with flag-based early return for CPU assert abort
hughperkins Mar 16, 2026
41cb051
Merge branch 'main' into hp/cpu-longjmp-after-assert
hughperkins Mar 16, 2026
0006e75
Merge remote-tracking branch 'origin/main' into hp/cpu-longjmp-after-…
hughperkins Mar 16, 2026
661c9fe
Merge remote-tracking branch 'origin/main' into hp/cpu-longjmp-after-…
hughperkins Mar 16, 2026
bc92118
Merge remote-tracking branch 'origin/main' into hp/cpu-longjmp-after-…
hughperkins Apr 19, 2026
0294af7
style: apply clang-format
hughperkins Apr 20, 2026
cda9a94
Merge branch 'main' into hp/cpu-longjmp-after-assert
hughperkins Apr 21, 2026
5b99545
fix: check cpu_assert_failed in do-while loop condition
hughperkins Apr 22, 2026
0a30ca2
fix(test): use serial loop in do-while OOB test
hughperkins Apr 22, 2026
cae334b
fix: use per-thread context for prologue/epilogue in parallel task he…
hughperkins Apr 22, 2026
41db2f3
fix: propagate cpu_assert_failed from @qd.func callees to caller
hughperkins Apr 22, 2026
aef2072
fix: propagate cpu_assert_failed from per-thread copies to shared con…
hughperkins Apr 22, 2026
d754c78
docs: update test docstrings to reflect cpu_assert_failed mechanism
hughperkins Apr 22, 2026
982c3dc
fix: zero-initialize cpu_assert_failed in FuncCallStmt's new_ctx
hughperkins Apr 22, 2026
867998c
test: add test for @qd.func OOB assertion propagation
hughperkins Apr 22, 2026
87eea2b
fix: use runtime helpers instead of raw GEP for cpu_assert_failed access
hughperkins Apr 22, 2026
fcee8dd
fix: propagate cpu_assert_failed before early return in range_for pro…
hughperkins Apr 22, 2026
75ccc48
docs: fix stale comment about parallel range_for write-back
hughperkins Apr 22, 2026
eafc800
Merge branch 'main' into hp/cpu-longjmp-after-assert
hughperkins Apr 22, 2026
58683b4
Merge branch 'main' into hp/cpu-longjmp-after-assert
hughperkins Apr 22, 2026
099498b
revert: remove real_func-only cpu_assert_failed propagation from Func…
hughperkins Apr 22, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 22 additions & 2 deletions quadrants/codegen/llvm/codegen_llvm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1142,7 +1142,11 @@
auto arguments = create_entry_block_alloca(argument_buffer_size);

std::vector<llvm::Value *> args;
args.emplace_back(get_runtime());
// On CPU, use the context-aware variant that returns non-zero on failure
// so we can emit an early return and avoid the subsequent out-of-bounds
// memory access. On GPU, asm("exit;") kills the thread directly.
bool use_ctx_variant = arch_is_cpu(current_arch());
args.emplace_back(use_ctx_variant ? get_context() : get_runtime());
args.emplace_back(builder->CreateIsNotNull(llvm_val[stmt->cond]));
args.emplace_back(builder->CreateGlobalStringPtr(stmt->text));

Expand All @@ -1167,7 +1171,17 @@
args.emplace_back(
builder->CreateGEP(argument_buffer_size, arguments, {tlctx->get_constant(0), tlctx->get_constant(0)}));

llvm_val[stmt] = call("quadrants_assert_format", std::move(args));
llvm_val[stmt] = call(use_ctx_variant ? "quadrants_assert_format_ctx" : "quadrants_assert_format", std::move(args));

if (use_ctx_variant) {
auto *assert_abort = llvm::BasicBlock::Create(*llvm_context, "assert_abort", func);
auto *assert_cont = llvm::BasicBlock::Create(*llvm_context, "assert_cont", func);
auto *failed = builder->CreateICmpNE(llvm_val[stmt], tlctx->get_constant(0));
builder->CreateCondBr(failed, assert_abort, assert_cont);
builder->SetInsertPoint(assert_abort);
builder->CreateRetVoid();
builder->SetInsertPoint(assert_cont);
}

Check warning on line 1184 in quadrants/codegen/llvm/codegen_llvm.cpp

View check run for this annotation

Claude / Claude Code Review

AssertStmt skips final_block, missing profiler_stop on assert

nit: In visit(AssertStmt) at codegen_llvm.cpp:1182, the assert_abort block emits `builder->CreateRetVoid()` directly instead of `builder->CreateBr(final_block)`. When compile_config.kernel_profiler is enabled on CPU, codegen_cpu.cpp:172-176 places `LLVMRuntime_profiler_stop` into final_block, so an assert-aborted serial task returns without calling it and the failing task's profiling record is silently dropped. Minor inconsistency with visit(ReturnStmt) at line 1116 which correctly uses CreateBr
Comment thread
claude[bot] marked this conversation as resolved.
Comment on lines +1180 to +1184
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 nit: In visit(AssertStmt) at codegen_llvm.cpp:1182, the assert_abort block emits builder->CreateRetVoid() directly instead of builder->CreateBr(final_block). When compile_config.kernel_profiler is enabled on CPU, codegen_cpu.cpp:172-176 places LLVMRuntime_profiler_stop into final_block, so an assert-aborted serial task returns without calling it and the failing task's profiling record is silently dropped. Minor inconsistency with visit(ReturnStmt) at line 1116 which correctly uses CreateBr(final_block); fix is a one-line change to match that pattern.

Extended reasoning...

What the bug is and how it manifests

In visit(AssertStmt) (codegen_llvm.cpp:1176-1184), when use_ctx_variant is true (CPU), the new assert_abort BasicBlock is terminated directly with builder->CreateRetVoid():

builder->SetInsertPoint(assert_abort);
builder->CreateRetVoid();  // bypasses final_block

This bypasses the task function's final_block. On CPU with compile_config.kernel_profiler enabled, codegen_cpu.cpp:172-176 emits LLVMRuntime_profiler_stop into final_block, which is therefore skipped when an assertion fires.

The specific code path

  1. visit(OffloadedStmt) at codegen_cpu.cpp:152-155 emits LLVMRuntime_profiler_start at task entry when kernel_profiler is on.
  2. The task body is emitted.
  3. codegen_cpu.cpp:172-176 sets the insert point to final_block and emits profiler_stop there.
  4. finalize_offloaded_task_function (codegen_llvm.cpp:1781-1782) adds CreateRetVoid to final_block.
  5. visit(ReturnStmt) at line 1116 correctly uses builder->CreateBr(final_block) so normal early-returns run profiler_stop.
  6. visit(AssertStmt) at line 1182 diverges — its CreateRetVoid() skips final_block entirely and profiler_stop never runs.

Why existing code does not prevent it

No invariant enforces that all function exits go through final_block; visit(AssertStmt) simply creates a new terminator on its own BasicBlock with no static check catching the divergence from visit(ReturnStmt)'s established pattern.

Scope (addressing the refutations)

The refutations are correct that the real-world impact is narrow. The triggering conditions are: CPU + kernel_profiler=True + debug=True + an assertion actually firing, AND the assertion must be in a serial offloaded task body. For range_for/mesh_for/struct_for/xlogue/real_func bodies, codegen_llvm.cpp:28-56 compiles them inside FunctionCreationGuard which saves and replaces final_block; the inner final_block only contains CreateRetVoid (no profiler_stop), so an inner CreateRetVoid returns to the outer offloaded task function which still falls through to its outer final_block and runs profiler_stop normally. Also, DefaultProfiler::start() unconditionally resets start_t_ / event_name_, so no state corruption occurs on the next kernel — the only observable effect is that the failing task is missing from traced_records_ / statistical_results_. The Python-side AssertionError still propagates correctly via runtime->error_code. This is why I am filing as nit, not normal.

How to fix it

Replace line 1182:

builder->CreateRetVoid();

with:

builder->CreateBr(final_block);

(Or rely on finalize_offloaded_task_function as ReturnStmt does.) final_block already ends with CreateRetVoid, so correctness is preserved and profiler_stop runs before the task returns.

Step-by-step proof with a concrete example

Configuration: qd.init(arch=qd.cpu, debug=True, check_out_of_bound=True, kernel_profiler=True). A @qd.kernel compiled as a single serial offloaded task reads a[5] on a shape-(1,) ndarray.

  1. Task function prologue (emitted by codegen_cpu.cpp:152-155): LLVMRuntime_profiler_start(runtime, kernel_name) — records start_t_ and event_name_ in DefaultProfiler.
  2. Bounds check fires in the task body. visit(AssertStmt) emits call quadrants_assert_format_ctx(...) → returns 1 → CondBr taken to assert_abort.
  3. assert_abort executes ret void (current code) — the task function returns immediately.
  4. final_block is never entered; LLVMRuntime_profiler_stop is never called.
  5. DefaultProfiler::traced_records_ gets no entry for the failing task. The next profiler_start() overwrites start_t_ / event_name_ cleanly.
  6. With the fix (CreateBr(final_block)), step 3 becomes br label %final_block; profiler_stop runs; traced_records_ correctly records the kernel before ret void executes from final_block.

}

void TaskCodeGenLLVM::visit(SNodeOpStmt *stmt) {
Expand Down Expand Up @@ -2493,6 +2507,12 @@
current_callable = old_callable;
}
llvm::Function *llvm_func = func_map[stmt->func];
// FIXME: when cpu_assert_failed fires inside a @qd.real_func callee, the
// flag is set on new_ctx but never propagated back to the caller's context.
// Regular @qd.func is AST-inlined so assertions are handled by the caller's
// visit(AssertStmt) directly. real_func needs: (1) zero-init new_ctx's
// cpu_assert_failed before the call, (2) post-call check + propagate to
// get_context(), (3) emit ret void on failure.
auto *new_ctx = create_entry_block_alloca(get_runtime_type("RuntimeContext"));
call("RuntimeContext_set_runtime", new_ctx, get_runtime());
if (!stmt->func->parameter_list.empty()) {
Expand Down
6 changes: 6 additions & 0 deletions quadrants/program/context.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,12 @@ struct RuntimeContext {
// LLVMRuntime is shared among functions. So we moved the pointer to
// RuntimeContext which each function have one.
uint64_t *result_buffer;

// Set to 1 by quadrants_assert_format_ctx when a runtime assertion (e.g.
// out-of-bounds check) fails on CPU. The codegen emits an early return
// after each assert call when this is set, and the task runner breaks out
// of its loop.
int32_t cpu_assert_failed{0};
};

#if defined(QD_RUNTIME_HOST)
Expand Down
5 changes: 4 additions & 1 deletion quadrants/runtime/cpu/kernel_launcher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,19 @@ namespace quadrants::lang {
namespace cpu {

void KernelLauncher::launch_offloaded_tasks(LaunchContextBuilder &ctx, const std::vector<TaskFunc> &task_funcs) {
  // Each launch begins with a clean assertion flag so that a failure from a
  // previous kernel launch cannot suppress or abort this one.
  ctx.get_context().cpu_assert_failed = 0;
  for (const auto &task_fn : task_funcs) {
    task_fn(&ctx.get_context());
    // Once a runtime assertion (e.g. an out-of-bounds check) has fired,
    // running the remaining offloaded tasks would touch data that already
    // failed validation, so stop dispatching immediately.
    if (ctx.get_context().cpu_assert_failed != 0) {
      return;
    }
  }
}
Comment thread
claude[bot] marked this conversation as resolved.

void KernelLauncher::launch_offloaded_tasks_with_do_while(LaunchContextBuilder &ctx,
                                                          const std::vector<TaskFunc> &task_funcs) {
  // Do-while semantics: the task list always runs at least once. Iteration
  // continues only while no CPU assertion has fired AND the graph's do-while
  // flag is still non-zero. Checking cpu_assert_failed first prevents an
  // infinite loop when the flag-updating task was skipped after an abort.
  while (true) {
    launch_offloaded_tasks(ctx, task_funcs);
    if (ctx.get_context().cpu_assert_failed != 0) {
      break;
    }
    if (*static_cast<int32_t *>(ctx.graph_do_while_flag_dev_ptr) == 0) {
      break;
    }
  }
}

void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx) {
Expand Down
68 changes: 59 additions & 9 deletions quadrants/runtime/llvm/runtime_module/runtime.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,7 @@ STRUCT_FIELD_ARRAY(PhysicalCoordinates, val);

STRUCT_FIELD(RuntimeContext, runtime);
STRUCT_FIELD(RuntimeContext, result_buffer)
STRUCT_FIELD(RuntimeContext, cpu_assert_failed)

#include "quadrants/runtime/llvm/runtime_module/atomic.h"

Expand Down Expand Up @@ -795,6 +796,25 @@ void quadrants_assert_format(LLVMRuntime *runtime, u1 test, const char *format,
#endif
}

// Context-aware assertion entry point invoked by bounds-check code that the
// CPU codegen emits into JIT'd kernels. Reports through the regular
// quadrants_assert_format path, then returns 1 on failure so the codegen can
// branch to an early return (a flag-based replacement for an earlier
// setjmp/longjmp scheme, which crashed on Windows because JIT'd frames carry
// no SEH unwind tables). Returns 0 when the assertion passed.
i32 quadrants_assert_format_ctx(RuntimeContext *context,
                                u1 test,
                                const char *format,
                                int num_arguments,
                                uint64 *arguments) {
  // Error recording/printing is delegated to the runtime-level variant.
  quadrants_assert_format(context->runtime, test, format, num_arguments, arguments);
  i32 failed = 0;
#if !ARCH_cuda && !ARCH_amdgpu
  const bool assertion_fired = enable_assert && test == 0;
  if (assertion_fired) {
    // Mark the context so the task runner and outer loops stop early.
    context->cpu_assert_failed = 1;
    failed = 1;
  }
#endif
  return failed;
}

// Convenience wrapper: report a runtime assertion with a plain message and
// no printf-style format arguments.
void quadrants_assert_runtime(LLVMRuntime *runtime, u1 test, const char *msg) {
  quadrants_assert_format(runtime, test, msg, 0, nullptr);
}
Expand Down Expand Up @@ -1505,9 +1525,13 @@ void cpu_struct_for_block_helper(void *ctx_, int thread_id, int i) {

RuntimeContext this_thread_context = *ctx->context;
this_thread_context.cpu_thread_id = thread_id;
this_thread_context.cpu_assert_failed = 0;

if (lower < upper) {
(*ctx->task)(&this_thread_context, tls_buffer, &ctx->list->get<Element>(element_id), lower, upper);
}
if (this_thread_context.cpu_assert_failed)
ctx->context->cpu_assert_failed = 1;
}

void parallel_struct_for(RuntimeContext *context,
Comment on lines 1531 to 1537
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 In cpu_struct_for_block_helper (runtime.cpp:1522), cpu_parallel_range_for_task (runtime.cpp:1625), and cpu_parallel_mesh_for_task (runtime.cpp:1727), multiple parallel worker threads can concurrently execute ctx->context->cpu_assert_failed = 1 on the same shared RuntimeContext with no synchronization — a C++ data race per the C++11 memory model even when all writers store the same value. In practice this is benign on x86-64/AArch64 (hardware-atomic aligned 32-bit store, parallel_for join provides the acquire fence), but ThreadSanitizer will flag every kernel invocation where multiple workers fail simultaneously. Fix: declare cpu_assert_failed as std::atomic<int32_t> in RuntimeContext and use memory_order_relaxed stores in the worker threads.

Extended reasoning...

What the bug is and how it manifests

The PR adds cpu_assert_failed to RuntimeContext and propagates it from per-thread copies back to the shared context at the end of each parallel task helper. The propagation pattern in all three helpers is identical:

if (this_thread_context.cpu_assert_failed)
    ctx->context->cpu_assert_failed = 1;   // or ctx.context->...

When multiple parallel workers each observe a failure in their private copy, they can all reach this store simultaneously. Per the C++11 memory model, two or more threads concurrently writing to the same non-atomic memory location is undefined behaviour, even when every writer stores the same value (1). The three affected sites are runtime.cpp:1522 (cpu_struct_for_block_helper), runtime.cpp:1625 (cpu_parallel_range_for_task), and runtime.cpp:1727 (cpu_parallel_mesh_for_task).

The specific code path that triggers it

Any CPU kernel where more than one parallel worker encounters an assertion failure during the same dispatch will trigger the race. For example, a range_for kernel over a small array where every element index is out-of-bounds: all threads make their per-thread copy, all hit the OOB assertion, and all attempt ctx.context->cpu_assert_failed = 1 concurrently when cpu_parallel_range_for_task returns.

Why existing code does not prevent it

cpu_assert_failed is declared as a plain int32_t in RuntimeContext (context.h). There are no mutexes, no atomics, and no other synchronization protecting the shared-context write. The per-thread copy pattern isolates the body/prologue/epilogue execution correctly, but the propagation-back step at the end of each task function is unprotected.

Addressing the refutation

The refutation correctly notes two mitigating factors: (1) all writers store the same value (1), so there is no torn-write data corruption risk, and (2) the parallel_for thread-pool join provides the acquire fence needed by the main-thread reader in launch_offloaded_tasks. Both points are accurate — this will not produce wrong results on x86-64 or AArch64. However, the C++ abstract machine does not guarantee hardware-level atomicity for plain integer stores, so the code is technically non-conforming. The practical consequence is that ThreadSanitizer will flag these as races on every invocation where multiple workers fail simultaneously, producing hard-to-dismiss TSan reports that obscure real races.

Impact and fix

The practical impact is limited to TSan noise and theoretical undefined behaviour on non-x86/non-AArch64 targets. The correct fix is to change cpu_assert_failed to std::atomic<int32_t> in RuntimeContext and replace the plain stores with store(1, std::memory_order_relaxed)relaxed is sufficient because the parallel_for join already provides the acquire fence the reading thread needs. Alternatively, std::atomic_ref<int32_t> could be used at the store sites if changing the struct layout is undesirable for JIT compatibility.

Expand Down Expand Up @@ -1578,26 +1602,41 @@ void cpu_parallel_range_for_task(void *range_context, int thread_id, int task_id
alignas(8) char tls_buffer[ctx.tls_size];
#pragma clang diagnostic pop
auto tls_ptr = &tls_buffer[0];
if (ctx.prologue)
ctx.prologue(ctx.context, tls_ptr);

RuntimeContext this_thread_context = *ctx.context;
this_thread_context.cpu_thread_id = thread_id;
this_thread_context.cpu_assert_failed = 0;

Comment on lines 1606 to +1609
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Propagate worker assert failures to launcher context

Parallel helpers copy RuntimeContext into this_thread_context and only set/check cpu_assert_failed on that copy. Because the shared ctx.context is never updated, KernelLauncher::launch_offloaded_tasks() does not observe failures from parallel tasks and continues launching subsequent offloaded tasks after an assertion, which can cause extra side effects after a kernel should have aborted.

Useful? React with 👍 / 👎.

if (ctx.prologue) {
ctx.prologue(&this_thread_context, tls_ptr);
if (this_thread_context.cpu_assert_failed) {
ctx.context->cpu_assert_failed = 1;
return;
}
}

if (ctx.step == 1) {
Comment thread
claude[bot] marked this conversation as resolved.
int block_start = ctx.begin + task_id * ctx.block_size;
int block_end = std::min(block_start + ctx.block_size, ctx.end);
for (int i = block_start; i < block_end; i++) {
ctx.body(&this_thread_context, tls_ptr, i);
if (this_thread_context.cpu_assert_failed)
break;
}
} else if (ctx.step == -1) {
int block_start = ctx.end - task_id * ctx.block_size;
int block_end = std::max(ctx.begin, block_start * ctx.block_size);
int block_end = std::max(ctx.begin, block_start - ctx.block_size);
for (int i = block_start - 1; i >= block_end; i--) {
Comment thread
claude[bot] marked this conversation as resolved.
ctx.body(&this_thread_context, tls_ptr, i);
if (this_thread_context.cpu_assert_failed)
break;
}
}
if (ctx.epilogue)
ctx.epilogue(ctx.context, tls_ptr);

if (!this_thread_context.cpu_assert_failed && ctx.epilogue)
ctx.epilogue(&this_thread_context, tls_ptr);
if (this_thread_context.cpu_assert_failed)
ctx.context->cpu_assert_failed = 1;
}

void cpu_parallel_range_for(RuntimeContext *context,
Expand Down Expand Up @@ -1678,17 +1717,28 @@ void cpu_parallel_mesh_for_task(void *range_context, int thread_id, int task_id)

RuntimeContext this_thread_context = *ctx.context;
this_thread_context.cpu_thread_id = thread_id;
this_thread_context.cpu_assert_failed = 0;

int block_start = task_id * ctx.block_size;
int block_end = std::min(block_start + ctx.block_size, ctx.num_patches);

for (int idx = block_start; idx < block_end; idx++) {
if (ctx.prologue)
ctx.prologue(ctx.context, tls_ptr, idx);
if (ctx.prologue) {
ctx.prologue(&this_thread_context, tls_ptr, idx);
if (this_thread_context.cpu_assert_failed)
break;
}
ctx.body(&this_thread_context, tls_ptr, idx);
if (ctx.epilogue)
ctx.epilogue(ctx.context, tls_ptr, idx);
if (this_thread_context.cpu_assert_failed)
break;
if (ctx.epilogue) {
ctx.epilogue(&this_thread_context, tls_ptr, idx);
if (this_thread_context.cpu_assert_failed)
break;
}
}
if (this_thread_context.cpu_assert_failed)
ctx.context->cpu_assert_failed = 1;
}

void cpu_parallel_mesh_for(RuntimeContext *context,
Expand Down
125 changes: 125 additions & 0 deletions tests/python/test_debug.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,3 +135,128 @@ def func():
x[3, 7] = 2

func()


@test_utils.test(
    arch=[qd.cpu],
    require=qd.extension.assertion,
    debug=True,
    check_out_of_bound=True,
    gdb_trigger=False,
)
def test_ndarray_oob_cpu_raises_not_segfaults():
    """Out-of-bounds ndarray access in a parallel kernel on CPU should raise
    QuadrantsAssertionError instead of segfaulting."""
    arr = qd.ndarray(dtype=qd.f32, shape=(4,))

    @qd.kernel
    def write_oob(a: qd.types.ndarray(dtype=qd.f32, ndim=1)):
        # Top-level range loop — presumably compiled as an offloaded
        # (potentially parallel) task; indices 4..9 exceed the shape-(4,)
        # ndarray, so the bounds-check assertion fires inside the task body.
        for i in range(10):
            a[i] = 1.0

    with pytest.raises(AssertionError, match=r"Out of bound access"):
        write_oob(arr)


@test_utils.test(
    arch=[qd.cpu],
    require=qd.extension.assertion,
    debug=True,
    check_out_of_bound=True,
    gdb_trigger=False,
)
def test_ndarray_oob_cpu_small_array():
    """Reproduces the pattern from the temperature-sensor segfault: a kernel
    accesses a very small (shape-1) array with an index that goes out of
    bounds. Before the cpu_assert_failed fix this would SIGSEGV on CPU in debug mode."""
    small = qd.ndarray(dtype=qd.f32, shape=(1,))
    small.fill(42.0)

    @qd.kernel
    def read_oob(a: qd.types.ndarray(dtype=qd.f32, ndim=1)) -> qd.f32:
        # Index 5 is out of range for the shape-(1,) ndarray.
        return a[5]

    with pytest.raises(AssertionError, match=r"Out of bound access"):
        read_oob(small)


@test_utils.test(
    arch=[qd.cpu],
    require=qd.extension.assertion,
    debug=True,
    check_out_of_bound=True,
    gdb_trigger=False,
)
def test_ndarray_oob_cpu_2d():
    """2D ndarray out-of-bounds on CPU should produce a clear error."""
    arr = qd.ndarray(dtype=qd.f32, shape=(3, 4))

    @qd.kernel
    def write_oob_2d(a: qd.types.ndarray(dtype=qd.f32, ndim=2)):
        # The single-iteration loop keeps the OOB write inside an offloaded
        # task body; row index 10 exceeds the first dimension (size 3).
        for i in range(1):
            a[10, 0] = 1.0

    with pytest.raises(AssertionError, match=r"Out of bound access"):
        write_oob_2d(arr)


@test_utils.test(
    arch=[qd.cpu],
    require=qd.extension.assertion,
    debug=True,
    check_out_of_bound=True,
    gdb_trigger=False,
)
def test_ndarray_inbounds_cpu_still_works():
    """Verify that the cpu_assert_failed mechanism does not break normal
    in-bounds ndarray access."""
    n = 8
    arr = qd.ndarray(dtype=qd.f32, shape=(n,))

    @qd.kernel
    def fill(a: qd.types.ndarray(dtype=qd.f32, ndim=1)):
        # All indices 0..n-1 are in bounds: no assertion should fire and
        # every element must be written.
        for i in range(n):
            a[i] = qd.cast(i * 10, qd.f32)

    fill(arr)
    result = arr.to_numpy()
    for i in range(n):
        assert result[i] == pytest.approx(i * 10)
Comment thread
claude[bot] marked this conversation as resolved.


@test_utils.test(
    arch=[qd.cpu],
    require=qd.extension.assertion,
    debug=True,
    check_out_of_bound=True,
    gdb_trigger=False,
)
def test_do_while_oob_does_not_loop_forever():
    """An OOB assertion inside a do-while kernel must break the outer loop.

    Without the cpu_assert_failed check in the do-while condition, the
    flag-clearing task is skipped (inner break), the outer loop sees
    flag != 0, re-enters launch_offloaded_tasks which resets
    cpu_assert_failed = 0, and re-runs tasks on corrupted data forever.
    """
    import numpy as np

    arr = qd.ndarray(dtype=qd.f32, shape=(4,))
    # Counter starts at 10, so without the assertion-abort path the do-while
    # would run many iterations before the flag naturally reaches zero.
    counter = qd.ndarray(dtype=qd.i32, shape=())
    counter.from_numpy(np.array(10, dtype=np.int32))

    @qd.kernel(graph=True)
    def oob_in_do_while(
        a: qd.types.ndarray(dtype=qd.f32, ndim=1),
        c: qd.types.ndarray(dtype=qd.i32, ndim=0),
    ):
        while qd.graph_do_while(c):
            # Serial loop so the OOB fires on the shared context immediately,
            # guaranteeing the do-while condition sees it on the same iteration.
            for i in qd.static(range(10)):
                a[i] = 1.0
            for i in range(1):
                c[()] = c[()] - 1

    with pytest.raises(AssertionError, match=r"Out of bound access"):
        oob_in_do_while(arr, counter)
Comment thread
claude[bot] marked this conversation as resolved.
Loading