-
Notifications
You must be signed in to change notification settings - Fork 16
[Bug] Abort kernel execution on assertion failure instead of segfaulting #419
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
705c43d
72b23e3
8f51ba5
f83fca9
41cb051
0006e75
661c9fe
bc92118
0294af7
cda9a94
5b99545
0a30ca2
cae334b
41db2f3
aef2072
d754c78
982c3dc
867998c
87eea2b
fcee8dd
75ccc48
eafc800
58683b4
099498b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1142,7 +1142,11 @@ | |
| auto arguments = create_entry_block_alloca(argument_buffer_size); | ||
|
|
||
| std::vector<llvm::Value *> args; | ||
| args.emplace_back(get_runtime()); | ||
| // On CPU, use the context-aware variant that returns non-zero on failure | ||
| // so we can emit an early return and avoid the subsequent out-of-bounds | ||
| // memory access. On GPU, asm("exit;") kills the thread directly. | ||
| bool use_ctx_variant = arch_is_cpu(current_arch()); | ||
| args.emplace_back(use_ctx_variant ? get_context() : get_runtime()); | ||
| args.emplace_back(builder->CreateIsNotNull(llvm_val[stmt->cond])); | ||
| args.emplace_back(builder->CreateGlobalStringPtr(stmt->text)); | ||
|
|
||
|
|
@@ -1167,7 +1171,17 @@ | |
| args.emplace_back( | ||
| builder->CreateGEP(argument_buffer_size, arguments, {tlctx->get_constant(0), tlctx->get_constant(0)})); | ||
|
|
||
| llvm_val[stmt] = call("quadrants_assert_format", std::move(args)); | ||
| llvm_val[stmt] = call(use_ctx_variant ? "quadrants_assert_format_ctx" : "quadrants_assert_format", std::move(args)); | ||
|
|
||
| if (use_ctx_variant) { | ||
| auto *assert_abort = llvm::BasicBlock::Create(*llvm_context, "assert_abort", func); | ||
| auto *assert_cont = llvm::BasicBlock::Create(*llvm_context, "assert_cont", func); | ||
| auto *failed = builder->CreateICmpNE(llvm_val[stmt], tlctx->get_constant(0)); | ||
| builder->CreateCondBr(failed, assert_abort, assert_cont); | ||
| builder->SetInsertPoint(assert_abort); | ||
| builder->CreateRetVoid(); | ||
| builder->SetInsertPoint(assert_cont); | ||
| } | ||
|
Check warning on line 1184 in quadrants/codegen/llvm/codegen_llvm.cpp
|
||
|
Comment on lines
+1180
to
+1184
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🟡 nit: In visit(AssertStmt) at codegen_llvm.cpp:1182, the assert_abort block emits — Extended reasoning... What the bug is and how it manifests: In builder->SetInsertPoint(assert_abort);
builder->CreateRetVoid(); // bypasses final_blockThis bypasses the task function's The specific code path
Why existing code does not prevent it No invariant enforces that all function exits go through Scope (addressing the refutations) The refutations are correct that the real-world impact is narrow. The triggering conditions are: CPU + How to fix it Replace line 1182: builder->CreateRetVoid();with: builder->CreateBr(final_block);(Or rely on finalize_offloaded_task_function as ReturnStmt does.) Step-by-step proof with a concrete example Configuration:
|
||
| } | ||
|
|
||
| void TaskCodeGenLLVM::visit(SNodeOpStmt *stmt) { | ||
|
|
@@ -2493,6 +2507,12 @@ | |
| current_callable = old_callable; | ||
| } | ||
| llvm::Function *llvm_func = func_map[stmt->func]; | ||
| // FIXME: when cpu_assert_failed fires inside a @qd.real_func callee, the | ||
| // flag is set on new_ctx but never propagated back to the caller's context. | ||
| // Regular @qd.func is AST-inlined so assertions are handled by the caller's | ||
| // visit(AssertStmt) directly. real_func needs: (1) zero-init new_ctx's | ||
| // cpu_assert_failed before the call, (2) post-call check + propagate to | ||
| // get_context(), (3) emit ret void on failure. | ||
| auto *new_ctx = create_entry_block_alloca(get_runtime_type("RuntimeContext")); | ||
| call("RuntimeContext_set_runtime", new_ctx, get_runtime()); | ||
| if (!stmt->func->parameter_list.empty()) { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -5,16 +5,19 @@ namespace quadrants::lang { | |
| namespace cpu { | ||
|
|
||
| void KernelLauncher::launch_offloaded_tasks(LaunchContextBuilder &ctx, const std::vector<TaskFunc> &task_funcs) { | ||
| ctx.get_context().cpu_assert_failed = 0; | ||
| for (auto task : task_funcs) { | ||
| task(&ctx.get_context()); | ||
| if (ctx.get_context().cpu_assert_failed) | ||
| break; | ||
|
Comment on lines
+11
to
+12
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Stopping Useful? React with 👍 / 👎. |
||
| } | ||
|
claude[bot] marked this conversation as resolved.
|
||
| } | ||
|
claude[bot] marked this conversation as resolved.
|
||
|
|
||
| void KernelLauncher::launch_offloaded_tasks_with_do_while(LaunchContextBuilder &ctx, | ||
| const std::vector<TaskFunc> &task_funcs) { | ||
| do { | ||
| launch_offloaded_tasks(ctx, task_funcs); | ||
| } while (*static_cast<int32_t *>(ctx.graph_do_while_flag_dev_ptr) != 0); | ||
| } while (ctx.get_context().cpu_assert_failed == 0 && *static_cast<int32_t *>(ctx.graph_do_while_flag_dev_ptr) != 0); | ||
| } | ||
|
|
||
| void KernelLauncher::launch_llvm_kernel(Handle handle, LaunchContextBuilder &ctx) { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -294,6 +294,7 @@ STRUCT_FIELD_ARRAY(PhysicalCoordinates, val); | |
|
|
||
| STRUCT_FIELD(RuntimeContext, runtime); | ||
| STRUCT_FIELD(RuntimeContext, result_buffer) | ||
| STRUCT_FIELD(RuntimeContext, cpu_assert_failed) | ||
|
|
||
| #include "quadrants/runtime/llvm/runtime_module/atomic.h" | ||
|
|
||
|
|
@@ -795,6 +796,25 @@ void quadrants_assert_format(LLVMRuntime *runtime, u1 test, const char *format, | |
| #endif | ||
| } | ||
|
|
||
| // Context-aware variant called by bounds-check assertions in JIT'd code. | ||
| // Returns 1 when the assertion failed (so the codegen can emit an early | ||
| // return), 0 otherwise. This replaces a previous setjmp/longjmp approach | ||
| // that crashed on Windows because JIT'd frames lack SEH unwind tables. | ||
| i32 quadrants_assert_format_ctx(RuntimeContext *context, | ||
| u1 test, | ||
| const char *format, | ||
| int num_arguments, | ||
| uint64 *arguments) { | ||
| quadrants_assert_format(context->runtime, test, format, num_arguments, arguments); | ||
| #if !ARCH_cuda && !ARCH_amdgpu | ||
| if (enable_assert && test == 0) { | ||
| context->cpu_assert_failed = 1; | ||
| return 1; | ||
| } | ||
| #endif | ||
| return 0; | ||
| } | ||
|
|
||
| void quadrants_assert_runtime(LLVMRuntime *runtime, u1 test, const char *msg) { | ||
| quadrants_assert_format(runtime, test, msg, 0, nullptr); | ||
| } | ||
|
|
@@ -1505,9 +1525,13 @@ void cpu_struct_for_block_helper(void *ctx_, int thread_id, int i) { | |
|
|
||
| RuntimeContext this_thread_context = *ctx->context; | ||
| this_thread_context.cpu_thread_id = thread_id; | ||
| this_thread_context.cpu_assert_failed = 0; | ||
|
|
||
| if (lower < upper) { | ||
| (*ctx->task)(&this_thread_context, tls_buffer, &ctx->list->get<Element>(element_id), lower, upper); | ||
| } | ||
| if (this_thread_context.cpu_assert_failed) | ||
| ctx->context->cpu_assert_failed = 1; | ||
| } | ||
|
|
||
| void parallel_struct_for(RuntimeContext *context, | ||
|
Comment on lines
1531
to
1537
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🟡 In — Extended reasoning... What the bug is and how it manifests: The PR adds if (this_thread_context.cpu_assert_failed)
ctx->context->cpu_assert_failed = 1; // or ctx.context->...When multiple parallel workers each observe a failure in their private copy, they can all reach this store simultaneously. Per the C++11 memory model, two or more threads concurrently writing to the same non-atomic memory location is undefined behaviour, even when every writer stores the same value (1). The three affected sites are runtime.cpp:1522 ( The specific code path that triggers it Any CPU kernel where more than one parallel worker encounters an assertion failure during the same dispatch will trigger the race. For example, a Why existing code does not prevent it
Addressing the refutation The refutation correctly notes two mitigating factors: (1) all writers store the same value (1), so there is no torn-write data corruption risk, and (2) the Impact and fix The practical impact is limited to TSan noise and theoretical undefined behaviour on non-x86/non-AArch64 targets. The correct fix is to change |
||
|
|
@@ -1578,26 +1602,41 @@ void cpu_parallel_range_for_task(void *range_context, int thread_id, int task_id | |
| alignas(8) char tls_buffer[ctx.tls_size]; | ||
| #pragma clang diagnostic pop | ||
| auto tls_ptr = &tls_buffer[0]; | ||
| if (ctx.prologue) | ||
| ctx.prologue(ctx.context, tls_ptr); | ||
|
|
||
| RuntimeContext this_thread_context = *ctx.context; | ||
| this_thread_context.cpu_thread_id = thread_id; | ||
| this_thread_context.cpu_assert_failed = 0; | ||
|
|
||
|
Comment on lines
1606
to
+1609
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Parallel helpers copy Useful? React with 👍 / 👎. |
||
| if (ctx.prologue) { | ||
| ctx.prologue(&this_thread_context, tls_ptr); | ||
| if (this_thread_context.cpu_assert_failed) { | ||
| ctx.context->cpu_assert_failed = 1; | ||
| return; | ||
| } | ||
| } | ||
|
|
||
| if (ctx.step == 1) { | ||
|
claude[bot] marked this conversation as resolved.
|
||
| int block_start = ctx.begin + task_id * ctx.block_size; | ||
| int block_end = std::min(block_start + ctx.block_size, ctx.end); | ||
| for (int i = block_start; i < block_end; i++) { | ||
| ctx.body(&this_thread_context, tls_ptr, i); | ||
| if (this_thread_context.cpu_assert_failed) | ||
| break; | ||
| } | ||
| } else if (ctx.step == -1) { | ||
| int block_start = ctx.end - task_id * ctx.block_size; | ||
| int block_end = std::max(ctx.begin, block_start * ctx.block_size); | ||
| int block_end = std::max(ctx.begin, block_start - ctx.block_size); | ||
| for (int i = block_start - 1; i >= block_end; i--) { | ||
|
claude[bot] marked this conversation as resolved.
|
||
| ctx.body(&this_thread_context, tls_ptr, i); | ||
| if (this_thread_context.cpu_assert_failed) | ||
| break; | ||
| } | ||
| } | ||
| if (ctx.epilogue) | ||
| ctx.epilogue(ctx.context, tls_ptr); | ||
|
|
||
| if (!this_thread_context.cpu_assert_failed && ctx.epilogue) | ||
| ctx.epilogue(&this_thread_context, tls_ptr); | ||
| if (this_thread_context.cpu_assert_failed) | ||
| ctx.context->cpu_assert_failed = 1; | ||
| } | ||
|
|
||
| void cpu_parallel_range_for(RuntimeContext *context, | ||
|
|
@@ -1678,17 +1717,28 @@ void cpu_parallel_mesh_for_task(void *range_context, int thread_id, int task_id) | |
|
|
||
| RuntimeContext this_thread_context = *ctx.context; | ||
| this_thread_context.cpu_thread_id = thread_id; | ||
| this_thread_context.cpu_assert_failed = 0; | ||
|
|
||
| int block_start = task_id * ctx.block_size; | ||
| int block_end = std::min(block_start + ctx.block_size, ctx.num_patches); | ||
|
|
||
| for (int idx = block_start; idx < block_end; idx++) { | ||
| if (ctx.prologue) | ||
| ctx.prologue(ctx.context, tls_ptr, idx); | ||
| if (ctx.prologue) { | ||
| ctx.prologue(&this_thread_context, tls_ptr, idx); | ||
| if (this_thread_context.cpu_assert_failed) | ||
| break; | ||
| } | ||
| ctx.body(&this_thread_context, tls_ptr, idx); | ||
| if (ctx.epilogue) | ||
| ctx.epilogue(ctx.context, tls_ptr, idx); | ||
| if (this_thread_context.cpu_assert_failed) | ||
| break; | ||
| if (ctx.epilogue) { | ||
| ctx.epilogue(&this_thread_context, tls_ptr, idx); | ||
| if (this_thread_context.cpu_assert_failed) | ||
| break; | ||
| } | ||
| } | ||
| if (this_thread_context.cpu_assert_failed) | ||
| ctx.context->cpu_assert_failed = 1; | ||
| } | ||
|
|
||
| void cpu_parallel_mesh_for(RuntimeContext *context, | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -135,3 +135,128 @@ def func(): | |
| x[3, 7] = 2 | ||
|
|
||
| func() | ||
|
|
||
|
|
||
| @test_utils.test( | ||
| arch=[qd.cpu], | ||
| require=qd.extension.assertion, | ||
| debug=True, | ||
| check_out_of_bound=True, | ||
| gdb_trigger=False, | ||
| ) | ||
| def test_ndarray_oob_cpu_raises_not_segfaults(): | ||
| """Out-of-bounds ndarray access in a parallel kernel on CPU should raise | ||
| QuadrantsAssertionError instead of segfaulting.""" | ||
| arr = qd.ndarray(dtype=qd.f32, shape=(4,)) | ||
|
|
||
| @qd.kernel | ||
| def write_oob(a: qd.types.ndarray(dtype=qd.f32, ndim=1)): | ||
| for i in range(10): | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't think this test is valid, because this is a parallel loop? We should make it serial, I think? |
||
| a[i] = 1.0 | ||
|
|
||
| with pytest.raises(AssertionError, match=r"Out of bound access"): | ||
| write_oob(arr) | ||
|
|
||
|
|
||
| @test_utils.test( | ||
| arch=[qd.cpu], | ||
| require=qd.extension.assertion, | ||
| debug=True, | ||
| check_out_of_bound=True, | ||
| gdb_trigger=False, | ||
| ) | ||
| def test_ndarray_oob_cpu_small_array(): | ||
| """Reproduces the pattern from the temperature-sensor segfault: a kernel | ||
| accesses a very small (shape-1) array with an index that goes out of | ||
| bounds. Before the cpu_assert_failed fix this would SIGSEGV on CPU in debug mode.""" | ||
| small = qd.ndarray(dtype=qd.f32, shape=(1,)) | ||
| small.fill(42.0) | ||
|
|
||
| @qd.kernel | ||
| def read_oob(a: qd.types.ndarray(dtype=qd.f32, ndim=1)) -> qd.f32: | ||
| return a[5] | ||
|
|
||
| with pytest.raises(AssertionError, match=r"Out of bound access"): | ||
| read_oob(small) | ||
|
|
||
|
|
||
| @test_utils.test( | ||
| arch=[qd.cpu], | ||
| require=qd.extension.assertion, | ||
| debug=True, | ||
| check_out_of_bound=True, | ||
| gdb_trigger=False, | ||
| ) | ||
| def test_ndarray_oob_cpu_2d(): | ||
| """2D ndarray out-of-bounds on CPU should produce a clear error.""" | ||
| arr = qd.ndarray(dtype=qd.f32, shape=(3, 4)) | ||
|
|
||
| @qd.kernel | ||
| def write_oob_2d(a: qd.types.ndarray(dtype=qd.f32, ndim=2)): | ||
| for i in range(1): | ||
| a[10, 0] = 1.0 | ||
|
|
||
| with pytest.raises(AssertionError, match=r"Out of bound access"): | ||
| write_oob_2d(arr) | ||
|
|
||
|
|
||
| @test_utils.test( | ||
| arch=[qd.cpu], | ||
| require=qd.extension.assertion, | ||
| debug=True, | ||
| check_out_of_bound=True, | ||
| gdb_trigger=False, | ||
| ) | ||
| def test_ndarray_inbounds_cpu_still_works(): | ||
| """Verify that the cpu_assert_failed mechanism does not break normal | ||
| in-bounds ndarray access.""" | ||
| n = 8 | ||
| arr = qd.ndarray(dtype=qd.f32, shape=(n,)) | ||
|
|
||
| @qd.kernel | ||
| def fill(a: qd.types.ndarray(dtype=qd.f32, ndim=1)): | ||
| for i in range(n): | ||
| a[i] = qd.cast(i * 10, qd.f32) | ||
|
|
||
| fill(arr) | ||
| result = arr.to_numpy() | ||
| for i in range(n): | ||
| assert result[i] == pytest.approx(i * 10) | ||
|
claude[bot] marked this conversation as resolved.
|
||
|
|
||
|
|
||
| @test_utils.test( | ||
| arch=[qd.cpu], | ||
| require=qd.extension.assertion, | ||
| debug=True, | ||
| check_out_of_bound=True, | ||
| gdb_trigger=False, | ||
| ) | ||
| def test_do_while_oob_does_not_loop_forever(): | ||
| """An OOB assertion inside a do-while kernel must break the outer loop. | ||
|
|
||
| Without the cpu_assert_failed check in the do-while condition, the | ||
| flag-clearing task is skipped (inner break), the outer loop sees | ||
| flag != 0, re-enters launch_offloaded_tasks which resets | ||
| cpu_assert_failed = 0, and re-runs tasks on corrupted data forever. | ||
| """ | ||
| import numpy as np | ||
|
|
||
| arr = qd.ndarray(dtype=qd.f32, shape=(4,)) | ||
| counter = qd.ndarray(dtype=qd.i32, shape=()) | ||
| counter.from_numpy(np.array(10, dtype=np.int32)) | ||
|
|
||
| @qd.kernel(graph=True) | ||
| def oob_in_do_while( | ||
| a: qd.types.ndarray(dtype=qd.f32, ndim=1), | ||
| c: qd.types.ndarray(dtype=qd.i32, ndim=0), | ||
| ): | ||
| while qd.graph_do_while(c): | ||
| # Serial loop so the OOB fires on the shared context immediately, | ||
| # guaranteeing the do-while condition sees it on the same iteration. | ||
| for i in qd.static(range(10)): | ||
| a[i] = 1.0 | ||
| for i in range(1): | ||
| c[()] = c[()] - 1 | ||
|
|
||
| with pytest.raises(AssertionError, match=r"Out of bound access"): | ||
|
claude[bot] marked this conversation as resolved.
|
||
| oob_in_do_while(arr, counter) | ||
|
claude[bot] marked this conversation as resolved.
|
||
Uh oh!
There was an error while loading. Please reload this page.