Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 16 additions & 5 deletions quadrants/runtime/cuda/gpu_graph_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -216,9 +216,9 @@ void GpuGraphManager::ensure_condition_kernel_loaded() {

int cc = CUDAContext::get_instance().get_compute_capability();
if (cc < 90) {
QD_WARN(
"graph_do_while requires SM 9.0+ (Hopper), but this device is SM {}. "
"Falling back to non-graph path.",
QD_INFO(
"graph_do_while natively requires SM 9.0+, but this device is SM {}. "
"Falling back to host-side do-while loop.",
cc);
return;
}
Expand Down Expand Up @@ -414,8 +414,19 @@ bool GpuGraphManager::try_launch(

if (use_graph_do_while) {
ensure_condition_kernel_loaded();
QD_ERROR_IF(!cond_kernel_func_,
"Condition kernel not available; cannot build graph_do_while");
if (!cond_kernel_func_) {
int cc = CUDAContext::get_instance().get_compute_capability();
if (cc >= 90) {
// SM 9.0+ should always be able to load the condition kernel.
// Failing here means prerequisites are missing.
QD_ERROR(
"Condition kernel not available on SM {}; "
"cannot build graph_do_while",
cc);
}
// Pre-SM 9.0: fall back to host-side do-while loop.
return false;
}
kernel_target_graph = add_conditional_while_node(graph, &cond_handle);
}

Expand Down
33 changes: 13 additions & 20 deletions tests/python/test_gpu_graph_do_while.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,13 @@ def _on_cuda():
return impl.current_cfg().arch == qd.cuda


def _xfail_if_cuda_without_hopper():
if _on_cuda() and qd.lang.impl.get_cuda_compute_capability() < 90:
pytest.xfail("graph_do_while requires SM 9.0+ (Hopper)")
def _is_gpu_graph_do_while_natively_supported():
    """Return True when graph_do_while runs natively: CUDA backend on SM 9.0+."""
    if not _on_cuda():
        return False
    return qd.lang.impl.get_cuda_compute_capability() >= 90


@test_utils.test()
def test_graph_do_while_counter():
"""Test graph_do_while with a counter that decrements each iteration."""
_xfail_if_cuda_without_hopper()
N = 64

@qd.kernel(gpu_graph=True)
Expand All @@ -58,7 +56,7 @@ def graph_loop(x: qd.types.ndarray(qd.i32, ndim=1), counter: qd.types.ndarray(qd
counter.from_numpy(np.array(5, dtype=np.int32))

graph_loop(x, counter)
if _on_cuda():
if _is_gpu_graph_do_while_natively_supported():
assert _gpu_graph_used()
assert _gpu_graph_cache_size() == 1

Expand All @@ -69,7 +67,7 @@ def graph_loop(x: qd.types.ndarray(qd.i32, ndim=1), counter: qd.types.ndarray(qd
counter.from_numpy(np.array(10, dtype=np.int32))

graph_loop(x, counter)
if _on_cuda():
if _is_gpu_graph_do_while_natively_supported():
assert _gpu_graph_used()
assert _gpu_graph_cache_size() == 1

Expand All @@ -80,7 +78,6 @@ def graph_loop(x: qd.types.ndarray(qd.i32, ndim=1), counter: qd.types.ndarray(qd
@test_utils.test()
def test_graph_do_while_boolean_done():
"""Test graph_do_while with a boolean 'continue' flag (non-zero = keep going)."""
_xfail_if_cuda_without_hopper()
N = 64

@qd.kernel(gpu_graph=True)
Expand All @@ -103,7 +100,7 @@ def increment_until_threshold(
keep_going.from_numpy(np.array(1, dtype=np.int32))

increment_until_threshold(x, 7, keep_going)
if _on_cuda():
if _is_gpu_graph_do_while_natively_supported():
assert _gpu_graph_used()
assert _gpu_graph_cache_size() == 1

Expand All @@ -114,7 +111,7 @@ def increment_until_threshold(
keep_going.from_numpy(np.array(1, dtype=np.int32))

increment_until_threshold(x, 12, keep_going)
if _on_cuda():
if _is_gpu_graph_do_while_natively_supported():
assert _gpu_graph_used()
assert _gpu_graph_cache_size() == 1

Expand All @@ -125,7 +122,6 @@ def increment_until_threshold(
@test_utils.test()
def test_graph_do_while_multiple_loops():
"""Test graph_do_while with multiple top-level loops in the kernel body."""
_xfail_if_cuda_without_hopper()
N = 32

@qd.kernel(gpu_graph=True)
Expand All @@ -151,7 +147,7 @@ def multi_loop(
counter.from_numpy(np.array(10, dtype=np.int32))

multi_loop(x, y, counter)
if _on_cuda():
if _is_gpu_graph_do_while_natively_supported():
assert _gpu_graph_used()
assert _gpu_graph_cache_size() == 1

Expand All @@ -164,7 +160,7 @@ def multi_loop(
counter.from_numpy(np.array(5, dtype=np.int32))

multi_loop(x, y, counter)
if _on_cuda():
if _is_gpu_graph_do_while_natively_supported():
assert _gpu_graph_used()
assert _gpu_graph_cache_size() == 1

Expand All @@ -183,7 +179,6 @@ def test_graph_do_while_swap_counter_ndarray():
the graph wasn't rebuilt, it just updated the indirection slot with c2's
pointer.
"""
_xfail_if_cuda_without_hopper()
N = 32

@qd.kernel(gpu_graph=True)
Expand All @@ -200,7 +195,7 @@ def k(x: qd.types.ndarray(qd.i32, ndim=1), c: qd.types.ndarray(qd.i32, ndim=0)):
x.from_numpy(np.zeros(N, dtype=np.int32))
c1.from_numpy(np.array(3, dtype=np.int32))
k(x, c1)
if _on_cuda():
if _is_gpu_graph_do_while_natively_supported():
assert _gpu_graph_used()
assert _gpu_graph_cache_size() == 1
assert c1.to_numpy() == 0
Expand All @@ -211,7 +206,7 @@ def k(x: qd.types.ndarray(qd.i32, ndim=1), c: qd.types.ndarray(qd.i32, ndim=0)):
x.from_numpy(np.zeros(N, dtype=np.int32))
c2.from_numpy(np.array(7, dtype=np.int32))
k(x, c2)
if _on_cuda():
if _is_gpu_graph_do_while_natively_supported():
assert _gpu_graph_used()
assert _gpu_graph_cache_size() == 1
assert _gpu_graph_total_builds() == 1
Expand All @@ -228,7 +223,6 @@ def test_graph_do_while_alternate_counter_ndarrays():
count+10). Confirms the slot update works back and forth, not just as a
one-time swap. Cache size is checked once at the end -- still 1.
"""
_xfail_if_cuda_without_hopper()
N = 16

@qd.kernel(gpu_graph=True)
Expand All @@ -249,20 +243,20 @@ def k(x: qd.types.ndarray(qd.i32, ndim=1), c: qd.types.ndarray(qd.i32, ndim=0)):
x.from_numpy(np.zeros(N, dtype=np.int32))
c1.from_numpy(np.array(count, dtype=np.int32))
k(x, c1)
if _on_cuda():
if _is_gpu_graph_do_while_natively_supported():
assert _gpu_graph_used()
assert c1.to_numpy() == 0
np.testing.assert_array_equal(x.to_numpy(), np.full(N, count, dtype=np.int32))

x.from_numpy(np.zeros(N, dtype=np.int32))
c2.from_numpy(np.array(count + 10, dtype=np.int32))
k(x, c2)
if _on_cuda():
if _is_gpu_graph_do_while_natively_supported():
assert _gpu_graph_used()
assert c2.to_numpy() == 0
np.testing.assert_array_equal(x.to_numpy(), np.full(N, count + 10, dtype=np.int32))

if _on_cuda():
if _is_gpu_graph_do_while_natively_supported():
assert _gpu_graph_cache_size() == 1
assert _gpu_graph_total_builds() == 1

Expand Down Expand Up @@ -351,7 +345,6 @@ def _fastcache_do_while_child(args: list[str]) -> None:
@test_utils.test()
def test_graph_do_while_fastcache_restores_arg(tmp_path: pathlib.Path):
"""After fastcache restore in a fresh process, graph_do_while_arg must be set."""
_xfail_if_cuda_without_hopper()
assert qd.lang is not None
arch = qd.lang.impl.current_cfg().arch.name
env = dict(os.environ)
Expand Down
Loading