Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
151 changes: 151 additions & 0 deletions gdb/testsuite/gdb.rocm/coop-group-grid-sync.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
/* Copyright 2026 Free Software Foundation, Inc.
Copyright (C) 2026 Advanced Micro Devices, Inc. All rights reserved.

This file is part of GDB.

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>. */

/* This program exercises a single-device cooperative kernel launch.
The kernel uses cooperative_groups::this_grid().sync (), which is
implemented on AMD GPUs using the Global Wave Sync (GWS) hardware
mechanism. It is launched through hipLaunchCooperativeKernel so the
whole grid is co-resident and can synchronize at the grid level.

The companion .exp file uses this program to exercise the debugger
while a kernel is running with GWS-based grid synchronization. */

#include <hip/hip_runtime.h>
#include <hip/hip_cooperative_groups.h>
#include <stdio.h>
#include <stdlib.h>

#include "rocm-test-utils.h"

namespace cg = cooperative_groups;

/* Use small but non-trivial launch dimensions so the test runs quickly
while still creating multiple waves participating in the GWS
barrier. GROUP_SIZE = 64 means 1 wave on wave64 architectures and
2 waves on wave32 architectures, for a total of 2 (or 4) waves.
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can,t we check at runtime if we have wave32 or wave64? Could also build the testsuite with -mwavefrontsize64 to force wave64.


N is exactly one slot per thread so Phase 2 has no intra-thread race
and the host-side expected values can be computed straightforwardly. */

constexpr unsigned int group_size = 64;
constexpr unsigned int num_groups = 2;
constexpr int total_threads = group_size * num_groups;
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why are group_size and num_groups unsigned, but not total_threads?

constexpr int N = total_threads;

/* Two-phase grid-cooperative kernel.

Phase 1: every thread writes its slot in IN.
GWS : the whole grid synchronizes via cooperative_groups::this_grid ().
Phase 2: every thread reads a slot owned by a thread in a *different*
workgroup and stores it into OUT.

Because the Phase 2 read targets a slot written by a different
workgroup during Phase 1, this is only correct if grid.sync ()
actually synchronized every wave in the grid. */

__global__ void
coop_grid_sync_kernel (int *in_buf, int *out_buf)
{
cg::grid_group grid = cg::this_grid ();

int tid = blockIdx.x * blockDim.x + threadIdx.x;

/* Phase 1: each thread writes its own value. */
in_buf[tid] = tid + 1; /* before-sync line */

/* Grid-wide synchronization (GWS). */
grid.sync (); /* sync line */

/* Phase 2: read the slot owned by a thread in a different workgroup
and store it in our own slot of the output buffer. */
int peer = (tid + (int) blockDim.x) % total_threads;
int v = in_buf[peer]; /* after-sync line */
out_buf[tid] = v;
}

int
main ()
{
int device_id = 0;
CHECK (hipSetDevice (device_id));

hipDeviceProp_t props;
CHECK (hipGetDeviceProperties (&props, device_id));
if (!props.cooperativeLaunch)
{
printf ("Device does not support cooperative launch, skipping.\n");
return 0;
}

int *in_d = nullptr;
int *out_d = nullptr;
CHECK (hipMalloc ((void **) &in_d, N * sizeof (int)));
CHECK (hipMalloc ((void **) &out_d, N * sizeof (int)));
CHECK (hipMemset (in_d, 0, N * sizeof (int)));
CHECK (hipMemset (out_d, 0, N * sizeof (int)));

dim3 grid_dim (num_groups, 1, 1);
dim3 block_dim (group_size, 1, 1);

void *kernel_args[2];
kernel_args[0] = (void *) &in_d;
kernel_args[1] = (void *) &out_d;

/* Launch the kernel cooperatively. This is the API that enables
grid.sync () support via GWS. */
CHECK (hipLaunchCooperativeKernel (
reinterpret_cast<void *> (coop_grid_sync_kernel),
grid_dim, block_dim, kernel_args,
/*sharedMem=*/0, /*stream=*/0));

CHECK (hipDeviceSynchronize ());

int out_h[N];
CHECK (hipMemcpy (out_h, out_d, N * sizeof (int), hipMemcpyDeviceToHost));

CHECK (hipFree (in_d));
CHECK (hipFree (out_d));

/* Each thread tid stores in_buf[(tid + group_size) % total_threads]
into out_buf[tid]. After Phase 1, in_buf[k] == k + 1, so the
expected value at out_h[tid] is ((tid + group_size) %
total_threads) + 1. */
int errors = 0;
for (int tid = 0; tid < N; tid++)
{
int peer = (tid + (int) group_size) % total_threads;
int expected = peer + 1;
if (out_h[tid] != expected)
{
if (errors < 4)
fprintf (stderr,
"mismatch at %d: got %d, expected %d\n",
tid, out_h[tid], expected);
errors++;
}
}

if (errors != 0)
{
fprintf (stderr, "FAILED: %d mismatches\n", errors);
return EXIT_FAILURE;
}

printf ("PASSED\n");
return EXIT_SUCCESS;
}
173 changes: 173 additions & 0 deletions gdb/testsuite/gdb.rocm/coop-group-grid-sync.exp
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
# Copyright 2026 Free Software Foundation, Inc.
# Copyright (C) 2026 Advanced Micro Devices, Inc. All rights reserved.

# This file is part of GDB.

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

# Tests for debugging a single-device cooperative kernel that uses
# cooperative_groups::this_grid ().sync (), implemented on AMD GPUs
# using the Global Wave Sync (GWS) hardware mechanism.
#
# The test verifies that GDB can:
# - hit a breakpoint placed before grid.sync () in a cooperatively
# launched kernel (i.e. when the dispatch was created with
# hipLaunchCooperativeKernel),
# - list the cooperative dispatch via "info dispatches",
# - observe multiple GPU waves participating in the grid sync,
# - hit a breakpoint placed after grid.sync (), and
# - resume the kernel to normal completion.
#
# This intentionally exercises the same cooperative-launch +
# this_grid ().sync () code path that earlier CQE GWS tests used, but
# integrated into the dejagnu testsuite.

load_lib rocm.exp

require allow_hipcc_tests

standard_testfile .cpp

if {[build_executable "failed to prepare" $testfile $srcfile {debug hip}]} {
return
}

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We do not support debugging cooperative groups on gfx110x (see documented limitations). Testing this on gfx1100, I see:

FAIL: gdb.rocm/coop-group-grid-sync.exp: test_break_around_grid_sync: stop before grid.sync (the program exited)
FAIL: gdb.rocm/coop-group-grid-sync.exp: test_threads_in_coop_kernel: stop in cooperative kernel (the program exited)

The testcase should probably detect for those devices and issue a XFAIL.

# Continue from main until we either hit a breakpoint inside
# coop_grid_sync_kernel or the program exits normally because the
# device does not support cooperative launch. Returns 1 if the kernel
# breakpoint was hit (test can proceed), 0 otherwise.

proc continue_to_kernel_or_skip {tag} {
gdb_test_multiple "continue" $tag {
-re "hit Breakpoint $::decimal,\[^\r\n\]*coop_grid_sync_kernel\[^\r\n\]*\r\n\[^\r\n\]*\r\n$::gdb_prompt " {
pass $tag
return 1
}
-re "Device does not support cooperative launch\[^\r\n\]*\r\n" {
unsupported "device does not support cooperative launch"
return 0
}
-re "\\\[Inferior 1 \[^\r\n\]* exited normally\\\]\[^\r\n\]*\r\n$::gdb_prompt " {
# Program skipped without our diagnostic line, but still
# exited cleanly. Treat as unsupported.
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Which case is this covering? Shouldn't there be an XFAIL if the environment is broken to a point where we do not see the message?

unsupported $tag
return 0
}
}
return 0
}

# Test: hit a breakpoint inside a cooperative kernel before grid.sync (),
# inspect the GPU state, then continue past grid.sync () and to program
# exit.

proc_with_prefix test_break_around_grid_sync {} {
clean_restart
gdb_load $::binfile

with_rocm_gpu_lock {
if {![runto_main]} {
return
}

# Place a breakpoint before grid.sync (). Use "allow-pending"
# because the kernel is loaded as device code at dispatch time.
gdb_breakpoint \
[gdb_get_line_number "before-sync line"] allow-pending

# The kernel must hit the pre-sync breakpoint.
if {![continue_to_kernel_or_skip "stop before grid.sync"]} {
return
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should report UNSUPPORTED here.

}

# We expect at least one AMDGPU Wave thread to be present.
# With a 64-thread block size and 2 workgroups we should have
# 2 waves on wave64 architectures (or 4 waves on wave32
# architectures).
set waves [info_thread_get_wave_list]
gdb_assert {[llength $waves] >= 2} \
"at least two AMDGPU waves are present pre-sync"
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can there be a race where, when running on wave32, we just have the 2 waves of a single workgroup, but not the waves of the other?


# "info dispatches" should show the cooperative dispatch.
gdb_test "info dispatches" \
".*AMDGPU Dispatch\[^\r\n\]*coop_grid_sync_kernel.*" \
"info dispatches lists cooperative dispatch"

# Replace the pre-sync breakpoint with one after grid.sync ().
# The after-sync breakpoint fires once all waves have crossed
# the GWS barrier, which proves GWS-protected code can be
# debugged across the barrier.
delete_breakpoints
gdb_breakpoint \
[gdb_get_line_number "after-sync line"] allow-pending

gdb_test "continue" \
"hit Breakpoint $::decimal,\[^\r\n\]*coop_grid_sync_kernel\[^\r\n\]*\r\n\[^\r\n\]*" \
"stop after grid.sync"

# The kernel should be able to run to completion afterwards.
delete_breakpoints
gdb_test "continue" \
"\\\[Inferior 1 \[^\r\n\]* exited normally\\\]" \
"continue to program end"
}
}

# Test: place a breakpoint inside the cooperative kernel, then iterate
# over every stopped GPU wave and verify that each one has a backtrace
# pointing into coop_grid_sync_kernel. This exercises the wave
# organization a debugger sees when several waves participate in a GWS
# barrier.

proc_with_prefix test_threads_in_coop_kernel {} {
clean_restart
gdb_load $::binfile

with_rocm_gpu_lock {
if {![runto_main]} {
return
}

gdb_breakpoint \
[gdb_get_line_number "before-sync line"] allow-pending

if {![continue_to_kernel_or_skip "stop in cooperative kernel"]} {
return
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Report UNSUPPORTED here.

}

# Collect the list of AMDGPU waves and check that every one of
# them has a backtrace pointing into coop_grid_sync_kernel.
set waves [info_thread_get_wave_list]
gdb_assert {[llength $waves] >= 2} "multiple GPU waves listed"

foreach wave $waves {
with_test_prefix "wave $wave" {
gdb_test "thread $wave" "Switching to thread $wave.*" \
"switch to wave $wave"
gdb_test "bt 1" \
"#0\[^\r\n\]*coop_grid_sync_kernel\[^\r\n\]*" \
"backtrace inside coop_grid_sync_kernel"
}
}

# Resume to completion.
delete_breakpoints
gdb_test "continue" \
"\\\[Inferior 1 \[^\r\n\]* exited normally\\\]" \
"continue to program end"
}
}

test_break_around_grid_sync
test_threads_in_coop_kernel
Loading
Loading