From fdcbf244da20f9a5381bc840e5a26fb6e8cadc3b Mon Sep 17 00:00:00 2001
From: nileshnegi
Date: Sun, 10 May 2026 09:01:16 -0500
Subject: [PATCH 1/2] Add DMA memset support (no-src EXE_GPU_DMA transfer)

Allow EXE_GPU_DMA transfers with zero sources to perform a memset using
hsa_amd_memory_fill, which enqueues a LINEAR_FILL operation on the SDMA
engines.

Fill value:
  uint32_t fillVal = bit_cast(MEMSET_VAL); // 0x4B4B4B4B
  count = numBytes / sizeof(uint32_t);     // count is in uint32_t units

0x4B4B4B4B matches both memset(MEMSET_CHAR) (used by dstReference[0]) and
MemsetVal() used by the GFX no-src kernel, so existing correctness
validation passes without changes.

Validation changes (AMD only, gated on !__NVCC__):
- DMA no-src is now valid; rejected only on NVIDIA builds
- DMA no-src with a specific SDMA engine (e.g. "n d0.2 g1") is rejected
  because hsa_amd_memory_fill has no engine-selection parameter
- Copy-agent-selection warnings guarded by !t.srcs.empty() to avoid
  out-of-bounds access when no source is specified

Execution changes (ExecuteDmaTransfer):
- no-src hoisted before hipMemcpy/HSA-async-copy branches
- Copy paths (hipMemcpy and HSA async copy) unchanged

HSA resource setup:
- srcMem pointer-info query guarded by !rss.srcMem.empty()

Co-authored-by: Claude
---
 src/header/TransferBench.hpp | 70 +++++++++++++++++++++++++-----------
 1 file changed, 50 insertions(+), 20 deletions(-)

diff --git a/src/header/TransferBench.hpp b/src/header/TransferBench.hpp
index 9f4589c..a2d1faf 100644
--- a/src/header/TransferBench.hpp
+++ b/src/header/TransferBench.hpp
@@ -2313,12 +2313,20 @@ namespace {
       }
       break;
     case EXE_GPU_DMA:
-      if (t.srcs.size() != 1) {
+      if (t.srcs.size() > 1) {
+        errors.push_back({ERR_FATAL,
+                          "Transfer %d: DMA executor must have 0 or 1 sources", i});
+        hasFatalError = true;
+        break;
+      }
+#if defined(__NVCC__)
+      if (t.srcs.empty()) {
         errors.push_back({ERR_FATAL,
-                          "Transfer %d: DMA executor must have exactly 1 source", i});
+                          "Transfer %d: DMA memset (0 sources) not supported on NVIDIA hardware", i});
         hasFatalError = true;
         break;
       }
+#endif
       if (t.dsts.size() < 1) {
         errors.push_back({ERR_FATAL,
                           "Transfer %d: DMA executor must have at least 1 destination", i});
@@ -2341,6 +2349,12 @@
         hasFatalError = true;
         break;
 #else
+        if (t.srcs.empty()) {
+          errors.push_back({ERR_FATAL,
+                            "Transfer %d: DMA memset (0 sources) does not support engine selection", i});
+          hasFatalError = true;
+          break;
+        }
         useSubIndexCount[t.exeDevice]++;
         int numSubIndices = GetNumExecutorSubIndices(t.exeDevice);
         if (t.exeSubIndex >= numSubIndices) {
@@ -2361,7 +2375,6 @@
           hasFatalError = true;
           break;
         }
-      }
 
       int numDsts = (int)t.dsts.size();
 
@@ -2400,22 +2413,24 @@
 #endif
       }
 
-      if (!IsGpuMemType(t.srcs[0].memType) && !IsGpuMemType(t.dsts[0].memType)) {
-        errors.push_back({ERR_WARN,
-                          "Transfer %d: No GPU memory for source or destination. Copy might not execute on DMA %d",
-                          i, t.exeDevice.exeIndex});
-      } else {
-        // Currently HIP will use src agent if source memory is GPU, otherwise dst agent
-        if (IsGpuMemType(t.srcs[0].memType)) {
-          if (t.srcs[0].memIndex != t.exeDevice.exeIndex) {
+      if (!t.srcs.empty()) {
+        if (!IsGpuMemType(t.srcs[0].memType) && !IsGpuMemType(t.dsts[0].memType)) {
+          errors.push_back({ERR_WARN,
+                            "Transfer %d: No GPU memory for source or destination. Copy might not execute on DMA %d",
+                            i, t.exeDevice.exeIndex});
+        } else {
+          // Currently HIP will use src agent if source memory is GPU, otherwise dst agent
+          if (IsGpuMemType(t.srcs[0].memType)) {
+            if (t.srcs[0].memIndex != t.exeDevice.exeIndex) {
+              errors.push_back({ERR_WARN,
+                                "Transfer %d: DMA executor may automatically switch to using the source memory device (%d) not (%d)",
+                                i, t.srcs[0].memIndex, t.exeDevice.exeIndex});
+            }
+          } else if (t.dsts[0].memIndex != t.exeDevice.exeIndex) {
             errors.push_back({ERR_WARN,
-                              "Transfer %d: DMA executor may automatically switch to using the source memory device (%d) not (%d)",
-                              i, t.srcs[0].memIndex, t.exeDevice.exeIndex});
+                              "Transfer %d: DMA executor may automatically switch to using the destination memory device (%d) not (%d)",
+                              i, t.dsts[0].memIndex, t.exeDevice.exeIndex});
           }
-        } else if (t.dsts[0].memIndex != t.exeDevice.exeIndex) {
-          errors.push_back({ERR_WARN,
-                            "Transfer %d: DMA executor may automatically switch to using the destination memory device (%d) not (%d)",
-                            i, t.dsts[0].memIndex, t.exeDevice.exeIndex});
         }
       }
       break;
@@ -4263,8 +4278,10 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
         rss.dstAgent[dstIdx] = info.agentOwner;
       }
 
-      ERR_CHECK(hsa_amd_pointer_info(rss.srcMem[0], &info, NULL, NULL, NULL));
-      rss.srcAgent = info.agentOwner;
+      if (!rss.srcMem.empty()) {
+        ERR_CHECK(hsa_amd_pointer_info(rss.srcMem[0], &info, NULL, NULL, NULL));
+        rss.srcAgent = info.agentOwner;
+      }
 
       // Create HSA completion signal
       ERR_CHECK(hsa_signal_create(1, 0, NULL, &rss.signal));
@@ -5450,7 +5467,20 @@
     int numDsts = (int)resources.dstMem.size();
     ERR_CHECK(hipSetDevice(exeIndex));
     int subIterations = 0;
-    if (!useSubIndices && !cfg.dma.useHsaCopy) {
+    if (resources.srcMem.empty()) {
+      // DMA memset: fill each destination via LINEAR_FILL on SDMA DACC BE.
+      // count is in uint32_t units; value matches MEMSET_VAL byte pattern (0x4B4B4B4B).
+#if !defined(__NVCC__)
+      uint32_t fillVal;
+      float const f = MEMSET_VAL;
+      memcpy(&fillVal, &f, sizeof(fillVal));
+      size_t const count = resources.numBytes / sizeof(uint32_t);
+      do {
+        for (int dstIdx = 0; dstIdx < numDsts; dstIdx++)
+          ERR_CHECK(hsa_amd_memory_fill(resources.dstMem[dstIdx], fillVal, count));
+      } while (++subIterations != cfg.general.numSubIterations);
+#endif
+    } else if (!useSubIndices && !cfg.dma.useHsaCopy) {
       if (cfg.dma.useHipEvents)
         ERR_CHECK(hipEventRecord(startEvent, stream));
 

From a5353d4d618560bebf31f88d709af9300799680f Mon Sep 17 00:00:00 2001
From: nileshnegi
Date: Sun, 10 May 2026 09:38:51 -0500
Subject: [PATCH 2/2] Fix DMA memset timing: skip HIP event elapsed time for
 0-src transfers

hsa_amd_memory_fill does not record HIP events, so querying
hipEventElapsedTime after a 0-src DMA transfer produced an "invalid
resource handle" error. Guard the HIP event timing path with
!resources.srcMem.empty(); the fill path falls back to CPU wall-clock
time, which is accurate since hsa_amd_memory_fill is synchronous.

Co-authored-by: Claude
---
 src/header/TransferBench.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/header/TransferBench.hpp b/src/header/TransferBench.hpp
index a2d1faf..d8359c5 100644
--- a/src/header/TransferBench.hpp
+++ b/src/header/TransferBench.hpp
@@ -5540,7 +5540,7 @@
 
     if (iteration >= 0) {
       double deltaMsec = cpuDeltaMsec;
-      if (!useSubIndices && !cfg.dma.useHsaCopy && cfg.dma.useHipEvents) {
+      if (!resources.srcMem.empty() && !useSubIndices && !cfg.dma.useHsaCopy && cfg.dma.useHipEvents) {
        float gpuDeltaMsec;
        ERR_CHECK(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent));
        deltaMsec = gpuDeltaMsec / cfg.general.numSubIterations;