From fdcbf244da20f9a5381bc840e5a26fb6e8cadc3b Mon Sep 17 00:00:00 2001
From: nileshnegi
Date: Sun, 10 May 2026 09:01:16 -0500
Subject: [PATCH 1/2] Add DMA memset support (no-src EXE_GPU_DMA transfer)

Allow EXE_GPU_DMA transfers with zero sources to perform a memset using
hsa_amd_memory_fill, which enqueues a LINEAR_FILL operation on the SDMA
engines.

Fill value:
  uint32_t fillVal = bit_cast(MEMSET_VAL); // 0x4B4B4B4B
  count = numBytes / sizeof(uint32_t);     // count is in uint32_t units

0x4B4B4B4B matches both memset(MEMSET_CHAR) (used by dstReference[0]) and
MemsetVal() used by the GFX no-src kernel, so existing correctness
validation passes without changes.

Validation changes (AMD only, gated on !__NVCC__):
- DMA no-src is now valid; rejected only on NVIDIA builds
- DMA no-src with a specific SDMA engine (e.g. "n d0.2 g1") is rejected
  because hsa_amd_memory_fill has no engine-selection parameter
- Copy-agent-selection warnings guarded by !t.srcs.empty() to avoid
  out-of-bounds access when no source is specified

Execution changes (ExecuteDmaTransfer):
- no-src hoisted before hipMemcpy/HSA-async-copy branches
- Copy paths (hipMemcpy and HSA async copy) unchanged

HSA resource setup:
- srcMem pointer-info query guarded by !rss.srcMem.empty()

Co-authored-by: Claude
---
 src/header/TransferBench.hpp | 70 +++++++++++++++++++++++++-----------
 1 file changed, 50 insertions(+), 20 deletions(-)

diff --git a/src/header/TransferBench.hpp b/src/header/TransferBench.hpp
index 9f4589c..a2d1faf 100644
--- a/src/header/TransferBench.hpp
+++ b/src/header/TransferBench.hpp
@@ -2313,12 +2313,20 @@ namespace {
       }
       break;
     case EXE_GPU_DMA:
-      if (t.srcs.size() != 1) {
+      if (t.srcs.size() > 1) {
+        errors.push_back({ERR_FATAL,
+                          "Transfer %d: DMA executor must have 0 or 1 sources", i});
+        hasFatalError = true;
+        break;
+      }
+#if defined(__NVCC__)
+      if (t.srcs.empty()) {
         errors.push_back({ERR_FATAL,
-                          "Transfer %d: DMA executor must have exactly 1 source", i});
+                          "Transfer %d: DMA memset (0 sources) not supported on NVIDIA hardware", i});
         hasFatalError = true;
         break;
       }
+#endif
       if (t.dsts.size() < 1) {
         errors.push_back({ERR_FATAL,
                           "Transfer %d: DMA executor must have at least 1 destination", i});
@@ -2341,6 +2349,12 @@
         hasFatalError = true;
         break;
 #else
+        if (t.srcs.empty()) {
+          errors.push_back({ERR_FATAL,
+                            "Transfer %d: DMA memset (0 sources) does not support engine selection", i});
+          hasFatalError = true;
+          break;
+        }
         useSubIndexCount[t.exeDevice]++;
         int numSubIndices = GetNumExecutorSubIndices(t.exeDevice);
         if (t.exeSubIndex >= numSubIndices) {
@@ -2361,7 +2375,6 @@
           hasFatalError = true;
           break;
         }
-      }
 
       int numDsts = (int)t.dsts.size();
 
@@ -2400,22 +2413,24 @@
 #endif
       }
 
-      if (!IsGpuMemType(t.srcs[0].memType) && !IsGpuMemType(t.dsts[0].memType)) {
-        errors.push_back({ERR_WARN,
-                          "Transfer %d: No GPU memory for source or destination. Copy might not execute on DMA %d",
-                          i, t.exeDevice.exeIndex});
-      } else {
-        // Currently HIP will use src agent if source memory is GPU, otherwise dst agent
-        if (IsGpuMemType(t.srcs[0].memType)) {
-          if (t.srcs[0].memIndex != t.exeDevice.exeIndex) {
+      if (!t.srcs.empty()) {
+        if (!IsGpuMemType(t.srcs[0].memType) && !IsGpuMemType(t.dsts[0].memType)) {
+          errors.push_back({ERR_WARN,
+                            "Transfer %d: No GPU memory for source or destination. Copy might not execute on DMA %d",
+                            i, t.exeDevice.exeIndex});
+        } else {
+          // Currently HIP will use src agent if source memory is GPU, otherwise dst agent
+          if (IsGpuMemType(t.srcs[0].memType)) {
+            if (t.srcs[0].memIndex != t.exeDevice.exeIndex) {
+              errors.push_back({ERR_WARN,
+                                "Transfer %d: DMA executor may automatically switch to using the source memory device (%d) not (%d)",
+                                i, t.srcs[0].memIndex, t.exeDevice.exeIndex});
+            }
+          } else if (t.dsts[0].memIndex != t.exeDevice.exeIndex) {
             errors.push_back({ERR_WARN,
-                              "Transfer %d: DMA executor may automatically switch to using the source memory device (%d) not (%d)",
-                              i, t.srcs[0].memIndex, t.exeDevice.exeIndex});
+                              "Transfer %d: DMA executor may automatically switch to using the destination memory device (%d) not (%d)",
+                              i, t.dsts[0].memIndex, t.exeDevice.exeIndex});
           }
-        } else if (t.dsts[0].memIndex != t.exeDevice.exeIndex) {
-          errors.push_back({ERR_WARN,
-                            "Transfer %d: DMA executor may automatically switch to using the destination memory device (%d) not (%d)",
-                            i, t.dsts[0].memIndex, t.exeDevice.exeIndex});
         }
       }
       break;
@@ -4263,8 +4278,10 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
         rss.dstAgent[dstIdx] = info.agentOwner;
       }
 
-      ERR_CHECK(hsa_amd_pointer_info(rss.srcMem[0], &info, NULL, NULL, NULL));
-      rss.srcAgent = info.agentOwner;
+      if (!rss.srcMem.empty()) {
+        ERR_CHECK(hsa_amd_pointer_info(rss.srcMem[0], &info, NULL, NULL, NULL));
+        rss.srcAgent = info.agentOwner;
+      }
 
       // Create HSA completion signal
       ERR_CHECK(hsa_signal_create(1, 0, NULL, &rss.signal));
@@ -5450,7 +5467,20 @@
     int numDsts = (int)resources.dstMem.size();
     ERR_CHECK(hipSetDevice(exeIndex));
     int subIterations = 0;
-    if (!useSubIndices && !cfg.dma.useHsaCopy) {
+    if (resources.srcMem.empty()) {
+      // DMA memset: fill each destination via LINEAR_FILL on SDMA DACC BE.
+      // count is in uint32_t units; value matches MEMSET_VAL byte pattern (0x4B4B4B4B).
+#if !defined(__NVCC__)
+      uint32_t fillVal;
+      float const f = MEMSET_VAL;
+      memcpy(&fillVal, &f, sizeof(fillVal));
+      size_t const count = resources.numBytes / sizeof(uint32_t);
+      do {
+        for (int dstIdx = 0; dstIdx < numDsts; dstIdx++)
+          ERR_CHECK(hsa_amd_memory_fill(resources.dstMem[dstIdx], fillVal, count));
+      } while (++subIterations != cfg.general.numSubIterations);
+#endif
+    } else if (!useSubIndices && !cfg.dma.useHsaCopy) {
       if (cfg.dma.useHipEvents)
         ERR_CHECK(hipEventRecord(startEvent, stream));
 

From a5353d4d618560bebf31f88d709af9300799680f Mon Sep 17 00:00:00 2001
From: nileshnegi
Date: Sun, 10 May 2026 09:38:51 -0500
Subject: [PATCH 2/2] Fix DMA memset timing: skip HIP event elapsed time for
 0-src transfers

hsa_amd_memory_fill does not record HIP events, so querying
hipEventElapsedTime after a 0-src DMA transfer produced an "invalid
resource handle" error. Guard the HIP event timing path with
!resources.srcMem.empty(); the fill path falls back to CPU wall-clock
time, which is accurate since hsa_amd_memory_fill is synchronous.

Co-authored-by: Claude
---
 src/header/TransferBench.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/header/TransferBench.hpp b/src/header/TransferBench.hpp
index a2d1faf..d8359c5 100644
--- a/src/header/TransferBench.hpp
+++ b/src/header/TransferBench.hpp
@@ -5540,7 +5540,7 @@
 
     if (iteration >= 0) {
       double deltaMsec = cpuDeltaMsec;
-      if (!useSubIndices && !cfg.dma.useHsaCopy && cfg.dma.useHipEvents) {
+      if (!resources.srcMem.empty() && !useSubIndices && !cfg.dma.useHsaCopy && cfg.dma.useHipEvents) {
        float gpuDeltaMsec;
        ERR_CHECK(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent));
        deltaMsec = gpuDeltaMsec / cfg.general.numSubIterations;