Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 51 additions & 21 deletions src/header/TransferBench.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2313,12 +2313,20 @@ namespace {
}
break;
case EXE_GPU_DMA:
if (t.srcs.size() != 1) {
if (t.srcs.size() > 1) {
errors.push_back({ERR_FATAL,
"Transfer %d: DMA executor must have 0 or 1 sources", i});
hasFatalError = true;
break;
}
#if defined(__NVCC__)
if (t.srcs.empty()) {
errors.push_back({ERR_FATAL,
"Transfer %d: DMA executor must have exactly 1 source", i});
"Transfer %d: DMA memset (0 sources) not supported on NVIDIA hardware", i});
hasFatalError = true;
break;
}
#endif
if (t.dsts.size() < 1) {
errors.push_back({ERR_FATAL,
"Transfer %d: DMA executor must have at least 1 destination", i});
Expand All @@ -2341,6 +2349,12 @@ namespace {
hasFatalError = true;
break;
#else
if (t.srcs.empty()) {
errors.push_back({ERR_FATAL,
"Transfer %d: DMA memset (0 sources) does not support engine selection", i});
hasFatalError = true;
break;
}
useSubIndexCount[t.exeDevice]++;
int numSubIndices = GetNumExecutorSubIndices(t.exeDevice);
if (t.exeSubIndex >= numSubIndices) {
Expand All @@ -2361,7 +2375,6 @@ namespace {
hasFatalError = true;
break;
}

}

int numDsts = (int)t.dsts.size();
Expand Down Expand Up @@ -2400,22 +2413,24 @@ namespace {
#endif
}

if (!IsGpuMemType(t.srcs[0].memType) && !IsGpuMemType(t.dsts[0].memType)) {
errors.push_back({ERR_WARN,
"Transfer %d: No GPU memory for source or destination. Copy might not execute on DMA %d",
i, t.exeDevice.exeIndex});
} else {
// Currently HIP will use src agent if source memory is GPU, otherwise dst agent
if (IsGpuMemType(t.srcs[0].memType)) {
if (t.srcs[0].memIndex != t.exeDevice.exeIndex) {
if (!t.srcs.empty()) {
if (!IsGpuMemType(t.srcs[0].memType) && !IsGpuMemType(t.dsts[0].memType)) {
errors.push_back({ERR_WARN,
"Transfer %d: No GPU memory for source or destination. Copy might not execute on DMA %d",
i, t.exeDevice.exeIndex});
} else {
// Currently HIP will use src agent if source memory is GPU, otherwise dst agent
if (IsGpuMemType(t.srcs[0].memType)) {
if (t.srcs[0].memIndex != t.exeDevice.exeIndex) {
errors.push_back({ERR_WARN,
"Transfer %d: DMA executor may automatically switch to using the source memory device (%d) not (%d)",
i, t.srcs[0].memIndex, t.exeDevice.exeIndex});
}
} else if (t.dsts[0].memIndex != t.exeDevice.exeIndex) {
errors.push_back({ERR_WARN,
"Transfer %d: DMA executor may automatically switch to using the source memory device (%d) not (%d)",
i, t.srcs[0].memIndex, t.exeDevice.exeIndex});
"Transfer %d: DMA executor may automatically switch to using the destination memory device (%d) not (%d)",
i, t.dsts[0].memIndex, t.exeDevice.exeIndex});
}
} else if (t.dsts[0].memIndex != t.exeDevice.exeIndex) {
errors.push_back({ERR_WARN,
"Transfer %d: DMA executor may automatically switch to using the destination memory device (%d) not (%d)",
i, t.dsts[0].memIndex, t.exeDevice.exeIndex});
}
}
break;
Expand Down Expand Up @@ -4263,8 +4278,10 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
rss.dstAgent[dstIdx] = info.agentOwner;
}

ERR_CHECK(hsa_amd_pointer_info(rss.srcMem[0], &info, NULL, NULL, NULL));
rss.srcAgent = info.agentOwner;
if (!rss.srcMem.empty()) {
ERR_CHECK(hsa_amd_pointer_info(rss.srcMem[0], &info, NULL, NULL, NULL));
rss.srcAgent = info.agentOwner;
}

// Create HSA completion signal
ERR_CHECK(hsa_signal_create(1, 0, NULL, &rss.signal));
Expand Down Expand Up @@ -5450,7 +5467,20 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
int numDsts = (int)resources.dstMem.size();
ERR_CHECK(hipSetDevice(exeIndex));
int subIterations = 0;
if (!useSubIndices && !cfg.dma.useHsaCopy) {
if (resources.srcMem.empty()) {
// DMA memset: fill each destination via LINEAR_FILL on SDMA DACC BE.
// count is in uint32_t units; value matches MEMSET_VAL byte pattern (0x4B4B4B4B).
#if !defined(__NVCC__)
uint32_t fillVal;
float const f = MEMSET_VAL;
memcpy(&fillVal, &f, sizeof(fillVal));
size_t const count = resources.numBytes / sizeof(uint32_t);
do {
for (int dstIdx = 0; dstIdx < numDsts; dstIdx++)
ERR_CHECK(hsa_amd_memory_fill(resources.dstMem[dstIdx], fillVal, count));
} while (++subIterations != cfg.general.numSubIterations);
Comment thread
nileshnegi marked this conversation as resolved.
#endif
} else if (!useSubIndices && !cfg.dma.useHsaCopy) {
if (cfg.dma.useHipEvents)
ERR_CHECK(hipEventRecord(startEvent, stream));

Expand Down Expand Up @@ -5510,7 +5540,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)

if (iteration >= 0) {
double deltaMsec = cpuDeltaMsec;
if (!useSubIndices && !cfg.dma.useHsaCopy && cfg.dma.useHipEvents) {
if (!resources.srcMem.empty() && !useSubIndices && !cfg.dma.useHsaCopy && cfg.dma.useHipEvents) {
float gpuDeltaMsec;
ERR_CHECK(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent));
deltaMsec = gpuDeltaMsec / cfg.general.numSubIterations;
Expand Down