From 29efe1249b5ca578b470c6c3c6535a39ab24f2f9 Mon Sep 17 00:00:00 2001
From: Gilbert Lee
Date: Sat, 2 May 2026 00:20:27 -0500
Subject: [PATCH 1/2] TransferBench v1.67.0

- Initial pod communication support (#235)
- cuda + MNNVL update & pod presets (#241)
- Increase CQ size for high qps (#244)
- Fix hang when NVML is present but fabricmanager isn't (#246)
- Adding nica2a preset (#248)
- Adding HBM read bandwidth preset (#250)
- Pod Ring preset (#251)
- gfxsweep preset (#254) (#256)
- Adding Batched DMA support (hipMemcpyBatchAsync), and bmasweep preset (#255)
- Adding a wallclock consistency detection preset (#258)
- Adding smoketest preset for simple correctness tests (#266)
- Help / envvars / presets presets (#267)
- Modernize CMake build (#268)
- Replace version-based pod/amd-smi detection with compile-time API probes (#269)
- Fix collective mismatch hangs in multi-rank error paths (#270)
- Fix SHOW_ITERATIONS table truncation with multiple transfers per executor (#271)
- Reformat a2asweep output to match gfxsweep style (#272)
- Gfx sweep update (#274)
- Increasing flush frequency in smoketest (#275)
- Adding new experimental copy-only GFX kernel, gfxsweep update (#277)
- Fixes for cuMem compilation and invalid device ordinal (#278)
- Simplifying socket connect, allow for using host address (#279)
- Updating podring to run on single node without need to force single pod (#280)
- Adding SHOW_PERCENTILES to show extra per-iteration statistics (#281)

---------

Co-authored-by: Tim <43156029+AtlantaPepsi@users.noreply.github.com>
Co-authored-by: Pak Nin Lui
Co-authored-by: pierreantoineH
Co-authored-by: Nilesh M Negi
Co-authored-by: Claude
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 CHANGELOG.md                         |   38 +
 CMakeLists.txt                       |  444 ++++--
 Makefile                             |  212 ++-
 examples/example.cfg                 |    6 +-
 src/client/Client.cpp                |   25 +-
 src/client/EnvVars.hpp               |  199 ++-
 src/client/Presets/AllToAll.hpp      |   34 +-
 src/client/Presets/AllToAllN.hpp     |   21 +-
 src/client/Presets/AllToAllSweep.hpp |  160 +-
 src/client/Presets/BmaSweep.hpp      |  182 +++
 src/client/Presets/EnvVarsList.hpp   |   31 +
 src/client/Presets/GfxSweep.hpp      |  239 +++
 src/client/Presets/HbmBandwidth.hpp  |  619 ++++++++
 src/client/Presets/HealthCheck.hpp   |   13 +-
 src/client/Presets/Help.hpp          |  123 ++
 src/client/Presets/NicAllToAll.hpp   |  374 +++++
 src/client/Presets/NicPeerToPeer.hpp |  121 +-
 src/client/Presets/NicRings.hpp      |   19 +-
 src/client/Presets/OneToAll.hpp      |   17 +-
 src/client/Presets/PeerToPeer.hpp    |   20 +-
 src/client/Presets/PodAllToAll.hpp   |  270 ++++
 src/client/Presets/PodPeerToPeer.hpp |  300 ++++
 src/client/Presets/Presets.hpp       |   58 +-
 src/client/Presets/Rings.hpp         |  280 ++++
 src/client/Presets/Scaling.hpp       |   40 +-
 src/client/Presets/Schmoo.hpp        |   63 +-
 src/client/Presets/SmokeTest.hpp     |  336 +++++
 src/client/Presets/Sweep.hpp         |   25 +-
 src/client/Presets/WallClock.hpp     |  234 +++
 src/client/Topology.hpp              |   57 +-
 src/client/Utilities.hpp             |  300 +++-
 src/header/TransferBench.hpp         | 2080 +++++++++++++++++++++-----
 toolchain-linux.cmake                |   34 -
 33 files changed, 5972 insertions(+), 1002 deletions(-)
 create mode 100644 src/client/Presets/BmaSweep.hpp
 create mode 100644 src/client/Presets/EnvVarsList.hpp
 create mode 100644 src/client/Presets/GfxSweep.hpp
 create mode 100644 src/client/Presets/HbmBandwidth.hpp
 create mode 100644 src/client/Presets/Help.hpp
 create mode 100644 src/client/Presets/NicAllToAll.hpp
 create mode 100644 src/client/Presets/PodAllToAll.hpp
 create mode 100644 src/client/Presets/PodPeerToPeer.hpp
 create mode 100644 src/client/Presets/Rings.hpp
 create mode 100644 src/client/Presets/SmokeTest.hpp
 create mode 100644 src/client/Presets/WallClock.hpp
 delete mode 100644 toolchain-linux.cmake

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 95991318..443f667d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,44 @@
 Documentation for TransferBench is available at [https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench).
 
+## v1.67.00
+### Added
+- Initial support for pod communication. Requires compatible hardware / ROCm version and is subject to further testing
+  - This potentially enables GFX/DMA executors to access SRC/DST memory locations on GPUs within the same pod
+  - Pod membership detection requires amd-smi; it can be skipped by setting TB_FORCE_SINGLE_POD=1
+- Support for dumping executed Transfers to a config file specified by TB_DUMP_CFG_FILE
+  - This will write Transfers that are executed (for example via a preset) to a config file that can then be re-executed
+- Reporting the number of iterations run when running in timed mode (NUM_ITERATIONS < 0)
+- Adding NIC_CQ_POLL_BATCH to control CQ poll batch size for NIC transfers
+- New "hbm" preset which sweeps and tests local HBM read performance
+- Added a new TB_WALLCLOCK_RATE env var that overrides the GPU GFX wallclock rate if the queried rate returns 0 (debug)
+- Adding new batched-DMA executor "B", which utilizes the hipMemcpyBatchAsync API introduced in HIP 7.1 / CUDA 12.8
+- Added new "bmasweep" preset that compares DMA to batched-DMA execution for parallel transfers to other GPUs
+- Added new "wallclock" preset that compares wallclock counters across XCCs within a GPU
+- Added new "smoketest" preset that runs a variety of DMA/GFX tests for simple correctness tests
+- Added new "help" preset to show config file examples
+- Added new "presets" preset to show available presets and their descriptions
+- Added new "rings" preset that runs parallel rings of transfers (pod-capable)
+- Added new "envvars" preset to show environment variables that can change TransferBench behavior
+- Adding information on how to run TransferBench multi-rank, displayed when run with no args
+- Added new "nica2a" preset (NIC all-to-all over GPUs via NIC executors, multi-node)
+- Added new GFX_KERNEL to allow experimenting with copy-only GFX kernel. Currently this is opt-in only
+- Added `SHOW_PERCENTILES` (e.g. `50,75,90,95,99`) to show empirical percentiles of per-iteration duration
+
+### Modified
+- DMA-BUF support enablement in CMake changed to ENABLE_DMA_BUF to be more similar to other compile-time options
+- Adding extra information to CMake and make build methods to indicate enabled / disabled features
+- a2asweep preset changes from USE_FINE_GRAIN to MEM_TYPE to reflect various memory types
+- a2asweep preset changes from NUM_CUS to NUM_SUB_EXECS to match the a2a preset naming convention
+- scaling preset changes from using USE_FINE_GRAIN to CPU_MEM_TYPE and GPU_MEM_TYPE
+- NIC_FILTER renamed to TB_NIC_FILTER for consistency
+- DUMP_LINES renamed to TB_DUMP_LINES for consistency
+- Dynamically size CQs for NIC transfers in the high-QP case
+- Switch to using hipMemcpyDeviceToDeviceNoCU instead of hipMemcpyDefault for DMA Executor if available (requires HIP >= 6.0)
+- Allow for multiple destination memory locations for DMA/Batched-DMA Transfers
+- Removed env var and preset printing when running TransferBench with no args
+- Simplified socket comm usage - the first rank only needs to set TB_NUM_RANKS=X to see connection info
+
 ## v1.66.02
 ### Added
 - Adding DMA-BUF support

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2b6591d3..c73e33d5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,35 +1,137 @@
 # Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
-cmake_minimum_required(VERSION 3.5 FATAL_ERROR)
+# MPI::MPI_CXX and hip:: config targets require >= 3.9; 3.16 for modern policy defaults.
+cmake_minimum_required(VERSION 3.16 FATAL_ERROR)
 
-# CMake Toolchain file to define compilers and path to ROCm
+# Pre-project: ROCM_PATH detection and compiler/flag selection.
+# Must be before project() so CMake uses the right compiler on first configure.
+# Priority: -DROCM_PATH / $ROCM_PATH env > PATH (amdclang++) > /opt/rocm
 #==================================================================================================
-if (NOT CMAKE_TOOLCHAIN_FILE)
-  set(CMAKE_TOOLCHAIN_FILE "${CMAKE_CURRENT_SOURCE_DIR}/toolchain-linux.cmake")
-  message(STATUS "CMAKE_TOOLCHAIN_FILE: ${CMAKE_TOOLCHAIN_FILE}")
+
+# ROCM_PATH: Check CMake cache or environment.
+if(NOT ROCM_PATH)
+  if(DEFINED ENV{ROCM_PATH} AND NOT "$ENV{ROCM_PATH}" STREQUAL "")
+    set(ROCM_PATH "$ENV{ROCM_PATH}" CACHE PATH "Path to ROCm installation.")
+  endif()
+endif()
+
+# ROCM_PATH: Derive from PATH; walk up from amdclang++/clang++ to the ROCm root.
+# Handles both ${ROCM_PATH}/bin/ and ${ROCM_PATH}/llvm/bin/ layouts.
+if(NOT ROCM_PATH)
+  find_program(_rocm_bin_hint NAMES amdclang++ clang++)
+  if(_rocm_bin_hint)
+    get_filename_component(_bin_dir "${_rocm_bin_hint}" DIRECTORY)
+    get_filename_component(_parent "${_bin_dir}" DIRECTORY)
+    if(EXISTS "${_parent}/lib/libamdhip64.so" OR EXISTS "${_parent}/lib64/libamdhip64.so")
+      set(ROCM_PATH "${_parent}" CACHE PATH "Path to ROCm installation (auto-detected from PATH).")
+      message(STATUS "ROCM_PATH auto-detected from PATH: ${ROCM_PATH}")
+    else()
+      get_filename_component(_grandparent "${_parent}" DIRECTORY)
+      if(EXISTS "${_grandparent}/lib/libamdhip64.so" OR EXISTS "${_grandparent}/lib64/libamdhip64.so")
+        set(ROCM_PATH "${_grandparent}" CACHE PATH "Path to ROCm installation (auto-detected from PATH).")
+        message(STATUS "ROCM_PATH auto-detected from PATH: ${ROCM_PATH}")
+      endif()
+    endif()
+  endif()
+  unset(_rocm_bin_hint CACHE)
+  unset(_rocm_bin_hint)
+  unset(_bin_dir)
+  unset(_parent)
+  unset(_grandparent)
+endif()
+
+# ROCM_PATH: Fallback.
+if(NOT ROCM_PATH) + set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to ROCm installation.") + message(WARNING "ROCM_PATH not found; falling back to ${ROCM_PATH}") endif() -set(VERSION_STRING "1.66.02") +if(NOT EXISTS "${ROCM_PATH}") + message(FATAL_ERROR "ROCM_PATH=${ROCM_PATH} does not exist") +endif() + +message(STATUS "ROCM_PATH: ${ROCM_PATH}") + +# Compiler detection: amdclang++ > llvm/amdclang++ > llvm/clang++ +# Respects -DCMAKE_CXX_COMPILER and $CXX / $CC env vars. +if(NOT CMAKE_CXX_COMPILER) + if(DEFINED ENV{CXX} AND NOT "$ENV{CXX}" STREQUAL "") + set(CMAKE_CXX_COMPILER "$ENV{CXX}" CACHE PATH "Path to C++ compiler") + elseif(EXISTS "${ROCM_PATH}/bin/amdclang++") + set(CMAKE_CXX_COMPILER "${ROCM_PATH}/bin/amdclang++" CACHE PATH "Path to C++ compiler") + elseif(EXISTS "${ROCM_PATH}/llvm/bin/amdclang++") + set(CMAKE_CXX_COMPILER "${ROCM_PATH}/llvm/bin/amdclang++" CACHE PATH "Path to C++ compiler") + elseif(EXISTS "${ROCM_PATH}/llvm/bin/clang++") + set(CMAKE_CXX_COMPILER "${ROCM_PATH}/llvm/bin/clang++" CACHE PATH "Path to C++ compiler") + else() + message(FATAL_ERROR + "Cannot find amdclang++/clang++ under ${ROCM_PATH}/bin or ${ROCM_PATH}/llvm/bin") + endif() +endif() + +if(NOT CMAKE_C_COMPILER) + if(DEFINED ENV{CC} AND NOT "$ENV{CC}" STREQUAL "") + set(CMAKE_C_COMPILER "$ENV{CC}" CACHE PATH "Path to C compiler") + else() + get_filename_component(_cxx_dir "${CMAKE_CXX_COMPILER}" DIRECTORY) + get_filename_component(_cxx_name "${CMAKE_CXX_COMPILER}" NAME) + string(REPLACE "clang++" "clang" _cc_name "${_cxx_name}") + if(EXISTS "${_cxx_dir}/${_cc_name}") + set(CMAKE_C_COMPILER "${_cxx_dir}/${_cc_name}" CACHE PATH "Path to C compiler") + endif() + unset(_cxx_dir) + unset(_cxx_name) + unset(_cc_name) + endif() +endif() + +# Seed default per-config flags. _INIT vars are written to cache on first configure; +# user overrides via -DCMAKE_CXX_FLAGS_DEBUG=... or $CXXFLAGS/$CFLAGS take precedence. +if(NOT (DEFINED ENV{CXXFLAGS} AND NOT "$ENV{CXXFLAGS}" STREQUAL "")) + set(CMAKE_CXX_FLAGS_DEBUG_INIT "-O0 -g -ggdb3") + set(CMAKE_CXX_FLAGS_RELEASE_INIT "-O3") + set(CMAKE_CXX_FLAGS_RELWITHDEBINFO_INIT "-O3 -g") +endif() +if(NOT (DEFINED ENV{CFLAGS} AND NOT "$ENV{CFLAGS}" STREQUAL "")) + set(CMAKE_C_FLAGS_DEBUG_INIT "-O0 -g -ggdb3") + set(CMAKE_C_FLAGS_RELEASE_INIT "-O3") + set(CMAKE_C_FLAGS_RELWITHDEBINFO_INIT "-O3 -g") +endif() + +set(ENV{ROCM_PATH} "${ROCM_PATH}") + +# TransferBench project definitions +#================================================================================================== +set(VERSION_STRING "1.67.00") project(TransferBench VERSION ${VERSION_STRING} LANGUAGES CXX) -## Load CMake modules +if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) + message(STATUS "Setting build type to 'Release' as none was specified.") + set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose the type of build." FORCE) +endif() + +# Load CMake modules +# Extend MODULE_PATH before any include() that searches it. 
#================================================================================================== +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") include(CheckIncludeFiles) include(CheckSymbolExists) -include(cmake/Dependencies.cmake) # rocm-cmake, rocm_local_targets - -list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") +include(CheckCXXSourceCompiles) +include(CheckCXXCompilerFlag) +include(CMakePushCheckState) # Build options #================================================================================================== option(BUILD_LOCAL_GPU_TARGET_ONLY "Build only for GPUs detected on this machine" OFF) option(ENABLE_NIC_EXEC "Enable RDMA NIC Executor in TransferBench" OFF) option(ENABLE_MPI_COMM "Enable MPI Communicator support" OFF) -option(DISABLE_DMABUF "Disable DMA-BUF support for GPU Direct RDMA" ON) +option(ENABLE_DMA_BUF "Enable DMA-BUF support for GPU Direct RDMA" OFF) +option(ENABLE_AMD_SMI "Enable AMD-SMI pod membership queries" OFF) +option(ENABLE_POD_COMM "Enable pod communication" OFF) option(BUILD_RELOCATABLE_PACKAGE "Build with RVS-style relocatable RPATH and amdrocm-transferbench package naming" OFF) -# Default GPU architectures to build -#================================================================================================== +include(cmake/Dependencies.cmake) # rocm-cmake, rocm_local_targets, rocm_check_target_ids + set(DEFAULT_GPUS gfx906 gfx908 @@ -43,190 +145,297 @@ set(DEFAULT_GPUS gfx1150 gfx1151 gfx1200 - gfx1201) + gfx1201 + gfx1250) -## Build only for local GPU architecture if(BUILD_LOCAL_GPU_TARGET_ONLY) message(STATUS "Building only for local GPU target") - if (COMMAND rocm_local_targets) - rocm_local_targets(DEFAULT_GPUS) + if(COMMAND rocm_local_targets) + rocm_local_targets(LOCAL_GPU_TARGETS) + if(LOCAL_GPU_TARGETS) + set(DEFAULT_GPUS ${LOCAL_GPU_TARGETS}) + else() + message(WARNING "No local GPUs detected; falling back to default GPU list.") + endif() else() message(WARNING "Unable to determine local GPU targets. Falling back to default GPUs.") endif() + # FORCE so re-runs pick up the freshly detected local set. + set(GPU_TARGETS "${DEFAULT_GPUS}" CACHE STRING "GPU architectures to build for." FORCE) +else() + # Seeded once on first configure; hip-config-amd.cmake applies the same priority but warns on AMDGPU_TARGETS. + if(NOT DEFINED CACHE{GPU_TARGETS}) + if(DEFINED AMDGPU_TARGETS AND NOT AMDGPU_TARGETS STREQUAL "") + set(GPU_TARGETS "${AMDGPU_TARGETS}" CACHE STRING "GPU architectures to build for.") + message(STATUS "GPU_TARGETS seeded from CMake AMDGPU_TARGETS: ${GPU_TARGETS}") + elseif(DEFINED ENV{AMDGPU_TARGETS} AND NOT "$ENV{AMDGPU_TARGETS}" STREQUAL "") + set(GPU_TARGETS "$ENV{AMDGPU_TARGETS}" CACHE STRING "GPU architectures to build for.") + message(STATUS "GPU_TARGETS seeded from environment AMDGPU_TARGETS: ${GPU_TARGETS}") + else() + set(GPU_TARGETS "${DEFAULT_GPUS}" CACHE STRING "GPU architectures to build for.") + endif() + endif() endif() -## Determine which GPU architectures to build for -set(GPU_TARGETS "${DEFAULT_GPUS}" CACHE STRING "Target default GPUs if GPU_TARGETS is not defined.") - -## Check if clang compiler can offload to GPU_TARGETS -if (COMMAND rocm_check_target_ids) - message(STATUS "Checking for ROCm support for GPU targets: " "${GPU_TARGETS}") +# Check if clang can offload to each GPU_TARGETS entry. 
+if(COMMAND rocm_check_target_ids) + message(STATUS "Checking for ROCm support for GPU targets: ${GPU_TARGETS}") rocm_check_target_ids(SUPPORTED_GPUS TARGETS ${GPU_TARGETS}) else() message(WARNING "Unable to check for supported GPU targets. Falling back to default GPUs.") set(SUPPORTED_GPUS ${DEFAULT_GPUS}) endif() -set(GPU_TARGETS "${SUPPORTED_GPUS}") -message(STATUS "Compiling for ${GPU_TARGETS}") - -## NOTE: Reload rocm-cmake in order to update GPU_TARGETS -include(cmake/Dependencies.cmake) # Reloading to use desired GPU_TARGETS instead of defaults - -# Check for required dependencies -#================================================================================================== -## Try to establish ROCM_PATH (for find_package) -if(NOT DEFINED ROCM_PATH) - # Guess default location - set(ROCM_PATH "/opt/rocm") - message(WARNING "Unable to find ROCM_PATH: Falling back to ${ROCM_PATH}") +if(SUPPORTED_GPUS) + set(GPU_TARGETS "${SUPPORTED_GPUS}" CACHE STRING "GPU architectures to build for." FORCE) else() - message(STATUS "ROCM_PATH found: ${ROCM_PATH}") + message(WARNING "rocm_check_target_ids returned no supported GPUs; keeping existing GPU_TARGETS=${GPU_TARGETS}") endif() -set(ENV{ROCM_PATH} ${ROCM_PATH}) +message(STATUS "- Compiling for ${GPU_TARGETS}") -## Set CMAKE flags -if (NOT DEFINED CMAKE_CXX_STANDARD) +if(NOT DEFINED CMAKE_CXX_STANDARD) set(CMAKE_CXX_STANDARD 17) endif() set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) -list(APPEND CMAKE_PREFIX_PATH # Add ROCM_PATH to CMake search paths for finding HIP / HSA + +# Search only the active ROCm installation. ROCM_PATH is already resolved by the +# pre-project() block, so this is always the right install. +list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH} ${ROCM_PATH}/llvm - ${ROCM_PATH}/hip - /opt/rocm - /opt/rocm/llvm - /opt/rocm/hip) + ${ROCM_PATH}/hip) -## Check for HIP -find_package(hip REQUIRED CONFIG PATHS ${CMAKE_PREFIX_PATH}) +find_package(hip REQUIRED CONFIG) message(STATUS "HIP compiler: ${HIP_COMPILER}") -## Ensuring that CXX compiler meets expectations if(NOT (("${CMAKE_CXX_COMPILER}" MATCHES ".*hipcc") OR ("${CMAKE_CXX_COMPILER}" MATCHES ".*clang\\+\\+"))) message(FATAL_ERROR "On ROCm platform 'hipcc' or HIP-aware Clang must be used as C++ compiler.") endif() ## Check for Threads -find_package(Threads REQUIRED) set(THREADS_PREFER_PTHREAD_FLAG ON) +find_package(Threads REQUIRED) ## Check for numa support find_library(NUMA_LIBRARY numa) find_path(NUMA_INCLUDE_DIR numa.h) if(NUMA_LIBRARY AND NUMA_INCLUDE_DIR) add_library(numa SHARED IMPORTED) - set_target_properties(numa PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${NUMA_INCLUDE_DIR}" IMPORTED_LOCATION "${NUMA_LIBRARY}" INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${NUMA_INCLUDE_DIR}") + set_target_properties(numa PROPERTIES IMPORTED_LOCATION "${NUMA_LIBRARY}" INTERFACE_INCLUDE_DIRECTORIES "${NUMA_INCLUDE_DIR}" INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${NUMA_INCLUDE_DIR}") +else() + message(FATAL_ERROR "NUMA library or headers not found; TransferBench requires libnuma") endif() ## Check for hsa support -find_library(HSA_LIBRARY hsa-runtime64 PATHS ${ROCM_PATH} ${ROCM_PATH}/lib) -find_path(HSA_INCLUDE_DIR hsa.h PATHS ${ROCM_PATH}/include ${ROCM_PATH}/include/hsa) +find_library(HSA_LIBRARY hsa-runtime64 PATHS ${ROCM_PATH}/lib ${ROCM_PATH}/lib64 NO_DEFAULT_PATH) +find_path(HSA_INCLUDE_DIR hsa/hsa.h PATHS ${ROCM_PATH}/include) if(HSA_LIBRARY AND HSA_INCLUDE_DIR) add_library(hsa-runtime64 SHARED IMPORTED) - set_target_properties(hsa-runtime64 PROPERTIES 
INTERFACE_INCLUDE_DIRECTORIES "${HSA_INCLUDE_DIR}" IMPORTED_LOCATION "${HSA_LIBRARY}" INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${HSA_INCLUDE_DIR}") + set_target_properties(hsa-runtime64 PROPERTIES IMPORTED_LOCATION "${HSA_LIBRARY}" INTERFACE_INCLUDE_DIRECTORIES "${HSA_INCLUDE_DIR}" INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${HSA_INCLUDE_DIR}") +else() + message(FATAL_ERROR "HSA library or headers not found under ${ROCM_PATH}; TransferBench requires libhsa-runtime64") endif() ## Check for infiniband verbs support if(DEFINED ENV{DISABLE_NIC_EXEC} AND "$ENV{DISABLE_NIC_EXEC}" STREQUAL "1") message(STATUS "Disabling NIC Executor support as env. flag DISABLE_NIC_EXEC was enabled") elseif(NOT ENABLE_NIC_EXEC) - message(STATUS "For CMake builds, NIC executor so requires explicit opt-in by setting CMake flag -DENABLE_NIC_EXEC=ON") - message(STATUS "Disabling NIC Executor support") + message(STATUS "For CMake builds, NIC Executor support requires explicit opt-in by setting CMake flag -DENABLE_NIC_EXEC=ON") + message(STATUS "- Disabling NIC Executor support") else() + message(STATUS "Attempting to build with NIC executor support") + find_library(IBVERBS_LIBRARY ibverbs) find_path(IBVERBS_INCLUDE_DIR infiniband/verbs.h) if(IBVERBS_LIBRARY AND IBVERBS_INCLUDE_DIR) add_library(ibverbs SHARED IMPORTED) set_target_properties(ibverbs PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${IBVERBS_INCLUDE_DIR}" IMPORTED_LOCATION "${IBVERBS_LIBRARY}" INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${IBVERBS_INCLUDE_DIR}") set(IBVERBS_FOUND 1) - message(STATUS "Building with NIC executor support. Can set DISABLE_NIC_EXEC=1 to disable") + message(STATUS "- Building with NIC executor support. Can set DISABLE_NIC_EXEC=1 to disable") else() if(NOT IBVERBS_LIBRARY) - message(WARNING "IBVerbs library not found") + message(WARNING "- IBVerbs library not found") elseif(NOT IBVERBS_INCLUDE_DIR) - message(WARNING "infiniband/verbs.h not found") + message(WARNING "- infiniband/verbs.h not found") endif() - message(WARNING "Building without NIC executor support. To use the TransferBench RDMA executor, check if your system has NICs, the NIC drivers are installed, and libibverbs-dev is installed") + message(WARNING "- Building without NIC executor support. To use the TransferBench RDMA executor, check if your system has NICs, the NIC drivers are installed, and libibverbs-dev is installed") endif() endif() -## Check for DMA-BUF support (requires IBVERBS_FOUND) -if(IBVERBS_FOUND AND NOT DISABLE_DMABUF) - message(STATUS "Checking for DMA-BUF support...") - - # Check for ibv_reg_dmabuf_mr - include(CheckSymbolExists) - set(CMAKE_REQUIRED_INCLUDES ${IBVERBS_INCLUDE_DIR}) - set(CMAKE_REQUIRED_LIBRARIES ${IBVERBS_LIBRARY}) - check_symbol_exists(ibv_reg_dmabuf_mr "infiniband/verbs.h" HAVE_IBV_DMABUF) - - # Check for hsa_amd_portable_export_dmabuf - set(CMAKE_REQUIRED_INCLUDES ${HSA_INCLUDE_DIR}) - set(CMAKE_REQUIRED_LIBRARIES ${HSA_LIBRARY}) - check_symbol_exists(hsa_amd_portable_export_dmabuf "hsa_ext_amd.h" HAVE_ROCM_DMABUF) - - # Enable DMA-BUF only if both APIs are available - if(HAVE_IBV_DMABUF AND HAVE_ROCM_DMABUF) - set(DMABUF_SUPPORT_FOUND 1) - message(STATUS "Building with DMA-BUF support") +## Check for DMA-BUF support (requires IBVERBS) +if(IBVERBS_FOUND) + if(DEFINED ENV{DISABLE_DMA_BUF} AND "$ENV{DISABLE_DMA_BUF}" STREQUAL "1") + message(STATUS "Disabling DMA-BUF support as env. 
flag DISABLE_DMA_BUF was enabled") + elseif(NOT ENABLE_DMA_BUF) + message(STATUS "For CMake builds, DMA-BUF support requires explicit opt-in by setting CMake flags -DENABLE_DMA_BUF=ON") + message(STATUS "- Disabling DMA-BUF support") else() - if(NOT HAVE_IBV_DMABUF AND NOT HAVE_ROCM_DMABUF) - message(WARNING "Building without DMA-BUF support: missing both ibv_reg_dmabuf_mr and ROCm DMA-BUF export") - elseif(NOT HAVE_IBV_DMABUF) - message(WARNING "Building without DMA-BUF support: missing ibv_reg_dmabuf_mr") + message(STATUS "Attempting to build with DMA-BUF support") + + # Check for ibv_reg_dmabuf_mr + cmake_push_check_state() + set(CMAKE_REQUIRED_INCLUDES ${IBVERBS_INCLUDE_DIR}) + set(CMAKE_REQUIRED_LIBRARIES ${IBVERBS_LIBRARY}) + check_symbol_exists(ibv_reg_dmabuf_mr "infiniband/verbs.h" HAVE_IBV_DMABUF) + cmake_pop_check_state() + + # Check for hsa_amd_portable_export_dmabuf + cmake_push_check_state() + set(CMAKE_REQUIRED_INCLUDES ${HSA_INCLUDE_DIR}) + set(CMAKE_REQUIRED_LIBRARIES ${HSA_LIBRARY}) + check_symbol_exists(hsa_amd_portable_export_dmabuf "hsa/hsa_ext_amd.h" HAVE_ROCM_DMABUF) + cmake_pop_check_state() + + # Enable DMA-BUF only if both APIs are available + if(HAVE_IBV_DMABUF AND HAVE_ROCM_DMABUF) + set(DMABUF_SUPPORT_FOUND 1) + message(STATUS "- Building with DMA-BUF support") else() - message(WARNING "Building without DMA-BUF support: missing ROCm DMA-BUF export") + if(NOT HAVE_IBV_DMABUF AND NOT HAVE_ROCM_DMABUF) + message(WARNING "- Building without DMA-BUF support: missing both ibv_reg_dmabuf_mr and ROCm DMA-BUF export") + elseif(NOT HAVE_IBV_DMABUF) + message(WARNING "- Building without DMA-BUF support: missing ibv_reg_dmabuf_mr") + else() + message(WARNING "- Building without DMA-BUF support: missing ROCm DMA-BUF export") + endif() endif() endif() -elseif(NOT DISABLE_DMABUF) - message(WARNING "DMA-BUF support requires ENABLE_NIC_EXEC=ON") endif() ## Check for MPI support set(MPI_PATH "" CACHE PATH "Path to MPI installation (takes priority over system MPI)") -if(NOT ENABLE_MPI_COMM) +if(DEFINED ENV{DISABLE_MPI_COMM} AND "$ENV{DISABLE_MPI_COMM}" STREQUAL "1") + message(STATUS "Disabling MPI Communicator support as env. 
flag DISABLE_MPI_COMM was enabled") +elseif(NOT ENABLE_MPI_COMM) message(STATUS "For CMake builds, MPI Communicator requires explicit opt-in by setting CMake flag -DENABLE_MPI_COMM=ON") message(STATUS "Disabling MPI Communicator support") else() - # First check user-specified MPI_PATH (similar to Makefile) + message(STATUS "Attempting to build with MPI communicator support") if(MPI_PATH AND EXISTS "${MPI_PATH}/include/mpi.h") - find_library(MPI_LIBRARY NAMES mpi PATHS ${MPI_PATH}/lib NO_DEFAULT_PATH) + find_library(MPI_LIBRARY NAMES mpi PATHS ${MPI_PATH}/lib ${MPI_PATH}/lib64 NO_DEFAULT_PATH) if(MPI_LIBRARY) set(MPI_COMM_FOUND 1) set(MPI_INCLUDE_DIR "${MPI_PATH}/include") - set(MPI_LINK_DIR "${MPI_PATH}/lib") - message(STATUS "Building with MPI Communicator support (found at MPI_PATH: ${MPI_PATH})") + message(STATUS "- Building with MPI Communicator support (found at MPI_PATH: ${MPI_PATH})") else() - message(WARNING "Found mpi.h at ${MPI_PATH}/include but could not find MPI library at ${MPI_PATH}/lib") + message(WARNING "- Found mpi.h at ${MPI_PATH}/include but could not find MPI library at ${MPI_PATH}/lib") endif() else() - # Fall back to find_package if(MPI_PATH) - message(STATUS "Unable to find mpi.h at ${MPI_PATH}/include, trying find_package") + message(STATUS "- Unable to find mpi.h at ${MPI_PATH}/include, trying find_package") endif() find_package(MPI QUIET) if(MPI_CXX_FOUND) set(MPI_COMM_FOUND 1) - message(STATUS "Building with MPI Communicator support (found via find_package)") - message(STATUS "- Using MPI include path: ${MPI_CXX_INCLUDE_PATH}") - message(STATUS "- Using MPI library:: ${MPI_CXX_LIBRARIES}") + message(STATUS "- Building with MPI Communicator support (found via find_package)") + message(STATUS " - Using MPI include path: ${MPI_CXX_INCLUDE_DIRS}") + message(STATUS " - Using MPI library: ${MPI_CXX_LIBRARIES}") else() - message(WARNING "MPI not found. Please specify appropriate MPI_PATH or install MPI libraries (e.g., OpenMPI or MPICH)") + message(WARNING "- MPI not found. Please specify appropriate MPI_PATH or install MPI libraries (e.g., OpenMPI or MPICH)") endif() endif() endif() -set(CMAKE_RUNTIME_OUTPUT_DIRECTORY .) +## Check for pod communication support +if(ENABLE_AMD_SMI AND NOT ENABLE_POD_COMM) + message(WARNING "ENABLE_AMD_SMI=ON has no effect without ENABLE_POD_COMM=ON; AMD-SMI detection will be skipped") +endif() +if(DEFINED ENV{DISABLE_POD_COMM} AND "$ENV{DISABLE_POD_COMM}" STREQUAL "1") + message(STATUS "Disabling pod communication support as env. flag DISABLE_POD_COMM was enabled") +elseif(NOT ENABLE_POD_COMM) + message(STATUS "For CMake builds, pod communication support requires explicit opt-in by setting CMake flag -DENABLE_POD_COMM=ON") + message(STATUS "- Disabling pod communication support") +else() + find_library(HIP_RUNTIME_LIBRARY amdhip64 PATHS ${ROCM_PATH}/lib ${ROCM_PATH}/lib64 NO_DEFAULT_PATH) + if(NOT HIP_RUNTIME_LIBRARY) + message(FATAL_ERROR "libamdhip64 not found under ${ROCM_PATH}/lib or ${ROCM_PATH}/lib64; cannot probe for HIP fabric API") + endif() + # Probe for the HIP fabric API functions used by TransferBench at runtime. 
+  cmake_push_check_state()
+  set(CMAKE_REQUIRED_INCLUDES "${ROCM_PATH}/include")
+  set(CMAKE_REQUIRED_LIBRARIES "${HIP_RUNTIME_LIBRARY}")
+  set(CMAKE_REQUIRED_DEFINITIONS "-D__HIP_PLATFORM_AMD__")
+  check_cxx_source_compiles("
+    #include <hip/hip_runtime.h>
+    int main() {
+      hipMemFabricHandle_t fabricHandle = {};
+      hipMemGenericAllocationHandle_t allocationHandle = {};
+      hipMemExportToShareableHandle(&fabricHandle, allocationHandle, hipMemHandleTypeFabric, 0);
+      hipMemImportFromShareableHandle(&allocationHandle, &fabricHandle, hipMemHandleTypeFabric);
+      return 0;
+    }" HIP_HAS_FABRIC_API)
+  cmake_pop_check_state()
+
+  if(HIP_HAS_FABRIC_API)
+    message(STATUS "- HIP fabric API found; enabling pod communication support")
+    set(POD_COMM_FOUND 1)
+
+    # Check for AMD-SMI support
+    # Try amd-smi for pod membership queries; fall back to TB_FORCE_SINGLE_POD=1 at runtime.
+    if(DEFINED ENV{DISABLE_AMD_SMI} AND "$ENV{DISABLE_AMD_SMI}" STREQUAL "1")
+      message(STATUS "- Disabling AMD-SMI as env. flag DISABLE_AMD_SMI was enabled")
+      message(WARNING "Set TB_FORCE_SINGLE_POD=1 at runtime to override pod membership")
+    elseif(NOT ENABLE_AMD_SMI)
+      message(STATUS "- For CMake builds, AMD-SMI requires explicit opt-in by setting CMake flag -DENABLE_AMD_SMI=ON")
+      message(WARNING "Set TB_FORCE_SINGLE_POD=1 at runtime to override pod membership")
+    else()
+      find_path(AMD_SMI_INCLUDE_DIR amd_smi/amdsmi.h PATHS ${ROCM_PATH}/include NO_DEFAULT_PATH)
+      find_library(AMD_SMI_LIBRARY amd_smi PATHS ${ROCM_PATH}/lib ${ROCM_PATH}/lib64 NO_DEFAULT_PATH)
+      if(AMD_SMI_INCLUDE_DIR AND AMD_SMI_LIBRARY)
+        # Probe for the AMD-SMI functions used by TransferBench at runtime.
+        cmake_push_check_state()
+        set(CMAKE_REQUIRED_INCLUDES "${AMD_SMI_INCLUDE_DIR}")
+        set(CMAKE_REQUIRED_LIBRARIES "${AMD_SMI_LIBRARY}")
+        check_cxx_source_compiles("
+          #include <amd_smi/amdsmi.h>
+          int main() {
+            amdsmi_bdf_t bdf = {};
+            amdsmi_processor_handle h;
+            amdsmi_get_processor_handle_from_bdf(bdf, &h);
+            amdsmi_fabric_info_t fi;
+            amdsmi_get_gpu_fabric_info(h, &fi);
+            (void)fi.fabric_info.fabric_version.v1.ppod_id;
+            (void)fi.fabric_info.fabric_version.v1.vpod_id;
+            return 0;
+          }" AMDSMI_HAS_FABRIC)
+        cmake_pop_check_state()
+
+        if(AMDSMI_HAS_FABRIC)
+          message(STATUS "- AMD-SMI fabric API found; using AMD-SMI for pod membership queries")
+          set(AMD_SMI_FOUND 1)
+        else()
+          message(STATUS "- AMD-SMI fabric API not found")
+          message(WARNING "Set TB_FORCE_SINGLE_POD=1 at runtime to override pod membership")
+        endif()
+      else()
+        if(NOT AMD_SMI_INCLUDE_DIR)
+          message(STATUS "- amd_smi/amdsmi.h not found under ${ROCM_PATH}/include")
+        endif()
+        if(NOT AMD_SMI_LIBRARY)
+          message(STATUS "- libamd_smi not found under ${ROCM_PATH}/lib or ${ROCM_PATH}/lib64")
+        endif()
+        message(STATUS "- AMD-SMI not available")
+        message(WARNING "Set TB_FORCE_SINGLE_POD=1 at runtime to override pod membership")
+      endif()
+    endif()
+  else()
+    message(STATUS "- HIP fabric API not found; disabling pod communication support")
+  endif()
+endif()
+
+set(PACKAGE_NAME TB)
+set(LIBRARY_NAME TransferBench)
+
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}")
 add_executable(TransferBench src/client/Client.cpp)
-target_include_directories(TransferBench PRIVATE src/header)
-target_include_directories(TransferBench PRIVATE src/client)
-target_include_directories(TransferBench PRIVATE src/client/Presets)
-target_include_directories(TransferBench PRIVATE ${NUMA_INCLUDE_DIR})
-target_include_directories(TransferBench PRIVATE ${HSA_INCLUDE_DIR})
+target_include_directories(TransferBench PRIVATE
+  src/header
+  src/client
+ src/client/Presets) + if(IBVERBS_FOUND) target_include_directories(TransferBench PRIVATE ${IBVERBS_INCLUDE_DIR}) target_link_libraries(TransferBench PRIVATE ${IBVERBS_LIBRARY}) @@ -234,11 +443,8 @@ if(IBVERBS_FOUND) endif() if(MPI_COMM_FOUND) if(TARGET MPI::MPI_CXX) - # Found via find_package - target_include_directories(TransferBench PRIVATE ${MPI_CXX_INCLUDE_DIRS}) target_link_libraries(TransferBench PRIVATE MPI::MPI_CXX) else() - # Found via MPI_PATH fallback target_include_directories(TransferBench PRIVATE ${MPI_INCLUDE_DIR}) target_link_libraries(TransferBench PRIVATE ${MPI_LIBRARY}) endif() @@ -247,18 +453,28 @@ endif() if(DMABUF_SUPPORT_FOUND) target_compile_definitions(TransferBench PRIVATE HAVE_DMABUF_SUPPORT) endif() -if (HAVE_PARALLEL_JOBS) - target_compile_options(TransferBench PRIVATE -parallel-jobs=12) +if(AMD_SMI_FOUND) + target_include_directories(TransferBench PRIVATE ${AMD_SMI_INCLUDE_DIR}) + target_link_libraries(TransferBench PRIVATE ${AMD_SMI_LIBRARY}) + target_compile_definitions(TransferBench PRIVATE AMD_SMI_ENABLED) +endif() +if(POD_COMM_FOUND) + target_compile_definitions(TransferBench PRIVATE POD_COMM_ENABLED) endif() +check_cxx_compiler_flag(-parallel-jobs=12 HAVE_PARALLEL_JOBS) +if(HAVE_PARALLEL_JOBS) + message(STATUS "Enabling parallel compile jobs: -parallel-jobs=12") + target_compile_options(TransferBench PRIVATE -parallel-jobs=12) +else() + message(STATUS "Compiler does not support -parallel-jobs=12 (or the check failed); skipping -parallel-jobs optimisation") +endif() -target_link_libraries(TransferBench PRIVATE -fgpu-rdc) # Required when linking relocatable device code +target_link_options(TransferBench PRIVATE -fgpu-rdc) target_link_libraries(TransferBench PRIVATE Threads::Threads) -target_link_libraries(TransferBench INTERFACE hip::host) -target_link_libraries(TransferBench PRIVATE hip::device) -target_link_libraries(TransferBench PRIVATE dl) -target_link_libraries(TransferBench PRIVATE ${NUMA_LIBRARY}) -target_link_libraries(TransferBench PRIVATE ${HSA_LIBRARY}) +target_link_libraries(TransferBench PRIVATE hip::host hip::device dl) +target_link_libraries(TransferBench PRIVATE hsa-runtime64) +target_link_libraries(TransferBench PRIVATE numa) # gcc <9 ships std::filesystem in a separate library (libstdc++fs). # Required on AlmaLinux 8 / manylinux_2_28; harmless no-op stub on newer toolchains. 
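[Aside for reviewers: a minimal sketch of how the reworked CMake build above would be exercised end to end. The option names all come from this patch and default to OFF; the ROCm path and GPU target below are illustrative assumptions, DMA-BUF additionally requires the NIC executor (it is probed only when IBVERBS is found), and ENABLE_AMD_SMI only takes effect together with ENABLE_POD_COMM=ON.]

    # Hypothetical opt-in configure; adjust ROCM_PATH / GPU_TARGETS for your system.
    cmake -S . -B build \
          -DROCM_PATH=/opt/rocm \
          -DENABLE_NIC_EXEC=ON -DENABLE_DMA_BUF=ON \
          -DENABLE_MPI_COMM=ON \
          -DENABLE_POD_COMM=ON -DENABLE_AMD_SMI=ON \
          -DGPU_TARGETS="gfx942"
    cmake --build build
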
diff --git a/Makefile b/Makefile index 4bc3cdab..71562cc2 100644 --- a/Makefile +++ b/Makefile @@ -6,14 +6,18 @@ ROCM_PATH ?= /opt/rocm CUDA_PATH ?= /usr/local/cuda MPI_PATH ?= /usr/local/openmpi +HIPCC ?= $(ROCM_PATH)/bin/amdclang++ +NVCC ?= $(CUDA_PATH)/bin/nvcc +DEBUG ?= 0 # Optional features (set to 0 to disable, 1 to enable) -# DISABLE_NIC_EXEC: Disable RDMA/NIC executor support (default: 0) -# DISABLE_MPI_COMM: Disable MPI communicator support (default: 0) -# DISABLE_DMABUF: Disable DMA-BUF support for GPU Direct RDMA (default: 1) - -HIPCC ?= $(ROCM_PATH)/bin/amdclang++ -NVCC ?= $(CUDA_PATH)/bin/nvcc +# DISABLE_NIC_EXEC: Disable RDMA/NIC executor support (default: 0) +# DISABLE_MPI_COMM: Disable MPI communicator support (default: 0) +# DISABLE_DMA_BUF: Disable DMA-BUF support for GPU Direct RDMA (default: 1) +# DISABLE_AMD_SMI: Disable AMD-SMI pod membership checking support (default: 0) +# DISABLE_NVML: Disable NVML pod membership detection for CUDA builds (default: 0) +# DISABLE_POD_COMM: Disable pod communication support (default: 0) +# DISABLE_CUMEM: Disable CUDA driver API (also disables pod on CUDA) (default: 0) # ROCm device libraries can live in different locations depending on packaging. # hipcc/clang needs to find the amdgcn bitcode directory at link time. @@ -32,11 +36,11 @@ SINGLE_KERNEL ?= 0 GPU_TARGETS ?= native EXE=TransferBench -DEBUG ?= 0 # Only perform this check if 'make clean' is not the target ifeq ($(filter clean,$(MAKECMDGOALS)),) ifeq ($(MAKECMDGOALS),TransferBenchCuda) + $(info Building TransferBenchCuda) # Check for nvcc ifneq ($(shell test -e $(NVCC) && echo found), found) $(error "Could not find $(NVCC). Please set CUDA_PATH appropriately") @@ -48,15 +52,21 @@ ifeq ($(filter clean,$(MAKECMDGOALS)),) # Check for HIP compiler ifeq ("$(shell test -e $(HIPCC) && echo found)", "found") CXX=$(HIPCC) - else ifeq ("$(shell test -e $(ROCM_PATH)/bin/hipcc && echo found)", "found") - CXX=$(ROCM_PATH)/bin/hipcc - $(warning "Could not find $(HIPCC). Using fallback to $(CXX)") else - $(error "Could not find $(HIPCC) or $(ROCM_PATH)/bin/hipcc. Check if the path is correct if you want to build $(EXE)") + ifeq ("$(shell test -e $(ROCM_PATH)/llvm/bin/amdclang++ && echo found)", "found") + CXX=$(ROCM_PATH)/llvm/bin/amdclang++ + else ifeq ("$(shell test -e $(ROCM_PATH)/llvm/bin/clang++ && echo found)", "found") + CXX=$(ROCM_PATH)/llvm/bin/clang++ + else ifeq ("$(shell test -e $(ROCM_PATH)/bin/hipcc && echo found)", "found") + CXX=$(ROCM_PATH)/bin/hipcc + else + $(error "Could not find a HIP compiler. Tried: $(HIPCC), $(ROCM_PATH)/llvm/bin/amdclang++, $(ROCM_PATH)/llvm/bin/clang++, $(ROCM_PATH)/bin/hipcc. Check if ROCM_PATH is correct") + endif + $(info "Could not find $(HIPCC). Using fallback to $(CXX)") endif GPU_TARGETS_FLAGS = $(foreach target,$(GPU_TARGETS),"--offload-arch=$(target)") - - CXXFLAGS = -I$(ROCM_PATH)/include -I$(ROCM_PATH)/include/hip -I$(ROCM_PATH)/include/hsa + $(info Compiling for $(GPU_TARGETS) architecture(s). Can modify this by setting GPU_TARGETS) + CXXFLAGS = -I. 
-I$(ROCM_PATH)/include -I$(ROCM_PATH)/include/hip -I$(ROCM_PATH)/include/hsa
     HIPLDFLAGS= -lnuma -L$(ROCM_PATH)/lib -lhsa-runtime64 -lamdhip64
     HIPFLAGS  = -Wall -x hip -D__HIP_PLATFORM_AMD__ -D__HIPCC__ $(GPU_TARGETS_FLAGS)
     ifneq ($(strip $(ROCM_DEVICE_LIB_PATH)),)
@@ -84,18 +94,19 @@ ifeq ($(filter clean,$(MAKECMDGOALS)),)
   #  3) infiniband/verbs.h is found in the default include path
   DISABLE_NIC_EXEC ?= 0
   ifneq ($(DISABLE_NIC_EXEC),1)
+    $(info Attempting to build with NIC executor support)
     ifeq ("$(shell ldconfig -p | grep -c ibverbs)", "0")
-      $(info lib IBVerbs not found)
+      $(info - ibverbs library not found)
     else ifeq ("$(shell echo '#include <infiniband/verbs.h>' | $(CXX) -E - 2>/dev/null | grep -c 'infiniband/verbs.h')", "0")
-      $(info infiniband/verbs.h not found)
+      $(info - infiniband/verbs.h not found)
     else
       COMMON_FLAGS += -DNIC_EXEC_ENABLED
       LDFLAGS += -libverbs
       NIC_ENABLED = 1
-      # Disable DMA-BUF support by default (set DISABLE_DMABUF=0 to enable)
-      DISABLE_DMABUF ?= 1
-      ifeq ($(DISABLE_DMABUF), 0)
+      # Disable DMA-BUF support by default (set DISABLE_DMA_BUF=0 to enable)
+      DISABLE_DMA_BUF ?= 1
+      ifeq ($(DISABLE_DMA_BUF), 0)
        # Check for both ibv_reg_dmabuf_mr and ROCm DMA-BUF export support
        HAVE_IBV_DMABUF := $(shell echo '#include <infiniband/verbs.h>' | $(CXX) -E - 2>/dev/null | grep -c 'ibv_reg_dmabuf_mr')
        HAVE_ROCM_DMABUF := $(shell echo '#include <hsa/hsa_ext_amd.h>' | $(CXX) -I$(ROCM_PATH)/include -E - 2>/dev/null | grep -c 'hsa_amd_portable_export_dmabuf')
@@ -111,14 +122,14 @@ ifeq ($(filter clean,$(MAKECMDGOALS)),)
          $(info Building with DMA-BUF support)
        endif
       else
-        $(info Building with DMA-BUF support disabled (DISABLE_DMABUF=1))
+        $(info Building with DMA-BUF support disabled (DISABLE_DMA_BUF=1))
       endif
     endif
     ifeq ($(NIC_ENABLED), 0)
-      $(info Building without NIC executor support)
-      $(info To use the TransferBench RDMA executor, check if your system has NICs, the NIC drivers are installed, and libibverbs-dev is installed)
+      $(info - Building without NIC executor support)
+      $(info - To use the TransferBench RDMA executor, check if your system has NICs, the NIC drivers are installed, and libibverbs-dev is installed)
     else
-      $(info Building with NIC executor support. Can set DISABLE_NIC_EXEC=1 to disable)
+      $(info - Building with NIC executor support. Can set DISABLE_NIC_EXEC=1 to disable)
     endif
   endif
 
@@ -128,30 +139,167 @@ ifeq ($(filter clean,$(MAKECMDGOALS)),)
   #  2) mpi.h is found in the MPI_PATH
   DISABLE_MPI_COMM ?= 0
   ifneq ($(DISABLE_MPI_COMM), 1)
+    $(info Attempting to build with MPI communicator support)
     ifeq ($(wildcard $(MPI_PATH)/include/mpi.h),)
-      $(info Unable to find mpi.h at $(MPI_PATH)/include. Please specify appropriate MPI_PATH)
+      $(info - Unable to find mpi.h at $(MPI_PATH)/include. Please specify appropriate MPI_PATH)
     else
       MPI_ENABLED = 1
       COMMON_FLAGS += -DMPI_COMM_ENABLED -I$(MPI_PATH)/include
-      LDFLAGS += -L/$(MPI_PATH)/lib -lmpi
-      ifeq ($(DEBUG), 1)
-        LDFLAGS += -lmpi_cxx
-      endif
+      LDFLAGS += -L$(MPI_PATH)/lib -L$(MPI_PATH)/lib64 -lmpi
     endif
     ifeq ($(MPI_ENABLED), 0)
-      $(info Building without MPI communicator support)
-      $(info To use TransferBench with MPI support, install MPI libraries and specify appropriate MPI_PATH)
+      $(info - Building without MPI communicator support)
+      $(info - To use TransferBench with MPI support, install MPI libraries and specify appropriate MPI_PATH)
    else
-      $(info Building with MPI communicator support. Can set DISABLE_MPI_COMM=1 to disable)
+      $(info - Building with MPI communicator support.
Can set DISABLE_MPI_COMM=1 to disable) endif endif -endif + NVML_ENABLED = 0 + # Enable NVML support for pod membership detection on NVIDIA platforms + # Compile with NVML support if + # 1) DISABLE_NVML is not set to 1 + # 2) Building TransferBenchCuda + # 3) nvml.h is found under CUDA_PATH + DISABLE_NVML ?= 0 + ifneq ($(DISABLE_NVML), 1) + ifeq ($(MAKECMDGOALS),TransferBenchCuda) + $(info Attempting to build with NVML support) + ifneq ($(wildcard $(CUDA_PATH)/include/nvml.h),) + COMMON_FLAGS += -DNVML_ENABLED + LDFLAGS += -lnvidia-ml + NVML_ENABLED = 1 + $(info - Building with NVML support for pod membership detection) + else + $(info - nvml.h not found at $(CUDA_PATH)/include. Building without NVML support) + $(info - Pod membership may be forced by setting TB_FORCE_SINGLE_POD=1) + endif + endif + endif + + # TransferBenchCuda: CUDA driver API (libcuda). Independent of POD, but POD on CUDA requires CUMEM. + DISABLE_CUMEM ?= 0 + ifeq ($(MAKECMDGOALS),TransferBenchCuda) + ifneq ($(DISABLE_CUMEM),1) + $(info - Building with CUMEM_ENABLED (CUDA driver API, -lcuda)) + COMMON_FLAGS += -DCUMEM_ENABLED + LDFLAGS += -lcuda + else + $(info - CUDA driver API disabled (DISABLE_CUMEM=1); POD comm unavailable on CUDA) + endif + endif + + POD_ENABLED = 0 + AMD_SMI_ENABLED = 0 + # Compile with pod support if + # 1) DISABLE_POD_COMM is not set to 1 + # 2) For HIP: a small probe program that uses hipMemFabricHandle_t, + # hipMemExportToShareableHandle, and hipMemImportFromShareableHandle + # compiles and links successfully against amdhip64 + # For CUDA: CUDA Version >= 12.2 + DISABLE_POD_COMM ?= 0 + DISABLE_AMD_SMI ?= 0 + ifneq ($(DISABLE_POD_COMM), 1) + $(info Attempting to build with pod communication support) + ifeq ($(MAKECMDGOALS),TransferBenchCuda) + # Check for appropriate CUDA support for MNNVL + CUDA_MIN_MAJOR := 12 + CUDA_MIN_MINOR := 2 + + CUDA_VERSION_STR := $(shell $(NVCC) --version | grep release | sed -E 's/.*release ([0-9]+)\.([0-9]+).*/\1 \2/') + CUDA_MAJOR := $(word 1,$(CUDA_VERSION_STR)) + CUDA_MINOR := $(word 2,$(CUDA_VERSION_STR)) + + CUDA_VERSION_OK := $(shell \ + if [ $(CUDA_MAJOR) -gt $(CUDA_MIN_MAJOR) ] || \ + [ $(CUDA_MAJOR) -eq $(CUDA_MIN_MAJOR) -a $(CUDA_MINOR) -ge $(CUDA_MIN_MINOR) ]; then \ + echo yes; \ + else \ + echo no; \ + fi) + + ifeq ($(CUDA_VERSION_OK),yes) + $(info - Detected CUDA version $(CUDA_MAJOR).$(CUDA_MINOR) which has MNNVL support) + ifeq ($(DISABLE_CUMEM),1) + $(info - Pod communication skipped on CUDA: requires CUMEM_ENABLED (DISABLE_CUMEM=1)) + else + COMMON_FLAGS += -DPOD_COMM_ENABLED + POD_ENABLED = 1 + endif + else + $(info - Detected CUDA version $(CUDA_MAJOR).$(CUDA_MINOR) which does not have MNNVL support) + $(info - Pod support will require CUDA version of at least $(CUDA_MIN_MAJOR).$(CUDA_MIN_MINOR)) + endif + else + # Check for the HIP fabric API functions used by TransferBench at runtime. 
+      HIP_HAS_FABRIC := $(shell \
+        printf '%s\n' \
+          '#include <hip/hip_runtime.h>' \
+          'int main() {' \
+          '  hipMemFabricHandle_t fabricHandle = {};' \
+          '  hipMemGenericAllocationHandle_t allocationHandle = {};' \
+          '  hipMemExportToShareableHandle(&fabricHandle, allocationHandle, hipMemHandleTypeFabric, 0);' \
+          '  hipMemImportFromShareableHandle(&allocationHandle, &fabricHandle, hipMemHandleTypeFabric);' \
+          '  return 0;' \
+          '}' | \
+        $(CXX) -I$(ROCM_PATH)/include -D__HIP_PLATFORM_AMD__ -x c++ - \
+          -L$(ROCM_PATH)/lib -L$(ROCM_PATH)/lib64 -lamdhip64 -o /dev/null 2>/dev/null && echo yes || echo no)
+
+      ifeq ($(HIP_HAS_FABRIC),yes)
+        $(info - HIP fabric API found; enabling pod communication support)
+        COMMON_FLAGS += -DPOD_COMM_ENABLED
+        POD_ENABLED = 1
+        ifeq ($(DISABLE_AMD_SMI), 1)
+          $(info - AMD-SMI disabled via DISABLE_AMD_SMI=1; set TB_FORCE_SINGLE_POD=1 at runtime to override pod membership)
+        else
+          # Prefer AMD-SMI for pod membership queries; fall back to TB_FORCE_SINGLE_POD=1 at runtime.
+          AMD_SMI_HEADER := $(ROCM_PATH)/include/amd_smi/amdsmi.h
+          AMD_SMI_LIB    := $(firstword $(wildcard $(ROCM_PATH)/lib/libamd_smi.so $(ROCM_PATH)/lib64/libamd_smi.so))
+          ifneq ($(wildcard $(AMD_SMI_HEADER)),)
+            ifneq ($(AMD_SMI_LIB),)
+              # Check for the AMD-SMI functions used by TransferBench at runtime.
+              AMDSMI_HAS_FABRIC := $(shell \
+                printf '%s\n' \
+                  '#include <amd_smi/amdsmi.h>' \
+                  'int main() {' \
+                  '  amdsmi_bdf_t bdf = {};' \
+                  '  amdsmi_processor_handle h;' \
+                  '  amdsmi_get_processor_handle_from_bdf(bdf, &h);' \
+                  '  amdsmi_fabric_info_t fi;' \
+                  '  amdsmi_get_gpu_fabric_info(h, &fi);' \
+                  '  (void)fi.fabric_info.fabric_version.v1.ppod_id;' \
+                  '  (void)fi.fabric_info.fabric_version.v1.vpod_id;' \
+                  '  return 0;' \
+                  '}' | \
+                $(CXX) -I$(ROCM_PATH)/include -x c++ - \
+                  -L$(dir $(AMD_SMI_LIB)) -lamd_smi -o /dev/null 2>/dev/null && echo yes || echo no)
+
+              ifeq ($(AMDSMI_HAS_FABRIC),yes)
+                $(info - AMD-SMI fabric API found; using AMD-SMI for pod membership queries)
+                COMMON_FLAGS += -DAMD_SMI_ENABLED
+                LDFLAGS += -L$(dir $(AMD_SMI_LIB)) -lamd_smi
+                AMD_SMI_ENABLED = 1
+              else
+                $(info - AMD-SMI fabric API not found; set TB_FORCE_SINGLE_POD=1 at runtime to override pod membership)
+              endif
+            else
+              $(info - libamd_smi not found under $(ROCM_PATH)/lib or $(ROCM_PATH)/lib64; set TB_FORCE_SINGLE_POD=1 at runtime to override pod membership)
+            endif
+          else
+            $(info - amd_smi/amdsmi.h not found under $(ROCM_PATH)/include; set TB_FORCE_SINGLE_POD=1 at runtime to override pod membership)
+          endif
+        endif
+      else
+        $(info - HIP fabric API not found; disabling pod communication support)
+      endif
+    endif
+  endif
+endif

.PHONY : all clean

-all: $(EXE)
+all: TransferBench

TransferBench: ./src/client/Client.cpp $(shell find -regex ".*\.\hpp")
	$(CXX) $(CXXFLAGS) $(HIPFLAGS) $(COMMON_FLAGS) $< -o $@ $(HIPLDFLAGS) $(LDFLAGS)

diff --git a/examples/example.cfg b/examples/example.cfg
index 57d4ae03..14df7e3a 100644
--- a/examples/example.cfg
+++ b/examples/example.cfg
@@ -8,12 +8,13 @@
 #   SRC 1 -> Executor -> DST 1
 #   SRC X             DST Y
 
-# Three Executors are supported by TransferBench
+# Five Executors are supported by TransferBench
 #   Executor:      SubExecutor:
 #   1) CPU         CPU thread
 #   2) GPU         GPU threadblock/Compute Unit (CU)
-#   3) DMA         N/A. (May only be used for copies (single SRC/DST)
+#   3) DMA         N/A. (Must have single SRC, at least one DST)
 #   4) NIC         Queue Pair
+#   5) Batched-DMA Batch item (Must have single SRC, at least one DST)
 
 # Each single line in the configuration file defines a set of Transfers (a Test) to run in parallel
 
@@ -38,6 +39,7 @@
 #   - C: CPU-executed    (Indexed from 0 to # NUMA nodes - 1)
 #   - G: GPU-executed    (Indexed from 0 to # GPUs - 1)
 #   - D: DMA-executor    (Indexed from 0 to # GPUs - 1)
+#   - B: Batched-DMA-executor (Indexed from 0 to # GPUs - 1)
 #   - I#.#: NIC executor (Indexed from 0 to # NICs - 1)
 #   - N#.#: Nearest NIC executor (Indexed from 0 to # GPUs - 1)
 # dstMemL : Destination memory locations (Where the data is to be written to)

diff --git a/src/client/Client.cpp b/src/client/Client.cpp
index 26433500..81cc951b 100644
--- a/src/client/Client.cpp
+++ b/src/client/Client.cpp
@@ -43,7 +43,6 @@ int main(int argc, char **argv)
     if (!ev.outputToCsv) {
       DisplayVersion();
       DisplayUsage(argv[0]);
-      DisplayPresets();
     }
     DisplayTopology(ev.outputToCsv, ev.showBorders);
   }
@@ -258,14 +257,26 @@ void DisplayUsage(char const* cmdName)
   Print("Usage: %s config <N>\n", cmdName);
   Print(" config: Either:\n");
-  Print("         - Filename of configFile containing Transfers to execute (see example.cfg for format)\n");
-  Print("         - Name of preset config:\n");
+  Print("         - Filename of config file containing Transfers to execute\n");
+  Print("         - Name of preset config\n");
+  Print("         - 'cmdline' followed by one transfer expression\n");
+  Print("         - 'dryrun' followed by one transfer expression (prints parsed transfers only)\n");
   Print(" N     : (Optional) Number of bytes to copy per Transfer.\n");
-  Print("         If not specified, defaults to %lu bytes. Must be a multiple of 4 bytes\n",
-        DEFAULT_BYTES_PER_TRANSFER);
+  Print("         If not specified, defaults to %lu. Must be a multiple of 4 bytes\n", DEFAULT_BYTES_PER_TRANSFER);
   Print("         If 0 is specified, a range of Ns will be benchmarked\n");
   Print("         May append a suffix ('K', 'M', 'G') for kilobytes / megabytes / gigabytes\n");
   Print("\n");
-
-  EnvVars::DisplayUsage();
+  Print("- Use \"%s help\" for more information about how to create config files / describe Transfers\n", cmdName);
+  Print("- Use \"%s envvars\" for more information about environment variables that customize behavior\n", cmdName);
+  Print("- Use \"%s presets\" to display the list of available presets\n", cmdName);
+  Print("\n");
+  Print("For multi-rank usage, TransferBench must either be compiled with MPI support or rely on sockets\n");
+  Print("It is recommended to only run one process per node\n");
+  Print(" - MPI approach:\n");
+  Print("     Node 0> mpirun -np 4 -host node0,node1,node2,node3 ./TransferBench a2a\n");
+  Print(" - Socket approach:\n");
+  Print("     Node 0> TB_NUM_RANKS=4 [TB_RANK=0] [TB_MASTER_ADDR=<addr>] ./TransferBench a2a  # Displays connect info for other ranks\n");
+  Print("     Node 1> TB_NUM_RANKS=4 TB_RANK=1 TB_MASTER_ADDR=<addr> ./TransferBench a2a\n");
+  Print("     Node 2> TB_NUM_RANKS=4 TB_RANK=2 TB_MASTER_ADDR=<addr> ./TransferBench a2a\n");
+  Print("     Node 3> TB_NUM_RANKS=4 TB_RANK=3 TB_MASTER_ADDR=<addr> ./TransferBench a2a\n");
 };

diff --git a/src/client/EnvVars.hpp b/src/client/EnvVars.hpp
index c77d2bea..97fd3ea2 100644
--- a/src/client/EnvVars.hpp
+++ b/src/client/EnvVars.hpp
@@ -35,12 +35,15 @@ THE SOFTWARE.
} while (0) #include +#include +#include #include +#include #include #include #include -#define CLIENT_VERSION "02" +#define CLIENT_VERSION "00" #include "TransferBench.hpp" using namespace TransferBench; @@ -87,18 +90,19 @@ class EnvVars int useHsaDma; // Use hsa_amd_async_copy instead of hipMemcpy for non-targetted DMA executions // GFX options + vector cuMask; // Bit-vector representing the CU mask int gfxBlockOrder; // How threadblocks for multiple Transfers are ordered 0=sequential 1=interleaved int gfxBlockSize; // Size of each threadblock (must be multiple of 64) - vector cuMask; // Bit-vector representing the CU mask - vector> prefXccTable; // Specifies XCC to use for given exe->dst pair + int gfxKernel; // GFX Kernel to use (-1=auto, 0=reduce, 1=copy-only) int gfxSeType; // GFX subexecutor type (0=threadblock, 1=warp) + int gfxSingleTeam; // Team all subExecutors across the data array int gfxTemporal; // Non-temporal load/store mode (0=none, 1=load, 2=store, 3=both) int gfxUnroll; // GFX-kernel unroll factor - int useHipEvents; // Use HIP events for timing GFX/DMA Executor - int useSingleStream; // Use a single stream per GPU GFX executor instead of stream per Transfer - int gfxSingleTeam; // Team all subExecutors across the data array int gfxWaveOrder; // GFX-kernel wavefront ordering int gfxWordSize; // GFX-kernel packed data size (4=DWORDx4, 2=DWORDx2, 1=DWORDx1) + vector> prefXccTable; // Specifies XCC to use for given exe->dst pair + int useHipEvents; // Use HIP events for timing GFX/DMA Executor + int useSingleStream; // Use a single stream per GPU GFX executor instead of stream per Transfer // Client options int hideEnv; // Skip printing environment variable @@ -106,12 +110,14 @@ class EnvVars int maxNumVarSubExec; // Maximum # of subexecutors to use for variable subExec Transfers (0 to use device limit) int outputToCsv; // Output in CSV format int samplingFactor; // Affects how many different values of N are generated (when N set to 0) + std::vector showPercentiles; // Iteration-duration percentiles to print // NIC options int ibGidIndex; // GID Index for RoCE NICs uint8_t ibPort; // NIC port number to be used int ipAddressFamily; // IP Address Famliy int nicChunkBytes; // Number of bytes to send per chunk for RDMA operations + int nicCqPollBatch; // Number of CQ entries to poll per ibv_poll_cq call int nicRelaxedOrder; // Use relaxed ordering for RDMA int roceVersion; // RoCE version number @@ -146,8 +152,9 @@ class EnvVars fillCompress = GetEnvVarArray("FILL_COMPRESS" , {}); gfxBlockOrder = GetEnvVar("GFX_BLOCK_ORDER" , 0); gfxBlockSize = GetEnvVar("GFX_BLOCK_SIZE" , 256); + gfxKernel = GetEnvVar("GFX_KERNEL" , 0); gfxSeType = GetEnvVar("GFX_SE_TYPE" , 0); - gfxSingleTeam = GetEnvVar("GFX_SINGLE_TEAM" , 1); + gfxSingleTeam = GetEnvVar("GFX_SINGLE_TEAM" , 0); gfxTemporal = GetEnvVar("GFX_TEMPORAL" , 0); gfxUnroll = GetEnvVar("GFX_UNROLL" , defaultGfxUnroll); gfxWaveOrder = GetEnvVar("GFX_WAVE_ORDER" , 0); @@ -162,6 +169,7 @@ class EnvVars samplingFactor = GetEnvVar("SAMPLING_FACTOR" , 1); showBorders = GetEnvVar("SHOW_BORDERS" , 1); showIterations = GetEnvVar("SHOW_ITERATIONS" , 0); + showPercentiles = GetEnvVarArray("SHOW_PERCENTILES", {}); useHipEvents = GetEnvVar("USE_HIP_EVENTS" , 1); useHsaDma = GetEnvVar("USE_HSA_DMA" , 0); useInteractive = GetEnvVar("USE_INTERACTIVE" , 0); @@ -174,6 +182,7 @@ class EnvVars roceVersion = GetEnvVar("ROCE_VERSION" , 2); ipAddressFamily = GetEnvVar("IP_ADDRESS_FAMILY" , 4); nicChunkBytes = GetEnvVar("NIC_CHUNK_BYTES" , 1073741824); + 
nicCqPollBatch = GetEnvVar("NIC_CQ_POLL_BATCH" , 4); nicRelaxedOrder = GetEnvVar("NIC_RELAX_ORDER" , 1); gpuMaxHwQueues = GetEnvVar("GPU_MAX_HW_QUEUES" , 4); @@ -234,14 +243,15 @@ class EnvVars // Check for CU mask int numXccs = TransferBench::GetNumExecutorSubIndices({EXE_GPU_GFX, 0}); cuMask.clear(); - char* cuMaskStr = getenv("CU_MASK"); - if (cuMaskStr != NULL) { + char const* cuMaskRaw = getenv("CU_MASK"); + if (cuMaskRaw != NULL) { #if defined(__NVCC__) printf("[WARN] CU_MASK is not supported in CUDA\n"); #else std::vector> ranges; int maxCU = 0; - char* token = strtok(cuMaskStr, ","); + std::string cuMaskCopy(cuMaskRaw); + char* token = cuMaskCopy.empty() ? NULL : strtok(&cuMaskCopy[0], ","); while (token) { int start, end; if (sscanf(token, "%d-%d", &start, &end) == 2) { @@ -269,14 +279,25 @@ class EnvVars #endif } + // Check that percentiles are valid + std::sort(showPercentiles.begin(), showPercentiles.end()); + showPercentiles.erase(std::unique(showPercentiles.begin(), showPercentiles.end()), showPercentiles.end()); + for (int v : showPercentiles) { + if (v < 1 || v > 99) { + printf("[ERROR] SHOW_PERCENTILES: value %d out of range (allowed 1..99)\n", v); + exit(1); + } + } + // Parse preferred XCC table (if provided) - char* prefXccStr = getenv("XCC_PREF_TABLE"); - if (prefXccStr) { + char const* prefXccRaw = getenv("XCC_PREF_TABLE"); + if (prefXccRaw) { prefXccTable.resize(numDetectedGpus); for (int i = 0; i < numDetectedGpus; i++){ prefXccTable[i].resize(numDetectedGpus, -1); } - char* token = strtok(prefXccStr, ","); + std::string prefXccCopy(prefXccRaw); + char* token = prefXccCopy.empty() ? NULL : strtok(&prefXccCopy[0], ","); int tokenCount = 0; while (token) { int xccId; @@ -312,55 +333,73 @@ class EnvVars } // Display info on the env vars that can be used - static void DisplayUsage() + static void DisplayEnvVarsList() { - printf("Environment variables:\n"); + printf("Environment variables (client):\n"); printf("======================\n"); - printf(" ALWAYS_VALIDATE - Validate after each iteration instead of once after all iterations\n"); - printf(" BLOCK_BYTES - Controls granularity of how work is divided across subExecutors\n"); - printf(" BYTE_OFFSET - Initial byte-offset for memory allocations. Must be multiple of 4\n"); - printf(" CU_MASK - CU mask for streams. Can specify ranges e.g '5,10-12,14'\n"); - printf(" FILL_COMPRESS - Percentages of 64B lines to be filled by random/1B0/2B0/4B0/32B0\n"); - printf(" FILL_PATTERN - Big-endian pattern for source data, specified in hex digits. Must be even # of digits\n"); - printf(" GFX_BLOCK_ORDER - How blocks for transfers are ordered. 
0=sequential, 1=interleaved\n"); - printf(" GFX_BLOCK_SIZE - # of threads per threadblock (Must be multiple of 64)\n"); - printf(" GFX_SE_TYPE - SubExecutor granularity type (0=threadblock, 1=warp)\n"); - printf(" GFX_TEMPORAL - Use of non-temporal loads or stores (0=none 1=loads 2=stores 3=both)\n"); - printf(" GFX_UNROLL - Unroll factor for GFX kernel (0=auto), must be less than %d\n", TransferBench::GetIntAttribute(ATR_GFX_MAX_UNROLL)); - printf(" GFX_SINGLE_TEAM - Have subexecutors work together on full array instead of working on disjoint subarrays\n"); - printf(" GFX_WAVE_ORDER - Stride pattern for GFX kernel (0=UWC,1=UCW,2=WUC,3=WCU,4=CUW,5=CWU)\n"); - printf(" GFX_WORD_SIZE - GFX kernel packed data size (4=DWORDx4, 2=DWORDx2, 1=DWORDx1)\n"); - printf(" HIDE_ENV - Hide environment variable value listing\n"); + printf(" ALWAYS_VALIDATE - Validate after each iteration instead of once after all iterations\n"); + printf(" BLOCK_BYTES - Controls granularity of how work is divided across subExecutors\n"); + printf(" BYTE_OFFSET - Initial byte-offset for memory allocations. Must be multiple of 4\n"); + printf(" CU_MASK - CU mask for streams. Can specify ranges e.g '5,10-12,14'\n"); + printf(" FILL_COMPRESS - Percentages of 64B lines to be filled by random/1B0/2B0/4B0/32B0\n"); + printf(" FILL_PATTERN - Big-endian pattern for source data, specified in hex digits. Must be even # of digits\n"); + printf(" GFX_BLOCK_ORDER - How blocks for transfers are ordered. 0=sequential, 1=interleaved\n"); + printf(" GFX_BLOCK_SIZE - # of threads per threadblock (Must be multiple of 64)\n"); + printf(" GFX_KERNEL - -1=auto, 0=force GpuReduceKernel, 1=force GpuCopyKernel (may error if ineligible)\n"); + printf(" GFX_SE_TYPE - SubExecutor granularity type (0=threadblock, 1=warp)\n"); + printf(" GFX_TEMPORAL - Use of non-temporal loads or stores (0=none 1=loads 2=stores 3=both)\n"); + printf(" GFX_UNROLL - Unroll factor for GFX kernel\n"); + printf(" GFX_SINGLE_TEAM - Have subexecutors work together on full array instead of working on disjoint subarrays\n"); + printf(" GFX_WAVE_ORDER - Stride pattern for GFX kernel (0=UWC,1=UCW,2=WUC,3=WCU,4=CUW,5=CWU)\n"); + printf(" GFX_WORD_SIZE - GFX kernel packed data size (4=DWORDx4, 2=DWORDx2, 1=DWORDx1)\n"); + printf(" HIDE_ENV - Hide environment variable value listing\n"); #if NIC_EXEC_ENABLED - printf(" IB_GID_INDEX - Required for RoCE NICs (default=-1/auto)\n"); - printf(" IB_PORT_NUMBER - RDMA port count for RDMA NIC (default=1)\n"); - printf(" IP_ADDRESS_FAMILY - IP address family (4=v4, 6=v6, default=v4)\n"); + printf(" IB_GID_INDEX - Required for RoCE NICs (default=-1/auto)\n"); + printf(" IB_PORT_NUMBER - RDMA port count for RDMA NIC (default=1)\n"); + printf(" IP_ADDRESS_FAMILY - IP address family (4=v4, 6=v6, default=v4)\n"); #endif - printf(" MIN_VAR_SUBEXEC - Minumum # of subexecutors to use for variable subExec Transfers\n"); - printf(" MAX_VAR_SUBEXEC - Maximum # of subexecutors to use for variable subExec Transfers (0 for device limits)\n"); + printf(" MIN_VAR_SUBEXEC - Minimum # of subexecutors to use for variable subExec Transfers\n"); + printf(" MAX_VAR_SUBEXEC - Maximum # of subexecutors to use for variable subExec Transfers (0 for device limits)\n"); #if NIC_EXEC_ENABLED - printf(" NIC_CHUNK_BYTES - Number of bytes to send at a time using NIC (default = 1GB)\n"); - printf(" NIC_RELAX_ORDER - Set to non-zero to use relaxed ordering"); + printf(" NIC_CHUNK_BYTES - Number of bytes to send at a time using NIC (default = 1GB)\n"); + printf(" 
NIC_CQ_POLL_BATCH - Number of CQ entries to poll per ibv_poll_cq call (default = 4)\n"); + printf(" NIC_RELAX_ORDER - Set to non-zero to use relaxed ordering\n"); #endif - printf(" NUM_ITERATIONS - # of timed iterations per test. If negative, run for this many seconds instead\n"); - printf(" NUM_SUBITERATIONS - # of sub-iterations to run per iteration. Must be non-negative\n"); - printf(" NUM_WARMUPS - # of untimed warmup iterations per test\n"); - printf(" OUTPUT_TO_CSV - Outputs to CSV format if set\n"); + printf(" NUM_ITERATIONS - # of timed iterations per test. If negative, run for this many seconds instead\n"); + printf(" NUM_SUBITERATIONS - # of sub-iterations to run per iteration. Must be non-negative\n"); + printf(" NUM_WARMUPS - # of untimed warmup iterations per test\n"); + printf(" OUTPUT_TO_CSV - Outputs to CSV format if set\n"); #if NIC_EXEC_ENABLED - printf(" ROCE_VERSION - RoCE version (default=2)\n"); + printf(" ROCE_VERSION - RoCE version (default=2)\n"); #endif - printf(" SAMPLING_FACTOR - Add this many samples (when possible) between powers of 2 when auto-generating data sizes\n"); - printf(" SHOW_BORDERS - Show ASCII box-drawing characaters in tables\n"); - printf(" SHOW_ITERATIONS - Show per-iteration timing info\n"); - printf(" USE_HIP_EVENTS - Use HIP events for GFX executor timing\n"); - printf(" USE_HSA_DMA - Use hsa_amd_async_copy instead of hipMemcpy for non-targeted DMA execution\n"); - printf(" USE_INTERACTIVE - Pause for user-input before starting transfer loop\n"); - printf(" USE_SINGLE_STREAM - Use a single stream per GPU GFX executor instead of stream per Transfer\n"); - printf(" VALIDATE_DIRECT - Validate GPU destination memory directly instead of staging GPU memory on host\n"); - printf(" VALIDATE_SOURCE - Validate GPU src memory immediately after preparation\n"); + printf(" SAMPLING_FACTOR - Add this many samples (when possible) between powers of 2 when auto-generating data sizes\n"); + printf(" SHOW_BORDERS - Show ASCII box-drawing characters in tables\n"); + printf(" SHOW_ITERATIONS - Show per-iteration timing info\n"); + printf(" SHOW_PERCENTILES - Comma-separated percentiles iteration duration\n"); + printf(" USE_HIP_EVENTS - Use HIP events for GFX executor timing\n"); + printf(" USE_HSA_DMA - Use hsa_amd_async_copy instead of hipMemcpy for non-targeted DMA execution\n"); + printf(" USE_INTERACTIVE - Pause for user-input before starting transfer loop\n"); + printf(" USE_SINGLE_STREAM - Use a single stream per GPU GFX executor instead of stream per Transfer\n"); + printf(" VALIDATE_DIRECT - Validate GPU destination memory directly instead of staging GPU memory on host\n"); + printf(" VALIDATE_SOURCE - Validate GPU src memory immediately after preparation\n"); + printf("\n"); + printf("Environment variables (back-end):\n"); + printf("====================================\n"); + printf(" TB_RANK - Rank for socket communicator (0-based); defaults to 0 if unset or empty\n"); + printf(" TB_NUM_RANKS - Total ranks for socket mode (>=2); alone on rank 0 starts listener and logs worker env\n"); + printf(" TB_MASTER_ADDR - Rank 0 hostname or IPv4 for workers; optional on rank 0 (auto-detected if unset)\n"); + printf(" TB_MASTER_IFACE - When TB_MASTER_ADDR unset on rank 0, optional interface for IPv4 detection (e.g. 
eth0)\n"); + printf(" TB_MASTER_PORT - Used to set Rank 0 port for socket communicator (default: 29500)\n"); + printf(" TB_SINGLE_LOG - In socket mode, only rank 0 logs when set\n"); + printf(" TB_VERBOSE - Enables additional internal logging\n"); + printf(" TB_DUMP_CFG_FILE - Writes executed transfers to a config file\n"); + printf(" TB_DUMP_LINES - Dumps randomized input-line statistics for FILL_COMPRESS setup\n"); + printf(" TB_NIC_FILTER - Regex filter to limit NIC visibility for NIC executors\n"); + printf(" TB_FORCE_SINGLE_POD - Forces all ranks into one pod (skips pod query)\n"); + printf(" TB_WALLCLOCK_RATE - Overrides queried GPU wallclock rate if needed\n"); + printf(" TB_PAUSE - Pauses startup for debugger attachment\n"); } - void Print(std::string const& name, int32_t const value, const char* format, ...) const { printf("%-20s%s%12d%s", name.c_str(), outputToCsv ? "," : " = ", value, outputToCsv ? "," : " : "); @@ -412,6 +451,10 @@ class EnvVars "Thread block ordering: %s", gfxBlockOrder == 0 ? "Sequential" : "Interleaved"); Print("GFX_BLOCK_SIZE", gfxBlockSize, "Threadblock size of %d", gfxBlockSize); + Print("GFX_KERNEL", gfxKernel, + "%s", gfxKernel == -1 ? "auto" : + gfxKernel == 0 ? "force GpuReduceKernel" : + gfxKernel == 1 ? "force GpuCopyKernel" : "unknown"); Print("GFX_SE_TYPE", gfxSeType, "SubExecutor granularity: %s", gfxSeType == 0 ? "Threadblock" : "Warp"); Print("GFX_SINGLE_TEAM", gfxSingleTeam, @@ -452,6 +495,8 @@ class EnvVars #if NIC_EXEC_ENABLED Print("NIC_CHUNK_BYTES", nicChunkBytes, "Sending %lu bytes at a time for NIC RDMA", nicChunkBytes); + Print("NIC_CQ_POLL_BATCH", nicCqPollBatch, + "Polling %d CQ entries per ibv_poll_cq call", nicCqPollBatch); Print("NIC_RELAX_ORDER", nicRelaxedOrder, "Using %s ordering for NIC RDMA", nicRelaxedOrder ? "relaxed" : "strict"); #endif @@ -469,6 +514,8 @@ class EnvVars Print("SHOW_BORDERS", showBorders, "%s ASCII box-drawing characaters in tables", showBorders ? "Showing" : "Hiding"); Print("SHOW_ITERATIONS", showIterations, "%s per-iteration timing", showIterations ? "Showing" : "Hiding"); + Print("SHOW_PERCENTILES", showPercentiles.empty() ? 0 : 1, "%s", + showPercentiles.empty() ? "Disabled" : GetStr(showPercentiles).c_str()); Print("USE_HIP_EVENTS", useHipEvents, "Using %s for GFX/DMA Executor timing", useHipEvents ? "HIP events" : "CPU wall time"); Print("USE_HSA_DMA", useHsaDma, @@ -500,6 +547,9 @@ class EnvVars { char const* varStr = getenv(varname.c_str()); if (varStr) { + if (varStr[0] == '\0') { + return defaultValue; + } int val = atoi(varStr); char units = varStr[strlen(varStr)-1]; switch (units) { @@ -514,10 +564,11 @@ class EnvVars static std::vector GetEnvVarArray(std::string const& varname, std::vector const& defaultValue) { - if (getenv(varname.c_str())) { + char const* raw = getenv(varname.c_str()); + if (raw) { std::vector values; - char* arrayStr = getenv(varname.c_str()); - char* token = strtok(arrayStr, ","); + std::string copy(raw); + char* token = copy.empty() ? NULL : strtok(©[0], ","); while (token) { int val; if (sscanf(token, "%d", &val) == 1) { @@ -533,12 +584,29 @@ class EnvVars return defaultValue; } + static std::vector GetEnvVarStrArray(std::string const& varname, std::vector const& defaultValue) + { + char const* raw = getenv(varname.c_str()); + if (raw) { + std::vector values; + std::string copy(raw); + char* token = copy.empty() ? 
+      while (token) {
+        values.push_back(token);
+        token = strtok(NULL, ",");
+      }
+      return values;
+    }
+    return defaultValue;
+  }
+
   static std::vector<int> GetEnvVarRangeArray(std::string const& varname, std::vector<int> const& defaultValue)
   {
-    if (getenv(varname.c_str())) {
-      char* rangeStr = getenv(varname.c_str());
+    char const* raw = getenv(varname.c_str());
+    if (raw) {
+      std::string copy(raw);
       std::set<int> values;
-      char* token = strtok(rangeStr, ",");
+      char* token = copy.empty() ? NULL : strtok(&copy[0], ",");
       while (token) {
         int start, end;
         if (sscanf(token, "%d-%d", &start, &end) == 2) {
@@ -567,13 +635,22 @@ class EnvVars
   std::string GetStr(std::vector<int> const& varnameList) const
   {
     std::string result = "";
-    for (int i = 0; i < varnameList.size(); i++) {
+    for (size_t i = 0; i < varnameList.size(); i++) {
       if (i) result += ",";
       result += std::to_string(varnameList[i]);
     }
     return result;
   }
 
+  std::string GetStr(std::vector<std::string> const& varnameList) const {
+    std::string result = "";
+    for (size_t i = 0; i < varnameList.size(); i++) {
+      if (i) result += ",";
+      result += varnameList[i];
+    }
+    return result;
+  }
+
   std::string GetCuMaskDesc() const
   {
     std::vector<std::pair<int, int>> runs;
@@ -616,7 +693,7 @@ class EnvVars
     cfg.general.numIterations      = numIterations;
     cfg.general.numSubIterations   = numSubIterations;
     cfg.general.numWarmups         = numWarmups;
-    cfg.general.recordPerIteration = showIterations;
+    cfg.general.recordPerIteration = ((showIterations != 0) || !showPercentiles.empty()) ? 1 : 0;
     cfg.general.useInteractive     = useInteractive;
 
     cfg.data.alwaysValidate = alwaysValidate;
@@ -633,6 +710,7 @@ class EnvVars
     cfg.gfx.blockOrder   = gfxBlockOrder;
     cfg.gfx.blockSize    = gfxBlockSize;
     cfg.gfx.cuMask       = cuMask;
+    cfg.gfx.gfxKernel    = gfxKernel;
     cfg.gfx.prefXccTable = prefXccTable;
     cfg.gfx.seType       = gfxSeType;
     cfg.gfx.unrollFactor = gfxUnroll;
@@ -644,6 +722,7 @@ class EnvVars
     cfg.gfx.wordSize = gfxWordSize;
 
     cfg.nic.chunkBytes      = nicChunkBytes;
+    cfg.nic.cqPollBatch     = nicCqPollBatch;
     cfg.nic.ibGidIndex      = ibGidIndex;
     cfg.nic.ibPort          = ibPort;
     cfg.nic.ipAddressFamily = ipAddressFamily;
diff --git a/src/client/Presets/AllToAll.hpp b/src/client/Presets/AllToAll.hpp
index 2beae8af..cfea85c3 100644
--- a/src/client/Presets/AllToAll.hpp
+++ b/src/client/Presets/AllToAll.hpp
@@ -22,9 +22,10 @@ THE SOFTWARE.
#include -int AllToAllPreset(EnvVars& ev, - size_t const numBytesPerTransfer, - std::string const presetName) +int AllToAllPreset(EnvVars& ev, + size_t const numBytesPerTransfer, + std::string const presetName, + bool const bytesSpecified) { enum { @@ -54,7 +55,6 @@ int AllToAllPreset(EnvVars& ev, int numSubExecs = EnvVars::GetEnvVar("NUM_SUB_EXEC" , 8); int showDetails = EnvVars::GetEnvVar("SHOW_DETAILS" , 0); int useDmaExec = EnvVars::GetEnvVar("USE_DMA_EXEC" , 0); - int useFineGrain = EnvVars::GetEnvVar("USE_FINE_GRAIN" , -999); // Deprecated int useRemoteRead = EnvVars::GetEnvVar("USE_REMOTE_READ", 0); // Check that all ranks have at least the number of GPUs requested @@ -64,7 +64,7 @@ int AllToAllPreset(EnvVars& ev, for (int rank = 0; rank < numRanks; rank++) { if (numGpus > TransferBench::GetNumExecutors(EXE_GPU_GFX, rank)) { Utils::Print("[ERROR] All-to-All preset requires each rank to have the same number of GPUs\n"); - return 1; + return ERR_FATAL; } if (numQueuePairs > 0 && numNics != TransferBench::GetNumExecutors(EXE_NIC, rank)) nicDifference = true; @@ -81,17 +81,12 @@ int AllToAllPreset(EnvVars& ev, a2aMode = EnvVars::GetEnvVar("A2A_MODE", 0); if (a2aMode < 0 || a2aMode > 2) { Utils::Print("[ERROR] a2aMode must be between 0 and 2, or else numSrcs:numDsts\n"); - return 1; + return ERR_FATAL; } numSrcs = (a2aMode == A2A_WRITE_ONLY ? 0 : 1); numDsts = (a2aMode == A2A_READ_ONLY ? 0 : 1); } - // Deprecated env var check - if (useFineGrain != -999) { - memTypeIdx = useFineGrain ? 2 : 0; - } - MemType memType = Utils::GetGpuMemType(memTypeIdx); std::string devMemTypeStr = Utils::GetGpuMemTypeStr(memTypeIdx); @@ -120,15 +115,15 @@ int AllToAllPreset(EnvVars& ev, // Validate env vars if (numGpus < 0 || numGpus > numDetectedGpus) { Utils::Print("[ERROR] Cannot use %d GPUs. Detected %d GPUs\n", numGpus, numDetectedGpus); - return 1; + return ERR_FATAL; } if (useDmaExec && (numSrcs != 1 || numDsts != 1)) { Utils::Print("[ERROR] DMA execution can only be used for copies (A2A_MODE=0)\n"); - return 1; + return ERR_FATAL; } if (numResults * 2 > numRanks) { Utils::Print("[ERROR] Number of extrema results requested exceeds number of ranks. 
NUM_RESULTS should be at most half the number of ranks\n"); - return 1; + return ERR_FATAL; } // Collect the number of GPU devices to use @@ -201,14 +196,14 @@ int AllToAllPreset(EnvVars& ev, if (!TransferBench::RunTransfers(cfg, transfers, results)) { for (auto const& err : results.errResults) Utils::Print("%s\n", err.errMsg.c_str()); - return 1; + return ERR_FATAL; } else if (showDetails) { Utils::PrintResults(ev, 1, transfers, results); Utils::Print("\n"); } // Only ranks that actually do output will compile results - if (!Utils::RankDoesOutput()) return 0; + if (!Utils::RankDoesOutput()) return ERR_NONE; // Prepare table of results int numRows = 2 + (numGpus + 1) * (1 + 2*numResults); @@ -491,10 +486,5 @@ int AllToAllPreset(EnvVars& ev, printf("[WARN] It is recommended to run TransferBench with one rank per host to avoid potential aliasing of executors\n"); } - if (useFineGrain != -999) { - Utils::Print("[WARN] USE_FINE_GRAIN has been deprecated and replaced by MEM_TYPE\n"); - Utils::Print("[WARN] MEM_TYPE has been set to %d to correspond to previous use of USE_FINE_GRAIN=%d\n", memTypeIdx, useFineGrain); - } - - return 0; + return ERR_NONE; } diff --git a/src/client/Presets/AllToAllN.hpp b/src/client/Presets/AllToAllN.hpp index 7dac6b22..15698917 100644 --- a/src/client/Presets/AllToAllN.hpp +++ b/src/client/Presets/AllToAllN.hpp @@ -23,13 +23,14 @@ THE SOFTWARE. #include #include "EnvVars.hpp" -int AllToAllRdmaPreset(EnvVars& ev, - size_t const numBytesPerTransfer, - std::string const presetName) +int AllToAllRdmaPreset(EnvVars& ev, + size_t const numBytesPerTransfer, + std::string const presetName, + bool const bytesSpecified) { if (TransferBench::GetNumRanks() > 1) { Utils::Print("[ERROR]a2an preset currently not supported for multi-node\n"); - return 1; + return ERR_FATAL; } int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX); @@ -38,12 +39,6 @@ int AllToAllRdmaPreset(EnvVars& ev, int numGpus = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus); int numQueuePairs = EnvVars::GetEnvVar("NUM_QUEUE_PAIRS", 1); int memTypeIdx = EnvVars::GetEnvVar("MEM_TYPE" , 2); - int useFineGrain = EnvVars::GetEnvVar("USE_FINE_GRAIN" , -999); // Deprecated - - // Deprecated env var check - if (useFineGrain != -999) { - memTypeIdx = useFineGrain ? 2 : 0; - } MemType memType = Utils::GetGpuMemType(memTypeIdx); std::string memTypeStr = Utils::GetGpuMemTypeStr(memTypeIdx); @@ -63,7 +58,7 @@ int AllToAllRdmaPreset(EnvVars& ev, // Validate env vars if (numGpus < 0 || numGpus > numDetectedGpus) { Utils::Print("[ERROR] Cannot use %d GPUs. Detected %d GPUs\n", numGpus, numDetectedGpus); - return 1; + return ERR_FATAL; } @@ -97,7 +92,7 @@ int AllToAllRdmaPreset(EnvVars& ev, if (!TransferBench::RunTransfers(cfg, transfers, results)) { for (auto const& err : results.errResults) Utils::Print("%s\n", err.errMsg.c_str()); - return 1; + return ERR_FATAL; } else { Utils::PrintResults(ev, 1, transfers, results); } @@ -154,5 +149,5 @@ int AllToAllRdmaPreset(EnvVars& ev, Utils::PrintErrors(results.errResults); - return 0; + return ERR_NONE; } diff --git a/src/client/Presets/AllToAllSweep.hpp b/src/client/Presets/AllToAllSweep.hpp index 9da2fc0d..36e571d9 100644 --- a/src/client/Presets/AllToAllSweep.hpp +++ b/src/client/Presets/AllToAllSweep.hpp @@ -22,13 +22,14 @@ THE SOFTWARE. 
#include "EnvVars.hpp" -int AllToAllSweepPreset(EnvVars& ev, - size_t const numBytesPerTransfer, - std::string const presetName) +int AllToAllSweepPreset(EnvVars& ev, + size_t const numBytesPerTransfer, + std::string const presetName, + bool const bytesSpecified) { if (TransferBench::GetNumRanks() > 1) { Utils::Print("[ERROR] All to All Sweep preset currently not supported for multi-node\n"); - return 1; + return ERR_FATAL; } enum @@ -42,22 +43,24 @@ int AllToAllSweepPreset(EnvVars& ev, // Force single-stream mode for all-to-all benchmark ev.useSingleStream = 1; + // Default to GPU-event timing for a2asweep (overridable via USE_HIP_EVENTS=0 for CPU wall-clock) + ev.useHipEvents = EnvVars::GetEnvVar("USE_HIP_EVENTS", 1); int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX); // Collect env vars for this preset int a2aDirect = EnvVars::GetEnvVar("A2A_DIRECT" , 1); int a2aLocal = EnvVars::GetEnvVar("A2A_LOCAL" , 0); + int memTypeIdx = EnvVars::GetEnvVar("MEM_TYPE" , 2); int numGpus = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus); int showMinOnly = EnvVars::GetEnvVar("SHOW_MIN_ONLY", 1); - int useFineGrain = EnvVars::GetEnvVar("USE_FINE_GRAIN" , 1); int useRemoteRead = EnvVars::GetEnvVar("USE_REMOTE_READ", 0); int useSpray = EnvVars::GetEnvVar("USE_SPRAY", 0); int verbose = EnvVars::GetEnvVar("VERBOSE", 0); - std::vector blockList = EnvVars::GetEnvVarArray("BLOCKSIZES", {256}); + std::vector blockList = EnvVars::GetEnvVarArray("BLOCKSIZES", {256,512,768,1024}); std::vector unrollList = EnvVars::GetEnvVarArray("UNROLLS", {1,2,3,4,6,8}); - std::vector numCusList = EnvVars::GetEnvVarArray("NUM_CUS", {4,8,12,16,24,32}); + std::vector numSesList = EnvVars::GetEnvVarArray("NUM_SUB_EXECS", {4,8,12,16,24,32}); // A2A_MODE may be 0,1,2 or else custom numSrcs:numDsts int numSrcs, numDsts; @@ -74,6 +77,9 @@ int AllToAllSweepPreset(EnvVars& ev, numDsts = (a2aMode == A2A_READ_ONLY ? 0 : 1); } + MemType memType = Utils::GetGpuMemType(memTypeIdx); + std::string devMemTypeStr = Utils::GetGpuMemTypeStr(memTypeIdx); + // Print off environment variables ev.DisplayEnvVars(); if (!ev.hideEnv) { @@ -84,13 +90,13 @@ int AllToAllSweepPreset(EnvVars& ev, (a2aMode == A2A_CUSTOM) ? (std::to_string(numSrcs) + " read(s) " + std::to_string(numDsts) + " write(s)").c_str(): a2aModeStr[a2aMode]); ev.Print("BLOCKSIZES" , blockList.size() , EnvVars::ToStr(blockList).c_str()); - ev.Print("SHOW_MIN_ONLY" , showMinOnly , showMinOnly ? "Showing only slowest GPU results" : "Showing slowest and fastest GPU results"); - ev.Print("NUM_CUS" , numCusList.size(), EnvVars::ToStr(numCusList).c_str()); + ev.Print("MEM_TYPE" , memTypeIdx , "Using %s GPU memory (%s)", devMemTypeStr.c_str(), Utils::GetAllGpuMemTypeStr().c_str()); ev.Print("NUM_GPU_DEVICES", numGpus , "Using %d GPUs", numGpus); + ev.Print("NUM_SUB_EXECS" , numSesList.size(), EnvVars::ToStr(numSesList).c_str()); + ev.Print("SHOW_MIN_ONLY" , showMinOnly , showMinOnly ? "Showing only slowest GPU results" : "Showing slowest and fastest GPU results"); ev.Print("UNROLLS" , unrollList.size(), EnvVars::ToStr(unrollList).c_str()); - ev.Print("USE_FINE_GRAIN" , useFineGrain , "Using %s-grained memory", useFineGrain ? "fine" : "coarse"); ev.Print("USE_REMOTE_READ", useRemoteRead , "Using %s as executor", useRemoteRead ? "DST" : "SRC"); - ev.Print("USE_SPRAY" , useSpray , "%s per CU", useSpray ? "All targets" : "One target"); + ev.Print("USE_SPRAY" , useSpray , "%s per SubExecutor", useSpray ? "All targets" : "One target"); ev.Print("VERBOSE" , verbose , verbose ? 
"Display test results" : "Display summary only"); printf("\n"); } @@ -107,14 +113,13 @@ int AllToAllSweepPreset(EnvVars& ev, } // Collect the number of GPU devices to use - MemType memType = useFineGrain ? MEM_GPU_FINE : MEM_GPU; ExeType exeType = EXE_GPU_GFX; std::vector transfers; int targetCount = 0; if (!useSpray) { - // Each CU will work on just one target + // Each SubExecutor will work on just one target for (int i = 0; i < numGpus; i++) { targetCount = 0; for (int j = 0; j < numGpus; j++) { @@ -144,7 +149,10 @@ int AllToAllSweepPreset(EnvVars& ev, } } } else { - // Each CU will work on all targets + // Each CU will work on all targets. + // NOTE: targetCount ends up reflecting the last GPU's target count. This is correct for + // symmetric topologies (all GPUs have equal peer counts), but may be inaccurate with + // A2A_DIRECT on asymmetric hardware where different GPUs have different hop-1 peer counts. for (int i = 0; i < numGpus; i++) { TransferBench::Transfer transfer; transfer.numBytes = numBytesPerTransfer; @@ -172,70 +180,116 @@ int AllToAllSweepPreset(EnvVars& ev, } } - printf("GPU-GFX All-To-All Sweep benchmark:\n"); - printf("==========================\n"); - printf("- Copying %lu bytes between %s pairs of GPUs\n", numBytesPerTransfer, a2aDirect ? "directly connected" : "all"); + Utils::Print("GPU-GFX All-To-All Sweep benchmark (%lu bytes, local=%s). All values are %s GB/s\n", + numBytesPerTransfer, + a2aLocal ? "yes" : "no", + ev.useHipEvents ? "GPU-Event-Timed (min over GPUs)": "CPU-Timed"); + Utils::Print("=======================================================================================\n"); if (transfers.size() == 0) { - printf("[WARN} No transfers requested. Try adjusting A2A_DIRECT or A2A_LOCAL\n"); + Utils::Print("[WARN] No transfers requested. Try adjusting A2A_DIRECT or A2A_LOCAL\n"); return 0; } // Execute Transfers TransferBench::ConfigOptions cfg = ev.ToConfigOptions(); - // Run tests - std::map, TransferBench::TestResults> results; + char sep = ev.outputToCsv ? ',' : ' '; + + double bestMinBw = 0.0; + int bestBlock = -1, bestUnroll = -1, bestNumSes = -1; + + // Print header once + Utils::Print(" BlkS %c UnR ", sep); + for (int c : numSesList) { + Utils::Print("%c SE %03d", sep, c); + if (ev.useHipEvents && !showMinOnly) { + Utils::Print("%c SE%03dMx", sep, c); + } + } + Utils::Print("\n"); + + // Results keyed by (blockSize, numSes, unroll) for verbose output + std::map, TransferBench::TestResults> results; - // Display summary for (int blockSize : blockList) { - printf("Blocksize: %d\n", blockSize); - ev.gfxBlockSize = cfg.gfx.blockSize = blockSize; + cfg.gfx.blockSize = blockSize; - printf("#CUs\\Unroll"); for (int u : unrollList) { - printf(" %d(Min) ", u); - if (!showMinOnly) printf(" %d(Max) ", u); - } - printf("\n"); - for (int c : numCusList) { - printf(" %5d ", c); fflush(stdout); - for (int u : unrollList) { - ev.gfxUnroll = cfg.gfx.unrollFactor = u; - for (auto& transfer : transfers) + cfg.gfx.unrollFactor = u; + Utils::Print("%5d %c %3d ", blockSize, sep, u); + fflush(stdout); + + for (int c : numSesList) { + for (auto& transfer : transfers) { transfer.numSubExecs = useSpray ? 
(c * targetCount) : c; + } - double minBandwidth = std::numeric_limits::max(); - double maxBandwidth = std::numeric_limits::min(); TransferBench::TestResults result; + double minBw = 0.0, maxBw = 0.0; if (TransferBench::RunTransfers(cfg, transfers, result)) { - for (auto const& exeResult : result.exeResults) { - minBandwidth = std::min(minBandwidth, exeResult.second.avgBandwidthGbPerSec); - maxBandwidth = std::max(maxBandwidth, exeResult.second.avgBandwidthGbPerSec); + if (!ev.useHipEvents) { + minBw = result.avgTotalBandwidthGbPerSec; + if (useSpray) { + minBw *= targetCount; + } + } else { + minBw = std::numeric_limits::max(); + maxBw = std::numeric_limits::lowest(); + for (auto const& exeResult : result.exeResults) { + minBw = std::min(minBw, exeResult.second.avgBandwidthGbPerSec); + maxBw = std::max(maxBw, exeResult.second.avgBandwidthGbPerSec); + } + if (useSpray) { + minBw *= targetCount; + maxBw *= targetCount; + } + } + if (minBw > bestMinBw) { + bestMinBw = minBw; + bestBlock = blockSize; + bestUnroll = u; + bestNumSes = c; } - if (useSpray) { - minBandwidth *= targetCount; - maxBandwidth *= targetCount; + if (verbose) { + results[std::make_tuple(blockSize, c, u)] = result; } - results[std::make_pair(c,u)] = result; - } else { - minBandwidth = 0.0; } - printf(" %7.2f ", minBandwidth); - if (!showMinOnly) printf(" %7.2f ", maxBandwidth); + Utils::Print("%c%8.2f", sep, minBw); + if (ev.useHipEvents && !showMinOnly) { + Utils::Print("%c%8.2f", sep, maxBw); + } fflush(stdout); } - printf("\n"); fflush(stdout); + Utils::Print("\n"); + fflush(stdout); } + } + Utils::Print("=======================================================================================\n"); - if (verbose) { - int testNum = 0; - for (int c : numCusList) { + if (verbose) { + int testNum = 0; + for (int blockSize : blockList) { + for (int c : numSesList) { for (int u : unrollList) { - printf("CUs: %d Unroll %d\n", c, u); - Utils::PrintResults(ev, ++testNum, transfers, results[std::make_pair(c,u)]); + auto verboseTransfers = transfers; + for (auto& t : verboseTransfers) { + t.numSubExecs = useSpray ? (c * targetCount) : c; + } + Utils::Print("BlockSize: %d SubExecs: %d Unroll: %d\n", blockSize, c, u); + Utils::PrintResults(ev, ++testNum, verboseTransfers, results[std::make_tuple(blockSize, c, u)]); } } } } - return 1; + + // Print combination that produced highest bandwidth + if (bestBlock != -1) { + Utils::Print("Highest %s bandwidth found: %7.2f GB/s\n", + ev.useHipEvents ? "GPU-event-timed (min)" : "CPU-timed", bestMinBw); + Utils::Print(" BlockSize : %7d\n", bestBlock); + Utils::Print(" Unroll : %7d\n", bestUnroll); + Utils::Print(" NumSubExec : %7d\n", bestNumSes); + } + + return ERR_NONE; } diff --git a/src/client/Presets/BmaSweep.hpp b/src/client/Presets/BmaSweep.hpp new file mode 100644 index 00000000..7bc9ab77 --- /dev/null +++ b/src/client/Presets/BmaSweep.hpp @@ -0,0 +1,182 @@ +/* +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+int BmaSweepPreset(EnvVars& ev,
+                   size_t const numBytesPerTransfer,
+                   std::string const presetName,
+                   bool const bytesSpecified)
+{
+  if (TransferBench::GetNumRanks() > 1) {
+    Utils::Print("[ERROR] BMA sweep preset currently not supported for multi-node\n");
+    return ERR_FATAL;
+  }
+
+#ifndef BMA_EXEC_ENABLED
+  Utils::Print("[ERROR] BMA executor requires ROCm 7.1 or newer\n");
+  return ERR_FATAL;
+#endif
+
+  int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
+
+  // Collect env vars for this preset
+  int exeIndex           = EnvVars::GetEnvVar("EXE_INDEX"       , 0);
+  int localCopy          = EnvVars::GetEnvVar("LOCAL_COPY"      , 0);
+  vector<int> gfxSesList = EnvVars::GetEnvVarArray("GFX_SUB_EXECS", {});
+  int gpuMemTypeIdx      = EnvVars::GetEnvVar("GPU_MEM_TYPE"    , 0);
+  int numGpuDevices      = EnvVars::GetEnvVar("NUM_GPU_DEVICES" , numDetectedGpus);
+  vector<int> bmaSesList = EnvVars::GetEnvVarArray("NUM_SUB_EXECS", {1,2,4,8});
+
+  MemType gpuMemType = Utils::GetGpuMemType(gpuMemTypeIdx);
+
+  // Display environment variables
+  if (Utils::RankDoesOutput()) {
+    ev.DisplayEnvVars();
+    if (!ev.hideEnv) {
+      int outputToCsv = ev.outputToCsv;
+      if (!outputToCsv) printf("[BMA Sweep Related]\n");
+      ev.Print("EXE_INDEX"      , exeIndex, "Executing on GPU %d", exeIndex);
+      ev.Print("LOCAL_COPY"     , localCopy, "%s local copy to GPU %d", localCopy ? "Including" : "Excluding", exeIndex);
+      ev.Print("GFX_SUB_EXECS"  , gfxSesList.size(), EnvVars::ToStr(gfxSesList).c_str());
+      ev.Print("GPU_MEM_TYPE"   , gpuMemTypeIdx, "Using %s (%s)", Utils::GetGpuMemTypeStr(gpuMemTypeIdx).c_str(), Utils::GetAllGpuMemTypeStr().c_str());
+      ev.Print("NUM_GPU_DEVICES", numGpuDevices, "Using %d GPUs", numGpuDevices);
+      ev.Print("NUM_SUB_EXECS"  , bmaSesList.size(), EnvVars::ToStr(bmaSesList).c_str());
+      printf("\n");
+    }
+  }
+
+  if (exeIndex < 0 || exeIndex >= numGpuDevices) {
+    Utils::Print("[ERROR] EXE_INDEX must be between 0 and %d inclusive\n", numGpuDevices - 1);
+    return ERR_FATAL;
+  }
+
+  int numTransfers  = numGpuDevices - 1 + (localCopy ? 1 : 0);
+  int numBmaSubExec = (int)bmaSesList.size();
+  int numGfxSubExec = (int)gfxSesList.size();
+
+  TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
+  TransferBench::TestResults results;
+
+  // Prepare table of results
+  int minPow2Exp = 12;
+  int maxPow2Exp = 30;
+  int numRows = 1 + (bytesSpecified ? 1 : (maxPow2Exp - minPow2Exp + 1));
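+  // Rows: one header row plus one row per tested size; when no explicit size is given,
+  // the sweep below walks power-of-two sizes from 2^12 (4 KiB) through 2^30 (1 GiB).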
+  int numCols = 2 + numBmaSubExec + numGfxSubExec;
+
+  Utils::TableHelper table(numRows, numCols);
+  Utils::Print("Performing %d simultaneous DMA Transfers from GPU %d to other GPUs\n", numTransfers, exeIndex);
+
+  // Prepare headers
+  table.Set(0, 0, "   Bytes   ");
+  table.Set(0, 1, "   DMA   ");
+  for (int i = 0; i < numBmaSubExec; i++) {
+    table.Set(0, 2+i, " BMA(%02d) ", bmaSesList[i]);
+  }
+  for (int i = 0; i < numGfxSubExec; i++) {
+    table.Set(0, 2+numBmaSubExec+i, " GFX(%02d) ", gfxSesList[i]);
+  }
+
+  table.DrawRowBorder(0);
+  table.DrawRowBorder(1);
+  table.DrawRowBorder(numRows);
+  table.DrawColBorder(0);
+  table.DrawColBorder(1);
+  table.DrawColBorder(2);
+  table.DrawColBorder(2+numBmaSubExec);
+  table.DrawColBorder(numCols);
+
+  if (!ev.outputToCsv) {
+    Utils::Print("Executing: ");
+    fflush(stdout);
+  }
+
+  int currRow = 0;
+  for (size_t numBytes = 1ULL << minPow2Exp; numBytes <= (1ULL << maxPow2Exp); numBytes <<= 1) {
+    if (bytesSpecified) numBytes = numBytesPerTransfer;
+    currRow++;
+    table.Set(currRow, 0, " %lu ", numBytes);
+
+    std::vector<Transfer> transfers(1);
+
+    Transfer& t = transfers[0];
+    t.numBytes = numBytes;
+    t.srcs = {{gpuMemType, exeIndex}};
+    t.dsts.clear();
+    for (int i = 0; i < numGpuDevices; i++) {
+      if (i == exeIndex && localCopy == 0) continue;
+      t.dsts.push_back({gpuMemType, i});
+    }
+
+    // DMA executor first
+    t.exeDevice = {EXE_GPU_DMA, exeIndex};
+    t.numSubExecs = 1;
+
+    if (!TransferBench::RunTransfers(cfg, transfers, results)) {
+      for (auto const& err : results.errResults)
+        Utils::Print("%s\n", err.errMsg.c_str());
+      return ERR_FATAL;
+    }
+
+    table.Set(currRow, 1, " %6.2f ", numTransfers * results.tfrResults[0].avgBandwidthGbPerSec);
+
+    // BMA executor next
+    t.exeDevice = {EXE_GPU_BDMA, exeIndex};
+    for (int i = 0; i < numBmaSubExec; i++) {
+      t.numSubExecs = bmaSesList[i];
+
+      if (!TransferBench::RunTransfers(cfg, transfers, results)) {
+        for (auto const& err : results.errResults)
+          Utils::Print("%s\n", err.errMsg.c_str());
+        return ERR_FATAL;
+      }
+
+      table.Set(currRow, 2+i, " %6.2f ", numTransfers * results.tfrResults[0].avgBandwidthGbPerSec);
+    }
+
+    // GFX executor last
+    t.exeDevice = {EXE_GPU_GFX, exeIndex};
+    for (int i = 0; i < numGfxSubExec; i++) {
+      t.numSubExecs = gfxSesList[i];
+
+      if (!TransferBench::RunTransfers(cfg, transfers, results)) {
+        for (auto const& err : results.errResults)
+          Utils::Print("%s\n", err.errMsg.c_str());
+        return ERR_FATAL;
+      }
+
+      table.Set(currRow, 2+numBmaSubExec+i, " %6.2f ", results.tfrResults[0].avgBandwidthGbPerSec);
+    }
+    if (bytesSpecified) break;
+  }
+
+  if (!ev.outputToCsv) {
+    Utils::Print("\n");
+  }
+  table.PrintTable(ev.outputToCsv, ev.showBorders);
+  Utils::Print("Reported numbers are all GB/s, normalized per Transfer for %d Transfers\n", numTransfers);
+
+  return ERR_NONE;
+}
diff --git a/src/client/Presets/EnvVarsList.hpp b/src/client/Presets/EnvVarsList.hpp
new file mode 100644
index 00000000..90fbcd39
--- /dev/null
+++ b/src/client/Presets/EnvVarsList.hpp
@@ -0,0 +1,31 @@
+/*
+Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +int EnvVarsPreset([[maybe_unused]] EnvVars& ev, + [[maybe_unused]] size_t const numBytesPerTransfer, + [[maybe_unused]] std::string const presetName, + [[maybe_unused]] bool const bytesSpecified) +{ + if (!Utils::RankDoesOutput()) return 0; + EnvVars::DisplayEnvVarsList(); + return 0; +} diff --git a/src/client/Presets/GfxSweep.hpp b/src/client/Presets/GfxSweep.hpp new file mode 100644 index 00000000..fdf4fd8d --- /dev/null +++ b/src/client/Presets/GfxSweep.hpp @@ -0,0 +1,239 @@ +/* +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/
+
+#include "EnvVars.hpp"
+
+int GfxSweepPreset(EnvVars& ev,
+                   size_t const numBytesPerTransfer,
+                   std::string const presetName,
+                   bool const bytesSpecified)
+{
+  enum TimingMode
+  {
+    TimingModeAuto = -1,
+    TimingModeCpu  = 0,
+    TimingModeHip  = 1,
+    TimingModeGpu  = 2
+  };
+
+  // Collect environment variables for this preset
+  vector<int> blockList     = EnvVars::GetEnvVarArray("BLOCKSIZES",     {256,512,768,1024});
+  std::string transferStr   = EnvVars::GetEnvVar(     "GFX_TRANSFER",   "R0G0->R0G0->R0G0");
+  vector<int> kernelList    = EnvVars::GetEnvVarArray("KERNELS",        {0});
+  vector<int> numSesList    = EnvVars::GetEnvVarArray("NUM_SUB_EXECS",  {4,8,16,32,64});
+  int numTransfers          = EnvVars::GetEnvVar(     "NUM_TRANSFERS",  1);
+  vector<int> temporalList  = EnvVars::GetEnvVarArray("TEMPORAL_MODES", {0});
+  int timingMode            = EnvVars::GetEnvVar(     "TIMING_MODE",    TimingModeAuto);
+  vector<int> unrollList    = EnvVars::GetEnvVarArray("UNROLLS",        {1,2,4,8,16});
+  vector<int> waveOrderList = EnvVars::GetEnvVarArray("WAVE_ORDERS",    {0});
+  vector<int> wordSizeList  = EnvVars::GetEnvVarArray("WORDSIZES",      {4});
+
+  // Print off relevant environment variables
+  if (Utils::RankDoesOutput()) {
+    if (!ev.hideEnv) {
+      ev.DisplayEnvVars();
+      if (!ev.outputToCsv)
+        Utils::Print("[GFX Sweep Related]\n");
+      ev.Print("BLOCKSIZES",     blockList.size(),     EnvVars::ToStr(blockList).c_str());
+      ev.Print("GFX_TRANSFER",   transferStr,          "GFX Transfer to sweep (see config file format)");
+      ev.Print("KERNELS",        kernelList.size(),    EnvVars::ToStr(kernelList).c_str());
+      ev.Print("NUM_TRANSFERS",  numTransfers,         "Number of Transfers specified in GFX_TRANSFER");
+      ev.Print("NUM_SUB_EXECS",  numSesList.size(),    EnvVars::ToStr(numSesList).c_str());
+      ev.Print("TEMPORAL_MODES", temporalList.size(),  EnvVars::ToStr(temporalList).c_str());
+      ev.Print("TIMING_MODE",    timingMode,           "-1=auto, 0=Aggregate CPU, 1=Executor Time, 2=Transfer Time");
+      ev.Print("UNROLLS",        unrollList.size(),    EnvVars::ToStr(unrollList).c_str());
+      ev.Print("WAVE_ORDERS",    waveOrderList.size(), EnvVars::ToStr(waveOrderList).c_str());
+      ev.Print("WORDSIZES",      wordSizeList.size(),  EnvVars::ToStr(wordSizeList).c_str());
+      Utils::Print("\n");
+    }
+  }
+
+  if (timingMode < TimingModeAuto || timingMode > TimingModeGpu) {
+    Utils::Print("[ERROR] TIMING_MODE value is invalid (%d)\n", timingMode);
+    return ERR_FATAL;
+  }
+
+  if (numSesList.empty()) {
+    Utils::Print("[ERROR] NUM_SUB_EXECS should not be empty\n");
+    return ERR_FATAL;
+  }
+
+  std::vector<Transfer> transfers;
+  Utils::CheckForError(ParseTransfers(std::to_string(numTransfers) + " 1 " + transferStr, transfers));
+  if (transfers.size() == 0) {
+    Utils::Print("[WARN] No valid Transfers found in GFX_TRANSFER\n");
+    return 0;
+  }
+
+  // Automatically pick timing method
+  if (timingMode == TimingModeAuto) {
+    // Use Transfer timing if there is only one Transfer
+    if (transfers.size() == 1) timingMode = TimingModeGpu;
+    // Use Executor timing if there is only one executor
+    else {
+      bool singleExecutor = true;
+      for (size_t i = 1; i < transfers.size(); i++) {
+        if (transfers[i].exeDevice < transfers[0].exeDevice ||
+            transfers[0].exeDevice < transfers[i].exeDevice ||
+            transfers[i].exeSubIndex != transfers[0].exeSubIndex ||
+            transfers[i].exeSubSlot  != transfers[0].exeSubSlot) {
+          singleExecutor = false;
+          break;
+        }
+      }
+      timingMode = singleExecutor ? TimingModeHip : TimingModeCpu;
+    }
+  }
+  if (timingMode < 0 || timingMode > 2) {
+    Utils::Print("[ERROR] Invalid timing mode %d\n", timingMode);
+    return ERR_FATAL;
+  }
+
+  // Print out the Transfers being run
+  Utils::Print("GFX sweep: (%lu bytes per Transfer). All values are %s-timed GB/s\n", numBytesPerTransfer,
+               timingMode == TimingModeCpu ? "Aggregate-CPU" :
+               timingMode == TimingModeHip ? "HIP-event" :
+                                             "GPU wallclock");
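+  // The banner reflects the resolved timing mode: with TIMING_MODE=-1 this is per-Transfer
+  // GPU timing for a single Transfer, HIP-event executor timing when every Transfer shares
+  // one executor, and aggregate CPU timing otherwise.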
+  Utils::Print("=======================================================================================\n");
+
+  bool isMultiNode = GetNumRanks() > 1;
+  for (size_t i = 0; i < transfers.size(); i++) {
+    Transfer& t = transfers[i];
+    Utils::Print("Transfer %5lu: (%s->", i, Utils::MemDevicesToStr(t.srcs).c_str());
+    if (isMultiNode) Utils::Print("R%d", t.exeDevice.exeRank);
+    Utils::Print("%c%d", ExeTypeStr[t.exeDevice.exeType], t.exeDevice.exeIndex);
+    if (t.exeDevice.exeSlot) Utils::Print("%c", 'A' + t.exeDevice.exeSlot);
+    if (t.exeSubIndex != -1) Utils::Print(".%d", t.exeSubIndex);
+    if (t.exeSubSlot != 0) Utils::Print("%c", 'A' + t.exeSubSlot);
+    Utils::Print("->%s)\n", Utils::MemDevicesToStr(t.dsts).c_str());
+
+    if (t.exeDevice.exeType != EXE_GPU_GFX) {
+      Utils::Print("[ERROR] gfxsweep preset only works on Transfers that are using GFX executor\n");
+      return ERR_FATAL;
+    }
+    t.numBytes = numBytesPerTransfer;
+  }
+
+  Utils::Print("=======================================================================================\n");
+
+  ConfigOptions cfg = ev.ToConfigOptions();
+
+  // Print header
+  char sep = ev.outputToCsv ? ',' : ' ';
+  Utils::Print(" WvO %c WSz %c TpM %c BlkS %c UnR %c KrN ", sep, sep, sep, sep, sep);
+  for (int numSubExec : numSesList)
+    Utils::Print("%c SE %03d", sep, numSubExec);
+  Utils::Print("\n");
+
+  int bestSe = -1;
+  double overallBestBw = 0;
+  vector<double> bestBw(numSesList.size(), 0.0);
+  vector<vector<int>> best(numSesList.size(), vector<int>(7));
+
+  // Loop over all combinations
+  for (int waveOrder    : waveOrderList) { cfg.gfx.waveOrder    = waveOrder;
+  for (int wordSize     : wordSizeList)  { cfg.gfx.wordSize     = wordSize;
+  for (int temporalMode : temporalList)  { cfg.gfx.temporalMode = temporalMode;
+  for (int blockSize    : blockList)     { cfg.gfx.blockSize    = blockSize;
+  for (int unroll       : unrollList)    { cfg.gfx.unrollFactor = unroll;
+  for (int kernelIdx    : kernelList)    { cfg.gfx.gfxKernel    = kernelIdx;
+    Utils::Print(" %1d  %c %1d  %c %1d  %c %4d %c %2d %c %1d  ",
+                 waveOrder, sep, wordSize, sep, temporalMode, sep,
+                 blockSize, sep, unroll, sep, kernelIdx, sep);
+
+    for (size_t s = 0; s < numSesList.size(); s++) {
+      int numSubExec = numSesList[s];
+      for (Transfer& t : transfers) t.numSubExecs = numSubExec;
+
+      TestResults result;
+      if (RunTransfers(cfg, transfers, result)) {
+        double bw = 0.0;
+        switch (timingMode) {
+        case 0: bw = result.avgTotalBandwidthGbPerSec; break;
+        case 1:
+          for (auto const& e : result.exeResults) {
+            bw = std::max(bw, e.second.avgBandwidthGbPerSec);
+          }
+          break;
+        case 2: default:
+          for (auto const& t : result.tfrResults) {
+            bw = std::max(bw, t.avgBandwidthGbPerSec);
+          }
+          break;
+        }
+
+        if (bw > bestBw[s]) {
+          bestBw[s] = bw;
+          best[s] = {waveOrder, wordSize, temporalMode, blockSize, unroll, kernelIdx, numSubExec};
+          if (bw > overallBestBw) {
+            overallBestBw = bw;
+            bestSe = (int)s;
+          }
+        }
+        Utils::Print("%c%8.2f", sep, bw);
+        fflush(stdout);
+      } else {
+        Utils::Print("\n");
+        Utils::PrintErrors(result.errResults);
+        return ERR_FATAL;
+      }
+    }
+    Utils::Print("\n");
+    fflush(stdout);
+  }
+  }
+  }
+  }
+  }
+  }
+
+  Utils::Print(" WvO %c WSz %c TpM %c BlkS %c UnR %c KrN ", sep, sep, sep, sep, sep);
+  for (size_t s = 0; s < numSesList.size(); s++) {
+    Utils::Print("%c%8.2f", sep, bestBw[s]);
+  }
+  Utils::Print("\n");
+
+  if (bestSe == -1) {
+    Utils::Print("[ERROR] No transfers executed - make sure sweep parameter lists are 
not empty\n"); + return ERR_FATAL; + } + + // Print combination that produced highest bandwidth + Utils::Print("=======================================================================================\n"); + Utils::Print("Highest bandwidth found: %7.2f GB/s (%s-timed)\n", overallBestBw, + timingMode == TimingModeCpu ? "Aggregate-CPU" : + timingMode == TimingModeHip ? "HIP-event" : + "GPU wallclock"); + Utils::Print(" WaveOrder : %7d [GFX_WAVE_ORDER=%d]\n", best[bestSe][0], best[bestSe][0]); + Utils::Print(" WordSize : %7d [GFX_WORD_SIZE=%d]\n", best[bestSe][1], best[bestSe][1]); + Utils::Print(" Temporal Mode: %7d [GFX_TEMPORAL=%d]\n", best[bestSe][2], best[bestSe][2]); + Utils::Print(" BlockSize : %7d [GFX_BLOCK_SIZE=%d]\n", best[bestSe][3], best[bestSe][3]); + Utils::Print(" Unroll : %7d [GFX_UNROLL=%d]\n", best[bestSe][4], best[bestSe][4]); + Utils::Print(" Kernel : %7d [GFX_KERNEL=%d]\n" , best[bestSe][5], best[bestSe][5]); + Utils::Print(" NumSubExec : %7d\n", best[bestSe][6]); + Utils::Print("Command to run best result:\n"); + Utils::Print("GFX_WAVE_ORDER=%d GFX_WORD_SIZE=%d GFX_TEMPORAL=%d GFX_BLOCK_SIZE=%d " + "GFX_UNROLL=%d GFX_KERNEL=%d ./TransferBench cmdline %lu \"%d %d %s\"\n", + best[bestSe][0], best[bestSe][1], best[bestSe][2], best[bestSe][3], + best[bestSe][4], best[bestSe][5], numBytesPerTransfer, numTransfers, best[bestSe][6], transferStr.c_str()); + return ERR_NONE; +} diff --git a/src/client/Presets/HbmBandwidth.hpp b/src/client/Presets/HbmBandwidth.hpp new file mode 100644 index 00000000..f8d60aca --- /dev/null +++ b/src/client/Presets/HbmBandwidth.hpp @@ -0,0 +1,619 @@ +/* +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/
+
+#pragma once
+
+#include "EnvVars.hpp"
+#include "Utilities.hpp"
+
+#include <algorithm>
+#include <chrono>
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <limits>
+#include <string>
+#include <vector>
+
+using namespace TransferBench;
+
+// CUDA translation
+#if defined(__NVCC__)
+#define hipEvent_t           cudaEvent_t
+#define hipEventCreate       cudaEventCreate
+#define hipEventDestroy      cudaEventDestroy
+#define hipEventElapsedTime  cudaEventElapsedTime
+#define hipEventRecord       cudaEventRecord
+#define hipSetDevice         cudaSetDevice
+#define hipStream_t          cudaStream_t
+#define hipStreamCreate      cudaStreamCreate
+#define hipStreamDestroy     cudaStreamDestroy
+#define hipStreamSynchronize cudaStreamSynchronize
+#endif
+
+// Load a value (optionally via a non-temporal load)
+template <bool USE_NT, typename T>
+__device__ __forceinline__ T Load(const T& ref)
+{
+#if !defined(__NVCC__)
+  if (USE_NT) return __builtin_nontemporal_load(&ref);
+#endif
+  return ref;
+}
+
+// Main kernel for HBM bandwidth testing
+template <typename T, int UNROLL, int LAUNCH_BOUND, bool USE_NT>
+__global__ __launch_bounds__(LAUNCH_BOUND)
+void HbmReadBwKernel(const void* __restrict  pSrcBuffer,
+                     void*       __restrict  dummy,
+                     const size_t            numSteps,
+                     long long*  __restrict  minStartCycle,
+                     long long*  __restrict  maxStopCycle)
+{
+  int64_t startTime;
+  if (threadIdx.x == 0) {
+    startTime = GetTimestamp();
+  }
+
+  // Cast src/dst buffers to the correct type
+  T const* __restrict srcBuffer = reinterpret_cast<T const*>(pSrcBuffer);
+  T*       __restrict dstBuffer = reinterpret_cast<T*>(dummy);
+  T v{};
+
+  // Determine the total number of elements this threadblock handles
+  size_t elemPerThreadblock = numSteps * blockDim.x * UNROLL;
+
+  // Determine the initial offset for this threadblock
+  size_t srcOffset = blockIdx.x * elemPerThreadblock + threadIdx.x;
+
+  #pragma unroll 1
+  for (size_t step = 0; step < numSteps; step++) {
+    #pragma unroll
+    for (uint32_t i = 0; i < UNROLL; i++) {
+      v |= Load<USE_NT>(srcBuffer[srcOffset]);
+      srcOffset += blockDim.x;
+    }
+  }
+
+  // This statement is never true, but is required to make sure compiler
+  // doesn't optimize away the reads
+  if (elemPerThreadblock == 0)
+    *dstBuffer = v;
+
+  // Update min/max start times
+  __syncthreads();
+  if (threadIdx.x == 0 && minStartCycle != nullptr) {
+    int64_t stopTime = GetTimestamp();
+    atomicMin(minStartCycle, startTime);
+    atomicMax(maxStopCycle, stopTime);
+  }
+}
+
+// Build up function pointer table
+typedef void (*HbmReadBwKernelFuncPtr)(const void*, void*, size_t, long long*, long long*);
+
+#define HBM_KERNEL_TEMPORAL_DECL(LAUNCH_BOUND, UNROLL, DTYPE)  \
+  {HbmReadBwKernel<DTYPE, UNROLL, LAUNCH_BOUND, false>,        \
+   HbmReadBwKernel<DTYPE, UNROLL, LAUNCH_BOUND, true>}
+
+#define HBM_KERNEL_DTYPE_DECL(LAUNCH_BOUND, UNROLL)            \
+  {HBM_KERNEL_TEMPORAL_DECL(LAUNCH_BOUND, UNROLL, uint32_t),   \
+   HBM_KERNEL_TEMPORAL_DECL(LAUNCH_BOUND, UNROLL, uint64_t),   \
+   HBM_KERNEL_TEMPORAL_DECL(LAUNCH_BOUND, UNROLL, __uint128_t)}
+
+#define HBM_KERNEL_UNROLL_DECL(LAUNCH_BOUND)                   \
+  {HBM_KERNEL_DTYPE_DECL(LAUNCH_BOUND, 1),                     \
+   HBM_KERNEL_DTYPE_DECL(LAUNCH_BOUND, 2),                     \
+   HBM_KERNEL_DTYPE_DECL(LAUNCH_BOUND, 4),                     \
+   HBM_KERNEL_DTYPE_DECL(LAUNCH_BOUND, 8),                     \
+   HBM_KERNEL_DTYPE_DECL(LAUNCH_BOUND, 16)}
+
+HbmReadBwKernelFuncPtr HbmReadKernelTable[4][5][3][2] =
+{
+  HBM_KERNEL_UNROLL_DECL(256),
+  HBM_KERNEL_UNROLL_DECL(512),
+  HBM_KERNEL_UNROLL_DECL(768),
+  HBM_KERNEL_UNROLL_DECL(1024)
+};
+
+// Kernel to fill buffer with random data
+__global__ void FillPsuedoRandomData(size_t N, uint32_t* p, uint32_t shift)
+{
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < N; idx += blockDim.x * gridDim.x) {
+    uint32_t d   = static_cast<uint32_t>(idx + shift);
+    uint32_t val = 2166136261u;
+    #pragma unroll
+    for (int i = 0; i < 4; i++) {
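+      // FNV-1a-style hash of the index bytes (offset basis 2166136261, prime 16777619)
+      // gives cheap, well-distributed pseudo-random fill data without any RNG state.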
+      val ^= d & 0xff;
+      val *= 16777619u;
+      d >>= 8;
+    }
+    p[idx] = val;
+  }
+}
+
+struct HbmBwResult
+{
+  int rank;
+  int gpuIdx;
+  int numSubExec;
+  int blockSize;
+  int unroll;
+  int elemByte;
+
+  double bw[3]; // MAX | AVG | MIN
+};
+
+int HbmBandwidthPreset(EnvVars& ev,
+                       size_t const numBytesPerTransfer,
+                       std::string const presetName,
+                       bool const bytesSpecified)
+{
+  // If bytes aren't specified, default to 1GB
+  size_t numBytesAtLeast = (bytesSpecified ? numBytesPerTransfer : 1024 * 1024 * 1024);
+
+  // Determine rank information
+  int numRanks = TransferBench::GetNumRanks();
+  int myRank   = TransferBench::GetRank();
+
+  // Make sure each rank has at least one GPU
+  for (int rank = 0; rank < numRanks; rank++) {
+    if (TransferBench::GetNumExecutors(EXE_GPU_GFX, rank) == 0) {
+      Utils::Print("[ERROR] Each rank must have at least one GPU. Rank %d has no GPUs\n", rank);
+      return ERR_FATAL;
+    }
+  }
+  int defSubExec = TransferBench::GetNumSubExecutors({EXE_GPU_GFX, 0});
+
+  // Collect environment variables
+  std::vector<int> blockSizes = EnvVars::GetEnvVarArray("BLOCKSIZES"    , {256, 512});
+  int criteria                = EnvVars::GetEnvVar     ("CRITERIA"      , 0);
+  std::vector<int> elemBytes  = EnvVars::GetEnvVarArray("ELEM_BYTES"    , {16,8});
+  std::vector<int> gpuIndices = EnvVars::GetEnvVarArray("GPU_INDICES"   , {});
+  int memTypeIdx              = EnvVars::GetEnvVar     ("MEM_TYPE"      , 0);
+  int numBuffers              = EnvVars::GetEnvVar     ("NUM_BUFFERS"   , 2);
+  int numIterations           = EnvVars::GetEnvVar     ("NUM_ITERATIONS", 100);
+  std::vector<int> numSesList = EnvVars::GetEnvVarArray("NUM_SUB_EXECS" , {defSubExec});
+  int outputToCsv             = EnvVars::GetEnvVar     ("OUTPUT_TO_CSV" , 0);
+  int prewarmMsec             = EnvVars::GetEnvVar     ("PREWARM_MSEC"  , 50);
+  int showBorders             = EnvVars::GetEnvVar     ("SHOW_BORDERS"  , 1);
+  int showDetails             = EnvVars::GetEnvVar     ("SHOW_DETAILS"  , 0);
+  int showExtra               = EnvVars::GetEnvVar     ("SHOW_EXTRA"    , 0);
+  int temporalMask            = EnvVars::GetEnvVar     ("TEMPORAL_MASK" , 3);
+  std::vector<int> unrolls    = EnvVars::GetEnvVarArray("UNROLLS"       , {16,8,4});
+  int useWallClock            = EnvVars::GetEnvVar     ("USE_WALLCLOCK" , 1);
+
+  // SHOW_DETAILS is not supported in multi-rank runs
+  if (numRanks > 1) showDetails = 0;
+
+  // Non-temporal reads are not supported for CUDA
+#if defined(__NVCC__)
+  temporalMask = 1;
+#endif
+
+  // Check for consistency across ranks
+  IS_UNIFORM(blockSizes,    "BLOCKSIZES");
+  IS_UNIFORM(criteria,      "CRITERIA");
+  IS_UNIFORM(elemBytes,     "ELEM_BYTES");
+  // GPU_INDICES may be different per rank
+  IS_UNIFORM(memTypeIdx,    "MEM_TYPE");
+  IS_UNIFORM(numBuffers,    "NUM_BUFFERS");
+  IS_UNIFORM(numIterations, "NUM_ITERATIONS");
+  IS_UNIFORM(numSesList,    "NUM_SUB_EXECS");
+  IS_UNIFORM(prewarmMsec,   "PREWARM_MSEC");
+  IS_UNIFORM(showDetails,   "SHOW_DETAILS");
+  IS_UNIFORM(showExtra,     "SHOW_EXTRA");
+  IS_UNIFORM(temporalMask,  "TEMPORAL_MASK");
+  IS_UNIFORM(unrolls,       "UNROLLS");
+  IS_UNIFORM(useWallClock,  "USE_WALLCLOCK");
+
+  // Validate environment variables and set defaults
+  if (blockSizes.empty()) {
+    Utils::Print("[ERROR] BLOCKSIZES may not be empty\n");
+    return ERR_FATAL;
+  }
+  for (auto blockSize : blockSizes) {
+    if (blockSize <= 0 || blockSize % 128 != 0 || blockSize > 1024) {
+      Utils::Print("[ERROR] BLOCKSIZES must only contain positive multiples of 128 up to 1024 (not %d)\n", blockSize);
+      return ERR_FATAL;
+    }
+  }
+
+  if (criteria < 0 || criteria > 2) {
+    Utils::Print("[ERROR] CRITERIA must be either 0 (for MAX), 1 (for AVG), or 2 (for MIN) (not %d)\n", criteria);
+    return ERR_FATAL;
+  }
+
+  if (elemBytes.empty()) {
+    Utils::Print("[ERROR] ELEM_BYTES may not be 
empty\n"); + return ERR_FATAL; + } + for (auto elemByte : elemBytes) { + if (elemByte != 4 && elemByte != 8 && elemByte != 16) { + Utils::Print("[ERROR] ELEM_BYTES may only contain {4,8 or 16}\n"); + return ERR_FATAL; + } + } + + int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX); + if (!gpuIndices.empty()) { + for (auto gpuIdx : gpuIndices) { + if (gpuIdx < 0 || gpuIdx >= numDetectedGpus) { + Utils::Print("[ERROR] GPU_INDICES index out of range (%d) (rank %d)\n", gpuIdx, myRank); + return ERR_FATAL; + } + } + } + + if (numBuffers < 1) { + Utils::Print("[ERROR] NUM_BUFFERS must be a positive number (not %d)\n", numBuffers); + return ERR_FATAL; + } + if (numIterations <= 0) { + Utils::Print("[ERROR] NUM_ITERATIONS must be positive (not %d)\n", numIterations); + return ERR_FATAL; + } + if (numBuffers > numIterations) { + Utils::Print("[WARN] NUM_BUFFERS (%d) exceeds NUM_ITERATIONS (%d), so some buffers will not be used\n", + numBuffers, numIterations); + numBuffers = numIterations; + } + + if (numSesList.empty()) { + // By default, use all available sub executors + numSesList.push_back(defSubExec); + } else { + for (auto x : numSesList) { + if (x <= 0 || x > defSubExec) { + Utils::Print("[ERROR] Number of subexecutors must be positive and less than %d\n", defSubExec); + return ERR_FATAL; + } + } + } + + if (prewarmMsec < 0) { + Utils::Print("[ERROR] PREWARM_MSEC must be non-negative (not %d)\n", prewarmMsec); + return ERR_FATAL; + } + + if (temporalMask < 1 || temporalMask > 3) { + Utils::Print("[ERROR] TEMPORAL_MASK must be between 1 to 3 (not %d)\n", temporalMask); + return ERR_FATAL; + } + + if (unrolls.empty()) { + Utils::Print("[ERROR] UNROLLS may not be empty"); + return ERR_FATAL; + } + for (auto unroll : unrolls) { + if (unroll != 1 && unroll != 2 && unroll != 4 && unroll != 8 && unroll != 16) { + Utils::Print("[ERROR] UNROLLS must only contain {1,2,4,8 or 16} (not %d)\n", unroll); + return ERR_FATAL; + } + } + + MemType memType = Utils::GetGpuMemType(memTypeIdx); + std::string devMemTypeStr = Utils::GetGpuMemTypeStr(memTypeIdx); + + if (!ev.hideEnv) + { + if (!ev.outputToCsv) Utils::Print("[HBM Bandwidth Related]\n"); + if (Utils::RankDoesOutput()) { + ev.Print("BLOCKSIZES" , EnvVars::ToStr(blockSizes).c_str(), "Threadblock sizes to sweep over (multiple of 128 up to 1024)"); + ev.Print("CRITERIA" , criteria , "Reporting highest %s bandwidth (0=MAX,1=AVG,2=MIN)", criteria == 0 ? "MAX" : criteria == 1 ? "AVG" : "MIN"); + ev.Print("ELEM_BYTES" , EnvVars::ToStr(elemBytes).c_str() , "Element sizes in bytes to sweep over (must contain only 4,8 or 16)"); + ev.Print("GPU_INDICES" , EnvVars::ToStr(gpuIndices).c_str(), "GPU indices to test. Leave empty for all"); + ev.Print("MEM_TYPE" , memTypeIdx , "Using %s memory (%s)", devMemTypeStr.c_str(), Utils::GetAllGpuMemTypeStr().c_str()); + ev.Print("NUM_BUFFERS" , numBuffers , "Number of buffers to rotate through (1 per iteration)"); + ev.Print("NUM_ITERATIONS", numIterations , "Number of iterations to time"); + ev.Print("NUM_SUB_EXECS" , EnvVars::ToStr(numSesList).c_str(), "Number of subexecutors to sweep over (default to all available)"); + ev.Print("PREWARM_MSEC" , prewarmMsec , "Prewarm duration in msec"); + ev.Print("SHOW_DETAILS" , showDetails , "Show sweep details (ignored for multi-rank). 
Setting to 2 shows per iteration output");
+      ev.Print("SHOW_EXTRA"    , showExtra                          , "Show best sweep config details");
+      ev.Print("TEMPORAL_MASK" , temporalMask                       , "Temporal mask (1 = temporal, 2 = non-temporal, 3 = both)");
+      ev.Print("UNROLLS"       , EnvVars::ToStr(unrolls).c_str()    , "Unroll factors to sweep over (must contain only 1,2,4,8 or 16)");
+      ev.Print("USE_WALLCLOCK" , useWallClock                       , useWallClock ? "Using GPU wall-clock for timing" : "Using events for timing");
+      Utils::Print("\n");
+    }
+  }
+
+  if (gpuIndices.empty()) {
+    // If empty, use all available GPUs on local rank
+    for (int gpuIdx = 0; gpuIdx < numDetectedGpus; gpuIdx++)
+      gpuIndices.push_back(gpuIdx);
+  }
+
+  // Determine how much memory to allocate based on sweep settings
+  // During each step each threadblock works on BLOCKSIZE * UNROLL * ELEM_BYTES bytes
+  // Each buffer will be allocated as the smallest multiple of this larger than numBytesAtLeast
+  // NOTE: It's not safe to just base this on the maximum values of each sweep parameter
+  //       (e.g. if the maximum size divides numBytesAtLeast perfectly), so looping over the entire space is safer
+  size_t largestTotalBytesPerBuffer = 0;
+  for (int numSubExec : numSesList) {
+    for (int blockSize : blockSizes) {
+      for (int unroll : unrolls) {
+        for (int elemByte : elemBytes) {
+          size_t totalBytesPerStep   = numSubExec * blockSize * unroll * elemByte;
+          size_t numSteps            = std::max((size_t)1, (numBytesAtLeast + totalBytesPerStep - 1) / totalBytesPerStep);
+          size_t totalBytesPerBuffer = numSteps * totalBytesPerStep;
+          if (totalBytesPerBuffer > largestTotalBytesPerBuffer) largestTotalBytesPerBuffer = totalBytesPerBuffer;
+        }
+      }
+    }
+  }
+
+  if (showDetails) {
+    Utils::Print("GPU ## | #SE | BKSZ | UR | EB | TOTALBYTES | #STEP | MAX GB/s | AVG GB/s | MIN GB/s\n");
+  }
+
+  // Test all local GPUs
+  std::vector<HbmBwResult> localResults;
+
+  if (!showDetails) {
+    // Calculate total number of tests that will be executed per GPU
+    size_t numTests = numSesList.size() * blockSizes.size() * unrolls.size() * elemBytes.size() * (temporalMask == 3 ? 
2 : 1);
+
+    Utils::Print("Testing on at least %lu bytes (%lu configs per GPU): ", numBytesAtLeast, numTests);
+    fflush(stdout);
+  }
+
+  for (int gpuIdx : gpuIndices) {
+    HIP_CALL(hipSetDevice(gpuIdx));
+
+    // Create streams/events for this GPU
+    hipStream_t stream;
+    hipEvent_t  startEvent, stopEvent;
+    HIP_CALL(hipStreamCreate(&stream));
+    HIP_CALL(hipEventCreate(&startEvent));
+    HIP_CALL(hipEventCreate(&stopEvent));
+
+    // Allocate pinned host memory closest to this GPU to capture timestamps (if enabled)
+    int wallClockRate;
+    long long* minStartCycle = nullptr;
+    long long* maxStopCycle  = nullptr;
+
+    if (useWallClock) {
+#if defined(__NVCC__)
+      wallClockRate = 1000000;
+#else
+      HIP_CALL(hipDeviceGetAttribute(&wallClockRate, hipDeviceAttributeWallClockRate, gpuIdx));
+#endif
+      if (Utils::AllocateMemory({MEM_CPU_CLOSEST, gpuIdx, myRank}, sizeof(int64_t), (void**)&minStartCycle) ||
+          Utils::AllocateMemory({MEM_CPU_CLOSEST, gpuIdx, myRank}, sizeof(int64_t), (void**)&maxStopCycle)) {
+        Utils::Print("[ERROR] Unable to allocate pinned host memory on rank %d closest to GPU %d\n", myRank, gpuIdx);
+        return ERR_FATAL;
+      }
+    }
+
+    // Allocate and initialize each GPU buffer
+    MemDevice memDevice = {memType, gpuIdx, myRank};
+    std::vector<void*> inputBuffers(numBuffers);
+    for (int bufferIdx = 0; bufferIdx < numBuffers; bufferIdx++) {
+      ErrResult err = AllocateMemory(memDevice, largestTotalBytesPerBuffer, &inputBuffers[bufferIdx]);
+      if (err.errType != ERR_NONE) {
+        Utils::Print("[ERROR] Error when allocating memory (%s)\n", err.errMsg.c_str());
+        return ERR_FATAL;
+      }
+      FillPsuedoRandomData<<<32, 256, 0, stream>>>(largestTotalBytesPerBuffer / sizeof(uint32_t),
+                                                   (uint32_t*)inputBuffers[bufferIdx], bufferIdx);
+    }
+    HIP_CALL(hipStreamSynchronize(stream));
+
+    HbmBwResult bestResult = {};
+
+    // Run sweep to find fastest result
+    for (int numSubExec : numSesList) {
+      dim3 gridDim(numSubExec, 1, 1);
+      for (int blockSize : blockSizes) {
+        if (!showDetails) {
+          Utils::Print(".");
+          fflush(stdout);
+        }
+        dim3 blockDim(blockSize, 1, 1);
+        int launchBoundIdx = (blockSize + 255) / 256 - 1;
+        for (int unroll : unrolls) {
+          int unrollIdx = (int)log2(unroll);
+          for (int elemByte : elemBytes) {
+            int elemByteIdx = (int)log2(elemByte) - 2;
+            size_t totalBytesPerStep = numSubExec * blockSize * unroll * elemByte;
+            size_t numSteps   = std::max((size_t)1, (numBytesAtLeast + totalBytesPerStep - 1) / totalBytesPerStep);
+            size_t totalBytes = numSteps * totalBytesPerStep;
+
+            for (int useNt = 0; useNt <= 1; useNt++) {
+              if (!(temporalMask & (1 << useNt))) continue;
+              HbmReadBwKernelFuncPtr kernel = HbmReadKernelTable[launchBoundIdx][unrollIdx][elemByteIdx][useNt];
+
+              double minBw = std::numeric_limits<double>::max();
+              double maxBw = std::numeric_limits<double>::lowest();
+              double sumBw = 0.0;
+
+              /* Run warmups for user-specified time */
+              int currBufferIdx = 0;
+              auto prewarmEnd = std::chrono::steady_clock::now() + std::chrono::milliseconds(prewarmMsec);
+              do {
+                kernel<<<gridDim, blockDim, 0, stream>>>(inputBuffers[currBufferIdx++], nullptr, numSteps, minStartCycle, maxStopCycle);
+                HIP_CALL(hipStreamSynchronize(stream));
+                if (currBufferIdx == numBuffers) currBufferIdx = 0;
+              } while (std::chrono::steady_clock::now() < prewarmEnd);
+
+              /* Run timed iterations */
+              currBufferIdx = 0;
+              for (int iteration = 0; iteration < numIterations; iteration++) {
+                if (useWallClock) {
+                  *minStartCycle = std::numeric_limits<long long>::max();
+                  *maxStopCycle  = 0;
+                }
+
+#if defined(__NVCC__)
+                if (!useWallClock) {
+                  HIP_CALL(hipEventRecord(startEvent, stream));
+                }
+                kernel<<<gridDim, blockDim, 0, stream>>>(inputBuffers[currBufferIdx++], nullptr, numSteps, minStartCycle, maxStopCycle);
+                if (!useWallClock) {
+                  HIP_CALL(hipEventRecord(stopEvent, stream));
+                }
+#else
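+                // hipExtLaunchKernelGGL lets the HIP runtime record the supplied start/stop
+                // events around the kernel launch itself; when wall-clock timing is selected,
+                // both events are skipped (nullptr) and the kernel's own timestamps are used.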
+                hipExtLaunchKernelGGL(kernel, gridDim, blockDim, 0, stream,
+                                      useWallClock ? nullptr : startEvent,
+                                      useWallClock ? nullptr : stopEvent, 0,
+                                      inputBuffers[currBufferIdx++], nullptr, numSteps, minStartCycle, maxStopCycle);
+#endif
+                HIP_CALL(hipStreamSynchronize(stream));
+                if (currBufferIdx == numBuffers) currBufferIdx = 0;
+
+                float elapsedMsec;
+                if (useWallClock) {
+                  elapsedMsec = (*maxStopCycle - *minStartCycle) / (double)wallClockRate;
+                } else {
+                  HIP_CALL(hipEventElapsedTime(&elapsedMsec, startEvent, stopEvent));
+                }
+
+                double bw = totalBytes / (elapsedMsec / 1000.0) / 1e9;
+
+                if (showDetails > 1) {
+                  Utils::Print("GPU %02d | %3d | %4d | %2d | %2d | %10lu | %5lu | %8.3f\n",
+                               gpuIdx, numSubExec, blockSize, unroll, elemByte, totalBytes, numSteps, bw);
+                  fflush(stdout);
+                }
+
+                minBw = std::min(minBw, bw);
+                maxBw = std::max(maxBw, bw);
+                sumBw += bw;
+              }
+
+              double avgBw = sumBw / numIterations;
+
+              if (showDetails) {
+                Utils::Print("GPU %02d | %3d | %4d | %2d | %2d | %10lu | %5lu | %8.3f | %8.3f | %8.3f\n",
+                             gpuIdx, numSubExec, blockSize, unroll, elemByte, totalBytes, numSteps, maxBw, avgBw, minBw);
+                fflush(stdout);
+              }
+
+              double bw[3] = {maxBw, avgBw, minBw};
+              if (bw[criteria] > bestResult.bw[criteria]) {
+                bestResult.rank       = myRank;
+                bestResult.gpuIdx     = gpuIdx;
+                bestResult.numSubExec = numSubExec;
+                bestResult.blockSize  = blockSize;
+                bestResult.unroll     = unroll;
+                bestResult.elemByte   = elemByte;
+                bestResult.bw[0]      = bw[0];
+                bestResult.bw[1]      = bw[1];
+                bestResult.bw[2]      = bw[2];
+              }
+            }
+          }
+        }
+      }
+    }
+
+    localResults.push_back(bestResult);
+
+    // Deallocate memory buffers
+    for (int bufferIdx = 0; bufferIdx < numBuffers; bufferIdx++) {
+      ErrResult err = DeallocateMemory(memType, inputBuffers[bufferIdx], largestTotalBytesPerBuffer);
+      if (err.errType != ERR_NONE) {
+        Utils::Print("[ERROR] Error when deallocating memory (%s)\n", err.errMsg.c_str());
+        return ERR_FATAL;
+      }
+    }
+
+    if (useWallClock) {
+      if (Utils::DeallocateMemory(MEM_CPU_CLOSEST, minStartCycle, sizeof(int64_t)) ||
+          Utils::DeallocateMemory(MEM_CPU_CLOSEST, maxStopCycle, sizeof(int64_t))) {
+        Utils::Print("[ERROR] Unable to deallocate pinned host memory on rank %d closest to GPU %d\n", myRank, gpuIdx);
+        return ERR_FATAL;
+      }
+    }
+
+    // Cleanup streams and events
+    HIP_CALL(hipStreamDestroy(stream));
+    HIP_CALL(hipEventDestroy(startEvent));
+    HIP_CALL(hipEventDestroy(stopEvent));
+  }
+  if (!showDetails) {
+    Utils::Print("\n"); fflush(stdout);
+  }
+
+  // Determine the total number of results
+  std::vector<int> numGpusOnRank(numRanks);
+  int totalGpus = 0;
+  for (int rank = 0; rank < numRanks; rank++) {
+    numGpusOnRank[rank] = (int)gpuIndices.size();
+    TransferBench::System::Get().Broadcast(rank, sizeof(int), &numGpusOnRank[rank]);
+    totalGpus += numGpusOnRank[rank];
+  }
+
+  int numRows = 1 + totalGpus;
+  int numCols = 5 + (showExtra ?
4 : 0); + int precision = 2; + Utils::TableHelper table(numRows, numCols, precision); + + table.DrawRowBorder(0); + table.DrawRowBorder(1); + table.DrawColBorder(0); + table.DrawColBorder(2); + table.DrawColBorder(5); + table.DrawColBorder(numCols); + + // Header row + table.Set(0, 0, " Rank "); + table.Set(0, 1, " GPU "); + table.Set(0, 2, " MaxBw (GB/s) "); + table.Set(0, 3, " AvgBw (GB/s) "); + table.Set(0, 4, " MinBw (GB/s) "); + if (showExtra) { + table.Set(0, 5, " #SE "); + table.Set(0, 6, " Blocksize "); + table.Set(0, 7, " Unroll "); + table.Set(0, 8, " EBytes "); + } + + // Data rows + int rowIdx = 1; + for (int rank = 0; rank < numRanks; rank++) { + for (int gpu = 0; gpu < numGpusOnRank[rank]; gpu++) { + HbmBwResult result; + if (rank == myRank) result = localResults[gpu]; + TransferBench::System::Get().Broadcast(rank, sizeof(result), &result); + + table.Set(rowIdx, 0, " %d " , result.rank); + table.Set(rowIdx, 1, " %d " , result.gpuIdx); + table.Set(rowIdx, 2, " %8.2f ", result.bw[0]); + table.Set(rowIdx, 3, " %8.2f ", result.bw[1]); + table.Set(rowIdx, 4, " %8.2f ", result.bw[2]); + if (showExtra) { + table.Set(rowIdx, 5, " %d ", result.numSubExec); + table.Set(rowIdx, 6, " %d ", result.blockSize); + table.Set(rowIdx, 7, " %d ", result.unroll); + table.Set(rowIdx, 8, " %d ", result.elemByte); + } + rowIdx++; + } + table.DrawRowBorder(rowIdx); + } + table.PrintTable(outputToCsv, showBorders); + + return ERR_NONE; +} diff --git a/src/client/Presets/HealthCheck.hpp b/src/client/Presets/HealthCheck.hpp index 973df3e7..37608ad0 100644 --- a/src/client/Presets/HealthCheck.hpp +++ b/src/client/Presets/HealthCheck.hpp @@ -439,19 +439,20 @@ int TestHbmPerformance(int modelId, bool verbose) return hasFail; } -int HealthCheckPreset(EnvVars& ev, - size_t const numBytesPerTransfer, - std::string const presetName) +int HealthCheckPreset(EnvVars& ev, + size_t const numBytesPerTransfer, + std::string const presetName, + bool const bytesSpecified) { if (TransferBench::GetNumRanks() > 1) { Utils::Print("[ERROR] Healthcheck preset currently not supported for multi-node\n"); - return 1; + return ERR_FATAL; } // Check for supported platforms #if defined(__NVCC__) printf("[WARN] healthcheck preset not supported on NVIDIA hardware\n"); - return 0; + return ERR_NONE; #endif printf("Disclaimer:\n"); @@ -473,5 +474,5 @@ int HealthCheckPreset(EnvVars& ev, numFails += TestUnidir(modelId, verbose); numFails += TestBidir(modelId, verbose); numFails += TestAllToAll(modelId, verbose); - return numFails ? 1 : 0; + return numFails ? ERR_FATAL : ERR_NONE; } diff --git a/src/client/Presets/Help.hpp b/src/client/Presets/Help.hpp new file mode 100644 index 00000000..26ede846 --- /dev/null +++ b/src/client/Presets/Help.hpp @@ -0,0 +1,123 @@ +/* +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +int HelpPreset([[maybe_unused]] EnvVars& ev, + [[maybe_unused]] size_t const numBytesPerTransfer, + [[maybe_unused]] std::string const presetName, + [[maybe_unused]] bool const bytesSpecified) +{ + if (!Utils::RankDoesOutput()) return 0; + + printf("# ConfigFile Format:\n"); + printf("# ==================\n"); + printf("# A Transfer is defined as a single operation where an Executor reads and adds together\n"); + printf("# values from Source (SRC) memory locations, then writes the sum to destination (DST) memory locations.\n"); + printf("# This simplifies to a simple copy operation when dealing with single SRC/DST.\n"); + printf("#\n"); + printf("# SRC 0 DST 0\n"); + printf("# SRC 1 -> Executor -> DST 1\n"); + printf("# SRC X DST Y\n"); + printf("\n"); + printf("# Five Executors are supported by TransferBench\n"); + printf("# Executor: SubExecutor:\n"); + printf("# 1) CPU CPU thread\n"); + printf("# 2) GPU GPU threadblock/Compute Unit (CU)\n"); + printf("# 3) DMA N/A. (Must have single SRC, at least one DST)\n"); + printf("# 4) NIC Queue Pair\n"); + printf("# 5) Batched-DMA Batch item (Must have single SRC, at least one DST)\n"); + printf("\n"); + printf("# Each single line in the configuration file defines a set of Transfers (a Test) to run in parallel\n"); + printf("\n"); + printf("# There are two ways to specify a Test:\n"); + printf("\n"); + printf("# 1) Basic\n"); + printf("# The basic specification assumes the same number of SubExecutors (SE) used per Transfer\n"); + printf("# A positive number of Transfers is specified followed by that number of triplets describing each Transfer\n"); + printf("\n"); + printf("# #Transfers #SEs (srcMem1->Executor1->dstMem1) ... (srcMemL->ExecutorL->dstMemL)\n"); + printf("\n"); + printf("# 2) Advanced\n"); + printf("# A negative number of Transfers is specified, followed by quintuplets describing each Transfer\n"); + printf("# A non-zero number of bytes specified will override any provided value\n"); + printf("# -#Transfers (srcMem1->Executor1->dstMem1 #SEs1 Bytes1) ... 
(srcMemL->ExecutorL->dstMemL #SEsL BytesL)\n");
+  printf("\n");
+  printf("# Argument Details:\n");
+  printf("#   #Transfers: Number of Transfers to be run in parallel\n");
+  printf("#   #SEs      : Number of SubExecutors to use (CPU threads / GPU threadblocks)\n");
+  printf("#   srcMemL   : Source memory locations (Where the data is to be read from)\n");
+  printf("#   Executor  : Executor is specified by a character indicating type, followed by device index (0-indexed)\n");
+  printf("#               - C: CPU-executed  (Indexed from 0 to # NUMA nodes - 1)\n");
+  printf("#               - G: GPU-executed  (Indexed from 0 to # GPUs - 1)\n");
+  printf("#               - D: DMA-executor  (Indexed from 0 to # GPUs - 1)\n");
+  printf("#               - B: Batched-DMA-executor (Indexed from 0 to # GPUs - 1)\n");
+  printf("#               - I#.#: NIC executor (Indexed from 0 to # NICs - 1)\n");
+  printf("#               - N#.#: Nearest NIC executor (Indexed from 0 to # GPUs - 1)\n");
+  printf("#   dstMemL   : Destination memory locations (Where the data is to be written to)\n");
+  printf("#   bytesL    : Number of bytes to copy (0 means use command-line specified size)\n");
+  printf("#               Must be a multiple of 4 and may be suffixed with ('K','M', or 'G')\n");
+  printf("#\n");
+  printf("# Memory locations are specified by one or more (device character / device index) pairs\n");
+  printf("#   Character indicating memory type followed by device index (0-indexed)\n");
+  printf("#   Supported memory locations are:\n");
+  printf("#   - C: Pinned host memory (on NUMA node, indexed from 0 to [# NUMA nodes-1])\n");
+  printf("#   - P: Pinned host memory (on NUMA node closest to the indexed GPU, from 0 to [# GPUs - 1])\n");
+  printf("#   - B: Coherent pinned host memory (on NUMA node, indexed from 0 to [# NUMA nodes-1])\n");
+  printf("#   - D: Non-coherent pinned host memory (on NUMA node, indexed from 0 to [# NUMA nodes-1])\n");
+  printf("#   - K: Uncached pinned host memory (on NUMA node, indexed from 0 to [# NUMA nodes-1])\n");
+  printf("#   - H: Unpinned host memory (on NUMA node, indexed from 0 to [# NUMA nodes-1])\n");
+  printf("#   - G: Global device memory (on GPU device indexed from 0 to [# GPUs - 1])\n");
+  printf("#   - F: Fine-grain device memory (on GPU device indexed from 0 to [# GPUs - 1])\n");
+  printf("#   - U: Uncached device memory (on GPU device indexed from 0 to [# GPUs - 1])\n");
+  printf("#   - N: Null memory (index ignored)\n");
+  printf("\n");
+  printf("\n");
+  printf("# Examples:\n");
+  printf("# 1 4 (G0->G0->G1)                   Uses 4 CUs on GPU0 to copy from GPU0 to GPU1\n");
+  printf("# 1 4 (C1->G2->G0)                   Uses 4 CUs on GPU2 to copy from CPU1 to GPU0\n");
+  printf("# 2 4 G0->G0->G1 G1->G1->G0          Copies from GPU0 to GPU1, and GPU1 to GPU0, each with 4 SEs\n");
+  printf("# -2 (G0 G0 G1 4 1M) (G1 G1 G0 2 2M) Copies 1MB from GPU0 to GPU1 with 4 SEs, and 2MB from GPU1 to GPU0 with 2 SEs\n");
+  printf("# 1 2 (F0->I0.2->F1)                 Uses 2 QPs to transfer data from GPU0 via NIC0 to GPU1 via NIC2\n");
+  printf("# 1 1 (F0->N0.1->F1)                 Uses 1 QP to transfer data from GPU0 via GPU0's closest NIC to GPU1 via GPU1's closest NIC\n");
+  printf("# -2 (G0->N0.1->G1 2 128M) (G1->N1.0->G0 1 256M) Uses the Nearest NIC executor to copy 128MB from GPU0 to GPU1 with 2 QPs,\n");
+  printf("#                                    and 256MB from GPU1 to GPU0 with 1 QP\n");
+  printf("# Round brackets and arrows '->' may be included for human clarity, but are optional and will be ignored\n");
+  printf("# Lines starting with # will be ignored. Lines starting with ## will be echoed to output\n");
+  printf("\n");
+  printf("## Single GPU-executed Transfer between GPUs 0 and 1 using 4 CUs\n");
+  printf("1 4 (G0->G0->G1)\n");
+  printf("\n");
+  printf("## Single DMA executed Transfer between GPUs 0 and 1\n");
+  printf("1 1 (G0->D0->G1)\n");
+  printf("\n");
+  printf("## Copy 1MB from GPU0 to GPU1 with 4 CUs, and 2MB from GPU1 to GPU0 with 8 CUs\n");
+  printf("-2 (G0->G0->G1 4 1M) (G1->G1->G0 8 2M)\n");
+  printf("\n");
+  printf("## \"Memset\" by GPU 0 to GPU 0 memory\n");
+  printf("1 32 (N0->G0->G0)\n");
+  printf("\n");
+  printf("## \"Read-only\" by CPU 0\n");
+  printf("1 4 (C0->C0->N0)\n");
+  printf("\n");
+  printf("## Broadcast from GPU 0 to GPU 0 and GPU 1\n");
+  printf("1 16 (G0->G0->G0G1)\n");
+  return 0;
+}
diff --git a/src/client/Presets/NicAllToAll.hpp b/src/client/Presets/NicAllToAll.hpp
new file mode 100644
index 00000000..32e882a4
--- /dev/null
+++ b/src/client/Presets/NicAllToAll.hpp
@@ -0,0 +1,374 @@
+/*
+Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <cstdlib>
+#include <cstring>
+#include <numeric>
+
+int NicAllToAllPreset(EnvVars& ev,
+                      size_t const numBytesPerTransfer,
+                      std::string const presetName,
+                      [[maybe_unused]] bool const bytesSpecified)
+{
+  // Check for single homogenous group
+  if (Utils::GetNumRankGroups() > 1) {
+    Utils::Print("[ERROR] NIC all-to-all preset can only be run across ranks that are homogenous\n");
+    Utils::Print("[ERROR] Run ./TransferBench without any args to display topology information\n");
+    Utils::Print("[ERROR] TB_NIC_FILTER may also be used to limit NIC visibility to scale-out NICs\n");
+    return 1;
+  }
+
+  int numRanks       = TransferBench::GetNumRanks();
+  int numNicsPerRank = TransferBench::GetNumExecutors(EXE_NIC);
+  if (numNicsPerRank == 0) {
+    Utils::Print("[ERROR] No NIC detected. This preset requires NIC executors.\n");
+    return 1;
+  }
+
+  int useCpuMem = EnvVars::GetEnvVar("USE_CPU_MEM", 0);
+  // Device count from topology: GFX executors, or CPU executors when USE_CPU_MEM (same pattern as NicRings).
+  int numMemDevices = TransferBench::GetNumExecutors(useCpuMem ? EXE_CPU : EXE_GPU_GFX);
+  if (numMemDevices == 0) {
+    Utils::Print("[ERROR] No %s executors detected for NIC all-to-all.\n", useCpuMem ? "CPU" : "GPU GFX");
+    return 1;
+  }
"CPU" : "GPU GFX"); + return 1; + } + + int numQueuePairs = EnvVars::GetEnvVar("NUM_QUEUE_PAIRS", 1); + int showDetails = EnvVars::GetEnvVar("SHOW_DETAILS", 0); + int useRdmaRead = EnvVars::GetEnvVar("USE_RDMA_READ", 0); + int memTypeIdx = EnvVars::GetEnvVar("MEM_TYPE", 0); + int stride = EnvVars::GetEnvVar("STRIDE", 1); + + // Compute orbit structure before reading GROUP_SIZE so its default can be stride-aware. + // Stride orbits on devices (rank-major devLin = rank * numMemDevices + memIdx): same gcd structure as PodAllToAll's StrideGenerate, + // but NIC A2A does not use the permuted slot order for GROUP_SIZE — subgroups follow natural order within each orbit. + int const M = numRanks * numMemDevices; + int const kNorm = ((stride % M) + M) % M; + int const dCycles = (kNorm == 0) ? 1 : std::gcd(kNorm, M); + int const orbitSize = M / dCycles; + + int groupSize = EnvVars::GetEnvVar("GROUP_SIZE", orbitSize); + int noSameRank = EnvVars::GetEnvVar("NIC_A2A_NO_SAME_RANK", 1); + int numNicPlanes = EnvVars::GetEnvVar("NUM_NIC_PLANES", 1); + + if (numQueuePairs < 1) { + Utils::Print("[ERROR] NUM_QUEUE_PAIRS must be >= 1 (got %d)\n", numQueuePairs); + return 1; + } + if (groupSize < 1) { + Utils::Print("[ERROR] GROUP_SIZE must be >= 1 (got %d)\n", groupSize); + return 1; + } + + bool scopeInter = false; + { + char const* scopeStr = getenv("NIC_A2A_SCOPE"); + if (scopeStr && scopeStr[0]) { + if (!strcmp(scopeStr, "inter") || !strcmp(scopeStr, "INTER")) + scopeInter = true; + else if (strcmp(scopeStr, "intra") && strcmp(scopeStr, "INTRA")) { + Utils::Print("[ERROR] NIC_A2A_SCOPE must be \"intra\" or \"inter\"\n"); + return 1; + } + } + } + + MemType memType = Utils::GetMemType(memTypeIdx, useCpuMem); + std::string memTypeStr = Utils::GetMemTypeStr(memTypeIdx, useCpuMem); + + if (numNicPlanes < 1) { + Utils::Print("[ERROR] NUM_NIC_PLANES must be >= 1\n"); + return 1; + } + + // Same divisibility check as PodAllToAll (total devices = ranks × memory devices per rank). + if (M % groupSize) { + Utils::Print("[ERROR] Group size %d cannot evenly divide %d total devices from %d ranks.\n", + groupSize, M, numRanks); + return 1; + } + + // Within each stride orbit, partition by natural rank-major device index: orbit lists devLin = r, r+d, r+2d, ... + // (r = devLin %% dCycles). Subgroup id = (index along that list) / GROUP_SIZE. + if (orbitSize % groupSize != 0) { + Utils::Print("[ERROR] GROUP_SIZE (%d) must divide stride-cycle size %d (devices M=%d, orbits=%d).\n", + groupSize, orbitSize, M, dCycles); + Utils::Print("[ERROR] With STRIDE=%d there are %d disjoint cycles; use a GROUP_SIZE that divides each cycle's device count,\n", + stride, dCycles); + Utils::Print("[ERROR] or use STRIDE=1 so the cycle size equals total devices (%d).\n", M); + return 1; + } + + std::vector deviceSubgroup(M); + for (int devLin = 0; devLin < M; devLin++) { + int const r = devLin % dCycles; + int const k = (devLin - r) / dCycles; // 0 .. orbitSize-1 along natural order in this orbit + deviceSubgroup[devLin] = k / groupSize; + } + + if (Utils::RankDoesOutput()) { + ev.DisplayEnvVars(); + if (!ev.hideEnv) { + if (!ev.outputToCsv) printf("[NIC A2A Related]\n"); + ev.Print("USE_CPU_MEM" , useCpuMem , "Using closest %s memory", useCpuMem ? 
"CPU" : "GPU"); + ev.Print("MEM_TYPE" , memTypeIdx , "Using %s memory (%s)", memTypeStr.c_str(), Utils::GetAllMemTypeStr(useCpuMem).c_str()); + ev.Print("STRIDE" , stride , "Reordering devices by taking %d steps", stride); + ev.Print("GROUP_SIZE" , groupSize , "Dividing all devices into groups of %d for a2a", groupSize); + ev.Print("NUM_NIC_PLANES" , numNicPlanes , "Number of planes on scale-out"); + if (scopeInter) + ev.Print("NIC_A2A_SCOPE" , "inter" , "Between-group transfers only. Other value: intra"); + else + ev.Print("NIC_A2A_SCOPE" , "intra" , "Within-group transfers only. Other value: inter"); + ev.Print("NIC_A2A_NO_SAME_RANK", noSameRank , "%s transfers where src rank == dst rank", noSameRank ? "Excluding" : "Allowing"); + ev.Print("NUM_QUEUE_PAIRS" , numQueuePairs , "Using %d queue pairs for NIC transfers", numQueuePairs); + ev.Print("SHOW_DETAILS" , showDetails , "%s full Test details", showDetails ? "Showing" : "Hiding"); + ev.Print("USE_RDMA_READ" , useRdmaRead , "Performing RDMA %s", useRdmaRead ? "reads" : "writes"); + printf("\n"); + } + } + + // For each rank/NIC, closest memory device (GPU or CPU NUMA) — several NICs may share the same device (same subgroup). + std::vector> nicToMem(numRanks, std::vector(numNicsPerRank, -1)); + for (int rank = 0; rank < numRanks; rank++) { + for (int nic = 0; nic < numNicsPerRank; nic++) { + int memIdx = useCpuMem ? TransferBench::GetClosestCpuNumaToNic(nic, rank) + : TransferBench::GetClosestGpuToNic(nic, rank); + if (memIdx < 0) { + Utils::Print("[ERROR] Failed to identify closest %s for Rank %d NIC %d\n", + useCpuMem ? "CPU NUMA node" : "GPU", rank, nic); + return 1; + } + if (memIdx >= numMemDevices) { + Utils::Print("[ERROR] Closest %s index %d for Rank %d NIC %d is out of range [0,%d)\n", + useCpuMem ? "CPU" : "GPU", memIdx, rank, nic, numMemDevices); + return 1; + } + nicToMem[rank][nic] = memIdx; + } + } + + auto devLinOf = [&](int rank, int memIdx) -> int { return rank * numMemDevices + memIdx; }; + + // NIC plane: independent of STRIDE over memory devices. Global rank-major order over NIC endpoints, round-robin into P planes. + auto nicPlaneOf = [&](int rank, int nic) -> int { + int const L = rank * numNicsPerRank + nic; + return L % numNicPlanes; + }; + + std::vector transfers; + + auto const acceptPair = [&](int srcRank, int srcNic, int dstRank, int dstNic) -> bool { + if (nicPlaneOf(srcRank, srcNic) != nicPlaneOf(dstRank, dstNic)) + return false; + int srcDevLin = devLinOf(srcRank, nicToMem[srcRank][srcNic]); + int dstDevLin = devLinOf(dstRank, nicToMem[dstRank][dstNic]); + if ((srcDevLin % dCycles) != (dstDevLin % dCycles)) + return false; + if (noSameRank && srcRank == dstRank) + return false; + if (scopeInter) + return deviceSubgroup[srcDevLin] != deviceSubgroup[dstDevLin]; + return deviceSubgroup[srcDevLin] == deviceSubgroup[dstDevLin]; + }; + + for (int srcRank = 0; srcRank < numRanks; srcRank++) { + for (int srcNic = 0; srcNic < numNicsPerRank; srcNic++) { + int srcMem = nicToMem[srcRank][srcNic]; + for (int dstRank = 0; dstRank < numRanks; dstRank++) { + for (int dstNic = 0; dstNic < numNicsPerRank; dstNic++) { + if (!acceptPair(srcRank, srcNic, dstRank, dstNic)) continue; + + int dstMem = nicToMem[dstRank][dstNic]; + + TransferBench::Transfer transfer; + transfer.srcs.push_back({memType, srcMem, srcRank}); + transfer.dsts.push_back({memType, dstMem, dstRank}); + transfer.exeDevice = {EXE_NIC, useRdmaRead ? dstNic : srcNic, useRdmaRead ? dstRank : srcRank}; + transfer.exeSubIndex = useRdmaRead ? 
+  for (int srcRank = 0; srcRank < numRanks; srcRank++) {
+    for (int srcNic = 0; srcNic < numNicsPerRank; srcNic++) {
+      int srcMem = nicToMem[srcRank][srcNic];
+      for (int dstRank = 0; dstRank < numRanks; dstRank++) {
+        for (int dstNic = 0; dstNic < numNicsPerRank; dstNic++) {
+          if (!acceptPair(srcRank, srcNic, dstRank, dstNic)) continue;
+
+          int dstMem = nicToMem[dstRank][dstNic];
+
+          TransferBench::Transfer transfer;
+          transfer.srcs.push_back({memType, srcMem, srcRank});
+          transfer.dsts.push_back({memType, dstMem, dstRank});
+          transfer.exeDevice   = {EXE_NIC, useRdmaRead ? dstNic : srcNic, useRdmaRead ? dstRank : srcRank};
+          transfer.exeSubIndex = useRdmaRead ? srcNic : dstNic;
+          transfer.numSubExecs = numQueuePairs;
+          transfer.numBytes    = numBytesPerTransfer;
+
+          transfers.push_back(transfer);
+        }
+      }
+    }
+  }
+
+  Utils::Print("NIC All-To-All benchmark\n");
+  Utils::Print("========================\n");
+  Utils::Print("%s traffic over NIC executors. %d rank-major devices; STRIDE sets gcd-orbits; GROUP_SIZE chunks each orbit in natural order.\n",
+               useCpuMem ? "CPU" : "GPU", M);
+  Utils::Print("NICs map to devices via closest %s;\n", useCpuMem ? "CPU NUMA node" : "GPU");
+  Utils::Print("NIC planes: %d, traffic only between NICs in the same plane. Stride: %d\n",
+               numNicPlanes, stride);
+  Utils::Print("Using closest %s per NIC endpoint and %s memory.\n",
+               useCpuMem ? "CPU NUMA node" : "GPU", memTypeStr.c_str());
+  Utils::Print("Visible NICs per rank: %d\n", numNicsPerRank);
+  Utils::Print("%d queue pairs per NIC. %lu bytes per Transfer. All numbers are GB/s\n",
+               numQueuePairs, numBytesPerTransfer);
+  Utils::Print("Total transfers: %lu\n\n", transfers.size());
+
+  if (transfers.empty()) {
+    Utils::Print("[WARN] No transfers were generated for this preset.\n");
+    return 0;
+  }
+
+  TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
+  TransferBench::TestResults results;
+  if (!TransferBench::RunTransfers(cfg, transfers, results)) {
+    for (auto const& err : results.errResults)
+      Utils::Print("%s\n", err.errMsg.c_str());
+    return 1;
+  } else if (showDetails) {
+    Utils::PrintResults(ev, 1, transfers, results);
+    Utils::Print("\n");
+  }
+
+  if (!Utils::RankDoesOutput()) return 0;
+
+  int numRows = 6 + numRanks;
+  int numCols = 3 + numNicsPerRank;
+  Utils::TableHelper table(numRows, numCols);
+
+  table.Set(2, 0, " Rank ");
+  table.Set(2, 1, " Name ");
+  table.Set(1, numCols - 1, " TOTAL ");
+  table.Set(2, numCols - 1, " (GB/s) ");
+  table.SetColAlignment(1, Utils::TableHelper::ALIGN_LEFT);
+  for (int rank = 0; rank < numRanks; rank++) {
+    table.Set(3 + rank, 0, " %d ", rank);
+    table.Set(3 + rank, 1, " %s ", TransferBench::GetHostname(rank).c_str());
+  }
+  table.Set(numRows - 3, 1, " MAX (GB/s) ");
+  table.Set(numRows - 2, 1, " AVG (GB/s) ");
+  table.Set(numRows - 1, 1, " MIN (GB/s) ");
+  for (int row = numRows - 3; row < numRows; row++)
+    table.SetCellAlignment(row, 1, Utils::TableHelper::ALIGN_RIGHT);
+  table.DrawRowBorder(3);
+  table.DrawRowBorder(numRows - 3);
+
+  std::vector<std::vector<double>> bwByRankNic(numRanks, std::vector<double>(numNicsPerRank, 0.0));
+  for (size_t i = 0; i < results.tfrResults.size(); i++) {
+    int nicIdx  = results.tfrResults[i].exeDevice.exeIndex;
+    int rankIdx = results.tfrResults[i].exeDevice.exeRank;
+    bwByRankNic[rankIdx][nicIdx] += results.tfrResults[i].avgBandwidthGbPerSec;
+  }
+
+  std::vector<bool> nicHasMixedMemMapping(numNicsPerRank, false);
+  bool hasMixedMemMapping = false;
+  for (int nic = 0; nic < numNicsPerRank; nic++) {
+    int refMem = nicToMem[0][nic];
+    for (int rank = 1; rank < numRanks; rank++) {
+      if (nicToMem[rank][nic] != refMem) {
+        nicHasMixedMemMapping[nic] = true;
+        hasMixedMemMapping = true;
+        break;
+      }
+    }
+  }
+
+  std::vector<double> rankTotal(numRanks, 0.0);
+  int colIdx = 2;
+  table.DrawColBorder(colIdx);
+  for (int nic = 0; nic < numNicsPerRank; nic++) {
+    table.Set(0, colIdx, " NIC %02d ", nic);
+    if (nicHasMixedMemMapping[nic]) {
+      table.Set(1, colIdx, " MIXED ");
+    } else if (useCpuMem) {
+      table.Set(1, colIdx, " CPU %02d ", nicToMem[0][nic]);
+    } else {
+      table.Set(1, colIdx, " GPU %02d ", nicToMem[0][nic]);
+    }
+    table.Set(2, colIdx, " %s ", TransferBench::GetExecutorName({EXE_NIC, nic}).c_str());
+    double nicMin = std::numeric_limits<double>::max();
+    double nicAvg = 0.0;
+    double nicMax = std::numeric_limits<double>::lowest();
+    for (int rank = 0; rank < numRanks; rank++) {
+      double bw = bwByRankNic[rank][nic];
+      table.Set(3 + rank, colIdx, " %.2f ", bw);
+      nicMin = std::min(nicMin, bw);
+      nicAvg += bw;
+      nicMax = std::max(nicMax, bw);
+      rankTotal[rank] += bw;
+    }
+
+    table.Set(numRows - 3, colIdx, " %.2f ", nicMax);
+    table.Set(numRows - 2, colIdx, " %.2f ", nicAvg / numRanks);
+    table.Set(numRows - 1, colIdx, " %.2f ", nicMin);
+    colIdx++;
+  }
+  table.DrawColBorder(colIdx);
+
+  double rankMin = std::numeric_limits<double>::max();
+  double rankAvg = 0.0;
+  double rankMax = std::numeric_limits<double>::lowest();
+  for (int rank = 0; rank < numRanks; rank++) {
+    table.Set(3 + rank, numCols - 1, " %.2f ", rankTotal[rank]);
+    rankMin = std::min(rankMin, rankTotal[rank]);
+    rankAvg += rankTotal[rank];
+    rankMax = std::max(rankMax, rankTotal[rank]);
+  }
+  table.Set(numRows - 3, numCols - 1, " %.2f ", rankMax);
+  table.Set(numRows - 2, numCols - 1, " %.2f ", rankAvg / numRanks);
+  table.Set(numRows - 1, numCols - 1, " %.2f ", rankMin);
+
+  table.PrintTable(ev.outputToCsv, ev.showBorders);
+  Utils::Print("\n");
+  if (hasMixedMemMapping) {
+    Utils::Print("[WARN] NIC-to-%s mapping differs across ranks. 'MIXED' columns are detailed below.\n",
+                 useCpuMem ? "CPU" : "GPU");
+
+    int mapRows = 2 + numRanks;
+    int mapCols = 2 + numNicsPerRank;
+    Utils::TableHelper mapTable(mapRows, mapCols);
+    mapTable.Set(0, 0, " Rank ");
+    mapTable.Set(0, 1, " Name ");
+    mapTable.SetColAlignment(1, Utils::TableHelper::ALIGN_LEFT);
+    for (int nic = 0; nic < numNicsPerRank; nic++) {
+      mapTable.Set(0, 2 + nic, " NIC %02d ", nic);
+      mapTable.SetCellAlignment(0, 2 + nic, Utils::TableHelper::ALIGN_CENTER);
+    }
+    mapTable.DrawRowBorder(1);
+    mapTable.DrawColBorder(2);
+
+    for (int rank = 0; rank < numRanks; rank++) {
+      int rowIdx = 1 + rank;
+      mapTable.Set(rowIdx, 0, " %d ", rank);
+      mapTable.Set(rowIdx, 1, " %s ", TransferBench::GetHostname(rank).c_str());
+      for (int nic = 0; nic < numNicsPerRank; nic++) {
+        mapTable.Set(rowIdx, 2 + nic, " %s %02d ", useCpuMem ? "CPU" : "GPU", nicToMem[rank][nic]);
+      }
+    }
+
+    mapTable.PrintTable(ev.outputToCsv, ev.showBorders);
+    Utils::Print("\n");
+  }
+  Utils::Print("Aggregate bandwidth (CPU Timed): %8.3f GB/s\n", results.avgTotalBandwidthGbPerSec);
+  Utils::PrintErrors(results.errResults);
+
+  if (Utils::HasDuplicateHostname()) {
+    printf("[WARN] It is recommended to run TransferBench with one rank per host to avoid potential aliasing of executors\n");
+  }
+
+  return 0;
+}
diff --git a/src/client/Presets/NicPeerToPeer.hpp b/src/client/Presets/NicPeerToPeer.hpp
index 24f5d71f..ff67d8ac 100644
--- a/src/client/Presets/NicPeerToPeer.hpp
+++ b/src/client/Presets/NicPeerToPeer.hpp
@@ -24,122 +24,29 @@ THE SOFTWARE.
// Helper functions -// Returns a schedule of round robin pairing of N elements, using Circle Method -// if parallel, each round contains N/2 pairs, otherwise serial -void RoundRobinSchedule(std::vector>>& schedule, - int N, int parallel = 0) { - if (N == 1) { - schedule.push_back({{0,0}}); - return; - } - // Generate standard round-robin tournament (maximum parallelism) - std::vector>> fullSchedule; - - // Pad odd number of ranks with a dummy round (N+1) - int paddedN = N + N%2; - // Round-robin tournament scheduling - for (int round = 0; round < paddedN - 1; round++) { - std::vector> roundPairs; - std::vector> roundPairsReversed; - for (int i = 0; i < paddedN / 2; i++) { - int item1 = i; - int item2 = paddedN - 1 - i; - if (round > 0) { - // Rotate all except the first item - if (item1 > 0) item1 = ((item1 - 1 + round) % (paddedN - 1)) + 1; - if (item2 > 0) item2 = ((item2 - 1 + round) % (paddedN - 1)) + 1; - } - // Ignore dummy round, its partner sits out this ronud - if (item1 < N && item2 < N){ - roundPairs.push_back({item1, item2}); - roundPairsReversed.push_back({item2, item1}); - } - } - fullSchedule.push_back(roundPairs); - fullSchedule.push_back(roundPairsReversed); - } - - // A loopback round where all run in parallel - std::vector> selfRound; - for (int i = 0; i < N; i++) { - selfRound.push_back({i, i}); - } - fullSchedule.push_back(selfRound); - - if (parallel) { - schedule = std::move(fullSchedule); - } else { - // Serialize each round if needed - for (auto const& fullRound : fullSchedule) { - for (auto const& match : fullRound) { - std::vector> subRound; - subRound.push_back({match.first, match.second}); - schedule.push_back(subRound); - } - } - } -} - -// Returns a schedule for ordered 2-combination of N elements -// by pairing the list with its rotating self, -// each round contains n pairs, where 1 <= n <= N and N is divisible by n -// and an element cannot appear more than twice in a round, -void CombinationSchedule(std::vector>>& schedule, - int N, int n = 0) { - std::vector>> fullSchedule; - - if (n <= 0) n = N; - if (N <= 0 || n > N || N % n != 0) // Assuming balanced load for each round - { - n = 1; - Utils::Print("[WARN] cannot create round robin schedule, falling back to serial"); - } - - // Generate rounds of combination based on incrementing distance - for (int i = 0; i < N; i++) { - std::vector> round; - for (int j = 0; j < N; j++) { - round.push_back({j, (j+i)%N}); - } - fullSchedule.push_back(round); - } - - // Step 2: Split each full round into sub-rounds with at most n pairs - for (auto const& fullRound : fullSchedule) { - for (size_t start = 0; start < fullRound.size(); start += n) { - std::vector> subRound; - for (size_t i = start; i < start + n && i < fullRound.size(); i++) { - subRound.push_back(fullRound[i]); - } - if (!subRound.empty()) { - schedule.push_back(subRound); - } - } - } -} - int GetClosestDeviceToNic(MemType memType, int nicIdx, int rank) { return TransferBench::IsCpuMemType(memType) ? 
TransferBench::GetClosestCpuNumaToNic(nicIdx, rank) : TransferBench::GetClosestGpuToNic(nicIdx, rank); } -int NicPeerToPeerPreset(EnvVars& ev, - size_t const numBytesPerTransfer, - std::string const presetName) +int NicPeerToPeerPreset(EnvVars& ev, + size_t const numBytesPerTransfer, + std::string const presetName, + bool const bytesSpecified) { if (Utils::GetNumRankGroups() > 1) { Utils::Print("[ERROR] NIC p2p preset can only be run across ranks that are homogenous\n"); Utils::Print("[ERROR] Run ./TransferBench without any args to display topology information\n"); - Utils::Print("[ERROR] NIC_FILTER may also be used to limit NIC visibility\n"); - return 1; + Utils::Print("[ERROR] TB_NIC_FILTER may also be used to limit NIC visibility\n"); + return ERR_FATAL; } int numRanks = TransferBench::GetNumRanks(); int numNicsPerRank = TransferBench::GetNumExecutors(EXE_NIC); if (numNicsPerRank == 0) { Utils::Print("No NIC detected, NICs are required to run this preset\n"); - return 1; + return ERR_FATAL; } // Collect env vars for this preset @@ -204,8 +111,8 @@ int NicPeerToPeerPreset(EnvVars& ev, std::vector>> schedule; std::vector>> nicSchedule; - RoundRobinSchedule(schedule, numRanks, nodeParallel); - CombinationSchedule(nicSchedule, numNicsPerRank, nicParLevel); + Utils::RoundRobinSchedule(schedule, numRanks, nodeParallel); + Utils::CombinationSchedule(nicSchedule, numNicsPerRank, nicParLevel); int totalTransfers = numRanks * numNicsPerRank * numRanks * numNicsPerRank; int counter = 0; @@ -239,7 +146,7 @@ int NicPeerToPeerPreset(EnvVars& ev, if (srcMemIndex == -1 || dstMemIndex == -1) { Utils::Print("[ERROR] No proper GPU device can be found for transfer R%dN%d - R%dN%d\n", srcRank, srcNicIdx, dstRank, dstNicIdx); - return 1; + return ERR_FATAL; } transfer.numBytes = numBytesPerTransfer; transfer.srcs.push_back({srcTypeActual, srcMemIndex, srcRank}); @@ -255,7 +162,7 @@ int NicPeerToPeerPreset(EnvVars& ev, if (!TransferBench::RunTransfers(cfg, transfers, results)) { for (auto const& err : results.errResults) Utils::Print("%s\n", err.errMsg.c_str()); - return 1; + return ERR_FATAL; } counter += transfers.size(); @@ -365,10 +272,10 @@ int NicPeerToPeerPreset(EnvVars& ev, Utils::TableHelper summaryTable(11, 6, precision); Utils::Print("Summary of top 10 fastest/slowest connection\n"); - summaryTable.Set(0, 0, " Fastest Bandwidth (GB/s) "); + summaryTable.Set(0, 0, " Fastest Bandwidth (GB/s) "); summaryTable.Set(0, 1, " Src "); summaryTable.Set(0, 2, " Dst "); - summaryTable.Set(0, 3, " Slowest Bandwidth (GB/s) "); + summaryTable.Set(0, 3, " Slowest Bandwidth (GB/s) "); summaryTable.Set(0, 4, " Src "); summaryTable.Set(0, 5, " Dst "); @@ -410,5 +317,5 @@ int NicPeerToPeerPreset(EnvVars& ev, } summaryTable.PrintTable(ev.outputToCsv, ev.showBorders); - return 0; + return ERR_NONE; } diff --git a/src/client/Presets/NicRings.hpp b/src/client/Presets/NicRings.hpp index 95dbba85..031d5744 100644 --- a/src/client/Presets/NicRings.hpp +++ b/src/client/Presets/NicRings.hpp @@ -20,17 +20,18 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
 */
 
-int NicRingsPreset(EnvVars& ev,
-                   size_t const numBytesPerTransfer,
-                   std::string const presetName)
+int NicRingsPreset(EnvVars& ev,
+                   size_t const numBytesPerTransfer,
+                   std::string const presetName,
+                   bool const bytesSpecified)
 {
   // Check for single homogenous group
   if (Utils::GetNumRankGroups() > 1) {
     Utils::Print("[ERROR] NIC-rings preset can only be run across ranks that are homogenous\n");
     Utils::Print("[ERROR] Run ./TransferBench without any args to display topology information\n");
-    Utils::Print("[ERROR] NIC_FILTER may also be used to limit NIC visibility\n");
-    return 1;
+    Utils::Print("[ERROR] TB_NIC_FILTER may also be used to limit NIC visibility\n");
+    return ERR_FATAL;
   }
 
   // Collect topology
@@ -104,7 +105,7 @@ int NicRingsPreset(EnvVars& ev,
   if (!TransferBench::RunTransfers(cfg, transfers, results)) {
     for (auto const& err : results.errResults)
       Utils::Print("%s\n", err.errMsg.c_str());
-    return 1;
+    return ERR_FATAL;
   } else if (showDetails) {
     Utils::PrintResults(ev, 1, transfers, results);
     Utils::Print("\n");
@@ -160,7 +161,7 @@ int NicRingsPreset(EnvVars& ev,
 
     double ringMin = std::numeric_limits<double>::max();
     double ringAvg = 0.0;
-    double ringMax = std::numeric_limits<double>::min();
+    double ringMax = std::numeric_limits<double>::lowest();
 
     for (int rank = 0; rank < numRanks; rank++) {
       double avgBw = results.tfrResults[transferIdx].avgBandwidthGbPerSec;
@@ -184,7 +185,7 @@ int NicRingsPreset(EnvVars& ev,
 
   double rankMin = std::numeric_limits<double>::max();
   double rankAvg = 0.0;
-  double rankMax = std::numeric_limits<double>::min();
+  double rankMax = std::numeric_limits<double>::lowest();
   for (int rank = 0; rank < numRanks; rank++) {
     table.Set(3 + rank, numCols - 1, " %.2f ", rankTotal[rank]);
     rankMin = std::min(rankMin, rankTotal[rank]);
@@ -204,5 +205,5 @@ int NicRingsPreset(EnvVars& ev,
   if (Utils::HasDuplicateHostname()) {
     printf("[WARN] It is recommended to run TransferBench with one rank per host to avoid potential aliasing of executors\n");
   }
-  return 0;
+  return ERR_NONE;
 }
diff --git a/src/client/Presets/OneToAll.hpp b/src/client/Presets/OneToAll.hpp
index f43f4c0d..b2084cdd 100644
--- a/src/client/Presets/OneToAll.hpp
+++ b/src/client/Presets/OneToAll.hpp
@@ -20,19 +20,20 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
*/ -int OneToAllPreset(EnvVars& ev, - size_t const numBytesPerTransfer, - std::string const presetName) +int OneToAllPreset(EnvVars& ev, + size_t const numBytesPerTransfer, + std::string const presetName, + bool const bytesSpecified) { if (TransferBench::GetNumRanks() > 1) { Utils::Print("[ERROR] One-to-All preset currently not supported for multi-node\n"); - return 1; + return ERR_FATAL; } int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX); if (numDetectedGpus < 2) { printf("[ERROR] One-to-all benchmark requires machine with at least 2 GPUs\n"); - return 1; + return ERR_FATAL; } // Collect env vars for this preset @@ -66,7 +67,7 @@ int OneToAllPreset(EnvVars& ev, for (auto ch : sweepExe) { if (ch != 'G' && ch != 'D') { printf("[ERROR] Unrecognized executor type '%c' specified\n", ch); - return 1; + return ERR_FATAL; } } @@ -129,7 +130,7 @@ int OneToAllPreset(EnvVars& ev, } if (!TransferBench::RunTransfers(cfg, transfers, results)) { Utils::PrintErrors(results.errResults); - return 1; + return ERR_FATAL; } int counter = 0; @@ -151,5 +152,5 @@ int OneToAllPreset(EnvVars& ev, } } } - return 0; + return ERR_NONE; } diff --git a/src/client/Presets/PeerToPeer.hpp b/src/client/Presets/PeerToPeer.hpp index fd32f9b1..5fbe1554 100644 --- a/src/client/Presets/PeerToPeer.hpp +++ b/src/client/Presets/PeerToPeer.hpp @@ -20,13 +20,14 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -int PeerToPeerPreset(EnvVars& ev, - size_t const numBytesPerTransfer, - std::string const presetName) +int PeerToPeerPreset(EnvVars& ev, + size_t const numBytesPerTransfer, + std::string const presetName, + bool const bytesSpecified) { if (TransferBench::GetNumRanks() > 1) { Utils::Print("[ERROR] Peer-to-peer preset currently not supported for multi-node\n"); - return 1; + return ERR_FATAL; } int numDetectedCpus = TransferBench::GetNumExecutors(EXE_CPU); @@ -42,7 +43,6 @@ int PeerToPeerPreset(EnvVars& ev, int numGpuDevices = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus); int numGpuSubExecs = EnvVars::GetEnvVar("NUM_GPU_SE", useDmaCopy ? 1 : TransferBench::GetNumSubExecutors({EXE_GPU_GFX, 0})); int p2pMode = EnvVars::GetEnvVar("P2P_MODE", 0); - int useFineGrain = EnvVars::GetEnvVar("USE_FINE_GRAIN", -999); // Deprecated int useRemoteRead = EnvVars::GetEnvVar("USE_REMOTE_READ", 0); MemType cpuMemType = Utils::GetCpuMemType(cpuMemTypeIdx); @@ -72,12 +72,6 @@ int PeerToPeerPreset(EnvVars& ev, } } - // Check for deprecated env vars - if (useFineGrain != -999) { - Utils::Print("[ERROR] USE_FINE_GRAIN has been deprecated and replaced by CPU_MEM_TYPE and GPU_MEM_TYPE\n"); - return 1; - } - char const separator = ev.outputToCsv ? ',' : ' '; printf("Bytes Per Direction%c%lu\n", separator, numBytesPerTransfer); @@ -188,7 +182,7 @@ int PeerToPeerPreset(EnvVars& ev, if (!TransferBench::RunTransfers(cfg, transfers, results)) { for (auto const& err : results.errResults) printf("%s\n", err.errMsg.c_str()); - return 1; + return ERR_FATAL; } for (int dir = 0; dir <= isBidirectional; dir++) { @@ -326,5 +320,5 @@ int PeerToPeerPreset(EnvVars& ev, printf("\n\n"); } } - return 0; + return ERR_NONE; } diff --git a/src/client/Presets/PodAllToAll.hpp b/src/client/Presets/PodAllToAll.hpp new file mode 100644 index 00000000..f81586b7 --- /dev/null +++ b/src/client/Presets/PodAllToAll.hpp @@ -0,0 +1,270 @@ +/* +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +int PodAllToAllPreset(EnvVars& ev, + size_t const numBytesPerTransfer, + std::string const presetName, + bool const bytesSpecified) +{ + enum + { + A2A_COPY = 0, + A2A_READ_ONLY = 1, + A2A_WRITE_ONLY = 2, + A2A_CUSTOM = 3, + }; + char a2aModeStr[4][20] = {"Copy", "Read-Only", "Write-Only", "Custom"}; + + // Force single-stream mode for all-to-all benchmark + ev.useSingleStream = 1; + + // Force to gfx unroll 2 unless explicitly set + ev.gfxUnroll = EnvVars::GetEnvVar("GFX_UNROLL", 2); + + int numRanks = TransferBench::GetNumRanks(); + int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX); + + // Collect env vars for this preset + int a2aLocal = EnvVars::GetEnvVar("A2A_LOCAL" , 0); + int memTypeIdx = EnvVars::GetEnvVar("MEM_TYPE" , 0); + int numGpus = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus); + int numQueuePairs = EnvVars::GetEnvVar("NUM_QUEUE_PAIRS", 0); + int numSubExecs = EnvVars::GetEnvVar("NUM_SUB_EXEC" , 8); + int showDetails = EnvVars::GetEnvVar("SHOW_DETAILS" , 0); + int useDmaExec = EnvVars::GetEnvVar("USE_DMA_EXEC" , 0); + int useRemoteRead = EnvVars::GetEnvVar("USE_REMOTE_READ", 0); + int stride = EnvVars::GetEnvVar("STRIDE" , 1); + int groupSize = EnvVars::GetEnvVar("GROUP_SIZE" , numRanks * numDetectedGpus); + + // Check that all ranks have at least the number of GPUs requested + // Warn if NIC configuration is slightly different from one another + int numNics = TransferBench::GetNumExecutors(EXE_NIC, 0); + bool nicDifference = false; + for (int rank = 0; rank < numRanks; rank++) { + if (numGpus > TransferBench::GetNumExecutors(EXE_GPU_GFX, rank)) { + Utils::Print("[ERROR] All-to-All preset requires each rank to have the same number of GPUs\n"); + return ERR_FATAL; + } + if (numQueuePairs > 0 && numNics != TransferBench::GetNumExecutors(EXE_NIC, rank)) + nicDifference = true; + } + if (nicDifference) + Utils::Print("[WARN] Not all ranks have the same number of NICs\n"); + + // A2A_MODE may be 0,1,2 or else custom numSrcs:numDsts + int numSrcs, numDsts; + int a2aMode = 0; + if (getenv("A2A_MODE") && sscanf(getenv("A2A_MODE"), "%d:%d", &numSrcs, &numDsts) == 2) { + a2aMode = A2A_CUSTOM; + } else { + a2aMode = EnvVars::GetEnvVar("A2A_MODE", 0); + if (a2aMode < 0 || a2aMode > 2) { + Utils::Print("[ERROR] a2aMode must be between 0 and 2, or else numSrcs:numDsts\n"); + return ERR_FATAL; + } + numSrcs = (a2aMode == A2A_WRITE_ONLY ? 0 : 1); + numDsts = (a2aMode == A2A_READ_ONLY ? 
0 : 1); + } + + MemType memType = Utils::GetGpuMemType(memTypeIdx); + std::string devMemTypeStr = Utils::GetGpuMemTypeStr(memTypeIdx); + + // Print off environment variables + if (Utils::RankDoesOutput()) { + ev.DisplayEnvVars(); + if (!ev.hideEnv) { + if (!ev.outputToCsv) printf("[AllToAll Related]\n"); + ev.Print("A2A_LOCAL" , a2aLocal , "%s local transfers", a2aLocal ? "Include" : "Exclude"); + ev.Print("A2A_MODE" , (a2aMode == A2A_CUSTOM) ? std::to_string(numSrcs) + ":" + std::to_string(numDsts) : std::to_string(a2aMode), + (a2aMode == A2A_CUSTOM) ? (std::to_string(numSrcs) + " read(s) " + + std::to_string(numDsts) + " write(s)").c_str(): a2aModeStr[a2aMode]); + ev.Print("MEM_TYPE" , memTypeIdx , "Using %s GPU memory (%s)", devMemTypeStr.c_str(), Utils::GetAllGpuMemTypeStr().c_str()); + ev.Print("NUM_GPU_DEVICES", numGpus , "Using %d GPUs", numGpus); + ev.Print("NUM_QUEUE_PAIRS", numQueuePairs, "Using %d queue pairs for NIC transfers", numQueuePairs); + ev.Print("NUM_SUB_EXEC" , numSubExecs , "Using %d subexecutors/CUs per Transfer", numSubExecs); + ev.Print("USE_DMA_EXEC" , useDmaExec , "Using %s executor", useDmaExec ? "DMA" : "GFX"); + ev.Print("USE_REMOTE_READ", useRemoteRead, "Using %s as executor", useRemoteRead ? "DST" : "SRC"); + ev.Print("STRIDE" , stride , "Reordering devices by taking %d steps", stride); + ev.Print("GROUP_SIZE" , groupSize , "Dividing all devices into groups of %d for a2a", groupSize); + printf("\n"); + } + } + // Validate env vars + if (numGpus < 0 || numGpus > numDetectedGpus) { + Utils::Print("[ERROR] Cannot use %d GPUs. Detected %d GPUs\n", numGpus, numDetectedGpus); + return ERR_FATAL; + } + if (useDmaExec && (numSrcs != 1 || numDsts != 1)) { + Utils::Print("[ERROR] DMA execution can only be used for copies (A2A_MODE=0)\n"); + return ERR_FATAL; + } + + if (numRanks * numDetectedGpus % groupSize) { + Utils::Print("[ERROR] Group size %d cannot evenly divide %d total devices from %d ranks.\n", groupSize, numRanks * numDetectedGpus, numRanks); + return ERR_FATAL; + } + + Utils::Print("GPU-%s IntraPod All-To-All benchmark:\n", useDmaExec ? "DMA" : "GFX"); + Utils::Print("==============================\n"); + Utils::Print("[%lu bytes per Transfer] [%s:%d] [%d Read(s) %d Write(s)] [MemType:%s] [NIC QueuePairs:%d] [#Ranks:%d]\n", + numBytesPerTransfer, useDmaExec ? "DMA" : "GFX", numSubExecs, numSrcs, numDsts, + devMemTypeStr.c_str(), numQueuePairs, numRanks); + + TransferBench::ConfigOptions cfg = ev.ToConfigOptions(); + ExeType exeType = useDmaExec ? EXE_GPU_DMA : EXE_GPU_GFX; + + Utils::RankPerPodMap& rankToPod = Utils::GetRankPerPodMap(); + if (rankToPod.empty()) { + Utils::Print("[ERROR] No pods detected. 
Set TB_FORCE_SINGLE_POD=1 to treat all ranks as a single pod.\n");
+    return ERR_FATAL;
+  }
+  for (auto const& [pod, ranks] : rankToPod) {
+    int n = ranks.size() * numGpus;
+    int numGroups = n / groupSize;
+    std::vector<MemDevice> devices(n);
+    std::vector<int> indices(n);
+    for (int k = 0; k < n; k++) indices[k] = k;
+    Utils::StrideGenerate(indices, stride);
+    int idx = 0;
+    for (int rank : ranks) {
+      for (int devIdx = 0; devIdx < numGpus; devIdx++) {
+        devices[indices[idx++]] = {memType, devIdx, rank};
+      }
+    }
+
+    for (int group = 0; group < numGroups; group++) {
+      std::vector<std::vector<int>> groupReIndex(groupSize, std::vector<int>(groupSize, -1));
+      std::vector<TransferBench::Transfer> transfers;
+      for (int i = group * groupSize; i < (group + 1) * groupSize; i++) {
+        for (int j = group * groupSize; j < (group + 1) * groupSize; j++) {
+          if (i == j) {
+            if (!a2aLocal) continue;
+          }
+          TransferBench::Transfer transfer;
+          transfer.numBytes = numBytesPerTransfer;
+          for (int x = 0; x < numSrcs; x++) transfer.srcs.push_back(devices[i]);
+          if (numDsts) transfer.dsts.push_back(devices[j]);
+          for (int x = 1; x < numDsts; x++) transfer.dsts.push_back(devices[i]);
+          transfer.exeDevice = {exeType,
+                                (int32_t)(useRemoteRead ? devices[j].memIndex : devices[i].memIndex),
+                                (int32_t)(useRemoteRead ? devices[j].memRank  : devices[i].memRank)};
+          transfer.exeSubIndex = -1;
+          transfer.numSubExecs = numSubExecs;
+          int const localI = i - group * groupSize;
+          int const localJ = j - group * groupSize;
+          groupReIndex[localI][localJ] = (int)transfers.size();
+          transfers.push_back(transfer);
+        }
+
+        if (numQueuePairs > 0) {
+          TransferBench::Transfer transfer;
+          transfer.numBytes = numBytesPerTransfer;
+          transfer.srcs.push_back(devices[i]);
+          int next = group * groupSize + (i - group * groupSize + 1) % groupSize;
+          transfer.dsts.push_back(devices[next]);
+          transfer.exeDevice = {TransferBench::EXE_NIC_NEAREST,
+                                (int32_t)devices[i].memIndex, (int32_t)devices[i].memRank};
+          transfer.exeSubIndex = devices[next].memIndex;
+          transfer.numSubExecs = numQueuePairs;
+          transfers.push_back(transfer);
+        }
+      }
+      TransferBench::TestResults results;
+      if (!TransferBench::RunTransfers(cfg, transfers, results)) {
+        for (auto const& err : results.errResults)
+          Utils::Print("%s\n", err.errMsg.c_str());
+        return ERR_FATAL;
+      }
+      if (showDetails) {
+        Utils::PrintResults(ev, 1, transfers, results);
+        Utils::Print("\n");
+      }
+
+      // Per-group bandwidth table
+      std::vector<std::vector<double>> groupBw(groupSize, std::vector<double>(groupSize, -1.0));
+      for (int localI = 0; localI < groupSize; localI++) {
+        for (int localJ = 0; localJ < groupSize; localJ++) {
+          int const k = groupReIndex[localI][localJ];
+          if (k >= 0)
+            groupBw[localI][localJ] = results.tfrResults[k].avgBandwidthGbPerSec;
+        }
+      }
+      if (Utils::RankDoesOutput()) {
+        Utils::Print("\n--- Pod AllToAll Group %d ---\n", group);
+        int const groupBase = group * groupSize;
+        int const numRows = 2 + groupSize;
+        int const numCols = 2 + groupSize;
+        int const precision = 2;
+        Utils::TableHelper table(numRows, numCols, precision);
+        table.DrawRowBorder(0);
+        table.DrawColBorder(0);
+        table.DrawColBorder(numCols);
+        table.DrawRowBorder(numRows);
+        table.Set(0, 0, useRemoteRead ?
" SRC\\DST+EXE " : " SRC+EXE\\DST "); + table.DrawRowBorder(1); + table.DrawColBorder(1); + table.Set(1, 1, " Mem Device "); + + // Column headers + int colPrevRank = -1; + for (int j = 0; j < groupSize; j++) { + int colIdx = 2 + j; + int r = devices[groupBase + j].memRank; + if (r != colPrevRank) { + table.DrawColBorder(colIdx); + table.Set(0, colIdx, " Rank %02d ", r); + colPrevRank = r; + } + table.Set(1, colIdx, " GPU %02d ", devices[groupBase + j].memIndex); + } + + // Row headers and data + int rowPrevRank = -1; + for (int localI = 0; localI < groupSize; localI++) { + int rowIdx = 2 + localI; + int r = devices[groupBase + localI].memRank; + if (r != rowPrevRank) { + table.DrawRowBorder(rowIdx); + table.Set(rowIdx, 0, " Rank %02d ", r); + rowPrevRank = r; + } + table.Set(rowIdx, 1, " GPU %02d ", devices[groupBase + localI].memIndex); + for (int localJ = 0; localJ < groupSize; localJ++) { + if (groupBw[localI][localJ] >= 0) + table.Set(rowIdx, 2 + localJ, " %.2f ", groupBw[localI][localJ]); + else + table.Set(rowIdx, 2 + localJ, " N/A "); + } + } + table.PrintTable(ev.outputToCsv, ev.showBorders); + } + } + } + + if (!Utils::RankDoesOutput()) return 0; + + if (Utils::HasDuplicateHostname()) { + printf("[WARN] It is recommended to run TransferBench with one rank per host to avoid potential aliasing of executors\n"); + } + + return ERR_NONE; +} diff --git a/src/client/Presets/PodPeerToPeer.hpp b/src/client/Presets/PodPeerToPeer.hpp new file mode 100644 index 00000000..fe1cc775 --- /dev/null +++ b/src/client/Presets/PodPeerToPeer.hpp @@ -0,0 +1,300 @@ +/* +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +int PodPeerToPeerPreset(EnvVars& ev, + size_t const numBytesPerTransfer, + std::string const presetName, + bool const bytesSpecified) +{ + if (Utils::GetNumRankGroups() > 1) { + Utils::Print("[ERROR] Pod p2p preset can only be run across ranks that are homogenous\n"); + Utils::Print("[ERROR] All ranks currently have to be under the same physical and virtual pod\n"); + Utils::Print("[ERROR] Run ./TransferBench without any args to display topology information\n"); + return ERR_FATAL; + } + + Utils::RankPerPodMap& rankToPod = Utils::GetRankPerPodMap(); + if (rankToPod.empty()) { + Utils::Print("[ERROR] No pods detected. 
Set TB_FORCE_SINGLE_POD=1 to treat all ranks as a single pod.\n");
+    return ERR_FATAL;
+  }
+
+  int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
+
+  // Collect env vars for this preset
+  int useDmaCopy     = EnvVars::GetEnvVar("USE_GPU_DMA", 0);
+  int gpuMemTypeIdx  = EnvVars::GetEnvVar("GPU_MEM_TYPE", 0);
+  int numGpuDevices  = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
+  int numGpuSubExecs = EnvVars::GetEnvVar("NUM_GPU_SE", useDmaCopy ? 1 : TransferBench::GetNumSubExecutors({EXE_GPU_GFX, 0}));
+  int p2pMode        = EnvVars::GetEnvVar("P2P_MODE", 0);
+  int parallelLevel  = EnvVars::GetEnvVar("PARALLEL_LVL", 0);
+  int useRemoteRead  = EnvVars::GetEnvVar("USE_REMOTE_READ", 0);
+  int showFullMatrix = EnvVars::GetEnvVar("OUTPUT_FORMAT", 1);
+
+  MemType gpuMemType = Utils::GetGpuMemType(gpuMemTypeIdx);
+
+  // Display environment variables
+  if (Utils::RankDoesOutput()) {
+    ev.DisplayEnvVars();
+    if (!ev.hideEnv) {
+      int outputToCsv = ev.outputToCsv;
+      if (!outputToCsv) printf("[P2P Related]\n");
+      ev.Print("GPU_MEM_TYPE"   , gpuMemTypeIdx , "Using %s (%s)", Utils::GetGpuMemTypeStr(gpuMemTypeIdx).c_str(), Utils::GetAllGpuMemTypeStr().c_str());
+      ev.Print("NUM_GPU_DEVICES", numGpuDevices , "Using %d GPUs per rank", numGpuDevices);
+      ev.Print("NUM_GPU_SE"     , numGpuSubExecs, "Using %d GPU subexecutors/CUs per Transfer", numGpuSubExecs);
+      ev.Print("P2P_MODE"       , p2pMode       , "Running %s transfers", p2pMode == 0 ? "Uni + Bi" :
+                                                                          p2pMode == 1 ? "Unidirectional"
+                                                                                       : "Bidirectional");
+      ev.Print("PARALLEL_LVL"   , parallelLevel , "Executing p2p in parallel level %d (0: no parallel, 1: node pairs in parallel)", parallelLevel);
+      ev.Print("USE_GPU_DMA"    , useDmaCopy    , "Using GPU-%s as GPU executor", useDmaCopy ? "DMA" : "GFX");
+      ev.Print("USE_REMOTE_READ", useRemoteRead , "Using %s as executor", useRemoteRead ? "DST" : "SRC");
+      printf("\n");
+    }
+  }
+
+  char const separator = ev.outputToCsv ? ',' : ' ';
+  Utils::Print("Bytes Per Direction%c%lu\n", separator, numBytesPerTransfer);
+
+  TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
+  TransferBench::TestResults results;
+
+  for (auto const& [pod, ranks] : rankToPod) {
+    // Add all devices in a pod
+    int n = ranks.size() * numGpuDevices;
+
+    std::vector<MemDevice> devices(n);
+    int idx = 0;
+    for (int rank : ranks) {
+      for (int devIdx = 0; devIdx < numGpuDevices; devIdx++) {
+        devices[idx++] = {gpuMemType, devIdx, rank};
+      }
+    }
+
+    // Build reverse map: (memRank, memIndex) -> device index
+    std::map<std::pair<int, int>, int> deviceLookup;
+    for (int i = 0; i < n; i++)
+      deviceLookup[{devices[i].memRank, devices[i].memIndex}] = i;
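+    // Reverse-lookup sketch (hypothetical layout): with 2 ranks x 8 GPUs, devices[] holds 16
+    // entries in rank-major order, so deviceLookup[{1, 3}] (rank 1, GPU 3) returns index 11.
+    // This is used after each round to map a transfer's endpoints back into the bandwidth matrix.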
"DMA" : "GFX"); + + std::vector avgBandwidth(n * n, 0.0); + + // Build rounds of transfers; all transfers in a round run in parallel + std::vector>> rounds; + + if (parallelLevel == 0) { + for (int i = 0; i < n; i++) { + for (int j = 0; j < n; j++) { + if (isBidirectional && i == j) continue; + std::vector> pairs; + pairs.push_back({devices[i], devices[j]}); + if (isBidirectional) + pairs.push_back({devices[j], devices[i]}); + rounds.push_back(std::move(pairs)); + } + } + } else { + // parallelLevel == 1: node pairs run concurrently, one device pair at a time per node pair + std::vector>> nodePairSchedule; + Utils::RoundRobinSchedule(nodePairSchedule, (int)ranks.size(), 1); + + for (auto const& roundNodePairs : nodePairSchedule) { + for (int srcDev = 0; srcDev < numGpuDevices; srcDev++) { + for (int dstDev = 0; dstDev < numGpuDevices; dstDev++) { + std::vector> pairs; + for (auto const& [rankIdxA, rankIdxB] : roundNodePairs) { + int const rA = ranks[rankIdxA]; + int const rB = ranks[rankIdxB]; + if (isBidirectional && rA == rB && srcDev == dstDev) continue; + pairs.push_back({{gpuMemType, srcDev, rA}, {gpuMemType, dstDev, rB}}); + if (isBidirectional) + pairs.push_back({{gpuMemType, dstDev, rB}, {gpuMemType, srcDev, rA}}); + } + if (!pairs.empty()) + rounds.push_back(std::move(pairs)); + } + } + } + } + + // Execute rounds and collect results + for (auto const& round : rounds) { + std::vector transfers; + for (auto const& [src, dst] : round) { + TransferBench::Transfer transfer; + transfer.numBytes = numBytesPerTransfer; + transfer.srcs.push_back(src); + transfer.dsts.push_back(dst); + transfer.exeDevice = { gpuExeType, + useRemoteRead ? (int32_t)dst.memIndex : (int32_t)src.memIndex, + useRemoteRead ? (int32_t)dst.memRank : (int32_t)src.memRank }; + transfer.exeSubIndex = -1; + transfer.numSubExecs = numGpuSubExecs; + transfers.push_back(transfer); + } + if (!TransferBench::RunTransfers(cfg, transfers, results)) { + for (auto const& err : results.errResults) + Utils::Print("%s\n", err.errMsg.c_str()); + return ERR_FATAL; + } + + for (size_t k = 0; k < round.size(); k++) { + auto const& [src, dst] = round[k]; + int i = deviceLookup[{src.memRank, src.memIndex}]; + int j = deviceLookup[{dst.memRank, dst.memIndex}]; + avgBandwidth[i * n + j] = results.tfrResults[k].avgBandwidthGbPerSec; + } + } + + // Output results + int const rowsPerSrc = isBidirectional ? 3 : 1; + int const rowStride = isBidirectional ? rowsPerSrc + 1 : rowsPerSrc; + int const numRows = showFullMatrix ? 2 + n * rowStride - (isBidirectional ? 1 : 0) + : 1 + n * n * rowsPerSrc; + int const numCols = showFullMatrix ? 2 + n : (isBidirectional ? 6 : 5); + int const precision = 2; + Utils::TableHelper table(numRows, numCols, precision); + + table.DrawRowBorder(0); + table.DrawColBorder(0); + table.DrawColBorder(numCols); + table.DrawRowBorder(numRows); + + if (showFullMatrix) { + if (isBidirectional) + table.Set(0, 0, " SRC\\DST "); + else + table.Set(0, 0, useRemoteRead ? 
" SRC\\DST+EXE " : " SRC+EXE\\DST "); + table.DrawRowBorder(1); + table.DrawColBorder(1); + table.Set(1, 1, " Mem Device "); + + int colPrevRank = -1; + for (int j = 0; j < n; j++) { + int colIdx = 2 + j; + int r = devices[j].memRank; + if (r != colPrevRank) { + table.DrawColBorder(colIdx); + table.Set(0, colIdx, " Rank %02d ", r); + colPrevRank = r; + } + table.Set(1, colIdx, " GPU %02d ", devices[j].memIndex); + } + + int rowPrevRank = -1; + for (int i = 0; i < n; i++) { + int r = devices[i].memRank; + int baseRow = 2 + i * rowStride; + if (r != rowPrevRank) { + table.DrawRowBorder(baseRow); + table.Set(baseRow, 0, " Rank %02d ", r); + rowPrevRank = r; + } + + for (int dir = 0; dir < rowsPerSrc; dir++) { + int rowIdx = baseRow + dir; + if (isBidirectional) { + char const* arrow = (dir == 0) ? " ->" : (dir == 1) ? "<- " : "<->"; + table.Set(rowIdx, 1, " GPU %02d %s ", devices[i].memIndex, arrow); + } else { + table.Set(rowIdx, 1, " GPU %02d ", devices[i].memIndex); + } + + for (int j = 0; j < n; j++) { + double fwd = avgBandwidth[i * n + j]; + double rev = avgBandwidth[j * n + i]; + double val = (dir == 0) ? fwd : (dir == 1) ? rev : fwd + rev; + if (val == 0.0) + table.Set(rowIdx, 2 + j, " N/A "); + else + table.Set(rowIdx, 2 + j, " %.2f ", val); + } + } + } + } else { + table.Set(0, 0, " SRC Rank "); + table.Set(0, 1, " SRC MEM "); + if (isBidirectional) { + table.Set(0, 2, " Dir "); + table.Set(0, 3, " DST Rank "); + table.Set(0, 4, " DST MEM "); + table.Set(0, 5, " bw (GB/s) "); + table.DrawColBorder(3); + table.DrawColBorder(5); + } else { + table.Set(0, 2, " DST Rank "); + table.Set(0, 3, " DST MEM "); + table.Set(0, 4, " bw (GB/s) "); + table.DrawColBorder(2); + table.DrawColBorder(4); + } + int rowIdx = 1; + for (int i = 0; i < n; i++) { + table.DrawRowBorder(rowIdx); + for (int j = 0; j < n; j++) { + for (int dir = 0; dir < rowsPerSrc; dir++) { + double fwd = avgBandwidth[i * n + j]; + double rev = avgBandwidth[j * n + i]; + double val = (dir == 0) ? fwd : (dir == 1) ? rev : fwd + rev; + if (isBidirectional) { + char const* arrow = (dir == 0) ? " -> " : (dir == 1) ? " <- " : " <-> "; + table.Set(rowIdx, 0, " Rank %02d ", devices[i].memRank); + table.Set(rowIdx, 1, " GPU %02d ", devices[i].memIndex); + table.Set(rowIdx, 2, arrow); + table.Set(rowIdx, 3, " Rank %02d ", devices[j].memRank); + table.Set(rowIdx, 4, " GPU %02d ", devices[j].memIndex); + if (val == 0.0) + table.Set(rowIdx, 5, " N/A "); + else + table.Set(rowIdx, 5, " %.2f ", val); + } else { + table.Set(rowIdx, 0, " Rank %02d ", devices[i].memRank); + table.Set(rowIdx, 1, " GPU %02d ", devices[i].memIndex); + table.Set(rowIdx, 2, " Rank %02d ", devices[j].memRank); + table.Set(rowIdx, 3, " GPU %02d ", devices[j].memIndex); + if (val == 0.0) + table.Set(rowIdx, 4, " N/A "); + else + table.Set(rowIdx, 4, " %.2f ", val); + } + rowIdx++; + } + } + } + } + table.PrintTable(ev.outputToCsv, ev.showBorders); + } + + } + return ERR_NONE; +} diff --git a/src/client/Presets/Presets.hpp b/src/client/Presets/Presets.hpp index de1001fa..43631a45 100644 --- a/src/client/Presets/Presets.hpp +++ b/src/client/Presets/Presets.hpp @@ -22,6 +22,7 @@ THE SOFTWARE. #pragma once #include +#include // EnvVars is available to all presets #include "EnvVars.hpp" @@ -30,41 +31,75 @@ THE SOFTWARE. 
#include "AllToAll.hpp" #include "AllToAllN.hpp" #include "AllToAllSweep.hpp" +#include "BmaSweep.hpp" +#include "EnvVarsList.hpp" +#include "GfxSweep.hpp" +#include "HbmBandwidth.hpp" #include "HealthCheck.hpp" +#include "Help.hpp" +#include "NicAllToAll.hpp" #include "NicRings.hpp" #include "NicPeerToPeer.hpp" #include "OneToAll.hpp" #include "PeerToPeer.hpp" +#include "PodAllToAll.hpp" +#include "PodPeerToPeer.hpp" +#include "Rings.hpp" #include "Scaling.hpp" #include "Schmoo.hpp" +#include "SmokeTest.hpp" #include "Sweep.hpp" +#include "WallClock.hpp" typedef int (*PresetFunc)(EnvVars& ev, size_t const numBytesPerTransfer, - std::string const presetName); + std::string const presetName, + [[maybe_unused]] bool const bytesSpecified); -std::map> presetFuncMap = +struct PresetInfo +{ + PresetFunc func; + std::string description; +}; + +std::map presetFuncMap = { {"a2a", {AllToAllPreset, "Tests parallel transfers between all pairs of GPU devices"}}, {"a2a_n", {AllToAllRdmaPreset, "Tests parallel transfers between all pairs of GPU devices using Nearest NIC RDMA transfers"}}, {"a2asweep", {AllToAllSweepPreset, "Test GFX-based all-to-all transfers swept across different CU and GFX unroll counts"}}, + {"bmasweep", {BmaSweepPreset, "Test and compare batched DMA executor for multi destination copies"}}, + {"envvars", {EnvVarsPreset, "Show list of environment variables that can be used to modify behavior"}}, + {"gfxsweep", {GfxSweepPreset, "Sweep over various GFX kernel options for a given GFX Transfer"}}, + {"hbm", {HbmBandwidthPreset, "Tests HBM bandwidth"}}, {"healthcheck", {HealthCheckPreset, "Simple bandwidth health check (MI300X series only)"}}, - {"nicrings", {NicRingsPreset, "Tests NIC rings created across identical NIC indices across ranks"}}, + {"help", {HelpPreset, "Shows example usage details"}}, + {"nica2a", {NicAllToAllPreset, "All-to-all GPU traffic over NIC transfers using each NIC's closest GPU/CPU endpoint"}}, {"nicp2p", {NicPeerToPeerPreset, "Multi-node peer-to-peer RDMA transfer test between all NICs"}}, + {"nicrings", {NicRingsPreset, "Tests NIC rings created across identical NIC indices across ranks"}}, {"one2all", {OneToAllPreset, "Test all subsets of parallel transfers from one GPU to all others"}}, {"p2p" , {PeerToPeerPreset, "Peer-to-peer device memory bandwidth test"}}, + {"poda2a", {PodAllToAllPreset, "All-to-all transfers between subgroups of ranks within a pod"}}, + {"podp2p", {PodPeerToPeerPreset, "Peer-to-peer transfers test among ranks within a pod"}}, + {"rings", {RingsPreset, "Ring transfers within subgroups of ranks in a pod"}}, {"rsweep", {SweepPreset, "Randomly sweep through sets of Transfers"}}, {"scaling", {ScalingPreset, "Run scaling test from one GPU to other devices"}}, {"schmoo", {SchmooPreset, "Scaling tests for local/remote read/write/copy"}}, + {"smoketest", {SmokeTestPreset, "Simple correctness smoke-test"}}, {"sweep", {SweepPreset, "Ordered sweep through sets of Transfers"}}, + {"wallclock", {WallClockPreset, "Tests wallclock consistency across XCCs within a GPU"}}, }; void DisplayPresets() { - printf("\nAvailable Preset Benchmarks:\n"); - printf("============================\n"); - for (auto const& x : presetFuncMap) - printf(" %15s - %s\n", x.first.c_str(), x.second.second.c_str()); + if (!Utils::RankDoesOutput()) return; + printf(" %-12s | %-56s\n", "Preset", "Description"); + printf("=============================================================================================================\n"); + for (auto const& x : presetFuncMap) { + 
printf(" %-12s | %-56s\n", + x.first.c_str(), + x.second.description.c_str()); + } + printf("=============================================================================================================\n"); } int RunPreset(EnvVars& ev, @@ -74,8 +109,15 @@ int RunPreset(EnvVars& ev, int& retCode) { std::string preset = (argc > 1 ? argv[1] : ""); + bool bytesSpecified = (argc > 2); + + if (preset == "presets") { + DisplayPresets(); + retCode = 0; + return 1; + } if (presetFuncMap.count(preset)) { - retCode = (presetFuncMap[preset].first)(ev, numBytesPerTransfer, preset); + retCode = (presetFuncMap[preset].func)(ev, numBytesPerTransfer, preset, bytesSpecified); return 1; } return 0; diff --git a/src/client/Presets/Rings.hpp b/src/client/Presets/Rings.hpp new file mode 100644 index 00000000..bee03055 --- /dev/null +++ b/src/client/Presets/Rings.hpp @@ -0,0 +1,280 @@ +/* +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +int RingsPreset(EnvVars& ev, + size_t const numBytesPerTransfer, + std::string const presetName, + bool const bytesSpecified) +{ + // Check for homogeneous ranks + if (Utils::GetNumRankGroups() > 1) { + Utils::Print("[ERROR] rings preset can only be run across ranks that are homogeneous\n"); + Utils::Print("[ERROR] Run ./TransferBench without any args to display topology information\n"); + Utils::Print("[ERROR] TB_NIC_FILTER may also be used to limit NIC visibility\n"); + return 1; + } + + // Check for pod support (if multi-node) + int numRanks = TransferBench::GetNumRanks(); + if (numRanks > 1 && Utils::GetRankPerPodMap().empty()) { + Utils::Print("[ERROR] No pods detected. Set TB_FORCE_SINGLE_POD=1 to treat all ranks as a single pod.\n"); + return 1; + } + + int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX); + + int memTypeIdx = EnvVars::GetEnvVar("MEM_TYPE" , 0); + int numGpus = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus); + int numQueuePairs = EnvVars::GetEnvVar("NUM_QUEUE_PAIRS", 0); + int numSubExecs = EnvVars::GetEnvVar("NUM_SUB_EXEC" , 8); + int showDetails = EnvVars::GetEnvVar("SHOW_DETAILS" , 0); + int useDmaExec = EnvVars::GetEnvVar("USE_DMA_EXEC" , 0); + int useRemoteRead = EnvVars::GetEnvVar("USE_REMOTE_READ", 0); + int stride = EnvVars::GetEnvVar("STRIDE" , 1); + int groupSize = EnvVars::GetEnvVar("GROUP_SIZE" , numRanks * numGpus); + + if (numGpus <= 0 || numGpus > numDetectedGpus) { + Utils::Print("[ERROR] Cannot use %d GPUs. 
Detected %d GPUs\n", numGpus, numDetectedGpus); + return 1; + } + if (groupSize <= 0) { + Utils::Print("[ERROR] Group size must be greater than 0\n"); + return 1; + } + if (numRanks * numGpus % groupSize) { + Utils::Print("[ERROR] Group size %d cannot evenly divide %d total devices from %d ranks.\n", + groupSize, numRanks * numGpus, numRanks); + return 1; + } + + int numNics = TransferBench::GetNumExecutors(EXE_NIC, 0); + bool nicDifference = false; + for (int rank = 0; rank < numRanks; rank++) { + if (numGpus > TransferBench::GetNumExecutors(EXE_GPU_GFX, rank)) { + Utils::Print("[ERROR] rings preset requires each rank to have the same number of GPUs\n"); + return 1; + } + if (numQueuePairs > 0 && numNics != TransferBench::GetNumExecutors(EXE_NIC, rank)) + nicDifference = true; + } + if (nicDifference) + Utils::Print("[WARN] Not all ranks have the same number of NICs\n"); + + MemType memType = Utils::GetGpuMemType(memTypeIdx); + std::string devMemTypeStr = Utils::GetGpuMemTypeStr(memTypeIdx); + + if (Utils::RankDoesOutput()) { + ev.DisplayEnvVars(); + if (!ev.hideEnv) { + if (!ev.outputToCsv) printf("[Rings Related]\n"); + ev.Print("MEM_TYPE" , memTypeIdx , "Using %s GPU memory (%s)", devMemTypeStr.c_str(), Utils::GetAllGpuMemTypeStr().c_str()); + ev.Print("NUM_GPU_DEVICES", numGpus , "Using %d GPUs", numGpus); + ev.Print("NUM_QUEUE_PAIRS", numQueuePairs, "Using %d queue pairs for NIC transfers", numQueuePairs); + ev.Print("NUM_SUB_EXEC" , numSubExecs , "Using %d subexecutors/CUs per Transfer", numSubExecs); + ev.Print("USE_DMA_EXEC" , useDmaExec , "Using %s executor", useDmaExec ? "DMA" : "GFX"); + ev.Print("USE_REMOTE_READ", useRemoteRead, "Using %s as executor", useRemoteRead ? "DST" : "SRC"); + ev.Print("STRIDE" , stride , "Reordering devices by taking %d steps", stride); + ev.Print("GROUP_SIZE" , groupSize , "Dividing all devices into ring groups of %d", groupSize); + printf("\n"); + } + } + + Utils::Print("GPU-%s IntraPod Ring benchmark:\n", useDmaExec ? "DMA" : "GFX"); + Utils::Print("==============================\n"); + Utils::Print("[%lu bytes per Transfer] [%s:%d] [MemType:%s] [NIC QueuePairs:%d] [#Ranks:%d]\n", + numBytesPerTransfer, useDmaExec ? "DMA" : "GFX", numSubExecs, + devMemTypeStr.c_str(), numQueuePairs, numRanks); + + TransferBench::ConfigOptions cfg = ev.ToConfigOptions(); + ExeType exeType = useDmaExec ? 
EXE_GPU_DMA : EXE_GPU_GFX;
+
+  int n         = numRanks * numGpus;
+  int numGroups = n / groupSize;
+
+  std::vector<int> indices(n);
+  for (int k = 0; k < n; k++) indices[k] = k;
+  Utils::StrideGenerate(indices, stride);
+
+  std::vector<MemDevice> devices(n);
+  for (int i = 0; i < n; i++) {
+    int const globalIdx = indices[i];
+    int const rank      = globalIdx / numGpus;
+    int const devIdx    = globalIdx % numGpus;
+    devices[i] = {memType, devIdx, rank};
+  }
+
+  Utils::Print("%d ring(s) of %d devices:\n", numGroups, groupSize);
+  for (int group = 0; group < numGroups; group++) {
+    int const groupBase = group * groupSize;
+    Utils::Print("  Ring %d: ", group);
+    for (int i = 0; i < groupSize; i++) {
+      Utils::Print("R%d:G%d -> ", devices[groupBase + i].memRank, devices[groupBase + i].memIndex);
+    }
+    Utils::Print("R%d:G%d\n", devices[groupBase].memRank, devices[groupBase].memIndex);
+  }
+  Utils::Print("\n");
+
+  for (int group = 0; group < numGroups; group++) {
+    int const groupBase = group * groupSize;
+    std::vector<TransferBench::Transfer> transfers;
+
+    for (int i = 0; i < groupSize; i++) {
+      int srcIdx = groupBase + i;
+      int dstIdx = groupBase + (i + 1) % groupSize;
+
+      TransferBench::Transfer transfer;
+      transfer.numBytes = numBytesPerTransfer;
+      transfer.srcs.push_back(devices[srcIdx]);
+      transfer.dsts.push_back(devices[dstIdx]);
+      transfer.exeDevice = {exeType,
+                            (int32_t)(useRemoteRead ? devices[dstIdx].memIndex : devices[srcIdx].memIndex),
+                            (int32_t)(useRemoteRead ? devices[dstIdx].memRank  : devices[srcIdx].memRank)};
+      transfer.exeSubIndex = -1;
+      transfer.numSubExecs = numSubExecs;
+      transfers.push_back(transfer);
+
+      if (numQueuePairs > 0) {
+        TransferBench::Transfer nicTransfer;
+        nicTransfer.numBytes = numBytesPerTransfer;
+        nicTransfer.srcs.push_back(devices[srcIdx]);
+        nicTransfer.dsts.push_back(devices[dstIdx]);
+        nicTransfer.exeDevice = {TransferBench::EXE_NIC_NEAREST,
+                                 (int32_t)devices[srcIdx].memIndex, (int32_t)devices[srcIdx].memRank};
+        nicTransfer.exeSubIndex = devices[dstIdx].memIndex;
+        nicTransfer.numSubExecs = numQueuePairs;
+        transfers.push_back(nicTransfer);
+      }
+    }
+
+    TransferBench::TestResults results;
+    if (!TransferBench::RunTransfers(cfg, transfers, results)) {
+      for (auto const& err : results.errResults)
+        Utils::Print("%s\n", err.errMsg.c_str());
+      return 1;
+    }
+    if (showDetails) {
+      Utils::PrintResults(ev, 1, transfers, results);
+      Utils::Print("\n");
+    }
+
+    if (Utils::RankDoesOutput()) {
+      Utils::Print("\n--- Ring Group %d ---\n", group);
+
+      int const numHops   = groupSize;
+      int const numRows   = 2 + numHops + 3;
+      int const numCols   = 6;
+      int const precision = 2;
+      Utils::TableHelper table(numRows, numCols, precision);
+
+      table.DrawRowBorder(0);
+      table.DrawColBorder(0);
+      table.DrawColBorder(numCols);
+      table.DrawRowBorder(numRows);
+
+      table.Set(0, 0, " Src ");
+      table.Set(0, 1, " Src ");
+      table.Set(0, 2, " Dst ");
+      table.Set(0, 3, " Dst ");
+      table.Set(0, 4, " GFX BW ");
+      table.Set(1, 0, " Rank ");
+      table.Set(1, 1, " GPU ");
+      table.Set(1, 2, " Rank ");
+      table.Set(1, 3, " GPU ");
+      table.Set(1, 4, " (GB/s) ");
+      table.DrawColBorder(2);
+      table.DrawColBorder(4);
+
+      if (numQueuePairs > 0) {
+        table.Set(0, 5, " NIC BW ");
+        table.Set(1, 5, " (GB/s) ");
+      } else {
+        table.Set(0, 5, " ");
+        table.Set(1, 5, " ");
+      }
+
+      table.DrawRowBorder(2);
+
+      double gfxMin = std::numeric_limits<double>::max();
+      double gfxAvg = 0.0;
+      double gfxMax = std::numeric_limits<double>::lowest();
+      double nicMin = std::numeric_limits<double>::max();
+      double nicAvg = 0.0;
+      double nicMax = std::numeric_limits<double>::lowest();
+
+      int tfrIdx = 0;
+      for
(int i = 0; i < numHops; i++) { + int srcIdx = groupBase + i; + int dstIdx = groupBase + (i + 1) % groupSize; + int row = 2 + i; + + double gfxBw = results.tfrResults[tfrIdx].avgBandwidthGbPerSec; + tfrIdx++; + + table.Set(row, 0, " %d ", devices[srcIdx].memRank); + table.Set(row, 1, " %d ", devices[srcIdx].memIndex); + table.Set(row, 2, " %d ", devices[dstIdx].memRank); + table.Set(row, 3, " %d ", devices[dstIdx].memIndex); + table.Set(row, 4, " %.2f ", gfxBw); + + gfxMin = std::min(gfxMin, gfxBw); + gfxAvg += gfxBw; + gfxMax = std::max(gfxMax, gfxBw); + + if (numQueuePairs > 0) { + double nicBw = results.tfrResults[tfrIdx].avgBandwidthGbPerSec; + tfrIdx++; + table.Set(row, 5, " %.2f ", nicBw); + nicMin = std::min(nicMin, nicBw); + nicAvg += nicBw; + nicMax = std::max(nicMax, nicBw); + } + } + + int summaryBase = 2 + numHops; + table.DrawRowBorder(summaryBase); + table.Set(summaryBase , 1, " MAX "); + table.Set(summaryBase + 1, 1, " AVG "); + table.Set(summaryBase + 2, 1, " MIN "); + table.Set(summaryBase , 4, " %.2f ", gfxMax); + table.Set(summaryBase + 1, 4, " %.2f ", gfxAvg / numHops); + table.Set(summaryBase + 2, 4, " %.2f ", gfxMin); + + if (numQueuePairs > 0) { + table.Set(summaryBase , 5, " %.2f ", nicMax); + table.Set(summaryBase + 1, 5, " %.2f ", nicAvg / numHops); + table.Set(summaryBase + 2, 5, " %.2f ", nicMin); + } + + table.PrintTable(ev.outputToCsv, ev.showBorders); + + Utils::Print("Aggregate bandwidth (CPU Timed): %8.3f GB/s\n", results.avgTotalBandwidthGbPerSec); + } + } + + if (!Utils::RankDoesOutput()) return 0; + + if (Utils::HasDuplicateHostname()) { + printf("[WARN] It is recommended to run TransferBench with one rank per host to avoid potential aliasing of executors\n"); + } + + return 0; +} diff --git a/src/client/Presets/Scaling.hpp b/src/client/Presets/Scaling.hpp index c654943e..5c6879ce 100644 --- a/src/client/Presets/Scaling.hpp +++ b/src/client/Presets/Scaling.hpp @@ -20,41 +20,50 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/
-int ScalingPreset(EnvVars& ev,
-                  size_t const numBytesPerTransfer,
-                  std::string const presetName)
+int ScalingPreset(EnvVars& ev,
+                  size_t const numBytesPerTransfer,
+                  std::string const presetName,
+                  bool const bytesSpecified)
 {
   if (TransferBench::GetNumRanks() > 1) {
     Utils::Print("[ERROR] Scaling preset currently not supported for multi-node\n");
-    return 1;
+    return ERR_FATAL;
   }
 
   int numDetectedCpus = TransferBench::GetNumExecutors(EXE_CPU);
   int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
 
   // Collect env vars for this preset
+  int cpuMemTypeIdx = EnvVars::GetEnvVar("CPU_MEM_TYPE", 0);
+  int gpuMemTypeIdx = EnvVars::GetEnvVar("GPU_MEM_TYPE", 0);
   int localIdx      = EnvVars::GetEnvVar("LOCAL_IDX", 0);
   int numCpuDevices = EnvVars::GetEnvVar("NUM_CPU_DEVICES", numDetectedCpus);
   int numGpuDevices = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
   int sweepMax      = EnvVars::GetEnvVar("SWEEP_MAX", 32);
   int sweepMin      = EnvVars::GetEnvVar("SWEEP_MIN", 1);
-  int useFineGrain  = EnvVars::GetEnvVar("USE_FINE_GRAIN", 0);
 
   // Display environment variables
+  MemType cpuMemType = Utils::GetCpuMemType(cpuMemTypeIdx);
+  MemType gpuMemType = Utils::GetGpuMemType(gpuMemTypeIdx);
+
   ev.DisplayEnvVars();
   if (!ev.hideEnv) {
     int outputToCsv = ev.outputToCsv;
-    if (!outputToCsv) printf("[Schmoo Related]\n");
-    ev.Print("LOCAL_IDX", localIdx, "Local GPU index");
-    ev.Print("SWEEP_MAX", sweepMax, "Max number of subExecutors to use");
-    ev.Print("SWEEP_MIN", sweepMin, "Min number of subExecutors to use");
+    if (!outputToCsv) printf("[Scaling Related]\n");
+    ev.Print("CPU_MEM_TYPE"   , cpuMemTypeIdx, "Using %s (%s)", Utils::GetCpuMemTypeStr(cpuMemTypeIdx).c_str(), Utils::GetAllCpuMemTypeStr().c_str());
+    ev.Print("GPU_MEM_TYPE"   , gpuMemTypeIdx, "Using %s (%s)", Utils::GetGpuMemTypeStr(gpuMemTypeIdx).c_str(), Utils::GetAllGpuMemTypeStr().c_str());
+    ev.Print("LOCAL_IDX"      , localIdx     , "Local GPU index");
+    ev.Print("NUM_CPU_DEVICES", numCpuDevices, "Using %d CPUs", numCpuDevices);
+    ev.Print("NUM_GPU_DEVICES", numGpuDevices, "Using %d GPUs", numGpuDevices);
+    ev.Print("SWEEP_MAX"      , sweepMax     , "Max number of subExecutors to use");
+    ev.Print("SWEEP_MIN"      , sweepMin     , "Min number of subExecutors to use");
     printf("\n");
   }
 
   // Validate env vars
   if (localIdx >= numDetectedGpus) {
     printf("[ERROR] Cannot execute scaling test with local GPU device %d\n", localIdx);
-    return 1;
+    return ERR_FATAL;
   }
 
   TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
@@ -74,25 +83,23 @@ int ScalingPreset(EnvVars& ev,
 
   std::vector<std::pair<double, int>> bestResult(numDevices);
 
-  MemType memType = useFineGrain ? MEM_GPU_FINE : MEM_GPU;
-
   std::vector<Transfer> transfers(1);
   Transfer& t = transfers[0];
   t.exeDevice   = {EXE_GPU_GFX, localIdx};
   t.exeSubIndex = -1;
   t.numBytes    = numBytesPerTransfer;
-  t.srcs        = {{memType, localIdx}};
+  t.srcs        = {{gpuMemType, localIdx}};
 
   for (int numSubExec = sweepMin; numSubExec <= sweepMax; numSubExec++) {
     t.numSubExecs = numSubExec;
     printf("%4d ", numSubExec);
     for (int i = 0; i < numDevices; i++) {
-      t.dsts = {{i < numCpuDevices ? MEM_CPU : MEM_GPU,
+      t.dsts = {{i < numCpuDevices ? cpuMemType : gpuMemType,
                  i < numCpuDevices ?
i : i - numCpuDevices}}; if (!RunTransfers(cfg, transfers, results)) { Utils::PrintErrors(results.errResults); - return 1; + return ERR_FATAL; } double bw = results.tfrResults[0].avgBandwidthGbPerSec; printf("%c%7.2f ", separator, bw); @@ -109,5 +116,6 @@ int ScalingPreset(EnvVars& ev, for (int i = 0; i < numDevices; i++) printf("%c%7.2f(%3d)", separator, bestResult[i].first, bestResult[i].second); printf("\n"); - return 0; + + return ERR_NONE; } diff --git a/src/client/Presets/Schmoo.hpp b/src/client/Presets/Schmoo.hpp index 71576ef8..bdd24b66 100644 --- a/src/client/Presets/Schmoo.hpp +++ b/src/client/Presets/Schmoo.hpp @@ -19,52 +19,55 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -int SchmooPreset(EnvVars& ev, - size_t const numBytesPerTransfer, - std::string const presetName) +int SchmooPreset(EnvVars& ev, + size_t const numBytesPerTransfer, + std::string const presetName, + bool const bytesSpecified) { if (TransferBench::GetNumRanks() > 1) { Utils::Print("[ERROR] Schmoo preset currently not supported for multi-node\n"); - return 1; + return ERR_FATAL; } int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX); if (numDetectedGpus < 2) { printf("[ERROR] Schmoo benchmark requires at least 2 GPUs\n"); - return 1; + return ERR_FATAL; } // Collect env vars for this preset - int localIdx = EnvVars::GetEnvVar("LOCAL_IDX", 0); - int remoteIdx = EnvVars::GetEnvVar("REMOTE_IDX", 1); - int sweepMax = EnvVars::GetEnvVar("SWEEP_MAX", 32); - int sweepMin = EnvVars::GetEnvVar("SWEEP_MIN", 1); - int useFineGrain = EnvVars::GetEnvVar("USE_FINE_GRAIN", 0); + int gpuMemTypeIdx = EnvVars::GetEnvVar("GPU_MEM_TYPE", 0); + int localIdx = EnvVars::GetEnvVar("LOCAL_IDX", 0); + int remoteIdx = EnvVars::GetEnvVar("REMOTE_IDX", 1); + int sweepMax = EnvVars::GetEnvVar("SWEEP_MAX", 32); + int sweepMin = EnvVars::GetEnvVar("SWEEP_MIN", 1); + + MemType gpuMemType = Utils::GetGpuMemType(gpuMemTypeIdx); // Display environment variables ev.DisplayEnvVars(); if (!ev.hideEnv) { int outputToCsv = ev.outputToCsv; if (!outputToCsv) printf("[Schmoo Related]\n"); + ev.Print("GPU_MEM_TYPE" , gpuMemTypeIdx, "Using %s (%s)", Utils::GetGpuMemTypeStr(gpuMemTypeIdx).c_str(), Utils::GetAllGpuMemTypeStr().c_str()); ev.Print("LOCAL_IDX", localIdx, "Local GPU index"); ev.Print("REMOTE_IDX", remoteIdx, "Remote GPU index"); ev.Print("SWEEP_MAX", sweepMax, "Max number of subExecutors to use"); ev.Print("SWEEP_MIN", sweepMin, "Min number of subExecutors to use"); - ev.Print("USE_FINE_GRAIN", useFineGrain, "Using %s-grained memory", useFineGrain ? "fine" : "coarse"); printf("\n"); } // Validate env vars if (localIdx >= numDetectedGpus || remoteIdx >= numDetectedGpus) { printf("[ERROR] Cannot execute schmoo test with local GPU device %d, remote GPU device %d\n", localIdx, remoteIdx); - return 1; + return ERR_FATAL; } TransferBench::ConfigOptions cfg = ev.ToConfigOptions(); TransferBench::TestResults results; - char memChar = useFineGrain ? 
'F' : 'G'; + char memChar = MemTypeStr[gpuMemType]; printf("Bytes to transfer: %lu Local GPU: %d Remote GPU: %d\n", numBytesPerTransfer, localIdx, remoteIdx); printf(" | Local Read | Local Write | Local Copy | Remote Read | Remote Write| Remote Copy |\n"); printf(" #CUs |%c%02d->G%02d->N00|N00->G%02d->%c%02d|%c%02d->G%02d->%c%02d|%c%02d->G%02d->N00|N00->G%02d->%c%02d|%c%02d->G%02d->%c%02d|\n", @@ -82,69 +85,65 @@ int SchmooPreset(EnvVars& ev, t.exeSubIndex = -1; t.numBytes = numBytesPerTransfer; - MemType memType = (useFineGrain ? MEM_GPU_FINE : MEM_GPU); - for (int numCUs = sweepMin; numCUs <= sweepMax; numCUs++) { t.numSubExecs = numCUs; // Local Read - t.srcs = {{memType, localIdx}}; + t.srcs = {{gpuMemType, localIdx}}; t.dsts = {}; if (!TransferBench::RunTransfers(cfg, transfers, results)) { Utils::PrintErrors(results.errResults); - return 1; + return ERR_FATAL; } double const localRead = results.tfrResults[0].avgBandwidthGbPerSec; // Local Write t.srcs = {}; - t.dsts = {{memType, localIdx}}; + t.dsts = {{gpuMemType, localIdx}}; if (!TransferBench::RunTransfers(cfg, transfers, results)) { Utils::PrintErrors(results.errResults); - return 1; + return ERR_FATAL; } double const localWrite = results.tfrResults[0].avgBandwidthGbPerSec; // Local Copy - t.srcs = {{memType, localIdx}}; - t.dsts = {{memType, localIdx}}; - t.srcs = {}; - t.dsts = {{memType, localIdx}}; + t.srcs = {{gpuMemType, localIdx}}; + t.dsts = {{gpuMemType, localIdx}}; if (!TransferBench::RunTransfers(cfg, transfers, results)) { Utils::PrintErrors(results.errResults); - return 1; + return ERR_FATAL; } double const localCopy = results.tfrResults[0].avgBandwidthGbPerSec; // Remote Read - t.srcs = {{memType, remoteIdx}}; + t.srcs = {{gpuMemType, remoteIdx}}; t.dsts = {}; if (!TransferBench::RunTransfers(cfg, transfers, results)) { Utils::PrintErrors(results.errResults); - return 1; + return ERR_FATAL; } double const remoteRead = results.tfrResults[0].avgBandwidthGbPerSec; // Remote Write t.srcs = {}; - t.dsts = {{memType, remoteIdx}}; + t.dsts = {{gpuMemType, remoteIdx}}; if (!TransferBench::RunTransfers(cfg, transfers, results)) { Utils::PrintErrors(results.errResults); - return 1; + return ERR_FATAL; } double const remoteWrite = results.tfrResults[0].avgBandwidthGbPerSec; // Remote Copy - t.srcs = {{memType, localIdx}}; - t.dsts = {{memType, remoteIdx}}; + t.srcs = {{gpuMemType, localIdx}}; + t.dsts = {{gpuMemType, remoteIdx}}; if (!TransferBench::RunTransfers(cfg, transfers, results)) { Utils::PrintErrors(results.errResults); - return 1; + return ERR_FATAL; } double const remoteCopy = results.tfrResults[0].avgBandwidthGbPerSec; printf(" %3d %11.3f %11.3f %11.3f %11.3f %11.3f %11.3f \n", numCUs, localRead, localWrite, localCopy, remoteRead, remoteWrite, remoteCopy); } - return 0; + return ERR_NONE; } diff --git a/src/client/Presets/SmokeTest.hpp b/src/client/Presets/SmokeTest.hpp new file mode 100644 index 00000000..9628823c --- /dev/null +++ b/src/client/Presets/SmokeTest.hpp @@ -0,0 +1,336 @@ +/* +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. 
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <set>
+
+namespace {
+
+#define NUM_SMOKE_TESTS     14
+#define MAX_TRANSFER_STRLEN 128
+
+// What to print on pass/fail/skip
+const std::string pass = "*";
+const std::string fail = "F";
+const std::string skip = ".";
+
+int RunTest(int testNum,
+            std::set<int> const& testsToRun,
+            std::vector<size_t> const& sizeList,
+            int numSubExecPerGpu,
+            ConfigOptions& cfg,
+            MemType cpuMemType,
+            MemType gpuMemType,
+            size_t maxBytesPerSubExec,
+            int totalGpus)
+{
+  int numFail = 0;
+
+  // Collect some topology information
+  int numRanks = TransferBench::GetNumRanks();
+
+  std::vector<Transfer> transfers;
+  std::vector<Transfer> allTransfers;
+  TestResults results;
+  char transferStr[MAX_TRANSFER_STRLEN] = {};
+
+  // Different test categories
+  bool isH2D       = (testNum == 1 || testNum ==  8);
+  bool isD2H       = (testNum == 2 || testNum ==  9);
+  bool isD2D_RW    = (testNum == 3 || testNum == 10);
+  bool isD2D_RR    = (testNum == 4 || testNum == 11);
+  bool isBroadcast = (testNum == 5 || testNum == 12);
+  bool isGather    = (testNum == 6 || testNum == 13);
+  bool isAllToAll  = (testNum == 7 || testNum == 14);
+
+  // Determine executor type
+  ExeType exeType;
+  if      (1 <= testNum && testNum <=  7) exeType = EXE_GPU_DMA;
+  else if (8 <= testNum && testNum <= 14) exeType = EXE_GPU_GFX;
+  else {
+    Utils::Print("[ERROR] Unsupported test number %d\n", testNum);
+    exit(1);
+  }
+
+  // Adjust number of subexecutors per transfer if performing multiple transfers
+  int numSubExec = exeType == EXE_GPU_DMA ?
1 : numSubExecPerGpu;
+  if (exeType == EXE_GPU_GFX && (isBroadcast || isGather || isAllToAll))
+    numSubExec = std::max(1, numSubExecPerGpu / totalGpus);
+
+  for (size_t numBytes : sizeList) {
+
+    // Print skip symbol for skipped tests
+    if (!testsToRun.count(testNum)) {
+      Utils::Print("%s", skip.c_str()); fflush(stdout);
+      continue;
+    }
+    if (exeType == EXE_GPU_GFX &&
+        (numSubExec * cfg.data.blockBytes > numBytes ||
+         numSubExec * maxBytesPerSubExec < numBytes)) {
+      Utils::Print("%s", skip.c_str()); fflush(stdout);
+      continue;
+    }
+    // Skip tests that require a pod
+    if (numRanks > 1 && Utils::GetRankPerPodMap().size() != 1 && !(isH2D || isD2H)) {
+      Utils::Print("%s", skip.c_str()); fflush(stdout);
+      continue;
+    }
+
+    bool allPass = true;
+    allTransfers.clear();
+
+    // Combine transfers from each GPU and run them all in parallel
+    for (int rank = 0; allPass && rank < numRanks; rank++) {
+      int numGpus = GetNumExecutors(exeType, rank);
+      for (int gpuIdx = 0; allPass && gpuIdx < numGpus; gpuIdx++) {
+        if (isH2D || isD2H) {
+          // Copy to/from closest CPU NUMA node for this GPU
+          int cpuIdx = GetClosestCpuNumaToGpu(gpuIdx, rank);
+          snprintf(transferStr, MAX_TRANSFER_STRLEN, "-1 (R%d%c%d R%d%c%d R%d%c%d %d %lu)",
+                   rank, MemTypeStr[isH2D ? cpuMemType : gpuMemType], isH2D ? cpuIdx : gpuIdx,
+                   rank, ExeTypeStr[exeType], gpuIdx,
+                   rank, MemTypeStr[isH2D ? gpuMemType : cpuMemType], isH2D ? gpuIdx : cpuIdx,
+                   numSubExec, numBytes);
+        } else if (isD2D_RW || isD2D_RR) {
+          // Copy from this GPU to "next" GPU
+          int dstRank = rank, dstGpuIdx = gpuIdx + 1;
+          if (dstGpuIdx >= GetNumExecutors(exeType, dstRank)) {
+            dstGpuIdx = 0;
+            dstRank   = (rank+1) % numRanks;
+          }
+          snprintf(transferStr, MAX_TRANSFER_STRLEN, "-1 (R%d%c%d R%d%c%d R%d%c%d %d %lu)",
+                   rank, MemTypeStr[gpuMemType], gpuIdx,
+                   isD2D_RW ? rank : dstRank, ExeTypeStr[exeType], isD2D_RW ? gpuIdx : dstGpuIdx,
+                   dstRank, MemTypeStr[gpuMemType], dstGpuIdx,
+                   numSubExec, numBytes);
+        } else if (isBroadcast) {
+          // Split up the number of CUs across all Transfers
+          snprintf(transferStr, MAX_TRANSFER_STRLEN, "-1 (R%d%c%d R%d%c%d R*%c* %d %lu)",
+                   rank, MemTypeStr[gpuMemType], gpuIdx,
+                   rank, ExeTypeStr[exeType], gpuIdx,
+                   MemTypeStr[gpuMemType],
+                   numSubExec, numBytes);
+        } else if (isGather) {
+          // Split up the number of CUs across all Transfers
+          snprintf(transferStr, MAX_TRANSFER_STRLEN, "-1 (R*%c* R%d%c%d R%d%c%d %d %lu)",
+                   MemTypeStr[gpuMemType],
+                   rank, ExeTypeStr[exeType], gpuIdx,
+                   rank, MemTypeStr[gpuMemType], gpuIdx,
+                   numSubExec, numBytes);
+        } else if (isAllToAll) {
+          // Split up the number of CUs across all Transfers
+          snprintf(transferStr, MAX_TRANSFER_STRLEN, "-1 (R%d%c%d R%d%c%d R*%c* %d %lu)",
+                   rank, MemTypeStr[gpuMemType], gpuIdx,
+                   rank, ExeTypeStr[exeType], gpuIdx,
+                   MemTypeStr[gpuMemType],
+                   numSubExec, numBytes);
+        }
+
+        ErrResult err = ParseTransfers(transferStr, transfers);
+        if (err.errType != ERR_NONE) {
+          Utils::Print("[ERROR] Unexpected parsing error - %s. This is a coding error\n", err.errMsg.c_str());
+          exit(1);
+        }
+
+        if (isBroadcast || isGather) {
+          if (!RunTransfers(cfg, transfers, results)) {
+            allPass = false;
+            break;
+          }
+        } else {
+          allTransfers.insert(allTransfers.end(), transfers.begin(), transfers.end());
+        }
+      }
+    }
+    if (!(isBroadcast || isGather)) {
+      if (!RunTransfers(cfg, allTransfers, results)) {
+        allPass = false;
+      }
+    }
+    Utils::Print("%s", allPass ? pass.c_str() : fail.c_str()); fflush(stdout);
+    numFail += (allPass ?
0 : 1);
+  }
+  return numFail;
+}
+
+int SmokeTestPreset(EnvVars& ev,
+                    size_t const numBytesPerTransfer,
+                    std::string const presetName,
+                    bool const bytesSpecified)
+{
+  // Check for single pod
+  if (Utils::GetRankPerPodMap().size() > 1) {
+    Utils::Print("[ERROR] %s preset can only be run within a single pod\n", presetName.c_str());
+    Utils::Print("[ERROR] Pod membership may be forced by setting TB_FORCE_SINGLE_POD=1\n");
+    return ERR_FATAL;
+  }
+
+  // Collect topology and check that all GPUs have the same number of subExecutors
+  int numRanks   = TransferBench::GetNumRanks();
+  int totalGpus  = 0;
+  int numSubExec = TransferBench::GetNumSubExecutors({EXE_GPU_GFX, 0, 0});
+  for (int rank = 0; rank < numRanks; rank++) {
+    int numGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX, rank);
+    totalGpus += numGpus;
+    for (int gpu = 0; gpu < numGpus; gpu++) {
+      if (numSubExec != TransferBench::GetNumSubExecutors({EXE_GPU_GFX, gpu, rank})) {
+        Utils::Print("[ERROR] %s preset can only be run on GPUs with the same number of subexecutors\n", presetName.c_str());
+        return ERR_FATAL;
+      }
+    }
+  }
+
+  // Modify defaults unless they were set
+  if (!getenv("ALWAYS_VALIDATE")) ev.alwaysValidate = 1;
+  if (!getenv("NUM_ITERATIONS" )) ev.numIterations  = 2;
+  if (!getenv("NUM_WARMUPS"    )) ev.numWarmups     = 0;
+
+  // Collect env vars
+  int cpuMemTypeIdx               = EnvVars::GetEnvVar          ("CPU_MEM_TYPE", 0);
+  int gpuMemTypeIdx               = EnvVars::GetEnvVar          ("GPU_MEM_TYPE", 0);
+  vector<int> gfxSesList          = EnvVars::GetEnvVarArray     ("GFX_SE_LIST", {1,numSubExec});
+  std::string seMaxBytesStr       = EnvVars::GetEnvVar          ("SE_MAX_BYTES", "128M");
+  vector<std::string> sizeStrList = EnvVars::GetEnvVarStrArray  ("SIZE_LIST", {"1K","16M","256M"});
+  vector<int> testList            = EnvVars::GetEnvVarRangeArray("TEST_LIST", {});
+
+  MemType cpuMemType = Utils::GetCpuMemType(cpuMemTypeIdx);
+  MemType gpuMemType = Utils::GetGpuMemType(gpuMemTypeIdx);
+  std::set<int> testsToRun(testList.begin(), testList.end());
+  if (testList.empty()) {
+    for (int testIdx = 1; testIdx <= NUM_SMOKE_TESTS; testIdx++)
+      testsToRun.insert(testIdx);
+  }
+
+  vector<size_t> sizeList;
+  if (bytesSpecified) {
+    sizeList = {numBytesPerTransfer};
+  } else {
+    for (auto s : sizeStrList) {
+      size_t val;
+      if (sscanf(s.c_str(), "%lu", &val) == 1) {
+        switch (s[s.size()-1]) {
+        case 'G': case 'g': val *= 1024;
+        case 'M': case 'm': val *= 1024;
+        case 'K': case 'k': val *= 1024;
+        }
+        sizeList.push_back(val);
+      }
+    }
+  }
+  size_t seMaxBytes = 128 * 1024 * 1024;
+  if (sscanf(seMaxBytesStr.c_str(), " %lu", &seMaxBytes) == 1) {
+    switch (seMaxBytesStr[seMaxBytesStr.size()-1]) {
+    case 'G': case 'g': seMaxBytes *= 1024;
+    case 'M': case 'm': seMaxBytes *= 1024;
+    case 'K': case 'k': seMaxBytes *= 1024;
+    }
+  }
+
+  TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
+
+  // Print off environment variables
+  if (Utils::RankDoesOutput()) {
+    ev.DisplayEnvVars();
+    if (!ev.hideEnv) {
+      if (!ev.outputToCsv) printf("[%s-preset Related]\n", presetName.c_str());
+      ev.Print("CPU_MEM_TYPE", cpuMemTypeIdx, "Using %s (%s)", Utils::GetCpuMemTypeStr(cpuMemTypeIdx).c_str(), Utils::GetAllCpuMemTypeStr().c_str());
+      ev.Print("GFX_SE_LIST" , gfxSesList.size(), "Testing GFX with subexecutor counts: %s", EnvVars::ToStr(gfxSesList).c_str());
+      ev.Print("GPU_MEM_TYPE", gpuMemTypeIdx, "Using %s (%s)", Utils::GetGpuMemTypeStr(gpuMemTypeIdx).c_str(), Utils::GetAllGpuMemTypeStr().c_str());
+      ev.Print("SIZE_LIST"   , sizeStrList.size(), "Transfer sizes tested: %s", ev.GetStr(sizeStrList).c_str());
+      ev.Print("SE_MAX_BYTES", seMaxBytesStr, "Each SubExecutor can
work on at most %lu bytes", seMaxBytes); + ev.Print("TEST_LIST" , testsToRun.size(), testList.empty() ? "Running all tests" : "Running Tests: %s", ev.GetStr(testList).c_str()); + printf("\n"); + } + } + + // Calculate cell-spacing / padding + int numSizes = sizeList.size(); + int colSize = std::max(5, 2 + numSizes); + int lPad1Size = (colSize - 3) / 2, rPad1Size = colSize - lPad1Size - 3; + int lPad2Size = (colSize - numSizes) / 2, rPad2Size = colSize - lPad2Size - numSizes; + + std::string l1(lPad1Size, ' '), r1(rPad1Size, ' '); + std::string l2(lPad2Size, ' '), r2(rPad2Size, ' '); + + int testsFailed = 0; + auto test = [&](int x, int y) { + Utils::Print(" %02d |%s", x, l2.c_str()); + fflush(stdout); + testsFailed += RunTest(x, testsToRun, sizeList, 1, cfg, cpuMemType, gpuMemType, seMaxBytes, totalGpus); + Utils::Print("%s| %02d |", r2.c_str(), y); + for (auto numSubExec : gfxSesList) { + Utils::Print("%s", l2.c_str()); + fflush(stdout); + testsFailed += RunTest(y, testsToRun, sizeList, numSubExec, cfg, cpuMemType, gpuMemType, seMaxBytes, totalGpus); + Utils::Print("%s|", r2.c_str()); + } + Utils::Print("\n"); + fflush(stdout); + }; + + Utils::Print("Running tests on %d GPUs total across %d rank(s)\n", totalGpus, numRanks); + Utils::Print("Legend: %s=Pass %s=Skip %s=Fail\n", pass.c_str(), skip.c_str(), fail.c_str()); + + // Print headers + Utils::Print(" %s %s |", l1.c_str(), r1.c_str()); + for ([[maybe_unused]] auto numSubExec : gfxSesList) + Utils::Print("%sGFX%s|", l1.c_str(), r1.c_str()); + Utils::Print("\n"); + Utils::Print("| Name | Test |%sDMA%s| Test |", l1.c_str(), r1.c_str()); + for (auto numSubExec : gfxSesList) + Utils::Print("%s%03d%s|", l1.c_str(), numSubExec, r1.c_str()); + Utils::Print("\n"); + Utils::Print("|---------------------------|------|%s|------|", std::string(colSize, '-').c_str()); + for ([[maybe_unused]] auto numSubExec : gfxSesList) + Utils::Print("%s|", std::string(colSize, '-').c_str()); + Utils::Print("\n"); + + // Print table / Run Tests + Utils::Print("| Copy (H2D) |"); test(1, 8); + Utils::Print("| Copy (D2H) |"); test(2, 9); + Utils::Print("| Copy (D2D) (Remote Write) |"); test(3,10); + Utils::Print("| Copy (D2D) (Remote Read ) |"); test(4,11); + Utils::Print("| Broadcast (One to All) |"); test(5,12); + Utils::Print("| Gather (All to One) |"); test(6,13); + Utils::Print("| All To All |"); test(7,14); + + Utils::Print("|---------------------------|------|%s|------|", std::string(colSize, '-').c_str()); + for ([[maybe_unused]] auto numSubExec : gfxSesList) + Utils::Print("%s|", std::string(colSize, '-').c_str()); + Utils::Print("\n\n"); + + // Show summary + if (testsFailed) { + Utils::Print("[WARN] %d Tests FAILED\n", testsFailed); + } else { + Utils::Print("All tests passed\n"); + } + if (numRanks > 1 && Utils::GetRankPerPodMap().size() != 1) { + Utils::Print("[WARN] Copy (D2D) / Broadcast / Gather / AllToAll tests are skipped if ranks are not in same pod\n"); + } + if (Utils::HasDuplicateHostname()) { + Utils::Print("[WARN] It is recommended to run TransferBench with one rank per host to avoid potential aliasing of executors\n"); + } + return testsFailed ? 
ERR_FATAL : ERR_NONE; +} + +} diff --git a/src/client/Presets/Sweep.hpp b/src/client/Presets/Sweep.hpp index 97c9c951..2ab85006 100644 --- a/src/client/Presets/Sweep.hpp +++ b/src/client/Presets/Sweep.hpp @@ -39,13 +39,14 @@ void LogTransfers(FILE *fp, int const testNum, std::vector const& tran } } -int SweepPreset(EnvVars& ev, - size_t const numBytesPerTransfer, - std::string const presetName) +int SweepPreset(EnvVars& ev, + size_t const numBytesPerTransfer, + std::string const presetName, + bool const bytesSpecified) { if (TransferBench::GetNumRanks() > 1) { Utils::Print("[ERROR] Sweep preset currently not supported for multi-node\n"); - return 1; + return ERR_FATAL; } bool const isRandom = (presetName == "rsweep"); @@ -103,33 +104,33 @@ int SweepPreset(EnvVars& ev, for (auto ch : sweepSrc) { if (!strchr(MemTypeStr, ch)) { printf("[ERROR] Unrecognized memory type '%c' specified for sweep source\n", ch); - return 1; + return ERR_FATAL; } if (strchr(sweepSrc.c_str(), ch) != strrchr(sweepSrc.c_str(), ch)) { printf("[ERROR] Duplicate memory type '%c' specified for sweep source\n", ch); - return 1; + return ERR_FATAL; } } for (auto ch : sweepDst) { if (!strchr(MemTypeStr, ch)) { printf("[ERROR] Unrecognized memory type '%c' specified for sweep destination\n", ch); - return 1; + return ERR_FATAL; } if (strchr(sweepDst.c_str(), ch) != strrchr(sweepDst.c_str(), ch)) { printf("[ERROR] Duplicate memory type '%c' specified for sweep destination\n", ch); - return 1; + return ERR_FATAL; } } for (auto ch : sweepExe) { if (!strchr(ExeTypeStr, ch)) { printf("[ERROR] Unrecognized executor type '%c' specified for sweep executor\n", ch); - return 1; + return ERR_FATAL; } if (strchr(sweepExe.c_str(), ch) != strrchr(sweepExe.c_str(), ch)) { printf("[ERROR] Duplicate executor type '%c' specified for sweep executor\n", ch); - return 1; + return ERR_FATAL; } } @@ -339,7 +340,7 @@ int SweepPreset(EnvVars& ev, if (!TransferBench::RunTransfers(cfg, transfers, results)) { Utils::PrintErrors(results.errResults); - if (!continueOnErr) return 1; + if (!continueOnErr) return ERR_FATAL; } else { Utils::PrintResults(ev, numTestsRun, transfers, results); } @@ -371,5 +372,5 @@ int SweepPreset(EnvVars& ev, } } if (fp) fclose(fp); - return 0; + return ERR_NONE; } diff --git a/src/client/Presets/WallClock.hpp b/src/client/Presets/WallClock.hpp new file mode 100644 index 00000000..e23844bc --- /dev/null +++ b/src/client/Presets/WallClock.hpp @@ -0,0 +1,234 @@ +/* +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+__global__ void GetXccTimestamps(uint64_t* timestamps, volatile int* readyFlag)
+{
+  // Only first thread does any work
+  if (threadIdx.x != 0) return;
+
+  // Threadblocks in first "row" handle timestamps
+  if (blockIdx.y == 0) {
+
+    // Collect the XCC ID for this threadblock
+    int xccId;
+    GetXccId(xccId);
+
+    // All threadblocks wait for ready signal
+    while (*readyFlag == 0);
+
+    // Collect timestamp and save to memory
+    auto w = GetTimestamp();
+    timestamps[xccId] = w;
+  } else if (blockIdx.x == 0) {
+
+    // Sleep for some number of cycles to ensure that other threadblocks are active
+    auto w = GetTimestamp();
+    while (GetTimestamp() - w < 10000);
+
+    // Signal start to the other threadblocks
+    *readyFlag = 1;
+  }
+}
+
+#if defined(__NVCC__)
+#define hipDeviceSynchronize cudaDeviceSynchronize
+#define hipMemset            cudaMemset
+#define hipSetDevice         cudaSetDevice
+#endif
+
+int WallClockPreset(EnvVars& ev,
+                    size_t const numBytesPerTransfer,
+                    std::string const presetName,
+                    bool const bytesSpecified)
+{
+  // Gather results and print
+  int numRanks = GetNumRanks();
+  int myRank   = GetRank();
+
+  // Check for single homogeneous group
+  if (Utils::GetNumRankGroups() > 1) {
+    Utils::Print("[ERROR] wallclock preset can only be run across ranks that are homogeneous\n");
+    Utils::Print("[ERROR] Run ./TransferBench without any args to display topology information\n");
+    Utils::Print("[ERROR] TB_NIC_FILTER may also be used to limit NIC visibility\n");
+    return ERR_FATAL;
+  }
+
+  int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
+  int numGpuDevices   = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
+
+  // Print off env vars
+  if (Utils::RankDoesOutput()) {
+    if (!ev.hideEnv) {
+      if (!ev.outputToCsv) printf("[WallClock Related]\n");
+      ev.Print("NUM_GPU_DEVICES", numGpuDevices, "Limit to using %d GPUs (per rank)", numGpuDevices);
+      ev.Print("NUM_ITERATIONS" , ev.numIterations, "Number of iterations");
+      ev.Print("NUM_WARMUPS"    , ev.numWarmups, "Number of warmup iterations");
+      ev.Print("SHOW_ITERATIONS", ev.showIterations, "Showing per iteration details. Set to 2 to see raw wallclock values");
+    }
+  }
+
+  // Check for env var consistency across ranks
+  IS_UNIFORM(numGpuDevices, "NUM_GPU_DEVICES");
+  IS_UNIFORM(ev.numIterations, "NUM_ITERATIONS");
+  IS_UNIFORM(ev.numWarmups, "NUM_WARMUPS");
+  IS_UNIFORM(ev.showIterations, "SHOW_ITERATIONS");
+
+  if (numGpuDevices <= 0) {
+    Utils::Print("[ERROR] wallclock preset requires at least one GPU\n");
+    return ERR_FATAL;
+  }
+
+  // Collect local results
+  int numXccs = GetNumExecutorSubIndices({EXE_GPU_GFX, 0});
+
+  // Compute wall clock rate (based on GPU 0)
+  int wallClockKhz;
+#if defined(__NVCC__)
+  wallClockKhz = 1000000;
+#else
+  HIP_CALL(hipDeviceGetAttribute(&wallClockKhz, hipDeviceAttributeWallClockRate, 0));
+#endif
+  if (wallClockKhz == 0) wallClockKhz = 100000;
+  double uSecPerCycle = 1000.0 / wallClockKhz;
+
+  Utils::Print("\nRunning %d iterations. Detected wall clock rate of %d kHz = %.2f usec per cycle\n\n",
+               ev.numIterations, wallClockKhz, uSecPerCycle);
+
+  std::vector<std::vector<std::vector<uint64_t>>> results(numGpuDevices,
+    std::vector<std::vector<uint64_t>>(ev.numIterations,
+      std::vector<uint64_t>(numXccs, 0)));
+  for (int deviceId = 0; deviceId < numGpuDevices; deviceId++) {
+    HIP_CALL(hipSetDevice(deviceId));
+
+    uint64_t* timestamps;
+    int32_t*  readyFlag;
+
+    if (Utils::AllocateMemory({MEM_CPU_CLOSEST, deviceId}, numXccs * sizeof(uint64_t), (void**)&timestamps)) {
+      Utils::Print("[ERROR] Unable to allocate pinned host memory for storing timestamps for GPU device %d on rank %d\n",
+                   deviceId, myRank);
+      return ERR_FATAL;
+    }
+    if (Utils::AllocateMemory({MEM_GPU, deviceId}, sizeof(int32_t), (void**)&readyFlag)) {
+      Utils::Print("[ERROR] Unable to allocate readyFlag on GPU device %d on rank %d\n", deviceId, myRank);
+      return ERR_FATAL;
+    }
+
+    for (int i = -ev.numWarmups; i < ev.numIterations; i++)
+    {
+      HIP_CALL(hipMemset(readyFlag, 0, sizeof(int)));
+      HIP_CALL(hipDeviceSynchronize());
+      GetXccTimestamps<<<dim3(numXccs, 2), 1>>>(timestamps, readyFlag);
+      HIP_CALL(hipDeviceSynchronize());
+      if (i >= 0) {
+        memcpy(results[deviceId][i].data(), timestamps, numXccs * sizeof(uint64_t));
+      }
+    }
+
+    Utils::DeallocateMemory(MEM_CPU_CLOSEST, timestamps, numXccs * sizeof(uint64_t));
+    Utils::DeallocateMemory(MEM_GPU, readyFlag, sizeof(int32_t));
+  }
+
+  // Prepare table of results
+  int numRows = 1 + numRanks * numGpuDevices * (ev.showIterations ? (ev.numIterations+1) : 1);
+  int numCols = 5 + (ev.showIterations ? numXccs : 0);
+  Utils::TableHelper table(numRows, numCols);
+
+  for (int i = 0; i < numCols; i++) {
+    table.SetColAlignment(i, Utils::TableHelper::ALIGN_CENTER);
+  }
+
+  // Prepare header row
+  int currRow = 0;
+  int currCol = 0;
+  table.Set(currRow, currCol++, "Rank");
+  table.Set(currRow, currCol++, "GPU");
+  table.Set(currRow, currCol++, "Iter");
+  table.Set(currRow, currCol++, "Delta(cycles)");
+  table.Set(currRow, currCol++, "Delta(usec)");
+  if (ev.showIterations) {
+    for (int i = 0; i < numXccs; i++) {
+      table.Set(currRow, currCol++, " XCC %d ", i);
+    }
+  }
+  currRow++;
+
+  double minDelta = std::numeric_limits<double>::max();
+  double maxDelta = std::numeric_limits<double>::lowest();
+
+  for (int rank = 0; rank < numRanks; rank++) {
+    table.DrawRowBorder(currRow);
+    for (int deviceId = 0; deviceId < numGpuDevices; deviceId++) {
+      size_t totalCycles = 0;
+      std::vector<uint64_t> timestamps(numXccs, 0);
+
+      for (int iteration = 0; iteration < ev.numIterations; iteration++) {
+        if (rank == myRank) timestamps = results[deviceId][iteration];
+        TransferBench::System::Get().Broadcast(rank, numXccs * sizeof(uint64_t), timestamps.data());
+
+        const auto [min,max] = std::minmax_element(timestamps.begin(), timestamps.end());
+
+        uint64_t cycles = (*max - *min);
+        totalCycles += cycles;
+
+        if (ev.showIterations) {
+          currCol = 0;
+          table.Set(currRow, currCol++, "%d", rank);
+          table.Set(currRow, currCol++, "%d", deviceId);
+          table.Set(currRow, currCol++, "%d", iteration);
+          table.Set(currRow, currCol++, "%lu", cycles);
+          table.Set(currRow, currCol++, "%.2f", cycles * uSecPerCycle);
+          for (int i = 0; i < numXccs; i++) {
+            table.Set(currRow, currCol++, "%lu", timestamps[i] - (ev.showIterations > 1 ?
0 : *min));
+          }
+          currRow++;
+        }
+      }
+
+      double avgCycles = totalCycles * 1.0 / ev.numIterations;
+      minDelta = std::min(minDelta, avgCycles);
+      maxDelta = std::max(maxDelta, avgCycles);
+      currCol = 0;
+      table.Set(currRow, currCol++, "%d", rank);
+      table.Set(currRow, currCol++, "%d", deviceId);
+      table.Set(currRow, currCol++, "AVG");
+      table.Set(currRow, currCol++, "%.2f", avgCycles);
+      table.Set(currRow, currCol++, "%.2f", avgCycles * uSecPerCycle);
+      currRow++;
+    }
+  }
+
+  table.PrintTable(ev.outputToCsv, ev.showBorders);
+
+  Utils::Print("\n");
+  Utils::Print("Minimum Delta detected: %.2f cycles (%.2f usec)\n", minDelta, minDelta * uSecPerCycle);
+  Utils::Print("Maximum Delta detected: %.2f cycles (%.2f usec)\n", maxDelta, maxDelta * uSecPerCycle);
+
+  if (Utils::HasDuplicateHostname()) {
+    Utils::Print("[WARN] It is recommended to run TransferBench with one rank per host to avoid potential aliasing of executors\n");
+  }
+  return ERR_NONE;
+}
+
+#if defined(__NVCC__)
+#undef hipDeviceSynchronize
+#undef hipMemset
+#endif
diff --git a/src/client/Topology.hpp b/src/client/Topology.hpp
index 52de4aca..180b65ba 100644
--- a/src/client/Topology.hpp
+++ b/src/client/Topology.hpp
@@ -215,17 +215,16 @@ void DisplayMultiRankTopology(bool outputToCsv, bool showBorders)
     Utils::GroupKey const& key = group.first;
     std::vector<std::string> const& hosts = group.second;
 
-    std::string ppodId                = std::get<0>(key);
-    int vpodId                        = std::get<1>(key);
-    std::vector<std::string> cpuNames = std::get<2>(key);
-    std::vector<int> cpuSubExecs      = std::get<3>(key);
-    std::vector<std::string> gpuNames = std::get<4>(key);
-    std::vector<int> gpuSubExecs      = std::get<5>(key);
-    std::vector<int> gpuClosestCpu    = std::get<6>(key);
-    std::vector<std::string> nicNames = std::get<7>(key);
-    std::vector<int> nicClosestCpu    = std::get<8>(key);
-    std::vector<int> nicClosestGpu    = std::get<9>(key);
-    std::vector<bool> nicIsActive     = std::get<10>(key);
+    int64_t podId                     = std::get<0>(key);
+    std::vector<std::string> cpuNames = std::get<1>(key);
+    std::vector<int> cpuSubExecs      = std::get<2>(key);
+    std::vector<std::string> gpuNames = std::get<3>(key);
+    std::vector<int> gpuSubExecs      = std::get<4>(key);
+    std::vector<int> gpuClosestCpu    = std::get<5>(key);
+    std::vector<std::string> nicNames = std::get<6>(key);
+    std::vector<int> nicClosestCpu    = std::get<7>(key);
+    std::vector<int> nicClosestGpu    = std::get<8>(key);
+    std::vector<bool> nicIsActive     = std::get<9>(key);
 
     int numRanks = hosts.size();
     int numCpus  = cpuNames.size();
@@ -240,7 +239,7 @@ void DisplayMultiRankTopology(bool outputToCsv, bool showBorders)
               groupNum++, numRanks, numCpus, numGpus, numNics, numActiveNics);
 
     // Determine size of table
-    int numCols = 7;
+    int numCols = 6;
     int numRows = 1 + std::max(numRanks, numExecutors);
 
     TransferBench::Utils::TableHelper table(numRows, numCols);
@@ -257,10 +256,9 @@ void DisplayMultiRankTopology(bool outputToCsv, bool showBorders)
     table.Set(0, 0, " Rank ");
     table.Set(0, 1, " Hostname ");
     table.Set(0, 2, " POD ");
-    table.Set(0, 3, " VID ");
-    table.Set(0, 4, " Executor ");
-    table.Set(0, 5, " Executor Name ");
-    table.Set(0, 6, " #SE ");
+    table.Set(0, 3, " Executor ");
+    table.Set(0, 4, " Executor Name ");
+    table.Set(0, 5, " #SE ");
 
     // Fill in ranks / hosts
     for (int i = 0; i < numRanks; i++) {
@@ -270,31 +268,30 @@ void DisplayMultiRankTopology(bool outputToCsv, bool showBorders)
     }
 
     // Fill in PPOD and VPOD
-    table.Set(1, 2, " %s ", ppodId.c_str());
-    table.Set(1, 3, " %d ", vpodId);
+    table.Set(1, 2, " %ld ", podId);
 
     // Fill in Executor information
     int rowIdx = 1;
     for (int cpuIndex = 0; cpuIndex < numCpus; cpuIndex++) {
-      table.Set(rowIdx, 4, " CPU %02d ", cpuIndex);
-      table.Set(rowIdx, 5, " %s
", cpuNames[cpuIndex].c_str()); - table.Set(rowIdx, 6, " %d ", cpuSubExecs[cpuIndex]); + table.Set(rowIdx, 3, " CPU %02d ", cpuIndex); + table.Set(rowIdx, 4, " %s ", cpuNames[cpuIndex].c_str()); + table.Set(rowIdx, 5, " %d ", cpuSubExecs[cpuIndex]); rowIdx++; // Loop over each GPU closest to this CPU executor for (int gpuIndex = 0; gpuIndex < numGpus; gpuIndex++) { if (gpuClosestCpu[gpuIndex] != cpuIndex) continue; - table.Set(rowIdx, 4, " - GPU %02d ", gpuIndex); - table.Set(rowIdx, 5, " - %s ", gpuNames[gpuIndex].c_str()); - table.Set(rowIdx, 6, " %d ", gpuSubExecs[gpuIndex]); + table.Set(rowIdx, 3, " - GPU %02d ", gpuIndex); + table.Set(rowIdx, 4, " - %s ", gpuNames[gpuIndex].c_str()); + table.Set(rowIdx, 5, " %d ", gpuSubExecs[gpuIndex]); rowIdx++; // Loop over each NIC closest to this GPU for (int nicIndex = 0; nicIndex < numNics; nicIndex++) { if (nicClosestGpu[nicIndex] != gpuIndex) continue; - table.Set(rowIdx, 4, " - NIC %02d ", nicIndex); - table.Set(rowIdx, 5, " - %s", nicNames[nicIndex].c_str()); - table.Set(rowIdx, 6, " %s ", nicIsActive[nicIndex] ? "ON" : "OFF"); + table.Set(rowIdx, 3, " - NIC %02d ", nicIndex); + table.Set(rowIdx, 4, " - %s", nicNames[nicIndex].c_str()); + table.Set(rowIdx, 5, " %s ", nicIsActive[nicIndex] ? "ON" : "OFF"); rowIdx++; } } @@ -302,9 +299,9 @@ void DisplayMultiRankTopology(bool outputToCsv, bool showBorders) // Loop over remaining NICs not associated with GPU but associated with this CPU for (int nicIndex = 0; nicIndex < numNics; nicIndex++) { if (nicClosestGpu[nicIndex] != -1 || nicClosestCpu[nicIndex] != cpuIndex) continue; - table.Set(rowIdx, 4, " - NIC %02d ", nicIndex); - table.Set(rowIdx, 5, " - %s ", nicNames[nicIndex].c_str()); - table.Set(rowIdx, 6, " %s ", nicIsActive[nicIndex] ? "ON" : "OFF"); + table.Set(rowIdx, 3, " - NIC %02d ", nicIndex); + table.Set(rowIdx, 4, " - %s ", nicNames[nicIndex].c_str()); + table.Set(rowIdx, 5, " %s ", nicIsActive[nicIndex] ? "ON" : "OFF"); rowIdx++; } } diff --git a/src/client/Utilities.hpp b/src/client/Utilities.hpp index 0ba93fc6..5fa4de97 100644 --- a/src/client/Utilities.hpp +++ b/src/client/Utilities.hpp @@ -21,13 +21,30 @@ THE SOFTWARE. */ #pragma once +#include +#include #include #include #include +#include +#include "EnvVars.hpp" #include "TransferBench.hpp" namespace TransferBench::Utils { + // Linear interpolation on sorted samples (same ordering as common empirical quantiles with (n-1) indexing). 
+  inline double PercentileDurationMsecFromSorted(std::vector<double> const& sortedAsc, int pct)
+  {
+    size_t const n = sortedAsc.size();
+    if (n == 0)
+      return 0.0;
+    double const pos  = (static_cast<double>(pct) / 100.0) * static_cast<double>(n - 1);
+    size_t const lo   = static_cast<size_t>(std::floor(pos));
+    size_t const hi   = static_cast<size_t>(std::ceil(pos));
+    double const frac = pos - std::floor(pos);
+    return sortedAsc[lo] * (1.0 - frac) + sortedAsc[hi] * frac;
+  }
+
   // Helper class to help format tabular data / output to CSV
   class TableHelper
   {
@@ -85,8 +102,7 @@ namespace TransferBench::Utils
 
   // Group information
   typedef std::tuple<
-    std::string,              // RackId
-    int,                      // VPod
+    int64_t,                  // Pod Index
    std::vector<std::string>, // CPU Names
    std::vector<int>,         // CPU #Subexecutors
    std::vector<std::string>, // GPU Names
@@ -99,12 +115,16 @@ namespace TransferBench::Utils
   > GroupKey;
 
   typedef std::map<GroupKey, std::vector<std::string>> RankGroupMap;
+  typedef std::map<int64_t, std::vector<int>> RankPerPodMap;
 
   // Get information about how ranks can be organized into homogeneous groups
   RankGroupMap& GetRankGroupMap();
 
   // Return the number of homogeneous groups of ranks
-  int numRankGroups();
+  int GetNumRankGroups();
+
+  // Helper function for pod membership
+  RankPerPodMap& GetRankPerPodMap();
 
   // Helper function to convert an ExeType to a string
   std::string ExeTypeToStr(ExeType exeType);
@@ -147,6 +167,29 @@ namespace TransferBench::Utils
   std::string GetAllGpuMemTypeStr();
   std::string GetAllMemTypeStr(bool isCpu);
 
+  // Helper forwarders to allocation/deallocation functions
+  // Returns true if error occurs
+  bool AllocateMemory(MemDevice memDevice, size_t numBytes, void** memPtr);
+  bool DeallocateMemory(MemType memType, void *memPtr, size_t const bytes);
+
+  // Reorder elements of list by stepping through with stride k, wrapping around.
+  // When gcd(k, n) > 1 the single cycle breaks into gcd(k, n) orbits which are
+  // concatenated, so every element appears exactly once in the output.
+  // The reordered list will be further separated into different groups.
+  void StrideGenerate(std::vector<int>& list, int k);
+
+  // Returns a schedule of round robin pairing of N elements, using Circle Method.
+  // If parallel, each round contains N/2 pairs, otherwise serial.
+  void RoundRobinSchedule(std::vector<std::vector<std::pair<int, int>>>& schedule,
+                          int N, int parallel = 0);
+
+  // Returns a schedule for ordered 2-combination of N elements
+  // by pairing the list with its rotating self.
+  // Each round contains n pairs, where 1 <= n <= N and N is divisible by n,
+  // and an element cannot appear more than twice in a round.
+  void CombinationSchedule(std::vector<std::vector<std::pair<int, int>>>& schedule,
+                           int N, int n = 0);
+
   // Implementation details below
   //================================================================
   TableHelper::TableHelper(int numRows, int numCols, int precision) :
@@ -248,9 +291,9 @@ namespace TransferBench::Utils
     std::string borders[16] = {" ", "│", "│", "│",
-                               "─", "┘", "┐", "┤",
-                               "─", "└", "┌", "├",
-                               "─", "┴", "┬", "┼"};
+                               "-", "┘", "┐", "┤",
+                               "-", "└", "┌", "├",
+                               "-", "┴", "┬", "┼"};
 
     int mask;
     for (int rowIdx = 0; rowIdx <= numRows; rowIdx++) {
@@ -264,7 +307,7 @@ namespace TransferBench::Utils
         if (rowBorders[rowIdx].count(colIdx  )) mask |= BORDER_RIGHT;
         Print("%s", borders[mask].c_str());
         if (colIdx < numCols) {
-          std::string ch = rowBorders[rowIdx].count(colIdx) ?
"-" : " "; for (int i = 0; i < colWidth[colIdx]; i++) Print("%s", ch.c_str()); } } @@ -303,8 +346,7 @@ namespace TransferBench::Utils // Build GroupKey for each rank for (int rank = 0; rank < TransferBench::GetNumRanks(); rank++) { - std::string ppodId = TransferBench::GetPpodId(rank); - int vpodId = TransferBench::GetVpodId(rank); + int64_t podId = TransferBench::GetPodIdx(rank); // CPU information int numCpus = TransferBench::GetNumExecutors(EXE_CPU, rank); @@ -350,7 +392,7 @@ namespace TransferBench::Utils nicIsActive.push_back(TransferBench::NicIsActive(exeIndex, rank)); } - GroupKey key(ppodId, vpodId, + GroupKey key(podId, cpuNames, cpuNumSubExecs, gpuNames, gpuNumSubExecs, gpuClosestCpu, nicNames, nicClosestCpu, nicClosestGpu, nicIsActive); @@ -367,16 +409,32 @@ namespace TransferBench::Utils return GetRankGroupMap().size(); } + RankPerPodMap& GetRankPerPodMap() + { + static RankPerPodMap pods; + static bool initialized = false; + + if (!initialized) { + for (int rank = 0; rank < TransferBench::GetNumRanks(); rank++) { + int64_t const podId = TransferBench::GetPodIdx(rank); + if (podId == -1) continue; + pods[podId].push_back(rank); + } + initialized = true; + } + return pods; + } // Helper function to convert an ExeType to a string std::string ExeTypeToStr(ExeType exeType) { switch (exeType) { - case EXE_CPU: return "CPU"; - case EXE_GPU_GFX: return "GPU"; - case EXE_GPU_DMA: return "DMA"; - case EXE_NIC: return "NIC"; - case EXE_NIC_NEAREST: return "NIC"; - default: return "N/A"; + case EXE_CPU: return "CPU"; + case EXE_GPU_GFX: return "GPU"; + case EXE_GPU_DMA: return "DMA"; + case EXE_NIC: return "NIC"; + case EXE_NIC_NEAREST: return "NIC"; + case EXE_GPU_BDMA: return "BMA"; + default: return "N/A"; } } @@ -394,6 +452,46 @@ namespace TransferBench::Utils return ss.str(); } + template + struct is_std_vector : std::false_type {}; + + template + struct is_std_vector> : std::true_type {}; + + // This function can be used to check if a value is identical across ranks + template + bool IsUniform(const T& val) { + if constexpr (is_std_vector::value) { + using Elem = typename T::value_type; + static_assert(std::is_trivially_copyable_v, "vector element must be trivially copyable"); + + size_t size = val.size(); + size_t rootSize = size; + System::Get().Broadcast(0, sizeof(rootSize), &rootSize); + if (size != rootSize) return false; + + std::vector ref = val; + System::Get().Broadcast(0, rootSize * sizeof(Elem), ref.data()); + + return (std::memcmp(ref.data(), val.data(), rootSize * sizeof(Elem)) == 0); + } else { + static_assert(std::is_trivially_copyable_v, "Type must be trivially copyable"); + T ref = val; + System::Get().Broadcast(0, sizeof(T), &ref); + + return (std::memcmp(&ref, &val, sizeof(T)) == 0); + } + } + + // Macro for use in presets that will return 1 if a value is not uniform across ranks +#define IS_UNIFORM(val, name) \ + do { \ + if (!Utils::IsUniform(val)) { \ + Utils::Print("[ERROR] %s must be uniform across all ranks\n", name); \ + return 1; \ + } \ + } while(0) + // Helper function to determine if current rank does output bool RankDoesOutput() { @@ -457,10 +555,13 @@ namespace TransferBench::Utils for (auto const& exeInfoPair : results.exeResults) { ExeResult const& exeResult = exeInfoPair.second; numRows += 1 + exeResult.transferIdx.size(); + if (!ev.showPercentiles.empty()) { + numRows += static_cast(ev.showPercentiles.size()) * static_cast(exeResult.transferIdx.size()); + } if (ev.showIterations) { - numRows += (numTimedIterations + 1); - - // Check that 
per-iteration information exists
+       numRows += (numTimedIterations + 1) * exeResult.transferIdx.size();
+     }
+     if (ev.showIterations || !ev.showPercentiles.empty()) {
        for (int idx : exeResult.transferIdx) {
          TransferResult const& r = results.tfrResults[idx];
          if (r.perIterMsec.size() != numTimedIterations) {
@@ -472,7 +573,9 @@
        }
      }
    }

-   TableHelper table(numRows, numCols);
+   int showNumIterations = (ev.numIterations < 0) ? 1 : 0;
+
+   TableHelper table(numRows+showNumIterations, numCols);
    for (int col = 1; col < numCols; col++)
      table.DrawColBorder(col);
@@ -506,9 +609,9 @@
      TransferResult const& r = results.tfrResults[idx];

      table.Set(rowIdx, 0, "Transfer %-4d ", idx);
-     table.Set(rowIdx, 1, "%8.3f GB/s " , r.avgBandwidthGbPerSec);
-     table.Set(rowIdx, 2, "%8.3f ms "   , r.avgDurationMsec);
-     table.Set(rowIdx, 3, "%12lu bytes ", r.numBytes);
+     table.Set(rowIdx, 1, "%8.3f GB/s " , r.avgBandwidthGbPerSec);
+     table.Set(rowIdx, 2, "%8.3f ms "   , r.avgDurationMsec);
+     table.Set(rowIdx, 3, "%12lu bytes ", r.numBytes);

      char exeSubIndexStr[32] = "";
      if (t.exeSubIndex != -1)
@@ -587,6 +690,24 @@
          rowIdx++;
          table.DrawRowBorder(rowIdx);
        }
+
+       // Show percentiles
+       if (!ev.showPercentiles.empty()) {
+         std::vector<double> sortedDur = r.perIterMsec;
+         std::sort(sortedDur.begin(), sortedDur.end());
+         for (int pct : ev.showPercentiles) {
+           double dur = PercentileDurationMsecFromSorted(sortedDur, pct);
+           double bwGbs = dur > 0.0 ? (t.numBytes / 1.0E9) / dur * 1000.0 : 0.0;
+           table.Set(rowIdx, 0, "p%d ", pct);
+           table.Set(rowIdx, 1, "%8.3f GB/s ", bwGbs);
+           table.Set(rowIdx, 2, "%8.3f ms ", dur);
+           table.Set(rowIdx, 3, " ");
+           table.Set(rowIdx, 4, " ");
+           table.SetCellAlignment(rowIdx, 4, TableHelper::ALIGN_LEFT);
+           rowIdx++;
+         }
+       }
+
      }
    }
    table.DrawRowBorder(rowIdx);
@@ -596,8 +717,21 @@
    table.Set(rowIdx, 3, "%12lu bytes " , results.totalBytesTransferred);
    table.Set(rowIdx, 4, " Overhead %.3f ms", results.overheadMsec);
    table.SetCellAlignment(rowIdx, 4, TableHelper::ALIGN_LEFT);
-   table.DrawRowBorder(rowIdx + 1);
+   table.DrawRowBorder(rowIdx+1);
+   if (showNumIterations) {
+     rowIdx++;
+     table.Set(rowIdx, 0, "# Iters Run:");
+     table.Set(rowIdx, 1, "%lu ", numTimedIterations);
+     table.SetCellAlignment(rowIdx, 1, TableHelper::ALIGN_LEFT);
+     table.SetCellBorder(rowIdx, 0, 0);
+     table.SetCellBorder(rowIdx, 1, 0);
+     table.SetCellBorder(rowIdx, 2, 0);
+     table.SetCellBorder(rowIdx, 3, 0);
+     table.SetCellBorder(rowIdx, 4, 0);
+     table.DrawRowBorder(rowIdx);
+     table.DrawRowBorder(rowIdx+1);
+   }

    table.PrintTable(ev.outputToCsv, ev.showBorders);
  }
@@ -682,4 +816,122 @@ namespace TransferBench::Utils
  {
    return isCpu ?
GetAllCpuMemTypeStr() : GetAllGpuMemTypeStr();
  }

+ bool AllocateMemory(MemDevice memDevice, size_t numBytes, void** memPtr)
+ {
+   return (TransferBench::AllocateMemory(memDevice, numBytes, memPtr).errType != TransferBench::ERR_NONE);
+ }
+
+ bool DeallocateMemory(MemType memType, void *memPtr, size_t const bytes)
+ {
+   return (TransferBench::DeallocateMemory(memType, memPtr, bytes).errType != TransferBench::ERR_NONE);
+ }
+
+ void StrideGenerate(std::vector<int>& list, int k)
+ {
+   int n = list.size();
+   if (n == 0) return;
+   k = ((k % n) + n) % n; // normalize to 0..n-1
+   if (k == 0) return;
+
+   int d = std::gcd(k, n);
+   std::vector<int> out;
+   out.reserve(n);
+
+   for (int s = 0; s < d; s++) {
+     for (int j = 0; j < n / d; j++) {
+       out.push_back(list[(s + j * k) % n]);
+     }
+   }
+   list = std::move(out);
+ }
+
+ void RoundRobinSchedule(std::vector<std::vector<std::pair<int, int>>>& schedule,
+                         int N, int parallel)
+ {
+   if (N == 1) {
+     schedule.push_back({{0, 0}});
+     return;
+   }
+   // Generate standard round-robin tournament (maximum parallelism)
+   std::vector<std::vector<std::pair<int, int>>> fullSchedule;
+
+   // Pad an odd number of ranks with a dummy element so the padded count is even
+   int paddedN = N + N % 2;
+   // Round-robin tournament scheduling
+   for (int round = 0; round < paddedN - 1; round++) {
+     std::vector<std::pair<int, int>> roundPairs;
+     std::vector<std::pair<int, int>> roundPairsReversed;
+     for (int i = 0; i < paddedN / 2; i++) {
+       int item1 = i;
+       int item2 = paddedN - 1 - i;
+       if (round > 0) {
+         // Rotate all except the first item
+         if (item1 > 0) item1 = ((item1 - 1 + round) % (paddedN - 1)) + 1;
+         if (item2 > 0) item2 = ((item2 - 1 + round) % (paddedN - 1)) + 1;
+       }
+       // Ignore pairs involving the dummy element; its partner sits out this round
+       if (item1 < N && item2 < N) {
+         roundPairs.push_back({item1, item2});
+         roundPairsReversed.push_back({item2, item1});
+       }
+     }
+     fullSchedule.push_back(roundPairs);
+     fullSchedule.push_back(roundPairsReversed);
+   }
+
+   // A loopback round where all run in parallel
+   std::vector<std::pair<int, int>> selfRound;
+   for (int i = 0; i < N; i++) {
+     selfRound.push_back({i, i});
+   }
+   fullSchedule.push_back(selfRound);
+
+   if (parallel) {
+     schedule = std::move(fullSchedule);
+   } else {
+     // Serialize each round if needed
+     for (auto const& fullRound : fullSchedule) {
+       for (auto const& match : fullRound) {
+         std::vector<std::pair<int, int>> subRound;
+         subRound.push_back({match.first, match.second});
+         schedule.push_back(subRound);
+       }
+     }
+   }
+ }
+
+ void CombinationSchedule(std::vector<std::vector<std::pair<int, int>>>& schedule,
+                          int N, int n)
+ {
+   std::vector<std::vector<std::pair<int, int>>> fullSchedule;
+
+   if (n <= 0) n = N;
+   if (N <= 0 || n > N || N % n != 0) // Assuming balanced load for each round
+   {
+     n = 1;
+     Print("[WARN] cannot create combination schedule, falling back to serial\n");
+   }
+
+   // Generate full rounds of combinations based on increasing rotation distance
+   for (int i = 0; i < N; i++) {
+     std::vector<std::pair<int, int>> round;
+     for (int j = 0; j < N; j++) {
+       round.push_back({j, (j + i) % N});
+     }
+     fullSchedule.push_back(round);
+   }
+
+   // Split each full round into sub-rounds with at most n pairs
+   for (auto const& fullRound : fullSchedule) {
+     for (size_t start = 0; start < fullRound.size(); start += n) {
+       std::vector<std::pair<int, int>> subRound;
+       for (size_t i = start; i < start + n && i < fullRound.size(); i++) {
+         subRound.push_back(fullRound[i]);
+       }
+       if (!subRound.empty()) {
+         schedule.push_back(subRound);
+       }
+     }
+   }
+ }
};
diff --git a/src/header/TransferBench.hpp b/src/header/TransferBench.hpp
index 7b74dc5b..b16c587d 100644
--- a/src/header/TransferBench.hpp
+++ b/src/header/TransferBench.hpp
@@ -27,12 +27,15 @@ THE SOFTWARE.
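To make the schedule helpers defined above concrete, here are hand-expanded outputs (editorial illustration computed from the code, not part of the patch):

// StrideGenerate({0,1,2,3,4,5}, k = 2): gcd(2,6) = 2 orbits -> {0,2,4, 1,3,5}
// RoundRobinSchedule(schedule, N = 4, parallel = 1) yields 7 rounds:
//   {(0,3),(1,2)}  {(3,0),(2,1)}  {(0,1),(2,3)}  {(1,0),(3,2)}
//   {(0,2),(3,1)}  {(2,0),(1,3)}  plus the loopback {(0,0),(1,1),(2,2),(3,3)}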
 #include
 #include
 #include
+#include
 #include
 #include
 #include
 #include
 #include
 #include
 #include
+#include
+#include
 #include
 #include <numa.h>   // If not found, try installing libnuma-dev (e.g apt-get install libnuma-dev)
 #include <numaif.h>
@@ -61,16 +64,27 @@
 #endif

 #if defined(__NVCC__)
+#include <cuda.h>
 #include <cuda_runtime.h>
+#ifdef NVML_ENABLED
 #include <nvml.h>
+#endif
 #else
-#include <hip/hip_ext.h>
-#include <hip/hip_runtime.h>
-#include <hsa/hsa.h>
-#include <hsa/hsa_ext_amd.h>
+#include "hip/hip_ext.h"
+#include "hip/hip_runtime.h"
+#include "hsa/hsa.h"
+#include "hsa/hsa_ext_amd.h"
+#ifdef AMD_SMI_ENABLED
+#include "amd_smi/amdsmi.h"
+#endif
 #endif
 /// @endcond

+// Batched DMA executor is only supported with HIP >= 7.1 or CUDA >= 12.8
+#if (defined(HIP_VERSION) && (HIP_VERSION >= 70100000)) || (defined(CUDA_VERSION) && (CUDA_VERSION >= 12080))
+#define BMA_EXEC_ENABLED
+#endif
+
 namespace TransferBench
 {
   using std::map;
@@ -78,7 +92,7 @@
   using std::set;
   using std::vector;

-  constexpr char VERSION[] = "1.66";
+  constexpr char VERSION[] = "1.67";

   /**
    * Enumeration of supported Executor types
    */
@@ -91,11 +105,12 @@
     EXE_GPU_GFX     = 1, ///< GPU kernel-based executor   (subExecutor = threadblock/CU)
     EXE_GPU_DMA     = 2, ///< GPU SDMA executor           (subExecutor = not supported)
     EXE_NIC         = 3, ///< NIC RDMA executor           (subExecutor = queue pair)
-    EXE_NIC_NEAREST = 4  ///< NIC RDMA nearest executor   (subExecutor = queue pair)
+    EXE_NIC_NEAREST = 4, ///< NIC RDMA nearest executor   (subExecutor = queue pair)
+    EXE_GPU_BDMA    = 5, ///< GPU Batched SDMA executor   (subExecutor = batch item)
   };
-  char const ExeTypeStr[6] = "CGDIN";
+  char const ExeTypeStr[7] = "CGDINB";
   inline bool IsCpuExeType(ExeType e){ return e == EXE_CPU; }
-  inline bool IsGpuExeType(ExeType e){ return e == EXE_GPU_GFX || e == EXE_GPU_DMA; }
+  inline bool IsGpuExeType(ExeType e){ return e == EXE_GPU_GFX || e == EXE_GPU_DMA || e == EXE_GPU_BDMA; }
   inline bool IsNicExeType(ExeType e){ return e == EXE_NIC || e == EXE_NIC_NEAREST; }

   /**
@@ -139,6 +154,17 @@
   inline bool IsCpuMemType(MemType m) { return (MEM_CPU <= m && m <= MEM_CPU_UNPINNED);}
   inline bool IsGpuMemType(MemType m) { return (MEM_GPU <= m && m <= MEM_MANAGED);}

+  /**
+   * Enumeration of supported GFX kernels
+   */
+  enum GfxKernelType
+  {
+    GFX_KERNEL_AUTO   = -1, ///< Automatically choose a kernel
+    GFX_KERNEL_REDUCE =  0, ///< Default kernel that supports any number of input/output buffers
+    GFX_KERNEL_COPY   =  1, ///< Simpler kernel that supports copies only
+    NUM_GFX_KERNELS   =  2  ///< Number of GFX kernels currently supported
+  };
+
   /**
    * A MemDevice indicates a memory type on a specific device
    */
@@ -208,6 +234,7 @@
     int blockOrder = 0;                      ///< Determines how threadblocks are ordered (0=sequential, 1=interleaved, 2=random)
     int blockSize  = 256;                    ///< Size of each threadblock (must be multiple of 64)
     vector<uint32_t> cuMask = {};            ///< Bit-vector representing the CU mask
+    int gfxKernel = 0;                       ///< Kernel selector: -1=auto, 0=reduce, 1=copy-only
     vector<vector<int>> prefXccTable = {};   ///< 2D table with preferred XCD to use for a specific [src][dst] GPU device
     int seType = 0;                          ///< SubExecutor granularity type (0=threadblock, 1=warp)
     int temporalMode = 0;                    ///< Non-temporal load/store mode 0=none, 1=load, 2=store, 3=both
@@ -234,6 +261,7 @@
   struct NicOptions
   {
     size_t chunkBytes = 1<<30;     ///< How much bytes to transfer at a time
+    int cqPollBatch = 4;           ///< Maximum CQ entries polled per call
     int ibGidIndex = -1;           ///< GID Index for RoCE NICs (-1 is auto)
     uint8_t ibPort = 1;            ///< NIC port number to be used
    int
ipAddressFamily = 4;   ///< 4=IPv4, 6=IPv6 (used for auto GID detection)
@@ -315,6 +343,7 @@
     ErrResult() = default;
 #if defined(__NVCC__)
     ErrResult(cudaError_t err);
+    ErrResult(CUresult err);
 #else
     ErrResult(hipError_t err);
     ErrResult(hsa_status_t err);
@@ -381,30 +410,11 @@
                           vector<Transfer> const& transfers,
                           TestResults& results);

-  /**
-   * Enumeration of implementation attributes
-   */
-  enum IntAttribute
-  {
-    ATR_GFX_MAX_BLOCKSIZE, ///< Maximum blocksize for GFX executor
-    ATR_GFX_MAX_UNROLL,    ///< Maximum unroll factor for GFX executor
-  };

   enum StrAttribute
   {
     ATR_SRC_PREP_DESCRIPTION ///< Description of how source memory is prepared
   };

-  /**
-   * Query attributes (integer)
-   *
-   * @note This allows querying of implementation information such as limits
-   *
-   * @param[in] attribute Attribute to query
-   * @returns Value of the attribute
-   */
-  int GetIntAttribute(IntAttribute attribute);

   /**
    * Query attributes (string)
    *
@@ -547,16 +557,17 @@
   std::string GetHostname(int targetRank = -1);

   /**
-   * @param[in] targetRank Rank to query (-1 for local rank)
-   * @returns The physical pod identifier for the target rank
+   * @param[in] targetRank Rank to query (-1 for local rank)
+   * @returns The unique pod identifier for the target rank, derived from its physical and virtual pod
   **/
-  std::string GetPpodId(int targetRank = -1);
+  int64_t GetPodIdx(int targetRank = -1);

   /**
-   * @param[in] targetRank Rank to query (-1 for local rank)
-   * @returns The virtual pod identifier for the target rank
+   * @param[in] targetRank Remote rank to query
+   * @param[in] sourceRank Base rank to query (-1 for local rank)
+   * @returns Whether source and target ranks belong to the same pod
   **/
-  int GetVpodId(int targetRank = -1);
+  bool IsSamePod(int targetRank, int sourceRank = -1);

   /**
    * @param[in] exeDevice The specific Executor to query
@@ -581,7 +592,7 @@
    */
   ErrResult ParseTransfers(std::string str,
                            std::vector<Transfer>& transfers);
-};
+}
//==========================================================================================
// End of TransferBench API
//==========================================================================================
@@ -599,6 +610,10 @@
   #define hipError_t  cudaError_t
   #define hipEvent_t  cudaEvent_t
   #define hipStream_t cudaStream_t
+  #define hipMemAllocationProp             CUmemAllocationProp
+  #define hipMemGenericAllocationHandle_t  CUmemGenericAllocationHandle
+  #define hipMemAccessDesc                 CUmemAccessDesc
+  #define hipMemFabricHandle_t             CUmemFabricHandle

   // Enumerations
   #define hipDeviceAttributeClockRate      cudaDevAttrClockRate
@@ -607,9 +622,15 @@
   #define hipErrorPeerAccessAlreadyEnabled cudaErrorPeerAccessAlreadyEnabled
   #define hipFuncCachePreferShared         cudaFuncCachePreferShared
   #define hipMemcpyDefault                 cudaMemcpyDefault
+  #define hipMemcpyKind                    cudaMemcpyKind
   #define hipMemcpyDeviceToHost            cudaMemcpyDeviceToHost
   #define hipMemcpyHostToDevice            cudaMemcpyHostToDevice
   #define hipSuccess                       cudaSuccess
+  #define hipMemLocationTypeDevice                CU_MEM_LOCATION_TYPE_DEVICE
+  #define hipMemAllocationTypePinned              CU_MEM_ALLOCATION_TYPE_PINNED
+  #define hipMemHandleTypeFabric                  CU_MEM_HANDLE_TYPE_FABRIC
+  #define hipMemAllocationGranularityRecommended  CU_MEM_ALLOC_GRANULARITY_RECOMMENDED
+  #define hipMemAccessFlagsProtReadWrite          CU_MEM_ACCESS_FLAGS_PROT_READWRITE

   // Functions
   #define hipDeviceCanAccessPeer           cudaDeviceCanAccessPeer
@@ -632,12 +653,26 @@
   #define
hipMallocManaged cudaMallocManaged #define hipMemcpy cudaMemcpy #define hipMemcpyAsync cudaMemcpyAsync + #define hipMemcpyBatchAsync cudaMemcpyBatchAsync #define hipMemset cudaMemset #define hipMemsetAsync cudaMemsetAsync #define hipSetDevice cudaSetDevice #define hipStreamCreate cudaStreamCreate #define hipStreamDestroy cudaStreamDestroy #define hipStreamSynchronize cudaStreamSynchronize + // cu* driver API returns CUresult; cast to cudaError_t so callers can use a single error variable +#define hipMemGetAllocationGranularity(...) ((cudaError_t)cuMemGetAllocationGranularity(__VA_ARGS__)) + #define hipMemCreate(...) ((cudaError_t)cuMemCreate(__VA_ARGS__)) + #define hipMemAddressReserve(...) ((cudaError_t)cuMemAddressReserve(__VA_ARGS__)) + #define hipMemMap(...) ((cudaError_t)cuMemMap(__VA_ARGS__)) + #define hipMemSetAccess(...) ((cudaError_t)cuMemSetAccess(__VA_ARGS__)) + #define hipMemUnmap(...) ((cudaError_t)cuMemUnmap(__VA_ARGS__)) + #define hipMemRelease(...) ((cudaError_t)cuMemRelease(__VA_ARGS__)) + #define hipMemAddressFree(...) ((cudaError_t)cuMemAddressFree(__VA_ARGS__)) + #define hipMemExportToShareableHandle(...) ((cudaError_t)cuMemExportToShareableHandle(__VA_ARGS__)) + #define hipMemImportFromShareableHandle(...) ((cudaError_t)cuMemImportFromShareableHandle(__VA_ARGS__)) + + using gpu_device_ptr = CUdeviceptr; // Define float2 addition operator for NVIDIA platform __device__ inline float2& operator +=(float2& a, const float2& b) @@ -656,42 +691,59 @@ namespace TransferBench a.w += b.w; return a; } +#else + using gpu_device_ptr = void*; #endif // Helper macro functions //========================================================================================== // Macro for collecting CU/SM GFX kernel is running on -#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1150__) || defined(__gfx1151__) || defined(__gfx1200__) || defined(__gfx1201__) -#define GetHwId(hwId) hwId = 0 +#if defined(__GFX9__) + #define GetHwId(hwId) asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (hwId)) +#elif defined(__GFX10__) || defined(__GFX11__) || defined(__GFX12__) + #define GetHwId(hwId) asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID1)" : "=s" (hwId)) #elif defined(__NVCC__) -#define GetHwId(hwId) asm("mov.u32 %0, %smid;" : "=r"(hwId)) + #define GetHwId(hwId) asm("mov.u32 %0, %smid;" : "=r"(hwId)) #else -#define GetHwId(hwId) asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (hwId)); + #define GetHwId(hwId) hwId = 0 #endif // Macro for collecting XCC GFX kernel is running on #if defined(__gfx942__) || defined(__gfx950__) -#define GetXccId(val) asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s" (val)); +#define GetXccId(val) asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s" (val)) +#elif defined(__GFX12__) +#define GetXccId(val) \ + { asm volatile ("s_sendmsg_rtn_b32 %0, 0x87 \n" \ + "s_wait_kmcnt 0" \ + : "=s" (val)); \ + val = ((val >> 16) & 0xF); \ + } #else #define GetXccId(val) val = 0 #endif // Error check macro (NOTE: This will return even for ERR_WARN) -#define ERR_CHECK(cmd) \ - do { \ - ErrResult err = (cmd); \ - if (err.errType != ERR_NONE) \ - return err; \ +#define ERR_CHECK(cmd) \ + do { \ + ErrResult err = (cmd); \ + if (err.errType != ERR_NONE) { \ + err.errMsg += std::string(" [") + __FILE__ + ":" + \ + std::to_string(__LINE__) + " in " + __func__ + "]"; \ + return err; \ + } \ } while (0) // Appends warn/fatal errors to a list, return false if fatal -#define ERR_APPEND(cmd, list) \ - do { \ - 
ErrResult err = (cmd); \ - if (err.errType != ERR_NONE) \ - list.push_back(err); \ - if (err.errType == ERR_FATAL) \ +#define ERR_APPEND(cmd, list) \ + do { \ + ErrResult err = (cmd); \ + if (err.errType != ERR_NONE) { \ + err.errMsg += std::string(" [") + __FILE__ + ":" + \ + std::to_string(__LINE__) + " in " + __func__ + "]"; \ + list.push_back(err); \ + } \ + if (err.errType == ERR_FATAL) \ return false; \ } while (0) @@ -744,9 +796,7 @@ namespace { // Constants //======================================================================================== - int constexpr MAX_BLOCKSIZE = 1024; // Max threadblock size - int constexpr MAX_UNROLL = 8; // Max unroll factor int constexpr MAX_SRCS = 8; // Max srcs per Transfer int constexpr MAX_DSTS = 8; // Max dsts per Transfer int constexpr MEMSET_CHAR = 75; // Value to memset (char) @@ -791,14 +841,15 @@ namespace { * * This supports three possible communication modes - Socket-based, MPI-based, disabled * - * - Will first attempt to use sockets if TB_RANK env var is detected + * - Will first attempt to use sockets when TB_NUM_RANKS is set (>= 2) * - Will then try MPI-based, if compiled with MPI support * - Drop back to single node functionality * - Configuration for socket-based communicator is read via environment variables - * - TB_RANK: Rank of this process (0-based) - * - TB_NUM_RANKS: Total number of processes - * - TB_MASTER_ADDR: IP address of rank 0 + * - TB_NUM_RANKS: Total number of processes (only variable required on rank 0; rank 0 logs how workers should connect) + * - TB_RANK: Rank of this process (0-based); defaults to 0 if unset or empty + * - TB_MASTER_ADDR: Rank 0 address for workers to connect; optional on rank 0 (auto-detected IPv4 after listen) + * - TB_MASTER_IFACE: Optional interface name when auto-detecting rank-0 address (e.g. eth0) * - TB_MASTER_PORT: Port for communication (default: 29500) */ class System @@ -826,6 +877,18 @@ namespace { bool& IsVerbose() { return verbose; } + /** + * Helper logging function that logs only on output ranks + * - In MPI mode - Rank 0 only + * - In socket mode - All ranks unless TB_SINGLE_LOG=1 + */ + void Log(const char* format, ...) 
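A hypothetical launch of the socket communicator documented above (the binary path and preset name are placeholders; only the environment variables come from the code):

//   rank 0:  TB_NUM_RANKS=2 ./TransferBench p2p
//   rank 1:  TB_NUM_RANKS=2 TB_RANK=1 TB_MASTER_ADDR=<rank 0 address> ./TransferBench p2p
//   TB_MASTER_PORT (default 29500) and TB_MASTER_IFACE may additionally be set when the
//   auto-detected rank-0 IPv4 address or interface is not the desired one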
const;
+
+    /**
+     * Helper function that logs Transfers being executed to a config file
+     */
+    void LogTransfers(std::vector<Transfer> const& transfers);
+
     // Communication functions
     /**
      * Barrier that all ranks must arrive at before proceeding
      */
@@ -949,8 +1012,8 @@
     void GetClosestGpusToNic(std::vector<int>& gpuIndices, int nicIndex, int targetRank = -1) const;
     std::string GetHostname(int targetRank) const;
-    std::string GetPpodId(int targetRank) const;
-    int GetVpodId(int targetRank) const;
+    int64_t GetPodIdx(int targetRank) const;
+    bool IsSamePod(int targetRank, int sourceRank) const;
     std::string GetExecutorName(ExeDevice exeDevice) const;
     int NicIsActive(int nicIndex, int targetRank) const;
@@ -977,6 +1040,8 @@
     int rank;
     int numRanks;
     bool verbose = false;
+    bool rankDoesOutput = true;
+    FILE* dumpCfgFile = nullptr;

 #if !defined(__NVCC__)
     std::vector<hsa_agent_t> cpuAgents;
@@ -999,9 +1064,9 @@
     // Topology related
     struct RankTopology
     {
-      char hostname[33];
-      char ppodId[256];
-      int  vpodId;
+      char    hostname[33];
+      char    ppodId[16];
+      int64_t vpodId;

       std::map<ExeType, int> numExecutors;
       std::map<std::pair<ExeType, int>, int> numExecutorSubIndices;
@@ -1018,6 +1083,7 @@
     void SetupSocketCommunicator();
     void SetupMpiCommunicator();
+    void CollectPodMembership(char* ppodId, int64_t& vpodId);
     void GetRankTopology(RankTopology& topo);
     void CollectTopology();
     std::string GetCpuName() const;
@@ -1343,8 +1409,36 @@
     return ERR_NONE;
   }

+#ifdef POD_COMM_ENABLED
+  static ErrResult GetMemAllocationProp(MemDevice const& memDevice, hipMemAllocationProp& prop)
+  {
+    switch (memDevice.memType) {
+    case MEM_CPU: case MEM_CPU_CLOSEST: case MEM_GPU:
+      prop.type = hipMemAllocationTypePinned; break;
+    case MEM_CPU_UNCACHED: case MEM_GPU_UNCACHED:
+#if defined (__NVCC__)
+      return {ERR_FATAL, "Uncached memory type unsupported in CUDA"};
+#else
+      prop.type = hipMemAllocationTypeUncached; break;
+#endif
+    default:
+      return {ERR_FATAL, "Unsupported memory type for pod communication"};
+    }
+
+    prop.requestedHandleTypes = hipMemHandleTypeFabric;
+    // At this point the memory location should always be a device
+    // ERR_CHECK(GetMemLocation(memDevice, prop.location));
+    prop.location.type = hipMemLocationTypeDevice;
+    prop.location.id   = memDevice.memIndex;
+    return ERR_NONE;
+  }
+#endif
+
   // Allocate memory
-  static ErrResult AllocateMemory(MemDevice memDevice, size_t numBytes, void** memPtr, bool isShareable = false)
+  static ErrResult AllocateMemory(MemDevice memDevice, size_t numBytes, void** memPtr,
+                                  size_t* actualBytes = NULL,
+                                  hipMemGenericAllocationHandle_t* memHandle = NULL)
   {
     if (numBytes == 0) {
       return {ERR_FATAL, "Unable to allocate 0 bytes"};
@@ -1352,20 +1446,69 @@
     *memPtr = nullptr;

     MemType const& memType = memDevice.memType;
+    int deviceIdx = memDevice.memIndex;
+    if (memType == MEM_CPU_CLOSEST) {
+      deviceIdx = GetClosestCpuNumaToGpu(memDevice.memIndex);
+    }

-    if (IsCpuMemType(memType)) {
-      // Determine which NUMA device to use
-      int numaIdx = memDevice.memIndex;
-      if (memType == MEM_CPU_CLOSEST) {
-        numaIdx = GetClosestCpuNumaToGpu(memDevice.memIndex);
+    // If memHandle is provided, allocate shareable memory
+    if (memHandle != NULL) {
+#ifdef POD_COMM_ENABLED
+      hipMemAllocationProp prop = {};
+      ERR_CHECK(GetMemAllocationProp(memDevice, prop));
+
+      // Determine recommended allocation granularity
+      size_t granularity;
+      ERR_CHECK(hipMemGetAllocationGranularity(&granularity, &prop,
+                                               hipMemAllocationGranularityRecommended));
+      size_t roundedUpBytes = (numBytes + granularity - 1) / granularity *
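// Illustrative rounding (hypothetical values): numBytes = 1 MiB with a recommended
// granularity of 2 MiB gives (1 MiB + 2 MiB - 1) / 2 MiB * 2 MiB = 2 MiB, i.e. one granule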
granularity; + if (actualBytes != NULL) *actualBytes = roundedUpBytes; + + // Create memory allocation described by properties and size + ERR_CHECK(hipMemCreate(memHandle, roundedUpBytes, &prop, 0)); + + // Reserve a virtual address range for the memory allocation + ERR_CHECK(hipMemAddressReserve((gpu_device_ptr*)memPtr, roundedUpBytes, 0, 0, 0)); + + // Map the allocation handle to the reserved address range + ERR_CHECK(hipMemMap((gpu_device_ptr)*memPtr, roundedUpBytes, 0, *memHandle, 0)); + + // Specify memory access descriptor to enable local read/write + hipMemAccessDesc desc; +// ERR_CHECK(GetMemLocation(memDevice, desc.location)); + desc.location.type = hipMemLocationTypeDevice; + desc.location.id = memDevice.memIndex; + desc.flags = hipMemAccessFlagsProtReadWrite; + + // Set access flags for virtual address range + ERR_CHECK(hipMemSetAccess((gpu_device_ptr)*memPtr, roundedUpBytes, &desc, 1)); + + // Clear the memory + if (IsCpuMemType(memType)) { + memset(*memPtr, 0, roundedUpBytes); + // Check that the allocated pages are actually on the correct NUMA node + ERR_CHECK(CheckPages((char*)*memPtr, roundedUpBytes, deviceIdx)); + } else if (IsGpuMemType(memType)) { + ERR_CHECK(hipSetDevice(memDevice.memIndex)); + ERR_CHECK(hipMemset(*memPtr, 0, numBytes)); + ERR_CHECK(hipDeviceSynchronize()); } + return ERR_NONE; +#else + return {ERR_FATAL, "Unable to allocate sharable memory if not compiled with pod communication support"}; +#endif + } else { + if (actualBytes != NULL) *actualBytes = numBytes; + } + + if (IsCpuMemType(memType)) { // Set NUMA policy prior to call to hipHostMalloc - numa_set_preferred(numaIdx); + numa_set_preferred(deviceIdx); // Allocate host-pinned memory (should respect NUMA mem policy) int flags = 0; -#if !defined(__NVCC__) +#if !defined (__NVCC__) flags |= hipHostMallocNumaUser; #endif if (memType == MEM_CPU || memType == MEM_CPU_CLOSEST) { @@ -1393,12 +1536,12 @@ namespace { #endif #endif } else if (memType == MEM_CPU_UNPINNED) { - *memPtr = numa_alloc_onnode(numBytes, numaIdx); + *memPtr = numa_alloc_onnode(numBytes, deviceIdx); } // Check that the allocated pages are actually on the correct NUMA node memset(*memPtr, 0, numBytes); - ERR_CHECK(CheckPages((char*)*memPtr, numBytes, numaIdx)); + ERR_CHECK(CheckPages((char*)*memPtr, numBytes, deviceIdx)); // Reset to default numa mem policy numa_set_preferred(-1); @@ -1437,30 +1580,44 @@ namespace { } // Deallocate memory - static ErrResult DeallocateMemory(MemType memType, void *memPtr, size_t const bytes) + static ErrResult DeallocateMemory(MemType memType, void *memPtr, size_t const bytes, + hipMemGenericAllocationHandle_t* memHandle = nullptr) { // Avoid deallocating nullptr if (memPtr == nullptr) return {ERR_FATAL, "Attempted to free null pointer for %lu bytes", bytes}; - switch (memType) { - case MEM_CPU: case MEM_CPU_CLOSEST: case MEM_CPU_COHERENT: case MEM_CPU_NONCOHERENT: case MEM_CPU_UNCACHED: - { - ERR_CHECK(hipHostFree(memPtr)); - break; - } - case MEM_CPU_UNPINNED: - { - numa_free(memPtr, bytes); - break; - } - case MEM_GPU : case MEM_GPU_FINE: case MEM_GPU_UNCACHED: case MEM_MANAGED: - { - ERR_CHECK(hipFree(memPtr)); - break; - } - default: - return {ERR_FATAL, "Attempting to deallocate unrecognized memory type (%d)", memType}; + if (memHandle == nullptr || *memHandle == NULL) { + switch (memType) { + case MEM_CPU: case MEM_CPU_CLOSEST: case MEM_CPU_COHERENT: case MEM_CPU_NONCOHERENT: case MEM_CPU_UNCACHED: + { + ERR_CHECK(hipHostFree(memPtr)); + break; + } + case MEM_CPU_UNPINNED: + { + numa_free(memPtr, 
bytes); + break; + } + case MEM_GPU : case MEM_GPU_FINE: case MEM_GPU_UNCACHED: case MEM_MANAGED: + { + ERR_CHECK(hipFree(memPtr)); + break; + } + default: + return {ERR_FATAL, "Attempting to deallocate unrecognized memory type (%d)", memType}; + } + } else { +#ifdef POD_COMM_ENABLED + // Unmap the backing memory of the given virtual address + ERR_CHECK(hipMemUnmap((gpu_device_ptr)memPtr, bytes)); + // Release the backing memory via its handle + ERR_CHECK(hipMemRelease(*memHandle)); + // Free virtual address range reservation + ERR_CHECK(hipMemAddressFree((gpu_device_ptr)memPtr, bytes)); +#else + return {ERR_FATAL, "Unable to deallocate sharable memory if not compiled with pod communication support"}; +#endif } return ERR_NONE; } @@ -1681,7 +1838,7 @@ namespace { { if (GetCommMode() == COMM_NONE) return; if (System::Get().IsVerbose()) { - printf("[INFO] Rank %d checking config consistency\n", GetRank()); + System::Get().Log("[INFO] Rank %d checking config consistency\n", GetRank()); } // To check consistency, compare against rank 0 @@ -1703,10 +1860,14 @@ namespace { // Compare data options { DataOptions data = cfg.data; + // Null out vector members before sizeof-broadcast: vectors carry heap pointers that are + // invalid on other ranks; freeing a remote pointer on scope exit causes a segfault + // These fields are permitted to differ across ranks and are not compared below + decltype(data.fillPattern)().swap(data.fillPattern); + decltype(data.fillCompress)().swap(data.fillCompress); System::Get().Broadcast(root, sizeof(data), &data); // data.alwaysValidate is permitted to be different across ranks - if (data.blockBytes != cfg.data.blockBytes) ADD_ERROR("cfg.data.blockBytes"); if (data.byteOffset != cfg.data.byteOffset) ADD_ERROR("cfg.data.byteOffset"); @@ -1747,10 +1908,14 @@ namespace { // Compare GFX Executor options { GfxOptions gfx = cfg.gfx; + // Null out vector members before sizeof broadcast + decltype(gfx.cuMask)().swap(gfx.cuMask); + decltype(gfx.prefXccTable)().swap(gfx.prefXccTable); System::Get().Broadcast(root, sizeof(gfx), &gfx); if (gfx.blockOrder != cfg.gfx.blockOrder) ADD_ERROR("cfg.gfx.blockOrder"); if (gfx.blockSize != cfg.gfx.blockSize) ADD_ERROR("cfg.gfx.blockSize"); // gfx.cuMask is permitted to be different across ranks + if (gfx.gfxKernel != cfg.gfx.gfxKernel) ADD_ERROR("cfg.gfx.gfxKernel"); // gfx.perfXccTable is permitted to be different across ranks if (gfx.seType != cfg.gfx.seType) ADD_ERROR("cfg.gfx.seType"); if (gfx.temporalMode != cfg.gfx.temporalMode) ADD_ERROR("cfg.gfx.temporalMode"); @@ -1775,6 +1940,7 @@ namespace { NicOptions nic = cfg.nic; System::Get().Broadcast(root, sizeof(nic), &nic); if (nic.chunkBytes != cfg.nic.chunkBytes) ADD_ERROR("cfg.nic.chunkBytes"); + if (nic.cqPollBatch != cfg.nic.cqPollBatch) ADD_ERROR("cfg.nic.cqPollBatch"); // nic.ibGidIndex is permitted to be different across ranks // nic.ibPort is permitted to be different across ranks if (nic.ipAddressFamily != cfg.nic.ipAddressFamily) ADD_ERROR("cfg.nic.ipAddressFamily"); @@ -1789,6 +1955,9 @@ namespace { #undef ADD_ERROR } + // Forward declaration + int GetGpuKernelUnrollIdx(int unroll); + // Validate configuration options - return trues if and only if an fatal error is detected static bool ConfigOptionsHaveErrors(ConfigOptions const& cfg, std::vector& errors) @@ -1827,11 +1996,10 @@ namespace { if (cfg.gfx.useMultiStream && cfg.gfx.blockOrder > 0) errors.push_back({ERR_WARN, "[gfx.blockOrder] will be ignored when running in multi-stream mode"}); - int gfxMaxBlockSize = 
GetIntAttribute(ATR_GFX_MAX_BLOCKSIZE);
-    if (cfg.gfx.blockSize < 0 || cfg.gfx.blockSize % 64 || cfg.gfx.blockSize > gfxMaxBlockSize)
+    if (cfg.gfx.blockSize < 0 || cfg.gfx.blockSize % 64 || cfg.gfx.blockSize > MAX_BLOCKSIZE)
       errors.push_back({ERR_FATAL,
                         "[gfx.blockSize] must be positive multiple of 64 less than or equal to %d",
-                        gfxMaxBlockSize});
+                        MAX_BLOCKSIZE});

     if (cfg.gfx.temporalMode < 0 || cfg.gfx.temporalMode > 3)
       errors.push_back({ERR_FATAL,
@@ -1843,11 +2011,10 @@
                         "[gfx.temporalMode] is not supported on NVIDIA hardware"});
 #endif

-    int gfxMaxUnroll = GetIntAttribute(ATR_GFX_MAX_UNROLL);
-    if (cfg.gfx.unrollFactor < 0 || cfg.gfx.unrollFactor > gfxMaxUnroll)
+    if (GetGpuKernelUnrollIdx(cfg.gfx.unrollFactor) == -1) {
       errors.push_back({ERR_FATAL,
-                        "[gfx.unrollFactor] must be non-negative and less than or equal to %d",
-                        gfxMaxUnroll});
+                        "[gfx.unrollFactor] unroll factor of %d is unsupported", cfg.gfx.unrollFactor});
+    }

     if (cfg.gfx.waveOrder < 0 || cfg.gfx.waveOrder >= 6)
       errors.push_back({ERR_FATAL, "[gfx.waveOrder] must be non-negative and less than 6"});
@@ -1855,6 +2022,10 @@
     if (!(cfg.gfx.wordSize == 1 || cfg.gfx.wordSize == 2 || cfg.gfx.wordSize == 4))
       errors.push_back({ERR_FATAL, "[gfx.wordSize] must be either 1, 2 or 4"});

+    if (cfg.gfx.gfxKernel < -1 || cfg.gfx.gfxKernel >= NUM_GFX_KERNELS)
+      errors.push_back(
+        {ERR_FATAL, "[gfx.gfxKernel] must be -1 for auto, or less than %d", NUM_GFX_KERNELS});
+
     int numGpus = GetNumExecutors(EXE_GPU_GFX);
     int numXccs = GetNumExecutorSubIndices({EXE_GPU_GFX, 0});
     vector<vector<int>> const& table = cfg.gfx.prefXccTable;
@@ -1885,6 +2056,9 @@
     if (cfg.nic.chunkBytes == 0 || (cfg.nic.chunkBytes % 4 != 0)) {
       errors.push_back({ERR_FATAL, "[nic.chunkBytes] must be a non-negative multiple of 4"});
     }
+    if (cfg.nic.cqPollBatch <= 0) {
+      errors.push_back({ERR_FATAL, "[nic.cqPollBatch] must be positive"});
+    }
 #endif

     // NVIDIA specific
@@ -1919,7 +2093,7 @@
     if (GetCommMode() == COMM_NONE) return;

     if (System::Get().IsVerbose()) {
-      printf("[INFO] Rank %d checking transfers consistency\n", GetRank());
+      System::Get().Log("[INFO] Rank %d checking transfers consistency\n", GetRank());
     }

     // To check consistency, compare against rank 0
@@ -1967,6 +2141,18 @@
 #undef ADD_ERROR
   }

+  // Returns true if the given Transfer requires pod communication
+  static bool IsPodTransfer(Transfer const& t)
+  {
+    if (IsCpuExeType(t.exeDevice.exeType) || IsGpuExeType(t.exeDevice.exeType)) {
+      for (auto const& src : t.srcs)
+        if (src.memRank != t.exeDevice.exeRank) return true;
+      for (auto const& dst : t.dsts)
+        if (dst.memRank != t.exeDevice.exeRank) return true;
+    }
+    return false;
+  }
+
   // Validate Transfers to execute - returns true if and only if fatal error detected
   static bool TransfersHaveErrors(ConfigOptions const& cfg,
                                   std::vector<Transfer> const& transfers,
@@ -1981,16 +2167,24 @@
     CheckMultiNodeTransferConsistency(transfers, errors);

     // Per-Transfer checks
+    bool hasFatalError = false;
     for (size_t i = 0; i < transfers.size(); i++) {
       Transfer const& t = transfers[i];
-      if (t.numBytes == 0)
+      if (t.numBytes == 0) {
         errors.push_back({ERR_FATAL, "Transfer %d: Cannot perform 0-byte transfers", i});
+        break;
+      }
+
+      if (t.numBytes % 4) {
+        errors.push_back({ERR_FATAL, "Transfer %d: numBytes (%lu) must be a multiple of 4\n", i, t.numBytes});
+        break;
+      }

       // Each subexecutor is assigned a multiple of cfg.data.blockBytes, however this may
       // mean that some subexecutors might not have any work assigned to them if the
amount to // transfer is small - if (t.exeDevice.exeType == EXE_GPU_GFX || t.exeDevice.exeType == EXE_CPU) { + if (t.exeDevice.exeType == EXE_GPU_GFX || t.exeDevice.exeType == EXE_CPU || t.exeDevice.exeType == EXE_GPU_BDMA) { size_t const N = t.numBytes / sizeof(float); int const targetMultiple = cfg.data.blockBytes / sizeof(float); int const maxSubExecToUse = std::min((size_t)(N + targetMultiple - 1) / targetMultiple, @@ -2003,25 +2197,36 @@ namespace { } // Check sources and destinations - if (t.srcs.empty() && t.dsts.empty()) + if (t.srcs.empty() && t.dsts.empty()) { errors.push_back({ERR_FATAL, "Transfer %d: Must have at least one source or destination", i}); + break; + } for (int j = 0; j < t.srcs.size(); j++) { ErrResult err = CheckMemDevice(t.srcs[j]); - if (err.errType != ERR_NONE) + if (err.errType != ERR_NONE) { errors.push_back({ERR_FATAL, "Transfer %d: SRC %d: %s", i, j, err.errMsg.c_str()}); + hasFatalError = true; + break; + } } + if (hasFatalError) break; + for (int j = 0; j < t.dsts.size(); j++) { ErrResult err = CheckMemDevice(t.dsts[j]); - if (err.errType != ERR_NONE) + if (err.errType != ERR_NONE) { errors.push_back({ERR_FATAL, "Transfer %d: DST %d: %s", i, j, err.errMsg.c_str()}); + hasFatalError = true; + break; + } } + if (hasFatalError) break; // Check executor rank if (t.exeDevice.exeRank < 0 || t.exeDevice.exeRank >= GetNumRanks()) { errors.push_back({ERR_FATAL, "Rank index for executor must be between 0 and %d (instead of %d)", GetNumRanks() - 1, t.exeDevice.exeRank}); - continue; + break; } executors.insert(t.exeDevice); @@ -2030,56 +2235,77 @@ namespace { switch (t.exeDevice.exeType) { case EXE_CPU: - if (t.exeDevice.exeIndex < 0 || t.exeDevice.exeIndex >= numExecutors) + if (t.exeDevice.exeIndex < 0 || t.exeDevice.exeIndex >= numExecutors) { errors.push_back({ERR_FATAL, "Transfer %d: CPU index must be between 0 and %d (instead of %d) for rank %d", i, numExecutors - 1, t.exeDevice.exeIndex, t.exeDevice.exeRank}); + hasFatalError = true; + } break; case EXE_GPU_GFX: if (t.exeDevice.exeIndex < 0 || t.exeDevice.exeIndex >= numExecutors) { errors.push_back({ERR_FATAL, "Transfer %d: GFX index must be between 0 and %d (instead of %d) for rank %d", i, numExecutors - 1, t.exeDevice.exeIndex, t.exeDevice.exeRank}); + hasFatalError = true; + break; } else { if (t.exeSubIndex != -1) { #if defined(__NVCC__) errors.push_back({ERR_FATAL, "Transfer %d: GFX executor subindex not supported on NVIDIA hardware", i}); + hasFatalError = true; #else useSubIndexCount[t.exeDevice]++; int numSubIndices = GetNumExecutorSubIndices(t.exeDevice); - if (t.exeSubIndex >= numSubIndices) + if (t.exeSubIndex >= numSubIndices) { errors.push_back({ERR_FATAL, "Transfer %d: GFX subIndex (XCC) must be between 0 and %d for rank %d", i, numSubIndices - 1, t.exeDevice.exeRank}); + hasFatalError = true; + break; + } #endif } } break; case EXE_GPU_DMA: - if (t.srcs.size() != 1 || t.dsts.size() != 1) { + if (t.srcs.size() != 1) { errors.push_back({ERR_FATAL, - "Transfer %d: DMA executor must have exactly 1 source and 1 destination", i}); + "Transfer %d: DMA executor must have exactly 1 source", i}); + hasFatalError = true; + break; + } + if (t.dsts.size() < 1) { + errors.push_back({ERR_FATAL, + "Transfer %d: DMA executor must have at least 1 destination", i}); + hasFatalError = true; + break; } if (t.exeDevice.exeIndex < 0 || t.exeDevice.exeIndex >= numExecutors) { errors.push_back({ERR_FATAL, "Transfer %d: DMA index must be between 0 and %d (instead of %d) for rank %d", i, numExecutors - 1, 
t.exeDevice.exeIndex, t.exeDevice.exeRank}); - // Cannot proceed with any further checks - continue; + hasFatalError = true; + break; } if (t.exeSubIndex != -1) { #if defined(__NVCC__) errors.push_back({ERR_FATAL, "Transfer %d: DMA executor subindex not supported on NVIDIA hardware", i}); + hasFatalError = true; + break; #else useSubIndexCount[t.exeDevice]++; int numSubIndices = GetNumExecutorSubIndices(t.exeDevice); - if (t.exeSubIndex >= numSubIndices) + if (t.exeSubIndex >= numSubIndices) { errors.push_back({ERR_FATAL, "Transfer %d: DMA subIndex (engine) must be between 0 and %d", i, numSubIndices - 1}); + hasFatalError = true; + break; + } // Check that engine Id exists between agents hsa_agent_t srcAgent, dstAgent; @@ -2087,29 +2313,46 @@ namespace { err = System::Get().GetHsaAgent(t.srcs[0], srcAgent); if (err.errType != ERR_NONE) { errors.push_back(err); - if (err.errType == ERR_FATAL) break; - } - err = System::Get().GetHsaAgent(t.dsts[0], dstAgent); - if (err.errType != ERR_NONE) { - errors.push_back(err); - if (err.errType == ERR_FATAL) break; + if (err.errType == ERR_FATAL) { + hasFatalError = true; + break; + } + } - // Skip check of engine Id mask for self copies - if (srcAgent.handle != dstAgent.handle) { - uint32_t engineIdMask = 0; - err = hsa_amd_memory_copy_engine_status(dstAgent, srcAgent, &engineIdMask); + int numDsts = (int)t.dsts.size(); + for (int dstIdx = 0; dstIdx < numDsts; dstIdx++) { + err = System::Get().GetHsaAgent(t.dsts[dstIdx], dstAgent); if (err.errType != ERR_NONE) { errors.push_back(err); - if (err.errType == ERR_FATAL) break; + if (err.errType == ERR_FATAL) { + hasFatalError = true; + break; + } } - hsa_amd_sdma_engine_id_t sdmaEngineId = (hsa_amd_sdma_engine_id_t)(1U << t.exeSubIndex); - if (!(sdmaEngineId & engineIdMask)) { - errors.push_back({ERR_FATAL, - "Transfer %d: DMA %d.%d does not exist or cannot copy between src/dst", - i, t.exeDevice.exeIndex, t.exeSubIndex}); + + // Skip check of engine Id mask for self copies + if (srcAgent.handle != dstAgent.handle) { + uint32_t engineIdMask = 0; + err = hsa_amd_memory_copy_engine_status(dstAgent, srcAgent, &engineIdMask); + if (err.errType != ERR_NONE) { + errors.push_back(err); + if (err.errType == ERR_FATAL) { + hasFatalError = true; + break; + } + } + hsa_amd_sdma_engine_id_t sdmaEngineId = (hsa_amd_sdma_engine_id_t)(1U << t.exeSubIndex); + if (!(sdmaEngineId & engineIdMask)) { + errors.push_back({ERR_FATAL, + "Transfer %d: DMA %d.%d does not exist or cannot copy between src/dst", + i, t.exeDevice.exeIndex, t.exeSubIndex}); + hasFatalError = true; + break; + } } } + if (hasFatalError) break; #endif } @@ -2132,12 +2375,67 @@ namespace { } } break; + case EXE_GPU_BDMA: +#ifdef BMA_EXEC_ENABLED + if (t.srcs.size() != 1) { + errors.push_back({ERR_FATAL, + "Transfer %d: BMA executor must have exactly 1 source", i}); + hasFatalError = true; + break; + } + if (t.dsts.size() < 1) { + errors.push_back({ERR_FATAL, + "Transfer %d: BMA executor must have at least 1 destination", i}); + hasFatalError = true; + break; + } + + if (t.exeDevice.exeIndex < 0 || t.exeDevice.exeIndex >= numExecutors) { + errors.push_back({ERR_FATAL, + "Transfer %d: BMA index must be between 0 and %d (instead of %d) for rank %d", + i, numExecutors - 1, t.exeDevice.exeIndex, t.exeDevice.exeRank}); + hasFatalError = true; + break; + } + + if (t.exeSubIndex != -1) { + errors.push_back({ERR_FATAL, + "Transfer %d: BMA executor does not support executor subindices (SDMA engine selection)", i}); + hasFatalError = true; + break; + } + + if 
(!IsGpuMemType(t.srcs[0].memType) && !IsGpuMemType(t.dsts[0].memType)) {
+          errors.push_back({ERR_WARN,
+                            "Transfer %d: No GPU memory for source or destination. Copy might not execute on BMA %d",
+                            i, t.exeDevice.exeIndex});
+        } else {
+          if (IsGpuMemType(t.srcs[0].memType)) {
+            if (t.srcs[0].memIndex != t.exeDevice.exeIndex) {
+              errors.push_back({ERR_WARN,
+                                "Transfer %d: BMA executor will likely run on the source memory device (%d) instead of BMA %d",
+                                i, t.srcs[0].memIndex, t.exeDevice.exeIndex});
+            }
+          } else if (t.dsts[0].memIndex != t.exeDevice.exeIndex) {
+            errors.push_back({ERR_WARN,
+                              "Transfer %d: BMA executor will likely run on the destination memory device (%d) instead of BMA %d",
+                              i, t.dsts[0].memIndex, t.exeDevice.exeIndex});
+          }
+        }
+        break;
+#else
+        errors.push_back({ERR_FATAL,
+                          "Transfer %d: BMA executor requires HIP >= 7.1 or CUDA >= 12.8 (hipMemcpyBatchAsync support)", i});
+        hasFatalError = true;
+        break;
+#endif
       case EXE_NIC: case EXE_NIC_NEAREST:
 #ifdef NIC_EXEC_ENABLED
       {
         // NIC Executors can only execute a copy operation
         if (t.srcs.size() != 1 || t.dsts.size() != 1) {
           errors.push_back({ERR_FATAL, "Transfer %d: NIC executor requires single SRC and single DST", i});
+          hasFatalError = true;
           break;
         }
@@ -2149,6 +2447,7 @@
         if (srcMemRank != srcExeRank && dstMemRank != srcExeRank) {
           errors.push_back({ERR_FATAL, "Transfer %d: NIC executor rank (%d) must be same as SRC memory rank (%d) or DST memory rank (%d)",
                             i, srcExeRank, srcMemRank, dstMemRank});
+          hasFatalError = true;
           break;
         }
@@ -2161,8 +2460,12 @@
         if (srcExeDevice.exeIndex < 0 || srcExeDevice.exeIndex >= GetNumExecutors(EXE_NIC, srcExeRank)) {
           errors.push_back({ERR_FATAL, "Transfer %d: Rank %d SRC NIC executor indexes an out-of-range NIC (%d). Detected %d NICs",
                             i, srcExeRank, srcExeDevice.exeIndex, GetNumExecutors(EXE_NIC, srcExeRank)});
+          hasFatalError = true;
+          break;
         } else if (!NicIsActive(srcExeDevice.exeIndex, srcExeDevice.exeRank)) {
           errors.push_back({ERR_FATAL, "Transfer %d: Rank %d SRC NIC executor %d is not active",
                             i, srcExeDevice.exeRank, srcExeDevice.exeIndex});
+          hasFatalError = true;
+          break;
         }

         // The DST NIC executor facilitates the copy but issues no commands
@@ -2174,29 +2477,51 @@
         if (dstExeDevice.exeIndex < 0 || dstExeDevice.exeIndex >= GetNumExecutors(EXE_NIC, dstExeRank)) {
           errors.push_back({ERR_FATAL, "Transfer %d: Rank %d DST NIC executor indexes an out-of-range NIC (%d).
Detected %d NICs", i, dstExeRank, dstExeDevice.exeIndex, GetNumExecutors(EXE_NIC, dstExeRank)}); + hasFatalError = true; + break; } else if (!NicIsActive(dstExeDevice.exeIndex, dstExeDevice.exeRank)) { errors.push_back({ERR_FATAL, "Transfer %d: Rank %d DST NIC executor %d is not active", i, dstExeDevice.exeRank, dstExeDevice.exeIndex}); + hasFatalError = true; + break; } } #else errors.push_back({ERR_FATAL, "Transfer %d: NIC executor is requested but is not available.", i}); + hasFatalError = true; #endif break; } + // Skip further tests if fatal error detected + if (hasFatalError) break; + // Check for multi-node support - // Currently this is not supported for CPU/GPU executors - if (IsCpuExeType(t.exeDevice.exeType) || IsGpuExeType(t.exeDevice.exeType)) { - bool crossRank = false; + if (IsPodTransfer(t)) { +#ifndef POD_COMM_ENABLED + errors.push_back({ERR_FATAL, + "Transfer %d: Cross-rank GPU memory access requires pod communication support (HIP 8.0+)", i}); + hasFatalError = true; + break; +#endif + // In order to support pod communication, the participanting ranks need to be members of the same pod + int exeRank = t.exeDevice.exeRank; + bool samePod = true; + for (auto const& src : t.srcs) { - crossRank |= (src.memRank != t.exeDevice.exeRank); + if (!(samePod = IsSamePod(src.memRank, exeRank))) + break; } - for (auto const& dst : t.dsts) { - crossRank |= (dst.memRank != t.exeDevice.exeRank); + if (samePod) { + for (auto const& dst : t.dsts) { + if (!(samePod = IsSamePod(dst.memRank, exeRank))) + break; + } } - if (crossRank) { + + if (!samePod || IsCpuExeType(t.exeDevice.exeType)) { errors.push_back({ERR_FATAL, "Transfer %d: Executor on rank %d can not access memory across ranks\n", i, t.exeDevice.exeRank}); + break; } } @@ -2205,6 +2530,7 @@ namespace { errors.push_back({ERR_FATAL, "Transfer %d: # of subexecutors must be positive", i}); else totalSubExecs[t.exeDevice] += t.numSubExecs; + } int gpuMaxHwQueues = 4; @@ -2246,6 +2572,7 @@ namespace { "GPU %d specifies XCC on only %d of %d Transfers. " "Must either specific none or all", exeDevice.exeIndex, useSubIndexCount[exeDevice], transferCount[exeDevice]}); + break; } if (cfg.gfx.useMultiStream && transferCount[exeDevice] > gpuMaxHwQueues) { @@ -2263,6 +2590,7 @@ namespace { "DMA %d specifies engine on only %d of %d Transfers. 
" "Must either specific none or all", exeDevice.exeIndex, useSubIndexCount[exeDevice], transferCount[exeDevice]}); + break; } if (transferCount[exeDevice] > gpuMaxHwQueues) { errors.push_back({ERR_WARN, @@ -2277,6 +2605,15 @@ namespace { "DMA %d copies will fallback to blit (GFX) kernels", exeDevice.exeIndex}); break; } + case EXE_GPU_BDMA: + { + if (transferCount[exeDevice] > gpuMaxHwQueues) { + errors.push_back({ERR_WARN, + "BMA %d attempting %d parallel transfers, however GPU_MAX_HW_QUEUES only set to %d", + exeDevice.exeIndex, transferCount[exeDevice], gpuMaxHwQueues}); + } + break; + } default: break; } @@ -2307,19 +2644,24 @@ namespace { int teamIdx; ///< Size of team this sub executor is part of // Outputs - long long startCycle; ///< Start timestamp for in-kernel timing (GPU-GFX executor) - long long stopCycle; ///< Stop timestamp for in-kernel timing (GPU-GFX executor) + int64_t startCycle; ///< Start timestamp for in-kernel timing (GPU-GFX executor) + int64_t stopCycle; ///< Stop timestamp for in-kernel timing (GPU-GFX executor) uint32_t hwId; ///< Hardware ID uint32_t xccId; ///< XCC ID }; // Internal resources allocated per Transfer + typedef hipMemGenericAllocationHandle_t memHandle_t; struct TransferResources { int transferIdx; ///< The associated Transfer size_t numBytes; ///< Number of bytes to Transfer vector srcMem; ///< Source memory vector dstMem; ///< Destination memory + vector srcActualBytes; ///< Actual amount of src memory allocated (after padding) + vector dstActualBytes; ///< Actual amount of dst memory allocated (after padding) + vector srcMemHandle; ///< Memory handles for source memory + vector dstMemHandle; ///< Memory handles for destination memory vector subExecParamCpu; ///< Defines subarrays for each subexecutor vector subExecIdx; ///< Indices into subExecParamGpu int numaNode; ///< NUMA node to use for this Transfer @@ -2329,13 +2671,13 @@ namespace { // For targeted-SDMA #if !defined(__NVCC__) - hsa_agent_t dstAgent; ///< DMA destination memory agent + vector dstAgent; ///< DMA destination memory agents hsa_agent_t srcAgent; ///< DMA source memory agent hsa_signal_t signal; ///< HSA signal for completion hsa_amd_sdma_engine_id_t sdmaEngineId; ///< DMA engine ID #endif -// For IBV executor + // For IBV executor #ifdef NIC_EXEC_ENABLED int srcNicIndex; ///< SRC NIC index int dstNicIndex; ///< DST NIC index @@ -2363,6 +2705,13 @@ namespace { vector>sendWorkRequests; ///< Send work requests per queue pair #endif + // For BMA executor +#ifdef BMA_EXEC_ENABLED + vector batchDsts; ///< Destination pointers (per batch item) + vector batchSrcs; ///< Source pointers (per batch item) + vector batchBytes; ///< Bytes to copy (per batch item) +#endif + // Counters double totalDurationMsec; ///< Total duration for all iterations for this Transfer vector perIterMsec; ///< Duration for each individual iteration @@ -2386,6 +2735,7 @@ namespace { vector startEvents; ///< HIP start timing event vector stopEvents; ///< HIP stop timing event int wallClockRate; ///< (GFX-only) Device wall clock rate + int gfxKernelToUse; ///< (GFX-only) Which GFX kernel to use }; // Structure to track PCIe topology @@ -2548,9 +2898,9 @@ static bool IsConfiguredGid(union ibv_gid const& gid) int numIbvDevices = 0; ibv_device** deviceList = ibv_get_device_list(&numIbvDevices); - // Check for NIC_FILTER + // Check for TB_NIC_FILTER // By default, accept all NIC names - std::string nicFilterPattern = getenv("NIC_FILTER") ? 
getenv("NIC_FILTER") : ".*"; + std::string nicFilterPattern = getenv("TB_NIC_FILTER") ? getenv("TB_NIC_FILTER") : ".*"; if (deviceList && numIbvDevices > 0) { // Loop over each device to collect information @@ -2639,11 +2989,11 @@ static bool IsConfiguredGid(union ibv_gid const& gid) bool isLast = true) { if (!node.address.empty()) { - printf("%s%s%s", prefix.c_str(), (isLast ? "└── " : "├── "), node.address.c_str()); + System::Get().Log("%s%s%s", prefix.c_str(), (isLast ? "└── " : "├── "), node.address.c_str()); if (!node.description.empty()) { - printf("(%s)", node.description.c_str()); + System::Get().Log("(%s)", node.description.c_str()); } - printf("\n"); + System::Get().Log("\n"); } auto const& children = node.children; for (auto it = children.begin(); it != children.end(); ++it) { @@ -2765,7 +3115,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) iss >> std::hex >> domain >> delimiter >> bus >> delimiter >> device >> delimiter >> function; if (iss.fail()) { #ifdef VERBS_DEBUG - printf("Invalid PCIe address format: %s\n", pcieAddress.c_str()); + System::Get().Log("Invalid PCIe address format: %s\n", pcieAddress.c_str()); #endif return -1; } @@ -3049,7 +3399,9 @@ static bool IsConfiguredGid(union ibv_gid const& gid) } } // Create SRC completion queues - IBV_PTR_CALL(rss.srcCompQueue, ibv_create_cq, rss.srcContext, cfg.nic.queueSize, NULL, NULL, 0); + // Ensure CQ size is at least as large as the number of queue pairs to avoid overflow + int srcCQSize = std::max(cfg.nic.queueSize, static_cast(rss.qpCount)); + IBV_PTR_CALL(rss.srcCompQueue, ibv_create_cq, rss.srcContext, srcCQSize, NULL, NULL, 0); // Get SRC port attributes IBV_CALL(ibv_query_port, rss.srcContext, port, &rss.srcPortAttr); // Check for RDMA over Converged Ethernet (RoCE) and update GID index appropriately @@ -3113,7 +3465,9 @@ static bool IsConfiguredGid(union ibv_gid const& gid) } } // Create DST completion queues - IBV_PTR_CALL(rss.dstCompQueue, ibv_create_cq, rss.dstContext, cfg.nic.queueSize, NULL, NULL, 0); + // Ensure CQ size is at least as large as the number of queue pairs to avoid overflow + int dstCQSize = std::max(cfg.nic.queueSize,static_cast(rss.qpCount)); + IBV_PTR_CALL(rss.dstCompQueue, ibv_create_cq, rss.dstContext, dstCQSize, NULL, NULL, 0); // Get DST port attributes IBV_CALL(ibv_query_port, rss.dstContext, port, &rss.dstPortAttr); // Check for RDMA over Converged Ethernet (RoCE) and update GID index appropriately @@ -3145,7 +3499,6 @@ static bool IsConfiguredGid(union ibv_gid const& gid) System::Get().Broadcast(srcMemRank, sizeof(rss.srcPortAttr.link_layer), &rss.srcPortAttr.link_layer); System::Get().Broadcast(dstMemRank, sizeof(rss.dstPortAttr.link_layer), &rss.dstPortAttr.link_layer); if (rss.srcPortAttr.link_layer != rss.dstPortAttr.link_layer) { - printf("[ERROR] Link layer do not match (%d vs %d)\n", rss.srcPortAttr.link_layer, rss.dstPortAttr.link_layer); return {ERR_FATAL, "SRC NIC (%d) [Rank %d] and DST NIC (%d) [Rank %d] do not have the same link layer [%d vs %d]", rss.srcNicIndex, srcMemRank, rss.dstNicIndex, dstMemRank, rss.srcPortAttr.link_layer, rss.dstPortAttr.link_layer}; } @@ -3177,13 +3530,38 @@ static bool IsConfiguredGid(union ibv_gid const& gid) // Move queue pairs to ready-to-receive (RTR), using exchanged connection info // Then move them to read-to-send (RTS) + // Broadcast each rank's result so all ranks fail together rather than + // hanging on the next iteration's Broadcast when qpCount > 1. 
+      struct QpTransitionResult { ErrType errType; bool rtrFailed; };
+      static_assert(std::is_trivially_copyable<QpTransitionResult>::value,
+                    "QpTransitionResult must be trivially copyable for MPI broadcast");
+      QpTransitionResult srcQpResult = {ERR_NONE, false};
       if (GetRank() == srcMemRank) {
-        ERR_CHECK(TransitionQpToRtr(rss.srcQueuePairs[i], dstConnInfo, port, srcIsRoCE, rss.srcPortAttr.active_mtu));
-        ERR_CHECK(TransitionQpToRts(rss.srcQueuePairs[i]));
+        ErrResult err = TransitionQpToRtr(rss.srcQueuePairs[i], dstConnInfo, port, srcIsRoCE, rss.srcPortAttr.active_mtu);
+        srcQpResult.rtrFailed = (err.errType != ERR_NONE);
+        if (err.errType == ERR_NONE) {
+          err = TransitionQpToRts(rss.srcQueuePairs[i]);
+        }
+        srcQpResult.errType = err.errType;
+      }
+      System::Get().Broadcast(srcMemRank, sizeof(srcQpResult), &srcQpResult);
+      if (srcQpResult.errType != ERR_NONE) {
+        return {ERR_FATAL, "SRC rank %d failed to transition QP %d to %s",
+                srcMemRank, i, srcQpResult.rtrFailed ? "RTR" : "RTS"};
       }
+
+      QpTransitionResult dstQpResult = {ERR_NONE, false};
       if (GetRank() == dstMemRank) {
-        ERR_CHECK(TransitionQpToRtr(rss.dstQueuePairs[i], srcConnInfo, port, dstIsRoCE, rss.dstPortAttr.active_mtu));
-        ERR_CHECK(TransitionQpToRts(rss.dstQueuePairs[i]));
+        ErrResult err = TransitionQpToRtr(rss.dstQueuePairs[i], srcConnInfo, port, dstIsRoCE, rss.dstPortAttr.active_mtu);
+        dstQpResult.rtrFailed = (err.errType != ERR_NONE);
+        if (err.errType == ERR_NONE) {
+          err = TransitionQpToRts(rss.dstQueuePairs[i]);
+        }
+        dstQpResult.errType = err.errType;
+      }
+      System::Get().Broadcast(dstMemRank, sizeof(dstQpResult), &dstQpResult);
+      if (dstQpResult.errType != ERR_NONE) {
+        return {ERR_FATAL, "DST rank %d failed to transition QP %d to %s",
+                dstMemRank, i, dstQpResult.rtrFailed ? "RTR" : "RTS"};
       }

       // Prepare scatter-gather element / work request for this queue pair in advance
@@ -3198,10 +3576,10 @@
       auto const lkey = (nicExeRank == srcMemRank ? rss.srcMemRegion->lkey : rss.dstMemRegion->lkey);
       auto const rkey = (nicExeRank == srcMemRank ? dstConnInfo.rkey : srcConnInfo.rkey);
       if (System::Get().IsVerbose()) {
-        printf("[INFO] Transfer %d SubExec %d executed by rank %d NIC %d is %s with %lu chunks\n",
-               rss.transferIdx, i, nicExeRank, nicExeDevice.exeIndex,
-               (opcode == IBV_WR_RDMA_WRITE ? "remote write" : "remote read"),
-               numChunks);
+        System::Get().Log("[INFO] Transfer %d SubExec %d executed by rank %d NIC %d is %s with %lu chunks\n",
+                          rss.transferIdx, i, nicExeRank, nicExeDevice.exeIndex,
+                          (opcode == IBV_WR_RDMA_WRITE ? "remote write" : "remote read"),
+                          numChunks);
       }
       rss.sgePerQueuePair[i].resize(numChunks, {});
       rss.sendWorkRequests[i].resize(numChunks, {});
@@ -3227,8 +3605,8 @@
         wr.wr.rdma.rkey = rkey;

         if (System::Get().IsVerbose()) {
-          printf("[INFO] Transfer %d SubExec %d chunk %lu local %p remote %p of size %lu\n",
-                 rss.transferIdx, i, chunkIdx, (void*)local, (void*)remote, currChunkBytes);
+          System::Get().Log("[INFO] Transfer %d SubExec %d chunk %lu local %p remote %p of size %lu\n",
+                            rss.transferIdx, i, chunkIdx, (void*)local, (void*)remote, currChunkBytes);
         }

         // Increment locations
@@ -3353,16 +3731,16 @@
       std::shuffle(lineTypes.begin(), lineTypes.end(), gen);

       // Apply zero-ing
-      int dumpLines = getenv("DUMP_LINES") ? atoi(getenv("DUMP_LINES")) : 0;
+      int dumpLines = getenv("TB_DUMP_LINES") ?
atoi(getenv("TB_DUMP_LINES")) : 0; if (dumpLines) { - printf("Input pattern 64B line statistics for bufferIdx %d:\n", bufferIdx); - printf("Total lines: %lu\n", numLines); - printf("- 0: Random : %8lu (%8.3f%%)\n", lineCounts[0], 100.0 * lineCounts[0] / (1.0 * numLines)); - printf("- 1: 1B0 : %8lu (%8.3f%%)\n", lineCounts[1], 100.0 * lineCounts[1] / (1.0 * numLines)); - printf("- 2: 2B0 : %8lu (%8.3f%%)\n", lineCounts[2], 100.0 * lineCounts[2] / (1.0 * numLines)); - printf("- 3: 4B0 : %8lu (%8.3f%%)\n", lineCounts[3], 100.0 * lineCounts[3] / (1.0 * numLines)); - printf("- 4: 32B0 : %8lu (%8.3f%%)\n", lineCounts[4], 100.0 * lineCounts[4] / (1.0 * numLines)); + System::Get().Log("Input pattern 64B line statistics for bufferIdx %d:\n", bufferIdx); + System::Get().Log("Total lines: %lu\n", numLines); + System::Get().Log("- 0: Random : %8lu (%8.3f%%)\n", lineCounts[0], 100.0 * lineCounts[0] / (1.0 * numLines)); + System::Get().Log("- 1: 1B0 : %8lu (%8.3f%%)\n", lineCounts[1], 100.0 * lineCounts[1] / (1.0 * numLines)); + System::Get().Log("- 2: 2B0 : %8lu (%8.3f%%)\n", lineCounts[2], 100.0 * lineCounts[2] / (1.0 * numLines)); + System::Get().Log("- 3: 4B0 : %8lu (%8.3f%%)\n", lineCounts[3], 100.0 * lineCounts[3] / (1.0 * numLines)); + System::Get().Log("- 4: 32B0 : %8lu (%8.3f%%)\n", lineCounts[4], 100.0 * lineCounts[4] / (1.0 * numLines)); } for (int line = 0; line < numLines; line++) { @@ -3394,12 +3772,12 @@ static bool IsConfiguredGid(union ibv_gid const& gid) } if (line < dumpLines) { - printf("Line %02d [%d]: ", line, lineTypes[line]); + System::Get().Log("Line %02d [%d]: ", line, lineTypes[line]); for (int j = 63; j >= 0; j--){ - printf("%02x ", linePtr[j]); - if (j % 16 == 0) printf(" "); + System::Get().Log("%02x ", linePtr[j]); + if (j % 16 == 0) System::Get().Log(" "); } - printf("\n"); + System::Get().Log("\n"); } } } else { @@ -3445,6 +3823,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) if (IsCpuMemType(t.dsts[dstIdx].memType) || cfg.data.validateDirect) { output = (rss->dstMem[dstIdx]) + initOffset; } else { + ERR_CHECK(hipSetDevice(t.dsts[dstIdx].memIndex)); ERR_CHECK(hipMemcpy(outputBuffer.data(), (rss->dstMem[dstIdx]) + initOffset, t.numBytes, hipMemcpyDefault)); ERR_CHECK(hipDeviceSynchronize()); output = outputBuffer.data(); @@ -3465,6 +3844,48 @@ static bool IsConfiguredGid(union ibv_gid const& gid) return ERR_NONE; } + // Determine eligibility requirements for a particular GFX kernel + static bool CanUseGfxKernel(int const gpuKernelIdx, + ConfigOptions const& cfg, + vector const& transfers, + ExeInfo const& exeInfo) + { + // GpuReduceKernel always works + if (gpuKernelIdx == GFX_KERNEL_REDUCE) return true; + + // CopyKernel works if all Transfers have at most one SRC / one DST with no warp subexecutors + if (gpuKernelIdx == GFX_KERNEL_COPY) { + if (cfg.gfx.seType != 0) return false; + if (exeInfo.resources.empty()) return false; + for (auto const& rss : exeInfo.resources) { + Transfer const& t = transfers[rss.transferIdx]; + if (t.srcs.size() > 1 || t.dsts.size() > 1) return false; + if (cfg.gfx.useSingleTeam && t.numSubExecs > 1) return false; + } + return true; + } + + return false; + } + + static ErrResult SelectGfxKernel(ConfigOptions const& cfg, vector const& transfers, ExeInfo& exeInfo) + { + // Decide on which GFX kernel to use + // Auto-select - prefer copyKernel if eligible + if (cfg.gfx.gfxKernel == GFX_KERNEL_AUTO) { + exeInfo.gfxKernelToUse = CanUseGfxKernel(GFX_KERNEL_COPY, cfg, transfers, exeInfo) ? 
+  static ErrResult SelectGfxKernel(ConfigOptions const& cfg, vector<Transfer> const& transfers, ExeInfo& exeInfo)
+  {
+    // Decide on which GFX kernel to use
+    // Auto-select - prefer copyKernel if eligible
+    if (cfg.gfx.gfxKernel == GFX_KERNEL_AUTO) {
+      exeInfo.gfxKernelToUse = CanUseGfxKernel(GFX_KERNEL_COPY, cfg, transfers, exeInfo) ? 1 : 0;
+    } else {
+      exeInfo.gfxKernelToUse = cfg.gfx.gfxKernel;
+    }
+
+    // Warn if forcing the copy kernel, but allow the kernel to continue
+    if (cfg.gfx.gfxKernel == GFX_KERNEL_COPY && !CanUseGfxKernel(GFX_KERNEL_COPY, cfg, transfers, exeInfo)) {
+      return {ERR_WARN,
+              "GFX copy kernel forced even though deemed incompatible for current set of Transfers / config"};
+    }
+    return ERR_NONE;
+  }
+
   // Preparation-related functions
   //========================================================================================
@@ -3538,12 +3959,96 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
       }
     }
 
+#ifdef BMA_EXEC_ENABLED
+    // Prepare src/dst pointers for batched DMA executor
+    rss.batchDsts.clear();
+    rss.batchSrcs.clear();
+    rss.batchBytes.clear();
+    if (transfer.exeDevice.exeType == EXE_GPU_BDMA) {
+      for (int i = 0; i < transfer.numSubExecs; ++i) {
+        for (int j = 0; j < (int)rss.dstMem.size(); j++) {
+          rss.batchSrcs.push_back(subExecParam[i].src[0]);
+          rss.batchDsts.push_back(subExecParam[i].dst[j]);
+          rss.batchBytes.push_back(subExecParam[i].N * sizeof(float));
+        }
+      }
+    }
+#endif
+
     // Clear counters
     rss.totalDurationMsec = 0.0;
 
     return ERR_NONE;
   }
 
+  static ErrResult ExchangeMemory(MemDevice const& memDevice, ExeDevice const& exeDevice, size_t* pActualBytes,
+                                  float** memPtr, hipMemGenericAllocationHandle_t* memHandle)
+  {
+    // Pass this pointer to all ranks (Used for pointer arithmetic, not dereferenced on non-local ranks)
+    // NOTE: This will be overwritten on executor rank if pod communication is required
+    System::Get().Broadcast(memDevice.memRank, sizeof(*memPtr), memPtr);
+
+    // Broadcast actualBytes from owning rank so importing rank gets the correct (rounded-up) size
+    System::Get().Broadcast(memDevice.memRank, sizeof(*pActualBytes), pActualBytes);
+
+    // If pod communication is required, export/import fabric handle
+    if (memDevice.memRank != exeDevice.exeRank && IsGpuExeType(exeDevice.exeType)) {
+#ifdef POD_COMM_ENABLED
+      // mem rank exports to shareable fabric handle; broadcast handle + status so all
+      // ranks fail together instead of hanging on the next collective if export fails
+      hipMemFabricHandle_t fabricHandle = {};
+      hipError_t exportErr = hipSuccess;
+      const char* exportStep = "hipSetDevice";
+      if (memDevice.memRank == GetRank()) {
+        exportErr = hipSetDevice(memDevice.memIndex);
+        if (exportErr == hipSuccess) {
+          exportStep = "hipMemExportToShareableHandle";
+          exportErr = hipMemExportToShareableHandle(&fabricHandle, *memHandle, hipMemHandleTypeFabric, 0);
+        }
+      }
+
+      System::Get().Broadcast(memDevice.memRank, sizeof(hipMemFabricHandle_t), &fabricHandle);
+      System::Get().Broadcast(memDevice.memRank, sizeof(hipError_t), &exportErr);
+      if (exportErr != hipSuccess) {
+        return {ERR_FATAL, "HIP Error in %s during fabric handle export: %s", exportStep, hipGetErrorString(exportErr)};
+      }
+
+      // exe rank imports the fabric handle; broadcast result so all ranks fail together
+      hipError_t importErr = hipSuccess;
+      const char* importStep = "hipSetDevice";
+      if (exeDevice.exeRank == GetRank()) {
+        importErr = hipSetDevice(exeDevice.exeIndex);
+        if (importErr == hipSuccess) {
+          importStep = "hipMemImportFromShareableHandle";
+          importErr = hipMemImportFromShareableHandle(memHandle, (void*)&fabricHandle, hipMemHandleTypeFabric);
+        }
+        if (importErr == hipSuccess) {
+          importStep = "hipMemAddressReserve";
+          importErr = hipMemAddressReserve((gpu_device_ptr*)memPtr, *pActualBytes, 0, 0, 0);
+        }
+        if (importErr == hipSuccess) {
+          importStep = "hipMemMap";
+          importErr = hipMemMap((gpu_device_ptr)*memPtr, *pActualBytes, 0, *memHandle, 0);
+        }
+        if (importErr == hipSuccess) {
+          importStep = "hipMemSetAccess";
+          hipMemAccessDesc desc;
+          desc.location = {hipMemLocationTypeDevice, exeDevice.exeIndex};
+          desc.flags    = hipMemAccessFlagsProtReadWrite;
+          importErr = hipMemSetAccess((gpu_device_ptr)*memPtr, *pActualBytes, &desc, 1);
+        }
+      }
+      System::Get().Broadcast(exeDevice.exeRank, sizeof(hipError_t), &importErr);
+      if (importErr != hipSuccess) {
+        return {ERR_FATAL, "HIP Error in %s during fabric handle import: %s", importStep, hipGetErrorString(importErr)};
+      }
+#else
+      return {ERR_FATAL, "Unable to export/import fabric handle without compiling with pod communication support"};
+#endif
+    }
+    return ERR_NONE;
+  }
+
   // Prepare each executor
   // Allocates memory for src/dst, prepares subexecutors, executor-specific data structures
   static ErrResult PrepareExecutor(ConfigOptions const& cfg,
@@ -3554,8 +4059,8 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
     exeInfo.totalDurationMsec = 0.0;
     int const localRank = GetRank();
     if (System::Get().IsVerbose()) {
-      printf("[INFO] Rank %d preparing executor (%c%d on Rank %d)\n",
-             localRank, ExeTypeStr[exeDevice.exeType], exeDevice.exeIndex, exeDevice.exeRank);
+      System::Get().Log("[INFO] Rank %d preparing executor (%c%d on Rank %d)\n",
+                        localRank, ExeTypeStr[exeDevice.exeType], exeDevice.exeIndex, exeDevice.exeRank);
     }
 
     // Loop over each transfer this executor is involved in
@@ -3564,12 +4069,14 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
       rss.numBytes = t.numBytes;
 
       if (System::Get().IsVerbose()) {
-        printf("[INFO] Rank %d preparing transfer %d (%lu SRC %lu DST)\n",
-               localRank, rss.transferIdx, t.srcs.size(), t.dsts.size());
+        System::Get().Log("[INFO] Rank %d preparing transfer %d (%lu SRC %lu DST)\n",
+                          localRank, rss.transferIdx, t.srcs.size(), t.dsts.size());
       }
 
       // Allocate source memory
       rss.srcMem.resize(t.srcs.size());
+      rss.srcActualBytes.resize(t.srcs.size());
+      rss.srcMemHandle.resize(t.srcs.size(), NULL);
       for (int iSrc = 0; iSrc < t.srcs.size(); ++iSrc) {
         MemDevice const& srcMemDevice = t.srcs[iSrc];
 
@@ -3584,16 +4091,21 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
         }
 
         // Allocate source memory (on the correct rank)
+        bool requiresFabricHandle = (srcMemDevice.memRank != exeDevice.exeRank) && IsGpuExeType(exeDevice.exeType);
         if (srcMemDevice.memRank == localRank) {
-          ERR_CHECK(AllocateMemory(srcMemDevice, t.numBytes + cfg.data.byteOffset, (void**)&rss.srcMem[iSrc]));
+          ERR_CHECK(AllocateMemory(srcMemDevice, t.numBytes + cfg.data.byteOffset, (void**)&rss.srcMem[iSrc],
+                                   &rss.srcActualBytes[iSrc], requiresFabricHandle ? &rss.srcMemHandle[iSrc] : nullptr));
         }
-        // Pass this pointer to all ranks (Used for pointer arithmetic, not defererenced on non-local ranks)
-        System::Get().Broadcast(srcMemDevice.memRank, sizeof(rss.srcMem[iSrc]), &rss.srcMem[iSrc]);
+        // Exchange memory pointer across ranks
+        ERR_CHECK(ExchangeMemory(srcMemDevice, exeDevice, &rss.srcActualBytes[iSrc],
+                                 &rss.srcMem[iSrc], &rss.srcMemHandle[iSrc]));
       }
&rss.srcMemHandle[iSrc] : nullptr)); } - // Pass this pointer to all ranks (Used for pointer arithmetic, not defererenced on non-local ranks) - System::Get().Broadcast(srcMemDevice.memRank, sizeof(rss.srcMem[iSrc]), &rss.srcMem[iSrc]); + // Exchange memory pointer across ranks + ERR_CHECK(ExchangeMemory(srcMemDevice, exeDevice, &rss.srcActualBytes[iSrc], + &rss.srcMem[iSrc], &rss.srcMemHandle[iSrc])); } // Allocate destination memory rss.dstMem.resize(t.dsts.size()); + rss.dstActualBytes.resize(t.dsts.size()); + rss.dstMemHandle.resize(t.dsts.size(), NULL); for (int iDst = 0; iDst < t.dsts.size(); ++iDst) { MemDevice const& dstMemDevice = t.dsts[iDst]; @@ -3607,11 +4119,15 @@ static bool IsConfiguredGid(union ibv_gid const& gid) } // Allocate destination memory (on the correct rank) + bool requiresFabricHandle = (dstMemDevice.memRank != exeDevice.exeRank) && IsGpuExeType(exeDevice.exeType); if (dstMemDevice.memRank == localRank) { - ERR_CHECK(AllocateMemory(dstMemDevice, t.numBytes + cfg.data.byteOffset, (void**)&rss.dstMem[iDst])); + ERR_CHECK(AllocateMemory(dstMemDevice, t.numBytes + cfg.data.byteOffset, (void**)&rss.dstMem[iDst], + &rss.dstActualBytes[iDst], requiresFabricHandle ? &rss.dstMemHandle[iDst] : NULL)); } - // Pass this pointer to all ranks (Used for pointer arithmetic, not defererenced on non-local ranks) - System::Get().Broadcast(dstMemDevice.memRank, sizeof(rss.dstMem[iDst]), &rss.dstMem[iDst]); + + // Exchange memory pointer across ranks + ERR_CHECK(ExchangeMemory(dstMemDevice, exeDevice, &rss.dstActualBytes[iDst], + &rss.dstMem[iDst], &rss.dstMemHandle[iDst])); } // Prepare HSA DMA copy specific resources @@ -3620,8 +4136,12 @@ static bool IsConfiguredGid(union ibv_gid const& gid) // Collect HSA agent information hsa_amd_pointer_info_t info; info.size = sizeof(info); - ERR_CHECK(hsa_amd_pointer_info(rss.dstMem[0], &info, NULL, NULL, NULL)); - rss.dstAgent = info.agentOwner; + int numDst = (int)rss.dstMem.size(); + rss.dstAgent.resize(numDst); + for (int dstIdx = 0; dstIdx < numDst; dstIdx++) { + ERR_CHECK(hsa_amd_pointer_info(rss.dstMem[dstIdx], &info, NULL, NULL, NULL)); + rss.dstAgent[dstIdx] = info.agentOwner; + } ERR_CHECK(hsa_amd_pointer_info(rss.srcMem[0], &info, NULL, NULL, NULL)); rss.srcAgent = info.agentOwner; @@ -3639,11 +4159,12 @@ static bool IsConfiguredGid(union ibv_gid const& gid) } // Prepare additional requirements for GPU-based executors - if ((exeDevice.exeType == EXE_GPU_GFX || exeDevice.exeType == EXE_GPU_DMA) && exeDevice.exeRank == localRank) { + if ((exeDevice.exeType == EXE_GPU_GFX || exeDevice.exeType == EXE_GPU_DMA || exeDevice.exeType == EXE_GPU_BDMA) + && exeDevice.exeRank == localRank) { ERR_CHECK(hipSetDevice(exeDevice.exeIndex)); // Determine how many streams to use - int const numStreamsToUse = (exeDevice.exeType == EXE_GPU_DMA || + int const numStreamsToUse = (exeDevice.exeType == EXE_GPU_DMA || exeDevice.exeType == EXE_GPU_BDMA || (exeDevice.exeType == EXE_GPU_GFX && cfg.gfx.useMultiStream)) ? exeInfo.resources.size() : 1; exeInfo.streams.resize(numStreamsToUse); @@ -3757,6 +4278,22 @@ static bool IsConfiguredGid(union ibv_gid const& gid) return {ERR_FATAL, "RDMA executor is not supported"}; #endif } + + // Check that GPU wallclock rate is non-zero + if (exeDevice.exeType == EXE_GPU_GFX && exeInfo.wallClockRate == 0) { + if (getenv("TB_WALLCLOCK_RATE")) { + exeInfo.wallClockRate = atoi(getenv("TB_WALLCLOCK_RATE")); + return {ERR_WARN, + "GPU %d wallclock rate query returned 0 unexpectedly. 
+      } else {
+        exeInfo.wallClockRate = 100000;
+        return {ERR_WARN,
+                "GPU %d wallclock rate query returned 0 unexpectedly. Setting to %d instead. Use TB_WALLCLOCK_RATE to customize",
+                exeDevice.exeIndex, exeInfo.wallClockRate};
+      }
+    }
+
     return ERR_NONE;
   }
 
@@ -3778,14 +4315,30 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
       // Deallocate source memory
       for (int iSrc = 0; iSrc < t.srcs.size(); ++iSrc) {
         if (t.srcs[iSrc].memRank == localRank) {
-          ERR_CHECK(DeallocateMemory(t.srcs[iSrc].memType, rss.srcMem[iSrc], t.numBytes + cfg.data.byteOffset));
+          ERR_CHECK(DeallocateMemory(t.srcs[iSrc].memType, rss.srcMem[iSrc],
+                                     rss.srcActualBytes[iSrc],
+                                     &rss.srcMemHandle[iSrc]));
+        } else if (exeDevice.exeRank == localRank && rss.srcMemHandle[iSrc] != 0) {
+#ifdef POD_COMM_ENABLED
+          ERR_CHECK(hipMemUnmap((gpu_device_ptr)rss.srcMem[iSrc], rss.srcActualBytes[iSrc]));
+          ERR_CHECK(hipMemRelease(rss.srcMemHandle[iSrc]));
+          ERR_CHECK(hipMemAddressFree((gpu_device_ptr)rss.srcMem[iSrc], rss.srcActualBytes[iSrc]));
+#endif
         }
       }
 
       // Deallocate destination memory
       for (int iDst = 0; iDst < t.dsts.size(); ++iDst) {
         if (t.dsts[iDst].memRank == localRank) {
-          ERR_CHECK(DeallocateMemory(t.dsts[iDst].memType, rss.dstMem[iDst], t.numBytes + cfg.data.byteOffset));
+          ERR_CHECK(DeallocateMemory(t.dsts[iDst].memType, rss.dstMem[iDst],
+                                     rss.dstActualBytes[iDst],
+                                     &rss.dstMemHandle[iDst]));
+        } else if (exeDevice.exeRank == localRank && rss.dstMemHandle[iDst] != 0) {
+#ifdef POD_COMM_ENABLED
+          ERR_CHECK(hipMemUnmap((gpu_device_ptr)rss.dstMem[iDst], rss.dstActualBytes[iDst]));
+          ERR_CHECK(hipMemRelease(rss.dstMemHandle[iDst]));
+          ERR_CHECK(hipMemAddressFree((gpu_device_ptr)rss.dstMem[iDst], rss.dstActualBytes[iDst]));
+#endif
         }
       }
 
@@ -3805,7 +4358,8 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
     }
 
     // Teardown additional requirements for GPU-based executors
-    if ((exeDevice.exeType == EXE_GPU_GFX || exeDevice.exeType == EXE_GPU_DMA) && exeDevice.exeRank == localRank) {
+    if ((exeDevice.exeType == EXE_GPU_GFX || exeDevice.exeType == EXE_GPU_DMA || exeDevice.exeType == EXE_GPU_BDMA)
+        && exeDevice.exeRank == localRank) {
       for (auto stream : exeInfo.streams)
         ERR_CHECK(hipStreamDestroy(stream));
       if (cfg.gfx.useHipEvents || cfg.dma.useHipEvents) {
@@ -3855,7 +4409,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
 
       // Add a dummy check to ensure the read is not optimized out
       if (sum != sum) {
-        printf("[ERROR] Nan detected\n");
+        System::Get().Log("[ERROR] Nan detected\n");
       }
     } else {
       for (int i = 0; i < numDsts; ++i)
@@ -3977,18 +4531,24 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
     }
     // poll for completions
     size_t completedTransfers = 0;
+    int pollBatch = std::max(1, cfg.nic.cqPollBatch);
+    std::vector<ibv_wc> wc((size_t)pollBatch);
+    ibv_wc* wc_array = wc.data();
     while (completedTransfers < transferCount) {
       for (auto i = 0; i < transferCount; i++) {
         if (receivedQPs[i] < exeInfo.resources[i].qpCount) {
           auto& rss = exeInfo.resources[i];
           // Poll the completion queue until all queue pairs are complete
           // The order of completion doesn't matter because this completion queue is dedicated to this Transfer
-          ibv_wc wc;
-          int nc = ibv_poll_cq(rss.srcIsExeNic ? rss.srcCompQueue : rss.dstCompQueue, 1, &wc);
+          // Use batch polling to drain multiple completions at once for better efficiency
+          int nc = ibv_poll_cq(rss.srcIsExeNic ? rss.srcCompQueue : rss.dstCompQueue, pollBatch, wc_array);
           if (nc > 0) {
-            receivedQPs[i]++;
-            if (wc.status != IBV_WC_SUCCESS) {
-              return {ERR_FATAL, "Transfer %d: Received unsuccessful work completion [status code %d]", rss.transferIdx, wc.status};
+            // Process all completions in the batch
+            for (int j = 0; j < nc; j++) {
+              if (wc_array[j].status != IBV_WC_SUCCESS) {
+                return {ERR_FATAL, "Transfer %d: Received unsuccessful work completion [status code %d]", rss.transferIdx, wc_array[j].status};
+              }
+              receivedQPs[i]++;
+            }
           } else if (nc < 0) {
             return {ERR_FATAL, "Transfer %d: Received negative work completion", rss.transferIdx};
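
The batched drain pattern in the hunk above can be summarized in isolation as follows (a self-contained sketch assuming a single CQ and a caller-chosen batch size; error handling elided):

    #include <vector>
    #include <infiniband/verbs.h>

    // Drain up to `batch` completions per ibv_poll_cq call instead of one at a time.
    // Returns the number of successful completions, or -1 on any failure.
    int DrainCompletions(ibv_cq* cq, int batch)
    {
      std::vector<ibv_wc> wcs(batch);
      int done = 0;
      int nc;
      while ((nc = ibv_poll_cq(cq, batch, wcs.data())) > 0) {
        for (int j = 0; j < nc; j++)
          if (wcs[j].status != IBV_WC_SUCCESS) return -1;
        done += nc;
      }
      return (nc < 0) ? -1 : done;
    }
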
@@ -4143,7 +4703,166 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
     }
   }
 
-  // Kernel for GFX execution
+  // Simplified Kernel for GFX execution for copies only
+  template <int LAUNCH_BOUND, int UNROLL, typename PACKED_FLOAT, int TEMPORAL>
+  __global__ void __launch_bounds__(LAUNCH_BOUND)
+    GpuCopyKernel(SubExecParam* params, int seType, int waveOrder, int numSubIterations)
+  {
+    int64_t startCycle;
+    // For warp-level, each warp's first thread records timing; for threadblock-level, only first thread of block
+    bool shouldRecordTiming = (seType == 1) ? (threadIdx.x % warpSize == 0) : (threadIdx.x == 0);
+    if (shouldRecordTiming) startCycle = GetTimestamp();
+
+    // seType: 0=threadblock, 1=warp
+    int subExecIdx;
+    if (seType == 0) {
+      // Threadblock-level: each threadblock is a subexecutor
+      subExecIdx = blockIdx.y;
+    } else {
+      // Warp-level: each warp is a subexecutor
+      int warpIdx = threadIdx.x / warpSize;
+      int warpsPerBlock = blockDim.x / warpSize;
+      subExecIdx = blockIdx.y * warpsPerBlock + warpIdx;
+    }
+
+    SubExecParam& p = params[subExecIdx];
+
+    // For warp-level dispatch, inactive warps should return early
+    if (seType == 1 && p.N == 0) return;
+
+    // Filter by XCC
+#if !defined(__NVCC__)
+    int32_t xccId;
+    GetXccId(xccId);
+    if (p.preferredXccId != -1 && xccId != p.preferredXccId) return;
+#endif
+
+    // Collect data information
+    bool hasSrc = p.numSrcs > 0;
+    bool hasDst = p.numDsts > 0;
+    PACKED_FLOAT const* __restrict__ srcFloatPacked = (PACKED_FLOAT const*)p.src[0];
+    PACKED_FLOAT*       __restrict__ dstFloatPacked = (PACKED_FLOAT*)p.dst[0];
+
+    // Operate on wavefront granularity
+    int32_t const nTeams  = p.teamSize;  // Number of threadblocks working together on this subarray
+    int32_t const teamIdx = p.teamIdx;   // Index of this threadblock within the team
+    int32_t nWaves, waveIdx;
+    if (seType == 0) {
+      // Threadblock-level: all wavefronts in block work together
+      nWaves  = blockDim.x / warpSize;   // Number of wavefronts within this threadblock
+      waveIdx = threadIdx.x / warpSize;  // Index of this wavefront within the threadblock
+    } else {
+      // Warp-level: each warp works independently
+      nWaves  = 1;
+      waveIdx = 0;
+    }
+    int32_t const tIdx = threadIdx.x % warpSize;  // Thread index within wavefront
+
+    size_t const numPackedFloat = p.N / (sizeof(PACKED_FLOAT)/sizeof(float));
+
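+    // Aside: the waveOrder switch below chooses how the unroll (U), wavefront (W),
+    // and team/CU (C) dimensions are interleaved when walking the buffer. As a toy
+    // illustration of case 0 (U,W,C) with UNROLL=2, nWaves=2, nTeams=2 (values
+    // illustrative only, not part of the patch):
+    //   unrlStride = 1                  -> adjacent elements differ in unroll slot
+    //   waveStride = UNROLL        (=2) -> next wavefront starts 2 elements later
+    //   teamStride = UNROLL*nWaves (=4) -> next team starts 4 elements later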
+    int32_t teamStride, waveStride, unrlStride, teamStride2, waveStride2;
+    switch (waveOrder) {
+    case 0: /* U,W,C */ unrlStride = 1; waveStride = UNROLL; teamStride = UNROLL * nWaves; teamStride2 = nWaves; waveStride2 = 1;      break;
+    case 1: /* U,C,W */ unrlStride = 1; teamStride = UNROLL; waveStride = UNROLL * nTeams; teamStride2 = 1;      waveStride2 = nTeams; break;
+    case 2: /* W,U,C */ waveStride = 1; unrlStride = nWaves; teamStride = nWaves * UNROLL; teamStride2 = nWaves; waveStride2 = 1;      break;
+    case 3: /* W,C,U */ waveStride = 1; teamStride = nWaves; unrlStride = nWaves * nTeams; teamStride2 = nWaves; waveStride2 = 1;      break;
+    case 4: /* C,U,W */ teamStride = 1; unrlStride = nTeams; waveStride = nTeams * UNROLL; teamStride2 = 1;      waveStride2 = nTeams; break;
+    case 5: /* C,W,U */ teamStride = 1; waveStride = nTeams; unrlStride = nTeams * nWaves; teamStride2 = 1;      waveStride2 = nTeams; break;
+    }
+
+    int subIterations = 0;
+    while (1) {
+      // First loop: Each wavefront in the team works on UNROLL PACKED_FLOAT per thread
+      size_t const loop1Stride = nTeams * nWaves * UNROLL * warpSize;
+      size_t const loop1Limit  = numPackedFloat / loop1Stride * loop1Stride;
+      {
+        PACKED_FLOAT val[UNROLL];
+        if (!hasSrc) {
+          #pragma unroll
+          for (int u = 0; u < UNROLL; u++)
+            val[u] = MemsetVal<PACKED_FLOAT>();
+        }
+
+        for (size_t idx = (teamIdx * teamStride + waveIdx * waveStride) * warpSize + tIdx; idx < loop1Limit; idx += loop1Stride) {
+          // Read sources into memory and accumulate in registers
+          if (hasSrc) {
+            #pragma unroll
+            for (int u = 0; u < UNROLL; u++)
+              Load(&srcFloatPacked[idx + u * unrlStride * warpSize], val[u]);
+          }
+
+          // Write accumulation to all outputs
+          if (hasDst) {
+            #pragma unroll
+            for (int u = 0; u < UNROLL; u++)
+              Store(val[u], &dstFloatPacked[idx + u * unrlStride * warpSize]);
+          }
+        }
+      }
+
+      // Second loop: Deal with remaining PACKED_FLOAT
+      {
+        if (loop1Limit < numPackedFloat) {
+          PACKED_FLOAT val;
+          if (!hasSrc) val = MemsetVal<PACKED_FLOAT>();
+
+          size_t const loop2Stride = nTeams * nWaves * warpSize;
+          for (size_t idx = loop1Limit + (teamIdx * teamStride2 + waveIdx * waveStride2) * warpSize + tIdx;
+               idx < numPackedFloat; idx += loop2Stride) {
+            if (hasSrc) {
+              Load(&srcFloatPacked[idx], val);
+            }
+            if (hasDst) {
+              Store(val, &dstFloatPacked[idx]);
+            }
+          }
+        }
+      }
+
+      // Third loop: Deal with remaining floats
+      {
+        if (numPackedFloat * (sizeof(PACKED_FLOAT)/sizeof(float)) < p.N) {
+          float val;
+          if (!hasSrc) val = MemsetVal<float>();
+
+          size_t const loop3Stride = nTeams * nWaves * warpSize;
+          for (size_t idx = numPackedFloat * (sizeof(PACKED_FLOAT)/sizeof(float)) + (teamIdx * teamStride2 + waveIdx * waveStride2) * warpSize + tIdx;
+               idx < p.N; idx += loop3Stride) {
+            if (hasSrc) {
+              Load(&p.src[0][idx], val);
+            }
+
+            if (hasDst) {
+              Store(val, &p.dst[0][idx]);
+            }
+          }
+        }
+      }
+      // Allows for numSubIterations == 0 to run infinitely
+      if (++subIterations == numSubIterations) break;
+    }
+
+    // Wait for all threads to finish
+    if (seType == 1) {
+      // For warp-level, sync within warp only
+#if defined(__HIP_PLATFORM_AMD__) && (HIP_VERSION_MAJOR < 7)
+      __builtin_amdgcn_wave_barrier();
+#else
+      __syncwarp();
+#endif
+    } else {
+      // For threadblock-level, sync all threads
+      __syncthreads();
+    }
+
+    if (shouldRecordTiming) {
+      p.stopCycle  = GetTimestamp();
+      p.startCycle = startCycle;
+      GetHwId(p.hwId);
+      GetXccId(p.xccId);
+    }
+  }
+
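
Both GFX kernels record per-subexecutor start/stop timestamps; the host later converts cycle deltas into milliseconds using the wallclock rate queried at prepare time. A minimal sketch of that conversion (assuming, consistent with the 100000 fallback above, that wallClockRate is in kHz, i.e. ticks per millisecond):

    // Sketch: convert device wallclock ticks to milliseconds.
    double CyclesToMsec(int64_t startCycle, int64_t stopCycle, int wallClockRate)
    {
      return (stopCycle - startCycle) / static_cast<double>(wallClockRate);
    }
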
+  // Kernel for GFX execution
   template <int LAUNCH_BOUND, int UNROLL, typename PACKED_FLOAT, int TEMPORAL>
   __global__ void __launch_bounds__(LAUNCH_BOUND)
     GpuReduceKernel(SubExecParam* params, int seType, int waveOrder, int numSubIterations)
@@ -4295,26 +5014,25 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
         }
       }
     }
-
+    // Allows for numSubIterations == 0 to run infinitely
     if (++subIterations == numSubIterations) break;
   }
 
   // Wait for all threads to finish
   if (seType == 1) {
     // For warp-level, sync within warp only
-  #if defined(__HIP_PLATFORM_AMD__) && (HIP_VERSION_MAJOR < 7)
+#if defined(__HIP_PLATFORM_AMD__) && (HIP_VERSION_MAJOR < 7)
     __builtin_amdgcn_wave_barrier();
-  #else
+#else
     __syncwarp();
-  #endif
+#endif
   } else {
     // For threadblock-level, sync all threads
     __syncthreads();
   }
 
   if (shouldRecordTiming) {
-    __threadfence_system();
     p.stopCycle  = GetTimestamp();
     p.startCycle = startCycle;
     GetHwId(p.hwId);
@@ -4322,31 +5040,68 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
   }
 }
 
+  // Must match ordering in GfxKernelType
+#define GPU_KERNEL_KERNEL_DECL(LAUNCH_BOUND, UNROLL, DWORD, TEMPORAL)    \
+  {GpuReduceKernel<LAUNCH_BOUND, UNROLL, DWORD, TEMPORAL>,               \
+   GpuCopyKernel<LAUNCH_BOUND, UNROLL, DWORD, TEMPORAL>}
+
+  // Must match mapping in GetGpuKernelTemporalIdx
+  constexpr int KERN_TEMPORALS = 4;
 #define GPU_KERNEL_TEMPORAL_DECL(LAUNCH_BOUND, UNROLL, DWORD)            \
-  {GpuReduceKernel<LAUNCH_BOUND, UNROLL, DWORD, TEMPORAL_NONE>,          \
-   GpuReduceKernel<LAUNCH_BOUND, UNROLL, DWORD, TEMPORAL_LOAD>,          \
-   GpuReduceKernel<LAUNCH_BOUND, UNROLL, DWORD, TEMPORAL_STORE>,         \
-   GpuReduceKernel<LAUNCH_BOUND, UNROLL, DWORD, TEMPORAL_BOTH>}
+  {GPU_KERNEL_KERNEL_DECL(LAUNCH_BOUND, UNROLL, DWORD, TEMPORAL_NONE),   \
+   GPU_KERNEL_KERNEL_DECL(LAUNCH_BOUND, UNROLL, DWORD, TEMPORAL_LOAD),   \
+   GPU_KERNEL_KERNEL_DECL(LAUNCH_BOUND, UNROLL, DWORD, TEMPORAL_STORE),  \
+   GPU_KERNEL_KERNEL_DECL(LAUNCH_BOUND, UNROLL, DWORD, TEMPORAL_BOTH)}
+
+  int GetGpuKernelTemporalIdx(int temporalMode) {
+    if (temporalMode == TEMPORAL_NONE)  return 0;
+    if (temporalMode == TEMPORAL_LOAD)  return 1;
+    if (temporalMode == TEMPORAL_STORE) return 2;
+    if (temporalMode == TEMPORAL_BOTH)  return 3;
+    return -1;
+  }
 
+  // Must match mapping in GetGpuKernelWordsizeIdx
+  constexpr int KERN_WORDSIZES = 3;
 #define GPU_KERNEL_DWORD_DECL(LAUNCH_BOUND, UNROLL)           \
   {GPU_KERNEL_TEMPORAL_DECL(LAUNCH_BOUND, UNROLL, float),     \
    GPU_KERNEL_TEMPORAL_DECL(LAUNCH_BOUND, UNROLL, float2),    \
    GPU_KERNEL_TEMPORAL_DECL(LAUNCH_BOUND, UNROLL, float4)}
 
-#define GPU_KERNEL_UNROLL_DECL(LAUNCH_BOUND)    \
-  {GPU_KERNEL_DWORD_DECL(LAUNCH_BOUND, 1),      \
-   GPU_KERNEL_DWORD_DECL(LAUNCH_BOUND, 2),      \
-   GPU_KERNEL_DWORD_DECL(LAUNCH_BOUND, 3),      \
-   GPU_KERNEL_DWORD_DECL(LAUNCH_BOUND, 4),      \
-   GPU_KERNEL_DWORD_DECL(LAUNCH_BOUND, 5),      \
-   GPU_KERNEL_DWORD_DECL(LAUNCH_BOUND, 6),      \
-   GPU_KERNEL_DWORD_DECL(LAUNCH_BOUND, 7),      \
-   GPU_KERNEL_DWORD_DECL(LAUNCH_BOUND, 8)}
-
-  // Table of all GPU Reduction kernel functions (templated blocksize / unroll / dword size / temporal)
+  int GetGpuKernelWordsizeIdx(int wordsize) {
+    if (wordsize == 1) return 0;
+    if (wordsize == 2) return 1;
+    if (wordsize == 4) return 2;
+    return -1;
+  }
+
+  // Must match mapping in GetGpuKernelUnrollIdx
+  constexpr int KERN_UNROLLS = 10;
+#define GPU_KERNEL_UNROLL_DECL(LAUNCH_BOUND)    \
+  {GPU_KERNEL_DWORD_DECL(LAUNCH_BOUND, 1),      \
+   GPU_KERNEL_DWORD_DECL(LAUNCH_BOUND, 2),      \
+   GPU_KERNEL_DWORD_DECL(LAUNCH_BOUND, 3),      \
+   GPU_KERNEL_DWORD_DECL(LAUNCH_BOUND, 4),      \
+   GPU_KERNEL_DWORD_DECL(LAUNCH_BOUND, 5),      \
+   GPU_KERNEL_DWORD_DECL(LAUNCH_BOUND, 6),      \
+   GPU_KERNEL_DWORD_DECL(LAUNCH_BOUND, 7),      \
+   GPU_KERNEL_DWORD_DECL(LAUNCH_BOUND, 8),      \
+   GPU_KERNEL_DWORD_DECL(LAUNCH_BOUND, 16),     \
+   GPU_KERNEL_DWORD_DECL(LAUNCH_BOUND, 32)}
+
+  // Must match the unroll mapping in GPU_KERNEL_UNROLL_DECL
+  int GetGpuKernelUnrollIdx(int unroll) {
+    if (1 <= unroll && unroll <= 8) return unroll - 1;
+    if (unroll == 16) return 8;
+    if (unroll == 32) return 9;
+    return -1;
+  }
+
+  // Table of all GPU Reduction kernel functions (templated blocksize / unroll / dword size / temporal / kernel)
   typedef void (*GpuKernelFuncPtr)(SubExecParam*, int, int, int);
+  constexpr int KERN_BOUNDS = 4;
 #ifndef SINGLE_KERNEL
-  GpuKernelFuncPtr GpuKernelTable[4][MAX_UNROLL][3][4] =
+  GpuKernelFuncPtr GpuKernelsTable[KERN_BOUNDS][KERN_UNROLLS][KERN_WORDSIZES][KERN_TEMPORALS][NUM_GFX_KERNELS] =
   {
     GPU_KERNEL_UNROLL_DECL(256),
     GPU_KERNEL_UNROLL_DECL(512),
@@ -4358,65 +5113,83 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
 #undef GPU_KERNEL_UNROLL_DECL
 #undef GPU_KERNEL_DWORD_DECL
 #undef GPU_KERNEL_TEMPORAL_DECL
-  #undef GPU_KERNEL_SE_TYPE_DECL
+
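
Putting the index helpers together (including GetGpuKernelBlocksizeIdx, defined just below), a lookup for one hypothetical configuration resolves as follows; the concrete values are illustrative only:

    // Illustrative lookup into the 5-D kernel table:
    // blockSize=512, unroll=16, wordSize=4 (float4), temporal load-only, copy kernel.
    int b = GetGpuKernelBlocksizeIdx(512);           // (512+255)/256 - 1 == 1
    int u = GetGpuKernelUnrollIdx(16);               // 16 maps to slot 8
    int w = GetGpuKernelWordsizeIdx(4);              // float4 maps to slot 2
    int t = GetGpuKernelTemporalIdx(TEMPORAL_LOAD);  // slot 1
    GpuKernelFuncPtr k = GpuKernelsTable[b][u][w][t][GFX_KERNEL_COPY];
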
+  int GetGpuKernelBlocksizeIdx(int blocksize) {
+    return (blocksize + 255) / 256 - 1;
+  }
 
   // Execute a single GPU Transfer (when using 1 stream per Transfer)
   static ErrResult ExecuteGpuTransfer(int const iteration,
+                                      int const exeTotalSubExecs,
+                                      SubExecParam* exeSubExecParam,
                                       hipStream_t const stream,
                                       hipEvent_t const startEvent,
                                       hipEvent_t const stopEvent,
                                       int const xccDim,
                                       ConfigOptions const& cfg,
+                                      int const gfxKernelIdx,
                                       TransferResources& rss)
   {
-    auto cpuStart = std::chrono::high_resolution_clock::now();
-
-    int numSubExecs = rss.subExecParamCpu.size();
-    int gridY = CalculateGridY(cfg.gfx.seType, cfg.gfx.blockSize, numSubExecs);
-    dim3 const gridSize(xccDim, gridY, 1);
-    dim3 const blockSize(cfg.gfx.blockSize, 1);
+    // Determine which kernel to launch
+    int const blockSizeIdx = GetGpuKernelBlocksizeIdx(cfg.gfx.blockSize);
+    int const unrollIdx    = GetGpuKernelUnrollIdx(cfg.gfx.unrollFactor);
+    int const wordSizeIdx  = GetGpuKernelWordsizeIdx(cfg.gfx.wordSize);
+    int const temporalIdx  = GetGpuKernelTemporalIdx(cfg.gfx.temporalMode);
 
-    int wordSizeIdx = cfg.gfx.wordSize == 1 ? 0 :
-                      cfg.gfx.wordSize == 2 ? 1 :
-                                              2;
 #ifdef SINGLE_KERNEL
-    auto gpuKernel = GpuReduceKernel;
+    auto gpuKernel = GpuReduceKernel;
 #else
-    auto gpuKernel = GpuKernelTable[(cfg.gfx.blockSize+255)/256 - 1][cfg.gfx.unrollFactor - 1][wordSizeIdx][cfg.gfx.temporalMode];
+    auto gpuKernel = GpuKernelsTable[blockSizeIdx][unrollIdx][wordSizeIdx][temporalIdx][gfxKernelIdx];
 #endif
 
+    // Compute kernel launch parameters
+    int const numSubExecs = cfg.gfx.useMultiStream ? rss.subExecParamCpu.size() : exeTotalSubExecs;
+    int const gridY = CalculateGridY(cfg.gfx.seType, cfg.gfx.blockSize, numSubExecs);
+    dim3 const gridSize(xccDim, gridY, 1);
+    dim3 const blockSize(cfg.gfx.blockSize);
+
+    auto cpuStart = std::chrono::high_resolution_clock::now();
+
+    SubExecParam* params = cfg.gfx.useMultiStream ? rss.subExecParamGpuPtr : exeSubExecParam;
+
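+    // Aside: a minimal sketch of the HIP-event timing pattern used below
+    // (events here are illustrative locals; the real code reuses per-executor events):
+    //   hipEvent_t t0, t1;
+    //   ERR_CHECK(hipEventCreate(&t0));  ERR_CHECK(hipEventCreate(&t1));
+    //   ERR_CHECK(hipEventRecord(t0, stream));
+    //   /* enqueue kernel or copies on `stream` */
+    //   ERR_CHECK(hipEventRecord(t1, stream));
+    //   ERR_CHECK(hipStreamSynchronize(stream));
+    //   float msec;  ERR_CHECK(hipEventElapsedTime(&msec, t0, t1));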
 #if defined(__NVCC__)
-    if (startEvent != NULL)
+    if (cfg.gfx.useHipEvents)
       ERR_CHECK(hipEventRecord(startEvent, stream));
-    gpuKernel<<<gridSize, blockSize, 0, stream>>>(rss.subExecParamGpuPtr, cfg.gfx.seType, cfg.gfx.waveOrder, cfg.general.numSubIterations);
-    if (stopEvent != NULL)
+    gpuKernel<<<gridSize, blockSize, 0, stream>>>(params, cfg.gfx.seType, cfg.gfx.waveOrder, cfg.general.numSubIterations);
+    if (cfg.gfx.useHipEvents)
       ERR_CHECK(hipEventRecord(stopEvent, stream));
 #else
-    hipExtLaunchKernelGGL(gpuKernel, gridSize, blockSize, 0, stream, startEvent, stopEvent,
-                          0, rss.subExecParamGpuPtr, cfg.gfx.seType, cfg.gfx.waveOrder, cfg.general.numSubIterations);
+    hipExtLaunchKernelGGL(gpuKernel, gridSize, blockSize, 0, stream,
+                          cfg.gfx.useHipEvents ? startEvent : NULL,
+                          cfg.gfx.useHipEvents ? stopEvent : NULL, 0,
+                          params, cfg.gfx.seType, cfg.gfx.waveOrder, cfg.general.numSubIterations);
 #endif
     ERR_CHECK(hipStreamSynchronize(stream));
 
-    auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
-    double cpuDeltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0 / cfg.general.numSubIterations;
-
-    if (iteration >= 0) {
-      double deltaMsec = cpuDeltaMsec;
-      if (startEvent != NULL) {
-        float gpuDeltaMsec;
-        ERR_CHECK(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent));
-        deltaMsec = gpuDeltaMsec / cfg.general.numSubIterations;
-      }
-      rss.totalDurationMsec += deltaMsec;
-      if (cfg.general.recordPerIteration) {
-        rss.perIterMsec.push_back(deltaMsec);
-        std::set<std::pair<int, int>> CUs;
-        for (int i = 0; i < numSubExecs; i++) {
-          CUs.insert(std::make_pair(rss.subExecParamGpuPtr[i].xccId,
-                                    GetId(rss.subExecParamGpuPtr[i].hwId)));
-        }
-        rss.perIterCUs.push_back(CUs);
-      }
-    }
+    // Record this timing if this Transfer is being run in multistream mode
+    if (cfg.gfx.useMultiStream) {
+      if (iteration >= 0) {
+        auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
+        double cpuDeltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0 / cfg.general.numSubIterations;
+
+        double deltaMsec = cpuDeltaMsec;
+        if (startEvent != NULL) {
+          float gpuDeltaMsec;
+          ERR_CHECK(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent));
+          deltaMsec = gpuDeltaMsec / cfg.general.numSubIterations;
+        }
+
+        rss.totalDurationMsec += deltaMsec;
+        if (cfg.general.recordPerIteration) {
+          rss.perIterMsec.push_back(deltaMsec);
+          std::set<std::pair<int, int>> CUs;
+          for (int i = 0; i < numSubExecs; i++) {
+            CUs.insert(std::make_pair(rss.subExecParamGpuPtr[i].xccId,
+                                      GetId(rss.subExecParamGpuPtr[i].hwId)));
+          }
+          rss.perIterCUs.push_back(CUs);
+        }
+      }
+    }
     return ERR_NONE;
@@ -4434,72 +5207,56 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
     int xccDim = exeInfo.useSubIndices ? exeInfo.numSubIndices : 1;
 
     if (cfg.gfx.useMultiStream) {
-      // Launch each Transfer separately in its own stream
+      // Launch one thread per Transfer in separate streams
       vector<std::future<ErrResult>> asyncTransfers;
       for (int i = 0; i < exeInfo.streams.size(); i++) {
         asyncTransfers.emplace_back(std::async(std::launch::async,
                                                ExecuteGpuTransfer,
                                                iteration,
+                                               exeInfo.totalSubExecs,
+                                               exeInfo.subExecParamGpu,
                                                exeInfo.streams[i],
                                                cfg.gfx.useHipEvents ? exeInfo.startEvents[i] : NULL,
                                                cfg.gfx.useHipEvents ? exeInfo.stopEvents[i] : NULL,
                                                xccDim,
                                                std::cref(cfg),
+                                               exeInfo.gfxKernelToUse,
                                                std::ref(exeInfo.resources[i])));
       }
       for (auto& asyncTransfer : asyncTransfers)
         ERR_CHECK(asyncTransfer.get());
     } else {
-      // Combine all the Transfers into a single kernel launch
-      int numSubExecs = exeInfo.totalSubExecs;
-      int gridY = CalculateGridY(cfg.gfx.seType, cfg.gfx.blockSize, numSubExecs);
-      dim3 const gridSize(xccDim, gridY, 1);
-      dim3 const blockSize(cfg.gfx.blockSize, 1);
-      hipStream_t stream = exeInfo.streams[0];
-
-      int wordSizeIdx = cfg.gfx.wordSize == 1 ? 0 :
-                        cfg.gfx.wordSize == 2 ? 1 :
-                                                2;
-#ifdef SINGLE_KERNEL
-      auto gpuKernel = GpuReduceKernel;
-#else
-      auto gpuKernel = GpuKernelTable[(cfg.gfx.blockSize+255)/256 - 1][cfg.gfx.unrollFactor - 1][wordSizeIdx][cfg.gfx.temporalMode];
-#endif
-
-#if defined(__NVCC__)
-      if (cfg.gfx.useHipEvents)
-        ERR_CHECK(hipEventRecord(exeInfo.startEvents[0], stream));
-      gpuKernel<<<gridSize, blockSize, 0, stream>>>(exeInfo.subExecParamGpu, cfg.gfx.seType, cfg.gfx.waveOrder, cfg.general.numSubIterations);
-      if (cfg.gfx.useHipEvents)
-        ERR_CHECK(hipEventRecord(exeInfo.stopEvents[0], stream));
-#else
-      hipExtLaunchKernelGGL(gpuKernel, gridSize, blockSize, 0, stream,
-                            cfg.gfx.useHipEvents ? exeInfo.startEvents[0] : NULL,
-                            cfg.gfx.useHipEvents ? exeInfo.stopEvents[0] : NULL, 0,
-                            exeInfo.subExecParamGpu, cfg.gfx.seType, cfg.gfx.waveOrder, cfg.general.numSubIterations);
-#endif
-      ERR_CHECK(hipStreamSynchronize(stream));
+      // Launch all Transfers in one kernel launch (avoid extra thread creation)
+      ExecuteGpuTransfer(iteration, exeInfo.totalSubExecs, exeInfo.subExecParamGpu, exeInfo.streams[0],
+                         cfg.gfx.useHipEvents ? exeInfo.startEvents[0] : NULL,
+                         cfg.gfx.useHipEvents ? exeInfo.stopEvents[0] : NULL,
+                         xccDim, cfg, exeInfo.gfxKernelToUse, exeInfo.resources[0]);
     }
+
     auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
-    double cpuDeltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0
-                          / cfg.general.numSubIterations;
 
     if (iteration >= 0) {
+      // Determine executor timing
+      // - Use HIP event timing if enabled and not using multi-stream
+      // - Otherwise, use CPU timing
       if (cfg.gfx.useHipEvents && !cfg.gfx.useMultiStream) {
         float gpuDeltaMsec;
         ERR_CHECK(hipEventElapsedTime(&gpuDeltaMsec, exeInfo.startEvents[0], exeInfo.stopEvents[0]));
         gpuDeltaMsec /= cfg.general.numSubIterations;
         exeInfo.totalDurationMsec += gpuDeltaMsec;
       } else {
+        double cpuDeltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0
                              / cfg.general.numSubIterations;
         exeInfo.totalDurationMsec += cpuDeltaMsec;
       }
 
+      // If Transfers were combined into a single launch, figure out per-Transfer timing
       // Determine timing for each of the individual transfers that were part of this launch
       if (!cfg.gfx.useMultiStream) {
         for (int i = 0; i < exeInfo.resources.size(); i++) {
           TransferResources& rss = exeInfo.resources[i];
-          long long minStartCycle = std::numeric_limits<long long>::max();
-          long long maxStopCycle  = std::numeric_limits<long long>::min();
+          int64_t minStartCycle = std::numeric_limits<int64_t>::max();
+          int64_t maxStopCycle  = std::numeric_limits<int64_t>::min();
           std::set<std::pair<int, int>> CUs;
 
           for (auto subExecIdx : rss.subExecIdx) {
@@ -4510,6 +5267,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
                                         GetId(exeInfo.subExecParamGpu[subExecIdx].hwId)));
             }
           }
+
           double deltaMsec = (maxStopCycle - minStartCycle) / (double)(exeInfo.wallClockRate);
           deltaMsec /= cfg.general.numSubIterations;
           rss.totalDurationMsec += deltaMsec;
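
From these per-Transfer durations the client derives bandwidth figures. The conversion itself is straightforward (a sketch; the exact reporting and rounding live in the client code, not in this header):

    // Sketch: convert a per-iteration duration in milliseconds to GB/s.
    double BandwidthGBs(size_t numBytes, double deltaMsec)
    {
      return (numBytes / 1.0e9) / (deltaMsec / 1.0e3);
    }
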
@@ -4529,6 +5287,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
   // Execute a single DMA Transfer
   static ErrResult ExecuteDmaTransfer(int const iteration,
                                       bool const useSubIndices,
+                                      int const exeIndex,
                                       hipStream_t const stream,
                                       hipEvent_t const startEvent,
                                       hipEvent_t const stopEvent,
@@ -4537,15 +5296,31 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
   {
     auto cpuStart = std::chrono::high_resolution_clock::now();
 
+    int numDsts = (int)resources.dstMem.size();
+    ERR_CHECK(hipSetDevice(exeIndex));
     int subIterations = 0;
     if (!useSubIndices && !cfg.dma.useHsaCopy) {
       if (cfg.dma.useHipEvents)
         ERR_CHECK(hipEventRecord(startEvent, stream));
 
-      // Use hipMemcpy
+      // Force the use of the SDMA engine if possible
+#if defined(__HIP_PLATFORM_AMD__) && defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR >= 6)
+      hipMemcpyKind memcpyKind = hipMemcpyDeviceToDeviceNoCU;
+#else
+      hipMemcpyKind memcpyKind = hipMemcpyDefault;
+#endif
+
+      // Use DMA copy engine
       do {
-        ERR_CHECK(hipMemcpyAsync(resources.dstMem[0], resources.srcMem[0], resources.numBytes,
-                                 hipMemcpyDefault, stream));
+        // Queue for each output location
+        for (int dstIdx = 0; dstIdx < numDsts; dstIdx++) {
+#if defined(CUMEM_ENABLED)
+          ERR_CHECK(cuMemcpyAsync((CUdeviceptr)resources.dstMem[dstIdx],
+                                  (CUdeviceptr)resources.srcMem[0],
+                                  resources.numBytes, stream));
+#else
+          ERR_CHECK(hipMemcpyAsync(resources.dstMem[dstIdx], resources.srcMem[0], resources.numBytes,
+                                   memcpyKind, stream));
+#endif
+        }
       } while (++subIterations != cfg.general.numSubIterations);
 
       if (cfg.dma.useHipEvents)
@@ -4557,20 +5332,22 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
 #else
       // Use HSA async copy
       do {
-        hsa_signal_store_screlease(resources.signal, 1);
-        if (!useSubIndices) {
-          ERR_CHECK(hsa_amd_memory_async_copy(resources.dstMem[0], resources.dstAgent,
-                                              resources.srcMem[0], resources.srcAgent,
-                                              resources.numBytes, 0, NULL,
-                                              resources.signal));
-        } else {
-          HSA_CALL(hsa_amd_memory_async_copy_on_engine(resources.dstMem[0], resources.dstAgent,
-                                                       resources.srcMem[0], resources.srcAgent,
-                                                       resources.numBytes, 0, NULL,
-                                                       resources.signal,
-                                                       resources.sdmaEngineId, true));
+        hsa_signal_store_screlease(resources.signal, numDsts);
+        for (int dstIdx = 0; dstIdx < numDsts; dstIdx++) {
+          if (!useSubIndices) {
+            ERR_CHECK(hsa_amd_memory_async_copy(resources.dstMem[dstIdx], resources.dstAgent[dstIdx],
+                                                resources.srcMem[0], resources.srcAgent,
+                                                resources.numBytes, 0, NULL,
+                                                resources.signal));
+          } else {
+            HSA_CALL(hsa_amd_memory_async_copy_on_engine(resources.dstMem[dstIdx], resources.dstAgent[dstIdx],
+                                                         resources.srcMem[0], resources.srcAgent,
                                                         resources.numBytes, 0, NULL,
+                                                         resources.signal,
+                                                         resources.sdmaEngineId, true));
+          }
         }
-        // Wait for SDMA transfer to complete
+        // Wait for SDMA transfer(s) to complete
         while(hsa_signal_wait_scacquire(resources.signal,
                                         HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX,
                                         HSA_WAIT_STATE_ACTIVE) >= 1);
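
The HSA path above counts completions with a single signal: it is initialized to numDsts, each async copy decrements it by one on completion, and the host waits until it drops below 1. In isolation the pattern looks like this (a sketch; the signal, agents, and buffers are assumed to have been created elsewhere):

    // Sketch: one HSA signal tracking N asynchronous copies.
    hsa_signal_store_screlease(signal, numCopies);           // value = outstanding copies
    for (int i = 0; i < numCopies; i++)
      hsa_amd_memory_async_copy(dst[i], dstAgent[i], src, srcAgent,
                                numBytes, 0, NULL, signal);  // each completion decrements
    while (hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, 1,
                                     UINT64_MAX, HSA_WAIT_STATE_ACTIVE) >= 1);
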
@@ -4609,6 +5386,93 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
                                              ExecuteDmaTransfer,
                                              iteration,
                                              exeInfo.useSubIndices,
+                                             exeIndex,
+                                             exeInfo.streams[i],
+                                             cfg.dma.useHipEvents ? exeInfo.startEvents[i] : NULL,
+                                             cfg.dma.useHipEvents ? exeInfo.stopEvents[i] : NULL,
+                                             std::cref(cfg),
+                                             std::ref(exeInfo.resources[i])));
+    }
+
+    for (auto& asyncTransfer : asyncTransfers)
+      ERR_CHECK(asyncTransfer.get());
+
+    auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
+    double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0 / cfg.general.numSubIterations;
+    if (iteration >= 0)
+      exeInfo.totalDurationMsec += deltaMsec;
+    return ERR_NONE;
+  }
+
+// BMA Executor-related functions
+//========================================================================================
+#ifdef BMA_EXEC_ENABLED
+  // Execute a single BMA Transfer (one hipMemcpyBatchAsync per sub-iteration; each subexecutor is one batch entry)
+  static ErrResult ExecuteBatchDmaTransfer(int const iteration,
+                                           int const exeIndex,
+                                           hipStream_t const stream,
+                                           hipEvent_t const startEvent,
+                                           hipEvent_t const stopEvent,
+                                           ConfigOptions const& cfg,
+                                           TransferResources& resources)
+  {
+    auto cpuStart = std::chrono::high_resolution_clock::now();
+
+    ERR_CHECK(hipSetDevice(exeIndex));
+
+    int subIterations = 0;
+    if (cfg.dma.useHipEvents)
+      ERR_CHECK(hipEventRecord(startEvent, stream));
+
+    [[maybe_unused]] size_t failIdx = 0;
+    do {
+      ERR_CHECK(hipMemcpyBatchAsync(resources.batchDsts.data(),
+                                    resources.batchSrcs.data(),
+                                    resources.batchBytes.data(),
+                                    resources.batchDsts.size(),
+                                    nullptr, nullptr, 0,
+                                    // In CUDA 13.0 the failIdx argument was removed from the original CUDA 12.8 API call
+#if !defined(__NVCC__) || (defined(CUDA_VERSION) && (CUDA_VERSION < 13000))
+                                    &failIdx,
+#endif
+                                    stream));
+    } while (++subIterations != cfg.general.numSubIterations);
+
+    if (cfg.dma.useHipEvents)
+      ERR_CHECK(hipEventRecord(stopEvent, stream));
+    ERR_CHECK(hipStreamSynchronize(stream));
+
+    auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
+    double cpuDeltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0 / cfg.general.numSubIterations;
+
+    if (iteration >= 0) {
+      double deltaMsec = cpuDeltaMsec;
+      if (cfg.dma.useHipEvents) {
+        float gpuDeltaMsec;
+        ERR_CHECK(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent));
+        deltaMsec = gpuDeltaMsec / cfg.general.numSubIterations;
+      }
+      resources.totalDurationMsec += deltaMsec;
+      if (cfg.general.recordPerIteration)
+        resources.perIterMsec.push_back(deltaMsec);
+    }
+    return ERR_NONE;
+  }
+
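
hipMemcpyBatchAsync gathers many independent copies into one submission. A usage sketch matching the call shape above (the buffer vectors are hypothetical, and the failIdx parameter follows the pre-CUDA-13 signature used here):

    // Sketch: batch several independent copies in a single call.
    // dsts/srcs/sizes are assumed to hold valid pointers and byte counts.
    size_t failIdx = 0;
    ERR_CHECK(hipMemcpyBatchAsync(dsts.data(), srcs.data(), sizes.data(), dsts.size(),
                                  nullptr, nullptr, 0,  // no per-copy attributes
                                  &failIdx,             // index of first failing copy, if any
                                  stream));
    ERR_CHECK(hipStreamSynchronize(stream));
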
+  static ErrResult RunBmaExecutor(int const iteration,
+                                  ConfigOptions const& cfg,
+                                  int const exeIndex,
+                                  ExeInfo& exeInfo)
+  {
+    auto cpuStart = std::chrono::high_resolution_clock::now();
+    ERR_CHECK(hipSetDevice(exeIndex));
+
+    vector<std::future<ErrResult>> asyncTransfers;
+    for (int i = 0; i < exeInfo.resources.size(); i++) {
+      asyncTransfers.emplace_back(std::async(std::launch::async,
+                                             ExecuteBatchDmaTransfer,
+                                             iteration,
+                                             exeIndex,
+                                             exeInfo.streams[i],
+                                             cfg.dma.useHipEvents ? exeInfo.startEvents[i] : NULL,
+                                             cfg.dma.useHipEvents ? exeInfo.stopEvents[i] : NULL,
@@ -4625,6 +5489,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
     exeInfo.totalDurationMsec += deltaMsec;
     return ERR_NONE;
   }
+#endif // BMA_EXEC_ENABLED
 
 // Executor-related functions
 //========================================================================================
@@ -4634,16 +5499,32 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
                                ExeInfo& exeInfo)
   {
     switch (exeDevice.exeType) {
-    case EXE_CPU:      return RunCpuExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo);
-    case EXE_GPU_GFX:  return RunGpuExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo);
-    case EXE_GPU_DMA:  return RunDmaExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo);
+    case EXE_CPU:      return RunCpuExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo);
+    case EXE_GPU_GFX:  return RunGpuExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo);
+    case EXE_GPU_DMA:  return RunDmaExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo);
 #ifdef NIC_EXEC_ENABLED
-    case EXE_NIC:      return RunNicExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo);
+    case EXE_NIC:      return RunNicExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo);
+#endif
+#ifdef BMA_EXEC_ENABLED
+    case EXE_GPU_BDMA: return RunBmaExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo);
 #endif
-    default:           return {ERR_FATAL, "Unsupported executor (%d)", exeDevice.exeType};
+    default:           return {ERR_FATAL, "Unsupported executor (%d)", exeDevice.exeType};
     }
   }
 
+#if defined(__NVCC__)
+  static bool MnnvlCheck() {
+    int flag = 0;
+#ifdef POD_COMM_ENABLED
+    CUresult err = cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, 0);
+    if (err || !flag) return false;
+    err = cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, 0);
+#endif
+    if (!flag) return false;
+    return true;
+  }
+#endif
+
 } // End of anonymous namespace
 //========================================================================================
 /// @endcond
@@ -4657,7 +5538,11 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
       this->errMsg  = "";
     } else {
       this->errType = ERR_FATAL;
+#if defined(__NVCC__)
+      this->errMsg  = std::string("CUDA Runtime Error: ") + hipGetErrorString(err);
+#else
       this->errMsg  = std::string("HIP Error: ") + hipGetErrorString(err);
+#endif
     }
   }
 
@@ -4674,6 +5559,21 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
       this->errMsg  = std::string("HSA Error: ") + errString;
     }
   }
+#elif defined(CUMEM_ENABLED)
+  ErrResult::ErrResult(CUresult err)
+  {
+    if (err == CUDA_SUCCESS) {
+      this->errType = ERR_NONE;
+      this->errMsg  = "";
+    } else {
+      const char *errString = NULL, *errName = NULL;
+      cuGetErrorName(err, &errName);
+      cuGetErrorString(err, &errString);
+      this->errType = ERR_FATAL;
+      this->errMsg  = std::string("CUDA Driver Error: ") + errName +
+                      " (" + errString + ")";
+    }
+  }
 #endif
 
   ErrResult::ErrResult(ErrType errType, const char* format, ...)
@@ -4715,6 +5615,9 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
       return false;
     }
 
+    // Log transfers (if requested)
+    System::Get().LogTransfers(transfers);
+
     // Collect up transfers by executor
     int minNumSrcs = MAX_SRCS + 1;
     int maxNumSrcs = 0;
@@ -4756,6 +5659,11 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
       if (exeDevice.exeRank == localRank) {
        localExecutors.push_back(exeDevice);
       }
+
+      // Select which GFX kernel to use for this executor
+      if (exeDevice.exeType == EXE_GPU_GFX) {
+        ERR_APPEND(SelectGfxKernel(cfg, transfers, exeInfo), errResults);
+      }
     }
 
     // Prepare reference src/dst arrays - only once for largest size
@@ -4782,6 +5690,9 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
       Transfer const& t = transfers[resource->transferIdx];
       for (int srcIdx = 0; srcIdx < resource->srcMem.size(); srcIdx++) {
         if (t.srcs[srcIdx].memRank == localRank) {
+          if (IsGpuMemType(t.srcs[srcIdx].memType)) {
+            ERR_APPEND(hipSetDevice(t.srcs[srcIdx].memIndex), errResults);
+          }
           ERR_APPEND(hipMemcpy(resource->srcMem[srcIdx] + initOffset,
                                srcReference[srcIdx].data(), resource->numBytes, hipMemcpyDefault), errResults);
         }
@@ -4792,22 +5703,22 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
     // Pause before starting when running in interactive mode
     if (cfg.general.useInteractive) {
       if (localRank == 0) {
-        printf("Memory prepared:\n");
+        System::Get().Log("Memory prepared:\n");
 
         for (int i = 0; i < transfers.size(); i++) {
-          printf("Transfer %03d:\n", i);
+          System::Get().Log("Transfer %03d:\n", i);
           for (int iSrc = 0; iSrc < transfers[i].srcs.size(); ++iSrc)
-            printf("  SRC %0d: %p\n", iSrc, transferResources[i]->srcMem[iSrc]);
+            System::Get().Log("  SRC %0d: %p\n", iSrc, transferResources[i]->srcMem[iSrc]);
           for (int iDst = 0; iDst < transfers[i].dsts.size(); ++iDst)
-            printf("  DST %0d: %p\n", iDst, transferResources[i]->dstMem[iDst]);
+            System::Get().Log("  DST %0d: %p\n", iDst, transferResources[i]->dstMem[iDst]);
         }
-        printf("Hit <Enter> to continue: ");
+        System::Get().Log("Hit <Enter> to continue: ");
         fflush(stdout);
         if (scanf("%*c") != 0) {
-          printf("[ERROR] Unexpected input\n");
+          System::Get().Log("[ERROR] Unexpected input\n");
           exit(1);
         }
-        printf("\n");
+        System::Get().Log("\n");
       }
       System::Get().Barrier();
     }
@@ -4866,12 +5777,12 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
     // Pause for interactive mode
     if (cfg.general.useInteractive) {
       if (localRank == 0) {
-        printf("Transfers complete. Hit <Enter> to continue: ");
+        System::Get().Log("Transfers complete. Hit <Enter> to continue: ");
         if (scanf("%*c") != 0) {
-          printf("[ERROR] Unexpected input\n");
+          System::Get().Log("[ERROR] Unexpected input\n");
           exit(1);
         }
-        printf("\n");
+        System::Get().Log("\n");
         fflush(stdout);
       }
       System::Get().Barrier();
@@ -4957,15 +5868,6 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
     return true;
   }
 
-  int GetIntAttribute(IntAttribute attribute)
-  {
-    switch (attribute) {
-    case ATR_GFX_MAX_BLOCKSIZE: return MAX_BLOCKSIZE;
-    case ATR_GFX_MAX_UNROLL:    return MAX_UNROLL;
-    default:                    return -1;
-    }
-  }
-
   std::string GetStrAttribute(StrAttribute attribute)
   {
     switch (attribute) {
@@ -5013,14 +5915,14 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
       }
       // At this point, there should be only 1 (valid) rank assigned to this SRC
       if (wc.mem[isDst][iMem].memRanks.size() != 1 || wc.mem[isDst][iMem].memRanks[0] < 0) {
-        printf("[ERROR] Unexpected number of ranks / invalid number of ranks for %s %d\n", isDst ? "DST" : "SRC", iMem);
+        System::Get().Log("[ERROR] Unexpected number of ranks / invalid number of ranks for %s %d\n", isDst ? "DST" : "SRC", iMem);
"DST" : "SRC", iMem); + System::Get().Log("[ERROR] Unexpected number of ranks / invalid number of ranks for %s %d\n", isDst ? "DST" : "SRC", iMem); exit(1); } // Resolve mem index wildcards // Mem devices should have at least one index if (wc.mem[isDst][iMem].memIndices.size() == 0) { - printf("[ERROR] MemIndex for %s %d cannot be empty\n", isDst ? "DST" : "SRC", iMem); + System::Get().Log("[ERROR] MemIndex for %s %d cannot be empty\n", isDst ? "DST" : "SRC", iMem); exit(1); } @@ -5109,13 +6011,13 @@ static bool IsConfiguredGid(union ibv_gid const& gid) wc.exe.exeRanks.swap(exeRanks); return result; } else if (wc.exe.exeRanks[0] == -1) { - printf("[ERROR] Exe rank should not be -1\n"); + System::Get().Log("[ERROR] Exe rank should not be -1\n"); exit(1); } // Resolve EXE indices if (wc.exe.exeIndices.size() == 0) { - printf("[ERROR] Exe index should never be empty\n"); + System::Get().Log("[ERROR] Exe index should never be empty\n"); exit(1); } else if (wc.exe.exeIndices.size() > 1) { // Loop over user provided indices @@ -5179,7 +6081,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) wc.exe.exeSubIndices.clear(); return result; } else if (wc.exe.exeType == EXE_NIC) { - printf("[ERROR] NIC executor requires a subindex be specified\n"); + System::Get().Log("[ERROR] NIC executor requires a subindex be specified\n"); exit(1); } else if (wc.exe.exeType == EXE_NIC_NEAREST) { // Assign NIC closest to DST mem @@ -5213,7 +6115,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) result |= RecursiveWildcardTransferExpansion(wc, baseRankIndex, numBytes, numSubExecs, transfers); wc.exe.exeSubIndices[0] = -2; return result; - case EXE_GPU_GFX: case EXE_GPU_DMA: + case EXE_GPU_GFX: case EXE_GPU_DMA: case EXE_GPU_BDMA: { // Iterate over all available subindices ExeDevice exeDevice = {wc.exe.exeType, wc.exe.exeIndices[0], wc.exe.exeRanks[0], 0}; @@ -5381,14 +6283,38 @@ static bool IsConfiguredGid(union ibv_gid const& gid) System::System() : rank(0), numRanks(1), commMode(COMM_NONE) { + // Collect env vars + // TB_VERBOSE = enables extra logging + // TB_SINGLE_LOG = Only rank 0 will produce output (useful if spawning multi-node socket) + // TB_DUMP_CFG_FILE = Config file to dump executed Transfers + // TB_PAUSE = Insert a pause for debug attachment + verbose = getenv("TB_VERBOSE") ? atoi(getenv("TB_VERBOSE")) : 0; + bool singleLog = getenv("TB_SINGLE_LOG") ? atoi(getenv("TB_SINGLE_LOG")) : 0; + + char* dumpCfgFilename = getenv("TB_DUMP_CFG_FILE"); + if (dumpCfgFilename) { + dumpCfgFile = fopen(dumpCfgFilename, "w"); + } if (getenv("TB_PAUSE")) { - printf("Pausing for debug attachment\n"); + Log("Pausing for debug attachment (e.g. 
+#ifdef AMD_SMI_ENABLED
+    if (verbose) {
+      Log("[INFO] Initializing AMD System Management Interface Library (AMDSMI)\n");
+    }
+    amdsmi_init(AMDSMI_INIT_AMD_APUS);
+#elif defined (NVML_ENABLED)
+    if (verbose) {
+      Log("[INFO] Initializing NVIDIA Management Library (NVML)\n");
+    }
+    nvmlInit_v2();
+#endif
+
     // Priority 1: Socket communicator
     SetupSocketCommunicator();
 
@@ -5397,8 +6323,12 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
       SetupMpiCommunicator();
     }
 
+    // Establish which ranks will output when logging
+    if (rank > 0 && (commMode == COMM_MPI || singleLog))
+      rankDoesOutput = false;
+
     if (verbose && commMode == COMM_NONE) {
-      printf("[INFO] Running in single node mode\n");
+      Log("[INFO] Running in single node mode\n");
     }
 
     // Collect topology and distribute across all ranks
@@ -5428,8 +6358,115 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
       listenSocket = -1;
     }
   }
+
+    if (dumpCfgFile) {
+      fclose(dumpCfgFile);
+    }
+
+#ifdef AMD_SMI_ENABLED
+    amdsmi_shut_down();
+#elif defined(NVML_ENABLED)
+    nvmlShutdown();
+#endif
+  }
+
+  namespace detail {
+
+  inline std::string FormatIpv4(struct in_addr const& addr)
+  {
+    char buf[INET_ADDRSTRLEN];
+    if (inet_ntop(AF_INET, &addr, buf, sizeof(buf)))
+      return std::string(buf);
+    return std::string();
+  }
+
+  inline bool IsUsableIpv4(sockaddr_in const* sin)
+  {
+    if (!sin || sin->sin_family != AF_INET)
+      return false;
+    uint32_t a = ntohl(sin->sin_addr.s_addr);
+    if (a == INADDR_ANY || a == INADDR_NONE)
+      return false;
+    if ((a >> 24) == 127)
+      return false;
+    return true;
+  }
+
+  // IPv4 to advertise when TB_MASTER_ADDR is unset on rank 0 (after listen).
+  inline std::string DetectPrimaryIpv4(char const* preferredIface)
+  {
+    ifaddrs* ifap = nullptr;
+    if (getifaddrs(&ifap) != 0)
+      return std::string();
+
+    auto tryPick = [&](bool allowLinkLocal) -> std::string {
+      for (ifaddrs* ifa = ifap; ifa; ifa = ifa->ifa_next) {
+        if (!ifa->ifa_addr || ifa->ifa_addr->sa_family != AF_INET)
+          continue;
+        if (ifa->ifa_flags & IFF_LOOPBACK)
+          continue;
+        if (!(ifa->ifa_flags & IFF_UP))
+          continue;
+        auto* sin = reinterpret_cast<sockaddr_in const*>(ifa->ifa_addr);
+        if (!IsUsableIpv4(sin))
+          continue;
+        if (preferredIface && preferredIface[0]) {
+          if (!ifa->ifa_name || strcmp(ifa->ifa_name, preferredIface) != 0)
+            continue;
+        } else {
+          uint32_t a = ntohl(sin->sin_addr.s_addr);
+          if (!allowLinkLocal && (a & 0xffff0000) == 0xa9fe0000)
+            continue;
+        }
+        return FormatIpv4(sin->sin_addr);
+      }
+      return std::string();
+    };
+
+    std::string chosen;
+    if (preferredIface && preferredIface[0]) {
+      chosen = tryPick(true);
+      freeifaddrs(ifap);
+      return chosen;
+    }
+
+    chosen = tryPick(false);
+    if (chosen.empty())
+      chosen = tryPick(true);
+    freeifaddrs(ifap);
+    return chosen;
+  }
+
+  inline bool ResolveMasterAddrV4(char const* host, int port, sockaddr_in* out, char const** gaiErr)
+  {
+    *gaiErr = nullptr;
+    if (!host || !host[0] || !out)
+      return false;
+    char portBuf[16];
+    snprintf(portBuf, sizeof(portBuf), "%d", port);
+    addrinfo hints;
+    memset(&hints, 0, sizeof(hints));
+    hints.ai_family   = AF_INET;
+    hints.ai_socktype = SOCK_STREAM;
+    addrinfo* res = nullptr;
+    int gai = getaddrinfo(host, portBuf, &hints, &res);
+    if (gai != 0) {
+      *gaiErr = gai_strerror(gai);
+      return false;
+    }
+    for (addrinfo* p = res; p; p = p->ai_next) {
+      if (p->ai_family == AF_INET && p->ai_addrlen >= sizeof(sockaddr_in)) {
+        memcpy(out, p->ai_addr, sizeof(sockaddr_in));
+        freeaddrinfo(res);
+        return true;
+      }
    }
    freeaddrinfo(res);
    return false;
  }

  } // namespace detail

   void System::SetupSocketCommunicator()
   {
     char* rankStr       = getenv("TB_RANK");
@@ -5437,21 +6474,32 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
     char* masterAddrStr = getenv("TB_MASTER_ADDR");
     char* masterPortStr = getenv("TB_MASTER_PORT");
 
-    // Socket communicator requires rank / numRanks / masterAddr
-    if (!rankStr || !numRanksStr || !masterAddrStr) {
+    if (!numRanksStr) {
       if (verbose) {
-        printf("[INFO] SocketCommunicator skipped due to missing TB_RANK | TB_NUM_RANKS | TB_MASTER_ADDR\n");
+        Log("[INFO] SocketCommunicator skipped (TB_NUM_RANKS not set)\n");
       }
       return;
     }
 
-    rank       = atoi(rankStr);
-    numRanks   = atoi(numRanksStr);
-    masterAddr = masterAddrStr;
+    numRanks = atoi(numRanksStr);
+    if (numRanks < 2) {
+      if (verbose) {
+        Log("[INFO] SocketCommunicator skipped (TB_NUM_RANKS=%d requires at least 2 for socket mode)\n", numRanks);
+      }
+      return;
+    }
+
+    rank       = (rankStr && rankStr[0]) ? atoi(rankStr) : 0;
+    masterAddr = masterAddrStr ? std::string(masterAddrStr) : std::string();
     masterPort = masterPortStr ? atoi(masterPortStr) : 29500;
 
+    if (rank != 0 && masterAddr.empty()) {
+      Log("[ERROR] TB_MASTER_ADDR is required when TB_RANK is greater than 0 (socket communicator)\n");
+      exit(1);
+    }
+
     if (rank < 0 || rank >= numRanks) {
-      printf("[ERROR] Invalid rank index. Must be between 0 and %d (not %d)\n", numRanks - 1, rank);
+      Log("[ERROR] Invalid rank index. Must be between 0 and %d (not %d)\n", numRanks - 1, rank);
       exit(1);
     }
 
@@ -5463,7 +6511,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
       // Create listening socket
       listenSocket = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
       if (listenSocket == -1) {
-        printf("[ERROR] Unable to create listener socket\n");
+        Log("[ERROR] Unable to create listener socket\n");
         exit(1);
       }
 
@@ -5478,17 +6526,36 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
       serverAddr.sin_port = htons(masterPort);
 
       if (bind(listenSocket, (sockaddr*)&serverAddr, sizeof(serverAddr)) == -1) {
-        printf("[ERROR] Failed to bind listen socket\n");
+        Log("[ERROR] Failed to bind listen socket\n");
         exit(1);
       }
 
       if (listen(listenSocket, numRanks) == -1) {
-        printf("[ERROR] Failed to listen on socket\n");
+        Log("[ERROR] Failed to listen on socket\n");
         exit(1);
       }
 
-      // Accept connections from other ranks
-      printf("Waiting for connections from %d other ranks [listening on TB_MASTER_ADDR=%s TB_MASTER_PORT=%d]\n",
-             numRanks-1, masterAddr.c_str(), masterPort);
+      if (masterAddr.empty()) {
+        char const* ifaceEnv = getenv("TB_MASTER_IFACE");
+        masterAddr = detail::DetectPrimaryIpv4(ifaceEnv);
+        if (masterAddr.empty()) {
+          Log("[ERROR] TB_MASTER_ADDR not set and could not detect a primary IPv4 for workers");
+          if (ifaceEnv && ifaceEnv[0])
+            Log(" (check TB_MASTER_IFACE=%s)\n", ifaceEnv);
+          else
+            Log(" (set TB_MASTER_ADDR or TB_MASTER_IFACE)\n");
+          exit(1);
+        }
+        Log("[INFO] TB_MASTER_ADDR not set; using detected IPv4 %s\n", masterAddr.c_str());
+      }
+
+      Log("[INFO] Socket rank 0: on each other host set TB_RANK to a unique value in 1..%d, then for example:\n",
+          numRanks - 1);
+      Log("       TB_NUM_RANKS=%d TB_MASTER_ADDR=%s TB_MASTER_PORT=%d TB_RANK=1\n",
+          numRanks, masterAddr.c_str(), masterPort);
+
+      Log("[INFO] Waiting for connections from %d other rank(s) [TB_MASTER_ADDR=%s TB_MASTER_PORT=%d]\n",
+          numRanks - 1, masterAddr.c_str(), masterPort);
 
       for (int i = 1; i < numRanks; i++) {
         sockaddr_in clientAddr;
@@ -5496,7 +6563,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
accept(listenSocket, (sockaddr*)&clientAddr, &clientAddrLen); if (clientSocket == -1) { - printf("[ERROR] Failed to accept connection from rank %d\n", i); + Log("[ERROR] Failed to accept connection from rank %d\n", i); exit(1); } @@ -5506,11 +6573,11 @@ static bool IsConfiguredGid(union ibv_gid const& gid) if (clientRank < 0 || clientRank >= numRanks) { close(clientSocket); - printf("[ERROR] Invalid rank received: %d\n", clientRank); + Log("[ERROR] Invalid rank received: %d\n", clientRank); exit(1); } if (verbose) { - printf("[INFO] Rank 0 accepted connection from rank %d\n", clientRank); + Log("[INFO] Rank 0 accepted connection from rank %d\n", clientRank); } sockets[clientRank] = clientSocket; } @@ -5518,32 +6585,41 @@ static bool IsConfiguredGid(union ibv_gid const& gid) // All other ranks connect to rank 0 int sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); if (sock == -1) { - printf("[ERROR] Failed to create socket\n"); + Log("[ERROR] Failed to create socket\n"); exit(1); } sockaddr_in serverAddr; memset(&serverAddr, 0, sizeof(serverAddr)); serverAddr.sin_family = AF_INET; - serverAddr.sin_port = htons(masterPort); - if (inet_pton(AF_INET, masterAddr.c_str(), &serverAddr.sin_addr) <= 0) { - printf("[ERROR] Invalid master address: %s\n", masterAddr.c_str()); + char const* gaiErr = nullptr; + if (!detail::ResolveMasterAddrV4(masterAddr.c_str(), masterPort, &serverAddr, &gaiErr)) { + if (gaiErr) + Log("[ERROR] Invalid master address '%s': %s\n", masterAddr.c_str(), gaiErr); + else + Log("[ERROR] Invalid master address: %s\n", masterAddr.c_str()); exit(1); } // Retry connection with backoff if (verbose) - printf("[INFO] Rank %d attempting to connect to %s:%d\n", rank, masterAddrStr, masterPort); + Log("[INFO] Rank %d attempting to connect to %s:%d\n", rank, masterAddr.c_str(), masterPort); int maxRetries = 50; + bool connected = false; for (int retry = 0; retry < maxRetries; retry++) { if (connect(sock, (sockaddr*)&serverAddr, sizeof(serverAddr)) == 0) { + connected = true; break; } if (retry == maxRetries - 1) { - printf("[ERROR] Failed to connect to master after %d retries\n", maxRetries); + Log("[ERROR] Failed to connect to master after %d retries\n", maxRetries); } sleep(1); } + if (!connected) { + close(sock); + exit(1); + } // Send local rank to the server send(sock, (char*)&rank, sizeof(rank), 0); @@ -5568,7 +6644,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) MPI_Comm_size(comm, &numRanks); if (numRanks > 1) { if (verbose) { - printf("[INFO] Enabling MPI communicator (%d ranks found)\n", numRanks); + Log("[INFO] Enabling MPI communicator (%d ranks found)\n", numRanks); } commMode = COMM_MPI; } else if (mpiInit) { @@ -5578,6 +6654,59 @@ static bool IsConfiguredGid(union ibv_gid const& gid) #endif } + void System::Log(const char* format, ...) 
const
   {
     if (rankDoesOutput) {
       va_list args;
       va_start(args, format);
       vprintf(format, args);
       va_end(args);
     }
   }
 
+  void System::LogTransfers(std::vector<Transfer> const& transfers)
+  {
+    if (!dumpCfgFile || !rankDoesOutput) return;
+
+    fprintf(dumpCfgFile, "-%lu ", transfers.size());
+    for (auto const& t : transfers) {
+      fprintf(dumpCfgFile, "(");
+
+      // Print SRCs
+      for (auto const& src : t.srcs) {
+        fprintf(dumpCfgFile, "R%d%c%d", src.memRank, MemTypeStr[src.memType], src.memIndex);
+      }
+      if (t.srcs.empty())
+        fprintf(dumpCfgFile, "N");
+
+      fprintf(dumpCfgFile, "->");
+
+      // Print Executor
+      fprintf(dumpCfgFile, "R%d%c%d", t.exeDevice.exeRank, ExeTypeStr[t.exeDevice.exeType], t.exeDevice.exeIndex);
+      if (t.exeDevice.exeSlot != 0)
+        fprintf(dumpCfgFile, "%c", 'A' + t.exeDevice.exeSlot);
+      if (t.exeSubIndex != -1) {
+        fprintf(dumpCfgFile, ".%d", t.exeSubIndex);
+      }
+      if (t.exeSubSlot != 0) {
+        fprintf(dumpCfgFile, "%c", 'A' + t.exeSubSlot);
+      }
+
+      fprintf(dumpCfgFile, "->");
+
+      // Print DSTs
+      for (auto const& dst : t.dsts) {
+        fprintf(dumpCfgFile, "R%d%c%d", dst.memRank, MemTypeStr[dst.memType], dst.memIndex);
+      }
+      if (t.dsts.empty())
+        fprintf(dumpCfgFile, "N");
+
+      fprintf(dumpCfgFile, " %d %lu)", t.numSubExecs, t.numBytes);
+      fflush(dumpCfgFile);
+    }
+    fprintf(dumpCfgFile, "\n");
+  }
+
   void System::Barrier()
   {
 #ifdef MPI_COMM_ENABLED
@@ -5618,7 +6747,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
 #endif
     if (commMode == COMM_SOCKET) {
       if (rank != 0 && dstRank != 0) {
-        printf("[ERROR] Socket communicator is limited to sending from/to rank 0\n");
+        Log("[ERROR] Socket communicator is limited to sending from/to rank 0\n");
         exit(1);
       }
       auto sock = sockets[dstRank];
@@ -5628,7 +6757,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
       while (totalSent < numBytes) {
        auto sent = send(sock, (char*)sendData + totalSent, numBytes - totalSent, 0);
        if (sent == -1) {
-          printf("[ERROR] Send failed (rank %d to rank %d)\n", rank, dstRank);
+          Log("[ERROR] Send failed (rank %d to rank %d)\n", rank, dstRank);
          exit(1);
        }
        totalSent += sent;
@@ -5647,7 +6776,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
 #endif
     if (commMode == COMM_SOCKET) {
       if (rank != 0 && srcRank != 0) {
-        printf("[ERROR] Socket communicator is limited to receiving from/at rank 0\n");
+        Log("[ERROR] Socket communicator is limited to receiving from/at rank 0\n");
         exit(1);
       }
@@ -5656,7 +6785,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
       while (totalRecv < numBytes) {
        auto recvd = recv(sock, (char*)recvData + totalRecv, numBytes - totalRecv, 0);
        if (recvd == -1 || recvd == 0) {
-          printf("[ERROR] Recv failed (rank %d from rank %d)\n", rank, srcRank);
+          Log("[ERROR] Recv failed (rank %d from rank %d)\n", rank, srcRank);
          perror("recv");
          exit(1);
        }
@@ -5673,7 +6802,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
     if (commMode == COMM_MPI) {
       int err = MPI_Bcast(data, numBytes, MPI_CHAR, root, comm);
       if (err != MPI_SUCCESS) {
-        printf("[ERROR] MPI_Bcast failed with error code %d\n", err);
+        Log("[ERROR] MPI_Bcast failed with error code %d\n", err);
       }
       return;
     }
@@ -5727,6 +6856,103 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
     return "Unknown CPU";
   }
 
+  void System::CollectPodMembership(char* ppodId, int64_t& vpodId)
+  {
+    memset(ppodId, 0, 16);
+    vpodId = -1;
+
+    // TB_FORCE_SINGLE_POD skips the AMDSMI/NVML fabric queries entirely
+    char* forceSinglePod = getenv("TB_FORCE_SINGLE_POD");
+    if (forceSinglePod) {
+      vpodId = 0;
+      return;
+    }
+
+    // Check fabric support
+#if defined(__NVCC__)
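+    // NVIDIA path: pod membership is derived from the MNNVL fabric info reported by NVML
+    // (the clique id and cluster UUID below serve as the vpod/ppod identifiers).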
+#ifdef NVML_ENABLED + if (!MnnvlCheck()) return; + char busId[] = "00000000:00:00.0"; + if (cudaDeviceGetPCIBusId(busId, sizeof(busId), 0)) return; + + nvmlGpuFabricInfoV_t fabricInfo; + fabricInfo.state = NVML_GPU_FABRIC_STATE_NOT_SUPPORTED; + nvmlDevice_t nvmlDev; + nvmlReturn_t err = nvmlDeviceGetHandleByPciBusId_v2(busId, &nvmlDev); + if (err != NVML_SUCCESS) { + if (verbose) { + System::Get().Log("[WARN] Unable to get processor handle for GPU 0 at %s [%s]\n", + busId, nvmlErrorString(err)); + } + return; + } + fabricInfo.version = nvmlGpuFabricInfo_v2; + + err = nvmlDeviceGetGpuFabricInfoV(nvmlDev, &fabricInfo); + if (err != NVML_SUCCESS || fabricInfo.state == NVML_GPU_FABRIC_STATE_NOT_SUPPORTED) { + System::Get().Log("[WARN] MNNVL not supported\n"); + } else { + vpodId = fabricInfo.cliqueId; + memcpy(ppodId, fabricInfo.clusterUuid, 16); + } +#endif +#else +#ifdef AMD_SMI_ENABLED + int numGpus = 0; + if (hipGetDeviceCount(&numGpus) == hipSuccess && numGpus > 0) { + // Query GPU 0 as the representative for pod membership. All GPUs on a node are + // expected to share the same pod (ppod_id/vpod_id), so querying any one is sufficient. + char pciBusId[256] = ""; + hipError_t hipErr = hipDeviceGetPCIBusId(pciBusId, sizeof(pciBusId), 0); + if (hipErr != hipSuccess) { + if (verbose) { + Log("[WARN] Unable to get PCI bus ID for GPU 0; skipping AMD-SMI pod membership query\n"); + } + return; + } + + amdsmi_bdf_t bdf = {}; + unsigned domain, bus, device, func; + if (sscanf(pciBusId, "%x:%x:%x.%x", &domain, &bus, &device, &func) != 4) { + if (verbose) { + Log("[WARN] Unable to parse PCI bus ID '%s'; skipping AMD-SMI pod membership query\n", pciBusId); + } + return; + } + bdf.domain_number = domain; + bdf.bus_number = bus; + bdf.device_number = device; + bdf.function_number = func; + + amdsmi_processor_handle gpuHandle; + amdsmi_status_t err = amdsmi_get_processor_handle_from_bdf(bdf, &gpuHandle); + if (err != AMDSMI_STATUS_SUCCESS) { + if (verbose) { + const char *errString = NULL; + amdsmi_status_code_to_string(err, &errString); + Log("[WARN] Unable to get processor handle for GPU 0 at %s [%s]\n", + pciBusId, errString); + } + } else { + amdsmi_fabric_info_t fabricInfo; + err = amdsmi_get_gpu_fabric_info(gpuHandle, &fabricInfo); + if (err == AMDSMI_STATUS_SUCCESS) { + // NOTE: vpod_id is a uint32_t but System holds it as an int64_t to allow for + // vpodId == -1 to represent no pod present + memcpy(ppodId, &fabricInfo.fabric_info.fabric_version.v1.ppod_id, + sizeof(fabricInfo.fabric_info.fabric_version.v1.ppod_id)); + vpodId = fabricInfo.fabric_info.fabric_version.v1.vpod_id; + } else if (verbose) { + const char *errString = NULL; + amdsmi_status_code_to_string(err, &errString); + Log("[WARN] Unable to get fabric info from AMD SMI [%s]\n", errString); + } + } + } +#endif +#endif + } + void System::GetRankTopology(RankTopology& topo) { // Clear topology structure first @@ -5743,9 +6969,8 @@ static bool IsConfiguredGid(union ibv_gid const& gid) char* firstDotPtr = std::strchr(topo.hostname, '.'); if (firstDotPtr) *firstDotPtr = 0; - // NOTE: Placeholder values - strcpy(topo.ppodId, "N/A"); - topo.vpodId = -1; + // Collect Pod membership + CollectPodMembership(topo.ppodId, topo.vpodId); // CPU Executor int numCpus = numa_num_configured_nodes(); @@ -5764,9 +6989,9 @@ static bool IsConfiguredGid(union ibv_gid const& gid) if (verbose) { for (int exeIndex = 0; exeIndex < numCpus; exeIndex++) { - printf("[INFO] Rank %03d: CPU [%02d/%02d] %03d cores (%s)\n", rank, exeIndex, numCpus, - 
topo.numSubExecutors[{EXE_CPU, exeIndex}],
-               topo.executorName[{EXE_CPU, exeIndex}].c_str());
+        Log("[INFO] Rank %03d: CPU [%02d/%02d] %03d cores (%s)\n", rank, exeIndex, numCpus,
+            topo.numSubExecutors[{EXE_CPU, exeIndex}],
+            topo.executorName[{EXE_CPU, exeIndex}].c_str());
       }
     }
 
@@ -5776,6 +7001,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
     if (status != hipSuccess) numGpus = 0;
     topo.numExecutors[EXE_GPU_GFX] = numGpus;
     topo.numExecutors[EXE_GPU_DMA] = numGpus;
+    topo.numExecutors[EXE_GPU_BDMA] = numGpus;
 
     for (int exeIndex = 0; exeIndex < numGpus; exeIndex++) {
       int numDeviceCUs = 0;
@@ -5794,6 +7020,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
       }
       topo.executorName[{EXE_GPU_GFX, exeIndex}] = gpuName;
       topo.executorName[{EXE_GPU_DMA, exeIndex}] = gpuName;
+      topo.executorName[{EXE_GPU_BDMA, exeIndex}] = gpuName;
 
 #if !defined(__NVCC__)
       hsa_agent_t gpuAgent = gpuAgents[exeIndex];
@@ -5822,8 +7049,10 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
 #endif
       topo.numExecutorSubIndices[{EXE_GPU_GFX, exeIndex}] = numXccs;
       topo.numExecutorSubIndices[{EXE_GPU_DMA, exeIndex}] = numDmaEngines;
+      topo.numExecutorSubIndices[{EXE_GPU_BDMA, exeIndex}] = 0;
       topo.numSubExecutors[{EXE_GPU_GFX, exeIndex}] = numDeviceCUs;
       topo.numSubExecutors[{EXE_GPU_DMA, exeIndex}] = 1;
+      topo.numSubExecutors[{EXE_GPU_BDMA, exeIndex}] = numDmaEngines;
       topo.closestCpuNumaToGpu[exeIndex] = closestNuma;
       topo.closestNicsToGpu[exeIndex] = {};
     }
@@ -5837,7 +7066,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
       topo.executorName[{EXE_NIC, exeIndex}] = GetIbvDeviceList()[exeIndex].name;
       topo.nicIsActive[exeIndex] = GetIbvDeviceList()[exeIndex].hasActivePort;
       if (verbose) {
-        printf("[INFO] Rank %03d: NIC [%02d/%02d] on CPU NUMA %d\n", rank, exeIndex, numNics, topo.closestCpuNumaToNic[exeIndex]);
+        Log("[INFO] Rank %03d: NIC [%02d/%02d] on CPU NUMA %d\n", rank, exeIndex, numNics, topo.closestCpuNumaToNic[exeIndex]);
       }
     }
 #endif
@@ -5883,7 +7112,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
     hipError_t err = hipDeviceGetPCIBusId(hipPciBusId, sizeof(hipPciBusId), gpuIndex);
     if (err != hipSuccess) {
 #ifdef VERBS_DEBUG
-      printf("Failed to get PCI Bus ID for HIP device %d: %s\n", gpuIndex, hipGetErrorString(err));
+      Log("Failed to get PCI Bus ID for HIP device %d: %s\n", gpuIndex, hipGetErrorString(err));
 #endif
       continue;
     }
@@ -5902,7 +7131,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
     // to determine the closest NIC to GPU if the PCIe tree approach fails
     if (closestIdx < 0) {
 #ifdef VERBS_DEBUG
-      printf("[WARN] Falling back to PCIe bus ID distance to determine proximity\n");
+      Log("[WARN] Falling back to PCIe bus ID distance to determine proximity\n");
 #endif
       int minDistance = std::numeric_limits<int>::max();
       for (int nicIndex = 0; nicIndex < numNics; nicIndex++) {
@@ -5972,31 +7201,31 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
     if (verbose) {
       for (int exeIndex = 0; exeIndex < numGpus; exeIndex++) {
-        printf("[INFO] Rank %03d: GPU [%02d/%02d] %d XCCs %03d CUs on CPU NUMA %d Closest NICs:", rank, exeIndex, numGpus,
-               topo.numExecutorSubIndices[{EXE_GPU_GFX, exeIndex}],
-               topo.numSubExecutors[{EXE_GPU_GFX, exeIndex}],
-               topo.closestCpuNumaToGpu[exeIndex]);
+        Log("[INFO] Rank %03d: GPU [%02d/%02d] %d XCCs %03d CUs on CPU NUMA %d Closest NICs:", rank, exeIndex, numGpus,
+            topo.numExecutorSubIndices[{EXE_GPU_GFX, exeIndex}],
+            topo.numSubExecutors[{EXE_GPU_GFX, exeIndex}],
+            topo.closestCpuNumaToGpu[exeIndex]);
         if (topo.closestNicsToGpu[exeIndex].size() == 0) {
printf(" none\n"); + Log(" none"); } else { for (auto nicIndex : topo.closestNicsToGpu[exeIndex]) { - printf(" %d", nicIndex); + Log(" %d", nicIndex); } - printf("\n"); + Log("\n"); } } #ifdef NIC_EXEC_ENABLED for (int nicIndex = 0; nicIndex < numNics; nicIndex++) { - printf("[INFO] Rank %03d: NIC [%02d/%02d] %s Closest GPUs:", rank, nicIndex, numNics, - ibvDeviceList[nicIndex].name.c_str()); + Log("[INFO] Rank %03d: NIC [%02d/%02d] %s Closest GPUs:", rank, nicIndex, numNics, + ibvDeviceList[nicIndex].name.c_str()); if (topo.closestGpusToNic[nicIndex].size() == 0) { - printf(" none"); + Log(" none"); } else { for (auto gpuIndex : topo.closestGpusToNic[nicIndex]) { - printf(" %d", gpuIndex); + Log(" %d", gpuIndex); } } - printf("\n"); + Log("\n"); } #endif } @@ -6099,7 +7328,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) void System::SendRankTopo(int peerRank, RankTopology const& topo) const { SendData(peerRank, sizeof(topo.hostname), topo.hostname); - SendData(peerRank, sizeof(topo.ppodId), &topo.ppodId); + SendData(peerRank, sizeof(topo.ppodId), topo.ppodId); SendData(peerRank, sizeof(topo.vpodId), &topo.vpodId); SendMap(peerRank, topo.numExecutors); SendMap(peerRank, topo.numExecutorSubIndices); @@ -6115,7 +7344,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) void System::RecvRankTopo(int peerRank, RankTopology& topo) const { RecvData(peerRank, sizeof(topo.hostname), topo.hostname); - RecvData(peerRank, sizeof(topo.ppodId), &topo.ppodId); + RecvData(peerRank, sizeof(topo.ppodId), topo.ppodId); RecvData(peerRank, sizeof(topo.vpodId), &topo.vpodId); RecvMap(peerRank, topo.numExecutors); RecvMap(peerRank, topo.numExecutorSubIndices); @@ -6196,7 +7425,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) } else { BROADCAST(setSize); tfrResult.perIterCUs[i].clear(); - if (setSize > 0) { + for (size_t j = 0; j < setSize; j++) { pair p; BROADCAST(p); tfrResult.perIterCUs[i].insert(p); @@ -6243,7 +7472,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) return {ERR_FATAL, "CPU index must be between 0 and %d inclusively", numCpus - 1}; agent = cpuAgents[exeDevice.exeIndex]; break; - case EXE_GPU_GFX: case EXE_GPU_DMA: + case EXE_GPU_GFX: case EXE_GPU_DMA: case EXE_GPU_BDMA: if (exeIndex < 0 || exeIndex >= numGpus) return {ERR_FATAL, "GPU index must be between 0 and %d inclusively", numGpus - 1}; agent = gpuAgents[exeIndex]; @@ -6316,7 +7545,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) rankInfo[0] = localTopo; for (int peerRank = 1; peerRank < numRanks; peerRank++) { if (verbose) { - printf("[INFO] Rank 0 receives topology from Rank %d\n", peerRank); + Log("[INFO] Rank 0 receives topology from Rank %d\n", peerRank); } RecvRankTopo(peerRank, rankInfo[peerRank]); } @@ -6325,7 +7554,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) for (int peerRank = 1; peerRank < numRanks; peerRank++) { for (int i = 0; i < numRanks; i++) { if (verbose) { - printf("[INFO] Rank 0 sends topology %d to Rank %d\n", i, peerRank); + Log("[INFO] Rank 0 sends topology %d to Rank %d\n", i, peerRank); } SendRankTopo(peerRank, rankInfo[i]); } @@ -6333,14 +7562,14 @@ static bool IsConfiguredGid(union ibv_gid const& gid) } else { // Send local topology info back to root if (verbose) { - printf("[INF0] Rank %d sends topology from Rank 0\n", rank); + Log("[INF0] Rank %d sends topology from Rank 0\n", rank); } SendRankTopo(0, localTopo); for (int i = 0; i < numRanks; i++) { RecvRankTopo(0, rankInfo[i]); if (verbose) { - printf("[INF0] Rank %d receives topology %d from Rank 
0\n", rank, i); + Log("[INF0] Rank %d receives topology %d from Rank 0\n", rank, i); } } } @@ -6409,16 +7638,47 @@ static bool IsConfiguredGid(union ibv_gid const& gid) return rankInfo[targetRank].hostname; } - std::string System::GetPpodId(int targetRank) const + int64_t System::GetPodIdx(int targetRank) const { + using PodKey = std::pair, int64_t>; + + static std::map podIdxMap; + static bool initialized = false; + + if (!initialized) { + int64_t nextIdx = 0; + for (int r = 0; r < numRanks; r++) { + PodKey key; + memcpy(key.first.data(), rankInfo[r].ppodId, 16); + key.second = rankInfo[r].vpodId; + + // vpodIdx == -1 means not part of any pod; assign -1 directly + if (key.second == -1) continue; + + if (podIdxMap.find(key) == podIdxMap.end()) { + podIdxMap[key] = nextIdx++; + } + } + initialized = true; + } + if (targetRank < 0 || targetRank >= numRanks) targetRank = rank; - return rankInfo[targetRank].ppodId; + + PodKey key; + memcpy(key.first.data(), rankInfo[targetRank].ppodId, 16); + key.second = rankInfo[targetRank].vpodId; + + if (key.second == -1) return -1; + + return podIdxMap[key]; } - int System::GetVpodId(int targetRank) const + bool System::IsSamePod(int targetRank, int sourceRank) const { - if (targetRank < 0 || targetRank >= numRanks) targetRank = rank; - return rankInfo[targetRank].vpodId; + if (sourceRank < 0 || sourceRank >= numRanks) sourceRank = rank; + if (GetPodIdx(sourceRank) == -1 || GetPodIdx(targetRank) == -1) + return false; + return GetPodIdx(sourceRank) == GetPodIdx(targetRank); } std::string System::GetExecutorName(ExeDevice exeDevice) const @@ -6527,14 +7787,14 @@ static bool IsConfiguredGid(union ibv_gid const& gid) return System::Get().GetHostname(targetRank); } - std::string GetPpodId(int targetRank) + int64_t GetPodIdx(int targetRank) { - return System::Get().GetPpodId(targetRank); + return System::Get().GetPodIdx(targetRank); } - int GetVpodId(int targetRank) + bool IsSamePod(int targetRank, int sourceRank) { - return System::Get().GetVpodId(targetRank); + return System::Get().IsSamePod(targetRank, sourceRank); } std::string GetExecutorName(ExeDevice exeDevice) @@ -6559,18 +7819,28 @@ static bool IsConfiguredGid(union ibv_gid const& gid) #undef hipError_t #undef hipEvent_t #undef hipStream_t +#undef hipMemAllocationProp +#undef hipMemGenericAllocationHandle_t +#undef hipMemAccessDesc +#undef hipMemFabricHandle_t // Enumerations #undef hipDeviceAttributeClockRate -#undef hipDeviceAttributeMaxSharedMemoryPerMultiprocessor #undef hipDeviceAttributeMultiprocessorCount #undef hipDeviceAttributeWarpSize #undef hipErrorPeerAccessAlreadyEnabled #undef hipFuncCachePreferShared #undef hipMemcpyDefault +#undef hipMemcpyKind #undef hipMemcpyDeviceToHost #undef hipMemcpyHostToDevice #undef hipSuccess +#undef hipMemLocationTypeDevice +#undef hipMemAllocationTypePinned +//#undef hipMemAllocationTypeUncached +#undef hipMemHandleTypeFabric +#undef hipMemAllocationGranularityRecommended +#undef hipMemAccessFlagsProtReadWrite // Functions #undef hipDeviceCanAccessPeer @@ -6599,11 +7869,21 @@ static bool IsConfiguredGid(union ibv_gid const& gid) #undef hipStreamCreate #undef hipStreamDestroy #undef hipStreamSynchronize +#undef hipMemGetAllocationGranularity +#undef hipMemCreate +#undef hipMemAddressReserve +#undef hipMemMap +#undef hipMemSetAccess +#undef hipMemUnmap +#undef hipMemRelease +#undef hipMemAddressFree +#undef hipMemExportToShareableHandle +#undef hipMemImportFromShareableHandle #endif // Kernel macros #undef GetHwId -#undef GetXccId +//#undef GetXccId // 
// Undefine helper macros
 #undef ERR_CHECK
diff --git a/toolchain-linux.cmake b/toolchain-linux.cmake
deleted file mode 100644
index 712c5f1c..00000000
--- a/toolchain-linux.cmake
+++ /dev/null
@@ -1,34 +0,0 @@
-
-if (DEFINED ENV{ROCM_PATH})
-  set(ROCM_PATH "$ENV{ROCM_PATH}" CACHE PATH "Path to the ROCm installation.")
-  set(rocm_bin "$ENV{ROCM_PATH}/bin")
-else()
-  set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to the ROCm installation.")
-  set(rocm_bin "/opt/rocm/bin")
-endif()
-
-if (NOT DEFINED ENV{CXX})
-  if(EXISTS "${rocm_bin}/amdclang++")
-    set(CMAKE_CXX_COMPILER "${rocm_bin}/amdclang++" CACHE PATH "Path to the C++ compiler")
-  else()
-    if(EXISTS "${ROCM_PATH}/llvm/bin/amdclang++")
-      set(rocm_bin "${ROCM_PATH}/llvm/bin")
-      set(CMAKE_CXX_COMPILER "${rocm_bin}/amdclang++" CACHE PATH "Path to the C++ compiler")
-    elseif(EXISTS "${ROCM_PATH}/llvm/bin/clang++")
-      set(rocm_bin "${ROCM_PATH}/llvm/bin")
-      set(CMAKE_CXX_COMPILER "${rocm_bin}/clang++" CACHE PATH "Path to the C++ compiler")
-    endif()
-  endif()
-else()
-  set(CMAKE_CXX_COMPILER "$ENV{CXX}" CACHE PATH "Path to the C++ compiler")
-endif()
-
-if (NOT DEFINED ENV{CXXFLAGS})
-  set(CMAKE_CXX_FLAGS_DEBUG "-g -O1")
-  set(CMAKE_CXX_FLAGS_RELEASE "-O3")
-endif()
-
-if(NOT CMAKE_BUILD_TYPE)
-  message(STATUS "Setting build type to 'Release' as none was specified.")
-  set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose the type of build." FORCE)
-endif()

From 51d8ebc680bb7feecd67cdaf5ce10b3c24a36a87 Mon Sep 17 00:00:00 2001
From: Gilbert Lee
Date: Sun, 3 May 2026 00:30:07 -0500
Subject: [PATCH 2/2] Fixing rings preset to be parallel rings / updating
 formatting

---
 src/client/Presets/Rings.hpp | 315 +++++++++++++++--------------------
 1 file changed, 136 insertions(+), 179 deletions(-)

diff --git a/src/client/Presets/Rings.hpp b/src/client/Presets/Rings.hpp
index bee03055..b9ffb0ff 100644
--- a/src/client/Presets/Rings.hpp
+++ b/src/client/Presets/Rings.hpp
@@ -20,6 +20,8 @@
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
 */
+#include <numeric>
+
 int RingsPreset(EnvVars& ev,
                 size_t const numBytesPerTransfer,
                 std::string const presetName,
@@ -30,14 +32,14 @@ int RingsPreset(EnvVars& ev,
     Utils::Print("[ERROR] rings preset can only be run across ranks that are homogeneous\n");
     Utils::Print("[ERROR] Run ./TransferBench without any args to display topology information\n");
     Utils::Print("[ERROR] TB_NIC_FILTER may also be used to limit NIC visibility\n");
-    return 1;
+    return ERR_FATAL;
   }
 
   // Check for pod support (if multi-node)
   int numRanks = TransferBench::GetNumRanks();
-  if (numRanks > 1 && Utils::GetRankPerPodMap().empty()) {
-    Utils::Print("[ERROR] No pods detected. Set TB_FORCE_SINGLE_POD=1 to treat all ranks as a single pod.\n");
-    return 1;
+  if (numRanks > 1 && Utils::GetRankPerPodMap().size() != 1) {
+    Utils::Print("[ERROR] Multi-rank runs must be within a single pod. Set TB_FORCE_SINGLE_POD=1 to treat all ranks as a single pod.\n");
+    return ERR_FATAL;
   }
 
   int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
@@ -50,36 +52,24 @@ int RingsPreset(EnvVars& ev,
   int useDmaExec    = EnvVars::GetEnvVar("USE_DMA_EXEC"   , 0);
   int useRemoteRead = EnvVars::GetEnvVar("USE_REMOTE_READ", 0);
   int stride        = EnvVars::GetEnvVar("STRIDE"         , 1);
-  int groupSize     = EnvVars::GetEnvVar("GROUP_SIZE"     , numRanks * numGpus);
+  int ringSize      = EnvVars::GetEnvVar("RING_SIZE"      , numRanks * numGpus);
+
   if (numGpus <= 0 || numGpus > numDetectedGpus) {
    Utils::Print("[ERROR] Cannot use %d GPUs. Detected %d GPUs\n", numGpus, numDetectedGpus);
-    return 1;
+    return ERR_FATAL;
  }
-  if (groupSize <= 0) {
-    Utils::Print("[ERROR] Group size must be greater than 0\n");
-    return 1;
+  if (ringSize <= 0) {
+    Utils::Print("[ERROR] Ring size must be greater than 0\n");
+    return ERR_FATAL;
  }
-  if (numRanks * numGpus % groupSize) {
-    Utils::Print("[ERROR] Group size %d cannot evenly divide %d total devices from %d ranks.\n",
-                 groupSize, numRanks * numGpus, numRanks);
-    return 1;
+  int totalGpus = numRanks * numGpus;
+  if (totalGpus % ringSize) {
+    Utils::Print("[ERROR] Ring size %d must evenly divide the total number of GPUs %d\n", ringSize, totalGpus);
+    return ERR_FATAL;
  }
 
-  int numNics = TransferBench::GetNumExecutors(EXE_NIC, 0);
-  bool nicDifference = false;
-  for (int rank = 0; rank < numRanks; rank++) {
-    if (numGpus > TransferBench::GetNumExecutors(EXE_GPU_GFX, rank)) {
-      Utils::Print("[ERROR] rings preset requires each rank to have the same number of GPUs\n");
-      return 1;
-    }
-    if (numQueuePairs > 0 && numNics != TransferBench::GetNumExecutors(EXE_NIC, rank))
-      nicDifference = true;
-  }
-  if (nicDifference)
-    Utils::Print("[WARN] Not all ranks have the same number of NICs\n");
-
-  MemType     memType       = Utils::GetGpuMemType(memTypeIdx);
+  MemType     memType       = Utils::GetGpuMemType(memTypeIdx);
   std::string devMemTypeStr = Utils::GetGpuMemTypeStr(memTypeIdx);
 
   if (Utils::RankDoesOutput()) {
@@ -93,12 +83,12 @@ int RingsPreset(EnvVars& ev,
     ev.Print("USE_DMA_EXEC"   , useDmaExec   , "Using %s executor", useDmaExec ? "DMA" : "GFX");
     ev.Print("USE_REMOTE_READ", useRemoteRead, "Using %s as executor", useRemoteRead ? "DST" : "SRC");
     ev.Print("STRIDE"         , stride       , "Reordering devices by taking %d steps", stride);
-    ev.Print("GROUP_SIZE"     , groupSize    , "Dividing all devices into ring groups of %d", groupSize);
+    ev.Print("RING_SIZE"      , ringSize     , "Building rings of size %d", ringSize);
     printf("\n");
    }
  }
 
-  Utils::Print("GPU-%s IntraPod Ring benchmark:\n", useDmaExec ? "DMA" : "GFX");
+  Utils::Print("GPU-%s Rings benchmark:\n", useDmaExec ? "DMA" : "GFX");
   Utils::Print("==============================\n");
   Utils::Print("[%lu bytes per Transfer] [%s:%d] [MemType:%s] [NIC QueuePairs:%d] [#Ranks:%d]\n",
                numBytesPerTransfer, useDmaExec ? "DMA" : "GFX", numSubExecs,
                devMemTypeStr.c_str(), numQueuePairs, numRanks);
@@ -107,174 +97,141 @@ int RingsPreset(EnvVars& ev,
   TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
   ExeType exeType = useDmaExec ? EXE_GPU_DMA : EXE_GPU_GFX;
 
-  int n         = numRanks * numGpus;
-  int numGroups = n / groupSize;
+  int numRings = totalGpus / ringSize;
+  Utils::Print("Running %d parallel ring(s) each with %d devices. All numbers in GB/s:\n", numRings, ringSize);
 
-  std::vector<int> indices(n);
-  for (int k = 0; k < n; k++) indices[k] = k;
+  // Determine ordering of GPUs for the rings based on stride
+  std::vector<int> indices(totalGpus);
+  std::iota(indices.begin(), indices.end(), 0);
   Utils::StrideGenerate(indices, stride);
 
-  std::vector<MemDevice> devices(n);
-  for (int i = 0; i < n; i++) {
-    int const globalIdx = indices[i];
-    int const rank      = globalIdx / numGpus;
-    int const devIdx    = globalIdx % numGpus;
-    devices[i] = {memType, devIdx, rank};
+  // Establish memory devices for all GPUs
+  std::vector<MemDevice> memDevices(totalGpus);
+  for (int i = 0; i < totalGpus; i++) {
+    memDevices[i] = {memType, indices[i] % numGpus, indices[i] / numGpus};
   }
 
-  Utils::Print("%d ring(s) of %d devices:\n", numGroups, groupSize);
-  for (int group = 0; group < numGroups; group++) {
-    int const groupBase = group * groupSize;
-    Utils::Print("  Ring %d: ", group);
-    for (int i = 0; i < groupSize; i++) {
-      Utils::Print("R%d:G%d -> ", devices[groupBase + i].memRank, devices[groupBase + i].memIndex);
-    }
-    Utils::Print("R%d:G%d\n", devices[groupBase].memRank, devices[groupBase].memIndex);
-  }
-  Utils::Print("\n");
-
-  for (int group = 0; group < numGroups; group++) {
-    int const groupBase = group * groupSize;
-    std::vector<TransferBench::Transfer> transfers;
-
-    for (int i = 0; i < groupSize; i++) {
-      int srcIdx = groupBase + i;
-      int dstIdx = groupBase + (i + 1) % groupSize;
-
-      TransferBench::Transfer transfer;
-      transfer.numBytes = numBytesPerTransfer;
-      transfer.srcs.push_back(devices[srcIdx]);
-      transfer.dsts.push_back(devices[dstIdx]);
-      transfer.exeDevice = {exeType,
-                            (int32_t)(useRemoteRead ? devices[dstIdx].memIndex : devices[srcIdx].memIndex),
-                            (int32_t)(useRemoteRead ? devices[dstIdx].memRank  : devices[srcIdx].memRank)};
-      transfer.exeSubIndex = -1;
-      transfer.numSubExecs = numSubExecs;
-      transfers.push_back(transfer);
-
+  // Build list of Transfers
+  std::vector<Transfer> transfers;
+  for (int ringIdx = 0; ringIdx < numRings; ringIdx++) {
+    int const ringBase = ringIdx * ringSize;
+
+    // Build GFX or DMA transfers for this ring
+    for (int i = 0; i < ringSize; i++) {
+      Transfer t;
+      int srcIdx = ringBase + i;
+      int dstIdx = ringBase + (i + 1) % ringSize;
+      int exeIdx = useRemoteRead ? dstIdx : srcIdx;
+      t.numBytes    = numBytesPerTransfer;
+      t.srcs        = {memDevices[srcIdx]};
+      t.dsts        = {memDevices[dstIdx]};
+      t.exeDevice   = {exeType, memDevices[exeIdx].memIndex, memDevices[exeIdx].memRank};
+      t.numSubExecs = numSubExecs;
+      transfers.push_back(t);
+
+      // Build NIC transfers between these GPUs as well if requested
       if (numQueuePairs > 0) {
-        TransferBench::Transfer nicTransfer;
-        nicTransfer.numBytes = numBytesPerTransfer;
-        nicTransfer.srcs.push_back(devices[srcIdx]);
-        nicTransfer.dsts.push_back(devices[dstIdx]);
-        nicTransfer.exeDevice = {TransferBench::EXE_NIC_NEAREST,
-                                 (int32_t)devices[srcIdx].memIndex, (int32_t)devices[srcIdx].memRank};
-        nicTransfer.exeSubIndex = devices[dstIdx].memIndex;
+        Transfer nicTransfer    = t;
+        nicTransfer.exeDevice   = {EXE_NIC_NEAREST, memDevices[exeIdx].memIndex, memDevices[exeIdx].memRank};
+        nicTransfer.exeSubIndex = memDevices[useRemoteRead ? srcIdx : dstIdx].memIndex;
        nicTransfer.numSubExecs = numQueuePairs;
        transfers.push_back(nicTransfer);
      }
    }
+  }
 
-    TransferBench::TestResults results;
-    if (!TransferBench::RunTransfers(cfg, transfers, results)) {
-      for (auto const& err : results.errResults)
-        Utils::Print("%s\n", err.errMsg.c_str());
-      return 1;
-    }
-    if (showDetails) {
-      Utils::PrintResults(ev, 1, transfers, results);
-      Utils::Print("\n");
-    }
-
-    if (Utils::RankDoesOutput()) {
-      Utils::Print("\n--- Ring Group %d ---\n", group);
-
-      int const numHops   = groupSize;
-      int const numRows   = 2 + numHops + 3;
-      int const numCols   = 6;
-      int const precision = 2;
-      Utils::TableHelper table(numRows, numCols, precision);
-
-      table.DrawRowBorder(0);
-      table.DrawColBorder(0);
-      table.DrawColBorder(numCols);
-      table.DrawRowBorder(numRows);
-
-      table.Set(0, 0, " Src ");
-      table.Set(0, 1, " Src ");
-      table.Set(0, 2, " Dst ");
-      table.Set(0, 3, " Dst ");
-      table.Set(0, 4, " GFX BW ");
-      table.Set(1, 0, " Rank ");
-      table.Set(1, 1, " GPU ");
-      table.Set(1, 2, " Rank ");
-      table.Set(1, 3, " GPU ");
-      table.Set(1, 4, " (GB/s) ");
-      table.DrawColBorder(2);
-      table.DrawColBorder(4);
-
-      if (numQueuePairs > 0) {
-        table.Set(0, 5, " NIC BW ");
-        table.Set(1, 5, " (GB/s) ");
-      } else {
-        table.Set(0, 5, "        ");
-        table.Set(1, 5, "        ");
-      }
-
-      table.DrawRowBorder(2);
-
-      double gfxMin = std::numeric_limits<double>::max();
-      double gfxAvg = 0.0;
-      double gfxMax = std::numeric_limits<double>::lowest();
-      double nicMin = std::numeric_limits<double>::max();
-      double nicAvg = 0.0;
-      double nicMax = std::numeric_limits<double>::lowest();
-
-      int tfrIdx = 0;
-      for (int i = 0; i < numHops; i++) {
-        int srcIdx = groupBase + i;
-        int dstIdx = groupBase + (i + 1) % groupSize;
-        int row    = 2 + i;
-
-        double gfxBw = results.tfrResults[tfrIdx].avgBandwidthGbPerSec;
-        tfrIdx++;
-
-        table.Set(row, 0, " %d ", devices[srcIdx].memRank);
-        table.Set(row, 1, " %d ", devices[srcIdx].memIndex);
-        table.Set(row, 2, " %d ", devices[dstIdx].memRank);
-        table.Set(row, 3, " %d ", devices[dstIdx].memIndex);
-        table.Set(row, 4, " %.2f ", gfxBw);
-
-        gfxMin = std::min(gfxMin, gfxBw);
-        gfxAvg += gfxBw;
-        gfxMax = std::max(gfxMax, gfxBw);
-
-        if (numQueuePairs > 0) {
-          double nicBw = results.tfrResults[tfrIdx].avgBandwidthGbPerSec;
-          tfrIdx++;
-          table.Set(row, 5, " %.2f ", nicBw);
-          nicMin = std::min(nicMin, nicBw);
-          nicAvg += nicBw;
-          nicMax = std::max(nicMax, nicBw);
-        }
-      }
-
-      int summaryBase = 2 + numHops;
-      table.DrawRowBorder(summaryBase);
-      table.Set(summaryBase    , 1, " MAX ");
-      table.Set(summaryBase + 1, 1, " AVG ");
-      table.Set(summaryBase + 2, 1, " MIN ");
-      table.Set(summaryBase    , 4, " %.2f ", gfxMax);
-      table.Set(summaryBase + 1, 4, " %.2f ", gfxAvg / numHops);
-      table.Set(summaryBase + 2, 4, " %.2f ", gfxMin);
-      if (numQueuePairs > 0) {
-        table.Set(summaryBase    , 5, " %.2f ", nicMax);
-        table.Set(summaryBase + 1, 5, " %.2f ", nicAvg / numHops);
-        table.Set(summaryBase + 2, 5, " %.2f ", nicMin);
-      }
 
+  TransferBench::TestResults results;
+  if (!TransferBench::RunTransfers(cfg, transfers, results)) {
+    for (auto const& err : results.errResults)
+      Utils::Print("%s\n", err.errMsg.c_str());
+    return ERR_FATAL;
+  }
+  if (showDetails) {
+    Utils::PrintResults(ev, 1, transfers, results);
+    Utils::Print("\n");
+  }
 
+  if (Utils::RankDoesOutput()) {
 
+    // Limit the number of columns of output
+    int maxColumns   = 24;
+    int colsPerRing  = (numQueuePairs ? 3 : 2);
+    int ringsPerPage = maxColumns / colsPerRing;
+    int numPages     = (numRings + ringsPerPage - 1) / ringsPerPage;
 
+    // Compute table size
+    int numRows = numPages * (2 + ringSize + 4);
+    int numCols = std::min(numRings, ringsPerPage) * colsPerRing;
+    Utils::TableHelper table(numRows, numCols);
 
+    std::vector<std::vector<double>> ringMin(numQueuePairs ? 2 : 1, std::vector<double>(numRings, std::numeric_limits<double>::max()));
+    std::vector<std::vector<double>> ringSum(numQueuePairs ? 2 : 1, std::vector<double>(numRings, 0.0));
+    std::vector<std::vector<double>> ringMax(numQueuePairs ? 2 : 1, std::vector<double>(numRings, 0.0));
 
+    for (int pageIdx = 0; pageIdx < numPages; pageIdx++) {
+      int headerRow = pageIdx * (2 + ringSize + 4);
 
+      table.DrawRowBorder(headerRow);
+      table.DrawRowBorder(headerRow+2);
+      for (int r = 0; r < ringsPerPage; r++) {
+        int ringIdx = pageIdx * ringsPerPage + r;
+        if (ringIdx >= numRings) break;
+        int currCol = colsPerRing * r;
 
+        // Set header for ring
+        table.DrawColBorder(currCol);
+        table.DrawColBorder(currCol + colsPerRing);
+        for (int i = 0; i < colsPerRing; i++)
+          table.Set(headerRow, currCol+i, "Ring%02d", ringIdx);
+        table.Set(headerRow+1, currCol,   "Device");
+        table.Set(headerRow+1, currCol+1, "%s BW", useDmaExec ? "DMA" : "GFX");
+        if (numQueuePairs) {
+          table.Set(headerRow+1, currCol+2, "NIC BW");
        }
 
+        // Fill results for ring
+        int baseRow = headerRow + 2;
+        table.DrawRowBorder(baseRow);
+        for (int i = 0; i < ringSize; i++) {
+          int tfrIdx = (ringIdx * ringSize + i) * (colsPerRing - 1);
+          Transfer const& t = transfers[tfrIdx];
+          if (numRanks > 1) {
+            table.Set(baseRow + i, currCol, "R%02d:%d", t.srcs[0].memRank, t.srcs[0].memIndex);
+          } else {
+            table.Set(baseRow + i, currCol, "%d", t.srcs[0].memIndex);
+          }
 
+          for (int j = 0; j < colsPerRing - 1; j++) {
+            double bw = results.tfrResults[tfrIdx + j].avgBandwidthGbPerSec;
+            table.Set(baseRow + i, currCol+1+j, "%7.2f", bw);
+            ringMin[j][ringIdx] = std::min(ringMin[j][ringIdx], bw);
+            ringSum[j][ringIdx] += bw;
+            ringMax[j][ringIdx] = std::max(ringMax[j][ringIdx], bw);
+          }
+        }
+        int statRow = baseRow + ringSize;
+        table.DrawRowBorder(statRow);
+        table.Set(statRow  , currCol, "MIN");
+        table.Set(statRow+1, currCol, "AVG");
+        table.Set(statRow+2, currCol, "MAX");
+        table.Set(statRow+3, currCol, "SUM");
 
+        for (int j = 0; j < colsPerRing - 1; j++) {
+          table.Set(statRow  , currCol+1+j, "%7.2f", ringMin[j][ringIdx]);
+          table.Set(statRow+1, currCol+1+j, "%7.2f", ringSum[j][ringIdx] / ringSize);
+          table.Set(statRow+2, currCol+1+j, "%7.2f", ringMax[j][ringIdx]);
+          table.Set(statRow+3, currCol+1+j, "%7.2f", ringSum[j][ringIdx]);
+        }
+        table.DrawRowBorder(statRow+3);
+      }
+    }
-      table.PrintTable(ev.outputToCsv, ev.showBorders);
-
-      Utils::Print("Aggregate bandwidth (CPU Timed): %8.3f GB/s\n", results.avgTotalBandwidthGbPerSec);
-    }
-  }
-
-  if (!Utils::RankDoesOutput()) return 0;
+    table.PrintTable(ev.outputToCsv, ev.showBorders);
+    Utils::Print("Aggregate bandwidth (CPU Timed): %8.3f GB/s\n", results.avgTotalBandwidthGbPerSec);
 
-  if (Utils::HasDuplicateHostname()) {
-    printf("[WARN] It is recommended to run TransferBench with one rank per host to avoid potential aliasing of executors\n");
+    if (Utils::HasDuplicateHostname())
+      Utils::Print("[WARN] It is recommended to run TransferBench with one rank per host to avoid potential aliasing of executors\n");
   }
-  return 0;
+  return ERR_NONE;
 }
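+
+// Hypothetical usage sketch (illustrative values only, not part of this patch): with the
+// socket communicator, a two-node run that builds rings across all GPUs might look like:
+//   rank 0:  TB_NUM_RANKS=2 TB_RANK=0 RING_SIZE=16 ./TransferBench rings 64M
+//   rank 1:  TB_NUM_RANKS=2 TB_RANK=1 TB_MASTER_ADDR=<rank0-ip> ./TransferBench rings 64M
+// RING_SIZE must evenly divide the total GPU count; STRIDE reorders devices before the
+// rings are assembled, and USE_DMA_EXEC=1 switches the executor from GFX to DMA.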