From 29efe1249b5ca578b470c6c3c6535a39ab24f2f9 Mon Sep 17 00:00:00 2001
From: Gilbert Lee
Date: Sat, 2 May 2026 00:20:27 -0500
Subject: [PATCH 1/2] TransferBench v1.67.0

- Initial pod communication support (#235)
- cuda + MNNVL update & pod presets (#241)
- Increase CQ size for high qps (#244)
- Fix hang when NVML is present but fabricmanager isn't (#246)
- Adding nica2a preset (#248)
- Adding HBM read bandwidth preset (#250)
- Pod Ring preset (#251)
- gfxsweep preset (#254) (#256)
- Adding Batched DMA support (hipMemcpyBatchAsync), and bmasweep preset (#255)
- Adding a wallclock consistency detection preset (#258)
- Adding smoketest preset for simple correctness tests (#266)
- Help / envvars / presets presets (#267)
- Modernize CMake build (#268)
- Replace version-based pod/amd-smi detection with compile-time API probes (#269)
- Fix collective mismatch hangs in multi-rank error paths (#270)
- Fix SHOW_ITERATIONS table truncation with multiple transfers per executor (#271)
- Reformat a2asweep output to match gfxsweep style (#272)
- Gfx sweep update (#274)
- Increasing flush frequency in smoketest (#275)
- Adding new experimental copy-only GFX kernel, gfxsweep update (#277)
- Fixes for cuMem compilation and invalid device ordinal (#278)
- Simplifying socket connect, allow for using host address (#279)
- Updating podring to run on single node without need to force single pod (#280)
- Adding SHOW_PERCENTILES to show extra per-iteration statistics (#281)

---------

Co-authored-by: Tim <43156029+AtlantaPepsi@users.noreply.github.com>
Co-authored-by: Pak Nin Lui
Co-authored-by: pierreantoineH
Co-authored-by: Nilesh M Negi
Co-authored-by: Claude
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 CHANGELOG.md                         |   38 +
 CMakeLists.txt                       |  444 ++++--
 Makefile                             |  212 ++-
 examples/example.cfg                 |    6 +-
 src/client/Client.cpp                |   25 +-
 src/client/EnvVars.hpp               |  199 ++-
 src/client/Presets/AllToAll.hpp      |   34 +-
 src/client/Presets/AllToAllN.hpp     |   21 +-
 src/client/Presets/AllToAllSweep.hpp |  160 +-
 src/client/Presets/BmaSweep.hpp      |  182 +++
 src/client/Presets/EnvVarsList.hpp   |   31 +
 src/client/Presets/GfxSweep.hpp      |  239 +++
 src/client/Presets/HbmBandwidth.hpp  |  619 ++++++++
 src/client/Presets/HealthCheck.hpp   |   13 +-
 src/client/Presets/Help.hpp          |  123 ++
 src/client/Presets/NicAllToAll.hpp   |  374 +++++
 src/client/Presets/NicPeerToPeer.hpp |  121 +-
 src/client/Presets/NicRings.hpp      |   19 +-
 src/client/Presets/OneToAll.hpp      |   17 +-
 src/client/Presets/PeerToPeer.hpp    |   20 +-
 src/client/Presets/PodAllToAll.hpp   |  270 ++++
 src/client/Presets/PodPeerToPeer.hpp |  300 ++++
 src/client/Presets/Presets.hpp       |   58 +-
 src/client/Presets/Rings.hpp         |  280 ++++
 src/client/Presets/Scaling.hpp       |   40 +-
 src/client/Presets/Schmoo.hpp        |   63 +-
 src/client/Presets/SmokeTest.hpp     |  336 +++++
 src/client/Presets/Sweep.hpp         |   25 +-
 src/client/Presets/WallClock.hpp     |  234 +++
 src/client/Topology.hpp              |   57 +-
 src/client/Utilities.hpp             |  300 +++-
 src/header/TransferBench.hpp         | 2080 +++++++++++++++++++++-----
 toolchain-linux.cmake                |   34 -
 33 files changed, 5972 insertions(+), 1002 deletions(-)
 create mode 100644 src/client/Presets/BmaSweep.hpp
 create mode 100644 src/client/Presets/EnvVarsList.hpp
 create mode 100644 src/client/Presets/GfxSweep.hpp
 create mode 100644 src/client/Presets/HbmBandwidth.hpp
 create mode 100644 src/client/Presets/Help.hpp
 create mode 100644 src/client/Presets/NicAllToAll.hpp
 create mode 100644 src/client/Presets/PodAllToAll.hpp
 create mode 100644 src/client/Presets/PodPeerToPeer.hpp
 create mode 100644 src/client/Presets/Rings.hpp
 create mode 100644 src/client/Presets/SmokeTest.hpp
 create mode 100644 src/client/Presets/WallClock.hpp
 delete mode 100644 toolchain-linux.cmake

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 95991318..443f667d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,44 @@
 Documentation for TransferBench is available at [https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench).
 
+## v1.67.00
+### Added
+- Initial support for pod communication. Requires compatible hardware / ROCm version and is subject to further testing
+  - This potentially enables GFX/DMA executors to access SRC/DST memory locations on GPUs within the same pod
+  - Pod membership detection requires amd-smi; it can be skipped by setting TB_FORCE_SINGLE_POD=1
+- Support for dumping executed Transfers to a config file specified by TB_DUMP_CFG_FILE
+  - This will write Transfers that are executed (for example via a preset) to a config file that can then be re-executed
+- Reporting the number of iterations run when running in timed mode (NUM_ITERATIONS < 0)
+- Adding NIC_CQ_POLL_BATCH to control CQ poll batch size for NIC transfers
+- New "hbm" preset which sweeps and tests local HBM read performance
+- Added a new TB_WALLCLOCK_RATE env var that overrides the GPU GFX wallclock rate if the queried rate returns 0 (debug)
+- Adding new batched-DMA executor "B", which utilizes the hipMemcpyBatchAsync API introduced in HIP 7.1 / CUDA 12.8
+- Added new "bmasweep" preset that compares DMA to batched-DMA execution for parallel transfers to other GPUs
+- Added new "wallclock" preset that compares wallclock counters across XCCs within a GPU
+- Added new "smoketest" preset that runs a variety of DMA/GFX tests for simple correctness tests
+- Added new "help" preset to show config file examples
+- Added new "presets" preset to show available presets and their descriptions
+- Added new "rings" preset that runs parallel rings of transfers (pod-capable)
+- Added new "envvars" preset to show environment variables that can change TransferBench behavior
+- Adding information on how to run TransferBench multi-rank, displayed when run with no args
+- Added new "nica2a" preset (NIC all-to-all over GPUs via NIC executors, multi-node)
+- Added new GFX_KERNEL to allow experimenting with copy-only GFX kernel. Currently this is opt-in only
+- Added `SHOW_PERCENTILES` (e.g. `50,75,90,95,99`) to show empirical percentiles of per-iteration duration
+
+### Modified
+- DMA-BUF support enablement in CMake changed to ENABLE_DMA_BUF to be more similar to other compile-time options
+- Adding extra information to CMake and make build methods to indicate enabled / disabled features
+- a2asweep preset changes from USE_FINE_GRAIN to MEM_TYPE to reflect various memory types
+- a2asweep preset changes from NUM_CUS to NUM_SUB_EXECS to match the a2a preset naming convention
+- scaling preset changes from using USE_FINE_GRAIN to CPU_MEM_TYPE and GPU_MEM_TYPE
+- NIC_FILTER renamed to TB_NIC_FILTER for consistency
+- DUMP_LINES renamed to TB_DUMP_LINES for consistency
+- Dynamically size CQs for NIC transfers in the high-QP case
+- Switch to using hipMemcpyDeviceToDeviceNoCU instead of hipMemcpyDefault for DMA Executor if available (requires HIP >= 6.0)
+- Allow for multiple destination memory locations for DMA/Batched-DMA Transfers
+- Removed env var and preset printing when running TransferBench with no args
+- Simplified socket comm usage - the first rank only needs to set TB_NUM_RANKS=X to see connection info
+
 ## v1.66.02
 ### Added
 - Adding DMA-BUF support

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2b6591d3..c73e33d5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,35 +1,137 @@
 # Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
-cmake_minimum_required(VERSION 3.5 FATAL_ERROR)
+# MPI::MPI_CXX and hip:: config targets require >= 3.9; 3.16 for modern policy defaults.
+cmake_minimum_required(VERSION 3.16 FATAL_ERROR)
 
-# CMake Toolchain file to define compilers and path to ROCm
+# Pre-project: ROCM_PATH detection and compiler/flag selection.
+# Must be before project() so CMake uses the right compiler on first configure.
+# Priority: -DROCM_PATH / $ROCM_PATH env > PATH (amdclang++) > /opt/rocm
 #==================================================================================================
-if (NOT CMAKE_TOOLCHAIN_FILE)
-  set(CMAKE_TOOLCHAIN_FILE "${CMAKE_CURRENT_SOURCE_DIR}/toolchain-linux.cmake")
-  message(STATUS "CMAKE_TOOLCHAIN_FILE: ${CMAKE_TOOLCHAIN_FILE}")
+
+# ROCM_PATH: Check CMake cache or environment.
+if(NOT ROCM_PATH)
+  if(DEFINED ENV{ROCM_PATH} AND NOT "$ENV{ROCM_PATH}" STREQUAL "")
+    set(ROCM_PATH "$ENV{ROCM_PATH}" CACHE PATH "Path to ROCm installation.")
+  endif()
+endif()
+
+# ROCM_PATH: Derive from PATH; walk up from amdclang++/clang++ to the ROCm root.
+# Handles both ${ROCM_PATH}/bin/ and ${ROCM_PATH}/llvm/bin/ layouts.
+if(NOT ROCM_PATH)
+  find_program(_rocm_bin_hint NAMES amdclang++ clang++)
+  if(_rocm_bin_hint)
+    get_filename_component(_bin_dir "${_rocm_bin_hint}" DIRECTORY)
+    get_filename_component(_parent "${_bin_dir}" DIRECTORY)
+    if(EXISTS "${_parent}/lib/libamdhip64.so" OR EXISTS "${_parent}/lib64/libamdhip64.so")
+      set(ROCM_PATH "${_parent}" CACHE PATH "Path to ROCm installation (auto-detected from PATH).")
+      message(STATUS "ROCM_PATH auto-detected from PATH: ${ROCM_PATH}")
+    else()
+      get_filename_component(_grandparent "${_parent}" DIRECTORY)
+      if(EXISTS "${_grandparent}/lib/libamdhip64.so" OR EXISTS "${_grandparent}/lib64/libamdhip64.so")
+        set(ROCM_PATH "${_grandparent}" CACHE PATH "Path to ROCm installation (auto-detected from PATH).")
+        message(STATUS "ROCM_PATH auto-detected from PATH: ${ROCM_PATH}")
+      endif()
+    endif()
+  endif()
+  unset(_rocm_bin_hint CACHE)
+  unset(_rocm_bin_hint)
+  unset(_bin_dir)
+  unset(_parent)
+  unset(_grandparent)
+endif()
+
+# ROCM_PATH: Fallback.
+if(NOT ROCM_PATH) + set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to ROCm installation.") + message(WARNING "ROCM_PATH not found; falling back to ${ROCM_PATH}") endif() -set(VERSION_STRING "1.66.02") +if(NOT EXISTS "${ROCM_PATH}") + message(FATAL_ERROR "ROCM_PATH=${ROCM_PATH} does not exist") +endif() + +message(STATUS "ROCM_PATH: ${ROCM_PATH}") + +# Compiler detection: amdclang++ > llvm/amdclang++ > llvm/clang++ +# Respects -DCMAKE_CXX_COMPILER and $CXX / $CC env vars. +if(NOT CMAKE_CXX_COMPILER) + if(DEFINED ENV{CXX} AND NOT "$ENV{CXX}" STREQUAL "") + set(CMAKE_CXX_COMPILER "$ENV{CXX}" CACHE PATH "Path to C++ compiler") + elseif(EXISTS "${ROCM_PATH}/bin/amdclang++") + set(CMAKE_CXX_COMPILER "${ROCM_PATH}/bin/amdclang++" CACHE PATH "Path to C++ compiler") + elseif(EXISTS "${ROCM_PATH}/llvm/bin/amdclang++") + set(CMAKE_CXX_COMPILER "${ROCM_PATH}/llvm/bin/amdclang++" CACHE PATH "Path to C++ compiler") + elseif(EXISTS "${ROCM_PATH}/llvm/bin/clang++") + set(CMAKE_CXX_COMPILER "${ROCM_PATH}/llvm/bin/clang++" CACHE PATH "Path to C++ compiler") + else() + message(FATAL_ERROR + "Cannot find amdclang++/clang++ under ${ROCM_PATH}/bin or ${ROCM_PATH}/llvm/bin") + endif() +endif() + +if(NOT CMAKE_C_COMPILER) + if(DEFINED ENV{CC} AND NOT "$ENV{CC}" STREQUAL "") + set(CMAKE_C_COMPILER "$ENV{CC}" CACHE PATH "Path to C compiler") + else() + get_filename_component(_cxx_dir "${CMAKE_CXX_COMPILER}" DIRECTORY) + get_filename_component(_cxx_name "${CMAKE_CXX_COMPILER}" NAME) + string(REPLACE "clang++" "clang" _cc_name "${_cxx_name}") + if(EXISTS "${_cxx_dir}/${_cc_name}") + set(CMAKE_C_COMPILER "${_cxx_dir}/${_cc_name}" CACHE PATH "Path to C compiler") + endif() + unset(_cxx_dir) + unset(_cxx_name) + unset(_cc_name) + endif() +endif() + +# Seed default per-config flags. _INIT vars are written to cache on first configure; +# user overrides via -DCMAKE_CXX_FLAGS_DEBUG=... or $CXXFLAGS/$CFLAGS take precedence. +if(NOT (DEFINED ENV{CXXFLAGS} AND NOT "$ENV{CXXFLAGS}" STREQUAL "")) + set(CMAKE_CXX_FLAGS_DEBUG_INIT "-O0 -g -ggdb3") + set(CMAKE_CXX_FLAGS_RELEASE_INIT "-O3") + set(CMAKE_CXX_FLAGS_RELWITHDEBINFO_INIT "-O3 -g") +endif() +if(NOT (DEFINED ENV{CFLAGS} AND NOT "$ENV{CFLAGS}" STREQUAL "")) + set(CMAKE_C_FLAGS_DEBUG_INIT "-O0 -g -ggdb3") + set(CMAKE_C_FLAGS_RELEASE_INIT "-O3") + set(CMAKE_C_FLAGS_RELWITHDEBINFO_INIT "-O3 -g") +endif() + +set(ENV{ROCM_PATH} "${ROCM_PATH}") + +# TransferBench project definitions +#================================================================================================== +set(VERSION_STRING "1.67.00") project(TransferBench VERSION ${VERSION_STRING} LANGUAGES CXX) -## Load CMake modules +if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) + message(STATUS "Setting build type to 'Release' as none was specified.") + set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose the type of build." FORCE) +endif() + +# Load CMake modules +# Extend MODULE_PATH before any include() that searches it. 
#================================================================================================== +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") include(CheckIncludeFiles) include(CheckSymbolExists) -include(cmake/Dependencies.cmake) # rocm-cmake, rocm_local_targets - -list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") +include(CheckCXXSourceCompiles) +include(CheckCXXCompilerFlag) +include(CMakePushCheckState) # Build options #================================================================================================== option(BUILD_LOCAL_GPU_TARGET_ONLY "Build only for GPUs detected on this machine" OFF) option(ENABLE_NIC_EXEC "Enable RDMA NIC Executor in TransferBench" OFF) option(ENABLE_MPI_COMM "Enable MPI Communicator support" OFF) -option(DISABLE_DMABUF "Disable DMA-BUF support for GPU Direct RDMA" ON) +option(ENABLE_DMA_BUF "Enable DMA-BUF support for GPU Direct RDMA" OFF) +option(ENABLE_AMD_SMI "Enable AMD-SMI pod membership queries" OFF) +option(ENABLE_POD_COMM "Enable pod communication" OFF) option(BUILD_RELOCATABLE_PACKAGE "Build with RVS-style relocatable RPATH and amdrocm-transferbench package naming" OFF) -# Default GPU architectures to build -#================================================================================================== +include(cmake/Dependencies.cmake) # rocm-cmake, rocm_local_targets, rocm_check_target_ids + set(DEFAULT_GPUS gfx906 gfx908 @@ -43,190 +145,297 @@ set(DEFAULT_GPUS gfx1150 gfx1151 gfx1200 - gfx1201) + gfx1201 + gfx1250) -## Build only for local GPU architecture if(BUILD_LOCAL_GPU_TARGET_ONLY) message(STATUS "Building only for local GPU target") - if (COMMAND rocm_local_targets) - rocm_local_targets(DEFAULT_GPUS) + if(COMMAND rocm_local_targets) + rocm_local_targets(LOCAL_GPU_TARGETS) + if(LOCAL_GPU_TARGETS) + set(DEFAULT_GPUS ${LOCAL_GPU_TARGETS}) + else() + message(WARNING "No local GPUs detected; falling back to default GPU list.") + endif() else() message(WARNING "Unable to determine local GPU targets. Falling back to default GPUs.") endif() + # FORCE so re-runs pick up the freshly detected local set. + set(GPU_TARGETS "${DEFAULT_GPUS}" CACHE STRING "GPU architectures to build for." FORCE) +else() + # Seeded once on first configure; hip-config-amd.cmake applies the same priority but warns on AMDGPU_TARGETS. + if(NOT DEFINED CACHE{GPU_TARGETS}) + if(DEFINED AMDGPU_TARGETS AND NOT AMDGPU_TARGETS STREQUAL "") + set(GPU_TARGETS "${AMDGPU_TARGETS}" CACHE STRING "GPU architectures to build for.") + message(STATUS "GPU_TARGETS seeded from CMake AMDGPU_TARGETS: ${GPU_TARGETS}") + elseif(DEFINED ENV{AMDGPU_TARGETS} AND NOT "$ENV{AMDGPU_TARGETS}" STREQUAL "") + set(GPU_TARGETS "$ENV{AMDGPU_TARGETS}" CACHE STRING "GPU architectures to build for.") + message(STATUS "GPU_TARGETS seeded from environment AMDGPU_TARGETS: ${GPU_TARGETS}") + else() + set(GPU_TARGETS "${DEFAULT_GPUS}" CACHE STRING "GPU architectures to build for.") + endif() + endif() endif() -## Determine which GPU architectures to build for -set(GPU_TARGETS "${DEFAULT_GPUS}" CACHE STRING "Target default GPUs if GPU_TARGETS is not defined.") - -## Check if clang compiler can offload to GPU_TARGETS -if (COMMAND rocm_check_target_ids) - message(STATUS "Checking for ROCm support for GPU targets: " "${GPU_TARGETS}") +# Check if clang can offload to each GPU_TARGETS entry. 
+if(COMMAND rocm_check_target_ids) + message(STATUS "Checking for ROCm support for GPU targets: ${GPU_TARGETS}") rocm_check_target_ids(SUPPORTED_GPUS TARGETS ${GPU_TARGETS}) else() message(WARNING "Unable to check for supported GPU targets. Falling back to default GPUs.") set(SUPPORTED_GPUS ${DEFAULT_GPUS}) endif() -set(GPU_TARGETS "${SUPPORTED_GPUS}") -message(STATUS "Compiling for ${GPU_TARGETS}") - -## NOTE: Reload rocm-cmake in order to update GPU_TARGETS -include(cmake/Dependencies.cmake) # Reloading to use desired GPU_TARGETS instead of defaults - -# Check for required dependencies -#================================================================================================== -## Try to establish ROCM_PATH (for find_package) -if(NOT DEFINED ROCM_PATH) - # Guess default location - set(ROCM_PATH "/opt/rocm") - message(WARNING "Unable to find ROCM_PATH: Falling back to ${ROCM_PATH}") +if(SUPPORTED_GPUS) + set(GPU_TARGETS "${SUPPORTED_GPUS}" CACHE STRING "GPU architectures to build for." FORCE) else() - message(STATUS "ROCM_PATH found: ${ROCM_PATH}") + message(WARNING "rocm_check_target_ids returned no supported GPUs; keeping existing GPU_TARGETS=${GPU_TARGETS}") endif() -set(ENV{ROCM_PATH} ${ROCM_PATH}) +message(STATUS "- Compiling for ${GPU_TARGETS}") -## Set CMAKE flags -if (NOT DEFINED CMAKE_CXX_STANDARD) +if(NOT DEFINED CMAKE_CXX_STANDARD) set(CMAKE_CXX_STANDARD 17) endif() set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) -list(APPEND CMAKE_PREFIX_PATH # Add ROCM_PATH to CMake search paths for finding HIP / HSA + +# Search only the active ROCm installation. ROCM_PATH is already resolved by the +# pre-project() block, so this is always the right install. +list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH} ${ROCM_PATH}/llvm - ${ROCM_PATH}/hip - /opt/rocm - /opt/rocm/llvm - /opt/rocm/hip) + ${ROCM_PATH}/hip) -## Check for HIP -find_package(hip REQUIRED CONFIG PATHS ${CMAKE_PREFIX_PATH}) +find_package(hip REQUIRED CONFIG) message(STATUS "HIP compiler: ${HIP_COMPILER}") -## Ensuring that CXX compiler meets expectations if(NOT (("${CMAKE_CXX_COMPILER}" MATCHES ".*hipcc") OR ("${CMAKE_CXX_COMPILER}" MATCHES ".*clang\\+\\+"))) message(FATAL_ERROR "On ROCm platform 'hipcc' or HIP-aware Clang must be used as C++ compiler.") endif() ## Check for Threads -find_package(Threads REQUIRED) set(THREADS_PREFER_PTHREAD_FLAG ON) +find_package(Threads REQUIRED) ## Check for numa support find_library(NUMA_LIBRARY numa) find_path(NUMA_INCLUDE_DIR numa.h) if(NUMA_LIBRARY AND NUMA_INCLUDE_DIR) add_library(numa SHARED IMPORTED) - set_target_properties(numa PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${NUMA_INCLUDE_DIR}" IMPORTED_LOCATION "${NUMA_LIBRARY}" INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${NUMA_INCLUDE_DIR}") + set_target_properties(numa PROPERTIES IMPORTED_LOCATION "${NUMA_LIBRARY}" INTERFACE_INCLUDE_DIRECTORIES "${NUMA_INCLUDE_DIR}" INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${NUMA_INCLUDE_DIR}") +else() + message(FATAL_ERROR "NUMA library or headers not found; TransferBench requires libnuma") endif() ## Check for hsa support -find_library(HSA_LIBRARY hsa-runtime64 PATHS ${ROCM_PATH} ${ROCM_PATH}/lib) -find_path(HSA_INCLUDE_DIR hsa.h PATHS ${ROCM_PATH}/include ${ROCM_PATH}/include/hsa) +find_library(HSA_LIBRARY hsa-runtime64 PATHS ${ROCM_PATH}/lib ${ROCM_PATH}/lib64 NO_DEFAULT_PATH) +find_path(HSA_INCLUDE_DIR hsa/hsa.h PATHS ${ROCM_PATH}/include) if(HSA_LIBRARY AND HSA_INCLUDE_DIR) add_library(hsa-runtime64 SHARED IMPORTED) - set_target_properties(hsa-runtime64 PROPERTIES 
INTERFACE_INCLUDE_DIRECTORIES "${HSA_INCLUDE_DIR}" IMPORTED_LOCATION "${HSA_LIBRARY}" INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${HSA_INCLUDE_DIR}") + set_target_properties(hsa-runtime64 PROPERTIES IMPORTED_LOCATION "${HSA_LIBRARY}" INTERFACE_INCLUDE_DIRECTORIES "${HSA_INCLUDE_DIR}" INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${HSA_INCLUDE_DIR}") +else() + message(FATAL_ERROR "HSA library or headers not found under ${ROCM_PATH}; TransferBench requires libhsa-runtime64") endif() ## Check for infiniband verbs support if(DEFINED ENV{DISABLE_NIC_EXEC} AND "$ENV{DISABLE_NIC_EXEC}" STREQUAL "1") message(STATUS "Disabling NIC Executor support as env. flag DISABLE_NIC_EXEC was enabled") elseif(NOT ENABLE_NIC_EXEC) - message(STATUS "For CMake builds, NIC executor so requires explicit opt-in by setting CMake flag -DENABLE_NIC_EXEC=ON") - message(STATUS "Disabling NIC Executor support") + message(STATUS "For CMake builds, NIC Executor support requires explicit opt-in by setting CMake flag -DENABLE_NIC_EXEC=ON") + message(STATUS "- Disabling NIC Executor support") else() + message(STATUS "Attempting to build with NIC executor support") + find_library(IBVERBS_LIBRARY ibverbs) find_path(IBVERBS_INCLUDE_DIR infiniband/verbs.h) if(IBVERBS_LIBRARY AND IBVERBS_INCLUDE_DIR) add_library(ibverbs SHARED IMPORTED) set_target_properties(ibverbs PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${IBVERBS_INCLUDE_DIR}" IMPORTED_LOCATION "${IBVERBS_LIBRARY}" INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${IBVERBS_INCLUDE_DIR}") set(IBVERBS_FOUND 1) - message(STATUS "Building with NIC executor support. Can set DISABLE_NIC_EXEC=1 to disable") + message(STATUS "- Building with NIC executor support. Can set DISABLE_NIC_EXEC=1 to disable") else() if(NOT IBVERBS_LIBRARY) - message(WARNING "IBVerbs library not found") + message(WARNING "- IBVerbs library not found") elseif(NOT IBVERBS_INCLUDE_DIR) - message(WARNING "infiniband/verbs.h not found") + message(WARNING "- infiniband/verbs.h not found") endif() - message(WARNING "Building without NIC executor support. To use the TransferBench RDMA executor, check if your system has NICs, the NIC drivers are installed, and libibverbs-dev is installed") + message(WARNING "- Building without NIC executor support. To use the TransferBench RDMA executor, check if your system has NICs, the NIC drivers are installed, and libibverbs-dev is installed") endif() endif() -## Check for DMA-BUF support (requires IBVERBS_FOUND) -if(IBVERBS_FOUND AND NOT DISABLE_DMABUF) - message(STATUS "Checking for DMA-BUF support...") - - # Check for ibv_reg_dmabuf_mr - include(CheckSymbolExists) - set(CMAKE_REQUIRED_INCLUDES ${IBVERBS_INCLUDE_DIR}) - set(CMAKE_REQUIRED_LIBRARIES ${IBVERBS_LIBRARY}) - check_symbol_exists(ibv_reg_dmabuf_mr "infiniband/verbs.h" HAVE_IBV_DMABUF) - - # Check for hsa_amd_portable_export_dmabuf - set(CMAKE_REQUIRED_INCLUDES ${HSA_INCLUDE_DIR}) - set(CMAKE_REQUIRED_LIBRARIES ${HSA_LIBRARY}) - check_symbol_exists(hsa_amd_portable_export_dmabuf "hsa_ext_amd.h" HAVE_ROCM_DMABUF) - - # Enable DMA-BUF only if both APIs are available - if(HAVE_IBV_DMABUF AND HAVE_ROCM_DMABUF) - set(DMABUF_SUPPORT_FOUND 1) - message(STATUS "Building with DMA-BUF support") +## Check for DMA-BUF support (requires IBVERBS) +if(IBVERBS_FOUND) + if(DEFINED ENV{DISABLE_DMA_BUF} AND "$ENV{DISABLE_DMA_BUF}" STREQUAL "1") + message(STATUS "Disabling DMA-BUF support as env. 
flag DISABLE_DMA_BUF was enabled") + elseif(NOT ENABLE_DMA_BUF) + message(STATUS "For CMake builds, DMA-BUF support requires explicit opt-in by setting CMake flags -DENABLE_DMA_BUF=ON") + message(STATUS "- Disabling DMA-BUF support") else() - if(NOT HAVE_IBV_DMABUF AND NOT HAVE_ROCM_DMABUF) - message(WARNING "Building without DMA-BUF support: missing both ibv_reg_dmabuf_mr and ROCm DMA-BUF export") - elseif(NOT HAVE_IBV_DMABUF) - message(WARNING "Building without DMA-BUF support: missing ibv_reg_dmabuf_mr") + message(STATUS "Attempting to build with DMA-BUF support") + + # Check for ibv_reg_dmabuf_mr + cmake_push_check_state() + set(CMAKE_REQUIRED_INCLUDES ${IBVERBS_INCLUDE_DIR}) + set(CMAKE_REQUIRED_LIBRARIES ${IBVERBS_LIBRARY}) + check_symbol_exists(ibv_reg_dmabuf_mr "infiniband/verbs.h" HAVE_IBV_DMABUF) + cmake_pop_check_state() + + # Check for hsa_amd_portable_export_dmabuf + cmake_push_check_state() + set(CMAKE_REQUIRED_INCLUDES ${HSA_INCLUDE_DIR}) + set(CMAKE_REQUIRED_LIBRARIES ${HSA_LIBRARY}) + check_symbol_exists(hsa_amd_portable_export_dmabuf "hsa/hsa_ext_amd.h" HAVE_ROCM_DMABUF) + cmake_pop_check_state() + + # Enable DMA-BUF only if both APIs are available + if(HAVE_IBV_DMABUF AND HAVE_ROCM_DMABUF) + set(DMABUF_SUPPORT_FOUND 1) + message(STATUS "- Building with DMA-BUF support") else() - message(WARNING "Building without DMA-BUF support: missing ROCm DMA-BUF export") + if(NOT HAVE_IBV_DMABUF AND NOT HAVE_ROCM_DMABUF) + message(WARNING "- Building without DMA-BUF support: missing both ibv_reg_dmabuf_mr and ROCm DMA-BUF export") + elseif(NOT HAVE_IBV_DMABUF) + message(WARNING "- Building without DMA-BUF support: missing ibv_reg_dmabuf_mr") + else() + message(WARNING "- Building without DMA-BUF support: missing ROCm DMA-BUF export") + endif() endif() endif() -elseif(NOT DISABLE_DMABUF) - message(WARNING "DMA-BUF support requires ENABLE_NIC_EXEC=ON") endif() ## Check for MPI support set(MPI_PATH "" CACHE PATH "Path to MPI installation (takes priority over system MPI)") -if(NOT ENABLE_MPI_COMM) +if(DEFINED ENV{DISABLE_MPI_COMM} AND "$ENV{DISABLE_MPI_COMM}" STREQUAL "1") + message(STATUS "Disabling MPI Communicator support as env. 
flag DISABLE_MPI_COMM was enabled") +elseif(NOT ENABLE_MPI_COMM) message(STATUS "For CMake builds, MPI Communicator requires explicit opt-in by setting CMake flag -DENABLE_MPI_COMM=ON") message(STATUS "Disabling MPI Communicator support") else() - # First check user-specified MPI_PATH (similar to Makefile) + message(STATUS "Attempting to build with MPI communicator support") if(MPI_PATH AND EXISTS "${MPI_PATH}/include/mpi.h") - find_library(MPI_LIBRARY NAMES mpi PATHS ${MPI_PATH}/lib NO_DEFAULT_PATH) + find_library(MPI_LIBRARY NAMES mpi PATHS ${MPI_PATH}/lib ${MPI_PATH}/lib64 NO_DEFAULT_PATH) if(MPI_LIBRARY) set(MPI_COMM_FOUND 1) set(MPI_INCLUDE_DIR "${MPI_PATH}/include") - set(MPI_LINK_DIR "${MPI_PATH}/lib") - message(STATUS "Building with MPI Communicator support (found at MPI_PATH: ${MPI_PATH})") + message(STATUS "- Building with MPI Communicator support (found at MPI_PATH: ${MPI_PATH})") else() - message(WARNING "Found mpi.h at ${MPI_PATH}/include but could not find MPI library at ${MPI_PATH}/lib") + message(WARNING "- Found mpi.h at ${MPI_PATH}/include but could not find MPI library at ${MPI_PATH}/lib") endif() else() - # Fall back to find_package if(MPI_PATH) - message(STATUS "Unable to find mpi.h at ${MPI_PATH}/include, trying find_package") + message(STATUS "- Unable to find mpi.h at ${MPI_PATH}/include, trying find_package") endif() find_package(MPI QUIET) if(MPI_CXX_FOUND) set(MPI_COMM_FOUND 1) - message(STATUS "Building with MPI Communicator support (found via find_package)") - message(STATUS "- Using MPI include path: ${MPI_CXX_INCLUDE_PATH}") - message(STATUS "- Using MPI library:: ${MPI_CXX_LIBRARIES}") + message(STATUS "- Building with MPI Communicator support (found via find_package)") + message(STATUS " - Using MPI include path: ${MPI_CXX_INCLUDE_DIRS}") + message(STATUS " - Using MPI library: ${MPI_CXX_LIBRARIES}") else() - message(WARNING "MPI not found. Please specify appropriate MPI_PATH or install MPI libraries (e.g., OpenMPI or MPICH)") + message(WARNING "- MPI not found. Please specify appropriate MPI_PATH or install MPI libraries (e.g., OpenMPI or MPICH)") endif() endif() endif() -set(CMAKE_RUNTIME_OUTPUT_DIRECTORY .) +## Check for pod communication support +if(ENABLE_AMD_SMI AND NOT ENABLE_POD_COMM) + message(WARNING "ENABLE_AMD_SMI=ON has no effect without ENABLE_POD_COMM=ON; AMD-SMI detection will be skipped") +endif() +if(DEFINED ENV{DISABLE_POD_COMM} AND "$ENV{DISABLE_POD_COMM}" STREQUAL "1") + message(STATUS "Disabling pod communication support as env. flag DISABLE_POD_COMM was enabled") +elseif(NOT ENABLE_POD_COMM) + message(STATUS "For CMake builds, pod communication support requires explicit opt-in by setting CMake flag -DENABLE_POD_COMM=ON") + message(STATUS "- Disabling pod communication support") +else() + find_library(HIP_RUNTIME_LIBRARY amdhip64 PATHS ${ROCM_PATH}/lib ${ROCM_PATH}/lib64 NO_DEFAULT_PATH) + if(NOT HIP_RUNTIME_LIBRARY) + message(FATAL_ERROR "libamdhip64 not found under ${ROCM_PATH}/lib or ${ROCM_PATH}/lib64; cannot probe for HIP fabric API") + endif() + # Probe for the HIP fabric API functions used by TransferBench at runtime. 
+  cmake_push_check_state()
+  set(CMAKE_REQUIRED_INCLUDES "${ROCM_PATH}/include")
+  set(CMAKE_REQUIRED_LIBRARIES "${HIP_RUNTIME_LIBRARY}")
+  set(CMAKE_REQUIRED_DEFINITIONS "-D__HIP_PLATFORM_AMD__")
+  check_cxx_source_compiles("
+    #include <hip/hip_runtime.h>
+    int main() {
+      hipMemFabricHandle_t fabricHandle = {};
+      hipMemGenericAllocationHandle_t allocationHandle = {};
+      hipMemExportToShareableHandle(&fabricHandle, allocationHandle, hipMemHandleTypeFabric, 0);
+      hipMemImportFromShareableHandle(&allocationHandle, &fabricHandle, hipMemHandleTypeFabric);
+      return 0;
+    }" HIP_HAS_FABRIC_API)
+  cmake_pop_check_state()
+
+  if(HIP_HAS_FABRIC_API)
+    message(STATUS "- HIP fabric API found; enabling pod communication support")
+    set(POD_COMM_FOUND 1)
+
+    # Check for AMD-SMI support
+    # Try amd-smi for pod membership queries; fall back to TB_FORCE_SINGLE_POD=1 at runtime.
+    if(DEFINED ENV{DISABLE_AMD_SMI} AND "$ENV{DISABLE_AMD_SMI}" STREQUAL "1")
+      message(STATUS "- Disabling AMD-SMI as env. flag DISABLE_AMD_SMI was enabled")
+      message(WARNING "Set TB_FORCE_SINGLE_POD=1 at runtime to override pod membership")
+    elseif(NOT ENABLE_AMD_SMI)
+      message(STATUS "- For CMake builds, AMD-SMI requires explicit opt-in by setting CMake flag -DENABLE_AMD_SMI=ON")
+      message(WARNING "Set TB_FORCE_SINGLE_POD=1 at runtime to override pod membership")
+    else()
+      find_path(AMD_SMI_INCLUDE_DIR amd_smi/amdsmi.h PATHS ${ROCM_PATH}/include NO_DEFAULT_PATH)
+      find_library(AMD_SMI_LIBRARY amd_smi PATHS ${ROCM_PATH}/lib ${ROCM_PATH}/lib64 NO_DEFAULT_PATH)
+      if(AMD_SMI_INCLUDE_DIR AND AMD_SMI_LIBRARY)
+        # Probe for the AMD-SMI functions used by TransferBench at runtime.
+        cmake_push_check_state()
+        set(CMAKE_REQUIRED_INCLUDES "${AMD_SMI_INCLUDE_DIR}")
+        set(CMAKE_REQUIRED_LIBRARIES "${AMD_SMI_LIBRARY}")
+        check_cxx_source_compiles("
+          #include <amd_smi/amdsmi.h>
+          int main() {
+            amdsmi_bdf_t bdf = {};
+            amdsmi_processor_handle h;
+            amdsmi_get_processor_handle_from_bdf(bdf, &h);
+            amdsmi_fabric_info_t fi;
+            amdsmi_get_gpu_fabric_info(h, &fi);
+            (void)fi.fabric_info.fabric_version.v1.ppod_id;
+            (void)fi.fabric_info.fabric_version.v1.vpod_id;
+            return 0;
+          }" AMDSMI_HAS_FABRIC)
+        cmake_pop_check_state()
+
+        if(AMDSMI_HAS_FABRIC)
+          message(STATUS "- AMD-SMI fabric API found; using AMD-SMI for pod membership queries")
+          set(AMD_SMI_FOUND 1)
+        else()
+          message(STATUS "- AMD-SMI fabric API not found")
+          message(WARNING "Set TB_FORCE_SINGLE_POD=1 at runtime to override pod membership")
+        endif()
+      else()
+        if(NOT AMD_SMI_INCLUDE_DIR)
+          message(STATUS "- amd_smi/amdsmi.h not found under ${ROCM_PATH}/include")
+        endif()
+        if(NOT AMD_SMI_LIBRARY)
+          message(STATUS "- libamd_smi not found under ${ROCM_PATH}/lib or ${ROCM_PATH}/lib64")
+        endif()
+        message(STATUS "- AMD-SMI not available")
+        message(WARNING "Set TB_FORCE_SINGLE_POD=1 at runtime to override pod membership")
+      endif()
+    endif()
+  else()
+    message(STATUS "- HIP fabric API not found; disabling pod communication support")
+  endif()
+endif()
+
+set(PACKAGE_NAME TB)
+set(LIBRARY_NAME TransferBench)
+
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}")
 add_executable(TransferBench src/client/Client.cpp)
-target_include_directories(TransferBench PRIVATE src/header)
-target_include_directories(TransferBench PRIVATE src/client)
-target_include_directories(TransferBench PRIVATE src/client/Presets)
-target_include_directories(TransferBench PRIVATE ${NUMA_INCLUDE_DIR})
-target_include_directories(TransferBench PRIVATE ${HSA_INCLUDE_DIR})
+target_include_directories(TransferBench PRIVATE
+  src/header
+  src/client
+ src/client/Presets) + if(IBVERBS_FOUND) target_include_directories(TransferBench PRIVATE ${IBVERBS_INCLUDE_DIR}) target_link_libraries(TransferBench PRIVATE ${IBVERBS_LIBRARY}) @@ -234,11 +443,8 @@ if(IBVERBS_FOUND) endif() if(MPI_COMM_FOUND) if(TARGET MPI::MPI_CXX) - # Found via find_package - target_include_directories(TransferBench PRIVATE ${MPI_CXX_INCLUDE_DIRS}) target_link_libraries(TransferBench PRIVATE MPI::MPI_CXX) else() - # Found via MPI_PATH fallback target_include_directories(TransferBench PRIVATE ${MPI_INCLUDE_DIR}) target_link_libraries(TransferBench PRIVATE ${MPI_LIBRARY}) endif() @@ -247,18 +453,28 @@ endif() if(DMABUF_SUPPORT_FOUND) target_compile_definitions(TransferBench PRIVATE HAVE_DMABUF_SUPPORT) endif() -if (HAVE_PARALLEL_JOBS) - target_compile_options(TransferBench PRIVATE -parallel-jobs=12) +if(AMD_SMI_FOUND) + target_include_directories(TransferBench PRIVATE ${AMD_SMI_INCLUDE_DIR}) + target_link_libraries(TransferBench PRIVATE ${AMD_SMI_LIBRARY}) + target_compile_definitions(TransferBench PRIVATE AMD_SMI_ENABLED) +endif() +if(POD_COMM_FOUND) + target_compile_definitions(TransferBench PRIVATE POD_COMM_ENABLED) endif() +check_cxx_compiler_flag(-parallel-jobs=12 HAVE_PARALLEL_JOBS) +if(HAVE_PARALLEL_JOBS) + message(STATUS "Enabling parallel compile jobs: -parallel-jobs=12") + target_compile_options(TransferBench PRIVATE -parallel-jobs=12) +else() + message(STATUS "Compiler does not support -parallel-jobs=12 (or the check failed); skipping -parallel-jobs optimisation") +endif() -target_link_libraries(TransferBench PRIVATE -fgpu-rdc) # Required when linking relocatable device code +target_link_options(TransferBench PRIVATE -fgpu-rdc) target_link_libraries(TransferBench PRIVATE Threads::Threads) -target_link_libraries(TransferBench INTERFACE hip::host) -target_link_libraries(TransferBench PRIVATE hip::device) -target_link_libraries(TransferBench PRIVATE dl) -target_link_libraries(TransferBench PRIVATE ${NUMA_LIBRARY}) -target_link_libraries(TransferBench PRIVATE ${HSA_LIBRARY}) +target_link_libraries(TransferBench PRIVATE hip::host hip::device dl) +target_link_libraries(TransferBench PRIVATE hsa-runtime64) +target_link_libraries(TransferBench PRIVATE numa) # gcc <9 ships std::filesystem in a separate library (libstdc++fs). # Required on AlmaLinux 8 / manylinux_2_28; harmless no-op stub on newer toolchains. 
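[Aside for reviewers: a minimal sketch of how the reworked CMake build above would be exercised end to end. The option names all come from this patch and default to OFF; the ROCm path and GPU target below are illustrative assumptions, DMA-BUF additionally requires the NIC executor (it is probed only when IBVERBS is found), and ENABLE_AMD_SMI only takes effect together with ENABLE_POD_COMM=ON.]

    # Hypothetical opt-in configure; adjust ROCM_PATH / GPU_TARGETS for your system.
    cmake -S . -B build \
          -DROCM_PATH=/opt/rocm \
          -DENABLE_NIC_EXEC=ON -DENABLE_DMA_BUF=ON \
          -DENABLE_MPI_COMM=ON \
          -DENABLE_POD_COMM=ON -DENABLE_AMD_SMI=ON \
          -DGPU_TARGETS="gfx942"
    cmake --build build
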
diff --git a/Makefile b/Makefile index 4bc3cdab..71562cc2 100644 --- a/Makefile +++ b/Makefile @@ -6,14 +6,18 @@ ROCM_PATH ?= /opt/rocm CUDA_PATH ?= /usr/local/cuda MPI_PATH ?= /usr/local/openmpi +HIPCC ?= $(ROCM_PATH)/bin/amdclang++ +NVCC ?= $(CUDA_PATH)/bin/nvcc +DEBUG ?= 0 # Optional features (set to 0 to disable, 1 to enable) -# DISABLE_NIC_EXEC: Disable RDMA/NIC executor support (default: 0) -# DISABLE_MPI_COMM: Disable MPI communicator support (default: 0) -# DISABLE_DMABUF: Disable DMA-BUF support for GPU Direct RDMA (default: 1) - -HIPCC ?= $(ROCM_PATH)/bin/amdclang++ -NVCC ?= $(CUDA_PATH)/bin/nvcc +# DISABLE_NIC_EXEC: Disable RDMA/NIC executor support (default: 0) +# DISABLE_MPI_COMM: Disable MPI communicator support (default: 0) +# DISABLE_DMA_BUF: Disable DMA-BUF support for GPU Direct RDMA (default: 1) +# DISABLE_AMD_SMI: Disable AMD-SMI pod membership checking support (default: 0) +# DISABLE_NVML: Disable NVML pod membership detection for CUDA builds (default: 0) +# DISABLE_POD_COMM: Disable pod communication support (default: 0) +# DISABLE_CUMEM: Disable CUDA driver API (also disables pod on CUDA) (default: 0) # ROCm device libraries can live in different locations depending on packaging. # hipcc/clang needs to find the amdgcn bitcode directory at link time. @@ -32,11 +36,11 @@ SINGLE_KERNEL ?= 0 GPU_TARGETS ?= native EXE=TransferBench -DEBUG ?= 0 # Only perform this check if 'make clean' is not the target ifeq ($(filter clean,$(MAKECMDGOALS)),) ifeq ($(MAKECMDGOALS),TransferBenchCuda) + $(info Building TransferBenchCuda) # Check for nvcc ifneq ($(shell test -e $(NVCC) && echo found), found) $(error "Could not find $(NVCC). Please set CUDA_PATH appropriately") @@ -48,15 +52,21 @@ ifeq ($(filter clean,$(MAKECMDGOALS)),) # Check for HIP compiler ifeq ("$(shell test -e $(HIPCC) && echo found)", "found") CXX=$(HIPCC) - else ifeq ("$(shell test -e $(ROCM_PATH)/bin/hipcc && echo found)", "found") - CXX=$(ROCM_PATH)/bin/hipcc - $(warning "Could not find $(HIPCC). Using fallback to $(CXX)") else - $(error "Could not find $(HIPCC) or $(ROCM_PATH)/bin/hipcc. Check if the path is correct if you want to build $(EXE)") + ifeq ("$(shell test -e $(ROCM_PATH)/llvm/bin/amdclang++ && echo found)", "found") + CXX=$(ROCM_PATH)/llvm/bin/amdclang++ + else ifeq ("$(shell test -e $(ROCM_PATH)/llvm/bin/clang++ && echo found)", "found") + CXX=$(ROCM_PATH)/llvm/bin/clang++ + else ifeq ("$(shell test -e $(ROCM_PATH)/bin/hipcc && echo found)", "found") + CXX=$(ROCM_PATH)/bin/hipcc + else + $(error "Could not find a HIP compiler. Tried: $(HIPCC), $(ROCM_PATH)/llvm/bin/amdclang++, $(ROCM_PATH)/llvm/bin/clang++, $(ROCM_PATH)/bin/hipcc. Check if ROCM_PATH is correct") + endif + $(info "Could not find $(HIPCC). Using fallback to $(CXX)") endif GPU_TARGETS_FLAGS = $(foreach target,$(GPU_TARGETS),"--offload-arch=$(target)") - - CXXFLAGS = -I$(ROCM_PATH)/include -I$(ROCM_PATH)/include/hip -I$(ROCM_PATH)/include/hsa + $(info Compiling for $(GPU_TARGETS) architecture(s). Can modify this by setting GPU_TARGETS) + CXXFLAGS = -I. 
-I$(ROCM_PATH)/include -I$(ROCM_PATH)/include/hip -I$(ROCM_PATH)/include/hsa
     HIPLDFLAGS= -lnuma -L$(ROCM_PATH)/lib -lhsa-runtime64 -lamdhip64
     HIPFLAGS  = -Wall -x hip -D__HIP_PLATFORM_AMD__ -D__HIPCC__ $(GPU_TARGETS_FLAGS)
     ifneq ($(strip $(ROCM_DEVICE_LIB_PATH)),)
@@ -84,18 +94,19 @@ ifeq ($(filter clean,$(MAKECMDGOALS)),)
   #  3) infiniband/verbs.h is found in the default include path
   DISABLE_NIC_EXEC ?= 0
   ifneq ($(DISABLE_NIC_EXEC),1)
+    $(info Attempting to build with NIC executor support)
     ifeq ("$(shell ldconfig -p | grep -c ibverbs)", "0")
-      $(info lib IBVerbs not found)
+      $(info - ibverbs library not found)
     else ifeq ("$(shell echo '#include <infiniband/verbs.h>' | $(CXX) -E - 2>/dev/null | grep -c 'infiniband/verbs.h')", "0")
-      $(info infiniband/verbs.h not found)
+      $(info - infiniband/verbs.h not found)
     else
       COMMON_FLAGS += -DNIC_EXEC_ENABLED
       LDFLAGS += -libverbs
       NIC_ENABLED = 1
-      # Disable DMA-BUF support by default (set DISABLE_DMABUF=0 to enable)
-      DISABLE_DMABUF ?= 1
-      ifeq ($(DISABLE_DMABUF), 0)
+      # Disable DMA-BUF support by default (set DISABLE_DMA_BUF=0 to enable)
+      DISABLE_DMA_BUF ?= 1
+      ifeq ($(DISABLE_DMA_BUF), 0)
        # Check for both ibv_reg_dmabuf_mr and ROCm DMA-BUF export support
        HAVE_IBV_DMABUF := $(shell echo '#include <infiniband/verbs.h>' | $(CXX) -E - 2>/dev/null | grep -c 'ibv_reg_dmabuf_mr')
        HAVE_ROCM_DMABUF := $(shell echo '#include <hsa/hsa_ext_amd.h>' | $(CXX) -I$(ROCM_PATH)/include -E - 2>/dev/null | grep -c 'hsa_amd_portable_export_dmabuf')
@@ -111,14 +122,14 @@ ifeq ($(filter clean,$(MAKECMDGOALS)),)
          $(info Building with DMA-BUF support)
        endif
       else
-        $(info Building with DMA-BUF support disabled (DISABLE_DMABUF=1))
+        $(info Building with DMA-BUF support disabled (DISABLE_DMA_BUF=1))
       endif
     endif
     ifeq ($(NIC_ENABLED), 0)
-      $(info Building without NIC executor support)
-      $(info To use the TransferBench RDMA executor, check if your system has NICs, the NIC drivers are installed, and libibverbs-dev is installed)
+      $(info - Building without NIC executor support)
+      $(info - To use the TransferBench RDMA executor, check if your system has NICs, the NIC drivers are installed, and libibverbs-dev is installed)
     else
-      $(info Building with NIC executor support. Can set DISABLE_NIC_EXEC=1 to disable)
+      $(info - Building with NIC executor support. Can set DISABLE_NIC_EXEC=1 to disable)
     endif
   endif
 
@@ -128,30 +139,167 @@ ifeq ($(filter clean,$(MAKECMDGOALS)),)
   #  2) mpi.h is found in the MPI_PATH
   DISABLE_MPI_COMM ?= 0
   ifneq ($(DISABLE_MPI_COMM), 1)
+    $(info Attempting to build with MPI communicator support)
     ifeq ($(wildcard $(MPI_PATH)/include/mpi.h),)
-      $(info Unable to find mpi.h at $(MPI_PATH)/include. Please specify appropriate MPI_PATH)
+      $(info - Unable to find mpi.h at $(MPI_PATH)/include. Please specify appropriate MPI_PATH)
     else
       MPI_ENABLED = 1
       COMMON_FLAGS += -DMPI_COMM_ENABLED -I$(MPI_PATH)/include
-      LDFLAGS += -L/$(MPI_PATH)/lib -lmpi
-      ifeq ($(DEBUG), 1)
-        LDFLAGS += -lmpi_cxx
-      endif
+      LDFLAGS += -L$(MPI_PATH)/lib -L$(MPI_PATH)/lib64 -lmpi
     endif
     ifeq ($(MPI_ENABLED), 0)
-      $(info Building without MPI communicator support)
-      $(info To use TransferBench with MPI support, install MPI libraries and specify appropriate MPI_PATH)
+      $(info - Building without MPI communicator support)
+      $(info - To use TransferBench with MPI support, install MPI libraries and specify appropriate MPI_PATH)
    else
-      $(info Building with MPI communicator support. Can set DISABLE_MPI_COMM=1 to disable)
+      $(info - Building with MPI communicator support.
Can set DISABLE_MPI_COMM=1 to disable) endif endif -endif + NVML_ENABLED = 0 + # Enable NVML support for pod membership detection on NVIDIA platforms + # Compile with NVML support if + # 1) DISABLE_NVML is not set to 1 + # 2) Building TransferBenchCuda + # 3) nvml.h is found under CUDA_PATH + DISABLE_NVML ?= 0 + ifneq ($(DISABLE_NVML), 1) + ifeq ($(MAKECMDGOALS),TransferBenchCuda) + $(info Attempting to build with NVML support) + ifneq ($(wildcard $(CUDA_PATH)/include/nvml.h),) + COMMON_FLAGS += -DNVML_ENABLED + LDFLAGS += -lnvidia-ml + NVML_ENABLED = 1 + $(info - Building with NVML support for pod membership detection) + else + $(info - nvml.h not found at $(CUDA_PATH)/include. Building without NVML support) + $(info - Pod membership may be forced by setting TB_FORCE_SINGLE_POD=1) + endif + endif + endif + + # TransferBenchCuda: CUDA driver API (libcuda). Independent of POD, but POD on CUDA requires CUMEM. + DISABLE_CUMEM ?= 0 + ifeq ($(MAKECMDGOALS),TransferBenchCuda) + ifneq ($(DISABLE_CUMEM),1) + $(info - Building with CUMEM_ENABLED (CUDA driver API, -lcuda)) + COMMON_FLAGS += -DCUMEM_ENABLED + LDFLAGS += -lcuda + else + $(info - CUDA driver API disabled (DISABLE_CUMEM=1); POD comm unavailable on CUDA) + endif + endif + + POD_ENABLED = 0 + AMD_SMI_ENABLED = 0 + # Compile with pod support if + # 1) DISABLE_POD_COMM is not set to 1 + # 2) For HIP: a small probe program that uses hipMemFabricHandle_t, + # hipMemExportToShareableHandle, and hipMemImportFromShareableHandle + # compiles and links successfully against amdhip64 + # For CUDA: CUDA Version >= 12.2 + DISABLE_POD_COMM ?= 0 + DISABLE_AMD_SMI ?= 0 + ifneq ($(DISABLE_POD_COMM), 1) + $(info Attempting to build with pod communication support) + ifeq ($(MAKECMDGOALS),TransferBenchCuda) + # Check for appropriate CUDA support for MNNVL + CUDA_MIN_MAJOR := 12 + CUDA_MIN_MINOR := 2 + + CUDA_VERSION_STR := $(shell $(NVCC) --version | grep release | sed -E 's/.*release ([0-9]+)\.([0-9]+).*/\1 \2/') + CUDA_MAJOR := $(word 1,$(CUDA_VERSION_STR)) + CUDA_MINOR := $(word 2,$(CUDA_VERSION_STR)) + + CUDA_VERSION_OK := $(shell \ + if [ $(CUDA_MAJOR) -gt $(CUDA_MIN_MAJOR) ] || \ + [ $(CUDA_MAJOR) -eq $(CUDA_MIN_MAJOR) -a $(CUDA_MINOR) -ge $(CUDA_MIN_MINOR) ]; then \ + echo yes; \ + else \ + echo no; \ + fi) + + ifeq ($(CUDA_VERSION_OK),yes) + $(info - Detected CUDA version $(CUDA_MAJOR).$(CUDA_MINOR) which has MNNVL support) + ifeq ($(DISABLE_CUMEM),1) + $(info - Pod communication skipped on CUDA: requires CUMEM_ENABLED (DISABLE_CUMEM=1)) + else + COMMON_FLAGS += -DPOD_COMM_ENABLED + POD_ENABLED = 1 + endif + else + $(info - Detected CUDA version $(CUDA_MAJOR).$(CUDA_MINOR) which does not have MNNVL support) + $(info - Pod support will require CUDA version of at least $(CUDA_MIN_MAJOR).$(CUDA_MIN_MINOR)) + endif + else + # Check for the HIP fabric API functions used by TransferBench at runtime. 
+      HIP_HAS_FABRIC := $(shell \
+        printf '%s\n' \
+          '#include <hip/hip_runtime.h>' \
+          'int main() {' \
+          '  hipMemFabricHandle_t fabricHandle = {};' \
+          '  hipMemGenericAllocationHandle_t allocationHandle = {};' \
+          '  hipMemExportToShareableHandle(&fabricHandle, allocationHandle, hipMemHandleTypeFabric, 0);' \
+          '  hipMemImportFromShareableHandle(&allocationHandle, &fabricHandle, hipMemHandleTypeFabric);' \
+          '  return 0;' \
+          '}' | \
+        $(CXX) -I$(ROCM_PATH)/include -D__HIP_PLATFORM_AMD__ -x c++ - \
+          -L$(ROCM_PATH)/lib -L$(ROCM_PATH)/lib64 -lamdhip64 -o /dev/null 2>/dev/null && echo yes || echo no)
+
+      ifeq ($(HIP_HAS_FABRIC),yes)
+        $(info - HIP fabric API found; enabling pod communication support)
+        COMMON_FLAGS += -DPOD_COMM_ENABLED
+        POD_ENABLED = 1
+        ifeq ($(DISABLE_AMD_SMI), 1)
+          $(info - AMD-SMI disabled via DISABLE_AMD_SMI=1; set TB_FORCE_SINGLE_POD=1 at runtime to override pod membership)
+        else
+          # Prefer AMD-SMI for pod membership queries; fall back to TB_FORCE_SINGLE_POD=1 at runtime.
+          AMD_SMI_HEADER := $(ROCM_PATH)/include/amd_smi/amdsmi.h
+          AMD_SMI_LIB    := $(firstword $(wildcard $(ROCM_PATH)/lib/libamd_smi.so $(ROCM_PATH)/lib64/libamd_smi.so))
+          ifneq ($(wildcard $(AMD_SMI_HEADER)),)
+            ifneq ($(AMD_SMI_LIB),)
+              # Check for the AMD-SMI functions used by TransferBench at runtime.
+              AMDSMI_HAS_FABRIC := $(shell \
+                printf '%s\n' \
+                  '#include <amd_smi/amdsmi.h>' \
+                  'int main() {' \
+                  '  amdsmi_bdf_t bdf = {};' \
+                  '  amdsmi_processor_handle h;' \
+                  '  amdsmi_get_processor_handle_from_bdf(bdf, &h);' \
+                  '  amdsmi_fabric_info_t fi;' \
+                  '  amdsmi_get_gpu_fabric_info(h, &fi);' \
+                  '  (void)fi.fabric_info.fabric_version.v1.ppod_id;' \
+                  '  (void)fi.fabric_info.fabric_version.v1.vpod_id;' \
+                  '  return 0;' \
+                  '}' | \
+                $(CXX) -I$(ROCM_PATH)/include -x c++ - \
+                  -L$(dir $(AMD_SMI_LIB)) -lamd_smi -o /dev/null 2>/dev/null && echo yes || echo no)
+
+              ifeq ($(AMDSMI_HAS_FABRIC),yes)
+                $(info - AMD-SMI fabric API found; using AMD-SMI for pod membership queries)
+                COMMON_FLAGS += -DAMD_SMI_ENABLED
+                LDFLAGS += -L$(dir $(AMD_SMI_LIB)) -lamd_smi
+                AMD_SMI_ENABLED = 1
+              else
+                $(info - AMD-SMI fabric API not found; set TB_FORCE_SINGLE_POD=1 at runtime to override pod membership)
+              endif
+            else
+              $(info - libamd_smi not found under $(ROCM_PATH)/lib or $(ROCM_PATH)/lib64; set TB_FORCE_SINGLE_POD=1 at runtime to override pod membership)
+            endif
+          else
+            $(info - amd_smi/amdsmi.h not found under $(ROCM_PATH)/include; set TB_FORCE_SINGLE_POD=1 at runtime to override pod membership)
+          endif
+        endif
+      else
+        $(info - HIP fabric API not found; disabling pod communication support)
+      endif
+    endif
+  endif
+endif

.PHONY : all clean

-all: $(EXE)
+all: TransferBench

TransferBench: ./src/client/Client.cpp $(shell find -regex ".*\.\hpp")
	$(CXX) $(CXXFLAGS) $(HIPFLAGS) $(COMMON_FLAGS) $< -o $@ $(HIPLDFLAGS) $(LDFLAGS)

diff --git a/examples/example.cfg b/examples/example.cfg
index 57d4ae03..14df7e3a 100644
--- a/examples/example.cfg
+++ b/examples/example.cfg
@@ -8,12 +8,13 @@
 #   SRC 1 -> Executor -> DST 1
 #   SRC X             DST Y
 
-# Three Executors are supported by TransferBench
+# Five Executors are supported by TransferBench
 #   Executor:      SubExecutor:
 #   1) CPU         CPU thread
 #   2) GPU         GPU threadblock/Compute Unit (CU)
-#   3) DMA         N/A. (May only be used for copies (single SRC/DST)
+#   3) DMA         N/A. (Must have single SRC, at least one DST)
 #   4) NIC         Queue Pair
+#   5) Batched-DMA Batch item (Must have single SRC, at least one DST)
 
 # Each single line in the configuration file defines a set of Transfers (a Test) to run in parallel
 
@@ -38,6 +39,7 @@
 #   - C: CPU-executed    (Indexed from 0 to # NUMA nodes - 1)
 #   - G: GPU-executed    (Indexed from 0 to # GPUs - 1)
 #   - D: DMA-executor    (Indexed from 0 to # GPUs - 1)
+#   - B: Batched-DMA-executor (Indexed from 0 to # GPUs - 1)
 #   - I#.#: NIC executor (Indexed from 0 to # NICs - 1)
 #   - N#.#: Nearest NIC executor (Indexed from 0 to # GPUs - 1)
 # dstMemL : Destination memory locations (Where the data is to be written to)

diff --git a/src/client/Client.cpp b/src/client/Client.cpp
index 26433500..81cc951b 100644
--- a/src/client/Client.cpp
+++ b/src/client/Client.cpp
@@ -43,7 +43,6 @@ int main(int argc, char **argv)
     if (!ev.outputToCsv) {
       DisplayVersion();
       DisplayUsage(argv[0]);
-      DisplayPresets();
     }
     DisplayTopology(ev.outputToCsv, ev.showBorders);
   }
@@ -258,14 +257,26 @@ void DisplayUsage(char const* cmdName)
   Print("Usage: %s config <N>\n", cmdName);
   Print(" config: Either:\n");
-  Print("         - Filename of configFile containing Transfers to execute (see example.cfg for format)\n");
-  Print("         - Name of preset config:\n");
+  Print("         - Filename of config file containing Transfers to execute\n");
+  Print("         - Name of preset config\n");
+  Print("         - 'cmdline' followed by one transfer expression\n");
+  Print("         - 'dryrun' followed by one transfer expression (prints parsed transfers only)\n");
   Print(" N     : (Optional) Number of bytes to copy per Transfer.\n");
-  Print("         If not specified, defaults to %lu bytes. Must be a multiple of 4 bytes\n",
-        DEFAULT_BYTES_PER_TRANSFER);
+  Print("         If not specified, defaults to %lu. Must be a multiple of 4 bytes\n", DEFAULT_BYTES_PER_TRANSFER);
   Print("         If 0 is specified, a range of Ns will be benchmarked\n");
   Print("         May append a suffix ('K', 'M', 'G') for kilobytes / megabytes / gigabytes\n");
   Print("\n");
-
-  EnvVars::DisplayUsage();
+  Print("- Use \"%s help\" for more information about how to create config files / describe Transfers\n", cmdName);
+  Print("- Use \"%s envvars\" for more information about environment variables that customize behavior\n", cmdName);
+  Print("- Use \"%s presets\" to display the list of available presets\n", cmdName);
+  Print("\n");
+  Print("For multi-rank usage, TransferBench must either be compiled with MPI support or rely on sockets\n");
+  Print("It is recommended to only run one process per node\n");
+  Print(" - MPI approach:\n");
+  Print("     Node 0> mpirun -np 4 -host node0,node1,node2,node3 ./TransferBench a2a\n");
+  Print(" - Socket approach:\n");
+  Print("     Node 0> TB_NUM_RANKS=4 [TB_RANK=0] [TB_MASTER_ADDR=<addr>] ./TransferBench a2a  # Displays connect info for other ranks\n");
+  Print("     Node 1> TB_NUM_RANKS=4 TB_RANK=1 TB_MASTER_ADDR=<addr> ./TransferBench a2a\n");
+  Print("     Node 2> TB_NUM_RANKS=4 TB_RANK=2 TB_MASTER_ADDR=<addr> ./TransferBench a2a\n");
+  Print("     Node 3> TB_NUM_RANKS=4 TB_RANK=3 TB_MASTER_ADDR=<addr> ./TransferBench a2a\n");
 };

diff --git a/src/client/EnvVars.hpp b/src/client/EnvVars.hpp
index c77d2bea..97fd3ea2 100644
--- a/src/client/EnvVars.hpp
+++ b/src/client/EnvVars.hpp
@@ -35,12 +35,15 @@ THE SOFTWARE.
} while (0) #include +#include +#include #include +#include #include #include #include -#define CLIENT_VERSION "02" +#define CLIENT_VERSION "00" #include "TransferBench.hpp" using namespace TransferBench; @@ -87,18 +90,19 @@ class EnvVars int useHsaDma; // Use hsa_amd_async_copy instead of hipMemcpy for non-targetted DMA executions // GFX options + vector cuMask; // Bit-vector representing the CU mask int gfxBlockOrder; // How threadblocks for multiple Transfers are ordered 0=sequential 1=interleaved int gfxBlockSize; // Size of each threadblock (must be multiple of 64) - vector cuMask; // Bit-vector representing the CU mask - vector> prefXccTable; // Specifies XCC to use for given exe->dst pair + int gfxKernel; // GFX Kernel to use (-1=auto, 0=reduce, 1=copy-only) int gfxSeType; // GFX subexecutor type (0=threadblock, 1=warp) + int gfxSingleTeam; // Team all subExecutors across the data array int gfxTemporal; // Non-temporal load/store mode (0=none, 1=load, 2=store, 3=both) int gfxUnroll; // GFX-kernel unroll factor - int useHipEvents; // Use HIP events for timing GFX/DMA Executor - int useSingleStream; // Use a single stream per GPU GFX executor instead of stream per Transfer - int gfxSingleTeam; // Team all subExecutors across the data array int gfxWaveOrder; // GFX-kernel wavefront ordering int gfxWordSize; // GFX-kernel packed data size (4=DWORDx4, 2=DWORDx2, 1=DWORDx1) + vector> prefXccTable; // Specifies XCC to use for given exe->dst pair + int useHipEvents; // Use HIP events for timing GFX/DMA Executor + int useSingleStream; // Use a single stream per GPU GFX executor instead of stream per Transfer // Client options int hideEnv; // Skip printing environment variable @@ -106,12 +110,14 @@ class EnvVars int maxNumVarSubExec; // Maximum # of subexecutors to use for variable subExec Transfers (0 to use device limit) int outputToCsv; // Output in CSV format int samplingFactor; // Affects how many different values of N are generated (when N set to 0) + std::vector showPercentiles; // Iteration-duration percentiles to print // NIC options int ibGidIndex; // GID Index for RoCE NICs uint8_t ibPort; // NIC port number to be used int ipAddressFamily; // IP Address Famliy int nicChunkBytes; // Number of bytes to send per chunk for RDMA operations + int nicCqPollBatch; // Number of CQ entries to poll per ibv_poll_cq call int nicRelaxedOrder; // Use relaxed ordering for RDMA int roceVersion; // RoCE version number @@ -146,8 +152,9 @@ class EnvVars fillCompress = GetEnvVarArray("FILL_COMPRESS" , {}); gfxBlockOrder = GetEnvVar("GFX_BLOCK_ORDER" , 0); gfxBlockSize = GetEnvVar("GFX_BLOCK_SIZE" , 256); + gfxKernel = GetEnvVar("GFX_KERNEL" , 0); gfxSeType = GetEnvVar("GFX_SE_TYPE" , 0); - gfxSingleTeam = GetEnvVar("GFX_SINGLE_TEAM" , 1); + gfxSingleTeam = GetEnvVar("GFX_SINGLE_TEAM" , 0); gfxTemporal = GetEnvVar("GFX_TEMPORAL" , 0); gfxUnroll = GetEnvVar("GFX_UNROLL" , defaultGfxUnroll); gfxWaveOrder = GetEnvVar("GFX_WAVE_ORDER" , 0); @@ -162,6 +169,7 @@ class EnvVars samplingFactor = GetEnvVar("SAMPLING_FACTOR" , 1); showBorders = GetEnvVar("SHOW_BORDERS" , 1); showIterations = GetEnvVar("SHOW_ITERATIONS" , 0); + showPercentiles = GetEnvVarArray("SHOW_PERCENTILES", {}); useHipEvents = GetEnvVar("USE_HIP_EVENTS" , 1); useHsaDma = GetEnvVar("USE_HSA_DMA" , 0); useInteractive = GetEnvVar("USE_INTERACTIVE" , 0); @@ -174,6 +182,7 @@ class EnvVars roceVersion = GetEnvVar("ROCE_VERSION" , 2); ipAddressFamily = GetEnvVar("IP_ADDRESS_FAMILY" , 4); nicChunkBytes = GetEnvVar("NIC_CHUNK_BYTES" , 1073741824); + 
nicCqPollBatch = GetEnvVar("NIC_CQ_POLL_BATCH" , 4); nicRelaxedOrder = GetEnvVar("NIC_RELAX_ORDER" , 1); gpuMaxHwQueues = GetEnvVar("GPU_MAX_HW_QUEUES" , 4); @@ -234,14 +243,15 @@ class EnvVars // Check for CU mask int numXccs = TransferBench::GetNumExecutorSubIndices({EXE_GPU_GFX, 0}); cuMask.clear(); - char* cuMaskStr = getenv("CU_MASK"); - if (cuMaskStr != NULL) { + char const* cuMaskRaw = getenv("CU_MASK"); + if (cuMaskRaw != NULL) { #if defined(__NVCC__) printf("[WARN] CU_MASK is not supported in CUDA\n"); #else std::vector> ranges; int maxCU = 0; - char* token = strtok(cuMaskStr, ","); + std::string cuMaskCopy(cuMaskRaw); + char* token = cuMaskCopy.empty() ? NULL : strtok(&cuMaskCopy[0], ","); while (token) { int start, end; if (sscanf(token, "%d-%d", &start, &end) == 2) { @@ -269,14 +279,25 @@ class EnvVars #endif } + // Check that percentiles are valid + std::sort(showPercentiles.begin(), showPercentiles.end()); + showPercentiles.erase(std::unique(showPercentiles.begin(), showPercentiles.end()), showPercentiles.end()); + for (int v : showPercentiles) { + if (v < 1 || v > 99) { + printf("[ERROR] SHOW_PERCENTILES: value %d out of range (allowed 1..99)\n", v); + exit(1); + } + } + // Parse preferred XCC table (if provided) - char* prefXccStr = getenv("XCC_PREF_TABLE"); - if (prefXccStr) { + char const* prefXccRaw = getenv("XCC_PREF_TABLE"); + if (prefXccRaw) { prefXccTable.resize(numDetectedGpus); for (int i = 0; i < numDetectedGpus; i++){ prefXccTable[i].resize(numDetectedGpus, -1); } - char* token = strtok(prefXccStr, ","); + std::string prefXccCopy(prefXccRaw); + char* token = prefXccCopy.empty() ? NULL : strtok(&prefXccCopy[0], ","); int tokenCount = 0; while (token) { int xccId; @@ -312,55 +333,73 @@ class EnvVars } // Display info on the env vars that can be used - static void DisplayUsage() + static void DisplayEnvVarsList() { - printf("Environment variables:\n"); + printf("Environment variables (client):\n"); printf("======================\n"); - printf(" ALWAYS_VALIDATE - Validate after each iteration instead of once after all iterations\n"); - printf(" BLOCK_BYTES - Controls granularity of how work is divided across subExecutors\n"); - printf(" BYTE_OFFSET - Initial byte-offset for memory allocations. Must be multiple of 4\n"); - printf(" CU_MASK - CU mask for streams. Can specify ranges e.g '5,10-12,14'\n"); - printf(" FILL_COMPRESS - Percentages of 64B lines to be filled by random/1B0/2B0/4B0/32B0\n"); - printf(" FILL_PATTERN - Big-endian pattern for source data, specified in hex digits. Must be even # of digits\n"); - printf(" GFX_BLOCK_ORDER - How blocks for transfers are ordered. 
0=sequential, 1=interleaved\n"); - printf(" GFX_BLOCK_SIZE - # of threads per threadblock (Must be multiple of 64)\n"); - printf(" GFX_SE_TYPE - SubExecutor granularity type (0=threadblock, 1=warp)\n"); - printf(" GFX_TEMPORAL - Use of non-temporal loads or stores (0=none 1=loads 2=stores 3=both)\n"); - printf(" GFX_UNROLL - Unroll factor for GFX kernel (0=auto), must be less than %d\n", TransferBench::GetIntAttribute(ATR_GFX_MAX_UNROLL)); - printf(" GFX_SINGLE_TEAM - Have subexecutors work together on full array instead of working on disjoint subarrays\n"); - printf(" GFX_WAVE_ORDER - Stride pattern for GFX kernel (0=UWC,1=UCW,2=WUC,3=WCU,4=CUW,5=CWU)\n"); - printf(" GFX_WORD_SIZE - GFX kernel packed data size (4=DWORDx4, 2=DWORDx2, 1=DWORDx1)\n"); - printf(" HIDE_ENV - Hide environment variable value listing\n"); + printf(" ALWAYS_VALIDATE - Validate after each iteration instead of once after all iterations\n"); + printf(" BLOCK_BYTES - Controls granularity of how work is divided across subExecutors\n"); + printf(" BYTE_OFFSET - Initial byte-offset for memory allocations. Must be multiple of 4\n"); + printf(" CU_MASK - CU mask for streams. Can specify ranges e.g '5,10-12,14'\n"); + printf(" FILL_COMPRESS - Percentages of 64B lines to be filled by random/1B0/2B0/4B0/32B0\n"); + printf(" FILL_PATTERN - Big-endian pattern for source data, specified in hex digits. Must be even # of digits\n"); + printf(" GFX_BLOCK_ORDER - How blocks for transfers are ordered. 0=sequential, 1=interleaved\n"); + printf(" GFX_BLOCK_SIZE - # of threads per threadblock (Must be multiple of 64)\n"); + printf(" GFX_KERNEL - -1=auto, 0=force GpuReduceKernel, 1=force GpuCopyKernel (may error if ineligible)\n"); + printf(" GFX_SE_TYPE - SubExecutor granularity type (0=threadblock, 1=warp)\n"); + printf(" GFX_TEMPORAL - Use of non-temporal loads or stores (0=none 1=loads 2=stores 3=both)\n"); + printf(" GFX_UNROLL - Unroll factor for GFX kernel\n"); + printf(" GFX_SINGLE_TEAM - Have subexecutors work together on full array instead of working on disjoint subarrays\n"); + printf(" GFX_WAVE_ORDER - Stride pattern for GFX kernel (0=UWC,1=UCW,2=WUC,3=WCU,4=CUW,5=CWU)\n"); + printf(" GFX_WORD_SIZE - GFX kernel packed data size (4=DWORDx4, 2=DWORDx2, 1=DWORDx1)\n"); + printf(" HIDE_ENV - Hide environment variable value listing\n"); #if NIC_EXEC_ENABLED - printf(" IB_GID_INDEX - Required for RoCE NICs (default=-1/auto)\n"); - printf(" IB_PORT_NUMBER - RDMA port count for RDMA NIC (default=1)\n"); - printf(" IP_ADDRESS_FAMILY - IP address family (4=v4, 6=v6, default=v4)\n"); + printf(" IB_GID_INDEX - Required for RoCE NICs (default=-1/auto)\n"); + printf(" IB_PORT_NUMBER - RDMA port count for RDMA NIC (default=1)\n"); + printf(" IP_ADDRESS_FAMILY - IP address family (4=v4, 6=v6, default=v4)\n"); #endif - printf(" MIN_VAR_SUBEXEC - Minumum # of subexecutors to use for variable subExec Transfers\n"); - printf(" MAX_VAR_SUBEXEC - Maximum # of subexecutors to use for variable subExec Transfers (0 for device limits)\n"); + printf(" MIN_VAR_SUBEXEC - Minimum # of subexecutors to use for variable subExec Transfers\n"); + printf(" MAX_VAR_SUBEXEC - Maximum # of subexecutors to use for variable subExec Transfers (0 for device limits)\n"); #if NIC_EXEC_ENABLED - printf(" NIC_CHUNK_BYTES - Number of bytes to send at a time using NIC (default = 1GB)\n"); - printf(" NIC_RELAX_ORDER - Set to non-zero to use relaxed ordering"); + printf(" NIC_CHUNK_BYTES - Number of bytes to send at a time using NIC (default = 1GB)\n"); + printf(" 
NIC_CQ_POLL_BATCH - Number of CQ entries to poll per ibv_poll_cq call (default = 4)\n"); + printf(" NIC_RELAX_ORDER - Set to non-zero to use relaxed ordering\n"); #endif - printf(" NUM_ITERATIONS - # of timed iterations per test. If negative, run for this many seconds instead\n"); - printf(" NUM_SUBITERATIONS - # of sub-iterations to run per iteration. Must be non-negative\n"); - printf(" NUM_WARMUPS - # of untimed warmup iterations per test\n"); - printf(" OUTPUT_TO_CSV - Outputs to CSV format if set\n"); + printf(" NUM_ITERATIONS - # of timed iterations per test. If negative, run for this many seconds instead\n"); + printf(" NUM_SUBITERATIONS - # of sub-iterations to run per iteration. Must be non-negative\n"); + printf(" NUM_WARMUPS - # of untimed warmup iterations per test\n"); + printf(" OUTPUT_TO_CSV - Outputs to CSV format if set\n"); #if NIC_EXEC_ENABLED - printf(" ROCE_VERSION - RoCE version (default=2)\n"); + printf(" ROCE_VERSION - RoCE version (default=2)\n"); #endif - printf(" SAMPLING_FACTOR - Add this many samples (when possible) between powers of 2 when auto-generating data sizes\n"); - printf(" SHOW_BORDERS - Show ASCII box-drawing characaters in tables\n"); - printf(" SHOW_ITERATIONS - Show per-iteration timing info\n"); - printf(" USE_HIP_EVENTS - Use HIP events for GFX executor timing\n"); - printf(" USE_HSA_DMA - Use hsa_amd_async_copy instead of hipMemcpy for non-targeted DMA execution\n"); - printf(" USE_INTERACTIVE - Pause for user-input before starting transfer loop\n"); - printf(" USE_SINGLE_STREAM - Use a single stream per GPU GFX executor instead of stream per Transfer\n"); - printf(" VALIDATE_DIRECT - Validate GPU destination memory directly instead of staging GPU memory on host\n"); - printf(" VALIDATE_SOURCE - Validate GPU src memory immediately after preparation\n"); + printf(" SAMPLING_FACTOR - Add this many samples (when possible) between powers of 2 when auto-generating data sizes\n"); + printf(" SHOW_BORDERS - Show ASCII box-drawing characters in tables\n"); + printf(" SHOW_ITERATIONS - Show per-iteration timing info\n"); + printf(" SHOW_PERCENTILES - Comma-separated percentiles iteration duration\n"); + printf(" USE_HIP_EVENTS - Use HIP events for GFX executor timing\n"); + printf(" USE_HSA_DMA - Use hsa_amd_async_copy instead of hipMemcpy for non-targeted DMA execution\n"); + printf(" USE_INTERACTIVE - Pause for user-input before starting transfer loop\n"); + printf(" USE_SINGLE_STREAM - Use a single stream per GPU GFX executor instead of stream per Transfer\n"); + printf(" VALIDATE_DIRECT - Validate GPU destination memory directly instead of staging GPU memory on host\n"); + printf(" VALIDATE_SOURCE - Validate GPU src memory immediately after preparation\n"); + printf("\n"); + printf("Environment variables (back-end):\n"); + printf("====================================\n"); + printf(" TB_RANK - Rank for socket communicator (0-based); defaults to 0 if unset or empty\n"); + printf(" TB_NUM_RANKS - Total ranks for socket mode (>=2); alone on rank 0 starts listener and logs worker env\n"); + printf(" TB_MASTER_ADDR - Rank 0 hostname or IPv4 for workers; optional on rank 0 (auto-detected if unset)\n"); + printf(" TB_MASTER_IFACE - When TB_MASTER_ADDR unset on rank 0, optional interface for IPv4 detection (e.g. 
eth0)\n"); + printf(" TB_MASTER_PORT - Used to set Rank 0 port for socket communicator (default: 29500)\n"); + printf(" TB_SINGLE_LOG - In socket mode, only rank 0 logs when set\n"); + printf(" TB_VERBOSE - Enables additional internal logging\n"); + printf(" TB_DUMP_CFG_FILE - Writes executed transfers to a config file\n"); + printf(" TB_DUMP_LINES - Dumps randomized input-line statistics for FILL_COMPRESS setup\n"); + printf(" TB_NIC_FILTER - Regex filter to limit NIC visibility for NIC executors\n"); + printf(" TB_FORCE_SINGLE_POD - Forces all ranks into one pod (skips pod query)\n"); + printf(" TB_WALLCLOCK_RATE - Overrides queried GPU wallclock rate if needed\n"); + printf(" TB_PAUSE - Pauses startup for debugger attachment\n"); } - void Print(std::string const& name, int32_t const value, const char* format, ...) const { printf("%-20s%s%12d%s", name.c_str(), outputToCsv ? "," : " = ", value, outputToCsv ? "," : " : "); @@ -412,6 +451,10 @@ class EnvVars "Thread block ordering: %s", gfxBlockOrder == 0 ? "Sequential" : "Interleaved"); Print("GFX_BLOCK_SIZE", gfxBlockSize, "Threadblock size of %d", gfxBlockSize); + Print("GFX_KERNEL", gfxKernel, + "%s", gfxKernel == -1 ? "auto" : + gfxKernel == 0 ? "force GpuReduceKernel" : + gfxKernel == 1 ? "force GpuCopyKernel" : "unknown"); Print("GFX_SE_TYPE", gfxSeType, "SubExecutor granularity: %s", gfxSeType == 0 ? "Threadblock" : "Warp"); Print("GFX_SINGLE_TEAM", gfxSingleTeam, @@ -452,6 +495,8 @@ class EnvVars #if NIC_EXEC_ENABLED Print("NIC_CHUNK_BYTES", nicChunkBytes, "Sending %lu bytes at a time for NIC RDMA", nicChunkBytes); + Print("NIC_CQ_POLL_BATCH", nicCqPollBatch, + "Polling %d CQ entries per ibv_poll_cq call", nicCqPollBatch); Print("NIC_RELAX_ORDER", nicRelaxedOrder, "Using %s ordering for NIC RDMA", nicRelaxedOrder ? "relaxed" : "strict"); #endif @@ -469,6 +514,8 @@ class EnvVars Print("SHOW_BORDERS", showBorders, "%s ASCII box-drawing characaters in tables", showBorders ? "Showing" : "Hiding"); Print("SHOW_ITERATIONS", showIterations, "%s per-iteration timing", showIterations ? "Showing" : "Hiding"); + Print("SHOW_PERCENTILES", showPercentiles.empty() ? 0 : 1, "%s", + showPercentiles.empty() ? "Disabled" : GetStr(showPercentiles).c_str()); Print("USE_HIP_EVENTS", useHipEvents, "Using %s for GFX/DMA Executor timing", useHipEvents ? "HIP events" : "CPU wall time"); Print("USE_HSA_DMA", useHsaDma, @@ -500,6 +547,9 @@ class EnvVars { char const* varStr = getenv(varname.c_str()); if (varStr) { + if (varStr[0] == '\0') { + return defaultValue; + } int val = atoi(varStr); char units = varStr[strlen(varStr)-1]; switch (units) { @@ -514,10 +564,11 @@ class EnvVars static std::vector GetEnvVarArray(std::string const& varname, std::vector const& defaultValue) { - if (getenv(varname.c_str())) { + char const* raw = getenv(varname.c_str()); + if (raw) { std::vector values; - char* arrayStr = getenv(varname.c_str()); - char* token = strtok(arrayStr, ","); + std::string copy(raw); + char* token = copy.empty() ? NULL : strtok(©[0], ","); while (token) { int val; if (sscanf(token, "%d", &val) == 1) { @@ -533,12 +584,29 @@ class EnvVars return defaultValue; } + static std::vector GetEnvVarStrArray(std::string const& varname, std::vector const& defaultValue) + { + char const* raw = getenv(varname.c_str()); + if (raw) { + std::vector values; + std::string copy(raw); + char* token = copy.empty() ? 
+      while (token) {
+        values.push_back(token);
+        token = strtok(NULL, ",");
+      }
+      return values;
+    }
+    return defaultValue;
+  }
+
   static std::vector<int> GetEnvVarRangeArray(std::string const& varname, std::vector<int> const& defaultValue)
   {
-    if (getenv(varname.c_str())) {
-      char* rangeStr = getenv(varname.c_str());
+    char const* raw = getenv(varname.c_str());
+    if (raw) {
+      std::string copy(raw);
       std::set<int> values;
-      char* token = strtok(rangeStr, ",");
+      char* token = copy.empty() ? NULL : strtok(&copy[0], ",");
       while (token) {
         int start, end;
         if (sscanf(token, "%d-%d", &start, &end) == 2) {
@@ -567,13 +635,22 @@ class EnvVars
   std::string GetStr(std::vector<int> const& varnameList) const
   {
     std::string result = "";
-    for (int i = 0; i < varnameList.size(); i++) {
+    for (size_t i = 0; i < varnameList.size(); i++) {
       if (i) result += ",";
       result += std::to_string(varnameList[i]);
     }
     return result;
   }
 
+  std::string GetStr(std::vector<std::string> const& varnameList) const {
+    std::string result = "";
+    for (size_t i = 0; i < varnameList.size(); i++) {
+      if (i) result += ",";
+      result += varnameList[i];
+    }
+    return result;
+  }
+
   std::string GetCuMaskDesc() const
   {
     std::vector<std::pair<int, int>> runs;
@@ -616,7 +693,7 @@ class EnvVars
     cfg.general.numIterations      = numIterations;
     cfg.general.numSubIterations   = numSubIterations;
     cfg.general.numWarmups         = numWarmups;
-    cfg.general.recordPerIteration = showIterations;
+    cfg.general.recordPerIteration = ((showIterations != 0) || !showPercentiles.empty()) ? 1 : 0;
     cfg.general.useInteractive     = useInteractive;
 
     cfg.data.alwaysValidate = alwaysValidate;
@@ -633,6 +710,7 @@ class EnvVars
     cfg.gfx.blockOrder   = gfxBlockOrder;
     cfg.gfx.blockSize    = gfxBlockSize;
     cfg.gfx.cuMask       = cuMask;
+    cfg.gfx.gfxKernel    = gfxKernel;
     cfg.gfx.prefXccTable = prefXccTable;
     cfg.gfx.seType       = gfxSeType;
     cfg.gfx.unrollFactor = gfxUnroll;
@@ -644,6 +722,7 @@ class EnvVars
     cfg.gfx.wordSize = gfxWordSize;
 
     cfg.nic.chunkBytes      = nicChunkBytes;
+    cfg.nic.cqPollBatch     = nicCqPollBatch;
     cfg.nic.ibGidIndex      = ibGidIndex;
     cfg.nic.ibPort          = ibPort;
     cfg.nic.ipAddressFamily = ipAddressFamily;
diff --git a/src/client/Presets/AllToAll.hpp b/src/client/Presets/AllToAll.hpp
index 2beae8af..cfea85c3 100644
--- a/src/client/Presets/AllToAll.hpp
+++ b/src/client/Presets/AllToAll.hpp
@@ -22,9 +22,10 @@ THE SOFTWARE.
#include -int AllToAllPreset(EnvVars& ev, - size_t const numBytesPerTransfer, - std::string const presetName) +int AllToAllPreset(EnvVars& ev, + size_t const numBytesPerTransfer, + std::string const presetName, + bool const bytesSpecified) { enum { @@ -54,7 +55,6 @@ int AllToAllPreset(EnvVars& ev, int numSubExecs = EnvVars::GetEnvVar("NUM_SUB_EXEC" , 8); int showDetails = EnvVars::GetEnvVar("SHOW_DETAILS" , 0); int useDmaExec = EnvVars::GetEnvVar("USE_DMA_EXEC" , 0); - int useFineGrain = EnvVars::GetEnvVar("USE_FINE_GRAIN" , -999); // Deprecated int useRemoteRead = EnvVars::GetEnvVar("USE_REMOTE_READ", 0); // Check that all ranks have at least the number of GPUs requested @@ -64,7 +64,7 @@ int AllToAllPreset(EnvVars& ev, for (int rank = 0; rank < numRanks; rank++) { if (numGpus > TransferBench::GetNumExecutors(EXE_GPU_GFX, rank)) { Utils::Print("[ERROR] All-to-All preset requires each rank to have the same number of GPUs\n"); - return 1; + return ERR_FATAL; } if (numQueuePairs > 0 && numNics != TransferBench::GetNumExecutors(EXE_NIC, rank)) nicDifference = true; @@ -81,17 +81,12 @@ int AllToAllPreset(EnvVars& ev, a2aMode = EnvVars::GetEnvVar("A2A_MODE", 0); if (a2aMode < 0 || a2aMode > 2) { Utils::Print("[ERROR] a2aMode must be between 0 and 2, or else numSrcs:numDsts\n"); - return 1; + return ERR_FATAL; } numSrcs = (a2aMode == A2A_WRITE_ONLY ? 0 : 1); numDsts = (a2aMode == A2A_READ_ONLY ? 0 : 1); } - // Deprecated env var check - if (useFineGrain != -999) { - memTypeIdx = useFineGrain ? 2 : 0; - } - MemType memType = Utils::GetGpuMemType(memTypeIdx); std::string devMemTypeStr = Utils::GetGpuMemTypeStr(memTypeIdx); @@ -120,15 +115,15 @@ int AllToAllPreset(EnvVars& ev, // Validate env vars if (numGpus < 0 || numGpus > numDetectedGpus) { Utils::Print("[ERROR] Cannot use %d GPUs. Detected %d GPUs\n", numGpus, numDetectedGpus); - return 1; + return ERR_FATAL; } if (useDmaExec && (numSrcs != 1 || numDsts != 1)) { Utils::Print("[ERROR] DMA execution can only be used for copies (A2A_MODE=0)\n"); - return 1; + return ERR_FATAL; } if (numResults * 2 > numRanks) { Utils::Print("[ERROR] Number of extrema results requested exceeds number of ranks. 
NUM_RESULTS should be at most half the number of ranks\n"); - return 1; + return ERR_FATAL; } // Collect the number of GPU devices to use @@ -201,14 +196,14 @@ int AllToAllPreset(EnvVars& ev, if (!TransferBench::RunTransfers(cfg, transfers, results)) { for (auto const& err : results.errResults) Utils::Print("%s\n", err.errMsg.c_str()); - return 1; + return ERR_FATAL; } else if (showDetails) { Utils::PrintResults(ev, 1, transfers, results); Utils::Print("\n"); } // Only ranks that actually do output will compile results - if (!Utils::RankDoesOutput()) return 0; + if (!Utils::RankDoesOutput()) return ERR_NONE; // Prepare table of results int numRows = 2 + (numGpus + 1) * (1 + 2*numResults); @@ -491,10 +486,5 @@ int AllToAllPreset(EnvVars& ev, printf("[WARN] It is recommended to run TransferBench with one rank per host to avoid potential aliasing of executors\n"); } - if (useFineGrain != -999) { - Utils::Print("[WARN] USE_FINE_GRAIN has been deprecated and replaced by MEM_TYPE\n"); - Utils::Print("[WARN] MEM_TYPE has been set to %d to correspond to previous use of USE_FINE_GRAIN=%d\n", memTypeIdx, useFineGrain); - } - - return 0; + return ERR_NONE; } diff --git a/src/client/Presets/AllToAllN.hpp b/src/client/Presets/AllToAllN.hpp index 7dac6b22..15698917 100644 --- a/src/client/Presets/AllToAllN.hpp +++ b/src/client/Presets/AllToAllN.hpp @@ -23,13 +23,14 @@ THE SOFTWARE. #include #include "EnvVars.hpp" -int AllToAllRdmaPreset(EnvVars& ev, - size_t const numBytesPerTransfer, - std::string const presetName) +int AllToAllRdmaPreset(EnvVars& ev, + size_t const numBytesPerTransfer, + std::string const presetName, + bool const bytesSpecified) { if (TransferBench::GetNumRanks() > 1) { Utils::Print("[ERROR]a2an preset currently not supported for multi-node\n"); - return 1; + return ERR_FATAL; } int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX); @@ -38,12 +39,6 @@ int AllToAllRdmaPreset(EnvVars& ev, int numGpus = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus); int numQueuePairs = EnvVars::GetEnvVar("NUM_QUEUE_PAIRS", 1); int memTypeIdx = EnvVars::GetEnvVar("MEM_TYPE" , 2); - int useFineGrain = EnvVars::GetEnvVar("USE_FINE_GRAIN" , -999); // Deprecated - - // Deprecated env var check - if (useFineGrain != -999) { - memTypeIdx = useFineGrain ? 2 : 0; - } MemType memType = Utils::GetGpuMemType(memTypeIdx); std::string memTypeStr = Utils::GetGpuMemTypeStr(memTypeIdx); @@ -63,7 +58,7 @@ int AllToAllRdmaPreset(EnvVars& ev, // Validate env vars if (numGpus < 0 || numGpus > numDetectedGpus) { Utils::Print("[ERROR] Cannot use %d GPUs. Detected %d GPUs\n", numGpus, numDetectedGpus); - return 1; + return ERR_FATAL; } @@ -97,7 +92,7 @@ int AllToAllRdmaPreset(EnvVars& ev, if (!TransferBench::RunTransfers(cfg, transfers, results)) { for (auto const& err : results.errResults) Utils::Print("%s\n", err.errMsg.c_str()); - return 1; + return ERR_FATAL; } else { Utils::PrintResults(ev, 1, transfers, results); } @@ -154,5 +149,5 @@ int AllToAllRdmaPreset(EnvVars& ev, Utils::PrintErrors(results.errResults); - return 0; + return ERR_NONE; } diff --git a/src/client/Presets/AllToAllSweep.hpp b/src/client/Presets/AllToAllSweep.hpp index 9da2fc0d..36e571d9 100644 --- a/src/client/Presets/AllToAllSweep.hpp +++ b/src/client/Presets/AllToAllSweep.hpp @@ -22,13 +22,14 @@ THE SOFTWARE. 
#include "EnvVars.hpp" -int AllToAllSweepPreset(EnvVars& ev, - size_t const numBytesPerTransfer, - std::string const presetName) +int AllToAllSweepPreset(EnvVars& ev, + size_t const numBytesPerTransfer, + std::string const presetName, + bool const bytesSpecified) { if (TransferBench::GetNumRanks() > 1) { Utils::Print("[ERROR] All to All Sweep preset currently not supported for multi-node\n"); - return 1; + return ERR_FATAL; } enum @@ -42,22 +43,24 @@ int AllToAllSweepPreset(EnvVars& ev, // Force single-stream mode for all-to-all benchmark ev.useSingleStream = 1; + // Default to GPU-event timing for a2asweep (overridable via USE_HIP_EVENTS=0 for CPU wall-clock) + ev.useHipEvents = EnvVars::GetEnvVar("USE_HIP_EVENTS", 1); int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX); // Collect env vars for this preset int a2aDirect = EnvVars::GetEnvVar("A2A_DIRECT" , 1); int a2aLocal = EnvVars::GetEnvVar("A2A_LOCAL" , 0); + int memTypeIdx = EnvVars::GetEnvVar("MEM_TYPE" , 2); int numGpus = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus); int showMinOnly = EnvVars::GetEnvVar("SHOW_MIN_ONLY", 1); - int useFineGrain = EnvVars::GetEnvVar("USE_FINE_GRAIN" , 1); int useRemoteRead = EnvVars::GetEnvVar("USE_REMOTE_READ", 0); int useSpray = EnvVars::GetEnvVar("USE_SPRAY", 0); int verbose = EnvVars::GetEnvVar("VERBOSE", 0); - std::vector blockList = EnvVars::GetEnvVarArray("BLOCKSIZES", {256}); + std::vector blockList = EnvVars::GetEnvVarArray("BLOCKSIZES", {256,512,768,1024}); std::vector unrollList = EnvVars::GetEnvVarArray("UNROLLS", {1,2,3,4,6,8}); - std::vector numCusList = EnvVars::GetEnvVarArray("NUM_CUS", {4,8,12,16,24,32}); + std::vector numSesList = EnvVars::GetEnvVarArray("NUM_SUB_EXECS", {4,8,12,16,24,32}); // A2A_MODE may be 0,1,2 or else custom numSrcs:numDsts int numSrcs, numDsts; @@ -74,6 +77,9 @@ int AllToAllSweepPreset(EnvVars& ev, numDsts = (a2aMode == A2A_READ_ONLY ? 0 : 1); } + MemType memType = Utils::GetGpuMemType(memTypeIdx); + std::string devMemTypeStr = Utils::GetGpuMemTypeStr(memTypeIdx); + // Print off environment variables ev.DisplayEnvVars(); if (!ev.hideEnv) { @@ -84,13 +90,13 @@ int AllToAllSweepPreset(EnvVars& ev, (a2aMode == A2A_CUSTOM) ? (std::to_string(numSrcs) + " read(s) " + std::to_string(numDsts) + " write(s)").c_str(): a2aModeStr[a2aMode]); ev.Print("BLOCKSIZES" , blockList.size() , EnvVars::ToStr(blockList).c_str()); - ev.Print("SHOW_MIN_ONLY" , showMinOnly , showMinOnly ? "Showing only slowest GPU results" : "Showing slowest and fastest GPU results"); - ev.Print("NUM_CUS" , numCusList.size(), EnvVars::ToStr(numCusList).c_str()); + ev.Print("MEM_TYPE" , memTypeIdx , "Using %s GPU memory (%s)", devMemTypeStr.c_str(), Utils::GetAllGpuMemTypeStr().c_str()); ev.Print("NUM_GPU_DEVICES", numGpus , "Using %d GPUs", numGpus); + ev.Print("NUM_SUB_EXECS" , numSesList.size(), EnvVars::ToStr(numSesList).c_str()); + ev.Print("SHOW_MIN_ONLY" , showMinOnly , showMinOnly ? "Showing only slowest GPU results" : "Showing slowest and fastest GPU results"); ev.Print("UNROLLS" , unrollList.size(), EnvVars::ToStr(unrollList).c_str()); - ev.Print("USE_FINE_GRAIN" , useFineGrain , "Using %s-grained memory", useFineGrain ? "fine" : "coarse"); ev.Print("USE_REMOTE_READ", useRemoteRead , "Using %s as executor", useRemoteRead ? "DST" : "SRC"); - ev.Print("USE_SPRAY" , useSpray , "%s per CU", useSpray ? "All targets" : "One target"); + ev.Print("USE_SPRAY" , useSpray , "%s per SubExecutor", useSpray ? "All targets" : "One target"); ev.Print("VERBOSE" , verbose , verbose ? 
"Display test results" : "Display summary only"); printf("\n"); } @@ -107,14 +113,13 @@ int AllToAllSweepPreset(EnvVars& ev, } // Collect the number of GPU devices to use - MemType memType = useFineGrain ? MEM_GPU_FINE : MEM_GPU; ExeType exeType = EXE_GPU_GFX; std::vector transfers; int targetCount = 0; if (!useSpray) { - // Each CU will work on just one target + // Each SubExecutor will work on just one target for (int i = 0; i < numGpus; i++) { targetCount = 0; for (int j = 0; j < numGpus; j++) { @@ -144,7 +149,10 @@ int AllToAllSweepPreset(EnvVars& ev, } } } else { - // Each CU will work on all targets + // Each CU will work on all targets. + // NOTE: targetCount ends up reflecting the last GPU's target count. This is correct for + // symmetric topologies (all GPUs have equal peer counts), but may be inaccurate with + // A2A_DIRECT on asymmetric hardware where different GPUs have different hop-1 peer counts. for (int i = 0; i < numGpus; i++) { TransferBench::Transfer transfer; transfer.numBytes = numBytesPerTransfer; @@ -172,70 +180,116 @@ int AllToAllSweepPreset(EnvVars& ev, } } - printf("GPU-GFX All-To-All Sweep benchmark:\n"); - printf("==========================\n"); - printf("- Copying %lu bytes between %s pairs of GPUs\n", numBytesPerTransfer, a2aDirect ? "directly connected" : "all"); + Utils::Print("GPU-GFX All-To-All Sweep benchmark (%lu bytes, local=%s). All values are %s GB/s\n", + numBytesPerTransfer, + a2aLocal ? "yes" : "no", + ev.useHipEvents ? "GPU-Event-Timed (min over GPUs)": "CPU-Timed"); + Utils::Print("=======================================================================================\n"); if (transfers.size() == 0) { - printf("[WARN} No transfers requested. Try adjusting A2A_DIRECT or A2A_LOCAL\n"); + Utils::Print("[WARN] No transfers requested. Try adjusting A2A_DIRECT or A2A_LOCAL\n"); return 0; } // Execute Transfers TransferBench::ConfigOptions cfg = ev.ToConfigOptions(); - // Run tests - std::map, TransferBench::TestResults> results; + char sep = ev.outputToCsv ? ',' : ' '; + + double bestMinBw = 0.0; + int bestBlock = -1, bestUnroll = -1, bestNumSes = -1; + + // Print header once + Utils::Print(" BlkS %c UnR ", sep); + for (int c : numSesList) { + Utils::Print("%c SE %03d", sep, c); + if (ev.useHipEvents && !showMinOnly) { + Utils::Print("%c SE%03dMx", sep, c); + } + } + Utils::Print("\n"); + + // Results keyed by (blockSize, numSes, unroll) for verbose output + std::map, TransferBench::TestResults> results; - // Display summary for (int blockSize : blockList) { - printf("Blocksize: %d\n", blockSize); - ev.gfxBlockSize = cfg.gfx.blockSize = blockSize; + cfg.gfx.blockSize = blockSize; - printf("#CUs\\Unroll"); for (int u : unrollList) { - printf(" %d(Min) ", u); - if (!showMinOnly) printf(" %d(Max) ", u); - } - printf("\n"); - for (int c : numCusList) { - printf(" %5d ", c); fflush(stdout); - for (int u : unrollList) { - ev.gfxUnroll = cfg.gfx.unrollFactor = u; - for (auto& transfer : transfers) + cfg.gfx.unrollFactor = u; + Utils::Print("%5d %c %3d ", blockSize, sep, u); + fflush(stdout); + + for (int c : numSesList) { + for (auto& transfer : transfers) { transfer.numSubExecs = useSpray ? 
(c * targetCount) : c; + } - double minBandwidth = std::numeric_limits::max(); - double maxBandwidth = std::numeric_limits::min(); TransferBench::TestResults result; + double minBw = 0.0, maxBw = 0.0; if (TransferBench::RunTransfers(cfg, transfers, result)) { - for (auto const& exeResult : result.exeResults) { - minBandwidth = std::min(minBandwidth, exeResult.second.avgBandwidthGbPerSec); - maxBandwidth = std::max(maxBandwidth, exeResult.second.avgBandwidthGbPerSec); + if (!ev.useHipEvents) { + minBw = result.avgTotalBandwidthGbPerSec; + if (useSpray) { + minBw *= targetCount; + } + } else { + minBw = std::numeric_limits::max(); + maxBw = std::numeric_limits::lowest(); + for (auto const& exeResult : result.exeResults) { + minBw = std::min(minBw, exeResult.second.avgBandwidthGbPerSec); + maxBw = std::max(maxBw, exeResult.second.avgBandwidthGbPerSec); + } + if (useSpray) { + minBw *= targetCount; + maxBw *= targetCount; + } + } + if (minBw > bestMinBw) { + bestMinBw = minBw; + bestBlock = blockSize; + bestUnroll = u; + bestNumSes = c; } - if (useSpray) { - minBandwidth *= targetCount; - maxBandwidth *= targetCount; + if (verbose) { + results[std::make_tuple(blockSize, c, u)] = result; } - results[std::make_pair(c,u)] = result; - } else { - minBandwidth = 0.0; } - printf(" %7.2f ", minBandwidth); - if (!showMinOnly) printf(" %7.2f ", maxBandwidth); + Utils::Print("%c%8.2f", sep, minBw); + if (ev.useHipEvents && !showMinOnly) { + Utils::Print("%c%8.2f", sep, maxBw); + } fflush(stdout); } - printf("\n"); fflush(stdout); + Utils::Print("\n"); + fflush(stdout); } + } + Utils::Print("=======================================================================================\n"); - if (verbose) { - int testNum = 0; - for (int c : numCusList) { + if (verbose) { + int testNum = 0; + for (int blockSize : blockList) { + for (int c : numSesList) { for (int u : unrollList) { - printf("CUs: %d Unroll %d\n", c, u); - Utils::PrintResults(ev, ++testNum, transfers, results[std::make_pair(c,u)]); + auto verboseTransfers = transfers; + for (auto& t : verboseTransfers) { + t.numSubExecs = useSpray ? (c * targetCount) : c; + } + Utils::Print("BlockSize: %d SubExecs: %d Unroll: %d\n", blockSize, c, u); + Utils::PrintResults(ev, ++testNum, verboseTransfers, results[std::make_tuple(blockSize, c, u)]); } } } } - return 1; + + // Print combination that produced highest bandwidth + if (bestBlock != -1) { + Utils::Print("Highest %s bandwidth found: %7.2f GB/s\n", + ev.useHipEvents ? "GPU-event-timed (min)" : "CPU-timed", bestMinBw); + Utils::Print(" BlockSize : %7d\n", bestBlock); + Utils::Print(" Unroll : %7d\n", bestUnroll); + Utils::Print(" NumSubExec : %7d\n", bestNumSes); + } + + return ERR_NONE; } diff --git a/src/client/Presets/BmaSweep.hpp b/src/client/Presets/BmaSweep.hpp new file mode 100644 index 00000000..7bc9ab77 --- /dev/null +++ b/src/client/Presets/BmaSweep.hpp @@ -0,0 +1,182 @@ +/* +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+int BmaSweepPreset(EnvVars& ev,
+                   size_t const numBytesPerTransfer,
+                   std::string const presetName,
+                   bool const bytesSpecified)
+{
+  if (TransferBench::GetNumRanks() > 1) {
+    Utils::Print("[ERROR] BMA sweep preset currently not supported for multi-node\n");
+    return ERR_FATAL;
+  }
+
+#ifndef BMA_EXEC_ENABLED
+  Utils::Print("[ERROR] BMA executor requires ROCm 7.1 or newer\n");
+  return ERR_FATAL;
+#endif
+
+  int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
+
+  // Collect env vars for this preset
+  int exeIndex           = EnvVars::GetEnvVar("EXE_INDEX"       , 0);
+  int localCopy          = EnvVars::GetEnvVar("LOCAL_COPY"      , 0);
+  vector<int> gfxSesList = EnvVars::GetEnvVarArray("GFX_SUB_EXECS", {});
+  int gpuMemTypeIdx      = EnvVars::GetEnvVar("GPU_MEM_TYPE"    , 0);
+  int numGpuDevices      = EnvVars::GetEnvVar("NUM_GPU_DEVICES" , numDetectedGpus);
+  vector<int> bmaSesList = EnvVars::GetEnvVarArray("NUM_SUB_EXECS", {1,2,4,8});
+
+  MemType gpuMemType = Utils::GetGpuMemType(gpuMemTypeIdx);
+
+  // Display environment variables
+  if (Utils::RankDoesOutput()) {
+    ev.DisplayEnvVars();
+    if (!ev.hideEnv) {
+      int outputToCsv = ev.outputToCsv;
+      if (!outputToCsv) printf("[BMA Sweep Related]\n");
+      ev.Print("EXE_INDEX"      , exeIndex, "Executing on GPU %d", exeIndex);
+      ev.Print("LOCAL_COPY"     , localCopy, "%s local copy to GPU %d", localCopy ? "Including" : "Excluding", exeIndex);
+      ev.Print("GFX_SUB_EXECS"  , gfxSesList.size(), EnvVars::ToStr(gfxSesList).c_str());
+      ev.Print("GPU_MEM_TYPE"   , gpuMemTypeIdx, "Using %s (%s)", Utils::GetGpuMemTypeStr(gpuMemTypeIdx).c_str(), Utils::GetAllGpuMemTypeStr().c_str());
+      ev.Print("NUM_GPU_DEVICES", numGpuDevices, "Using %d GPUs", numGpuDevices);
+      ev.Print("NUM_SUB_EXECS"  , bmaSesList.size(), EnvVars::ToStr(bmaSesList).c_str());
+      printf("\n");
+    }
+  }
+
+  if (exeIndex < 0 || exeIndex >= numGpuDevices) {
+    Utils::Print("[ERROR] EXE_INDEX must be between 0 and %d inclusive\n", numGpuDevices - 1);
+    return ERR_FATAL;
+  }
+
+  int numTransfers  = numGpuDevices - 1 + (localCopy ? 1 : 0);
+  int numBmaSubExec = (int)bmaSesList.size();
+  int numGfxSubExec = (int)gfxSesList.size();
+
+  TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
+  TransferBench::TestResults results;
+
+  // Prepare table of results
+  int minPow2Exp = 12;
+  int maxPow2Exp = 30;
+  int numRows = 1 + (bytesSpecified ? 1 : (maxPow2Exp - minPow2Exp + 1));
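+  // Rows: one header row plus one row per tested size; when no explicit size is given,
+  // the sweep below walks power-of-two sizes from 2^12 (4 KiB) through 2^30 (1 GiB).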
+  int numCols = 2 + numBmaSubExec + numGfxSubExec;
+
+  Utils::TableHelper table(numRows, numCols);
+  Utils::Print("Performing %d simultaneous DMA Transfers from GPU %d to other GPUs\n", numTransfers, exeIndex);
+
+  // Prepare headers
+  table.Set(0, 0, "   Bytes   ");
+  table.Set(0, 1, "   DMA   ");
+  for (int i = 0; i < numBmaSubExec; i++) {
+    table.Set(0, 2+i, " BMA(%02d) ", bmaSesList[i]);
+  }
+  for (int i = 0; i < numGfxSubExec; i++) {
+    table.Set(0, 2+numBmaSubExec+i, " GFX(%02d) ", gfxSesList[i]);
+  }
+
+  table.DrawRowBorder(0);
+  table.DrawRowBorder(1);
+  table.DrawRowBorder(numRows);
+  table.DrawColBorder(0);
+  table.DrawColBorder(1);
+  table.DrawColBorder(2);
+  table.DrawColBorder(2+numBmaSubExec);
+  table.DrawColBorder(numCols);
+
+  if (!ev.outputToCsv) {
+    Utils::Print("Executing: ");
+    fflush(stdout);
+  }
+
+  int currRow = 0;
+  for (size_t numBytes = 1ULL << minPow2Exp; numBytes <= (1ULL << maxPow2Exp); numBytes <<= 1) {
+    if (bytesSpecified) numBytes = numBytesPerTransfer;
+    currRow++;
+    table.Set(currRow, 0, " %lu ", numBytes);
+
+    std::vector<Transfer> transfers(1);
+
+    Transfer& t = transfers[0];
+    t.numBytes = numBytes;
+    t.srcs = {{gpuMemType, exeIndex}};
+    t.dsts.clear();
+    for (int i = 0; i < numGpuDevices; i++) {
+      if (i == exeIndex && localCopy == 0) continue;
+      t.dsts.push_back({gpuMemType, i});
+    }
+
+    // DMA executor first
+    t.exeDevice = {EXE_GPU_DMA, exeIndex};
+    t.numSubExecs = 1;
+
+    if (!TransferBench::RunTransfers(cfg, transfers, results)) {
+      for (auto const& err : results.errResults)
+        Utils::Print("%s\n", err.errMsg.c_str());
+      return ERR_FATAL;
+    }
+
+    table.Set(currRow, 1, " %6.2f ", numTransfers * results.tfrResults[0].avgBandwidthGbPerSec);
+
+    // BMA executor next
+    t.exeDevice = {EXE_GPU_BDMA, exeIndex};
+    for (int i = 0; i < numBmaSubExec; i++) {
+      t.numSubExecs = bmaSesList[i];
+
+      if (!TransferBench::RunTransfers(cfg, transfers, results)) {
+        for (auto const& err : results.errResults)
+          Utils::Print("%s\n", err.errMsg.c_str());
+        return ERR_FATAL;
+      }
+
+      table.Set(currRow, 2+i, " %6.2f ", numTransfers * results.tfrResults[0].avgBandwidthGbPerSec);
+    }
+
+    // GFX executor last
+    t.exeDevice = {EXE_GPU_GFX, exeIndex};
+    for (int i = 0; i < numGfxSubExec; i++) {
+      t.numSubExecs = gfxSesList[i];
+
+      if (!TransferBench::RunTransfers(cfg, transfers, results)) {
+        for (auto const& err : results.errResults)
+          Utils::Print("%s\n", err.errMsg.c_str());
+        return ERR_FATAL;
+      }
+
+      table.Set(currRow, 2+numBmaSubExec+i, " %6.2f ", results.tfrResults[0].avgBandwidthGbPerSec);
+    }
+    if (bytesSpecified) break;
+  }
+
+  if (!ev.outputToCsv) {
+    Utils::Print("\n");
+  }
+  table.PrintTable(ev.outputToCsv, ev.showBorders);
+  Utils::Print("Reported numbers are all GB/s, normalized per Transfer for %d Transfers\n", numTransfers);
+
+  return ERR_NONE;
+}
diff --git a/src/client/Presets/EnvVarsList.hpp b/src/client/Presets/EnvVarsList.hpp
new file mode 100644
index 00000000..90fbcd39
--- /dev/null
+++ b/src/client/Presets/EnvVarsList.hpp
@@ -0,0 +1,31 @@
+/*
+Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +int EnvVarsPreset([[maybe_unused]] EnvVars& ev, + [[maybe_unused]] size_t const numBytesPerTransfer, + [[maybe_unused]] std::string const presetName, + [[maybe_unused]] bool const bytesSpecified) +{ + if (!Utils::RankDoesOutput()) return 0; + EnvVars::DisplayEnvVarsList(); + return 0; +} diff --git a/src/client/Presets/GfxSweep.hpp b/src/client/Presets/GfxSweep.hpp new file mode 100644 index 00000000..fdf4fd8d --- /dev/null +++ b/src/client/Presets/GfxSweep.hpp @@ -0,0 +1,239 @@ +/* +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/
+
+#include "EnvVars.hpp"
+
+int GfxSweepPreset(EnvVars& ev,
+                   size_t const numBytesPerTransfer,
+                   std::string const presetName,
+                   bool const bytesSpecified)
+{
+  enum TimingMode
+  {
+    TimingModeAuto = -1,
+    TimingModeCpu  = 0,
+    TimingModeHip  = 1,
+    TimingModeGpu  = 2
+  };
+
+  // Collect environment variables for this preset
+  vector<int> blockList     = EnvVars::GetEnvVarArray("BLOCKSIZES",     {256,512,768,1024});
+  std::string transferStr   = EnvVars::GetEnvVar(     "GFX_TRANSFER",   "R0G0->R0G0->R0G0");
+  vector<int> kernelList    = EnvVars::GetEnvVarArray("KERNELS",        {0});
+  vector<int> numSesList    = EnvVars::GetEnvVarArray("NUM_SUB_EXECS",  {4,8,16,32,64});
+  int numTransfers          = EnvVars::GetEnvVar(     "NUM_TRANSFERS",  1);
+  vector<int> temporalList  = EnvVars::GetEnvVarArray("TEMPORAL_MODES", {0});
+  int timingMode            = EnvVars::GetEnvVar(     "TIMING_MODE",    TimingModeAuto);
+  vector<int> unrollList    = EnvVars::GetEnvVarArray("UNROLLS",        {1,2,4,8,16});
+  vector<int> waveOrderList = EnvVars::GetEnvVarArray("WAVE_ORDERS",    {0});
+  vector<int> wordSizeList  = EnvVars::GetEnvVarArray("WORDSIZES",      {4});
+
+  // Print off relevant environment variables
+  if (Utils::RankDoesOutput()) {
+    if (!ev.hideEnv) {
+      ev.DisplayEnvVars();
+      if (!ev.outputToCsv)
+        Utils::Print("[GFX Sweep Related]\n");
+      ev.Print("BLOCKSIZES",     blockList.size(),     EnvVars::ToStr(blockList).c_str());
+      ev.Print("GFX_TRANSFER",   transferStr,          "GFX Transfer to sweep (see config file format)");
+      ev.Print("KERNELS",        kernelList.size(),    EnvVars::ToStr(kernelList).c_str());
+      ev.Print("NUM_TRANSFERS",  numTransfers,         "Number of Transfers specified in GFX_TRANSFER");
+      ev.Print("NUM_SUB_EXECS",  numSesList.size(),    EnvVars::ToStr(numSesList).c_str());
+      ev.Print("TEMPORAL_MODES", temporalList.size(),  EnvVars::ToStr(temporalList).c_str());
+      ev.Print("TIMING_MODE",    timingMode,           "-1=auto, 0=Aggregate CPU, 1=Executor Time, 2=Transfer Time");
+      ev.Print("UNROLLS",        unrollList.size(),    EnvVars::ToStr(unrollList).c_str());
+      ev.Print("WAVE_ORDERS",    waveOrderList.size(), EnvVars::ToStr(waveOrderList).c_str());
+      ev.Print("WORDSIZES",      wordSizeList.size(),  EnvVars::ToStr(wordSizeList).c_str());
+      Utils::Print("\n");
+    }
+  }
+
+  if (timingMode < TimingModeAuto || timingMode > TimingModeGpu) {
+    Utils::Print("[ERROR] TIMING_MODE value is invalid (%d)\n", timingMode);
+    return ERR_FATAL;
+  }
+
+  if (numSesList.empty()) {
+    Utils::Print("[ERROR] NUM_SUB_EXECS should not be empty\n");
+    return ERR_FATAL;
+  }
+
+  std::vector<Transfer> transfers;
+  Utils::CheckForError(ParseTransfers(std::to_string(numTransfers) + " 1 " + transferStr, transfers));
+  if (transfers.size() == 0) {
+    Utils::Print("[WARN] No valid Transfers found in GFX_TRANSFER\n");
+    return 0;
+  }
+
+  // Automatically pick timing method
+  if (timingMode == TimingModeAuto) {
+    // Use Transfer timing if there is only one Transfer
+    if (transfers.size() == 1) timingMode = TimingModeGpu;
+    // Use Executor timing if there is only one executor
+    else {
+      bool singleExecutor = true;
+      for (size_t i = 1; i < transfers.size(); i++) {
+        if (transfers[i].exeDevice < transfers[0].exeDevice ||
+            transfers[0].exeDevice < transfers[i].exeDevice ||
+            transfers[i].exeSubIndex != transfers[0].exeSubIndex ||
+            transfers[i].exeSubSlot  != transfers[0].exeSubSlot) {
+          singleExecutor = false;
+          break;
+        }
+      }
+      timingMode = singleExecutor ? TimingModeHip : TimingModeCpu;
+    }
+  }
+  if (timingMode < 0 || timingMode > 2) {
+    Utils::Print("[ERROR] Invalid timing mode %d\n", timingMode);
+    return ERR_FATAL;
+  }
+
+  // Print out the Transfers being run
+  Utils::Print("GFX sweep: (%lu bytes per Transfer). All values are %s-timed GB/s\n", numBytesPerTransfer,
+               timingMode == TimingModeCpu ? "Aggregate-CPU" :
+               timingMode == TimingModeHip ? "HIP-event" :
+                                             "GPU wallclock");
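+  // The banner reflects the resolved timing mode: with TIMING_MODE=-1 this is per-Transfer
+  // GPU timing for a single Transfer, HIP-event executor timing when every Transfer shares
+  // one executor, and aggregate CPU timing otherwise.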
+  Utils::Print("=======================================================================================\n");
+
+  bool isMultiNode = GetNumRanks() > 1;
+  for (size_t i = 0; i < transfers.size(); i++) {
+    Transfer& t = transfers[i];
+    Utils::Print("Transfer %5lu: (%s->", i, Utils::MemDevicesToStr(t.srcs).c_str());
+    if (isMultiNode) Utils::Print("R%d", t.exeDevice.exeRank);
+    Utils::Print("%c%d", ExeTypeStr[t.exeDevice.exeType], t.exeDevice.exeIndex);
+    if (t.exeDevice.exeSlot) Utils::Print("%c", 'A' + t.exeDevice.exeSlot);
+    if (t.exeSubIndex != -1) Utils::Print(".%d", t.exeSubIndex);
+    if (t.exeSubSlot != 0) Utils::Print("%c", 'A' + t.exeSubSlot);
+    Utils::Print("->%s)\n", Utils::MemDevicesToStr(t.dsts).c_str());
+
+    if (t.exeDevice.exeType != EXE_GPU_GFX) {
+      Utils::Print("[ERROR] gfxsweep preset only works on Transfers that are using GFX executor\n");
+      return ERR_FATAL;
+    }
+    t.numBytes = numBytesPerTransfer;
+  }
+
+  Utils::Print("=======================================================================================\n");
+
+  ConfigOptions cfg = ev.ToConfigOptions();
+
+  // Print header
+  char sep = ev.outputToCsv ? ',' : ' ';
+  Utils::Print(" WvO %c WSz %c TpM %c BlkS %c UnR %c KrN ", sep, sep, sep, sep, sep);
+  for (int numSubExec : numSesList)
+    Utils::Print("%c SE %03d", sep, numSubExec);
+  Utils::Print("\n");
+
+  int bestSe = -1;
+  double overallBestBw = 0;
+  vector<double> bestBw(numSesList.size(), 0.0);
+  vector<vector<int>> best(numSesList.size(), vector<int>(7));
+
+  // Loop over all combinations
+  for (int waveOrder    : waveOrderList) { cfg.gfx.waveOrder    = waveOrder;
+  for (int wordSize     : wordSizeList)  { cfg.gfx.wordSize     = wordSize;
+  for (int temporalMode : temporalList)  { cfg.gfx.temporalMode = temporalMode;
+  for (int blockSize    : blockList)     { cfg.gfx.blockSize    = blockSize;
+  for (int unroll       : unrollList)    { cfg.gfx.unrollFactor = unroll;
+  for (int kernelIdx    : kernelList)    { cfg.gfx.gfxKernel    = kernelIdx;
+    Utils::Print(" %1d  %c %1d  %c %1d  %c %4d %c %2d %c %1d  ",
+                 waveOrder, sep, wordSize, sep, temporalMode, sep,
+                 blockSize, sep, unroll, sep, kernelIdx, sep);
+
+    for (size_t s = 0; s < numSesList.size(); s++) {
+      int numSubExec = numSesList[s];
+      for (Transfer& t : transfers) t.numSubExecs = numSubExec;
+
+      TestResults result;
+      if (RunTransfers(cfg, transfers, result)) {
+        double bw = 0.0;
+        switch (timingMode) {
+        case 0: bw = result.avgTotalBandwidthGbPerSec; break;
+        case 1:
+          for (auto const& e : result.exeResults) {
+            bw = std::max(bw, e.second.avgBandwidthGbPerSec);
+          }
+          break;
+        case 2: default:
+          for (auto const& t : result.tfrResults) {
+            bw = std::max(bw, t.avgBandwidthGbPerSec);
+          }
+          break;
+        }
+
+        if (bw > bestBw[s]) {
+          bestBw[s] = bw;
+          best[s] = {waveOrder, wordSize, temporalMode, blockSize, unroll, kernelIdx, numSubExec};
+          if (bw > overallBestBw) {
+            overallBestBw = bw;
+            bestSe = (int)s;
+          }
+        }
+        Utils::Print("%c%8.2f", sep, bw);
+        fflush(stdout);
+      } else {
+        Utils::Print("\n");
+        Utils::PrintErrors(result.errResults);
+        return ERR_FATAL;
+      }
+    }
+    Utils::Print("\n");
+    fflush(stdout);
+  }
+  }
+  }
+  }
+  }
+  }
+
+  Utils::Print(" WvO %c WSz %c TpM %c BlkS %c UnR %c KrN ", sep, sep, sep, sep, sep);
+  for (size_t s = 0; s < numSesList.size(); s++) {
+    Utils::Print("%c%8.2f", sep, bestBw[s]);
+  }
+  Utils::Print("\n");
+
+  if (bestSe == -1) {
+    Utils::Print("[ERROR] No transfers executed - make sure sweep parameter lists are 
not empty\n"); + return ERR_FATAL; + } + + // Print combination that produced highest bandwidth + Utils::Print("=======================================================================================\n"); + Utils::Print("Highest bandwidth found: %7.2f GB/s (%s-timed)\n", overallBestBw, + timingMode == TimingModeCpu ? "Aggregate-CPU" : + timingMode == TimingModeHip ? "HIP-event" : + "GPU wallclock"); + Utils::Print(" WaveOrder : %7d [GFX_WAVE_ORDER=%d]\n", best[bestSe][0], best[bestSe][0]); + Utils::Print(" WordSize : %7d [GFX_WORD_SIZE=%d]\n", best[bestSe][1], best[bestSe][1]); + Utils::Print(" Temporal Mode: %7d [GFX_TEMPORAL=%d]\n", best[bestSe][2], best[bestSe][2]); + Utils::Print(" BlockSize : %7d [GFX_BLOCK_SIZE=%d]\n", best[bestSe][3], best[bestSe][3]); + Utils::Print(" Unroll : %7d [GFX_UNROLL=%d]\n", best[bestSe][4], best[bestSe][4]); + Utils::Print(" Kernel : %7d [GFX_KERNEL=%d]\n" , best[bestSe][5], best[bestSe][5]); + Utils::Print(" NumSubExec : %7d\n", best[bestSe][6]); + Utils::Print("Command to run best result:\n"); + Utils::Print("GFX_WAVE_ORDER=%d GFX_WORD_SIZE=%d GFX_TEMPORAL=%d GFX_BLOCK_SIZE=%d " + "GFX_UNROLL=%d GFX_KERNEL=%d ./TransferBench cmdline %lu \"%d %d %s\"\n", + best[bestSe][0], best[bestSe][1], best[bestSe][2], best[bestSe][3], + best[bestSe][4], best[bestSe][5], numBytesPerTransfer, numTransfers, best[bestSe][6], transferStr.c_str()); + return ERR_NONE; +} diff --git a/src/client/Presets/HbmBandwidth.hpp b/src/client/Presets/HbmBandwidth.hpp new file mode 100644 index 00000000..f8d60aca --- /dev/null +++ b/src/client/Presets/HbmBandwidth.hpp @@ -0,0 +1,619 @@ +/* +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/
+
+#pragma once
+
+#include "EnvVars.hpp"
+#include "Utilities.hpp"
+
+#include <algorithm>
+#include <chrono>
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <limits>
+#include <string>
+#include <vector>
+
+using namespace TransferBench;
+
+// CUDA translation
+#if defined(__NVCC__)
+#define hipEvent_t           cudaEvent_t
+#define hipEventCreate       cudaEventCreate
+#define hipEventDestroy      cudaEventDestroy
+#define hipEventElapsedTime  cudaEventElapsedTime
+#define hipEventRecord       cudaEventRecord
+#define hipSetDevice         cudaSetDevice
+#define hipStream_t          cudaStream_t
+#define hipStreamCreate      cudaStreamCreate
+#define hipStreamDestroy     cudaStreamDestroy
+#define hipStreamSynchronize cudaStreamSynchronize
+#endif
+
+// Load a value (optionally via a non-temporal load)
+template <bool USE_NT, typename T>
+__device__ __forceinline__ T Load(const T& ref)
+{
+#if !defined(__NVCC__)
+  if (USE_NT) return __builtin_nontemporal_load(&ref);
+#endif
+  return ref;
+}
+
+// Main kernel for HBM bandwidth testing
+template <typename T, int UNROLL, int LAUNCH_BOUND, bool USE_NT>
+__global__ __launch_bounds__(LAUNCH_BOUND)
+void HbmReadBwKernel(const void* __restrict  pSrcBuffer,
+                     void*       __restrict  dummy,
+                     const size_t            numSteps,
+                     long long*  __restrict  minStartCycle,
+                     long long*  __restrict  maxStopCycle)
+{
+  int64_t startTime;
+  if (threadIdx.x == 0) {
+    startTime = GetTimestamp();
+  }
+
+  // Cast src/dst buffers to the correct type
+  T const* __restrict srcBuffer = reinterpret_cast<T const*>(pSrcBuffer);
+  T*       __restrict dstBuffer = reinterpret_cast<T*>(dummy);
+  T v{};
+
+  // Determine the total number of elements this threadblock handles
+  size_t elemPerThreadblock = numSteps * blockDim.x * UNROLL;
+
+  // Determine the initial offset for this threadblock
+  size_t srcOffset = blockIdx.x * elemPerThreadblock + threadIdx.x;
+
+  #pragma unroll 1
+  for (size_t step = 0; step < numSteps; step++) {
+    #pragma unroll
+    for (uint32_t i = 0; i < UNROLL; i++) {
+      v |= Load<USE_NT>(srcBuffer[srcOffset]);
+      srcOffset += blockDim.x;
+    }
+  }
+
+  // This statement is never true, but is required to make sure compiler
+  // doesn't optimize away the reads
+  if (elemPerThreadblock == 0)
+    *dstBuffer = v;
+
+  // Update min/max start times
+  __syncthreads();
+  if (threadIdx.x == 0 && minStartCycle != nullptr) {
+    int64_t stopTime = GetTimestamp();
+    atomicMin(minStartCycle, startTime);
+    atomicMax(maxStopCycle, stopTime);
+  }
+}
+
+// Build up function pointer table
+typedef void (*HbmReadBwKernelFuncPtr)(const void*, void*, size_t, long long*, long long*);
+
+#define HBM_KERNEL_TEMPORAL_DECL(LAUNCH_BOUND, UNROLL, DTYPE)  \
+  {HbmReadBwKernel<DTYPE, UNROLL, LAUNCH_BOUND, false>,        \
+   HbmReadBwKernel<DTYPE, UNROLL, LAUNCH_BOUND, true>}
+
+#define HBM_KERNEL_DTYPE_DECL(LAUNCH_BOUND, UNROLL)            \
+  {HBM_KERNEL_TEMPORAL_DECL(LAUNCH_BOUND, UNROLL, uint32_t),   \
+   HBM_KERNEL_TEMPORAL_DECL(LAUNCH_BOUND, UNROLL, uint64_t),   \
+   HBM_KERNEL_TEMPORAL_DECL(LAUNCH_BOUND, UNROLL, __uint128_t)}
+
+#define HBM_KERNEL_UNROLL_DECL(LAUNCH_BOUND)                   \
+  {HBM_KERNEL_DTYPE_DECL(LAUNCH_BOUND, 1),                     \
+   HBM_KERNEL_DTYPE_DECL(LAUNCH_BOUND, 2),                     \
+   HBM_KERNEL_DTYPE_DECL(LAUNCH_BOUND, 4),                     \
+   HBM_KERNEL_DTYPE_DECL(LAUNCH_BOUND, 8),                     \
+   HBM_KERNEL_DTYPE_DECL(LAUNCH_BOUND, 16)}
+
+HbmReadBwKernelFuncPtr HbmReadKernelTable[4][5][3][2] =
+{
+  HBM_KERNEL_UNROLL_DECL(256),
+  HBM_KERNEL_UNROLL_DECL(512),
+  HBM_KERNEL_UNROLL_DECL(768),
+  HBM_KERNEL_UNROLL_DECL(1024)
+};
+
+// Kernel to fill buffer with random data
+__global__ void FillPsuedoRandomData(size_t N, uint32_t* p, uint32_t shift)
+{
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < N; idx += blockDim.x * gridDim.x) {
+    uint32_t d   = static_cast<uint32_t>(idx + shift);
+    uint32_t val = 2166136261u;
+    #pragma unroll
+    for (int i = 0; i < 4; i++) {
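+      // FNV-1a-style hash of the index bytes (offset basis 2166136261, prime 16777619)
+      // gives cheap, well-distributed pseudo-random fill data without any RNG state.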
+      val ^= d & 0xff;
+      val *= 16777619u;
+      d >>= 8;
+    }
+    p[idx] = val;
+  }
+}
+
+struct HbmBwResult
+{
+  int rank;
+  int gpuIdx;
+  int numSubExec;
+  int blockSize;
+  int unroll;
+  int elemByte;
+
+  double bw[3]; // MAX | AVG | MIN
+};
+
+int HbmBandwidthPreset(EnvVars& ev,
+                       size_t const numBytesPerTransfer,
+                       std::string const presetName,
+                       bool const bytesSpecified)
+{
+  // If bytes aren't specified, default to 1GB
+  size_t numBytesAtLeast = (bytesSpecified ? numBytesPerTransfer : 1024 * 1024 * 1024);
+
+  // Determine rank information
+  int numRanks = TransferBench::GetNumRanks();
+  int myRank   = TransferBench::GetRank();
+
+  // Make sure each rank has at least one GPU
+  for (int rank = 0; rank < numRanks; rank++) {
+    if (TransferBench::GetNumExecutors(EXE_GPU_GFX, rank) == 0) {
+      Utils::Print("[ERROR] Each rank must have at least one GPU. Rank %d has no GPUs\n", rank);
+      return ERR_FATAL;
+    }
+  }
+  int defSubExec = TransferBench::GetNumSubExecutors({EXE_GPU_GFX, 0});
+
+  // Collect environment variables
+  std::vector<int> blockSizes = EnvVars::GetEnvVarArray("BLOCKSIZES"    , {256, 512});
+  int criteria                = EnvVars::GetEnvVar     ("CRITERIA"      , 0);
+  std::vector<int> elemBytes  = EnvVars::GetEnvVarArray("ELEM_BYTES"    , {16,8});
+  std::vector<int> gpuIndices = EnvVars::GetEnvVarArray("GPU_INDICES"   , {});
+  int memTypeIdx              = EnvVars::GetEnvVar     ("MEM_TYPE"      , 0);
+  int numBuffers              = EnvVars::GetEnvVar     ("NUM_BUFFERS"   , 2);
+  int numIterations           = EnvVars::GetEnvVar     ("NUM_ITERATIONS", 100);
+  std::vector<int> numSesList = EnvVars::GetEnvVarArray("NUM_SUB_EXECS" , {defSubExec});
+  int outputToCsv             = EnvVars::GetEnvVar     ("OUTPUT_TO_CSV" , 0);
+  int prewarmMsec             = EnvVars::GetEnvVar     ("PREWARM_MSEC"  , 50);
+  int showBorders             = EnvVars::GetEnvVar     ("SHOW_BORDERS"  , 1);
+  int showDetails             = EnvVars::GetEnvVar     ("SHOW_DETAILS"  , 0);
+  int showExtra               = EnvVars::GetEnvVar     ("SHOW_EXTRA"    , 0);
+  int temporalMask            = EnvVars::GetEnvVar     ("TEMPORAL_MASK" , 3);
+  std::vector<int> unrolls    = EnvVars::GetEnvVarArray("UNROLLS"       , {16,8,4});
+  int useWallClock            = EnvVars::GetEnvVar     ("USE_WALLCLOCK" , 1);
+
+  // SHOW_DETAILS is not supported in multi-rank runs
+  if (numRanks > 1) showDetails = 0;
+
+  // Non-temporal reads are not supported for CUDA
+#if defined(__NVCC__)
+  temporalMask = 1;
+#endif
+
+  // Check for consistency across ranks
+  IS_UNIFORM(blockSizes,    "BLOCKSIZES");
+  IS_UNIFORM(criteria,      "CRITERIA");
+  IS_UNIFORM(elemBytes,     "ELEM_BYTES");
+  // GPU_INDICES may be different per rank
+  IS_UNIFORM(memTypeIdx,    "MEM_TYPE");
+  IS_UNIFORM(numBuffers,    "NUM_BUFFERS");
+  IS_UNIFORM(numIterations, "NUM_ITERATIONS");
+  IS_UNIFORM(numSesList,    "NUM_SUB_EXECS");
+  IS_UNIFORM(prewarmMsec,   "PREWARM_MSEC");
+  IS_UNIFORM(showDetails,   "SHOW_DETAILS");
+  IS_UNIFORM(showExtra,     "SHOW_EXTRA");
+  IS_UNIFORM(temporalMask,  "TEMPORAL_MASK");
+  IS_UNIFORM(unrolls,       "UNROLLS");
+  IS_UNIFORM(useWallClock,  "USE_WALLCLOCK");
+
+  // Validate environment variables and set defaults
+  if (blockSizes.empty()) {
+    Utils::Print("[ERROR] BLOCKSIZES may not be empty\n");
+    return ERR_FATAL;
+  }
+  for (auto blockSize : blockSizes) {
+    if (blockSize <= 0 || blockSize % 128 != 0 || blockSize > 1024) {
+      Utils::Print("[ERROR] BLOCKSIZES must only contain positive multiples of 128 up to 1024 (not %d)\n", blockSize);
+      return ERR_FATAL;
+    }
+  }
+
+  if (criteria < 0 || criteria > 2) {
+    Utils::Print("[ERROR] CRITERIA must be either 0 (for MAX), 1 (for AVG), or 2 (for MIN) (not %d)\n", criteria);
+    return ERR_FATAL;
+  }
+
+  if (elemBytes.empty()) {
+    Utils::Print("[ERROR] ELEM_BYTES may not be 
empty\n"); + return ERR_FATAL; + } + for (auto elemByte : elemBytes) { + if (elemByte != 4 && elemByte != 8 && elemByte != 16) { + Utils::Print("[ERROR] ELEM_BYTES may only contain {4,8 or 16}\n"); + return ERR_FATAL; + } + } + + int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX); + if (!gpuIndices.empty()) { + for (auto gpuIdx : gpuIndices) { + if (gpuIdx < 0 || gpuIdx >= numDetectedGpus) { + Utils::Print("[ERROR] GPU_INDICES index out of range (%d) (rank %d)\n", gpuIdx, myRank); + return ERR_FATAL; + } + } + } + + if (numBuffers < 1) { + Utils::Print("[ERROR] NUM_BUFFERS must be a positive number (not %d)\n", numBuffers); + return ERR_FATAL; + } + if (numIterations <= 0) { + Utils::Print("[ERROR] NUM_ITERATIONS must be positive (not %d)\n", numIterations); + return ERR_FATAL; + } + if (numBuffers > numIterations) { + Utils::Print("[WARN] NUM_BUFFERS (%d) exceeds NUM_ITERATIONS (%d), so some buffers will not be used\n", + numBuffers, numIterations); + numBuffers = numIterations; + } + + if (numSesList.empty()) { + // By default, use all available sub executors + numSesList.push_back(defSubExec); + } else { + for (auto x : numSesList) { + if (x <= 0 || x > defSubExec) { + Utils::Print("[ERROR] Number of subexecutors must be positive and less than %d\n", defSubExec); + return ERR_FATAL; + } + } + } + + if (prewarmMsec < 0) { + Utils::Print("[ERROR] PREWARM_MSEC must be non-negative (not %d)\n", prewarmMsec); + return ERR_FATAL; + } + + if (temporalMask < 1 || temporalMask > 3) { + Utils::Print("[ERROR] TEMPORAL_MASK must be between 1 to 3 (not %d)\n", temporalMask); + return ERR_FATAL; + } + + if (unrolls.empty()) { + Utils::Print("[ERROR] UNROLLS may not be empty"); + return ERR_FATAL; + } + for (auto unroll : unrolls) { + if (unroll != 1 && unroll != 2 && unroll != 4 && unroll != 8 && unroll != 16) { + Utils::Print("[ERROR] UNROLLS must only contain {1,2,4,8 or 16} (not %d)\n", unroll); + return ERR_FATAL; + } + } + + MemType memType = Utils::GetGpuMemType(memTypeIdx); + std::string devMemTypeStr = Utils::GetGpuMemTypeStr(memTypeIdx); + + if (!ev.hideEnv) + { + if (!ev.outputToCsv) Utils::Print("[HBM Bandwidth Related]\n"); + if (Utils::RankDoesOutput()) { + ev.Print("BLOCKSIZES" , EnvVars::ToStr(blockSizes).c_str(), "Threadblock sizes to sweep over (multiple of 128 up to 1024)"); + ev.Print("CRITERIA" , criteria , "Reporting highest %s bandwidth (0=MAX,1=AVG,2=MIN)", criteria == 0 ? "MAX" : criteria == 1 ? "AVG" : "MIN"); + ev.Print("ELEM_BYTES" , EnvVars::ToStr(elemBytes).c_str() , "Element sizes in bytes to sweep over (must contain only 4,8 or 16)"); + ev.Print("GPU_INDICES" , EnvVars::ToStr(gpuIndices).c_str(), "GPU indices to test. Leave empty for all"); + ev.Print("MEM_TYPE" , memTypeIdx , "Using %s memory (%s)", devMemTypeStr.c_str(), Utils::GetAllGpuMemTypeStr().c_str()); + ev.Print("NUM_BUFFERS" , numBuffers , "Number of buffers to rotate through (1 per iteration)"); + ev.Print("NUM_ITERATIONS", numIterations , "Number of iterations to time"); + ev.Print("NUM_SUB_EXECS" , EnvVars::ToStr(numSesList).c_str(), "Number of subexecutors to sweep over (default to all available)"); + ev.Print("PREWARM_MSEC" , prewarmMsec , "Prewarm duration in msec"); + ev.Print("SHOW_DETAILS" , showDetails , "Show sweep details (ignored for multi-rank). 
Setting to 2 shows per iteration output");
+      ev.Print("SHOW_EXTRA"    , showExtra                          , "Show best sweep config details");
+      ev.Print("TEMPORAL_MASK" , temporalMask                       , "Temporal mask (1 = temporal, 2 = non-temporal, 3 = both)");
+      ev.Print("UNROLLS"       , EnvVars::ToStr(unrolls).c_str()    , "Unroll factors to sweep over (must contain only 1,2,4,8 or 16)");
+      ev.Print("USE_WALLCLOCK" , useWallClock                       , useWallClock ? "Using GPU wall-clock for timing" : "Using events for timing");
+      Utils::Print("\n");
+    }
+  }
+
+  if (gpuIndices.empty()) {
+    // If empty, use all available GPUs on local rank
+    for (int gpuIdx = 0; gpuIdx < numDetectedGpus; gpuIdx++)
+      gpuIndices.push_back(gpuIdx);
+  }
+
+  // Determine how much memory to allocate based on sweep settings
+  // During each step each threadblock works on BLOCKSIZE * UNROLL * ELEM_BYTES bytes
+  // Each buffer will be allocated as the smallest multiple of this larger than numBytesAtLeast
+  // NOTE: It's not safe to just base this on the maximum values of each sweep parameter
+  //       (e.g. if the maximum size divides numBytesAtLeast perfectly), so looping over the entire space is safer
+  size_t largestTotalBytesPerBuffer = 0;
+  for (int numSubExec : numSesList) {
+    for (int blockSize : blockSizes) {
+      for (int unroll : unrolls) {
+        for (int elemByte : elemBytes) {
+          size_t totalBytesPerStep   = numSubExec * blockSize * unroll * elemByte;
+          size_t numSteps            = std::max((size_t)1, (numBytesAtLeast + totalBytesPerStep - 1) / totalBytesPerStep);
+          size_t totalBytesPerBuffer = numSteps * totalBytesPerStep;
+          if (totalBytesPerBuffer > largestTotalBytesPerBuffer) largestTotalBytesPerBuffer = totalBytesPerBuffer;
+        }
+      }
+    }
+  }
+
+  if (showDetails) {
+    Utils::Print("GPU ## | #SE | BKSZ | UR | EB | TOTALBYTES | #STEP | MAX GB/s | AVG GB/s | MIN GB/s\n");
+  }
+
+  // Test all local GPUs
+  std::vector<HbmBwResult> localResults;
+
+  if (!showDetails) {
+    // Calculate total number of tests that will be executed per GPU
+    size_t numTests = numSesList.size() * blockSizes.size() * unrolls.size() * elemBytes.size() * (temporalMask == 3 ? 
2 : 1);
+
+    Utils::Print("Testing on at least %lu bytes (%lu configs per GPU): ", numBytesAtLeast, numTests);
+    fflush(stdout);
+  }
+
+  for (int gpuIdx : gpuIndices) {
+    HIP_CALL(hipSetDevice(gpuIdx));
+
+    // Create streams/events for this GPU
+    hipStream_t stream;
+    hipEvent_t  startEvent, stopEvent;
+    HIP_CALL(hipStreamCreate(&stream));
+    HIP_CALL(hipEventCreate(&startEvent));
+    HIP_CALL(hipEventCreate(&stopEvent));
+
+    // Allocate pinned host memory closest to this GPU to capture timestamps (if enabled)
+    int wallClockRate;
+    long long* minStartCycle = nullptr;
+    long long* maxStopCycle  = nullptr;
+
+    if (useWallClock) {
+#if defined(__NVCC__)
+      wallClockRate = 1000000;
+#else
+      HIP_CALL(hipDeviceGetAttribute(&wallClockRate, hipDeviceAttributeWallClockRate, gpuIdx));
+#endif
+      if (Utils::AllocateMemory({MEM_CPU_CLOSEST, gpuIdx, myRank}, sizeof(int64_t), (void**)&minStartCycle) ||
+          Utils::AllocateMemory({MEM_CPU_CLOSEST, gpuIdx, myRank}, sizeof(int64_t), (void**)&maxStopCycle)) {
+        Utils::Print("[ERROR] Unable to allocate pinned host memory on rank %d closest to GPU %d\n", myRank, gpuIdx);
+        return ERR_FATAL;
+      }
+    }
+
+    // Allocate and initialize each GPU buffer
+    MemDevice memDevice = {memType, gpuIdx, myRank};
+    std::vector<void*> inputBuffers(numBuffers);
+    for (int bufferIdx = 0; bufferIdx < numBuffers; bufferIdx++) {
+      ErrResult err = AllocateMemory(memDevice, largestTotalBytesPerBuffer, &inputBuffers[bufferIdx]);
+      if (err.errType != ERR_NONE) {
+        Utils::Print("[ERROR] Error when allocating memory (%s)\n", err.errMsg.c_str());
+        return ERR_FATAL;
+      }
+      FillPsuedoRandomData<<<32, 256, 0, stream>>>(largestTotalBytesPerBuffer / sizeof(uint32_t),
+                                                   (uint32_t*)inputBuffers[bufferIdx], bufferIdx);
+    }
+    HIP_CALL(hipStreamSynchronize(stream));
+
+    HbmBwResult bestResult = {};
+
+    // Run sweep to find fastest result
+    for (int numSubExec : numSesList) {
+      dim3 gridDim(numSubExec, 1, 1);
+      for (int blockSize : blockSizes) {
+        if (!showDetails) {
+          Utils::Print(".");
+          fflush(stdout);
+        }
+        dim3 blockDim(blockSize, 1, 1);
+        int launchBoundIdx = (blockSize + 255) / 256 - 1;
+        for (int unroll : unrolls) {
+          int unrollIdx = (int)log2(unroll);
+          for (int elemByte : elemBytes) {
+            int elemByteIdx = (int)log2(elemByte) - 2;
+            size_t totalBytesPerStep = numSubExec * blockSize * unroll * elemByte;
+            size_t numSteps   = std::max((size_t)1, (numBytesAtLeast + totalBytesPerStep - 1) / totalBytesPerStep);
+            size_t totalBytes = numSteps * totalBytesPerStep;
+
+            for (int useNt = 0; useNt <= 1; useNt++) {
+              if (!(temporalMask & (1 << useNt))) continue;
+              HbmReadBwKernelFuncPtr kernel = HbmReadKernelTable[launchBoundIdx][unrollIdx][elemByteIdx][useNt];
+
+              double minBw = std::numeric_limits<double>::max();
+              double maxBw = std::numeric_limits<double>::lowest();
+              double sumBw = 0.0;
+
+              /* Run warmups for user-specified time */
+              int currBufferIdx = 0;
+              auto prewarmEnd = std::chrono::steady_clock::now() + std::chrono::milliseconds(prewarmMsec);
+              do {
+                kernel<<<gridDim, blockDim, 0, stream>>>(inputBuffers[currBufferIdx++], nullptr, numSteps, minStartCycle, maxStopCycle);
+                HIP_CALL(hipStreamSynchronize(stream));
+                if (currBufferIdx == numBuffers) currBufferIdx = 0;
+              } while (std::chrono::steady_clock::now() < prewarmEnd);
+
+              /* Run timed iterations */
+              currBufferIdx = 0;
+              for (int iteration = 0; iteration < numIterations; iteration++) {
+                if (useWallClock) {
+                  *minStartCycle = std::numeric_limits<long long>::max();
+                  *maxStopCycle  = 0;
+                }
+
+#if defined(__NVCC__)
+                if (!useWallClock) {
+                  HIP_CALL(hipEventRecord(startEvent, stream));
+                }
+                kernel<<<gridDim, blockDim, 0, stream>>>(inputBuffers[currBufferIdx++], nullptr, numSteps, minStartCycle, maxStopCycle);
+                if (!useWallClock) {
+                  HIP_CALL(hipEventRecord(stopEvent, stream));
+                }
+#else
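+                // hipExtLaunchKernelGGL lets the HIP runtime record the supplied start/stop
+                // events around the kernel launch itself; when wall-clock timing is selected,
+                // both events are skipped (nullptr) and the kernel's own timestamps are used.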
+                hipExtLaunchKernelGGL(kernel, gridDim, blockDim, 0, stream,
+                                      useWallClock ? nullptr : startEvent,
+                                      useWallClock ? nullptr : stopEvent, 0,
+                                      inputBuffers[currBufferIdx++], nullptr, numSteps, minStartCycle, maxStopCycle);
+#endif
+                HIP_CALL(hipStreamSynchronize(stream));
+                if (currBufferIdx == numBuffers) currBufferIdx = 0;
+
+                float elapsedMsec;
+                if (useWallClock) {
+                  elapsedMsec = (*maxStopCycle - *minStartCycle) / (double)wallClockRate;
+                } else {
+                  HIP_CALL(hipEventElapsedTime(&elapsedMsec, startEvent, stopEvent));
+                }
+
+                double bw = totalBytes / (elapsedMsec / 1000.0) / 1e9;
+
+                if (showDetails > 1) {
+                  Utils::Print("GPU %02d | %3d | %4d | %2d | %2d | %10lu | %5lu | %8.3f\n",
+                               gpuIdx, numSubExec, blockSize, unroll, elemByte, totalBytes, numSteps, bw);
+                  fflush(stdout);
+                }
+
+                minBw = std::min(minBw, bw);
+                maxBw = std::max(maxBw, bw);
+                sumBw += bw;
+              }
+
+              double avgBw = sumBw / numIterations;
+
+              if (showDetails) {
+                Utils::Print("GPU %02d | %3d | %4d | %2d | %2d | %10lu | %5lu | %8.3f | %8.3f | %8.3f\n",
+                             gpuIdx, numSubExec, blockSize, unroll, elemByte, totalBytes, numSteps, maxBw, avgBw, minBw);
+                fflush(stdout);
+              }
+
+              double bw[3] = {maxBw, avgBw, minBw};
+              if (bw[criteria] > bestResult.bw[criteria]) {
+                bestResult.rank       = myRank;
+                bestResult.gpuIdx     = gpuIdx;
+                bestResult.numSubExec = numSubExec;
+                bestResult.blockSize  = blockSize;
+                bestResult.unroll     = unroll;
+                bestResult.elemByte   = elemByte;
+                bestResult.bw[0]      = bw[0];
+                bestResult.bw[1]      = bw[1];
+                bestResult.bw[2]      = bw[2];
+              }
+            }
+          }
+        }
+      }
+    }
+
+    localResults.push_back(bestResult);
+
+    // Deallocate memory buffers
+    for (int bufferIdx = 0; bufferIdx < numBuffers; bufferIdx++) {
+      ErrResult err = DeallocateMemory(memType, inputBuffers[bufferIdx], largestTotalBytesPerBuffer);
+      if (err.errType != ERR_NONE) {
+        Utils::Print("[ERROR] Error when deallocating memory (%s)\n", err.errMsg.c_str());
+        return ERR_FATAL;
+      }
+    }
+
+    if (useWallClock) {
+      if (Utils::DeallocateMemory(MEM_CPU_CLOSEST, minStartCycle, sizeof(int64_t)) ||
+          Utils::DeallocateMemory(MEM_CPU_CLOSEST, maxStopCycle, sizeof(int64_t))) {
+        Utils::Print("[ERROR] Unable to deallocate pinned host memory on rank %d closest to GPU %d\n", myRank, gpuIdx);
+        return ERR_FATAL;
+      }
+    }
+
+    // Cleanup streams and events
+    HIP_CALL(hipStreamDestroy(stream));
+    HIP_CALL(hipEventDestroy(startEvent));
+    HIP_CALL(hipEventDestroy(stopEvent));
+  }
+  if (!showDetails) {
+    Utils::Print("\n"); fflush(stdout);
+  }
+
+  // Determine the total number of results
+  std::vector<int> numGpusOnRank(numRanks);
+  int totalGpus = 0;
+  for (int rank = 0; rank < numRanks; rank++) {
+    numGpusOnRank[rank] = (int)gpuIndices.size();
+    TransferBench::System::Get().Broadcast(rank, sizeof(int), &numGpusOnRank[rank]);
+    totalGpus += numGpusOnRank[rank];
+  }
+
+  int numRows = 1 + totalGpus;
+  int numCols = 5 + (showExtra ?
4 : 0); + int precision = 2; + Utils::TableHelper table(numRows, numCols, precision); + + table.DrawRowBorder(0); + table.DrawRowBorder(1); + table.DrawColBorder(0); + table.DrawColBorder(2); + table.DrawColBorder(5); + table.DrawColBorder(numCols); + + // Header row + table.Set(0, 0, " Rank "); + table.Set(0, 1, " GPU "); + table.Set(0, 2, " MaxBw (GB/s) "); + table.Set(0, 3, " AvgBw (GB/s) "); + table.Set(0, 4, " MinBw (GB/s) "); + if (showExtra) { + table.Set(0, 5, " #SE "); + table.Set(0, 6, " Blocksize "); + table.Set(0, 7, " Unroll "); + table.Set(0, 8, " EBytes "); + } + + // Data rows + int rowIdx = 1; + for (int rank = 0; rank < numRanks; rank++) { + for (int gpu = 0; gpu < numGpusOnRank[rank]; gpu++) { + HbmBwResult result; + if (rank == myRank) result = localResults[gpu]; + TransferBench::System::Get().Broadcast(rank, sizeof(result), &result); + + table.Set(rowIdx, 0, " %d " , result.rank); + table.Set(rowIdx, 1, " %d " , result.gpuIdx); + table.Set(rowIdx, 2, " %8.2f ", result.bw[0]); + table.Set(rowIdx, 3, " %8.2f ", result.bw[1]); + table.Set(rowIdx, 4, " %8.2f ", result.bw[2]); + if (showExtra) { + table.Set(rowIdx, 5, " %d ", result.numSubExec); + table.Set(rowIdx, 6, " %d ", result.blockSize); + table.Set(rowIdx, 7, " %d ", result.unroll); + table.Set(rowIdx, 8, " %d ", result.elemByte); + } + rowIdx++; + } + table.DrawRowBorder(rowIdx); + } + table.PrintTable(outputToCsv, showBorders); + + return ERR_NONE; +} diff --git a/src/client/Presets/HealthCheck.hpp b/src/client/Presets/HealthCheck.hpp index 973df3e7..37608ad0 100644 --- a/src/client/Presets/HealthCheck.hpp +++ b/src/client/Presets/HealthCheck.hpp @@ -439,19 +439,20 @@ int TestHbmPerformance(int modelId, bool verbose) return hasFail; } -int HealthCheckPreset(EnvVars& ev, - size_t const numBytesPerTransfer, - std::string const presetName) +int HealthCheckPreset(EnvVars& ev, + size_t const numBytesPerTransfer, + std::string const presetName, + bool const bytesSpecified) { if (TransferBench::GetNumRanks() > 1) { Utils::Print("[ERROR] Healthcheck preset currently not supported for multi-node\n"); - return 1; + return ERR_FATAL; } // Check for supported platforms #if defined(__NVCC__) printf("[WARN] healthcheck preset not supported on NVIDIA hardware\n"); - return 0; + return ERR_NONE; #endif printf("Disclaimer:\n"); @@ -473,5 +474,5 @@ int HealthCheckPreset(EnvVars& ev, numFails += TestUnidir(modelId, verbose); numFails += TestBidir(modelId, verbose); numFails += TestAllToAll(modelId, verbose); - return numFails ? 1 : 0; + return numFails ? ERR_FATAL : ERR_NONE; } diff --git a/src/client/Presets/Help.hpp b/src/client/Presets/Help.hpp new file mode 100644 index 00000000..26ede846 --- /dev/null +++ b/src/client/Presets/Help.hpp @@ -0,0 +1,123 @@ +/* +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +int HelpPreset([[maybe_unused]] EnvVars& ev, + [[maybe_unused]] size_t const numBytesPerTransfer, + [[maybe_unused]] std::string const presetName, + [[maybe_unused]] bool const bytesSpecified) +{ + if (!Utils::RankDoesOutput()) return 0; + + printf("# ConfigFile Format:\n"); + printf("# ==================\n"); + printf("# A Transfer is defined as a single operation where an Executor reads and adds together\n"); + printf("# values from Source (SRC) memory locations, then writes the sum to destination (DST) memory locations.\n"); + printf("# This simplifies to a simple copy operation when dealing with single SRC/DST.\n"); + printf("#\n"); + printf("# SRC 0 DST 0\n"); + printf("# SRC 1 -> Executor -> DST 1\n"); + printf("# SRC X DST Y\n"); + printf("\n"); + printf("# Five Executors are supported by TransferBench\n"); + printf("# Executor: SubExecutor:\n"); + printf("# 1) CPU CPU thread\n"); + printf("# 2) GPU GPU threadblock/Compute Unit (CU)\n"); + printf("# 3) DMA N/A. (Must have single SRC, at least one DST)\n"); + printf("# 4) NIC Queue Pair\n"); + printf("# 5) Batched-DMA Batch item (Must have single SRC, at least one DST)\n"); + printf("\n"); + printf("# Each single line in the configuration file defines a set of Transfers (a Test) to run in parallel\n"); + printf("\n"); + printf("# There are two ways to specify a Test:\n"); + printf("\n"); + printf("# 1) Basic\n"); + printf("# The basic specification assumes the same number of SubExecutors (SE) used per Transfer\n"); + printf("# A positive number of Transfers is specified followed by that number of triplets describing each Transfer\n"); + printf("\n"); + printf("# #Transfers #SEs (srcMem1->Executor1->dstMem1) ... (srcMemL->ExecutorL->dstMemL)\n"); + printf("\n"); + printf("# 2) Advanced\n"); + printf("# A negative number of Transfers is specified, followed by quintuplets describing each Transfer\n"); + printf("# A non-zero number of bytes specified will override any provided value\n"); + printf("# -#Transfers (srcMem1->Executor1->dstMem1 #SEs1 Bytes1) ... 
(srcMemL->ExecutorL->dstMemL #SEsL BytesL)\n");
+  printf("\n");
+  printf("# Argument Details:\n");
+  printf("#   #Transfers: Number of Transfers to be run in parallel\n");
+  printf("#   #SEs      : Number of SubExecutors to use (CPU threads / GPU threadblocks)\n");
+  printf("#   srcMemL   : Source memory locations (Where the data is to be read from)\n");
+  printf("#   Executor  : Executor is specified by a character indicating type, followed by device index (0-indexed)\n");
+  printf("#               - C: CPU-executed  (Indexed from 0 to # NUMA nodes - 1)\n");
+  printf("#               - G: GPU-executed  (Indexed from 0 to # GPUs - 1)\n");
+  printf("#               - D: DMA-executor  (Indexed from 0 to # GPUs - 1)\n");
+  printf("#               - B: Batched-DMA-executor (Indexed from 0 to # GPUs - 1)\n");
+  printf("#               - I#.#: NIC executor (Indexed from 0 to # NICs - 1)\n");
+  printf("#               - N#.#: Nearest NIC executor (Indexed from 0 to # GPUs - 1)\n");
+  printf("#   dstMemL   : Destination memory locations (Where the data is to be written to)\n");
+  printf("#   bytesL    : Number of bytes to copy (0 means use command-line specified size)\n");
+  printf("#               Must be a multiple of 4 and may be suffixed with ('K','M', or 'G')\n");
+  printf("#\n");
+  printf("# Memory locations are specified by one or more (device character / device index) pairs\n");
+  printf("#   Character indicating memory type followed by device index (0-indexed)\n");
+  printf("#   Supported memory locations are:\n");
+  printf("#   - C: Pinned host memory (on NUMA node, indexed from 0 to [# NUMA nodes-1])\n");
+  printf("#   - P: Pinned host memory (on NUMA node closest to the indexed GPU, from 0 to [# GPUs - 1])\n");
+  printf("#   - B: Coherent pinned host memory (on NUMA node, indexed from 0 to [# NUMA nodes-1])\n");
+  printf("#   - D: Non-coherent pinned host memory (on NUMA node, indexed from 0 to [# NUMA nodes-1])\n");
+  printf("#   - K: Uncached pinned host memory (on NUMA node, indexed from 0 to [# NUMA nodes-1])\n");
+  printf("#   - H: Unpinned host memory (on NUMA node, indexed from 0 to [# NUMA nodes-1])\n");
+  printf("#   - G: Global device memory (on GPU device indexed from 0 to [# GPUs - 1])\n");
+  printf("#   - F: Fine-grain device memory (on GPU device indexed from 0 to [# GPUs - 1])\n");
+  printf("#   - U: Uncached device memory (on GPU device indexed from 0 to [# GPUs - 1])\n");
+  printf("#   - N: Null memory (index ignored)\n");
+  printf("\n");
+  printf("\n");
+  printf("# Examples:\n");
+  printf("# 1 4 (G0->G0->G1)                   Uses 4 CUs on GPU0 to copy from GPU0 to GPU1\n");
+  printf("# 1 4 (C1->G2->G0)                   Uses 4 CUs on GPU2 to copy from CPU1 to GPU0\n");
+  printf("# 2 4 G0->G0->G1 G1->G1->G0          Copies from GPU0 to GPU1, and GPU1 to GPU0, each with 4 SEs\n");
+  printf("# -2 (G0 G0 G1 4 1M) (G1 G1 G0 2 2M) Copies 1MB from GPU0 to GPU1 with 4 SEs, and 2MB from GPU1 to GPU0 with 2 SEs\n");
+  printf("# 1 2 (F0->I0.2->F1)                 Uses 2 QPs to transfer data from GPU0 via NIC0 to GPU1 via NIC2\n");
+  printf("# 1 1 (F0->N0.1->F1)                 Uses 1 QP to transfer data from GPU0 via GPU0's closest NIC to GPU1 via GPU1's closest NIC\n");
+  printf("# -2 (G0->N0.1->G1 2 128M) (G1->N1.0->G0 1 256M) Uses the Nearest NIC executor to copy 128MB from GPU0 to GPU1 with 2 QPs,\n");
+  printf("#                                    and 256MB from GPU1 to GPU0 with 1 QP\n");
+  printf("# Round brackets and arrows '->' may be included for human clarity, but are optional and will be ignored\n");
+  printf("# Lines starting with # will be ignored. Lines starting with ## will be echoed to output\n");
+  printf("\n");
+  printf("## Single GPU-executed Transfer between GPUs 0 and 1 using 4 CUs\n");
+  printf("1 4 (G0->G0->G1)\n");
+  printf("\n");
+  printf("## Single DMA executed Transfer between GPUs 0 and 1\n");
+  printf("1 1 (G0->D0->G1)\n");
+  printf("\n");
+  printf("## Copy 1MB from GPU0 to GPU1 with 4 CUs, and 2MB from GPU1 to GPU0 with 8 CUs\n");
+  printf("-2 (G0->G0->G1 4 1M) (G1->G1->G0 8 2M)\n");
+  printf("\n");
+  printf("## \"Memset\" by GPU 0 to GPU 0 memory\n");
+  printf("1 32 (N0->G0->G0)\n");
+  printf("\n");
+  printf("## \"Read-only\" by CPU 0\n");
+  printf("1 4 (C0->C0->N0)\n");
+  printf("\n");
+  printf("## Broadcast from GPU 0 to GPU 0 and GPU 1\n");
+  printf("1 16 (G0->G0->G0G1)\n");
+  return 0;
+}
diff --git a/src/client/Presets/NicAllToAll.hpp b/src/client/Presets/NicAllToAll.hpp
new file mode 100644
index 00000000..32e882a4
--- /dev/null
+++ b/src/client/Presets/NicAllToAll.hpp
@@ -0,0 +1,374 @@
+/*
+Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <cstdlib>
+#include <cstring>
+#include <numeric>
+
+int NicAllToAllPreset(EnvVars& ev,
+                      size_t const numBytesPerTransfer,
+                      std::string const presetName,
+                      [[maybe_unused]] bool const bytesSpecified)
+{
+  // Check for single homogenous group
+  if (Utils::GetNumRankGroups() > 1) {
+    Utils::Print("[ERROR] NIC all-to-all preset can only be run across ranks that are homogenous\n");
+    Utils::Print("[ERROR] Run ./TransferBench without any args to display topology information\n");
+    Utils::Print("[ERROR] TB_NIC_FILTER may also be used to limit NIC visibility to scale-out NICs\n");
+    return 1;
+  }
+
+  int numRanks       = TransferBench::GetNumRanks();
+  int numNicsPerRank = TransferBench::GetNumExecutors(EXE_NIC);
+  if (numNicsPerRank == 0) {
+    Utils::Print("[ERROR] No NIC detected. This preset requires NIC executors.\n");
+    return 1;
+  }
+
+  int useCpuMem = EnvVars::GetEnvVar("USE_CPU_MEM", 0);
+  // Device count from topology: GFX executors, or CPU executors when USE_CPU_MEM (same pattern as NicRings).
+  int numMemDevices = TransferBench::GetNumExecutors(useCpuMem ? EXE_CPU : EXE_GPU_GFX);
+  if (numMemDevices == 0) {
+    Utils::Print("[ERROR] No %s executors detected for NIC all-to-all.\n", useCpuMem ? "CPU" : "GPU GFX");
+    return 1;
+  }
"CPU" : "GPU GFX"); + return 1; + } + + int numQueuePairs = EnvVars::GetEnvVar("NUM_QUEUE_PAIRS", 1); + int showDetails = EnvVars::GetEnvVar("SHOW_DETAILS", 0); + int useRdmaRead = EnvVars::GetEnvVar("USE_RDMA_READ", 0); + int memTypeIdx = EnvVars::GetEnvVar("MEM_TYPE", 0); + int stride = EnvVars::GetEnvVar("STRIDE", 1); + + // Compute orbit structure before reading GROUP_SIZE so its default can be stride-aware. + // Stride orbits on devices (rank-major devLin = rank * numMemDevices + memIdx): same gcd structure as PodAllToAll's StrideGenerate, + // but NIC A2A does not use the permuted slot order for GROUP_SIZE — subgroups follow natural order within each orbit. + int const M = numRanks * numMemDevices; + int const kNorm = ((stride % M) + M) % M; + int const dCycles = (kNorm == 0) ? 1 : std::gcd(kNorm, M); + int const orbitSize = M / dCycles; + + int groupSize = EnvVars::GetEnvVar("GROUP_SIZE", orbitSize); + int noSameRank = EnvVars::GetEnvVar("NIC_A2A_NO_SAME_RANK", 1); + int numNicPlanes = EnvVars::GetEnvVar("NUM_NIC_PLANES", 1); + + if (numQueuePairs < 1) { + Utils::Print("[ERROR] NUM_QUEUE_PAIRS must be >= 1 (got %d)\n", numQueuePairs); + return 1; + } + if (groupSize < 1) { + Utils::Print("[ERROR] GROUP_SIZE must be >= 1 (got %d)\n", groupSize); + return 1; + } + + bool scopeInter = false; + { + char const* scopeStr = getenv("NIC_A2A_SCOPE"); + if (scopeStr && scopeStr[0]) { + if (!strcmp(scopeStr, "inter") || !strcmp(scopeStr, "INTER")) + scopeInter = true; + else if (strcmp(scopeStr, "intra") && strcmp(scopeStr, "INTRA")) { + Utils::Print("[ERROR] NIC_A2A_SCOPE must be \"intra\" or \"inter\"\n"); + return 1; + } + } + } + + MemType memType = Utils::GetMemType(memTypeIdx, useCpuMem); + std::string memTypeStr = Utils::GetMemTypeStr(memTypeIdx, useCpuMem); + + if (numNicPlanes < 1) { + Utils::Print("[ERROR] NUM_NIC_PLANES must be >= 1\n"); + return 1; + } + + // Same divisibility check as PodAllToAll (total devices = ranks × memory devices per rank). + if (M % groupSize) { + Utils::Print("[ERROR] Group size %d cannot evenly divide %d total devices from %d ranks.\n", + groupSize, M, numRanks); + return 1; + } + + // Within each stride orbit, partition by natural rank-major device index: orbit lists devLin = r, r+d, r+2d, ... + // (r = devLin %% dCycles). Subgroup id = (index along that list) / GROUP_SIZE. + if (orbitSize % groupSize != 0) { + Utils::Print("[ERROR] GROUP_SIZE (%d) must divide stride-cycle size %d (devices M=%d, orbits=%d).\n", + groupSize, orbitSize, M, dCycles); + Utils::Print("[ERROR] With STRIDE=%d there are %d disjoint cycles; use a GROUP_SIZE that divides each cycle's device count,\n", + stride, dCycles); + Utils::Print("[ERROR] or use STRIDE=1 so the cycle size equals total devices (%d).\n", M); + return 1; + } + + std::vector deviceSubgroup(M); + for (int devLin = 0; devLin < M; devLin++) { + int const r = devLin % dCycles; + int const k = (devLin - r) / dCycles; // 0 .. orbitSize-1 along natural order in this orbit + deviceSubgroup[devLin] = k / groupSize; + } + + if (Utils::RankDoesOutput()) { + ev.DisplayEnvVars(); + if (!ev.hideEnv) { + if (!ev.outputToCsv) printf("[NIC A2A Related]\n"); + ev.Print("USE_CPU_MEM" , useCpuMem , "Using closest %s memory", useCpuMem ? 
"CPU" : "GPU"); + ev.Print("MEM_TYPE" , memTypeIdx , "Using %s memory (%s)", memTypeStr.c_str(), Utils::GetAllMemTypeStr(useCpuMem).c_str()); + ev.Print("STRIDE" , stride , "Reordering devices by taking %d steps", stride); + ev.Print("GROUP_SIZE" , groupSize , "Dividing all devices into groups of %d for a2a", groupSize); + ev.Print("NUM_NIC_PLANES" , numNicPlanes , "Number of planes on scale-out"); + if (scopeInter) + ev.Print("NIC_A2A_SCOPE" , "inter" , "Between-group transfers only. Other value: intra"); + else + ev.Print("NIC_A2A_SCOPE" , "intra" , "Within-group transfers only. Other value: inter"); + ev.Print("NIC_A2A_NO_SAME_RANK", noSameRank , "%s transfers where src rank == dst rank", noSameRank ? "Excluding" : "Allowing"); + ev.Print("NUM_QUEUE_PAIRS" , numQueuePairs , "Using %d queue pairs for NIC transfers", numQueuePairs); + ev.Print("SHOW_DETAILS" , showDetails , "%s full Test details", showDetails ? "Showing" : "Hiding"); + ev.Print("USE_RDMA_READ" , useRdmaRead , "Performing RDMA %s", useRdmaRead ? "reads" : "writes"); + printf("\n"); + } + } + + // For each rank/NIC, closest memory device (GPU or CPU NUMA) — several NICs may share the same device (same subgroup). + std::vector> nicToMem(numRanks, std::vector(numNicsPerRank, -1)); + for (int rank = 0; rank < numRanks; rank++) { + for (int nic = 0; nic < numNicsPerRank; nic++) { + int memIdx = useCpuMem ? TransferBench::GetClosestCpuNumaToNic(nic, rank) + : TransferBench::GetClosestGpuToNic(nic, rank); + if (memIdx < 0) { + Utils::Print("[ERROR] Failed to identify closest %s for Rank %d NIC %d\n", + useCpuMem ? "CPU NUMA node" : "GPU", rank, nic); + return 1; + } + if (memIdx >= numMemDevices) { + Utils::Print("[ERROR] Closest %s index %d for Rank %d NIC %d is out of range [0,%d)\n", + useCpuMem ? "CPU" : "GPU", memIdx, rank, nic, numMemDevices); + return 1; + } + nicToMem[rank][nic] = memIdx; + } + } + + auto devLinOf = [&](int rank, int memIdx) -> int { return rank * numMemDevices + memIdx; }; + + // NIC plane: independent of STRIDE over memory devices. Global rank-major order over NIC endpoints, round-robin into P planes. + auto nicPlaneOf = [&](int rank, int nic) -> int { + int const L = rank * numNicsPerRank + nic; + return L % numNicPlanes; + }; + + std::vector transfers; + + auto const acceptPair = [&](int srcRank, int srcNic, int dstRank, int dstNic) -> bool { + if (nicPlaneOf(srcRank, srcNic) != nicPlaneOf(dstRank, dstNic)) + return false; + int srcDevLin = devLinOf(srcRank, nicToMem[srcRank][srcNic]); + int dstDevLin = devLinOf(dstRank, nicToMem[dstRank][dstNic]); + if ((srcDevLin % dCycles) != (dstDevLin % dCycles)) + return false; + if (noSameRank && srcRank == dstRank) + return false; + if (scopeInter) + return deviceSubgroup[srcDevLin] != deviceSubgroup[dstDevLin]; + return deviceSubgroup[srcDevLin] == deviceSubgroup[dstDevLin]; + }; + + for (int srcRank = 0; srcRank < numRanks; srcRank++) { + for (int srcNic = 0; srcNic < numNicsPerRank; srcNic++) { + int srcMem = nicToMem[srcRank][srcNic]; + for (int dstRank = 0; dstRank < numRanks; dstRank++) { + for (int dstNic = 0; dstNic < numNicsPerRank; dstNic++) { + if (!acceptPair(srcRank, srcNic, dstRank, dstNic)) continue; + + int dstMem = nicToMem[dstRank][dstNic]; + + TransferBench::Transfer transfer; + transfer.srcs.push_back({memType, srcMem, srcRank}); + transfer.dsts.push_back({memType, dstMem, dstRank}); + transfer.exeDevice = {EXE_NIC, useRdmaRead ? dstNic : srcNic, useRdmaRead ? dstRank : srcRank}; + transfer.exeSubIndex = useRdmaRead ? 
+  for (int srcRank = 0; srcRank < numRanks; srcRank++) {
+    for (int srcNic = 0; srcNic < numNicsPerRank; srcNic++) {
+      int srcMem = nicToMem[srcRank][srcNic];
+      for (int dstRank = 0; dstRank < numRanks; dstRank++) {
+        for (int dstNic = 0; dstNic < numNicsPerRank; dstNic++) {
+          if (!acceptPair(srcRank, srcNic, dstRank, dstNic)) continue;
+
+          int dstMem = nicToMem[dstRank][dstNic];
+
+          TransferBench::Transfer transfer;
+          transfer.srcs.push_back({memType, srcMem, srcRank});
+          transfer.dsts.push_back({memType, dstMem, dstRank});
+          transfer.exeDevice   = {EXE_NIC, useRdmaRead ? dstNic : srcNic, useRdmaRead ? dstRank : srcRank};
+          transfer.exeSubIndex = useRdmaRead ? srcNic : dstNic;
+          transfer.numSubExecs = numQueuePairs;
+          transfer.numBytes    = numBytesPerTransfer;
+
+          transfers.push_back(transfer);
+        }
+      }
+    }
+  }
+
+  Utils::Print("NIC All-To-All benchmark\n");
+  Utils::Print("========================\n");
+  Utils::Print("%s traffic over NIC executors. %d rank-major devices; STRIDE sets gcd-orbits; GROUP_SIZE chunks each orbit in natural order.\n",
+               useCpuMem ? "CPU" : "GPU", M);
+  Utils::Print("NICs map to devices via closest %s;\n", useCpuMem ? "CPU NUMA node" : "GPU");
+  Utils::Print("NIC planes: %d, traffic only between NICs in the same plane. Stride: %d\n",
+               numNicPlanes, stride);
+  Utils::Print("Using closest %s per NIC endpoint and %s memory.\n",
+               useCpuMem ? "CPU NUMA node" : "GPU", memTypeStr.c_str());
+  Utils::Print("Visible NICs per rank: %d\n", numNicsPerRank);
+  Utils::Print("%d queue pairs per NIC. %lu bytes per Transfer. All numbers are GB/s\n",
+               numQueuePairs, numBytesPerTransfer);
+  Utils::Print("Total transfers: %lu\n\n", transfers.size());
+
+  if (transfers.empty()) {
+    Utils::Print("[WARN] No transfers were generated for this preset.\n");
+    return 0;
+  }
+
+  TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
+  TransferBench::TestResults results;
+  if (!TransferBench::RunTransfers(cfg, transfers, results)) {
+    for (auto const& err : results.errResults)
+      Utils::Print("%s\n", err.errMsg.c_str());
+    return 1;
+  } else if (showDetails) {
+    Utils::PrintResults(ev, 1, transfers, results);
+    Utils::Print("\n");
+  }
+
+  if (!Utils::RankDoesOutput()) return 0;
+
+  int numRows = 6 + numRanks;
+  int numCols = 3 + numNicsPerRank;
+  Utils::TableHelper table(numRows, numCols);
+
+  table.Set(2, 0, " Rank ");
+  table.Set(2, 1, " Name ");
+  table.Set(1, numCols - 1, " TOTAL ");
+  table.Set(2, numCols - 1, " (GB/s) ");
+  table.SetColAlignment(1, Utils::TableHelper::ALIGN_LEFT);
+  for (int rank = 0; rank < numRanks; rank++) {
+    table.Set(3 + rank, 0, " %d ", rank);
+    table.Set(3 + rank, 1, " %s ", TransferBench::GetHostname(rank).c_str());
+  }
+  table.Set(numRows - 3, 1, " MAX (GB/s) ");
+  table.Set(numRows - 2, 1, " AVG (GB/s) ");
+  table.Set(numRows - 1, 1, " MIN (GB/s) ");
+  for (int row = numRows - 3; row < numRows; row++)
+    table.SetCellAlignment(row, 1, Utils::TableHelper::ALIGN_RIGHT);
+  table.DrawRowBorder(3);
+  table.DrawRowBorder(numRows - 3);
+
+  std::vector<std::vector<double>> bwByRankNic(numRanks, std::vector<double>(numNicsPerRank, 0.0));
+  for (size_t i = 0; i < results.tfrResults.size(); i++) {
+    int nicIdx  = results.tfrResults[i].exeDevice.exeIndex;
+    int rankIdx = results.tfrResults[i].exeDevice.exeRank;
+    bwByRankNic[rankIdx][nicIdx] += results.tfrResults[i].avgBandwidthGbPerSec;
+  }
+
+  std::vector<bool> nicHasMixedMemMapping(numNicsPerRank, false);
+  bool hasMixedMemMapping = false;
+  for (int nic = 0; nic < numNicsPerRank; nic++) {
+    int refMem = nicToMem[0][nic];
+    for (int rank = 1; rank < numRanks; rank++) {
+      if (nicToMem[rank][nic] != refMem) {
+        nicHasMixedMemMapping[nic] = true;
+        hasMixedMemMapping = true;
+        break;
+      }
+    }
+  }
+
+  std::vector<double> rankTotal(numRanks, 0.0);
+  int colIdx = 2;
+  table.DrawColBorder(colIdx);
+  for (int nic = 0; nic < numNicsPerRank; nic++) {
+    table.Set(0, colIdx, " NIC %02d ", nic);
+    if (nicHasMixedMemMapping[nic]) {
+      table.Set(1, colIdx, " MIXED ");
+    } else if (useCpuMem) {
+      table.Set(1, colIdx, " CPU %02d ", nicToMem[0][nic]);
+    } else {
+      table.Set(1, colIdx, " GPU %02d ", nicToMem[0][nic]);
+    }
+    table.Set(2, colIdx, " %s ", TransferBench::GetExecutorName({EXE_NIC, nic}).c_str());
+    double nicMin = std::numeric_limits<double>::max();
+    double nicAvg = 0.0;
+    double nicMax = std::numeric_limits<double>::lowest();
+    for (int rank = 0; rank < numRanks; rank++) {
+      double bw = bwByRankNic[rank][nic];
+      table.Set(3 + rank, colIdx, " %.2f ", bw);
+      nicMin = std::min(nicMin, bw);
+      nicAvg += bw;
+      nicMax = std::max(nicMax, bw);
+      rankTotal[rank] += bw;
+    }
+
+    table.Set(numRows - 3, colIdx, " %.2f ", nicMax);
+    table.Set(numRows - 2, colIdx, " %.2f ", nicAvg / numRanks);
+    table.Set(numRows - 1, colIdx, " %.2f ", nicMin);
+    colIdx++;
+  }
+  table.DrawColBorder(colIdx);
+
+  double rankMin = std::numeric_limits<double>::max();
+  double rankAvg = 0.0;
+  double rankMax = std::numeric_limits<double>::lowest();
+  for (int rank = 0; rank < numRanks; rank++) {
+    table.Set(3 + rank, numCols - 1, " %.2f ", rankTotal[rank]);
+    rankMin = std::min(rankMin, rankTotal[rank]);
+    rankAvg += rankTotal[rank];
+    rankMax = std::max(rankMax, rankTotal[rank]);
+  }
+  table.Set(numRows - 3, numCols - 1, " %.2f ", rankMax);
+  table.Set(numRows - 2, numCols - 1, " %.2f ", rankAvg / numRanks);
+  table.Set(numRows - 1, numCols - 1, " %.2f ", rankMin);
+
+  table.PrintTable(ev.outputToCsv, ev.showBorders);
+  Utils::Print("\n");
+  if (hasMixedMemMapping) {
+    Utils::Print("[WARN] NIC-to-%s mapping differs across ranks. 'MIXED' columns are detailed below.\n",
+                 useCpuMem ? "CPU" : "GPU");
+
+    int mapRows = 2 + numRanks;
+    int mapCols = 2 + numNicsPerRank;
+    Utils::TableHelper mapTable(mapRows, mapCols);
+    mapTable.Set(0, 0, " Rank ");
+    mapTable.Set(0, 1, " Name ");
+    mapTable.SetColAlignment(1, Utils::TableHelper::ALIGN_LEFT);
+    for (int nic = 0; nic < numNicsPerRank; nic++) {
+      mapTable.Set(0, 2 + nic, " NIC %02d ", nic);
+      mapTable.SetCellAlignment(0, 2 + nic, Utils::TableHelper::ALIGN_CENTER);
+    }
+    mapTable.DrawRowBorder(1);
+    mapTable.DrawColBorder(2);
+
+    for (int rank = 0; rank < numRanks; rank++) {
+      int rowIdx = 1 + rank;
+      mapTable.Set(rowIdx, 0, " %d ", rank);
+      mapTable.Set(rowIdx, 1, " %s ", TransferBench::GetHostname(rank).c_str());
+      for (int nic = 0; nic < numNicsPerRank; nic++) {
+        mapTable.Set(rowIdx, 2 + nic, " %s %02d ", useCpuMem ? "CPU" : "GPU", nicToMem[rank][nic]);
+      }
+    }
+
+    mapTable.PrintTable(ev.outputToCsv, ev.showBorders);
+    Utils::Print("\n");
+  }
+  Utils::Print("Aggregate bandwidth (CPU Timed): %8.3f GB/s\n", results.avgTotalBandwidthGbPerSec);
+  Utils::PrintErrors(results.errResults);
+
+  if (Utils::HasDuplicateHostname()) {
+    printf("[WARN] It is recommended to run TransferBench with one rank per host to avoid potential aliasing of executors\n");
+  }
+
+  return 0;
+}
diff --git a/src/client/Presets/NicPeerToPeer.hpp b/src/client/Presets/NicPeerToPeer.hpp
index 24f5d71f..ff67d8ac 100644
--- a/src/client/Presets/NicPeerToPeer.hpp
+++ b/src/client/Presets/NicPeerToPeer.hpp
@@ -24,122 +24,29 @@ THE SOFTWARE.
// Helper functions -// Returns a schedule of round robin pairing of N elements, using Circle Method -// if parallel, each round contains N/2 pairs, otherwise serial -void RoundRobinSchedule(std::vector>>& schedule, - int N, int parallel = 0) { - if (N == 1) { - schedule.push_back({{0,0}}); - return; - } - // Generate standard round-robin tournament (maximum parallelism) - std::vector>> fullSchedule; - - // Pad odd number of ranks with a dummy round (N+1) - int paddedN = N + N%2; - // Round-robin tournament scheduling - for (int round = 0; round < paddedN - 1; round++) { - std::vector> roundPairs; - std::vector> roundPairsReversed; - for (int i = 0; i < paddedN / 2; i++) { - int item1 = i; - int item2 = paddedN - 1 - i; - if (round > 0) { - // Rotate all except the first item - if (item1 > 0) item1 = ((item1 - 1 + round) % (paddedN - 1)) + 1; - if (item2 > 0) item2 = ((item2 - 1 + round) % (paddedN - 1)) + 1; - } - // Ignore dummy round, its partner sits out this ronud - if (item1 < N && item2 < N){ - roundPairs.push_back({item1, item2}); - roundPairsReversed.push_back({item2, item1}); - } - } - fullSchedule.push_back(roundPairs); - fullSchedule.push_back(roundPairsReversed); - } - - // A loopback round where all run in parallel - std::vector> selfRound; - for (int i = 0; i < N; i++) { - selfRound.push_back({i, i}); - } - fullSchedule.push_back(selfRound); - - if (parallel) { - schedule = std::move(fullSchedule); - } else { - // Serialize each round if needed - for (auto const& fullRound : fullSchedule) { - for (auto const& match : fullRound) { - std::vector> subRound; - subRound.push_back({match.first, match.second}); - schedule.push_back(subRound); - } - } - } -} - -// Returns a schedule for ordered 2-combination of N elements -// by pairing the list with its rotating self, -// each round contains n pairs, where 1 <= n <= N and N is divisible by n -// and an element cannot appear more than twice in a round, -void CombinationSchedule(std::vector>>& schedule, - int N, int n = 0) { - std::vector>> fullSchedule; - - if (n <= 0) n = N; - if (N <= 0 || n > N || N % n != 0) // Assuming balanced load for each round - { - n = 1; - Utils::Print("[WARN] cannot create round robin schedule, falling back to serial"); - } - - // Generate rounds of combination based on incrementing distance - for (int i = 0; i < N; i++) { - std::vector> round; - for (int j = 0; j < N; j++) { - round.push_back({j, (j+i)%N}); - } - fullSchedule.push_back(round); - } - - // Step 2: Split each full round into sub-rounds with at most n pairs - for (auto const& fullRound : fullSchedule) { - for (size_t start = 0; start < fullRound.size(); start += n) { - std::vector> subRound; - for (size_t i = start; i < start + n && i < fullRound.size(); i++) { - subRound.push_back(fullRound[i]); - } - if (!subRound.empty()) { - schedule.push_back(subRound); - } - } - } -} - int GetClosestDeviceToNic(MemType memType, int nicIdx, int rank) { return TransferBench::IsCpuMemType(memType) ? 
TransferBench::GetClosestCpuNumaToNic(nicIdx, rank) : TransferBench::GetClosestGpuToNic(nicIdx, rank); } -int NicPeerToPeerPreset(EnvVars& ev, - size_t const numBytesPerTransfer, - std::string const presetName) +int NicPeerToPeerPreset(EnvVars& ev, + size_t const numBytesPerTransfer, + std::string const presetName, + bool const bytesSpecified) { if (Utils::GetNumRankGroups() > 1) { Utils::Print("[ERROR] NIC p2p preset can only be run across ranks that are homogenous\n"); Utils::Print("[ERROR] Run ./TransferBench without any args to display topology information\n"); - Utils::Print("[ERROR] NIC_FILTER may also be used to limit NIC visibility\n"); - return 1; + Utils::Print("[ERROR] TB_NIC_FILTER may also be used to limit NIC visibility\n"); + return ERR_FATAL; } int numRanks = TransferBench::GetNumRanks(); int numNicsPerRank = TransferBench::GetNumExecutors(EXE_NIC); if (numNicsPerRank == 0) { Utils::Print("No NIC detected, NICs are required to run this preset\n"); - return 1; + return ERR_FATAL; } // Collect env vars for this preset @@ -204,8 +111,8 @@ int NicPeerToPeerPreset(EnvVars& ev, std::vector>> schedule; std::vector>> nicSchedule; - RoundRobinSchedule(schedule, numRanks, nodeParallel); - CombinationSchedule(nicSchedule, numNicsPerRank, nicParLevel); + Utils::RoundRobinSchedule(schedule, numRanks, nodeParallel); + Utils::CombinationSchedule(nicSchedule, numNicsPerRank, nicParLevel); int totalTransfers = numRanks * numNicsPerRank * numRanks * numNicsPerRank; int counter = 0; @@ -239,7 +146,7 @@ int NicPeerToPeerPreset(EnvVars& ev, if (srcMemIndex == -1 || dstMemIndex == -1) { Utils::Print("[ERROR] No proper GPU device can be found for transfer R%dN%d - R%dN%d\n", srcRank, srcNicIdx, dstRank, dstNicIdx); - return 1; + return ERR_FATAL; } transfer.numBytes = numBytesPerTransfer; transfer.srcs.push_back({srcTypeActual, srcMemIndex, srcRank}); @@ -255,7 +162,7 @@ int NicPeerToPeerPreset(EnvVars& ev, if (!TransferBench::RunTransfers(cfg, transfers, results)) { for (auto const& err : results.errResults) Utils::Print("%s\n", err.errMsg.c_str()); - return 1; + return ERR_FATAL; } counter += transfers.size(); @@ -365,10 +272,10 @@ int NicPeerToPeerPreset(EnvVars& ev, Utils::TableHelper summaryTable(11, 6, precision); Utils::Print("Summary of top 10 fastest/slowest connection\n"); - summaryTable.Set(0, 0, " Fastest Bandwidth (GB/s) "); + summaryTable.Set(0, 0, " Fastest Bandwidth (GB/s) "); summaryTable.Set(0, 1, " Src "); summaryTable.Set(0, 2, " Dst "); - summaryTable.Set(0, 3, " Slowest Bandwidth (GB/s) "); + summaryTable.Set(0, 3, " Slowest Bandwidth (GB/s) "); summaryTable.Set(0, 4, " Src "); summaryTable.Set(0, 5, " Dst "); @@ -410,5 +317,5 @@ int NicPeerToPeerPreset(EnvVars& ev, } summaryTable.PrintTable(ev.outputToCsv, ev.showBorders); - return 0; + return ERR_NONE; } diff --git a/src/client/Presets/NicRings.hpp b/src/client/Presets/NicRings.hpp index 95dbba85..031d5744 100644 --- a/src/client/Presets/NicRings.hpp +++ b/src/client/Presets/NicRings.hpp @@ -20,17 +20,18 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
 */
 
-int NicRingsPreset(EnvVars& ev,
-                   size_t const numBytesPerTransfer,
-                   std::string const presetName)
+int NicRingsPreset(EnvVars& ev,
+                   size_t const numBytesPerTransfer,
+                   std::string const presetName,
+                   bool const bytesSpecified)
 {
   // Check for single homogenous group
   if (Utils::GetNumRankGroups() > 1) {
     Utils::Print("[ERROR] NIC-rings preset can only be run across ranks that are homogenous\n");
     Utils::Print("[ERROR] Run ./TransferBench without any args to display topology information\n");
-    Utils::Print("[ERROR] NIC_FILTER may also be used to limit NIC visibility\n");
-    return 1;
+    Utils::Print("[ERROR] TB_NIC_FILTER may also be used to limit NIC visibility\n");
+    return ERR_FATAL;
   }
 
   // Collect topology
@@ -104,7 +105,7 @@ int NicRingsPreset(EnvVars& ev,
   if (!TransferBench::RunTransfers(cfg, transfers, results)) {
     for (auto const& err : results.errResults)
       Utils::Print("%s\n", err.errMsg.c_str());
-    return 1;
+    return ERR_FATAL;
   } else if (showDetails) {
     Utils::PrintResults(ev, 1, transfers, results);
     Utils::Print("\n");
@@ -160,7 +161,7 @@ int NicRingsPreset(EnvVars& ev,
 
     double ringMin = std::numeric_limits<double>::max();
     double ringAvg = 0.0;
-    double ringMax = std::numeric_limits<double>::min();
+    double ringMax = std::numeric_limits<double>::lowest();
 
     for (int rank = 0; rank < numRanks; rank++) {
       double avgBw = results.tfrResults[transferIdx].avgBandwidthGbPerSec;
@@ -184,7 +185,7 @@ int NicRingsPreset(EnvVars& ev,
 
   double rankMin = std::numeric_limits<double>::max();
   double rankAvg = 0.0;
-  double rankMax = std::numeric_limits<double>::min();
+  double rankMax = std::numeric_limits<double>::lowest();
   for (int rank = 0; rank < numRanks; rank++) {
     table.Set(3 + rank, numCols - 1, " %.2f ", rankTotal[rank]);
     rankMin = std::min(rankMin, rankTotal[rank]);
@@ -204,5 +205,5 @@ int NicRingsPreset(EnvVars& ev,
   if (Utils::HasDuplicateHostname()) {
     printf("[WARN] It is recommended to run TransferBench with one rank per host to avoid potential aliasing of executors\n");
   }
-  return 0;
+  return ERR_NONE;
 }
diff --git a/src/client/Presets/OneToAll.hpp b/src/client/Presets/OneToAll.hpp
index f43f4c0d..b2084cdd 100644
--- a/src/client/Presets/OneToAll.hpp
+++ b/src/client/Presets/OneToAll.hpp
@@ -20,19 +20,20 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
*/ -int OneToAllPreset(EnvVars& ev, - size_t const numBytesPerTransfer, - std::string const presetName) +int OneToAllPreset(EnvVars& ev, + size_t const numBytesPerTransfer, + std::string const presetName, + bool const bytesSpecified) { if (TransferBench::GetNumRanks() > 1) { Utils::Print("[ERROR] One-to-All preset currently not supported for multi-node\n"); - return 1; + return ERR_FATAL; } int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX); if (numDetectedGpus < 2) { printf("[ERROR] One-to-all benchmark requires machine with at least 2 GPUs\n"); - return 1; + return ERR_FATAL; } // Collect env vars for this preset @@ -66,7 +67,7 @@ int OneToAllPreset(EnvVars& ev, for (auto ch : sweepExe) { if (ch != 'G' && ch != 'D') { printf("[ERROR] Unrecognized executor type '%c' specified\n", ch); - return 1; + return ERR_FATAL; } } @@ -129,7 +130,7 @@ int OneToAllPreset(EnvVars& ev, } if (!TransferBench::RunTransfers(cfg, transfers, results)) { Utils::PrintErrors(results.errResults); - return 1; + return ERR_FATAL; } int counter = 0; @@ -151,5 +152,5 @@ int OneToAllPreset(EnvVars& ev, } } } - return 0; + return ERR_NONE; } diff --git a/src/client/Presets/PeerToPeer.hpp b/src/client/Presets/PeerToPeer.hpp index fd32f9b1..5fbe1554 100644 --- a/src/client/Presets/PeerToPeer.hpp +++ b/src/client/Presets/PeerToPeer.hpp @@ -20,13 +20,14 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -int PeerToPeerPreset(EnvVars& ev, - size_t const numBytesPerTransfer, - std::string const presetName) +int PeerToPeerPreset(EnvVars& ev, + size_t const numBytesPerTransfer, + std::string const presetName, + bool const bytesSpecified) { if (TransferBench::GetNumRanks() > 1) { Utils::Print("[ERROR] Peer-to-peer preset currently not supported for multi-node\n"); - return 1; + return ERR_FATAL; } int numDetectedCpus = TransferBench::GetNumExecutors(EXE_CPU); @@ -42,7 +43,6 @@ int PeerToPeerPreset(EnvVars& ev, int numGpuDevices = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus); int numGpuSubExecs = EnvVars::GetEnvVar("NUM_GPU_SE", useDmaCopy ? 1 : TransferBench::GetNumSubExecutors({EXE_GPU_GFX, 0})); int p2pMode = EnvVars::GetEnvVar("P2P_MODE", 0); - int useFineGrain = EnvVars::GetEnvVar("USE_FINE_GRAIN", -999); // Deprecated int useRemoteRead = EnvVars::GetEnvVar("USE_REMOTE_READ", 0); MemType cpuMemType = Utils::GetCpuMemType(cpuMemTypeIdx); @@ -72,12 +72,6 @@ int PeerToPeerPreset(EnvVars& ev, } } - // Check for deprecated env vars - if (useFineGrain != -999) { - Utils::Print("[ERROR] USE_FINE_GRAIN has been deprecated and replaced by CPU_MEM_TYPE and GPU_MEM_TYPE\n"); - return 1; - } - char const separator = ev.outputToCsv ? ',' : ' '; printf("Bytes Per Direction%c%lu\n", separator, numBytesPerTransfer); @@ -188,7 +182,7 @@ int PeerToPeerPreset(EnvVars& ev, if (!TransferBench::RunTransfers(cfg, transfers, results)) { for (auto const& err : results.errResults) printf("%s\n", err.errMsg.c_str()); - return 1; + return ERR_FATAL; } for (int dir = 0; dir <= isBidirectional; dir++) { @@ -326,5 +320,5 @@ int PeerToPeerPreset(EnvVars& ev, printf("\n\n"); } } - return 0; + return ERR_NONE; } diff --git a/src/client/Presets/PodAllToAll.hpp b/src/client/Presets/PodAllToAll.hpp new file mode 100644 index 00000000..f81586b7 --- /dev/null +++ b/src/client/Presets/PodAllToAll.hpp @@ -0,0 +1,270 @@ +/* +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +int PodAllToAllPreset(EnvVars& ev, + size_t const numBytesPerTransfer, + std::string const presetName, + bool const bytesSpecified) +{ + enum + { + A2A_COPY = 0, + A2A_READ_ONLY = 1, + A2A_WRITE_ONLY = 2, + A2A_CUSTOM = 3, + }; + char a2aModeStr[4][20] = {"Copy", "Read-Only", "Write-Only", "Custom"}; + + // Force single-stream mode for all-to-all benchmark + ev.useSingleStream = 1; + + // Force to gfx unroll 2 unless explicitly set + ev.gfxUnroll = EnvVars::GetEnvVar("GFX_UNROLL", 2); + + int numRanks = TransferBench::GetNumRanks(); + int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX); + + // Collect env vars for this preset + int a2aLocal = EnvVars::GetEnvVar("A2A_LOCAL" , 0); + int memTypeIdx = EnvVars::GetEnvVar("MEM_TYPE" , 0); + int numGpus = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus); + int numQueuePairs = EnvVars::GetEnvVar("NUM_QUEUE_PAIRS", 0); + int numSubExecs = EnvVars::GetEnvVar("NUM_SUB_EXEC" , 8); + int showDetails = EnvVars::GetEnvVar("SHOW_DETAILS" , 0); + int useDmaExec = EnvVars::GetEnvVar("USE_DMA_EXEC" , 0); + int useRemoteRead = EnvVars::GetEnvVar("USE_REMOTE_READ", 0); + int stride = EnvVars::GetEnvVar("STRIDE" , 1); + int groupSize = EnvVars::GetEnvVar("GROUP_SIZE" , numRanks * numDetectedGpus); + + // Check that all ranks have at least the number of GPUs requested + // Warn if NIC configuration is slightly different from one another + int numNics = TransferBench::GetNumExecutors(EXE_NIC, 0); + bool nicDifference = false; + for (int rank = 0; rank < numRanks; rank++) { + if (numGpus > TransferBench::GetNumExecutors(EXE_GPU_GFX, rank)) { + Utils::Print("[ERROR] All-to-All preset requires each rank to have the same number of GPUs\n"); + return ERR_FATAL; + } + if (numQueuePairs > 0 && numNics != TransferBench::GetNumExecutors(EXE_NIC, rank)) + nicDifference = true; + } + if (nicDifference) + Utils::Print("[WARN] Not all ranks have the same number of NICs\n"); + + // A2A_MODE may be 0,1,2 or else custom numSrcs:numDsts + int numSrcs, numDsts; + int a2aMode = 0; + if (getenv("A2A_MODE") && sscanf(getenv("A2A_MODE"), "%d:%d", &numSrcs, &numDsts) == 2) { + a2aMode = A2A_CUSTOM; + } else { + a2aMode = EnvVars::GetEnvVar("A2A_MODE", 0); + if (a2aMode < 0 || a2aMode > 2) { + Utils::Print("[ERROR] a2aMode must be between 0 and 2, or else numSrcs:numDsts\n"); + return ERR_FATAL; + } + numSrcs = (a2aMode == A2A_WRITE_ONLY ? 0 : 1); + numDsts = (a2aMode == A2A_READ_ONLY ? 
0 : 1); + } + + MemType memType = Utils::GetGpuMemType(memTypeIdx); + std::string devMemTypeStr = Utils::GetGpuMemTypeStr(memTypeIdx); + + // Print off environment variables + if (Utils::RankDoesOutput()) { + ev.DisplayEnvVars(); + if (!ev.hideEnv) { + if (!ev.outputToCsv) printf("[AllToAll Related]\n"); + ev.Print("A2A_LOCAL" , a2aLocal , "%s local transfers", a2aLocal ? "Include" : "Exclude"); + ev.Print("A2A_MODE" , (a2aMode == A2A_CUSTOM) ? std::to_string(numSrcs) + ":" + std::to_string(numDsts) : std::to_string(a2aMode), + (a2aMode == A2A_CUSTOM) ? (std::to_string(numSrcs) + " read(s) " + + std::to_string(numDsts) + " write(s)").c_str(): a2aModeStr[a2aMode]); + ev.Print("MEM_TYPE" , memTypeIdx , "Using %s GPU memory (%s)", devMemTypeStr.c_str(), Utils::GetAllGpuMemTypeStr().c_str()); + ev.Print("NUM_GPU_DEVICES", numGpus , "Using %d GPUs", numGpus); + ev.Print("NUM_QUEUE_PAIRS", numQueuePairs, "Using %d queue pairs for NIC transfers", numQueuePairs); + ev.Print("NUM_SUB_EXEC" , numSubExecs , "Using %d subexecutors/CUs per Transfer", numSubExecs); + ev.Print("USE_DMA_EXEC" , useDmaExec , "Using %s executor", useDmaExec ? "DMA" : "GFX"); + ev.Print("USE_REMOTE_READ", useRemoteRead, "Using %s as executor", useRemoteRead ? "DST" : "SRC"); + ev.Print("STRIDE" , stride , "Reordering devices by taking %d steps", stride); + ev.Print("GROUP_SIZE" , groupSize , "Dividing all devices into groups of %d for a2a", groupSize); + printf("\n"); + } + } + // Validate env vars + if (numGpus < 0 || numGpus > numDetectedGpus) { + Utils::Print("[ERROR] Cannot use %d GPUs. Detected %d GPUs\n", numGpus, numDetectedGpus); + return ERR_FATAL; + } + if (useDmaExec && (numSrcs != 1 || numDsts != 1)) { + Utils::Print("[ERROR] DMA execution can only be used for copies (A2A_MODE=0)\n"); + return ERR_FATAL; + } + + if (numRanks * numDetectedGpus % groupSize) { + Utils::Print("[ERROR] Group size %d cannot evenly divide %d total devices from %d ranks.\n", groupSize, numRanks * numDetectedGpus, numRanks); + return ERR_FATAL; + } + + Utils::Print("GPU-%s IntraPod All-To-All benchmark:\n", useDmaExec ? "DMA" : "GFX"); + Utils::Print("==============================\n"); + Utils::Print("[%lu bytes per Transfer] [%s:%d] [%d Read(s) %d Write(s)] [MemType:%s] [NIC QueuePairs:%d] [#Ranks:%d]\n", + numBytesPerTransfer, useDmaExec ? "DMA" : "GFX", numSubExecs, numSrcs, numDsts, + devMemTypeStr.c_str(), numQueuePairs, numRanks); + + TransferBench::ConfigOptions cfg = ev.ToConfigOptions(); + ExeType exeType = useDmaExec ? EXE_GPU_DMA : EXE_GPU_GFX; + + Utils::RankPerPodMap& rankToPod = Utils::GetRankPerPodMap(); + if (rankToPod.empty()) { + Utils::Print("[ERROR] No pods detected. 
Set TB_FORCE_SINGLE_POD=1 to treat all ranks as a single pod.\n");
+    return ERR_FATAL;
+  }
+  for (auto const& [pod, ranks] : rankToPod) {
+    int n = ranks.size() * numGpus;
+    int numGroups = n / groupSize;
+    std::vector<MemDevice> devices(n);
+    std::vector<int> indices(n);
+    for (int k = 0; k < n; k++) indices[k] = k;
+    Utils::StrideGenerate(indices, stride);
+    int idx = 0;
+    for (int rank : ranks) {
+      for (int devIdx = 0; devIdx < numGpus; devIdx++) {
+        devices[indices[idx++]] = {memType, devIdx, rank};
+      }
+    }
+
+    for (int group = 0; group < numGroups; group++) {
+      std::vector<std::vector<int>> groupReIndex(groupSize, std::vector<int>(groupSize, -1));
+      std::vector<TransferBench::Transfer> transfers;
+      for (int i = group * groupSize; i < (group + 1) * groupSize; i++) {
+        for (int j = group * groupSize; j < (group + 1) * groupSize; j++) {
+          if (i == j) {
+            if (!a2aLocal) continue;
+          }
+          TransferBench::Transfer transfer;
+          transfer.numBytes = numBytesPerTransfer;
+          for (int x = 0; x < numSrcs; x++) transfer.srcs.push_back(devices[i]);
+          if (numDsts) transfer.dsts.push_back(devices[j]);
+          for (int x = 1; x < numDsts; x++) transfer.dsts.push_back(devices[i]);
+          transfer.exeDevice = {exeType,
+                                (int32_t)(useRemoteRead ? devices[j].memIndex : devices[i].memIndex),
+                                (int32_t)(useRemoteRead ? devices[j].memRank  : devices[i].memRank)};
+          transfer.exeSubIndex = -1;
+          transfer.numSubExecs = numSubExecs;
+          int const localI = i - group * groupSize;
+          int const localJ = j - group * groupSize;
+          groupReIndex[localI][localJ] = (int)transfers.size();
+          transfers.push_back(transfer);
+        }
+
+        if (numQueuePairs > 0) {
+          TransferBench::Transfer transfer;
+          transfer.numBytes = numBytesPerTransfer;
+          transfer.srcs.push_back(devices[i]);
+          int next = group * groupSize + (i - group * groupSize + 1) % groupSize;
+          transfer.dsts.push_back(devices[next]);
+          transfer.exeDevice = {TransferBench::EXE_NIC_NEAREST,
+                                (int32_t)devices[i].memIndex, (int32_t)devices[i].memRank};
+          transfer.exeSubIndex = devices[next].memIndex;
+          transfer.numSubExecs = numQueuePairs;
+          transfers.push_back(transfer);
+        }
+      }
+      TransferBench::TestResults results;
+      if (!TransferBench::RunTransfers(cfg, transfers, results)) {
+        for (auto const& err : results.errResults)
+          Utils::Print("%s\n", err.errMsg.c_str());
+        return ERR_FATAL;
+      }
+      if (showDetails) {
+        Utils::PrintResults(ev, 1, transfers, results);
+        Utils::Print("\n");
+      }
+
+      // Per-group bandwidth table
+      std::vector<std::vector<double>> groupBw(groupSize, std::vector<double>(groupSize, -1.0));
+      for (int localI = 0; localI < groupSize; localI++) {
+        for (int localJ = 0; localJ < groupSize; localJ++) {
+          int const k = groupReIndex[localI][localJ];
+          if (k >= 0)
+            groupBw[localI][localJ] = results.tfrResults[k].avgBandwidthGbPerSec;
+        }
+      }
+      if (Utils::RankDoesOutput()) {
+        Utils::Print("\n--- Pod AllToAll Group %d ---\n", group);
+        int const groupBase = group * groupSize;
+        int const numRows = 2 + groupSize;
+        int const numCols = 2 + groupSize;
+        int const precision = 2;
+        Utils::TableHelper table(numRows, numCols, precision);
+        table.DrawRowBorder(0);
+        table.DrawColBorder(0);
+        table.DrawColBorder(numCols);
+        table.DrawRowBorder(numRows);
+        table.Set(0, 0, useRemoteRead ?
" SRC\\DST+EXE " : " SRC+EXE\\DST "); + table.DrawRowBorder(1); + table.DrawColBorder(1); + table.Set(1, 1, " Mem Device "); + + // Column headers + int colPrevRank = -1; + for (int j = 0; j < groupSize; j++) { + int colIdx = 2 + j; + int r = devices[groupBase + j].memRank; + if (r != colPrevRank) { + table.DrawColBorder(colIdx); + table.Set(0, colIdx, " Rank %02d ", r); + colPrevRank = r; + } + table.Set(1, colIdx, " GPU %02d ", devices[groupBase + j].memIndex); + } + + // Row headers and data + int rowPrevRank = -1; + for (int localI = 0; localI < groupSize; localI++) { + int rowIdx = 2 + localI; + int r = devices[groupBase + localI].memRank; + if (r != rowPrevRank) { + table.DrawRowBorder(rowIdx); + table.Set(rowIdx, 0, " Rank %02d ", r); + rowPrevRank = r; + } + table.Set(rowIdx, 1, " GPU %02d ", devices[groupBase + localI].memIndex); + for (int localJ = 0; localJ < groupSize; localJ++) { + if (groupBw[localI][localJ] >= 0) + table.Set(rowIdx, 2 + localJ, " %.2f ", groupBw[localI][localJ]); + else + table.Set(rowIdx, 2 + localJ, " N/A "); + } + } + table.PrintTable(ev.outputToCsv, ev.showBorders); + } + } + } + + if (!Utils::RankDoesOutput()) return 0; + + if (Utils::HasDuplicateHostname()) { + printf("[WARN] It is recommended to run TransferBench with one rank per host to avoid potential aliasing of executors\n"); + } + + return ERR_NONE; +} diff --git a/src/client/Presets/PodPeerToPeer.hpp b/src/client/Presets/PodPeerToPeer.hpp new file mode 100644 index 00000000..fe1cc775 --- /dev/null +++ b/src/client/Presets/PodPeerToPeer.hpp @@ -0,0 +1,300 @@ +/* +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +int PodPeerToPeerPreset(EnvVars& ev, + size_t const numBytesPerTransfer, + std::string const presetName, + bool const bytesSpecified) +{ + if (Utils::GetNumRankGroups() > 1) { + Utils::Print("[ERROR] Pod p2p preset can only be run across ranks that are homogenous\n"); + Utils::Print("[ERROR] All ranks currently have to be under the same physical and virtual pod\n"); + Utils::Print("[ERROR] Run ./TransferBench without any args to display topology information\n"); + return ERR_FATAL; + } + + Utils::RankPerPodMap& rankToPod = Utils::GetRankPerPodMap(); + if (rankToPod.empty()) { + Utils::Print("[ERROR] No pods detected. 
Set TB_FORCE_SINGLE_POD=1 to treat all ranks as a single pod.\n");
+    return ERR_FATAL;
+  }
+
+  int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
+
+  // Collect env vars for this preset
+  int useDmaCopy     = EnvVars::GetEnvVar("USE_GPU_DMA", 0);
+  int gpuMemTypeIdx  = EnvVars::GetEnvVar("GPU_MEM_TYPE", 0);
+  int numGpuDevices  = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
+  int numGpuSubExecs = EnvVars::GetEnvVar("NUM_GPU_SE", useDmaCopy ? 1 : TransferBench::GetNumSubExecutors({EXE_GPU_GFX, 0}));
+  int p2pMode        = EnvVars::GetEnvVar("P2P_MODE", 0);
+  int parallelLevel  = EnvVars::GetEnvVar("PARALLEL_LVL", 0);
+  int useRemoteRead  = EnvVars::GetEnvVar("USE_REMOTE_READ", 0);
+  int showFullMatrix = EnvVars::GetEnvVar("OUTPUT_FORMAT", 1);
+
+  MemType gpuMemType = Utils::GetGpuMemType(gpuMemTypeIdx);
+
+  // Display environment variables
+  if (Utils::RankDoesOutput()) {
+    ev.DisplayEnvVars();
+    if (!ev.hideEnv) {
+      int outputToCsv = ev.outputToCsv;
+      if (!outputToCsv) printf("[P2P Related]\n");
+      ev.Print("GPU_MEM_TYPE"   , gpuMemTypeIdx , "Using %s (%s)", Utils::GetGpuMemTypeStr(gpuMemTypeIdx).c_str(), Utils::GetAllGpuMemTypeStr().c_str());
+      ev.Print("NUM_GPU_DEVICES", numGpuDevices , "Using %d GPUs per rank", numGpuDevices);
+      ev.Print("NUM_GPU_SE"     , numGpuSubExecs, "Using %d GPU subexecutors/CUs per Transfer", numGpuSubExecs);
+      ev.Print("P2P_MODE"       , p2pMode       , "Running %s transfers", p2pMode == 0 ? "Uni + Bi" :
+                                                                          p2pMode == 1 ? "Unidirectional"
+                                                                                       : "Bidirectional");
+      ev.Print("PARALLEL_LVL"   , parallelLevel , "Executing p2p in parallel level %d (0: no parallel, 1: node pairs in parallel)", parallelLevel);
+      ev.Print("USE_GPU_DMA"    , useDmaCopy    , "Using GPU-%s as GPU executor", useDmaCopy ? "DMA" : "GFX");
+      ev.Print("USE_REMOTE_READ", useRemoteRead , "Using %s as executor", useRemoteRead ? "DST" : "SRC");
+      printf("\n");
+    }
+  }
+
+  char const separator = ev.outputToCsv ? ',' : ' ';
+  Utils::Print("Bytes Per Direction%c%lu\n", separator, numBytesPerTransfer);
+
+  TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
+  TransferBench::TestResults results;
+
+  for (auto const& [pod, ranks] : rankToPod) {
+    // Add all devices in a pod
+    int n = ranks.size() * numGpuDevices;
+
+    std::vector<MemDevice> devices(n);
+    int idx = 0;
+    for (int rank : ranks) {
+      for (int devIdx = 0; devIdx < numGpuDevices; devIdx++) {
+        devices[idx++] = {gpuMemType, devIdx, rank};
+      }
+    }
+
+    // Build reverse map: (memRank, memIndex) -> device index
+    std::map<std::pair<int, int>, int> deviceLookup;
+    for (int i = 0; i < n; i++)
+      deviceLookup[{devices[i].memRank, devices[i].memIndex}] = i;
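+    // Reverse-lookup sketch (hypothetical layout): with 2 ranks x 8 GPUs, devices[] holds 16
+    // entries in rank-major order, so deviceLookup[{1, 3}] (rank 1, GPU 3) returns index 11.
+    // This is used after each round to map a transfer's endpoints back into the bandwidth matrix.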
"DMA" : "GFX"); + + std::vector avgBandwidth(n * n, 0.0); + + // Build rounds of transfers; all transfers in a round run in parallel + std::vector>> rounds; + + if (parallelLevel == 0) { + for (int i = 0; i < n; i++) { + for (int j = 0; j < n; j++) { + if (isBidirectional && i == j) continue; + std::vector> pairs; + pairs.push_back({devices[i], devices[j]}); + if (isBidirectional) + pairs.push_back({devices[j], devices[i]}); + rounds.push_back(std::move(pairs)); + } + } + } else { + // parallelLevel == 1: node pairs run concurrently, one device pair at a time per node pair + std::vector>> nodePairSchedule; + Utils::RoundRobinSchedule(nodePairSchedule, (int)ranks.size(), 1); + + for (auto const& roundNodePairs : nodePairSchedule) { + for (int srcDev = 0; srcDev < numGpuDevices; srcDev++) { + for (int dstDev = 0; dstDev < numGpuDevices; dstDev++) { + std::vector> pairs; + for (auto const& [rankIdxA, rankIdxB] : roundNodePairs) { + int const rA = ranks[rankIdxA]; + int const rB = ranks[rankIdxB]; + if (isBidirectional && rA == rB && srcDev == dstDev) continue; + pairs.push_back({{gpuMemType, srcDev, rA}, {gpuMemType, dstDev, rB}}); + if (isBidirectional) + pairs.push_back({{gpuMemType, dstDev, rB}, {gpuMemType, srcDev, rA}}); + } + if (!pairs.empty()) + rounds.push_back(std::move(pairs)); + } + } + } + } + + // Execute rounds and collect results + for (auto const& round : rounds) { + std::vector transfers; + for (auto const& [src, dst] : round) { + TransferBench::Transfer transfer; + transfer.numBytes = numBytesPerTransfer; + transfer.srcs.push_back(src); + transfer.dsts.push_back(dst); + transfer.exeDevice = { gpuExeType, + useRemoteRead ? (int32_t)dst.memIndex : (int32_t)src.memIndex, + useRemoteRead ? (int32_t)dst.memRank : (int32_t)src.memRank }; + transfer.exeSubIndex = -1; + transfer.numSubExecs = numGpuSubExecs; + transfers.push_back(transfer); + } + if (!TransferBench::RunTransfers(cfg, transfers, results)) { + for (auto const& err : results.errResults) + Utils::Print("%s\n", err.errMsg.c_str()); + return ERR_FATAL; + } + + for (size_t k = 0; k < round.size(); k++) { + auto const& [src, dst] = round[k]; + int i = deviceLookup[{src.memRank, src.memIndex}]; + int j = deviceLookup[{dst.memRank, dst.memIndex}]; + avgBandwidth[i * n + j] = results.tfrResults[k].avgBandwidthGbPerSec; + } + } + + // Output results + int const rowsPerSrc = isBidirectional ? 3 : 1; + int const rowStride = isBidirectional ? rowsPerSrc + 1 : rowsPerSrc; + int const numRows = showFullMatrix ? 2 + n * rowStride - (isBidirectional ? 1 : 0) + : 1 + n * n * rowsPerSrc; + int const numCols = showFullMatrix ? 2 + n : (isBidirectional ? 6 : 5); + int const precision = 2; + Utils::TableHelper table(numRows, numCols, precision); + + table.DrawRowBorder(0); + table.DrawColBorder(0); + table.DrawColBorder(numCols); + table.DrawRowBorder(numRows); + + if (showFullMatrix) { + if (isBidirectional) + table.Set(0, 0, " SRC\\DST "); + else + table.Set(0, 0, useRemoteRead ? 
" SRC\\DST+EXE " : " SRC+EXE\\DST "); + table.DrawRowBorder(1); + table.DrawColBorder(1); + table.Set(1, 1, " Mem Device "); + + int colPrevRank = -1; + for (int j = 0; j < n; j++) { + int colIdx = 2 + j; + int r = devices[j].memRank; + if (r != colPrevRank) { + table.DrawColBorder(colIdx); + table.Set(0, colIdx, " Rank %02d ", r); + colPrevRank = r; + } + table.Set(1, colIdx, " GPU %02d ", devices[j].memIndex); + } + + int rowPrevRank = -1; + for (int i = 0; i < n; i++) { + int r = devices[i].memRank; + int baseRow = 2 + i * rowStride; + if (r != rowPrevRank) { + table.DrawRowBorder(baseRow); + table.Set(baseRow, 0, " Rank %02d ", r); + rowPrevRank = r; + } + + for (int dir = 0; dir < rowsPerSrc; dir++) { + int rowIdx = baseRow + dir; + if (isBidirectional) { + char const* arrow = (dir == 0) ? " ->" : (dir == 1) ? "<- " : "<->"; + table.Set(rowIdx, 1, " GPU %02d %s ", devices[i].memIndex, arrow); + } else { + table.Set(rowIdx, 1, " GPU %02d ", devices[i].memIndex); + } + + for (int j = 0; j < n; j++) { + double fwd = avgBandwidth[i * n + j]; + double rev = avgBandwidth[j * n + i]; + double val = (dir == 0) ? fwd : (dir == 1) ? rev : fwd + rev; + if (val == 0.0) + table.Set(rowIdx, 2 + j, " N/A "); + else + table.Set(rowIdx, 2 + j, " %.2f ", val); + } + } + } + } else { + table.Set(0, 0, " SRC Rank "); + table.Set(0, 1, " SRC MEM "); + if (isBidirectional) { + table.Set(0, 2, " Dir "); + table.Set(0, 3, " DST Rank "); + table.Set(0, 4, " DST MEM "); + table.Set(0, 5, " bw (GB/s) "); + table.DrawColBorder(3); + table.DrawColBorder(5); + } else { + table.Set(0, 2, " DST Rank "); + table.Set(0, 3, " DST MEM "); + table.Set(0, 4, " bw (GB/s) "); + table.DrawColBorder(2); + table.DrawColBorder(4); + } + int rowIdx = 1; + for (int i = 0; i < n; i++) { + table.DrawRowBorder(rowIdx); + for (int j = 0; j < n; j++) { + for (int dir = 0; dir < rowsPerSrc; dir++) { + double fwd = avgBandwidth[i * n + j]; + double rev = avgBandwidth[j * n + i]; + double val = (dir == 0) ? fwd : (dir == 1) ? rev : fwd + rev; + if (isBidirectional) { + char const* arrow = (dir == 0) ? " -> " : (dir == 1) ? " <- " : " <-> "; + table.Set(rowIdx, 0, " Rank %02d ", devices[i].memRank); + table.Set(rowIdx, 1, " GPU %02d ", devices[i].memIndex); + table.Set(rowIdx, 2, arrow); + table.Set(rowIdx, 3, " Rank %02d ", devices[j].memRank); + table.Set(rowIdx, 4, " GPU %02d ", devices[j].memIndex); + if (val == 0.0) + table.Set(rowIdx, 5, " N/A "); + else + table.Set(rowIdx, 5, " %.2f ", val); + } else { + table.Set(rowIdx, 0, " Rank %02d ", devices[i].memRank); + table.Set(rowIdx, 1, " GPU %02d ", devices[i].memIndex); + table.Set(rowIdx, 2, " Rank %02d ", devices[j].memRank); + table.Set(rowIdx, 3, " GPU %02d ", devices[j].memIndex); + if (val == 0.0) + table.Set(rowIdx, 4, " N/A "); + else + table.Set(rowIdx, 4, " %.2f ", val); + } + rowIdx++; + } + } + } + } + table.PrintTable(ev.outputToCsv, ev.showBorders); + } + + } + return ERR_NONE; +} diff --git a/src/client/Presets/Presets.hpp b/src/client/Presets/Presets.hpp index de1001fa..43631a45 100644 --- a/src/client/Presets/Presets.hpp +++ b/src/client/Presets/Presets.hpp @@ -22,6 +22,7 @@ THE SOFTWARE. #pragma once #include +#include // EnvVars is available to all presets #include "EnvVars.hpp" @@ -30,41 +31,75 @@ THE SOFTWARE. 
#include "AllToAll.hpp" #include "AllToAllN.hpp" #include "AllToAllSweep.hpp" +#include "BmaSweep.hpp" +#include "EnvVarsList.hpp" +#include "GfxSweep.hpp" +#include "HbmBandwidth.hpp" #include "HealthCheck.hpp" +#include "Help.hpp" +#include "NicAllToAll.hpp" #include "NicRings.hpp" #include "NicPeerToPeer.hpp" #include "OneToAll.hpp" #include "PeerToPeer.hpp" +#include "PodAllToAll.hpp" +#include "PodPeerToPeer.hpp" +#include "Rings.hpp" #include "Scaling.hpp" #include "Schmoo.hpp" +#include "SmokeTest.hpp" #include "Sweep.hpp" +#include "WallClock.hpp" typedef int (*PresetFunc)(EnvVars& ev, size_t const numBytesPerTransfer, - std::string const presetName); + std::string const presetName, + [[maybe_unused]] bool const bytesSpecified); -std::map> presetFuncMap = +struct PresetInfo +{ + PresetFunc func; + std::string description; +}; + +std::map presetFuncMap = { {"a2a", {AllToAllPreset, "Tests parallel transfers between all pairs of GPU devices"}}, {"a2a_n", {AllToAllRdmaPreset, "Tests parallel transfers between all pairs of GPU devices using Nearest NIC RDMA transfers"}}, {"a2asweep", {AllToAllSweepPreset, "Test GFX-based all-to-all transfers swept across different CU and GFX unroll counts"}}, + {"bmasweep", {BmaSweepPreset, "Test and compare batched DMA executor for multi destination copies"}}, + {"envvars", {EnvVarsPreset, "Show list of environment variables that can be used to modify behavior"}}, + {"gfxsweep", {GfxSweepPreset, "Sweep over various GFX kernel options for a given GFX Transfer"}}, + {"hbm", {HbmBandwidthPreset, "Tests HBM bandwidth"}}, {"healthcheck", {HealthCheckPreset, "Simple bandwidth health check (MI300X series only)"}}, - {"nicrings", {NicRingsPreset, "Tests NIC rings created across identical NIC indices across ranks"}}, + {"help", {HelpPreset, "Shows example usage details"}}, + {"nica2a", {NicAllToAllPreset, "All-to-all GPU traffic over NIC transfers using each NIC's closest GPU/CPU endpoint"}}, {"nicp2p", {NicPeerToPeerPreset, "Multi-node peer-to-peer RDMA transfer test between all NICs"}}, + {"nicrings", {NicRingsPreset, "Tests NIC rings created across identical NIC indices across ranks"}}, {"one2all", {OneToAllPreset, "Test all subsets of parallel transfers from one GPU to all others"}}, {"p2p" , {PeerToPeerPreset, "Peer-to-peer device memory bandwidth test"}}, + {"poda2a", {PodAllToAllPreset, "All-to-all transfers between subgroups of ranks within a pod"}}, + {"podp2p", {PodPeerToPeerPreset, "Peer-to-peer transfers test among ranks within a pod"}}, + {"rings", {RingsPreset, "Ring transfers within subgroups of ranks in a pod"}}, {"rsweep", {SweepPreset, "Randomly sweep through sets of Transfers"}}, {"scaling", {ScalingPreset, "Run scaling test from one GPU to other devices"}}, {"schmoo", {SchmooPreset, "Scaling tests for local/remote read/write/copy"}}, + {"smoketest", {SmokeTestPreset, "Simple correctness smoke-test"}}, {"sweep", {SweepPreset, "Ordered sweep through sets of Transfers"}}, + {"wallclock", {WallClockPreset, "Tests wallclock consistency across XCCs within a GPU"}}, }; void DisplayPresets() { - printf("\nAvailable Preset Benchmarks:\n"); - printf("============================\n"); - for (auto const& x : presetFuncMap) - printf(" %15s - %s\n", x.first.c_str(), x.second.second.c_str()); + if (!Utils::RankDoesOutput()) return; + printf(" %-12s | %-56s\n", "Preset", "Description"); + printf("=============================================================================================================\n"); + for (auto const& x : presetFuncMap) { + 
printf(" %-12s | %-56s\n", + x.first.c_str(), + x.second.description.c_str()); + } + printf("=============================================================================================================\n"); } int RunPreset(EnvVars& ev, @@ -74,8 +109,15 @@ int RunPreset(EnvVars& ev, int& retCode) { std::string preset = (argc > 1 ? argv[1] : ""); + bool bytesSpecified = (argc > 2); + + if (preset == "presets") { + DisplayPresets(); + retCode = 0; + return 1; + } if (presetFuncMap.count(preset)) { - retCode = (presetFuncMap[preset].first)(ev, numBytesPerTransfer, preset); + retCode = (presetFuncMap[preset].func)(ev, numBytesPerTransfer, preset, bytesSpecified); return 1; } return 0; diff --git a/src/client/Presets/Rings.hpp b/src/client/Presets/Rings.hpp new file mode 100644 index 00000000..bee03055 --- /dev/null +++ b/src/client/Presets/Rings.hpp @@ -0,0 +1,280 @@ +/* +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +int RingsPreset(EnvVars& ev, + size_t const numBytesPerTransfer, + std::string const presetName, + bool const bytesSpecified) +{ + // Check for homogeneous ranks + if (Utils::GetNumRankGroups() > 1) { + Utils::Print("[ERROR] rings preset can only be run across ranks that are homogeneous\n"); + Utils::Print("[ERROR] Run ./TransferBench without any args to display topology information\n"); + Utils::Print("[ERROR] TB_NIC_FILTER may also be used to limit NIC visibility\n"); + return 1; + } + + // Check for pod support (if multi-node) + int numRanks = TransferBench::GetNumRanks(); + if (numRanks > 1 && Utils::GetRankPerPodMap().empty()) { + Utils::Print("[ERROR] No pods detected. Set TB_FORCE_SINGLE_POD=1 to treat all ranks as a single pod.\n"); + return 1; + } + + int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX); + + int memTypeIdx = EnvVars::GetEnvVar("MEM_TYPE" , 0); + int numGpus = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus); + int numQueuePairs = EnvVars::GetEnvVar("NUM_QUEUE_PAIRS", 0); + int numSubExecs = EnvVars::GetEnvVar("NUM_SUB_EXEC" , 8); + int showDetails = EnvVars::GetEnvVar("SHOW_DETAILS" , 0); + int useDmaExec = EnvVars::GetEnvVar("USE_DMA_EXEC" , 0); + int useRemoteRead = EnvVars::GetEnvVar("USE_REMOTE_READ", 0); + int stride = EnvVars::GetEnvVar("STRIDE" , 1); + int groupSize = EnvVars::GetEnvVar("GROUP_SIZE" , numRanks * numGpus); + + if (numGpus <= 0 || numGpus > numDetectedGpus) { + Utils::Print("[ERROR] Cannot use %d GPUs. 
Detected %d GPUs\n", numGpus, numDetectedGpus); + return 1; + } + if (groupSize <= 0) { + Utils::Print("[ERROR] Group size must be greater than 0\n"); + return 1; + } + if (numRanks * numGpus % groupSize) { + Utils::Print("[ERROR] Group size %d cannot evenly divide %d total devices from %d ranks.\n", + groupSize, numRanks * numGpus, numRanks); + return 1; + } + + int numNics = TransferBench::GetNumExecutors(EXE_NIC, 0); + bool nicDifference = false; + for (int rank = 0; rank < numRanks; rank++) { + if (numGpus > TransferBench::GetNumExecutors(EXE_GPU_GFX, rank)) { + Utils::Print("[ERROR] rings preset requires each rank to have the same number of GPUs\n"); + return 1; + } + if (numQueuePairs > 0 && numNics != TransferBench::GetNumExecutors(EXE_NIC, rank)) + nicDifference = true; + } + if (nicDifference) + Utils::Print("[WARN] Not all ranks have the same number of NICs\n"); + + MemType memType = Utils::GetGpuMemType(memTypeIdx); + std::string devMemTypeStr = Utils::GetGpuMemTypeStr(memTypeIdx); + + if (Utils::RankDoesOutput()) { + ev.DisplayEnvVars(); + if (!ev.hideEnv) { + if (!ev.outputToCsv) printf("[Rings Related]\n"); + ev.Print("MEM_TYPE" , memTypeIdx , "Using %s GPU memory (%s)", devMemTypeStr.c_str(), Utils::GetAllGpuMemTypeStr().c_str()); + ev.Print("NUM_GPU_DEVICES", numGpus , "Using %d GPUs", numGpus); + ev.Print("NUM_QUEUE_PAIRS", numQueuePairs, "Using %d queue pairs for NIC transfers", numQueuePairs); + ev.Print("NUM_SUB_EXEC" , numSubExecs , "Using %d subexecutors/CUs per Transfer", numSubExecs); + ev.Print("USE_DMA_EXEC" , useDmaExec , "Using %s executor", useDmaExec ? "DMA" : "GFX"); + ev.Print("USE_REMOTE_READ", useRemoteRead, "Using %s as executor", useRemoteRead ? "DST" : "SRC"); + ev.Print("STRIDE" , stride , "Reordering devices by taking %d steps", stride); + ev.Print("GROUP_SIZE" , groupSize , "Dividing all devices into ring groups of %d", groupSize); + printf("\n"); + } + } + + Utils::Print("GPU-%s IntraPod Ring benchmark:\n", useDmaExec ? "DMA" : "GFX"); + Utils::Print("==============================\n"); + Utils::Print("[%lu bytes per Transfer] [%s:%d] [MemType:%s] [NIC QueuePairs:%d] [#Ranks:%d]\n", + numBytesPerTransfer, useDmaExec ? "DMA" : "GFX", numSubExecs, + devMemTypeStr.c_str(), numQueuePairs, numRanks); + + TransferBench::ConfigOptions cfg = ev.ToConfigOptions(); + ExeType exeType = useDmaExec ? 
EXE_GPU_DMA : EXE_GPU_GFX;
+
+  int n         = numRanks * numGpus;
+  int numGroups = n / groupSize;
+
+  std::vector<int> indices(n);
+  for (int k = 0; k < n; k++) indices[k] = k;
+  Utils::StrideGenerate(indices, stride);
+
+  std::vector<MemDevice> devices(n);
+  for (int i = 0; i < n; i++) {
+    int const globalIdx = indices[i];
+    int const rank      = globalIdx / numGpus;
+    int const devIdx    = globalIdx % numGpus;
+    devices[i] = {memType, devIdx, rank};
+  }
+
+  Utils::Print("%d ring(s) of %d devices:\n", numGroups, groupSize);
+  for (int group = 0; group < numGroups; group++) {
+    int const groupBase = group * groupSize;
+    Utils::Print("  Ring %d: ", group);
+    for (int i = 0; i < groupSize; i++) {
+      Utils::Print("R%d:G%d -> ", devices[groupBase + i].memRank, devices[groupBase + i].memIndex);
+    }
+    Utils::Print("R%d:G%d\n", devices[groupBase].memRank, devices[groupBase].memIndex);
+  }
+  Utils::Print("\n");
+
+  for (int group = 0; group < numGroups; group++) {
+    int const groupBase = group * groupSize;
+    std::vector<TransferBench::Transfer> transfers;
+
+    for (int i = 0; i < groupSize; i++) {
+      int srcIdx = groupBase + i;
+      int dstIdx = groupBase + (i + 1) % groupSize;
+
+      TransferBench::Transfer transfer;
+      transfer.numBytes = numBytesPerTransfer;
+      transfer.srcs.push_back(devices[srcIdx]);
+      transfer.dsts.push_back(devices[dstIdx]);
+      transfer.exeDevice = {exeType,
+                            (int32_t)(useRemoteRead ? devices[dstIdx].memIndex : devices[srcIdx].memIndex),
+                            (int32_t)(useRemoteRead ? devices[dstIdx].memRank  : devices[srcIdx].memRank)};
+      transfer.exeSubIndex = -1;
+      transfer.numSubExecs = numSubExecs;
+      transfers.push_back(transfer);
+
+      if (numQueuePairs > 0) {
+        TransferBench::Transfer nicTransfer;
+        nicTransfer.numBytes = numBytesPerTransfer;
+        nicTransfer.srcs.push_back(devices[srcIdx]);
+        nicTransfer.dsts.push_back(devices[dstIdx]);
+        nicTransfer.exeDevice = {TransferBench::EXE_NIC_NEAREST,
+                                 (int32_t)devices[srcIdx].memIndex, (int32_t)devices[srcIdx].memRank};
+        nicTransfer.exeSubIndex = devices[dstIdx].memIndex;
+        nicTransfer.numSubExecs = numQueuePairs;
+        transfers.push_back(nicTransfer);
+      }
+    }
+
+    TransferBench::TestResults results;
+    if (!TransferBench::RunTransfers(cfg, transfers, results)) {
+      for (auto const& err : results.errResults)
+        Utils::Print("%s\n", err.errMsg.c_str());
+      return 1;
+    }
+    if (showDetails) {
+      Utils::PrintResults(ev, 1, transfers, results);
+      Utils::Print("\n");
+    }
+
+    if (Utils::RankDoesOutput()) {
+      Utils::Print("\n--- Ring Group %d ---\n", group);
+
+      int const numHops   = groupSize;
+      int const numRows   = 2 + numHops + 3;
+      int const numCols   = 6;
+      int const precision = 2;
+      Utils::TableHelper table(numRows, numCols, precision);
+
+      table.DrawRowBorder(0);
+      table.DrawColBorder(0);
+      table.DrawColBorder(numCols);
+      table.DrawRowBorder(numRows);
+
+      table.Set(0, 0, " Src ");
+      table.Set(0, 1, " Src ");
+      table.Set(0, 2, " Dst ");
+      table.Set(0, 3, " Dst ");
+      table.Set(0, 4, " GFX BW ");
+      table.Set(1, 0, " Rank ");
+      table.Set(1, 1, " GPU ");
+      table.Set(1, 2, " Rank ");
+      table.Set(1, 3, " GPU ");
+      table.Set(1, 4, " (GB/s) ");
+      table.DrawColBorder(2);
+      table.DrawColBorder(4);
+
+      if (numQueuePairs > 0) {
+        table.Set(0, 5, " NIC BW ");
+        table.Set(1, 5, " (GB/s) ");
+      } else {
+        table.Set(0, 5, " ");
+        table.Set(1, 5, " ");
+      }
+
+      table.DrawRowBorder(2);
+
+      double gfxMin = std::numeric_limits<double>::max();
+      double gfxAvg = 0.0;
+      double gfxMax = std::numeric_limits<double>::lowest();
+      double nicMin = std::numeric_limits<double>::max();
+      double nicAvg = 0.0;
+      double nicMax = std::numeric_limits<double>::lowest();
+
+      int tfrIdx = 0;
+      for
(int i = 0; i < numHops; i++) { + int srcIdx = groupBase + i; + int dstIdx = groupBase + (i + 1) % groupSize; + int row = 2 + i; + + double gfxBw = results.tfrResults[tfrIdx].avgBandwidthGbPerSec; + tfrIdx++; + + table.Set(row, 0, " %d ", devices[srcIdx].memRank); + table.Set(row, 1, " %d ", devices[srcIdx].memIndex); + table.Set(row, 2, " %d ", devices[dstIdx].memRank); + table.Set(row, 3, " %d ", devices[dstIdx].memIndex); + table.Set(row, 4, " %.2f ", gfxBw); + + gfxMin = std::min(gfxMin, gfxBw); + gfxAvg += gfxBw; + gfxMax = std::max(gfxMax, gfxBw); + + if (numQueuePairs > 0) { + double nicBw = results.tfrResults[tfrIdx].avgBandwidthGbPerSec; + tfrIdx++; + table.Set(row, 5, " %.2f ", nicBw); + nicMin = std::min(nicMin, nicBw); + nicAvg += nicBw; + nicMax = std::max(nicMax, nicBw); + } + } + + int summaryBase = 2 + numHops; + table.DrawRowBorder(summaryBase); + table.Set(summaryBase , 1, " MAX "); + table.Set(summaryBase + 1, 1, " AVG "); + table.Set(summaryBase + 2, 1, " MIN "); + table.Set(summaryBase , 4, " %.2f ", gfxMax); + table.Set(summaryBase + 1, 4, " %.2f ", gfxAvg / numHops); + table.Set(summaryBase + 2, 4, " %.2f ", gfxMin); + + if (numQueuePairs > 0) { + table.Set(summaryBase , 5, " %.2f ", nicMax); + table.Set(summaryBase + 1, 5, " %.2f ", nicAvg / numHops); + table.Set(summaryBase + 2, 5, " %.2f ", nicMin); + } + + table.PrintTable(ev.outputToCsv, ev.showBorders); + + Utils::Print("Aggregate bandwidth (CPU Timed): %8.3f GB/s\n", results.avgTotalBandwidthGbPerSec); + } + } + + if (!Utils::RankDoesOutput()) return 0; + + if (Utils::HasDuplicateHostname()) { + printf("[WARN] It is recommended to run TransferBench with one rank per host to avoid potential aliasing of executors\n"); + } + + return 0; +} diff --git a/src/client/Presets/Scaling.hpp b/src/client/Presets/Scaling.hpp index c654943e..5c6879ce 100644 --- a/src/client/Presets/Scaling.hpp +++ b/src/client/Presets/Scaling.hpp @@ -20,41 +20,50 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/
-int ScalingPreset(EnvVars& ev,
-                  size_t const numBytesPerTransfer,
-                  std::string const presetName)
+int ScalingPreset(EnvVars& ev,
+                  size_t const numBytesPerTransfer,
+                  std::string const presetName,
+                  bool const bytesSpecified)
 {
   if (TransferBench::GetNumRanks() > 1) {
     Utils::Print("[ERROR] Scaling preset currently not supported for multi-node\n");
-    return 1;
+    return ERR_FATAL;
   }
 
   int numDetectedCpus = TransferBench::GetNumExecutors(EXE_CPU);
   int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
 
   // Collect env vars for this preset
+  int cpuMemTypeIdx = EnvVars::GetEnvVar("CPU_MEM_TYPE", 0);
+  int gpuMemTypeIdx = EnvVars::GetEnvVar("GPU_MEM_TYPE", 0);
   int localIdx      = EnvVars::GetEnvVar("LOCAL_IDX", 0);
   int numCpuDevices = EnvVars::GetEnvVar("NUM_CPU_DEVICES", numDetectedCpus);
   int numGpuDevices = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
   int sweepMax      = EnvVars::GetEnvVar("SWEEP_MAX", 32);
   int sweepMin      = EnvVars::GetEnvVar("SWEEP_MIN", 1);
-  int useFineGrain  = EnvVars::GetEnvVar("USE_FINE_GRAIN", 0);
 
   // Display environment variables
+  MemType cpuMemType = Utils::GetCpuMemType(cpuMemTypeIdx);
+  MemType gpuMemType = Utils::GetGpuMemType(gpuMemTypeIdx);
+
   ev.DisplayEnvVars();
   if (!ev.hideEnv) {
     int outputToCsv = ev.outputToCsv;
-    if (!outputToCsv) printf("[Schmoo Related]\n");
-    ev.Print("LOCAL_IDX", localIdx, "Local GPU index");
-    ev.Print("SWEEP_MAX", sweepMax, "Max number of subExecutors to use");
-    ev.Print("SWEEP_MIN", sweepMin, "Min number of subExecutors to use");
+    if (!outputToCsv) printf("[Scaling Related]\n");
+    ev.Print("CPU_MEM_TYPE"   , cpuMemTypeIdx, "Using %s (%s)", Utils::GetCpuMemTypeStr(cpuMemTypeIdx).c_str(), Utils::GetAllCpuMemTypeStr().c_str());
+    ev.Print("GPU_MEM_TYPE"   , gpuMemTypeIdx, "Using %s (%s)", Utils::GetGpuMemTypeStr(gpuMemTypeIdx).c_str(), Utils::GetAllGpuMemTypeStr().c_str());
+    ev.Print("LOCAL_IDX"      , localIdx     , "Local GPU index");
+    ev.Print("NUM_CPU_DEVICES", numCpuDevices, "Using %d CPUs", numCpuDevices);
+    ev.Print("NUM_GPU_DEVICES", numGpuDevices, "Using %d GPUs", numGpuDevices);
+    ev.Print("SWEEP_MAX"      , sweepMax     , "Max number of subExecutors to use");
+    ev.Print("SWEEP_MIN"      , sweepMin     , "Min number of subExecutors to use");
     printf("\n");
   }
 
   // Validate env vars
   if (localIdx >= numDetectedGpus) {
     printf("[ERROR] Cannot execute scaling test with local GPU device %d\n", localIdx);
-    return 1;
+    return ERR_FATAL;
   }
 
   TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
@@ -74,25 +83,23 @@ int ScalingPreset(EnvVars& ev,
 
   std::vector<std::pair<double, int>> bestResult(numDevices);
 
-  MemType memType = useFineGrain ? MEM_GPU_FINE : MEM_GPU;
-
   std::vector<Transfer> transfers(1);
   Transfer& t = transfers[0];
   t.exeDevice   = {EXE_GPU_GFX, localIdx};
   t.exeSubIndex = -1;
   t.numBytes    = numBytesPerTransfer;
-  t.srcs        = {{memType, localIdx}};
+  t.srcs        = {{gpuMemType, localIdx}};
 
   for (int numSubExec = sweepMin; numSubExec <= sweepMax; numSubExec++) {
     t.numSubExecs = numSubExec;
     printf("%4d ", numSubExec);
     for (int i = 0; i < numDevices; i++) {
-      t.dsts = {{i < numCpuDevices ? MEM_CPU : MEM_GPU,
+      t.dsts = {{i < numCpuDevices ? cpuMemType : gpuMemType,
                  i < numCpuDevices ?
i : i - numCpuDevices}}; if (!RunTransfers(cfg, transfers, results)) { Utils::PrintErrors(results.errResults); - return 1; + return ERR_FATAL; } double bw = results.tfrResults[0].avgBandwidthGbPerSec; printf("%c%7.2f ", separator, bw); @@ -109,5 +116,6 @@ int ScalingPreset(EnvVars& ev, for (int i = 0; i < numDevices; i++) printf("%c%7.2f(%3d)", separator, bestResult[i].first, bestResult[i].second); printf("\n"); - return 0; + + return ERR_NONE; } diff --git a/src/client/Presets/Schmoo.hpp b/src/client/Presets/Schmoo.hpp index 71576ef8..bdd24b66 100644 --- a/src/client/Presets/Schmoo.hpp +++ b/src/client/Presets/Schmoo.hpp @@ -19,52 +19,55 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -int SchmooPreset(EnvVars& ev, - size_t const numBytesPerTransfer, - std::string const presetName) +int SchmooPreset(EnvVars& ev, + size_t const numBytesPerTransfer, + std::string const presetName, + bool const bytesSpecified) { if (TransferBench::GetNumRanks() > 1) { Utils::Print("[ERROR] Schmoo preset currently not supported for multi-node\n"); - return 1; + return ERR_FATAL; } int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX); if (numDetectedGpus < 2) { printf("[ERROR] Schmoo benchmark requires at least 2 GPUs\n"); - return 1; + return ERR_FATAL; } // Collect env vars for this preset - int localIdx = EnvVars::GetEnvVar("LOCAL_IDX", 0); - int remoteIdx = EnvVars::GetEnvVar("REMOTE_IDX", 1); - int sweepMax = EnvVars::GetEnvVar("SWEEP_MAX", 32); - int sweepMin = EnvVars::GetEnvVar("SWEEP_MIN", 1); - int useFineGrain = EnvVars::GetEnvVar("USE_FINE_GRAIN", 0); + int gpuMemTypeIdx = EnvVars::GetEnvVar("GPU_MEM_TYPE", 0); + int localIdx = EnvVars::GetEnvVar("LOCAL_IDX", 0); + int remoteIdx = EnvVars::GetEnvVar("REMOTE_IDX", 1); + int sweepMax = EnvVars::GetEnvVar("SWEEP_MAX", 32); + int sweepMin = EnvVars::GetEnvVar("SWEEP_MIN", 1); + + MemType gpuMemType = Utils::GetGpuMemType(gpuMemTypeIdx); // Display environment variables ev.DisplayEnvVars(); if (!ev.hideEnv) { int outputToCsv = ev.outputToCsv; if (!outputToCsv) printf("[Schmoo Related]\n"); + ev.Print("GPU_MEM_TYPE" , gpuMemTypeIdx, "Using %s (%s)", Utils::GetGpuMemTypeStr(gpuMemTypeIdx).c_str(), Utils::GetAllGpuMemTypeStr().c_str()); ev.Print("LOCAL_IDX", localIdx, "Local GPU index"); ev.Print("REMOTE_IDX", remoteIdx, "Remote GPU index"); ev.Print("SWEEP_MAX", sweepMax, "Max number of subExecutors to use"); ev.Print("SWEEP_MIN", sweepMin, "Min number of subExecutors to use"); - ev.Print("USE_FINE_GRAIN", useFineGrain, "Using %s-grained memory", useFineGrain ? "fine" : "coarse"); printf("\n"); } // Validate env vars if (localIdx >= numDetectedGpus || remoteIdx >= numDetectedGpus) { printf("[ERROR] Cannot execute schmoo test with local GPU device %d, remote GPU device %d\n", localIdx, remoteIdx); - return 1; + return ERR_FATAL; } TransferBench::ConfigOptions cfg = ev.ToConfigOptions(); TransferBench::TestResults results; - char memChar = useFineGrain ? 
'F' : 'G'; + char memChar = MemTypeStr[gpuMemType]; printf("Bytes to transfer: %lu Local GPU: %d Remote GPU: %d\n", numBytesPerTransfer, localIdx, remoteIdx); printf(" | Local Read | Local Write | Local Copy | Remote Read | Remote Write| Remote Copy |\n"); printf(" #CUs |%c%02d->G%02d->N00|N00->G%02d->%c%02d|%c%02d->G%02d->%c%02d|%c%02d->G%02d->N00|N00->G%02d->%c%02d|%c%02d->G%02d->%c%02d|\n", @@ -82,69 +85,65 @@ int SchmooPreset(EnvVars& ev, t.exeSubIndex = -1; t.numBytes = numBytesPerTransfer; - MemType memType = (useFineGrain ? MEM_GPU_FINE : MEM_GPU); - for (int numCUs = sweepMin; numCUs <= sweepMax; numCUs++) { t.numSubExecs = numCUs; // Local Read - t.srcs = {{memType, localIdx}}; + t.srcs = {{gpuMemType, localIdx}}; t.dsts = {}; if (!TransferBench::RunTransfers(cfg, transfers, results)) { Utils::PrintErrors(results.errResults); - return 1; + return ERR_FATAL; } double const localRead = results.tfrResults[0].avgBandwidthGbPerSec; // Local Write t.srcs = {}; - t.dsts = {{memType, localIdx}}; + t.dsts = {{gpuMemType, localIdx}}; if (!TransferBench::RunTransfers(cfg, transfers, results)) { Utils::PrintErrors(results.errResults); - return 1; + return ERR_FATAL; } double const localWrite = results.tfrResults[0].avgBandwidthGbPerSec; // Local Copy - t.srcs = {{memType, localIdx}}; - t.dsts = {{memType, localIdx}}; - t.srcs = {}; - t.dsts = {{memType, localIdx}}; + t.srcs = {{gpuMemType, localIdx}}; + t.dsts = {{gpuMemType, localIdx}}; if (!TransferBench::RunTransfers(cfg, transfers, results)) { Utils::PrintErrors(results.errResults); - return 1; + return ERR_FATAL; } double const localCopy = results.tfrResults[0].avgBandwidthGbPerSec; // Remote Read - t.srcs = {{memType, remoteIdx}}; + t.srcs = {{gpuMemType, remoteIdx}}; t.dsts = {}; if (!TransferBench::RunTransfers(cfg, transfers, results)) { Utils::PrintErrors(results.errResults); - return 1; + return ERR_FATAL; } double const remoteRead = results.tfrResults[0].avgBandwidthGbPerSec; // Remote Write t.srcs = {}; - t.dsts = {{memType, remoteIdx}}; + t.dsts = {{gpuMemType, remoteIdx}}; if (!TransferBench::RunTransfers(cfg, transfers, results)) { Utils::PrintErrors(results.errResults); - return 1; + return ERR_FATAL; } double const remoteWrite = results.tfrResults[0].avgBandwidthGbPerSec; // Remote Copy - t.srcs = {{memType, localIdx}}; - t.dsts = {{memType, remoteIdx}}; + t.srcs = {{gpuMemType, localIdx}}; + t.dsts = {{gpuMemType, remoteIdx}}; if (!TransferBench::RunTransfers(cfg, transfers, results)) { Utils::PrintErrors(results.errResults); - return 1; + return ERR_FATAL; } double const remoteCopy = results.tfrResults[0].avgBandwidthGbPerSec; printf(" %3d %11.3f %11.3f %11.3f %11.3f %11.3f %11.3f \n", numCUs, localRead, localWrite, localCopy, remoteRead, remoteWrite, remoteCopy); } - return 0; + return ERR_NONE; } diff --git a/src/client/Presets/SmokeTest.hpp b/src/client/Presets/SmokeTest.hpp new file mode 100644 index 00000000..9628823c --- /dev/null +++ b/src/client/Presets/SmokeTest.hpp @@ -0,0 +1,336 @@ +/* +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. 
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <set>
+
+namespace {
+
+#define NUM_SMOKE_TESTS     14
+#define MAX_TRANSFER_STRLEN 128
+
+// What to print on pass/fail/skip
+const std::string pass = "*";
+const std::string fail = "F";
+const std::string skip = ".";
+
+int RunTest(int testNum,
+            std::set<int> const& testsToRun,
+            std::vector<size_t> const& sizeList,
+            int numSubExecPerGpu,
+            ConfigOptions& cfg,
+            MemType cpuMemType,
+            MemType gpuMemType,
+            size_t maxBytesPerSubExec,
+            int totalGpus)
+{
+  int numFail = 0;
+
+  // Collect some topology information
+  int numRanks = TransferBench::GetNumRanks();
+
+  std::vector<Transfer> transfers;
+  std::vector<Transfer> allTransfers;
+  TestResults results;
+  char transferStr[MAX_TRANSFER_STRLEN] = {};
+
+  // Different test categories
+  bool isH2D       = (testNum == 1 || testNum ==  8);
+  bool isD2H       = (testNum == 2 || testNum ==  9);
+  bool isD2D_RW    = (testNum == 3 || testNum == 10);
+  bool isD2D_RR    = (testNum == 4 || testNum == 11);
+  bool isBroadcast = (testNum == 5 || testNum == 12);
+  bool isGather    = (testNum == 6 || testNum == 13);
+  bool isAllToAll  = (testNum == 7 || testNum == 14);
+
+  // Determine executor type
+  ExeType exeType;
+  if      (1 <= testNum && testNum <=  7) exeType = EXE_GPU_DMA;
+  else if (8 <= testNum && testNum <= 14) exeType = EXE_GPU_GFX;
+  else {
+    Utils::Print("[ERROR] Unsupported test number %d\n", testNum);
+    exit(1);
+  }
+
+  // Adjust number of subexecutors per transfer if performing multiple transfers
+  int numSubExec = exeType == EXE_GPU_DMA ?
1 : numSubExecPerGpu;
+  if (exeType == EXE_GPU_GFX && (isBroadcast || isGather || isAllToAll))
+    numSubExec = std::max(1, numSubExecPerGpu / totalGpus);
+
+  for (size_t numBytes : sizeList) {
+
+    // Print skip symbol for skipped tests
+    if (!testsToRun.count(testNum)) {
+      Utils::Print("%s", skip.c_str()); fflush(stdout);
+      continue;
+    }
+    if (exeType == EXE_GPU_GFX &&
+        (numSubExec * cfg.data.blockBytes > numBytes ||
+         numSubExec * maxBytesPerSubExec < numBytes)) {
+      Utils::Print("%s", skip.c_str()); fflush(stdout);
+      continue;
+    }
+    // Skip tests that require a pod
+    if (numRanks > 1 && Utils::GetRankPerPodMap().size() != 1 && !(isH2D || isD2H)) {
+      Utils::Print("%s", skip.c_str()); fflush(stdout);
+      continue;
+    }
+
+    bool allPass = true;
+    allTransfers.clear();
+
+    // Combine transfers from each GPU and run them all in parallel
+    for (int rank = 0; allPass && rank < numRanks; rank++) {
+      int numGpus = GetNumExecutors(exeType, rank);
+      for (int gpuIdx = 0; allPass && gpuIdx < numGpus; gpuIdx++) {
+        if (isH2D || isD2H) {
+          // Copy to/from closest CPU NUMA node for this GPU
+          int cpuIdx = GetClosestCpuNumaToGpu(gpuIdx, rank);
+          snprintf(transferStr, MAX_TRANSFER_STRLEN, "-1 (R%d%c%d R%d%c%d R%d%c%d %d %lu)",
+                   rank, MemTypeStr[isH2D ? cpuMemType : gpuMemType], isH2D ? cpuIdx : gpuIdx,
+                   rank, ExeTypeStr[exeType], gpuIdx,
+                   rank, MemTypeStr[isH2D ? gpuMemType : cpuMemType], isH2D ? gpuIdx : cpuIdx,
+                   numSubExec, numBytes);
+        } else if (isD2D_RW || isD2D_RR) {
+          // Copy from this GPU to "next" GPU
+          int dstRank = rank, dstGpuIdx = gpuIdx + 1;
+          if (dstGpuIdx >= GetNumExecutors(exeType, dstRank)) {
+            dstGpuIdx = 0;
+            dstRank   = (rank+1) % numRanks;
+          }
+          snprintf(transferStr, MAX_TRANSFER_STRLEN, "-1 (R%d%c%d R%d%c%d R%d%c%d %d %lu)",
+                   rank, MemTypeStr[gpuMemType], gpuIdx,
+                   isD2D_RW ? rank : dstRank, ExeTypeStr[exeType], isD2D_RW ? gpuIdx : dstGpuIdx,
+                   dstRank, MemTypeStr[gpuMemType], dstGpuIdx,
+                   numSubExec, numBytes);
+        } else if (isBroadcast) {
+          // Split up the number of CUs across all Transfers
+          snprintf(transferStr, MAX_TRANSFER_STRLEN, "-1 (R%d%c%d R%d%c%d R*%c* %d %lu)",
+                   rank, MemTypeStr[gpuMemType], gpuIdx,
+                   rank, ExeTypeStr[exeType], gpuIdx,
+                   MemTypeStr[gpuMemType],
+                   numSubExec, numBytes);
+        } else if (isGather) {
+          // Split up the number of CUs across all Transfers
+          snprintf(transferStr, MAX_TRANSFER_STRLEN, "-1 (R*%c* R%d%c%d R%d%c%d %d %lu)",
+                   MemTypeStr[gpuMemType],
+                   rank, ExeTypeStr[exeType], gpuIdx,
+                   rank, MemTypeStr[gpuMemType], gpuIdx,
+                   numSubExec, numBytes);
+        } else if (isAllToAll) {
+          // Split up the number of CUs across all Transfers
+          snprintf(transferStr, MAX_TRANSFER_STRLEN, "-1 (R%d%c%d R%d%c%d R*%c* %d %lu)",
+                   rank, MemTypeStr[gpuMemType], gpuIdx,
+                   rank, ExeTypeStr[exeType], gpuIdx,
+                   MemTypeStr[gpuMemType],
+                   numSubExec, numBytes);
+        }
+
+        ErrResult err = ParseTransfers(transferStr, transfers);
+        if (err.errType != ERR_NONE) {
+          Utils::Print("[ERROR] Unexpected parsing error - %s. This is a coding error\n", err.errMsg.c_str());
+          exit(1);
+        }
+
+        if (isBroadcast || isGather) {
+          if (!RunTransfers(cfg, transfers, results)) {
+            allPass = false;
+            break;
+          }
+        } else {
+          allTransfers.insert(allTransfers.end(), transfers.begin(), transfers.end());
+        }
+      }
+    }
+    if (!(isBroadcast || isGather)) {
+      if (!RunTransfers(cfg, allTransfers, results)) {
+        allPass = false;
+      }
+    }
+    Utils::Print("%s", allPass ? pass.c_str() : fail.c_str()); fflush(stdout);
+    numFail += (allPass ?
0 : 1);
+  }
+  return numFail;
+}
+
+int SmokeTestPreset(EnvVars& ev,
+                    size_t const numBytesPerTransfer,
+                    std::string const presetName,
+                    bool const bytesSpecified)
+{
+  // Check for single pod
+  if (Utils::GetRankPerPodMap().size() > 1) {
+    Utils::Print("[ERROR] %s preset can only be run within a single pod\n", presetName.c_str());
+    Utils::Print("[ERROR] Pod membership may be forced by setting TB_FORCE_SINGLE_POD=1\n");
+    return ERR_FATAL;
+  }
+
+  // Collect topology and check that all GPUs have the same number of subExecutors
+  int numRanks   = TransferBench::GetNumRanks();
+  int totalGpus  = 0;
+  int numSubExec = TransferBench::GetNumSubExecutors({EXE_GPU_GFX, 0, 0});
+  for (int rank = 0; rank < numRanks; rank++) {
+    int numGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX, rank);
+    totalGpus += numGpus;
+    for (int gpu = 0; gpu < numGpus; gpu++) {
+      if (numSubExec != TransferBench::GetNumSubExecutors({EXE_GPU_GFX, gpu, rank})) {
+        Utils::Print("[ERROR] %s preset can only be run on GPUs with the same number of subexecutors\n", presetName.c_str());
+        return ERR_FATAL;
+      }
+    }
+  }
+
+  // Modify defaults unless they were set
+  if (!getenv("ALWAYS_VALIDATE")) ev.alwaysValidate = 1;
+  if (!getenv("NUM_ITERATIONS" )) ev.numIterations  = 2;
+  if (!getenv("NUM_WARMUPS"    )) ev.numWarmups     = 0;
+
+  // Collect env vars
+  int cpuMemTypeIdx               = EnvVars::GetEnvVar          ("CPU_MEM_TYPE", 0);
+  int gpuMemTypeIdx               = EnvVars::GetEnvVar          ("GPU_MEM_TYPE", 0);
+  vector<int> gfxSesList          = EnvVars::GetEnvVarArray     ("GFX_SE_LIST", {1,numSubExec});
+  std::string seMaxBytesStr       = EnvVars::GetEnvVar          ("SE_MAX_BYTES", "128M");
+  vector<std::string> sizeStrList = EnvVars::GetEnvVarStrArray  ("SIZE_LIST", {"1K","16M","256M"});
+  vector<int> testList            = EnvVars::GetEnvVarRangeArray("TEST_LIST", {});
+
+  MemType cpuMemType = Utils::GetCpuMemType(cpuMemTypeIdx);
+  MemType gpuMemType = Utils::GetGpuMemType(gpuMemTypeIdx);
+  std::set<int> testsToRun(testList.begin(), testList.end());
+  if (testList.empty()) {
+    for (int testIdx = 1; testIdx <= NUM_SMOKE_TESTS; testIdx++)
+      testsToRun.insert(testIdx);
+  }
+
+  vector<size_t> sizeList;
+  if (bytesSpecified) {
+    sizeList = {numBytesPerTransfer};
+  } else {
+    for (auto s : sizeStrList) {
+      size_t val;
+      if (sscanf(s.c_str(), "%lu", &val) == 1) {
+        switch (s[s.size()-1]) {
+        case 'G': case 'g': val *= 1024;
+        case 'M': case 'm': val *= 1024;
+        case 'K': case 'k': val *= 1024;
+        }
+        sizeList.push_back(val);
+      }
+    }
+  }
+  size_t seMaxBytes = 128 * 1024 * 1024;
+  if (sscanf(seMaxBytesStr.c_str(), " %lu", &seMaxBytes) == 1) {
+    switch (seMaxBytesStr[seMaxBytesStr.size()-1]) {
+    case 'G': case 'g': seMaxBytes *= 1024;
+    case 'M': case 'm': seMaxBytes *= 1024;
+    case 'K': case 'k': seMaxBytes *= 1024;
+    }
+  }
+
+  TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
+
+  // Print off environment variables
+  if (Utils::RankDoesOutput()) {
+    ev.DisplayEnvVars();
+    if (!ev.hideEnv) {
+      if (!ev.outputToCsv) printf("[%s-preset Related]\n", presetName.c_str());
+      ev.Print("CPU_MEM_TYPE", cpuMemTypeIdx, "Using %s (%s)", Utils::GetCpuMemTypeStr(cpuMemTypeIdx).c_str(), Utils::GetAllCpuMemTypeStr().c_str());
+      ev.Print("GFX_SE_LIST" , gfxSesList.size(), "Testing GFX with subexecutor counts: %s", EnvVars::ToStr(gfxSesList).c_str());
+      ev.Print("GPU_MEM_TYPE", gpuMemTypeIdx, "Using %s (%s)", Utils::GetGpuMemTypeStr(gpuMemTypeIdx).c_str(), Utils::GetAllGpuMemTypeStr().c_str());
+      ev.Print("SIZE_LIST"   , sizeStrList.size(), "Transfer sizes tested: %s", ev.GetStr(sizeStrList).c_str());
+      ev.Print("SE_MAX_BYTES", seMaxBytesStr, "Each SubExecutor can
work on at most %lu bytes", seMaxBytes); + ev.Print("TEST_LIST" , testsToRun.size(), testList.empty() ? "Running all tests" : "Running Tests: %s", ev.GetStr(testList).c_str()); + printf("\n"); + } + } + + // Calculate cell-spacing / padding + int numSizes = sizeList.size(); + int colSize = std::max(5, 2 + numSizes); + int lPad1Size = (colSize - 3) / 2, rPad1Size = colSize - lPad1Size - 3; + int lPad2Size = (colSize - numSizes) / 2, rPad2Size = colSize - lPad2Size - numSizes; + + std::string l1(lPad1Size, ' '), r1(rPad1Size, ' '); + std::string l2(lPad2Size, ' '), r2(rPad2Size, ' '); + + int testsFailed = 0; + auto test = [&](int x, int y) { + Utils::Print(" %02d |%s", x, l2.c_str()); + fflush(stdout); + testsFailed += RunTest(x, testsToRun, sizeList, 1, cfg, cpuMemType, gpuMemType, seMaxBytes, totalGpus); + Utils::Print("%s| %02d |", r2.c_str(), y); + for (auto numSubExec : gfxSesList) { + Utils::Print("%s", l2.c_str()); + fflush(stdout); + testsFailed += RunTest(y, testsToRun, sizeList, numSubExec, cfg, cpuMemType, gpuMemType, seMaxBytes, totalGpus); + Utils::Print("%s|", r2.c_str()); + } + Utils::Print("\n"); + fflush(stdout); + }; + + Utils::Print("Running tests on %d GPUs total across %d rank(s)\n", totalGpus, numRanks); + Utils::Print("Legend: %s=Pass %s=Skip %s=Fail\n", pass.c_str(), skip.c_str(), fail.c_str()); + + // Print headers + Utils::Print(" %s %s |", l1.c_str(), r1.c_str()); + for ([[maybe_unused]] auto numSubExec : gfxSesList) + Utils::Print("%sGFX%s|", l1.c_str(), r1.c_str()); + Utils::Print("\n"); + Utils::Print("| Name | Test |%sDMA%s| Test |", l1.c_str(), r1.c_str()); + for (auto numSubExec : gfxSesList) + Utils::Print("%s%03d%s|", l1.c_str(), numSubExec, r1.c_str()); + Utils::Print("\n"); + Utils::Print("|---------------------------|------|%s|------|", std::string(colSize, '-').c_str()); + for ([[maybe_unused]] auto numSubExec : gfxSesList) + Utils::Print("%s|", std::string(colSize, '-').c_str()); + Utils::Print("\n"); + + // Print table / Run Tests + Utils::Print("| Copy (H2D) |"); test(1, 8); + Utils::Print("| Copy (D2H) |"); test(2, 9); + Utils::Print("| Copy (D2D) (Remote Write) |"); test(3,10); + Utils::Print("| Copy (D2D) (Remote Read ) |"); test(4,11); + Utils::Print("| Broadcast (One to All) |"); test(5,12); + Utils::Print("| Gather (All to One) |"); test(6,13); + Utils::Print("| All To All |"); test(7,14); + + Utils::Print("|---------------------------|------|%s|------|", std::string(colSize, '-').c_str()); + for ([[maybe_unused]] auto numSubExec : gfxSesList) + Utils::Print("%s|", std::string(colSize, '-').c_str()); + Utils::Print("\n\n"); + + // Show summary + if (testsFailed) { + Utils::Print("[WARN] %d Tests FAILED\n", testsFailed); + } else { + Utils::Print("All tests passed\n"); + } + if (numRanks > 1 && Utils::GetRankPerPodMap().size() != 1) { + Utils::Print("[WARN] Copy (D2D) / Broadcast / Gather / AllToAll tests are skipped if ranks are not in same pod\n"); + } + if (Utils::HasDuplicateHostname()) { + Utils::Print("[WARN] It is recommended to run TransferBench with one rank per host to avoid potential aliasing of executors\n"); + } + return testsFailed ? 
ERR_FATAL : ERR_NONE; +} + +} diff --git a/src/client/Presets/Sweep.hpp b/src/client/Presets/Sweep.hpp index 97c9c951..2ab85006 100644 --- a/src/client/Presets/Sweep.hpp +++ b/src/client/Presets/Sweep.hpp @@ -39,13 +39,14 @@ void LogTransfers(FILE *fp, int const testNum, std::vector const& tran } } -int SweepPreset(EnvVars& ev, - size_t const numBytesPerTransfer, - std::string const presetName) +int SweepPreset(EnvVars& ev, + size_t const numBytesPerTransfer, + std::string const presetName, + bool const bytesSpecified) { if (TransferBench::GetNumRanks() > 1) { Utils::Print("[ERROR] Sweep preset currently not supported for multi-node\n"); - return 1; + return ERR_FATAL; } bool const isRandom = (presetName == "rsweep"); @@ -103,33 +104,33 @@ int SweepPreset(EnvVars& ev, for (auto ch : sweepSrc) { if (!strchr(MemTypeStr, ch)) { printf("[ERROR] Unrecognized memory type '%c' specified for sweep source\n", ch); - return 1; + return ERR_FATAL; } if (strchr(sweepSrc.c_str(), ch) != strrchr(sweepSrc.c_str(), ch)) { printf("[ERROR] Duplicate memory type '%c' specified for sweep source\n", ch); - return 1; + return ERR_FATAL; } } for (auto ch : sweepDst) { if (!strchr(MemTypeStr, ch)) { printf("[ERROR] Unrecognized memory type '%c' specified for sweep destination\n", ch); - return 1; + return ERR_FATAL; } if (strchr(sweepDst.c_str(), ch) != strrchr(sweepDst.c_str(), ch)) { printf("[ERROR] Duplicate memory type '%c' specified for sweep destination\n", ch); - return 1; + return ERR_FATAL; } } for (auto ch : sweepExe) { if (!strchr(ExeTypeStr, ch)) { printf("[ERROR] Unrecognized executor type '%c' specified for sweep executor\n", ch); - return 1; + return ERR_FATAL; } if (strchr(sweepExe.c_str(), ch) != strrchr(sweepExe.c_str(), ch)) { printf("[ERROR] Duplicate executor type '%c' specified for sweep executor\n", ch); - return 1; + return ERR_FATAL; } } @@ -339,7 +340,7 @@ int SweepPreset(EnvVars& ev, if (!TransferBench::RunTransfers(cfg, transfers, results)) { Utils::PrintErrors(results.errResults); - if (!continueOnErr) return 1; + if (!continueOnErr) return ERR_FATAL; } else { Utils::PrintResults(ev, numTestsRun, transfers, results); } @@ -371,5 +372,5 @@ int SweepPreset(EnvVars& ev, } } if (fp) fclose(fp); - return 0; + return ERR_NONE; } diff --git a/src/client/Presets/WallClock.hpp b/src/client/Presets/WallClock.hpp new file mode 100644 index 00000000..e23844bc --- /dev/null +++ b/src/client/Presets/WallClock.hpp @@ -0,0 +1,234 @@ +/* +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+__global__ void GetXccTimestamps(uint64_t* timestamps, volatile int* readyFlag)
+{
+  // Only first thread does any work
+  if (threadIdx.x != 0) return;
+
+  // Threadblocks in first "row" handle timestamps
+  if (blockIdx.y == 0) {
+
+    // Collect the XCC ID for this threadblock
+    int xccId;
+    GetXccId(xccId);
+
+    // All threadblocks wait for ready signal
+    while (*readyFlag == 0);
+
+    // Collect timestamp and save to memory
+    auto w = GetTimestamp();
+    timestamps[xccId] = w;
+  } else if (blockIdx.x == 0) {
+
+    // Sleep for some number of cycles to ensure that other threadblocks are active
+    auto w = GetTimestamp();
+    while (GetTimestamp() - w < 10000);
+
+    // Signal start to the other threadblocks
+    *readyFlag = 1;
+  }
+}
+
+#if defined(__NVCC__)
+#define hipDeviceSynchronize cudaDeviceSynchronize
+#define hipMemset            cudaMemset
+#define hipSetDevice         cudaSetDevice
+#endif
+
+int WallClockPreset(EnvVars& ev,
+                    size_t const numBytesPerTransfer,
+                    std::string const presetName,
+                    bool const bytesSpecified)
+{
+  // Gather results and print
+  int numRanks = GetNumRanks();
+  int myRank   = GetRank();
+
+  // Check for single homogeneous group
+  if (Utils::GetNumRankGroups() > 1) {
+    Utils::Print("[ERROR] wallclock preset can only be run across ranks that are homogeneous\n");
+    Utils::Print("[ERROR] Run ./TransferBench without any args to display topology information\n");
+    Utils::Print("[ERROR] TB_NIC_FILTER may also be used to limit NIC visibility\n");
+    return ERR_FATAL;
+  }
+
+  int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
+  int numGpuDevices   = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
+
+  // Print off env vars
+  if (Utils::RankDoesOutput()) {
+    if (!ev.hideEnv) {
+      if (!ev.outputToCsv) printf("[WallClock Related]\n");
+      ev.Print("NUM_GPU_DEVICES", numGpuDevices, "Limit to using %d GPUs (per rank)", numGpuDevices);
+      ev.Print("NUM_ITERATIONS" , ev.numIterations, "Number of iterations");
+      ev.Print("NUM_WARMUPS"    , ev.numWarmups, "Number of warmup iterations");
+      ev.Print("SHOW_ITERATIONS", ev.showIterations, "Showing per iteration details. Set to 2 to see raw wallclock values");
+    }
+  }
+
+  // Check for env var consistency across ranks
+  IS_UNIFORM(numGpuDevices, "NUM_GPU_DEVICES");
+  IS_UNIFORM(ev.numIterations, "NUM_ITERATIONS");
+  IS_UNIFORM(ev.numWarmups, "NUM_WARMUPS");
+  IS_UNIFORM(ev.showIterations, "SHOW_ITERATIONS");
+
+  if (numGpuDevices <= 0) {
+    Utils::Print("[ERROR] wallclock preset requires at least one GPU\n");
+    return ERR_FATAL;
+  }
+
+  // Collect local results
+  int numXccs = GetNumExecutorSubIndices({EXE_GPU_GFX, 0});
+
+  // Compute wall clock rate (based on GPU 0)
+  int wallClockKhz;
+#if defined(__NVCC__)
+  wallClockKhz = 1000000;
+#else
+  HIP_CALL(hipDeviceGetAttribute(&wallClockKhz, hipDeviceAttributeWallClockRate, 0));
+#endif
+  if (wallClockKhz == 0) wallClockKhz = 100000;
+  double uSecPerCycle = 1000.0 / wallClockKhz;
+
+  Utils::Print("\nRunning %d iterations. Detected wall clock rate of %d kHz = %.2f usec per cycle\n\n",
+               ev.numIterations, wallClockKhz, uSecPerCycle);
+
+  std::vector<std::vector<std::vector<uint64_t>>> results(numGpuDevices,
+    std::vector<std::vector<uint64_t>>(ev.numIterations,
+      std::vector<uint64_t>(numXccs, 0)));
+  for (int deviceId = 0; deviceId < numGpuDevices; deviceId++) {
+    HIP_CALL(hipSetDevice(deviceId));
+
+    uint64_t* timestamps;
+    int32_t*  readyFlag;
+
+    if (Utils::AllocateMemory({MEM_CPU_CLOSEST, deviceId}, numXccs * sizeof(uint64_t), (void**)&timestamps)) {
+      Utils::Print("[ERROR] Unable to allocate pinned host memory for storing timestamps for GPU device %d on rank %d\n",
+                   deviceId, myRank);
+      return ERR_FATAL;
+    }
+    if (Utils::AllocateMemory({MEM_GPU, deviceId}, sizeof(int32_t), (void**)&readyFlag)) {
+      Utils::Print("[ERROR] Unable to allocate readyFlag on GPU device %d on rank %d\n", deviceId, myRank);
+      return ERR_FATAL;
+    }
+
+    for (int i = -ev.numWarmups; i < ev.numIterations; i++)
+    {
+      HIP_CALL(hipMemset(readyFlag, 0, sizeof(int)));
+      HIP_CALL(hipDeviceSynchronize());
+      GetXccTimestamps<<<dim3(numXccs, 2), 1>>>(timestamps, readyFlag);
+      HIP_CALL(hipDeviceSynchronize());
+      if (i >= 0) {
+        memcpy(results[deviceId][i].data(), timestamps, numXccs * sizeof(uint64_t));
+      }
+    }
+
+    Utils::DeallocateMemory(MEM_CPU_CLOSEST, timestamps, numXccs * sizeof(uint64_t));
+    Utils::DeallocateMemory(MEM_GPU, readyFlag, sizeof(int32_t));
+  }
+
+  // Prepare table of results
+  int numRows = 1 + numRanks * numGpuDevices * (ev.showIterations ? (ev.numIterations+1) : 1);
+  int numCols = 5 + (ev.showIterations ? numXccs : 0);
+  Utils::TableHelper table(numRows, numCols);
+
+  for (int i = 0; i < numCols; i++) {
+    table.SetColAlignment(i, Utils::TableHelper::ALIGN_CENTER);
+  }
+
+  // Prepare header row
+  int currRow = 0;
+  int currCol = 0;
+  table.Set(currRow, currCol++, "Rank");
+  table.Set(currRow, currCol++, "GPU");
+  table.Set(currRow, currCol++, "Iter");
+  table.Set(currRow, currCol++, "Delta(cycles)");
+  table.Set(currRow, currCol++, "Delta(usec)");
+  if (ev.showIterations) {
+    for (int i = 0; i < numXccs; i++) {
+      table.Set(currRow, currCol++, " XCC %d ", i);
+    }
+  }
+  currRow++;
+
+  double minDelta = std::numeric_limits<double>::max();
+  double maxDelta = std::numeric_limits<double>::lowest();
+
+  for (int rank = 0; rank < numRanks; rank++) {
+    table.DrawRowBorder(currRow);
+    for (int deviceId = 0; deviceId < numGpuDevices; deviceId++) {
+      size_t totalCycles = 0;
+      std::vector<uint64_t> timestamps(numXccs, 0);
+
+      for (int iteration = 0; iteration < ev.numIterations; iteration++) {
+        if (rank == myRank) timestamps = results[deviceId][iteration];
+        TransferBench::System::Get().Broadcast(rank, numXccs * sizeof(uint64_t), timestamps.data());
+
+        const auto [min,max] = std::minmax_element(timestamps.begin(), timestamps.end());
+
+        uint64_t cycles = (*max - *min);
+        totalCycles += cycles;
+
+        if (ev.showIterations) {
+          currCol = 0;
+          table.Set(currRow, currCol++, "%d", rank);
+          table.Set(currRow, currCol++, "%d", deviceId);
+          table.Set(currRow, currCol++, "%d", iteration);
+          table.Set(currRow, currCol++, "%lu", cycles);
+          table.Set(currRow, currCol++, "%.2f", cycles * uSecPerCycle);
+          for (int i = 0; i < numXccs; i++) {
+            table.Set(currRow, currCol++, "%lu", timestamps[i] - (ev.showIterations > 1 ?
0 : *min));
+          }
+          currRow++;
+        }
+      }
+
+      double avgCycles = totalCycles * 1.0 / ev.numIterations;
+      minDelta = std::min(minDelta, avgCycles);
+      maxDelta = std::max(maxDelta, avgCycles);
+      currCol = 0;
+      table.Set(currRow, currCol++, "%d", rank);
+      table.Set(currRow, currCol++, "%d", deviceId);
+      table.Set(currRow, currCol++, "AVG");
+      table.Set(currRow, currCol++, "%.2f", avgCycles);
+      table.Set(currRow, currCol++, "%.2f", avgCycles * uSecPerCycle);
+      currRow++;
+    }
+  }
+
+  table.PrintTable(ev.outputToCsv, ev.showBorders);
+
+  Utils::Print("\n");
+  Utils::Print("Minimum Delta detected: %.2f cycles (%.2f usec)\n", minDelta, minDelta * uSecPerCycle);
+  Utils::Print("Maximum Delta detected: %.2f cycles (%.2f usec)\n", maxDelta, maxDelta * uSecPerCycle);
+
+  if (Utils::HasDuplicateHostname()) {
+    Utils::Print("[WARN] It is recommended to run TransferBench with one rank per host to avoid potential aliasing of executors\n");
+  }
+  return ERR_NONE;
+}
+
+#if defined(__NVCC__)
+#undef hipDeviceSynchronize
+#undef hipMemset
+#endif
diff --git a/src/client/Topology.hpp b/src/client/Topology.hpp
index 52de4aca..180b65ba 100644
--- a/src/client/Topology.hpp
+++ b/src/client/Topology.hpp
@@ -215,17 +215,16 @@ void DisplayMultiRankTopology(bool outputToCsv, bool showBorders)
     Utils::GroupKey const& key = group.first;
     std::vector<std::string> const& hosts = group.second;
 
-    std::string ppodId                = std::get<0>(key);
-    int vpodId                        = std::get<1>(key);
-    std::vector<std::string> cpuNames = std::get<2>(key);
-    std::vector<int> cpuSubExecs      = std::get<3>(key);
-    std::vector<std::string> gpuNames = std::get<4>(key);
-    std::vector<int> gpuSubExecs      = std::get<5>(key);
-    std::vector<int> gpuClosestCpu    = std::get<6>(key);
-    std::vector<std::string> nicNames = std::get<7>(key);
-    std::vector<int> nicClosestCpu    = std::get<8>(key);
-    std::vector<int> nicClosestGpu    = std::get<9>(key);
-    std::vector<bool> nicIsActive     = std::get<10>(key);
+    int64_t podId                     = std::get<0>(key);
+    std::vector<std::string> cpuNames = std::get<1>(key);
+    std::vector<int> cpuSubExecs      = std::get<2>(key);
+    std::vector<std::string> gpuNames = std::get<3>(key);
+    std::vector<int> gpuSubExecs      = std::get<4>(key);
+    std::vector<int> gpuClosestCpu    = std::get<5>(key);
+    std::vector<std::string> nicNames = std::get<6>(key);
+    std::vector<int> nicClosestCpu    = std::get<7>(key);
+    std::vector<int> nicClosestGpu    = std::get<8>(key);
+    std::vector<bool> nicIsActive     = std::get<9>(key);
 
     int numRanks = hosts.size();
     int numCpus  = cpuNames.size();
@@ -240,7 +239,7 @@ void DisplayMultiRankTopology(bool outputToCsv, bool showBorders)
               groupNum++, numRanks, numCpus, numGpus, numNics, numActiveNics);
 
     // Determine size of table
-    int numCols = 7;
+    int numCols = 6;
     int numRows = 1 + std::max(numRanks, numExecutors);
 
     TransferBench::Utils::TableHelper table(numRows, numCols);
@@ -257,10 +256,9 @@ void DisplayMultiRankTopology(bool outputToCsv, bool showBorders)
     table.Set(0, 0, " Rank ");
     table.Set(0, 1, " Hostname ");
     table.Set(0, 2, " POD ");
-    table.Set(0, 3, " VID ");
-    table.Set(0, 4, " Executor ");
-    table.Set(0, 5, " Executor Name ");
-    table.Set(0, 6, " #SE ");
+    table.Set(0, 3, " Executor ");
+    table.Set(0, 4, " Executor Name ");
+    table.Set(0, 5, " #SE ");
 
     // Fill in ranks / hosts
     for (int i = 0; i < numRanks; i++) {
@@ -270,31 +268,30 @@ void DisplayMultiRankTopology(bool outputToCsv, bool showBorders)
     }
 
     // Fill in PPOD and VPOD
-    table.Set(1, 2, " %s ", ppodId.c_str());
-    table.Set(1, 3, " %d ", vpodId);
+    table.Set(1, 2, " %ld ", podId);
 
     // Fill in Executor information
     int rowIdx = 1;
     for (int cpuIndex = 0; cpuIndex < numCpus; cpuIndex++) {
-      table.Set(rowIdx, 4, " CPU %02d ", cpuIndex);
-      table.Set(rowIdx, 5, " %s
", cpuNames[cpuIndex].c_str()); - table.Set(rowIdx, 6, " %d ", cpuSubExecs[cpuIndex]); + table.Set(rowIdx, 3, " CPU %02d ", cpuIndex); + table.Set(rowIdx, 4, " %s ", cpuNames[cpuIndex].c_str()); + table.Set(rowIdx, 5, " %d ", cpuSubExecs[cpuIndex]); rowIdx++; // Loop over each GPU closest to this CPU executor for (int gpuIndex = 0; gpuIndex < numGpus; gpuIndex++) { if (gpuClosestCpu[gpuIndex] != cpuIndex) continue; - table.Set(rowIdx, 4, " - GPU %02d ", gpuIndex); - table.Set(rowIdx, 5, " - %s ", gpuNames[gpuIndex].c_str()); - table.Set(rowIdx, 6, " %d ", gpuSubExecs[gpuIndex]); + table.Set(rowIdx, 3, " - GPU %02d ", gpuIndex); + table.Set(rowIdx, 4, " - %s ", gpuNames[gpuIndex].c_str()); + table.Set(rowIdx, 5, " %d ", gpuSubExecs[gpuIndex]); rowIdx++; // Loop over each NIC closest to this GPU for (int nicIndex = 0; nicIndex < numNics; nicIndex++) { if (nicClosestGpu[nicIndex] != gpuIndex) continue; - table.Set(rowIdx, 4, " - NIC %02d ", nicIndex); - table.Set(rowIdx, 5, " - %s", nicNames[nicIndex].c_str()); - table.Set(rowIdx, 6, " %s ", nicIsActive[nicIndex] ? "ON" : "OFF"); + table.Set(rowIdx, 3, " - NIC %02d ", nicIndex); + table.Set(rowIdx, 4, " - %s", nicNames[nicIndex].c_str()); + table.Set(rowIdx, 5, " %s ", nicIsActive[nicIndex] ? "ON" : "OFF"); rowIdx++; } } @@ -302,9 +299,9 @@ void DisplayMultiRankTopology(bool outputToCsv, bool showBorders) // Loop over remaining NICs not associated with GPU but associated with this CPU for (int nicIndex = 0; nicIndex < numNics; nicIndex++) { if (nicClosestGpu[nicIndex] != -1 || nicClosestCpu[nicIndex] != cpuIndex) continue; - table.Set(rowIdx, 4, " - NIC %02d ", nicIndex); - table.Set(rowIdx, 5, " - %s ", nicNames[nicIndex].c_str()); - table.Set(rowIdx, 6, " %s ", nicIsActive[nicIndex] ? "ON" : "OFF"); + table.Set(rowIdx, 3, " - NIC %02d ", nicIndex); + table.Set(rowIdx, 4, " - %s ", nicNames[nicIndex].c_str()); + table.Set(rowIdx, 5, " %s ", nicIsActive[nicIndex] ? "ON" : "OFF"); rowIdx++; } } diff --git a/src/client/Utilities.hpp b/src/client/Utilities.hpp index 0ba93fc6..5fa4de97 100644 --- a/src/client/Utilities.hpp +++ b/src/client/Utilities.hpp @@ -21,13 +21,30 @@ THE SOFTWARE. */ #pragma once +#include +#include #include #include #include +#include +#include "EnvVars.hpp" #include "TransferBench.hpp" namespace TransferBench::Utils { + // Linear interpolation on sorted samples (same ordering as common empirical quantiles with (n-1) indexing). 
+  inline double PercentileDurationMsecFromSorted(std::vector<double> const& sortedAsc, int pct)
+  {
+    size_t const n = sortedAsc.size();
+    if (n == 0)
+      return 0.0;
+    double const pos  = (static_cast<double>(pct) / 100.0) * static_cast<double>(n - 1);
+    size_t const lo   = static_cast<size_t>(std::floor(pos));
+    size_t const hi   = static_cast<size_t>(std::ceil(pos));
+    double const frac = pos - std::floor(pos);
+    return sortedAsc[lo] * (1.0 - frac) + sortedAsc[hi] * frac;
+  }
+
   // Helper class to help format tabular data / output to CSV
   class TableHelper
   {
@@ -85,8 +102,7 @@ namespace TransferBench::Utils
 
   // Group information
   typedef std::tuple<
-    std::string,              // RackId
-    int,                      // VPod
+    int64_t,                  // Pod Index
    std::vector<std::string>, // CPU Names
    std::vector<int>,         // CPU #Subexecutors
    std::vector<std::string>, // GPU Names
@@ -99,12 +115,16 @@ namespace TransferBench::Utils
   > GroupKey;
 
   typedef std::map<GroupKey, std::vector<std::string>> RankGroupMap;
+  typedef std::map<int64_t, std::vector<int>> RankPerPodMap;
 
   // Get information about how ranks can be organized into homogeneous groups
   RankGroupMap& GetRankGroupMap();
 
   // Return the number of homogeneous groups of ranks
-  int numRankGroups();
+  int GetNumRankGroups();
+
+  // Helper function for pod membership
+  RankPerPodMap& GetRankPerPodMap();
 
   // Helper function to convert an ExeType to a string
   std::string ExeTypeToStr(ExeType exeType);
@@ -147,6 +167,29 @@ namespace TransferBench::Utils
   std::string GetAllGpuMemTypeStr();
   std::string GetAllMemTypeStr(bool isCpu);
 
+  // Helper forwarders to allocation/deallocation functions
+  // Returns true if error occurs
+  bool AllocateMemory(MemDevice memDevice, size_t numBytes, void** memPtr);
+  bool DeallocateMemory(MemType memType, void *memPtr, size_t const bytes);
+
+  // Reorder elements of list by stepping through with stride k, wrapping around.
+  // When gcd(k, n) > 1 the single cycle breaks into gcd(k, n) orbits which are
+  // concatenated, so every element appears exactly once in the output.
+  // The reordered list will be further separated into different groups.
+  void StrideGenerate(std::vector<int>& list, int k);
+
+  // Returns a schedule of round robin pairing of N elements, using Circle Method.
+  // If parallel, each round contains N/2 pairs, otherwise serial.
+  void RoundRobinSchedule(std::vector<std::vector<std::pair<int, int>>>& schedule,
+                          int N, int parallel = 0);
+
+  // Returns a schedule for ordered 2-combination of N elements
+  // by pairing the list with its rotating self.
+  // Each round contains n pairs, where 1 <= n <= N and N is divisible by n,
+  // and an element cannot appear more than twice in a round.
+  void CombinationSchedule(std::vector<std::vector<std::pair<int, int>>>& schedule,
+                           int N, int n = 0);
+
   // Implementation details below
   //================================================================
   TableHelper::TableHelper(int numRows, int numCols, int precision) :
@@ -248,9 +291,9 @@ namespace TransferBench::Utils
     std::string borders[16] = {" ", "│", "│", "│",
-                               "─", "┘", "┐", "┤",
-                               "─", "└", "┌", "├",
-                               "─", "┴", "┬", "┼"};
+                               "-", "┘", "┐", "┤",
+                               "-", "└", "┌", "├",
+                               "-", "┴", "┬", "┼"};
 
     int mask;
     for (int rowIdx = 0; rowIdx <= numRows; rowIdx++) {
@@ -264,7 +307,7 @@ namespace TransferBench::Utils
         if (rowBorders[rowIdx].count(colIdx  )) mask |= BORDER_RIGHT;
         Print("%s", borders[mask].c_str());
         if (colIdx < numCols) {
-          std::string ch = rowBorders[rowIdx].count(colIdx) ?
"-" : " "; for (int i = 0; i < colWidth[colIdx]; i++) Print("%s", ch.c_str()); } } @@ -303,8 +346,7 @@ namespace TransferBench::Utils // Build GroupKey for each rank for (int rank = 0; rank < TransferBench::GetNumRanks(); rank++) { - std::string ppodId = TransferBench::GetPpodId(rank); - int vpodId = TransferBench::GetVpodId(rank); + int64_t podId = TransferBench::GetPodIdx(rank); // CPU information int numCpus = TransferBench::GetNumExecutors(EXE_CPU, rank); @@ -350,7 +392,7 @@ namespace TransferBench::Utils nicIsActive.push_back(TransferBench::NicIsActive(exeIndex, rank)); } - GroupKey key(ppodId, vpodId, + GroupKey key(podId, cpuNames, cpuNumSubExecs, gpuNames, gpuNumSubExecs, gpuClosestCpu, nicNames, nicClosestCpu, nicClosestGpu, nicIsActive); @@ -367,16 +409,32 @@ namespace TransferBench::Utils return GetRankGroupMap().size(); } + RankPerPodMap& GetRankPerPodMap() + { + static RankPerPodMap pods; + static bool initialized = false; + + if (!initialized) { + for (int rank = 0; rank < TransferBench::GetNumRanks(); rank++) { + int64_t const podId = TransferBench::GetPodIdx(rank); + if (podId == -1) continue; + pods[podId].push_back(rank); + } + initialized = true; + } + return pods; + } // Helper function to convert an ExeType to a string std::string ExeTypeToStr(ExeType exeType) { switch (exeType) { - case EXE_CPU: return "CPU"; - case EXE_GPU_GFX: return "GPU"; - case EXE_GPU_DMA: return "DMA"; - case EXE_NIC: return "NIC"; - case EXE_NIC_NEAREST: return "NIC"; - default: return "N/A"; + case EXE_CPU: return "CPU"; + case EXE_GPU_GFX: return "GPU"; + case EXE_GPU_DMA: return "DMA"; + case EXE_NIC: return "NIC"; + case EXE_NIC_NEAREST: return "NIC"; + case EXE_GPU_BDMA: return "BMA"; + default: return "N/A"; } } @@ -394,6 +452,46 @@ namespace TransferBench::Utils return ss.str(); } + template + struct is_std_vector : std::false_type {}; + + template + struct is_std_vector> : std::true_type {}; + + // This function can be used to check if a value is identical across ranks + template + bool IsUniform(const T& val) { + if constexpr (is_std_vector::value) { + using Elem = typename T::value_type; + static_assert(std::is_trivially_copyable_v, "vector element must be trivially copyable"); + + size_t size = val.size(); + size_t rootSize = size; + System::Get().Broadcast(0, sizeof(rootSize), &rootSize); + if (size != rootSize) return false; + + std::vector ref = val; + System::Get().Broadcast(0, rootSize * sizeof(Elem), ref.data()); + + return (std::memcmp(ref.data(), val.data(), rootSize * sizeof(Elem)) == 0); + } else { + static_assert(std::is_trivially_copyable_v, "Type must be trivially copyable"); + T ref = val; + System::Get().Broadcast(0, sizeof(T), &ref); + + return (std::memcmp(&ref, &val, sizeof(T)) == 0); + } + } + + // Macro for use in presets that will return 1 if a value is not uniform across ranks +#define IS_UNIFORM(val, name) \ + do { \ + if (!Utils::IsUniform(val)) { \ + Utils::Print("[ERROR] %s must be uniform across all ranks\n", name); \ + return 1; \ + } \ + } while(0) + // Helper function to determine if current rank does output bool RankDoesOutput() { @@ -457,10 +555,13 @@ namespace TransferBench::Utils for (auto const& exeInfoPair : results.exeResults) { ExeResult const& exeResult = exeInfoPair.second; numRows += 1 + exeResult.transferIdx.size(); + if (!ev.showPercentiles.empty()) { + numRows += static_cast(ev.showPercentiles.size()) * static_cast(exeResult.transferIdx.size()); + } if (ev.showIterations) { - numRows += (numTimedIterations + 1); - - // Check that 
per-iteration information exists
+       numRows += (numTimedIterations + 1) * exeResult.transferIdx.size();
+     }
+     if (ev.showIterations || !ev.showPercentiles.empty()) {
        for (int idx : exeResult.transferIdx) {
          TransferResult const& r = results.tfrResults[idx];
          if (r.perIterMsec.size() != numTimedIterations) {
@@ -472,7 +573,9 @@
        }
      }
    }

-   TableHelper table(numRows, numCols);
+   int showNumIterations = (ev.numIterations < 0) ? 1 : 0;
+
+   TableHelper table(numRows+showNumIterations, numCols);
    for (int col = 1; col < numCols; col++)
      table.DrawColBorder(col);
@@ -506,9 +609,9 @@
      TransferResult const& r = results.tfrResults[idx];

      table.Set(rowIdx, 0, "Transfer %-4d ", idx);
-     table.Set(rowIdx, 1, "%8.3f GB/s " , r.avgBandwidthGbPerSec);
-     table.Set(rowIdx, 2, "%8.3f ms "   , r.avgDurationMsec);
-     table.Set(rowIdx, 3, "%12lu bytes ", r.numBytes);
+     table.Set(rowIdx, 1, "%8.3f GB/s " , r.avgBandwidthGbPerSec);
+     table.Set(rowIdx, 2, "%8.3f ms "   , r.avgDurationMsec);
+     table.Set(rowIdx, 3, "%12lu bytes ", r.numBytes);

      char exeSubIndexStr[32] = "";
      if (t.exeSubIndex != -1)
@@ -587,6 +690,24 @@
          rowIdx++;
          table.DrawRowBorder(rowIdx);
        }
+
+       // Show percentiles
+       if (!ev.showPercentiles.empty()) {
+         std::vector<double> sortedDur = r.perIterMsec;
+         std::sort(sortedDur.begin(), sortedDur.end());
+         for (int pct : ev.showPercentiles) {
+           double dur = PercentileDurationMsecFromSorted(sortedDur, pct);
+           double bwGbs = dur > 0.0 ? (t.numBytes / 1.0E9) / dur * 1000.0 : 0.0;
+           table.Set(rowIdx, 0, "p%d ", pct);
+           table.Set(rowIdx, 1, "%8.3f GB/s ", bwGbs);
+           table.Set(rowIdx, 2, "%8.3f ms ", dur);
+           table.Set(rowIdx, 3, " ");
+           table.Set(rowIdx, 4, " ");
+           table.SetCellAlignment(rowIdx, 4, TableHelper::ALIGN_LEFT);
+           rowIdx++;
+         }
+       }
+
      }
    }
    table.DrawRowBorder(rowIdx);
@@ -596,8 +717,21 @@
    table.Set(rowIdx, 3, "%12lu bytes " , results.totalBytesTransferred);
    table.Set(rowIdx, 4, " Overhead %.3f ms", results.overheadMsec);
    table.SetCellAlignment(rowIdx, 4, TableHelper::ALIGN_LEFT);
-   table.DrawRowBorder(rowIdx + 1);
+   table.DrawRowBorder(rowIdx+1);
+   if (showNumIterations) {
+     rowIdx++;
+     table.Set(rowIdx, 0, "# Iters Run:");
+     table.Set(rowIdx, 1, "%lu ", numTimedIterations);
+     table.SetCellAlignment(rowIdx, 1, TableHelper::ALIGN_LEFT);
+     table.SetCellBorder(rowIdx, 0, 0);
+     table.SetCellBorder(rowIdx, 1, 0);
+     table.SetCellBorder(rowIdx, 2, 0);
+     table.SetCellBorder(rowIdx, 3, 0);
+     table.SetCellBorder(rowIdx, 4, 0);
+     table.DrawRowBorder(rowIdx);
+     table.DrawRowBorder(rowIdx+1);
+   }

    table.PrintTable(ev.outputToCsv, ev.showBorders);
  }
@@ -682,4 +816,122 @@ namespace TransferBench::Utils
  {
    return isCpu ?
GetAllCpuMemTypeStr() : GetAllGpuMemTypeStr();
  }

+ bool AllocateMemory(MemDevice memDevice, size_t numBytes, void** memPtr)
+ {
+   return (TransferBench::AllocateMemory(memDevice, numBytes, memPtr).errType != TransferBench::ERR_NONE);
+ }
+
+ bool DeallocateMemory(MemType memType, void *memPtr, size_t const bytes)
+ {
+   return (TransferBench::DeallocateMemory(memType, memPtr, bytes).errType != TransferBench::ERR_NONE);
+ }
+
+ void StrideGenerate(std::vector<int>& list, int k)
+ {
+   int n = list.size();
+   if (n == 0) return;
+   k = ((k % n) + n) % n; // normalize to 0..n-1
+   if (k == 0) return;
+
+   int d = std::gcd(k, n);
+   std::vector<int> out;
+   out.reserve(n);
+
+   for (int s = 0; s < d; s++) {
+     for (int j = 0; j < n / d; j++) {
+       out.push_back(list[(s + j * k) % n]);
+     }
+   }
+   list = std::move(out);
+ }
+
+ void RoundRobinSchedule(std::vector<std::vector<std::pair<int, int>>>& schedule,
+                         int N, int parallel)
+ {
+   if (N == 1) {
+     schedule.push_back({{0, 0}});
+     return;
+   }
+   // Generate standard round-robin tournament (maximum parallelism)
+   std::vector<std::vector<std::pair<int, int>>> fullSchedule;
+
+   // Pad an odd number of ranks with a dummy element so the padded count is even
+   int paddedN = N + N % 2;
+   // Round-robin tournament scheduling
+   for (int round = 0; round < paddedN - 1; round++) {
+     std::vector<std::pair<int, int>> roundPairs;
+     std::vector<std::pair<int, int>> roundPairsReversed;
+     for (int i = 0; i < paddedN / 2; i++) {
+       int item1 = i;
+       int item2 = paddedN - 1 - i;
+       if (round > 0) {
+         // Rotate all except the first item
+         if (item1 > 0) item1 = ((item1 - 1 + round) % (paddedN - 1)) + 1;
+         if (item2 > 0) item2 = ((item2 - 1 + round) % (paddedN - 1)) + 1;
+       }
+       // Ignore pairs involving the dummy element; its partner sits out this round
+       if (item1 < N && item2 < N) {
+         roundPairs.push_back({item1, item2});
+         roundPairsReversed.push_back({item2, item1});
+       }
+     }
+     fullSchedule.push_back(roundPairs);
+     fullSchedule.push_back(roundPairsReversed);
+   }
+
+   // A loopback round where all run in parallel
+   std::vector<std::pair<int, int>> selfRound;
+   for (int i = 0; i < N; i++) {
+     selfRound.push_back({i, i});
+   }
+   fullSchedule.push_back(selfRound);
+
+   if (parallel) {
+     schedule = std::move(fullSchedule);
+   } else {
+     // Serialize each round if needed
+     for (auto const& fullRound : fullSchedule) {
+       for (auto const& match : fullRound) {
+         std::vector<std::pair<int, int>> subRound;
+         subRound.push_back({match.first, match.second});
+         schedule.push_back(subRound);
+       }
+     }
+   }
+ }
+
+ void CombinationSchedule(std::vector<std::vector<std::pair<int, int>>>& schedule,
+                          int N, int n)
+ {
+   std::vector<std::vector<std::pair<int, int>>> fullSchedule;
+
+   if (n <= 0) n = N;
+   if (N <= 0 || n > N || N % n != 0) // Assuming balanced load for each round
+   {
+     n = 1;
+     Print("[WARN] cannot create combination schedule, falling back to serial\n");
+   }
+
+   // Generate full rounds of combinations based on increasing rotation distance
+   for (int i = 0; i < N; i++) {
+     std::vector<std::pair<int, int>> round;
+     for (int j = 0; j < N; j++) {
+       round.push_back({j, (j + i) % N});
+     }
+     fullSchedule.push_back(round);
+   }
+
+   // Split each full round into sub-rounds with at most n pairs
+   for (auto const& fullRound : fullSchedule) {
+     for (size_t start = 0; start < fullRound.size(); start += n) {
+       std::vector<std::pair<int, int>> subRound;
+       for (size_t i = start; i < start + n && i < fullRound.size(); i++) {
+         subRound.push_back(fullRound[i]);
+       }
+       if (!subRound.empty()) {
+         schedule.push_back(subRound);
+       }
+     }
+   }
+ }
};
diff --git a/src/header/TransferBench.hpp b/src/header/TransferBench.hpp
index 7b74dc5b..b16c587d 100644
--- a/src/header/TransferBench.hpp
+++ b/src/header/TransferBench.hpp
@@ -27,12 +27,15 @@ THE SOFTWARE.
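To make the schedule helpers defined above concrete, here are hand-expanded outputs (editorial illustration computed from the code, not part of the patch):

// StrideGenerate({0,1,2,3,4,5}, k = 2): gcd(2,6) = 2 orbits -> {0,2,4, 1,3,5}
// RoundRobinSchedule(schedule, N = 4, parallel = 1) yields 7 rounds:
//   {(0,3),(1,2)}  {(3,0),(2,1)}  {(0,1),(2,3)}  {(1,0),(3,2)}
//   {(0,2),(3,1)}  {(2,0),(1,3)}  plus the loopback {(0,0),(1,1),(2,2),(3,3)}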
 #include
 #include
 #include
+#include
 #include
 #include
 #include
 #include
 #include
 #include
 #include
+#include
+#include
 #include
 #include <numa.h>   // If not found, try installing libnuma-dev (e.g apt-get install libnuma-dev)
 #include <numaif.h>
@@ -61,16 +64,27 @@
 #endif

 #if defined(__NVCC__)
+#include <cuda.h>
 #include <cuda_runtime.h>
+#ifdef NVML_ENABLED
 #include <nvml.h>
+#endif
 #else
-#include <hip/hip_ext.h>
-#include <hip/hip_runtime.h>
-#include <hsa/hsa.h>
-#include <hsa/hsa_ext_amd.h>
+#include "hip/hip_ext.h"
+#include "hip/hip_runtime.h"
+#include "hsa/hsa.h"
+#include "hsa/hsa_ext_amd.h"
+#ifdef AMD_SMI_ENABLED
+#include "amd_smi/amdsmi.h"
+#endif
 #endif
 /// @endcond

+// Batched DMA executor is only supported with HIP >= 7.1 or CUDA >= 12.8
+#if (defined(HIP_VERSION) && (HIP_VERSION >= 70100000)) || (defined(CUDA_VERSION) && (CUDA_VERSION >= 12080))
+#define BMA_EXEC_ENABLED
+#endif
+
 namespace TransferBench
 {
   using std::map;
@@ -78,7 +92,7 @@
   using std::set;
   using std::vector;

-  constexpr char VERSION[] = "1.66";
+  constexpr char VERSION[] = "1.67";

   /**
    * Enumeration of supported Executor types
    */
@@ -91,11 +105,12 @@
     EXE_GPU_GFX     = 1, ///< GPU kernel-based executor   (subExecutor = threadblock/CU)
     EXE_GPU_DMA     = 2, ///< GPU SDMA executor           (subExecutor = not supported)
     EXE_NIC         = 3, ///< NIC RDMA executor           (subExecutor = queue pair)
-    EXE_NIC_NEAREST = 4  ///< NIC RDMA nearest executor   (subExecutor = queue pair)
+    EXE_NIC_NEAREST = 4, ///< NIC RDMA nearest executor   (subExecutor = queue pair)
+    EXE_GPU_BDMA    = 5, ///< GPU Batched SDMA executor   (subExecutor = batch item)
   };
-  char const ExeTypeStr[6] = "CGDIN";
+  char const ExeTypeStr[7] = "CGDINB";
   inline bool IsCpuExeType(ExeType e){ return e == EXE_CPU; }
-  inline bool IsGpuExeType(ExeType e){ return e == EXE_GPU_GFX || e == EXE_GPU_DMA; }
+  inline bool IsGpuExeType(ExeType e){ return e == EXE_GPU_GFX || e == EXE_GPU_DMA || e == EXE_GPU_BDMA; }
   inline bool IsNicExeType(ExeType e){ return e == EXE_NIC || e == EXE_NIC_NEAREST; }

   /**
@@ -139,6 +154,17 @@
   inline bool IsCpuMemType(MemType m) { return (MEM_CPU <= m && m <= MEM_CPU_UNPINNED);}
   inline bool IsGpuMemType(MemType m) { return (MEM_GPU <= m && m <= MEM_MANAGED);}

+  /**
+   * Enumeration of supported GFX kernels
+   */
+  enum GfxKernelType
+  {
+    GFX_KERNEL_AUTO   = -1, ///< Automatically choose a kernel
+    GFX_KERNEL_REDUCE =  0, ///< Default kernel that supports any number of input/output buffers
+    GFX_KERNEL_COPY   =  1, ///< Simpler kernel that supports copies only
+    NUM_GFX_KERNELS   =  2  ///< Number of GFX kernels currently supported
+  };
+
   /**
    * A MemDevice indicates a memory type on a specific device
    */
@@ -208,6 +234,7 @@
     int blockOrder = 0;                      ///< Determines how threadblocks are ordered (0=sequential, 1=interleaved, 2=random)
     int blockSize  = 256;                    ///< Size of each threadblock (must be multiple of 64)
     vector<uint32_t> cuMask = {};            ///< Bit-vector representing the CU mask
+    int gfxKernel = 0;                       ///< Kernel selector: -1=auto, 0=reduce, 1=copy-only
     vector<vector<int>> prefXccTable = {};   ///< 2D table with preferred XCD to use for a specific [src][dst] GPU device
     int seType = 0;                          ///< SubExecutor granularity type (0=threadblock, 1=warp)
     int temporalMode = 0;                    ///< Non-temporal load/store mode 0=none, 1=load, 2=store, 3=both
@@ -234,6 +261,7 @@
   struct NicOptions
   {
     size_t chunkBytes = 1<<30;     ///< How much bytes to transfer at a time
+    int cqPollBatch = 4;           ///< Maximum CQ entries polled per call
     int ibGidIndex = -1;           ///< GID Index for RoCE NICs (-1 is auto)
     uint8_t ibPort = 1;            ///< NIC port number to be used
    int
ipAddressFamily = 4;   ///< 4=IPv4, 6=IPv6 (used for auto GID detection)
@@ -315,6 +343,7 @@
     ErrResult() = default;
 #if defined(__NVCC__)
     ErrResult(cudaError_t err);
+    ErrResult(CUresult err);
 #else
     ErrResult(hipError_t err);
     ErrResult(hsa_status_t err);
@@ -381,30 +410,11 @@
                           vector<Transfer> const& transfers,
                           TestResults& results);

-  /**
-   * Enumeration of implementation attributes
-   */
-  enum IntAttribute
-  {
-    ATR_GFX_MAX_BLOCKSIZE, ///< Maximum blocksize for GFX executor
-    ATR_GFX_MAX_UNROLL,    ///< Maximum unroll factor for GFX executor
-  };

   enum StrAttribute
   {
     ATR_SRC_PREP_DESCRIPTION ///< Description of how source memory is prepared
   };

-  /**
-   * Query attributes (integer)
-   *
-   * @note This allows querying of implementation information such as limits
-   *
-   * @param[in] attribute Attribute to query
-   * @returns Value of the attribute
-   */
-  int GetIntAttribute(IntAttribute attribute);

   /**
    * Query attributes (string)
    *
@@ -547,16 +557,17 @@
   std::string GetHostname(int targetRank = -1);

   /**
-   * @param[in] targetRank Rank to query (-1 for local rank)
-   * @returns The physical pod identifier for the target rank
+   * @param[in] targetRank Rank to query (-1 for local rank)
+   * @returns The unique pod identifier for the target rank, derived from its physical and virtual pod
   **/
-  std::string GetPpodId(int targetRank = -1);
+  int64_t GetPodIdx(int targetRank = -1);

   /**
-   * @param[in] targetRank Rank to query (-1 for local rank)
-   * @returns The virtual pod identifier for the target rank
+   * @param[in] targetRank Remote rank to query
+   * @param[in] sourceRank Base rank to query (-1 for local rank)
+   * @returns Whether source and target ranks belong to the same pod
   **/
-  int GetVpodId(int targetRank = -1);
+  bool IsSamePod(int targetRank, int sourceRank = -1);

   /**
    * @param[in] exeDevice The specific Executor to query
@@ -581,7 +592,7 @@
    */
   ErrResult ParseTransfers(std::string str,
                            std::vector<Transfer>& transfers);
-};
+}
//==========================================================================================
// End of TransferBench API
//==========================================================================================
@@ -599,6 +610,10 @@
   #define hipError_t  cudaError_t
   #define hipEvent_t  cudaEvent_t
   #define hipStream_t cudaStream_t
+  #define hipMemAllocationProp             CUmemAllocationProp
+  #define hipMemGenericAllocationHandle_t  CUmemGenericAllocationHandle
+  #define hipMemAccessDesc                 CUmemAccessDesc
+  #define hipMemFabricHandle_t             CUmemFabricHandle

   // Enumerations
   #define hipDeviceAttributeClockRate      cudaDevAttrClockRate
@@ -607,9 +622,15 @@
   #define hipErrorPeerAccessAlreadyEnabled cudaErrorPeerAccessAlreadyEnabled
   #define hipFuncCachePreferShared         cudaFuncCachePreferShared
   #define hipMemcpyDefault                 cudaMemcpyDefault
+  #define hipMemcpyKind                    cudaMemcpyKind
   #define hipMemcpyDeviceToHost            cudaMemcpyDeviceToHost
   #define hipMemcpyHostToDevice            cudaMemcpyHostToDevice
   #define hipSuccess                       cudaSuccess
+  #define hipMemLocationTypeDevice                CU_MEM_LOCATION_TYPE_DEVICE
+  #define hipMemAllocationTypePinned              CU_MEM_ALLOCATION_TYPE_PINNED
+  #define hipMemHandleTypeFabric                  CU_MEM_HANDLE_TYPE_FABRIC
+  #define hipMemAllocationGranularityRecommended  CU_MEM_ALLOC_GRANULARITY_RECOMMENDED
+  #define hipMemAccessFlagsProtReadWrite          CU_MEM_ACCESS_FLAGS_PROT_READWRITE

   // Functions
   #define hipDeviceCanAccessPeer           cudaDeviceCanAccessPeer
@@ -632,12 +653,26 @@
   #define
hipMallocManaged cudaMallocManaged #define hipMemcpy cudaMemcpy #define hipMemcpyAsync cudaMemcpyAsync + #define hipMemcpyBatchAsync cudaMemcpyBatchAsync #define hipMemset cudaMemset #define hipMemsetAsync cudaMemsetAsync #define hipSetDevice cudaSetDevice #define hipStreamCreate cudaStreamCreate #define hipStreamDestroy cudaStreamDestroy #define hipStreamSynchronize cudaStreamSynchronize + // cu* driver API returns CUresult; cast to cudaError_t so callers can use a single error variable +#define hipMemGetAllocationGranularity(...) ((cudaError_t)cuMemGetAllocationGranularity(__VA_ARGS__)) + #define hipMemCreate(...) ((cudaError_t)cuMemCreate(__VA_ARGS__)) + #define hipMemAddressReserve(...) ((cudaError_t)cuMemAddressReserve(__VA_ARGS__)) + #define hipMemMap(...) ((cudaError_t)cuMemMap(__VA_ARGS__)) + #define hipMemSetAccess(...) ((cudaError_t)cuMemSetAccess(__VA_ARGS__)) + #define hipMemUnmap(...) ((cudaError_t)cuMemUnmap(__VA_ARGS__)) + #define hipMemRelease(...) ((cudaError_t)cuMemRelease(__VA_ARGS__)) + #define hipMemAddressFree(...) ((cudaError_t)cuMemAddressFree(__VA_ARGS__)) + #define hipMemExportToShareableHandle(...) ((cudaError_t)cuMemExportToShareableHandle(__VA_ARGS__)) + #define hipMemImportFromShareableHandle(...) ((cudaError_t)cuMemImportFromShareableHandle(__VA_ARGS__)) + + using gpu_device_ptr = CUdeviceptr; // Define float2 addition operator for NVIDIA platform __device__ inline float2& operator +=(float2& a, const float2& b) @@ -656,42 +691,59 @@ namespace TransferBench a.w += b.w; return a; } +#else + using gpu_device_ptr = void*; #endif // Helper macro functions //========================================================================================== // Macro for collecting CU/SM GFX kernel is running on -#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1150__) || defined(__gfx1151__) || defined(__gfx1200__) || defined(__gfx1201__) -#define GetHwId(hwId) hwId = 0 +#if defined(__GFX9__) + #define GetHwId(hwId) asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (hwId)) +#elif defined(__GFX10__) || defined(__GFX11__) || defined(__GFX12__) + #define GetHwId(hwId) asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID1)" : "=s" (hwId)) #elif defined(__NVCC__) -#define GetHwId(hwId) asm("mov.u32 %0, %smid;" : "=r"(hwId)) + #define GetHwId(hwId) asm("mov.u32 %0, %smid;" : "=r"(hwId)) #else -#define GetHwId(hwId) asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (hwId)); + #define GetHwId(hwId) hwId = 0 #endif // Macro for collecting XCC GFX kernel is running on #if defined(__gfx942__) || defined(__gfx950__) -#define GetXccId(val) asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s" (val)); +#define GetXccId(val) asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s" (val)) +#elif defined(__GFX12__) +#define GetXccId(val) \ + { asm volatile ("s_sendmsg_rtn_b32 %0, 0x87 \n" \ + "s_wait_kmcnt 0" \ + : "=s" (val)); \ + val = ((val >> 16) & 0xF); \ + } #else #define GetXccId(val) val = 0 #endif // Error check macro (NOTE: This will return even for ERR_WARN) -#define ERR_CHECK(cmd) \ - do { \ - ErrResult err = (cmd); \ - if (err.errType != ERR_NONE) \ - return err; \ +#define ERR_CHECK(cmd) \ + do { \ + ErrResult err = (cmd); \ + if (err.errType != ERR_NONE) { \ + err.errMsg += std::string(" [") + __FILE__ + ":" + \ + std::to_string(__LINE__) + " in " + __func__ + "]"; \ + return err; \ + } \ } while (0) // Appends warn/fatal errors to a list, return false if fatal -#define ERR_APPEND(cmd, list) \ - do { \ - 
ErrResult err = (cmd); \ - if (err.errType != ERR_NONE) \ - list.push_back(err); \ - if (err.errType == ERR_FATAL) \ +#define ERR_APPEND(cmd, list) \ + do { \ + ErrResult err = (cmd); \ + if (err.errType != ERR_NONE) { \ + err.errMsg += std::string(" [") + __FILE__ + ":" + \ + std::to_string(__LINE__) + " in " + __func__ + "]"; \ + list.push_back(err); \ + } \ + if (err.errType == ERR_FATAL) \ return false; \ } while (0) @@ -744,9 +796,7 @@ namespace { // Constants //======================================================================================== - int constexpr MAX_BLOCKSIZE = 1024; // Max threadblock size - int constexpr MAX_UNROLL = 8; // Max unroll factor int constexpr MAX_SRCS = 8; // Max srcs per Transfer int constexpr MAX_DSTS = 8; // Max dsts per Transfer int constexpr MEMSET_CHAR = 75; // Value to memset (char) @@ -791,14 +841,15 @@ namespace { * * This supports three possible communication modes - Socket-based, MPI-based, disabled * - * - Will first attempt to use sockets if TB_RANK env var is detected + * - Will first attempt to use sockets when TB_NUM_RANKS is set (>= 2) * - Will then try MPI-based, if compiled with MPI support * - Drop back to single node functionality * - Configuration for socket-based communicator is read via environment variables - * - TB_RANK: Rank of this process (0-based) - * - TB_NUM_RANKS: Total number of processes - * - TB_MASTER_ADDR: IP address of rank 0 + * - TB_NUM_RANKS: Total number of processes (only variable required on rank 0; rank 0 logs how workers should connect) + * - TB_RANK: Rank of this process (0-based); defaults to 0 if unset or empty + * - TB_MASTER_ADDR: Rank 0 address for workers to connect; optional on rank 0 (auto-detected IPv4 after listen) + * - TB_MASTER_IFACE: Optional interface name when auto-detecting rank-0 address (e.g. eth0) * - TB_MASTER_PORT: Port for communication (default: 29500) */ class System @@ -826,6 +877,18 @@ namespace { bool& IsVerbose() { return verbose; } + /** + * Helper logging function that logs only on output ranks + * - In MPI mode - Rank 0 only + * - In socket mode - All ranks unless TB_SINGLE_LOG=1 + */ + void Log(const char* format, ...) 
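A hypothetical launch of the socket communicator documented above (the binary path and preset name are placeholders; only the environment variables come from the code):

//   rank 0:  TB_NUM_RANKS=2 ./TransferBench p2p
//   rank 1:  TB_NUM_RANKS=2 TB_RANK=1 TB_MASTER_ADDR=<rank 0 address> ./TransferBench p2p
//   TB_MASTER_PORT (default 29500) and TB_MASTER_IFACE may additionally be set when the
//   auto-detected rank-0 IPv4 address or interface is not the desired one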
const;
+
+    /**
+     * Helper function that logs Transfers being executed to a config file
+     */
+    void LogTransfers(std::vector<Transfer> const& transfers);
+
     // Communication functions
     /**
      * Barrier that all ranks must arrive at before proceeding
      */
@@ -949,8 +1012,8 @@
     void GetClosestGpusToNic(std::vector<int>& gpuIndices, int nicIndex, int targetRank = -1) const;
     std::string GetHostname(int targetRank) const;
-    std::string GetPpodId(int targetRank) const;
-    int GetVpodId(int targetRank) const;
+    int64_t GetPodIdx(int targetRank) const;
+    bool IsSamePod(int targetRank, int sourceRank) const;
     std::string GetExecutorName(ExeDevice exeDevice) const;
     int NicIsActive(int nicIndex, int targetRank) const;
@@ -977,6 +1040,8 @@
     int rank;
     int numRanks;
     bool verbose = false;
+    bool rankDoesOutput = true;
+    FILE* dumpCfgFile = nullptr;

 #if !defined(__NVCC__)
     std::vector<hsa_agent_t> cpuAgents;
@@ -999,9 +1064,9 @@
     // Topology related
     struct RankTopology
     {
-      char hostname[33];
-      char ppodId[256];
-      int  vpodId;
+      char    hostname[33];
+      char    ppodId[16];
+      int64_t vpodId;

       std::map<ExeType, int> numExecutors;
       std::map<std::pair<ExeType, int>, int> numExecutorSubIndices;
@@ -1018,6 +1083,7 @@
     void SetupSocketCommunicator();
     void SetupMpiCommunicator();
+    void CollectPodMembership(char* ppodId, int64_t& vpodId);
     void GetRankTopology(RankTopology& topo);
     void CollectTopology();
     std::string GetCpuName() const;
@@ -1343,8 +1409,36 @@
     return ERR_NONE;
   }

+#ifdef POD_COMM_ENABLED
+  static ErrResult GetMemAllocationProp(MemDevice const& memDevice, hipMemAllocationProp& prop)
+  {
+    switch (memDevice.memType) {
+    case MEM_CPU: case MEM_CPU_CLOSEST: case MEM_GPU:
+      prop.type = hipMemAllocationTypePinned; break;
+    case MEM_CPU_UNCACHED: case MEM_GPU_UNCACHED:
+#if defined (__NVCC__)
+      return {ERR_FATAL, "Uncached memory type unsupported in CUDA"};
+#else
+      prop.type = hipMemAllocationTypeUncached; break;
+#endif
+    default:
+      return {ERR_FATAL, "Unsupported memory type for pod communication"};
+    }
+
+    prop.requestedHandleTypes = hipMemHandleTypeFabric;
+    // At this point the memory location should always be a device
+    // ERR_CHECK(GetMemLocation(memDevice, prop.location));
+    prop.location.type = hipMemLocationTypeDevice;
+    prop.location.id   = memDevice.memIndex;
+    return ERR_NONE;
+  }
+#endif
+
   // Allocate memory
-  static ErrResult AllocateMemory(MemDevice memDevice, size_t numBytes, void** memPtr, bool isShareable = false)
+  static ErrResult AllocateMemory(MemDevice memDevice, size_t numBytes, void** memPtr,
+                                  size_t* actualBytes = NULL,
+                                  hipMemGenericAllocationHandle_t* memHandle = NULL)
   {
     if (numBytes == 0) {
       return {ERR_FATAL, "Unable to allocate 0 bytes"};
@@ -1352,20 +1446,69 @@
     *memPtr = nullptr;

     MemType const& memType = memDevice.memType;
+    int deviceIdx = memDevice.memIndex;
+    if (memType == MEM_CPU_CLOSEST) {
+      deviceIdx = GetClosestCpuNumaToGpu(memDevice.memIndex);
+    }

-    if (IsCpuMemType(memType)) {
-      // Determine which NUMA device to use
-      int numaIdx = memDevice.memIndex;
-      if (memType == MEM_CPU_CLOSEST) {
-        numaIdx = GetClosestCpuNumaToGpu(memDevice.memIndex);
+    // If memHandle is provided, allocate shareable memory
+    if (memHandle != NULL) {
+#ifdef POD_COMM_ENABLED
+      hipMemAllocationProp prop = {};
+      ERR_CHECK(GetMemAllocationProp(memDevice, prop));
+
+      // Determine recommended allocation granularity
+      size_t granularity;
+      ERR_CHECK(hipMemGetAllocationGranularity(&granularity, &prop,
+                                               hipMemAllocationGranularityRecommended));
+      size_t roundedUpBytes = (numBytes + granularity - 1) / granularity *
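// Illustrative rounding (hypothetical values): numBytes = 1 MiB with a recommended
// granularity of 2 MiB gives (1 MiB + 2 MiB - 1) / 2 MiB * 2 MiB = 2 MiB, i.e. one granule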
granularity; + if (actualBytes != NULL) *actualBytes = roundedUpBytes; + + // Create memory allocation described by properties and size + ERR_CHECK(hipMemCreate(memHandle, roundedUpBytes, &prop, 0)); + + // Reserve a virtual address range for the memory allocation + ERR_CHECK(hipMemAddressReserve((gpu_device_ptr*)memPtr, roundedUpBytes, 0, 0, 0)); + + // Map the allocation handle to the reserved address range + ERR_CHECK(hipMemMap((gpu_device_ptr)*memPtr, roundedUpBytes, 0, *memHandle, 0)); + + // Specify memory access descriptor to enable local read/write + hipMemAccessDesc desc; +// ERR_CHECK(GetMemLocation(memDevice, desc.location)); + desc.location.type = hipMemLocationTypeDevice; + desc.location.id = memDevice.memIndex; + desc.flags = hipMemAccessFlagsProtReadWrite; + + // Set access flags for virtual address range + ERR_CHECK(hipMemSetAccess((gpu_device_ptr)*memPtr, roundedUpBytes, &desc, 1)); + + // Clear the memory + if (IsCpuMemType(memType)) { + memset(*memPtr, 0, roundedUpBytes); + // Check that the allocated pages are actually on the correct NUMA node + ERR_CHECK(CheckPages((char*)*memPtr, roundedUpBytes, deviceIdx)); + } else if (IsGpuMemType(memType)) { + ERR_CHECK(hipSetDevice(memDevice.memIndex)); + ERR_CHECK(hipMemset(*memPtr, 0, numBytes)); + ERR_CHECK(hipDeviceSynchronize()); } + return ERR_NONE; +#else + return {ERR_FATAL, "Unable to allocate sharable memory if not compiled with pod communication support"}; +#endif + } else { + if (actualBytes != NULL) *actualBytes = numBytes; + } + + if (IsCpuMemType(memType)) { // Set NUMA policy prior to call to hipHostMalloc - numa_set_preferred(numaIdx); + numa_set_preferred(deviceIdx); // Allocate host-pinned memory (should respect NUMA mem policy) int flags = 0; -#if !defined(__NVCC__) +#if !defined (__NVCC__) flags |= hipHostMallocNumaUser; #endif if (memType == MEM_CPU || memType == MEM_CPU_CLOSEST) { @@ -1393,12 +1536,12 @@ namespace { #endif #endif } else if (memType == MEM_CPU_UNPINNED) { - *memPtr = numa_alloc_onnode(numBytes, numaIdx); + *memPtr = numa_alloc_onnode(numBytes, deviceIdx); } // Check that the allocated pages are actually on the correct NUMA node memset(*memPtr, 0, numBytes); - ERR_CHECK(CheckPages((char*)*memPtr, numBytes, numaIdx)); + ERR_CHECK(CheckPages((char*)*memPtr, numBytes, deviceIdx)); // Reset to default numa mem policy numa_set_preferred(-1); @@ -1437,30 +1580,44 @@ namespace { } // Deallocate memory - static ErrResult DeallocateMemory(MemType memType, void *memPtr, size_t const bytes) + static ErrResult DeallocateMemory(MemType memType, void *memPtr, size_t const bytes, + hipMemGenericAllocationHandle_t* memHandle = nullptr) { // Avoid deallocating nullptr if (memPtr == nullptr) return {ERR_FATAL, "Attempted to free null pointer for %lu bytes", bytes}; - switch (memType) { - case MEM_CPU: case MEM_CPU_CLOSEST: case MEM_CPU_COHERENT: case MEM_CPU_NONCOHERENT: case MEM_CPU_UNCACHED: - { - ERR_CHECK(hipHostFree(memPtr)); - break; - } - case MEM_CPU_UNPINNED: - { - numa_free(memPtr, bytes); - break; - } - case MEM_GPU : case MEM_GPU_FINE: case MEM_GPU_UNCACHED: case MEM_MANAGED: - { - ERR_CHECK(hipFree(memPtr)); - break; - } - default: - return {ERR_FATAL, "Attempting to deallocate unrecognized memory type (%d)", memType}; + if (memHandle == nullptr || *memHandle == NULL) { + switch (memType) { + case MEM_CPU: case MEM_CPU_CLOSEST: case MEM_CPU_COHERENT: case MEM_CPU_NONCOHERENT: case MEM_CPU_UNCACHED: + { + ERR_CHECK(hipHostFree(memPtr)); + break; + } + case MEM_CPU_UNPINNED: + { + numa_free(memPtr, 
bytes); + break; + } + case MEM_GPU : case MEM_GPU_FINE: case MEM_GPU_UNCACHED: case MEM_MANAGED: + { + ERR_CHECK(hipFree(memPtr)); + break; + } + default: + return {ERR_FATAL, "Attempting to deallocate unrecognized memory type (%d)", memType}; + } + } else { +#ifdef POD_COMM_ENABLED + // Unmap the backing memory of the given virtual address + ERR_CHECK(hipMemUnmap((gpu_device_ptr)memPtr, bytes)); + // Release the backing memory via its handle + ERR_CHECK(hipMemRelease(*memHandle)); + // Free virtual address range reservation + ERR_CHECK(hipMemAddressFree((gpu_device_ptr)memPtr, bytes)); +#else + return {ERR_FATAL, "Unable to deallocate sharable memory if not compiled with pod communication support"}; +#endif } return ERR_NONE; } @@ -1681,7 +1838,7 @@ namespace { { if (GetCommMode() == COMM_NONE) return; if (System::Get().IsVerbose()) { - printf("[INFO] Rank %d checking config consistency\n", GetRank()); + System::Get().Log("[INFO] Rank %d checking config consistency\n", GetRank()); } // To check consistency, compare against rank 0 @@ -1703,10 +1860,14 @@ namespace { // Compare data options { DataOptions data = cfg.data; + // Null out vector members before sizeof-broadcast: vectors carry heap pointers that are + // invalid on other ranks; freeing a remote pointer on scope exit causes a segfault + // These fields are permitted to differ across ranks and are not compared below + decltype(data.fillPattern)().swap(data.fillPattern); + decltype(data.fillCompress)().swap(data.fillCompress); System::Get().Broadcast(root, sizeof(data), &data); // data.alwaysValidate is permitted to be different across ranks - if (data.blockBytes != cfg.data.blockBytes) ADD_ERROR("cfg.data.blockBytes"); if (data.byteOffset != cfg.data.byteOffset) ADD_ERROR("cfg.data.byteOffset"); @@ -1747,10 +1908,14 @@ namespace { // Compare GFX Executor options { GfxOptions gfx = cfg.gfx; + // Null out vector members before sizeof broadcast + decltype(gfx.cuMask)().swap(gfx.cuMask); + decltype(gfx.prefXccTable)().swap(gfx.prefXccTable); System::Get().Broadcast(root, sizeof(gfx), &gfx); if (gfx.blockOrder != cfg.gfx.blockOrder) ADD_ERROR("cfg.gfx.blockOrder"); if (gfx.blockSize != cfg.gfx.blockSize) ADD_ERROR("cfg.gfx.blockSize"); // gfx.cuMask is permitted to be different across ranks + if (gfx.gfxKernel != cfg.gfx.gfxKernel) ADD_ERROR("cfg.gfx.gfxKernel"); // gfx.perfXccTable is permitted to be different across ranks if (gfx.seType != cfg.gfx.seType) ADD_ERROR("cfg.gfx.seType"); if (gfx.temporalMode != cfg.gfx.temporalMode) ADD_ERROR("cfg.gfx.temporalMode"); @@ -1775,6 +1940,7 @@ namespace { NicOptions nic = cfg.nic; System::Get().Broadcast(root, sizeof(nic), &nic); if (nic.chunkBytes != cfg.nic.chunkBytes) ADD_ERROR("cfg.nic.chunkBytes"); + if (nic.cqPollBatch != cfg.nic.cqPollBatch) ADD_ERROR("cfg.nic.cqPollBatch"); // nic.ibGidIndex is permitted to be different across ranks // nic.ibPort is permitted to be different across ranks if (nic.ipAddressFamily != cfg.nic.ipAddressFamily) ADD_ERROR("cfg.nic.ipAddressFamily"); @@ -1789,6 +1955,9 @@ namespace { #undef ADD_ERROR } + // Forward declaration + int GetGpuKernelUnrollIdx(int unroll); + // Validate configuration options - return trues if and only if an fatal error is detected static bool ConfigOptionsHaveErrors(ConfigOptions const& cfg, std::vector& errors) @@ -1827,11 +1996,10 @@ namespace { if (cfg.gfx.useMultiStream && cfg.gfx.blockOrder > 0) errors.push_back({ERR_WARN, "[gfx.blockOrder] will be ignored when running in multi-stream mode"}); - int gfxMaxBlockSize = 
GetIntAttribute(ATR_GFX_MAX_BLOCKSIZE);
-    if (cfg.gfx.blockSize < 0 || cfg.gfx.blockSize % 64 || cfg.gfx.blockSize > gfxMaxBlockSize)
+    if (cfg.gfx.blockSize < 0 || cfg.gfx.blockSize % 64 || cfg.gfx.blockSize > MAX_BLOCKSIZE)
       errors.push_back({ERR_FATAL,
                         "[gfx.blockSize] must be positive multiple of 64 less than or equal to %d",
-                        gfxMaxBlockSize});
+                        MAX_BLOCKSIZE});

     if (cfg.gfx.temporalMode < 0 || cfg.gfx.temporalMode > 3)
       errors.push_back({ERR_FATAL,
@@ -1843,11 +2011,10 @@
                         "[gfx.temporalMode] is not supported on NVIDIA hardware"});
 #endif

-    int gfxMaxUnroll = GetIntAttribute(ATR_GFX_MAX_UNROLL);
-    if (cfg.gfx.unrollFactor < 0 || cfg.gfx.unrollFactor > gfxMaxUnroll)
+    if (GetGpuKernelUnrollIdx(cfg.gfx.unrollFactor) == -1) {
       errors.push_back({ERR_FATAL,
-                        "[gfx.unrollFactor] must be non-negative and less than or equal to %d",
-                        gfxMaxUnroll});
+                        "[gfx.unrollFactor] unroll factor of %d is unsupported", cfg.gfx.unrollFactor});
+    }

     if (cfg.gfx.waveOrder < 0 || cfg.gfx.waveOrder >= 6)
       errors.push_back({ERR_FATAL, "[gfx.waveOrder] must be non-negative and less than 6"});
@@ -1855,6 +2022,10 @@
     if (!(cfg.gfx.wordSize == 1 || cfg.gfx.wordSize == 2 || cfg.gfx.wordSize == 4))
       errors.push_back({ERR_FATAL, "[gfx.wordSize] must be either 1, 2 or 4"});

+    if (cfg.gfx.gfxKernel < -1 || cfg.gfx.gfxKernel >= NUM_GFX_KERNELS)
+      errors.push_back(
+        {ERR_FATAL, "[gfx.gfxKernel] must be -1 for auto, or less than %d", NUM_GFX_KERNELS});
+
     int numGpus = GetNumExecutors(EXE_GPU_GFX);
     int numXccs = GetNumExecutorSubIndices({EXE_GPU_GFX, 0});
     vector<vector<int>> const& table = cfg.gfx.prefXccTable;
@@ -1885,6 +2056,9 @@
     if (cfg.nic.chunkBytes == 0 || (cfg.nic.chunkBytes % 4 != 0)) {
       errors.push_back({ERR_FATAL, "[nic.chunkBytes] must be a non-negative multiple of 4"});
     }
+    if (cfg.nic.cqPollBatch <= 0) {
+      errors.push_back({ERR_FATAL, "[nic.cqPollBatch] must be positive"});
+    }
 #endif

     // NVIDIA specific
@@ -1919,7 +2093,7 @@
     if (GetCommMode() == COMM_NONE) return;

     if (System::Get().IsVerbose()) {
-      printf("[INFO] Rank %d checking transfers consistency\n", GetRank());
+      System::Get().Log("[INFO] Rank %d checking transfers consistency\n", GetRank());
     }

     // To check consistency, compare against rank 0
@@ -1967,6 +2141,18 @@
 #undef ADD_ERROR
   }

+  // Returns true if the given Transfer requires pod communication
+  static bool IsPodTransfer(Transfer const& t)
+  {
+    if (IsCpuExeType(t.exeDevice.exeType) || IsGpuExeType(t.exeDevice.exeType)) {
+      for (auto const& src : t.srcs)
+        if (src.memRank != t.exeDevice.exeRank) return true;
+      for (auto const& dst : t.dsts)
+        if (dst.memRank != t.exeDevice.exeRank) return true;
+    }
+    return false;
+  }
+
   // Validate Transfers to execute - returns true if and only if fatal error detected
   static bool TransfersHaveErrors(ConfigOptions const& cfg,
                                   std::vector<Transfer> const& transfers,
@@ -1981,16 +2167,24 @@
     CheckMultiNodeTransferConsistency(transfers, errors);

     // Per-Transfer checks
+    bool hasFatalError = false;
     for (size_t i = 0; i < transfers.size(); i++) {
       Transfer const& t = transfers[i];
-      if (t.numBytes == 0)
+      if (t.numBytes == 0) {
         errors.push_back({ERR_FATAL, "Transfer %d: Cannot perform 0-byte transfers", i});
+        break;
+      }
+
+      if (t.numBytes % 4) {
+        errors.push_back({ERR_FATAL, "Transfer %d: numBytes (%lu) must be a multiple of 4\n", i, t.numBytes});
+        break;
+      }

       // Each subexecutor is assigned a multiple of cfg.data.blockBytes, however this may
       // mean that some subexecutors might not have any work assigned to them if the
amount to // transfer is small - if (t.exeDevice.exeType == EXE_GPU_GFX || t.exeDevice.exeType == EXE_CPU) { + if (t.exeDevice.exeType == EXE_GPU_GFX || t.exeDevice.exeType == EXE_CPU || t.exeDevice.exeType == EXE_GPU_BDMA) { size_t const N = t.numBytes / sizeof(float); int const targetMultiple = cfg.data.blockBytes / sizeof(float); int const maxSubExecToUse = std::min((size_t)(N + targetMultiple - 1) / targetMultiple, @@ -2003,25 +2197,36 @@ namespace { } // Check sources and destinations - if (t.srcs.empty() && t.dsts.empty()) + if (t.srcs.empty() && t.dsts.empty()) { errors.push_back({ERR_FATAL, "Transfer %d: Must have at least one source or destination", i}); + break; + } for (int j = 0; j < t.srcs.size(); j++) { ErrResult err = CheckMemDevice(t.srcs[j]); - if (err.errType != ERR_NONE) + if (err.errType != ERR_NONE) { errors.push_back({ERR_FATAL, "Transfer %d: SRC %d: %s", i, j, err.errMsg.c_str()}); + hasFatalError = true; + break; + } } + if (hasFatalError) break; + for (int j = 0; j < t.dsts.size(); j++) { ErrResult err = CheckMemDevice(t.dsts[j]); - if (err.errType != ERR_NONE) + if (err.errType != ERR_NONE) { errors.push_back({ERR_FATAL, "Transfer %d: DST %d: %s", i, j, err.errMsg.c_str()}); + hasFatalError = true; + break; + } } + if (hasFatalError) break; // Check executor rank if (t.exeDevice.exeRank < 0 || t.exeDevice.exeRank >= GetNumRanks()) { errors.push_back({ERR_FATAL, "Rank index for executor must be between 0 and %d (instead of %d)", GetNumRanks() - 1, t.exeDevice.exeRank}); - continue; + break; } executors.insert(t.exeDevice); @@ -2030,56 +2235,77 @@ namespace { switch (t.exeDevice.exeType) { case EXE_CPU: - if (t.exeDevice.exeIndex < 0 || t.exeDevice.exeIndex >= numExecutors) + if (t.exeDevice.exeIndex < 0 || t.exeDevice.exeIndex >= numExecutors) { errors.push_back({ERR_FATAL, "Transfer %d: CPU index must be between 0 and %d (instead of %d) for rank %d", i, numExecutors - 1, t.exeDevice.exeIndex, t.exeDevice.exeRank}); + hasFatalError = true; + } break; case EXE_GPU_GFX: if (t.exeDevice.exeIndex < 0 || t.exeDevice.exeIndex >= numExecutors) { errors.push_back({ERR_FATAL, "Transfer %d: GFX index must be between 0 and %d (instead of %d) for rank %d", i, numExecutors - 1, t.exeDevice.exeIndex, t.exeDevice.exeRank}); + hasFatalError = true; + break; } else { if (t.exeSubIndex != -1) { #if defined(__NVCC__) errors.push_back({ERR_FATAL, "Transfer %d: GFX executor subindex not supported on NVIDIA hardware", i}); + hasFatalError = true; #else useSubIndexCount[t.exeDevice]++; int numSubIndices = GetNumExecutorSubIndices(t.exeDevice); - if (t.exeSubIndex >= numSubIndices) + if (t.exeSubIndex >= numSubIndices) { errors.push_back({ERR_FATAL, "Transfer %d: GFX subIndex (XCC) must be between 0 and %d for rank %d", i, numSubIndices - 1, t.exeDevice.exeRank}); + hasFatalError = true; + break; + } #endif } } break; case EXE_GPU_DMA: - if (t.srcs.size() != 1 || t.dsts.size() != 1) { + if (t.srcs.size() != 1) { errors.push_back({ERR_FATAL, - "Transfer %d: DMA executor must have exactly 1 source and 1 destination", i}); + "Transfer %d: DMA executor must have exactly 1 source", i}); + hasFatalError = true; + break; + } + if (t.dsts.size() < 1) { + errors.push_back({ERR_FATAL, + "Transfer %d: DMA executor must have at least 1 destination", i}); + hasFatalError = true; + break; } if (t.exeDevice.exeIndex < 0 || t.exeDevice.exeIndex >= numExecutors) { errors.push_back({ERR_FATAL, "Transfer %d: DMA index must be between 0 and %d (instead of %d) for rank %d", i, numExecutors - 1, 
t.exeDevice.exeIndex, t.exeDevice.exeRank}); - // Cannot proceed with any further checks - continue; + hasFatalError = true; + break; } if (t.exeSubIndex != -1) { #if defined(__NVCC__) errors.push_back({ERR_FATAL, "Transfer %d: DMA executor subindex not supported on NVIDIA hardware", i}); + hasFatalError = true; + break; #else useSubIndexCount[t.exeDevice]++; int numSubIndices = GetNumExecutorSubIndices(t.exeDevice); - if (t.exeSubIndex >= numSubIndices) + if (t.exeSubIndex >= numSubIndices) { errors.push_back({ERR_FATAL, "Transfer %d: DMA subIndex (engine) must be between 0 and %d", i, numSubIndices - 1}); + hasFatalError = true; + break; + } // Check that engine Id exists between agents hsa_agent_t srcAgent, dstAgent; @@ -2087,29 +2313,46 @@ namespace { err = System::Get().GetHsaAgent(t.srcs[0], srcAgent); if (err.errType != ERR_NONE) { errors.push_back(err); - if (err.errType == ERR_FATAL) break; - } - err = System::Get().GetHsaAgent(t.dsts[0], dstAgent); - if (err.errType != ERR_NONE) { - errors.push_back(err); - if (err.errType == ERR_FATAL) break; + if (err.errType == ERR_FATAL) { + hasFatalError = true; + break; + } + } - // Skip check of engine Id mask for self copies - if (srcAgent.handle != dstAgent.handle) { - uint32_t engineIdMask = 0; - err = hsa_amd_memory_copy_engine_status(dstAgent, srcAgent, &engineIdMask); + int numDsts = (int)t.dsts.size(); + for (int dstIdx = 0; dstIdx < numDsts; dstIdx++) { + err = System::Get().GetHsaAgent(t.dsts[dstIdx], dstAgent); if (err.errType != ERR_NONE) { errors.push_back(err); - if (err.errType == ERR_FATAL) break; + if (err.errType == ERR_FATAL) { + hasFatalError = true; + break; + } } - hsa_amd_sdma_engine_id_t sdmaEngineId = (hsa_amd_sdma_engine_id_t)(1U << t.exeSubIndex); - if (!(sdmaEngineId & engineIdMask)) { - errors.push_back({ERR_FATAL, - "Transfer %d: DMA %d.%d does not exist or cannot copy between src/dst", - i, t.exeDevice.exeIndex, t.exeSubIndex}); + + // Skip check of engine Id mask for self copies + if (srcAgent.handle != dstAgent.handle) { + uint32_t engineIdMask = 0; + err = hsa_amd_memory_copy_engine_status(dstAgent, srcAgent, &engineIdMask); + if (err.errType != ERR_NONE) { + errors.push_back(err); + if (err.errType == ERR_FATAL) { + hasFatalError = true; + break; + } + } + hsa_amd_sdma_engine_id_t sdmaEngineId = (hsa_amd_sdma_engine_id_t)(1U << t.exeSubIndex); + if (!(sdmaEngineId & engineIdMask)) { + errors.push_back({ERR_FATAL, + "Transfer %d: DMA %d.%d does not exist or cannot copy between src/dst", + i, t.exeDevice.exeIndex, t.exeSubIndex}); + hasFatalError = true; + break; + } } } + if (hasFatalError) break; #endif } @@ -2132,12 +2375,67 @@ namespace { } } break; + case EXE_GPU_BDMA: +#ifdef BMA_EXEC_ENABLED + if (t.srcs.size() != 1) { + errors.push_back({ERR_FATAL, + "Transfer %d: BMA executor must have exactly 1 source", i}); + hasFatalError = true; + break; + } + if (t.dsts.size() < 1) { + errors.push_back({ERR_FATAL, + "Transfer %d: BMA executor must have at least 1 destination", i}); + hasFatalError = true; + break; + } + + if (t.exeDevice.exeIndex < 0 || t.exeDevice.exeIndex >= numExecutors) { + errors.push_back({ERR_FATAL, + "Transfer %d: BMA index must be between 0 and %d (instead of %d) for rank %d", + i, numExecutors - 1, t.exeDevice.exeIndex, t.exeDevice.exeRank}); + hasFatalError = true; + break; + } + + if (t.exeSubIndex != -1) { + errors.push_back({ERR_FATAL, + "Transfer %d: BMA executor does not support executor subindices (SDMA engine selection)", i}); + hasFatalError = true; + break; + } + + if 
(!IsGpuMemType(t.srcs[0].memType) && !IsGpuMemType(t.dsts[0].memType)) {
+          errors.push_back({ERR_WARN,
+                            "Transfer %d: No GPU memory for source or destination. Copy might not execute on BMA %d",
+                            i, t.exeDevice.exeIndex});
+        } else {
+          if (IsGpuMemType(t.srcs[0].memType)) {
+            if (t.srcs[0].memIndex != t.exeDevice.exeIndex) {
+              errors.push_back({ERR_WARN,
+                                "Transfer %d: BMA executor will likely run on the source memory device (%d) instead of BMA %d",
+                                i, t.srcs[0].memIndex, t.exeDevice.exeIndex});
+            }
+          } else if (t.dsts[0].memIndex != t.exeDevice.exeIndex) {
+            errors.push_back({ERR_WARN,
+                              "Transfer %d: BMA executor will likely run on the destination memory device (%d) instead of BMA %d",
+                              i, t.dsts[0].memIndex, t.exeDevice.exeIndex});
+          }
+        }
+        break;
+#else
+        errors.push_back({ERR_FATAL,
+                          "Transfer %d: BMA executor requires HIP >= 7.1 or CUDA >= 12.8 (hipMemcpyBatchAsync support)", i});
+        hasFatalError = true;
+        break;
+#endif
       case EXE_NIC: case EXE_NIC_NEAREST:
 #ifdef NIC_EXEC_ENABLED
       {
         // NIC Executors can only execute a copy operation
         if (t.srcs.size() != 1 || t.dsts.size() != 1) {
           errors.push_back({ERR_FATAL, "Transfer %d: NIC executor requires single SRC and single DST", i});
+          hasFatalError = true;
           break;
         }
@@ -2149,6 +2447,7 @@
         if (srcMemRank != srcExeRank && dstMemRank != srcExeRank) {
           errors.push_back({ERR_FATAL, "Transfer %d: NIC executor rank (%d) must be same as SRC memory rank (%d) or DST memory rank (%d)",
                             i, srcExeRank, srcMemRank, dstMemRank});
+          hasFatalError = true;
           break;
         }
@@ -2161,8 +2460,12 @@
         if (srcExeDevice.exeIndex < 0 || srcExeDevice.exeIndex >= GetNumExecutors(EXE_NIC, srcExeRank)) {
           errors.push_back({ERR_FATAL, "Transfer %d: Rank %d SRC NIC executor indexes an out-of-range NIC (%d). Detected %d NICs",
                             i, srcExeRank, srcExeDevice.exeIndex, GetNumExecutors(EXE_NIC, srcExeRank)});
+          hasFatalError = true;
+          break;
         } else if (!NicIsActive(srcExeDevice.exeIndex, srcExeDevice.exeRank)) {
           errors.push_back({ERR_FATAL, "Transfer %d: Rank %d SRC NIC executor %d is not active",
                             i, srcExeDevice.exeRank, srcExeDevice.exeIndex});
+          hasFatalError = true;
+          break;
         }

         // The DST NIC executor facilitates the copy but issues no commands
@@ -2174,29 +2477,51 @@
         if (dstExeDevice.exeIndex < 0 || dstExeDevice.exeIndex >= GetNumExecutors(EXE_NIC, dstExeRank)) {
           errors.push_back({ERR_FATAL, "Transfer %d: Rank %d DST NIC executor indexes an out-of-range NIC (%d).
Detected %d NICs", i, dstExeRank, dstExeDevice.exeIndex, GetNumExecutors(EXE_NIC, dstExeRank)}); + hasFatalError = true; + break; } else if (!NicIsActive(dstExeDevice.exeIndex, dstExeDevice.exeRank)) { errors.push_back({ERR_FATAL, "Transfer %d: Rank %d DST NIC executor %d is not active", i, dstExeDevice.exeRank, dstExeDevice.exeIndex}); + hasFatalError = true; + break; } } #else errors.push_back({ERR_FATAL, "Transfer %d: NIC executor is requested but is not available.", i}); + hasFatalError = true; #endif break; } + // Skip further tests if fatal error detected + if (hasFatalError) break; + // Check for multi-node support - // Currently this is not supported for CPU/GPU executors - if (IsCpuExeType(t.exeDevice.exeType) || IsGpuExeType(t.exeDevice.exeType)) { - bool crossRank = false; + if (IsPodTransfer(t)) { +#ifndef POD_COMM_ENABLED + errors.push_back({ERR_FATAL, + "Transfer %d: Cross-rank GPU memory access requires pod communication support (HIP 8.0+)", i}); + hasFatalError = true; + break; +#endif + // In order to support pod communication, the participanting ranks need to be members of the same pod + int exeRank = t.exeDevice.exeRank; + bool samePod = true; + for (auto const& src : t.srcs) { - crossRank |= (src.memRank != t.exeDevice.exeRank); + if (!(samePod = IsSamePod(src.memRank, exeRank))) + break; } - for (auto const& dst : t.dsts) { - crossRank |= (dst.memRank != t.exeDevice.exeRank); + if (samePod) { + for (auto const& dst : t.dsts) { + if (!(samePod = IsSamePod(dst.memRank, exeRank))) + break; + } } - if (crossRank) { + + if (!samePod || IsCpuExeType(t.exeDevice.exeType)) { errors.push_back({ERR_FATAL, "Transfer %d: Executor on rank %d can not access memory across ranks\n", i, t.exeDevice.exeRank}); + break; } } @@ -2205,6 +2530,7 @@ namespace { errors.push_back({ERR_FATAL, "Transfer %d: # of subexecutors must be positive", i}); else totalSubExecs[t.exeDevice] += t.numSubExecs; + } int gpuMaxHwQueues = 4; @@ -2246,6 +2572,7 @@ namespace { "GPU %d specifies XCC on only %d of %d Transfers. " "Must either specific none or all", exeDevice.exeIndex, useSubIndexCount[exeDevice], transferCount[exeDevice]}); + break; } if (cfg.gfx.useMultiStream && transferCount[exeDevice] > gpuMaxHwQueues) { @@ -2263,6 +2590,7 @@ namespace { "DMA %d specifies engine on only %d of %d Transfers. 
" "Must either specific none or all", exeDevice.exeIndex, useSubIndexCount[exeDevice], transferCount[exeDevice]}); + break; } if (transferCount[exeDevice] > gpuMaxHwQueues) { errors.push_back({ERR_WARN, @@ -2277,6 +2605,15 @@ namespace { "DMA %d copies will fallback to blit (GFX) kernels", exeDevice.exeIndex}); break; } + case EXE_GPU_BDMA: + { + if (transferCount[exeDevice] > gpuMaxHwQueues) { + errors.push_back({ERR_WARN, + "BMA %d attempting %d parallel transfers, however GPU_MAX_HW_QUEUES only set to %d", + exeDevice.exeIndex, transferCount[exeDevice], gpuMaxHwQueues}); + } + break; + } default: break; } @@ -2307,19 +2644,24 @@ namespace { int teamIdx; ///< Size of team this sub executor is part of // Outputs - long long startCycle; ///< Start timestamp for in-kernel timing (GPU-GFX executor) - long long stopCycle; ///< Stop timestamp for in-kernel timing (GPU-GFX executor) + int64_t startCycle; ///< Start timestamp for in-kernel timing (GPU-GFX executor) + int64_t stopCycle; ///< Stop timestamp for in-kernel timing (GPU-GFX executor) uint32_t hwId; ///< Hardware ID uint32_t xccId; ///< XCC ID }; // Internal resources allocated per Transfer + typedef hipMemGenericAllocationHandle_t memHandle_t; struct TransferResources { int transferIdx; ///< The associated Transfer size_t numBytes; ///< Number of bytes to Transfer vector srcMem; ///< Source memory vector dstMem; ///< Destination memory + vector srcActualBytes; ///< Actual amount of src memory allocated (after padding) + vector dstActualBytes; ///< Actual amount of dst memory allocated (after padding) + vector srcMemHandle; ///< Memory handles for source memory + vector dstMemHandle; ///< Memory handles for destination memory vector subExecParamCpu; ///< Defines subarrays for each subexecutor vector subExecIdx; ///< Indices into subExecParamGpu int numaNode; ///< NUMA node to use for this Transfer @@ -2329,13 +2671,13 @@ namespace { // For targeted-SDMA #if !defined(__NVCC__) - hsa_agent_t dstAgent; ///< DMA destination memory agent + vector dstAgent; ///< DMA destination memory agents hsa_agent_t srcAgent; ///< DMA source memory agent hsa_signal_t signal; ///< HSA signal for completion hsa_amd_sdma_engine_id_t sdmaEngineId; ///< DMA engine ID #endif -// For IBV executor + // For IBV executor #ifdef NIC_EXEC_ENABLED int srcNicIndex; ///< SRC NIC index int dstNicIndex; ///< DST NIC index @@ -2363,6 +2705,13 @@ namespace { vector>sendWorkRequests; ///< Send work requests per queue pair #endif + // For BMA executor +#ifdef BMA_EXEC_ENABLED + vector batchDsts; ///< Destination pointers (per batch item) + vector batchSrcs; ///< Source pointers (per batch item) + vector batchBytes; ///< Bytes to copy (per batch item) +#endif + // Counters double totalDurationMsec; ///< Total duration for all iterations for this Transfer vector perIterMsec; ///< Duration for each individual iteration @@ -2386,6 +2735,7 @@ namespace { vector startEvents; ///< HIP start timing event vector stopEvents; ///< HIP stop timing event int wallClockRate; ///< (GFX-only) Device wall clock rate + int gfxKernelToUse; ///< (GFX-only) Which GFX kernel to use }; // Structure to track PCIe topology @@ -2548,9 +2898,9 @@ static bool IsConfiguredGid(union ibv_gid const& gid) int numIbvDevices = 0; ibv_device** deviceList = ibv_get_device_list(&numIbvDevices); - // Check for NIC_FILTER + // Check for TB_NIC_FILTER // By default, accept all NIC names - std::string nicFilterPattern = getenv("NIC_FILTER") ? 
getenv("NIC_FILTER") : ".*"; + std::string nicFilterPattern = getenv("TB_NIC_FILTER") ? getenv("TB_NIC_FILTER") : ".*"; if (deviceList && numIbvDevices > 0) { // Loop over each device to collect information @@ -2639,11 +2989,11 @@ static bool IsConfiguredGid(union ibv_gid const& gid) bool isLast = true) { if (!node.address.empty()) { - printf("%s%s%s", prefix.c_str(), (isLast ? "└── " : "├── "), node.address.c_str()); + System::Get().Log("%s%s%s", prefix.c_str(), (isLast ? "└── " : "├── "), node.address.c_str()); if (!node.description.empty()) { - printf("(%s)", node.description.c_str()); + System::Get().Log("(%s)", node.description.c_str()); } - printf("\n"); + System::Get().Log("\n"); } auto const& children = node.children; for (auto it = children.begin(); it != children.end(); ++it) { @@ -2765,7 +3115,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) iss >> std::hex >> domain >> delimiter >> bus >> delimiter >> device >> delimiter >> function; if (iss.fail()) { #ifdef VERBS_DEBUG - printf("Invalid PCIe address format: %s\n", pcieAddress.c_str()); + System::Get().Log("Invalid PCIe address format: %s\n", pcieAddress.c_str()); #endif return -1; } @@ -3049,7 +3399,9 @@ static bool IsConfiguredGid(union ibv_gid const& gid) } } // Create SRC completion queues - IBV_PTR_CALL(rss.srcCompQueue, ibv_create_cq, rss.srcContext, cfg.nic.queueSize, NULL, NULL, 0); + // Ensure CQ size is at least as large as the number of queue pairs to avoid overflow + int srcCQSize = std::max(cfg.nic.queueSize, static_cast(rss.qpCount)); + IBV_PTR_CALL(rss.srcCompQueue, ibv_create_cq, rss.srcContext, srcCQSize, NULL, NULL, 0); // Get SRC port attributes IBV_CALL(ibv_query_port, rss.srcContext, port, &rss.srcPortAttr); // Check for RDMA over Converged Ethernet (RoCE) and update GID index appropriately @@ -3113,7 +3465,9 @@ static bool IsConfiguredGid(union ibv_gid const& gid) } } // Create DST completion queues - IBV_PTR_CALL(rss.dstCompQueue, ibv_create_cq, rss.dstContext, cfg.nic.queueSize, NULL, NULL, 0); + // Ensure CQ size is at least as large as the number of queue pairs to avoid overflow + int dstCQSize = std::max(cfg.nic.queueSize,static_cast(rss.qpCount)); + IBV_PTR_CALL(rss.dstCompQueue, ibv_create_cq, rss.dstContext, dstCQSize, NULL, NULL, 0); // Get DST port attributes IBV_CALL(ibv_query_port, rss.dstContext, port, &rss.dstPortAttr); // Check for RDMA over Converged Ethernet (RoCE) and update GID index appropriately @@ -3145,7 +3499,6 @@ static bool IsConfiguredGid(union ibv_gid const& gid) System::Get().Broadcast(srcMemRank, sizeof(rss.srcPortAttr.link_layer), &rss.srcPortAttr.link_layer); System::Get().Broadcast(dstMemRank, sizeof(rss.dstPortAttr.link_layer), &rss.dstPortAttr.link_layer); if (rss.srcPortAttr.link_layer != rss.dstPortAttr.link_layer) { - printf("[ERROR] Link layer do not match (%d vs %d)\n", rss.srcPortAttr.link_layer, rss.dstPortAttr.link_layer); return {ERR_FATAL, "SRC NIC (%d) [Rank %d] and DST NIC (%d) [Rank %d] do not have the same link layer [%d vs %d]", rss.srcNicIndex, srcMemRank, rss.dstNicIndex, dstMemRank, rss.srcPortAttr.link_layer, rss.dstPortAttr.link_layer}; } @@ -3177,13 +3530,38 @@ static bool IsConfiguredGid(union ibv_gid const& gid) // Move queue pairs to ready-to-receive (RTR), using exchanged connection info // Then move them to read-to-send (RTS) + // Broadcast each rank's result so all ranks fail together rather than + // hanging on the next iteration's Broadcast when qpCount > 1. 
+      struct QpTransitionResult { ErrType errType; bool rtrFailed; };
+      static_assert(std::is_trivially_copyable<QpTransitionResult>::value,
+                    "QpTransitionResult must be trivially copyable for MPI broadcast");
+      QpTransitionResult srcQpResult = {ERR_NONE, false};
       if (GetRank() == srcMemRank) {
-        ERR_CHECK(TransitionQpToRtr(rss.srcQueuePairs[i], dstConnInfo, port, srcIsRoCE, rss.srcPortAttr.active_mtu));
-        ERR_CHECK(TransitionQpToRts(rss.srcQueuePairs[i]));
+        ErrResult err = TransitionQpToRtr(rss.srcQueuePairs[i], dstConnInfo, port, srcIsRoCE, rss.srcPortAttr.active_mtu);
+        srcQpResult.rtrFailed = (err.errType != ERR_NONE);
+        if (err.errType == ERR_NONE) {
+          err = TransitionQpToRts(rss.srcQueuePairs[i]);
+        }
+        srcQpResult.errType = err.errType;
+      }
+      System::Get().Broadcast(srcMemRank, sizeof(srcQpResult), &srcQpResult);
+      if (srcQpResult.errType != ERR_NONE) {
+        return {ERR_FATAL, "SRC rank %d failed to transition QP %d to %s",
+                srcMemRank, i, srcQpResult.rtrFailed ? "RTR" : "RTS"};
       }
+
+      QpTransitionResult dstQpResult = {ERR_NONE, false};
       if (GetRank() == dstMemRank) {
-        ERR_CHECK(TransitionQpToRtr(rss.dstQueuePairs[i], srcConnInfo, port, dstIsRoCE, rss.dstPortAttr.active_mtu));
-        ERR_CHECK(TransitionQpToRts(rss.dstQueuePairs[i]));
+        ErrResult err = TransitionQpToRtr(rss.dstQueuePairs[i], srcConnInfo, port, dstIsRoCE, rss.dstPortAttr.active_mtu);
+        dstQpResult.rtrFailed = (err.errType != ERR_NONE);
+        if (err.errType == ERR_NONE) {
+          err = TransitionQpToRts(rss.dstQueuePairs[i]);
+        }
+        dstQpResult.errType = err.errType;
+      }
+      System::Get().Broadcast(dstMemRank, sizeof(dstQpResult), &dstQpResult);
+      if (dstQpResult.errType != ERR_NONE) {
+        return {ERR_FATAL, "DST rank %d failed to transition QP %d to %s",
+                dstMemRank, i, dstQpResult.rtrFailed ? "RTR" : "RTS"};
       }

       // Prepare scatter-gather element / work request for this queue pair in advance
@@ -3198,10 +3576,10 @@
       auto const lkey = (nicExeRank == srcMemRank ? rss.srcMemRegion->lkey : rss.dstMemRegion->lkey);
       auto const rkey = (nicExeRank == srcMemRank ? dstConnInfo.rkey : srcConnInfo.rkey);
       if (System::Get().IsVerbose()) {
-        printf("[INFO] Transfer %d SubExec %d executed by rank %d NIC %d is %s with %lu chunks\n",
-               rss.transferIdx, i, nicExeRank, nicExeDevice.exeIndex,
-               (opcode == IBV_WR_RDMA_WRITE ? "remote write" : "remote read"),
-               numChunks);
+        System::Get().Log("[INFO] Transfer %d SubExec %d executed by rank %d NIC %d is %s with %lu chunks\n",
+                          rss.transferIdx, i, nicExeRank, nicExeDevice.exeIndex,
+                          (opcode == IBV_WR_RDMA_WRITE ? "remote write" : "remote read"),
+                          numChunks);
       }
       rss.sgePerQueuePair[i].resize(numChunks, {});
       rss.sendWorkRequests[i].resize(numChunks, {});
@@ -3227,8 +3605,8 @@
         wr.wr.rdma.rkey = rkey;

         if (System::Get().IsVerbose()) {
-          printf("[INFO] Transfer %d SubExec %d chunk %lu local %p remote %p of size %lu\n",
-                 rss.transferIdx, i, chunkIdx, (void*)local, (void*)remote, currChunkBytes);
+          System::Get().Log("[INFO] Transfer %d SubExec %d chunk %lu local %p remote %p of size %lu\n",
+                            rss.transferIdx, i, chunkIdx, (void*)local, (void*)remote, currChunkBytes);
         }

         // Increment locations
@@ -3353,16 +3731,16 @@
       std::shuffle(lineTypes.begin(), lineTypes.end(), gen);

       // Apply zero-ing
-      int dumpLines = getenv("DUMP_LINES") ? atoi(getenv("DUMP_LINES")) : 0;
+      int dumpLines = getenv("TB_DUMP_LINES") ?
atoi(getenv("TB_DUMP_LINES")) : 0; if (dumpLines) { - printf("Input pattern 64B line statistics for bufferIdx %d:\n", bufferIdx); - printf("Total lines: %lu\n", numLines); - printf("- 0: Random : %8lu (%8.3f%%)\n", lineCounts[0], 100.0 * lineCounts[0] / (1.0 * numLines)); - printf("- 1: 1B0 : %8lu (%8.3f%%)\n", lineCounts[1], 100.0 * lineCounts[1] / (1.0 * numLines)); - printf("- 2: 2B0 : %8lu (%8.3f%%)\n", lineCounts[2], 100.0 * lineCounts[2] / (1.0 * numLines)); - printf("- 3: 4B0 : %8lu (%8.3f%%)\n", lineCounts[3], 100.0 * lineCounts[3] / (1.0 * numLines)); - printf("- 4: 32B0 : %8lu (%8.3f%%)\n", lineCounts[4], 100.0 * lineCounts[4] / (1.0 * numLines)); + System::Get().Log("Input pattern 64B line statistics for bufferIdx %d:\n", bufferIdx); + System::Get().Log("Total lines: %lu\n", numLines); + System::Get().Log("- 0: Random : %8lu (%8.3f%%)\n", lineCounts[0], 100.0 * lineCounts[0] / (1.0 * numLines)); + System::Get().Log("- 1: 1B0 : %8lu (%8.3f%%)\n", lineCounts[1], 100.0 * lineCounts[1] / (1.0 * numLines)); + System::Get().Log("- 2: 2B0 : %8lu (%8.3f%%)\n", lineCounts[2], 100.0 * lineCounts[2] / (1.0 * numLines)); + System::Get().Log("- 3: 4B0 : %8lu (%8.3f%%)\n", lineCounts[3], 100.0 * lineCounts[3] / (1.0 * numLines)); + System::Get().Log("- 4: 32B0 : %8lu (%8.3f%%)\n", lineCounts[4], 100.0 * lineCounts[4] / (1.0 * numLines)); } for (int line = 0; line < numLines; line++) { @@ -3394,12 +3772,12 @@ static bool IsConfiguredGid(union ibv_gid const& gid) } if (line < dumpLines) { - printf("Line %02d [%d]: ", line, lineTypes[line]); + System::Get().Log("Line %02d [%d]: ", line, lineTypes[line]); for (int j = 63; j >= 0; j--){ - printf("%02x ", linePtr[j]); - if (j % 16 == 0) printf(" "); + System::Get().Log("%02x ", linePtr[j]); + if (j % 16 == 0) System::Get().Log(" "); } - printf("\n"); + System::Get().Log("\n"); } } } else { @@ -3445,6 +3823,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) if (IsCpuMemType(t.dsts[dstIdx].memType) || cfg.data.validateDirect) { output = (rss->dstMem[dstIdx]) + initOffset; } else { + ERR_CHECK(hipSetDevice(t.dsts[dstIdx].memIndex)); ERR_CHECK(hipMemcpy(outputBuffer.data(), (rss->dstMem[dstIdx]) + initOffset, t.numBytes, hipMemcpyDefault)); ERR_CHECK(hipDeviceSynchronize()); output = outputBuffer.data(); @@ -3465,6 +3844,48 @@ static bool IsConfiguredGid(union ibv_gid const& gid) return ERR_NONE; } + // Determine eligibility requirements for a particular GFX kernel + static bool CanUseGfxKernel(int const gpuKernelIdx, + ConfigOptions const& cfg, + vector const& transfers, + ExeInfo const& exeInfo) + { + // GpuReduceKernel always works + if (gpuKernelIdx == GFX_KERNEL_REDUCE) return true; + + // CopyKernel works if all Transfers have at most one SRC / one DST with no warp subexecutors + if (gpuKernelIdx == GFX_KERNEL_COPY) { + if (cfg.gfx.seType != 0) return false; + if (exeInfo.resources.empty()) return false; + for (auto const& rss : exeInfo.resources) { + Transfer const& t = transfers[rss.transferIdx]; + if (t.srcs.size() > 1 || t.dsts.size() > 1) return false; + if (cfg.gfx.useSingleTeam && t.numSubExecs > 1) return false; + } + return true; + } + + return false; + } + + static ErrResult SelectGfxKernel(ConfigOptions const& cfg, vector const& transfers, ExeInfo& exeInfo) + { + // Decide on which GFX kernel to use + // Auto-select - prefer copyKernel if eligible + if (cfg.gfx.gfxKernel == GFX_KERNEL_AUTO) { + exeInfo.gfxKernelToUse = CanUseGfxKernel(GFX_KERNEL_COPY, cfg, transfers, exeInfo) ? 
+  static ErrResult SelectGfxKernel(ConfigOptions const& cfg, vector<Transfer> const& transfers, ExeInfo& exeInfo)
+  {
+    // Decide on which GFX kernel to use
+    // Auto-select - prefer copyKernel if eligible
+    if (cfg.gfx.gfxKernel == GFX_KERNEL_AUTO) {
+      exeInfo.gfxKernelToUse = CanUseGfxKernel(GFX_KERNEL_COPY, cfg, transfers, exeInfo) ? 1 : 0;
+    } else {
+      exeInfo.gfxKernelToUse = cfg.gfx.gfxKernel;
+    }
+
+    // Warn if forcing the copy kernel, but allow the kernel to continue
+    if (cfg.gfx.gfxKernel == GFX_KERNEL_COPY && !CanUseGfxKernel(GFX_KERNEL_COPY, cfg, transfers, exeInfo)) {
+      return {ERR_WARN,
+              "GFX copy kernel forced even though deemed incompatible for current set of Transfers / config"};
+    }
+    return ERR_NONE;
+  }
+
   // Preparation-related functions
   //========================================================================================
@@ -3538,12 +3959,96 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
       }
     }
 
+#ifdef BMA_EXEC_ENABLED
+    // Prepare src/dst pointers for batched DMA executor
+    rss.batchDsts.clear();
+    rss.batchSrcs.clear();
+    rss.batchBytes.clear();
+    if (transfer.exeDevice.exeType == EXE_GPU_BDMA) {
+      for (int i = 0; i < transfer.numSubExecs; ++i) {
+        for (int j = 0; j < (int)rss.dstMem.size(); j++) {
+          rss.batchSrcs.push_back(subExecParam[i].src[0]);
+          rss.batchDsts.push_back(subExecParam[i].dst[j]);
+          rss.batchBytes.push_back(subExecParam[i].N * sizeof(float));
+        }
+      }
+    }
+#endif
+
     // Clear counters
     rss.totalDurationMsec = 0.0;
 
     return ERR_NONE;
   }
 
+  static ErrResult ExchangeMemory(MemDevice const& memDevice, ExeDevice const& exeDevice, size_t* pActualBytes,
+                                  float** memPtr, hipMemGenericAllocationHandle_t* memHandle)
+  {
+    // Pass this pointer to all ranks (Used for pointer arithmetic, not dereferenced on non-local ranks)
+    // NOTE: This will be overwritten on executor rank if pod communication is required
+    System::Get().Broadcast(memDevice.memRank, sizeof(*memPtr), memPtr);
+
+    // Broadcast actualBytes from owning rank so importing rank gets the correct (rounded-up) size
+    System::Get().Broadcast(memDevice.memRank, sizeof(*pActualBytes), pActualBytes);
+
+    // If pod communication is required, export/import fabric handle
+    if (memDevice.memRank != exeDevice.exeRank && IsGpuExeType(exeDevice.exeType)) {
+#ifdef POD_COMM_ENABLED
+      // mem rank exports to shareable fabric handle; broadcast handle + status so all
+      // ranks fail together instead of hanging on the next collective if export fails
+      hipMemFabricHandle_t fabricHandle = {};
+      hipError_t exportErr = hipSuccess;
+      const char* exportStep = "hipSetDevice";
+      if (memDevice.memRank == GetRank()) {
+        exportErr = hipSetDevice(memDevice.memIndex);
+        if (exportErr == hipSuccess) {
+          exportStep = "hipMemExportToShareableHandle";
+          exportErr = hipMemExportToShareableHandle(&fabricHandle, *memHandle, hipMemHandleTypeFabric, 0);
+        }
+      }
+
+      System::Get().Broadcast(memDevice.memRank, sizeof(hipMemFabricHandle_t), &fabricHandle);
+      System::Get().Broadcast(memDevice.memRank, sizeof(hipError_t), &exportErr);
+      if (exportErr != hipSuccess) {
+        return {ERR_FATAL, "HIP Error in %s during fabric handle export: %s", exportStep, hipGetErrorString(exportErr)};
+      }
+
+      // exe rank imports the fabric handle; broadcast result so all ranks fail together
+      hipError_t importErr = hipSuccess;
+      const char* importStep = "hipSetDevice";
+      if (exeDevice.exeRank == GetRank()) {
+        importErr = hipSetDevice(exeDevice.exeIndex);
+        if (importErr == hipSuccess) {
+          importStep = "hipMemImportFromShareableHandle";
+          importErr = hipMemImportFromShareableHandle(memHandle, (void*)&fabricHandle, hipMemHandleTypeFabric);
+        }
+        if (importErr == hipSuccess) {
+          importStep = "hipMemAddressReserve";
+          importErr = hipMemAddressReserve((gpu_device_ptr*)memPtr, *pActualBytes, 0, 0, 0);
+        }
+        if (importErr == hipSuccess) {
+          importStep = "hipMemMap";
+          importErr = hipMemMap((gpu_device_ptr)*memPtr, *pActualBytes, 0, *memHandle, 0);
+        }
+        if (importErr == hipSuccess) {
+          importStep = "hipMemSetAccess";
+          hipMemAccessDesc desc;
+          desc.location = {hipMemLocationTypeDevice, exeDevice.exeIndex};
+          desc.flags    = hipMemAccessFlagsProtReadWrite;
+          importErr = hipMemSetAccess((gpu_device_ptr)*memPtr, *pActualBytes, &desc, 1);
+        }
+      }
+      System::Get().Broadcast(exeDevice.exeRank, sizeof(hipError_t), &importErr);
+      if (importErr != hipSuccess) {
+        return {ERR_FATAL, "HIP Error in %s during fabric handle import: %s", importStep, hipGetErrorString(importErr)};
+      }
+#else
+      return {ERR_FATAL, "Unable to export/import fabric handle without compiling with pod communication support"};
+#endif
+    }
+    return ERR_NONE;
+  }
+
   // Prepare each executor
   // Allocates memory for src/dst, prepares subexecutors, executor-specific data structures
   static ErrResult PrepareExecutor(ConfigOptions const& cfg,
@@ -3554,8 +4059,8 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
     exeInfo.totalDurationMsec = 0.0;
     int const localRank = GetRank();
     if (System::Get().IsVerbose()) {
-      printf("[INFO] Rank %d preparing executor (%c%d on Rank %d)\n",
-             localRank, ExeTypeStr[exeDevice.exeType], exeDevice.exeIndex, exeDevice.exeRank);
+      System::Get().Log("[INFO] Rank %d preparing executor (%c%d on Rank %d)\n",
+                        localRank, ExeTypeStr[exeDevice.exeType], exeDevice.exeIndex, exeDevice.exeRank);
     }
 
     // Loop over each transfer this executor is involved in
@@ -3564,12 +4069,14 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
       rss.numBytes = t.numBytes;
 
       if (System::Get().IsVerbose()) {
-        printf("[INFO] Rank %d preparing transfer %d (%lu SRC %lu DST)\n",
-               localRank, rss.transferIdx, t.srcs.size(), t.dsts.size());
+        System::Get().Log("[INFO] Rank %d preparing transfer %d (%lu SRC %lu DST)\n",
+                          localRank, rss.transferIdx, t.srcs.size(), t.dsts.size());
       }
 
       // Allocate source memory
       rss.srcMem.resize(t.srcs.size());
+      rss.srcActualBytes.resize(t.srcs.size());
+      rss.srcMemHandle.resize(t.srcs.size(), NULL);
       for (int iSrc = 0; iSrc < t.srcs.size(); ++iSrc) {
         MemDevice const& srcMemDevice = t.srcs[iSrc];
 
@@ -3584,16 +4091,21 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
         }
 
         // Allocate source memory (on the correct rank)
+        bool requiresFabricHandle = (srcMemDevice.memRank != exeDevice.exeRank) && IsGpuExeType(exeDevice.exeType);
         if (srcMemDevice.memRank == localRank) {
-          ERR_CHECK(AllocateMemory(srcMemDevice, t.numBytes + cfg.data.byteOffset, (void**)&rss.srcMem[iSrc]));
+          ERR_CHECK(AllocateMemory(srcMemDevice, t.numBytes + cfg.data.byteOffset, (void**)&rss.srcMem[iSrc],
+                                   &rss.srcActualBytes[iSrc], requiresFabricHandle ? &rss.srcMemHandle[iSrc] : nullptr));
         }
-        // Pass this pointer to all ranks (Used for pointer arithmetic, not defererenced on non-local ranks)
-        System::Get().Broadcast(srcMemDevice.memRank, sizeof(rss.srcMem[iSrc]), &rss.srcMem[iSrc]);
+        // Exchange memory pointer across ranks
+        ERR_CHECK(ExchangeMemory(srcMemDevice, exeDevice, &rss.srcActualBytes[iSrc],
+                                 &rss.srcMem[iSrc], &rss.srcMemHandle[iSrc]));
       }
&rss.srcMemHandle[iSrc] : nullptr)); } - // Pass this pointer to all ranks (Used for pointer arithmetic, not defererenced on non-local ranks) - System::Get().Broadcast(srcMemDevice.memRank, sizeof(rss.srcMem[iSrc]), &rss.srcMem[iSrc]); + // Exchange memory pointer across ranks + ERR_CHECK(ExchangeMemory(srcMemDevice, exeDevice, &rss.srcActualBytes[iSrc], + &rss.srcMem[iSrc], &rss.srcMemHandle[iSrc])); } // Allocate destination memory rss.dstMem.resize(t.dsts.size()); + rss.dstActualBytes.resize(t.dsts.size()); + rss.dstMemHandle.resize(t.dsts.size(), NULL); for (int iDst = 0; iDst < t.dsts.size(); ++iDst) { MemDevice const& dstMemDevice = t.dsts[iDst]; @@ -3607,11 +4119,15 @@ static bool IsConfiguredGid(union ibv_gid const& gid) } // Allocate destination memory (on the correct rank) + bool requiresFabricHandle = (dstMemDevice.memRank != exeDevice.exeRank) && IsGpuExeType(exeDevice.exeType); if (dstMemDevice.memRank == localRank) { - ERR_CHECK(AllocateMemory(dstMemDevice, t.numBytes + cfg.data.byteOffset, (void**)&rss.dstMem[iDst])); + ERR_CHECK(AllocateMemory(dstMemDevice, t.numBytes + cfg.data.byteOffset, (void**)&rss.dstMem[iDst], + &rss.dstActualBytes[iDst], requiresFabricHandle ? &rss.dstMemHandle[iDst] : NULL)); } - // Pass this pointer to all ranks (Used for pointer arithmetic, not defererenced on non-local ranks) - System::Get().Broadcast(dstMemDevice.memRank, sizeof(rss.dstMem[iDst]), &rss.dstMem[iDst]); + + // Exchange memory pointer across ranks + ERR_CHECK(ExchangeMemory(dstMemDevice, exeDevice, &rss.dstActualBytes[iDst], + &rss.dstMem[iDst], &rss.dstMemHandle[iDst])); } // Prepare HSA DMA copy specific resources @@ -3620,8 +4136,12 @@ static bool IsConfiguredGid(union ibv_gid const& gid) // Collect HSA agent information hsa_amd_pointer_info_t info; info.size = sizeof(info); - ERR_CHECK(hsa_amd_pointer_info(rss.dstMem[0], &info, NULL, NULL, NULL)); - rss.dstAgent = info.agentOwner; + int numDst = (int)rss.dstMem.size(); + rss.dstAgent.resize(numDst); + for (int dstIdx = 0; dstIdx < numDst; dstIdx++) { + ERR_CHECK(hsa_amd_pointer_info(rss.dstMem[dstIdx], &info, NULL, NULL, NULL)); + rss.dstAgent[dstIdx] = info.agentOwner; + } ERR_CHECK(hsa_amd_pointer_info(rss.srcMem[0], &info, NULL, NULL, NULL)); rss.srcAgent = info.agentOwner; @@ -3639,11 +4159,12 @@ static bool IsConfiguredGid(union ibv_gid const& gid) } // Prepare additional requirements for GPU-based executors - if ((exeDevice.exeType == EXE_GPU_GFX || exeDevice.exeType == EXE_GPU_DMA) && exeDevice.exeRank == localRank) { + if ((exeDevice.exeType == EXE_GPU_GFX || exeDevice.exeType == EXE_GPU_DMA || exeDevice.exeType == EXE_GPU_BDMA) + && exeDevice.exeRank == localRank) { ERR_CHECK(hipSetDevice(exeDevice.exeIndex)); // Determine how many streams to use - int const numStreamsToUse = (exeDevice.exeType == EXE_GPU_DMA || + int const numStreamsToUse = (exeDevice.exeType == EXE_GPU_DMA || exeDevice.exeType == EXE_GPU_BDMA || (exeDevice.exeType == EXE_GPU_GFX && cfg.gfx.useMultiStream)) ? exeInfo.resources.size() : 1; exeInfo.streams.resize(numStreamsToUse); @@ -3757,6 +4278,22 @@ static bool IsConfiguredGid(union ibv_gid const& gid) return {ERR_FATAL, "RDMA executor is not supported"}; #endif } + + // Check that GPU wallclock rate is non-zero + if (exeDevice.exeType == EXE_GPU_GFX && exeInfo.wallClockRate == 0) { + if (getenv("TB_WALLCLOCK_RATE")) { + exeInfo.wallClockRate = atoi(getenv("TB_WALLCLOCK_RATE")); + return {ERR_WARN, + "GPU %d wallclock rate query returned 0 unexpectedly. 
+      } else {
+        exeInfo.wallClockRate = 100000;
+        return {ERR_WARN,
+                "GPU %d wallclock rate query returned 0 unexpectedly. Setting to %d instead. Use TB_WALLCLOCK_RATE to customize",
+                exeDevice.exeIndex, exeInfo.wallClockRate};
+      }
+    }
+
     return ERR_NONE;
   }
 
@@ -3778,14 +4315,30 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
       // Deallocate source memory
       for (int iSrc = 0; iSrc < t.srcs.size(); ++iSrc) {
         if (t.srcs[iSrc].memRank == localRank) {
-          ERR_CHECK(DeallocateMemory(t.srcs[iSrc].memType, rss.srcMem[iSrc], t.numBytes + cfg.data.byteOffset));
+          ERR_CHECK(DeallocateMemory(t.srcs[iSrc].memType, rss.srcMem[iSrc],
+                                     rss.srcActualBytes[iSrc],
+                                     &rss.srcMemHandle[iSrc]));
+        } else if (exeDevice.exeRank == localRank && rss.srcMemHandle[iSrc] != 0) {
+#ifdef POD_COMM_ENABLED
+          ERR_CHECK(hipMemUnmap((gpu_device_ptr)rss.srcMem[iSrc], rss.srcActualBytes[iSrc]));
+          ERR_CHECK(hipMemRelease(rss.srcMemHandle[iSrc]));
+          ERR_CHECK(hipMemAddressFree((gpu_device_ptr)rss.srcMem[iSrc], rss.srcActualBytes[iSrc]));
+#endif
         }
       }
 
       // Deallocate destination memory
       for (int iDst = 0; iDst < t.dsts.size(); ++iDst) {
         if (t.dsts[iDst].memRank == localRank) {
-          ERR_CHECK(DeallocateMemory(t.dsts[iDst].memType, rss.dstMem[iDst], t.numBytes + cfg.data.byteOffset));
+          ERR_CHECK(DeallocateMemory(t.dsts[iDst].memType, rss.dstMem[iDst],
+                                     rss.dstActualBytes[iDst],
+                                     &rss.dstMemHandle[iDst]));
+        } else if (exeDevice.exeRank == localRank && rss.dstMemHandle[iDst] != 0) {
+#ifdef POD_COMM_ENABLED
+          ERR_CHECK(hipMemUnmap((gpu_device_ptr)rss.dstMem[iDst], rss.dstActualBytes[iDst]));
+          ERR_CHECK(hipMemRelease(rss.dstMemHandle[iDst]));
+          ERR_CHECK(hipMemAddressFree((gpu_device_ptr)rss.dstMem[iDst], rss.dstActualBytes[iDst]));
+#endif
         }
       }
 
@@ -3805,7 +4358,8 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
     }
 
     // Teardown additional requirements for GPU-based executors
-    if ((exeDevice.exeType == EXE_GPU_GFX || exeDevice.exeType == EXE_GPU_DMA) && exeDevice.exeRank == localRank) {
+    if ((exeDevice.exeType == EXE_GPU_GFX || exeDevice.exeType == EXE_GPU_DMA || exeDevice.exeType == EXE_GPU_BDMA)
+        && exeDevice.exeRank == localRank) {
       for (auto stream : exeInfo.streams)
         ERR_CHECK(hipStreamDestroy(stream));
       if (cfg.gfx.useHipEvents || cfg.dma.useHipEvents) {
@@ -3855,7 +4409,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
 
       // Add a dummy check to ensure the read is not optimized out
       if (sum != sum) {
-        printf("[ERROR] Nan detected\n");
+        System::Get().Log("[ERROR] Nan detected\n");
       }
     } else {
       for (int i = 0; i < numDsts; ++i)
@@ -3977,18 +4531,24 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
     }
     // poll for completions
     size_t completedTransfers = 0;
+    int pollBatch = std::max(1, cfg.nic.cqPollBatch);
+    std::vector<ibv_wc> wc((size_t)pollBatch);
+    ibv_wc* wc_array = wc.data();
     while (completedTransfers < transferCount) {
       for (auto i = 0; i < transferCount; i++) {
         if (receivedQPs[i] < exeInfo.resources[i].qpCount) {
           auto& rss = exeInfo.resources[i];
           // Poll the completion queue until all queue pairs are complete
           // The order of completion doesn't matter because this completion queue is dedicated to this Transfer
-          ibv_wc wc;
-          int nc = ibv_poll_cq(rss.srcIsExeNic ? rss.srcCompQueue : rss.dstCompQueue, 1, &wc);
+          // Use batch polling to drain multiple completions at once for better efficiency
+          int nc = ibv_poll_cq(rss.srcIsExeNic ? rss.srcCompQueue : rss.dstCompQueue, pollBatch, wc_array);
           if (nc > 0) {
-            receivedQPs[i]++;
-            if (wc.status != IBV_WC_SUCCESS) {
-              return {ERR_FATAL, "Transfer %d: Received unsuccessful work completion [status code %d]", rss.transferIdx, wc.status};
+            // Process all completions in the batch
+            for (int j = 0; j < nc; j++) {
+              if (wc_array[j].status != IBV_WC_SUCCESS) {
+                return {ERR_FATAL, "Transfer %d: Received unsuccessful work completion [status code %d]", rss.transferIdx, wc_array[j].status};
+              }
+              receivedQPs[i]++;
+            }
           } else if (nc < 0) {
             return {ERR_FATAL, "Transfer %d: Received negative work completion", rss.transferIdx};
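
The batched drain pattern in the hunk above can be summarized in isolation as follows (a self-contained sketch assuming a single CQ and a caller-chosen batch size; error handling elided):

    #include <vector>
    #include <infiniband/verbs.h>

    // Drain up to `batch` completions per ibv_poll_cq call instead of one at a time.
    // Returns the number of successful completions, or -1 on any failure.
    int DrainCompletions(ibv_cq* cq, int batch)
    {
      std::vector<ibv_wc> wcs(batch);
      int done = 0;
      int nc;
      while ((nc = ibv_poll_cq(cq, batch, wcs.data())) > 0) {
        for (int j = 0; j < nc; j++)
          if (wcs[j].status != IBV_WC_SUCCESS) return -1;
        done += nc;
      }
      return (nc < 0) ? -1 : done;
    }
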
@@ -4143,7 +4703,166 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
     }
   }
 
-  // Kernel for GFX execution
+  // Simplified Kernel for GFX execution for copies only
+  template <int LAUNCH_BOUND, int UNROLL, typename PACKED_FLOAT, int TEMPORAL>
+  __global__ void __launch_bounds__(LAUNCH_BOUND)
+    GpuCopyKernel(SubExecParam* params, int seType, int waveOrder, int numSubIterations)
+  {
+    int64_t startCycle;
+    // For warp-level, each warp's first thread records timing; for threadblock-level, only first thread of block
+    bool shouldRecordTiming = (seType == 1) ? (threadIdx.x % warpSize == 0) : (threadIdx.x == 0);
+    if (shouldRecordTiming) startCycle = GetTimestamp();
+
+    // seType: 0=threadblock, 1=warp
+    int subExecIdx;
+    if (seType == 0) {
+      // Threadblock-level: each threadblock is a subexecutor
+      subExecIdx = blockIdx.y;
+    } else {
+      // Warp-level: each warp is a subexecutor
+      int warpIdx = threadIdx.x / warpSize;
+      int warpsPerBlock = blockDim.x / warpSize;
+      subExecIdx = blockIdx.y * warpsPerBlock + warpIdx;
+    }
+
+    SubExecParam& p = params[subExecIdx];
+
+    // For warp-level dispatch, inactive warps should return early
+    if (seType == 1 && p.N == 0) return;
+
+    // Filter by XCC
+#if !defined(__NVCC__)
+    int32_t xccId;
+    GetXccId(xccId);
+    if (p.preferredXccId != -1 && xccId != p.preferredXccId) return;
+#endif
+
+    // Collect data information
+    bool hasSrc = p.numSrcs > 0;
+    bool hasDst = p.numDsts > 0;
+    PACKED_FLOAT const* __restrict__ srcFloatPacked = (PACKED_FLOAT const*)p.src[0];
+    PACKED_FLOAT*       __restrict__ dstFloatPacked = (PACKED_FLOAT*)p.dst[0];
+
+    // Operate on wavefront granularity
+    int32_t const nTeams  = p.teamSize;  // Number of threadblocks working together on this subarray
+    int32_t const teamIdx = p.teamIdx;   // Index of this threadblock within the team
+    int32_t nWaves, waveIdx;
+    if (seType == 0) {
+      // Threadblock-level: all wavefronts in block work together
+      nWaves  = blockDim.x / warpSize;   // Number of wavefronts within this threadblock
+      waveIdx = threadIdx.x / warpSize;  // Index of this wavefront within the threadblock
+    } else {
+      // Warp-level: each warp works independently
+      nWaves  = 1;
+      waveIdx = 0;
+    }
+    int32_t const tIdx = threadIdx.x % warpSize;  // Thread index within wavefront
+
+    size_t const numPackedFloat = p.N / (sizeof(PACKED_FLOAT)/sizeof(float));
+
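+    // Aside: the waveOrder switch below chooses how the unroll (U), wavefront (W),
+    // and team/CU (C) dimensions are interleaved when walking the buffer. As a toy
+    // illustration of case 0 (U,W,C) with UNROLL=2, nWaves=2, nTeams=2 (values
+    // illustrative only, not part of the patch):
+    //   unrlStride = 1                  -> adjacent elements differ in unroll slot
+    //   waveStride = UNROLL        (=2) -> next wavefront starts 2 elements later
+    //   teamStride = UNROLL*nWaves (=4) -> next team starts 4 elements later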
+    int32_t teamStride, waveStride, unrlStride, teamStride2, waveStride2;
+    switch (waveOrder) {
+    case 0: /* U,W,C */ unrlStride = 1; waveStride = UNROLL; teamStride = UNROLL * nWaves; teamStride2 = nWaves; waveStride2 = 1;      break;
+    case 1: /* U,C,W */ unrlStride = 1; teamStride = UNROLL; waveStride = UNROLL * nTeams; teamStride2 = 1;      waveStride2 = nTeams; break;
+    case 2: /* W,U,C */ waveStride = 1; unrlStride = nWaves; teamStride = nWaves * UNROLL; teamStride2 = nWaves; waveStride2 = 1;      break;
+    case 3: /* W,C,U */ waveStride = 1; teamStride = nWaves; unrlStride = nWaves * nTeams; teamStride2 = nWaves; waveStride2 = 1;      break;
+    case 4: /* C,U,W */ teamStride = 1; unrlStride = nTeams; waveStride = nTeams * UNROLL; teamStride2 = 1;      waveStride2 = nTeams; break;
+    case 5: /* C,W,U */ teamStride = 1; waveStride = nTeams; unrlStride = nTeams * nWaves; teamStride2 = 1;      waveStride2 = nTeams; break;
+    }
+
+    int subIterations = 0;
+    while (1) {
+      // First loop: Each wavefront in the team works on UNROLL PACKED_FLOAT per thread
+      size_t const loop1Stride = nTeams * nWaves * UNROLL * warpSize;
+      size_t const loop1Limit  = numPackedFloat / loop1Stride * loop1Stride;
+      {
+        PACKED_FLOAT val[UNROLL];
+        if (!hasSrc) {
+          #pragma unroll
+          for (int u = 0; u < UNROLL; u++)
+            val[u] = MemsetVal<PACKED_FLOAT>();
+        }
+
+        for (size_t idx = (teamIdx * teamStride + waveIdx * waveStride) * warpSize + tIdx; idx < loop1Limit; idx += loop1Stride) {
+          // Read sources into memory and accumulate in registers
+          if (hasSrc) {
+            #pragma unroll
+            for (int u = 0; u < UNROLL; u++)
+              Load(&srcFloatPacked[idx + u * unrlStride * warpSize], val[u]);
+          }
+
+          // Write accumulation to all outputs
+          if (hasDst) {
+            #pragma unroll
+            for (int u = 0; u < UNROLL; u++)
+              Store(val[u], &dstFloatPacked[idx + u * unrlStride * warpSize]);
+          }
+        }
+      }
+
+      // Second loop: Deal with remaining PACKED_FLOAT
+      {
+        if (loop1Limit < numPackedFloat) {
+          PACKED_FLOAT val;
+          if (!hasSrc) val = MemsetVal<PACKED_FLOAT>();
+
+          size_t const loop2Stride = nTeams * nWaves * warpSize;
+          for (size_t idx = loop1Limit + (teamIdx * teamStride2 + waveIdx * waveStride2) * warpSize + tIdx;
+               idx < numPackedFloat; idx += loop2Stride) {
+            if (hasSrc) {
+              Load(&srcFloatPacked[idx], val);
+            }
+            if (hasDst) {
+              Store(val, &dstFloatPacked[idx]);
+            }
+          }
+        }
+      }
+
+      // Third loop: Deal with remaining floats
+      {
+        if (numPackedFloat * (sizeof(PACKED_FLOAT)/sizeof(float)) < p.N) {
+          float val;
+          if (!hasSrc) val = MemsetVal<float>();
+
+          size_t const loop3Stride = nTeams * nWaves * warpSize;
+          for (size_t idx = numPackedFloat * (sizeof(PACKED_FLOAT)/sizeof(float)) + (teamIdx * teamStride2 + waveIdx * waveStride2) * warpSize + tIdx;
+               idx < p.N; idx += loop3Stride) {
+            if (hasSrc) {
+              Load(&p.src[0][idx], val);
+            }
+
+            if (hasDst) {
+              Store(val, &p.dst[0][idx]);
+            }
+          }
+        }
+      }
+      // Allows for numSubIterations == 0 to run infinitely
+      if (++subIterations == numSubIterations) break;
+    }
+
+    // Wait for all threads to finish
+    if (seType == 1) {
+      // For warp-level, sync within warp only
+#if defined(__HIP_PLATFORM_AMD__) && (HIP_VERSION_MAJOR < 7)
+      __builtin_amdgcn_wave_barrier();
+#else
+      __syncwarp();
+#endif
+    } else {
+      // For threadblock-level, sync all threads
+      __syncthreads();
+    }
+
+    if (shouldRecordTiming) {
+      p.stopCycle  = GetTimestamp();
+      p.startCycle = startCycle;
+      GetHwId(p.hwId);
+      GetXccId(p.xccId);
+    }
+  }
+
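
Both GFX kernels record per-subexecutor start/stop timestamps; the host later converts cycle deltas into milliseconds using the wallclock rate queried at prepare time. A minimal sketch of that conversion (assuming, consistent with the 100000 fallback above, that wallClockRate is in kHz, i.e. ticks per millisecond):

    // Sketch: convert device wallclock ticks to milliseconds.
    double CyclesToMsec(int64_t startCycle, int64_t stopCycle, int wallClockRate)
    {
      return (stopCycle - startCycle) / static_cast<double>(wallClockRate);
    }
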
+  // Kernel for GFX execution
   template <int LAUNCH_BOUND, int UNROLL, typename PACKED_FLOAT, int TEMPORAL>
   __global__ void __launch_bounds__(LAUNCH_BOUND)
     GpuReduceKernel(SubExecParam* params, int seType, int waveOrder, int numSubIterations)
@@ -4295,26 +5014,25 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
         }
       }
     }
-
+    // Allows for numSubIterations == 0 to run infinitely
     if (++subIterations == numSubIterations) break;
   }
 
   // Wait for all threads to finish
   if (seType == 1) {
     // For warp-level, sync within warp only
-  #if defined(__HIP_PLATFORM_AMD__) && (HIP_VERSION_MAJOR < 7)
+#if defined(__HIP_PLATFORM_AMD__) && (HIP_VERSION_MAJOR < 7)
     __builtin_amdgcn_wave_barrier();
-  #else
+#else
     __syncwarp();
-  #endif
+#endif
   } else {
     // For threadblock-level, sync all threads
     __syncthreads();
   }
 
   if (shouldRecordTiming) {
-    __threadfence_system();
     p.stopCycle  = GetTimestamp();
     p.startCycle = startCycle;
     GetHwId(p.hwId);
@@ -4322,31 +5040,68 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
   }
 }
 
+  // Must match ordering in GfxKernelType
+#define GPU_KERNEL_KERNEL_DECL(LAUNCH_BOUND, UNROLL, DWORD, TEMPORAL)    \
+  {GpuReduceKernel<LAUNCH_BOUND, UNROLL, DWORD, TEMPORAL>,               \
+   GpuCopyKernel<LAUNCH_BOUND, UNROLL, DWORD, TEMPORAL>}
+
+  // Must match mapping in GetGpuKernelTemporalIdx
+  constexpr int KERN_TEMPORALS = 4;
 #define GPU_KERNEL_TEMPORAL_DECL(LAUNCH_BOUND, UNROLL, DWORD)            \
-  {GpuReduceKernel<LAUNCH_BOUND, UNROLL, DWORD, TEMPORAL_NONE>,          \
-   GpuReduceKernel<LAUNCH_BOUND, UNROLL, DWORD, TEMPORAL_LOAD>,          \
-   GpuReduceKernel<LAUNCH_BOUND, UNROLL, DWORD, TEMPORAL_STORE>,         \
-   GpuReduceKernel<LAUNCH_BOUND, UNROLL, DWORD, TEMPORAL_BOTH>}
+  {GPU_KERNEL_KERNEL_DECL(LAUNCH_BOUND, UNROLL, DWORD, TEMPORAL_NONE),   \
+   GPU_KERNEL_KERNEL_DECL(LAUNCH_BOUND, UNROLL, DWORD, TEMPORAL_LOAD),   \
+   GPU_KERNEL_KERNEL_DECL(LAUNCH_BOUND, UNROLL, DWORD, TEMPORAL_STORE),  \
+   GPU_KERNEL_KERNEL_DECL(LAUNCH_BOUND, UNROLL, DWORD, TEMPORAL_BOTH)}
+
+  int GetGpuKernelTemporalIdx(int temporalMode) {
+    if (temporalMode == TEMPORAL_NONE)  return 0;
+    if (temporalMode == TEMPORAL_LOAD)  return 1;
+    if (temporalMode == TEMPORAL_STORE) return 2;
+    if (temporalMode == TEMPORAL_BOTH)  return 3;
+    return -1;
+  }
 
+  // Must match mapping in GetGpuKernelWordsizeIdx
+  constexpr int KERN_WORDSIZES = 3;
 #define GPU_KERNEL_DWORD_DECL(LAUNCH_BOUND, UNROLL)           \
   {GPU_KERNEL_TEMPORAL_DECL(LAUNCH_BOUND, UNROLL, float),     \
    GPU_KERNEL_TEMPORAL_DECL(LAUNCH_BOUND, UNROLL, float2),    \
    GPU_KERNEL_TEMPORAL_DECL(LAUNCH_BOUND, UNROLL, float4)}
 
-#define GPU_KERNEL_UNROLL_DECL(LAUNCH_BOUND)    \
-  {GPU_KERNEL_DWORD_DECL(LAUNCH_BOUND, 1),      \
-   GPU_KERNEL_DWORD_DECL(LAUNCH_BOUND, 2),      \
-   GPU_KERNEL_DWORD_DECL(LAUNCH_BOUND, 3),      \
-   GPU_KERNEL_DWORD_DECL(LAUNCH_BOUND, 4),      \
-   GPU_KERNEL_DWORD_DECL(LAUNCH_BOUND, 5),      \
-   GPU_KERNEL_DWORD_DECL(LAUNCH_BOUND, 6),      \
-   GPU_KERNEL_DWORD_DECL(LAUNCH_BOUND, 7),      \
-   GPU_KERNEL_DWORD_DECL(LAUNCH_BOUND, 8)}
-
-  // Table of all GPU Reduction kernel functions (templated blocksize / unroll / dword size / temporal)
+  int GetGpuKernelWordsizeIdx(int wordsize) {
+    if (wordsize == 1) return 0;
+    if (wordsize == 2) return 1;
+    if (wordsize == 4) return 2;
+    return -1;
+  }
+
+  // Must match mapping in GetGpuKernelUnrollIdx
+  constexpr int KERN_UNROLLS = 10;
+#define GPU_KERNEL_UNROLL_DECL(LAUNCH_BOUND)    \
+  {GPU_KERNEL_DWORD_DECL(LAUNCH_BOUND, 1),      \
+   GPU_KERNEL_DWORD_DECL(LAUNCH_BOUND, 2),      \
+   GPU_KERNEL_DWORD_DECL(LAUNCH_BOUND, 3),      \
+   GPU_KERNEL_DWORD_DECL(LAUNCH_BOUND, 4),      \
+   GPU_KERNEL_DWORD_DECL(LAUNCH_BOUND, 5),      \
+   GPU_KERNEL_DWORD_DECL(LAUNCH_BOUND, 6),      \
+   GPU_KERNEL_DWORD_DECL(LAUNCH_BOUND, 7),      \
+   GPU_KERNEL_DWORD_DECL(LAUNCH_BOUND, 8),      \
+   GPU_KERNEL_DWORD_DECL(LAUNCH_BOUND, 16),     \
+   GPU_KERNEL_DWORD_DECL(LAUNCH_BOUND, 32)}
+
+  // Must match the unroll mapping in GPU_KERNEL_UNROLL_DECL
+  int GetGpuKernelUnrollIdx(int unroll) {
+    if (1 <= unroll && unroll <= 8) return unroll - 1;
+    if (unroll == 16) return 8;
+    if (unroll == 32) return 9;
+    return -1;
+  }
+
+  // Table of all GPU Reduction kernel functions (templated blocksize / unroll / dword size / temporal / kernel)
   typedef void (*GpuKernelFuncPtr)(SubExecParam*, int, int, int);
+  constexpr int KERN_BOUNDS = 4;
 #ifndef SINGLE_KERNEL
-  GpuKernelFuncPtr GpuKernelTable[4][MAX_UNROLL][3][4] =
+  GpuKernelFuncPtr GpuKernelsTable[KERN_BOUNDS][KERN_UNROLLS][KERN_WORDSIZES][KERN_TEMPORALS][NUM_GFX_KERNELS] =
   {
     GPU_KERNEL_UNROLL_DECL(256),
     GPU_KERNEL_UNROLL_DECL(512),
@@ -4358,65 +5113,83 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
 #undef GPU_KERNEL_UNROLL_DECL
 #undef GPU_KERNEL_DWORD_DECL
 #undef GPU_KERNEL_TEMPORAL_DECL
-  #undef GPU_KERNEL_SE_TYPE_DECL
+
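
Putting the index helpers together (including GetGpuKernelBlocksizeIdx, defined just below), a lookup for one hypothetical configuration resolves as follows; the concrete values are illustrative only:

    // Illustrative lookup into the 5-D kernel table:
    // blockSize=512, unroll=16, wordSize=4 (float4), temporal load-only, copy kernel.
    int b = GetGpuKernelBlocksizeIdx(512);           // (512+255)/256 - 1 == 1
    int u = GetGpuKernelUnrollIdx(16);               // 16 maps to slot 8
    int w = GetGpuKernelWordsizeIdx(4);              // float4 maps to slot 2
    int t = GetGpuKernelTemporalIdx(TEMPORAL_LOAD);  // slot 1
    GpuKernelFuncPtr k = GpuKernelsTable[b][u][w][t][GFX_KERNEL_COPY];
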
+  int GetGpuKernelBlocksizeIdx(int blocksize) {
+    return (blocksize + 255) / 256 - 1;
+  }
 
   // Execute a single GPU Transfer (when using 1 stream per Transfer)
   static ErrResult ExecuteGpuTransfer(int const iteration,
+                                      int const exeTotalSubExecs,
+                                      SubExecParam* exeSubExecParam,
                                       hipStream_t const stream,
                                       hipEvent_t const startEvent,
                                       hipEvent_t const stopEvent,
                                       int const xccDim,
                                       ConfigOptions const& cfg,
+                                      int const gfxKernelIdx,
                                       TransferResources& rss)
   {
-    auto cpuStart = std::chrono::high_resolution_clock::now();
-
-    int numSubExecs = rss.subExecParamCpu.size();
-    int gridY = CalculateGridY(cfg.gfx.seType, cfg.gfx.blockSize, numSubExecs);
-    dim3 const gridSize(xccDim, gridY, 1);
-    dim3 const blockSize(cfg.gfx.blockSize, 1);
+    // Determine which kernel to launch
+    int const blockSizeIdx = GetGpuKernelBlocksizeIdx(cfg.gfx.blockSize);
+    int const unrollIdx    = GetGpuKernelUnrollIdx(cfg.gfx.unrollFactor);
+    int const wordSizeIdx  = GetGpuKernelWordsizeIdx(cfg.gfx.wordSize);
+    int const temporalIdx  = GetGpuKernelTemporalIdx(cfg.gfx.temporalMode);
 
-    int wordSizeIdx = cfg.gfx.wordSize == 1 ? 0 :
-                      cfg.gfx.wordSize == 2 ? 1 :
-                                              2;
 #ifdef SINGLE_KERNEL
-    auto gpuKernel = GpuReduceKernel;
+    auto gpuKernel = GpuReduceKernel;
 #else
-    auto gpuKernel = GpuKernelTable[(cfg.gfx.blockSize+255)/256 - 1][cfg.gfx.unrollFactor - 1][wordSizeIdx][cfg.gfx.temporalMode];
+    auto gpuKernel = GpuKernelsTable[blockSizeIdx][unrollIdx][wordSizeIdx][temporalIdx][gfxKernelIdx];
 #endif
 
+    // Compute kernel launch parameters
+    int const numSubExecs = cfg.gfx.useMultiStream ? rss.subExecParamCpu.size() : exeTotalSubExecs;
+    int const gridY = CalculateGridY(cfg.gfx.seType, cfg.gfx.blockSize, numSubExecs);
+    dim3 const gridSize(xccDim, gridY, 1);
+    dim3 const blockSize(cfg.gfx.blockSize);
+
+    auto cpuStart = std::chrono::high_resolution_clock::now();
+
+    SubExecParam* params = cfg.gfx.useMultiStream ? rss.subExecParamGpuPtr : exeSubExecParam;
+
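+    // Aside: a minimal sketch of the HIP-event timing pattern used below
+    // (events here are illustrative locals; the real code reuses per-executor events):
+    //   hipEvent_t t0, t1;
+    //   ERR_CHECK(hipEventCreate(&t0));  ERR_CHECK(hipEventCreate(&t1));
+    //   ERR_CHECK(hipEventRecord(t0, stream));
+    //   /* enqueue kernel or copies on `stream` */
+    //   ERR_CHECK(hipEventRecord(t1, stream));
+    //   ERR_CHECK(hipStreamSynchronize(stream));
+    //   float msec;  ERR_CHECK(hipEventElapsedTime(&msec, t0, t1));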
 #if defined(__NVCC__)
-    if (startEvent != NULL)
+    if (cfg.gfx.useHipEvents)
       ERR_CHECK(hipEventRecord(startEvent, stream));
-    gpuKernel<<<gridSize, blockSize, 0, stream>>>(rss.subExecParamGpuPtr, cfg.gfx.seType, cfg.gfx.waveOrder, cfg.general.numSubIterations);
-    if (stopEvent != NULL)
+    gpuKernel<<<gridSize, blockSize, 0, stream>>>(params, cfg.gfx.seType, cfg.gfx.waveOrder, cfg.general.numSubIterations);
+    if (cfg.gfx.useHipEvents)
       ERR_CHECK(hipEventRecord(stopEvent, stream));
 #else
-    hipExtLaunchKernelGGL(gpuKernel, gridSize, blockSize, 0, stream, startEvent, stopEvent,
-                          0, rss.subExecParamGpuPtr, cfg.gfx.seType, cfg.gfx.waveOrder, cfg.general.numSubIterations);
+    hipExtLaunchKernelGGL(gpuKernel, gridSize, blockSize, 0, stream,
+                          cfg.gfx.useHipEvents ? startEvent : NULL,
+                          cfg.gfx.useHipEvents ? stopEvent : NULL, 0,
+                          params, cfg.gfx.seType, cfg.gfx.waveOrder, cfg.general.numSubIterations);
 #endif
     ERR_CHECK(hipStreamSynchronize(stream));
 
-    auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
-    double cpuDeltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0 / cfg.general.numSubIterations;
-
-    if (iteration >= 0) {
-      double deltaMsec = cpuDeltaMsec;
-      if (startEvent != NULL) {
-        float gpuDeltaMsec;
-        ERR_CHECK(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent));
-        deltaMsec = gpuDeltaMsec / cfg.general.numSubIterations;
-      }
-      rss.totalDurationMsec += deltaMsec;
-      if (cfg.general.recordPerIteration) {
-        rss.perIterMsec.push_back(deltaMsec);
-        std::set<std::pair<int, int>> CUs;
-        for (int i = 0; i < numSubExecs; i++) {
-          CUs.insert(std::make_pair(rss.subExecParamGpuPtr[i].xccId,
-                                    GetId(rss.subExecParamGpuPtr[i].hwId)));
-        }
-        rss.perIterCUs.push_back(CUs);
-      }
-    }
+    // Record this timing if this Transfer is being run in multistream mode
+    if (cfg.gfx.useMultiStream) {
+      if (iteration >= 0) {
+        auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
+        double cpuDeltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0 / cfg.general.numSubIterations;
+
+        double deltaMsec = cpuDeltaMsec;
+        if (startEvent != NULL) {
+          float gpuDeltaMsec;
+          ERR_CHECK(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent));
+          deltaMsec = gpuDeltaMsec / cfg.general.numSubIterations;
+        }
+
+        rss.totalDurationMsec += deltaMsec;
+        if (cfg.general.recordPerIteration) {
+          rss.perIterMsec.push_back(deltaMsec);
+          std::set<std::pair<int, int>> CUs;
+          for (int i = 0; i < numSubExecs; i++) {
+            CUs.insert(std::make_pair(rss.subExecParamGpuPtr[i].xccId,
+                                      GetId(rss.subExecParamGpuPtr[i].hwId)));
+          }
+          rss.perIterCUs.push_back(CUs);
+        }
+      }
+    }
     return ERR_NONE;
@@ -4434,72 +5207,56 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
     int xccDim = exeInfo.useSubIndices ? exeInfo.numSubIndices : 1;
 
     if (cfg.gfx.useMultiStream) {
-      // Launch each Transfer separately in its own stream
+      // Launch one thread per Transfer in separate streams
       vector<std::future<ErrResult>> asyncTransfers;
       for (int i = 0; i < exeInfo.streams.size(); i++) {
         asyncTransfers.emplace_back(std::async(std::launch::async,
                                                ExecuteGpuTransfer,
                                                iteration,
+                                               exeInfo.totalSubExecs,
+                                               exeInfo.subExecParamGpu,
                                                exeInfo.streams[i],
                                                cfg.gfx.useHipEvents ? exeInfo.startEvents[i] : NULL,
                                                cfg.gfx.useHipEvents ? exeInfo.stopEvents[i] : NULL,
                                                xccDim,
                                                std::cref(cfg),
+                                               exeInfo.gfxKernelToUse,
                                                std::ref(exeInfo.resources[i])));
       }
       for (auto& asyncTransfer : asyncTransfers)
         ERR_CHECK(asyncTransfer.get());
     } else {
-      // Combine all the Transfers into a single kernel launch
-      int numSubExecs = exeInfo.totalSubExecs;
-      int gridY = CalculateGridY(cfg.gfx.seType, cfg.gfx.blockSize, numSubExecs);
-      dim3 const gridSize(xccDim, gridY, 1);
-      dim3 const blockSize(cfg.gfx.blockSize, 1);
-      hipStream_t stream = exeInfo.streams[0];
-
-      int wordSizeIdx = cfg.gfx.wordSize == 1 ? 0 :
-                        cfg.gfx.wordSize == 2 ? 1 :
-                                                2;
-#ifdef SINGLE_KERNEL
-      auto gpuKernel = GpuReduceKernel;
-#else
-      auto gpuKernel = GpuKernelTable[(cfg.gfx.blockSize+255)/256 - 1][cfg.gfx.unrollFactor - 1][wordSizeIdx][cfg.gfx.temporalMode];
-#endif
-
-#if defined(__NVCC__)
-      if (cfg.gfx.useHipEvents)
-        ERR_CHECK(hipEventRecord(exeInfo.startEvents[0], stream));
-      gpuKernel<<<gridSize, blockSize, 0, stream>>>(exeInfo.subExecParamGpu, cfg.gfx.seType, cfg.gfx.waveOrder, cfg.general.numSubIterations);
-      if (cfg.gfx.useHipEvents)
-        ERR_CHECK(hipEventRecord(exeInfo.stopEvents[0], stream));
-#else
-      hipExtLaunchKernelGGL(gpuKernel, gridSize, blockSize, 0, stream,
-                            cfg.gfx.useHipEvents ? exeInfo.startEvents[0] : NULL,
-                            cfg.gfx.useHipEvents ? exeInfo.stopEvents[0] : NULL, 0,
-                            exeInfo.subExecParamGpu, cfg.gfx.seType, cfg.gfx.waveOrder, cfg.general.numSubIterations);
-#endif
-      ERR_CHECK(hipStreamSynchronize(stream));
+      // Launch all Transfers in one kernel launch (avoid extra thread creation)
+      ExecuteGpuTransfer(iteration, exeInfo.totalSubExecs, exeInfo.subExecParamGpu, exeInfo.streams[0],
+                         cfg.gfx.useHipEvents ? exeInfo.startEvents[0] : NULL,
+                         cfg.gfx.useHipEvents ? exeInfo.stopEvents[0] : NULL,
+                         xccDim, cfg, exeInfo.gfxKernelToUse, exeInfo.resources[0]);
     }
+
     auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
-    double cpuDeltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0
-                          / cfg.general.numSubIterations;
 
     if (iteration >= 0) {
+      // Determine executor timing
+      // - Use HIP event timing if enabled and not using multi-stream
+      // - Otherwise, use CPU timing
       if (cfg.gfx.useHipEvents && !cfg.gfx.useMultiStream) {
         float gpuDeltaMsec;
         ERR_CHECK(hipEventElapsedTime(&gpuDeltaMsec, exeInfo.startEvents[0], exeInfo.stopEvents[0]));
         gpuDeltaMsec /= cfg.general.numSubIterations;
         exeInfo.totalDurationMsec += gpuDeltaMsec;
       } else {
+        double cpuDeltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0
                              / cfg.general.numSubIterations;
         exeInfo.totalDurationMsec += cpuDeltaMsec;
       }
 
+      // If Transfers were combined into a single launch, figure out per-Transfer timing
       // Determine timing for each of the individual transfers that were part of this launch
       if (!cfg.gfx.useMultiStream) {
         for (int i = 0; i < exeInfo.resources.size(); i++) {
           TransferResources& rss = exeInfo.resources[i];
-          long long minStartCycle = std::numeric_limits<long long>::max();
-          long long maxStopCycle  = std::numeric_limits<long long>::min();
+          int64_t minStartCycle = std::numeric_limits<int64_t>::max();
+          int64_t maxStopCycle  = std::numeric_limits<int64_t>::min();
           std::set<std::pair<int, int>> CUs;
 
           for (auto subExecIdx : rss.subExecIdx) {
@@ -4510,6 +5267,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
                                         GetId(exeInfo.subExecParamGpu[subExecIdx].hwId)));
             }
           }
+
           double deltaMsec = (maxStopCycle - minStartCycle) / (double)(exeInfo.wallClockRate);
           deltaMsec /= cfg.general.numSubIterations;
           rss.totalDurationMsec += deltaMsec;
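
From these per-Transfer durations the client derives bandwidth figures. The conversion itself is straightforward (a sketch; the exact reporting and rounding live in the client code, not in this header):

    // Sketch: convert a per-iteration duration in milliseconds to GB/s.
    double BandwidthGBs(size_t numBytes, double deltaMsec)
    {
      return (numBytes / 1.0e9) / (deltaMsec / 1.0e3);
    }
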
@@ -4529,6 +5287,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
   // Execute a single DMA Transfer
   static ErrResult ExecuteDmaTransfer(int const iteration,
                                       bool const useSubIndices,
+                                      int const exeIndex,
                                       hipStream_t const stream,
                                       hipEvent_t const startEvent,
                                       hipEvent_t const stopEvent,
@@ -4537,15 +5296,31 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
   {
     auto cpuStart = std::chrono::high_resolution_clock::now();
 
+    int numDsts = (int)resources.dstMem.size();
+    ERR_CHECK(hipSetDevice(exeIndex));
     int subIterations = 0;
     if (!useSubIndices && !cfg.dma.useHsaCopy) {
       if (cfg.dma.useHipEvents)
         ERR_CHECK(hipEventRecord(startEvent, stream));
 
-      // Use hipMemcpy
+      // Force the use of the SDMA engine if possible
+#if defined(__HIP_PLATFORM_AMD__) && defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR >= 6)
+      hipMemcpyKind memcpyKind = hipMemcpyDeviceToDeviceNoCU;
+#else
+      hipMemcpyKind memcpyKind = hipMemcpyDefault;
+#endif
+
+      // Use DMA copy engine
       do {
-        ERR_CHECK(hipMemcpyAsync(resources.dstMem[0], resources.srcMem[0], resources.numBytes,
-                                 hipMemcpyDefault, stream));
+        // Queue for each output location
+        for (int dstIdx = 0; dstIdx < numDsts; dstIdx++) {
+#if defined(CUMEM_ENABLED)
+          ERR_CHECK(cuMemcpyAsync((CUdeviceptr)resources.dstMem[dstIdx],
+                                  (CUdeviceptr)resources.srcMem[0],
+                                  resources.numBytes, stream));
+#else
+          ERR_CHECK(hipMemcpyAsync(resources.dstMem[dstIdx], resources.srcMem[0], resources.numBytes,
+                                   memcpyKind, stream));
+#endif
+        }
       } while (++subIterations != cfg.general.numSubIterations);
 
       if (cfg.dma.useHipEvents)
@@ -4557,20 +5332,22 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
 #else
       // Use HSA async copy
       do {
-        hsa_signal_store_screlease(resources.signal, 1);
-        if (!useSubIndices) {
-          ERR_CHECK(hsa_amd_memory_async_copy(resources.dstMem[0], resources.dstAgent,
-                                              resources.srcMem[0], resources.srcAgent,
-                                              resources.numBytes, 0, NULL,
-                                              resources.signal));
-        } else {
-          HSA_CALL(hsa_amd_memory_async_copy_on_engine(resources.dstMem[0], resources.dstAgent,
-                                                       resources.srcMem[0], resources.srcAgent,
-                                                       resources.numBytes, 0, NULL,
-                                                       resources.signal,
-                                                       resources.sdmaEngineId, true));
+        hsa_signal_store_screlease(resources.signal, numDsts);
+        for (int dstIdx = 0; dstIdx < numDsts; dstIdx++) {
+          if (!useSubIndices) {
+            ERR_CHECK(hsa_amd_memory_async_copy(resources.dstMem[dstIdx], resources.dstAgent[dstIdx],
+                                                resources.srcMem[0], resources.srcAgent,
+                                                resources.numBytes, 0, NULL,
+                                                resources.signal));
+          } else {
+            HSA_CALL(hsa_amd_memory_async_copy_on_engine(resources.dstMem[dstIdx], resources.dstAgent[dstIdx],
+                                                         resources.srcMem[0], resources.srcAgent,
                                                         resources.numBytes, 0, NULL,
+                                                         resources.signal,
+                                                         resources.sdmaEngineId, true));
+          }
         }
-        // Wait for SDMA transfer to complete
+        // Wait for SDMA transfer(s) to complete
         while(hsa_signal_wait_scacquire(resources.signal,
                                         HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX,
                                         HSA_WAIT_STATE_ACTIVE) >= 1);
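
The HSA path above counts completions with a single signal: it is initialized to numDsts, each async copy decrements it by one on completion, and the host waits until it drops below 1. In isolation the pattern looks like this (a sketch; the signal, agents, and buffers are assumed to have been created elsewhere):

    // Sketch: one HSA signal tracking N asynchronous copies.
    hsa_signal_store_screlease(signal, numCopies);           // value = outstanding copies
    for (int i = 0; i < numCopies; i++)
      hsa_amd_memory_async_copy(dst[i], dstAgent[i], src, srcAgent,
                                numBytes, 0, NULL, signal);  // each completion decrements
    while (hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, 1,
                                     UINT64_MAX, HSA_WAIT_STATE_ACTIVE) >= 1);
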
@@ -4609,6 +5386,93 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
                                              ExecuteDmaTransfer,
                                              iteration,
                                              exeInfo.useSubIndices,
+                                             exeIndex,
+                                             exeInfo.streams[i],
+                                             cfg.dma.useHipEvents ? exeInfo.startEvents[i] : NULL,
+                                             cfg.dma.useHipEvents ? exeInfo.stopEvents[i] : NULL,
+                                             std::cref(cfg),
+                                             std::ref(exeInfo.resources[i])));
+    }
+
+    for (auto& asyncTransfer : asyncTransfers)
+      ERR_CHECK(asyncTransfer.get());
+
+    auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
+    double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0 / cfg.general.numSubIterations;
+    if (iteration >= 0)
+      exeInfo.totalDurationMsec += deltaMsec;
+    return ERR_NONE;
+  }
+
+// BMA Executor-related functions
+//========================================================================================
+#ifdef BMA_EXEC_ENABLED
+  // Execute a single BMA Transfer (one hipMemcpyBatchAsync per sub-iteration; each subexecutor is one batch entry)
+  static ErrResult ExecuteBatchDmaTransfer(int const iteration,
+                                           int const exeIndex,
+                                           hipStream_t const stream,
+                                           hipEvent_t const startEvent,
+                                           hipEvent_t const stopEvent,
+                                           ConfigOptions const& cfg,
+                                           TransferResources& resources)
+  {
+    auto cpuStart = std::chrono::high_resolution_clock::now();
+
+    ERR_CHECK(hipSetDevice(exeIndex));
+
+    int subIterations = 0;
+    if (cfg.dma.useHipEvents)
+      ERR_CHECK(hipEventRecord(startEvent, stream));
+
+    [[maybe_unused]] size_t failIdx = 0;
+    do {
+      ERR_CHECK(hipMemcpyBatchAsync(resources.batchDsts.data(),
+                                    resources.batchSrcs.data(),
+                                    resources.batchBytes.data(),
+                                    resources.batchDsts.size(),
+                                    nullptr, nullptr, 0,
+                                    // In CUDA 13.0 the failIdx argument was removed from the original CUDA 12.8 API call
+#if !defined(__NVCC__) || (defined(CUDA_VERSION) && (CUDA_VERSION < 13000))
+                                    &failIdx,
+#endif
+                                    stream));
+    } while (++subIterations != cfg.general.numSubIterations);
+
+    if (cfg.dma.useHipEvents)
+      ERR_CHECK(hipEventRecord(stopEvent, stream));
+    ERR_CHECK(hipStreamSynchronize(stream));
+
+    auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
+    double cpuDeltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0 / cfg.general.numSubIterations;
+
+    if (iteration >= 0) {
+      double deltaMsec = cpuDeltaMsec;
+      if (cfg.dma.useHipEvents) {
+        float gpuDeltaMsec;
+        ERR_CHECK(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent));
+        deltaMsec = gpuDeltaMsec / cfg.general.numSubIterations;
+      }
+      resources.totalDurationMsec += deltaMsec;
+      if (cfg.general.recordPerIteration)
+        resources.perIterMsec.push_back(deltaMsec);
+    }
+    return ERR_NONE;
+  }
+
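
hipMemcpyBatchAsync gathers many independent copies into one submission. A usage sketch matching the call shape above (the buffer vectors are hypothetical, and the failIdx parameter follows the pre-CUDA-13 signature used here):

    // Sketch: batch several independent copies in a single call.
    // dsts/srcs/sizes are assumed to hold valid pointers and byte counts.
    size_t failIdx = 0;
    ERR_CHECK(hipMemcpyBatchAsync(dsts.data(), srcs.data(), sizes.data(), dsts.size(),
                                  nullptr, nullptr, 0,  // no per-copy attributes
                                  &failIdx,             // index of first failing copy, if any
                                  stream));
    ERR_CHECK(hipStreamSynchronize(stream));
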
+  static ErrResult RunBmaExecutor(int const iteration,
+                                  ConfigOptions const& cfg,
+                                  int const exeIndex,
+                                  ExeInfo& exeInfo)
+  {
+    auto cpuStart = std::chrono::high_resolution_clock::now();
+    ERR_CHECK(hipSetDevice(exeIndex));
+
+    vector<std::future<ErrResult>> asyncTransfers;
+    for (int i = 0; i < exeInfo.resources.size(); i++) {
+      asyncTransfers.emplace_back(std::async(std::launch::async,
+                                             ExecuteBatchDmaTransfer,
+                                             iteration,
+                                             exeIndex,
+                                             exeInfo.streams[i],
+                                             cfg.dma.useHipEvents ? exeInfo.startEvents[i] : NULL,
+                                             cfg.dma.useHipEvents ? exeInfo.stopEvents[i] : NULL,
@@ -4625,6 +5489,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
     exeInfo.totalDurationMsec += deltaMsec;
     return ERR_NONE;
   }
+#endif // BMA_EXEC_ENABLED
 
 // Executor-related functions
 //========================================================================================
@@ -4634,16 +5499,32 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
                                ExeInfo& exeInfo)
   {
     switch (exeDevice.exeType) {
-    case EXE_CPU:      return RunCpuExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo);
-    case EXE_GPU_GFX:  return RunGpuExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo);
-    case EXE_GPU_DMA:  return RunDmaExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo);
+    case EXE_CPU:      return RunCpuExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo);
+    case EXE_GPU_GFX:  return RunGpuExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo);
+    case EXE_GPU_DMA:  return RunDmaExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo);
 #ifdef NIC_EXEC_ENABLED
-    case EXE_NIC:      return RunNicExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo);
+    case EXE_NIC:      return RunNicExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo);
+#endif
+#ifdef BMA_EXEC_ENABLED
+    case EXE_GPU_BDMA: return RunBmaExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo);
 #endif
-    default:           return {ERR_FATAL, "Unsupported executor (%d)", exeDevice.exeType};
+    default:           return {ERR_FATAL, "Unsupported executor (%d)", exeDevice.exeType};
     }
   }
 
+#if defined(__NVCC__)
+  static bool MnnvlCheck() {
+    int flag = 0;
+#ifdef POD_COMM_ENABLED
+    CUresult err = cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, 0);
+    if (err || !flag) return false;
+    err = cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, 0);
+#endif
+    if (!flag) return false;
+    return true;
+  }
+#endif
+
 } // End of anonymous namespace
 //========================================================================================
 /// @endcond
@@ -4657,7 +5538,11 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
       this->errMsg  = "";
     } else {
       this->errType = ERR_FATAL;
+#if defined(__NVCC__)
+      this->errMsg  = std::string("CUDA Runtime Error: ") + hipGetErrorString(err);
+#else
       this->errMsg  = std::string("HIP Error: ") + hipGetErrorString(err);
+#endif
     }
   }
 
@@ -4674,6 +5559,21 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
       this->errMsg  = std::string("HSA Error: ") + errString;
     }
   }
+#elif defined(CUMEM_ENABLED)
+  ErrResult::ErrResult(CUresult err)
+  {
+    if (err == CUDA_SUCCESS) {
+      this->errType = ERR_NONE;
+      this->errMsg  = "";
+    } else {
+      const char *errString = NULL, *errName = NULL;
+      cuGetErrorName(err, &errName);
+      cuGetErrorString(err, &errString);
+      this->errType = ERR_FATAL;
+      this->errMsg  = std::string("CUDA Driver Error: ") + errName +
+                      " (" + errString + ")";
+    }
+  }
 #endif
 
   ErrResult::ErrResult(ErrType errType, const char* format, ...)
@@ -4715,6 +5615,9 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
       return false;
     }
 
+    // Log transfers (if requested)
+    System::Get().LogTransfers(transfers);
+
     // Collect up transfers by executor
     int minNumSrcs = MAX_SRCS + 1;
     int maxNumSrcs = 0;
@@ -4756,6 +5659,11 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
       if (exeDevice.exeRank == localRank) {
        localExecutors.push_back(exeDevice);
       }
+
+      // Select which GFX kernel to use for this executor
+      if (exeDevice.exeType == EXE_GPU_GFX) {
+        ERR_APPEND(SelectGfxKernel(cfg, transfers, exeInfo), errResults);
+      }
     }
 
     // Prepare reference src/dst arrays - only once for largest size
@@ -4782,6 +5690,9 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
       Transfer const& t = transfers[resource->transferIdx];
       for (int srcIdx = 0; srcIdx < resource->srcMem.size(); srcIdx++) {
         if (t.srcs[srcIdx].memRank == localRank) {
+          if (IsGpuMemType(t.srcs[srcIdx].memType)) {
+            ERR_APPEND(hipSetDevice(t.srcs[srcIdx].memIndex), errResults);
+          }
           ERR_APPEND(hipMemcpy(resource->srcMem[srcIdx] + initOffset,
                                srcReference[srcIdx].data(), resource->numBytes, hipMemcpyDefault), errResults);
         }
@@ -4792,22 +5703,22 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
     // Pause before starting when running in interactive mode
     if (cfg.general.useInteractive) {
       if (localRank == 0) {
-        printf("Memory prepared:\n");
+        System::Get().Log("Memory prepared:\n");
 
         for (int i = 0; i < transfers.size(); i++) {
-          printf("Transfer %03d:\n", i);
+          System::Get().Log("Transfer %03d:\n", i);
           for (int iSrc = 0; iSrc < transfers[i].srcs.size(); ++iSrc)
-            printf("  SRC %0d: %p\n", iSrc, transferResources[i]->srcMem[iSrc]);
+            System::Get().Log("  SRC %0d: %p\n", iSrc, transferResources[i]->srcMem[iSrc]);
           for (int iDst = 0; iDst < transfers[i].dsts.size(); ++iDst)
-            printf("  DST %0d: %p\n", iDst, transferResources[i]->dstMem[iDst]);
+            System::Get().Log("  DST %0d: %p\n", iDst, transferResources[i]->dstMem[iDst]);
         }
-        printf("Hit <Enter> to continue: ");
+        System::Get().Log("Hit <Enter> to continue: ");
         fflush(stdout);
         if (scanf("%*c") != 0) {
-          printf("[ERROR] Unexpected input\n");
+          System::Get().Log("[ERROR] Unexpected input\n");
           exit(1);
         }
-        printf("\n");
+        System::Get().Log("\n");
       }
       System::Get().Barrier();
     }
@@ -4866,12 +5777,12 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
     // Pause for interactive mode
     if (cfg.general.useInteractive) {
       if (localRank == 0) {
-        printf("Transfers complete. Hit <Enter> to continue: ");
+        System::Get().Log("Transfers complete. Hit <Enter> to continue: ");
         if (scanf("%*c") != 0) {
-          printf("[ERROR] Unexpected input\n");
+          System::Get().Log("[ERROR] Unexpected input\n");
           exit(1);
         }
-        printf("\n");
+        System::Get().Log("\n");
         fflush(stdout);
       }
       System::Get().Barrier();
@@ -4957,15 +5868,6 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
     return true;
   }
 
-  int GetIntAttribute(IntAttribute attribute)
-  {
-    switch (attribute) {
-    case ATR_GFX_MAX_BLOCKSIZE: return MAX_BLOCKSIZE;
-    case ATR_GFX_MAX_UNROLL:    return MAX_UNROLL;
-    default:                    return -1;
-    }
-  }
-
   std::string GetStrAttribute(StrAttribute attribute)
   {
     switch (attribute) {
@@ -5013,14 +5915,14 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
       }
       // At this point, there should be only 1 (valid) rank assigned to this SRC
       if (wc.mem[isDst][iMem].memRanks.size() != 1 || wc.mem[isDst][iMem].memRanks[0] < 0) {
-        printf("[ERROR] Unexpected number of ranks / invalid number of ranks for %s %d\n", isDst ? "DST" : "SRC", iMem);
+        System::Get().Log("[ERROR] Unexpected number of ranks / invalid number of ranks for %s %d\n", isDst ? "DST" : "SRC", iMem);
"DST" : "SRC", iMem); + System::Get().Log("[ERROR] Unexpected number of ranks / invalid number of ranks for %s %d\n", isDst ? "DST" : "SRC", iMem); exit(1); } // Resolve mem index wildcards // Mem devices should have at least one index if (wc.mem[isDst][iMem].memIndices.size() == 0) { - printf("[ERROR] MemIndex for %s %d cannot be empty\n", isDst ? "DST" : "SRC", iMem); + System::Get().Log("[ERROR] MemIndex for %s %d cannot be empty\n", isDst ? "DST" : "SRC", iMem); exit(1); } @@ -5109,13 +6011,13 @@ static bool IsConfiguredGid(union ibv_gid const& gid) wc.exe.exeRanks.swap(exeRanks); return result; } else if (wc.exe.exeRanks[0] == -1) { - printf("[ERROR] Exe rank should not be -1\n"); + System::Get().Log("[ERROR] Exe rank should not be -1\n"); exit(1); } // Resolve EXE indices if (wc.exe.exeIndices.size() == 0) { - printf("[ERROR] Exe index should never be empty\n"); + System::Get().Log("[ERROR] Exe index should never be empty\n"); exit(1); } else if (wc.exe.exeIndices.size() > 1) { // Loop over user provided indices @@ -5179,7 +6081,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) wc.exe.exeSubIndices.clear(); return result; } else if (wc.exe.exeType == EXE_NIC) { - printf("[ERROR] NIC executor requires a subindex be specified\n"); + System::Get().Log("[ERROR] NIC executor requires a subindex be specified\n"); exit(1); } else if (wc.exe.exeType == EXE_NIC_NEAREST) { // Assign NIC closest to DST mem @@ -5213,7 +6115,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) result |= RecursiveWildcardTransferExpansion(wc, baseRankIndex, numBytes, numSubExecs, transfers); wc.exe.exeSubIndices[0] = -2; return result; - case EXE_GPU_GFX: case EXE_GPU_DMA: + case EXE_GPU_GFX: case EXE_GPU_DMA: case EXE_GPU_BDMA: { // Iterate over all available subindices ExeDevice exeDevice = {wc.exe.exeType, wc.exe.exeIndices[0], wc.exe.exeRanks[0], 0}; @@ -5381,14 +6283,38 @@ static bool IsConfiguredGid(union ibv_gid const& gid) System::System() : rank(0), numRanks(1), commMode(COMM_NONE) { + // Collect env vars + // TB_VERBOSE = enables extra logging + // TB_SINGLE_LOG = Only rank 0 will produce output (useful if spawning multi-node socket) + // TB_DUMP_CFG_FILE = Config file to dump executed Transfers + // TB_PAUSE = Insert a pause for debug attachment + verbose = getenv("TB_VERBOSE") ? atoi(getenv("TB_VERBOSE")) : 0; + bool singleLog = getenv("TB_SINGLE_LOG") ? atoi(getenv("TB_SINGLE_LOG")) : 0; + + char* dumpCfgFilename = getenv("TB_DUMP_CFG_FILE"); + if (dumpCfgFilename) { + dumpCfgFile = fopen(dumpCfgFilename, "w"); + } if (getenv("TB_PAUSE")) { - printf("Pausing for debug attachment\n"); + Log("Pausing for debug attachment (e.g. 
+#ifdef AMD_SMI_ENABLED
+    if (verbose) {
+      Log("[INFO] Initializing AMD System Management Interface Library (AMDSMI)\n");
+    }
+    amdsmi_init(AMDSMI_INIT_AMD_APUS);
+#elif defined (NVML_ENABLED)
+    if (verbose) {
+      Log("[INFO] Initializing NVIDIA Management Library (NVML)\n");
+    }
+    nvmlInit_v2();
+#endif
+
     // Priority 1: Socket communicator
     SetupSocketCommunicator();
 
@@ -5397,8 +6323,12 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
       SetupMpiCommunicator();
     }
 
+    // Establish which ranks will output when logging
+    if (rank > 0 && (commMode == COMM_MPI || singleLog))
+      rankDoesOutput = false;
+
     if (verbose && commMode == COMM_NONE) {
-      printf("[INFO] Running in single node mode\n");
+      Log("[INFO] Running in single node mode\n");
     }
 
     // Collect topology and distribute across all ranks
@@ -5428,8 +6358,115 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
       listenSocket = -1;
     }
   }
+
+    if (dumpCfgFile) {
+      fclose(dumpCfgFile);
+    }
+
+#ifdef AMD_SMI_ENABLED
+    amdsmi_shut_down();
+#elif defined(NVML_ENABLED)
+    nvmlShutdown();
+#endif
+  }
+
+  namespace detail {
+
+  inline std::string FormatIpv4(struct in_addr const& addr)
+  {
+    char buf[INET_ADDRSTRLEN];
+    if (inet_ntop(AF_INET, &addr, buf, sizeof(buf)))
+      return std::string(buf);
+    return std::string();
+  }
+
+  inline bool IsUsableIpv4(sockaddr_in const* sin)
+  {
+    if (!sin || sin->sin_family != AF_INET)
+      return false;
+    uint32_t a = ntohl(sin->sin_addr.s_addr);
+    if (a == INADDR_ANY || a == INADDR_NONE)
+      return false;
+    if ((a >> 24) == 127)
+      return false;
+    return true;
+  }
+
+  // IPv4 to advertise when TB_MASTER_ADDR is unset on rank 0 (after listen).
+  inline std::string DetectPrimaryIpv4(char const* preferredIface)
+  {
+    ifaddrs* ifap = nullptr;
+    if (getifaddrs(&ifap) != 0)
+      return std::string();
+
+    auto tryPick = [&](bool allowLinkLocal) -> std::string {
+      for (ifaddrs* ifa = ifap; ifa; ifa = ifa->ifa_next) {
+        if (!ifa->ifa_addr || ifa->ifa_addr->sa_family != AF_INET)
+          continue;
+        if (ifa->ifa_flags & IFF_LOOPBACK)
+          continue;
+        if (!(ifa->ifa_flags & IFF_UP))
+          continue;
+        auto* sin = reinterpret_cast<sockaddr_in const*>(ifa->ifa_addr);
+        if (!IsUsableIpv4(sin))
+          continue;
+        if (preferredIface && preferredIface[0]) {
+          if (!ifa->ifa_name || strcmp(ifa->ifa_name, preferredIface) != 0)
+            continue;
+        } else {
+          uint32_t a = ntohl(sin->sin_addr.s_addr);
+          if (!allowLinkLocal && (a & 0xffff0000) == 0xa9fe0000)
+            continue;
+        }
+        return FormatIpv4(sin->sin_addr);
+      }
+      return std::string();
+    };
+
+    std::string chosen;
+    if (preferredIface && preferredIface[0]) {
+      chosen = tryPick(true);
+      freeifaddrs(ifap);
+      return chosen;
+    }
+
+    chosen = tryPick(false);
+    if (chosen.empty())
+      chosen = tryPick(true);
+    freeifaddrs(ifap);
+    return chosen;
+  }
+
+  inline bool ResolveMasterAddrV4(char const* host, int port, sockaddr_in* out, char const** gaiErr)
+  {
+    *gaiErr = nullptr;
+    if (!host || !host[0] || !out)
+      return false;
+    char portBuf[16];
+    snprintf(portBuf, sizeof(portBuf), "%d", port);
+    addrinfo hints;
+    memset(&hints, 0, sizeof(hints));
+    hints.ai_family   = AF_INET;
+    hints.ai_socktype = SOCK_STREAM;
+    addrinfo* res = nullptr;
+    int gai = getaddrinfo(host, portBuf, &hints, &res);
+    if (gai != 0) {
+      *gaiErr = gai_strerror(gai);
+      return false;
+    }
+    for (addrinfo* p = res; p; p = p->ai_next) {
+      if (p->ai_family == AF_INET && p->ai_addrlen >= sizeof(sockaddr_in)) {
+        memcpy(out, p->ai_addr, sizeof(sockaddr_in));
+        freeaddrinfo(res);
+        return true;
+      }
    }
    freeaddrinfo(res);
    return false;
  }

  } // namespace detail

   void System::SetupSocketCommunicator()
   {
     char* rankStr       = getenv("TB_RANK");
@@ -5437,21 +6474,32 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
     char* masterAddrStr = getenv("TB_MASTER_ADDR");
     char* masterPortStr = getenv("TB_MASTER_PORT");
 
-    // Socket communicator requires rank / numRanks / masterAddr
-    if (!rankStr || !numRanksStr || !masterAddrStr) {
+    if (!numRanksStr) {
       if (verbose) {
-        printf("[INFO] SocketCommunicator skipped due to missing TB_RANK | TB_NUM_RANKS | TB_MASTER_ADDR\n");
+        Log("[INFO] SocketCommunicator skipped (TB_NUM_RANKS not set)\n");
       }
       return;
     }
 
-    rank       = atoi(rankStr);
-    numRanks   = atoi(numRanksStr);
-    masterAddr = masterAddrStr;
+    numRanks = atoi(numRanksStr);
+    if (numRanks < 2) {
+      if (verbose) {
+        Log("[INFO] SocketCommunicator skipped (TB_NUM_RANKS=%d requires at least 2 for socket mode)\n", numRanks);
+      }
+      return;
+    }
+
+    rank       = (rankStr && rankStr[0]) ? atoi(rankStr) : 0;
+    masterAddr = masterAddrStr ? std::string(masterAddrStr) : std::string();
     masterPort = masterPortStr ? atoi(masterPortStr) : 29500;
 
+    if (rank != 0 && masterAddr.empty()) {
+      Log("[ERROR] TB_MASTER_ADDR is required when TB_RANK is greater than 0 (socket communicator)\n");
+      exit(1);
+    }
+
     if (rank < 0 || rank >= numRanks) {
-      printf("[ERROR] Invalid rank index. Must be between 0 and %d (not %d)\n", numRanks - 1, rank);
+      Log("[ERROR] Invalid rank index. Must be between 0 and %d (not %d)\n", numRanks - 1, rank);
       exit(1);
     }
 
@@ -5463,7 +6511,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
       // Create listening socket
       listenSocket = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
       if (listenSocket == -1) {
-        printf("[ERROR] Unable to create listener socket\n");
+        Log("[ERROR] Unable to create listener socket\n");
         exit(1);
       }
 
@@ -5478,17 +6526,36 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
       serverAddr.sin_port = htons(masterPort);
 
       if (bind(listenSocket, (sockaddr*)&serverAddr, sizeof(serverAddr)) == -1) {
-        printf("[ERROR] Failed to bind listen socket\n");
+        Log("[ERROR] Failed to bind listen socket\n");
         exit(1);
       }
 
       if (listen(listenSocket, numRanks) == -1) {
-        printf("[ERROR] Failed to listen on socket\n");
+        Log("[ERROR] Failed to listen on socket\n");
         exit(1);
       }
 
-      // Accept connections from other ranks
-      printf("Waiting for connections from %d other ranks [listening on TB_MASTER_ADDR=%s TB_MASTER_PORT=%d]\n",
-             numRanks-1, masterAddr.c_str(), masterPort);
+      if (masterAddr.empty()) {
+        char const* ifaceEnv = getenv("TB_MASTER_IFACE");
+        masterAddr = detail::DetectPrimaryIpv4(ifaceEnv);
+        if (masterAddr.empty()) {
+          Log("[ERROR] TB_MASTER_ADDR not set and could not detect a primary IPv4 for workers");
+          if (ifaceEnv && ifaceEnv[0])
+            Log(" (check TB_MASTER_IFACE=%s)\n", ifaceEnv);
+          else
+            Log(" (set TB_MASTER_ADDR or TB_MASTER_IFACE)\n");
+          exit(1);
+        }
+        Log("[INFO] TB_MASTER_ADDR not set; using detected IPv4 %s\n", masterAddr.c_str());
+      }
+
+      Log("[INFO] Socket rank 0: on each other host set TB_RANK to a unique value in 1..%d, then for example:\n",
+          numRanks - 1);
+      Log("       TB_NUM_RANKS=%d TB_MASTER_ADDR=%s TB_MASTER_PORT=%d TB_RANK=1\n",
+          numRanks, masterAddr.c_str(), masterPort);
+
+      Log("[INFO] Waiting for connections from %d other rank(s) [TB_MASTER_ADDR=%s TB_MASTER_PORT=%d]\n",
+          numRanks - 1, masterAddr.c_str(), masterPort);
 
       for (int i = 1; i < numRanks; i++) {
         sockaddr_in clientAddr;
@@ -5496,7 +6563,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
accept(listenSocket, (sockaddr*)&clientAddr, &clientAddrLen); if (clientSocket == -1) { - printf("[ERROR] Failed to accept connection from rank %d\n", i); + Log("[ERROR] Failed to accept connection from rank %d\n", i); exit(1); } @@ -5506,11 +6573,11 @@ static bool IsConfiguredGid(union ibv_gid const& gid) if (clientRank < 0 || clientRank >= numRanks) { close(clientSocket); - printf("[ERROR] Invalid rank received: %d\n", clientRank); + Log("[ERROR] Invalid rank received: %d\n", clientRank); exit(1); } if (verbose) { - printf("[INFO] Rank 0 accepted connection from rank %d\n", clientRank); + Log("[INFO] Rank 0 accepted connection from rank %d\n", clientRank); } sockets[clientRank] = clientSocket; } @@ -5518,32 +6585,41 @@ static bool IsConfiguredGid(union ibv_gid const& gid) // All other ranks connect to rank 0 int sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); if (sock == -1) { - printf("[ERROR] Failed to create socket\n"); + Log("[ERROR] Failed to create socket\n"); exit(1); } sockaddr_in serverAddr; memset(&serverAddr, 0, sizeof(serverAddr)); serverAddr.sin_family = AF_INET; - serverAddr.sin_port = htons(masterPort); - if (inet_pton(AF_INET, masterAddr.c_str(), &serverAddr.sin_addr) <= 0) { - printf("[ERROR] Invalid master address: %s\n", masterAddr.c_str()); + char const* gaiErr = nullptr; + if (!detail::ResolveMasterAddrV4(masterAddr.c_str(), masterPort, &serverAddr, &gaiErr)) { + if (gaiErr) + Log("[ERROR] Invalid master address '%s': %s\n", masterAddr.c_str(), gaiErr); + else + Log("[ERROR] Invalid master address: %s\n", masterAddr.c_str()); exit(1); } // Retry connection with backoff if (verbose) - printf("[INFO] Rank %d attempting to connect to %s:%d\n", rank, masterAddrStr, masterPort); + Log("[INFO] Rank %d attempting to connect to %s:%d\n", rank, masterAddr.c_str(), masterPort); int maxRetries = 50; + bool connected = false; for (int retry = 0; retry < maxRetries; retry++) { if (connect(sock, (sockaddr*)&serverAddr, sizeof(serverAddr)) == 0) { + connected = true; break; } if (retry == maxRetries - 1) { - printf("[ERROR] Failed to connect to master after %d retries\n", maxRetries); + Log("[ERROR] Failed to connect to master after %d retries\n", maxRetries); } sleep(1); } + if (!connected) { + close(sock); + exit(1); + } // Send local rank to the server send(sock, (char*)&rank, sizeof(rank), 0); @@ -5568,7 +6644,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) MPI_Comm_size(comm, &numRanks); if (numRanks > 1) { if (verbose) { - printf("[INFO] Enabling MPI communicator (%d ranks found)\n", numRanks); + Log("[INFO] Enabling MPI communicator (%d ranks found)\n", numRanks); } commMode = COMM_MPI; } else if (mpiInit) { @@ -5578,6 +6654,59 @@ static bool IsConfiguredGid(union ibv_gid const& gid) #endif } + void System::Log(const char* format, ...) 
const
   {
     if (rankDoesOutput) {
       va_list args;
       va_start(args, format);
       vprintf(format, args);
       va_end(args);
     }
   }
 
+  void System::LogTransfers(std::vector<Transfer> const& transfers)
+  {
+    if (!dumpCfgFile || !rankDoesOutput) return;
+
+    fprintf(dumpCfgFile, "-%lu ", transfers.size());
+    for (auto const& t : transfers) {
+      fprintf(dumpCfgFile, "(");
+
+      // Print SRCs
+      for (auto const& src : t.srcs) {
+        fprintf(dumpCfgFile, "R%d%c%d", src.memRank, MemTypeStr[src.memType], src.memIndex);
+      }
+      if (t.srcs.empty())
+        fprintf(dumpCfgFile, "N");
+
+      fprintf(dumpCfgFile, "->");
+
+      // Print Executor
+      fprintf(dumpCfgFile, "R%d%c%d", t.exeDevice.exeRank, ExeTypeStr[t.exeDevice.exeType], t.exeDevice.exeIndex);
+      if (t.exeDevice.exeSlot != 0)
+        fprintf(dumpCfgFile, "%c", 'A' + t.exeDevice.exeSlot);
+      if (t.exeSubIndex != -1) {
+        fprintf(dumpCfgFile, ".%d", t.exeSubIndex);
+      }
+      if (t.exeSubSlot != 0) {
+        fprintf(dumpCfgFile, "%c", 'A' + t.exeSubSlot);
+      }
+
+      fprintf(dumpCfgFile, "->");
+
+      // Print DSTs
+      for (auto const& dst : t.dsts) {
+        fprintf(dumpCfgFile, "R%d%c%d", dst.memRank, MemTypeStr[dst.memType], dst.memIndex);
+      }
+      if (t.dsts.empty())
+        fprintf(dumpCfgFile, "N");
+
+      fprintf(dumpCfgFile, " %d %lu)", t.numSubExecs, t.numBytes);
+      fflush(dumpCfgFile);
+    }
+    fprintf(dumpCfgFile, "\n");
+  }
+
   void System::Barrier()
   {
 #ifdef MPI_COMM_ENABLED
@@ -5618,7 +6747,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
 #endif
     if (commMode == COMM_SOCKET) {
       if (rank != 0 && dstRank != 0) {
-        printf("[ERROR] Socket communicator is limited to sending from/to rank 0\n");
+        Log("[ERROR] Socket communicator is limited to sending from/to rank 0\n");
         exit(1);
       }
       auto sock = sockets[dstRank];
@@ -5628,7 +6757,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
       while (totalSent < numBytes) {
        auto sent = send(sock, (char*)sendData + totalSent, numBytes - totalSent, 0);
        if (sent == -1) {
-          printf("[ERROR] Send failed (rank %d to rank %d)\n", rank, dstRank);
+          Log("[ERROR] Send failed (rank %d to rank %d)\n", rank, dstRank);
          exit(1);
        }
        totalSent += sent;
@@ -5647,7 +6776,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
 #endif
     if (commMode == COMM_SOCKET) {
       if (rank != 0 && srcRank != 0) {
-        printf("[ERROR] Socket communicator is limited to receiving from/at rank 0\n");
+        Log("[ERROR] Socket communicator is limited to receiving from/at rank 0\n");
         exit(1);
       }
@@ -5656,7 +6785,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
       while (totalRecv < numBytes) {
        auto recvd = recv(sock, (char*)recvData + totalRecv, numBytes - totalRecv, 0);
        if (recvd == -1 || recvd == 0) {
-          printf("[ERROR] Recv failed (rank %d from rank %d)\n", rank, srcRank);
+          Log("[ERROR] Recv failed (rank %d from rank %d)\n", rank, srcRank);
          perror("recv");
          exit(1);
        }
@@ -5673,7 +6802,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
     if (commMode == COMM_MPI) {
       int err = MPI_Bcast(data, numBytes, MPI_CHAR, root, comm);
       if (err != MPI_SUCCESS) {
-        printf("[ERROR] MPI_Bcast failed with error code %d\n", err);
+        Log("[ERROR] MPI_Bcast failed with error code %d\n", err);
       }
       return;
     }
@@ -5727,6 +6856,103 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
     return "Unknown CPU";
   }
 
+  void System::CollectPodMembership(char* ppodId, int64_t& vpodId)
+  {
+    memset(ppodId, 0, 16);
+    vpodId = -1;
+
+    // TB_FORCE_SINGLE_POD skips the AMDSMI/NVML fabric queries entirely
+    char* forceSinglePod = getenv("TB_FORCE_SINGLE_POD");
+    if (forceSinglePod) {
+      vpodId = 0;
+      return;
+    }
+
+    // Check fabric support
+#if defined(__NVCC__)
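+    // NVIDIA path: pod membership is derived from the MNNVL fabric info reported by NVML
+    // (the clique id and cluster UUID below serve as the vpod/ppod identifiers).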
+#ifdef NVML_ENABLED + if (!MnnvlCheck()) return; + char busId[] = "00000000:00:00.0"; + if (cudaDeviceGetPCIBusId(busId, sizeof(busId), 0)) return; + + nvmlGpuFabricInfoV_t fabricInfo; + fabricInfo.state = NVML_GPU_FABRIC_STATE_NOT_SUPPORTED; + nvmlDevice_t nvmlDev; + nvmlReturn_t err = nvmlDeviceGetHandleByPciBusId_v2(busId, &nvmlDev); + if (err != NVML_SUCCESS) { + if (verbose) { + System::Get().Log("[WARN] Unable to get processor handle for GPU 0 at %s [%s]\n", + busId, nvmlErrorString(err)); + } + return; + } + fabricInfo.version = nvmlGpuFabricInfo_v2; + + err = nvmlDeviceGetGpuFabricInfoV(nvmlDev, &fabricInfo); + if (err != NVML_SUCCESS || fabricInfo.state == NVML_GPU_FABRIC_STATE_NOT_SUPPORTED) { + System::Get().Log("[WARN] MNNVL not supported\n"); + } else { + vpodId = fabricInfo.cliqueId; + memcpy(ppodId, fabricInfo.clusterUuid, 16); + } +#endif +#else +#ifdef AMD_SMI_ENABLED + int numGpus = 0; + if (hipGetDeviceCount(&numGpus) == hipSuccess && numGpus > 0) { + // Query GPU 0 as the representative for pod membership. All GPUs on a node are + // expected to share the same pod (ppod_id/vpod_id), so querying any one is sufficient. + char pciBusId[256] = ""; + hipError_t hipErr = hipDeviceGetPCIBusId(pciBusId, sizeof(pciBusId), 0); + if (hipErr != hipSuccess) { + if (verbose) { + Log("[WARN] Unable to get PCI bus ID for GPU 0; skipping AMD-SMI pod membership query\n"); + } + return; + } + + amdsmi_bdf_t bdf = {}; + unsigned domain, bus, device, func; + if (sscanf(pciBusId, "%x:%x:%x.%x", &domain, &bus, &device, &func) != 4) { + if (verbose) { + Log("[WARN] Unable to parse PCI bus ID '%s'; skipping AMD-SMI pod membership query\n", pciBusId); + } + return; + } + bdf.domain_number = domain; + bdf.bus_number = bus; + bdf.device_number = device; + bdf.function_number = func; + + amdsmi_processor_handle gpuHandle; + amdsmi_status_t err = amdsmi_get_processor_handle_from_bdf(bdf, &gpuHandle); + if (err != AMDSMI_STATUS_SUCCESS) { + if (verbose) { + const char *errString = NULL; + amdsmi_status_code_to_string(err, &errString); + Log("[WARN] Unable to get processor handle for GPU 0 at %s [%s]\n", + pciBusId, errString); + } + } else { + amdsmi_fabric_info_t fabricInfo; + err = amdsmi_get_gpu_fabric_info(gpuHandle, &fabricInfo); + if (err == AMDSMI_STATUS_SUCCESS) { + // NOTE: vpod_id is a uint32_t but System holds it as an int64_t to allow for + // vpodId == -1 to represent no pod present + memcpy(ppodId, &fabricInfo.fabric_info.fabric_version.v1.ppod_id, + sizeof(fabricInfo.fabric_info.fabric_version.v1.ppod_id)); + vpodId = fabricInfo.fabric_info.fabric_version.v1.vpod_id; + } else if (verbose) { + const char *errString = NULL; + amdsmi_status_code_to_string(err, &errString); + Log("[WARN] Unable to get fabric info from AMD SMI [%s]\n", errString); + } + } + } +#endif +#endif + } + void System::GetRankTopology(RankTopology& topo) { // Clear topology structure first @@ -5743,9 +6969,8 @@ static bool IsConfiguredGid(union ibv_gid const& gid) char* firstDotPtr = std::strchr(topo.hostname, '.'); if (firstDotPtr) *firstDotPtr = 0; - // NOTE: Placeholder values - strcpy(topo.ppodId, "N/A"); - topo.vpodId = -1; + // Collect Pod membership + CollectPodMembership(topo.ppodId, topo.vpodId); // CPU Executor int numCpus = numa_num_configured_nodes(); @@ -5764,9 +6989,9 @@ static bool IsConfiguredGid(union ibv_gid const& gid) if (verbose) { for (int exeIndex = 0; exeIndex < numCpus; exeIndex++) { - printf("[INFO] Rank %03d: CPU [%02d/%02d] %03d cores (%s)\n", rank, exeIndex, numCpus, - 
topo.numSubExecutors[{EXE_CPU, exeIndex}],
-               topo.executorName[{EXE_CPU, exeIndex}].c_str());
+        Log("[INFO] Rank %03d: CPU [%02d/%02d] %03d cores (%s)\n", rank, exeIndex, numCpus,
+            topo.numSubExecutors[{EXE_CPU, exeIndex}],
+            topo.executorName[{EXE_CPU, exeIndex}].c_str());
       }
     }
 
@@ -5776,6 +7001,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
     if (status != hipSuccess) numGpus = 0;
     topo.numExecutors[EXE_GPU_GFX] = numGpus;
     topo.numExecutors[EXE_GPU_DMA] = numGpus;
+    topo.numExecutors[EXE_GPU_BDMA] = numGpus;
 
     for (int exeIndex = 0; exeIndex < numGpus; exeIndex++) {
       int numDeviceCUs = 0;
@@ -5794,6 +7020,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
       }
       topo.executorName[{EXE_GPU_GFX, exeIndex}] = gpuName;
       topo.executorName[{EXE_GPU_DMA, exeIndex}] = gpuName;
+      topo.executorName[{EXE_GPU_BDMA, exeIndex}] = gpuName;
 
 #if !defined(__NVCC__)
       hsa_agent_t gpuAgent = gpuAgents[exeIndex];
@@ -5822,8 +7049,10 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
 #endif
       topo.numExecutorSubIndices[{EXE_GPU_GFX, exeIndex}] = numXccs;
       topo.numExecutorSubIndices[{EXE_GPU_DMA, exeIndex}] = numDmaEngines;
+      topo.numExecutorSubIndices[{EXE_GPU_BDMA, exeIndex}] = 0;
       topo.numSubExecutors[{EXE_GPU_GFX, exeIndex}] = numDeviceCUs;
       topo.numSubExecutors[{EXE_GPU_DMA, exeIndex}] = 1;
+      topo.numSubExecutors[{EXE_GPU_BDMA, exeIndex}] = numDmaEngines;
       topo.closestCpuNumaToGpu[exeIndex] = closestNuma;
       topo.closestNicsToGpu[exeIndex] = {};
     }
@@ -5837,7 +7066,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
       topo.executorName[{EXE_NIC, exeIndex}] = GetIbvDeviceList()[exeIndex].name;
       topo.nicIsActive[exeIndex] = GetIbvDeviceList()[exeIndex].hasActivePort;
       if (verbose) {
-        printf("[INFO] Rank %03d: NIC [%02d/%02d] on CPU NUMA %d\n", rank, exeIndex, numNics, topo.closestCpuNumaToNic[exeIndex]);
+        Log("[INFO] Rank %03d: NIC [%02d/%02d] on CPU NUMA %d\n", rank, exeIndex, numNics, topo.closestCpuNumaToNic[exeIndex]);
       }
     }
 #endif
@@ -5883,7 +7112,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
     hipError_t err = hipDeviceGetPCIBusId(hipPciBusId, sizeof(hipPciBusId), gpuIndex);
     if (err != hipSuccess) {
 #ifdef VERBS_DEBUG
-      printf("Failed to get PCI Bus ID for HIP device %d: %s\n", gpuIndex, hipGetErrorString(err));
+      Log("Failed to get PCI Bus ID for HIP device %d: %s\n", gpuIndex, hipGetErrorString(err));
 #endif
       continue;
     }
@@ -5902,7 +7131,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
     // to determine the closest NIC to GPU if the PCIe tree approach fails
     if (closestIdx < 0) {
 #ifdef VERBS_DEBUG
-      printf("[WARN] Falling back to PCIe bus ID distance to determine proximity\n");
+      Log("[WARN] Falling back to PCIe bus ID distance to determine proximity\n");
 #endif
       int minDistance = std::numeric_limits<int>::max();
       for (int nicIndex = 0; nicIndex < numNics; nicIndex++) {
@@ -5972,31 +7201,31 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
     if (verbose) {
       for (int exeIndex = 0; exeIndex < numGpus; exeIndex++) {
-        printf("[INFO] Rank %03d: GPU [%02d/%02d] %d XCCs %03d CUs on CPU NUMA %d Closest NICs:", rank, exeIndex, numGpus,
-               topo.numExecutorSubIndices[{EXE_GPU_GFX, exeIndex}],
-               topo.numSubExecutors[{EXE_GPU_GFX, exeIndex}],
-               topo.closestCpuNumaToGpu[exeIndex]);
+        Log("[INFO] Rank %03d: GPU [%02d/%02d] %d XCCs %03d CUs on CPU NUMA %d Closest NICs:", rank, exeIndex, numGpus,
+            topo.numExecutorSubIndices[{EXE_GPU_GFX, exeIndex}],
+            topo.numSubExecutors[{EXE_GPU_GFX, exeIndex}],
+            topo.closestCpuNumaToGpu[exeIndex]);
         if (topo.closestNicsToGpu[exeIndex].size() == 0) {
printf(" none\n"); + Log(" none"); } else { for (auto nicIndex : topo.closestNicsToGpu[exeIndex]) { - printf(" %d", nicIndex); + Log(" %d", nicIndex); } - printf("\n"); + Log("\n"); } } #ifdef NIC_EXEC_ENABLED for (int nicIndex = 0; nicIndex < numNics; nicIndex++) { - printf("[INFO] Rank %03d: NIC [%02d/%02d] %s Closest GPUs:", rank, nicIndex, numNics, - ibvDeviceList[nicIndex].name.c_str()); + Log("[INFO] Rank %03d: NIC [%02d/%02d] %s Closest GPUs:", rank, nicIndex, numNics, + ibvDeviceList[nicIndex].name.c_str()); if (topo.closestGpusToNic[nicIndex].size() == 0) { - printf(" none"); + Log(" none"); } else { for (auto gpuIndex : topo.closestGpusToNic[nicIndex]) { - printf(" %d", gpuIndex); + Log(" %d", gpuIndex); } } - printf("\n"); + Log("\n"); } #endif } @@ -6099,7 +7328,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) void System::SendRankTopo(int peerRank, RankTopology const& topo) const { SendData(peerRank, sizeof(topo.hostname), topo.hostname); - SendData(peerRank, sizeof(topo.ppodId), &topo.ppodId); + SendData(peerRank, sizeof(topo.ppodId), topo.ppodId); SendData(peerRank, sizeof(topo.vpodId), &topo.vpodId); SendMap(peerRank, topo.numExecutors); SendMap(peerRank, topo.numExecutorSubIndices); @@ -6115,7 +7344,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) void System::RecvRankTopo(int peerRank, RankTopology& topo) const { RecvData(peerRank, sizeof(topo.hostname), topo.hostname); - RecvData(peerRank, sizeof(topo.ppodId), &topo.ppodId); + RecvData(peerRank, sizeof(topo.ppodId), topo.ppodId); RecvData(peerRank, sizeof(topo.vpodId), &topo.vpodId); RecvMap(peerRank, topo.numExecutors); RecvMap(peerRank, topo.numExecutorSubIndices); @@ -6196,7 +7425,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) } else { BROADCAST(setSize); tfrResult.perIterCUs[i].clear(); - if (setSize > 0) { + for (size_t j = 0; j < setSize; j++) { pair p; BROADCAST(p); tfrResult.perIterCUs[i].insert(p); @@ -6243,7 +7472,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) return {ERR_FATAL, "CPU index must be between 0 and %d inclusively", numCpus - 1}; agent = cpuAgents[exeDevice.exeIndex]; break; - case EXE_GPU_GFX: case EXE_GPU_DMA: + case EXE_GPU_GFX: case EXE_GPU_DMA: case EXE_GPU_BDMA: if (exeIndex < 0 || exeIndex >= numGpus) return {ERR_FATAL, "GPU index must be between 0 and %d inclusively", numGpus - 1}; agent = gpuAgents[exeIndex]; @@ -6316,7 +7545,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) rankInfo[0] = localTopo; for (int peerRank = 1; peerRank < numRanks; peerRank++) { if (verbose) { - printf("[INFO] Rank 0 receives topology from Rank %d\n", peerRank); + Log("[INFO] Rank 0 receives topology from Rank %d\n", peerRank); } RecvRankTopo(peerRank, rankInfo[peerRank]); } @@ -6325,7 +7554,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) for (int peerRank = 1; peerRank < numRanks; peerRank++) { for (int i = 0; i < numRanks; i++) { if (verbose) { - printf("[INFO] Rank 0 sends topology %d to Rank %d\n", i, peerRank); + Log("[INFO] Rank 0 sends topology %d to Rank %d\n", i, peerRank); } SendRankTopo(peerRank, rankInfo[i]); } @@ -6333,14 +7562,14 @@ static bool IsConfiguredGid(union ibv_gid const& gid) } else { // Send local topology info back to root if (verbose) { - printf("[INF0] Rank %d sends topology from Rank 0\n", rank); + Log("[INF0] Rank %d sends topology from Rank 0\n", rank); } SendRankTopo(0, localTopo); for (int i = 0; i < numRanks; i++) { RecvRankTopo(0, rankInfo[i]); if (verbose) { - printf("[INF0] Rank %d receives topology %d from Rank 
0\n", rank, i); + Log("[INF0] Rank %d receives topology %d from Rank 0\n", rank, i); } } } @@ -6409,16 +7638,47 @@ static bool IsConfiguredGid(union ibv_gid const& gid) return rankInfo[targetRank].hostname; } - std::string System::GetPpodId(int targetRank) const + int64_t System::GetPodIdx(int targetRank) const { + using PodKey = std::pair, int64_t>; + + static std::map podIdxMap; + static bool initialized = false; + + if (!initialized) { + int64_t nextIdx = 0; + for (int r = 0; r < numRanks; r++) { + PodKey key; + memcpy(key.first.data(), rankInfo[r].ppodId, 16); + key.second = rankInfo[r].vpodId; + + // vpodIdx == -1 means not part of any pod; assign -1 directly + if (key.second == -1) continue; + + if (podIdxMap.find(key) == podIdxMap.end()) { + podIdxMap[key] = nextIdx++; + } + } + initialized = true; + } + if (targetRank < 0 || targetRank >= numRanks) targetRank = rank; - return rankInfo[targetRank].ppodId; + + PodKey key; + memcpy(key.first.data(), rankInfo[targetRank].ppodId, 16); + key.second = rankInfo[targetRank].vpodId; + + if (key.second == -1) return -1; + + return podIdxMap[key]; } - int System::GetVpodId(int targetRank) const + bool System::IsSamePod(int targetRank, int sourceRank) const { - if (targetRank < 0 || targetRank >= numRanks) targetRank = rank; - return rankInfo[targetRank].vpodId; + if (sourceRank < 0 || sourceRank >= numRanks) sourceRank = rank; + if (GetPodIdx(sourceRank) == -1 || GetPodIdx(targetRank) == -1) + return false; + return GetPodIdx(sourceRank) == GetPodIdx(targetRank); } std::string System::GetExecutorName(ExeDevice exeDevice) const @@ -6527,14 +7787,14 @@ static bool IsConfiguredGid(union ibv_gid const& gid) return System::Get().GetHostname(targetRank); } - std::string GetPpodId(int targetRank) + int64_t GetPodIdx(int targetRank) { - return System::Get().GetPpodId(targetRank); + return System::Get().GetPodIdx(targetRank); } - int GetVpodId(int targetRank) + bool IsSamePod(int targetRank, int sourceRank) { - return System::Get().GetVpodId(targetRank); + return System::Get().IsSamePod(targetRank, sourceRank); } std::string GetExecutorName(ExeDevice exeDevice) @@ -6559,18 +7819,28 @@ static bool IsConfiguredGid(union ibv_gid const& gid) #undef hipError_t #undef hipEvent_t #undef hipStream_t +#undef hipMemAllocationProp +#undef hipMemGenericAllocationHandle_t +#undef hipMemAccessDesc +#undef hipMemFabricHandle_t // Enumerations #undef hipDeviceAttributeClockRate -#undef hipDeviceAttributeMaxSharedMemoryPerMultiprocessor #undef hipDeviceAttributeMultiprocessorCount #undef hipDeviceAttributeWarpSize #undef hipErrorPeerAccessAlreadyEnabled #undef hipFuncCachePreferShared #undef hipMemcpyDefault +#undef hipMemcpyKind #undef hipMemcpyDeviceToHost #undef hipMemcpyHostToDevice #undef hipSuccess +#undef hipMemLocationTypeDevice +#undef hipMemAllocationTypePinned +//#undef hipMemAllocationTypeUncached +#undef hipMemHandleTypeFabric +#undef hipMemAllocationGranularityRecommended +#undef hipMemAccessFlagsProtReadWrite // Functions #undef hipDeviceCanAccessPeer @@ -6599,11 +7869,21 @@ static bool IsConfiguredGid(union ibv_gid const& gid) #undef hipStreamCreate #undef hipStreamDestroy #undef hipStreamSynchronize +#undef hipMemGetAllocationGranularity +#undef hipMemCreate +#undef hipMemAddressReserve +#undef hipMemMap +#undef hipMemSetAccess +#undef hipMemUnmap +#undef hipMemRelease +#undef hipMemAddressFree +#undef hipMemExportToShareableHandle +#undef hipMemImportFromShareableHandle #endif // Kernel macros #undef GetHwId -#undef GetXccId +//#undef GetXccId // 
// Undefine helper macros
 #undef ERR_CHECK
diff --git a/toolchain-linux.cmake b/toolchain-linux.cmake
deleted file mode 100644
index 712c5f1c..00000000
--- a/toolchain-linux.cmake
+++ /dev/null
@@ -1,34 +0,0 @@
-
-if (DEFINED ENV{ROCM_PATH})
-  set(ROCM_PATH "$ENV{ROCM_PATH}" CACHE PATH "Path to the ROCm installation.")
-  set(rocm_bin "$ENV{ROCM_PATH}/bin")
-else()
-  set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to the ROCm installation.")
-  set(rocm_bin "/opt/rocm/bin")
-endif()
-
-if (NOT DEFINED ENV{CXX})
-  if(EXISTS "${rocm_bin}/amdclang++")
-    set(CMAKE_CXX_COMPILER "${rocm_bin}/amdclang++" CACHE PATH "Path to the C++ compiler")
-  else()
-    if(EXISTS "${ROCM_PATH}/llvm/bin/amdclang++")
-      set(rocm_bin "${ROCM_PATH}/llvm/bin")
-      set(CMAKE_CXX_COMPILER "${rocm_bin}/amdclang++" CACHE PATH "Path to the C++ compiler")
-    elseif(EXISTS "${ROCM_PATH}/llvm/bin/clang++")
-      set(rocm_bin "${ROCM_PATH}/llvm/bin")
-      set(CMAKE_CXX_COMPILER "${rocm_bin}/clang++" CACHE PATH "Path to the C++ compiler")
-    endif()
-  endif()
-else()
-  set(CMAKE_CXX_COMPILER "$ENV{CXX}" CACHE PATH "Path to the C++ compiler")
-endif()
-
-if (NOT DEFINED ENV{CXXFLAGS})
-  set(CMAKE_CXX_FLAGS_DEBUG "-g -O1")
-  set(CMAKE_CXX_FLAGS_RELEASE "-O3")
-endif()
-
-if(NOT CMAKE_BUILD_TYPE)
-  message(STATUS "Setting build type to 'Release' as none was specified.")
-  set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose the type of build." FORCE)
-endif()

From 51d8ebc680bb7feecd67cdaf5ce10b3c24a36a87 Mon Sep 17 00:00:00 2001
From: Gilbert Lee
Date: Sun, 3 May 2026 00:30:07 -0500
Subject: [PATCH 2/2] Fixing rings preset to be parallel rings / updating
 formatting

---
 src/client/Presets/Rings.hpp | 315 +++++++++++++++--------------------
 1 file changed, 136 insertions(+), 179 deletions(-)

diff --git a/src/client/Presets/Rings.hpp b/src/client/Presets/Rings.hpp
index bee03055..b9ffb0ff 100644
--- a/src/client/Presets/Rings.hpp
+++ b/src/client/Presets/Rings.hpp
@@ -20,6 +20,8 @@
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
 */
+#include <numeric>
+
 int RingsPreset(EnvVars& ev,
                 size_t const numBytesPerTransfer,
                 std::string const presetName,
@@ -30,14 +32,14 @@ int RingsPreset(EnvVars& ev,
     Utils::Print("[ERROR] rings preset can only be run across ranks that are homogeneous\n");
     Utils::Print("[ERROR] Run ./TransferBench without any args to display topology information\n");
     Utils::Print("[ERROR] TB_NIC_FILTER may also be used to limit NIC visibility\n");
-    return 1;
+    return ERR_FATAL;
   }
 
   // Check for pod support (if multi-node)
   int numRanks = TransferBench::GetNumRanks();
-  if (numRanks > 1 && Utils::GetRankPerPodMap().empty()) {
-    Utils::Print("[ERROR] No pods detected. Set TB_FORCE_SINGLE_POD=1 to treat all ranks as a single pod.\n");
-    return 1;
+  if (numRanks > 1 && Utils::GetRankPerPodMap().size() != 1) {
+    Utils::Print("[ERROR] Multi-rank runs must be within a single pod. Set TB_FORCE_SINGLE_POD=1 to treat all ranks as a single pod.\n");
+    return ERR_FATAL;
   }
 
   int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
@@ -50,36 +52,24 @@ int RingsPreset(EnvVars& ev,
   int useDmaExec    = EnvVars::GetEnvVar("USE_DMA_EXEC"   , 0);
   int useRemoteRead = EnvVars::GetEnvVar("USE_REMOTE_READ", 0);
   int stride        = EnvVars::GetEnvVar("STRIDE"         , 1);
-  int groupSize     = EnvVars::GetEnvVar("GROUP_SIZE"     , numRanks * numGpus);
+  int ringSize      = EnvVars::GetEnvVar("RING_SIZE"      , numRanks * numGpus);
+
   if (numGpus <= 0 || numGpus > numDetectedGpus) {
    Utils::Print("[ERROR] Cannot use %d GPUs. Detected %d GPUs\n", numGpus, numDetectedGpus);
-    return 1;
+    return ERR_FATAL;
  }
-  if (groupSize <= 0) {
-    Utils::Print("[ERROR] Group size must be greater than 0\n");
-    return 1;
+  if (ringSize <= 0) {
+    Utils::Print("[ERROR] Ring size must be greater than 0\n");
+    return ERR_FATAL;
  }
-  if (numRanks * numGpus % groupSize) {
-    Utils::Print("[ERROR] Group size %d cannot evenly divide %d total devices from %d ranks.\n",
-                 groupSize, numRanks * numGpus, numRanks);
-    return 1;
+  int totalGpus = numRanks * numGpus;
+  if (totalGpus % ringSize) {
+    Utils::Print("[ERROR] Ring size %d must evenly divide the total number of GPUs %d\n", ringSize, totalGpus);
+    return ERR_FATAL;
  }
 
-  int numNics = TransferBench::GetNumExecutors(EXE_NIC, 0);
-  bool nicDifference = false;
-  for (int rank = 0; rank < numRanks; rank++) {
-    if (numGpus > TransferBench::GetNumExecutors(EXE_GPU_GFX, rank)) {
-      Utils::Print("[ERROR] rings preset requires each rank to have the same number of GPUs\n");
-      return 1;
-    }
-    if (numQueuePairs > 0 && numNics != TransferBench::GetNumExecutors(EXE_NIC, rank))
-      nicDifference = true;
-  }
-  if (nicDifference)
-    Utils::Print("[WARN] Not all ranks have the same number of NICs\n");
-
-  MemType     memType       = Utils::GetGpuMemType(memTypeIdx);
+  MemType     memType       = Utils::GetGpuMemType(memTypeIdx);
   std::string devMemTypeStr = Utils::GetGpuMemTypeStr(memTypeIdx);
 
   if (Utils::RankDoesOutput()) {
@@ -93,12 +83,12 @@ int RingsPreset(EnvVars& ev,
     ev.Print("USE_DMA_EXEC"   , useDmaExec   , "Using %s executor", useDmaExec ? "DMA" : "GFX");
     ev.Print("USE_REMOTE_READ", useRemoteRead, "Using %s as executor", useRemoteRead ? "DST" : "SRC");
     ev.Print("STRIDE"         , stride       , "Reordering devices by taking %d steps", stride);
-    ev.Print("GROUP_SIZE"     , groupSize    , "Dividing all devices into ring groups of %d", groupSize);
+    ev.Print("RING_SIZE"      , ringSize     , "Building rings of size %d", ringSize);
     printf("\n");
    }
  }
 
-  Utils::Print("GPU-%s IntraPod Ring benchmark:\n", useDmaExec ? "DMA" : "GFX");
+  Utils::Print("GPU-%s Rings benchmark:\n", useDmaExec ? "DMA" : "GFX");
   Utils::Print("==============================\n");
   Utils::Print("[%lu bytes per Transfer] [%s:%d] [MemType:%s] [NIC QueuePairs:%d] [#Ranks:%d]\n",
                numBytesPerTransfer, useDmaExec ? "DMA" : "GFX", numSubExecs,
                devMemTypeStr.c_str(), numQueuePairs, numRanks);
@@ -107,174 +97,141 @@ int RingsPreset(EnvVars& ev,
   TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
   ExeType exeType = useDmaExec ? EXE_GPU_DMA : EXE_GPU_GFX;
 
-  int n         = numRanks * numGpus;
-  int numGroups = n / groupSize;
+  int numRings = totalGpus / ringSize;
+  Utils::Print("Running %d parallel ring(s) each with %d devices. All numbers in GB/s:\n", numRings, ringSize);
 
-  std::vector<int> indices(n);
-  for (int k = 0; k < n; k++) indices[k] = k;
+  // Determine ordering of GPUs for the rings based on stride
+  std::vector<int> indices(totalGpus);
+  std::iota(indices.begin(), indices.end(), 0);
   Utils::StrideGenerate(indices, stride);
 
-  std::vector<MemDevice> devices(n);
-  for (int i = 0; i < n; i++) {
-    int const globalIdx = indices[i];
-    int const rank      = globalIdx / numGpus;
-    int const devIdx    = globalIdx % numGpus;
-    devices[i] = {memType, devIdx, rank};
+  // Establish memory devices for all GPUs
+  std::vector<MemDevice> memDevices(totalGpus);
+  for (int i = 0; i < totalGpus; i++) {
+    memDevices[i] = {memType, indices[i] % numGpus, indices[i] / numGpus};
   }
 
-  Utils::Print("%d ring(s) of %d devices:\n", numGroups, groupSize);
-  for (int group = 0; group < numGroups; group++) {
-    int const groupBase = group * groupSize;
-    Utils::Print("  Ring %d: ", group);
-    for (int i = 0; i < groupSize; i++) {
-      Utils::Print("R%d:G%d -> ", devices[groupBase + i].memRank, devices[groupBase + i].memIndex);
-    }
-    Utils::Print("R%d:G%d\n", devices[groupBase].memRank, devices[groupBase].memIndex);
-  }
-  Utils::Print("\n");
-
-  for (int group = 0; group < numGroups; group++) {
-    int const groupBase = group * groupSize;
-    std::vector<TransferBench::Transfer> transfers;
-
-    for (int i = 0; i < groupSize; i++) {
-      int srcIdx = groupBase + i;
-      int dstIdx = groupBase + (i + 1) % groupSize;
-
-      TransferBench::Transfer transfer;
-      transfer.numBytes = numBytesPerTransfer;
-      transfer.srcs.push_back(devices[srcIdx]);
-      transfer.dsts.push_back(devices[dstIdx]);
-      transfer.exeDevice = {exeType,
-                            (int32_t)(useRemoteRead ? devices[dstIdx].memIndex : devices[srcIdx].memIndex),
-                            (int32_t)(useRemoteRead ? devices[dstIdx].memRank  : devices[srcIdx].memRank)};
-      transfer.exeSubIndex = -1;
-      transfer.numSubExecs = numSubExecs;
-      transfers.push_back(transfer);
-
+  // Build list of Transfers
+  std::vector<Transfer> transfers;
+  for (int ringIdx = 0; ringIdx < numRings; ringIdx++) {
+    int const ringBase = ringIdx * ringSize;
+
+    // Build GFX or DMA transfers for this ring
+    for (int i = 0; i < ringSize; i++) {
+      Transfer t;
+      int srcIdx = ringBase + i;
+      int dstIdx = ringBase + (i + 1) % ringSize;
+      int exeIdx = useRemoteRead ? dstIdx : srcIdx;
+      t.numBytes    = numBytesPerTransfer;
+      t.srcs        = {memDevices[srcIdx]};
+      t.dsts        = {memDevices[dstIdx]};
+      t.exeDevice   = {exeType, memDevices[exeIdx].memIndex, memDevices[exeIdx].memRank};
+      t.numSubExecs = numSubExecs;
+      transfers.push_back(t);
+
+      // Build NIC transfers between these GPUs as well if requested
       if (numQueuePairs > 0) {
-        TransferBench::Transfer nicTransfer;
-        nicTransfer.numBytes = numBytesPerTransfer;
-        nicTransfer.srcs.push_back(devices[srcIdx]);
-        nicTransfer.dsts.push_back(devices[dstIdx]);
-        nicTransfer.exeDevice = {TransferBench::EXE_NIC_NEAREST,
-                                 (int32_t)devices[srcIdx].memIndex, (int32_t)devices[srcIdx].memRank};
-        nicTransfer.exeSubIndex = devices[dstIdx].memIndex;
+        Transfer nicTransfer    = t;
+        nicTransfer.exeDevice   = {EXE_NIC_NEAREST, memDevices[exeIdx].memIndex, memDevices[exeIdx].memRank};
+        nicTransfer.exeSubIndex = memDevices[useRemoteRead ? srcIdx : dstIdx].memIndex;
        nicTransfer.numSubExecs = numQueuePairs;
        transfers.push_back(nicTransfer);
      }
    }
+  }
 
-    TransferBench::TestResults results;
-    if (!TransferBench::RunTransfers(cfg, transfers, results)) {
-      for (auto const& err : results.errResults)
-        Utils::Print("%s\n", err.errMsg.c_str());
-      return 1;
-    }
-    if (showDetails) {
-      Utils::PrintResults(ev, 1, transfers, results);
-      Utils::Print("\n");
-    }
-
-    if (Utils::RankDoesOutput()) {
-      Utils::Print("\n--- Ring Group %d ---\n", group);
-
-      int const numHops   = groupSize;
-      int const numRows   = 2 + numHops + 3;
-      int const numCols   = 6;
-      int const precision = 2;
-      Utils::TableHelper table(numRows, numCols, precision);
-
-      table.DrawRowBorder(0);
-      table.DrawColBorder(0);
-      table.DrawColBorder(numCols);
-      table.DrawRowBorder(numRows);
-
-      table.Set(0, 0, " Src ");
-      table.Set(0, 1, " Src ");
-      table.Set(0, 2, " Dst ");
-      table.Set(0, 3, " Dst ");
-      table.Set(0, 4, " GFX BW ");
-      table.Set(1, 0, " Rank ");
-      table.Set(1, 1, " GPU ");
-      table.Set(1, 2, " Rank ");
-      table.Set(1, 3, " GPU ");
-      table.Set(1, 4, " (GB/s) ");
-      table.DrawColBorder(2);
-      table.DrawColBorder(4);
-
-      if (numQueuePairs > 0) {
-        table.Set(0, 5, " NIC BW ");
-        table.Set(1, 5, " (GB/s) ");
-      } else {
-        table.Set(0, 5, "        ");
-        table.Set(1, 5, "        ");
-      }
-
-      table.DrawRowBorder(2);
-
-      double gfxMin = std::numeric_limits<double>::max();
-      double gfxAvg = 0.0;
-      double gfxMax = std::numeric_limits<double>::lowest();
-      double nicMin = std::numeric_limits<double>::max();
-      double nicAvg = 0.0;
-      double nicMax = std::numeric_limits<double>::lowest();
-
-      int tfrIdx = 0;
-      for (int i = 0; i < numHops; i++) {
-        int srcIdx = groupBase + i;
-        int dstIdx = groupBase + (i + 1) % groupSize;
-        int row    = 2 + i;
-
-        double gfxBw = results.tfrResults[tfrIdx].avgBandwidthGbPerSec;
-        tfrIdx++;
-
-        table.Set(row, 0, " %d ", devices[srcIdx].memRank);
-        table.Set(row, 1, " %d ", devices[srcIdx].memIndex);
-        table.Set(row, 2, " %d ", devices[dstIdx].memRank);
-        table.Set(row, 3, " %d ", devices[dstIdx].memIndex);
-        table.Set(row, 4, " %.2f ", gfxBw);
-
-        gfxMin = std::min(gfxMin, gfxBw);
-        gfxAvg += gfxBw;
-        gfxMax = std::max(gfxMax, gfxBw);
-
-        if (numQueuePairs > 0) {
-          double nicBw = results.tfrResults[tfrIdx].avgBandwidthGbPerSec;
-          tfrIdx++;
-          table.Set(row, 5, " %.2f ", nicBw);
-          nicMin = std::min(nicMin, nicBw);
-          nicAvg += nicBw;
-          nicMax = std::max(nicMax, nicBw);
-        }
-      }
-
-      int summaryBase = 2 + numHops;
-      table.DrawRowBorder(summaryBase);
-      table.Set(summaryBase    , 1, " MAX ");
-      table.Set(summaryBase + 1, 1, " AVG ");
-      table.Set(summaryBase + 2, 1, " MIN ");
-      table.Set(summaryBase    , 4, " %.2f ", gfxMax);
-      table.Set(summaryBase + 1, 4, " %.2f ", gfxAvg / numHops);
-      table.Set(summaryBase + 2, 4, " %.2f ", gfxMin);
-      if (numQueuePairs > 0) {
-        table.Set(summaryBase    , 5, " %.2f ", nicMax);
-        table.Set(summaryBase + 1, 5, " %.2f ", nicAvg / numHops);
-        table.Set(summaryBase + 2, 5, " %.2f ", nicMin);
-      }
 
+  TransferBench::TestResults results;
+  if (!TransferBench::RunTransfers(cfg, transfers, results)) {
+    for (auto const& err : results.errResults)
+      Utils::Print("%s\n", err.errMsg.c_str());
+    return ERR_FATAL;
+  }
+  if (showDetails) {
+    Utils::PrintResults(ev, 1, transfers, results);
+    Utils::Print("\n");
+  }
 
+  if (Utils::RankDoesOutput()) {
 
+    // Limit the number of columns of output
+    int maxColumns   = 24;
+    int colsPerRing  = (numQueuePairs ? 3 : 2);
+    int ringsPerPage = maxColumns / colsPerRing;
+    int numPages     = (numRings + ringsPerPage - 1) / ringsPerPage;
 
+    // Compute table size
+    int numRows = numPages * (2 + ringSize + 4);
+    int numCols = std::min(numRings, ringsPerPage) * colsPerRing;
+    Utils::TableHelper table(numRows, numCols);
 
+    std::vector<std::vector<double>> ringMin(numQueuePairs ? 2 : 1, std::vector<double>(numRings, std::numeric_limits<double>::max()));
+    std::vector<std::vector<double>> ringSum(numQueuePairs ? 2 : 1, std::vector<double>(numRings, 0.0));
+    std::vector<std::vector<double>> ringMax(numQueuePairs ? 2 : 1, std::vector<double>(numRings, 0.0));
 
+    for (int pageIdx = 0; pageIdx < numPages; pageIdx++) {
+      int headerRow = pageIdx * (2 + ringSize + 4);
 
+      table.DrawRowBorder(headerRow);
+      table.DrawRowBorder(headerRow+2);
+      for (int r = 0; r < ringsPerPage; r++) {
+        int ringIdx = pageIdx * ringsPerPage + r;
+        if (ringIdx >= numRings) break;
+        int currCol = colsPerRing * r;
 
+        // Set header for ring
+        table.DrawColBorder(currCol);
+        table.DrawColBorder(currCol + colsPerRing);
+        for (int i = 0; i < colsPerRing; i++)
+          table.Set(headerRow, currCol+i, "Ring%02d", ringIdx);
+        table.Set(headerRow+1, currCol,   "Device");
+        table.Set(headerRow+1, currCol+1, "%s BW", useDmaExec ? "DMA" : "GFX");
+        if (numQueuePairs) {
+          table.Set(headerRow+1, currCol+2, "NIC BW");
        }
 
+        // Fill results for ring
+        int baseRow = headerRow + 2;
+        table.DrawRowBorder(baseRow);
+        for (int i = 0; i < ringSize; i++) {
+          int tfrIdx = (ringIdx * ringSize + i) * (colsPerRing - 1);
+          Transfer const& t = transfers[tfrIdx];
+          if (numRanks > 1) {
+            table.Set(baseRow + i, currCol, "R%02d:%d", t.srcs[0].memRank, t.srcs[0].memIndex);
+          } else {
+            table.Set(baseRow + i, currCol, "%d", t.srcs[0].memIndex);
+          }
 
+          for (int j = 0; j < colsPerRing - 1; j++) {
+            double bw = results.tfrResults[tfrIdx + j].avgBandwidthGbPerSec;
+            table.Set(baseRow + i, currCol+1+j, "%7.2f", bw);
+            ringMin[j][ringIdx] = std::min(ringMin[j][ringIdx], bw);
+            ringSum[j][ringIdx] += bw;
+            ringMax[j][ringIdx] = std::max(ringMax[j][ringIdx], bw);
+          }
+        }
+        int statRow = baseRow + ringSize;
+        table.DrawRowBorder(statRow);
+        table.Set(statRow  , currCol, "MIN");
+        table.Set(statRow+1, currCol, "AVG");
+        table.Set(statRow+2, currCol, "MAX");
+        table.Set(statRow+3, currCol, "SUM");
 
+        for (int j = 0; j < colsPerRing - 1; j++) {
+          table.Set(statRow  , currCol+1+j, "%7.2f", ringMin[j][ringIdx]);
+          table.Set(statRow+1, currCol+1+j, "%7.2f", ringSum[j][ringIdx] / ringSize);
+          table.Set(statRow+2, currCol+1+j, "%7.2f", ringMax[j][ringIdx]);
+          table.Set(statRow+3, currCol+1+j, "%7.2f", ringSum[j][ringIdx]);
+        }
+        table.DrawRowBorder(statRow+3);
+      }
+    }
-      table.PrintTable(ev.outputToCsv, ev.showBorders);
-
-      Utils::Print("Aggregate bandwidth (CPU Timed): %8.3f GB/s\n", results.avgTotalBandwidthGbPerSec);
-    }
-  }
-
-  if (!Utils::RankDoesOutput()) return 0;
+    table.PrintTable(ev.outputToCsv, ev.showBorders);
+    Utils::Print("Aggregate bandwidth (CPU Timed): %8.3f GB/s\n", results.avgTotalBandwidthGbPerSec);
 
-  if (Utils::HasDuplicateHostname()) {
-    printf("[WARN] It is recommended to run TransferBench with one rank per host to avoid potential aliasing of executors\n");
+    if (Utils::HasDuplicateHostname())
+      Utils::Print("[WARN] It is recommended to run TransferBench with one rank per host to avoid potential aliasing of executors\n");
   }
-  return 0;
+  return ERR_NONE;
 }
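+
+// Hypothetical usage sketch (illustrative values only, not part of this patch): with the
+// socket communicator, a two-node run that builds rings across all GPUs might look like:
+//   rank 0:  TB_NUM_RANKS=2 TB_RANK=0 RING_SIZE=16 ./TransferBench rings 64M
+//   rank 1:  TB_NUM_RANKS=2 TB_RANK=1 TB_MASTER_ADDR=<rank0-ip> ./TransferBench rings 64M
+// RING_SIZE must evenly divide the total GPU count; STRIDE reorders devices before the
+// rings are assembled, and USE_DMA_EXEC=1 switches the executor from GFX to DMA.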