From 894b08ea824b284f0ee7057629920bfd71e31f2f Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Wed, 11 Mar 2026 11:19:28 +0000 Subject: [PATCH 01/19] [AMD] Add vLLM disaggregated prefill-decode benchmark for MI355X Add multi-node vLLM PD disaggregation recipe using Nixl/RIXL KV transfer and vllm-router, mirroring the existing SGLang disagg recipe structure. - New benchmark config: dsr1-fp8-mi355x-vllm-disagg (1P2D, TP8) - New utils: vllm_disagg_utils/ (job.slurm, server.sh, submit.sh, etc.) - Runner: extend launch_mi355x-amds.sh for vllm-disagg framework --- .github/configs/amd-master.yaml | 72 +++ .../multi_node/dsr1_fp8_mi355x_vllm-disagg.sh | 47 ++ .../multi_node/vllm_disagg_utils/bench.sh | 70 +++ .../multi_node/vllm_disagg_utils/env.sh | 52 ++ .../multi_node/vllm_disagg_utils/job.slurm | 326 +++++++++++++ .../multi_node/vllm_disagg_utils/server.sh | 444 ++++++++++++++++++ .../vllm_disagg_utils/start_etcd.sh | 47 ++ .../multi_node/vllm_disagg_utils/submit.sh | 131 ++++++ .../multi_node/vllm_disagg_utils/sync.py | 198 ++++++++ runners/launch_mi355x-amds.sh | 15 +- 10 files changed, 1399 insertions(+), 3 deletions(-) create mode 100755 benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh create mode 100755 benchmarks/multi_node/vllm_disagg_utils/bench.sh create mode 100755 benchmarks/multi_node/vllm_disagg_utils/env.sh create mode 100644 benchmarks/multi_node/vllm_disagg_utils/job.slurm create mode 100755 benchmarks/multi_node/vllm_disagg_utils/server.sh create mode 100755 benchmarks/multi_node/vllm_disagg_utils/start_etcd.sh create mode 100755 benchmarks/multi_node/vllm_disagg_utils/submit.sh create mode 100755 benchmarks/multi_node/vllm_disagg_utils/sync.py diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 337047e57..5c9a7c1ec 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1028,6 +1028,78 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: # - "DECODE_MTP_SIZE=0" +dsr1-fp8-mi355x-vllm-disagg: + image: 
vllm_disagg_pd:latest + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + runner: mi355x-disagg + precision: fp8 + framework: vllm-disagg + multinode: true + disagg: true + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + # 1P2D: 1 prefill node + 2 decode nodes + 1 proxy = 4 nodes total + - spec-decoding: "none" + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + + - isl: 8192 + osl: 1024 + search-space: + - spec-decoding: "none" + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + + - isl: 1024 + osl: 8192 + search-space: + - spec-decoding: "none" + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + + dsr1-fp4-mi355x-sglang-disagg: image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2 model: amd/DeepSeek-R1-0528-MXFP4 diff --git a/benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh b/benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh new file mode 100755 index 000000000..a457a2714 --- /dev/null +++ b/benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh @@ -0,0 +1,47 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + CONC_LIST \ + ISL \ + OSL \ + IMAGE \ + SPEC_DECODING \ + MODEL_PATH \ + PREFILL_NUM_WORKERS \ + PREFILL_TP \ + DECODE_NUM_WORKERS \ + DECODE_TP \ + PREFILL_NODES \ + DECODE_NODES \ + RANDOM_RANGE_RATIO + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on 
$SLURMD_NODENAME" +fi + +set -x + +cd "$GITHUB_WORKSPACE/benchmarks/multi_node/vllm_disagg_utils" || exit 1 + +export TIME_LIMIT="08:00:00" +export MODEL_PATH=$MODEL_PATH +export MODEL_NAME=$MODEL_NAME +export CONTAINER_IMAGE=$IMAGE + +# vLLM disagg uses TP-only parallelism (no EP/DP). +# PREFILL_NODES and DECODE_NODES come from additional-settings in the YAML config. + +JOB_ID=$(bash ./submit.sh $PREFILL_NODES \ + $PREFILL_NUM_WORKERS \ + $DECODE_NODES \ + $DECODE_NUM_WORKERS \ + $ISL $OSL "${CONC_LIST// /x}" inf) + +if [[ $? -ne 0 ]]; then + echo "Failed to submit job" >&2 + exit 1 +fi + +echo "$JOB_ID" diff --git a/benchmarks/multi_node/vllm_disagg_utils/bench.sh b/benchmarks/multi_node/vllm_disagg_utils/bench.sh new file mode 100755 index 000000000..cfe66d460 --- /dev/null +++ b/benchmarks/multi_node/vllm_disagg_utils/bench.sh @@ -0,0 +1,70 @@ +#!/bin/bash +# vLLM Disaggregated Benchmark Runner +# +# Usage: bash bench.sh \ +# \ +# + +n_prefill=$1 +n_decode=$2 +prefill_gpus=$3 +decode_gpus=$4 +model_path=$5 +model_name=$6 +# Prefer MODEL_PATH from environment (handles HF cache snapshot resolution) +MODEL_PATH="${MODEL_PATH:-${model_path}/${model_name}}" +log_path=$7 + +chosen_isl=${8:-1024} +chosen_osl=${9:-1024} +concurrency_list=${10:-"512x1"} +chosen_req_rate=${11:-inf} +random_range_ratio=${12:-0.8} +num_prompts_multiplier=${13:-10} + +IFS='x' read -r -a chosen_concurrencies <<< "$concurrency_list" + +ROUTER_PORT="${ROUTER_PORT:-2584}" + +echo "Config ${chosen_isl}; ${chosen_osl}; ${chosen_concurrencies[0]}; ${chosen_req_rate}" + +profile_folder="${log_path}/vllm_isl_${chosen_isl}_osl_${chosen_osl}" +mkdir -p "$profile_folder" + +for max_concurrency in "${chosen_concurrencies[@]}"; do + + export_file="${profile_folder}/concurrency_${max_concurrency}_req_rate_${chosen_req_rate}_gpus_$((prefill_gpus+decode_gpus))_ctx_${prefill_gpus}_gen_${decode_gpus}" + + num_prompts=$(( max_concurrency * num_prompts_multiplier )) + if [[ "$num_prompts" -lt 16 ]]; then + 
num_prompts=16 + fi + + echo "profile_folder: $profile_folder" + echo "max_concurrency: $max_concurrency" + echo "chosen_req_rate: $chosen_req_rate" + echo "MODEL_PATH: $MODEL_PATH" + echo "ROUTER_PORT: $ROUTER_PORT" + echo "chosen_isl: $chosen_isl" + echo "chosen_osl: $chosen_osl" + echo "num_prompts: $num_prompts" + echo "export_file: $export_file" + + vllm bench serve \ + --model "$MODEL_PATH" \ + --backend vllm \ + --host 127.0.0.1 \ + --port "$ROUTER_PORT" \ + --dataset-name "random" \ + --random-input-len "$chosen_isl" \ + --random-output-len "$chosen_osl" \ + --random-prefix-len 0 \ + --num-prompts "$num_prompts" \ + --request-rate "$chosen_req_rate" \ + --ignore-eos \ + --max-concurrency "$max_concurrency" \ + 2>&1 | tee "${export_file}.log" + + sleep 5 + echo "-----------------------------------------" +done diff --git a/benchmarks/multi_node/vllm_disagg_utils/env.sh b/benchmarks/multi_node/vllm_disagg_utils/env.sh new file mode 100755 index 000000000..ebe77f09b --- /dev/null +++ b/benchmarks/multi_node/vllm_disagg_utils/env.sh @@ -0,0 +1,52 @@ +#!/bin/bash +# vLLM/Nixl environment setup for multi-node disaggregated serving. +# +# REQUIRED ENVIRONMENT VARIABLES: +# IBDEVICES - RDMA/InfiniBand device names (e.g., ionic_0,ionic_1,... or mlx5_0,mlx5_1,...) +# Set by runner or auto-detected from hostname. +# +# The Docker image (built from vllm_disagg_inference.ubuntu.amd.Dockerfile) already +# sets LD_LIBRARY_PATH for UCX (/usr/local/ucx/lib) and RIXL (/usr/local/RIXL/install/lib). 
+ +set -x + +# IBDEVICES configuration +# Prefer IBDEVICES set by runner (runners/launch_mi355x-amds.sh) +# Fall back to hostname detection if not set (for direct script execution) +if [[ -z "$IBDEVICES" ]]; then + NODENAME=$(hostname -s) + if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then + export IBDEVICES=ionic_0,ionic_1,ionic_2,ionic_3,ionic_4,ionic_5,ionic_6,ionic_7 + elif [[ $NODENAME == mia1* ]]; then + export IBDEVICES=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7 + else + DETECTED=$(ibv_devinfo 2>/dev/null | grep "hca_id:" | awk '{print $2}' | paste -sd',') + if [[ -n "$DETECTED" ]]; then + export IBDEVICES="$DETECTED" + else + echo "WARNING: Unable to detect RDMA devices. Set IBDEVICES explicitly." >&2 + fi + fi + echo "[INFO] Auto-detected IBDEVICES=$IBDEVICES from hostname $(hostname -s)" +else + echo "[INFO] Using IBDEVICES=$IBDEVICES (set by runner or environment)" +fi + +if [[ -z "$UCX_NET_DEVICES" ]]; then + FIRST_IB=$(echo "$IBDEVICES" | cut -d',' -f1) + if [[ -n "$FIRST_IB" ]]; then + export UCX_NET_DEVICES="${FIRST_IB}:1" + fi + echo "[INFO] Auto-set UCX_NET_DEVICES=$UCX_NET_DEVICES" +else + echo "[INFO] Using UCX_NET_DEVICES=$UCX_NET_DEVICES (set by environment)" +fi + +export NCCL_SOCKET_IFNAME=$(ip route | grep '^default' | awk '{print $5}' | head -n 1) +export NCCL_IB_HCA=${NCCL_IB_HCA:-$IBDEVICES} + +# RoCEv2: use IPv4-mapped GID (index 1) for inter-node RDMA routing +export UCX_IB_GID_INDEX=${UCX_IB_GID_INDEX:-1} + +set +x +echo "[INFO] IBDEVICES=$IBDEVICES UCX_NET_DEVICES=$UCX_NET_DEVICES NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME UCX_IB_GID_INDEX=$UCX_IB_GID_INDEX" diff --git a/benchmarks/multi_node/vllm_disagg_utils/job.slurm b/benchmarks/multi_node/vllm_disagg_utils/job.slurm new file mode 100644 index 000000000..710b7168a --- /dev/null +++ b/benchmarks/multi_node/vllm_disagg_utils/job.slurm @@ -0,0 +1,326 @@ +#!/bin/bash +#SBATCH --job-name=vllm-pd-bench +#SBATCH -N 4 # CHECK this to be right in batch jobs +#SBATCH 
-n 4 # CHECK this to be right in batch jobs +#SBATCH --ntasks-per-node=1 +#SBATCH --spread-job +#SBATCH --gres=gpu:8 +#SBATCH --time=24:00:00 +# --output and --error are set by submit.sh via BENCHMARK_LOGS_DIR + +echo "=== Job Start Time ===" +echo "UTC Time: $(TZ=UTC date '+%Y-%m-%d %H:%M:%S %Z')" +echo "PST Time: $(TZ=America/Los_Angeles date '+%Y-%m-%d %H:%M:%S %Z')" +echo "=======================" +echo "" + +# ============================================================================= +# Model Validation +# ============================================================================= + +VALID_MODELS=( + "Llama-3.1-405B-Instruct-FP8-KV" + "amd-Llama-3.3-70B-Instruct-FP8-KV" + "DeepSeek-V3" + "DeepSeek-R1-0528" + "gpt-oss-120b" +) + +if [[ -z "${DOCKER_IMAGE_NAME:-}" ]]; then + echo "Error: DOCKER_IMAGE_NAME is not set." + exit 1 +fi + +MODEL_NAME="${MODEL_NAME:-None}" +model_found=false +for m in "${VALID_MODELS[@]}"; do + [[ "$MODEL_NAME" == "$m" ]] && model_found=true && break +done +if [[ "$model_found" != "true" ]]; then + echo "Error: Model '$MODEL_NAME' not found. Available:" + printf ' - %s\n' "${VALID_MODELS[@]}" + exit 1 +fi +echo "Model found: $MODEL_NAME" + +RUN_FILE="server.sh" +echo "Runfile set: $RUN_FILE" + +# DI_REPO_DIR points to the repo root. +# $(pwd) is vllm_disagg_utils/ (the sbatch submit dir); go up 3 levels to reach the repo root. +export DI_REPO_DIR=$(cd "$(pwd)/../../.." 
&& pwd) + +xP="${xP:-1}" +yD="${yD:-1}" + +# Benchmark configuration +BENCH_INPUT_LEN="${BENCH_INPUT_LEN:-1024}" +BENCH_OUTPUT_LEN="${BENCH_OUTPUT_LEN:-1024}" +BENCH_RANDOM_RANGE_RATIO="${BENCH_RANDOM_RANGE_RATIO:-1}" +BENCH_NUM_PROMPTS_MULTIPLIER="${BENCH_NUM_PROMPTS_MULTIPLIER:-10}" +BENCH_MAX_CONCURRENCY="${BENCH_MAX_CONCURRENCY:-512}" +BENCH_REQUEST_RATE="${BENCH_REQUEST_RATE:-inf}" + +GPUS_PER_NODE="${GPUS_PER_NODE:-8}" + +# ============================================================================= +# Model Path Resolution +# ============================================================================= + +# HF cache directory names may differ from MODEL_NAME +declare -A MODEL_DIR_NAMES=( + ["DeepSeek-R1-0528"]="models--deepseek-ai--DeepSeek-R1-0528" +) + +# MODEL_DIR detection: prefer env var, fall back to hostname detection +if [[ -z "$MODEL_DIR" ]]; then + NODENAME=$(hostname -s) + if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then + MODEL_DIR="/nfsdata" + elif [[ $NODENAME == mia1* ]]; then + MODEL_DIR="/it-share/data" + else + MODEL_DIR="/nfsdata" + fi + echo "[INFO] Auto-detected MODEL_DIR=$MODEL_DIR from hostname $(hostname -s)" +fi +export MODEL_DIR + +DISK_DIR_NAME="${MODEL_DIR_NAMES[$MODEL_NAME]:-$MODEL_NAME}" +echo "Looking for model: $MODEL_NAME (disk dir: $DISK_DIR_NAME)" + +resolve_hf_cache_path() { + local base_path=$1 + if [[ -d "${base_path}/snapshots" ]]; then + local snapshot=$(ls -1 "${base_path}/snapshots" 2>/dev/null | head -1) + if [[ -n "$snapshot" ]]; then + echo "${base_path}/snapshots/${snapshot}" + return 0 + fi + fi + echo "$base_path" + return 1 +} + +MODEL_PATH="" +SEARCH_PATHS=( + "${MODEL_DIR}/${DISK_DIR_NAME}" + "${MODEL_DIR}/${MODEL_NAME}" + "/nfsdata/hf_hub_cache-0/${DISK_DIR_NAME}" + "/nfsdata/hf_hub_cache-0/${MODEL_NAME}" +) + +for search_path in "${SEARCH_PATHS[@]}"; do + if [[ -d "$search_path" ]]; then + RESOLVED=$(resolve_hf_cache_path "$search_path") + MODEL_PATH="$RESOLVED" + echo "Found 
MODEL_PATH: $MODEL_PATH" + break + fi +done + +if [[ -z "$MODEL_PATH" ]]; then + echo "FATAL: Model '$MODEL_NAME' not found. Searched:" + for p in "${SEARCH_PATHS[@]}"; do echo " - $p"; done + exit 1 +fi +echo "Final MODEL_PATH: $MODEL_PATH" + +# ============================================================================= +# Node Selection and vLLM-Specific NUM_NODES +# ============================================================================= + +# vLLM needs xP + yD + 1 (dedicated proxy node) +NUM_NODES=$((xP + yD + 1)) +echo "NUM_NODES: $NUM_NODES (xP=$xP + yD=$yD + 1 proxy)" + +FULL_NODELIST=$(scontrol show hostnames "$SLURM_JOB_NODELIST") +SELECTED_NODES=$(echo "$FULL_NODELIST" | head -n $NUM_NODES) +SELECTED_NODELIST_STR=$(echo "$SELECTED_NODES" | tr '\n' ',' | sed 's/,$//') + +# Update SLURM environment variables +export SLURM_NNODES=$NUM_NODES +export SLURM_NTASKS=$NUM_NODES +export SLURM_JOB_NUM_NODES=$NUM_NODES +export SLURM_NPROCS=$NUM_NODES +export SLURM_JOB_NODELIST="$SELECTED_NODELIST_STR" +export SLURM_NODELIST="$SELECTED_NODELIST_STR" +export SLURM_TASKS_PER_NODE="1(x$NUM_NODES)" +export SLURM_NTASKS_PER_NODE=1 + +echo "" +echo "Selected nodes: $SELECTED_NODELIST_STR" + +# ============================================================================= +# IP Resolution +# ============================================================================= + +USER_NAME=$(whoami) +MASTER_NODE=$(echo "$SELECTED_NODES" | head -n 1) +NODE0_ADDR=$(srun --nodes=1 --ntasks=1 --time=00:20:00 --nodelist="$MASTER_NODE" bash -c 'ip route get 1.1.1.1') +NODE0_ADDR=$(echo "$NODE0_ADDR" | awk '/src/ {print $7}') + +IPS=() +for NODE in $SELECTED_NODES; do + IP=$(srun --nodes=1 --ntasks=1 --time=00:20:00 --nodelist="$NODE" bash -c 'ip route get 1.1.1.1') + IP=$(echo "$IP" | awk '/src/ {print $7}') + IPS+=("$IP") +done + +echo "Node IPs: ${IPS[*]}" + +DOCKER_MOUNT_PATH="/workspace" +VLLM_WS_PATH="${DOCKER_MOUNT_PATH}/benchmarks/multi_node/vllm_disagg_utils" + 
NNODES=$NUM_NODES

echo "MASTER_NODE: ${MASTER_NODE}"
echo "NODE0_ADDR: ${NODE0_ADDR}"
echo "NNODES: ${NNODES}"
echo "REPO DIR: ${DI_REPO_DIR}"
echo "USER: ${USER_NAME}"

# Reduce log spam
export TQDM_MININTERVAL=20

#######################################
# Translate a host path into the container's /models namespace.
# Arguments: $1 - host path to translate
#            $2 - host directory that is mounted at /models
# Outputs:   translated path on stdout; the unchanged host path when $1
#            does not live under $2
# Returns:   0 when translated, 1 when $1 is not under $2
#######################################
_to_container_model_path() {
    local host_path=$1
    local host_root=${2%/}

    # Require a real path-component boundary so that /nfs does not match
    # /nfsdata/..., and quote the root so it is never treated as a glob.
    if [[ -n "$host_root" && ( "$host_path" == "$host_root" || "$host_path" == "$host_root"/* ) ]]; then
        printf '%s\n' "/models${host_path#"$host_root"}"
        return 0
    fi

    printf '%s\n' "$host_path"
    return 1
}

# Translate the host-resolved MODEL_PATH to the Docker mount namespace
if ! DOCKER_MODEL_PATH=$(_to_container_model_path "$MODEL_PATH" "$MODEL_DIR"); then
    # The model may have been found through a fallback search path that is
    # outside MODEL_DIR; surface that instead of passing a host path silently.
    echo "WARNING: MODEL_PATH '$MODEL_PATH' is not under MODEL_DIR '$MODEL_DIR'; passing it to the container unchanged." >&2
fi

# Re-export everything the per-node srun/docker launch expands.
export DI_REPO_DIR="$DI_REPO_DIR"
export VLLM_WS_PATH="$VLLM_WS_PATH"
export NNODES="$NNODES"
export NODE0_ADDR="$NODE0_ADDR"
export MODEL_PATH="$MODEL_PATH"
export MODEL_DIR="$MODEL_DIR"
export xP="$xP"
export yD="$yD"
export MODEL_NAME="$MODEL_NAME"
export USER_NAME="$USER_NAME"
# Comma-join the node IPs without spawning echo|sed.
IPADDRS=$(IFS=,; printf '%s' "${IPS[*]}")
export IPADDRS
export GPUS_PER_NODE="$GPUS_PER_NODE"
export BENCH_INPUT_LEN="$BENCH_INPUT_LEN"
export BENCH_OUTPUT_LEN="$BENCH_OUTPUT_LEN"
export BENCH_RANDOM_RANGE_RATIO="$BENCH_RANDOM_RANGE_RATIO"
export BENCH_NUM_PROMPTS_MULTIPLIER="$BENCH_NUM_PROMPTS_MULTIPLIER"
export BENCH_MAX_CONCURRENCY="$BENCH_MAX_CONCURRENCY"
export BENCH_REQUEST_RATE="$BENCH_REQUEST_RATE"
export DRY_RUN="${DRY_RUN:-0}"
export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}"

# printf (not echo) so tr -c does not turn a trailing newline into a stray '_'.
SANITIZED_USER=$(printf '%s' "$USER_NAME" | tr -c 'a-zA-Z0-9_.-' '_')
export DOCKER_CONT_NAME="container_vllm_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}"
export RUN_FILE_FULL="$VLLM_WS_PATH/${RUN_FILE}"

SELECTED_NODELIST_SRUN=$(printf '%s\n' "$SELECTED_NODES" | paste -sd, -)

#######################################
# Signal handler: remove the job's logs directory under the submit dir.
# Globals:   SLURM_JOB_ID (read), SLURM_SUBMIT_DIR (read)
# Outputs:   progress messages on stdout; a skip notice on stderr
# NOTE(review): the handler does not exit, so the script resumes after the
# interrupted command -- confirm that is intended.
#######################################
cleanup() {
    echo "[${SLURM_JOB_ID}] termination received on $(hostname); cleaning up..."
    if [[ -n "${SLURM_SUBMIT_DIR:-}" ]]; then
        # Best-effort removal; failures are deliberately ignored.
        sudo rm -rf -- "${SLURM_SUBMIT_DIR}/logs" 2>/dev/null || true
    else
        # An empty SLURM_SUBMIT_DIR would expand to "sudo rm -rf /logs".
        echo "[${SLURM_JOB_ID}] SLURM_SUBMIT_DIR is not set; skipping log removal." >&2
    fi
    echo "[${SLURM_JOB_ID}] cleanup done."
}

trap cleanup INT TERM HUP

# Force NFS cache refresh on all nodes
echo "Refreshing NFS caches on all nodes..."
+srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c ' + sync + ls -la '"$DI_REPO_DIR"'/benchmarks/multi_node/vllm_disagg_utils > /dev/null 2>&1 + stat '"$DI_REPO_DIR"'/benchmarks/multi_node/vllm_disagg_utils/server.sh > /dev/null 2>&1 + cat '"$DI_REPO_DIR"'/benchmarks/multi_node/vllm_disagg_utils/server.sh > /dev/null 2>&1 + echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null 2>&1 || true + echo "NFS cache refreshed on $(hostname)" +' + +srun \ + --nodelist="$SELECTED_NODELIST_SRUN" \ + --kill-on-bad-exit=1 \ + --signal=TERM@30 \ + --unbuffered \ + bash -lc " +set -euo pipefail + +echo \"Rank \$SLURM_PROCID on \$(hostname)\" + +# Pre-clean (idempotent) +sudo docker ps -aq --filter \"name=^container_vllm_\" | xargs -r sudo docker rm -f || true +sudo docker ps -aq | xargs -r sudo docker stop || true + +exec sudo docker run --rm \ + --init \ + --stop-timeout 10 \ + --device /dev/dri \ + --device /dev/kfd \ + --device /dev/infiniband \ + --device=/dev/infiniband/rdma_cm \ + --device=/dev/infiniband/uverbs0 \ + --device=/dev/infiniband/uverbs1 \ + --device=/dev/infiniband/uverbs2 \ + --device=/dev/infiniband/uverbs3 \ + --device=/dev/infiniband/uverbs4 \ + --device=/dev/infiniband/uverbs5 \ + --device=/dev/infiniband/uverbs6 \ + --device=/dev/infiniband/uverbs7 \ + --ulimit memlock=-1 \ + --ulimit stack=67108864 \ + --network host \ + --ipc host \ + --group-add video \ + --cap-add SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --privileged \ + -v /sys:/sys \ + -v /etc/libibverbs.d/ionic.driver:/etc/libibverbs.d/ionic.driver:ro \ + -v /lib/x86_64-linux-gnu/libionic.so.1:/lib/x86_64-linux-gnu/libionic.so.1:ro \ + -v /lib/x86_64-linux-gnu/libionic.so:/lib/x86_64-linux-gnu/libionic.so:ro \ + -v /usr/lib/x86_64-linux-gnu/libibverbs/libionic-rdmav34.so:/usr/lib/x86_64-linux-gnu/libibverbs/libionic-rdmav34.so:ro \ + -v ${MODEL_DIR}:/models \ + -v \$HOME/.ssh:/root/.ssh \ + --shm-size 128G \ + -v /tmp:/run_logs \ + -v ${BENCHMARK_LOGS_DIR}:/benchmark_logs \ + -v 
${DI_REPO_DIR}:${DOCKER_MOUNT_PATH} \ + -e SLURM_JOB_ID=\$SLURM_JOB_ID \ + -e SLURM_JOB_NODELIST=\$SLURM_JOB_NODELIST \ + -e NNODES=\$NNODES \ + -e NODE_RANK=\$SLURM_PROCID \ + -e NODE0_ADDR=\$NODE0_ADDR \ + -e MODEL_DIR=/models \ + -e MODEL_NAME=\$MODEL_NAME \ + -e MODEL_PATH=$DOCKER_MODEL_PATH \ + -e VLLM_WS_PATH=${VLLM_WS_PATH} \ + -e GPUS_PER_NODE=\$GPUS_PER_NODE \ + -e xP=\$xP \ + -e yD=\$yD \ + -e IPADDRS=\$IPADDRS \ + -e BENCH_INPUT_LEN=\$BENCH_INPUT_LEN \ + -e BENCH_OUTPUT_LEN=\$BENCH_OUTPUT_LEN \ + -e BENCH_RANDOM_RANGE_RATIO=\$BENCH_RANDOM_RANGE_RATIO \ + -e BENCH_NUM_PROMPTS_MULTIPLIER=\$BENCH_NUM_PROMPTS_MULTIPLIER \ + -e BENCH_MAX_CONCURRENCY=\$BENCH_MAX_CONCURRENCY \ + -e BENCH_REQUEST_RATE=\$BENCH_REQUEST_RATE \ + -e TQDM_MININTERVAL=\$TQDM_MININTERVAL \ + -e DRY_RUN=\$DRY_RUN \ + -e BENCHMARK_LOGS_DIR=/benchmark_logs \ + -e UCX_TLS=all \ + -e UCX_SOCKADDR_TLS_PRIORITY=tcp \ + -e UCX_MEMTYPE_CACHE=y \ + -e UCX_RNDV_SCHEME=get_zcopy \ + -e UCX_RNDV_THRESH=4k \ + -e UCX_ROCM_IPC_MIN_ZCOPY=0 \ + -e UCX_LOG_LEVEL=info \ + -e HSA_ENABLE_SDMA=1 \ + --name \"$DOCKER_CONT_NAME\" \ + \"$DOCKER_IMAGE_NAME\" bash -lc ' + mkdir -p /run_logs/slurm_job-'\"\$SLURM_JOB_ID\"' + '"$RUN_FILE_FULL"' 2>&1 | tee /run_logs/slurm_job-'\"\$SLURM_JOB_ID\"'/server_\$(hostname).log + ' + +DOCKER_EXIT_CODE=\$? 
+if [[ \$DOCKER_EXIT_CODE -ne 0 ]]; then + echo \"ERROR: docker exited rc=\$DOCKER_EXIT_CODE on \$(hostname)\" + exit \$DOCKER_EXIT_CODE +fi +" + +srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c 'sudo docker rm -f $DOCKER_CONT_NAME 2>/dev/null || true' diff --git a/benchmarks/multi_node/vllm_disagg_utils/server.sh b/benchmarks/multi_node/vllm_disagg_utils/server.sh new file mode 100755 index 000000000..b4ab7bce8 --- /dev/null +++ b/benchmarks/multi_node/vllm_disagg_utils/server.sh @@ -0,0 +1,444 @@ +#!/bin/bash +# vLLM Disaggregated Server Launcher with Model-Specific Configurations +# ============================================================================= +# +# Node role assignment (by NODE_RANK): +# 0 -> Proxy/Router node +# 1..xP -> Prefill nodes (kv_producer) +# xP+1..xP+yD -> Decode nodes (kv_consumer) + +# ============================================================================= +# Environment Configuration +# ============================================================================= + +NODE0_ADDR="${NODE0_ADDR:-localhost}" +NODE_RANK="${NODE_RANK:-0}" +MODEL_DIR="${MODEL_DIR:-}" +MODEL_NAME="${MODEL_NAME:-}" + +xP="${xP:-1}" +yD="${yD:-1}" + +IPADDRS="${IPADDRS:-localhost}" + +# Benchmark Configuration +BENCH_INPUT_LEN="${BENCH_INPUT_LEN:-1024}" +BENCH_OUTPUT_LEN="${BENCH_OUTPUT_LEN:-1024}" +BENCH_RANDOM_RANGE_RATIO="${BENCH_RANDOM_RANGE_RATIO:-1}" +BENCH_REQUEST_RATE="${BENCH_REQUEST_RATE:-inf}" +BENCH_NUM_PROMPTS_MULTIPLIER="${BENCH_NUM_PROMPTS_MULTIPLIER:-10}" +BENCH_MAX_CONCURRENCY="${BENCH_MAX_CONCURRENCY:-512}" + +DRY_RUN="${DRY_RUN:-0}" +GPUS_PER_NODE="${GPUS_PER_NODE:-8}" + +ROUTER_PORT="${ROUTER_PORT:-2584}" +SERVER_PORT="${SERVER_PORT:-2584}" +ENGINE_ID="${ENGINE_ID:-${MODEL_NAME}-pd-run}" + +# Prefer MODEL_PATH from job.slurm (handles HF cache snapshot resolution) +MODEL_PATH="${MODEL_PATH:-${MODEL_DIR}/${MODEL_NAME}}" + +# ============================================================================= +# Dependencies and Environment 
Setup +# ============================================================================= +source $VLLM_WS_PATH/env.sh + +host_ip=$(ip route get 1.1.1.1 2>/dev/null | awk '/src/ {print $7}') +# RDMA IP for Nixl KV transfer (prefer 192.168.x.x subnet if available) +rdma_ip=$(hostname -I | tr ' ' '\n' | grep '^192\.168\.' | head -1) +rdma_ip="${rdma_ip:-$host_ip}" +host_name=$(hostname) + +echo "[INFO] Management IP (barriers/proxy): $host_ip" +echo "[INFO] RDMA IP (Nixl KV transfer): $rdma_ip" + +# --------------------------------------------------------------------------- +# RDMA route setup for Pensando ionic (RoCEv2) point-to-point /31 links. +# Each benic interface has a /31 to the TOR switch. Without explicit routes, +# traffic to other nodes' RDMA IPs falls through to the management network +# (no RDMA capability). Fix: add a /24 route via the TOR gateway so RoCEv2 +# stays on the ionic fabric. +# --------------------------------------------------------------------------- +if [[ "$rdma_ip" =~ ^192\.168\.([0-9]+)\.([0-9]+)$ ]]; then + rdma_subnet="${BASH_REMATCH[1]}" + rdma_host="${BASH_REMATCH[2]}" + rdma_gw="192.168.${rdma_subnet}.$(( rdma_host | 1 ))" # /31 peer = TOR switch + rdma_iface=$(ip -o addr show | awk -v ip="$rdma_ip" '$4 ~ ip {print $2}' | head -1) + if [[ -n "$rdma_iface" ]]; then + ip route replace "192.168.${rdma_subnet}.0/24" via "$rdma_gw" dev "$rdma_iface" 2>/dev/null && \ + echo "[RDMA-ROUTE] Added 192.168.${rdma_subnet}.0/24 via $rdma_gw dev $rdma_iface" || \ + echo "[RDMA-ROUTE] Route add failed for 192.168.${rdma_subnet}.0/24" + fi +fi + +# Patch Nixl UCX backend: set ucx_error_handling_mode=none for shared-memory +# transport compatibility (Pensando ionic NICs don't support rdmacm, so the +# default UCP_ERR_HANDLING_MODE_PEER causes "no active messages transport" errors) +NIXL_API_FILE=$(python3 -c "import rixl._api; print(rixl._api.__file__)" 2>/dev/null) +if [[ -n "$NIXL_API_FILE" ]]; then + if ! 
grep -q 'ucx_error_handling_mode' "$NIXL_API_FILE"; then + sed -i '/init\["num_threads"\] = str(nixl_conf.num_threads)/a\ init["ucx_error_handling_mode"] = "none"' "$NIXL_API_FILE" + echo "[PATCH] Added ucx_error_handling_mode=none to $NIXL_API_FILE" + else + echo "[PATCH] ucx_error_handling_mode already set in $NIXL_API_FILE" + fi +fi + +if [[ -z "$UCX_NET_DEVICES" ]]; then + echo "Error: UCX_NET_DEVICES is empty after env.sh detection" >&2 + exit 1 +fi + +# ============================================================================= +# Model-Specific Configuration Maps +# ============================================================================= + +declare -A MODEL_PREFILL_CONFIGS=( + ["Llama-3.1-405B-Instruct-FP8-KV"]="--tensor-parallel-size 8 --kv-cache-dtype fp8" + ["amd-Llama-3.3-70B-Instruct-FP8-KV"]="--tensor-parallel-size 8 --max-model-len 65536 --kv-cache-dtype fp8" + ["DeepSeek-V3"]="--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1" + ["DeepSeek-R1-0528"]="--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1" + ["gpt-oss-120b"]="--tensor-parallel-size 8" +) + +declare -A MODEL_DECODE_CONFIGS=( + ["Llama-3.1-405B-Instruct-FP8-KV"]="--tensor-parallel-size 8 --kv-cache-dtype fp8" + ["amd-Llama-3.3-70B-Instruct-FP8-KV"]="--tensor-parallel-size 8 --max-model-len 65536 --kv-cache-dtype fp8" + ["DeepSeek-V3"]="--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1" + ["DeepSeek-R1-0528"]="--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1" + ["gpt-oss-120b"]="--tensor-parallel-size 8" +) + +declare -A MODEL_ENVS=( + ["amd-Llama-3.3-70B-Instruct-FP8-KV"]="VLLM_USE_V1=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1 AMDGCN_USE_BUFFER_OPS=1 VLLM_ROCM_USE_AITER=1 
VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_ROPE=1 TRITON_HIP_ASYNC_COPY_BYPASS_PERMUTE=1 TRITON_HIP_USE_ASYNC_COPY=1 TRITON_HIP_USE_BLOCK_PINGPONG=1 TRITON_HIP_ASYNC_FAST_SWIZZLE=1" + ["Llama-3.1-405B-Instruct-FP8-KV"]="VLLM_USE_V1=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1 AMDGCN_USE_BUFFER_OPS=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_ROPE=1 TRITON_HIP_ASYNC_COPY_BYPASS_PERMUTE=1 TRITON_HIP_USE_ASYNC_COPY=1 TRITON_HIP_USE_BLOCK_PINGPONG=1 TRITON_HIP_ASYNC_FAST_SWIZZLE=1" + ["DeepSeek-V3"]="VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0" + ["DeepSeek-R1-0528"]="VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0" + ["gpt-oss-120b"]="VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 VLLM_USE_AITER_UNIFIED_ATTENTION=1 VLLM_ROCM_USE_AITER_MHA=0 ROCM_TRITON_MOE_PRESHUFFLE_SCALES=0" +) + +get_model_config() { + local mode="$1" + local model_name="$2" + if [[ "$mode" == "prefill" ]]; then + echo "${MODEL_PREFILL_CONFIGS[$model_name]:-"--tensor-parallel-size 8"}" + elif [[ "$mode" == "decode" ]]; then + echo "${MODEL_DECODE_CONFIGS[$model_name]:-"--tensor-parallel-size 8"}" + fi +} + +get_model_envs() { + echo "${MODEL_ENVS[$1]:-""}" +} + +if [[ -z "$MODEL_NAME" ]]; then + echo "ERROR: MODEL_NAME is not set"; exit 1 +fi + +PREFILL_SERVER_CONFIG=$(get_model_config "prefill" "$MODEL_NAME") +DECODE_SERVER_CONFIG=$(get_model_config "decode" "$MODEL_NAME") +PREFILL_MODEL_ENVS=$(get_model_envs "$MODEL_NAME") +DECODE_MODEL_ENVS=$(get_model_envs "$MODEL_NAME") +echo "Using model-specific configuration for: $MODEL_NAME" + +# ============================================================================= +# Container Synchronization +# ============================================================================= + +echo "Waiting at the container creation 
barrier on $host_name" +python3 $VLLM_WS_PATH/sync.py barrier \ + --local-ip ${host_ip} \ + --local-port 5000 \ + --enable-port \ + --node-ips ${IPADDRS} \ + --node-ports 5000 \ + --wait-for-all-ports \ + --timeout 300 + +# ============================================================================= +# ETCD Server Setup +# ============================================================================= + +echo "Proceeding to start etcd server on $host_name" +bash ${VLLM_WS_PATH}/start_etcd.sh > /dev/null & +etcd_pid=$! + +echo "Waiting at etcd server barrier on $host_name" +python3 $VLLM_WS_PATH/sync.py barrier \ + --node-ips ${IPADDRS} \ + --node-ports 2379 \ + --wait-for-all-ports \ + --timeout 300 + +echo "All etcd servers are up : $host_name" +sleep 3 + +echo "etcd endpoint health==================" +etcdctl endpoint health 2>&1 || /usr/local/bin/etcd/etcdctl endpoint health 2>&1 || true +echo "======================================" + +python3 $VLLM_WS_PATH/sync.py barrier \ + --node-ips ${IPADDRS} \ + --node-ports 2379 \ + --wait-for-all-ports \ + --timeout 300 + +# ============================================================================= +# Cluster Topology Configuration +# ============================================================================= +IFS=',' read -ra IP_ARRAY <<< "$IPADDRS" + +PREFILL_ARGS="" +DECODE_ARGS="" + +for ((i=1; i<=xP && i<${#IP_ARRAY[@]}; i++)); do + PREFILL_ARGS+="${IP_ARRAY[$i]} " +done + +for ((i=xP+1; i<${#IP_ARRAY[@]}; i++)); do + DECODE_ARGS+="${IP_ARRAY[$i]} " +done + +echo "Prefill node IPs: ${PREFILL_ARGS}" +echo "Decode node IPs: ${DECODE_ARGS}" + +# Common UCX/Nixl environment for prefill and decode workers +setup_ucx_env() { + export UCX_TLS=all + export UCX_SOCKADDR_TLS_PRIORITY=tcp + export UCX_MEMTYPE_CACHE=y + export UCX_RNDV_SCHEME=get_zcopy + export UCX_RNDV_THRESH=4k + export UCX_ROCM_IPC_MIN_ZCOPY=0 + export HSA_ENABLE_SDMA=1 + export UCX_LOG_LEVEL=info + export VLLM_USE_V1=1 + export VLLM_SERVER_DEV_MODE=0 
+ export VLLM_NIXL_SIDE_CHANNEL_HOST=${host_ip} + export VLLM_NIXL_SIDE_CHANNEL_PORT=5557 +} + +# ============================================================================= +# Node Role Assignment and Server Launch +# ============================================================================= + +if [ "$NODE_RANK" -eq 0 ]; then + echo "NODE INFO =======================================" + echo "================================================" + echo "Node List : ${SLURM_JOB_NODELIST}" + echo "Node IPs : ${IPADDRS}" + echo "Model : ${MODEL_NAME:-'Not specified'}" + echo "================================================" + + echo "CLUSTER INFO ====================================" + echo "================================================" + echo "${host_name}:${host_ip} is Proxy Node" + echo "Prefill servers: ${PREFILL_ARGS}" + echo "Decode servers: ${DECODE_ARGS}" + echo "================================================" + + PD_IPADDRS="${IPADDRS#*,}" + echo "Waiting for all prefill and decode servers to be up . . ." + python3 $VLLM_WS_PATH/sync.py barrier \ + --node-ips ${PD_IPADDRS} \ + --node-ports $SERVER_PORT \ + --wait-for-all-ports \ + --timeout 1800 + + echo "Congratulations!!! All prefill and decode servers are up . . ." + + echo "Starting vLLM Router..." 
+ [ -f /root/.cargo/env ] && source /root/.cargo/env + + PREFILL_URLS="" + DECODE_URLS="" + for ip in ${PREFILL_ARGS}; do + PREFILL_URLS+="--prefill http://${ip}:${SERVER_PORT} " + done + for ip in ${DECODE_ARGS}; do + DECODE_URLS+="--decode http://${ip}:${SERVER_PORT} " + done + + ROUTER_CMD="UCX_TLS=tcp,self,shm VLLM_USE_V1=1 \ + vllm-router \ + --host 0.0.0.0 \ + --port $ROUTER_PORT \ + --vllm-pd-disaggregation \ + $PREFILL_URLS \ + $DECODE_URLS \ + --policy round_robin \ + --prefill-policy round_robin \ + --decode-policy round_robin \ + --intra-node-data-parallel-size 1 \ + --retry-max-retries 3 \ + --health-check-endpoint /health \ + --prometheus-port 29000" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $ROUTER_CMD" + else + ROUTER_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/vllm_router_${host_name}.log" + set -x + eval "$ROUTER_CMD" 2>&1 | tee "$ROUTER_LOG_FILE" & + set +x + proxy_pid=$! + + HEALTH_BARRIER_CMD="python3 $VLLM_WS_PATH/sync.py barrier \ + --node-ips ${NODE0_ADDR} \ + --node-ports ${ROUTER_PORT} \ + --wait-for-all-health \ + --health-endpoint /health \ + --timeout 1800" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $HEALTH_BARRIER_CMD" + else + eval "$HEALTH_BARRIER_CMD" + fi + + echo "Router is ready for benchmarking" + fi + + echo "Ready for benchmarking on ${host_name}:${host_ip}" + echo "Benchmarking on ${host_name}:${host_ip}" + cd $VLLM_WS_PATH + + export ROUTER_PORT=$ROUTER_PORT + BENCH_CMD="bash $VLLM_WS_PATH/bench.sh ${xP} ${yD} $((GPUS_PER_NODE*xP)) $((GPUS_PER_NODE*yD)) \ + $MODEL_DIR $MODEL_NAME /run_logs/slurm_job-${SLURM_JOB_ID} ${BENCH_INPUT_LEN} \ + ${BENCH_OUTPUT_LEN} \"${BENCH_MAX_CONCURRENCY}\" ${BENCH_REQUEST_RATE} \ + ${BENCH_RANDOM_RANGE_RATIO} ${BENCH_NUM_PROMPTS_MULTIPLIER}" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $BENCH_CMD" + else + set -x + eval "$BENCH_CMD" + set +x + fi + + # Copy benchmark results to BENCHMARK_LOGS_DIR (mounted from host) + 
LOGS_OUTPUT="${BENCHMARK_LOGS_DIR:-/run_logs}/logs" + mkdir -p "$LOGS_OUTPUT" + + if [[ "$DRY_RUN" -eq 0 ]]; then + cp -r /run_logs/slurm_job-${SLURM_JOB_ID} "$LOGS_OUTPUT/" + echo "Copied results to $LOGS_OUTPUT/slurm_job-${SLURM_JOB_ID}" + fi + + echo "Killing the proxy server" + [[ "$DRY_RUN" -eq 0 ]] && kill $proxy_pid + +elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -le "$xP" ]; then + echo "${host_name}:${host_ip} is Prefill Node (Model: ${MODEL_NAME})" + echo "Using prefill config: $PREFILL_SERVER_CONFIG" + + setup_ucx_env + for env_pair in ${PREFILL_MODEL_ENVS}; do + export "$env_pair" + done + + PREFILL_CMD="vllm serve ${MODEL_PATH} \ + --port $SERVER_PORT \ + --trust-remote-code \ + --disable-log-requests \ + --kv-transfer-config '{\"kv_connector\": \"NixlConnector\", \"engine_id\": \"${ENGINE_ID}\", \"kv_role\": \"kv_producer\", \"kv_parallel_size\": 8, \"kv_rank\": 0, \"kv_buffer_size\": 5000000000, \"kv_buffer_device\": \"cuda\", \"kv_ip\": \"'\"${rdma_ip}\"'\", \"kv_port\": 14600}' \ + ${PREFILL_SERVER_CONFIG}" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $PREFILL_CMD" + else + set -x + eval "$PREFILL_CMD" \ + 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log & + set +x + prefill_pid=$! + fi + + echo "Waiting for proxy server to be up..." + BARRIER_CMD="python3 $VLLM_WS_PATH/sync.py barrier \ + --node-ips ${NODE0_ADDR} \ + --node-ports ${ROUTER_PORT} \ + --wait-for-all-ports \ + --timeout 1800" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $BARRIER_CMD" + else + eval "$BARRIER_CMD" + fi + + echo "Waiting until proxy server closes..." 
+ WAIT_CMD="python3 $VLLM_WS_PATH/sync.py wait \ + --remote-ip ${NODE0_ADDR} \ + --remote-port ${ROUTER_PORT}" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $WAIT_CMD" + else + eval "$WAIT_CMD" + fi + + echo "Killing the prefill server" + [[ "$DRY_RUN" -eq 0 ]] && kill $prefill_pid + +else + echo "${host_name}:${host_ip} is Decode Node (Model: ${MODEL_NAME})" + echo "Using decode config: $DECODE_SERVER_CONFIG" + + setup_ucx_env + for env_pair in ${DECODE_MODEL_ENVS}; do + export "$env_pair" + done + + DECODE_CMD="vllm serve ${MODEL_PATH} \ + --port $SERVER_PORT \ + --trust-remote-code \ + --disable-log-requests \ + --kv-transfer-config '{\"kv_connector\": \"NixlConnector\", \"engine_id\": \"${ENGINE_ID}\", \"kv_role\": \"kv_consumer\", \"kv_parallel_size\": 8, \"kv_rank\": 0, \"kv_buffer_size\": 5000000000, \"kv_buffer_device\": \"cuda\", \"kv_ip\": \"'\"${rdma_ip}\"'\", \"kv_port\": 14600}' \ + ${DECODE_SERVER_CONFIG}" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $DECODE_CMD" + else + set -x + eval "$DECODE_CMD" \ + 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/decode_${host_name}.log & + set +x + decode_pid=$! + fi + + echo "Waiting for proxy server to be up..." + BARRIER_CMD="python3 $VLLM_WS_PATH/sync.py barrier \ + --node-ips ${NODE0_ADDR} \ + --node-ports ${ROUTER_PORT} \ + --wait-for-all-ports \ + --timeout 1800" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $BARRIER_CMD" + else + eval "$BARRIER_CMD" + fi + + echo "Waiting until proxy server closes..." 
+ WAIT_CMD="python3 $VLLM_WS_PATH/sync.py wait \ + --remote-ip ${NODE0_ADDR} \ + --remote-port ${ROUTER_PORT}" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $WAIT_CMD" + else + eval "$WAIT_CMD" + fi + + echo "Killing the decode server" + [[ "$DRY_RUN" -eq 0 ]] && kill $decode_pid +fi + +echo "Killing the etcd server" +kill $etcd_pid + +echo "Script completed successfully" +exit 0 diff --git a/benchmarks/multi_node/vllm_disagg_utils/start_etcd.sh b/benchmarks/multi_node/vllm_disagg_utils/start_etcd.sh new file mode 100755 index 000000000..46bbd2964 --- /dev/null +++ b/benchmarks/multi_node/vllm_disagg_utils/start_etcd.sh @@ -0,0 +1,47 @@ +#!/bin/bash +set -x + +IPADDRS="${IPADDRS:-localhost}" + +# Use management network IP (matching what the Slurm script resolved) +host_ip=$(ip route get 1.1.1.1 2>/dev/null | sed -n 's/.*src \([^ ]*\).*/\1/p') +if [[ -z "$host_ip" ]]; then + host_ip=$(hostname -I | awk '{print $1}') +fi + +IFS=',' read -ra ADDR <<< "$IPADDRS" + +# Determine node name based on position in the IPADDRS list +index=0 +for ip in "${ADDR[@]}"; do + if [[ "$ip" == "$host_ip" ]]; then + break + fi + index=$((index + 1)) +done +node_name="etcd-$((index+1))" + +# Build initial cluster string +initial_cluster="" +for i in "${!ADDR[@]}"; do + peer_name="etcd-$((i+1))" + initial_cluster+="$peer_name=http://${ADDR[i]}:2380" + if [[ $i -lt $((${#ADDR[@]} - 1)) ]]; then + initial_cluster+="," + fi +done + +mkdir -p /var/lib/etcd +rm -rf /var/lib/etcd/* + +/usr/local/bin/etcd/etcd \ + --name "$node_name" \ + --data-dir /var/lib/etcd \ + --initial-advertise-peer-urls http://$host_ip:2380 \ + --listen-peer-urls http://0.0.0.0:2380 \ + --listen-client-urls http://0.0.0.0:2379 \ + --advertise-client-urls http://$host_ip:2379 \ + --initial-cluster-token etcd-cluster-1 \ + --initial-cluster "$initial_cluster" \ + --initial-cluster-state new \ + 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/etcd_NODE${NODE_RANK}.log diff --git 
a/benchmarks/multi_node/vllm_disagg_utils/submit.sh b/benchmarks/multi_node/vllm_disagg_utils/submit.sh new file mode 100755 index 000000000..a41a31d79 --- /dev/null +++ b/benchmarks/multi_node/vllm_disagg_utils/submit.sh @@ -0,0 +1,131 @@ +#!/bin/bash +# +# Cluster Configuration Template for Multi-Node vLLM Disaggregated Serving +# +# This script submits a multi-node vLLM disaggregated benchmark job to SLURM. +# It must be configured for your specific cluster before use. +# +# Key difference from SGLang: vLLM uses a dedicated proxy node, so +# NUM_NODES = PREFILL_NODES + DECODE_NODES + 1. + +usage() { + cat << 'USAGE' +Usage: + bash submit.sh <PREFILL_NODES> <PREFILL_WORKERS> <DECODE_NODES> <DECODE_WORKERS> \ + <ISL> <OSL> <CONCURRENCIES> <REQUEST_RATE> [NODE_LIST] + +Arguments: + PREFILL_NODES Number of prefill nodes + PREFILL_WORKERS Number of prefill workers (usually 1) + DECODE_NODES Number of decode nodes + DECODE_WORKERS Number of decode workers (usually 1) + ISL Input sequence length + OSL Output sequence length + CONCURRENCIES Concurrency levels, delimited by 'x' (e.g., "8x16x32") + REQUEST_RATE Request rate ("inf" for max throughput) + NODE_LIST Optional: comma-separated hostnames + +Required environment variables: + SLURM_ACCOUNT SLURM account name + SLURM_PARTITION SLURM partition + TIME_LIMIT Job time limit (e.g., "08:00:00") + MODEL_PATH Path to model directory (e.g., /nfsdata) + MODEL_NAME Model name directory + CONTAINER_IMAGE Docker image name (e.g., vllm_disagg_pd:latest) + RUNNER_NAME Runner identifier (for job name) +USAGE +} + +check_env() { + local name="$1" + if [[ -z "${!name:-}" ]]; then + echo "Error: ${name} not specified" >&2 + usage >&2 + exit 1 + fi +} + +check_env SLURM_ACCOUNT +check_env SLURM_PARTITION +check_env TIME_LIMIT + +check_env MODEL_PATH +check_env MODEL_NAME +check_env CONTAINER_IMAGE +check_env RUNNER_NAME + +GPUS_PER_NODE="${GPUS_PER_NODE:-8}" + +# COMMAND_LINE ARGS +PREFILL_NODES=$1 +PREFILL_WORKERS=${2:-1} +DECODE_NODES=$3 +DECODE_WORKERS=${4:-1} +ISL=$5 +OSL=$6 +CONCURRENCIES=$7 +REQUEST_RATE=$8 +NODE_LIST=${9} + +# vLLM
needs xP + yD + 1 nodes (dedicated proxy node) +NUM_NODES=$((PREFILL_NODES + DECODE_NODES + 1)) +profiler_args="${ISL} ${OSL} ${CONCURRENCIES} ${REQUEST_RATE}" + +# Export variables for the SLURM job +export MODEL_DIR=$MODEL_PATH +export DOCKER_IMAGE_NAME=$CONTAINER_IMAGE +export PROFILER_ARGS=$profiler_args + +# For vLLM, each worker = 1 node (TP=8 per node). +# xP/yD must match the node counts so job.slurm's NUM_NODES = xP+yD+1 is correct. +export xP=$PREFILL_NODES +export yD=$DECODE_NODES +export NUM_NODES=$NUM_NODES +export GPUS_PER_NODE=$GPUS_PER_NODE +export MODEL_NAME=$MODEL_NAME +export BENCH_INPUT_LEN=${ISL} +export BENCH_OUTPUT_LEN=${OSL} +export BENCH_RANDOM_RANGE_RATIO=${BENCH_RANDOM_RANGE_RATIO:-1} +export BENCH_NUM_PROMPTS_MULTIPLIER=${BENCH_NUM_PROMPTS_MULTIPLIER:-10} +export BENCH_MAX_CONCURRENCY=${CONCURRENCIES} +export BENCH_REQUEST_RATE=${REQUEST_RATE} + +# Log directory: must be on NFS (shared filesystem) so the submit host can read SLURM output. +export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}" +mkdir -p "$BENCHMARK_LOGS_DIR" + +# Optional: pass an explicit node list to sbatch. +NODELIST_OPT=() +if [[ -n "${NODE_LIST//[[:space:]]/}" ]]; then + IFS=',' read -r -a NODE_ARR <<< "$NODE_LIST" + if [[ "${#NODE_ARR[@]}" -ne "$NUM_NODES" ]]; then + echo "Error: NODE_LIST has ${#NODE_ARR[@]} nodes but NUM_NODES=${NUM_NODES}" >&2 + echo "Error: NODE_LIST='${NODE_LIST}'" >&2 + exit 1 + fi + NODELIST_CSV="$(IFS=,; echo "${NODE_ARR[*]}")" + NODELIST_OPT=(--nodelist "$NODELIST_CSV") +fi + +# Construct the sbatch command +sbatch_cmd=( + sbatch + --parsable + -N "$NUM_NODES" + -n "$NUM_NODES" + "${NODELIST_OPT[@]}" + --time "$TIME_LIMIT" + --partition "$SLURM_PARTITION" + --account "$SLURM_ACCOUNT" + --job-name "$RUNNER_NAME" + --output "${BENCHMARK_LOGS_DIR}/slurm_job-%j.out" + --error "${BENCHMARK_LOGS_DIR}/slurm_job-%j.err" + "$(dirname "$0")/job.slurm" +) + +JOB_ID=$("${sbatch_cmd[@]}") +if [[ $? 
-ne 0 ]]; then + echo "Error: Failed to submit job with sbatch" >&2 + exit 1 +fi +echo "$JOB_ID" diff --git a/benchmarks/multi_node/vllm_disagg_utils/sync.py b/benchmarks/multi_node/vllm_disagg_utils/sync.py new file mode 100755 index 000000000..140951519 --- /dev/null +++ b/benchmarks/multi_node/vllm_disagg_utils/sync.py @@ -0,0 +1,198 @@ +#!/usr/bin/env python3 +""" +Multi-node synchronization utilities for disaggregated inference. + +Subcommands: + barrier - Wait until all specified nodes have opened their ports (TCP barrier) + Optionally wait for HTTP health endpoints to return 200 + wait - Block until a remote port closes (shutdown coordination) +""" + +import socket +import time +import threading +import argparse +import sys +import urllib.request +import urllib.error + + +def is_port_open(ip, port, timeout=2): + """Check if a given IP and port are accessible.""" + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.settimeout(timeout) + return s.connect_ex((ip, port)) == 0 + + +def check_health(ip, port, path="/health", timeout=2): + """Return True if http://ip:port/path returns HTTP 200.""" + try: + url = f"http://{ip}:{port}{path}" + req = urllib.request.Request(url) + with urllib.request.urlopen(req, timeout=timeout) as resp: + return getattr(resp, "status", 200) == 200 + except (urllib.error.URLError, urllib.error.HTTPError, OSError): + return False + + +# ============================================================================= +# barrier subcommand +# ============================================================================= + +def cmd_barrier(args): + """Wait until all nodes have opened the specified ports.""" + NODE_IPS = [ip.strip() for ip in args.node_ips.split(",") if ip.strip()] + NODE_PORTS = [int(p.strip()) for p in args.node_ports.split(",") if p.strip()] + + if not NODE_IPS: + print("Error: NODE_IPS argument is empty or not set.") + sys.exit(1) + + if len(NODE_PORTS) == 1: + NODE_PORTS *= len(NODE_IPS) + elif 
len(NODE_PORTS) != len(NODE_IPS): + print("Error: Number of ports must match number of node IPs or only one port should be given for all.") + sys.exit(1) + + server_socket = None + + def open_port(): + nonlocal server_socket + server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + server_socket.bind((args.local_ip, args.local_port)) + server_socket.listen(5) + print(f"Port {args.local_port} is now open on {args.local_ip}.") + while True: + conn, addr = server_socket.accept() + conn.close() + + def close_port(): + nonlocal server_socket + if server_socket: + server_socket.close() + print(f"Port {args.local_port} has been closed on {args.local_ip}.") + + if args.enable_port: + threading.Thread(target=open_port, daemon=True).start() + + # Wait for all ports (TCP check) + if args.wait_for_all_ports: + start_time = time.time() + timeout = args.timeout + + while True: + if timeout > 0: + elapsed = time.time() - start_time + if elapsed >= timeout: + not_open = [(ip, port) for ip, port in zip(NODE_IPS, NODE_PORTS) + if not is_port_open(ip, port)] + print(f"ERROR: Timeout after {timeout} seconds waiting for ports to open.", flush=True) + print("The following nodes/ports are still not responding:", flush=True) + for ip, port in not_open: + print(f" - {ip}:{port}", flush=True) + sys.exit(1) + + all_open = all(is_port_open(ip, port) for ip, port in zip(NODE_IPS, NODE_PORTS)) + if all_open: + break + + if timeout > 0: + remaining = timeout - (time.time() - start_time) + print(f"Waiting for nodes.{NODE_PORTS},{NODE_IPS} . . ({remaining:.0f}s remaining)", flush=True) + else: + print(f"Waiting for nodes.{NODE_PORTS},{NODE_IPS} . 
.", flush=True) + time.sleep(5) + + # Wait for all health endpoints (HTTP check) + if args.wait_for_all_health: + health_path = args.health_endpoint + start_time = time.time() + timeout = args.timeout + + while True: + if timeout > 0: + elapsed = time.time() - start_time + if elapsed >= timeout: + not_ready = [ + (ip, port) + for ip, port in zip(NODE_IPS, NODE_PORTS) + if not check_health(ip, port, health_path) + ] + print(f"ERROR: Timeout after {timeout} seconds waiting for health endpoints.", flush=True) + print(f"The following (http://ip:port{health_path}) are still not responding:", flush=True) + for ip, port in not_ready: + print(f" - http://{ip}:{port}{health_path}", flush=True) + sys.exit(1) + + all_ready = all( + check_health(ip, port, health_path) + for ip, port in zip(NODE_IPS, NODE_PORTS) + ) + if all_ready: + break + + if timeout > 0: + remaining = timeout - (time.time() - start_time) + print( + f"Waiting for health on {list(zip(NODE_IPS, NODE_PORTS))} ({health_path}) .. ({remaining:.0f}s remaining)", + flush=True, + ) + else: + print(f"Waiting for health on {list(zip(NODE_IPS, NODE_PORTS))} ({health_path}) ..", flush=True) + time.sleep(30) + + if args.enable_port: + time.sleep(30) + close_port() + + +# ============================================================================= +# wait subcommand +# ============================================================================= + +def cmd_wait(args): + """Wait while a remote port remains open, exit when it closes.""" + print(f"Waiting while port {args.remote_port} on {args.remote_ip} is open...") + while is_port_open(args.remote_ip, args.remote_port): + time.sleep(5) + print(f"Port {args.remote_port} on {args.remote_ip} is now closed.") + + +# ============================================================================= +# CLI +# ============================================================================= + +def main(): + parser = argparse.ArgumentParser(description="Multi-node synchronization 
utilities.") + subparsers = parser.add_subparsers(dest="command", required=True) + + # barrier subcommand + bp = subparsers.add_parser("barrier", help="Wait for all nodes to open specified ports.") + bp.add_argument("--local-ip", required=False, help="Local IP address to bind the server.") + bp.add_argument("--local-port", type=int, required=False, help="Port number to bind the server.") + bp.add_argument("--enable-port", action="store_true", help="Enable opening and closing of local port.") + bp.add_argument("--node-ips", required=True, help="Comma-separated list of node IPs.") + bp.add_argument("--node-ports", required=True, help="Comma-separated list of ports to check.") + bp.add_argument("--timeout", type=int, default=600, + help="Timeout in seconds (default: 600). Set to 0 for no timeout.") + bp.add_argument("--wait-for-all-ports", action="store_true", + help="Wait until all node ports are open (TCP).") + bp.add_argument("--wait-for-all-health", action="store_true", + help="Wait until http://ip:port/health returns 200 for all nodes.") + bp.add_argument("--health-endpoint", default="/health", + help="Path for health check (default: /health).") + bp.set_defaults(func=cmd_barrier) + + # wait subcommand + wp = subparsers.add_parser("wait", help="Wait while a remote port remains open.") + wp.add_argument("--remote-ip", required=True, help="Remote server IP address.") + wp.add_argument("--remote-port", type=int, required=True, help="Remote port number.") + wp.set_defaults(func=cmd_wait) + + args = parser.parse_args() + args.func(args) + + +if __name__ == "__main__": + main() diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh index fc04f5bb3..865a99aba 100644 --- a/runners/launch_mi355x-amds.sh +++ b/runners/launch_mi355x-amds.sh @@ -52,7 +52,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then sudo rm -rf "$BENCHMARK_LOGS_DIR/logs" 2>/dev/null || true SCRIPT_NAME="${EXP_NAME%%_*}_${PRECISION}_mi355x_${FRAMEWORK}.sh" - if [[ "$FRAMEWORK" == 
"sglang-disagg" ]]; then + if [[ "$FRAMEWORK" == "sglang-disagg" || "$FRAMEWORK" == "vllm-disagg" ]]; then BENCHMARK_SUBDIR="multi_node" else BENCHMARK_SUBDIR="single_node" @@ -103,8 +103,17 @@ if [[ "$IS_MULTINODE" == "true" ]]; then cat > collect_latest_results.py <<'PY' import os, sys -sgl_job_dir, isl, osl, nexp = sys.argv[1], int(sys.argv[2]), int(sys.argv[3]), int(sys.argv[4]) -for path in sorted([f"{sgl_job_dir}/logs/{name}/sglang_isl_{isl}_osl_{osl}" for name in os.listdir(f"{sgl_job_dir}/logs/") if os.path.isdir(f"{sgl_job_dir}/logs/{name}/sglang_isl_{isl}_osl_{osl}")], key=os.path.getmtime, reverse=True)[:nexp]: +job_dir, isl, osl, nexp = sys.argv[1], int(sys.argv[2]), int(sys.argv[3]), int(sys.argv[4]) +prefixes = ["sglang", "vllm"] +logs_root = f"{job_dir}/logs/" +candidates = [] +if os.path.isdir(logs_root): + for name in os.listdir(logs_root): + for pfx in prefixes: + subdir = f"{logs_root}{name}/{pfx}_isl_{isl}_osl_{osl}" + if os.path.isdir(subdir): + candidates.append(subdir) +for path in sorted(candidates, key=os.path.getmtime, reverse=True)[:nexp]: print(path) PY From 1c4ad3dea0400e94ce6dd9533eb07383b6b6d317 Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Wed, 11 Mar 2026 17:50:16 +0000 Subject: [PATCH 02/19] [AMD] Refactor vLLM disagg recipe: models.yaml, UCX cleanup, QoS support Extract hardcoded model configurations from server.sh bash maps and job.slurm VALID_MODELS into a declarative models.yaml, mirroring the SGLang disagg recipe pattern. Adding a new model now requires no script changes. 
Also: - Consolidate UCX transport vars in job.slurm Docker env; remove duplicated setup_ucx_env() from server.sh - Extract RDMA workarounds (ionic /31 route fix, Nixl UCX patch) into setup_rdma_env() helper - Lower UCX_LOG_LEVEL from info to warn - Add nicctl mount and QoS/DSCP auto-detection to env.sh - Remove stale host libionic bind-mounts (driver now built into image) --- .../multi_node/vllm_disagg_utils/env.sh | 54 +++++- .../multi_node/vllm_disagg_utils/job.slurm | 46 +++-- .../multi_node/vllm_disagg_utils/models.yaml | 41 +++++ .../multi_node/vllm_disagg_utils/server.sh | 162 ++++++++---------- 4 files changed, 184 insertions(+), 119 deletions(-) create mode 100644 benchmarks/multi_node/vllm_disagg_utils/models.yaml diff --git a/benchmarks/multi_node/vllm_disagg_utils/env.sh b/benchmarks/multi_node/vllm_disagg_utils/env.sh index ebe77f09b..f4340e812 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/env.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/env.sh @@ -33,9 +33,17 @@ else fi if [[ -z "$UCX_NET_DEVICES" ]]; then - FIRST_IB=$(echo "$IBDEVICES" | cut -d',' -f1) - if [[ -n "$FIRST_IB" ]]; then - export UCX_NET_DEVICES="${FIRST_IB}:1" + # Use the first benic interface for UCX TCP transport (maps to ionic RDMA NIC). + # We use TCP device names (benicXp1) instead of IB device names (ionic_X:1) + # because ud_verbs/ionic crashes in ucp_request_memory_dereg (UCX bug with ionic provider). 
+ UCX_NET_DEV=$(ip -o link show 2>/dev/null | awk -F': ' '/benic1p1/{print $2}' | head -1) + if [[ -n "$UCX_NET_DEV" ]]; then + export UCX_NET_DEVICES="$UCX_NET_DEV" + else + FIRST_IB=$(echo "$IBDEVICES" | cut -d',' -f1) + if [[ -n "$FIRST_IB" ]]; then + export UCX_NET_DEVICES="${FIRST_IB}:1" + fi fi echo "[INFO] Auto-set UCX_NET_DEVICES=$UCX_NET_DEVICES" else @@ -48,5 +56,43 @@ export NCCL_IB_HCA=${NCCL_IB_HCA:-$IBDEVICES} # RoCEv2: use IPv4-mapped GID (index 1) for inter-node RDMA routing export UCX_IB_GID_INDEX=${UCX_IB_GID_INDEX:-1} +# QoS/DSCP configuration for lossless RoCEv2 fabric. +# Priority order: 1) Set by runner, 2) Detect via nicctl, 3) Detect from hostname +if [[ -n "$UCX_IB_TRAFFIC_CLASS" ]]; then + echo "[INFO] Using UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS (set by environment)" +elif command -v nicctl &> /dev/null; then + ND_PRIO=$(nicctl show qos 2>/dev/null | awk '/PFC no-drop priorities/ {print $NF; exit}') + ND_DSCP=$(nicctl show qos 2>/dev/null | awk -v p="$ND_PRIO" ' +$1 == "DSCP" && $2 == ":" && $NF == p { + print $3; exit +}') + if [[ -n "$ND_DSCP" ]] && [[ -n "$ND_PRIO" ]]; then + export UCX_IB_TRAFFIC_CLASS=$(( 4 * ND_DSCP )) + export UCX_IB_SL=$ND_PRIO + echo "[INFO] Detected QoS from nicctl: UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS, UCX_IB_SL=$UCX_IB_SL" + else + echo "[WARN] nicctl available but QoS data unavailable; trying hostname detection." 
+ NODENAME=$(hostname -s) + if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then + export UCX_IB_TRAFFIC_CLASS=96 + echo "[INFO] Auto-detected UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS from hostname $NODENAME" + elif [[ $NODENAME == mia1* ]]; then + export UCX_IB_TRAFFIC_CLASS=104 + echo "[INFO] Auto-detected UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS from hostname $NODENAME" + fi + fi +else + NODENAME=$(hostname -s) + if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then + export UCX_IB_TRAFFIC_CLASS=96 + echo "[INFO] Auto-detected UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS from hostname $NODENAME" + elif [[ $NODENAME == mia1* ]]; then + export UCX_IB_TRAFFIC_CLASS=104 + echo "[INFO] Auto-detected UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS from hostname $NODENAME" + else + echo "[INFO] No nicctl and unable to detect from hostname. Skipping QoS configuration." + fi +fi + set +x -echo "[INFO] IBDEVICES=$IBDEVICES UCX_NET_DEVICES=$UCX_NET_DEVICES NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME UCX_IB_GID_INDEX=$UCX_IB_GID_INDEX" +echo "[INFO] IBDEVICES=$IBDEVICES UCX_NET_DEVICES=$UCX_NET_DEVICES NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME UCX_IB_GID_INDEX=$UCX_IB_GID_INDEX UCX_IB_TRAFFIC_CLASS=${UCX_IB_TRAFFIC_CLASS:-unset}" diff --git a/benchmarks/multi_node/vllm_disagg_utils/job.slurm b/benchmarks/multi_node/vllm_disagg_utils/job.slurm index 710b7168a..494ef6901 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/job.slurm +++ b/benchmarks/multi_node/vllm_disagg_utils/job.slurm @@ -18,13 +18,14 @@ echo "" # Model Validation # ============================================================================= -VALID_MODELS=( - "Llama-3.1-405B-Instruct-FP8-KV" - "amd-Llama-3.3-70B-Instruct-FP8-KV" - "DeepSeek-V3" - "DeepSeek-R1-0528" - "gpt-oss-120b" -) +# Use $(pwd) not BASH_SOURCE — sbatch copies the script to /var/spool/slurmd/ +# at runtime, but the CWD remains the submit-time directory (vllm_disagg_utils/). 
+MODELS_YAML="$(pwd)/models.yaml" + +if [[ ! -f "$MODELS_YAML" ]]; then + echo "Error: models.yaml not found at $MODELS_YAML" + exit 1 +fi if [[ -z "${DOCKER_IMAGE_NAME:-}" ]]; then echo "Error: DOCKER_IMAGE_NAME is not set." @@ -32,13 +33,10 @@ if [[ -z "${DOCKER_IMAGE_NAME:-}" ]]; then fi MODEL_NAME="${MODEL_NAME:-None}" -model_found=false -for m in "${VALID_MODELS[@]}"; do - [[ "$MODEL_NAME" == "$m" ]] && model_found=true && break -done -if [[ "$model_found" != "true" ]]; then - echo "Error: Model '$MODEL_NAME' not found. Available:" - printf ' - %s\n' "${VALID_MODELS[@]}" +if ! grep -q "^${MODEL_NAME}:" "$MODELS_YAML"; then + echo "Error: Model '$MODEL_NAME' not found in models.yaml" + echo "Available models:" + grep -E '^[A-Za-z]' "$MODELS_YAML" | sed 's/:.*$//' | sed 's/^/ - /' exit 1 fi echo "Model found: $MODEL_NAME" @@ -67,11 +65,6 @@ GPUS_PER_NODE="${GPUS_PER_NODE:-8}" # Model Path Resolution # ============================================================================= -# HF cache directory names may differ from MODEL_NAME -declare -A MODEL_DIR_NAMES=( - ["DeepSeek-R1-0528"]="models--deepseek-ai--DeepSeek-R1-0528" -) - # MODEL_DIR detection: prefer env var, fall back to hostname detection if [[ -z "$MODEL_DIR" ]]; then NODENAME=$(hostname -s) @@ -86,7 +79,11 @@ if [[ -z "$MODEL_DIR" ]]; then fi export MODEL_DIR -DISK_DIR_NAME="${MODEL_DIR_NAMES[$MODEL_NAME]:-$MODEL_NAME}" +# Extract hf_dir from models.yaml (the line after the model's top-level key) +DISK_DIR_NAME=$(awk '/^'"$MODEL_NAME"':/{found=1; next} + found && /^[^ ]/{exit} + found && /hf_dir:/{gsub(/[" ]/, "", $2); print $2; exit}' "$MODELS_YAML") +DISK_DIR_NAME="${DISK_DIR_NAME:-$MODEL_NAME}" echo "Looking for model: $MODEL_NAME (disk dir: $DISK_DIR_NAME)" resolve_hf_cache_path() { @@ -270,10 +267,7 @@ exec sudo docker run --rm \ --security-opt seccomp=unconfined \ --privileged \ -v /sys:/sys \ - -v /etc/libibverbs.d/ionic.driver:/etc/libibverbs.d/ionic.driver:ro \ - -v 
/lib/x86_64-linux-gnu/libionic.so.1:/lib/x86_64-linux-gnu/libionic.so.1:ro \ - -v /lib/x86_64-linux-gnu/libionic.so:/lib/x86_64-linux-gnu/libionic.so:ro \ - -v /usr/lib/x86_64-linux-gnu/libibverbs/libionic-rdmav34.so:/usr/lib/x86_64-linux-gnu/libibverbs/libionic-rdmav34.so:ro \ + $(command -v nicctl >/dev/null 2>&1 && echo "-v $(which nicctl):/usr/sbin/nicctl") \ -v ${MODEL_DIR}:/models \ -v \$HOME/.ssh:/root/.ssh \ --shm-size 128G \ @@ -302,13 +296,13 @@ exec sudo docker run --rm \ -e TQDM_MININTERVAL=\$TQDM_MININTERVAL \ -e DRY_RUN=\$DRY_RUN \ -e BENCHMARK_LOGS_DIR=/benchmark_logs \ - -e UCX_TLS=all \ + -e UCX_TLS=tcp,self,shm,rocm_ipc,rocm_copy,cma \ -e UCX_SOCKADDR_TLS_PRIORITY=tcp \ -e UCX_MEMTYPE_CACHE=y \ -e UCX_RNDV_SCHEME=get_zcopy \ -e UCX_RNDV_THRESH=4k \ -e UCX_ROCM_IPC_MIN_ZCOPY=0 \ - -e UCX_LOG_LEVEL=info \ + -e UCX_LOG_LEVEL=warn \ -e HSA_ENABLE_SDMA=1 \ --name \"$DOCKER_CONT_NAME\" \ \"$DOCKER_IMAGE_NAME\" bash -lc ' diff --git a/benchmarks/multi_node/vllm_disagg_utils/models.yaml b/benchmarks/multi_node/vllm_disagg_utils/models.yaml new file mode 100644 index 000000000..31197ec52 --- /dev/null +++ b/benchmarks/multi_node/vllm_disagg_utils/models.yaml @@ -0,0 +1,41 @@ +# Model-specific vLLM server configurations for disaggregated inference. +# +# Each top-level key is a MODEL_NAME value (must match the model identifier +# used in amd-master.yaml and the directory/HF-cache name under MODEL_DIR). +# +# To add a new model: add a new top-level entry following the same schema. +# No script changes are required. +# +# Schema: +# <MODEL_NAME>: +# prefill_flags: str # vLLM CLI flags for prefill workers +# decode_flags: str # vLLM CLI flags for decode workers +# env: str # Space-separated KEY=VALUE pairs exported before vllm serve +# hf_dir: str # (optional) On-disk directory name if it differs from the key +# # e.g.
HF cache layout: models--deepseek-ai--DeepSeek-R1-0528 + +Llama-3.1-405B-Instruct-FP8-KV: + prefill_flags: "--tensor-parallel-size 8 --kv-cache-dtype fp8" + decode_flags: "--tensor-parallel-size 8 --kv-cache-dtype fp8" + env: "VLLM_USE_V1=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1 AMDGCN_USE_BUFFER_OPS=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_ROPE=1 TRITON_HIP_ASYNC_COPY_BYPASS_PERMUTE=1 TRITON_HIP_USE_ASYNC_COPY=1 TRITON_HIP_USE_BLOCK_PINGPONG=1 TRITON_HIP_ASYNC_FAST_SWIZZLE=1" + +amd-Llama-3.3-70B-Instruct-FP8-KV: + prefill_flags: "--tensor-parallel-size 8 --max-model-len 65536 --kv-cache-dtype fp8" + decode_flags: "--tensor-parallel-size 8 --max-model-len 65536 --kv-cache-dtype fp8" + env: "VLLM_USE_V1=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1 AMDGCN_USE_BUFFER_OPS=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_ROPE=1 TRITON_HIP_ASYNC_COPY_BYPASS_PERMUTE=1 TRITON_HIP_USE_ASYNC_COPY=1 TRITON_HIP_USE_BLOCK_PINGPONG=1 TRITON_HIP_ASYNC_FAST_SWIZZLE=1" + +DeepSeek-V3: + prefill_flags: "--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1" + decode_flags: "--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1" + env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0" + +DeepSeek-R1-0528: + prefill_flags: "--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1" + decode_flags: "--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1" + env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0" + hf_dir: "models--deepseek-ai--DeepSeek-R1-0528" + +gpt-oss-120b: + 
prefill_flags: "--tensor-parallel-size 8" + decode_flags: "--tensor-parallel-size 8" + env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 VLLM_USE_AITER_UNIFIED_ATTENTION=1 VLLM_ROCM_USE_AITER_MHA=0 ROCM_TRITON_MOE_PRESHUFFLE_SCALES=0" diff --git a/benchmarks/multi_node/vllm_disagg_utils/server.sh b/benchmarks/multi_node/vllm_disagg_utils/server.sh index b4ab7bce8..21fe506cb 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/server.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/server.sh @@ -53,37 +53,43 @@ host_name=$(hostname) echo "[INFO] Management IP (barriers/proxy): $host_ip" echo "[INFO] RDMA IP (Nixl KV transfer): $rdma_ip" -# --------------------------------------------------------------------------- -# RDMA route setup for Pensando ionic (RoCEv2) point-to-point /31 links. -# Each benic interface has a /31 to the TOR switch. Without explicit routes, -# traffic to other nodes' RDMA IPs falls through to the management network -# (no RDMA capability). Fix: add a /24 route via the TOR gateway so RoCEv2 -# stays on the ionic fabric. 
-# --------------------------------------------------------------------------- -if [[ "$rdma_ip" =~ ^192\.168\.([0-9]+)\.([0-9]+)$ ]]; then - rdma_subnet="${BASH_REMATCH[1]}" - rdma_host="${BASH_REMATCH[2]}" - rdma_gw="192.168.${rdma_subnet}.$(( rdma_host | 1 ))" # /31 peer = TOR switch - rdma_iface=$(ip -o addr show | awk -v ip="$rdma_ip" '$4 ~ ip {print $2}' | head -1) - if [[ -n "$rdma_iface" ]]; then - ip route replace "192.168.${rdma_subnet}.0/24" via "$rdma_gw" dev "$rdma_iface" 2>/dev/null && \ - echo "[RDMA-ROUTE] Added 192.168.${rdma_subnet}.0/24 via $rdma_gw dev $rdma_iface" || \ - echo "[RDMA-ROUTE] Route add failed for 192.168.${rdma_subnet}.0/24" +# ============================================================================= +# RDMA / Nixl Workarounds +# ============================================================================= + +setup_rdma_env() { + # Pensando ionic (RoCEv2) point-to-point /31 route fix. + # Each benic interface has a /31 to the TOR switch. Without explicit routes, + # traffic to other nodes' RDMA IPs falls through to the management network. 
+ if [[ "$rdma_ip" =~ ^192\.168\.([0-9]+)\.([0-9]+)$ ]]; then + local rdma_subnet="${BASH_REMATCH[1]}" + local rdma_host="${BASH_REMATCH[2]}" + local rdma_gw="192.168.${rdma_subnet}.$(( rdma_host | 1 ))" + local rdma_iface + rdma_iface=$(ip -o addr show | awk -v ip="$rdma_ip" '$4 ~ ip {print $2}' | head -1) + if [[ -n "$rdma_iface" ]]; then + ip route replace "192.168.${rdma_subnet}.0/24" via "$rdma_gw" dev "$rdma_iface" 2>/dev/null && \ + echo "[RDMA-ROUTE] Added 192.168.${rdma_subnet}.0/24 via $rdma_gw dev $rdma_iface" || \ + echo "[RDMA-ROUTE] Route add failed for 192.168.${rdma_subnet}.0/24" + fi fi -fi -# Patch Nixl UCX backend: set ucx_error_handling_mode=none for shared-memory -# transport compatibility (Pensando ionic NICs don't support rdmacm, so the -# default UCP_ERR_HANDLING_MODE_PEER causes "no active messages transport" errors) -NIXL_API_FILE=$(python3 -c "import rixl._api; print(rixl._api.__file__)" 2>/dev/null) -if [[ -n "$NIXL_API_FILE" ]]; then - if ! grep -q 'ucx_error_handling_mode' "$NIXL_API_FILE"; then - sed -i '/init\["num_threads"\] = str(nixl_conf.num_threads)/a\ init["ucx_error_handling_mode"] = "none"' "$NIXL_API_FILE" - echo "[PATCH] Added ucx_error_handling_mode=none to $NIXL_API_FILE" - else - echo "[PATCH] ucx_error_handling_mode already set in $NIXL_API_FILE" + # Patch Nixl UCX backend: set ucx_error_handling_mode=none. + # Pensando ionic NICs don't support rdmacm, so the default + # UCP_ERR_HANDLING_MODE_PEER causes "no active messages transport" errors. + local nixl_api + nixl_api=$(python3 -c "import rixl._api; print(rixl._api.__file__)" 2>/dev/null) + if [[ -n "$nixl_api" ]]; then + if ! 
grep -q 'ucx_error_handling_mode' "$nixl_api"; then + sed -i '/init\["num_threads"\] = str(nixl_conf.num_threads)/a\ init["ucx_error_handling_mode"] = "none"' "$nixl_api" + echo "[PATCH] Added ucx_error_handling_mode=none to $nixl_api" + else + echo "[PATCH] ucx_error_handling_mode already set in $nixl_api" + fi fi -fi +} + +setup_rdma_env if [[ -z "$UCX_NET_DEVICES" ]]; then echo "Error: UCX_NET_DEVICES is empty after env.sh detection" >&2 @@ -91,56 +97,45 @@ if [[ -z "$UCX_NET_DEVICES" ]]; then fi # ============================================================================= -# Model-Specific Configuration Maps +# Model-Specific Configuration from YAML # ============================================================================= +MODELS_YAML="${VLLM_WS_PATH}/models.yaml" -declare -A MODEL_PREFILL_CONFIGS=( - ["Llama-3.1-405B-Instruct-FP8-KV"]="--tensor-parallel-size 8 --kv-cache-dtype fp8" - ["amd-Llama-3.3-70B-Instruct-FP8-KV"]="--tensor-parallel-size 8 --max-model-len 65536 --kv-cache-dtype fp8" - ["DeepSeek-V3"]="--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1" - ["DeepSeek-R1-0528"]="--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1" - ["gpt-oss-120b"]="--tensor-parallel-size 8" -) - -declare -A MODEL_DECODE_CONFIGS=( - ["Llama-3.1-405B-Instruct-FP8-KV"]="--tensor-parallel-size 8 --kv-cache-dtype fp8" - ["amd-Llama-3.3-70B-Instruct-FP8-KV"]="--tensor-parallel-size 8 --max-model-len 65536 --kv-cache-dtype fp8" - ["DeepSeek-V3"]="--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1" - ["DeepSeek-R1-0528"]="--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1" - ["gpt-oss-120b"]="--tensor-parallel-size 8" -) - -declare -A MODEL_ENVS=( - 
["amd-Llama-3.3-70B-Instruct-FP8-KV"]="VLLM_USE_V1=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1 AMDGCN_USE_BUFFER_OPS=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_ROPE=1 TRITON_HIP_ASYNC_COPY_BYPASS_PERMUTE=1 TRITON_HIP_USE_ASYNC_COPY=1 TRITON_HIP_USE_BLOCK_PINGPONG=1 TRITON_HIP_ASYNC_FAST_SWIZZLE=1" - ["Llama-3.1-405B-Instruct-FP8-KV"]="VLLM_USE_V1=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1 AMDGCN_USE_BUFFER_OPS=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_ROPE=1 TRITON_HIP_ASYNC_COPY_BYPASS_PERMUTE=1 TRITON_HIP_USE_ASYNC_COPY=1 TRITON_HIP_USE_BLOCK_PINGPONG=1 TRITON_HIP_ASYNC_FAST_SWIZZLE=1" - ["DeepSeek-V3"]="VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0" - ["DeepSeek-R1-0528"]="VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0" - ["gpt-oss-120b"]="VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 VLLM_USE_AITER_UNIFIED_ATTENTION=1 VLLM_ROCM_USE_AITER_MHA=0 ROCM_TRITON_MOE_PRESHUFFLE_SCALES=0" -) - -get_model_config() { - local mode="$1" - local model_name="$2" - if [[ "$mode" == "prefill" ]]; then - echo "${MODEL_PREFILL_CONFIGS[$model_name]:-"--tensor-parallel-size 8"}" - elif [[ "$mode" == "decode" ]]; then - echo "${MODEL_DECODE_CONFIGS[$model_name]:-"--tensor-parallel-size 8"}" - fi -} - -get_model_envs() { - echo "${MODEL_ENVS[$1]:-""}" -} +if [[ ! 
-f "$MODELS_YAML" ]]; then + echo "ERROR: models.yaml not found at $MODELS_YAML" + exit 1 +fi if [[ -z "$MODEL_NAME" ]]; then echo "ERROR: MODEL_NAME is not set"; exit 1 fi -PREFILL_SERVER_CONFIG=$(get_model_config "prefill" "$MODEL_NAME") -DECODE_SERVER_CONFIG=$(get_model_config "decode" "$MODEL_NAME") -PREFILL_MODEL_ENVS=$(get_model_envs "$MODEL_NAME") -DECODE_MODEL_ENVS=$(get_model_envs "$MODEL_NAME") -echo "Using model-specific configuration for: $MODEL_NAME" +eval "$(python3 -c " +import yaml, sys + +with open('${MODELS_YAML}') as f: + models = yaml.safe_load(f) + +model_name = '${MODEL_NAME}' +if model_name not in models: + print(f'echo \"ERROR: Model {model_name} not in models.yaml\"; exit 1') + sys.exit(0) + +m = models[model_name] + +def bash_escape(s): + \"\"\"Escape a value for safe embedding in a bash double-quoted assignment.\"\"\" + return s.replace('\\\\', '\\\\\\\\').replace('\"', '\\\\\"').replace('\$', '\\\\\$').replace('\`', '\\\\\`') + +pf = bash_escape(m.get('prefill_flags', '--tensor-parallel-size 8')) +df = bash_escape(m.get('decode_flags', '--tensor-parallel-size 8')) +ev = bash_escape(m.get('env', '')) +print(f'PREFILL_SERVER_CONFIG=\"{pf}\"') +print(f'DECODE_SERVER_CONFIG=\"{df}\"') +print(f'MODEL_ENVS=\"{ev}\"') +")" + +echo "Loaded model configuration for: $MODEL_NAME" # ============================================================================= # Container Synchronization @@ -203,20 +198,15 @@ done echo "Prefill node IPs: ${PREFILL_ARGS}" echo "Decode node IPs: ${DECODE_ARGS}" -# Common UCX/Nixl environment for prefill and decode workers -setup_ucx_env() { - export UCX_TLS=all - export UCX_SOCKADDR_TLS_PRIORITY=tcp - export UCX_MEMTYPE_CACHE=y - export UCX_RNDV_SCHEME=get_zcopy - export UCX_RNDV_THRESH=4k - export UCX_ROCM_IPC_MIN_ZCOPY=0 - export HSA_ENABLE_SDMA=1 - export UCX_LOG_LEVEL=info +# vLLM/Nixl-specific environment (UCX transport vars are set at the Docker level in job.slurm) +setup_vllm_env() { export VLLM_USE_V1=1 export 
VLLM_SERVER_DEV_MODE=0 export VLLM_NIXL_SIDE_CHANNEL_HOST=${host_ip} export VLLM_NIXL_SIDE_CHANNEL_PORT=5557 + for env_pair in ${MODEL_ENVS}; do + export "$env_pair" + done } # ============================================================================= @@ -334,10 +324,7 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -le "$xP" ]; then echo "${host_name}:${host_ip} is Prefill Node (Model: ${MODEL_NAME})" echo "Using prefill config: $PREFILL_SERVER_CONFIG" - setup_ucx_env - for env_pair in ${PREFILL_MODEL_ENVS}; do - export "$env_pair" - done + setup_vllm_env PREFILL_CMD="vllm serve ${MODEL_PATH} \ --port $SERVER_PORT \ @@ -387,10 +374,7 @@ else echo "${host_name}:${host_ip} is Decode Node (Model: ${MODEL_NAME})" echo "Using decode config: $DECODE_SERVER_CONFIG" - setup_ucx_env - for env_pair in ${DECODE_MODEL_ENVS}; do - export "$env_pair" - done + setup_vllm_env DECODE_CMD="vllm serve ${MODEL_PATH} \ --port $SERVER_PORT \ From 04ab30daa2fcff8b4ad09a231def2b2ad89d0993 Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Wed, 11 Mar 2026 20:20:51 +0000 Subject: [PATCH 03/19] [AMD] Update vLLM disagg recipe for v0.17.1 NixlConnector API Adapt server.sh to vLLM v0.17.1 breaking changes: - Use simplified kv-transfer-config (side channel via env vars instead of kv_ip/kv_port, add kv_load_failure_policy) - Remove deprecated --disable-log-requests (disabled by default in v0.17) - Route NIXL side channel through RDMA IP for correct fabric path - Fix RIXL ucx_error_handling_mode patch for updated _api.py layout --- benchmarks/multi_node/vllm_disagg_utils/env.sh | 2 +- benchmarks/multi_node/vllm_disagg_utils/server.sh | 12 +++++------- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/benchmarks/multi_node/vllm_disagg_utils/env.sh b/benchmarks/multi_node/vllm_disagg_utils/env.sh index f4340e812..cc9b9320b 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/env.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/env.sh @@ -6,7 +6,7 @@ # Set by runner or auto-detected 
from hostname. # # The Docker image (built from vllm_disagg_inference.ubuntu.amd.Dockerfile) already -# sets LD_LIBRARY_PATH for UCX (/usr/local/ucx/lib) and RIXL (/usr/local/RIXL/install/lib). +# sets LD_LIBRARY_PATH for UCX (/usr/local/ucx/lib) and RIXL (/usr/local/rixl/lib). set -x diff --git a/benchmarks/multi_node/vllm_disagg_utils/server.sh b/benchmarks/multi_node/vllm_disagg_utils/server.sh index 21fe506cb..d90e4b240 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/server.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/server.sh @@ -81,7 +81,7 @@ setup_rdma_env() { nixl_api=$(python3 -c "import rixl._api; print(rixl._api.__file__)" 2>/dev/null) if [[ -n "$nixl_api" ]]; then if ! grep -q 'ucx_error_handling_mode' "$nixl_api"; then - sed -i '/init\["num_threads"\] = str(nixl_conf.num_threads)/a\ init["ucx_error_handling_mode"] = "none"' "$nixl_api" + sed -i '/self\.create_backend(bknd, init)/i\ init["ucx_error_handling_mode"] = "none"' "$nixl_api" echo "[PATCH] Added ucx_error_handling_mode=none to $nixl_api" else echo "[PATCH] ucx_error_handling_mode already set in $nixl_api" @@ -202,8 +202,8 @@ echo "Decode node IPs: ${DECODE_ARGS}" setup_vllm_env() { export VLLM_USE_V1=1 export VLLM_SERVER_DEV_MODE=0 - export VLLM_NIXL_SIDE_CHANNEL_HOST=${host_ip} - export VLLM_NIXL_SIDE_CHANNEL_PORT=5557 + export VLLM_NIXL_SIDE_CHANNEL_HOST=${rdma_ip} + export VLLM_NIXL_SIDE_CHANNEL_PORT=5600 for env_pair in ${MODEL_ENVS}; do export "$env_pair" done @@ -329,8 +329,7 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -le "$xP" ]; then PREFILL_CMD="vllm serve ${MODEL_PATH} \ --port $SERVER_PORT \ --trust-remote-code \ - --disable-log-requests \ - --kv-transfer-config '{\"kv_connector\": \"NixlConnector\", \"engine_id\": \"${ENGINE_ID}\", \"kv_role\": \"kv_producer\", \"kv_parallel_size\": 8, \"kv_rank\": 0, \"kv_buffer_size\": 5000000000, \"kv_buffer_device\": \"cuda\", \"kv_ip\": \"'\"${rdma_ip}\"'\", \"kv_port\": 14600}' \ + --kv-transfer-config '{\"kv_connector\": 
\"NixlConnector\", \"kv_role\": \"kv_producer\", \"kv_load_failure_policy\": \"fail\"}' \ ${PREFILL_SERVER_CONFIG}" if [[ "$DRY_RUN" -eq 1 ]]; then @@ -379,8 +378,7 @@ else DECODE_CMD="vllm serve ${MODEL_PATH} \ --port $SERVER_PORT \ --trust-remote-code \ - --disable-log-requests \ - --kv-transfer-config '{\"kv_connector\": \"NixlConnector\", \"engine_id\": \"${ENGINE_ID}\", \"kv_role\": \"kv_consumer\", \"kv_parallel_size\": 8, \"kv_rank\": 0, \"kv_buffer_size\": 5000000000, \"kv_buffer_device\": \"cuda\", \"kv_ip\": \"'\"${rdma_ip}\"'\", \"kv_port\": 14600}' \ + --kv-transfer-config '{\"kv_connector\": \"NixlConnector\", \"kv_role\": \"kv_consumer\", \"kv_load_failure_policy\": \"fail\"}' \ ${DECODE_SERVER_CONFIG}" if [[ "$DRY_RUN" -eq 1 ]]; then From 99ce774d08e31a1a3d0a36acff19328c08c25415 Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Thu, 12 Mar 2026 12:13:36 +0000 Subject: [PATCH 04/19] [AMD] Make vLLM disagg recipe CI-compatible (mia1 cluster) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit bench.sh: replace `vllm bench serve` (log-only output) with the shared run_benchmark_serving helper from benchmark_lib.sh, matching the SGLang disagg pattern. This produces the .json result files that the multinode CI workflow expects (benchmark-multinode-tmpl.yml → process_result.py). server.sh: make the Nixl ucx_error_handling_mode=none runtime patch conditional on Pensando ionic RDMA devices (IBDEVICES=*ionic*). On the mia1 cluster (ConnectX/mlx5, IBDEVICES=rdma*), UCX handles error mode natively and the patch is skipped. Model-path resolution and IBDEVICES/UCX/QoS auto-detection were verified to already work on mia1 — no changes needed. Tested locally (Job 2802, 1P+2D, ISL/OSL=1024): conc 8 → 507 tok/s conc 32 → 1778 tok/s conc 16 → 1004 tok/s conc 64 → 2480 tok/s All four .json result files produced; 100% external prefix cache hit rate. 
--- .../multi_node/vllm_disagg_utils/bench.sh | 27 ++++++++++--------- .../multi_node/vllm_disagg_utils/server.sh | 23 +++++++++------- 2 files changed, 29 insertions(+), 21 deletions(-) diff --git a/benchmarks/multi_node/vllm_disagg_utils/bench.sh b/benchmarks/multi_node/vllm_disagg_utils/bench.sh index cfe66d460..69a178ca4 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/bench.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/bench.sh @@ -1,6 +1,9 @@ #!/bin/bash # vLLM Disaggregated Benchmark Runner # +# Produces JSON result files via benchmark_serving.py (same as SGLang bench.sh) +# so that the CI pipeline can collect and process results. +# # Usage: bash bench.sh \ # \ # @@ -11,7 +14,6 @@ prefill_gpus=$3 decode_gpus=$4 model_path=$5 model_name=$6 -# Prefer MODEL_PATH from environment (handles HF cache snapshot resolution) MODEL_PATH="${MODEL_PATH:-${model_path}/${model_name}}" log_path=$7 @@ -31,6 +33,10 @@ echo "Config ${chosen_isl}; ${chosen_osl}; ${chosen_concurrencies[0]}; ${chosen_ profile_folder="${log_path}/vllm_isl_${chosen_isl}_osl_${chosen_osl}" mkdir -p "$profile_folder" +source "$(dirname "$0")/../../benchmark_lib.sh" + +REPO_ROOT="$(cd "$(dirname "$0")/../../.." 
&& pwd)" + for max_concurrency in "${chosen_concurrencies[@]}"; do export_file="${profile_folder}/concurrency_${max_concurrency}_req_rate_${chosen_req_rate}_gpus_$((prefill_gpus+decode_gpus))_ctx_${prefill_gpus}_gen_${decode_gpus}" @@ -50,21 +56,18 @@ for max_concurrency in "${chosen_concurrencies[@]}"; do echo "num_prompts: $num_prompts" echo "export_file: $export_file" - vllm bench serve \ + run_benchmark_serving \ + --bench-serving-dir "$REPO_ROOT" \ --model "$MODEL_PATH" \ - --backend vllm \ - --host 127.0.0.1 \ --port "$ROUTER_PORT" \ - --dataset-name "random" \ - --random-input-len "$chosen_isl" \ - --random-output-len "$chosen_osl" \ - --random-prefix-len 0 \ + --backend openai \ + --input-len "$chosen_isl" \ + --output-len "$chosen_osl" \ + --random-range-ratio "$random_range_ratio" \ --num-prompts "$num_prompts" \ - --request-rate "$chosen_req_rate" \ - --ignore-eos \ --max-concurrency "$max_concurrency" \ - 2>&1 | tee "${export_file}.log" + --result-filename "$export_file" \ + --result-dir /workspace/ - sleep 5 echo "-----------------------------------------" done diff --git a/benchmarks/multi_node/vllm_disagg_utils/server.sh b/benchmarks/multi_node/vllm_disagg_utils/server.sh index d90e4b240..933019abe 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/server.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/server.sh @@ -75,17 +75,22 @@ setup_rdma_env() { fi # Patch Nixl UCX backend: set ucx_error_handling_mode=none. - # Pensando ionic NICs don't support rdmacm, so the default + # Only needed for Pensando ionic NICs which don't support rdmacm — the default # UCP_ERR_HANDLING_MODE_PEER causes "no active messages transport" errors. - local nixl_api - nixl_api=$(python3 -c "import rixl._api; print(rixl._api.__file__)" 2>/dev/null) - if [[ -n "$nixl_api" ]]; then - if ! 
grep -q 'ucx_error_handling_mode' "$nixl_api"; then - sed -i '/self\.create_backend(bknd, init)/i\ init["ucx_error_handling_mode"] = "none"' "$nixl_api" - echo "[PATCH] Added ucx_error_handling_mode=none to $nixl_api" - else - echo "[PATCH] ucx_error_handling_mode already set in $nixl_api" + # ConnectX/mlx5 NICs (mia1 cluster) handle error mode properly; skip the patch. + if [[ "${IBDEVICES:-}" == *ionic* ]]; then + local nixl_api + nixl_api=$(python3 -c "import rixl._api; print(rixl._api.__file__)" 2>/dev/null) + if [[ -n "$nixl_api" ]]; then + if ! grep -q 'ucx_error_handling_mode' "$nixl_api"; then + sed -i '/self\.create_backend(bknd, init)/i\ init["ucx_error_handling_mode"] = "none"' "$nixl_api" + echo "[PATCH] Added ucx_error_handling_mode=none to $nixl_api" + else + echo "[PATCH] ucx_error_handling_mode already set in $nixl_api" + fi fi + else + echo "[INFO] Non-ionic RDMA devices (${IBDEVICES:-unset}); skipping ucx_error_handling_mode patch" fi } From d16bd211eadfff0c67d0e45965033e9ff650b1b4 Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Thu, 12 Mar 2026 13:46:47 +0000 Subject: [PATCH 05/19] [AMD] Co-locate vLLM disagg router with prefill on NODE_RANK=0 Move the vllm-router from a dedicated proxy node onto the first prefill node, mirroring SGLang's co-location pattern. This reduces the node count from xP + yD + 1 to xP + yD (e.g., 3 nodes instead of 4 for 1P+2D). 
- server.sh: NODE_RANK=0 now runs both vllm serve (prefill, port 2584) and vllm-router (port 30000); barrier waits on all nodes - submit.sh / job.slurm: NUM_NODES = PREFILL_NODES + DECODE_NODES - bench.sh: ROUTER_PORT default updated to 30000 Local 1P+2D benchmark (ISL/OSL=1024, DeepSeek-R1 FP8, MI355X): - Throughput: +1.6% to +8.4% across concurrency 8-64 - Mean TTFT: -22% to -63% (prefill is local to router) - TPOT/ITL: unchanged (within noise) - 25% fewer nodes, no performance regression --- .github/configs/amd-master.yaml | 2 +- .../multi_node/vllm_disagg_utils/bench.sh | 2 +- .../multi_node/vllm_disagg_utils/job.slurm | 10 ++-- .../multi_node/vllm_disagg_utils/server.sh | 49 ++++++++++++++----- .../multi_node/vllm_disagg_utils/submit.sh | 10 ++-- 5 files changed, 48 insertions(+), 25 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 5c9a7c1ec..04772e8b6 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1041,7 +1041,7 @@ dsr1-fp8-mi355x-vllm-disagg: - isl: 1024 osl: 1024 search-space: - # 1P2D: 1 prefill node + 2 decode nodes + 1 proxy = 4 nodes total + # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total - spec-decoding: "none" conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] prefill: diff --git a/benchmarks/multi_node/vllm_disagg_utils/bench.sh b/benchmarks/multi_node/vllm_disagg_utils/bench.sh index 69a178ca4..37b9d0b56 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/bench.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/bench.sh @@ -26,7 +26,7 @@ num_prompts_multiplier=${13:-10} IFS='x' read -r -a chosen_concurrencies <<< "$concurrency_list" -ROUTER_PORT="${ROUTER_PORT:-2584}" +ROUTER_PORT="${ROUTER_PORT:-30000}" echo "Config ${chosen_isl}; ${chosen_osl}; ${chosen_concurrencies[0]}; ${chosen_req_rate}" diff --git a/benchmarks/multi_node/vllm_disagg_utils/job.slurm b/benchmarks/multi_node/vllm_disagg_utils/job.slurm index 494ef6901..7b25fd4b5 
100644 --- a/benchmarks/multi_node/vllm_disagg_utils/job.slurm +++ b/benchmarks/multi_node/vllm_disagg_utils/job.slurm @@ -1,7 +1,7 @@ #!/bin/bash #SBATCH --job-name=vllm-pd-bench -#SBATCH -N 4 # CHECK this to be right in batch jobs -#SBATCH -n 4 # CHECK this to be right in batch jobs +#SBATCH -N 3 # Overridden by submit.sh -N flag +#SBATCH -n 3 # Overridden by submit.sh -n flag #SBATCH --ntasks-per-node=1 #SBATCH --spread-job #SBATCH --gres=gpu:8 @@ -127,9 +127,9 @@ echo "Final MODEL_PATH: $MODEL_PATH" # Node Selection and vLLM-Specific NUM_NODES # ============================================================================= -# vLLM needs xP + yD + 1 (dedicated proxy node) -NUM_NODES=$((xP + yD + 1)) -echo "NUM_NODES: $NUM_NODES (xP=$xP + yD=$yD + 1 proxy)" +# Router co-located with first prefill: xP + yD nodes total (same as SGLang) +NUM_NODES=$((xP + yD)) +echo "NUM_NODES: $NUM_NODES (xP=$xP + yD=$yD, proxy co-located with first prefill)" FULL_NODELIST=$(scontrol show hostnames "$SLURM_JOB_NODELIST") SELECTED_NODES=$(echo "$FULL_NODELIST" | head -n $NUM_NODES) diff --git a/benchmarks/multi_node/vllm_disagg_utils/server.sh b/benchmarks/multi_node/vllm_disagg_utils/server.sh index 933019abe..8447046c1 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/server.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/server.sh @@ -3,9 +3,11 @@ # ============================================================================= # # Node role assignment (by NODE_RANK): -# 0 -> Proxy/Router node -# 1..xP -> Prefill nodes (kv_producer) -# xP+1..xP+yD -> Decode nodes (kv_consumer) +# 0 -> Proxy/Router + first Prefill node (kv_producer) +# 1..xP-1 -> Additional Prefill nodes (kv_producer) +# xP..xP+yD-1 -> Decode nodes (kv_consumer) +# +# Total nodes = xP + yD (router co-located with first prefill, like SGLang). 
# ============================================================================= # Environment Configuration @@ -32,7 +34,7 @@ BENCH_MAX_CONCURRENCY="${BENCH_MAX_CONCURRENCY:-512}" DRY_RUN="${DRY_RUN:-0}" GPUS_PER_NODE="${GPUS_PER_NODE:-8}" -ROUTER_PORT="${ROUTER_PORT:-2584}" +ROUTER_PORT="${ROUTER_PORT:-30000}" SERVER_PORT="${SERVER_PORT:-2584}" ENGINE_ID="${ENGINE_ID:-${MODEL_NAME}-pd-run}" @@ -192,11 +194,11 @@ IFS=',' read -ra IP_ARRAY <<< "$IPADDRS" PREFILL_ARGS="" DECODE_ARGS="" -for ((i=1; i<=xP && i<${#IP_ARRAY[@]}; i++)); do +for ((i=0; i&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log & + set +x + prefill_pid=$! + fi + echo "Waiting for all prefill and decode servers to be up . . ." python3 $VLLM_WS_PATH/sync.py barrier \ - --node-ips ${PD_IPADDRS} \ + --node-ips ${IPADDRS} \ --node-ports $SERVER_PORT \ --wait-for-all-ports \ --timeout 1800 @@ -322,11 +342,14 @@ if [ "$NODE_RANK" -eq 0 ]; then echo "Copied results to $LOGS_OUTPUT/slurm_job-${SLURM_JOB_ID}" fi - echo "Killing the proxy server" - [[ "$DRY_RUN" -eq 0 ]] && kill $proxy_pid + echo "Killing the proxy server and prefill server" + if [[ "$DRY_RUN" -eq 0 ]]; then + kill $proxy_pid + kill $prefill_pid + fi -elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -le "$xP" ]; then - echo "${host_name}:${host_ip} is Prefill Node (Model: ${MODEL_NAME})" +elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$xP" ]; then + echo "${host_name}:${host_ip} is Additional Prefill Node (Model: ${MODEL_NAME})" echo "Using prefill config: $PREFILL_SERVER_CONFIG" setup_vllm_env diff --git a/benchmarks/multi_node/vllm_disagg_utils/submit.sh b/benchmarks/multi_node/vllm_disagg_utils/submit.sh index a41a31d79..d60ed87e6 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/submit.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/submit.sh @@ -5,8 +5,8 @@ # This script submits a multi-node vLLM disaggregated benchmark job to SLURM. # It must be configured for your specific cluster before use. 
# -# Key difference from SGLang: vLLM uses a dedicated proxy node, so -# NUM_NODES = PREFILL_NODES + DECODE_NODES + 1. +# Router is co-located with the first prefill node (same as SGLang), so +# NUM_NODES = PREFILL_NODES + DECODE_NODES. usage() { cat << 'USAGE' @@ -67,8 +67,8 @@ CONCURRENCIES=$7 REQUEST_RATE=$8 NODE_LIST=${9} -# vLLM needs xP + yD + 1 nodes (dedicated proxy node) -NUM_NODES=$((PREFILL_NODES + DECODE_NODES + 1)) +# Router co-located with first prefill: xP + yD nodes total +NUM_NODES=$((PREFILL_NODES + DECODE_NODES)) profiler_args="${ISL} ${OSL} ${CONCURRENCIES} ${REQUEST_RATE}" # Export variables for the SLURM job @@ -77,7 +77,7 @@ export DOCKER_IMAGE_NAME=$CONTAINER_IMAGE export PROFILER_ARGS=$profiler_args # For vLLM, each worker = 1 node (TP=8 per node). -# xP/yD must match the node counts so job.slurm's NUM_NODES = xP+yD+1 is correct. +# xP/yD must match the node counts so NUM_NODES = xP+yD is correct. export xP=$PREFILL_NODES export yD=$DECODE_NODES export NUM_NODES=$NUM_NODES From cf4b88cc4ee1144fc4047495a764406f89e7d2de Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Thu, 12 Mar 2026 17:31:07 +0000 Subject: [PATCH 06/19] [AMD] Use public vLLM base image with runtime dependency install MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the custom Docker image (vllm_disagg_pd:latest) with the public vllm/vllm-openai-rocm:v0.17.1 base image. Missing components (UCX, RIXL, etcd, libionic1, vllm-router) are now installed at container start via setup_deps.sh, which is sourced by server.sh. This eliminates the need to build, host, and maintain a custom image — CI nodes can pull directly from Docker Hub. Changes: - Add setup_deps.sh: idempotent installer for UCX (ROCm fork), RIXL, etcd, libionic1 (Pensando ionic), and vllm-router (NODE_RANK=0 only). Build steps run in subshells to avoid CWD pollution. 
- server.sh: source setup_deps.sh before any other logic - job.slurm: add --entrypoint "" to override the base image's vllm CLI entrypoint, allowing bash -lc to work correctly - env.sh: update comment (paths now set by setup_deps.sh, not image ENV) - amd-master.yaml: image changed to vllm/vllm-openai-rocm:v0.17.1 Tested locally (Job 2807, 3 nodes, ISL/OSL=1024): Setup overhead: ~2.5 min per node (all components built from source) Benchmark completed successfully across concurrency 8/16/32/64 --- .github/configs/amd-master.yaml | 2 +- .../multi_node/vllm_disagg_utils/env.sh | 4 +- .../multi_node/vllm_disagg_utils/job.slurm | 1 + .../multi_node/vllm_disagg_utils/server.sh | 5 + .../vllm_disagg_utils/setup_deps.sh | 186 ++++++++++++++++++ 5 files changed, 195 insertions(+), 3 deletions(-) create mode 100644 benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 04772e8b6..f36b23795 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1029,7 +1029,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: dsr1-fp8-mi355x-vllm-disagg: - image: vllm_disagg_pd:latest + image: vllm/vllm-openai-rocm:v0.17.1 model: deepseek-ai/DeepSeek-R1-0528 model-prefix: dsr1 runner: mi355x-disagg diff --git a/benchmarks/multi_node/vllm_disagg_utils/env.sh b/benchmarks/multi_node/vllm_disagg_utils/env.sh index cc9b9320b..e1cc2f6af 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/env.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/env.sh @@ -5,8 +5,8 @@ # IBDEVICES - RDMA/InfiniBand device names (e.g., ionic_0,ionic_1,... or mlx5_0,mlx5_1,...) # Set by runner or auto-detected from hostname. # -# The Docker image (built from vllm_disagg_inference.ubuntu.amd.Dockerfile) already -# sets LD_LIBRARY_PATH for UCX (/usr/local/ucx/lib) and RIXL (/usr/local/rixl/lib). 
+# UCX and RIXL paths (LD_LIBRARY_PATH, PATH) are set by setup_deps.sh, which is +# sourced at the top of server.sh before this file. set -x diff --git a/benchmarks/multi_node/vllm_disagg_utils/job.slurm b/benchmarks/multi_node/vllm_disagg_utils/job.slurm index 7b25fd4b5..3a71436fe 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/job.slurm +++ b/benchmarks/multi_node/vllm_disagg_utils/job.slurm @@ -305,6 +305,7 @@ exec sudo docker run --rm \ -e UCX_LOG_LEVEL=warn \ -e HSA_ENABLE_SDMA=1 \ --name \"$DOCKER_CONT_NAME\" \ + --entrypoint \"\" \ \"$DOCKER_IMAGE_NAME\" bash -lc ' mkdir -p /run_logs/slurm_job-'\"\$SLURM_JOB_ID\"' '"$RUN_FILE_FULL"' 2>&1 | tee /run_logs/slurm_job-'\"\$SLURM_JOB_ID\"'/server_\$(hostname).log diff --git a/benchmarks/multi_node/vllm_disagg_utils/server.sh b/benchmarks/multi_node/vllm_disagg_utils/server.sh index 8447046c1..efabf5e32 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/server.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/server.sh @@ -9,6 +9,11 @@ # # Total nodes = xP + yD (router co-located with first prefill, like SGLang). +# ============================================================================= +# Dependency Setup (idempotent; required when using base vLLM image) +# ============================================================================= +source "$(dirname "${BASH_SOURCE[0]}")/setup_deps.sh" + # ============================================================================= # Environment Configuration # ============================================================================= diff --git a/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh b/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh new file mode 100644 index 000000000..ee2524979 --- /dev/null +++ b/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh @@ -0,0 +1,186 @@ +#!/bin/bash +# ============================================================================= +# setup_deps.sh — Install missing vLLM disagg dependencies at container start. 
+# +# Base image: vllm/vllm-openai-rocm:v0.17.1 +# Sourced by server.sh so PATH / LD_LIBRARY_PATH exports persist. +# Idempotent: each component is skipped if already present. +# +# Build steps run in subshells to avoid CWD pollution between installers. +# ============================================================================= + +ROCM_PATH="${ROCM_PATH:-/opt/rocm}" +UCX_HOME="${UCX_HOME:-/usr/local/ucx}" +RIXL_HOME="${RIXL_HOME:-/usr/local/rixl}" + +_SETUP_START=$(date +%s) +_SETUP_INSTALLED=() + +# --------------------------------------------------------------------------- +# 1. UCX (ROCm fork — required for GPU-direct RDMA via Nixl) +# --------------------------------------------------------------------------- +install_ucx() { + if [[ -x "${UCX_HOME}/bin/ucx_info" ]]; then + echo "[SETUP] UCX already present at ${UCX_HOME}" + return 0 + fi + + echo "[SETUP] Installing UCX build dependencies..." + apt-get update -q -y && apt-get install -q -y \ + autoconf automake libtool pkg-config \ + librdmacm-dev rdmacm-utils libibverbs-dev ibverbs-utils ibverbs-providers \ + infiniband-diags perftest ethtool rdma-core strace \ + && rm -rf /var/lib/apt/lists/* + + echo "[SETUP] Building UCX from source (ROCm/ucx @ da3fac2a)..." + ( + set -e + mkdir -p /usr/local/src && cd /usr/local/src + git clone --quiet https://github.com/ROCm/ucx.git && cd ucx + git checkout da3fac2a + ./autogen.sh && mkdir -p build && cd build + ../configure \ + --prefix="${UCX_HOME}" \ + --enable-shared --disable-static \ + --disable-doxygen-doc --enable-optimizations \ + --enable-devel-headers --enable-mt \ + --with-rocm="${ROCM_PATH}" --with-verbs --with-dm + make -j"$(nproc)" && make install + ) + rm -rf /usr/local/src/ucx + + if [[ ! -x "${UCX_HOME}/bin/ucx_info" ]]; then + echo "[SETUP] ERROR: UCX build failed"; exit 1 + fi + _SETUP_INSTALLED+=("UCX") +} + +# --------------------------------------------------------------------------- +# 2. 
RIXL (ROCm fork of NIXL — KV cache transfer for disaggregated vLLM) +# --------------------------------------------------------------------------- +install_rixl() { + if python3 -c "import rixl" 2>/dev/null; then + echo "[SETUP] RIXL Python bindings already present" + return 0 + fi + + echo "[SETUP] Installing RIXL build dependencies..." + apt-get update -q -y && apt-get install -q -y \ + libgrpc-dev libgrpc++-dev libprotobuf-dev protobuf-compiler-grpc \ + libcpprest-dev libaio-dev \ + && rm -rf /var/lib/apt/lists/* + pip3 install --quiet meson "pybind11[global]" + + echo "[SETUP] Building RIXL from source (ROCm/RIXL @ f33a5599)..." + ( + set -e + git clone --quiet https://github.com/ROCm/RIXL.git /opt/rixl && cd /opt/rixl + git checkout f33a5599 + meson setup build --prefix="${RIXL_HOME}" \ + -Ducx_path="${UCX_HOME}" \ + -Drocm_path="${ROCM_PATH}" + cd build && ninja && ninja install + cd /opt/rixl + pip install --quiet \ + --config-settings=setup-args="-Drocm_path=${ROCM_PATH}" \ + --config-settings=setup-args="-Ducx_path=${UCX_HOME}" . + ) + rm -rf /opt/rixl + + if ! python3 -c "import rixl" 2>/dev/null; then + echo "[SETUP] ERROR: RIXL build failed"; exit 1 + fi + _SETUP_INSTALLED+=("RIXL") +} + +# --------------------------------------------------------------------------- +# 3. etcd (distributed KV store for vLLM disagg service discovery) +# --------------------------------------------------------------------------- +install_etcd() { + if [[ -x /usr/local/bin/etcd/etcd ]]; then + echo "[SETUP] etcd already present" + return 0 + fi + + local version="v3.6.0-rc.5" + echo "[SETUP] Downloading etcd ${version}..." 
+ wget -q "https://github.com/etcd-io/etcd/releases/download/${version}/etcd-${version}-linux-amd64.tar.gz" \ + -O /tmp/etcd.tar.gz + mkdir -p /usr/local/bin/etcd + tar -xf /tmp/etcd.tar.gz -C /usr/local/bin/etcd --strip-components=1 + rm /tmp/etcd.tar.gz + _SETUP_INSTALLED+=("etcd") +} + +# --------------------------------------------------------------------------- +# 4. libionic1 (Pensando ionic RDMA verbs provider for RoCEv2 KV transfer) +# Harmless on non-Pensando nodes (shared lib is simply unused). +# --------------------------------------------------------------------------- +install_libionic() { + if dpkg -l libionic1 2>/dev/null | grep -q '^ii'; then + echo "[SETUP] libionic1 already installed" + return 0 + fi + + echo "[SETUP] Downloading and installing libionic1..." + wget -q "https://repo.radeon.com/amdainic/pensando/ubuntu/1.117.5/pool/main/r/rdma-core/libionic1_54.0-149.g3304be71_amd64.deb" \ + -O /tmp/libionic1.deb + dpkg -i /tmp/libionic1.deb || true + rm -f /tmp/libionic1.deb + _SETUP_INSTALLED+=("libionic1") +} + +# --------------------------------------------------------------------------- +# 5. vllm-router (Rust-based proxy for PD disaggregation) +# Only needed on NODE_RANK=0 (proxy node). +# --------------------------------------------------------------------------- +install_vllm_router() { + if pip show vllm-router &>/dev/null; then + echo "[SETUP] vllm-router already installed" + return 0 + fi + + echo "[SETUP] Installing Rust toolchain..." + if ! command -v cargo &>/dev/null; then + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y + export PATH="/root/.cargo/bin:${PATH}" + fi + + echo "[SETUP] Installing vllm-router via pip..." + pip install --quiet vllm-router + + if ! 
pip show vllm-router &>/dev/null; then + echo "[SETUP] ERROR: vllm-router install failed"; exit 1 + fi + _SETUP_INSTALLED+=("vllm-router") +} + +# ============================================================================= +# Run installers +# ============================================================================= + +install_ucx +install_rixl +install_etcd +install_libionic + +if [[ "${NODE_RANK:-0}" -eq 0 ]]; then + install_vllm_router +fi + +# ============================================================================= +# Export paths (persists for server.sh since this file is sourced) +# ============================================================================= + +export ROCM_PATH="${ROCM_PATH}" +export UCX_HOME="${UCX_HOME}" +export RIXL_HOME="${RIXL_HOME}" +export PATH="${UCX_HOME}/bin:/usr/local/bin/etcd:/root/.cargo/bin:${PATH}" +export LD_LIBRARY_PATH="${UCX_HOME}/lib:${RIXL_HOME}/lib:${RIXL_HOME}/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH:-}" + +_SETUP_END=$(date +%s) +if [[ ${#_SETUP_INSTALLED[@]} -eq 0 ]]; then + echo "[SETUP] All dependencies already present (${_SETUP_END}s wallclock)" +else + echo "[SETUP] Installed: ${_SETUP_INSTALLED[*]} in $(( _SETUP_END - _SETUP_START ))s" +fi From 1b46ce56cbd97381e697cfc3ecf5dfcb6388cf7b Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Fri, 13 Mar 2026 14:19:12 +0000 Subject: [PATCH 07/19] [AMD] Enable Expert Parallelism with MoRI all-to-all on vLLM disagg decode Enable MoRI-based Expert Parallelism (--enable-expert-parallel --all2all-backend mori) on decode workers for DeepSeek-R1-0528, while keeping TP=8 to preserve KV cache transfer compatibility with the prefill node via NixlConnector. This matches SGLang's approach of TP=8 + EP within the TP group. 
KV Transfer: RIXL/NixlConnector (unchanged) MoE All-to-All: NCCL (default) -> MoRI-EP (--all2all-backend mori) Changes: - models.yaml: Add --enable-expert-parallel --all2all-backend mori to decode_flags; increase engine ready timeout to 1200s - setup_deps.sh: Add MoRI install and vLLM v0.17.1 patches for MoRI-EP + FP8 compatibility (AITER assertion, defer_input_quant) - server.sh: Support decode_env from models.yaml for decode-specific environment overrides - dsr1_fp8_mi355x_vllm-disagg.sh: Pass NODELIST to submit.sh for Slurm node constraints --- .../multi_node/dsr1_fp8_mi355x_vllm-disagg.sh | 4 +- .../multi_node/vllm_disagg_utils/models.yaml | 4 +- .../multi_node/vllm_disagg_utils/server.sh | 7 ++ .../vllm_disagg_utils/setup_deps.sh | 85 +++++++++++++++++++ 4 files changed, 96 insertions(+), 4 deletions(-) diff --git a/benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh b/benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh index a457a2714..167aff5f3 100755 --- a/benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh +++ b/benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh @@ -30,14 +30,14 @@ export MODEL_PATH=$MODEL_PATH export MODEL_NAME=$MODEL_NAME export CONTAINER_IMAGE=$IMAGE -# vLLM disagg uses TP-only parallelism (no EP/DP). # PREFILL_NODES and DECODE_NODES come from additional-settings in the YAML config. +# NODELIST (optional) constrains which Slurm nodes are used. JOB_ID=$(bash ./submit.sh $PREFILL_NODES \ $PREFILL_NUM_WORKERS \ $DECODE_NODES \ $DECODE_NUM_WORKERS \ - $ISL $OSL "${CONC_LIST// /x}" inf) + $ISL $OSL "${CONC_LIST// /x}" inf "${NODELIST:-}") if [[ $? 
-ne 0 ]]; then echo "Failed to submit job" >&2 diff --git a/benchmarks/multi_node/vllm_disagg_utils/models.yaml b/benchmarks/multi_node/vllm_disagg_utils/models.yaml index 31197ec52..4a720785a 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/models.yaml +++ b/benchmarks/multi_node/vllm_disagg_utils/models.yaml @@ -31,8 +31,8 @@ DeepSeek-V3: DeepSeek-R1-0528: prefill_flags: "--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1" - decode_flags: "--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1" - env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0" + decode_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1" + env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0 VLLM_ENGINE_READY_TIMEOUT_S=1200" hf_dir: "models--deepseek-ai--DeepSeek-R1-0528" gpt-oss-120b: diff --git a/benchmarks/multi_node/vllm_disagg_utils/server.sh b/benchmarks/multi_node/vllm_disagg_utils/server.sh index efabf5e32..7778dfd34 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/server.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/server.sh @@ -142,9 +142,11 @@ def bash_escape(s): pf = bash_escape(m.get('prefill_flags', '--tensor-parallel-size 8')) df = bash_escape(m.get('decode_flags', '--tensor-parallel-size 8')) ev = bash_escape(m.get('env', '')) +dev = bash_escape(m.get('decode_env', '')) print(f'PREFILL_SERVER_CONFIG=\"{pf}\"') print(f'DECODE_SERVER_CONFIG=\"{df}\"') print(f'MODEL_ENVS=\"{ev}\"') +print(f'DECODE_MODEL_ENVS=\"{dev}\"') ")" echo "Loaded model configuration for: $MODEL_NAME" @@ -408,6 +410,11 @@ else setup_vllm_env + for env_pair in 
${DECODE_MODEL_ENVS}; do + export "$env_pair" + echo "[DECODE_ENV] $env_pair" + done + DECODE_CMD="vllm serve ${MODEL_PATH} \ --port $SERVER_PORT \ --trust-remote-code \ diff --git a/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh b/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh index ee2524979..8e2276d1c 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh @@ -155,6 +155,89 @@ install_vllm_router() { _SETUP_INSTALLED+=("vllm-router") } +# --------------------------------------------------------------------------- +# 6. MoRI (Modular RDMA Interface — EP dispatch/combine kernels for MoE) +# Required for --all2all-backend mori (Expert Parallelism via RDMA). +# GPU kernels are JIT-compiled on first use; no hipcc needed at install. +# --------------------------------------------------------------------------- +install_mori() { + if python3 -c "import mori" 2>/dev/null; then + echo "[SETUP] MoRI Python bindings already present" + return 0 + fi + + echo "[SETUP] Installing MoRI build dependencies..." + apt-get update -q -y && apt-get install -q -y \ + libopenmpi-dev openmpi-bin libpci-dev \ + && rm -rf /var/lib/apt/lists/* + + echo "[SETUP] Building MoRI from source (ROCm/mori @ b645fc8)..." + ( + set -e + git clone --quiet https://github.com/ROCm/mori.git /opt/mori && cd /opt/mori + git checkout b645fc8 + pip install --quiet . + ) + rm -rf /opt/mori + + if ! python3 -c "import mori" 2>/dev/null; then + echo "[SETUP] ERROR: MoRI build failed"; exit 1 + fi + _SETUP_INSTALLED+=("MoRI") +} + +# --------------------------------------------------------------------------- +# 7. Patch vLLM v0.17.1 MoRI-EP + FP8 incompatibility +# v0.17.1 asserts MoRI requires AITER fused_moe, but AITER's FP8 kernel +# uses defer_input_quant=True which MoRI's prepare/finalize rejects. 
+# Patch: remove both the AITER requirement assertion and the +# defer_input_quant NotImplementedError so non-AITER kernels work. +# --------------------------------------------------------------------------- +patch_mori_fp8_compat() { + python3 -c ' +import re, os, sys +patched = [] + +# 1. Patch layer.py: remove multi-line AITER assertion for MoRI +try: + import vllm.model_executor.layers.fused_moe.layer as lm + f = lm.__file__ + src = open(f).read() + if "Mori needs to be used with aiter" in src: + new = re.sub( + r"assert self\.rocm_aiter_fmoe_enabled,\s*\([^)]*Mori needs[^)]*\)", + "pass # [PATCHED] AITER requirement removed for MoRI-EP + FP8", + src, flags=re.DOTALL) + if new != src: + open(f, "w").write(new) + patched.append("layer.py") +except Exception as e: + print(f"[SETUP] WARN patch layer.py: {e}", file=sys.stderr) + +# 2. Patch mori_prepare_finalize.py: remove defer_input_quant restriction +try: + import vllm.model_executor.layers.fused_moe.mori_prepare_finalize as mm + f = mm.__file__ + src = open(f).read() + if "defer_input_quant" in src: + new = re.sub( + r"raise NotImplementedError\([^)]*defer_input_quant[^)]*\)", + "pass # [PATCHED] defer_input_quant check removed for MoRI-EP + FP8", + src) + if new != src: + open(f, "w").write(new) + patched.append("mori_prepare_finalize.py") +except Exception as e: + print(f"[SETUP] WARN patch mori_pf: {e}", file=sys.stderr) + +if patched: + print(f"[SETUP] Patched: {chr(44).join(patched)}") +else: + print("[SETUP] No MoRI-FP8 patches needed") +' + _SETUP_INSTALLED+=("MoRI-FP8-patch") +} + # ============================================================================= # Run installers # ============================================================================= @@ -163,6 +246,8 @@ install_ucx install_rixl install_etcd install_libionic +install_mori +patch_mori_fp8_compat if [[ "${NODE_RANK:-0}" -eq 0 ]]; then install_vllm_router From 585ddb4b9dec2ee4933c77ce9a4840e4feba11af Mon Sep 17 00:00:00 2001 From: Chun 
Fang Date: Fri, 13 Mar 2026 23:25:36 +0000 Subject: [PATCH 08/19] [AMD] Switch vLLM disagg KV transfer to MoRI-IO with protocol-aware proxy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace NixlConnector with MoRIIOConnector for KV cache transfer and replace the Rust-based vllm-router with a MoRI-IO-aware Python proxy that handles both HTTP routing and ZMQ-based RDMA endpoint discovery. The key architectural change is that the proxy enriches each request's kv_transfer_params with remote RDMA endpoint info (handshake_port, notify_port, host, port) before dispatching, enabling concurrent prefill+decode in WRITE mode — something vllm-router could not do because it only understands HTTP, not the MoRI-IO registration protocol. Changes: - Add moriio_proxy.py: MoRI-IO-aware proxy with ZMQ service discovery, request enrichment, and /health endpoint (adapted from vLLM upstream moriio_toy_proxy_server.py) - server.sh: switch --kv-transfer-config from NixlConnector to MoRIIOConnector with kv_connector_extra_config (proxy_ip, proxy_ping_port, http_port); launch proxy before prefill on NODE_RANK=0; set VLLM_DISABLE_REQUEST_ID_RANDOMIZATION=1 as workaround for v0.17.1 completion-ID mismatch (upstream fix: vllm-project/vllm#34907) - setup_deps.sh: replace vllm-router/Rust install with lightweight Python deps (quart, aiohttp, msgpack, pyzmq) for the proxy Benchmark (Job 2853 vs 2818 NixlConnector baseline, ISL/OSL=1024): TTFT median: -37% to -55% across C8–C64 (e.g. 
384→241ms @C64) TTFT p99: -63% at C64 (6622→2469ms) Throughput: +8% at C64 (2634→2844 tok/s) TPOT: unchanged (~22ms @C64) --- .../vllm_disagg_utils/moriio_proxy.py | 309 ++++++++++++++++++ .../multi_node/vllm_disagg_utils/server.sh | 87 ++--- .../vllm_disagg_utils/setup_deps.sh | 29 +- 3 files changed, 358 insertions(+), 67 deletions(-) create mode 100644 benchmarks/multi_node/vllm_disagg_utils/moriio_proxy.py diff --git a/benchmarks/multi_node/vllm_disagg_utils/moriio_proxy.py b/benchmarks/multi_node/vllm_disagg_utils/moriio_proxy.py new file mode 100644 index 000000000..82272dd52 --- /dev/null +++ b/benchmarks/multi_node/vllm_disagg_utils/moriio_proxy.py @@ -0,0 +1,309 @@ +#!/usr/bin/env python3 +# MoRI-IO proxy server for vLLM PD disaggregation. +# +# Based on vLLM's examples/online_serving/disaggregated_serving/moriio_toy_proxy_server.py +# with the following adaptations for production multi-node use: +# - Ports configurable via PROXY_HTTP_PORT / PROXY_PING_PORT env vars +# - /health endpoint for sync.py barrier readiness checks +# - Uses stdlib `re` instead of `regex` to avoid extra dep +# +# The proxy performs two roles that vllm-router cannot: +# 1. ZMQ service discovery — prefill/decode workers register their RDMA ports +# 2. 
Request enrichment — injects remote endpoint info into kv_transfer_params + +import asyncio +import copy +import logging +import os +import re +import socket +import threading +import uuid + +import aiohttp +import msgpack +import zmq +from quart import Quart, make_response, request + +logger = logging.getLogger("moriio_proxy") +logger.setLevel(logging.DEBUG) +handler = logging.StreamHandler() +handler.setFormatter(logging.Formatter( + "%(asctime)s %(levelname)s [%(name)s] %(message)s")) +logger.addHandler(handler) + +prefill_instances: list[dict] = [] +decode_instances: list[dict] = [] +request_nums = 0 +app = Quart(__name__) + +IP_PORT_PATTERN = re.compile(r"//(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d+)") + +TRANSFER_TYPE = None + + +def _append_whole_dict_unique(target_list, data_dict): + new_filtered = {k: v for k, v in data_dict.items() if k != "index"} + for existed in target_list: + existed_filtered = {k: v for k, v in existed.items() if k != "index"} + if existed_filtered == new_filtered: + return False + logger.info("Registered instance: role=%s addr=%s hs_port=%s notify=%s dp=%s tp=%s", + data_dict.get("role"), data_dict.get("request_address"), + data_dict.get("handshake_port"), data_dict.get("notify_port"), + data_dict.get("dp_size"), data_dict.get("tp_size")) + target_list.append(data_dict) + transfer_mode = data_dict.get("transfer_mode", "unknown") + global TRANSFER_TYPE + + if TRANSFER_TYPE is None: + TRANSFER_TYPE = transfer_mode + logger.info("Transfer mode set to: %s", TRANSFER_TYPE) + elif transfer_mode != TRANSFER_TYPE: + raise ValueError(f"mismatched transfer mode {TRANSFER_TYPE} vs {transfer_mode}") + + return True + + +_list_lock = threading.RLock() + + +def _listen_for_register(hostname, port): + context = zmq.Context() + router_socket = context.socket(zmq.ROUTER) + router_socket.bind(f"tcp://{hostname}:{port}") + poller = zmq.Poller() + poller.register(router_socket, zmq.POLLIN) + global prefill_instances + global decode_instances + + while 
True: + socks = dict(poller.poll()) + if router_socket in socks: + remote_addr, msg = router_socket.recv_multipart() + data = msgpack.loads(msg) + if data["type"] == "HELLO": + pass + elif ( + data["type"] == "register" + and data["role"] == "P" + and data["request_address"] not in prefill_instances + ): + with _list_lock: + _append_whole_dict_unique(prefill_instances, data) + + elif ( + data["type"] == "register" + and data["role"] == "D" + and data["request_address"] not in decode_instances + ): + with _list_lock: + _append_whole_dict_unique(decode_instances, data) + + +def start_service_discovery(hostname, port): + if not hostname: + hostname = socket.gethostname() + if port == 0: + raise ValueError("Port cannot be 0") + + _listener_thread = threading.Thread( + target=_listen_for_register, args=(hostname, port), daemon=True + ) + _listener_thread.start() + logger.info("Service discovery listening on %s:%s", hostname, port) + return _listener_thread + + +async def send_request_to_prefill( + endpoint, req_data, request_id, d_endpoint, dip, dport, selected_prefill_dp_rank +): + req_data_copy = req_data + + req_data_copy["kv_transfer_params"].update( + { + "do_remote_decode": True, + "do_remote_prefill": False, + "remote_handshake_port": d_endpoint["handshake_port"], + "remote_notify_port": d_endpoint["notify_port"], + "remote_engine_id": None, + "remote_block_ids": None, + "remote_host": dip, + "remote_port": dport, + } + ) + req_data_copy["stream"] = False + req_data_copy["max_tokens"] = 1 + if "max_completion_tokens" in req_data_copy: + req_data_copy["max_completion_tokens"] = 1 + if "stream_options" in req_data_copy: + del req_data_copy["stream_options"] + async with aiohttp.ClientSession( + timeout=aiohttp.ClientTimeout(total=6 * 6000 * 6000) + ) as session: + headers = { + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", + "X-Request-Id": request_id, + } + if selected_prefill_dp_rank is not None: + headers["X-data-parallel-rank"] = 
str(selected_prefill_dp_rank) + async with session.post( + url=endpoint, json=req_data_copy, headers=headers + ) as response: + if response.status == 200: + return await response.json() + else: + raise RuntimeError( + f"Prefill response status={response.status}" + ) + + +async def start_decode_request(endpoint, req_data, request_id): + session = aiohttp.ClientSession( + timeout=aiohttp.ClientTimeout(total=6 * 6000 * 6000) + ) + headers = { + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", + "X-Request-Id": request_id, + } + response = await session.post(url=endpoint, json=req_data, headers=headers) + return session, response + + +async def stream_decode_response(session, response, request_id): + try: + if response.status == 200: + async for chunk_bytes in response.content.iter_chunked(1024): + yield chunk_bytes + else: + raise RuntimeError( + f"Decode response status={response.status}" + ) + finally: + await session.close() + + +@app.route("/health", methods=["GET"]) +async def health_check(): + with _list_lock: + p_count = len(prefill_instances) + d_count = len(decode_instances) + return await make_response( + ({"status": "ok", "prefill_instances": p_count, "decode_instances": d_count}, 200) + ) + + +@app.route("/v1/completions", methods=["POST"]) +@app.route("/v1/chat/completions", methods=["POST"]) +async def handle_request(): + try: + with _list_lock: + global request_nums + request_nums += 1 + + def extract_ip_port_fast(url): + match = IP_PORT_PATTERN.search(url) + if not match: + raise ValueError(f"Invalid URL format: {url}") + return match.groups() + + req_data = await request.get_json() + request_id = str(uuid.uuid4()) + + if not prefill_instances or not decode_instances: + return await make_response( + ("Service Unavailable: No prefill or decode instances registered.", 503) + ) + + pid = request_nums % len(prefill_instances) + did = request_nums % len(decode_instances) + prefill_instance_endpoint = prefill_instances[pid] + 
decode_instance_endpoint = decode_instances[did] + + selected_prefill_dp_rank = None + if prefill_instance_endpoint["dp_size"] > 1: + selected_prefill_dp_rank = request_nums % prefill_instance_endpoint["dp_size"] + + dip, dport = extract_ip_port_fast(decode_instance_endpoint["request_address"]) + + req_data_to_prefill = copy.deepcopy(req_data) + req_data_to_prefill["kv_transfer_params"] = {} + req_data["kv_transfer_params"] = {} + req_data_to_prefill["kv_transfer_params"]["remote_dp_size"] = ( + decode_instance_endpoint["dp_size"] + ) + req_data_to_prefill["kv_transfer_params"]["remote_tp_size"] = ( + decode_instance_endpoint["tp_size"] + ) + + send_prefill_task = asyncio.create_task( + send_request_to_prefill( + prefill_instance_endpoint["request_address"], + req_data_to_prefill, + request_id, + decode_instance_endpoint, + dip, + dport, + selected_prefill_dp_rank, + ) + ) + ip, port = extract_ip_port_fast(prefill_instance_endpoint["request_address"]) + + req_data["max_tokens"] -= 1 + + req_data["kv_transfer_params"] = { + "do_remote_decode": False, + "do_remote_prefill": True, + "remote_handshake_port": prefill_instance_endpoint["handshake_port"], + "remote_notify_port": prefill_instance_endpoint["notify_port"], + "remote_engine_id": None, + "remote_block_ids": None, + "remote_host": ip, + "remote_port": port, + } + if TRANSFER_TYPE == "READ": + prefill_response = await send_prefill_task + req_data["kv_transfer_params"]["remote_engine_id"] = prefill_response[ + "kv_transfer_params" + ]["remote_engine_id"] + req_data["kv_transfer_params"]["remote_block_ids"] = prefill_response[ + "kv_transfer_params" + ]["remote_block_ids"] + + req_data["kv_transfer_params"]["remote_dp_size"] = prefill_instance_endpoint[ + "dp_size" + ] + req_data["kv_transfer_params"]["remote_tp_size"] = prefill_instance_endpoint[ + "tp_size" + ] + + if selected_prefill_dp_rank is not None: + req_data["kv_transfer_params"]["remote_dp_rank"] = selected_prefill_dp_rank + + decode_request_task = 
asyncio.create_task( + start_decode_request( + decode_instance_endpoint["request_address"], req_data, request_id + ) + ) + + session, decode_response = await decode_request_task + stream_generator = stream_decode_response(session, decode_response, request_id) + response = await make_response(stream_generator) + return response + except Exception as e: + logger.exception("Error handling request: %s", e) + return await make_response((f"Internal Server Error: {e!s}", 500)) + + +if __name__ == "__main__": + http_port = int(os.environ.get("PROXY_HTTP_PORT", "30000")) + ping_port = int(os.environ.get("PROXY_PING_PORT", "36367")) + + t = start_service_discovery("0.0.0.0", ping_port) + app.debug = False + app.config["BODY_TIMEOUT"] = 360000 + app.config["RESPONSE_TIMEOUT"] = 360000 + + logger.info("MoRI-IO proxy starting: HTTP=%d, ZMQ=%d", http_port, ping_port) + app.run(host="0.0.0.0", port=http_port) + t.join() diff --git a/benchmarks/multi_node/vllm_disagg_utils/server.sh b/benchmarks/multi_node/vllm_disagg_utils/server.sh index 7778dfd34..f81ff68e1 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/server.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/server.sh @@ -212,12 +212,18 @@ done echo "Prefill node IPs: ${PREFILL_ARGS}" echo "Decode node IPs: ${DECODE_ARGS}" -# vLLM/Nixl-specific environment (UCX transport vars are set at the Docker level in job.slurm) +# MoRI-IO proxy ZMQ registration port (must match moriio_proxy.py PROXY_PING_PORT) +PROXY_PING_PORT="${PROXY_PING_PORT:-36367}" + +# vLLM environment (UCX transport vars are set at the Docker level in job.slurm) setup_vllm_env() { export VLLM_USE_V1=1 export VLLM_SERVER_DEV_MODE=0 export VLLM_NIXL_SIDE_CHANNEL_HOST=${rdma_ip} export VLLM_NIXL_SIDE_CHANNEL_PORT=5600 + # Workaround: disable request-ID randomization so MoRI-IO connector can + # match completion IDs between prefill and decode without PR #34907 patch. 
+ export VLLM_DISABLE_REQUEST_ID_RANDOMIZATION=1 for env_pair in ${MODEL_ENVS}; do export "$env_pair" done @@ -245,10 +251,26 @@ if [ "$NODE_RANK" -eq 0 ]; then setup_vllm_env + # Start MoRI-IO proxy FIRST — workers register via ZMQ on startup + echo "Starting MoRI-IO proxy (HTTP=$ROUTER_PORT, ZMQ=$PROXY_PING_PORT)..." + PROXY_CMD="PROXY_HTTP_PORT=$ROUTER_PORT PROXY_PING_PORT=$PROXY_PING_PORT \ + python3 $VLLM_WS_PATH/moriio_proxy.py" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $PROXY_CMD" + else + PROXY_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/moriio_proxy_${host_name}.log" + set -x + eval "$PROXY_CMD" 2>&1 | tee "$PROXY_LOG_FILE" & + set +x + proxy_pid=$! + sleep 3 + fi + PREFILL_CMD="vllm serve ${MODEL_PATH} \ --port $SERVER_PORT \ --trust-remote-code \ - --kv-transfer-config '{\"kv_connector\": \"NixlConnector\", \"kv_role\": \"kv_producer\", \"kv_load_failure_policy\": \"fail\"}' \ + --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_producer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \ ${PREFILL_SERVER_CONFIG}" if [[ "$DRY_RUN" -eq 1 ]]; then @@ -270,56 +292,19 @@ if [ "$NODE_RANK" -eq 0 ]; then echo "Congratulations!!! All prefill and decode servers are up . . ." - echo "Starting vLLM Router..." 
- [ -f /root/.cargo/env ] && source /root/.cargo/env - - PREFILL_URLS="" - DECODE_URLS="" - for ip in ${PREFILL_ARGS}; do - PREFILL_URLS+="--prefill http://${ip}:${SERVER_PORT} " - done - for ip in ${DECODE_ARGS}; do - DECODE_URLS+="--decode http://${ip}:${SERVER_PORT} " - done - - ROUTER_CMD="UCX_TLS=tcp,self,shm VLLM_USE_V1=1 \ - vllm-router \ - --host 0.0.0.0 \ - --port $ROUTER_PORT \ - --vllm-pd-disaggregation \ - $PREFILL_URLS \ - $DECODE_URLS \ - --policy round_robin \ - --prefill-policy round_robin \ - --decode-policy round_robin \ - --intra-node-data-parallel-size 1 \ - --retry-max-retries 3 \ - --health-check-endpoint /health \ - --prometheus-port 29000" + # Wait for proxy /health to confirm it is accepting requests + HEALTH_BARRIER_CMD="python3 $VLLM_WS_PATH/sync.py barrier \ + --node-ips ${NODE0_ADDR} \ + --node-ports ${ROUTER_PORT} \ + --wait-for-all-health \ + --health-endpoint /health \ + --timeout 1800" if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $ROUTER_CMD" + echo "DRY RUN: $HEALTH_BARRIER_CMD" else - ROUTER_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/vllm_router_${host_name}.log" - set -x - eval "$ROUTER_CMD" 2>&1 | tee "$ROUTER_LOG_FILE" & - set +x - proxy_pid=$! 
- - HEALTH_BARRIER_CMD="python3 $VLLM_WS_PATH/sync.py barrier \ - --node-ips ${NODE0_ADDR} \ - --node-ports ${ROUTER_PORT} \ - --wait-for-all-health \ - --health-endpoint /health \ - --timeout 1800" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $HEALTH_BARRIER_CMD" - else - eval "$HEALTH_BARRIER_CMD" - fi - - echo "Router is ready for benchmarking" + eval "$HEALTH_BARRIER_CMD" + echo "MoRI-IO proxy is ready for benchmarking" fi echo "Ready for benchmarking on ${host_name}:${host_ip}" @@ -364,7 +349,7 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$xP" ]; then PREFILL_CMD="vllm serve ${MODEL_PATH} \ --port $SERVER_PORT \ --trust-remote-code \ - --kv-transfer-config '{\"kv_connector\": \"NixlConnector\", \"kv_role\": \"kv_producer\", \"kv_load_failure_policy\": \"fail\"}' \ + --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_producer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \ ${PREFILL_SERVER_CONFIG}" if [[ "$DRY_RUN" -eq 1 ]]; then @@ -418,7 +403,7 @@ else DECODE_CMD="vllm serve ${MODEL_PATH} \ --port $SERVER_PORT \ --trust-remote-code \ - --kv-transfer-config '{\"kv_connector\": \"NixlConnector\", \"kv_role\": \"kv_consumer\", \"kv_load_failure_policy\": \"fail\"}' \ + --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_consumer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \ ${DECODE_SERVER_CONFIG}" if [[ "$DRY_RUN" -eq 1 ]]; then diff --git a/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh b/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh index 8e2276d1c..3af1b5b0e 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh @@ -131,28 +131,25 @@ install_libionic() { } # 
--------------------------------------------------------------------------- -# 5. vllm-router (Rust-based proxy for PD disaggregation) +# 5. MoRI-IO proxy deps (Python packages for the MoRI-IO-aware proxy server) +# The proxy replaces vllm-router: it handles both HTTP routing AND the +# MoRI-IO ZMQ registration/request-enrichment protocol. # Only needed on NODE_RANK=0 (proxy node). # --------------------------------------------------------------------------- -install_vllm_router() { - if pip show vllm-router &>/dev/null; then - echo "[SETUP] vllm-router already installed" +install_mori_proxy_deps() { + if python3 -c "import quart, aiohttp, msgpack, zmq" 2>/dev/null; then + echo "[SETUP] MoRI-IO proxy Python deps already present" return 0 fi - echo "[SETUP] Installing Rust toolchain..." - if ! command -v cargo &>/dev/null; then - curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y - export PATH="/root/.cargo/bin:${PATH}" - fi - - echo "[SETUP] Installing vllm-router via pip..." - pip install --quiet vllm-router + echo "[SETUP] Installing MoRI-IO proxy Python deps..." + pip install --quiet --ignore-installed blinker + pip install --quiet quart aiohttp msgpack pyzmq - if ! pip show vllm-router &>/dev/null; then - echo "[SETUP] ERROR: vllm-router install failed"; exit 1 + if ! 
python3 -c "import quart, aiohttp, msgpack, zmq" 2>/dev/null; then + echo "[SETUP] ERROR: MoRI-IO proxy deps install failed"; exit 1 fi - _SETUP_INSTALLED+=("vllm-router") + _SETUP_INSTALLED+=("mori-proxy-deps") } # --------------------------------------------------------------------------- @@ -250,7 +247,7 @@ install_mori patch_mori_fp8_compat if [[ "${NODE_RANK:-0}" -eq 0 ]]; then - install_vllm_router + install_mori_proxy_deps fi # ============================================================================= From 69fcdbd6af1fa115d3ee1847485a01750920c52b Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Tue, 17 Mar 2026 08:47:54 +0000 Subject: [PATCH 09/19] [AMD] BUG fix: RANDOM_RANGE_RATIO never reaches bench.sh Signed-off-by: Theresa Shan --- .../multi_node/dsr1_fp8_mi355x_vllm-disagg.sh | 3 ++- .../multi_node/vllm_disagg_utils/submit.sh | 24 ++++++++++--------- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh b/benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh index 167aff5f3..172ecdf51 100755 --- a/benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh +++ b/benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh @@ -37,7 +37,8 @@ JOB_ID=$(bash ./submit.sh $PREFILL_NODES \ $PREFILL_NUM_WORKERS \ $DECODE_NODES \ $DECODE_NUM_WORKERS \ - $ISL $OSL "${CONC_LIST// /x}" inf "${NODELIST:-}") + $ISL $OSL "${CONC_LIST// /x}" inf "${NODELIST:-}" \ + ${RANDOM_RANGE_RATIO}) if [[ $? 
-ne 0 ]]; then echo "Failed to submit job" >&2 diff --git a/benchmarks/multi_node/vllm_disagg_utils/submit.sh b/benchmarks/multi_node/vllm_disagg_utils/submit.sh index d60ed87e6..f210d7ac7 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/submit.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/submit.sh @@ -12,18 +12,19 @@ usage() { cat << 'USAGE' Usage: bash submit.sh \ - [NODE_LIST] + [NODE_LIST] [RANDOM_RANGE_RATIO] Arguments: - PREFILL_NODES Number of prefill nodes - PREFILL_WORKERS Number of prefill workers (usually 1) - DECODE_NODES Number of decode nodes - DECODE_WORKERS Number of decode workers (usually 1) - ISL Input sequence length - OSL Output sequence length - CONCURRENCIES Concurrency levels, delimited by 'x' (e.g., "8x16x32") - REQUEST_RATE Request rate ("inf" for max throughput) - NODE_LIST Optional: comma-separated hostnames + PREFILL_NODES Number of prefill nodes + PREFILL_WORKERS Number of prefill workers (usually 1) + DECODE_NODES Number of decode nodes + DECODE_WORKERS Number of decode workers (usually 1) + ISL Input sequence length + OSL Output sequence length + CONCURRENCIES Concurrency levels, delimited by 'x' (e.g., "8x16x32") + REQUEST_RATE Request rate ("inf" for max throughput) + NODE_LIST Optional: comma-separated hostnames + RANDOM_RANGE_RATIO Optional: random range ratio for benchmark (default 0.8) Required environment variables: SLURM_ACCOUNT SLURM account name @@ -66,6 +67,7 @@ OSL=$6 CONCURRENCIES=$7 REQUEST_RATE=$8 NODE_LIST=${9} +RANDOM_RANGE_RATIO=${10} # Router co-located with first prefill: xP + yD nodes total NUM_NODES=$((PREFILL_NODES + DECODE_NODES)) @@ -85,10 +87,10 @@ export GPUS_PER_NODE=$GPUS_PER_NODE export MODEL_NAME=$MODEL_NAME export BENCH_INPUT_LEN=${ISL} export BENCH_OUTPUT_LEN=${OSL} -export BENCH_RANDOM_RANGE_RATIO=${BENCH_RANDOM_RANGE_RATIO:-1} export BENCH_NUM_PROMPTS_MULTIPLIER=${BENCH_NUM_PROMPTS_MULTIPLIER:-10} export BENCH_MAX_CONCURRENCY=${CONCURRENCIES} export BENCH_REQUEST_RATE=${REQUEST_RATE} +export 
BENCH_RANDOM_RANGE_RATIO=${RANDOM_RANGE_RATIO:-0.8} # Log directory: must be on NFS (shared filesystem) so the submit host can read SLURM output. export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}" From d214e79d60e12bd1dd69cdd98e356ff19fd9e4a2 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Tue, 17 Mar 2026 10:22:58 +0000 Subject: [PATCH 10/19] Bug fix: 1. With DRY_RUN=1, node 0 skipped starting proxy/prefill but still ran the first barrier; 2. kill and kill run only when DRY_RUN=0 Signed-off-by: Theresa Shan --- .../multi_node/vllm_disagg_utils/server.sh | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/benchmarks/multi_node/vllm_disagg_utils/server.sh b/benchmarks/multi_node/vllm_disagg_utils/server.sh index f81ff68e1..55538d4fa 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/server.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/server.sh @@ -284,11 +284,15 @@ if [ "$NODE_RANK" -eq 0 ]; then fi echo "Waiting for all prefill and decode servers to be up . . ." - python3 $VLLM_WS_PATH/sync.py barrier \ - --node-ips ${IPADDRS} \ - --node-ports $SERVER_PORT \ - --wait-for-all-ports \ - --timeout 1800 + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: skipping barrier (wait-for-all-ports)" + else + python3 $VLLM_WS_PATH/sync.py barrier \ + --node-ips ${IPADDRS} \ + --node-ports $SERVER_PORT \ + --wait-for-all-ports \ + --timeout 1800 + fi echo "Congratulations!!! All prefill and decode servers are up . . ." 
@@ -336,8 +340,8 @@ if [ "$NODE_RANK" -eq 0 ]; then echo "Killing the proxy server and prefill server" if [[ "$DRY_RUN" -eq 0 ]]; then - kill $proxy_pid - kill $prefill_pid + [[ -n "${proxy_pid:-}" ]] && kill $proxy_pid 2>/dev/null || true + [[ -n "${prefill_pid:-}" ]] && kill $prefill_pid 2>/dev/null || true fi elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$xP" ]; then From 3ffcc74d3d1256485537ba6ad5700784c70a3149 Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Thu, 19 Mar 2026 18:33:36 +0000 Subject: [PATCH 11/19] [AMD] Fix vLLM disagg hang: READ mode support + safety timeouts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enable READ-mode KV transfer (decode-initiated RDMA reads) with a critical scheduler assertion fix, and add safety timeouts to prevent indefinite hangs during RDMA transfers. Changes: - setup_deps.sh: Add patches — save_kv_layer/start_load_kv handshake timeouts (30s), RDMA transfer timeout (120s), deferred write task expiry (60s), write worker error handling, and scheduler assertion fix for READ-mode intermediate request states - moriio_proxy.py: Add stream idle timeout (PROXY_STREAM_IDLE_TIMEOUT) to abort stalled decode streams, and proper response.release() - submit.sh, job.slurm: Plumb PROXY_STREAM_IDLE_TIMEOUT and VLLM_MORIIO_CONNECTOR_READ_MODE env vars into Docker containers Validated: 1k/1k full sweep (C8–C512), 100% success rate at all concurrency levels, peak 8500 output tok/s at C512. 
--- .../multi_node/vllm_disagg_utils/job.slurm | 2 + .../vllm_disagg_utils/moriio_proxy.py | 21 +- .../vllm_disagg_utils/setup_deps.sh | 468 +++++++++++++++++- .../multi_node/vllm_disagg_utils/submit.sh | 3 + 4 files changed, 489 insertions(+), 5 deletions(-) diff --git a/benchmarks/multi_node/vllm_disagg_utils/job.slurm b/benchmarks/multi_node/vllm_disagg_utils/job.slurm index 3a71436fe..b216f53f4 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/job.slurm +++ b/benchmarks/multi_node/vllm_disagg_utils/job.slurm @@ -304,6 +304,8 @@ exec sudo docker run --rm \ -e UCX_ROCM_IPC_MIN_ZCOPY=0 \ -e UCX_LOG_LEVEL=warn \ -e HSA_ENABLE_SDMA=1 \ + -e PROXY_STREAM_IDLE_TIMEOUT=\${PROXY_STREAM_IDLE_TIMEOUT:-300} \ + -e VLLM_MORIIO_CONNECTOR_READ_MODE=\${VLLM_MORIIO_CONNECTOR_READ_MODE:-0} \ --name \"$DOCKER_CONT_NAME\" \ --entrypoint \"\" \ \"$DOCKER_IMAGE_NAME\" bash -lc ' diff --git a/benchmarks/multi_node/vllm_disagg_utils/moriio_proxy.py b/benchmarks/multi_node/vllm_disagg_utils/moriio_proxy.py index 82272dd52..b2162c98a 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/moriio_proxy.py +++ b/benchmarks/multi_node/vllm_disagg_utils/moriio_proxy.py @@ -18,6 +18,7 @@ import re import socket import threading +import time import uuid import aiohttp @@ -37,6 +38,8 @@ request_nums = 0 app = Quart(__name__) +STREAM_IDLE_TIMEOUT = int(os.environ.get("PROXY_STREAM_IDLE_TIMEOUT", "300")) + IP_PORT_PATTERN = re.compile(r"//(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d+)") TRANSFER_TYPE = None @@ -173,13 +176,27 @@ async def start_decode_request(endpoint, req_data, request_id): async def stream_decode_response(session, response, request_id): try: if response.status == 200: - async for chunk_bytes in response.content.iter_chunked(1024): - yield chunk_bytes + chunk_iter = response.content.iter_chunked(1024).__aiter__() + while True: + try: + chunk_bytes = await asyncio.wait_for( + chunk_iter.__anext__(), timeout=STREAM_IDLE_TIMEOUT, + ) + yield chunk_bytes + except StopAsyncIteration: + 
break + except asyncio.TimeoutError: + logger.error( + "Decode stream %s idle for %ds, aborting", + request_id, STREAM_IDLE_TIMEOUT, + ) + break else: raise RuntimeError( f"Decode response status={response.status}" ) finally: + await response.release() await session.close() diff --git a/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh b/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh index 3af1b5b0e..467e1bd5a 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh @@ -16,6 +16,19 @@ RIXL_HOME="${RIXL_HOME:-/usr/local/rixl}" _SETUP_START=$(date +%s) _SETUP_INSTALLED=() +git_clone_retry() { + local url="$1" dest="$2" max_tries=3 try=1 + while (( try <= max_tries )); do + if git clone --quiet "$url" "$dest" 2>/dev/null; then return 0; fi + echo "[SETUP] git clone attempt $try/$max_tries failed for $url, retrying in 10s..." + rm -rf "$dest" + sleep 10 + (( try++ )) + done + echo "[SETUP] git clone failed after $max_tries attempts: $url" + return 1 +} + # --------------------------------------------------------------------------- # 1. UCX (ROCm fork — required for GPU-direct RDMA via Nixl) # --------------------------------------------------------------------------- @@ -36,7 +49,7 @@ install_ucx() { ( set -e mkdir -p /usr/local/src && cd /usr/local/src - git clone --quiet https://github.com/ROCm/ucx.git && cd ucx + git_clone_retry https://github.com/ROCm/ucx.git ucx && cd ucx git checkout da3fac2a ./autogen.sh && mkdir -p build && cd build ../configure \ @@ -74,7 +87,7 @@ install_rixl() { echo "[SETUP] Building RIXL from source (ROCm/RIXL @ f33a5599)..." 
( set -e - git clone --quiet https://github.com/ROCm/RIXL.git /opt/rixl && cd /opt/rixl + git_clone_retry https://github.com/ROCm/RIXL.git /opt/rixl && cd /opt/rixl git checkout f33a5599 meson setup build --prefix="${RIXL_HOME}" \ -Ducx_path="${UCX_HOME}" \ @@ -171,7 +184,7 @@ install_mori() { echo "[SETUP] Building MoRI from source (ROCm/mori @ b645fc8)..." ( set -e - git clone --quiet https://github.com/ROCm/mori.git /opt/mori && cd /opt/mori + git_clone_retry https://github.com/ROCm/mori.git /opt/mori && cd /opt/mori git checkout b645fc8 pip install --quiet . ) @@ -235,6 +248,451 @@ else: _SETUP_INSTALLED+=("MoRI-FP8-patch") } +# --------------------------------------------------------------------------- +# 8. Patch vLLM MoRI-IO save_kv_layer busy-spin (C128 tail-batch deadlock) +# In WRITE mode, save_kv_layer spins forever waiting for the handshake +# callback to set write_ready_flags. This blocks the model worker thread, +# preventing it from responding to EngineCore shm_broadcast, causing a +# TimeoutError cascade and crash. +# Patch: add time.sleep(0.001) and a 30s timeout to yield CPU and prevent +# the model worker from deadlocking. +# --------------------------------------------------------------------------- +patch_moriio_save_kv_timeout() { + python3 -c ' +import os, sys + +try: + import vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_connector as mc + f = mc.__file__ + src = open(f).read() + + # Already patched? (match the prefix of the marker comment that new inserts below) 
+ if "[PATCHED] save_kv_layer" in src: + print("[SETUP] save_kv_layer timeout patch already applied") + sys.exit(0) + + old = """ while True: + if ( + self._ready_requests.empty() + and remote_engine_id not in self.write_ready_flags + ): + continue""" + + if old not in src: + print("[SETUP] WARN: save_kv_layer busy-spin pattern not found, skipping patch") + sys.exit(0) + + new = """ # [PATCHED] save_kv_layer — null guard + timeout + sleep + if remote_engine_id is None: + return + import time as _time, os as _os + _wait_start = _time.monotonic() + _SAVE_KV_TIMEOUT = float(_os.environ.get("VLLM_MORIIO_HANDSHAKE_TIMEOUT", "30")) + while True: + if ( + self._ready_requests.empty() + and remote_engine_id not in self.write_ready_flags + ): + _elapsed = _time.monotonic() - _wait_start + if _elapsed > _SAVE_KV_TIMEOUT: + import logging as _logging + _logging.getLogger("vllm.moriio").warning( + "[HANGFIX] save_kv_layer: timeout (%.1fs) waiting for " + "write_ready_flags[%s], breaking to unblock model " + "worker", _elapsed, remote_engine_id) + break + _time.sleep(0.001) + continue""" + + new_src = src.replace(old, new) + if new_src == src: + print("[SETUP] WARN: replacement had no effect") + sys.exit(0) + + open(f, "w").write(new_src) + print("[SETUP] Patched save_kv_layer: null guard + timeout + sleep") +except Exception as e: + print(f"[SETUP] WARN patch save_kv_layer: {e}", file=sys.stderr) +' + _SETUP_INSTALLED+=("MoRIIO-save-kv-timeout-patch") +} + +# --------------------------------------------------------------------------- +# 9. Patch MoRIIO waiting_for_transfer_complete with bounded timeout +# The original status.Wait() blocks forever if an RDMA completion never +# arrives (e.g., NIC queue saturation at C256). This replaces the unbounded +# wait with a polling loop using status.Succeeded() + configurable timeout. +# Also adds error handling to the write worker loop so a single failed +# transfer doesn't kill the background thread. 
+# --------------------------------------------------------------------------- +patch_moriio_transfer_timeout() { + python3 -c ' +import os, sys, textwrap + +try: + import vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_engine as me + f = me.__file__ + src = open(f).read() + + if "[PATCHED] transfer completion timeout" in src: + print("[SETUP] transfer completion timeout patch already applied") + sys.exit(0) + + # --- Patch 1: Replace waiting_for_transfer_complete with polling + timeout --- + old_wait = """ def waiting_for_transfer_complete(self): + if not self.transfer_status: + return + + transfers_to_wait = [] + with self.lock: + transfers_to_wait = self.transfer_status[:] + self.transfer_status.clear() + + for status in transfers_to_wait: + try: + status.Wait() + if not status.Succeeded(): + logger.error( + "Transfer failed: %s, Code: %s", status.Message(), status.Code() + ) + raise TransferError("MoRIIO transfer failed!") + except Exception as e: + logger.error("Transfer %s failed: %s", status, e) + raise""" + + new_wait = """ def waiting_for_transfer_complete(self): + # [PATCHED] transfer completion timeout — bounded polling loop + import time as _time, os as _os + if not self.transfer_status: + return + + _timeout = float(_os.environ.get("VLLM_MORIIO_TRANSFER_TIMEOUT", "120")) + + transfers_to_wait = [] + with self.lock: + transfers_to_wait = self.transfer_status[:] + self.transfer_status.clear() + + _start = _time.monotonic() + remaining = list(transfers_to_wait) + _polls = 0 + _completed = 0 + + while remaining: + _elapsed = _time.monotonic() - _start + if _elapsed > _timeout: + logger.error( + "[HANGFIX] transfer_timeout elapsed=%.1fs " + "pending=%d/%d completed=%d polls=%d " + "action=raise_transfer_error", + _elapsed, len(remaining), len(transfers_to_wait), + _completed, _polls, + ) + raise TransferError( + f"RDMA transfer timeout after {_elapsed:.1f}s, " + f"{len(remaining)}/{len(transfers_to_wait)} pending" + ) + + still_waiting = [] + for 
status in remaining: + try: + if status.Succeeded(): + _completed += 1 + continue + still_waiting.append(status) + except Exception as e: + logger.error( + "[HANGFIX] transfer_poll_error error=%s", e) + raise TransferError( + f"Transfer failed during poll: {e}" + ) from e + + remaining = still_waiting + if remaining: + _time.sleep(0.005) + _polls += 1 + if _polls % 2000 == 0: + logger.warning( + "[HANGFIX] transfer_wait pending=%d " + "completed=%d elapsed=%.1fs timeout=%.0fs", + len(remaining), _completed, + _time.monotonic() - _start, _timeout, + )""" + + if old_wait not in src: + print("[SETUP] WARN: waiting_for_transfer_complete pattern not found") + sys.exit(0) + + new_src = src.replace(old_wait, new_wait) + + # --- Patch 2: Add error handling + cleanup to _write_worker_loop --- + old_loop = """ self._execute_write_task(task)""" + + new_loop = """ try: + self._execute_write_task(task) + except Exception as _e: + logger.error( + "[HANGFIX] req=%s write_task_failed error=%s " + "action=cleanup_and_mark_done", + task.request_id, _e, + ) + try: + _wr = self.worker.moriio_wrapper + with _wr.lock: + _wr.done_req_ids.append(task.request_id) + _wr.done_remote_allocate_req_dict.pop( + task.request_id, None + ) + except Exception: + pass""" + + if old_loop in new_src: + new_src = new_src.replace(old_loop, new_loop, 1) + else: + print("[SETUP] WARN: _write_worker_loop pattern not found for error handling") + + # --- Patch 3: Add deferred task timeout to _process_deferred_tasks --- + old_deferred = """ def _process_deferred_tasks(self) -> None: + \"\"\"Process tasks that were previously deferred.\"\"\" + if not self._deferred_tasks: + return + + still_deferred: list[WriteTask] = [] + for task in self._deferred_tasks: + if self._is_remote_ready(task): + self._execute_write_task(task) + else: + still_deferred.append(task) + + self._deferred_tasks = still_deferred""" + + new_deferred = """ def _process_deferred_tasks(self) -> None: + \"\"\"Process tasks that were previously 
deferred.\"\"\" + # [PATCHED] deferred task timeout — prune stale tasks + import time as _time, os as _os + if not self._deferred_tasks: + return + + _DEFER_TIMEOUT = float( + _os.environ.get("VLLM_MORIIO_DEFER_TIMEOUT", "60")) + + still_deferred: list[WriteTask] = [] + for task in self._deferred_tasks: + _age = _time.monotonic() - getattr(task, "_defer_ts", _time.monotonic()) + if _age > _DEFER_TIMEOUT: + logger.error( + "[HANGFIX] req=%s deferred_task_expired age=%.1fs " + "action=drop_and_mark_done", + task.request_id, _age, + ) + try: + _wr = self.worker.moriio_wrapper + with _wr.lock: + _wr.done_req_ids.append(task.request_id) + _wr.done_remote_allocate_req_dict.pop( + task.request_id, None) + except Exception: + pass + continue + if self._is_remote_ready(task): + try: + self._execute_write_task(task) + except Exception as _e: + logger.error( + "[HANGFIX] req=%s deferred_write_failed error=%s", + task.request_id, _e, + ) + try: + _wr = self.worker.moriio_wrapper + with _wr.lock: + _wr.done_req_ids.append(task.request_id) + _wr.done_remote_allocate_req_dict.pop( + task.request_id, None) + except Exception: + pass + else: + still_deferred.append(task) + + self._deferred_tasks = still_deferred""" + + if old_deferred in new_src: + new_src = new_src.replace(old_deferred, new_deferred, 1) + else: + print("[SETUP] WARN: _process_deferred_tasks pattern not found") + + # --- Patch 4: Stamp defer time when task is deferred --- + old_defer_add = """ self._deferred_tasks.append(task)""" + new_defer_add = """ import time as _time2 + if not hasattr(task, "_defer_ts"): + task._defer_ts = _time2.monotonic() + self._deferred_tasks.append(task)""" + if old_defer_add in new_src: + new_src = new_src.replace(old_defer_add, new_defer_add, 1) + else: + print("[SETUP] WARN: deferred task timestamp patch target not found") + + open(f, "w").write(new_src) + print("[SETUP] Patched: transfer timeout + writer error handling") + +except Exception as e: + print(f"[SETUP] WARN patch 
transfer_timeout: {e}", file=sys.stderr) +' + _SETUP_INSTALLED+=("MoRIIO-transfer-timeout-patch") +} + +# --------------------------------------------------------------------------- +# 10. Patch MoRIIO start_load_kv busy-spin (same pattern as save_kv_layer) +# The READ-mode spin loop in start_load_kv has the same unbounded-spin +# issue as save_kv_layer. Add timeout + sleep + null guard. +# --------------------------------------------------------------------------- +patch_moriio_load_kv_timeout() { + python3 -c ' +import os, sys + +try: + import vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_connector as mc + f = mc.__file__ + src = open(f).read() + + if "[PATCHED] start_load_kv timeout" in src: + print("[SETUP] start_load_kv timeout patch already applied") + sys.exit(0) + + old = """ while True: + if ( + self._ready_requests.empty() + and remote_engine_id not in self.load_ready_flag + and wait_handshake_readd_req + ): + continue""" + + if old not in src: + print("[SETUP] WARN: start_load_kv busy-spin pattern not found, skipping") + sys.exit(0) + + new = """ # [PATCHED] start_load_kv timeout — prevent model worker deadlock + if remote_engine_id is None and not wait_handshake_readd_req: + self._reqs_to_send.update(metadata.reqs_to_send) + return + import time as _time, os as _os + _wait_start = _time.monotonic() + _LOAD_KV_TIMEOUT = float(_os.environ.get("VLLM_MORIIO_HANDSHAKE_TIMEOUT", "30")) + while True: + if ( + self._ready_requests.empty() + and remote_engine_id not in self.load_ready_flag + and wait_handshake_readd_req + ): + if _time.monotonic() - _wait_start > _LOAD_KV_TIMEOUT: + import logging as _logging + _logging.getLogger("vllm.moriio").warning( + "[HANGFIX] start_load_kv: timeout (%.1fs) waiting for " + "load_ready_flag[%s]", _time.monotonic() - _wait_start, + remote_engine_id) + break + _time.sleep(0.001) + continue""" + + new_src = src.replace(old, new) + if new_src == src: + print("[SETUP] WARN: start_load_kv replacement had no effect") 
+ sys.exit(0) + + open(f, "w").write(new_src) + print("[SETUP] Patched start_load_kv busy-spin with timeout + sleep") +except Exception as e: + print(f"[SETUP] WARN patch start_load_kv: {e}", file=sys.stderr) +' + _SETUP_INSTALLED+=("MoRIIO-load-kv-timeout-patch") +} + +# --------------------------------------------------------------------------- +# 11. Fix READ-mode scheduler assertion in _update_from_kv_xfer_finished +# vLLM v0.17.1 asserts that a request in finished_recving must be either +# WAITING_FOR_REMOTE_KVS or finished. In READ mode the request can +# transition to RUNNING before the aggregated recv notification arrives, +# crashing the engine with AssertionError. +# --------------------------------------------------------------------------- +patch_scheduler_read_mode_fix() { + python3 -c ' +import os, sys + +try: + import vllm.v1.core.sched.scheduler as smod + f = smod.__file__ + src = open(f).read() + + if "[PATCHED] read-mode recv assertion" in src: + print("[SETUP] scheduler read-mode assertion fix already applied") + sys.exit(0) + + old_recv = """ for req_id in kv_connector_output.finished_recving or (): + logger.debug("Finished recving KV transfer for request %s", req_id) + assert req_id in self.requests + req = self.requests[req_id] + if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS: + self.finished_recving_kv_req_ids.add(req_id) + else: + assert RequestStatus.is_finished(req.status) + self._free_blocks(self.requests[req_id])""" + + new_recv = """ # [PATCHED] read-mode recv assertion — handle intermediate states + for req_id in kv_connector_output.finished_recving or (): + logger.debug("Finished recving KV transfer for request %s", req_id) + if req_id not in self.requests: + logger.debug("Request %s already removed, skipping recv", req_id) + continue + req = self.requests[req_id] + if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS: + self.finished_recving_kv_req_ids.add(req_id) + elif RequestStatus.is_finished(req.status): + 
self._free_blocks(self.requests[req_id]) + else: + logger.debug( + "Request %s recv finished but status=%s (not " + "WAITING_FOR_REMOTE_KVS or finished), skipping " + "block free — will be freed on request completion", + req_id, req.status.name)""" + + if old_recv not in src: + print("[SETUP] WARN: scheduler finished_recving pattern not found, skipping") + sys.exit(0) + + new_src = src.replace(old_recv, new_recv, 1) + + old_send = """ for req_id in kv_connector_output.finished_sending or (): + logger.debug("Finished sending KV transfer for request %s", req_id) + assert req_id in self.requests + self._free_blocks(self.requests[req_id])""" + + new_send = """ for req_id in kv_connector_output.finished_sending or (): + logger.debug("Finished sending KV transfer for request %s", req_id) + if req_id not in self.requests: + logger.debug("Request %s already removed, skipping send", req_id) + continue + req = self.requests[req_id] + if RequestStatus.is_finished(req.status): + self._free_blocks(req) + else: + logger.debug( + "Request %s send finished but status=%s, " + "deferring block free to request completion", + req_id, req.status.name)""" + + if old_send in new_src: + new_src = new_src.replace(old_send, new_send, 1) + else: + print("[SETUP] WARN: scheduler finished_sending pattern not found") + + open(f, "w").write(new_src) + print("[SETUP] Patched: scheduler _update_from_kv_xfer_finished read-mode fix") + +except Exception as e: + print(f"[SETUP] WARN patch scheduler read-mode: {e}", file=sys.stderr) +' + _SETUP_INSTALLED+=("scheduler-read-mode-fix") +} + # ============================================================================= # Run installers # ============================================================================= @@ -245,6 +703,10 @@ install_etcd install_libionic install_mori patch_mori_fp8_compat +patch_moriio_save_kv_timeout +patch_moriio_transfer_timeout +patch_moriio_load_kv_timeout +patch_scheduler_read_mode_fix if [[ "${NODE_RANK:-0}" -eq 0 ]]; 
then install_mori_proxy_deps diff --git a/benchmarks/multi_node/vllm_disagg_utils/submit.sh b/benchmarks/multi_node/vllm_disagg_utils/submit.sh index f210d7ac7..5d733b010 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/submit.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/submit.sh @@ -92,6 +92,9 @@ export BENCH_MAX_CONCURRENCY=${CONCURRENCIES} export BENCH_REQUEST_RATE=${REQUEST_RATE} export BENCH_RANDOM_RANGE_RATIO=${RANDOM_RANGE_RATIO:-0.8} +export PROXY_STREAM_IDLE_TIMEOUT=${PROXY_STREAM_IDLE_TIMEOUT:-300} +export VLLM_MORIIO_CONNECTOR_READ_MODE=${VLLM_MORIIO_CONNECTOR_READ_MODE:-0} + # Log directory: must be on NFS (shared filesystem) so the submit host can read SLURM output. export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}" mkdir -p "$BENCHMARK_LOGS_DIR" From 9129ead6635fb15e53b5c6aee75c706db20b5f4c Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Sat, 21 Mar 2026 19:15:33 +0000 Subject: [PATCH 12/19] Adapt vLLM disagg recipe for 9N mia1 cluster (mlx5 NICs) Port the vLLM disaggregated serving pipeline from the 4N cluster (Pensando ionic NICs) to the 9N mia1 cluster (mlx5/rdma NICs). Key changes: - Fix C512 deadlock: apply ucx_error_handling_mode=none universally instead of only for ionic NICs. Under high concurrency, UCX's default UCP_ERR_HANDLING_MODE_PEER prevents RIXL RDMA READ retries from recovering after ibv_post_send queue exhaustion, causing prefill KV cache saturation and pipeline deadlock. - Force-reinstall MoRI from b645fc8 to fix PCI topology assertion failure on nodes with Broadcom PEX890xx PCIe switches. - Auto-detect Docker privilege (sudo vs non-sudo) for cross-cluster portability. - Add SLURM_EXCLUDE_NODES support to skip nodes with broken Docker sockets. - Increase VLLM_ENGINE_READY_TIMEOUT_S to 3600 to accommodate longer setup times (RIXL/MoRI source builds over NFS). 
--- .../multi_node/vllm_disagg_utils/job.slurm | 20 +++++++++---- .../multi_node/vllm_disagg_utils/models.yaml | 2 +- .../multi_node/vllm_disagg_utils/server.sh | 29 +++++++++---------- .../vllm_disagg_utils/setup_deps.sh | 25 ++++++++++++---- .../multi_node/vllm_disagg_utils/submit.sh | 8 +++++ 5 files changed, 57 insertions(+), 27 deletions(-) diff --git a/benchmarks/multi_node/vllm_disagg_utils/job.slurm b/benchmarks/multi_node/vllm_disagg_utils/job.slurm index b216f53f4..904aaaff4 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/job.slurm +++ b/benchmarks/multi_node/vllm_disagg_utils/job.slurm @@ -61,6 +61,16 @@ BENCH_REQUEST_RATE="${BENCH_REQUEST_RATE:-inf}" GPUS_PER_NODE="${GPUS_PER_NODE:-8}" +# ============================================================================= +# Docker privilege detection +# ============================================================================= +if docker ps &>/dev/null; then + DOCKER_CMD="docker" +else + DOCKER_CMD="sudo docker" +fi +export DOCKER_CMD + # ============================================================================= # Model Path Resolution # ============================================================================= @@ -212,7 +222,7 @@ SELECTED_NODELIST_SRUN=$(echo "$SELECTED_NODES" | paste -sd,) cleanup() { echo "[${SLURM_JOB_ID}] termination received on $(hostname); cleaning up..." - sudo rm -rf ${SLURM_SUBMIT_DIR}/logs 2>/dev/null || true + rm -rf ${SLURM_SUBMIT_DIR}/logs 2>/dev/null || true echo "[${SLURM_JOB_ID}] cleanup done." 
} @@ -240,10 +250,10 @@ set -euo pipefail echo \"Rank \$SLURM_PROCID on \$(hostname)\" # Pre-clean (idempotent) -sudo docker ps -aq --filter \"name=^container_vllm_\" | xargs -r sudo docker rm -f || true -sudo docker ps -aq | xargs -r sudo docker stop || true +$DOCKER_CMD ps -aq --filter \"name=^container_vllm_\" | xargs -r $DOCKER_CMD rm -f || true +$DOCKER_CMD ps -aq | xargs -r $DOCKER_CMD stop || true -exec sudo docker run --rm \ +exec $DOCKER_CMD run --rm \ --init \ --stop-timeout 10 \ --device /dev/dri \ @@ -320,4 +330,4 @@ if [[ \$DOCKER_EXIT_CODE -ne 0 ]]; then fi " -srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c 'sudo docker rm -f $DOCKER_CONT_NAME 2>/dev/null || true' +srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c "$DOCKER_CMD rm -f \$DOCKER_CONT_NAME 2>/dev/null || true" diff --git a/benchmarks/multi_node/vllm_disagg_utils/models.yaml b/benchmarks/multi_node/vllm_disagg_utils/models.yaml index 4a720785a..ef062e5f4 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/models.yaml +++ b/benchmarks/multi_node/vllm_disagg_utils/models.yaml @@ -32,7 +32,7 @@ DeepSeek-V3: DeepSeek-R1-0528: prefill_flags: "--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1" decode_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1" - env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0 VLLM_ENGINE_READY_TIMEOUT_S=1200" + env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0 VLLM_ENGINE_READY_TIMEOUT_S=3600" hf_dir: "models--deepseek-ai--DeepSeek-R1-0528" gpt-oss-120b: diff --git a/benchmarks/multi_node/vllm_disagg_utils/server.sh b/benchmarks/multi_node/vllm_disagg_utils/server.sh index 55538d4fa..d21bdbebb 
100755 --- a/benchmarks/multi_node/vllm_disagg_utils/server.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/server.sh @@ -82,22 +82,21 @@ setup_rdma_env() { fi # Patch Nixl UCX backend: set ucx_error_handling_mode=none. - # Only needed for Pensando ionic NICs which don't support rdmacm — the default - # UCP_ERR_HANDLING_MODE_PEER causes "no active messages transport" errors. - # ConnectX/mlx5 NICs (mia1 cluster) handle error mode properly; skip the patch. - if [[ "${IBDEVICES:-}" == *ionic* ]]; then - local nixl_api - nixl_api=$(python3 -c "import rixl._api; print(rixl._api.__file__)" 2>/dev/null) - if [[ -n "$nixl_api" ]]; then - if ! grep -q 'ucx_error_handling_mode' "$nixl_api"; then - sed -i '/self\.create_backend(bknd, init)/i\ init["ucx_error_handling_mode"] = "none"' "$nixl_api" - echo "[PATCH] Added ucx_error_handling_mode=none to $nixl_api" - else - echo "[PATCH] ucx_error_handling_mode already set in $nixl_api" - fi + # Required for ALL NIC types under high concurrency (C512+). Without this, + # UCX's default UCP_ERR_HANDLING_MODE_PEER triggers transport-level error + # recovery on ibv_post_send failures, preventing RIXL RDMA READ retries from + # recovering gracefully. This causes the prefill KV cache to fill to 100% + # and deadlock the pipeline. On ionic NICs this was already applied (rdmacm + # incompatibility); on mlx5 NICs it was incorrectly skipped. + local nixl_api + nixl_api=$(python3 -c "import rixl._api; print(rixl._api.__file__)" 2>/dev/null) + if [[ -n "$nixl_api" ]]; then + if ! 
grep -q 'ucx_error_handling_mode' "$nixl_api"; then + sed -i '/self\.create_backend(bknd, init)/i\ init["ucx_error_handling_mode"] = "none"' "$nixl_api" + echo "[PATCH] Added ucx_error_handling_mode=none to $nixl_api (IBDEVICES=${IBDEVICES:-unset})" + else + echo "[PATCH] ucx_error_handling_mode already set in $nixl_api" fi - else - echo "[INFO] Non-ionic RDMA devices (${IBDEVICES:-unset}); skipping ucx_error_handling_mode patch" fi } diff --git a/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh b/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh index 467e1bd5a..a6b1f79cb 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh @@ -171,8 +171,18 @@ install_mori_proxy_deps() { # GPU kernels are JIT-compiled on first use; no hipcc needed at install. # --------------------------------------------------------------------------- install_mori() { - if python3 -c "import mori" 2>/dev/null; then - echo "[SETUP] MoRI Python bindings already present" + local MORI_TARGET_COMMIT="b645fc8" + local MORI_MARKER="$(python3 -c "import sysconfig; print(sysconfig.get_paths()['purelib'])")/.mori_commit_${MORI_TARGET_COMMIT}" + + # The pre-installed MoRI in vllm base images has a PCI topology bug: it + # only maps the secondary bus of each bridge instead of the full + # secondary-to-subordinate range (dsp2dev). This causes an assertion + # failure in TopoSystemPci::Load() on nodes with deeply-nested PCIe + # switch topologies (e.g. Broadcom PEX890xx on MI355X mia1 nodes). + # Always rebuild from the target commit unless the marker file proves + # the correct version was already installed in this container. + if [[ -e "$MORI_MARKER" ]]; then + echo "[SETUP] MoRI @ $MORI_TARGET_COMMIT already installed (marker found)" return 0 fi @@ -181,19 +191,22 @@ install_mori() { libopenmpi-dev openmpi-bin libpci-dev \ && rm -rf /var/lib/apt/lists/* - echo "[SETUP] Building MoRI from source (ROCm/mori @ b645fc8)..." 
+ echo "[SETUP] Building MoRI from source (ROCm/mori @ $MORI_TARGET_COMMIT)..." + echo "[SETUP] (overriding pre-installed version to fix PCI topology bug)" ( set -e git_clone_retry https://github.com/ROCm/mori.git /opt/mori && cd /opt/mori - git checkout b645fc8 - pip install --quiet . + git checkout "$MORI_TARGET_COMMIT" + pip install --quiet --force-reinstall . ) rm -rf /opt/mori if ! python3 -c "import mori" 2>/dev/null; then echo "[SETUP] ERROR: MoRI build failed"; exit 1 fi - _SETUP_INSTALLED+=("MoRI") + # Drop a marker so re-entry doesn't rebuild + touch "$MORI_MARKER" + _SETUP_INSTALLED+=("MoRI@$MORI_TARGET_COMMIT") } # --------------------------------------------------------------------------- diff --git a/benchmarks/multi_node/vllm_disagg_utils/submit.sh b/benchmarks/multi_node/vllm_disagg_utils/submit.sh index 5d733b010..c5404ec18 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/submit.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/submit.sh @@ -112,6 +112,13 @@ if [[ -n "${NODE_LIST//[[:space:]]/}" ]]; then NODELIST_OPT=(--nodelist "$NODELIST_CSV") fi +# Optional: exclude specific nodes (e.g. nodes with broken Docker sockets). +# Set SLURM_EXCLUDE_NODES env var to a comma-separated list of hostnames. 
+EXCLUDE_OPT=() +if [[ -n "${SLURM_EXCLUDE_NODES:-}" ]]; then + EXCLUDE_OPT=(--exclude "$SLURM_EXCLUDE_NODES") +fi + # Construct the sbatch command sbatch_cmd=( sbatch @@ -119,6 +126,7 @@ sbatch_cmd=( -N "$NUM_NODES" -n "$NUM_NODES" "${NODELIST_OPT[@]}" + "${EXCLUDE_OPT[@]}" --time "$TIME_LIMIT" --partition "$SLURM_PARTITION" --account "$SLURM_ACCOUNT" From 728f91a917ce8be3437d0c2b43f7d3f90fc9317d Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Sun, 22 Mar 2026 12:38:46 +0000 Subject: [PATCH 13/19] [AMD] Fix vLLM disagg sweep hang: KV cache leak + benchmark client hardening MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Server-side: RIXL can lose `finished_sending` notifications under high concurrency with ibv_post_send failures, permanently leaking prefill KV blocks. Over multiple benchmark rounds (sweep), leaked blocks accumulate and saturate the prefill KV cache, deadlocking C512. - Fix finished_sending handler to unconditionally free KV blocks (the conditional status check had no recovery path, causing leaks) - Add idle KV block reaper: detects engine idle >5s with finished requests still holding blocks, then force-frees them - Add 10s cooldown between benchmark rounds for reaper activation Client-side: SSE streaming loop did not break on the [DONE] sentinel, causing the benchmark client to hang when the proxy held connections open after request completion. - Break SSE loop on [DONE] in completions and chat completions - Share a single aiohttp.ClientSession across all requests (connection pooling via TCPConnector instead of per-request session creation) - Add asyncio.wait_for timeout around asyncio.gather with proper task cancellation and partial result collection - Reduce AIOHTTP_TIMEOUT from 6h to 30min Verified: sweep 1K/1K C128→C256→C512 all pass (Job 6222, 9N cluster). 
--- .../multi_node/vllm_disagg_utils/bench.sh | 2 + .../vllm_disagg_utils/setup_deps.sh | 123 ++++++++++++- utils/bench_serving/backend_request_func.py | 170 +++++++++++------- utils/bench_serving/benchmark_serving.py | 58 ++++-- 4 files changed, 264 insertions(+), 89 deletions(-) diff --git a/benchmarks/multi_node/vllm_disagg_utils/bench.sh b/benchmarks/multi_node/vllm_disagg_utils/bench.sh index 37b9d0b56..5b9f5c772 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/bench.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/bench.sh @@ -70,4 +70,6 @@ for max_concurrency in "${chosen_concurrencies[@]}"; do --result-dir /workspace/ echo "-----------------------------------------" + echo "[BENCH] Cooldown: waiting 10s for idle KV block reaper..." + sleep 10 done diff --git a/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh b/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh index a6b1f79cb..a95591cb5 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh @@ -683,14 +683,7 @@ try: if req_id not in self.requests: logger.debug("Request %s already removed, skipping send", req_id) continue - req = self.requests[req_id] - if RequestStatus.is_finished(req.status): - self._free_blocks(req) - else: - logger.debug( - "Request %s send finished but status=%s, " - "deferring block free to request completion", - req_id, req.status.name)""" + self._free_blocks(self.requests[req_id])""" if old_send in new_src: new_src = new_src.replace(old_send, new_send, 1) @@ -706,6 +699,119 @@ except Exception as e: _SETUP_INSTALLED+=("scheduler-read-mode-fix") } +# --------------------------------------------------------------------------- +# 12. Idle KV block reaper for disaggregated prefill (READ mode) +# The RIXL notification path can lose `finished_sending` signals under +# high concurrency with ibv_post_send failures. 
This leaves KV blocks +# permanently allocated on the prefill engine even after the decode has +# finished reading. Over multiple benchmark rounds, leaked blocks +# accumulate and eventually saturate the prefill KV cache. +# +# Fix: instrument the scheduler's `_update_from_kv_xfer_finished()` method +# to detect idle periods (0 running, 0 waiting for >5s) and force-free +# blocks for any remaining requests whose status is finished. +# --------------------------------------------------------------------------- +patch_prefill_idle_kv_reaper() { + python3 -c ' +import os, sys + +try: + import vllm.v1.core.sched.scheduler as smod + f = smod.__file__ + src = open(f).read() + + if "[PATCHED] idle-kv-reaper" in src: + print("[SETUP] idle KV block reaper already applied") + sys.exit(0) + + # Find the _update_from_kv_xfer_finished method end and add reaper logic + # We inject into the method that processes KV transfer completions. + marker = "[PATCHED] read-mode recv assertion" + if marker not in src: + print("[SETUP] WARN: scheduler read-mode patch not found, skipping reaper") + sys.exit(0) + + # Add reaper state initialization to __init__ + old_init_marker = "self.finished_recving_kv_req_ids" + if old_init_marker not in src: + print("[SETUP] WARN: finished_recving_kv_req_ids not found in scheduler") + sys.exit(0) + + # Find the first occurrence to insert reaper state + init_pos = src.find(old_init_marker) + # Find the line containing it + line_end = src.find("\n", init_pos) + init_line = src[init_pos:line_end] + + # Add reaper state after this line + reaper_init = init_line + """ + # [PATCHED] idle-kv-reaper state + self._idle_kv_reaper_ts = 0.0 + self._idle_kv_reaper_active = False""" + + src = src.replace(init_line, reaper_init, 1) + + # Now add the reaper logic at the end of _update_from_kv_xfer_finished + # Find the finished_sending handler we patched + send_handler = """ for req_id in kv_connector_output.finished_sending or (): + logger.debug("Finished sending KV transfer for request 
%s", req_id) + if req_id not in self.requests: + logger.debug("Request %s already removed, skipping send", req_id) + continue + self._free_blocks(self.requests[req_id])""" + + reaper_logic = send_handler + """ + + # [PATCHED] idle-kv-reaper — force-free leaked prefill KV blocks + import time as _time + _REAPER_IDLE_SECS = 5.0 + _num_running = sum(1 for r in self.requests.values() + if r.status == RequestStatus.RUNNING) + _num_waiting = sum(1 for r in self.requests.values() + if r.status == RequestStatus.WAITING) + _is_idle = (_num_running == 0 and _num_waiting == 0) + + if _is_idle: + if not self._idle_kv_reaper_active: + self._idle_kv_reaper_active = True + self._idle_kv_reaper_ts = _time.monotonic() + elif _time.monotonic() - self._idle_kv_reaper_ts > _REAPER_IDLE_SECS: + _reaped = 0 + _reap_ids = [] + for _rid, _req in list(self.requests.items()): + if RequestStatus.is_finished(_req.status): + _reap_ids.append(_rid) + for _rid in _reap_ids: + try: + _req = self.requests[_rid] + self._free_blocks(_req) + _reaped += 1 + except Exception as _e: + logger.debug("[KV-REAPER] free_blocks failed for %s: %s", _rid, _e) + if _reaped > 0: + logger.warning( + "[KV-REAPER] Force-freed blocks for %d finished " + "requests after %.1fs idle", + _reaped, _time.monotonic() - self._idle_kv_reaper_ts) + self._idle_kv_reaper_ts = _time.monotonic() + else: + self._idle_kv_reaper_active = False""" + + if send_handler in src: + src = src.replace(send_handler, reaper_logic, 1) + else: + print("[SETUP] WARN: send handler not found for reaper injection") + sys.exit(0) + + open(f, "w").write(src) + print("[SETUP] Patched: idle KV block reaper for prefill") + +except Exception as e: + print(f"[SETUP] WARN patch idle-kv-reaper: {e}", file=sys.stderr) +' + _SETUP_INSTALLED+=("idle-kv-reaper") +} + # ============================================================================= # Run installers # ============================================================================= @@ -720,6 +826,7 @@ 
patch_moriio_save_kv_timeout patch_moriio_transfer_timeout patch_moriio_load_kv_timeout patch_scheduler_read_mode_fix +patch_prefill_idle_kv_reaper if [[ "${NODE_RANK:-0}" -eq 0 ]]; then install_mori_proxy_deps diff --git a/utils/bench_serving/backend_request_func.py b/utils/bench_serving/backend_request_func.py index 32331a398..5ba629c06 100644 --- a/utils/bench_serving/backend_request_func.py +++ b/utils/bench_serving/backend_request_func.py @@ -14,7 +14,7 @@ from transformers import (AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast) -AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) +AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=30 * 60) @dataclass @@ -49,12 +49,16 @@ class RequestFuncOutput: async def async_request_tgi( request_func_input: RequestFuncInput, pbar: Optional[tqdm] = None, + session: Optional[aiohttp.ClientSession] = None, ) -> RequestFuncOutput: api_url = request_func_input.api_url assert api_url.endswith("generate_stream") - async with aiohttp.ClientSession(trust_env=True, - timeout=AIOHTTP_TIMEOUT) as session: + _own_session = session is None + if _own_session: + session = aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) + try: params = { "best_of": request_func_input.best_of, "max_new_tokens": request_func_input.output_len, @@ -62,7 +66,6 @@ async def async_request_tgi( "temperature": 0.01, # TGI does not accept 0.0 temperature. "top_p": 0.99, # TGI does not accept 1.0 top_p. "truncate": request_func_input.prompt_len, - # TGI does not accept ignore_eos flag. 
} payload = { "inputs": request_func_input.prompt, @@ -113,21 +116,28 @@ async def async_request_tgi( output.success = False exc_info = sys.exc_info() output.error = "".join(traceback.format_exception(*exc_info)) + finally: + if _own_session: + await session.close() - if pbar: - pbar.update(1) - return output + if pbar: + pbar.update(1) + return output async def async_request_trt_llm( request_func_input: RequestFuncInput, pbar: Optional[tqdm] = None, + session: Optional[aiohttp.ClientSession] = None, ) -> RequestFuncOutput: api_url = request_func_input.api_url assert api_url.endswith("generate_stream") - async with aiohttp.ClientSession(trust_env=True, - timeout=AIOHTTP_TIMEOUT) as session: + _own_session = session is None + if _own_session: + session = aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) + try: assert request_func_input.best_of == 1 payload = { "accumulate_tokens": True, @@ -181,18 +191,25 @@ async def async_request_trt_llm( output.success = False exc_info = sys.exc_info() output.error = "".join(traceback.format_exception(*exc_info)) + finally: + if _own_session: + await session.close() - if pbar: - pbar.update(1) - return output + if pbar: + pbar.update(1) + return output async def async_request_deepspeed_mii( request_func_input: RequestFuncInput, pbar: Optional[tqdm] = None, + session: Optional[aiohttp.ClientSession] = None, ) -> RequestFuncOutput: - async with aiohttp.ClientSession(trust_env=True, - timeout=AIOHTTP_TIMEOUT) as session: + _own_session = session is None + if _own_session: + session = aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) + try: assert request_func_input.best_of == 1 payload = { @@ -225,23 +242,30 @@ async def async_request_deepspeed_mii( output.success = False exc_info = sys.exc_info() output.error = "".join(traceback.format_exception(*exc_info)) + finally: + if _own_session: + await session.close() - if pbar: - pbar.update(1) - return output + if pbar: + pbar.update(1) + return output async 
def async_request_openai_completions( request_func_input: RequestFuncInput, pbar: Optional[tqdm] = None, + session: Optional[aiohttp.ClientSession] = None, ) -> RequestFuncOutput: api_url = request_func_input.api_url assert api_url.endswith( ("completions", "profile") ), "OpenAI Completions API URL must end with 'completions' or 'profile'." - async with aiohttp.ClientSession(trust_env=True, - timeout=AIOHTTP_TIMEOUT) as session: + _own_session = session is None + if _own_session: + session = aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) + try: payload = { "model": request_func_input.model_name \ if request_func_input.model_name else request_func_input.model, @@ -281,33 +305,35 @@ async def async_request_openai_completions( chunk = chunk_bytes.decode("utf-8").removeprefix( "data: ") - if chunk != "[DONE]": - data = json.loads(chunk) - - # NOTE: Some completion API might have a last - # usage summary response without a token so we - # want to check a token was generated - if choices := data.get("choices"): - # Note that text could be empty here - # e.g. for special tokens - text = choices[0].get("text") - timestamp = time.perf_counter() - # First token - if not first_chunk_received: - first_chunk_received = True - ttft = time.perf_counter() - st - output.ttft = ttft - - # Decoding phase - else: - output.itl.append(timestamp - - most_recent_timestamp) - - most_recent_timestamp = timestamp - generated_text += text or "" - elif usage := data.get("usage"): - output.output_tokens = usage.get( - "completion_tokens") + if chunk == "[DONE]": + break + + data = json.loads(chunk) + + # NOTE: Some completion API might have a last + # usage summary response without a token so we + # want to check a token was generated + if choices := data.get("choices"): + # Note that text could be empty here + # e.g. 
for special tokens + text = choices[0].get("text") + timestamp = time.perf_counter() + # First token + if not first_chunk_received: + first_chunk_received = True + ttft = time.perf_counter() - st + output.ttft = ttft + + # Decoding phase + else: + output.itl.append(timestamp - + most_recent_timestamp) + + most_recent_timestamp = timestamp + generated_text += text or "" + elif usage := data.get("usage"): + output.output_tokens = usage.get( + "completion_tokens") if first_chunk_received: output.success = True else: @@ -324,6 +350,9 @@ async def async_request_openai_completions( output.success = False exc_info = sys.exc_info() output.error = "".join(traceback.format_exception(*exc_info)) + finally: + if _own_session: + await session.close() if pbar: pbar.update(1) @@ -333,14 +362,18 @@ async def async_request_openai_completions( async def async_request_openai_chat_completions( request_func_input: RequestFuncInput, pbar: Optional[tqdm] = None, + session: Optional[aiohttp.ClientSession] = None, ) -> RequestFuncOutput: api_url = request_func_input.api_url assert api_url.endswith( "chat/completions" ), "OpenAI Chat Completions API URL must end with 'chat/completions'." 
- async with aiohttp.ClientSession(trust_env=True, - timeout=AIOHTTP_TIMEOUT) as session: + _own_session = session is None + if _own_session: + session = aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) + try: content = [{"type": "text", "text": request_func_input.prompt}] if request_func_input.multi_modal_content: content.append(request_func_input.multi_modal_content) @@ -387,28 +420,30 @@ async def async_request_openai_chat_completions( chunk = chunk_bytes.decode("utf-8").removeprefix( "data: ") - if chunk != "[DONE]": - timestamp = time.perf_counter() - data = json.loads(chunk) + if chunk == "[DONE]": + break - if choices := data.get("choices"): - content = choices[0]["delta"].get("content") - # First token - if ttft == 0.0: - ttft = timestamp - st - output.ttft = ttft + timestamp = time.perf_counter() + data = json.loads(chunk) - # Decoding phase - else: - output.itl.append(timestamp - - most_recent_timestamp) + if choices := data.get("choices"): + content = choices[0]["delta"].get("content") + # First token + if ttft == 0.0: + ttft = timestamp - st + output.ttft = ttft - generated_text += content or "" - elif usage := data.get("usage"): - output.output_tokens = usage.get( - "completion_tokens") + # Decoding phase + else: + output.itl.append(timestamp - + most_recent_timestamp) - most_recent_timestamp = timestamp + generated_text += content or "" + elif usage := data.get("usage"): + output.output_tokens = usage.get( + "completion_tokens") + + most_recent_timestamp = timestamp output.generated_text = generated_text output.success = True @@ -420,6 +455,9 @@ async def async_request_openai_chat_completions( output.success = False exc_info = sys.exc_info() output.error = "".join(traceback.format_exception(*exc_info)) + finally: + if _own_session: + await session.close() if pbar: pbar.update(1) diff --git a/utils/bench_serving/benchmark_serving.py b/utils/bench_serving/benchmark_serving.py index 70334ea16..88cc02676 100644 --- 
a/utils/bench_serving/benchmark_serving.py +++ b/utils/bench_serving/benchmark_serving.py @@ -26,6 +26,7 @@ import argparse import asyncio import base64 +import contextlib import gc import io import json @@ -37,9 +38,10 @@ from datetime import datetime from typing import Any, AsyncGenerator, Collection, Dict, List, Optional, Tuple +import aiohttp import numpy as np -from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput, - RequestFuncOutput) +from backend_request_func import (AIOHTTP_TIMEOUT, ASYNC_REQUEST_FUNCS, + RequestFuncInput, RequestFuncOutput) from tqdm.asyncio import tqdm from transformers import PreTrainedTokenizerBase @@ -348,11 +350,14 @@ async def benchmark( else: raise ValueError(f"Unknown backend: {backend}") + connector = aiohttp.TCPConnector(limit=0, enable_cleanup_closed=True) + shared_session = aiohttp.ClientSession( + trust_env=True, timeout=AIOHTTP_TIMEOUT, connector=connector) + print("Starting initial single prompt test run...") test_prompt, test_prompt_len, test_output_len, test_mm_content = ( input_requests[0]) if backend != "openai-chat" and test_mm_content is not None: - # multi-modal benchmark is only available on OpenAI Chat backend. 
raise ValueError( "Multi-modal content is only supported on 'openai-chat' backend.") test_input = RequestFuncInput( @@ -371,11 +376,13 @@ async def benchmark( if num_warmups > 0: print(f"Warming up with {num_warmups} requests...") warmup_pbar = None if disable_tqdm else tqdm(total=num_warmups) - warmup_semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else contextlib.nullcontext() + warmup_semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else asyncio.Semaphore(num_warmups) async def warmup_limited_req_fn(): async with warmup_semaphore: - return await request_func(request_func_input=test_input, pbar=warmup_pbar) + return await request_func( + request_func_input=test_input, pbar=warmup_pbar, + session=shared_session) warmup_tasks = [] for _ in range(num_warmups): @@ -388,7 +395,6 @@ async def warmup_limited_req_fn(): print("Warmup completed.") if lora_modules: - # For each input request, choose a LoRA module at random. lora_modules = iter( [random.choice(lora_modules) for _ in range(len(input_requests))]) @@ -405,7 +411,8 @@ async def warmup_limited_req_fn(): best_of=best_of, multi_modal_content=test_mm_content, ignore_eos=ignore_eos) - profile_output = await request_func(request_func_input=profile_input) + profile_output = await request_func( + request_func_input=profile_input, session=shared_session) if profile_output.success: print("Profiler started") @@ -420,20 +427,16 @@ async def warmup_limited_req_fn(): pbar = None if disable_tqdm else tqdm(total=len(input_requests)) - # This can be used once the minimum Python version is 3.10 or higher, - # and it will simplify the code in limited_request_func. 
- # semaphore = (asyncio.Semaphore(max_concurrency) - # if max_concurrency else contextlib.nullcontext()) semaphore = (asyncio.Semaphore(max_concurrency) if max_concurrency else None) async def limited_request_func(request_func_input, pbar): if semaphore is None: return await request_func(request_func_input=request_func_input, - pbar=pbar) + pbar=pbar, session=shared_session) async with semaphore: return await request_func(request_func_input=request_func_input, - pbar=pbar) + pbar=pbar, session=shared_session) print("Starting main benchmark run...") @@ -460,7 +463,28 @@ async def limited_request_func(request_func_input, pbar): asyncio.create_task( limited_request_func(request_func_input=request_func_input, pbar=pbar))) - outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks) + gather_timeout = max(7200, len(input_requests) * 30) + try: + outputs: List[RequestFuncOutput] = await asyncio.wait_for( + asyncio.gather(*tasks), timeout=gather_timeout) + except asyncio.TimeoutError: + completed = pbar.n if pbar else "?" + print(f"\n[WARNING] Benchmark timed out after {gather_timeout}s " + f"({completed}/{len(tasks)} requests completed). 
" + "Collecting partial results...") + for task in tasks: + if not task.done(): + task.cancel() + await asyncio.gather(*tasks, return_exceptions=True) + outputs = [] + for task in tasks: + if task.done() and not task.cancelled(): + try: + outputs.append(task.result()) + except Exception: + outputs.append(RequestFuncOutput()) + else: + outputs.append(RequestFuncOutput()) if profile: print("Stopping profiler...") @@ -473,10 +497,14 @@ async def limited_request_func(request_func_input, pbar): logprobs=logprobs, best_of=best_of, ) - profile_output = await request_func(request_func_input=profile_input) + profile_output = await request_func( + request_func_input=profile_input, session=shared_session) if profile_output.success: print("Profiler stopped") + await shared_session.close() + await connector.close() + if pbar is not None: pbar.close() From a163fd64d5b16f622cdbb706a602dab611f2c1f5 Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Sun, 22 Mar 2026 18:21:22 +0000 Subject: [PATCH 14/19] [AMD] Fix vLLM disagg Slurm job never terminating after benchmark completion Background processes (proxy, prefill, decode, etcd) were started via `cmd 2>&1 | tee logfile &`, causing bash $! to capture the PID of tee rather than the actual process. `kill $pid` only killed tee, leaving the real process running. The proxy kept port 30000 open, so decode nodes' `sync.py wait` never detected shutdown and the Slurm job hung forever. Additionally, etcd's stderr was not redirected, holding the Docker container's main pipe open and preventing container exit even after server.sh completed. Changes: - Redirect all background processes to log files instead of piping through tee, so $! 
captures the correct PID (matches SGLang pattern) - Redirect etcd launcher's stderr to prevent pipe leak - Add pkill fallback cleanup for proxy, vllm, and etcd processes - Increase barrier grace period to handle node setup time variance - Increase container creation barrier timeout from 300s to 600s --- .../multi_node/vllm_disagg_utils/server.sh | 29 +++++++++++-------- .../multi_node/vllm_disagg_utils/sync.py | 5 +++- 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/benchmarks/multi_node/vllm_disagg_utils/server.sh b/benchmarks/multi_node/vllm_disagg_utils/server.sh index d21bdbebb..8a149e776 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/server.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/server.sh @@ -162,14 +162,14 @@ python3 $VLLM_WS_PATH/sync.py barrier \ --node-ips ${IPADDRS} \ --node-ports 5000 \ --wait-for-all-ports \ - --timeout 300 + --timeout 600 # ============================================================================= # ETCD Server Setup # ============================================================================= echo "Proceeding to start etcd server on $host_name" -bash ${VLLM_WS_PATH}/start_etcd.sh > /dev/null & +bash ${VLLM_WS_PATH}/start_etcd.sh > /dev/null 2>&1 & etcd_pid=$! echo "Waiting at etcd server barrier on $host_name" @@ -260,7 +260,7 @@ if [ "$NODE_RANK" -eq 0 ]; then else PROXY_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/moriio_proxy_${host_name}.log" set -x - eval "$PROXY_CMD" 2>&1 | tee "$PROXY_LOG_FILE" & + eval "$PROXY_CMD" > "$PROXY_LOG_FILE" 2>&1 & set +x proxy_pid=$! sleep 3 @@ -275,9 +275,9 @@ if [ "$NODE_RANK" -eq 0 ]; then if [[ "$DRY_RUN" -eq 1 ]]; then echo "DRY RUN: $PREFILL_CMD" else + PREFILL_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log" set -x - eval "$PREFILL_CMD" \ - 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log & + eval "$PREFILL_CMD" > "$PREFILL_LOG_FILE" 2>&1 & set +x prefill_pid=$! 
fi @@ -341,6 +341,10 @@ if [ "$NODE_RANK" -eq 0 ]; then if [[ "$DRY_RUN" -eq 0 ]]; then [[ -n "${proxy_pid:-}" ]] && kill $proxy_pid 2>/dev/null || true [[ -n "${prefill_pid:-}" ]] && kill $prefill_pid 2>/dev/null || true + sleep 2 + # Fallback: ensure no orphaned processes keep ports open + pkill -f moriio_proxy 2>/dev/null || true + pkill -f "vllm serve" 2>/dev/null || true fi elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$xP" ]; then @@ -358,9 +362,9 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$xP" ]; then if [[ "$DRY_RUN" -eq 1 ]]; then echo "DRY RUN: $PREFILL_CMD" else + PREFILL_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log" set -x - eval "$PREFILL_CMD" \ - 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log & + eval "$PREFILL_CMD" > "$PREFILL_LOG_FILE" 2>&1 & set +x prefill_pid=$! fi @@ -390,7 +394,7 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$xP" ]; then fi echo "Killing the prefill server" - [[ "$DRY_RUN" -eq 0 ]] && kill $prefill_pid + [[ "$DRY_RUN" -eq 0 ]] && kill $prefill_pid 2>/dev/null || true else echo "${host_name}:${host_ip} is Decode Node (Model: ${MODEL_NAME})" @@ -412,9 +416,9 @@ else if [[ "$DRY_RUN" -eq 1 ]]; then echo "DRY RUN: $DECODE_CMD" else + DECODE_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/decode_${host_name}.log" set -x - eval "$DECODE_CMD" \ - 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/decode_${host_name}.log & + eval "$DECODE_CMD" > "$DECODE_LOG_FILE" 2>&1 & set +x decode_pid=$! 
fi @@ -444,11 +448,12 @@ else fi echo "Killing the decode server" - [[ "$DRY_RUN" -eq 0 ]] && kill $decode_pid + [[ "$DRY_RUN" -eq 0 ]] && kill $decode_pid 2>/dev/null || true fi echo "Killing the etcd server" -kill $etcd_pid +kill $etcd_pid 2>/dev/null || true +pkill -f etcd 2>/dev/null || true echo "Script completed successfully" exit 0 diff --git a/benchmarks/multi_node/vllm_disagg_utils/sync.py b/benchmarks/multi_node/vllm_disagg_utils/sync.py index 140951519..3678e7614 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/sync.py +++ b/benchmarks/multi_node/vllm_disagg_utils/sync.py @@ -143,7 +143,10 @@ def close_port(): time.sleep(30) if args.enable_port: - time.sleep(30) + # Keep the port open long enough for slow nodes to pass their barrier. + # The previous 30s was too short when setup times vary by minutes. + grace = max(60, args.timeout // 2) if args.timeout > 0 else 300 + time.sleep(grace) close_port() From cb52c29f66308e778b4cd0f04c9208b60ac12c6b Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Sun, 22 Mar 2026 20:44:27 +0000 Subject: [PATCH 15/19] [AMD] Enable MoRI-IO READ mode by default for vLLM disagg --- .github/configs/amd-master.yaml | 3 +++ benchmarks/multi_node/vllm_disagg_utils/job.slurm | 2 +- benchmarks/multi_node/vllm_disagg_utils/submit.sh | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index f36b23795..282d78cdf 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1051,6 +1051,7 @@ dsr1-fp8-mi355x-vllm-disagg: dp-attn: false additional-settings: - "PREFILL_NODES=1" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" decode: num-worker: 2 tp: 8 @@ -1071,6 +1072,7 @@ dsr1-fp8-mi355x-vllm-disagg: dp-attn: false additional-settings: - "PREFILL_NODES=1" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" decode: num-worker: 2 tp: 8 @@ -1091,6 +1093,7 @@ dsr1-fp8-mi355x-vllm-disagg: dp-attn: false additional-settings: - "PREFILL_NODES=1" + - 
"VLLM_MORIIO_CONNECTOR_READ_MODE=1" decode: num-worker: 2 tp: 8 diff --git a/benchmarks/multi_node/vllm_disagg_utils/job.slurm b/benchmarks/multi_node/vllm_disagg_utils/job.slurm index 904aaaff4..c555f6948 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/job.slurm +++ b/benchmarks/multi_node/vllm_disagg_utils/job.slurm @@ -315,7 +315,7 @@ exec $DOCKER_CMD run --rm \ -e UCX_LOG_LEVEL=warn \ -e HSA_ENABLE_SDMA=1 \ -e PROXY_STREAM_IDLE_TIMEOUT=\${PROXY_STREAM_IDLE_TIMEOUT:-300} \ - -e VLLM_MORIIO_CONNECTOR_READ_MODE=\${VLLM_MORIIO_CONNECTOR_READ_MODE:-0} \ + -e VLLM_MORIIO_CONNECTOR_READ_MODE=\${VLLM_MORIIO_CONNECTOR_READ_MODE:-1} \ --name \"$DOCKER_CONT_NAME\" \ --entrypoint \"\" \ \"$DOCKER_IMAGE_NAME\" bash -lc ' diff --git a/benchmarks/multi_node/vllm_disagg_utils/submit.sh b/benchmarks/multi_node/vllm_disagg_utils/submit.sh index c5404ec18..7063aa7a8 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/submit.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/submit.sh @@ -93,7 +93,7 @@ export BENCH_REQUEST_RATE=${REQUEST_RATE} export BENCH_RANDOM_RANGE_RATIO=${RANDOM_RANGE_RATIO:-0.8} export PROXY_STREAM_IDLE_TIMEOUT=${PROXY_STREAM_IDLE_TIMEOUT:-300} -export VLLM_MORIIO_CONNECTOR_READ_MODE=${VLLM_MORIIO_CONNECTOR_READ_MODE:-0} +export VLLM_MORIIO_CONNECTOR_READ_MODE=${VLLM_MORIIO_CONNECTOR_READ_MODE:-1} # Log directory: must be on NFS (shared filesystem) so the submit host can read SLURM output. 
export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}" From 25a0310d41a88e274c51d28796fa29a17bfa681e Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Sun, 22 Mar 2026 20:57:24 +0000 Subject: [PATCH 16/19] [AMD] Fix CI checkout failure caused by root-owned __pycache__ files Fix per-node Docker privilege detection in vLLM disagg job.slurm --- .../multi_node/vllm_disagg_utils/job.slurm | 18 ++++++++++++++---- .../multi_node/vllm_disagg_utils/server.sh | 3 +++ 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/benchmarks/multi_node/vllm_disagg_utils/job.slurm b/benchmarks/multi_node/vllm_disagg_utils/job.slurm index c555f6948..d33525081 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/job.slurm +++ b/benchmarks/multi_node/vllm_disagg_utils/job.slurm @@ -64,6 +64,9 @@ GPUS_PER_NODE="${GPUS_PER_NODE:-8}" # ============================================================================= # Docker privilege detection # ============================================================================= +# Detect on the batch host (used for post-srun cleanup). +# Per-node detection happens inside the srun inline script below because +# some nodes may require sudo while others do not. 
if docker ps &>/dev/null; then DOCKER_CMD="docker" else @@ -249,11 +252,18 @@ set -euo pipefail echo \"Rank \$SLURM_PROCID on \$(hostname)\" +# Per-node Docker privilege detection (some nodes need sudo, others don't) +if docker ps &>/dev/null; then + _DCMD=docker +else + _DCMD='sudo docker' +fi + # Pre-clean (idempotent) -$DOCKER_CMD ps -aq --filter \"name=^container_vllm_\" | xargs -r $DOCKER_CMD rm -f || true -$DOCKER_CMD ps -aq | xargs -r $DOCKER_CMD stop || true +\$_DCMD ps -aq --filter \"name=^container_vllm_\" | xargs -r \$_DCMD rm -f || true +\$_DCMD ps -aq | xargs -r \$_DCMD stop || true -exec $DOCKER_CMD run --rm \ +exec \$_DCMD run --rm \ --init \ --stop-timeout 10 \ --device /dev/dri \ @@ -330,4 +340,4 @@ if [[ \$DOCKER_EXIT_CODE -ne 0 ]]; then fi " -srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c "$DOCKER_CMD rm -f \$DOCKER_CONT_NAME 2>/dev/null || true" +srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c 'if docker ps &>/dev/null; then D=docker; else D="sudo docker"; fi; $D rm -f '"$DOCKER_CONT_NAME"' 2>/dev/null || true' diff --git a/benchmarks/multi_node/vllm_disagg_utils/server.sh b/benchmarks/multi_node/vllm_disagg_utils/server.sh index 8a149e776..85a50b38d 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/server.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/server.sh @@ -455,5 +455,8 @@ echo "Killing the etcd server" kill $etcd_pid 2>/dev/null || true pkill -f etcd 2>/dev/null || true +# Clean root-owned __pycache__ so the CI runner can delete the workspace on next checkout +find /workspace -name '__pycache__' -exec rm -rf {} + 2>/dev/null || true + echo "Script completed successfully" exit 0 From 5bbc954e991b930047c2304f9c196efd9b51c5af Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Mon, 23 Mar 2026 09:07:02 +0000 Subject: [PATCH 17/19] [AMD] Fix CI checkout EACCES by redirecting Python bytecache off NFS Docker containers run as root, so __pycache__/*.pyc files created during benchmark_serving.py import end up root-owned on the NFS 
workspace. The CI runner cannot delete them, breaking checkout. Set PYTHONPYCACHEPREFIX=/tmp/pycache in the Docker env so bytecache stays inside the container. Remove the previous server.sh find-and- delete workaround since the root cause is now addressed. --- benchmarks/multi_node/vllm_disagg_utils/job.slurm | 1 + benchmarks/multi_node/vllm_disagg_utils/server.sh | 3 --- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/benchmarks/multi_node/vllm_disagg_utils/job.slurm b/benchmarks/multi_node/vllm_disagg_utils/job.slurm index d33525081..bc04f3b61 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/job.slurm +++ b/benchmarks/multi_node/vllm_disagg_utils/job.slurm @@ -326,6 +326,7 @@ exec \$_DCMD run --rm \ -e HSA_ENABLE_SDMA=1 \ -e PROXY_STREAM_IDLE_TIMEOUT=\${PROXY_STREAM_IDLE_TIMEOUT:-300} \ -e VLLM_MORIIO_CONNECTOR_READ_MODE=\${VLLM_MORIIO_CONNECTOR_READ_MODE:-1} \ + -e PYTHONPYCACHEPREFIX=/tmp/pycache \ --name \"$DOCKER_CONT_NAME\" \ --entrypoint \"\" \ \"$DOCKER_IMAGE_NAME\" bash -lc ' diff --git a/benchmarks/multi_node/vllm_disagg_utils/server.sh b/benchmarks/multi_node/vllm_disagg_utils/server.sh index 85a50b38d..8a149e776 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/server.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/server.sh @@ -455,8 +455,5 @@ echo "Killing the etcd server" kill $etcd_pid 2>/dev/null || true pkill -f etcd 2>/dev/null || true -# Clean root-owned __pycache__ so the CI runner can delete the workspace on next checkout -find /workspace -name '__pycache__' -exec rm -rf {} + 2>/dev/null || true - echo "Script completed successfully" exit 0 From 89ae5168aa60ccdaa10c15819d6c3d199b7a36c1 Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Mon, 23 Mar 2026 16:28:18 +0000 Subject: [PATCH 18/19] [AMD] Fix KV reaper deadlock on high-ISL disagg workloads MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The idle KV block reaper only fired when both running=0 AND waiting=0. 
Under 8K ISL at C64+, leaked blocks filled the prefill KV cache while new requests queued in WAITING state — the non-empty wait queue prevented the reaper from ever triggering, causing a permanent hang. Remove the waiting-queue check so the reaper fires whenever no requests are actively running, which is precisely when leaked blocks can be safely reclaimed. Verified with 8K/1K sweep (C32–C512) completing without hangs. --- benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh b/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh index a95591cb5..e8437a5c9 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/setup_deps.sh @@ -767,11 +767,9 @@ try: _REAPER_IDLE_SECS = 5.0 _num_running = sum(1 for r in self.requests.values() if r.status == RequestStatus.RUNNING) - _num_waiting = sum(1 for r in self.requests.values() - if r.status == RequestStatus.WAITING) - _is_idle = (_num_running == 0 and _num_waiting == 0) + _should_reap = (_num_running == 0) - if _is_idle: + if _should_reap: if not self._idle_kv_reaper_active: self._idle_kv_reaper_active = True self._idle_kv_reaper_ts = _time.monotonic() From f611f47b5c7eafbb4bd9933ca41e65833aa5b64a Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Tue, 24 Mar 2026 08:35:21 +0000 Subject: [PATCH 19/19] [AMD] Enable reading PREFILL_TP,PREFILL_EP,PREFILL_DP_ATTN,DECODE_TP,DECODE_EP,DECODE_DP_ATTN from amd-master.yaml config. 
Signed-off-by: Theresa Shan --- .github/configs/amd-master.yaml | 6 +-- .../multi_node/dsr1_fp8_mi355x_vllm-disagg.sh | 39 +++++++++++++-- .../multi_node/vllm_disagg_utils/job.slurm | 14 ++++++ .../multi_node/vllm_disagg_utils/server.sh | 31 ++++++++++++ .../multi_node/vllm_disagg_utils/submit.sh | 50 +++++++++++++------ 5 files changed, 119 insertions(+), 21 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 282d78cdf..a39a34b74 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1055,7 +1055,7 @@ dsr1-fp8-mi355x-vllm-disagg: decode: num-worker: 2 tp: 8 - ep: 1 + ep: 8 dp-attn: false additional-settings: - "DECODE_NODES=2" @@ -1076,7 +1076,7 @@ dsr1-fp8-mi355x-vllm-disagg: decode: num-worker: 2 tp: 8 - ep: 1 + ep: 8 dp-attn: false additional-settings: - "DECODE_NODES=2" @@ -1097,7 +1097,7 @@ dsr1-fp8-mi355x-vllm-disagg: decode: num-worker: 2 tp: 8 - ep: 1 + ep: 8 dp-attn: false additional-settings: - "DECODE_NODES=2" diff --git a/benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh b/benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh index 172ecdf51..b21e9204a 100755 --- a/benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh +++ b/benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh @@ -11,8 +11,12 @@ check_env_vars \ MODEL_PATH \ PREFILL_NUM_WORKERS \ PREFILL_TP \ + PREFILL_EP \ + PREFILL_DP_ATTN \ DECODE_NUM_WORKERS \ DECODE_TP \ + DECODE_EP \ + DECODE_DP_ATTN \ PREFILL_NODES \ DECODE_NODES \ RANDOM_RANGE_RATIO @@ -30,15 +34,42 @@ export MODEL_PATH=$MODEL_PATH export MODEL_NAME=$MODEL_NAME export CONTAINER_IMAGE=$IMAGE -# PREFILL_NODES and DECODE_NODES come from additional-settings in the YAML config. -# NODELIST (optional) constrains which Slurm nodes are used. 
+# Same EP/DP booleans as dsr1_fp8_mi355x_sglang-disagg.sh → amd_utils/submit.sh +if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then + export PREFILL_ENABLE_EP=false +else + export PREFILL_ENABLE_EP=true +fi + +if [[ "$PREFILL_DP_ATTN" == "true" ]]; then + export PREFILL_ENABLE_DP=true +else + export PREFILL_ENABLE_DP=false +fi + +if [[ "${DECODE_EP:-1}" -eq 1 ]]; then + export DECODE_ENABLE_EP=false +else + export DECODE_ENABLE_EP=true +fi + +if [[ "$DECODE_DP_ATTN" == "true" ]]; then + export DECODE_ENABLE_DP=true +else + export DECODE_ENABLE_DP=false +fi +# Parameter order matches SGLang disagg submit.sh; arg 16 is optional NODELIST. JOB_ID=$(bash ./submit.sh $PREFILL_NODES \ $PREFILL_NUM_WORKERS \ $DECODE_NODES \ $DECODE_NUM_WORKERS \ - $ISL $OSL "${CONC_LIST// /x}" inf "${NODELIST:-}" \ - ${RANDOM_RANGE_RATIO}) + $ISL $OSL "${CONC_LIST// /x}" inf \ + ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \ + ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \ + ${PREFILL_TP} ${DECODE_TP} \ + ${RANDOM_RANGE_RATIO} \ + "${NODELIST:-}") if [[ $? 
-ne 0 ]]; then echo "Failed to submit job" >&2 diff --git a/benchmarks/multi_node/vllm_disagg_utils/job.slurm b/benchmarks/multi_node/vllm_disagg_utils/job.slurm index bc04f3b61..e1cad0817 100644 --- a/benchmarks/multi_node/vllm_disagg_utils/job.slurm +++ b/benchmarks/multi_node/vllm_disagg_utils/job.slurm @@ -217,6 +217,14 @@ export BENCH_REQUEST_RATE=$BENCH_REQUEST_RATE export DRY_RUN="${DRY_RUN:-0}" export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}" +# TP / EP / DP (from vllm_disagg_utils/submit.sh; mirrors amd_utils disagg) +export PREFILL_ENABLE_EP="${PREFILL_ENABLE_EP:-false}" +export PREFILL_ENABLE_DP="${PREFILL_ENABLE_DP:-false}" +export DECODE_ENABLE_EP="${DECODE_ENABLE_EP:-false}" +export DECODE_ENABLE_DP="${DECODE_ENABLE_DP:-false}" +export PREFILL_TP="${PREFILL_TP:-8}" +export DECODE_TP="${DECODE_TP:-8}" + SANITIZED_USER=$(echo "$USER_NAME" | tr -c 'a-zA-Z0-9_.-' '_') export DOCKER_CONT_NAME="container_vllm_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}" export RUN_FILE_FULL="$VLLM_WS_PATH/${RUN_FILE}" @@ -327,6 +335,12 @@ exec \$_DCMD run --rm \ -e PROXY_STREAM_IDLE_TIMEOUT=\${PROXY_STREAM_IDLE_TIMEOUT:-300} \ -e VLLM_MORIIO_CONNECTOR_READ_MODE=\${VLLM_MORIIO_CONNECTOR_READ_MODE:-1} \ -e PYTHONPYCACHEPREFIX=/tmp/pycache \ + -e PREFILL_ENABLE_EP=\$PREFILL_ENABLE_EP \ + -e PREFILL_ENABLE_DP=\$PREFILL_ENABLE_DP \ + -e DECODE_ENABLE_EP=\$DECODE_ENABLE_EP \ + -e DECODE_ENABLE_DP=\$DECODE_ENABLE_DP \ + -e PREFILL_TP=\$PREFILL_TP \ + -e DECODE_TP=\$DECODE_TP \ --name \"$DOCKER_CONT_NAME\" \ --entrypoint \"\" \ \"$DOCKER_IMAGE_NAME\" bash -lc ' diff --git a/benchmarks/multi_node/vllm_disagg_utils/server.sh b/benchmarks/multi_node/vllm_disagg_utils/server.sh index 8a149e776..9b0ff2ebb 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/server.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/server.sh @@ -150,6 +150,37 @@ print(f'DECODE_MODEL_ENVS=\"{dev}\"') echo "Loaded model configuration for: $MODEL_NAME" +# Apply tensor-parallel size 
and EP/DP flags from submit pipeline (YAML PREFILL_TP / dp-attn / ep). +if [[ -n "${PREFILL_TP:-}" ]]; then + if echo "$PREFILL_SERVER_CONFIG" | grep -q -- '--tensor-parallel-size'; then + PREFILL_SERVER_CONFIG=$(echo "$PREFILL_SERVER_CONFIG" | sed -E "s/--tensor-parallel-size[[:space:]]+[0-9]+/--tensor-parallel-size ${PREFILL_TP}/g") + else + PREFILL_SERVER_CONFIG+=" --tensor-parallel-size ${PREFILL_TP}" + fi +fi +if [[ -n "${DECODE_TP:-}" ]]; then + if echo "$DECODE_SERVER_CONFIG" | grep -q -- '--tensor-parallel-size'; then + DECODE_SERVER_CONFIG=$(echo "$DECODE_SERVER_CONFIG" | sed -E "s/--tensor-parallel-size[[:space:]]+[0-9]+/--tensor-parallel-size ${DECODE_TP}/g") + else + DECODE_SERVER_CONFIG+=" --tensor-parallel-size ${DECODE_TP}" + fi +fi +if [[ "${PREFILL_ENABLE_EP:-false}" == "true" ]] && ! echo "$PREFILL_SERVER_CONFIG" | grep -q -- '--enable-expert-parallel'; then + PREFILL_SERVER_CONFIG+=" --enable-expert-parallel" +fi +if [[ "${PREFILL_ENABLE_DP:-false}" == "true" ]] && ! echo "$PREFILL_SERVER_CONFIG" | grep -q -- '--enable-dp-attention'; then + PREFILL_SERVER_CONFIG+=" --enable-dp-attention" +fi +if [[ "${DECODE_ENABLE_EP:-false}" == "true" ]] && ! echo "$DECODE_SERVER_CONFIG" | grep -q -- '--enable-expert-parallel'; then + DECODE_SERVER_CONFIG+=" --enable-expert-parallel" +fi +if [[ "${DECODE_ENABLE_DP:-false}" == "true" ]] && ! 
echo "$DECODE_SERVER_CONFIG" | grep -q -- '--enable-dp-attention'; then + DECODE_SERVER_CONFIG+=" --enable-dp-attention" +fi + +echo "PREFILL_SERVER_CONFIG (after TP/EP/DP): $PREFILL_SERVER_CONFIG" +echo "DECODE_SERVER_CONFIG (after TP/EP/DP): $DECODE_SERVER_CONFIG" + # ============================================================================= # Container Synchronization # ============================================================================= diff --git a/benchmarks/multi_node/vllm_disagg_utils/submit.sh b/benchmarks/multi_node/vllm_disagg_utils/submit.sh index 7063aa7a8..ecb5a9876 100755 --- a/benchmarks/multi_node/vllm_disagg_utils/submit.sh +++ b/benchmarks/multi_node/vllm_disagg_utils/submit.sh @@ -12,19 +12,29 @@ usage() { cat << 'USAGE' Usage: bash submit.sh \ - [NODE_LIST] [RANDOM_RANGE_RATIO] + \ + \ + \ + \ + [NODE_LIST] Arguments: - PREFILL_NODES Number of prefill nodes - PREFILL_WORKERS Number of prefill workers (usually 1) - DECODE_NODES Number of decode nodes - DECODE_WORKERS Number of decode workers (usually 1) - ISL Input sequence length - OSL Output sequence length - CONCURRENCIES Concurrency levels, delimited by 'x' (e.g., "8x16x32") - REQUEST_RATE Request rate ("inf" for max throughput) - NODE_LIST Optional: comma-separated hostnames - RANDOM_RANGE_RATIO Optional: random range ratio for benchmark (default 0.8) + PREFILL_NODES Number of prefill nodes + PREFILL_WORKERS Number of prefill workers (usually 1) + DECODE_NODES Number of decode nodes + DECODE_WORKERS Number of decode workers (usually 1) + ISL Input sequence length + OSL Output sequence length + CONCURRENCIES Concurrency levels, delimited by 'x' (e.g., "8x16x32") + REQUEST_RATE Request rate ("inf" for max throughput) + PREFILL_ENABLE_EP true/false (from PREFILL_EP in YAML; false when EP==1) + PREFILL_ENABLE_DP true/false (data-parallel attention on prefill) + DECODE_ENABLE_EP true/false (from DECODE_EP in YAML) + DECODE_ENABLE_DP true/false (data-parallel attention on decode) + 
PREFILL_TP Tensor parallel size per prefill node + DECODE_TP Tensor parallel size per decode node + RANDOM_RANGE_RATIO Random range ratio for benchmark client + NODE_LIST Optional: comma-separated hostnames (must match NUM_NODES) Required environment variables: SLURM_ACCOUNT SLURM account name @@ -57,7 +67,7 @@ check_env RUNNER_NAME GPUS_PER_NODE="${GPUS_PER_NODE:-8}" -# COMMAND_LINE ARGS +# COMMAND_LINE ARGS (aligned with benchmarks/multi_node/amd_utils/submit.sh) PREFILL_NODES=$1 PREFILL_WORKERS=${2:-1} DECODE_NODES=$3 @@ -66,8 +76,14 @@ ISL=$5 OSL=$6 CONCURRENCIES=$7 REQUEST_RATE=$8 -NODE_LIST=${9} -RANDOM_RANGE_RATIO=${10} +PREFILL_ENABLE_EP=${9:-false} +PREFILL_ENABLE_DP=${10:-false} +DECODE_ENABLE_EP=${11:-false} +DECODE_ENABLE_DP=${12:-false} +PREFILL_TP=${13:-8} +DECODE_TP=${14:-8} +RANDOM_RANGE_RATIO=${15:-0.8} +NODE_LIST=${16} # Router co-located with first prefill: xP + yD nodes total NUM_NODES=$((PREFILL_NODES + DECODE_NODES)) @@ -85,6 +101,12 @@ export yD=$DECODE_NODES export NUM_NODES=$NUM_NODES export GPUS_PER_NODE=$GPUS_PER_NODE export MODEL_NAME=$MODEL_NAME +export PREFILL_ENABLE_EP=${PREFILL_ENABLE_EP} +export PREFILL_ENABLE_DP=${PREFILL_ENABLE_DP} +export DECODE_ENABLE_EP=${DECODE_ENABLE_EP} +export DECODE_ENABLE_DP=${DECODE_ENABLE_DP} +export PREFILL_TP=${PREFILL_TP} +export DECODE_TP=${DECODE_TP} export BENCH_INPUT_LEN=${ISL} export BENCH_OUTPUT_LEN=${OSL} export BENCH_NUM_PROMPTS_MULTIPLIER=${BENCH_NUM_PROMPTS_MULTIPLIER:-10}