diff --git a/benchmarks/single_node/kimik2.5_fp4_b200.sh b/benchmarks/single_node/kimik2.5_fp4_b200.sh
index 422a74950..d08e23bb2 100644
--- a/benchmarks/single_node/kimik2.5_fp4_b200.sh
+++ b/benchmarks/single_node/kimik2.5_fp4_b200.sh
@@ -38,6 +38,7 @@ vllm serve $MODEL --host 0.0.0.0 --port $PORT \
 --reasoning-parser kimi_k2 \
 --tool-call-parser kimi_k2 \
 --compilation_config.pass_config.fuse_allreduce_rms true \
+--no-enable-prefix-caching \
 --trust-remote-code > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
diff --git a/benchmarks/single_node/kimik2.5_int4_h200.sh b/benchmarks/single_node/kimik2.5_int4_h200.sh
index 37281f61e..473a1bd73 100755
--- a/benchmarks/single_node/kimik2.5_int4_h200.sh
+++ b/benchmarks/single_node/kimik2.5_int4_h200.sh
@@ -40,6 +40,7 @@ vllm serve $MODEL --host 0.0.0.0 --port $PORT \
 --tool-call-parser kimi_k2 \
 --compilation_config.pass_config.fuse_allreduce_rms true \
 --trust-remote-code \
+--no-enable-prefix-caching \
 --disable-log-requests > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
diff --git a/benchmarks/single_node/kimik2.5_int4_mi325x.sh b/benchmarks/single_node/kimik2.5_int4_mi325x.sh
index 8ca56dc29..1a42035a0 100755
--- a/benchmarks/single_node/kimik2.5_int4_mi325x.sh
+++ b/benchmarks/single_node/kimik2.5_int4_mi325x.sh
@@ -40,6 +40,7 @@ vllm serve $MODEL --port $PORT \
 --max-model-len $MAX_MODEL_LEN \
 --block-size=64 \
 --trust-remote-code \
+--no-enable-prefix-caching \
 --max-num-seqs 256 \
 --mm-encoder-tp-mode data > $SERVER_LOG 2>&1 &
 
diff --git a/benchmarks/single_node/kimik2.5_int4_mi355x.sh b/benchmarks/single_node/kimik2.5_int4_mi355x.sh
index 935c6cd2e..420f8044a 100755
--- a/benchmarks/single_node/kimik2.5_int4_mi355x.sh
+++ b/benchmarks/single_node/kimik2.5_int4_mi355x.sh
@@ -37,6 +37,7 @@ vllm serve $MODEL --port $PORT \
 --max-model-len $MAX_MODEL_LEN \
 --block-size=64 \
 --trust-remote-code \
+--no-enable-prefix-caching \
 --max-num-seqs 256 \
 --mm-encoder-tp-mode data > $SERVER_LOG 2>&1 &
 
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 6a8a6e666..3e7db64e5 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1108,3 +1108,13 @@
   description:
     - "Update vLLM image from v0.15.1 to v0.18.0 for gptoss H100 and H200 configs"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/960
+
+- config-keys:
+    - kimik2.5-int4-mi325x-vllm
+    - kimik2.5-int4-mi355x-vllm
+    - kimik2.5-int4-h200-vllm
+    - kimik2.5-fp4-mi355x-vllm
+    - kimik2.5-fp4-b200-vllm
+  description:
+    - "Disable prefix caching (--no-enable-prefix-caching) for all Kimi K2.5 benchmarks using random datasets"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/926
diff --git a/runners/launch_h200-nb.sh b/runners/launch_h200-nb.sh
index 170a1bdc3..9d157a858 100644
--- a/runners/launch_h200-nb.sh
+++ b/runners/launch_h200-nb.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/bash
 
-export HF_HUB_CACHE_MOUNT="/home/gharunner/gharunners/hf-hub-cache/"
+export HF_HUB_CACHE_MOUNT="/mnt/data/gharunners/hf-hub-cache/"
 export PORT=8888
 
 MODEL_CODE="${EXP_NAME%%_*}"