NVIDIA · karya0 · Jan 27, 2026 · Feb 19, 2026 · Feb 19, 2026 · Feb 21, 2026
@@ -0,0 +1,106 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name = "sglang-Qwen3-0.6B"
+description = "sglang backend with Qwen3-0.6B model"
+test_template_name = "AIDynamo"
+workloads = ["genai_perf.sh"]
+
+[cmd_args]  # container image plus the dynamo, lmcache and genai_perf sub-tables
+docker_image_url = "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.9.0"
+
+  [cmd_args.dynamo]
+  backend = "sglang"
+  model = "Qwen/Qwen3-0.6B"
+  endpoint = "v1/chat/completions"
+
+    [cmd_args.dynamo.prefill_worker]
+    num-nodes = 1
+    cmd = 'python3 -m dynamo.sglang'  # identical to decode_worker.cmd; the two workers differ only in their args tables
+    extra-args = "--trust-remote-code --skip-tokenizer-init --enable-metrics"
+    worker-initialized-regex = 'register._register_llm_with_runtime_config:.Successfully.registered.LLM.with.runtime.config'  # every '.' is unescaped, i.e. matches any single character
+    multiple-workers-per-node = "false"  # a string, not a TOML boolean
+
+      [cmd_args.dynamo.prefill_worker.args]
+      page-size = 16
+      tensor-parallel-size = 1
+      pipeline-parallel-size = 1
+      disaggregation-mode = "prefill"
+      disaggregation-bootstrap-port = 12345  # same value as in decode_worker.args
+      host = "0.0.0.0"
+      port = 40000  # decode_worker.args sets no port
+      disaggregation-transfer-backend = "nixl"
+
+    [cmd_args.dynamo.decode_worker]
+    num-nodes = 1
+    cmd = 'python3 -m dynamo.sglang'  # identical to prefill_worker.cmd; the two workers differ only in their args tables
+    extra-args = "--trust-remote-code --skip-tokenizer-init --enable-metrics"
+    worker-initialized-regex = 'register._register_llm_with_runtime_config:.Successfully.registered.LLM.with.runtime.config'  # every '.' is unescaped, i.e. matches any single character
+    multiple-workers-per-node = "false"  # a string, not a TOML boolean
+
+      [cmd_args.dynamo.decode_worker.args]
+      page-size = 16
+      tensor-parallel-size = 1
+      pipeline-parallel-size = 1
+      disaggregation-mode = "decode"
+      disaggregation-bootstrap-port = 12345  # same value as in prefill_worker.args
+      host = "0.0.0.0"
+      disaggregation-transfer-backend = "nixl"
+
+  [cmd_args.lmcache]
+  controller_cmd = "lmcache_controller --host localhost --port 9000 --monitor-port 9001"
+
+    [cmd_args.lmcache.args]
+    chunk_size = 256
+    local_cpu = false
+    nixl_buffer_size = 10737418240  # = 10 * 1024^3 (10 GiB if the unit is bytes — TODO confirm)
+    nixl_buffer_device = "cuda"
+    extra_config_enable_nixl_storage = true
+    extra_config_nixl_backend = "GDS_MT"
+    extra_config_nixl_file_pool_size = 64
+
+    enable_controller = true  # still part of [cmd_args.lmcache.args]; a blank line does not close a TOML table
+    lmcache_instance_id = "lmcache_default_instance"
+    controller_url = "localhost:9001"  # same port as --monitor-port in controller_cmd
+    lmcache_worker_port = 8788
+    distributed_url = "localhost:8789"
+
+  [cmd_args.genai_perf]
+  cmd = "genai-perf profile"
+  extra-args = "--streaming --verbose -- -v --async"  # the bare "--" splits the flags into two groups; presumably the second group is forwarded to the underlying load generator — confirm
+
+    [cmd_args.genai_perf.args]
+    endpoint-type = "chat"
+    extra-inputs = 'min_tokens:10'
+    output-tokens-mean = 500
+    output-tokens-stddev = 0  # zero deviation around output-tokens-mean
+    random-seed = 123
+    request-count = 50
+    synthetic-input-tokens-mean = 300
+    synthetic-input-tokens-stddev = 0  # zero deviation around synthetic-input-tokens-mean
+    warmup-request-count = 5
+    concurrency = 2
+
+[extra_env_vars]
+UCX_LOG_LEVEL = "warn"
+HF_HUB_OFFLINE = "1"  # Hugging Face offline mode: no Hub requests, so the model must already be in the local cache
+TRANSFORMERS_OFFLINE = "1"
+HF_DATASETS_OFFLINE = "1"
+DYNAMO_NODELIST = "$(scontrol show hostname $SLURM_JOB_NODELIST | tr -s '\\n' ',')"  # stored verbatim (TOML does no command substitution); "\\n" decodes to backslash-n, which tr reads as newline; the final newline also becomes ',', so the list ends with a comma
+UCX_TLS = "all"
+#DYN_LOGGING_JSONL="true"
+#OTEL_EXPORT_ENABLED="1"
+#OTEL_EXPORTER_OTLP_TRACES_ENDPOINT="http://localhost:4317"
@@ -1,5 +1,5 @@
 # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
-# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -17,36 +17,80 @@
 name = "vLLM-Qwen3-0.6B"
 description = "vLLM backend with Qwen3-0.6B model"
 test_template_name = "AIDynamo"
+workloads = ["genai_perf.sh"]
 
 [cmd_args]
-docker_image_url = "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.7.0"
+docker_image_url = "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.8.1"
 
   [cmd_args.dynamo]
   backend = "vllm"
   model = "Qwen/Qwen3-0.6B"
-  workspace-path = "/workspace/examples/backends/vllm"
-  prefill-cmd = 'python3 -m dynamo.vllm --is-prefill-worker'
-  decode-cmd = 'python3 -m dynamo.vllm'
+  endpoint = "v1/chat/completions"
+
+    [cmd_args.dynamo.prefill_worker]
+    num-nodes = 1
+    cmd = 'python3 -m dynamo.vllm --is-prefill-worker'  # decode_worker.cmd is the same command without --is-prefill-worker
+    worker-initialized-regex = 'VllmWorker.*has.been.initialized'  # '.' is unescaped (any single character); '.*' matches any run of characters
+    multiple-workers-per-node = "false"  # a string, not a TOML boolean
+    extra-args = "--no-enable-expert-parallel"
+
+      [cmd_args.dynamo.prefill_worker.args]
+      gpu-memory-utilization = 0.8
+      tensor-parallel-size = 8
+      pipeline-parallel-size = 1
+      data-parallel-size = 1
 
     [cmd_args.dynamo.decode_worker]
-    pipeline-parallel-size = 1
+    num-nodes = 1
+    cmd = 'python3 -m dynamo.vllm'  # prefill_worker.cmd is the same command plus --is-prefill-worker
+    worker-initialized-regex = 'VllmWorker.*has.been.initialized'  # '.' is unescaped (any single character); '.*' matches any run of characters
+    multiple-workers-per-node = "false"  # a string, not a TOML boolean
+    extra-args = "--no-enable-expert-parallel"
+
+      [cmd_args.dynamo.decode_worker.args]
+      gpu-memory-utilization = 0.8
+      tensor-parallel-size = 8
+      pipeline-parallel-size = 1
+      data-parallel-size = 1
+
+  [cmd_args.lmcache]
+  controller_cmd = "lmcache_controller --host localhost --port 9000 --monitor-port 9001"
+
+    [cmd_args.lmcache.args]
+    chunk_size = 256
+    local_cpu = false
+    nixl_buffer_size = 10737418240  # = 10 * 1024^3 (10 GiB if the unit is bytes — TODO confirm)
+    nixl_buffer_device = "cuda"
+    extra_config_enable_nixl_storage = true
+    extra_config_nixl_backend = "GDS_MT"
+    extra_config_nixl_file_pool_size = 64
+
+    enable_controller = true  # still part of [cmd_args.lmcache.args]; a blank line does not close a TOML table
+    lmcache_instance_id = "lmcache_default_instance"
+    controller_url = "localhost:9001"  # same port as --monitor-port in controller_cmd
+    lmcache_worker_port = 8788
+    distributed_url = "localhost:8789"
 
   [cmd_args.genai_perf]
-  model = "Qwen/Qwen3-0.6B"
-  endpoint = "v1/chat/completions"
-  endpoint-type = "chat"
-  extra-inputs = 'min_tokens:10'
-  output-tokens-mean = 500
-  output-tokens-stddev = 0
-  random-seed = 123
-  request-count = 50
-  synthetic-input-tokens-mean = 300
-  synthetic-input-tokens-stddev = 0
-  warmup-request-count = 5
-  concurrency = 2
-  extra-args = "--streaming -- -v --async"
+  cmd = "genai-perf profile"
+  extra-args = "--streaming --verbose -- -v --async"  # the bare "--" splits the flags into two groups; presumably the second group is forwarded to the underlying load generator — confirm
+
+    [cmd_args.genai_perf.args]
+    endpoint-type = "chat"
+    extra-inputs = 'min_tokens:10'
+    output-tokens-mean = 500
+    output-tokens-stddev = 0  # zero deviation around output-tokens-mean
+    random-seed = 123
+    request-count = 50
+    synthetic-input-tokens-mean = 300
+    synthetic-input-tokens-stddev = 0  # zero deviation around synthetic-input-tokens-mean
+    warmup-request-count = 5
+    concurrency = 2
 
 [extra_env_vars]
 UCX_LOG_LEVEL = "warn"
-UCX_TLS = "cuda_copy,rc_x"
+HF_HUB_OFFLINE = "1"  # Hugging Face offline mode: no Hub requests, so the model must already be in the local cache
+TRANSFORMERS_OFFLINE = "1"
+HF_DATASETS_OFFLINE = "1"
 DYNAMO_NODELIST = "$(scontrol show hostname $SLURM_JOB_NODELIST | tr -s '\\n' ',')"
+UCX_TLS = "all"  # replaces the previous explicit list "cuda_copy,rc_x"
@@ -0,0 +1,44 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name = "dynamo_sglang"
+
+[[Tests]]
+id = "sglang-Qwen3-0.6B"
+test_name = "sglang-Qwen3-0.6B"
+time_limit = "00:20:00"  # 20 minutes if read as HH:MM:SS
+
+extra_container_mounts = ["/run/udev:/run/udev", "/tmp:/tmp"]
+
+  [Tests.cmd_args]
+  num_nodes = 2               # 1 prefill node + 1 decode node
+  workloads = "genai_perf.sh"  # NOTE(review): a string here, while the test TOML's top-level workloads is an array — confirm both forms are accepted
+
+    [Tests.cmd_args.dynamo]
+    model = "Qwen/Qwen3-0.6B"
+    node-setup-cmd = "hostname"
+
+      [Tests.cmd_args.dynamo.prefill_worker]
+      num-nodes = 1
+
+        [Tests.cmd_args.dynamo.prefill_worker.args]
+        tensor-parallel-size = 1
+
+      [Tests.cmd_args.dynamo.decode_worker]
+      num-nodes = 1
+
+        [Tests.cmd_args.dynamo.decode_worker.args]
+        tensor-parallel-size = 1
@@ -1,5 +1,5 @@
 # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
-# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -24,7 +24,10 @@ test_name = "vLLM-Qwen3-0.6B"
     [Tests.cmd_args.dynamo]
       [Tests.cmd_args.dynamo.prefill_worker]
       num-nodes = 1
-      tensor-parallel-size = 8
+        [Tests.cmd_args.dynamo.prefill_worker.args]
+        tensor-parallel-size = 8
+
       [Tests.cmd_args.dynamo.decode_worker]
       num-nodes = 1
-      tensor-parallel-size = 8
+        [Tests.cmd_args.dynamo.decode_worker.args]
+        tensor-parallel-size = 8
@@ -0,0 +1,82 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name = "dynamo_vllm_kvbm"
+
+[[Tests]]
+id = "vLLM-Qwen3-0.6B"
+test_name = "vLLM-Qwen3-0.6B"
+time_limit = "20:00:00"  # 20 hours if read as HH:MM:SS — NOTE(review): confirm this is not meant to be "00:20:00"
+
+extra_container_mounts = ["/run/udev:/run/udev", "/tmp:/tmp"]
+
+  [Tests.cmd_args]
+  storage_cache_dir = "/mnt/vast"
+  num_nodes = 2                   # 1 prefill node + 1 decode node
+  workloads = "genai_perf.sh"  # NOTE(review): a string here, while the test TOML's top-level workloads is an array — confirm both forms are accepted
+
+    [Tests.cmd_args.dynamo]
+    model = "Qwen/Qwen3-0.6B"
+    node-setup-cmd = "hostname"
+
+      [Tests.cmd_args.dynamo.prefill_worker]
+      num-nodes = 1
+
+        [Tests.cmd_args.dynamo.prefill_worker.args]
+        tensor-parallel-size = 2
+        connector = "kvbm nixl"  # two space-separated names; decode_worker.args lists only "nixl"
+
+      [Tests.cmd_args.dynamo.decode_worker]
+      num-nodes = 1
+
+        [Tests.cmd_args.dynamo.decode_worker.args]
+        tensor-parallel-size = 2
+        connector = "nixl"
+
+  [Tests.extra_env_vars]
+  # Both variable names are set for cross-version CUFile compatibility.
+  CUFILE_LOG_LEVEL = "INFO"
+  CUFILE_LOGGING_LEVEL = "INFO"
+  PYTHONHASHSEED = "0"  # 0 disables Python hash randomization
+
+  # Dynamo flags
+  DYN_LOG = "info"
+  DYN_SYSTEM_PORT = "8081" # Enable system metrics
+
+  # KVBM flags
+  DYN_KVBM_METRICS = "1"
+  DYN_KVBM_METRICS_PORT = "6880" # Default port
+
+  # Large timeout (1200 s = 20 min) to allow for disk allocation.
+  DYN_KVBM_LEADER_WORKER_INIT_TIMEOUT_SECS = "1200"
+  DYN_KVBM_DISABLE_DISK_OFFLOAD_FILTER = "1"        # Force KV cache write on first request
+
+  # Enable this only on VAST storage.
+  #DYN_KVBM_DISK_ZEROFILL_FALLBACK="true"
+
+  # Keep the CPU cache relatively small so that disk onboarding is reached quickly.
+  DYN_KVBM_CPU_CACHE_GB = "50"
+  # Optional: a large disk cache, so that NIXL onboarding is actually exercised.
+  #DYN_KVBM_DISK_CACHE_GB="100"
+
+  DYN_KVBM_NIXL_BACKEND_UCX = "True"  # the text "True", not a TOML boolean
+  DYN_KVBM_NIXL_BACKEND_GDS = "True"  # the text "True", not a TOML boolean
+
+  # vLLM flags
+  VLLM_SERVER_DEV_MODE = "1"
+
+  DYN_KVBM_LEADER_ZMQ_PUB_PORT = "57001"
+  DYN_KVBM_LEADER_ZMQ_ACK_PORT = "57002"