-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdeploy_qwen_endpoint.py
More file actions
89 lines (76 loc) · 2.91 KB
/
deploy_qwen_endpoint.py
File metadata and controls
89 lines (76 loc) · 2.91 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import os
import runpod
from dotenv import load_dotenv
import json
def deploy_qwen_1_7b_endpoint():
    """Create a RunPod serverless template and endpoint for Qwen/Qwen3-1.7B.

    Steps, in order:

    1. Call ``load_dotenv()`` and read ``RUNPOD_API_KEY`` from the
       environment; abort with a message if it is missing.
    2. Create a serverless template whose start command launches the vLLM
       OpenAI-compatible API server with LoRA support enabled.
    3. Create an endpoint bound to the id of that template.

    Progress and failures are reported with ``print``. Exceptions derived
    from ``Exception`` are caught and printed rather than propagated, and
    the function returns ``None`` in every case.
    """
    # --- Configuration ---
    ENDPOINT_NAME = "qwen-1.7b-vllm-lora"
    # NOTE(review): confirm that the vLLM version shipped in this image
    # supports the Qwen3 architecture.
    DOCKER_IMAGE = "runpod/vllm:v0.5.1"
    # NOTE(review): RunPod documents ``gpu_ids`` as GPU pool ids (for example
    # "AMPERE_48"); confirm that a GPU display name is accepted here.
    GPU_TYPE = "NVIDIA RTX A6000"
    # Environment variables set inside the worker container.
    ENV_VARS = {
        "TORCH_COMPILE_CACHE_DIR": "/root/.cache/torch_compile",
        "VLLM_ALLOW_RUNTIME_LORA_UPDATING": "True",
    }
    # Command-line arguments appended to the vLLM API server start command.
    VLLM_ARGS = [
        "--model", "Qwen/Qwen3-1.7B",
        "--dtype", "bfloat16",
        "--max-model-len", "32768",
        "--tensor-parallel-size", "1",
        "--enable-prefix-caching",
        "--disable-log-requests",
        "--gpu-memory-utilization", "0.9",
        "--enable-lora",
        "--max-lora-rank", "64",
        # Port the API server listens on inside the container.
        "--port", "8080",
    ]

    # --- Load API Key ---
    load_dotenv()
    api_key = os.getenv("RUNPOD_API_KEY")
    if not api_key:
        print("❌ RUNPOD_API_KEY environment variable not set.")
        return
    runpod.api_key = api_key
    print("✅ RUNPOD_API_KEY loaded.")

    try:
        # --- 1. Create the Template ---
        print(f"\n--- Creating Template: {ENDPOINT_NAME}-template ---")
        new_template = runpod.create_template(
            name=f"{ENDPOINT_NAME}-template",
            image_name=DOCKER_IMAGE,
            env=ENV_VARS,
            container_disk_in_gb=15,
            is_serverless=True,
            # The start command is passed as one space-joined string.
            docker_start_cmd=" ".join(
                ["python", "-m", "vllm.entrypoints.openai.api_server"] + VLLM_ARGS
            ),
        )
        print("✅ Template created successfully!")
        print(json.dumps(new_template, indent=2))

        # Without a template id the endpoint request cannot succeed, so stop
        # here instead of sending ``template_id=None`` to the API.
        template_id = new_template.get("id")
        if not template_id:
            print("\n❌ Template response did not include an 'id'; "
                  "skipping endpoint creation.")
            return

        # --- 2. Create the Endpoint ---
        print(f"\n--- Creating Endpoint: {ENDPOINT_NAME} ---")
        new_endpoint = runpod.create_endpoint(
            name=ENDPOINT_NAME,
            template_id=template_id,
            gpu_ids=GPU_TYPE,
            workers_min=0,
            workers_max=3,
            # NOTE(review): the earlier comment here said "10 minutes", but
            # RunPod documents idle_timeout in seconds. Confirm the unit and
            # use 600 if ten minutes was intended.
            idle_timeout=10,
        )
        print("✅ Endpoint created successfully!")
        print("Your new endpoint is being provisioned. It may take a few minutes to become active.")
        print(json.dumps(new_endpoint, indent=2))
    except runpod.error.QueryError as err:
        # API-level failure: also show the query that was rejected.
        print(f"\n❌ An API error occurred: {err}")
        print("Query details:", err.query)
    except Exception as e:
        # Top-level boundary of a CLI script: report the error instead of a traceback.
        print(f"\n❌ An unexpected error occurred: {e}")
# Script entry point: deploy only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    deploy_qwen_1_7b_endpoint()