Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion nginx.conf
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,19 @@ http {
client_max_body_size 200M;

location /readiness {
    # Readiness is answered by the backend's own health endpoint rather than
    # by a fixed 200 from nginx.
    proxy_pass http://127.0.0.1:8000/health;

    # Hand upstream responses with status >= 300 to error_page below instead
    # of relaying them to the client as-is.
    proxy_intercept_errors on;

    # Each status listed here is rewritten to 503 and served by @unhealthy.
    # Statuses that are not listed (for example 405 or 429) are not remapped.
    error_page 400 401 402 403 404 500 501 502 503 504 =503 @unhealthy;
}

# "Catch" the error code and return 503
location @unhealthy {
    # The body of a `return <code> <text>` response takes its Content-Type
    # from default_type. add_header without `always` is skipped for a 503,
    # so it cannot be used to set the type here.
    default_type text/plain;
    return 503 "not ready";
}


location / {
proxy_pass http://127.0.0.1:8000;
proxy_set_header Host $host;
Expand Down
74 changes: 43 additions & 31 deletions throughput_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,60 +2,72 @@
import time
import asyncio
import aiohttp
import numpy as np
from typing import Dict, List
from typing import Dict

# Defaults for the deployment under test; pass `url=` / `model=` to
# send_request to target a different endpoint or model.
DEFAULT_CHAT_COMPLETIONS_URL = (
    "http://a5ca9a76fc0dc4f33acaa5196bba6ca5-1703175632.us-east-1.elb.amazonaws.com"
    "/svc/default/docker-test-gpus-4-l40s/v1/chat/completions"
)
DEFAULT_MODEL = "gane5hvarma/joe-adapter"


async def send_request(
    session: aiohttp.ClientSession,
    prompt: str,
    *,
    url: str = DEFAULT_CHAT_COMPLETIONS_URL,
    model: str = DEFAULT_MODEL,
) -> Dict:
    """POST one single-turn chat-completion request and return the JSON body.

    Args:
        session: Open aiohttp session used to issue the request.
        prompt: Text sent as the content of the only ``user`` message.
        url: Chat-completions endpoint to call.
        model: Value of the ``model`` field in the request payload.

    Returns:
        The response body decoded from JSON.

    Raises:
        json.JSONDecodeError: If the body is not valid JSON. The raw body is
            printed before the exception propagates.
    """
    payload = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
    }
    headers = {"Content-Type": "application/json"}

    async with session.post(url, json=payload, headers=headers) as response:
        # Read the body as text and decode it ourselves so that parsing does
        # not depend on the Content-Type header of the response.
        text = await response.text()

    try:
        return json.loads(text)
    except json.JSONDecodeError:
        print(f"Error: Could not parse response as JSON. Content: {text}")
        raise

# Function to measure throughput
async def measure_throughput(
    num_concurrent_requests: int = 100,
    prompt: str = "hello",
) -> "tuple[float, float, int]":  # quoted: not evaluated on older Pythons
    """Send a batch of concurrent requests and report token throughput.

    Args:
        num_concurrent_requests: Number of ``send_request`` calls issued
            concurrently on one shared session.
        prompt: Prompt passed to every request.

    Returns:
        ``(throughput, total_time, num_results)``: tokens per second,
        elapsed seconds for the whole batch, and the number of responses
        gathered.

    Raises:
        Exception: Whatever a request raises; ``asyncio.gather`` propagates
            the first failure, so no partial result is returned.
    """
    async with aiohttp.ClientSession() as session:
        # perf_counter is monotonic, so the interval cannot be distorted by
        # system clock adjustments the way time.time() can.
        started = time.perf_counter()
        results = await asyncio.gather(
            *(send_request(session, prompt) for _ in range(num_concurrent_requests))
        )
        total_time = time.perf_counter() - started

    # Responses that carry no "usage" entry contribute no tokens.
    total_tokens = sum(
        result["usage"]["total_tokens"] for result in results if "usage" in result
    )
    throughput = total_tokens / total_time if total_time > 0 else 0.0
    return throughput, total_time, len(results)

# Main function to test throughput for different concurrent loads
async def main():
    """Measure throughput at increasing concurrency and print a result table.

    A failure at one load level is printed and the sweep continues with the
    next level.
    """
    concurrent_requests = [10, 50, 100, 200, 500, 1000]
    print("Measuring vLLM throughput with different concurrent request loads...")
    print("\nRequests | Throughput (tokens/sec) | Total Time (s)")
    print("-" * 50)

    for num_requests in concurrent_requests:
        try:
            throughput, total_time, _completed = await measure_throughput(num_requests)
        except Exception as e:
            # Script-level boundary: report the failed level and keep going.
            print(f"Error during test with {num_requests} requests: {e}")
        else:
            print(f"{num_requests:8d} | {throughput:19.2f} | {total_time:13.2f}")

# Run the sweep once, and only when executed as a script (not on import).
if __name__ == "__main__":
    asyncio.run(main())