diff --git a/nginx.conf b/nginx.conf
index c36d9a5..1159152 100644
--- a/nginx.conf
+++ b/nginx.conf
@@ -26,10 +26,19 @@ http {
     client_max_body_size 200M;
 
     location /readiness {
-        return 200 'true';
+        proxy_intercept_errors on;
+        proxy_pass http://127.0.0.1:8000/health;
+        # If ANY error or non-2xx/3xx status occurs, return 503:
+        error_page 400 401 402 403 404 500 501 502 503 504 =503 @unhealthy;
+    }
+
+    # "Catch" the error code and return 503
+    location @unhealthy {
+        return 503 "not ready";
         add_header Content-Type text/plain;
     }
+
     location / {
         proxy_pass http://127.0.0.1:8000;
         proxy_set_header Host $host;
diff --git a/throughput_test.py b/throughput_test.py
index baf0c9e..e794cb7 100644
--- a/throughput_test.py
+++ b/throughput_test.py
@@ -2,60 +2,73 @@
 import time
 import asyncio
 import aiohttp
-import numpy as np
-from typing import Dict, List
+import json
+from typing import Dict
 
-async def send_request(session: aiohttp.ClientSession, prompt: str, max_tokens: int = 100) -> Dict:
-    url = "http://a3bafe67e8f2a422ba99b5737992f756-614062644.us-east-1.elb.amazonaws.com/svc/default/vllm-gpus-1-a10g/v1/completions"
+# Function to send a request
+async def send_request(session: aiohttp.ClientSession, prompt: str) -> Dict:
+    url = "http://a5ca9a76fc0dc4f33acaa5196bba6ca5-1703175632.us-east-1.elb.amazonaws.com/svc/default/docker-test-gpus-4-l40s/v1/chat/completions"
     payload = {
-        "model": "google/gemma-2b",
-        "prompt": prompt,
-        "max_tokens": max_tokens,
-        "temperature": 0.7
+        "model": "gane5hvarma/joe-adapter",
+        "messages": [
+            {
+                "role": "user",
+                "content": prompt
+            }
+        ]
+    }
+
+    headers = {
+        "Content-Type": "application/json"
     }
-    
-    async with session.post(url, json=payload) as response:
-        return await response.json()
+    async with session.post(url, json=payload, headers=headers) as response:
+        # Force parse as JSON regardless of Content-Type
+        text = await response.text()
+        try:
+            return json.loads(text)
+        except json.JSONDecodeError:
+            print(f"Error: Could not parse response as JSON. Content: {text}")
+            raise
+
+# Function to measure throughput
 async def measure_throughput(
     num_concurrent_requests: int = 100,
-    prompt: str = "Tell me a story about Tensorfuse",
-    max_tokens: int = 1000
-) -> float:
+    prompt: str = "hello"
+) -> tuple:
     async with aiohttp.ClientSession() as session:
         start_time = time.time()
-        
+
         # Create tasks for all requests
-        tasks = [
-            send_request(session, prompt, max_tokens)
-            for _ in range(num_concurrent_requests)
-        ]
-        
+        tasks = [send_request(session, prompt) for _ in range(num_concurrent_requests)]
+
         # Wait for all requests to complete
         results = await asyncio.gather(*tasks)
-        
+
         end_time = time.time()
         total_time = end_time - start_time
-        
+
         # Calculate total tokens processed
         total_tokens = sum(
-            result["usage"]["completion_tokens"]
-            for result in results
+            result["usage"]["total_tokens"] for result in results if "usage" in result
         )
-        
-        throughput = total_tokens / total_time
+
+        throughput = total_tokens / total_time if total_time > 0 else 0
         return throughput, total_time, len(results)
 
+# Main function to test throughput for different concurrent loads
 async def main():
-    concurrent_requests = [10, 50, 100, 200,500,1000]
+    concurrent_requests = [10, 50, 100, 200, 500, 1000]
     print("Measuring vLLM throughput with different concurrent request loads...")
     print("\nRequests | Throughput (tokens/sec) | Total Time (s)")
     print("-" * 50)
-    
+
     for num_requests in concurrent_requests:
-        throughput, total_time, completed = await measure_throughput(num_requests)
-        print(f"{num_requests:8d} | {throughput:19.2f} | {total_time:13.2f}")
+        try:
+            throughput, total_time, completed = await measure_throughput(num_requests)
+            print(f"{num_requests:8d} | {throughput:19.2f} | {total_time:13.2f}")
+        except Exception as e:
+            print(f"Error during test with {num_requests} requests: {e}")
 
 if __name__ == "__main__":
-    asyncio.run(main())
-    
+    asyncio.run(main())
\ No newline at end of file