tensorfuse · samagra14 · Jan 12, 2025
diff --git a/nginx.conf b/nginx.conf
@@ -26,10 +26,19 @@ http {
         client_max_body_size 200M;
 
         location /readiness {
-            return 200 'true';
+            proxy_intercept_errors on;
+            proxy_pass http://127.0.0.1:8000/health;
+            # If the upstream responds with any of the status codes listed below, return 503 instead:
+            error_page 400 401 402 403 404 500 501 502 503 504 =503 @unhealthy;
+        }
+
+        # "Catch" the error code and return 503 (NOTE(review): add_header needs "always" to apply to a 503)
+        location @unhealthy {
+            return 503 "not ready";
             add_header Content-Type text/plain;
         }
 
+
         location / {
             proxy_pass http://127.0.0.1:8000;
             proxy_set_header Host $host;

diff --git a/throughput_test.py b/throughput_test.py
@@ -2,60 +2,72 @@
 import time
 import asyncio
 import aiohttp
-import numpy as np
-from typing import Dict, List
+from typing import Dict
 
-async def send_request(session: aiohttp.ClientSession, prompt: str, max_tokens: int = 100) -> Dict:
-    url = "http://a3bafe67e8f2a422ba99b5737992f756-614062644.us-east-1.elb.amazonaws.com/svc/default/vllm-gpus-1-a10g/v1/completions"
+# Send one chat-completion request and return the response body parsed as JSON
+async def send_request(session: aiohttp.ClientSession, prompt: str) -> Dict:
+    url = "http://a5ca9a76fc0dc4f33acaa5196bba6ca5-1703175632.us-east-1.elb.amazonaws.com/svc/default/docker-test-gpus-4-l40s/v1/chat/completions"
     payload = {
-        "model": "google/gemma-2b",
-        "prompt": prompt,
-        "max_tokens": max_tokens,
-        "temperature": 0.7
+        "model": "gane5hvarma/joe-adapter",
+        "messages": [
+            {
+                "role": "user",
+                "content": prompt
+            }
+        ]
+    }
+
+    headers = {
+        "Content-Type": "application/json"
     }
-
-    async with session.post(url, json=payload) as response:
-        return await response.json()
 
+    async with session.post(url, json=payload, headers=headers) as response:
+        # Force parse as JSON regardless of Content-Type
+        text = await response.text()
+        try:
+            return json.loads(text)
+        except json.JSONDecodeError:
+            print(f"Error: Could not parse response as JSON. Content: {text}")
+            raise
+
+# Send num_concurrent_requests requests at once; returns (tokens/sec, elapsed seconds, number of responses)
 async def measure_throughput(
     num_concurrent_requests: int = 100,
-    prompt: str = "Tell me a story about Tensorfuse",
-    max_tokens: int = 1000
+    prompt: str = "hello"
 ) -> float:
     async with aiohttp.ClientSession() as session:
         start_time = time.time()
-        
+
         # Create tasks for all requests
-        tasks = [
-            send_request(session, prompt, max_tokens)
-            for _ in range(num_concurrent_requests)
-        ]
-
+        tasks = [send_request(session, prompt) for _ in range(num_concurrent_requests)]
+
         # Wait for all requests to complete
         results = await asyncio.gather(*tasks)
-        
+
         end_time = time.time()
         total_time = end_time - start_time
-        
+
         # Calculate total tokens processed
         total_tokens = sum(
-            result["usage"]["completion_tokens"]
-            for result in results
+            result["usage"]["total_tokens"] for result in results if "usage" in result
         )
-        
-        throughput = total_tokens / total_time
+
+        throughput = total_tokens / total_time if total_time > 0 else 0
         return throughput, total_time, len(results)
 
+# Main function to test throughput for different concurrent loads
 async def main():
-    concurrent_requests = [10, 50, 100, 200,500,1000]
+    concurrent_requests = [10, 50, 100, 200, 500, 1000]
     print("Measuring vLLM throughput with different concurrent request loads...")
     print("\nRequests | Throughput (tokens/sec) | Total Time (s)")
     print("-" * 50)
-    
+
     for num_requests in concurrent_requests:
-        throughput, total_time, completed = await measure_throughput(num_requests)
-        print(f"{num_requests:8d} | {throughput:19.2f} | {total_time:13.2f}")
+        try:
+            throughput, total_time, completed = await measure_throughput(num_requests)
+            print(f"{num_requests:8d} | {throughput:19.2f} | {total_time:13.2f}")
+        except Exception as e:
+            print(f"Error during test with {num_requests} requests: {e}")
 
 if __name__ == "__main__":
-    asyncio.run(main())
-
+    asyncio.run(main())