From 5a163dd294c645c3482e55723d24c8d5f80a2ad5 Mon Sep 17 00:00:00 2001
From: Harsha Vardhan
Date: Fri, 21 Nov 2025 20:04:24 -0600
Subject: [PATCH 1/4] Add vLLM integration with setup script and comprehensive
 documentation

---
 vllm/README.md | 642 +++++++++++++++++++++++++++++++++++++++++++++++++
 vllm/setup.sh  | 265 ++++++++++++++++++++
 2 files changed, 907 insertions(+)
 create mode 100644 vllm/README.md
 create mode 100644 vllm/setup.sh

diff --git a/vllm/README.md b/vllm/README.md
new file mode 100644
index 0000000..c0115eb
--- /dev/null
+++ b/vllm/README.md
@@ -0,0 +1,642 @@
# vLLM

High-performance LLM inference with an OpenAI-compatible API.

## What it installs

- **vLLM** - Fast LLM inference engine with PagedAttention
- **Python virtual environment** - Isolated Python environment
- **OpenAI-compatible API** - Drop-in replacement for the OpenAI API
- **Systemd service** - Auto-restart and logging
- **Example scripts** - Python and curl examples

## Features

- **⚡ Fast** - Up to 24x the throughput of HuggingFace Transformers; typically 2-4x faster than Ollama under concurrent load
- **🎯 Production-ready** - Used by major companies (Cloudflare, NVIDIA, etc.)
- **🔌 OpenAI-compatible** - Works with OpenAI SDK/clients
- **🔥 Multi-GPU support** - Tensor parallelism across GPUs
- **📦 Any model** - Supports Llama, Mistral, Qwen, Phi, etc.
- **💾 Efficient memory** - PagedAttention for roughly 2x memory efficiency
- **🔄 Continuous batching** - High throughput under load

## Requirements

- **NVIDIA GPU** - Required (A10, L4, V100, A100, H100, etc.)
- **VRAM sized to the model** - ~16GB for 7B models at fp16; 6-8GB covers 3B models or AWQ-quantized 7B
- **CUDA** - Already provided by Brev

## ⚠️ Required Port

To access from outside Brev, open:
- **8000/tcp** (vLLM API endpoint)

## Usage

```bash
bash setup.sh
```

Takes ~3-5 minutes.

## What you get

- **API Endpoint:** `http://localhost:8000`
- **Configuration:** `~/vllm-server/config.env`
- **Examples:** `~/vllm-examples/`
- **Service:** Auto-starts on boot (after first manual start)

## Quick Start

### 1. Configure the model

Edit `~/vllm-server/config.env`:

```bash
nano ~/vllm-server/config.env
```

**For open models (Mistral, Qwen, Phi):**
```bash
MODEL_NAME="mistralai/Mistral-7B-Instruct-v0.3"
```

**For gated models (Llama):**
1. Get a HuggingFace token: https://huggingface.co/settings/tokens
2. Accept the model license on HuggingFace (e.g., https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct)
3. Add the token to the config:
```bash
HF_TOKEN="hf_your_token_here"
```

### 2. Start the service

```bash
sudo systemctl start vllm
```

**Monitor the first start (downloads model):**
```bash
sudo journalctl -u vllm -f
```

The first start takes 3-10 minutes to download the model. Look for:
```
INFO: Waiting for application startup.
INFO: Application startup complete.
```
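If you are scripting around the service, you can poll the health endpoint instead of watching the logs. A minimal sketch, assuming the default `PORT=8000` from `config.env`:

```bash
# Block until the vLLM server reports ready (first start can take minutes)
until curl -sf http://localhost:8000/health > /dev/null; do
  echo "Waiting for vLLM to finish loading..."
  sleep 10
done
echo "vLLM is ready."
```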
### 3. Test it works

```bash
# Check service is running
sudo systemctl status vllm

# Test API
curl http://localhost:8000/v1/models

# Run Python example
python3 ~/vllm-examples/test_api.py
```

## Model Selection Guide

### Small & Fast (6-8GB VRAM)

```bash
# Llama 3.2 3B - Great quality for size
MODEL_NAME="meta-llama/Llama-3.2-3B-Instruct"

# Phi-3 Mini - Microsoft's efficient model
MODEL_NAME="microsoft/Phi-3-mini-4k-instruct"

# Gemma 2B - Google's small model
MODEL_NAME="google/gemma-2b-it"
```

### Medium (16-24GB VRAM)

```bash
# Llama 3.1 8B - Excellent all-around
MODEL_NAME="meta-llama/Llama-3.1-8B-Instruct"

# Mistral 7B - Fast and capable
MODEL_NAME="mistralai/Mistral-7B-Instruct-v0.3"

# Qwen 2.5 7B - Strong at coding
MODEL_NAME="Qwen/Qwen2.5-7B-Instruct"

# Nous Hermes 2 - Creative writing
MODEL_NAME="NousResearch/Hermes-2-Pro-Llama-3-8B"
```

### Large (40GB+ VRAM, often multi-GPU)

```bash
# Llama 3.1 70B (~140GB at fp16 - needs multi-GPU or a quantized variant)
MODEL_NAME="meta-llama/Llama-3.1-70B-Instruct"

# Qwen 2.5 32B
MODEL_NAME="Qwen/Qwen2.5-32B-Instruct"

# DeepSeek Coder 33B
MODEL_NAME="deepseek-ai/deepseek-coder-33b-instruct"
```

### Quantized (Lower memory)

```bash
# AWQ 4-bit models (roughly a quarter of fp16 weight memory)
MODEL_NAME="TheBloke/Llama-2-13B-chat-AWQ"
MODEL_NAME="TheBloke/Mistral-7B-Instruct-v0.2-AWQ"
```

## Multi-GPU Configuration

If you have multiple GPUs:

```bash
# Edit config
nano ~/vllm-server/config.env

# Set tensor parallelism to your GPU count
TENSOR_PARALLEL_SIZE="2"   # for 2 GPUs (use "4" or "8" to match your hardware)

# Restart service
sudo systemctl restart vllm
```

**Example:** Serve Llama 70B on 2x A100 80GB (fp16 weights alone are ~140GB, so two 40GB cards are not enough):
```bash
MODEL_NAME="meta-llama/Llama-3.1-70B-Instruct"
TENSOR_PARALLEL_SIZE="2"
```

## API Usage

### Python (OpenAI SDK)

```python
from openai import OpenAI

# Point to vLLM server
client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="EMPTY"
)

# Chat completion
response = client.chat.completions.create(
    model="meta-llama/Llama-3.2-3B-Instruct",
    messages=[
        {"role": "system", "content": "You are a helpful coding assistant."},
        {"role": "user", "content": "Write a Python function to sort a list."}
    ],
    temperature=0.7,
    max_tokens=500
)

print(response.choices[0].message.content)
```

### Python (Streaming)

```python
stream = client.chat.completions.create(
    model="meta-llama/Llama-3.2-3B-Instruct",
    messages=[{"role": "user", "content": "Tell me a story."}],
    stream=True,
    max_tokens=500
)

for chunk in stream:
    if chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)
```

### cURL

```bash
curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "meta-llama/Llama-3.2-3B-Instruct",
    "messages": [
      {"role": "user", "content": "Hello!"}
    ],
    "temperature": 0.7,
    "max_tokens": 100
  }'
```

### JavaScript/TypeScript

```typescript
import OpenAI from 'openai';

const client = new OpenAI({
  baseURL: 'http://localhost:8000/v1',
  apiKey: 'EMPTY'
});

const response = await client.chat.completions.create({
  model: 'meta-llama/Llama-3.2-3B-Instruct',
  messages: [{ role: 'user', content: 'Hello!' }]
});

console.log(response.choices[0].message.content);
```
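### cURL (text completions)

vLLM also serves the legacy text-completions endpoint alongside chat. A quick sketch, assuming the same model as above:

```bash
# Plain completions: send a raw prompt instead of a message list
curl http://localhost:8000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "meta-llama/Llama-3.2-3B-Instruct",
    "prompt": "The capital of France is",
    "max_tokens": 50,
    "temperature": 0.7
  }'
```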
## Advanced Configuration

### Optimize for throughput

```bash
nano ~/vllm-server/config.env

# Increase batch size
MAX_NUM_SEQS="512"

# Use more GPU memory
GPU_MEMORY_UTILIZATION="0.95"

# Restart
sudo systemctl restart vllm
```

### Optimize for latency

```bash
# Smaller batch size (less queueing per request)
MAX_NUM_SEQS="64"

# Leave GPU memory headroom (vLLM caps all usage, including KV cache, at this fraction)
GPU_MEMORY_UTILIZATION="0.85"
```

### Longer context windows

```bash
# Extend max length (uses more memory)
MAX_MODEL_LEN="8192"  # or 16384, 32768
```

### Add API authentication

The systemd unit launches `~/vllm-server/start.sh`, so add the flag there rather than to the unit's ExecStart line:

```bash
nano ~/vllm-server/start.sh
```

Add to the `vllm.entrypoints.openai.api_server` invocation:
```bash
--api-key "your-secret-key"
```

Restart:
```bash
sudo systemctl restart vllm
```

Now use with:
```python
client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="your-secret-key"
)
```

## Manage Service

```bash
# Start service
sudo systemctl start vllm

# Stop service
sudo systemctl stop vllm

# Restart service
sudo systemctl restart vllm

# Check status
sudo systemctl status vllm

# View logs (live)
sudo journalctl -u vllm -f

# View recent logs
sudo journalctl -u vllm -n 100
```

## Performance Monitoring

### Check GPU usage

```bash
watch -n 1 nvidia-smi
```

### API health check

```bash
curl http://localhost:8000/health
curl http://localhost:8000/v1/models
```

### Request metrics

vLLM logs show:
- Requests per second
- Token throughput
- KV cache usage
- GPU memory usage

```bash
sudo journalctl -u vllm -f | grep "Avg prompt throughput"
```

## Troubleshooting

### Service won't start

**Check logs:**
```bash
sudo journalctl -u vllm -n 50 --no-pager
```

**Common issues:**

1. **Out of memory:**
   - Use a smaller model
   - Lower `GPU_MEMORY_UTILIZATION` to 0.8
   - Reduce `MAX_MODEL_LEN`

2. **HuggingFace token invalid:**
   - Verify token at https://huggingface.co/settings/tokens
   - Accept the model license on HuggingFace
   - Check `HF_TOKEN` in `~/vllm-server/config.env`

3. **Model not found:**
   - Verify the model name on HuggingFace
   - Check internet connection
   - Try: `huggingface-cli login` with your token

### Slow first request

This is normal! vLLM:
1. Downloads the model on first start (3-10 minutes)
2. Loads the model into GPU memory (30-60 seconds)
3. Warms up the inference engine

Subsequent requests are fast.

### Out of GPU memory

```bash
# Check current memory
nvidia-smi

# Solutions:
# 1. Use a smaller model
# 2. Lower GPU memory usage
nano ~/vllm-server/config.env
GPU_MEMORY_UTILIZATION="0.8"  # Was 0.9

# 3. Reduce max length
MAX_MODEL_LEN="2048"  # Was 4096

# 4. Use a quantized model (AWQ/GPTQ)
MODEL_NAME="TheBloke/Mistral-7B-Instruct-v0.2-AWQ"

# Restart
sudo systemctl restart vllm
```

### Model download fails

```bash
# Manual download
cd ~/.cache/huggingface
export HF_TOKEN="hf_your_token"

# Install huggingface-cli
pip install huggingface-hub

# Login
huggingface-cli login

# Download model
huggingface-cli download meta-llama/Llama-3.2-3B-Instruct
```

### API returns errors

```bash
# Check service is running
sudo systemctl status vllm

# Check logs for errors
sudo journalctl -u vllm -n 50

# Test health endpoint
curl http://localhost:8000/health

# Verify model loaded
curl http://localhost:8000/v1/models
```
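To run those checks in one pass, a small diagnostic helper (a sketch; adjust the port if you changed it in `config.env`):

```bash
#!/bin/bash
# One-shot vLLM health report: service state, health endpoint, loaded models, recent logs
systemctl is-active --quiet vllm && echo "service: active" || echo "service: NOT active"
curl -sf http://localhost:8000/health > /dev/null && echo "health: ok" || echo "health: FAILED"
echo "models:"
curl -s http://localhost:8000/v1/models
echo
echo "last log lines:"
sudo journalctl -u vllm -n 5 --no-pager
```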
### Change model

```bash
# 1. Stop service
sudo systemctl stop vllm

# 2. Edit config
nano ~/vllm-server/config.env
# Change MODEL_NAME

# 3. Clear cache (optional, saves disk space)
rm -rf ~/.cache/huggingface/hub/*

# 4. Start service
sudo systemctl start vllm

# 5. Monitor download
sudo journalctl -u vllm -f
```

## Benchmarks vs Alternatives

Illustrative figures only — actual numbers depend heavily on model, GPU, and request mix:

**Throughput (requests/sec):**
- vLLM: ~2000
- Ollama: ~500-1000
- HuggingFace Transformers: ~80

**Latency (first token):**
- vLLM: ~20ms
- Ollama: ~50ms
- HuggingFace Transformers: ~100ms

**Memory efficiency:**
- vLLM: roughly 2x better KV-cache utilization (PagedAttention)
- Can serve about 2x more concurrent requests on the same GPU

## When to use vLLM vs Ollama

**Use vLLM for:**
- ✅ Production workloads
- ✅ High throughput needs
- ✅ Multi-GPU setups
- ✅ Custom model configurations
- ✅ Maximum performance

**Use Ollama for:**
- ✅ Quick prototyping
- ✅ Simpler setup
- ✅ Model management UI
- ✅ Desktop/laptop use
- ✅ Non-technical users

## Integration Examples

### With LiteLLM

Point LiteLLM to vLLM:

```yaml
# ~/.litellm/config.yaml
model_list:
  - model_name: llama-3-8b
    litellm_params:
      model: openai/meta-llama/Llama-3.1-8B-Instruct
      api_base: http://localhost:8000/v1
      api_key: EMPTY
```

### With LangChain

```python
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(
    base_url="http://localhost:8000/v1",
    api_key="EMPTY",
    model="meta-llama/Llama-3.2-3B-Instruct"
)

response = llm.invoke("Hello!")
print(response.content)
```

### With LlamaIndex

```python
from llama_index.llms.openai import OpenAI

llm = OpenAI(
    api_base="http://localhost:8000/v1",
    api_key="EMPTY",
    model="meta-llama/Llama-3.2-3B-Instruct"
)

response = llm.complete("Hello!")
print(response)
```

(If your LlamaIndex version rejects non-OpenAI model names, use `OpenAILike` from `llama_index.llms.openai_like` instead.)

## Update vLLM

```bash
source ~/vllm-server/venv/bin/activate
pip install --upgrade vllm
sudo systemctl restart vllm
```

## Uninstall

```bash
sudo systemctl stop vllm
sudo systemctl disable vllm
sudo rm /etc/systemd/system/vllm.service
sudo systemctl daemon-reload
rm -rf ~/vllm-server
rm -rf ~/vllm-examples
rm -rf ~/.cache/huggingface  # Optional: removes downloaded models
```

## Resources

- **GitHub:** https://github.com/vllm-project/vllm
- **Docs:** https://docs.vllm.ai/
- **Paper:** https://arxiv.org/abs/2309.06180 (PagedAttention)
- **Models:** https://huggingface.co/models
- **Discord:** https://discord.gg/vllm

## Popular Use Cases

1. **Production API** - High-throughput LLM serving
2. **RAG systems** - Fast embedding + generation
3. **Code assistants** - Low-latency code completion
4. **Chatbots** - Concurrent user conversations
5. **Batch processing** - Large-scale text generation
6. **Research** - Experiment with different models quickly

## Tips & Best Practices

1. **Start small** - Test with 3B/7B models first
2. **Monitor GPU** - Use `nvidia-smi` to watch memory
3. **Tune batch size** - Balance throughput vs latency (measure first; see the probe below)
4. **Use quantization** - AWQ models cut weight memory roughly 4x vs fp16
5. **Enable tensor parallelism** - Utilize all GPUs
6. **Cache models** - First start is slow (downloads model)
7. **Set max tokens** - Prevent runaway generations
8. **Use streaming** - Better UX for long responses
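Before tuning batch size or memory settings, measure your baseline. A minimal single-request probe (a sketch using the OpenAI client; match the model to your `config.env`):

```python
#!/usr/bin/env python3
"""Rough latency/throughput probe for the local vLLM server."""
import time

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

start = time.time()
response = client.chat.completions.create(
    model="meta-llama/Llama-3.2-3B-Instruct",  # match MODEL_NAME in config.env
    messages=[{"role": "user", "content": "Explain PagedAttention in one paragraph."}],
    max_tokens=256,
)
elapsed = time.time() - start

tokens = response.usage.completion_tokens
print(f"{tokens} tokens in {elapsed:.2f}s -> {tokens / elapsed:.1f} tok/s")
```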
## Example: Production Setup

```bash
# 1. Use a production-grade model
MODEL_NAME="meta-llama/Llama-3.1-8B-Instruct"

# 2. Optimize for throughput
MAX_NUM_SEQS="512"
GPU_MEMORY_UTILIZATION="0.95"

# 3. Enable multi-GPU (if available)
TENSOR_PARALLEL_SIZE="2"

# 4. Set reasonable limits
MAX_MODEL_LEN="4096"

# 5. Add authentication (add to start.sh)
# --api-key "production-secret-key"

# 6. Monitor with Prometheus (optional)
# vLLM serves metrics at http://localhost:8000/metrics by default
```

## Community Models to Try

**Coding:**
- Qwen/Qwen2.5-Coder-7B-Instruct
- deepseek-ai/deepseek-coder-6.7b-instruct
- codellama/CodeLlama-13b-Instruct-hf

**Creative Writing:**
- NousResearch/Hermes-2-Pro-Llama-3-8B
- SynthIA-7B-v2.0

**Multilingual:**
- Qwen/Qwen2.5-7B-Instruct (29+ languages)
- CohereForAI/aya-23-8B (23 languages)

**Fast & Small:**
- microsoft/Phi-3-mini-4k-instruct
- google/gemma-2-2b-it
- stabilityai/stablelm-2-1_6b

Happy serving! 🚀

diff --git a/vllm/setup.sh b/vllm/setup.sh
new file mode 100644
index 0000000..1b70af7
--- /dev/null
+++ b/vllm/setup.sh
@@ -0,0 +1,265 @@
#!/bin/bash
set -e

# Detect Brev user (handles ubuntu, nvidia, shadeform, etc.)
detect_brev_user() {
    if [ -n "${SUDO_USER:-}" ] && [ "$SUDO_USER" != "root" ]; then
        echo "$SUDO_USER"
        return
    fi
    # Check for Brev-specific markers
    for user_home in /home/*; do
        username=$(basename "$user_home")
        [ "$username" = "launchpad" ] && continue
        if ls "$user_home"/.lifecycle-script-ls-*.log 2>/dev/null | grep -q . || \
           [ -f "$user_home/.verb-setup.log" ] || \
           { [ -L "$user_home/.cache" ] && [ "$(readlink "$user_home/.cache")" = "/ephemeral/cache" ]; }; then
            echo "$username"
            return
        fi
    done
    # Fallback to common users
    [ -d "/home/nvidia" ] && echo "nvidia" && return
    [ -d "/home/ubuntu" ] && echo "ubuntu" && return
    echo "ubuntu"
}

# Set USER and HOME if running as root
if [ "$(id -u)" -eq 0 ] || [ "${USER:-}" = "root" ]; then
    DETECTED_USER=$(detect_brev_user)
    export USER="$DETECTED_USER"
    export HOME="/home/$DETECTED_USER"
fi

echo "🚀 Setting up vLLM..."
echo "User: $USER | Home: $HOME"

# Verify GPU is available
if command -v nvidia-smi &> /dev/null; then
    GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader | head -1)
    GPU_COUNT=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
    echo "✓ GPU detected: $GPU_NAME (Count: $GPU_COUNT)"
else
    echo "❌ ERROR: NVIDIA GPU required for vLLM"
    exit 1
fi

# Install system dependencies
echo "Installing system dependencies..."
sudo apt-get update -qq
sudo apt-get install -y -qq python3-pip python3-venv curl

# Create vLLM directory
VLLM_DIR="$HOME/vllm-server"
mkdir -p "$VLLM_DIR"

# Create virtual environment if it doesn't exist
if [ ! -d "$VLLM_DIR/venv" ]; then
    echo "Creating Python virtual environment..."
    python3 -m venv "$VLLM_DIR/venv"
else
    echo "Virtual environment already exists, skipping..."
fi

# Activate and install vLLM
echo "Installing vLLM (this may take 2-3 minutes)..."
+source "$VLLM_DIR/venv/bin/activate" +pip install --upgrade pip -q +pip install vllm -q + +# Create model cache directory +mkdir -p "$HOME/.cache/huggingface" + +# Create example config file +cat > "$VLLM_DIR/config.env" << 'EOF' +# vLLM Configuration +# Edit these values and restart the service: sudo systemctl restart vllm + +# Model to serve (Hugging Face model ID) +MODEL_NAME="meta-llama/Llama-3.2-3B-Instruct" + +# API settings +HOST="0.0.0.0" +PORT="8000" + +# GPU settings (adjust based on your hardware) +TENSOR_PARALLEL_SIZE="1" # Set to GPU count for multi-GPU +GPU_MEMORY_UTILIZATION="0.9" # Use 90% of GPU memory + +# Performance settings +MAX_MODEL_LEN="4096" # Maximum sequence length +MAX_NUM_SEQS="256" # Maximum number of sequences + +# Optional: Hugging Face token (needed for gated models like Llama) +# HF_TOKEN="hf_..." +EOF + +# Create startup script +cat > "$VLLM_DIR/start.sh" << 'EOF' +#!/bin/bash +set -e + +# Load config +source ~/vllm-server/config.env + +# Activate venv +source ~/vllm-server/venv/bin/activate + +# Set HuggingFace cache +export HF_HOME="$HOME/.cache/huggingface" + +# Start vLLM server +exec python3 -m vllm.entrypoints.openai.api_server \ + --model "$MODEL_NAME" \ + --host "$HOST" \ + --port "$PORT" \ + --tensor-parallel-size "$TENSOR_PARALLEL_SIZE" \ + --gpu-memory-utilization "$GPU_MEMORY_UTILIZATION" \ + --max-model-len "$MAX_MODEL_LEN" \ + --max-num-seqs "$MAX_NUM_SEQS" \ + --trust-remote-code +EOF +chmod +x "$VLLM_DIR/start.sh" + +# Create systemd service +sudo tee /etc/systemd/system/vllm.service > /dev/null << EOF +[Unit] +Description=vLLM OpenAI-Compatible API Server +After=network.target + +[Service] +Type=simple +User=$USER +WorkingDirectory=$HOME/vllm-server +Environment="PATH=$VLLM_DIR/venv/bin:/usr/local/bin:/usr/bin:/bin" +Environment="HF_HOME=$HOME/.cache/huggingface" +ExecStart=$VLLM_DIR/start.sh +Restart=on-failure +RestartSec=10 +StandardOutput=journal +StandardError=journal + +[Install] +WantedBy=multi-user.target +EOF + +# Create example scripts +mkdir -p "$HOME/vllm-examples" + +cat > "$HOME/vllm-examples/test_api.py" << 'EOF' +#!/usr/bin/env python3 +"""Test vLLM API with OpenAI client""" +from openai import OpenAI + +# Point to local vLLM server +client = OpenAI( + base_url="http://localhost:8000/v1", + api_key="EMPTY" # vLLM doesn't require auth by default +) + +# Test chat completion +response = client.chat.completions.create( + model="meta-llama/Llama-3.2-3B-Instruct", # Use your model name + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "What is vLLM?"} + ], + temperature=0.7, + max_tokens=150 +) + +print("Response:", response.choices[0].message.content) +print(f"\nTokens used: {response.usage.total_tokens}") +EOF +chmod +x "$HOME/vllm-examples/test_api.py" + +cat > "$HOME/vllm-examples/streaming_example.py" << 'EOF' +#!/usr/bin/env python3 +"""Streaming response example""" +from openai import OpenAI + +client = OpenAI( + base_url="http://localhost:8000/v1", + api_key="EMPTY" +) + +print("Streaming response:\n") +stream = client.chat.completions.create( + model="meta-llama/Llama-3.2-3B-Instruct", + messages=[{"role": "user", "content": "Write a short poem about GPUs."}], + stream=True, + max_tokens=200 +) + +for chunk in stream: + if chunk.choices[0].delta.content: + print(chunk.choices[0].delta.content, end="", flush=True) +print("\n") +EOF +chmod +x "$HOME/vllm-examples/streaming_example.py" + +cat > "$HOME/vllm-examples/curl_test.sh" << 'EOF' +#!/bin/bash +# Test vLLM 
API with curl + +curl http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "meta-llama/Llama-3.2-3B-Instruct", + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Hello! What can you do?"} + ], + "temperature": 0.7, + "max_tokens": 100 + }' +EOF +chmod +x "$HOME/vllm-examples/curl_test.sh" + +# Fix permissions if running as root +if [ "$(id -u)" -eq 0 ]; then + chown -R $USER:$USER "$VLLM_DIR" + chown -R $USER:$USER "$HOME/vllm-examples" + chown -R $USER:$USER "$HOME/.cache/huggingface" 2>/dev/null || true +fi + +# Reload systemd and enable service (but don't start yet) +sudo systemctl daemon-reload +sudo systemctl enable vllm + +echo "" +echo "✅ vLLM installation complete!" +echo "" +echo "⚙️ Configuration: $VLLM_DIR/config.env" +echo "📝 Examples: $HOME/vllm-examples/" +echo "" +echo "🔧 IMPORTANT: Configure before starting!" +echo "" +echo "1. Edit the model in config:" +echo " nano $VLLM_DIR/config.env" +echo "" +echo "2. For gated models (Llama, etc), add HuggingFace token:" +echo " - Get token: https://huggingface.co/settings/tokens" +echo " - Add to config: HF_TOKEN=\"hf_...\"" +echo "" +echo "3. Start the service:" +echo " sudo systemctl start vllm" +echo "" +echo "4. Check status:" +echo " sudo systemctl status vllm" +echo " sudo journalctl -u vllm -f" +echo "" +echo "⚠️ First start downloads the model (~3-10GB) - check logs!" +echo "⚠️ To access from outside Brev, open port: 8000/tcp" +echo "" +echo "Quick test (after starting):" +echo " python3 $HOME/vllm-examples/test_api.py" +echo " bash $HOME/vllm-examples/curl_test.sh" +echo "" +echo "Popular models to try:" +echo " • meta-llama/Llama-3.2-3B-Instruct (3B - fast, low memory)" +echo " • meta-llama/Llama-3.1-8B-Instruct (8B - balanced)" +echo " • mistralai/Mistral-7B-Instruct-v0.3 (7B - good quality)" +echo " • Qwen/Qwen2.5-7B-Instruct (7B - excellent coding)" +echo "" + From 21594198388aa82148675f4513c176a65c8e9a76 Mon Sep 17 00:00:00 2001 From: Harsha Vardhan Date: Fri, 21 Nov 2025 20:10:24 -0600 Subject: [PATCH 2/4] Add vLLM integration --- vllm/setup.sh | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/vllm/setup.sh b/vllm/setup.sh index 1b70af7..45c3c92 100644 --- a/vllm/setup.sh +++ b/vllm/setup.sh @@ -38,7 +38,15 @@ echo "User: $USER | Home: $HOME" if command -v nvidia-smi &> /dev/null; then GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader | head -1) GPU_COUNT=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) + GPU_MEMORY=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader | head -1 | awk '{print $1}') echo "✓ GPU detected: $GPU_NAME (Count: $GPU_COUNT)" + echo "✓ GPU memory: ${GPU_MEMORY}MB" + + # Warn if memory might be tight + if [ "$GPU_MEMORY" -lt 12000 ]; then + echo "⚠️ WARNING: GPU has <12GB memory. Default model (Mistral 7B) may not fit." 
+ echo " Consider using a smaller model like: microsoft/Phi-3-mini-4k-instruct" + fi else echo "❌ ERROR: NVIDIA GPU required for vLLM" exit 1 @@ -76,7 +84,8 @@ cat > "$VLLM_DIR/config.env" << 'EOF' # Edit these values and restart the service: sudo systemctl restart vllm # Model to serve (Hugging Face model ID) -MODEL_NAME="meta-llama/Llama-3.2-3B-Instruct" +# Using Mistral 7B - no token required, excellent quality +MODEL_NAME="mistralai/Mistral-7B-Instruct-v0.3" # API settings HOST="0.0.0.0" @@ -148,7 +157,10 @@ mkdir -p "$HOME/vllm-examples" cat > "$HOME/vllm-examples/test_api.py" << 'EOF' #!/usr/bin/env python3 -"""Test vLLM API with OpenAI client""" +"""Test vLLM API with OpenAI client + +Requires: pip install openai>=1.0.0 +""" from openai import OpenAI # Point to local vLLM server @@ -159,7 +171,7 @@ client = OpenAI( # Test chat completion response = client.chat.completions.create( - model="meta-llama/Llama-3.2-3B-Instruct", # Use your model name + model="mistralai/Mistral-7B-Instruct-v0.3", # Use your model name messages=[ {"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "What is vLLM?"} @@ -185,7 +197,7 @@ client = OpenAI( print("Streaming response:\n") stream = client.chat.completions.create( - model="meta-llama/Llama-3.2-3B-Instruct", + model="mistralai/Mistral-7B-Instruct-v0.3", messages=[{"role": "user", "content": "Write a short poem about GPUs."}], stream=True, max_tokens=200 @@ -205,7 +217,7 @@ cat > "$HOME/vllm-examples/curl_test.sh" << 'EOF' curl http://localhost:8000/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ - "model": "meta-llama/Llama-3.2-3B-Instruct", + "model": "mistralai/Mistral-7B-Instruct-v0.3", "messages": [ {"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "Hello! What can you do?"} @@ -253,13 +265,14 @@ echo "⚠️ First start downloads the model (~3-10GB) - check logs!" 
echo "⚠️ To access from outside Brev, open port: 8000/tcp" echo "" echo "Quick test (after starting):" +echo " pip install openai # If not already installed" echo " python3 $HOME/vllm-examples/test_api.py" echo " bash $HOME/vllm-examples/curl_test.sh" echo "" -echo "Popular models to try:" -echo " • meta-llama/Llama-3.2-3B-Instruct (3B - fast, low memory)" -echo " • meta-llama/Llama-3.1-8B-Instruct (8B - balanced)" -echo " • mistralai/Mistral-7B-Instruct-v0.3 (7B - good quality)" -echo " • Qwen/Qwen2.5-7B-Instruct (7B - excellent coding)" +echo "Popular models to try (edit config.env):" +echo " • mistralai/Mistral-7B-Instruct-v0.3 (default - no token needed)" +echo " • microsoft/Phi-3-mini-4k-instruct (3.8B - smaller, ~6GB VRAM)" +echo " • Qwen/Qwen2.5-7B-Instruct (7B - excellent for coding)" +echo " • meta-llama/Llama-3.2-3B-Instruct (3B - needs HF token)" echo "" From cf9ab8e62232c8d77aa69ad7338e2332d5d0bfa1 Mon Sep 17 00:00:00 2001 From: Harsha Vardhan Mannem <144146034+HarshaVardhanMannem@users.noreply.github.com> Date: Fri, 21 Nov 2025 21:42:13 -0600 Subject: [PATCH 3/4] Update vllm/setup.sh Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- vllm/setup.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/setup.sh b/vllm/setup.sh index 45c3c92..6aa4fb2 100644 --- a/vllm/setup.sh +++ b/vllm/setup.sh @@ -109,10 +109,10 @@ cat > "$VLLM_DIR/start.sh" << 'EOF' set -e # Load config -source ~/vllm-server/config.env +source "$HOME/vllm-server/config.env" # Activate venv -source ~/vllm-server/venv/bin/activate +source "$HOME/vllm-server/venv/bin/activate" # Set HuggingFace cache export HF_HOME="$HOME/.cache/huggingface" From 5718f9beadf4cbe30924c71b18cabfe5c4a029b4 Mon Sep 17 00:00:00 2001 From: Harsha Vardhan Mannem <144146034+HarshaVardhanMannem@users.noreply.github.com> Date: Mon, 24 Nov 2025 21:27:23 -0600 Subject: [PATCH 4/4] Update vllm/setup.sh Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- vllm/setup.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/setup.sh b/vllm/setup.sh index 6aa4fb2..6776e60 100644 --- a/vllm/setup.sh +++ b/vllm/setup.sh @@ -230,9 +230,9 @@ chmod +x "$HOME/vllm-examples/curl_test.sh" # Fix permissions if running as root if [ "$(id -u)" -eq 0 ]; then - chown -R $USER:$USER "$VLLM_DIR" - chown -R $USER:$USER "$HOME/vllm-examples" - chown -R $USER:$USER "$HOME/.cache/huggingface" 2>/dev/null || true + chown -R "$USER:$USER" "$VLLM_DIR" + chown -R "$USER:$USER" "$HOME/vllm-examples" + chown -R "$USER:$USER" "$HOME/.cache/huggingface" 2>/dev/null || true fi # Reload systemd and enable service (but don't start yet)