EternityJune25 · l4b4r4b4b4 · Aug 18, 2025 · Aug 18, 2025 · Aug 19, 2025 · Aug 19, 2025
diff --git a/.env.example b/.env.example
@@ -0,0 +1,12 @@
+# Optional: uncomment to override the default models set in docker-compose.yml
+# LLM=
+# EMB_MODEL=
+LOCAL_DOCKER=True
+HUGGINGFACEHUB_API_TOKEN=
+LLM_MAX_CONTEXT_LENGTH=32768
+LLM_SWAP_SPACE=16
+LLM_CPU_OFFLOAD_SPACE=8
+DATASET=cinderella
+OUT_DIR=result/cinderella_vllm
+SAVE_DIR=outputs/cinderella_vllm
+OPENAI_API_KEY=DUMMY_KEY
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,4 @@
+.env
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
@@ -89,4 +90,4 @@ Thumbs.db
 # Other
 *.swp
 *.bak
-*.tmp 
+*.tmp
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,28 @@
+FROM pytorch/pytorch:2.6.0-cuda12.4-cudnn9-runtime
+
+WORKDIR /app
+
+# Install essential OS-level dependencies (apt lists are removed in the same layer)
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    curl \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install requirements before copying the source so this layer stays cached; --no-cache-dir keeps pip's cache out of the image
+COPY requirements.txt requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
+COPY . .
+
+# Set environment variables
+ENV PYTHONPATH=/app
+ENV PYTHONUNBUFFERED=1
+ENV PORT=7373
+ENV HOSTNAME=0.0.0.0
+
+# Set GPU runtime and Hugging Face download environment variables
+ENV NVIDIA_VISIBLE_DEVICES=all
+ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility,video
+ENV HF_HUB_ENABLE_HF_TRANSFER=1
+
+# Default command - will be overridden by specific environment Dockerfiles
+CMD ["python", "main_docker.py"]
diff --git a/README.md b/README.md
@@ -210,6 +210,113 @@ netstat -tlnp | grep 8000
 curl http://localhost:8000/v1/models
 ```
 
+### Method 3: Using Local docker compose deployment (main_docker.py) ⚡
+This method deploys everything needed locally. Namely:
+1. vLLM OpenAI-compatible server for language model inference
+2. 🤗 Hugging Face's Text Embeddings Inference (HF TEI)
+3. como-app
+
+#### Requirements
+- docker
+- NVIDIA Container Toolkit with CDI support (GPUs are requested as `nvidia.com/gpu=all` in `docker-compose.yml`)
+
+#### 1. Configure inference services 📝
+
+```bash
+cp .env.example .env
+```
+After copying the example environment file, adjust the environment variables as needed.
+
+#### 2. Pull, build and spin-up docker compose deployment
+
+```bash
+docker compose up -d && docker compose logs -f
+```
+
+#### 3. Check Deployment Services' Status 🔍
+After the application stack has been deployed, both HF TEI and vLLM need to download their models.
+
+The `como-app` container is only started once both of these services report a healthy status.
+```logs
+[+] Running 3/3
+ ✔ Container comorag-vllm-1      Healthy                                                                                                                                                                         1.0s
+ ✔ Container embeddings          Healthy                                                                                                                                                                        16.0s
+ ✔ Container comorag-como-app-1  Started
+ ```
+
+##### Check HF TEI logs
+```bash
+docker compose logs -f embeddings
+```
+
+Wait until you see the following:
+```logs
+...
+embeddings  | 2025-08-18T16:20:13.621603Z  INFO text_embeddings_router: router/src/lib.rs:239: Starting model backend
+embeddings  | 2025-08-18T16:20:13.623965Z  INFO text_embeddings_backend: backends/src/lib.rs:516: Downloading `model.safetensors`
+embeddings  | 2025-08-18T16:20:13.624022Z  INFO text_embeddings_backend: backends/src/lib.rs:395: Model weights downloaded in 63.05µs
+embeddings  | 2025-08-18T16:20:13.998327Z  INFO text_embeddings_backend_candle: backends/candle/src/lib.rs:412: Starting FlashNomicBert model on Cuda(CudaDevice(DeviceId(1)))
+embeddings  | 2025-08-18T16:20:23.549330Z  INFO text_embeddings_router: router/src/lib.rs:257: Warming up model
+embeddings  | 2025-08-18T16:20:23.945749Z  INFO text_embeddings_router::http::server: router/src/http/server.rs:1852: Starting HTTP server: 0.0.0.0:8080
+embeddings  | 2025-08-18T16:20:23.945762Z  INFO text_embeddings_router::http::server: router/src/http/server.rs:1853: Ready
+```
+
+##### Check vLLM logs
+```bash
+docker compose logs -f vllm
+```
+
+Wait until you see the following:
+```logs
+...
+vllm-1  | INFO 08-18 09:29:55 [api_server.py:1818] Starting vLLM API server 0 on http://0.0.0.0:80
+vllm-1  | INFO 08-18 09:29:55 [launcher.py:29] Available routes are:
+vllm-1  | INFO 08-18 09:29:55 [launcher.py:37] Route: /openapi.json, Methods: HEAD, GET
+vllm-1  | INFO 08-18 09:29:55 [launcher.py:37] Route: /docs, Methods: HEAD, GET
+vllm-1  | INFO 08-18 09:29:55 [launcher.py:37] Route: /docs/oauth2-redirect, Methods: HEAD, GET
+vllm-1  | INFO 08-18 09:29:55 [launcher.py:37] Route: /redoc, Methods: HEAD, GET
+vllm-1  | INFO 08-18 09:29:55 [launcher.py:37] Route: /health, Methods: GET
+vllm-1  | INFO 08-18 09:29:55 [launcher.py:37] Route: /load, Methods: GET
+vllm-1  | INFO 08-18 09:29:55 [launcher.py:37] Route: /ping, Methods: POST
+vllm-1  | INFO 08-18 09:29:55 [launcher.py:37] Route: /ping, Methods: GET
+vllm-1  | INFO 08-18 09:29:55 [launcher.py:37] Route: /tokenize, Methods: POST
+vllm-1  | INFO 08-18 09:29:55 [launcher.py:37] Route: /detokenize, Methods: POST
+vllm-1  | INFO 08-18 09:29:55 [launcher.py:37] Route: /v1/models, Methods: GET
+vllm-1  | INFO 08-18 09:29:55 [launcher.py:37] Route: /version, Methods: GET
+vllm-1  | INFO 08-18 09:29:55 [launcher.py:37] Route: /v1/responses, Methods: POST
+vllm-1  | INFO 08-18 09:29:55 [launcher.py:37] Route: /v1/responses/{response_id}, Methods: GET
+vllm-1  | INFO 08-18 09:29:55 [launcher.py:37] Route: /v1/responses/{response_id}/cancel, Methods: POST
+vllm-1  | INFO 08-18 09:29:55 [launcher.py:37] Route: /v1/chat/completions, Methods: POST
+vllm-1  | INFO 08-18 09:29:55 [launcher.py:37] Route: /v1/completions, Methods: POST
+vllm-1  | INFO 08-18 09:29:55 [launcher.py:37] Route: /v1/embeddings, Methods: POST
+vllm-1  | INFO 08-18 09:29:55 [launcher.py:37] Route: /pooling, Methods: POST
+vllm-1  | INFO 08-18 09:29:55 [launcher.py:37] Route: /classify, Methods: POST
+vllm-1  | INFO 08-18 09:29:55 [launcher.py:37] Route: /score, Methods: POST
+vllm-1  | INFO 08-18 09:29:55 [launcher.py:37] Route: /v1/score, Methods: POST
+vllm-1  | INFO 08-18 09:29:55 [launcher.py:37] Route: /v1/audio/transcriptions, Methods: POST
+vllm-1  | INFO 08-18 09:29:55 [launcher.py:37] Route: /v1/audio/translations, Methods: POST
+vllm-1  | INFO 08-18 09:29:55 [launcher.py:37] Route: /rerank, Methods: POST
+vllm-1  | INFO 08-18 09:29:55 [launcher.py:37] Route: /v1/rerank, Methods: POST
+vllm-1  | INFO 08-18 09:29:55 [launcher.py:37] Route: /v2/rerank, Methods: POST
+vllm-1  | INFO 08-18 09:29:55 [launcher.py:37] Route: /scale_elastic_ep, Methods: POST
+vllm-1  | INFO 08-18 09:29:55 [launcher.py:37] Route: /is_scaling_elastic_ep, Methods: POST
+vllm-1  | INFO 08-18 09:29:55 [launcher.py:37] Route: /invocations, Methods: POST
+vllm-1  | INFO 08-18 09:29:55 [launcher.py:37] Route: /metrics, Methods: GET
+vllm-1  | INFO:     Started server process [1]
+vllm-1  | INFO:     Waiting for application startup.
+vllm-1  | INFO:     Application startup complete.
+```
+
+##### ComoRAG
+Now attach to the como-app to see its logs.
+```bash
+docker compose logs -f como-app
+```
+
+For example:
+```logs
+```
+
 ### Comparison of Two Methods 📊
 
 | Feature | OpenAI API (main.py) | vLLM Local (main_vllm.py) |
@@ -264,4 +371,4 @@ For questions or suggestions, feel free to submit an Issue or PR.
 ---
 
 ## Acknowledgement 🙏
-We refer to the repository of [HippoRAG](https://github.com/OSU-NLP-Group/HippoRAG) as a skeleton code.
+We refer to the repository of [HippoRAG](https://github.com/OSU-NLP-Group/HippoRAG) as a skeleton code.
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -0,0 +1,169 @@
+volumes:
+  hf_cache:  # named volume mounted at /.hf_cache by the embeddings and vllm services
+
+services:
+  como-app:
+    container_name: como-app
+    build: .  # image is built from the Dockerfile in this directory
+    # image: nvidia/cuda:11.8.0-base-ubuntu22.04
+    # The key part for CDI GPU access:
+    device_cgroup_rules:
+      - "c 195:* rmw"
+      - "c 236:* rmw"
+    devices:
+      - nvidia.com/gpu=all
+    env_file: .env
+    environment:
+      - EMB_MODEL=${EMB_MODEL:-nomic-ai/nomic-embed-text-v1.5}  # same fallback as the embeddings service's --model-id
+    ipc: host
+    depends_on:  # the app is only started once both inference services pass their healthchecks
+      embeddings:
+        condition: service_healthy
+      vllm:
+        condition: service_healthy
+
+  embeddings:
+    deploy:
+      replicas: 1
+    image: ghcr.io/huggingface/text-embeddings-inference:86-1.8  # NOTE(review): the "86-" tag prefix presumably targets one GPU architecture - confirm it matches the host
+    volumes:
+      - hf_cache:/.hf_cache
+    ports:
+      - "8011:8080"  # quoted so YAML always treats the mapping as a string
+    ipc: host
+    container_name: embeddings
+    # Replace runtime with devices for CDI
+    devices:
+      - nvidia.com/gpu=all
+    device_cgroup_rules:
+      - "c 195:* rmw"
+      - "c 236:* rmw"
+    environment:
+      - EMB_MODEL=${EMB_MODEL:-nomic-ai/nomic-embed-text-v1.5}  # keep in sync with --model-id below
+      - NVIDIA_VISIBLE_DEVICES=all
+      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
+      - USE_FLASH_ATTENTION=True
+      - HF_HUB_ENABLE_HF_TRANSFER=1
+      - HF_HOME=/.hf_cache
+      - RUST_LOG=info
+      # Performance tuning for embedding models
+      - OMP_NUM_THREADS=8
+      - MKL_NUM_THREADS=8
+      - TOKENIZERS_PARALLELISM=true
+    restart: "no"  # must be quoted: a bare `no` can be parsed as a YAML boolean
+    env_file: .env
+    command:
+      [
+        "--model-id",
+        "${EMB_MODEL:-nomic-ai/nomic-embed-text-v1.5}",
+        "--hostname",
+        "0.0.0.0",
+        "--port",
+        "8080",
+        "--huggingface-hub-cache",
+        "/.hf_cache",
+        "--tokenization-workers",
+        "16",
+        "--max-concurrent-requests",
+        "1024",
+        "--max-batch-tokens",
+        "32768",
+        "--max-batch-requests",
+        "256",
+        "--max-client-batch-size",
+        "64",
+        "--auto-truncate",
+        "--payload-limit",
+        "4000000"
+      ]
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8080/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 30s
+    logging:
+      driver: json-file
+      options:
+        max-size: "100m"
+        max-file: "1"
+
+  vllm:
+    deploy:
+      replicas: 1
+    volumes:
+      - hf_cache:/.hf_cache
+    restart: "no"  # must be quoted: a bare `no` can be parsed as a YAML boolean
+    container_name: vllm
+    image: vllm/vllm-openai:latest  # NOTE(review): unpinned tag - consider pinning a release so the flags below stay valid
+    ports:
+      - "7373:80"  # quoted so YAML always treats the mapping as a string
+    device_cgroup_rules:
+      - "c 195:* rmw"
+      - "c 236:* rmw"
+    devices:
+      - nvidia.com/gpu=all
+    environment:
+      - HUGGING_FACE_HUB_TOKEN=${HUGGINGFACEHUB_API_TOKEN:-}  # left empty when unset instead of sending a placeholder as a real token
+      - HF_HOME=/.hf_cache
+      - NVIDIA_VISIBLE_DEVICES=all
+      - VLLM_ATTENTION_BACKEND=FLASHINFER
+      - HF_HUB_ENABLE_HF_TRANSFER=1
+      - MAX_PARALLEL_LOADING_WORKERS=4
+    ipc: host
+    env_file: .env
+    command: [
+        "--host",
+        "0.0.0.0",
+        "--port",
+        "80",
+        "--model",
+        "${LLM:-solidrust/Mistral-7B-Instruct-v0.3-AWQ}",
+        "--served-model-name",
+        "como/lm",
+        # The fallbacks below mirror the values shipped in .env.example.
+        "--max-model-len",
+        "${LLM_MAX_CONTEXT_LENGTH:-32768}",
+        "--max-num-batched-tokens",
+        "${LLM_MAX_CONTEXT_LENGTH:-32768}",
+        "--max-seq-len-to-capture",
+        "${LLM_MAX_CONTEXT_LENGTH:-32768}",
+        "--kv-cache-dtype",
+        "fp8",
+        "--quantization",
+        "awq",
+        # "--dtype", "float16",
+        "--enable-auto-tool-choice",
+        "--tool-call-parser",
+        "mistral", # llama3_json
+        "--chat-template",
+        "examples/tool_chat_template_mistral_parallel.jinja", # tool_chat_template_llama3_json
+        "--cpu-offload-gb",
+        "${LLM_CPU_OFFLOAD_SPACE:-8}",
+        "--gpu-memory-utilization",
+        "0.73",
+        # "--block-size",
+        # "32",
+        "--swap-space",
+        "${LLM_SWAP_SPACE:-16}",
+        # "--trust-remote-code",
+        "--seed",
+        "4269",
+        "--max-num-seqs",
+        "1",
+        "--trust-remote-code",
+        "--enable-prefix-caching",
+        "--enable-chunked-prefill",
+        # "--disable-sliding-window",
+        # "--max-paddings", "16",
+        # "--enable-chunked-prefill", # NOTE(review): earlier note said this cannot be combined with prefix caching, yet both are enabled above - confirm
+        # "--enforce-eager",
+        # "--max-parallel-loading-workers",
+        # "2"
+      ]
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:80/health"]
+      interval: 30s
+      timeout: 5s
+      retries: 5
+      start_period: 10m  # failed probes during model download/loading do not count towards retries