From f533b6462d90a554a1c677a491bf3ec5547cf665 Mon Sep 17 00:00:00 2001
From: raj-dummugudupu <grajsureshd@gmail.com>
Date: Wed, 25 Mar 2026 05:35:56 -0400
Subject: [PATCH] update and refactor code to support multiple inference
 providers

---
 .env.example                                  |    8 +
 .github/pull_request_template.md              |   39 +
 .github/workflows/code-scans.yaml             |    2 +-
 Contributing.md                               |  176 ++-
 Docs/DOCKER_SETUP.md                          |  231 ----
 Docs/PROJECT_DOCUMENTATION.md                 |   98 --
 Docs/QUICKSTART.md                            |  452 -------
 README.md                                     | 1186 ++++++++++++-----
 Security.md                                   |   29 +-
 ... Conditions .md => TERMS_AND_CONDITIONS.md |    0
 TROUBLESHOOTING.md                            |  876 ++++++++++++
 backend/.env.example                          |  139 ++
 backend/Dockerfile                            |    7 +-
 backend/api.py                                |  456 ++-----
 backend/config.py                             |   63 +
 backend/requirements.txt                      |    2 +
 backend/services/__init__.py                  |    9 +
 backend/services/llm_service.py               |  185 +++
 backend/utils/constants.py                    |   29 +-
 backend/utils/document_processor.py           |  240 +---
 backend/utils/rag_pipeline.py                 |  516 +------
 backend/utils/vector_store.py                 |  627 +--------
 configuration/.env.example                    |    1 -
 configuration/docker-compose.yml              |   32 -
 docker-compose.yml                            |   63 +
 frontend/package-lock.json                    |   11 +-
 frontend/src/App.jsx                          |   52 +-
 frontend/src/components/ChatInterface.jsx     |   72 +-
 frontend/src/components/ConfigSidebar.jsx     |   84 --
 frontend/src/components/DocumentUpload.jsx    |   51 +-
 frontend/src/components/RAGPipelineInfo.jsx   |    4 -
 frontend/src/components/layout/Header.jsx     |    4 -
 frontend/src/main.jsx                         |   25 +-
 frontend/src/pages/Chat.jsx                   |   68 +-
 frontend/src/pages/Home.jsx                   |   19 -
 frontend/src/services/api.js                  |   40 +-
 36 files changed, 2697 insertions(+), 3199 deletions(-)
 create mode 100644 .env.example
 create mode 100644 .github/pull_request_template.md
 delete mode 100644 Docs/DOCKER_SETUP.md
 delete mode 100644 Docs/PROJECT_DOCUMENTATION.md
 delete mode 100644 Docs/QUICKSTART.md
 rename Terms And Conditions .md => TERMS_AND_CONDITIONS.md (100%)
 create mode 100644 TROUBLESHOOTING.md
 create mode 100644 backend/.env.example
 create mode 100644 backend/config.py
 create mode 100644 backend/services/__init__.py
 create mode 100644 backend/services/llm_service.py
 delete mode 100644 configuration/.env.example
 delete mode 100644 configuration/docker-compose.yml
 create mode 100644 docker-compose.yml
 delete mode 100644 frontend/src/components/ConfigSidebar.jsx

diff --git a/.env.example b/.env.example
new file mode 100644
index 0000000..d03d7c9
--- /dev/null
+++ b/.env.example
@@ -0,0 +1,8 @@
+# Docker Compose Environment Variables
+# This file is optional - main configuration is in backend/.env
+
+# Backend Port (default: 5000)
+BACKEND_PORT=5000
+
+# Frontend Port (default: 3000)
+FRONTEND_PORT=3000
diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
new file mode 100644
index 0000000..b93e50b
--- /dev/null
+++ b/.github/pull_request_template.md
@@ -0,0 +1,39 @@
+## Summary
+
+<!-- What does this PR do? Keep it to 1-3 bullet points. -->
+
+-
+
+## Type of Change
+
+<!-- Check the one that applies. -->
+
+- [ ] Bug fix
+- [ ] New feature / enhancement
+- [ ] Documentation update
+- [ ] Refactor (no behavior change)
+- [ ] Chore (dependencies, CI, tooling)
+
+## Changes Made
+
+<!-- Briefly describe the key changes. Link to relevant issues if applicable. -->
+
+Resolves #<!-- issue number -->
+
+## How to Test
+
+<!-- Steps a reviewer can follow to verify the changes. -->
+
+1.
+
+## Checklist
+
+- [ ] I have read the [Contributing Guide](../Contributing.md)
+- [ ] My branch is up to date with `main`
+- [ ] New environment variables (if any) are documented in `.env.example` and the README
+- [ ] No secrets, API keys, or credentials are included in this PR
+- [ ] I have tested my changes locally
+
+## Screenshots (if applicable)
+
+<!-- Add screenshots for UI changes. Delete this section if not applicable. -->
diff --git a/.github/workflows/code-scans.yaml b/.github/workflows/code-scans.yaml
index 18e5cd9..5139404 100644
--- a/.github/workflows/code-scans.yaml
+++ b/.github/workflows/code-scans.yaml
@@ -37,7 +37,7 @@ jobs:
         run: mkdir -p trivy-reports
         
       - name: Run Trivy FS Scan
-        uses: aquasecurity/trivy-action@0.24.0
+        uses: aquasecurity/trivy-action@0.35.0
         with:
           scan-type: 'fs'
           scan-ref: '.'
diff --git a/Contributing.md b/Contributing.md
index 3601f75..c38dde0 100644
--- a/Contributing.md
+++ b/Contributing.md
@@ -2,9 +2,9 @@
 
 Thanks for your interest in contributing to ClinIQ.
 
-ClinIQ is an open-source clinical document question-answering app built with a Flask backend, a React frontend, and a retrieval-augmented generation pipeline. We welcome improvements across the codebase, documentation, bug reports, design feedback, and workflow polish.
+ClinIQ is an open-source clinical document question-answering app built with a Flask backend, a React frontend, and a retrieval-augmented generation (RAG) pipeline with hybrid search and intelligent reranking. We welcome improvements across the codebase, documentation, bug reports, design feedback, and workflow polish.
 
-Before you start, please read the relevant section below. It helps keep contributions focused, reviewable, and aligned with the current project setup.
+Before you start, read the relevant section below. It helps keep contributions focused, reviewable, and aligned with the current project setup.
 
 ---
 
@@ -24,6 +24,7 @@ npm --version
 
 # Check Docker
 docker --version
+docker compose version
 
 # Check Git
 git --version
@@ -32,7 +33,7 @@ git --version
 New to contributing?
 
 1. Open an issue or pick an existing one to work on.
-2. Sync your branch from `dev`.
+2. Fork the repo and create a branch from `main`.
 3. Follow the local setup guide below.
 4. Run the app locally and verify your change before opening a PR.
 
@@ -42,13 +43,15 @@ New to contributing?
   - [Get help or ask a question?](#get-help-or-ask-a-question)
   - [Report a bug?](#report-a-bug)
   - [Suggest a new feature?](#suggest-a-new-feature)
+  - [Fork and clone the repo?](#fork-and-clone-the-repo)
   - [Set up ClinIQ locally?](#set-up-cliniq-locally)
   - [Start contributing code?](#start-contributing-code)
   - [Improve the documentation?](#improve-the-documentation)
   - [Submit a pull request?](#submit-a-pull-request)
+- [Branching model](#branching-model)
+- [Commit conventions](#commit-conventions)
 - [Code guidelines](#code-guidelines)
 - [Pull request checklist](#pull-request-checklist)
-- [Branching model](#branching-model)
 - [Thank you](#thank-you)
 
 ---
@@ -57,15 +60,15 @@ New to contributing?
 
 ### Get help or ask a question?
 
-- Start with the main project docs in [`README.md`](./README.md), [`Docs/QUICKSTART.md`](./Docs/QUICKSTART.md), and [`Docs/PROJECT_DOCUMENTATION.md`](./Docs/PROJECT_DOCUMENTATION.md).
+- Start with the main project docs in [`README.md`](./README.md), [`TROUBLESHOOTING.md`](./TROUBLESHOOTING.md), and [`Security.md`](./Security.md).
 - If something is unclear, open a GitHub issue with your question and the context you already checked.
 
 ### Report a bug?
 
 1. Search existing issues first.
 2. If the bug is new, open a GitHub issue.
-3. Include the environment, what happened, what you expected, and exact steps to reproduce.
-4. Add screenshots, logs, or request/response details if relevant.
+3. Include your environment, what happened, what you expected, and exact steps to reproduce.
+4. Add screenshots, logs, request details, or response payloads if relevant.
 
 ### Suggest a new feature?
 
@@ -73,6 +76,39 @@ New to contributing?
 2. Explain the problem, who it helps, and how it fits ClinIQ.
 3. If the change is large, get alignment in the issue before writing code.
 
+### Fork and clone the repo?
+
+All contributions should come from a **fork** of the repository. This keeps the upstream repo clean and lets maintainers review changes via pull requests.
+
+#### Step 1: Fork the repository
+
+Click the **Fork** button at the top-right of the [ClinIQ repo](https://github.com/cld2labs/ClinIQ) to create a copy under your GitHub account.
+
+#### Step 2: Clone your fork
+
+```bash
+git clone https://github.com/<your-username>/ClinIQ.git
+cd ClinIQ
+```
+
+#### Step 3: Add the upstream remote
+
+```bash
+git remote add upstream https://github.com/cld2labs/ClinIQ.git
+```
+
+This lets you pull in the latest changes from the original repo.
+
+#### Step 4: Create a branch
+
+Always branch off `main`. See [Branching model](#branching-model) for naming conventions.
+
+```bash
+git checkout main
+git pull upstream main
+git checkout -b <type>/<short-description>
+```
+
 ### Set up ClinIQ locally?
 
 #### Prerequisites
@@ -80,21 +116,29 @@ New to contributing?
 - Python 3.10+
 - Node.js 18+ and npm
 - Git
-- An OpenAI API key
+- Docker with Docker Compose v2
+- An OpenAI API key (set via the `OPENAI_API_KEY` variable in `backend/.env`)
 
 #### Option 1: Local development
 
-##### Step 1: Clone the repository
+##### Step 1: Configure environment variables
+
+Create a backend `.env` file for the API key:
 
 ```bash
-git clone git@github-work:cld2labs/ClinIQ.git
-cd ClinIQ
+cd backend
+echo "OPENAI_API_KEY=your_api_key_here" > .env
+cd ..
 ```
 
+Alternatively, set `OPENAI_API_KEY` as an environment variable in your shell before starting the backend.
+
 ##### Step 2: Install backend dependencies
 
 ```bash
 cd backend
+python -m venv .venv
+source .venv/bin/activate  # On Windows: .venv\Scripts\activate
 pip install -r requirements.txt
 cd ..
 ```
@@ -107,23 +151,7 @@ npm install
 cd ..
 ```
 
-##### Step 4: Configure environment variables
-
-Create `configuration/.env` from `configuration/.env.example` if you want to provide an API key through environment variables:
-
-```bash
-cp configuration/.env.example configuration/.env
-```
-
-Minimum example:
-
-```env
-OPENAI_API_KEY=your_api_key_here
-```
-
-You can also enter the API key in the app's configuration panel at runtime.
-
-##### Step 5: Start the backend
+##### Step 4: Start the backend
 
 ```bash
 cd backend
@@ -153,14 +181,7 @@ The frontend runs at `http://localhost:3000`.
 From the repository root:
 
 ```bash
-docker-compose -f configuration/docker-compose.yml up --build
-```
-
-Or from the `configuration` directory:
-
-```bash
-cd configuration
-docker-compose up --build
+docker compose up --build
 ```
 
 This starts:
@@ -178,11 +199,11 @@ This starts:
 ### Start contributing code?
 
 1. Open or choose an issue.
-2. Create a feature branch from `dev`.
+2. [Fork the repo](#fork-and-clone-the-repo) and create a feature branch from `main`.
 3. Keep the change focused on a single problem.
 4. Run the app locally and verify the affected workflow.
-5. Update docs when behavior, setup, or architecture changes.
-6. Open a pull request back to `dev`.
+5. Update docs when behavior, setup, configuration, or architecture changes.
+6. Open a pull request back to upstream `main`.
 
 ### Improve the documentation?
 
@@ -195,15 +216,60 @@ Documentation updates are welcome. Relevant files currently live in:
 
 ### Submit a pull request?
 
-Follow the checklist below before opening your PR. Your pull request should:
+1. Push your branch to your fork.
+2. Go to the [ClinIQ repo](https://github.com/cld2labs/ClinIQ) and click **Compare & pull request**.
+3. Set the base branch to `main`.
+4. Fill in the PR template (it loads automatically).
+5. Submit the pull request.
 
-- Stay focused on one issue or topic.
-- Explain what changed and why.
-- Include manual verification steps.
-- Include screenshots or short recordings for UI changes.
-- Reference the related GitHub issue when applicable.
+A maintainer will review your PR. You may be asked to make changes — push additional commits to the same branch and they will be added to the PR automatically.
+
+Before opening your PR, sync with upstream to avoid merge conflicts:
+
+```bash
+git fetch upstream
+git rebase upstream/main
+```
 
-Note: this repository currently includes automated security scanning on pull requests via GitHub Actions. If your PR triggers a scan failure, address it before requesting review.
+Before requesting review, work through the [Pull request checklist](#pull-request-checklist) below.
+
+---
+
+## Branching model
+
+- Fork the repo and base new work from `main`.
+- Open pull requests against upstream `main`.
+- Use descriptive branch names with a type prefix:
+
+| Prefix | Use |
+|---|---|
+| `feat/` | New features or enhancements |
+| `fix/` | Bug fixes |
+| `docs/` | Documentation changes |
+| `refactor/` | Code restructuring (no behavior change) |
+| `chore/` | Dependency updates, CI changes, tooling |
+
+Examples: `feat/add-pdf-support`, `fix/embedding-timeout`, `docs/update-quickstart`
+
+---
+
+## Commit conventions
+
+Use [Conventional Commits](https://www.conventionalcommits.org/) format:
+
+```
+<type>(<optional scope>): <short description>
+```
+
+Examples:
+
+```bash
+git commit -m "feat(api): add hybrid search support"
+git commit -m "fix(ui): resolve citation rendering issue"
+git commit -m "docs: update troubleshooting guide"
+```
+
+Keep commits focused — one logical change per commit.
 
 ---
 
@@ -211,11 +277,11 @@ Note: this repository currently includes automated security scanning on pull req
 
 - Follow the existing project structure and patterns before introducing new abstractions.
 - Keep frontend changes consistent with the React + Vite + Tailwind setup already in use.
-- Keep backend changes consistent with the Flask API and utility modules in `backend/utils`.
+- Keep backend changes consistent with the Flask service structure in `backend/`.
 - Avoid unrelated refactors in the same pull request.
-- Do not commit secrets, API keys, uploaded documents, or generated local database files.
+- Do not commit secrets, API keys, uploaded files, local `.env` files, or generated artifacts.
 - Prefer clear, small commits and descriptive pull request summaries.
-- Update documentation when contributor setup, behavior, or API usage changes.
+- Update documentation when contributor setup, behavior, environment variables, or API usage changes.
 
 ---
 
@@ -230,22 +296,12 @@ Before submitting your pull request, confirm the following:
 - You kept the pull request scoped to one issue or topic.
 - You added screenshots for UI changes when relevant.
 - You did not commit secrets or local generated data.
-- You are opening the pull request against `dev`.
-- You reviewed any GitHub Action scan failures and resolved them.
+- You are opening the pull request against `main`.
 
 If one or more of these are missing, the pull request may be sent back for changes before review.
 
 ---
 
-## Branching model
-
-- Base new work from `dev`.
-- Open pull requests against `dev`.
-- Use descriptive branch names such as `fix/upload-error-handling` or `docs/update-contributing-guide`.
-- Rebase or merge the latest `dev` before opening your PR if your branch has drifted.
-
----
-
 ## Thank you
 
 Thanks for contributing to ClinIQ. Whether you're fixing a bug, improving the docs, or refining the product experience, your work helps make the project more useful and easier to maintain.
diff --git a/Docs/DOCKER_SETUP.md b/Docs/DOCKER_SETUP.md
deleted file mode 100644
index e47dabd..0000000
--- a/Docs/DOCKER_SETUP.md
+++ /dev/null
@@ -1,231 +0,0 @@
-# Docker Setup Guide for ClinIQ
-
-This guide explains how to run ClinIQ using Docker, so you don't need to install Node.js or Python dependencies directly on your system.
-
-## Prerequisites
-
-- **Docker Desktop** installed on your system
-  - Download from: https://www.docker.com/products/docker-desktop/
-  - Install and make sure Docker Desktop is running
-
-## Quick Start
-
-### 1. Build and Run with Docker Compose
-
-```bash
-docker-compose up --build
-```
-
-This will:
-- Build the backend (Flask API) container
-- Build the frontend (React) container
-- Start both services
-- Frontend: http://localhost:3000
-- Backend: http://localhost:5000
-
-### 2. Run in Detached Mode (Background)
-
-```bash
-docker-compose up -d --build
-```
-
-### 3. View Logs
-
-```bash
-# All services
-docker-compose logs -f
-
-# Backend only
-docker-compose logs -f backend
-
-# Frontend only
-docker-compose logs -f frontend
-```
-
-### 4. Stop the Services
-
-```bash
-docker-compose down
-```
-
-## Individual Container Commands
-
-### Run Backend Only
-
-```bash
-# Build backend image
-docker build -f Dockerfile.backend -t cliniq-backend .
-
-# Run backend container
-docker run -p 5000:5000 \
-  -v "${PWD}/.chromadb:/app/.chromadb" \
-  -v "${PWD}/uploads:/app/uploads" \
-  cliniq-backend
-```
-
-### Run Frontend Only
-
-```bash
-# Build frontend image
-cd frontend
-docker build -t cliniq-frontend .
-
-# Run frontend container
-docker run -p 3000:3000 \
-  -v "${PWD}:/app" \
-  -v /app/node_modules \
-  cliniq-frontend
-```
-
-## Development Mode
-
-The Docker setup includes volume mounts for development:
-
-- **Backend**: Code changes require container restart
-- **Frontend**: Code changes are hot-reloaded automatically
-
-### Restart Services
-
-```bash
-# Restart all services
-docker-compose restart
-
-# Restart specific service
-docker-compose restart backend
-docker-compose restart frontend
-```
-
-## Troubleshooting
-
-### Port Already in Use
-
-If ports 3000 or 5000 are already in use:
-
-1. Edit `docker-compose.yml`
-2. Change the port mappings:
-   ```yaml
-   ports:
-     - "3001:3000"  # Frontend on different port
-     - "5001:5000"  # Backend on different port
-   ```
-
-### Rebuild After Dependency Changes
-
-If you modify `package.json` or `requirements.txt`:
-
-```bash
-docker-compose up --build --force-recreate
-```
-
-### Clear Docker Cache
-
-```bash
-# Remove all containers and images
-docker-compose down -v
-docker system prune -a
-
-# Rebuild from scratch
-docker-compose up --build
-```
-
-### Check Container Status
-
-```bash
-# List running containers
-docker-compose ps
-
-# Check container logs
-docker-compose logs backend
-docker-compose logs frontend
-```
-
-### Access Container Shell
-
-```bash
-# Backend container
-docker-compose exec backend bash
-
-# Frontend container
-docker-compose exec frontend sh
-```
-
-## Production Build
-
-For production, you'll want to build optimized images:
-
-### Frontend Production Build
-
-Edit `frontend/Dockerfile`:
-
-```dockerfile
-FROM node:20-alpine AS builder
-WORKDIR /app
-COPY package.json package-lock.json* ./
-RUN npm install
-COPY . .
-RUN npm run build
-
-FROM nginx:alpine
-COPY --from=builder /app/dist /usr/share/nginx/html
-EXPOSE 80
-CMD ["nginx", "-g", "daemon off;"]
-```
-
-### Backend Production
-
-Use a production WSGI server like Gunicorn:
-
-```dockerfile
-FROM python:3.11-slim
-WORKDIR /app
-COPY requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt gunicorn
-COPY . .
-CMD ["gunicorn", "-w", "4", "-b", "0.0.0.0:5000", "api:app"]
-```
-
-## File Structure
-
-```
-.
-├── docker-compose.yml      # Orchestrates both services
-├── Dockerfile.backend      # Backend container definition
-├── frontend/
-│   ├── Dockerfile          # Frontend container definition
-│   └── .dockerignore       # Files to exclude from build
-└── .dockerignore           # Root level ignore file
-```
-
-## Environment Variables
-
-You can set environment variables in `docker-compose.yml`:
-
-```yaml
-services:
-  backend:
-    environment:
-      - FLASK_ENV=production
-      - OPENAI_API_KEY=${OPENAI_API_KEY}
-  
-  frontend:
-    environment:
-      - VITE_BACKEND_ENDPOINT=http://backend:5000
-```
-
-## Benefits of Docker Setup
-
-✅ **No local Node.js installation needed**  
-✅ **No local Python environment setup**  
-✅ **Consistent environment across machines**  
-✅ **Easy to share and deploy**  
-✅ **Isolated dependencies**  
-✅ **Easy cleanup** (just `docker-compose down`)
-
-## Next Steps
-
-1. Install Docker Desktop
-2. Run `docker-compose up --build`
-3. Open http://localhost:3000 in your browser
-4. Start using ClinIQ!
-
-
diff --git a/Docs/PROJECT_DOCUMENTATION.md b/Docs/PROJECT_DOCUMENTATION.md
deleted file mode 100644
index 29e47db..0000000
--- a/Docs/PROJECT_DOCUMENTATION.md
+++ /dev/null
@@ -1,98 +0,0 @@
-# 🏥 ClinIQ - Deep Technical Documentation & Architecture Guide
-
-## 🏢 Stakeholder Summary (Non-Technical)
-ClinIQ is an intelligent assistant designed to help healthcare professionals quickly find accurate information within their own clinical documents (Guidelines, Manuals, Protocols).
-
-**How it works for you:**
-1. **Upload**: You provide your PDF or Word documents. The system "reads" and organizes them.
-2. **Search**: When you ask a question, the system uses "Hybrid Search"—a dual-check method that looks for both the *meaning* of your words and specific *keywords* (like drug names).
-3. **Verify**: The assistant doesn't just give an answer; it shows you its **Thinking Process** so you can trust its logic, and provides **Direct Citations** (with links) to the exact page where it found the information.
-4. **Memory**: The assistant now remembers what you talked about earlier. You can ask followup questions like "Tell me more about the treatment for that" without repeating yourself.
-
-*This tool acts as a high-speed research assistant, ensuring that every answer is grounded in your verified clinical documents.*
-
----
-
-ClinIQ is a high-precision AI Clinical Knowledge Assistant. This document provides a deep dive into the architectural decisions, code implementation, and the roadmap for scaling to an enterprise-grade solution.
-
----
-
-## 🏛️ Architectural Reasoning: Why This Design?
-
-The architecture of ClinIQ is built on the **Retrieval-Augmented Generation (RAG)** pattern, specifically optimized for the high-stakes clinical domain.
-
-### 1. The RAG Pattern vs. Fine-tuning
-**Choice**: RAG (Retrieval-Augmented Generation).
-**Reasoning**: 
-- **Verifiability**: Clinical decisions require citations. RAG provides direct links to source documents, which fine-tuned models cannot do reliably.
-- **Dynamic Knowledge**: Clinical protocols change frequently. RAG allows for instant updates by simply replacing documents, whereas fine-tuning is expensive and slow.
-- **Hallucination Mitigation**: By forcing the LLM to use only the provided context, we significantly reduce the risk of "invented" medical advice.
-
-### 2. Hybrid Search: Dense + Sparse Retrieval
-**Choice**: Combination of ChromaDB (Dense/Vector) and BM25 (Sparse/Keyword).
-**Reasoning**:
-- **Dense Search (Vector)**: Captures semantic meaning (e.g., "respiratory distress" matches "difficulty breathing"). Used via `text-embedding-3-small`.
-- **Sparse Search (BM25)**: Captures exact clinical terminology and rare drug names (e.g., "Sjogren's" or "Rituximab") that might be "diluted" in a high-dimensional vector space.
-- **Reciprocal Rank Fusion (RRF)**: We use RRF to merge these two lists, ensuring that a document that is both semantically relevant AND contains exact keywords rises to the top.
-
-### 3. Reranking: The Precision Filter
-**Choice**: Secondary similarity check after initial retrieval.
-**Reasoning**:
-- Initial search might retrieve 10 chunks based on broad relevance. Reranking performs a more compute-intensive comparison on just these 10 chunks to select the top 3. This ensures the LLM's context window is filled with only the highest-quality information, saving tokens and improving answer accuracy.
-
-### 4. Step-by-Step Thinking Process
-**Choice**: Mandatory Chain-of-Thought (CoT) prompting.
-**Reasoning**:
-- **Clinical Transparency**: Doctors need to know *why* an AI suggests a protocol.
-- **Self-Correction**: Forcing the model to "think" before answering often leads it to catch its own errors in interpretation before generating the final clinical advice.
-
----
-
-## 💻 Code Walkthrough: Data Flow
-
-### A. Document Ingestion (`api.py` -> `document_processor.py`)
-1. **Upload**: User uploads a PDF/Docx via the React frontend.
-2. **Extraction**: `extract_text_from_pdf` (using `PyMuPDF`) extracts text and preserves page numbers.
-3. **Chunking**: Text is broken into ~1000-character chunks with overlap. Overlap ensures that context isn't lost at the boundaries of chunks.
-4. **Embedding**: Chunks are sent to OpenAI's `text-embedding-3-small`.
-5. **Storage**: Vectors and metadata (filename, page) are stored in **ChromaDB**.
-
-### B. Query Execution (`rag_pipeline.py` -> `vector_store.py`)
-1. **Query Rewriting**: If chat history exists, the assistant first uses the LLM to rewrite the followup question into a self-contained search query. (e.g., "What about treatment?" -> "Recommended treatment for [Previously Mentioned Disease]").
-2. **Query Embedding**: The rewritten query is converted into a vector.
-3. **Hybrid Retrieval**:
-   - `search_documents` gets semantic matches.
-   - `bm25_search` gets keyword matches.
-   - `reciprocal_rank_fusion` merges them.
-4. **Reranking**: `rerank_chunks` re-evaluates the top candidates.
-5. **LLM Synthesis**: The top chunks and the **entire conversation history** are provided to the LLM. The system prompt enforces clinical rules (Professional tone, "I don't know" if not found, mandatory citations).
-
----
-
-## 🚀 Scaling Roadmap: Architecture & Implementation
-
-To scale ClinIQ from a local tool to a hospital-wide system, the following upgrades are recommended:
-
-### 1. Scaling the Architecture (Infrastructure)
-- **Distributed Vector Database**: Replace local ChromaDB with **Qdrant** or **Weaviate** in a cluster configuration to handle millions of documents.
-- **Asynchronous Processing**: Use **Celery + Redis** for document ingestion. Uploading a 500-page guideline should happen in the background, not blocking the user session.
-- **Load Balancing**: Deploy the Flask API behind **NGINX** with multiple workers (Gunicorn/Uvicorn) across multiple containers.
-- **Document Management Service**: Move file storage from local `/uploads` to **AWS S3** or **Google Cloud Storage** for persistence and scalability.
-
-### 2. Scaling the Implementation (Features)
-- **Multi-Document Reasoning**: Improve the pipeline to compare contraindications across multiple different uploaded drug manuals simultaneously.
-- **Streaming Responses**: Fully implement Server-Sent Events (SSE) to show the answer as it generates, reducing perceived latency.
-- **User Authentication**: Integrate **Supabase Auth** or **OAuth2** to allow individual doctors to have private, secure "knowledge silos".
-- **Medical LLMs**: Experiment with domain-specific models like **Med-PaLM 2** or fine-tuned Llama-3 for deeper clinical reasoning.
-- **Caching**: Implement **Redis caching** for common queries to avoid redundant LLM calls and reduce costs.
-
----
-
-## 📂 Key Project Files
-- `api.py`: The control center (Flask API).
-- `utils/vector_store.py`: The indexing engine (ChromaDB + BM25).
-- `utils/rag_pipeline.py`: The reasoning engine (RAG workflow).
-- `frontend/src/`: Modern UI built with React/Vite.
-
----
-
diff --git a/Docs/QUICKSTART.md b/Docs/QUICKSTART.md
deleted file mode 100644
index 362dd4c..0000000
--- a/Docs/QUICKSTART.md
+++ /dev/null
@@ -1,452 +0,0 @@
-# 🚀 Quick Start Guide
-
-Get ClinIQ up and running in minutes! This guide explains what each step does.
-
----
-
-## Option 1: Docker (Recommended - Easiest)
-
-### Prerequisites
-- **Docker Desktop** installed and running
-  - Download from: https://www.docker.com/products/docker-desktop/
-  - **What it does**: Docker allows you to run applications in isolated containers without installing dependencies on your computer
-
-### Step-by-Step Instructions
-
-#### Step 1: Clone the Repository
-
-```bash
-git clone <your-repo-url>
-cd clinical-rag
-```
-
-**What this does:**
-- Downloads the project code from GitHub to your computer
-- Navigates into the project directory
-- **Result**: You now have all the project files on your local machine
-
----
-
-#### Step 2: Start Everything with Docker Compose
-
-**First Time (Build + Run):**
-```bash
-docker-compose -f configuration/docker-compose.yml up --build
-```
-
-**Second Time and After (Just Run):**
-```bash
-docker-compose -f configuration/docker-compose.yml up
-```
-
-**What this does:**
-1. **Reads the Docker configuration** (`docker-compose.yml`)
-   - Tells Docker how to build and run both backend and frontend services
-
-2. **Builds the Backend Container**:
-   - Downloads Python 3.11 base image
-   - Installs all Python dependencies (Flask, OpenAI, ChromaDB, etc.) **inside the container**
-   - Copies backend code into the container
-   - Creates a ready-to-run backend image
-   - **Result**: Backend container image is created with everything pre-installed
-
-3. **Builds the Frontend Container**:
-   - Downloads Node.js 20 base image
-   - Installs all npm packages (React, Vite, Tailwind, etc.) **inside the container**
-   - Copies frontend code into the container
-   - Creates a ready-to-run frontend image
-   - **Result**: Frontend container image is created with everything pre-installed
-
-4. **Starts Both Containers**:
-   - Backend container runs `python api.py` → Flask API starts on port 5000
-   - Frontend container runs `npm run dev` → React dev server starts on port 3000
-   - **Result**: Both services are running and communicating with each other
-
-**What you'll see:**
-- Docker downloading base images (first time only)
-- Installing dependencies in containers
-- Starting both services
-- Logs from both backend and frontend
-
-**Time**: 
-- **First time**: 2-5 minutes (building images with dependencies)
-- **Second time onwards**: 10-30 seconds (just starting containers from existing images)
-
-**Important**: 
-- Use `--build` flag **only the first time** or when dependencies change
-- For regular use, just run `docker-compose up` (without `--build`)
-
----
-
-#### Step 3: Access the Application
-
-Open your browser and navigate to: **http://localhost:3000**
-
-**What this does:**
-- Connects to the frontend container running on port 3000
-- Loads the React application in your browser
-- **Result**: You see the ClinIQ interface
-
----
-
-#### Step 4: Stop the Services (When Done)
-
-**To stop the containers:**
-```bash
-# Press Ctrl+C in the terminal where docker-compose is running
-# OR run this command in a new terminal:
-docker-compose -f configuration/docker-compose.yml down
-```
-
-**What this does:**
-- Stops both backend and frontend containers
-- Removes the containers (but keeps the images)
-- Frees up ports 3000 and 5000
-- **Result**: Services are stopped, but images remain for next time
-
-**To stop and remove everything (including images):**
-```bash
-docker-compose -f configuration/docker-compose.yml down --rmi all
-```
-
-**What this does:**
-- Stops containers
-- Removes containers
-- Removes images (you'll need to rebuild next time)
-- **Result**: Complete cleanup
-
----
-
-### First Time Using the App
-
-#### Step 1: Enter Your OpenAI API Key
-
-1. Look for the **Configuration** panel on the left sidebar
-2. Enter your OpenAI API key in the "OpenAI API Key" field
-3. Click outside the field or press Enter
-
-**What this does:**
-- Stores your API key in browser localStorage (stays local, never sent to servers)
-- Enables the app to make API calls to OpenAI for document processing and Q&A
-- **Result**: API key is saved and ready to use
-
-**Where to get API key**: https://platform.openai.com/account/api-keys
-
----
-
-#### Step 2: Upload a Clinical Document
-
-1. In the **Document Upload** section, click "Browse Files" or drag & drop
-2. Select a PDF, DOCX, or TXT file
-3. Click "Upload & Process Document"
-
-**What this does:**
-1. **File Upload**: Sends your file to the backend API
-2. **Text Extraction**: 
-   - If PDF: Extracts text from all pages
-   - If DOCX: Extracts text from Word document
-   - If TXT: Reads the text file
-3. **Text Chunking**: 
-   - Breaks the document into smaller pieces (800 tokens each)
-   - Creates overlapping chunks for better context
-4. **Embedding Creation**: 
-   - Converts each chunk into AI-readable format (embeddings)
-   - Uses OpenAI's `text-embedding-3-small` model
-5. **Storage**: 
-   - Saves chunks and embeddings in ChromaDB (vector database)
-   - Stores metadata (filename, chunk IDs, page numbers)
-6. **Index Creation**: 
-   - Creates search indexes for fast retrieval
-   - Initializes BM25 index for keyword search
-
-**What you'll see:**
-- "Processing document..." message
-- Progress indicator
-- Success message: "Document processed! Created X chunks"
-
-**Result**: Your document is now searchable and ready for questions
-
----
-
-#### Step 3: Ask Questions
-
-1. Type your question in the chat input at the bottom
-2. Press Enter or click "Send"
-
-**What this does:**
-1. **Query Processing**:
-   - Converts your question into an embedding (AI-readable format)
-   - Prepares search query
-
-2. **Document Search** (if Hybrid Search enabled):
-   - **Dense Search**: Finds documents by meaning/semantics
-   - **Sparse Search**: Finds documents by keywords (BM25)
-   - **Reciprocal Rank Fusion (RRF)**: Combines both search results
-   - Retrieves top 15 relevant chunks
-
-3. **Reranking** (if enabled):
-   - Re-ranks results using cosine similarity
-   - Ensures most relevant chunks are at the top
-   - Selects top 7 chunks for context
-
-4. **Answer Generation**:
-   - Sends your question + relevant document chunks to GPT-3.5-Turbo
-   - AI generates answer based ONLY on your document content
-   - Includes source citations
-
-5. **Response Display**:
-   - Shows the answer in the chat
-   - Displays source citations
-   - Shows AI thinking process (if enabled)
-
-**What you'll see:**
-- "Analyzing..." message
-- Answer appears in the chat
-- Source citations below the answer
-- AI thinking process (if enabled)
-
-**Result**: You get an accurate answer with citations from your document
-
----
-
-## Option 2: Local Development (React + Flask)
-
-### Prerequisites
-- **Python 3.10+** installed
-- **Node.js 18+** and npm installed
-- **OpenAI API Key**
-
-### First Time Setup
-
-#### Step 1: Clone the Repository
-
-```bash
-git clone <your-repo-url>
-cd clinical-rag
-```
-
-**What this does:**
-- Downloads project code to your computer
-- **Result**: Project files are on your machine
-
----
-
-#### Step 2: Install Backend Dependencies
-
-```bash
-cd backend
-pip install -r requirements.txt
-cd ..
-```
-
-**What this does:**
-1. Navigates to the backend directory
-2. Reads `requirements.txt` (list of Python packages needed)
-3. Installs all packages using pip:
-   - `flask` - Web framework for API
-   - `flask-cors` - Enables frontend-backend communication
-   - `openai` - OpenAI API client
-   - `chromadb` - Vector database
-   - `PyPDF2` - PDF processing
-   - `python-docx` - Word document processing
-   - And 6 more packages...
-4. **Result**: All Python dependencies are installed on your system
-
-**Time**: 1-3 minutes
-
----
-
-#### Step 3: Install Frontend Dependencies
-
-```bash
-cd frontend
-npm install
-cd ..
-```
-
-**What this does:**
-1. Navigates to the frontend directory
-2. Reads `package.json` (list of npm packages needed)
-3. Installs all packages using npm:
-   - `react` & `react-dom` - React framework
-   - `react-router-dom` - Routing
-   - `vite` - Build tool and dev server
-   - `tailwindcss` - CSS framework
-   - And 10+ more packages...
-4. Creates `node_modules/` folder with all dependencies
-5. **Result**: All frontend dependencies are installed
-
-**Time**: 1-2 minutes
-
----
-
-#### Step 4: Start the Backend
-
-Open a terminal and run:
-
-```bash
-cd backend
-python api.py
-```
-
-**What this does:**
-1. Starts the Flask web server
-2. Creates API endpoints:
-   - `/api/health` - Health check
-   - `/api/upload` - Document upload
-   - `/api/query` - Question answering
-   - `/api/clear` - Clear knowledge base
-   - `/api/status` - Get status
-3. Enables CORS (allows frontend to connect)
-4. **Result**: Backend API is running on `http://localhost:5000`
-
-**What you'll see:**
-- "Starting ClinIQ Backend on port 5000"
-- "Running on http://127.0.0.1:5000"
-- Server logs
-
----
-
-#### Step 5: Start the Frontend
-
-Open another terminal and run:
-
-```bash
-cd frontend
-npm run dev
-```
-
-**What this does:**
-1. Starts Vite development server
-2. Compiles React components
-3. Sets up hot module replacement (auto-refresh on code changes)
-4. Proxies API requests to backend (port 5000)
-5. **Result**: Frontend is running on `http://localhost:3000`
-
-**What you'll see:**
-- "VITE v5.x.x ready in xxx ms"
-- "Local: http://localhost:3000"
-- Dev server logs
-
----
-
-#### Step 6: Access the Application
-
-1. Open your browser
-2. Navigate to `http://localhost:3000`
-
-**What this does:**
-- Loads the React application
-- Connects to backend API
-- **Result**: ClinIQ interface is ready to use
-
----
-
-### Running After First Time Setup
-
-Once you've installed dependencies, you only need to **start the services**:
-
-**Terminal 1 - Backend:**
-```bash
-cd backend
-python api.py
-```
-
-**Terminal 2 - Frontend:**
-```bash
-cd frontend
-npm run dev
-```
-
-**What this does:**
-- Starts the services without reinstalling dependencies
-- **Result**: Application is running (much faster than first time)
-
-**Note**: You don't need to run `pip install` or `npm install` again unless:
-- You add new dependencies to `requirements.txt` or `package.json`
-- You delete `node_modules/` or Python packages
-- You're setting up on a new machine
-
----
-
-#### Stopping the Services (When Done)
-
-**To stop the backend:**
-- Go to Terminal 1 (where backend is running)
-- Press `Ctrl+C`
-- **Result**: Flask server stops, port 5000 is freed
-
-**To stop the frontend:**
-- Go to Terminal 2 (where frontend is running)
-- Press `Ctrl+C`
-- **Result**: Vite dev server stops, port 3000 is freed
-
-**What this does:**
-- Gracefully shuts down the servers
-- Stops all processes
-- Frees up the ports
-- **Result**: Services are stopped, ready to start again anytime
-
----
-
-
-
-## How to Stop the Application
-
-### Docker Method
-
-**Option 1: Stop containers (recommended)**
-```bash
-docker-compose -f configuration/docker-compose.yml down
-```
-- Stops both containers
-- Removes containers (but keeps images)
-- Ports are freed
-- **Next time**: Just run `docker-compose up` (fast!)
-
-**Option 2: Stop with Ctrl+C**
-- Press `Ctrl+C` in the terminal where docker-compose is running
-- Stops containers but keeps them running
-- **Next time**: Run `docker-compose up` again
-
-**Option 3: Complete cleanup**
-```bash
-docker-compose -f configuration/docker-compose.yml down --rmi all
-```
-- Stops containers
-- Removes containers
-- Removes images (you'll need to rebuild next time)
-- **Use when**: You want to free up disk space
-
-### Local Development Method
-
-**Stop Backend:**
-1. Go to the terminal where backend is running
-2. Press `Ctrl+C`
-3. **Result**: Flask server stops, port 5000 is freed
-
-**Stop Frontend:**
-1. Go to the terminal where frontend is running
-2. Press `Ctrl+C`
-3. **Result**: Vite dev server stops, port 3000 is freed
-
-**What happens:**
-- Servers shut down gracefully
-- All processes stop
-- Ports become available for other applications
-- No data is lost (documents and database remain)
-
----
-
-## Troubleshooting
-
-### Docker Issues
-
-**"Port already in use"**:
-- Another application is using port 3000 or 5000
-- **Fix**: Stop other applications or change ports in `docker-compose.yml`
-
-**"Cannot connect to Docker daemon"**:
-- Docker Desktop is not running
-- **Fix**: Start Docker Desktop application
-
diff --git a/README.md b/README.md
index 8ade5ef..a3148e4 100644
--- a/README.md
+++ b/README.md
@@ -1,432 +1,1020 @@
-
 <p align="center">
-  <img src="Docs/assets/InnovationHub-HeaderImage.png" width="800" alt="Company Logo">
+  <img src="Docs/assets/InnovationHub-HeaderImage.png" width="800" alt="ClinIQ Clinical Q&A AI Assistant">
 </p>
 
-<h1 align="center">ClinIQ 🏥 : Clinical Q&A Driven by Your Documents</h1>
-
-<p align="center">
-  <b>Transform clinical documents into an intelligent question-answering system using AI-powered RAG (Retrieval-Augmented Generation)</b>
-</p>
+# ClinIQ — Clinical Q&A AI Assistant
 
+An AI-powered clinical document analysis platform using RAG (Retrieval-Augmented Generation), hybrid search, and intelligent reranking for evidence-based medical question answering. Upload clinical documents (PDF, DOCX, or TXT) and ask questions in natural language — powered by any OpenAI-compatible LLM endpoint, Groq, OpenRouter, or a locally running Ollama model.
 
-## Overview
+---
 
-ClinIQ is a modern web application that allows healthcare professionals to upload clinical documents and ask questions in plain English. Using advanced AI techniques including hybrid search, reranking, and OpenAI's GPT models, it provides accurate, evidence-based answers with source citations.
+## Table of Contents
+
+- [ClinIQ — Clinical Q&A AI Assistant](#cliniq--clinical-qa-ai-assistant)
+  - [Table of Contents](#table-of-contents)
+  - [Project Overview](#project-overview)
+  - [How It Works](#how-it-works)
+  - [Architecture](#architecture)
+    - [Architecture Diagram](#architecture-diagram)
+    - [Service Components](#service-components)
+    - [Typical Flow](#typical-flow)
+  - [Get Started](#get-started)
+    - [Prerequisites](#prerequisites)
+      - [Verify Installation](#verify-installation)
+    - [Quick Start (Docker Deployment)](#quick-start-docker-deployment)
+      - [1. Clone the Repository](#1-clone-the-repository)
+      - [2. Configure the Environment](#2-configure-the-environment)
+      - [3. Build and Start the Application](#3-build-and-start-the-application)
+      - [4. Access the Application](#4-access-the-application)
+      - [5. Verify Services](#5-verify-services)
+      - [6. Stop the Application](#6-stop-the-application)
+    - [Local Development Setup](#local-development-setup)
+  - [Project Structure](#project-structure)
+  - [Usage Guide](#usage-guide)
+    - [Using ClinIQ](#using-cliniq)
+    - [Advanced Features](#advanced-features)
+    - [Best Practices](#best-practices)
+  - [LLM Provider Configuration](#llm-provider-configuration)
+    - [OpenAI](#openai)
+    - [Groq](#groq)
+    - [Ollama](#ollama)
+    - [OpenRouter](#openrouter)
+    - [Custom OpenAI-Compatible API](#custom-openai-compatible-api)
+    - [Switching Providers](#switching-providers)
+  - [Environment Variables](#environment-variables)
+    - [Core LLM Configuration](#core-llm-configuration)
+    - [Model Configuration](#model-configuration)
+    - [Generation Parameters](#generation-parameters)
+    - [Security Configuration](#security-configuration)
+    - [Server Configuration](#server-configuration)
+  - [Technology Stack](#technology-stack)
+    - [Backend](#backend)
+    - [Frontend](#frontend)
+    - [Infrastructure](#infrastructure)
+    - [AI/ML Techniques](#aiml-techniques)
+  - [Troubleshooting](#troubleshooting)
+    - [Common Issues](#common-issues)
+    - [Debug Mode](#debug-mode)
+  - [License](#license)
+  - [Disclaimer](#disclaimer)
 
 ---
 
+## Project Overview
+
+**ClinIQ** is an intelligent clinical question-answering platform that transforms uploaded medical documents into a searchable knowledge base using advanced RAG techniques. Healthcare professionals can ask questions in natural language and receive accurate, evidence-based answers with source citations.
+
+Because the LLM provider is selected through configuration, ClinIQ is suitable for:
 
-![Demo](Docs/assets/demo.giff.gif)
+- **Enterprise deployments** — connect to a GenAI Gateway or any managed LLM API
+- **Air-gapped environments** — run fully offline with Ollama and a locally hosted model
+- **Local experimentation** — quick setup on a laptop with GPU-accelerated inference
+- **Multi-provider flexibility** — switch between OpenAI, Groq, OpenRouter, Ollama, or custom endpoints
 
 ---
 
-## ✨ Features
+## How It Works
 
-- 📄 **Multi-Format Document Support**: Upload PDF, DOCX, or TXT files
-- 🔍 **Advanced Search**: Hybrid search combining semantic (dense) and keyword (sparse) retrieval
-- 🎯 **Intelligent Reranking**: Cosine similarity-based reranking for improved accuracy
-- 💬 **Interactive Chat**: Natural language Q&A interface with conversation history
-- 📚 **Source Citations**: Every answer includes citations from source documents
-- 🤔 **AI Thinking Process**: Optional step-by-step reasoning display
-- 🎨 **Modern UI**: Clean, responsive React-based interface
-- 🐳 **Docker Support**: Containerized deployment for easy setup
+1. **Document Upload**: Users upload clinical documents (PDF, DOCX, or TXT) through the web interface. The system validates file formats and initiates background processing.
+
+2. **Intelligent Processing**: Text is extracted from each document, split into chunks along semantic boundaries (800 tokens with a 150-token overlap), and converted to vector embeddings using the configured embedding model.
+
+3. **Hybrid Search**: When users ask questions, ClinIQ employs a dual-search strategy combining dense vector search (semantic similarity) and sparse BM25 search (keyword matching), fused using Reciprocal Rank Fusion (RRF) for optimal retrieval.
+
+4. **Intelligent Reranking**: Retrieved chunks are reranked using cosine similarity with the query embedding to ensure the most relevant context is prioritized.
+
+5. **Answer Generation**: The top-ranked context is passed to the configured LLM with a carefully designed prompt that enforces evidence-based reasoning and requires source citations; the model's step-by-step thinking is displayed when enabled.
+
+The platform stores embeddings in ChromaDB for fast retrieval and supports real-time streaming responses for a responsive user experience. All answers include citations linking back to source documents, ensuring clinical traceability.
 
 ---
 
-## 🏗️ Architecture
+## Architecture
+
+This application uses a modern microservices architecture with a React frontend, Flask REST API backend, and ChromaDB vector database. The RAG pipeline implements hybrid search combining dense and sparse retrieval methods, followed by intelligent reranking for optimal context selection. The LLM layer is fully pluggable — any OpenAI-compatible remote endpoint, Groq, OpenRouter, or a locally running Ollama instance can be used via environment configuration.
+
+### Architecture Diagram
 
 ```mermaid
 graph TB
-    subgraph "Client"
-        User[👤 User]
-        Browser[🌐 Browser<br/>localhost:3000]
+    subgraph "Client Layer (port 3000)"
+        A[React Web UI]
+        A1[Document Upload]
+        A2[Query Interface]
+        A3[Real-time Streaming]
+    end
+
+    subgraph "Backend Layer (port 5000)"
+        B[Flask REST API]
+        C[RAG Pipeline]
+        H[Document Processor]
     end
-    subgraph "Frontend Container"
-        React[⚛️ React + Vite<br/>Port 3000]
+
+    subgraph "Search & Retrieval"
+        D[Dense Search<br/>Vector Similarity]
+        E[Sparse Search<br/>BM25 Keyword]
+        F[Hybrid Fusion<br/>RRF Algorithm]
+        G[Reranker<br/>Cosine Similarity]
+    end
+
+    subgraph "Processing Pipeline"
+        I[Text Extractor<br/>PDF/DOCX/TXT]
+        J[Semantic Chunker<br/>tiktoken]
+        K[Embedding Generator]
     end
-    subgraph "Backend Container"
-        Flask[🐍 Flask API<br/>Port 5000]
-        RAG[🔄 RAG Pipeline]
-        
-        subgraph "Search"
-            Vector[🔍 Vector Search]
-            BM25[📊 BM25 Search]
-            Hybrid[🔀 Hybrid Fusion]
-        end
+
+    subgraph "Storage Layer"
+        L[(ChromaDB<br/>Vector Database)]
+        M[(File Storage<br/>uploads/)]
     end
-    subgraph "Storage"
-        ChromaDB[(🗄️ ChromaDB<br/>Vector DB)]
-        Files[(📁 Uploads)]
+
+    subgraph "LLM Inference - Option A: Cloud APIs"
+        N1[OpenAI API]
+        N2[Groq API]
+        N3[OpenRouter API]
     end
-    subgraph "External"
-        OpenAI[🤖 OpenAI API<br/>GPT-3.5 + Embeddings]
+
+    subgraph "LLM Inference - Option B: Local"
+        O[Ollama<br/>localhost:11434]
     end
-    User -->|Upload & Query| Browser
-    Browser -->|HTTP/REST| React
-    React -->|Proxy /api/*| Flask
-    Flask --> RAG
-    RAG --> Vector
-    RAG --> BM25
-    Vector --> Hybrid
-    BM25 --> Hybrid
-    Hybrid -->|Context| OpenAI
-    Vector --> ChromaDB
-    BM25 --> ChromaDB
-    Flask --> ChromaDB
-    Flask --> Files
-    Flask -->|API Key| OpenAI
-    OpenAI -->|Answers| Flask
-    
-    classDef frontend fill:#61dafb,stroke:#333,stroke-width:2px,color:#000
-    classDef backend fill:#3776ab,stroke:#333,stroke-width:2px,color:#fff
-    classDef data fill:#ffa500,stroke:#333,stroke-width:2px,color:#000
-    classDef external fill:#10a37f,stroke:#333,stroke-width:2px,color:#fff
-    
-    class Browser,React frontend
-    class Flask,RAG,Vector,BM25,Hybrid backend
-    class ChromaDB,Files data
-    class OpenAI external
+
+    A1 --> B
+    A2 --> B
+    B --> C
+    B --> H
+    H --> I
+    I --> J
+    J --> K
+    K -->|Store Embeddings| L
+    B -->|Save File| M
+    C -->|Retrieve| D
+    C -->|Retrieve| E
+    D --> L
+    E --> L
+    D --> F
+    E --> F
+    F --> G
+    G -->|Top Chunks| C
+    C -->|LLM_PROVIDER=openai| N1
+    C -->|LLM_PROVIDER=groq| N2
+    C -->|LLM_PROVIDER=openrouter| N3
+    C -->|LLM_PROVIDER=ollama| O
+    K -->|Embedding Request| N1
+    N1 -->|Streaming Answer| C
+    N2 -->|Streaming Answer| C
+    N3 -->|Streaming Answer| C
+    O -->|Streaming Answer| C
+    C -->|SSE Stream| B
+    B -->|Real-time Updates| A3
+
+    style A fill:#61dafb
+    style B fill:#000000,color:#fff
+    style C fill:#ff6b6b
+    style D fill:#4ecdc4
+    style E fill:#4ecdc4
+    style F fill:#95e1d3
+    style G fill:#95e1d3
+    style H fill:#f38181
+    style I fill:#aa96da
+    style J fill:#aa96da
+    style K fill:#aa96da
+    style L fill:#feca57
+    style M fill:#feca57
+    style N1 fill:#10a37f
+    style N2 fill:#10a37f
+    style N3 fill:#10a37f
+    style O fill:#f3e5f5
 ```
 
+### Service Components
 
-### Key Components:
-- **Backend**: Flask REST API (Python) - Port 5000
-- **Frontend**: React + Vite (JavaScript) - Port 3000
-- **Vector Database**: ChromaDB (local persistent storage)
-- **AI Models**: OpenAI GPT-3.5-Turbo & text-embedding-3-small
+| Service       | Container     | Host Port | Description                                                                                     |
+| ------------- | ------------- | --------- | ----------------------------------------------------------------------------------------------- |
+| `backend`     | `backend`     | `5000`    | Flask REST API — document processing, RAG pipeline orchestration, streaming responses           |
+| `frontend`    | `frontend`    | `3000`    | React UI — document upload with drag-and-drop, real-time chat, streaming responses, citations  |
 
----
+**Core Components:**
+
+1. **React Web UI (Port 3000)** - Document upload with drag-and-drop, real-time query interface with streaming responses, chat history with syntax-highlighted citations, and thinking process visualization
 
-## 📋 Prerequisites
+2. **Flask REST API (Port 5000)** - Handles API routing and request validation, orchestrates the document processing pipeline, manages ChromaDB connections and operations, streams responses via Server-Sent Events (SSE), and implements background processing for uploads
 
-### For Local Development:
-- **Python 3.10+**
-- **Node.js 18+** and npm
-- **OpenAI API Key** ([Get one here](https://platform.openai.com/account/api-keys))
+3. **RAG Pipeline** - Query rewriting with conversation context, hybrid search with RRF fusion, cosine similarity reranking, answer generation with the configured LLM, thinking and answer section parsing, and source citation generation
 
-### For Docker:
-- **Docker Desktop** ([Download here](https://www.docker.com/products/docker-desktop/))
+4. **Search & Retrieval System**:
+   - **Dense Search**: Vector similarity using embeddings for semantic matching
+   - **Sparse Search**: BM25 algorithm for keyword-based retrieval
+   - **Hybrid Fusion**: Reciprocal Rank Fusion (RRF) combines both methods
+   - **Reranker**: Cosine similarity reranking for final context selection
+
+5. **Document Processing Pipeline**:
+   - **Text Extractor**: Supports PDF (PyPDF2), DOCX (python-docx), and TXT
+   - **Semantic Chunker**: Uses tiktoken for token-aware chunking (800 tokens, 150 overlap)
+   - **Embedding Generator**: Creates embeddings via the configured embedding model
+
+6. **ChromaDB** - Persistent vector database storing document embeddings, chunk metadata (source, page numbers, chunk IDs), and BM25 sparse indexes for hybrid search
+
+7. **File Storage** - Manages uploaded document files in `uploads/` directory
+
+8. **LLM Inference** - Pluggable inference layer supporting OpenAI, Groq, Ollama, OpenRouter, and custom OpenAI-compatible APIs
+
+### Typical Flow
+
+1. User uploads clinical document (PDF/DOCX/TXT) via web UI
+2. Backend saves file and initiates background processing
+3. Document processor extracts text and creates semantic chunks
+4. Embedding generator creates vector embeddings for each chunk
+5. Embeddings and metadata stored in ChromaDB with BM25 index
+6. User submits natural language query
+7. Query is embedded and sent to hybrid search system
+8. Dense search finds semantically similar chunks via vector similarity
+9. Sparse search finds keyword-matching chunks via BM25
+10. RRF algorithm fuses results from both methods
+11. Reranker applies cosine similarity to prioritize best chunks
+12. Top context is sent to configured LLM with system prompt
+13. AI generates answer with thinking process and citations
+14. Response streams back to user in real-time via SSE
+15. Citations link to specific source documents and pages
 
 ---
 
-## 🚀 Quick Start
+## Get Started
+
+### Prerequisites
+
+Before you begin, ensure you have the following installed and configured:
+
+- **Docker and Docker Compose** (v2)
+  - [Install Docker](https://docs.docker.com/get-docker/)
+  - [Install Docker Compose](https://docs.docker.com/compose/install/)
+- An LLM provider — one of:
+  - OpenAI: [Get API Key](https://platform.openai.com/account/api-keys)
+  - Groq: [Get API Key](https://console.groq.com/)
+  - OpenRouter: [Get API Key](https://openrouter.ai/keys)
+  - [Ollama](https://ollama.com/download) installed natively (no API key needed)
+  - Any custom OpenAI-compatible API endpoint
+
+#### Verify Installation
+
+```bash
+# Check Docker
+docker --version
+docker compose version
+
+# Verify Docker is running
+docker ps
+```
 
-### Option 1: Local Development (React + Flask)
+### Quick Start (Docker Deployment)
 
-#### Step 1: Clone the Repository
+#### 1. Clone the Repository
 
 ```bash
 git clone https://github.com/cld2labs/ClinIQ.git
 cd ClinIQ
 ```
 
-#### Step 2: Install Backend Dependencies
+#### 2. Configure the Environment
 
 ```bash
-cd backend
-pip install -r requirements.txt
-cd ..
+# Copy the example environment file
+cp backend/.env.example backend/.env
 ```
 
-#### Step 3: Install Frontend Dependencies
+Open `backend/.env` and configure your LLM provider. See [LLM Provider Configuration](#llm-provider-configuration) for detailed per-provider instructions.
 
+**Example for OpenAI:**
 ```bash
-cd frontend
-npm install
-cd ..
+LLM_PROVIDER=openai
+LLM_API_KEY=sk-your-api-key-here
+LLM_BASE_URL=https://api.openai.com/v1
+LLM_CHAT_MODEL=gpt-3.5-turbo
+LLM_EMBEDDING_MODEL=text-embedding-3-small
+```
+
+**Example for Ollama:**
+```bash
+LLM_PROVIDER=ollama
+LLM_BASE_URL=http://localhost:11434/v1
+LLM_CHAT_MODEL=qwen2.5:7b
+LLM_EMBEDDING_MODEL=nomic-embed-text
+# LLM_API_KEY not needed for Ollama
+```
+
+#### 3. Build and Start the Application
+
+```bash
+# Standard (attached)
+docker compose up --build
+
+# Detached (background)
+docker compose up -d --build
 ```
 
-#### Step 4: Start the Backend
+#### 4. Access the Application
+
+Once containers are running:
+
+- **Frontend UI**: http://localhost:3000
+- **Backend API**: http://localhost:5000
+- **Health Check**: http://localhost:5000/api/health
 
-Open a terminal and run:
+#### 5. Verify Services
+
+```bash
+# Health check
+curl http://localhost:5000/api/health
+
+# View running containers
+docker compose ps
+```
+
+**View logs:**
+
+```bash
+# All services
+docker compose logs -f
+
+# Backend only
+docker compose logs -f backend
+
+# Frontend only
+docker compose logs -f frontend
+```
+
+#### 6. Stop the Application
+
+```bash
+docker compose down
+```
+
+---
+
+### Local Development Setup
+
+**For developers who want to run services locally without Docker**
+
+**Backend (Python / Flask)**
 
 ```bash
 cd backend
+
+# Create virtual environment
+python -m venv venv
+source venv/bin/activate        # On Windows: venv\Scripts\activate
+
+# Install dependencies
+pip install -r requirements.txt
+
+# Configure environment
+cp .env.example .env
+# Edit .env with your LLM provider settings
+
+# Start backend
 python api.py
 ```
 
-The backend will start on `http://localhost:5000`
+Backend will run on `http://localhost:5000`
 
-#### Step 5: Start the Frontend
-
-Open another terminal and run:
+**Frontend (Node / Vite)**
 
 ```bash
 cd frontend
+
+# Install dependencies
+npm install
+
+# Start frontend
 npm run dev
 ```
 
-The frontend will start on `http://localhost:3000`
+Frontend will run on `http://localhost:3000`
+
+**Note**: The frontend Vite proxy automatically forwards `/api/*` requests to `http://localhost:5000`, so no additional configuration is needed for local development.
+
+---
+
+## Project Structure
+
+```
+ClinIQ/
+├── backend/                          # Backend Flask Application
+│   ├── api.py                       # Main Flask REST API server
+│   │                                #   - 7 API endpoints
+│   │                                #   - Background document processing
+│   │                                #   - SSE streaming support
+│   │                                #   - Health checks and status
+│   │
+│   ├── config.py                    # Multi-provider LLM configuration
+│   │                                #   - LLM_PROVIDER selection
+│   │                                #   - API key management
+│   │                                #   - Base URL configuration
+│   │                                #   - Model selection
+│   │                                #   - Generation parameters
+│   │
+│   ├── utils/                       # Core backend utilities
+│   │   ├── __init__.py
+│   │   │
+│   │   ├── constants.py             # Model configuration constants
+│   │   │
+│   │   ├── document_processor.py   # Document processing
+│   │   │                            #   - PDF extraction (PyPDF2)
+│   │   │                            #   - DOCX extraction (python-docx)
+│   │   │                            #   - Semantic chunking (tiktoken)
+│   │   │                            #   - Embedding creation
+│   │   │
+│   │   ├── rag_pipeline.py         # RAG pipeline implementation
+│   │   │                            #   - Query rewriting
+│   │   │                            #   - Context retrieval & citations
+│   │   │                            #   - Answer generation (streaming)
+│   │   │                            #   - Thinking/answer parsing
+│   │   │
+│   │   └── vector_store.py         # Search & storage
+│   │                                #   - ChromaDB operations
+│   │                                #   - Dense search (semantic)
+│   │                                #   - Sparse search (BM25)
+│   │                                #   - Hybrid search (RRF fusion)
+│   │                                #   - Reranking (cosine similarity)
+│   │
+│   ├── .env.example                # Environment variable template
+│   │                                #   - Multi-provider configuration
+│   │                                #   - All supported variables
+│   │
+│   ├── requirements.txt            # Python dependencies
+│   └── Dockerfile                  # Backend container configuration
+│
+├── frontend/                       # React + Vite Frontend Application
+│   ├── src/
+│   │   ├── components/
+│   │   │   ├── DocumentUpload.jsx # File upload with drag-and-drop
+│   │   │   │                      #   - Multi-file support
+│   │   │   │                      #   - Progress tracking
+│   │   │   │                      #   - File validation
+│   │   │   │
+│   │   │   ├── ChatInterface.jsx  # Chat UI
+│   │   │   │                      #   - Message display
+│   │   │   │                      #   - Real-time streaming
+│   │   │   │                      #   - Thinking process display
+│   │   │   │                      #   - Citation rendering
+│   │   │   │
+│   │   │   └── layout/
+│   │   │       ├── Header.jsx     # App header with logo
+│   │   │       ├── Footer.jsx     # Footer with tech info
+│   │   │       └── Layout.jsx     # Main layout wrapper
+│   │   │
+│   │   ├── pages/
+│   │   │   ├── Home.jsx           # Landing page
+│   │   │   └── Chat.jsx           # Main chat page
+│   │   │                          #   - State management
+│   │   │                          #   - Document status polling
+│   │   │                          #   - Upload handling
+│   │   │
+│   │   └── services/
+│   │       └── api.js             # API service layer
+│   │                              #   - uploadDocument()
+│   │                              #   - queryDocuments() with SSE
+│   │                              #   - getStatus()
+│   │                              #   - clearDocuments()
+│   │
+│   ├── package.json               # npm dependencies
+│   ├── vite.config.js            # Vite configuration (proxy)
+│   ├── tailwind.config.js        # TailwindCSS configuration
+│   └── Dockerfile                # Frontend container configuration
+│
+├── docker-compose.yml            # Service orchestration
+│                                 #   - Frontend service (port 3000)
+│                                 #   - Backend service (port 5000)
+│                                 #   - Volume mounts (data, uploads)
+│
+├── .chromadb/                    # ChromaDB persistent storage (gitignored)
+│   └── [vector database files]   #   - Document embeddings
+│                                 #   - Metadata & indexes
+│
+├── uploads/                      # Uploaded document files (gitignored)
+│   └── [user-uploaded files]    #   - PDF, DOCX, TXT files
+│
+├── Docs/                         # Project documentation
+│   ├── DOCKER_SETUP.md
+│   ├── PROJECT_DOCUMENTATION.md
+│   ├── QUICKSTART.md
+│   └── assets/
+│
+├── README.md                     # Project documentation (this file)
+├── Contributing.md               # Contribution guidelines
+├── TROUBLESHOOTING.md            # Troubleshooting guide
+├── Security.md                   # Security policy
+├── LICENSE.md                    # MIT License
+└── DISCLAIMER.md                 # Usage disclaimer
+```
 
-#### Step 6: Access the Application
+---
 
-1. Open your browser and navigate to `http://localhost:3000`
-2. Enter your OpenAI API key in the configuration panel
-3. Upload a clinical document (PDF, DOCX, or TXT)
-4. Start asking questions!
+## Usage Guide
+
+### Using ClinIQ
+
+1. **Open the Application**
+   - Navigate to `http://localhost:3000`
+
+2. **Upload Clinical Documents**
+   - Click "Upload Document" or drag-and-drop files
+   - Supported formats: PDF, DOCX, TXT
+   - Multiple files can be uploaded
+   - Wait for processing to complete (status shows "processed")
+
+3. **Ask Questions**
+   - Type your clinical question in the chat input
+   - Examples:
+     - "What are the contraindications for this medication?"
+     - "What are the recommended dosage guidelines?"
+     - "What side effects should I monitor?"
+     - "What are the drug interactions?"
+     - "What is the mechanism of action?"
+
+4. **Review Answers**
+   - Read the AI-generated answer with context
+   - Review the thinking process (if enabled)
+   - Check source citations linking to specific documents
+   - Citations include document name and chunk information
+
+5. **Manage Documents**
+   - View current document count in status area
+   - Clear all documents using the "Clear Documents" button
+   - Re-upload documents as needed for new analysis
+
+### Advanced Features
+
+**Hybrid Search**
+- Combines semantic search (meaning-based) with keyword search (BM25)
+- Uses Reciprocal Rank Fusion to merge results
+- Provides more comprehensive retrieval than either method alone
+- Best for complex queries with specific terms
+- Configurable via UI toggle or environment variable
+
+**Reranking**
+- Applies cosine similarity to reorder retrieved chunks
+- Prioritizes chunks most relevant to the query
+- Improves answer quality by focusing on best context
+- Slight performance overhead but better accuracy
+- Configurable via UI toggle or environment variable
+
+**Thinking Mode**
+- Shows AI's reasoning process before the answer
+- Useful for understanding how the AI reached conclusions
+- Helps verify evidence-based reasoning
+- Can be toggled on/off in configuration
+
+**Conversation History**
+- Previous queries and answers are maintained in session
+- Context from prior conversation used for query rewriting
+- Enables follow-up questions and clarifications
+- Cleared when page refreshes or documents are cleared
+
+### Best Practices
+
+1. **Document Quality**
+   - Upload well-formatted documents with clear text
+   - Avoid scanned images without OCR
+   - Use PDF or DOCX for best extraction results
+
+2. **Query Formulation**
+   - Be specific and detailed in your questions
+   - Include relevant clinical terms
+   - Reference specific conditions or medications when applicable
+
+3. **Answer Verification**
+   - Always check source citations
+   - Verify answers against original documents
+   - Consult healthcare professionals for critical decisions
+
+4. **Performance**
+   - Process documents before starting queries
+   - Enable hybrid search for comprehensive results
+   - Use reranking for higher quality (with slight latency trade-off)
 
 ---
 
-### Option 2: Docker Deployment
+## LLM Provider Configuration
+
+ClinIQ supports five LLM providers via environment configuration in `backend/.env`. All providers are configured via the same set of variables — switching requires only updating the `.env` file.
 
-#### Step 1: Clone the Repository
+### OpenAI
+
+**Best for**: High-quality embeddings and chat responses
 
 ```bash
-git clone https://github.com/cld2labs/ClinIQ.git
-cd ClinIQ
+LLM_PROVIDER=openai
+LLM_API_KEY=sk-your-api-key-here
+LLM_BASE_URL=https://api.openai.com/v1
+LLM_CHAT_MODEL=gpt-3.5-turbo
+LLM_EMBEDDING_MODEL=text-embedding-3-small
 ```
 
-#### Step 2: Build and Run with Docker Compose
+- **Get API Key**: https://platform.openai.com/account/api-keys
+- **Recommended Models**:
+  - Chat: `gpt-3.5-turbo`, `gpt-4`, `gpt-4-turbo`, `gpt-4o`
+  - Embeddings: `text-embedding-3-small`, `text-embedding-3-large`
+- **Pricing**: Pay-per-use (check [OpenAI Pricing](https://openai.com/pricing))
 
-From the root directory:
+### Groq
+
+**Best for**: Fast inference with competitive pricing
 
 ```bash
-docker-compose -f configuration/docker-compose.yml up --build
+LLM_PROVIDER=groq
+LLM_API_KEY=gsk_your-groq-api-key
+LLM_BASE_URL=https://api.groq.com/openai/v1
+LLM_CHAT_MODEL=llama-3.2-90b-text-preview
+LLM_EMBEDDING_MODEL=text-embedding-3-small  # Falls back to OpenAI
 ```
 
-Or navigate to the configuration folder:
+- **Get API Key**: https://console.groq.com/
+- **Recommended Models**:
+  - `llama-3.2-90b-text-preview`
+  - `llama-3.1-70b-versatile`
+  - `mixtral-8x7b-32768`
+- **Note**: Groq doesn't provide embeddings; falls back to OpenAI for embeddings
+
+### Ollama
+
+**Best for**: Private, local deployment with no API costs
 
 ```bash
-cd configuration
-docker-compose up --build
+LLM_PROVIDER=ollama
+LLM_BASE_URL=http://localhost:11434/v1
+LLM_CHAT_MODEL=qwen2.5:7b
+LLM_EMBEDDING_MODEL=nomic-embed-text
+# LLM_API_KEY not required for Ollama
 ```
 
-This single command will:
-- Build the backend container (Flask API)
-- Build the frontend container (React app)
-- Start both services automatically
-- Frontend: `http://localhost:3000`
-- Backend: `http://localhost:5000`
+**Setup:**
+
+1. Install Ollama: https://ollama.com/download
+2. Pull models:
+   ```bash
+   # Chat models
+   ollama pull qwen2.5:7b
+   ollama pull llama3.1:8b
+   ollama pull llama3.2:3b
+   ollama pull mistral:7b
+
+   # Embedding model
+   ollama pull nomic-embed-text
+   ```
+3. Verify Ollama is running:
+   ```bash
+   curl http://localhost:11434/api/tags
+   ```
+
+**Recommended Models**:
+- Chat: `qwen2.5:7b`, `llama3.1:8b`, `llama3.2:3b`, `mistral:7b`
+- Embeddings: `nomic-embed-text`
 
-#### Step 3: Access the Application
+**Note**: Run Ollama natively on the host (not in Docker) for best GPU acceleration
 
-1. Open your browser and navigate to `http://localhost:3000`
-2. Enter your OpenAI API key in the configuration panel
-3. Upload a clinical document
-4. Start asking questions!
+### OpenRouter
 
-#### Docker Commands
+**Best for**: Access to multiple models through single API
 
 ```bash
-# Run in background (detached mode)
-docker-compose up -d --build
+LLM_PROVIDER=openrouter
+LLM_API_KEY=sk-or-v1-your-openrouter-key
+LLM_BASE_URL=https://openrouter.ai/api/v1
+LLM_CHAT_MODEL=anthropic/claude-3.5-sonnet
+LLM_EMBEDDING_MODEL=text-embedding-3-small  # Falls back to OpenAI
+```
 
-# View logs
-docker-compose logs -f
+- **Get API Key**: https://openrouter.ai/keys
+- **Recommended Models**:
+  - `anthropic/claude-3.5-sonnet`
+  - `google/gemini-pro-1.5`
+  - `meta-llama/llama-3.1-70b-instruct`
+- **Note**: OpenRouter doesn't provide embeddings; falls back to OpenAI for embeddings
 
-# Stop services
-docker-compose down
+### Custom OpenAI-Compatible API
+
+**Best for**: Enterprise deployments with custom endpoints
 
-# Rebuild after changes
-docker-compose up --build --force-recreate
+```bash
+LLM_PROVIDER=custom
+LLM_API_KEY=your-custom-api-key
+LLM_BASE_URL=https://your-custom-endpoint.com/v1
+LLM_CHAT_MODEL=your-model-name
+LLM_EMBEDDING_MODEL=your-embedding-model-name
 ```
 
-## 🎯 How to Use
+Any enterprise gateway that exposes an OpenAI-compatible `/v1/chat/completions` and `/v1/embeddings` endpoint works without code changes.
 
-### 1. First Time Setup
+### Switching Providers
 
-1. **Open the application** at `http://localhost:3000`
-2. **Enter your OpenAI API key** in the configuration panel (left sidebar)
-3. **Upload a clinical document** using the file uploader
+1. Edit `backend/.env` with the new provider's values
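+2. Recreate the backend container so it picks up the new values (a plain `docker compose restart` reuses the existing container, so environment changes may not be applied):
+   ```bash
+   docker compose up -d --force-recreate backend
+   ```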
 
-### 2. Asking Questions
+No rebuild is needed — all settings are injected at runtime via environment variables.
 
-1. Type your question in the chat input
-2. Examples:
-   - "What are the contraindications for this medication?"
-   - "How should I monitor this patient's vital signs?"
-   - "What follow-up care is recommended?"
-3. Review the answer with source citations
+---
 
+## Environment Variables
 
-## 🧠 How It Works
+All variables are defined in `backend/.env` (copied from `backend/.env.example`). The backend reads them at startup via the `config.py` module.
 
-### 1. Document Processing
-- Extracts text from PDF/DOCX/TXT files
-- Chunks text into manageable pieces (800 tokens with 150 token overlap)
-- Creates embeddings using OpenAI's `text-embedding-3-small`
+### Core LLM Configuration
 
-### 2. Storage
-- Stores document chunks in ChromaDB (local vector database)
-- Maintains metadata (source file, chunk ID, page numbers)
+| Variable             | Description                                                      | Default                     | Type   |
+| -------------------- | ---------------------------------------------------------------- | --------------------------- | ------ |
+| `LLM_PROVIDER`       | Provider selection: `openai`, `groq`, `ollama`, `openrouter`, `custom` | `openai`                    | string |
+| `LLM_API_KEY`        | API key for the selected provider (not needed for Ollama)        | -                           | string |
+| `LLM_BASE_URL`       | Base URL of the LLM API endpoint                                 | `https://api.openai.com/v1` | string |
 
-### 3. Query Processing
-- **Hybrid Search**: Combines dense (semantic) and sparse (BM25 keyword) search using Reciprocal Rank Fusion (RRF)
-- **Reranking**: Re-ranks results using cosine similarity with query embedding
-- Retrieves top relevant chunks
+### Model Configuration
 
-### 4. Answer Generation
-- Feeds relevant chunks to GPT-3.5-Turbo
-- Generates answer based ONLY on document content
-- Includes source citations
+| Variable              | Description                        | Default                       | Type   |
+| --------------------- | ---------------------------------- | ----------------------------- | ------ |
+| `LLM_CHAT_MODEL`      | Model for chat completions         | `gpt-3.5-turbo`               | string |
+| `LLM_EMBEDDING_MODEL` | Model for creating embeddings      | `text-embedding-3-small`      | string |
 
----
+### Generation Parameters
 
-## 📁 Project Structure
+| Variable          | Description                                                       | Default | Type    |
+| ----------------- | ----------------------------------------------------------------- | ------- | ------- |
+| `TEMPERATURE`     | Sampling temperature. Lower = more deterministic output (0.0–1.0) | `0.7`   | float   |
+| `MAX_TOKENS`      | Maximum tokens in the generated answer                            | `1000`  | integer |
+| `MAX_RETRIES`     | Maximum retry attempts on API failures                            | `3`     | integer |
+| `REQUEST_TIMEOUT` | API request timeout in seconds                                    | `300`   | integer |
 
+### Security Configuration
+
+| Variable      | Description                                                     | Default | Type    |
+| ------------- | --------------------------------------------------------------- | ------- | ------- |
+| `VERIFY_SSL`  | SSL certificate verification. Set `false` only for development  | `true`  | boolean |
+
+### Server Configuration
+
+| Variable    | Description                   | Default       | Type   |
+| ----------- | ----------------------------- | ------------- | ------ |
+| `FLASK_ENV` | Flask environment mode        | `development` | string |
+
+**Example .env file:**
+
+```bash
+# backend/.env
+
+# ============================================================================
+# LLM Provider Configuration
+# ============================================================================
+
+# Provider Selection
+# Options: openai, groq, ollama, openrouter, custom
+LLM_PROVIDER=openai
+
+# API Key (not required for Ollama)
+LLM_API_KEY=sk-your-api-key-here
+
+# Base URL for LLM API
+LLM_BASE_URL=https://api.openai.com/v1
+
+# ============================================================================
+# Model Configuration
+# ============================================================================
+
+# Chat Model (for generating answers)
+LLM_CHAT_MODEL=gpt-3.5-turbo
+
+# Embedding Model (for creating vector representations)
+LLM_EMBEDDING_MODEL=text-embedding-3-small
+
+# ============================================================================
+# Generation Parameters
+# ============================================================================
+
+# Temperature: Controls randomness in responses (0.0 - 1.0)
+TEMPERATURE=0.7
+
+# Maximum Tokens: Maximum length of generated responses
+MAX_TOKENS=1000
+
+# Maximum Retry Attempts: Number of retries on API failures
+MAX_RETRIES=3
+
+# Request Timeout: API request timeout in seconds
+REQUEST_TIMEOUT=300
+
+# ============================================================================
+# Security Configuration
+# ============================================================================
+
+# SSL Verification (use 'true' in production)
+VERIFY_SSL=true
+
+# ============================================================================
+# Flask Configuration
+# ============================================================================
+
+# Flask Environment: development or production
+FLASK_ENV=development
 ```
-ClinIQ/
-├── README.md                         # Main documentation (this file)
-├             
-├── LICENSE  
-├── .gitignore                       # Git ignore rules (excludes .chromadb/, uploads/)
-│
-├── backend/                          # Backend Python Flask Application
-│   ├── api.py                       # 🔥 Main Flask REST API server (614 lines)
-│   │                                #    - 7 API endpoints
-│   │                                #    - Background document processing
-│   │                                #    - SSE streaming support
-│   │                                #    - Security teardown handlers
-│   │
-│   ├── requirements.txt             # Python dependencies
-│   │                                #    Flask, OpenAI, ChromaDB, PyPDF2, etc.
-│   ├── Dockerfile                   # Backend Docker configuration
-│   │                                #    - Python 3.11 slim base
-│   │                                #    - UTF-8 environment setup
-│   │                                #    - Volume mounts for persistence
-│   │
-│   └── utils/                       # Core backend utilities
-│       ├── __init__.py
-│       │
-│       ├── constants.py             # Model configuration (26 lines)
-│       │                            #    - DEFAULT_CHAT_MODEL = "gpt-3.5-turbo"
-│       │                            #    - DEFAULT_EMBEDDING_MODEL = "text-embedding-3-small"
-│       │
-│       ├── document_processor.py    # Document processing (258 lines)
-│       │                            #    - PDF extraction (PyPDF2)
-│       │                            #    - DOCX extraction (python-docx)
-│       │                            #    - Semantic chunking (tiktoken)
-│       │                            #    - Embedding creation (OpenAI API)
-│       │
-│       ├── rag_pipeline.py          # 🔥 RAG pipeline implementation (686 lines)
-│       │                            #    - Query rewriting (conversation context)
-│       │                            #    - Context retrieval & citations
-│       │                            #    - Answer generation (streaming + non-streaming)
-│       │                            #    - Thinking/answer parsing
-│       │                            #    - Smart citation clearing
-│       │                            #    - Robust buffering logic
-│       │
-│       └── vector_store.py          # 🔥 Search & storage (743 lines)
-│                                    #    - ChromaDB operations (init, add, clear)
-│                                    #    - Dense search (semantic similarity)
-│                                    #    - Sparse search (BM25 keyword)
-│                                    #    - Hybrid search (RRF fusion)
-│                                    #    - Reranking (cosine similarity)
-│
-├── frontend/                        # React + Vite Frontend Application
-│   ├── package.json                 # Dependencies: React, TailwindCSS, Lucide icons
-│   ├── vite.config.js              # Vite configuration (proxy to backend)
-│   ├── tailwind.config.js          # TailwindCSS configuration
-│   ├── Dockerfile                  # Frontend Docker configuration (Node 20 Alpine)
-│   │
-│   ├── public/                     # Static assets
-│   │   └── cloud2labs-logo.png
-│   │
-│   └── src/
-│       ├── main.jsx                # React entry point
-│       ├── App.jsx                 # Main app component (routing)
-│       ├── index.css               # Global TailwindCSS styles
-│       │
-│       ├── components/             # React components
-│       │   ├── DocumentUpload.jsx  # File upload with drag-and-drop
-│       │   │                       #   - Multi-file support
-│       │   │                       #   - File validation
-│       │   │                       #   - Upload progress tracking
-│       │   │
-│       │   ├── ChatInterface.jsx   # 🔥 Chat UI (311 lines)
-│       │   │                       #   - Message display (user + assistant)
-│       │   │                       #   - Real-time streaming accumulation
-│       │   │                       #   - Thinking process display
-│       │   │                       #   - Citation rendering with links
-│       │   │                       #   - Error handling
-│       │   │
-│       │   ├── ConfigSidebar.jsx   # Configuration panel
-│       │   │                       #   - API key input (memory-only)
-│       │   │                       #   - Hybrid search toggle
-│       │   │                       #   - Reranker toggle
-│       │   │                       #   - Thinking mode toggle
-│       │   │                       #   - Model info display
-│       │   │
-│       │   └── layout/
-│       │       ├── Header.jsx      # App header with logo
-│       │       ├── Footer.jsx      # Footer with tech info
-│       │       └── Layout.jsx      # Main layout wrapper
-│       │
-│       ├── pages/                  # Page components
-│       │   ├── Home.jsx            # Landing page
-│       │   └── Chat.jsx            # 🔥 Main chat page (164 lines)
-│       │                           #   - State management (API key, documents, config)
-│       │                           #   - Document status polling
-│       │                           #   - Upload success handling
-│       │                           #   - RAG technology badges
-│       │
-│       └── services/
-│           └── api.js              # API service layer
-│                                   #   - uploadDocument()
-│                                   #   - queryDocuments() with SSE streaming
-│                                   #   - getStatus()
-│                                   #   - clearDocuments()
-│
-├── configuration/                  # Docker & deployment configuration
-│   └── docker-compose.yml          # Docker Compose orchestration
-│                                   #   - Frontend service (port 3000)
-│                                   #   - Backend service (port 5000)
-│                                   #   - Volume mounts (code, data)
-│                                   #   - Network configuration
-│
-├── Docs/                           # Project documentation
-│            
-│   ├── DOCKER_SETUP.md             # Docker deployment guide
-│   ├── PROJECT_DOCUMENTATION.md    # Comprehensive project docs
-│   ├── QUICKSTART.md               # Quick start guide
-│   └── assets/                     # Documentation assets
-│       └── demo.gif                # Application demo
-│
-├── .chromadb/                      # 🔒 ChromaDB persistent storage (gitignored)
-│   └── [vector database files]    #     - Document embeddings
-│                                   #     - Metadata & indexes
-│
-└── uploads/                        # 🔒 Uploaded document files (gitignored)
-    └── [user-uploaded files]       #     - PDF, DOCX, TXT files
-```
----
 
+For complete examples of all provider configurations, see `backend/.env.example`.
+
+---
 
-## 📝 Environment Variables
+## Technology Stack
 
 ### Backend
 
-- `FLASK_ENV`: `development` or `production`
-- `OPENAI_API_KEY`: Your OpenAI API key (optional, can be set in UI)
+- **Framework**: Flask (Python web framework with WSGI)
+- **LLM Integration**:
+  - OpenAI Python SDK (multi-provider compatible)
+  - Configurable via environment variables
+  - Supports OpenAI, Groq, Ollama, OpenRouter, Custom APIs
+- **Vector Database**: ChromaDB (persistent local storage)
+- **Document Processing**:
+  - PyPDF2 (PDF text extraction)
+  - python-docx (DOCX text extraction)
+  - tiktoken (token counting and chunking)
+- **Search Algorithms**:
+  - Dense vector search (cosine similarity)
+  - BM25 sparse search (keyword matching)
+  - Reciprocal Rank Fusion (RRF)
+  - Cosine similarity reranking
+- **API Features**:
+  - Flask-CORS (cross-origin resource sharing)
+  - Server-Sent Events (SSE) for streaming
+  - Background task processing
+- **Utilities**:
+  - NumPy (numerical operations)
+  - python-dotenv (environment variable management)
 
 ### Frontend
 
-- `VITE_BACKEND_ENDPOINT`: Backend API URL (default: `http://localhost:5000`)
-
+- **Framework**: React 18 with JavaScript
+- **Build Tool**: Vite (fast bundler and dev server)
+- **Styling**: Tailwind CSS + PostCSS
+- **UI Components**:
+  - Custom design system
+  - Lucide React icons
+  - Drag-and-drop file upload
+- **State Management**: React hooks (useState, useEffect, useRef)
+- **API Communication**:
+  - Fetch API for REST calls
+  - EventSource for Server-Sent Events (SSE)
+  - Proxy configuration via Vite
+- **Markdown & Code**:
+  - Syntax highlighting for citations
+  - Real-time streaming text display
+
+### Infrastructure
+
+- **Containerization**: Docker + Docker Compose
+- **Volumes**:
+  - ChromaDB persistence (`.chromadb/`)
+  - File uploads storage (`uploads/`)
+- **Networking**: Docker bridge network
+- **Health Checks**: Backend health monitoring
+
+### AI/ML Techniques
+
+- **RAG (Retrieval-Augmented Generation)**:
+  - Document chunking with semantic boundaries
+  - Vector embeddings for semantic search
+  - Context-aware answer generation
+- **Hybrid Search**:
+  - Dense retrieval (embeddings + cosine similarity)
+  - Sparse retrieval (BM25 keyword matching)
+  - Reciprocal Rank Fusion (RRF) algorithm
+- **Reranking**: Cosine similarity for context prioritization
+- **Prompt Engineering**:
+  - Evidence-based reasoning prompts
+  - Citation formatting instructions
+  - Thinking process elicitation
 
 ---
 
+## Troubleshooting
 
-## License
+For comprehensive troubleshooting guidance, common issues, and solutions, refer to:
+
+[Troubleshooting Guide - TROUBLESHOOTING.md](./TROUBLESHOOTING.md)
 
-This project is licensed under [LICENSE](./License.md) file for details.
+### Common Issues
 
+**Issue: "No documents found" error**
+
+```bash
+# Upload documents first and wait for processing to complete
+# Check backend logs
+docker compose logs backend --tail 50
+```
+
+- Ensure documents were uploaded successfully
+- Wait for background processing to complete
+- Verify ChromaDB is accessible
+
+**Issue: LLM API errors**
+
+```bash
+# Test API key and connectivity
+curl -X POST http://localhost:5000/api/status
+
+# Check backend logs for error details
+docker compose logs backend --tail 50
+```
+
+- Verify API key is correct in `backend/.env`
+- Ensure API key has sufficient credits/quota
+- Check network connectivity to LLM provider
+- Verify `LLM_BASE_URL` is correct for your provider
+
+**Issue: Ollama connection refused**
+
+```bash
+# Confirm Ollama is running on the host
+curl http://localhost:11434/api/tags
+
+# If not running, start it
+ollama serve
+```
+
+- Ensure Ollama is running natively on the host (not in Docker)
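+- Verify `LLM_BASE_URL=http://localhost:11434/v1` in `backend/.env` (if the backend runs inside Docker, `localhost` resolves to the container itself — point it at the host instead, e.g. `http://host.docker.internal:11434/v1`)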
+- Check that required models are pulled (`ollama list`)
+
+**Issue: Empty or poor quality answers**
+
+- Enable hybrid search for better retrieval
+- Enable reranking for improved context selection
+- Verify documents uploaded contain relevant information
+- Try adjusting `TEMPERATURE` in `backend/.env`
+- Check that embeddings were created successfully
+
+**Issue: Slow responses**
+
+- Disable reranking if speed is critical
+- Use faster LLM models (e.g., `gpt-3.5-turbo` vs `gpt-4`)
+- For Ollama, ensure GPU acceleration is enabled
+- Reduce number of retrieved chunks (modify code)
+
+### Debug Mode
+
+Inspect the backend container logs for deeper diagnosis:
+
+```bash
+# View real-time container logs
+docker compose logs -f backend
+
+# Check specific errors
+docker compose logs backend | grep ERROR
+
+# View all backend activity
+docker compose logs backend --tail 200
+```
+
+**Clear data and restart:**
+
+```bash
+# Stop services
+docker compose down
+
+# Clear all data
+rm -rf .chromadb uploads
+mkdir .chromadb uploads
+
+# Restart fresh
+docker compose up --build
+```
 
 ---
 
-## Disclaimer
+## License
 
-**ClinIQ** is provided as-is for research, analysis, and informational purposes only. While we strive for accuracy:
+This project is licensed under the terms specified in the [LICENSE.md](./LICENSE.md) file.
 
-- Always verify AI-generated outputs against original clinical sources
+---
+
+## Disclaimer
 
-- Do not rely solely on model responses for diagnosis, treatment, or clinical decision-making
+**ClinIQ** is provided as-is for research, educational, and informational purposes only. This tool is NOT intended for clinical diagnosis, treatment decisions, or patient care.
 
-- Consult qualified healthcare professionals for medical guidance
+**Important Warnings:**
 
-- Validate rigorously and conduct thorough testing before any production or real-world use
+- **Not Medical Advice**: Answers generated by ClinIQ do not constitute medical advice, diagnosis, or treatment recommendations
+- **Always Verify**: Healthcare professionals must verify all AI-generated information against authoritative clinical sources
+- **Human Review Required**: All outputs must be reviewed by qualified medical professionals before any clinical application
+- **No Liability**: The developers assume no liability for any decisions made based on ClinIQ outputs
+- **Data Privacy**: Ensure compliance with HIPAA and other healthcare data regulations when uploading documents
+- **Experimental Technology**: RAG and LLM technologies may produce inaccurate, incomplete, or hallucinated information
+- **Not FDA Approved**: This software has not been evaluated or approved by the FDA or any regulatory agency
 
-For full disclaimer details, see [DISCLAIMER](./Disclaimer.md)
+**Best Practices:**
 
+- Only upload de-identified or appropriately authorized clinical documents
+- Consult qualified healthcare professionals for all medical decisions
+- Validate all information against peer-reviewed medical literature
+- Conduct thorough testing in non-production environments before any real-world use
+- Implement additional safety checks and human oversight for any clinical applications
+- Maintain audit trails and version control for clinical decision support systems
 
+For full disclaimer details, see [DISCLAIMER.md](./DISCLAIMER.md)
diff --git a/Security.md b/Security.md
index 8fdac93..9ac0b56 100644
--- a/Security.md
+++ b/Security.md
@@ -1,11 +1,34 @@
-# Security Policy 
+# Security Policy
 
-The **ClinIQ** does not include production-grade security controls.
+The **ClinIQ - Clinical Q&A AI Assistant** blueprint does not include production-grade security controls.
 
 This repository is not secure by default and must not be used in production without a comprehensive security review.
 
+## Known Considerations
+
+- **API keys**: `OPENAI_API_KEY` and `LLM_API_KEY` are loaded from `.env` or environment variables. Never commit `.env` to version control.
+- **CORS**: CORS is configured to allow all origins by default (`*`). Restrict to specific origins in any non-local deployment.
+- **Clinical data privacy**: Clinical documents and queries are sent to the configured LLM provider's API (OpenAI by default). Do not use cloud APIs with protected health information (PHI) or personally identifiable information (PII) without proper safeguards.
+- **HIPAA compliance**: This application is NOT HIPAA-compliant by default. Implementing HIPAA compliance requires:
+  - Business Associate Agreements (BAA) with service providers
+  - Encryption at rest and in transit
+  - Access controls and audit logging
+  - Data retention and disposal policies
+- **Uploaded documents**: Files are stored locally in the `uploads/` directory. Implement proper access controls and encryption for sensitive data.
+- **Vector database**: ChromaDB stores document embeddings locally. Ensure proper permissions and consider encryption for sensitive deployments.
+
+## User Responsibilities
+
 Users are responsible for implementing appropriate:
+
 - Authentication and authorization mechanisms
 - Encryption and secure data storage
+- Network-level access controls and firewall rules
 - Monitoring, logging, and auditing
-- Regulatory and compliance safeguards
+- Regulatory and compliance safeguards relevant to their deployment environment (HIPAA, GDPR, etc.)
+- Data anonymization and de-identification for clinical data
+- Secure API key management and rotation policies
+
+## Reporting a Vulnerability
+
+If you discover a security vulnerability in this blueprint, please report it privately to the Cloud2 Labs maintainers rather than opening a public issue.
diff --git a/Terms And Conditions .md b/TERMS_AND_CONDITIONS.md
similarity index 100%
rename from Terms And Conditions .md
rename to TERMS_AND_CONDITIONS.md
diff --git a/TROUBLESHOOTING.md b/TROUBLESHOOTING.md
new file mode 100644
index 0000000..0460e46
--- /dev/null
+++ b/TROUBLESHOOTING.md
@@ -0,0 +1,876 @@
+# Troubleshooting Guide
+
+This guide covers common issues and solutions for the ClinIQ Clinical Q&A AI Assistant application.
+
+---
+
+## Table of Contents
+
+- [Installation Issues](#installation-issues)
+- [Configuration Errors](#configuration-errors)
+- [Runtime Errors](#runtime-errors)
+- [Backend Failures](#backend-failures)
+- [Performance Issues](#performance-issues)
+- [Docker Issues](#docker-issues)
+- [Network and API Errors](#network-and-api-errors)
+- [Advanced Debugging](#advanced-debugging)
+
+---
+
+## Installation Issues
+
+### Docker Container Build Fails
+
+**Error:** `ERROR [internal] load build context`
+
+**Cause:** Docker daemon not running or insufficient permissions.
+
+**Solution:**
+```bash
+# Start Docker daemon (Linux with systemd)
+sudo systemctl start docker
+
+# On Windows or macOS, start the Docker Desktop application
+
+# Verify Docker is running
+docker ps
+```
+
+### Port Already in Use
+
+**Error:** `Error starting userland proxy: listen tcp 0.0.0.0:5000: bind: address already in use`
+
+**Cause:** Another service is using port 5000 (backend) or 3000 (frontend).
+
+**Solution:**
+```bash
+# Find process using the port
+# On Linux/Mac:
+lsof -i :5000
+lsof -i :3000
+
+# On Windows:
+netstat -ano | findstr :5000
+netstat -ano | findstr :3000
+
+# Kill the process or change ports in docker-compose.yml
+```
+
+### Python Dependencies Installation Fails
+
+**Error:** `ERROR: Could not find a version that satisfies the requirement...`
+
+**Cause:** Python version incompatibility or network issues.
+
+**Solution:**
+```bash
+# Ensure Python 3.10+ is installed
+python --version
+
+# Upgrade pip
+pip install --upgrade pip
+
+# Install with verbose output to diagnose
+cd backend
+pip install -r requirements.txt --verbose
+
+# For SSL errors, try:
+pip install --trusted-host pypi.org --trusted-host files.pythonhosted.org -r requirements.txt
+```
+
+### Node.js Dependencies Installation Fails
+
+**Error:** `npm ERR! code ERESOLVE` or `Cannot find module`
+
+**Cause:** Node version incompatibility or corrupted package-lock.
+
+**Solution:**
+```bash
+cd frontend
+
+# Remove existing node_modules and lock file
+rm -rf node_modules package-lock.json
+
+# Ensure Node 18+ is installed
+node --version
+
+# Clean install
+npm install
+
+# If issues persist, try:
+npm install --legacy-peer-deps
+```
+
+---
+
+## Configuration Errors
+
+### Missing OpenAI API Key
+
+**Error:** `ValueError: OpenAI API key is required` or `401 Unauthorized`
+
+**Cause:** OpenAI API key not configured in the application.
+
+**Solution:**
+
+**Option 1: Configure via UI (Recommended)**
+1. Open the application at `http://localhost:3000`
+2. Click on the configuration sidebar (gear icon)
+3. Enter your OpenAI API key in the provided field
+4. The key is stored in browser memory only (not persisted)
+
+**Option 2: Set Environment Variable**
+```bash
+# Add to backend/.env file
+echo "OPENAI_API_KEY=your_actual_api_key_here" >> backend/.env
+
+# Or set as system environment variable (first line: Linux/Mac, second line: Windows CMD)
+export OPENAI_API_KEY=your_actual_api_key_here
+set OPENAI_API_KEY=your_actual_api_key_here
+```
+
+### Invalid OpenAI API Key
+
+**Error:** `401 Unauthorized` or `Invalid API key`
+
+**Cause:** OpenAI API key is incorrect, expired, or revoked.
+
+**Solution:**
+1. Verify your API key at https://platform.openai.com/api-keys
+2. Generate a new key if needed
+3. Update the key in the configuration sidebar
+4. API key format should start with `sk-`
+
+### OpenAI API Key Without Credits
+
+**Error:** `429 You exceeded your current quota` or `Insufficient credits`
+
+**Cause:** OpenAI account has insufficient credits or billing not set up.
+
+**Solution:**
+1. Check your usage at https://platform.openai.com/usage
+2. Add billing information at https://platform.openai.com/account/billing
+3. Purchase credits or upgrade plan
+4. Wait for rate limits to reset if on free tier
+
+### ChromaDB Initialization Fails
+
+**Error:** `chromadb.errors.NoIndexException` or `Cannot connect to ChromaDB`
+
+**Cause:** ChromaDB directory not accessible or corrupted.
+
+**Solution:**
+```bash
+# Remove and recreate ChromaDB directory
+rm -rf .chromadb
+mkdir .chromadb
+
+# Ensure proper permissions
+chmod 755 .chromadb
+
+# Restart the application
+docker compose down
+docker compose up --build
+```
+
+### File Upload Directory Missing
+
+**Error:** `FileNotFoundError: [Errno 2] No such file or directory: 'uploads'`
+
+**Cause:** Uploads directory doesn't exist.
+
+**Solution:**
+```bash
+# Create uploads directory in project root
+mkdir -p uploads
+
+# Ensure proper permissions
+chmod 755 uploads
+
+# Restart backend
+docker compose restart backend
+```
+
+---
+
+## Runtime Errors
+
+### Document Upload Fails
+
+**Error:** `Failed to upload document` or `Unsupported file type`
+
+**Cause:** File format not supported or file corrupted.
+
+**Solution:**
+1. Verify file is PDF, DOCX, or TXT format
+2. Check file size (max recommended: 50MB)
+3. Try opening the file to ensure it's not corrupted
+4. For PDFs, ensure they contain extractable text (not scanned images)
+5. Check backend logs for specific error:
+   ```bash
+   docker compose logs backend --tail 50
+   ```
+
+### Document Processing Stuck
+
+**Error:** Status shows "processing" indefinitely
+
+**Cause:** Document processing failed or backend crashed.
+
+**Solution:**
+```bash
+# Check backend logs for errors
+docker compose logs backend --tail 100
+
+# Restart backend service
+docker compose restart backend
+
+# Check if document was too large
+# Look for memory errors in logs
+
+# Clear stuck documents
+curl -X DELETE http://localhost:5000/clear
+```
+
+### Query Returns "No Documents Found"
+
+**Error:** `No documents found in vector store`
+
+**Cause:** Documents not properly indexed or ChromaDB cleared.
+
+**Solution:**
+1. Verify documents were successfully uploaded
+2. Check document status at `/status` endpoint:
+   ```bash
+   curl http://localhost:5000/status
+   ```
+3. Re-upload documents if necessary
+4. Check ChromaDB directory exists and contains data:
+   ```bash
+   ls -la .chromadb/
+   ```
+
+### Empty or Incomplete Answers
+
+**Error:** AI returns empty responses or cuts off mid-sentence
+
+**Cause:** Context retrieval failed or token limit exceeded.
+
+**Solution:**
+1. Check if documents are properly chunked and indexed
+2. Verify query is specific enough
+3. Review backend logs for retrieval errors
+4. Try disabling hybrid search and reranking to isolate the issue
+5. Increase token limits in `backend/utils/constants.py` if needed
+
+### Citations Not Showing
+
+**Error:** Answers provided without source citations
+
+**Cause:** Citation parsing failed or metadata missing.
+
+**Solution:**
+1. Check document metadata was stored during upload
+2. Review citation format in backend logs
+3. Verify `rag_pipeline.py` is properly extracting citations
+4. Re-upload documents to regenerate metadata
+
+---
+
+## Backend Failures
+
+### Flask Server Won't Start
+
+**Error:** `Address already in use` or `ModuleNotFoundError`
+
+**Cause:** Port conflict or missing dependencies.
+
+**Solution:**
+```bash
+# Check if port 5000 is available (first line: Linux/Mac, second line: Windows)
+lsof -i :5000
+netstat -ano | findstr :5000
+
+# Kill conflicting process or change port in api.py
+# Change: app.run(host='0.0.0.0', port=5000)
+# To:    app.run(host='0.0.0.0', port=5001)
+
+# Verify all dependencies installed
+cd backend
+pip install -r requirements.txt
+```
+
+### ChromaDB Connection Errors
+
+**Error:** `chromadb.errors.ChromaError: Could not connect to tenant`
+
+**Cause:** ChromaDB client misconfigured or directory permissions issue.
+
+**Solution:**
+```bash
+# Reset ChromaDB
+rm -rf .chromadb
+mkdir .chromadb
+
+# Ensure backend has write permissions
+chmod -R 755 .chromadb
+
+# Update ChromaDB settings in vector_store.py if needed
+# Verify persistence directory path is correct
+
+# Restart application
+docker compose down
+docker compose up --build
+```
+
+### Embedding Generation Fails
+
+**Error:** `OpenAI API error during embedding creation`
+
+**Cause:** API key invalid, network issues, or rate limits.
+
+**Solution:**
+1. Verify OpenAI API key is valid
+2. Check network connectivity to OpenAI API:
+   ```bash
+   curl https://api.openai.com/v1/models \
+     -H "Authorization: Bearer YOUR_API_KEY"
+   ```
+3. Check rate limits at https://platform.openai.com/account/limits
+4. Review embedding model configuration in `constants.py`:
+   ```python
+   DEFAULT_EMBEDDING_MODEL = "text-embedding-3-small"
+   ```
+5. Wait and retry if rate limited
+
+### Hybrid Search Errors
+
+**Error:** `Failed to perform hybrid search` or BM25 errors
+
+**Cause:** BM25 index not built or sparse search configuration issue.
+
+**Solution:**
+```bash
+# Check backend logs for specific BM25 error
+docker compose logs backend | grep -i "bm25"
+
+# Clear and rebuild vector store
+curl -X DELETE http://localhost:5000/clear
+
+# Re-upload documents to rebuild both dense and sparse indexes
+
+# Temporarily disable hybrid search in the UI to test
+# If queries work without hybrid search, rebuild the BM25 index
+```
+
+### Reranking Fails
+
+**Error:** `Reranking failed` or cosine similarity errors
+
+**Cause:** Query embedding generation failed or metadata missing.
+
+**Solution:**
+1. Check if query embeddings are being created
+2. Verify cosine similarity calculation in `vector_store.py`
+3. Test without reranking to isolate the issue
+4. Check backend logs for numpy/math errors
+5. Ensure chunk embeddings exist in ChromaDB
+
+### Document Chunking Errors
+
+**Error:** `Failed to chunk document` or tiktoken errors
+
+**Cause:** Token counting failed or document format issue.
+
+**Solution:**
+```bash
+# Verify tiktoken is properly installed
+pip install --upgrade tiktoken
+
+# Check document encoding
+# Ensure UTF-8 encoding for text files
+
+# Review chunking parameters in document_processor.py:
+# - chunk_size (default: 800 tokens)
+# - chunk_overlap (default: 150 tokens)
+
+# Try simpler document first to isolate issue
+```
+
+---
+
+## Performance Issues
+
+### Slow Query Response Times
+
+**Symptom:** Queries take longer than 10 seconds to respond
+
+**Causes and Solutions:**
+
+1. **Large document corpus**
+   - Reduce number of retrieved chunks (default: 5)
+   - Enable reranking to improve quality of fewer chunks
+   - Consider document filtering by metadata
+
+2. **Hybrid search overhead**
+   - Disable hybrid search in UI if not needed
+   - Profile search times in backend logs
+   - Optimize BM25 index parameters
+
+3. **Network latency to OpenAI**
+   - Check internet connection speed
+   - Monitor OpenAI API status at https://status.openai.com
+   - Consider using streaming mode for faster perceived response
+
+4. **Reranking computation**
+   - Disable reranking if not necessary
+   - Reduce number of chunks to rerank
+   - Profile cosine similarity calculations
+
+### High Memory Usage
+
+**Symptom:** Backend container using excessive memory (>2GB)
+
+**Cause:** Large document embeddings or ChromaDB cache.
+
+**Solution:**
+```bash
+# Increase Docker memory limit in Docker Desktop
+# Recommended: 4GB minimum
+
+# Clear old documents
+curl -X DELETE http://localhost:5000/clear
+
+# Optimize chunking strategy
+# Reduce chunk_size in document_processor.py
+
+# Monitor memory usage
+docker stats
+
+# Restart backend periodically if needed
+docker compose restart backend
+```
+
+### Slow Document Upload Processing
+
+**Symptom:** Document processing takes longer than 2 minutes
+
+**Cause:** Large document or slow embedding generation.
+
+**Solution:**
+1. Check document size - consider splitting large PDFs
+2. Verify network speed to OpenAI API
+3. Review backend logs for bottlenecks
+4. Monitor embedding generation time
+5. Consider increasing chunk size to reduce total chunks
+6. Process documents in smaller batches
+
+### Frontend Freezing During Streaming
+
+**Symptom:** UI becomes unresponsive during answer streaming
+
+**Cause:** Too many rapid DOM updates or memory leak.
+
+**Solution:**
+1. Check browser console for JavaScript errors
+2. Reduce update frequency in ChatInterface.jsx
+3. Clear browser cache and cookies
+4. Test in a different browser
+5. Check for memory leaks in React DevTools
+
+---
+
+## Docker Issues
+
+### Backend Container Won't Start
+
+**Error:** `backend exited with code 1`
+
+**Solution:**
+```bash
+# Check backend logs for specific error
+docker compose logs backend
+
+# Common causes:
+# 1. Missing dependencies - rebuild:
+docker compose build --no-cache backend
+docker compose up -d
+
+# 2. Port conflict - see Port Already in Use section
+
+# 3. Volume mount issues - verify paths in docker-compose.yml
+```
+
+### Frontend Container Build Fails
+
+**Error:** `npm install failed` or `Cannot find module`
+
+**Solution:**
+```bash
+# Rebuild frontend with clean cache
+docker compose build --no-cache frontend
+docker compose up -d frontend
+
+# Verify Node.js version in Dockerfile (should be 18+)
+
+# Check if package.json is valid
+cd frontend
+npm install  # Test locally first
+```
+
+### Container Memory Issues
+
+**Error:** `Killed` or `Out of memory`
+
+**Solution:**
+```bash
+# Increase Docker memory limit in Docker Desktop settings
+# Recommended: 4GB minimum, 8GB preferred
+
+# Reduce memory usage:
+# - Clear old documents
+# - Reduce chunk size
+# - Process fewer documents concurrently
+
+# Monitor memory usage
+docker stats
+```
+
+### Cannot Connect to Backend from Frontend
+
+**Error:** `Network error` or `Connection refused` in browser console
+
+**Cause:** Docker network misconfiguration or CORS issue.
+
+**Solution:**
+```bash
+# Verify both containers are running
+docker compose ps
+
+# Check backend is accessible
+curl http://localhost:5000/api/health
+
+# Verify Vite proxy configuration in frontend/vite.config.js:
+# proxy: {
+#   '/api': 'http://backend:5000'
+# }
+
+# Check CORS settings in backend/api.py
+# CORS(app, origins=["*"])  # or specific origins
+
+# Restart services
+docker compose restart
+```
+
+### Volume Mount Permission Errors
+
+**Error:** `Permission denied` when accessing volumes
+
+**Cause:** Docker volume permissions mismatch.
+
+**Solution:**
+```bash
+# Fix permissions on host
+sudo chown -R $USER:$USER .chromadb uploads
+
+# Or in docker-compose.yml, add user directive:
+# user: "1000:1000"  # Your UID:GID
+
+# Rebuild containers
+docker compose down
+docker compose up --build
+```
+
+---
+
+## Network and API Errors
+
+### OpenAI API Connection Refused
+
+**Error:** `ConnectionRefusedError: [Errno 111] Connection refused`
+
+**Cause:** OpenAI API unreachable or network firewall blocking.
+
+**Solution:**
+```bash
+# Test connectivity to OpenAI
+curl https://api.openai.com/v1/models \
+  -H "Authorization: Bearer YOUR_API_KEY"
+
+# Check firewall/proxy settings allow outbound HTTPS
+# Verify DNS resolution
+ping api.openai.com
+
+# Check OpenAI status
+# https://status.openai.com
+
+# Try with different network if behind corporate proxy
+```
+
+### OpenAI API Timeout
+
+**Error:** `Timeout waiting for OpenAI response`
+
+**Cause:** Network latency or OpenAI API overloaded.
+
+**Solution:**
+```bash
+# Increase timeout in backend code
+# In rag_pipeline.py or document_processor.py:
+# timeout = 300  # Increase to 300 seconds
+
+# Check network speed
+# Test with smaller document first
+
+# Switch to streaming mode for better user experience
+```
+
+### Rate Limit Exceeded
+
+**Error:** `429 Too Many Requests` or `Rate limit exceeded`
+
+**Cause:** Too many concurrent API requests.
+
+**Solution:**
+- Wait 60 seconds before retrying
+- Reduce document processing concurrency
+- Check rate limits at https://platform.openai.com/account/limits
+- Upgrade to higher tier OpenAI plan
+- Process documents in smaller batches
+
+### SSL Certificate Errors
+
+**Error:** `SSLError: [SSL: CERTIFICATE_VERIFY_FAILED]`
+
+**Cause:** SSL certificate verification issues or corporate proxy.
+
+**Solution:**
+```bash
+# Update CA certificates
+pip install --upgrade certifi
+
+# For development only (not recommended for production):
+# Add SSL verification bypass in api calls
+# requests.post(..., verify=False)
+
+# Or use corporate CA bundle
+# export REQUESTS_CA_BUNDLE=/path/to/corporate-ca.pem
+```
+
+### CORS Errors in Browser
+
+**Error:** `CORS policy: No 'Access-Control-Allow-Origin' header`
+
+**Cause:** Backend not allowing frontend origin.
+
+**Solution:**
+```python
+# In backend/api.py, ensure CORS is configured:
+from flask_cors import CORS
+
+app = Flask(__name__)
+CORS(app, origins=["http://localhost:3000", "http://localhost:5173"])
+
+# Or allow all origins for development (not for production):
+CORS(app, origins="*")
+
+# Then restart the backend from a shell:
+#   docker compose restart backend
+```
+
+---
+
+## Advanced Debugging
+
+### Enable Debug Logging
+
+To get more detailed logs for debugging:
+
+```bash
+# In backend/api.py, set Flask debug mode (Python code, not a shell command):
+#   app.run(debug=True, host='0.0.0.0', port=5000)
+
+# Or set environment variable
+export FLASK_ENV=development
+
+# Restart backend
+docker compose restart backend
+
+# View detailed logs
+docker compose logs -f backend
+```
+
+### Check Application Health
+
+```bash
+# Backend health check
+curl http://localhost:5000/api/health
+
+# Expected response:
+# {"status": "healthy", "chromadb": "connected", "models": "loaded"}
+
+# Check document count
+curl http://localhost:5000/status
+
+# View ChromaDB stats
+ls -lah .chromadb/
+```
+
+### Test Individual Components
+
+**Test Document Upload:**
+```bash
+curl -X POST http://localhost:5000/upload \
+  -F "file=@test-document.pdf"
+```
+
+**Test Query (Non-streaming):**
+```bash
+curl -X POST http://localhost:5000/query \
+  -H "Content-Type: application/json" \
+  -H "X-OpenAI-Key: YOUR_API_KEY" \
+  -d '{
+    "query": "What is the patient diagnosis?",
+    "use_hybrid_search": true,
+    "use_reranker": true
+  }'
+```
+
+**Test Embeddings:**
+```bash
+# Test in Python console
+python3
+>>> from openai import OpenAI
+>>> client = OpenAI(api_key="YOUR_KEY")
+>>> response = client.embeddings.create(
+...     model="text-embedding-3-small",
+...     input="test"
+... )
+>>> print(len(response.data[0].embedding))
+# Should print: 1536
+```
+
+### Inspect ChromaDB Contents
+
+```python
+# In Python console
+import chromadb
+
+client = chromadb.PersistentClient(path=".chromadb")
+collection = client.get_or_create_collection("clinical_documents")
+
+# Get count
+print(f"Total chunks: {collection.count()}")
+
+# Sample some documents
+results = collection.get(limit=5, include=["documents", "metadatas"])
+for i, (doc, meta) in enumerate(zip(results['documents'], results['metadatas'])):
+    print(f"\nChunk {i}:")
+    print(f"Source: {meta.get('source', 'unknown')}")
+    print(f"Text: {doc[:200]}...")
+```
+
+### Monitor API Token Usage
+
+```bash
+# Add logging to track token usage in backend
+# In rag_pipeline.py or document_processor.py:
+
+import logging
+logging.basicConfig(level=logging.INFO)
+
+# Log before API calls:
+logging.info(f"Generating embeddings for {num_chunks} chunks")
+logging.info(f"Query: {query}")
+logging.info(f"Context length: {len(context)} tokens")
+
+# Check logs
+docker compose logs backend | grep -i "token\|embedding"
+```
+
+### Reset Application State
+
+To completely reset the application:
+
+```bash
+# Stop and remove containers
+docker compose down -v
+
+# Remove data directories
+rm -rf .chromadb uploads
+
+# Recreate directories
+mkdir .chromadb uploads
+
+# Remove temporary files
+rm -rf backend/__pycache__ backend/utils/__pycache__
+
+# Rebuild and restart
+docker compose build --no-cache
+docker compose up -d
+
+# Verify clean state
+curl http://localhost:5000/status
+# Should show: {"status": "ready", "document_count": 0}
+```
+
+### Profile Performance
+
+```python
+# Add timing to critical functions in backend
+import time
+
+def timed_function():
+    start = time.time()
+    # ... function code ...
+    duration = time.time() - start
+    print(f"Function took {duration:.2f} seconds")
+
+# Example: Profile search performance
+start = time.time()
+results = vector_store.hybrid_search(query, k=5)
+print(f"Hybrid search: {time.time() - start:.2f}s")
+
+start = time.time()
+reranked = vector_store.rerank_results(query, results)
+print(f"Reranking: {time.time() - start:.2f}s")
+```
+
+---
+
+## Getting Help
+
+If you continue to experience issues:
+
+1. **Check Logs:** Review backend logs with `docker compose logs backend -f`
+2. **Verify Configuration:** Ensure OpenAI API key is valid and has credits
+3. **Test Connectivity:** Verify network access to OpenAI API
+4. **Inspect Data:** Check ChromaDB and uploads directories exist and have proper permissions
+5. **Report Issues:** If the problem persists, collect:
+   - Error messages from logs
+   - Document type and size
+   - Query that caused the issue
+   - Browser console errors (for frontend issues)
+   - Docker container status (`docker compose ps`)
+   - Configuration settings (redact API key)
+
+---
+
+## Common Success Indicators
+
+A successful run should show:
+
+```
+✅ Backend started successfully on port 5000
+✅ Frontend accessible at http://localhost:3000
+✅ ChromaDB initialized and persistent
+✅ Document uploaded and processed
+✅ Embeddings created and stored
+✅ Query returns relevant answer with citations
+✅ Streaming works smoothly without errors
+✅ Sources properly cited with document references
+```
+
+All components should be running without errors, and queries should return contextually relevant answers based on uploaded documents.
+
+---
diff --git a/backend/.env.example b/backend/.env.example
new file mode 100644
index 0000000..c9f7828
--- /dev/null
+++ b/backend/.env.example
@@ -0,0 +1,139 @@
+# ============================================================================
+# ClinIQ Configuration File
+# ============================================================================
+# Copy this file to .env and fill in your actual values
+# The .env file will be loaded by the application at runtime
+
+# ============================================================================
+# LLM Provider Configuration
+# ============================================================================
+
+# Provider Selection
+# Options: openai, groq, ollama, openrouter, custom
+# - openai: OpenAI's GPT models (default, requires API key)
+# - groq: Groq's fast inference (requires API key)
+# - ollama: Local models (no API key needed)
+# - openrouter: Access to multiple models (requires API key)
+# - custom: Any OpenAI-compatible API
+LLM_PROVIDER=openai
+
+# API Key
+# Required for: OpenAI, Groq, OpenRouter, Custom providers
+# Not required for: Ollama (local models)
+# You can also set this via the UI (takes precedence over this value)
+LLM_API_KEY=sk-your-api-key-here
+
+# Base URL for LLM API
+# Default URLs for each provider:
+# - OpenAI: https://api.openai.com/v1
+# - Groq: https://api.groq.com/openai/v1
+# - Ollama: http://localhost:11434/v1
+# - OpenRouter: https://openrouter.ai/api/v1
+# - Custom: Your custom endpoint URL
+LLM_BASE_URL=https://api.openai.com/v1
+
+# ============================================================================
+# Model Configuration
+# ============================================================================
+
+# Chat Model (for generating answers)
+# OpenAI options: gpt-3.5-turbo, gpt-4, gpt-4-turbo, gpt-4o
+# Groq options: llama-3.2-90b-text-preview, llama-3.1-70b-versatile, mixtral-8x7b-32768
+# Ollama options: qwen2.5:7b, llama3.1:8b, llama3.2:3b, mistral:7b
+# OpenRouter options: anthropic/claude-3.5-sonnet, google/gemini-pro-1.5
+LLM_CHAT_MODEL=gpt-3.5-turbo
+
+# Embedding Model (for creating vector representations)
+# OpenAI options: text-embedding-3-small, text-embedding-3-large
+# For other providers, use compatible embedding models
+# Note: Some providers may not support embeddings (will fall back to OpenAI)
+LLM_EMBEDDING_MODEL=text-embedding-3-small
+
+# ============================================================================
+# Generation Parameters
+# ============================================================================
+
+# Temperature: Controls randomness in responses
+# Range: 0.0 (deterministic) to 1.0 (creative)
+# Recommended: 0.7 for balanced clinical responses
+TEMPERATURE=0.7
+
+# Maximum Tokens: Maximum length of generated responses
+# Note: Actual maximum depends on the model
+# Example: gpt-3.5-turbo supports up to 4096 tokens
+MAX_TOKENS=1000
+
+# Maximum Retry Attempts: Number of retries on API failures
+# Recommended: 3
+MAX_RETRIES=3
+
+# Request Timeout: API request timeout in seconds
+# Recommended: 300 (5 minutes)
+REQUEST_TIMEOUT=300
+
+# ============================================================================
+# Security Configuration
+# ============================================================================
+
+# SSL Verification
+# Set to 'false' ONLY for development with self-signed certificates
+# Always use 'true' in production
+VERIFY_SSL=true
+
+# ============================================================================
+# Flask Configuration
+# ============================================================================
+
+# Flask Environment
+# Options: development, production
+FLASK_ENV=development
+
+
+# ============================================================================
+# Provider-Specific Configuration Examples
+# ============================================================================
+
+# Example 1: OpenAI (Default)
+# LLM_PROVIDER=openai
+# LLM_API_KEY=sk-your-openai-api-key
+# LLM_BASE_URL=https://api.openai.com/v1
+# LLM_CHAT_MODEL=gpt-3.5-turbo
+# LLM_EMBEDDING_MODEL=text-embedding-3-small
+
+# Example 2: Groq (Fast Inference)
+# LLM_PROVIDER=groq
+# LLM_API_KEY=gsk_your-groq-api-key
+# LLM_BASE_URL=https://api.groq.com/openai/v1
+# LLM_CHAT_MODEL=llama-3.2-90b-text-preview
+# LLM_EMBEDDING_MODEL=text-embedding-3-small  # Falls back to OpenAI
+
+# Example 3: Ollama (Local Models - No API Key)
+# LLM_PROVIDER=ollama
+# LLM_API_KEY=  # Not needed
+# LLM_BASE_URL=http://localhost:11434/v1
+# LLM_CHAT_MODEL=qwen2.5:7b
+# LLM_EMBEDDING_MODEL=nomic-embed-text  # If using Ollama embedding model
+
+# Example 4: OpenRouter (Multi-Model Access)
+# LLM_PROVIDER=openrouter
+# LLM_API_KEY=sk-or-v1-your-openrouter-key
+# LLM_BASE_URL=https://openrouter.ai/api/v1
+# LLM_CHAT_MODEL=anthropic/claude-3.5-sonnet
+# LLM_EMBEDDING_MODEL=text-embedding-3-small  # Falls back to OpenAI
+
+# Example 5: Custom OpenAI-Compatible API
+# LLM_PROVIDER=custom
+# LLM_API_KEY=your-custom-api-key
+# LLM_BASE_URL=https://your-custom-endpoint.com/v1
+# LLM_CHAT_MODEL=your-model-name
+# LLM_EMBEDDING_MODEL=your-embedding-model-name
+
+# ============================================================================
+# Notes
+# ============================================================================
+# 1. API keys can also be provided via the UI (takes precedence)
+# 2. For Ollama, make sure Ollama is running: `ollama serve`
+# 3. For Ollama, pull models first: `ollama pull qwen2.5:7b`
+# 4. Different providers may have different model naming conventions
+# 5. Embedding models may not be available on all providers
+# 6. Always keep your API keys secure and never commit them to version control
diff --git a/backend/Dockerfile b/backend/Dockerfile
index 350212b..91909ed 100644
--- a/backend/Dockerfile
+++ b/backend/Dockerfile
@@ -10,9 +10,10 @@ ENV LC_ALL=C.UTF-8
 # Set working directory
 WORKDIR /app
 
-# Install system dependencies
+# Install system dependencies (including curl for healthcheck)
 RUN apt-get update && apt-get install -y --no-install-recommends \
     gcc \
+    curl \
     && rm -rf /var/lib/apt/lists/*
 
 # Copy requirements
@@ -32,6 +33,10 @@ RUN mkdir -p uploads .chromadb \
 # Expose port
 EXPOSE 5000
 
+# Health check
+HEALTHCHECK --interval=30s --timeout=10s --retries=3 --start-period=40s \
+    CMD curl -f http://localhost:5000/api/health || exit 1
+
 # Run the application as non-root
 USER appuser
 CMD ["python", "api.py"]
diff --git a/backend/api.py b/backend/api.py
index f1e8190..09926b7 100644
--- a/backend/api.py
+++ b/backend/api.py
@@ -1,15 +1,4 @@
-"""
-ClinIQ Backend API - Flask REST API Server
-
-This is the main backend server that handles:
-1. Document uploads and processing
-2. Question answering using RAG (Retrieval-Augmented Generation)
-3. Knowledge base management
-4. Health checks and status monitoring
-
-The API uses Flask framework and communicates with the React frontend via REST endpoints.
-All document processing, vector storage, and AI interactions happen here.
-"""
+"""ClinIQ Backend API - Flask REST API Server"""
 
 from flask import Flask, request, jsonify, Response, send_from_directory
 from flask_cors import CORS
@@ -21,182 +10,66 @@
 import time
 from werkzeug.utils import secure_filename
 from dotenv import load_dotenv
+from config import config
 from utils.document_processor import extract_text_from_pdf, extract_text_from_docx, chunk_text, create_embeddings
 from utils.vector_store import initialize_chromadb, add_documents, clear_store, initialize_bm25_index
 from utils.rag_pipeline import generate_answer, generate_answer_stream, DEFAULT_CHAT_MODEL, DEFAULT_EMBEDDING_MODEL
 
-# ============================================================================
-# LOGGING CONFIGURATION
-# ============================================================================
-# Setup logging to track what the application is doing
-# This helps with debugging and monitoring the application's behavior
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
 
-# Load environment variables from .env file (if it exists)
-# This allows us to store sensitive information like API keys outside the code
 load_dotenv()
 
-# ============================================================================
-# FLASK APPLICATION SETUP
-# ============================================================================
-# Create the Flask application instance
-# This is the main WSGI application that handles HTTP requests
 app = Flask(__name__)
-
-# Configure Flask to use UTF-8 encoding for JSON responses
-# This ensures emojis and Unicode characters are properly handled
 app.config['JSON_AS_ASCII'] = False
 app.config['JSONIFY_MIMETYPE'] = 'application/json; charset=utf-8'
-
-# Enable CORS (Cross-Origin Resource Sharing)
-# This allows the React frontend (running on port 3000) to make requests
-# to this backend API (running on port 5000) without browser security errors
 CORS(app)
 
-# ============================================================================
-# APPLICATION CONFIGURATION
-# ============================================================================
-# Folder where uploaded documents will be stored on the server
-# This allows users to download/view original files when clicking citations
 UPLOAD_FOLDER = 'uploads'
-
-# File types that are allowed to be uploaded
-# Currently supports: PDF, Word documents (.docx), and plain text files
 ALLOWED_EXTENSIONS = {'pdf', 'docx', 'txt'}
+MAX_FILE_SIZE = 50 * 1024 * 1024
 
-# Maximum file size allowed for upload (50 megabytes)
-# This prevents users from uploading extremely large files that could crash the server
-MAX_FILE_SIZE = 50 * 1024 * 1024  # 50MB
-
-# Create the uploads directory if it doesn't exist
-# This ensures the folder is available when users try to upload files
 os.makedirs(UPLOAD_FOLDER, exist_ok=True)
 
-# ============================================================================
-# STARTUP INITIALIZATION
-# ============================================================================
-# Clear the knowledge base when the application starts
-# This ensures each session starts fresh without leftover data from previous sessions
-# In a production environment, you might want to remove this and persist data
 logger.info("Starting fresh session: Clearing knowledge base.")
 clear_store()
 
-# ============================================================================
-# CLEANUP HANDLERS
-# ============================================================================
-
 def cleanup_on_shutdown():
-    """
-    Cleanup function that runs when the application shuts down.
-    
-    Security measure to ensure API keys are not left in memory.
-    Clears OpenAI API key from environment variables when app terminates.
-    """
+    """Remove OPENAI_API_KEY (and only that variable) from os.environ at interpreter exit; registered with atexit."""
     if "OPENAI_API_KEY" in os.environ:
         del os.environ["OPENAI_API_KEY"]
         logger.info("Cleared API key from environment on shutdown")
 
-# Register cleanup function to run when app terminates
 atexit.register(cleanup_on_shutdown)
 
-# ============================================================================
-# JOB TRACKING STORE
-# ============================================================================
-# Stores the status of background document processing jobs
-# job_id -> {"status": "processing"|"completed"|"failed", "message": "...", "chunks": 0, "files": []}
 processing_jobs = {}
 
 @app.teardown_appcontext
 def teardown_request(exception=None):
-    """
-    Teardown handler that runs after each request.
-    
-    Security measure to ensure API keys don't persist between requests.
-    This clears any API key that might have been set during request processing.
-    """
+    """Remove OPENAI_API_KEY (and only that variable) from os.environ whenever an application context is torn down."""
     if "OPENAI_API_KEY" in os.environ:
         del os.environ["OPENAI_API_KEY"]
 
-# ============================================================================
-# HELPER FUNCTIONS
-# ============================================================================
-
 def allowed_file(filename):
-    """
-    Validates if an uploaded file has an allowed extension.
-    
-    This security check ensures only supported file types can be uploaded,
-    preventing potential security issues from malicious file uploads.
-    
-    Args:
-        filename (str): The name of the file to check
-        
-    Returns:
-        bool: True if the file extension is allowed, False otherwise
-        
-    Example:
-        allowed_file("document.pdf") -> True
-        allowed_file("script.exe") -> False
-    """
+    """Check if file extension is allowed."""
     return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
 
-# ============================================================================
-# API ENDPOINTS
-# ============================================================================
-
 @app.route('/api/health', methods=['GET'])
 def health_check():
-    """
-    Health Check Endpoint
-    
-    A simple endpoint to verify that the backend server is running and responsive.
-    This is useful for:
-    - Monitoring tools to check if the service is alive
-    - Docker health checks
-    - Frontend to verify backend connectivity
-    
-    Returns:
-        JSON response with status "healthy" and HTTP 200 status code
-        
-    Example Response:
-        {"status": "healthy"}
-    """
+    """Health check endpoint."""
     return jsonify({"status": "healthy"}), 200
 
 @app.route('/api/files/<filename>', methods=['GET'])
 def serve_file(filename):
-    """
-    File Serving Endpoint
-    
-    Serves the original uploaded document files so users can view/download them
-    when clicking on citations in the chat interface.
-    
-    This endpoint allows the frontend to display the original document
-    that a citation refers to, providing transparency and verification.
-    
-    Args:
-        filename (str): The name of the file to serve (from URL path)
-        
-    Returns:
-        File content if found, or 404 error if file doesn't exist
-        
-    Security Note:
-        Uses secure_filename() to prevent directory traversal attacks
-    """
+    """Serve uploaded files."""
     try:
-        # send_from_directory safely serves files from a specific directory
-        # It prevents directory traversal attacks (e.g., ../../../etc/passwd)
         return send_from_directory(UPLOAD_FOLDER, filename)
     except Exception as e:
         logger.error(f"Error serving file {filename}: {str(e)}")
         return jsonify({"error": "File not found"}), 404
 
 def process_documents_background(job_id, file_paths, api_key):
-    """
-    Background worker that handles the heavy document processing:
-    text extraction, chunking, embedding, and indexing.
-    """
+    """Background worker for document processing."""
     try:
         processing_jobs[job_id] = {
             "status": "processing",
@@ -204,16 +77,16 @@ def process_documents_background(job_id, file_paths, api_key):
             "chunks": 0,
             "files": []
         }
-        
+
         global_chunk_counter = 0
         processed_files = []
         collection = initialize_chromadb()
-        
+
         for file_path in file_paths:
             filename = os.path.basename(file_path)
             file_chunks = []
             file_metadata = []
-            
+
             with open(file_path, 'rb') as f:
                 if filename.endswith('.pdf'):
                     pages_data = extract_text_from_pdf(f)
@@ -224,11 +97,11 @@ def process_documents_background(job_id, file_paths, api_key):
                     f.seek(0)
                     text = str(f.read(), "utf-8")
                     pages_data = [(text, 1)]
-            
+
             for text, page_num in pages_data:
                 if not text or not text.strip():
                     continue
-                
+
                 page_chunks = chunk_text(text)
                 for chunk in page_chunks:
                     file_chunks.append(chunk)
@@ -238,12 +111,12 @@ def process_documents_background(job_id, file_paths, api_key):
                         "chunk_id": global_chunk_counter
                     })
                     global_chunk_counter += 1
-            
+
             if file_chunks:
                 embeddings = create_embeddings(file_chunks, api_key)
                 add_documents(collection, file_chunks, embeddings, file_metadata)
                 processed_files.append(filename)
-        
+
         if not processed_files:
             processing_jobs[job_id].update({
                 "status": "failed",
@@ -252,14 +125,14 @@ def process_documents_background(job_id, file_paths, api_key):
             return
 
         initialize_bm25_index(collection)
-        
+
         processing_jobs[job_id].update({
             "status": "completed",
             "message": f"Successfully processed {len(processed_files)} documents",
             "chunks": global_chunk_counter,
             "files": processed_files
         })
-        
+
     except Exception as e:
         logger.error(f"Background upload error for job {job_id}: {str(e)}", exc_info=True)
         processing_jobs[job_id].update({
@@ -269,60 +142,60 @@ def process_documents_background(job_id, file_paths, api_key):
 
 @app.route('/api/upload', methods=['POST'])
 def upload_document():
-    """
-    STAY-RESPONSIVE UPLOAD ENDPOINT
-    Saves the files then kicks off background processing.
-    """
+    """Handle file uploads and start background processing."""
     try:
         if 'file' not in request.files:
             return jsonify({"error": "No file provided"}), 400
-        
+
         files = request.files.getlist('file')
         api_key = request.form.get('api_key', '')
-        
+
         if not files or all(f.filename == '' for f in files):
             return jsonify({"error": "No files selected"}), 400
-        
+
         api_key = api_key.strip() if api_key else ''
-        if not api_key or len(api_key) < 10 or not api_key.startswith('sk-'):
-            return jsonify({"error": "Valid OpenAI API key is required"}), 400
-        
+        if api_key == 'from_env' or not api_key:
+            api_key = config.get_api_key()
+            logger.info("Using API key from environment configuration")
+        # Ollama runs without credentials (Config.validate_config requires no key for it), so skip the key check.
+        if config.LLM_PROVIDER != 'ollama' and (not api_key or len(api_key) < 10):
+            return jsonify({"error": "Valid API key is required. Please configure in .env file"}), 400
+
         job_id = str(uuid.uuid4())
         saved_file_paths = []
-        
+
         for file in files:
             if file.filename == '' or not allowed_file(file.filename):
                 continue
-                
+
             filename = secure_filename(file.filename)
             file_path = os.path.join(UPLOAD_FOLDER, filename)
             file.save(file_path)
             saved_file_paths.append(file_path)
-        
+
         if not saved_file_paths:
              return jsonify({"error": "No valid files were uploaded"}), 400
 
-        # Start background processing
         thread = threading.Thread(
-            target=process_documents_background, 
+            target=process_documents_background,
             args=(job_id, saved_file_paths, api_key)
         )
-        thread.daemon = True # Ensure thread closes with main app
+        thread.daemon = True
         thread.start()
-        
+
         return jsonify({
             "success": True,
             "job_id": job_id,
             "message": "Files uploaded successfully. Processing started in background."
-        }), 202 # 202 Accepted
-        
+        }), 202
+
     except Exception as e:
         logger.error(f"Upload start error: {str(e)}", exc_info=True)
         return jsonify({"error": str(e)}), 500
 
 @app.route('/api/upload/status/<job_id>', methods=['GET'])
 def get_upload_status(job_id):
-    """Checks the status of a background processing job."""
+    """Check status of background processing job."""
     job = processing_jobs.get(job_id)
     if not job:
         return jsonify({"error": "Job not found"}), 404
@@ -330,205 +203,89 @@ def get_upload_status(job_id):
 
 @app.route('/api/query', methods=['POST'])
 def query_documents():
-    """
-    Query Documents Endpoint - THE QUESTION ANSWERING SYSTEM
-    
-    This is the core endpoint that handles user questions. It implements
-    the RAG (Retrieval-Augmented Generation) pipeline:
-    
-    1. RECEIVES QUESTION: Gets the user's question from the frontend
-    2. REWRITES QUERY: If there's conversation history, rewrites the question
-       to be self-contained (e.g., "What about treatment?" -> "Treatment for Diabetes")
-    3. SEARCHES DOCUMENTS: Uses hybrid search to find relevant document chunks
-       - Dense search: Finds by meaning (semantic similarity)
-       - Sparse search: Finds by keywords (BM25)
-       - Combines both using Reciprocal Rank Fusion (RRF)
-    4. RERANKS RESULTS: Re-orders results by relevance using cosine similarity
-    5. GENERATES ANSWER: Sends question + relevant chunks to GPT-3.5-Turbo
-    6. RETURNS ANSWER: Sends back the answer with source citations
-    
-    Request Format:
-        JSON body with:
-            - query: The user's question (string)
-            - api_key: OpenAI API key (string)
-            - history: Optional conversation history (array of messages)
-            - use_hybrid_search: Whether to use hybrid search (boolean, default: True)
-            - use_reranker: Whether to rerank results (boolean, default: True)
-            - show_thinking: Whether to show AI reasoning process (boolean, default: True)
-            - stream: Whether to stream the response (boolean, default: False)
-    
-    Returns:
-        JSON response with:
-            - answer: The generated answer (string)
-            - citations: List of source citations (array of strings)
-            - thinking: AI reasoning process (string, if show_thinking is True)
-            
-    OR (if stream=True):
-        Server-Sent Events (SSE) stream with chunks of the answer
-        
-    Error Responses:
-        - 400: Missing query or API key
-        - 500: Server error during processing
-        
-    Example Request:
-        {
-            "query": "What are the side effects of this medication?",
-            "api_key": "sk-...",
-            "use_hybrid_search": true,
-            "use_reranker": true,
-            "show_thinking": false
-        }
-        
-    Example Response:
-        {
-            "answer": "Based on the document, the side effects include...",
-            "citations": [
-                "Source: medication_guide.pdf | Page: 5",
-                "Source: medication_guide.pdf | Page: 6"
-            ]
-        }
-    """
+    """Handle question-answering queries using RAG pipeline."""
     try:
-        # Get JSON data from request body
         data = request.json
         if not data:
             return jsonify({"error": "No data provided"}), 400
-            
-        # Extract parameters from request
-        query = data.get('query', '')  # User's question
-        api_key = data.get('api_key', '')  # OpenAI API key
-        # Defaults are set to True for better search quality
-        use_hybrid_search = data.get('use_hybrid_search', True)  # Enable hybrid search
-        use_reranker = data.get('use_reranker', True)  # Enable reranking
-        show_thinking = data.get('show_thinking', True)  # Show AI reasoning
-        stream = data.get('stream', False)  # Stream response or return all at once
-        
+
+        query = data.get('query', '')
+        api_key = data.get('api_key', '')
+        use_hybrid_search = data.get('use_hybrid_search', True)
+        use_reranker = data.get('use_reranker', True)
+        show_thinking = data.get('show_thinking', True)
+        stream = data.get('stream', False)
+
         logger.info(f"Processing query (stream={stream}): {query}")
-        
-        # Validate required parameters
+
         if not query:
             return jsonify({"error": "Query is required"}), 400
-        
-        # Validate API key is provided and has minimum length
+
         api_key = api_key.strip() if api_key else ''
-        if not api_key or len(api_key) < 10 or not api_key.startswith('sk-'):
-            logger.warning(f"Invalid API key in query: length={len(api_key)}, starts_with_sk={api_key.startswith('sk-') if api_key else False}")
-            return jsonify({"error": "Valid OpenAI API key is required (must start with 'sk-')"}), 401
-        
-        # ================================================================
-        # STREAMING MODE: Send answer in real-time chunks
-        # ================================================================
-        if (stream):
+        if api_key == 'from_env' or not api_key:
+            api_key = config.get_api_key()
+            logger.info("Using API key from environment configuration")
+        # Ollama runs without credentials (Config.validate_config requires no key for it), so skip the key check.
+        if config.LLM_PROVIDER != 'ollama' and (not api_key or len(api_key) < 10):
+            return jsonify({"error": "Valid API key is required. Please configure in .env file"}), 400
+
+        if stream:
             def sse_stream():
-                """
-                Server-Sent Events (SSE) generator function.
-                Yields answer chunks as they're generated by the AI.
-                This provides a real-time typing effect in the frontend.
-                """
+                """Generate SSE stream for real-time responses."""
                 try:
-                    # Generate answer with streaming enabled
                     for chunk in generate_answer_stream(
                         query,
                         api_key=api_key,
-                        history=data.get('history', []),  # Conversation history
+                        history=data.get('history', []),
                         use_hybrid_search=use_hybrid_search,
                         use_reranker=use_reranker,
                         show_thinking=show_thinking
                     ):
-                        # Format as SSE (Server-Sent Events)
-                        # Frontend receives chunks and displays them in real-time
                         yield f"data: {chunk}\n\n"
                 except Exception as stream_err:
                     logger.error(f"Stream error: {str(stream_err)}")
-                    # Send error as SSE event
                     yield f"data: {{\"error\": \"{str(stream_err)}\"}}\n\n"
 
-            # Return streaming response
-            # mimetype='text/event-stream' tells the browser this is an SSE stream
-            # charset=utf-8 ensures proper encoding of Unicode characters (emojis)
             return Response(sse_stream(), mimetype='text/event-stream; charset=utf-8')
 
-        # ================================================================
-        # NON-STREAMING MODE: Generate complete answer and return
-        # ================================================================
-        # Generate answer using the RAG pipeline
-        # This function handles:
-        # - Query rewriting (if history exists)
-        # - Document search (hybrid or dense)
-        # - Reranking (if enabled)
-        # - Answer generation with GPT-3.5-Turbo
         result = generate_answer(
             query,
             api_key=api_key,
-            history=data.get('history', []),  # Previous conversation messages
+            history=data.get('history', []),
             use_hybrid_search=use_hybrid_search,
             use_reranker=use_reranker,
             show_thinking=show_thinking
         )
-        
-        # Format response based on whether thinking process is shown
+
         if show_thinking:
-            # If thinking is enabled, result contains: (answer, citations, thinking)
             answer, citations, thinking = result
             return jsonify({
-                "answer": answer,  # Final answer
-                "citations": citations,  # Source citations
-                "thinking": thinking  # AI reasoning process
+                "answer": answer,
+                "citations": citations,
+                "thinking": thinking
             }), 200
         else:
-            # If thinking is disabled, result contains: (answer, citations, None)
             answer, citations, _ = result
             return jsonify({
                 "answer": answer,
                 "citations": citations
             }), 200
-        
+
     except Exception as e:
-        # Log full error details for debugging
         logger.error(f"Error in query_documents: {str(e)}", exc_info=True)
         return jsonify({"error": str(e)}), 500
 
 @app.route('/api/clear', methods=['POST'])
 def clear_documents():
-    """
-    Clear Knowledge Base Endpoint
-    
-    Removes all uploaded documents and their embeddings from the system.
-    This effectively resets the knowledge base to an empty state.
-    
-    Use Cases:
-        - Starting a new session with different documents
-        - Clearing sensitive data
-        - Resetting after testing
-        
-    What it does:
-        1. Deletes the ChromaDB collection (removes all embeddings and chunks)
-        2. Deletes all files from the uploads folder
-        3. Resets the BM25 index
-        4. Clears OpenAI API key from environment variables (security)
-        
-    Returns:
-        JSON response with success status
-        
-    Example Response:
-        {
-            "success": true,
-            "message": "Knowledge base and files cleared"
-        }
-    """
+    """Clear all documents from knowledge base."""
     try:
-        # Clear the vector database (ChromaDB collection)
-        # This removes all document chunks, embeddings, and metadata
         clear_store()
-        
-        # Also delete the physical files from the uploads folder
-        # This ensures no files remain on the server
+
         for f in os.listdir(UPLOAD_FOLDER):
             file_path = os.path.join(UPLOAD_FOLDER, f)
-            # Only delete files, not directories
             if os.path.isfile(file_path):
                 os.remove(file_path)
-        
+
         return jsonify({"success": True, "message": "Knowledge base and files cleared"}), 200
     except Exception as e:
         logger.error(f"Error in clear_documents: {str(e)}", exc_info=True)
@@ -536,78 +293,35 @@ def clear_documents():
 
 @app.route('/api/status', methods=['GET'])
 def get_status():
-    """
-    Status Check Endpoint
-    
-    Returns the current status of the knowledge base and system configuration.
-    This helps the frontend know:
-    - Whether documents have been uploaded
-    - How many documents are in the system
-    - Which AI models are being used
-    
-    Returns:
-        JSON response with:
-            - has_documents: Boolean indicating if any documents exist
-            - document_count: Number of document chunks in the database
-            - chat_model: AI model used for answering questions
-            - embedding_model: AI model used for creating embeddings
-            
-    Example Response:
-        {
-            "has_documents": true,
-            "document_count": 45,
-            "chat_model": "gpt-3.5-turbo",
-            "embedding_model": "text-embedding-3-small"
-        }
-    """
+    """Get current system status and configuration."""
     try:
-        # Import ChromaDB client to check collection status
         from chromadb import PersistentClient
-        
-        # Connect to the persistent ChromaDB instance
+
         client = PersistentClient(path=".chromadb")
-        
-        # Initialize status data with defaults
-        status_data = {
-            "has_documents": False,  # No documents by default
-            "document_count": 0,  # Zero chunks by default
-            "chat_model": DEFAULT_CHAT_MODEL,  # Model used for Q&A
-            "embedding_model": DEFAULT_EMBEDDING_MODEL  # Model used for embeddings
-        }
-        
+
+        has_documents = False
+        document_count = 0
+
         try:
-            # Try to get the collection
             collection = client.get_collection(name="cliniq_docs")
-            
-            # Count documents in the collection
             count = collection.count()
-            status_data["has_documents"] = count > 0
-            status_data["document_count"] = count
+            has_documents = count > 0
+            document_count = count
         except Exception:
-            # Collection doesn't exist yet (no documents uploaded)
-            # This is normal and not an error
             pass
-            
-        return jsonify(status_data), 200
+
+        return jsonify({
+            "has_documents": has_documents,
+            "document_count": document_count,
+            "chat_model": config.LLM_CHAT_MODEL,
+            "embedding_model": config.LLM_EMBEDDING_MODEL,
+            "provider": config.LLM_PROVIDER,
+            "provider_info": config.get_provider_info()
+        }), 200
     except Exception as e:
         logger.error(f"Error in get_status: {str(e)}", exc_info=True)
         return jsonify({"error": str(e)}), 500
 
-# ============================================================================
-# APPLICATION ENTRY POINT
-# ============================================================================
-
 if __name__ == '__main__':
-    """
-    Main entry point when running the Flask app directly (not via WSGI server).
-    
-    This starts the development server with:
-    - debug=True: Enables debug mode (auto-reload, detailed errors)
-    - host='0.0.0.0': Makes server accessible from any network interface
-    - port=5000: Runs on port 5000
-    
-    For production, use a WSGI server like Gunicorn instead:
-        gunicorn -w 4 -b 0.0.0.0:5000 api:app
-    """
     logger.info("Starting ClinIQ Backend on port 5000")
     app.run(debug=True, host='0.0.0.0', port=5000)
diff --git a/backend/config.py b/backend/config.py
new file mode 100644
index 0000000..4bea4bd
--- /dev/null
+++ b/backend/config.py
@@ -0,0 +1,63 @@
+"""Configuration management for ClinIQ application."""
+
+import os
+from dotenv import load_dotenv
+
+load_dotenv()
+
+
+class Config:
+    """Application settings, read from environment variables once when this class body is evaluated at import."""
+
+    LLM_PROVIDER = os.getenv('LLM_PROVIDER', 'openai').lower()  # normalised to lower case
+    LLM_API_KEY = os.getenv('LLM_API_KEY') or os.getenv('OPENAI_API_KEY', '')  # falls back to the OpenAI variable
+    LLM_BASE_URL = os.getenv('LLM_BASE_URL', 'https://api.openai.com/v1')  # NOTE(review): default is the OpenAI URL for every provider - confirm callers expect that
+    LLM_CHAT_MODEL = os.getenv('LLM_CHAT_MODEL', 'gpt-3.5-turbo')
+    LLM_EMBEDDING_MODEL = os.getenv('LLM_EMBEDDING_MODEL', 'text-embedding-3-small')
+    TEMPERATURE = float(os.getenv('TEMPERATURE', '0.7'))  # float() raises ValueError at import on a non-numeric value
+    MAX_TOKENS = int(os.getenv('MAX_TOKENS', '1000'))  # int() raises ValueError at import on a non-integer value
+    MAX_RETRIES = int(os.getenv('MAX_RETRIES', '3'))
+    REQUEST_TIMEOUT = int(os.getenv('REQUEST_TIMEOUT', '300'))
+    VERIFY_SSL = os.getenv('VERIFY_SSL', 'true').strip().lower() not in ('false', '0', 'no', 'off')  # verify unless explicitly disabled, so a typo cannot silently turn TLS checks off
+    FLASK_ENV = os.getenv('FLASK_ENV', 'development')
+    OPENAI_API_KEY = os.getenv('OPENAI_API_KEY', '')
+
+    @classmethod
+    def get_api_key(cls, provided_key=None):
+        """Return provided_key when truthy, otherwise LLM_API_KEY, otherwise OPENAI_API_KEY (possibly an empty string)."""
+        if provided_key:
+            return provided_key
+        return cls.LLM_API_KEY or cls.OPENAI_API_KEY
+
+    @classmethod
+    def validate_config(cls):
+        """Return (ok, error_message); Ollama needs only a base URL, every other provider also needs an API key."""
+        if cls.LLM_PROVIDER == 'ollama':
+            if not cls.LLM_BASE_URL:
+                return False, "LLM_BASE_URL is required for Ollama"
+            return True, None
+
+        if not cls.LLM_API_KEY and not cls.OPENAI_API_KEY:
+            return False, f"API key is required for provider: {cls.LLM_PROVIDER}"
+
+        if not cls.LLM_BASE_URL:  # only reachable when LLM_BASE_URL is set to an empty string, since it has a default
+            return False, "LLM_BASE_URL is required"
+
+        return True, None
+
+    @classmethod
+    def get_provider_info(cls):
+        """Return a dict of the active provider settings; the API key is reported only as the boolean has_api_key."""
+        return {
+            'provider': cls.LLM_PROVIDER,
+            'base_url': cls.LLM_BASE_URL,
+            'chat_model': cls.LLM_CHAT_MODEL,
+            'embedding_model': cls.LLM_EMBEDDING_MODEL,
+            'temperature': cls.TEMPERATURE,
+            'max_tokens': cls.MAX_TOKENS,
+            'has_api_key': bool(cls.get_api_key()),
+            'verify_ssl': cls.VERIFY_SSL
+        }
+
+
+config = Config()
diff --git a/backend/requirements.txt b/backend/requirements.txt
index 0f1a2c1..c8af618 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -10,3 +10,5 @@ rank-bm25
 numpy
 scikit-learn
 nltk
+tenacity
+httpx
diff --git a/backend/services/__init__.py b/backend/services/__init__.py
new file mode 100644
index 0000000..2227650
--- /dev/null
+++ b/backend/services/__init__.py
@@ -0,0 +1,9 @@
+"""
+Services Package
+
+Contains service modules for ClinIQ application.
+"""
+
+from .llm_service import LLMService, create_llm_service
+
+__all__ = ['LLMService', 'create_llm_service']
diff --git a/backend/services/llm_service.py b/backend/services/llm_service.py
new file mode 100644
index 0000000..d8d7174
--- /dev/null
+++ b/backend/services/llm_service.py
@@ -0,0 +1,185 @@
+"""Universal LLM service supporting multiple providers."""
+
+import time
+from typing import List, Dict, Optional, Iterator
+from openai import OpenAI
+from tenacity import Retrying, retry, stop_after_attempt, wait_exponential, retry_if_exception_type
+
+
+class LLMService:
+    """Thin wrapper around the OpenAI SDK client for OpenAI-compatible providers."""
+
+    PROVIDER_BASE_URLS = {
+        'openai': 'https://api.openai.com/v1',
+        'groq': 'https://api.groq.com/openai/v1',
+        'ollama': 'http://localhost:11434/v1',
+        'openrouter': 'https://openrouter.ai/api/v1',
+        'custom': None
+    }
+
+    def __init__(
+        self,
+        provider: str = 'openai',
+        api_key: Optional[str] = None,
+        base_url: Optional[str] = None,
+        chat_model: str = 'gpt-3.5-turbo',
+        embedding_model: str = 'text-embedding-3-small',
+        temperature: float = 0.7,
+        max_tokens: int = 1000,
+        max_retries: int = 3,
+        request_timeout: int = 300,
+        verify_ssl: bool = True
+    ):
+        """Store generation defaults, resolve the endpoint and key, and build the client."""
+        self.provider = provider.lower()
+        self.chat_model = chat_model
+        self.embedding_model = embedding_model
+        self.temperature = temperature
+        self.max_tokens = max_tokens
+        self.max_retries = max_retries
+        self.request_timeout = request_timeout
+        self.verify_ssl = verify_ssl
+        # An explicit base_url wins; unrecognized providers use the OpenAI endpoint.
+        if base_url:
+            self.base_url = base_url
+        elif self.provider in self.PROVIDER_BASE_URLS:
+            self.base_url = self.PROVIDER_BASE_URLS[self.provider]
+        else:
+            self.base_url = self.PROVIDER_BASE_URLS['openai']
+        # Ollama always gets the placeholder key 'ollama'; others use api_key or ''.
+        if self.provider == 'ollama':
+            self.api_key = 'ollama'
+        else:
+            self.api_key = api_key or ''
+
+        import httpx
+        http_client = httpx.Client(verify=self.verify_ssl)
+
+        self.client = OpenAI(
+            api_key=self.api_key,
+            base_url=self.base_url,
+            timeout=self.request_timeout,
+            http_client=http_client
+        )
+
+    def _make_request_with_retry(self, request_func, *args, **kwargs):
+        """Call request_func, retrying any exception for up to max_retries attempts."""
+        retryer = Retrying(
+            stop=stop_after_attempt(max(1, self.max_retries)),  # always try at least once
+            wait=wait_exponential(multiplier=1, min=2, max=10),
+            retry=retry_if_exception_type((Exception,)),
+            reraise=True
+        )
+        return retryer(request_func, *args, **kwargs)
+
+    def create_chat_completion(
+        self,
+        messages: List[Dict[str, str]],
+        model: Optional[str] = None,
+        temperature: Optional[float] = None,
+        max_tokens: Optional[int] = None,
+        stream: bool = False,
+        **kwargs
+    ):
+        """Request a chat completion; omitted settings use the instance defaults."""
+        model = model or self.chat_model
+        temperature = temperature if temperature is not None else self.temperature
+        max_tokens = max_tokens or self.max_tokens  # 0 also falls back to the default
+
+        return self._make_request_with_retry(
+            self.client.chat.completions.create,
+            model=model,
+            messages=messages,
+            temperature=temperature,
+            max_tokens=max_tokens,
+            stream=stream,
+            **kwargs
+        )
+
+    def create_chat_completion_stream(
+        self,
+        messages: List[Dict[str, str]],
+        model: Optional[str] = None,
+        temperature: Optional[float] = None,
+        max_tokens: Optional[int] = None,
+        **kwargs
+    ) -> Iterator[str]:
+        """Yield the non-empty content deltas of a streamed chat completion."""
+        stream = self.create_chat_completion(
+            messages=messages,
+            model=model,
+            temperature=temperature,
+            max_tokens=max_tokens,
+            stream=True,
+            **kwargs
+        )
+        # A chunk may carry an empty `choices` list (e.g. usage-only chunks); skip it.
+        for chunk in stream:
+            if chunk.choices and chunk.choices[0].delta.content:
+                yield chunk.choices[0].delta.content
+
+    def create_embeddings(
+        self,
+        texts: List[str],
+        model: Optional[str] = None,
+        **kwargs
+    ) -> List[List[float]]:
+        """Embed all texts in one request and return the vectors in response order."""
+        model = model or self.embedding_model
+
+        response = self._make_request_with_retry(
+            self.client.embeddings.create,
+            model=model,
+            input=texts,
+            **kwargs
+        )
+
+        return [data.embedding for data in response.data]
+
+    def create_single_embedding(
+        self,
+        text: str,
+        model: Optional[str] = None,
+        **kwargs
+    ) -> List[float]:
+        """Embed one text via create_embeddings and return its vector."""
+        embeddings = self.create_embeddings([text], model=model, **kwargs)
+        return embeddings[0]
+
+    def get_provider_info(self) -> Dict:
+        """Describe the service settings; reports only whether a real key is set."""
+        return {
+            'provider': self.provider,
+            'base_url': self.base_url,
+            'chat_model': self.chat_model,
+            'embedding_model': self.embedding_model,
+            'temperature': self.temperature,
+            'max_tokens': self.max_tokens,
+            'verify_ssl': self.verify_ssl,
+            'has_api_key': bool(self.api_key and self.api_key != 'ollama')  # placeholder is not a key
+        }
+
+
+def create_llm_service(
+    provider: Optional[str] = None,
+    api_key: Optional[str] = None,
+    base_url: Optional[str] = None,
+    chat_model: Optional[str] = None,
+    embedding_model: Optional[str] = None,
+    **kwargs
+) -> LLMService:
+    """Build an LLMService; falsy arguments and absent kwargs fall back to app config."""
+    from config import config as app_config
+
+    tunable_names = ('temperature', 'max_tokens', 'max_retries', 'request_timeout', 'verify_ssl')
+    # Each tunable's config attribute is its upper-cased name (e.g. MAX_TOKENS).
+    tunables = {name: kwargs.get(name, getattr(app_config, name.upper())) for name in tunable_names}
+
+    return LLMService(
+        provider=provider or app_config.LLM_PROVIDER,
+        api_key=api_key or app_config.get_api_key(),
+        base_url=base_url or app_config.LLM_BASE_URL,
+        chat_model=chat_model or app_config.LLM_CHAT_MODEL,
+        embedding_model=embedding_model or app_config.LLM_EMBEDDING_MODEL,
+        **tunables
+    )
diff --git a/backend/utils/constants.py b/backend/utils/constants.py
index aff5d5a..5011ec5 100644
--- a/backend/utils/constants.py
+++ b/backend/utils/constants.py
@@ -1,25 +1,12 @@
-"""
-Constants Module
+"""Model constants configuration."""
 
-This module defines the AI models used throughout the application.
-Centralizing these constants makes it easy to:
-1. Update models in one place
-2. Experiment with different models
-3. Maintain consistency across the codebase
+import os
+from dotenv import load_dotenv
 
-These models are from OpenAI and are used for:
-- Chat/Conversation: GPT-3.5-Turbo (for generating answers)
-- Embeddings: text-embedding-3-small (for creating vector representations)
-"""
+load_dotenv()
 
-# Chat Model: Used for generating answers to user questions
-# GPT-3.5-Turbo is OpenAI's efficient and cost-effective chat model
-# It's optimized for conversational AI and question-answering tasks
-# Alternative: "gpt-4" for better quality but higher cost
-CHAT_MODEL = "gpt-3.5-turbo"
+CHAT_MODEL = os.getenv('LLM_CHAT_MODEL', 'gpt-3.5-turbo')
+EMBEDDING_MODEL = os.getenv('LLM_EMBEDDING_MODEL', 'text-embedding-3-small')
 
-# Embedding Model: Used for converting text into vector representations
-# text-embedding-3-small is OpenAI's efficient embedding model
-# Creates 1536-dimensional vectors optimized for semantic search
-# Alternative: "text-embedding-3-large" for better quality but higher cost
-EMBEDDING_MODEL = "text-embedding-3-small"
+DEFAULT_CHAT_MODEL = CHAT_MODEL
+DEFAULT_EMBEDDING_MODEL = EMBEDDING_MODEL
diff --git a/backend/utils/document_processor.py b/backend/utils/document_processor.py
index 7453c34..395fc69 100644
--- a/backend/utils/document_processor.py
+++ b/backend/utils/document_processor.py
@@ -1,14 +1,4 @@
-"""
-Document Processor Module
-
-This module handles all document-related operations:
-1. Extracting text from different file formats (PDF, DOCX, TXT)
-2. Breaking text into smaller chunks for processing
-3. Creating embeddings (vector representations) of text
-
-These functions are the foundation of the RAG system - they prepare documents
-so they can be searched and queried by the AI.
-"""
+"""Document processing utilities for text extraction, chunking, and embedding creation."""
 
 import PyPDF2
 import docx
@@ -17,241 +7,55 @@
 from openai import OpenAI
 
 def extract_text_from_pdf(file):
-    """
-    Extracts text from a PDF file, page by page.
-    
-    This function reads a PDF document and extracts all text content while
-    preserving page numbers. Page numbers are crucial for citations - they
-    allow users to see exactly where information came from in the original document.
-    
-    How it works:
-    1. Uses PyPDF2 to read the PDF file
-    2. Iterates through each page
-    3. Extracts text from each page
-    4. Returns a list of (text, page_number) tuples
-    
-    Args:
-        file: File object or file path of the PDF to process
-        
-    Returns:
-        list: List of tuples, where each tuple contains:
-            - text (str): The extracted text from a page
-            - page_number (int): The page number (1-indexed)
-            
-    Example:
-        Input: PDF with 3 pages
-        Output: [
-            ("Page 1 text content...", 1),
-            ("Page 2 text content...", 2),
-            ("Page 3 text content...", 3)
-        ]
-        
-    Note:
-        - Only extracts text, not images or tables
-        - Skips empty pages
-        - Page numbers start at 1 (not 0)
-    """
+    """Extract text from PDF file, page by page."""
     reader = PyPDF2.PdfReader(file)
     pages_text = []
-    
-    # Iterate through each page in the PDF
+
     for i, page in enumerate(reader.pages):
-        # Extract text from the current page
         extracted = page.extract_text()
-        
-        # Only add non-empty pages to the results
         if extracted:
-            # Store text with page number (1-indexed for user-friendly citations)
             pages_text.append((extracted, i + 1))
-    
+
     return pages_text
 
 def extract_text_from_docx(file):
-    """
-    Extracts text from a Microsoft Word document (.docx file).
-    
-    This function reads a Word document and extracts all text content from
-    paragraphs. Word documents don't have explicit page numbers like PDFs,
-    so we treat the entire document as a single unit.
-    
-    How it works:
-    1. Uses python-docx library to open the document
-    2. Iterates through all paragraphs
-    3. Concatenates paragraph text with newlines
-    4. Returns the complete text as a string
-    
-    Args:
-        file: File object or file path of the .docx file to process
-        
-    Returns:
-        str: All text content from the document, with paragraphs separated by newlines
-        
-    Example:
-        Input: Word document with paragraphs
-        Output: "First paragraph text...\nSecond paragraph text...\n..."
-        
-    Note:
-        - Extracts text from paragraphs only
-        - Tables and images are not extracted
-        - Preserves paragraph structure with newlines
-    """
-    # Open the Word document
+    """Extract text from Word document."""
     doc = docx.Document(file)
-    
-    # Initialize empty string to store all text
     text = ""
-    
-    # Iterate through all paragraphs in the document
+
     for para in doc.paragraphs:
-        # Add paragraph text followed by a newline
-        # This preserves the paragraph structure
         text += para.text + "\n"
-    
+
     return text
 
 def chunk_text(text, chunk_size=800, overlap=150):
-    """
-    Breaks long text into smaller, manageable chunks.
-    
-    This is a critical function for RAG (Retrieval-Augmented Generation) systems.
-    Large documents are split into smaller pieces because:
-    1. AI models have token limits (can't process entire books at once)
-    2. Smaller chunks allow more precise search and retrieval
-    3. Overlapping chunks ensure no important context is lost at boundaries
-    
-    How it works:
-    1. Encodes text into tokens (AI-readable units)
-    2. Creates chunks of specified size (800 tokens)
-    3. Overlaps chunks by 150 tokens to preserve context
-    4. Decodes tokens back to text for each chunk
-    
-    Args:
-        text (str): The text to chunk (can be very long)
-        chunk_size (int): Number of tokens per chunk (default: 800)
-                         - Larger chunks = more context but fewer chunks
-                         - Smaller chunks = less context but more precise search
-        overlap (int): Number of overlapping tokens between chunks (default: 150)
-                      - Prevents losing context at chunk boundaries
-                      - Example: If a sentence spans two chunks, overlap ensures
-                        it's captured in both
-    
-    Returns:
-        list: List of text chunks, each as a string
-        
-    Example:
-        Input: "Very long document text..." (5000 tokens)
-        Output: [
-            "Chunk 1 text (tokens 0-800)...",
-            "Chunk 2 text (tokens 650-1450)...",  # Overlaps with chunk 1
-            "Chunk 3 text (tokens 1300-2100)...",  # Overlaps with chunk 2
-            ...
-        ]
-        
-    Why Overlap Matters:
-        Without overlap, a sentence like "The patient's condition improved
-        significantly" might be split as:
-        - Chunk 1: "...condition improved"
-        - Chunk 2: "significantly..."
-        
-        With overlap, both chunks contain the full sentence, ensuring
-        the AI can understand the complete context.
-    """
-    # Get the tokenizer for the model we're using
-    # "cl100k_base" is the encoding used by GPT-3.5 and GPT-4
+    """Break text into smaller chunks with overlap."""
     enc = tiktoken.get_encoding("cl100k_base")
-    
-    # Convert text into tokens (numbers that represent words/subwords)
-    # This is how AI models understand text
     tokens = enc.encode(text)
-    
-    chunks = []  # List to store the resulting text chunks
-    start = 0  # Starting position in the token array
-    
-    # Continue chunking until we've processed all tokens
+
+    chunks = []
+    start = 0
+
     while start < len(tokens):
-        # Calculate end position for this chunk
         end = start + chunk_size
-        
-        # Extract tokens for this chunk
         chunk_tokens = tokens[start:end]
-        
-        # Convert tokens back to readable text
         chunk_text = enc.decode(chunk_tokens)
         chunks.append(chunk_text)
-        
-        # Move start position forward, accounting for overlap
-        # Example: chunk_size=800, overlap=150
-        # Chunk 1: tokens 0-800
-        # Chunk 2: tokens 650-1450 (starts at 800-150=650, overlaps by 150)
         start += (chunk_size - overlap)
-        
+
     return chunks
 
 def create_embeddings(chunks, api_key):
-    """
-    Converts text chunks into embeddings (vector representations).
-    
-    Embeddings are the "digital fingerprints" of text. They convert words
-    into numbers (vectors) that capture the meaning of the text. This allows
-    the system to:
-    1. Search documents by meaning, not just keywords
-    2. Find similar content even if different words are used
-    3. Perform fast similarity calculations
-    
-    How it works:
-    1. Sends text chunks to OpenAI's embedding API
-    2. OpenAI's model converts each chunk into a vector (list of numbers)
-    3. Returns embeddings that can be stored and searched
-    
-    What are Embeddings?
-        - A vector (list of numbers) that represents text meaning
-        - Similar texts have similar vectors
-        - Example: "heart attack" and "myocardial infarction" have similar embeddings
-        - Dimensions: text-embedding-3-small creates 1536-dimensional vectors
-        
-    Args:
-        chunks (list): List of text strings to convert to embeddings
-        api_key (str): OpenAI API key for authentication
-        
-    Returns:
-        list: List of embedding vectors, one for each input chunk
-              Each embedding is a list of 1536 numbers (floats)
-              
-    Example:
-        Input: ["Patient has diabetes", "Medication dosage is 10mg"]
-        Output: [
-            [0.123, -0.456, 0.789, ...],  # 1536 numbers for first chunk
-            [0.234, -0.567, 0.890, ...]  # 1536 numbers for second chunk
-        ]
-        
-    API Details:
-        - Model: text-embedding-3-small (OpenAI's efficient embedding model)
-        - Cost: Very cheap compared to chat models
-        - Speed: Fast batch processing
-        - Quality: Optimized for semantic similarity search
-        
-    Note:
-        - This function makes an API call to OpenAI
-        - All chunks are processed in a single API call (batch processing)
-        - The API key is used only in memory and not persisted
-    """
-    # Create OpenAI client instance
-    # API key is passed directly to the client and not stored in environment
+    """Convert text chunks into embeddings using configured LLM provider."""
+    from services.llm_service import create_llm_service
+
     try:
-        client = OpenAI(api_key=api_key)
-        
-        # Call OpenAI's embedding API
-        # This converts all text chunks into vector representations in one call
-        response = client.embeddings.create(
-            input=chunks,  # List of text chunks to embed
-            model="text-embedding-3-small"  # OpenAI's efficient embedding model
-        )
+        llm_service = create_llm_service(api_key=api_key)
+        embeddings = llm_service.create_embeddings(chunks)
+        return embeddings
+
     except Exception as e:
         error_msg = str(e)
         if '401' in error_msg or 'Unauthorized' in error_msg or 'authentication' in error_msg.lower():
-            raise ValueError(f"Invalid OpenAI API key. Please check your API key and try again.")
-        raise Exception(f"OpenAI API error: {error_msg}")
-    
-    # Extract embeddings from the response
-    # Each data object contains one embedding vector
-    return [data.embedding for data in response.data]
+            raise ValueError(f"Invalid API key. Please check your API key and try again.")
+        raise Exception(f"LLM API error: {error_msg}")
diff --git a/backend/utils/rag_pipeline.py b/backend/utils/rag_pipeline.py
index 3751ab1..3c3b741 100644
--- a/backend/utils/rag_pipeline.py
+++ b/backend/utils/rag_pipeline.py
@@ -1,226 +1,64 @@
-"""
-RAG (Retrieval-Augmented Generation) Pipeline Module
+"""RAG (Retrieval-Augmented Generation) pipeline for question-answering."""
 
-This module implements the core question-answering system using RAG architecture.
-RAG combines two powerful techniques:
-1. RETRIEVAL: Finds relevant information from documents
-2. GENERATION: Uses AI to generate answers based on retrieved information
-
-The pipeline works in these stages:
-1. Query Rewriting: Makes follow-up questions self-contained
-2. Document Retrieval: Searches for relevant document chunks
-3. Reranking: Re-orders results by relevance
-4. Answer Generation: Creates answer using retrieved context
-
-This ensures answers are grounded in the actual documents, not hallucinated.
-"""
-
-from openai import OpenAI
 import os
+from services.llm_service import create_llm_service
 from utils.vector_store import search_documents, hybrid_search, rerank_chunks
 from utils.constants import CHAT_MODEL, EMBEDDING_MODEL
 
-# Export model constants for use in other modules
 DEFAULT_CHAT_MODEL = CHAT_MODEL
 DEFAULT_EMBEDDING_MODEL = EMBEDDING_MODEL
 
 def _get_context_and_citations(query, api_key, use_hybrid_search, use_reranker):
-    """
-    THE RESEARCHER: Finds relevant document chunks for answering questions.
-    
-    This is the retrieval stage of RAG. It searches through all uploaded documents
-    to find the most relevant pieces of information that can answer the user's question.
-    
-    Process:
-    1. Converts the question into an embedding (vector representation)
-    2. Searches documents using either hybrid or dense search
-    3. Optionally reranks results for better relevance
-    4. Extracts top chunks with their source information
-    5. Formats context and citations for the AI to use
-    
-    Args:
-        query (str): The user's question or search query
-        api_key (str): OpenAI API key for creating query embeddings
-        use_hybrid_search (bool): If True, uses hybrid search (semantic + keyword)
-                                 If False, uses only semantic (dense) search
-        use_reranker (bool): If True, reranks results by relevance
-                            If False, uses original search order
-    
-    Returns:
-        tuple: (context_text, citations)
-            - context_text (str): Formatted text containing relevant document chunks
-                                 Each chunk includes the content and source info
-            - citations (list): List of citation strings for the retrieved chunks
-                              Format: ["Source: filename.pdf | Page: 5", ...]
-    
-    Example:
-        Input:
-            query = "What are the side effects?"
-            use_hybrid_search = True
-            use_reranker = True
-        
-        Output:
-            context_text = "Content: Side effects include nausea...\nSource: med_guide.pdf | Page: 3\n\n..."
-            citations = ["Source: med_guide.pdf | Page: 3", "Source: med_guide.pdf | Page: 4"]
-    
-    Why This Matters:
-        - Better context = Better answers
-        - Citations allow users to verify information
-        - Hybrid search finds more comprehensive results
-        - Reranking ensures most relevant chunks are used
-    """
-    client = OpenAI(api_key=api_key)
-
-    # ========================================================================
-    # STEP 1: Convert question to embedding
-    # ========================================================================
-    # Embeddings allow semantic search - finding documents by meaning
-    # Example: "heart attack" and "myocardial infarction" have similar embeddings
-    emb_response = client.embeddings.create(
-        input=query,  # The user's question
-        model=DEFAULT_EMBEDDING_MODEL  # text-embedding-3-small
-    )
-    # Extract the embedding vector (list of 1536 numbers)
-    query_embedding = emb_response.data[0].embedding
+    """Retrieve relevant document chunks for the query."""
+    llm_service = create_llm_service(api_key=api_key)
+    query_embedding = llm_service.create_single_embedding(query)
 
-    # ========================================================================
-    # STEP 2: Search for relevant document chunks
-    # ========================================================================
-    # Choose between hybrid search (recommended) or dense search only
     if use_hybrid_search:
-        # Hybrid search combines:
-        # - Dense search: Finds by meaning (semantic similarity)
-        # - Sparse search: Finds by keywords (BM25 algorithm)
-        # - Reciprocal Rank Fusion: Combines both results intelligently
-        # This gives the best of both worlds: meaning + keywords
         results = hybrid_search(query_embedding, query, top_k=15)
     else:
-        # Dense search only: Uses semantic similarity
-        # Good for conceptual questions but may miss specific terms
         results = search_documents(query_embedding, top_k=15)
 
-    # ========================================================================
-    # STEP 3: Extract chunks from search results
-    # ========================================================================
-    # Convert ChromaDB results format into our internal format
     initial_chunks = []
     if results['documents'] and results['documents'][0]:
-        # Iterate through each retrieved document chunk
         for i, doc in enumerate(results['documents'][0]):
-            # Get metadata (source file, page number, chunk ID)
             meta = results['metadatas'][0][i]
-            # Store as tuple: (document_text, metadata, score)
-            # Score is set to 1.0 as placeholder (will be updated by reranking)
             initial_chunks.append((doc, meta, 1.0))
 
-    # ========================================================================
-    # STEP 4: Rerank chunks by relevance (optional)
-    # ========================================================================
-    # Reranking improves answer quality by ensuring the most relevant chunks
-    # are used, even if they weren't ranked highest by the initial search
     if use_reranker and initial_chunks:
-        # Re-ranks chunks using cosine similarity with the query
-        # This is a second pass to refine the results
         reranked_chunks = rerank_chunks(query_embedding, initial_chunks, top_k=7)
     else:
-        # If reranking is disabled, just take the top 7 chunks as-is
         reranked_chunks = initial_chunks[:7]
 
-    # ========================================================================
-    # STEP 5: Build context string and citations
-    # ========================================================================
-    # Format the chunks into a context string that the AI can use
-    # Also create a list of citations for the user to see
     context_text = ""
     citations = []
-    
+
     if reranked_chunks:
-        # Process each relevant chunk
         for doc, meta, score in reranked_chunks:
-            # Extract source information
-            source = meta.get('source', 'Unknown')  # Filename
-            page = meta.get('page', 'Unknown')  # Page number
-            
-            # Format: Content + Source information
-            # This format helps the AI understand where information came from
+            source = meta.get('source', 'Unknown')
+            page = meta.get('page', 'Unknown')
+
             context_line = f"Content: {doc}\nSource: {source} | Page: {page}\n\n"
             context_text += context_line
-            
-            # Create citation for user display
             citations.append(f"Source: {source} | Page: {page}")
     else:
-        # No relevant chunks found
         context_text = "No relevant context found in documents."
-    
+
     return context_text, citations
 
 def rewrite_query(query, history, api_key):
-    """
-    THE QUERY REWRITER: Makes follow-up questions self-contained for search.
-    
-    In conversations, users often ask follow-up questions that reference
-    previous context. For example:
-    - User: "What is Diabetes?"
-    - AI: "Diabetes is..."
-    - User: "What about treatment?"  ← This needs context!
-    
-    This function rewrites the follow-up question to include the context,
-    making it searchable: "What about treatment?" → "Treatment for Diabetes"
-    
-    How it works:
-    1. Checks if there's conversation history
-    2. If no history, returns the query as-is
-    3. If history exists, uses GPT to rewrite the question
-    4. Returns a self-contained search query
-    
-    Args:
-        query (str): The current user question (may be a follow-up)
-        history (list): Previous conversation messages
-                       Format: [{"role": "user", "content": "..."}, ...]
-        api_key (str): OpenAI API key for the rewriting model
-    
-    Returns:
-        str: Rewritten query that's self-contained and searchable
-        
-    Example:
-        Input:
-            query = "What about treatment?"
-            history = [
-                {"role": "user", "content": "What is Diabetes?"},
-                {"role": "assistant", "content": "Diabetes is a condition..."}
-            ]
-        
-        Output:
-            "Recommended treatment for Diabetes"
-    
-    Why This Matters:
-        - Makes follow-up questions searchable
-        - Improves retrieval quality in conversations
-        - Enables natural conversation flow
-    """
-    # If no history, the query is already self-contained
+    """Rewrite follow-up questions to be self-contained."""
     if not history:
         return query
 
-    client = OpenAI(api_key=api_key)
-    
-    # ========================================================================
-    # Format conversation history for the rewriter
-    # ========================================================================
+    llm_service = create_llm_service(api_key=api_key)
+
     history_text = ""
-    # Only use last 3 conversation rounds for efficiency
-    # This keeps the prompt manageable while maintaining recent context
     for msg in history[-3:]:
-        # Determine role label
         role = "User" if msg['role'] == 'user' else "Assistant"
-        # Truncate content to 200 chars to keep prompt size reasonable
         content = msg['content'][:200]
         history_text += f"{role}: {content}\n"
 
-    # ========================================================================
-    # Create prompt for query rewriting
-    # ========================================================================
-    REWRITE_PROMPT = f"""Given the following conversation history and a follow-up question, rewrite the follow-up question to be a standalone search query that can be used to find relevant documents. 
+    REWRITE_PROMPT = f"""Given the following conversation history and a follow-up question, rewrite the follow-up question to be a standalone search query that can be used to find relevant documents.
 
 History:
 {history_text}
@@ -229,115 +67,32 @@ def rewrite_query(query, history, api_key):
 
 Standalone Query:"""
 
-    # ========================================================================
-    # Call GPT to rewrite the query
-    # ========================================================================
-    response = client.chat.completions.create(
-        model=CHAT_MODEL,  # GPT-3.5-Turbo
-        messages=[{"role": "user", "content": REWRITE_PROMPT}]
-    )
-    
-    # Extract and return the rewritten query
+    messages = [{"role": "user", "content": REWRITE_PROMPT}]
+    response = llm_service.create_chat_completion(messages=messages)
+
     rewritten = response.choices[0].message.content.strip()
     return rewritten
 
 def generate_answer(query, api_key=None, history=[], use_hybrid_search=True, use_reranker=True, show_thinking=False):
-    """
-    THE CLINICAL ASSISTANT: Core function that generates answers using RAG.
-    
-    This is the main function that orchestrates the entire RAG pipeline:
-    1. Rewrites the query if there's conversation history
-    2. Retrieves relevant document chunks
-    3. Generates an answer using GPT with the retrieved context
-    4. Returns answer with citations
-    
-    The RAG (Retrieval-Augmented Generation) approach ensures:
-    - Answers are grounded in actual documents (not hallucinated)
-    - Sources are tracked and cited
-    - Answers are accurate and verifiable
-    
-    Args:
-        query (str): The user's question
-        api_key (str, optional): OpenAI API key. If not provided, tries environment variable
-        history (list, optional): Conversation history for context
-                                 Format: [{"role": "user", "content": "..."}, ...]
-        use_hybrid_search (bool): Enable hybrid search (default: True)
-        use_reranker (bool): Enable reranking (default: True)
-        show_thinking (bool): Show AI reasoning process (default: False)
-    
-    Returns:
-        tuple: (answer, citations, thinking)
-            - answer (str): The generated answer
-            - citations (list): List of source citations
-            - thinking (str or None): AI reasoning process if show_thinking=True
-    
-    Example:
-        Input:
-            query = "What are the contraindications?"
-            history = []
-            use_hybrid_search = True
-            use_reranker = True
-            show_thinking = False
-        
-        Output:
-            (
-                "Based on the document, contraindications include...",
-                ["Source: guide.pdf | Page: 5"],
-                None
-            )
-    
-    Process Flow:
-        1. Query Rewriting → Makes question searchable
-        2. Document Retrieval → Finds relevant chunks
-        3. Reranking → Refines results
-        4. Answer Generation → GPT creates answer from context
-        5. Response Formatting → Returns answer with citations
-    """
-    # ========================================================================
-    # Validate API key
-    # ========================================================================
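+    """Generate an answer using the RAG pipeline; returns a tuple (answer, citations, thinking), where thinking is None unless show_thinking is set."""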
     if not api_key:
         api_key = os.environ.get("OPENAI_API_KEY")
     if not api_key:
         raise ValueError("OpenAI API Key not found")
 
-    client = OpenAI(api_key=api_key)
+    llm_service = create_llm_service(api_key=api_key)
 
-    # ========================================================================
-    # STEP 1: Rewrite query for better search (if history exists)
-    # ========================================================================
-    # Makes follow-up questions self-contained
-    # Example: "What about treatment?" → "Treatment for Diabetes"
     search_query = rewrite_query(query, history, api_key)
-    
-    # ========================================================================
-    # STEP 2: Retrieve relevant document chunks
-    # ========================================================================
-    # This function:
-    # - Searches documents (hybrid or dense)
-    # - Reranks results (if enabled)
-    # - Returns formatted context and citations
     context_text, citations = _get_context_and_citations(
-        search_query, 
-        api_key, 
-        use_hybrid_search, 
+        search_query,
+        api_key,
+        use_hybrid_search,
         use_reranker
     )
 
-    # ========================================================================
-    # STEP 3: Prepare messages for GPT
-    # ========================================================================
     messages = []
-    
-    # ========================================================================
-    # System prompt: Defines the AI's role and behavior
-    # ========================================================================
-    # The system prompt is crucial - it tells GPT:
-    # - What its role is (clinical assistant)
-    # - What rules to follow (cite sources, use only context, etc.)
-    # - How to format answers
+
     if show_thinking:
-        # Version with thinking process enabled
         system_content = f"""You are ClinIQ, an AI assistant for healthcare professionals.
 
 RULES:
@@ -353,7 +108,6 @@ def generate_answer(query, api_key=None, history=[], use_hybrid_search=True, use
 CONTEXT:
 {context_text}"""
     else:
-        # Version without thinking process (faster, cleaner output)
         system_content = f"""You are ClinIQ, an AI assistant for healthcare professionals.
 
 RULES:
@@ -368,111 +122,46 @@ def generate_answer(query, api_key=None, history=[], use_hybrid_search=True, use
 CONTEXT:
 {context_text}"""
 
-    # Add system message (defines AI behavior)
     messages.append({"role": "system", "content": system_content})
-    
-    # ========================================================================
-    # Add conversation history for context
-    # ========================================================================
-    # This allows the AI to understand the conversation flow
-    # Example: If user asked about "Diabetes" before, it knows what "it" refers to
+
     for msg in history:
         messages.append({"role": msg['role'], "content": msg['content']})
-    
-    # ========================================================================
-    # Add the current question
-    # ========================================================================
-    # Format the question based on whether thinking is enabled
+
     final_query = query
     if show_thinking:
-        # Request thinking process before answer
         final_query = f"Question: {query}\n\nFirst, think step-by-step. Then provide your final answer with inline citations.\n\nThinking process:"
     else:
-        # Direct answer request
         final_query = f"Question: {query}\n\nAnswer with inline citations:"
-        
+
     messages.append({"role": "user", "content": final_query})
 
-    # ========================================================================
-    # STEP 4: Generate answer using GPT
-    # ========================================================================
-    # Send the complete message history to GPT
-    # GPT will:
-    # - Read the system prompt (knows its role and rules)
-    # - Review conversation history (understands context)
-    # - Read the retrieved document chunks (has information to answer)
-    # - Generate an answer based ONLY on the provided context
-    response = client.chat.completions.create(
-        model=DEFAULT_CHAT_MODEL,  # GPT-3.5-Turbo
+    response = llm_service.create_chat_completion(
         messages=messages,
-        max_tokens=1000,  # Allow longer responses
-        temperature=0.3   # Lower temperature for more consistent, focused responses
+        temperature=0.3
     )
 
-    # Extract the generated answer
     answer = response.choices[0].message.content
 
-    # ========================================================================
-    # STEP 5: Parse and return response
-    # ========================================================================
     if show_thinking:
-        # Separate thinking process from final answer
         thinking, final_answer = parse_thinking_and_answer(answer)
         return final_answer, citations, thinking
     else:
-        # Return answer with citations (no thinking)
         return answer, citations, None
 
 def generate_answer_stream(query, api_key=None, history=[], use_hybrid_search=True, use_reranker=True, show_thinking=False):
-    """
-    Streaming version of generate_answer - sends answer in real-time chunks.
-    
-    This function works the same as generate_answer, but instead of waiting
-    for the complete answer, it streams chunks as they're generated. This
-    provides a better user experience with a typing effect.
-    
-    How streaming works:
-    1. Same retrieval process as generate_answer
-    2. Calls GPT with stream=True
-    3. Yields answer chunks as they're generated
-    4. Frontend receives chunks and displays them in real-time
-    
-    Args:
-        Same as generate_answer()
-    
-    Yields:
-        str: JSON strings containing answer chunks
-            Format: '{"type": "content", "content": "chunk text"}'
-            Or: '{"type": "metadata", "citations": [...]}'
-    
-    Example Usage:
-        for chunk in generate_answer_stream("What is diabetes?"):
-            # Process each chunk as it arrives
-            data = json.loads(chunk)
-            if data["type"] == "content":
-                print(data["content"], end="")  # Print without newline
-    
-    Benefits:
-        - Better UX: Users see answer appearing in real-time
-        - Perceived faster response (don't wait for complete answer)
-        - More engaging interaction
-    """
-    # Validate API key (same as generate_answer)
+    """Generate an answer as a stream, yielding JSON strings whose "type" is metadata, thinking, content, or clear_citations."""
     if not api_key:
         api_key = os.environ.get("OPENAI_API_KEY")
     if not api_key:
         raise ValueError("OpenAI API Key not found")
 
-    client = OpenAI(api_key=api_key)
+    llm_service = create_llm_service(api_key=api_key)
 
-    # Same retrieval process as generate_answer
     search_query = rewrite_query(query, history, api_key)
     context_text, citations = _get_context_and_citations(search_query, api_key, use_hybrid_search, use_reranker)
 
-    # Prepare messages (same as generate_answer)
     messages = []
-    
-    # System prompt (same logic as generate_answer)
+
     if show_thinking:
         system_content = f"""You are ClinIQ, an AI assistant for healthcare professionals.
 
@@ -504,15 +193,12 @@ def generate_answer_stream(query, api_key=None, history=[], use_hybrid_search=Tr
 {context_text}"""
 
     messages.append({"role": "system", "content": system_content})
-    
-    # Add history
+
     for msg in history:
         messages.append({"role": msg['role'], "content": msg['content']})
-        
-    # Format query based on whether thinking process is requested
+
     final_query = query
     if show_thinking:
-        # Strict instructions to avoid repeating the answer in the thinking section
         final_query = (
             f"Question: {query}\n\n"
             "Instructions:\n"
@@ -530,89 +216,58 @@ def generate_answer_stream(query, api_key=None, history=[], use_hybrid_search=Tr
 
     messages.append({"role": "user", "content": final_query})
 
-    # Send citations as metadata first (consistent with user request)
     import json
     yield json.dumps({"type": "metadata", "citations": citations})
 
-    # ========================================================================
-    # Stream the answer with robust buffering
-    # ========================================================================
-    # Call GPT with stream=True to get chunks as they're generated
-    response = client.chat.completions.create(
-        model=DEFAULT_CHAT_MODEL,
-        messages=messages,
-        stream=True,
-        max_tokens=1000,  # Allow longer responses
-        temperature=0.3    # Lower temperature for more consistent, focused responses
-    )
-
     full_response = ""
     if show_thinking:
-        # ROBUST BUFFERING: Detects the marker even if split across chunks
         full_buffer = ""
         found_final_answer = False
         sent_thinking_len = 0
-        # Try multiple possible separators
         possible_markers = ["Final Answer:", "\n\n**\n", "\n**\n\n", "\n\n**"]
         found_marker = None
-        
-        for chunk in response:
-            if chunk.choices and chunk.choices[0].delta.content:
-                new_text = chunk.choices[0].delta.content
-                full_buffer += new_text
-                full_response += new_text
-                
-                if not found_final_answer:
-                    # Check for any of the possible markers
-                    for marker in possible_markers:
-                        if marker in full_buffer:
-                            found_marker = marker
-                            break
-                    
-                    if found_marker:
-                        # Marker FOUND!
-                        found_final_answer = True
-                        parts = full_buffer.split(found_marker, 1)
-                        
-                        # Send the remaining thinking text before the marker
-                        thinking_to_send = parts[0][sent_thinking_len:].strip()
-                        if thinking_to_send:
-                            yield json.dumps({"type": "thinking", "content": thinking_to_send})
-                        
-                        # Send everything after the marker as final answer content
-                        if parts[1].strip():
-                            yield json.dumps({"type": "content", "content": parts[1]})
-                    else:
-                        # Marker NOT YET found.
-                        # Send text that is "safe" (far enough from the end to not be a partial marker)
-                        # Use longest possible marker length for safety
-                        max_marker_len = max(len(m) for m in possible_markers)
-                        safe_len = max(0, len(full_buffer) - max_marker_len - 5)
-                        if safe_len > sent_thinking_len:
-                            to_send = full_buffer[sent_thinking_len:safe_len]
-                            yield json.dumps({"type": "thinking", "content": to_send})
-                            sent_thinking_len = safe_len
+
+        for content_chunk in llm_service.create_chat_completion_stream(messages=messages, temperature=0.3):
+            new_text = content_chunk
+            full_buffer += new_text
+            full_response += new_text
+
+            if not found_final_answer:
+                for marker in possible_markers:
+                    if marker in full_buffer:
+                        found_marker = marker
+                        break
+
+                if found_marker:
+                    found_final_answer = True
+                    parts = full_buffer.split(found_marker, 1)
+
+                    thinking_to_send = parts[0][sent_thinking_len:].strip()
+                    if thinking_to_send:
+                        yield json.dumps({"type": "thinking", "content": thinking_to_send})
+
+                    if parts[1].strip():
+                        yield json.dumps({"type": "content", "content": parts[1]})
                 else:
-                    # After marker, everything is content
-                    yield json.dumps({"type": "content", "content": new_text})
-        
-        # FLUSH REMAINING BUFFER: Send any unsent content at the end of the stream
+                    max_marker_len = max(len(m) for m in possible_markers)
+                    safe_len = max(0, len(full_buffer) - max_marker_len - 5)
+                    if safe_len > sent_thinking_len:
+                        to_send = full_buffer[sent_thinking_len:safe_len]
+                        yield json.dumps({"type": "thinking", "content": to_send})
+                        sent_thinking_len = safe_len
+            else:
+                yield json.dumps({"type": "content", "content": new_text})
+
         if not found_final_answer and sent_thinking_len < len(full_buffer):
-            # No "Final Answer:" marker found, send remaining buffer as thinking
             remaining = full_buffer[sent_thinking_len:].strip()
             if remaining:
                 yield json.dumps({"type": "thinking", "content": remaining})
     else:
-        # Simple mode: send everything as content
-        for chunk in response:
-            if chunk.choices and chunk.choices[0].delta.content:
-                new_text = chunk.choices[0].delta.content
-                full_response += new_text
-                yield json.dumps({"type": "content", "content": new_text})
+        for content_chunk in llm_service.create_chat_completion_stream(messages=messages, temperature=0.3):
+            new_text = content_chunk
+            full_response += new_text
+            yield json.dumps({"type": "content", "content": new_text})
 
-    # ========================================================================
-    # CITATION CLEARING: If we find "I don't have that information", clear citations
-    # ========================================================================
     no_info_phrases = [
         "i don't have that information",
         "don't have that information",
@@ -630,56 +285,23 @@ def generate_answer_stream(query, api_key=None, history=[], use_hybrid_search=Tr
         yield json.dumps({"type": "clear_citations"})
 
 def parse_thinking_and_answer(response_text):
-    """
-    Separates the AI's thinking process from the final answer.
-    
-    When show_thinking=True, GPT generates both a reasoning process
-    and a final answer. This function parses the response to separate them.
-    
-    How it works:
-    1. Looks for common separators like "Final Answer:" or "Answer:"
-    2. If found, splits on the separator
-    3. If not found, tries to detect where thinking ends
-    4. Returns (thinking, answer) tuple
-    
-    Args:
-        response_text (str): The complete response from GPT
-        
-    Returns:
-        tuple: (thinking, answer)
-            - thinking (str): The reasoning process
-            - answer (str): The final answer
-            
-    Example:
-        Input:
-            "Let me think about this...\n\nFinal Answer: The treatment is..."
-        
-        Output:
-            ("Let me think about this...", "The treatment is...")
-    """
-    # Look for common separators between thinking and answer
+    """Split a model response into a (thinking, answer) tuple; thinking is an empty string when no separator or answer-like line is found."""
     separators = ["\n\nFinal Answer:", "\n\nAnswer:", "\n\n**Final Answer:**", "\n\n**Answer:**"]
 
-    # Try each separator
     for separator in separators:
         if separator in response_text:
-            # Split on the separator
             parts = response_text.split(separator, 1)
             if len(parts) == 2:
                 thinking = parts[0].strip()
                 answer = parts[1].strip()
                 return thinking, answer
 
-    # If no clear separator found, try to detect where thinking ends
     lines = response_text.split('\n')
     if len(lines) > 1:
-        # Look for lines that start with answer-like phrases
         for i, line in enumerate(lines):
             if line.strip().startswith(('The answer is', 'According to', 'Based on', 'The patient')):
-                # Assume everything before this line is thinking
                 thinking = '\n'.join(lines[:i]).strip()
                 answer = '\n'.join(lines[i:]).strip()
                 return thinking, answer
 
-    # Fallback: treat everything as answer (no thinking detected)
     return "", response_text
diff --git a/backend/utils/vector_store.py b/backend/utils/vector_store.py
index 4363175..3e57b0e 100644
--- a/backend/utils/vector_store.py
+++ b/backend/utils/vector_store.py
@@ -1,21 +1,4 @@
-"""
-Vector Store Module - Document Storage and Search System
-
-This module handles all operations related to storing and searching documents:
-1. ChromaDB Operations: Vector database for storing embeddings
-2. Dense Search: Semantic similarity search using embeddings
-3. Sparse Search: Keyword-based search using BM25 algorithm
-4. Hybrid Search: Combines dense + sparse search using RRF
-5. Reranking: Re-orders results by relevance using cosine similarity
-
-The module implements a sophisticated search system that combines:
-- Semantic understanding (dense search)
-- Keyword matching (sparse search)
-- Intelligent fusion (RRF)
-- Relevance refinement (reranking)
-
-This ensures the most relevant document chunks are retrieved for answering questions.
-"""
+"""Vector store operations for document storage and retrieval."""
 
 import chromadb
 from chromadb.config import Settings
@@ -25,273 +8,73 @@
 from typing import List, Dict, Any, Tuple
 from sklearn.metrics.pairwise import cosine_similarity
 
-# ============================================================================
-# CHROMADB OPERATIONS
-# ============================================================================
-
 def initialize_chromadb():
-    """
-    Initializes ChromaDB - The Vector Database Storage System.
-    
-    ChromaDB is a vector database optimized for storing and searching embeddings.
-    Think of it as a specialized database that:
-    - Stores document chunks and their embeddings (vector representations)
-    - Performs fast similarity searches
-    - Maintains metadata (source file, page number, chunk ID)
-    
-    How it works:
-    1. Creates or connects to a persistent database at ".chromadb" folder
-    2. Creates or retrieves a collection named "cliniq_docs"
-    3. Returns the collection for storing/querying documents
-    
-    What is a Vector Database?
-        - Traditional databases: Search by exact matches (SQL WHERE clauses)
-        - Vector databases: Search by similarity (find "similar" vectors)
-        - Example: Query "heart attack" finds documents about "myocardial infarction"
-                  because their embeddings are similar
-    
-    Returns:
-        Collection: ChromaDB collection object for storing and querying documents
-        
-    Persistence:
-        - Data is stored in ".chromadb" folder on disk
-        - Survives application restarts
-        - Can be cleared with clear_store() function
-    """
-    # Use persistent client - data is saved to disk
-    # This means documents persist even after the application restarts
+    """Initialize ChromaDB persistent client and collection."""
     client = chromadb.PersistentClient(path=".chromadb")
-    
-    # Get or create the collection
-    # If it doesn't exist, ChromaDB creates it automatically
-    # If it exists, ChromaDB retrieves it with all existing data
     collection = client.get_or_create_collection(name="cliniq_docs")
-    
     return collection
 
 def add_documents(collection, documents, embeddings, metadata):
-    """
-    Adds document chunks to the vector database.
-    
-    This function stores the processed document chunks so they can be searched later.
-    Each chunk is stored with:
-    - The original text (for display and context)
-    - Its embedding vector (for similarity search)
-    - Metadata (source file, page number, chunk ID)
-    
-    How it works:
-    1. Generates unique IDs for each chunk (required by ChromaDB)
-    2. Stores chunks, embeddings, and metadata in the collection
-    3. ChromaDB indexes everything for fast retrieval
-    
-    Args:
-        collection: ChromaDB collection object (from initialize_chromadb)
-        documents (list): List of text chunks (strings)
-        embeddings (list): List of embedding vectors (lists of floats)
-                         Each embedding is 1536 numbers representing the text's meaning
-        metadata (list): List of metadata dictionaries
-                        Each dict contains: {"source": "filename.pdf", "page": 5, "chunk_id": 0}
-    
-    Example:
-        Input:
-            documents = ["Chunk 1 text...", "Chunk 2 text..."]
-            embeddings = [[0.1, 0.2, ...], [0.3, 0.4, ...]]  # 1536 numbers each
-            metadata = [{"source": "doc.pdf", "page": 1}, {"source": "doc.pdf", "page": 2}]
-        
-        Result:
-            Both chunks are stored in ChromaDB and can be searched immediately
-    
-    Why Unique IDs?
-        ChromaDB requires unique identifiers for each document.
-        UUIDs ensure no conflicts even if the same document is uploaded multiple times.
-    """
-    # ChromaDB requires unique IDs for each document
-    # Generate UUIDs (Universally Unique Identifiers) for each chunk
+    """Add document chunks to the vector database."""
     import uuid
     ids = [str(uuid.uuid4()) for _ in range(len(documents))]
-    
-    # Add all chunks to the collection in one operation
-    # This is efficient - ChromaDB handles indexing automatically
+
     collection.add(
-        documents=documents,  # Original text chunks
-        embeddings=embeddings,  # Vector representations
-        metadatas=metadata,  # Source information
-        ids=ids  # Unique identifiers
+        documents=documents,
+        embeddings=embeddings,
+        metadatas=metadata,
+        ids=ids
     )
 
 def search_documents(query_embedding, top_k=3):
-    """
-    DENSE SEARCH: Finds documents by semantic meaning.
-    
-    This is semantic/semantic similarity search. It finds documents based on
-    meaning, not just keywords. This is powerful because:
-    - "Heart attack" and "myocardial infarction" are found together
-    - "High blood pressure" and "hypertension" are treated as similar
-    - Works even if different words are used
-    
-    How it works:
-    1. Takes the query embedding (vector representation of the question)
-    2. ChromaDB calculates similarity between query and all stored embeddings
-    3. Returns the top_k most similar document chunks
-    
-    What is Dense Search?
-        - Uses embeddings (dense vectors) to find similar content
-        - Measures semantic similarity (meaning-based)
-        - Example: Query "cardiac event" finds documents about "heart problems"
-                  because their embeddings are close in vector space
-    
-    Args:
-        query_embedding (list): Embedding vector of the query (1536 numbers)
-        top_k (int): Number of results to return (default: 3)
-    
-    Returns:
-        dict: ChromaDB results in format:
-            {
-                'documents': [[chunk1, chunk2, chunk3]],  # Nested list format
-                'metadatas': [[meta1, meta2, meta3]],
-                'distances': [[0.1, 0.2, 0.3]]  # Lower = more similar
-            }
-    
-    Example:
-        Query: "What are the symptoms?"
-        Finds: Documents about "clinical manifestations", "patient presentation", etc.
-               Even if they don't contain the exact word "symptoms"
-    """
-    # Connect to the persistent ChromaDB instance
+    """Search documents using semantic similarity (dense search)."""
     client = chromadb.PersistentClient(path=".chromadb")
     collection = client.get_collection(name="cliniq_docs")
-    
-    # Query the collection using the query embedding
-    # ChromaDB performs vector similarity search internally
+
     results = collection.query(
-        query_embeddings=[query_embedding],  # Query vector
-        n_results=top_k  # Number of results to return
+        query_embeddings=[query_embedding],
+        n_results=top_k
     )
-    
+
     return results
 
 def clear_store():
-    """
-    Clears all documents from the vector database.
-    
-    This function deletes the entire ChromaDB collection, removing:
-    - All document chunks
-    - All embeddings
-    - All metadata
-    - All search indexes
-    
-    Use Cases:
-        - Starting a new session
-        - Clearing sensitive data
-        - Resetting after testing
-    
-    Note:
-        - This is a destructive operation (cannot be undone)
-        - Physical files in uploads/ folder are NOT deleted here
-        - Use with caution in production environments
-        - Also clears any API keys from environment variables for security
-    """
+    """Delete the document collection from the vector database and remove OPENAI_API_KEY from the process environment."""
     client = chromadb.PersistentClient(path=".chromadb")
     try:
-        # Delete the entire collection
         client.delete_collection(name="cliniq_docs")
     except Exception as e:
-        # Collection might not exist (already cleared or never created)
         print(f"Error deleting collection: {e}")
-    
-    # Clear API key from environment variables if it exists
-    # This ensures API keys are not persisted after session ends
+
     import os
     if "OPENAI_API_KEY" in os.environ:
         del os.environ["OPENAI_API_KEY"]
 
-# ============================================================================
-# BM25 (SPARSE SEARCH) OPERATIONS
-# ============================================================================
-
-# Global variables for BM25 index
-# These are stored in memory for fast keyword search
-bm25_index = None  # The BM25 search index
-bm25_documents = []  # List of all document texts
-bm25_metadata = []  # List of all metadata (parallel to bm25_documents)
+bm25_index = None
+bm25_documents = []
+bm25_metadata = []
 
 def initialize_bm25_index(collection):
-    """
-    Initializes BM25 index for keyword-based (sparse) search.
-    
-    BM25 (Best Matching 25) is a ranking algorithm used for keyword search.
-    Unlike dense search (which finds by meaning), BM25 finds by exact keywords.
-    This is perfect for:
-    - Specific medication names (e.g., "Metformin")
-    - Medical codes (e.g., "ICD-10")
-    - Exact terminology that must match
-    
-    How BM25 Works:
-        1. Tokenizes documents into words
-        2. Calculates term frequency (how often words appear)
-        3. Applies inverse document frequency (rare words score higher)
-        4. Ranks documents by relevance to query keywords
-    
-    Why Both Dense and Sparse?
-        - Dense search: Finds by meaning ("heart problem" finds "cardiac issue")
-        - Sparse search: Finds by keywords ("Metformin" finds exact matches)
-        - Hybrid: Combines both for comprehensive results
-    
-    Args:
-        collection: ChromaDB collection containing all documents
-    
-    Process:
-        1. Retrieves all documents from ChromaDB
-        2. Tokenizes each document into words
-        3. Builds BM25 index from tokenized documents
-        4. Stores documents and metadata for retrieval
-    
-    Note:
-        - Index is built in memory for fast searching
-        - Must be rebuilt when new documents are added
-        - Uses NLTK for tokenization (downloads punkt tokenizer if needed)
-    """
+    """Build the BM25 keyword-search index from the collection's documents and store it in the module-level globals."""
     global bm25_index, bm25_documents, bm25_metadata
 
     try:
-        # ====================================================================
-        # STEP 1: Get all documents from ChromaDB
-        # ====================================================================
-        # Retrieve all stored documents and their metadata
-        # This loads everything into memory for BM25 indexing
         results = collection.get(include=['documents', 'metadatas'])
-        bm25_documents = results['documents']  # All document texts
-        bm25_metadata = results['metadatas']  # All metadata (parallel array)
+        bm25_documents = results['documents']
+        bm25_metadata = results['metadatas']
 
         if bm25_documents:
-            # ================================================================
-            # STEP 2: Tokenize documents for BM25
-            # ================================================================
-            # BM25 needs documents as lists of words (tokens)
-            # Tokenization splits text into individual words
             import nltk
             from nltk.tokenize import word_tokenize
-            
-            # Download NLTK punkt tokenizer if not already available
-            # This is a one-time download that tokenizes text into words
+
             try:
                 nltk.data.find('tokenizers/punkt')
             except LookupError:
                 nltk.download('punkt', quiet=True)
 
-            # Tokenize each document into words
-            # Lowercase everything for case-insensitive matching
-            # Example: "Patient has Diabetes" → ["patient", "has", "diabetes"]
             tokenized_docs = [word_tokenize(doc.lower()) for doc in bm25_documents]
-            
-            # ================================================================
-            # STEP 3: Build BM25 index
-            # ================================================================
-            # Create the BM25 index from tokenized documents
-            # This index enables fast keyword search
             bm25_index = BM25Okapi(tokenized_docs)
         else:
-            # No documents to index
             bm25_index = None
     except Exception as e:
         print(f"Error initializing BM25 index: {e}")
@@ -300,51 +83,10 @@ def initialize_bm25_index(collection):
         bm25_metadata = []
 
 def bm25_search(query: str, top_k: int = 10) -> List[Tuple[str, Dict, float]]:
-    """
-    SPARSE SEARCH: Finds documents by exact keywords using BM25 algorithm.
-    
-    This function performs keyword-based search, which is excellent for:
-    - Specific medication names: "Metformin", "Insulin"
-    - Medical codes: "ICD-10", "CPT codes"
-    - Exact terminology: "Type 2 Diabetes"
-    - Technical terms that must match exactly
-    
-    How BM25 Scoring Works:
-        - Term Frequency (TF): How often query words appear in document
-        - Inverse Document Frequency (IDF): Rare words score higher
-        - Document Length Normalization: Prevents bias toward long documents
-        - Formula: score = TF * IDF (simplified)
-    
-    Why Sparse Search?
-        - Dense search might miss exact keyword matches
-        - Some queries need exact term matching (medication names, codes)
-        - BM25 is proven effective for information retrieval
-        - Fast and efficient for keyword queries
-    
-    Args:
-        query (str): The search query (e.g., "Metformin side effects")
-        top_k (int): Number of top results to return (default: 10)
-    
-    Returns:
-        list: List of tuples, each containing:
-            - document (str): The document chunk text
-            - metadata (dict): Source information {"source": "...", "page": ...}
-            - score (float): BM25 relevance score (higher = more relevant)
-    
-    Example:
-        Query: "Metformin dosage"
-        Finds: Documents containing both "Metformin" and "dosage"
-        Scores: Higher scores for documents with more occurrences
-    
-    Note:
-        - Returns empty list if BM25 index is not initialized
-        - Only returns documents with positive scores (some relevance)
-    """
-    # Check if BM25 index is available
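+    """Return up to top_k (document, metadata, score) tuples ranked by BM25 keyword score; only positive-score documents are included, and [] is returned when no BM25 index or documents are loaded."""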
     if bm25_index is None or not bm25_documents:
         return []
 
-    # Tokenize the query (same process as documents)
     import nltk
     from nltk.tokenize import word_tokenize
     try:
@@ -352,261 +94,73 @@ def bm25_search(query: str, top_k: int = 10) -> List[Tuple[str, Dict, float]]:
     except LookupError:
         nltk.download('punkt', quiet=True)
 
-    # Convert query to lowercase and tokenize
-    # Example: "Metformin dosage" → ["metformin", "dosage"]
     tokenized_query = word_tokenize(query.lower())
-
-    # ========================================================================
-    # Calculate BM25 scores for all documents
-    # ========================================================================
-    # get_scores() returns a score for each document
-    # Higher score = more relevant to the query
     scores = bm25_index.get_scores(tokenized_query)
-
-    # ========================================================================
-    # Get top-k results
-    # ========================================================================
-    # argsort returns indices sorted by score (descending)
-    # [::-1] reverses to get highest scores first
-    # [:top_k] takes only the top k results
     top_indices = np.argsort(scores)[::-1][:top_k]
 
-    # ========================================================================
-    # Build results list
-    # ========================================================================
     results = []
     for idx in top_indices:
-        # Only include documents with positive scores
-        # Zero or negative scores mean no relevance
         if scores[idx] > 0:
             results.append((
-                bm25_documents[idx],  # Document text
-                bm25_metadata[idx],  # Metadata (source, page, etc.)
-                float(scores[idx])  # BM25 relevance score
+                bm25_documents[idx],
+                bm25_metadata[idx],
+                float(scores[idx])
             ))
 
     return results
 
-# ============================================================================
-# HYBRID SEARCH OPERATIONS
-# ============================================================================
-
 def reciprocal_rank_fusion(dense_results: List[Tuple[str, Dict, float]],
                           sparse_results: List[Tuple[str, Dict, float]],
                           k: int = 60) -> List[Tuple[str, Dict, float]]:
-    """
-    Combines dense and sparse search results using Reciprocal Rank Fusion (RRF).
-    
-    RRF is a powerful technique that merges results from different search methods
-    without needing to normalize scores. It's particularly effective because:
-    1. Doesn't require score normalization (dense and sparse scores are different scales)
-    2. Gives equal weight to both search methods
-    3. Promotes documents that appear in both result sets
-    4. Is robust and widely used in information retrieval
-    
-    How RRF Works:
-        - Each result gets a score based on its rank (position) in the results
-        - Formula: score = 1 / (k + rank)
-        - Rank 1 (first result) gets highest score
-        - Rank 2 gets lower score, etc.
-        - Results from both searches are combined
-        - Documents appearing in both searches get scores from both
-        - Final ranking is by combined RRF score
-    
-    Why RRF?
-        - Dense search scores: 0.0 to 1.0 (similarity)
-        - Sparse search scores: 0.0 to 10.0+ (BM25)
-        - Can't directly compare or average these
-        - RRF uses ranks instead, which are comparable
-    
-    Args:
-        dense_results: Results from semantic search
-                      List of (document, metadata, score) tuples
-        sparse_results: Results from keyword search
-                       List of (document, metadata, score) tuples
-        k: RRF constant (default: 60)
-           - Higher k = less difference between ranks
-           - Lower k = more emphasis on top results
-           - 60 is a standard value in research
-    
-    Returns:
-        list: Combined and sorted results by RRF score
-              Format: List of (document, metadata, rrf_score) tuples
-    
-    Example:
-        Dense results: [doc1, doc2, doc3]
-        Sparse results: [doc2, doc4, doc1]
-        
-        RRF combines them:
-        - doc2: Appears in both (rank 2 dense, rank 1 sparse) → High RRF score
-        - doc1: Appears in both (rank 1 dense, rank 3 sparse) → High RRF score
-        - doc3: Only in dense → Lower score
-        - doc4: Only in sparse → Lower score
-        
-        Final: [doc2, doc1, doc3, doc4] (sorted by RRF score)
-    """
-    # Dictionary to track RRF scores for each unique document
-    # Key: Unique document identifier
-    # Value: Document data and accumulated RRF score
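+    """Merge dense and sparse result lists with Reciprocal Rank Fusion: each list adds 1 / (k + rank + 1) (rank is 0-based) to a document's score, and results are returned sorted by that total, highest first."""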
     rrf_scores = {}
 
-    # ========================================================================
-    # Process dense search results
-    # ========================================================================
-    # Rank starts at 0 (first result)
     for rank, (doc, meta, score) in enumerate(dense_results):
-        # Create unique key to identify this document
-        # Uses first 100 chars of text + source + chunk_id
-        # This ensures we can match the same document across both searches
         key = f"{doc[:100]}|{meta.get('source', '')}|{meta.get('chunk_id', '')}"
-        
-        # Initialize if this document hasn't been seen before
+
         if key not in rrf_scores:
             rrf_scores[key] = {'doc': doc, 'meta': meta, 'score': 0.0}
-        
-        # Add RRF score contribution from dense search
-        # Formula: 1 / (k + rank)
-        # Rank 0: 1/(60+0) = 0.0167
-        # Rank 1: 1/(60+1) = 0.0164
-        # Rank 2: 1/(60+2) = 0.0161
+
         rrf_scores[key]['score'] += 1.0 / (k + rank + 1)
 
-    # ========================================================================
-    # Process sparse search results
-    # ========================================================================
-    # Same process for BM25 results
     for rank, (doc, meta, score) in enumerate(sparse_results):
         key = f"{doc[:100]}|{meta.get('source', '')}|{meta.get('chunk_id', '')}"
-        
+
         if key not in rrf_scores:
             rrf_scores[key] = {'doc': doc, 'meta': meta, 'score': 0.0}
-        
-        # Add RRF score contribution from sparse search
-        # Documents appearing in both searches get scores from both
+
         rrf_scores[key]['score'] += 1.0 / (k + rank + 1)
 
-    # ========================================================================
-    # Sort by RRF score and return
-    # ========================================================================
-    # Sort all results by their combined RRF score (descending)
     sorted_results = sorted(rrf_scores.values(), key=lambda x: x['score'], reverse=True)
-    
-    # Convert back to tuple format
     return [(item['doc'], item['meta'], item['score']) for item in sorted_results]
 
 def hybrid_search(query_embedding: List[float], query_text: str, top_k: int = 5,
                  alpha: float = 0.5) -> Dict[str, Any]:
-    """
-    Performs HYBRID SEARCH: Combines dense (semantic) and sparse (keyword) search.
-    
-    This is the most powerful search method, combining the best of both worlds:
-    - Dense Search: Finds by meaning (semantic similarity)
-    - Sparse Search: Finds by keywords (BM25)
-    - RRF Fusion: Intelligently combines both result sets
-    
-    Why Hybrid Search?
-        - Dense alone: Might miss exact keyword matches
-        - Sparse alone: Might miss semantic variations
-        - Hybrid: Gets comprehensive results from both methods
-    
-    Process:
-        1. Perform dense search (semantic similarity)
-        2. Perform sparse search (keyword matching)
-        3. Combine results using Reciprocal Rank Fusion (RRF)
-        4. Return top-k most relevant chunks
-    
-    Args:
-        query_embedding (list): Embedding vector for semantic search (1536 numbers)
-        query_text (str): Original query text for keyword search
-        top_k (int): Number of results to return (default: 5)
-        alpha (float): Weight parameter (currently not used, RRF handles weighting)
-    
-    Returns:
-        dict: Results in ChromaDB format:
-            {
-                'documents': [[chunk1, chunk2, ...]],  # Nested list
-                'metadatas': [[meta1, meta2, ...]],
-                'distances': [[0.1, 0.2, ...]]  # Lower = more similar
-            }
-    
-    Example:
-        Query: "cardiac medication side effects"
-        
-        Dense search finds:
-            - Documents about "heart drug adverse reactions" (semantic match)
-            - Documents about "cardiovascular medicine complications"
-        
-        Sparse search finds:
-            - Documents containing exact words "cardiac", "medication", "side effects"
-        
-        RRF combines both, prioritizing documents that appear in both result sets
-    
-    Performance:
-        - Gets 2x top_k results from each method (for better RRF fusion)
-        - RRF combines and selects best top_k
-        - More comprehensive than either method alone
-    """
-    # Connect to ChromaDB
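+    """Run dense (embedding) and sparse (BM25) retrieval for top_k * 2 candidates each, fuse them with RRF, and return the best top_k in the nested-list shape of a ChromaDB query result; alpha is currently unused."""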
     client = chromadb.PersistentClient(path=".chromadb")
     collection = client.get_collection(name="cliniq_docs")
 
-    # ========================================================================
-    # Initialize BM25 index if needed
-    # ========================================================================
-    # BM25 index must be built before sparse search can work
     if bm25_index is None:
         initialize_bm25_index(collection)
 
-    # ========================================================================
-    # STEP 1: Dense Search (Semantic Similarity)
-    # ========================================================================
-    # Search using embeddings (vector similarity)
-    # Gets more results (top_k * 2) to have more candidates for RRF
     dense_results_chroma = collection.query(
-        query_embeddings=[query_embedding],  # Query embedding vector
-        n_results=top_k * 2  # Get more results for better RRF fusion
+        query_embeddings=[query_embedding],
+        n_results=top_k * 2
     )
 
-    # ========================================================================
-    # Convert ChromaDB results to our internal format
-    # ========================================================================
     dense_results = []
     if dense_results_chroma['documents'] and dense_results_chroma['documents'][0]:
         for i, doc in enumerate(dense_results_chroma['documents'][0]):
             meta = dense_results_chroma['metadatas'][0][i]
-            # Approximate score based on rank (ChromaDB doesn't return exact scores)
-            # Rank 0 (most similar) gets score 1.0, rank 1 gets 0.9, etc.
             score = 1.0 - (i * 0.1)
             dense_results.append((doc, meta, score))
 
-    # ========================================================================
-    # STEP 2: Sparse Search (Keyword Matching)
-    # ========================================================================
-    # Search using BM25 (keyword-based)
-    # Also gets more results for better RRF fusion
     sparse_results = bm25_search(query_text, top_k=top_k * 2)
-
-    # ========================================================================
-    # STEP 3: Combine using Reciprocal Rank Fusion (RRF)
-    # ========================================================================
-    # RRF intelligently merges dense and sparse results
-    # Documents appearing in both searches get higher scores
     combined_results = reciprocal_rank_fusion(dense_results, sparse_results)
-
-    # ========================================================================
-    # STEP 4: Take top-k results
-    # ========================================================================
-    # Select the best top_k chunks after RRF fusion
     top_results = combined_results[:top_k]
 
-    # ========================================================================
-    # STEP 5: Convert back to ChromaDB format
-    # ========================================================================
-    # Format results to match ChromaDB's expected structure
-    # ChromaDB uses nested lists: [[doc1, doc2, ...]]
     documents = [[doc for doc, _, _ in top_results]] if top_results else [[]]
     metadatas = [[meta for _, meta, _ in top_results]] if top_results else [[]]
-    # Convert RRF score to distance (ChromaDB uses distances, lower = more similar)
     distances = [[1.0 - score for _, _, score in top_results]] if top_results else [[]]
 
     return {
@@ -615,128 +169,43 @@ def hybrid_search(query_embedding: List[float], query_text: str, top_k: int = 5,
         'distances': distances
     }
 
-# ============================================================================
-# RERANKING OPERATIONS
-# ============================================================================
-
 def rerank_chunks(query_embedding: List[float], chunks: List[Tuple[str, Dict, float]],
                   top_k: int = 3) -> List[Tuple[str, Dict, float]]:
-    """
-    Re-ranks retrieved chunks by relevance using cosine similarity.
-    
-    Reranking is a refinement step that improves answer quality. Even after
-    hybrid search, the initial ranking might not be perfect. Reranking:
-    1. Creates fresh embeddings for the retrieved chunks
-    2. Calculates cosine similarity with the query embedding
-    3. Re-orders chunks by similarity score
-    4. Returns the most relevant chunks
-    
-    Why Rerank?
-        - Initial search might have ranking errors
-        - Fresh similarity calculation is more accurate
-        - Ensures the most relevant chunks are used for answering
-        - Improves answer quality significantly
-    
-    How Cosine Similarity Works:
-        - Measures angle between two vectors in high-dimensional space
-        - Range: -1 to 1 (1 = identical, 0 = orthogonal, -1 = opposite)
-        - Higher similarity = more relevant to the query
-        - Formula: cos(θ) = (A · B) / (||A|| * ||B||)
-    
-    Args:
-        query_embedding (list): Embedding vector of the query (1536 numbers)
-        chunks (list): List of (document, metadata, score) tuples from initial search
-        top_k (int): Number of top chunks to return after reranking (default: 3)
-    
-    Returns:
-        list: Re-ranked chunks sorted by cosine similarity
-              Format: List of (document, metadata, similarity_score) tuples
-              Higher similarity_score = more relevant
-    
-    Example:
-        Input chunks (from hybrid search):
-            [chunk1 (score: 0.8), chunk2 (score: 0.9), chunk3 (score: 0.7)]
-        
-        After reranking (by cosine similarity):
-            [chunk2 (similarity: 0.95), chunk1 (similarity: 0.88), chunk3 (similarity: 0.72)]
-        
-        Returns top 3: [chunk2, chunk1, chunk3] (reordered by relevance)
-    
-    Performance:
-        - Makes additional API call to create embeddings for chunks
-        - Adds latency but significantly improves answer quality
-        - Typically processes 7-15 chunks, so cost is minimal
-    """
-    from openai import OpenAI
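+    """Re-rank chunks by cosine similarity between the query embedding and freshly computed chunk embeddings; returns chunks[:top_k] in their original order if OPENAI_API_KEY is unset or embedding fails."""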
     import os
 
-    # Validate input
     if not chunks:
         return []
 
-    # ========================================================================
-    # Extract documents and metadata
-    # ========================================================================
-    # Separate the chunks into components
-    documents = [chunk[0] for chunk in chunks]  # Text chunks
-    metadatas = [chunk[1] for chunk in chunks]  # Metadata (source, page, etc.)
+    documents = [chunk[0] for chunk in chunks]
+    metadatas = [chunk[1] for chunk in chunks]
 
-    # ========================================================================
-    # Get API key for creating embeddings
-    # ========================================================================
     api_key = os.environ.get("OPENAI_API_KEY")
     if not api_key:
-        # If no API key, can't rerank - return original order
         return chunks[:top_k]
 
     try:
-        # ====================================================================
-        # Create embeddings for all chunks
-        # ====================================================================
-        # This creates fresh embeddings for the retrieved chunks
-        # These embeddings are then compared with the query embedding
-        client = OpenAI(api_key=api_key)
-        from utils.constants import EMBEDDING_MODEL
-        response = client.embeddings.create(
-            input=documents,  # All chunk texts
-            model=EMBEDDING_MODEL  # text-embedding-3-small
-        )
-        # Extract embedding vectors
-        chunk_embeddings = [data.embedding for data in response.data]
-
-        # ====================================================================
-        # Calculate cosine similarities
-        # ====================================================================
-        # Convert to numpy arrays for efficient computation
-        query_embedding = np.array(query_embedding).reshape(1, -1)  # Shape: (1, 1536)
-        chunk_embeddings = np.array(chunk_embeddings)  # Shape: (n_chunks, 1536)
-
-        # Calculate cosine similarity between query and each chunk
-        # Result: Array of similarity scores (one per chunk)
-        # Higher score = more similar = more relevant
+        from services.llm_service import create_llm_service
+        llm_service = create_llm_service(api_key=api_key)
+
+        chunk_embeddings = llm_service.create_embeddings(documents)
+
+        query_embedding = np.array(query_embedding).reshape(1, -1)
+        chunk_embeddings = np.array(chunk_embeddings)
+
         similarities = cosine_similarity(query_embedding, chunk_embeddings)[0]
 
-        # ====================================================================
-        # Re-rank chunks by similarity
-        # ====================================================================
         reranked_results = []
         for i, similarity in enumerate(similarities):
             reranked_results.append((
-                documents[i],  # Original document text
-                metadatas[i],  # Original metadata
-                float(similarity)  # Cosine similarity score
+                documents[i],
+                metadatas[i],
+                float(similarity)
             ))
 
-        # ====================================================================
-        # Sort by similarity score (descending)
-        # ====================================================================
-        # Most similar chunks come first
         reranked_results.sort(key=lambda x: x[2], reverse=True)
-
-        # Return top-k most relevant chunks
         return reranked_results[:top_k]
 
     except Exception as e:
-        # If reranking fails, fall back to original ranking
         print(f"Error in reranking: {e}")
         return chunks[:top_k]
diff --git a/configuration/.env.example b/configuration/.env.example
deleted file mode 100644
index 11b2b32..0000000
--- a/configuration/.env.example
+++ /dev/null
@@ -1 +0,0 @@
-OPENAI_API_KEY=your_api_key_here
diff --git a/configuration/docker-compose.yml b/configuration/docker-compose.yml
deleted file mode 100644
index 9cc6014..0000000
--- a/configuration/docker-compose.yml
+++ /dev/null
@@ -1,32 +0,0 @@
-services:
-  backend:
-    build:
-      context: ..
-      dockerfile: backend/Dockerfile
-    ports:
-      - "5000:5000"
-    volumes:
-      - ../backend/.chromadb:/app/.chromadb
-      - ../backend/uploads:/app/uploads
-    working_dir: /app
-    environment:
-      - FLASK_ENV=development
-    restart: unless-stopped
-
-  frontend:
-    build:
-      context: ..
-      dockerfile: frontend/Dockerfile
-    ports:
-      - "3000:3000"
-    volumes:
-      - ../frontend:/app
-      - /app/node_modules
-    environment:
-      - VITE_BACKEND_ENDPOINT=http://backend:5000
-    depends_on:
-      - backend
-    restart: unless-stopped
-    stdin_open: true
-    tty: true
-
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..334eb32
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,63 @@
+version: '3.8'
+
+services:
+  backend:
+    build:
+      context: .
+      dockerfile: backend/Dockerfile
+    ports:
+      - "${BACKEND_PORT:-5000}:5000"
+    volumes:
+      - ./backend/.chromadb:/app/.chromadb
+      - ./backend/uploads:/app/uploads
+    working_dir: /app
+    environment:
+      - FLASK_ENV=development
+      # Optional LLM provider overrides. Anything set here takes precedence over ./backend/.env (loaded via env_file below).
+      # Uncomment only the variables you want to override.
+      # - LLM_PROVIDER=openai
+      # - LLM_API_KEY=${LLM_API_KEY}
+      # - LLM_BASE_URL=https://api.openai.com/v1
+      # - LLM_CHAT_MODEL=gpt-3.5-turbo
+      # - LLM_EMBEDDING_MODEL=text-embedding-3-small
+      # - TEMPERATURE=0.7
+      # - MAX_TOKENS=1000
+      # - VERIFY_SSL=true
+    env_file:
+      - ./backend/.env
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:5000/api/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 40s
+    restart: unless-stopped
+
+  frontend:
+    build:
+      context: .
+      dockerfile: frontend/Dockerfile
+    ports:
+      - "${FRONTEND_PORT:-3000}:3000"
+    volumes:
+      - ./frontend:/app
+      - /app/node_modules
+    environment:
+      - VITE_BACKEND_ENDPOINT=http://backend:5000
+    depends_on:
+      backend:
+        condition: service_healthy
+    healthcheck:
+      test: ["CMD-SHELL", "curl -f http://localhost:3000 || exit 1"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 40s
+    restart: unless-stopped
+    stdin_open: true
+    tty: true
+
+networks:
+  default:
+    name: cliniq-network
+
diff --git a/frontend/package-lock.json b/frontend/package-lock.json
index fc403d8..659ff0a 100644
--- a/frontend/package-lock.json
+++ b/frontend/package-lock.json
@@ -67,7 +67,6 @@
       "integrity": "sha512-e7jT4DxYvIDLk1ZHmU/m/mB19rex9sv0c2ftBtjSBv+kVM/902eh0fINUzD7UwLLNR+jU585GxUJ8/EBfAM5fw==",
       "dev": true,
       "license": "MIT",
-      "peer": true,
       "dependencies": {
         "@babel/code-frame": "^7.27.1",
         "@babel/generator": "^7.28.5",
@@ -1335,7 +1334,6 @@
         }
       ],
       "license": "MIT",
-      "peer": true,
       "dependencies": {
         "baseline-browser-mapping": "^2.9.0",
         "caniuse-lite": "^1.0.30001759",
@@ -1453,8 +1451,7 @@
       "version": "3.2.3",
       "resolved": "https://registry.npmjs.org/csstype/-/csstype-3.2.3.tgz",
       "integrity": "sha512-z1HGKcYy2xA8AGQfwrn0PAy+PB7X/GSj3UVJW9qKyn43xWa+gl5nXmU4qqLMRzWVLFC8KusUX8T/0kCiOYpAIQ==",
-      "license": "MIT",
-      "peer": true
+      "license": "MIT"
     },
     "node_modules/debug": {
       "version": "4.4.3",
@@ -1748,7 +1745,6 @@
       "integrity": "sha512-/imKNG4EbWNrVjoNC/1H5/9GFy+tqjGBHCaSsN+P2RnPqjsLmv6UD3Ej+Kj8nBWaRAwyk7kK5ZUc+OEatnTR3A==",
       "dev": true,
       "license": "MIT",
-      "peer": true,
       "bin": {
         "jiti": "bin/jiti.js"
       }
@@ -2002,7 +1998,6 @@
         }
       ],
       "license": "MIT",
-      "peer": true,
       "dependencies": {
         "nanoid": "^3.3.11",
         "picocolors": "^1.1.1",
@@ -2172,7 +2167,6 @@
       "resolved": "https://registry.npmjs.org/react/-/react-18.3.1.tgz",
       "integrity": "sha512-wS+hAgJShR0KhEvPJArfuPVN1+Hz1t0Y6n5jLrGQbkb4urgPE/0Rve+1kMB1v/oWgHgm4WIcV+i7F2pTVj+2iQ==",
       "license": "MIT",
-      "peer": true,
       "dependencies": {
         "loose-envify": "^1.1.0"
       },
@@ -2185,7 +2179,6 @@
       "resolved": "https://registry.npmjs.org/react-dom/-/react-dom-18.3.1.tgz",
       "integrity": "sha512-5m4nQKp+rZRb09LNH59GM4BxTh9251/ylbKIbpe7TpGxfJ+9kv6BLkLBXIjjspbgbnIBNqlI23tRnTWT0snUIw==",
       "license": "MIT",
-      "peer": true,
       "dependencies": {
         "loose-envify": "^1.1.0",
         "scheduler": "^0.23.2"
@@ -2544,7 +2537,6 @@
       "integrity": "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==",
       "dev": true,
       "license": "MIT",
-      "peer": true,
       "engines": {
         "node": ">=12"
       },
@@ -2616,7 +2608,6 @@
       "integrity": "sha512-o5a9xKjbtuhY6Bi5S3+HvbRERmouabWbyUcpXXUA1u+GNUKoROi9byOJ8M0nHbHYHkYICiMlqxkg1KkYmm25Sw==",
       "dev": true,
       "license": "MIT",
-      "peer": true,
       "dependencies": {
         "esbuild": "^0.21.3",
         "postcss": "^8.4.43",
diff --git a/frontend/src/App.jsx b/frontend/src/App.jsx
index 7037b7f..4f4c760 100644
--- a/frontend/src/App.jsx
+++ b/frontend/src/App.jsx
@@ -1,23 +1,3 @@
-/**
- * Main Application Component - React Router Setup
- * 
- * This is the root component of the React application. It sets up:
- * 1. Routing: Defines which component to show for each URL path
- * 2. Toast Notifications: Configures success/error message popups
- * 3. Layout: Wraps all pages with a consistent header/footer
- * 
- * How React Router Works:
- * - BrowserRouter: Enables client-side routing (no page refreshes)
- * - Routes: Container for all route definitions
- * - Route: Maps a URL path to a React component
- * 
- * Example:
- * - URL: "/" → Shows Home component
- * - URL: "/chat" → Shows Chat component
- * 
- * The Layout component wraps both routes, providing consistent navigation.
- */
-
 import { BrowserRouter, Routes, Route } from 'react-router-dom';
 import { Toaster } from 'react-hot-toast';
 import Layout from './components/layout/Layout';
@@ -26,46 +6,34 @@ import Chat from './pages/Chat';
 
 function App() {
   return (
-    // BrowserRouter enables client-side routing
-    // This allows navigation without full page reloads
     <BrowserRouter>
-      {/* Toast Notification System */}
-      {/* Shows temporary success/error messages at the top-right */}
       <Toaster
-        position="top-right"  // Where notifications appear
+        position="top-right"
         toastOptions={{
-          duration: 4000,  // How long notifications stay (4 seconds)
+          duration: 4000,
           style: {
-            background: '#fff',  // White background
-            color: '#374151',  // Gray text
-            boxShadow: '0 10px 15px -3px rgba(0, 0, 0, 0.1)',  // Subtle shadow
+            background: '#fff',
+            color: '#374151',
+            boxShadow: '0 10px 15px -3px rgba(0, 0, 0, 0.1)',
           },
-          // Success notifications (green checkmark)
           success: {
             iconTheme: {
-              primary: '#22c55e',  // Green color
-              secondary: '#fff',  // White checkmark
+              primary: '#22c55e',
+              secondary: '#fff',
             },
           },
-          // Error notifications (red X)
           error: {
             iconTheme: {
-              primary: '#ef4444',  // Red color
-              secondary: '#fff',  // White X
+              primary: '#ef4444',
+              secondary: '#fff',
             },
           },
         }}
       />
-      
-      {/* Route Definitions */}
-      {/* Defines which component to render for each URL path */}
+
       <Routes>
-        {/* Layout wraps all child routes, providing header/footer */}
         <Route path="/" element={<Layout />}>
-          {/* Index route: "/" shows the Home page */}
           <Route index element={<Home />} />
-          
-          {/* Chat route: "/chat" shows the Chat page */}
           <Route path="chat" element={<Chat />} />
         </Route>
       </Routes>
diff --git a/frontend/src/components/ChatInterface.jsx b/frontend/src/components/ChatInterface.jsx
index 5926946..7068c15 100644
--- a/frontend/src/components/ChatInterface.jsx
+++ b/frontend/src/components/ChatInterface.jsx
@@ -3,20 +3,10 @@ import { Send, Loader2, Bot, User, FileText } from 'lucide-react';
 import { queryDocuments } from '../services/api';
 import toast from 'react-hot-toast';
 
-/**
- * THE INTERACTIVE CONSULTANT:
- * This component handles the chat bubble display and the actual messaging logic.
- * Python analogy: Like a CLI loop that takes input and prints responses, but with a GUI.
- */
-const ChatInterface = ({ apiKey, hasDocuments, config }) => {
-  // 'messages' is a list (Python list) of all chat history.
+const ChatInterface = ({ hasDocuments, config }) => {
   const [messages, setMessages] = useState([]);
-  // 'input' is the current text in the typing box.
   const [input, setInput] = useState('');
-  // Loading state (spinner).
   const [isLoading, setIsLoading] = useState(false);
-
-  // This helps us automatically scroll to the bottom of the chat.
   const messagesEndRef = useRef(null);
 
   const scrollToBottom = () => {
@@ -27,39 +17,18 @@ const ChatInterface = ({ apiKey, hasDocuments, config }) => {
     scrollToBottom();
   }, [messages]);
 
-  /**
-   * This runs when you click 'Send' or hit Enter.
-   * It's like the main logic block in a Python script.
-   */
   const handleSubmit = async (e) => {
-    e.preventDefault(); // Stop the page from refreshing (standard web behavior).
+    e.preventDefault();
     if (!input.trim() || isLoading) return;
 
-    // Double-check API key with trim to ensure it's not just whitespace
-    const trimmedKey = apiKey?.trim();
-    if (!trimmedKey || trimmedKey.length < 10) {
-      toast.error('Please enter a valid OpenAI API key first (starts with "sk-")');
-      return;
-    }
-
-    // Additional validation - check if it starts with sk-
-    if (!trimmedKey.startsWith('sk-')) {
-      toast.error('Invalid API key format. OpenAI keys start with "sk-"');
-      return;
-    }
-
     if (!hasDocuments) {
       toast.error('Please upload a document first');
       return;
     }
 
     const userMessage = input.trim();
-    setInput(''); // Clear the text box.
+    setInput('');
 
-    /**
-     * Update the screen with the user's message and a blank space for the AI's reply.
-     * We create a new list by adding the new message to the existing list.
-     */
     setMessages((prev) => [
       ...prev,
       { role: 'user', content: userMessage },
@@ -68,27 +37,13 @@ const ChatInterface = ({ apiKey, hasDocuments, config }) => {
     setIsLoading(true);
 
     try {
-      /**
-       * We call our API service. Notice 'onChunk' - this is a "callback".
-       * Backend sends in order: thinking -> content -> metadata (citations)
-       */
-      // Use trimmed key to avoid whitespace issues
-      await queryDocuments(userMessage, trimmedKey, {
-        history: messages, // Pass conversation history for memory
+      await queryDocuments(userMessage, {
+        history: messages,
         useHybridSearch: config.useHybridSearch,
         useReranker: config.useReranker,
         showThinking: config.showThinking,
         onChunk: (chunk) => {
-          /**
-           * PROCESSING THE RESPONSE:
-           * Backend streams in this order:
-           * 1. "citations" - all retrieved sources (sent first)
-           * 2. "thinking" chunks - AI reasoning process (stream character by character)
-           * 3. "content" chunks - final answer (stream character by character)
-           * 4. "clear_citations" - clear citations if no answer found
-           */
           if (chunk.type === 'citations' || chunk.type === 'metadata') {
-            // Citations received FIRST (all retrieved sources)
             setMessages((prev) => {
               const lastIndex = prev.length - 1;
               const lastMessage = prev[lastIndex];
@@ -102,7 +57,6 @@ const ChatInterface = ({ apiKey, hasDocuments, config }) => {
               ];
             });
           } else if (chunk.type === 'thinking') {
-            // APPEND thinking chunks as they stream in
             setMessages((prev) => {
               const lastIndex = prev.length - 1;
               const lastMessage = prev[lastIndex];
@@ -116,7 +70,6 @@ const ChatInterface = ({ apiKey, hasDocuments, config }) => {
               ];
             });
           } else if (chunk.type === 'content') {
-            // APPEND answer chunks as they stream in
             setMessages((prev) => {
               const lastIndex = prev.length - 1;
               const lastMessage = prev[lastIndex];
@@ -130,7 +83,6 @@ const ChatInterface = ({ apiKey, hasDocuments, config }) => {
               ];
             });
           } else if (chunk.type === 'clear_citations') {
-            // Clear citations if LLM said "no information"
             setMessages((prev) => {
               const lastIndex = prev.length - 1;
               const lastMessage = prev[lastIndex];
@@ -159,11 +111,6 @@ const ChatInterface = ({ apiKey, hasDocuments, config }) => {
     }
   };
 
-  /**
-   * The actual visual part of the chat window.
-   * It loops ('maps') over the 'messages' list and draws each message bubble.
-   * Python analogy: Like a 'for message in messages: print(bubble_html)' loop.
-   */
   return (
     <div className="card h-[600px] flex flex-col">
       <div className="flex items-center mb-4 pb-4 border-b border-gray-200">
@@ -184,14 +131,12 @@ const ChatInterface = ({ apiKey, hasDocuments, config }) => {
               key={index}
               className={`flex gap-3 ${message.role === 'user' ? 'justify-end' : 'justify-start'}`}
             >
-              {/* If it's the AI, show a bot icon */}
               {message.role === 'assistant' && (
                 <div className="flex-shrink-0 w-8 h-8 rounded-full bg-primary-100 flex items-center justify-center">
                   <Bot className="w-5 h-5 text-primary-600" />
                 </div>
               )}
 
-              {/* The message bubble */}
               <div
                 className={`max-w-[80%] rounded-lg p-4 ${message.role === 'user'
                   ? 'bg-primary-600 text-white'
@@ -200,7 +145,6 @@ const ChatInterface = ({ apiKey, hasDocuments, config }) => {
                     : 'bg-gray-100 text-gray-900'
                   }`}
               >
-                {/* Special blue box for the "Thinking" process */}
                 {message.thinking && (
                   <div className="mb-3 p-3 bg-blue-50 rounded border border-blue-200">
                     <p className="text-xs font-semibold text-blue-800 mb-1">AI Thinking Process</p>
@@ -208,16 +152,13 @@ const ChatInterface = ({ apiKey, hasDocuments, config }) => {
                   </div>
                 )}
 
-                {/* The final answer text */}
                 <p className="whitespace-pre-wrap">{message.content}</p>
 
-                {/* Citations section with links to PDFs - at the bottom */}
                 {message.citations && message.citations.length > 0 && (
                   <div className="mt-3 pt-3 border-t border-gray-300">
                     <p className="text-xs font-semibold mb-1">Sources:</p>
                     <ul className="text-xs space-y-1">
                       {message.citations.map((cite, idx) => {
-                        // Logic to turn a citation string into a clickable file link.
                         const parts = cite.split('|');
                         const sourcePart = parts[0].replace('Source:', '').trim();
 
@@ -252,7 +193,6 @@ const ChatInterface = ({ apiKey, hasDocuments, config }) => {
                 )}
               </div>
 
-              {/* If it's the user, show a user icon */}
               {message.role === 'user' && (
                 <div className="flex-shrink-0 w-8 h-8 rounded-full bg-primary-600 flex items-center justify-center">
                   <User className="w-5 h-5 text-white" />
@@ -262,7 +202,6 @@ const ChatInterface = ({ apiKey, hasDocuments, config }) => {
           ))
         )}
 
-        {/* Spinner while waiting for the AI */}
         {isLoading && (
           <div className="flex gap-3 justify-start">
             <div className="flex-shrink-0 w-8 h-8 rounded-full bg-primary-100 flex items-center justify-center">
@@ -276,7 +215,6 @@ const ChatInterface = ({ apiKey, hasDocuments, config }) => {
         <div ref={messagesEndRef} />
       </div>
 
-      {/* The typing box at the bottom */}
       <form onSubmit={handleSubmit} className="flex gap-2">
         <input
           type="text"
diff --git a/frontend/src/components/ConfigSidebar.jsx b/frontend/src/components/ConfigSidebar.jsx
deleted file mode 100644
index 0f8cc40..0000000
--- a/frontend/src/components/ConfigSidebar.jsx
+++ /dev/null
@@ -1,84 +0,0 @@
-import { useState } from 'react';
-import { Settings, Key, Search, Sparkles } from 'lucide-react';
-
-/**
- * THE SETTINGS DASHBOARD:
- * This component allows you to securely enter your AI access key (API Key)
- * and shows you exactly which AI models are powering your clinical assistant.
- * Python analogy: Like a 'Config' class or a sidebar menu in a desktop app.
- */
-const ConfigSidebar = ({ apiKey, onApiKeyChange, config, onConfigChange, models }) => {
-  /**
-   * PROPS (Arguments):
-   * The list above (apiKey, onApiKeyChange, etc.) are like arguments passed to a Python class.
-   * 'apiKey' is the data. 'onApiKeyChange' is a function we call to update that data.
-   */
-
-  // Local state to hide or show the API key text (Like a local variable).
-  const [showApiKey, setShowApiKey] = useState(false);
-
-  return (
-    <div className="card animate-fadeIn">
-      {/* Configuration Header */}
-      <div className="flex items-center mb-4">
-        <Settings className="h-6 w-6 text-primary-600 mr-2" />
-        <h2 className="text-xl font-semibold text-gray-800">Configuration</h2>
-      </div>
-
-      <div className="space-y-6">
-        {/* API Key Box */}
-        <div>
-          <label className="flex items-center text-sm font-medium text-gray-700 mb-2">
-            <Key className="w-4 h-4 mr-2" />
-            OpenAI API Key {!apiKey && <span className="ml-2 text-xs text-red-600">(Required)</span>}
-          </label>
-          <div className="relative">
-            <input
-              type={showApiKey ? 'text' : 'password'}
-              value={apiKey}
-              // This 'onChange' is like a keyboard event listener.
-              onChange={(e) => onApiKeyChange(e.target.value)}
-              placeholder="sk-proj-..."
-              className={`w-full px-4 py-2 border rounded-lg focus:outline-none focus:ring-2 focus:ring-primary-500 pr-10 ${
-                !apiKey ? 'border-red-300 bg-red-50' : 'border-gray-300'
-              }`}
-            />
-            {/* Toggle 'Show/Hide' Button */}
-            <button
-              type="button"
-              onClick={() => setShowApiKey(!showApiKey)}
-              className="absolute right-2 top-1/2 -translate-y-1/2 text-gray-500 hover:text-gray-700"
-            >
-              {showApiKey ? 'Hide' : 'Show'}
-            </button>
-          </div>
-          <p className="text-xs text-gray-500 mt-1">
-            Your API key is kept in memory only and is never saved or shared. It will be cleared when you close the browser or clear the knowledge base.
-          </p>
-          {!apiKey && (
-            <p className="text-xs text-red-600 mt-1 font-medium">
-              ⚠️ Please enter your API key to use the application
-            </p>
-          )}
-        </div>
-
-        {/* Model Info Section (Display only) */}
-        <div className="pt-4 border-t border-gray-200">
-          <div className="flex items-center text-sm font-medium text-gray-700 mb-2">
-            <Sparkles className="w-4 h-4 mr-2" />
-            AI Models
-          </div>
-          <div className="text-xs text-gray-600 space-y-1">
-            {/* These 'models' values come from the Python backend status check. */}
-            <p>• Embedding: {models?.embedding || 'text-embedding-3-small'}</p>
-            <p>• Chat: {models?.chat || 'gpt-3.5-turbo'}</p>
-          </div>
-        </div>
-      </div>
-    </div>
-  );
-};
-
-export default ConfigSidebar;
-
-
diff --git a/frontend/src/components/DocumentUpload.jsx b/frontend/src/components/DocumentUpload.jsx
index c361355..f74124d 100644
--- a/frontend/src/components/DocumentUpload.jsx
+++ b/frontend/src/components/DocumentUpload.jsx
@@ -3,17 +3,11 @@ import { Upload, FileText, X, Loader2 } from 'lucide-react';
 import { uploadDocument, getUploadStatus } from '../services/api';
 import toast from 'react-hot-toast';
 
-/**
- * THE DOCUMENT INTAKE STATION:
- * This component provides the user interface for uploading clinical files.
- * Python analogy: Like a 'input()' function but for files, with a fancy visual box.
- */
-const DocumentUpload = ({ apiKey, onUploadSuccess, currentDocument, onClear }) => {
-  // Logic states (Local variables for this component).
-  const [dragActive, setDragActive] = useState(false); // Is the user dragging a file over the box?
-  const [files, setFiles] = useState([]);              // The actual file objects.
-  const [isLoading, setIsLoading] = useState(false);    // Is it currently uploading or processing?
-  const [processingStatus, setProcessingStatus] = useState(null); // Detailed message from background
+const DocumentUpload = ({ onUploadSuccess, currentDocument, onClear }) => {
+  const [dragActive, setDragActive] = useState(false);
+  const [files, setFiles] = useState([]);
+  const [isLoading, setIsLoading] = useState(false);
+  const [processingStatus, setProcessingStatus] = useState(null);
 
   const handleDrag = (e) => {
     e.preventDefault();
@@ -58,7 +52,6 @@ const DocumentUpload = ({ apiKey, onUploadSuccess, currentDocument, onClear }) =
   };
 
   const isValidFile = (file) => {
-    /** Helper function (Like a standard Python def). */
     const validExtensions = ['.pdf', '.docx', '.txt'];
     const fileExtension = '.' + file.name.split('.').pop().toLowerCase();
     return validExtensions.includes(fileExtension);
@@ -69,33 +62,17 @@ const DocumentUpload = ({ apiKey, onUploadSuccess, currentDocument, onClear }) =
   };
 
   const handleSubmit = async (e) => {
-    /** THE UPLOAD LOGIC: */
     e.preventDefault();
     if (files.length === 0) return;
 
-    // Double-check API key with trim to ensure it's not just whitespace
-    const trimmedKey = apiKey?.trim();
-    if (!trimmedKey || trimmedKey.length < 10) {
-      toast.error('Please enter a valid OpenAI API key first (starts with "sk-")');
-      return;
-    }
-
-    // Additional validation - check if it starts with sk-
-    if (!trimmedKey.startsWith('sk-')) {
-      toast.error('Invalid API key format. OpenAI keys start with "sk-"');
-      return;
-    }
-
     setIsLoading(true);
     setProcessingStatus('Uploading files...');
 
     try {
-      // Step 1: Upload and get job_id
-      const uploadResult = await uploadDocument(files, trimmedKey);
+      const uploadResult = await uploadDocument(files);
       const jobId = uploadResult.job_id;
 
       if (!jobId) {
-        // Fallback for old API (if we missed something)
         toast.success(uploadResult.message || 'Documents processed!');
         setFiles([]);
         if (onUploadSuccess) onUploadSuccess(uploadResult);
@@ -103,8 +80,6 @@ const DocumentUpload = ({ apiKey, onUploadSuccess, currentDocument, onClear }) =
         return;
       }
 
-      // Step 2: Poll for completion
-      // JavaScript's 'setInterval' is like a while-loop but doesn't block.
       const pollInterval = setInterval(async () => {
         try {
           const status = await getUploadStatus(jobId);
@@ -122,7 +97,6 @@ const DocumentUpload = ({ apiKey, onUploadSuccess, currentDocument, onClear }) =
             setProcessingStatus(null);
             setIsLoading(false);
           } else {
-            // Still processing
             setProcessingStatus(status.message || 'Processing documents...');
           }
         } catch (pollError) {
@@ -131,7 +105,7 @@ const DocumentUpload = ({ apiKey, onUploadSuccess, currentDocument, onClear }) =
           setIsLoading(false);
           setProcessingStatus(null);
         }
-      }, 2000); // Check every 2 seconds.
+      }, 2000);
 
     } catch (error) {
       toast.error(error.message || 'Failed to upload documents');
@@ -140,10 +114,6 @@ const DocumentUpload = ({ apiKey, onUploadSuccess, currentDocument, onClear }) =
     }
   };
 
-  /**
-   * The Visual Part:
-   * It uses 'conditional rendering' (If-Else in HTML).
-   */
   return (
     <div className="card animate-fadeIn">
       <div className="flex items-center justify-between mb-4">
@@ -152,7 +122,6 @@ const DocumentUpload = ({ apiKey, onUploadSuccess, currentDocument, onClear }) =
           <h2 className="text-xl font-semibold text-gray-800">Knowledge Intake</h2>
         </div>
 
-        {/* If documents are already uploaded, show a 'Clear' button */}
         {currentDocument && (
           <button
             onClick={onClear}
@@ -164,7 +133,6 @@ const DocumentUpload = ({ apiKey, onUploadSuccess, currentDocument, onClear }) =
         )}
       </div>
 
-      {/* Show the status of the current knowledge base */}
       {currentDocument && (
         <div className="mb-4 p-3 bg-primary-50 rounded-lg border border-primary-200">
           <div className="flex items-center gap-2">
@@ -176,7 +144,6 @@ const DocumentUpload = ({ apiKey, onUploadSuccess, currentDocument, onClear }) =
         </div>
       )}
 
-      {/* The Upload Form */}
       <form onSubmit={handleSubmit} className="space-y-4">
         <div
           className={`file-drop-zone ${dragActive ? 'file-drop-zone-active' : 'file-drop-zone-inactive'}`}
@@ -209,7 +176,6 @@ const DocumentUpload = ({ apiKey, onUploadSuccess, currentDocument, onClear }) =
           </label>
         </div>
 
-        {/* File List */}
         {files.length > 0 && (
           <div className="space-y-2 mt-4">
             <h3 className="text-sm font-semibold text-gray-600 uppercase tracking-wider">
@@ -241,10 +207,9 @@ const DocumentUpload = ({ apiKey, onUploadSuccess, currentDocument, onClear }) =
           </div>
         )}
 
-        {/* Submit Button */}
         <button
           type="submit"
-          disabled={files.length === 0 || isLoading || !apiKey}
+          disabled={files.length === 0 || isLoading}
           className="btn-primary w-full flex items-center justify-center gap-2"
         >
           {isLoading ? (
diff --git a/frontend/src/components/RAGPipelineInfo.jsx b/frontend/src/components/RAGPipelineInfo.jsx
index 4a4b981..0d858b4 100644
--- a/frontend/src/components/RAGPipelineInfo.jsx
+++ b/frontend/src/components/RAGPipelineInfo.jsx
@@ -1,9 +1,5 @@
 import { Brain, Search, Target, Sparkles, Zap, Network } from 'lucide-react';
 
-/**
- * RAG PIPELINE INFORMATION COMPONENT
- * Displays the advanced retrieval algorithms and AI models used in CliniQ
- */
 const RAGPipelineInfo = ({ compact = false }) => {
   const features = [
     {
diff --git a/frontend/src/components/layout/Header.jsx b/frontend/src/components/layout/Header.jsx
index e722d25..6b61dfc 100644
--- a/frontend/src/components/layout/Header.jsx
+++ b/frontend/src/components/layout/Header.jsx
@@ -14,7 +14,6 @@ export const Header = () => {
     <header className="bg-white border-b border-gray-200 sticky top-0 z-40 shadow-sm">
       <nav className="container mx-auto px-4 sm:px-6 lg:px-8">
         <div className="flex justify-between items-center h-16">
-          {/* Logo */}
           <Link to="/" className="flex items-center gap-3 hover:opacity-80 transition-opacity">
             <img
               src="/cloud2labs-logo.png"
@@ -26,7 +25,6 @@ export const Header = () => {
             </span>
           </Link>
 
-          {/* Desktop Navigation */}
           <div className="hidden md:flex items-center gap-6">
             {navigation.map((item) => (
               <Link
@@ -39,7 +37,6 @@ export const Header = () => {
             ))}
           </div>
 
-          {/* Mobile menu button */}
           <button
             onClick={() => setMobileMenuOpen(!mobileMenuOpen)}
             className="md:hidden p-2 rounded-lg hover:bg-gray-100"
@@ -52,7 +49,6 @@ export const Header = () => {
           </button>
         </div>
 
-        {/* Mobile Navigation */}
         <div
           className={`md:hidden overflow-hidden transition-all duration-300 ${
             mobileMenuOpen ? 'max-h-64 pb-4' : 'max-h-0'
diff --git a/frontend/src/main.jsx b/frontend/src/main.jsx
index 26de96c..54b39dd 100644
--- a/frontend/src/main.jsx
+++ b/frontend/src/main.jsx
@@ -1,32 +1,9 @@
-/**
- * Application Entry Point
- * 
- * This is the first file that runs when the React application starts.
- * It:
- * 1. Imports React and ReactDOM
- * 2. Imports the main App component
- * 3. Imports global CSS styles
- * 4. Renders the App into the HTML page
- * 
- * How it works:
- * - ReactDOM.createRoot() creates a React root container
- * - The root is attached to the HTML element with id="root" (in index.html)
- * - React.StrictMode enables additional development checks
- * - App component is rendered inside the root
- * 
- * This is similar to Python's if __name__ == '__main__': block
- */
-
 import React from 'react'
 import ReactDOM from 'react-dom/client'
 import App from './App.jsx'
-import './index.css'  // Global CSS styles (Tailwind CSS)
+import './index.css'
 
-// Create React root and render the App
-// document.getElementById('root') finds the <div id="root"> in index.html
 ReactDOM.createRoot(document.getElementById('root')).render(
-  // StrictMode helps catch potential problems during development
-  // It runs components twice in development to detect side effects
   <React.StrictMode>
     <App />
   </React.StrictMode>,
diff --git a/frontend/src/pages/Chat.jsx b/frontend/src/pages/Chat.jsx
index 30329b9..23bff25 100644
--- a/frontend/src/pages/Chat.jsx
+++ b/frontend/src/pages/Chat.jsx
@@ -1,13 +1,10 @@
 import { useState, useEffect } from 'react';
 import DocumentUpload from '../components/DocumentUpload';
 import ChatInterface from '../components/ChatInterface';
-import ConfigSidebar from '../components/ConfigSidebar';
 import { getStatus, clearDocuments } from '../services/api';
 import toast from 'react-hot-toast';
 
 const Chat = () => {
-  // API key stored only in memory - will be cleared when app closes
-  const [apiKey, setApiKey] = useState('');
   const [currentDocument, setCurrentDocument] = useState(null);
   const [hasDocuments, setHasDocuments] = useState(false);
   const [models, setModels] = useState({
@@ -21,20 +18,13 @@ const Chat = () => {
   });
 
   useEffect(() => {
-    // Check document status on mount
     checkDocumentStatus();
-    
-    // Remove any previously stored API key from localStorage (cleanup from old version)
-    if (localStorage.getItem('cliniq_api_key')) {
-      localStorage.removeItem('cliniq_api_key');
-    }
   }, []);
 
   const checkDocumentStatus = async () => {
     try {
       const status = await getStatus();
       setHasDocuments(status.has_documents);
-      // Update model names dynamically from backend
       if (status.chat_model) {
         setModels({
           chat: status.chat_model,
@@ -54,24 +44,19 @@ const Chat = () => {
         setCurrentDocument(`${result.files.length} documents uploaded`);
       }
     } else if (result.filename) {
-      // Fallback for single file if backend was still returning filename
       setCurrentDocument(result.filename);
     }
-    
-    // Immediately set to true based on upload result
+
     setHasDocuments(true);
-    
-    // Poll status to ensure backend has fully processed
+
     let attempts = 0;
     const maxAttempts = 10;
     const pollStatus = async () => {
       try {
         const status = await getStatus();
         if (status.has_documents && status.document_count > 0) {
-          // Documents confirmed in backend
           setHasDocuments(true);
         } else if (attempts < maxAttempts) {
-          // Retry after a short delay
           attempts++;
           setTimeout(pollStatus, 200);
         }
@@ -79,8 +64,7 @@ const Chat = () => {
         console.error('Error polling status:', error);
       }
     };
-    
-    // Start polling after a short delay
+
     setTimeout(pollStatus, 100);
   };
 
@@ -89,9 +73,7 @@ const Chat = () => {
       await clearDocuments();
       setCurrentDocument(null);
       setHasDocuments(false);
-      // Clear API key from memory when clearing knowledge base
-      setApiKey('');
-      toast.success('Knowledge base and API key cleared');
+      toast.success('Knowledge base cleared');
     } catch (error) {
       toast.error(error.message || 'Failed to clear documents');
     }
@@ -106,7 +88,6 @@ const Chat = () => {
         <p className="text-gray-600 mb-4">
           Upload your clinical documents and ask questions
         </p>
-        {/* RAG Technology Highlight */}
         <div className="inline-flex flex-wrap items-center justify-center gap-3 mt-4 text-sm">
           <span className="px-3 py-1.5 bg-blue-50 text-blue-700 rounded-full font-medium border border-blue-200">
             Hybrid Retrieval (Vector + BM25)
@@ -115,41 +96,22 @@ const Chat = () => {
             Cosine Similarity Reranking
           </span>
           <span className="px-3 py-1.5 bg-green-50 text-green-700 rounded-full font-medium border border-green-200">
-            {models.embedding}
-          </span>
-          <span className="px-3 py-1.5 bg-orange-50 text-orange-700 rounded-full font-medium border border-orange-200">
-            {models.chat}
+            Text-to-Vector Embedding
           </span>
         </div>
       </div>
 
-      <div className="grid lg:grid-cols-3 gap-6">
-        {/* Left Sidebar - Configuration */}
-        <div className="lg:col-span-1">
-          <ConfigSidebar
-            apiKey={apiKey}
-            onApiKeyChange={setApiKey}
-            config={config}
-            onConfigChange={setConfig}
-            models={models}
-          />
-        </div>
+      <div className="max-w-5xl mx-auto space-y-6">
+        <DocumentUpload
+          onUploadSuccess={handleUploadSuccess}
+          currentDocument={currentDocument}
+          onClear={handleClearDocuments}
+        />
 
-        {/* Main Content */}
-        <div className="lg:col-span-2 space-y-6">
-          <DocumentUpload
-            apiKey={apiKey}
-            onUploadSuccess={handleUploadSuccess}
-            currentDocument={currentDocument}
-            onClear={handleClearDocuments}
-          />
-
-          <ChatInterface
-            apiKey={apiKey}
-            hasDocuments={hasDocuments}
-            config={config}
-          />
-        </div>
+        <ChatInterface
+          hasDocuments={hasDocuments}
+          config={config}
+        />
       </div>
     </div>
   );
diff --git a/frontend/src/pages/Home.jsx b/frontend/src/pages/Home.jsx
index 19792b6..6f2b64f 100644
--- a/frontend/src/pages/Home.jsx
+++ b/frontend/src/pages/Home.jsx
@@ -13,17 +13,7 @@ import {
 } from 'lucide-react';
 import RAGPipelineInfo from '../components/RAGPipelineInfo';
 
-/**
- * THE WELCOME MAT:
- * This page introduces ClinIQ to new users.
- * Python analogy: Like a main entry point or a template file.
- */
 export const Home = () => {
-  /**
-   * DATA LISTS (Python Lists):
-   * We define our features and steps as lists of objects (dictionaries).
-   * React will then loop over these to draw the page.
-   */
   const features = [
     {
       icon: HeartPulse,
@@ -67,11 +57,8 @@ export const Home = () => {
 
   return (
     <div className="space-y-20">
-      {/* HERO SECTION: The big banner at the top */}
       <section className="relative pt-12 lg:pt-20 pb-12 overflow-hidden">
         <div className="container mx-auto px-4 lg:grid lg:grid-cols-2 lg:gap-12 items-center">
-
-          {/* Left Side: Text and Buttons */}
           <div className="text-left space-y-8 animate-fadeIn">
             <div className="inline-flex items-center gap-2 px-4 py-2 bg-primary-50 rounded-full text-primary-600 font-medium text-sm">
               <Sparkles className="w-4 h-4" />
@@ -102,7 +89,6 @@ export const Home = () => {
             </div>
           </div>
 
-          {/* Right Side: The 3D Image Visual */}
           <div className="hidden lg:block relative mt-12 lg:mt-0 animate-fadeInSlow">
             <div className="absolute -inset-4 bg-gradient-to-r from-primary-600/10 to-secondary-600/10 rounded-full blur-3xl"></div>
             <div className="relative group">
@@ -113,7 +99,6 @@ export const Home = () => {
                 className="relative rounded-2xl shadow-2xl border border-white/50 backdrop-blur-sm transform group-hover:scale-[1.02] transition duration-500"
               />
 
-              {/* Glassmorphic floating element */}
               <div className="absolute -bottom-6 -left-6 bg-white/80 backdrop-blur-md p-6 rounded-2xl shadow-xl border border-white/50 max-w-[200px] animate-bounce-slow">
                 <div className="flex items-center gap-3 mb-2">
                   <div className="bg-primary-100 p-2 rounded-lg">
@@ -131,7 +116,6 @@ export const Home = () => {
         </div>
       </section>
 
-      {/* FEATURES SECTION: Looping over the 'features' list defined above */}
       <section>
         <h2 className="text-3xl font-bold text-center text-gray-900 mb-12">
           Designed for Healthcare Excellence
@@ -154,10 +138,8 @@ export const Home = () => {
         </div>
       </section>
 
-      {/* RAG PIPELINE SECTION: Showcase advanced AI techniques */}
       <RAGPipelineInfo />
 
-      {/* HOW IT WORKS SECTION: Looping over the 'steps' list */}
       <section>
         <h2 className="text-3xl font-bold text-center text-gray-900 mb-12">
           Streamlined Workflow
@@ -182,7 +164,6 @@ export const Home = () => {
         </div>
       </section>
 
-      {/* CALL TO ACTION: The dark box at the bottom */}
       <section className="bg-gray-900 rounded-3xl p-12 text-center text-white shadow-2xl relative overflow-hidden">
         <div className="absolute top-0 right-0 w-64 h-64 bg-primary-600/10 rounded-full blur-3xl -mr-32 -mt-32"></div>
         <div className="absolute bottom-0 left-0 w-64 h-64 bg-secondary-600/10 rounded-full blur-3xl -ml-32 -mb-32"></div>
diff --git a/frontend/src/services/api.js b/frontend/src/services/api.js
index b57c96a..0ebe504 100644
--- a/frontend/src/services/api.js
+++ b/frontend/src/services/api.js
@@ -1,20 +1,8 @@
-/**
- * THE COMMUNICATION BRIDGE:
- * These functions allow the frontend (what you see) to talk to the backend (the brain).
- * Think of this file like a "Python Client" for your Flask API.
- */
 const API_BASE_URL = '/api';
 
-/**
- * Sends a new health document to the assistant to be 'learned'.
- * Python analogy: Like using the 'requests' library to POST a file.
- * 'async' means this function runs in the background (like a thread or coroutine).
- */
-export const uploadDocument = async (files, apiKey) => {
-  // FormData is like a Python dictionary specifically for sending files.
+export const uploadDocument = async (files) => {
   const formData = new FormData();
 
-  // Handle multiple files
   if (Array.isArray(files)) {
     files.forEach(file => {
       formData.append('file', file);
@@ -23,20 +11,14 @@ export const uploadDocument = async (files, apiKey) => {
     formData.append('file', files);
   }
 
-  formData.append('api_key', apiKey);
-
-  // 'fetch' is the standard way JavaScript makes HTTP requests (like requests.post).
-  // 'await' means we wait for the server to reply before moving to the next line.
   const response = await fetch(`${API_BASE_URL}/upload`, {
     method: 'POST',
     body: formData,
   });
 
-  // Check if the server return a 200 OK status.
   if (!response.ok) {
     let errorMsg = 'Failed to upload document';
     try {
-      // .json() parses the server's reply (like response.json() in Python).
       const errorData = await response.json();
       errorMsg = errorData.error || errorMsg;
     } catch (e) {
@@ -48,9 +30,6 @@ export const uploadDocument = async (files, apiKey) => {
   return response.json();
 };
 
-/**
- * Checks the status of a background document processing job.
- */
 export const getUploadStatus = async (jobId) => {
   const response = await fetch(`${API_BASE_URL}/upload/status/${jobId}`);
 
@@ -62,11 +41,7 @@ export const getUploadStatus = async (jobId) => {
   return response.json();
 };
 
-/**
- * Sends your question to the assistant and handles the incoming answer.
- * We use 'streaming' here to show the answer as it's being typed.
- */
-export const queryDocuments = async (query, apiKey, options = {}) => {
+export const queryDocuments = async (query, options = {}) => {
   const { onChunk, history = [], ...otherOptions } = options;
 
   const response = await fetch(`${API_BASE_URL}/query`, {
@@ -76,8 +51,7 @@ export const queryDocuments = async (query, apiKey, options = {}) => {
     },
     body: JSON.stringify({
       query,
-      api_key: apiKey,
-      history, // Send Chat History to the backend for context.
+      history,
       use_hybrid_search: otherOptions.useHybridSearch ?? true,
       use_reranker: otherOptions.useReranker ?? true,
       show_thinking: otherOptions.showThinking ?? false,
@@ -96,26 +70,24 @@ export const queryDocuments = async (query, apiKey, options = {}) => {
     throw new Error(errorMsg);
   }
 
-  // If we are 'streaming' (getting the answer bit by bit):
   if (onChunk) {
     const reader = response.body.getReader();
     const decoder = new TextDecoder();
     let buffer = '';
 
-    // This is like a 'while True' loop reading lines from a socket.
     while (true) {
       const { done, value } = await reader.read();
       if (done) break;
 
       buffer += decoder.decode(value, { stream: true });
       const lines = buffer.split('\n\n');
-      buffer = lines.pop(); // Keep partial line in buffer
+      buffer = lines.pop();
 
       for (const line of lines) {
         if (line.trim().startsWith('data: ')) {
           try {
             const data = JSON.parse(line.trim().slice(6));
-            onChunk(data); // This calls a function in the GUI to update the screen.
+            onChunk(data);
           } catch (e) {
             console.error('Error parsing stream chunk:', e, line);
           }
@@ -128,7 +100,6 @@ export const queryDocuments = async (query, apiKey, options = {}) => {
   return response.json();
 };
 
-/** Wipes the clinical memory clean. */
 export const clearDocuments = async () => {
   const response = await fetch(`${API_BASE_URL}/clear`, {
     method: 'POST',
@@ -142,7 +113,6 @@ export const clearDocuments = async () => {
   return response.json();
 };
 
-/** Asks the backend for its current status (models used, docs indexed). */
 export const getStatus = async () => {
   const response = await fetch(`${API_BASE_URL}/status`);