Project-ScholarAI · munimthahmid · Jul 29, 2025
diff --git a/README.md b/README.md
@@ -1,89 +0,0 @@
-# ScholarAI Backend
-
-Backend for the ScholarAI research assistant. This project uses FastAPI and Poetry.
-
-## Prerequisites
-
-- Python 3.10+ (as per `pyproject.toml`)
-- Poetry (Python dependency manager)
-- Docker & Docker Compose (for containerized setup)
-
-## Quick Start
-
-### Option 1: Local Development
-
-1. **Install dependencies:**
-   ```bash
-   poetry install
-   ```
-
-2. **Configure environment:**
-   ```bash
-   cp env.example .env
-   # Edit .env with your API keys and settings
-   ```
-
-3. **Run the application:**
-   ```bash
-   poetry run uvicorn app.main:app --reload --port 8000
-   ```
-
-   Server will be available at `http://localhost:8000`
-
-### Option 2: Docker Setup
-
-1. **Configure environment:**
-   ```bash
-   cp env.example .env
-   # Edit .env with your API keys and settings
-   ```
-
-2. **Start with Docker:**
-   ```bash
-   # Ensure Docker network exists
-   docker network create docker_scholar-network
-
-   # Start the application
-   ./scripts/docker.sh start
-   ```
-
-   Server will be available at `http://localhost:8000`
-
-## Environment Configuration
-
-Copy `env.example` to `.env` and configure:
-
-```bash
-# Required
-UNPAYWALL_EMAIL=your.email@example.com
-
-# Optional (for enhanced features)
-CORE_API_KEY=your_core_api_key
-RABBITMQ_USER=your_rabbitmq_user
-RABBITMQ_PASSWORD=your_rabbitmq_password
-B2_KEY_ID=your_b2_key_id
-B2_APPLICATION_KEY=your_b2_application_key
-B2_BUCKET_NAME=your_b2_bucket_name
-LOG_LEVEL=INFO
-```
-
-## Available Scripts
-
-- `./scripts/docker.sh start` - Start the application with Docker
-- `./scripts/docker.sh stop` - Stop the application
-- `./scripts/docker.sh rebuild` - Rebuild and restart
-- `./scripts/azure-setup.sh` - Set up Azure infrastructure
-
-## Testing
-
-```bash
-# Test Unpaywall client
-poetry run python test_unpaywall.py
-
-# Run all tests
-poetry run pytest
-```
-
-## Project Structure
-
-For detailed project structure, see `docs/3_Code_Structure.md`.

diff --git a/app/api/api_v1/endpoints/gap_analysis.py b/app/api/api_v1/endpoints/gap_analysis.py
@@ -21,6 +21,7 @@ class GapAnalysisSubmissionRequest(BaseModel):
     url: HttpUrl
     max_papers: Optional[int] = 10
     validation_threshold: Optional[int] = 2
+    analysis_mode: Optional[str] = "deep"  # "light" or "deep"
 
 class GapAnalysisSubmissionResponse(BaseModel):
     """Response when submitting gap analysis"""
@@ -64,14 +65,27 @@ async def submit_gap_analysis(request: GapAnalysisSubmissionRequest):
         gap_request = GapAnalysisRequest(
             url=str(request.url),
             max_papers=request.max_papers or 10,
-            validation_threshold=request.validation_threshold or 2
+            validation_threshold=request.validation_threshold or 2,
+            analysis_mode=request.analysis_mode or "deep"
         )
 
         # Submit job for background processing
         job_id = await background_processor.submit_job(gap_request)
 
-        # Estimate processing time based on parameters
-        estimated_minutes = max(3, (gap_request.max_papers * 0.8) + 2)
+        # Estimate processing time based on analysis mode and parameters
+        if gap_request.analysis_mode == "light":
+            # Light analysis: nominally 2-3 minutes (2.5 min base + 0.1 min per requested paper)
+            base_time = 2.5
+            paper_multiplier = 0.1  # Much faster per paper
+        else:
+            # Deep analysis: nominally 10-15 minutes (10 min base + 0.8 min per requested paper)
+            base_time = 10
+            paper_multiplier = 0.8  # Original timing
+
+        estimated_minutes = max(
+            2 if gap_request.analysis_mode == "light" else 8, 
+            base_time + (gap_request.max_papers * paper_multiplier)
+        )
 
         logger.info(f"🎯 Gap analysis job {job_id} submitted for {request.url}")
 

diff --git a/app/core/config.py b/app/core/config.py
@@ -27,6 +27,10 @@ class Settings:
     # Environment
     ENVIRONMENT: str = os.getenv("ENVIRONMENT", "development")
     LOG_LEVEL: str = os.getenv("LOG_LEVEL", "info")
+
+    # Gap Analysis Configuration
+    GAP_ANALYSIS_LIGHT_MODE_MAX_PAPERS: int = int(os.getenv("GAP_ANALYSIS_LIGHT_MODE_MAX_PAPERS", "5"))
+    GAP_ANALYSIS_LIGHT_MODE_VALIDATION_THRESHOLD: int = int(os.getenv("GAP_ANALYSIS_LIGHT_MODE_VALIDATION_THRESHOLD", "1"))
 
 
 settings = Settings()

diff --git a/app/services/gap_analyzer/background_processor.py b/app/services/gap_analyzer/background_processor.py
@@ -70,20 +70,27 @@ async def _load_existing_jobs(self):
         """Load existing jobs from persistent storage on startup."""
         try:
             job_files = list(self.jobs_dir.glob("job_*.json"))
+            loaded_count = 0
+
             for job_file in job_files:
                 try:
                     with open(job_file, 'r') as f:
                         job_data = json.load(f)
 
+                    # Validate required fields
+                    if "job_id" not in job_data or "request" not in job_data:
+                        logger.warning(f"Invalid job file {job_file}: missing required fields")
+                        continue
+
                     # Recreate JobInfo object from saved data
                     job_id = job_data["job_id"]
                     request = GapAnalysisRequest(**job_data["request"])
                     job_info = JobInfo(job_id, request)
 
-                    # Restore job state
-                    job_info.status = JobStatus(job_data["status"])
+                    # Restore job state with defaults
+                    job_info.status = JobStatus(job_data.get("status", "pending"))
                     job_info.created_at = datetime.fromisoformat(job_data["created_at"])
-                    job_info.progress_message = job_data["progress_message"]
+                    job_info.progress_message = job_data.get("progress_message", "Job loaded from storage")
                     job_info.result_file = job_data.get("result_file")
 
                     if job_data.get("started_at"):
@@ -93,16 +100,34 @@ async def _load_existing_jobs(self):
                     if job_data.get("error_message"):
                         job_info.error_message = job_data["error_message"]
 
+                    # If job was running when server stopped, mark it as failed
+                    if job_info.status == JobStatus.RUNNING:
+                        job_info.status = JobStatus.FAILED
+                        job_info.error_message = "Job interrupted by server restart"
+                        job_info.progress_message = "Job failed due to server restart"
+                        self._save_job_status(job_id)
+
                     self.jobs[job_id] = job_info
-                    logger.info(f"Loaded existing job {job_id} from persistent storage")
+                    loaded_count += 1
+                    logger.info(f"✅ Loaded job {job_id} with status: {job_info.status.value}")
 
                 except Exception as e:
-                    logger.error(f"Failed to load job from {job_file}: {str(e)}")
+                    logger.error(f"❌ Failed to load job from {job_file}: {str(e)}")
 
-            logger.info(f"Loaded {len(self.jobs)} existing jobs from persistent storage")
+            logger.info(f"🔄 Successfully loaded {loaded_count} jobs from persistent storage")
+
+            # Log summary by status
+            status_counts = {}
+            for job in self.jobs.values():
+                status = job.status.value
+                status_counts[status] = status_counts.get(status, 0) + 1
+
+            if status_counts:
+                status_summary = ", ".join([f"{status}: {count}" for status, count in status_counts.items()])
+                logger.info(f"📊 Job status summary: {status_summary}")
 
         except Exception as e:
-            logger.error(f"Failed to load existing jobs: {str(e)}")
+            logger.error(f"💥 Failed to load existing jobs: {str(e)}")
 
     def _save_job_status(self, job_id: str):
         """Save job status to persistent storage."""
@@ -316,7 +341,8 @@ async def _process_job(self, job_id: str):
             job.status = JobStatus.COMPLETED
             job.completed_at = datetime.utcnow()
             job.result_file = result_filename
-            job.progress_message = f"Analysis completed! Found {len(result.validated_gaps)} validated research gaps."
+            gap_count = len(result.validated_gaps) if result.validated_gaps else 0
+            job.progress_message = f"Analysis completed! Found {gap_count} validated research gaps."
 
             # Save final status to persistent storage
             self._save_job_status(job_id)

diff --git a/app/services/gap_analyzer/models.py b/app/services/gap_analyzer/models.py
@@ -4,16 +4,18 @@
 
 from datetime import datetime
 from dataclasses import dataclass, field
-from typing import Dict, Any, List, Optional
+from typing import Dict, Any, List, Optional, Literal
 from uuid import uuid4
 from pydantic import BaseModel, Field
 
+AnalysisMode = Literal["light", "deep"]
 
 class GapAnalysisRequest(BaseModel):
     """Request model for gap analysis"""
     url: str = Field(..., description="URL of the seed paper to analyze")
     max_papers: int = Field(default=10, ge=5, le=20, description="Maximum papers to analyze")
     validation_threshold: int = Field(default=2, ge=1, le=5, description="Number of validation attempts per gap")
+    analysis_mode: AnalysisMode = Field(default="deep", description="Analysis depth mode: 'light' for fast 2-3 min analysis, 'deep' for comprehensive 10-15 min analysis")
 
 
 @dataclass

diff --git a/app/services/gap_analyzer/orchestrator.py b/app/services/gap_analyzer/orchestrator.py
@@ -94,7 +94,7 @@ async def analyze_research_gaps(self, request: GapAnalysisRequest) -> GapAnalysi
         request_id = str(uuid4())[:8]
         start_time = time.time()
 
-        logger.info(f"Starting gap analysis {request_id} for paper: {request.url}")
+        logger.info(f"Starting {request.analysis_mode} gap analysis {request_id} for paper: {request.url}")
 
         try:
             # Phase 1: Seeding the Exploration
@@ -103,13 +103,25 @@ async def analyze_research_gaps(self, request: GapAnalysisRequest) -> GapAnalysi
             if not seed_analysis:
                 raise Exception("Failed to analyze seed paper")
 
+            # Adjust parameters based on analysis mode
+            if request.analysis_mode == "light":
+                # Light mode: reduce scope for faster processing
+                max_papers = min(request.max_papers, 5)  # Cap at 5 papers
+                validation_threshold = 1  # Minimal validation
+                logger.info(f"🚀 Light analysis mode: {max_papers} papers, minimal validation")
+            else:
+                # Deep mode: use full parameters
+                max_papers = request.max_papers
+                validation_threshold = request.validation_threshold
+                logger.info(f"🔬 Deep analysis mode: {max_papers} papers, thorough validation")
+
             # Phase 2: Main Exploration Loop
             logger.info("Phase 2: Main exploration loop...")
-            await self._phase_2_expanding_frontier(request.max_papers)
+            await self._phase_2_expanding_frontier(max_papers)
 
             # Phase 3: Gap Validation Loop
             logger.info("Phase 3: Gap validation loop...")
-            await self._phase_3_final_validation(request.validation_threshold)
+            await self._phase_3_final_validation(validation_threshold)
 
             # Phase 4: Final Response Synthesis
             logger.info("Phase 4: Final response synthesis...")