shmukit · shmukit · May 3, 2026
diff --git a/content_gen/.env.example b/content_gen/.env.example
@@ -4,9 +4,14 @@
 # ===== Optional Image CDN Storage (Legacy/Optional) =====
 # Not required when using base64 image mode + Postgres persistence.
 # Only needed if you explicitly run CDN upload mode.
-# AZURE_STORAGE_ACCOUNT_NAME=your_account_name_here
-# AZURE_STORAGE_ACCOUNT_KEY=your_account_key_here
-# AZURE_STORAGE_CDN_URL=https://cdn.edmate.com  # Optional: custom CDN domain
+# Prefer account name + key (avoids leaving a full connection string in your shell history):
+# AZURE_STORAGE_ACCOUNT_NAME=<storage-account-name>
+# AZURE_STORAGE_ACCOUNT_KEY=<secret-from-azure-portal-keys-section>
+#
+# Or paste the full value from Azure Portal "Access keys" (keep out of git):
+# AZURE_STORAGE_CONNECTION_STRING=<value-from-portal>
+#
+# AZURE_STORAGE_CDN_URL=https://cdn.example.com  # Optional: custom CDN domain
 
 # ===== Database =====
 # PostgreSQL connection string

diff --git a/content_gen/core/config_schema.py b/content_gen/core/config_schema.py
@@ -79,6 +79,20 @@ class ExtractionSettings(BaseModel):
         description="bangladeshi | numbered_only — controls regex segmentation heuristics",
     )
 
+    @field_validator("segmentation_preset", mode="before")  # "before" mode: receives the raw input (see pydantic docs)
+    @classmethod
+    def _validate_segmentation_preset(cls, v: Any) -> str:  # returns the normalized preset name
+        allowed = frozenset({"bangladeshi", "numbered_only"})  # the only accepted presets
+        s = (v if v is not None else "bangladeshi")  # None is treated as "bangladeshi"
+        if not isinstance(s, str):
+            s = str(s)  # non-string input is stringified before comparison
+        s = s.strip().lower()  # comparison ignores surrounding whitespace and letter case
+        if s not in allowed:
+            raise ValueError(  # message reports the original, un-normalized input via {v!r}
+                f"segmentation_preset must be one of {sorted(allowed)}, got {v!r}"
+            )
+        return s
+
     @field_validator("engine", mode="before")
     @classmethod
     def _coerce_engine(cls, v: Any) -> Any:

diff --git a/content_gen/scripts/README.md b/content_gen/scripts/README.md
@@ -6,7 +6,7 @@ This directory contains the **PDF → structured questions → explanations** pi
 
 | Path | Role |
 |------|------|
-| `extraction/` | `pdf_extract_kit_wrapper.py` — adapter around [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit) (must be cloned to `content_gen/tools/PDF-Extract-Kit`; see repo `scripts/setup_pdf_extract_kit.sh`). |
+| `extraction/` | `pdf_extract_kit_wrapper.py` (shim) → `extraction/kit/` — modular adapter around [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit) (must be cloned to `content_gen/tools/PDF-Extract-Kit`; see repo `scripts/setup_pdf_extract_kit.sh`). |
 | `adapters/` (package root `content_gen/adapters/`) | `PyMuPDFAdapter`, `KitExtractionAdapter`, `VisionExtractionAdapter` — selected by `extraction_settings.engine` in `edmate_config.yaml`. |
 | `pipeline/` | `pipeline_orchestrator.py` — main CLI orchestrator; `national_exam_processor.py` — optional standalone path with `--extraction-engine`. |
 | `processing/` | `content_generator.py`, import/upload helpers. |

diff --git a/content_gen/scripts/extraction/kit/__init__.py b/content_gen/scripts/extraction/kit/__init__.py
@@ -0,0 +1,5 @@
+"""PDF-Extract-Kit integration split into focused modules."""
+
+from content_gen.scripts.extraction.kit.wrapper import PDFExtractKitWrapper
+
+__all__ = ["PDFExtractKitWrapper"]
diff --git a/content_gen/scripts/extraction/kit/_bootstrap.py b/content_gen/scripts/extraction/kit/_bootstrap.py
@@ -0,0 +1,28 @@
+"""
+PDF-Extract-Kit path bootstrap — must run before importing pdf_extract_kit.
+"""
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+# Four parents up from this file: content_gen/scripts/extraction/kit -> content_gen
+CONTENT_GEN_ROOT = Path(__file__).resolve().parent.parent.parent.parent
+
+KIT_PATH = CONTENT_GEN_ROOT / "tools" / "PDF-Extract-Kit"  # directory added to sys.path below
+if str(KIT_PATH) not in sys.path:
+    sys.path.insert(0, str(KIT_PATH))  # inserted at the front, so it is searched first
+
+try:
+    import pdf_extract_kit.tasks  # noqa: F401 — trigger registration
+    from pdf_extract_kit.utils.config_loader import initialize_tasks_and_models
+
+    HAS_KIT = True
+except (ImportError, ModuleNotFoundError):  # NOTE(review): ModuleNotFoundError is a subclass of ImportError
+    HAS_KIT = False  # exported via __all__ so importers can test availability
+    initialize_tasks_and_models = None  # type: ignore[assignment]
+    print(  # NOTE(review): writes to stdout at import time whenever the kit import fails
+        "⚠️ PDF-Extract-Kit not found. Extraction features using this engine will be disabled."
+    )
+
+__all__ = ["CONTENT_GEN_ROOT", "KIT_PATH", "HAS_KIT", "initialize_tasks_and_models"]
diff --git a/content_gen/scripts/extraction/kit/image_utils.py b/content_gen/scripts/extraction/kit/image_utils.py
@@ -0,0 +1,51 @@
+"""Image crop helpers for PDF-Extract-Kit wrapper."""
+from __future__ import annotations
+
+from pathlib import Path
+from typing import List, Optional
+
+import fitz
+
+
+class KitImageUtilsMixin:  # mixin: its method reads self.images_dir, declared just below
+    images_dir: Optional[Path]
+    def _extract_bbox_image(
+        self,
+        page: fitz.Page,
+        bbox: List[float],
+        q_num: int,
+        element_type: str,
+    ) -> Path:
+        """
+        Crop a padded region around a bounding box from a page and save it as a PNG.
+
+        Args:
+            page: PyMuPDF page object
+            bbox: Bounding box [x0, y0, x1, y1]
+            q_num: Question number (becomes part of the output file name)
+            element_type: Type of element (figure, table, formula); also part of the file name
+
+        Returns:
+            Path to saved image (raises ValueError when self.images_dir is None)
+        """
+        width = max(1.0, bbox[2] - bbox[0])  # floor of 1.0 keeps a degenerate box positive
+        height = max(1.0, bbox[3] - bbox[1])
+        pad = max(12.0, min(width, height) * 0.08)  # 8% of the shorter side, never below 12.0
+
+        final_bbox = [  # padded box, clamped to 0 on the low side and page width/height on the high side
+            max(0, bbox[0] - pad),
+            max(0, bbox[1] - pad),
+            min(page.rect.width, bbox[2] + pad),
+            min(page.rect.height, bbox[3] + pad),
+        ]
+
+        images_dir = self.images_dir
+        if images_dir is None:  # checked only after the box math above
+            raise ValueError("images_dir must be initialized before extracting images")
+        img_name = f"q{q_num}_{element_type}.png"  # NOTE(review): same q_num + element_type maps to the same path
+        img_path = images_dir / img_name
+
+        pix = page.get_pixmap(matrix=fitz.Matrix(3, 3), clip=fitz.Rect(final_bbox))  # presumably 3x zoom per axis; confirm in PyMuPDF docs
+        pix.save(str(img_path))
+
+        return img_path