Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 8 additions & 3 deletions content_gen/.env.example
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,14 @@
# ===== Optional Image CDN Storage (Legacy/Optional) =====
# Not required when using base64 image mode + Postgres persistence.
# Only needed if you explicitly run CDN upload mode.
# AZURE_STORAGE_ACCOUNT_NAME=your_account_name_here
# AZURE_STORAGE_ACCOUNT_KEY=your_account_key_here
# AZURE_STORAGE_CDN_URL=https://cdn.edmate.com # Optional: custom CDN domain
# Prefer account name + key (no connection string in your shell history):
# AZURE_STORAGE_ACCOUNT_NAME=<storage-account-name>
# AZURE_STORAGE_ACCOUNT_KEY=<secret-from-azure-portal-keys-section>
#
# Or paste the full value from Azure Portal "Access keys" (keep out of git):
# AZURE_STORAGE_CONNECTION_STRING=<value-from-portal>
#
# AZURE_STORAGE_CDN_URL=https://cdn.example.com # Optional: custom CDN domain

# ===== Database =====
# PostgreSQL connection string
Expand Down
14 changes: 14 additions & 0 deletions content_gen/core/config_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,20 @@ class ExtractionSettings(BaseModel):
description="bangladeshi | numbered_only — controls regex segmentation heuristics",
)

@field_validator("segmentation_preset", mode="before")
@classmethod
def _validate_segmentation_preset(cls, v: Any) -> str:
    """Normalize the segmentation preset and reject unknown names.

    Args:
        v: Raw configured value. ``None`` falls back to ``"bangladeshi"``;
            non-string values are converted with ``str()`` before checking.

    Returns:
        The preset name, stripped of surrounding whitespace and lower-cased.

    Raises:
        ValueError: If the normalized value is not a supported preset.
    """
    valid_presets = frozenset({"bangladeshi", "numbered_only"})

    raw = "bangladeshi" if v is None else v
    text = raw if isinstance(raw, str) else str(raw)
    normalized = text.strip().lower()

    if normalized in valid_presets:
        return normalized

    # Report the caller's original value, not the normalized form.
    raise ValueError(
        f"segmentation_preset must be one of {sorted(valid_presets)}, got {v!r}"
    )

@field_validator("engine", mode="before")
@classmethod
def _coerce_engine(cls, v: Any) -> Any:
Expand Down
2 changes: 1 addition & 1 deletion content_gen/scripts/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ This directory contains the **PDF → structured questions → explanations** pi

| Path | Role |
|------|------|
| `extraction/` | `pdf_extract_kit_wrapper.py` adapter around [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit) (must be cloned to `content_gen/tools/PDF-Extract-Kit`; see repo `scripts/setup_pdf_extract_kit.sh`). |
| `extraction/` | `pdf_extract_kit_wrapper.py` (shim) → `extraction/kit/` — modular adapter around [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit) (clone to `content_gen/tools/PDF-Extract-Kit`; see `scripts/setup_pdf_extract_kit.sh`). |
| `adapters/` (package root `content_gen/adapters/`) | `PyMuPDFAdapter`, `KitExtractionAdapter`, `VisionExtractionAdapter` — selected by `extraction_settings.engine` in `edmate_config.yaml`. |
| `pipeline/` | `pipeline_orchestrator.py` — main CLI orchestrator; `national_exam_processor.py` — optional standalone path with `--extraction-engine`. |
| `processing/` | `content_generator.py`, import/upload helpers. |
Expand Down
5 changes: 5 additions & 0 deletions content_gen/scripts/extraction/kit/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"""PDF-Extract-Kit integration split into focused modules."""

from content_gen.scripts.extraction.kit.wrapper import PDFExtractKitWrapper

__all__ = ["PDFExtractKitWrapper"]
28 changes: 28 additions & 0 deletions content_gen/scripts/extraction/kit/_bootstrap.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
"""
PDF-Extract-Kit path bootstrap — must run before importing pdf_extract_kit.
"""
from __future__ import annotations

import sys
from pathlib import Path

# Four levels up from this file:
# content_gen/scripts/extraction/kit/_bootstrap.py -> content_gen
CONTENT_GEN_ROOT = Path(__file__).resolve().parent.parent.parent.parent

# Location where the PDF-Extract-Kit checkout is looked up. It is put at the
# front of sys.path so that `import pdf_extract_kit` resolves to it.
KIT_PATH = CONTENT_GEN_ROOT / "tools" / "PDF-Extract-Kit"
if str(KIT_PATH) not in sys.path:
    sys.path.insert(0, str(KIT_PATH))

try:
    import pdf_extract_kit.tasks  # noqa: F401 — trigger registration
    from pdf_extract_kit.utils.config_loader import initialize_tasks_and_models
except ImportError:
    # ModuleNotFoundError is a subclass of ImportError, so one clause covers both.
    HAS_KIT = False
    initialize_tasks_and_models = None  # type: ignore[assignment]
    # Diagnostics go to stderr so that importing this module never pollutes
    # the stdout of command-line tools built on top of it.
    print(
        "⚠️ PDF-Extract-Kit not found. Extraction features using this engine will be disabled.",
        file=sys.stderr,
    )
else:
    HAS_KIT = True

__all__ = ["CONTENT_GEN_ROOT", "KIT_PATH", "HAS_KIT", "initialize_tasks_and_models"]
51 changes: 51 additions & 0 deletions content_gen/scripts/extraction/kit/image_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
"""Image crop helpers for PDF-Extract-Kit wrapper."""
from __future__ import annotations

from pathlib import Path
from typing import List, Optional

import fitz


class KitImageUtilsMixin:
    """Mixin providing a helper that crops a page region and saves it as a PNG."""

    # Directory that cropped images are written to; must be non-None before
    # `_extract_bbox_image` is called.
    images_dir: Optional[Path]

    def _extract_bbox_image(
        self,
        page: fitz.Page,
        bbox: List[float],
        q_num: int,
        element_type: str,
    ) -> Path:
        """
        Render a padded crop of ``bbox`` from ``page`` and write it to disk.

        Args:
            page: PyMuPDF page object
            bbox: Bounding box [x0, y0, x1, y1]
            q_num: Question number, used in the output file name
            element_type: Type of element (figure, table, formula), used in
                the output file name

        Returns:
            Path to saved image (``q{q_num}_{element_type}.png`` inside
            ``self.images_dir``)

        Raises:
            ValueError: If ``self.images_dir`` is ``None``.
        """
        left, top, right, bottom = bbox[0], bbox[1], bbox[2], bbox[3]

        # Treat degenerate boxes as at least 1 unit wide/high when sizing the margin.
        span_x = max(1.0, right - left)
        span_y = max(1.0, bottom - top)

        # Margin is 8% of the shorter side, but never less than 12 units.
        margin = max(12.0, 0.08 * min(span_x, span_y))

        # Grow the box by the margin, clamped to the page rectangle.
        page_rect = page.rect
        clip_coords = [
            max(0, left - margin),
            max(0, top - margin),
            min(page_rect.width, right + margin),
            min(page_rect.height, bottom + margin),
        ]

        out_dir = self.images_dir
        if out_dir is None:
            raise ValueError("images_dir must be initialized before extracting images")

        target = out_dir / f"q{q_num}_{element_type}.png"

        # Matrix(3, 3) scales the render by 3 in both directions.
        zoom = fitz.Matrix(3, 3)
        region = fitz.Rect(clip_coords)
        pixmap = page.get_pixmap(matrix=zoom, clip=region)
        pixmap.save(str(target))

        return target
Loading
Loading