Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 11 additions & 3 deletions content_gen/adapters/kit_extraction_adapter.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from pathlib import Path
import os
from typing import List, Optional, Callable
from typing import Dict, List, Optional, Callable
from content_gen.adapters.base_extraction import BaseExtractionAdapter
from content_gen.core.schemas import ProcessedQuestion
from content_gen.scripts.extraction.pdf_extract_kit_wrapper import PDFExtractKitWrapper
Expand Down Expand Up @@ -65,10 +65,18 @@ def extract_content(
subj = (q_data.get("subject") or "").strip()
if not subj:
subj = self.default_subject
raw_opts = q_data.get("options") or {}
if not isinstance(raw_opts, dict):
raw_opts = {}
opts = {str(k): ("" if v is None else str(v)) for k, v in raw_opts.items()}
for k in ("A", "B", "C", "D"):
opts.setdefault(k, "")
q_text = q_data.get("question_text")
question_text = "" if q_text is None else str(q_text)
questions.append(ProcessedQuestion(
question_number=q_data.get("question_number", 0),
question_text=q_data.get("question_text", ""),
options=q_data.get("options", {}),
question_text=question_text,
options=opts,
subject=subj,
metadata={
"stem_images": stem_image_paths,
Expand Down
38 changes: 26 additions & 12 deletions content_gen/scripts/extraction/kit/page_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ class KitPageProcessorMixin:

# --- cross-mixin method dependencies (provided by sibling mixins) ---
@abstractmethod
def _clean_noise(self, text: str) -> str: ...
def _clean_noise(self, text: str | None) -> str: ...

@abstractmethod
def _reconstruct_line_text(
Expand Down Expand Up @@ -70,8 +70,12 @@ def _process_page(
if "lines" in block:
for line in block["lines"]:
for span in line["spans"]:
if not span["text"].strip():
raw_t = span.get("text")
text = "" if raw_t is None else str(raw_t)
if not text.strip():
continue
if span.get("text") != text:
span = {**span, "text": text}
all_spans.append(span)

spans_by_question = {q: [] for q in questions}
Expand Down Expand Up @@ -117,7 +121,7 @@ def _process_page(

marker_indices = []
for i, span in enumerate(vline):
txt = span["text"].strip().rstrip(".")
txt = (span.get("text") or "").strip().rstrip(".")
font = span["font"].lower()
x = span["bbox"][0]
known_cols = [70, 81, 170, 181, 270, 281, 370, 381]
Expand All @@ -134,7 +138,7 @@ def _process_page(
line_avg_baseline,
line_main_size,
)
prefix_text = self._clean_noise(prefix_text)
prefix_text = (self._clean_noise(prefix_text) or "").strip()
if prefix_text:
if current_field == "question_text":
questions[q_num]["question_text"] += " " + prefix_text
Expand All @@ -153,15 +157,25 @@ def _process_page(
line_avg_baseline,
line_main_size,
)
opt_text = self._clean_noise(opt_text)
opt_text = (self._clean_noise(opt_text) or "").strip()
questions[q_num]["options"][opt_letter] += " " + opt_text
current_field = opt_letter
else:
line_text = self._reconstruct_line_text(
vline, line_avg_baseline, line_main_size
)
line_text = self._clean_noise(line_text)
line_text = (self._clean_noise(line_text) or "").strip()
if line_text:
m_opt = re.match(
r"(?i)^\s*([A-D])[\.\):]\s*(.*)$", line_text
)
if m_opt:
letter = m_opt.group(1).upper()
rest = (m_opt.group(2) or "").strip()
if letter in questions[q_num]["options"]:
questions[q_num]["options"][letter] += " " + rest
current_field = letter
continue
if current_field == "question_text":
if not questions[q_num]["question_text"]:
line_text = re.sub(r"^\d+[\.\s]*", "", line_text)
Expand Down Expand Up @@ -197,10 +211,10 @@ def _process_page(
temp_img_path.unlink()

for q in questions.values():
q["question_text"] = q["question_text"].strip()
q["question_text"] = (q.get("question_text") or "").strip()
q["question_text"] = re.sub(r"^(\d+[\.\s]*)+", "", q["question_text"])
for opt in q["options"]:
val = q["options"][opt].strip()
val = (q["options"].get(opt) or "").strip()
val = re.sub(r"\s+[\d_]$", "", val)
q["options"][opt] = val

Expand Down Expand Up @@ -230,9 +244,9 @@ def _detect_question_numbers_with_positions(self, page: fitz.Page) -> List[tuple
if "lines" in block:
for i, line in enumerate(block["lines"]):
line_text = " ".join(
span["text"].strip()
(span.get("text") or "").strip()
for span in line["spans"]
if span["text"].strip()
if (span.get("text") or "").strip()
)
line_text = line_text.strip()

Expand Down Expand Up @@ -268,13 +282,13 @@ def _detect_question_numbers_with_positions(self, page: fitz.Page) -> List[tuple
check_text = ""
if i + 1 < len(block["lines"]):
check_text = " ".join(
s["text"] for s in block["lines"][i + 1]["spans"]
(s.get("text") or "") for s in block["lines"][i + 1]["spans"]
).strip()
elif block_idx + 1 < len(blocks):
next_block = blocks[block_idx + 1]
if "lines" in next_block and len(next_block["lines"]) > 0:
check_text = " ".join(
s["text"] for s in next_block["lines"][0]["spans"]
(s.get("text") or "") for s in next_block["lines"][0]["spans"]
).strip()

if len(check_text) > 3:
Expand Down
8 changes: 8 additions & 0 deletions content_gen/scripts/extraction/kit/question_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ def _merge_questions(self, questions: List[Dict]) -> List[Dict]:
},
"stem_images": list(dict.fromkeys(q.get("stem_images", []) or [])),
"option_images": q.get("option_images", {}) or {},
"extraction_warnings": list(q.get("extraction_warnings") or []),
}
continue

Expand All @@ -46,6 +47,13 @@ def _merge_questions(self, questions: List[Dict]) -> List[Dict]:
existing = merged[num]["options"].get(opt, "")
merged[num]["options"][opt] = f"{existing} {opt_text}".strip()

new_warn = q.get("extraction_warnings") or []
if new_warn:
prev = merged[num].get("extraction_warnings") or []
merged[num]["extraction_warnings"] = list(
dict.fromkeys([*prev, *new_warn])
)

merged[num]["stem_images"] = list(
dict.fromkeys(
merged[num]["stem_images"] + (q.get("stem_images", []) or [])
Expand Down
44 changes: 31 additions & 13 deletions content_gen/scripts/extraction/kit/text_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,10 @@ class KitTextUtilsMixin:
extraction_noise_patterns: List[str]
outputs_dir: Optional[Path]
base_name: Optional[str]
def _clean_noise(self, text: str) -> str:
"""Filter global noise and map symbols from reconstructed text parts"""
def _clean_noise(self, text: str | None) -> str:
"""Filter global noise and map symbols from reconstructed text parts."""
if text is None:
text = ""
symbol_map = {
"\uf070": "π",
"\uf061": "α",
Expand Down Expand Up @@ -47,25 +49,37 @@ def _clean_noise(self, text: str) -> str:
def _reconstruct_line_text(
self, spans: List[Dict], avg_baseline: float, main_size: float
) -> str:
"""Helper to reconstruct text with markup from a list of spans on one line"""
"""Reconstruct one visual line from spans; insert spaces from PDF x-gaps between words."""
if not spans:
return ""
parts = []
pieces: List[tuple[str, float, float]] = []
for span in spans:
text = span["text"]
raw = span.get("text")
text = "" if raw is None else str(raw)
size = span["size"]
top = span["bbox"][1]
x0, x1 = float(span["bbox"][0]), float(span["bbox"][2])

if size < main_size * 0.9:
if top < avg_baseline - 1:
parts.append(f"^{text}")
piece = f"^{text}"
elif top > avg_baseline + 1:
parts.append(f"_{text}")
piece = f"_{text}"
else:
parts.append(text)
piece = text
else:
parts.append(text)
return "".join(parts).strip()
piece = text
pieces.append((piece, x0, x1))

out: List[str] = []
gap_space_px = 1.25
for i, (piece, x0, _x1) in enumerate(pieces):
if i > 0:
prev_x1 = pieces[i - 1][2]
if x0 - prev_x1 > gap_space_px:
out.append(" ")
out.append(piece)
return "".join(out).strip()

def _generate_processed_text(self, output_data: Dict) -> None:
"""Generate the standard processed text file in data/outputs following prompts.py"""
Expand All @@ -87,10 +101,14 @@ def _generate_processed_text(self, output_data: Dict) -> None:
f"Question {q['question_number']}Question and Options in Text Format\n\n"
)

f.write(f"{q['question_text'].strip()}\n\n")
q_body = (q.get("question_text") or "").strip()
f.write(f"{q_body}\n\n")

opts = q["options"]
opt_str = f"A. {opts['A']} B. {opts['B']} C. {opts['C']} D. {opts['D']}"
opts = q.get("options") or {}
opt_str = (
f"A. {opts.get('A') or ''} B. {opts.get('B') or ''} "
f"C. {opts.get('C') or ''} D. {opts.get('D') or ''}"
)
f.write(f"{opt_str.strip()}\n\n")

f.write("Detailed Explanation of the Question and Right Answer\n\n")
Expand Down
2 changes: 1 addition & 1 deletion content_gen/scripts/extraction/kit/wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,9 @@


class PDFExtractKitWrapper(
KitPageProcessorMixin,
KitTextUtilsMixin,
KitImageUtilsMixin,
KitPageProcessorMixin,
KitQuestionBuilderMixin,
):
"""
Expand Down
62 changes: 62 additions & 0 deletions content_gen/tests/test_regression_guards.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@ def _wrapper_without_init() -> PDFExtractKitWrapper:
wrapper.min_question_number = 1
wrapper.max_question_number = 40
wrapper.question_detection_mode = "balanced"
wrapper.extraction_noise_patterns = []
wrapper.outputs_dir = None
wrapper.base_name = None
return wrapper


Expand Down Expand Up @@ -106,6 +109,65 @@ def test_parse_response_returns_empty_for_multi_without_headers():
assert parsed == {}


def test_reconstruct_line_text_handles_none_span_text():
    """A span whose text is None must not affect the reconstructed line."""
    null_span = {"text": None, "size": 12.0, "bbox": [0.0, 10.0, 10.0, 20.0]}
    word_span = {"text": "stem", "size": 12.0, "bbox": [12.0, 10.0, 40.0, 20.0]}

    kit = _wrapper_without_init()
    rebuilt = kit._reconstruct_line_text([null_span, word_span], 15.0, 12.0)

    assert rebuilt == "stem"


def test_clean_noise_accepts_none():
    """Passing None to _clean_noise returns an empty string."""
    cleaned = _wrapper_without_init()._clean_noise(None)
    assert cleaned == ""


def test_generate_processed_text_tolerates_null_question_and_options(tmp_path: Path):
    """A None question text and a None option still produce the processed file."""
    null_question = {
        "question_number": 1,
        "question_text": None,
        "options": {"A": None, "B": "", "C": "", "D": ""},
    }

    kit = _wrapper_without_init()
    kit.outputs_dir = tmp_path
    kit.base_name = "nullsafe"
    kit._generate_processed_text({"questions": [null_question]})

    report = tmp_path / "nullsafe_processed.txt"
    assert report.exists()

    contents = report.read_text(encoding="utf-8")
    for expected in ("Question 1", "A."):
        assert expected in contents


def test_kit_adapter_coerces_null_question_text_and_options(tmp_path: Path):
    """The adapter returns "" for a None question text, a None option and an absent option."""
    extracted_question = {
        "question_number": 1,
        "question_text": None,
        "options": {"A": None, "B": "beta"},
        "stem_images": [],
        "option_images": {},
    }
    fake_wrapper = MagicMock()
    fake_wrapper.extract_questions.return_value = {
        "questions": [extracted_question]
    }

    # Bypass __init__ and set only the two attributes this test assigns.
    adapter = KitExtractionAdapter.__new__(KitExtractionAdapter)
    adapter.default_subject = "General"
    adapter.wrapper = fake_wrapper

    processed = adapter.extract_content(tmp_path / "source.pdf", tmp_path)
    first = processed[0]

    assert first.question_text == ""
    assert first.options["A"] == ""
    assert first.options["B"] == "beta"
    assert first.options.get("C") == ""


def test_validate_generated_content_flags_missing_sections():
generator = ContentGenerator(router=MagicMock())
bad = generator._validate_generated_content({
Expand Down
24 changes: 13 additions & 11 deletions qc_viewer/routers/automation.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,9 +276,9 @@ async def get_metrics():

@router.get("/api/automate/config")
async def get_config():
import yaml
from pathlib import Path

from content_gen.core.config_loader import ConfigLoader
from qc_viewer.config import PROJECT_ROOT

config_path = PROJECT_ROOT / "edmate_config.yaml"
Expand All @@ -288,16 +288,18 @@ async def get_config():
model_routing: dict = {}
kit_present = False

if config_path.exists():
try:
with open(config_path, "r") as f:
data = yaml.safe_load(f) or {}
workspace_data = data.get("workspace", {}) or {}
budget_data = data.get("budget", {}) or {}
extraction_settings = data.get("extraction_settings", {}) or {}
model_routing = data.get("model_routing", {}) or {}
except Exception as e:
print(f"Error loading edmate_config.yaml: {e}")
try:
ec = ConfigLoader.load_config(config_path if config_path.exists() else None)
if hasattr(ec, "model_dump"):
merged = ec.model_dump(mode="json")
else:
merged = json.loads(ec.json()) # type: ignore[attr-defined]
workspace_data = merged.get("workspace") or {}
budget_data = merged.get("budget") or {}
extraction_settings = merged.get("extraction_settings") or {}
model_routing = merged.get("model_routing") or {}
except Exception as e:
print(f"Error loading validated edmate_config: {e}")

kit_path = Path(PROJECT_ROOT) / "content_gen" / "tools" / "PDF-Extract-Kit"
kit_present = kit_path.is_dir() and (kit_path / "pdf_extract_kit").is_dir()
Expand Down
Loading
Loading