diff --git a/content_gen/adapters/kit_extraction_adapter.py b/content_gen/adapters/kit_extraction_adapter.py index b73f9a6..fe2389c 100644 --- a/content_gen/adapters/kit_extraction_adapter.py +++ b/content_gen/adapters/kit_extraction_adapter.py @@ -1,6 +1,6 @@ from pathlib import Path import os -from typing import List, Optional, Callable +from typing import Dict, List, Optional, Callable from content_gen.adapters.base_extraction import BaseExtractionAdapter from content_gen.core.schemas import ProcessedQuestion from content_gen.scripts.extraction.pdf_extract_kit_wrapper import PDFExtractKitWrapper @@ -65,10 +65,18 @@ def extract_content( subj = (q_data.get("subject") or "").strip() if not subj: subj = self.default_subject + raw_opts = q_data.get("options") or {} + if not isinstance(raw_opts, dict): + raw_opts = {} + opts = {str(k): ("" if v is None else str(v)) for k, v in raw_opts.items()} + for k in ("A", "B", "C", "D"): + opts.setdefault(k, "") + q_text = q_data.get("question_text") + question_text = "" if q_text is None else str(q_text) questions.append(ProcessedQuestion( question_number=q_data.get("question_number", 0), - question_text=q_data.get("question_text", ""), - options=q_data.get("options", {}), + question_text=question_text, + options=opts, subject=subj, metadata={ "stem_images": stem_image_paths, diff --git a/content_gen/scripts/extraction/kit/page_processor.py b/content_gen/scripts/extraction/kit/page_processor.py index 949e241..d3eaf1a 100644 --- a/content_gen/scripts/extraction/kit/page_processor.py +++ b/content_gen/scripts/extraction/kit/page_processor.py @@ -19,7 +19,7 @@ class KitPageProcessorMixin: # --- cross-mixin method dependencies (provided by sibling mixins) --- @abstractmethod - def _clean_noise(self, text: str) -> str: ... + def _clean_noise(self, text: str | None) -> str: ... @abstractmethod def _reconstruct_line_text( @@ -70,8 +70,12 @@ def _process_page( if "lines" in block: for line in block["lines"]: for span in line["spans"]: - if not span["text"].strip(): + raw_t = span.get("text") + text = "" if raw_t is None else str(raw_t) + if not text.strip(): continue + if span.get("text") != text: + span = {**span, "text": text} all_spans.append(span) spans_by_question = {q: [] for q in questions} @@ -117,7 +121,7 @@ def _process_page( marker_indices = [] for i, span in enumerate(vline): - txt = span["text"].strip().rstrip(".") + txt = (span.get("text") or "").strip().rstrip(".") font = span["font"].lower() x = span["bbox"][0] known_cols = [70, 81, 170, 181, 270, 281, 370, 381] @@ -134,7 +138,7 @@ def _process_page( line_avg_baseline, line_main_size, ) - prefix_text = self._clean_noise(prefix_text) + prefix_text = (self._clean_noise(prefix_text) or "").strip() if prefix_text: if current_field == "question_text": questions[q_num]["question_text"] += " " + prefix_text @@ -153,15 +157,25 @@ def _process_page( line_avg_baseline, line_main_size, ) - opt_text = self._clean_noise(opt_text) + opt_text = (self._clean_noise(opt_text) or "").strip() questions[q_num]["options"][opt_letter] += " " + opt_text current_field = opt_letter else: line_text = self._reconstruct_line_text( vline, line_avg_baseline, line_main_size ) - line_text = self._clean_noise(line_text) + line_text = (self._clean_noise(line_text) or "").strip() if line_text: + m_opt = re.match( + r"(?i)^\s*([A-D])[\.\):]\s*(.*)$", line_text + ) + if m_opt: + letter = m_opt.group(1).upper() + rest = (m_opt.group(2) or "").strip() + if letter in questions[q_num]["options"]: + questions[q_num]["options"][letter] += " " + rest + current_field = letter + continue if current_field == "question_text": if not questions[q_num]["question_text"]: line_text = re.sub(r"^\d+[\.\s]*", "", line_text) @@ -197,10 +211,10 @@ def _process_page( temp_img_path.unlink() for q in questions.values(): - q["question_text"] = q["question_text"].strip() + q["question_text"] = (q.get("question_text") or "").strip() q["question_text"] = re.sub(r"^(\d+[\.\s]*)+", "", q["question_text"]) for opt in q["options"]: - val = q["options"][opt].strip() + val = (q["options"].get(opt) or "").strip() val = re.sub(r"\s+[\d_]$", "", val) q["options"][opt] = val @@ -230,9 +244,9 @@ def _detect_question_numbers_with_positions(self, page: fitz.Page) -> List[tuple if "lines" in block: for i, line in enumerate(block["lines"]): line_text = " ".join( - span["text"].strip() + (span.get("text") or "").strip() for span in line["spans"] - if span["text"].strip() + if (span.get("text") or "").strip() ) line_text = line_text.strip() @@ -268,13 +282,13 @@ def _detect_question_numbers_with_positions(self, page: fitz.Page) -> List[tuple check_text = "" if i + 1 < len(block["lines"]): check_text = " ".join( - s["text"] for s in block["lines"][i + 1]["spans"] + (s.get("text") or "") for s in block["lines"][i + 1]["spans"] ).strip() elif block_idx + 1 < len(blocks): next_block = blocks[block_idx + 1] if "lines" in next_block and len(next_block["lines"]) > 0: check_text = " ".join( - s["text"] for s in next_block["lines"][0]["spans"] + (s.get("text") or "") for s in next_block["lines"][0]["spans"] ).strip() if len(check_text) > 3: diff --git a/content_gen/scripts/extraction/kit/question_builder.py b/content_gen/scripts/extraction/kit/question_builder.py index d60fed3..8dc26ae 100644 --- a/content_gen/scripts/extraction/kit/question_builder.py +++ b/content_gen/scripts/extraction/kit/question_builder.py @@ -30,6 +30,7 @@ def _merge_questions(self, questions: List[Dict]) -> List[Dict]: }, "stem_images": list(dict.fromkeys(q.get("stem_images", []) or [])), "option_images": q.get("option_images", {}) or {}, + "extraction_warnings": list(q.get("extraction_warnings") or []), } continue @@ -46,6 +47,13 @@ def _merge_questions(self, questions: List[Dict]) -> List[Dict]: existing = merged[num]["options"].get(opt, "") merged[num]["options"][opt] = f"{existing} {opt_text}".strip() + new_warn = q.get("extraction_warnings") or [] + if new_warn: + prev = merged[num].get("extraction_warnings") or [] + merged[num]["extraction_warnings"] = list( + dict.fromkeys([*prev, *new_warn]) + ) + merged[num]["stem_images"] = list( dict.fromkeys( merged[num]["stem_images"] + (q.get("stem_images", []) or []) diff --git a/content_gen/scripts/extraction/kit/text_utils.py b/content_gen/scripts/extraction/kit/text_utils.py index 3425559..5ffec33 100644 --- a/content_gen/scripts/extraction/kit/text_utils.py +++ b/content_gen/scripts/extraction/kit/text_utils.py @@ -11,8 +11,10 @@ class KitTextUtilsMixin: extraction_noise_patterns: List[str] outputs_dir: Optional[Path] base_name: Optional[str] - def _clean_noise(self, text: str) -> str: - """Filter global noise and map symbols from reconstructed text parts""" + def _clean_noise(self, text: str | None) -> str: + """Filter global noise and map symbols from reconstructed text parts.""" + if text is None: + text = "" symbol_map = { "\uf070": "Ο", "\uf061": "Ξ±", @@ -47,25 +49,37 @@ def _clean_noise(self, text: str) -> str: def _reconstruct_line_text( self, spans: List[Dict], avg_baseline: float, main_size: float ) -> str: - """Helper to reconstruct text with markup from a list of spans on one line""" + """Reconstruct one visual line from spans; insert spaces from PDF x-gaps between words.""" if not spans: return "" - parts = [] + pieces: List[tuple[str, float, float]] = [] for span in spans: - text = span["text"] + raw = span.get("text") + text = "" if raw is None else str(raw) size = span["size"] top = span["bbox"][1] + x0, x1 = float(span["bbox"][0]), float(span["bbox"][2]) if size < main_size * 0.9: if top < avg_baseline - 1: - parts.append(f"^{text}") + piece = f"^{text}" elif top > avg_baseline + 1: - parts.append(f"_{text}") + piece = f"_{text}" else: - parts.append(text) + piece = text else: - parts.append(text) - return "".join(parts).strip() + piece = text + pieces.append((piece, x0, x1)) + + out: List[str] = [] + gap_space_px = 1.25 + for i, (piece, x0, _x1) in enumerate(pieces): + if i > 0: + prev_x1 = pieces[i - 1][2] + if x0 - prev_x1 > gap_space_px: + out.append(" ") + out.append(piece) + return "".join(out).strip() def _generate_processed_text(self, output_data: Dict) -> None: """Generate the standard processed text file in data/outputs following prompts.py""" @@ -87,10 +101,14 @@ def _generate_processed_text(self, output_data: Dict) -> None: f"Question {q['question_number']}Question and Options in Text Format\n\n" ) - f.write(f"{q['question_text'].strip()}\n\n") + q_body = (q.get("question_text") or "").strip() + f.write(f"{q_body}\n\n") - opts = q["options"] - opt_str = f"A. {opts['A']} B. {opts['B']} C. {opts['C']} D. {opts['D']}" + opts = q.get("options") or {} + opt_str = ( + f"A. {opts.get('A') or ''} B. {opts.get('B') or ''} " + f"C. {opts.get('C') or ''} D. {opts.get('D') or ''}" + ) f.write(f"{opt_str.strip()}\n\n") f.write("Detailed Explanation of the Question and Right Answer\n\n") diff --git a/content_gen/scripts/extraction/kit/wrapper.py b/content_gen/scripts/extraction/kit/wrapper.py index 1e28306..6a36a7e 100644 --- a/content_gen/scripts/extraction/kit/wrapper.py +++ b/content_gen/scripts/extraction/kit/wrapper.py @@ -20,9 +20,9 @@ class PDFExtractKitWrapper( - KitPageProcessorMixin, KitTextUtilsMixin, KitImageUtilsMixin, + KitPageProcessorMixin, KitQuestionBuilderMixin, ): """ diff --git a/content_gen/tests/test_regression_guards.py b/content_gen/tests/test_regression_guards.py index 78f9c6f..3c16242 100644 --- a/content_gen/tests/test_regression_guards.py +++ b/content_gen/tests/test_regression_guards.py @@ -11,6 +11,9 @@ def _wrapper_without_init() -> PDFExtractKitWrapper: wrapper.min_question_number = 1 wrapper.max_question_number = 40 wrapper.question_detection_mode = "balanced" + wrapper.extraction_noise_patterns = [] + wrapper.outputs_dir = None + wrapper.base_name = None return wrapper @@ -106,6 +109,65 @@ def test_parse_response_returns_empty_for_multi_without_headers(): assert parsed == {} +def test_reconstruct_line_text_handles_none_span_text(): + wrapper = _wrapper_without_init() + spans = [ + {"text": None, "size": 12.0, "bbox": [0.0, 10.0, 10.0, 20.0]}, + {"text": "stem", "size": 12.0, "bbox": [12.0, 10.0, 40.0, 20.0]}, + ] + assert wrapper._reconstruct_line_text(spans, 15.0, 12.0) == "stem" + + +def test_clean_noise_accepts_none(): + wrapper = _wrapper_without_init() + assert wrapper._clean_noise(None) == "" + + +def test_generate_processed_text_tolerates_null_question_and_options(tmp_path: Path): + wrapper = _wrapper_without_init() + wrapper.outputs_dir = tmp_path + wrapper.base_name = "nullsafe" + wrapper._generate_processed_text( + { + "questions": [ + { + "question_number": 1, + "question_text": None, + "options": {"A": None, "B": "", "C": "", "D": ""}, + } + ] + } + ) + out = tmp_path / "nullsafe_processed.txt" + assert out.exists() + body = out.read_text(encoding="utf-8") + assert "Question 1" in body + assert "A." in body + + +def test_kit_adapter_coerces_null_question_text_and_options(tmp_path: Path): + adapter = KitExtractionAdapter.__new__(KitExtractionAdapter) + adapter.default_subject = "General" + adapter.wrapper = MagicMock() + adapter.wrapper.extract_questions.return_value = { + "questions": [ + { + "question_number": 1, + "question_text": None, + "options": {"A": None, "B": "beta"}, + "stem_images": [], + "option_images": {}, + } + ] + } + + result = adapter.extract_content(tmp_path / "source.pdf", tmp_path) + assert result[0].question_text == "" + assert result[0].options["A"] == "" + assert result[0].options["B"] == "beta" + assert result[0].options.get("C") == "" + + def test_validate_generated_content_flags_missing_sections(): generator = ContentGenerator(router=MagicMock()) bad = generator._validate_generated_content({ diff --git a/qc_viewer/routers/automation.py b/qc_viewer/routers/automation.py index 91a692e..2ebb9a4 100644 --- a/qc_viewer/routers/automation.py +++ b/qc_viewer/routers/automation.py @@ -276,9 +276,9 @@ async def get_metrics(): @router.get("/api/automate/config") async def get_config(): - import yaml from pathlib import Path + from content_gen.core.config_loader import ConfigLoader from qc_viewer.config import PROJECT_ROOT config_path = PROJECT_ROOT / "edmate_config.yaml" @@ -288,16 +288,18 @@ async def get_config(): model_routing: dict = {} kit_present = False - if config_path.exists(): - try: - with open(config_path, "r") as f: - data = yaml.safe_load(f) or {} - workspace_data = data.get("workspace", {}) or {} - budget_data = data.get("budget", {}) or {} - extraction_settings = data.get("extraction_settings", {}) or {} - model_routing = data.get("model_routing", {}) or {} - except Exception as e: - print(f"Error loading edmate_config.yaml: {e}") + try: + ec = ConfigLoader.load_config(config_path if config_path.exists() else None) + if hasattr(ec, "model_dump"): + merged = ec.model_dump(mode="json") + else: + merged = json.loads(ec.json()) # type: ignore[attr-defined] + workspace_data = merged.get("workspace") or {} + budget_data = merged.get("budget") or {} + extraction_settings = merged.get("extraction_settings") or {} + model_routing = merged.get("model_routing") or {} + except Exception as e: + print(f"Error loading validated edmate_config: {e}") kit_path = Path(PROJECT_ROOT) / "content_gen" / "tools" / "PDF-Extract-Kit" kit_present = kit_path.is_dir() and (kit_path / "pdf_extract_kit").is_dir() diff --git a/qc_viewer/services/automation_pipeline.py b/qc_viewer/services/automation_pipeline.py index 0833996..8b12686 100644 --- a/qc_viewer/services/automation_pipeline.py +++ b/qc_viewer/services/automation_pipeline.py @@ -1,6 +1,5 @@ import asyncio import base64 -import json import os import re import threading @@ -13,10 +12,10 @@ from content_gen.core.config_schema import DetectionMode from content_gen.core.pedagogy_engine import PedagogyEngine from content_gen.scripts.pipeline.pipeline_orchestrator import PipelineOrchestrator +from qc_viewer.services.draft_store import read_modify_write_json CANCELLATION_EVENTS: dict[str, threading.Event] = {} -METADATA_LOCK = threading.Lock() def _normalize_model_id(model_id: str, provider: Optional[str]) -> str: @@ -178,10 +177,7 @@ def _update_progress(progress: int, message: str, processed_count: Optional[int] raise InterruptedError(f"Task {draft_id} was cancelled by user.") try: - with METADATA_LOCK: - with open(meta_path, "r") as f: - meta = json.load(f) - + def _mut(meta: dict) -> None: update_data = { "progress": progress, "status_message": message, @@ -191,11 +187,9 @@ def _update_progress(progress: int, message: str, processed_count: Optional[int] update_data["processed_count"] = processed_count if total_count is not None: update_data["total_count"] = total_count - meta.update(update_data) - - with open(meta_path, "w") as f: - json.dump(meta, f) + + read_modify_write_json(meta_path, _mut) except Exception as e: print(f"Error updating progress: {e}") @@ -312,6 +306,13 @@ def _update_progress(progress: int, message: str, processed_count: Optional[int] if clean_core_concept.lower() in explanation_text.lower() and len(clean_core_concept) > len(explanation_text) * 0.8: clean_core_concept = "Concept extracted from explanation." + opts_map = q.options if isinstance(q.options, dict) else {} + opt_vals = [str(opts_map.get(k, "") or "").strip() for k in ("A", "B", "C", "D")] + non_empty_opts = sum(1 for v in opt_vals if v) + extraction_warnings = ( + ["mcq_options_missing"] if non_empty_opts < 2 else [] + ) + legacy_q = { "question_number": q.question_number, "text": q.question_text, @@ -328,6 +329,8 @@ def _update_progress(progress: int, message: str, processed_count: Optional[int] "quality_report": quality_report, "contract_warnings": [k for k, v in quality_report.items() if v is False], } + if extraction_warnings: + legacy_q["extraction_warnings"] = extraction_warnings questions_payload.append(legacy_q) t_normalization_end = time.time() @@ -342,53 +345,56 @@ def _update_progress(progress: int, message: str, processed_count: Optional[int] } } - with open(meta_path, "r") as f: - final_meta = json.load(f) - - final_meta.update( - { - "questions": questions_payload, - "status": "PROCESSED", - "progress": 100, - "processed_count": total_questions, - "total_count": total_questions, - "status_message": "Generation complete!", - "id": draft_id, - "subject": subject, - "paper_code": paper_code, - "pedagogy_profile": pedagogy.get_profile_summary(), - "resolved_model_override": resolved_model_override, - "completed_at": datetime.now().isoformat(), - "telemetry": telemetry, - } - ) + def _finalize(meta: dict) -> None: + meta.update( + { + "questions": questions_payload, + "status": "PROCESSED", + "progress": 100, + "processed_count": total_questions, + "total_count": total_questions, + "status_message": "Generation complete!", + "id": draft_id, + "subject": subject, + "paper_code": paper_code, + "pedagogy_profile": pedagogy.get_profile_summary(), + "resolved_model_override": resolved_model_override, + "completed_at": datetime.now().isoformat(), + "telemetry": telemetry, + } + ) - with open(meta_path, "w") as f: - json.dump(final_meta, f) + read_modify_write_json(meta_path, _finalize) except InterruptedError: print(f"Task {draft_id} cancelled.") - _update_progress(0, "Processing stopped by user.") - with open(meta_path, "r") as f: - meta = json.load(f) - meta["status"] = "FAILED" - meta["status_message"] = "Stopped by user" - with open(meta_path, "w") as f: - json.dump(meta, f) + try: + read_modify_write_json( + meta_path, + lambda m: m.update( + { + "progress": 0, + "status_message": "Processing stopped by user.", + "status": "FAILED", + } + ), + ) + except Exception as inner_e: + print(f"Failed to update metadata after cancel: {inner_e}") except Exception as e: print(f"Background Processing Error: {e}") try: - with open(meta_path, "r") as f: - fail_meta = json.load(f) - fail_meta.update( - { - "status": "FAILED", - "error": str(e), - "progress": 0, - "status_message": f"Error: {str(e)}", - } - ) - with open(meta_path, "w") as f: - json.dump(fail_meta, f) + + def _fail(meta: dict) -> None: + meta.update( + { + "status": "FAILED", + "error": str(e), + "progress": 0, + "status_message": f"Error: {str(e)}", + } + ) + + read_modify_write_json(meta_path, _fail) except Exception as inner_e: print(f"Failed to update metadata with error: {inner_e}") diff --git a/qc_viewer/services/draft_store.py b/qc_viewer/services/draft_store.py index 94b8cd1..c18a3e4 100644 --- a/qc_viewer/services/draft_store.py +++ b/qc_viewer/services/draft_store.py @@ -1,8 +1,9 @@ import json +import os import shutil from datetime import datetime from pathlib import Path -from typing import Any, Optional +from typing import Any, Callable, Optional import threading from fastapi import HTTPException @@ -50,6 +51,24 @@ def write_json(path: Path, payload: dict[str, Any]) -> None: json.dump(payload, f) +def read_modify_write_json(path: Path, mutator: Callable[[dict[str, Any]], None]) -> None: + """ + Read JSON, apply mutator in-place, then atomically replace the file. + Holds METADATA_LOCK for the whole operation so callers never read a torn write. + """ + with METADATA_LOCK: + if path.exists(): + with open(path, "r", encoding="utf-8") as f: + data: dict[str, Any] = json.load(f) + else: + data = {} + mutator(data) + tmp = path.with_suffix(path.suffix + ".tmp") + with open(tmp, "w", encoding="utf-8") as f: + json.dump(data, f) + os.replace(tmp, path) + + def list_draft_metadata() -> list[dict[str, Any]]: drafts: list[dict[str, Any]] = [] if not DRAFTS_ROOT.exists(): diff --git a/qc_viewer/static/automate.html b/qc_viewer/static/automate.html index db90308..3879ea4 100644 --- a/qc_viewer/static/automate.html +++ b/qc_viewer/static/automate.html @@ -64,9 +64,9 @@