shmukit · shmukit · May 3, 2026 · May 3, 2026 · May 3, 2026 · May 3, 2026
diff --git a/content_gen/adapters/kit_extraction_adapter.py b/content_gen/adapters/kit_extraction_adapter.py
@@ -1,6 +1,6 @@
 from pathlib import Path
 import os
-from typing import List, Optional, Callable
+from typing import Dict, List, Optional, Callable
 from content_gen.adapters.base_extraction import BaseExtractionAdapter
 from content_gen.core.schemas import ProcessedQuestion
 from content_gen.scripts.extraction.pdf_extract_kit_wrapper import PDFExtractKitWrapper
@@ -65,10 +65,18 @@ def extract_content(
             subj = (q_data.get("subject") or "").strip()
             if not subj:
                 subj = self.default_subject
+            raw_opts = q_data.get("options") or {}
+            if not isinstance(raw_opts, dict):
+                raw_opts = {}
+            opts = {str(k): ("" if v is None else str(v)) for k, v in raw_opts.items()}
+            for k in ("A", "B", "C", "D"):
+                opts.setdefault(k, "")
+            q_text = q_data.get("question_text")
+            question_text = "" if q_text is None else str(q_text)
             questions.append(ProcessedQuestion(
                 question_number=q_data.get("question_number", 0),
-                question_text=q_data.get("question_text", ""),
-                options=q_data.get("options", {}),
+                question_text=question_text,
+                options=opts,
                 subject=subj,
                 metadata={
                     "stem_images": stem_image_paths,

diff --git a/content_gen/scripts/extraction/kit/page_processor.py b/content_gen/scripts/extraction/kit/page_processor.py
@@ -19,7 +19,7 @@ class KitPageProcessorMixin:
 
     # --- cross-mixin method dependencies (provided by sibling mixins) ---
     @abstractmethod
-    def _clean_noise(self, text: str) -> str: ...
+    def _clean_noise(self, text: str | None) -> str: ...  # None is accepted; KitTextUtilsMixin's version treats it as ""
 
     @abstractmethod
     def _reconstruct_line_text(
@@ -70,8 +70,12 @@ def _process_page(
             if "lines" in block:
                 for line in block["lines"]:
                     for span in line["spans"]:
-                        if not span["text"].strip():
+                        raw_t = span.get("text")
+                        text = "" if raw_t is None else str(raw_t)
+                        if not text.strip():
                             continue
+                        if span.get("text") != text:
+                            span = {**span, "text": text}
                         all_spans.append(span)
 
         spans_by_question = {q: [] for q in questions}
@@ -117,7 +121,7 @@ def _process_page(
 
                 marker_indices = []
                 for i, span in enumerate(vline):
-                    txt = span["text"].strip().rstrip(".")
+                    txt = (span.get("text") or "").strip().rstrip(".")
                     font = span["font"].lower()
                     x = span["bbox"][0]
                     known_cols = [70, 81, 170, 181, 270, 281, 370, 381]
@@ -134,7 +138,7 @@ def _process_page(
                             line_avg_baseline,
                             line_main_size,
                         )
-                        prefix_text = self._clean_noise(prefix_text)
+                        prefix_text = (self._clean_noise(prefix_text) or "").strip()
                         if prefix_text:
                             if current_field == "question_text":
                                 questions[q_num]["question_text"] += " " + prefix_text
@@ -153,15 +157,25 @@ def _process_page(
                             line_avg_baseline,
                             line_main_size,
                         )
-                        opt_text = self._clean_noise(opt_text)
+                        opt_text = (self._clean_noise(opt_text) or "").strip()
                         questions[q_num]["options"][opt_letter] += " " + opt_text
                         current_field = opt_letter
                 else:
                     line_text = self._reconstruct_line_text(
                         vline, line_avg_baseline, line_main_size
                     )
-                    line_text = self._clean_noise(line_text)
+                    line_text = (self._clean_noise(line_text) or "").strip()
                     if line_text:
+                        m_opt = re.match(
+                            r"(?i)^\s*([A-D])[\.\):]\s*(.*)$", line_text
+                        )
+                        if m_opt:
+                            letter = m_opt.group(1).upper()
+                            rest = (m_opt.group(2) or "").strip()
+                            if letter in questions[q_num]["options"]:
+                                questions[q_num]["options"][letter] += " " + rest
+                                current_field = letter
+                                continue
                         if current_field == "question_text":
                             if not questions[q_num]["question_text"]:
                                 line_text = re.sub(r"^\d+[\.\s]*", "", line_text)
@@ -197,10 +211,10 @@ def _process_page(
         temp_img_path.unlink()
 
         for q in questions.values():
-            q["question_text"] = q["question_text"].strip()
+            q["question_text"] = (q.get("question_text") or "").strip()
             q["question_text"] = re.sub(r"^(\d+[\.\s]*)+", "", q["question_text"])
             for opt in q["options"]:
-                val = q["options"][opt].strip()
+                val = (q["options"].get(opt) or "").strip()
                 val = re.sub(r"\s+[\d_]$", "", val)
                 q["options"][opt] = val
 
@@ -230,9 +244,9 @@ def _detect_question_numbers_with_positions(self, page: fitz.Page) -> List[tuple
             if "lines" in block:
                 for i, line in enumerate(block["lines"]):
                     line_text = " ".join(
-                        span["text"].strip()
+                        (span.get("text") or "").strip()
                         for span in line["spans"]
-                        if span["text"].strip()
+                        if (span.get("text") or "").strip()
                     )
                     line_text = line_text.strip()
 
@@ -268,13 +282,13 @@ def _detect_question_numbers_with_positions(self, page: fitz.Page) -> List[tuple
                             check_text = ""
                             if i + 1 < len(block["lines"]):
                                 check_text = " ".join(
-                                    s["text"] for s in block["lines"][i + 1]["spans"]
+                                    (s.get("text") or "") for s in block["lines"][i + 1]["spans"]
                                 ).strip()
                             elif block_idx + 1 < len(blocks):
                                 next_block = blocks[block_idx + 1]
                                 if "lines" in next_block and len(next_block["lines"]) > 0:
                                     check_text = " ".join(
-                                        s["text"] for s in next_block["lines"][0]["spans"]
+                                        (s.get("text") or "") for s in next_block["lines"][0]["spans"]
                                     ).strip()
 
                             if len(check_text) > 3:

diff --git a/content_gen/scripts/extraction/kit/question_builder.py b/content_gen/scripts/extraction/kit/question_builder.py
@@ -30,6 +30,7 @@ def _merge_questions(self, questions: List[Dict]) -> List[Dict]:
                     },
                     "stem_images": list(dict.fromkeys(q.get("stem_images", []) or [])),
                     "option_images": q.get("option_images", {}) or {},
+                    "extraction_warnings": list(q.get("extraction_warnings") or []),
                 }
                 continue
 
@@ -46,6 +47,13 @@ def _merge_questions(self, questions: List[Dict]) -> List[Dict]:
                 existing = merged[num]["options"].get(opt, "")
                 merged[num]["options"][opt] = f"{existing} {opt_text}".strip()
 
+            new_warn = q.get("extraction_warnings") or []
+            if new_warn:
+                prev = merged[num].get("extraction_warnings") or []
+                merged[num]["extraction_warnings"] = list(
+                    dict.fromkeys([*prev, *new_warn])
+                )
+
             merged[num]["stem_images"] = list(
                 dict.fromkeys(
                     merged[num]["stem_images"] + (q.get("stem_images", []) or [])

diff --git a/content_gen/scripts/extraction/kit/text_utils.py b/content_gen/scripts/extraction/kit/text_utils.py
@@ -11,8 +11,10 @@ class KitTextUtilsMixin:
     extraction_noise_patterns: List[str]
     outputs_dir: Optional[Path]
     base_name: Optional[str]
-    def _clean_noise(self, text: str) -> str:
-        """Filter global noise and map symbols from reconstructed text parts"""
+    def _clean_noise(self, text: str | None) -> str:
+        """Filter global noise and map symbols from reconstructed text parts."""
+        if text is None:
+            text = ""
         symbol_map = {
             "\uf070": "π",
             "\uf061": "α",
@@ -47,25 +49,37 @@ def _clean_noise(self, text: str) -> str:
     def _reconstruct_line_text(
         self, spans: List[Dict], avg_baseline: float, main_size: float
     ) -> str:
-        """Helper to reconstruct text with markup from a list of spans on one line"""
+        """Rebuild one visual line from its spans, adding ^/_ markers and spaces at wide x-gaps."""
         if not spans:
             return ""
-        parts = []
+        pieces: List[tuple[str, float, float]] = []  # (rendered text, bbox left x, bbox right x) per span
         for span in spans:
-            text = span["text"]
+            raw = span.get("text")
+            text = "" if raw is None else str(raw)  # a missing or None text contributes an empty string
             size = span["size"]
             top = span["bbox"][1]
+            x0, x1 = float(span["bbox"][0]), float(span["bbox"][2])  # horizontal extent of the span
 
             if size < main_size * 0.9:
                 if top < avg_baseline - 1:
-                    parts.append(f"^{text}")
+                    piece = f"^{text}"  # small span whose top lies above avg_baseline - 1: "^" prefix
                 elif top > avg_baseline + 1:
-                    parts.append(f"_{text}")
+                    piece = f"_{text}"  # small span whose top lies below avg_baseline + 1: "_" prefix
                 else:
-                    parts.append(text)
+                    piece = text
             else:
-                parts.append(text)
-        return "".join(parts).strip()
+                piece = text
+            pieces.append((piece, x0, x1))
+
+        out: List[str] = []
+        gap_space_px = 1.25  # min gap that becomes a space; NOTE(review): bbox units are presumably PDF points, not pixels - confirm
+        for i, (piece, x0, _x1) in enumerate(pieces):
+            if i > 0:
+                prev_x1 = pieces[i - 1][2]
+                if x0 - prev_x1 > gap_space_px:  # distance from the previous span's right edge to this span's left edge
+                    out.append(" ")
+            out.append(piece)
+        return "".join(out).strip()
 
     def _generate_processed_text(self, output_data: Dict) -> None:
         """Generate the standard processed text file in data/outputs following prompts.py"""
@@ -87,10 +101,14 @@ def _generate_processed_text(self, output_data: Dict) -> None:
                     f"Question {q['question_number']}Question and Options in Text Format\n\n"
                 )
 
-                f.write(f"{q['question_text'].strip()}\n\n")
+                q_body = (q.get("question_text") or "").strip()
+                f.write(f"{q_body}\n\n")
 
-                opts = q["options"]
-                opt_str = f"A. {opts['A']} B. {opts['B']} C. {opts['C']} D. {opts['D']}"
+                opts = q.get("options") or {}
+                opt_str = (
+                    f"A. {opts.get('A') or ''} B. {opts.get('B') or ''} "
+                    f"C. {opts.get('C') or ''} D. {opts.get('D') or ''}"
+                )
                 f.write(f"{opt_str.strip()}\n\n")
 
                 f.write("Detailed Explanation of the Question and Right Answer\n\n")

diff --git a/content_gen/scripts/extraction/kit/wrapper.py b/content_gen/scripts/extraction/kit/wrapper.py
@@ -20,9 +20,9 @@
 
 
 class PDFExtractKitWrapper(
-    KitPageProcessorMixin,
     KitTextUtilsMixin,
     KitImageUtilsMixin,
+    KitPageProcessorMixin,
     KitQuestionBuilderMixin,
 ):
     """

diff --git a/content_gen/tests/test_regression_guards.py b/content_gen/tests/test_regression_guards.py
@@ -11,6 +11,9 @@ def _wrapper_without_init() -> PDFExtractKitWrapper:
     wrapper.min_question_number = 1
     wrapper.max_question_number = 40
     wrapper.question_detection_mode = "balanced"
+    wrapper.extraction_noise_patterns = []
+    wrapper.outputs_dir = None
+    wrapper.base_name = None
     return wrapper
 
 
@@ -106,6 +109,65 @@ def test_parse_response_returns_empty_for_multi_without_headers():
     assert parsed == {}
 
 
+def test_reconstruct_line_text_handles_none_span_text():
+    wrapper = _wrapper_without_init()
+    spans = [
+        {"text": None, "size": 12.0, "bbox": [0.0, 10.0, 10.0, 20.0]},  # span with no text
+        {"text": "stem", "size": 12.0, "bbox": [12.0, 10.0, 40.0, 20.0]},
+    ]
+    assert wrapper._reconstruct_line_text(spans, 15.0, 12.0) == "stem"  # the space added for the 2-unit gap is stripped
+
+
+def test_clean_noise_accepts_none():
+    wrapper = _wrapper_without_init()
+    assert wrapper._clean_noise(None) == ""  # None input must yield "" instead of raising
+
+
+def test_generate_processed_text_tolerates_null_question_and_options(tmp_path: Path):
+    wrapper = _wrapper_without_init()
+    wrapper.outputs_dir = tmp_path
+    wrapper.base_name = "nullsafe"
+    wrapper._generate_processed_text(
+        {
+            "questions": [
+                {
+                    "question_number": 1,
+                    "question_text": None,  # null stem
+                    "options": {"A": None, "B": "", "C": "", "D": ""},  # null option value
+                }
+            ]
+        }
+    )
+    out = tmp_path / "nullsafe_processed.txt"
+    assert out.exists()
+    body = out.read_text(encoding="utf-8")
+    assert "Question 1" in body
+    assert "A." in body and "A. None" not in body  # a null option must render as empty, never as the text "None"
+
+
+def test_kit_adapter_coerces_null_question_text_and_options(tmp_path: Path):
+    adapter = KitExtractionAdapter.__new__(KitExtractionAdapter)  # skip __init__; set only what the test assigns below
+    adapter.default_subject = "General"
+    adapter.wrapper = MagicMock()
+    adapter.wrapper.extract_questions.return_value = {
+        "questions": [
+            {
+                "question_number": 1,
+                "question_text": None,
+                "options": {"A": None, "B": "beta"},  # "C" and "D" are absent on purpose
+                "stem_images": [],
+                "option_images": {},
+            }
+        ]
+    }
+
+    result = adapter.extract_content(tmp_path / "source.pdf", tmp_path)
+    assert result[0].question_text == ""
+    assert result[0].options["A"] == ""
+    assert result[0].options["B"] == "beta"
+    assert result[0].options.get("C") == ""  # missing option letters are back-filled with ""
+
+
+
 def test_validate_generated_content_flags_missing_sections():
     generator = ContentGenerator(router=MagicMock())
     bad = generator._validate_generated_content({

diff --git a/qc_viewer/routers/automation.py b/qc_viewer/routers/automation.py
@@ -276,9 +276,9 @@ async def get_metrics():
 
 @router.get("/api/automate/config")
 async def get_config():
-    import yaml
     from pathlib import Path
 
+    from content_gen.core.config_loader import ConfigLoader
     from qc_viewer.config import PROJECT_ROOT
 
     config_path = PROJECT_ROOT / "edmate_config.yaml"
@@ -288,16 +288,18 @@ async def get_config():
     model_routing: dict = {}
     kit_present = False
 
-    if config_path.exists():
-        try:
-            with open(config_path, "r") as f:
-                data = yaml.safe_load(f) or {}
-                workspace_data = data.get("workspace", {}) or {}
-                budget_data = data.get("budget", {}) or {}
-                extraction_settings = data.get("extraction_settings", {}) or {}
-                model_routing = data.get("model_routing", {}) or {}
-        except Exception as e:
-            print(f"Error loading edmate_config.yaml: {e}")
+    try:
+        ec = ConfigLoader.load_config(config_path if config_path.exists() else None)
+        if hasattr(ec, "model_dump"):
+            merged = ec.model_dump(mode="json")
+        else:
+            merged = json.loads(ec.json())  # type: ignore[attr-defined]
+        workspace_data = merged.get("workspace") or {}
+        budget_data = merged.get("budget") or {}
+        extraction_settings = merged.get("extraction_settings") or {}
+        model_routing = merged.get("model_routing") or {}
+    except Exception as e:
+        print(f"Error loading validated edmate_config: {e}")
 
     kit_path = Path(PROJECT_ROOT) / "content_gen" / "tools" / "PDF-Extract-Kit"
     kit_present = kit_path.is_dir() and (kit_path / "pdf_extract_kit").is_dir()