Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
145 changes: 141 additions & 4 deletions tale/json_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,149 @@
import re
from typing import Any, Optional


# Names that identify LLM "reasoning" sections, used both as XML/bracket tag
# names (<think>...</think>, [cot]...[/cot]) and as fenced-code-block info
# strings (```thinking ... ```).  Lookups normalize the candidate to
# lowercase with "-" replaced by "_" first, so every entry here must be
# lowercase snake_case.
REASONING_TAG_NAMES = (
    "think",
    "thinking",
    "thought",
    "thoughts",
    "reasoning",
    "analysis",
    "scratchpad",
    "reflection",
    "inner_monologue",
    "chain_of_thought",
    "cot",
)


def _skip_quoted_segment(s: str, start: int) -> int:
"""Skip over a quoted string, preserving escaped quotes."""
quote = s[start]
i = start + 1
escaped = False

while i < len(s):
ch = s[i]
if escaped:
escaped = False
elif ch == "\\":
escaped = True
elif ch == quote:
return i + 1
i += 1

return len(s)


def _find_reasoning_fence_end(s: str, start: int) -> Optional[int]:
    """Find the end of a fenced reasoning block such as ```thinking ... ```.

    Returns the index just past the closing fence (``len(s)`` when the
    fence is never closed), or ``None`` when ``start`` is not the opening
    of a reasoning fence.
    """
    if s[start:start + 3] != "```":
        return None

    newline_at = s.find("\n", start)
    header_end = len(s) if newline_at == -1 else newline_at

    # The info string after ``` names the fence; normalize it the same way
    # as the entries in REASONING_TAG_NAMES before the membership check.
    label = s[start + 3:header_end].strip().lower().replace("-", "_")
    if label not in REASONING_TAG_NAMES:
        return None

    closing_at = s.find("```", header_end)
    if closing_at == -1:
        return len(s)
    return closing_at + 3


def _find_reasoning_tag_end(s: str, start: int) -> Optional[int]:
    """Find the end of a tagged reasoning block such as <think>...</think>.

    Supports XML-style markers (``<think ...>...</think>``) and
    bracket-style markers (``[think]...[/think]``), case-insensitively.
    Returns the index just past the closing marker (``len(s)`` when the
    block is never closed), or ``None`` when no reasoning tag opens at
    ``start``.
    """
    tail = s[start:]

    for name in REASONING_TAG_NAMES:
        quoted = re.escape(name)
        # XML-style first, then bracket-style — same precedence per tag.
        pattern_pairs = (
            (rf"<\s*{quoted}(?:\s+[^>]*)?>", rf"</\s*{quoted}\s*>"),
            (rf"\[\s*{quoted}\s*\]", rf"\[/\s*{quoted}\s*\]"),
        )
        for open_pattern, close_pattern in pattern_pairs:
            opened = re.match(open_pattern, tail, flags=re.IGNORECASE)
            if opened is None:
                continue
            closed = re.search(close_pattern, tail[opened.end():], flags=re.IGNORECASE)
            if closed is None:
                return len(s)
            return start + opened.end() + closed.end()

    return None


def strip_reasoning_sections(s: str) -> str:
    """Remove common reasoning sections emitted by LLMs while preserving JSON string values."""
    if not s:
        return ""

    kept_parts = []
    pos = 0
    total = len(s)

    while pos < total:
        ch = s[pos]

        # Quoted material is payload, never reasoning: copy it verbatim so
        # tag-like text inside JSON string values survives untouched.
        if ch == '"' or ch == "'":
            segment_end = _skip_quoted_segment(s, pos)
            kept_parts.append(s[pos:segment_end])
            pos = segment_end
            continue

        # Try a fenced reasoning block first, then a tagged one.
        skip_to = _find_reasoning_fence_end(s, pos)
        if skip_to is None:
            skip_to = _find_reasoning_tag_end(s, pos)
        if skip_to is not None:
            pos = skip_to
            continue

        kept_parts.append(ch)
        pos += 1

    # Collapse the blank-line gaps that removed sections leave behind.
    return re.sub(r"\n{3,}", "\n\n", "".join(kept_parts)).strip()

def strip_markdown_fences(s: str) -> str:
    """Remove ```json fences or plain ``` fences."""
    trimmed = s.strip()
    # A fence is either an opening ``` (optionally tagged "json") at the
    # start of a line, or a bare ``` at the end of a line.
    without_fences = re.sub(r'^```(?:json)?|```$', '', trimmed, flags=re.MULTILINE)
    return without_fences.strip()

def extract_json_block(s: str) -> str:
    """Extract the first balanced JSON object or array from a larger string.

    Scans character by character, tracking string literals (including
    backslash escapes) so that braces and brackets inside JSON strings are
    ignored.  Returns the original string unchanged when no balanced block
    is found or when the nesting is malformed.

    NOTE: the earlier greedy-regex implementation that preceded this scanner
    has been removed — it returned on the first match, made the scanner
    unreachable, and mis-extracted when text after the JSON contained braces.
    """
    start = None           # index of the first opening brace/bracket
    expected_closers = []  # stack of closers still owed, innermost last
    in_string = False
    escaped = False

    for i, ch in enumerate(s):
        if in_string:
            # Inside a JSON string only an unescaped quote ends it;
            # structural characters in here must be ignored.
            if escaped:
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == '"':
                in_string = False
            continue

        if ch == '"':
            in_string = True
            continue

        if start is None:
            # Still looking for the opening of the first block.
            if ch == "{":
                start = i
                expected_closers.append("}")
            elif ch == "[":
                start = i
                expected_closers.append("]")
            continue

        if ch in "[{":
            expected_closers.append("}" if ch == "{" else "]")
        elif ch in "}]":
            if not expected_closers or ch != expected_closers[-1]:
                return s  # mismatched nesting: bail out unchanged
            expected_closers.pop()
            if not expected_closers:
                return s[start:i + 1]

    return s

def unwrap_double_encoded(s: str) -> str:
"""Unwrap when JSON is a quoted string containing JSON."""
Expand All @@ -35,9 +170,11 @@ def sanitize_json(s: str) -> str:
"""Apply a pipeline of cleanup steps before parsing."""
if not s:
return ""
s = strip_reasoning_sections(s)
s = strip_markdown_fences(s)
s = extract_json_block(s)
s = unwrap_double_encoded(s)
s = strip_reasoning_sections(s)
s = extract_json_block(s)
s = fix_trailing_commas(s)
s = normalize_literals(s)
return s.strip()
Expand Down
20 changes: 20 additions & 0 deletions tests/test_parse_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -375,6 +375,26 @@ def test_sanitize_json(self):
sanitized = json_util.safe_load(json_string)
assert sanitized['name'] == 'Whispering Woods'

def test_sanitize_json_strips_reasoning_tags(self):
    # A leading <think>...</think> section must be removed before parsing.
    raw = '<think>I should provide a forest description first.</think>{"name": "Whispering Woods", "mood": 5}'
    parsed = json_util.safe_load(raw)
    assert parsed['name'] == 'Whispering Woods'

def test_sanitize_json_strips_reasoning_fences(self):
    # A ```thinking fence is dropped while the ```json fence is unwrapped.
    raw = '```thinking\nI will now prepare structured output.\n```\n```json\n{"name": "Whispering Woods", "mood": 5}\n```'
    parsed = json_util.safe_load(raw)
    assert parsed['mood'] == 5

def test_sanitize_json_keeps_tag_like_string_values(self):
    # Reasoning-style tags inside JSON string values are payload, not
    # reasoning, and must survive sanitization untouched.
    raw = '{"name": "Whispering Woods", "note": "<think>this stays in content</think>"}'
    parsed = json_util.safe_load(raw)
    assert parsed['note'] == '<think>this stays in content</think>'

def test_sanitize_json_unwraps_double_encoded_reasoning(self):
    # Reasoning hidden inside a double-encoded JSON string must still be
    # stripped after the outer quoting is unwrapped.
    raw = '"<analysis>Internal notes</analysis>{\\"name\\": \\"Whispering Woods\\", \\"mood\\": 5}"'
    parsed = json_util.safe_load(raw)
    assert parsed['name'] == 'Whispering Woods'

def test_mood_string_from_int(self):
assert parse_utils.mood_string_from_int(5) == ' uttermost friendly'
assert parse_utils.mood_string_from_int(0) == ' neutral'
Expand Down
Loading