Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
145 changes: 141 additions & 4 deletions tale/json_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,149 @@
import re
from typing import Any, Optional


# Names that identify LLM "reasoning" sections, used both as XML/bracket tag
# names (<think>...</think>, [cot]...[/cot]) and as fenced-code-block info
# strings (```thinking ... ```).  Lookups normalize the candidate to
# lowercase with "-" replaced by "_" first, so every entry here must be
# lowercase snake_case.
REASONING_TAG_NAMES = (
    "think",
    "thinking",
    "thought",
    "thoughts",
    "reasoning",
    "analysis",
    "scratchpad",
    "reflection",
    "inner_monologue",
    "chain_of_thought",
    "cot",
)


def _skip_quoted_segment(s: str, start: int) -> int:
"""Skip over a quoted string, preserving escaped quotes."""
quote = s[start]
i = start + 1
escaped = False

while i < len(s):
ch = s[i]
if escaped:
escaped = False
elif ch == "\\":
escaped = True
elif ch == quote:
return i + 1
i += 1

return len(s)


def _find_reasoning_fence_end(s: str, start: int) -> Optional[int]:
    """Find the end of a fenced reasoning block such as ```thinking ... ```.

    Returns the index just past the closing fence (``len(s)`` when the
    fence is never closed), or ``None`` when ``start`` is not the opening
    of a reasoning fence.
    """
    if s[start:start + 3] != "```":
        return None

    newline_at = s.find("\n", start)
    header_end = len(s) if newline_at == -1 else newline_at

    # The info string after ``` names the fence; normalize it the same way
    # as the entries in REASONING_TAG_NAMES before the membership check.
    label = s[start + 3:header_end].strip().lower().replace("-", "_")
    if label not in REASONING_TAG_NAMES:
        return None

    closing_at = s.find("```", header_end)
    if closing_at == -1:
        return len(s)
    return closing_at + 3


def _find_reasoning_tag_end(s: str, start: int) -> Optional[int]:
    """Find the end of a tagged reasoning block such as <think>...</think>.

    Supports XML-style markers (``<think ...>...</think>``) and
    bracket-style markers (``[think]...[/think]``), case-insensitively.
    Returns the index just past the closing marker (``len(s)`` when the
    block is never closed), or ``None`` when no reasoning tag opens at
    ``start``.
    """
    tail = s[start:]

    for name in REASONING_TAG_NAMES:
        quoted = re.escape(name)
        # XML-style first, then bracket-style — same precedence per tag.
        pattern_pairs = (
            (rf"<\s*{quoted}(?:\s+[^>]*)?>", rf"</\s*{quoted}\s*>"),
            (rf"\[\s*{quoted}\s*\]", rf"\[/\s*{quoted}\s*\]"),
        )
        for open_pattern, close_pattern in pattern_pairs:
            opened = re.match(open_pattern, tail, flags=re.IGNORECASE)
            if opened is None:
                continue
            closed = re.search(close_pattern, tail[opened.end():], flags=re.IGNORECASE)
            if closed is None:
                return len(s)
            return start + opened.end() + closed.end()

    return None


def strip_reasoning_sections(s: str) -> str:
    """Remove common reasoning sections emitted by LLMs while preserving JSON string values."""
    if not s:
        return ""

    kept_parts = []
    pos = 0
    total = len(s)

    while pos < total:
        ch = s[pos]

        # Quoted material is payload, never reasoning: copy it verbatim so
        # tag-like text inside JSON string values survives untouched.
        if ch == '"' or ch == "'":
            segment_end = _skip_quoted_segment(s, pos)
            kept_parts.append(s[pos:segment_end])
            pos = segment_end
            continue

        # Try a fenced reasoning block first, then a tagged one.
        skip_to = _find_reasoning_fence_end(s, pos)
        if skip_to is None:
            skip_to = _find_reasoning_tag_end(s, pos)
        if skip_to is not None:
            pos = skip_to
            continue

        kept_parts.append(ch)
        pos += 1

    # Collapse the blank-line gaps that removed sections leave behind.
    return re.sub(r"\n{3,}", "\n\n", "".join(kept_parts)).strip()

def strip_markdown_fences(s: str) -> str:
    """Remove ```json fences or plain ``` fences."""
    trimmed = s.strip()
    # A fence is either an opening ``` (optionally tagged "json") at the
    # start of a line, or a bare ``` at the end of a line.
    without_fences = re.sub(r'^```(?:json)?|```$', '', trimmed, flags=re.MULTILINE)
    return without_fences.strip()

def extract_json_block(s: str) -> str:
    """Extract the first balanced JSON object or array from a larger string.

    Scans character by character, tracking string literals (including
    backslash escapes) so that braces and brackets inside JSON strings are
    ignored.  Returns the original string unchanged when no balanced block
    is found or when the nesting is malformed.

    NOTE: the earlier greedy-regex implementation that preceded this scanner
    has been removed — it returned on the first match, made the scanner
    unreachable, and mis-extracted when text after the JSON contained braces.
    """
    start = None           # index of the first opening brace/bracket
    expected_closers = []  # stack of closers still owed, innermost last
    in_string = False
    escaped = False

    for i, ch in enumerate(s):
        if in_string:
            # Inside a JSON string only an unescaped quote ends it;
            # structural characters in here must be ignored.
            if escaped:
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == '"':
                in_string = False
            continue

        if ch == '"':
            in_string = True
            continue

        if start is None:
            # Still looking for the opening of the first block.
            if ch == "{":
                start = i
                expected_closers.append("}")
            elif ch == "[":
                start = i
                expected_closers.append("]")
            continue

        if ch in "[{":
            expected_closers.append("}" if ch == "{" else "]")
        elif ch in "}]":
            if not expected_closers or ch != expected_closers[-1]:
                return s  # mismatched nesting: bail out unchanged
            expected_closers.pop()
            if not expected_closers:
                return s[start:i + 1]

    return s

def unwrap_double_encoded(s: str) -> str:
"""Unwrap when JSON is a quoted string containing JSON."""
Expand All @@ -35,9 +170,11 @@ def sanitize_json(s: str) -> str:
"""Apply a pipeline of cleanup steps before parsing."""
if not s:
return ""
s = strip_reasoning_sections(s)
s = strip_markdown_fences(s)
s = extract_json_block(s)
s = unwrap_double_encoded(s)
s = strip_reasoning_sections(s)
s = extract_json_block(s)
s = fix_trailing_commas(s)
s = normalize_literals(s)
return s.strip()
Expand Down
20 changes: 20 additions & 0 deletions tests/test_parse_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -375,6 +375,26 @@ def test_sanitize_json(self):
sanitized = json_util.safe_load(json_string)
assert sanitized['name'] == 'Whispering Woods'

def test_sanitize_json_strips_reasoning_tags(self):
    # A leading <think>...</think> section must be removed before parsing.
    raw = '<think>I should provide a forest description first.</think>{"name": "Whispering Woods", "mood": 5}'
    parsed = json_util.safe_load(raw)
    assert parsed['name'] == 'Whispering Woods'

def test_sanitize_json_strips_reasoning_fences(self):
    # A ```thinking fence is dropped while the ```json fence is unwrapped.
    raw = '```thinking\nI will now prepare structured output.\n```\n```json\n{"name": "Whispering Woods", "mood": 5}\n```'
    parsed = json_util.safe_load(raw)
    assert parsed['mood'] == 5

def test_sanitize_json_keeps_tag_like_string_values(self):
    # Reasoning-style tags inside JSON string values are payload, not
    # reasoning, and must survive sanitization untouched.
    raw = '{"name": "Whispering Woods", "note": "<think>this stays in content</think>"}'
    parsed = json_util.safe_load(raw)
    assert parsed['note'] == '<think>this stays in content</think>'

def test_sanitize_json_unwraps_double_encoded_reasoning(self):
    # Reasoning hidden inside a double-encoded JSON string must still be
    # stripped after the outer quoting is unwrapped.
    raw = '"<analysis>Internal notes</analysis>{\\"name\\": \\"Whispering Woods\\", \\"mood\\": 5}"'
    parsed = json_util.safe_load(raw)
    assert parsed['name'] == 'Whispering Woods'

def test_mood_string_from_int(self):
assert parse_utils.mood_string_from_int(5) == ' uttermost friendly'
assert parse_utils.mood_string_from_int(0) == ' neutral'
Expand Down
Loading