diff --git a/PinSub.py b/PinSub.py index 823d2c6..d5af808 100644 --- a/PinSub.py +++ b/PinSub.py @@ -26,10 +26,16 @@ """ import argparse +import hashlib +import http.client import json +import os +import platform import re +import socket import subprocess import sys +import time from dataclasses import dataclass, field from pathlib import Path @@ -74,10 +80,14 @@ def find_names_file(names_arg: Path | None, mkv_path: Path | None) -> Path | Non return None -def load_names(names_path: Path | None) -> tuple[dict[str, str], dict[str, str]]: - """Load (name_map, english_name_map) from a per-film JSON, or empty pair.""" +def load_names(names_path: Path | None) -> tuple[dict[str, str], dict[str, str], dict]: + """Load (name_map, english_name_map, per_film_glossary) from a per-film JSON. + + Returns empty containers if the file is missing. The per-film glossary, + when present, is a mapping {hanzi -> {"english": str, "context": str, "tags": [...]}}. + Underscore-prefixed keys are filtered out at each level.""" if names_path is None: - return {}, {} + return {}, {}, {} try: with names_path.open(encoding="utf-8") as f: data = json.load(f) @@ -85,7 +95,53 @@ def load_names(names_path: Path | None) -> tuple[dict[str, str], dict[str, str]] sys.exit(f"failed to load names file {names_path}: {e}") name_map = {k: v for k, v in (data.get("name_map") or {}).items() if not k.startswith("_")} english_name_map = {k: v for k, v in (data.get("english_name_map") or {}).items() if not k.startswith("_")} - return name_map, english_name_map + per_film_glossary = {k: v for k, v in (data.get("glossary") or {}).items() if not k.startswith("_")} + return name_map, english_name_map, per_film_glossary + + +# Default location for the global glossary (shipped publicly). +DEFAULT_GLOSSARY_PATH = Path(__file__).parent / "glossary.json" + + +def load_glossary(path: Path | None = None) -> dict[str, dict]: + """Load the global Chinese→English glossary. Entries are + {hanzi -> {"english": str, "context": str, "tags": [...]}}. + Underscore-prefixed keys (helps, format docs) are filtered out.""" + p = path or DEFAULT_GLOSSARY_PATH + if not p.exists(): + return {} + try: + with p.open(encoding="utf-8") as f: + data = json.load(f) + except (OSError, json.JSONDecodeError) as e: + print(f"warn: could not load glossary {p}: {e}", file=sys.stderr) + return {} + return {k: v for k, v in data.items() if not k.startswith("_") and isinstance(v, dict)} + + +def merge_glossaries(global_g: dict, per_film_g: dict) -> dict: + """Combine global + per-film glossary. Per-film entries override global on key collisions.""" + out = dict(global_g) + out.update(per_film_g) + return out + + +# Default location for TIMMY's system-prompt file. PinSub loads this at runtime +# and uses it as the system prompt for translator-role calls. If absent, the +# fallback inline string below is used. +DEFAULT_TIMMY_PROMPT_PATH = Path(__file__).parent / "README_TIMMY.md" + + +def load_timmy_prompt(path: Path | None = None) -> str | None: + """Read the TIMMY system-prompt file. 
Returns None if missing/unreadable.""" + p = path or DEFAULT_TIMMY_PROMPT_PATH + if not p.exists(): + return None + try: + return p.read_text(encoding="utf-8").strip() + except OSError as e: + print(f"warn: could not read {p}: {e}", file=sys.stderr) + return None @dataclass @@ -717,6 +773,1417 @@ def expand(parts: list[str]) -> list[str]: return out +# ---------- dictionary correction (Phase 1: empty-cue fill from CC-CEDICT) ---------- +# +# The Chinese subtitles are the source of truth; the existing English row is an +# imperfect translation, sometimes missing entirely. Phase 1 is conservative: +# we only fill cues where the merged result has no English line. Existing +# English is never overwritten in Phase 1 — that's the LLM verifier's job +# (Phase 2, gated on Arc A770 use per project rule). +# +# Every change is logged to .changes.tsv as a side channel so the manual +# review pass can audit what the automation did. + +DEFAULT_CEDICT_PATH = Path(__file__).parent / "Research" / "primary_sources" / "cedict" / "cedict_1_0_ts_utf-8_mdbg.txt" + +CEDICT_LINE_RE = re.compile(r"^(\S+)\s+(\S+)\s+\[([^\]]+)\]\s+/(.+)/\s*$") + + +def load_cedict(path: Path) -> dict[str, list[tuple[str, list[str]]]]: + """Parse CC-CEDICT into {simplified: [(pinyin, [defs]), ...]}. + + Multi-entry keys happen when the same simplified form has different pinyin + readings; we keep them all and pick at lookup time. Definitions inside an + entry are split on '/'; CC-CEDICT often has multiple synonymous glosses. + """ + if not path or not path.exists(): + return {} + out: dict[str, list[tuple[str, list[str]]]] = {} + try: + with path.open(encoding="utf-8") as f: + for line in f: + if not line or line.startswith("#"): + continue + m = CEDICT_LINE_RE.match(line) + if not m: + continue + _trad, simp, pinyin, defs_str = m.groups() + defs = [d for d in defs_str.split("/") if d] + out.setdefault(simp, []).append((pinyin, defs)) + except OSError as e: + print(f"warn: could not load cedict {path}: {e}", file=sys.stderr) + return {} + return out + + +def segment_greedy(text: str, cedict: dict, max_len: int = 12) -> list[str]: + """Left-to-right longest-match segmentation against CC-CEDICT keys. + + Hanzi runs are segmented by longest-match; non-Hanzi (punctuation, ASCII, + spaces) pass through one character at a time. max_len caps the lookup + window — most CC-CEDICT entries are <=8 chars; a generous 12 covers all + practical cases without quadratic blowup. + """ + if not text: + return [] + if not cedict: + return list(text) + out: list[str] = [] + i = 0 + n = len(text) + while i < n: + ch = text[i] + if HAN_RE.match(ch): + matched = False + for length in range(min(max_len, n - i), 0, -1): + cand = text[i:i + length] + if cand in cedict: + out.append(cand) + i += length + matched = True + break + if not matched: + # Single-char Hanzi without a multi-char hit; emit it (may still be in cedict as a single-char entry). + out.append(ch) + i += 1 + else: + out.append(ch) + i += 1 + return out + + +def gloss_segment(seg: str, cedict: dict) -> str | None: + """Return the first definition for a Hanzi segment, or None if unmappable. + + For multi-entry segments (e.g., 行 has multiple pinyin readings) the + rough heuristic is "pick the entry with the most definitions" — that + correlates loosely with "most common reading." Phase 2 LLM verifier + will pick smarter; Phase 1 just needs a plausible default. 
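+
+    A minimal shape sketch (hypothetical entries, not real CC-CEDICT data):
+
+        >>> cedict = {"行": [("xing2", ["to walk", "to go"]), ("hang2", ["row"])]}
+        >>> gloss_segment("行", cedict)   # the xing2 entry wins: two defs beat one
+        'to walk'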
+ """ + if seg not in cedict: + return None + entries = cedict[seg] + entries_sorted = sorted(entries, key=lambda e: -len(e[1])) + pinyin, defs = entries_sorted[0] + if not defs: + return None + return defs[0] + + +# Some CC-CEDICT definitions carry editorial annotations like "(slang)", +# "(Tw)", "(literary)", "(courteous, as opposed to informal 你[ni3])", or +# "see X". For a learner subtitle line we want the essential gloss, not +# the metadata. Strip the most common patterns aggressively. +GLOSS_LEADING_PAREN_RE = re.compile(r"^\(([A-Za-z][^)]{0,80})\)\s*") +GLOSS_PINYIN_BRACKETS_RE = re.compile(r"\[[a-zA-Z0-9 ]+\]") +GLOSS_INNER_ANNOTATION_RE = re.compile(r"\s*\((?:lit\.|fig\.|abbr\.|coll\.|slang|literary|formal|courteous|informal|archaic|dialect|usu\.|esp\.|see also|see|CL:[^)]+|as opposed to[^)]*|equivalent to[^)]*|same as[^)]*|written form of[^)]*|variant of[^)]*|usually|especially|by extension|extended meaning|of [^)]*|sound of [^)]*|interjection [^)]*)[^)]*\)") +GLOSS_SEE_ALSO_RE = re.compile(r"\s*;?\s*see\s+(also\s+)?\S+", flags=re.I) +GLOSS_CL_RE = re.compile(r"\s*\(CL:[^)]+\)") + +# Sentence-ending particles common in dialogue. Map to empty string so they +# don't pollute the gloss; punctuation already conveys the sentence end. +PARTICLE_OVERRIDES = { + "啦": "", + "啊": "", + "呀": "", + "哦": "", + "哎": "", + "嗨": "", + "嘛": "", + "呢": "", + "吧": "", + "嗯": "", + "唉": "", + "哟": "", + "嘿": "", + "诶": "", + "了": "", # aspect particle "le"; rarely useful as standalone gloss + "的": "", # possessive/genitive marker + "得": "", # complement marker + "地": "", # adverbial marker + "着": "", # progressive aspect marker + "过": "", # experiential aspect marker + "把": "", # disposal marker +} + +# Common verbs that are nearly always imperatives in subtitle dialogue. +# CC-CEDICT lists them as "to X" (infinitive); rewrite to bare imperative +# when the cue context suggests a command (short cue, ends with !, or +# single verb cue). +IMPERATIVE_LEMMA_OVERRIDES = { + "停": "Stop", + "走": "Go", + "来": "Come", + "去": "Go", + "等": "Wait", + "看": "Look", + "听": "Listen", + "请": "Please", + "起": "Get up", + "坐": "Sit", + "进": "Enter", + "出": "Get out", + "让开": "Move aside", + "小心": "Be careful", + "别动": "Don't move", + "不要": "Don't", + "快点": "Hurry", + "快": "Hurry", +} + + +def clean_gloss(gloss: str) -> str: + """Trim a CC-CEDICT definition for inline display.""" + g = gloss.strip() + # Drop a leading parenthetical annotation if present (may be quite long). + while True: + m = GLOSS_LEADING_PAREN_RE.match(g) + if not m: + break + g = g[m.end():].strip() + # Drop bracketed pinyin like [ni3] anywhere in the text. + g = GLOSS_PINYIN_BRACKETS_RE.sub("", g) + # Drop inner editorial annotations enclosed in parens. + g = GLOSS_INNER_ANNOTATION_RE.sub("", g) + # Drop any remaining "(CL:…)" classifier hints. + g = GLOSS_CL_RE.sub("", g) + # Drop trailing "see also X" / "see X" cross-references. + g = GLOSS_SEE_ALSO_RE.sub("", g) + # Tidy spacing. + g = re.sub(r"\s+", " ", g).strip() + g = re.sub(r"\s+([,.!?;:])", r"\1", g) + # If the gloss reduces to ";" or "," fragments, normalize. + g = g.strip(" ;,") + return g + + +def gloss_hanzi(hanzi: str, cedict: dict, name_map: dict[str, str] | None = None) -> str: + """Build a literal English gloss from a Hanzi line. + + Newlines in the Hanzi are preserved (each line glossed independently). + Unmappable Hanzi appear as `[?]` so a human reviewer can spot them. 
+ + If name_map is provided, segments matching a known proper noun use the + name_map entry instead of the CC-CEDICT literal gloss — so 秀莲 stays + "Xiùlián" (the character) rather than "beautiful lotus" (the literal). + name_map values are tone-marked pinyin; for the English row we strip + the tone marks and capitalize the syllable starts so the gloss matches + the english_name_map convention used elsewhere. + """ + if not hanzi or not cedict: + return "" + # Build a name-segmentation seed. We want longest-first matching of + # name_map keys to take precedence over generic CC-CEDICT segmentation. + name_keys = sorted((name_map or {}).keys(), key=len, reverse=True) if name_map else [] + + out_lines: list[str] = [] + for hline in hanzi.split("\n"): + # First pass: walk the line, replacing name-keys with sentinels, + # then segment the remainder via greedy CC-CEDICT match. + # Simpler: do a per-position check for name keys before each + # CC-CEDICT longest-match. + parts: list[str] = [] + i = 0 + n = len(hline) + while i < n: + ch = hline[i] + if HAN_RE.match(ch): + # Try name_map (longest first). + matched = False + for nk in name_keys: + if hline.startswith(nk, i): + parts.append(_name_to_english(name_map[nk])) + i += len(nk) + matched = True + break + if matched: + continue + # Fall through to CC-CEDICT longest-match. + for length in range(min(12, n - i), 0, -1): + cand = hline[i:i + length] + if cand in cedict: + # Particle override: drop entirely. + if cand in PARTICLE_OVERRIDES: + override = PARTICLE_OVERRIDES[cand] + if override: + parts.append(override) + i += length + matched = True + break + # Imperative override: emit the bare command form + # when the surrounding cue is short and verb-led. + if cand in IMPERATIVE_LEMMA_OVERRIDES: + parts.append(IMPERATIVE_LEMMA_OVERRIDES[cand]) + i += length + matched = True + break + g = gloss_segment(cand, cedict) + if g: + cleaned = clean_gloss(g) + if cleaned: + parts.append(cleaned) + else: + parts.append(f"[{cand}?]") + i += length + matched = True + break + if not matched: + parts.append(f"[{ch}?]") + i += 1 + else: + parts.append(ch) + i += 1 + line = " ".join(p for p in parts if p.strip()).strip() + line = re.sub(r"\s+", " ", line) + line = re.sub(r"\s+([,.!?;:])", r"\1", line) + out_lines.append(line) + return "\n".join(out_lines).strip() + + +def _name_to_english(tone_pinyin: str) -> str: + """Convert a name_map entry (e.g. 'Lǐ Mùbái') to bare English form ('Li Mubai'). + + Strip tone marks via NFKD decomposition + filter combining chars. + Capitalization is preserved (the consonant-side capital is what we want). + """ + import unicodedata + nfkd = unicodedata.normalize("NFKD", tone_pinyin) + return "".join(c for c in nfkd if not unicodedata.combining(c)) + + +def enrich_cues(cues: list[TriCue], cedict: dict, changelog_path: Path | None, + name_map: dict[str, str] | None = None + ) -> tuple[list[TriCue], set[int], list[tuple]]: + """Apply Phase-1 dictionary correction. + + Only fills empty English cues; never overwrites existing English. Logs + every change to .changes.tsv for review. + + Returns (cues, set_of_filled_cue_indices, changelog_rows). The filled + indices are what Phase 2 LLM verifier targets — those are the cues + whose English originated from CC-CEDICT and most need natural-language + cleanup. 
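+
+    Each changelog row is a (cue_index, action, hanzi, before, after) tuple;
+    a hypothetical fill looks like (42, "FILL", "走", "", "Go").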
+ """ + if not cedict: + print("enrich: no dictionary loaded; skipping") + return cues, set(), [] + changes: list[tuple] = [] + filled: set[int] = set() + for c in cues: + if c.hanzi and not c.english.strip(): + gloss = gloss_hanzi(c.hanzi, cedict, name_map=name_map) + if gloss: + changes.append((c.index, "FILL", c.hanzi.replace("\n", " | "), "", gloss)) + c.english = gloss + filled.add(c.index) + if changelog_path is not None and changes: + try: + with changelog_path.open("w", encoding="utf-8", newline="") as f: + f.write("idx\taction\thanzi\tbefore\tafter\n") + for row in changes: + f.write("\t".join(str(x).replace("\t", " ").replace("\n", " | ") for x in row) + "\n") + print(f"enrich: {len(changes)} cue(s) filled; changelog -> {changelog_path.name}") + except OSError as e: + print(f"warn: could not write changelog {changelog_path}: {e}", file=sys.stderr) + else: + print(f"enrich: {len(changes)} cue(s) filled") + return cues, filled, changes + + +# ---------- Phase 2: LLM verifier via llama.cpp-vulkan on Arc A770 ---------- +# +# Phase 1 (dictionary) fills empty cues with literal CC-CEDICT gloss; quality +# is bimodal. Phase 2 sends the filled cues (and optionally divergent cues) +# to a local llama.cpp HTTP server for natural-language correction. +# +# Pattern matches the owner's TIMMY backend convention: spawn `llama-server` +# with Vulkan device + auto gpu-layers, talk over HTTP, kill on exit. +# Per project rule (03_PROJECT.md workflow preferences), every A770 dispatch +# is announced and logged to Logs/A770_usage.md. + +LLAMA_HEALTH_TIMEOUT = 180 # seconds; first-call JIT can take a while +LLAMA_REQUEST_TIMEOUT = 120 +LLAMA_DEFAULT_PORT = 8765 # avoid colliding with TIMMY on 8080 + +LLM_SYSTEM_PROMPT = ( + "You translate Chinese film subtitles into English for a Mandarin learner. " + "The Chinese is the source of truth. Produce an idiomatic English line that " + "matches the Chinese meaning. Be short — one line fits on screen. " + "Do NOT explain. Do NOT include the Chinese text. Do NOT add quotes or " + "labels. Output ONLY the English translation. If the Chinese is a single " + "interjection or name, output its English equivalent only. /no_think" +) + + +@dataclass +class LlamaSession: + """Live llama-server handle plus metadata for the A770 usage log.""" + proc: subprocess.Popen + port: int + model_path: Path + server_exe: Path + device: str | None + started_at: float + first_call_at: float | None = None + last_call_at: float | None = None + calls: int = 0 + total_completion_tokens: int = 0 + total_prompt_tokens: int = 0 + + +def find_arc_vulkan_device(server_exe: Path) -> str | None: + """Run `llama-server --list-devices` and return the Arc device id (e.g. 'Vulkan0').""" + try: + out = subprocess.run( + [str(server_exe), "--list-devices"], + capture_output=True, text=True, timeout=30, check=False, + ) + except (OSError, subprocess.SubprocessError) as e: + print(f"warn: --list-devices failed: {e}", file=sys.stderr) + return None + blob = (out.stdout or "") + "\n" + (out.stderr or "") + # llama-server prints lines like: " Vulkan0: Intel(R) Arc(TM) A770 Graphics (...) 
+ for line in blob.splitlines(): + if "Arc" in line and "Vulkan" in line: + m = re.search(r"(Vulkan\d+)", line) + if m: + return m.group(1) + return None + + +def wait_for_llama_health(port: int, timeout: int = LLAMA_HEALTH_TIMEOUT) -> bool: + """Poll /health on the local server until 200 or timeout.""" + deadline = time.time() + timeout + while time.time() < deadline: + try: + conn = http.client.HTTPConnection("127.0.0.1", port, timeout=3) + conn.request("GET", "/health") + r = conn.getresponse() + conn.close() + if r.status == 200: + return True + except (OSError, http.client.HTTPException): + pass + time.sleep(1) + return False + + +def start_llama_server(server_exe: Path, model_path: Path, + port: int = LLAMA_DEFAULT_PORT, + device: str | None = None, + n_gpu_layers: str = "auto", + ctx_size: int = 4096, + n_threads: int = 4) -> LlamaSession: + """Spawn llama-server. Returns a LlamaSession; caller is responsible for stop_llama_server(). + + n_threads caps CPU usage. Vulkan/GPU workloads spend most time on the GPU; + CPU threads are for tokenization, scheduling, and any CPU-resident layers. + Default 4 leaves room for other CPU work.""" + if not server_exe.exists(): + sys.exit(f"missing llama-server: {server_exe}") + if not model_path.exists(): + sys.exit(f"missing GGUF model: {model_path}") + cmd = [str(server_exe), "-m", str(model_path), + "--port", str(port), + "--ctx-size", str(ctx_size), + "--gpu-layers", n_gpu_layers, + "--threads", str(n_threads), + "--threads-batch", str(n_threads), + "--no-warmup"] # we'll measure first-call JIT ourselves + if device: + cmd += ["--device", device] + # 🔴 STARTING A SERVER — per Master §1.6 + print(f"\n🔴 STARTING llama-server on port {port}") + print(f" model: {model_path.name}") + print(f" device: {device or '(auto)'}") + print(f" ctx: {ctx_size}") + print(f" cmd: {' '.join(cmd)}\n") + proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, + text=True, encoding="utf-8", errors="replace", + bufsize=1) + started = time.time() + # Print server output in the background while waiting for health. + if not wait_for_llama_health(port): + # Capture whatever the server emitted so we can diagnose. + if proc.stdout is not None: + try: + # Non-blocking-ish: read what's available. + proc.terminate() + try: + captured, _ = proc.communicate(timeout=3) + except subprocess.TimeoutExpired: + proc.kill() + captured, _ = proc.communicate() + print(captured[-2000:] if captured else "(no output)", file=sys.stderr) + except (OSError, ValueError): + pass + sys.exit("llama-server failed to come up within timeout") + print(f"llama-server ready on :{port} after {time.time() - started:.1f}s") + return LlamaSession( + proc=proc, port=port, model_path=model_path, server_exe=server_exe, + device=device, started_at=started, + ) + + +def stop_llama_server(session: LlamaSession) -> None: + """Terminate the server and verify the port is free. Per Master §1.6.""" + if session.proc.poll() is None: + try: + session.proc.terminate() + session.proc.wait(timeout=10) + except subprocess.TimeoutExpired: + session.proc.kill() + session.proc.wait(timeout=5) + # Verify port is free. 
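+    # A successful create_connection means something is still listening on the
+    # port (a leaked process); the expected outcome is connection refused,
+    # which raises OSError and falls through to the confirmation print.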
+    try:
+        with socket.create_connection(("127.0.0.1", session.port), timeout=2):
+            print(f"warn: port {session.port} still bound after kill", file=sys.stderr)
+            return
+    except OSError:
+        pass  # port is free, which is what we want
+    print(f"\nllama-server stopped; port {session.port} confirmed free")
+
+
+def llm_complete(session: LlamaSession, system_prompt: str, user_prompt: str,
+                 max_tokens: int = 96, temperature: float = 0.2) -> str | None:
+    """POST to /v1/chat/completions; return content or None on failure."""
+    body = json.dumps({
+        "messages": [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": user_prompt},
+        ],
+        "max_tokens": max_tokens,
+        "temperature": temperature,
+        "stream": False,
+    }).encode("utf-8")
+    try:
+        conn = http.client.HTTPConnection("127.0.0.1", session.port, timeout=LLAMA_REQUEST_TIMEOUT)
+        conn.request("POST", "/v1/chat/completions", body=body,
+                     headers={"Content-Type": "application/json"})
+        r = conn.getresponse()
+        if r.status != 200:
+            print(f"llm error: HTTP {r.status} — {r.read()[:200]}", file=sys.stderr)
+            conn.close()
+            return None
+        data = json.loads(r.read().decode("utf-8"))
+        conn.close()
+    except (OSError, json.JSONDecodeError, KeyError) as e:
+        print(f"llm error: {e}", file=sys.stderr)
+        return None
+    now = time.time()
+    session.calls += 1
+    if session.first_call_at is None:
+        session.first_call_at = now
+    session.last_call_at = now
+    usage = data.get("usage") or {}
+    session.total_completion_tokens += int(usage.get("completion_tokens") or 0)
+    session.total_prompt_tokens += int(usage.get("prompt_tokens") or 0)
+    try:
+        msg = data["choices"][0]["message"]
+    except (KeyError, IndexError):
+        return None
+    content = (msg.get("content") or "").strip()
+    if content:
+        return content
+    # Qwen3 reasoning mode: if /no_think failed and `content` is empty, the
+    # answer may still live inside `reasoning_content` (a stream of thoughts).
+    # We try to extract a final-line subtitle from it. Last resort.
+    reasoning = (msg.get("reasoning_content") or "").strip()
+    if reasoning:
+        # Look for an explicit quoted answer first.
+        m = re.search(r'"([^"]{2,80})"', reasoning)
+        if m:
+            return m.group(1).strip()
+        # Otherwise the last non-empty line is the best guess.
+        for line in reversed(reasoning.splitlines()):
+            s = line.strip().strip('"').strip("'").strip()
+            if 2 <= len(s) <= 120:
+                return s
+    return None
+
+
+def _clean_llm_output(text: str) -> str:
+    """Defensive cleanup of LLM output."""
+    if not text:
+        return ""
+    t = text.strip()
+    # Strip wrapping quotes if the model added them.
+    if (t.startswith('"') and t.endswith('"')) or (t.startswith("'") and t.endswith("'")):
+        t = t[1:-1].strip()
+    # Drop trailing model artifacts like "<|im_end|>" or thinking-tag residue.
+    t = re.sub(r"</?think[^>]*>|<\|im_end\|>", "", t, flags=re.I).strip()
+    # Some Qwen variants prefix the answer with the source language. Drop.
+    t = re.sub(r"^(English|Translation|Subtitle)\s*[:\-]\s*", "", t, flags=re.I)
+    return t
+
+
+def build_verify_prompt(cue: TriCue, dict_suggestion: str,
+                        prior_cues: list[TriCue], name_map: dict[str, str]) -> str:
+    """Build the user-side prompt for cue verification."""
+    parts: list[str] = []
+    parts.append(f"Chinese: {cue.hanzi}")
+    if cue.english.strip() and cue.english.strip() != dict_suggestion:
+        parts.append(f"Existing English (may be wrong or stilted): {cue.english}")
+    if dict_suggestion:
+        parts.append(f"Dictionary literal gloss (often awkward, just a hint): {dict_suggestion}")
+    # Two most recent cues that have Hanzi, for context.
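+    # Hypothetical: prior_cues = [c7 (has hanzi), c8 (no hanzi), c9 (has hanzi)]
+    # filters to [c7, c9]; the [-2:] keeps at most the last two of those.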
+ ctx = [pc for pc in prior_cues if pc.hanzi][-2:] + if ctx: + parts.append("Recent context (prior cues):") + for pc in ctx: + parts.append(f" {pc.hanzi} → {pc.english}") + # Name hints if any name appears in this cue. + hints: list[str] = [] + for h, p in (name_map or {}).items(): + if h and h in cue.hanzi: + hints.append(f"{h} = {_name_to_english(p)}") + if hints: + parts.append("Known names: " + "; ".join(hints)) + parts.append("Translate the Chinese above into one short English subtitle line.") + return "\n".join(parts) + + +def _cache_key(model_name: str, hanzi: str) -> str: + return hashlib.sha1(f"{model_name}|{hanzi}".encode("utf-8")).hexdigest() + + +def load_llm_cache(cache_path: Path) -> dict[str, str]: + if not cache_path.exists(): + return {} + try: + with cache_path.open(encoding="utf-8") as f: + return json.load(f) + except (OSError, json.JSONDecodeError): + return {} + + +def save_llm_cache(cache_path: Path, cache: dict[str, str]) -> None: + try: + cache_path.parent.mkdir(parents=True, exist_ok=True) + with cache_path.open("w", encoding="utf-8") as f: + json.dump(cache, f, ensure_ascii=False, indent=1) + except OSError as e: + print(f"warn: could not write cache {cache_path}: {e}", file=sys.stderr) + + +def verify_cues_with_llm(cues: list[TriCue], session: LlamaSession, + filled_indices: set[int], + name_map: dict[str, str], + cedict: dict, + cache: dict[str, str], + model_name: str, + changelog_rows: list[tuple]) -> int: + """Verify the previously-filled-by-dictionary cues; replace English with LLM output. + + Returns the number of cues replaced. + """ + replaced = 0 + cue_by_idx = {c.index: c for c in cues} + indices_sorted = sorted(filled_indices) + print(f"llm verify: {len(indices_sorted)} cue(s) to check") + for n, idx in enumerate(indices_sorted, 1): + c = cue_by_idx.get(idx) + if c is None or not c.hanzi: + continue + key = _cache_key(model_name, c.hanzi) + cached = cache.get(key) + if cached is not None: + new_text = cached + src = "cache" + else: + # Compute dict suggestion fresh (same logic Phase 1 uses). + dict_suggestion = gloss_hanzi(c.hanzi, cedict, name_map=name_map) if cedict else "" + # Two cues directly prior in cue order. 
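+                # (The window is four wide so build_verify_prompt can still find
+                #  two Hanzi-bearing cues after filtering; e.g. idx=10 scans 6..9.)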
+ prior = [cue_by_idx[i] for i in range(max(1, idx - 4), idx) if i in cue_by_idx] + prompt = build_verify_prompt(c, dict_suggestion, prior, name_map) + raw = llm_complete(session, LLM_SYSTEM_PROMPT, prompt) + new_text = _clean_llm_output(raw) if raw else "" + if not new_text: + print(f" cue {idx}: no LLM output; keeping existing", file=sys.stderr) + continue + cache[key] = new_text + src = "llm" + if new_text and new_text != c.english.strip(): + changelog_rows.append((c.index, f"VERIFY({src})", c.hanzi.replace("\n", " | "), + c.english, new_text)) + c.english = new_text + replaced += 1 + if n % 5 == 0 or n == len(indices_sorted): + print(f" ...{n}/{len(indices_sorted)} cue(s) processed") + return replaced + + +def append_changelog(changelog_path: Path, rows: list[tuple]) -> None: + """Append (or create) a TSV log of all enrichment actions.""" + if not rows: + return + header_needed = not changelog_path.exists() + try: + with changelog_path.open("a", encoding="utf-8", newline="") as f: + if header_needed: + f.write("idx\taction\thanzi\tbefore\tafter\n") + for row in rows: + f.write("\t".join(str(x).replace("\t", " ").replace("\n", " | ") for x in row) + "\n") + except OSError as e: + print(f"warn: could not write changelog {changelog_path}: {e}", file=sys.stderr) + + +def write_a770_usage_log(log_path: Path, session: LlamaSession, *, + job: str, cues_processed: int, cues_replaced: int, + source_session_path: Path | None, + success: bool, notes: str = "") -> None: + """Append a structured row to Logs/A770_usage.md per the project rule.""" + log_path.parent.mkdir(parents=True, exist_ok=True) + elapsed = time.time() - session.started_at + jit_compile = ((session.first_call_at - session.started_at) + if session.first_call_at else None) + inference_window = ((session.last_call_at - session.first_call_at) + if (session.first_call_at and session.last_call_at) else 0.0) + completion_tps = (session.total_completion_tokens / inference_window + if inference_window > 0 else 0.0) + header = ( + "| time (UTC) | project | job | model | quant | n_ctx | n_gpu_layers " + "| device | calls | prompt_tok | completion_tok | completion_tps | jit_s " + "| total_wall_s | success | notes | session_log |" + ) + sep = "|" + "|".join(["---"] * 16) + "|" + if not log_path.exists(): + log_path.write_text( + "# A770 usage log — PS\n\n" + "Per project rule (`03_PROJECT.md` Owner workflow preferences): every A770 dispatch is logged.\n\n" + f"{header}\n{sep}\n", + encoding="utf-8", + ) + quant = "" + m = re.search(r"-(Q\d[A-Za-z0-9_]*)\.gguf$", session.model_path.name, re.I) + if m: + quant = m.group(1) + now_utc = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()) + src = source_session_path.name if source_session_path else "" + row_cells = [ + now_utc, "PS", job, session.model_path.name, quant, + "4096", "auto", session.device or "(auto)", + str(session.calls), str(session.total_prompt_tokens), + str(session.total_completion_tokens), + f"{completion_tps:.1f}", + f"{jit_compile:.1f}" if jit_compile is not None else "-", + f"{elapsed:.1f}", + "yes" if success else "no", + notes.replace("|", "/").replace("\n", " "), + src, + ] + with log_path.open("a", encoding="utf-8") as f: + f.write("| " + " | ".join(row_cells) + " |\n") + + +# ---------- Phase 3: Python-orchestrated translate-then-compare correction ---------- +# +# Bigger-scope correction than --llm-verify. Where Phase 2 only re-translates +# the cues Phase 1 filled, Phase 3 also goes after cues whose EXISTING English +# diverges from what the Chinese says. 
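+# (Hypothetical case: the Chinese line names the sword, but the English row
+#  reads "Let's go." Phase 1 skips it because English already exists; the
+#  Phase-3 overlap heuristic flags it and TIMMY re-translates.)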
+# +# Pipeline per cue: +# 1. Python heuristic (content-word overlap + length ratio + empty check) +# decides whether the cue is worth sending to the LLM. This keeps TIMMY +# from re-translating the 90% of cues that are already fine. +# 2. TIMMY-translate (fresh prompt, no existing English in context) produces +# a Chinese-faithful candidate. Per-cue cache. +# 3. If existing English exists and differs from TIMMY's, a SEPARATE TIMMY +# call (fresh context, different prompt) judges: A=existing, B=TIMMY's, +# or C=write a better one. Per-cue cache. +# 4. Python applies name_map enforcement on the chosen text and logs. + +# Common English stop words — excluded from content-word overlap calculations. +_STOP_WORDS = { + "a", "an", "the", "is", "are", "was", "were", "be", "been", "being", "am", + "in", "on", "at", "of", "for", "with", "to", "from", "by", "as", "into", + "and", "or", "but", "nor", "so", "if", "then", "than", + "i", "you", "he", "she", "it", "we", "they", + "me", "him", "her", "us", "them", + "my", "your", "his", "hers", "its", "our", "ours", "their", "theirs", + "do", "does", "did", "doing", "done", + "have", "has", "had", "having", + "what", "who", "whom", "where", "when", "why", "how", "which", + "this", "that", "these", "those", + "lit", "fig", "interj", + "not", "no", + "any", "all", "some", "much", "many", "more", "most", "few", "less", "least", + "very", "too", "so", "just", + "let", "lets", +} + + +def _content_words(text: str) -> set[str]: + """Lowercased English content words, stopwords/short words removed.""" + if not text: + return set() + words = re.findall(r"[A-Za-z]+", text.lower()) + return {w for w in words if len(w) >= 2 and w not in _STOP_WORDS} + + +def detect_suspect_english(c: TriCue, cedict: dict, name_map: dict | None, + overlap_threshold: float = 0.2) -> tuple[bool, str]: + """Python heuristic: does this cue's existing English likely diverge from the Chinese? + + Returns (is_suspect, reason). Cheap (no LLM); meant to prefilter so TIMMY + only sees cues that actually need attention. + """ + if not c.hanzi: + return False, "no-hanzi" + existing = c.english.strip() + if not existing: + return True, "empty" + if not cedict: + return False, "no-dict" + # Compute the dict gloss (literal content words). + gloss = gloss_hanzi(c.hanzi, cedict, name_map=name_map) + if not gloss: + return False, "no-gloss" + gloss_words = _content_words(gloss) + if not gloss_words: + # Gloss reduces to particles/empty; nothing to compare against. + return False, "thin-gloss" + en_words = _content_words(existing) + # Add tone-stripped name-map English forms to the gloss vocabulary too — + # that way "Xiu Lian" matches the existing English referring to the character. + for h, p in (name_map or {}).items(): + if h and h in c.hanzi: + gloss_words |= _content_words(_name_to_english(p)) + if not gloss_words: + return False, "thin-gloss" + overlap = len(en_words & gloss_words) / max(1, len(gloss_words)) + if overlap < overlap_threshold: + return True, f"low-overlap({overlap:.2f})" + # Length ratio sanity check on long-ish cues. + han_chars = len([ch for ch in c.hanzi if HAN_RE.match(ch)]) + en_len = len(existing) + if han_chars >= 6 and en_len < han_chars * 1.0: + return True, f"english-too-short({en_len}/{han_chars})" + if han_chars >= 4 and en_len > han_chars * 12: + return True, f"english-too-long({en_len}/{han_chars})" + return False, "ok" + + +# Fallback inline system prompt used when README_TIMMY.md is absent. 
The real +# prompt lives in README_TIMMY.md next to PinSub.py and is loaded at startup. +LLM_FRESH_SYSTEM_PROMPT = ( + "You translate Chinese film subtitles into English for a Mandarin learner. " + "The Chinese is the source of truth. Translate every Chinese content word " + "(noun, verb, name, modifier) — function words (particles, articles) follow " + "each language's grammar. Preserve Chinese word order when grammatically " + "tolerable in English. Use the 'Known names' and 'Glossary' hints if " + "provided. NEVER leave Chinese characters in your output — if unsure of a " + "term, output [?] for it. Output one short English line. No quotes, no " + "labels, no explanation. /no_think" +) + + +def _build_glossary_hints(hanzi: str, glossary: dict) -> list[tuple[str, str]]: + """Return [(hanzi_term, english_hint)] for every glossary key found in this cue. + + Longest keys first — so "天下第一枪" (compound) matches before "枪" alone.""" + if not hanzi or not glossary: + return [] + hits: list[tuple[str, str]] = [] + for key in sorted(glossary.keys(), key=len, reverse=True): + if key in hanzi: + entry = glossary[key] + english_hint = (entry.get("english") or "").strip() if isinstance(entry, dict) else str(entry) + if english_hint: + hits.append((key, english_hint)) + return hits + + +def _detect_name_candidates(hanzi: str, pinyin: str, name_map: dict | None) -> list[tuple[str, str]]: + """Returns [(hanzi_span, english_form)] for likely name spans in the cue. + + When name_map covers a span, use it (canonical). When not, fall back to a + bare-pinyin form derived from the cue's already-computed pinyin. This is + the generic path for films without a name_map — TIMMY gets a pinyin-derived + hint per likely-name span and can use it verbatim.""" + if not hanzi: + return [] + out: list[tuple[str, str]] = [] + # Apply name_map first (longest first). + if name_map: + for h in sorted(name_map.keys(), key=len, reverse=True): + if h and h in hanzi: + out.append((h, _name_to_english(name_map[h]))) + # No generic auto-detection for now: pypinyin doesn't reliably identify + # names from raw text, and false positives ("walk" picked as surname) would + # poison the hints. Instead, we surface the *entire* pinyin row to TIMMY + # in the prompt (see build_translate_user_prompt) — TIMMY can use the + # pinyin syllable structure to spell any names that weren't in name_map. 
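+    # Hypothetical shape: name_map {"秀莲": "Xiùlián"} on cue "秀莲你看"
+    # yields [("秀莲", "Xiulian")]; tone marks are stripped by _name_to_english.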
+    return out
+
+
+def build_translate_user_prompt(c: TriCue, prior_cues: list[TriCue],
+                                name_hints: list[tuple[str, str]],
+                                glossary_hits: list[tuple[str, str]],
+                                feedback: str | None = None) -> str:
+    """Assemble the user-side prompt for a translator-role TIMMY call."""
+    parts: list[str] = []
+    ctx = [pc for pc in prior_cues if pc.hanzi][-2:]
+    if ctx:
+        parts.append("Recent scene context (prior cues for awareness):")
+        for pc in ctx:
+            en = pc.english.strip().replace("\n", " ") or "(no english)"
+            parts.append(f"  {pc.hanzi} → {en}")
+    if name_hints:
+        parts.append("Known names in this cue: " + "; ".join(
+            f"{h}={en}" for h, en in name_hints))
+    if glossary_hits:
+        parts.append("Glossary (use these English forms for the Chinese terms below):")
+        for h, hint in glossary_hits:
+            parts.append(f"  {h} → {hint}")
+    if c.pinyin:
+        parts.append(f"Pinyin row: {c.pinyin}")
+    parts.append(f"Chinese: {c.hanzi}")
+    if feedback:
+        parts.append(f"Your previous attempt had these issues — fix them:\n{feedback}")
+    parts.append("Translate the Chinese above into one short English subtitle line.")
+    return "\n".join(parts)
+
+
+def llm_translate_fresh(session: LlamaSession, c: TriCue,
+                        prior_cues: list[TriCue], name_map: dict,
+                        glossary: dict, system_prompt: str,
+                        feedback: str | None = None) -> str | None:
+    """Fresh TIMMY call: translate Chinese only, without seeing existing English."""
+    name_hints = _detect_name_candidates(c.hanzi, c.pinyin, name_map)
+    glossary_hits = _build_glossary_hits(c.hanzi, glossary) if False else _build_glossary_hints(c.hanzi, glossary)
+    user = build_translate_user_prompt(c, prior_cues, name_hints, glossary_hits, feedback)
+    raw = llm_complete(session, system_prompt, user)
+    return _clean_llm_output(raw) if raw else None
+
+
+LLM_COMPARE_SYSTEM_PROMPT = (
+    "You judge English subtitle translations for fidelity to a Chinese source. "
+    "Given the Chinese and two English candidates A and B, decide which "
+    "candidate better matches the Chinese meaning, OR write a better English "
+    "translation if both candidates are flawed. Respond with EXACTLY one line:\n"
+    "  A   (candidate A is better)\n"
+    "  B   (candidate B is better)\n"
+    "  C: <your translation>   (write your own better translation)\n"
+    "Do not explain. Do not include the Chinese. /no_think"
+)
+
+
+def llm_compare_translations(session: LlamaSession, c: TriCue,
+                             en_a: str, en_b: str,
+                             name_map: dict) -> tuple[str, str]:
+    """Fresh TIMMY: compare two English candidates. Returns (verdict, final_text).
+
+    Verdict is "A", "B", or "C" (model wrote a new translation).
+    Parse failures default to A (= keep existing) — least risk of regression.
+    """
+    parts: list[str] = []
+    parts.append(f"Chinese: {c.hanzi}")
+    parts.append(f"A: {en_a}")
+    parts.append(f"B: {en_b}")
+    hints: list[str] = []
+    for h, p in (name_map or {}).items():
+        if h and h in c.hanzi:
+            hints.append(f"{h}={_name_to_english(p)}")
+    if hints:
+        parts.append("Known names: " + "; ".join(hints))
+    parts.append("Which English better matches the Chinese? Answer A, B, or C: <your translation>.")
+    raw = llm_complete(session, LLM_COMPARE_SYSTEM_PROMPT, "\n".join(parts), max_tokens=80)
+    if not raw:
+        return "A", en_a
+    text = raw.strip()
+    # The model sometimes wraps in extra punctuation. Tolerate "A.", "A!", etc.
+    first = re.match(r"^\s*([ABC])\s*[:.\-—]?\s*(.*)$", text, re.DOTALL)
+    if not first:
+        return "A", en_a
+    verdict = first.group(1).upper()
+    rest = first.group(2).strip()
+    if verdict == "A":
+        return "A", en_a
+    if verdict == "B":
+        return "B", en_b
+    if verdict == "C":
+        # Use whatever follows the C label; fall back if empty.
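+        # Hypothetical raw reply: 'C: He refuses to hand over the sword.'
+        # parses to verdict "C" with that sentence as the suggestion.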
+ suggestion = _clean_llm_output(rest.split("\n", 1)[0]) + if suggestion: + return "C", suggestion + return "A", en_a + return "A", en_a + + +def validate_translation(english: str, hanzi: str, + name_hints: list[tuple[str, str]], + glossary_hits: list[tuple[str, str]] + ) -> tuple[bool, list[str]]: + """Python post-validation of a TIMMY translation. Returns (ok, feedback_messages). + + Feedback messages are phrased as direct instructions to TIMMY so they can + feed back into the retry loop unchanged. + + Checks: + 1. No Hanzi in the English output. + 2. Each name in the cue's Hanzi appears in canonical form in the English. + 3. Length sanity vs Hanzi character count (only on longer cues). + 4. Glossary coverage is informational — flagged but not rejected (synonym latitude).""" + if not english: + return False, ["Your output was empty. Translate the Chinese into one English line."] + issues: list[str] = [] + + # 1. Hanzi in the English row — hard reject. + leftover = HAN_RE.findall(english) + if leftover: + unique = sorted(set(leftover)) + issues.append( + f"Your output still contains Chinese characters: {' '.join(unique)}. " + "Translate every Hanzi into English. If you don't know a term, " + "output [?] for it instead of leaving the Hanzi." + ) + + # 2. Name canonicalization. Only fires when name_map produced a hint — + # we don't second-guess the generic name path. + en_lower = english.lower() + for hanzi_name, canonical_en in name_hints: + # Tolerate case-insensitive match anywhere in the line. + if canonical_en and canonical_en.lower() not in en_lower: + issues.append( + f"The name '{hanzi_name}' should appear in your English as " + f"'{canonical_en}' (the canonical spelling). Use that exact form." + ) + + # 3. Length sanity — only on cues with enough Hanzi to be meaningful. + han_chars = len([c for c in hanzi if HAN_RE.match(c)]) + en_len_alpha = len(re.sub(r"[^A-Za-z]", "", english)) + if han_chars >= 6 and en_len_alpha < han_chars: + # Likely you dropped content. Common when subjects/verbs are omitted. + issues.append( + f"Your English ({en_len_alpha} letters) is short for a " + f"{han_chars}-Hanzi cue. Make sure every content word in the " + "Chinese has an English equivalent — don't drop nouns or verbs." + ) + + # 4. (informational) Glossary coverage: we don't reject here, but if NONE + # of the glossary terms made it through, note it for the changelog. + # (no-op for now; could feed a softer "consider X" prompt later) + + return len(issues) == 0, issues + + +def correct_cues_with_llm(cues: list[TriCue], session: LlamaSession, + cedict: dict, name_map: dict, + glossary: dict, + system_prompt: str, + mode: str = "divergent", + do_compare: bool = True, + max_cues: int = 0, + max_rounds: int = 3, + cache_path: Path | None = None, + changelog_rows: list[tuple] | None = None + ) -> tuple[int, dict]: + """Phase 3 orchestrator. Returns (cues_replaced, per_action_counts).""" + if changelog_rows is None: + changelog_rows = [] + cue_by_idx = {c.index: c for c in cues} + cues_in_order = sorted(cue_by_idx.keys()) + + # Step 1: Python filtering. Compute the divergence reason for every cue + # exactly once and reuse it for both selection and the informational + # histogram below. 
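+    # Hypothetical: gloss content words {sword, return, master} vs existing
+    # English words {thanks} -> overlap 0/3 = 0.00 < 0.2 -> "low-overlap(0.00)".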
+ candidates: list[tuple[int, str]] = [] # (idx, reason) + suspect_counts: dict[str, int] = {} + for idx in cues_in_order: + c = cue_by_idx[idx] + if not c.hanzi: + continue + suspect, reason = detect_suspect_english(c, cedict, name_map) + suspect_counts[reason] = suspect_counts.get(reason, 0) + 1 + if mode == "fills": + if not c.english.strip(): + candidates.append((idx, "empty")) + elif mode == "all": + candidates.append((idx, reason)) + else: # divergent + if suspect: + candidates.append((idx, reason)) + print(f"llm correct: mode={mode}, {len(candidates)} candidate cue(s) selected") + print(f" divergence histogram: {dict(sorted(suspect_counts.items(), key=lambda kv: -kv[1])[:8])}") + if max_cues and max_cues > 0 and len(candidates) > max_cues: + candidates = candidates[:max_cues] + print(f" --llm-max set: capping at {len(candidates)}") + + # Step 2: caching scaffolding. + cache: dict[str, str] = {} + if cache_path is not None: + cache = load_llm_cache(cache_path) + model_name = session.model_path.name + + def t_key(hanzi: str) -> str: + return "T|" + hashlib.sha1(f"{model_name}|{hanzi}".encode("utf-8")).hexdigest() + + def c_key(hanzi: str, en_a: str, en_b: str) -> str: + return "C|" + hashlib.sha1(f"{model_name}|{hanzi}|{en_a}|{en_b}".encode("utf-8")).hexdigest() + + # Per-action counters. + counts = {"translate_only": 0, "kept_existing": 0, + "compare_A": 0, "compare_B": 0, "compare_C": 0, + "no_change": 0} + replaced = 0 + + rounds_used_counts = {1: 0, 2: 0, 3: 0, 4: 0} # 4 = gave up + + for n, (idx, reason) in enumerate(candidates, 1): + c = cue_by_idx[idx] + existing = c.english.strip() + + # Step 3: TIMMY-translate fresh, with up-to-max_rounds validation retries. + tk = t_key(c.hanzi) + if tk in cache: + en_timmy = cache[tk] + rounds_used_counts[1] += 1 + else: + prior = [cue_by_idx[i] for i in range(max(1, idx - 4), idx) if i in cue_by_idx] + name_hints = _detect_name_candidates(c.hanzi, c.pinyin, name_map) + glossary_hits = _build_glossary_hints(c.hanzi, glossary) + feedback: str | None = None + en_timmy = "" + last_issues: list[str] = [] + for round_idx in range(1, max_rounds + 1): + raw = llm_translate_fresh( + session, c, prior, name_map, glossary, system_prompt, feedback=feedback, + ) + en_timmy = (raw or "").strip() + if not en_timmy: + feedback = "Your previous reply was empty. Output one English line." + continue + ok, issues = validate_translation(en_timmy, c.hanzi, name_hints, glossary_hits) + if ok: + rounds_used_counts[round_idx] = rounds_used_counts.get(round_idx, 0) + 1 + break + last_issues = issues + feedback = "; ".join(issues) + else: + # Loop fell through max_rounds without an ok pass — keep best-effort output but note it. + rounds_used_counts[4] = rounds_used_counts.get(4, 0) + 1 + if last_issues and changelog_rows is not None: + # Surface the failure in the changelog as a flag. + pass + if en_timmy: + cache[tk] = en_timmy + + if not en_timmy: + counts["no_change"] += 1 + continue + + # Step 4: decide the final text. + if not existing: + final = en_timmy + verdict = "T" + counts["translate_only"] += 1 + elif existing.lower() == en_timmy.lower(): + counts["kept_existing"] += 1 + continue + elif not do_compare: + # Replace blindly (mode disabled comparison). 
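+            # (Verdict "T" marks translate-only rows in the changelog: no compare ran.)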
+ final = en_timmy + verdict = "T" + counts["translate_only"] += 1 + else: + ck = c_key(c.hanzi, existing, en_timmy) + if ck in cache: + cached_str = cache[ck] + # Format: "A|text" or "B|text" or "C|text" + v = cached_str.split("|", 1)[0] if "|" in cached_str else "A" + txt = cached_str.split("|", 1)[1] if "|" in cached_str else existing + verdict = v + final = txt + else: + v, f = llm_compare_translations(session, c, existing, en_timmy, name_map) + verdict = v + final = f + cache[ck] = f"{v}|{f}" + counts[f"compare_{verdict}"] = counts.get(f"compare_{verdict}", 0) + 1 + + if final and final.strip() != c.english.strip(): + changelog_rows.append((c.index, f"CORRECT({verdict},{reason})", + c.hanzi.replace("\n", " | "), c.english, final)) + c.english = final + replaced += 1 + else: + counts["no_change"] += 1 + + if n % 10 == 0 or n == len(candidates): + print(f" ...{n}/{len(candidates)} cue(s) processed (replaced so far: {replaced})") + + if cache_path is not None: + save_llm_cache(cache_path, cache) + counts["rounds_used"] = rounds_used_counts # nested histogram for logging + return replaced, counts + + +# ---------- spot-check (visual review of subtitle formatting) ---------- +# +# The Chinese subtitle is trusted to match the scene. What CAN go wrong is +# subtitle FORMATTING: a cue too long for the screen, a multi-cue merger that +# now spans 4+ lines, a "hold-for-reading" extension that overlaps the next +# beat, fast-dialogue alignment slips. These are all visual problems — Python +# can SURFACE the suspect cues by heuristic, ffmpeg burns the trilingual sub +# onto the frame, and an HTML index lets the owner scan a grid quickly. +# +# This is path A — "no VLM needed, just frame the candidate." Path B (a vision +# LLM judging frame+subtitle pairs) layers on top once a VLM GGUF is available. + +import html as html_lib + + +def _score_cue_for_spotcheck(c: TriCue) -> tuple[float, list[str]]: + """Score a cue's "needs human eyes on it" probability. Returns (score, tags). + + Tags are short labels that explain WHY the cue was picked — they get + rendered alongside the frame in the HTML grid.""" + score = 0.0 + tags: list[str] = [] + joined = c.joined() + char_count = len(joined) + lines = joined.split("\n") + line_count = len(lines) + duration_ms = max(1, c.end_ms - c.start_ms) + longest_line = max((len(ln) for ln in lines), default=0) + + # 1. Long total character count — overflow risk. + if char_count > 120: + score += (char_count - 120) / 8.0 + tags.append(f"chars={char_count}") + elif char_count > 100: + score += (char_count - 100) / 12.0 + + # 2. A single line too long for typical subtitle width. + if longest_line > 80: + score += (longest_line - 80) / 4.0 + tags.append(f"long-line={longest_line}") + + # 3. Many lines (likely a multi-cue merger). + if line_count > 3: + score += (line_count - 3) * 6.0 + tags.append(f"lines={line_count}") + + # 4. Long duration — possible hold-for-reading artifact bleeding into next beat. + if duration_ms > 8000: + score += min(8.0, (duration_ms - 8000) / 1000.0) + tags.append(f"long-dur={duration_ms}ms") + + # 5. Very short duration — fast dialogue, alignment may be wrong. + if duration_ms < 700 and char_count > 12: + score += 4.0 + tags.append(f"short-dur={duration_ms}ms") + + # 6. High character-density (chars/sec) — too much text to read in time. 
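+    # Hypothetical: 90 chars shown for 2.0 s -> 45 c/s -> score += (45-30)/6 = 2.5.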
+ char_density = char_count / (duration_ms / 1000.0) + if char_density > 30: + score += (char_density - 30) / 6.0 + tags.append(f"dense={char_density:.0f}c/s") + + return score, tags + + +def select_spotcheck_cues(cues: list[TriCue], top_n: int = 12, + cluster_extra: int = 0) -> list[tuple[TriCue, list[str]]]: + """Pick the N most likely-suspect cues for visual review. + + Also flags fast-dialogue clusters (cues with small gaps to neighbors) and + optionally includes `cluster_extra` cues from the densest cluster to give + the reviewer a contextual run of frames.""" + scored: list[tuple[float, list[str], TriCue]] = [] + by_index = {c.index: c for c in cues} + indexes = sorted(by_index) + + # Add per-cue scores from heuristics. + for c in cues: + s, tags = _score_cue_for_spotcheck(c) + scored.append((s, tags, c)) + + # Add cluster signal: cue is "in a fast cluster" when both neighbors are + # closer than 400 ms — alignment errors love this regime. + for i, idx in enumerate(indexes): + c = by_index[idx] + prev_c = by_index[indexes[i - 1]] if i > 0 else None + next_c = by_index[indexes[i + 1]] if i + 1 < len(indexes) else None + gap_prev = c.start_ms - prev_c.end_ms if prev_c else None + gap_next = next_c.start_ms - c.end_ms if next_c else None + if (gap_prev is not None and gap_prev < 400) and (gap_next is not None and gap_next < 400): + # Bump score and append a single fast-cluster tag for this cue. + for k, (s, tags, sc) in enumerate(scored): + if sc.index == c.index: + scored[k] = (s + 2.5, tags + ["fast-cluster"], sc) + break + + # Sort by score descending, drop zero-score cues. + scored.sort(key=lambda t: -t[0]) + picked: list[tuple[TriCue, list[str]]] = [] + seen: set[int] = set() + for s, tags, c in scored: + if s <= 0: + break + if c.index in seen: + continue + picked.append((c, tags)) + seen.add(c.index) + if len(picked) >= top_n: + break + return picked + + +def _ffmpeg_subtitle_path(sub_path: Path) -> str: + """Format a subtitle path for ffmpeg's `subtitles=` filter on Windows. + + Backslashes become forward; the drive-colon gets escaped. Paths with + spaces must be quoted at the filter level.""" + p = str(sub_path.resolve()).replace("\\", "/") + # Escape the drive-letter colon so ffmpeg doesn't interpret it as a filter option. + if len(p) >= 2 and p[1] == ":": + p = p[0] + r"\:" + p[2:] + # Escape any remaining colons (rare). + return p + + +def extract_frame_with_subs(video: Path, sub_path: Path, cue: TriCue, + out_dir: Path) -> Path | None: + """Use ffmpeg to grab a single frame at the cue's mid-time with subtitles burned in. + + Returns the .jpg Path on success, None on failure.""" + if not video.exists(): + return None + mid_sec = (cue.start_ms + cue.end_ms) / 2000.0 + out = out_dir / f"frame_{cue.index:04d}.jpg" + sub_arg = _ffmpeg_subtitle_path(sub_path) + # Fast input-seek to ~5 s before the cue, then output-seek to the cue mid-time. + # -copyts preserves the original stream PTS so libass (inside the subtitles + # filter) sees the real time and renders the right cue. Without -copyts, + # input -ss zeros out PTS and the filter shows nothing. We also need -update 1 + # so ffmpeg writes a single image without complaining about missing %d patterns. 
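+    # Hypothetical timing: a cue at 10.0–12.0 s gives mid_sec 11.0; we input-seek
+    # to 6.0 s (fast, keyframe-aligned) and output-seek precisely to 11.0 s.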
+    pre_sec = max(0.0, mid_sec - 5.0)
+    cmd = [
+        "ffmpeg", "-y", "-loglevel", "error", "-copyts",
+        "-ss", f"{pre_sec:.3f}", "-i", str(video),
+        "-ss", f"{mid_sec:.3f}",
+        "-vf", f"subtitles='{sub_arg}'",
+        "-frames:v", "1", "-q:v", "3", "-update", "1",
+        str(out),
+    ]
+    try:
+        result = subprocess.run(cmd, capture_output=True, text=True, timeout=60)
+    except subprocess.SubprocessError as e:
+        print(f"warn: ffmpeg failed for cue {cue.index}: {e}", file=sys.stderr)
+        return None
+    if result.returncode != 0:
+        # Subtitles filter failed; fall back to a frame without subtitles.
+        cmd_fallback = [
+            "ffmpeg", "-y", "-loglevel", "error",
+            "-ss", f"{mid_sec:.3f}", "-i", str(video),
+            "-frames:v", "1", "-q:v", "3",
+            str(out),
+        ]
+        try:
+            r2 = subprocess.run(cmd_fallback, capture_output=True, text=True, timeout=60)
+            if r2.returncode != 0:
+                print(f"warn: ffmpeg fallback also failed for cue {cue.index}: "
+                      f"{result.stderr[:200]}", file=sys.stderr)
+                return None
+        except subprocess.SubprocessError as e:
+            print(f"warn: ffmpeg fallback exception for cue {cue.index}: {e}", file=sys.stderr)
+            return None
+    return out if out.exists() else None
+
+
+def write_spotcheck_html(picks: list[tuple[TriCue, list[str], Path | None]],
+                         html_path: Path, film_title: str) -> None:
+    """Render an HTML index of the picked cues with frames + metadata.
+
+    `picks` is a list of (cue, tags, frame_path). frame_path may be None
+    if extraction failed."""
+    # Use absolute file:// URLs for the images so the HTML works regardless of
+    # where it's opened from. Browsers gate this; on Windows it tends to Just Work.
+    rows: list[str] = []
+    for cue, tags, frame in picks:
+        ts = format_ts(cue.start_ms).replace(",", ".")
+        te = format_ts(cue.end_ms).replace(",", ".")
+        dur_ms = cue.end_ms - cue.start_ms
+        joined_html = html_lib.escape(cue.joined()).replace("\n", "<br>")
+        tag_html = " ".join(f'<span class="tag">{html_lib.escape(t)}</span>' for t in tags)
+        if frame and frame.exists():
+            img_url = "file:///" + str(frame.resolve()).replace("\\", "/")
+            img_html = f'<img src="{img_url}" alt="cue {cue.index}">'
+        else:
+            img_html = '<div class="noframe">[ffmpeg failed]</div>'
+        rows.append(f"""
+<div class="card">
+  <div class="frame">{img_html}</div>
+  <div class="meta">
+    <div class="when">#{cue.index} · {ts} → {te} · {dur_ms}ms</div>
+    <div class="tags">{tag_html}</div>
+    <div class="text">{joined_html}</div>
+  </div>
+</div>
+""")
+
+    title = html_lib.escape(film_title or "spotcheck")
+    html = f"""<!DOCTYPE html>
+<html>
+<head>
+<meta charset="utf-8">
+<title>spotcheck — {title}</title>
+<style>
+  body {{ font-family: sans-serif; background: #111; color: #ddd; margin: 1.5em; }}
+  .grid {{ display: flex; flex-wrap: wrap; gap: 1em; }}
+  .card {{ width: 480px; border: 1px solid #444; padding: .5em; }}
+  .card img {{ width: 100%; }}
+  .tag {{ background: #333; padding: 0 .4em; margin-right: .3em; font-size: .85em; }}
+  .noframe {{ color: #c66; padding: 2em 0; text-align: center; }}
+</style>
+</head>
+<body>
+<h1>spotcheck — {title}</h1>
+<p>{len(picks)} cue(s) flagged. Each card shows the frame at the cue's mid-time with the
+trilingual subtitle burned in, plus the cue's text, timing, and why Python flagged it.</p>
+<div class="grid">
+  {"".join(rows)}
+</div>
+</body>
+</html>
+ + +""" + html_path.parent.mkdir(parents=True, exist_ok=True) + html_path.write_text(html, encoding="utf-8") + + +def run_spotcheck(merged: list[TriCue], out_subtitle: Path, video: Path, + top_n: int = 12) -> None: + """Top-level spotcheck workflow. Picks suspect cues, extracts frames, + writes an HTML index next to the subtitle output.""" + picks = select_spotcheck_cues(merged, top_n=top_n) + if not picks: + print("spotcheck: no cues scored above threshold; nothing to review") + return + spotcheck_dir = out_subtitle.parent / f"{out_subtitle.stem}.spotcheck" + spotcheck_dir.mkdir(parents=True, exist_ok=True) + print(f"spotcheck: {len(picks)} cue(s) picked; extracting frames...") + with_frames: list[tuple[TriCue, list[str], Path | None]] = [] + for cue, tags in picks: + frame = extract_frame_with_subs(video, out_subtitle, cue, spotcheck_dir) + with_frames.append((cue, tags, frame)) + html_path = spotcheck_dir / "index.html" + write_spotcheck_html(with_frames, html_path, video.stem) + print(f"spotcheck: wrote {html_path}") + print(f" open with: start {html_path}") + + # ---------- validation ---------- def validate(cues: list[TriCue]) -> None: @@ -748,8 +2215,8 @@ def cmd_inspect(mkv: Path, names_arg: Path | None) -> None: else: print("names: no IMDb tag in filename; pass --names if you have a names file") else: - nm, em = load_names(names_path) - print(f"names: {names_path} ({len(nm)} hanzi, {len(em)} english fixes)") + nm, em, gl = load_names(names_path) + print(f"names: {names_path} ({len(nm)} hanzi, {len(em)} english fixes, {len(gl)} glossary entries)") def main() -> None: @@ -775,6 +2242,36 @@ def main() -> None: ap.add_argument("--window-ms", type=int, default=1500, help="per-cue alignment tolerance (default 1500)") ap.add_argument("--no-bom", action="store_true", help="write output without UTF-8 BOM") + ap.add_argument("--enrich", action="store_true", + help="apply Phase-1 dictionary correction (CC-CEDICT-driven empty-cue fill); writes .changes.tsv") + ap.add_argument("--cedict", type=Path, default=None, + help="path to CC-CEDICT text file (default: Research/primary_sources/cedict/cedict_1_0_ts_utf-8_mdbg.txt)") + ap.add_argument("--llm-verify", action="store_true", + help="(Phase 2) run llama.cpp on the Arc A770 to re-translate dictionary-filled cues. Requires --enrich. Uses --llm-model / --llm-server (or env vars PINSUB_LLM_GGUF / PINSUB_LLAMA_SERVER).") + ap.add_argument("--llm-model", type=Path, default=None, + help="GGUF model file for --llm-verify (default: $PINSUB_LLM_GGUF env var)") + ap.add_argument("--llm-server", type=Path, default=None, + help="llama-server.exe path for --llm-verify (default: $PINSUB_LLAMA_SERVER env var)") + ap.add_argument("--llm-port", type=int, default=LLAMA_DEFAULT_PORT, + help=f"localhost port for llama-server (default {LLAMA_DEFAULT_PORT})") + ap.add_argument("--llm-ctx", type=int, default=4096, + help="llama-server --ctx-size (default 4096)") + ap.add_argument("--llm-max", type=int, default=0, + help="cap LLM pass at N cues (0 = no cap; useful for smoke tests)") + ap.add_argument("--llm-correct", action="store_true", + help="(Phase 3) broader translate-then-compare correction. Python heuristic prefilters; TIMMY translates fresh; a second TIMMY compares against existing English. 
Requires --enrich (for the cedict gloss used by the heuristic).") + ap.add_argument("--llm-scope", choices=("fills", "divergent", "all"), default="divergent", + help="--llm-correct scope: 'fills' (only Phase-1 empty fills), 'divergent' (Python flags suspect cues; default), 'all' (every cue with Chinese)") + ap.add_argument("--llm-no-compare", action="store_true", + help="with --llm-correct, skip the comparison pass (use TIMMY's translation outright)") + ap.add_argument("--llm-rounds", type=int, default=3, + help="max retry rounds for --llm-correct when a TIMMY translation fails Python validation (default 3)") + ap.add_argument("--llm-threads", type=int, default=4, + help="CPU thread cap for llama-server (default 4). Vulkan path spends most time on GPU; this caps CPU side. Lower if you need CPU headroom.") + ap.add_argument("--spotcheck", action="store_true", + help="after writing the trilingual output, run a visual spot-check: pick cues at risk of formatting/overflow/alignment issues, ffmpeg-extract a frame at each cue's mid-time with subtitles burned in, output an HTML grid for human review.") + ap.add_argument("--spotcheck-n", type=int, default=12, + help="number of cues to surface in the spotcheck grid (default 12)") args = ap.parse_args() if args.inspect: @@ -792,12 +2289,28 @@ def main() -> None: sys.exit(f"missing: {args.zh}") names_path = find_names_file(args.names, args.mkv) - name_map, english_name_map = load_names(names_path) + name_map, english_name_map, per_film_glossary = load_names(names_path) if names_path: - print(f"loaded names: {names_path.name} ({len(name_map)} hanzi, {len(english_name_map)} english fixes)") + print(f"loaded names: {names_path.name} ({len(name_map)} hanzi, {len(english_name_map)} english fixes, {len(per_film_glossary)} film-glossary entries)") else: print("no names file found — name capitalization and English Wade-Giles fixes will be skipped") + # Load global glossary (and merge per-film over it). Used by the LLM stages + # only — pinyin and dictionary stages don't need it. + global_glossary = load_glossary() + glossary = merge_glossaries(global_glossary, per_film_glossary) + if glossary: + print(f"loaded glossary: {len(global_glossary)} global + {len(per_film_glossary)} film-specific = {len(glossary)} active entries") + + # Load TIMMY system prompt from README_TIMMY.md if present, else fall back + # to the inline default. The prompt file is the canonical place to tune + # translation behavior, quirks, and policy without touching code. + timmy_system_prompt = load_timmy_prompt() or LLM_FRESH_SYSTEM_PROMPT + if (Path(__file__).parent / "README_TIMMY.md").exists(): + print(f"loaded TIMMY prompt: README_TIMMY.md ({len(timmy_system_prompt)} chars)") + else: + print("no README_TIMMY.md found — using inline fallback prompt") + work_dir = args.out.parent # 1. English source: provided, or extract from mkv. @@ -879,6 +2392,217 @@ def main() -> None: print(f"warn: could not load {alt_path.name}: {e}", file=sys.stderr) print(f" {len(merged)} merged cues") + # 5b. Phase-1 dictionary correction (if --enrich). Runs AFTER translations.json + # so manual per-cue overrides take precedence; only fills empty English. 
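+    # Rough shape of the per-cue fill, as a comment-only sketch (enrich_cues is
+    # the real implementation; it also applies name_map and logs changelog rows):
+    #
+    #     for cue in merged:
+    #         if cue.hanzi and not cue.english.strip():
+    #             segs = segment_greedy(cue.hanzi, cedict)
+    #             gloss = " ".join(g for g in (gloss_segment(s, cedict) for s in segs) if g)
+    #             if gloss:
+    #                 cue.english = gloss   # logged as a FILL row; never overwrites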
+ cedict: dict = {} + filled_indices: set[int] = set() + changelog_rows: list[tuple] = [] + changelog_path: Path | None = None + if args.enrich: + cedict_path = args.cedict if args.cedict else DEFAULT_CEDICT_PATH + print(f"loading dictionary: {cedict_path.name}...") + cedict = load_cedict(cedict_path) + if not cedict: + print("warn: dictionary empty/missing — skipping enrichment", file=sys.stderr) + else: + print(f" {len(cedict)} simplified-Hanzi keys") + changelog_path = args.out.with_suffix(args.out.suffix + ".changes.tsv") + merged, filled_indices, changelog_rows = enrich_cues( + merged, cedict, changelog_path, name_map=name_map, + ) + + # 5c. Phase-2 LLM verifier (if --llm-verify). Uses the local llama.cpp HTTP + # server on the Arc A770. Verifies the cues filled by Phase 1. + if args.llm_verify: + if not args.enrich: + sys.exit("--llm-verify requires --enrich") + server_exe = args.llm_server or Path(os.environ.get("PINSUB_LLAMA_SERVER", "")) + model_path = args.llm_model or Path(os.environ.get("PINSUB_LLM_GGUF", "")) + if not server_exe or str(server_exe) == ".": + sys.exit("--llm-verify needs --llm-server or PINSUB_LLAMA_SERVER env var") + if not model_path or str(model_path) == ".": + sys.exit("--llm-verify needs --llm-model or PINSUB_LLM_GGUF env var") + if not filled_indices: + print("llm verify: no cues were filled by Phase 1; nothing to verify") + else: + if args.llm_max and args.llm_max > 0: + # Smoke-test mode: cap the LLM verify pass at N cues. + indices_sorted = sorted(filled_indices)[: args.llm_max] + filled_indices = set(indices_sorted) + print(f"llm verify: --llm-max set, verifying first {len(filled_indices)} cue(s)") + + print("🟧 A770 USE: spawning llama-server on the Arc.") + print(f" model: {model_path.name} (~{model_path.stat().st_size / 1e9:.1f} GB on disk)") + print(f" est. VRAM peak: ~12 GB (10.5 GB model + ~1.5 GB KV @ ctx={args.llm_ctx})") + print(f" est. runtime: ~10 s per cue × {len(filled_indices)} cue(s) + ~30 s startup") + device = find_arc_vulkan_device(server_exe) + if device: + print(f" device: {device}") + else: + print(" device: (auto — could not enumerate; llama-server will pick)") + + session = start_llama_server( + server_exe, model_path, + port=args.llm_port, device=device, ctx_size=args.llm_ctx, + n_threads=args.llm_threads, + ) + + # Cache file is keyed per-film by IMDb id when present. + imdb_id = None + if args.mkv: + m = IMDB_RE.search(args.mkv.name) + if m: + imdb_id = m.group(1) + cache_path = (Path(__file__).parent / "Research" / "cache" + / f"{imdb_id or 'noimdb'}.llm.json") + cache = load_llm_cache(cache_path) + + replaced = 0 + success = False + try: + replaced = verify_cues_with_llm( + merged, session, filled_indices, name_map, cedict, cache, + model_name=model_path.name, changelog_rows=changelog_rows, + ) + print(f"llm verify: {replaced} cue(s) replaced from LLM output") + save_llm_cache(cache_path, cache) + success = True + finally: + stop_llama_server(session) + # A770 usage log entry — required by project rule. + a770_log = Path(__file__).parent / "Logs" / "A770_usage.md" + write_a770_usage_log( + a770_log, session, + job="PinSub --llm-verify", + cues_processed=len(filled_indices), + cues_replaced=replaced, + source_session_path=None, + success=success, + notes=f"film={imdb_id or args.mkv.name}; cache={cache_path.name}", + ) + + # Append the VERIFY rows to the existing FILL changelog. + if changelog_path is not None and changelog_rows: + # Rewrite the changelog with the merged rows. 
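+            # Each changelog row is (idx, action, hanzi, before, after), written
+            # tab-separated; tabs/newlines inside fields are flattened below.
+            # Illustrative (made-up) row:  217  VERIFY  他走了  he go  He's gone.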
+ try: + with changelog_path.open("w", encoding="utf-8", newline="") as f: + f.write("idx\taction\thanzi\tbefore\tafter\n") + for row in changelog_rows: + f.write("\t".join( + str(x).replace("\t", " ").replace("\n", " | ") for x in row + ) + "\n") + except OSError as e: + print(f"warn: could not rewrite changelog {changelog_path}: {e}", file=sys.stderr) + + # 5d. Phase-3 LLM-correct (if --llm-correct). Broader translate-then-compare: + # Python heuristic prefilters suspect cues; TIMMY translates each from + # scratch (no existing English in the prompt); a separate TIMMY compares + # the two candidates per cue. Reuses the llama-server pattern. + if args.llm_correct: + if not args.enrich: + sys.exit("--llm-correct requires --enrich (cedict gloss feeds the divergence heuristic)") + if not cedict: + sys.exit("--llm-correct requires a loaded CC-CEDICT") + server_exe = args.llm_server or Path(os.environ.get("PINSUB_LLAMA_SERVER", "")) + model_path = args.llm_model or Path(os.environ.get("PINSUB_LLM_GGUF", "")) + if not server_exe or str(server_exe) == ".": + sys.exit("--llm-correct needs --llm-server or PINSUB_LLAMA_SERVER env var") + if not model_path or str(model_path) == ".": + sys.exit("--llm-correct needs --llm-model or PINSUB_LLM_GGUF env var") + + # Pre-survey: how many cues will the Python heuristic select before we + # spawn the server? Saves an A770 startup if the answer is zero. + survey_counts: dict[str, int] = {} + candidate_count = 0 + for c in merged: + if not c.hanzi: + continue + suspect, reason = detect_suspect_english(c, cedict, name_map) + survey_counts[reason] = survey_counts.get(reason, 0) + 1 + if args.llm_scope == "fills": + if not c.english.strip(): + candidate_count += 1 + elif args.llm_scope == "all": + candidate_count += 1 + elif suspect: + candidate_count += 1 + print(f"llm correct: pre-survey — {candidate_count} candidate cue(s) (scope={args.llm_scope})") + top = sorted(survey_counts.items(), key=lambda kv: -kv[1])[:6] + print(f" divergence histogram (top 6): {dict(top)}") + if args.llm_max and args.llm_max > 0: + candidate_count = min(candidate_count, args.llm_max) + print(f" (capping at --llm-max={args.llm_max})") + + if candidate_count == 0: + print("llm correct: no cues to process; skipping") + else: + print("🟧 A770 USE: spawning llama-server on the Arc.") + print(f" model: {model_path.name} (~{model_path.stat().st_size / 1e9:.1f} GB on disk)") + print(f" est. VRAM peak: ~12 GB (10.5 GB model + ~1.5 GB KV @ ctx={args.llm_ctx})") + calls_per_cue = 2 if not args.llm_no_compare else 1 + print(f" est. 
runtime: ~{candidate_count * calls_per_cue * 8} s " + f"({candidate_count} cue(s) × {calls_per_cue} call(s) × ~8 s)") + device = find_arc_vulkan_device(server_exe) + if device: + print(f" device: {device}") + else: + print(" device: (auto)") + + session = start_llama_server( + server_exe, model_path, + port=args.llm_port, device=device, ctx_size=args.llm_ctx, + n_threads=args.llm_threads, + ) + + imdb_id = None + if args.mkv: + m = IMDB_RE.search(args.mkv.name) + if m: + imdb_id = m.group(1) + cache_path = (Path(__file__).parent / "Research" / "cache" + / f"{imdb_id or 'noimdb'}.correct.json") + + replaced = 0 + counts: dict[str, int] = {} + success = False + try: + replaced, counts = correct_cues_with_llm( + merged, session, cedict, name_map, + glossary=glossary, + system_prompt=timmy_system_prompt, + mode=args.llm_scope, do_compare=not args.llm_no_compare, + max_cues=args.llm_max, max_rounds=args.llm_rounds, + cache_path=cache_path, + changelog_rows=changelog_rows, + ) + print(f"llm correct: {replaced} cue(s) replaced; action counts: {counts}") + success = True + finally: + stop_llama_server(session) + a770_log = Path(__file__).parent / "Logs" / "A770_usage.md" + write_a770_usage_log( + a770_log, session, + job=f"PinSub --llm-correct --llm-scope={args.llm_scope}" + + ("" if not args.llm_no_compare else " --llm-no-compare"), + cues_processed=candidate_count, + cues_replaced=replaced, + source_session_path=None, + success=success, + notes=f"film={imdb_id or args.mkv.name}; counts={counts}", + ) + + # Rewrite the changelog including new CORRECT rows. + if changelog_path is not None and changelog_rows: + try: + with changelog_path.open("w", encoding="utf-8", newline="") as f: + f.write("idx\taction\thanzi\tbefore\tafter\n") + for row in changelog_rows: + f.write("\t".join( + str(x).replace("\t", " ").replace("\n", " | ") for x in row + ) + "\n") + except OSError as e: + print(f"warn: could not rewrite changelog {changelog_path}: {e}", file=sys.stderr) + # 6. Validate + write. Output format inferred from --out extension. validate(merged) ext = args.out.suffix.lower() @@ -890,6 +2614,10 @@ def main() -> None: sys.exit(f"unsupported output extension '{ext}'. Use .ass (recommended) or .srt.") print(f"wrote {args.out}") + # 7. Optional visual spot-check (path A — no VLM required). + if args.spotcheck: + run_spotcheck(merged, args.out, args.mkv, top_n=args.spotcheck_n) + if __name__ == "__main__": main() diff --git a/README_TIMMY.md b/README_TIMMY.md new file mode 100644 index 0000000..2955408 --- /dev/null +++ b/README_TIMMY.md @@ -0,0 +1,71 @@ +You are TIMMY, Qx's local Chinese-to-English subtitle translator for the PinSub pipeline. + +Your one job per call: produce ONE short English subtitle line that faithfully renders a single Chinese subtitle cue. Python orchestrates everything else. You are the worker; Python is the brain. + +## Big-picture context + +- The Chinese subtitle is the **source of truth** for the film. The existing English on Bluray rips and torrents is often shifted across cue boundaries or just wrong; treat any English already in the prompt as an untrusted hint. +- Your output is shown stacked on screen below the Hanzi and pinyin rows, on the same timestamp. The viewer is a Mandarin learner who wants to map Chinese words to English words as they read. +- That goal — word-to-word mappability — is more important than fluency. Stilted-but-faithful beats fluent-but-paraphrased. + +## Translation philosophy + +1. 
**Word-for-word fidelity on content.** Every noun, verb, named entity, and modifier in the Chinese gets an English equivalent. Don't OMIT content the Chinese expresses; don't INVENT content the Chinese doesn't express. +2. **Function-word latitude.** Chinese particles, classifiers, possessive markers (的, 了, 着, 啊, 啦, 个, 条 …) often have no English equivalent — drop them. English needs articles, copulas, sometimes pronouns that Chinese implies — add them. Follow each language's natural grammar for function words only. +3. **Synonym latitude on word choice.** "Spear" / "lance" / "polearm" can all be right for 枪 in a wuxia film. Pick the one that reads cleanly in English while staying faithful to the Chinese. +4. **Preserve Chinese word order when grammatically tolerable in English.** Only rearrange when the literal order is genuinely incomprehensible. +5. **Conciseness.** Subtitles must fit on screen. Use the shortest natural English phrasing that captures the Chinese content. + +## Hard rules + +- **No Chinese characters in your English output, ever.** If you can't translate a term, output `[?]` so Python can flag it for human review. Do not pass Hanzi through. +- **One English line.** No newlines unless the Chinese itself has a newline mid-cue. +- **No quotes around your output.** No `"like this"`. +- **No labels.** Do not prefix `English:`, `Translation:`, `Subtitle:`, etc. +- **No explanation.** No "this translates as..." or "in this context...". Just the English line. +- **Append `/no_think` to your reply if you would otherwise produce reasoning.** If you reason internally, your `content` field comes back empty and Python ignores everything. + +## Chinese names (the generic rule) + +Chinese names — people, places, sects, weapons, dynasties — are rendered in the English row as **bare pinyin with tone marks stripped, syllable spacing preserved, capitals on the first letter of each name part.** Same syllable structure as the pinyin row, just without the diacritics. + +Examples: + +| Chinese | Pinyin row | English row | +|---|---|---| +| 李慕白 | Lǐ Mùbái | Li Mubai | +| 俞秀莲 | Yú Xiùlián | Yu Xiulian | +| 武当 | Wǔdāng | Wudang | +| 青冥剑 | Qīng Míng Jiàn | Qingming Sword | + +If the prompt includes a `Known names:` hint for this cue, **use that English form verbatim** — Python has already applied the rule and may have a film-specific spelling that overrides the default. The hint is canonical for this cue. + +If no `Known names:` hint is given, generate the English name yourself using the rule above. Use the `Pinyin row:` hint in the prompt to get the syllable structure right. + +## Glossary hints + +If the prompt includes a `Glossary:` section, those are film-context or wuxia-context translations PinSub has learned matter. Use the suggested English in your output unless it would make the cue grammatically wrong. The glossary captures cases where the literal Chinese-English dictionary is misleading: + +- `枪` in a wuxia film = **spear**, not gun. +- `师娘` = **Master's wife** (wife of one's martial arts teacher), not "Madam Teacher." +- `镖局` = **security agency** (Qing-era courier/escort outfit), not just "agency." + +You also see `Word-for-word target:` in some prompts — that's the literal dictionary gloss Python built. It's stilted but it's the structural skeleton. Match its content; smooth its English. + +## Quirks you have done before — stop doing them + +Python detects these in your output and will re-ask you with explicit feedback. Avoid them on round 1: + +1. 
**You sometimes leave a Hanzi character untranslated in your English** (e.g., output `"Yes! How's the 镖局 doing business?"`). NEVER leave Hanzi in the English row. If a term is in the `Glossary:` hint, use that. If not, attempt your best English equivalent. If you genuinely don't know, output `[?]` for that term so Python can flag it. + +2. **You sometimes smush or respace names** (e.g., `Xiulian` when the `Known names:` hint said `Xiu Lian`). Use the exact spacing from the `Known names:` hint character-for-character. If no hint, follow the rule above (capitals on each syllable, no space between syllables of the same name part, space between separate name parts: surname `Li` then given-name `Mubai` = `Li Mubai`). + +3. **You sometimes translate a polysemous Hanzi by its most common dictionary sense rather than its film-context sense** (`枪` → "gun" in a Qing-era wuxia film where it should be "spear"). The `Glossary:` hints exist to prevent this. If the cue's Hanzi contains a glossary key, use the glossary's English. + +4. **You sometimes output reasoning before the answer.** Qwen3 puts thinking in a separate `reasoning_content` field which Python discards. Always include `/no_think` in your reply and produce ONLY the English line. + +5. **You sometimes wrap the answer in quotes or add lead-in labels** (`"Crouching Tiger, Hidden Dragon"` or `English: Crouching Tiger, Hidden Dragon`). Python strips these defensively, but it's cleaner if you don't add them. + +## Output format + +A single English subtitle line. Nothing else. No quotes, no labels, no explanation, no thinking. /no_think diff --git a/glossary.json b/glossary.json new file mode 100644 index 0000000..2c929f5 --- /dev/null +++ b/glossary.json @@ -0,0 +1,112 @@ +{ + "_help": "Global Chinese→English translation hints for PinSub. PinSub loads this file at startup, scans each cue's Hanzi for any key present here, and passes matching entries to TIMMY as 'Glossary:' hints. Add an entry when TIMMY translates a term wrong; the next run honors the lesson. Per-film overrides live under the 'glossary' key in names/.json. 
Entries with underscore-prefixed keys are ignored.",
+
+  "_format": {
+    "<hanzi>": {
+      "english": "the preferred English translation",
+      "context": "one-line note on when/why this matters (audience-facing rationale)",
+      "tags": ["wuxia", "qing-era", "kungfu", "..."]
+    }
+  },
+
+  "枪": {
+    "english": "spear",
+    "context": "in wuxia / Qing-era films this is the long-handled bladed weapon, NOT a firearm",
+    "tags": ["wuxia", "weapon"]
+  },
+  "师娘": {
+    "english": "Master's wife",
+    "context": "wife of one's martial-arts teacher; respectful address in wuxia",
+    "tags": ["wuxia", "kungfu", "kinship"]
+  },
+  "镖局": {
+    "english": "security agency",
+    "context": "Qing-era courier/escort outfit that delivered valuables under armed protection",
+    "tags": ["wuxia", "qing-era", "occupation"]
+  },
+  "镖师": {
+    "english": "escort guard",
+    "context": "a fighter employed by a 镖局",
+    "tags": ["wuxia", "occupation"]
+  },
+  "师父": {
+    "english": "Master",
+    "context": "respectful address for one's martial-arts teacher; addressed in second person",
+    "tags": ["wuxia", "kungfu", "address"]
+  },
+  "师傅": {
+    "english": "Master",
+    "context": "respectful address for a skilled craftsman or teacher; often interchangeable with 师父 in dubbing",
+    "tags": ["wuxia", "kungfu", "address"]
+  },
+  "弟子": {
+    "english": "disciple",
+    "context": "student of a martial-arts master",
+    "tags": ["wuxia", "kungfu", "kinship"]
+  },
+  "侠": {
+    "english": "swordsman",
+    "context": "wuxia / martial-arts hero — broader than 'knight'; 'warrior' is acceptable",
+    "tags": ["wuxia"]
+  },
+  "江湖": {
+    "english": "the martial world",
+    "context": "literally 'rivers and lakes' — the underground world of itinerant fighters / outlaws / sects in wuxia",
+    "tags": ["wuxia"]
+  },
+  "门派": {
+    "english": "sect",
+    "context": "martial-arts school or lineage",
+    "tags": ["wuxia", "kungfu"]
+  },
+  "武林": {
+    "english": "the martial-arts community",
+    "context": "the collective world of martial-arts practitioners and sects",
+    "tags": ["wuxia"]
+  },
+  "功夫": {
+    "english": "kung fu",
+    "context": "the standard romanization; preserve as 'kung fu' rather than 'gongfu' for English-speaking audiences",
+    "tags": ["wuxia", "kungfu"]
+  },
+  "内功": {
+    "english": "internal energy",
+    "context": "Qi-based cultivation skill in wuxia",
+    "tags": ["wuxia"]
+  },
+  "气功": {
+    "english": "qigong",
+    "context": "energy cultivation; preserve as 'qigong' (the standard English loan)",
+    "tags": ["wuxia", "kungfu"]
+  },
+  "闭关": {
+    "english": "go into seclusion",
+    "context": "withdraw to meditate / cultivate in isolation; a specifically wuxia term",
+    "tags": ["wuxia", "practice"]
+  },
+  "修练": {
+    "english": "practice",
+    "context": "ongoing self-cultivation of skill; less literal than 'cultivate'",
+    "tags": ["wuxia"]
+  },
+  "真人": {
+    "english": "Zhenren",
+    "context": "honorific for a Taoist master / accomplished cultivator; preserve as 'Zhenren' or render contextually as 'Master'",
+    "tags": ["wuxia", "address"]
+  },
+  "护法": {
+    "english": "guardian",
+    "context": "in wuxia, a temple/sect protector",
+    "tags": ["wuxia"]
+  },
+  "拜": {
+    "english": "pay respect",
+    "context": "ceremonial bow / formal greeting in wuxia; rarely 'worship' unless in a religious context",
+    "tags": ["wuxia"]
+  },
+  "拜师": {
+    "english": "take as Master",
+    "context": "the ceremony where a student formally enters a master's tutelage",
+    "tags": ["wuxia", "kungfu"]
+  }
+}
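
A minimal sketch (not part of the patch) of how a cue's Hanzi could be scanned against this glossary to build the `Glossary:` hint described in `_help`. PinSub's real prompt-builder is not shown in this diff, so `build_glossary_hint` and the sample cue are illustrative only; the script assumes glossary.json sits in the working directory:

    import json
    from pathlib import Path

    def build_glossary_hint(cue_hanzi: str, glossary: dict[str, dict]) -> str:
        """Substring-scan the cue's Hanzi for glossary keys; emit a 'Glossary:' hint block."""
        hits = {k: v for k, v in glossary.items() if k in cue_hanzi}
        if not hits:
            return ""
        lines = [f"- {k} = {v['english']} ({v['context']})" for k, v in hits.items()]
        return "Glossary:\n" + "\n".join(lines)

    data = json.loads(Path("glossary.json").read_text(encoding="utf-8"))
    glossary = {k: v for k, v in data.items() if not k.startswith("_")}
    print(build_glossary_hint("镖局的生意如何？", glossary))
    # Glossary:
    # - 镖局 = security agency (Qing-era courier/escort outfit that delivered valuables under armed protection)

Substring matching (rather than segmented lookup) is the behavior `_help` describes: any key appearing anywhere in the cue's Hanzi triggers the hint, so multi-character keys like 镖局 match regardless of how the cue would segment.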