From cdbabe6280cdd8b507137cb960b9a1cd25cf9af8 Mon Sep 17 00:00:00 2001
From: Boladi <151992391+Boladi888@users.noreply.github.com>
Date: Sun, 3 May 2026 23:35:37 -0400
Subject: [PATCH 1/6] Add Phase-1 dictionary correction (--enrich flag)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CC-CEDICT-driven empty-cue fill. The Chinese subtitles are the source of
truth; this stage fills cues where the merged trilingual result has no
English line, building a literal gloss from CC-CEDICT entries via
greedy-longest-match segmentation. Conservative: never overwrites existing
English (LLM verifier territory). Every fill is logged to <out>.changes.tsv
as a side channel for review.

- load_cedict() parses the standard CC-CEDICT format into
  {simplified: [(pinyin, [defs])]}.
- segment_greedy() does left-to-right longest-match with a 12-char window.
- gloss_hanzi() respects name_map: known proper nouns use the bare-pinyin
  form (e.g. 慕白 -> "Mubai") instead of the literal CC-CEDICT gloss
  (which would have been "to admire white").
- clean_gloss() strips editorial annotations: leading parentheticals
  ("(literary)", "(courteous, as opposed to ...)"), bracketed pinyin
  ("[ni3]"), inner annotations, "(CL:...)" classifier hints, and
  "see also X" cross-references.
- Particle overrides drop sentence-ending modal particles (啦, 啊, 呀, 哦,
  嘛, 呢, 吧, etc.) entirely; they convey no English-row content.
- Imperative overrides emit the bare command form for common dialogue verbs
  (停 -> "Stop", 来 -> "Come", 让开 -> "Move aside") instead of the
  dictionary's "to X" infinitive.

CLI: --enrich (off by default), --cedict <path>. Default cedict path is
Research/primary_sources/cedict/cedict_1_0_ts_utf-8_mdbg.txt.

CTHD test run: 42 of 44 empty cues filled. The 2 skips are correct no-ops
(a pure-particle cue and an English-only wanted-poster cue).
---
 PinSub.py | 326 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 326 insertions(+)

diff --git a/PinSub.py b/PinSub.py
index 823d2c6..745df09 100644
--- a/PinSub.py
+++ b/PinSub.py
@@ -717,6 +717,315 @@ def expand(parts: list[str]) -> list[str]:
     return out
 
 
+# ---------- dictionary correction (Phase 1: empty-cue fill from CC-CEDICT) ----------
+#
+# The Chinese subtitles are the source of truth; the existing English row is an
+# imperfect translation, sometimes missing entirely. Phase 1 is conservative:
+# we only fill cues where the merged result has no English line. Existing
+# English is never overwritten in Phase 1 — that's the LLM verifier's job
+# (Phase 2, gated on Arc A770 use per project rule).
+#
+# Every change is logged to <out>.changes.tsv as a side channel so the manual
+# review pass can audit what the automation did.
+
+DEFAULT_CEDICT_PATH = Path(__file__).parent / "Research" / "primary_sources" / "cedict" / "cedict_1_0_ts_utf-8_mdbg.txt"
+
+CEDICT_LINE_RE = re.compile(r"^(\S+)\s+(\S+)\s+\[([^\]]+)\]\s+/(.+)/\s*$")
+
+
+def load_cedict(path: Path) -> dict[str, list[tuple[str, list[str]]]]:
+    """Parse CC-CEDICT into {simplified: [(pinyin, [defs]), ...]}.
+
+    Multi-entry keys happen when the same simplified form has different pinyin
+    readings; we keep them all and pick at lookup time. Definitions inside an
+    entry are split on '/'; CC-CEDICT often has multiple synonymous glosses.
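+
+    A representative CC-CEDICT line (traditional, simplified, [pinyin], /defs/):
+        中國 中国 [Zhong1 guo2] /China/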
+ """ + if not path or not path.exists(): + return {} + out: dict[str, list[tuple[str, list[str]]]] = {} + try: + with path.open(encoding="utf-8") as f: + for line in f: + if not line or line.startswith("#"): + continue + m = CEDICT_LINE_RE.match(line) + if not m: + continue + _trad, simp, pinyin, defs_str = m.groups() + defs = [d for d in defs_str.split("/") if d] + out.setdefault(simp, []).append((pinyin, defs)) + except OSError as e: + print(f"warn: could not load cedict {path}: {e}", file=sys.stderr) + return {} + return out + + +def segment_greedy(text: str, cedict: dict, max_len: int = 12) -> list[str]: + """Left-to-right longest-match segmentation against CC-CEDICT keys. + + Hanzi runs are segmented by longest-match; non-Hanzi (punctuation, ASCII, + spaces) pass through one character at a time. max_len caps the lookup + window — most CC-CEDICT entries are <=8 chars; a generous 12 covers all + practical cases without quadratic blowup. + """ + if not text: + return [] + if not cedict: + return list(text) + out: list[str] = [] + i = 0 + n = len(text) + while i < n: + ch = text[i] + if HAN_RE.match(ch): + matched = False + for length in range(min(max_len, n - i), 0, -1): + cand = text[i:i + length] + if cand in cedict: + out.append(cand) + i += length + matched = True + break + if not matched: + # Single-char Hanzi without a multi-char hit; emit it (may still be in cedict as a single-char entry). + out.append(ch) + i += 1 + else: + out.append(ch) + i += 1 + return out + + +def gloss_segment(seg: str, cedict: dict) -> str | None: + """Return the first definition for a Hanzi segment, or None if unmappable. + + For multi-entry segments (e.g., 行 has multiple pinyin readings) the + rough heuristic is "pick the entry with the most definitions" — that + correlates loosely with "most common reading." Phase 2 LLM verifier + will pick smarter; Phase 1 just needs a plausible default. + """ + if seg not in cedict: + return None + entries = cedict[seg] + entries_sorted = sorted(entries, key=lambda e: -len(e[1])) + pinyin, defs = entries_sorted[0] + if not defs: + return None + return defs[0] + + +# Some CC-CEDICT definitions carry editorial annotations like "(slang)", +# "(Tw)", "(literary)", "(courteous, as opposed to informal 你[ni3])", or +# "see X". For a learner subtitle line we want the essential gloss, not +# the metadata. Strip the most common patterns aggressively. +GLOSS_LEADING_PAREN_RE = re.compile(r"^\(([A-Za-z][^)]{0,80})\)\s*") +GLOSS_PINYIN_BRACKETS_RE = re.compile(r"\[[a-zA-Z0-9 ]+\]") +GLOSS_INNER_ANNOTATION_RE = re.compile(r"\s*\((?:lit\.|fig\.|abbr\.|coll\.|slang|literary|formal|courteous|informal|archaic|dialect|usu\.|esp\.|see also|see|CL:[^)]+|as opposed to[^)]*|equivalent to[^)]*|same as[^)]*|written form of[^)]*|variant of[^)]*|usually|especially|by extension|extended meaning|of [^)]*|sound of [^)]*|interjection [^)]*)[^)]*\)") +GLOSS_SEE_ALSO_RE = re.compile(r"\s*;?\s*see\s+(also\s+)?\S+", flags=re.I) +GLOSS_CL_RE = re.compile(r"\s*\(CL:[^)]+\)") + +# Sentence-ending particles common in dialogue. Map to empty string so they +# don't pollute the gloss; punctuation already conveys the sentence end. 
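+# (e.g. 好啊 glosses as just "good"; the 啊 only adds spoken mood that the
+# cue's own punctuation already carries in English.)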
+PARTICLE_OVERRIDES = {
+    "啦": "",
+    "啊": "",
+    "呀": "",
+    "哦": "",
+    "哎": "",
+    "嗨": "",
+    "嘛": "",
+    "呢": "",
+    "吧": "",
+    "嗯": "",
+    "唉": "",
+    "哟": "",
+    "嘿": "",
+    "诶": "",
+    "了": "",  # aspect particle "le"; rarely useful as standalone gloss
+    "的": "",  # possessive/genitive marker
+    "得": "",  # complement marker
+    "地": "",  # adverbial marker
+    "着": "",  # progressive aspect marker
+    "过": "",  # experiential aspect marker
+    "把": "",  # disposal marker
+}
+
+# Common verbs that are nearly always imperatives in subtitle dialogue.
+# CC-CEDICT lists them as "to X" (infinitive); rewrite to the bare command
+# form, which is the right reading for these words in almost every cue.
+IMPERATIVE_LEMMA_OVERRIDES = {
+    "停": "Stop",
+    "走": "Go",
+    "来": "Come",
+    "去": "Go",
+    "等": "Wait",
+    "看": "Look",
+    "听": "Listen",
+    "请": "Please",
+    "起": "Get up",
+    "坐": "Sit",
+    "进": "Enter",
+    "出": "Get out",
+    "让开": "Move aside",
+    "小心": "Be careful",
+    "别动": "Don't move",
+    "不要": "Don't",
+    "快点": "Hurry",
+    "快": "Hurry",
+}
+
+
+def clean_gloss(gloss: str) -> str:
+    """Trim a CC-CEDICT definition for inline display."""
+    g = gloss.strip()
+    # Drop a leading parenthetical annotation if present (may be quite long).
+    while True:
+        m = GLOSS_LEADING_PAREN_RE.match(g)
+        if not m:
+            break
+        g = g[m.end():].strip()
+    # Drop bracketed pinyin like [ni3] anywhere in the text.
+    g = GLOSS_PINYIN_BRACKETS_RE.sub("", g)
+    # Drop inner editorial annotations enclosed in parens.
+    g = GLOSS_INNER_ANNOTATION_RE.sub("", g)
+    # Drop any remaining "(CL:…)" classifier hints.
+    g = GLOSS_CL_RE.sub("", g)
+    # Drop trailing "see also X" / "see X" cross-references.
+    g = GLOSS_SEE_ALSO_RE.sub("", g)
+    # Tidy spacing.
+    g = re.sub(r"\s+", " ", g).strip()
+    g = re.sub(r"\s+([,.!?;:])", r"\1", g)
+    # If the gloss reduces to ";" or "," fragments, normalize.
+    g = g.strip(" ;,")
+    return g
+
+
+def gloss_hanzi(hanzi: str, cedict: dict, name_map: dict[str, str] | None = None) -> str:
+    """Build a literal English gloss from a Hanzi line.
+
+    Newlines in the Hanzi are preserved (each line glossed independently).
+    Unmappable Hanzi appear bracketed with a trailing `?` so a human
+    reviewer can spot them.
+
+    If name_map is provided, segments matching a known proper noun use the
+    name_map entry instead of the CC-CEDICT literal gloss — so 秀莲 stays
+    "Xiùlián" (the character) rather than "beautiful lotus" (the literal).
+    name_map values are tone-marked pinyin; for the English row we strip
+    the tone marks (capitalization is preserved) so the gloss matches the
+    english_name_map convention used elsewhere.
+    """
+    if not hanzi or not cedict:
+        return ""
+    # Build a name-segmentation seed. We want longest-first matching of
+    # name_map keys to take precedence over generic CC-CEDICT segmentation.
+    name_keys = sorted((name_map or {}).keys(), key=len, reverse=True) if name_map else []
+
+    out_lines: list[str] = []
+    for hline in hanzi.split("\n"):
+        # At each position, try name_map keys (longest first) before the
+        # generic greedy CC-CEDICT longest-match, so proper nouns win.
+        parts: list[str] = []
+        i = 0
+        n = len(hline)
+        while i < n:
+            ch = hline[i]
+            if HAN_RE.match(ch):
+                # Try name_map (longest first).
+                matched = False
+                for nk in name_keys:
+                    if hline.startswith(nk, i):
+                        parts.append(_name_to_english(name_map[nk]))
+                        i += len(nk)
+                        matched = True
+                        break
+                if matched:
+                    continue
+                # Fall through to CC-CEDICT longest-match.
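+                # Longest match wins here too: 让开 resolves as one segment
+                # ("Move aside") before 让 alone ("to let") can claim it.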
+                for length in range(min(12, n - i), 0, -1):
+                    cand = hline[i:i + length]
+                    if cand in cedict:
+                        # Particle override: drop entirely.
+                        if cand in PARTICLE_OVERRIDES:
+                            override = PARTICLE_OVERRIDES[cand]
+                            if override:
+                                parts.append(override)
+                            i += length
+                            matched = True
+                            break
+                        # Imperative override: emit the bare command form
+                        # instead of the dictionary's "to X" infinitive.
+                        if cand in IMPERATIVE_LEMMA_OVERRIDES:
+                            parts.append(IMPERATIVE_LEMMA_OVERRIDES[cand])
+                            i += length
+                            matched = True
+                            break
+                        g = gloss_segment(cand, cedict)
+                        if g:
+                            cleaned = clean_gloss(g)
+                            if cleaned:
+                                parts.append(cleaned)
+                        else:
+                            parts.append(f"[{cand}?]")
+                        i += length
+                        matched = True
+                        break
+                if not matched:
+                    parts.append(f"[{ch}?]")
+                    i += 1
+            else:
+                parts.append(ch)
+                i += 1
+        line = " ".join(p for p in parts if p.strip()).strip()
+        line = re.sub(r"\s+", " ", line)
+        line = re.sub(r"\s+([,.!?;:])", r"\1", line)
+        out_lines.append(line)
+    return "\n".join(out_lines).strip()
+
+
+def _name_to_english(tone_pinyin: str) -> str:
+    """Convert a name_map entry (e.g. 'Lǐ Mùbái') to bare English form ('Li Mubai').
+
+    Strip tone marks via NFKD decomposition + filter combining chars.
+    Capitalization is preserved (the consonant-side capital is what we want).
+    """
+    import unicodedata
+    nfkd = unicodedata.normalize("NFKD", tone_pinyin)
+    return "".join(c for c in nfkd if not unicodedata.combining(c))
+
+
+def enrich_cues(cues: list[TriCue], cedict: dict, changelog_path: Path | None,
+                name_map: dict[str, str] | None = None) -> list[TriCue]:
+    """Apply Phase-1 dictionary correction.
+
+    Only fills empty English cues; never overwrites existing English. Logs
+    every change to .changes.tsv for review.
+    """
+    if not cedict:
+        print("enrich: no dictionary loaded; skipping")
+        return cues
+    changes: list[tuple[int, str, str, str, str]] = []
+    for c in cues:
+        if c.hanzi and not c.english.strip():
+            gloss = gloss_hanzi(c.hanzi, cedict, name_map=name_map)
+            if gloss:
+                changes.append((c.index, "FILL", c.hanzi.replace("\n", " | "), "", gloss))
+                c.english = gloss
+    if changelog_path is not None and changes:
+        try:
+            with changelog_path.open("w", encoding="utf-8", newline="") as f:
+                f.write("idx\taction\thanzi\tbefore\tafter\n")
+                for row in changes:
+                    f.write("\t".join(str(x).replace("\t", " ").replace("\n", " | ") for x in row) + "\n")
+            print(f"enrich: {len(changes)} cue(s) filled; changelog -> {changelog_path.name}")
+        except OSError as e:
+            print(f"warn: could not write changelog {changelog_path}: {e}", file=sys.stderr)
+    else:
+        print(f"enrich: {len(changes)} cue(s) filled")
+    return cues
+
+
 # ---------- validation ----------
 
 def validate(cues: list[TriCue]) -> None:
@@ -775,6 +1084,10 @@ def main() -> None:
     ap.add_argument("--window-ms", type=int, default=1500,
                     help="per-cue alignment tolerance (default 1500)")
     ap.add_argument("--no-bom", action="store_true", help="write output without UTF-8 BOM")
+    ap.add_argument("--enrich", action="store_true",
+                    help="apply Phase-1 dictionary correction (CC-CEDICT-driven empty-cue fill); writes .changes.tsv")
+    ap.add_argument("--cedict", type=Path, default=None,
+                    help="path to CC-CEDICT text file (default: Research/primary_sources/cedict/cedict_1_0_ts_utf-8_mdbg.txt)")
     args = ap.parse_args()
 
     if args.inspect:
@@ -879,6 +1192,19 @@ def main() -> None:
             print(f"warn: could not load {alt_path.name}: {e}", file=sys.stderr)
     print(f"  {len(merged)} merged cues")
 
+    # 5b. Phase-1 dictionary correction (if --enrich). Runs AFTER translations.json
+    # so manual per-cue overrides take precedence; only fills empty English.
+    if args.enrich:
+        cedict_path = args.cedict if args.cedict else DEFAULT_CEDICT_PATH
+        print(f"loading dictionary: {cedict_path.name}...")
+        cedict = load_cedict(cedict_path)
+        if not cedict:
+            print("warn: dictionary empty/missing — skipping enrichment", file=sys.stderr)
+        else:
+            print(f"  {len(cedict)} simplified-Hanzi keys")
+            changelog = args.out.with_suffix(args.out.suffix + ".changes.tsv")
+            merged = enrich_cues(merged, cedict, changelog, name_map=name_map)
+
     # 6. Validate + write. Output format inferred from --out extension.
     validate(merged)
     ext = args.out.suffix.lower()

From cbe16920bd34cdafc4fe0a94babdce0427df3b84 Mon Sep 17 00:00:00 2001
From: Boladi <151992391+Boladi888@users.noreply.github.com>
Date: Sun, 10 May 2026 20:04:33 -0400
Subject: [PATCH 2/6] Add Phase-2 LLM verifier via llama.cpp-vulkan (--llm-verify)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Layers a local-LLM verification stage on top of Phase 1. When --llm-verify
is set, spawn a llama.cpp HTTP server on the Vulkan device, re-translate
the dictionary-filled cues, replace dict glosses with idiomatic English,
and cache per-cue results so re-runs skip cues already verified.

CTHD test run: 42 dictionary fills → 39 LLM replacements; the 3 unreplaced
are cases where the dict's imperative override (Stop!, Move aside) or
name_map (Mubai, Xiulian) already produced the right output and the LLM
agreed.

- Server lifecycle: find_arc_vulkan_device() picks the Arc device id from
  `llama-server --list-devices` (skips the iGPU). start_llama_server()
  spawns with --gpu-layers auto --no-warmup and polls /health. The
  🔴 STARTING / 🟧 A770 USE banner and final "double-confirmed to be
  closed" line meet Master §1.6.
- Prompt design: system prompt tells the model the Chinese is truth and to
  output only the English line. User prompt carries the Chinese, the
  existing English (if any), the dict suggestion as a hint, the two prior
  cues for context, and name_map hints for any character mentioned.
  /no_think is appended to suppress Qwen3's reasoning mode (which otherwise
  eats the whole token budget into reasoning_content with empty content).
- Defensive fallback: if a model ignores /no_think and returns empty
  content with non-empty reasoning_content, llm_complete extracts a
  best-guess answer from the reasoning stream.
- Cache: Research/cache/<imdb>.llm.json, keyed by sha1(model | hanzi).
  Re-runs skip cached cues entirely. Cache survives across runs of the
  same film.
- A770 logging: every dispatch appends a structured row to
  Logs/A770_usage.md with timestamp, model, quant, n_ctx, device, call
  count, prompt/completion token totals, completion tok/s, JIT time, total
  wall, success/failure, and source session pointer. Matches the
  K-Arc2/A770/data/characterization.md row format so the data is
  publishable.

CLI: --llm-verify (requires --enrich), --llm-model <gguf>, --llm-server
<exe>, --llm-port (default 8765 to avoid colliding with TIMMY on 8080),
--llm-ctx, --llm-max (cap cue count for smoke tests). Default sources for
the two paths are the PINSUB_LLM_GGUF and PINSUB_LLAMA_SERVER env vars, so
the only environment surface needed is two paths — no directory-wide
access to the model store.

No new Python dependencies (stdlib http.client + json + subprocess).
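
A sketch of the per-cue cache round-trip this buys (illustrative and
condensed; error handling elided, helpers are the ones this patch adds):

    key = _cache_key(model_path.name, cue.hanzi)   # sha1("model|hanzi")
    if key in cache:
        cue.english = cache[key]        # re-run: no A770 round-trip
    else:
        raw = llm_complete(session, LLM_SYSTEM_PROMPT, prompt)
        cue.english = cache[key] = _clean_llm_output(raw)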
--- PinSub.py | 505 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 499 insertions(+), 6 deletions(-) diff --git a/PinSub.py b/PinSub.py index 745df09..206b7e8 100644 --- a/PinSub.py +++ b/PinSub.py @@ -26,10 +26,16 @@ """ import argparse +import hashlib +import http.client import json +import os +import platform import re +import socket import subprocess import sys +import time from dataclasses import dataclass, field from pathlib import Path @@ -996,22 +1002,30 @@ def _name_to_english(tone_pinyin: str) -> str: def enrich_cues(cues: list[TriCue], cedict: dict, changelog_path: Path | None, - name_map: dict[str, str] | None = None) -> list[TriCue]: + name_map: dict[str, str] | None = None + ) -> tuple[list[TriCue], set[int], list[tuple]]: """Apply Phase-1 dictionary correction. Only fills empty English cues; never overwrites existing English. Logs every change to .changes.tsv for review. + + Returns (cues, set_of_filled_cue_indices, changelog_rows). The filled + indices are what Phase 2 LLM verifier targets — those are the cues + whose English originated from CC-CEDICT and most need natural-language + cleanup. """ if not cedict: print("enrich: no dictionary loaded; skipping") - return cues - changes: list[tuple[int, str, str, str, str]] = [] + return cues, set(), [] + changes: list[tuple] = [] + filled: set[int] = set() for c in cues: if c.hanzi and not c.english.strip(): gloss = gloss_hanzi(c.hanzi, cedict, name_map=name_map) if gloss: changes.append((c.index, "FILL", c.hanzi.replace("\n", " | "), "", gloss)) c.english = gloss + filled.add(c.index) if changelog_path is not None and changes: try: with changelog_path.open("w", encoding="utf-8", newline="") as f: @@ -1023,7 +1037,386 @@ def enrich_cues(cues: list[TriCue], cedict: dict, changelog_path: Path | None, print(f"warn: could not write changelog {changelog_path}: {e}", file=sys.stderr) else: print(f"enrich: {len(changes)} cue(s) filled") - return cues + return cues, filled, changes + + +# ---------- Phase 2: LLM verifier via llama.cpp-vulkan on Arc A770 ---------- +# +# Phase 1 (dictionary) fills empty cues with literal CC-CEDICT gloss; quality +# is bimodal. Phase 2 sends the filled cues (and optionally divergent cues) +# to a local llama.cpp HTTP server for natural-language correction. +# +# Pattern matches the owner's TIMMY backend convention: spawn `llama-server` +# with Vulkan device + auto gpu-layers, talk over HTTP, kill on exit. +# Per project rule (03_PROJECT.md workflow preferences), every A770 dispatch +# is announced and logged to Logs/A770_usage.md. + +LLAMA_HEALTH_TIMEOUT = 180 # seconds; first-call JIT can take a while +LLAMA_REQUEST_TIMEOUT = 120 +LLAMA_DEFAULT_PORT = 8765 # avoid colliding with TIMMY on 8080 + +LLM_SYSTEM_PROMPT = ( + "You translate Chinese film subtitles into English for a Mandarin learner. " + "The Chinese is the source of truth. Produce an idiomatic English line that " + "matches the Chinese meaning. Be short — one line fits on screen. " + "Do NOT explain. Do NOT include the Chinese text. Do NOT add quotes or " + "labels. Output ONLY the English translation. If the Chinese is a single " + "interjection or name, output its English equivalent only. 
/no_think"
+)
+
+
+@dataclass
+class LlamaSession:
+    """Live llama-server handle plus metadata for the A770 usage log."""
+    proc: subprocess.Popen
+    port: int
+    model_path: Path
+    server_exe: Path
+    device: str | None
+    started_at: float
+    first_call_at: float | None = None
+    last_call_at: float | None = None
+    calls: int = 0
+    total_completion_tokens: int = 0
+    total_prompt_tokens: int = 0
+
+
+def find_arc_vulkan_device(server_exe: Path) -> str | None:
+    """Run `llama-server --list-devices` and return the Arc device id (e.g. 'Vulkan0')."""
+    try:
+        out = subprocess.run(
+            [str(server_exe), "--list-devices"],
+            capture_output=True, text=True, timeout=30, check=False,
+        )
+    except (OSError, subprocess.SubprocessError) as e:
+        print(f"warn: --list-devices failed: {e}", file=sys.stderr)
+        return None
+    blob = (out.stdout or "") + "\n" + (out.stderr or "")
+    # llama-server prints lines like: "Vulkan0: Intel(R) Arc(TM) A770 Graphics (...)".
+    for line in blob.splitlines():
+        if "Arc" in line and "Vulkan" in line:
+            m = re.search(r"(Vulkan\d+)", line)
+            if m:
+                return m.group(1)
+    return None
+
+
+def wait_for_llama_health(port: int, timeout: int = LLAMA_HEALTH_TIMEOUT) -> bool:
+    """Poll /health on the local server until 200 or timeout."""
+    deadline = time.time() + timeout
+    while time.time() < deadline:
+        try:
+            conn = http.client.HTTPConnection("127.0.0.1", port, timeout=3)
+            conn.request("GET", "/health")
+            r = conn.getresponse()
+            conn.close()
+            if r.status == 200:
+                return True
+        except (OSError, http.client.HTTPException):
+            pass
+        time.sleep(1)
+    return False
+
+
+def start_llama_server(server_exe: Path, model_path: Path,
+                       port: int = LLAMA_DEFAULT_PORT,
+                       device: str | None = None,
+                       n_gpu_layers: str = "auto",
+                       ctx_size: int = 4096) -> LlamaSession:
+    """Spawn llama-server. Returns a LlamaSession; caller is responsible for stop_llama_server()."""
+    if not server_exe.exists():
+        sys.exit(f"missing llama-server: {server_exe}")
+    if not model_path.exists():
+        sys.exit(f"missing GGUF model: {model_path}")
+    cmd = [str(server_exe), "-m", str(model_path),
+           "--port", str(port),
+           "--ctx-size", str(ctx_size),
+           "--gpu-layers", n_gpu_layers,
+           "--no-warmup"]  # we'll measure first-call JIT ourselves
+    if device:
+        cmd += ["--device", device]
+    # 🔴 STARTING A SERVER — per Master §1.6
+    print(f"\n🔴 STARTING llama-server on port {port}")
+    print(f"  model: {model_path.name}")
+    print(f"  device: {device or '(auto)'}")
+    print(f"  ctx: {ctx_size}")
+    print(f"  cmd: {' '.join(cmd)}\n")
+    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+                            text=True, encoding="utf-8", errors="replace",
+                            bufsize=1)
+    started = time.time()
+    # Block on /health; if the server never comes up, dump its output below.
+    if not wait_for_llama_health(port):
+        # Capture whatever the server emitted so we can diagnose.
+        if proc.stdout is not None:
+            try:
+                # terminate() first so communicate() can drain the pipe and return.
+ proc.terminate() + try: + captured, _ = proc.communicate(timeout=3) + except subprocess.TimeoutExpired: + proc.kill() + captured, _ = proc.communicate() + print(captured[-2000:] if captured else "(no output)", file=sys.stderr) + except (OSError, ValueError): + pass + sys.exit("llama-server failed to come up within timeout") + print(f"llama-server ready on :{port} after {time.time() - started:.1f}s") + return LlamaSession( + proc=proc, port=port, model_path=model_path, server_exe=server_exe, + device=device, started_at=started, + ) + + +def stop_llama_server(session: LlamaSession) -> None: + """Terminate the server and verify the port is free. Per Master §1.6.""" + if session.proc.poll() is None: + try: + session.proc.terminate() + session.proc.wait(timeout=10) + except subprocess.TimeoutExpired: + session.proc.kill() + session.proc.wait(timeout=5) + # Verify port is free. + try: + with socket.create_connection(("127.0.0.1", session.port), timeout=2): + print(f"warn: port {session.port} still bound after kill", file=sys.stderr) + return + except OSError: + pass # port is free, which is what we want + print(f'\nYour honor, the server has been double-confirmed to be closed') + + +def llm_complete(session: LlamaSession, system_prompt: str, user_prompt: str, + max_tokens: int = 96, temperature: float = 0.2) -> str | None: + """POST to /v1/chat/completions; return content or None on failure.""" + body = json.dumps({ + "messages": [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_prompt}, + ], + "max_tokens": max_tokens, + "temperature": temperature, + "stream": False, + }).encode("utf-8") + try: + conn = http.client.HTTPConnection("127.0.0.1", session.port, timeout=LLAMA_REQUEST_TIMEOUT) + conn.request("POST", "/v1/chat/completions", body=body, + headers={"Content-Type": "application/json"}) + r = conn.getresponse() + if r.status != 200: + print(f"llm error: HTTP {r.status} — {r.read()[:200]}", file=sys.stderr) + conn.close() + return None + data = json.loads(r.read().decode("utf-8")) + conn.close() + except (OSError, json.JSONDecodeError, KeyError) as e: + print(f"llm error: {e}", file=sys.stderr) + return None + now = time.time() + session.calls += 1 + if session.first_call_at is None: + session.first_call_at = now + session.last_call_at = now + usage = data.get("usage") or {} + session.total_completion_tokens += int(usage.get("completion_tokens") or 0) + session.total_prompt_tokens += int(usage.get("prompt_tokens") or 0) + try: + msg = data["choices"][0]["message"] + except (KeyError, IndexError): + return None + content = (msg.get("content") or "").strip() + if content: + return content + # Qwen3 reasoning mode: if /no_think failed and `content` is empty, the + # answer may still live inside `reasoning_content` (a stream of thoughts). + # We try to extract a final-line subtitle from it. Last resort. + reasoning = (msg.get("reasoning_content") or "").strip() + if reasoning: + # Look for an explicit quoted answer first. + m = re.search(r'"([^"]{2,80})"', reasoning) + if m: + return m.group(1).strip() + # Otherwise the last non-empty line is the best guess. + for line in reversed(reasoning.splitlines()): + s = line.strip().strip('"').strip("'").strip() + if 2 <= len(s) <= 120: + return s + return None + + +def _clean_llm_output(text: str) -> str: + """Defensive cleanup of LLM output.""" + if not text: + return "" + t = text.strip() + # Strip wrapping quotes if the model added them. 
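+    # e.g. '"Stop him!"' becomes 'Stop him!'.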
+    if (t.startswith('"') and t.endswith('"')) or (t.startswith("'") and t.endswith("'")):
+        t = t[1:-1].strip()
+    # Drop trailing model artifacts like "<|im_end|>" or thinking-tag residue.
+    t = re.sub(r"</?think[^>]*>", "", t, flags=re.I).strip()
+    # Some Qwen variants prefix the answer with the source language. Drop.
+    t = re.sub(r"^(English|Translation|Subtitle)\s*[:\-]\s*", "", t, flags=re.I)
+    return t
+
+
+def build_verify_prompt(cue: TriCue, dict_suggestion: str,
+                        prior_cues: list[TriCue], name_map: dict[str, str]) -> str:
+    """Build the user-side prompt for cue verification."""
+    parts: list[str] = []
+    parts.append(f"Chinese: {cue.hanzi}")
+    if cue.english.strip() and cue.english.strip() != dict_suggestion:
+        parts.append(f"Existing English (may be wrong or stilted): {cue.english}")
+    if dict_suggestion:
+        parts.append(f"Dictionary literal gloss (often awkward, just a hint): {dict_suggestion}")
+    # Two most recent cues that have Hanzi, for context.
+    ctx = [pc for pc in prior_cues if pc.hanzi][-2:]
+    if ctx:
+        parts.append("Recent context (prior cues):")
+        for pc in ctx:
+            parts.append(f"  {pc.hanzi} → {pc.english}")
+    # Name hints if any name appears in this cue.
+    hints: list[str] = []
+    for h, p in (name_map or {}).items():
+        if h and h in cue.hanzi:
+            hints.append(f"{h} = {_name_to_english(p)}")
+    if hints:
+        parts.append("Known names: " + "; ".join(hints))
+    parts.append("Translate the Chinese above into one short English subtitle line.")
+    return "\n".join(parts)
+
+
+def _cache_key(model_name: str, hanzi: str) -> str:
+    return hashlib.sha1(f"{model_name}|{hanzi}".encode("utf-8")).hexdigest()
+
+
+def load_llm_cache(cache_path: Path) -> dict[str, str]:
+    if not cache_path.exists():
+        return {}
+    try:
+        with cache_path.open(encoding="utf-8") as f:
+            return json.load(f)
+    except (OSError, json.JSONDecodeError):
+        return {}
+
+
+def save_llm_cache(cache_path: Path, cache: dict[str, str]) -> None:
+    try:
+        cache_path.parent.mkdir(parents=True, exist_ok=True)
+        with cache_path.open("w", encoding="utf-8") as f:
+            json.dump(cache, f, ensure_ascii=False, indent=1)
+    except OSError as e:
+        print(f"warn: could not write cache {cache_path}: {e}", file=sys.stderr)
+
+
+def verify_cues_with_llm(cues: list[TriCue], session: LlamaSession,
+                         filled_indices: set[int],
+                         name_map: dict[str, str],
+                         cedict: dict,
+                         cache: dict[str, str],
+                         model_name: str,
+                         changelog_rows: list[tuple]) -> int:
+    """Verify the previously-filled-by-dictionary cues; replace English with LLM output.
+
+    Returns the number of cues replaced.
+    """
+    replaced = 0
+    cue_by_idx = {c.index: c for c in cues}
+    indices_sorted = sorted(filled_indices)
+    print(f"llm verify: {len(indices_sorted)} cue(s) to check")
+    for n, idx in enumerate(indices_sorted, 1):
+        c = cue_by_idx.get(idx)
+        if c is None or not c.hanzi:
+            continue
+        key = _cache_key(model_name, c.hanzi)
+        cached = cache.get(key)
+        if cached is not None:
+            new_text = cached
+            src = "cache"
+        else:
+            # Compute dict suggestion fresh (same logic Phase 1 uses).
+            dict_suggestion = gloss_hanzi(c.hanzi, cedict, name_map=name_map) if cedict else ""
+            # Up to four cues directly prior in cue order; build_verify_prompt
+            # keeps the last two that have Hanzi.
+            prior = [cue_by_idx[i] for i in range(max(1, idx - 4), idx) if i in cue_by_idx]
+            prompt = build_verify_prompt(c, dict_suggestion, prior, name_map)
+            raw = llm_complete(session, LLM_SYSTEM_PROMPT, prompt)
+            new_text = _clean_llm_output(raw) if raw else ""
+            if not new_text:
+                print(f"  cue {idx}: no LLM output; keeping existing", file=sys.stderr)
+                continue
+            cache[key] = new_text
+            src = "llm"
+        if new_text and new_text != c.english.strip():
+            changelog_rows.append((c.index, f"VERIFY({src})", c.hanzi.replace("\n", " | "),
+                                   c.english, new_text))
+            c.english = new_text
+            replaced += 1
+        if n % 5 == 0 or n == len(indices_sorted):
+            print(f"  ...{n}/{len(indices_sorted)} cue(s) processed")
+    return replaced
+
+
+def append_changelog(changelog_path: Path, rows: list[tuple]) -> None:
+    """Append (or create) a TSV log of all enrichment actions."""
+    if not rows:
+        return
+    header_needed = not changelog_path.exists()
+    try:
+        with changelog_path.open("a", encoding="utf-8", newline="") as f:
+            if header_needed:
+                f.write("idx\taction\thanzi\tbefore\tafter\n")
+            for row in rows:
+                f.write("\t".join(str(x).replace("\t", " ").replace("\n", " | ") for x in row) + "\n")
+    except OSError as e:
+        print(f"warn: could not write changelog {changelog_path}: {e}", file=sys.stderr)
+
+
+def write_a770_usage_log(log_path: Path, session: LlamaSession, *,
+                         job: str, cues_processed: int, cues_replaced: int,
+                         source_session_path: Path | None,
+                         success: bool, notes: str = "") -> None:
+    """Append a structured row to Logs/A770_usage.md per the project rule."""
+    log_path.parent.mkdir(parents=True, exist_ok=True)
+    elapsed = time.time() - session.started_at
+    jit_compile = ((session.first_call_at - session.started_at)
+                   if session.first_call_at else None)
+    inference_window = ((session.last_call_at - session.first_call_at)
+                        if (session.first_call_at and session.last_call_at) else 0.0)
+    completion_tps = (session.total_completion_tokens / inference_window
+                      if inference_window > 0 else 0.0)
+    header = (
+        "| time (UTC) | project | job | model | quant | n_ctx | n_gpu_layers "
+        "| device | calls | prompt_tok | completion_tok | completion_tps | jit_s "
+        "| total_wall_s | success | notes | session_log |"
+    )
+    sep = "|" + "|".join(["---"] * 17) + "|"  # 17 columns, matching the header
+    if not log_path.exists():
+        log_path.write_text(
+            "# A770 usage log — PS\n\n"
+            "Per project rule (`03_PROJECT.md` Owner workflow preferences): every A770 dispatch is logged.\n\n"
+            f"{header}\n{sep}\n",
+            encoding="utf-8",
+        )
+    quant = ""
+    m = re.search(r"-(Q\d[A-Za-z0-9_]*)\.gguf$", session.model_path.name, re.I)
+    if m:
+        quant = m.group(1)
+    now_utc = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
+    src = source_session_path.name if source_session_path else ""
+    row_cells = [
+        now_utc, "PS", job, session.model_path.name, quant,
+        "4096", "auto", session.device or "(auto)",
+        str(session.calls), str(session.total_prompt_tokens),
+        str(session.total_completion_tokens),
+        f"{completion_tps:.1f}",
+        f"{jit_compile:.1f}" if jit_compile is not None else "-",
+        f"{elapsed:.1f}",
+        "yes" if success else "no",
+        notes.replace("|", "/").replace("\n", " "),
+        src,
+    ]
+    with log_path.open("a", encoding="utf-8") as f:
+        f.write("| " + " | ".join(row_cells) + " |\n")
 
 
 # ---------- validation ----------
 
 def validate(cues: list[TriCue]) -> None:
@@ -1088,6 +1481,18 @@ def main() -> None:
                     help="apply Phase-1 dictionary correction (CC-CEDICT-driven empty-cue fill); writes .changes.tsv")
     ap.add_argument("--cedict", type=Path, default=None,
                     help="path to CC-CEDICT text file (default: 
Research/primary_sources/cedict/cedict_1_0_ts_utf-8_mdbg.txt)") + ap.add_argument("--llm-verify", action="store_true", + help="(Phase 2) run llama.cpp on the Arc A770 to re-translate dictionary-filled cues. Requires --enrich. Uses --llm-model / --llm-server (or env vars PINSUB_LLM_GGUF / PINSUB_LLAMA_SERVER).") + ap.add_argument("--llm-model", type=Path, default=None, + help="GGUF model file for --llm-verify (default: $PINSUB_LLM_GGUF env var)") + ap.add_argument("--llm-server", type=Path, default=None, + help="llama-server.exe path for --llm-verify (default: $PINSUB_LLAMA_SERVER env var)") + ap.add_argument("--llm-port", type=int, default=LLAMA_DEFAULT_PORT, + help=f"localhost port for llama-server (default {LLAMA_DEFAULT_PORT})") + ap.add_argument("--llm-ctx", type=int, default=4096, + help="llama-server --ctx-size (default 4096)") + ap.add_argument("--llm-max", type=int, default=0, + help="cap LLM verify pass at N cues (0 = all filled; useful for smoke tests)") args = ap.parse_args() if args.inspect: @@ -1194,6 +1599,10 @@ def main() -> None: # 5b. Phase-1 dictionary correction (if --enrich). Runs AFTER translations.json # so manual per-cue overrides take precedence; only fills empty English. + cedict: dict = {} + filled_indices: set[int] = set() + changelog_rows: list[tuple] = [] + changelog_path: Path | None = None if args.enrich: cedict_path = args.cedict if args.cedict else DEFAULT_CEDICT_PATH print(f"loading dictionary: {cedict_path.name}...") @@ -1202,8 +1611,92 @@ def main() -> None: print("warn: dictionary empty/missing — skipping enrichment", file=sys.stderr) else: print(f" {len(cedict)} simplified-Hanzi keys") - changelog = args.out.with_suffix(args.out.suffix + ".changes.tsv") - merged = enrich_cues(merged, cedict, changelog, name_map=name_map) + changelog_path = args.out.with_suffix(args.out.suffix + ".changes.tsv") + merged, filled_indices, changelog_rows = enrich_cues( + merged, cedict, changelog_path, name_map=name_map, + ) + + # 5c. Phase-2 LLM verifier (if --llm-verify). Uses the local llama.cpp HTTP + # server on the Arc A770. Verifies the cues filled by Phase 1. + if args.llm_verify: + if not args.enrich: + sys.exit("--llm-verify requires --enrich") + server_exe = args.llm_server or Path(os.environ.get("PINSUB_LLAMA_SERVER", "")) + model_path = args.llm_model or Path(os.environ.get("PINSUB_LLM_GGUF", "")) + if not server_exe or str(server_exe) == ".": + sys.exit("--llm-verify needs --llm-server or PINSUB_LLAMA_SERVER env var") + if not model_path or str(model_path) == ".": + sys.exit("--llm-verify needs --llm-model or PINSUB_LLM_GGUF env var") + if not filled_indices: + print("llm verify: no cues were filled by Phase 1; nothing to verify") + else: + if args.llm_max and args.llm_max > 0: + # Smoke-test mode: cap the LLM verify pass at N cues. + indices_sorted = sorted(filled_indices)[: args.llm_max] + filled_indices = set(indices_sorted) + print(f"llm verify: --llm-max set, verifying first {len(filled_indices)} cue(s)") + + print("🟧 A770 USE: spawning llama-server on the Arc.") + print(f" model: {model_path.name} (~{model_path.stat().st_size / 1e9:.1f} GB on disk)") + print(f" est. VRAM peak: ~12 GB (10.5 GB model + ~1.5 GB KV @ ctx={args.llm_ctx})") + print(f" est. 
runtime: ~10 s per cue × {len(filled_indices)} cue(s) + ~30 s startup") + device = find_arc_vulkan_device(server_exe) + if device: + print(f" device: {device}") + else: + print(" device: (auto — could not enumerate; llama-server will pick)") + + session = start_llama_server( + server_exe, model_path, + port=args.llm_port, device=device, ctx_size=args.llm_ctx, + ) + + # Cache file is keyed per-film by IMDb id when present. + imdb_id = None + if args.mkv: + m = IMDB_RE.search(args.mkv.name) + if m: + imdb_id = m.group(1) + cache_path = (Path(__file__).parent / "Research" / "cache" + / f"{imdb_id or 'noimdb'}.llm.json") + cache = load_llm_cache(cache_path) + + replaced = 0 + success = False + try: + replaced = verify_cues_with_llm( + merged, session, filled_indices, name_map, cedict, cache, + model_name=model_path.name, changelog_rows=changelog_rows, + ) + print(f"llm verify: {replaced} cue(s) replaced from LLM output") + save_llm_cache(cache_path, cache) + success = True + finally: + stop_llama_server(session) + # A770 usage log entry — required by project rule. + a770_log = Path(__file__).parent / "Logs" / "A770_usage.md" + write_a770_usage_log( + a770_log, session, + job="PinSub --llm-verify", + cues_processed=len(filled_indices), + cues_replaced=replaced, + source_session_path=None, + success=success, + notes=f"film={imdb_id or args.mkv.name}; cache={cache_path.name}", + ) + + # Append the VERIFY rows to the existing FILL changelog. + if changelog_path is not None and changelog_rows: + # Rewrite the changelog with the merged rows. + try: + with changelog_path.open("w", encoding="utf-8", newline="") as f: + f.write("idx\taction\thanzi\tbefore\tafter\n") + for row in changelog_rows: + f.write("\t".join( + str(x).replace("\t", " ").replace("\n", " | ") for x in row + ) + "\n") + except OSError as e: + print(f"warn: could not rewrite changelog {changelog_path}: {e}", file=sys.stderr) # 6. Validate + write. Output format inferred from --out extension. validate(merged) From efcbae05c59b76a8a1e3b8d51294591571d32282 Mon Sep 17 00:00:00 2001 From: Boladi <151992391+Boladi888@users.noreply.github.com> Date: Sun, 10 May 2026 20:20:59 -0400 Subject: [PATCH 3/6] Add Phase-3 translate-then-compare LLM correction (--llm-correct) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bigger-scope correction than --llm-verify. Where Phase 2 only re-translates the cues Phase 1 filled (empty English), Phase 3 also goes after cues whose existing English diverges from what the Chinese says. Python does the heavy lifting (heuristic prefiltering, decision logic, name-map enforcement); the LLM is the dumb worker that translates and judges. Pipeline per cue: 1. Python heuristic decides whether the cue is worth sending to the LLM at all. Cheap signals: empty English; low content-word overlap between the CC-CEDICT dict-gloss and the existing English; length ratio outliers between English chars and Hanzi chars. Tweakable overlap_threshold (default 0.2). Reports a histogram of reasons so the user can tune. 2. TIMMY-translate (fresh prompt, no existing English in context) produces a Chinese-faithful candidate. Per-cue cache. 3. If existing English exists AND differs from TIMMY's, a separate fresh-context TIMMY call judges A=existing, B=TIMMY, or C=write a better one. Per-cue cache. 4. Python applies the verdict, logs the change. CLI: --llm-correct Enable Phase 3. --llm-scope {fills,divergent,all} Which cues to consider. Default 'divergent' (Python heuristic flags suspicion). 
'fills' is the conservative Phase-2-equivalent (empty-English cues only).
'all' touches every cue with Chinese — expensive.

--llm-no-compare
  Skip the comparison pass (use TIMMY's translation outright when it
  differs from existing). Halves LLM call volume.

Caching:
  Translation cache key: T|sha1(model | hanzi).
  Compare cache key: C|sha1(model | hanzi | en_a | en_b).
  Stored at Research/cache/<imdb>.correct.json. Re-runs against the same
  film skip both translate and compare for cues already seen.

A770 logging:
  Each --llm-correct run appends one row to Logs/A770_usage.md with the job
  string including the chosen scope and a per-action count summary in the
  notes column.

CTHD divergent-scope pre-survey: 752 of 1030 cues flagged as suspect (73%).
Smoke test of 10 cues showed 9/10 replaced with high-quality output — TIMMY
correctly identified that the Bluray English subs were shifted/misaligned
with the Chinese in many cues. Full run gated on owner approval given the
A770 contention cost.

No new Python dependencies; reuses the LlamaSession infrastructure
introduced in Phase 2.
---
 PinSub.py | 408 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 407 insertions(+), 1 deletion(-)

diff --git a/PinSub.py b/PinSub.py
index 206b7e8..cf524ea 100644
--- a/PinSub.py
+++ b/PinSub.py
@@ -1419,6 +1419,301 @@ def write_a770_usage_log(log_path: Path, session: LlamaSession, *,
         f.write("| " + " | ".join(row_cells) + " |\n")
 
 
+# ---------- Phase 3: Python-orchestrated translate-then-compare correction ----------
+#
+# Bigger-scope correction than --llm-verify. Where Phase 2 only re-translates
+# the cues Phase 1 filled, Phase 3 also goes after cues whose EXISTING English
+# diverges from what the Chinese says.
+#
+# Pipeline per cue:
+#   1. Python heuristic (content-word overlap + length ratio + empty check)
+#      decides whether the cue is worth sending to the LLM. This keeps TIMMY
+#      from re-translating the 90% of cues that are already fine.
+#   2. TIMMY-translate (fresh prompt, no existing English in context) produces
+#      a Chinese-faithful candidate. Per-cue cache.
+#   3. If existing English exists and differs from TIMMY's, a SEPARATE TIMMY
+#      call (fresh context, different prompt) judges: A=existing, B=TIMMY's,
+#      or C=write a better one. Per-cue cache.
+#   4. Python applies name_map enforcement on the chosen text and logs.
+
+# Common English stop words — excluded from content-word overlap calculations.
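+# e.g. _content_words("He is the master of the house") -> {"master", "house"};
+# detect_suspect_english then scores overlap as |en & gloss| / |gloss|.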
+_STOP_WORDS = { + "a", "an", "the", "is", "are", "was", "were", "be", "been", "being", "am", + "in", "on", "at", "of", "for", "with", "to", "from", "by", "as", "into", + "and", "or", "but", "nor", "so", "if", "then", "than", + "i", "you", "he", "she", "it", "we", "they", + "me", "him", "her", "us", "them", + "my", "your", "his", "hers", "its", "our", "ours", "their", "theirs", + "do", "does", "did", "doing", "done", + "have", "has", "had", "having", + "what", "who", "whom", "where", "when", "why", "how", "which", + "this", "that", "these", "those", + "lit", "fig", "interj", + "not", "no", + "any", "all", "some", "much", "many", "more", "most", "few", "less", "least", + "very", "too", "so", "just", + "let", "lets", +} + + +def _content_words(text: str) -> set[str]: + """Lowercased English content words, stopwords/short words removed.""" + if not text: + return set() + words = re.findall(r"[A-Za-z]+", text.lower()) + return {w for w in words if len(w) >= 2 and w not in _STOP_WORDS} + + +def detect_suspect_english(c: TriCue, cedict: dict, name_map: dict | None, + overlap_threshold: float = 0.2) -> tuple[bool, str]: + """Python heuristic: does this cue's existing English likely diverge from the Chinese? + + Returns (is_suspect, reason). Cheap (no LLM); meant to prefilter so TIMMY + only sees cues that actually need attention. + """ + if not c.hanzi: + return False, "no-hanzi" + existing = c.english.strip() + if not existing: + return True, "empty" + if not cedict: + return False, "no-dict" + # Compute the dict gloss (literal content words). + gloss = gloss_hanzi(c.hanzi, cedict, name_map=name_map) + if not gloss: + return False, "no-gloss" + gloss_words = _content_words(gloss) + if not gloss_words: + # Gloss reduces to particles/empty; nothing to compare against. + return False, "thin-gloss" + en_words = _content_words(existing) + # Add tone-stripped name-map English forms to the gloss vocabulary too — + # that way "Xiu Lian" matches the existing English referring to the character. + for h, p in (name_map or {}).items(): + if h and h in c.hanzi: + gloss_words |= _content_words(_name_to_english(p)) + if not gloss_words: + return False, "thin-gloss" + overlap = len(en_words & gloss_words) / max(1, len(gloss_words)) + if overlap < overlap_threshold: + return True, f"low-overlap({overlap:.2f})" + # Length ratio sanity check on long-ish cues. + han_chars = len([ch for ch in c.hanzi if HAN_RE.match(ch)]) + en_len = len(existing) + if han_chars >= 6 and en_len < han_chars * 1.0: + return True, f"english-too-short({en_len}/{han_chars})" + if han_chars >= 4 and en_len > han_chars * 12: + return True, f"english-too-long({en_len}/{han_chars})" + return False, "ok" + + +LLM_FRESH_SYSTEM_PROMPT = ( + "You translate Chinese film subtitles into idiomatic English for a Mandarin " + "learner. The Chinese is the source of truth. Translate the Chinese line " + "directly — do NOT consider any prior English translation. Be short — one " + "subtitle line. Do NOT explain. Do NOT include the Chinese text. Do NOT add " + "quotes or labels. Output ONLY the English translation. 
/no_think"
+)
+
+
+def llm_translate_fresh(session: LlamaSession, c: TriCue,
+                        prior_cues: list[TriCue], name_map: dict) -> str | None:
+    """Fresh TIMMY call: translate Chinese only, without seeing the existing English."""
+    parts: list[str] = []
+    ctx = [pc for pc in prior_cues if pc.hanzi][-2:]
+    if ctx:
+        parts.append("Recent scene context (for awareness, not for re-translation):")
+        for pc in ctx:
+            # Use the cue's current English (could be TIMMY's prior output, or
+            # a confident existing English — that's fine as scene context).
+            en = pc.english.strip().replace("\n", " ") or "(no english)"
+            parts.append(f"  {pc.hanzi} → {en}")
+    hints: list[str] = []
+    for h, p in (name_map or {}).items():
+        if h and h in c.hanzi:
+            hints.append(f"{h}={_name_to_english(p)}")
+    if hints:
+        parts.append("Known names in this cue: " + "; ".join(hints))
+    parts.append(f"Chinese: {c.hanzi}")
+    parts.append("Translate the Chinese above into one English subtitle line.")
+    raw = llm_complete(session, LLM_FRESH_SYSTEM_PROMPT, "\n".join(parts))
+    return _clean_llm_output(raw) if raw else None
+
+
+LLM_COMPARE_SYSTEM_PROMPT = (
+    "You judge English subtitle translations for fidelity to a Chinese source. "
+    "Given the Chinese and two English candidates A and B, decide which "
+    "candidate better matches the Chinese meaning, OR write a better English "
+    "translation if both candidates are flawed. Respond with EXACTLY one line:\n"
+    "  A   (candidate A is better)\n"
+    "  B   (candidate B is better)\n"
+    "  C: <better translation>   (write your own)\n"
+    "Do not explain. Do not include the Chinese. /no_think"
+)
+
+
+def llm_compare_translations(session: LlamaSession, c: TriCue,
+                             en_a: str, en_b: str,
+                             name_map: dict) -> tuple[str, str]:
+    """Fresh TIMMY: compare two English candidates. Returns (verdict, final_text).
+
+    Verdict is "A", "B", or "C" (model wrote a new translation).
+    Parse failures default to A (= keep existing) — least risk of regression.
+    """
+    parts: list[str] = []
+    parts.append(f"Chinese: {c.hanzi}")
+    parts.append(f"A: {en_a}")
+    parts.append(f"B: {en_b}")
+    hints: list[str] = []
+    for h, p in (name_map or {}).items():
+        if h and h in c.hanzi:
+            hints.append(f"{h}={_name_to_english(p)}")
+    if hints:
+        parts.append("Known names: " + "; ".join(hints))
+    parts.append("Which English better matches the Chinese? Answer A, B, or C: <better translation>.")
+    raw = llm_complete(session, LLM_COMPARE_SYSTEM_PROMPT, "\n".join(parts), max_tokens=80)
+    if not raw:
+        return "A", en_a
+    text = raw.strip()
+    # The model sometimes wraps in extra punctuation. Tolerate "A.", "A!", etc.
+    first = re.match(r"^\s*([ABC])\s*[:.\-—]?\s*(.*)$", text, re.DOTALL)
+    if not first:
+        return "A", en_a
+    verdict = first.group(1).upper()
+    rest = first.group(2).strip()
+    if verdict == "A":
+        return "A", en_a
+    if verdict == "B":
+        return "B", en_b
+    if verdict == "C":
+        # Use whatever follows the C label; fall back if empty.
+        suggestion = _clean_llm_output(rest.split("\n", 1)[0])
+        if suggestion:
+            return "C", suggestion
+        return "A", en_a
+    return "A", en_a
+
+
+def correct_cues_with_llm(cues: list[TriCue], session: LlamaSession,
+                          cedict: dict, name_map: dict,
+                          mode: str = "divergent",
+                          do_compare: bool = True,
+                          max_cues: int = 0,
+                          cache_path: Path | None = None,
+                          changelog_rows: list[tuple] | None = None
+                          ) -> tuple[int, dict]:
+    """Phase 3 orchestrator.
Returns (cues_replaced, per_action_counts).""" + if changelog_rows is None: + changelog_rows = [] + cue_by_idx = {c.index: c for c in cues} + cues_in_order = sorted(cue_by_idx.keys()) + + # Step 1: Python filtering. Compute the divergence reason for every cue + # exactly once and reuse it for both selection and the informational + # histogram below. + candidates: list[tuple[int, str]] = [] # (idx, reason) + suspect_counts: dict[str, int] = {} + for idx in cues_in_order: + c = cue_by_idx[idx] + if not c.hanzi: + continue + suspect, reason = detect_suspect_english(c, cedict, name_map) + suspect_counts[reason] = suspect_counts.get(reason, 0) + 1 + if mode == "fills": + if not c.english.strip(): + candidates.append((idx, "empty")) + elif mode == "all": + candidates.append((idx, reason)) + else: # divergent + if suspect: + candidates.append((idx, reason)) + print(f"llm correct: mode={mode}, {len(candidates)} candidate cue(s) selected") + print(f" divergence histogram: {dict(sorted(suspect_counts.items(), key=lambda kv: -kv[1])[:8])}") + if max_cues and max_cues > 0 and len(candidates) > max_cues: + candidates = candidates[:max_cues] + print(f" --llm-max set: capping at {len(candidates)}") + + # Step 2: caching scaffolding. + cache: dict[str, str] = {} + if cache_path is not None: + cache = load_llm_cache(cache_path) + model_name = session.model_path.name + + def t_key(hanzi: str) -> str: + return "T|" + hashlib.sha1(f"{model_name}|{hanzi}".encode("utf-8")).hexdigest() + + def c_key(hanzi: str, en_a: str, en_b: str) -> str: + return "C|" + hashlib.sha1(f"{model_name}|{hanzi}|{en_a}|{en_b}".encode("utf-8")).hexdigest() + + # Per-action counters. + counts = {"translate_only": 0, "kept_existing": 0, + "compare_A": 0, "compare_B": 0, "compare_C": 0, + "no_change": 0} + replaced = 0 + + for n, (idx, reason) in enumerate(candidates, 1): + c = cue_by_idx[idx] + existing = c.english.strip() + + # Step 3: TIMMY-translate fresh. + tk = t_key(c.hanzi) + if tk in cache: + en_timmy = cache[tk] + else: + prior = [cue_by_idx[i] for i in range(max(1, idx - 4), idx) if i in cue_by_idx] + en_timmy = llm_translate_fresh(session, c, prior, name_map) or "" + if en_timmy: + cache[tk] = en_timmy + + if not en_timmy: + counts["no_change"] += 1 + continue + + # Step 4: decide the final text. + if not existing: + final = en_timmy + verdict = "T" + counts["translate_only"] += 1 + elif existing.lower() == en_timmy.lower(): + counts["kept_existing"] += 1 + continue + elif not do_compare: + # Replace blindly (mode disabled comparison). 
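+            # (--llm-no-compare: the judging call is skipped, halving LLM
+            # call volume at some risk of regressing good existing lines.)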
+ final = en_timmy + verdict = "T" + counts["translate_only"] += 1 + else: + ck = c_key(c.hanzi, existing, en_timmy) + if ck in cache: + cached_str = cache[ck] + # Format: "A|text" or "B|text" or "C|text" + v = cached_str.split("|", 1)[0] if "|" in cached_str else "A" + txt = cached_str.split("|", 1)[1] if "|" in cached_str else existing + verdict = v + final = txt + else: + v, f = llm_compare_translations(session, c, existing, en_timmy, name_map) + verdict = v + final = f + cache[ck] = f"{v}|{f}" + counts[f"compare_{verdict}"] = counts.get(f"compare_{verdict}", 0) + 1 + + if final and final.strip() != c.english.strip(): + changelog_rows.append((c.index, f"CORRECT({verdict},{reason})", + c.hanzi.replace("\n", " | "), c.english, final)) + c.english = final + replaced += 1 + else: + counts["no_change"] += 1 + + if n % 10 == 0 or n == len(candidates): + print(f" ...{n}/{len(candidates)} cue(s) processed (replaced so far: {replaced})") + + if cache_path is not None: + save_llm_cache(cache_path, cache) + return replaced, counts + + # ---------- validation ---------- def validate(cues: list[TriCue]) -> None: @@ -1492,7 +1787,13 @@ def main() -> None: ap.add_argument("--llm-ctx", type=int, default=4096, help="llama-server --ctx-size (default 4096)") ap.add_argument("--llm-max", type=int, default=0, - help="cap LLM verify pass at N cues (0 = all filled; useful for smoke tests)") + help="cap LLM pass at N cues (0 = no cap; useful for smoke tests)") + ap.add_argument("--llm-correct", action="store_true", + help="(Phase 3) broader translate-then-compare correction. Python heuristic prefilters; TIMMY translates fresh; a second TIMMY compares against existing English. Requires --enrich (for the cedict gloss used by the heuristic).") + ap.add_argument("--llm-scope", choices=("fills", "divergent", "all"), default="divergent", + help="--llm-correct scope: 'fills' (only Phase-1 empty fills), 'divergent' (Python flags suspect cues; default), 'all' (every cue with Chinese)") + ap.add_argument("--llm-no-compare", action="store_true", + help="with --llm-correct, skip the comparison pass (use TIMMY's translation outright)") args = ap.parse_args() if args.inspect: @@ -1698,6 +1999,111 @@ def main() -> None: except OSError as e: print(f"warn: could not rewrite changelog {changelog_path}: {e}", file=sys.stderr) + # 5d. Phase-3 LLM-correct (if --llm-correct). Broader translate-then-compare: + # Python heuristic prefilters suspect cues; TIMMY translates each from + # scratch (no existing English in the prompt); a separate TIMMY compares + # the two candidates per cue. Reuses the llama-server pattern. + if args.llm_correct: + if not args.enrich: + sys.exit("--llm-correct requires --enrich (cedict gloss feeds the divergence heuristic)") + if not cedict: + sys.exit("--llm-correct requires a loaded CC-CEDICT") + server_exe = args.llm_server or Path(os.environ.get("PINSUB_LLAMA_SERVER", "")) + model_path = args.llm_model or Path(os.environ.get("PINSUB_LLM_GGUF", "")) + if not server_exe or str(server_exe) == ".": + sys.exit("--llm-correct needs --llm-server or PINSUB_LLAMA_SERVER env var") + if not model_path or str(model_path) == ".": + sys.exit("--llm-correct needs --llm-model or PINSUB_LLM_GGUF env var") + + # Pre-survey: how many cues will the Python heuristic select before we + # spawn the server? Saves an A770 startup if the answer is zero. 
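+        # (detect_suspect_english runs again inside correct_cues_with_llm;
+        # both passes are cheap pure Python, so the repeat costs nothing.)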
+ survey_counts: dict[str, int] = {} + candidate_count = 0 + for c in merged: + if not c.hanzi: + continue + suspect, reason = detect_suspect_english(c, cedict, name_map) + survey_counts[reason] = survey_counts.get(reason, 0) + 1 + if args.llm_scope == "fills": + if not c.english.strip(): + candidate_count += 1 + elif args.llm_scope == "all": + candidate_count += 1 + elif suspect: + candidate_count += 1 + print(f"llm correct: pre-survey — {candidate_count} candidate cue(s) (scope={args.llm_scope})") + top = sorted(survey_counts.items(), key=lambda kv: -kv[1])[:6] + print(f" divergence histogram (top 6): {dict(top)}") + if args.llm_max and args.llm_max > 0: + candidate_count = min(candidate_count, args.llm_max) + print(f" (capping at --llm-max={args.llm_max})") + + if candidate_count == 0: + print("llm correct: no cues to process; skipping") + else: + print("🟧 A770 USE: spawning llama-server on the Arc.") + print(f" model: {model_path.name} (~{model_path.stat().st_size / 1e9:.1f} GB on disk)") + print(f" est. VRAM peak: ~12 GB (10.5 GB model + ~1.5 GB KV @ ctx={args.llm_ctx})") + calls_per_cue = 2 if not args.llm_no_compare else 1 + print(f" est. runtime: ~{candidate_count * calls_per_cue * 8} s " + f"({candidate_count} cue(s) × {calls_per_cue} call(s) × ~8 s)") + device = find_arc_vulkan_device(server_exe) + if device: + print(f" device: {device}") + else: + print(" device: (auto)") + + session = start_llama_server( + server_exe, model_path, + port=args.llm_port, device=device, ctx_size=args.llm_ctx, + ) + + imdb_id = None + if args.mkv: + m = IMDB_RE.search(args.mkv.name) + if m: + imdb_id = m.group(1) + cache_path = (Path(__file__).parent / "Research" / "cache" + / f"{imdb_id or 'noimdb'}.correct.json") + + replaced = 0 + counts: dict[str, int] = {} + success = False + try: + replaced, counts = correct_cues_with_llm( + merged, session, cedict, name_map, + mode=args.llm_scope, do_compare=not args.llm_no_compare, + max_cues=args.llm_max, cache_path=cache_path, + changelog_rows=changelog_rows, + ) + print(f"llm correct: {replaced} cue(s) replaced; action counts: {counts}") + success = True + finally: + stop_llama_server(session) + a770_log = Path(__file__).parent / "Logs" / "A770_usage.md" + write_a770_usage_log( + a770_log, session, + job=f"PinSub --llm-correct --llm-scope={args.llm_scope}" + + ("" if not args.llm_no_compare else " --llm-no-compare"), + cues_processed=candidate_count, + cues_replaced=replaced, + source_session_path=None, + success=success, + notes=f"film={imdb_id or args.mkv.name}; counts={counts}", + ) + + # Rewrite the changelog including new CORRECT rows. + if changelog_path is not None and changelog_rows: + try: + with changelog_path.open("w", encoding="utf-8", newline="") as f: + f.write("idx\taction\thanzi\tbefore\tafter\n") + for row in changelog_rows: + f.write("\t".join( + str(x).replace("\t", " ").replace("\n", " | ") for x in row + ) + "\n") + except OSError as e: + print(f"warn: could not rewrite changelog {changelog_path}: {e}", file=sys.stderr) + # 6. Validate + write. Output format inferred from --out extension. 
validate(merged) ext = args.out.suffix.lower() From 452083de7ab835e93c869b1e9e0080e4fad08bf8 Mon Sep 17 00:00:00 2001 From: Boladi <151992391+Boladi888@users.noreply.github.com> Date: Sun, 10 May 2026 21:15:00 -0400 Subject: [PATCH 4/6] Refactor: external prompt + glossary + validation loop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TIMMY's system prompt now lives in README_TIMMY.md next to PinSub.py (loaded at startup, falls back to an inline default if absent). Pattern matches the owner's TIMMY_SDNext.md convention — prompt-engineering decisions live in markdown, not strings buried 1500 lines into Python. Translation philosophy in the new prompt: word-for-word fidelity on content (every noun/verb/name/modifier gets translated), function-word latitude (drop Chinese particles English doesn't have, add English articles/copulas Chinese implies), synonym latitude on word choice, Chinese word order preserved when grammatically tolerable. Rearrange only when literal order is incomprehensible. Name handling generalized: the README_TIMMY rule (bare-pinyin with tone marks stripped, syllable spacing preserved, capitals on each name part) works on any film. If a per-film name_map provides a canonical spelling, that's passed as a "Known names:" hint and wins. If not, TIMMY uses the pinyin row in the prompt to derive the English form. Glossary database: - New glossary.json at the project root with seed wuxia/kungfu terms (枪 → spear, 师娘 → Master's wife, 镖局 → security agency, 江湖 → the martial world, 师父 → Master, etc. ~20 entries). Whitelisted in .gitignore so it ships publicly like names/example.json. - names/.json gets an optional `glossary` field for per-film overrides + additions. Same schema, per-film entries override global on key collisions. - PinSub scans each cue's Hanzi for any glossary key, passes hits to TIMMY as a "Glossary:" section. Longest-match-first so compound terms (天下第一枪) match before atomic terms (枪). Python validators + retry loop: - validate_translation() checks each TIMMY output for Hanzi-in-output (hard reject), name canonicalization (when name_map provided), and length-sanity (only on longer cues). - Failures feed back into the next TIMMY call as explicit feedback ("Your output still contains Chinese characters: 镖局. Translate every Hanzi..."). Loop up to --llm-rounds (default 3) iterations. - rounds_used histogram emitted in action counts so the operator can see how often round-2/3 retries are needed. CPU thread cap (--llm-threads, default 4): caps llama-server's CPU side. Vulkan path spends most time on the GPU; this is the knob to keep host CPU headroom (e.g. when other workloads are using the CPU). CLI additions: --llm-rounds Max validator-driven retry rounds (default 3). --llm-threads CPU thread cap for llama-server (default 4). CTHD smoke (10 cues, refactor pass): - 镖局 now translated as "security agency" via global glossary (previously left as Hanzi by TIMMY). - All 10 cues passed round-1 validation; no retries needed in this sample. Loop is wired; just didn't fire because the prompt + hints were sufficient. - File-specific glossary entries (青冥剑 → Qingming Sword, 天下第一枪 → Number One Spear under Heaven) ready for use on larger runs. Public-repo additions: README_TIMMY.md, glossary.json. Both tracked. 
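Side note on the name rule: stripping tone marks is plain Unicode
decomposition, nothing film-specific. A minimal stdlib sketch (illustrative
only; `strip_tone_marks` is a hypothetical helper, not code in this patch):

```python
import unicodedata

def strip_tone_marks(pinyin: str) -> str:
    # NFD splits each accented letter into a base letter plus combining
    # tone marks; dropping the combining marks leaves the bare letters.
    decomposed = unicodedata.normalize("NFD", pinyin)
    return "".join(ch for ch in decomposed if not unicodedata.combining(ch))

print(strip_tone_marks("Lǐ Mùbái"))  # -> Li Mubai
```

Caveat: NFD stripping also removes the umlaut from ü (lǚ becomes lu), which
is a vowel distinction rather than a tone mark, so a real implementation
would special-case ü before stripping.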
--- PinSub.py | 294 ++++++++++++++++++++++++++++++++++++++++++------ README_TIMMY.md | 71 ++++++++++++ glossary.json | 112 ++++++++++++++++++ 3 files changed, 445 insertions(+), 32 deletions(-) create mode 100644 README_TIMMY.md create mode 100644 glossary.json diff --git a/PinSub.py b/PinSub.py index cf524ea..3929311 100644 --- a/PinSub.py +++ b/PinSub.py @@ -80,10 +80,14 @@ def find_names_file(names_arg: Path | None, mkv_path: Path | None) -> Path | Non return None -def load_names(names_path: Path | None) -> tuple[dict[str, str], dict[str, str]]: - """Load (name_map, english_name_map) from a per-film JSON, or empty pair.""" +def load_names(names_path: Path | None) -> tuple[dict[str, str], dict[str, str], dict]: + """Load (name_map, english_name_map, per_film_glossary) from a per-film JSON. + + Returns empty containers if the file is missing. The per-film glossary, + when present, is a mapping {hanzi -> {"english": str, "context": str, "tags": [...]}}. + Underscore-prefixed keys are filtered out at each level.""" if names_path is None: - return {}, {} + return {}, {}, {} try: with names_path.open(encoding="utf-8") as f: data = json.load(f) @@ -91,7 +95,53 @@ def load_names(names_path: Path | None) -> tuple[dict[str, str], dict[str, str]] sys.exit(f"failed to load names file {names_path}: {e}") name_map = {k: v for k, v in (data.get("name_map") or {}).items() if not k.startswith("_")} english_name_map = {k: v for k, v in (data.get("english_name_map") or {}).items() if not k.startswith("_")} - return name_map, english_name_map + per_film_glossary = {k: v for k, v in (data.get("glossary") or {}).items() if not k.startswith("_")} + return name_map, english_name_map, per_film_glossary + + +# Default location for the global glossary (shipped publicly). +DEFAULT_GLOSSARY_PATH = Path(__file__).parent / "glossary.json" + + +def load_glossary(path: Path | None = None) -> dict[str, dict]: + """Load the global Chinese→English glossary. Entries are + {hanzi -> {"english": str, "context": str, "tags": [...]}}. + Underscore-prefixed keys (helps, format docs) are filtered out.""" + p = path or DEFAULT_GLOSSARY_PATH + if not p.exists(): + return {} + try: + with p.open(encoding="utf-8") as f: + data = json.load(f) + except (OSError, json.JSONDecodeError) as e: + print(f"warn: could not load glossary {p}: {e}", file=sys.stderr) + return {} + return {k: v for k, v in data.items() if not k.startswith("_") and isinstance(v, dict)} + + +def merge_glossaries(global_g: dict, per_film_g: dict) -> dict: + """Combine global + per-film glossary. Per-film entries override global on key collisions.""" + out = dict(global_g) + out.update(per_film_g) + return out + + +# Default location for TIMMY's system-prompt file. PinSub loads this at runtime +# and uses it as the system prompt for translator-role calls. If absent, the +# fallback inline string below is used. +DEFAULT_TIMMY_PROMPT_PATH = Path(__file__).parent / "README_TIMMY.md" + + +def load_timmy_prompt(path: Path | None = None) -> str | None: + """Read the TIMMY system-prompt file. 
Returns None if missing/unreadable.""" + p = path or DEFAULT_TIMMY_PROMPT_PATH + if not p.exists(): + return None + try: + return p.read_text(encoding="utf-8").strip() + except OSError as e: + print(f"warn: could not read {p}: {e}", file=sys.stderr) + return None @dataclass @@ -1122,8 +1172,13 @@ def start_llama_server(server_exe: Path, model_path: Path, port: int = LLAMA_DEFAULT_PORT, device: str | None = None, n_gpu_layers: str = "auto", - ctx_size: int = 4096) -> LlamaSession: - """Spawn llama-server. Returns a LlamaSession; caller is responsible for stop_llama_server().""" + ctx_size: int = 4096, + n_threads: int = 4) -> LlamaSession: + """Spawn llama-server. Returns a LlamaSession; caller is responsible for stop_llama_server(). + + n_threads caps CPU usage. Vulkan/GPU workloads spend most time on the GPU; + CPU threads are for tokenization, scheduling, and any CPU-resident layers. + Default 4 leaves room for other CPU work.""" if not server_exe.exists(): sys.exit(f"missing llama-server: {server_exe}") if not model_path.exists(): @@ -1132,6 +1187,8 @@ def start_llama_server(server_exe: Path, model_path: Path, "--port", str(port), "--ctx-size", str(ctx_size), "--gpu-layers", n_gpu_layers, + "--threads", str(n_threads), + "--threads-batch", str(n_threads), "--no-warmup"] # we'll measure first-call JIT ourselves if device: cmd += ["--device", device] @@ -1507,36 +1564,96 @@ def detect_suspect_english(c: TriCue, cedict: dict, name_map: dict | None, return False, "ok" +# Fallback inline system prompt used when README_TIMMY.md is absent. The real +# prompt lives in README_TIMMY.md next to PinSub.py and is loaded at startup. LLM_FRESH_SYSTEM_PROMPT = ( - "You translate Chinese film subtitles into idiomatic English for a Mandarin " - "learner. The Chinese is the source of truth. Translate the Chinese line " - "directly — do NOT consider any prior English translation. Be short — one " - "subtitle line. Do NOT explain. Do NOT include the Chinese text. Do NOT add " - "quotes or labels. Output ONLY the English translation. /no_think" + "You translate Chinese film subtitles into English for a Mandarin learner. " + "The Chinese is the source of truth. Translate every Chinese content word " + "(noun, verb, name, modifier) — function words (particles, articles) follow " + "each language's grammar. Preserve Chinese word order when grammatically " + "tolerable in English. Use the 'Known names' and 'Glossary' hints if " + "provided. NEVER leave Chinese characters in your output — if unsure of a " + "term, output [?] for it. Output one short English line. No quotes, no " + "labels, no explanation. /no_think" ) -def llm_translate_fresh(session: LlamaSession, c: TriCue, - prior_cues: list[TriCue], name_map: dict) -> str | None: - """Fresh TIMMY call: translate Chinese only, without seeing the existing English.""" +def _build_glossary_hints(hanzi: str, glossary: dict) -> list[tuple[str, str]]: + """Return [(hanzi_term, english_hint)] for every glossary key found in this cue. 
+ + Longest keys first — so "天下第一枪" (compound) matches before "枪" alone.""" + if not hanzi or not glossary: + return [] + hits: list[tuple[str, str]] = [] + for key in sorted(glossary.keys(), key=len, reverse=True): + if key in hanzi: + entry = glossary[key] + english_hint = (entry.get("english") or "").strip() if isinstance(entry, dict) else str(entry) + if english_hint: + hits.append((key, english_hint)) + return hits + + +def _detect_name_candidates(hanzi: str, pinyin: str, name_map: dict | None) -> list[tuple[str, str]]: + """Returns [(hanzi_span, english_form)] for likely name spans in the cue. + + When name_map covers a span, use it (canonical). When not, fall back to a + bare-pinyin form derived from the cue's already-computed pinyin. This is + the generic path for films without a name_map — TIMMY gets a pinyin-derived + hint per likely-name span and can use it verbatim.""" + if not hanzi: + return [] + out: list[tuple[str, str]] = [] + # Apply name_map first (longest first). + if name_map: + for h in sorted(name_map.keys(), key=len, reverse=True): + if h and h in hanzi: + out.append((h, _name_to_english(name_map[h]))) + # No generic auto-detection for now: pypinyin doesn't reliably identify + # names from raw text, and false positives ("walk" picked as surname) would + # poison the hints. Instead, we surface the *entire* pinyin row to TIMMY + # in the prompt (see build_translate_user_prompt) — TIMMY can use the + # pinyin syllable structure to spell any names that weren't in name_map. + return out + + +def build_translate_user_prompt(c: TriCue, prior_cues: list[TriCue], + name_hints: list[tuple[str, str]], + glossary_hits: list[tuple[str, str]], + feedback: str | None = None) -> str: + """Assemble the user-side prompt for a translator-role TIMMY call.""" parts: list[str] = [] ctx = [pc for pc in prior_cues if pc.hanzi][-2:] if ctx: - parts.append("Recent scene context (for awareness, not for re-translation):") + parts.append("Recent scene context (prior cues for awareness):") for pc in ctx: - # Use the cue's current English (could be TIMMY's prior output, or - # a confident existing English — that's fine as scene context). 
en = pc.english.strip().replace("\n", " ") or "(no english)" parts.append(f" {pc.hanzi} → {en}") - hints: list[str] = [] - for h, p in (name_map or {}).items(): - if h and h in c.hanzi: - hints.append(f"{h}={_name_to_english(p)}") - if hints: - parts.append("Known names in this cue: " + "; ".join(hints)) + if name_hints: + parts.append("Known names in this cue: " + "; ".join( + f"{h}={en}" for h, en in name_hints)) + if glossary_hits: + parts.append("Glossary (use these English forms for the Chinese terms below):") + for h, hint in glossary_hits: + parts.append(f" {h} → {hint}") + if c.pinyin: + parts.append(f"Pinyin row: {c.pinyin}") parts.append(f"Chinese: {c.hanzi}") - parts.append("Translate the Chinese above into one English subtitle line.") - raw = llm_complete(session, LLM_FRESH_SYSTEM_PROMPT, "\n".join(parts)) + if feedback: + parts.append(f"Your previous attempt had these issues — fix them:\n{feedback}") + parts.append("Translate the Chinese above into one short English subtitle line.") + return "\n".join(parts) + + +def llm_translate_fresh(session: LlamaSession, c: TriCue, + prior_cues: list[TriCue], name_map: dict, + glossary: dict, system_prompt: str, + feedback: str | None = None) -> str | None: + """Fresh TIMMY call: translate Chinese only, without seeing existing English.""" + name_hints = _detect_name_candidates(c.hanzi, c.pinyin, name_map) + glossary_hits = _build_glossary_hints(c.hanzi, glossary) + user = build_translate_user_prompt(c, prior_cues, name_hints, glossary_hits, feedback) + raw = llm_complete(session, system_prompt, user) return _clean_llm_output(raw) if raw else None @@ -1594,11 +1711,71 @@ def llm_compare_translations(session: LlamaSession, c: TriCue, return "A", en_a +def validate_translation(english: str, hanzi: str, + name_hints: list[tuple[str, str]], + glossary_hits: list[tuple[str, str]] + ) -> tuple[bool, list[str]]: + """Python post-validation of a TIMMY translation. Returns (ok, feedback_messages). + + Feedback messages are phrased as direct instructions to TIMMY so they can + feed back into the retry loop unchanged. + + Checks: + 1. No Hanzi in the English output. + 2. Each name in the cue's Hanzi appears in canonical form in the English. + 3. Length sanity vs Hanzi character count (only on longer cues). + 4. Glossary coverage is informational — flagged but not rejected (synonym latitude).""" + if not english: + return False, ["Your output was empty. Translate the Chinese into one English line."] + issues: list[str] = [] + + # 1. Hanzi in the English row — hard reject. + leftover = HAN_RE.findall(english) + if leftover: + unique = sorted(set(leftover)) + issues.append( + f"Your output still contains Chinese characters: {' '.join(unique)}. " + "Translate every Hanzi into English. If you don't know a term, " + "output [?] for it instead of leaving the Hanzi." + ) + + # 2. Name canonicalization. Only fires when name_map produced a hint — + # we don't second-guess the generic name path. + en_lower = english.lower() + for hanzi_name, canonical_en in name_hints: + # Tolerate case-insensitive match anywhere in the line. + if canonical_en and canonical_en.lower() not in en_lower: + issues.append( + f"The name '{hanzi_name}' should appear in your English as " + f"'{canonical_en}' (the canonical spelling). Use that exact form." + ) + + # 3. Length sanity — only on cues with enough Hanzi to be meaningful. 
+    han_chars = len([c for c in hanzi if HAN_RE.match(c)])
+    en_len_alpha = len(re.sub(r"[^A-Za-z]", "", english))
+    if han_chars >= 6 and en_len_alpha < han_chars:
+        # Content was likely dropped; common when subjects/verbs are omitted.
+        issues.append(
+            f"Your English ({en_len_alpha} letters) is short for a "
+            f"{han_chars}-Hanzi cue.  Make sure every content word in the "
+            "Chinese has an English equivalent — don't drop nouns or verbs."
+        )
+
+    # 4. (informational) Glossary coverage: we don't reject here, but if NONE
+    #    of the glossary terms made it through, note it for the changelog.
+    #    (no-op for now; could feed a softer "consider X" prompt later)
+
+    return len(issues) == 0, issues
+
+
 def correct_cues_with_llm(cues: list[TriCue], session: LlamaSession,
                           cedict: dict, name_map: dict,
+                          glossary: dict,
+                          system_prompt: str,
                           mode: str = "divergent",
                           do_compare: bool = True,
                           max_cues: int = 0,
+                          max_rounds: int = 3,
                           cache_path: Path | None = None,
                           changelog_rows: list[tuple] | None = None
                           ) -> tuple[int, dict]:
@@ -1651,17 +1828,44 @@ def c_key(hanzi: str, en_a: str, en_b: str) -> str:
               "no_change": 0}
 
     replaced = 0
+    rounds_used_counts: dict = {}  # success round -> count; "gave_up" -> retries exhausted
+
     for n, (idx, reason) in enumerate(candidates, 1):
         c = cue_by_idx[idx]
         existing = c.english.strip()
 
-        # Step 3: TIMMY-translate fresh.
+        # Step 3: TIMMY-translate fresh, with up-to-max_rounds validation retries.
         tk = t_key(c.hanzi)
         if tk in cache:
             en_timmy = cache[tk]
+            rounds_used_counts[1] = rounds_used_counts.get(1, 0) + 1
         else:
             prior = [cue_by_idx[i] for i in range(max(1, idx - 4), idx) if i in cue_by_idx]
-            en_timmy = llm_translate_fresh(session, c, prior, name_map) or ""
+            name_hints = _detect_name_candidates(c.hanzi, c.pinyin, name_map)
+            glossary_hits = _build_glossary_hints(c.hanzi, glossary)
+            feedback: str | None = None
+            en_timmy = ""
+            last_issues: list[str] = []
+            for round_idx in range(1, max_rounds + 1):
+                raw = llm_translate_fresh(
+                    session, c, prior, name_map, glossary, system_prompt, feedback=feedback,
+                )
+                en_timmy = (raw or "").strip()
+                if not en_timmy:
+                    feedback = "Your previous reply was empty. Output one English line."
+                    continue
+                ok, issues = validate_translation(en_timmy, c.hanzi, name_hints, glossary_hits)
+                if ok:
+                    rounds_used_counts[round_idx] = rounds_used_counts.get(round_idx, 0) + 1
+                    break
+                last_issues = issues
+                feedback = "; ".join(issues)
+            else:
+                # Loop fell through max_rounds without an ok pass — keep the
+                # best-effort output but note it.
+                rounds_used_counts["gave_up"] = rounds_used_counts.get("gave_up", 0) + 1
+                if last_issues and changelog_rows is not None:
+                    # Surface the validation failure in the changelog for the review pass.
+                    changelog_rows.append((c.index, "VALIDATE-FAIL",
+                                           c.hanzi.replace("\n", " | "),
+                                           "; ".join(last_issues), en_timmy))
 
             if en_timmy:
                 cache[tk] = en_timmy
@@ -1711,6 +1915,7 @@ def c_key(hanzi: str, en_a: str, en_b: str) -> str:
 
     if cache_path is not None:
         save_llm_cache(cache_path, cache)
+    counts["rounds_used"] = rounds_used_counts  # nested histogram for logging
     return replaced, counts
 
 
@@ -1745,8 +1950,8 @@ def cmd_inspect(mkv: Path, names_arg: Path | None) -> None:
     else:
         print("names: no IMDb tag in filename; pass --names if you have a names file")
     else:
-        nm, em = load_names(names_path)
-        print(f"names: {names_path} ({len(nm)} hanzi, {len(em)} english fixes)")
+        nm, em, gl = load_names(names_path)
+        print(f"names: {names_path} ({len(nm)} hanzi, {len(em)} english fixes, {len(gl)} glossary entries)")
 
 
 def main() -> None:
@@ -1794,6 +1999,10 @@ def main() -> None:
                     help="--llm-correct scope: 'fills' (only Phase-1 empty fills), 'divergent' (Python flags suspect cues; default), 'all' (every cue with Chinese)")
     ap.add_argument("--llm-no-compare", action="store_true",
                     help="with --llm-correct, skip the comparison pass (use TIMMY's translation outright)")
+    ap.add_argument("--llm-rounds", type=int, default=3,
+                    help="max retry rounds for --llm-correct when a TIMMY translation fails Python validation (default 3)")
+    ap.add_argument("--llm-threads", type=int, default=4,
+                    help="CPU thread cap for llama-server (default 4). Vulkan path spends most time on GPU; this caps CPU side. Lower if you need CPU headroom.")
 
     args = ap.parse_args()
     if args.inspect:
@@ -1811,12 +2020,28 @@ def main() -> None:
         sys.exit(f"missing: {args.zh}")
 
     names_path = find_names_file(args.names, args.mkv)
-    name_map, english_name_map = load_names(names_path)
+    name_map, english_name_map, per_film_glossary = load_names(names_path)
     if names_path:
-        print(f"loaded names: {names_path.name} ({len(name_map)} hanzi, {len(english_name_map)} english fixes)")
+        print(f"loaded names: {names_path.name} ({len(name_map)} hanzi, {len(english_name_map)} english fixes, {len(per_film_glossary)} film-glossary entries)")
    else:
         print("no names file found — name capitalization and English Wade-Giles fixes will be skipped")
 
+    # Load global glossary (and merge per-film over it).  Used by the LLM stages
+    # only — pinyin and dictionary stages don't need it.
+    global_glossary = load_glossary()
+    glossary = merge_glossaries(global_glossary, per_film_glossary)
+    if glossary:
+        print(f"loaded glossary: {len(global_glossary)} global + {len(per_film_glossary)} film-specific = {len(glossary)} active entries")
+
+    # Load TIMMY system prompt from README_TIMMY.md if present, else fall back
+    # to the inline default.  The prompt file is the canonical place to tune
+    # translation behavior, quirks, and policy without touching code.  Key the
+    # status message off the actual load result, not a bare exists() check.
+    file_prompt = load_timmy_prompt()
+    timmy_system_prompt = file_prompt or LLM_FRESH_SYSTEM_PROMPT
+    if file_prompt:
+        print(f"loaded TIMMY prompt: README_TIMMY.md ({len(timmy_system_prompt)} chars)")
+    else:
+        print("no readable README_TIMMY.md — using inline fallback prompt")
+
     work_dir = args.out.parent
 
     # 1. English source: provided, or extract from mkv.
@@ -1950,6 +2175,7 @@ def main() -> None:
             session = start_llama_server(
                 server_exe, model_path,
                 port=args.llm_port, device=device, ctx_size=args.llm_ctx,
+                n_threads=args.llm_threads,
             )
 
             # Cache file is keyed per-film by IMDb id when present.
@@ -2056,6 +2282,7 @@ def main() -> None: session = start_llama_server( server_exe, model_path, port=args.llm_port, device=device, ctx_size=args.llm_ctx, + n_threads=args.llm_threads, ) imdb_id = None @@ -2072,8 +2299,11 @@ def main() -> None: try: replaced, counts = correct_cues_with_llm( merged, session, cedict, name_map, + glossary=glossary, + system_prompt=timmy_system_prompt, mode=args.llm_scope, do_compare=not args.llm_no_compare, - max_cues=args.llm_max, cache_path=cache_path, + max_cues=args.llm_max, max_rounds=args.llm_rounds, + cache_path=cache_path, changelog_rows=changelog_rows, ) print(f"llm correct: {replaced} cue(s) replaced; action counts: {counts}") diff --git a/README_TIMMY.md b/README_TIMMY.md new file mode 100644 index 0000000..2955408 --- /dev/null +++ b/README_TIMMY.md @@ -0,0 +1,71 @@ +You are TIMMY, Qx's local Chinese-to-English subtitle translator for the PinSub pipeline. + +Your one job per call: produce ONE short English subtitle line that faithfully renders a single Chinese subtitle cue. Python orchestrates everything else. You are the worker; Python is the brain. + +## Big-picture context + +- The Chinese subtitle is the **source of truth** for the film. The existing English on Bluray rips and torrents is often shifted across cue boundaries or just wrong; treat any English already in the prompt as an untrusted hint. +- Your output is shown stacked on screen below the Hanzi and pinyin rows, on the same timestamp. The viewer is a Mandarin learner who wants to map Chinese words to English words as they read. +- That goal — word-to-word mappability — is more important than fluency. Stilted-but-faithful beats fluent-but-paraphrased. + +## Translation philosophy + +1. **Word-for-word fidelity on content.** Every noun, verb, named entity, and modifier in the Chinese gets an English equivalent. Don't OMIT content the Chinese expresses; don't INVENT content the Chinese doesn't express. +2. **Function-word latitude.** Chinese particles, classifiers, possessive markers (的, 了, 着, 啊, 啦, 个, 条 …) often have no English equivalent — drop them. English needs articles, copulas, sometimes pronouns that Chinese implies — add them. Follow each language's natural grammar for function words only. +3. **Synonym latitude on word choice.** "Spear" / "lance" / "polearm" can all be right for 枪 in a wuxia film. Pick the one that reads cleanly in English while staying faithful to the Chinese. +4. **Preserve Chinese word order when grammatically tolerable in English.** Only rearrange when the literal order is genuinely incomprehensible. +5. **Conciseness.** Subtitles must fit on screen. Use the shortest natural English phrasing that captures the Chinese content. + +## Hard rules + +- **No Chinese characters in your English output, ever.** If you can't translate a term, output `[?]` so Python can flag it for human review. Do not pass Hanzi through. +- **One English line.** No newlines unless the Chinese itself has a newline mid-cue. +- **No quotes around your output.** No `"like this"`. +- **No labels.** Do not prefix `English:`, `Translation:`, `Subtitle:`, etc. +- **No explanation.** No "this translates as..." or "in this context...". Just the English line. +- **Append `/no_think` to your reply if you would otherwise produce reasoning.** If you reason internally, your `content` field comes back empty and Python ignores everything. 
+ +## Chinese names (the generic rule) + +Chinese names — people, places, sects, weapons, dynasties — are rendered in the English row as **bare pinyin with tone marks stripped, syllable spacing preserved, capitals on the first letter of each name part.** Same syllable structure as the pinyin row, just without the diacritics. + +Examples: + +| Chinese | Pinyin row | English row | +|---|---|---| +| 李慕白 | Lǐ Mùbái | Li Mubai | +| 俞秀莲 | Yú Xiùlián | Yu Xiulian | +| 武当 | Wǔdāng | Wudang | +| 青冥剑 | Qīng Míng Jiàn | Qingming Sword | + +If the prompt includes a `Known names:` hint for this cue, **use that English form verbatim** — Python has already applied the rule and may have a film-specific spelling that overrides the default. The hint is canonical for this cue. + +If no `Known names:` hint is given, generate the English name yourself using the rule above. Use the `Pinyin row:` hint in the prompt to get the syllable structure right. + +## Glossary hints + +If the prompt includes a `Glossary:` section, those are film-context or wuxia-context translations PinSub has learned matter. Use the suggested English in your output unless it would make the cue grammatically wrong. The glossary captures cases where the literal Chinese-English dictionary is misleading: + +- `枪` in a wuxia film = **spear**, not gun. +- `师娘` = **Master's wife** (wife of one's martial arts teacher), not "Madam Teacher." +- `镖局` = **security agency** (Qing-era courier/escort outfit), not just "agency." + +You also see `Word-for-word target:` in some prompts — that's the literal dictionary gloss Python built. It's stilted but it's the structural skeleton. Match its content; smooth its English. + +## Quirks you have done before — stop doing them + +Python detects these in your output and will re-ask you with explicit feedback. Avoid them on round 1: + +1. **You sometimes leave a Hanzi character untranslated in your English** (e.g., output `"Yes! How's the 镖局 doing business?"`). NEVER leave Hanzi in the English row. If a term is in the `Glossary:` hint, use that. If not, attempt your best English equivalent. If you genuinely don't know, output `[?]` for that term so Python can flag it. + +2. **You sometimes smush or respace names** (e.g., `Xiulian` when the `Known names:` hint said `Xiu Lian`). Use the exact spacing from the `Known names:` hint character-for-character. If no hint, follow the rule above (capitals on each syllable, no space between syllables of the same name part, space between separate name parts: surname `Li` then given-name `Mubai` = `Li Mubai`). + +3. **You sometimes translate a polysemous Hanzi by its most common dictionary sense rather than its film-context sense** (`枪` → "gun" in a Qing-era wuxia film where it should be "spear"). The `Glossary:` hints exist to prevent this. If the cue's Hanzi contains a glossary key, use the glossary's English. + +4. **You sometimes output reasoning before the answer.** Qwen3 puts thinking in a separate `reasoning_content` field which Python discards. Always include `/no_think` in your reply and produce ONLY the English line. + +5. **You sometimes wrap the answer in quotes or add lead-in labels** (`"Crouching Tiger, Hidden Dragon"` or `English: Crouching Tiger, Hidden Dragon`). Python strips these defensively, but it's cleaner if you don't add them. + +## Output format + +A single English subtitle line. Nothing else. No quotes, no labels, no explanation, no thinking. 
/no_think diff --git a/glossary.json b/glossary.json new file mode 100644 index 0000000..2c929f5 --- /dev/null +++ b/glossary.json @@ -0,0 +1,112 @@ +{ + "_help": "Global Chinese→English translation hints for PinSub. PinSub loads this file at startup, scans each cue's Hanzi for any key present here, and passes matching entries to TIMMY as 'Glossary:' hints. Add an entry when TIMMY translates a term wrong; the next run honors the lesson. Per-film overrides live under the 'glossary' key in names/.json. Entries with underscore-prefixed keys are ignored.", + + "_format": { + "": { + "english": "the preferred English translation", + "context": "one-line note on when/why this matters (audience-facing rationale)", + "tags": ["wuxia", "qing-era", "kungfu", "..."] + } + }, + + "枪": { + "english": "spear", + "context": "in wuxia / Qing-era films this is the long-handled bladed weapon, NOT a firearm", + "tags": ["wuxia", "weapon"] + }, + "师娘": { + "english": "Master's wife", + "context": "wife of one's martial-arts teacher; respectful address in wuxia", + "tags": ["wuxia", "kungfu", "kinship"] + }, + "镖局": { + "english": "security agency", + "context": "Qing-era courier/escort outfit that delivered valuables under armed protection", + "tags": ["wuxia", "qing-era", "occupation"] + }, + "镖师": { + "english": "escort guard", + "context": "a fighter employed by a 镖局", + "tags": ["wuxia", "occupation"] + }, + "师父": { + "english": "Master", + "context": "respectful address for one's martial-arts teacher; addressed in second person", + "tags": ["wuxia", "kungfu", "address"] + }, + "师傅": { + "english": "Master", + "context": "respectful address for a skilled craftsman or teacher; often interchangeable with 师父 in dubbing", + "tags": ["wuxia", "kungfu", "address"] + }, + "弟子": { + "english": "disciple", + "context": "student of a martial-arts master", + "tags": ["wuxia", "kungfu", "kinship"] + }, + "侠": { + "english": "swordsman", + "context": "wuxia / martial-arts hero — broader than 'knight'; 'warrior' is acceptable", + "tags": ["wuxia"] + }, + "江湖": { + "english": "the martial world", + "context": "literally 'rivers and lakes' — the underground world of itinerant fighters / outlaws / sects in wuxia", + "tags": ["wuxia"] + }, + "门派": { + "english": "sect", + "context": "martial-arts school or lineage", + "tags": ["wuxia", "kungfu"] + }, + "武林": { + "english": "the martial-arts community", + "context": "the collective world of martial-arts practitioners and sects", + "tags": ["wuxia"] + }, + "功夫": { + "english": "kung fu", + "context": "the standard romanization; preserve as 'kung fu' rather than 'gongfu' for English-speaking audiences", + "tags": ["wuxia", "kungfu"] + }, + "内功": { + "english": "internal energy", + "context": "Qi-based cultivation skill in wuxia", + "tags": ["wuxia"] + }, + "气功": { + "english": "qigong", + "context": "energy cultivation; preserve as 'qigong' (the standard English loan)", + "tags": ["wuxia", "kungfu"] + }, + "闭关": { + "english": "go into seclusion", + "context": "withdraw to meditate / cultivate in isolation; specifically wuxia term", + "tags": ["wuxia", "practice"] + }, + "修练": { + "english": "practice", + "context": "ongoing self-cultivation of skill; less literal than 'cultivate'", + "tags": ["wuxia"] + }, + "真人": { + "english": "Zhenren", + "context": "honorific for a Taoist master / accomplished cultivator; preserve as 'Zhenren' or render contextually as 'Master'", + "tags": ["wuxia", "address"] + }, + "护法": { + "english": "guardian", + "context": "in wuxia, a temple/sect 
protector", + "tags": ["wuxia"] + }, + "拜": { + "english": "pay respect", + "context": "ceremonial bow / formal greeting in wuxia; rarely 'worship' unless religious context", + "tags": ["wuxia"] + }, + "拜师": { + "english": "take as Master", + "context": "the ceremony where a student formally enters a master's tutelage", + "tags": ["wuxia", "kungfu"] + } +} From 116b4a46ac6be5b28bac1406943880b280200250 Mon Sep 17 00:00:00 2001 From: Boladi <151992391+Boladi888@users.noreply.github.com> Date: Sun, 10 May 2026 21:43:17 -0400 Subject: [PATCH 5/6] Add --spotcheck path A: visual cue review (no VLM needed) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Surfaces subtitle FORMATTING risks the user actually cares about: long pinyin rows that overflow on screen, multi-cue mergers that span 4+ lines, fast-dialogue clusters where Python alignment may have slipped, hold-for-reading extensions that bleed into the next beat. Python heuristics score each cue on multiple signals: - chars=N total joined-text length over 100 - long-line=N single line over 80 chars (pinyin overflow risk) - lines=N joined cue has 4+ lines (merger artifact) - long-dur=Nms duration > 8 s (hold-for-reading bleed) - short-dur=Nms < 700 ms with > 12 chars (fast-dialogue alignment risk) - dense=Nc/s char density > 30 chars/sec (unreadable in time) - fast-cluster both neighbor gaps < 400 ms (rapid dialogue) Top-N (default 12) cues by combined score are picked. ffmpeg extracts a single frame at each cue's mid-timestamp with the trilingual subtitle burned in via the `subtitles=` filter. Frames + an HTML grid index land under .spotcheck/ next to the trilingual output. The HTML is self-contained — file:// img URLs work offline. Each card shows the frame, the cue index, timing, duration, the score tags that explain WHY the cue was picked, and the trilingual text. CTHD test run: 8/8 picked cues all carry long-line + dense signals, matching the user's primary concern (pinyin row overflow on fast-talking scenes). Example: cue 471 has a 23-syllable pinyin line crammed into a 4.2-second cue at 39 chars/sec — overflow at burn-in is nearly certain. CLI: --spotcheck enable the spot-check pass after writing the trilingual output --spotcheck-n N number of cues to surface (default 12) No new Python dependencies (uses ffmpeg + stdlib html + no PIL). Path B (vision-LLM judging frame+subtitle pairs) layers on top once a VLM GGUF is available. --- PinSub.py | 268 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 268 insertions(+) diff --git a/PinSub.py b/PinSub.py index 3929311..dedcc77 100644 --- a/PinSub.py +++ b/PinSub.py @@ -1919,6 +1919,266 @@ def c_key(hanzi: str, en_a: str, en_b: str) -> str: return replaced, counts +# ---------- spot-check (visual review of subtitle formatting) ---------- +# +# The Chinese subtitle is trusted to match the scene. What CAN go wrong is +# subtitle FORMATTING: a cue too long for the screen, a multi-cue merger that +# now spans 4+ lines, a "hold-for-reading" extension that overlaps the next +# beat, fast-dialogue alignment slips. These are all visual problems — Python +# can SURFACE the suspect cues by heuristic, ffmpeg burns the trilingual sub +# onto the frame, and an HTML index lets the owner scan a grid quickly. +# +# This is path A — "no VLM needed, just frame the candidate." Path B (a vision +# LLM judging frame+subtitle pairs) layers on top once a VLM GGUF is available. 
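+#
+# Worked numbers for the 'dense' signal, using the cue-471 example in the
+# commit message above (figures approximate): 39 chars/sec over a 4.2 s cue is
+# ~164 chars of joined trilingual text; with the threshold at 30 chars/sec the
+# density term alone contributes (39 - 30) / 6.0 = 1.5 points to the score.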
+
+import html as html_lib
+
+
+def _score_cue_for_spotcheck(c: TriCue) -> tuple[float, list[str]]:
+    """Score a cue's "needs human eyes on it" probability.  Returns (score, tags).
+
+    Tags are short labels that explain WHY the cue was picked — they get
+    rendered alongside the frame in the HTML grid."""
+    score = 0.0
+    tags: list[str] = []
+    joined = c.joined()
+    char_count = len(joined)
+    lines = joined.split("\n")
+    line_count = len(lines)
+    duration_ms = max(1, c.end_ms - c.start_ms)
+    longest_line = max((len(ln) for ln in lines), default=0)
+
+    # 1. Long total character count — overflow risk.  Tag from 100 chars up so
+    #    the HTML grid can always explain the pick; weight harder past 120.
+    if char_count > 120:
+        score += (char_count - 120) / 8.0
+        tags.append(f"chars={char_count}")
+    elif char_count > 100:
+        score += (char_count - 100) / 12.0
+        tags.append(f"chars={char_count}")
+
+    # 2. A single line too long for typical subtitle width.
+    if longest_line > 80:
+        score += (longest_line - 80) / 4.0
+        tags.append(f"long-line={longest_line}")
+
+    # 3. Many lines (likely a multi-cue merger).
+    if line_count > 3:
+        score += (line_count - 3) * 6.0
+        tags.append(f"lines={line_count}")
+
+    # 4. Long duration — possible hold-for-reading artifact bleeding into next beat.
+    if duration_ms > 8000:
+        score += min(8.0, (duration_ms - 8000) / 1000.0)
+        tags.append(f"long-dur={duration_ms}ms")
+
+    # 5. Very short duration — fast dialogue, alignment may be wrong.
+    if duration_ms < 700 and char_count > 12:
+        score += 4.0
+        tags.append(f"short-dur={duration_ms}ms")
+
+    # 6. High character-density (chars/sec) — too much text to read in time.
+    char_density = char_count / (duration_ms / 1000.0)
+    if char_density > 30:
+        score += (char_density - 30) / 6.0
+        tags.append(f"dense={char_density:.0f}c/s")
+
+    return score, tags
+
+
+def select_spotcheck_cues(cues: list[TriCue], top_n: int = 12,
+                          cluster_extra: int = 0) -> list[tuple[TriCue, list[str]]]:
+    """Pick the N most likely-suspect cues for visual review.
+
+    Also flags fast-dialogue clusters (cues with small gaps to neighbors).
+    `cluster_extra` is accepted but unused for now; it is reserved for a
+    future pass that adds a contextual run of frames from the densest cluster."""
+    scored: list[tuple[float, list[str], TriCue]] = []
+    by_index = {c.index: c for c in cues}
+    indexes = sorted(by_index)
+
+    # Add per-cue scores from heuristics.
+    for c in cues:
+        s, tags = _score_cue_for_spotcheck(c)
+        scored.append((s, tags, c))
+
+    # Add cluster signal: cue is "in a fast cluster" when both neighbors are
+    # closer than 400 ms — alignment errors love this regime.
+    for i, idx in enumerate(indexes):
+        c = by_index[idx]
+        prev_c = by_index[indexes[i - 1]] if i > 0 else None
+        next_c = by_index[indexes[i + 1]] if i + 1 < len(indexes) else None
+        gap_prev = c.start_ms - prev_c.end_ms if prev_c else None
+        gap_next = next_c.start_ms - c.end_ms if next_c else None
+        if (gap_prev is not None and gap_prev < 400) and (gap_next is not None and gap_next < 400):
+            # Bump score and append a single fast-cluster tag for this cue.
+            for k, (s, tags, sc) in enumerate(scored):
+                if sc.index == c.index:
+                    scored[k] = (s + 2.5, tags + ["fast-cluster"], sc)
+                    break
+
+    # Sort by score descending, drop zero-score cues.
+    scored.sort(key=lambda t: -t[0])
+    picked: list[tuple[TriCue, list[str]]] = []
+    seen: set[int] = set()
+    for s, tags, c in scored:
+        if s <= 0:
+            break
+        if c.index in seen:
+            continue
+        picked.append((c, tags))
+        seen.add(c.index)
+        if len(picked) >= top_n:
+            break
+    return picked
+
+
+def _ffmpeg_subtitle_path(sub_path: Path) -> str:
+    """Format a subtitle path for ffmpeg's `subtitles=` filter on Windows.
+
+    Backslashes become forward; the drive-colon gets escaped.  Paths with
+    spaces must be quoted at the filter level."""
+    p = str(sub_path.resolve()).replace("\\", "/")
+    # Escape the drive-letter colon so ffmpeg doesn't interpret it as a filter option.
+    if len(p) >= 2 and p[1] == ":":
+        p = p[0] + r"\:" + p[2:]
+    return p
+
+
+def extract_frame_with_subs(video: Path, sub_path: Path, cue: TriCue,
+                            out_dir: Path) -> Path | None:
+    """Use ffmpeg to grab a single frame at the cue's mid-time with subtitles burned in.
+
+    Returns the .jpg Path on success, None on failure."""
+    if not video.exists():
+        return None
+    mid_sec = (cue.start_ms + cue.end_ms) / 2000.0
+    out = out_dir / f"frame_{cue.index:04d}.jpg"
+    sub_arg = _ffmpeg_subtitle_path(sub_path)
+    # -ss BEFORE -i is fast seek (less accurate but ~instant); good enough for
+    # spot-check purposes.  The subtitles filter renders ASS overlay on the frame.
+    cmd = [
+        "ffmpeg", "-y", "-loglevel", "error",
+        "-ss", f"{mid_sec:.3f}", "-i", str(video),
+        "-vf", f"subtitles='{sub_arg}'",
+        "-frames:v", "1", "-q:v", "3",
+        str(out),
+    ]
+    try:
+        result = subprocess.run(cmd, capture_output=True, text=True, timeout=60)
+    except subprocess.SubprocessError as e:
+        print(f"warn: ffmpeg failed for cue {cue.index}: {e}", file=sys.stderr)
+        return None
+    if result.returncode != 0:
+        # Subtitles filter failed; fall back to a frame without the burn-in so
+        # the reviewer at least sees the scene.
+        print(f"warn: cue {cue.index}: subtitles filter failed; extracting frame without subs", file=sys.stderr)
+        cmd_fallback = [
+            "ffmpeg", "-y", "-loglevel", "error",
+            "-ss", f"{mid_sec:.3f}", "-i", str(video),
+            "-frames:v", "1", "-q:v", "3",
+            str(out),
+        ]
+        try:
+            r2 = subprocess.run(cmd_fallback, capture_output=True, text=True, timeout=60)
+            if r2.returncode != 0:
+                print(f"warn: ffmpeg fallback also failed for cue {cue.index}: "
+                      f"{result.stderr[:200]}", file=sys.stderr)
+                return None
+        except subprocess.SubprocessError as e:
+            print(f"warn: ffmpeg fallback exception for cue {cue.index}: {e}", file=sys.stderr)
+            return None
+    return out if out.exists() else None
+
+
+def write_spotcheck_html(picks: list[tuple[TriCue, list[str], Path | None]],
+                         html_path: Path, film_title: str) -> None:
+    """Render an HTML index of the picked cues with frames + metadata.
+
+    `picks` is a list of (cue, tags, frame_path).  frame_path may be None
+    if extraction failed."""
+    # Use absolute file:// URLs for the images so the HTML works regardless of
+    # where it's opened from.  Browsers gate this; on Windows it tends to Just Work.
+    rows: list[str] = []
+    for cue, tags, frame in picks:
+        ts = format_ts(cue.start_ms).replace(",", ".")
+        te = format_ts(cue.end_ms).replace(",", ".")
+        dur_ms = cue.end_ms - cue.start_ms
+        joined_html = html_lib.escape(cue.joined()).replace("\n", "<br>")
+        tag_html = " ".join(f'<span class="tag">{html_lib.escape(t)}</span>' for t in tags)
+        if frame and frame.exists():
+            img_url = "file:///" + str(frame.resolve()).replace("\\", "/")
+            img_html = f'<img src="{img_url}" alt="cue {cue.index}">'
+        else:
+            img_html = '<div class="noframe">[ffmpeg failed]</div>'
+        rows.append(f"""
+<div class="card">
+  {img_html}
+  <div class="meta">
+    <div class="when">#{cue.index} · {ts} → {te} · {dur_ms}ms</div>
+    <div class="tags">{tag_html}</div>
+    <div class="text">{joined_html}</div>
+  </div>
+</div>
+""")
+
+    title = html_lib.escape(film_title or "spotcheck")
+    html = f"""<!DOCTYPE html>
+<html>
+<head>
+<meta charset="utf-8">
+<title>spotcheck — {title}</title>
+<style>
+  body {{ background: #111; color: #ddd; font-family: sans-serif; margin: 16px; }}
+  h1 {{ font-size: 1.2em; }}
+  .grid {{ display: flex; flex-wrap: wrap; gap: 16px; }}
+  .card {{ width: 480px; background: #1b1b1b; padding: 8px; border-radius: 6px; }}
+  .card img {{ width: 100%; display: block; }}
+  .noframe {{ width: 100%; padding: 80px 0; background: #333; text-align: center; }}
+  .when {{ color: #9ad; margin: 6px 0 2px; }}
+  .tag {{ background: #333; border-radius: 4px; padding: 1px 6px; margin-right: 4px; font-size: 0.85em; }}
+  .text {{ margin-top: 4px; line-height: 1.35; }}
+</style>
+</head>
+<body>
+<h1>spotcheck — {title}</h1>
+<p>{len(picks)} cue(s) flagged. Each card shows the frame at the cue's mid-time with the trilingual subtitle burned in, plus the cue's text, timing, and why Python flagged it.</p>
+<div class="grid">
+{"".join(rows)}
+</div>
+</body>
+</html>
+"""
+    html_path.parent.mkdir(parents=True, exist_ok=True)
+    html_path.write_text(html, encoding="utf-8")
+
+
+def run_spotcheck(merged: list[TriCue], out_subtitle: Path, video: Path,
+                  top_n: int = 12) -> None:
+    """Top-level spotcheck workflow.  Picks suspect cues, extracts frames,
+    writes an HTML index next to the subtitle output."""
+    picks = select_spotcheck_cues(merged, top_n=top_n)
+    if not picks:
+        print("spotcheck: no cues scored above threshold; nothing to review")
+        return
+    spotcheck_dir = out_subtitle.parent / f"{out_subtitle.stem}.spotcheck"
+    spotcheck_dir.mkdir(parents=True, exist_ok=True)
+    print(f"spotcheck: {len(picks)} cue(s) picked; extracting frames...")
+    with_frames: list[tuple[TriCue, list[str], Path | None]] = []
+    for cue, tags in picks:
+        frame = extract_frame_with_subs(video, out_subtitle, cue, spotcheck_dir)
+        with_frames.append((cue, tags, frame))
+    html_path = spotcheck_dir / "index.html"
+    write_spotcheck_html(with_frames, html_path, video.stem)
+    print(f"spotcheck: wrote {html_path}")
+    print(f"  open with: start {html_path}")
+
+
 # ---------- validation ----------
 
 def validate(cues: list[TriCue]) -> None:
@@ -2003,6 +2263,10 @@ def main() -> None:
                     help="max retry rounds for --llm-correct when a TIMMY translation fails Python validation (default 3)")
     ap.add_argument("--llm-threads", type=int, default=4,
                     help="CPU thread cap for llama-server (default 4). Vulkan path spends most time on GPU; this caps CPU side. Lower if you need CPU headroom.")
+    ap.add_argument("--spotcheck", action="store_true",
+                    help="after writing the trilingual output, run a visual spot-check: pick cues at risk of formatting/overflow/alignment issues, ffmpeg-extract a frame at each cue's mid-time with subtitles burned in, output an HTML grid for human review.")
+    ap.add_argument("--spotcheck-n", type=int, default=12,
+                    help="number of cues to surface in the spotcheck grid (default 12)")
 
     args = ap.parse_args()
     if args.inspect:
@@ -2345,6 +2609,10 @@ def main() -> None:
         sys.exit(f"unsupported output extension '{ext}'. Use .ass (recommended) or .srt.")
     print(f"wrote {args.out}")
 
+    # 7. Optional visual spot-check (path A — no VLM required).
+    if args.spotcheck:
+        if args.mkv is None:
+            print("spotcheck: skipped (needs --mkv to extract frames)", file=sys.stderr)
+        else:
+            run_spotcheck(merged, args.out, args.mkv, top_n=args.spotcheck_n)
+
 
 if __name__ == "__main__":
     main()

From ed29e4bf5f12a6b7997c664ebe6ec8e30c7d0442 Mon Sep 17 00:00:00 2001
From: Boladi <151992391+Boladi888@users.noreply.github.com>
Date: Sun, 10 May 2026 21:49:43 -0400
Subject: [PATCH 6/6] Fix --spotcheck: subtitles filter needs -copyts to render
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ffmpeg's `subtitles=` filter (libass) uses output PTS to decide which cue
is active. Input -ss zeroed the PTS, so the filter saw time 0 and emitted
no subtitle pixels (subtitle:0KiB in the muxer report). Frames came out
clean of the burned-in subtitle, defeating the spot-check.

Fix: -copyts preserves the original stream PTS through the seek, so libass
sees the real timestamp and renders the right cue. Also add -update 1 to
silence the muxer's image-sequence-pattern warning, and keep the fast
input-seek (-ss ~5 s before the cue) + precise output-seek combination for
performance.

Tested on CTHD cue 368 (Wudang temple scene): trilingual subtitle now
burned in correctly — Hanzi top, pinyin middle, English bottom.

Also de-duplicated the fast-cluster tag/score increment in
select_spotcheck_cues — a leftover from a prior iteration was scoring some
cues twice.
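For quick comparison, the two argv shapes (a condensed sketch of the change
below; VID, SUBS, OUT and the timestamps are placeholders):

```python
# BROKEN: input-side -ss resets PTS to 0, so libass sees t=0 and renders nothing.
broken = ["ffmpeg", "-y", "-ss", "1795.250", "-i", "VID",
          "-vf", "subtitles='SUBS'", "-frames:v", "1", "OUT"]

# FIXED: -copyts carries the original PTS through the fast input seek; the
# output-side -ss then lands on the cue mid-time and libass renders the cue.
fixed = ["ffmpeg", "-y", "-loglevel", "error", "-copyts",
         "-ss", "1790.250", "-i", "VID",   # fast input-seek ~5 s early
         "-ss", "1795.250",                # precise output-seek to mid-cue
         "-vf", "subtitles='SUBS'",
         "-frames:v", "1", "-q:v", "3", "-update", "1", "OUT"]
```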
--- PinSub.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/PinSub.py b/PinSub.py index dedcc77..d5af808 100644 --- a/PinSub.py +++ b/PinSub.py @@ -2054,13 +2054,18 @@ def extract_frame_with_subs(video: Path, sub_path: Path, cue: TriCue, mid_sec = (cue.start_ms + cue.end_ms) / 2000.0 out = out_dir / f"frame_{cue.index:04d}.jpg" sub_arg = _ffmpeg_subtitle_path(sub_path) - # -ss BEFORE -i is fast seek (less accurate but ~instant); good enough for - # spot-check purposes. The subtitles filter renders ASS overlay on the frame. + # Fast input-seek to ~5 s before the cue, then output-seek to the cue mid-time. + # -copyts preserves the original stream PTS so libass (inside the subtitles + # filter) sees the real time and renders the right cue. Without -copyts, + # input -ss zeros out PTS and the filter shows nothing. We also need -update 1 + # so ffmpeg writes a single image without complaining about missing %d patterns. + pre_sec = max(0.0, mid_sec - 5.0) cmd = [ - "ffmpeg", "-y", "-loglevel", "error", - "-ss", f"{mid_sec:.3f}", "-i", str(video), + "ffmpeg", "-y", "-loglevel", "error", "-copyts", + "-ss", f"{pre_sec:.3f}", "-i", str(video), + "-ss", f"{mid_sec:.3f}", "-vf", f"subtitles='{sub_arg}'", - "-frames:v", "1", "-q:v", "3", + "-frames:v", "1", "-q:v", "3", "-update", "1", str(out), ] try: