diff --git a/README.md b/README.md index 6aae1d0a..33f0b98e 100644 --- a/README.md +++ b/README.md @@ -923,6 +923,21 @@ the `[defaults]` section of your configuration file. │ provide context for │ │ LLM cleanup. │ ╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ Live Preview ─────────────────────────────────────────────────────────────────────────╮ +│ --live-preview-log PATH Write rolling live transcription │ +│ preview events to JSONL while │ +│ recording. │ +│ --live-preview-interval FLOAT Seconds between live preview │ +│ retranscriptions. │ +│ [default: 2.0] │ +│ --live-preview-window FLOAT Seconds of recent audio to include │ +│ in each live preview │ +│ retranscription. │ +│ [default: 15.0] │ +│ --live-preview-console,--live-previ… Print rolling live transcription │ +│ preview updates to the terminal │ +│ while recording. │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ ╭─ Diarization ──────────────────────────────────────────────────────────────────────────╮ │ --diarize --no-diarize Enable speaker │ │ diarization │ diff --git a/agent_cli/agents/transcribe.py b/agent_cli/agents/transcribe.py index 49ec8381..f47a6756 100644 --- a/agent_cli/agents/transcribe.py +++ b/agent_cli/agents/transcribe.py @@ -365,13 +365,20 @@ async def _async_main( # noqa: PLR0912, PLR0915, C901 emit_output: bool = True, raise_diarization_errors: bool = False, audio_level_callback: Callable[[bytes], None] | None = None, + live_preview_log: Path | None = None, + live_preview_interval: float = 2.0, + live_preview_window: float = 15.0, + live_preview_console: bool = False, ) -> TranscriptResult: """Unified async entry point for both live and file-based transcription.""" start_time = time.monotonic() transcript: str | None saved_recording_path: Path | None = None + live_preview_console_active = ( + live_preview_console and audio_file_path is None and provider_cfg.asr_provider == "wyoming" + ) - with 
maybe_live(not general_cfg.quiet) as live: + with maybe_live(not general_cfg.quiet and not live_preview_console_active) as live: if audio_file_path: # File-based transcription # Determine if we can use native format support (skip PCM conversion) @@ -447,6 +454,17 @@ def _set_saved_recording_path(path: Path) -> None: openai_asr_cfg, gemini_asr_cfg, ) + live_preview_config = ( + asr.LivePreviewConfig( + log_file=live_preview_log, + interval_seconds=live_preview_interval, + window_seconds=live_preview_window, + console=live_preview_console, + ) + if (live_preview_log or live_preview_console) + and provider_cfg.asr_provider == "wyoming" + else None + ) transcript = await live_transcriber( logger=LOGGER, stop_event=stop_event, @@ -456,6 +474,7 @@ def _set_saved_recording_path(path: Path) -> None: extra_instructions=extra_instructions, recording_path_callback=_set_saved_recording_path, audio_level_callback=audio_level_callback, + live_preview_config=live_preview_config, ) elapsed = time.monotonic() - start_time @@ -650,6 +669,10 @@ def transcribe( # noqa: PLR0912, PLR0911, PLR0915, C901 print_args: bool = opts.PRINT_ARGS, transcription_log: Path | None = opts.TRANSCRIPTION_LOG, voice_level_log: Path | None = opts.VOICE_LEVEL_LOG, + live_preview_log: Path | None = opts.LIVE_PREVIEW_LOG, + live_preview_interval: float = opts.LIVE_PREVIEW_INTERVAL, + live_preview_window: float = opts.LIVE_PREVIEW_WINDOW, + live_preview_console: bool = opts.LIVE_PREVIEW_CONSOLE, # --- Diarization Options --- diarize: bool = opts.DIARIZE, diarize_format: opts.DiarizeFormat = opts.DIARIZE_FORMAT, @@ -697,18 +720,24 @@ def transcribe( # noqa: PLR0912, PLR0911, PLR0915, C901 setup_logging(log_level, log_file, quiet=effective_quiet) + enroll_speakers = _option_default(enroll_speakers) + identify_speakers = _option_default(identify_speakers) + remember_unknown_speakers = _option_default(remember_unknown_speakers) + speaker_profiles_file = _option_default(speaker_profiles_file) + 
speaker_match_threshold = _option_default(speaker_match_threshold) + live_preview_log = _option_default(live_preview_log) + live_preview_interval = _option_default(live_preview_interval) + live_preview_window = _option_default(live_preview_window) + live_preview_console = _option_default(live_preview_console) + # Expand user path for transcription log if transcription_log: transcription_log = transcription_log.expanduser() voice_level_log = _option_default(voice_level_log) if voice_level_log: voice_level_log = voice_level_log.expanduser() - - enroll_speakers = _option_default(enroll_speakers) - identify_speakers = _option_default(identify_speakers) - remember_unknown_speakers = _option_default(remember_unknown_speakers) - speaker_profiles_file = _option_default(speaker_profiles_file) - speaker_match_threshold = _option_default(speaker_match_threshold) + if live_preview_log: + live_preview_log = live_preview_log.expanduser() # Validate diarization options if not diarize and (enroll_speakers or remember_unknown_speakers): @@ -850,6 +879,10 @@ def transcribe( # noqa: PLR0912, PLR0911, PLR0915, C901 diarization_cfg=diarization_cfg, emit_output=not json_output, raise_diarization_errors=diarize, + live_preview_log=live_preview_log, + live_preview_interval=live_preview_interval, + live_preview_window=live_preview_window, + live_preview_console=live_preview_console, ), ) except ImportError as exc: @@ -942,6 +975,10 @@ def transcribe( # noqa: PLR0912, PLR0911, PLR0915, C901 audio_level_callback=audio_level_writer.write_chunk if audio_level_writer else None, + live_preview_log=live_preview_log, + live_preview_interval=live_preview_interval, + live_preview_window=live_preview_window, + live_preview_console=live_preview_console, ), ) except ImportError as exc: diff --git a/agent_cli/opts.py b/agent_cli/opts.py index 1ed42b83..363d09ae 100644 --- a/agent_cli/opts.py +++ b/agent_cli/opts.py @@ -436,6 +436,31 @@ def _conf_callback(ctx: typer.Context, param: typer.CallbackParam, 
value: str) - hidden=True, rich_help_panel="General Options", ) +LIVE_PREVIEW_LOG: Path | None = typer.Option( + None, + "--live-preview-log", + help="Write rolling live transcription preview events to JSONL while recording.", + rich_help_panel="Live Preview", +) +LIVE_PREVIEW_INTERVAL: float = typer.Option( + 2.0, + "--live-preview-interval", + help="Seconds between live preview retranscriptions.", + rich_help_panel="Live Preview", +) +LIVE_PREVIEW_WINDOW: float = typer.Option( + 15.0, + "--live-preview-window", + help="Seconds of recent audio to include in each live preview retranscription.", + rich_help_panel="Live Preview", +) +LIVE_PREVIEW_CONSOLE: bool = typer.Option( + False, # noqa: FBT003 + "--live-preview-console", + "--live-preview-stdout", + help="Print rolling live transcription preview updates to the terminal while recording.", + rich_help_panel="Live Preview", +) # --- Server Options --- SERVER_HOST: str = typer.Option( diff --git a/agent_cli/services/asr.py b/agent_cli/services/asr.py index f37e16e6..fd36dc7a 100644 --- a/agent_cli/services/asr.py +++ b/agent_cli/services/asr.py @@ -4,7 +4,10 @@ import asyncio import io +import json import wave +from contextlib import suppress +from dataclasses import dataclass from datetime import UTC, datetime from functools import partial from pathlib import Path @@ -18,7 +21,7 @@ setup_input_stream, ) from agent_cli.core.audio_format import check_ffmpeg_available, convert_audio_to_wyoming_format -from agent_cli.core.utils import manage_send_receive_tasks +from agent_cli.core.utils import err_console, manage_send_receive_tasks from agent_cli.services import ( transcribe_audio_gemini, transcribe_audio_openai, @@ -37,6 +40,176 @@ from agent_cli.core.utils import InteractiveStopEvent +@dataclass(frozen=True) +class LivePreviewConfig: + """Configuration for rolling transcription previews.""" + + log_file: Path | None = None + interval_seconds: float = 2.0 + window_seconds: float = 15.0 + min_audio_seconds: float = 
1.0 + console: bool = False + + @property + def max_audio_bytes(self) -> int: + """Maximum number of PCM bytes to keep in the rolling preview window.""" + return int( + self.window_seconds + * constants.AUDIO_RATE + * constants.AUDIO_CHANNELS + * constants.AUDIO_FORMAT_WIDTH, + ) + + @property + def min_audio_bytes(self) -> int: + """Minimum number of PCM bytes before attempting a preview.""" + return int( + self.min_audio_seconds + * constants.AUDIO_RATE + * constants.AUDIO_CHANNELS + * constants.AUDIO_FORMAT_WIDTH, + ) + + +def _write_live_preview_event( + log_file: Path, + *, + event_type: str, + revision: int, + text: str, +) -> None: + """Append one live preview event to a JSONL log.""" + entry = { + "timestamp": datetime.now(UTC).isoformat(), + "type": event_type, + "revision": revision, + "text": text, + "is_final": event_type == "final", + } + log_file.parent.mkdir(parents=True, exist_ok=True) + with log_file.open("a", encoding="utf-8") as f: + f.write(json.dumps(entry, ensure_ascii=False) + "\n") + + +def _print_live_preview_event( + *, + event_type: str, + revision: int, + text: str, +) -> None: + """Print one live preview event to the terminal.""" + label = "final" if event_type == "final" else f"live #{revision}" + err_console.print(f"[dim]{label}:[/dim] {text}") + + +class LivePreviewStreamer: + """Periodically transcribe a rolling audio window and write preview events.""" + + def __init__( + self, + preview_config: LivePreviewConfig, + *, + wyoming_asr_cfg: config.WyomingASR, + logger: logging.Logger, + extra_instructions: str | None = None, + ) -> None: + """Initialize the preview streamer.""" + self.config = preview_config + self.wyoming_asr_cfg = wyoming_asr_cfg + self.logger = logger + self.extra_instructions = extra_instructions + self._audio = bytearray() + self._lock = asyncio.Lock() + self._stop_event = asyncio.Event() + self._revision = 0 + self._last_text = "" + + def reset_log(self) -> None: + """Clear the preview log for a new recording 
session.""" + if self.config.log_file is None: + return + self.config.log_file.parent.mkdir(parents=True, exist_ok=True) + self.config.log_file.write_text("", encoding="utf-8") + + async def add_chunk(self, chunk: bytes) -> None: + """Add a microphone audio chunk to the rolling preview buffer.""" + async with self._lock: + self._audio.extend(chunk) + max_bytes = max(0, self.config.max_audio_bytes) + if max_bytes and len(self._audio) > max_bytes: + del self._audio[: len(self._audio) - max_bytes] + + async def run(self) -> None: + """Run the periodic preview loop until stopped.""" + while not self._stop_event.is_set(): + try: + await asyncio.wait_for(self._stop_event.wait(), self.config.interval_seconds) + break + except TimeoutError: + pass + + try: + await self.emit_partial() + except asyncio.CancelledError: + raise + except Exception: + self.logger.exception("Live transcription preview failed") + + async def emit_partial(self) -> None: + """Transcribe the current rolling window and write a partial if changed.""" + snapshot = await self._audio_snapshot() + if len(snapshot) < self.config.min_audio_bytes: + return + + text = await _transcribe_recorded_audio_wyoming( + audio_data=snapshot, + wyoming_asr_cfg=self.wyoming_asr_cfg, + logger=self.logger, + quiet=True, + extra_instructions=self.extra_instructions, + ) + text = text.strip() + if not text or text == self._last_text: + return + if self._stop_event.is_set(): + return + + self._revision += 1 + self._last_text = text + self._publish_event(event_type="partial", revision=self._revision, text=text) + + def request_stop(self) -> None: + """Stop partial preview emission without writing the final transcript.""" + self._stop_event.set() + + async def stop(self, final_text: str | None = None) -> None: + """Stop previewing and optionally append the final transcript.""" + self.request_stop() + final_text = (final_text or "").strip() + if final_text: + self._revision += 1 + self._publish_event(event_type="final", 
revision=self._revision, text=final_text) + + def _publish_event(self, *, event_type: str, revision: int, text: str) -> None: + if self.config.log_file is not None: + _write_live_preview_event( + self.config.log_file, + event_type=event_type, + revision=revision, + text=text, + ) + if self.config.console: + _print_live_preview_event( + event_type=event_type, + revision=revision, + text=text, + ) + + async def _audio_snapshot(self) -> bytes: + async with self._lock: + return bytes(self._audio) + + def _get_transcriptions_dir() -> Path: """Get the directory for storing transcription recordings.""" config_dir = Path.home() / ".config" / "agent-cli" / "transcriptions" @@ -247,6 +420,7 @@ async def _send_audio( initial_prompt: str | None = None, recording_path_callback: Callable[[Path], None] | None = None, audio_level_callback: Callable[[bytes], None] | None = None, + live_preview_callback: Callable[[bytes], Awaitable[None]] | None = None, ) -> None: """Read from mic and send to Wyoming server.""" from wyoming.asr import Transcribe # noqa: PLC0415 @@ -272,6 +446,8 @@ async def send_chunk(chunk: bytes) -> None: chunk, logger, ) + if live_preview_callback is not None: + await live_preview_callback(chunk) await client.write_event(AudioChunk(audio=chunk, **constants.WYOMING_AUDIO_CONFIG).event()) try: @@ -482,9 +658,22 @@ async def _transcribe_live_audio_wyoming( extra_instructions: str | None = None, recording_path_callback: Callable[[Path], None] | None = None, audio_level_callback: Callable[[bytes], None] | None = None, + live_preview_config: LivePreviewConfig | None = None, **_kwargs: object, ) -> str | None: """Unified ASR transcription function.""" + live_preview = ( + LivePreviewStreamer( + live_preview_config, + wyoming_asr_cfg=wyoming_asr_cfg, + logger=logger, + extra_instructions=extra_instructions, + ) + if live_preview_config is not None + else None + ) + live_preview_task: asyncio.Task[None] | None = None + final_transcript: str | None = None try: async with 
wyoming_client_context( wyoming_asr_cfg.asr_wyoming_ip, @@ -500,6 +689,9 @@ async def _transcribe_live_audio_wyoming( stream_config = setup_input_stream(audio_input_cfg.input_device_index) with open_audio_stream(stream_config) as stream: + if live_preview is not None: + live_preview.reset_log() + live_preview_task = asyncio.create_task(live_preview.run()) _, recv_task = await manage_send_receive_tasks( _send_audio( client, @@ -512,6 +704,7 @@ async def _transcribe_live_audio_wyoming( initial_prompt=effective_prompt, recording_path_callback=recording_path_callback, audio_level_callback=audio_level_callback, + live_preview_callback=live_preview.add_chunk if live_preview else None, ), _receive_transcript( client, @@ -521,10 +714,21 @@ async def _transcribe_live_audio_wyoming( ), return_when=asyncio.ALL_COMPLETED, ) - return recv_task.result() + result = recv_task.result() + final_transcript = result + return result except (ConnectionRefusedError, Exception): logger.warning("Failed to connect to Wyoming ASR server") return None + finally: + if live_preview is not None: + live_preview.request_stop() + if live_preview_task is not None: + live_preview_task.cancel() + with suppress(asyncio.CancelledError): + await live_preview_task + if live_preview is not None: + await live_preview.stop(final_transcript) async def _transcribe_live_audio_buffered( diff --git a/docs/commands/transcribe.md b/docs/commands/transcribe.md index 3a58bfa5..1002bc90 100644 --- a/docs/commands/transcribe.md +++ b/docs/commands/transcribe.md @@ -187,6 +187,15 @@ The `--from-file` option supports multiple audio formats: | `--print-args` | `false` | Print the command line arguments, including variables taken from the configuration file. | | `--transcription-log` | - | Append transcripts to JSONL file (timestamp, hostname, model, raw/processed text). Recent entries provide context for LLM cleanup. 
| +### Live Preview + +| Option | Default | Description | +|--------|---------|-------------| +| `--live-preview-log` | - | Write rolling live transcription preview events to JSONL while recording. | +| `--live-preview-interval` | `2.0` | Seconds between live preview retranscriptions. | +| `--live-preview-window` | `15.0` | Seconds of recent audio to include in each live preview retranscription. | +| `--live-preview-console, --live-preview-stdout` | `false` | Print rolling live transcription preview updates to the terminal while recording. | + ### Diarization | Option | Default | Description | @@ -227,6 +236,19 @@ agent-cli transcribe --toggle cmd + shift + r : /path/to/agent-cli transcribe --toggle --input-device-index 1 ``` +### Live Preview + +Use live preview options when another UI should display provisional transcription text during a recording. The transcriber periodically reprocesses the most recent audio window, so preview text can rewrite earlier words until the final transcription is ready. + +```bash +agent-cli transcribe --toggle \ + --live-preview-log ~/.config/agent-cli/live-preview.jsonl \ + --live-preview-interval 1 \ + --live-preview-window 10 +``` + +For terminal testing, add `--live-preview-console` to print each rolling update while recording. The macOS menu bar app uses the JSONL log when **Show Live Transcription Preview** is enabled in Settings; that setting is off by default. + ### Transcription Log Log all transcriptions with timestamps: diff --git a/docs/installation/macos-app.md b/docs/installation/macos-app.md index 02f780b0..13824571 100644 --- a/docs/installation/macos-app.md +++ b/docs/installation/macos-app.md @@ -45,6 +45,12 @@ The first transcription can take longer because AgentCLI installs the private CL Open **Settings...** from the menu bar app to change shortcuts, enable **Start at Login**, or switch runtime modes. 
+## Live Transcription Preview + +The menu bar app can show provisional transcription text above the recording meter while you speak. Enable **Show Live Transcription Preview** in **Settings...** to turn it on. + +This setting is off by default. When enabled, AgentCLI writes rolling preview events to `~/.config/agent-cli/live-preview.jsonl` and updates the overlay during toggle or hold-to-transcribe recordings. Preview text is best-effort and may revise earlier words as more audio arrives; the final transcript is still produced and inserted after recording stops. + ## Runtime Modes By default, the app manages its own private `agent-cli` install so the menu bar workflow is zero-config and does not depend on your shell PATH. diff --git a/macos/AgentCLI/Sources/AgentCLI/AgentCommand.swift b/macos/AgentCLI/Sources/AgentCLI/AgentCommand.swift index 895e5429..4bf408bc 100644 --- a/macos/AgentCLI/Sources/AgentCLI/AgentCommand.swift +++ b/macos/AgentCLI/Sources/AgentCLI/AgentCommand.swift @@ -19,6 +19,7 @@ struct AgentCommand { let forceBootstrap: Bool let bootstrapRequirement: AgentBootstrapRequirement let showsRecordingIndicator: Bool + let supportsLivePreviewOverlay: Bool let startNotificationTitle: String? let startNotificationBody: String? let finishNotificationTitle: String? @@ -32,6 +33,7 @@ struct AgentCommand { forceBootstrap: Bool = false, bootstrapRequirement: AgentBootstrapRequirement = .cliRuntime, showsRecordingIndicator: Bool = false, + supportsLivePreviewOverlay: Bool = false, startNotificationTitle: String? = nil, startNotificationBody: String? = nil, finishNotificationTitle: String? 
= nil @@ -44,6 +46,7 @@ struct AgentCommand { self.forceBootstrap = forceBootstrap self.bootstrapRequirement = bootstrapRequirement self.showsRecordingIndicator = showsRecordingIndicator + self.supportsLivePreviewOverlay = supportsLivePreviewOverlay self.startNotificationTitle = startNotificationTitle self.startNotificationBody = startNotificationBody self.finishNotificationTitle = finishNotificationTitle @@ -51,18 +54,32 @@ struct AgentCommand { func resolvedArguments( extraInstructions: String?, + livePreviewOverlayEnabled: Bool = TranscriptionSettings.isLivePreviewOverlayEnabled(), transcriptionDaemonArguments: [String]? = nil ) -> [String] { if appliesTranscriptionDaemonSettings { return transcriptionDaemonArguments ?? arguments } - guard appliesTranscriptionExtraInstructions else { return arguments } + var resolved = arguments + + if supportsLivePreviewOverlay && livePreviewOverlayEnabled { + resolved += [ + "--live-preview-log", + LiveTranscriptionPreview.defaultLogPath, + "--live-preview-interval", + "1", + "--live-preview-window", + "10", + ] + } + + guard appliesTranscriptionExtraInstructions else { return resolved } let trimmedInstructions = extraInstructions?.trimmingCharacters(in: .whitespacesAndNewlines) ?? "" - guard !trimmedInstructions.isVisiblyBlank else { return arguments } + guard !trimmedInstructions.isVisiblyBlank else { return resolved } - return arguments + ["--extra-instructions", trimmedInstructions] + return resolved + ["--extra-instructions", trimmedInstructions] } var menuActivityTitle: String { @@ -84,6 +101,7 @@ struct AgentCommand { appliesTranscriptionExtraInstructions: true, bootstrapRequirement: .transcription, showsRecordingIndicator: true, + supportsLivePreviewOverlay: true, startNotificationTitle: "Transcription Started", startNotificationBody: "Recording audio. 
Toggle transcription again to stop and transcribe.", finishNotificationTitle: "Transcription Finished" diff --git a/macos/AgentCLI/Sources/AgentCLI/AgentRuntime.swift b/macos/AgentCLI/Sources/AgentCLI/AgentRuntime.swift index fa9faead..bd0779d1 100644 --- a/macos/AgentCLI/Sources/AgentCLI/AgentRuntime.swift +++ b/macos/AgentCLI/Sources/AgentCLI/AgentRuntime.swift @@ -156,6 +156,17 @@ struct AgentRuntime { } func runSelfTestIfRequested() { + if CommandLine.arguments.contains("--agentcli-live-preview-self-test") { + do { + try runLivePreviewSelfTest() + print("AgentCLI live preview self-test ok") + exit(0) + } catch { + print("AgentCLI live preview self-test failed: \(error.localizedDescription)") + exit(1) + } + } + if CommandLine.arguments.contains("--agentcli-self-test") { do { try prepareDirectories() @@ -260,6 +271,73 @@ struct AgentRuntime { } } + private func runLivePreviewSelfTest() throws { + let defaultArguments = AgentCommand.toggleTranscription.arguments + guard !defaultArguments.contains("--live-preview-log"), + !defaultArguments.contains("--live-preview-interval"), + !defaultArguments.contains("--live-preview-window") else { + throw Self.selfTestError("toggle transcription command enables live preview by default") + } + let previewArguments = AgentCommand.toggleTranscription.resolvedArguments( + extraInstructions: nil, + livePreviewOverlayEnabled: true + ) + guard Self.argumentsContainOption( + previewArguments, + "--live-preview-log", + value: LiveTranscriptionPreview.defaultLogPath + ), + Self.argumentsContainOption(previewArguments, "--live-preview-interval", value: "1"), + Self.argumentsContainOption(previewArguments, "--live-preview-window", value: "10") else { + throw Self.selfTestError("toggle transcription command does not enable live preview when requested") + } + + let preview = LiveTranscriptionPreview.shared + let logURL = fileManager.temporaryDirectory + .appendingPathComponent("agent-cli-live-preview-\(UUID().uuidString).jsonl") + defer { 
import Combine
import Foundation

/// Tails the live-preview JSONL log and publishes the most recent preview text.
///
/// The log is polled on a timer. Each poll reads only the bytes appended since
/// the previous poll, splits them into lines, and publishes the text of the
/// last line that is a `partial` or `final` event.
final class LiveTranscriptionPreview: ObservableObject {
    static let shared = LiveTranscriptionPreview()
    static let defaultLogPath = "~/.config/agent-cli/live-preview.jsonl"
    static var defaultLogURL: URL {
        URL(fileURLWithPath: NSString(string: defaultLogPath).expandingTildeInPath)
    }

    /// Latest preview text; empty while nothing has been received.
    @Published private(set) var text = ""

    private static let pollInterval: TimeInterval = 0.25
    private static let lineFeed = UInt8(ascii: "\n")
    private var timer: Timer?
    private var logURL: URL?
    /// Number of bytes of the current log file that were already consumed.
    private var readOffset: UInt64 = 0
    /// File-system number of the file `readOffset` refers to.
    private var fileNumber: UInt64?
    /// Bytes after the last line feed seen so far (an unterminated line).
    private var pendingData = Data()

    private init() {}

    /// Starts a new preview session: empties the log, clears the published
    /// text and begins polling.
    func start(logURL: URL = defaultLogURL) {
        stop()
        self.logURL = logURL
        try? FileManager.default.createDirectory(
            at: logURL.deletingLastPathComponent(),
            withIntermediateDirectories: true
        )
        try? "".write(to: logURL, atomically: true, encoding: .utf8)
        poll()

        let timer = Timer(timeInterval: Self.pollInterval, repeats: true) { [weak self] _ in
            Task { @MainActor in
                self?.poll()
            }
        }
        RunLoop.main.add(timer, forMode: .common)
        self.timer = timer
    }

    /// Stops polling and clears all state, including the published text.
    func stop() {
        timer?.invalidate()
        timer = nil
        logURL = nil
        resetReadState()
        text = ""
    }

    /// Reads newly appended log data and publishes the newest preview text.
    func poll() {
        guard let logURL,
              let newData = readNewData(from: logURL) else {
            return
        }

        // Work on bytes, not on decoded strings: a read may end in the middle
        // of a multi-byte UTF-8 character, which must be kept for the next
        // poll instead of failing the decode and losing the chunk.
        var buffer = pendingData
        buffer.append(newData)

        // Split on LF only. The writer may emit other Unicode line separators
        // unescaped inside a JSON string, and they must not end a record.
        let segments = buffer.split(separator: Self.lineFeed, omittingEmptySubsequences: false)
        pendingData = segments.last.map { Data($0) } ?? Data()

        // The unterminated tail is parsed too, so a log whose last line has no
        // trailing line feed still updates the preview. An incomplete line is
        // not valid JSON and is skipped until the rest arrives.
        let latestText = segments
            .compactMap { Self.previewText(from: String(decoding: $0, as: UTF8.self)) }
            .last
        guard let latestText, latestText != text else {
            return
        }
        text = latestText
    }

    private func resetReadState() {
        readOffset = 0
        fileNumber = nil
        pendingData = Data()
    }

    /// Returns the bytes appended to the log since the last call, or `nil`
    /// when there is nothing new or the file cannot be read.
    private func readNewData(from logURL: URL) -> Data? {
        do {
            let attributes = try FileManager.default.attributesOfItem(atPath: logURL.path)
            let fileSize = (attributes[.size] as? NSNumber)?.uint64Value ?? 0
            let currentFileNumber = (attributes[.systemFileNumber] as? NSNumber)?.uint64Value

            // Start over when the log was truncated or replaced by another
            // file; the size alone misses a replacement that is at least as
            // large as the old offset.
            if currentFileNumber != fileNumber || readOffset > fileSize {
                resetReadState()
                fileNumber = currentFileNumber
            }
            guard fileSize > readOffset else { return nil }

            let handle = try FileHandle(forReadingFrom: logURL)
            defer {
                try? handle.close()
            }
            try handle.seek(toOffset: readOffset)
            guard let data = try handle.readToEnd(), !data.isEmpty else { return nil }
            readOffset += UInt64(data.count)
            return data
        } catch {
            return nil
        }
    }

    /// Extracts the trimmed preview text from one JSONL line.
    ///
    /// Returns `nil` when the line is not a JSON object, its `type` is neither
    /// `partial` nor `final`, or its `text` is missing or blank.
    static func previewText(from line: String) -> String? {
        let trimmedLine = line.trimmingCharacters(in: .whitespacesAndNewlines)
        guard !trimmedLine.isEmpty,
              let data = trimmedLine.data(using: .utf8),
              let object = try? JSONSerialization.jsonObject(with: data) as? [String: Any],
              let type = object["type"] as? String,
              type == "partial" || type == "final",
              let text = object["text"] as? String else {
            return nil
        }

        let trimmedText = text.trimmingCharacters(in: .whitespacesAndNewlines)
        return trimmedText.isEmpty ? nil : trimmedText
    }
}
82d4663b..a893c9d4 100644 --- a/macos/AgentCLI/Sources/AgentCLI/Shortcuts.swift +++ b/macos/AgentCLI/Sources/AgentCLI/Shortcuts.swift @@ -281,6 +281,8 @@ struct SettingsView: View { private var useUserInstalledAgentCLI = false @AppStorage(RecordingSoundSettings.enabledKey) private var recordingSoundsEnabled = false + @AppStorage(TranscriptionSettings.livePreviewOverlayEnabledKey) + private var livePreviewOverlayEnabled = false @AppStorage(TranscriptionSettings.transcriptionBackendKey) private var transcriptionBackend = TranscriptionBackend.whisper.rawValue @AppStorage(TranscriptionSettings.transcriptionModelKey) @@ -315,10 +317,11 @@ struct SettingsView: View { Toggle("Use User-Installed agent-cli", isOn: $useUserInstalledAgentCLI) Toggle("Play Recording Sounds", isOn: $recordingSoundsEnabled) + Toggle("Show Live Transcription Preview", isOn: $livePreviewOverlayEnabled) } header: { Text("General") } footer: { - Text("Runs the agent-cli found on PATH with your normal config instead of the app's private bundled-uv runtime. Recording sounds use Frog when recording starts and Purr when recording ends.") + Text("Runs the agent-cli found on PATH with your normal config instead of the app's private bundled-uv runtime. 
Live preview shows provisional transcription text above the recording meter.") } Section { diff --git a/macos/AgentCLI/Sources/AgentCLI/TranscriptionSettings.swift b/macos/AgentCLI/Sources/AgentCLI/TranscriptionSettings.swift index e651308a..1d643d4b 100644 --- a/macos/AgentCLI/Sources/AgentCLI/TranscriptionSettings.swift +++ b/macos/AgentCLI/Sources/AgentCLI/TranscriptionSettings.swift @@ -59,6 +59,7 @@ enum TranscriptionBackend: String, CaseIterable, Identifiable { enum TranscriptionSettings { static let transcriptionExtraInstructionsKey = "transcriptionExtraInstructions" + static let livePreviewOverlayEnabledKey = "livePreviewOverlayEnabled" static let transcriptionBackendKey = "transcriptionBackend" static let transcriptionModelKey = "transcriptionModel" static let transcriptionModelTTLSecondsKey = "transcriptionModelTTLSeconds" @@ -68,6 +69,10 @@ enum TranscriptionSettings { UserDefaults.standard.string(forKey: transcriptionExtraInstructionsKey) ?? "" } + static func isLivePreviewOverlayEnabled(defaults: UserDefaults = .standard) -> Bool { + defaults.bool(forKey: livePreviewOverlayEnabledKey) + } + static func selectedBackend(userDefaults: UserDefaults = .standard) -> TranscriptionBackend { let rawValue = userDefaults.string(forKey: transcriptionBackendKey) ?? TranscriptionBackend.whisper.rawValue return TranscriptionBackend(rawValue: rawValue) ?? 
.whisper diff --git a/macos/AgentCLI/Sources/AgentCLI/VoiceLevelOverlay.swift b/macos/AgentCLI/Sources/AgentCLI/VoiceLevelOverlay.swift index 4826b8de..e3994a42 100644 --- a/macos/AgentCLI/Sources/AgentCLI/VoiceLevelOverlay.swift +++ b/macos/AgentCLI/Sources/AgentCLI/VoiceLevelOverlay.swift @@ -4,22 +4,78 @@ import SwiftUI enum VoiceLevelOverlayLayout { static let pillSize = CGSize(width: 147, height: 38) + static let textWidth = CGFloat(420) + static let textHeight = CGFloat(86) static let shadowRadius = CGFloat(13) static let shadowYOffset = CGFloat(6) static let horizontalPadding = shadowRadius static let verticalPadding = shadowRadius + abs(shadowYOffset) - static let panelSize = NSSize( + static let contentSpacing = CGFloat(8) + static let compactPanelSize = NSSize( width: pillSize.width + (horizontalPadding * 2), height: pillSize.height + (verticalPadding * 2) ) + static let previewPanelSize = NSSize( + width: textWidth + (horizontalPadding * 2), + height: pillSize.height + textHeight + contentSpacing + (verticalPadding * 2) + ) static let bottomOffset = CGFloat(38) + + static func panelSize(showsPreviewSpace: Bool) -> NSSize { + showsPreviewSpace ? 
previewPanelSize : compactPanelSize + } } struct VoiceLevelOverlayView: View { @Environment(\.colorScheme) private var colorScheme @ObservedObject var meter: VoiceLevelMeter + @ObservedObject var preview: LiveTranscriptionPreview + let showsPreviewSpace: Bool var body: some View { + ZStack(alignment: .bottom) { + VStack(spacing: VoiceLevelOverlayLayout.contentSpacing) { + if showsPreviewSpace && !preview.text.isEmpty { + Text(preview.text) + .font(.system(size: 14, weight: .medium)) + .lineLimit(3) + .multilineTextAlignment(.center) + .foregroundStyle(textColor) + .padding(.horizontal, 14) + .padding(.vertical, 10) + .frame( + width: VoiceLevelOverlayLayout.textWidth, + height: VoiceLevelOverlayLayout.textHeight + ) + .background( + RoundedRectangle(cornerRadius: 8, style: .continuous) + .fill(textBackgroundColor) + ) + .overlay( + RoundedRectangle(cornerRadius: 8, style: .continuous) + .stroke(borderColor, lineWidth: 1) + ) + .shadow( + color: shadowColor, + radius: VoiceLevelOverlayLayout.shadowRadius, + y: VoiceLevelOverlayLayout.shadowYOffset + ) + } + + levelMeter + } + .padding(.horizontal, VoiceLevelOverlayLayout.horizontalPadding) + .padding(.bottom, VoiceLevelOverlayLayout.verticalPadding) + } + .frame( + width: VoiceLevelOverlayLayout.panelSize(showsPreviewSpace: showsPreviewSpace).width, + height: VoiceLevelOverlayLayout.panelSize(showsPreviewSpace: showsPreviewSpace).height, + alignment: .bottom + ) + .accessibilityLabel(Text("Voice level")) + } + + private var levelMeter: some View { HStack(alignment: .center, spacing: 3.5) { ForEach(Array(meter.amplitudes.enumerated()), id: \.offset) { _, amplitude in Capsule() @@ -51,9 +107,6 @@ struct VoiceLevelOverlayView: View { radius: VoiceLevelOverlayLayout.shadowRadius, y: VoiceLevelOverlayLayout.shadowYOffset ) - .padding(.horizontal, VoiceLevelOverlayLayout.horizontalPadding) - .padding(.vertical, VoiceLevelOverlayLayout.verticalPadding) - .accessibilityLabel(Text("Voice level")) } private var isLightMode: 
Bool { @@ -64,6 +117,14 @@ struct VoiceLevelOverlayView: View { isLightMode ? Color.white.opacity(0.88) : Color.black.opacity(0.42) } + private var textBackgroundColor: Color { + isLightMode ? Color.white.opacity(0.94) : Color.black.opacity(0.58) + } + + private var textColor: Color { + isLightMode ? Color.black.opacity(0.86) : Color.white.opacity(0.92) + } + private var borderColor: Color { isLightMode ? Color.black.opacity(0.12) : Color.white.opacity(0.22) } @@ -84,14 +145,19 @@ struct VoiceLevelOverlayView: View { final class VoiceLevelOverlayController { static let shared = VoiceLevelOverlayController() - private let panelSize = VoiceLevelOverlayLayout.panelSize private var panel: NSPanel? + private var showsPreviewSpace = false private init() {} - func show() { + func show(showsPreviewSpace: Bool = false) { let panel = panel ?? makePanel() self.panel = panel + if self.showsPreviewSpace != showsPreviewSpace { + self.showsPreviewSpace = showsPreviewSpace + updatePanelContent(panel) + } + panel.setContentSize(VoiceLevelOverlayLayout.panelSize(showsPreviewSpace: showsPreviewSpace)) position(panel) VoiceLevelMeter.shared.start() panel.orderFrontRegardless() @@ -104,7 +170,10 @@ final class VoiceLevelOverlayController { private func makePanel() -> NSPanel { let panel = NSPanel( - contentRect: NSRect(origin: .zero, size: panelSize), + contentRect: NSRect( + origin: .zero, + size: VoiceLevelOverlayLayout.panelSize(showsPreviewSpace: showsPreviewSpace) + ), styleMask: [.borderless, .nonactivatingPanel], backing: .buffered, defer: false @@ -116,10 +185,18 @@ final class VoiceLevelOverlayController { panel.hasShadow = false panel.ignoresMouseEvents = true panel.collectionBehavior = [.canJoinAllSpaces, .fullScreenAuxiliary, .ignoresCycle] - panel.contentView = NSHostingView(rootView: VoiceLevelOverlayView(meter: VoiceLevelMeter.shared)) + updatePanelContent(panel) return panel } + private func updatePanelContent(_ panel: NSPanel) { + panel.contentView = 
NSHostingView(rootView: VoiceLevelOverlayView( + meter: VoiceLevelMeter.shared, + preview: LiveTranscriptionPreview.shared, + showsPreviewSpace: showsPreviewSpace + )) + } + private func position(_ panel: NSPanel) { guard let screen = NSScreen.main ?? NSScreen.screens.first else { return } let frame = screen.visibleFrame @@ -128,7 +205,7 @@ final class VoiceLevelOverlayController { - VoiceLevelOverlayLayout.verticalPadding panel.setFrameOrigin( NSPoint( - x: frame.midX - panelSize.width / 2, + x: frame.midX - panel.frame.width / 2, y: y ) ) diff --git a/macos/AgentCLI/Tests/AgentCLITests/AgentCommandTests.swift b/macos/AgentCLI/Tests/AgentCLITests/AgentCommandTests.swift index d748e185..c8e4f214 100644 --- a/macos/AgentCLI/Tests/AgentCLITests/AgentCommandTests.swift +++ b/macos/AgentCLI/Tests/AgentCLITests/AgentCommandTests.swift @@ -20,10 +20,35 @@ final class AgentCommandTests: XCTestCase { XCTAssertEqual(AgentCommand.toggleTranscription.bootstrapRequirement, .transcription) } + func testToggleTranscriptionAddsLivePreviewArgumentsWhenEnabled() { + XCTAssertEqual( + AgentCommand.toggleTranscription.resolvedArguments( + extraInstructions: nil, + livePreviewOverlayEnabled: true + ), + [ + "transcribe", + "--toggle", + "--quiet", + "--voice-level-log", + "~/.config/agent-cli/voice-levels.jsonl", + "--transcription-log", + "~/.config/agent-cli/transcriptions.jsonl", + "--live-preview-log", + "~/.config/agent-cli/live-preview.jsonl", + "--live-preview-interval", + "1", + "--live-preview-window", + "10", + ] + ) + } + func testToggleTranscriptionAppendsConfiguredExtraInstructions() { XCTAssertEqual( AgentCommand.toggleTranscription.resolvedArguments( - extraInstructions: " Remember Bas and Henk.\nPrefer project names. " + extraInstructions: " Remember Bas and Henk.\nPrefer project names. 
", + livePreviewOverlayEnabled: true ), [ "transcribe", @@ -33,6 +58,12 @@ final class AgentCommandTests: XCTestCase { "~/.config/agent-cli/voice-levels.jsonl", "--transcription-log", "~/.config/agent-cli/transcriptions.jsonl", + "--live-preview-log", + "~/.config/agent-cli/live-preview.jsonl", + "--live-preview-interval", + "1", + "--live-preview-window", + "10", "--extra-instructions", "Remember Bas and Henk.\nPrefer project names.", ] @@ -41,14 +72,20 @@ final class AgentCommandTests: XCTestCase { func testBlankExtraInstructionsAreIgnored() { XCTAssertEqual( - AgentCommand.toggleTranscription.resolvedArguments(extraInstructions: " \n\t "), + AgentCommand.toggleTranscription.resolvedArguments( + extraInstructions: " \n\t ", + livePreviewOverlayEnabled: false + ), AgentCommand.toggleTranscription.arguments ) } func testVisuallyBlankExtraInstructionsAreIgnored() { XCTAssertEqual( - AgentCommand.toggleTranscription.resolvedArguments(extraInstructions: "\u{2060}\u{FEFF}"), + AgentCommand.toggleTranscription.resolvedArguments( + extraInstructions: "\u{2060}\u{FEFF}", + livePreviewOverlayEnabled: false + ), AgentCommand.toggleTranscription.arguments ) } diff --git a/macos/AgentCLI/Tests/AgentCLITests/LivePreviewSwiftTestingTests.swift b/macos/AgentCLI/Tests/AgentCLITests/LivePreviewSwiftTestingTests.swift new file mode 100644 index 00000000..05d18a38 --- /dev/null +++ b/macos/AgentCLI/Tests/AgentCLITests/LivePreviewSwiftTestingTests.swift @@ -0,0 +1,62 @@ +#if canImport(Testing) +import Foundation +import Testing +@testable import AgentCLI + +@Test +func toggleTranscriptionDisablesLivePreviewArgumentsByDefault() { + #expect(AgentCommand.toggleTranscription.arguments == [ + "transcribe", + "--toggle", + "--quiet", + "--voice-level-log", + "~/.config/agent-cli/voice-levels.jsonl", + "--transcription-log", + "~/.config/agent-cli/transcriptions.jsonl", + ]) +} + +@Test +func toggleTranscriptionEnablesLivePreviewArgumentsWhenConfigured() { + 
#expect(AgentCommand.toggleTranscription.resolvedArguments( + extraInstructions: nil, + livePreviewOverlayEnabled: true + ) == [ + "transcribe", + "--toggle", + "--quiet", + "--voice-level-log", + "~/.config/agent-cli/voice-levels.jsonl", + "--transcription-log", + "~/.config/agent-cli/transcriptions.jsonl", + "--live-preview-log", + "~/.config/agent-cli/live-preview.jsonl", + "--live-preview-interval", + "1", + "--live-preview-window", + "10", + ]) +} + +@Test +func livePreviewParsesLatestJsonlEvent() throws { + let preview = LiveTranscriptionPreview.shared + let logURL = FileManager.default.temporaryDirectory + .appendingPathComponent("agent-cli-live-preview-\(UUID().uuidString).jsonl") + defer { + preview.stop() + try? FileManager.default.removeItem(at: logURL) + } + + preview.start(logURL: logURL) + try [ + #"{"type":"partial","text":"first guess","revision":1}"#, + #"{"type":"partial","text":" corrected guess ","revision":2}"#, + ].joined(separator: "\n").write(to: logURL, atomically: true, encoding: .utf8) + + preview.poll() + + #expect(preview.text == "corrected guess") +} + +#endif diff --git a/macos/AgentCLI/Tests/AgentCLITests/LiveTranscriptionPreviewTests.swift b/macos/AgentCLI/Tests/AgentCLITests/LiveTranscriptionPreviewTests.swift new file mode 100644 index 00000000..cb4e8302 --- /dev/null +++ b/macos/AgentCLI/Tests/AgentCLITests/LiveTranscriptionPreviewTests.swift @@ -0,0 +1,24 @@ +#if canImport(XCTest) +import XCTest +@testable import AgentCLI + +final class LiveTranscriptionPreviewTests: XCTestCase { + func testPreviewTextParsesPartialEvents() { + let line = #"{"type":"partial","text":" hello world ","revision":1}"# + + XCTAssertEqual(LiveTranscriptionPreview.previewText(from: line), "hello world") + } + + func testPreviewTextParsesFinalEvents() { + let line = #"{"type":"final","text":"Final transcript","revision":2}"# + + XCTAssertEqual(LiveTranscriptionPreview.previewText(from: line), "Final transcript") + } + + func 
testPreviewTextIgnoresMalformedAndEmptyEvents() { + XCTAssertNil(LiveTranscriptionPreview.previewText(from: "")) + XCTAssertNil(LiveTranscriptionPreview.previewText(from: #"{"type":"debug","text":"no"}"#)) + XCTAssertNil(LiveTranscriptionPreview.previewText(from: #"{"type":"partial","text":" "}"#)) + } +} +#endif diff --git a/macos/AgentCLI/Tests/AgentCLITests/VoiceLevelMeterTests.swift b/macos/AgentCLI/Tests/AgentCLITests/VoiceLevelMeterTests.swift index e1535fa6..8d970188 100644 --- a/macos/AgentCLI/Tests/AgentCLITests/VoiceLevelMeterTests.swift +++ b/macos/AgentCLI/Tests/AgentCLITests/VoiceLevelMeterTests.swift @@ -61,7 +61,7 @@ final class VoiceLevelMeterTests: XCTestCase { } func testOverlayPanelLeavesRoomForShadowBlur() { - let panelSize = VoiceLevelOverlayLayout.panelSize + let panelSize = VoiceLevelOverlayLayout.panelSize(showsPreviewSpace: false) let pillSize = VoiceLevelOverlayLayout.pillSize let shadowRadius = VoiceLevelOverlayLayout.shadowRadius @@ -77,6 +77,18 @@ final class VoiceLevelMeterTests: XCTestCase { ) } + func testOverlayPanelOnlyExpandsForPreviewSpace() { + let compactSize = VoiceLevelOverlayLayout.panelSize(showsPreviewSpace: false) + let previewSize = VoiceLevelOverlayLayout.panelSize(showsPreviewSpace: true) + + XCTAssertEqual( + compactSize.width, + VoiceLevelOverlayLayout.pillSize.width + (VoiceLevelOverlayLayout.horizontalPadding * 2) + ) + XCTAssertGreaterThan(previewSize.width, compactSize.width) + XCTAssertGreaterThan(previewSize.height, compactSize.height) + } + private static let iso8601: ISO8601DateFormatter = { let formatter = ISO8601DateFormatter() formatter.formatOptions = [.withInternetDateTime] diff --git a/tests/agents/test_transcribe.py b/tests/agents/test_transcribe.py index 70eced1b..b136c6ae 100644 --- a/tests/agents/test_transcribe.py +++ b/tests/agents/test_transcribe.py @@ -5,6 +5,7 @@ import asyncio import json import logging +from contextlib import nullcontext from datetime import UTC, datetime, timedelta from 
pathlib import Path from unittest.mock import AsyncMock, MagicMock, patch @@ -183,6 +184,161 @@ async def test_transcribe_main( mock_wyoming_client_context.assert_called_once() +@pytest.mark.asyncio +@patch("agent_cli.agents.transcribe.signal_handling_context") +@patch("agent_cli.agents.transcribe.maybe_live") +@patch("agent_cli.agents.transcribe.asr.create_transcriber") +async def test_live_preview_console_disables_rich_live_status( + mock_create_transcriber: MagicMock, + mock_maybe_live: MagicMock, + mock_signal_handling_context: MagicMock, +) -> None: + """Console live preview should not be hidden by the Rich live status.""" + mock_maybe_live.return_value = nullcontext(None) + mock_create_transcriber.return_value = AsyncMock(return_value="hello world") + mock_signal_handling_context.return_value.__enter__.return_value = asyncio.Event() + + await transcribe._async_main( + extra_instructions=None, + provider_cfg=config.ProviderSelection( + asr_provider="wyoming", + llm_provider="ollama", + tts_provider="wyoming", + ), + general_cfg=config.General( + log_level="INFO", + log_file=None, + quiet=False, + list_devices=False, + clipboard=False, + ), + audio_in_cfg=config.AudioInput(), + wyoming_asr_cfg=config.WyomingASR(asr_wyoming_ip="localhost", asr_wyoming_port=12345), + openai_asr_cfg=config.OpenAIASR(asr_openai_model="whisper-1"), + gemini_asr_cfg=config.GeminiASR( + asr_gemini_model="gemini-2.0-flash", + gemini_api_key="test-key", + ), + ollama_cfg=config.Ollama(llm_ollama_model="", llm_ollama_host=""), + openai_llm_cfg=config.OpenAILLM(llm_openai_model="", openai_base_url=None), + gemini_llm_cfg=config.GeminiLLM( + llm_gemini_model="gemini-1.5-flash", + gemini_api_key="test-key", + ), + llm_enabled=False, + transcription_log=None, + emit_output=False, + live_preview_console=True, + ) + + mock_maybe_live.assert_called_once_with(False) + + +@pytest.mark.asyncio +@patch("agent_cli.agents.transcribe.signal_handling_context") 
+@patch("agent_cli.agents.transcribe.maybe_live") +@patch("agent_cli.agents.transcribe.asr.create_transcriber") +async def test_live_preview_console_keeps_rich_live_status_for_non_wyoming_provider( + mock_create_transcriber: MagicMock, + mock_maybe_live: MagicMock, + mock_signal_handling_context: MagicMock, +) -> None: + """Console preview should only hide Rich status when preview can run.""" + mock_maybe_live.return_value = nullcontext(None) + mock_create_transcriber.return_value = AsyncMock(return_value="hello world") + mock_signal_handling_context.return_value.__enter__.return_value = asyncio.Event() + + await transcribe._async_main( + extra_instructions=None, + provider_cfg=config.ProviderSelection( + asr_provider="openai", + llm_provider="ollama", + tts_provider="wyoming", + ), + general_cfg=config.General( + log_level="INFO", + log_file=None, + quiet=False, + list_devices=False, + clipboard=False, + ), + audio_in_cfg=config.AudioInput(), + wyoming_asr_cfg=config.WyomingASR(asr_wyoming_ip="localhost", asr_wyoming_port=12345), + openai_asr_cfg=config.OpenAIASR(asr_openai_model="whisper-1"), + gemini_asr_cfg=config.GeminiASR( + asr_gemini_model="gemini-2.0-flash", + gemini_api_key="test-key", + ), + ollama_cfg=config.Ollama(llm_ollama_model="", llm_ollama_host=""), + openai_llm_cfg=config.OpenAILLM(llm_openai_model="", openai_base_url=None), + gemini_llm_cfg=config.GeminiLLM( + llm_gemini_model="gemini-1.5-flash", + gemini_api_key="test-key", + ), + llm_enabled=False, + transcription_log=None, + emit_output=False, + live_preview_console=True, + ) + + mock_maybe_live.assert_called_once_with(True) + mock_create_transcriber.return_value.assert_awaited_once() + assert mock_create_transcriber.return_value.await_args.kwargs["live_preview_config"] is None + + +@pytest.mark.asyncio +@patch("agent_cli.agents.transcribe.create_recorded_audio_transcriber") +@patch("agent_cli.agents.transcribe.load_audio_from_file") +@patch("agent_cli.agents.transcribe.maybe_live") +async 
def test_live_preview_console_keeps_rich_live_status_for_file_transcription( + mock_maybe_live: MagicMock, + mock_load_audio_from_file: MagicMock, + mock_create_recorded_audio_transcriber: MagicMock, + tmp_path: Path, +) -> None: + """File transcription cannot emit console preview, so Rich status stays visible.""" + audio_file = tmp_path / "sample.wav" + audio_file.write_bytes(b"audio") + mock_maybe_live.return_value = nullcontext(None) + mock_load_audio_from_file.return_value = b"audio" + mock_create_recorded_audio_transcriber.return_value = AsyncMock(return_value="hello world") + + await transcribe._async_main( + audio_file_path=audio_file, + extra_instructions=None, + provider_cfg=config.ProviderSelection( + asr_provider="wyoming", + llm_provider="ollama", + tts_provider="wyoming", + ), + general_cfg=config.General( + log_level="INFO", + log_file=None, + quiet=False, + list_devices=False, + clipboard=False, + ), + wyoming_asr_cfg=config.WyomingASR(asr_wyoming_ip="localhost", asr_wyoming_port=12345), + openai_asr_cfg=config.OpenAIASR(asr_openai_model="whisper-1"), + gemini_asr_cfg=config.GeminiASR( + asr_gemini_model="gemini-2.0-flash", + gemini_api_key="test-key", + ), + ollama_cfg=config.Ollama(llm_ollama_model="", llm_ollama_host=""), + openai_llm_cfg=config.OpenAILLM(llm_openai_model="", openai_base_url=None), + gemini_llm_cfg=config.GeminiLLM( + llm_gemini_model="gemini-1.5-flash", + gemini_api_key="test-key", + ), + llm_enabled=False, + transcription_log=None, + emit_output=False, + live_preview_console=True, + ) + + mock_maybe_live.assert_called_once_with(True) + + def test_log_transcription(tmp_path: Path) -> None: """Test the log_transcription function.""" log_file = tmp_path / "test_log.jsonl" diff --git a/tests/agents/test_transcribe_recovery.py b/tests/agents/test_transcribe_recovery.py index d0e582ae..d4cba8fe 100644 --- a/tests/agents/test_transcribe_recovery.py +++ b/tests/agents/test_transcribe_recovery.py @@ -748,9 +748,11 @@ def 
test_transcribe_command_last_recording_disabled( ) with ( - patch("agent_cli.agents.transcribe.asyncio.run") as mock_run, + patch("agent_cli.agents.transcribe._async_main", new_callable=AsyncMock) as mock_async_main, patch("agent_cli.core.process.pid_file_context") as mock_pid_context, ): + mock_async_main.return_value = {} + # Call transcribe with --last-recording disabled (0) transcribe.transcribe( last_recording=0, # Disabled @@ -787,6 +789,10 @@ def test_transcribe_command_last_recording_disabled( config_file=None, print_args=False, transcription_log=None, + live_preview_log=tmp_path / "preview.jsonl", + live_preview_interval=1.0, + live_preview_window=10.0, + live_preview_console=True, diarize=False, diarize_format="inline", hf_token=None, @@ -797,12 +803,14 @@ def test_transcribe_command_last_recording_disabled( ) # Verify _async_main was called for normal recording (not from file) - mock_run.assert_called_once() + mock_async_main.assert_awaited_once() mock_pid_context.assert_called_once_with("transcribe") - call_args = mock_run.call_args[0][0] - # Should be normal recording mode, not file mode - assert call_args.__name__ == "_async_main" - call_args.close() # Avoid "coroutine never awaited" warning + async_main_kwargs = mock_async_main.call_args.kwargs + assert async_main_kwargs.get("audio_file_path") is None + assert async_main_kwargs["live_preview_log"] == tmp_path / "preview.jsonl" + assert async_main_kwargs["live_preview_interval"] == 1.0 + assert async_main_kwargs["live_preview_window"] == 10.0 + assert async_main_kwargs["live_preview_console"] is True def test_transcribe_command_conflicting_options() -> None: diff --git a/tests/test_asr.py b/tests/test_asr.py index 1c4b9c5d..fd04b5c2 100644 --- a/tests/test_asr.py +++ b/tests/test_asr.py @@ -3,7 +3,10 @@ from __future__ import annotations import asyncio +import json import threading +from contextlib import suppress +from typing import TYPE_CHECKING from unittest.mock import AsyncMock, MagicMock, patch 
import pytest @@ -13,6 +16,214 @@ from agent_cli import config from agent_cli.services import asr, transcribe_audio_gemini, transcribe_audio_openai +if TYPE_CHECKING: + from pathlib import Path + + +def test_write_live_preview_event(tmp_path: Path) -> None: + """Test that live preview events are written as JSONL.""" + log_file = tmp_path / "preview.jsonl" + + asr._write_live_preview_event( + log_file, + event_type="partial", + revision=1, + text="hello world", + ) + + entry = json.loads(log_file.read_text().strip()) + assert entry["type"] == "partial" + assert entry["revision"] == 1 + assert entry["text"] == "hello world" + assert entry["is_final"] is False + assert "timestamp" in entry + + +def test_print_live_preview_event(capsys: pytest.CaptureFixture[str]) -> None: + """Test that live preview events can be printed to the terminal.""" + asr._print_live_preview_event( + event_type="partial", + revision=3, + text="hello terminal", + ) + + captured = capsys.readouterr() + assert "live #3:" in captured.err + assert "hello terminal" in captured.err + + +@pytest.mark.asyncio +async def test_live_preview_streamer_emits_unique_partials(tmp_path: Path) -> None: + """Test that rolling previews write changed transcript revisions.""" + log_file = tmp_path / "preview.jsonl" + preview = asr.LivePreviewStreamer( + asr.LivePreviewConfig( + log_file=log_file, + interval_seconds=60, + window_seconds=1, + min_audio_seconds=0, + ), + wyoming_asr_cfg=config.WyomingASR(asr_wyoming_ip="localhost", asr_wyoming_port=10300), + logger=MagicMock(), + ) + preview.reset_log() + + with patch( + "agent_cli.services.asr._transcribe_recorded_audio_wyoming", + new_callable=AsyncMock, + return_value="hello world", + ): + await preview.add_chunk(b"\x00\x00" * 160) + await preview.emit_partial() + await preview.emit_partial() + + entries = [json.loads(line) for line in log_file.read_text().splitlines()] + assert len(entries) == 1 + assert entries[0]["type"] == "partial" + assert entries[0]["text"] 
== "hello world" + + +@pytest.mark.asyncio +async def test_live_preview_streamer_can_emit_console_only( + capsys: pytest.CaptureFixture[str], +) -> None: + """Test that rolling previews can be printed without a log file.""" + preview = asr.LivePreviewStreamer( + asr.LivePreviewConfig( + log_file=None, + interval_seconds=60, + window_seconds=1, + min_audio_seconds=0, + console=True, + ), + wyoming_asr_cfg=config.WyomingASR(asr_wyoming_ip="localhost", asr_wyoming_port=10300), + logger=MagicMock(), + ) + + with patch( + "agent_cli.services.asr._transcribe_recorded_audio_wyoming", + new_callable=AsyncMock, + return_value="hello console", + ): + await preview.add_chunk(b"\x00\x00" * 160) + await preview.emit_partial() + + captured = capsys.readouterr() + assert "live #1:" in captured.err + assert "hello console" in captured.err + + +@pytest.mark.asyncio +async def test_live_preview_streamer_stop_writes_final(tmp_path: Path) -> None: + """Test that the final transcript is written to the preview log.""" + log_file = tmp_path / "preview.jsonl" + preview = asr.LivePreviewStreamer( + asr.LivePreviewConfig(log_file=log_file), + wyoming_asr_cfg=config.WyomingASR(asr_wyoming_ip="localhost", asr_wyoming_port=10300), + logger=MagicMock(), + ) + preview.reset_log() + + await preview.stop("final words") + + entry = json.loads(log_file.read_text().strip()) + assert entry["type"] == "final" + assert entry["text"] == "final words" + assert entry["is_final"] is True + + +@pytest.mark.asyncio +async def test_live_preview_streamer_ignores_partial_after_stop(tmp_path: Path) -> None: + """A stale partial must not be appended after the final transcript.""" + log_file = tmp_path / "preview.jsonl" + preview = asr.LivePreviewStreamer( + asr.LivePreviewConfig(log_file=log_file), + wyoming_asr_cfg=config.WyomingASR(asr_wyoming_ip="localhost", asr_wyoming_port=10300), + logger=MagicMock(), + ) + preview.reset_log() + await preview.add_chunk(b"\x00\x00" * 16_000) + await preview.stop("final 
words") + + with patch( + "agent_cli.services.asr._transcribe_recorded_audio_wyoming", + new_callable=AsyncMock, + return_value="stale partial", + ): + await preview.emit_partial() + + entries = [json.loads(line) for line in log_file.read_text().splitlines()] + assert [entry["type"] for entry in entries] == ["final"] + + +@pytest.mark.asyncio +async def test_live_preview_request_stop_blocks_partial_before_final( + tmp_path: Path, +) -> None: + """Stop signal must block partials even before the final transcript is written.""" + log_file = tmp_path / "preview.jsonl" + preview = asr.LivePreviewStreamer( + asr.LivePreviewConfig(log_file=log_file), + wyoming_asr_cfg=config.WyomingASR(asr_wyoming_ip="localhost", asr_wyoming_port=10300), + logger=MagicMock(), + ) + preview.reset_log() + await preview.add_chunk(b"\x00\x00" * 16_000) + preview.request_stop() + + with patch( + "agent_cli.services.asr._transcribe_recorded_audio_wyoming", + new_callable=AsyncMock, + return_value="stale partial", + ): + await preview.emit_partial() + + await preview.stop("final words") + + entries = [json.loads(line) for line in log_file.read_text().splitlines()] + assert [entry["type"] for entry in entries] == ["final"] + assert entries[0]["text"] == "final words" + + +@pytest.mark.asyncio +async def test_live_preview_run_cancel_drops_resolved_partial_before_final( + tmp_path: Path, +) -> None: + """A resolved preview response must not publish after the run task is canceled.""" + log_file = tmp_path / "preview.jsonl" + preview = asr.LivePreviewStreamer( + asr.LivePreviewConfig(log_file=log_file, interval_seconds=0.01), + wyoming_asr_cfg=config.WyomingASR(asr_wyoming_ip="localhost", asr_wyoming_port=10300), + logger=MagicMock(), + ) + preview.reset_log() + await preview.add_chunk(b"\x00\x00" * 16_000) + + entered_transcription = asyncio.Event() + transcription_result: asyncio.Future[str] = asyncio.Future() + + async def transcribe_after_signal(**_kwargs: object) -> str: + 
entered_transcription.set() + return await transcription_result + + with patch( + "agent_cli.services.asr._transcribe_recorded_audio_wyoming", + side_effect=transcribe_after_signal, + ): + task = asyncio.create_task(preview.run()) + await asyncio.wait_for(entered_transcription.wait(), timeout=1) + transcription_result.set_result("stale partial") + preview.request_stop() + task.cancel() + with suppress(asyncio.CancelledError): + await task + + await preview.stop("final words") + + entries = [json.loads(line) for line in log_file.read_text().splitlines()] + assert [entry["type"] for entry in entries] == ["final"] + assert entries[0]["text"] == "final words" + @pytest.mark.asyncio async def test_send_audio() -> None: