diff --git a/agent_cli/agents/transcribe.py b/agent_cli/agents/transcribe.py index f47a6756..a9bc245e 100644 --- a/agent_cli/agents/transcribe.py +++ b/agent_cli/agents/transcribe.py @@ -65,6 +65,7 @@ class TranscriptResult(TypedDict, total=False): transcript: str | None saved_recording_path: Path | None llm_enabled: bool + error: str | None SYSTEM_PROMPT = """ @@ -343,6 +344,14 @@ def _option_default(value: Any) -> Any: return getattr(value, "default", value) +def _transcript_result_error(result: object) -> str | None: + """Return a structured transcript error when one is present.""" + if not isinstance(result, dict): + return None + error = result.get("error") + return error if isinstance(error, str) and error else None + + async def _async_main( # noqa: PLR0912, PLR0915, C901 *, extra_instructions: str | None, @@ -465,17 +474,27 @@ def _set_saved_recording_path(path: Path) -> None: and provider_cfg.asr_provider == "wyoming" else None ) - transcript = await live_transcriber( - logger=LOGGER, - stop_event=stop_event, - quiet=general_cfg.quiet, - live=live, - save_recording=save_recording, - extra_instructions=extra_instructions, - recording_path_callback=_set_saved_recording_path, - audio_level_callback=audio_level_callback, - live_preview_config=live_preview_config, - ) + try: + transcript = await live_transcriber( + logger=LOGGER, + stop_event=stop_event, + quiet=general_cfg.quiet, + live=live, + save_recording=save_recording, + extra_instructions=extra_instructions, + recording_path_callback=_set_saved_recording_path, + audio_level_callback=audio_level_callback, + live_preview_config=live_preview_config, + ) + except asr.SilentAudioCaptureError as exc: + message = str(exc) + LOGGER.warning(message) + return TranscriptResult( + raw_transcript=None, + transcript=None, + llm_enabled=False, + error=message, + ) elapsed = time.monotonic() - start_time @@ -898,6 +917,10 @@ def transcribe( # noqa: PLR0912, PLR0911, PLR0915, C901 raise typer.Exit(1) from None if json_output: print(json.dumps(result)) + if error := _transcript_result_error(result): + if not json_output: + print(error) + raise typer.Exit(1) return # Normal recording mode @@ -994,3 +1017,7 @@ def transcribe( # noqa: PLR0912, PLR0911, PLR0915, C901 raise typer.Exit(1) from None if json_output: print(json.dumps(result)) + if error := _transcript_result_error(result): + if not json_output: + print(error) + raise typer.Exit(1) diff --git a/agent_cli/services/asr.py b/agent_cli/services/asr.py index fd36dc7a..86646070 100644 --- a/agent_cli/services/asr.py +++ b/agent_cli/services/asr.py @@ -5,13 +5,14 @@ import asyncio import io import json +import struct import wave from contextlib import suppress from dataclasses import dataclass from datetime import UTC, datetime from functools import partial from pathlib import Path -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, NoReturn from agent_cli import constants from agent_cli.core.audio import ( @@ -71,6 +72,51 @@ def min_audio_bytes(self) -> int: ) +@dataclass +class AudioCaptureStats: + """Basic signal stats for captured little-endian int16 PCM audio.""" + + byte_count: int = 0 + peak_sample: int = 0 + + @property + def is_all_silence(self) -> bool: + """Return true when capture produced bytes but every sample was zero.""" + return self.byte_count > 0 and self.peak_sample == 0 + + def observe(self, chunk: bytes) -> None: + """Update stats from one PCM chunk.""" + self.byte_count += len(chunk) + self.peak_sample = max(self.peak_sample, _peak_int16_sample(chunk)) + + +class SilentAudioCaptureError(RuntimeError): + """Raised when microphone capture returns only digital silence.""" + + DEFAULT_MESSAGE = ( + "Microphone capture returned only digital silence. " + "On macOS, allow Microphone permission for Agent CLI, " + "then restart Agent CLI." + ) + + def __init__(self, message: str | None = None) -> None: + """Create an error with the default silent-capture guidance.""" + super().__init__(message or self.DEFAULT_MESSAGE) + + +def _raise_silent_audio_capture_error() -> NoReturn: + raise SilentAudioCaptureError + + +def _peak_int16_sample(chunk: bytes) -> int: + sample_count = len(chunk) // constants.AUDIO_FORMAT_WIDTH + if sample_count <= 0: + return 0 + pcm = chunk[: sample_count * constants.AUDIO_FORMAT_WIDTH] + samples = struct.unpack(f"<{sample_count}h", pcm) + return max((abs(sample) for sample in samples), default=0) + + def _write_live_preview_event( log_file: Path, *, @@ -421,8 +467,8 @@ async def _send_audio( recording_path_callback: Callable[[Path], None] | None = None, audio_level_callback: Callable[[bytes], None] | None = None, live_preview_callback: Callable[[bytes], Awaitable[None]] | None = None, -) -> None: - """Read from mic and send to Wyoming server.""" +) -> AudioCaptureStats: + """Read from mic, send to Wyoming server, and return capture stats.""" from wyoming.asr import Transcribe # noqa: PLC0415 from wyoming.audio import AudioChunk, AudioStart, AudioStop # noqa: PLC0415 @@ -434,9 +480,11 @@ async def _send_audio( # Buffer to save audio if requested audio_buffer = io.BytesIO() if save_recording else None pending_audio_level_tasks: set[asyncio.Task[None]] = set() + capture_stats = AudioCaptureStats() async def send_chunk(chunk: bytes) -> None: """Send audio chunk to ASR server and optionally buffer it.""" + capture_stats.observe(chunk) if audio_buffer is not None: audio_buffer.write(chunk) if audio_level_callback: @@ -475,6 +523,7 @@ async def send_chunk(chunk: bytes) -> None: recording_path_callback(saved_path) finally: await _finish_audio_level_callbacks(pending_audio_level_tasks) + return capture_stats async def record_audio_to_buffer(queue: asyncio.Queue, logger: logging.Logger) -> bytes: @@ -692,7 +741,7 @@ async def _transcribe_live_audio_wyoming( if live_preview is not None: live_preview.reset_log() live_preview_task = asyncio.create_task(live_preview.run()) - _, recv_task = await manage_send_receive_tasks( + send_task, recv_task = await manage_send_receive_tasks( _send_audio( client, stream, @@ -715,8 +764,17 @@ async def _transcribe_live_audio_wyoming( return_when=asyncio.ALL_COMPLETED, ) result = recv_task.result() + capture_stats = send_task.result() if send_task is not None else None + if ( + capture_stats is not None + and capture_stats.is_all_silence + and not result.strip() + ): + _raise_silent_audio_capture_error() final_transcript = result return result + except SilentAudioCaptureError: + raise except (ConnectionRefusedError, Exception): logger.warning("Failed to connect to Wyoming ASR server") return None diff --git a/macos/AgentCLI/Sources/AgentCLI/AgentCommandRunner.swift b/macos/AgentCLI/Sources/AgentCLI/AgentCommandRunner.swift index e023db05..81fe0343 100644 --- a/macos/AgentCLI/Sources/AgentCLI/AgentCommandRunner.swift +++ b/macos/AgentCLI/Sources/AgentCLI/AgentCommandRunner.swift @@ -31,6 +31,7 @@ final class AgentCommandRunner: ObservableObject { private var recordingIndicator = RecordingIndicatorController() private let pasteController: TranscriptPasteController private let bootstrap: AgentBootstrap + private let microphonePermissionController: any MicrophonePermissionControlling private var activityTracker = MenuActivityTracker() private var pendingStopRecordingCommands: Set = [] private var holdTranscriptionState: HoldTranscriptionState = .idle @@ -69,11 +70,13 @@ final class AgentCommandRunner: ObservableObject { init( pasteController: TranscriptPasteController = TranscriptPasteController(), + microphonePermissionController: any MicrophonePermissionControlling = MicrophonePermissionController.shared, bootstrap: @escaping AgentBootstrap = { requirement, force, progress in AgentRuntime.shared.ensureReady(for: requirement, force: force, progress: progress) } ) { self.pasteController = pasteController + self.microphonePermissionController = microphonePermissionController self.bootstrap = bootstrap hasLastError = FileManager.default.fileExists(atPath: AgentRuntime.shared.lastErrorURL.path) } @@ -148,6 +151,7 @@ final class AgentCommandRunner: ObservableObject { statusMessage = "Transcription is already recording" return false } + guard ensureMicrophonePermissionForRecording() else { return false } holdTranscriptionState = .recording holdToTranscribePasteTarget = FocusedTextTarget.capture() @@ -196,6 +200,9 @@ final class AgentCommandRunner: ObservableObject { markStopRequested(for: command) beginTranscribingActivity() } + if shouldStartRecording { + guard ensureMicrophonePermissionForRecording() else { return } + } activeCommandCount += 1 beginCommandActivity(for: command) @@ -383,6 +390,25 @@ final class AgentCommandRunner: ObservableObject { activityTracker.beginTranscribing() } + private func ensureMicrophonePermissionForRecording() -> Bool { + let presentation = MicrophonePermissionPresentation( + status: microphonePermissionController.currentStatus() + ) + guard presentation.canRecord else { + microphonePermissionController.requestAccessIfNeeded { [weak self] granted in + guard granted else { return } + Task { @MainActor in + self?.statusMessage = "Microphone permission enabled. Try recording again." + } + } + let message = presentation.statusMessage ?? "Microphone permission is required for recording." + statusMessage = message + notify(title: "Microphone Permission Required", body: message) + return false + } + return true + } + private func clearTranscribingActivityIfFinished() { if pendingStopRecordingCommands.isEmpty && !holdTranscriptionState.isFinishing { activityTracker.finishTranscribing() diff --git a/macos/AgentCLI/Sources/AgentCLI/MicrophonePermission.swift b/macos/AgentCLI/Sources/AgentCLI/MicrophonePermission.swift new file mode 100644 index 00000000..d22a01c4 --- /dev/null +++ b/macos/AgentCLI/Sources/AgentCLI/MicrophonePermission.swift @@ -0,0 +1,58 @@ +import AVFoundation + +enum MicrophonePermissionStatus { + case authorized + case denied + case notDetermined +} + +struct MicrophonePermissionPresentation { + let status: MicrophonePermissionStatus + + var canRecord: Bool { + status == .authorized + } + + var statusMessage: String? { + switch status { + case .authorized: + return nil + case .denied: + return "Allow Microphone permission for Agent CLI in System Settings." + case .notDetermined: + return "Approve Microphone permission for Agent CLI, then try recording again." + } + } +} + +protocol MicrophonePermissionControlling { + func currentStatus() -> MicrophonePermissionStatus + func requestAccessIfNeeded(completion: @escaping (Bool) -> Void) +} + +final class MicrophonePermissionController: MicrophonePermissionControlling { + static let shared = MicrophonePermissionController() + + private init() {} + + func currentStatus() -> MicrophonePermissionStatus { + switch AVCaptureDevice.authorizationStatus(for: .audio) { + case .authorized: + return .authorized + case .notDetermined: + return .notDetermined + case .denied, .restricted: + return .denied + @unknown default: + return .denied + } + } + + func requestAccessIfNeeded(completion: @escaping (Bool) -> Void) { + guard currentStatus() == .notDetermined else { + completion(currentStatus() == .authorized) + return + } + AVCaptureDevice.requestAccess(for: .audio, completionHandler: completion) + } +} diff --git a/macos/AgentCLI/Tests/AgentCLITests/MicrophonePermissionTests.swift b/macos/AgentCLI/Tests/AgentCLITests/MicrophonePermissionTests.swift new file mode 100644 index 00000000..aba1c5ef --- /dev/null +++ b/macos/AgentCLI/Tests/AgentCLITests/MicrophonePermissionTests.swift @@ -0,0 +1,23 @@ +#if canImport(XCTest) +import XCTest +@testable import AgentCLI + +final class MicrophonePermissionTests: XCTestCase { + func testAuthorizedPermissionAllowsRecording() { + let presentation = MicrophonePermissionPresentation(status: .authorized) + + XCTAssertTrue(presentation.canRecord) + XCTAssertEqual(presentation.statusMessage, nil) + } + + func testDeniedPermissionExplainsSettingsFix() { + let presentation = MicrophonePermissionPresentation(status: .denied) + + XCTAssertFalse(presentation.canRecord) + XCTAssertEqual( + presentation.statusMessage, + "Allow Microphone permission for Agent CLI in System Settings." + ) + } +} +#endif diff --git a/tests/agents/test_transcribe.py b/tests/agents/test_transcribe.py index b136c6ae..c6a7c549 100644 --- a/tests/agents/test_transcribe.py +++ b/tests/agents/test_transcribe.py @@ -184,6 +184,65 @@ async def test_transcribe_main( mock_wyoming_client_context.assert_called_once() +@pytest.mark.asyncio +@patch("agent_cli.agents.transcribe.signal_handling_context") +@patch("agent_cli.agents.transcribe.asr.create_transcriber") +async def test_async_main_returns_error_for_silent_capture( + mock_create_transcriber: MagicMock, + mock_signal_handling_context: MagicMock, + caplog: pytest.LogCaptureFixture, +) -> None: + """Silent microphone capture should be returned as a structured error.""" + + async def fail_transcriber(**_kwargs: object) -> str: + raise transcribe.asr.SilentAudioCaptureError + + mock_create_transcriber.return_value = fail_transcriber + mock_signal_handling_context.return_value.__enter__.return_value = asyncio.Event() + expected_error = transcribe.asr.SilentAudioCaptureError.DEFAULT_MESSAGE + + provider_cfg = config.ProviderSelection( + asr_provider="wyoming", + llm_provider="ollama", + tts_provider="wyoming", + ) + general_cfg = config.General( + log_level="INFO", + log_file=None, + quiet=True, + list_devices=False, + clipboard=False, + ) + + with caplog.at_level(logging.WARNING): + result = await transcribe._async_main( + extra_instructions=None, + provider_cfg=provider_cfg, + general_cfg=general_cfg, + audio_in_cfg=config.AudioInput(), + wyoming_asr_cfg=config.WyomingASR(asr_wyoming_ip="localhost", asr_wyoming_port=12345), + openai_asr_cfg=config.OpenAIASR(asr_openai_model="whisper-1"), + gemini_asr_cfg=config.GeminiASR(asr_gemini_model="gemini-2.0-flash"), + ollama_cfg=config.Ollama(llm_ollama_model="test", llm_ollama_host="localhost"), + openai_llm_cfg=config.OpenAILLM(llm_openai_model="gpt-4", openai_base_url=None), + gemini_llm_cfg=config.GeminiLLM( + llm_gemini_model="gemini-1.5-flash", + gemini_api_key="test-key", + ), + llm_enabled=False, + transcription_log=None, + save_recording=False, + ) + + assert result == { + "raw_transcript": None, + "transcript": None, + "llm_enabled": False, + "error": expected_error, + } + assert expected_error in caplog.text + + @pytest.mark.asyncio @patch("agent_cli.agents.transcribe.signal_handling_context") @patch("agent_cli.agents.transcribe.maybe_live") diff --git a/tests/test_asr.py b/tests/test_asr.py index fd04b5c2..0b4aae4e 100644 --- a/tests/test_asr.py +++ b/tests/test_asr.py @@ -273,6 +273,60 @@ async def test_send_audio() -> None: assert levels == [b"fake_audio_chunk"] +@pytest.mark.asyncio +async def test_send_audio_returns_capture_stats() -> None: + """Send path should report whether any non-silent samples were captured.""" + client = AsyncMock() + stream = MagicMock() + stop_event = MagicMock() + stop_event.is_set.side_effect = [False, True] + stop_event.ctrl_c_pressed = False + mock_data = MagicMock() + mock_data.tobytes.return_value = b"\x00\x00\x05\x00" + stream.read.return_value = (mock_data, False) + + stats = await asr._send_audio( + client, + stream, + stop_event, + MagicMock(), + live=MagicMock(), + quiet=True, + save_recording=False, + ) + + assert stats == asr.AudioCaptureStats(byte_count=4, peak_sample=5) + + +@pytest.mark.asyncio +async def test_transcribe_live_audio_wyoming_errors_on_silent_empty_capture() -> None: + """All-zero captured audio plus empty ASR output should surface mic capture failure.""" + with ( + patch("agent_cli.services.asr.wyoming_client_context") as mock_context, + patch("agent_cli.services.asr.open_audio_stream"), + patch("agent_cli.services.asr.setup_input_stream"), + patch("agent_cli.services.asr._send_audio", new_callable=AsyncMock) as mock_send, + patch("agent_cli.services.asr._receive_transcript", new_callable=AsyncMock) as mock_receive, + ): + mock_context.return_value.__aenter__.return_value = AsyncMock() + mock_send.return_value = asr.AudioCaptureStats(byte_count=32_000, peak_sample=0) + mock_receive.return_value = "" + + with pytest.raises(asr.SilentAudioCaptureError): + await asr._transcribe_live_audio_wyoming( + audio_input_cfg=config.AudioInput(input_device_index=None), + wyoming_asr_cfg=config.WyomingASR( + asr_wyoming_ip="localhost", + asr_wyoming_port=10300, + ), + logger=MagicMock(), + stop_event=MagicMock(), + live=MagicMock(), + quiet=True, + save_recording=True, + ) + + @pytest.mark.asyncio async def test_send_audio_does_not_wait_for_audio_level_callback() -> None: """Test that level callbacks cannot block audio delivery to Wyoming."""