From 5c70538a285583f477e83b87c82afd07622fa029 Mon Sep 17 00:00:00 2001
From: ishan bansal
Date: Mon, 2 Feb 2026 23:32:12 +0530
Subject: [PATCH 1/4] Add context-aware interruption handling and Mistral AI support

- Implement passive phrase filtering (yeah, ok, uh-huh) that doesn't interrupt agent speech
- Add active interrupt phrases (stop, wait, hold on) that immediately stop agent
- Match whole phrases after normalization so hyphenated and multi-word entries work
- Raise StopResponse to suppress the LLM reply for ignored acknowledgements
- Add LLM provider selection via LLM_PROVIDER env variable (openai/mistral)
- Use manual turn detection to control when user input reaches LLM
- Track agent speaking state to determine interruption behavior
---
 examples/voice_agents/basic_agent.py | 168 +++++++++++++++++++++++++--
 1 file changed, 158 insertions(+), 10 deletions(-)

diff --git a/examples/voice_agents/basic_agent.py b/examples/voice_agents/basic_agent.py
index f064dab5d7..cabcd23ff6 100644
--- a/examples/voice_agents/basic_agent.py
+++ b/examples/voice_agents/basic_agent.py
@@ -1,4 +1,6 @@
 import logging
+import os
+import re
 
 from dotenv import load_dotenv
 
@@ -15,8 +17,11 @@
     room_io,
 )
-from livekit.agents.llm import function_tool
+from livekit.agents.llm import StopResponse, function_tool
+from livekit.agents.voice.events import (
+    AgentStateChangedEvent,
+    UserInputTranscribedEvent,
+)
 from livekit.plugins import silero
-from livekit.plugins.turn_detector.multilingual import MultilingualModel
 
 # uncomment to enable Krisp background voice/noise cancellation
 # from livekit.plugins import noise_cancellation
@@ -25,8 +30,83 @@
 
 load_dotenv()
 
+# ====================================
+# CONTEXT-AWARE INTERRUPTION SETTINGS
+# ====================================
+
+# Phrases that should NOT interrupt agent speech or reach the LLM.
+# Entries may be multi-word or hyphenated; they are normalized before matching.
+IGNORE_WORDS = {
+    "yeah", "yes", "ok", "okay", "hmm", "uh-huh", "uh huh",
+    "mhm", "mm-hmm", "right", "sure", "yep", "yup", "aha",
+    "mm", "mhmm", "alright", "got it",
+}
+
+# Phrases that should ALWAYS interrupt agent speech
+INTERRUPT_WORDS = {
+    "stop", "wait", "hold on", "hold up", "no", "cancel",
+    "nevermind", "never mind", "actually", "but", "however",
+    "pause", "hold",
+}
+
+
+def normalize_text(text: str) -> str:
+    """Lowercase text, drop punctuation and collapse whitespace.
+
+    Hyphens become spaces so that "uh-huh" and "uh huh" compare equal.
+    """
+    spaced = text.lower().replace("-", " ")
+    return " ".join(re.sub(r"[^\w\s]", "", spaced).split())
+
+
+def _compile_phrases(phrases: set) -> re.Pattern:
+    """Build a whole-word matcher for the normalized form of each phrase."""
+    # Longest first so "hold on" is tried before "hold".
+    ordered = sorted({normalize_text(p) for p in phrases}, key=lambda p: (-len(p), p))
+    return re.compile(r"\b(?:" + "|".join(map(re.escape, ordered)) + r")\b")
+
+
+_IGNORE_RE = _compile_phrases(IGNORE_WORDS)
+_INTERRUPT_RE = _compile_phrases(INTERRUPT_WORDS)
+
+
+def is_only_ignore_words(text: str) -> bool:
+    """Return True if text consists solely of passive acknowledgement phrases."""
+    normalized = normalize_text(text)
+    if not normalized:
+        return False
+    # Anything left after removing the ignore phrases is real content.
+    return not _IGNORE_RE.sub("", normalized).strip()
+
+
+def contains_interrupt_word(text: str) -> bool:
+    """Return True if text contains any active interrupt phrase."""
+    return _INTERRUPT_RE.search(normalize_text(text)) is not None
+
+
+def get_llm_model():
+    """Return the LLM for the session, selected by the LLM_PROVIDER env variable.
+
+    "mistral" builds the LLM from the livekit-plugins-mistralai plugin; any other
+    value keeps the OpenAI model this example has always used.
+    """
+    provider = os.getenv("LLM_PROVIDER", "openai").strip().lower()
+    if provider == "mistral":
+        # Imported here so the plugin is only required when it is selected.
+        from livekit.plugins import mistralai
+
+        return mistralai.LLM(model="mistral-small-latest")
+    return "openai/gpt-4.1-mini"
+
+
+llm_model = get_llm_model()
+logger.info(f"Using LLM: {llm_model}")
+
 
 class MyAgent(Agent):
+    # Class variable to track state when user speech was detected
+    _agent_was_speaking_at_user_speech = False
+
     def __init__(self) -> None:
         super().__init__(
             instructions="Your name is Kelly. You would interact with users via voice."
@@ -41,6 +121,36 @@ async def on_enter(self):
         # according to its instructions
         self.session.generate_reply()
 
+    async def on_user_turn_completed(self, turn_ctx, new_message):
+        """Decide whether a committed user turn is answered by the LLM.
+
+        Called once a user turn has been committed, before a reply is generated.
+        Raising StopResponse is what suppresses the reply; a plain return would
+        let the session go on and answer the acknowledgement.
+        """
+        # text_content may be None (no text in the message); treat that as empty
+        user_text = new_message.text_content or ""
+
+        # Use the state captured when the transcript arrived
+        was_speaking = MyAgent._agent_was_speaking_at_user_speech
+
+        if was_speaking and is_only_ignore_words(user_text):
+            logger.info(f"✓ IGNORING passive words during speech: '{user_text}'")
+            raise StopResponse()
+
+        if contains_interrupt_word(user_text):
+            logger.info(f"✗ INTERRUPT command detected: '{user_text}' - stopping agent")
+            await self.session.interrupt()
+        elif was_speaking:
+            logger.info(f"⚠️ Interrupting with: '{user_text}'")
+            # Non-passive words during speech - interrupt
+            await self.session.interrupt()
+        else:
+            logger.info(f"💬 Processing user input: '{user_text}'")
+
+        # Let the default behavior handle it (generate reply)
+        return await super().on_user_turn_completed(turn_ctx, new_message)
+
     # all functions annotated with @function_tool will be passed to the LLM when this
     # agent is active
     @function_tool
@@ -84,24 +194,62 @@ async def entrypoint(ctx: JobContext):
         # See all available models at https://docs.livekit.io/agents/models/stt/
         stt="deepgram/nova-3",
         # A Large Language Model (LLM) is your agent's brain, processing user input and generating a response
-        # See all available models at https://docs.livekit.io/agents/models/llm/
-        llm="openai/gpt-4.1-mini",
+        # Configurable via LLM_PROVIDER environment variable (openai or mistral)
+        llm=llm_model,
         # Text-to-speech (TTS) is your agent's voice, turning the LLM's text into speech that the user can hear
         # See all available models as well as voice selections at https://docs.livekit.io/agents/models/tts/
         tts="cartesia/sonic-2:9626c31c-bec5-4cca-baa8-f8ba9e84c8bc",
-        # VAD and turn detection are used to determine when the user is speaking and when the agent should respond
+        # Manual turn detection - we control when turns complete to filter passive words
         # See more at https://docs.livekit.io/agents/build/turns
-        turn_detection=MultilingualModel(),
+        turn_detection="manual",
         vad=ctx.proc.userdata["vad"],
         # allow the LLM to generate a response while waiting for the end of turn
         # See more at https://docs.livekit.io/agents/build/audio/#preemptive-generation
         preemptive_generation=True,
-        # sometimes background noise could interrupt the agent session, these are considered false positive interruptions
-        # when it's detected, you may resume the agent's speech
-        resume_false_interruption=True,
-        false_interruption_timeout=1.0,
+        # SMART INTERRUPTION: Allow listening during speech, but control interruption manually
+        allow_interruptions=True,
+        # Minimum words before triggering interruption (we handle filtering in event handlers)
+        min_interruption_words=1,
     )
 
+    # ====================================
+    # CONTEXT-AWARE INTERRUPTION HANDLERS
+    # ====================================
+
+    # Whether the agent is speaking right now; updated by on_agent_state_changed
+    agent_is_speaking = False
+
+    @session.on("agent_state_changed")
+    def on_agent_state_changed(event: AgentStateChangedEvent):
+        """Track when agent starts/stops speaking."""
+        nonlocal agent_is_speaking
+        agent_is_speaking = event.new_state == "speaking"
+        if event.new_state == "speaking":
+            logger.info("🎤 Agent started speaking")
+        elif event.new_state == "listening":
+            logger.info("🎧 Agent stopped speaking")
+
+    @session.on("user_input_transcribed")
+    def on_user_transcript(event: UserInputTranscribedEvent):
+        """Commit or discard each final transcript (manual turn detection)."""
+        if not event.is_final or not event.transcript.strip():
+            return
+
+        # Snapshot the state for on_user_turn_completed, which runs after the commit
+        MyAgent._agent_was_speaking_at_user_speech = agent_is_speaking
+        logger.info(
+            f"📝 Transcript: '{event.transcript}' (agent_was_speaking={agent_is_speaking})"
+        )
+
+        if agent_is_speaking and is_only_ignore_words(event.transcript):
+            logger.info(f"🛑 Skipping turn commit for passive words: '{event.transcript}'")
+            # Drop the uncommitted transcript so it is not carried into the next turn
+            session.clear_user_turn()
+            return
+
+        logger.info(f"✓ Committing user turn: '{event.transcript}'")
+        session.commit_user_turn()
+
     # log metrics as they are emitted, and total usage after session is over
     usage_collector = metrics.UsageCollector()
 
From ab95199f6a81aebed1c72e6d14948319e78d2f2c Mon Sep 17 00:00:00 2001
From: ishan bansal
Date: Mon, 2 Feb 2026 23:36:29 +0530
Subject: [PATCH 2/4] Add context-aware interruption handling and Mistral AI support

Features:
- Implement passive word filtering (yeah, ok, hmm) that doesn't interrupt agent speech
- Add active interrupt words (stop, wait, hold on) for immediate interruption
- Add LLM provider selection via LLM_PROVIDER env variable (openai/mistral)
- Use manual turn detection to control when user input reaches LLM
- Track agent speaking state to determine interruption behavior
- Update requirements.txt with Mistral AI plugin dependency
---
 examples/voice_agents/requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/voice_agents/requirements.txt b/examples/voice_agents/requirements.txt
index 1d6a274bad..7ec1947c96 100644
--- a/examples/voice_agents/requirements.txt
+++ b/examples/voice_agents/requirements.txt
@@ -1,3 +1,4 @@
 livekit-agents[openai, cartesia, elevenlabs, deepgram, silero, turn-detector, mcp]>=1.0
+livekit-plugins-mistralai>=0.1.0
python-dotenv>=1.0 duckduckgo-search>=8.0 \ No newline at end of file From d51b1aadd555bd52767f541e9b3c1f94f7ff157b Mon Sep 17 00:00:00 2001 From: ishan bansal Date: Mon, 2 Feb 2026 23:38:41 +0530 Subject: [PATCH 3/4] Update README with context-aware interruption features --- examples/voice_agents/README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/examples/voice_agents/README.md b/examples/voice_agents/README.md index aa401505d1..26e167f81d 100644 --- a/examples/voice_agents/README.md +++ b/examples/voice_agents/README.md @@ -6,7 +6,11 @@ This directory contains a comprehensive collection of voice-based agent examples ### 🚀 Getting Started -- [`basic_agent.py`](./basic_agent.py) - A fundamental voice agent with metrics collection +- [`basic_agent.py`](./basic_agent.py) - A fundamental voice agent with context-aware interruption handling, Mistral AI support, and metrics collection + - **Context-Aware Interruptions**: Passive acknowledgements ("yeah", "ok", "hmm") don't interrupt agent speech + - **Active Commands**: Interrupt words ("stop", "wait", "hold on") immediately stop the agent + - **Multi-LLM Support**: Switch between OpenAI and Mistral AI via `LLM_PROVIDER` environment variable + - **Manual Turn Detection**: Full control over when user input reaches the LLM ### 🛠️ Tool Integration & Function Calling From 5873b5f8dd6a5b111b47f8256d7c92fbac33697e Mon Sep 17 00:00:00 2001 From: ishan bansal Date: Mon, 2 Feb 2026 23:39:56 +0530 Subject: [PATCH 4/4] Update main README with context-aware interruption and Mistral AI features --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 2a09aac241..1f425a879e 100644 --- a/README.md +++ b/README.md @@ -32,11 +32,13 @@ agents that can see, hear, and understand. ## Features -- **Flexible integrations**: A comprehensive ecosystem to mix and match the right STT, LLM, TTS, and Realtime API to suit your use case. 
+- **Context-aware interruption handling**: Smart interruption logic that distinguishes between passive acknowledgements ("yeah", "ok", "hmm") and active commands ("stop", "wait"), allowing agents to continue speaking when users are simply listening. Implemented in the [`basic_agent.py`](examples/voice_agents/basic_agent.py) example.
+- **Flexible integrations**: A comprehensive ecosystem to mix and match the right STT, LLM, TTS, and Realtime API to suit your use case. Mistral AI is supported through the separate `livekit-plugins-mistralai` plugin.
 - **Integrated job scheduling**: Built-in task scheduling and distribution with [dispatch APIs](https://docs.livekit.io/agents/build/dispatch/) to connect end users to agents.
 - **Extensive WebRTC clients**: Build client applications using LiveKit's open-source SDK ecosystem, supporting all major platforms.
 - **Telephony integration**: Works seamlessly with LiveKit's [telephony stack](https://docs.livekit.io/sip/), allowing your agent to make calls to or receive calls from phones.
 - **Exchange data with clients**: Use [RPCs](https://docs.livekit.io/home/client/data/rpc/) and other [Data APIs](https://docs.livekit.io/home/client/data/) to seamlessly exchange data with clients.
+- **Manual turn detection**: Full control over when user input reaches the LLM, enabling sophisticated conversation flow management.
 - **Semantic turn detection**: Uses a transformer model to detect when a user is done with their turn, helps to reduce interruptions.
 - **MCP support**: Native support for MCP. Integrate tools provided by MCP servers with one loc.
 - **Builtin test framework**: Write tests and use judges to ensure your agent is performing as expected.
@@ -48,6 +50,7 @@ To install the core Agents library, along with plugins for popular model provide
 
 ```bash
 pip install "livekit-agents[openai,silero,deepgram,cartesia,turn-detector]~=1.0"
+pip install "livekit-plugins-mistralai>=0.1.0" # For Mistral AI support
 ```
 
 ## Docs and guides