diff --git a/assignment/README.md b/assignment/README.md new file mode 100644 index 0000000000..59e932984f --- /dev/null +++ b/assignment/README.md @@ -0,0 +1,179 @@ + + +

LiveKit Intelligent Interruption Handling +Challenge

+ +

🚀 Overview

+

+This project implements a context-aware interruption handling system for a real-time voice AI agent using the LiveKit framework. +

+ +

+Traditional voice agents incorrectly treat small acknowledgements like "yeah", "ok" as interruptions. +This system intelligently distinguishes between passive acknowledgements and active interruptions. +

+ +
+ +

🧠 Problem Statement

+ + +

Goal: Build a system that understands user intent and reacts correctly.

+ +
+ +

⚙️ System Architecture

+
+User Voice → VAD → STT → classify_input() → Decision → Agent Action
+
+ +
+ +

📂 Project Structure

+
+voice_agent/
+│
+├── main.py
+├── interrupt_handler.py
+├── state_manager.py
+├── transcript_handler.py
+├── .env
+
+ +
+ +

📄 File Explanation

+ +

🔹 main.py

+ + +

Role: Main controller of the system

+ +--- + +

🔹 interrupt_handler.py

+

Core decision-making logic:

+ +
+classify_input(text, is_agent_speaking)
+
+ +

Returns:

+ + +--- + +

🔹 state_manager.py

+ + +--- + +

🔹 transcript_handler.py

+ + +
+ +

🧪 Scenarios & Solutions

+ +

✅ Scenario 1: Long Explanation

+

Input: "yeah okay hmm"

+

Result: Agent continues speaking (IGNORE)

+ +--- + +

✅ Scenario 2: Passive Affirmation

+

Input: "yeah" (when silent)

+

Result: Agent responds normally (RESPOND)

+ +--- + +

✅ Scenario 3: Correction

+

Input: "no stop"

+

Result: Agent stops immediately (INTERRUPT)

+ +--- + +

✅ Scenario 4: Mixed Input

+

Input: "yeah okay but wait"

+

Result: Agent stops (INTERRUPT)

+ +
+ +

⚡ Key Features

+ + +
+ +

🔑 Technologies Used

+ + +
+ +

▶️ How to Run

+
+pip install -e .
+pip install -r requirements.txt
+python main.py console
+
+ +
+ +

🎤 Test Commands

+
+yeah okay hmm   → IGNORE
+no stop         → INTERRUPT
+yeah            → RESPOND
+hello           → RESPOND
+
+ +
+ +

🎥 Demo Video

+

+👉 Add your demo video link here +

+ +
+ +

๐Ÿ Conclusion

+

+This system successfully implements a state-aware intelligent interruption handler that improves conversational flow and mimics human-like interaction. +

+ +
+ +

👨‍💻 Author

+

Aditya Kumar

# interrupt_handler.py
import re

# Back-channel acknowledgements; ignored only while the agent is speaking.
IGNORE_WORDS = ["yeah", "okay", "hmm", "uh-huh", "right"]
# Words/phrases that interrupt the agent wherever they appear in an utterance.
COMMAND_WORDS = ["stop", "wait", "no"]

# Tokens are runs of word characters; internal hyphens are kept so that a
# listed filler such as "uh-huh" survives tokenization as a single token.
_TOKEN_RE = re.compile(r"\w+(?:-\w+)*")


def _is_filler(token: str) -> bool:
    """Return True if *token*, or each hyphen-separated part of it, is filler."""
    return token in IGNORE_WORDS or all(
        part in IGNORE_WORDS for part in token.split("-")
    )


def _contains_command(text: str) -> bool:
    """Return True if any COMMAND_WORDS entry occurs in *text* as a whole word."""
    return any(
        re.search(rf"\b{re.escape(cmd)}\b", text) for cmd in COMMAND_WORDS
    )


def classify_input(text: str, is_agent_speaking: bool) -> str:
    """Decide how the agent should react to a user utterance.

    Args:
        text: Transcribed user speech. Matching is case-insensitive.
        is_agent_speaking: Whether the agent is currently speaking.

    Returns:
        "INTERRUPT" if the utterance contains a command word as a whole word
        (so "know" or "nothing" no longer count as "no").
        "IGNORE" if the agent is speaking and every token is a filler word.
        "RESPOND" otherwise, including for empty input or input that
        contains no word characters at all.
    """
    if not text:
        return "RESPOND"

    normalized = text.lower().strip()
    tokens = _TOKEN_RE.findall(normalized)
    if not tokens:
        # Whitespace/punctuation only: treat like empty input instead of
        # letting all() over an empty list classify it as filler.
        return "RESPOND"

    # Priority 1: commands win even when surrounded by filler words.
    if _contains_command(normalized):
        return "INTERRUPT"

    # Priority 2: pure filler is ignored, but only while the agent speaks.
    if is_agent_speaking and all(_is_filler(token) for token in tokens):
        return "IGNORE"

    return "RESPOND"
# state_manager.py


class StateManager:
    """Track whether the agent is currently speaking.

    The flag starts as False and is refreshed on every call to
    update_agent_state().
    """

    def __init__(self) -> None:
        # True only while the most recently reported state was "speaking".
        self.is_agent_speaking: bool = False

    def update_agent_state(self, state: str) -> None:
        """Record the agent's new state.

        Args:
            state: The new agent state. Only the exact value "speaking"
                sets the flag; any other value clears it.
        """
        self.is_agent_speaking = state == "speaking"
# transcript_handler.py

import asyncio
import time
from interrupt_handler import classify_input

# Seconds to wait before acting on a transcript (the original "delay to
# avoid VAD issue").
_DECISION_DELAY = 0.2


def _apply_decision(session, state_manager, user_text):
    """Classify *user_text* and interrupt the session when required."""
    decision = classify_input(user_text, state_manager.is_agent_speaking)
    print("Decision:", decision)

    # "IGNORE" and "RESPOND" need no action here; only an explicit
    # interruption touches the session.
    if decision == "INTERRUPT":
        print(" Interrupting agent!")
        session.interrupt()


def handle_transcript(session, state_manager, text):
    """React to a user transcript by interrupting the agent if needed.

    Args:
        session: Object exposing ``interrupt()``; called only when the
            utterance is classified as "INTERRUPT".
        state_manager: Object exposing ``is_agent_speaking``, read at the
            moment the decision is made.
        text: Transcribed user speech; ``None`` is treated as empty.

    The decision is delayed by ``_DECISION_DELAY`` seconds. When called from
    a running asyncio event loop the delay is scheduled with ``call_later``
    so the loop is never blocked; without a running loop the function sleeps
    synchronously, as before.
    """
    user_text = (text or "").lower().strip()
    print("User said:", user_text)

    try:
        loop = asyncio.get_running_loop()
    except RuntimeError:
        # No event loop in this thread: a blocking sleep cannot stall
        # anything, so keep the original synchronous behaviour.
        time.sleep(_DECISION_DELAY)
        _apply_decision(session, state_manager, user_text)
        return

    # Inside the event loop, time.sleep() would freeze every other task for
    # the duration of the delay, so defer the decision instead.
    loop.call_later(
        _DECISION_DELAY, _apply_decision, session, state_manager, user_text
    )