Dark-Sys-Jenkins · Aditya011235 · Apr 16, 2026 · Apr 16, 2026 · Apr 16, 2026 · Apr 16, 2026
diff --git a/assignment/README.md b/assignment/README.md
@@ -0,0 +1,179 @@
+
+
+<h1>LiveKit Intelligent Interruption Handling Challenge</h1>
+
+<h2>🚀 Overview</h2>
+<p>
+This project implements a <b>context-aware interruption handling system</b> for a real-time voice AI agent using the LiveKit framework.
+</p>
+
+<p>
+Traditional voice agents incorrectly treat small acknowledgements like <i>"yeah", "ok"</i> as interruptions.  
+This system intelligently distinguishes between <b>passive acknowledgements</b> and <b>active interruptions</b>.
+</p>
+
+<hr>
+
+<h2>🧠 Problem Statement</h2>
+<ul>
+  <li>Users often say filler words while listening</li>
+  <li>Default VAD interprets them as interruptions</li>
+  <li>Agent stops speaking unnecessarily ❌</li>
+</ul>
+
+<p><b>Goal:</b> Build a system that understands user intent and reacts correctly.</p>
+
+<hr>
+
+<h2>⚙️ System Architecture</h2>
+<pre>
+User Voice → VAD → STT → classify_input() → Decision → Agent Action
+</pre>
+
+<hr>
+
+<h2>📂 Project Structure</h2>
+<pre>
+voice_agent/
+│
+├── main.py
+├── interrupt_handler.py
+├── state_manager.py
+├── transcript_handler.py
+├── .env
+</pre>
+
+<hr>
+
+<h2>📄 File Explanation</h2>
+
+<h3>🔹 main.py</h3>
+<ul>
+  <li>Initializes the agent session</li>
+  <li>Connects VAD, STT, LLM, TTS</li>
+  <li>Registers event handlers</li>
+</ul>
+
+<p><b>Role:</b> Main controller of the system</p>
+
+---
+
+<h3>🔹 interrupt_handler.py</h3>
+<p>Core decision-making logic:</p>
+
+<pre>
+classify_input(text, is_agent_speaking)
+</pre>
+
+<p>Returns:</p>
+<ul>
+  <li><b>IGNORE</b> → continue speaking</li>
+  <li><b>INTERRUPT</b> → stop agent</li>
+  <li><b>RESPOND</b> → normal reply</li>
+</ul>
+
+---
+
+<h3>🔹 state_manager.py</h3>
+<ul>
+  <li>Tracks agent state</li>
+  <li>Maintains: <code>is_agent_speaking</code></li>
+</ul>
+
+---
+
+<h3>🔹 transcript_handler.py</h3>
+<ul>
+  <li>Processes user speech</li>
+  <li>Calls decision logic</li>
+  <li>Executes actions</li>
+</ul>
+
+<hr>
+
+<h2>🧪 Scenarios &amp; Solutions</h2>
+
+<h3>✅ Scenario 1: Long Explanation</h3>
+<p><b>Input:</b> "yeah okay hmm" (while the agent is speaking)</p>
+<p><b>Result:</b> Agent continues speaking (IGNORE)</p>
+
+---
+
+<h3>✅ Scenario 2: Passive Affirmation</h3>
+<p><b>Input:</b> "yeah" (when silent)</p>
+<p><b>Result:</b> Agent responds normally (RESPOND)</p>
+
+---
+
+<h3>✅ Scenario 3: Correction</h3>
+<p><b>Input:</b> "no stop"</p>
+<p><b>Result:</b> Agent stops immediately (INTERRUPT)</p>
+
+---
+
+<h3>✅ Scenario 4: Mixed Input</h3>
+<p><b>Input:</b> "yeah okay but wait"</p>
+<p><b>Result:</b> Agent stops (INTERRUPT)</p>
+
+<hr>
+
+<h2>⚡ Key Features</h2>
+<ul>
+  <li>Context-aware interruption handling</li>
+  <li>Real-time decision making</li>
+  <li>Handles mixed and noisy speech inputs</li>
+  <li>Modular architecture</li>
+</ul>
+
+<hr>
+
+<h2>🔑 Technologies Used</h2>
+<ul>
+  <li>LiveKit Agents</li>
+  <li>Groq LLM (LLaMA 3.1)</li>
+  <li>Deepgram (STT)</li>
+  <li>ElevenLabs / Cartesia (TTS)</li>
+  <li>Python (Async Programming)</li>
+</ul>
+
+<hr>
+
+<h2>▶️ How to Run</h2>
+<pre>
+pip install -e .
+pip install -r requirements.txt
+python main.py console
+</pre>
+
+<hr>
+
+<h2>🎤 Test Commands</h2>
+<pre>
+yeah okay hmm   → IGNORE     (while the agent is speaking)
+no stop         → INTERRUPT
+yeah            → RESPOND    (while the agent is silent)
+hello           → RESPOND
+</pre>
+
+<hr>
+
+<h2>🎥 Demo Video</h2>
+<p>
+<a href="https://drive.google.com/file/d/107Zc57PlQg41VAcZ3j7ns8wzJC09NlEW/view?usp=sharing">👉 Watch the demo video</a>
+</p>
+
+<hr>
+
+<h2>🏁 Conclusion</h2>
+<p>
+This system successfully implements a <b>state-aware intelligent interruption handler</b> that improves conversational flow and mimics human-like interaction.
+</p>
+
+<hr>
+
+<h2>👨‍💻 Author</h2>
+<p>Aditya Kumar</p>
diff --git a/assignment/interrupt_handler.py b/assignment/interrupt_handler.py
@@ -0,0 +1,38 @@
+# interrupt_handler.py
+import re
+
+# Acknowledgements that must not interrupt the agent while it is speaking.
+IGNORE_WORDS = ["yeah", "okay", "hmm", "uh-huh", "right", "ok"]
+# Words that always interrupt the agent, wherever they occur in the utterance.
+COMMAND_WORDS = ["stop", "wait", "no"]
+
+# Plain words, used for command detection ("no-stop" -> "no", "stop").
+_WORD_RE = re.compile(r"\w+")
+# Words with internal hyphens kept together, so "uh-huh" stays one token.
+_FILLER_TOKEN_RE = re.compile(r"\w+(?:-\w+)*")
+
+
+def _is_filler(token: str) -> bool:
+    """Return True if ``token`` is a filler word or a hyphen-joined run of them."""
+    if token in IGNORE_WORDS:
+        return True
+    return all(part in IGNORE_WORDS for part in token.split("-"))
+
+
+def classify_input(text: str, is_agent_speaking: bool) -> str:
+    """
+    Decide how the agent should react to user input.
+
+    Args:
+        text: Transcribed user speech. Matching is case-insensitive and
+            ignores punctuation.
+        is_agent_speaking: Whether the agent is currently speaking. Filler
+            words are only ignored when this is True.
+
+    Returns:
+        "IGNORE"     -> ignore input (continue speaking)
+        "INTERRUPT"  -> stop agent immediately
+        "RESPOND"    -> treat as normal input
+    """
+
+    if not text:
+        return "RESPOND"
+
+    text = text.lower().strip()
+
+    # Priority 1: a command word anywhere in the utterance. Whole words only,
+    # so "know" or "nothing" is not mistaken for "no".
+    if any(word in COMMAND_WORDS for word in _WORD_RE.findall(text)):
+        return "INTERRUPT"
+
+    # Priority 2: filler-only speech is ignored, but only while the agent is
+    # speaking; the same words are a normal reply when the agent is silent.
+    if is_agent_speaking:
+        tokens = _FILLER_TOKEN_RE.findall(text)
+        if all(_is_filler(token) for token in tokens):
+            return "IGNORE"
+
+    # Default: normal response
+    return "RESPOND"
diff --git a/assignment/main.py b/assignment/main.py
@@ -0,0 +1,76 @@
+import asyncio
+import logging
+
+from dotenv import load_dotenv
+
+from livekit.agents import (
+    Agent,
+    AgentServer,
+    AgentSession,
+    JobContext,
+    UserStateChangedEvent,
+    cli,
+)
+from livekit.plugins import cartesia, deepgram, groq, silero
+
+from state_manager import StateManager
+from transcript_handler import handle_transcript
+
+# Module-level logger registered under the name "agent".
+logger = logging.getLogger("agent")
+
+# Load settings from a .env file into the environment (python-dotenv).
+load_dotenv()
+
+# Server object that entrypoint() is registered on and that cli.run_app() runs.
+server = AgentServer()
+
+
+@server.rtc_session()
+async def entrypoint(ctx: JobContext):
+    """Start a voice-agent session in ``ctx.room`` with interruption handling.
+
+    Builds an ``AgentSession`` (Silero VAD, Groq LLM, Deepgram STT, Cartesia
+    TTS), wires three session event handlers, then starts the session:
+
+    * ``user_state_changed``     -> start/cancel the inactivity check
+    * ``agent_state_changed``    -> keep ``StateManager`` up to date
+    * ``user_input_transcribed`` -> pass the transcript to ``handle_transcript``
+
+    Args:
+        ctx: Job context; only ``ctx.room`` is used here.
+    """
+    session = AgentSession(
+        vad=silero.VAD.load(),
+        llm=groq.LLM(model="llama-3.1-8b-instant"),
+        stt=deepgram.STT(),
+        tts=cartesia.TTS(),
+        user_away_timeout=12.5,
+    )
+
+    state_manager = StateManager()
+
+    inactivity_task: asyncio.Task | None = None
+
+    async def user_presence_task():
+        # Prompt the user up to three times, 10 s apart, then shut down.
+        for _ in range(3):
+            await session.generate_reply(
+                instructions="The user has been inactive. Check if they are present."
+            )
+            await asyncio.sleep(10)
+
+        session.shutdown()
+
+    @session.on("user_state_changed")
+    def _user_state_changed(ev: UserStateChangedEvent):
+        nonlocal inactivity_task
+        # Any state change supersedes a presence check that is still running,
+        # so two checks can never prompt the user at the same time.
+        if inactivity_task is not None:
+            inactivity_task.cancel()
+            inactivity_task = None
+
+        if ev.new_state == "away":
+            inactivity_task = asyncio.create_task(user_presence_task())
+
+    # agent state tracking
+    @session.on("agent_state_changed")
+    def _agent_state_changed(ev):
+        state_manager.update_agent_state(ev.new_state)
+
+    # transcript handling: the session emits "user_input_transcribed" and the
+    # recognised text is carried in ``ev.transcript``.
+    # NOTE(review): this fires for interim as well as final transcripts --
+    # confirm that reacting to interim results is intended.
+    @session.on("user_input_transcribed")
+    def _handle_transcript(ev):
+        handle_transcript(session, state_manager, ev.transcript)
+
+    await session.start(
+        agent=Agent(instructions="You are a helpful assistant."),
+        room=ctx.room
+    )
+
+
+# Script entry point: hand the server to the LiveKit CLI runner.
+if __name__ == "__main__":
+    cli.run_app(server)
diff --git a/assignment/requirements.txt b/assignment/requirements.txt
@@ -0,0 +1,5 @@
+livekit-agents[openai, cartesia, elevenlabs, deepgram, silero, turn-detector, mcp]>=1.0
+python-dotenv>=1.0
+duckduckgo-search>=8.0
+
+livekit-plugins-groq
diff --git a/assignment/state_manager.py b/assignment/state_manager.py
@@ -0,0 +1,11 @@
+# state_manager.py
+
+class StateManager:
+    """Holds the most recently reported speaking state of the agent."""
+
+    def __init__(self):
+        # False until update_agent_state() is called with "speaking".
+        self.is_agent_speaking = False
+
+    def update_agent_state(self, state):
+        """Set ``is_agent_speaking`` to whether ``state`` equals "speaking"."""
+        self.is_agent_speaking = bool(state == "speaking")
diff --git a/assignment/transcript_handler.py b/assignment/transcript_handler.py
@@ -0,0 +1,26 @@
+# transcript_handler.py
+
+import asyncio
+import time
+from interrupt_handler import classify_input
+
+
+def handle_transcript(session, state_manager, text):
+    """Classify a user transcript and interrupt the agent when required.
+
+    The decision is taken after a 0.2 s settling delay. When an asyncio event
+    loop is running in the calling thread, the delay is scheduled with
+    ``loop.call_later`` rather than ``time.sleep``: a blocking sleep would
+    stall the loop, and no agent-state update could be delivered during it.
+    Without a running loop the original blocking delay is kept.
+
+    Args:
+        session: Object exposing ``interrupt()``; called only when the
+            decision is "INTERRUPT".
+        state_manager: Object exposing ``is_agent_speaking``, read at
+            decision time (after the delay).
+        text: Transcribed user speech; ``None`` is treated as empty.
+    """
+    settle_delay = 0.2  # seconds; delay to avoid VAD issue
+
+    user_text = (text or "").lower().strip()
+    print("User said:", user_text)
+
+    def _decide_and_act():
+        decision = classify_input(user_text, state_manager.is_agent_speaking)
+        print("Decision:", decision)
+
+        # "IGNORE" and "RESPOND" need no action here; only an explicit
+        # interruption stops the agent.
+        if decision == "INTERRUPT":
+            print(" Interrupting agent!")
+            session.interrupt()
+
+    try:
+        loop = asyncio.get_running_loop()
+    except RuntimeError:
+        # No event loop in this thread: nothing to stall, keep the plain delay.
+        time.sleep(settle_delay)
+        _decide_and_act()
+        return
+
+    loop.call_later(settle_delay, _decide_and_act)
diff --git a/assignment/vdo proof.mov b/assignment/vdo proof.mov