GetStream · DaemonLoki · Feb 3, 2026 · Feb 3, 2026
diff --git a/plugins/voice_ai/README.md b/plugins/voice_ai/README.md
@@ -0,0 +1,43 @@
+# Voice.ai
+Voice.ai text-to-speech (TTS) integration for Vision Agents.
+
+## Features
+- Low-latency HTTP streaming TTS via Voice.ai.
+- PCM output ready for the Vision Agents audio pipeline.
+- Configurable voice, model, language, temperature, and top-p.
+
+## Installation
+```sh
+uv add vision-agents-plugins-voice-ai
+```
+
+## Usage
+```python
+from vision_agents.plugins import voice_ai
+
+# Requires VOICE_AI_API_KEY and VOICE_AI_VOICE_ID in the environment
+tts = voice_ai.TTS()
+```
+
+## Configuration
+| Name | Type | Default | Description |
+| --- | --- | --- | --- |
+| `api_key` | `str` or `None` | `None` | Voice.ai API key. Falls back to `VOICE_AI_API_KEY`. |
+| `voice_id` | `str` or `None` | `None` | Voice ID to use. Falls back to `VOICE_AI_VOICE_ID`. |
+| `audio_format` | `str` | `"pcm"` | Output format. `pcm` streams and yields chunks; `wav`/`mp3` are decoded after download. |
+| `model` | `str` or `None` | `None` | Model ID to use for synthesis. |
+| `language` | `str` or `None` | `None` | Language code for synthesis. |
+| `temperature` | `float` or `None` | `None` | Sampling temperature. |
+| `top_p` | `float` or `None` | `None` | Top-p nucleus sampling. |
+| `base_url` | `str` | `"https://dev.voice.ai"` | API base URL. |
+| `timeout_s` | `float` | `60.0` | HTTP timeout in seconds. |
+| `client` | `httpx.AsyncClient` or `None` | `None` | Optional pre-configured HTTP client. |
+
+## Voice IDs
+Use the Voice.ai voices list endpoint to discover `voice_id` values:
+- `GET /api/v1/tts/voice/list`
+
+## Dependencies
+- `vision-agents`
+- `httpx`
+- `av`
diff --git a/plugins/voice_ai/example/README.md b/plugins/voice_ai/example/README.md
@@ -0,0 +1,11 @@
+# Voice.ai TTS Example
+
+Run the example with:
+
+```sh
+uv run python voice_ai_tts_example.py
+```
+
+Environment variables required:
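+- `VOICE_AI_API_KEY` and `VOICE_AI_VOICE_ID` (Voice.ai TTS)
+- `DEEPGRAM_API_KEY`, `STREAM_API_KEY`, `STREAM_API_SECRET`, and `OPENAI_API_KEY` (STT, edge, and LLM used by the example)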
diff --git a/plugins/voice_ai/example/__init__.py b/plugins/voice_ai/example/__init__.py
diff --git a/plugins/voice_ai/example/pyproject.toml b/plugins/voice_ai/example/pyproject.toml
@@ -0,0 +1,22 @@
+[project]
+name = "voiceai-tts-example"
+version = "0.0.0"
+requires-python = ">=3.10"
+# The Voice.ai plugin's project name is "vision-agents-plugins-voice-ai" (hyphenated).
+dependencies = [
+  "python-dotenv>=1.0",
+  "vision-agents-plugins-deepgram",
+  "vision-agents-plugins-voice-ai",
+  "vision-agents-plugins-getstream",
+  "vision-agents-plugins-gemini",
+  "vision-agents-plugins-openai",
+  "vision-agents",
+]
+# Resolve the plugins and the core package from local paths (editable installs).
+[tool.uv.sources]
+"vision-agents-plugins-voice-ai" = {path = "..", editable=true}
+"vision-agents-plugins-deepgram" = {path = "../../deepgram", editable=true}
+"vision-agents-plugins-getstream" = {path = "../../getstream", editable=true}
+"vision-agents-plugins-gemini" = {path = "../../gemini", editable=true}
+"vision-agents-plugins-openai" = {path = "../../openai", editable=true}
+"vision-agents" = {path = "../../../agents-core", editable=true}
diff --git a/plugins/voice_ai/example/uv.lock b/plugins/voice_ai/example/uv.lock
diff --git a/plugins/voice_ai/example/voice_ai_tts_example.py b/plugins/voice_ai/example/voice_ai_tts_example.py
@@ -0,0 +1,65 @@
+"""
+Voice.ai TTS Example
+
+This example demonstrates Voice.ai TTS integration with Vision Agents.
+
+This example creates an agent that uses:
+- Voice.ai for text-to-speech (TTS)
+- Deepgram for speech-to-text (STT)
+- GetStream for edge/real-time communication
+- OpenAI for LLM
+
+Requirements:
+- VOICE_AI_API_KEY and VOICE_AI_VOICE_ID environment variables
+- DEEPGRAM_API_KEY environment variable
+- STREAM_API_KEY and STREAM_API_SECRET environment variables
+- OPENAI_API_KEY environment variable (for the OpenAI LLM)
+"""
+
+import asyncio
+import logging
+
+from dotenv import load_dotenv
+from vision_agents.core import Agent, Runner, User
+from vision_agents.core.agents import AgentLauncher
+from vision_agents.plugins import deepgram, getstream, openai, voice_ai
+
+logger = logging.getLogger(__name__)  # module-level logger, used by join_call
+
+load_dotenv()  # read a local .env file (if present) into the process environment
+
+
+async def create_agent(**kwargs) -> Agent:
+    """Create the agent with Deepgram TTS and STT."""
+    agent = Agent(
+        edge=getstream.Edge(),
+        agent_user=User(name="Deepgram Agent", id="agent"),
+        instructions="You're a helpful voice AI assistant. Keep replies short and conversational.",
+        tts=voice_ai.TTS(),
+        stt=deepgram.STT(),  # Uses Deepgram Flux for speech-to-text
+        # llm=gemini.LLM("gemini-2.0-flash"),
+        llm=openai.LLM(model="gpt-4o-mini"),
+    )
+    return agent
+
+
+async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> None:
+    """Join the call and start the agent."""
+    # Ensure the agent user is created
+    await agent.create_user()
+    # Create a call
+    call = await agent.create_call(call_type, call_id)
+
+    # Have the agent join the call/room
+    async with agent.join(call):
+        logger.info("Joining call")
+        logger.info("LLM ready")
+
+        await asyncio.sleep(5)
+        await agent.llm.simple_response(text="Hello! How can I help you today?")
+
+        await agent.finish()  # Run till the call ends
+
+
+if __name__ == "__main__":  # script entry point: start the Runner CLI with this example's callbacks
+    Runner(AgentLauncher(create_agent=create_agent, join_call=join_call)).cli()
diff --git a/plugins/voice_ai/py.typed b/plugins/voice_ai/py.typed
diff --git a/plugins/voice_ai/pyproject.toml b/plugins/voice_ai/pyproject.toml
@@ -0,0 +1,42 @@
+[build-system]
+requires = ["hatchling", "hatch-vcs"]  # hatch-vcs backs the VCS version source configured below
+build-backend = "hatchling.build"
+
+[project]
+name = "vision-agents-plugins-voice-ai"
+dynamic = ["version"]  # no static version; provided via [tool.hatch.version]
+description = "Voice.ai TTS integration for Vision Agents"
+readme = "README.md"
+keywords = ["voice.ai", "TTS", "text-to-speech", "AI", "voice agents", "agents"]
+requires-python = ">=3.10"
+license = "MIT"
+dependencies = [
+    "vision-agents",
+    "httpx>=0.27.0",
+    "av>=10.0.0",
+]
+
+[project.urls]
+Documentation = "https://visionagents.ai/"
+Website = "https://visionagents.ai/"
+Source = "https://github.com/GetStream/Vision-Agents"
+
+[tool.hatch.version]
+source = "vcs"
+raw-options = { root = "..", search_parent_directories = true, fallback_version = "0.0.0" }
+
+[tool.hatch.build.targets.wheel]
+packages = [".", "vision_agents"]
+
+[tool.hatch.build.targets.sdist]
+include = ["/vision_agents"]
+
+[tool.uv.sources]
+vision-agents = { workspace = true }  # resolved from the uv workspace, not a package index
+
+[dependency-groups]
+dev = [
+    "pytest>=8.4.1",
+    "pytest-asyncio>=1.0.0",
+    "python-dotenv>=1.0.1",
+]
diff --git a/plugins/voice_ai/tests/test_tts.py b/plugins/voice_ai/tests/test_tts.py
@@ -0,0 +1,38 @@
+import pytest
+from dotenv import load_dotenv
+
+from vision_agents.core.tts.manual_test import manual_tts_to_wav
+from vision_agents.core.tts.testing import TTSSession
+from vision_agents.plugins import voice_ai
+
+load_dotenv()
+
+
+class TestVoiceAiTTS:
+    """Integration tests for Voice.ai TTS."""
+
+    @pytest.fixture
+    async def tts(self) -> voice_ai.TTS:
+        tts = voice_ai.TTS()
+        try:
+            yield tts
+        finally:
+            await tts.close()
+
+    @pytest.mark.integration
+    async def test_voice_ai_tts_convert_text_to_audio(self, tts: voice_ai.TTS):
+        tts.set_output_format(sample_rate=16000, channels=1)
+        session = TTSSession(tts)
+        text = "Hello from Voice.ai."
+
+        await tts.send(text)
+        await session.wait_for_result(timeout=15.0)
+
+        assert not session.errors
+        assert len(session.speeches) > 0
+
+    @pytest.mark.integration
+    async def test_voice_ai_tts_convert_text_to_audio_manual_test(
+        self, tts: voice_ai.TTS
+    ):
+        await manual_tts_to_wav(tts, sample_rate=48000, channels=2)
diff --git a/plugins/voice_ai/vision_agents/plugins/voice_ai/__init__.py b/plugins/voice_ai/vision_agents/plugins/voice_ai/__init__.py
@@ -0,0 +1,6 @@
+from .tts import TTS
+
+# Extend this package's module search path (pkgutil-style namespace package support)
+__path__ = __import__("pkgutil").extend_path(__path__, __name__)
+
+__all__ = ["TTS"]