Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions plugins/voice_ai/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Voice.ai
Voice.ai text-to-speech (TTS) integration for Vision Agents.

## Features
- Low-latency HTTP streaming TTS via Voice.ai.
- PCM output ready for Vision Agents audio pipeline.
- Configurable voice, model, language, temperature, and top-p.

## Installation
```sh
uv add vision-agents-plugins-voice-ai
```

## Usage
```python
from vision_agents.plugins import voice_ai

# Requires VOICE_AI_API_KEY and VOICE_AI_VOICE_ID in the environment
tts = voice_ai.TTS()
```

## Configuration
| Name | Type | Default | Description |
| --- | --- | --- | --- |
| `api_key` | `str` or `None` | `None` | Voice.ai API key. Falls back to `VOICE_AI_API_KEY`. |
| `voice_id` | `str` or `None` | `None` | Voice ID to use. Falls back to `VOICE_AI_VOICE_ID`. |
| `audio_format` | `str` | `"pcm"` | Output format. `pcm` streams and yields chunks; `wav`/`mp3` are decoded after download. |
| `model` | `str` or `None` | `None` | Model ID to use for synthesis. |
| `language` | `str` or `None` | `None` | Language code for synthesis. |
| `temperature` | `float` or `None` | `None` | Sampling temperature. |
| `top_p` | `float` or `None` | `None` | Top-p nucleus sampling. |
| `base_url` | `str` | `"https://dev.voice.ai"` | API base URL. |
| `timeout_s` | `float` | `60.0` | HTTP timeout in seconds. |
| `client` | `httpx.AsyncClient` or `None` | `None` | Optional pre-configured HTTP client. |

## Voice IDs
Use the Voice.ai voices list endpoint to discover `voice_id` values:
- `GET /api/v1/tts/voice/list`

## Dependencies
- `vision-agents`
- `httpx`
- `av`
11 changes: 11 additions & 0 deletions plugins/voice_ai/example/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Voice.ai TTS Example

Run the example with:

```sh
uv run python voice_ai_tts_example.py
```

Environment variables required:
- `VOICE_AI_API_KEY`
- `VOICE_AI_VOICE_ID`
Empty file.
22 changes: 22 additions & 0 deletions plugins/voice_ai/example/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Example project for the Voice.ai TTS plugin.
[project]
name = "voiceai-tts-example"
version = "0.0.0"
requires-python = ">=3.10"

dependencies = [
    "python-dotenv>=1.0",
    "vision-agents-plugins-deepgram",
    # Spelled with hyphens to match the name declared by the plugin's own pyproject.toml.
    "vision-agents-plugins-voice-ai",
    "vision-agents-plugins-getstream",
    "vision-agents-plugins-gemini",
    "vision-agents-plugins-openai",
    "vision-agents",
]

# Resolve every Vision Agents package from the local checkout (editable installs)
# instead of a package index.
[tool.uv.sources]
"vision-agents-plugins-voice-ai" = {path = "..", editable=true}
"vision-agents-plugins-deepgram" = {path = "../../deepgram", editable=true}
"vision-agents-plugins-getstream" = {path = "../../getstream", editable=true}
"vision-agents-plugins-gemini" = {path = "../../gemini", editable=true}
"vision-agents-plugins-openai" = {path = "../../openai", editable=true}
"vision-agents" = {path = "../../../agents-core", editable=true}
3,050 changes: 3,050 additions & 0 deletions plugins/voice_ai/example/uv.lock

Large diffs are not rendered by default.

65 changes: 65 additions & 0 deletions plugins/voice_ai/example/voice_ai_tts_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
"""
Voice.ai TTS Example

This example demonstrates Voice.ai TTS integration with Vision Agents.

This example creates an agent that uses:
- Voice.ai for text-to-speech (TTS)
- Deepgram for speech-to-text (STT)
- GetStream for edge/real-time communication
- OpenAI for LLM (Gemini is left in the code as a commented-out alternative)

Requirements:
- VOICE_AI_API_KEY and VOICE_AI_VOICE_ID environment variables
- DEEPGRAM_API_KEY environment variable
- STREAM_API_KEY and STREAM_API_SECRET environment variables
- OPENAI_API_KEY environment variable (for the OpenAI LLM used by default)
- GOOGLE_API_KEY environment variable (only if you switch the LLM to Gemini)
"""

import asyncio
import logging

from dotenv import load_dotenv
from vision_agents.core import Agent, Runner, User
from vision_agents.core.agents import AgentLauncher
from vision_agents.plugins import deepgram, getstream, openai, voice_ai

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Load variables from a local .env file (if present) into the environment at
# import time, before any plugin is constructed.
load_dotenv()


async def create_agent(**kwargs) -> Agent:
    """Create the example agent: Voice.ai TTS, Deepgram STT, and an OpenAI LLM.

    Args:
        **kwargs: Accepted but not used by this example.

    Returns:
        The configured ``Agent`` instance.
    """
    agent = Agent(
        edge=getstream.Edge(),
        agent_user=User(name="Voice.ai Agent", id="agent"),
        instructions="You're a helpful voice AI assistant. Keep replies short and conversational.",
        tts=voice_ai.TTS(),  # Needs VOICE_AI_API_KEY and VOICE_AI_VOICE_ID in the environment
        stt=deepgram.STT(),  # Uses Deepgram Flux for speech-to-text
        # Alternative LLM (requires importing the gemini plugin):
        # llm=gemini.LLM("gemini-2.0-flash"),
        llm=openai.LLM(model="gpt-4o-mini"),
    )
    return agent


async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> None:
    """Bring the agent into a call, send a greeting, and stay until it ends.

    Args:
        agent: The agent to connect.
        call_type: Call type passed through to ``agent.create_call``.
        call_id: Call identifier passed through to ``agent.create_call``.
        **kwargs: Accepted but not used by this example.
    """
    # The agent's user is created first, then the call it will join.
    await agent.create_user()
    session_call = await agent.create_call(call_type, call_id)

    # Everything below runs while the agent is joined to the call/room.
    async with agent.join(session_call):
        logger.info("Joining call")
        logger.info("LLM ready")

        # Short pause before the scripted greeting.
        await asyncio.sleep(5)
        await agent.llm.simple_response(text="Hello! How can I help you today?")

        # Run till the call ends
        await agent.finish()


if __name__ == "__main__":
    # Wire the two coroutines above into a launcher and hand control to the CLI runner.
    launcher = AgentLauncher(create_agent=create_agent, join_call=join_call)
    Runner(launcher).cli()
Empty file added plugins/voice_ai/py.typed
Empty file.
42 changes: 42 additions & 0 deletions plugins/voice_ai/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Build with hatchling; hatch-vcs supplies the version (see [tool.hatch.version]).
[build-system]
requires = ["hatchling", "hatch-vcs"]
build-backend = "hatchling.build"

[project]
name = "vision-agents-plugins-voice-ai"
# The version is not hard-coded here; it is computed at build time.
dynamic = ["version"]
description = "Voice.ai TTS integration for Vision Agents"
readme = "README.md"
keywords = ["voice.ai", "TTS", "text-to-speech", "AI", "voice agents", "agents"]
requires-python = ">=3.10"
license = "MIT"
# Runtime dependencies of the plugin.
dependencies = [
"vision-agents",
"httpx>=0.27.0",
"av>=10.0.0",
]

[project.urls]
Documentation = "https://visionagents.ai/"
Website = "https://visionagents.ai/"
Source = "https://github.com/GetStream/Vision-Agents"

# Derive the version from version control, searching parent directories for the
# repository root and falling back to 0.0.0 when no version can be determined.
[tool.hatch.version]
source = "vcs"
raw-options = { root = "..", search_parent_directories = true, fallback_version = "0.0.0" }

[tool.hatch.build.targets.wheel]
packages = [".", "vision_agents"]

[tool.hatch.build.targets.sdist]
include = ["/vision_agents"]

# Resolve vision-agents from the uv workspace.
[tool.uv.sources]
vision-agents = { workspace = true }

# Development-only dependencies (test tooling).
[dependency-groups]
dev = [
"pytest>=8.4.1",
"pytest-asyncio>=1.0.0",
"python-dotenv>=1.0.1",
]
38 changes: 38 additions & 0 deletions plugins/voice_ai/tests/test_tts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
from collections.abc import AsyncIterator

import pytest
from dotenv import load_dotenv

from vision_agents.core.tts.manual_test import manual_tts_to_wav
from vision_agents.core.tts.testing import TTSSession
from vision_agents.plugins import voice_ai

load_dotenv()


class TestVoiceAiTTS:
    """Integration tests for Voice.ai TTS."""

    @pytest.fixture
    async def tts(self) -> AsyncIterator[voice_ai.TTS]:
        """Yield a Voice.ai TTS instance and close it once the test is done.

        The function contains ``yield``, so it is an async generator; the
        return annotation describes the yielded value accordingly.
        """
        tts = voice_ai.TTS()
        try:
            yield tts
        finally:
            # Always release the client, even when the test body fails.
            await tts.close()

    @pytest.mark.integration
    async def test_voice_ai_tts_convert_text_to_audio(self, tts: voice_ai.TTS):
        """Send a short sentence and expect at least one speech result and no errors."""
        tts.set_output_format(sample_rate=16000, channels=1)
        session = TTSSession(tts)
        text = "Hello from Voice.ai."

        await tts.send(text)
        await session.wait_for_result(timeout=15.0)

        assert not session.errors
        assert len(session.speeches) > 0

    @pytest.mark.integration
    async def test_voice_ai_tts_convert_text_to_audio_manual_test(
        self, tts: voice_ai.TTS
    ):
        """Run the shared manual WAV helper at 48 kHz with two channels."""
        await manual_tts_to_wav(tts, sample_rate=48000, channels=2)
6 changes: 6 additions & 0 deletions plugins/voice_ai/vision_agents/plugins/voice_ai/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Public entry point of the plugin: the TTS class defined in the sibling tts module.
from .tts import TTS

# pkgutil.extend_path adds same-named package directories found on sys.path to
# this package's __path__, so the package can be spread across several
# distributions. __import__ is used inline so no extra module-level name is bound.
__path__ = __import__("pkgutil").extend_path(__path__, __name__)

__all__ = ["TTS"]
Loading
Loading