Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions backend/.env.example
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
INWORLD_API_KEY=
ASSEMBLY_AI_API_KEY=

SUPABASE_URL=
SUPABASE_SECRET_KEY=
SUPABASE_SECRET_KEY=
11 changes: 0 additions & 11 deletions backend/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

66 changes: 31 additions & 35 deletions backend/src/config/server.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,31 +5,29 @@
* Environment variables can override defaults where appropriate.
*/

export interface AssemblyAITurnDetectionSettings {
endOfTurnConfidenceThreshold: number;
minEndOfTurnSilenceWhenConfident: number;
maxTurnSilence: number;
/**
 * One Inworld STT VAD preset. The three numeric fields are forwarded
 * unchanged to the InworldSTTNode config when the conversation graph is built.
 */
export interface InworldSTTSettings {
// NOTE(review): presumably the silence duration (ms) that ends a turn; the
// presets use larger values for less eager levels. Confirm against InworldSTTNode.
silenceThresholdMs: number;
// NOTE(review): presumably the minimum speech duration (ms) before audio counts
// as a turn. Confirm against InworldSTTNode.
minSpeechMs: number;
// NOTE(review): presumably the energy level below which audio is treated as
// silence; every preset uses the same value. Units/scale not visible here.
silenceEnergyThreshold: number;
// Human-readable summary of the preset.
description: string;
}

export type AssemblyAIEagerness = 'low' | 'medium' | 'high';
// Eagerness level; used as the key into inworldSTTPresets and configured via
// the INWORLD_STT_EAGERNESS environment variable (default 'high').
export type InworldSTTEagerness = 'low' | 'medium' | 'high';

/**
* AssemblyAI turn detection presets based on their documentation
* @see https://www.assemblyai.com/docs/speech-to-text/universal-streaming/turn-detection
* Inworld STT VAD presets controlling how eagerly the system ends a turn.
* They keep the low/medium/high levels of the former AssemblyAI turn-detection
* presets; the level is now selected via INWORLD_STT_EAGERNESS, which replaces
* ASSEMBLY_AI_EAGERNESS.
*/
const assemblyAIPresets: Record<
AssemblyAIEagerness,
AssemblyAITurnDetectionSettings
> = {
const inworldSTTPresets: Record<InworldSTTEagerness, InworldSTTSettings> = {
/**
* Aggressive - Quick responses for rapid back-and-forth
* Use cases: Agent Assist, IVR replacements, Retail/E-commerce, Telecom
*/
high: {
endOfTurnConfidenceThreshold: 0.4,
minEndOfTurnSilenceWhenConfident: 160,
maxTurnSilence: 400,
silenceThresholdMs: 400,
minSpeechMs: 100,
silenceEnergyThreshold: 0.01,
description:
'Aggressive - Quick responses for rapid back-and-forth (IVR, order confirmations)',
},
Expand All @@ -39,9 +37,9 @@ const assemblyAIPresets: Record<
* Use cases: Customer Support, Tech Support, Financial Services, Travel
*/
medium: {
endOfTurnConfidenceThreshold: 0.4,
minEndOfTurnSilenceWhenConfident: 400,
maxTurnSilence: 1280,
silenceThresholdMs: 700,
minSpeechMs: 150,
silenceEnergyThreshold: 0.01,
description:
'Balanced - Natural middle ground for most conversational turns',
},
Expand All @@ -51,9 +49,9 @@ const assemblyAIPresets: Record<
* Use cases: Healthcare, Mental Health, Sales, Legal, Language Learning
*/
low: {
endOfTurnConfidenceThreshold: 0.7,
minEndOfTurnSilenceWhenConfident: 800,
maxTurnSilence: 3600,
silenceThresholdMs: 1000,
minSpeechMs: 200,
silenceEnergyThreshold: 0.01,
description:
'Conservative - Patient, allows thinking pauses (Language Learning, Healthcare)',
},
Expand All @@ -76,14 +74,12 @@ export const serverConfig = {
},

/**
* AssemblyAI speech-to-text configuration
* Inworld STT configuration
*/
assemblyAI: {
/** Turn detection eagerness level */
eagerness: (process.env.ASSEMBLY_AI_EAGERNESS ||
'high') as AssemblyAIEagerness,
/** Format turns in output (typically false for real-time processing) */
formatTurns: false,
inworldSTT: {
/** VAD eagerness level */
eagerness: (process.env.INWORLD_STT_EAGERNESS ||
'high') as InworldSTTEagerness,
},
Comment on lines +79 to 83
Copy link

Copilot AI Mar 12, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

INWORLD_STT_EAGERNESS is cast to InworldSTTEagerness without validation, so an invalid env value will make getInworldSTTSettings() return undefined and cause downstream runtime errors. Consider validating against {'low','medium','high'} (fallback to 'high' and log a warning) before indexing into inworldSTTPresets.

Copilot uses AI. Check for mistakes.

/**
Expand All @@ -96,18 +92,18 @@ export const serverConfig = {
} as const;

/**
* Get AssemblyAI turn detection settings for the configured eagerness level
* Get Inworld STT VAD settings for the configured eagerness level
*/
export function getAssemblyAISettings(): AssemblyAITurnDetectionSettings {
return assemblyAIPresets[serverConfig.assemblyAI.eagerness];
export function getInworldSTTSettings(): InworldSTTSettings {
  // The configured level originates from the INWORLD_STT_EAGERNESS environment
  // variable via an unchecked cast, so at runtime it can be any string.
  const configured: string = serverConfig.inworldSTT.eagerness;

  // Own-property check (not `in`) so inherited keys such as "toString" are
  // rejected as well.
  if (!Object.prototype.hasOwnProperty.call(inworldSTTPresets, configured)) {
    // Fall back to the same level used when the variable is unset, rather than
    // returning undefined and failing later when the settings are read.
    console.warn(
      `Invalid INWORLD_STT_EAGERNESS value "${configured}"; expected 'low', 'medium' or 'high'. Falling back to 'high'.`
    );
    return inworldSTTPresets.high;
  }

  return inworldSTTPresets[configured as InworldSTTEagerness];
}

/**
* Get AssemblyAI turn detection settings for a specific eagerness level
* Get Inworld STT VAD settings for a specific eagerness level
* @param eagerness - The eagerness level ('low' | 'medium' | 'high')
*/
export function getAssemblyAISettingsForEagerness(
eagerness: AssemblyAIEagerness
): AssemblyAITurnDetectionSettings {
return assemblyAIPresets[eagerness];
export function getInworldSTTSettingsForEagerness(
  eagerness: InworldSTTEagerness
): InworldSTTSettings {
  // Direct preset lookup; the parameter type restricts callers to valid keys.
  const preset = inworldSTTPresets[eagerness];
  return preset;
}
53 changes: 25 additions & 28 deletions backend/src/graphs/conversation-graph.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@
* Conversation Graph for Language Learning App - Inworld Runtime 0.9
*
* This is a long-running circular graph that:
* - Processes continuous audio streams via AssemblyAI STT with built-in VAD
* - Processes continuous audio streams via Inworld STT with energy-based VAD
* - Queues interactions for sequential processing
* - Uses language-specific prompts and TTS voices
* - Loops back for the next interaction automatically
*
* Graph Flow:
* AudioInput → AssemblyAI STT (loop) → TranscriptExtractor → InteractionQueue
* AudioInput → Inworld STT (loop) → TranscriptExtractor → InteractionQueue
* → TextInput → DialogPromptBuilder → LLM → TextChunking → TTSRequestBuilder → TTS
* → TextAggregator → StateUpdate → (loop back to InteractionQueue)
*/
Expand All @@ -23,7 +23,7 @@ import {
TextAggregatorNode,
} from '@inworld/runtime/graph';

import { AssemblyAISTTWebSocketNode } from './nodes/assembly-ai-stt-ws-node.js';
import { InworldSTTNode } from './nodes/inworld-stt-node.js';
import { DialogPromptBuilderNode } from './nodes/dialog-prompt-builder-node.js';
import { InteractionQueueNode } from './nodes/interaction-queue-node.js';
import { MemoryRetrievalNode } from './nodes/memory-retrieval-node.js';
Expand All @@ -37,33 +37,33 @@ import {
DEFAULT_LANGUAGE_CODE,
} from '../config/languages.js';
import { llmConfig } from '../config/llm.js';
import { serverConfig, getAssemblyAISettings } from '../config/server.js';
import { serverConfig, getInworldSTTSettings } from '../config/server.js';
import { graphLogger as logger } from '../utils/logger.js';

export interface ConversationGraphConfig {
assemblyAIApiKey: string;
inworldApiKey: string;
connections: ConnectionsMap;
defaultLanguageCode?: string;
}

/**
* Wrapper class for the conversation graph
* Provides access to the graph and the AssemblyAI node for session management
* Provides access to the graph and the Inworld STT node for session management
*/
export class ConversationGraphWrapper {
graph: Graph;
assemblyAINode: AssemblyAISTTWebSocketNode;
inworldSTTNode: InworldSTTNode;

private constructor(params: {
graph: Graph;
assemblyAINode: AssemblyAISTTWebSocketNode;
inworldSTTNode: InworldSTTNode;
}) {
this.graph = params.graph;
this.assemblyAINode = params.assemblyAINode;
this.inworldSTTNode = params.inworldSTTNode;
}

async destroy(): Promise<void> {
await this.assemblyAINode.destroy();
await this.inworldSTTNode.destroy();
await this.graph.stop();
}

Expand All @@ -73,7 +73,7 @@ export class ConversationGraphWrapper {
static create(config: ConversationGraphConfig): ConversationGraphWrapper {
const {
connections,
assemblyAIApiKey,
inworldApiKey,
defaultLanguageCode = DEFAULT_LANGUAGE_CODE,
} = config;
// Use provided language code or default to Spanish
Expand All @@ -92,20 +92,17 @@ export class ConversationGraphWrapper {
// Start node (audio input proxy)
const audioInputNode = new ProxyNode({ id: `audio-input-proxy${postfix}` });

// AssemblyAI STT with built-in VAD (always uses multilingual model)
const turnDetectionSettings = getAssemblyAISettings();
const assemblyAISTTNode = new AssemblyAISTTWebSocketNode({
id: `assembly-ai-stt-ws-node${postfix}`,
// Inworld STT with energy-based VAD
const sttSettings = getInworldSTTSettings();
const inworldSTTNode = new InworldSTTNode({
id: `inworld-stt-node${postfix}`,
config: {
apiKey: assemblyAIApiKey,
apiKey: inworldApiKey,
connections: connections,
sampleRate: serverConfig.audio.inputSampleRate,
formatTurns: serverConfig.assemblyAI.formatTurns,
endOfTurnConfidenceThreshold:
turnDetectionSettings.endOfTurnConfidenceThreshold,
minEndOfTurnSilenceWhenConfident:
turnDetectionSettings.minEndOfTurnSilenceWhenConfident,
maxTurnSilence: turnDetectionSettings.maxTurnSilence,
silenceThresholdMs: sttSettings.silenceThresholdMs,
minSpeechMs: sttSettings.minSpeechMs,
silenceEnergyThreshold: sttSettings.silenceEnergyThreshold,
},
});

Expand Down Expand Up @@ -190,7 +187,7 @@ export class ConversationGraphWrapper {
graphBuilder
// Add all nodes
.addNode(audioInputNode)
.addNode(assemblyAISTTNode)
.addNode(inworldSTTNode)
.addNode(transcriptExtractorNode)
.addNode(interactionQueueNode)
.addNode(textInputNode)
Expand All @@ -206,10 +203,10 @@ export class ConversationGraphWrapper {
// ============================================================
// Audio Input Flow (STT with VAD)
// ============================================================
.addEdge(audioInputNode, assemblyAISTTNode)
.addEdge(audioInputNode, inworldSTTNode)

// AssemblyAI loops back to itself while stream is active
.addEdge(assemblyAISTTNode, assemblyAISTTNode, {
// Inworld STT loops back to itself while stream is active
.addEdge(inworldSTTNode, inworldSTTNode, {
condition: async (input: unknown) => {
const data = input as { stream_exhausted?: boolean };
return data?.stream_exhausted !== true;
Expand All @@ -219,7 +216,7 @@ export class ConversationGraphWrapper {
})

// When interaction is complete, extract transcript
.addEdge(assemblyAISTTNode, transcriptExtractorNode, {
.addEdge(inworldSTTNode, transcriptExtractorNode, {
condition: async (input: unknown) => {
const data = input as { interaction_complete?: boolean };
return data?.interaction_complete === true;
Expand Down Expand Up @@ -283,7 +280,7 @@ export class ConversationGraphWrapper {

return new ConversationGraphWrapper({
graph,
assemblyAINode: assemblyAISTTNode,
inworldSTTNode,
});
}
}
Expand Down
Loading
Loading