From d09bf48ee2d53de04bab7470f69b2de94897f210 Mon Sep 17 00:00:00 2001 From: Cale Shapera <25466659+cshape@users.noreply.github.com> Date: Fri, 20 Feb 2026 14:11:06 -0800 Subject: [PATCH 01/16] feat: add support for soniox STT --- .gitignore | 1 + backend/.env.example | 3 + backend/package-lock.json | 11 - backend/src/config/server.ts | 54 ++ .../configs/flashcard-generation-graph.json | 91 --- .../lang-learning-conversation-graph.json | 289 -------- .../configs/response-feedback-graph.json | 91 --- backend/src/graphs/conversation-graph.ts | 100 ++- .../graphs/nodes/assembly-ai-stt-ws-node.ts | 3 +- .../src/graphs/nodes/soniox-stt-ws-node.ts | 692 ++++++++++++++++++ backend/src/graphs/nodes/stt-node.ts | 9 + backend/src/helpers/connection-manager.ts | 6 +- backend/src/server.ts | 3 +- backend/src/services/graph-service.ts | 30 +- frontend/package-lock.json | 16 - render.yaml | 4 + 16 files changed, 858 insertions(+), 545 deletions(-) delete mode 100644 backend/src/graphs/configs/flashcard-generation-graph.json delete mode 100644 backend/src/graphs/configs/lang-learning-conversation-graph.json delete mode 100644 backend/src/graphs/configs/response-feedback-graph.json create mode 100644 backend/src/graphs/nodes/soniox-stt-ws-node.ts create mode 100644 backend/src/graphs/nodes/stt-node.ts diff --git a/.gitignore b/.gitignore index 6c4e867..c54347b 100644 --- a/.gitignore +++ b/.gitignore @@ -140,6 +140,7 @@ vite.config.ts.timestamp-* # Project specific backend/audio/ +backend/src/graphs/configs/ .DS_Store CLAUDE.md templates/ diff --git a/backend/.env.example b/backend/.env.example index 2488c28..0cfaf91 100644 --- a/backend/.env.example +++ b/backend/.env.example @@ -1,5 +1,8 @@ INWORLD_API_KEY= + +STT_PROVIDER=assembly ASSEMBLY_AI_API_KEY= +SONIOX_API_KEY= SUPABASE_URL= SUPABASE_SECRET_KEY= \ No newline at end of file diff --git a/backend/package-lock.json b/backend/package-lock.json index 51c3e72..0672fc4 100644 --- a/backend/package-lock.json +++ 
b/backend/package-lock.json @@ -1695,7 +1695,6 @@ "integrity": "sha512-tK3GPFWbirvNgsNKto+UmB/cRtn6TZfyw0D6IKrW55n6Vbs7KJoZtI//kpTKzE/DUmmnAFD8/Ca46s7Obs92/w==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "@typescript-eslint/scope-manager": "8.46.4", "@typescript-eslint/types": "8.46.4", @@ -2191,7 +2190,6 @@ "integrity": "sha512-NZyJarBfL7nWwIq+FDL6Zp/yHEhePMNnnJ0y3qfieCrmNvYct8uvtiV41UvlSe6apAfk0fY1FbWx+NwfmpvtTg==", "dev": true, "license": "MIT", - "peer": true, "bin": { "acorn": "bin/acorn" }, @@ -3222,7 +3220,6 @@ "integrity": "sha512-BhHmn2yNOFA9H9JmmIVKJmd288g9hrVRDkdoIgRCRuSySRUHH7r/DI6aAXW9T1WwUuY3DFgrcaqB+deURBLR5g==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "@eslint-community/eslint-utils": "^4.8.0", "@eslint-community/regexpp": "^4.12.1", @@ -3283,7 +3280,6 @@ "integrity": "sha512-iI1f+D2ViGn+uvv5HuHVUamg8ll4tN+JRHGc6IJi4TP9Kl976C57fzPXgseXNs8v0iA8aSJpHsTWjDb9QJamGQ==", "dev": true, "license": "MIT", - "peer": true, "bin": { "eslint-config-prettier": "bin/cli.js" }, @@ -5343,7 +5339,6 @@ "integrity": "sha512-I7AIg5boAr5R0FFtJ6rCfD+LFsWHp81dolrFD8S79U9tb8Az2nGrJncnMSnys+bpQJfRUzqs9hnA81OAA3hCuQ==", "dev": true, "license": "MIT", - "peer": true, "bin": { "prettier": "bin/prettier.cjs" }, @@ -6293,7 +6288,6 @@ "integrity": "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==", "dev": true, "license": "MIT", - "peer": true, "engines": { "node": ">=12" }, @@ -6394,7 +6388,6 @@ "integrity": "sha512-ytQKuwgmrrkDTFP4LjR0ToE2nqgy886GpvRSpU0JAnrdBYppuY5rLkRUYPU1yCryb24SsKBTL/hlDQAEFVwtZg==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "esbuild": "~0.25.0", "get-tsconfig": "^4.7.5" @@ -6455,7 +6448,6 @@ "integrity": "sha512-CWBzXQrc/qOkhidw1OzBTQuYRbfyxDXJMVJ1XNwUHGROVmuaeiEm3OslpZ1RV96d7SKKjZKrSJu3+t/xlw3R9A==", "dev": true, "license": "Apache-2.0", - "peer": true, "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" @@ -6573,7 +6565,6 @@ "integrity": 
"sha512-w+N7Hifpc3gRjZ63vYBXA56dvvRlNWRczTdmCBBa+CotUzAPf5b7YMdMR/8CQoeYE5LX3W4wj6RYTgonm1b9DA==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "esbuild": "^0.27.0", "fdir": "^6.5.0", @@ -7151,7 +7142,6 @@ "integrity": "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==", "dev": true, "license": "MIT", - "peer": true, "engines": { "node": ">=12" }, @@ -7165,7 +7155,6 @@ "integrity": "sha512-E4t7DJ9pESL6E3I8nFjPa4xGUd3PmiWDLsDztS2qXSJWfHtbQnwAWylaBvSNY48I3vr8PTqIZlyK8TE3V3CA4Q==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "@vitest/expect": "4.0.16", "@vitest/mocker": "4.0.16", diff --git a/backend/src/config/server.ts b/backend/src/config/server.ts index aa6f9ac..dd71935 100644 --- a/backend/src/config/server.ts +++ b/backend/src/config/server.ts @@ -5,6 +5,8 @@ * Environment variables can override defaults where appropriate. */ +export type STTProvider = 'assembly' | 'soniox'; + export interface AssemblyAITurnDetectionSettings { endOfTurnConfidenceThreshold: number; minEndOfTurnSilenceWhenConfident: number; @@ -59,12 +61,46 @@ const assemblyAIPresets: Record< }, }; +export interface SonioxEndpointSettings { + maxEndpointDelayMs: number; + languageHints: string[]; + description: string; +} + +/** + * Soniox endpoint detection presets mapped to the same eagerness levels. + * max_endpoint_delay_ms controls how quickly Soniox returns endpoints (500-3000ms). 
+ * @see https://soniox.com/docs/stt/rt/endpoint-detection + */ +const sonioxPresets: Record = { + high: { + maxEndpointDelayMs: 500, + languageHints: ['en', 'es'], + description: 'Aggressive - fastest endpoint detection (500ms)', + }, + medium: { + maxEndpointDelayMs: 1000, + languageHints: ['en', 'es'], + description: 'Balanced - moderate endpoint delay (1000ms)', + }, + low: { + maxEndpointDelayMs: 2000, + languageHints: ['en', 'es'], + description: 'Conservative - patient endpoint detection (2000ms)', + }, +}; + export const serverConfig = { /** * HTTP server port */ port: Number(process.env.PORT) || 3000, + /** + * STT provider selection ('assembly' | 'soniox') + */ + sttProvider: (process.env.STT_PROVIDER || 'assembly') as STTProvider, + /** * Audio processing settings */ @@ -86,6 +122,15 @@ export const serverConfig = { formatTurns: false, }, + /** + * Soniox speech-to-text configuration + */ + soniox: { + /** Endpoint detection eagerness level (reuses the same 'low'|'medium'|'high' scale) */ + eagerness: (process.env.SONIOX_EAGERNESS || + 'high') as AssemblyAIEagerness, + }, + /** * Telemetry configuration for Inworld Runtime */ @@ -111,3 +156,12 @@ export function getAssemblyAISettingsForEagerness( ): AssemblyAITurnDetectionSettings { return assemblyAIPresets[eagerness]; } + +/** + * Get Soniox endpoint detection settings for the configured eagerness level. + * Reads SONIOX_EAGERNESS from process.env at call time (after dotenv loads). 
+ */ +export function getSonioxSettings(): SonioxEndpointSettings { + const eagerness = (process.env.SONIOX_EAGERNESS || 'high') as AssemblyAIEagerness; + return sonioxPresets[eagerness] ?? sonioxPresets.high; /* an unrecognised SONIOX_EAGERNESS value falls back to the default 'high' preset instead of returning undefined */ +} diff --git a/backend/src/graphs/configs/flashcard-generation-graph.json b/backend/src/graphs/configs/flashcard-generation-graph.json deleted file mode 100644 index dc39045..0000000 --- a/backend/src/graphs/configs/flashcard-generation-graph.json +++ /dev/null @@ -1,91 +0,0 @@ -{ - "schema_version": "1.2.2", - "main": { - "id": "flashcard-generation-graph", - "nodes": [ - { - "type": "FlashcardPromptBuilderNodeType", - "id": "flashcard-prompt-builder", - "execution_config": { - "type": "NodeExecutionConfig", - "properties": { - "report_to_client": false - } - } - }, - { - "type": "TextToChatRequestNodeType", - "id": "text-to-chat-request", - "execution_config": { - "type": "NodeExecutionConfig", - "properties": { - "report_to_client": false - } - } - }, - { - "id": "llm_node", - "type": "LLMChatNode", - "execution_config": { - "type": "LLMChatNodeExecutionConfig", - "properties": { - "llm_component_id": "llm_node_llm_component", - "text_generation_config": { - "max_new_tokens": 2500, - "max_prompt_length": 2000, - "temperature": 1, - "top_p": 1, - "repetition_penalty": 1, - "frequency_penalty": 0, - "presence_penalty": 0 - }, - "stream": false, - "report_to_client": false, - "response_format": "text" - } - } - }, - { - "type": "FlashcardParserNodeType", - "id": "flashcard-parser", - "execution_config": { - "type": "NodeExecutionConfig", - "properties": { - "report_to_client": false - } - } - } - ], - "edges": [ - { - "from_node": "flashcard-prompt-builder", - "to_node": "text-to-chat-request" - }, - { - "from_node": "text-to-chat-request", - "to_node": "llm_node" - }, - { - "from_node": "llm_node", - "to_node": "flashcard-parser" - } - ], - "end_nodes": ["flashcard-parser"], - "start_nodes": ["flashcard-prompt-builder"] - }, - "components": [ - { - "id":
"llm_node_llm_component", - "type": "LLMInterface", - "creation_config": { - "type": "RemoteLLMConfig", - "properties": { - "provider": "openai", - "model_name": "gpt-4.1-nano", - "default_config": {}, - "api_key": "{{INWORLD_API_KEY}}" - } - } - } - ] -} diff --git a/backend/src/graphs/configs/lang-learning-conversation-graph.json b/backend/src/graphs/configs/lang-learning-conversation-graph.json deleted file mode 100644 index 09b897f..0000000 --- a/backend/src/graphs/configs/lang-learning-conversation-graph.json +++ /dev/null @@ -1,289 +0,0 @@ -{ - "schema_version": "1.2.2", - "main": { - "id": "lang-learning-conversation-graph", - "nodes": [ - { - "id": "audio-input-proxy-lang-learning", - "type": "ProxyNode", - "execution_config": { - "type": "NodeExecutionConfig", - "properties": { - "report_to_client": false - } - } - }, - { - "type": "AssemblyAISTTWebSocketNodeType", - "id": "assembly-ai-stt-ws-node-lang-learning", - "execution_config": { - "type": "NodeExecutionConfig", - "properties": { - "report_to_client": false - } - } - }, - { - "type": "TranscriptExtractorNodeType", - "id": "transcript-extractor-node-lang-learning", - "execution_config": { - "type": "NodeExecutionConfig", - "properties": { - "report_to_client": true - } - } - }, - { - "type": "InteractionQueueNodeType", - "id": "interaction-queue-node-lang-learning", - "execution_config": { - "type": "NodeExecutionConfig", - "properties": { - "report_to_client": false - } - } - }, - { - "type": "TextInputNodeType", - "id": "text-input-node-lang-learning", - "execution_config": { - "type": "NodeExecutionConfig", - "properties": { - "report_to_client": true - } - } - }, - { - "type": "MemoryRetrievalNodeType", - "id": "memory-retrieval-node-lang-learning", - "execution_config": { - "type": "NodeExecutionConfig", - "properties": { - "report_to_client": false - } - } - }, - { - "type": "DialogPromptBuilderNodeType", - "id": "dialog-prompt-builder-node-lang-learning", - "execution_config": { - "type": 
"NodeExecutionConfig", - "properties": { - "report_to_client": false - } - } - }, - { - "id": "llm-node-lang-learning", - "type": "LLMChatNode", - "execution_config": { - "type": "LLMChatNodeExecutionConfig", - "properties": { - "llm_component_id": "llm-node-lang-learning_llm_component", - "text_generation_config": { - "max_new_tokens": 250, - "max_prompt_length": 2000, - "temperature": 1, - "top_p": 1, - "repetition_penalty": 1, - "frequency_penalty": 0, - "presence_penalty": 0 - }, - "stream": true, - "report_to_client": true, - "response_format": "text" - } - } - }, - { - "id": "text-chunking-node-lang-learning", - "type": "TextChunkingNode", - "execution_config": { - "type": "TextChunkingNodeExecutionConfig", - "properties": { - "report_to_client": false - } - } - }, - { - "id": "text-aggregator-node-lang-learning", - "type": "TextAggregatorNode", - "execution_config": { - "type": "NodeExecutionConfig", - "properties": { - "report_to_client": false - } - } - }, - { - "type": "TTSRequestBuilderNodeType", - "id": "tts-request-builder-node-lang-learning", - "execution_config": { - "type": "NodeExecutionConfig", - "properties": { - "report_to_client": false - } - } - }, - { - "id": "tts-node-lang-learning", - "type": "TTSNode", - "execution_config": { - "type": "TTSNodeExecutionConfig", - "properties": { - "tts_component_id": "tts-node-lang-learning_tts_component", - "voice": { - "id": "Rafael", - "language_code": "es-MX" - }, - "synthesis_config": { - "type": "inworld", - "config": { - "model_id": "inworld-tts-1.5-max", - "inference": { - "speaking_rate": 1, - "temperature": 1.1 - }, - "postprocessing": { - "sample_rate": 22050 - } - } - }, - "report_to_client": true - } - } - }, - { - "type": "StateUpdateNodeType", - "id": "state-update-node-lang-learning", - "execution_config": { - "type": "NodeExecutionConfig", - "properties": { - "report_to_client": true - } - } - } - ], - "edges": [ - { - "from_node": "audio-input-proxy-lang-learning", - "to_node": 
"assembly-ai-stt-ws-node-lang-learning" - }, - { - "from_node": "assembly-ai-stt-ws-node-lang-learning", - "to_node": "assembly-ai-stt-ws-node-lang-learning", - "condition_id": "custom-condition-from-assembly-ai-stt-ws-node-lang-learning-to-assembly-ai-stt-ws-node-lang-learning", - "optional": true, - "loop": true - }, - { - "from_node": "assembly-ai-stt-ws-node-lang-learning", - "to_node": "transcript-extractor-node-lang-learning", - "condition_id": "custom-condition-from-assembly-ai-stt-ws-node-lang-learning-to-transcript-extractor-node-lang-learning" - }, - { - "from_node": "transcript-extractor-node-lang-learning", - "to_node": "interaction-queue-node-lang-learning" - }, - { - "from_node": "interaction-queue-node-lang-learning", - "to_node": "text-input-node-lang-learning", - "condition_id": "custom-condition-from-interaction-queue-node-lang-learning-to-text-input-node-lang-learning" - }, - { - "from_node": "text-input-node-lang-learning", - "to_node": "memory-retrieval-node-lang-learning" - }, - { - "from_node": "memory-retrieval-node-lang-learning", - "to_node": "dialog-prompt-builder-node-lang-learning" - }, - { - "from_node": "text-input-node-lang-learning", - "to_node": "tts-request-builder-node-lang-learning" - }, - { - "from_node": "dialog-prompt-builder-node-lang-learning", - "to_node": "llm-node-lang-learning" - }, - { - "from_node": "llm-node-lang-learning", - "to_node": "text-chunking-node-lang-learning" - }, - { - "from_node": "llm-node-lang-learning", - "to_node": "text-aggregator-node-lang-learning" - }, - { - "from_node": "text-chunking-node-lang-learning", - "to_node": "tts-request-builder-node-lang-learning" - }, - { - "from_node": "tts-request-builder-node-lang-learning", - "to_node": "tts-node-lang-learning" - }, - { - "from_node": "text-aggregator-node-lang-learning", - "to_node": "state-update-node-lang-learning" - }, - { - "from_node": "state-update-node-lang-learning", - "to_node": "interaction-queue-node-lang-learning", - "optional": 
true, - "loop": true - } - ], - "end_nodes": ["tts-node-lang-learning"], - "start_nodes": ["audio-input-proxy-lang-learning"] - }, - "components": [ - { - "id": "llm-node-lang-learning_llm_component", - "type": "LLMInterface", - "creation_config": { - "type": "RemoteLLMConfig", - "properties": { - "provider": "openai", - "model_name": "gpt-4.1-nano", - "default_config": {}, - "api_key": "{{INWORLD_API_KEY}}" - } - } - }, - { - "id": "tts-node-lang-learning_tts_component", - "type": "TTSInterface", - "creation_config": { - "type": "RemoteTTSConfig", - "properties": { - "synthesis_config": { - "type": "inworld", - "config": { - "model_id": "inworld-tts-1.5-max", - "inference": { - "speaking_rate": 1, - "temperature": 1.1 - }, - "postprocessing": { - "sample_rate": 22050 - } - } - }, - "api_key": "{{INWORLD_API_KEY}}" - } - } - }, - { - "id": "custom-condition-from-assembly-ai-stt-ws-node-lang-learning-to-assembly-ai-stt-ws-node-lang-learning", - "type": "custom-condition-from-assembly-ai-stt-ws-node-lang-learning-to-assembly-ai-stt-ws-node-lang-learning" - }, - { - "id": "custom-condition-from-assembly-ai-stt-ws-node-lang-learning-to-transcript-extractor-node-lang-learning", - "type": "custom-condition-from-assembly-ai-stt-ws-node-lang-learning-to-transcript-extractor-node-lang-learning" - }, - { - "id": "custom-condition-from-interaction-queue-node-lang-learning-to-text-input-node-lang-learning", - "type": "custom-condition-from-interaction-queue-node-lang-learning-to-text-input-node-lang-learning" - } - ] -} diff --git a/backend/src/graphs/configs/response-feedback-graph.json b/backend/src/graphs/configs/response-feedback-graph.json deleted file mode 100644 index d694a2b..0000000 --- a/backend/src/graphs/configs/response-feedback-graph.json +++ /dev/null @@ -1,91 +0,0 @@ -{ - "schema_version": "1.2.2", - "main": { - "id": "response-feedback-graph", - "nodes": [ - { - "type": "FeedbackPromptBuilderNodeType", - "id": "feedback-prompt-builder", - "execution_config": { 
- "type": "NodeExecutionConfig", - "properties": { - "report_to_client": false - } - } - }, - { - "type": "TextToChatRequestNodeType", - "id": "text-to-chat-request", - "execution_config": { - "type": "NodeExecutionConfig", - "properties": { - "report_to_client": false - } - } - }, - { - "id": "llm-node", - "type": "LLMChatNode", - "execution_config": { - "type": "LLMChatNodeExecutionConfig", - "properties": { - "llm_component_id": "llm-node_llm_component", - "text_generation_config": { - "max_new_tokens": 100, - "max_prompt_length": 2000, - "temperature": 0.7, - "top_p": 1, - "repetition_penalty": 1, - "frequency_penalty": 0, - "presence_penalty": 0 - }, - "stream": false, - "report_to_client": false, - "response_format": "text" - } - } - }, - { - "type": "FeedbackExtractorNodeType", - "id": "feedback-extractor", - "execution_config": { - "type": "NodeExecutionConfig", - "properties": { - "report_to_client": false - } - } - } - ], - "edges": [ - { - "from_node": "feedback-prompt-builder", - "to_node": "text-to-chat-request" - }, - { - "from_node": "text-to-chat-request", - "to_node": "llm-node" - }, - { - "from_node": "llm-node", - "to_node": "feedback-extractor" - } - ], - "end_nodes": ["feedback-extractor"], - "start_nodes": ["feedback-prompt-builder"] - }, - "components": [ - { - "id": "llm-node_llm_component", - "type": "LLMInterface", - "creation_config": { - "type": "RemoteLLMConfig", - "properties": { - "provider": "openai", - "model_name": "gpt-4.1-nano", - "default_config": {}, - "api_key": "{{INWORLD_API_KEY}}" - } - } - } - ] -} diff --git a/backend/src/graphs/conversation-graph.ts b/backend/src/graphs/conversation-graph.ts index 1c4f578..938392f 100644 --- a/backend/src/graphs/conversation-graph.ts +++ b/backend/src/graphs/conversation-graph.ts @@ -2,13 +2,13 @@ * Conversation Graph for Language Learning App - Inworld Runtime 0.9 * * This is a long-running circular graph that: - * - Processes continuous audio streams via AssemblyAI STT with built-in 
VAD + * - Processes continuous audio streams via STT (AssemblyAI or Soniox) with built-in VAD * - Queues interactions for sequential processing * - Uses language-specific prompts and TTS voices * - Loops back for the next interaction automatically * * Graph Flow: - * AudioInput → AssemblyAI STT (loop) → TranscriptExtractor → InteractionQueue + * AudioInput → STT (loop) → TranscriptExtractor → InteractionQueue * → TextInput → DialogPromptBuilder → LLM → TextChunking → TTSRequestBuilder → TTS * → TextAggregator → StateUpdate → (loop back to InteractionQueue) */ @@ -16,6 +16,7 @@ import { Graph, GraphBuilder, + CustomNode, ProxyNode, RemoteLLMChatNode, RemoteTTSNode, @@ -24,6 +25,8 @@ import { } from '@inworld/runtime/graph'; import { AssemblyAISTTWebSocketNode } from './nodes/assembly-ai-stt-ws-node.js'; +import { SonioxSTTWebSocketNode } from './nodes/soniox-stt-ws-node.js'; +import { STTNode } from './nodes/stt-node.js'; import { DialogPromptBuilderNode } from './nodes/dialog-prompt-builder-node.js'; import { InteractionQueueNode } from './nodes/interaction-queue-node.js'; import { MemoryRetrievalNode } from './nodes/memory-retrieval-node.js'; @@ -37,33 +40,39 @@ import { DEFAULT_LANGUAGE_CODE, } from '../config/languages.js'; import { llmConfig } from '../config/llm.js'; -import { serverConfig, getAssemblyAISettings } from '../config/server.js'; +import { + serverConfig, + getAssemblyAISettings, + getSonioxSettings, + STTProvider, +} from '../config/server.js'; import { graphLogger as logger } from '../utils/logger.js'; export interface ConversationGraphConfig { - assemblyAIApiKey: string; + sttProvider: STTProvider; + sttApiKey: string; connections: ConnectionsMap; defaultLanguageCode?: string; } /** - * Wrapper class for the conversation graph - * Provides access to the graph and the AssemblyAI node for session management + * Wrapper class for the conversation graph. + * Provides access to the graph and the STT node for session management. 
*/ export class ConversationGraphWrapper { graph: Graph; - assemblyAINode: AssemblyAISTTWebSocketNode; + sttNode: STTNode; private constructor(params: { graph: Graph; - assemblyAINode: AssemblyAISTTWebSocketNode; + sttNode: STTNode; }) { this.graph = params.graph; - this.assemblyAINode = params.assemblyAINode; + this.sttNode = params.sttNode; } async destroy(): Promise { - await this.assemblyAINode.destroy(); + await this.sttNode.destroy(); await this.graph.stop(); } @@ -73,15 +82,19 @@ export class ConversationGraphWrapper { static create(config: ConversationGraphConfig): ConversationGraphWrapper { const { connections, - assemblyAIApiKey, + sttProvider, + sttApiKey, defaultLanguageCode = DEFAULT_LANGUAGE_CODE, } = config; - // Use provided language code or default to Spanish const langConfig = getLanguageConfig(defaultLanguageCode); const postfix = `-lang-learning`; logger.info( - { language: langConfig.name, languageCode: defaultLanguageCode }, + { + language: langConfig.name, + languageCode: defaultLanguageCode, + sttProvider, + }, 'creating_conversation_graph' ); @@ -89,25 +102,40 @@ export class ConversationGraphWrapper { // Create Nodes // ============================================================ - // Start node (audio input proxy) const audioInputNode = new ProxyNode({ id: `audio-input-proxy${postfix}` }); - // AssemblyAI STT with built-in VAD (always uses multilingual model) - const turnDetectionSettings = getAssemblyAISettings(); - const assemblyAISTTNode = new AssemblyAISTTWebSocketNode({ - id: `assembly-ai-stt-ws-node${postfix}`, - config: { - apiKey: assemblyAIApiKey, - connections: connections, - sampleRate: serverConfig.audio.inputSampleRate, - formatTurns: serverConfig.assemblyAI.formatTurns, - endOfTurnConfidenceThreshold: - turnDetectionSettings.endOfTurnConfidenceThreshold, - minEndOfTurnSilenceWhenConfident: - turnDetectionSettings.minEndOfTurnSilenceWhenConfident, - maxTurnSilence: turnDetectionSettings.maxTurnSilence, - }, - }); + // Create 
STT node based on provider + let sttCustomNode: CustomNode & STTNode; + + if (sttProvider === 'soniox') { + const sonioxSettings = getSonioxSettings(); + sttCustomNode = new SonioxSTTWebSocketNode({ + id: `stt-ws-node${postfix}`, + config: { + apiKey: sttApiKey, + connections: connections, + sampleRate: serverConfig.audio.inputSampleRate, + maxEndpointDelayMs: sonioxSettings.maxEndpointDelayMs, + languageHints: sonioxSettings.languageHints, + }, + }); + } else { + const turnDetectionSettings = getAssemblyAISettings(); + sttCustomNode = new AssemblyAISTTWebSocketNode({ + id: `stt-ws-node${postfix}`, + config: { + apiKey: sttApiKey, + connections: connections, + sampleRate: serverConfig.audio.inputSampleRate, + formatTurns: serverConfig.assemblyAI.formatTurns, + endOfTurnConfidenceThreshold: + turnDetectionSettings.endOfTurnConfidenceThreshold, + minEndOfTurnSilenceWhenConfident: + turnDetectionSettings.minEndOfTurnSilenceWhenConfident, + maxTurnSilence: turnDetectionSettings.maxTurnSilence, + }, + }); + } const transcriptExtractorNode = new TranscriptExtractorNode({ id: `transcript-extractor-node${postfix}`, @@ -190,7 +218,7 @@ export class ConversationGraphWrapper { graphBuilder // Add all nodes .addNode(audioInputNode) - .addNode(assemblyAISTTNode) + .addNode(sttCustomNode) .addNode(transcriptExtractorNode) .addNode(interactionQueueNode) .addNode(textInputNode) @@ -206,10 +234,10 @@ export class ConversationGraphWrapper { // ============================================================ // Audio Input Flow (STT with VAD) // ============================================================ - .addEdge(audioInputNode, assemblyAISTTNode) + .addEdge(audioInputNode, sttCustomNode) - // AssemblyAI loops back to itself while stream is active - .addEdge(assemblyAISTTNode, assemblyAISTTNode, { + // STT loops back to itself while stream is active + .addEdge(sttCustomNode, sttCustomNode, { condition: async (input: unknown) => { const data = input as { stream_exhausted?: boolean }; 
return data?.stream_exhausted !== true; @@ -219,7 +247,7 @@ export class ConversationGraphWrapper { }) // When interaction is complete, extract transcript - .addEdge(assemblyAISTTNode, transcriptExtractorNode, { + .addEdge(sttCustomNode, transcriptExtractorNode, { condition: async (input: unknown) => { const data = input as { interaction_complete?: boolean }; return data?.interaction_complete === true; @@ -283,7 +311,7 @@ export class ConversationGraphWrapper { return new ConversationGraphWrapper({ graph, - assemblyAINode: assemblyAISTTNode, + sttNode: sttCustomNode, }); } } diff --git a/backend/src/graphs/nodes/assembly-ai-stt-ws-node.ts b/backend/src/graphs/nodes/assembly-ai-stt-ws-node.ts index 3b566e9..3d0169f 100644 --- a/backend/src/graphs/nodes/assembly-ai-stt-ws-node.ts +++ b/backend/src/graphs/nodes/assembly-ai-stt-ws-node.ts @@ -6,6 +6,7 @@ import { v4 as uuidv4 } from 'uuid'; import { Connection } from '../../types/index.js'; import { audioDataToPCM16 } from '../../helpers/audio-utils.js'; import { createLogger } from '../../utils/logger.js'; +import { STTNode } from './stt-node.js'; const logger = createLogger('AssemblyAI'); @@ -224,7 +225,7 @@ class AssemblyAISession { * - Detects turn endings using Assembly.AI's neural turn detection * - Returns DataStreamWithMetadata with transcribed text when a turn completes */ -export class AssemblyAISTTWebSocketNode extends CustomNode { +export class AssemblyAISTTWebSocketNode extends CustomNode implements STTNode { private apiKey: string; private connections: { [sessionId: string]: Connection }; private sampleRate: number; diff --git a/backend/src/graphs/nodes/soniox-stt-ws-node.ts b/backend/src/graphs/nodes/soniox-stt-ws-node.ts new file mode 100644 index 0000000..e78ce48 --- /dev/null +++ b/backend/src/graphs/nodes/soniox-stt-ws-node.ts @@ -0,0 +1,692 @@ +import { DataStreamWithMetadata } from '@inworld/runtime'; +import { CustomNode, GraphTypes, ProcessContext } from '@inworld/runtime/graph'; +import 
WebSocket from 'ws'; +import { v4 as uuidv4 } from 'uuid'; + +import { Connection } from '../../types/index.js'; +import { audioDataToPCM16 } from '../../helpers/audio-utils.js'; +import { createLogger } from '../../utils/logger.js'; +import { STTNode } from './stt-node.js'; + +const logger = createLogger('Soniox'); + +const SONIOX_WEBSOCKET_URL = 'wss://stt-rt.soniox.com/transcribe-websocket'; +const SONIOX_MODEL = 'stt-rt-v4'; + +/** + * Configuration interface for SonioxSTTWebSocketNode + */ +export interface SonioxSTTWebSocketNodeConfig { + /** Soniox API key */ + apiKey: string; + /** Connections map to access session state */ + connections: { [sessionId: string]: Connection }; + /** Sample rate of the audio stream in Hz */ + sampleRate?: number; + /** Maximum endpoint delay in milliseconds (500-3000, default 2000) */ + maxEndpointDelayMs?: number; + /** Language hints for improved accuracy (e.g. ['en', 'es']) */ + languageHints?: string[]; +} + +/** + * Manages a persistent WebSocket connection to Soniox for a single session. 
+ */ +class SonioxSession { + private ws: WebSocket | null = null; + private wsReady: boolean = false; + private wsConnectionPromise: Promise | null = null; + + public shouldStopProcessing: boolean = false; + + private inactivityTimeout: NodeJS.Timeout | null = null; + private keepaliveInterval: NodeJS.Timeout | null = null; + private lastActivityTime: number = Date.now(); + private readonly INACTIVITY_TIMEOUT_MS = 60000; + private readonly KEEPALIVE_INTERVAL_MS = 5000; + + constructor( + public readonly sessionId: string, + private apiKey: string, + private sampleRate: number, + private maxEndpointDelayMs: number, + private languageHints: string[] + ) {} + + public async ensureConnection(): Promise { + if ( + !this.ws || + !this.wsReady || + this.ws.readyState !== WebSocket.OPEN + ) { + this.closeWebSocket(); + this.initializeWebSocket(); + } + + if (this.wsConnectionPromise) { + await this.wsConnectionPromise; + } + + this.shouldStopProcessing = false; + this.resetInactivityTimer(); + } + + private initializeWebSocket(): void { + logger.debug({ sessionId: this.sessionId }, 'initializing_websocket'); + + this.wsConnectionPromise = new Promise((resolve, reject) => { + this.ws = new WebSocket(SONIOX_WEBSOCKET_URL); + + this.ws.on('open', () => { + logger.debug({ sessionId: this.sessionId }, 'websocket_opened'); + + const config = { + api_key: this.apiKey, + model: SONIOX_MODEL, + audio_format: 'pcm_s16le', + sample_rate: this.sampleRate, + num_channels: 1, + enable_endpoint_detection: true, + max_endpoint_delay_ms: this.maxEndpointDelayMs, + language_hints: this.languageHints, + enable_language_identification: true, + }; + + this.ws!.send(JSON.stringify(config)); + logger.debug( + { + model: SONIOX_MODEL, + sampleRate: this.sampleRate, + maxEndpointDelayMs: this.maxEndpointDelayMs, + languageHints: this.languageHints, + }, + 'config_sent' + ); + + this.wsReady = true; + this.startKeepalive(); + resolve(); + }); + + this.ws.on('error', (error: Error) => { + 
logger.error({ err: error }, 'websocket_error'); + this.wsReady = false; + reject(error); + }); + + this.ws.on('close', (code: number, reason: Buffer) => { + logger.debug({ code, reason: reason.toString() }, 'websocket_closed'); + this.wsReady = false; + this.stopKeepalive(); + }); + }); + } + + public onMessage(listener: (data: WebSocket.Data) => void): void { + if (this.ws) { + this.ws.on('message', listener); + } + } + + public offMessage(listener: (data: WebSocket.Data) => void): void { + if (this.ws) { + this.ws.off('message', listener); + } + } + + public sendAudio(pcm16Data: Int16Array): void { + if (this.ws && this.ws.readyState === WebSocket.OPEN) { + this.ws.send(Buffer.from(pcm16Data.buffer, pcm16Data.byteOffset, pcm16Data.byteLength)); /* send only the view's own bytes; Buffer.from(arrayBuffer) alone would also stream any other data sharing the backing ArrayBuffer */ + this.resetInactivityTimer(); + } + } + + public sendFinalize(): void { + if (this.ws && this.ws.readyState === WebSocket.OPEN) { + this.ws.send(JSON.stringify({ type: 'finalize' })); + } + } + + private startKeepalive(): void { + this.stopKeepalive(); + this.keepaliveInterval = setInterval(() => { + if (this.ws && this.ws.readyState === WebSocket.OPEN) { + this.ws.send(JSON.stringify({ type: 'keepalive' })); + } + }, this.KEEPALIVE_INTERVAL_MS); + } + + private stopKeepalive(): void { + if (this.keepaliveInterval) { + clearInterval(this.keepaliveInterval); + this.keepaliveInterval = null; + } + } + + private resetInactivityTimer(): void { + if (this.inactivityTimeout) { + clearTimeout(this.inactivityTimeout); + } + this.lastActivityTime = Date.now(); + this.inactivityTimeout = setTimeout(() => { + this.closeDueToInactivity(); + }, this.INACTIVITY_TIMEOUT_MS); + } + + public clearInactivityTimer(): void { + if (this.inactivityTimeout) { + clearTimeout(this.inactivityTimeout); + this.inactivityTimeout = null; + } + } + + private closeDueToInactivity(): void { + const inactiveFor = Date.now() - this.lastActivityTime; + logger.info( + { sessionId: this.sessionId, inactiveMs: inactiveFor }, + 'closing_due_to_inactivity' + ); + this.closeWebSocket(); + } + + private
closeWebSocket(): void { + this.stopKeepalive(); + if (this.ws) { + try { + this.ws.removeAllListeners(); + if (this.ws.readyState === WebSocket.OPEN) { + // Send empty string to signal end-of-audio + this.ws.send(''); + this.ws.close(); + } + } catch (e) { + logger.warn({ err: e }, 'error_closing_socket'); + } + this.ws = null; + this.wsReady = false; + } + } + + public async close(): Promise { + if (this.inactivityTimeout) { + clearTimeout(this.inactivityTimeout); + } + + if (this.ws && this.ws.readyState === WebSocket.OPEN) { + try { + // Signal end-of-audio + this.ws.send(''); + await new Promise((resolve) => setTimeout(resolve, 100)); + } catch { + // Ignore + } + } + + this.closeWebSocket(); + } +} + +/** + * SonioxSTTWebSocketNode processes continuous multimodal streams using Soniox's + * streaming Speech-to-Text service via direct WebSocket connection. + * + * This node: + * - Receives MultimodalContent stream (audio and/or text) + * - For audio: extracts audio and feeds to Soniox streaming transcriber + * - For text: bypasses STT and returns text directly + * - Detects turn endings using Soniox's semantic endpoint detection + * - Returns DataStreamWithMetadata with transcribed text when a turn completes + */ +export class SonioxSTTWebSocketNode extends CustomNode implements STTNode { + private apiKey: string; + private connections: { [sessionId: string]: Connection }; + private sampleRate: number; + private maxEndpointDelayMs: number; + private languageHints: string[]; + + private sessions: Map = new Map(); + private readonly TURN_COMPLETION_TIMEOUT_MS = 2000; + private readonly MAX_TRANSCRIPTION_DURATION_MS = 40000; + + constructor(props: { + id?: string; + config: SonioxSTTWebSocketNodeConfig; + }) { + const { config, ...nodeProps } = props; + + if (!config.apiKey) { + throw new Error('SonioxSTTWebSocketNode requires an API key.'); + } + if (!config.connections) { + throw new Error('SonioxSTTWebSocketNode requires a connections object.'); + } + + super({ 
id: nodeProps.id || 'soniox-stt-ws-node' }); + + this.apiKey = config.apiKey; + this.connections = config.connections; + this.sampleRate = config.sampleRate || 16000; + this.maxEndpointDelayMs = config.maxEndpointDelayMs ?? 2000; + this.languageHints = config.languageHints ?? []; + + logger.info( + { + maxEndpointDelayMs: this.maxEndpointDelayMs, + languageHints: this.languageHints, + }, + 'stt_node_configured' + ); + } + + async process( + context: ProcessContext, + input0: AsyncIterableIterator<GraphTypes.MultimodalContent>, + input: DataStreamWithMetadata + ): Promise<DataStreamWithMetadata> { + const multimodalStream = + input !== undefined && + input !== null && + input instanceof DataStreamWithMetadata + ? (input.toStream() as unknown as AsyncIterableIterator<GraphTypes.MultimodalContent>) + : input0; + + const sessionId = context.getDatastore().get('sessionId') as string; + const connection = this.connections[sessionId]; + + if (connection?.unloaded) { + throw Error(`Session unloaded for sessionId: ${sessionId}`); + } + if (!connection) { + throw Error(`Failed to read connection for sessionId: ${sessionId}`); + } + + const metadata = input?.getMetadata?.() || {}; + let previousIteration = (metadata.iteration as number) || 0; + + if ( + !connection.state.interactionId || + connection.state.interactionId === '' + ) { + connection.state.interactionId = uuidv4(); + } + + const currentId = connection.state.interactionId; + const delimiterIndex = currentId.indexOf('#'); + + if (previousIteration === 0 && delimiterIndex !== -1) { + const iterationStr = currentId.substring(delimiterIndex + 1); + const parsedIteration = parseInt(iterationStr, 10); + if (!isNaN(parsedIteration) && /^\d+$/.test(iterationStr)) { + previousIteration = parsedIteration; + } + } + + const iteration = previousIteration + 1; + const baseId = + delimiterIndex !== -1 + ?
currentId.substring(0, delimiterIndex) + : currentId; + const nextInteractionId = `${baseId}#${iteration}`; + + logger.debug({ iteration }, 'starting_transcription'); + + // State tracking + let transcriptText = ''; + let turnDetected = false; + let speechDetected = false; + let audioChunkCount = 0; + let totalAudioSamples = 0; + let isStreamExhausted = false; + let errorOccurred = false; + let errorMessage = ''; + let maxDurationReached = false; + let isTextInput = false; + let textContent: string | undefined; + + // Soniox token accumulation + let finalTokenTexts: string[] = []; + + // Get or create session + let session = this.sessions.get(sessionId); + if (!session) { + session = new SonioxSession( + sessionId, + this.apiKey, + this.sampleRate, + this.maxEndpointDelayMs, + this.languageHints + ); + this.sessions.set(sessionId, session); + } + + // Promise to capture turn result + let turnResolve: (value: string) => void = () => {}; + let turnReject: (error: Error) => void = () => {}; + let turnCompleted = false; + const turnPromise = new Promise((resolve, reject) => { + turnResolve = resolve; + turnReject = reject; + }); + const turnPromiseWithState = turnPromise.then((value) => { + turnCompleted = true; + return value; + }); + + // Soniox message handler for this process() call + const messageHandler = (data: WebSocket.Data) => { + try { + const message = JSON.parse(data.toString()); + + if (message.error_code) { + logger.error( + { code: message.error_code, msg: message.error_message }, + 'soniox_error' + ); + errorOccurred = true; + errorMessage = `${message.error_code}: ${message.error_message}`; + return; + } + + if (session?.shouldStopProcessing) { + return; + } + + const tokens = message.tokens; + if (!tokens || !Array.isArray(tokens) || tokens.length === 0) { + return; + } + + let endpointDetected = false; + const nonFinalTexts: string[] = []; + + for (const token of tokens) { + const text = token.text || ''; + + if (token.is_final) { + // <end> token signals
endpoint detection + if (text === '<end>') { + endpointDetected = true; + } else { + finalTokenTexts.push(text); + } + } else { + nonFinalTexts.push(text); + } + } + + // Trigger speech detected on first meaningful text + if (!speechDetected && (nonFinalTexts.length > 0 || finalTokenTexts.length > 0)) { + const hasText = nonFinalTexts.some((t) => t.trim().length > 0) || + finalTokenTexts.some((t) => t.trim().length > 0); + if (hasText) { + speechDetected = true; + logger.debug({ iteration }, 'speech_detected'); + if (connection?.onSpeechDetected) { + connection.onSpeechDetected(nextInteractionId); + } + } + } + + // Send partial transcript from non-final tokens + if (nonFinalTexts.length > 0) { + const partialText = [...finalTokenTexts, ...nonFinalTexts].join('').trim(); + if (partialText) { + this.sendPartialTranscript( + sessionId, + nextInteractionId, + partialText + ); + } + } + + if (endpointDetected) { + let finalTranscript = finalTokenTexts.join('').trim(); + + // Check for pending transcript to stitch + if (connection?.pendingTranscript) { + finalTranscript = + `${connection.pendingTranscript} ${finalTranscript}`.trim(); + logger.debug( + { + iteration, + transcriptSnippet: finalTranscript.substring(0, 80), + }, + 'stitched_transcript' + ); + connection.pendingTranscript = undefined; + } else { + logger.debug( + { iteration, transcriptSnippet: finalTranscript.substring(0, 50) }, + 'endpoint_detected' + ); + } + + if (connection) { + connection.isProcessingInterrupted = false; + } + + transcriptText = finalTranscript; + turnDetected = true; + if (session) session.shouldStopProcessing = true; + turnResolve(finalTranscript); + } + } catch (error) { + logger.error({ err: error }, 'error_handling_message'); + } + }; + + try { + await session.ensureConnection(); + session.onMessage(messageHandler); + + const audioProcessingPromise = (async () => { + let maxDurationTimeout: NodeJS.Timeout | null = null; + try { + maxDurationTimeout = setTimeout(() => { +
maxDurationReached = true; + }, this.MAX_TRANSCRIPTION_DURATION_MS); + + while (true) { + if (session?.shouldStopProcessing) break; + + if (maxDurationReached && !transcriptText) { + logger.warn( + { maxDurationMs: this.MAX_TRANSCRIPTION_DURATION_MS }, + 'max_transcription_duration_reached' + ); + break; + } + + const result = await multimodalStream.next(); + + if (result.done) { + logger.debug( + { iteration, audioChunkCount }, + 'multimodal_stream_exhausted' + ); + isStreamExhausted = true; + break; + } + + if (session?.shouldStopProcessing) break; + + const content = result.value as GraphTypes.MultimodalContent; + + // Handle text input + if (content.text !== undefined && content.text !== null) { + logger.debug( + { iteration, textSnippet: content.text.substring(0, 50) }, + 'text_input_detected' + ); + isTextInput = true; + textContent = content.text; + transcriptText = content.text; + turnDetected = true; + if (session) { + session.shouldStopProcessing = true; + session.clearInactivityTimer(); + } + turnResolve(transcriptText); + break; + } + + // Extract audio + if (content.audio === undefined || content.audio === null) continue; + + const audioData = content.audio.data; + if (!audioData || audioData.length === 0) continue; + + audioChunkCount++; + totalAudioSamples += audioData.length; + + const pcm16Data = audioDataToPCM16(audioData); + session?.sendAudio(pcm16Data); + } + } catch (error) { + logger.error({ err: error }, 'error_processing_audio'); + errorOccurred = true; + errorMessage = error instanceof Error ? 
error.message : String(error); + throw error; + } finally { + if (maxDurationTimeout) { + clearTimeout(maxDurationTimeout); + } + } + })(); + + const raceResult = await Promise.race([ + turnPromiseWithState.then(() => ({ winner: 'turn' as const })), + audioProcessingPromise.then(() => ({ winner: 'audio' as const })), + ]); + + if ( + raceResult.winner === 'audio' && + !turnCompleted && + !maxDurationReached + ) { + logger.debug( + { waitMs: this.TURN_COMPLETION_TIMEOUT_MS }, + 'audio_ended_before_turn_waiting' + ); + + // Send finalize to force Soniox to return any remaining tokens + session.sendFinalize(); + + const timeoutPromise = new Promise<{ winner: 'timeout' }>((resolve) => + setTimeout( + () => resolve({ winner: 'timeout' }), + this.TURN_COMPLETION_TIMEOUT_MS + ) + ); + + const waitResult = await Promise.race([ + turnPromiseWithState.then(() => ({ winner: 'turn' as const })), + timeoutPromise, + ]); + + if (waitResult.winner === 'timeout' && !turnCompleted) { + logger.warn('timed_out_waiting_for_turn'); + turnReject?.(new Error('Timed out waiting for turn completion')); + } + } + + await audioProcessingPromise.catch(() => {}); + + logger.debug( + { iteration, transcriptSnippet: transcriptText?.substring(0, 50) }, + 'transcription_complete' + ); + + if (turnDetected) { + connection.state.interactionId = ''; + } + + const taggedStream = Object.assign(multimodalStream, { + type: 'MultimodalContent', + abort: () => {}, + getMetadata: () => ({}), + }); + + return new DataStreamWithMetadata(taggedStream, { + elementType: 'MultimodalContent', + iteration: iteration, + interactionId: nextInteractionId, + session_id: sessionId, + transcript: transcriptText, + turn_detected: turnDetected, + audio_chunk_count: audioChunkCount, + total_audio_samples: totalAudioSamples, + sample_rate: this.sampleRate, + stream_exhausted: isStreamExhausted, + interaction_complete: turnDetected && transcriptText.length > 0, + error_occurred: errorOccurred, + error_message: errorMessage, + 
is_text_input: isTextInput, + text_content: textContent, + }); + } catch (error) { + logger.error({ err: error, iteration }, 'transcription_failed'); + + const taggedStream = Object.assign(multimodalStream, { + type: 'MultimodalContent', + abort: () => {}, + getMetadata: () => ({}), + }); + + return new DataStreamWithMetadata(taggedStream, { + elementType: 'MultimodalContent', + iteration: iteration, + interactionId: nextInteractionId, + session_id: sessionId, + transcript: '', + turn_detected: false, + stream_exhausted: isStreamExhausted, + interaction_complete: false, + error_occurred: true, + error_message: error instanceof Error ? error.message : String(error), + is_text_input: isTextInput, + text_content: textContent, + }); + } finally { + if (session) { + session.offMessage(messageHandler); + } + } + } + + private sendPartialTranscript( + sessionId: string, + interactionId: string, + text: string + ): void { + const connection = this.connections[sessionId]; + if (!connection?.onPartialTranscript) return; + + try { + connection.onPartialTranscript(text, interactionId); + } catch (error) { + logger.error({ err: error }, 'error_sending_partial_transcript'); + } + } + + async closeSession(sessionId: string): Promise<void> { + const session = this.sessions.get(sessionId); + if (session) { + logger.debug({ sessionId }, 'closing_session'); + await session.close(); + this.sessions.delete(sessionId); + } + } + + async destroy(): Promise<void> { + logger.info({ sessionCount: this.sessions.size }, 'destroying_node'); + + const promises: Promise<void>[] = []; + for (const session of this.sessions.values()) { + promises.push(session.close()); + } + + await Promise.all(promises); + this.sessions.clear(); + } +} diff --git a/backend/src/graphs/nodes/stt-node.ts b/backend/src/graphs/nodes/stt-node.ts new file mode 100644 index 0000000..2d58b62 --- /dev/null +++ b/backend/src/graphs/nodes/stt-node.ts @@ -0,0 +1,9 @@ +/** + * Common interface for STT (Speech-to-Text) nodes.
+ * Both AssemblyAI and Soniox implementations conform to this interface + * so they can be used interchangeably in the conversation graph. + */ +export interface STTNode { + closeSession(sessionId: string): Promise<void>; + destroy(): Promise<void>; +} diff --git a/backend/src/helpers/connection-manager.ts b/backend/src/helpers/connection-manager.ts index 9659d5a..0a8f99c 100644 --- a/backend/src/helpers/connection-manager.ts +++ b/backend/src/helpers/connection-manager.ts @@ -4,7 +4,7 @@ * This replaces the AudioProcessor for Inworld Runtime 0.9. * Key differences from AudioProcessor: * - Uses MultimodalStreamManager to feed audio to a long-running graph - * - VAD is handled inside the graph by AssemblyAI (not external Silero) + * - VAD is handled inside the graph by the STT provider (AssemblyAI or Soniox) * - Graph runs continuously for the session duration */ @@ -1085,8 +1085,8 @@ export class ConnectionManager { // End the multimodal stream this.multimodalStreamManager.end(); - // Close AssemblyAI session - await this.graphWrapper.assemblyAINode.closeSession(this.sessionId); + // Close STT session + await this.graphWrapper.sttNode.closeSession(this.sessionId); // Remove from connections map delete this.connections[this.sessionId]; diff --git a/backend/src/server.ts b/backend/src/server.ts index 90fc1d3..0b61b34 100644 --- a/backend/src/server.ts +++ b/backend/src/server.ts @@ -105,9 +105,10 @@ async function startServer(): Promise { try { await initializeGraph(); await exportGraphConfigs(); + const sttProvider = process.env.STT_PROVIDER || 'assembly'; server.listen(serverConfig.port, () => { logger.info({ port: serverConfig.port }, 'server_started'); - logger.info('using_inworld_runtime_0.9_with_assemblyai_stt'); + logger.info({ sttProvider }, 'using_inworld_runtime_0.9_with_stt'); }); } catch (error) { logger.fatal({ err: error }, 'server_start_failed'); diff --git a/backend/src/services/graph-service.ts b/backend/src/services/graph-service.ts index 1bfd910..f0cad43
100644 --- a/backend/src/services/graph-service.ts +++ b/backend/src/services/graph-service.ts @@ -19,6 +19,7 @@ import { getResponseFeedbackGraph } from '../graphs/response-feedback-graph.js'; import { initializeTTSGraphs } from '../graphs/simple-tts-graph.js'; import { serverLogger as logger } from '../utils/logger.js'; import { connections } from './state.js'; +import { STTProvider } from '../config/server.js'; const __filename = fileURLToPath(import.meta.url); const __dirname = path.dirname(__filename); @@ -31,16 +32,33 @@ export function getGraphWrapper(): ConversationGraphWrapper | null { } export async function initializeGraph(): Promise { - const assemblyAIApiKey = process.env.ASSEMBLY_AI_API_KEY; - if (!assemblyAIApiKey) { - throw new Error('ASSEMBLY_AI_API_KEY environment variable is required'); + // Read STT_PROVIDER from process.env at call time (after dotenv has loaded), + // not from serverConfig which is evaluated at module load time before dotenv. + const sttProvider = (process.env.STT_PROVIDER || 'assembly') as STTProvider; + let sttApiKey: string; + + if (sttProvider === 'soniox') { + sttApiKey = process.env.SONIOX_API_KEY || ''; + if (!sttApiKey) { + throw new Error( + 'SONIOX_API_KEY environment variable is required when STT_PROVIDER=soniox' + ); + } + } else { + sttApiKey = process.env.ASSEMBLY_AI_API_KEY || ''; + if (!sttApiKey) { + throw new Error( + 'ASSEMBLY_AI_API_KEY environment variable is required when STT_PROVIDER=assembly' + ); + } } - logger.info('initializing_conversation_graph'); + logger.info({ sttProvider }, 'initializing_conversation_graph'); graphWrapper = getConversationGraph({ - assemblyAIApiKey, + sttProvider, + sttApiKey, connections, - defaultLanguageCode: 'es', // Always Spanish + defaultLanguageCode: 'es', }); logger.info('conversation_graph_initialized'); diff --git a/frontend/package-lock.json b/frontend/package-lock.json index 5d0a7b8..8b55696 100644 --- a/frontend/package-lock.json +++ b/frontend/package-lock.json @@ 
-84,7 +84,6 @@ "integrity": "sha512-e7jT4DxYvIDLk1ZHmU/m/mB19rex9sv0c2ftBtjSBv+kVM/902eh0fINUzD7UwLLNR+jU585GxUJ8/EBfAM5fw==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "@babel/code-frame": "^7.27.1", "@babel/generator": "^7.28.5", @@ -424,7 +423,6 @@ } ], "license": "MIT", - "peer": true, "engines": { "node": ">=18" }, @@ -448,7 +446,6 @@ } ], "license": "MIT", - "peer": true, "engines": { "node": ">=18" } @@ -1667,7 +1664,6 @@ "integrity": "sha512-MWtvHrGZLFttgeEj28VXHxpmwYbor/ATPYbBfSFZEIRK0ecCFLl2Qo55z52Hss+UV9CRN7trSeq1zbgx7YDWWg==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "csstype": "^3.2.2" } @@ -1736,7 +1732,6 @@ "integrity": "sha512-3xP4XzzDNQOIqBMWogftkwxhg5oMKApqY0BAflmLZiFYHqyhSOxv/cd/zPQLTcCXr4AkaKb25joocY0BD1WC6A==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "@typescript-eslint/scope-manager": "8.51.0", "@typescript-eslint/types": "8.51.0", @@ -2099,7 +2094,6 @@ "integrity": "sha512-NZyJarBfL7nWwIq+FDL6Zp/yHEhePMNnnJ0y3qfieCrmNvYct8uvtiV41UvlSe6apAfk0fY1FbWx+NwfmpvtTg==", "dev": true, "license": "MIT", - "peer": true, "bin": { "acorn": "bin/acorn" }, @@ -2232,7 +2226,6 @@ } ], "license": "MIT", - "peer": true, "dependencies": { "baseline-browser-mapping": "^2.9.0", "caniuse-lite": "^1.0.30001759", @@ -2627,7 +2620,6 @@ "integrity": "sha512-LEyamqS7W5HB3ujJyvi0HQK/dtVINZvd5mAAp9eT5S/ujByGjiZLCzPcHVzuXbpJDJF/cxwHlfceVUDZ2lnSTw==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "@eslint-community/eslint-utils": "^4.8.0", "@eslint-community/regexpp": "^4.12.1", @@ -2688,7 +2680,6 @@ "integrity": "sha512-iI1f+D2ViGn+uvv5HuHVUamg8ll4tN+JRHGc6IJi4TP9Kl976C57fzPXgseXNs8v0iA8aSJpHsTWjDb9QJamGQ==", "dev": true, "license": "MIT", - "peer": true, "bin": { "eslint-config-prettier": "bin/cli.js" }, @@ -3330,7 +3321,6 @@ "integrity": "sha512-8i7LzZj7BF8uplX+ZyOlIz86V6TAsSs+np6m1kpW9u0JWi4z/1t+FzcK1aek+ybTnAC4KhBL4uXCNT0wcUIeCw==", "dev": true, "license": "MIT", - "peer": true, 
"dependencies": { "cssstyle": "^4.1.0", "data-urls": "^5.0.0", @@ -3700,7 +3690,6 @@ "integrity": "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==", "dev": true, "license": "MIT", - "peer": true, "engines": { "node": ">=12" }, @@ -3753,7 +3742,6 @@ "integrity": "sha512-v6UNi1+3hSlVvv8fSaoUbggEM5VErKmmpGA7Pl3HF8V6uKY7rvClBOJlH6yNwQtfTueNkGVpOv/mtWL9L4bgRA==", "dev": true, "license": "MIT", - "peer": true, "bin": { "prettier": "bin/prettier.cjs" }, @@ -3792,7 +3780,6 @@ "resolved": "https://registry.npmjs.org/react/-/react-19.2.3.tgz", "integrity": "sha512-Ku/hhYbVjOQnXDZFv2+RibmLFGwFdeeKHFcOTlrt7xplBnya5OGn/hIRDsqDiSUcfORsDC7MPxwork8jBwsIWA==", "license": "MIT", - "peer": true, "engines": { "node": ">=0.10.0" } @@ -4145,7 +4132,6 @@ "integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==", "dev": true, "license": "Apache-2.0", - "peer": true, "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" @@ -4231,7 +4217,6 @@ "integrity": "sha512-dZwN5L1VlUBewiP6H9s2+B3e3Jg96D0vzN+Ry73sOefebhYr9f94wwkMNN/9ouoU8pV1BqA1d1zGk8928cx0rg==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "esbuild": "^0.27.0", "fdir": "^6.5.0", @@ -4547,7 +4532,6 @@ "integrity": "sha512-0wZ1IRqGGhMP76gLqz8EyfBXKk0J2qo2+H3fi4mcUP/KtTocoX08nmIAHl1Z2kJIZbZee8KOpBCSNPRgauucjw==", "dev": true, "license": "MIT", - "peer": true, "funding": { "url": "https://github.com/sponsors/colinhacks" } diff --git a/render.yaml b/render.yaml index f9f53ca..76f0d4e 100644 --- a/render.yaml +++ b/render.yaml @@ -12,8 +12,12 @@ services: value: production - key: INWORLD_API_KEY sync: false + - key: STT_PROVIDER + value: assembly - key: ASSEMBLY_AI_API_KEY sync: false + - key: SONIOX_API_KEY + sync: false - key: SUPABASE_URL sync: false - key: SUPABASE_SECRET_KEY From 0fddce47510bf91a778c745d33a2714618af4211 Mon Sep 17 00:00:00 2001 From: Cale Shapera <25466659+cshape@users.noreply.github.com> Date: Fri, 
20 Feb 2026 14:26:01 -0800 Subject: [PATCH 02/16] fix: conversation histories are separate --- backend/src/services/websocket-handler.ts | 22 +++++++ frontend/src/context/AppContext.tsx | 76 ++++++++++++++++------- 2 files changed, 74 insertions(+), 24 deletions(-) diff --git a/backend/src/services/websocket-handler.ts b/backend/src/services/websocket-handler.ts index e5fee0e..bd3350c 100644 --- a/backend/src/services/websocket-handler.ts +++ b/backend/src/services/websocket-handler.ts @@ -294,12 +294,34 @@ function handleConversationUpdate( connectionId: string, connectionManager: ConnectionManager, message: { + conversationId?: string; data?: { + conversationId?: string; messages?: Array<{ role: string; content: string; timestamp?: string }>; }; messages?: Array<{ role: string; content: string; timestamp?: string }>; } ): void { + const incomingConversationId = + message.conversationId || message.data?.conversationId; + const currentConversationId = connectionManager.getConversationId(); + + if ( + incomingConversationId && + currentConversationId && + incomingConversationId !== currentConversationId + ) { + logger.info( + { + connectionId, + incomingConversationId, + currentConversationId, + }, + 'ignoring_stale_conversation_update' + ); + return; + } + // Handle both formats: { data: { messages: [...] } } and { messages: [...] 
} const messages = message.messages || diff --git a/frontend/src/context/AppContext.tsx b/frontend/src/context/AppContext.tsx index 4a20157..f5cea64 100644 --- a/frontend/src/context/AppContext.tsx +++ b/frontend/src/context/AppContext.tsx @@ -520,8 +520,6 @@ export function AppProvider({ children }: AppProviderProps) { // Case 1: We have a pending LLM response but user message was already added (text input case) if (pendingLLMResponse && !pendingTranscription) { - // Add only the teacher response - storage.addMessage('assistant', pendingLLMResponse); dispatch({ type: 'ADD_MESSAGE', payload: { @@ -531,8 +529,24 @@ export function AppProvider({ children }: AppProviderProps) { }, }); - const conversationHistory = storage.getConversationHistory(); - wsClient.send({ type: 'conversation_update', data: conversationHistory }); + // Build conversation_update from current chatHistory + new assistant message + const messages = [ + ...currentState.chatHistory.map((m) => ({ + role: m.role === 'learner' ? 
'user' : 'assistant', + content: m.content, + timestamp: m.timestamp || new Date().toISOString(), + })), + { + role: 'assistant', + content: pendingLLMResponse, + timestamp: new Date().toISOString(), + }, + ]; + wsClient.send({ + type: 'conversation_update', + conversationId, + messages, + }); pendingLLMResponseRef.current = null; dispatch({ type: 'RESET_STREAMING_STATE' }); @@ -562,7 +576,6 @@ export function AppProvider({ children }: AppProviderProps) { }); } - storage.addMessage('user', pendingTranscription); dispatch({ type: 'ADD_MESSAGE', payload: { @@ -572,7 +585,6 @@ export function AppProvider({ children }: AppProviderProps) { }, }); - storage.addMessage('assistant', pendingLLMResponse); dispatch({ type: 'ADD_MESSAGE', payload: { @@ -582,8 +594,29 @@ export function AppProvider({ children }: AppProviderProps) { }, }); - const conversationHistory = storage.getConversationHistory(); - wsClient.send({ type: 'conversation_update', data: conversationHistory }); + // Build conversation_update from current chatHistory + new user + assistant messages + const messages = [ + ...currentState.chatHistory.map((m) => ({ + role: m.role === 'learner' ? 
'user' : 'assistant', + content: m.content, + timestamp: m.timestamp || new Date().toISOString(), + })), + { + role: 'user', + content: pendingTranscription, + timestamp: new Date().toISOString(), + }, + { + role: 'assistant', + content: pendingLLMResponse, + timestamp: new Date().toISOString(), + }, + ]; + wsClient.send({ + type: 'conversation_update', + conversationId, + messages, + }); dispatch({ type: 'SET_PENDING_TRANSCRIPTION', payload: null }); pendingLLMResponseRef.current = null; @@ -686,12 +719,16 @@ export function AppProvider({ children }: AppProviderProps) { }); if (status === 'connected') { - const existingConversation = storage.getConversationHistory(); - if (existingConversation.messages.length > 0) { - wsClient.send({ - type: 'conversation_update', - data: existingConversation, - }); + const currentId = stateRef.current.currentConversationId; + if (currentId) { + const conversationData = storage.getConversation(currentId); + if (conversationData && conversationData.messages.length > 0) { + wsClient.send({ + type: 'conversation_update', + conversationId: currentId, + messages: conversationData.messages, + }); + } } } }); @@ -965,17 +1002,9 @@ export function AppProvider({ children }: AppProviderProps) { })) as ChatMessage[]; // Update chat history to match server state + // Per-conversation storage is kept in sync by the useEffect on chatHistory dispatch({ type: 'SET_CHAT_HISTORY', payload: chatHistory }); - // Also update storage to stay in sync - storage.clearConversation(); - messages.forEach((m) => { - storage.addMessage( - m.role === 'user' ? 
'user' : 'assistant', - m.content - ); - }); - // Clear any pending state dispatch({ type: 'SET_PENDING_TRANSCRIPTION', payload: null }); pendingLLMResponseRef.current = null; @@ -1263,7 +1292,6 @@ export function AppProvider({ children }: AppProviderProps) { } // Add user message to chat history immediately (unlike audio where we wait for transcription) - storage.addMessage('user', trimmedText); dispatch({ type: 'ADD_MESSAGE', payload: { From 95acafe776ff08e10914d4d1d8ab41ec7856c0b9 Mon Sep 17 00:00:00 2001 From: Cale Shapera <25466659+cshape@users.noreply.github.com> Date: Fri, 20 Feb 2026 14:31:32 -0800 Subject: [PATCH 03/16] feat: add dynamic language hints to soniox --- backend/src/config/server.ts | 4 --- backend/src/graphs/conversation-graph.ts | 1 - .../src/graphs/nodes/soniox-stt-ws-node.ts | 29 +++++++++++++++++-- 3 files changed, 27 insertions(+), 7 deletions(-) diff --git a/backend/src/config/server.ts b/backend/src/config/server.ts index dd71935..4e6f9d3 100644 --- a/backend/src/config/server.ts +++ b/backend/src/config/server.ts @@ -63,7 +63,6 @@ const assemblyAIPresets: Record< export interface SonioxEndpointSettings { maxEndpointDelayMs: number; - languageHints: string[]; description: string; } @@ -75,17 +74,14 @@ export interface SonioxEndpointSettings { const sonioxPresets: Record = { high: { maxEndpointDelayMs: 500, - languageHints: ['en', 'es'], description: 'Aggressive - fastest endpoint detection (500ms)', }, medium: { maxEndpointDelayMs: 1000, - languageHints: ['en', 'es'], description: 'Balanced - moderate endpoint delay (1000ms)', }, low: { maxEndpointDelayMs: 2000, - languageHints: ['en', 'es'], description: 'Conservative - patient endpoint detection (2000ms)', }, }; diff --git a/backend/src/graphs/conversation-graph.ts b/backend/src/graphs/conversation-graph.ts index 938392f..fdd627d 100644 --- a/backend/src/graphs/conversation-graph.ts +++ b/backend/src/graphs/conversation-graph.ts @@ -116,7 +116,6 @@ export class 
ConversationGraphWrapper { connections: connections, sampleRate: serverConfig.audio.inputSampleRate, maxEndpointDelayMs: sonioxSettings.maxEndpointDelayMs, - languageHints: sonioxSettings.languageHints, }, }); } else { diff --git a/backend/src/graphs/nodes/soniox-stt-ws-node.ts b/backend/src/graphs/nodes/soniox-stt-ws-node.ts index e78ce48..afe231c 100644 --- a/backend/src/graphs/nodes/soniox-stt-ws-node.ts +++ b/backend/src/graphs/nodes/soniox-stt-ws-node.ts @@ -180,6 +180,23 @@ class SonioxSession { } } + /** + * Update language hints. If they differ from the current hints, closes the + * existing WebSocket so the next ensureConnection() reopens with the new config. + */ + public updateLanguageHints(hints: string[]): void { + const sorted = [...hints].sort(); + const currentSorted = [...this.languageHints].sort(); + if (sorted.join(',') === currentSorted.join(',')) return; + + logger.info( + { sessionId: this.sessionId, from: this.languageHints, to: hints }, + 'language_hints_changed' + ); + this.languageHints = hints; + this.closeWebSocket(); + } + private closeDueToInactivity(): void { const inactiveFor = Date.now() - this.lastActivityTime; logger.info( @@ -267,7 +284,7 @@ export class SonioxSTTWebSocketNode extends CustomNode implements STTNode { this.connections = config.connections; this.sampleRate = config.sampleRate || 16000; this.maxEndpointDelayMs = config.maxEndpointDelayMs ?? 2000; - this.languageHints = config.languageHints ?? []; + this.languageHints = config.languageHints ?? ['en']; logger.info( { @@ -346,6 +363,12 @@ export class SonioxSTTWebSocketNode extends CustomNode implements STTNode { // Soniox token accumulation let finalTokenTexts: string[] = []; + // Derive per-session language hints from the connection's active language + const targetLang = connection.state.languageCode || 'es'; + const sessionLanguageHints = targetLang === 'en' + ? 
['en'] + : ['en', targetLang]; + // Get or create session let session = this.sessions.get(sessionId); if (!session) { @@ -354,9 +377,11 @@ export class SonioxSTTWebSocketNode extends CustomNode implements STTNode { this.apiKey, this.sampleRate, this.maxEndpointDelayMs, - this.languageHints + sessionLanguageHints ); this.sessions.set(sessionId, session); + } else { + session.updateLanguageHints(sessionLanguageHints); } // Promise to capture turn result From 31270473f37c43ef5b5d123305799339ca711dad Mon Sep 17 00:00:00 2001 From: Cale Shapera <25466659+cshape@users.noreply.github.com> Date: Fri, 20 Feb 2026 14:49:55 -0800 Subject: [PATCH 04/16] feat: added support for chinese, japanese, korean, and russian --- .../src/__tests__/config/languages.test.ts | 21 ++- backend/src/config/languages.ts | 156 ++++++++++++++++-- backend/src/config/server.ts | 8 + backend/src/graphs/simple-tts-graph.ts | 4 +- backend/src/services/api-routes.ts | 3 +- backend/src/services/websocket-handler.ts | 6 +- 6 files changed, 179 insertions(+), 19 deletions(-) diff --git a/backend/src/__tests__/config/languages.test.ts b/backend/src/__tests__/config/languages.test.ts index 426d959..59f4933 100644 --- a/backend/src/__tests__/config/languages.test.ts +++ b/backend/src/__tests__/config/languages.test.ts @@ -74,13 +74,30 @@ describe('languages config', () => { expect(codes).toContain('de'); }); - it('matches SUPPORTED_LANGUAGES keys', () => { + it('without provider, returns only languages without requiredSttProvider', () => { const codes = getSupportedLanguageCodes(); - expect(codes.length).toBe(Object.keys(SUPPORTED_LANGUAGES).length); for (const code of codes) { expect(SUPPORTED_LANGUAGES[code]).toBeDefined(); + expect(SUPPORTED_LANGUAGES[code].requiredSttProvider).toBeUndefined(); } }); + + it('with soniox provider, returns all languages', () => { + const codes = getSupportedLanguageCodes('soniox'); + expect(codes.length).toBe(Object.keys(SUPPORTED_LANGUAGES).length); + 
expect(codes).toContain('zh'); + expect(codes).toContain('ja'); + expect(codes).toContain('ko'); + expect(codes).toContain('ru'); + }); + + it('with assembly provider, excludes soniox-only languages', () => { + const codes = getSupportedLanguageCodes('assembly'); + expect(codes).not.toContain('zh'); + expect(codes).not.toContain('ja'); + expect(codes).not.toContain('ko'); + expect(codes).not.toContain('ru'); + }); }); describe('getLanguageOptions', () => { diff --git a/backend/src/config/languages.ts b/backend/src/config/languages.ts index f6d43ad..ec20d1d 100644 --- a/backend/src/config/languages.ts +++ b/backend/src/config/languages.ts @@ -8,6 +8,7 @@ */ import { createLogger } from '../utils/logger.js'; +import type { STTProvider } from './server.js'; const logger = createLogger('Languages'); @@ -46,6 +47,9 @@ export interface LanguageConfig { // Example conversation topics specific to this language's culture exampleTopics: string[]; + + // If set, this language is only available when the given STT provider is active + requiredSttProvider?: STTProvider; } /** @@ -235,6 +239,126 @@ export const SUPPORTED_LANGUAGES: Record = { 'the Amazon and Brazilian nature', ], }, + + zh: { + code: 'zh', + name: 'Chinese', + nativeName: '中文', + flag: '🇨🇳', + sttLanguageCode: 'zh-CN', + ttsConfig: { + speakerId: 'Xiaoyin', + modelId: 'inworld-tts-1.5-max', + speakingRate: 1, + temperature: 1.1, + languageCode: 'zh-CN', + }, + teacherPersona: { + name: '李老师 (Lǐ Lǎoshī)', + age: 33, + nationality: 'Chinese (Beijing)', + description: + 'a 33 year old Beijinger who loves teaching Mandarin through Chinese culture, food, and modern life', + }, + exampleTopics: [ + 'life in Beijing and Shanghai', + 'Chinese cuisine and regional flavors', + 'Chinese festivals and traditions', + 'modern Chinese pop culture', + 'travel along the Silk Road', + ], + requiredSttProvider: 'soniox', + }, + + ja: { + code: 'ja', + name: 'Japanese', + nativeName: '日本語', + flag: '🇯🇵', + sttLanguageCode: 'ja-JP', 
+ ttsConfig: { + speakerId: 'Asuka', + modelId: 'inworld-tts-1.5-max', + speakingRate: 1, + temperature: 1.1, + languageCode: 'ja-JP', + }, + teacherPersona: { + name: '田中先生 (Tanaka-sensei)', + age: 31, + nationality: 'Japanese (Tokyo)', + description: + 'a 31 year old Tokyoite who is passionate about teaching Japanese through anime, food, and everyday life', + }, + exampleTopics: [ + 'daily life in Tokyo', + 'Japanese cuisine from ramen to kaiseki', + 'anime and manga culture', + 'Japanese seasons and festivals', + 'travel through Kyoto and rural Japan', + ], + requiredSttProvider: 'soniox', + }, + + ko: { + code: 'ko', + name: 'Korean', + nativeName: '한국어', + flag: '🇰🇷', + sttLanguageCode: 'ko-KR', + ttsConfig: { + speakerId: 'Seojun', + modelId: 'inworld-tts-1.5-max', + speakingRate: 1, + temperature: 1.1, + languageCode: 'ko-KR', + }, + teacherPersona: { + name: '김선생님 (Kim Seonsaengnim)', + age: 29, + nationality: 'Korean (Seoul)', + description: + 'a 29 year old Seoulite who enjoys teaching Korean through K-pop, K-drama, and Korean street food culture', + }, + exampleTopics: [ + 'life in Seoul and Busan', + 'Korean food and street food culture', + 'K-pop and K-drama', + 'Korean traditions and holidays', + 'travel through South Korea', + ], + requiredSttProvider: 'soniox', + }, + + ru: { + code: 'ru', + name: 'Russian', + nativeName: 'Русский', + flag: '🇷🇺', + sttLanguageCode: 'ru-RU', + ttsConfig: { + speakerId: 'Elena', + modelId: 'inworld-tts-1.5-max', + speakingRate: 1, + temperature: 1.1, + languageCode: 'ru-RU', + }, + teacherPersona: { + name: 'Елена Петровна (Elena Petrovna)', + age: 37, + nationality: 'Russian (Moscow)', + description: + 'a 37 year old Muscovite who loves teaching Russian through literature, history, and the richness of Russian culture', + }, + exampleTopics: [ + 'life in Moscow and Saint Petersburg', + 'Russian literature and poetry', + 'Russian cuisine and tea culture', + 'Russian music from classical to modern', + 'the 
Trans-Siberian Railway and Russian nature', + ], + requiredSttProvider: 'soniox', + }, }; /** @@ -255,27 +379,37 @@ export function getLanguageConfig(code: string): LanguageConfig { } /** - * Get all supported language codes + * Get all supported language codes, optionally filtered by STT provider */ -export function getSupportedLanguageCodes(): string[] { - return Object.keys(SUPPORTED_LANGUAGES); +export function getSupportedLanguageCodes(sttProvider?: STTProvider): string[] { + return Object.values(SUPPORTED_LANGUAGES) + .filter( + (lang) => + !lang.requiredSttProvider || lang.requiredSttProvider === sttProvider + ) + .map((lang) => lang.code); } /** - * Get language options for frontend dropdown + * Get language options for frontend dropdown, optionally filtered by STT provider */ -export function getLanguageOptions(): Array<{ +export function getLanguageOptions(sttProvider?: STTProvider): Array<{ code: string; name: string; nativeName: string; flag: string; }> { - return Object.values(SUPPORTED_LANGUAGES).map((lang) => ({ - code: lang.code, - name: lang.name, - nativeName: lang.nativeName, - flag: lang.flag, - })); + return Object.values(SUPPORTED_LANGUAGES) + .filter( + (lang) => + !lang.requiredSttProvider || lang.requiredSttProvider === sttProvider + ) + .map((lang) => ({ + code: lang.code, + name: lang.name, + nativeName: lang.nativeName, + flag: lang.flag, + })); } /** diff --git a/backend/src/config/server.ts b/backend/src/config/server.ts index 4e6f9d3..dbb36fd 100644 --- a/backend/src/config/server.ts +++ b/backend/src/config/server.ts @@ -153,6 +153,14 @@ export function getAssemblyAISettingsForEagerness( return assemblyAIPresets[eagerness]; } +/** + * Get the active STT provider at call time (after dotenv loads). + * Do NOT use serverConfig.sttProvider — it is evaluated at module load time before dotenv. 
+ */ +export function getSttProvider(): STTProvider { + return (process.env.STT_PROVIDER || 'assembly') as STTProvider; +} + /** * Get Soniox endpoint detection settings for the configured eagerness level. * Reads SONIOX_EAGERNESS from process.env at call time (after dotenv loads). diff --git a/backend/src/graphs/simple-tts-graph.ts b/backend/src/graphs/simple-tts-graph.ts index 2b7dd57..02877cb 100644 --- a/backend/src/graphs/simple-tts-graph.ts +++ b/backend/src/graphs/simple-tts-graph.ts @@ -22,7 +22,7 @@ import { getLanguageConfig, getSupportedLanguageCodes, } from '../config/languages.js'; -import { serverConfig } from '../config/server.js'; +import { serverConfig, getSttProvider } from '../config/server.js'; import { graphLogger as logger } from '../utils/logger.js'; export interface SimpleTTSInput { @@ -93,7 +93,7 @@ const simpleTTSGraphs = new Map(); * Initialize TTS graphs for all supported languages */ export function initializeTTSGraphs(): void { - const languageCodes = getSupportedLanguageCodes(); + const languageCodes = getSupportedLanguageCodes(getSttProvider()); logger.info( { languageCount: languageCodes.length }, diff --git a/backend/src/services/api-routes.ts b/backend/src/services/api-routes.ts index ab481bb..dccac71 100644 --- a/backend/src/services/api-routes.ts +++ b/backend/src/services/api-routes.ts @@ -10,6 +10,7 @@ import { getLanguageOptions, DEFAULT_LANGUAGE_CODE, } from '../config/languages.js'; +import { getSttProvider } from '../config/server.js'; import { serverLogger as logger } from '../utils/logger.js'; export const apiRouter = Router(); @@ -53,7 +54,7 @@ apiRouter.post('/export-anki', async (req, res) => { // Languages endpoint apiRouter.get('/languages', (_req, res) => { try { - const languages = getLanguageOptions(); + const languages = getLanguageOptions(getSttProvider()); res.json({ languages, defaultLanguage: DEFAULT_LANGUAGE_CODE }); } catch (error) { logger.error({ err: error }, 'get_languages_error'); diff --git 
a/backend/src/services/websocket-handler.ts b/backend/src/services/websocket-handler.ts index bd3350c..6115d6c 100644 --- a/backend/src/services/websocket-handler.ts +++ b/backend/src/services/websocket-handler.ts @@ -19,7 +19,7 @@ import { } from '../config/languages.js'; import { serverLogger as logger } from '../utils/logger.js'; import { getSimpleTTSGraph } from '../graphs/simple-tts-graph.js'; -import { serverConfig } from '../config/server.js'; +import { serverConfig, getSttProvider } from '../config/server.js'; import { connections, @@ -411,7 +411,7 @@ async function handleConversationSwitch( } // Validate language code - const supportedCodes = getSupportedLanguageCodes(); + const supportedCodes = getSupportedLanguageCodes(getSttProvider()); const languageCode = supportedCodes.includes(requestedLanguageCode) ? requestedLanguageCode : DEFAULT_LANGUAGE_CODE; @@ -518,7 +518,7 @@ function handleUserContext( const currentAttrs = connectionAttributes.get(connectionId) || {}; // Validate language code - const supportedCodes = getSupportedLanguageCodes(); + const supportedCodes = getSupportedLanguageCodes(getSttProvider()); const validatedLanguageCode = languageCode && supportedCodes.includes(languageCode) ? 
languageCode From fd3220b65b5d83732a8cfa82bb37bdbfcbe4aaca Mon Sep 17 00:00:00 2001 From: Cale Shapera <25466659+cshape@users.noreply.github.com> Date: Fri, 20 Feb 2026 17:11:12 -0800 Subject: [PATCH 05/16] feat: add flashcard translated sentences, pinyin for zh, and audio for sentences --- backend/src/graphs/flashcard-graph.ts | 6 +- backend/src/helpers/anki-exporter.ts | 15 +++- backend/src/helpers/flashcard-processor.ts | 4 + backend/src/prompts/flashcard.njk | 8 +- frontend/src/components/Flashcard.tsx | 39 +++++++++- frontend/src/components/FlashcardsSection.tsx | 15 ++++ frontend/src/services/SupabaseStorage.ts | 12 +++ frontend/src/styles/main.css | 76 +++++++++++++++++++ frontend/src/types/index.ts | 3 + .../20240108000000_initial_schema.sql | 3 + 10 files changed, 176 insertions(+), 5 deletions(-) diff --git a/backend/src/graphs/flashcard-graph.ts b/backend/src/graphs/flashcard-graph.ts index 6f8b108..cec41c2 100644 --- a/backend/src/graphs/flashcard-graph.ts +++ b/backend/src/graphs/flashcard-graph.ts @@ -51,7 +51,7 @@ class FlashcardParserNode extends CustomNode { const jsonMatch = textContent.match(/\{[\s\S]*\}/); if (jsonMatch) { const parsed = JSON.parse(jsonMatch[0]); - return { + const result: Record = { id: v4(), // Support both new 'targetWord' format and legacy 'spanish' format targetWord: parsed.targetWord ?? parsed.spanish ?? '', @@ -60,6 +60,10 @@ class FlashcardParserNode extends CustomNode { mnemonic: parsed.mnemonic ?? 
'', timestamp: new Date().toISOString(), }; + if (parsed.exampleTranslation) result.exampleTranslation = parsed.exampleTranslation; + if (parsed.pinyin) result.pinyin = parsed.pinyin; + if (parsed.examplePinyin) result.examplePinyin = parsed.examplePinyin; + return result; } } catch (error) { logger.error({ err: error }, 'failed_to_parse_flashcard_json'); diff --git a/backend/src/helpers/anki-exporter.ts b/backend/src/helpers/anki-exporter.ts index 710eeb8..793a3be 100644 --- a/backend/src/helpers/anki-exporter.ts +++ b/backend/src/helpers/anki-exporter.ts @@ -30,7 +30,10 @@ export class AnkiExporter { return; } - const front = targetWord.trim(); + let front = targetWord.trim(); + if (flashcard.pinyin) { + front += `
${this.escapeHtml(flashcard.pinyin)}`; + } const back = this.formatCardBack(flashcard); // Add tags for organization @@ -61,7 +64,15 @@ export class AnkiExporter { let back = `
${this.escapeHtml(flashcard.english)}
`; if (flashcard.example && flashcard.example.trim()) { - back += `
${this.escapeHtml(flashcard.example)}
`; + let exampleHtml = this.escapeHtml(flashcard.example); + if (flashcard.examplePinyin) { + exampleHtml += `
${this.escapeHtml(flashcard.examplePinyin)}`; + } + back += `
${exampleHtml}
`; + } + + if (flashcard.exampleTranslation && flashcard.exampleTranslation.trim()) { + back += `
${this.escapeHtml(flashcard.exampleTranslation)}
`; } if (flashcard.mnemonic && flashcard.mnemonic.trim()) { diff --git a/backend/src/helpers/flashcard-processor.ts b/backend/src/helpers/flashcard-processor.ts index e7aa240..2b48628 100644 --- a/backend/src/helpers/flashcard-processor.ts +++ b/backend/src/helpers/flashcard-processor.ts @@ -15,7 +15,10 @@ export interface Flashcard { targetWord: string; // The word in the target language (was 'spanish') english: string; example: string; + exampleTranslation?: string; mnemonic: string; + pinyin?: string; + examplePinyin?: string; timestamp: string; languageCode?: string; // Track which language this card belongs to } @@ -118,6 +121,7 @@ export class FlashcardProcessor { studentName: 'Student', teacherName: effectiveLanguageConfig.teacherPersona.name, target_language: effectiveLanguageConfig.name, + language_code: effectiveLanguageCode, messages: messages, flashcards: this.existingFlashcards, }; diff --git a/backend/src/prompts/flashcard.njk b/backend/src/prompts/flashcard.njk index 82c7c27..79c3ac1 100644 --- a/backend/src/prompts/flashcard.njk +++ b/backend/src/prompts/flashcard.njk @@ -5,7 +5,9 @@ Based on the ongoing conversation between {{studentName}} and {{teacherName}}, g - The word in {{target_language}} - The translation in English - An example sentence in {{target_language}} +- An English translation of the example sentence - A mnemonic to help the student remember the word (in English) +{% if language_code == "zh" %}- The pinyin romanization for both the word and the example sentence{% endif %} ## Conversation @@ -23,6 +25,7 @@ Based on the ongoing conversation between {{studentName}} and {{teacherName}}, g - The word must be related to the topics used in the conversation - The word should be useful to the learner so they can continue the conversation with new vocabulary - Avoid cognates +{% if language_code == "zh" %}- Include accurate pinyin with tone marks (e.g. 
"nǐ hǎo") for the word and the example sentence{% endif %} Now, return JSON with the following format: @@ -30,5 +33,8 @@ Now, return JSON with the following format: "targetWord": "string", "english": "string", "example": "string", - "mnemonic": "string" + "exampleTranslation": "string (English translation of the example sentence)", +{% if language_code == "zh" %} "pinyin": "string (pinyin for the word)", + "examplePinyin": "string (pinyin for the example sentence)", +{% endif %} "mnemonic": "string" } \ No newline at end of file diff --git a/frontend/src/components/Flashcard.tsx b/frontend/src/components/Flashcard.tsx index 84bb9a3..d5e4726 100644 --- a/frontend/src/components/Flashcard.tsx +++ b/frontend/src/components/Flashcard.tsx @@ -5,7 +5,9 @@ interface FlashcardProps { flashcard: FlashcardType; onCardClick?: (flashcard: FlashcardType) => void; onPronounce?: (flashcard: FlashcardType) => void; + onPronounceText?: (text: string) => void; isPronouncing?: boolean; + isPronouncingSentence?: boolean; } function capitalizeFirstLetter(text: string): string { @@ -17,7 +19,9 @@ export function Flashcard({ flashcard, onCardClick, onPronounce, + onPronounceText, isPronouncing = false, + isPronouncingSentence = false, }: FlashcardProps) { const [isFlipped, setIsFlipped] = useState(false); @@ -34,12 +38,26 @@ export function Flashcard({ [flashcard, onPronounce] ); + const handlePronounceExample = useCallback( + (e: React.MouseEvent) => { + e.stopPropagation(); + const text = flashcard.example || flashcard.example_sentence || ''; + if (text && onPronounceText) { + onPronounceText(text); + } + }, + [flashcard, onPronounceText] + ); + // Support both new 'targetWord' and legacy 'spanish' field const targetWord = flashcard.targetWord || flashcard.spanish || flashcard.word || ''; const english = flashcard.english || flashcard.translation || ''; const example = flashcard.example || flashcard.example_sentence || ''; + const exampleTranslation = flashcard.exampleTranslation || 
''; const mnemonic = flashcard.mnemonic || ''; + const pinyin = flashcard.pinyin || ''; + const examplePinyin = flashcard.examplePinyin || ''; // Capitalize the first letter of the target word for display const displayTargetWord = capitalizeFirstLetter(targetWord); @@ -52,6 +70,7 @@ export function Flashcard({
{displayTargetWord}
+ {pinyin &&
{pinyin}
}
-
+
{/* Loading overlay when not connected */} {connectionStatus === 'connecting' && (
@@ -223,6 +265,22 @@ export function ChatSection() {
+ {contextMenu && isConnected && ( +
+ + + + + + Create flashcard for “{contextMenu.word.length > 30 + ? contextMenu.word.slice(0, 30) + '…' + : contextMenu.word}” +
+ )} ); } diff --git a/frontend/src/context/AppContext.tsx b/frontend/src/context/AppContext.tsx index f5cea64..071acaa 100644 --- a/frontend/src/context/AppContext.tsx +++ b/frontend/src/context/AppContext.tsx @@ -261,6 +261,7 @@ interface AppContextType { handleInterrupt: () => void; sendTextMessage: (text: string) => void; pronounceWord: (text: string) => void; + createFlashcardForWord: (word: string) => void; // Conversation actions selectConversation: (conversationId: string) => void; createNewConversation: () => void; @@ -1332,6 +1333,21 @@ export function AppProvider({ children }: AppProviderProps) { [state.connectionStatus] ); + // Request flashcard generation for a specific word + const createFlashcardForWord = useCallback( + (word: string) => { + const wsClient = wsClientRef.current; + const trimmed = word.trim(); + if (state.connectionStatus !== 'connected' || !trimmed) return; + + wsClient.send({ + type: 'create_flashcard_request', + word: trimmed, + }); + }, + [state.connectionStatus] + ); + // Select a conversation from the sidebar const selectConversation = useCallback( (conversationId: string) => { @@ -1673,6 +1689,7 @@ export function AppProvider({ children }: AppProviderProps) { handleInterrupt, sendTextMessage, pronounceWord, + createFlashcardForWord, selectConversation, createNewConversation, deleteConversation, @@ -1691,6 +1708,7 @@ export function AppProvider({ children }: AppProviderProps) { handleInterrupt, sendTextMessage, pronounceWord, + createFlashcardForWord, selectConversation, createNewConversation, deleteConversation, diff --git a/frontend/src/styles/main.css b/frontend/src/styles/main.css index 6354d96..8c3f57a 100644 --- a/frontend/src/styles/main.css +++ b/frontend/src/styles/main.css @@ -1322,6 +1322,48 @@ body { cursor: not-allowed; } +/* Flashcard context menu (right-click to create flashcard) */ +.flashcard-context-menu { + position: fixed; + z-index: 1000; + background: #ffffff; + border: 1px solid #e5e5e5; + border-radius: 
8px; + padding: 8px 14px; + font-size: 14px; + color: #374151; + cursor: pointer; + box-shadow: 0 4px 16px rgba(0, 0, 0, 0.12); + display: flex; + align-items: center; + gap: 8px; + white-space: nowrap; + user-select: none; + animation: context-menu-in 0.1s ease-out; +} + +.flashcard-context-menu:hover { + background: #f3f4f6; + color: #1a1a1a; +} + +.flashcard-context-menu svg { + width: 16px; + height: 16px; + flex-shrink: 0; +} + +@keyframes context-menu-in { + from { + opacity: 0; + transform: scale(0.95); + } + to { + opacity: 1; + transform: scale(1); + } +} + /* Flashcards Section */ .flashcards-section { background: #ffffff; From c217591847e87e605368f7a8705d83e1f70bfc50 Mon Sep 17 00:00:00 2001 From: Cale Shapera <25466659+cshape@users.noreply.github.com> Date: Fri, 20 Feb 2026 17:53:41 -0800 Subject: [PATCH 07/16] chore: add pretty logs option for prod --- backend/src/utils/logger.ts | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/backend/src/utils/logger.ts b/backend/src/utils/logger.ts index 3403cb8..40e2a33 100644 --- a/backend/src/utils/logger.ts +++ b/backend/src/utils/logger.ts @@ -10,7 +10,8 @@ import pino from 'pino'; -const isDevelopment = process.env.NODE_ENV !== 'production'; +const usePrettyLogs = + process.env.LOG_PRETTY === 'true' || process.env.NODE_ENV !== 'production'; /** * Root logger instance @@ -18,7 +19,7 @@ const isDevelopment = process.env.NODE_ENV !== 'production'; */ export const logger = pino({ level: process.env.LOG_LEVEL || 'info', - transport: isDevelopment + transport: usePrettyLogs ? 
{ target: 'pino-pretty', options: { From 931411d78a1d156289a9f983df44ac3c97465512 Mon Sep 17 00:00:00 2001 From: Cale Shapera <25466659+cshape@users.noreply.github.com> Date: Fri, 20 Feb 2026 18:10:07 -0800 Subject: [PATCH 08/16] fix: align github/render icons better --- frontend/src/styles/main.css | 1 + 1 file changed, 1 insertion(+) diff --git a/frontend/src/styles/main.css b/frontend/src/styles/main.css index 8c3f57a..1976019 100644 --- a/frontend/src/styles/main.css +++ b/frontend/src/styles/main.css @@ -1819,6 +1819,7 @@ body { letter-spacing: 0.02em; width: 100%; box-sizing: border-box; + justify-content: flex-start; } @media (max-width: 768px) { From 45f28b9fcc97d3fed618818ecb7cf4fc0bb3477c Mon Sep 17 00:00:00 2001 From: Cale Shapera <25466659+cshape@users.noreply.github.com> Date: Fri, 20 Feb 2026 18:33:46 -0800 Subject: [PATCH 09/16] fix: better audio setup for mobile --- frontend/src/App.tsx | 5 +-- frontend/src/context/AppContext.tsx | 7 ++-- frontend/src/services/AudioPlayer.ts | 50 +++++++++++++++++++++++++--- 3 files changed, 53 insertions(+), 9 deletions(-) diff --git a/frontend/src/App.tsx b/frontend/src/App.tsx index dd39db8..94528a3 100644 --- a/frontend/src/App.tsx +++ b/frontend/src/App.tsx @@ -34,8 +34,9 @@ function AppContent() {
- {/* Hidden audio element for iOS compatibility */} -
{/* Floating Action Buttons */} diff --git a/frontend/src/context/AppContext.tsx b/frontend/src/context/AppContext.tsx index 071acaa..5656f8e 100644 --- a/frontend/src/context/AppContext.tsx +++ b/frontend/src/context/AppContext.tsx @@ -289,9 +289,12 @@ export function AppProvider({ children }: AppProviderProps) { const wsClientRef = useRef(wsClientInstance); const audioHandlerInstance = useMemo(() => new AudioHandler(), []); const audioHandlerRef = useRef(audioHandlerInstance); - const audioPlayerInstance = useMemo(() => new AudioPlayer(), []); + const audioPlayerInstance = useMemo(() => new AudioPlayer('ttsAudioOutput'), []); const audioPlayerRef = useRef(audioPlayerInstance); - const ttsAudioPlayerInstance = useMemo(() => new AudioPlayer(), []); + const ttsAudioPlayerInstance = useMemo( + () => new AudioPlayer('ttsAudioOutputFlashcard'), + [] + ); const ttsAudioPlayerRef = useRef(ttsAudioPlayerInstance); const hasMigratedRef = useRef(false); const conversationsLoadedRef = useRef(false); diff --git a/frontend/src/services/AudioPlayer.ts b/frontend/src/services/AudioPlayer.ts index 61a377d..d13de83 100644 --- a/frontend/src/services/AudioPlayer.ts +++ b/frontend/src/services/AudioPlayer.ts @@ -15,10 +15,14 @@ export class AudioPlayer { private nextStartTime: number = 0; private scheduledSources: AudioBufferSourceNode[] = []; private scheduleInterval: ReturnType | null = null; - private readonly SCHEDULE_AHEAD_TIME = 0.1; // Look 100ms ahead - private readonly FADE_SAMPLES = 128; // ~2.7ms at 48kHz, ~8ms at 16kHz - - constructor() { + private readonly SCHEDULE_AHEAD_TIME = 0.3; // Schedule 300ms ahead for mobile timer resilience + private readonly FADE_SAMPLES = 256; // ~11ms at 22050Hz TTS rate + private mediaStreamDest: MediaStreamAudioDestinationNode | null = null; + private audioElement: HTMLAudioElement | null = null; + private audioElementId: string; + + constructor(audioElementId: string = 'ttsAudioOutput') { + this.audioElementId = audioElementId; 
this.isIOS = /iPad|iPhone|iPod/.test(navigator.userAgent) || (navigator.platform === 'MacIntel' && navigator.maxTouchPoints > 1); @@ -55,6 +59,31 @@ export class AudioPlayer { await this.audioContext.resume(); } + // Route through