diff --git a/.gitignore b/.gitignore index 6c4e867..c54347b 100644 --- a/.gitignore +++ b/.gitignore @@ -140,6 +140,7 @@ vite.config.ts.timestamp-* # Project specific backend/audio/ +backend/src/graphs/configs/ .DS_Store CLAUDE.md templates/ diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 51f0e4c..2d1d8a2 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -33,7 +33,10 @@ Thank you for your interest in contributing to the Language Learning App! This d ```bash INWORLD_API_KEY=your_api_key_here + # Set one of these: ASSEMBLY_AI_API_KEY=your_api_key_here + # or + SONIOX_API_KEY=your_api_key_here ``` 5. **Verify the setup**: diff --git a/README.md b/README.md index 20af85d..11ddc8d 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ A conversational language learning app powered by Inworld AI Runtime. Practice s - Node.js (v20 or higher) - npm - An Inworld AI account and API key -- An AssemblyAI account and API key (for speech-to-text) +- An [AssemblyAI](https://www.assemblyai.com/) or [Soniox](https://soniox.com/) account and API key (for speech-to-text) ## Get Started @@ -35,17 +35,24 @@ This installs dependencies for the root, backend, and frontend automatically. ### Step 3: Configure Environment Variables -Create a `backend/.env` file: +Create a `backend/.env` file with your Inworld key and **one** of the two STT provider keys: ```bash INWORLD_API_KEY=your_inworld_base64_key + +# Pick one STT provider: ASSEMBLY_AI_API_KEY=your_assemblyai_key +# or +SONIOX_API_KEY=your_soniox_key ``` -| Service | Get Key From | Purpose | -| -------------- | --------------------------------------------------- | --------------------------------- | -| **Inworld** | [platform.inworld.ai](https://platform.inworld.ai/) | AI conversations (Base64 API key) | -| **AssemblyAI** | [assemblyai.com](https://www.assemblyai.com/) | Speech-to-text | +The server auto-detects which STT provider to use based on which API key is present. If both are set, Soniox takes priority. 
+ +| Service | Get Key From | Purpose | +| -------------- | ---------------------------------------------------- | --------------------------------- | +| **Inworld** | [platform.inworld.ai](https://platform.inworld.ai/) | AI conversations (Base64 API key) | +| **AssemblyAI** | [assemblyai.com](https://www.assemblyai.com/) | Speech-to-text (option 1) | +| **Soniox** | [soniox.com](https://soniox.com/) | Speech-to-text (option 2) | ### Step 4: Run the Application @@ -102,6 +109,18 @@ VITE_SUPABASE_PUBLISHABLE_KEY=your_anon_key Find these in: Supabase Dashboard > Settings > API +### Step 6 (Optional): Enable Flashcard Images with Replicate + +When exporting flashcards to Anki, the app can generate a unique illustrative image for each vocabulary word using [Replicate](https://replicate.com/)'s FLUX Schnell model. Without this key, flashcards are exported with audio only. + +Add to `backend/.env`: + +```bash +REPLICATE_API_TOKEN=your_replicate_api_token +``` + +Get a token at [replicate.com/account/api-tokens](https://replicate.com/account/api-tokens). + ## Repo Structure ``` @@ -143,7 +162,7 @@ The app uses a real-time audio streaming architecture: 1. **Frontend** captures microphone audio and streams it via WebSocket 2. **Backend** processes audio through an Inworld Runtime graph: - - AssemblyAI handles speech-to-text with voice activity detection + - Speech-to-text with voice activity detection (AssemblyAI or Soniox) - LLM generates contextual responses in the target language - TTS converts responses back to audio 3. 
**Flashcards** are auto-generated from conversation vocabulary @@ -166,16 +185,19 @@ Without Supabase, the app works in anonymous mode using localStorage (no memory ## Environment Variables Reference -| Variable | Required | Description | -| --------------------------- | -------- | ------------------------------------------------------------------ | -| `INWORLD_API_KEY` | Yes | Inworld AI Base64 API key | -| `ASSEMBLY_AI_API_KEY` | Yes | AssemblyAI API key | -| `PORT` | No | Server port (default: 3000) | -| `LOG_LEVEL` | No | `trace`, `debug`, `info`, `warn`, `error`, `fatal` (default: info) | -| `NODE_ENV` | No | Set to `production` for production log format | -| `ASSEMBLY_AI_EAGERNESS` | No | Turn detection: `low`, `medium`, `high` (default: high) | -| `SUPABASE_URL` | No | Supabase project URL (enables memory feature) | -| `SUPABASE_SECRET_KEY` | No | Supabase secret key (for backend memory storage) | +| Variable | Required | Description | +| --------------------------- | ------------------ | ------------------------------------------------------------------ | +| `INWORLD_API_KEY` | Yes | Inworld AI Base64 API key | +| `ASSEMBLY_AI_API_KEY` | One of these two ↕ | AssemblyAI API key | +| `SONIOX_API_KEY` | One of these two ↑ | Soniox API key (takes priority if both are set) | +| `PORT` | No | Server port (default: 3000) | +| `LOG_LEVEL` | No | `trace`, `debug`, `info`, `warn`, `error`, `fatal` (default: info) | +| `NODE_ENV` | No | Set to `production` for production log format | +| `ASSEMBLY_AI_EAGERNESS` | No | AssemblyAI turn detection: `low`, `medium`, `high` (default: high) | +| `SONIOX_EAGERNESS` | No | Soniox endpoint detection: `low`, `medium`, `high` (default: high) | +| `SUPABASE_URL` | No | Supabase project URL (enables memory feature) | +| `SUPABASE_SECRET_KEY` | No | Supabase secret key (for backend memory storage) | +| `REPLICATE_API_TOKEN` | No | Replicate API token (enables flashcard image generation) | ## Testing diff --git a/backend/.env.example 
b/backend/.env.example index 2488c28..251c931 100644 --- a/backend/.env.example +++ b/backend/.env.example @@ -1,5 +1,11 @@ INWORLD_API_KEY= + +# Speech-to-text: set ONE of these (Soniox takes priority if both are set) ASSEMBLY_AI_API_KEY= +SONIOX_API_KEY= + +# Optional: generates images for Anki flashcards +REPLICATE_API_TOKEN= SUPABASE_URL= -SUPABASE_SECRET_KEY= \ No newline at end of file +SUPABASE_SECRET_KEY= diff --git a/backend/package-lock.json b/backend/package-lock.json index 51c3e72..1d24a22 100644 --- a/backend/package-lock.json +++ b/backend/package-lock.json @@ -15,6 +15,7 @@ "cors": "^2.8.5", "dotenv": "^17.2.1", "express": "^4.22.1", + "jsonrepair": "^3.13.2", "pino": "^10.1.0", "uuid": "^11.1.0", "ws": "^8.18.0" @@ -1695,7 +1696,6 @@ "integrity": "sha512-tK3GPFWbirvNgsNKto+UmB/cRtn6TZfyw0D6IKrW55n6Vbs7KJoZtI//kpTKzE/DUmmnAFD8/Ca46s7Obs92/w==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "@typescript-eslint/scope-manager": "8.46.4", "@typescript-eslint/types": "8.46.4", @@ -2191,7 +2191,6 @@ "integrity": "sha512-NZyJarBfL7nWwIq+FDL6Zp/yHEhePMNnnJ0y3qfieCrmNvYct8uvtiV41UvlSe6apAfk0fY1FbWx+NwfmpvtTg==", "dev": true, "license": "MIT", - "peer": true, "bin": { "acorn": "bin/acorn" }, @@ -3222,7 +3221,6 @@ "integrity": "sha512-BhHmn2yNOFA9H9JmmIVKJmd288g9hrVRDkdoIgRCRuSySRUHH7r/DI6aAXW9T1WwUuY3DFgrcaqB+deURBLR5g==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "@eslint-community/eslint-utils": "^4.8.0", "@eslint-community/regexpp": "^4.12.1", @@ -3283,7 +3281,6 @@ "integrity": "sha512-iI1f+D2ViGn+uvv5HuHVUamg8ll4tN+JRHGc6IJi4TP9Kl976C57fzPXgseXNs8v0iA8aSJpHsTWjDb9QJamGQ==", "dev": true, "license": "MIT", - "peer": true, "bin": { "eslint-config-prettier": "bin/cli.js" }, @@ -4549,6 +4546,15 @@ "dev": true, "license": "MIT" }, + "node_modules/jsonrepair": { + "version": "3.13.2", + "resolved": "https://registry.npmjs.org/jsonrepair/-/jsonrepair-3.13.2.tgz", + "integrity": 
"sha512-Leuly0nbM4R+S5SVJk3VHfw1oxnlEK9KygdZvfUtEtTawNDyzB4qa1xWTmFt1aeoA7sXZkVTRuIixJ8bAvqVUg==", + "license": "ISC", + "bin": { + "jsonrepair": "bin/cli.js" + } + }, "node_modules/jszip": { "version": "3.10.1", "resolved": "https://registry.npmjs.org/jszip/-/jszip-3.10.1.tgz", @@ -5343,7 +5349,6 @@ "integrity": "sha512-I7AIg5boAr5R0FFtJ6rCfD+LFsWHp81dolrFD8S79U9tb8Az2nGrJncnMSnys+bpQJfRUzqs9hnA81OAA3hCuQ==", "dev": true, "license": "MIT", - "peer": true, "bin": { "prettier": "bin/prettier.cjs" }, @@ -6293,7 +6298,6 @@ "integrity": "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==", "dev": true, "license": "MIT", - "peer": true, "engines": { "node": ">=12" }, @@ -6394,7 +6398,6 @@ "integrity": "sha512-ytQKuwgmrrkDTFP4LjR0ToE2nqgy886GpvRSpU0JAnrdBYppuY5rLkRUYPU1yCryb24SsKBTL/hlDQAEFVwtZg==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "esbuild": "~0.25.0", "get-tsconfig": "^4.7.5" @@ -6455,7 +6458,6 @@ "integrity": "sha512-CWBzXQrc/qOkhidw1OzBTQuYRbfyxDXJMVJ1XNwUHGROVmuaeiEm3OslpZ1RV96d7SKKjZKrSJu3+t/xlw3R9A==", "dev": true, "license": "Apache-2.0", - "peer": true, "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" @@ -6573,7 +6575,6 @@ "integrity": "sha512-w+N7Hifpc3gRjZ63vYBXA56dvvRlNWRczTdmCBBa+CotUzAPf5b7YMdMR/8CQoeYE5LX3W4wj6RYTgonm1b9DA==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "esbuild": "^0.27.0", "fdir": "^6.5.0", @@ -7151,7 +7152,6 @@ "integrity": "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==", "dev": true, "license": "MIT", - "peer": true, "engines": { "node": ">=12" }, @@ -7165,7 +7165,6 @@ "integrity": "sha512-E4t7DJ9pESL6E3I8nFjPa4xGUd3PmiWDLsDztS2qXSJWfHtbQnwAWylaBvSNY48I3vr8PTqIZlyK8TE3V3CA4Q==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "@vitest/expect": "4.0.16", "@vitest/mocker": "4.0.16", diff --git a/backend/package.json b/backend/package.json index a2f5f55..6b19171 100644 
--- a/backend/package.json +++ b/backend/package.json @@ -58,6 +58,7 @@ "cors": "^2.8.5", "dotenv": "^17.2.1", "express": "^4.22.1", + "jsonrepair": "^3.13.2", "pino": "^10.1.0", "uuid": "^11.1.0", "ws": "^8.18.0" diff --git a/backend/src/__tests__/config/languages.test.ts b/backend/src/__tests__/config/languages.test.ts index 426d959..59f4933 100644 --- a/backend/src/__tests__/config/languages.test.ts +++ b/backend/src/__tests__/config/languages.test.ts @@ -74,13 +74,30 @@ describe('languages config', () => { expect(codes).toContain('de'); }); - it('matches SUPPORTED_LANGUAGES keys', () => { + it('without provider, returns only languages without requiredSttProvider', () => { const codes = getSupportedLanguageCodes(); - expect(codes.length).toBe(Object.keys(SUPPORTED_LANGUAGES).length); for (const code of codes) { expect(SUPPORTED_LANGUAGES[code]).toBeDefined(); + expect(SUPPORTED_LANGUAGES[code].requiredSttProvider).toBeUndefined(); } }); + + it('with soniox provider, returns all languages', () => { + const codes = getSupportedLanguageCodes('soniox'); + expect(codes.length).toBe(Object.keys(SUPPORTED_LANGUAGES).length); + expect(codes).toContain('zh'); + expect(codes).toContain('ja'); + expect(codes).toContain('ko'); + expect(codes).toContain('ru'); + }); + + it('with assembly provider, excludes soniox-only languages', () => { + const codes = getSupportedLanguageCodes('assembly'); + expect(codes).not.toContain('zh'); + expect(codes).not.toContain('ja'); + expect(codes).not.toContain('ko'); + expect(codes).not.toContain('ru'); + }); }); describe('getLanguageOptions', () => { diff --git a/backend/src/config/languages.ts b/backend/src/config/languages.ts index f6d43ad..ec20d1d 100644 --- a/backend/src/config/languages.ts +++ b/backend/src/config/languages.ts @@ -8,6 +8,7 @@ */ import { createLogger } from '../utils/logger.js'; +import type { STTProvider } from './server.js'; const logger = createLogger('Languages'); @@ -46,6 +47,9 @@ export interface LanguageConfig { 
// Example conversation topics specific to this language's culture exampleTopics: string[]; + + // If set, this language is only available when the given STT provider is active + requiredSttProvider?: STTProvider; } /** @@ -235,6 +239,126 @@ export const SUPPORTED_LANGUAGES: Record = { 'the Amazon and Brazilian nature', ], }, + + zh: { + code: 'zh', + name: 'Chinese', + nativeName: 'δΈ­ζ–‡', + flag: 'πŸ‡¨πŸ‡³', + sttLanguageCode: 'zh-CN', + ttsConfig: { + speakerId: 'Xiaoyin', + modelId: 'inworld-tts-1.5-max', + speakingRate: 1, + temperature: 1.1, + languageCode: 'zh-CN', + }, + teacherPersona: { + name: 'ζŽθ€εΈˆ (Lǐ LǎoshΔ«)', + age: 33, + nationality: 'Chinese (Beijing)', + description: + 'a 33 year old Beijinger who loves teaching Mandarin through Chinese culture, food, and modern life', + }, + exampleTopics: [ + 'life in Beijing and Shanghai', + 'Chinese cuisine and regional flavors', + 'Chinese festivals and traditions', + 'modern Chinese pop culture', + 'travel along the Silk Road', + ], + requiredSttProvider: 'soniox', + }, + + ja: { + code: 'ja', + name: 'Japanese', + nativeName: 'ζ—₯本θͺž', + flag: 'πŸ‡―πŸ‡΅', + sttLanguageCode: 'ja-JP', + ttsConfig: { + speakerId: 'Asuka', + modelId: 'inworld-tts-1.5-max', + speakingRate: 1, + temperature: 1.1, + languageCode: 'ja-JP', + }, + teacherPersona: { + name: 'η”°δΈ­ε…ˆη”Ÿ (Tanaka-sensei)', + age: 31, + nationality: 'Japanese (Tokyo)', + description: + 'a 31 year old Tokyoite who is passionate about teaching Japanese through anime, food, and everyday life', + }, + exampleTopics: [ + 'daily life in Tokyo', + 'Japanese cuisine from ramen to kaiseki', + 'anime and manga culture', + 'Japanese seasons and festivals', + 'travel through Kyoto and rural Japan', + ], + requiredSttProvider: 'soniox', + }, + + ko: { + code: 'ko', + name: 'Korean', + nativeName: 'ν•œκ΅­μ–΄', + flag: 'πŸ‡°πŸ‡·', + sttLanguageCode: 'ko-KR', + ttsConfig: { + speakerId: 'Seojun', + modelId: 'inworld-tts-1.5-max', + speakingRate: 1, + 
temperature: 1.1, + languageCode: 'ko-KR', + }, + teacherPersona: { + name: 'κΉ€μ„ μƒλ‹˜ (Kim Seonsaengnim)', + age: 29, + nationality: 'Korean (Seoul)', + description: + 'a 29 year old Seoulite who enjoys teaching Korean through K-pop, K-drama, and Korean street food culture', + }, + exampleTopics: [ + 'life in Seoul and Busan', + 'Korean food and street food culture', + 'K-pop and K-drama', + 'Korean traditions and holidays', + 'travel through South Korea', + ], + requiredSttProvider: 'soniox', + }, + + ru: { + code: 'ru', + name: 'Russian', + nativeName: 'Русский', + flag: 'πŸ‡·πŸ‡Ί', + sttLanguageCode: 'ru-RU', + ttsConfig: { + speakerId: 'Elena', + modelId: 'inworld-tts-1.5-max', + speakingRate: 1, + temperature: 1.1, + languageCode: 'ru-RU', + }, + teacherPersona: { + name: 'Π•Π»Π΅Π½Π° ΠŸΠ΅Ρ‚Ρ€ΠΎΠ²Π½Π° (Elena Petrovna)', + age: 37, + nationality: 'Russian (Moscow)', + description: + 'a 37 year old Muscovite who loves teaching Russian through literature, history, and the richness of Russian culture', + }, + exampleTopics: [ + 'life in Moscow and Saint Petersburg', + 'Russian literature and poetry', + 'Russian cuisine and tea culture', + 'Russian music from classical to modern', + 'the Trans-Siberian Railway and Russian nature', + ], + requiredSttProvider: 'soniox', + }, }; /** @@ -255,27 +379,37 @@ export function getLanguageConfig(code: string): LanguageConfig { } /** - * Get all supported language codes + * Get all supported language codes, optionally filtered by STT provider */ -export function getSupportedLanguageCodes(): string[] { - return Object.keys(SUPPORTED_LANGUAGES); +export function getSupportedLanguageCodes(sttProvider?: STTProvider): string[] { + return Object.values(SUPPORTED_LANGUAGES) + .filter( + (lang) => + !lang.requiredSttProvider || lang.requiredSttProvider === sttProvider + ) + .map((lang) => lang.code); } /** - * Get language options for frontend dropdown + * Get language options for frontend dropdown, optionally filtered by STT 
provider */ -export function getLanguageOptions(): Array<{ +export function getLanguageOptions(sttProvider?: STTProvider): Array<{ code: string; name: string; nativeName: string; flag: string; }> { - return Object.values(SUPPORTED_LANGUAGES).map((lang) => ({ - code: lang.code, - name: lang.name, - nativeName: lang.nativeName, - flag: lang.flag, - })); + return Object.values(SUPPORTED_LANGUAGES) + .filter( + (lang) => + !lang.requiredSttProvider || lang.requiredSttProvider === sttProvider + ) + .map((lang) => ({ + code: lang.code, + name: lang.name, + nativeName: lang.nativeName, + flag: lang.flag, + })); } /** diff --git a/backend/src/config/server.ts b/backend/src/config/server.ts index aa6f9ac..3ad538e 100644 --- a/backend/src/config/server.ts +++ b/backend/src/config/server.ts @@ -5,6 +5,8 @@ * Environment variables can override defaults where appropriate. */ +export type STTProvider = 'assembly' | 'soniox'; + export interface AssemblyAITurnDetectionSettings { endOfTurnConfidenceThreshold: number; minEndOfTurnSilenceWhenConfident: number; @@ -59,6 +61,31 @@ const assemblyAIPresets: Record< }, }; +export interface SonioxEndpointSettings { + maxEndpointDelayMs: number; + description: string; +} + +/** + * Soniox endpoint detection presets mapped to the same eagerness levels. + * max_endpoint_delay_ms controls how quickly Soniox returns endpoints (500-3000ms). 
+ * @see https://soniox.com/docs/stt/rt/endpoint-detection + */ +const sonioxPresets: Record = { + high: { + maxEndpointDelayMs: 500, + description: 'Aggressive - fastest endpoint detection (500ms)', + }, + medium: { + maxEndpointDelayMs: 1000, + description: 'Balanced - moderate endpoint delay (1000ms)', + }, + low: { + maxEndpointDelayMs: 2000, + description: 'Conservative - patient endpoint detection (2000ms)', + }, +}; + export const serverConfig = { /** * HTTP server port @@ -86,6 +113,14 @@ export const serverConfig = { formatTurns: false, }, + /** + * Soniox speech-to-text configuration + */ + soniox: { + /** Endpoint detection eagerness level (reuses the same 'low'|'medium'|'high' scale) */ + eagerness: (process.env.SONIOX_EAGERNESS || 'high') as AssemblyAIEagerness, + }, + /** * Telemetry configuration for Inworld Runtime */ @@ -111,3 +146,22 @@ export function getAssemblyAISettingsForEagerness( ): AssemblyAITurnDetectionSettings { return assemblyAIPresets[eagerness]; } + +/** + * Auto-detect the active STT provider based on which API key is configured. + * SONIOX_API_KEY takes priority if both keys are present. + */ +export function getSttProvider(): STTProvider { + if (process.env.SONIOX_API_KEY) return 'soniox'; + return 'assembly'; +} + +/** + * Get Soniox endpoint detection settings for the configured eagerness level. + * Reads SONIOX_EAGERNESS from process.env at call time (after dotenv loads). 
+ */ +export function getSonioxSettings(): SonioxEndpointSettings { + const eagerness = (process.env.SONIOX_EAGERNESS || + 'high') as AssemblyAIEagerness; + return sonioxPresets[eagerness]; +} diff --git a/backend/src/graphs/configs/flashcard-generation-graph.json b/backend/src/graphs/configs/flashcard-generation-graph.json deleted file mode 100644 index dc39045..0000000 --- a/backend/src/graphs/configs/flashcard-generation-graph.json +++ /dev/null @@ -1,91 +0,0 @@ -{ - "schema_version": "1.2.2", - "main": { - "id": "flashcard-generation-graph", - "nodes": [ - { - "type": "FlashcardPromptBuilderNodeType", - "id": "flashcard-prompt-builder", - "execution_config": { - "type": "NodeExecutionConfig", - "properties": { - "report_to_client": false - } - } - }, - { - "type": "TextToChatRequestNodeType", - "id": "text-to-chat-request", - "execution_config": { - "type": "NodeExecutionConfig", - "properties": { - "report_to_client": false - } - } - }, - { - "id": "llm_node", - "type": "LLMChatNode", - "execution_config": { - "type": "LLMChatNodeExecutionConfig", - "properties": { - "llm_component_id": "llm_node_llm_component", - "text_generation_config": { - "max_new_tokens": 2500, - "max_prompt_length": 2000, - "temperature": 1, - "top_p": 1, - "repetition_penalty": 1, - "frequency_penalty": 0, - "presence_penalty": 0 - }, - "stream": false, - "report_to_client": false, - "response_format": "text" - } - } - }, - { - "type": "FlashcardParserNodeType", - "id": "flashcard-parser", - "execution_config": { - "type": "NodeExecutionConfig", - "properties": { - "report_to_client": false - } - } - } - ], - "edges": [ - { - "from_node": "flashcard-prompt-builder", - "to_node": "text-to-chat-request" - }, - { - "from_node": "text-to-chat-request", - "to_node": "llm_node" - }, - { - "from_node": "llm_node", - "to_node": "flashcard-parser" - } - ], - "end_nodes": ["flashcard-parser"], - "start_nodes": ["flashcard-prompt-builder"] - }, - "components": [ - { - "id": 
"llm_node_llm_component", - "type": "LLMInterface", - "creation_config": { - "type": "RemoteLLMConfig", - "properties": { - "provider": "openai", - "model_name": "gpt-4.1-nano", - "default_config": {}, - "api_key": "{{INWORLD_API_KEY}}" - } - } - } - ] -} diff --git a/backend/src/graphs/configs/lang-learning-conversation-graph.json b/backend/src/graphs/configs/lang-learning-conversation-graph.json deleted file mode 100644 index 09b897f..0000000 --- a/backend/src/graphs/configs/lang-learning-conversation-graph.json +++ /dev/null @@ -1,289 +0,0 @@ -{ - "schema_version": "1.2.2", - "main": { - "id": "lang-learning-conversation-graph", - "nodes": [ - { - "id": "audio-input-proxy-lang-learning", - "type": "ProxyNode", - "execution_config": { - "type": "NodeExecutionConfig", - "properties": { - "report_to_client": false - } - } - }, - { - "type": "AssemblyAISTTWebSocketNodeType", - "id": "assembly-ai-stt-ws-node-lang-learning", - "execution_config": { - "type": "NodeExecutionConfig", - "properties": { - "report_to_client": false - } - } - }, - { - "type": "TranscriptExtractorNodeType", - "id": "transcript-extractor-node-lang-learning", - "execution_config": { - "type": "NodeExecutionConfig", - "properties": { - "report_to_client": true - } - } - }, - { - "type": "InteractionQueueNodeType", - "id": "interaction-queue-node-lang-learning", - "execution_config": { - "type": "NodeExecutionConfig", - "properties": { - "report_to_client": false - } - } - }, - { - "type": "TextInputNodeType", - "id": "text-input-node-lang-learning", - "execution_config": { - "type": "NodeExecutionConfig", - "properties": { - "report_to_client": true - } - } - }, - { - "type": "MemoryRetrievalNodeType", - "id": "memory-retrieval-node-lang-learning", - "execution_config": { - "type": "NodeExecutionConfig", - "properties": { - "report_to_client": false - } - } - }, - { - "type": "DialogPromptBuilderNodeType", - "id": "dialog-prompt-builder-node-lang-learning", - "execution_config": { - "type": 
"NodeExecutionConfig", - "properties": { - "report_to_client": false - } - } - }, - { - "id": "llm-node-lang-learning", - "type": "LLMChatNode", - "execution_config": { - "type": "LLMChatNodeExecutionConfig", - "properties": { - "llm_component_id": "llm-node-lang-learning_llm_component", - "text_generation_config": { - "max_new_tokens": 250, - "max_prompt_length": 2000, - "temperature": 1, - "top_p": 1, - "repetition_penalty": 1, - "frequency_penalty": 0, - "presence_penalty": 0 - }, - "stream": true, - "report_to_client": true, - "response_format": "text" - } - } - }, - { - "id": "text-chunking-node-lang-learning", - "type": "TextChunkingNode", - "execution_config": { - "type": "TextChunkingNodeExecutionConfig", - "properties": { - "report_to_client": false - } - } - }, - { - "id": "text-aggregator-node-lang-learning", - "type": "TextAggregatorNode", - "execution_config": { - "type": "NodeExecutionConfig", - "properties": { - "report_to_client": false - } - } - }, - { - "type": "TTSRequestBuilderNodeType", - "id": "tts-request-builder-node-lang-learning", - "execution_config": { - "type": "NodeExecutionConfig", - "properties": { - "report_to_client": false - } - } - }, - { - "id": "tts-node-lang-learning", - "type": "TTSNode", - "execution_config": { - "type": "TTSNodeExecutionConfig", - "properties": { - "tts_component_id": "tts-node-lang-learning_tts_component", - "voice": { - "id": "Rafael", - "language_code": "es-MX" - }, - "synthesis_config": { - "type": "inworld", - "config": { - "model_id": "inworld-tts-1.5-max", - "inference": { - "speaking_rate": 1, - "temperature": 1.1 - }, - "postprocessing": { - "sample_rate": 22050 - } - } - }, - "report_to_client": true - } - } - }, - { - "type": "StateUpdateNodeType", - "id": "state-update-node-lang-learning", - "execution_config": { - "type": "NodeExecutionConfig", - "properties": { - "report_to_client": true - } - } - } - ], - "edges": [ - { - "from_node": "audio-input-proxy-lang-learning", - "to_node": 
"assembly-ai-stt-ws-node-lang-learning" - }, - { - "from_node": "assembly-ai-stt-ws-node-lang-learning", - "to_node": "assembly-ai-stt-ws-node-lang-learning", - "condition_id": "custom-condition-from-assembly-ai-stt-ws-node-lang-learning-to-assembly-ai-stt-ws-node-lang-learning", - "optional": true, - "loop": true - }, - { - "from_node": "assembly-ai-stt-ws-node-lang-learning", - "to_node": "transcript-extractor-node-lang-learning", - "condition_id": "custom-condition-from-assembly-ai-stt-ws-node-lang-learning-to-transcript-extractor-node-lang-learning" - }, - { - "from_node": "transcript-extractor-node-lang-learning", - "to_node": "interaction-queue-node-lang-learning" - }, - { - "from_node": "interaction-queue-node-lang-learning", - "to_node": "text-input-node-lang-learning", - "condition_id": "custom-condition-from-interaction-queue-node-lang-learning-to-text-input-node-lang-learning" - }, - { - "from_node": "text-input-node-lang-learning", - "to_node": "memory-retrieval-node-lang-learning" - }, - { - "from_node": "memory-retrieval-node-lang-learning", - "to_node": "dialog-prompt-builder-node-lang-learning" - }, - { - "from_node": "text-input-node-lang-learning", - "to_node": "tts-request-builder-node-lang-learning" - }, - { - "from_node": "dialog-prompt-builder-node-lang-learning", - "to_node": "llm-node-lang-learning" - }, - { - "from_node": "llm-node-lang-learning", - "to_node": "text-chunking-node-lang-learning" - }, - { - "from_node": "llm-node-lang-learning", - "to_node": "text-aggregator-node-lang-learning" - }, - { - "from_node": "text-chunking-node-lang-learning", - "to_node": "tts-request-builder-node-lang-learning" - }, - { - "from_node": "tts-request-builder-node-lang-learning", - "to_node": "tts-node-lang-learning" - }, - { - "from_node": "text-aggregator-node-lang-learning", - "to_node": "state-update-node-lang-learning" - }, - { - "from_node": "state-update-node-lang-learning", - "to_node": "interaction-queue-node-lang-learning", - "optional": 
true, - "loop": true - } - ], - "end_nodes": ["tts-node-lang-learning"], - "start_nodes": ["audio-input-proxy-lang-learning"] - }, - "components": [ - { - "id": "llm-node-lang-learning_llm_component", - "type": "LLMInterface", - "creation_config": { - "type": "RemoteLLMConfig", - "properties": { - "provider": "openai", - "model_name": "gpt-4.1-nano", - "default_config": {}, - "api_key": "{{INWORLD_API_KEY}}" - } - } - }, - { - "id": "tts-node-lang-learning_tts_component", - "type": "TTSInterface", - "creation_config": { - "type": "RemoteTTSConfig", - "properties": { - "synthesis_config": { - "type": "inworld", - "config": { - "model_id": "inworld-tts-1.5-max", - "inference": { - "speaking_rate": 1, - "temperature": 1.1 - }, - "postprocessing": { - "sample_rate": 22050 - } - } - }, - "api_key": "{{INWORLD_API_KEY}}" - } - } - }, - { - "id": "custom-condition-from-assembly-ai-stt-ws-node-lang-learning-to-assembly-ai-stt-ws-node-lang-learning", - "type": "custom-condition-from-assembly-ai-stt-ws-node-lang-learning-to-assembly-ai-stt-ws-node-lang-learning" - }, - { - "id": "custom-condition-from-assembly-ai-stt-ws-node-lang-learning-to-transcript-extractor-node-lang-learning", - "type": "custom-condition-from-assembly-ai-stt-ws-node-lang-learning-to-transcript-extractor-node-lang-learning" - }, - { - "id": "custom-condition-from-interaction-queue-node-lang-learning-to-text-input-node-lang-learning", - "type": "custom-condition-from-interaction-queue-node-lang-learning-to-text-input-node-lang-learning" - } - ] -} diff --git a/backend/src/graphs/configs/response-feedback-graph.json b/backend/src/graphs/configs/response-feedback-graph.json deleted file mode 100644 index d694a2b..0000000 --- a/backend/src/graphs/configs/response-feedback-graph.json +++ /dev/null @@ -1,91 +0,0 @@ -{ - "schema_version": "1.2.2", - "main": { - "id": "response-feedback-graph", - "nodes": [ - { - "type": "FeedbackPromptBuilderNodeType", - "id": "feedback-prompt-builder", - "execution_config": { 
- "type": "NodeExecutionConfig", - "properties": { - "report_to_client": false - } - } - }, - { - "type": "TextToChatRequestNodeType", - "id": "text-to-chat-request", - "execution_config": { - "type": "NodeExecutionConfig", - "properties": { - "report_to_client": false - } - } - }, - { - "id": "llm-node", - "type": "LLMChatNode", - "execution_config": { - "type": "LLMChatNodeExecutionConfig", - "properties": { - "llm_component_id": "llm-node_llm_component", - "text_generation_config": { - "max_new_tokens": 100, - "max_prompt_length": 2000, - "temperature": 0.7, - "top_p": 1, - "repetition_penalty": 1, - "frequency_penalty": 0, - "presence_penalty": 0 - }, - "stream": false, - "report_to_client": false, - "response_format": "text" - } - } - }, - { - "type": "FeedbackExtractorNodeType", - "id": "feedback-extractor", - "execution_config": { - "type": "NodeExecutionConfig", - "properties": { - "report_to_client": false - } - } - } - ], - "edges": [ - { - "from_node": "feedback-prompt-builder", - "to_node": "text-to-chat-request" - }, - { - "from_node": "text-to-chat-request", - "to_node": "llm-node" - }, - { - "from_node": "llm-node", - "to_node": "feedback-extractor" - } - ], - "end_nodes": ["feedback-extractor"], - "start_nodes": ["feedback-prompt-builder"] - }, - "components": [ - { - "id": "llm-node_llm_component", - "type": "LLMInterface", - "creation_config": { - "type": "RemoteLLMConfig", - "properties": { - "provider": "openai", - "model_name": "gpt-4.1-nano", - "default_config": {}, - "api_key": "{{INWORLD_API_KEY}}" - } - } - } - ] -} diff --git a/backend/src/graphs/conversation-graph.ts b/backend/src/graphs/conversation-graph.ts index 1c4f578..20e1949 100644 --- a/backend/src/graphs/conversation-graph.ts +++ b/backend/src/graphs/conversation-graph.ts @@ -2,13 +2,13 @@ * Conversation Graph for Language Learning App - Inworld Runtime 0.9 * * This is a long-running circular graph that: - * - Processes continuous audio streams via AssemblyAI STT with built-in 
VAD + * - Processes continuous audio streams via STT (AssemblyAI or Soniox) with built-in VAD * - Queues interactions for sequential processing * - Uses language-specific prompts and TTS voices * - Loops back for the next interaction automatically * * Graph Flow: - * AudioInput β†’ AssemblyAI STT (loop) β†’ TranscriptExtractor β†’ InteractionQueue + * AudioInput β†’ STT (loop) β†’ TranscriptExtractor β†’ InteractionQueue * β†’ TextInput β†’ DialogPromptBuilder β†’ LLM β†’ TextChunking β†’ TTSRequestBuilder β†’ TTS * β†’ TextAggregator β†’ StateUpdate β†’ (loop back to InteractionQueue) */ @@ -16,6 +16,7 @@ import { Graph, GraphBuilder, + CustomNode, ProxyNode, RemoteLLMChatNode, RemoteTTSNode, @@ -24,6 +25,8 @@ import { } from '@inworld/runtime/graph'; import { AssemblyAISTTWebSocketNode } from './nodes/assembly-ai-stt-ws-node.js'; +import { SonioxSTTWebSocketNode } from './nodes/soniox-stt-ws-node.js'; +import { STTNode } from './nodes/stt-node.js'; import { DialogPromptBuilderNode } from './nodes/dialog-prompt-builder-node.js'; import { InteractionQueueNode } from './nodes/interaction-queue-node.js'; import { MemoryRetrievalNode } from './nodes/memory-retrieval-node.js'; @@ -37,33 +40,36 @@ import { DEFAULT_LANGUAGE_CODE, } from '../config/languages.js'; import { llmConfig } from '../config/llm.js'; -import { serverConfig, getAssemblyAISettings } from '../config/server.js'; +import { + serverConfig, + getAssemblyAISettings, + getSonioxSettings, + STTProvider, +} from '../config/server.js'; import { graphLogger as logger } from '../utils/logger.js'; export interface ConversationGraphConfig { - assemblyAIApiKey: string; + sttProvider: STTProvider; + sttApiKey: string; connections: ConnectionsMap; defaultLanguageCode?: string; } /** - * Wrapper class for the conversation graph - * Provides access to the graph and the AssemblyAI node for session management + * Wrapper class for the conversation graph. 
+ * Provides access to the graph and the STT node for session management. */ export class ConversationGraphWrapper { graph: Graph; - assemblyAINode: AssemblyAISTTWebSocketNode; + sttNode: STTNode; - private constructor(params: { - graph: Graph; - assemblyAINode: AssemblyAISTTWebSocketNode; - }) { + private constructor(params: { graph: Graph; sttNode: STTNode }) { this.graph = params.graph; - this.assemblyAINode = params.assemblyAINode; + this.sttNode = params.sttNode; } async destroy(): Promise { - await this.assemblyAINode.destroy(); + await this.sttNode.destroy(); await this.graph.stop(); } @@ -73,15 +79,19 @@ export class ConversationGraphWrapper { static create(config: ConversationGraphConfig): ConversationGraphWrapper { const { connections, - assemblyAIApiKey, + sttProvider, + sttApiKey, defaultLanguageCode = DEFAULT_LANGUAGE_CODE, } = config; - // Use provided language code or default to Spanish const langConfig = getLanguageConfig(defaultLanguageCode); const postfix = `-lang-learning`; logger.info( - { language: langConfig.name, languageCode: defaultLanguageCode }, + { + language: langConfig.name, + languageCode: defaultLanguageCode, + sttProvider, + }, 'creating_conversation_graph' ); @@ -89,25 +99,39 @@ export class ConversationGraphWrapper { // Create Nodes // ============================================================ - // Start node (audio input proxy) const audioInputNode = new ProxyNode({ id: `audio-input-proxy${postfix}` }); - // AssemblyAI STT with built-in VAD (always uses multilingual model) - const turnDetectionSettings = getAssemblyAISettings(); - const assemblyAISTTNode = new AssemblyAISTTWebSocketNode({ - id: `assembly-ai-stt-ws-node${postfix}`, - config: { - apiKey: assemblyAIApiKey, - connections: connections, - sampleRate: serverConfig.audio.inputSampleRate, - formatTurns: serverConfig.assemblyAI.formatTurns, - endOfTurnConfidenceThreshold: - turnDetectionSettings.endOfTurnConfidenceThreshold, - minEndOfTurnSilenceWhenConfident: - 
turnDetectionSettings.minEndOfTurnSilenceWhenConfident, - maxTurnSilence: turnDetectionSettings.maxTurnSilence, - }, - }); + // Create STT node based on provider + let sttCustomNode: CustomNode & STTNode; + + if (sttProvider === 'soniox') { + const sonioxSettings = getSonioxSettings(); + sttCustomNode = new SonioxSTTWebSocketNode({ + id: `stt-ws-node${postfix}`, + config: { + apiKey: sttApiKey, + connections: connections, + sampleRate: serverConfig.audio.inputSampleRate, + maxEndpointDelayMs: sonioxSettings.maxEndpointDelayMs, + }, + }); + } else { + const turnDetectionSettings = getAssemblyAISettings(); + sttCustomNode = new AssemblyAISTTWebSocketNode({ + id: `stt-ws-node${postfix}`, + config: { + apiKey: sttApiKey, + connections: connections, + sampleRate: serverConfig.audio.inputSampleRate, + formatTurns: serverConfig.assemblyAI.formatTurns, + endOfTurnConfidenceThreshold: + turnDetectionSettings.endOfTurnConfidenceThreshold, + minEndOfTurnSilenceWhenConfident: + turnDetectionSettings.minEndOfTurnSilenceWhenConfident, + maxTurnSilence: turnDetectionSettings.maxTurnSilence, + }, + }); + } const transcriptExtractorNode = new TranscriptExtractorNode({ id: `transcript-extractor-node${postfix}`, @@ -190,7 +214,7 @@ export class ConversationGraphWrapper { graphBuilder // Add all nodes .addNode(audioInputNode) - .addNode(assemblyAISTTNode) + .addNode(sttCustomNode) .addNode(transcriptExtractorNode) .addNode(interactionQueueNode) .addNode(textInputNode) @@ -206,10 +230,10 @@ export class ConversationGraphWrapper { // ============================================================ // Audio Input Flow (STT with VAD) // ============================================================ - .addEdge(audioInputNode, assemblyAISTTNode) + .addEdge(audioInputNode, sttCustomNode) - // AssemblyAI loops back to itself while stream is active - .addEdge(assemblyAISTTNode, assemblyAISTTNode, { + // STT loops back to itself while stream is active + .addEdge(sttCustomNode, sttCustomNode, { 
condition: async (input: unknown) => { const data = input as { stream_exhausted?: boolean }; return data?.stream_exhausted !== true; @@ -219,7 +243,7 @@ export class ConversationGraphWrapper { }) // When interaction is complete, extract transcript - .addEdge(assemblyAISTTNode, transcriptExtractorNode, { + .addEdge(sttCustomNode, transcriptExtractorNode, { condition: async (input: unknown) => { const data = input as { interaction_complete?: boolean }; return data?.interaction_complete === true; @@ -283,7 +307,7 @@ export class ConversationGraphWrapper { return new ConversationGraphWrapper({ graph, - assemblyAINode: assemblyAISTTNode, + sttNode: sttCustomNode, }); } } diff --git a/backend/src/graphs/flashcard-graph.ts b/backend/src/graphs/flashcard-graph.ts index 6f8b108..8fa8641 100644 --- a/backend/src/graphs/flashcard-graph.ts +++ b/backend/src/graphs/flashcard-graph.ts @@ -14,6 +14,7 @@ import { v4 } from 'uuid'; import { Flashcard } from '../helpers/flashcard-processor.js'; import { llmConfig } from '../config/llm.js'; import { flashcardLogger as logger } from '../utils/logger.js'; +import { jsonrepair } from 'jsonrepair'; class FlashcardPromptBuilderNode extends CustomNode { async process( @@ -38,31 +39,49 @@ class TextToChatRequestNode extends CustomNode { class FlashcardParserNode extends CustomNode { process(_context: ProcessContext, input: GraphTypes.Content) { - try { - const content = - (input && - typeof input === 'object' && - 'content' in input && - (input as { content?: unknown }).content) || - input; - const textContent = - typeof content === 'string' ? content : JSON.stringify(content); + const content = + (input && + typeof input === 'object' && + 'content' in input && + (input as { content?: unknown }).content) || + input; + const textContent = + typeof content === 'string' ? 
content : JSON.stringify(content); - const jsonMatch = textContent.match(/\{[\s\S]*\}/); - if (jsonMatch) { - const parsed = JSON.parse(jsonMatch[0]); - return { + const jsonMatch = textContent.match(/\{[\s\S]*\}/); + if (jsonMatch) { + const raw = jsonMatch[0]; + let parsed: Record | undefined; + + try { + parsed = JSON.parse(raw); + } catch { + try { + parsed = JSON.parse(jsonrepair(raw)); + logger.warn({ raw: raw.slice(0, 500) }, 'flashcard_json_repaired'); + } catch (repairError) { + logger.error( + { raw: raw.slice(0, 500), err: repairError }, + 'failed_to_parse_flashcard_json' + ); + } + } + + if (parsed) { + const result: Record = { id: v4(), - // Support both new 'targetWord' format and legacy 'spanish' format targetWord: parsed.targetWord ?? parsed.spanish ?? '', english: parsed.english ?? '', example: parsed.example ?? '', mnemonic: parsed.mnemonic ?? '', timestamp: new Date().toISOString(), }; + if (parsed.exampleTranslation) + result.exampleTranslation = parsed.exampleTranslation; + if (parsed.pinyin) result.pinyin = parsed.pinyin; + if (parsed.examplePinyin) result.examplePinyin = parsed.examplePinyin; + return result; } - } catch (error) { - logger.error({ err: error }, 'failed_to_parse_flashcard_json'); } return { diff --git a/backend/src/graphs/nodes/assembly-ai-stt-ws-node.ts b/backend/src/graphs/nodes/assembly-ai-stt-ws-node.ts index 3b566e9..3d0169f 100644 --- a/backend/src/graphs/nodes/assembly-ai-stt-ws-node.ts +++ b/backend/src/graphs/nodes/assembly-ai-stt-ws-node.ts @@ -6,6 +6,7 @@ import { v4 as uuidv4 } from 'uuid'; import { Connection } from '../../types/index.js'; import { audioDataToPCM16 } from '../../helpers/audio-utils.js'; import { createLogger } from '../../utils/logger.js'; +import { STTNode } from './stt-node.js'; const logger = createLogger('AssemblyAI'); @@ -224,7 +225,7 @@ class AssemblyAISession { * - Detects turn endings using Assembly.AI's neural turn detection * - Returns DataStreamWithMetadata with transcribed text when a 
turn completes */ -export class AssemblyAISTTWebSocketNode extends CustomNode { +export class AssemblyAISTTWebSocketNode extends CustomNode implements STTNode { private apiKey: string; private connections: { [sessionId: string]: Connection }; private sampleRate: number; diff --git a/backend/src/graphs/nodes/soniox-stt-ws-node.ts b/backend/src/graphs/nodes/soniox-stt-ws-node.ts new file mode 100644 index 0000000..eb2591c --- /dev/null +++ b/backend/src/graphs/nodes/soniox-stt-ws-node.ts @@ -0,0 +1,718 @@ +import { DataStreamWithMetadata } from '@inworld/runtime'; +import { CustomNode, GraphTypes, ProcessContext } from '@inworld/runtime/graph'; +import WebSocket from 'ws'; +import { v4 as uuidv4 } from 'uuid'; + +import { Connection } from '../../types/index.js'; +import { audioDataToPCM16 } from '../../helpers/audio-utils.js'; +import { createLogger } from '../../utils/logger.js'; +import { STTNode } from './stt-node.js'; + +const logger = createLogger('Soniox'); + +const SONIOX_WEBSOCKET_URL = 'wss://stt-rt.soniox.com/transcribe-websocket'; +const SONIOX_MODEL = 'stt-rt-v4'; + +/** + * Configuration interface for SonioxSTTWebSocketNode + */ +export interface SonioxSTTWebSocketNodeConfig { + /** Soniox API key */ + apiKey: string; + /** Connections map to access session state */ + connections: { [sessionId: string]: Connection }; + /** Sample rate of the audio stream in Hz */ + sampleRate?: number; + /** Maximum endpoint delay in milliseconds (500-3000, default 2000) */ + maxEndpointDelayMs?: number; + /** Language hints for improved accuracy (e.g. ['en', 'es']) */ + languageHints?: string[]; +} + +/** + * Manages a persistent WebSocket connection to Soniox for a single session. 
+ */ +class SonioxSession { + private ws: WebSocket | null = null; + private wsReady: boolean = false; + private wsConnectionPromise: Promise | null = null; + + public shouldStopProcessing: boolean = false; + + private inactivityTimeout: NodeJS.Timeout | null = null; + private keepaliveInterval: NodeJS.Timeout | null = null; + private lastActivityTime: number = Date.now(); + private readonly INACTIVITY_TIMEOUT_MS = 60000; + private readonly KEEPALIVE_INTERVAL_MS = 5000; + + constructor( + public readonly sessionId: string, + private apiKey: string, + private sampleRate: number, + private maxEndpointDelayMs: number, + private languageHints: string[] + ) {} + + public async ensureConnection(): Promise { + if (!this.ws || !this.wsReady || this.ws.readyState !== WebSocket.OPEN) { + this.closeWebSocket(); + this.initializeWebSocket(); + } + + if (this.wsConnectionPromise) { + await this.wsConnectionPromise; + } + + this.shouldStopProcessing = false; + this.resetInactivityTimer(); + } + + private initializeWebSocket(): void { + logger.debug({ sessionId: this.sessionId }, 'initializing_websocket'); + + this.wsConnectionPromise = new Promise((resolve, reject) => { + this.ws = new WebSocket(SONIOX_WEBSOCKET_URL); + + this.ws.on('open', () => { + logger.debug({ sessionId: this.sessionId }, 'websocket_opened'); + + const config = { + api_key: this.apiKey, + model: SONIOX_MODEL, + audio_format: 'pcm_s16le', + sample_rate: this.sampleRate, + num_channels: 1, + enable_endpoint_detection: true, + max_endpoint_delay_ms: this.maxEndpointDelayMs, + language_hints: this.languageHints, + enable_language_identification: true, + }; + + this.ws!.send(JSON.stringify(config)); + logger.debug( + { + model: SONIOX_MODEL, + sampleRate: this.sampleRate, + maxEndpointDelayMs: this.maxEndpointDelayMs, + languageHints: this.languageHints, + }, + 'config_sent' + ); + + this.wsReady = true; + this.startKeepalive(); + resolve(); + }); + + this.ws.on('error', (error: Error) => { + logger.error({ err: 
error }, 'websocket_error'); + this.wsReady = false; + reject(error); + }); + + this.ws.on('close', (code: number, reason: Buffer) => { + logger.debug({ code, reason: reason.toString() }, 'websocket_closed'); + this.wsReady = false; + this.stopKeepalive(); + }); + }); + } + + public onMessage(listener: (data: WebSocket.Data) => void): void { + if (this.ws) { + this.ws.on('message', listener); + } + } + + public offMessage(listener: (data: WebSocket.Data) => void): void { + if (this.ws) { + this.ws.off('message', listener); + } + } + + public sendAudio(pcm16Data: Int16Array): void { + if (this.ws && this.ws.readyState === WebSocket.OPEN) { + this.ws.send(Buffer.from(pcm16Data.buffer)); + this.resetInactivityTimer(); + } + } + + public sendFinalize(): void { + if (this.ws && this.ws.readyState === WebSocket.OPEN) { + this.ws.send(JSON.stringify({ type: 'finalize' })); + } + } + + private startKeepalive(): void { + this.stopKeepalive(); + this.keepaliveInterval = setInterval(() => { + if (this.ws && this.ws.readyState === WebSocket.OPEN) { + this.ws.send(JSON.stringify({ type: 'keepalive' })); + } + }, this.KEEPALIVE_INTERVAL_MS); + } + + private stopKeepalive(): void { + if (this.keepaliveInterval) { + clearInterval(this.keepaliveInterval); + this.keepaliveInterval = null; + } + } + + private resetInactivityTimer(): void { + if (this.inactivityTimeout) { + clearTimeout(this.inactivityTimeout); + } + this.lastActivityTime = Date.now(); + this.inactivityTimeout = setTimeout(() => { + this.closeDueToInactivity(); + }, this.INACTIVITY_TIMEOUT_MS); + } + + public clearInactivityTimer(): void { + if (this.inactivityTimeout) { + clearTimeout(this.inactivityTimeout); + this.inactivityTimeout = null; + } + } + + /** + * Update language hints. If they differ from the current hints, closes the + * existing WebSocket so the next ensureConnection() reopens with the new config. 
+ */ + public updateLanguageHints(hints: string[]): void { + const sorted = [...hints].sort(); + const currentSorted = [...this.languageHints].sort(); + if (sorted.join(',') === currentSorted.join(',')) return; + + logger.info( + { sessionId: this.sessionId, from: this.languageHints, to: hints }, + 'language_hints_changed' + ); + this.languageHints = hints; + this.closeWebSocket(); + } + + private closeDueToInactivity(): void { + const inactiveFor = Date.now() - this.lastActivityTime; + logger.info( + { sessionId: this.sessionId, inactiveMs: inactiveFor }, + 'closing_due_to_inactivity' + ); + this.closeWebSocket(); + } + + private closeWebSocket(): void { + this.stopKeepalive(); + if (this.ws) { + try { + this.ws.removeAllListeners(); + if (this.ws.readyState === WebSocket.OPEN) { + // Send empty string to signal end-of-audio + this.ws.send(''); + this.ws.close(); + } + } catch (e) { + logger.warn({ err: e }, 'error_closing_socket'); + } + this.ws = null; + this.wsReady = false; + } + } + + public async close(): Promise { + if (this.inactivityTimeout) { + clearTimeout(this.inactivityTimeout); + } + + if (this.ws && this.ws.readyState === WebSocket.OPEN) { + try { + // Signal end-of-audio + this.ws.send(''); + await new Promise((resolve) => setTimeout(resolve, 100)); + } catch { + // Ignore + } + } + + this.closeWebSocket(); + } +} + +/** + * SonioxSTTWebSocketNode processes continuous multimodal streams using Soniox's + * streaming Speech-to-Text service via direct WebSocket connection. 
+ * + * This node: + * - Receives MultimodalContent stream (audio and/or text) + * - For audio: extracts audio and feeds to Soniox streaming transcriber + * - For text: bypasses STT and returns text directly + * - Detects turn endings using Soniox's semantic endpoint detection + * - Returns DataStreamWithMetadata with transcribed text when a turn completes + */ +export class SonioxSTTWebSocketNode extends CustomNode implements STTNode { + private apiKey: string; + private connections: { [sessionId: string]: Connection }; + private sampleRate: number; + private maxEndpointDelayMs: number; + private languageHints: string[]; + + private sessions: Map = new Map(); + private readonly TURN_COMPLETION_TIMEOUT_MS = 2000; + private readonly MAX_TRANSCRIPTION_DURATION_MS = 40000; + + constructor(props: { id?: string; config: SonioxSTTWebSocketNodeConfig }) { + const { config, ...nodeProps } = props; + + if (!config.apiKey) { + throw new Error('SonioxSTTWebSocketNode requires an API key.'); + } + if (!config.connections) { + throw new Error('SonioxSTTWebSocketNode requires a connections object.'); + } + + super({ id: nodeProps.id || 'soniox-stt-ws-node' }); + + this.apiKey = config.apiKey; + this.connections = config.connections; + this.sampleRate = config.sampleRate || 16000; + this.maxEndpointDelayMs = config.maxEndpointDelayMs ?? 2000; + this.languageHints = config.languageHints ?? ['en']; + + logger.info( + { + maxEndpointDelayMs: this.maxEndpointDelayMs, + languageHints: this.languageHints, + }, + 'stt_node_configured' + ); + } + + async process( + context: ProcessContext, + input0: AsyncIterableIterator, + input: DataStreamWithMetadata + ): Promise { + const multimodalStream = + input !== undefined && + input !== null && + input instanceof DataStreamWithMetadata + ? 
(input.toStream() as unknown as AsyncIterableIterator) + : input0; + + const sessionId = context.getDatastore().get('sessionId') as string; + const connection = this.connections[sessionId]; + + if (connection?.unloaded) { + throw Error(`Session unloaded for sessionId: ${sessionId}`); + } + if (!connection) { + throw Error(`Failed to read connection for sessionId: ${sessionId}`); + } + + const metadata = input?.getMetadata?.() || {}; + let previousIteration = (metadata.iteration as number) || 0; + + if ( + !connection.state.interactionId || + connection.state.interactionId === '' + ) { + connection.state.interactionId = uuidv4(); + } + + const currentId = connection.state.interactionId; + const delimiterIndex = currentId.indexOf('#'); + + if (previousIteration === 0 && delimiterIndex !== -1) { + const iterationStr = currentId.substring(delimiterIndex + 1); + const parsedIteration = parseInt(iterationStr, 10); + if (!isNaN(parsedIteration) && /^\d+$/.test(iterationStr)) { + previousIteration = parsedIteration; + } + } + + const iteration = previousIteration + 1; + const baseId = + delimiterIndex !== -1 + ? currentId.substring(0, delimiterIndex) + : currentId; + const nextInteractionId = `${baseId}#${iteration}`; + + logger.debug({ iteration }, 'starting_transcription'); + + // State tracking + let transcriptText = ''; + let turnDetected = false; + let speechDetected = false; + let audioChunkCount = 0; + let totalAudioSamples = 0; + let isStreamExhausted = false; + let errorOccurred = false; + let errorMessage = ''; + let maxDurationReached = false; + let isTextInput = false; + let textContent: string | undefined; + + // Soniox token accumulation + const finalTokenTexts: string[] = []; + + // Derive per-session language hints from the connection's active language + const targetLang = connection.state.languageCode || 'es'; + const sessionLanguageHints = + targetLang === 'en' ? 
['en'] : ['en', targetLang]; + + // Get or create session + let session = this.sessions.get(sessionId); + if (!session) { + session = new SonioxSession( + sessionId, + this.apiKey, + this.sampleRate, + this.maxEndpointDelayMs, + sessionLanguageHints + ); + this.sessions.set(sessionId, session); + } else { + session.updateLanguageHints(sessionLanguageHints); + } + + // Promise to capture turn result + let turnResolve: (value: string) => void = () => {}; + let turnReject: (error: Error) => void = () => {}; + let turnCompleted = false; + const turnPromise = new Promise((resolve, reject) => { + turnResolve = resolve; + turnReject = reject; + }); + const turnPromiseWithState = turnPromise.then((value) => { + turnCompleted = true; + return value; + }); + + // Soniox message handler for this process() call + const messageHandler = (data: WebSocket.Data) => { + try { + const message = JSON.parse(data.toString()); + + if (message.error_code) { + logger.error( + { code: message.error_code, msg: message.error_message }, + 'soniox_error' + ); + errorOccurred = true; + errorMessage = `${message.error_code}: ${message.error_message}`; + return; + } + + if (session?.shouldStopProcessing) { + return; + } + + const tokens = message.tokens; + if (!tokens || !Array.isArray(tokens) || tokens.length === 0) { + return; + } + + let endpointDetected = false; + const nonFinalTexts: string[] = []; + + for (const token of tokens) { + const text = token.text || ''; + + if (token.is_final) { + // token signals endpoint detection + if (text === '') { + endpointDetected = true; + } else { + finalTokenTexts.push(text); + } + } else { + nonFinalTexts.push(text); + } + } + + // Trigger speech detected on first meaningful text + if ( + !speechDetected && + (nonFinalTexts.length > 0 || finalTokenTexts.length > 0) + ) { + const hasText = + nonFinalTexts.some((t) => t.trim().length > 0) || + finalTokenTexts.some((t) => t.trim().length > 0); + if (hasText) { + speechDetected = true; + logger.debug({ 
iteration }, 'speech_detected'); + if (connection?.onSpeechDetected) { + connection.onSpeechDetected(nextInteractionId); + } + } + } + + // Send partial transcript from non-final tokens + if (nonFinalTexts.length > 0) { + const partialText = [...finalTokenTexts, ...nonFinalTexts] + .join('') + .trim(); + if (partialText) { + this.sendPartialTranscript( + sessionId, + nextInteractionId, + partialText + ); + } + } + + if (endpointDetected) { + let finalTranscript = finalTokenTexts.join('').trim(); + + // Check for pending transcript to stitch + if (connection?.pendingTranscript) { + finalTranscript = + `${connection.pendingTranscript} ${finalTranscript}`.trim(); + logger.debug( + { + iteration, + transcriptSnippet: finalTranscript.substring(0, 80), + }, + 'stitched_transcript' + ); + connection.pendingTranscript = undefined; + } else { + logger.debug( + { + iteration, + transcriptSnippet: finalTranscript.substring(0, 50), + }, + 'endpoint_detected' + ); + } + + if (connection) { + connection.isProcessingInterrupted = false; + } + + transcriptText = finalTranscript; + turnDetected = true; + if (session) session.shouldStopProcessing = true; + turnResolve(finalTranscript); + } + } catch (error) { + logger.error({ err: error }, 'error_handling_message'); + } + }; + + try { + await session.ensureConnection(); + session.onMessage(messageHandler); + + const audioProcessingPromise = (async () => { + let maxDurationTimeout: NodeJS.Timeout | null = null; + try { + maxDurationTimeout = setTimeout(() => { + maxDurationReached = true; + }, this.MAX_TRANSCRIPTION_DURATION_MS); + + while (true) { + if (session?.shouldStopProcessing) break; + + if (maxDurationReached && !transcriptText) { + logger.warn( + { maxDurationMs: this.MAX_TRANSCRIPTION_DURATION_MS }, + 'max_transcription_duration_reached' + ); + break; + } + + const result = await multimodalStream.next(); + + if (result.done) { + logger.debug( + { iteration, audioChunkCount }, + 'multimodal_stream_exhausted' + ); + 
isStreamExhausted = true; + break; + } + + if (session?.shouldStopProcessing) break; + + const content = result.value as GraphTypes.MultimodalContent; + + // Handle text input + if (content.text !== undefined && content.text !== null) { + logger.debug( + { iteration, textSnippet: content.text.substring(0, 50) }, + 'text_input_detected' + ); + isTextInput = true; + textContent = content.text; + transcriptText = content.text; + turnDetected = true; + if (session) { + session.shouldStopProcessing = true; + session.clearInactivityTimer(); + } + turnResolve(transcriptText); + break; + } + + // Extract audio + if (content.audio === undefined || content.audio === null) continue; + + const audioData = content.audio.data; + if (!audioData || audioData.length === 0) continue; + + audioChunkCount++; + totalAudioSamples += audioData.length; + + const pcm16Data = audioDataToPCM16(audioData); + session?.sendAudio(pcm16Data); + } + } catch (error) { + logger.error({ err: error }, 'error_processing_audio'); + errorOccurred = true; + errorMessage = error instanceof Error ? 
error.message : String(error); + throw error; + } finally { + if (maxDurationTimeout) { + clearTimeout(maxDurationTimeout); + } + } + })(); + + const raceResult = await Promise.race([ + turnPromiseWithState.then(() => ({ winner: 'turn' as const })), + audioProcessingPromise.then(() => ({ winner: 'audio' as const })), + ]); + + if ( + raceResult.winner === 'audio' && + !turnCompleted && + !maxDurationReached + ) { + logger.debug( + { waitMs: this.TURN_COMPLETION_TIMEOUT_MS }, + 'audio_ended_before_turn_waiting' + ); + + // Send finalize to force Soniox to return any remaining tokens + session.sendFinalize(); + + const timeoutPromise = new Promise<{ winner: 'timeout' }>((resolve) => + setTimeout( + () => resolve({ winner: 'timeout' }), + this.TURN_COMPLETION_TIMEOUT_MS + ) + ); + + const waitResult = await Promise.race([ + turnPromiseWithState.then(() => ({ winner: 'turn' as const })), + timeoutPromise, + ]); + + if (waitResult.winner === 'timeout' && !turnCompleted) { + logger.warn('timed_out_waiting_for_turn'); + turnReject?.(new Error('Timed out waiting for turn completion')); + } + } + + await audioProcessingPromise.catch(() => {}); + + logger.debug( + { iteration, transcriptSnippet: transcriptText?.substring(0, 50) }, + 'transcription_complete' + ); + + if (turnDetected) { + connection.state.interactionId = ''; + } + + const taggedStream = Object.assign(multimodalStream, { + type: 'MultimodalContent', + abort: () => {}, + getMetadata: () => ({}), + }); + + return new DataStreamWithMetadata(taggedStream, { + elementType: 'MultimodalContent', + iteration: iteration, + interactionId: nextInteractionId, + session_id: sessionId, + transcript: transcriptText, + turn_detected: turnDetected, + audio_chunk_count: audioChunkCount, + total_audio_samples: totalAudioSamples, + sample_rate: this.sampleRate, + stream_exhausted: isStreamExhausted, + interaction_complete: turnDetected && transcriptText.length > 0, + error_occurred: errorOccurred, + error_message: errorMessage, + 
is_text_input: isTextInput, + text_content: textContent, + }); + } catch (error) { + logger.error({ err: error, iteration }, 'transcription_failed'); + + const taggedStream = Object.assign(multimodalStream, { + type: 'MultimodalContent', + abort: () => {}, + getMetadata: () => ({}), + }); + + return new DataStreamWithMetadata(taggedStream, { + elementType: 'MultimodalContent', + iteration: iteration, + interactionId: nextInteractionId, + session_id: sessionId, + transcript: '', + turn_detected: false, + stream_exhausted: isStreamExhausted, + interaction_complete: false, + error_occurred: true, + error_message: error instanceof Error ? error.message : String(error), + is_text_input: isTextInput, + text_content: textContent, + }); + } finally { + if (session) { + session.offMessage(messageHandler); + } + } + } + + private sendPartialTranscript( + sessionId: string, + interactionId: string, + text: string + ): void { + const connection = this.connections[sessionId]; + if (!connection?.onPartialTranscript) return; + + try { + connection.onPartialTranscript(text, interactionId); + } catch (error) { + logger.error({ err: error }, 'error_sending_partial_transcript'); + } + } + + async closeSession(sessionId: string): Promise { + const session = this.sessions.get(sessionId); + if (session) { + logger.debug({ sessionId }, 'closing_session'); + await session.close(); + this.sessions.delete(sessionId); + } + } + + async destroy(): Promise { + logger.info({ sessionCount: this.sessions.size }, 'destroying_node'); + + const promises: Promise[] = []; + for (const session of this.sessions.values()) { + promises.push(session.close()); + } + + await Promise.all(promises); + this.sessions.clear(); + } +} diff --git a/backend/src/graphs/nodes/stt-node.ts b/backend/src/graphs/nodes/stt-node.ts new file mode 100644 index 0000000..2d58b62 --- /dev/null +++ b/backend/src/graphs/nodes/stt-node.ts @@ -0,0 +1,9 @@ +/** + * Common interface for STT (Speech-to-Text) nodes. 
+ * Both AssemblyAI and Soniox implementations conform to this interface + * so they can be used interchangeably in the conversation graph. + */ +export interface STTNode { + closeSession(sessionId: string): Promise; + destroy(): Promise; +} diff --git a/backend/src/graphs/simple-tts-graph.ts b/backend/src/graphs/simple-tts-graph.ts index 2b7dd57..02877cb 100644 --- a/backend/src/graphs/simple-tts-graph.ts +++ b/backend/src/graphs/simple-tts-graph.ts @@ -22,7 +22,7 @@ import { getLanguageConfig, getSupportedLanguageCodes, } from '../config/languages.js'; -import { serverConfig } from '../config/server.js'; +import { serverConfig, getSttProvider } from '../config/server.js'; import { graphLogger as logger } from '../utils/logger.js'; export interface SimpleTTSInput { @@ -93,7 +93,7 @@ const simpleTTSGraphs = new Map(); * Initialize TTS graphs for all supported languages */ export function initializeTTSGraphs(): void { - const languageCodes = getSupportedLanguageCodes(); + const languageCodes = getSupportedLanguageCodes(getSttProvider()); logger.info( { languageCount: languageCodes.length }, diff --git a/backend/src/helpers/anki-exporter.ts b/backend/src/helpers/anki-exporter.ts index 710eeb8..d4d2c7b 100644 --- a/backend/src/helpers/anki-exporter.ts +++ b/backend/src/helpers/anki-exporter.ts @@ -1,26 +1,41 @@ // @ts-expect-error - no type definitions available for anki-apkg-export import AnkiExport from 'anki-apkg-export'; import { Flashcard } from './flashcard-processor.js'; +import { GeneratedAudio } from './tts-audio-generator.js'; +import { GeneratedImage } from './image-generator.js'; export class AnkiExporter { /** * Export flashcards to ANKI .apkg format + * @param audioMap - Optional map from targetWord to generated audio file info. + * @param imageMap - Optional map from targetWord to generated image file info. 
*/ async exportFlashcards( flashcards: Flashcard[], - deckName: string = 'Inworld Language Tutor Cards' + deckName: string = 'Inworld Language Tutor Cards', + audioMap?: Map, + imageMap?: Map ): Promise { // eslint-disable-next-line @typescript-eslint/no-explicit-any const apkg = new (AnkiExport as any).default(deckName); - // Add each flashcard as a card + if (audioMap) { + for (const [, audio] of audioMap) { + apkg.addMedia(audio.filename, audio.buffer); + } + } + + if (imageMap) { + for (const [, image] of imageMap) { + apkg.addMedia(image.filename, image.buffer); + } + } + flashcards.forEach((flashcard) => { - // Support both new 'targetWord' and legacy 'spanish' field - // @deprecated Legacy 'spanish' field support - remove when all data migrated + // @deprecated Legacy 'spanish' field support const targetWord = flashcard.targetWord || (flashcard as { spanish?: string }).spanish; - // Skip empty or error flashcards if ( !targetWord || !flashcard.english || @@ -30,13 +45,15 @@ export class AnkiExporter { return; } - const front = targetWord.trim(); - const back = this.formatCardBack(flashcard); + const front = this.formatCardFront( + flashcard, + targetWord.trim(), + audioMap + ); + const back = this.formatCardBack(flashcard, audioMap, imageMap); - // Add tags for organization const tags = ['inworld-language-tutor']; - // Add language tag if available if (flashcard.languageCode) { tags.push(`language-${flashcard.languageCode}`); } @@ -49,31 +66,96 @@ export class AnkiExporter { apkg.addCard(front, back, { tags }); }); - // Generate and return the .apkg file as Buffer const zipBuffer = await apkg.save(); return zipBuffer; } + /** + * Format the front of the card (target word + pinyin + audio) + */ + private formatCardFront( + flashcard: Flashcard, + targetWord: string, + audioMap?: Map + ): string { + const audio = audioMap?.get(targetWord); + + let html = `
`; + + html += `
${this.escapeHtml(targetWord)}
`; + + if (flashcard.pinyin) { + html += `
${this.escapeHtml(flashcard.pinyin)}
`; + } + + if (audio) { + html += `
[sound:${audio.filename}]
`; + } + + html += `
`; + return html; + } + /** * Format the back of the card with English, example, and mnemonic */ - private formatCardBack(flashcard: Flashcard): string { - let back = `
${this.escapeHtml(flashcard.english)}
`; + private formatCardBack( + flashcard: Flashcard, + audioMap?: Map, + imageMap?: Map + ): string { + const targetWord = ( + flashcard.targetWord || + (flashcard as { spanish?: string }).spanish || + '' + ).trim(); + + let html = `
`; + + html += `
${this.escapeHtml(flashcard.english)}
`; + + html += `
`; if (flashcard.example && flashcard.example.trim()) { - back += `
${this.escapeHtml(flashcard.example)}
`; + const sentenceAudio = audioMap?.get(flashcard.example.trim()); + + html += `
`; + + html += `
`; + html += `
`; + html += `
${this.escapeHtml(flashcard.example)}
`; + if (flashcard.examplePinyin) { + html += `
${this.escapeHtml(flashcard.examplePinyin)}
`; + } + html += `
`; + if (sentenceAudio) { + html += `
[sound:${sentenceAudio.filename}]
`; + } + html += `
`; + + if (flashcard.exampleTranslation && flashcard.exampleTranslation.trim()) { + html += `
${this.escapeHtml(flashcard.exampleTranslation)}
`; + } + + html += `
`; + } + + const image = imageMap?.get(targetWord); + if (image) { + html += `
`; } if (flashcard.mnemonic && flashcard.mnemonic.trim()) { - back += `
💡 Remember: ${this.escapeHtml(flashcard.mnemonic)}
`; + html += `
`; + html += `
Remember
`; + html += `
${this.escapeHtml(flashcard.mnemonic)}
`; + html += `
`; } - return back; + html += `
`; + return html; } - /** - * Escape HTML characters to prevent XSS and formatting issues - */ private escapeHtml(text: string): string { return text .replace(/&/g, '&') @@ -83,9 +165,6 @@ export class AnkiExporter { .replace(/'/g, '''); } - /** - * Count valid flashcards (ones that can be exported) - */ countValidFlashcards(flashcards: Flashcard[]): number { return flashcards.filter((flashcard) => { const targetWord = diff --git a/backend/src/helpers/audio-utils.ts b/backend/src/helpers/audio-utils.ts index b12c336..115fd01 100644 --- a/backend/src/helpers/audio-utils.ts +++ b/backend/src/helpers/audio-utils.ts @@ -2,6 +2,50 @@ * Audio utility functions for format conversion */ +/** + * Encode raw PCM16 samples as a WAV file buffer. + * Returns a complete .wav file that can be written to disk or embedded in an Anki package. + */ +export function encodeWav( + pcm16: Int16Array, + sampleRate: number, + numChannels: number = 1 +): Buffer { + const bytesPerSample = 2; + const dataByteLength = pcm16.length * bytesPerSample; + const headerSize = 44; + const buffer = Buffer.alloc(headerSize + dataByteLength); + + // RIFF header + buffer.write('RIFF', 0); + buffer.writeUInt32LE(36 + dataByteLength, 4); + buffer.write('WAVE', 8); + + // fmt sub-chunk + buffer.write('fmt ', 12); + buffer.writeUInt32LE(16, 16); // sub-chunk size + buffer.writeUInt16LE(1, 20); // PCM format + buffer.writeUInt16LE(numChannels, 22); + buffer.writeUInt32LE(sampleRate, 24); + buffer.writeUInt32LE(sampleRate * numChannels * bytesPerSample, 28); // byte rate + buffer.writeUInt16LE(numChannels * bytesPerSample, 32); // block align + buffer.writeUInt16LE(bytesPerSample * 8, 34); // bits per sample + + // data sub-chunk + buffer.write('data', 36); + buffer.writeUInt32LE(dataByteLength, 40); + + // PCM samples (little-endian Int16, which is how Int16Array is stored on LE systems) + const pcm16Bytes = Buffer.from( + pcm16.buffer, + pcm16.byteOffset, + pcm16.byteLength + ); + pcm16Bytes.copy(buffer, 
headerSize); + + return buffer; +} + /** * Convert Float32Array audio data to Int16Array (PCM16) */ diff --git a/backend/src/helpers/connection-manager.ts b/backend/src/helpers/connection-manager.ts index 9659d5a..e59fa72 100644 --- a/backend/src/helpers/connection-manager.ts +++ b/backend/src/helpers/connection-manager.ts @@ -4,7 +4,7 @@ * This replaces the AudioProcessor for Inworld Runtime 0.9. * Key differences from AudioProcessor: * - Uses MultimodalStreamManager to feed audio to a long-running graph - * - VAD is handled inside the graph by AssemblyAI (not external Silero) + * - VAD is handled inside the graph by the STT provider (AssemblyAI or Soniox) * - Graph runs continuously for the session duration */ @@ -465,9 +465,7 @@ export class ConnectionManager { } } - // Only send completion signals if not interrupted if (!wasInterrupted) { - // Send completion signals if (!this.isSwitchingConversation) { this.logger.debug('tts_stream_complete'); this.sendToClient({ @@ -476,7 +474,6 @@ export class ConnectionManager { timestamp: Date.now(), }); - // Send conversation update with conversationId this.sendToClient({ type: 'conversation_update', messages: connection.state.messages, @@ -484,14 +481,17 @@ export class ConnectionManager { timestamp: Date.now(), }); } - - // Trigger flashcard, feedback, and memory generation after TTS completes - this.triggerFlashcardGeneration(); - this.triggerFeedbackGeneration(); - this.triggerMemoryGeneration(); } else { - this.logger.debug('tts_interrupted_skipping_completion'); + this.logger.debug('tts_interrupted_skipping_audio_completion'); } + + // Always trigger flashcard/feedback/memory generation even if TTS was + // interrupted β€” the conversation content is still valid. Messages may + // have been rolled back for utterance stitching, but the remaining + // history still provides useful context for flashcard generation. 
+ this.triggerFlashcardGeneration(); + this.triggerFeedbackGeneration(); + this.triggerMemoryGeneration(); this.markProcessingComplete(); }, @@ -683,7 +683,10 @@ export class ConnectionManager { * Trigger flashcard generation */ private triggerFlashcardGeneration(): void { - if (!this.flashcardCallback) return; + if (!this.flashcardCallback) { + this.logger.debug('skipping_flashcard_no_callback'); + return; + } if (this.conversationId !== this.processingConversationId) { this.logger.info('skipping_flashcard_generation_conversation_changed'); return; @@ -704,6 +707,16 @@ export class ConnectionManager { content: m.content, })); + if (recentMessages.length === 0) { + this.logger.info('skipping_flashcard_no_messages'); + return; + } + + this.logger.info( + { messageCount: recentMessages.length, language: snapshotLanguageCode }, + 'triggering_flashcard_generation' + ); + // Track pending flashcard generation this.pendingFlashcardGeneration = this.flashcardCallback( recentMessages, @@ -1085,8 +1098,8 @@ export class ConnectionManager { // End the multimodal stream this.multimodalStreamManager.end(); - // Close AssemblyAI session - await this.graphWrapper.assemblyAINode.closeSession(this.sessionId); + // Close STT session + await this.graphWrapper.sttNode.closeSession(this.sessionId); // Remove from connections map delete this.connections[this.sessionId]; diff --git a/backend/src/helpers/flashcard-processor.ts b/backend/src/helpers/flashcard-processor.ts index e7aa240..a06d43e 100644 --- a/backend/src/helpers/flashcard-processor.ts +++ b/backend/src/helpers/flashcard-processor.ts @@ -15,7 +15,10 @@ export interface Flashcard { targetWord: string; // The word in the target language (was 'spanish') english: string; example: string; + exampleTranslation?: string; mnemonic: string; + pinyin?: string; + examplePinyin?: string; timestamp: string; languageCode?: string; // Track which language this card belongs to } @@ -57,7 +60,8 @@ export class FlashcardProcessor { messages: 
ConversationMessage[], count: number = 1, userContext?: UserContextInterface, - languageCodeOverride?: string + languageCodeOverride?: string, + forcedWord?: string ): Promise { const executor = getFlashcardGraph(); @@ -78,7 +82,8 @@ export class FlashcardProcessor { messages, userContext, effectiveLanguageCode, - effectiveLanguageConfig + effectiveLanguageConfig, + forcedWord ) ); } @@ -86,12 +91,17 @@ export class FlashcardProcessor { try { const flashcards = await Promise.all(promises); - // Filter out any failed generations and duplicates const validFlashcards = flashcards.filter( (card) => card.targetWord && card.english ); - // Add to existing flashcards to track for future duplicates + if (validFlashcards.length === 0 && flashcards.length > 0) { + logger.warn( + { generated: flashcards.length }, + 'all_flashcards_filtered_out' + ); + } + this.existingFlashcards.push(...validFlashcards); return validFlashcards; @@ -106,7 +116,8 @@ export class FlashcardProcessor { messages: ConversationMessage[], userContext?: UserContextInterface, languageCode?: string, - languageConfig?: LanguageConfig + languageConfig?: LanguageConfig, + forcedWord?: string ): Promise { // Use explicitly passed language (snapshotted at trigger time) to avoid // reading from mutable this.languageCode which may change during async work @@ -114,13 +125,17 @@ export class FlashcardProcessor { const effectiveLanguageConfig = languageConfig || this.languageConfig; try { - const input = { + const input: Record = { studentName: 'Student', teacherName: effectiveLanguageConfig.teacherPersona.name, target_language: effectiveLanguageConfig.name, + language_code: effectiveLanguageCode, messages: messages, flashcards: this.existingFlashcards, }; + if (forcedWord) { + input.forced_word = forcedWord; + } let executionResult; try { @@ -149,6 +164,10 @@ export class FlashcardProcessor { ); if (isDuplicate) { + logger.info( + { word: flashcard.targetWord }, + 'flashcard_duplicate_skipped' + ); return { id: 
v4(), targetWord: '', diff --git a/backend/src/helpers/image-generator.ts b/backend/src/helpers/image-generator.ts new file mode 100644 index 0000000..5e397ff --- /dev/null +++ b/backend/src/helpers/image-generator.ts @@ -0,0 +1,130 @@ +/** + * Image Generator using Replicate API (FLUX Schnell model) + * + * Generates illustrative images for flashcard words to aid visual memory. + */ + +import { serverLogger as logger } from '../utils/logger.js'; + +export interface GeneratedImage { + filename: string; + buffer: Buffer; +} + +/** + * Generate an image for a single word using Replicate's FLUX Schnell model. + * Returns the image as a buffer, or null on failure. + */ +async function generateImage( + word: string, + englishWord: string, + index: number +): Promise { + const apiToken = process.env.REPLICATE_API_TOKEN; + if (!apiToken) { + logger.warn('REPLICATE_API_TOKEN not set, skipping image generation'); + return null; + } + + const prompt = `a memorable, colorful, hand-drawn image of ${englishWord}`; + + const response = await fetch( + 'https://api.replicate.com/v1/models/black-forest-labs/flux-schnell/predictions', + { + method: 'POST', + headers: { + Authorization: `Bearer ${apiToken}`, + 'Content-Type': 'application/json', + Prefer: 'wait', + }, + body: JSON.stringify({ + input: { + prompt, + go_fast: true, + megapixels: '1', + num_outputs: 1, + aspect_ratio: '1:1', + output_format: 'webp', + output_quality: 80, + num_inference_steps: 4, + }, + }), + } + ); + + if (!response.ok) { + logger.warn( + { status: response.status, word }, + 'replicate_api_request_failed' + ); + return null; + } + + const data = (await response.json()) as { + status: string; + output?: string[]; + }; + + if (data.status !== 'succeeded' || !data.output?.[0]) { + logger.warn({ word, status: data.status }, 'replicate_prediction_failed'); + return null; + } + + const imageUrl = data.output[0]; + const imageResponse = await fetch(imageUrl); + if (!imageResponse.ok) { + logger.warn({ word, 
imageUrl }, 'replicate_image_download_failed'); + return null; + } + + const arrayBuffer = await imageResponse.arrayBuffer(); + const buffer = Buffer.from(arrayBuffer); + + const sanitized = word + .trim() + .toLowerCase() + .replace( + /[^a-z0-9\u00C0-\u024F\u1E00-\u1EFF\u3000-\u9FFF\uAC00-\uD7AF]/g, + '_' + ) + .replace(/_+/g, '_') + .replace(/^_|_$/g, ''); + + const filename = `img_${sanitized}_${index}.webp`; + + return { filename, buffer }; +} + +/** + * Generate images for multiple words in sequence. + * @param wordToEnglish - Map from target-language word to its English translation (used as the image prompt). + * Returns a map from the original target word to the image filename and buffer. + */ +export async function generateBatchImages( + wordToEnglish: Map +): Promise> { + const results = new Map(); + const apiToken = process.env.REPLICATE_API_TOKEN; + + if (!apiToken) { + logger.info( + 'REPLICATE_API_TOKEN not configured, skipping image generation' + ); + return results; + } + + let i = 0; + for (const [word, english] of wordToEnglish) { + try { + const image = await generateImage(word, english, i); + if (image) { + results.set(word, image); + } + } catch (error) { + logger.warn({ word, err: error }, 'image_generation_failed_for_word'); + } + i++; + } + + return results; +} diff --git a/backend/src/helpers/tts-audio-generator.ts b/backend/src/helpers/tts-audio-generator.ts new file mode 100644 index 0000000..2a16055 --- /dev/null +++ b/backend/src/helpers/tts-audio-generator.ts @@ -0,0 +1,122 @@ +/** + * Batch TTS Audio Generator + * + * Generates WAV audio buffers for a list of words using the SimpleTTSGraph. + * Used by the Anki exporter to embed pronunciation audio into .apkg files. 
+ */ + +import { GraphTypes } from '@inworld/runtime/graph'; +import { getSimpleTTSGraph } from '../graphs/simple-tts-graph.js'; +import { float32ToPCM16, encodeWav } from './audio-utils.js'; +import { serverConfig } from '../config/server.js'; +import { serverLogger as logger } from '../utils/logger.js'; + +export interface GeneratedAudio { + filename: string; + buffer: Buffer; +} + +/** + * Generate a WAV audio buffer for a single word via TTS. + * Collects all streamed audio chunks into one contiguous buffer. + */ +export async function generateTTSAudio( + text: string, + languageCode: string +): Promise { + const graph = getSimpleTTSGraph(languageCode); + const executionResult = await graph.start({ text: text.trim() }); + + const rawChunks: Buffer[] = []; + + for await (const res of executionResult.outputStream) { + if ('processResponse' in res) { + const resultWithProcess = res as { + processResponse: ( + handlers: Record Promise | void> + ) => Promise; + }; + await resultWithProcess.processResponse({ + TTSOutputStream: async (ttsData: unknown) => { + const ttsStream = ttsData as GraphTypes.TTSOutputStream; + for await (const chunk of ttsStream) { + if (chunk.audio?.data) { + const audioData = chunk.audio.data; + if (typeof audioData === 'string') { + rawChunks.push(Buffer.from(audioData, 'base64')); + } else if (Array.isArray(audioData)) { + rawChunks.push(Buffer.from(audioData)); + } else { + rawChunks.push( + Buffer.from( + audioData.buffer, + audioData.byteOffset, + audioData.byteLength + ) + ); + } + } + } + }, + }); + } + } + + if (rawChunks.length === 0) { + return null; + } + + // Inworld TTS returns raw bytes that represent Float32 PCM samples + const combined = Buffer.concat(rawChunks); + const float32 = new Float32Array( + combined.buffer, + combined.byteOffset, + combined.byteLength / 4 + ); + + const pcm16 = float32ToPCM16(float32); + const sampleRate = serverConfig.audio.ttsSampleRate; + return encodeWav(pcm16, sampleRate); +} + +/** + * Generate 
TTS audio for multiple words in sequence. + * Returns a map from the original word to the WAV filename and buffer. + */ +export async function generateBatchTTSAudio( + words: string[], + languageCode: string, + onProgress?: (completed: number, total: number) => void +): Promise> { + const results = new Map(); + const total = words.length; + + for (let i = 0; i < words.length; i++) { + const word = words[i]; + try { + const wavBuffer = await generateTTSAudio(word, languageCode); + if (wavBuffer) { + const sanitized = word + .trim() + .toLowerCase() + .replace( + /[^a-z0-9\u00C0-\u024F\u1E00-\u1EFF\u3000-\u9FFF\uAC00-\uD7AF]/g, + '_' + ) + .replace(/_+/g, '_') + .replace(/^_|_$/g, ''); + const filename = `tts_${sanitized}_${i}.wav`; + results.set(word, { filename, buffer: wavBuffer }); + } + onProgress?.(i + 1, total); + } catch (error) { + logger.warn( + { word, languageCode, err: error }, + 'tts_batch_generation_failed_for_word' + ); + onProgress?.(i + 1, total); + } + } + + return results; +} diff --git a/backend/src/prompts/flashcard.njk b/backend/src/prompts/flashcard.njk index 82c7c27..1fab8e6 100644 --- a/backend/src/prompts/flashcard.njk +++ b/backend/src/prompts/flashcard.njk @@ -1,11 +1,17 @@ You are a system that generates flashcards for interesting new vocabulary for a {{target_language}} learning app. +{% if forced_word %} +Create a flashcard for the word/phrase "{{forced_word}}" as used in the conversation between {{studentName}} and {{teacherName}}. 
Generate the following: +{% else %} Based on the ongoing conversation between {{studentName}} and {{teacherName}}, generate one flashcard with the following things: +{% endif %} - The word in {{target_language}} - The translation in English - An example sentence in {{target_language}} +- An English translation of the example sentence - A mnemonic to help the student remember the word (in English) +{% if language_code == "zh" %}- The pinyin romanization for both the word and the example sentence{% endif %} ## Conversation @@ -20,9 +26,15 @@ Based on the ongoing conversation between {{studentName}} and {{teacherName}}, g ## Guidelines +{% if forced_word %} +- The flashcard MUST be for "{{forced_word}}" β€” do not pick a different word +- If the word appears in the conversation, use that context for the example sentence +{% else %} - The word must be related to the topics used in the conversation - The word should be useful to the learner so they can continue the conversation with new vocabulary - Avoid cognates +{% endif %} +{% if language_code == "zh" %}- Include accurate pinyin with tone marks (e.g. "nǐ hǎo") for the word and the example sentence{% endif %} Now, return JSON with the following format: @@ -30,5 +42,8 @@ Now, return JSON with the following format: "targetWord": "string", "english": "string", "example": "string", - "mnemonic": "string" + "exampleTranslation": "string (English translation of the example sentence)", +{% if language_code == "zh" %} "pinyin": "string (pinyin for the word)", + "examplePinyin": "string (pinyin for the example sentence)", +{% endif %} "mnemonic": "string" } \ No newline at end of file diff --git a/backend/src/server.ts b/backend/src/server.ts index 90fc1d3..ed771ff 100644 --- a/backend/src/server.ts +++ b/backend/src/server.ts @@ -1,7 +1,7 @@ /** * Language Learning Server - Inworld Runtime 0.9 * - * This server uses a long-running circular graph with AssemblyAI for VAD/STT. 
+ * This server uses a long-running circular graph with AssemblyAI or Soniox for VAD/STT. * Key components: * - ConversationGraphWrapper: The main graph that processes audio β†’ STT β†’ LLM β†’ TTS * - ConnectionManager: Manages WebSocket connections and feeds audio to the graph @@ -20,7 +20,7 @@ import { fileURLToPath } from 'url'; import { createServer } from 'http'; import { WebSocketServer } from 'ws'; -import { serverConfig } from './config/server.js'; +import { serverConfig, getSttProvider } from './config/server.js'; import { serverLogger as logger } from './utils/logger.js'; // Import services @@ -107,7 +107,10 @@ async function startServer(): Promise { await exportGraphConfigs(); server.listen(serverConfig.port, () => { logger.info({ port: serverConfig.port }, 'server_started'); - logger.info('using_inworld_runtime_0.9_with_assemblyai_stt'); + logger.info( + { sttProvider: getSttProvider() }, + 'using_inworld_runtime_0.9_with_stt' + ); }); } catch (error) { logger.fatal({ err: error }, 'server_start_failed'); diff --git a/backend/src/services/api-routes.ts b/backend/src/services/api-routes.ts index ab481bb..af2abb9 100644 --- a/backend/src/services/api-routes.ts +++ b/backend/src/services/api-routes.ts @@ -6,10 +6,14 @@ import { Router } from 'express'; import { AnkiExporter } from '../helpers/anki-exporter.js'; +import { generateBatchTTSAudio } from '../helpers/tts-audio-generator.js'; +import { generateBatchImages } from '../helpers/image-generator.js'; +import { Flashcard } from '../helpers/flashcard-processor.js'; import { getLanguageOptions, DEFAULT_LANGUAGE_CODE, } from '../config/languages.js'; +import { getSttProvider } from '../config/server.js'; import { serverLogger as logger } from '../utils/logger.js'; export const apiRouter = Router(); @@ -17,7 +21,7 @@ export const apiRouter = Router(); // ANKI export endpoint apiRouter.post('/export-anki', async (req, res) => { try { - const { flashcards, deckName, languageCode: _languageCode } = req.body; 
+ const { flashcards, deckName, languageCode } = req.body; if (!flashcards || !Array.isArray(flashcards) || flashcards.length === 0) { res.status(400).json({ error: 'No flashcards provided' }); @@ -32,10 +36,59 @@ apiRouter.post('/export-anki', async (req, res) => { return; } + const lang = languageCode || DEFAULT_LANGUAGE_CODE; + const texts: string[] = []; + const wordToEnglish = new Map(); + + for (const fc of flashcards as Flashcard[]) { + const word = ( + fc.targetWord || + (fc as { spanish?: string }).spanish || + '' + ).trim(); + if (word) { + texts.push(word); + if (fc.english) { + wordToEnglish.set(word, fc.english.trim()); + } + } + + const sentence = (fc.example || '').trim(); + if (sentence) texts.push(sentence); + } + + const uniqueTexts = [...new Set(texts)]; + + logger.info( + { + textCount: uniqueTexts.length, + imageCount: wordToEnglish.size, + languageCode: lang, + }, + 'anki_export_generating_media' + ); + + const [audioMap, imageMap] = await Promise.all([ + generateBatchTTSAudio(uniqueTexts, lang), + generateBatchImages(wordToEnglish), + ]); + + logger.info( + { + audioCount: audioMap.size, + imageCount: imageMap.size, + requestedTexts: uniqueTexts.length, + requestedImages: wordToEnglish.size, + }, + 'anki_export_media_generation_complete' + ); + const defaultDeckName = `Inworld Language Tutor Spanish Cards`; const apkgBuffer = await exporter.exportFlashcards( flashcards, - deckName || defaultDeckName + deckName || defaultDeckName, + audioMap, + imageMap ); res.setHeader('Content-Type', 'application/octet-stream'); @@ -53,7 +106,7 @@ apiRouter.post('/export-anki', async (req, res) => { // Languages endpoint apiRouter.get('/languages', (_req, res) => { try { - const languages = getLanguageOptions(); + const languages = getLanguageOptions(getSttProvider()); res.json({ languages, defaultLanguage: DEFAULT_LANGUAGE_CODE }); } catch (error) { logger.error({ err: error }, 'get_languages_error'); diff --git a/backend/src/services/graph-service.ts 
b/backend/src/services/graph-service.ts index 1bfd910..79670f8 100644 --- a/backend/src/services/graph-service.ts +++ b/backend/src/services/graph-service.ts @@ -19,6 +19,7 @@ import { getResponseFeedbackGraph } from '../graphs/response-feedback-graph.js'; import { initializeTTSGraphs } from '../graphs/simple-tts-graph.js'; import { serverLogger as logger } from '../utils/logger.js'; import { connections } from './state.js'; +import { getSttProvider } from '../config/server.js'; const __filename = fileURLToPath(import.meta.url); const __dirname = path.dirname(__filename); @@ -31,16 +32,25 @@ export function getGraphWrapper(): ConversationGraphWrapper | null { } export async function initializeGraph(): Promise { - const assemblyAIApiKey = process.env.ASSEMBLY_AI_API_KEY; - if (!assemblyAIApiKey) { - throw new Error('ASSEMBLY_AI_API_KEY environment variable is required'); + const sonioxKey = process.env.SONIOX_API_KEY; + const assemblyKey = process.env.ASSEMBLY_AI_API_KEY; + + if (!sonioxKey && !assemblyKey) { + throw new Error( + 'No speech-to-text API key configured. ' + + 'Set either SONIOX_API_KEY or ASSEMBLY_AI_API_KEY in your backend/.env file.' + ); } - logger.info('initializing_conversation_graph'); + const sttProvider = getSttProvider(); + const sttApiKey = sttProvider === 'soniox' ? sonioxKey! 
: assemblyKey!; + + logger.info({ sttProvider }, 'initializing_conversation_graph'); graphWrapper = getConversationGraph({ - assemblyAIApiKey, + sttProvider, + sttApiKey, connections, - defaultLanguageCode: 'es', // Always Spanish + defaultLanguageCode: 'es', }); logger.info('conversation_graph_initialized'); diff --git a/backend/src/services/websocket-handler.ts b/backend/src/services/websocket-handler.ts index e5fee0e..426730b 100644 --- a/backend/src/services/websocket-handler.ts +++ b/backend/src/services/websocket-handler.ts @@ -19,7 +19,7 @@ import { } from '../config/languages.js'; import { serverLogger as logger } from '../utils/logger.js'; import { getSimpleTTSGraph } from '../graphs/simple-tts-graph.js'; -import { serverConfig } from '../config/server.js'; +import { serverConfig, getSttProvider } from '../config/server.js'; import { connections, @@ -112,6 +112,11 @@ export function setupWebSocketHandlers(wss: WebSocketServer): void { conversationId: conversationId || null, }) ); + } else { + logger.info( + { connectionId, languageCode }, + 'flashcard_generation_returned_empty' + ); } } catch (error) { if (!isShuttingDown()) { @@ -279,6 +284,13 @@ function handleMessage( handleTextMessage(connectionId, ws, connectionManager, message); } else if (message.type === 'tts_pronounce_request') { handleTTSPronounce(connectionId, ws, message); + } else if (message.type === 'create_flashcard_request') { + handleCreateFlashcardRequest( + connectionId, + ws, + connectionManager, + message + ); } else { logger.debug( { connectionId, messageType: message.type }, @@ -294,12 +306,34 @@ function handleConversationUpdate( connectionId: string, connectionManager: ConnectionManager, message: { + conversationId?: string; data?: { + conversationId?: string; messages?: Array<{ role: string; content: string; timestamp?: string }>; }; messages?: Array<{ role: string; content: string; timestamp?: string }>; } ): void { + const incomingConversationId = + message.conversationId || 
message.data?.conversationId; + const currentConversationId = connectionManager.getConversationId(); + + if ( + incomingConversationId && + currentConversationId && + incomingConversationId !== currentConversationId + ) { + logger.info( + { + connectionId, + incomingConversationId, + currentConversationId, + }, + 'ignoring_stale_conversation_update' + ); + return; + } + // Handle both formats: { data: { messages: [...] } } and { messages: [...] } const messages = message.messages || @@ -389,7 +423,7 @@ async function handleConversationSwitch( } // Validate language code - const supportedCodes = getSupportedLanguageCodes(); + const supportedCodes = getSupportedLanguageCodes(getSttProvider()); const languageCode = supportedCodes.includes(requestedLanguageCode) ? requestedLanguageCode : DEFAULT_LANGUAGE_CODE; @@ -496,7 +530,7 @@ function handleUserContext( const currentAttrs = connectionAttributes.get(connectionId) || {}; // Validate language code - const supportedCodes = getSupportedLanguageCodes(); + const supportedCodes = getSupportedLanguageCodes(getSttProvider()); const validatedLanguageCode = languageCode && supportedCodes.includes(languageCode) ? 
languageCode @@ -608,7 +642,7 @@ async function handleTTSPronounce( return; } - if (text.length > 100) { + if (text.length > 500) { logger.warn( { connectionId, length: text.length }, 'tts_pronounce_text_too_long' @@ -675,3 +709,92 @@ async function handleTTSPronounce( ); } } + +async function handleCreateFlashcardRequest( + connectionId: string, + ws: WebSocket, + connectionManager: ConnectionManager, + message: { word?: string } +): Promise { + const word = message.word?.trim(); + if (!word) { + ws.send( + JSON.stringify({ + type: 'create_flashcard_error', + error: 'No word provided', + }) + ); + return; + } + + const flashcardProcessor = flashcardProcessors.get(connectionId); + if (!flashcardProcessor) { + ws.send( + JSON.stringify({ + type: 'create_flashcard_error', + error: 'No flashcard processor', + }) + ); + return; + } + + const languageCode = + connectionManager.getLanguageCode() || DEFAULT_LANGUAGE_CODE; + const conversationId = connectionManager.getConversationId(); + + const conversationState = connectionManager.getConversationState(); + const recentMessages = conversationState.messages.slice(-10).map((m) => ({ + role: m.role, + content: m.content, + })); + + logger.info({ connectionId, word, languageCode }, 'create_flashcard_request'); + + try { + const attrs = connectionAttributes.get(connectionId) || {}; + const userContext = { + attributes: { timezone: attrs.timezone || '' }, + targetingKey: attrs.userId || connectionId, + }; + + const flashcards = await flashcardProcessor.generateFlashcards( + recentMessages, + 1, + userContext, + languageCode, + word + ); + + if (flashcards.length > 0 && ws.readyState === WebSocket.OPEN) { + ws.send( + JSON.stringify({ + type: 'flashcards_generated', + flashcards, + conversationId: conversationId || null, + }) + ); + logger.info( + { connectionId, word, targetWord: flashcards[0].targetWord }, + 'flashcard_created_for_word' + ); + } else { + ws.send( + JSON.stringify({ + type: 'create_flashcard_error', + error: 
'Failed to generate flashcard', + }) + ); + } + } catch (error) { + logger.error( + { err: error, connectionId, word }, + 'create_flashcard_request_error' + ); + ws.send( + JSON.stringify({ + type: 'create_flashcard_error', + error: 'Failed to generate flashcard', + }) + ); + } +} diff --git a/backend/src/utils/logger.ts b/backend/src/utils/logger.ts index 3403cb8..40e2a33 100644 --- a/backend/src/utils/logger.ts +++ b/backend/src/utils/logger.ts @@ -10,7 +10,8 @@ import pino from 'pino'; -const isDevelopment = process.env.NODE_ENV !== 'production'; +const usePrettyLogs = + process.env.LOG_PRETTY === 'true' || process.env.NODE_ENV !== 'production'; /** * Root logger instance @@ -18,7 +19,7 @@ const isDevelopment = process.env.NODE_ENV !== 'production'; */ export const logger = pino({ level: process.env.LOG_LEVEL || 'info', - transport: isDevelopment + transport: usePrettyLogs ? { target: 'pino-pretty', options: { diff --git a/frontend/README.md b/frontend/README.md index 4c96e21..e7aaf98 100644 --- a/frontend/README.md +++ b/frontend/README.md @@ -10,7 +10,7 @@ React + TypeScript frontend for the Inworld Language Tutor application. npm install ``` -2. Create `.env.local` with your Supabase credentials (optional, for auth/sync): +2. 
(Optional) Create `.env.local` with your Supabase credentials: ```bash VITE_SUPABASE_URL=https://YOUR_PROJECT.supabase.co VITE_SUPABASE_PUBLISHABLE_KEY=your_anon_key diff --git a/frontend/package-lock.json b/frontend/package-lock.json index 5d0a7b8..8b55696 100644 --- a/frontend/package-lock.json +++ b/frontend/package-lock.json @@ -84,7 +84,6 @@ "integrity": "sha512-e7jT4DxYvIDLk1ZHmU/m/mB19rex9sv0c2ftBtjSBv+kVM/902eh0fINUzD7UwLLNR+jU585GxUJ8/EBfAM5fw==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "@babel/code-frame": "^7.27.1", "@babel/generator": "^7.28.5", @@ -424,7 +423,6 @@ } ], "license": "MIT", - "peer": true, "engines": { "node": ">=18" }, @@ -448,7 +446,6 @@ } ], "license": "MIT", - "peer": true, "engines": { "node": ">=18" } @@ -1667,7 +1664,6 @@ "integrity": "sha512-MWtvHrGZLFttgeEj28VXHxpmwYbor/ATPYbBfSFZEIRK0ecCFLl2Qo55z52Hss+UV9CRN7trSeq1zbgx7YDWWg==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "csstype": "^3.2.2" } @@ -1736,7 +1732,6 @@ "integrity": "sha512-3xP4XzzDNQOIqBMWogftkwxhg5oMKApqY0BAflmLZiFYHqyhSOxv/cd/zPQLTcCXr4AkaKb25joocY0BD1WC6A==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "@typescript-eslint/scope-manager": "8.51.0", "@typescript-eslint/types": "8.51.0", @@ -2099,7 +2094,6 @@ "integrity": "sha512-NZyJarBfL7nWwIq+FDL6Zp/yHEhePMNnnJ0y3qfieCrmNvYct8uvtiV41UvlSe6apAfk0fY1FbWx+NwfmpvtTg==", "dev": true, "license": "MIT", - "peer": true, "bin": { "acorn": "bin/acorn" }, @@ -2232,7 +2226,6 @@ } ], "license": "MIT", - "peer": true, "dependencies": { "baseline-browser-mapping": "^2.9.0", "caniuse-lite": "^1.0.30001759", @@ -2627,7 +2620,6 @@ "integrity": "sha512-LEyamqS7W5HB3ujJyvi0HQK/dtVINZvd5mAAp9eT5S/ujByGjiZLCzPcHVzuXbpJDJF/cxwHlfceVUDZ2lnSTw==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "@eslint-community/eslint-utils": "^4.8.0", "@eslint-community/regexpp": "^4.12.1", @@ -2688,7 +2680,6 @@ "integrity": 
"sha512-iI1f+D2ViGn+uvv5HuHVUamg8ll4tN+JRHGc6IJi4TP9Kl976C57fzPXgseXNs8v0iA8aSJpHsTWjDb9QJamGQ==", "dev": true, "license": "MIT", - "peer": true, "bin": { "eslint-config-prettier": "bin/cli.js" }, @@ -3330,7 +3321,6 @@ "integrity": "sha512-8i7LzZj7BF8uplX+ZyOlIz86V6TAsSs+np6m1kpW9u0JWi4z/1t+FzcK1aek+ybTnAC4KhBL4uXCNT0wcUIeCw==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "cssstyle": "^4.1.0", "data-urls": "^5.0.0", @@ -3700,7 +3690,6 @@ "integrity": "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==", "dev": true, "license": "MIT", - "peer": true, "engines": { "node": ">=12" }, @@ -3753,7 +3742,6 @@ "integrity": "sha512-v6UNi1+3hSlVvv8fSaoUbggEM5VErKmmpGA7Pl3HF8V6uKY7rvClBOJlH6yNwQtfTueNkGVpOv/mtWL9L4bgRA==", "dev": true, "license": "MIT", - "peer": true, "bin": { "prettier": "bin/prettier.cjs" }, @@ -3792,7 +3780,6 @@ "resolved": "https://registry.npmjs.org/react/-/react-19.2.3.tgz", "integrity": "sha512-Ku/hhYbVjOQnXDZFv2+RibmLFGwFdeeKHFcOTlrt7xplBnya5OGn/hIRDsqDiSUcfORsDC7MPxwork8jBwsIWA==", "license": "MIT", - "peer": true, "engines": { "node": ">=0.10.0" } @@ -4145,7 +4132,6 @@ "integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==", "dev": true, "license": "Apache-2.0", - "peer": true, "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" @@ -4231,7 +4217,6 @@ "integrity": "sha512-dZwN5L1VlUBewiP6H9s2+B3e3Jg96D0vzN+Ry73sOefebhYr9f94wwkMNN/9ouoU8pV1BqA1d1zGk8928cx0rg==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "esbuild": "^0.27.0", "fdir": "^6.5.0", @@ -4547,7 +4532,6 @@ "integrity": "sha512-0wZ1IRqGGhMP76gLqz8EyfBXKk0J2qo2+H3fi4mcUP/KtTocoX08nmIAHl1Z2kJIZbZee8KOpBCSNPRgauucjw==", "dev": true, "license": "MIT", - "peer": true, "funding": { "url": "https://github.com/sponsors/colinhacks" } diff --git a/frontend/src/App.tsx b/frontend/src/App.tsx index dd39db8..b77969b 100644 --- a/frontend/src/App.tsx +++ 
b/frontend/src/App.tsx @@ -45,7 +45,7 @@ function AppContent() { target="_blank" rel="noopener noreferrer" className="fab-button fab-labeled" - aria-label="Deploy on Render" + aria-label="Render" > - Deploy on Render + Render (null); const { chatHistory, currentTranscript, @@ -104,6 +110,34 @@ export function ChatSection() { [textInput, sendTextMessage] ); + const handleContextMenu = useCallback((e: React.MouseEvent) => { + const selection = window.getSelection(); + const selectedText = selection?.toString().trim(); + if (!selectedText) return; + + e.preventDefault(); + setContextMenu({ x: e.clientX, y: e.clientY, word: selectedText }); + }, []); + + const handleCreateFlashcard = useCallback(() => { + if (contextMenu?.word) { + createFlashcardForWord(contextMenu.word); + } + setContextMenu(null); + }, [contextMenu, createFlashcardForWord]); + + // Close context menu on click anywhere or scroll + useEffect(() => { + if (!contextMenu) return; + const close = () => setContextMenu(null); + window.addEventListener('click', close); + window.addEventListener('scroll', close, true); + return () => { + window.removeEventListener('click', close); + window.removeEventListener('scroll', close, true); + }; + }, [contextMenu]); + const isConnected = connectionStatus === 'connected'; return ( @@ -134,7 +168,12 @@ export function ChatSection() {
-
+
{/* Loading overlay when not connected */} {connectionStatus === 'connecting' && (
@@ -223,6 +262,29 @@ export function ChatSection() {
+ {contextMenu && isConnected && ( +
+ + + + + + Create flashcard for “ + {contextMenu.word.length > 30 + ? contextMenu.word.slice(0, 30) + '…' + : contextMenu.word} + ” +
+ )} ); } diff --git a/frontend/src/components/Flashcard.tsx b/frontend/src/components/Flashcard.tsx index 84bb9a3..f68cbbc 100644 --- a/frontend/src/components/Flashcard.tsx +++ b/frontend/src/components/Flashcard.tsx @@ -5,7 +5,9 @@ interface FlashcardProps { flashcard: FlashcardType; onCardClick?: (flashcard: FlashcardType) => void; onPronounce?: (flashcard: FlashcardType) => void; + onPronounceText?: (text: string) => void; isPronouncing?: boolean; + isPronouncingSentence?: boolean; } function capitalizeFirstLetter(text: string): string { @@ -17,7 +19,9 @@ export function Flashcard({ flashcard, onCardClick, onPronounce, + onPronounceText, isPronouncing = false, + isPronouncingSentence = false, }: FlashcardProps) { const [isFlipped, setIsFlipped] = useState(false); @@ -34,12 +38,26 @@ export function Flashcard({ [flashcard, onPronounce] ); + const handlePronounceExample = useCallback( + (e: React.MouseEvent) => { + e.stopPropagation(); + const text = flashcard.example || flashcard.example_sentence || ''; + if (text && onPronounceText) { + onPronounceText(text); + } + }, + [flashcard, onPronounceText] + ); + // Support both new 'targetWord' and legacy 'spanish' field const targetWord = flashcard.targetWord || flashcard.spanish || flashcard.word || ''; const english = flashcard.english || flashcard.translation || ''; const example = flashcard.example || flashcard.example_sentence || ''; + const exampleTranslation = flashcard.exampleTranslation || ''; const mnemonic = flashcard.mnemonic || ''; + const pinyin = flashcard.pinyin || ''; + const examplePinyin = flashcard.examplePinyin || ''; // Capitalize the first letter of the target word for display const displayTargetWord = capitalizeFirstLetter(targetWord); @@ -52,6 +70,7 @@ export function Flashcard({
{displayTargetWord}
+ {pinyin &&
{pinyin}
}
+ + ); diff --git a/frontend/src/context/AppContext.tsx b/frontend/src/context/AppContext.tsx index 4a20157..11cab10 100644 --- a/frontend/src/context/AppContext.tsx +++ b/frontend/src/context/AppContext.tsx @@ -261,6 +261,7 @@ interface AppContextType { handleInterrupt: () => void; sendTextMessage: (text: string) => void; pronounceWord: (text: string) => void; + createFlashcardForWord: (word: string) => void; // Conversation actions selectConversation: (conversationId: string) => void; createNewConversation: () => void; @@ -403,6 +404,60 @@ export function AppProvider({ children }: AppProviderProps) { }; }, []); + // Echo gate (mobile only): mute mic while TTS is playing to prevent + // speaker output from being picked up as user speech. Desktop browsers + // have reliable echo cancellation so we leave interruption enabled there. + useEffect(() => { + const isMobile = + /Android|iPhone|iPad|iPod/i.test(navigator.userAgent) || + (navigator.platform === 'MacIntel' && navigator.maxTouchPoints > 1); + if (!isMobile) return; + + const audioPlayer = audioPlayerRef.current; + const ttsAudioPlayer = ttsAudioPlayerRef.current; + const audioHandler = audioHandlerRef.current; + + let mainPlaying = false; + let ttsPlaying = false; + + const updateMuteState = () => { + if (mainPlaying || ttsPlaying) { + audioHandler.mute(); + } else { + audioHandler.unmute(); + } + }; + + const onMainStarted = () => { + mainPlaying = true; + updateMuteState(); + }; + const onMainDone = () => { + mainPlaying = false; + updateMuteState(); + }; + const onTtsStarted = () => { + ttsPlaying = true; + updateMuteState(); + }; + const onTtsDone = () => { + ttsPlaying = false; + updateMuteState(); + }; + + audioPlayer.on('playback_started', onMainStarted); + audioPlayer.on('playback_finished', onMainDone); + audioPlayer.on('playback_stopped', onMainDone); + + ttsAudioPlayer.on('playback_started', onTtsStarted); + ttsAudioPlayer.on('playback_finished', onTtsDone); + ttsAudioPlayer.on('playback_stopped', 
onTtsDone); + + return () => { + audioHandler.unmute(); + }; + }, []); + // Load initial state (conversations across all languages) // Only run if Supabase sync hasn't already loaded conversations useEffect(() => { @@ -520,8 +575,6 @@ export function AppProvider({ children }: AppProviderProps) { // Case 1: We have a pending LLM response but user message was already added (text input case) if (pendingLLMResponse && !pendingTranscription) { - // Add only the teacher response - storage.addMessage('assistant', pendingLLMResponse); dispatch({ type: 'ADD_MESSAGE', payload: { @@ -531,8 +584,24 @@ export function AppProvider({ children }: AppProviderProps) { }, }); - const conversationHistory = storage.getConversationHistory(); - wsClient.send({ type: 'conversation_update', data: conversationHistory }); + // Build conversation_update from current chatHistory + new assistant message + const messages = [ + ...currentState.chatHistory.map((m) => ({ + role: m.role === 'learner' ? 'user' : 'assistant', + content: m.content, + timestamp: m.timestamp || new Date().toISOString(), + })), + { + role: 'assistant', + content: pendingLLMResponse, + timestamp: new Date().toISOString(), + }, + ]; + wsClient.send({ + type: 'conversation_update', + conversationId, + messages, + }); pendingLLMResponseRef.current = null; dispatch({ type: 'RESET_STREAMING_STATE' }); @@ -562,7 +631,6 @@ export function AppProvider({ children }: AppProviderProps) { }); } - storage.addMessage('user', pendingTranscription); dispatch({ type: 'ADD_MESSAGE', payload: { @@ -572,7 +640,6 @@ export function AppProvider({ children }: AppProviderProps) { }, }); - storage.addMessage('assistant', pendingLLMResponse); dispatch({ type: 'ADD_MESSAGE', payload: { @@ -582,8 +649,29 @@ export function AppProvider({ children }: AppProviderProps) { }, }); - const conversationHistory = storage.getConversationHistory(); - wsClient.send({ type: 'conversation_update', data: conversationHistory }); + // Build conversation_update from 
current chatHistory + new user + assistant messages + const messages = [ + ...currentState.chatHistory.map((m) => ({ + role: m.role === 'learner' ? 'user' : 'assistant', + content: m.content, + timestamp: m.timestamp || new Date().toISOString(), + })), + { + role: 'user', + content: pendingTranscription, + timestamp: new Date().toISOString(), + }, + { + role: 'assistant', + content: pendingLLMResponse, + timestamp: new Date().toISOString(), + }, + ]; + wsClient.send({ + type: 'conversation_update', + conversationId, + messages, + }); dispatch({ type: 'SET_PENDING_TRANSCRIPTION', payload: null }); pendingLLMResponseRef.current = null; @@ -686,12 +774,16 @@ export function AppProvider({ children }: AppProviderProps) { }); if (status === 'connected') { - const existingConversation = storage.getConversationHistory(); - if (existingConversation.messages.length > 0) { - wsClient.send({ - type: 'conversation_update', - data: existingConversation, - }); + const currentId = stateRef.current.currentConversationId; + if (currentId) { + const conversationData = storage.getConversation(currentId); + if (conversationData && conversationData.messages.length > 0) { + wsClient.send({ + type: 'conversation_update', + conversationId: currentId, + messages: conversationData.messages, + }); + } } } }); @@ -965,17 +1057,9 @@ export function AppProvider({ children }: AppProviderProps) { })) as ChatMessage[]; // Update chat history to match server state + // Per-conversation storage is kept in sync by the useEffect on chatHistory dispatch({ type: 'SET_CHAT_HISTORY', payload: chatHistory }); - // Also update storage to stay in sync - storage.clearConversation(); - messages.forEach((m) => { - storage.addMessage( - m.role === 'user' ? 
'user' : 'assistant', - m.content - ); - }); - // Clear any pending state dispatch({ type: 'SET_PENDING_TRANSCRIPTION', payload: null }); pendingLLMResponseRef.current = null; @@ -1263,7 +1347,6 @@ export function AppProvider({ children }: AppProviderProps) { } // Add user message to chat history immediately (unlike audio where we wait for transcription) - storage.addMessage('user', trimmedText); dispatch({ type: 'ADD_MESSAGE', payload: { @@ -1304,6 +1387,21 @@ export function AppProvider({ children }: AppProviderProps) { [state.connectionStatus] ); + // Request flashcard generation for a specific word + const createFlashcardForWord = useCallback( + (word: string) => { + const wsClient = wsClientRef.current; + const trimmed = word.trim(); + if (state.connectionStatus !== 'connected' || !trimmed) return; + + wsClient.send({ + type: 'create_flashcard_request', + word: trimmed, + }); + }, + [state.connectionStatus] + ); + // Select a conversation from the sidebar const selectConversation = useCallback( (conversationId: string) => { @@ -1645,6 +1743,7 @@ export function AppProvider({ children }: AppProviderProps) { handleInterrupt, sendTextMessage, pronounceWord, + createFlashcardForWord, selectConversation, createNewConversation, deleteConversation, @@ -1663,6 +1762,7 @@ export function AppProvider({ children }: AppProviderProps) { handleInterrupt, sendTextMessage, pronounceWord, + createFlashcardForWord, selectConversation, createNewConversation, deleteConversation, diff --git a/frontend/src/services/AudioHandler.ts b/frontend/src/services/AudioHandler.ts index 1c0a51f..fb1ae02 100644 --- a/frontend/src/services/AudioHandler.ts +++ b/frontend/src/services/AudioHandler.ts @@ -9,6 +9,7 @@ export class AudioHandler { private stream: MediaStream | null = null; private microphone: MediaStreamAudioSourceNode | null = null; private isStreaming = false; + private isMuted = false; private listeners = new Map(); private isIOS: boolean; private iosHandler: IOSAudioHandler | 
null; @@ -81,7 +82,7 @@ export class AudioHandler { await this.iosHandler.unlockAudioContext?.(); const success = await this.iosHandler.startMicrophone?.((audioData) => { - if (this.isStreaming) { + if (this.isStreaming && !this.isMuted) { this.emit('audioChunk', audioData); } }); @@ -154,7 +155,7 @@ export class AudioHandler { ); this.workletNode.port.onmessage = (event: MessageEvent) => { - if (this.isStreaming) { + if (this.isStreaming && !this.isMuted) { const int16Buffer = event.data as ArrayBuffer; const base64Audio = btoa( String.fromCharCode(...new Uint8Array(int16Buffer)) @@ -189,7 +190,7 @@ export class AudioHandler { let buffer: Float32Array | null = null; this.scriptProcessor.onaudioprocess = (event: AudioProcessingEvent) => { - if (this.isStreaming) { + if (this.isStreaming && !this.isMuted) { const inputData = event.inputBuffer.getChannelData(0); // Append new data to the buffer @@ -288,4 +289,24 @@ export class AudioHandler { getIsStreaming(): boolean { return this.isStreaming; } + + mute(): void { + if (!this.isMuted) { + this.isMuted = true; + console.log( + '[AudioHandler] Muted β€” suppressing audio chunks during TTS playback' + ); + } + } + + unmute(): void { + if (this.isMuted) { + this.isMuted = false; + console.log('[AudioHandler] Unmuted β€” resuming audio chunk emission'); + } + } + + getIsMuted(): boolean { + return this.isMuted; + } } diff --git a/frontend/src/services/AudioPlayer.ts b/frontend/src/services/AudioPlayer.ts index 61a377d..3dc89fd 100644 --- a/frontend/src/services/AudioPlayer.ts +++ b/frontend/src/services/AudioPlayer.ts @@ -6,8 +6,6 @@ export class AudioPlayer { private audioContext: AudioContext | null = null; private audioQueue: AudioBuffer[] = []; private isPlaying = false; - private isStartingPlayback = false; - private currentSource: AudioBufferSourceNode | null = null; private listeners = new Map(); private streamTimeout: ReturnType | null = null; private isIOS: boolean; @@ -15,8 +13,7 @@ export class AudioPlayer { 
private nextStartTime: number = 0; private scheduledSources: AudioBufferSourceNode[] = []; private scheduleInterval: ReturnType | null = null; - private readonly SCHEDULE_AHEAD_TIME = 0.1; // Look 100ms ahead - private readonly FADE_SAMPLES = 128; // ~2.7ms at 48kHz, ~8ms at 16kHz + private readonly SCHEDULE_AHEAD_TIME = 0.3; constructor() { this.isIOS = @@ -115,24 +112,17 @@ export class AudioPlayer { bytes[i] = binaryString.charCodeAt(i); } - // Create audio buffer const audioBuffer = await this.createAudioBuffer( bytes.buffer, sampleRate, audioFormat ); - this.applyFadeEnvelope(audioBuffer); this.audioQueue.push(audioBuffer); - // Start playback immediately if not already playing - if (!this.isPlaying && !this.isStartingPlayback) { - this.isStartingPlayback = true; + if (!this.isPlaying) { this.startScheduleInterval(); - requestAnimationFrame(() => { - this.isStartingPlayback = false; - this.scheduleBuffers(); - }); + this.scheduleBuffers(); } } catch (error) { console.error('Error processing audio stream:', error); @@ -150,20 +140,9 @@ export class AudioPlayer { let numSamples: number; - console.log( - `[AudioPlayer] createAudioBuffer: format=${audioFormat}, byteLength=${arrayBuffer.byteLength}, sampleRate=${sampleRate}` - ); - if (audioFormat === 'float32') { const float32Array = new Float32Array(arrayBuffer); numSamples = float32Array.length; - console.log( - `[AudioPlayer] Float32 samples: ${numSamples}, first 3 values: [${Array.from( - float32Array.slice(0, 3) - ) - .map((v) => v.toFixed(4)) - .join(', ')}]` - ); const audioBuffer = this.audioContext.createBuffer( 1, @@ -181,7 +160,6 @@ export class AudioPlayer { // Int16 PCM format const int16Array = new Int16Array(arrayBuffer); numSamples = int16Array.length; - console.log(`[AudioPlayer] Int16 samples: ${numSamples}`); const audioBuffer = this.audioContext.createBuffer( 1, @@ -198,24 +176,6 @@ export class AudioPlayer { } } - private applyFadeEnvelope(audioBuffer: AudioBuffer): void { - const channelData 
= audioBuffer.getChannelData(0); - const length = channelData.length; - const fadeLength = Math.min(this.FADE_SAMPLES, Math.floor(length / 4)); - - // Fade-in at start - for (let i = 0; i < fadeLength; i++) { - const gain = i / fadeLength; - channelData[i] *= gain; - } - - // Fade-out at end - for (let i = 0; i < fadeLength; i++) { - const gain = i / fadeLength; - channelData[length - 1 - i] *= gain; - } - } - private scheduleBuffers(): void { if (!this.audioContext || this.audioQueue.length === 0) { return; @@ -223,16 +183,8 @@ export class AudioPlayer { const currentTime = this.audioContext.currentTime; - // Handle queue underrun with safety margin if (this.nextStartTime < currentTime) { - const underrunAmount = currentTime - this.nextStartTime; - if (underrunAmount > 0.05) { - console.warn( - `[AudioPlayer] Queue underrun: ${(underrunAmount * 1000).toFixed(1)}ms behind` - ); - } - // Add small margin to ensure we're not scheduling in the past - this.nextStartTime = currentTime + 0.005; + this.nextStartTime = currentTime; } // Schedule buffers that should start within SCHEDULE_AHEAD_TIME @@ -261,6 +213,9 @@ export class AudioPlayer { this.scheduledSources.splice(index, 1); } + // Backstop: ensure more buffers get scheduled even if setInterval is delayed + this.scheduleBuffers(); + if (this.scheduledSources.length === 0 && this.audioQueue.length === 0) { this.isPlaying = false; this.stopScheduleInterval(); @@ -270,9 +225,6 @@ export class AudioPlayer { try { source.start(startTime); - console.log( - `Scheduled buffer: ${audioBuffer.duration.toFixed(3)}s at ${startTime.toFixed(3)}` - ); if (!this.isPlaying) { this.isPlaying = true; @@ -312,7 +264,6 @@ export class AudioPlayer { if (this.isIOS && this.iosHandler) { this.iosHandler.stopAudioPlayback?.(); this.isPlaying = false; - this.isStartingPlayback = false; this.emit('playback_stopped'); return; } @@ -332,20 +283,8 @@ export class AudioPlayer { this.scheduledSources = []; this.nextStartTime = 0; - if 
(this.currentSource) { - try { - this.currentSource.stop(); - this.currentSource.disconnect(); - this.currentSource = null; - } catch (error) { - console.warn('Error stopping audio source:', error); - } - } - - // Clear audio queue to prevent any queued audio from playing this.audioQueue = []; this.isPlaying = false; - this.isStartingPlayback = false; this.emit('playback_stopped'); } diff --git a/frontend/src/services/SupabaseStorage.ts b/frontend/src/services/SupabaseStorage.ts index 0a26dd5..b031b85 100644 --- a/frontend/src/services/SupabaseStorage.ts +++ b/frontend/src/services/SupabaseStorage.ts @@ -212,7 +212,10 @@ export class SupabaseStorage { targetWord: f.target_word, english: f.english, example: f.example, + exampleTranslation: f.example_translation ?? undefined, mnemonic: f.mnemonic, + pinyin: f.pinyin ?? undefined, + examplePinyin: f.example_pinyin ?? undefined, timestamp: f.created_at, languageCode: f.language_code, })); @@ -229,7 +232,10 @@ export class SupabaseStorage { target_word: f.targetWord || f.spanish || '', english: f.english, example: f.example, + example_translation: f.exampleTranslation || null, mnemonic: f.mnemonic, + pinyin: f.pinyin || null, + example_pinyin: f.examplePinyin || null, })); // Use upsert to handle duplicates gracefully @@ -262,7 +268,10 @@ export class SupabaseStorage { targetWord: f.target_word, english: f.english, example: f.example, + exampleTranslation: f.example_translation ?? undefined, mnemonic: f.mnemonic, + pinyin: f.pinyin ?? undefined, + examplePinyin: f.example_pinyin ?? 
undefined, timestamp: f.created_at, languageCode: f.language_code, conversationId: f.conversation_id, @@ -282,7 +291,10 @@ export class SupabaseStorage { target_word: f.targetWord || f.spanish || '', english: f.english, example: f.example, + example_translation: f.exampleTranslation || null, mnemonic: f.mnemonic, + pinyin: f.pinyin || null, + example_pinyin: f.examplePinyin || null, })); // Use upsert to handle duplicates gracefully diff --git a/frontend/src/styles/main.css b/frontend/src/styles/main.css index ff35019..dcd3116 100644 --- a/frontend/src/styles/main.css +++ b/frontend/src/styles/main.css @@ -1322,6 +1322,48 @@ body { cursor: not-allowed; } +/* Flashcard context menu (right-click to create flashcard) */ +.flashcard-context-menu { + position: fixed; + z-index: 1000; + background: #ffffff; + border: 1px solid #e5e5e5; + border-radius: 8px; + padding: 8px 14px; + font-size: 14px; + color: #374151; + cursor: pointer; + box-shadow: 0 4px 16px rgba(0, 0, 0, 0.12); + display: flex; + align-items: center; + gap: 8px; + white-space: nowrap; + user-select: none; + animation: context-menu-in 0.1s ease-out; +} + +.flashcard-context-menu:hover { + background: #f3f4f6; + color: #1a1a1a; +} + +.flashcard-context-menu svg { + width: 16px; + height: 16px; + flex-shrink: 0; +} + +@keyframes context-menu-in { + from { + opacity: 0; + transform: scale(0.95); + } + to { + opacity: 1; + transform: scale(1); + } +} + /* Flashcards Section */ .flashcards-section { background: #ffffff; @@ -1479,6 +1521,15 @@ body { padding: 0 8px; } +.flashcard-pinyin { + font-size: 18px; + font-weight: 400; + color: #6b7280; + text-align: center; + margin-top: 4px; + letter-spacing: 0.5px; +} + .flashcard-english { font-size: 24px; font-weight: 600; @@ -1501,6 +1552,64 @@ body { overflow-wrap: break-word; hyphens: auto; flex-shrink: 0; + display: flex; + align-items: center; + justify-content: center; + gap: 6px; +} + +.flashcard-example.pronounceable { + cursor: pointer; + border-radius: 
8px; + padding: 6px 10px; + transition: background-color 0.2s ease, color 0.2s ease; +} + +.flashcard-example.pronounceable:hover { + background-color: rgba(59, 130, 246, 0.08); + color: #2563eb; +} + +.flashcard-example.pronounceable:active { + background-color: rgba(59, 130, 246, 0.15); +} + +.flashcard-example.pronouncing { + color: #2563eb; + background-color: rgba(59, 130, 246, 0.08); +} + +.example-speaker-icon { + width: 16px; + height: 16px; + flex-shrink: 0; + opacity: 0.4; + transition: opacity 0.2s ease; +} + +.flashcard-example.pronounceable:hover .example-speaker-icon { + opacity: 0.8; +} + +.flashcard-example.pronouncing .example-speaker-icon { + opacity: 1; +} + +.flashcard-example-pinyin { + font-size: 13px; + color: #9ca3af; + text-align: center; + margin-bottom: 4px; + margin-top: -10px; + letter-spacing: 0.3px; +} + +.flashcard-example-translation { + font-size: 13px; + color: #9ca3af; + text-align: center; + margin-bottom: 12px; + line-height: 1.4; } .flashcard-mnemonic { @@ -1550,6 +1659,10 @@ body { .flashcard-target-word { font-size: 24px; } + + .flashcard-pinyin { + font-size: 14px; + } .flashcard-english { font-size: 18px; @@ -1559,6 +1672,11 @@ body { font-size: 13px; padding: 0 8px; } + + .flashcard-example-pinyin, + .flashcard-example-translation { + font-size: 11px; + } .flashcard-mnemonic { font-size: 12px; @@ -1593,6 +1711,71 @@ body { text-align: center; } +/* Export Modal */ +.export-modal-overlay { + position: fixed; + inset: 0; + background: rgba(0, 0, 0, 0.45); + backdrop-filter: blur(4px); + display: flex; + align-items: center; + justify-content: center; + z-index: 2000; + animation: modal-fade-in 0.2s ease-out; +} + +@keyframes modal-fade-in { + from { + opacity: 0; + } + to { + opacity: 1; + } +} + +.export-modal { + background: #ffffff; + border-radius: 16px; + padding: 40px 48px; + display: flex; + flex-direction: column; + align-items: center; + gap: 16px; + box-shadow: 0 20px 60px rgba(0, 0, 0, 0.15), 0 0 0 1px rgba(0, 0, 
0, 0.05); + animation: modal-scale-in 0.25s ease-out; +} + +@keyframes modal-scale-in { + from { + opacity: 0; + transform: scale(0.95); + } + to { + opacity: 1; + transform: scale(1); + } +} + +.export-modal-spinner { + width: 40px; + height: 40px; + border: 3px solid #e5e5e5; + border-top-color: #1a1a1a; + border-radius: 50%; + animation: spin 0.8s linear infinite; +} + +.export-modal-text { + font-size: 17px; + font-weight: 600; + color: #1a1a1a; +} + +.export-modal-subtext { + font-size: 14px; + color: #6b7280; +} + /* Responsive */ @media (max-width: 768px) { .header-logo { @@ -1701,12 +1884,43 @@ body { letter-spacing: 0.02em; width: 100%; box-sizing: border-box; + justify-content: flex-start; +} + +/* Sidebar Footer Links (mobile only) */ +.sidebar-footer { + display: none; + flex-direction: column; + gap: 4px; + padding: 12px; + border-top: 1px solid #e5e5e5; +} + +.sidebar-footer-link { + display: flex; + align-items: center; + gap: 10px; + padding: 10px 12px; + border-radius: 8px; + color: #4b5563; + text-decoration: none; + font-size: 13px; + font-weight: 500; + transition: background 0.15s ease, color 0.15s ease; +} + +.sidebar-footer-link:hover { + background: #f0f0f0; + color: #1a1a1a; } @media (max-width: 768px) { .floating-buttons { - bottom: 16px; - right: 16px; + display: none; + } + + .sidebar-footer { + display: flex; } } diff --git a/frontend/src/types/index.ts b/frontend/src/types/index.ts index 99eb49a..e61b492 100644 --- a/frontend/src/types/index.ts +++ b/frontend/src/types/index.ts @@ -53,7 +53,10 @@ export interface Flashcard { targetWord: string; english: string; example: string; + exampleTranslation?: string; mnemonic: string; + pinyin?: string; + examplePinyin?: string; timestamp?: string; languageCode?: string; conversationId?: string; diff --git a/render.yaml b/render.yaml index f9f53ca..9cfc18d 100644 --- a/render.yaml +++ b/render.yaml @@ -14,6 +14,8 @@ services: sync: false - key: ASSEMBLY_AI_API_KEY sync: false + - key: 
SONIOX_API_KEY + sync: false - key: SUPABASE_URL sync: false - key: SUPABASE_SECRET_KEY diff --git a/supabase/migrations/20240108000000_initial_schema.sql b/supabase/migrations/20240108000000_initial_schema.sql index 2913abc..d52dd98 100644 --- a/supabase/migrations/20240108000000_initial_schema.sql +++ b/supabase/migrations/20240108000000_initial_schema.sql @@ -51,7 +51,10 @@ create table public.flashcards ( target_word text not null, english text not null, example text, + example_translation text, mnemonic text, + pinyin text, + example_pinyin text, created_at timestamptz default now() not null, unique(user_id, conversation_id, target_word) );