diff --git a/CLAUDE.md b/CLAUDE.md index 81a299e..518b7ab 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -21,7 +21,7 @@ Language learning app using Inworld Realtime API for voice conversations. ## Architecture Browser <-> our WebSocket <-> SessionManager <-> Inworld Realtime WebSocket (STT+LLM+TTS) - STT model: assemblyai/u3-rt-pro (with per-language hints) -- LLM model: openai/gpt-4.1-nano (via Inworld Realtime) +- LLM model: openai/gpt-5.4-mini (via Inworld Realtime) - SessionManager: one per client, manages Inworld WS lifecycle, forwards audio/text, handles greeting, tracks turns - InworldLLM: uses Inworld LLM Router (OpenAI-compatible) for flashcards, feedback, translation - TurnMemory: 5-turn sliding window, non-blocking Supabase persistence diff --git a/backend/src/__tests__/config/languages.test.ts b/backend/src/__tests__/config/languages.test.ts index 74c6e77..d59c303 100644 --- a/backend/src/__tests__/config/languages.test.ts +++ b/backend/src/__tests__/config/languages.test.ts @@ -48,13 +48,15 @@ describe('languages config', () => { expect(config.name).toBeTruthy(); expect(config.nativeName).toBeTruthy(); expect(config.flag).toBeTruthy(); - expect(config.sttLanguageCode).toBeTruthy(); + expect(config.bcp47).toBeTruthy(); expect(config.ttsConfig).toBeDefined(); expect(config.ttsConfig.speakerId).toBeTruthy(); expect(config.ttsConfig.modelId).toBeTruthy(); expect(config.teacherPersona).toBeDefined(); expect(config.teacherPersona.name).toBeTruthy(); expect(config.exampleTopics.length).toBeGreaterThan(0); + expect(Array.isArray(config.disfluencies)).toBe(true); + expect(config.disfluencies.length).toBeGreaterThanOrEqual(1); } }); }); diff --git a/backend/src/__tests__/inworld-llm.test.ts b/backend/src/__tests__/inworld-llm.test.ts index 97226d1..39152ea 100644 --- a/backend/src/__tests__/inworld-llm.test.ts +++ b/backend/src/__tests__/inworld-llm.test.ts @@ -252,7 +252,7 @@ describe('InworldLLM', () => { ); const body = JSON.parse(fetchSpy.mock.calls[0][1]!.body 
as string); - expect(body.model).toBe('openai/gpt-4.1-nano'); + expect(body.model).toBe('openai/gpt-5.4-mini'); expect(body.messages[0].role).toBe('user'); expect(body.max_tokens).toBeDefined(); expect(body.temperature).toBeDefined(); @@ -295,7 +295,12 @@ describe('InworldLLM', () => { json: async () => ({ audioContent: 'base64audiodata==' }), } as Response); - const audio = await llm.pronounce('Hola', 'Rafael'); + const audio = await llm.pronounce( + 'Hola', + 'Rafael', + 'es-MX', + 'inworld-tts-2' + ); expect(audio).toBe('base64audiodata=='); }); @@ -305,7 +310,7 @@ describe('InworldLLM', () => { json: async () => ({ audioContent: 'audio' }), } as Response); - await llm.pronounce('perro', 'Rafael'); + await llm.pronounce('perro', 'Rafael', 'es-MX', 'inworld-tts-2'); expect(fetchSpy).toHaveBeenCalledWith( 'https://api.inworld.ai/tts/v1/voice', @@ -315,7 +320,8 @@ describe('InworldLLM', () => { const body = JSON.parse(fetchSpy.mock.calls[0][1]!.body as string); expect(body.text).toBe('perro'); expect(body.voice_id).toBe('Rafael'); - expect(body.model_id).toBe('inworld-tts-1.5-max'); + expect(body.model_id).toBe('inworld-tts-2'); + expect(body.language).toBe('es-MX'); expect(body.audio_config.audio_encoding).toBe('LINEAR16'); expect(body.audio_config.sample_rate_hertz).toBe(24000); }); @@ -326,7 +332,12 @@ describe('InworldLLM', () => { status: 500, } as Response); - const audio = await llm.pronounce('Hola', 'Rafael'); + const audio = await llm.pronounce( + 'Hola', + 'Rafael', + 'es-MX', + 'inworld-tts-2' + ); expect(audio).toBeNull(); }); @@ -334,7 +345,12 @@ describe('InworldLLM', () => { process.env.INWORLD_API_KEY = ''; const noKeyLlm = new InworldLLM(); - const audio = await noKeyLlm.pronounce('Hola', 'Rafael'); + const audio = await noKeyLlm.pronounce( + 'Hola', + 'Rafael', + 'es-MX', + 'inworld-tts-2' + ); expect(audio).toBeNull(); }); }); diff --git a/backend/src/__tests__/session-manager.test.ts b/backend/src/__tests__/session-manager.test.ts index 
5205b52..db71a7d 100644 --- a/backend/src/__tests__/session-manager.test.ts +++ b/backend/src/__tests__/session-manager.test.ts @@ -1,6 +1,9 @@ import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest'; import WebSocket from 'ws'; -import { SessionManager } from '../services/session-manager.js'; +import { + SessionManager, + stripBracketedTags, +} from '../services/session-manager.js'; // Mock ws module so SessionManager doesn't make real connections vi.mock('ws', () => { @@ -25,6 +28,40 @@ function createMockClientWs() { } as unknown as WebSocket; } +describe('stripBracketedTags', () => { + it('removes a leading steering tag and trims', () => { + expect(stripBracketedTags('[speak warmly] Hello there')).toBe( + 'Hello there' + ); + }); + + it('removes inline non-verbal tags', () => { + expect(stripBracketedTags('That is funny [laugh] really')).toBe( + 'That is funny really' + ); + }); + + it('collapses double spaces left by removed tags', () => { + expect(stripBracketedTags('one [tag] two')).toBe('one two'); + }); + + it('removes the space before punctuation when a tag preceded it', () => { + expect(stripBracketedTags('Hello [laugh] , how are you')).toBe( + 'Hello, how are you' + ); + }); + + it('preserves disfluency text (no brackets) untouched', () => { + expect(stripBracketedTags('えーと、そうですね')).toBe('えーと、そうですね'); + }); + + it('handles multiple bracketed tags in one string', () => { + expect( + stripBracketedTags('[speak gently] Pues [sigh] no sé qué decir') + ).toBe('Pues no sé qué decir'); + }); +}); + describe('SessionManager', () => { const originalEnv = process.env.INWORLD_API_KEY; @@ -281,7 +318,7 @@ describe('SessionManager', () => { }); describe('streaming STT events', () => { - it('should accumulate transcription deltas incrementally', () => { + it('should treat transcription deltas as cumulative (Soniox)', () => { const clientWs = createMockClientWs(); const mgr = new SessionManager({ sessionId: 'test-stt-1', @@ -298,12 +335,12 @@ 
describe('SessionManager', () => { handler.call(mgr, { type: 'conversation.item.input_audio_transcription.delta', - delta: 'Hola, ', + delta: 'Hola', }); handler.call(mgr, { type: 'conversation.item.input_audio_transcription.delta', - delta: 'me llamo Cale.', + delta: 'Hola, me llamo Cale.', }); const sent = (clientWs as unknown as { _messages: string[] })._messages; @@ -313,7 +350,7 @@ describe('SessionManager', () => { (m: Record) => m.type === 'partial_transcript' ); expect(partials).toHaveLength(2); - expect(partials[0].text).toBe('Hola, '); + expect(partials[0].text).toBe('Hola'); expect(partials[1].text).toBe('Hola, me llamo Cale.'); }); @@ -430,10 +467,112 @@ describe('SessionManager', () => { const sent = JSON.parse(mockInworldWs.send.mock.calls[0][0]); expect(sent.type).toBe('session.update'); expect(sent.session.audio.input.transcription.model).toBe( - 'assemblyai/u3-rt-pro' + 'soniox/stt-rt-v4' ); - expect(sent.session.audio.input.transcription.language).toBe('es-MX'); - expect(sent.session.model).toBe('openai/gpt-4.1-nano'); + expect(sent.session.audio.input.transcription.language).toBe('es'); + expect(sent.session.providerData.tts.language).toBe('es-MX'); + expect(sent.session.model).toBe('openai/gpt-5.4-mini'); + }); + + it('should strip steering and non-verbal tags from completed assistant transcript', () => { + const clientWs = createMockClientWs(); + const mgr = new SessionManager({ + sessionId: 'test-strip-1', + ws: clientWs, + languageCode: 'ja', + }); + + const mgrAny = mgr as unknown as Record; + mgrAny.sessionReady = true; + + const handler = mgrAny.handleInworldEvent as ( + event: Record + ) => void; + handler.call(mgr, { + type: 'response.output_audio_transcript.done', + transcript: + '[speak gently] なるほど、忙しいですね。[laugh] 趣味はありますか?', + }); + + const sent = (clientWs as unknown as { _messages: string[] })._messages; + const completes = sent + .map((m) => JSON.parse(m)) + .filter( + (m: Record) => m.type === 'llm_response_complete' + ); + 
expect(completes).toHaveLength(1); + expect(completes[0].text).not.toContain('['); + expect(completes[0].text).not.toContain(']'); + expect(completes[0].text).toContain('なるほど'); + expect(completes[0].text).toContain('趣味はありますか'); + }); + + it('should strip a bracketed tag that straddles two streaming deltas', () => { + const clientWs = createMockClientWs(); + const mgr = new SessionManager({ + sessionId: 'test-strip-2', + ws: clientWs, + languageCode: 'es', + }); + + const mgrAny = mgr as unknown as Record; + mgrAny.sessionReady = true; + + const handler = mgrAny.handleInworldEvent as ( + event: Record + ) => void; + + handler.call(mgr, { + type: 'response.output_audio_transcript.delta', + delta: 'Hola, [spe', + }); + handler.call(mgr, { + type: 'response.output_audio_transcript.delta', + delta: 'ak warmly] ¿qué tal?', + }); + + const sent = (clientWs as unknown as { _messages: string[] })._messages; + const chunks = sent + .map((m) => JSON.parse(m)) + .filter( + (m: Record) => m.type === 'llm_response_chunk' + ); + const concatenated = chunks.map((c) => c.text).join(''); + expect(concatenated).not.toContain('['); + expect(concatenated).not.toContain(']'); + expect(concatenated).toContain('Hola'); + expect(concatenated).toContain('¿qué tal?'); + }); + + it('should include TTS-2 expressivity guidance with steering, nonverbal, and target-language disfluencies', () => { + const clientWs = createMockClientWs(); + const mgr = new SessionManager({ + sessionId: 'test-expressivity-1', + ws: clientWs, + languageCode: 'es', + }); + + const mgrAny = mgr as unknown as Record; + const mockInworldWs = { + readyState: 1, + send: vi.fn(), + on: vi.fn(), + close: vi.fn(), + }; + mgrAny.inworldWs = mockInworldWs; + + const sendUpdate = mgrAny.sendSessionUpdate as () => void; + sendUpdate.call(mgr); + + const sent = JSON.parse(mockInworldWs.send.mock.calls[0][0]); + const instructions = sent.session.instructions as string; + + // Steering tag example + 
expect(instructions).toContain('[speak'); + // Non-verbal tag + expect(instructions).toContain('[laugh]'); + // Spanish disfluency from the seeded list + expect(instructions).toContain('este'); }); }); }); diff --git a/backend/src/config/languages.ts b/backend/src/config/languages.ts index 032033b..ebc021f 100644 --- a/backend/src/config/languages.ts +++ b/backend/src/config/languages.ts @@ -1,10 +1,16 @@ /** * Language Configuration System * - * This module provides a centralized configuration for all supported languages. - * To add a new language: - * 1. Add a new entry to SUPPORTED_LANGUAGES with all required fields - * 2. The rest of the app will automatically support the new language + * Centralized configuration for all supported languages. + * + * Wire-format conventions: + * - `bcp47` is the canonical form ("es-ES", "fi-FI"). Used for TTS-2 via + * `session.providerData.tts.language` (and the REST `/tts/v1/voice` `language` field). + * - `code` is ISO 639-1 where available ("es", "fi"), otherwise ISO 639-2 + * ("fil"). Used as the map key, dropdown value, and Soniox STT hint via + * `transcription.language`. + * + * To add a new language: add a new entry to SUPPORTED_LANGUAGES. */ import { createLogger } from '../utils/logger.js'; @@ -23,53 +29,45 @@ export interface TTSConfig { modelId: string; speakingRate: number; temperature: number; - languageCode?: string; // Optional TTS language code (e.g., 'ja-JP') } export interface LanguageConfig { - // Identifier - code: string; // e.g., 'es', 'ja', 'fr' + /** ISO 639-1 where available, otherwise ISO 639-2 (e.g. 'fil') — map key, dropdown value, Soniox STT hint. */ + code: string; + /** BCP-47 with uppercase region (e.g., 'es-ES', 'fi-FI') — TTS-2 language hint. 
*/ + bcp47: string; - // Display names name: string; // English name: "Spanish" nativeName: string; // Native name: "Español" flag: string; // Emoji flag - // STT configuration - sttLanguageCode: string; // Language code for speech-to-text - - // TTS configuration ttsConfig: TTSConfig; - - // Teacher persona for this language teacherPersona: TeacherPersona; - - // Example conversation topics specific to this language's culture exampleTopics: string[]; + /** Natural disfluency fillers in the target language, spoken inline (e.g. ja: ['えーと', 'あの']); at least one per language — curated entries list up to ten. */ + disfluencies: string[]; } /** * Supported Languages Configuration * - * Each language defines everything needed for: - * - Speech recognition (STT) - * - Text-to-speech (TTS) - * - Teacher persona and conversation style - * - Cultural context and example topics + * The first 6 entries are curated personas. Among the rest, languages + * with native voices in the Inworld TTS-2 catalog use them; the others + * fall back to the multilingual Sarah/Jason voices. */ export const SUPPORTED_LANGUAGES: Record = { + // ── Curated languages ──────────────────────────────────────── en: { code: 'en', + bcp47: 'en-US', name: 'English', nativeName: 'English', flag: '🇺🇸', - sttLanguageCode: 'en-US', ttsConfig: { speakerId: 'Lauren', - modelId: 'inworld-tts-1.5-max', + modelId: 'inworld-tts-2', speakingRate: 1, temperature: 1.1, - languageCode: 'en-US', }, teacherPersona: { name: 'Ms. 
Sarah Mitchell', @@ -85,20 +83,31 @@ export const SUPPORTED_LANGUAGES: Record = { 'American idioms and slang', 'travel across the United States', ], + disfluencies: [ + 'um', + 'uh', + 'well', + 'you know', + 'like', + 'I mean', + 'so', + 'hmm', + 'kinda', + 'right', + ], }, es: { code: 'es', + bcp47: 'es-MX', name: 'Spanish', nativeName: 'Español', flag: '🇲🇽', - sttLanguageCode: 'es-MX', // Mexican Spanish ttsConfig: { speakerId: 'Rafael', - modelId: 'inworld-tts-1.5-max', + modelId: 'inworld-tts-2', speakingRate: 1, temperature: 1.1, - languageCode: 'es-MX', }, teacherPersona: { name: 'Señor Gael Herrera', @@ -114,20 +123,31 @@ export const SUPPORTED_LANGUAGES: Record = { 'the concept of brunch across cultures', 'Balkan travel', ], + disfluencies: [ + 'este', + 'eh', + 'pues', + 'o sea', + 'bueno', + 'a ver', + 'mira', + 'vale', + 'digamos', + 'es que', + ], }, fr: { code: 'fr', + bcp47: 'fr-FR', name: 'French', nativeName: 'Français', flag: '🇫🇷', - sttLanguageCode: 'fr-FR', ttsConfig: { speakerId: 'Alain', - modelId: 'inworld-tts-1.5-max', + modelId: 'inworld-tts-2', speakingRate: 1, temperature: 1.1, - languageCode: 'fr-FR', }, teacherPersona: { name: 'Monsieur Lucien Dubois', @@ -144,20 +164,31 @@ export const SUPPORTED_LANGUAGES: Record = { 'travel in Provence and the French Riviera', 'French music from Édith Piaf to modern artists', ], + disfluencies: [ + 'euh', + 'ben', + 'bah', + 'tu vois', + 'eh bien', + 'enfin', + 'quoi', + 'voilà', + 'disons', + 'en fait', + ], }, de: { code: 'de', + bcp47: 'de-DE', name: 'German', nativeName: 'Deutsch', flag: '🇩🇪', - sttLanguageCode: 'de-DE', ttsConfig: { speakerId: 'Josef', - modelId: 'inworld-tts-1.5-max', + modelId: 'inworld-tts-2', speakingRate: 1, temperature: 0.7, - languageCode: 'de-DE', }, teacherPersona: { name: 'Herr Klaus Weber', @@ -174,20 +205,31 @@ export const SUPPORTED_LANGUAGES: Record = { 'traveling through Bavaria and the Alps', 'German literature from Goethe to modern authors', ], + disfluencies: [ 
+ 'ähm', + 'also', + 'naja', + 'sozusagen', + 'tja', + 'hmm', + 'eben', + 'halt', + 'irgendwie', + 'weißt du', + ], }, it: { code: 'it', + bcp47: 'it-IT', name: 'Italian', nativeName: 'Italiano', flag: '🇮🇹', - sttLanguageCode: 'it-IT', ttsConfig: { speakerId: 'Orietta', - modelId: 'inworld-tts-1.5-max', + modelId: 'inworld-tts-2', speakingRate: 1, temperature: 1.1, - languageCode: 'it-IT', }, teacherPersona: { name: 'Signora Maria Rossi', @@ -204,20 +246,31 @@ export const SUPPORTED_LANGUAGES: Record = { 'fashion and design in Milan', 'Italian music from opera to modern pop', ], + disfluencies: [ + 'ehm', + 'cioè', + 'allora', + 'insomma', + 'beh', + 'ecco', + 'diciamo', + 'tipo', + 'magari', + 'praticamente', + ], }, pt: { code: 'pt', + bcp47: 'pt-BR', name: 'Portuguese', nativeName: 'Português', flag: '🇧🇷', - sttLanguageCode: 'pt-BR', // Brazilian Portuguese ttsConfig: { speakerId: 'Heitor', - modelId: 'inworld-tts-1.5-max', + modelId: 'inworld-tts-2', speakingRate: 1, temperature: 0.7, - languageCode: 'pt-BR', }, teacherPersona: { name: 'Senhor João Silva', @@ -234,6 +287,1506 @@ export const SUPPORTED_LANGUAGES: Record = { 'football (soccer) culture', 'the Amazon and Brazilian nature', ], + disfluencies: [ + 'é', + 'tipo', + 'então', + 'sabe', + 'né', + 'olha', + 'pois é', + 'meio que', + 'tipo assim', + 'cara', + ], + }, + + // ── Soniox-supported languages (alphabetical) ──────────────────────── + // Languages with native voices in the Inworld catalog use them; the rest + // alternate Sarah/Jason as multilingual TTS-2 fallbacks. 
+ af: { + code: 'af', + bcp47: 'af-ZA', + name: 'Afrikaans', + nativeName: 'Afrikaans', + flag: '🇿🇦', + ttsConfig: { + speakerId: 'Jason', + modelId: 'inworld-tts-2', + speakingRate: 1, + temperature: 1, + }, + teacherPersona: { + name: 'Pieter', + age: 36, + nationality: 'South African', + description: + 'a South African tutor who loves teaching Afrikaans through Cape Town life, braai culture, and Karoo road trips', + }, + exampleTopics: [ + 'Cape Town and Table Mountain', + 'braai culture and South African food', + 'Afrikaans music and writers', + ], + disfluencies: ['ag', 'um', 'nou ja'], + }, + + sq: { + code: 'sq', + bcp47: 'sq-AL', + name: 'Albanian', + nativeName: 'Shqip', + flag: '🇦🇱', + ttsConfig: { + speakerId: 'Sarah', + modelId: 'inworld-tts-2', + speakingRate: 1, + temperature: 1, + }, + teacherPersona: { + name: 'Arta', + age: 32, + nationality: 'Albanian', + description: + 'an Albanian tutor passionate about Tirana, the Albanian Riviera, and traditional cuisine', + }, + exampleTopics: [ + 'Tirana street life', + 'the Albanian Riviera and Ksamil', + 'traditional dishes like tavë kosi', + ], + disfluencies: ['ëëë', 'pra', 'domethënë'], + }, + + ar: { + code: 'ar', + bcp47: 'ar-SA', + name: 'Arabic', + nativeName: 'العربية', + flag: '🇸🇦', + ttsConfig: { + speakerId: 'Nour', + modelId: 'inworld-tts-2', + speakingRate: 1, + temperature: 1, + }, + teacherPersona: { + name: 'Layla', + age: 33, + nationality: 'Saudi', + description: + 'a Saudi tutor who loves teaching Arabic through Middle Eastern history, classical poetry, and modern culture', + }, + exampleTopics: [ + 'Arabic poetry and proverbs', + 'food across the Levant and Gulf', + 'travel to Petra, Cairo, and Riyadh', + ], + disfluencies: ['يعني', 'هه', 'يا عني'], + }, + + az: { + code: 'az', + bcp47: 'az-AZ', + name: 'Azerbaijani', + nativeName: 'Azərbaycanca', + flag: '🇦🇿', + ttsConfig: { + speakerId: 'Sarah', + modelId: 'inworld-tts-2', + speakingRate: 1, + temperature: 1, + }, + teacherPersona: { 
+ name: 'Elmira', + age: 34, + nationality: 'Azerbaijani', + description: + 'an Azerbaijani tutor who loves Baku, the Caspian coast, and Caucasus cuisine', + }, + exampleTopics: [ + 'Baku old city and modern skyline', + 'plov and traditional Azerbaijani food', + 'mugham music', + ], + disfluencies: ['yəni', 'ee', 'belə'], + }, + + eu: { + code: 'eu', + bcp47: 'eu-ES', + name: 'Basque', + nativeName: 'Euskara', + flag: '🟥', + ttsConfig: { + speakerId: 'Jason', + modelId: 'inworld-tts-2', + speakingRate: 1, + temperature: 1, + }, + teacherPersona: { + name: 'Iker', + age: 35, + nationality: 'Basque', + description: + 'a Basque tutor who loves teaching Euskara through San Sebastián pintxos and Bilbao culture', + }, + exampleTopics: [ + 'pintxo bars in Donostia', + 'the Guggenheim and Bilbao', + 'Basque mythology and rural life', + ], + disfluencies: ['eee', 'beno', 'ba'], + }, + + be: { + code: 'be', + bcp47: 'be-BY', + name: 'Belarusian', + nativeName: 'Беларуская', + flag: '🇧🇾', + ttsConfig: { + speakerId: 'Sarah', + modelId: 'inworld-tts-2', + speakingRate: 1, + temperature: 1, + }, + teacherPersona: { + name: 'Hanna', + age: 31, + nationality: 'Belarusian', + description: + 'a Belarusian tutor passionate about Minsk, traditional folk songs, and Belarusian literature', + }, + exampleTopics: [ + 'Minsk and Belarusian cities', + 'draniki and traditional cuisine', + 'Belarusian folk music', + ], + disfluencies: ['ну', 'эээ', 'значыць'], + }, + + bn: { + code: 'bn', + bcp47: 'bn-BD', + name: 'Bengali', + nativeName: 'বাংলা', + flag: '🇧🇩', + ttsConfig: { + speakerId: 'Sarah', + modelId: 'inworld-tts-2', + speakingRate: 1, + temperature: 1, + }, + teacherPersona: { + name: 'Anika', + age: 30, + nationality: 'Bangladeshi', + description: + 'a Bangladeshi tutor who loves teaching Bengali through Dhaka life, Tagore poetry, and the Sundarbans', + }, + exampleTopics: [ + 'Dhaka street food', + 'Tagore and Bengali literature', + 'Sundarbans and rural Bengal', + ], + 
disfluencies: ['মানে', 'ইয়ে', 'আচ্ছা'], + }, + + bs: { + code: 'bs', + bcp47: 'bs-BA', + name: 'Bosnian', + nativeName: 'Bosanski', + flag: '🇧🇦', + ttsConfig: { + speakerId: 'Jason', + modelId: 'inworld-tts-2', + speakingRate: 1, + temperature: 1, + }, + teacherPersona: { + name: 'Edin', + age: 37, + nationality: 'Bosnian', + description: + 'a Bosnian tutor passionate about Sarajevo, ćevapi, and Balkan history', + }, + exampleTopics: [ + 'Sarajevo old town', + 'ćevapi and Bosnian cuisine', + 'Mostar and the Stari Most bridge', + ], + disfluencies: ['ovaj', 'ono', 'znaš'], + }, + + bg: { + code: 'bg', + bcp47: 'bg-BG', + name: 'Bulgarian', + nativeName: 'Български', + flag: '🇧🇬', + ttsConfig: { + speakerId: 'Sarah', + modelId: 'inworld-tts-2', + speakingRate: 1, + temperature: 1, + }, + teacherPersona: { + name: 'Boyana', + age: 33, + nationality: 'Bulgarian', + description: + 'a Bulgarian tutor who loves Sofia, Rila monasteries, and Black Sea summers', + }, + exampleTopics: [ + 'Sofia and the Vitosha mountains', + 'banitsa and shopska salad', + 'Bulgarian folk music and dance', + ], + disfluencies: ['ами', 'значи', 'нали'], + }, + + ca: { + code: 'ca', + bcp47: 'ca-ES', + name: 'Catalan', + nativeName: 'Català', + flag: '🟨', + ttsConfig: { + speakerId: 'Jason', + modelId: 'inworld-tts-2', + speakingRate: 1, + temperature: 1, + }, + teacherPersona: { + name: 'Jordi', + age: 36, + nationality: 'Catalan', + description: + 'a Catalan tutor who loves Barcelona, Gaudí, and Mediterranean coastal life', + }, + exampleTopics: [ + 'Barcelona neighborhoods', + 'castellers and Catalan traditions', + 'pa amb tomàquet and Catalan food', + ], + disfluencies: ['eh', 'doncs', 'o sigui'], + }, + + zh: { + code: 'zh', + bcp47: 'zh-CN', + name: 'Chinese', + nativeName: '中文', + flag: '🇨🇳', + ttsConfig: { + speakerId: 'Mei', + modelId: 'inworld-tts-2', + speakingRate: 1, + temperature: 1, + }, + teacherPersona: { + name: 'Mei', + age: 32, + nationality: 'Chinese', + description: + 'a 
Beijing tutor who loves teaching Mandarin through tea culture, classical poetry, and modern Chinese cinema', + }, + exampleTopics: [ + 'Beijing hutongs and street food', + 'Chinese tea culture', + 'classical poetry and modern films', + ], + disfluencies: [ + '那个', + '就是', + '嗯', + '就', + '其实', + '你知道', + '对', + '怎么说呢', + ], + }, + + hr: { + code: 'hr', + bcp47: 'hr-HR', + name: 'Croatian', + nativeName: 'Hrvatski', + flag: '🇭🇷', + ttsConfig: { + speakerId: 'Sarah', + modelId: 'inworld-tts-2', + speakingRate: 1, + temperature: 1, + }, + teacherPersona: { + name: 'Ivana', + age: 34, + nationality: 'Croatian', + description: + 'a Croatian tutor passionate about Dubrovnik, Dalmatian islands, and Adriatic seafood', + }, + exampleTopics: [ + 'Dalmatian coast and islands', + 'Plitvice Lakes', + 'peka and Croatian seafood', + ], + disfluencies: ['ovaj', 'znaš', 'pa'], + }, + + cs: { + code: 'cs', + bcp47: 'cs-CZ', + name: 'Czech', + nativeName: 'Čeština', + flag: '🇨🇿', + ttsConfig: { + speakerId: 'Jason', + modelId: 'inworld-tts-2', + speakingRate: 1, + temperature: 1, + }, + teacherPersona: { + name: 'Pavel', + age: 38, + nationality: 'Czech', + description: + 'a Czech tutor who loves teaching through Prague history, Bohemian beer halls, and Czech literature', + }, + exampleTopics: [ + 'Prague castle and the old town', + 'Czech pivo and beer culture', + 'Kafka and Czech cinema', + ], + disfluencies: ['no', 'jakoby', 'prostě'], + }, + + da: { + code: 'da', + bcp47: 'da-DK', + name: 'Danish', + nativeName: 'Dansk', + flag: '🇩🇰', + ttsConfig: { + speakerId: 'Sarah', + modelId: 'inworld-tts-2', + speakingRate: 1, + temperature: 1, + }, + teacherPersona: { + name: 'Mette', + age: 33, + nationality: 'Danish', + description: + 'a Danish tutor passionate about Copenhagen, hygge, and Nordic design', + }, + exampleTopics: [ + 'Copenhagen and Nyhavn', + 'hygge and Scandinavian design', + 'smørrebrød and new Nordic cuisine', + ], + disfluencies: ['øh', 'altså', 'jo'], + }, + + nl: { 
+ code: 'nl', + bcp47: 'nl-NL', + name: 'Dutch', + nativeName: 'Nederlands', + flag: '🇳🇱', + ttsConfig: { + speakerId: 'Katrien', + modelId: 'inworld-tts-2', + speakingRate: 1, + temperature: 1, + }, + teacherPersona: { + name: 'Sanne', + age: 31, + nationality: 'Dutch', + description: + 'a Dutch tutor who loves teaching through Amsterdam canals, cycling culture, and Dutch design', + }, + exampleTopics: [ + 'Amsterdam canals and museums', + 'cycling and Dutch daily life', + 'stroopwafels and bitterballen', + ], + disfluencies: ['eh', 'nou', 'weet je'], + }, + + et: { + code: 'et', + bcp47: 'et-EE', + name: 'Estonian', + nativeName: 'Eesti', + flag: '🇪🇪', + ttsConfig: { + speakerId: 'Jason', + modelId: 'inworld-tts-2', + speakingRate: 1, + temperature: 1, + }, + teacherPersona: { + name: 'Kaarel', + age: 34, + nationality: 'Estonian', + description: + 'an Estonian tutor passionate about Tallinn old town, e-Estonia, and Baltic forests', + }, + exampleTopics: [ + 'Tallinn medieval old town', + 'Estonian saunas and forest culture', + 'e-Estonia and digital society', + ], + disfluencies: ['noh', 'eee', 'tähendab'], + }, + + fil: { + code: 'fil', + bcp47: 'fil-PH', + name: 'Filipino', + nativeName: 'Filipino', + flag: '🇵🇭', + ttsConfig: { + speakerId: 'Sarah', + modelId: 'inworld-tts-2', + speakingRate: 1, + temperature: 1, + }, + teacherPersona: { + name: 'Liza', + age: 30, + nationality: 'Filipino', + description: + 'a Filipino tutor who loves teaching Tagalog through Manila life, island hopping, and family traditions', + }, + exampleTopics: [ + 'Manila and Cebu', + 'adobo and lechon', + 'Philippine islands and beaches', + ], + disfluencies: ['ano', 'kasi', 'parang'], + }, + + fi: { + code: 'fi', + bcp47: 'fi-FI', + name: 'Finnish', + nativeName: 'Suomi', + flag: '🇫🇮', + ttsConfig: { + speakerId: 'Sarah', + modelId: 'inworld-tts-2', + speakingRate: 1, + temperature: 1, + }, + teacherPersona: { + name: 'Aino', + age: 33, + nationality: 'Finnish', + description: + 'a 
Finnish tutor who loves teaching Finnish through Helsinki life and Nordic culture', + }, + exampleTopics: [ + 'sauna culture', + 'Finnish design and architecture', + 'life in Helsinki and Lapland', + ], + disfluencies: ['öö', 'niinku', 'tota'], + }, + + gl: { + code: 'gl', + bcp47: 'gl-ES', + name: 'Galician', + nativeName: 'Galego', + flag: '🟦', + ttsConfig: { + speakerId: 'Jason', + modelId: 'inworld-tts-2', + speakingRate: 1, + temperature: 1, + }, + teacherPersona: { + name: 'Brais', + age: 36, + nationality: 'Galician', + description: + 'a Galician tutor passionate about Santiago de Compostela, Atlantic coast, and Galician seafood', + }, + exampleTopics: [ + 'Camino de Santiago', + 'pulpo a la gallega and seafood', + 'Galician folk music', + ], + disfluencies: ['eh', 'pois', 'ou sexa'], + }, + + el: { + code: 'el', + bcp47: 'el-GR', + name: 'Greek', + nativeName: 'Ελληνικά', + flag: '🇬🇷', + ttsConfig: { + speakerId: 'Sarah', + modelId: 'inworld-tts-2', + speakingRate: 1, + temperature: 1, + }, + teacherPersona: { + name: 'Eleni', + age: 35, + nationality: 'Greek', + description: + 'a Greek tutor who loves teaching through Athens history, the Aegean islands, and Greek philosophy', + }, + exampleTopics: [ + 'Athens and the Acropolis', + 'Greek islands like Santorini and Crete', + 'Greek philosophy and mythology', + ], + disfluencies: ['ε', 'δηλαδή', 'ξέρεις'], + }, + + gu: { + code: 'gu', + bcp47: 'gu-IN', + name: 'Gujarati', + nativeName: 'ગુજરાતી', + flag: '🇮🇳', + ttsConfig: { + speakerId: 'Sarah', + modelId: 'inworld-tts-2', + speakingRate: 1, + temperature: 1, + }, + teacherPersona: { + name: 'Priya', + age: 32, + nationality: 'Gujarati', + description: + 'a Gujarati tutor passionate about Ahmedabad, vegetarian cuisine, and Garba dance', + }, + exampleTopics: [ + 'Ahmedabad and Gujarati culture', + 'dhokla, thepla and vegetarian thalis', + 'Navratri and Garba', + ], + disfluencies: ['એમ', 'મતલબ', 'જેમ કે'], + }, + + he: { + code: 'he', + bcp47: 'he-IL', + 
name: 'Hebrew', + nativeName: 'עברית', + flag: '🇮🇱', + ttsConfig: { + speakerId: 'Yael', + modelId: 'inworld-tts-2', + speakingRate: 1, + temperature: 1, + }, + teacherPersona: { + name: 'Noa', + age: 31, + nationality: 'Israeli', + description: + 'an Israeli tutor who loves teaching Hebrew through Tel Aviv beach life, Jerusalem history, and modern Israeli culture', + }, + exampleTopics: [ + 'Tel Aviv and Jaffa', + 'Jerusalem and the Old City', + 'shakshuka and Israeli cuisine', + ], + disfluencies: ['אהה', 'יעני', 'כאילו'], + }, + + hi: { + code: 'hi', + bcp47: 'hi-IN', + name: 'Hindi', + nativeName: 'हिन्दी', + flag: '🇮🇳', + ttsConfig: { + speakerId: 'Aarav', + modelId: 'inworld-tts-2', + speakingRate: 1, + temperature: 1, + }, + teacherPersona: { + name: 'Aarav', + age: 34, + nationality: 'Indian', + description: + 'an Indian tutor who loves teaching Hindi through Bollywood, street food, and travel across India', + }, + exampleTopics: [ + 'Delhi and Mumbai life', + 'Bollywood films and music', + 'Indian street food and chai', + ], + disfluencies: ['मतलब', 'अरे', 'यानी'], + }, + + hu: { + code: 'hu', + bcp47: 'hu-HU', + name: 'Hungarian', + nativeName: 'Magyar', + flag: '🇭🇺', + ttsConfig: { + speakerId: 'Sarah', + modelId: 'inworld-tts-2', + speakingRate: 1, + temperature: 1, + }, + teacherPersona: { + name: 'Zsófia', + age: 33, + nationality: 'Hungarian', + description: + 'a Hungarian tutor passionate about Budapest, thermal baths, and Magyar literature', + }, + exampleTopics: [ + 'Budapest and the Danube', + 'gulyás and Hungarian cuisine', + 'thermal baths and ruin pubs', + ], + disfluencies: ['hát', 'izé', 'ugye'], + }, + + id: { + code: 'id', + bcp47: 'id-ID', + name: 'Indonesian', + nativeName: 'Bahasa Indonesia', + flag: '🇮🇩', + ttsConfig: { + speakerId: 'Jason', + modelId: 'inworld-tts-2', + speakingRate: 1, + temperature: 1, + }, + teacherPersona: { + name: 'Budi', + age: 35, + nationality: 'Indonesian', + description: + 'an Indonesian tutor who loves 
teaching through Bali, Javanese culture, and the diverse archipelago', + }, + exampleTopics: [ + 'Bali and Java', + 'nasi goreng and Indonesian street food', + 'island hopping across Indonesia', + ], + disfluencies: ['anu', 'gitu', 'ya'], + }, + + ja: { + code: 'ja', + bcp47: 'ja-JP', + name: 'Japanese', + nativeName: '日本語', + flag: '🇯🇵', + ttsConfig: { + speakerId: 'Hina', + modelId: 'inworld-tts-2', + speakingRate: 1, + temperature: 1, + }, + teacherPersona: { + name: 'Yuki', + age: 32, + nationality: 'Japanese', + description: + 'a Japanese tutor who loves teaching through Tokyo neighborhoods, tea ceremony, and modern pop culture', + }, + exampleTopics: [ + 'Tokyo neighborhoods and Kyoto temples', + 'sushi, ramen, and izakaya culture', + 'anime, manga, and J-pop', + ], + disfluencies: [ + 'えーと', + 'あの', + 'そうですね', + 'うーん', + 'まあ', + 'なんか', + 'ええ', + 'まあね', + ], + }, + + kn: { + code: 'kn', + bcp47: 'kn-IN', + name: 'Kannada', + nativeName: 'ಕನ್ನಡ', + flag: '🇮🇳', + ttsConfig: { + speakerId: 'Sarah', + modelId: 'inworld-tts-2', + speakingRate: 1, + temperature: 1, + }, + teacherPersona: { + name: 'Kavya', + age: 30, + nationality: 'Kannadiga', + description: + 'a Karnataka tutor passionate about Bengaluru, Mysuru palaces, and South Indian cuisine', + }, + exampleTopics: [ + 'Bengaluru tech and café culture', + 'Mysuru palace and Hampi ruins', + 'masala dosa and South Indian food', + ], + disfluencies: ['ಅಂದ್ರೆ', 'ಅಯ್ಯೋ', 'ಹಾ'], + }, + + kk: { + code: 'kk', + bcp47: 'kk-KZ', + name: 'Kazakh', + nativeName: 'Қазақ', + flag: '🇰🇿', + ttsConfig: { + speakerId: 'Sarah', + modelId: 'inworld-tts-2', + speakingRate: 1, + temperature: 1, + }, + teacherPersona: { + name: 'Aigerim', + age: 32, + nationality: 'Kazakh', + description: + 'a Kazakh tutor who loves teaching through Almaty, the steppes, and Central Asian traditions', + }, + exampleTopics: [ + 'Almaty and Astana', + 'beshbarmak and steppe cuisine', + 'eagle hunting and Kazakh traditions', + ], + disfluencies: 
['яғни', 'ееее', 'былай'], + }, + + ko: { + code: 'ko', + bcp47: 'ko-KR', + name: 'Korean', + nativeName: '한국어', + flag: '🇰🇷', + ttsConfig: { + speakerId: 'Hyunwoo', + modelId: 'inworld-tts-2', + speakingRate: 1, + temperature: 1, + }, + teacherPersona: { + name: 'Min-jun', + age: 31, + nationality: 'Korean', + description: + 'a Korean tutor who loves teaching through Seoul life, K-pop, and Korean food culture', + }, + exampleTopics: [ + 'Seoul neighborhoods and Jeju island', + 'K-pop, K-dramas, and Korean cinema', + 'kimchi, bibimbap, and Korean BBQ', + ], + disfluencies: [ + '그…', + '음…', + '저기', + '뭐', + '있잖아', + '그러니까', + '아', + '말하자면', + ], + }, + + lv: { + code: 'lv', + bcp47: 'lv-LV', + name: 'Latvian', + nativeName: 'Latviešu', + flag: '🇱🇻', + ttsConfig: { + speakerId: 'Sarah', + modelId: 'inworld-tts-2', + speakingRate: 1, + temperature: 1, + }, + teacherPersona: { + name: 'Liene', + age: 33, + nationality: 'Latvian', + description: + 'a Latvian tutor passionate about Riga art nouveau, Baltic forests, and folk traditions', + }, + exampleTopics: [ + 'Riga old town and art nouveau', + 'Latvian folk songs (dainas)', + 'midsummer Jāņi celebrations', + ], + disfluencies: ['nu', 'tātad', 'redzi'], + }, + + lt: { + code: 'lt', + bcp47: 'lt-LT', + name: 'Lithuanian', + nativeName: 'Lietuvių', + flag: '🇱🇹', + ttsConfig: { + speakerId: 'Jason', + modelId: 'inworld-tts-2', + speakingRate: 1, + temperature: 1, + }, + teacherPersona: { + name: 'Tomas', + age: 35, + nationality: 'Lithuanian', + description: + 'a Lithuanian tutor who loves Vilnius old town, Curonian Spit dunes, and Baltic history', + }, + exampleTopics: [ + 'Vilnius and Trakai castle', + 'cepelinai and Lithuanian cuisine', + 'Curonian Spit and the Baltic coast', + ], + disfluencies: ['na', 'tai', 'žinai'], + }, + + mk: { + code: 'mk', + bcp47: 'mk-MK', + name: 'Macedonian', + nativeName: 'Македонски', + flag: '🇲🇰', + ttsConfig: { + speakerId: 'Jason', + modelId: 'inworld-tts-2', + speakingRate: 1, + 
temperature: 1, + }, + teacherPersona: { + name: 'Nikola', + age: 36, + nationality: 'Macedonian', + description: + 'a Macedonian tutor passionate about Skopje, Lake Ohrid, and Balkan history', + }, + exampleTopics: [ + 'Skopje and Lake Ohrid', + 'tavče gravče and Macedonian cuisine', + 'Balkan folk music', + ], + disfluencies: ['па', 'значи', 'знаеш'], + }, + + ms: { + code: 'ms', + bcp47: 'ms-MY', + name: 'Malay', + nativeName: 'Melayu', + flag: '🇲🇾', + ttsConfig: { + speakerId: 'Sarah', + modelId: 'inworld-tts-2', + speakingRate: 1, + temperature: 1, + }, + teacherPersona: { + name: 'Aisyah', + age: 32, + nationality: 'Malaysian', + description: + 'a Malaysian tutor who loves teaching through KL street food, Penang heritage, and Borneo nature', + }, + exampleTopics: [ + 'Kuala Lumpur and Penang', + 'nasi lemak and Malaysian street food', + 'Borneo rainforests', + ], + disfluencies: ['hmm', 'macam', 'tu'], + }, + + ml: { + code: 'ml', + bcp47: 'ml-IN', + name: 'Malayalam', + nativeName: 'മലയാളം', + flag: '🇮🇳', + ttsConfig: { + speakerId: 'Sarah', + modelId: 'inworld-tts-2', + speakingRate: 1, + temperature: 1, + }, + teacherPersona: { + name: 'Anjali', + age: 31, + nationality: 'Malayali', + description: + 'a Kerala tutor passionate about backwater houseboats, Kathakali, and Keralan cuisine', + }, + exampleTopics: [ + 'Kerala backwaters and Kochi', + 'sadya and coconut-based cooking', + 'Kathakali and traditional arts', + ], + disfluencies: ['അതേ', 'പിന്നെ', 'അല്ലെ'], + }, + + mr: { + code: 'mr', + bcp47: 'mr-IN', + name: 'Marathi', + nativeName: 'मराठी', + flag: '🇮🇳', + ttsConfig: { + speakerId: 'Jason', + modelId: 'inworld-tts-2', + speakingRate: 1, + temperature: 1, + }, + teacherPersona: { + name: 'Rohan', + age: 33, + nationality: 'Marathi', + description: + 'a Maharashtrian tutor who loves teaching through Mumbai life, Pune culture, and Marathi cinema', + }, + exampleTopics: [ + 'Mumbai and the Western Ghats', + 'vada pav and Maharashtrian street food', + 
'Marathi theatre and cinema', + ], + disfluencies: ['म्हणजे', 'अरे', 'तर'], + }, + + no: { + code: 'no', + bcp47: 'nb-NO', + name: 'Norwegian', + nativeName: 'Norsk', + flag: '🇳🇴', + ttsConfig: { + speakerId: 'Sarah', + modelId: 'inworld-tts-2', + speakingRate: 1, + temperature: 1, + }, + teacherPersona: { + name: 'Sigrid', + age: 32, + nationality: 'Norwegian', + description: + 'a Norwegian tutor passionate about Oslo, fjord hikes, and Nordic outdoor life', + }, + exampleTopics: [ + 'Oslo and the Norwegian fjords', + 'friluftsliv and outdoor culture', + 'brunost and Norwegian cuisine', + ], + disfluencies: ['eh', 'liksom', 'altså'], + }, + + fa: { + code: 'fa', + bcp47: 'fa-IR', + name: 'Persian', + nativeName: 'فارسی', + flag: '🇮🇷', + ttsConfig: { + speakerId: 'Jason', + modelId: 'inworld-tts-2', + speakingRate: 1, + temperature: 1, + }, + teacherPersona: { + name: 'Darius', + age: 36, + nationality: 'Iranian', + description: + 'an Iranian tutor who loves teaching Persian through Tehran life, classical poetry, and Iranian cuisine', + }, + exampleTopics: [ + 'Tehran and Isfahan', + 'Hafez, Rumi and Persian poetry', + 'kebabs, stews, and Persian rice dishes', + ], + disfluencies: ['یعنی', 'خب', 'چیز'], + }, + + pl: { + code: 'pl', + bcp47: 'pl-PL', + name: 'Polish', + nativeName: 'Polski', + flag: '🇵🇱', + ttsConfig: { + speakerId: 'Szymon', + modelId: 'inworld-tts-2', + speakingRate: 1, + temperature: 1, + }, + teacherPersona: { + name: 'Szymon', + age: 33, + nationality: 'Polish', + description: + 'a Polish tutor passionate about Kraków old town, Polish cinema, and pierogi traditions', + }, + exampleTopics: [ + 'Kraków and Warsaw', + 'pierogi and Polish home cooking', + 'Polish cinema and history', + ], + disfluencies: ['no', 'yyy', 'wiesz'], + }, + + pa: { + code: 'pa', + bcp47: 'pa-IN', + name: 'Punjabi', + nativeName: 'ਪੰਜਾਬੀ', + flag: '🇮🇳', + ttsConfig: { + speakerId: 'Jason', + modelId: 'inworld-tts-2', + speakingRate: 1, + temperature: 1, + }, + 
teacherPersona: { + name: 'Harpreet', + age: 34, + nationality: 'Punjabi', + description: + 'a Punjabi tutor who loves teaching through Amritsar, bhangra, and Punjabi food culture', + }, + exampleTopics: [ + 'Amritsar and the Golden Temple', + 'butter chicken, sarson da saag, and Punjabi food', + 'bhangra and Punjabi music', + ], + disfluencies: ['ਮਤਲਬ', 'ਯਾਨੀ', 'ਉਹ'], + }, + + ro: { + code: 'ro', + bcp47: 'ro-RO', + name: 'Romanian', + nativeName: 'Română', + flag: '🇷🇴', + ttsConfig: { + speakerId: 'Jason', + modelId: 'inworld-tts-2', + speakingRate: 1, + temperature: 1, + }, + teacherPersona: { + name: 'Andrei', + age: 35, + nationality: 'Romanian', + description: + 'a Romanian tutor passionate about Bucharest, Transylvanian castles, and Carpathian villages', + }, + exampleTopics: [ + 'Bucharest and Transylvania', + 'sarmale and Romanian home cooking', + 'Carpathian mountains and folklore', + ], + disfluencies: ['adică', 'păi', 'deci'], + }, + + ru: { + code: 'ru', + bcp47: 'ru-RU', + name: 'Russian', + nativeName: 'Русский', + flag: '🇷🇺', + ttsConfig: { + speakerId: 'Elena', + modelId: 'inworld-tts-2', + speakingRate: 1, + temperature: 1, + }, + teacherPersona: { + name: 'Anastasia', + age: 34, + nationality: 'Russian', + description: + 'a Russian tutor who loves teaching through Moscow life, classical literature, and Russian cuisine', + }, + exampleTopics: [ + 'Moscow and St. 
Petersburg', + 'Tolstoy, Dostoevsky and Russian literature', + 'borscht, pelmeni and Russian food', + ], + disfluencies: ['ну', 'это', 'как бы'], + }, + + sr: { + code: 'sr', + bcp47: 'sr-RS', + name: 'Serbian', + nativeName: 'Српски', + flag: '🇷🇸', + ttsConfig: { + speakerId: 'Jason', + modelId: 'inworld-tts-2', + speakingRate: 1, + temperature: 1, + }, + teacherPersona: { + name: 'Miloš', + age: 36, + nationality: 'Serbian', + description: + 'a Serbian tutor passionate about Belgrade nightlife, Balkan music, and Serbian traditions', + }, + exampleTopics: [ + 'Belgrade nightlife and Novi Sad', + 'ćevapi, ajvar and Serbian food', + 'Exit Festival and Balkan music', + ], + disfluencies: ['ovaj', 'znaš', 'pa'], + }, + + sk: { + code: 'sk', + bcp47: 'sk-SK', + name: 'Slovak', + nativeName: 'Slovenčina', + flag: '🇸🇰', + ttsConfig: { + speakerId: 'Sarah', + modelId: 'inworld-tts-2', + speakingRate: 1, + temperature: 1, + }, + teacherPersona: { + name: 'Zuzana', + age: 32, + nationality: 'Slovak', + description: + 'a Slovak tutor who loves Bratislava, the Tatra mountains, and Slovak folk traditions', + }, + exampleTopics: [ + 'Bratislava and the High Tatras', + 'bryndzové halušky and Slovak cuisine', + 'wooden churches and folk music', + ], + disfluencies: ['no', 'akože', 'proste'], + }, + + sl: { + code: 'sl', + bcp47: 'sl-SI', + name: 'Slovenian', + nativeName: 'Slovenščina', + flag: '🇸🇮', + ttsConfig: { + speakerId: 'Sarah', + modelId: 'inworld-tts-2', + speakingRate: 1, + temperature: 1, + }, + teacherPersona: { + name: 'Maja', + age: 31, + nationality: 'Slovenian', + description: + 'a Slovenian tutor passionate about Ljubljana, Lake Bled, and Julian Alps hiking', + }, + exampleTopics: [ + 'Ljubljana and Lake Bled', + 'potica and Slovenian cuisine', + 'Julian Alps and Postojna caves', + ], + disfluencies: ['no', 'pač', 'a veš'], + }, + + sw: { + code: 'sw', + bcp47: 'sw-KE', + name: 'Swahili', + nativeName: 'Kiswahili', + flag: '🇰🇪', + ttsConfig: { + speakerId: 
'Sarah', + modelId: 'inworld-tts-2', + speakingRate: 1, + temperature: 1, + }, + teacherPersona: { + name: 'Amani', + age: 33, + nationality: 'Kenyan', + description: + 'a Kenyan tutor who loves teaching Swahili through Nairobi life, coastal Lamu, and East African culture', + }, + exampleTopics: [ + 'Nairobi and the Maasai Mara', + 'ugali, nyama choma and East African food', + 'Swahili coastal culture and Lamu', + ], + disfluencies: ['eee', 'yaani', 'basi'], + }, + + sv: { + code: 'sv', + bcp47: 'sv-SE', + name: 'Swedish', + nativeName: 'Svenska', + flag: '🇸🇪', + ttsConfig: { + speakerId: 'Jason', + modelId: 'inworld-tts-2', + speakingRate: 1, + temperature: 1, + }, + teacherPersona: { + name: 'Erik', + age: 35, + nationality: 'Swedish', + description: + 'a Swedish tutor passionate about Stockholm, fika culture, and the Swedish countryside', + }, + exampleTopics: [ + 'Stockholm archipelago', + 'fika and Swedish coffee culture', + 'midsummer and Swedish traditions', + ], + disfluencies: ['öh', 'liksom', 'alltså'], + }, + + ta: { + code: 'ta', + bcp47: 'ta-IN', + name: 'Tamil', + nativeName: 'தமிழ்', + flag: '🇮🇳', + ttsConfig: { + speakerId: 'Jason', + modelId: 'inworld-tts-2', + speakingRate: 1, + temperature: 1, + }, + teacherPersona: { + name: 'Karthik', + age: 34, + nationality: 'Tamil', + description: + 'a Tamil tutor who loves teaching through Chennai life, Tamil cinema, and South Indian temples', + }, + exampleTopics: [ + 'Chennai and Madurai temples', + 'idli, dosa and Tamil cuisine', + 'Tamil cinema and Carnatic music', + ], + disfluencies: ['அதான்', 'அப்பா', 'என்ன'], + }, + + te: { + code: 'te', + bcp47: 'te-IN', + name: 'Telugu', + nativeName: 'తెలుగు', + flag: '🇮🇳', + ttsConfig: { + speakerId: 'Sarah', + modelId: 'inworld-tts-2', + speakingRate: 1, + temperature: 1, + }, + teacherPersona: { + name: 'Lakshmi', + age: 32, + nationality: 'Telugu', + description: + 'a Telugu tutor passionate about Hyderabad, biryani, and Tollywood cinema', + }, + 
exampleTopics: [ + 'Hyderabad and the Charminar', + 'Hyderabadi biryani and Andhra cuisine', + 'Tollywood films and Telugu poetry', + ], + disfluencies: ['అంటే', 'అదే', 'అరె'], + }, + + th: { + code: 'th', + bcp47: 'th-TH', + name: 'Thai', + nativeName: 'ไทย', + flag: '🇹🇭', + ttsConfig: { + speakerId: 'Sarah', + modelId: 'inworld-tts-2', + speakingRate: 1, + temperature: 1, + }, + teacherPersona: { + name: 'Siriporn', + age: 31, + nationality: 'Thai', + description: + 'a Thai tutor who loves teaching through Bangkok markets, island life, and Thai food culture', + }, + exampleTopics: [ + 'Bangkok and Chiang Mai', + 'pad thai, tom yum and Thai street food', + 'Thai islands and beaches', + ], + disfluencies: ['เอ่อ', 'แบบ', 'คือ'], + }, + + tr: { + code: 'tr', + bcp47: 'tr-TR', + name: 'Turkish', + nativeName: 'Türkçe', + flag: '🇹🇷', + ttsConfig: { + speakerId: 'Jason', + modelId: 'inworld-tts-2', + speakingRate: 1, + temperature: 1, + }, + teacherPersona: { + name: 'Emre', + age: 35, + nationality: 'Turkish', + description: + 'a Turkish tutor passionate about Istanbul, Anatolian history, and Turkish cuisine', + }, + exampleTopics: [ + 'Istanbul and the Bosphorus', + 'kebabs, mezes and Turkish breakfasts', + 'Cappadocia and Turkish coast', + ], + disfluencies: ['şey', 'yani', 'işte'], + }, + + uk: { + code: 'uk', + bcp47: 'uk-UA', + name: 'Ukrainian', + nativeName: 'Українська', + flag: '🇺🇦', + ttsConfig: { + speakerId: 'Sarah', + modelId: 'inworld-tts-2', + speakingRate: 1, + temperature: 1, + }, + teacherPersona: { + name: 'Olena', + age: 33, + nationality: 'Ukrainian', + description: + 'a Ukrainian tutor who loves teaching through Kyiv, Lviv coffee houses, and Ukrainian folk traditions', + }, + exampleTopics: [ + 'Kyiv and Lviv', + 'borscht, varenyky and Ukrainian cuisine', + 'Ukrainian folk songs and embroidery', + ], + disfluencies: ['ну', 'це', 'тобто'], + }, + + ur: { + code: 'ur', + bcp47: 'ur-PK', + name: 'Urdu', + nativeName: 'اردو', + flag: '🇵🇰', + 
ttsConfig: { + speakerId: 'Sarah', + modelId: 'inworld-tts-2', + speakingRate: 1, + temperature: 1, + }, + teacherPersona: { + name: 'Zara', + age: 32, + nationality: 'Pakistani', + description: + 'a Pakistani tutor passionate about Lahore, Urdu poetry (ghazals), and Mughlai cuisine', + }, + exampleTopics: [ + 'Lahore and Karachi', + 'Urdu ghazals and shayari', + 'biryani, nihari and Mughlai food', + ], + disfluencies: ['یعنی', 'مطلب', 'وہ'], + }, + + vi: { + code: 'vi', + bcp47: 'vi-VN', + name: 'Vietnamese', + nativeName: 'Tiếng Việt', + flag: '🇻🇳', + ttsConfig: { + speakerId: 'Sarah', + modelId: 'inworld-tts-2', + speakingRate: 1, + temperature: 1, + }, + teacherPersona: { + name: 'Linh', + age: 30, + nationality: 'Vietnamese', + description: + 'a Vietnamese tutor who loves teaching through Hanoi street food, Hạ Long Bay, and Vietnamese coffee culture', + }, + exampleTopics: [ + 'Hanoi and Ho Chi Minh City', + 'phở, bánh mì and Vietnamese street food', + 'Hạ Long Bay and the Mekong Delta', + ], + disfluencies: ['ờ', 'thì', 'cái'], + }, + + cy: { + code: 'cy', + bcp47: 'cy-GB', + name: 'Welsh', + nativeName: 'Cymraeg', + flag: '🏴󠁧󠁢󠁷󠁬󠁳󠁿', + ttsConfig: { + speakerId: 'Jason', + modelId: 'inworld-tts-2', + speakingRate: 1, + temperature: 1, + }, + teacherPersona: { + name: 'Rhys', + age: 36, + nationality: 'Welsh', + description: + 'a Welsh tutor passionate about Cardiff, Snowdonia hiking, and Welsh poetry traditions', + }, + exampleTopics: [ + 'Cardiff and the Welsh valleys', + 'Snowdonia and the coastal path', + 'cawl, Welsh cakes and male voice choirs', + ], + disfluencies: ['ym', 'wel', "ti'n gwybod"], }, }; diff --git a/backend/src/config/server.ts b/backend/src/config/server.ts index 81eeea7..22f06be 100644 --- a/backend/src/config/server.ts +++ b/backend/src/config/server.ts @@ -18,5 +18,5 @@ export const serverConfig = { 'wss://api.inworld.ai/api/v1/realtime/session', /** TTS voice model */ - ttsModel: process.env.TTS_MODEL || 'inworld-tts-1.5-max', + 
ttsModel: process.env.TTS_MODEL || 'inworld-tts-2', } as const; diff --git a/backend/src/helpers/tts-audio-generator.ts b/backend/src/helpers/tts-audio-generator.ts index fa39be9..c87ee44 100644 --- a/backend/src/helpers/tts-audio-generator.ts +++ b/backend/src/helpers/tts-audio-generator.ts @@ -71,7 +71,7 @@ async function generateTTSAudio( } const langConfig = getLanguageConfig(languageCode); - const voiceId = langConfig.ttsConfig.speakerId; + const { speakerId, modelId } = langConfig.ttsConfig; const response = await fetch(TTS_URL, { method: 'POST', @@ -81,8 +81,9 @@ async function generateTTSAudio( }, body: JSON.stringify({ text: text.trim(), - voice_id: voiceId, - model_id: 'inworld-tts-1.5-max', + voice_id: speakerId, + model_id: modelId, + language: langConfig.bcp47, audio_config: { audio_encoding: 'LINEAR16', sample_rate_hertz: 24000, diff --git a/backend/src/services/inworld-llm.ts b/backend/src/services/inworld-llm.ts index 8fd9a3d..f626f9f 100644 --- a/backend/src/services/inworld-llm.ts +++ b/backend/src/services/inworld-llm.ts @@ -129,7 +129,12 @@ Text: ${text}`; * Pronounce text using Inworld TTS API. * Returns base64-encoded LINEAR16 audio at 24kHz, or null on failure. 
*/ - async pronounce(text: string, voiceId: string): Promise { + async pronounce( + text: string, + voiceId: string, + bcp47: string, + modelId: string + ): Promise { if (!this.apiKey) return null; try { @@ -142,7 +147,8 @@ Text: ${text}`; body: JSON.stringify({ text, voice_id: voiceId, - model_id: 'inworld-tts-1.5-max', + model_id: modelId, + language: bcp47, audio_config: { audio_encoding: 'LINEAR16', sample_rate_hertz: 24000, @@ -183,7 +189,7 @@ Text: ${text}`; Authorization: `Basic ${this.apiKey}`, }, body: JSON.stringify({ - model: 'openai/gpt-4.1-nano', + model: 'openai/gpt-5.4-mini', messages: [{ role: 'user', content: prompt }], max_tokens: maxTokens, temperature, diff --git a/backend/src/services/memory-service.ts b/backend/src/services/memory-service.ts index a7db6a9..2658b03 100644 --- a/backend/src/services/memory-service.ts +++ b/backend/src/services/memory-service.ts @@ -257,7 +257,7 @@ Rules: Authorization: `Basic ${process.env.INWORLD_API_KEY}`, }, body: JSON.stringify({ - model: 'openai/gpt-4.1-nano', + model: 'openai/gpt-5.4-mini', messages: [{ role: 'user', content: prompt }], max_tokens: 200, temperature: 0.7, diff --git a/backend/src/services/session-manager.ts b/backend/src/services/session-manager.ts index 25a4624..918a50c 100644 --- a/backend/src/services/session-manager.ts +++ b/backend/src/services/session-manager.ts @@ -23,6 +23,19 @@ export interface SessionManagerOptions { languageCode: string; } +/** + * Remove TTS-2 control tags from text the user will see/persist: + * steering tags ([speak ...]) and non-verbal tags ([laugh], [sigh], ...). + * Disfluencies are plain inline text and are not bracketed, so they survive. 
+ */ +export function stripBracketedTags(text: string): string { + return text + .replace(/\[[^\][]*\]/g, '') + .replace(/[ \t]{2,}/g, ' ') + .replace(/ ([,.!?;:])/g, '$1') + .trim(); +} + export class SessionManager { private ws: ClientWebSocket; private inworldWs: WebSocket | null = null; @@ -37,6 +50,8 @@ export class SessionManager { private greetingItemId: string | null = null; /** Buffer for accumulating partial transcription deltas (incremental) */ private userTextBuffer = ''; + /** Held-back tail of an in-flight assistant chunk that may be the start of a `[...]` tag */ + private assistantPendingTail = ''; constructor(opts: SessionManagerOptions) { this.ws = opts.ws; @@ -223,6 +238,7 @@ export class SessionManager { case 'input_audio_buffer.speech_started': // Cancel any in-progress agent response (matches Inworld playground) this.inworldSend({ type: 'response.cancel' }); + this.assistantPendingTail = ''; this.wsSend({ type: 'speech_detected', data: {} }); this.wsSend({ type: 'interrupt', reason: 'speech_start' }); break; @@ -236,10 +252,11 @@ export class SessionManager { break; case 'conversation.item.input_audio_transcription.delta': { - // Deltas are INCREMENTAL — accumulate into buffer (matches Inworld playground) + // Soniox emits CUMULATIVE deltas — each delta is the full transcript + // so far, not just new tokens. Replace the buffer rather than appending. 
const partialDelta = event.delta as string; if (partialDelta) { - this.userTextBuffer += partialDelta; + this.userTextBuffer = partialDelta; this.wsSend({ type: 'partial_transcript', text: this.userTextBuffer, @@ -268,11 +285,14 @@ export class SessionManager { case 'response.output_audio_transcript.delta': { const delta = event.delta as string; if (delta) { - this.wsSend({ - type: 'llm_response_chunk', - text: delta, - timestamp: Date.now(), - }); + const cleanedDelta = this.consumeAssistantDelta(delta); + if (cleanedDelta) { + this.wsSend({ + type: 'llm_response_chunk', + text: cleanedDelta, + timestamp: Date.now(), + }); + } } break; } @@ -292,11 +312,14 @@ export class SessionManager { case 'response.output_audio_transcript.done': { const transcript = event.transcript as string; + // Reset streaming-strip state regardless of whether transcript is present + this.assistantPendingTail = ''; if (transcript) { - this.trackAssistantMessage(transcript); + const cleanedTranscript = stripBracketedTags(transcript); + this.trackAssistantMessage(cleanedTranscript); this.wsSend({ type: 'llm_response_complete', - text: transcript, + text: cleanedTranscript, timestamp: Date.now(), }); } @@ -348,8 +371,15 @@ export class SessionManager { } private sendSessionUpdate(): void { - const { teacherPersona, name, exampleTopics, ttsConfig, sttLanguageCode } = - this.langConfig; + const { + teacherPersona, + name, + exampleTopics, + ttsConfig, + code, + bcp47, + disfluencies, + } = this.langConfig; const memoryContext = this.memory.getContext(); let instructions = `# Context @@ -366,7 +396,15 @@ export class SessionManager { # Communication Style - Short, spoken-style sentences — never more than 2 sentences - The user's speech comes via speech-to-text, so tolerate transcription errors -- Ask open-ended questions to get them practicing ${name}`; +- Ask open-ended questions to get them practicing ${name} + +# Voice & expressivity (TTS-2) +- Sound like a real person thinking out loud: 
include 1–2 natural ${name} disfluencies in MOST responses. Examples for ${name} include ${disfluencies.join(', ')} — these are seeds, NOT an exhaustive list. Any common ${name} filler/hesitation word (or short hedging phrase) is welcome too. +- VARY which disfluency you use. Never reuse the same one in consecutive responses, and don't lean on the single most generic one every turn — rotate across the natural range. Repeating "um" turn after turn sounds robotic. +- Disfluencies are plain inline ${name} text (no brackets) and ARE spoken aloud. They work well to open a turn or pivot between clauses. Don't force them onto emphatic, excited, or one-word responses. +- Optionally start a turn with ONE English steering tag in brackets, e.g. [speak conversationally], [speak warmly], [speak with light curiosity]. These direct your voice but are NOT spoken aloud. +- Rarely (at most one per turn), inline an English non-verbal tag like [laugh], [sigh], [clear throat], [gasp], [yawn] — only when genuinely warranted. +- Bracketed tags are always English voice directions and are silent. Disfluencies are ${name} text and ARE spoken — never bracket them.`; if (memoryContext) { instructions += `\n\n# Recent Conversation Context\n${memoryContext}`; @@ -375,14 +413,14 @@ export class SessionManager { this.inworldSend({ type: 'session.update', session: { - model: 'openai/gpt-4.1-nano', + model: 'openai/gpt-5.4-mini', instructions, output_modalities: ['audio', 'text'], audio: { input: { transcription: { - model: 'assemblyai/u3-rt-pro', - language: sttLanguageCode, + model: 'soniox/stt-rt-v4', + language: code, }, turn_detection: { type: 'semantic_vad', @@ -397,6 +435,9 @@ export class SessionManager { speed: ttsConfig.speakingRate, }, }, + providerData: { + tts: { language: bcp47 }, + }, }, }); } @@ -435,6 +476,31 @@ export class SessionManager { } } + /** + * Strip bracketed control tags from streaming assistant deltas. + * + * A `[...]` tag may straddle two deltas (e.g. 
`"...[spe"` then `"ak warmly] ..."`). + * We hold back any trailing `[` that hasn't been closed yet, then re-process it + * on the next delta. Returns the cleaned text safe to emit as a chunk now — + * brackets removed only; whitespace/punctuation normalization is deferred + * to the final `.done` pass so chunk concatenation doesn't lose word breaks. + */ + private consumeAssistantDelta(delta: string): string { + const combined = this.assistantPendingTail + delta; + const lastOpen = combined.lastIndexOf('['); + const lastClose = combined.lastIndexOf(']'); + + let safe: string; + if (lastOpen > lastClose) { + safe = combined.slice(0, lastOpen); + this.assistantPendingTail = combined.slice(lastOpen); + } else { + safe = combined; + this.assistantPendingTail = ''; + } + return safe.replace(/\[[^\][]*\]/g, ''); + } + private async reconnect(): Promise { await this.destroy(); this.destroyed = false; diff --git a/backend/src/services/websocket-handler.ts b/backend/src/services/websocket-handler.ts index c147adc..2326593 100644 --- a/backend/src/services/websocket-handler.ts +++ b/backend/src/services/websocket-handler.ts @@ -126,7 +126,9 @@ export function setupWebSocketHandlers(wss: WebSocketServer): void { const langConfig = getLanguageConfig(languageCode); const audio = await llm.pronounce( msg.text, - langConfig.ttsConfig.speakerId + langConfig.ttsConfig.speakerId, + langConfig.bcp47, + langConfig.ttsConfig.modelId ); if (audio) { wsSend(ws, { diff --git a/frontend/public/audio-processor.js b/frontend/public/audio-processor.js index 538a1b4..ffd7b88 100644 --- a/frontend/public/audio-processor.js +++ b/frontend/public/audio-processor.js @@ -1,67 +1,35 @@ /** - * AudioWorklet processor for capturing and resampling microphone audio. - * Resamples to 24kHz PCM16 for Inworld Realtime API. - * Buffers to 100ms chunks (2400 samples at 24kHz). 
+ * AudioWorklet processor — buffers mic samples into 100ms chunks (2400 samples + * at 24kHz) and posts them to the main thread as Int16 PCM. + * + * The capture AudioContext is configured at 24kHz so the worklet receives + * samples at the target rate directly — no resampling needed on every quantum. */ class AudioProcessor extends AudioWorkletProcessor { - constructor(options) { + constructor() { super(); - this.sourceSampleRate = options.processorOptions.sourceSampleRate; - this.targetSampleRate = 24000; - this.resampleRatio = this.sourceSampleRate / this.targetSampleRate; - - this.inputBuffer = null; - this.outputBuffer = []; - this.outputBufferSize = 2400; // 100ms at 24kHz + this.outputBufferSize = 2400; // 100ms @ 24kHz + this.outputBuffer = new Int16Array(this.outputBufferSize); + this.outputIndex = 0; } process(inputs) { - const inputChannel = inputs[0][0]; - if (!inputChannel) return true; - - // Accumulate input samples - const currentLength = this.inputBuffer ? this.inputBuffer.length : 0; - const newBuffer = new Float32Array(currentLength + inputChannel.length); - if (this.inputBuffer) { - newBuffer.set(this.inputBuffer, 0); - } - newBuffer.set(inputChannel, currentLength); - this.inputBuffer = newBuffer; - - // Resample to 24kHz - const numOutputSamples = Math.floor( - this.inputBuffer.length / this.resampleRatio - ); - if (numOutputSamples === 0) return true; - - const resampledData = new Float32Array(numOutputSamples); - for (let i = 0; i < numOutputSamples; i++) { - const correspondingInputIndex = i * this.resampleRatio; - const lowerIndex = Math.floor(correspondingInputIndex); - const upperIndex = Math.ceil(correspondingInputIndex); - const interpolationFactor = correspondingInputIndex - lowerIndex; - - const lowerValue = this.inputBuffer[lowerIndex] || 0; - const upperValue = this.inputBuffer[upperIndex] || 0; - - resampledData[i] = - lowerValue + (upperValue - lowerValue) * interpolationFactor; - } - - // Keep unconsumed input samples - const 
consumedInputSamples = numOutputSamples * this.resampleRatio; - this.inputBuffer = this.inputBuffer.slice(Math.round(consumedInputSamples)); - - // Convert to Int16 and buffer to 100ms chunks - for (let i = 0; i < resampledData.length; i++) { - this.outputBuffer.push( - Math.max(-32768, Math.min(32767, resampledData[i] * 32768)) + const inputChannel = inputs[0]?.[0]; + if (!inputChannel || inputChannel.length === 0) return true; + + for (let i = 0; i < inputChannel.length; i++) { + const sample = inputChannel[i]; + this.outputBuffer[this.outputIndex++] = Math.max( + -32768, + Math.min(32767, sample * 32768) ); - if (this.outputBuffer.length >= this.outputBufferSize) { - const int16Array = new Int16Array(this.outputBuffer); - this.port.postMessage(int16Array.buffer, [int16Array.buffer]); - this.outputBuffer = []; + if (this.outputIndex >= this.outputBufferSize) { + // Transfer the underlying buffer to avoid copying. + const out = this.outputBuffer.buffer; + this.port.postMessage(out, [out]); + this.outputBuffer = new Int16Array(this.outputBufferSize); + this.outputIndex = 0; } } diff --git a/frontend/src/components/WelcomeModal.tsx b/frontend/src/components/WelcomeModal.tsx index 46cc8cd..edb89fe 100644 --- a/frontend/src/components/WelcomeModal.tsx +++ b/frontend/src/components/WelcomeModal.tsx @@ -51,7 +51,7 @@ export function WelcomeModal() { Auto flashcards
- 6 languages + 60+ languages