inworld-ai · cshape · May 5, 2026 · May 5, 2026 · May 5, 2026 · May 5, 2026
@@ -21,7 +21,7 @@ Language learning app using Inworld Realtime API for voice conversations.
 ## Architecture
 Browser <-> our WebSocket <-> SessionManager <-> Inworld Realtime WebSocket (STT+LLM+TTS)
-- STT model: assemblyai/u3-rt-pro (with per-language hints)
+- STT model: soniox/stt-rt-v4 (with per-language hints)
-- LLM model: openai/gpt-4.1-nano (via Inworld Realtime)
+- LLM model: openai/gpt-5.4-mini (via Inworld Realtime)
 - SessionManager: one per client, manages Inworld WS lifecycle, forwards audio/text, handles greeting, tracks turns
 - InworldLLM: uses Inworld LLM Router (OpenAI-compatible) for flashcards, feedback, translation
 - TurnMemory: 5-turn sliding window, non-blocking Supabase persistence

@@ -48,13 +48,15 @@ describe('languages config', () => {
         expect(config.name).toBeTruthy();
         expect(config.nativeName).toBeTruthy();
         expect(config.flag).toBeTruthy();
-        expect(config.sttLanguageCode).toBeTruthy();
+        expect(config.bcp47).toBeTruthy();
         expect(config.ttsConfig).toBeDefined();
         expect(config.ttsConfig.speakerId).toBeTruthy();
         expect(config.ttsConfig.modelId).toBeTruthy();
         expect(config.teacherPersona).toBeDefined();
         expect(config.teacherPersona.name).toBeTruthy();
         expect(config.exampleTopics.length).toBeGreaterThan(0);
+        expect(Array.isArray(config.disfluencies)).toBe(true);
+        expect(config.disfluencies.length).toBeGreaterThanOrEqual(1);
       }
     });
   });

@@ -252,7 +252,7 @@ describe('InworldLLM', () => {
       );
 
       const body = JSON.parse(fetchSpy.mock.calls[0][1]!.body as string);
-      expect(body.model).toBe('openai/gpt-4.1-nano');
+      expect(body.model).toBe('openai/gpt-5.4-mini');
       expect(body.messages[0].role).toBe('user');
       expect(body.max_tokens).toBeDefined();
       expect(body.temperature).toBeDefined();
@@ -295,7 +295,12 @@ describe('InworldLLM', () => {
         json: async () => ({ audioContent: 'base64audiodata==' }),
       } as Response);
 
-      const audio = await llm.pronounce('Hola', 'Rafael');
+      const audio = await llm.pronounce(
+        'Hola',
+        'Rafael',
+        'es-MX',
+        'inworld-tts-2'
+      );
       expect(audio).toBe('base64audiodata==');
     });
 
@@ -305,7 +310,7 @@ describe('InworldLLM', () => {
         json: async () => ({ audioContent: 'audio' }),
       } as Response);
 
-      await llm.pronounce('perro', 'Rafael');
+      await llm.pronounce('perro', 'Rafael', 'es-MX', 'inworld-tts-2');
 
       expect(fetchSpy).toHaveBeenCalledWith(
         'https://api.inworld.ai/tts/v1/voice',
@@ -315,7 +320,8 @@ describe('InworldLLM', () => {
       const body = JSON.parse(fetchSpy.mock.calls[0][1]!.body as string);
       expect(body.text).toBe('perro');
       expect(body.voice_id).toBe('Rafael');
-      expect(body.model_id).toBe('inworld-tts-1.5-max');
+      expect(body.model_id).toBe('inworld-tts-2');
+      expect(body.language).toBe('es-MX');
       expect(body.audio_config.audio_encoding).toBe('LINEAR16');
       expect(body.audio_config.sample_rate_hertz).toBe(24000);
     });
@@ -326,15 +332,25 @@ describe('InworldLLM', () => {
         status: 500,
       } as Response);
 
-      const audio = await llm.pronounce('Hola', 'Rafael');
+      const audio = await llm.pronounce(
+        'Hola',
+        'Rafael',
+        'es-MX',
+        'inworld-tts-2'
+      );
       expect(audio).toBeNull();
     });
 
     it('should return null when no API key', async () => {
       process.env.INWORLD_API_KEY = '';
       const noKeyLlm = new InworldLLM();
 
-      const audio = await noKeyLlm.pronounce('Hola', 'Rafael');
+      const audio = await noKeyLlm.pronounce(
+        'Hola',
+        'Rafael',
+        'es-MX',
+        'inworld-tts-2'
+      );
       expect(audio).toBeNull();
     });
   });

@@ -1,6 +1,9 @@
 import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
 import WebSocket from 'ws';
-import { SessionManager } from '../services/session-manager.js';
+import {
+  SessionManager,
+  stripBracketedTags,
+} from '../services/session-manager.js';
 
 // Mock ws module so SessionManager doesn't make real connections
 vi.mock('ws', () => {
@@ -25,6 +28,40 @@ function createMockClientWs() {
   } as unknown as WebSocket;
 }
 
+describe('stripBracketedTags', () => { // Pins how stripBracketedTags removes [bracketed] tags and tidies the whitespace they leave behind.
+  it('removes a leading steering tag and trims', () => {
+    expect(stripBracketedTags('[speak warmly] Hello there')).toBe( // the space after the tag must not survive as leading whitespace
+      'Hello there'
+    );
+  });
+
+  it('removes inline non-verbal tags', () => {
+    expect(stripBracketedTags('That is funny [laugh] really')).toBe( // tag sits mid-sentence, between two spaces
+      'That is funny really'
+    );
+  });
+
+  it('collapses double spaces left by removed tags', () => {
+    expect(stripBracketedTags('one [tag] two')).toBe('one two'); // removing the tag alone would leave "one  two"
+  });
+
+  it('removes the space before punctuation when a tag preceded it', () => {
+    expect(stripBracketedTags('Hello [laugh] , how are you')).toBe( // must not come back as "Hello , how are you"
+      'Hello, how are you'
+    );
+  });
+
+  it('preserves disfluency text (no brackets) untouched', () => {
+    expect(stripBracketedTags('えーと、そうですね')).toBe('えーと、そうですね'); // input without brackets is returned unchanged
+  });
+
+  it('handles multiple bracketed tags in one string', () => {
+    expect(
+      stripBracketedTags('[speak gently] Pues [sigh] no sé qué decir') // one leading tag plus one inline tag
+    ).toBe('Pues no sé qué decir');
+  });
+});
+
 describe('SessionManager', () => {
   const originalEnv = process.env.INWORLD_API_KEY;
 
@@ -281,7 +318,7 @@ describe('SessionManager', () => {
   });
 
   describe('streaming STT events', () => {
-    it('should accumulate transcription deltas incrementally', () => {
+    it('should treat transcription deltas as cumulative (Soniox)', () => {
       const clientWs = createMockClientWs();
       const mgr = new SessionManager({
         sessionId: 'test-stt-1',
@@ -298,12 +335,12 @@ describe('SessionManager', () => {
 
       handler.call(mgr, {
         type: 'conversation.item.input_audio_transcription.delta',
-        delta: 'Hola, ',
+        delta: 'Hola',
       });
 
       handler.call(mgr, {
         type: 'conversation.item.input_audio_transcription.delta',
-        delta: 'me llamo Cale.',
+        delta: 'Hola, me llamo Cale.',
       });
 
       const sent = (clientWs as unknown as { _messages: string[] })._messages;
@@ -313,7 +350,7 @@ describe('SessionManager', () => {
           (m: Record<string, unknown>) => m.type === 'partial_transcript'
         );
       expect(partials).toHaveLength(2);
-      expect(partials[0].text).toBe('Hola, ');
+      expect(partials[0].text).toBe('Hola');
       expect(partials[1].text).toBe('Hola, me llamo Cale.');
     });
 
@@ -430,10 +467,112 @@ describe('SessionManager', () => {
       const sent = JSON.parse(mockInworldWs.send.mock.calls[0][0]);
       expect(sent.type).toBe('session.update');
       expect(sent.session.audio.input.transcription.model).toBe(
-        'assemblyai/u3-rt-pro'
+        'soniox/stt-rt-v4'
       );
-      expect(sent.session.audio.input.transcription.language).toBe('es-MX');
-      expect(sent.session.model).toBe('openai/gpt-4.1-nano');
+      expect(sent.session.audio.input.transcription.language).toBe('es');
+      expect(sent.session.providerData.tts.language).toBe('es-MX');
+      expect(sent.session.model).toBe('openai/gpt-5.4-mini');
+    });
+
+    it('should strip steering and non-verbal tags from completed assistant transcript', () => { // Feeds one transcript.done event and inspects what is forwarded to the client socket.
+      const clientWs = createMockClientWs();
+      const mgr = new SessionManager({
+        sessionId: 'test-strip-1',
+        ws: clientWs,
+        languageCode: 'ja',
+      });
+
+      const mgrAny = mgr as unknown as Record<string, unknown>; // loose view of the manager so the test can reach members not on its public type
+      mgrAny.sessionReady = true; // set the readiness flag directly instead of going through a real session handshake
+
+      const handler = mgrAny.handleInworldEvent as (
+        event: Record<string, unknown>
+      ) => void;
+      handler.call(mgr, { // invoke the handler with a synthetic event, bound to the manager instance
+        type: 'response.output_audio_transcript.done',
+        transcript:
+          '[speak gently] なるほど、忙しいですね。[laugh] 趣味はありますか？', // one leading tag and one inline tag around Japanese text
+      });
+
+      const sent = (clientWs as unknown as { _messages: string[] })._messages; // presumably the JSON strings recorded by the mock socket (confirm in createMockClientWs)
+      const completes = sent
+        .map((m) => JSON.parse(m))
+        .filter(
+          (m: Record<string, unknown>) => m.type === 'llm_response_complete'
+        );
+      expect(completes).toHaveLength(1); // exactly one completion message for the single done event
+      expect(completes[0].text).not.toContain('[');
+      expect(completes[0].text).not.toContain(']');
+      expect(completes[0].text).toContain('なるほど'); // spoken text on both sides of the tags must survive
+      expect(completes[0].text).toContain('趣味はありますか');
+    });
+
+    it('should strip a bracketed tag that straddles two streaming deltas', () => { // Sends one tag split across two delta events and checks the forwarded chunks.
+      const clientWs = createMockClientWs();
+      const mgr = new SessionManager({
+        sessionId: 'test-strip-2',
+        ws: clientWs,
+        languageCode: 'es',
+      });
+
+      const mgrAny = mgr as unknown as Record<string, unknown>; // loose view of the manager so the test can reach members not on its public type
+      mgrAny.sessionReady = true; // set the readiness flag directly instead of going through a real session handshake
+
+      const handler = mgrAny.handleInworldEvent as (
+        event: Record<string, unknown>
+      ) => void;
+
+      handler.call(mgr, {
+        type: 'response.output_audio_transcript.delta',
+        delta: 'Hola, [spe', // first delta ends in the middle of the "[speak warmly]" tag
+      });
+      handler.call(mgr, {
+        type: 'response.output_audio_transcript.delta',
+        delta: 'ak warmly] ¿qué tal?', // second delta carries the rest of the tag plus more speech
+      });
+
+      const sent = (clientWs as unknown as { _messages: string[] })._messages; // presumably the JSON strings recorded by the mock socket (confirm in createMockClientWs)
+      const chunks = sent
+        .map((m) => JSON.parse(m))
+        .filter(
+          (m: Record<string, unknown>) => m.type === 'llm_response_chunk'
+        );
+      const concatenated = chunks.map((c) => c.text).join(''); // assert on the joined text, so the test does not depend on how many chunks are emitted
+      expect(concatenated).not.toContain('[');
+      expect(concatenated).not.toContain(']');
+      expect(concatenated).toContain('Hola'); // text before and after the split tag must still reach the client
+      expect(concatenated).toContain('¿qué tal?');
+    });
+
+    it('should include TTS-2 expressivity guidance with steering, nonverbal, and target-language disfluencies', () => { // Captures the session.update payload and inspects its instructions string.
+      const clientWs = createMockClientWs();
+      const mgr = new SessionManager({
+        sessionId: 'test-expressivity-1',
+        ws: clientWs,
+        languageCode: 'es',
+      });
+
+      const mgrAny = mgr as unknown as Record<string, unknown>; // loose view of the manager so the test can reach members not on its public type
+      const mockInworldWs = {
+        readyState: 1, // 1 is the OPEN state in the WebSocket API
+        send: vi.fn(),
+        on: vi.fn(),
+        close: vi.fn(),
+      };
+      mgrAny.inworldWs = mockInworldWs; // swap in the stub so send() calls are recorded by the spy
+
+      const sendUpdate = mgrAny.sendSessionUpdate as () => void;
+      sendUpdate.call(mgr);
+
+      const sent = JSON.parse(mockInworldWs.send.mock.calls[0][0]); // first payload handed to the stubbed socket
+      const instructions = sent.session.instructions as string;
+
+      // Steering tag example
+      expect(instructions).toContain('[speak');
+      // Non-verbal tag
+      expect(instructions).toContain('[laugh]');
+      // Spanish disfluency from the seeded list; whole-word match so English words such as "tested" or "interested" cannot satisfy it
+      expect(instructions).toMatch(/\beste\b/);
     });
   });
 });