Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ Language learning app using Inworld Realtime API for voice conversations.
## Architecture
Browser <-> our WebSocket <-> SessionManager <-> Inworld Realtime WebSocket (STT+LLM+TTS)
- STT model: soniox/stt-rt-v4 (with per-language hints)
- LLM model: openai/gpt-4.1-nano (via Inworld Realtime)
- LLM model: openai/gpt-5.4-mini (via Inworld Realtime)
- SessionManager: one per client, manages Inworld WS lifecycle, forwards audio/text, handles greeting, tracks turns
- InworldLLM: uses Inworld LLM Router (OpenAI-compatible) for flashcards, feedback, translation
- TurnMemory: 5-turn sliding window, non-blocking Supabase persistence
Expand Down
4 changes: 3 additions & 1 deletion backend/src/__tests__/config/languages.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -48,13 +48,15 @@ describe('languages config', () => {
expect(config.name).toBeTruthy();
expect(config.nativeName).toBeTruthy();
expect(config.flag).toBeTruthy();
expect(config.sttLanguageCode).toBeTruthy();
expect(config.bcp47).toBeTruthy();
expect(config.ttsConfig).toBeDefined();
expect(config.ttsConfig.speakerId).toBeTruthy();
expect(config.ttsConfig.modelId).toBeTruthy();
expect(config.teacherPersona).toBeDefined();
expect(config.teacherPersona.name).toBeTruthy();
expect(config.exampleTopics.length).toBeGreaterThan(0);
expect(Array.isArray(config.disfluencies)).toBe(true);
expect(config.disfluencies.length).toBeGreaterThanOrEqual(1);
}
});
});
Expand Down
28 changes: 22 additions & 6 deletions backend/src/__tests__/inworld-llm.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@ describe('InworldLLM', () => {
);

const body = JSON.parse(fetchSpy.mock.calls[0][1]!.body as string);
expect(body.model).toBe('openai/gpt-4.1-nano');
expect(body.model).toBe('openai/gpt-5.4-mini');
expect(body.messages[0].role).toBe('user');
expect(body.max_tokens).toBeDefined();
expect(body.temperature).toBeDefined();
Expand Down Expand Up @@ -295,7 +295,12 @@ describe('InworldLLM', () => {
json: async () => ({ audioContent: 'base64audiodata==' }),
} as Response);

const audio = await llm.pronounce('Hola', 'Rafael');
const audio = await llm.pronounce(
'Hola',
'Rafael',
'es-MX',
'inworld-tts-2'
);
expect(audio).toBe('base64audiodata==');
});

Expand All @@ -305,7 +310,7 @@ describe('InworldLLM', () => {
json: async () => ({ audioContent: 'audio' }),
} as Response);

await llm.pronounce('perro', 'Rafael');
await llm.pronounce('perro', 'Rafael', 'es-MX', 'inworld-tts-2');

expect(fetchSpy).toHaveBeenCalledWith(
'https://api.inworld.ai/tts/v1/voice',
Expand All @@ -315,7 +320,8 @@ describe('InworldLLM', () => {
const body = JSON.parse(fetchSpy.mock.calls[0][1]!.body as string);
expect(body.text).toBe('perro');
expect(body.voice_id).toBe('Rafael');
expect(body.model_id).toBe('inworld-tts-1.5-max');
expect(body.model_id).toBe('inworld-tts-2');
expect(body.language).toBe('es-MX');
expect(body.audio_config.audio_encoding).toBe('LINEAR16');
expect(body.audio_config.sample_rate_hertz).toBe(24000);
});
Expand All @@ -326,15 +332,25 @@ describe('InworldLLM', () => {
status: 500,
} as Response);

const audio = await llm.pronounce('Hola', 'Rafael');
const audio = await llm.pronounce(
'Hola',
'Rafael',
'es-MX',
'inworld-tts-2'
);
expect(audio).toBeNull();
});

it('should return null when no API key', async () => {
process.env.INWORLD_API_KEY = '';
const noKeyLlm = new InworldLLM();

const audio = await noKeyLlm.pronounce('Hola', 'Rafael');
const audio = await noKeyLlm.pronounce(
'Hola',
'Rafael',
'es-MX',
'inworld-tts-2'
);
expect(audio).toBeNull();
});
});
Expand Down
155 changes: 147 additions & 8 deletions backend/src/__tests__/session-manager.test.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
import WebSocket from 'ws';
import { SessionManager } from '../services/session-manager.js';
import {
SessionManager,
stripBracketedTags,
} from '../services/session-manager.js';

// Mock ws module so SessionManager doesn't make real connections
vi.mock('ws', () => {
Expand All @@ -25,6 +28,40 @@ function createMockClientWs() {
} as unknown as WebSocket;
}

/**
 * Table-driven checks for `stripBracketedTags`.
 *
 * Each row pins one expectation about how square-bracketed tags are removed
 * from a string and how the surrounding whitespace/punctuation is tidied up.
 */
describe('stripBracketedTags', () => {
  // Row layout: [test title, raw input, expected output].
  const scenarios: ReadonlyArray<readonly [string, string, string]> = [
    [
      'removes a leading steering tag and trims',
      '[speak warmly] Hello there',
      'Hello there',
    ],
    [
      'removes inline non-verbal tags',
      'That is funny [laugh] really',
      'That is funny really',
    ],
    [
      'collapses double spaces left by removed tags',
      'one [tag] two',
      'one two',
    ],
    [
      'removes the space before punctuation when a tag preceded it',
      'Hello [laugh] , how are you',
      'Hello, how are you',
    ],
    [
      // Input without any brackets must come back unchanged.
      'preserves disfluency text (no brackets) untouched',
      'えーと、そうですね',
      'えーと、そうですね',
    ],
    [
      'handles multiple bracketed tags in one string',
      '[speak gently] Pues [sigh] no sé qué decir',
      'Pues no sé qué decir',
    ],
  ];

  // Register one test per row, in declaration order.
  for (const [title, input, expected] of scenarios) {
    it(title, () => {
      expect(stripBracketedTags(input)).toBe(expected);
    });
  }
});

describe('SessionManager', () => {
const originalEnv = process.env.INWORLD_API_KEY;

Expand Down Expand Up @@ -281,7 +318,7 @@ describe('SessionManager', () => {
});

describe('streaming STT events', () => {
it('should accumulate transcription deltas incrementally', () => {
it('should treat transcription deltas as cumulative (Soniox)', () => {
const clientWs = createMockClientWs();
const mgr = new SessionManager({
sessionId: 'test-stt-1',
Expand All @@ -298,12 +335,12 @@ describe('SessionManager', () => {

handler.call(mgr, {
type: 'conversation.item.input_audio_transcription.delta',
delta: 'Hola, ',
delta: 'Hola',
});

handler.call(mgr, {
type: 'conversation.item.input_audio_transcription.delta',
delta: 'me llamo Cale.',
delta: 'Hola, me llamo Cale.',
});

const sent = (clientWs as unknown as { _messages: string[] })._messages;
Expand All @@ -313,7 +350,7 @@ describe('SessionManager', () => {
(m: Record<string, unknown>) => m.type === 'partial_transcript'
);
expect(partials).toHaveLength(2);
expect(partials[0].text).toBe('Hola, ');
expect(partials[0].text).toBe('Hola');
expect(partials[1].text).toBe('Hola, me llamo Cale.');
});

Expand Down Expand Up @@ -430,10 +467,112 @@ describe('SessionManager', () => {
const sent = JSON.parse(mockInworldWs.send.mock.calls[0][0]);
expect(sent.type).toBe('session.update');
expect(sent.session.audio.input.transcription.model).toBe(
'assemblyai/u3-rt-pro'
'soniox/stt-rt-v4'
);
expect(sent.session.audio.input.transcription.language).toBe('es-MX');
expect(sent.session.model).toBe('openai/gpt-4.1-nano');
expect(sent.session.audio.input.transcription.language).toBe('es');
expect(sent.session.providerData.tts.language).toBe('es-MX');
expect(sent.session.model).toBe('openai/gpt-5.4-mini');
});

it('should strip steering and non-verbal tags from completed assistant transcript', () => {
  const socket = createMockClientWs();
  const manager = new SessionManager({
    sessionId: 'test-strip-1',
    ws: socket,
    languageCode: 'ja',
  });

  // Loosely-typed view of the manager so the test can set `sessionReady`
  // and invoke `handleInworldEvent` directly.
  const internals = manager as unknown as Record<string, unknown>;
  internals.sessionReady = true;

  const dispatch = internals.handleInworldEvent as (
    event: Record<string, unknown>
  ) => void;

  // Final transcript carrying one leading tag and one inline tag.
  const doneEvent: Record<string, unknown> = {
    type: 'response.output_audio_transcript.done',
    transcript:
      '[speak gently] なるほど、忙しいですね。[laugh] 趣味はありますか?',
  };
  dispatch.call(manager, doneEvent);

  // Everything the mock client socket recorded, decoded from JSON.
  const outbound = (socket as unknown as { _messages: string[] })._messages;
  const decoded = outbound.map((raw) => JSON.parse(raw));
  const completes = decoded.filter(
    (msg: Record<string, unknown>) => msg.type === 'llm_response_complete'
  );

  expect(completes).toHaveLength(1);

  // No bracket characters may survive; the spoken content must remain.
  const finalText = completes[0].text;
  expect(finalText).not.toContain('[');
  expect(finalText).not.toContain(']');
  expect(finalText).toContain('なるほど');
  expect(finalText).toContain('趣味はありますか');
});

it('should strip a bracketed tag that straddles two streaming deltas', () => {
  const socket = createMockClientWs();
  const manager = new SessionManager({
    sessionId: 'test-strip-2',
    ws: socket,
    languageCode: 'es',
  });

  // Loosely-typed view of the manager so the test can set `sessionReady`
  // and invoke `handleInworldEvent` directly.
  const internals = manager as unknown as Record<string, unknown>;
  internals.sessionReady = true;

  const dispatch = internals.handleInworldEvent as (
    event: Record<string, unknown>
  ) => void;

  // The tag "[speak warmly]" is split across the two deltas on purpose.
  const fragments = ['Hola, [spe', 'ak warmly] ¿qué tal?'];
  for (const fragment of fragments) {
    dispatch.call(manager, {
      type: 'response.output_audio_transcript.delta',
      delta: fragment,
    });
  }

  // Collect every chunk message the mock client socket recorded.
  const outbound = (socket as unknown as { _messages: string[] })._messages;
  const decoded = outbound.map((raw) => JSON.parse(raw));
  const chunks = decoded.filter(
    (msg: Record<string, unknown>) => msg.type === 'llm_response_chunk'
  );

  // Judge the stream as a whole: glue the chunk texts back together.
  const pieces = chunks.map((chunk) => chunk.text);
  const concatenated = pieces.join('');

  expect(concatenated).not.toContain('[');
  expect(concatenated).not.toContain(']');
  expect(concatenated).toContain('Hola');
  expect(concatenated).toContain('¿qué tal?');
});

it('should include TTS-2 expressivity guidance with steering, nonverbal, and target-language disfluencies', () => {
  const socket = createMockClientWs();
  const manager = new SessionManager({
    sessionId: 'test-expressivity-1',
    ws: socket,
    languageCode: 'es',
  });

  // Loosely-typed view of the manager so the test can swap in a fake
  // upstream socket and call `sendSessionUpdate` directly.
  const internals = manager as unknown as Record<string, unknown>;

  // Stand-in for the Inworld socket; `send` is a spy so the payload can be read back.
  const fakeUpstream = {
    readyState: 1,
    send: vi.fn(),
    on: vi.fn(),
    close: vi.fn(),
  };
  internals.inworldWs = fakeUpstream;

  const triggerUpdate = internals.sendSessionUpdate as () => void;
  triggerUpdate.call(manager);

  // First argument of the first `send` call is the serialized session update.
  const firstPayload = fakeUpstream.send.mock.calls[0][0];
  const sent = JSON.parse(firstPayload);
  const instructions = sent.session.instructions as string;

  // Steering tag example
  expect(instructions).toContain('[speak');
  // Non-verbal tag
  expect(instructions).toContain('[laugh]');
  // Spanish disfluency from the seeded list
  expect(instructions).toContain('este');
});
});
});
Loading
Loading