diff --git a/examples/250-agora-realtime-transcription-node/.env.example b/examples/250-agora-realtime-transcription-node/.env.example new file mode 100644 index 0000000..c4e27bc --- /dev/null +++ b/examples/250-agora-realtime-transcription-node/.env.example @@ -0,0 +1,6 @@ +# Deepgram — https://console.deepgram.com/ +DEEPGRAM_API_KEY= + +# Agora — https://console.agora.io/ +AGORA_APP_ID= +AGORA_APP_CERTIFICATE= diff --git a/examples/250-agora-realtime-transcription-node/README.md b/examples/250-agora-realtime-transcription-node/README.md new file mode 100644 index 0000000..7077e8e --- /dev/null +++ b/examples/250-agora-realtime-transcription-node/README.md @@ -0,0 +1,57 @@ +# Agora Real-Time Audio Transcription + +Transcribe live audio from an Agora RTC channel in real-time using Deepgram's streaming speech-to-text API. Participants join a voice/video channel and see live captions as they speak, with speaker diarization to identify who said what. + +## What you'll build + +A Node.js server that generates Agora RTC tokens, serves a browser-based UI where users join an Agora channel, captures microphone audio from the Agora session, streams it to Deepgram for real-time transcription, and displays live captions with speaker labels. + +## Prerequisites + +- Node.js 18+ +- Deepgram account — [get a free API key](https://console.deepgram.com/) +- Agora account — [sign up](https://console.agora.io/) + +## Environment variables + +| Variable | Where to find it | +|----------|-----------------| +| `DEEPGRAM_API_KEY` | [Deepgram console](https://console.deepgram.com/) | +| `AGORA_APP_ID` | [Agora console](https://console.agora.io/) → Project Management → App ID | +| `AGORA_APP_CERTIFICATE` | [Agora console](https://console.agora.io/) → Project Management → App Certificate (enable if not active) | + +Copy `.env.example` to `.env` and fill in your values. 
+ +## Install and run + +```bash +npm install +cp .env.example .env +# Fill in your API keys in .env +npm start +``` + +Open `http://localhost:3000` in your browser, enter a channel name, and click "Join Channel". Speak into your microphone and watch transcripts appear in real-time. + +## Key parameters + +| Parameter | Value | Description | +|-----------|-------|-------------| +| `model` | `nova-3` | Deepgram's most accurate general-purpose STT model | +| `encoding` | `linear16` | 16-bit signed PCM — captured from the browser's AudioContext | +| `sample_rate` | `16000` | 16 kHz sample rate for high-quality speech recognition | +| `diarize` | `true` | Enables speaker labels to distinguish channel participants | +| `interim_results` | `true` | Shows partial transcripts while the speaker is still talking | + +## How it works + +1. The browser requests an Agora RTC token from `POST /api/token` — the server generates it using the App Certificate (never exposed to the client) +2. The browser joins the Agora channel using the Agora Web SDK, publishes its microphone audio, and subscribes to remote participants +3. An AudioContext captures the local microphone track, converts float32 samples to signed 16-bit PCM at 16 kHz, and sends binary frames over a WebSocket to `/transcribe` +4. The Node.js server receives audio frames and forwards them to a Deepgram live STT connection +5. Deepgram returns interim and final transcript events with speaker labels, which the server relays back to the browser +6. 
The browser displays live captions overlaid on the video area and appends final transcripts to a scrolling log panel + +## Starter templates + +[deepgram-starters](https://github.com/orgs/deepgram-starters/repositories) diff --git a/examples/250-agora-realtime-transcription-node/package.json b/examples/250-agora-realtime-transcription-node/package.json new file mode 100644 index 0000000..5c56b85 --- /dev/null +++ b/examples/250-agora-realtime-transcription-node/package.json @@ -0,0 +1,21 @@ +{ + "name": "deepgram-agora-realtime-transcription", + "version": "1.0.0", + "description": "Transcribe Agora RTC channel audio in real-time using Deepgram live STT", + "main": "src/server.js", + "scripts": { + "start": "node src/server.js", + "test": "node tests/test.js" + }, + "dependencies": { + "@deepgram/sdk": "5.0.0", + "agora-token": "^2.0.5", + "dotenv": "^16.4.0", + "express": "^4.21.0", + "express-ws": "^5.0.2", + "ws": "^8.18.0" + }, + "engines": { + "node": ">=18" + } +} diff --git a/examples/250-agora-realtime-transcription-node/src/public/index.html b/examples/250-agora-realtime-transcription-node/src/public/index.html new file mode 100644 index 0000000..5d02d31 --- /dev/null +++ b/examples/250-agora-realtime-transcription-node/src/public/index.html @@ -0,0 +1,242 @@ + + + + + + Agora + Deepgram Live Transcription + + + +
+

Agora + Deepgram

+ Live STT + Ready +
+ + + +
+
+
+
+
+
Join an Agora channel to begin real-time transcription
+
+
+
+
+

Transcript

+
+
+
+ + + + + + diff --git a/examples/250-agora-realtime-transcription-node/src/server.js b/examples/250-agora-realtime-transcription-node/src/server.js new file mode 100644 index 0000000..c542b39 --- /dev/null +++ b/examples/250-agora-realtime-transcription-node/src/server.js @@ -0,0 +1,190 @@ +'use strict'; + +require('dotenv').config({ path: require('path').join(__dirname, '..', '.env') }); + +const express = require('express'); +const expressWs = require('express-ws'); +const path = require('path'); +const { DeepgramClient } = require('@deepgram/sdk'); +const { RtcTokenBuilder, RtcRole } = require('agora-token'); + +const PORT = process.env.PORT || 3000; + +// Browser captures mic audio via AudioContext at 16 kHz, converts float32 +// to signed 16-bit PCM, and sends binary frames over the WebSocket. +const DEEPGRAM_LIVE_OPTIONS = { + model: 'nova-3', + encoding: 'linear16', + sample_rate: 16000, + channels: 1, + smart_format: true, + interim_results: true, + utterance_end_ms: 1500, + punctuate: true, + diarize: true, // ← THIS enables speaker labels for multi-participant channels + tag: 'deepgram-examples', +}; + +function createApp() { + const app = express(); + expressWs(app); + app.use(express.json()); + + if (!process.env.DEEPGRAM_API_KEY) { + console.error('Error: DEEPGRAM_API_KEY environment variable is not set.'); + console.error('Copy .env.example to .env and add your API key.'); + process.exit(1); + } + if (!process.env.AGORA_APP_ID) { + console.error('Error: AGORA_APP_ID environment variable is not set.'); + console.error('Copy .env.example to .env and add your Agora App ID.'); + process.exit(1); + } + if (!process.env.AGORA_APP_CERTIFICATE) { + console.error('Error: AGORA_APP_CERTIFICATE environment variable is not set.'); + console.error('Copy .env.example to .env and add your Agora App Certificate.'); + process.exit(1); + } + + const deepgram = new DeepgramClient({ apiKey: process.env.DEEPGRAM_API_KEY }); + + app.use(express.static(path.join(__dirname, 
'public')));
+
+  // POST /api/token — mint a short-lived Agora RTC token so the browser can
+  // join a channel without the App Certificate ever leaving the server.
+  //
+  // Body: { channel: string (required), uid?: number | numeric string }
+  // 200:  { token, appId, channel, uid }
+  // 400:  { error } when `channel` is missing.
+  app.post('/api/token', (req, res) => {
+    const { channel, uid } = req.body || {};
+    if (!channel) {
+      return res.status(400).json({ error: 'channel is required' });
+    }
+
+    // NOTE(review): Number(uid) yields NaN for a non-numeric uid and the token
+    // would be built with it; also any falsy uid (0, '') falls back to 0 —
+    // presumably 0 acts as a wildcard uid in Agora. Confirm against the
+    // agora-token docs and consider validating uid before building.
+    const numericUid = uid ? Number(uid) : 0;
+    const TOKEN_EXPIRE_SECS = 3600;     // token lifetime: 1 hour
+    const PRIVILEGE_EXPIRE_SECS = 3600; // publish privilege lifetime: 1 hour
+
+    // RtcRole.PUBLISHER lets the user send and receive audio/video
+    const token = RtcTokenBuilder.buildTokenWithUid(
+      process.env.AGORA_APP_ID,
+      process.env.AGORA_APP_CERTIFICATE,
+      channel,
+      numericUid,
+      RtcRole.PUBLISHER,
+      TOKEN_EXPIRE_SECS,
+      PRIVILEGE_EXPIRE_SECS,
+    );
+
+    res.json({ token, appId: process.env.AGORA_APP_ID, channel, uid: numericUid });
+    console.log(`[token] Generated for channel="${channel}" uid=${numericUid}`);
+  });
+
+  // WebSocket endpoint: browser streams PCM audio here, server forwards to Deepgram.
+  // Keeps the Deepgram API key server-side while the browser handles
+  // the Agora RTC connection and audio capture.
+  app.ws('/transcribe', (clientWs) => {
+    // Per-client bridge: browser WebSocket ⇄ one Deepgram live connection.
+    let dgConnection = null;   // Deepgram live handle, created in the async IIFE below
+    let dgReady = false;       // true once Deepgram's 'open' event has fired
+    const mediaQueue = [];     // audio frames received before Deepgram is ready
+
+    console.log('[ws] Client connected for transcription');
+
+    clientWs.on('message', (raw) => {
+      // Binary frames are raw PCM audio; text frames are JSON control messages.
+      // NOTE(review): with ws v8 text frames can also arrive as Buffers, which
+      // would route JSON control messages into the audio path — confirm the ws
+      // version resolved by express-ws delivers strings for text frames.
+      if (typeof raw !== 'string' && Buffer.isBuffer(raw)) {
+        if (dgReady && dgConnection) {
+          try { dgConnection.sendMedia(raw); } catch {} // best-effort: drop frame on failure
+        } else {
+          mediaQueue.push(raw); // buffer until the Deepgram connection opens
+        }
+        return;
+      }
+
+      try {
+        const msg = JSON.parse(raw);
+        if (msg.type === 'stop') {
+          console.log('[ws] Client requested stop');
+          if (dgConnection) {
+            // Ask Deepgram to flush pending final transcripts, then close.
+            try { dgConnection.sendCloseStream({ type: 'CloseStream' }); } catch {}
+            try { dgConnection.close(); } catch {}
+          }
+        }
+      } catch {} // ignore malformed control messages
+    });
+
+    clientWs.on('close', () => {
+      console.log('[ws] Client disconnected');
+      if (dgConnection) {
+        try { dgConnection.sendCloseStream({ type: 'CloseStream' }); } catch {}
+        try { dgConnection.close(); } catch {}
+        dgConnection = null;
+      }
+    });
+
+    clientWs.on('error', (err) => {
+      console.error('[ws] Client error:', err.message);
+      if (dgConnection) {
+        try { dgConnection.close(); } catch {}
+        dgConnection = null;
+      }
+    });
+
+    // Open the Deepgram live connection asynchronously; audio arriving in the
+    // meantime accumulates in mediaQueue and is flushed on 'open'.
+    // NOTE(review): the handle is awaited from connect() and connect()/
+    // waitForOpen() are also invoked afterwards — verify this double-connect
+    // sequence against the @deepgram/sdk v5 live API before shipping.
+    (async () => {
+      dgConnection = await deepgram.listen.v1.connect(DEEPGRAM_LIVE_OPTIONS);
+
+      dgConnection.on('open', () => {
+        console.log('[deepgram] Connection opened');
+        dgReady = true;
+        // Flush audio that arrived before the connection was ready.
+        for (const buf of mediaQueue) {
+          try { dgConnection.sendMedia(buf); } catch {}
+        }
+        mediaQueue.length = 0;
+      });
+
+      dgConnection.on('error', (err) => {
+        console.error('[deepgram] Error:', err.message);
+        dgReady = false;
+      });
+
+      dgConnection.on('close', () => {
+        console.log('[deepgram] Connection closed');
+        dgReady = false;
+      });
+
+      dgConnection.on('message', (data) => {
+        // Relay transcript text (with the first word's diarized speaker label)
+        // back to the browser as JSON.
+        const transcript = data?.channel?.alternatives?.[0]?.transcript;
+        if (transcript) {
+          const isFinal = data.is_final;
+          const tag = isFinal ?
'final' : 'interim'; + const speaker = data?.channel?.alternatives?.[0]?.words?.[0]?.speaker; + console.log(`[${tag}] ${transcript}`); + + if (clientWs.readyState === 1) { + clientWs.send(JSON.stringify({ transcript, is_final: isFinal, speaker })); + } + } + }); + + dgConnection.connect(); + await dgConnection.waitForOpen(); + })().catch((err) => { + console.error('[deepgram] Setup failed:', err.message); + }); + }); + + app.get('/api/health', (_req, res) => { + res.json({ status: 'ok', service: 'deepgram-agora-realtime-transcription' }); + }); + + return app; +} + +if (require.main === module) { + const app = createApp(); + app.listen(PORT, () => { + console.log(`Server listening on port ${PORT}`); + console.log(` POST /api/token — Generate Agora RTC token`); + console.log(` WS /transcribe — Audio streaming WebSocket`); + console.log(` GET /api/health — Health check`); + console.log(`\nOpen http://localhost:${PORT} in your browser`); + }); +} + +module.exports = { createApp }; diff --git a/examples/250-agora-realtime-transcription-node/tests/test.js b/examples/250-agora-realtime-transcription-node/tests/test.js new file mode 100644 index 0000000..4759a98 --- /dev/null +++ b/examples/250-agora-realtime-transcription-node/tests/test.js @@ -0,0 +1,253 @@ +'use strict'; + +const fs = require('fs'); +const path = require('path'); +const { execSync } = require('child_process'); +const WebSocket = require('ws'); + +// ── Credential check — MUST be first ────────────────────────────────────── +const required = fs.readFileSync(path.join(__dirname, '..', '.env.example'), 'utf8') + .split('\n').filter(l => /^[A-Z][A-Z0-9_]+=/.test(l.trim())).map(l => l.split('=')[0].trim()); +const missing = required.filter(k => !process.env[k]); +if (missing.length > 0) { + console.error(`MISSING_CREDENTIALS: ${missing.join(',')}`); + process.exit(2); +} +// ────────────────────────────────────────────────────────────────────────── + +const { createApp } = require('../src/server.js'); + 
+const PORT = 3099;                                  // off the default 3000 so a running dev server does not clash
+const AUDIO_URL = 'https://dpgr.am/spacewalk.wav';  // known-content sample; transcript keywords asserted in test 5
+const TMP_WAV = '/tmp/agora_test.wav';
+
+// ── Test 1: Required files exist ──────────────────────────────────────────
+// Sanity-checks the example's file layout before any network-dependent tests.
+function testFileStructure() {
+  const root = path.join(__dirname, '..');
+  const requiredFiles = [
+    '.env.example',
+    'package.json',
+    'README.md',
+    'src/server.js',
+    'src/public/index.html',
+    'tests/test.js',
+  ];
+  for (const f of requiredFiles) {
+    if (!fs.existsSync(path.join(root, f))) {
+      throw new Error(`Missing required file: ${f}`);
+    }
+  }
+  console.log('File structure verified');
+}
+
+// ── Test 2: Agora token generation ────────────────────────────────────────
+// Builds a token directly with agora-token to verify the credentials work
+// independently of the HTTP endpoint.
+function testAgoraTokenGeneration() {
+  const { RtcTokenBuilder, RtcRole } = require('agora-token');
+  const token = RtcTokenBuilder.buildTokenWithUid(
+    process.env.AGORA_APP_ID,
+    process.env.AGORA_APP_CERTIFICATE,
+    'ci-test-channel',
+    0,
+    RtcRole.PUBLISHER,
+    3600,
+    3600,
+  );
+  // Agora tokens are long opaque strings; a very short result signals failure.
+  if (!token || token.length < 20) {
+    throw new Error('Agora token generation failed');
+  }
+  console.log('Agora token generation verified');
+}
+
+// ── Test 3: Server module loads and exports createApp ──────────────────────
+function testServerModule() {
+  if (typeof createApp !== 'function') {
+    throw new Error('server.js does not export createApp function');
+  }
+  console.log('Server module exports verified');
+}
+
+// ── Test 4: Token endpoint returns valid response ─────────────────────────
+// Boots the real app on PORT and exercises /api/token and /api/health using
+// the global fetch (available in Node 18+).
+async function testTokenEndpoint() {
+  const app = createApp();
+  const server = app.listen(PORT);
+  await new Promise(r => server.on('listening', r));
+
+  try {
+    const res = await fetch(`http://localhost:${PORT}/api/token`, {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify({ channel: 'ci-test', uid: 12345 }),
+    });
+
+    if (res.status !== 200) throw new Error(`/api/token returned ${res.status}`);
+
+    const data = await res.json();
+    if (!data.token) throw new
Error('Response missing token'); + if (!data.appId) throw new Error('Response missing appId'); + if (data.channel !== 'ci-test') throw new Error('Response channel mismatch'); + if (data.uid !== 12345) throw new Error('Response uid mismatch'); + + console.log('POST /api/token verified'); + + const healthRes = await fetch(`http://localhost:${PORT}/api/health`); + if (healthRes.status !== 200) throw new Error(`/api/health returned ${healthRes.status}`); + const healthData = await healthRes.json(); + if (healthData.status !== 'ok') throw new Error('Health check status not ok'); + + console.log('GET /api/health verified'); + } finally { + server.close(); + } +} + +// ── Test 5: Deepgram live STT with real audio via server WebSocket ────────── +async function testDeepgramLiveTranscription() { + const app = createApp(); + const server = app.listen(PORT + 1); + await new Promise(r => server.on('listening', r)); + + console.log('Downloading test audio...'); + execSync(`curl -s -L -o "${TMP_WAV}" "${AUDIO_URL}"`, { stdio: 'pipe' }); + + const wavBuffer = fs.readFileSync(TMP_WAV); + + let offset = 12; + let dataStart = 0; + let sampleRate = 0; + let bitsPerSample = 0; + let numChannels = 0; + let dataSize = 0; + while (offset < wavBuffer.length - 8) { + const chunkId = wavBuffer.toString('ascii', offset, offset + 4); + const chunkSize = wavBuffer.readUInt32LE(offset + 4); + if (chunkId === 'fmt ') { + numChannels = wavBuffer.readUInt16LE(offset + 10); + sampleRate = wavBuffer.readUInt32LE(offset + 12); + bitsPerSample = wavBuffer.readUInt16LE(offset + 22); + } else if (chunkId === 'data') { + dataStart = offset + 8; + dataSize = chunkSize; + break; + } + offset += 8 + chunkSize; + } + if (!dataStart) throw new Error('Invalid WAV: no data chunk'); + + const bytesPerSample = bitsPerSample / 8; + const totalSamples = Math.floor(dataSize / (bytesPerSample * numChannels)); + const ratio = sampleRate / 16000; + const outLen = Math.floor(totalSamples / ratio); + const pcm16 = 
Buffer.alloc(outLen * 2); + + for (let i = 0; i < outLen; i++) { + const srcIdx = Math.floor(i * ratio); + const byteOff = dataStart + srcIdx * bytesPerSample * numChannels; + let sample; + if (bitsPerSample === 16) { + sample = wavBuffer.readInt16LE(byteOff); + } else if (bitsPerSample === 24) { + sample = (wavBuffer[byteOff] | (wavBuffer[byteOff + 1] << 8) | (wavBuffer[byteOff + 2] << 16)); + if (sample & 0x800000) sample |= ~0xFFFFFF; + sample = sample >> 8; + } else if (bitsPerSample === 32) { + sample = wavBuffer.readInt32LE(byteOff) >> 16; + } else { + sample = (wavBuffer[byteOff] - 128) << 8; + } + pcm16.writeInt16LE(sample, i * 2); + } + + console.log(`Audio ready: ${pcm16.length} bytes of linear16 16 kHz`); + + const transcripts = []; + + return new Promise((resolve, reject) => { + const timeout = setTimeout(() => { + server.close(); + reject(new Error('Timed out (30s) waiting for Deepgram transcript.')); + }, 30_000); + + const ws = new WebSocket(`ws://localhost:${PORT + 1}/transcribe`); + + ws.on('error', (err) => { + clearTimeout(timeout); + server.close(); + reject(new Error(`WebSocket error: ${err.message}`)); + }); + + ws.on('message', (raw) => { + try { + const data = JSON.parse(raw.toString()); + if (data.transcript) { + const tag = data.is_final ? 
'final' : 'interim'; + console.log(`[${tag}] ${data.transcript}`); + transcripts.push(data.transcript); + } + } catch {} + }); + + ws.on('open', () => { + console.log('[test] Connected — streaming audio...'); + + const CHUNK_BYTES = 640; + const MAX_BYTES = 16000 * 2 * 5; + let pos = 0; + + const sendChunk = () => { + if (ws.readyState !== WebSocket.OPEN) return; + if (pos >= pcm16.length || pos >= MAX_BYTES) { + console.log('[test] Audio sent — waiting for final results...'); + ws.send(JSON.stringify({ type: 'stop' })); + setTimeout(() => { + try { ws.close(); } catch {} + }, 2000); + return; + } + ws.send(pcm16.subarray(pos, pos + CHUNK_BYTES)); + pos += CHUNK_BYTES; + setTimeout(sendChunk, 20); + }; + + sendChunk(); + }); + + ws.on('close', () => { + clearTimeout(timeout); + setTimeout(() => { + server.close(); + + if (transcripts.length === 0) { + reject(new Error('No transcripts received from Deepgram.')); + return; + } + + const combined = transcripts.join(' ').toLowerCase(); + const expectedWords = ['spacewalk', 'astronaut', 'nasa']; + const found = expectedWords.filter(w => combined.includes(w)); + + if (found.length === 0) { + reject(new Error( + `Transcripts arrived but no expected words found.\nGot: ${transcripts.slice(0, 3).join(' | ')}` + )); + return; + } + + console.log(`\nTranscript content verified (found: ${found.join(', ')})`); + resolve(transcripts); + }, 1000); + }); + }); +} + +// ── Main ────────────────────────────────────────────────────────────────────── +async function run() { + testFileStructure(); + testServerModule(); + testAgoraTokenGeneration(); + await testTokenEndpoint(); + await testDeepgramLiveTranscription(); +} + +run() + .then(() => { console.log('\nAll tests passed'); process.exit(0); }) + .catch(err => { console.error(`\nTest failed: ${err.message}`); process.exit(1); });