Real-time Voice Features
Earna AI Console implements real-time voice conversations using the GPT-4o Realtime API for natural dialogue and OpenAI TTS for high-quality speech synthesis.
Overview
The voice system provides:
- Real-time bidirectional audio streaming
- Natural conversation with interruption handling
- Multiple voice options and personalities
- Low-latency WebSocket connections
- Audio transcription and synthesis
- Browser-based audio capture and playback
Architecture
Setup
Configure OpenAI API
Add the following to your .env.local:
# OpenAI Configuration (Required for voice)
OPENAI_API_KEY=sk-...
NEXT_PUBLIC_OPENAI_API_KEY=sk-... # Local development only; in production, use the server-side session route (see Security Considerations)
# Optional: Custom endpoints
OPENAI_BASE_URL=https://api.openai.com/v1
REALTIME_API_URL=wss://api.openai.com/v1/realtime
Enable Microphone Access
The browser will request microphone permissions on first use. Ensure HTTPS is enabled in production.
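If you want to request the permission up front (for example, to show your own prompt before the native dialog), a small sketch is shown below; the requestMicrophone helper name is illustrative and not part of the codebase.
// Hypothetical helper: request microphone access up front and surface errors
export async function requestMicrophone(): Promise<MediaStream | null> {
  try {
    // Triggers the browser permission prompt on first call (HTTPS required in production)
    return await navigator.mediaDevices.getUserMedia({ audio: true });
  } catch (error) {
    if (error instanceof DOMException && error.name === 'NotAllowedError') {
      console.warn('Microphone permission denied by the user');
    } else {
      console.error('Microphone unavailable:', error);
    }
    return null;
  }
}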
Test Audio Setup
# Test TTS generation
curl -X POST http://localhost:3000/api/tts \
-H "Content-Type: application/json" \
-d '{"text": "Hello, this is a test", "voice": "alloy"}'
# Should return audio/mpeg data
GPT-4o Realtime API
WebSocket Connection
Client Setup
// app/components/chat/voice-mode-realtime.tsx
import { useEffect, useRef, useState } from 'react';
export function VoiceModeRealtime() {
const wsRef = useRef<WebSocket | null>(null);
const [isConnected, setIsConnected] = useState(false);
const [isRecording, setIsRecording] = useState(false);
const connectWebSocket = async () => {
try {
// Get ephemeral key from server
const response = await fetch('/api/realtime-session', {
method: 'POST',
});
const { key, url } = await response.json();
// Connect to the GPT-4o Realtime API.
// Browser WebSockets cannot set custom headers, so the key is passed
// via WebSocket subprotocols (the pattern OpenAI documents for browser clients).
const ws = new WebSocket(
`${url}?model=gpt-4o-realtime-preview`,
[
'realtime',
`openai-insecure-api-key.${key}`,
'openai-beta.realtime-v1',
]
);
ws.onopen = () => {
console.log('Connected to GPT-4o Realtime API');
setIsConnected(true);
// Configure session
ws.send(JSON.stringify({
type: 'session.update',
session: {
modalities: ['text', 'audio'],
instructions: 'You are a helpful AI assistant.',
voice: 'alloy',
input_audio_format: 'pcm16',
output_audio_format: 'pcm16',
input_audio_transcription: {
model: 'whisper-1'
},
turn_detection: {
type: 'server_vad',
threshold: 0.5,
prefix_padding_ms: 300,
silence_duration_ms: 500
},
temperature: 0.8,
max_response_output_tokens: 4096
}
}));
};
ws.onmessage = handleRealtimeMessage;
ws.onerror = (error) => {
console.error('WebSocket error:', error);
setIsConnected(false);
};
ws.onclose = () => {
console.log('Disconnected from GPT-4o Realtime API');
setIsConnected(false);
};
wsRef.current = ws;
} catch (error) {
console.error('Failed to connect:', error);
}
};
return (
<div className="voice-mode-container">
<button onClick={connectWebSocket} disabled={isConnected}>
{isConnected ? 'Connected' : 'Connect Voice'}
</button>
</div>
);
}
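The component above assigns ws.onmessage to a handleRealtimeMessage function that is not defined in this guide. A minimal sketch, dispatching on the standard Realtime API event types, might look like the following; adapt the console calls to your own UI state.
// Minimal sketch of a Realtime event handler
function handleRealtimeMessage(event: MessageEvent) {
  const message = JSON.parse(event.data);

  switch (message.type) {
    case 'conversation.item.input_audio_transcription.completed':
      // Whisper transcription of the user's speech
      console.log('User said:', message.transcript);
      break;
    case 'response.audio_transcript.delta':
      // Incremental transcript of the assistant's spoken reply
      console.log('Assistant (partial):', message.delta);
      break;
    case 'response.audio.delta':
      // Base64-encoded PCM16 audio chunk; decode and queue for playback
      // (see the AudioPlayer usage example later in this section)
      break;
    case 'response.done':
      console.log('Response complete');
      break;
    case 'error':
      console.error('Realtime API error:', message.error);
      break;
  }
}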
Audio Capture & Streaming
// app/components/chat/audio-capture.tsx
import { useRef } from 'react';
export function AudioCapture({ onAudioData }: { onAudioData: (data: ArrayBuffer) => void }) {
const streamRef = useRef<MediaStream | null>(null);
const processorRef = useRef<ScriptProcessorNode | null>(null);
const audioContextRef = useRef<AudioContext | null>(null);
const startRecording = async () => {
try {
const stream = await navigator.mediaDevices.getUserMedia({
audio: {
channelCount: 1,
sampleRate: 24000,
echoCancellation: true,
noiseSuppression: true,
}
});
streamRef.current = stream;
// Create an audio context for processing
// (ScriptProcessorNode is deprecated but simple; AudioWorklet is the modern replacement)
audioContextRef.current = new AudioContext({ sampleRate: 24000 });
const source = audioContextRef.current.createMediaStreamSource(stream);
const processor = audioContextRef.current.createScriptProcessor(4096, 1, 1);
processorRef.current = processor;
processor.onaudioprocess = (e) => {
const inputData = e.inputBuffer.getChannelData(0);
// Convert Float32Array to PCM16
const pcm16 = new Int16Array(inputData.length);
for (let i = 0; i < inputData.length; i++) {
const s = Math.max(-1, Math.min(1, inputData[i]));
pcm16[i] = s < 0 ? s * 0x8000 : s * 0x7FFF;
}
onAudioData(pcm16.buffer);
};
source.connect(processor);
processor.connect(audioContextRef.current.destination);
} catch (error) {
console.error('Failed to start recording:', error);
}
};
const stopRecording = () => {
// Stop the microphone tracks and tear down the audio graph
streamRef.current?.getTracks().forEach((track) => track.stop());
processorRef.current?.disconnect();
audioContextRef.current?.close();
};
return (
<div>
<button onClick={startRecording}>Start Recording</button>
<button onClick={stopRecording}>Stop Recording</button>
</div>
);
}
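To stream the captured PCM16 chunks to the Realtime API, each buffer is base64-encoded and sent as an input_audio_buffer.append event. A minimal sketch is shown below, assuming the wsRef from the VoiceModeRealtime component earlier; with server VAD enabled, no manual commit event is required.
// Helper: convert an ArrayBuffer of PCM16 samples to base64 for the Realtime API
function arrayBufferToBase64(buffer: ArrayBuffer): string {
  const bytes = new Uint8Array(buffer);
  let binary = '';
  for (let i = 0; i < bytes.length; i++) {
    binary += String.fromCharCode(bytes[i]);
  }
  return btoa(binary);
}

// Called from AudioCapture's onAudioData callback
function sendAudioChunk(ws: WebSocket, chunk: ArrayBuffer) {
  ws.send(JSON.stringify({
    type: 'input_audio_buffer.append',
    audio: arrayBufferToBase64(chunk),
  }));
}

// Usage: <AudioCapture onAudioData={(data) => wsRef.current && sendAudioChunk(wsRef.current, data)} />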
Audio Playback
// app/components/chat/audio-playback.tsx
export class AudioPlayer {
private audioContext: AudioContext;
private audioQueue: AudioBuffer[] = [];
private isPlaying = false;
constructor() {
this.audioContext = new AudioContext();
}
async playPCM16(pcm16Data: ArrayBuffer) {
// Convert PCM16 to Float32
const pcm16 = new Int16Array(pcm16Data);
const float32 = new Float32Array(pcm16.length);
for (let i = 0; i < pcm16.length; i++) {
float32[i] = pcm16[i] / (pcm16[i] < 0 ? 0x8000 : 0x7FFF);
}
// Create audio buffer
const audioBuffer = this.audioContext.createBuffer(
1, // mono
float32.length, // frame count
24000 // sample rate
);
audioBuffer.getChannelData(0).set(float32);
// Add to queue
this.audioQueue.push(audioBuffer);
// Start playback if not already playing
if (!this.isPlaying) {
this.playNextInQueue();
}
}
private async playNextInQueue() {
if (this.audioQueue.length === 0) {
this.isPlaying = false;
return;
}
this.isPlaying = true;
const audioBuffer = this.audioQueue.shift()!;
const source = this.audioContext.createBufferSource();
source.buffer = audioBuffer;
source.connect(this.audioContext.destination);
source.onended = () => {
this.playNextInQueue();
};
source.start();
}
stop() {
this.audioQueue = [];
this.isPlaying = false;
}
}
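In the Realtime message handler, response.audio.delta events carry base64-encoded PCM16 that can be decoded and fed straight into this player. A short usage sketch (the handleAudioDelta name is illustrative):
const player = new AudioPlayer();

// Called for 'response.audio.delta' events inside the message handler
function handleAudioDelta(base64Audio: string) {
  // Decode base64 into raw PCM16 bytes
  const binary = atob(base64Audio);
  const bytes = new Uint8Array(binary.length);
  for (let i = 0; i < binary.length; i++) {
    bytes[i] = binary.charCodeAt(i);
  }
  player.playPCM16(bytes.buffer);
}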
Text-to-Speech (TTS)
API Implementation
// app/api/tts/route.ts
import OpenAI from 'openai';
import { NextResponse } from 'next/server';
const openai = new OpenAI({
apiKey: process.env.OPENAI_API_KEY,
});
export async function POST(req: Request) {
try {
const { text, voice = 'alloy', speed = 1.0 } = await req.json();
if (!text) {
return NextResponse.json(
{ error: 'Text is required' },
{ status: 400 }
);
}
// Generate speech
const response = await openai.audio.speech.create({
model: 'tts-1',
voice: voice as any,
input: text,
speed,
});
// Get audio data
const buffer = Buffer.from(await response.arrayBuffer());
// Return audio response
return new Response(buffer, {
headers: {
'Content-Type': 'audio/mpeg',
'Content-Length': buffer.length.toString(),
},
});
} catch (error) {
console.error('TTS generation failed:', error);
return NextResponse.json(
{ error: 'Failed to generate speech' },
{ status: 500 }
);
}
}
Client Integration
// app/components/chat/message-assistant-tts.tsx
export function MessageWithTTS({ content }: { content: string }) {
const [isPlaying, setIsPlaying] = useState(false);
const [audioUrl, setAudioUrl] = useState<string | null>(null);
const audioRef = useRef<HTMLAudioElement>(null);
const generateSpeech = async () => {
try {
const response = await fetch('/api/tts', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
text: content,
voice: 'nova', // or 'alloy', 'echo', 'fable', 'onyx', 'shimmer'
speed: 1.0,
}),
});
if (!response.ok) throw new Error('TTS failed');
const blob = await response.blob();
const url = URL.createObjectURL(blob);
setAudioUrl(url);
// Auto-play
if (audioRef.current) {
audioRef.current.src = url;
audioRef.current.play();
setIsPlaying(true);
}
} catch (error) {
console.error('Failed to generate speech:', error);
}
};
return (
<div className="message-with-tts">
<div className="message-content">{content}</div>
<div className="tts-controls">
<button onClick={generateSpeech} disabled={isPlaying}>
<SpeakerIcon />
</button>
<audio
ref={audioRef}
onEnded={() => setIsPlaying(false)}
controls
hidden
/>
</div>
</div>
);
}
Voice Configuration
Available Voices
The OpenAI TTS and Realtime APIs provide six distinct voices, each with characteristics suited to different use cases.
| Voice | Description | Best For |
|---|---|---|
| alloy | Neutral and balanced | General purpose |
| nova | Warm and friendly | Customer service |
| echo | Smooth and professional | Business applications |
| fable | Expressive and dynamic | Storytelling |
| onyx | Deep and authoritative | News and documentation |
| shimmer | Soft and gentle | Meditation and relaxation |
Voice Settings
// Voice configuration interface
interface VoiceSettings {
voice: 'alloy' | 'nova' | 'echo' | 'fable' | 'onyx' | 'shimmer';
speed: number; // 0.25 to 4.0
temperature: number; // 0.0 to 1.0 (Realtime only)
format: 'mp3' | 'opus' | 'aac' | 'flac' | 'wav' | 'pcm';
}
// Example presets
const voicePresets = {
professional: {
voice: 'echo',
speed: 0.95,
temperature: 0.3,
},
conversational: {
voice: 'nova',
speed: 1.0,
temperature: 0.7,
},
educational: {
voice: 'fable',
speed: 0.9,
temperature: 0.5,
},
};
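A preset can then be applied when calling the TTS route. The helper below is a sketch (speakWithPreset is an illustrative name, and temperature is omitted because it only applies to the Realtime API):
// Hypothetical helper: synthesize speech for a message using a named preset
async function speakWithPreset(text: string, preset: keyof typeof voicePresets) {
  const { voice, speed } = voicePresets[preset];
  const response = await fetch('/api/tts', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ text, voice, speed }),
  });
  if (!response.ok) throw new Error('TTS request failed');
  // Return an object URL suitable for an <audio> element
  return URL.createObjectURL(await response.blob());
}

// Usage: const url = await speakWithPreset('Welcome back!', 'conversational');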
Whisper Transcription
// app/api/whisper/route.ts
import OpenAI from 'openai';
import { NextResponse } from 'next/server';
const openai = new OpenAI({
apiKey: process.env.OPENAI_API_KEY,
});
export async function POST(req: Request) {
try {
const formData = await req.formData();
const audioFile = formData.get('audio') as File;
if (!audioFile) {
return NextResponse.json(
{ error: 'Audio file required' },
{ status: 400 }
);
}
// Transcribe with Whisper
const transcription = await openai.audio.transcriptions.create({
file: audioFile,
model: 'whisper-1',
language: 'en',
response_format: 'json',
temperature: 0,
});
return NextResponse.json({
text: transcription.text,
});
} catch (error) {
console.error('Transcription failed:', error);
return NextResponse.json(
{ error: 'Failed to transcribe audio' },
{ status: 500 }
);
}
}
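On the client, a recorded clip can be posted to this route as multipart form data under the audio field. The sketch below uses MediaRecorder with webm/opus, one of the formats Whisper accepts; the transcribeClip helper name is illustrative.
// Record a short clip with MediaRecorder and send it to /api/whisper
async function transcribeClip(stream: MediaStream, durationMs = 5000): Promise<string> {
  const recorder = new MediaRecorder(stream, { mimeType: 'audio/webm' });
  const chunks: Blob[] = [];
  recorder.ondataavailable = (e) => chunks.push(e.data);

  const stopped = new Promise<void>((resolve) => { recorder.onstop = () => resolve(); });
  recorder.start();
  await new Promise((resolve) => setTimeout(resolve, durationMs));
  recorder.stop();
  await stopped;

  // Upload under the 'audio' field expected by the route above
  const formData = new FormData();
  formData.append('audio', new File(chunks, 'clip.webm', { type: 'audio/webm' }));

  const response = await fetch('/api/whisper', { method: 'POST', body: formData });
  const { text } = await response.json();
  return text;
}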
Complete Voice Mode Implementation
Voice Mode Component
// app/components/chat/voice-mode.tsx
export function VoiceMode() {
const [mode, setMode] = useState<'idle' | 'listening' | 'thinking' | 'speaking'>('idle');
const {
startConversation,
stopConversation,
isConnected,
transcript,
response
} = useRealtimeConversation();
return (
<div className="voice-mode">
<div className="status-indicator">
{mode === 'idle' && <IdleIcon />}
{mode === 'listening' && <MicIcon className="animate-pulse" />}
{mode === 'thinking' && <LoadingIcon className="animate-spin" />}
{mode === 'speaking' && <SpeakerIcon className="animate-pulse" />}
</div>
<div className="transcript">
{transcript && (
<div className="user-speech">{transcript}</div>
)}
{response && (
<div className="ai-response">{response}</div>
)}
</div>
<button
onClick={isConnected ? stopConversation : startConversation}
className={cn(
'voice-button',
isConnected && 'active'
)}
>
{isConnected ? 'Stop' : 'Start'} Conversation
</button>
</div>
);
}
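This component assumes a useRealtimeConversation hook that is not shown elsewhere in this guide. A minimal sketch, reusing the session route and subprotocol connection pattern from the Client Setup section, could look like this:
// Hypothetical hook backing the VoiceMode component above
import { useCallback, useRef, useState } from 'react';

export function useRealtimeConversation() {
  const wsRef = useRef<WebSocket | null>(null);
  const [isConnected, setIsConnected] = useState(false);
  const [transcript, setTranscript] = useState('');
  const [response, setResponse] = useState('');

  const startConversation = useCallback(async () => {
    // Fetch an ephemeral key from the server-side session route
    const res = await fetch('/api/realtime-session', { method: 'POST' });
    const { key, url } = await res.json();

    const ws = new WebSocket(`${url}?model=gpt-4o-realtime-preview`, [
      'realtime',
      `openai-insecure-api-key.${key}`,
      'openai-beta.realtime-v1',
    ]);

    ws.onopen = () => setIsConnected(true);
    ws.onclose = () => setIsConnected(false);
    ws.onmessage = (event) => {
      const message = JSON.parse(event.data);
      if (message.type === 'conversation.item.input_audio_transcription.completed') {
        setTranscript(message.transcript);
      } else if (message.type === 'response.audio_transcript.delta') {
        setResponse((prev) => prev + message.delta);
      }
    };
    wsRef.current = ws;
  }, []);

  const stopConversation = useCallback(() => {
    wsRef.current?.close();
    wsRef.current = null;
  }, []);

  return { startConversation, stopConversation, isConnected, transcript, response };
}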
Performance Optimization
Audio Buffer Management
// Optimize audio buffering for smooth playback
// (getBufferDuration, startPlayback, and addToBuffer are elided here;
// they wrap the same queueing logic as the AudioPlayer class above)
class OptimizedAudioPlayer {
private bufferThreshold = 0.5; // seconds
private maxBufferSize = 5.0; // seconds
private isPlaying = false;
async play(audioData: ArrayBuffer) {
const currentBufferDuration = this.getBufferDuration();
// Drop audio if buffer is too full
if (currentBufferDuration > this.maxBufferSize) {
console.warn('Audio buffer overflow, dropping frames');
return;
}
// Start playback when enough is buffered
if (!this.isPlaying && currentBufferDuration > this.bufferThreshold) {
this.startPlayback();
}
this.addToBuffer(audioData);
}
}
Latency Reduction
// Reduce latency with optimized settings
const realtimeConfig = {
// Audio settings for low latency
audio: {
sampleRate: 24000, // Lower sample rate
channelCount: 1, // Mono audio
echoCancellation: true,
noiseSuppression: true,
autoGainControl: true,
},
// WebSocket settings
websocket: {
binaryType: 'arraybuffer',
reconnectInterval: 1000,
maxReconnectAttempts: 3,
},
// Voice activity detection
vad: {
threshold: 0.5,
prefixPadding: 300, // ms before speech
suffixPadding: 500, // ms after speech
},
};
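The reconnectInterval and maxReconnectAttempts settings above imply a reconnect wrapper, which is not shown in this guide. A minimal sketch follows, assuming a connect function that resolves to the newly created WebSocket:
// Simple reconnect-with-retry wrapper around a WebSocket factory
function withReconnect(
  connect: () => Promise<WebSocket>,
  { reconnectInterval = 1000, maxReconnectAttempts = 3 } = {}
) {
  let attempts = 0;

  const open = async (): Promise<WebSocket> => {
    const ws = await connect();
    ws.addEventListener('open', () => { attempts = 0; });
    ws.addEventListener('close', () => {
      if (attempts < maxReconnectAttempts) {
        attempts += 1;
        // Back off linearly between attempts
        setTimeout(open, reconnectInterval * attempts);
      }
    });
    return ws;
  };

  return open();
}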
Troubleshooting
Common Issues
| Issue | Cause | Solution |
|---|---|---|
| No audio output | Browser autoplay policy | Require user interaction before playing audio |
| High latency | Network conditions | Use closer servers, reduce audio quality |
| Echo/feedback | Speaker bleeding into mic | Use headphones or echo cancellation |
| Choppy audio | Buffer underrun | Increase buffer threshold |
| WebSocket disconnects | Token expiration | Implement automatic reconnection |
Debug Logging
// Enable debug logging for voice mode
const DEBUG = process.env.NODE_ENV === 'development';
function debugLog(category: string, message: any) {
if (DEBUG) {
console.log(`[Voice:${category}]`, message);
}
}
// Log audio metrics
setInterval(() => {
if (DEBUG && audioContext) {
debugLog('Metrics', {
currentTime: audioContext.currentTime,
state: audioContext.state,
sampleRate: audioContext.sampleRate,
latency: audioContext.baseLatency,
});
}
}, 5000);
Security Considerations
Always validate and sanitize audio input. Never expose API keys in client code.
- API Key Security: Use server-side session creation (see the route sketch after this list)
- Audio Validation: Check file size and format
- Rate Limiting: Limit API calls per user
- Content Filtering: Monitor for inappropriate content
- Privacy: Inform users about audio recording
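The client code earlier fetches an ephemeral key from /api/realtime-session, which is not shown above. A minimal server-side sketch, assuming the Realtime sessions endpoint (POST /v1/realtime/sessions) for minting short-lived client secrets:
// app/api/realtime-session/route.ts (sketch)
import { NextResponse } from 'next/server';

export async function POST() {
  // Mint a short-lived client secret so the browser never sees the real API key
  const response = await fetch('https://api.openai.com/v1/realtime/sessions', {
    method: 'POST',
    headers: {
      'Authorization': `Bearer ${process.env.OPENAI_API_KEY}`,
      'Content-Type': 'application/json',
      'OpenAI-Beta': 'realtime=v1',
    },
    body: JSON.stringify({
      model: 'gpt-4o-realtime-preview',
      voice: 'alloy',
    }),
  });

  if (!response.ok) {
    return NextResponse.json({ error: 'Failed to create session' }, { status: 500 });
  }

  const session = await response.json();
  // Shape matches what the client destructures: { key, url }
  return NextResponse.json({
    key: session.client_secret.value,
    url: 'wss://api.openai.com/v1/realtime',
  });
}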
Next Steps
- WebRTC Guide - Advanced WebRTC configuration
- API Reference - Complete API documentation
- Security - Audio security best practices