From 343ab464d56d4af2ae21f9caf8ae54fb94f18062 Mon Sep 17 00:00:00 2001 From: Forrest Blount Date: Tue, 10 Mar 2026 17:10:20 +0000 Subject: [PATCH 01/17] voice-call: add OpenAI Realtime API voice mode --- Dockerfile | 2 +- docker-compose.yml | 1 + extensions/voice-call/openclaw.plugin.json | 73 ++ extensions/voice-call/src/config.ts | 98 ++ .../src/providers/openai-realtime-voice.ts | 863 ++++++++++++++++++ extensions/voice-call/src/runtime.ts | 17 + extensions/voice-call/src/test-fixtures.ts | 1 + extensions/voice-call/src/webhook.ts | 40 +- .../src/webhook/realtime-handler.ts | 289 ++++++ 9 files changed, 1381 insertions(+), 3 deletions(-) create mode 100755 extensions/voice-call/src/providers/openai-realtime-voice.ts create mode 100644 extensions/voice-call/src/webhook/realtime-handler.ts diff --git a/Dockerfile b/Dockerfile index 72c413ebe7b..dd703988b11 100644 --- a/Dockerfile +++ b/Dockerfile @@ -88,7 +88,7 @@ RUN pnpm canvas:a2ui:bundle || \ echo "/* A2UI bundle unavailable in this build */" > src/canvas-host/a2ui/a2ui.bundle.js && \ echo "stub" > src/canvas-host/a2ui/.bundle.hash && \ rm -rf vendor/a2ui apps/shared/OpenClawKit/Tools/CanvasA2UI) -RUN pnpm build:docker +RUN NODE_OPTIONS=--max-old-space-size=2048 pnpm build:docker # Force pnpm for UI build (Bun may fail on ARM/Synology architectures) ENV OPENCLAW_PREFER_PNPM=1 RUN pnpm ui:build diff --git a/docker-compose.yml b/docker-compose.yml index c0bffc64458..53b4726b556 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -24,6 +24,7 @@ services: ports: - "${OPENCLAW_GATEWAY_PORT:-18789}:18789" - "${OPENCLAW_BRIDGE_PORT:-18790}:18790" + - "${OPENCLAW_VOICE_WEBHOOK_PORT:-3334}:3334" init: true restart: unless-stopped command: diff --git a/extensions/voice-call/openclaw.plugin.json b/extensions/voice-call/openclaw.plugin.json index fef3ccc6ad9..83f2613530d 100644 --- a/extensions/voice-call/openclaw.plugin.json +++ b/extensions/voice-call/openclaw.plugin.json @@ -134,6 +134,36 @@ "label": 
"ElevenLabs Base URL", "advanced": true }, + "realtime.enabled": { + "label": "Enable Realtime Voice (OpenAI)", + "help": "Full voice-to-voice mode via OpenAI Realtime API. Requires inboundPolicy open or allowlist." + }, + "realtime.model": { + "label": "Realtime Model", + "advanced": true + }, + "realtime.voice": { + "label": "Realtime Voice" + }, + "realtime.instructions": { + "label": "Realtime Instructions" + }, + "realtime.temperature": { + "label": "Realtime Temperature", + "advanced": true + }, + "realtime.vadThreshold": { + "label": "VAD Sensitivity", + "advanced": true + }, + "realtime.silenceDurationMs": { + "label": "Silence Timeout (ms)", + "advanced": true + }, + "realtime.prefixPaddingMs": { + "label": "Prefix Padding (ms)", + "advanced": true + }, "publicUrl": { "label": "Public Webhook URL", "advanced": true @@ -385,6 +415,49 @@ } } }, + "realtime": { + "type": "object", + "additionalProperties": false, + "properties": { + "enabled": { + "type": "boolean" + }, + "model": { + "type": "string" + }, + "voice": { + "type": "string", + "enum": ["alloy", "ash", "ballad", "cedar", "coral", "echo", "marin", "sage", "shimmer", "verse"] + }, + "instructions": { + "type": "string" + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 + }, + "vadThreshold": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "silenceDurationMs": { + "type": "integer", + "minimum": 1 + }, + "prefixPaddingMs": { + "type": "integer", + "minimum": 0 + }, + "tools": { + "type": "array", + "items": { + "type": "object" + } + } + } + }, "publicUrl": { "type": "string" }, diff --git a/extensions/voice-call/src/config.ts b/extensions/voice-call/src/config.ts index 2d1494c7876..7d75c23ac29 100644 --- a/extensions/voice-call/src/config.ts +++ b/extensions/voice-call/src/config.ts @@ -200,6 +200,55 @@ export const OutboundConfigSchema = z .default({ defaultMode: "notify", notifyHangupDelaySec: 3 }); export type OutboundConfig = z.infer; +// 
----------------------------------------------------------------------------- +// Realtime Voice Configuration (OpenAI Realtime API — voice-to-voice) +// ----------------------------------------------------------------------------- + +/** + * Zod schema for a single OpenAI Realtime tool (mirrors the RealtimeTool interface + * in providers/openai-realtime-voice.ts, expressed as a schema for config parsing). + */ +export const RealtimeToolSchema = z + .object({ + type: z.literal("function"), + name: z.string(), + description: z.string(), + parameters: z.object({ + type: z.literal("object"), + properties: z.record(z.unknown()), + required: z.array(z.string()).optional(), + }), + }) + .strict(); +export type RealtimeToolConfig = z.infer; + +export const VoiceCallRealtimeConfigSchema = z + .object({ + /** Enable realtime voice-to-voice mode (OpenAI Realtime API). Default: false. */ + enabled: z.boolean().default(false), + /** Realtime model (env: REALTIME_VOICE_MODEL, default: "gpt-4o-mini-realtime-preview") */ + model: z.string().optional(), + /** Voice for AI speech output (env: REALTIME_VOICE_VOICE) */ + voice: z + .enum(["alloy", "ash", "ballad", "cedar", "coral", "echo", "marin", "sage", "shimmer", "verse"]) + .optional(), + /** System instructions / persona (env: REALTIME_VOICE_INSTRUCTIONS) */ + instructions: z.string().optional(), + /** Temperature 0–2 (env: REALTIME_VOICE_TEMPERATURE) */ + temperature: z.number().min(0).max(2).optional(), + /** VAD threshold 0–1 (env: VAD_THRESHOLD) */ + vadThreshold: z.number().min(0).max(1).optional(), + /** Silence duration in ms before turn ends (env: SILENCE_DURATION_MS) */ + silenceDurationMs: z.number().int().positive().optional(), + /** Audio padding before speech in ms */ + prefixPaddingMs: z.number().int().nonnegative().optional(), + /** Tool definitions (OpenAI function-call schema); execution wired via registerToolHandler */ + tools: z.array(RealtimeToolSchema).default([]), + }) + .strict() + .default({ enabled: 
false, tools: [] }); +export type VoiceCallRealtimeConfig = z.infer; + // ----------------------------------------------------------------------------- // Streaming Configuration (OpenAI Realtime STT) // ----------------------------------------------------------------------------- @@ -324,6 +373,9 @@ export const VoiceCallConfigSchema = z /** Real-time audio streaming configuration */ streaming: VoiceCallStreamingConfigSchema, + /** Realtime voice-to-voice configuration (OpenAI Realtime API) */ + realtime: VoiceCallRealtimeConfigSchema, + /** Public webhook URL override (if set, bypasses tunnel auto-detection) */ publicUrl: z.string().url().optional(), @@ -398,6 +450,11 @@ export function normalizeVoiceCallConfig(config: VoiceCallConfigInput): VoiceCal config.webhookSecurity?.trustedProxyIPs ?? defaults.webhookSecurity.trustedProxyIPs, }, streaming: { ...defaults.streaming, ...config.streaming }, + realtime: { + ...defaults.realtime, + ...config.realtime, + tools: config.realtime?.tools ?? defaults.realtime.tools, + }, stt: { ...defaults.stt, ...config.stt }, tts: normalizeVoiceCallTtsConfig(defaults.tts, config.tts), }; @@ -453,6 +510,28 @@ export function resolveVoiceCallConfig(config: VoiceCallConfigInput): VoiceCallC resolved.webhookSecurity.trustForwardingHeaders ?? false; resolved.webhookSecurity.trustedProxyIPs = resolved.webhookSecurity.trustedProxyIPs ?? []; + // Realtime voice — resolve env var fallbacks + resolved.realtime = { ...resolved.realtime }; + // REALTIME_VOICE_ENABLED=true auto-enables realtime mode (backward compat) + if (!resolved.realtime.enabled && process.env.REALTIME_VOICE_ENABLED === "true") { + resolved.realtime.enabled = true; + } + resolved.realtime.model = resolved.realtime.model ?? process.env.REALTIME_VOICE_MODEL; + resolved.realtime.voice = + (resolved.realtime.voice ?? + (process.env.REALTIME_VOICE_VOICE as VoiceCallRealtimeConfig["voice"])) || undefined; + resolved.realtime.instructions = + resolved.realtime.instructions ?? 
process.env.REALTIME_VOICE_INSTRUCTIONS; + if (resolved.realtime.temperature == null && process.env.REALTIME_VOICE_TEMPERATURE) { + resolved.realtime.temperature = parseFloat(process.env.REALTIME_VOICE_TEMPERATURE); + } + if (resolved.realtime.vadThreshold == null && process.env.VAD_THRESHOLD) { + resolved.realtime.vadThreshold = parseFloat(process.env.VAD_THRESHOLD); + } + if (resolved.realtime.silenceDurationMs == null && process.env.SILENCE_DURATION_MS) { + resolved.realtime.silenceDurationMs = parseInt(process.env.SILENCE_DURATION_MS, 10); + } + return normalizeVoiceCallConfig(resolved); } @@ -521,5 +600,24 @@ export function validateProviderConfig(config: VoiceCallConfig): { } } + // Realtime mode requires inbound calls to be accepted — policy "disabled" + // means the manager will reject every call before it can be tracked. + // "open" or "allowlist" are the correct choices when realtime.enabled = true. + if (config.realtime?.enabled && config.inboundPolicy === "disabled") { + errors.push( + "plugins.entries.voice-call.config.inboundPolicy must not be \"disabled\" when realtime.enabled is true " + + "(use \"open\" or \"allowlist\" — realtime calls are answered before policy can reject them)", + ); + } + + // Both streaming and realtime cannot be enabled simultaneously — they use + // incompatible WebSocket paths and audio routing. 
+ if (config.realtime?.enabled && config.streaming?.enabled) { + errors.push( + "plugins.entries.voice-call.config: realtime.enabled and streaming.enabled cannot both be true " + + "(they use incompatible audio paths — choose one mode)", + ); + } + return { valid: errors.length === 0, errors }; } diff --git a/extensions/voice-call/src/providers/openai-realtime-voice.ts b/extensions/voice-call/src/providers/openai-realtime-voice.ts new file mode 100755 index 00000000000..ed5b10d6cd6 --- /dev/null +++ b/extensions/voice-call/src/providers/openai-realtime-voice.ts @@ -0,0 +1,863 @@ +/** + * OpenAI Realtime Voice Bridge + * + * Implements a bidirectional voice-to-voice bridge between Twilio Media Streams + * and the OpenAI Realtime API. Replaces the STT → LLM → TTS pipeline with a + * single WebSocket session that handles everything natively. + * + * Key benefits over the STT-only approach: + * - Latency: ~200–400 ms TTFB vs ~1–3.5 s in the pipeline mode + * - Audio format: g711_ulaw (mulaw) is natively supported — zero conversion + * - Barge-in: server VAD handles interruptions automatically + * - No separate LLM or TTS call required + * + * Usage: + * const bridge = new OpenAIRealtimeVoiceBridge({ + * apiKey: process.env.OPENAI_API_KEY!, + * instructions: "You are Gracie, a helpful AI assistant...", + * voice: "nova", + * onAudio: (muLaw) => mediaStreamHandler.sendAudio(streamSid, muLaw), + * onClearAudio: () => mediaStreamHandler.clearAudio(streamSid), + * onTranscript: (role, text) => console.log(`[${role}]: ${text}`), + * }); + * await bridge.connect(); + * bridge.sendAudio(twilioPayloadBuffer); + * bridge.close(); + * + * Integration with media-stream.ts: + * Replace `sttSession` with this bridge in `MediaStreamHandler.handleStart()`. + * Wire audio in/out through the existing sendAudio / clearAudio methods. 
+ * + * @see https://platform.openai.com/docs/guides/realtime + * @see https://www.twilio.com/docs/voice/media-streams + */ + +import WebSocket from "ws"; + +// --------------------------------------------------------------------------- +// Types +// --------------------------------------------------------------------------- + +/** OpenAI Realtime API voice options */ +export type RealtimeVoice = + | "alloy" + | "ash" + | "ballad" + | "coral" + | "echo" + | "fable" + | "onyx" + | "nova" + | "sage" + | "shimmer" + | "verse"; + +/** Realtime tool definition (mirrors OpenAI function calling schema) */ +export interface RealtimeTool { + type: "function"; + name: string; + description: string; + parameters: { + type: "object"; + properties: Record; + required?: string[]; + }; +} + +/** Tool call event emitted when OpenAI invokes a function */ +export interface ToolCallEvent { + /** Conversation item ID for submitting the result */ + itemId: string; + /** Call ID for matching request/response */ + callId: string; + /** Function name */ + name: string; + /** Parsed JSON arguments */ + args: unknown; +} + +/** + * Configuration for the Realtime Voice Bridge. 
+ */ +export interface RealtimeVoiceConfig { + // ---- Required ---- + /** OpenAI API key */ + apiKey: string; + + // ---- Voice/personality ---- + /** System instructions (persona / behaviour) */ + instructions?: string; + /** Voice to use for AI speech output (default: "nova") */ + voice?: RealtimeVoice; + /** Response temperature 0–1 (default: 0.8) */ + temperature?: number; + + // ---- Model ---- + /** Realtime model (default: "gpt-4o-mini-realtime-preview") */ + model?: string; + + // ---- VAD ---- + /** VAD speech detection threshold 0–1 (default: 0.5) */ + vadThreshold?: number; + /** Silence duration in ms before turn ends (default: 500) */ + silenceDurationMs?: number; + /** Padding before speech in ms (default: 300) */ + prefixPaddingMs?: number; + + // ---- Tools ---- + /** Optional function tools the model can call */ + tools?: RealtimeTool[]; + + // ---- Audio callbacks ---- + /** + * Called for each audio delta chunk from OpenAI. + * @param muLaw - Raw mulaw Buffer ready to send to Twilio + */ + onAudio: (muLaw: Buffer) => void; + + /** + * Called on barge-in (user speech detected mid-response). + * Clear Twilio audio buffer here. + */ + onClearAudio: () => void; + + // ---- Event callbacks (optional) ---- + /** + * Transcript event (partial or final). Role is "user" or "assistant". + */ + onTranscript?: (role: "user" | "assistant", text: string, isFinal: boolean) => void; + + /** + * Called when the model invokes a tool/function. + * Your handler should call `bridge.submitToolResult(event.callId, result)`. + */ + onToolCall?: (event: ToolCallEvent) => void; + + /** + * Called when the session is fully connected and configured. + */ + onReady?: () => void; + + /** + * Called on irrecoverable error or max reconnects exceeded. + */ + onError?: (error: Error) => void; + + /** + * Called when the bridge closes (intentionally or after max retries). 
+ */ + onClose?: () => void; +} + +// --------------------------------------------------------------------------- +// Internal helpers +// --------------------------------------------------------------------------- + +function base64ToBuffer(b64: string): Buffer { + return Buffer.from(b64, "base64"); +} + +// --------------------------------------------------------------------------- +// Main class +// --------------------------------------------------------------------------- + +/** + * Bidirectional voice bridge between Twilio Media Streams and the OpenAI Realtime API. + * + * Lifecycle: + * new OpenAIRealtimeVoiceBridge(config) + * → connect() — opens WebSocket, configures session + * → sendAudio() — called for each Twilio media chunk + * → [callbacks fire as OpenAI responds] + * → close() — graceful shutdown + */ +export class OpenAIRealtimeVoiceBridge { + private static readonly DEFAULT_MODEL = "gpt-4o-mini-realtime-preview"; + private static readonly MAX_RECONNECT_ATTEMPTS = 5; + private static readonly BASE_RECONNECT_DELAY_MS = 1000; + private static readonly CONNECT_TIMEOUT_MS = 10_000; + + private readonly config: RealtimeVoiceConfig; + private readonly model: string; + + private ws: WebSocket | null = null; + private connected = false; + private intentionallyClosed = false; + private reconnectAttempts = 0; + + /** Pending audio buffers queued while reconnecting */ + private pendingAudio: Buffer[] = []; + + /** Track mark queue for barge-in timing (mirrors reference impl) */ + private markQueue: string[] = []; + private responseStartTimestamp: number | null = null; + private latestMediaTimestamp = 0; + private lastAssistantItemId: string | null = null; + + /** Accumulate tool call arguments (streamed as deltas) */ + private toolCallBuffers = new Map(); + + constructor(config: RealtimeVoiceConfig) { + if (!config.apiKey) { + throw new Error("[RealtimeVoice] OpenAI API key is required"); + } + this.config = config; + this.model = config.model ?? 
OpenAIRealtimeVoiceBridge.DEFAULT_MODEL; + } + + // ------------------------------------------------------------------------- + // Public API + // ------------------------------------------------------------------------- + + /** + * Connect to the OpenAI Realtime API. + * Resolves when the WebSocket is open and session.update has been sent. + * Throws if connection times out. + */ + async connect(): Promise { + this.intentionallyClosed = false; + this.reconnectAttempts = 0; + return this.doConnect(); + } + + /** + * Send a mulaw audio chunk from Twilio to OpenAI. + * Buffers chunks if not yet connected; drains on reconnect. + * + * @param audio - Raw mulaw Buffer (Twilio media.payload decoded from base64) + */ + sendAudio(audio: Buffer): void { + if (!this.connected || this.ws?.readyState !== WebSocket.OPEN) { + // Buffer up to 2 seconds of audio (~320 chunks × 20ms) while reconnecting + if (this.pendingAudio.length < 320) { + this.pendingAudio.push(audio); + } + return; + } + this.sendEvent({ + type: "input_audio_buffer.append", + audio: audio.toString("base64"), + }); + } + + /** + * Update the media timestamp (used for barge-in truncation calculations). + * Call this with data.media.timestamp from each Twilio media event. + */ + setMediaTimestamp(ts: number): void { + this.latestMediaTimestamp = ts; + } + + /** + * Inject a user text message into the conversation (optional). + * Useful for seeding context or simulating a greeting trigger. + */ + sendUserMessage(text: string): void { + this.sendEvent({ + type: "conversation.item.create", + item: { + type: "message", + role: "user", + content: [{ type: "input_text", text }], + }, + }); + this.sendEvent({ type: "response.create" }); + } + + /** + * Submit a tool/function result back to the model. + * Must be called in response to an `onToolCall` event. 
+ * + * @param callId - The call_id from the ToolCallEvent + * @param result - JSON-serializable result value + */ + submitToolResult(callId: string, result: unknown): void { + this.sendEvent({ + type: "conversation.item.create", + item: { + type: "function_call_output", + call_id: callId, + output: JSON.stringify(result), + }, + }); + // Trigger AI to respond now that the tool result is available + this.sendEvent({ type: "response.create" }); + } + + /** + * Gracefully close the bridge. + */ + close(): void { + this.intentionallyClosed = true; + this.connected = false; + if (this.ws) { + this.ws.close(1000, "Bridge closed"); + this.ws = null; + } + this.config.onClose?.(); + } + + /** True if the WebSocket is open and the session is configured. */ + isConnected(): boolean { + return this.connected; + } + + // ------------------------------------------------------------------------- + // Connection management + // ------------------------------------------------------------------------- + + private async doConnect(): Promise { + return new Promise((resolve, reject) => { + const url = `wss://api.openai.com/v1/realtime?model=${encodeURIComponent(this.model)}`; + + console.log(`[RealtimeVoice] Connecting to ${url}`); + + this.ws = new WebSocket(url, { + headers: { + Authorization: `Bearer ${this.config.apiKey}`, + "OpenAI-Beta": "realtime=v1", + }, + }); + + const connectTimeout = setTimeout(() => { + if (!this.connected) { + this.ws?.terminate(); + reject(new Error("[RealtimeVoice] Connection timeout")); + } + }, OpenAIRealtimeVoiceBridge.CONNECT_TIMEOUT_MS); + + this.ws.on("open", () => { + clearTimeout(connectTimeout); + console.log("[RealtimeVoice] WebSocket connected"); + this.connected = true; + this.reconnectAttempts = 0; + + // Small delay to ensure the server is ready before sending session.update + // (mirrors the reference implementation's setTimeout(initializeSession, 100)) + setTimeout(() => { + this.sendSessionUpdate(); + this.drainPendingAudio(); + 
this.config.onReady?.(); + resolve(); + }, 100); + }); + + this.ws.on("message", (data: Buffer) => { + try { + const event = JSON.parse(data.toString()) as RealtimeEvent; + this.handleEvent(event); + } catch (err) { + console.error("[RealtimeVoice] Failed to parse event:", err); + } + }); + + this.ws.on("error", (err) => { + console.error("[RealtimeVoice] WebSocket error:", err); + if (!this.connected) { + clearTimeout(connectTimeout); + reject(err); + } else { + this.config.onError?.(err instanceof Error ? err : new Error(String(err))); + } + }); + + this.ws.on("close", (code, reason) => { + console.log( + `[RealtimeVoice] WebSocket closed (code: ${code}, reason: ${reason?.toString() || "none"})`, + ); + this.connected = false; + this.ws = null; + + if (!this.intentionallyClosed) { + void this.attemptReconnect(); + } else { + this.config.onClose?.(); + } + }); + }); + } + + /** + * Trigger a greeting response from the AI. + * Useful for seeding context or simulating a greeting trigger. + */ + public triggerGreeting(): void { + if (!this.connected || !this.ws) { + console.warn("[RealtimeVoice] Cannot trigger greeting: not connected"); + return; + } + const greetingEvent = { + type: "response.create", + response: { + instructions: this.config.instructions, + }, + }; + this.sendEvent(greetingEvent); + console.log("[RealtimeVoice] Greeting triggered"); + } + + private sendSessionUpdate(): void { + const cfg = this.config; + + const sessionUpdate: RealtimeSessionUpdate = { + type: "session.update", + session: { + modalities: ["text", "audio"], + instructions: cfg.instructions, + voice: cfg.voice ?? "nova", + input_audio_format: "g711_ulaw", + output_audio_format: "g711_ulaw", + input_audio_transcription: { + model: "whisper-1", + }, + turn_detection: { + type: "server_vad", + threshold: cfg.vadThreshold ?? 0.5, + prefix_padding_ms: cfg.prefixPaddingMs ?? 300, + silence_duration_ms: cfg.silenceDurationMs ?? 
500, + create_response: true, + }, + temperature: cfg.temperature ?? 0.8, + ...(cfg.tools && cfg.tools.length > 0 + ? { + tools: cfg.tools, + tool_choice: "auto", + } + : {}), + }, + }; + + console.log("[RealtimeVoice] Sending session.update"); + this.sendEvent(sessionUpdate); + } + + private drainPendingAudio(): void { + if (this.pendingAudio.length === 0) return; + console.log(`[RealtimeVoice] Draining ${this.pendingAudio.length} buffered audio chunks`); + for (const buf of this.pendingAudio) { + this.sendEvent({ + type: "input_audio_buffer.append", + audio: buf.toString("base64"), + }); + } + this.pendingAudio = []; + } + + private async attemptReconnect(): Promise { + if (this.intentionallyClosed) return; + + if ( + this.reconnectAttempts >= OpenAIRealtimeVoiceBridge.MAX_RECONNECT_ATTEMPTS + ) { + const err = new Error( + `[RealtimeVoice] Max reconnect attempts (${OpenAIRealtimeVoiceBridge.MAX_RECONNECT_ATTEMPTS}) exceeded`, + ); + console.error(err.message); + this.config.onError?.(err); + this.config.onClose?.(); + return; + } + + this.reconnectAttempts++; + const delay = + OpenAIRealtimeVoiceBridge.BASE_RECONNECT_DELAY_MS * + 2 ** (this.reconnectAttempts - 1); + + console.log( + `[RealtimeVoice] Reconnecting (${this.reconnectAttempts}/${OpenAIRealtimeVoiceBridge.MAX_RECONNECT_ATTEMPTS}) in ${delay}ms...`, + ); + + await new Promise((resolve) => setTimeout(resolve, delay)); + + if (this.intentionallyClosed) return; + + try { + await this.doConnect(); + console.log("[RealtimeVoice] Reconnected successfully"); + } catch (err) { + console.error("[RealtimeVoice] Reconnect failed:", err); + // doConnect's close handler will call attemptReconnect again + } + } + + // ------------------------------------------------------------------------- + // Event handling + // ------------------------------------------------------------------------- + + private handleEvent(event: RealtimeEvent): void { + switch (event.type) { + // ---- Session lifecycle ---- + case 
"session.created": + console.log("[RealtimeVoice] Session created"); + break; + + case "session.updated": + console.log("[RealtimeVoice] Session updated"); + break; + + // ---- Audio output: stream audio back to Twilio ---- + case "response.audio.delta": { + if (!event.delta) break; + + const audioBuffer = base64ToBuffer(event.delta); + this.config.onAudio(audioBuffer); + + // Track response start timestamp for barge-in truncation + if (this.responseStartTimestamp === null) { + this.responseStartTimestamp = this.latestMediaTimestamp; + } + + // Track the most recent assistant item ID + if (event.item_id) { + this.lastAssistantItemId = event.item_id; + } + + // Send mark to track playback position + this.sendMark(); + break; + } + + case "response.audio.done": + console.log("[RealtimeVoice] Audio response complete"); + break; + + // ---- Barge-in: user started speaking, interrupt AI response ---- + case "input_audio_buffer.speech_started": + console.log("[RealtimeVoice] Barge-in detected — clearing audio"); + this.handleBargein(); + break; + + case "input_audio_buffer.speech_stopped": + console.log("[RealtimeVoice] Speech stopped"); + break; + + case "input_audio_buffer.committed": + console.log("[RealtimeVoice] Audio buffer committed"); + break; + + // ---- Mark acknowledgment from Twilio ---- + case "response.audio_transcript.delta": + // AI speech transcript streaming (not an event we send to Twilio) + if (event.delta) { + this.config.onTranscript?.("assistant", event.delta, false); + } + break; + + case "response.audio_transcript.done": + if (event.transcript) { + console.log(`[RealtimeVoice] Assistant: ${event.transcript}`); + this.config.onTranscript?.("assistant", event.transcript, true); + } + break; + + // ---- User speech transcription (if text modality enabled) ---- + case "conversation.item.input_audio_transcription.completed": + if (event.transcript) { + console.log(`[RealtimeVoice] User: ${event.transcript}`); + this.config.onTranscript?.("user", 
event.transcript, true); + } + break; + + case "conversation.item.input_audio_transcription.delta": + if (event.delta) { + this.config.onTranscript?.("user", event.delta, false); + } + break; + + // ---- Tool calling ---- + case "response.function_call_arguments.delta": { + const key = event.item_id ?? "unknown"; + const existing = this.toolCallBuffers.get(key); + if (existing && event.delta) { + existing.args += event.delta; + } else if (event.item_id) { + this.toolCallBuffers.set(event.item_id, { + name: event.name ?? "", + callId: event.call_id ?? "", + args: event.delta ?? "", + }); + } + break; + } + + case "response.function_call_arguments.done": { + const key = event.item_id ?? "unknown"; + const buf = this.toolCallBuffers.get(key); + if (buf && this.config.onToolCall) { + let args: unknown; + try { + args = JSON.parse(buf.args || "{}"); + } catch { + args = {}; + } + this.config.onToolCall({ + itemId: key, + callId: buf.callId || event.call_id || "", + name: buf.name || event.name || "", + args, + }); + } + this.toolCallBuffers.delete(key); + break; + } + + // ---- Response lifecycle ---- + case "response.created": + console.log("[RealtimeVoice] Response started"); + break; + + case "response.done": + console.log("[RealtimeVoice] Response done"); + // Reset mark queue and timestamps for next turn + this.markQueue = []; + this.responseStartTimestamp = null; + this.lastAssistantItemId = null; + break; + + case "response.content.done": + // Individual content part done + break; + + case "rate_limits.updated": + // Log rate limit info if needed + break; + + // ---- Errors ---- + case "error": { + const errMsg = event.error + ? `${(event.error as { message?: string }).message ?? 
JSON.stringify(event.error)}` + : "Unknown error"; + console.error(`[RealtimeVoice] Error event: ${errMsg}`); + this.config.onError?.(new Error(errMsg)); + break; + } + + default: + // Uncomment for debugging: + // console.log(`[RealtimeVoice] Unhandled event: ${event.type}`); + break; + } + } + + /** + * Handle barge-in: truncate the current assistant response at the + * elapsed audio point, clear the Twilio buffer, and reset state. + * Mirrors the reference implementation's handleSpeechStartedEvent(). + */ + private handleBargein(): void { + if (this.markQueue.length > 0 && this.responseStartTimestamp !== null) { + const elapsedMs = this.latestMediaTimestamp - this.responseStartTimestamp; + + if (this.lastAssistantItemId) { + // Tell OpenAI to truncate the response at the point where the user + // interrupted — this ensures the AI's context matches what was heard + const truncateEvent = { + type: "conversation.item.truncate", + item_id: this.lastAssistantItemId, + content_index: 0, + audio_end_ms: Math.max(0, elapsedMs), + }; + console.log(`[RealtimeVoice] Truncating at ${elapsedMs}ms`); + this.sendEvent(truncateEvent); + } + + // Clear the audio already queued in Twilio's buffer + this.config.onClearAudio(); + + // Reset state + this.markQueue = []; + this.lastAssistantItemId = null; + this.responseStartTimestamp = null; + } else { + // Even if we have no mark queue, still clear audio to be safe + this.config.onClearAudio(); + } + } + + /** + * Send a mark event to Twilio to track audio playback position. + * The mark name is used to coordinate barge-in truncation. + */ + private sendMark(): void { + // We don't send marks directly — the caller does via onAudio + // We track mark queue internally for truncation calculations + this.markQueue.push(`audio-${Date.now()}`); + } + + /** + * Handle Twilio mark acknowledgment (when a mark event comes back from Twilio). + * Call this method when you receive a "mark" event from the Twilio WebSocket. 
+ */ + acknowledgeMark(): void { + if (this.markQueue.length > 0) { + this.markQueue.shift(); + } + } + + // ------------------------------------------------------------------------- + // Utilities + // ------------------------------------------------------------------------- + + private sendEvent(event: unknown): void { + if (this.ws?.readyState === WebSocket.OPEN) { + this.ws.send(JSON.stringify(event)); + } else { + console.warn("[RealtimeVoice] Attempted to send event while disconnected"); + } + } +} + +// --------------------------------------------------------------------------- +// Provider factory (matches pattern of OpenAIRealtimeSTTProvider) +// --------------------------------------------------------------------------- + +/** + * Configuration for the provider factory. + * Holds shared/default settings; per-call config is passed to createSession(). + */ +export interface RealtimeVoiceProviderConfig { + /** OpenAI API key */ + apiKey: string; + /** Default model (default: "gpt-4o-mini-realtime-preview") */ + model?: string; + /** Default voice (default: "nova") */ + voice?: RealtimeVoice; + /** Default system instructions */ + instructions?: string; + /** Default temperature (default: 0.8) */ + temperature?: number; + /** Default VAD threshold (default: 0.5) */ + vadThreshold?: number; + /** Default silence duration ms (default: 500) */ + silenceDurationMs?: number; + /** Default tools */ + tools?: RealtimeTool[]; +} + +/** + * Factory for creating RealtimeVoiceBridge instances. + * Follows the same pattern as OpenAIRealtimeSTTProvider for easy swapping. + */ +export class OpenAIRealtimeVoiceProvider { + readonly name = "openai-realtime-voice" as const; + + constructor(private readonly defaults: RealtimeVoiceProviderConfig) { + if (!defaults.apiKey) { + throw new Error("[RealtimeVoiceProvider] OpenAI API key is required"); + } + } + + /** + * Create a new voice bridge for a single call session. + * Merges provided config with provider defaults. 
+ */ + createBridge( + callConfig: Omit & Partial>, + ): OpenAIRealtimeVoiceBridge { + const merged: RealtimeVoiceConfig = { + apiKey: this.defaults.apiKey, + model: callConfig.model ?? this.defaults.model, + voice: callConfig.voice ?? this.defaults.voice, + instructions: callConfig.instructions ?? this.defaults.instructions, + temperature: callConfig.temperature ?? this.defaults.temperature, + vadThreshold: callConfig.vadThreshold ?? this.defaults.vadThreshold, + silenceDurationMs: callConfig.silenceDurationMs ?? this.defaults.silenceDurationMs, + tools: callConfig.tools ?? this.defaults.tools, + onAudio: callConfig.onAudio, + onClearAudio: callConfig.onClearAudio, + onTranscript: callConfig.onTranscript, + onToolCall: callConfig.onToolCall, + onReady: callConfig.onReady, + onError: callConfig.onError, + onClose: callConfig.onClose, + }; + return new OpenAIRealtimeVoiceBridge(merged); + } +} + +// --------------------------------------------------------------------------- +// MediaStreamHandler integration helper +// --------------------------------------------------------------------------- + +/** + * Minimal interface that the bridge integration needs from MediaStreamHandler. + * This matches the actual MediaStreamHandler's method signatures. + */ +export interface MediaStreamHandlerLike { + sendAudio(streamSid: string, muLaw: Buffer): void; + clearAudio(streamSid: string): void; + sendMark(streamSid: string, name: string): void; +} + +/** + * Create a RealtimeVoiceBridge wired to an existing MediaStreamHandler session. 
+ * + * Drop-in helper for use inside media-stream.ts handleStart(): + * + * ```typescript + * // In handleStart(), instead of creating an STT session: + * const bridge = createBridgeForStream({ + * streamSid, + * handler: this, // MediaStreamHandler instance + * config: { + * apiKey: "...", + * instructions: "You are Gracie...", + * voice: "nova", + * onTranscript: (role, text, final) => { + * if (final && role === "user") config.onTranscript?.(callId, text); + * }, + * }, + * }); + * await bridge.connect(); + * ``` + */ +export function createBridgeForStream(opts: { + streamSid: string; + handler: MediaStreamHandlerLike; + config: Omit; +}): OpenAIRealtimeVoiceBridge { + return new OpenAIRealtimeVoiceBridge({ + ...opts.config, + onAudio: (muLaw) => { + opts.handler.sendAudio(opts.streamSid, muLaw); + }, + onClearAudio: () => { + opts.handler.clearAudio(opts.streamSid); + }, + }); +} + +// --------------------------------------------------------------------------- +// Internal event shape types (partial — only fields we use) +// --------------------------------------------------------------------------- + +interface RealtimeEvent { + type: string; + delta?: string; + transcript?: string; + item_id?: string; + call_id?: string; + name?: string; + error?: unknown; +} + +interface RealtimeSessionUpdate { + type: "session.update"; + session: { + modalities: string[]; + instructions?: string; + voice: RealtimeVoice; + input_audio_format: string; + output_audio_format: string; + turn_detection: { + type: "server_vad"; + threshold: number; + prefix_padding_ms: number; + silence_duration_ms: number; + create_response: boolean; + }; + temperature: number; + tools?: RealtimeTool[]; + tool_choice?: string; + }; +} diff --git a/extensions/voice-call/src/runtime.ts b/extensions/voice-call/src/runtime.ts index d725e44bf06..4069fa3e493 100644 --- a/extensions/voice-call/src/runtime.ts +++ b/extensions/voice-call/src/runtime.ts @@ -12,6 +12,7 @@ import { 
createTelephonyTtsProvider } from "./telephony-tts.js"; import { startTunnel, type TunnelResult } from "./tunnel.js"; import { VoiceCallWebhookServer } from "./webhook.js"; import { cleanupTailscaleExposure, setupTailscaleExposure } from "./webhook/tailscale.js"; +import { RealtimeCallHandler } from "./webhook/realtime-handler.js"; export type VoiceCallRuntime = { config: VoiceCallConfig; @@ -20,6 +21,8 @@ export type VoiceCallRuntime = { webhookServer: VoiceCallWebhookServer; webhookUrl: string; publicUrl: string | null; + /** Realtime voice handler — present when config.realtime.enabled is true */ + realtimeHandler?: RealtimeCallHandler; stop: () => Promise; }; @@ -168,6 +171,19 @@ export async function createVoiceCallRuntime(params: { const webhookServer = new VoiceCallWebhookServer(config, manager, provider, coreConfig); const lifecycle = createRuntimeResourceLifecycle({ config, webhookServer }); + // Wire realtime handler before the server starts so it's ready for upgrades + let realtimeHandler: RealtimeCallHandler | undefined; + if (config.realtime.enabled) { + realtimeHandler = new RealtimeCallHandler( + config.realtime, + manager, + provider, + coreConfig, + ); + webhookServer.setRealtimeHandler(realtimeHandler); + log.info("[voice-call] Realtime voice handler initialized"); + } + const localUrl = await webhookServer.start(); // Wrap remaining initialization in try/catch so the webhook server is @@ -252,6 +268,7 @@ export async function createVoiceCallRuntime(params: { webhookServer, webhookUrl, publicUrl, + realtimeHandler, stop, }; } catch (err) { diff --git a/extensions/voice-call/src/test-fixtures.ts b/extensions/voice-call/src/test-fixtures.ts index 594aa064ba5..f38126f858c 100644 --- a/extensions/voice-call/src/test-fixtures.ts +++ b/extensions/voice-call/src/test-fixtures.ts @@ -40,6 +40,7 @@ export function createVoiceCallBaseConfig(params?: { maxPendingConnectionsPerIp: 4, maxConnections: 128, }, + realtime: { enabled: false, tools: [] }, 
skipSignatureVerification: false, stt: { provider: "openai", model: "whisper-1" }, tts: { diff --git a/extensions/voice-call/src/webhook.ts b/extensions/voice-call/src/webhook.ts index 1258229735e..44ab5df373c 100644 --- a/extensions/voice-call/src/webhook.ts +++ b/extensions/voice-call/src/webhook.ts @@ -15,6 +15,7 @@ import { OpenAIRealtimeSTTProvider } from "./providers/stt-openai-realtime.js"; import type { TwilioProvider } from "./providers/twilio.js"; import type { NormalizedEvent, WebhookContext } from "./types.js"; import { startStaleCallReaper } from "./webhook/stale-call-reaper.js"; +import type { RealtimeCallHandler } from "./webhook/realtime-handler.js"; const MAX_WEBHOOK_BODY_BYTES = 1024 * 1024; @@ -60,6 +61,9 @@ export class VoiceCallWebhookServer { /** Media stream handler for bidirectional audio (when streaming enabled) */ private mediaStreamHandler: MediaStreamHandler | null = null; + /** Realtime voice handler — present when config.realtime.enabled is true */ + private realtimeHandler: RealtimeCallHandler | null = null; + constructor( config: VoiceCallConfig, manager: CallManager, @@ -84,6 +88,13 @@ export class VoiceCallWebhookServer { return this.mediaStreamHandler; } + /** + * Wire the realtime call handler (called from runtime.ts before server starts). + */ + setRealtimeHandler(handler: RealtimeCallHandler): void { + this.realtimeHandler = handler; + } + /** * Initialize media streaming with OpenAI Realtime STT. 
*/ @@ -229,9 +240,15 @@ export class VoiceCallWebhookServer { }); }); - // Handle WebSocket upgrades for media streams - if (this.mediaStreamHandler) { + // Handle WebSocket upgrades for realtime voice and media streams + if (this.realtimeHandler || this.mediaStreamHandler) { this.server.on("upgrade", (request, socket, head) => { + // Realtime voice takes precedence when the path matches + if (this.realtimeHandler && this.isRealtimeMode(request)) { + console.log("[voice-call] WebSocket upgrade for realtime voice"); + this.realtimeHandler.handleWebSocketUpgrade(request, socket, head); + return; + } const path = this.getUpgradePathname(request); if (path === streamPath) { console.log("[voice-call] WebSocket upgrade for media stream"); @@ -338,10 +355,29 @@ export class VoiceCallWebhookServer { this.writeWebhookResponse(res, payload); } + /** + * Returns true for WebSocket upgrade paths that belong to the realtime handler. + * Used only for upgrade routing — not for the inbound HTTP webhook POST. + */ + private isRealtimeMode(req: http.IncomingMessage): boolean { + return (req.url ?? "/").includes("/realtime"); + } + private async runWebhookPipeline( req: http.IncomingMessage, webhookPath: string, ): Promise { + // Realtime mode: whenever the realtime handler is active, ALL inbound calls + // use it. The handler returns TwiML so Twilio opens a + // WebSocket to the /voice/stream/realtime path, which is routed back here + // via the upgrade handler's isRealtimeMode() check. 
+ if (this.realtimeHandler && req.method === "POST") { + const url = buildRequestUrl(req.url, req.headers.host); + if (this.isWebhookPathMatch(url.pathname, webhookPath)) { + return this.realtimeHandler.buildTwiMLPayload(req); + } + } + const url = buildRequestUrl(req.url, req.headers.host); if (url.pathname === "/voice/hold-music") { diff --git a/extensions/voice-call/src/webhook/realtime-handler.ts b/extensions/voice-call/src/webhook/realtime-handler.ts new file mode 100644 index 00000000000..2fa6079696a --- /dev/null +++ b/extensions/voice-call/src/webhook/realtime-handler.ts @@ -0,0 +1,289 @@ +import http from "node:http"; +import type { Duplex } from "node:stream"; +import { type WebSocket, Server as WebSocketServer } from "ws"; +import type { VoiceCallRealtimeConfig } from "../config.js"; +import type { CoreConfig } from "../core-bridge.js"; +import type { CallManager } from "../manager.js"; +import { + OpenAIRealtimeVoiceBridge, + type RealtimeTool, +} from "../providers/openai-realtime-voice.js"; +import type { VoiceCallProvider } from "../providers/base.js"; +import type { NormalizedEvent } from "../types.js"; + +export type ToolHandlerFn = (args: unknown, callId: string) => Promise; + +type WebhookResponsePayload = { + statusCode: number; + body: string; + headers?: Record; +}; + +/** + * Handles inbound voice calls bridged directly to the OpenAI Realtime API. 
+ * + * Responsibilities: + * - Accept WebSocket upgrades from Twilio Media Streams at the /realtime path + * - Return TwiML payload for the initial HTTP webhook + * - Register each call with CallManager (appears in voice status/history) + * - Route tool calls to registered handlers (Phase 5 tool framework) + */ +export class RealtimeCallHandler { + private toolHandlers = new Map(); + + constructor( + private config: VoiceCallRealtimeConfig, + private manager: CallManager, + private provider: VoiceCallProvider, + private coreConfig: CoreConfig | null, + ) {} + + /** + * Handle a WebSocket upgrade request from Twilio for a realtime media stream. + * Called from VoiceCallWebhookServer's upgrade handler when isRealtimeMode() is true. + */ + handleWebSocketUpgrade(request: http.IncomingMessage, socket: Duplex, head: Buffer): void { + const wss = new WebSocketServer({ noServer: true }); + wss.handleUpgrade(request, socket, head, (ws) => { + let bridge: OpenAIRealtimeVoiceBridge | null = null; + let initialized = false; + + ws.on("message", (data: Buffer) => { + try { + const msg = JSON.parse(data.toString()) as Record; + if (!initialized && msg.event === "start") { + initialized = true; + const startData = msg.start as Record | undefined; + const streamSid = startData?.streamSid || "unknown"; + const callSid = startData?.callSid || "unknown"; + bridge = this.handleCall(streamSid, callSid, ws); + } else if (bridge) { + const mediaData = msg.media as Record | undefined; + if (msg.event === "media" && mediaData?.payload) { + bridge.sendAudio(Buffer.from(mediaData.payload as string, "base64")); + if (mediaData.timestamp) { + bridge.setMediaTimestamp(Number(mediaData.timestamp)); + } + } else if (msg.event === "mark") { + bridge.acknowledgeMark(); + } else if (msg.event === "stop") { + bridge.close(); + } + } + } catch (err) { + console.error("[voice-call] Error parsing WS message:", err); + } + }); + + ws.on("close", () => { + bridge?.close(); + }); + }); + } + + /** + * 
Build the TwiML response payload for a realtime call.
+   * The WebSocket URL is derived from the incoming request host so no hostname
+   * is hardcoded.
+   */
+  buildTwiMLPayload(req: http.IncomingMessage): WebhookResponsePayload {
+    const host = req.headers.host || "localhost:8443";
+    const wsUrl = `wss://${host}/voice/stream/realtime`;
+    console.log(`[voice-call] Returning realtime TwiML with WebSocket: ${wsUrl}`);
+    const twiml = `<?xml version="1.0" encoding="UTF-8"?>
+<Response>
+  <Connect>
+    <Stream url="${wsUrl}" />
+  </Connect>
+</Response>`;
+    return {
+      statusCode: 200,
+      headers: { "Content-Type": "text/xml" },
+      body: twiml,
+    };
+  }
+
+  /**
+   * Register a named tool handler to be called when the model invokes a function.
+   * Must be called before calls begin.
+   *
+   * @param name - Function name as declared in config.realtime.tools
+   * @param fn - Async handler receiving (parsedArgs, internalCallId); return value
+   *   is submitted back to the model as the tool result.
+   */
+  registerToolHandler(name: string, fn: ToolHandlerFn): void {
+    this.toolHandlers.set(name, fn);
+  }
+
+  // ---------------------------------------------------------------------------
+  // Private
+  // ---------------------------------------------------------------------------
+
+  /**
+   * Create and start the OpenAI Realtime bridge for a single call session.
+   * Registers the call with CallManager so it appears in status/history.
+   * Returns the bridge (or null on fatal config error).
+   */
+  private handleCall(
+    streamSid: string,
+    callSid: string,
+    ws: WebSocket,
+  ): OpenAIRealtimeVoiceBridge | null {
+    const apiKey = process.env.OPENAI_API_KEY;
+    if (!apiKey) {
+      console.error("[voice-call] No OPENAI_API_KEY for realtime call");
+      ws.close(1011, "No API key");
+      return null;
+    }
+
+    const callId = this.registerCallInManager(callSid);
+    console.log(`[voice-call] Realtime call: streamSid=${streamSid}, callSid=${callSid}, callId=${callId}`);
+
+    // Declare as null first so closures can capture the reference before bridge is created.
+ // By the time any callback fires, bridge will be fully assigned. + let bridge: OpenAIRealtimeVoiceBridge | null = null; + + bridge = new OpenAIRealtimeVoiceBridge({ + apiKey, + model: this.config.model, + voice: this.config.voice, + instructions: this.config.instructions, + temperature: this.config.temperature, + vadThreshold: this.config.vadThreshold, + silenceDurationMs: this.config.silenceDurationMs, + prefixPaddingMs: this.config.prefixPaddingMs, + tools: this.config.tools as RealtimeTool[], + + onAudio: (muLaw) => { + ws.send( + JSON.stringify({ + event: "media", + streamSid, + media: { payload: muLaw.toString("base64") }, + }), + ); + }, + + onClearAudio: () => { + ws.send(JSON.stringify({ event: "clear", streamSid })); + }, + + onTranscript: (role, text, isFinal) => { + if (isFinal) { + // Emit user speech through the manager for transcript persistence + if (role === "user") { + const event: NormalizedEvent = { + id: `realtime-speech-${callSid}-${Date.now()}`, + type: "call.speech", + callId, + providerCallId: callSid, + timestamp: Date.now(), + transcript: text, + isFinal: true, + }; + this.manager.processEvent(event); + } + } + }, + + onToolCall: (toolEvent) => { + if (bridge) { + void this.executeToolCall( + bridge, + callId, + toolEvent.callId, + toolEvent.name, + toolEvent.args, + ); + } + }, + + onReady: () => { + bridge?.triggerGreeting(); + }, + + onError: (err) => { + console.error("[voice-call] Realtime error:", err.message); + }, + + onClose: () => { + this.endCallInManager(callSid, callId); + }, + }); + + bridge.connect().catch((err: Error) => { + console.error("[voice-call] Failed to connect realtime bridge:", err); + ws.close(1011, "Failed to connect"); + }); + + // Acknowledge the stream connection (mirrors Twilio Media Streams protocol) + ws.send(JSON.stringify({ event: "connected", protocol: "Call", version: "1.0.0" })); + + return bridge; + } + + /** + * Emit synthetic NormalizedEvents to register the call with CallManager. 
+ * Returns the internal callId generated by the manager. + */ + private registerCallInManager(callSid: string): string { + const now = Date.now(); + const baseFields = { + providerCallId: callSid, + timestamp: now, + direction: "inbound" as const, + }; + + // call.initiated causes the manager to auto-create the call record + // (see manager/events.ts createWebhookCall path) + this.manager.processEvent({ + id: `realtime-initiated-${callSid}`, + callId: callSid, + type: "call.initiated", + ...baseFields, + }); + + this.manager.processEvent({ + id: `realtime-answered-${callSid}`, + callId: callSid, + type: "call.answered", + ...baseFields, + }); + + // Resolve the manager-generated internal callId + const call = this.manager.getCallByProviderCallId(callSid); + return call?.callId ?? callSid; + } + + private endCallInManager(callSid: string, callId: string): void { + this.manager.processEvent({ + id: `realtime-ended-${callSid}-${Date.now()}`, + type: "call.ended", + callId, + providerCallId: callSid, + timestamp: Date.now(), + reason: "completed", + }); + } + + private async executeToolCall( + bridge: OpenAIRealtimeVoiceBridge, + callId: string, + bridgeCallId: string, + name: string, + args: unknown, + ): Promise { + const handler = this.toolHandlers.get(name); + let result: unknown; + if (handler) { + try { + result = await handler(args, callId); + } catch (err) { + result = { error: err instanceof Error ? 
err.message : String(err) }; + } + } else { + result = { error: `Tool "${name}" not available` }; + } + bridge.submitToolResult(bridgeCallId, result); + } +} From 3fd5f44d0588aec6af2e546d67103b81a7f1bfc9 Mon Sep 17 00:00:00 2001 From: Forrest Blount Date: Tue, 10 Mar 2026 19:29:41 +0000 Subject: [PATCH 02/17] voice-call: suppress inboundGreeting TTS when realtime mode is active --- .../voice-call/src/webhook/realtime-handler.ts | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/extensions/voice-call/src/webhook/realtime-handler.ts b/extensions/voice-call/src/webhook/realtime-handler.ts index 2fa6079696a..7fd426b7c79 100644 --- a/extensions/voice-call/src/webhook/realtime-handler.ts +++ b/extensions/voice-call/src/webhook/realtime-handler.ts @@ -243,6 +243,14 @@ export class RealtimeCallHandler { ...baseFields, }); + // Clear inboundGreeting from the call record before call.answered fires. + // The realtime bridge owns all voice output; the TTS greeting path would + // fail anyway because provider state is never initialized for realtime calls. + const callRecord = this.manager.getCallByProviderCallId(callSid); + if (callRecord?.metadata) { + delete callRecord.metadata.initialMessage; + } + this.manager.processEvent({ id: `realtime-answered-${callSid}`, callId: callSid, @@ -250,9 +258,7 @@ export class RealtimeCallHandler { ...baseFields, }); - // Resolve the manager-generated internal callId - const call = this.manager.getCallByProviderCallId(callSid); - return call?.callId ?? callSid; + return callRecord?.callId ?? 
callSid; } private endCallInManager(callSid: string, callId: string): void { From 7429bca0cd54feeea1f1d88b6026bf0371c9f680 Mon Sep 17 00:00:00 2001 From: Forrest Blount Date: Wed, 11 Mar 2026 19:56:57 +0000 Subject: [PATCH 03/17] voice-call: add tests for realtime config schema and handler --- extensions/voice-call/src/config.test.ts | 113 ++++++++ .../src/webhook/realtime-handler.test.ts | 274 ++++++++++++++++++ 2 files changed, 387 insertions(+) create mode 100644 extensions/voice-call/src/webhook/realtime-handler.test.ts diff --git a/extensions/voice-call/src/config.test.ts b/extensions/voice-call/src/config.test.ts index 1b12e9e84c5..899a12af5a7 100644 --- a/extensions/voice-call/src/config.test.ts +++ b/extensions/voice-call/src/config.test.ts @@ -3,6 +3,7 @@ import { validateProviderConfig, normalizeVoiceCallConfig, resolveVoiceCallConfig, + VoiceCallRealtimeConfigSchema, type VoiceCallConfig, } from "./config.js"; import { createVoiceCallBaseConfig } from "./test-fixtures.js"; @@ -216,3 +217,115 @@ describe("normalizeVoiceCallConfig", () => { expect(normalized.tts?.elevenlabs?.voiceSettings).toEqual({ speed: 1.1 }); }); }); + +describe("VoiceCallRealtimeConfigSchema", () => { + it("defaults to disabled with empty tools array", () => { + const config = VoiceCallRealtimeConfigSchema.parse({}); + expect(config.enabled).toBe(false); + expect(config.tools).toEqual([]); + }); + + it("accepts all valid Realtime API voice names", () => { + const voices = ["alloy", "ash", "ballad", "cedar", "coral", "echo", "marin", "sage", "shimmer", "verse"]; + for (const voice of voices) { + expect(() => VoiceCallRealtimeConfigSchema.parse({ voice })).not.toThrow(); + } + }); + + it("rejects voice names that are not in the Realtime API (e.g. 
nova, fable, onyx)", () => { + for (const voice of ["nova", "fable", "onyx"]) { + expect(() => VoiceCallRealtimeConfigSchema.parse({ voice })).toThrow(); + } + }); + + it("normalizeVoiceCallConfig propagates realtime sub-config", () => { + const normalized = normalizeVoiceCallConfig({ + enabled: true, + provider: "mock", + realtime: { enabled: true, voice: "marin", instructions: "Be helpful." }, + }); + expect(normalized.realtime.enabled).toBe(true); + expect(normalized.realtime.voice).toBe("marin"); + expect(normalized.realtime.instructions).toBe("Be helpful."); + expect(normalized.realtime.tools).toEqual([]); + }); +}); + +describe("resolveVoiceCallConfig — realtime env vars", () => { + const originalEnv = { ...process.env }; + + afterEach(() => { + process.env = { ...originalEnv }; + }); + + it("auto-enables realtime from REALTIME_VOICE_ENABLED=true", () => { + process.env.REALTIME_VOICE_ENABLED = "true"; + const resolved = resolveVoiceCallConfig(createVoiceCallBaseConfig()); + expect(resolved.realtime.enabled).toBe(true); + }); + + it("does not auto-enable when REALTIME_VOICE_ENABLED is absent or not 'true'", () => { + delete process.env.REALTIME_VOICE_ENABLED; + expect(resolveVoiceCallConfig(createVoiceCallBaseConfig()).realtime.enabled).toBe(false); + + process.env.REALTIME_VOICE_ENABLED = "false"; + expect(resolveVoiceCallConfig(createVoiceCallBaseConfig()).realtime.enabled).toBe(false); + }); + + it("resolves model, voice, instructions, temperature from env vars", () => { + process.env.REALTIME_VOICE_MODEL = "gpt-4o-realtime-preview"; + process.env.REALTIME_VOICE_VOICE = "ash"; + process.env.REALTIME_VOICE_INSTRUCTIONS = "You are helpful."; + process.env.REALTIME_VOICE_TEMPERATURE = "0.8"; + const resolved = resolveVoiceCallConfig(createVoiceCallBaseConfig()); + expect(resolved.realtime.model).toBe("gpt-4o-realtime-preview"); + expect(resolved.realtime.voice).toBe("ash"); + expect(resolved.realtime.instructions).toBe("You are helpful."); + 
expect(resolved.realtime.temperature).toBeCloseTo(0.8); + }); + + it("resolves vadThreshold and silenceDurationMs from env vars", () => { + process.env.VAD_THRESHOLD = "0.7"; + process.env.SILENCE_DURATION_MS = "1200"; + const resolved = resolveVoiceCallConfig(createVoiceCallBaseConfig()); + expect(resolved.realtime.vadThreshold).toBeCloseTo(0.7); + expect(resolved.realtime.silenceDurationMs).toBe(1200); + }); + + it("config values take precedence over env vars", () => { + process.env.REALTIME_VOICE_VOICE = "ash"; + const base = createVoiceCallBaseConfig(); + base.realtime = { enabled: false, voice: "coral", tools: [] }; + const resolved = resolveVoiceCallConfig(base); + expect(resolved.realtime.voice).toBe("coral"); + }); +}); + +describe("validateProviderConfig — realtime mode", () => { + it("rejects realtime.enabled when inboundPolicy is 'disabled'", () => { + const config = createVoiceCallBaseConfig({ provider: "mock" }); + config.realtime = { enabled: true, tools: [] }; + // inboundPolicy defaults to "disabled" in createVoiceCallBaseConfig + const result = validateProviderConfig(config); + expect(result.valid).toBe(false); + expect(result.errors.some((e) => e.includes("inboundPolicy"))).toBe(true); + }); + + it("passes when realtime.enabled with inboundPolicy 'open'", () => { + const config = createVoiceCallBaseConfig({ provider: "mock" }); + config.inboundPolicy = "open"; + config.realtime = { enabled: true, tools: [] }; + const result = validateProviderConfig(config); + expect(result.errors.some((e) => e.includes("inboundPolicy"))).toBe(false); + }); + + it("rejects when both realtime.enabled and streaming.enabled are true", () => { + const config = createVoiceCallBaseConfig({ provider: "mock" }); + config.inboundPolicy = "open"; + config.realtime = { enabled: true, tools: [] }; + config.streaming = { ...config.streaming, enabled: true }; + const result = validateProviderConfig(config); + expect(result.valid).toBe(false); + expect(result.errors.some((e) => 
e.includes("streaming"))).toBe(true); + }); +}); diff --git a/extensions/voice-call/src/webhook/realtime-handler.test.ts b/extensions/voice-call/src/webhook/realtime-handler.test.ts new file mode 100644 index 00000000000..f4ad179d441 --- /dev/null +++ b/extensions/voice-call/src/webhook/realtime-handler.test.ts @@ -0,0 +1,274 @@ +import http from "node:http"; +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; +import type { CallManager } from "../manager.js"; +import type { CallRecord } from "../types.js"; +import type { VoiceCallProvider } from "../providers/base.js"; +import { RealtimeCallHandler } from "./realtime-handler.js"; + +// Minimal realtime config used across tests +const baseRealtimeConfig = { + enabled: true, + voice: "ash" as const, + tools: [] as never[], +}; + +// Fake CallRecord for manager stubs +function makeCallRecord(overrides: Partial = {}): CallRecord { + return { + callId: "call-rt-1", + providerCallId: "CA_test", + provider: "twilio", + direction: "inbound", + state: "answered", + from: "+15550001234", + to: "+15550005678", + startedAt: Date.now(), + transcript: [], + processedEventIds: [], + metadata: { + initialMessage: "Hello! How can I help you today?", + }, + ...overrides, + }; +} + +function makeManager(record?: CallRecord): CallManager { + const storedRecord = record ?? 
makeCallRecord(); + return { + processEvent: vi.fn(), + getCallByProviderCallId: vi.fn(() => storedRecord), + getCall: vi.fn(() => storedRecord), + } as unknown as CallManager; +} + +function makeProvider(): VoiceCallProvider { + return { + name: "twilio", + verifyWebhook: vi.fn(() => ({ ok: true, verifiedRequestKey: "mock:key" })), + parseWebhookEvent: vi.fn(() => ({ events: [] })), + initiateCall: vi.fn(async () => ({ providerCallId: "CA_test", status: "initiated" as const })), + hangupCall: vi.fn(async () => {}), + playTts: vi.fn(async () => {}), + startListening: vi.fn(async () => {}), + stopListening: vi.fn(async () => {}), + getCallStatus: vi.fn(async () => ({ status: "in-progress" as const, isTerminal: false })), + }; +} + +function makeRequest(url: string, host = "example.ts.net"): http.IncomingMessage { + const req = new http.IncomingMessage(null as never); + req.url = url; + req.method = "POST"; + req.headers = { host }; + return req; +} + +describe("RealtimeCallHandler", () => { + let originalEnv: NodeJS.ProcessEnv; + + beforeEach(() => { + originalEnv = { ...process.env }; + }); + + afterEach(() => { + process.env = { ...originalEnv }; + }); + + // --------------------------------------------------------------------------- + // buildTwiMLPayload + // --------------------------------------------------------------------------- + + describe("buildTwiMLPayload", () => { + it("returns TwiML with wss URL derived from request host", () => { + const handler = new RealtimeCallHandler( + baseRealtimeConfig, + makeManager(), + makeProvider(), + null, + ); + const req = makeRequest("/voice/webhook", "gateway.ts.net"); + const payload = handler.buildTwiMLPayload(req); + + expect(payload.statusCode).toBe(200); + expect(payload.headers?.["Content-Type"]).toBe("text/xml"); + expect(payload.body).toContain(""); + expect(payload.body).toContain(" { + const handler = new RealtimeCallHandler( + baseRealtimeConfig, + makeManager(), + makeProvider(), + null, + ); + const req 
= makeRequest("/voice/webhook", ""); + const payload = handler.buildTwiMLPayload(req); + + expect(payload.body).toContain("wss://localhost:8443/voice/stream/realtime"); + }); + }); + + // --------------------------------------------------------------------------- + // registerCallInManager — greeting suppression + // --------------------------------------------------------------------------- + + describe("registerCallInManager (via handleCall)", () => { + it("clears metadata.initialMessage so the inboundGreeting TTS path is skipped", () => { + const callRecord = makeCallRecord({ + metadata: { initialMessage: "Hello from config!" }, + }); + const manager = makeManager(callRecord); + + const handler = new RealtimeCallHandler( + baseRealtimeConfig, + manager, + makeProvider(), + null, + ); + + // Access private method via type assertion for unit testing + (handler as unknown as { registerCallInManager: (sid: string) => string }) + .registerCallInManager("CA_test"); + + // call.initiated + call.answered should both have been emitted + expect(vi.mocked(manager.processEvent)).toHaveBeenCalledTimes(2); + const eventTypes = vi.mocked(manager.processEvent).mock.calls.map( + ([e]) => (e as { type: string }).type, + ); + expect(eventTypes).toEqual(["call.initiated", "call.answered"]); + + // initialMessage must be cleared before call.answered fires + expect(callRecord.metadata?.initialMessage).toBeUndefined(); + }); + + it("returns callId from the manager-created call record", () => { + const callRecord = makeCallRecord({ callId: "manager-gen-id" }); + const manager = makeManager(callRecord); + + const handler = new RealtimeCallHandler( + baseRealtimeConfig, + manager, + makeProvider(), + null, + ); + + const result = (handler as unknown as { registerCallInManager: (sid: string) => string }) + .registerCallInManager("CA_test"); + + expect(result).toBe("manager-gen-id"); + }); + + it("falls back to providerCallId when manager has no record", () => { + const manager = { + 
processEvent: vi.fn(), + getCallByProviderCallId: vi.fn(() => undefined), + } as unknown as CallManager; + + const handler = new RealtimeCallHandler( + baseRealtimeConfig, + manager, + makeProvider(), + null, + ); + + const result = (handler as unknown as { registerCallInManager: (sid: string) => string }) + .registerCallInManager("CA_fallback"); + + expect(result).toBe("CA_fallback"); + }); + }); + + // --------------------------------------------------------------------------- + // Tool handler framework + // --------------------------------------------------------------------------- + + describe("registerToolHandler", () => { + it("routes tool calls to registered handlers and returns their result", async () => { + const handler = new RealtimeCallHandler( + baseRealtimeConfig, + makeManager(), + makeProvider(), + null, + ); + + handler.registerToolHandler("get_time", async () => ({ utc: "2026-03-10T00:00:00Z" })); + + const fakeSubmit = vi.fn(); + const fakeBridge = { submitToolResult: fakeSubmit } as never; + + await ( + handler as unknown as { + executeToolCall: ( + bridge: never, + callId: string, + bridgeCallId: string, + name: string, + args: unknown, + ) => Promise; + } + ).executeToolCall(fakeBridge, "call-1", "bridge-call-1", "get_time", {}); + + expect(fakeSubmit).toHaveBeenCalledWith("bridge-call-1", { utc: "2026-03-10T00:00:00Z" }); + }); + + it("returns an error result for unregistered tool names", async () => { + const handler = new RealtimeCallHandler( + baseRealtimeConfig, + makeManager(), + makeProvider(), + null, + ); + + const fakeSubmit = vi.fn(); + const fakeBridge = { submitToolResult: fakeSubmit } as never; + + await ( + handler as unknown as { + executeToolCall: ( + bridge: never, + callId: string, + bridgeCallId: string, + name: string, + args: unknown, + ) => Promise; + } + ).executeToolCall(fakeBridge, "call-1", "bridge-call-1", "unknown_tool", {}); + + expect(fakeSubmit).toHaveBeenCalledWith("bridge-call-1", { + error: 'Tool 
"unknown_tool" not available', + }); + }); + + it("returns an error result when a handler throws", async () => { + const handler = new RealtimeCallHandler( + baseRealtimeConfig, + makeManager(), + makeProvider(), + null, + ); + + handler.registerToolHandler("boom", async () => { + throw new Error("handler blew up"); + }); + + const fakeSubmit = vi.fn(); + const fakeBridge = { submitToolResult: fakeSubmit } as never; + + await ( + handler as unknown as { + executeToolCall: ( + bridge: never, + callId: string, + bridgeCallId: string, + name: string, + args: unknown, + ) => Promise; + } + ).executeToolCall(fakeBridge, "call-1", "bridge-call-1", "boom", {}); + + expect(fakeSubmit).toHaveBeenCalledWith("bridge-call-1", { error: "handler blew up" }); + }); + }); +}); From 978f06c19ab249ceec51d3a1df340af2060cac4c Mon Sep 17 00:00:00 2001 From: Forrest Blount Date: Wed, 11 Mar 2026 20:29:34 +0000 Subject: [PATCH 04/17] =?UTF-8?q?voice-call:=20fix=20RealtimeVoice=20type?= =?UTF-8?q?=20and=20default=20=E2=80=94=20align=20with=20Realtime=20API?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove nova/fable/onyx (TTS-1 only), add cedar/marin to match the actual OpenAI Realtime API voice list. Change default voice from 'nova' (rejected by the API) to 'alloy'. 
Co-Authored-By: Claude Sonnet 4.6 --- .../src/providers/openai-realtime-voice.ts | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/extensions/voice-call/src/providers/openai-realtime-voice.ts b/extensions/voice-call/src/providers/openai-realtime-voice.ts index ed5b10d6cd6..ee788904819 100755 --- a/extensions/voice-call/src/providers/openai-realtime-voice.ts +++ b/extensions/voice-call/src/providers/openai-realtime-voice.ts @@ -15,7 +15,7 @@ * const bridge = new OpenAIRealtimeVoiceBridge({ * apiKey: process.env.OPENAI_API_KEY!, * instructions: "You are Gracie, a helpful AI assistant...", - * voice: "nova", + * voice: "alloy", * onAudio: (muLaw) => mediaStreamHandler.sendAudio(streamSid, muLaw), * onClearAudio: () => mediaStreamHandler.clearAudio(streamSid), * onTranscript: (role, text) => console.log(`[${role}]: ${text}`), @@ -38,16 +38,18 @@ import WebSocket from "ws"; // Types // --------------------------------------------------------------------------- -/** OpenAI Realtime API voice options */ +/** OpenAI Realtime API voice options. + * NOTE: These differ from the TTS-1 voices — nova/fable/onyx are NOT supported here. + * Source: live API error "Supported values are: alloy, ash, ballad, cedar, coral, echo, marin, sage, shimmer, verse" + */ export type RealtimeVoice = | "alloy" | "ash" | "ballad" + | "cedar" | "coral" | "echo" - | "fable" - | "onyx" - | "nova" + | "marin" | "sage" | "shimmer" | "verse"; @@ -87,7 +89,7 @@ export interface RealtimeVoiceConfig { // ---- Voice/personality ---- /** System instructions (persona / behaviour) */ instructions?: string; - /** Voice to use for AI speech output (default: "nova") */ + /** Voice to use for AI speech output (default: "alloy") */ voice?: RealtimeVoice; /** Response temperature 0–1 (default: 0.8) */ temperature?: number; @@ -404,7 +406,7 @@ export class OpenAIRealtimeVoiceBridge { session: { modalities: ["text", "audio"], instructions: cfg.instructions, - voice: cfg.voice ?? 
"nova", + voice: cfg.voice ?? "alloy", input_audio_format: "g711_ulaw", output_audio_format: "g711_ulaw", input_audio_transcription: { @@ -720,7 +722,7 @@ export interface RealtimeVoiceProviderConfig { apiKey: string; /** Default model (default: "gpt-4o-mini-realtime-preview") */ model?: string; - /** Default voice (default: "nova") */ + /** Default voice (default: "alloy") */ voice?: RealtimeVoice; /** Default system instructions */ instructions?: string; @@ -802,7 +804,7 @@ export interface MediaStreamHandlerLike { * config: { * apiKey: "...", * instructions: "You are Gracie...", - * voice: "nova", + * voice: "alloy", * onTranscript: (role, text, final) => { * if (final && role === "user") config.onTranscript?.(callId, text); * }, From 03d4fa28caee9ccc67275d585ec49d1dcf1fca50 Mon Sep 17 00:00:00 2001 From: Forrest Blount Date: Wed, 11 Mar 2026 20:53:12 +0000 Subject: [PATCH 05/17] voice-call: thread OpenAI API key through config instead of process.env RealtimeCallHandler now accepts the pre-resolved key from config.streaming.openaiApiKey (passed at construction time in runtime.ts), matching the pattern used by the streaming webhook path. Falls back to OPENAI_API_KEY env if not set in config, same as streaming. 
Co-Authored-By: Claude Sonnet 4.6 --- extensions/voice-call/src/runtime.ts | 1 + extensions/voice-call/src/webhook/realtime-handler.ts | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/extensions/voice-call/src/runtime.ts b/extensions/voice-call/src/runtime.ts index 4069fa3e493..7bad25ee984 100644 --- a/extensions/voice-call/src/runtime.ts +++ b/extensions/voice-call/src/runtime.ts @@ -179,6 +179,7 @@ export async function createVoiceCallRuntime(params: { manager, provider, coreConfig, + config.streaming.openaiApiKey, ); webhookServer.setRealtimeHandler(realtimeHandler); log.info("[voice-call] Realtime voice handler initialized"); diff --git a/extensions/voice-call/src/webhook/realtime-handler.ts b/extensions/voice-call/src/webhook/realtime-handler.ts index 7fd426b7c79..0d409abdd17 100644 --- a/extensions/voice-call/src/webhook/realtime-handler.ts +++ b/extensions/voice-call/src/webhook/realtime-handler.ts @@ -36,6 +36,8 @@ export class RealtimeCallHandler { private manager: CallManager, private provider: VoiceCallProvider, private coreConfig: CoreConfig | null, + /** Pre-resolved OpenAI API key (falls back to OPENAI_API_KEY env at call time) */ + private openaiApiKey?: string, ) {} /** @@ -129,9 +131,9 @@ export class RealtimeCallHandler { callSid: string, ws: WebSocket, ): OpenAIRealtimeVoiceBridge | null { - const apiKey = process.env.OPENAI_API_KEY; + const apiKey = this.openaiApiKey ?? 
process.env.OPENAI_API_KEY; if (!apiKey) { - console.error("[voice-call] No OPENAI_API_KEY for realtime call"); + console.error("[voice-call] No OpenAI API key for realtime call (set streaming.openaiApiKey or OPENAI_API_KEY)"); ws.close(1011, "No API key"); return null; } From 18fc208ee53de464e194ac5d06e0b003da3e2c3b Mon Sep 17 00:00:00 2001 From: Forrest Blount Date: Wed, 11 Mar 2026 21:02:13 +0000 Subject: [PATCH 06/17] voice-call: webhook correctness fixes for realtime path - Export WebhookResponsePayload from webhook.ts; import in realtime-handler.ts to remove the duplicate local definition - Rename isRealtimeMode -> isRealtimeWebSocketUpgrade to clarify it is only used for WS upgrade routing, not HTTP POST routing - Move realtime TwiML intercept to after verifyWebhook so inbound calls are authenticated against the provider signature before the handler responds Co-Authored-By: Claude Sonnet 4.6 --- extensions/voice-call/src/webhook.ts | 25 ++++++++----------- .../src/webhook/realtime-handler.ts | 7 +----- 2 files changed, 12 insertions(+), 20 deletions(-) diff --git a/extensions/voice-call/src/webhook.ts b/extensions/voice-call/src/webhook.ts index 44ab5df373c..a8c3bf0c4e1 100644 --- a/extensions/voice-call/src/webhook.ts +++ b/extensions/voice-call/src/webhook.ts @@ -19,7 +19,7 @@ import type { RealtimeCallHandler } from "./webhook/realtime-handler.js"; const MAX_WEBHOOK_BODY_BYTES = 1024 * 1024; -type WebhookResponsePayload = { +export type WebhookResponsePayload = { statusCode: number; body: string; headers?: Record; @@ -244,7 +244,7 @@ export class VoiceCallWebhookServer { if (this.realtimeHandler || this.mediaStreamHandler) { this.server.on("upgrade", (request, socket, head) => { // Realtime voice takes precedence when the path matches - if (this.realtimeHandler && this.isRealtimeMode(request)) { + if (this.realtimeHandler && this.isRealtimeWebSocketUpgrade(request)) { console.log("[voice-call] WebSocket upgrade for realtime voice"); 
this.realtimeHandler.handleWebSocketUpgrade(request, socket, head); return; @@ -359,7 +359,7 @@ export class VoiceCallWebhookServer { * Returns true for WebSocket upgrade paths that belong to the realtime handler. * Used only for upgrade routing — not for the inbound HTTP webhook POST. */ - private isRealtimeMode(req: http.IncomingMessage): boolean { + private isRealtimeWebSocketUpgrade(req: http.IncomingMessage): boolean { return (req.url ?? "/").includes("/realtime"); } @@ -367,17 +367,6 @@ export class VoiceCallWebhookServer { req: http.IncomingMessage, webhookPath: string, ): Promise { - // Realtime mode: whenever the realtime handler is active, ALL inbound calls - // use it. The handler returns TwiML so Twilio opens a - // WebSocket to the /voice/stream/realtime path, which is routed back here - // via the upgrade handler's isRealtimeMode() check. - if (this.realtimeHandler && req.method === "POST") { - const url = buildRequestUrl(req.url, req.headers.host); - if (this.isWebhookPathMatch(url.pathname, webhookPath)) { - return this.realtimeHandler.buildTwiMLPayload(req); - } - } - const url = buildRequestUrl(req.url, req.headers.host); if (url.pathname === "/voice/hold-music") { @@ -432,6 +421,14 @@ export class VoiceCallWebhookServer { return { statusCode: 401, body: "Unauthorized" }; } + // Realtime mode: return TwiML after verification so + // the request is still authenticated against the provider's signature. + // The WebSocket that Twilio opens in response is routed via the upgrade + // handler's isRealtimeWebSocketUpgrade() check. 
+ if (this.realtimeHandler) { + return this.realtimeHandler.buildTwiMLPayload(req); + } + const parsed = this.provider.parseWebhookEvent(ctx, { verifiedRequestKey: verification.verifiedRequestKey, }); diff --git a/extensions/voice-call/src/webhook/realtime-handler.ts b/extensions/voice-call/src/webhook/realtime-handler.ts index 0d409abdd17..4fe0e071e84 100644 --- a/extensions/voice-call/src/webhook/realtime-handler.ts +++ b/extensions/voice-call/src/webhook/realtime-handler.ts @@ -10,15 +10,10 @@ import { } from "../providers/openai-realtime-voice.js"; import type { VoiceCallProvider } from "../providers/base.js"; import type { NormalizedEvent } from "../types.js"; +import type { WebhookResponsePayload } from "../webhook.js"; export type ToolHandlerFn = (args: unknown, callId: string) => Promise; -type WebhookResponsePayload = { - statusCode: number; - body: string; - headers?: Record; -}; - /** * Handles inbound voice calls bridged directly to the OpenAI Realtime API. * From e554a0ea76ee50099ae26e8885b41ade3ff55e69 Mon Sep 17 00:00:00 2001 From: Forrest Blount Date: Wed, 11 Mar 2026 21:02:19 +0000 Subject: [PATCH 07/17] voice-call: bridge reliability and clarity improvements MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fire drainPendingAudio + onReady in session.created handler instead of a fixed 100ms setTimeout after WS open; onReady is guarded by a sessionReadyFired flag so it fires exactly once (not on reconnects) - Add comment explaining why input_audio_transcription uses whisper-1 (Realtime API only; distinct from streaming.sttModel) - Mark OpenAIRealtimeVoiceProvider, createBridgeForStream, and MediaStreamHandlerLike as @internal — not used by the built-in handler but kept for external consumers Co-Authored-By: Claude Sonnet 4.6 --- .../src/providers/openai-realtime-voice.ts | 28 +++++++++++++------ 1 file changed, 19 insertions(+), 9 deletions(-) diff --git 
a/extensions/voice-call/src/providers/openai-realtime-voice.ts b/extensions/voice-call/src/providers/openai-realtime-voice.ts index ee788904819..97118dc778a 100755 --- a/extensions/voice-call/src/providers/openai-realtime-voice.ts +++ b/extensions/voice-call/src/providers/openai-realtime-voice.ts @@ -199,6 +199,9 @@ export class OpenAIRealtimeVoiceBridge { /** Accumulate tool call arguments (streamed as deltas) */ private toolCallBuffers = new Map(); + /** Guards onReady/greeting so it fires only on the first session, not reconnects */ + private sessionReadyFired = false; + constructor(config: RealtimeVoiceConfig) { if (!config.apiKey) { throw new Error("[RealtimeVoice] OpenAI API key is required"); @@ -333,15 +336,10 @@ export class OpenAIRealtimeVoiceBridge { console.log("[RealtimeVoice] WebSocket connected"); this.connected = true; this.reconnectAttempts = 0; - - // Small delay to ensure the server is ready before sending session.update - // (mirrors the reference implementation's setTimeout(initializeSession, 100)) - setTimeout(() => { - this.sendSessionUpdate(); - this.drainPendingAudio(); - this.config.onReady?.(); - resolve(); - }, 100); + // Send session config immediately — no need to wait; the server + // confirms receipt via session.created which triggers drain + onReady. + this.sendSessionUpdate(); + resolve(); }); this.ws.on("message", (data: Buffer) => { @@ -409,6 +407,9 @@ export class OpenAIRealtimeVoiceBridge { voice: cfg.voice ?? "alloy", input_audio_format: "g711_ulaw", output_audio_format: "g711_ulaw", + // whisper-1 is the only model currently supported by the Realtime API for + // inline user-speech transcription. This is distinct from the streaming + // STT path (streaming.sttModel) which uses gpt-4o-transcribe. 
input_audio_transcription: { model: "whisper-1", }, @@ -491,6 +492,12 @@ export class OpenAIRealtimeVoiceBridge { // ---- Session lifecycle ---- case "session.created": console.log("[RealtimeVoice] Session created"); + this.drainPendingAudio(); + // Fire onReady exactly once — not on reconnects (greeting already played) + if (!this.sessionReadyFired) { + this.sessionReadyFired = true; + this.config.onReady?.(); + } break; case "session.updated": @@ -716,6 +723,7 @@ export class OpenAIRealtimeVoiceBridge { /** * Configuration for the provider factory. * Holds shared/default settings; per-call config is passed to createSession(). + * @internal Not used by the plugin's built-in realtime handler; exposed for external consumers. */ export interface RealtimeVoiceProviderConfig { /** OpenAI API key */ @@ -784,6 +792,7 @@ export class OpenAIRealtimeVoiceProvider { /** * Minimal interface that the bridge integration needs from MediaStreamHandler. * This matches the actual MediaStreamHandler's method signatures. + * @internal Not used by the plugin's built-in realtime handler; exposed for external consumers. */ export interface MediaStreamHandlerLike { sendAudio(streamSid: string, muLaw: Buffer): void; @@ -793,6 +802,7 @@ export interface MediaStreamHandlerLike { /** * Create a RealtimeVoiceBridge wired to an existing MediaStreamHandler session. + * @internal Not used by the plugin's built-in realtime handler; exposed for external consumers. 
* * Drop-in helper for use inside media-stream.ts handleStart(): * From 766c6bc141be64e5337af0335a63251d9594bec5 Mon Sep 17 00:00:00 2001 From: Forrest Blount Date: Wed, 11 Mar 2026 21:03:31 +0000 Subject: [PATCH 08/17] docker-compose: clean up voice webhook port and API key env var - Comment out port 3334 with a note; it is only needed when voice-call serve.bind is 0.0.0.0 for direct inbound access (not the default) - Use ${OPENAI_API_KEY} without :-default so the var is only injected when actually set in the host environment Co-Authored-By: Claude Sonnet 4.6 --- docker-compose.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docker-compose.yml b/docker-compose.yml index 53b4726b556..6859e10c49d 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -24,7 +24,9 @@ services: ports: - "${OPENCLAW_GATEWAY_PORT:-18789}:18789" - "${OPENCLAW_BRIDGE_PORT:-18790}:18790" - - "${OPENCLAW_VOICE_WEBHOOK_PORT:-3334}:3334" + ## Uncomment to expose the voice webhook port (required when voice-call plugin + ## serve.bind is set to 0.0.0.0 for direct inbound webhook access): + # - "${OPENCLAW_VOICE_WEBHOOK_PORT:-3334}:3334" init: true restart: unless-stopped command: From 6327d64070d20b1a0b876f69084113da508f1edc Mon Sep 17 00:00:00 2001 From: Forrest Blount Date: Wed, 11 Mar 2026 21:10:28 +0000 Subject: [PATCH 09/17] voice-call: document realtime mode in README; annotate tested private methods - Add Realtime voice mode section to README covering requirements, config fields, env var overrides, how the bridge works, and networking requirements (HTTPS+WSS endpoint, Docker bind, Tailscale) - Add JSDoc comments to registerCallInManager and executeToolCall explaining why they are tested via 'as unknown as' type assertion Co-Authored-By: Claude Sonnet 4.6 --- extensions/voice-call/README.md | 57 +++++++++++++++++++ .../src/webhook/realtime-handler.ts | 10 ++++ 2 files changed, 67 insertions(+) diff --git a/extensions/voice-call/README.md 
b/extensions/voice-call/README.md index 9acc9aec987..29a66d28540 100644 --- a/extensions/voice-call/README.md +++ b/extensions/voice-call/README.md @@ -172,6 +172,63 @@ Actions: - `voicecall.end` (callId) - `voicecall.status` (callId) +## Realtime voice mode (OpenAI Realtime API) + +Realtime mode routes inbound calls directly to the [OpenAI Realtime API](https://platform.openai.com/docs/guides/realtime) for voice-to-voice conversation (~200–400 ms latency vs ~2–3 s for the STT/TTS pipeline). It is disabled by default and mutually exclusive with `streaming.enabled`. + +### Requirements + +- `OPENAI_API_KEY` set in your environment (or `streaming.openaiApiKey` in config). +- A **publicly reachable HTTPS endpoint with WebSocket support** — the webhook server must accept both POST requests (Twilio webhook) and WebSocket upgrades (Twilio Media Stream). A plain HTTP tunnel is not sufficient; Twilio requires WSS. +- `inboundPolicy` set to `"open"` or `"allowlist"` (not `"disabled"`) so the plugin accepts inbound calls. 
+ +### Config + +```json5 +{ + inboundPolicy: "open", // required: realtime needs inbound calls enabled + + realtime: { + enabled: true, + voice: "alloy", // Realtime API voices: alloy, ash, ballad, cedar, coral, + // echo, marin, sage, shimmer, verse + instructions: "You are a helpful assistant.", + model: "gpt-4o-mini-realtime-preview", // optional, this is the default + temperature: 0.8, // 0–2, optional + vadThreshold: 0.5, // voice activity detection sensitivity, 0–1, optional + silenceDurationMs: 500, // ms of silence before end-of-turn, optional + }, +} +``` + +### Environment variable overrides + +All `realtime.*` fields can be set via environment variables (config takes precedence): + +| Env var | Config field | +|---|---| +| `REALTIME_VOICE_ENABLED=true` | `realtime.enabled` | +| `REALTIME_VOICE_MODEL` | `realtime.model` | +| `REALTIME_VOICE_VOICE` | `realtime.voice` | +| `REALTIME_VOICE_INSTRUCTIONS` | `realtime.instructions` | +| `REALTIME_VOICE_TEMPERATURE` | `realtime.temperature` | +| `VAD_THRESHOLD` | `realtime.vadThreshold` | +| `SILENCE_DURATION_MS` | `realtime.silenceDurationMs` | + +### How it works + +1. Twilio sends a POST webhook to `serve.path` (default `/voice/webhook`). +2. The plugin responds with TwiML `` pointing to `wss:///voice/stream/realtime`. +3. Twilio opens a WebSocket to that path carrying the caller's audio in μ-law format. +4. The plugin bridges the WebSocket to the OpenAI Realtime API — audio flows in both directions in real time. +5. The call is registered with CallManager and appears in `openclaw voice status` / `openclaw voice history`. + +### Networking notes + +- `serve.bind` defaults to `127.0.0.1`. If running inside Docker with an external port mapping, set `serve.bind: "0.0.0.0"` so the container's port is reachable from the host. +- The WebSocket upgrade path (`/voice/stream/realtime`) must be reachable on the same host and port as the webhook. Reverse proxies must pass `Upgrade: websocket` headers through. 
+- When using Tailscale Funnel on the host (outside Docker), configure Funnel to route `/voice/` to the plugin's local port. The gateway itself does not need to be exposed via Tailscale Funnel. + ## Notes - Uses webhook signature verification for Twilio/Telnyx/Plivo. diff --git a/extensions/voice-call/src/webhook/realtime-handler.ts b/extensions/voice-call/src/webhook/realtime-handler.ts index 4fe0e071e84..d459e100aa7 100644 --- a/extensions/voice-call/src/webhook/realtime-handler.ts +++ b/extensions/voice-call/src/webhook/realtime-handler.ts @@ -222,6 +222,9 @@ export class RealtimeCallHandler { /** * Emit synthetic NormalizedEvents to register the call with CallManager. * Returns the internal callId generated by the manager. + * + * Tested directly via `as unknown as` cast — the logic is non-trivial + * enough to warrant unit testing without promoting to a public method. */ private registerCallInManager(callSid: string): string { const now = Date.now(); @@ -269,6 +272,13 @@ export class RealtimeCallHandler { }); } + /** + * Dispatch a tool call from the Realtime API to the registered handler. + * Submits the result (or an error object) back to the bridge. + * + * Tested directly via `as unknown as` cast — the routing/error logic is + * worth unit testing without exposing the method publicly. + */ private async executeToolCall( bridge: OpenAIRealtimeVoiceBridge, callId: string, From fa267d6966a1065e58ec21966ed96180790c2523 Mon Sep 17 00:00:00 2001 From: Forrest Blount Date: Wed, 11 Mar 2026 21:17:39 +0000 Subject: [PATCH 10/17] voice-call: add one-time nonce to realtime WebSocket upgrade path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issue a UUID stream token in buildTwiMLPayload and embed it in the URL (?token=). handleWebSocketUpgrade validates and consumes the token before accepting the upgrade — connections without a valid token receive 401 and the socket is destroyed. 
The token is only issued after verifyWebhook passes on the initial POST, so the authentication chain is: Twilio HMAC signature check → TwiML with nonce → WS upgrade validates nonce. Prevents unauthenticated connections from triggering OpenAI Realtime API sessions. Tokens expire after 30 s; expired entries are evicted on next issue. Co-Authored-By: Claude Sonnet 4.6 --- .../src/webhook/realtime-handler.test.ts | 56 ++++++++++++++++++- .../src/webhook/realtime-handler.ts | 48 ++++++++++++++-- 2 files changed, 98 insertions(+), 6 deletions(-) diff --git a/extensions/voice-call/src/webhook/realtime-handler.test.ts b/extensions/voice-call/src/webhook/realtime-handler.test.ts index f4ad179d441..755b21cc7cb 100644 --- a/extensions/voice-call/src/webhook/realtime-handler.test.ts +++ b/extensions/voice-call/src/webhook/realtime-handler.test.ts @@ -5,6 +5,12 @@ import type { CallRecord } from "../types.js"; import type { VoiceCallProvider } from "../providers/base.js"; import { RealtimeCallHandler } from "./realtime-handler.js"; +/** Extract the stream token from a TwiML body string. */ +function extractStreamToken(twiml: string): string | null { + const match = twiml.match(/\?token=([^"&\s]+)/); + return match?.[1] ?? 
null; +} + // Minimal realtime config used across tests const baseRealtimeConfig = { enabled: true, @@ -93,7 +99,7 @@ describe("RealtimeCallHandler", () => { expect(payload.headers?.["Content-Type"]).toBe("text/xml"); expect(payload.body).toContain(""); expect(payload.body).toContain(" { @@ -106,7 +112,53 @@ describe("RealtimeCallHandler", () => { const req = makeRequest("/voice/webhook", ""); const payload = handler.buildTwiMLPayload(req); - expect(payload.body).toContain("wss://localhost:8443/voice/stream/realtime"); + expect(payload.body).toContain("wss://localhost:8443/voice/stream/realtime?token="); + }); + + it("embeds a unique token on each call", () => { + const handler = new RealtimeCallHandler( + baseRealtimeConfig, + makeManager(), + makeProvider(), + null, + ); + const req = makeRequest("/voice/webhook", "host.example.com"); + const token1 = extractStreamToken(handler.buildTwiMLPayload(req).body); + const token2 = extractStreamToken(handler.buildTwiMLPayload(req).body); + expect(token1).toBeTruthy(); + expect(token2).toBeTruthy(); + expect(token1).not.toBe(token2); + }); + }); + + // --------------------------------------------------------------------------- + // Stream token (nonce) validation + // --------------------------------------------------------------------------- + + describe("stream token (nonce)", () => { + it("issueStreamToken + consumeStreamToken: valid token accepted once then rejected", () => { + const handler = new RealtimeCallHandler( + baseRealtimeConfig, + makeManager(), + makeProvider(), + null, + ); + const issue = (handler as unknown as { issueStreamToken: () => string }).issueStreamToken; + const consume = (handler as unknown as { consumeStreamToken: (t: string) => boolean }).consumeStreamToken; + const token = issue.call(handler); + expect(consume.call(handler, token)).toBe(true); + expect(consume.call(handler, token)).toBe(false); + }); + + it("rejects unknown tokens", () => { + const handler = new RealtimeCallHandler( + 
baseRealtimeConfig, + makeManager(), + makeProvider(), + null, + ); + const consume = (handler as unknown as { consumeStreamToken: (t: string) => boolean }).consumeStreamToken; + expect(consume.call(handler, "not-a-real-token")).toBe(false); }); }); diff --git a/extensions/voice-call/src/webhook/realtime-handler.ts b/extensions/voice-call/src/webhook/realtime-handler.ts index d459e100aa7..25635a30172 100644 --- a/extensions/voice-call/src/webhook/realtime-handler.ts +++ b/extensions/voice-call/src/webhook/realtime-handler.ts @@ -1,4 +1,5 @@ import http from "node:http"; +import { randomUUID } from "node:crypto"; import type { Duplex } from "node:stream"; import { type WebSocket, Server as WebSocketServer } from "ws"; import type { VoiceCallRealtimeConfig } from "../config.js"; @@ -23,8 +24,13 @@ export type ToolHandlerFn = (args: unknown, callId: string) => Promise; * - Register each call with CallManager (appears in voice status/history) * - Route tool calls to registered handlers (Phase 5 tool framework) */ +/** How long (ms) a stream token remains valid after TwiML is issued. */ +const STREAM_TOKEN_TTL_MS = 30_000; + export class RealtimeCallHandler { private toolHandlers = new Map(); + /** One-time tokens issued per TwiML response; consumed on WS upgrade. */ + private pendingStreamTokens = new Map(); constructor( private config: VoiceCallRealtimeConfig, @@ -37,9 +43,22 @@ export class RealtimeCallHandler { /** * Handle a WebSocket upgrade request from Twilio for a realtime media stream. - * Called from VoiceCallWebhookServer's upgrade handler when isRealtimeMode() is true. + * Called from VoiceCallWebhookServer's upgrade handler when isRealtimeWebSocketUpgrade() is true. + * + * Validates the one-time stream token embedded in the URL by buildTwiMLPayload before + * accepting the upgrade. This ensures the WS connection was preceded by a properly + * Twilio-signed POST webhook — the token is only issued after verifyWebhook passes. 
*/ handleWebSocketUpgrade(request: http.IncomingMessage, socket: Duplex, head: Buffer): void { + const url = new URL(request.url ?? "/", "wss://localhost"); + const token = url.searchParams.get("token"); + if (!token || !this.consumeStreamToken(token)) { + console.warn("[voice-call] Rejecting WS upgrade: missing or invalid stream token"); + socket.write("HTTP/1.1 401 Unauthorized\r\n\r\n"); + socket.destroy(); + return; + } + const wss = new WebSocketServer({ noServer: true }); wss.handleUpgrade(request, socket, head, (ws) => { let bridge: OpenAIRealtimeVoiceBridge | null = null; @@ -81,12 +100,14 @@ export class RealtimeCallHandler { /** * Build the TwiML response payload for a realtime call. * The WebSocket URL is derived from the incoming request host so no hostname - * is hardcoded. + * is hardcoded. A one-time stream token is embedded in the URL and validated + * by handleWebSocketUpgrade to prevent unauthenticated WS connections. */ buildTwiMLPayload(req: http.IncomingMessage): WebhookResponsePayload { const host = req.headers.host || "localhost:8443"; - const wsUrl = `wss://${host}/voice/stream/realtime`; - console.log(`[voice-call] Returning realtime TwiML with WebSocket: ${wsUrl}`); + const token = this.issueStreamToken(); + const wsUrl = `wss://${host}/voice/stream/realtime?token=${token}`; + console.log(`[voice-call] Returning realtime TwiML with WebSocket: wss://${host}/voice/stream/realtime`); const twiml = ` @@ -116,6 +137,25 @@ export class RealtimeCallHandler { // Private // --------------------------------------------------------------------------- + /** Generate a single-use stream token valid for STREAM_TOKEN_TTL_MS. 
*/ + private issueStreamToken(): string { + const token = randomUUID(); + this.pendingStreamTokens.set(token, Date.now() + STREAM_TOKEN_TTL_MS); + // Evict expired tokens to prevent unbounded growth if calls are abandoned + for (const [t, expiry] of this.pendingStreamTokens) { + if (Date.now() > expiry) this.pendingStreamTokens.delete(t); + } + return token; + } + + /** Consume a stream token. Returns true if valid and not yet used. */ + private consumeStreamToken(token: string): boolean { + const expiry = this.pendingStreamTokens.get(token); + if (expiry === undefined) return false; + this.pendingStreamTokens.delete(token); + return Date.now() <= expiry; + } + /** * Create and start the OpenAI Realtime bridge for a single call session. * Registers the call with CallManager so it appears in status/history. From cc0077e71278042dcd206ec0c65334af587d91f7 Mon Sep 17 00:00:00 2001 From: Forrest Blount Date: Wed, 11 Mar 2026 21:28:47 +0000 Subject: [PATCH 11/17] voice-call: document Twilio coupling and future adapter refactor path Co-Authored-By: Claude Sonnet 4.6 --- extensions/voice-call/src/webhook/realtime-handler.ts | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/extensions/voice-call/src/webhook/realtime-handler.ts b/extensions/voice-call/src/webhook/realtime-handler.ts index 25635a30172..b4802b9c7d9 100644 --- a/extensions/voice-call/src/webhook/realtime-handler.ts +++ b/extensions/voice-call/src/webhook/realtime-handler.ts @@ -23,6 +23,14 @@ export type ToolHandlerFn = (args: unknown, callId: string) => Promise; * - Return TwiML payload for the initial HTTP webhook * - Register each call with CallManager (appears in voice status/history) * - Route tool calls to registered handlers (Phase 5 tool framework) + * + * Provider coupling: this class currently speaks the Twilio Media Streams + * WebSocket protocol directly (μ-law audio, streamSid/callSid, start/media/ + * mark/stop events) and emits Twilio TwiML. 
The OpenAI bridge itself is + * provider-agnostic. If a second provider needs realtime support, the right + * refactor is to extract a RealtimeMediaAdapter interface (buildStreamPayload + + * handleWebSocketUpgrade + MediaStreamCallbacks) so the bridge and call-manager + * wiring can be reused without duplicating the OpenAI session logic. */ /** How long (ms) a stream token remains valid after TwiML is issued. */ const STREAM_TOKEN_TTL_MS = 30_000; From d457ac8fe4da5a06d191a695dfbdd5972aa0ffa3 Mon Sep 17 00:00:00 2001 From: Forrest Blount Date: Wed, 11 Mar 2026 21:45:51 +0000 Subject: [PATCH 12/17] voice-call: fix docker-compose scope and constant placement - Reset docker-compose.yml to main and apply only the intended change: commented-out voice webhook port with instructions for when to enable - Move STREAM_TOKEN_TTL_MS above the class JSDoc where it belongs Co-Authored-By: Claude Sonnet 4.6 --- extensions/voice-call/src/webhook/realtime-handler.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/extensions/voice-call/src/webhook/realtime-handler.ts b/extensions/voice-call/src/webhook/realtime-handler.ts index b4802b9c7d9..8497e58766c 100644 --- a/extensions/voice-call/src/webhook/realtime-handler.ts +++ b/extensions/voice-call/src/webhook/realtime-handler.ts @@ -32,6 +32,7 @@ export type ToolHandlerFn = (args: unknown, callId: string) => Promise; * handleWebSocketUpgrade + MediaStreamCallbacks) so the bridge and call-manager * wiring can be reused without duplicating the OpenAI session logic. */ + /** How long (ms) a stream token remains valid after TwiML is issued. 
*/ const STREAM_TOKEN_TTL_MS = 30_000; From 1ea1695edc91c9a1e83b2be2e911b8d84c619cd7 Mon Sep 17 00:00:00 2001 From: Forrest Blount Date: Thu, 12 Mar 2026 03:55:40 +0000 Subject: [PATCH 13/17] voice-call: fix three realtime review bugs - Remove direct onClose call from bridge.close() to prevent double-fire (the ws close event handler is the single source of truth for onClose) - Gate realtime TwiML intercept on CallStatus=ringing + Direction=inbound so outbound status callbacks and completed events fall through to normal webhook pipeline - Thread caller From/To from HTTP POST body through stream token nonce map to registerCallInManager so inboundPolicy allowlist checks have caller identity on synthetic call.initiated/call.answered events Co-Authored-By: Claude Sonnet 4.6 --- .../src/providers/openai-realtime-voice.ts | 3 +- extensions/voice-call/src/webhook.ts | 14 ++++-- .../src/webhook/realtime-handler.ts | 48 ++++++++++++------- 3 files changed, 42 insertions(+), 23 deletions(-) diff --git a/extensions/voice-call/src/providers/openai-realtime-voice.ts b/extensions/voice-call/src/providers/openai-realtime-voice.ts index 97118dc778a..60ac63375c1 100755 --- a/extensions/voice-call/src/providers/openai-realtime-voice.ts +++ b/extensions/voice-call/src/providers/openai-realtime-voice.ts @@ -299,7 +299,8 @@ export class OpenAIRealtimeVoiceBridge { this.ws.close(1000, "Bridge closed"); this.ws = null; } - this.config.onClose?.(); + // onClose fires from the ws "close" event handler (intentionallyClosed branch) + // to avoid double-firing on explicit close(). } /** True if the WebSocket is open and the session is configured. 
*/ diff --git a/extensions/voice-call/src/webhook.ts b/extensions/voice-call/src/webhook.ts index a8c3bf0c4e1..fea71e83090 100644 --- a/extensions/voice-call/src/webhook.ts +++ b/extensions/voice-call/src/webhook.ts @@ -421,12 +421,16 @@ export class VoiceCallWebhookServer { return { statusCode: 401, body: "Unauthorized" }; } - // Realtime mode: return TwiML after verification so - // the request is still authenticated against the provider's signature. - // The WebSocket that Twilio opens in response is routed via the upgrade - // handler's isRealtimeWebSocketUpgrade() check. + // Realtime mode: return TwiML for inbound ringing calls only. + // Status callbacks (CallStatus=completed, outbound calls, etc.) must fall + // through to the normal webhook pipeline so call state is updated correctly. if (this.realtimeHandler) { - return this.realtimeHandler.buildTwiMLPayload(req); + const params = new URLSearchParams(ctx.rawBody); + const callStatus = params.get("CallStatus"); + const direction = params.get("Direction"); + if (callStatus === "ringing" && (!direction || direction === "inbound")) { + return this.realtimeHandler.buildTwiMLPayload(req, params); + } } const parsed = this.provider.parseWebhookEvent(ctx, { diff --git a/extensions/voice-call/src/webhook/realtime-handler.ts b/extensions/voice-call/src/webhook/realtime-handler.ts index 8497e58766c..304ee547384 100644 --- a/extensions/voice-call/src/webhook/realtime-handler.ts +++ b/extensions/voice-call/src/webhook/realtime-handler.ts @@ -38,8 +38,9 @@ const STREAM_TOKEN_TTL_MS = 30_000; export class RealtimeCallHandler { private toolHandlers = new Map(); - /** One-time tokens issued per TwiML response; consumed on WS upgrade. */ - private pendingStreamTokens = new Map(); + /** One-time tokens issued per TwiML response; consumed on WS upgrade. + * Stores expiry + caller metadata so registerCallInManager can include From/To. 
*/ + private pendingStreamTokens = new Map(); constructor( private config: VoiceCallRealtimeConfig, @@ -61,7 +62,8 @@ export class RealtimeCallHandler { handleWebSocketUpgrade(request: http.IncomingMessage, socket: Duplex, head: Buffer): void { const url = new URL(request.url ?? "/", "wss://localhost"); const token = url.searchParams.get("token"); - if (!token || !this.consumeStreamToken(token)) { + const callerMeta = token ? this.consumeStreamToken(token) : null; + if (!callerMeta) { console.warn("[voice-call] Rejecting WS upgrade: missing or invalid stream token"); socket.write("HTTP/1.1 401 Unauthorized\r\n\r\n"); socket.destroy(); @@ -81,7 +83,7 @@ export class RealtimeCallHandler { const startData = msg.start as Record | undefined; const streamSid = startData?.streamSid || "unknown"; const callSid = startData?.callSid || "unknown"; - bridge = this.handleCall(streamSid, callSid, ws); + bridge = this.handleCall(streamSid, callSid, ws, callerMeta); } else if (bridge) { const mediaData = msg.media as Record | undefined; if (msg.event === "media" && mediaData?.payload) { @@ -111,10 +113,16 @@ export class RealtimeCallHandler { * The WebSocket URL is derived from the incoming request host so no hostname * is hardcoded. A one-time stream token is embedded in the URL and validated * by handleWebSocketUpgrade to prevent unauthenticated WS connections. + * + * @param params - Parsed Twilio webhook body params (From/To stored with nonce + * so registerCallInManager can populate caller fields). */ - buildTwiMLPayload(req: http.IncomingMessage): WebhookResponsePayload { + buildTwiMLPayload(req: http.IncomingMessage, params?: URLSearchParams): WebhookResponsePayload { const host = req.headers.host || "localhost:8443"; - const token = this.issueStreamToken(); + const token = this.issueStreamToken({ + from: params?.get("From") ?? undefined, + to: params?.get("To") ?? 
undefined, + }); const wsUrl = `wss://${host}/voice/stream/realtime?token=${token}`; console.log(`[voice-call] Returning realtime TwiML with WebSocket: wss://${host}/voice/stream/realtime`); const twiml = ` @@ -147,22 +155,22 @@ export class RealtimeCallHandler { // --------------------------------------------------------------------------- /** Generate a single-use stream token valid for STREAM_TOKEN_TTL_MS. */ - private issueStreamToken(): string { + private issueStreamToken(meta: { from?: string; to?: string } = {}): string { const token = randomUUID(); - this.pendingStreamTokens.set(token, Date.now() + STREAM_TOKEN_TTL_MS); + this.pendingStreamTokens.set(token, { expiry: Date.now() + STREAM_TOKEN_TTL_MS, ...meta }); // Evict expired tokens to prevent unbounded growth if calls are abandoned - for (const [t, expiry] of this.pendingStreamTokens) { - if (Date.now() > expiry) this.pendingStreamTokens.delete(t); + for (const [t, entry] of this.pendingStreamTokens) { + if (Date.now() > entry.expiry) this.pendingStreamTokens.delete(t); } return token; } - /** Consume a stream token. Returns true if valid and not yet used. */ - private consumeStreamToken(token: string): boolean { - const expiry = this.pendingStreamTokens.get(token); - if (expiry === undefined) return false; + /** Consume a stream token. Returns caller metadata if valid, null if not. */ + private consumeStreamToken(token: string): { from?: string; to?: string } | null { + const entry = this.pendingStreamTokens.get(token); + if (!entry) return null; this.pendingStreamTokens.delete(token); - return Date.now() <= expiry; + return Date.now() <= entry.expiry ? { from: entry.from, to: entry.to } : null; } /** @@ -174,6 +182,7 @@ export class RealtimeCallHandler { streamSid: string, callSid: string, ws: WebSocket, + callerMeta: { from?: string; to?: string }, ): OpenAIRealtimeVoiceBridge | null { const apiKey = this.openaiApiKey ?? 
process.env.OPENAI_API_KEY; if (!apiKey) { @@ -182,7 +191,7 @@ export class RealtimeCallHandler { return null; } - const callId = this.registerCallInManager(callSid); + const callId = this.registerCallInManager(callSid, callerMeta); console.log(`[voice-call] Realtime call: streamSid=${streamSid}, callSid=${callSid}, callId=${callId}`); // Declare as null first so closures can capture the reference before bridge is created. @@ -275,12 +284,17 @@ export class RealtimeCallHandler { * Tested directly via `as unknown as` cast — the logic is non-trivial * enough to warrant unit testing without promoting to a public method. */ - private registerCallInManager(callSid: string): string { + private registerCallInManager( + callSid: string, + callerMeta: { from?: string; to?: string } = {}, + ): string { const now = Date.now(); const baseFields = { providerCallId: callSid, timestamp: now, direction: "inbound" as const, + ...(callerMeta.from ? { from: callerMeta.from } : {}), + ...(callerMeta.to ? { to: callerMeta.to } : {}), }; // call.initiated causes the manager to auto-create the call record From 70bd1ffe927450af586044ffdbe3f96315ab2e61 Mon Sep 17 00:00:00 2001 From: Forrest Blount Date: Thu, 12 Mar 2026 17:43:10 +0000 Subject: [PATCH 14/17] voice-call: fix tsgo type errors and formatting - Import WebSocketServer by named export (not Server alias) from ws - Add input_audio_transcription to RealtimeSessionUpdate interface - Fix z.record() call to pass explicit key schema (zod strict compat) - Cast DeepPartial tools to RealtimeToolConfig[] in normalizeVoiceCallConfig - Update consumeStreamToken test casts to reflect new nullable return type - Apply oxfmt formatting across voice-call extension files Co-Authored-By: Claude Sonnet 4.6 --- extensions/voice-call/README.md | 30 ++++----- extensions/voice-call/openclaw.plugin.json | 13 +++- extensions/voice-call/src/config.test.ts | 13 +++- extensions/voice-call/src/config.ts | 27 ++++++-- .../src/providers/openai-realtime-voice.ts 
| 8 +-- extensions/voice-call/src/runtime.ts | 2 +- extensions/voice-call/src/webhook.ts | 2 +- .../src/webhook/realtime-handler.test.ts | 62 +++++++++---------- .../src/webhook/realtime-handler.ts | 18 ++++-- 9 files changed, 106 insertions(+), 69 deletions(-) diff --git a/extensions/voice-call/README.md b/extensions/voice-call/README.md index 29a66d28540..6019d87661f 100644 --- a/extensions/voice-call/README.md +++ b/extensions/voice-call/README.md @@ -186,16 +186,16 @@ Realtime mode routes inbound calls directly to the [OpenAI Realtime API](https:/ ```json5 { - inboundPolicy: "open", // required: realtime needs inbound calls enabled + inboundPolicy: "open", // required: realtime needs inbound calls enabled realtime: { enabled: true, - voice: "alloy", // Realtime API voices: alloy, ash, ballad, cedar, coral, - // echo, marin, sage, shimmer, verse + voice: "alloy", // Realtime API voices: alloy, ash, ballad, cedar, coral, + // echo, marin, sage, shimmer, verse instructions: "You are a helpful assistant.", - model: "gpt-4o-mini-realtime-preview", // optional, this is the default - temperature: 0.8, // 0–2, optional - vadThreshold: 0.5, // voice activity detection sensitivity, 0–1, optional + model: "gpt-4o-mini-realtime-preview", // optional, this is the default + temperature: 0.8, // 0–2, optional + vadThreshold: 0.5, // voice activity detection sensitivity, 0–1, optional silenceDurationMs: 500, // ms of silence before end-of-turn, optional }, } @@ -205,15 +205,15 @@ Realtime mode routes inbound calls directly to the [OpenAI Realtime API](https:/ All `realtime.*` fields can be set via environment variables (config takes precedence): -| Env var | Config field | -|---|---| -| `REALTIME_VOICE_ENABLED=true` | `realtime.enabled` | -| `REALTIME_VOICE_MODEL` | `realtime.model` | -| `REALTIME_VOICE_VOICE` | `realtime.voice` | -| `REALTIME_VOICE_INSTRUCTIONS` | `realtime.instructions` | -| `REALTIME_VOICE_TEMPERATURE` | `realtime.temperature` | -| `VAD_THRESHOLD` | 
`realtime.vadThreshold` | -| `SILENCE_DURATION_MS` | `realtime.silenceDurationMs` | +| Env var | Config field | +| ----------------------------- | ---------------------------- | +| `REALTIME_VOICE_ENABLED=true` | `realtime.enabled` | +| `REALTIME_VOICE_MODEL` | `realtime.model` | +| `REALTIME_VOICE_VOICE` | `realtime.voice` | +| `REALTIME_VOICE_INSTRUCTIONS` | `realtime.instructions` | +| `REALTIME_VOICE_TEMPERATURE` | `realtime.temperature` | +| `VAD_THRESHOLD` | `realtime.vadThreshold` | +| `SILENCE_DURATION_MS` | `realtime.silenceDurationMs` | ### How it works diff --git a/extensions/voice-call/openclaw.plugin.json b/extensions/voice-call/openclaw.plugin.json index 83f2613530d..4a2a5f94433 100644 --- a/extensions/voice-call/openclaw.plugin.json +++ b/extensions/voice-call/openclaw.plugin.json @@ -427,7 +427,18 @@ }, "voice": { "type": "string", - "enum": ["alloy", "ash", "ballad", "cedar", "coral", "echo", "marin", "sage", "shimmer", "verse"] + "enum": [ + "alloy", + "ash", + "ballad", + "cedar", + "coral", + "echo", + "marin", + "sage", + "shimmer", + "verse" + ] }, "instructions": { "type": "string" diff --git a/extensions/voice-call/src/config.test.ts b/extensions/voice-call/src/config.test.ts index 899a12af5a7..ceef8d57d9a 100644 --- a/extensions/voice-call/src/config.test.ts +++ b/extensions/voice-call/src/config.test.ts @@ -226,7 +226,18 @@ describe("VoiceCallRealtimeConfigSchema", () => { }); it("accepts all valid Realtime API voice names", () => { - const voices = ["alloy", "ash", "ballad", "cedar", "coral", "echo", "marin", "sage", "shimmer", "verse"]; + const voices = [ + "alloy", + "ash", + "ballad", + "cedar", + "coral", + "echo", + "marin", + "sage", + "shimmer", + "verse", + ]; for (const voice of voices) { expect(() => VoiceCallRealtimeConfigSchema.parse({ voice })).not.toThrow(); } diff --git a/extensions/voice-call/src/config.ts b/extensions/voice-call/src/config.ts index 7d75c23ac29..2ace2283608 100644 --- a/extensions/voice-call/src/config.ts 
+++ b/extensions/voice-call/src/config.ts @@ -215,7 +215,7 @@ export const RealtimeToolSchema = z description: z.string(), parameters: z.object({ type: z.literal("object"), - properties: z.record(z.unknown()), + properties: z.record(z.string(), z.unknown()), required: z.array(z.string()).optional(), }), }) @@ -230,7 +230,18 @@ export const VoiceCallRealtimeConfigSchema = z model: z.string().optional(), /** Voice for AI speech output (env: REALTIME_VOICE_VOICE) */ voice: z - .enum(["alloy", "ash", "ballad", "cedar", "coral", "echo", "marin", "sage", "shimmer", "verse"]) + .enum([ + "alloy", + "ash", + "ballad", + "cedar", + "coral", + "echo", + "marin", + "sage", + "shimmer", + "verse", + ]) .optional(), /** System instructions / persona (env: REALTIME_VOICE_INSTRUCTIONS) */ instructions: z.string().optional(), @@ -453,7 +464,10 @@ export function normalizeVoiceCallConfig(config: VoiceCallConfigInput): VoiceCal realtime: { ...defaults.realtime, ...config.realtime, - tools: config.realtime?.tools ?? defaults.realtime.tools, + // Cast: DeepPartial makes tool fields appear optional in the input type, + // but Zod validates the full shape before it reaches here. + tools: + (config.realtime?.tools as RealtimeToolConfig[] | undefined) ?? defaults.realtime.tools, }, stt: { ...defaults.stt, ...config.stt }, tts: normalizeVoiceCallTtsConfig(defaults.tts, config.tts), @@ -519,7 +533,8 @@ export function resolveVoiceCallConfig(config: VoiceCallConfigInput): VoiceCallC resolved.realtime.model = resolved.realtime.model ?? process.env.REALTIME_VOICE_MODEL; resolved.realtime.voice = (resolved.realtime.voice ?? - (process.env.REALTIME_VOICE_VOICE as VoiceCallRealtimeConfig["voice"])) || undefined; + (process.env.REALTIME_VOICE_VOICE as VoiceCallRealtimeConfig["voice"])) || + undefined; resolved.realtime.instructions = resolved.realtime.instructions ?? 
process.env.REALTIME_VOICE_INSTRUCTIONS; if (resolved.realtime.temperature == null && process.env.REALTIME_VOICE_TEMPERATURE) { @@ -605,8 +620,8 @@ export function validateProviderConfig(config: VoiceCallConfig): { // "open" or "allowlist" are the correct choices when realtime.enabled = true. if (config.realtime?.enabled && config.inboundPolicy === "disabled") { errors.push( - "plugins.entries.voice-call.config.inboundPolicy must not be \"disabled\" when realtime.enabled is true " + - "(use \"open\" or \"allowlist\" — realtime calls are answered before policy can reject them)", + 'plugins.entries.voice-call.config.inboundPolicy must not be "disabled" when realtime.enabled is true ' + + '(use "open" or "allowlist" — realtime calls are answered before policy can reject them)', ); } diff --git a/extensions/voice-call/src/providers/openai-realtime-voice.ts b/extensions/voice-call/src/providers/openai-realtime-voice.ts index 60ac63375c1..42f7b7949f1 100755 --- a/extensions/voice-call/src/providers/openai-realtime-voice.ts +++ b/extensions/voice-call/src/providers/openai-realtime-voice.ts @@ -450,9 +450,7 @@ export class OpenAIRealtimeVoiceBridge { private async attemptReconnect(): Promise { if (this.intentionallyClosed) return; - if ( - this.reconnectAttempts >= OpenAIRealtimeVoiceBridge.MAX_RECONNECT_ATTEMPTS - ) { + if (this.reconnectAttempts >= OpenAIRealtimeVoiceBridge.MAX_RECONNECT_ATTEMPTS) { const err = new Error( `[RealtimeVoice] Max reconnect attempts (${OpenAIRealtimeVoiceBridge.MAX_RECONNECT_ATTEMPTS}) exceeded`, ); @@ -464,8 +462,7 @@ export class OpenAIRealtimeVoiceBridge { this.reconnectAttempts++; const delay = - OpenAIRealtimeVoiceBridge.BASE_RECONNECT_DELAY_MS * - 2 ** (this.reconnectAttempts - 1); + OpenAIRealtimeVoiceBridge.BASE_RECONNECT_DELAY_MS * 2 ** (this.reconnectAttempts - 1); console.log( `[RealtimeVoice] Reconnecting (${this.reconnectAttempts}/${OpenAIRealtimeVoiceBridge.MAX_RECONNECT_ATTEMPTS}) in ${delay}ms...`, @@ -870,6 +867,7 @@ 
interface RealtimeSessionUpdate { create_response: boolean; }; temperature: number; + input_audio_transcription?: { model: string }; tools?: RealtimeTool[]; tool_choice?: string; }; diff --git a/extensions/voice-call/src/runtime.ts b/extensions/voice-call/src/runtime.ts index 7bad25ee984..7da90634774 100644 --- a/extensions/voice-call/src/runtime.ts +++ b/extensions/voice-call/src/runtime.ts @@ -11,8 +11,8 @@ import type { TelephonyTtsRuntime } from "./telephony-tts.js"; import { createTelephonyTtsProvider } from "./telephony-tts.js"; import { startTunnel, type TunnelResult } from "./tunnel.js"; import { VoiceCallWebhookServer } from "./webhook.js"; -import { cleanupTailscaleExposure, setupTailscaleExposure } from "./webhook/tailscale.js"; import { RealtimeCallHandler } from "./webhook/realtime-handler.js"; +import { cleanupTailscaleExposure, setupTailscaleExposure } from "./webhook/tailscale.js"; export type VoiceCallRuntime = { config: VoiceCallConfig; diff --git a/extensions/voice-call/src/webhook.ts b/extensions/voice-call/src/webhook.ts index fea71e83090..be3cce5f399 100644 --- a/extensions/voice-call/src/webhook.ts +++ b/extensions/voice-call/src/webhook.ts @@ -14,8 +14,8 @@ import type { VoiceCallProvider } from "./providers/base.js"; import { OpenAIRealtimeSTTProvider } from "./providers/stt-openai-realtime.js"; import type { TwilioProvider } from "./providers/twilio.js"; import type { NormalizedEvent, WebhookContext } from "./types.js"; -import { startStaleCallReaper } from "./webhook/stale-call-reaper.js"; import type { RealtimeCallHandler } from "./webhook/realtime-handler.js"; +import { startStaleCallReaper } from "./webhook/stale-call-reaper.js"; const MAX_WEBHOOK_BODY_BYTES = 1024 * 1024; diff --git a/extensions/voice-call/src/webhook/realtime-handler.test.ts b/extensions/voice-call/src/webhook/realtime-handler.test.ts index 755b21cc7cb..07107f08601 100644 --- a/extensions/voice-call/src/webhook/realtime-handler.test.ts +++ 
b/extensions/voice-call/src/webhook/realtime-handler.test.ts @@ -1,8 +1,8 @@ import http from "node:http"; import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; import type { CallManager } from "../manager.js"; -import type { CallRecord } from "../types.js"; import type { VoiceCallProvider } from "../providers/base.js"; +import type { CallRecord } from "../types.js"; import { RealtimeCallHandler } from "./realtime-handler.js"; /** Extract the stream token from a TwiML body string. */ @@ -144,10 +144,14 @@ describe("RealtimeCallHandler", () => { null, ); const issue = (handler as unknown as { issueStreamToken: () => string }).issueStreamToken; - const consume = (handler as unknown as { consumeStreamToken: (t: string) => boolean }).consumeStreamToken; + const consume = ( + handler as unknown as { + consumeStreamToken: (t: string) => { from?: string; to?: string } | null; + } + ).consumeStreamToken; const token = issue.call(handler); - expect(consume.call(handler, token)).toBe(true); - expect(consume.call(handler, token)).toBe(false); + expect(consume.call(handler, token)).not.toBeNull(); + expect(consume.call(handler, token)).toBeNull(); }); it("rejects unknown tokens", () => { @@ -157,8 +161,12 @@ describe("RealtimeCallHandler", () => { makeProvider(), null, ); - const consume = (handler as unknown as { consumeStreamToken: (t: string) => boolean }).consumeStreamToken; - expect(consume.call(handler, "not-a-real-token")).toBe(false); + const consume = ( + handler as unknown as { + consumeStreamToken: (t: string) => { from?: string; to?: string } | null; + } + ).consumeStreamToken; + expect(consume.call(handler, "not-a-real-token")).toBeNull(); }); }); @@ -173,22 +181,18 @@ describe("RealtimeCallHandler", () => { }); const manager = makeManager(callRecord); - const handler = new RealtimeCallHandler( - baseRealtimeConfig, - manager, - makeProvider(), - null, - ); + const handler = new RealtimeCallHandler(baseRealtimeConfig, manager, makeProvider(), 
null); // Access private method via type assertion for unit testing - (handler as unknown as { registerCallInManager: (sid: string) => string }) - .registerCallInManager("CA_test"); + ( + handler as unknown as { registerCallInManager: (sid: string) => string } + ).registerCallInManager("CA_test"); // call.initiated + call.answered should both have been emitted expect(vi.mocked(manager.processEvent)).toHaveBeenCalledTimes(2); - const eventTypes = vi.mocked(manager.processEvent).mock.calls.map( - ([e]) => (e as { type: string }).type, - ); + const eventTypes = vi + .mocked(manager.processEvent) + .mock.calls.map(([e]) => (e as { type: string }).type); expect(eventTypes).toEqual(["call.initiated", "call.answered"]); // initialMessage must be cleared before call.answered fires @@ -199,15 +203,11 @@ describe("RealtimeCallHandler", () => { const callRecord = makeCallRecord({ callId: "manager-gen-id" }); const manager = makeManager(callRecord); - const handler = new RealtimeCallHandler( - baseRealtimeConfig, - manager, - makeProvider(), - null, - ); + const handler = new RealtimeCallHandler(baseRealtimeConfig, manager, makeProvider(), null); - const result = (handler as unknown as { registerCallInManager: (sid: string) => string }) - .registerCallInManager("CA_test"); + const result = ( + handler as unknown as { registerCallInManager: (sid: string) => string } + ).registerCallInManager("CA_test"); expect(result).toBe("manager-gen-id"); }); @@ -218,15 +218,11 @@ describe("RealtimeCallHandler", () => { getCallByProviderCallId: vi.fn(() => undefined), } as unknown as CallManager; - const handler = new RealtimeCallHandler( - baseRealtimeConfig, - manager, - makeProvider(), - null, - ); + const handler = new RealtimeCallHandler(baseRealtimeConfig, manager, makeProvider(), null); - const result = (handler as unknown as { registerCallInManager: (sid: string) => string }) - .registerCallInManager("CA_fallback"); + const result = ( + handler as unknown as { registerCallInManager: 
(sid: string) => string } + ).registerCallInManager("CA_fallback"); expect(result).toBe("CA_fallback"); }); diff --git a/extensions/voice-call/src/webhook/realtime-handler.ts b/extensions/voice-call/src/webhook/realtime-handler.ts index 304ee547384..56796828760 100644 --- a/extensions/voice-call/src/webhook/realtime-handler.ts +++ b/extensions/voice-call/src/webhook/realtime-handler.ts @@ -1,15 +1,15 @@ -import http from "node:http"; import { randomUUID } from "node:crypto"; +import http from "node:http"; import type { Duplex } from "node:stream"; -import { type WebSocket, Server as WebSocketServer } from "ws"; +import WebSocket, { WebSocketServer } from "ws"; import type { VoiceCallRealtimeConfig } from "../config.js"; import type { CoreConfig } from "../core-bridge.js"; import type { CallManager } from "../manager.js"; +import type { VoiceCallProvider } from "../providers/base.js"; import { OpenAIRealtimeVoiceBridge, type RealtimeTool, } from "../providers/openai-realtime-voice.js"; -import type { VoiceCallProvider } from "../providers/base.js"; import type { NormalizedEvent } from "../types.js"; import type { WebhookResponsePayload } from "../webhook.js"; @@ -124,7 +124,9 @@ export class RealtimeCallHandler { to: params?.get("To") ?? undefined, }); const wsUrl = `wss://${host}/voice/stream/realtime?token=${token}`; - console.log(`[voice-call] Returning realtime TwiML with WebSocket: wss://${host}/voice/stream/realtime`); + console.log( + `[voice-call] Returning realtime TwiML with WebSocket: wss://${host}/voice/stream/realtime`, + ); const twiml = ` @@ -186,13 +188,17 @@ export class RealtimeCallHandler { ): OpenAIRealtimeVoiceBridge | null { const apiKey = this.openaiApiKey ?? 
process.env.OPENAI_API_KEY; if (!apiKey) { - console.error("[voice-call] No OpenAI API key for realtime call (set streaming.openaiApiKey or OPENAI_API_KEY)"); + console.error( + "[voice-call] No OpenAI API key for realtime call (set streaming.openaiApiKey or OPENAI_API_KEY)", + ); ws.close(1011, "No API key"); return null; } const callId = this.registerCallInManager(callSid, callerMeta); - console.log(`[voice-call] Realtime call: streamSid=${streamSid}, callSid=${callSid}, callId=${callId}`); + console.log( + `[voice-call] Realtime call: streamSid=${streamSid}, callSid=${callSid}, callId=${callId}`, + ); // Declare as null first so closures can capture the reference before bridge is created. // By the time any callback fires, bridge will be fully assigned. From 9bff8bc9769c0ea53f31cf95cc1312488bd0970c Mon Sep 17 00:00:00 2001 From: Forrest Blount Date: Thu, 12 Mar 2026 21:10:32 +0000 Subject: [PATCH 15/17] voice-call: embed stream token in URL path instead of query param Tailscale Funnel strips query parameters from proxied WebSocket upgrade requests, causing the one-time stream token to be lost and Twilio's WebSocket connection to be rejected with 401. Move the token from ?token= to a path segment: wss://host/voice/stream/realtime/ Token extraction in handleWebSocketUpgrade now uses the last path segment instead of searchParams. Tests updated to match the new URL format. 
Co-Authored-By: Claude Sonnet 4.6 --- extensions/voice-call/src/webhook/realtime-handler.test.ts | 6 +++--- extensions/voice-call/src/webhook/realtime-handler.ts | 6 ++++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/extensions/voice-call/src/webhook/realtime-handler.test.ts b/extensions/voice-call/src/webhook/realtime-handler.test.ts index 07107f08601..0f5076c4660 100644 --- a/extensions/voice-call/src/webhook/realtime-handler.test.ts +++ b/extensions/voice-call/src/webhook/realtime-handler.test.ts @@ -7,7 +7,7 @@ import { RealtimeCallHandler } from "./realtime-handler.js"; /** Extract the stream token from a TwiML body string. */ function extractStreamToken(twiml: string): string | null { - const match = twiml.match(/\?token=([^"&\s]+)/); + const match = twiml.match(/\/voice\/stream\/realtime\/([^"&\s]+)/); return match?.[1] ?? null; } @@ -99,7 +99,7 @@ describe("RealtimeCallHandler", () => { expect(payload.headers?.["Content-Type"]).toBe("text/xml"); expect(payload.body).toContain(""); expect(payload.body).toContain(" { @@ -112,7 +112,7 @@ describe("RealtimeCallHandler", () => { const req = makeRequest("/voice/webhook", ""); const payload = handler.buildTwiMLPayload(req); - expect(payload.body).toContain("wss://localhost:8443/voice/stream/realtime?token="); + expect(payload.body).toMatch(/wss:\/\/localhost:8443\/voice\/stream\/realtime\/[0-9a-f-]{36}/); }); it("embeds a unique token on each call", () => { diff --git a/extensions/voice-call/src/webhook/realtime-handler.ts b/extensions/voice-call/src/webhook/realtime-handler.ts index 56796828760..9eb7d4b2f21 100644 --- a/extensions/voice-call/src/webhook/realtime-handler.ts +++ b/extensions/voice-call/src/webhook/realtime-handler.ts @@ -61,7 +61,9 @@ export class RealtimeCallHandler { */ handleWebSocketUpgrade(request: http.IncomingMessage, socket: Duplex, head: Buffer): void { const url = new URL(request.url ?? 
"/", "wss://localhost"); - const token = url.searchParams.get("token"); + // Token is embedded as the last path segment (e.g. /voice/stream/realtime/) + // to survive reverse proxies that strip query parameters (e.g. Tailscale Funnel). + const token = url.pathname.split("/").pop() ?? null; const callerMeta = token ? this.consumeStreamToken(token) : null; if (!callerMeta) { console.warn("[voice-call] Rejecting WS upgrade: missing or invalid stream token"); @@ -123,7 +125,7 @@ export class RealtimeCallHandler { from: params?.get("From") ?? undefined, to: params?.get("To") ?? undefined, }); - const wsUrl = `wss://${host}/voice/stream/realtime?token=${token}`; + const wsUrl = `wss://${host}/voice/stream/realtime/${token}`; console.log( `[voice-call] Returning realtime TwiML with WebSocket: wss://${host}/voice/stream/realtime`, ); From ea4b9a446338f11d38f9c3d84eec008d79962f18 Mon Sep 17 00:00:00 2001 From: Forrest Blount Date: Thu, 12 Mar 2026 21:32:16 +0000 Subject: [PATCH 16/17] voice-call: fix formatting in realtime-handler.test.ts Co-Authored-By: Claude Sonnet 4.6 --- .../voice-call/src/webhook/realtime-handler.test.ts | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/extensions/voice-call/src/webhook/realtime-handler.test.ts b/extensions/voice-call/src/webhook/realtime-handler.test.ts index 0f5076c4660..921fe757a6e 100644 --- a/extensions/voice-call/src/webhook/realtime-handler.test.ts +++ b/extensions/voice-call/src/webhook/realtime-handler.test.ts @@ -99,7 +99,9 @@ describe("RealtimeCallHandler", () => { expect(payload.headers?.["Content-Type"]).toBe("text/xml"); expect(payload.body).toContain(""); expect(payload.body).toContain(" { @@ -112,7 +114,9 @@ describe("RealtimeCallHandler", () => { const req = makeRequest("/voice/webhook", ""); const payload = handler.buildTwiMLPayload(req); - expect(payload.body).toMatch(/wss:\/\/localhost:8443\/voice\/stream\/realtime\/[0-9a-f-]{36}/); + expect(payload.body).toMatch( + 
/wss:\/\/localhost:8443\/voice\/stream\/realtime\/[0-9a-f-]{36}/, + ); }); it("embeds a unique token on each call", () => { From 6c6c6388bd1aa4fb7f0905d11ff9146ff110bf7b Mon Sep 17 00:00:00 2001 From: Forrest Blount Date: Thu, 12 Mar 2026 21:56:17 +0000 Subject: [PATCH 17/17] voice-call: log assistant speech as bot transcript in realtime calls call.speaking events already carry a text field (used by Telnyx) but the manager was discarding it. Wire it through addTranscriptEntry so any call.speaking event with non-empty text records a speaker:"bot" entry. In the realtime handler, emit call.speaking for assistant turns when the transcript is final, mirroring the existing user call.speech path. This restores speaker:"bot" entries in calls.jsonl for realtime calls. Co-Authored-By: Claude Sonnet 4.6 --- extensions/voice-call/src/manager/events.ts | 3 +++ .../voice-call/src/webhook/realtime-handler.ts | 12 +++++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/extensions/voice-call/src/manager/events.ts b/extensions/voice-call/src/manager/events.ts index 668369e0c35..2ce9de599aa 100644 --- a/extensions/voice-call/src/manager/events.ts +++ b/extensions/voice-call/src/manager/events.ts @@ -204,6 +204,9 @@ export function processEvent(ctx: EventContext, event: NormalizedEvent): void { break; case "call.speaking": + if (event.text) { + addTranscriptEntry(call, "bot", event.text); + } transitionState(call, "speaking"); break; diff --git a/extensions/voice-call/src/webhook/realtime-handler.ts b/extensions/voice-call/src/webhook/realtime-handler.ts index 9eb7d4b2f21..fc911bb2132 100644 --- a/extensions/voice-call/src/webhook/realtime-handler.ts +++ b/extensions/voice-call/src/webhook/realtime-handler.ts @@ -233,7 +233,6 @@ export class RealtimeCallHandler { onTranscript: (role, text, isFinal) => { if (isFinal) { - // Emit user speech through the manager for transcript persistence if (role === "user") { const event: NormalizedEvent = { id: 
`realtime-speech-${callSid}-${Date.now()}`, @@ -245,6 +244,17 @@ export class RealtimeCallHandler { isFinal: true, }; this.manager.processEvent(event); + } else if (role === "assistant") { + // Log assistant turns via call.speaking so they appear as speaker:"bot" + // in the transcript (same mechanism Telnyx uses for TTS speech). + this.manager.processEvent({ + id: `realtime-bot-${callSid}-${Date.now()}`, + type: "call.speaking", + callId, + providerCallId: callSid, + timestamp: Date.now(), + text, + }); } } },