refactor: move voice-call realtime providers into extensions

This commit is contained in:
Peter Steinberger 2026-04-04 12:04:37 +09:00
parent 61f93540b2
commit a23ab9b906
No known key found for this signature in database
90 changed files with 3134 additions and 792 deletions

View File

@ -48,6 +48,7 @@ Docs: https://docs.openclaw.ai
- Telegram/native commands: clean up metadata-driven progress placeholders when replies fall back, edits fail, or local exec approval prompts are suppressed. (#59300) Thanks @jalehman.
- Matrix/backup reset: recreate secret storage during backup reset when stale SSSS state blocks durable backup-key reload, including no-backup repair paths. (#60599) Thanks @emonty.
- Matrix: allow secret-storage recreation during automatic repair bootstrap so clients that lose their recovery key can recover and persist new cross-signing keys. (#59846) Thanks @al3mart.
- Voice Call/OpenAI: move realtime voice and realtime transcription onto provider-owned plugin capabilities so `voice-call` uses generic provider selection while keeping realtime Twilio replay and custom webhook-path handling working.
- Matrix/crypto persistence: capture and write the IndexedDB snapshot while holding the snapshot file lock so concurrent gateway and CLI persists cannot overwrite newer crypto state. (#59851) Thanks @al3mart.
- Matrix/media: surface a dedicated `[matrix <kind> attachment too large]` marker for oversized inbound media instead of the generic unavailable marker, and classify size-limit failures with a typed Matrix error. (#60289) Thanks @efe-arv.
- Matrix/Telegram exec approvals: recover stored same-channel account bindings even when session reply state drifted to another channel, so foreign-channel approvals route to the bound account instead of fanning out or being rejected as ambiguous. (#60417) Thanks @gumadeiras.

View File

@ -32,6 +32,7 @@ native OpenClaw plugin registers against one or more capability types:
| Text inference | `api.registerProvider(...)` | `openai`, `anthropic` |
| CLI inference backend | `api.registerCliBackend(...)` | `openai`, `anthropic` |
| Speech | `api.registerSpeechProvider(...)` | `elevenlabs`, `microsoft` |
| Realtime voice | `api.registerRealtimeVoiceProvider(...)` | `openai` |
| Media understanding | `api.registerMediaUnderstandingProvider(...)` | `openai`, `google` |
| Image generation | `api.registerImageGenerationProvider(...)` | `openai`, `google` |
| Web search | `api.registerWebSearchProvider(...)` | `google` |
@ -239,8 +240,9 @@ Examples:
- the bundled `minimax`, `mistral`, `moonshot`, and `zai` plugins own their
media-understanding backends
- the `voice-call` plugin is a feature plugin: it owns call transport, tools,
CLI, routes, and runtime, but it consumes core TTS/STT capability instead of
inventing a second speech stack
CLI, routes, and Twilio media-stream bridging, but it consumes shared speech
plus realtime-transcription and realtime-voice capabilities instead of
importing vendor plugins directly
The intended end state is:

View File

@ -146,6 +146,7 @@ A single plugin can register any number of capabilities via the `api` object:
| CLI inference backend | `api.registerCliBackend(...)` | [CLI Backends](/gateway/cli-backends) |
| Channel / messaging | `api.registerChannel(...)` | [Channel Plugins](/plugins/sdk-channel-plugins) |
| Speech (TTS/STT) | `api.registerSpeechProvider(...)` | [Provider Plugins](/plugins/sdk-provider-plugins#step-5-add-extra-capabilities) |
| Realtime voice | `api.registerRealtimeVoiceProvider(...)` | [Provider Plugins](/plugins/sdk-provider-plugins#step-5-add-extra-capabilities) |
| Media understanding | `api.registerMediaUnderstandingProvider(...)` | [Provider Plugins](/plugins/sdk-provider-plugins#step-5-add-extra-capabilities) |
| Image generation | `api.registerImageGenerationProvider(...)` | [Provider Plugins](/plugins/sdk-provider-plugins#step-5-add-extra-capabilities) |
| Web search | `api.registerWebSearchProvider(...)` | [Provider Plugins](/plugins/sdk-provider-plugins#step-5-add-extra-capabilities) |

View File

@ -196,6 +196,8 @@ read without importing the plugin runtime.
{
"contracts": {
"speechProviders": ["openai"],
"realtimeTranscriptionProviders": ["openai"],
"realtimeVoiceProviders": ["openai"],
"mediaUnderstandingProviders": ["openai", "openai-codex"],
"imageGenerationProviders": ["openai"],
"webSearchProviders": ["gemini"],
@ -206,13 +208,15 @@ read without importing the plugin runtime.
Each list is optional:
| Field | Type | What it means |
| ----------------------------- | ---------- | -------------------------------------------------------------- |
| `speechProviders` | `string[]` | Speech provider ids this plugin owns. |
| `mediaUnderstandingProviders` | `string[]` | Media-understanding provider ids this plugin owns. |
| `imageGenerationProviders` | `string[]` | Image-generation provider ids this plugin owns. |
| `webSearchProviders` | `string[]` | Web-search provider ids this plugin owns. |
| `tools` | `string[]` | Agent tool names this plugin owns for bundled contract checks. |
| Field | Type | What it means |
| -------------------------------- | ---------- | -------------------------------------------------------------- |
| `speechProviders` | `string[]` | Speech provider ids this plugin owns. |
| `realtimeTranscriptionProviders` | `string[]` | Realtime-transcription provider ids this plugin owns. |
| `realtimeVoiceProviders` | `string[]` | Realtime-voice provider ids this plugin owns. |
| `mediaUnderstandingProviders` | `string[]` | Media-understanding provider ids this plugin owns. |
| `imageGenerationProviders` | `string[]` | Image-generation provider ids this plugin owns. |
| `webSearchProviders` | `string[]` | Web-search provider ids this plugin owns. |
| `tools` | `string[]` | Agent tool names this plugin owns for bundled contract checks. |
Legacy top-level `speechProviders`, `mediaUnderstandingProviders`, and
`imageGenerationProviders` are deprecated. Use `openclaw doctor --fix` to move

View File

@ -128,15 +128,17 @@ methods:
### Capability registration
| Method | What it registers |
| --------------------------------------------- | ------------------------------ |
| `api.registerProvider(...)` | Text inference (LLM) |
| `api.registerCliBackend(...)` | Local CLI inference backend |
| `api.registerChannel(...)` | Messaging channel |
| `api.registerSpeechProvider(...)` | Text-to-speech / STT synthesis |
| `api.registerMediaUnderstandingProvider(...)` | Image/audio/video analysis |
| `api.registerImageGenerationProvider(...)` | Image generation |
| `api.registerWebSearchProvider(...)` | Web search |
| Method | What it registers |
| ------------------------------------------------ | -------------------------------- |
| `api.registerProvider(...)` | Text inference (LLM) |
| `api.registerCliBackend(...)` | Local CLI inference backend |
| `api.registerChannel(...)` | Messaging channel |
| `api.registerSpeechProvider(...)` | Text-to-speech / STT synthesis |
| `api.registerRealtimeTranscriptionProvider(...)` | Streaming realtime transcription |
| `api.registerRealtimeVoiceProvider(...)` | Duplex realtime voice sessions |
| `api.registerMediaUnderstandingProvider(...)` | Image/audio/video analysis |
| `api.registerImageGenerationProvider(...)` | Image generation |
| `api.registerWebSearchProvider(...)` | Web search |
### Tools and commands

View File

@ -324,8 +324,8 @@ API key auth, and dynamic model resolution.
<Step title="Add extra capabilities (optional)">
<a id="step-5-add-extra-capabilities"></a>
A provider plugin can register speech, media understanding, image
generation, and web search alongside text inference:
A provider plugin can register speech, realtime transcription, realtime voice, media
understanding, image generation, and web search alongside text inference:
```typescript
register(api) {
@ -343,6 +343,33 @@ API key auth, and dynamic model resolution.
}),
});
api.registerRealtimeTranscriptionProvider({
id: "acme-ai",
label: "Acme Realtime Transcription",
isConfigured: () => true,
createSession: (req) => ({
connect: async () => {},
sendAudio: () => {},
close: () => {},
isConnected: () => true,
}),
});
api.registerRealtimeVoiceProvider({
id: "acme-ai",
label: "Acme Realtime Voice",
isConfigured: ({ providerConfig }) => Boolean(providerConfig.apiKey),
createBridge: (req) => ({
connect: async () => {},
sendAudio: () => {},
setMediaTimestamp: () => {},
submitToolResult: () => {},
acknowledgeMark: () => {},
close: () => {},
isConnected: () => true,
}),
});
api.registerMediaUnderstandingProvider({
id: "acme-ai",
capabilities: ["image", "audio"],

View File

@ -1 +1,2 @@
export { buildAnthropicCliBackend } from "./cli-backend.js";
export { anthropicMediaUnderstandingProvider } from "./media-understanding-provider.js";

View File

@ -0,0 +1 @@
export { deepgramMediaUnderstandingProvider } from "./media-understanding-provider.js";

View File

@ -0,0 +1 @@
export { buildFalImageGenerationProvider } from "./image-generation-provider.js";

View File

@ -1 +1,3 @@
export { buildGoogleGeminiCliBackend } from "./cli-backend.js";
export { buildGoogleImageGenerationProvider } from "./image-generation-provider.js";
export { googleMediaUnderstandingProvider } from "./media-understanding-provider.js";

View File

@ -0,0 +1 @@
export { groqMediaUnderstandingProvider } from "./media-understanding-provider.js";

View File

@ -47,6 +47,8 @@ function fakeApi(overrides: Partial<OpenClawPluginApi> = {}): OpenClawPluginApi
registerCliBackend() {},
registerProvider() {},
registerSpeechProvider() {},
registerRealtimeTranscriptionProvider() {},
registerRealtimeVoiceProvider() {},
registerMediaUnderstandingProvider() {},
registerImageGenerationProvider() {},
registerWebFetchProvider() {},

View File

@ -31,7 +31,7 @@ export {
isTrustedProxyAddress,
} from "openclaw/plugin-sdk/core";
export { buildComputedAccountStatusSnapshot } from "openclaw/plugin-sdk/channel-status";
export { createAccountStatusSink } from "openclaw/plugin-sdk/compat";
export { createAccountStatusSink } from "openclaw/plugin-sdk/channel-lifecycle";
export { buildAgentMediaPayload } from "openclaw/plugin-sdk/agent-media-payload";
export {
buildModelsProviderData,

View File

@ -0,0 +1,8 @@
export {
buildMinimaxImageGenerationProvider,
buildMinimaxPortalImageGenerationProvider,
} from "./image-generation-provider.js";
export {
minimaxMediaUnderstandingProvider,
minimaxPortalMediaUnderstandingProvider,
} from "./media-understanding-provider.js";

View File

@ -0,0 +1 @@
export { mistralMediaUnderstandingProvider } from "./media-understanding-provider.js";

View File

@ -1 +1,2 @@
export { __testing } from "./src/kimi-web-search-provider.js";
export { moonshotMediaUnderstandingProvider } from "./media-understanding-provider.js";

View File

@ -11,3 +11,5 @@ export {
} from "./default-models.js";
export { buildOpenAICodexProvider } from "./openai-codex-catalog.js";
export { buildOpenAIProvider } from "./openai-provider.js";
export { buildOpenAIRealtimeTranscriptionProvider } from "./realtime-transcription-provider.js";
export { buildOpenAIRealtimeVoiceProvider } from "./realtime-voice-provider.js";

View File

@ -12,6 +12,8 @@ import {
resolveOpenAIPromptOverlayMode,
shouldApplyOpenAIPromptOverlay,
} from "./prompt-overlay.js";
import { buildOpenAIRealtimeTranscriptionProvider } from "./realtime-transcription-provider.js";
import { buildOpenAIRealtimeVoiceProvider } from "./realtime-voice-provider.js";
import { buildOpenAISpeechProvider } from "./speech-provider.js";
export default definePluginEntry({
@ -24,6 +26,8 @@ export default definePluginEntry({
api.registerProvider(buildOpenAIProvider());
api.registerProvider(buildOpenAICodexProviderPlugin());
api.registerSpeechProvider(buildOpenAISpeechProvider());
api.registerRealtimeTranscriptionProvider(buildOpenAIRealtimeTranscriptionProvider());
api.registerRealtimeVoiceProvider(buildOpenAIRealtimeVoiceProvider());
api.registerMediaUnderstandingProvider(openaiMediaUnderstandingProvider);
api.registerMediaUnderstandingProvider(openaiCodexMediaUnderstandingProvider);
api.registerImageGenerationProvider(buildOpenAIImageGenerationProvider());

View File

@ -103,16 +103,16 @@ describe("openai codex provider", () => {
api: "openai-codex-responses",
baseUrl: "https://chatgpt.com/backend-api",
reasoning: true,
input: ["text", "image"],
input: ["text", "image"] as const,
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
contextWindow: 272_000,
maxTokens: 128_000,
};
}
return null;
},
return undefined;
}),
} as never,
} as never);
});
expect(model).toMatchObject({
id: "gpt-5.4",
@ -173,7 +173,7 @@ describe("openai codex provider", () => {
contextWindow: 272_000,
},
],
});
} as never);
expect(entries).toContainEqual(
expect.objectContaining({

View File

@ -34,6 +34,8 @@
],
"contracts": {
"speechProviders": ["openai"],
"realtimeTranscriptionProviders": ["openai"],
"realtimeVoiceProviders": ["openai"],
"mediaUnderstandingProviders": ["openai", "openai-codex"],
"imageGenerationProviders": ["openai"]
},

View File

@ -4,6 +4,9 @@
"private": true,
"description": "OpenClaw OpenAI provider plugins",
"type": "module",
"dependencies": {
"ws": "^8.20.0"
},
"openclaw": {
"extensions": [
"./index.ts"

View File

@ -0,0 +1,27 @@
import { describe, expect, it } from "vitest";
import { buildOpenAIRealtimeTranscriptionProvider } from "./realtime-transcription-provider.js";
describe("buildOpenAIRealtimeTranscriptionProvider", () => {
  it("normalizes OpenAI config defaults", () => {
    const provider = buildOpenAIRealtimeTranscriptionProvider();
    // Only the nested providers.openai block is supplied; the other optional
    // fields (model, silenceDurationMs, vadThreshold) should resolve to
    // undefined.
    const resolved = provider.resolveConfig?.({
      cfg: {} as never,
      rawConfig: {
        providers: {
          openai: {
            apiKey: "sk-test", // pragma: allowlist secret
          },
        },
      },
    });
    // NOTE(review): this relies on toEqual treating undefined-valued keys as
    // absent, which is how the defaulted optional fields pass — confirm if the
    // matcher semantics ever change.
    expect(resolved).toEqual({
      apiKey: "sk-test",
    });
  });
  it("accepts the legacy openai-realtime alias", () => {
    // Older configs referenced this provider as "openai-realtime"; the alias
    // keeps them working after the move to the generic provider registry.
    const provider = buildOpenAIRealtimeTranscriptionProvider();
    expect(provider.aliases).toContain("openai-realtime");
  });
});

View File

@ -0,0 +1,267 @@
import type {
RealtimeTranscriptionProviderConfig,
RealtimeTranscriptionProviderPlugin,
RealtimeTranscriptionSession,
RealtimeTranscriptionSessionCreateRequest,
} from "openclaw/plugin-sdk/realtime-transcription";
import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input";
import WebSocket from "ws";
// OpenAI-specific settings extracted from the generic realtime-transcription
// provider config. All fields are optional; defaults are filled in by
// buildOpenAIRealtimeTranscriptionProvider().createSession.
type OpenAIRealtimeTranscriptionProviderConfig = {
  apiKey?: string;
  model?: string;
  silenceDurationMs?: number;
  vadThreshold?: number;
};

// Fully-resolved session config: the original create request plus the OpenAI
// settings with defaults applied (hence the now-required fields).
type OpenAIRealtimeTranscriptionSessionConfig = RealtimeTranscriptionSessionCreateRequest & {
  apiKey: string;
  model: string;
  silenceDurationMs: number;
  vadThreshold: number;
};

// Loosely-typed shape of events arriving on the realtime socket; only the
// fields this session actually inspects are declared.
type RealtimeEvent = {
  type: string;
  delta?: string;
  transcript?: string;
  error?: unknown;
};
// Returns the trimmed string when `value` is a non-blank string, else undefined.
function trimToUndefined(value: unknown): string | undefined {
  if (typeof value !== "string") {
    return undefined;
  }
  const trimmed = value.trim();
  return trimmed ? trimmed : undefined;
}
// Passes through finite numbers only; rejects NaN, ±Infinity, and non-numbers.
function asNumber(value: unknown): number | undefined {
  if (typeof value !== "number" || !Number.isFinite(value)) {
    return undefined;
  }
  return value;
}
// Narrows `value` to a plain record, excluding null and arrays.
function asObject(value: unknown): Record<string, unknown> | undefined {
  if (value === null || typeof value !== "object" || Array.isArray(value)) {
    return undefined;
  }
  return value as Record<string, unknown>;
}
/**
 * Extracts the OpenAI-specific settings from the generic realtime-transcription
 * provider config. Accepts the nested `providers.openai` shape, a top-level
 * `openai` object, or a flat config, in that order of preference. The API key
 * is read from `apiKey` first, then the legacy flat `openaiApiKey` field.
 */
function normalizeProviderConfig(
  config: RealtimeTranscriptionProviderConfig,
): OpenAIRealtimeTranscriptionProviderConfig {
  const nested = asObject(config.providers);
  const source = asObject(nested?.openai) ?? asObject(config.openai) ?? asObject(config);
  const apiKey =
    normalizeResolvedSecretInputString({
      value: source?.apiKey,
      path: "plugins.entries.voice-call.config.streaming.providers.openai.apiKey",
    }) ??
    normalizeResolvedSecretInputString({
      value: source?.openaiApiKey,
      path: "plugins.entries.voice-call.config.streaming.openaiApiKey",
    });
  // `sttModel` is the older field name; prefer the generic `model`.
  const model = trimToUndefined(source?.model) ?? trimToUndefined(source?.sttModel);
  return {
    apiKey,
    model,
    silenceDurationMs: asNumber(source?.silenceDurationMs),
    vadThreshold: asNumber(source?.vadThreshold),
  };
}
// Thin alias so call sites read as "consume the resolved provider config";
// both paths funnel through normalizeProviderConfig.
function readProviderConfig(
  providerConfig: RealtimeTranscriptionProviderConfig,
): OpenAIRealtimeTranscriptionProviderConfig {
  const normalized = normalizeProviderConfig(providerConfig);
  return normalized;
}
class OpenAIRealtimeTranscriptionSession implements RealtimeTranscriptionSession {
private static readonly MAX_RECONNECT_ATTEMPTS = 5;
private static readonly RECONNECT_DELAY_MS = 1000;
private static readonly CONNECT_TIMEOUT_MS = 10_000;
private ws: WebSocket | null = null;
private connected = false;
private closed = false;
private reconnectAttempts = 0;
private pendingTranscript = "";
constructor(private readonly config: OpenAIRealtimeTranscriptionSessionConfig) {}
async connect(): Promise<void> {
this.closed = false;
this.reconnectAttempts = 0;
await this.doConnect();
}
sendAudio(audio: Buffer): void {
if (this.ws?.readyState !== WebSocket.OPEN) {
return;
}
this.sendEvent({
type: "input_audio_buffer.append",
audio: audio.toString("base64"),
});
}
close(): void {
this.closed = true;
this.connected = false;
if (this.ws) {
this.ws.close(1000, "Transcription session closed");
this.ws = null;
}
}
isConnected(): boolean {
return this.connected;
}
private async doConnect(): Promise<void> {
await new Promise<void>((resolve, reject) => {
this.ws = new WebSocket("wss://api.openai.com/v1/realtime?intent=transcription", {
headers: {
Authorization: `Bearer ${this.config.apiKey}`,
"OpenAI-Beta": "realtime=v1",
},
});
const connectTimeout = setTimeout(() => {
reject(new Error("OpenAI realtime transcription connection timeout"));
}, OpenAIRealtimeTranscriptionSession.CONNECT_TIMEOUT_MS);
this.ws.on("open", () => {
clearTimeout(connectTimeout);
this.connected = true;
this.reconnectAttempts = 0;
this.sendEvent({
type: "transcription_session.update",
session: {
input_audio_format: "g711_ulaw",
input_audio_transcription: {
model: this.config.model,
},
turn_detection: {
type: "server_vad",
threshold: this.config.vadThreshold,
prefix_padding_ms: 300,
silence_duration_ms: this.config.silenceDurationMs,
},
},
});
resolve();
});
this.ws.on("message", (data: Buffer) => {
try {
this.handleEvent(JSON.parse(data.toString()) as RealtimeEvent);
} catch (error) {
this.config.onError?.(error instanceof Error ? error : new Error(String(error)));
}
});
this.ws.on("error", (error) => {
if (!this.connected) {
clearTimeout(connectTimeout);
reject(error);
return;
}
this.config.onError?.(error instanceof Error ? error : new Error(String(error)));
});
this.ws.on("close", () => {
this.connected = false;
if (this.closed) {
return;
}
void this.attemptReconnect();
});
});
}
private async attemptReconnect(): Promise<void> {
if (this.closed) {
return;
}
if (this.reconnectAttempts >= OpenAIRealtimeTranscriptionSession.MAX_RECONNECT_ATTEMPTS) {
this.config.onError?.(new Error("OpenAI realtime transcription reconnect limit reached"));
return;
}
this.reconnectAttempts += 1;
const delay =
OpenAIRealtimeTranscriptionSession.RECONNECT_DELAY_MS * 2 ** (this.reconnectAttempts - 1);
await new Promise((resolve) => setTimeout(resolve, delay));
if (this.closed) {
return;
}
try {
await this.doConnect();
} catch (error) {
this.config.onError?.(error instanceof Error ? error : new Error(String(error)));
await this.attemptReconnect();
}
}
private handleEvent(event: RealtimeEvent): void {
switch (event.type) {
case "conversation.item.input_audio_transcription.delta":
if (event.delta) {
this.pendingTranscript += event.delta;
this.config.onPartial?.(this.pendingTranscript);
}
return;
case "conversation.item.input_audio_transcription.completed":
if (event.transcript) {
this.config.onTranscript?.(event.transcript);
}
this.pendingTranscript = "";
return;
case "input_audio_buffer.speech_started":
this.pendingTranscript = "";
this.config.onSpeechStart?.();
return;
case "error": {
const detail =
event.error && typeof event.error === "object" && "message" in event.error
? String((event.error as { message?: unknown }).message ?? "Unknown error")
: event.error
? String(event.error)
: "Unknown error";
this.config.onError?.(new Error(detail));
return;
}
default:
return;
}
}
private sendEvent(event: unknown): void {
if (this.ws?.readyState === WebSocket.OPEN) {
this.ws.send(JSON.stringify(event));
}
}
}
/**
 * Builds the OpenAI realtime-transcription provider plugin.
 *
 * Config resolution funnels through normalizeProviderConfig; the provider is
 * considered configured when either a resolved apiKey or OPENAI_API_KEY is
 * present. createSession applies the documented defaults before constructing
 * the WebSocket-backed session.
 */
export function buildOpenAIRealtimeTranscriptionProvider(): RealtimeTranscriptionProviderPlugin {
  const DEFAULT_MODEL = "gpt-4o-transcribe";
  const DEFAULT_SILENCE_MS = 800;
  const DEFAULT_VAD_THRESHOLD = 0.5;
  return {
    id: "openai",
    label: "OpenAI Realtime Transcription",
    // Legacy id kept so pre-extension configs keep resolving.
    aliases: ["openai-realtime"],
    autoSelectOrder: 10,
    resolveConfig: ({ rawConfig }) => normalizeProviderConfig(rawConfig),
    isConfigured: ({ providerConfig }) =>
      Boolean(readProviderConfig(providerConfig).apiKey || process.env.OPENAI_API_KEY),
    createSession: (req) => {
      const config = readProviderConfig(req.providerConfig);
      const apiKey = config.apiKey || process.env.OPENAI_API_KEY;
      if (!apiKey) {
        throw new Error("OpenAI API key missing");
      }
      return new OpenAIRealtimeTranscriptionSession({
        ...req,
        apiKey,
        model: config.model ?? DEFAULT_MODEL,
        silenceDurationMs: config.silenceDurationMs ?? DEFAULT_SILENCE_MS,
        vadThreshold: config.vadThreshold ?? DEFAULT_VAD_THRESHOLD,
      });
    },
  };
}

View File

@ -0,0 +1,535 @@
import type {
RealtimeVoiceBridge,
RealtimeVoiceBridgeCreateRequest,
RealtimeVoiceCloseReason,
RealtimeVoiceProviderConfig,
RealtimeVoiceProviderPlugin,
RealtimeVoiceTool,
} from "openclaw/plugin-sdk/realtime-voice";
import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input";
import WebSocket from "ws";
// Voice ids accepted by the OpenAI realtime API for assistant audio output.
export type OpenAIRealtimeVoice =
  | "alloy"
  | "ash"
  | "ballad"
  | "cedar"
  | "coral"
  | "echo"
  | "marin"
  | "sage"
  | "shimmer"
  | "verse";

// OpenAI-specific settings extracted from the generic realtime-voice provider
// config. The azure* fields switch the bridge onto an Azure OpenAI endpoint
// (see resolveConnectionParams in OpenAIRealtimeVoiceBridge).
type OpenAIRealtimeVoiceProviderConfig = {
  apiKey?: string;
  model?: string;
  voice?: OpenAIRealtimeVoice;
  temperature?: number;
  vadThreshold?: number;
  silenceDurationMs?: number;
  prefixPaddingMs?: number;
  azureEndpoint?: string;
  azureDeployment?: string;
  azureApiVersion?: string;
};

// Bridge construction config: the original create request plus the resolved
// OpenAI settings (only apiKey is mandatory; the rest default in the bridge).
type OpenAIRealtimeVoiceBridgeConfig = RealtimeVoiceBridgeCreateRequest & {
  apiKey: string;
  model?: string;
  voice?: OpenAIRealtimeVoice;
  temperature?: number;
  vadThreshold?: number;
  silenceDurationMs?: number;
  prefixPaddingMs?: number;
  azureEndpoint?: string;
  azureDeployment?: string;
  azureApiVersion?: string;
};

// Loosely-typed shape of inbound realtime events; only the fields the bridge
// inspects are declared.
type RealtimeEvent = {
  type: string;
  delta?: string;
  transcript?: string;
  item_id?: string;
  call_id?: string;
  name?: string;
  error?: unknown;
};

// Outbound session.update payload sent right after the socket opens.
type RealtimeSessionUpdate = {
  type: "session.update";
  session: {
    modalities: string[];
    instructions?: string;
    voice: OpenAIRealtimeVoice;
    input_audio_format: string;
    output_audio_format: string;
    turn_detection: {
      type: "server_vad";
      threshold: number;
      prefix_padding_ms: number;
      silence_duration_ms: number;
      create_response: boolean;
    };
    temperature: number;
    input_audio_transcription?: { model: string };
    tools?: RealtimeVoiceTool[];
    tool_choice?: string;
  };
};
// Returns the trimmed string when `value` is a non-blank string, else undefined.
function trimToUndefined(value: unknown): string | undefined {
  if (typeof value !== "string") {
    return undefined;
  }
  const trimmed = value.trim();
  return trimmed ? trimmed : undefined;
}
// Passes through finite numbers only; rejects NaN, ±Infinity, and non-numbers.
function asNumber(value: unknown): number | undefined {
  if (typeof value !== "number" || !Number.isFinite(value)) {
    return undefined;
  }
  return value;
}
// Narrows `value` to a plain record, excluding null and arrays.
function asObject(value: unknown): Record<string, unknown> | undefined {
  if (value === null || typeof value !== "object" || Array.isArray(value)) {
    return undefined;
  }
  return value as Record<string, unknown>;
}
/**
 * Extracts the OpenAI-specific settings from the generic realtime-voice
 * provider config. Accepts the nested `providers.openai` shape, a top-level
 * `openai` object, or a flat config, in that order of preference.
 */
function normalizeProviderConfig(
  config: RealtimeVoiceProviderConfig,
): OpenAIRealtimeVoiceProviderConfig {
  const nested = asObject(config.providers);
  const source = asObject(nested?.openai) ?? asObject(config.openai) ?? asObject(config);
  return {
    apiKey: normalizeResolvedSecretInputString({
      value: source?.apiKey,
      path: "plugins.entries.voice-call.config.realtime.providers.openai.apiKey",
    }),
    model: trimToUndefined(source?.model),
    // NOTE(review): unchecked cast — an unrecognized voice name flows through
    // as-is and the bridge falls back to "alloy" only when voice is undefined.
    voice: source?.voice as OpenAIRealtimeVoice | undefined,
    temperature: asNumber(source?.temperature),
    vadThreshold: asNumber(source?.vadThreshold),
    silenceDurationMs: asNumber(source?.silenceDurationMs),
    prefixPaddingMs: asNumber(source?.prefixPaddingMs),
    azureEndpoint: trimToUndefined(source?.azureEndpoint),
    azureDeployment: trimToUndefined(source?.azureDeployment),
    azureApiVersion: trimToUndefined(source?.azureApiVersion),
  };
}
// Thin alias so call sites read as "consume the resolved provider config";
// both paths funnel through normalizeProviderConfig.
function readProviderConfig(
  providerConfig: RealtimeVoiceProviderConfig,
): OpenAIRealtimeVoiceProviderConfig {
  const normalized = normalizeProviderConfig(providerConfig);
  return normalized;
}
// Decodes a base64 payload into its raw bytes.
function base64ToBuffer(b64: string): Buffer {
  const bytes = Buffer.from(b64, "base64");
  return bytes;
}
class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge {
private static readonly DEFAULT_MODEL = "gpt-realtime";
private static readonly MAX_RECONNECT_ATTEMPTS = 5;
private static readonly BASE_RECONNECT_DELAY_MS = 1000;
private static readonly CONNECT_TIMEOUT_MS = 10_000;
private ws: WebSocket | null = null;
private connected = false;
private intentionallyClosed = false;
private reconnectAttempts = 0;
private pendingAudio: Buffer[] = [];
private markQueue: string[] = [];
private responseStartTimestamp: number | null = null;
private latestMediaTimestamp = 0;
private lastAssistantItemId: string | null = null;
private toolCallBuffers = new Map<string, { name: string; callId: string; args: string }>();
constructor(private readonly config: OpenAIRealtimeVoiceBridgeConfig) {}
async connect(): Promise<void> {
this.intentionallyClosed = false;
this.reconnectAttempts = 0;
await this.doConnect();
}
sendAudio(audio: Buffer): void {
if (!this.connected || this.ws?.readyState !== WebSocket.OPEN) {
if (this.pendingAudio.length < 320) {
this.pendingAudio.push(audio);
}
return;
}
this.sendEvent({
type: "input_audio_buffer.append",
audio: audio.toString("base64"),
});
}
setMediaTimestamp(ts: number): void {
this.latestMediaTimestamp = ts;
}
sendUserMessage(text: string): void {
this.sendEvent({
type: "conversation.item.create",
item: {
type: "message",
role: "user",
content: [{ type: "input_text", text }],
},
});
this.sendEvent({ type: "response.create" });
}
triggerGreeting(instructions?: string): void {
if (!this.connected || !this.ws) {
return;
}
this.sendEvent({
type: "response.create",
response: {
instructions: instructions ?? this.config.instructions,
},
});
}
submitToolResult(callId: string, result: unknown): void {
this.sendEvent({
type: "conversation.item.create",
item: {
type: "function_call_output",
call_id: callId,
output: JSON.stringify(result),
},
});
this.sendEvent({ type: "response.create" });
}
acknowledgeMark(): void {
if (this.markQueue.length === 0) {
return;
}
this.markQueue.shift();
if (this.markQueue.length === 0) {
this.responseStartTimestamp = null;
this.lastAssistantItemId = null;
}
}
close(): void {
this.intentionallyClosed = true;
this.connected = false;
if (this.ws) {
this.ws.close(1000, "Bridge closed");
this.ws = null;
}
}
isConnected(): boolean {
return this.connected;
}
private async doConnect(): Promise<void> {
await new Promise<void>((resolve, reject) => {
const { url, headers } = this.resolveConnectionParams();
this.ws = new WebSocket(url, { headers });
const connectTimeout = setTimeout(() => {
reject(new Error("OpenAI realtime connection timeout"));
}, OpenAIRealtimeVoiceBridge.CONNECT_TIMEOUT_MS);
this.ws.on("open", () => {
clearTimeout(connectTimeout);
this.connected = true;
this.reconnectAttempts = 0;
this.sendSessionUpdate();
for (const chunk of this.pendingAudio.splice(0)) {
this.sendAudio(chunk);
}
this.config.onReady?.();
resolve();
});
this.ws.on("message", (data: Buffer) => {
try {
this.handleEvent(JSON.parse(data.toString()) as RealtimeEvent);
} catch (error) {
console.error("[openai] realtime event parse failed:", error);
}
});
this.ws.on("error", (error) => {
if (!this.connected) {
clearTimeout(connectTimeout);
reject(error);
}
this.config.onError?.(error instanceof Error ? error : new Error(String(error)));
});
this.ws.on("close", () => {
this.connected = false;
if (this.intentionallyClosed) {
this.config.onClose?.("completed");
return;
}
void this.attemptReconnect();
});
});
}
private resolveConnectionParams(): { url: string; headers: Record<string, string> } {
const cfg = this.config;
if (cfg.azureEndpoint && cfg.azureDeployment) {
const base = cfg.azureEndpoint
.replace(/\/$/, "")
.replace(/^http(s?):/, (_, secure: string) => `ws${secure}:`);
const apiVersion = cfg.azureApiVersion ?? "2024-10-01-preview";
return {
url: `${base}/openai/realtime?api-version=${apiVersion}&deployment=${encodeURIComponent(
cfg.azureDeployment,
)}`,
headers: { "api-key": cfg.apiKey },
};
}
if (cfg.azureEndpoint) {
const base = cfg.azureEndpoint
.replace(/\/$/, "")
.replace(/^http(s?):/, (_, secure: string) => `ws${secure}:`);
return {
url: `${base}/v1/realtime?model=${encodeURIComponent(
cfg.model ?? OpenAIRealtimeVoiceBridge.DEFAULT_MODEL,
)}`,
headers: { Authorization: `Bearer ${cfg.apiKey}` },
};
}
return {
url: `wss://api.openai.com/v1/realtime?model=${encodeURIComponent(
cfg.model ?? OpenAIRealtimeVoiceBridge.DEFAULT_MODEL,
)}`,
headers: {
Authorization: `Bearer ${cfg.apiKey}`,
"OpenAI-Beta": "realtime=v1",
},
};
}
private async attemptReconnect(): Promise<void> {
if (this.intentionallyClosed) {
return;
}
if (this.reconnectAttempts >= OpenAIRealtimeVoiceBridge.MAX_RECONNECT_ATTEMPTS) {
this.config.onClose?.("error");
return;
}
this.reconnectAttempts += 1;
const delay =
OpenAIRealtimeVoiceBridge.BASE_RECONNECT_DELAY_MS * 2 ** (this.reconnectAttempts - 1);
await new Promise((resolve) => setTimeout(resolve, delay));
if (this.intentionallyClosed) {
return;
}
try {
await this.doConnect();
} catch (error) {
this.config.onError?.(error instanceof Error ? error : new Error(String(error)));
await this.attemptReconnect();
}
}
private sendSessionUpdate(): void {
const cfg = this.config;
const sessionUpdate: RealtimeSessionUpdate = {
type: "session.update",
session: {
modalities: ["text", "audio"],
instructions: cfg.instructions,
voice: cfg.voice ?? "alloy",
input_audio_format: "g711_ulaw",
output_audio_format: "g711_ulaw",
input_audio_transcription: {
model: "whisper-1",
},
turn_detection: {
type: "server_vad",
threshold: cfg.vadThreshold ?? 0.5,
prefix_padding_ms: cfg.prefixPaddingMs ?? 300,
silence_duration_ms: cfg.silenceDurationMs ?? 500,
create_response: true,
},
temperature: cfg.temperature ?? 0.8,
...(cfg.tools && cfg.tools.length > 0
? {
tools: cfg.tools,
tool_choice: "auto",
}
: {}),
},
};
this.sendEvent(sessionUpdate);
}
  /**
   * Dispatches one realtime server event to the configured callbacks.
   *
   * Handles assistant audio deltas (with playback-mark bookkeeping for
   * barge-in), user/assistant transcript deltas and finals, streamed
   * function-call argument assembly, and provider error surfacing.
   * Unrecognized event types are ignored.
   */
  private handleEvent(event: RealtimeEvent): void {
    switch (event.type) {
      case "response.audio.delta": {
        if (!event.delta) {
          return;
        }
        // Audio arrives base64-encoded; decode before handing to the caller.
        const audio = base64ToBuffer(event.delta);
        this.config.onAudio(audio);
        // First audio chunk of a response: record when playback started so
        // handleBargeIn can compute how much assistant audio was heard.
        if (this.responseStartTimestamp === null) {
          this.responseStartTimestamp = this.latestMediaTimestamp;
        }
        // Remember the item id so barge-in can truncate this assistant item.
        if (event.item_id) {
          this.lastAssistantItemId = event.item_id;
        }
        this.sendMark();
        return;
      }
      case "input_audio_buffer.speech_started":
        // Caller started talking — interrupt assistant playback.
        this.handleBargeIn();
        return;
      case "response.audio_transcript.delta":
        // Incremental assistant transcript (final = false).
        if (event.delta) {
          this.config.onTranscript?.("assistant", event.delta, false);
        }
        return;
      case "response.audio_transcript.done":
        // Completed assistant transcript (final = true).
        if (event.transcript) {
          this.config.onTranscript?.("assistant", event.transcript, true);
        }
        return;
      case "conversation.item.input_audio_transcription.completed":
        // Completed user transcript (final = true).
        if (event.transcript) {
          this.config.onTranscript?.("user", event.transcript, true);
        }
        return;
      case "conversation.item.input_audio_transcription.delta":
        // Incremental user transcript (final = false).
        if (event.delta) {
          this.config.onTranscript?.("user", event.delta, false);
        }
        return;
      case "response.function_call_arguments.delta": {
        // Accumulate streamed tool-call argument JSON keyed by item id.
        // Note: a delta with no item_id and no existing buffer is dropped,
        // since new buffers are only created when event.item_id is present.
        const key = event.item_id ?? "unknown";
        const existing = this.toolCallBuffers.get(key);
        if (existing && event.delta) {
          existing.args += event.delta;
        } else if (event.item_id) {
          this.toolCallBuffers.set(event.item_id, {
            name: event.name ?? "",
            callId: event.call_id ?? "",
            args: event.delta ?? "",
          });
        }
        return;
      }
      case "response.function_call_arguments.done": {
        const key = event.item_id ?? "unknown";
        const buffered = this.toolCallBuffers.get(key);
        if (this.config.onToolCall) {
          // Prefer buffered deltas; fall back to the event's own `arguments`
          // payload, then to an empty JSON object.
          const rawArgs =
            buffered?.args ||
            ((event as unknown as Record<string, unknown>).arguments as string) ||
            "{}";
          let args: unknown = {};
          try {
            args = JSON.parse(rawArgs);
          } catch {}
          // Malformed JSON leaves args as {} rather than failing the call.
          this.config.onToolCall({
            itemId: key,
            callId: buffered?.callId || event.call_id || "",
            name: buffered?.name || event.name || "",
            args,
          });
        }
        // Buffer is consumed whether or not a handler was registered.
        this.toolCallBuffers.delete(key);
        return;
      }
      case "error": {
        // Extract a human-readable message from the various error shapes the
        // server may send (object with message, bare value, or nothing).
        const detail =
          event.error && typeof event.error === "object" && "message" in event.error
            ? String((event.error as { message?: unknown }).message ?? "Unknown error")
            : event.error
            ? String(event.error)
            : "Unknown error";
        this.config.onError?.(new Error(detail));
        return;
      }
      default:
        return;
    }
  }
private handleBargeIn(): void {
if (this.markQueue.length > 0 && this.responseStartTimestamp !== null) {
const elapsedMs = this.latestMediaTimestamp - this.responseStartTimestamp;
if (this.lastAssistantItemId) {
this.sendEvent({
type: "conversation.item.truncate",
item_id: this.lastAssistantItemId,
content_index: 0,
audio_end_ms: Math.max(0, elapsedMs),
});
}
this.config.onClearAudio();
this.markQueue = [];
this.lastAssistantItemId = null;
this.responseStartTimestamp = null;
return;
}
this.config.onClearAudio();
}
private sendMark(): void {
const markName = `audio-${Date.now()}`;
this.markQueue.push(markName);
this.config.onMark?.(markName);
}
private sendEvent(event: unknown): void {
if (this.ws?.readyState === WebSocket.OPEN) {
this.ws.send(JSON.stringify(event));
}
}
}
/**
 * Builds the OpenAI realtime voice provider plugin.
 *
 * The plugin is considered configured when an API key is present either in
 * the provider config blob or in the OPENAI_API_KEY environment variable.
 * createBridge resolves the effective key the same way and throws if none
 * is available.
 *
 * @returns A RealtimeVoiceProviderPlugin registered under id "openai".
 */
export function buildOpenAIRealtimeVoiceProvider(): RealtimeVoiceProviderPlugin {
  return {
    id: "openai",
    label: "OpenAI Realtime Voice",
    autoSelectOrder: 10,
    resolveConfig: ({ rawConfig }) => normalizeProviderConfig(rawConfig),
    isConfigured: ({ providerConfig }) => {
      const parsed = readProviderConfig(providerConfig);
      // Env var acts as a fallback credential source.
      return Boolean(parsed.apiKey || process.env.OPENAI_API_KEY);
    },
    createBridge: (req) => {
      const {
        apiKey: configuredKey,
        model,
        voice,
        temperature,
        vadThreshold,
        silenceDurationMs,
        prefixPaddingMs,
        azureEndpoint,
        azureDeployment,
        azureApiVersion,
      } = readProviderConfig(req.providerConfig);
      const apiKey = configuredKey || process.env.OPENAI_API_KEY;
      if (!apiKey) {
        throw new Error("OpenAI API key missing");
      }
      return new OpenAIRealtimeVoiceBridge({
        ...req,
        apiKey,
        model,
        voice,
        temperature,
        vadThreshold,
        silenceDurationMs,
        prefixPaddingMs,
        azureEndpoint,
        azureDeployment,
        azureApiVersion,
      });
    },
  };
}
export type { OpenAIRealtimeVoiceProviderConfig };

View File

@ -4,4 +4,6 @@ export {
openaiCodexMediaUnderstandingProvider,
openaiMediaUnderstandingProvider,
} from "./media-understanding-provider.js";
export { buildOpenAIRealtimeTranscriptionProvider } from "./realtime-transcription-provider.js";
export { buildOpenAIRealtimeVoiceProvider } from "./realtime-voice-provider.js";
export { buildOpenAISpeechProvider } from "./speech-provider.js";

View File

@ -0,0 +1 @@
export { openrouterMediaUnderstandingProvider } from "./media-understanding-provider.js";

View File

@ -145,4 +145,4 @@ Actions:
- While a Twilio stream is active, playback does not fall back to TwiML `<Say>`; stream-TTS failures fail the playback request.
- Outbound conversation calls suppress barge-in only while the initial greeting is actively speaking, then re-enable normal interruption.
- Twilio stream disconnect auto-end uses a short grace window so quick reconnects do not end the call.
- Media streaming requires `ws` and OpenAI Realtime API key.
- Media streaming requires `ws` plus a configured realtime-transcription provider. The bundled provider today is OpenAI.

View File

@ -72,13 +72,25 @@ const voiceCallConfigSchema = {
advanced: true,
},
"streaming.enabled": { label: "Enable Streaming", advanced: true },
"streaming.openaiApiKey": {
"streaming.provider": { label: "Streaming Provider", advanced: true },
"streaming.providers.openai.apiKey": {
label: "OpenAI Realtime API Key",
sensitive: true,
advanced: true,
},
"streaming.sttModel": { label: "Realtime STT Model", advanced: true },
"streaming.providers.openai.model": { label: "Realtime STT Model", advanced: true },
"streaming.streamPath": { label: "Media Stream Path", advanced: true },
"realtime.enabled": { label: "Enable Realtime Voice", advanced: true },
"realtime.provider": { label: "Realtime Voice Provider", advanced: true },
"realtime.streamPath": { label: "Realtime Stream Path", advanced: true },
"realtime.instructions": { label: "Realtime Instructions", advanced: true },
"realtime.providers.openai.apiKey": {
label: "OpenAI Realtime API Key",
sensitive: true,
advanced: true,
},
"realtime.providers.openai.model": { label: "OpenAI Realtime Model", advanced: true },
"realtime.providers.openai.voice": { label: "OpenAI Realtime Voice", advanced: true },
"tts.provider": {
label: "TTS Provider Override",
help: "Deep-merges with messages.tts (Microsoft is ignored for calls).",
@ -181,6 +193,7 @@ export default definePluginEntry({
runtimePromise = createVoiceCallRuntime({
config,
coreConfig: api.config as CoreConfig,
fullConfig: api.config,
agentRuntime: api.runtime.agent,
ttsRuntime: api.runtime.tts,
logger: api.logger,

View File

@ -86,12 +86,16 @@
"label": "Enable Streaming",
"advanced": true
},
"streaming.openaiApiKey": {
"streaming.provider": {
"label": "Streaming Provider",
"advanced": true
},
"streaming.providers.openai.apiKey": {
"label": "OpenAI Realtime API Key",
"sensitive": true,
"advanced": true
},
"streaming.sttModel": {
"streaming.providers.openai.model": {
"label": "Realtime STT Model",
"advanced": true
},
@ -345,9 +349,11 @@
"enabled": {
"type": "boolean"
},
"provider": {
"type": "string"
},
"sttProvider": {
"type": "string",
"enum": ["openai-realtime"]
"type": "string"
},
"openaiApiKey": {
"type": "string"
@ -367,6 +373,13 @@
"streamPath": {
"type": "string"
},
"providers": {
"type": "object",
"additionalProperties": {
"type": "object",
"additionalProperties": true
}
},
"preStartTimeoutMs": {
"type": "integer",
"minimum": 1
@ -385,6 +398,72 @@
}
}
},
"realtime": {
"type": "object",
"additionalProperties": false,
"properties": {
"enabled": {
"type": "boolean"
},
"provider": {
"type": "string"
},
"streamPath": {
"type": "string"
},
"instructions": {
"type": "string"
},
"tools": {
"type": "array",
"items": {
"type": "object",
"additionalProperties": false,
"properties": {
"type": {
"type": "string",
"enum": ["function"]
},
"name": {
"type": "string"
},
"description": {
"type": "string"
},
"parameters": {
"type": "object",
"additionalProperties": false,
"properties": {
"type": {
"type": "string",
"enum": ["object"]
},
"properties": {
"type": "object",
"additionalProperties": true
},
"required": {
"type": "array",
"items": {
"type": "string"
}
}
},
"required": ["type", "properties"]
}
},
"required": ["type", "name", "description", "parameters"]
}
},
"providers": {
"type": "object",
"additionalProperties": {
"type": "object",
"additionalProperties": true
}
}
}
},
"publicUrl": {
"type": "string"
},

View File

@ -179,6 +179,35 @@ describe("validateProviderConfig", () => {
expect(result.errors).toEqual([]);
});
});
describe("realtime config", () => {
it("rejects disabled inbound policy for realtime mode", () => {
const config = createBaseConfig("twilio");
config.realtime.enabled = true;
config.inboundPolicy = "disabled";
const result = validateProviderConfig(config);
expect(result.valid).toBe(false);
expect(result.errors).toContain(
'plugins.entries.voice-call.config.inboundPolicy must not be "disabled" when realtime.enabled is true',
);
});
it("rejects enabling realtime and streaming together", () => {
const config = createBaseConfig("twilio");
config.realtime.enabled = true;
config.streaming.enabled = true;
config.inboundPolicy = "allowlist";
const result = validateProviderConfig(config);
expect(result.valid).toBe(false);
expect(result.errors).toContain(
"plugins.entries.voice-call.config.realtime.enabled and plugins.entries.voice-call.config.streaming.enabled cannot both be true",
);
});
});
});
describe("normalizeVoiceCallConfig", () => {
@ -194,11 +223,25 @@ describe("normalizeVoiceCallConfig", () => {
expect(normalized.serve.path).toBe("/voice/webhook");
expect(normalized.streaming.streamPath).toBe("/custom-stream");
expect(normalized.streaming.sttModel).toBe("gpt-4o-transcribe");
expect(normalized.streaming.provider).toBe("openai");
expect(normalized.streaming.providers.openai).toEqual({});
expect(normalized.realtime.streamPath).toBe("/voice/stream/realtime");
expect(normalized.tunnel.provider).toBe("none");
expect(normalized.webhookSecurity.allowedHosts).toEqual([]);
});
it("derives the realtime stream path from a custom webhook path", () => {
const normalized = normalizeVoiceCallConfig({
enabled: true,
provider: "twilio",
serve: {
path: "/custom/webhook",
},
});
expect(normalized.realtime.streamPath).toBe("/custom/stream/realtime");
});
it("accepts partial nested TTS overrides and preserves nested objects", () => {
const normalized = normalizeVoiceCallConfig({
tts: {

View File

@ -70,7 +70,7 @@ export type PlivoConfig = z.infer<typeof PlivoConfigSchema>;
export const SttConfigSchema = z
.object({
/** STT provider (currently only OpenAI supported) */
/** One-shot STT provider for non-streaming paths. */
provider: z.literal("openai").default("openai"),
/** Whisper model to use */
model: z.string().min(1).default("whisper-1"),
@ -196,25 +196,80 @@ export const OutboundConfigSchema = z
export type OutboundConfig = z.infer<typeof OutboundConfigSchema>;
// -----------------------------------------------------------------------------
// Streaming Configuration (OpenAI Realtime STT)
// Realtime Voice Configuration
// -----------------------------------------------------------------------------
/**
 * Schema for a single realtime tool (function) definition exposed to the
 * realtime voice provider. Each tool is a named function with a JSON-Schema
 * style `parameters` object (`type: "object"` plus a free-form `properties`
 * map and an optional `required` list). `.strict()` rejects unknown keys.
 */
export const RealtimeToolSchema = z
  .object({
    type: z.literal("function"),
    name: z.string().min(1),
    description: z.string(),
    parameters: z.object({
      type: z.literal("object"),
      properties: z.record(z.string(), z.unknown()),
      required: z.array(z.string()).optional(),
    }),
  })
  .strict();
// Inferred config type for a validated realtime tool definition.
export type RealtimeToolConfig = z.infer<typeof RealtimeToolSchema>;
export const VoiceCallRealtimeProvidersConfigSchema = z
.record(z.string(), z.record(z.string(), z.unknown()))
.default({});
export type VoiceCallRealtimeProvidersConfig = z.infer<
typeof VoiceCallRealtimeProvidersConfigSchema
>;
export const VoiceCallStreamingProvidersConfigSchema = z
.record(z.string(), z.record(z.string(), z.unknown()))
.default({});
export type VoiceCallStreamingProvidersConfig = z.infer<
typeof VoiceCallStreamingProvidersConfigSchema
>;
export const VoiceCallRealtimeConfigSchema = z
.object({
/** Enable realtime voice-to-voice mode. */
enabled: z.boolean().default(false),
/** Provider id from registered realtime voice providers. */
provider: z.string().min(1).optional(),
/** Optional override for the local WebSocket route path. */
streamPath: z.string().min(1).optional(),
/** System instructions passed to the realtime provider. */
instructions: z.string().optional(),
/** Tool definitions exposed to the realtime provider. */
tools: z.array(RealtimeToolSchema).default([]),
/** Provider-owned raw config blobs keyed by provider id. */
providers: VoiceCallRealtimeProvidersConfigSchema,
})
.strict()
.default({ enabled: false, tools: [], providers: {} });
export type VoiceCallRealtimeConfig = z.infer<typeof VoiceCallRealtimeConfigSchema>;
// -----------------------------------------------------------------------------
// Streaming Configuration (Realtime Transcription)
// -----------------------------------------------------------------------------
export const VoiceCallStreamingConfigSchema = z
.object({
/** Enable real-time audio streaming (requires WebSocket support) */
enabled: z.boolean().default(false),
/** STT provider for real-time transcription */
sttProvider: z.enum(["openai-realtime"]).default("openai-realtime"),
/** OpenAI API key for Realtime API (uses OPENAI_API_KEY env if not set) */
/** Provider id from registered realtime transcription providers. */
provider: z.string().min(1).default("openai"),
/** @deprecated Legacy alias for provider. */
sttProvider: z.string().min(1).optional(),
/** @deprecated Legacy OpenAI-specific API key field. */
openaiApiKey: z.string().min(1).optional(),
/** OpenAI transcription model (default: gpt-4o-transcribe) */
sttModel: z.string().min(1).default("gpt-4o-transcribe"),
/** VAD silence duration in ms before considering speech ended */
silenceDurationMs: z.number().int().positive().default(800),
/** VAD threshold 0-1 (higher = less sensitive) */
vadThreshold: z.number().min(0).max(1).default(0.5),
/** @deprecated Legacy OpenAI-specific transcription model field. */
sttModel: z.string().min(1).optional(),
/** @deprecated Legacy OpenAI-specific VAD silence duration. */
silenceDurationMs: z.number().int().positive().optional(),
/** @deprecated Legacy OpenAI-specific VAD threshold. */
vadThreshold: z.number().min(0).max(1).optional(),
/** WebSocket path for media stream connections */
streamPath: z.string().min(1).default("/voice/stream"),
/** Provider-owned raw config blobs keyed by provider id. */
providers: VoiceCallStreamingProvidersConfigSchema,
/**
* Close unauthenticated media stream sockets if no valid `start` frame arrives in time.
* Protects against pre-auth idle connection hold attacks.
@ -230,11 +285,9 @@ export const VoiceCallStreamingConfigSchema = z
.strict()
.default({
enabled: false,
sttProvider: "openai-realtime",
sttModel: "gpt-4o-transcribe",
silenceDurationMs: 800,
vadThreshold: 0.5,
provider: "openai",
streamPath: "/voice/stream",
providers: {},
preStartTimeoutMs: 5000,
maxPendingConnections: 32,
maxPendingConnectionsPerIp: 4,
@ -319,6 +372,9 @@ export const VoiceCallConfigSchema = z
/** Real-time audio streaming configuration */
streaming: VoiceCallStreamingConfigSchema,
/** Realtime voice-to-voice configuration */
realtime: VoiceCallRealtimeConfigSchema,
/** Public webhook URL override (if set, bypasses tunnel auto-detection) */
publicUrl: z.string().url().optional(),
@ -364,6 +420,29 @@ function cloneDefaultVoiceCallConfig(): VoiceCallConfig {
return structuredClone(DEFAULT_VOICE_CALL_CONFIG);
}
/**
 * Normalizes a webhook-like path: trims whitespace, guarantees a leading
 * slash, and strips a single trailing slash (except for the bare root "/").
 * Empty or whitespace-only input normalizes to "/".
 */
function normalizeWebhookLikePath(pathname: string): string {
  let path = pathname.trim();
  if (path === "") {
    return "/";
  }
  if (!path.startsWith("/")) {
    path = `/${path}`;
  }
  // Keep "/" intact; otherwise drop one trailing slash.
  if (path.length > 1 && path.endsWith("/")) {
    path = path.slice(0, -1);
  }
  return path;
}
/**
 * Derives the default realtime media-stream path from the webhook serve path.
 * A "…/webhook" suffix is replaced with "…/stream/realtime"; the root path
 * falls back to "/voice/stream/realtime"; any other path gets
 * "/stream/realtime" appended.
 */
function defaultRealtimeStreamPathForServePath(servePath: string): string {
  const base = normalizeWebhookLikePath(servePath);
  if (base === "/") {
    return "/voice/stream/realtime";
  }
  const webhookSuffix = "/webhook";
  const root = base.endsWith(webhookSuffix)
    ? base.slice(0, -webhookSuffix.length)
    : base;
  return `${root}/stream/realtime`;
}
function normalizeVoiceCallTtsConfig(
defaults: VoiceCallTtsConfig,
overrides: DeepPartial<NonNullable<VoiceCallTtsConfig>> | undefined,
@ -375,14 +454,55 @@ function normalizeVoiceCallTtsConfig(
return TtsConfigSchema.parse(deepMergeDefined(defaults ?? {}, overrides ?? {}));
}
/**
 * Drops entries whose value is undefined from a provider-config map.
 * Missing input yields an empty map; defined entries are kept by reference.
 */
function sanitizeVoiceCallProviderConfigs(
  value: Record<string, Record<string, unknown> | undefined> | undefined,
): Record<string, Record<string, unknown>> {
  const sanitized: Record<string, Record<string, unknown>> = {};
  for (const [providerId, providerConfig] of Object.entries(value ?? {})) {
    if (providerConfig !== undefined) {
      sanitized[providerId] = providerConfig;
    }
  }
  return sanitized;
}
export function normalizeVoiceCallConfig(config: VoiceCallConfigInput): VoiceCallConfig {
const defaults = cloneDefaultVoiceCallConfig();
const serve = { ...defaults.serve, ...config.serve };
const streamingProvider =
config.streaming?.provider ??
(typeof config.streaming?.sttProvider === "string"
? config.streaming.sttProvider
: undefined) ??
defaults.streaming.provider;
const streamingProviders = sanitizeVoiceCallProviderConfigs(
config.streaming?.providers ?? defaults.streaming.providers,
);
if (
typeof streamingProvider === "string" &&
streamingProvider.trim() &&
!(streamingProvider in streamingProviders)
) {
streamingProviders[streamingProvider] = {};
}
const realtimeProvider = config.realtime?.provider ?? defaults.realtime.provider;
const realtimeProviders = sanitizeVoiceCallProviderConfigs(
config.realtime?.providers ?? defaults.realtime.providers,
);
if (
typeof realtimeProvider === "string" &&
realtimeProvider.trim() &&
!(realtimeProvider in realtimeProviders)
) {
realtimeProviders[realtimeProvider] = {};
}
return {
...defaults,
...config,
allowFrom: config.allowFrom ?? defaults.allowFrom,
outbound: { ...defaults.outbound, ...config.outbound },
serve: { ...defaults.serve, ...config.serve },
serve,
tailscale: { ...defaults.tailscale, ...config.tailscale },
tunnel: { ...defaults.tunnel, ...config.tunnel },
webhookSecurity: {
@ -392,7 +512,23 @@ export function normalizeVoiceCallConfig(config: VoiceCallConfigInput): VoiceCal
trustedProxyIPs:
config.webhookSecurity?.trustedProxyIPs ?? defaults.webhookSecurity.trustedProxyIPs,
},
streaming: { ...defaults.streaming, ...config.streaming },
streaming: {
...defaults.streaming,
...config.streaming,
provider: streamingProvider,
providers: streamingProviders,
},
realtime: {
...defaults.realtime,
...config.realtime,
provider: realtimeProvider,
streamPath:
config.realtime?.streamPath ??
defaultRealtimeStreamPathForServePath(serve.path ?? defaults.serve.path),
tools:
(config.realtime?.tools as RealtimeToolConfig[] | undefined) ?? defaults.realtime.tools,
providers: realtimeProviders,
},
stt: { ...defaults.stt, ...config.stt },
tts: normalizeVoiceCallTtsConfig(defaults.tts, config.tts),
};
@ -448,6 +584,133 @@ export function resolveVoiceCallConfig(config: VoiceCallConfigInput): VoiceCallC
resolved.webhookSecurity.trustForwardingHeaders ?? false;
resolved.webhookSecurity.trustedProxyIPs = resolved.webhookSecurity.trustedProxyIPs ?? [];
resolved.streaming = {
...resolved.streaming,
providers: { ...(resolved.streaming.providers ?? {}) },
};
const legacyStreamingRaw = resolved.streaming as Record<string, unknown>;
const openaiStreamingRaw =
resolved.streaming.providers.openai && typeof resolved.streaming.providers.openai === "object"
? { ...(resolved.streaming.providers.openai as Record<string, unknown>) }
: {};
if (
typeof openaiStreamingRaw.apiKey !== "string" &&
typeof legacyStreamingRaw.openaiApiKey === "string"
) {
openaiStreamingRaw.apiKey = legacyStreamingRaw.openaiApiKey;
}
if (
typeof openaiStreamingRaw.model !== "string" &&
typeof legacyStreamingRaw.sttModel === "string"
) {
openaiStreamingRaw.model = legacyStreamingRaw.sttModel;
}
if (
openaiStreamingRaw.silenceDurationMs == null &&
typeof legacyStreamingRaw.silenceDurationMs === "number"
) {
openaiStreamingRaw.silenceDurationMs = legacyStreamingRaw.silenceDurationMs;
}
if (
openaiStreamingRaw.vadThreshold == null &&
typeof legacyStreamingRaw.vadThreshold === "number"
) {
openaiStreamingRaw.vadThreshold = legacyStreamingRaw.vadThreshold;
}
if (typeof openaiStreamingRaw.apiKey !== "string" || !openaiStreamingRaw.apiKey.trim()) {
if (process.env.OPENAI_API_KEY) {
openaiStreamingRaw.apiKey = process.env.OPENAI_API_KEY;
}
}
if (
typeof openaiStreamingRaw.model !== "string" &&
typeof process.env.REALTIME_TRANSCRIPTION_MODEL === "string"
) {
openaiStreamingRaw.model = process.env.REALTIME_TRANSCRIPTION_MODEL;
}
if (
typeof openaiStreamingRaw.model !== "string" &&
typeof process.env.STREAMING_STT_MODEL === "string"
) {
openaiStreamingRaw.model = process.env.STREAMING_STT_MODEL;
}
if (openaiStreamingRaw.vadThreshold == null && typeof process.env.VAD_THRESHOLD === "string") {
openaiStreamingRaw.vadThreshold = Number.parseFloat(process.env.VAD_THRESHOLD);
}
if (
openaiStreamingRaw.silenceDurationMs == null &&
typeof process.env.SILENCE_DURATION_MS === "string"
) {
openaiStreamingRaw.silenceDurationMs = Number.parseInt(process.env.SILENCE_DURATION_MS, 10);
}
if (Object.keys(openaiStreamingRaw).length > 0) {
resolved.streaming.providers.openai = openaiStreamingRaw;
}
if (
typeof resolved.streaming.provider === "string" &&
resolved.streaming.provider.trim() &&
!(resolved.streaming.provider in resolved.streaming.providers)
) {
resolved.streaming.providers[resolved.streaming.provider] = {};
}
resolved.realtime = {
...resolved.realtime,
providers: { ...(resolved.realtime.providers ?? {}) },
};
const openaiRealtimeRaw =
resolved.realtime.providers.openai && typeof resolved.realtime.providers.openai === "object"
? { ...(resolved.realtime.providers.openai as Record<string, unknown>) }
: {};
if (typeof openaiRealtimeRaw.apiKey !== "string" || !openaiRealtimeRaw.apiKey.trim()) {
if (process.env.OPENAI_API_KEY) {
openaiRealtimeRaw.apiKey = process.env.OPENAI_API_KEY;
}
}
if (
typeof openaiRealtimeRaw.model !== "string" &&
typeof process.env.REALTIME_VOICE_MODEL === "string"
) {
openaiRealtimeRaw.model = process.env.REALTIME_VOICE_MODEL;
}
if (
typeof openaiRealtimeRaw.voice !== "string" &&
typeof process.env.REALTIME_VOICE_VOICE === "string"
) {
openaiRealtimeRaw.voice = process.env.REALTIME_VOICE_VOICE;
}
if (
typeof resolved.realtime.instructions !== "string" &&
typeof process.env.REALTIME_VOICE_INSTRUCTIONS === "string"
) {
resolved.realtime.instructions = process.env.REALTIME_VOICE_INSTRUCTIONS;
}
if (
openaiRealtimeRaw.temperature == null &&
typeof process.env.REALTIME_VOICE_TEMPERATURE === "string"
) {
openaiRealtimeRaw.temperature = Number.parseFloat(process.env.REALTIME_VOICE_TEMPERATURE);
}
if (openaiRealtimeRaw.vadThreshold == null && typeof process.env.VAD_THRESHOLD === "string") {
openaiRealtimeRaw.vadThreshold = Number.parseFloat(process.env.VAD_THRESHOLD);
}
if (
openaiRealtimeRaw.silenceDurationMs == null &&
typeof process.env.SILENCE_DURATION_MS === "string"
) {
openaiRealtimeRaw.silenceDurationMs = Number.parseInt(process.env.SILENCE_DURATION_MS, 10);
}
if (Object.keys(openaiRealtimeRaw).length > 0) {
resolved.realtime.providers.openai = openaiRealtimeRaw;
}
if (
typeof resolved.realtime.provider === "string" &&
resolved.realtime.provider.trim() &&
!(resolved.realtime.provider in resolved.realtime.providers)
) {
resolved.realtime.providers[resolved.realtime.provider] = {};
}
return normalizeVoiceCallConfig(resolved);
}
@ -516,5 +779,23 @@ export function validateProviderConfig(config: VoiceCallConfig): {
}
}
if (config.realtime.enabled && config.inboundPolicy === "disabled") {
errors.push(
'plugins.entries.voice-call.config.inboundPolicy must not be "disabled" when realtime.enabled is true',
);
}
if (config.realtime.enabled && config.streaming.enabled) {
errors.push(
"plugins.entries.voice-call.config.realtime.enabled and plugins.entries.voice-call.config.streaming.enabled cannot both be true",
);
}
if (config.realtime.enabled && config.provider && config.provider !== "twilio") {
errors.push(
'plugins.entries.voice-call.config.provider must be "twilio" when realtime.enabled is true',
);
}
return { valid: errors.length === 0, errors };
}

View File

@ -125,7 +125,7 @@ describe("voice-call outbound helpers", () => {
maxConcurrentCalls: 3,
outbound: { defaultMode: "conversation" },
fromNumber: "+14155550100",
tts: { providers: { openai: { voice: "nova" } } },
tts: { provider: "openai", providers: { openai: { voice: "nova" } } },
},
storePath: "/tmp/voice-call.json",
webhookUrl: "https://example.com/webhook",
@ -187,7 +187,7 @@ describe("voice-call outbound helpers", () => {
activeCalls: new Map([["call-1", call]]),
providerCallIdMap: new Map(),
provider: { name: "twilio", playTts },
config: { tts: { providers: { openai: { voice: "alloy" } } } },
config: { tts: { provider: "openai", providers: { openai: { voice: "alloy" } } } },
storePath: "/tmp/voice-call.json",
};

View File

@ -100,11 +100,22 @@ function requireConnectedCall(ctx: ConnectedCallContext, callId: CallId): Connec
};
}
function resolveOpenAITtsVoice(config: SpeakContext["config"]): string | undefined {
const providerConfig = config.tts?.providers?.openai;
return providerConfig && typeof providerConfig === "object"
? (providerConfig.voice as string | undefined)
: undefined;
/**
 * Resolves the preferred TTS voice for the configured TTS provider.
 *
 * Looks up the provider-specific config under tts.providers[tts.provider]
 * and returns the first non-blank string among its `voice` then `voiceId`
 * fields; returns undefined when no provider is selected, the provider has
 * no config object, or neither field is a usable string.
 */
function resolvePreferredTtsVoice(config: SpeakContext["config"]): string | undefined {
  const providerId = config.tts?.provider;
  const providerConfig = providerId ? config.tts?.providers?.[providerId] : undefined;
  if (!providerConfig || typeof providerConfig !== "object") {
    return undefined;
  }
  // `voice` wins over the legacy `voiceId` alias; blank strings are ignored.
  for (const field of ["voice", "voiceId"] as const) {
    const candidate = providerConfig[field];
    if (typeof candidate === "string" && candidate.trim()) {
      return candidate;
    }
  }
  return undefined;
}
export async function initiateCall(
@ -164,7 +175,7 @@ export async function initiateCall(
// For notify mode with a message, use inline TwiML with <Say>.
let inlineTwiml: string | undefined;
if (mode === "notify" && initialMessage) {
const pollyVoice = mapVoiceToPolly(resolveOpenAITtsVoice(ctx.config));
const pollyVoice = mapVoiceToPolly(resolvePreferredTtsVoice(ctx.config));
inlineTwiml = generateNotifyTwiml(initialMessage, pollyVoice);
console.log(`[voice-call] Using inline TwiML for notify mode (voice: ${pollyVoice})`);
}
@ -212,7 +223,7 @@ export async function speak(
transitionState(call, "speaking");
persistCallRecord(ctx.storePath, call);
const voice = provider.name === "twilio" ? resolveOpenAITtsVoice(ctx.config) : undefined;
const voice = provider.name === "twilio" ? resolvePreferredTtsVoice(ctx.config) : undefined;
await provider.playTts({
callId,
providerCallId,

View File

@ -1,28 +1,27 @@
import { once } from "node:events";
import http from "node:http";
import type {
RealtimeTranscriptionProviderPlugin,
RealtimeTranscriptionSession,
} from "openclaw/plugin-sdk/realtime-transcription";
import { describe, expect, it, vi } from "vitest";
import { WebSocket } from "ws";
import { MediaStreamHandler, sanitizeLogText } from "./media-stream.js";
import type {
OpenAIRealtimeSTTProvider,
RealtimeSTTSession,
} from "./providers/stt-openai-realtime.js";
const createStubSession = (): RealtimeSTTSession => ({
const createStubSession = (): RealtimeTranscriptionSession => ({
connect: async () => {},
sendAudio: () => {},
waitForTranscript: async () => "",
onPartial: () => {},
onTranscript: () => {},
onSpeechStart: () => {},
close: () => {},
isConnected: () => true,
});
const createStubSttProvider = (): OpenAIRealtimeSTTProvider =>
const createStubSttProvider = (): RealtimeTranscriptionProviderPlugin =>
({
createSession: () => createStubSession(),
}) as unknown as OpenAIRealtimeSTTProvider;
id: "openai",
label: "OpenAI",
isConfigured: () => true,
}) as unknown as RealtimeTranscriptionProviderPlugin;
const flush = async (): Promise<void> => {
await new Promise((resolve) => setTimeout(resolve, 0));
@ -104,7 +103,8 @@ const waitForClose = async (
describe("MediaStreamHandler TTS queue", () => {
it("serializes TTS playback and resolves in order", async () => {
const handler = new MediaStreamHandler({
sttProvider: createStubSttProvider(),
transcriptionProvider: createStubSttProvider(),
providerConfig: {},
});
const started: number[] = [];
const finished: number[] = [];
@ -137,7 +137,8 @@ describe("MediaStreamHandler TTS queue", () => {
it("cancels active playback and clears queued items", async () => {
const handler = new MediaStreamHandler({
sttProvider: createStubSttProvider(),
transcriptionProvider: createStubSttProvider(),
providerConfig: {},
});
let queuedRan = false;
@ -165,7 +166,8 @@ describe("MediaStreamHandler TTS queue", () => {
describe("MediaStreamHandler security hardening", () => {
it("fails sends and closes stream when buffered bytes already exceed the cap", () => {
const handler = new MediaStreamHandler({
sttProvider: createStubSttProvider(),
transcriptionProvider: createStubSttProvider(),
providerConfig: {},
});
const ws = {
readyState: WebSocket.OPEN,
@ -177,7 +179,12 @@ describe("MediaStreamHandler security hardening", () => {
handler as unknown as {
sessions: Map<
string,
{ callId: string; streamSid: string; ws: WebSocket; sttSession: RealtimeSTTSession }
{
callId: string;
streamSid: string;
ws: WebSocket;
sttSession: RealtimeTranscriptionSession;
}
>;
}
).sessions.set("MZ-backpressure", {
@ -196,7 +203,8 @@ describe("MediaStreamHandler security hardening", () => {
it("fails sends when buffered bytes exceed cap after enqueueing a frame", () => {
const handler = new MediaStreamHandler({
sttProvider: createStubSttProvider(),
transcriptionProvider: createStubSttProvider(),
providerConfig: {},
});
const ws = {
readyState: WebSocket.OPEN,
@ -214,7 +222,12 @@ describe("MediaStreamHandler security hardening", () => {
handler as unknown as {
sessions: Map<
string,
{ callId: string; streamSid: string; ws: WebSocket; sttSession: RealtimeSTTSession }
{
callId: string;
streamSid: string;
ws: WebSocket;
sttSession: RealtimeTranscriptionSession;
}
>;
}
).sessions.set("MZ-overflow", {
@ -243,7 +256,8 @@ describe("MediaStreamHandler security hardening", () => {
const shouldAcceptStreamCalls: Array<{ callId: string; streamSid: string; token?: string }> =
[];
const handler = new MediaStreamHandler({
sttProvider: createStubSttProvider(),
transcriptionProvider: createStubSttProvider(),
providerConfig: {},
preStartTimeoutMs: 40,
shouldAcceptStream: (params) => {
shouldAcceptStreamCalls.push(params);
@ -266,7 +280,8 @@ describe("MediaStreamHandler security hardening", () => {
it("enforces pending connection limits", async () => {
const handler = new MediaStreamHandler({
sttProvider: createStubSttProvider(),
transcriptionProvider: createStubSttProvider(),
providerConfig: {},
preStartTimeoutMs: 5_000,
maxPendingConnections: 1,
maxPendingConnectionsPerIp: 1,
@ -291,7 +306,8 @@ describe("MediaStreamHandler security hardening", () => {
it("rejects upgrades when max connection cap is reached", async () => {
const handler = new MediaStreamHandler({
sttProvider: createStubSttProvider(),
transcriptionProvider: createStubSttProvider(),
providerConfig: {},
preStartTimeoutMs: 5_000,
maxConnections: 1,
maxPendingConnections: 10,
@ -319,7 +335,8 @@ describe("MediaStreamHandler security hardening", () => {
it("clears pending state after valid start", async () => {
const handler = new MediaStreamHandler({
sttProvider: createStubSttProvider(),
transcriptionProvider: createStubSttProvider(),
providerConfig: {},
preStartTimeoutMs: 40,
shouldAcceptStream: () => true,
});
@ -349,7 +366,8 @@ describe("MediaStreamHandler security hardening", () => {
const shouldAcceptStreamCalls: Array<{ callId: string; streamSid: string; token?: string }> =
[];
const handler = new MediaStreamHandler({
sttProvider: createStubSttProvider(),
transcriptionProvider: createStubSttProvider(),
providerConfig: {},
preStartTimeoutMs: 1_000,
shouldAcceptStream: (params) => {
shouldAcceptStreamCalls.push(params);

View File

@ -3,24 +3,27 @@
*
* Handles bidirectional audio streaming between Twilio and the AI services.
* - Receives mu-law audio from Twilio via WebSocket
* - Forwards to OpenAI Realtime STT for transcription
* - Forwards to the selected realtime transcription provider
* - Sends TTS audio back to Twilio
*/
import type { IncomingMessage } from "node:http";
import type { Duplex } from "node:stream";
import { type RawData, WebSocket, WebSocketServer } from "ws";
import type {
OpenAIRealtimeSTTProvider,
RealtimeSTTSession,
} from "./providers/stt-openai-realtime.js";
RealtimeTranscriptionProviderConfig,
RealtimeTranscriptionProviderPlugin,
RealtimeTranscriptionSession,
} from "openclaw/plugin-sdk/realtime-transcription";
import { type RawData, WebSocket, WebSocketServer } from "ws";
/**
* Configuration for the media stream handler.
*/
export interface MediaStreamConfig {
/** STT provider for transcription */
sttProvider: OpenAIRealtimeSTTProvider;
/** Realtime transcription provider for streaming STT. */
transcriptionProvider: RealtimeTranscriptionProviderPlugin;
/** Provider-owned config blob passed into the transcription session. */
providerConfig: RealtimeTranscriptionProviderConfig;
/** Close sockets that never send a valid `start` frame within this window. */
preStartTimeoutMs?: number;
/** Max concurrent pre-start sockets. */
@ -50,7 +53,7 @@ interface StreamSession {
callId: string;
streamSid: string;
ws: WebSocket;
sttSession: RealtimeSTTSession;
sttSession: RealtimeTranscriptionSession;
}
type TtsQueueEntry = {
@ -254,20 +257,20 @@ export class MediaStreamHandler {
return null;
}
// Create STT session
const sttSession = this.config.sttProvider.createSession();
// Set up transcript callbacks
sttSession.onPartial((partial) => {
this.config.onPartialTranscript?.(callSid, partial);
});
sttSession.onTranscript((transcript) => {
this.config.onTranscript?.(callSid, transcript);
});
sttSession.onSpeechStart(() => {
this.config.onSpeechStart?.(callSid);
const sttSession = this.config.transcriptionProvider.createSession({
providerConfig: this.config.providerConfig,
onPartial: (partial) => {
this.config.onPartialTranscript?.(callSid, partial);
},
onTranscript: (transcript) => {
this.config.onTranscript?.(callSid, transcript);
},
onSpeechStart: () => {
this.config.onSpeechStart?.(callSid);
},
onError: (error) => {
console.warn("[MediaStream] Transcription session error:", error.message);
},
});
const session: StreamSession = {
@ -282,7 +285,7 @@ export class MediaStreamHandler {
// Notify connection BEFORE STT connect so TTS can work even if STT fails
this.config.onConnect?.(callSid, streamSid);
// Connect to OpenAI STT (non-blocking, log errors but don't fail the call)
// Connect to transcription service (non-blocking, log errors but don't fail the call)
sttSession.connect().catch((err) => {
console.warn(`[MediaStream] STT connection failed (TTS still works):`, err.message);
});

View File

@ -1,10 +1,5 @@
export type { VoiceCallProvider } from "./base.js";
export { MockProvider } from "./mock.js";
export {
OpenAIRealtimeSTTProvider,
type RealtimeSTTConfig,
type RealtimeSTTSession,
} from "./stt-openai-realtime.js";
export { TelnyxProvider } from "./telnyx.js";
export { TwilioProvider } from "./twilio.js";
export { PlivoProvider } from "./plivo.js";

View File

@ -1,42 +0,0 @@
import { describe, expect, it } from "vitest";
import type { RealtimeSTTConfig } from "./stt-openai-realtime.js";
import { OpenAIRealtimeSTTProvider } from "./stt-openai-realtime.js";
// Shape of the private VAD tuning fields we peek at for default checks.
type ProviderInternals = {
  vadThreshold: number;
  silenceDurationMs: number;
};

/** Build a provider and expose its private VAD tuning fields for assertions. */
function readProviderInternals(config: RealtimeSTTConfig): ProviderInternals {
  const internals = new OpenAIRealtimeSTTProvider(config) as unknown as Record<string, unknown>;
  return {
    vadThreshold: internals["vadThreshold"] as number,
    silenceDurationMs: internals["silenceDurationMs"] as number,
  };
}
describe("OpenAIRealtimeSTTProvider constructor defaults", () => {
  const apiKey = "sk-test"; // pragma: allowlist secret

  it("uses vadThreshold: 0 when explicitly configured (max sensitivity)", () => {
    // Explicit 0 must survive the ?? default, not be clobbered to 0.5.
    expect(readProviderInternals({ apiKey, vadThreshold: 0 }).vadThreshold).toBe(0);
  });

  it("uses silenceDurationMs: 0 when explicitly configured", () => {
    expect(readProviderInternals({ apiKey, silenceDurationMs: 0 }).silenceDurationMs).toBe(0);
  });

  it("falls back to defaults when values are undefined", () => {
    const internals = readProviderInternals({ apiKey });
    expect(internals.vadThreshold).toBe(0.5);
    expect(internals.silenceDurationMs).toBe(800);
  });
});

View File

@ -1,321 +0,0 @@
/**
* OpenAI Realtime STT Provider
*
* Uses the OpenAI Realtime API for streaming transcription with:
* - Direct mu-law audio support (no conversion needed)
* - Built-in server-side VAD for turn detection
* - Low-latency streaming transcription
* - Partial transcript callbacks for real-time UI updates
*/
import WebSocket from "ws";
/**
 * Configuration for OpenAI Realtime STT.
 *
 * Defaults shown below are applied by the provider constructor, not here.
 */
export interface RealtimeSTTConfig {
  /** OpenAI API key (required; constructor throws when empty) */
  apiKey: string;
  /** Model to use (default: gpt-4o-transcribe) */
  model?: string;
  /** Silence duration in ms before considering speech ended (default: 800) */
  silenceDurationMs?: number;
  /** VAD threshold 0-1 (default: 0.5); 0 = maximum sensitivity */
  vadThreshold?: number;
}
/**
 * Session for streaming audio and receiving transcripts.
 *
 * Typical flow: connect() once, stream frames via sendAudio(), and consume
 * results through the callback setters (or waitForTranscript for one-shot use).
 */
export interface RealtimeSTTSession {
  /** Connect to the transcription service */
  connect(): Promise<void>;
  /** Send mu-law audio data (8kHz mono) */
  sendAudio(audio: Buffer): void;
  /** Wait for next complete transcript (after VAD detects end of speech) */
  waitForTranscript(timeoutMs?: number): Promise<string>;
  /** Set callback for partial transcripts (streaming) */
  onPartial(callback: (partial: string) => void): void;
  /** Set callback for final transcripts */
  onTranscript(callback: (transcript: string) => void): void;
  /** Set callback when speech starts (VAD) */
  onSpeechStart(callback: () => void): void;
  /** Close the session */
  close(): void;
  /** Check if session is connected */
  isConnected(): boolean;
}
/**
 * Provider factory for OpenAI Realtime STT sessions.
 *
 * Validates the API key once at construction and captures the model/VAD
 * tuning, so every session created by this provider shares the same settings.
 */
export class OpenAIRealtimeSTTProvider {
  readonly name = "openai-realtime";

  private apiKey: string;
  private model: string;
  private silenceDurationMs: number;
  private vadThreshold: number;

  constructor(config: RealtimeSTTConfig) {
    const { apiKey, model, silenceDurationMs, vadThreshold } = config;
    if (!apiKey) {
      throw new Error("OpenAI API key required for Realtime STT");
    }
    this.apiKey = apiKey;
    // `||` (not `??`) intentionally treats an empty-string model as unset.
    this.model = model || "gpt-4o-transcribe";
    // `??` preserves an explicit 0 for both VAD tuning values.
    this.silenceDurationMs = silenceDurationMs ?? 800;
    this.vadThreshold = vadThreshold ?? 0.5;
  }

  /**
   * Create a new realtime transcription session bound to this provider's
   * credentials and tuning.
   */
  createSession(): RealtimeSTTSession {
    return new OpenAIRealtimeSTTSession(
      this.apiKey,
      this.model,
      this.silenceDurationMs,
      this.vadThreshold,
    );
  }
}
/**
 * WebSocket-based session for real-time speech-to-text.
 *
 * Lifecycle: connect() opens the realtime socket and immediately queues a
 * `transcription_session.update` frame configuring mu-law input and
 * server-side VAD. Audio is pushed with sendAudio(); transcript events fan
 * out through the registered callbacks. Unexpected closes trigger
 * exponential-backoff reconnects until close() is called or the attempt cap
 * is reached.
 */
class OpenAIRealtimeSTTSession implements RealtimeSTTSession {
  private static readonly MAX_RECONNECT_ATTEMPTS = 5;
  private static readonly RECONNECT_DELAY_MS = 1000;

  private ws: WebSocket | null = null;
  private connected = false;
  // True once close() has been called; suppresses auto-reconnect.
  private closed = false;
  private connectTimeout: ReturnType<typeof setTimeout> | null = null;
  private reconnectAttempts = 0;
  // Accumulated streaming deltas for the in-progress utterance.
  private pendingTranscript = "";
  private onTranscriptCallback: ((transcript: string) => void) | null = null;
  private onPartialCallback: ((partial: string) => void) | null = null;
  private onSpeechStartCallback: (() => void) | null = null;

  constructor(
    private readonly apiKey: string,
    private readonly model: string,
    private readonly silenceDurationMs: number,
    private readonly vadThreshold: number,
  ) {}

  /** Open (or reopen) the realtime socket; resets reconnect bookkeeping. */
  async connect(): Promise<void> {
    this.closed = false;
    this.reconnectAttempts = 0;
    return this.doConnect();
  }

  // Single connection attempt. Resolves once the socket opens and the session
  // config frame has been queued; rejects on pre-open error or 10s timeout.
  private async doConnect(): Promise<void> {
    return new Promise((resolve, reject) => {
      const url = "wss://api.openai.com/v1/realtime?intent=transcription";
      this.ws = new WebSocket(url, {
        headers: {
          Authorization: `Bearer ${this.apiKey}`,
          "OpenAI-Beta": "realtime=v1",
        },
      });

      this.ws.on("open", () => {
        console.log("[RealtimeSTT] WebSocket connected");
        this.connected = true;
        this.reconnectAttempts = 0;
        if (this.connectTimeout) {
          clearTimeout(this.connectTimeout);
          this.connectTimeout = null;
        }
        // Configure the transcription session: mu-law input (Twilio native)
        // plus server-side VAD using this session's tuning.
        this.sendEvent({
          type: "transcription_session.update",
          session: {
            input_audio_format: "g711_ulaw",
            input_audio_transcription: {
              model: this.model,
            },
            turn_detection: {
              type: "server_vad",
              threshold: this.vadThreshold,
              prefix_padding_ms: 300,
              silence_duration_ms: this.silenceDurationMs,
            },
          },
        });
        resolve();
      });

      this.ws.on("message", (data: Buffer) => {
        try {
          const event = JSON.parse(data.toString());
          this.handleEvent(event);
        } catch (e) {
          console.error("[RealtimeSTT] Failed to parse event:", e);
        }
      });

      this.ws.on("error", (error) => {
        console.error("[RealtimeSTT] WebSocket error:", error);
        // Only reject pre-open errors; post-open errors surface via "close".
        if (!this.connected) {
          reject(error);
        }
      });

      this.ws.on("close", (code, reason) => {
        console.log(
          `[RealtimeSTT] WebSocket closed (code: ${code}, reason: ${reason?.toString() || "none"})`,
        );
        this.connected = false;
        // Attempt reconnection if not intentionally closed
        if (!this.closed) {
          void this.attemptReconnect();
        }
      });

      // NOTE(review): on timeout this rejects but does not close the socket;
      // a late "open" could still mark the session connected — confirm intent.
      this.connectTimeout = setTimeout(() => {
        this.connectTimeout = null;
        if (!this.connected) {
          reject(new Error("Realtime STT connection timeout"));
        }
      }, 10000);
    });
  }

  // Reconnect with exponential backoff: 1s, 2s, 4s, ... capped at
  // MAX_RECONNECT_ATTEMPTS tries. Bails out whenever close() intervenes.
  private async attemptReconnect(): Promise<void> {
    if (this.closed) {
      return;
    }
    if (this.reconnectAttempts >= OpenAIRealtimeSTTSession.MAX_RECONNECT_ATTEMPTS) {
      console.error(
        `[RealtimeSTT] Max reconnect attempts (${OpenAIRealtimeSTTSession.MAX_RECONNECT_ATTEMPTS}) reached`,
      );
      return;
    }
    this.reconnectAttempts++;
    const delay = OpenAIRealtimeSTTSession.RECONNECT_DELAY_MS * 2 ** (this.reconnectAttempts - 1);
    console.log(
      `[RealtimeSTT] Reconnecting ${this.reconnectAttempts}/${OpenAIRealtimeSTTSession.MAX_RECONNECT_ATTEMPTS} in ${delay}ms...`,
    );
    await new Promise((resolve) => setTimeout(resolve, delay));
    // Re-check: close() may have been called while we slept.
    if (this.closed) {
      return;
    }
    try {
      await this.doConnect();
      console.log("[RealtimeSTT] Reconnected successfully");
    } catch (error) {
      console.error("[RealtimeSTT] Reconnect failed:", error);
    }
  }

  // Dispatch a parsed server event to the right callback / internal state.
  private handleEvent(event: {
    type: string;
    delta?: string;
    transcript?: string;
    error?: unknown;
  }): void {
    switch (event.type) {
      // Informational lifecycle events: log only.
      case "transcription_session.created":
      case "transcription_session.updated":
      case "input_audio_buffer.speech_stopped":
      case "input_audio_buffer.committed":
        console.log(`[RealtimeSTT] ${event.type}`);
        break;
      case "conversation.item.input_audio_transcription.delta":
        // Streaming partial: accumulate and surface the running text.
        if (event.delta) {
          this.pendingTranscript += event.delta;
          this.onPartialCallback?.(this.pendingTranscript);
        }
        break;
      case "conversation.item.input_audio_transcription.completed":
        if (event.transcript) {
          console.log(`[RealtimeSTT] Transcript: ${event.transcript}`);
          this.onTranscriptCallback?.(event.transcript);
        }
        // Reset accumulator even when the final transcript was empty.
        this.pendingTranscript = "";
        break;
      case "input_audio_buffer.speech_started":
        console.log("[RealtimeSTT] Speech started");
        this.pendingTranscript = "";
        this.onSpeechStartCallback?.();
        break;
      case "error":
        console.error("[RealtimeSTT] Error:", event.error);
        break;
    }
  }

  // Serialize and send an event; silently dropped if the socket is not open.
  private sendEvent(event: unknown): void {
    if (this.ws?.readyState === WebSocket.OPEN) {
      this.ws.send(JSON.stringify(event));
    }
  }

  /** Send one mu-law frame; frames are dropped while disconnected. */
  sendAudio(muLawData: Buffer): void {
    if (!this.connected) {
      return;
    }
    this.sendEvent({
      type: "input_audio_buffer.append",
      audio: muLawData.toString("base64"),
    });
  }

  onPartial(callback: (partial: string) => void): void {
    this.onPartialCallback = callback;
  }

  onTranscript(callback: (transcript: string) => void): void {
    this.onTranscriptCallback = callback;
  }

  onSpeechStart(callback: () => void): void {
    this.onSpeechStartCallback = callback;
  }

  /**
   * Resolve with the next final transcript, or reject after timeoutMs.
   *
   * NOTE(review): this temporarily replaces any callback registered via
   * onTranscript() and clears it afterwards — mixing the two APIs on one
   * session loses the persistent callback; confirm callers never do both.
   */
  async waitForTranscript(timeoutMs = 30000): Promise<string> {
    return new Promise((resolve, reject) => {
      const timeout = setTimeout(() => {
        this.onTranscriptCallback = null;
        reject(new Error("Transcript timeout"));
      }, timeoutMs);
      this.onTranscriptCallback = (transcript) => {
        clearTimeout(timeout);
        this.onTranscriptCallback = null;
        resolve(transcript);
      };
    });
  }

  /** Intentional shutdown: stops reconnects, cancels timers, closes socket. */
  close(): void {
    this.closed = true;
    if (this.connectTimeout) {
      clearTimeout(this.connectTimeout);
      this.connectTimeout = null;
    }
    if (this.ws) {
      this.ws.close();
      this.ws = null;
    }
    this.connected = false;
  }

  isConnected(): boolean {
    return this.connected;
  }
}

View File

@ -1,43 +0,0 @@
import { describe, expect, it } from "vitest";
import type { OpenAITTSConfig } from "./tts-openai.js";
import { OpenAITTSProvider } from "./tts-openai.js";
// Private provider fields inspected by the default-value tests.
type ProviderInternals = {
  model: string;
  voice: string;
  speed: number;
};

/** Construct a provider and view its private fields for default checks. */
function readProviderInternals(config: OpenAITTSConfig): ProviderInternals {
  const provider = new OpenAITTSProvider(config);
  return provider as unknown as ProviderInternals;
}
describe("OpenAITTSProvider constructor defaults", () => {
  const apiKey = "sk-test"; // pragma: allowlist secret

  it("uses speed: 0 when explicitly configured", () => {
    // Explicit 0 must survive the ?? default, not be clobbered to 1.0.
    expect(readProviderInternals({ apiKey, speed: 0 }).speed).toBe(0);
  });

  it("falls back to speed default when undefined", () => {
    expect(readProviderInternals({ apiKey }).speed).toBe(1.0);
  });

  it("treats blank model and voice overrides as unset", () => {
    const internals = readProviderInternals({ apiKey, model: " ", voice: "" });
    expect(internals.model).toBe("gpt-4o-mini-tts");
    expect(internals.voice).toBe("coral");
  });
});

View File

@ -1,185 +0,0 @@
import { convertPcmToMulaw8k } from "../telephony-audio.js";
/**
* OpenAI TTS Provider
*
* Generates speech audio using OpenAI's text-to-speech API.
* Handles audio format conversion for telephony (mu-law 8kHz).
*
* Best practices from OpenAI docs:
* - Use gpt-4o-mini-tts for intelligent realtime applications (supports instructions)
* - Use tts-1 for lower latency, tts-1-hd for higher quality
* - Use marin or cedar voices for best quality
* - Use pcm or wav format for fastest response times
*
* @see https://platform.openai.com/docs/guides/text-to-speech
*/
/**
 * OpenAI TTS configuration.
 *
 * All fields are optional; blank strings are treated as unset by the
 * provider constructor.
 */
export interface OpenAITTSConfig {
  /** OpenAI API key (uses OPENAI_API_KEY env if not set) */
  apiKey?: string;
  /**
   * TTS model:
   * - gpt-4o-mini-tts: newest, supports instructions for tone/style control (recommended)
   * - tts-1: lower latency
   * - tts-1-hd: higher quality
   */
  model?: string;
  /**
   * Voice to use. For best quality, use marin or cedar.
   * All 13 voices: alloy, ash, ballad, coral, echo, fable, nova, onyx, sage, shimmer, verse, marin, cedar
   * Note: tts-1/tts-1-hd only support: alloy, ash, coral, echo, fable, onyx, nova, sage, shimmer
   */
  voice?: string;
  /** Speed multiplier (0.25 to 4.0); an explicit 0 is passed through as-is */
  speed?: number;
  /**
   * Instructions for speech style (only works with gpt-4o-mini-tts model).
   * Examples: "Speak in a cheerful tone", "Talk like a sympathetic customer service agent"
   */
  instructions?: string;
}

/**
 * Supported OpenAI TTS voices (all 13 built-in voices).
 * For best quality, use marin or cedar.
 * Note: tts-1 and tts-1-hd support a smaller set.
 */
export const OPENAI_TTS_VOICES = [
  "alloy",
  "ash",
  "ballad",
  "coral",
  "echo",
  "fable",
  "nova",
  "onyx",
  "sage",
  "shimmer",
  "verse",
  "marin",
  "cedar",
] as const;

/** Union of the voice names above, derived so the list stays the single source of truth. */
export type OpenAITTSVoice = (typeof OPENAI_TTS_VOICES)[number];
/** Normalize a string option: trim whitespace and map blank/undefined to undefined. */
function trimToUndefined(value: string | undefined): string | undefined {
  if (value === undefined) {
    return undefined;
  }
  const trimmed = value.trim();
  return trimmed.length > 0 ? trimmed : undefined;
}

/**
 * Resolve the effective TTS `instructions` value.
 * Instructions are only honored by gpt-4o-mini-tts models; any other model
 * (or a blank/absent value) yields undefined so the field is omitted.
 */
function resolveOpenAITtsInstructions(model: string, instructions?: string): string | undefined {
  const trimmed = trimToUndefined(instructions);
  if (!trimmed) {
    return undefined;
  }
  return model.includes("gpt-4o-mini-tts") ? trimmed : undefined;
}
/**
 * OpenAI TTS Provider for generating speech audio.
 *
 * Synthesizes speech through the /v1/audio/speech REST endpoint (raw 24kHz
 * mono 16-bit PCM) and can down-convert the result to 8kHz mu-law for
 * Twilio media streams.
 */
export class OpenAITTSProvider {
  private apiKey: string;
  private model: string;
  private voice: OpenAITTSVoice;
  private speed: number;
  private instructions?: string;

  /** Trim a string option; blank or undefined values become undefined. */
  private clean(value: string | undefined): string | undefined {
    const trimmed = value?.trim();
    return trimmed ? trimmed : undefined;
  }

  constructor(config: OpenAITTSConfig = {}) {
    this.apiKey = this.clean(config.apiKey) ?? this.clean(process.env.OPENAI_API_KEY) ?? "";
    // Default to gpt-4o-mini-tts for intelligent realtime applications.
    this.model = this.clean(config.model) ?? "gpt-4o-mini-tts";
    // Default to coral - good balance of quality and natural tone.
    this.voice = (this.clean(config.voice) as OpenAITTSVoice | undefined) ?? "coral";
    // `??` keeps an explicit speed of 0 intact.
    this.speed = config.speed ?? 1.0;
    this.instructions = this.clean(config.instructions);
    if (!this.apiKey) {
      throw new Error("OpenAI API key required (set OPENAI_API_KEY or pass apiKey)");
    }
  }

  /**
   * Generate speech audio from text.
   * Returns raw PCM audio data (24kHz, mono, 16-bit).
   */
  async synthesize(text: string, instructions?: string): Promise<Buffer> {
    const payload: Record<string, unknown> = {
      model: this.model,
      input: text,
      voice: this.voice,
      response_format: "pcm", // Raw PCM audio (24kHz, mono, 16-bit signed LE)
      speed: this.speed,
    };
    // Per-call instructions override the configured default; only the
    // gpt-4o-mini-tts family honors the field, so it is omitted otherwise.
    const requested = this.clean(instructions) ?? this.instructions;
    if (requested && this.model.includes("gpt-4o-mini-tts")) {
      payload.instructions = requested;
    }
    const response = await fetch("https://api.openai.com/v1/audio/speech", {
      method: "POST",
      headers: {
        Authorization: `Bearer ${this.apiKey}`,
        "Content-Type": "application/json",
      },
      body: JSON.stringify(payload),
    });
    if (!response.ok) {
      const error = await response.text();
      throw new Error(`OpenAI TTS failed: ${response.status} - ${error}`);
    }
    return Buffer.from(await response.arrayBuffer());
  }

  /**
   * Generate speech and convert to mu-law format for Twilio.
   * Twilio Media Streams expect 8kHz mono mu-law audio.
   */
  async synthesizeForTwilio(text: string): Promise<Buffer> {
    const pcm24k = await this.synthesize(text);
    return convertPcmToMulaw8k(pcm24k, 24000);
  }
}
/**
 * Convert 8-bit mu-law to 16-bit linear PCM.
 * Useful for decoding incoming audio.
 */
export function mulawToLinear(mulaw: number): number {
  // mu-law bytes are transmitted bit-inverted; undo that first.
  const inverted = ~mulaw & 0xff;
  const negative = (inverted & 0x80) !== 0;
  const exponent = (inverted >> 4) & 0x07;
  const mantissa = inverted & 0x0f;
  // Rebuild the magnitude: mantissa plus the implicit bias, scaled by the
  // exponent segment, then the bias removed again.
  const magnitude = (((mantissa << 3) + 132) << exponent) - 132;
  return negative ? -magnitude : magnitude;
}
/**
 * Chunk audio buffer into 20ms frames for streaming.
 * At 8kHz mono, 20ms = 160 samples = 160 bytes (mu-law).
 *
 * Yields views (subarrays) into the original buffer — no copying — and the
 * final chunk may be shorter than chunkSize.
 *
 * @param audio     source audio buffer
 * @param chunkSize bytes per chunk (default 160 = 20ms of 8kHz mu-law)
 */
export function* chunkAudio(audio: Buffer, chunkSize = 160): Generator<Buffer, void, unknown> {
  // Direct generator function instead of the original inline-IIFE generator;
  // same lazy iteration semantics, one less level of indirection.
  for (let offset = 0; offset < audio.length; offset += chunkSize) {
    yield audio.subarray(offset, Math.min(offset + chunkSize, audio.length));
  }
}

View File

@ -0,0 +1,4 @@
// Runtime indirection for realtime-transcription provider lookups.
// NOTE(review): kept as a standalone module so callers can lazy-import it
// (see webhook.ts) and tests can vi.mock the lookup without touching the SDK.
export {
  getRealtimeTranscriptionProvider,
  listRealtimeTranscriptionProviders,
} from "openclaw/plugin-sdk/realtime-transcription";

View File

@ -0,0 +1,4 @@
// Runtime indirection for realtime-voice provider lookups.
// NOTE(review): kept as a standalone module so callers can lazy-import it
// (see resolveRealtimeProvider in index.ts) and tests can mock the lookup.
export {
  getRealtimeVoiceProvider,
  listRealtimeVoiceProviders,
} from "openclaw/plugin-sdk/realtime-voice";

View File

@ -1,12 +1,14 @@
import type { OpenClawConfig } from "openclaw/plugin-sdk/core";
import type {
RealtimeVoiceProviderConfig,
RealtimeVoiceProviderPlugin,
} from "openclaw/plugin-sdk/realtime-voice";
import type { VoiceCallConfig } from "./config.js";
import { resolveVoiceCallConfig, validateProviderConfig } from "./config.js";
import type { CoreAgentDeps, CoreConfig } from "./core-bridge.js";
import { CallManager } from "./manager.js";
import type { VoiceCallProvider } from "./providers/base.js";
import { MockProvider } from "./providers/mock.js";
import { PlivoProvider } from "./providers/plivo.js";
import { TelnyxProvider } from "./providers/telnyx.js";
import { TwilioProvider } from "./providers/twilio.js";
import type { TwilioProvider } from "./providers/twilio.js";
import type { TelephonyTtsRuntime } from "./telephony-tts.js";
import { createTelephonyTtsProvider } from "./telephony-tts.js";
import { startTunnel, type TunnelResult } from "./tunnel.js";
@ -30,6 +32,11 @@ type Logger = {
debug?: (message: string) => void;
};
type ResolvedRealtimeProvider = {
provider: RealtimeVoiceProviderPlugin;
providerConfig: RealtimeVoiceProviderConfig;
};
function createRuntimeResourceLifecycle(params: {
config: VoiceCallConfig;
webhookServer: VoiceCallWebhookServer;
@ -80,14 +87,15 @@ function isLoopbackBind(bind: string | undefined): boolean {
return bind === "127.0.0.1" || bind === "::1" || bind === "localhost";
}
function resolveProvider(config: VoiceCallConfig): VoiceCallProvider {
async function resolveProvider(config: VoiceCallConfig): Promise<VoiceCallProvider> {
const allowNgrokFreeTierLoopbackBypass =
config.tunnel?.provider === "ngrok" &&
isLoopbackBind(config.serve?.bind) &&
(config.tunnel?.allowNgrokFreeTierLoopbackBypass ?? false);
switch (config.provider) {
case "telnyx":
case "telnyx": {
const { TelnyxProvider } = await import("./providers/telnyx.js");
return new TelnyxProvider(
{
apiKey: config.telnyx?.apiKey,
@ -98,7 +106,9 @@ function resolveProvider(config: VoiceCallConfig): VoiceCallProvider {
skipVerification: config.skipSignatureVerification,
},
);
case "twilio":
}
case "twilio": {
const { TwilioProvider } = await import("./providers/twilio.js");
return new TwilioProvider(
{
accountSid: config.twilio?.accountSid,
@ -112,7 +122,9 @@ function resolveProvider(config: VoiceCallConfig): VoiceCallProvider {
webhookSecurity: config.webhookSecurity,
},
);
case "plivo":
}
case "plivo": {
const { PlivoProvider } = await import("./providers/plivo.js");
return new PlivoProvider(
{
authId: config.plivo?.authId,
@ -125,21 +137,66 @@ function resolveProvider(config: VoiceCallConfig): VoiceCallProvider {
webhookSecurity: config.webhookSecurity,
},
);
case "mock":
}
case "mock": {
const { MockProvider } = await import("./providers/mock.js");
return new MockProvider();
}
default:
throw new Error(`Unsupported voice-call provider: ${String(config.provider)}`);
}
}
/**
 * Pick the realtime voice provider for this runtime and resolve its config.
 *
 * Selection: an explicitly configured provider id wins (and must be
 * registered, else we throw); otherwise the registered provider with the
 * lowest autoSelectOrder is auto-selected. Throws when no provider is
 * registered or the chosen one reports itself unconfigured.
 */
async function resolveRealtimeProvider(params: {
  config: VoiceCallConfig;
  fullConfig: OpenClawConfig;
}): Promise<ResolvedRealtimeProvider> {
  // Lazy import so the plugin SDK lookup is only loaded when realtime is on.
  const { getRealtimeVoiceProvider, listRealtimeVoiceProviders } =
    await import("./realtime-voice.runtime.js");
  const configuredProviderId = params.config.realtime.provider?.trim();
  const configuredProvider = getRealtimeVoiceProvider(configuredProviderId, params.fullConfig);
  // An explicit-but-unknown id is a hard error; silent fallback would mask typos.
  if (configuredProviderId && !configuredProvider) {
    throw new Error(`Realtime voice provider "${configuredProviderId}" is not registered`);
  }
  // Auto-select: lowest autoSelectOrder first (missing order sorts last).
  // Spread-copy before sort to avoid mutating the registry's list.
  const provider =
    configuredProvider ??
    [...listRealtimeVoiceProviders(params.fullConfig)].sort(
      (left, right) =>
        (left.autoSelectOrder ?? Number.MAX_SAFE_INTEGER) -
        (right.autoSelectOrder ?? Number.MAX_SAFE_INTEGER),
    )[0];
  if (!provider) {
    throw new Error("No realtime voice provider registered");
  }
  const rawProviderConfig =
    (params.config.realtime.providers?.[provider.id] as RealtimeVoiceProviderConfig | undefined) ??
    {};
  // NOTE(review): rawConfig mixes the whole `providers` map with a
  // per-provider blob keyed by provider.id — confirm this matches the
  // resolveConfig contract in the plugin SDK.
  const providerConfig =
    provider.resolveConfig?.({
      cfg: params.fullConfig,
      rawConfig: {
        providers: params.config.realtime.providers,
        [provider.id]: rawProviderConfig,
      },
    }) ?? rawProviderConfig;
  if (!provider.isConfigured({ cfg: params.fullConfig, providerConfig })) {
    throw new Error(`Realtime voice provider "${provider.id}" is not configured`);
  }
  return { provider, providerConfig };
}
export async function createVoiceCallRuntime(params: {
config: VoiceCallConfig;
coreConfig: CoreConfig;
fullConfig?: OpenClawConfig;
agentRuntime: CoreAgentDeps;
ttsRuntime?: TelephonyTtsRuntime;
logger?: Logger;
}): Promise<VoiceCallRuntime> {
const { config: rawConfig, coreConfig, agentRuntime, ttsRuntime, logger } = params;
const { config: rawConfig, coreConfig, fullConfig, agentRuntime, ttsRuntime, logger } = params;
const log = logger ?? {
info: console.log,
warn: console.warn,
@ -164,8 +221,14 @@ export async function createVoiceCallRuntime(params: {
throw new Error(`Invalid voice-call config: ${validation.errors.join("; ")}`);
}
const provider = resolveProvider(config);
const provider = await resolveProvider(config);
const manager = new CallManager(config);
const realtimeProvider = config.realtime.enabled
? await resolveRealtimeProvider({
config,
fullConfig: (fullConfig ?? (coreConfig as OpenClawConfig)) as OpenClawConfig,
})
: null;
const webhookServer = new VoiceCallWebhookServer(
config,
manager,
@ -173,6 +236,19 @@ export async function createVoiceCallRuntime(params: {
coreConfig,
agentRuntime,
);
if (realtimeProvider) {
const { RealtimeCallHandler } = await import("./webhook/realtime-handler.js");
webhookServer.setRealtimeHandler(
new RealtimeCallHandler(
config.realtime,
manager,
provider,
realtimeProvider.provider,
realtimeProvider.providerConfig,
config.serve.path,
),
);
}
const lifecycle = createRuntimeResourceLifecycle({ config, webhookServer });
const localUrl = await webhookServer.start();
@ -212,6 +288,9 @@ export async function createVoiceCallRuntime(params: {
if (publicUrl && provider.name === "twilio") {
(provider as TwilioProvider).setPublicUrl(publicUrl);
}
if (publicUrl && realtimeProvider) {
webhookServer.getRealtimeHandler()?.setPublicUrl(publicUrl);
}
if (provider.name === "twilio" && config.streaming?.enabled) {
const twilioProvider = provider as TwilioProvider;
@ -243,6 +322,10 @@ export async function createVoiceCallRuntime(params: {
}
}
if (realtimeProvider) {
log.info(`[voice-call] Realtime voice provider: ${realtimeProvider.provider.id}`);
}
await manager.initialize(provider, webhookUrl);
const stop = async () => await lifecycle.stop();

View File

@ -30,16 +30,26 @@ export function createVoiceCallBaseConfig(params?: {
},
streaming: {
enabled: false,
sttProvider: "openai-realtime",
sttModel: "gpt-4o-transcribe",
silenceDurationMs: 800,
vadThreshold: 0.5,
provider: "openai",
providers: {
openai: {
model: "gpt-4o-transcribe",
silenceDurationMs: 800,
vadThreshold: 0.5,
},
},
streamPath: "/voice/stream",
preStartTimeoutMs: 5000,
maxPendingConnections: 32,
maxPendingConnectionsPerIp: 4,
maxConnections: 128,
},
realtime: {
enabled: false,
streamPath: "/voice/stream/realtime",
tools: [],
providers: {},
},
skipSignatureVerification: false,
stt: { provider: "openai", model: "whisper-1" },
tts: {

View File

@ -1,10 +1,36 @@
import { request } from "node:http";
import type { RealtimeTranscriptionProviderPlugin } from "openclaw/plugin-sdk/realtime-transcription";
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
import { VoiceCallConfigSchema, type VoiceCallConfig } from "./config.js";
import type { CallManager } from "./manager.js";
import type { VoiceCallProvider } from "./providers/base.js";
import type { CallRecord, NormalizedEvent } from "./types.js";
import { VoiceCallWebhookServer } from "./webhook.js";
import type { RealtimeCallHandler } from "./webhook/realtime-handler.js";
const mocks = vi.hoisted(() => {
const realtimeTranscriptionProvider: RealtimeTranscriptionProviderPlugin = {
id: "openai",
label: "OpenAI",
aliases: ["openai-realtime"],
isConfigured: () => true,
resolveConfig: ({ rawConfig }) => rawConfig,
createSession: () => ({
connect: async () => {},
sendAudio: () => {},
close: () => {},
isConnected: () => true,
}),
};
return {
getRealtimeTranscriptionProvider: vi.fn(() => realtimeTranscriptionProvider),
};
});
vi.mock("./realtime-transcription.runtime.js", () => ({
getRealtimeTranscriptionProvider: mocks.getRealtimeTranscriptionProvider,
}));
const provider: VoiceCallProvider = {
name: "mock",
@ -291,6 +317,56 @@ describe("VoiceCallWebhookServer replay handling", () => {
}
});
it("returns realtime TwiML for replayed inbound twilio webhooks", async () => {
const parseWebhookEvent = vi.fn(() => ({ events: [], statusCode: 200 }));
const twilioProvider: VoiceCallProvider = {
...provider,
name: "twilio",
verifyWebhook: () => ({ ok: true, isReplay: true, verifiedRequestKey: "twilio:req:replay" }),
parseWebhookEvent,
};
const { manager, processEvent } = createManager([]);
const config = createConfig({
provider: "twilio",
inboundPolicy: "allowlist",
realtime: {
enabled: true,
streamPath: "/voice/stream/realtime",
tools: [],
providers: {},
},
});
const server = new VoiceCallWebhookServer(config, manager, twilioProvider);
server.setRealtimeHandler({
buildTwiMLPayload: () => ({
statusCode: 200,
headers: { "Content-Type": "text/xml" },
body: '<Response><Connect><Stream url="wss://example.test/voice/stream/realtime/token" /></Connect></Response>',
}),
getStreamPathPattern: () => "/voice/stream/realtime",
handleWebSocketUpgrade: () => {},
registerToolHandler: () => {},
setPublicUrl: () => {},
} as unknown as RealtimeCallHandler);
try {
const baseUrl = await server.start();
const response = await postWebhookFormWithHeaders(
server,
baseUrl,
"CallSid=CA123&Direction=inbound&CallStatus=ringing",
{ "x-twilio-signature": "sig" },
);
expect(response.status).toBe(200);
expect(await response.text()).toContain("<Connect><Stream");
expect(parseWebhookEvent).not.toHaveBeenCalled();
expect(processEvent).not.toHaveBeenCalled();
} finally {
await server.stop();
}
});
it("passes verified request key from verifyWebhook into parseWebhookEvent", async () => {
const parseWebhookEvent = vi.fn((_ctx: unknown, options?: { verifiedRequestKey?: string }) => ({
events: [
@ -625,6 +701,7 @@ describe("VoiceCallWebhookServer stream disconnect grace", () => {
manager,
twilioProvider as unknown as VoiceCallProvider,
);
await server.start();
const mediaHandler = server.getMediaStreamHandler() as unknown as {
config: {
@ -717,6 +794,7 @@ describe("VoiceCallWebhookServer barge-in suppression during initial message", (
manager,
createTwilioProvider(clearTtsQueue) as unknown as VoiceCallProvider,
);
await server.start();
const handleInboundResponse = vi.fn(async () => {});
(
server as unknown as {
@ -790,6 +868,7 @@ describe("VoiceCallWebhookServer barge-in suppression during initial message", (
manager,
createTwilioProvider(clearTtsQueue) as unknown as VoiceCallProvider,
);
await server.start();
try {
const media = getMediaCallbacks(server);

View File

@ -1,5 +1,6 @@
import http from "node:http";
import { URL } from "node:url";
import type { OpenClawConfig } from "openclaw/plugin-sdk/core";
import {
createWebhookInFlightLimiter,
WEBHOOK_BODY_READ_DEFAULTS,
@ -16,9 +17,10 @@ import type { CallManager } from "./manager.js";
import type { MediaStreamConfig } from "./media-stream.js";
import { MediaStreamHandler } from "./media-stream.js";
import type { VoiceCallProvider } from "./providers/base.js";
import { OpenAIRealtimeSTTProvider } from "./providers/stt-openai-realtime.js";
import { isProviderStatusTerminal } from "./providers/shared/call-status.js";
import type { TwilioProvider } from "./providers/twilio.js";
import type { CallRecord, NormalizedEvent, WebhookContext } from "./types.js";
import type { RealtimeCallHandler } from "./webhook/realtime-handler.js";
import { startStaleCallReaper } from "./webhook/stale-call-reaper.js";
const MAX_WEBHOOK_BODY_BYTES = WEBHOOK_BODY_READ_DEFAULTS.preAuth.maxBytes;
@ -44,7 +46,7 @@ function sanitizeTranscriptForLog(value: string): string {
return `${sanitized.slice(0, TRANSCRIPT_LOG_MAX_CHARS)}...`;
}
type WebhookResponsePayload = {
export type WebhookResponsePayload = {
statusCode: number;
body: string;
headers?: Record<string, string>;
@ -89,6 +91,8 @@ export class VoiceCallWebhookServer {
private mediaStreamHandler: MediaStreamHandler | null = null;
/** Delayed auto-hangup timers keyed by provider call ID after stream disconnect. */
private pendingDisconnectHangups = new Map<string, ReturnType<typeof setTimeout>>();
/** Realtime voice handler for duplex provider bridges. */
private realtimeHandler: RealtimeCallHandler | null = null;
constructor(
config: VoiceCallConfig,
@ -102,11 +106,6 @@ export class VoiceCallWebhookServer {
this.provider = provider;
this.coreConfig = coreConfig ?? null;
this.agentRuntime = agentRuntime ?? null;
// Initialize media stream handler if streaming is enabled
if (this.config.streaming.enabled) {
this.initializeMediaStreaming();
}
}
/**
@ -116,6 +115,14 @@ export class VoiceCallWebhookServer {
return this.mediaStreamHandler;
}
getRealtimeHandler(): RealtimeCallHandler | null {
return this.realtimeHandler;
}
setRealtimeHandler(handler: RealtimeCallHandler): void {
this.realtimeHandler = handler;
}
private clearPendingDisconnectHangup(providerCallId: string): void {
const existing = this.pendingDisconnectHangups.get(providerCallId);
if (!existing) {
@ -147,26 +154,50 @@ export class VoiceCallWebhookServer {
}
/**
* Initialize media streaming with OpenAI Realtime STT.
* Initialize media streaming with the selected realtime transcription provider.
*/
private initializeMediaStreaming(): void {
private async initializeMediaStreaming(): Promise<void> {
const streaming = this.config.streaming;
const apiKey = streaming.openaiApiKey ?? process.env.OPENAI_API_KEY;
if (!apiKey) {
console.warn("[voice-call] Streaming enabled but no OpenAI API key found");
const selectedProviderId = streaming.provider;
const pluginConfig = this.coreConfig as unknown as OpenClawConfig | undefined;
const { getRealtimeTranscriptionProvider } =
await import("./realtime-transcription.runtime.js");
const provider = getRealtimeTranscriptionProvider(selectedProviderId, pluginConfig);
if (!provider) {
console.warn(
`[voice-call] Streaming enabled but realtime transcription provider "${selectedProviderId}" is not registered`,
);
return;
}
const selectedProviderConfig =
streaming.providers[selectedProviderId] &&
typeof streaming.providers[selectedProviderId] === "object"
? (streaming.providers[selectedProviderId] as Record<string, unknown>)
: undefined;
const canonicalProviderConfig =
streaming.providers[provider.id] && typeof streaming.providers[provider.id] === "object"
? (streaming.providers[provider.id] as Record<string, unknown>)
: undefined;
const rawProviderConfig = {
...(canonicalProviderConfig ?? {}),
...(selectedProviderConfig ?? {}),
};
const providerConfig = provider.resolveConfig
? provider.resolveConfig({
cfg: pluginConfig ?? ({} as OpenClawConfig),
rawConfig: rawProviderConfig,
})
: rawProviderConfig;
if (!provider.isConfigured({ cfg: pluginConfig, providerConfig })) {
console.warn(
`[voice-call] Streaming enabled but provider "${provider.id}" is not configured`,
);
return;
}
const sttProvider = new OpenAIRealtimeSTTProvider({
apiKey,
model: streaming.sttModel,
silenceDurationMs: streaming.silenceDurationMs,
vadThreshold: streaming.vadThreshold,
});
const streamConfig: MediaStreamConfig = {
sttProvider,
transcriptionProvider: provider,
providerConfig,
preStartTimeoutMs: streaming.preStartTimeoutMs,
maxPendingConnections: streaming.maxPendingConnections,
maxPendingConnectionsPerIp: streaming.maxPendingConnectionsPerIp,
@ -309,6 +340,10 @@ export class VoiceCallWebhookServer {
return this.listeningUrl ?? this.resolveListeningUrl(bind, webhookPath);
}
if (this.config.streaming.enabled && !this.mediaStreamHandler) {
await this.initializeMediaStreaming();
}
return new Promise((resolve, reject) => {
this.server = http.createServer((req, res) => {
this.handleRequest(req, res, webhookPath).catch((err) => {
@ -318,12 +353,15 @@ export class VoiceCallWebhookServer {
});
});
// Handle WebSocket upgrades for media streams
if (this.mediaStreamHandler) {
// Handle WebSocket upgrades for realtime voice and media streams.
if (this.realtimeHandler || this.mediaStreamHandler) {
this.server.on("upgrade", (request, socket, head) => {
if (this.realtimeHandler && this.isRealtimeWebSocketUpgrade(request)) {
this.realtimeHandler.handleWebSocketUpgrade(request, socket, head);
return;
}
const path = this.getUpgradePathname(request);
if (path === streamPath) {
console.log("[voice-call] WebSocket upgrade for media stream");
if (path === streamPath && this.mediaStreamHandler) {
this.mediaStreamHandler?.handleUpgrade(request, socket, head);
} else {
socket.destroy();
@ -504,6 +542,10 @@ export class VoiceCallWebhookServer {
return { statusCode: 401, body: "Unauthorized" };
}
if (this.shouldShortCircuitToRealtimeTwiml(ctx)) {
return this.realtimeHandler!.buildTwiMLPayload(req, new URLSearchParams(ctx.rawBody));
}
const parsed = this.provider.parseWebhookEvent(ctx, {
verifiedRequestKey: verification.verifiedRequestKey,
});
@ -555,6 +597,42 @@ export class VoiceCallWebhookServer {
}
}
/**
 * True when an HTTP upgrade request targets the realtime stream path.
 * Uses a prefix match because the path carries a trailing one-time token
 * segment. Malformed URLs are treated as non-realtime rather than throwing.
 */
private isRealtimeWebSocketUpgrade(req: http.IncomingMessage): boolean {
  try {
    const pathname = buildRequestUrl(req.url, req.headers.host).pathname;
    const pattern = this.realtimeHandler?.getStreamPathPattern();
    return Boolean(pattern && pathname.startsWith(pattern));
  } catch {
    return false;
  }
}
/**
 * Decides whether an inbound Twilio webhook should be answered directly with
 * realtime-stream TwiML instead of going through normal event parsing.
 * Never short-circuits for: non-Twilio providers, outbound call legs,
 * status callbacks, terminal call states, or speech/DTMF result posts.
 */
private shouldShortCircuitToRealtimeTwiml(ctx: WebhookContext): boolean {
  if (!this.realtimeHandler || this.provider.name !== "twilio") {
    return false;
  }
  const params = new URLSearchParams(ctx.rawBody);
  const direction = params.get("Direction");
  // A missing Direction field is treated as inbound — TODO confirm Twilio always sends it.
  const isInbound = !direction || direction === "inbound";
  if (!isInbound) {
    return false;
  }
  // Status callbacks are tagged via the webhook's own query string.
  if (ctx.query?.type === "status") {
    return false;
  }
  const callStatus = params.get("CallStatus");
  if (callStatus && isProviderStatusTerminal(callStatus)) {
    return false;
  }
  // Replays must return the same TwiML body so Twilio retries reconnect cleanly.
  // The one-time token still changes, but the behavior stays identical.
  return !params.get("SpeechResult") && !params.get("Digits");
}
private processParsedEvents(events: NormalizedEvent[]): void {
for (const event of events) {
try {

View File

@ -0,0 +1,92 @@
import http from "node:http";
import type {
RealtimeVoiceBridge,
RealtimeVoiceProviderPlugin,
} from "openclaw/plugin-sdk/realtime-voice";
import { describe, expect, it, vi } from "vitest";
import type { VoiceCallRealtimeConfig } from "../config.js";
import type { CallManager } from "../manager.js";
import type { VoiceCallProvider } from "../providers/base.js";
import { RealtimeCallHandler } from "./realtime-handler.js";
/**
 * Builds a minimal POST IncomingMessage for routing tests.
 * An empty `host` argument leaves the host header out entirely.
 */
function makeRequest(url: string, host = "gateway.ts.net"): http.IncomingMessage {
  const request = new http.IncomingMessage(null as never);
  request.method = "POST";
  request.url = url;
  if (host) {
    request.headers = { host };
  } else {
    request.headers = {};
  }
  return request;
}
/**
 * Inert RealtimeVoiceBridge stub: every operation is a no-op and the
 * bridge always reports itself as connected.
 */
function makeBridge(): RealtimeVoiceBridge {
  const ignore = (): void => {};
  const bridge: RealtimeVoiceBridge = {
    connect: () => Promise.resolve(),
    sendAudio: ignore,
    setMediaTimestamp: ignore,
    submitToolResult: ignore,
    acknowledgeMark: ignore,
    close: ignore,
    isConnected: () => true,
    triggerGreeting: ignore,
  };
  return bridge;
}
// Minimal realtime-voice provider stub for tests: always reports configured
// and hands back the inert no-op bridge.
const realtimeProvider: RealtimeVoiceProviderPlugin = {
  id: "openai",
  label: "OpenAI",
  isConfigured: () => true,
  createBridge: () => makeBridge(),
};
// Constructs a RealtimeCallHandler wired to vitest mocks. `overrides` tweaks
// the realtime config per test (e.g. a custom streamPath).
function makeHandler(overrides?: Partial<VoiceCallRealtimeConfig>) {
  return new RealtimeCallHandler(
    {
      enabled: true,
      streamPath: "/voice/stream/realtime",
      instructions: "Be helpful.",
      tools: [],
      providers: {},
      ...overrides,
    },
    // CallManager stub: only the members the handler touches are mocked.
    {
      processEvent: vi.fn(),
      getCallByProviderCallId: vi.fn(),
    } as unknown as CallManager,
    // Telephony provider stub shaped like the Twilio provider surface.
    {
      name: "twilio",
      verifyWebhook: vi.fn(),
      parseWebhookEvent: vi.fn(),
      initiateCall: vi.fn(),
      hangupCall: vi.fn(),
      playTts: vi.fn(),
      startListening: vi.fn(),
      stopListening: vi.fn(),
      getCallStatus: vi.fn(),
    } as unknown as VoiceCallProvider,
    realtimeProvider,
    { apiKey: "test-key" },
    "/voice/webhook",
  );
}
describe("RealtimeCallHandler path routing", () => {
  it("uses the request host and stream path in TwiML", () => {
    const handler = makeHandler();
    const payload = handler.buildTwiMLPayload(makeRequest("/voice/webhook", "gateway.ts.net"));
    expect(payload.statusCode).toBe(200);
    // Stream URL should target the request's host and end in a UUID token.
    expect(payload.body).toMatch(
      /wss:\/\/gateway\.ts\.net\/voice\/stream\/realtime\/[0-9a-f-]{36}/,
    );
  });
  it("preserves a public path prefix ahead of serve.path", () => {
    const handler = makeHandler({ streamPath: "/custom/stream/realtime" });
    handler.setPublicUrl("https://public.example/api/voice/webhook");
    const payload = handler.buildTwiMLPayload(makeRequest("/voice/webhook", "127.0.0.1:3334"));
    // The "/api" prefix derived from the public URL must survive in the stream path.
    expect(handler.getStreamPathPattern()).toBe("/api/custom/stream/realtime");
    expect(payload.body).toMatch(
      /wss:\/\/public\.example\/api\/custom\/stream\/realtime\/[0-9a-f-]{36}/,
    );
  });
});

View File

@ -0,0 +1,413 @@
import { randomUUID } from "node:crypto";
import http from "node:http";
import type { Duplex } from "node:stream";
import type {
RealtimeVoiceBridge,
RealtimeVoiceProviderConfig,
RealtimeVoiceProviderPlugin,
} from "openclaw/plugin-sdk/realtime-voice";
import WebSocket, { WebSocketServer } from "ws";
import type { VoiceCallRealtimeConfig } from "../config.js";
import type { CallManager } from "../manager.js";
import type { VoiceCallProvider } from "../providers/base.js";
import type { CallRecord, NormalizedEvent } from "../types.js";
import type { WebhookResponsePayload } from "../webhook.js";
// Handler invoked when the realtime model calls a named tool during a call.
export type ToolHandlerFn = (args: unknown, callId: string) => Promise<unknown>;
// One-time stream tokens must be redeemed within 30s of the TwiML response.
const STREAM_TOKEN_TTL_MS = 30_000;
// Fallback host for generated stream URLs when no public/request host is known.
const DEFAULT_HOST = "localhost:8443";
/**
 * Normalizes a URL path for comparison: trims whitespace, guarantees a
 * leading slash, and drops a single trailing slash (except for the root).
 */
function normalizePath(pathname: string): string {
  let path = pathname.trim();
  if (path === "") {
    return "/";
  }
  if (!path.startsWith("/")) {
    path = `/${path}`;
  }
  if (path !== "/" && path.endsWith("/")) {
    path = path.slice(0, -1);
  }
  return path;
}
/**
 * Appends a spoken-greeting directive to the base instructions.
 * When no non-empty greeting is supplied, the base instructions pass
 * through unchanged (possibly undefined).
 */
function buildGreetingInstructions(
  baseInstructions: string | undefined,
  greeting: string | undefined,
): string | undefined {
  const spoken = greeting?.trim();
  if (!spoken) {
    return baseInstructions;
  }
  const intro =
    "Start the call by greeting the caller naturally. Include this greeting in your first spoken reply:";
  const directive = `${intro} "${spoken}"`;
  if (baseInstructions) {
    return `${baseInstructions}\n\n${directive}`;
  }
  return directive;
}
// Caller metadata attached to a one-time stream token; `expiry` is an
// epoch-ms deadline after which the token is rejected.
type PendingStreamToken = {
  expiry: number;
  from?: string;
  to?: string;
  direction?: "inbound" | "outbound";
};
// Result of registering a realtime call with the CallManager.
type CallRegistration = {
  callId: string;
  // Combined base + greeting instructions for the bridge's first utterance.
  initialGreetingInstructions?: string;
};
/**
 * Terminates a telephony media-stream WebSocket (Twilio-style "start"/
 * "media"/"mark"/"stop" frames) and bridges it to a pluggable realtime voice
 * provider, while reporting call lifecycle, transcript, and tool events into
 * the CallManager.
 */
export class RealtimeCallHandler {
  // Named tool handlers the realtime model may invoke during a call.
  private readonly toolHandlers = new Map<string, ToolHandlerFn>();
  // One-time tokens minted into TwiML stream URLs, redeemed on WS upgrade.
  private readonly pendingStreamTokens = new Map<string, PendingStreamToken>();
  // Host portion of the configured public URL; null until setPublicUrl succeeds.
  private publicOrigin: string | null = null;
  // Path prefix ahead of servePath on the public URL (e.g. "/api").
  private publicPathPrefix = "";
  constructor(
    private readonly config: VoiceCallRealtimeConfig,
    private readonly manager: CallManager,
    private readonly provider: VoiceCallProvider,
    private readonly realtimeProvider: RealtimeVoiceProviderPlugin,
    private readonly providerConfig: RealtimeVoiceProviderConfig,
    private readonly servePath: string,
  ) {}
  /**
   * Captures the externally visible webhook URL: records its host and any
   * path prefix sitting in front of servePath so generated stream URLs
   * resolve publicly. Invalid URLs reset both fields to their defaults.
   */
  setPublicUrl(url: string): void {
    try {
      const parsed = new URL(url);
      this.publicOrigin = parsed.host;
      const normalizedServePath = normalizePath(this.servePath);
      const normalizedPublicPath = normalizePath(parsed.pathname);
      const idx = normalizedPublicPath.indexOf(normalizedServePath);
      // idx > 0 means servePath is nested under a public prefix; keep the prefix.
      this.publicPathPrefix = idx > 0 ? normalizedPublicPath.slice(0, idx) : "";
    } catch {
      this.publicOrigin = null;
      this.publicPathPrefix = "";
    }
  }
  /** Public-prefixed WebSocket path that realtime stream upgrades must match. */
  getStreamPathPattern(): string {
    return `${this.publicPathPrefix}${normalizePath(this.config.streamPath ?? "/voice/stream/realtime")}`;
  }
  /**
   * Builds the TwiML webhook response that connects the call to our stream
   * WebSocket. A fresh one-time token is embedded in the URL on every call,
   * so webhook replays yield behaviorally identical TwiML with a new token.
   */
  buildTwiMLPayload(req: http.IncomingMessage, params?: URLSearchParams): WebhookResponsePayload {
    const host = this.publicOrigin || req.headers.host || DEFAULT_HOST;
    const rawDirection = params?.get("Direction");
    const token = this.issueStreamToken({
      from: params?.get("From") ?? undefined,
      to: params?.get("To") ?? undefined,
      // NOTE(review): any Direction other than "outbound-api" maps to inbound —
      // confirm other outbound variants cannot reach this path.
      direction: rawDirection === "outbound-api" ? "outbound" : "inbound",
    });
    const wsUrl = `wss://${host}${this.getStreamPathPattern()}/${token}`;
    const twiml = `<?xml version="1.0" encoding="UTF-8"?>
<Response>
  <Connect>
    <Stream url="${wsUrl}" />
  </Connect>
</Response>`;
    return {
      statusCode: 200,
      headers: { "Content-Type": "text/xml" },
      body: twiml,
    };
  }
  /**
   * Accepts a WebSocket upgrade for a realtime stream. The trailing path
   * segment must be an unexpired one-time token; otherwise the socket is
   * rejected with 401. The provider bridge is created lazily on the first
   * "start" frame, which carries the stream/call SIDs.
   */
  handleWebSocketUpgrade(request: http.IncomingMessage, socket: Duplex, head: Buffer): void {
    const url = new URL(request.url ?? "/", "wss://localhost");
    const token = url.pathname.split("/").pop() ?? null;
    const callerMeta = token ? this.consumeStreamToken(token) : null;
    if (!callerMeta) {
      socket.write("HTTP/1.1 401 Unauthorized\r\n\r\n");
      socket.destroy();
      return;
    }
    const wss = new WebSocketServer({ noServer: true });
    wss.handleUpgrade(request, socket, head, (ws) => {
      let bridge: RealtimeVoiceBridge | null = null;
      let initialized = false;
      ws.on("message", (data: Buffer) => {
        try {
          const msg = JSON.parse(data.toString()) as Record<string, unknown>;
          if (!initialized && msg.event === "start") {
            initialized = true;
            const startData =
              typeof msg.start === "object" && msg.start !== null
                ? (msg.start as Record<string, unknown>)
                : undefined;
            // Missing SIDs degrade to "unknown" rather than dropping the call.
            const streamSid =
              typeof startData?.streamSid === "string" ? startData.streamSid : "unknown";
            const callSid = typeof startData?.callSid === "string" ? startData.callSid : "unknown";
            bridge = this.handleCall(streamSid, callSid, ws, callerMeta);
            return;
          }
          if (!bridge) {
            // Ignore frames until "start" has established the bridge.
            return;
          }
          const mediaData =
            typeof msg.media === "object" && msg.media !== null
              ? (msg.media as Record<string, unknown>)
              : undefined;
          if (msg.event === "media" && typeof mediaData?.payload === "string") {
            // Inbound audio arrives base64-encoded; the timestamp feeds
            // playback-position tracking on the bridge.
            bridge.sendAudio(Buffer.from(mediaData.payload, "base64"));
            if (typeof mediaData.timestamp === "number") {
              bridge.setMediaTimestamp(mediaData.timestamp);
            } else if (typeof mediaData.timestamp === "string") {
              bridge.setMediaTimestamp(Number.parseInt(mediaData.timestamp, 10));
            }
            return;
          }
          if (msg.event === "mark") {
            bridge.acknowledgeMark();
            return;
          }
          if (msg.event === "stop") {
            bridge.close();
          }
        } catch (error) {
          console.error("[voice-call] realtime WS parse failed:", error);
        }
      });
      ws.on("close", () => {
        bridge?.close();
      });
    });
  }
  /** Registers (or replaces) the async handler for a named realtime tool. */
  registerToolHandler(name: string, fn: ToolHandlerFn): void {
    this.toolHandlers.set(name, fn);
  }
  /**
   * Mints a one-time stream token carrying caller metadata, and sweeps any
   * expired tokens as a side effect so the map cannot grow unbounded.
   */
  private issueStreamToken(meta: Omit<PendingStreamToken, "expiry"> = {}): string {
    const token = randomUUID();
    this.pendingStreamTokens.set(token, { expiry: Date.now() + STREAM_TOKEN_TTL_MS, ...meta });
    for (const [candidate, entry] of this.pendingStreamTokens) {
      if (Date.now() > entry.expiry) {
        this.pendingStreamTokens.delete(candidate);
      }
    }
    return token;
  }
  /**
   * Redeems a stream token exactly once. Returns the caller metadata, or
   * null when the token is unknown or has expired.
   */
  private consumeStreamToken(token: string): Omit<PendingStreamToken, "expiry"> | null {
    const entry = this.pendingStreamTokens.get(token);
    if (!entry) {
      return null;
    }
    // Delete before the expiry check so even expired tokens are single-use.
    this.pendingStreamTokens.delete(token);
    if (Date.now() > entry.expiry) {
      return null;
    }
    return {
      from: entry.from,
      to: entry.to,
      direction: entry.direction,
    };
  }
  /**
   * Registers the call with the manager, then creates and connects the
   * provider bridge that shuttles audio, transcripts, and tool calls between
   * the telephony WebSocket and the realtime model. Returns null (closing
   * the socket with 1008) when the caller is rejected by policy.
   */
  private handleCall(
    streamSid: string,
    callSid: string,
    ws: WebSocket,
    callerMeta: Omit<PendingStreamToken, "expiry">,
  ): RealtimeVoiceBridge | null {
    const registration = this.registerCallInManager(callSid, callerMeta);
    if (!registration) {
      ws.close(1008, "Caller rejected by policy");
      return null;
    }
    const { callId, initialGreetingInstructions } = registration;
    let bridge: RealtimeVoiceBridge | null = null;
    let callEndEmitted = false;
    // Guard so "call.ended" is emitted at most once per call.
    const emitCallEnd = (reason: "completed" | "error") => {
      if (callEndEmitted) {
        return;
      }
      callEndEmitted = true;
      this.endCallInManager(callSid, callId, reason);
    };
    bridge = this.realtimeProvider.createBridge({
      providerConfig: this.providerConfig,
      instructions: this.config.instructions,
      tools: this.config.tools,
      // Model audio out -> telephony "media" frames, re-encoded to base64.
      // (Param name suggests mu-law payloads — TODO confirm against providers.)
      onAudio: (muLaw) => {
        if (ws.readyState !== WebSocket.OPEN) {
          return;
        }
        ws.send(
          JSON.stringify({
            event: "media",
            streamSid,
            media: { payload: muLaw.toString("base64") },
          }),
        );
      },
      // Barge-in: tell the telephony side to drop any buffered audio.
      onClearAudio: () => {
        if (ws.readyState !== WebSocket.OPEN) {
          return;
        }
        ws.send(JSON.stringify({ event: "clear", streamSid }));
      },
      // Playback checkpoints the provider wants echoed back as "mark" frames.
      onMark: (markName) => {
        if (ws.readyState !== WebSocket.OPEN) {
          return;
        }
        ws.send(JSON.stringify({ event: "mark", streamSid, mark: { name: markName } }));
      },
      // Only final transcripts are forwarded: user speech becomes
      // "call.speech", assistant speech becomes "call.speaking".
      onTranscript: (role, text, isFinal) => {
        if (!isFinal) {
          return;
        }
        if (role === "user") {
          const event: NormalizedEvent = {
            id: `realtime-speech-${callSid}-${Date.now()}`,
            type: "call.speech",
            callId,
            providerCallId: callSid,
            timestamp: Date.now(),
            transcript: text,
            isFinal: true,
          };
          this.manager.processEvent(event);
          return;
        }
        this.manager.processEvent({
          id: `realtime-bot-${callSid}-${Date.now()}`,
          type: "call.speaking",
          callId,
          providerCallId: callSid,
          timestamp: Date.now(),
          text,
        });
      },
      // Tool invocations run asynchronously; results flow back via the bridge.
      onToolCall: (toolEvent) => {
        if (!bridge) {
          return;
        }
        void this.executeToolCall(
          bridge,
          callId,
          toolEvent.callId || toolEvent.itemId,
          toolEvent.name,
          toolEvent.args,
        );
      },
      onReady: () => {
        bridge?.triggerGreeting?.(initialGreetingInstructions);
      },
      onError: (error) => {
        console.error("[voice-call] realtime voice error:", error.message);
      },
      // Only error closes force cleanup here; non-error teardown is expected
      // to be driven by the telephony side ("stop" frame / socket close).
      onClose: (reason) => {
        if (reason !== "error") {
          return;
        }
        emitCallEnd("error");
        if (ws.readyState === WebSocket.OPEN) {
          ws.close(1011, "Bridge disconnected");
        }
        // Best-effort hangup at the telephony provider; failures only warn.
        void this.provider
          .hangupCall({ callId, providerCallId: callSid, reason: "error" })
          .catch((error: unknown) => {
            console.warn(
              `[voice-call] Failed to hang up realtime call ${callSid}: ${
                error instanceof Error ? error.message : String(error)
              }`,
            );
          });
      },
    });
    bridge.connect().catch((error: Error) => {
      console.error("[voice-call] Failed to connect realtime bridge:", error);
      bridge?.close();
      emitCallEnd("error");
      ws.close(1011, "Failed to connect");
    });
    return bridge;
  }
  /**
   * Emits synthetic "call.initiated" and "call.answered" events so the
   * manager tracks the call, then returns the manager-assigned callId plus
   * combined greeting instructions. Returns null when the manager created no
   * record (caller rejected by policy).
   */
  private registerCallInManager(
    callSid: string,
    callerMeta: Omit<PendingStreamToken, "expiry"> = {},
  ): CallRegistration | null {
    const timestamp = Date.now();
    const baseFields = {
      providerCallId: callSid,
      timestamp,
      direction: (callerMeta.direction ?? "inbound") as "inbound" | "outbound",
      ...(callerMeta.from ? { from: callerMeta.from } : {}),
      ...(callerMeta.to ? { to: callerMeta.to } : {}),
    };
    this.manager.processEvent({
      id: `realtime-initiated-${callSid}`,
      callId: callSid,
      type: "call.initiated",
      ...baseFields,
    });
    // No record after "initiated" means the manager rejected the call.
    const callRecord = this.manager.getCallByProviderCallId(callSid);
    if (!callRecord) {
      return null;
    }
    const initialGreeting = this.extractInitialGreeting(callRecord);
    if (callRecord.metadata) {
      // Consume the one-shot greeting so later code paths cannot replay it.
      delete callRecord.metadata.initialMessage;
    }
    this.manager.processEvent({
      id: `realtime-answered-${callSid}`,
      callId: callSid,
      type: "call.answered",
      ...baseFields,
    });
    return {
      callId: callRecord.callId,
      initialGreetingInstructions: buildGreetingInstructions(
        this.config.instructions,
        initialGreeting,
      ),
    };
  }
  /** Reads the optional one-shot greeting stashed on the call record's metadata. */
  private extractInitialGreeting(call: CallRecord): string | undefined {
    return typeof call.metadata?.initialMessage === "string"
      ? call.metadata.initialMessage
      : undefined;
  }
  /** Emits the terminal "call.ended" event for this call into the manager. */
  private endCallInManager(callSid: string, callId: string, reason: "completed" | "error"): void {
    this.manager.processEvent({
      id: `realtime-ended-${callSid}-${Date.now()}`,
      type: "call.ended",
      callId,
      providerCallId: callSid,
      timestamp: Date.now(),
      reason,
    });
  }
  /**
   * Runs a registered tool handler and submits its result (or an error
   * payload) back to the bridge. Unknown tools and handler failures are
   * reported to the model as error results rather than thrown.
   */
  private async executeToolCall(
    bridge: RealtimeVoiceBridge,
    callId: string,
    bridgeCallId: string,
    name: string,
    args: unknown,
  ): Promise<void> {
    const handler = this.toolHandlers.get(name);
    const result = !handler
      ? { error: `Tool "${name}" not available` }
      : await handler(args, callId).catch((error: unknown) => ({
          error: error instanceof Error ? error.message : String(error),
        }));
    bridge.submitToolResult(bridgeCallId, result);
  }
}

View File

@ -0,0 +1 @@
export { zaiMediaUnderstandingProvider } from "./media-understanding-provider.js";

View File

@ -551,6 +551,14 @@
"types": "./dist/plugin-sdk/reply-history.d.ts",
"default": "./dist/plugin-sdk/reply-history.js"
},
"./plugin-sdk/realtime-voice": {
"types": "./dist/plugin-sdk/realtime-voice.d.ts",
"default": "./dist/plugin-sdk/realtime-voice.js"
},
"./plugin-sdk/realtime-transcription": {
"types": "./dist/plugin-sdk/realtime-transcription.d.ts",
"default": "./dist/plugin-sdk/realtime-transcription.js"
},
"./plugin-sdk/media-understanding": {
"types": "./dist/plugin-sdk/media-understanding.d.ts",
"default": "./dist/plugin-sdk/media-understanding.js"

View File

@ -127,6 +127,8 @@
"kimi-coding",
"kilocode",
"reply-history",
"realtime-transcription",
"realtime-voice",
"media-understanding",
"request-url",
"runtime-store",

View File

@ -1,7 +1,8 @@
import { spawnSync } from "node:child_process";
import { mkdirSync, readdirSync, readFileSync, writeFileSync } from "node:fs";
import path from "node:path";
import { fileURLToPath } from "node:url";
import { renderRootHelpText } from "../src/cli/program/root-help.ts";
import { fileURLToPath, pathToFileURL } from "node:url";
import { renderRootHelpText as renderSourceRootHelpText } from "../src/cli/program/root-help.ts";
function dedupe(values: string[]): string[] {
const seen = new Set<string>();
@ -82,7 +83,37 @@ export function readBundledChannelCatalogIds(
export async function renderBundledRootHelpText(
_distDirOverride: string = distDir,
): Promise<string> {
return await renderRootHelpText({ pluginDescriptors: [] });
const bundleName = readdirSync(distDirOverride).find(
(entry) => entry.startsWith("root-help-") && entry.endsWith(".js"),
);
if (!bundleName) {
throw new Error("No root-help bundle found in dist; cannot write CLI startup metadata.");
}
const moduleUrl = pathToFileURL(path.join(distDirOverride, bundleName)).href;
const inlineModule = [
`const mod = await import(${JSON.stringify(moduleUrl)});`,
"if (typeof mod.outputRootHelp !== 'function') {",
` throw new Error(${JSON.stringify(`Bundle ${bundleName} does not export outputRootHelp.`)});`,
"}",
"await mod.outputRootHelp();",
"process.exit(0);",
].join("\n");
const result = spawnSync(process.execPath, ["--input-type=module", "--eval", inlineModule], {
cwd: distDirOverride,
encoding: "utf8",
timeout: 30_000,
});
if (result.error) {
throw result.error;
}
if (result.status !== 0) {
const stderr = result.stderr?.trim();
throw new Error(
`Failed to render bundled root help from ${bundleName}` +
(stderr ? `: ${stderr}` : result.signal ? `: terminated by ${result.signal}` : ""),
);
}
return result.stdout ?? "";
}
export async function writeCliStartupMetadata(options?: {
@ -95,7 +126,13 @@ export async function writeCliStartupMetadata(options?: {
const resolvedExtensionsDir = options?.extensionsDir ?? extensionsDir;
const catalog = readBundledChannelCatalogIds(resolvedExtensionsDir);
const channelOptions = dedupe([...CORE_CHANNEL_ORDER, ...catalog]);
const rootHelpText = await renderBundledRootHelpText(resolvedDistDir);
const useSourceRootHelp =
resolvedDistDir === distDir &&
resolvedOutputPath === outputPath &&
resolvedExtensionsDir === extensionsDir;
const rootHelpText = useSourceRootHelp
? await renderSourceRootHelpText({ pluginSdkResolution: "src" })
: await renderBundledRootHelpText(resolvedDistDir);
mkdirSync(resolvedDistDir, { recursive: true });
writeFileSync(
@ -115,4 +152,5 @@ export async function writeCliStartupMetadata(options?: {
if (process.argv[1] && path.resolve(process.argv[1]) === scriptPath) {
await writeCliStartupMetadata();
process.exit(0);
}

View File

@ -1,16 +1,14 @@
import { Command } from "commander";
import { getPluginCliCommandDescriptors } from "../../plugins/cli.js";
import type { OpenClawPluginCliCommandDescriptor } from "../../plugins/types.js";
import type { PluginLoadOptions } from "../../plugins/loader.js";
import { VERSION } from "../../version.js";
import { getCoreCliCommandDescriptors } from "./core-command-descriptors.js";
import { configureProgramHelp } from "./help.js";
import { getSubCliEntries } from "./subcli-descriptors.js";
type RootHelpRenderOptions = {
pluginDescriptors?: OpenClawPluginCliCommandDescriptor[] | null;
};
type RootHelpLoaderOptions = Pick<PluginLoadOptions, "pluginSdkResolution">;
async function buildRootHelpProgram(options?: RootHelpRenderOptions): Promise<Command> {
async function buildRootHelpProgram(loaderOptions?: RootHelpLoaderOptions): Promise<Command> {
const program = new Command();
configureProgramHelp(program, {
programVersion: VERSION,
@ -31,11 +29,7 @@ async function buildRootHelpProgram(options?: RootHelpRenderOptions): Promise<Co
program.command(command.name).description(command.description);
existingCommands.add(command.name);
}
const pluginDescriptors =
options && "pluginDescriptors" in options
? (options.pluginDescriptors ?? [])
: await getPluginCliCommandDescriptors();
for (const command of pluginDescriptors) {
for (const command of await getPluginCliCommandDescriptors(undefined, undefined, loaderOptions)) {
if (existingCommands.has(command.name)) {
continue;
}
@ -46,8 +40,8 @@ async function buildRootHelpProgram(options?: RootHelpRenderOptions): Promise<Co
return program;
}
export async function renderRootHelpText(options?: RootHelpRenderOptions): Promise<string> {
const program = await buildRootHelpProgram(options);
export async function renderRootHelpText(loaderOptions?: RootHelpLoaderOptions): Promise<string> {
const program = await buildRootHelpProgram(loaderOptions);
let output = "";
const originalWrite = process.stdout.write.bind(process.stdout);
const captureWrite: typeof process.stdout.write = ((chunk: string | Uint8Array) => {
@ -63,6 +57,6 @@ export async function renderRootHelpText(options?: RootHelpRenderOptions): Promi
return output;
}
export async function outputRootHelp(options?: RootHelpRenderOptions): Promise<void> {
process.stdout.write(await renderRootHelpText(options));
export async function outputRootHelp(loaderOptions?: RootHelpLoaderOptions): Promise<void> {
process.stdout.write(await renderRootHelpText(loaderOptions));
}

View File

@ -69,6 +69,8 @@ const createRegistry = (diagnostics: PluginDiagnostic[]): PluginRegistry => ({
commands: [],
providers: [],
speechProviders: [],
realtimeTranscriptionProviders: [],
realtimeVoiceProviders: [],
mediaUnderstandingProviders: [],
imageGenerationProviders: [],
webFetchProviders: [],

View File

@ -201,6 +201,8 @@ const createStubPluginRegistry = (): PluginRegistry => ({
}),
},
],
realtimeTranscriptionProviders: [],
realtimeVoiceProviders: [],
mediaUnderstandingProviders: [],
imageGenerationProviders: [],
webFetchProviders: [],

View File

@ -66,6 +66,7 @@ export type {
ProviderReplaySessionState,
ProviderResolveDynamicModelContext,
ProviderResolvedUsageAuth,
RealtimeTranscriptionProviderPlugin,
ProviderSanitizeReplayHistoryContext,
ProviderToolSchemaDiagnostic,
ProviderResolveUsageAuthContext,

View File

@ -51,6 +51,7 @@ export type {
ProviderAuthContext,
ProviderAuthResult,
ProviderRuntimeModel,
RealtimeTranscriptionProviderPlugin,
SpeechProviderPlugin,
} from "../plugins/types.js";
export type {

View File

@ -46,6 +46,7 @@ import type {
ProviderReplayPolicyContext,
ProviderReplaySessionEntry,
ProviderReplaySessionState,
RealtimeTranscriptionProviderPlugin,
ProviderResolvedUsageAuth,
ProviderResolveDynamicModelContext,
ProviderSanitizeReplayHistoryContext,
@ -102,6 +103,7 @@ export type {
ProviderResolveDynamicModelContext,
ProviderNormalizeResolvedModelContext,
ProviderRuntimeModel,
RealtimeTranscriptionProviderPlugin,
SpeechProviderPlugin,
ProviderThinkingPolicyContext,
ProviderValidateReplayTurnsContext,

View File

@ -0,0 +1,16 @@
export type { RealtimeTranscriptionProviderPlugin } from "../plugins/types.js";
export type {
RealtimeTranscriptionProviderConfig,
RealtimeTranscriptionProviderConfiguredContext,
RealtimeTranscriptionProviderId,
RealtimeTranscriptionProviderResolveConfigContext,
RealtimeTranscriptionSession,
RealtimeTranscriptionSessionCallbacks,
RealtimeTranscriptionSessionCreateRequest,
} from "../realtime-transcription/provider-types.js";
export {
canonicalizeRealtimeTranscriptionProviderId,
getRealtimeTranscriptionProvider,
listRealtimeTranscriptionProviders,
normalizeRealtimeTranscriptionProviderId,
} from "../realtime-transcription/provider-registry.js";

View File

@ -0,0 +1,20 @@
export type { RealtimeVoiceProviderPlugin } from "../plugins/types.js";
export type {
RealtimeVoiceBridge,
RealtimeVoiceBridgeCallbacks,
RealtimeVoiceBridgeCreateRequest,
RealtimeVoiceCloseReason,
RealtimeVoiceProviderConfig,
RealtimeVoiceProviderConfiguredContext,
RealtimeVoiceProviderId,
RealtimeVoiceProviderResolveConfigContext,
RealtimeVoiceRole,
RealtimeVoiceTool,
RealtimeVoiceToolCallEvent,
} from "../realtime-voice/provider-types.js";
export {
canonicalizeRealtimeVoiceProviderId,
getRealtimeVoiceProvider,
listRealtimeVoiceProviders,
normalizeRealtimeVoiceProviderId,
} from "../realtime-voice/provider-registry.js";

View File

@ -1,7 +1,12 @@
import { rmSync } from "node:fs";
import type { OpenClawConfig } from "../config/config.js";
import type { ResolvedTtsConfig } from "../tts/tts.js";
// Public speech helpers for bundled or third-party plugins.
//
// Keep this surface neutral. Provider plugins should not need to know about the
// bundled `speech-core` plugin id just to consume shared speech types/helpers.
// Keep this surface neutral and import-light. Provider builders commonly import
// this module just to get types and a few validation helpers, so avoid pulling
// in the heavy TTS runtime graph at module load time.
export type { SpeechProviderPlugin } from "../plugins/types.js";
export type {
@ -22,14 +27,6 @@ export type {
TtsDirectiveParseResult,
} from "../tts/provider-types.js";
export {
scheduleCleanup,
summarizeText,
normalizeApplyTextNormalization,
normalizeLanguageCode,
normalizeSeed,
requireInRange,
} from "../tts/tts-core.js";
export { parseTtsDirectives } from "../tts/directives.js";
export {
canonicalizeSpeechProviderId,
@ -44,3 +41,71 @@ export {
trimToUndefined,
truncateErrorDetail,
} from "../tts/provider-error-utils.js";
// Default delay before scheduled temp-file cleanup runs.
const TEMP_FILE_CLEANUP_DELAY_MS = 5 * 60 * 1000; // 5 minutes
/**
 * Validates that `value` is a finite number within [min, max] inclusive.
 * @throws Error labelled with `label` when the check fails (including NaN/Infinity).
 */
export function requireInRange(value: number, min: number, max: number, label: string): void {
  const inRange = Number.isFinite(value) && value >= min && value <= max;
  if (!inRange) {
    throw new Error(`${label} must be between ${min} and ${max}`);
  }
}
/**
 * Lower-cases and validates an ISO 639-1 language code.
 * Returns undefined for missing/blank input.
 * @throws Error when the value is not exactly two ASCII letters.
 */
export function normalizeLanguageCode(code?: string): string | undefined {
  const value = code?.trim().toLowerCase();
  if (!value) {
    return undefined;
  }
  if (/^[a-z]{2}$/.test(value)) {
    return value;
  }
  throw new Error("languageCode must be a 2-letter ISO 639-1 code (e.g. en, de, fr)");
}
/**
 * Normalizes the text-normalization mode flag to "auto" | "on" | "off".
 * Returns undefined for missing/blank input.
 * @throws Error for any other value.
 */
export function normalizeApplyTextNormalization(mode?: string): "auto" | "on" | "off" | undefined {
  const value = mode?.trim().toLowerCase();
  if (!value) {
    return undefined;
  }
  switch (value) {
    case "auto":
    case "on":
    case "off":
      return value;
    default:
      throw new Error("applyTextNormalization must be one of: auto, on, off");
  }
}
/**
 * Floors a seed and validates it fits the unsigned 32-bit range.
 * Returns undefined when the seed is null/undefined.
 * @throws Error when the floored value is not in [0, 4294967295].
 */
export function normalizeSeed(seed?: number): number | undefined {
  if (seed == null) {
    return undefined;
  }
  const floored = Math.floor(seed);
  const valid = Number.isFinite(floored) && floored >= 0 && floored <= 4_294_967_295;
  if (!valid) {
    throw new Error("seed must be between 0 and 4294967295");
  }
  return floored;
}
/**
 * Schedules best-effort removal of a temp directory after `delayMs`
 * (default 5 minutes). The timer is unref'd so it never keeps the process
 * alive, and removal errors are deliberately swallowed.
 */
export function scheduleCleanup(
  tempDir: string,
  delayMs: number = TEMP_FILE_CLEANUP_DELAY_MS,
): void {
  setTimeout(() => {
    try {
      rmSync(tempDir, { recursive: true, force: true });
    } catch {
      // Best-effort: leftover temp files are harmless.
    }
  }, delayMs).unref();
}
/**
 * Summarizes text via the runtime `summarizeText` in ../tts/tts-core.js.
 * The heavy TTS runtime module is imported lazily so this SDK surface stays
 * light at module load time.
 */
export async function summarizeText(params: {
  text: string;
  targetLength: number;
  cfg: OpenClawConfig;
  config: ResolvedTtsConfig;
  timeoutMs: number;
}) {
  const runtime = await import("../tts/tts-core.js");
  return runtime.summarizeText(params);
}

View File

@ -28,6 +28,8 @@ export type BuildPluginApiParams = {
| "registerCliBackend"
| "registerProvider"
| "registerSpeechProvider"
| "registerRealtimeTranscriptionProvider"
| "registerRealtimeVoiceProvider"
| "registerMediaUnderstandingProvider"
| "registerImageGenerationProvider"
| "registerWebFetchProvider"
@ -55,6 +57,10 @@ const noopRegisterService: OpenClawPluginApi["registerService"] = () => {};
const noopRegisterCliBackend: OpenClawPluginApi["registerCliBackend"] = () => {};
const noopRegisterProvider: OpenClawPluginApi["registerProvider"] = () => {};
const noopRegisterSpeechProvider: OpenClawPluginApi["registerSpeechProvider"] = () => {};
const noopRegisterRealtimeTranscriptionProvider: OpenClawPluginApi["registerRealtimeTranscriptionProvider"] =
() => {};
const noopRegisterRealtimeVoiceProvider: OpenClawPluginApi["registerRealtimeVoiceProvider"] =
() => {};
const noopRegisterMediaUnderstandingProvider: OpenClawPluginApi["registerMediaUnderstandingProvider"] =
() => {};
const noopRegisterImageGenerationProvider: OpenClawPluginApi["registerImageGenerationProvider"] =
@ -97,6 +103,10 @@ export function buildPluginApi(params: BuildPluginApiParams): OpenClawPluginApi
registerCliBackend: handlers.registerCliBackend ?? noopRegisterCliBackend,
registerProvider: handlers.registerProvider ?? noopRegisterProvider,
registerSpeechProvider: handlers.registerSpeechProvider ?? noopRegisterSpeechProvider,
registerRealtimeTranscriptionProvider:
handlers.registerRealtimeTranscriptionProvider ?? noopRegisterRealtimeTranscriptionProvider,
registerRealtimeVoiceProvider:
handlers.registerRealtimeVoiceProvider ?? noopRegisterRealtimeVoiceProvider,
registerMediaUnderstandingProvider:
handlers.registerMediaUnderstandingProvider ?? noopRegisterMediaUnderstandingProvider,
registerImageGenerationProvider:

View File

@ -28,6 +28,10 @@ describe("bundled capability metadata", () => {
cliBackendIds: uniqueStrings(manifest.cliBackends),
providerIds: uniqueStrings(manifest.providers),
speechProviderIds: uniqueStrings(manifest.contracts?.speechProviders),
realtimeTranscriptionProviderIds: uniqueStrings(
manifest.contracts?.realtimeTranscriptionProviders,
),
realtimeVoiceProviderIds: uniqueStrings(manifest.contracts?.realtimeVoiceProviders),
mediaUnderstandingProviderIds: uniqueStrings(
manifest.contracts?.mediaUnderstandingProviders,
),
@ -41,6 +45,8 @@ describe("bundled capability metadata", () => {
entry.cliBackendIds.length > 0 ||
entry.providerIds.length > 0 ||
entry.speechProviderIds.length > 0 ||
entry.realtimeTranscriptionProviderIds.length > 0 ||
entry.realtimeVoiceProviderIds.length > 0 ||
entry.mediaUnderstandingProviderIds.length > 0 ||
entry.imageGenerationProviderIds.length > 0 ||
entry.webFetchProviderIds.length > 0 ||

View File

@ -5,6 +5,8 @@ export type BundledPluginContractSnapshot = {
cliBackendIds: string[];
providerIds: string[];
speechProviderIds: string[];
realtimeTranscriptionProviderIds: string[];
realtimeVoiceProviderIds: string[];
mediaUnderstandingProviderIds: string[];
imageGenerationProviderIds: string[];
webFetchProviderIds: string[];
@ -37,6 +39,10 @@ export const BUNDLED_PLUGIN_CONTRACT_SNAPSHOTS: readonly BundledPluginContractSn
cliBackendIds: uniqueStrings(manifest.cliBackends),
providerIds: uniqueStrings(manifest.providers),
speechProviderIds: uniqueStrings(manifest.contracts?.speechProviders),
realtimeTranscriptionProviderIds: uniqueStrings(
manifest.contracts?.realtimeTranscriptionProviders,
),
realtimeVoiceProviderIds: uniqueStrings(manifest.contracts?.realtimeVoiceProviders),
mediaUnderstandingProviderIds: uniqueStrings(manifest.contracts?.mediaUnderstandingProviders),
imageGenerationProviderIds: uniqueStrings(manifest.contracts?.imageGenerationProviders),
webFetchProviderIds: uniqueStrings(manifest.contracts?.webFetchProviders),
@ -48,6 +54,8 @@ export const BUNDLED_PLUGIN_CONTRACT_SNAPSHOTS: readonly BundledPluginContractSn
entry.cliBackendIds.length > 0 ||
entry.providerIds.length > 0 ||
entry.speechProviderIds.length > 0 ||
entry.realtimeTranscriptionProviderIds.length > 0 ||
entry.realtimeVoiceProviderIds.length > 0 ||
entry.mediaUnderstandingProviderIds.length > 0 ||
entry.imageGenerationProviderIds.length > 0 ||
entry.webFetchProviderIds.length > 0 ||
@ -68,6 +76,14 @@ export const BUNDLED_PROVIDER_PLUGIN_IDS = collectPluginIds((entry) => entry.pro
export const BUNDLED_SPEECH_PLUGIN_IDS = collectPluginIds((entry) => entry.speechProviderIds);
export const BUNDLED_REALTIME_TRANSCRIPTION_PLUGIN_IDS = collectPluginIds(
(entry) => entry.realtimeTranscriptionProviderIds,
);
export const BUNDLED_REALTIME_VOICE_PLUGIN_IDS = collectPluginIds(
(entry) => entry.realtimeVoiceProviderIds,
);
export const BUNDLED_MEDIA_UNDERSTANDING_PLUGIN_IDS = collectPluginIds(
(entry) => entry.mediaUnderstandingProviderIds,
);
@ -84,6 +100,8 @@ export const BUNDLED_RUNTIME_CONTRACT_PLUGIN_IDS = [
(entry) =>
entry.providerIds.length > 0 ||
entry.speechProviderIds.length > 0 ||
entry.realtimeTranscriptionProviderIds.length > 0 ||
entry.realtimeVoiceProviderIds.length > 0 ||
entry.mediaUnderstandingProviderIds.length > 0 ||
entry.imageGenerationProviderIds.length > 0 ||
entry.webFetchProviderIds.length > 0 ||

View File

@ -122,6 +122,8 @@ function createCapabilityPluginRecord(params: {
cliBackendIds: [],
providerIds: [],
speechProviderIds: [],
realtimeTranscriptionProviderIds: [],
realtimeVoiceProviderIds: [],
mediaUnderstandingProviderIds: [],
imageGenerationProviderIds: [],
webFetchProviderIds: [],
@ -272,6 +274,12 @@ export function loadBundledCapabilityRuntimeRegistry(params: {
record.cliBackendIds.push(...captured.cliBackends.map((entry) => entry.id));
record.providerIds.push(...captured.providers.map((entry) => entry.id));
record.speechProviderIds.push(...captured.speechProviders.map((entry) => entry.id));
record.realtimeTranscriptionProviderIds.push(
...captured.realtimeTranscriptionProviders.map((entry) => entry.id),
);
record.realtimeVoiceProviderIds.push(
...captured.realtimeVoiceProviders.map((entry) => entry.id),
);
record.mediaUnderstandingProviderIds.push(
...captured.mediaUnderstandingProviders.map((entry) => entry.id),
);
@ -309,6 +317,24 @@ export function loadBundledCapabilityRuntimeRegistry(params: {
rootDir: record.rootDir,
})),
);
registry.realtimeTranscriptionProviders.push(
...captured.realtimeTranscriptionProviders.map((provider) => ({
pluginId: record.id,
pluginName: record.name,
provider,
source: record.source,
rootDir: record.rootDir,
})),
);
registry.realtimeVoiceProviders.push(
...captured.realtimeVoiceProviders.map((provider) => ({
pluginId: record.id,
pluginName: record.name,
provider,
source: record.source,
rootDir: record.rootDir,
})),
);
registry.mediaUnderstandingProviders.push(
...captured.mediaUnderstandingProviders.map((provider) => ({
pluginId: record.id,

View File

@ -102,7 +102,12 @@ function setBundledCapabilityFixture(contractKey: string) {
}
function expectCompatChainApplied(params: {
key: "speechProviders" | "mediaUnderstandingProviders" | "imageGenerationProviders";
key:
| "speechProviders"
| "realtimeTranscriptionProviders"
| "realtimeVoiceProviders"
| "mediaUnderstandingProviders"
| "imageGenerationProviders";
contractKey: string;
cfg: OpenClawConfig;
enablementCompat: {
@ -201,6 +206,8 @@ describe("resolvePluginCapabilityProviders", () => {
it.each([
["speechProviders", "speechProviders"],
["realtimeTranscriptionProviders", "realtimeTranscriptionProviders"],
["realtimeVoiceProviders", "realtimeVoiceProviders"],
["mediaUnderstandingProviders", "mediaUnderstandingProviders"],
["imageGenerationProviders", "imageGenerationProviders"],
] as const)("applies bundled compat before fallback loading for %s", (key, contractKey) => {

View File

@ -9,11 +9,15 @@ import type { PluginRegistry } from "./registry.js";
type CapabilityProviderRegistryKey =
| "speechProviders"
| "realtimeTranscriptionProviders"
| "realtimeVoiceProviders"
| "mediaUnderstandingProviders"
| "imageGenerationProviders";
type CapabilityContractKey =
| "speechProviders"
| "realtimeTranscriptionProviders"
| "realtimeVoiceProviders"
| "mediaUnderstandingProviders"
| "imageGenerationProviders";
@ -22,6 +26,8 @@ type CapabilityProviderForKey<K extends CapabilityProviderRegistryKey> =
const CAPABILITY_CONTRACT_KEY: Record<CapabilityProviderRegistryKey, CapabilityContractKey> = {
speechProviders: "speechProviders",
realtimeTranscriptionProviders: "realtimeTranscriptionProviders",
realtimeVoiceProviders: "realtimeVoiceProviders",
mediaUnderstandingProviders: "mediaUnderstandingProviders",
imageGenerationProviders: "imageGenerationProviders",
};

View File

@ -10,6 +10,8 @@ import type {
OpenClawPluginCliCommandDescriptor,
OpenClawPluginCliRegistrar,
ProviderPlugin,
RealtimeTranscriptionProviderPlugin,
RealtimeVoiceProviderPlugin,
SpeechProviderPlugin,
WebFetchProviderPlugin,
WebSearchProviderPlugin,
@ -27,6 +29,8 @@ export type CapturedPluginRegistration = {
cliRegistrars: CapturedPluginCliRegistration[];
cliBackends: CliBackendPlugin[];
speechProviders: SpeechProviderPlugin[];
realtimeTranscriptionProviders: RealtimeTranscriptionProviderPlugin[];
realtimeVoiceProviders: RealtimeVoiceProviderPlugin[];
mediaUnderstandingProviders: MediaUnderstandingProviderPlugin[];
imageGenerationProviders: ImageGenerationProviderPlugin[];
webFetchProviders: WebFetchProviderPlugin[];
@ -42,6 +46,8 @@ export function createCapturedPluginRegistration(params?: {
const cliRegistrars: CapturedPluginCliRegistration[] = [];
const cliBackends: CliBackendPlugin[] = [];
const speechProviders: SpeechProviderPlugin[] = [];
const realtimeTranscriptionProviders: RealtimeTranscriptionProviderPlugin[] = [];
const realtimeVoiceProviders: RealtimeVoiceProviderPlugin[] = [];
const mediaUnderstandingProviders: MediaUnderstandingProviderPlugin[] = [];
const imageGenerationProviders: ImageGenerationProviderPlugin[] = [];
const webFetchProviders: WebFetchProviderPlugin[] = [];
@ -59,6 +65,8 @@ export function createCapturedPluginRegistration(params?: {
cliRegistrars,
cliBackends,
speechProviders,
realtimeTranscriptionProviders,
realtimeVoiceProviders,
mediaUnderstandingProviders,
imageGenerationProviders,
webFetchProviders,
@ -106,6 +114,12 @@ export function createCapturedPluginRegistration(params?: {
registerSpeechProvider(provider: SpeechProviderPlugin) {
speechProviders.push(provider);
},
registerRealtimeTranscriptionProvider(provider: RealtimeTranscriptionProviderPlugin) {
realtimeTranscriptionProviders.push(provider);
},
registerRealtimeVoiceProvider(provider: RealtimeVoiceProviderPlugin) {
realtimeVoiceProviders.push(provider);
},
registerMediaUnderstandingProvider(provider: MediaUnderstandingProviderPlugin) {
mediaUnderstandingProviders.push(provider);
},

View File

@ -155,9 +155,10 @@ async function loadPluginCliCommandRegistry(
export async function getPluginCliCommandDescriptors(
cfg?: OpenClawConfig,
env?: NodeJS.ProcessEnv,
loaderOptions?: Pick<PluginLoadOptions, "pluginSdkResolution">,
): Promise<OpenClawPluginCliCommandDescriptor[]> {
try {
const { registry } = await loadPluginCliMetadataRegistry(cfg, env);
const { registry } = await loadPluginCliMetadataRegistry(cfg, env, loaderOptions);
const seen = new Set<string>();
const descriptors: OpenClawPluginCliCommandDescriptor[] = [];
for (const entry of registry.cliRegistrars) {

View File

@ -8,6 +8,8 @@ import {
pluginRegistrationContractRegistry,
providerContractLoadError,
providerContractPluginIds,
realtimeTranscriptionProviderContractRegistry,
realtimeVoiceProviderContractRegistry,
resolveWebFetchProviderContractEntriesForPluginId,
resolveWebSearchProviderContractEntriesForPluginId,
speechProviderContractRegistry,
@ -27,7 +29,11 @@ describe("plugin contract registry", () => {
predicate: (plugin: {
origin: string;
providers: unknown[];
contracts?: { speechProviders?: unknown[] };
contracts?: {
speechProviders?: unknown[];
realtimeTranscriptionProviders?: unknown[];
realtimeVoiceProviders?: unknown[];
};
}) => boolean;
}) {
expect(uniqueSortedStrings(params.actualPluginIds)).toEqual(
@ -39,7 +45,11 @@ describe("plugin contract registry", () => {
predicate: (plugin: {
origin: string;
providers: unknown[];
contracts?: { speechProviders?: unknown[] };
contracts?: {
speechProviders?: unknown[];
realtimeTranscriptionProviders?: unknown[];
realtimeVoiceProviders?: unknown[];
};
}) => boolean,
) {
return loadPluginManifestRegistry({})
@ -70,6 +80,14 @@ describe("plugin contract registry", () => {
name: "does not duplicate bundled media provider ids",
ids: () => mediaUnderstandingProviderContractRegistry.map((entry) => entry.provider.id),
},
{
name: "does not duplicate bundled realtime transcription provider ids",
ids: () => realtimeTranscriptionProviderContractRegistry.map((entry) => entry.provider.id),
},
{
name: "does not duplicate bundled realtime voice provider ids",
ids: () => realtimeVoiceProviderContractRegistry.map((entry) => entry.provider.id),
},
{
name: "does not duplicate bundled image-generation provider ids",
ids: () => imageGenerationProviderContractRegistry.map((entry) => entry.provider.id),
@ -101,6 +119,23 @@ describe("plugin contract registry", () => {
});
});
it("covers every bundled realtime voice plugin discovered from manifests", () => {
expectRegistryPluginIds({
actualPluginIds: realtimeVoiceProviderContractRegistry.map((entry) => entry.pluginId),
predicate: (plugin) =>
plugin.origin === "bundled" && (plugin.contracts?.realtimeVoiceProviders?.length ?? 0) > 0,
});
});
it("covers every bundled realtime transcription plugin discovered from manifests", () => {
expectRegistryPluginIds({
actualPluginIds: realtimeTranscriptionProviderContractRegistry.map((entry) => entry.pluginId),
predicate: (plugin) =>
plugin.origin === "bundled" &&
(plugin.contracts?.realtimeTranscriptionProviders?.length ?? 0) > 0,
});
});
it("covers every bundled web fetch plugin from the shared resolver", () => {
const bundledWebFetchPluginIds = resolveBundledWebFetchPluginIds({});

View File

@ -3,6 +3,8 @@ import {
BUNDLED_MEDIA_UNDERSTANDING_PLUGIN_IDS,
BUNDLED_PLUGIN_CONTRACT_SNAPSHOTS,
BUNDLED_PROVIDER_PLUGIN_IDS,
BUNDLED_REALTIME_TRANSCRIPTION_PLUGIN_IDS,
BUNDLED_REALTIME_VOICE_PLUGIN_IDS,
BUNDLED_SPEECH_PLUGIN_IDS,
BUNDLED_WEB_FETCH_PLUGIN_IDS,
BUNDLED_WEB_SEARCH_PLUGIN_IDS,
@ -12,6 +14,8 @@ import type {
ImageGenerationProviderPlugin,
MediaUnderstandingProviderPlugin,
ProviderPlugin,
RealtimeTranscriptionProviderPlugin,
RealtimeVoiceProviderPlugin,
SpeechProviderPlugin,
WebFetchProviderPlugin,
WebSearchProviderPlugin,
@ -19,6 +23,8 @@ import type {
import {
loadVitestImageGenerationProviderContractRegistry,
loadVitestMediaUnderstandingProviderContractRegistry,
loadVitestRealtimeTranscriptionProviderContractRegistry,
loadVitestRealtimeVoiceProviderContractRegistry,
loadVitestSpeechProviderContractRegistry,
} from "./speech-vitest-registry.js";
@ -38,6 +44,9 @@ type WebFetchProviderContractEntry = CapabilityContractEntry<WebFetchProviderPlu
};
type SpeechProviderContractEntry = CapabilityContractEntry<SpeechProviderPlugin>;
type RealtimeTranscriptionProviderContractEntry =
CapabilityContractEntry<RealtimeTranscriptionProviderPlugin>;
type RealtimeVoiceProviderContractEntry = CapabilityContractEntry<RealtimeVoiceProviderPlugin>;
type MediaUnderstandingProviderContractEntry =
CapabilityContractEntry<MediaUnderstandingProviderPlugin>;
type ImageGenerationProviderContractEntry = CapabilityContractEntry<ImageGenerationProviderPlugin>;
@ -47,6 +56,8 @@ type PluginRegistrationContractEntry = {
cliBackendIds: string[];
providerIds: string[];
speechProviderIds: string[];
realtimeTranscriptionProviderIds: string[];
realtimeVoiceProviderIds: string[];
mediaUnderstandingProviderIds: string[];
imageGenerationProviderIds: string[];
webFetchProviderIds: string[];
@ -94,6 +105,10 @@ let webSearchProviderContractRegistryByPluginIdCache: Map<
WebSearchProviderContractEntry[]
> | null = null;
let speechProviderContractRegistryCache: SpeechProviderContractEntry[] | null = null;
let realtimeTranscriptionProviderContractRegistryCache:
| RealtimeTranscriptionProviderContractEntry[]
| null = null;
let realtimeVoiceProviderContractRegistryCache: RealtimeVoiceProviderContractEntry[] | null = null;
let mediaUnderstandingProviderContractRegistryCache:
| MediaUnderstandingProviderContractEntry[]
| null = null;
@ -387,6 +402,36 @@ function loadSpeechProviderContractRegistry(): SpeechProviderContractEntry[] {
return speechProviderContractRegistryCache;
}
/**
 * Lazily builds and memoizes the realtime-voice provider contract registry.
 * Under Vitest the entries come from the Vitest-specific loader; otherwise
 * they are pulled from the bundled capability runtime registry resolved
 * against the dist SDK.
 */
function loadRealtimeVoiceProviderContractRegistry(): RealtimeVoiceProviderContractEntry[] {
  if (realtimeVoiceProviderContractRegistryCache) {
    return realtimeVoiceProviderContractRegistryCache;
  }
  if (process.env.VITEST) {
    realtimeVoiceProviderContractRegistryCache = loadVitestRealtimeVoiceProviderContractRegistry();
    return realtimeVoiceProviderContractRegistryCache;
  }
  const runtime = loadBundledCapabilityRuntimeRegistry({
    pluginIds: BUNDLED_REALTIME_VOICE_PLUGIN_IDS,
    pluginSdkResolution: "dist",
  });
  realtimeVoiceProviderContractRegistryCache = runtime.realtimeVoiceProviders.map((entry) => ({
    pluginId: entry.pluginId,
    provider: entry.provider,
  }));
  return realtimeVoiceProviderContractRegistryCache;
}
/**
 * Lazily builds and memoizes the realtime-transcription provider contract
 * registry. Under Vitest the entries come from the Vitest-specific loader;
 * otherwise they are pulled from the bundled capability runtime registry
 * resolved against the dist SDK.
 */
function loadRealtimeTranscriptionProviderContractRegistry(): RealtimeTranscriptionProviderContractEntry[] {
  if (realtimeTranscriptionProviderContractRegistryCache) {
    return realtimeTranscriptionProviderContractRegistryCache;
  }
  if (process.env.VITEST) {
    realtimeTranscriptionProviderContractRegistryCache =
      loadVitestRealtimeTranscriptionProviderContractRegistry();
    return realtimeTranscriptionProviderContractRegistryCache;
  }
  const runtime = loadBundledCapabilityRuntimeRegistry({
    pluginIds: BUNDLED_REALTIME_TRANSCRIPTION_PLUGIN_IDS,
    pluginSdkResolution: "dist",
  });
  realtimeTranscriptionProviderContractRegistryCache =
    runtime.realtimeTranscriptionProviders.map((entry) => ({
      pluginId: entry.pluginId,
      provider: entry.provider,
    }));
  return realtimeTranscriptionProviderContractRegistryCache;
}
function loadMediaUnderstandingProviderContractRegistry(): MediaUnderstandingProviderContractEntry[] {
if (!mediaUnderstandingProviderContractRegistryCache) {
mediaUnderstandingProviderContractRegistryCache = process.env.VITEST
@ -519,6 +564,12 @@ export const speechProviderContractRegistry: SpeechProviderContractEntry[] = cre
loadSpeechProviderContractRegistry,
);
export const realtimeTranscriptionProviderContractRegistry: RealtimeTranscriptionProviderContractEntry[] =
createLazyArrayView(loadRealtimeTranscriptionProviderContractRegistry);
export const realtimeVoiceProviderContractRegistry: RealtimeVoiceProviderContractEntry[] =
createLazyArrayView(loadRealtimeVoiceProviderContractRegistry);
export const mediaUnderstandingProviderContractRegistry: MediaUnderstandingProviderContractEntry[] =
createLazyArrayView(loadMediaUnderstandingProviderContractRegistry);
@ -531,6 +582,8 @@ function loadPluginRegistrationContractRegistry(): PluginRegistrationContractEnt
cliBackendIds: uniqueStrings(entry.cliBackendIds),
providerIds: uniqueStrings(entry.providerIds),
speechProviderIds: uniqueStrings(entry.speechProviderIds),
realtimeTranscriptionProviderIds: uniqueStrings(entry.realtimeTranscriptionProviderIds),
realtimeVoiceProviderIds: uniqueStrings(entry.realtimeVoiceProviderIds),
mediaUnderstandingProviderIds: uniqueStrings(entry.mediaUnderstandingProviderIds),
imageGenerationProviderIds: uniqueStrings(entry.imageGenerationProviderIds),
webFetchProviderIds: uniqueStrings(entry.webFetchProviderIds),

View File

@ -5,6 +5,8 @@ import { createJiti } from "jiti";
import {
BUNDLED_IMAGE_GENERATION_PLUGIN_IDS,
BUNDLED_MEDIA_UNDERSTANDING_PLUGIN_IDS,
BUNDLED_REALTIME_TRANSCRIPTION_PLUGIN_IDS,
BUNDLED_REALTIME_VOICE_PLUGIN_IDS,
BUNDLED_SPEECH_PLUGIN_IDS,
} from "../bundled-capability-metadata.js";
import { loadBundledCapabilityRuntimeRegistry } from "../bundled-capability-runtime.js";
@ -13,6 +15,8 @@ import { buildPluginLoaderAliasMap, buildPluginLoaderJitiOptions } from "../sdk-
import type {
ImageGenerationProviderPlugin,
MediaUnderstandingProviderPlugin,
RealtimeTranscriptionProviderPlugin,
RealtimeVoiceProviderPlugin,
SpeechProviderPlugin,
} from "../types.js";
@ -26,6 +30,16 @@ export type MediaUnderstandingProviderContractEntry = {
provider: MediaUnderstandingProviderPlugin;
};
export type RealtimeVoiceProviderContractEntry = {
pluginId: string;
provider: RealtimeVoiceProviderPlugin;
};
export type RealtimeTranscriptionProviderContractEntry = {
pluginId: string;
provider: RealtimeTranscriptionProviderPlugin;
};
export type ImageGenerationProviderContractEntry = {
pluginId: string;
provider: ImageGenerationProviderPlugin;
@ -190,6 +204,96 @@ export function loadVitestMediaUnderstandingProviderContractRegistry(): MediaUnd
return registrations;
}
/**
 * Resolves the realtime-voice provider contract registry for Vitest runs.
 *
 * Plugins whose root directory ships a `test-api.ts` exposing a
 * `build…RealtimeVoiceProvider` factory are instantiated through that
 * factory; every bundled plugin id left unresolved afterwards falls back to
 * the bundled capability runtime registry (dist SDK resolution).
 */
export function loadVitestRealtimeVoiceProviderContractRegistry(): RealtimeVoiceProviderContractEntry[] {
  const entries: RealtimeVoiceProviderContractEntry[] = [];
  const { manifests, unresolvedPluginIds } = resolveTestApiModuleRecords(
    BUNDLED_REALTIME_VOICE_PLUGIN_IDS,
  );
  for (const manifest of manifests) {
    if (!manifest.rootDir) {
      continue;
    }
    const testApiPath = path.join(manifest.rootDir, "test-api.ts");
    if (!fs.existsSync(testApiPath)) {
      continue;
    }
    // Load the plugin's test API module and locate its provider factory.
    const moduleExports = createVitestCapabilityLoader(testApiPath)(testApiPath);
    const buildProvider = resolveNamedBuilder<RealtimeVoiceProviderPlugin>(
      moduleExports,
      /^build.+RealtimeVoiceProvider$/u,
    );
    if (!buildProvider) {
      continue;
    }
    entries.push({ pluginId: manifest.id, provider: buildProvider() });
    unresolvedPluginIds.delete(manifest.id);
  }
  if (unresolvedPluginIds.size > 0) {
    // Anything without a usable test API falls back to the bundled runtime.
    const fallback = loadBundledCapabilityRuntimeRegistry({
      pluginIds: [...unresolvedPluginIds],
      pluginSdkResolution: "dist",
    });
    for (const entry of fallback.realtimeVoiceProviders) {
      entries.push({ pluginId: entry.pluginId, provider: entry.provider });
    }
  }
  return entries;
}
/**
 * Resolves the realtime-transcription provider contract registry for Vitest
 * runs.
 *
 * Plugins whose root directory ships a `test-api.ts` exposing a
 * `build…RealtimeTranscriptionProvider` factory are instantiated through that
 * factory; every bundled plugin id left unresolved afterwards falls back to
 * the bundled capability runtime registry (dist SDK resolution).
 */
export function loadVitestRealtimeTranscriptionProviderContractRegistry(): RealtimeTranscriptionProviderContractEntry[] {
  const entries: RealtimeTranscriptionProviderContractEntry[] = [];
  const { manifests, unresolvedPluginIds } = resolveTestApiModuleRecords(
    BUNDLED_REALTIME_TRANSCRIPTION_PLUGIN_IDS,
  );
  for (const manifest of manifests) {
    if (!manifest.rootDir) {
      continue;
    }
    const testApiPath = path.join(manifest.rootDir, "test-api.ts");
    if (!fs.existsSync(testApiPath)) {
      continue;
    }
    // Load the plugin's test API module and locate its provider factory.
    const moduleExports = createVitestCapabilityLoader(testApiPath)(testApiPath);
    const buildProvider = resolveNamedBuilder<RealtimeTranscriptionProviderPlugin>(
      moduleExports,
      /^build.+RealtimeTranscriptionProvider$/u,
    );
    if (!buildProvider) {
      continue;
    }
    entries.push({ pluginId: manifest.id, provider: buildProvider() });
    unresolvedPluginIds.delete(manifest.id);
  }
  if (unresolvedPluginIds.size > 0) {
    // Anything without a usable test API falls back to the bundled runtime.
    const fallback = loadBundledCapabilityRuntimeRegistry({
      pluginIds: [...unresolvedPluginIds],
      pluginSdkResolution: "dist",
    });
    for (const entry of fallback.realtimeTranscriptionProviders) {
      entries.push({ pluginId: entry.pluginId, provider: entry.provider });
    }
  }
  return entries;
}
export function loadVitestImageGenerationProviderContractRegistry(): ImageGenerationProviderContractEntry[] {
const registrations: ImageGenerationProviderContractEntry[] = [];
const { manifests, unresolvedPluginIds } = resolveTestApiModuleRecords(

View File

@ -590,6 +590,8 @@ function createPluginRecord(params: {
cliBackendIds: [],
providerIds: [],
speechProviderIds: [],
realtimeTranscriptionProviderIds: [],
realtimeVoiceProviderIds: [],
mediaUnderstandingProviderIds: [],
imageGenerationProviderIds: [],
webFetchProviderIds: [],

View File

@ -52,6 +52,8 @@ export type PluginManifest = {
export type PluginManifestContracts = {
speechProviders?: string[];
realtimeTranscriptionProviders?: string[];
realtimeVoiceProviders?: string[];
mediaUnderstandingProviders?: string[];
imageGenerationProviders?: string[];
webFetchProviders?: string[];
@ -125,6 +127,8 @@ function normalizeManifestContracts(value: unknown): PluginManifestContracts | u
}
const speechProviders = normalizeStringList(value.speechProviders);
const realtimeTranscriptionProviders = normalizeStringList(value.realtimeTranscriptionProviders);
const realtimeVoiceProviders = normalizeStringList(value.realtimeVoiceProviders);
const mediaUnderstandingProviders = normalizeStringList(value.mediaUnderstandingProviders);
const imageGenerationProviders = normalizeStringList(value.imageGenerationProviders);
const webFetchProviders = normalizeStringList(value.webFetchProviders);
@ -132,6 +136,8 @@ function normalizeManifestContracts(value: unknown): PluginManifestContracts | u
const tools = normalizeStringList(value.tools);
const contracts = {
...(speechProviders.length > 0 ? { speechProviders } : {}),
...(realtimeTranscriptionProviders.length > 0 ? { realtimeTranscriptionProviders } : {}),
...(realtimeVoiceProviders.length > 0 ? { realtimeVoiceProviders } : {}),
...(mediaUnderstandingProviders.length > 0 ? { mediaUnderstandingProviders } : {}),
...(imageGenerationProviders.length > 0 ? { imageGenerationProviders } : {}),
...(webFetchProviders.length > 0 ? { webFetchProviders } : {}),

View File

@ -11,6 +11,8 @@ export function createEmptyPluginRegistry(): PluginRegistry {
providers: [],
cliBackends: [],
speechProviders: [],
realtimeTranscriptionProviders: [],
realtimeVoiceProviders: [],
mediaUnderstandingProviders: [],
imageGenerationProviders: [],
webFetchProviders: [],

View File

@ -38,7 +38,7 @@ import {
import type {
CliBackendPlugin,
ImageGenerationProviderPlugin,
WebFetchProviderPlugin,
RealtimeTranscriptionProviderPlugin,
OpenClawPluginApi,
OpenClawPluginChannelRegistration,
OpenClawPluginCliCommandDescriptor,
@ -52,6 +52,7 @@ import type {
OpenClawPluginHookOptions,
MediaUnderstandingProviderPlugin,
ProviderPlugin,
RealtimeVoiceProviderPlugin,
OpenClawPluginService,
OpenClawPluginToolContext,
OpenClawPluginToolFactory,
@ -67,6 +68,7 @@ import type {
PluginHookHandlerMap,
PluginHookRegistration as TypedPluginHookRegistration,
SpeechProviderPlugin,
WebFetchProviderPlugin,
WebSearchProviderPlugin,
} from "./types.js";
@ -142,6 +144,10 @@ type PluginOwnedProviderRegistration<T extends { id: string }> = {
export type PluginSpeechProviderRegistration =
PluginOwnedProviderRegistration<SpeechProviderPlugin>;
export type PluginRealtimeTranscriptionProviderRegistration =
PluginOwnedProviderRegistration<RealtimeTranscriptionProviderPlugin>;
export type PluginRealtimeVoiceProviderRegistration =
PluginOwnedProviderRegistration<RealtimeVoiceProviderPlugin>;
export type PluginMediaUnderstandingProviderRegistration =
PluginOwnedProviderRegistration<MediaUnderstandingProviderPlugin>;
export type PluginImageGenerationProviderRegistration =
@ -213,6 +219,8 @@ export type PluginRecord = {
cliBackendIds: string[];
providerIds: string[];
speechProviderIds: string[];
realtimeTranscriptionProviderIds: string[];
realtimeVoiceProviderIds: string[];
mediaUnderstandingProviderIds: string[];
imageGenerationProviderIds: string[];
webFetchProviderIds: string[];
@ -239,6 +247,8 @@ export type PluginRegistry = {
providers: PluginProviderRegistration[];
cliBackends?: PluginCliBackendRegistration[];
speechProviders: PluginSpeechProviderRegistration[];
realtimeTranscriptionProviders: PluginRealtimeTranscriptionProviderRegistration[];
realtimeVoiceProviders: PluginRealtimeVoiceProviderRegistration[];
mediaUnderstandingProviders: PluginMediaUnderstandingProviderRegistration[];
imageGenerationProviders: PluginImageGenerationProviderRegistration[];
webFetchProviders: PluginWebFetchProviderRegistration[];
@ -699,6 +709,32 @@ export function createPluginRegistry(registryParams: PluginRegistryParams) {
});
};
// Registers a realtime transcription provider owned by `record`: the shared
// registrar appends to `registry.realtimeTranscriptionProviders` and records
// the id in `record.realtimeTranscriptionProviderIds`. Presumably it also
// rejects duplicate ids (the helper is named `registerUniqueProviderLike`) —
// confirm against its definition. `kindLabel` is likely used in diagnostics.
const registerRealtimeTranscriptionProvider = (
  record: PluginRecord,
  provider: RealtimeTranscriptionProviderPlugin,
) => {
  registerUniqueProviderLike({
    record,
    provider,
    kindLabel: "realtime transcription provider",
    registrations: registry.realtimeTranscriptionProviders,
    ownedIds: record.realtimeTranscriptionProviderIds,
  });
};
// Registers a realtime voice provider owned by `record`: the shared
// registrar appends to `registry.realtimeVoiceProviders` and records the id
// in `record.realtimeVoiceProviderIds`. Presumably it also rejects duplicate
// ids (the helper is named `registerUniqueProviderLike`) — confirm against
// its definition. `kindLabel` is likely used in diagnostics.
const registerRealtimeVoiceProvider = (
  record: PluginRecord,
  provider: RealtimeVoiceProviderPlugin,
) => {
  registerUniqueProviderLike({
    record,
    provider,
    kindLabel: "realtime voice provider",
    registrations: registry.realtimeVoiceProviders,
    ownedIds: record.realtimeVoiceProviderIds,
  });
};
const registerMediaUnderstandingProvider = (
record: PluginRecord,
provider: MediaUnderstandingProviderPlugin,
@ -1009,6 +1045,10 @@ export function createPluginRegistry(registryParams: PluginRegistryParams) {
registerHttpRoute: (routeParams) => registerHttpRoute(record, routeParams),
registerProvider: (provider) => registerProvider(record, provider),
registerSpeechProvider: (provider) => registerSpeechProvider(record, provider),
registerRealtimeTranscriptionProvider: (provider) =>
registerRealtimeTranscriptionProvider(record, provider),
registerRealtimeVoiceProvider: (provider) =>
registerRealtimeVoiceProvider(record, provider),
registerMediaUnderstandingProvider: (provider) =>
registerMediaUnderstandingProvider(record, provider),
registerImageGenerationProvider: (provider) =>
@ -1198,6 +1238,8 @@ export function createPluginRegistry(registryParams: PluginRegistryParams) {
registerProvider,
registerCliBackend,
registerSpeechProvider,
registerRealtimeTranscriptionProvider,
registerRealtimeVoiceProvider,
registerMediaUnderstandingProvider,
registerImageGenerationProvider,
registerWebSearchProvider,

View File

@ -199,6 +199,8 @@ describe("setActivePluginRegistry", () => {
cliBackendIds: [],
providerIds: [],
speechProviderIds: [],
realtimeTranscriptionProviderIds: [],
realtimeVoiceProviderIds: [],
mediaUnderstandingProviderIds: [],
imageGenerationProviderIds: [],
webFetchProviderIds: [],
@ -225,6 +227,8 @@ describe("setActivePluginRegistry", () => {
cliBackendIds: [],
providerIds: [],
speechProviderIds: [],
realtimeTranscriptionProviderIds: [],
realtimeVoiceProviderIds: [],
mediaUnderstandingProviderIds: [],
imageGenerationProviderIds: [],
webFetchProviderIds: [],

View File

@ -51,6 +51,8 @@ export function createPluginRecord(
cliBackendIds: [],
providerIds: [],
speechProviderIds: [],
realtimeTranscriptionProviderIds: [],
realtimeVoiceProviderIds: [],
mediaUnderstandingProviderIds: [],
imageGenerationProviderIds: [],
webFetchProviderIds: [],
@ -107,7 +109,7 @@ export function createCustomHook(params: {
export function createPluginLoadResult(
overrides: Partial<PluginLoadResult> & Pick<PluginLoadResult, "plugins"> = { plugins: [] },
): PluginLoadResult {
const { plugins, ...rest } = overrides;
const { plugins, realtimeTranscriptionProviders, realtimeVoiceProviders, ...rest } = overrides;
return {
plugins,
diagnostics: [],
@ -129,6 +131,8 @@ export function createPluginLoadResult(
commands: [],
conversationBindingResolvedHandlers: [],
...rest,
realtimeTranscriptionProviders: realtimeTranscriptionProviders ?? [],
realtimeVoiceProviders: realtimeVoiceProviders ?? [],
};
}

View File

@ -28,6 +28,8 @@ export type PluginCapabilityKind =
| "cli-backend"
| "text-inference"
| "speech"
| "realtime-transcription"
| "realtime-voice"
| "media-understanding"
| "image-generation"
| "web-search"
@ -233,6 +235,8 @@ function buildCapabilityEntries(plugin: PluginRegistry["plugins"][number]) {
{ kind: "cli-backend" as const, ids: plugin.cliBackendIds ?? [] },
{ kind: "text-inference" as const, ids: plugin.providerIds },
{ kind: "speech" as const, ids: plugin.speechProviderIds },
{ kind: "realtime-transcription" as const, ids: plugin.realtimeTranscriptionProviderIds },
{ kind: "realtime-voice" as const, ids: plugin.realtimeVoiceProviderIds },
{ kind: "media-understanding" as const, ids: plugin.mediaUnderstandingProviderIds },
{ kind: "image-generation" as const, ids: plugin.imageGenerationProviderIds },
{ kind: "web-search" as const, ids: plugin.webSearchProviderIds },

View File

@ -30,6 +30,22 @@ import type { HookEntry } from "../hooks/types.js";
import type { ImageGenerationProvider } from "../image-generation/types.js";
import type { ProviderUsageSnapshot } from "../infra/provider-usage.types.js";
import type { MediaUnderstandingProvider } from "../media-understanding/types.js";
import type {
RealtimeTranscriptionProviderConfig,
RealtimeTranscriptionProviderConfiguredContext,
RealtimeTranscriptionProviderId,
RealtimeTranscriptionProviderResolveConfigContext,
RealtimeTranscriptionSession,
RealtimeTranscriptionSessionCreateRequest,
} from "../realtime-transcription/provider-types.js";
import type {
RealtimeVoiceBridge,
RealtimeVoiceBridgeCreateRequest,
RealtimeVoiceProviderConfig,
RealtimeVoiceProviderConfiguredContext,
RealtimeVoiceProviderId,
RealtimeVoiceProviderResolveConfigContext,
} from "../realtime-voice/provider-types.js";
import type { RuntimeEnv } from "../runtime.js";
import type {
RuntimeWebFetchMetadata,
@ -1526,6 +1542,38 @@ export type PluginSpeechProviderEntry = SpeechProviderPlugin & {
pluginId: string;
};
/** Realtime transcription capability registered by a plugin (streaming STT). */
export type RealtimeTranscriptionProviderPlugin = {
  // Canonical provider id used for selection and config lookup.
  id: RealtimeTranscriptionProviderId;
  // Human-readable display name.
  label: string;
  // Alternate ids accepted when resolving the provider by name.
  aliases?: string[];
  // Ordering hint for automatic provider selection — semantics not visible
  // here; presumably lower values are preferred. TODO confirm in the selector.
  autoSelectOrder?: number;
  // Optional hook deriving provider-specific config from the resolve context.
  resolveConfig?: (
    ctx: RealtimeTranscriptionProviderResolveConfigContext,
  ) => RealtimeTranscriptionProviderConfig;
  // Reports whether the provider has the configuration it needs to run.
  isConfigured: (ctx: RealtimeTranscriptionProviderConfiguredContext) => boolean;
  // Opens a realtime transcription session for the given request.
  createSession: (req: RealtimeTranscriptionSessionCreateRequest) => RealtimeTranscriptionSession;
};

/** A realtime transcription provider paired with its owning plugin id. */
export type PluginRealtimeTranscriptionProviderEntry = RealtimeTranscriptionProviderPlugin & {
  pluginId: string;
};

/** Realtime voice capability registered by a plugin (duplex voice). */
export type RealtimeVoiceProviderPlugin = {
  // Canonical provider id used for selection and config lookup.
  id: RealtimeVoiceProviderId;
  // Human-readable display name.
  label: string;
  // Alternate ids accepted when resolving the provider by name.
  aliases?: string[];
  // Ordering hint for automatic provider selection — semantics not visible
  // here; presumably lower values are preferred. TODO confirm in the selector.
  autoSelectOrder?: number;
  // Optional hook deriving provider-specific config from the resolve context.
  resolveConfig?: (ctx: RealtimeVoiceProviderResolveConfigContext) => RealtimeVoiceProviderConfig;
  // Reports whether the provider has the configuration it needs to run.
  isConfigured: (ctx: RealtimeVoiceProviderConfiguredContext) => boolean;
  // Creates a duplex voice bridge for the given request.
  createBridge: (req: RealtimeVoiceBridgeCreateRequest) => RealtimeVoiceBridge;
};

/** A realtime voice provider paired with its owning plugin id. */
export type PluginRealtimeVoiceProviderEntry = RealtimeVoiceProviderPlugin & {
  pluginId: string;
};
export type MediaUnderstandingProviderPlugin = MediaUnderstandingProvider;
export type ImageGenerationProviderPlugin = ImageGenerationProvider;
@ -1850,6 +1898,10 @@ export type OpenClawPluginApi = {
registerProvider: (provider: ProviderPlugin) => void;
/** Register a speech synthesis provider (speech capability). */
registerSpeechProvider: (provider: SpeechProviderPlugin) => void;
/** Register a realtime transcription provider (streaming STT capability). */
registerRealtimeTranscriptionProvider: (provider: RealtimeTranscriptionProviderPlugin) => void;
/** Register a realtime voice provider (duplex voice capability). */
registerRealtimeVoiceProvider: (provider: RealtimeVoiceProviderPlugin) => void;
/** Register a media understanding provider (media understanding capability). */
registerMediaUnderstandingProvider: (provider: MediaUnderstandingProviderPlugin) => void;
/** Register an image generation provider (image generation capability). */

View File

@ -0,0 +1,80 @@
import type { OpenClawConfig } from "../config/config.js";
import { resolvePluginCapabilityProviders } from "../plugins/capability-provider-runtime.js";
import type { RealtimeTranscriptionProviderPlugin } from "../plugins/types.js";
import type { RealtimeTranscriptionProviderId } from "./provider-types.js";
/**
 * Normalize a raw string for lookup: trim whitespace and lowercase.
 * Returns undefined when the input is missing or blank after trimming.
 */
function trimToUndefined(value: string | undefined): string | undefined {
  if (value === undefined) {
    return undefined;
  }
  const normalized = value.trim().toLowerCase();
  return normalized.length > 0 ? normalized : undefined;
}
/**
 * Canonical lookup form of a transcription provider id (trimmed, lowercased);
 * blank or missing input yields undefined.
 */
export function normalizeRealtimeTranscriptionProviderId(
  providerId: string | undefined,
): RealtimeTranscriptionProviderId | undefined {
  const canonical = trimToUndefined(providerId);
  return canonical;
}
/** Collect every realtime-transcription provider registered by loaded plugins. */
function resolveRealtimeTranscriptionProviderEntries(
  cfg?: OpenClawConfig,
): RealtimeTranscriptionProviderPlugin[] {
  return resolvePluginCapabilityProviders({
    cfg,
    key: "realtimeTranscriptionProviders",
  });
}
/**
 * Build lookup tables for the registered transcription providers:
 * `canonical` is keyed by normalized provider id only, while `aliases`
 * additionally contains every normalized alias. On key collisions the
 * later registration wins (Map.set overwrites).
 */
function buildProviderMaps(cfg?: OpenClawConfig): {
  canonical: Map<string, RealtimeTranscriptionProviderPlugin>;
  aliases: Map<string, RealtimeTranscriptionProviderPlugin>;
} {
  const canonical = new Map<string, RealtimeTranscriptionProviderPlugin>();
  const aliases = new Map<string, RealtimeTranscriptionProviderPlugin>();
  for (const provider of resolveRealtimeTranscriptionProviderEntries(cfg)) {
    const providerId = normalizeRealtimeTranscriptionProviderId(provider.id);
    if (providerId === undefined) {
      // A provider without a usable id cannot be looked up; skip it.
      continue;
    }
    canonical.set(providerId, provider);
    aliases.set(providerId, provider);
    for (const rawAlias of provider.aliases ?? []) {
      const aliasId = normalizeRealtimeTranscriptionProviderId(rawAlias);
      if (aliasId !== undefined) {
        aliases.set(aliasId, provider);
      }
    }
  }
  return { canonical, aliases };
}
/** All registered transcription providers, one entry per canonical id. */
export function listRealtimeTranscriptionProviders(
  cfg?: OpenClawConfig,
): RealtimeTranscriptionProviderPlugin[] {
  const { canonical } = buildProviderMaps(cfg);
  return Array.from(canonical.values());
}
/**
 * Look up a transcription provider by id or alias (case/whitespace
 * insensitive). Returns undefined for blank ids or unknown providers.
 */
export function getRealtimeTranscriptionProvider(
  providerId: string | undefined,
  cfg?: OpenClawConfig,
): RealtimeTranscriptionProviderPlugin | undefined {
  const lookupId = normalizeRealtimeTranscriptionProviderId(providerId);
  if (lookupId === undefined) {
    return undefined;
  }
  const { aliases } = buildProviderMaps(cfg);
  return aliases.get(lookupId);
}
/**
 * Resolve an id or alias to the provider's declared canonical id. Falls back
 * to the normalized input when no matching provider is registered.
 */
export function canonicalizeRealtimeTranscriptionProviderId(
  providerId: string | undefined,
  cfg?: OpenClawConfig,
): RealtimeTranscriptionProviderId | undefined {
  const lookupId = normalizeRealtimeTranscriptionProviderId(providerId);
  if (lookupId === undefined) {
    return undefined;
  }
  const match = getRealtimeTranscriptionProvider(lookupId, cfg);
  return match?.id ?? lookupId;
}

View File

@ -0,0 +1,33 @@
import type { OpenClawConfig } from "../config/config.js";

/**
 * Id of a realtime transcription (streaming STT) provider. Any string is
 * allowed so plugins can introduce new providers; the registry normalizes
 * ids to lowercase for lookup.
 */
export type RealtimeTranscriptionProviderId = string;

/** Opaque provider-specific configuration bag; the provider interprets the keys. */
export type RealtimeTranscriptionProviderConfig = Record<string, unknown>;

/** Context handed to a provider's resolveConfig hook. */
export type RealtimeTranscriptionProviderResolveConfigContext = {
  // Full OpenClaw configuration.
  cfg: OpenClawConfig;
  // Provider config prior to resolution — presumably as supplied by the user; confirm.
  rawConfig: RealtimeTranscriptionProviderConfig;
};

/** Context handed to a provider's isConfigured check. */
export type RealtimeTranscriptionProviderConfiguredContext = {
  cfg?: OpenClawConfig;
  providerConfig: RealtimeTranscriptionProviderConfig;
};

/** Event callbacks a transcription session consumer may supply. */
export type RealtimeTranscriptionSessionCallbacks = {
  // NOTE(review): presumably an interim (non-final) hypothesis — confirm.
  onPartial?: (partial: string) => void;
  // Final transcript text.
  onTranscript?: (transcript: string) => void;
  onSpeechStart?: () => void;
  onError?: (error: Error) => void;
};

/** Callbacks plus the provider config needed to create a session. */
export type RealtimeTranscriptionSessionCreateRequest = RealtimeTranscriptionSessionCallbacks & {
  providerConfig: RealtimeTranscriptionProviderConfig;
};

/** Live streaming-transcription connection for a single session. */
export type RealtimeTranscriptionSession = {
  connect(): Promise<void>;
  // Feed an audio chunk into the provider.
  sendAudio(audio: Buffer): void;
  close(): void;
  isConnected(): boolean;
};

View File

@ -0,0 +1,76 @@
import type { OpenClawConfig } from "../config/config.js";
import { resolvePluginCapabilityProviders } from "../plugins/capability-provider-runtime.js";
import type { RealtimeVoiceProviderPlugin } from "../plugins/types.js";
import type { RealtimeVoiceProviderId } from "./provider-types.js";
/**
 * Normalize a raw string for lookup: trim whitespace and lowercase.
 * Returns undefined when the input is missing or blank after trimming.
 */
function trimToUndefined(value: string | undefined): string | undefined {
  if (value === undefined) {
    return undefined;
  }
  const normalized = value.trim().toLowerCase();
  return normalized.length > 0 ? normalized : undefined;
}
/**
 * Canonical lookup form of a voice provider id (trimmed, lowercased);
 * blank or missing input yields undefined.
 */
export function normalizeRealtimeVoiceProviderId(
  providerId: string | undefined,
): RealtimeVoiceProviderId | undefined {
  const canonical = trimToUndefined(providerId);
  return canonical;
}
/** Collect every realtime-voice provider registered by loaded plugins. */
function resolveRealtimeVoiceProviderEntries(cfg?: OpenClawConfig): RealtimeVoiceProviderPlugin[] {
  return resolvePluginCapabilityProviders({
    cfg,
    key: "realtimeVoiceProviders",
  });
}
/**
 * Build lookup tables for the registered voice providers: `canonical` is
 * keyed by normalized provider id only, while `aliases` additionally
 * contains every normalized alias. On key collisions the later
 * registration wins (Map.set overwrites).
 */
function buildProviderMaps(cfg?: OpenClawConfig): {
  canonical: Map<string, RealtimeVoiceProviderPlugin>;
  aliases: Map<string, RealtimeVoiceProviderPlugin>;
} {
  const canonical = new Map<string, RealtimeVoiceProviderPlugin>();
  const aliases = new Map<string, RealtimeVoiceProviderPlugin>();
  for (const provider of resolveRealtimeVoiceProviderEntries(cfg)) {
    const providerId = normalizeRealtimeVoiceProviderId(provider.id);
    if (providerId === undefined) {
      // A provider without a usable id cannot be looked up; skip it.
      continue;
    }
    canonical.set(providerId, provider);
    aliases.set(providerId, provider);
    for (const rawAlias of provider.aliases ?? []) {
      const aliasId = normalizeRealtimeVoiceProviderId(rawAlias);
      if (aliasId !== undefined) {
        aliases.set(aliasId, provider);
      }
    }
  }
  return { canonical, aliases };
}
/** All registered voice providers, one entry per canonical id. */
export function listRealtimeVoiceProviders(cfg?: OpenClawConfig): RealtimeVoiceProviderPlugin[] {
  const { canonical } = buildProviderMaps(cfg);
  return Array.from(canonical.values());
}
/**
 * Look up a voice provider by id or alias (case/whitespace insensitive).
 * Returns undefined for blank ids or unknown providers.
 */
export function getRealtimeVoiceProvider(
  providerId: string | undefined,
  cfg?: OpenClawConfig,
): RealtimeVoiceProviderPlugin | undefined {
  const lookupId = normalizeRealtimeVoiceProviderId(providerId);
  if (lookupId === undefined) {
    return undefined;
  }
  const { aliases } = buildProviderMaps(cfg);
  return aliases.get(lookupId);
}
/**
 * Resolve an id or alias to the provider's declared canonical id. Falls back
 * to the normalized input when no matching provider is registered.
 */
export function canonicalizeRealtimeVoiceProviderId(
  providerId: string | undefined,
  cfg?: OpenClawConfig,
): RealtimeVoiceProviderId | undefined {
  const lookupId = normalizeRealtimeVoiceProviderId(providerId);
  if (lookupId === undefined) {
    return undefined;
  }
  const match = getRealtimeVoiceProvider(lookupId, cfg);
  return match?.id ?? lookupId;
}

View File

@ -0,0 +1,66 @@
import type { OpenClawConfig } from "../config/config.js";

/**
 * Id of a realtime (duplex) voice provider. Any string is allowed so
 * plugins can introduce new providers; the registry normalizes ids to
 * lowercase for lookup.
 */
export type RealtimeVoiceProviderId = string;

/** Speaker attribution for transcript events. */
export type RealtimeVoiceRole = "user" | "assistant";

/** Why a bridge closed: normal completion or a terminal error. */
export type RealtimeVoiceCloseReason = "completed" | "error";

/** JSON-schema-style description of a function tool exposed to the voice model. */
export type RealtimeVoiceTool = {
  type: "function";
  name: string;
  description: string;
  // Object schema describing the tool's arguments.
  parameters: {
    type: "object";
    properties: Record<string, unknown>;
    required?: string[];
  };
};

/** A tool invocation emitted by the provider during a live session. */
export type RealtimeVoiceToolCallEvent = {
  itemId: string;
  // Correlates with submitToolResult on the bridge.
  callId: string;
  name: string;
  // Raw arguments; shape depends on the tool's parameter schema.
  args: unknown;
};

/** Event callbacks a bridge consumer supplies to receive session output. */
export type RealtimeVoiceBridgeCallbacks = {
  // Outbound audio chunk — the parameter name indicates mu-law encoding.
  onAudio: (muLaw: Buffer) => void;
  // Discard buffered outbound audio — presumably for barge-in; confirm.
  onClearAudio: () => void;
  // NOTE(review): mark names look like telephony media-stream "mark" events — confirm.
  onMark?: (markName: string) => void;
  // Incremental or final transcript text for either party.
  onTranscript?: (role: RealtimeVoiceRole, text: string, isFinal: boolean) => void;
  // The model requested a tool call.
  onToolCall?: (event: RealtimeVoiceToolCallEvent) => void;
  onReady?: () => void;
  onError?: (error: Error) => void;
  onClose?: (reason: RealtimeVoiceCloseReason) => void;
};

/** Opaque provider-specific configuration bag; the provider interprets the keys. */
export type RealtimeVoiceProviderConfig = Record<string, unknown>;

/** Context handed to a provider's resolveConfig hook. */
export type RealtimeVoiceProviderResolveConfigContext = {
  // Full OpenClaw configuration.
  cfg: OpenClawConfig;
  // Provider config prior to resolution — presumably as supplied by the user; confirm.
  rawConfig: RealtimeVoiceProviderConfig;
};

/** Context handed to a provider's isConfigured check. */
export type RealtimeVoiceProviderConfiguredContext = {
  cfg?: OpenClawConfig;
  providerConfig: RealtimeVoiceProviderConfig;
};

/** Callbacks plus provider config and session options needed to create a bridge. */
export type RealtimeVoiceBridgeCreateRequest = RealtimeVoiceBridgeCallbacks & {
  providerConfig: RealtimeVoiceProviderConfig;
  // Optional system instructions for the voice model.
  instructions?: string;
  // Function tools the model may call during the session.
  tools?: RealtimeVoiceTool[];
};

/** Duplex connection to a realtime voice provider for a single session. */
export type RealtimeVoiceBridge = {
  connect(): Promise<void>;
  // Feed inbound (caller) audio into the provider.
  sendAudio(audio: Buffer): void;
  // NOTE(review): ts presumably tracks the inbound media-stream timestamp — confirm.
  setMediaTimestamp(ts: number): void;
  // Inject a text message on the user's behalf (optional capability).
  sendUserMessage?(text: string): void;
  // Ask the assistant to speak first, optionally with override instructions.
  triggerGreeting?(instructions?: string): void;
  // Return a tool call's result to the model, keyed by callId.
  submitToolResult(callId: string, result: unknown): void;
  acknowledgeMark(): void;
  close(): void;
  isConnected(): boolean;
};

View File

@ -27,6 +27,8 @@ export const createTestRegistry = (channels: TestChannelRegistration[] = []): Pl
})),
providers: [],
speechProviders: [],
realtimeTranscriptionProviders: [],
realtimeVoiceProviders: [],
mediaUnderstandingProviders: [],
imageGenerationProviders: [],
webFetchProviders: [],

View File

@ -20,6 +20,8 @@ export function createTestPluginApi(api: TestPluginApiInput): OpenClawPluginApi
registerCliBackend() {},
registerProvider() {},
registerSpeechProvider() {},
registerRealtimeTranscriptionProvider() {},
registerRealtimeVoiceProvider() {},
registerMediaUnderstandingProvider() {},
registerImageGenerationProvider() {},
registerWebFetchProvider() {},

View File

@ -92,6 +92,8 @@ export const pluginRegistrationContractCases = {
pluginId: "openai",
providerIds: ["openai", "openai-codex"],
speechProviderIds: ["openai"],
realtimeTranscriptionProviderIds: ["openai"],
realtimeVoiceProviderIds: ["openai"],
mediaUnderstandingProviderIds: ["openai", "openai-codex"],
imageGenerationProviderIds: ["openai"],
cliBackendIds: ["codex-cli"],

View File

@ -13,6 +13,8 @@ type PluginRegistrationContractParams = {
webFetchProviderIds?: string[];
webSearchProviderIds?: string[];
speechProviderIds?: string[];
realtimeTranscriptionProviderIds?: string[];
realtimeVoiceProviderIds?: string[];
mediaUnderstandingProviderIds?: string[];
imageGenerationProviderIds?: string[];
cliBackendIds?: string[];
@ -122,6 +124,22 @@ export function describePluginRegistrationContract(params: PluginRegistrationCon
});
}
if (params.realtimeTranscriptionProviderIds) {
it("keeps bundled realtime-transcription ownership explicit", () => {
expect(findRegistration(params.pluginId).realtimeTranscriptionProviderIds).toEqual(
params.realtimeTranscriptionProviderIds,
);
});
}
if (params.realtimeVoiceProviderIds) {
it("keeps bundled realtime-voice ownership explicit", () => {
expect(findRegistration(params.pluginId).realtimeVoiceProviderIds).toEqual(
params.realtimeVoiceProviderIds,
);
});
}
if (params.mediaUnderstandingProviderIds) {
it("keeps bundled media-understanding ownership explicit", () => {
expect(findRegistration(params.pluginId).mediaUnderstandingProviderIds).toEqual(

View File

@ -110,6 +110,8 @@ function createTestRegistryForSetup(
})),
providers: [],
speechProviders: [],
realtimeTranscriptionProviders: [],
realtimeVoiceProviders: [],
mediaUnderstandingProviders: [],
imageGenerationProviders: [],
webFetchProviders: [],

View File

@ -1,23 +1,23 @@
import { createScopedVitestConfig } from "./vitest.scoped-config.ts";
import { boundaryTestFiles } from "./vitest.unit-paths.mjs";
import { defineConfig } from "vitest/config";
import { sharedVitestConfig } from "./vitest.shared.config.ts";
export function createContractsVitestConfig(env?: Record<string, string | undefined>) {
return createScopedVitestConfig(
[
"src/channels/plugins/contracts/**/*.test.ts",
"src/config/doc-baseline.integration.test.ts",
"src/config/schema.base.generated.test.ts",
"src/config/schema.help.quality.test.ts",
"src/plugins/contracts/**/*.test.ts",
"test/**/*.test.ts",
],
{
env,
exclude: boundaryTestFiles,
name: "contracts",
const base = sharedVitestConfig as Record<string, unknown>;
const baseTest = sharedVitestConfig.test ?? {};
export function createContractsVitestConfig() {
return defineConfig({
...base,
test: {
...baseTest,
isolate: true,
setupFiles: baseTest.setupFiles ?? [],
include: [
"src/channels/plugins/contracts/**/*.test.ts",
"src/plugins/contracts/**/*.test.ts",
],
passWithNoTests: true,
},
);
});
}
export default createContractsVitestConfig();