mirror of https://github.com/openclaw/openclaw.git
refactor: move voice-call realtime providers into extensions
This commit is contained in:
parent
61f93540b2
commit
a23ab9b906
|
|
@ -48,6 +48,7 @@ Docs: https://docs.openclaw.ai
|
|||
- Telegram/native commands: clean up metadata-driven progress placeholders when replies fall back, edits fail, or local exec approval prompts are suppressed. (#59300) Thanks @jalehman.
|
||||
- Matrix/backup reset: recreate secret storage during backup reset when stale SSSS state blocks durable backup-key reload, including no-backup repair paths. (#60599) Thanks @emonty.
|
||||
- Matrix: allow secret-storage recreation during automatic repair bootstrap so clients that lose their recovery key can recover and persist new cross-signing keys. (#59846) Thanks @al3mart.
|
||||
- Voice Call/OpenAI: move realtime voice and realtime transcription onto provider-owned plugin capabilities so `voice-call` uses generic provider selection while keeping realtime Twilio replay and custom webhook-path handling working.
|
||||
- Matrix/crypto persistence: capture and write the IndexedDB snapshot while holding the snapshot file lock so concurrent gateway and CLI persists cannot overwrite newer crypto state. (#59851) Thanks @al3mart.
|
||||
- Matrix/media: surface a dedicated `[matrix <kind> attachment too large]` marker for oversized inbound media instead of the generic unavailable marker, and classify size-limit failures with a typed Matrix error. (#60289) Thanks @efe-arv.
|
||||
- Matrix/Telegram exec approvals: recover stored same-channel account bindings even when session reply state drifted to another channel, so foreign-channel approvals route to the bound account instead of fanning out or being rejected as ambiguous. (#60417) Thanks @gumadeiras.
|
||||
|
|
|
|||
|
|
@ -32,6 +32,7 @@ native OpenClaw plugin registers against one or more capability types:
|
|||
| Text inference | `api.registerProvider(...)` | `openai`, `anthropic` |
|
||||
| CLI inference backend | `api.registerCliBackend(...)` | `openai`, `anthropic` |
|
||||
| Speech | `api.registerSpeechProvider(...)` | `elevenlabs`, `microsoft` |
|
||||
| Realtime voice | `api.registerRealtimeVoiceProvider(...)` | `openai` |
|
||||
| Media understanding | `api.registerMediaUnderstandingProvider(...)` | `openai`, `google` |
|
||||
| Image generation | `api.registerImageGenerationProvider(...)` | `openai`, `google` |
|
||||
| Web search | `api.registerWebSearchProvider(...)` | `google` |
|
||||
|
|
@ -239,8 +240,9 @@ Examples:
|
|||
- the bundled `minimax`, `mistral`, `moonshot`, and `zai` plugins own their
|
||||
media-understanding backends
|
||||
- the `voice-call` plugin is a feature plugin: it owns call transport, tools,
|
||||
CLI, routes, and runtime, but it consumes core TTS/STT capability instead of
|
||||
inventing a second speech stack
|
||||
CLI, routes, and Twilio media-stream bridging, but it consumes shared speech
|
||||
plus realtime-transcription and realtime-voice capabilities instead of
|
||||
importing vendor plugins directly
|
||||
|
||||
The intended end state is:
|
||||
|
||||
|
|
|
|||
|
|
@ -146,6 +146,7 @@ A single plugin can register any number of capabilities via the `api` object:
|
|||
| CLI inference backend | `api.registerCliBackend(...)` | [CLI Backends](/gateway/cli-backends) |
|
||||
| Channel / messaging | `api.registerChannel(...)` | [Channel Plugins](/plugins/sdk-channel-plugins) |
|
||||
| Speech (TTS/STT) | `api.registerSpeechProvider(...)` | [Provider Plugins](/plugins/sdk-provider-plugins#step-5-add-extra-capabilities) |
|
||||
| Realtime voice | `api.registerRealtimeVoiceProvider(...)` | [Provider Plugins](/plugins/sdk-provider-plugins#step-5-add-extra-capabilities) |
|
||||
| Media understanding | `api.registerMediaUnderstandingProvider(...)` | [Provider Plugins](/plugins/sdk-provider-plugins#step-5-add-extra-capabilities) |
|
||||
| Image generation | `api.registerImageGenerationProvider(...)` | [Provider Plugins](/plugins/sdk-provider-plugins#step-5-add-extra-capabilities) |
|
||||
| Web search | `api.registerWebSearchProvider(...)` | [Provider Plugins](/plugins/sdk-provider-plugins#step-5-add-extra-capabilities) |
|
||||
|
|
|
|||
|
|
@ -196,6 +196,8 @@ read without importing the plugin runtime.
|
|||
{
|
||||
"contracts": {
|
||||
"speechProviders": ["openai"],
|
||||
"realtimeTranscriptionProviders": ["openai"],
|
||||
"realtimeVoiceProviders": ["openai"],
|
||||
"mediaUnderstandingProviders": ["openai", "openai-codex"],
|
||||
"imageGenerationProviders": ["openai"],
|
||||
"webSearchProviders": ["gemini"],
|
||||
|
|
@ -206,13 +208,15 @@ read without importing the plugin runtime.
|
|||
|
||||
Each list is optional:
|
||||
|
||||
| Field | Type | What it means |
|
||||
| ----------------------------- | ---------- | -------------------------------------------------------------- |
|
||||
| `speechProviders` | `string[]` | Speech provider ids this plugin owns. |
|
||||
| `mediaUnderstandingProviders` | `string[]` | Media-understanding provider ids this plugin owns. |
|
||||
| `imageGenerationProviders` | `string[]` | Image-generation provider ids this plugin owns. |
|
||||
| `webSearchProviders` | `string[]` | Web-search provider ids this plugin owns. |
|
||||
| `tools` | `string[]` | Agent tool names this plugin owns for bundled contract checks. |
|
||||
| Field | Type | What it means |
|
||||
| -------------------------------- | ---------- | -------------------------------------------------------------- |
|
||||
| `speechProviders` | `string[]` | Speech provider ids this plugin owns. |
|
||||
| `realtimeTranscriptionProviders` | `string[]` | Realtime-transcription provider ids this plugin owns. |
|
||||
| `realtimeVoiceProviders` | `string[]` | Realtime-voice provider ids this plugin owns. |
|
||||
| `mediaUnderstandingProviders` | `string[]` | Media-understanding provider ids this plugin owns. |
|
||||
| `imageGenerationProviders` | `string[]` | Image-generation provider ids this plugin owns. |
|
||||
| `webSearchProviders` | `string[]` | Web-search provider ids this plugin owns. |
|
||||
| `tools` | `string[]` | Agent tool names this plugin owns for bundled contract checks. |
|
||||
|
||||
Legacy top-level `speechProviders`, `mediaUnderstandingProviders`, and
|
||||
`imageGenerationProviders` are deprecated. Use `openclaw doctor --fix` to move
|
||||
|
|
|
|||
|
|
@ -128,15 +128,17 @@ methods:
|
|||
|
||||
### Capability registration
|
||||
|
||||
| Method | What it registers |
|
||||
| --------------------------------------------- | ------------------------------ |
|
||||
| `api.registerProvider(...)` | Text inference (LLM) |
|
||||
| `api.registerCliBackend(...)` | Local CLI inference backend |
|
||||
| `api.registerChannel(...)` | Messaging channel |
|
||||
| `api.registerSpeechProvider(...)` | Text-to-speech / STT synthesis |
|
||||
| `api.registerMediaUnderstandingProvider(...)` | Image/audio/video analysis |
|
||||
| `api.registerImageGenerationProvider(...)` | Image generation |
|
||||
| `api.registerWebSearchProvider(...)` | Web search |
|
||||
| Method | What it registers |
|
||||
| ------------------------------------------------ | -------------------------------- |
|
||||
| `api.registerProvider(...)` | Text inference (LLM) |
|
||||
| `api.registerCliBackend(...)` | Local CLI inference backend |
|
||||
| `api.registerChannel(...)` | Messaging channel |
|
||||
| `api.registerSpeechProvider(...)` | Text-to-speech / STT synthesis |
|
||||
| `api.registerRealtimeTranscriptionProvider(...)` | Streaming realtime transcription |
|
||||
| `api.registerRealtimeVoiceProvider(...)` | Duplex realtime voice sessions |
|
||||
| `api.registerMediaUnderstandingProvider(...)` | Image/audio/video analysis |
|
||||
| `api.registerImageGenerationProvider(...)` | Image generation |
|
||||
| `api.registerWebSearchProvider(...)` | Web search |
|
||||
|
||||
### Tools and commands
|
||||
|
||||
|
|
|
|||
|
|
@ -324,8 +324,8 @@ API key auth, and dynamic model resolution.
|
|||
|
||||
<Step title="Add extra capabilities (optional)">
|
||||
<a id="step-5-add-extra-capabilities"></a>
|
||||
A provider plugin can register speech, media understanding, image
|
||||
generation, and web search alongside text inference:
|
||||
A provider plugin can register speech, realtime transcription, realtime voice, media
|
||||
understanding, image generation, and web search alongside text inference:
|
||||
|
||||
```typescript
|
||||
register(api) {
|
||||
|
|
@ -343,6 +343,33 @@ API key auth, and dynamic model resolution.
|
|||
}),
|
||||
});
|
||||
|
||||
api.registerRealtimeTranscriptionProvider({
|
||||
id: "acme-ai",
|
||||
label: "Acme Realtime Transcription",
|
||||
isConfigured: () => true,
|
||||
createSession: (req) => ({
|
||||
connect: async () => {},
|
||||
sendAudio: () => {},
|
||||
close: () => {},
|
||||
isConnected: () => true,
|
||||
}),
|
||||
});
|
||||
|
||||
api.registerRealtimeVoiceProvider({
|
||||
id: "acme-ai",
|
||||
label: "Acme Realtime Voice",
|
||||
isConfigured: ({ providerConfig }) => Boolean(providerConfig.apiKey),
|
||||
createBridge: (req) => ({
|
||||
connect: async () => {},
|
||||
sendAudio: () => {},
|
||||
setMediaTimestamp: () => {},
|
||||
submitToolResult: () => {},
|
||||
acknowledgeMark: () => {},
|
||||
close: () => {},
|
||||
isConnected: () => true,
|
||||
}),
|
||||
});
|
||||
|
||||
api.registerMediaUnderstandingProvider({
|
||||
id: "acme-ai",
|
||||
capabilities: ["image", "audio"],
|
||||
|
|
|
|||
|
|
@ -1 +1,2 @@
|
|||
export { buildAnthropicCliBackend } from "./cli-backend.js";
|
||||
export { anthropicMediaUnderstandingProvider } from "./media-understanding-provider.js";
|
||||
|
|
|
|||
|
|
@ -0,0 +1 @@
|
|||
export { deepgramMediaUnderstandingProvider } from "./media-understanding-provider.js";
|
||||
|
|
@ -0,0 +1 @@
|
|||
export { buildFalImageGenerationProvider } from "./image-generation-provider.js";
|
||||
|
|
@ -1 +1,3 @@
|
|||
export { buildGoogleGeminiCliBackend } from "./cli-backend.js";
|
||||
export { buildGoogleImageGenerationProvider } from "./image-generation-provider.js";
|
||||
export { googleMediaUnderstandingProvider } from "./media-understanding-provider.js";
|
||||
|
|
|
|||
|
|
@ -0,0 +1 @@
|
|||
export { groqMediaUnderstandingProvider } from "./media-understanding-provider.js";
|
||||
|
|
@ -47,6 +47,8 @@ function fakeApi(overrides: Partial<OpenClawPluginApi> = {}): OpenClawPluginApi
|
|||
registerCliBackend() {},
|
||||
registerProvider() {},
|
||||
registerSpeechProvider() {},
|
||||
registerRealtimeTranscriptionProvider() {},
|
||||
registerRealtimeVoiceProvider() {},
|
||||
registerMediaUnderstandingProvider() {},
|
||||
registerImageGenerationProvider() {},
|
||||
registerWebFetchProvider() {},
|
||||
|
|
|
|||
|
|
@ -31,7 +31,7 @@ export {
|
|||
isTrustedProxyAddress,
|
||||
} from "openclaw/plugin-sdk/core";
|
||||
export { buildComputedAccountStatusSnapshot } from "openclaw/plugin-sdk/channel-status";
|
||||
export { createAccountStatusSink } from "openclaw/plugin-sdk/compat";
|
||||
export { createAccountStatusSink } from "openclaw/plugin-sdk/channel-lifecycle";
|
||||
export { buildAgentMediaPayload } from "openclaw/plugin-sdk/agent-media-payload";
|
||||
export {
|
||||
buildModelsProviderData,
|
||||
|
|
|
|||
|
|
@ -0,0 +1,8 @@
|
|||
export {
|
||||
buildMinimaxImageGenerationProvider,
|
||||
buildMinimaxPortalImageGenerationProvider,
|
||||
} from "./image-generation-provider.js";
|
||||
export {
|
||||
minimaxMediaUnderstandingProvider,
|
||||
minimaxPortalMediaUnderstandingProvider,
|
||||
} from "./media-understanding-provider.js";
|
||||
|
|
@ -0,0 +1 @@
|
|||
export { mistralMediaUnderstandingProvider } from "./media-understanding-provider.js";
|
||||
|
|
@ -1 +1,2 @@
|
|||
export { __testing } from "./src/kimi-web-search-provider.js";
|
||||
export { moonshotMediaUnderstandingProvider } from "./media-understanding-provider.js";
|
||||
|
|
|
|||
|
|
@ -11,3 +11,5 @@ export {
|
|||
} from "./default-models.js";
|
||||
export { buildOpenAICodexProvider } from "./openai-codex-catalog.js";
|
||||
export { buildOpenAIProvider } from "./openai-provider.js";
|
||||
export { buildOpenAIRealtimeTranscriptionProvider } from "./realtime-transcription-provider.js";
|
||||
export { buildOpenAIRealtimeVoiceProvider } from "./realtime-voice-provider.js";
|
||||
|
|
|
|||
|
|
@ -12,6 +12,8 @@ import {
|
|||
resolveOpenAIPromptOverlayMode,
|
||||
shouldApplyOpenAIPromptOverlay,
|
||||
} from "./prompt-overlay.js";
|
||||
import { buildOpenAIRealtimeTranscriptionProvider } from "./realtime-transcription-provider.js";
|
||||
import { buildOpenAIRealtimeVoiceProvider } from "./realtime-voice-provider.js";
|
||||
import { buildOpenAISpeechProvider } from "./speech-provider.js";
|
||||
|
||||
export default definePluginEntry({
|
||||
|
|
@ -24,6 +26,8 @@ export default definePluginEntry({
|
|||
api.registerProvider(buildOpenAIProvider());
|
||||
api.registerProvider(buildOpenAICodexProviderPlugin());
|
||||
api.registerSpeechProvider(buildOpenAISpeechProvider());
|
||||
api.registerRealtimeTranscriptionProvider(buildOpenAIRealtimeTranscriptionProvider());
|
||||
api.registerRealtimeVoiceProvider(buildOpenAIRealtimeVoiceProvider());
|
||||
api.registerMediaUnderstandingProvider(openaiMediaUnderstandingProvider);
|
||||
api.registerMediaUnderstandingProvider(openaiCodexMediaUnderstandingProvider);
|
||||
api.registerImageGenerationProvider(buildOpenAIImageGenerationProvider());
|
||||
|
|
|
|||
|
|
@ -103,16 +103,16 @@ describe("openai codex provider", () => {
|
|||
api: "openai-codex-responses",
|
||||
baseUrl: "https://chatgpt.com/backend-api",
|
||||
reasoning: true,
|
||||
input: ["text", "image"],
|
||||
input: ["text", "image"] as const,
|
||||
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
|
||||
contextWindow: 272_000,
|
||||
maxTokens: 128_000,
|
||||
};
|
||||
}
|
||||
return null;
|
||||
},
|
||||
return undefined;
|
||||
}),
|
||||
} as never,
|
||||
} as never);
|
||||
});
|
||||
|
||||
expect(model).toMatchObject({
|
||||
id: "gpt-5.4",
|
||||
|
|
@ -173,7 +173,7 @@ describe("openai codex provider", () => {
|
|||
contextWindow: 272_000,
|
||||
},
|
||||
],
|
||||
});
|
||||
} as never);
|
||||
|
||||
expect(entries).toContainEqual(
|
||||
expect.objectContaining({
|
||||
|
|
|
|||
|
|
@ -34,6 +34,8 @@
|
|||
],
|
||||
"contracts": {
|
||||
"speechProviders": ["openai"],
|
||||
"realtimeTranscriptionProviders": ["openai"],
|
||||
"realtimeVoiceProviders": ["openai"],
|
||||
"mediaUnderstandingProviders": ["openai", "openai-codex"],
|
||||
"imageGenerationProviders": ["openai"]
|
||||
},
|
||||
|
|
|
|||
|
|
@ -4,6 +4,9 @@
|
|||
"private": true,
|
||||
"description": "OpenClaw OpenAI provider plugins",
|
||||
"type": "module",
|
||||
"dependencies": {
|
||||
"ws": "^8.20.0"
|
||||
},
|
||||
"openclaw": {
|
||||
"extensions": [
|
||||
"./index.ts"
|
||||
|
|
|
|||
|
|
@ -0,0 +1,27 @@
|
|||
import { describe, expect, it } from "vitest";
|
||||
import { buildOpenAIRealtimeTranscriptionProvider } from "./realtime-transcription-provider.js";
|
||||
|
||||
describe("buildOpenAIRealtimeTranscriptionProvider", () => {
|
||||
it("normalizes OpenAI config defaults", () => {
|
||||
const provider = buildOpenAIRealtimeTranscriptionProvider();
|
||||
const resolved = provider.resolveConfig?.({
|
||||
cfg: {} as never,
|
||||
rawConfig: {
|
||||
providers: {
|
||||
openai: {
|
||||
apiKey: "sk-test", // pragma: allowlist secret
|
||||
},
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
expect(resolved).toEqual({
|
||||
apiKey: "sk-test",
|
||||
});
|
||||
});
|
||||
|
||||
it("accepts the legacy openai-realtime alias", () => {
|
||||
const provider = buildOpenAIRealtimeTranscriptionProvider();
|
||||
expect(provider.aliases).toContain("openai-realtime");
|
||||
});
|
||||
});
|
||||
|
|
@ -0,0 +1,267 @@
|
|||
import type {
|
||||
RealtimeTranscriptionProviderConfig,
|
||||
RealtimeTranscriptionProviderPlugin,
|
||||
RealtimeTranscriptionSession,
|
||||
RealtimeTranscriptionSessionCreateRequest,
|
||||
} from "openclaw/plugin-sdk/realtime-transcription";
|
||||
import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input";
|
||||
import WebSocket from "ws";
|
||||
|
||||
type OpenAIRealtimeTranscriptionProviderConfig = {
|
||||
apiKey?: string;
|
||||
model?: string;
|
||||
silenceDurationMs?: number;
|
||||
vadThreshold?: number;
|
||||
};
|
||||
|
||||
type OpenAIRealtimeTranscriptionSessionConfig = RealtimeTranscriptionSessionCreateRequest & {
|
||||
apiKey: string;
|
||||
model: string;
|
||||
silenceDurationMs: number;
|
||||
vadThreshold: number;
|
||||
};
|
||||
|
||||
type RealtimeEvent = {
|
||||
type: string;
|
||||
delta?: string;
|
||||
transcript?: string;
|
||||
error?: unknown;
|
||||
};
|
||||
|
||||
function trimToUndefined(value: unknown): string | undefined {
|
||||
return typeof value === "string" && value.trim() ? value.trim() : undefined;
|
||||
}
|
||||
|
||||
function asNumber(value: unknown): number | undefined {
|
||||
return typeof value === "number" && Number.isFinite(value) ? value : undefined;
|
||||
}
|
||||
|
||||
function asObject(value: unknown): Record<string, unknown> | undefined {
|
||||
return typeof value === "object" && value !== null && !Array.isArray(value)
|
||||
? (value as Record<string, unknown>)
|
||||
: undefined;
|
||||
}
|
||||
|
||||
function normalizeProviderConfig(
|
||||
config: RealtimeTranscriptionProviderConfig,
|
||||
): OpenAIRealtimeTranscriptionProviderConfig {
|
||||
const providers = asObject(config.providers);
|
||||
const raw = asObject(providers?.openai) ?? asObject(config.openai) ?? asObject(config);
|
||||
return {
|
||||
apiKey:
|
||||
normalizeResolvedSecretInputString({
|
||||
value: raw?.apiKey,
|
||||
path: "plugins.entries.voice-call.config.streaming.providers.openai.apiKey",
|
||||
}) ??
|
||||
normalizeResolvedSecretInputString({
|
||||
value: raw?.openaiApiKey,
|
||||
path: "plugins.entries.voice-call.config.streaming.openaiApiKey",
|
||||
}),
|
||||
model: trimToUndefined(raw?.model) ?? trimToUndefined(raw?.sttModel),
|
||||
silenceDurationMs: asNumber(raw?.silenceDurationMs),
|
||||
vadThreshold: asNumber(raw?.vadThreshold),
|
||||
};
|
||||
}
|
||||
|
||||
function readProviderConfig(
|
||||
providerConfig: RealtimeTranscriptionProviderConfig,
|
||||
): OpenAIRealtimeTranscriptionProviderConfig {
|
||||
return normalizeProviderConfig(providerConfig);
|
||||
}
|
||||
|
||||
class OpenAIRealtimeTranscriptionSession implements RealtimeTranscriptionSession {
|
||||
private static readonly MAX_RECONNECT_ATTEMPTS = 5;
|
||||
private static readonly RECONNECT_DELAY_MS = 1000;
|
||||
private static readonly CONNECT_TIMEOUT_MS = 10_000;
|
||||
|
||||
private ws: WebSocket | null = null;
|
||||
private connected = false;
|
||||
private closed = false;
|
||||
private reconnectAttempts = 0;
|
||||
private pendingTranscript = "";
|
||||
|
||||
constructor(private readonly config: OpenAIRealtimeTranscriptionSessionConfig) {}
|
||||
|
||||
async connect(): Promise<void> {
|
||||
this.closed = false;
|
||||
this.reconnectAttempts = 0;
|
||||
await this.doConnect();
|
||||
}
|
||||
|
||||
sendAudio(audio: Buffer): void {
|
||||
if (this.ws?.readyState !== WebSocket.OPEN) {
|
||||
return;
|
||||
}
|
||||
this.sendEvent({
|
||||
type: "input_audio_buffer.append",
|
||||
audio: audio.toString("base64"),
|
||||
});
|
||||
}
|
||||
|
||||
close(): void {
|
||||
this.closed = true;
|
||||
this.connected = false;
|
||||
if (this.ws) {
|
||||
this.ws.close(1000, "Transcription session closed");
|
||||
this.ws = null;
|
||||
}
|
||||
}
|
||||
|
||||
isConnected(): boolean {
|
||||
return this.connected;
|
||||
}
|
||||
|
||||
private async doConnect(): Promise<void> {
|
||||
await new Promise<void>((resolve, reject) => {
|
||||
this.ws = new WebSocket("wss://api.openai.com/v1/realtime?intent=transcription", {
|
||||
headers: {
|
||||
Authorization: `Bearer ${this.config.apiKey}`,
|
||||
"OpenAI-Beta": "realtime=v1",
|
||||
},
|
||||
});
|
||||
|
||||
const connectTimeout = setTimeout(() => {
|
||||
reject(new Error("OpenAI realtime transcription connection timeout"));
|
||||
}, OpenAIRealtimeTranscriptionSession.CONNECT_TIMEOUT_MS);
|
||||
|
||||
this.ws.on("open", () => {
|
||||
clearTimeout(connectTimeout);
|
||||
this.connected = true;
|
||||
this.reconnectAttempts = 0;
|
||||
this.sendEvent({
|
||||
type: "transcription_session.update",
|
||||
session: {
|
||||
input_audio_format: "g711_ulaw",
|
||||
input_audio_transcription: {
|
||||
model: this.config.model,
|
||||
},
|
||||
turn_detection: {
|
||||
type: "server_vad",
|
||||
threshold: this.config.vadThreshold,
|
||||
prefix_padding_ms: 300,
|
||||
silence_duration_ms: this.config.silenceDurationMs,
|
||||
},
|
||||
},
|
||||
});
|
||||
resolve();
|
||||
});
|
||||
|
||||
this.ws.on("message", (data: Buffer) => {
|
||||
try {
|
||||
this.handleEvent(JSON.parse(data.toString()) as RealtimeEvent);
|
||||
} catch (error) {
|
||||
this.config.onError?.(error instanceof Error ? error : new Error(String(error)));
|
||||
}
|
||||
});
|
||||
|
||||
this.ws.on("error", (error) => {
|
||||
if (!this.connected) {
|
||||
clearTimeout(connectTimeout);
|
||||
reject(error);
|
||||
return;
|
||||
}
|
||||
this.config.onError?.(error instanceof Error ? error : new Error(String(error)));
|
||||
});
|
||||
|
||||
this.ws.on("close", () => {
|
||||
this.connected = false;
|
||||
if (this.closed) {
|
||||
return;
|
||||
}
|
||||
void this.attemptReconnect();
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
private async attemptReconnect(): Promise<void> {
|
||||
if (this.closed) {
|
||||
return;
|
||||
}
|
||||
if (this.reconnectAttempts >= OpenAIRealtimeTranscriptionSession.MAX_RECONNECT_ATTEMPTS) {
|
||||
this.config.onError?.(new Error("OpenAI realtime transcription reconnect limit reached"));
|
||||
return;
|
||||
}
|
||||
this.reconnectAttempts += 1;
|
||||
const delay =
|
||||
OpenAIRealtimeTranscriptionSession.RECONNECT_DELAY_MS * 2 ** (this.reconnectAttempts - 1);
|
||||
await new Promise((resolve) => setTimeout(resolve, delay));
|
||||
if (this.closed) {
|
||||
return;
|
||||
}
|
||||
try {
|
||||
await this.doConnect();
|
||||
} catch (error) {
|
||||
this.config.onError?.(error instanceof Error ? error : new Error(String(error)));
|
||||
await this.attemptReconnect();
|
||||
}
|
||||
}
|
||||
|
||||
private handleEvent(event: RealtimeEvent): void {
|
||||
switch (event.type) {
|
||||
case "conversation.item.input_audio_transcription.delta":
|
||||
if (event.delta) {
|
||||
this.pendingTranscript += event.delta;
|
||||
this.config.onPartial?.(this.pendingTranscript);
|
||||
}
|
||||
return;
|
||||
|
||||
case "conversation.item.input_audio_transcription.completed":
|
||||
if (event.transcript) {
|
||||
this.config.onTranscript?.(event.transcript);
|
||||
}
|
||||
this.pendingTranscript = "";
|
||||
return;
|
||||
|
||||
case "input_audio_buffer.speech_started":
|
||||
this.pendingTranscript = "";
|
||||
this.config.onSpeechStart?.();
|
||||
return;
|
||||
|
||||
case "error": {
|
||||
const detail =
|
||||
event.error && typeof event.error === "object" && "message" in event.error
|
||||
? String((event.error as { message?: unknown }).message ?? "Unknown error")
|
||||
: event.error
|
||||
? String(event.error)
|
||||
: "Unknown error";
|
||||
this.config.onError?.(new Error(detail));
|
||||
return;
|
||||
}
|
||||
|
||||
default:
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
private sendEvent(event: unknown): void {
|
||||
if (this.ws?.readyState === WebSocket.OPEN) {
|
||||
this.ws.send(JSON.stringify(event));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
export function buildOpenAIRealtimeTranscriptionProvider(): RealtimeTranscriptionProviderPlugin {
|
||||
return {
|
||||
id: "openai",
|
||||
label: "OpenAI Realtime Transcription",
|
||||
aliases: ["openai-realtime"],
|
||||
autoSelectOrder: 10,
|
||||
resolveConfig: ({ rawConfig }) => normalizeProviderConfig(rawConfig),
|
||||
isConfigured: ({ providerConfig }) =>
|
||||
Boolean(readProviderConfig(providerConfig).apiKey || process.env.OPENAI_API_KEY),
|
||||
createSession: (req) => {
|
||||
const config = readProviderConfig(req.providerConfig);
|
||||
const apiKey = config.apiKey || process.env.OPENAI_API_KEY;
|
||||
if (!apiKey) {
|
||||
throw new Error("OpenAI API key missing");
|
||||
}
|
||||
return new OpenAIRealtimeTranscriptionSession({
|
||||
...req,
|
||||
apiKey,
|
||||
model: config.model ?? "gpt-4o-transcribe",
|
||||
silenceDurationMs: config.silenceDurationMs ?? 800,
|
||||
vadThreshold: config.vadThreshold ?? 0.5,
|
||||
});
|
||||
},
|
||||
};
|
||||
}
|
||||
|
|
@ -0,0 +1,535 @@
|
|||
import type {
|
||||
RealtimeVoiceBridge,
|
||||
RealtimeVoiceBridgeCreateRequest,
|
||||
RealtimeVoiceCloseReason,
|
||||
RealtimeVoiceProviderConfig,
|
||||
RealtimeVoiceProviderPlugin,
|
||||
RealtimeVoiceTool,
|
||||
} from "openclaw/plugin-sdk/realtime-voice";
|
||||
import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input";
|
||||
import WebSocket from "ws";
|
||||
|
||||
export type OpenAIRealtimeVoice =
|
||||
| "alloy"
|
||||
| "ash"
|
||||
| "ballad"
|
||||
| "cedar"
|
||||
| "coral"
|
||||
| "echo"
|
||||
| "marin"
|
||||
| "sage"
|
||||
| "shimmer"
|
||||
| "verse";
|
||||
|
||||
type OpenAIRealtimeVoiceProviderConfig = {
|
||||
apiKey?: string;
|
||||
model?: string;
|
||||
voice?: OpenAIRealtimeVoice;
|
||||
temperature?: number;
|
||||
vadThreshold?: number;
|
||||
silenceDurationMs?: number;
|
||||
prefixPaddingMs?: number;
|
||||
azureEndpoint?: string;
|
||||
azureDeployment?: string;
|
||||
azureApiVersion?: string;
|
||||
};
|
||||
|
||||
type OpenAIRealtimeVoiceBridgeConfig = RealtimeVoiceBridgeCreateRequest & {
|
||||
apiKey: string;
|
||||
model?: string;
|
||||
voice?: OpenAIRealtimeVoice;
|
||||
temperature?: number;
|
||||
vadThreshold?: number;
|
||||
silenceDurationMs?: number;
|
||||
prefixPaddingMs?: number;
|
||||
azureEndpoint?: string;
|
||||
azureDeployment?: string;
|
||||
azureApiVersion?: string;
|
||||
};
|
||||
|
||||
type RealtimeEvent = {
|
||||
type: string;
|
||||
delta?: string;
|
||||
transcript?: string;
|
||||
item_id?: string;
|
||||
call_id?: string;
|
||||
name?: string;
|
||||
error?: unknown;
|
||||
};
|
||||
|
||||
type RealtimeSessionUpdate = {
|
||||
type: "session.update";
|
||||
session: {
|
||||
modalities: string[];
|
||||
instructions?: string;
|
||||
voice: OpenAIRealtimeVoice;
|
||||
input_audio_format: string;
|
||||
output_audio_format: string;
|
||||
turn_detection: {
|
||||
type: "server_vad";
|
||||
threshold: number;
|
||||
prefix_padding_ms: number;
|
||||
silence_duration_ms: number;
|
||||
create_response: boolean;
|
||||
};
|
||||
temperature: number;
|
||||
input_audio_transcription?: { model: string };
|
||||
tools?: RealtimeVoiceTool[];
|
||||
tool_choice?: string;
|
||||
};
|
||||
};
|
||||
|
||||
function trimToUndefined(value: unknown): string | undefined {
|
||||
return typeof value === "string" && value.trim() ? value.trim() : undefined;
|
||||
}
|
||||
|
||||
function asNumber(value: unknown): number | undefined {
|
||||
return typeof value === "number" && Number.isFinite(value) ? value : undefined;
|
||||
}
|
||||
|
||||
function asObject(value: unknown): Record<string, unknown> | undefined {
|
||||
return typeof value === "object" && value !== null && !Array.isArray(value)
|
||||
? (value as Record<string, unknown>)
|
||||
: undefined;
|
||||
}
|
||||
|
||||
function normalizeProviderConfig(
|
||||
config: RealtimeVoiceProviderConfig,
|
||||
): OpenAIRealtimeVoiceProviderConfig {
|
||||
const providers = asObject(config.providers);
|
||||
const raw = asObject(providers?.openai) ?? asObject(config.openai) ?? asObject(config);
|
||||
return {
|
||||
apiKey: normalizeResolvedSecretInputString({
|
||||
value: raw?.apiKey,
|
||||
path: "plugins.entries.voice-call.config.realtime.providers.openai.apiKey",
|
||||
}),
|
||||
model: trimToUndefined(raw?.model),
|
||||
voice: raw?.voice as OpenAIRealtimeVoice | undefined,
|
||||
temperature: asNumber(raw?.temperature),
|
||||
vadThreshold: asNumber(raw?.vadThreshold),
|
||||
silenceDurationMs: asNumber(raw?.silenceDurationMs),
|
||||
prefixPaddingMs: asNumber(raw?.prefixPaddingMs),
|
||||
azureEndpoint: trimToUndefined(raw?.azureEndpoint),
|
||||
azureDeployment: trimToUndefined(raw?.azureDeployment),
|
||||
azureApiVersion: trimToUndefined(raw?.azureApiVersion),
|
||||
};
|
||||
}
|
||||
|
||||
function readProviderConfig(
|
||||
providerConfig: RealtimeVoiceProviderConfig,
|
||||
): OpenAIRealtimeVoiceProviderConfig {
|
||||
return normalizeProviderConfig(providerConfig);
|
||||
}
|
||||
|
||||
function base64ToBuffer(b64: string): Buffer {
|
||||
return Buffer.from(b64, "base64");
|
||||
}
|
||||
|
||||
class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge {
|
||||
private static readonly DEFAULT_MODEL = "gpt-realtime";
|
||||
private static readonly MAX_RECONNECT_ATTEMPTS = 5;
|
||||
private static readonly BASE_RECONNECT_DELAY_MS = 1000;
|
||||
private static readonly CONNECT_TIMEOUT_MS = 10_000;
|
||||
|
||||
private ws: WebSocket | null = null;
|
||||
private connected = false;
|
||||
private intentionallyClosed = false;
|
||||
private reconnectAttempts = 0;
|
||||
private pendingAudio: Buffer[] = [];
|
||||
private markQueue: string[] = [];
|
||||
private responseStartTimestamp: number | null = null;
|
||||
private latestMediaTimestamp = 0;
|
||||
private lastAssistantItemId: string | null = null;
|
||||
private toolCallBuffers = new Map<string, { name: string; callId: string; args: string }>();
|
||||
|
||||
constructor(private readonly config: OpenAIRealtimeVoiceBridgeConfig) {}
|
||||
|
||||
async connect(): Promise<void> {
|
||||
this.intentionallyClosed = false;
|
||||
this.reconnectAttempts = 0;
|
||||
await this.doConnect();
|
||||
}
|
||||
|
||||
sendAudio(audio: Buffer): void {
|
||||
if (!this.connected || this.ws?.readyState !== WebSocket.OPEN) {
|
||||
if (this.pendingAudio.length < 320) {
|
||||
this.pendingAudio.push(audio);
|
||||
}
|
||||
return;
|
||||
}
|
||||
this.sendEvent({
|
||||
type: "input_audio_buffer.append",
|
||||
audio: audio.toString("base64"),
|
||||
});
|
||||
}
|
||||
|
||||
setMediaTimestamp(ts: number): void {
|
||||
this.latestMediaTimestamp = ts;
|
||||
}
|
||||
|
||||
sendUserMessage(text: string): void {
|
||||
this.sendEvent({
|
||||
type: "conversation.item.create",
|
||||
item: {
|
||||
type: "message",
|
||||
role: "user",
|
||||
content: [{ type: "input_text", text }],
|
||||
},
|
||||
});
|
||||
this.sendEvent({ type: "response.create" });
|
||||
}
|
||||
|
||||
triggerGreeting(instructions?: string): void {
|
||||
if (!this.connected || !this.ws) {
|
||||
return;
|
||||
}
|
||||
this.sendEvent({
|
||||
type: "response.create",
|
||||
response: {
|
||||
instructions: instructions ?? this.config.instructions,
|
||||
},
|
||||
});
|
||||
}
|
||||
|
||||
submitToolResult(callId: string, result: unknown): void {
|
||||
this.sendEvent({
|
||||
type: "conversation.item.create",
|
||||
item: {
|
||||
type: "function_call_output",
|
||||
call_id: callId,
|
||||
output: JSON.stringify(result),
|
||||
},
|
||||
});
|
||||
this.sendEvent({ type: "response.create" });
|
||||
}
|
||||
|
||||
acknowledgeMark(): void {
|
||||
if (this.markQueue.length === 0) {
|
||||
return;
|
||||
}
|
||||
this.markQueue.shift();
|
||||
if (this.markQueue.length === 0) {
|
||||
this.responseStartTimestamp = null;
|
||||
this.lastAssistantItemId = null;
|
||||
}
|
||||
}
|
||||
|
||||
close(): void {
|
||||
this.intentionallyClosed = true;
|
||||
this.connected = false;
|
||||
if (this.ws) {
|
||||
this.ws.close(1000, "Bridge closed");
|
||||
this.ws = null;
|
||||
}
|
||||
}
|
||||
|
||||
isConnected(): boolean {
|
||||
return this.connected;
|
||||
}
|
||||
|
||||
private async doConnect(): Promise<void> {
|
||||
await new Promise<void>((resolve, reject) => {
|
||||
const { url, headers } = this.resolveConnectionParams();
|
||||
this.ws = new WebSocket(url, { headers });
|
||||
|
||||
const connectTimeout = setTimeout(() => {
|
||||
reject(new Error("OpenAI realtime connection timeout"));
|
||||
}, OpenAIRealtimeVoiceBridge.CONNECT_TIMEOUT_MS);
|
||||
|
||||
this.ws.on("open", () => {
|
||||
clearTimeout(connectTimeout);
|
||||
this.connected = true;
|
||||
this.reconnectAttempts = 0;
|
||||
this.sendSessionUpdate();
|
||||
for (const chunk of this.pendingAudio.splice(0)) {
|
||||
this.sendAudio(chunk);
|
||||
}
|
||||
this.config.onReady?.();
|
||||
resolve();
|
||||
});
|
||||
|
||||
this.ws.on("message", (data: Buffer) => {
|
||||
try {
|
||||
this.handleEvent(JSON.parse(data.toString()) as RealtimeEvent);
|
||||
} catch (error) {
|
||||
console.error("[openai] realtime event parse failed:", error);
|
||||
}
|
||||
});
|
||||
|
||||
this.ws.on("error", (error) => {
|
||||
if (!this.connected) {
|
||||
clearTimeout(connectTimeout);
|
||||
reject(error);
|
||||
}
|
||||
this.config.onError?.(error instanceof Error ? error : new Error(String(error)));
|
||||
});
|
||||
|
||||
this.ws.on("close", () => {
|
||||
this.connected = false;
|
||||
if (this.intentionallyClosed) {
|
||||
this.config.onClose?.("completed");
|
||||
return;
|
||||
}
|
||||
void this.attemptReconnect();
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Compute the WebSocket URL and auth headers for the configured backend.
 * Three shapes are supported, checked in order:
 *  1. Azure OpenAI with an explicit deployment (`api-key` header).
 *  2. An Azure/OpenAI-compatible endpoint without a deployment (Bearer auth).
 *  3. The public OpenAI realtime endpoint (Bearer auth + beta header).
 */
private resolveConnectionParams(): { url: string; headers: Record<string, string> } {
  const cfg = this.config;
  // http(s):// -> ws(s)://, trailing slash stripped.
  const toWsBase = (endpoint: string): string =>
    endpoint.replace(/\/$/, "").replace(/^http(s?):/, (_, secure: string) => `ws${secure}:`);
  const model = encodeURIComponent(cfg.model ?? OpenAIRealtimeVoiceBridge.DEFAULT_MODEL);

  if (cfg.azureEndpoint && cfg.azureDeployment) {
    const apiVersion = cfg.azureApiVersion ?? "2024-10-01-preview";
    const deployment = encodeURIComponent(cfg.azureDeployment);
    return {
      url: `${toWsBase(cfg.azureEndpoint)}/openai/realtime?api-version=${apiVersion}&deployment=${deployment}`,
      headers: { "api-key": cfg.apiKey },
    };
  }

  if (cfg.azureEndpoint) {
    return {
      url: `${toWsBase(cfg.azureEndpoint)}/v1/realtime?model=${model}`,
      headers: { Authorization: `Bearer ${cfg.apiKey}` },
    };
  }

  return {
    url: `wss://api.openai.com/v1/realtime?model=${model}`,
    headers: {
      Authorization: `Bearer ${cfg.apiKey}`,
      "OpenAI-Beta": "realtime=v1",
    },
  };
}
|
||||
|
||||
private async attemptReconnect(): Promise<void> {
|
||||
if (this.intentionallyClosed) {
|
||||
return;
|
||||
}
|
||||
if (this.reconnectAttempts >= OpenAIRealtimeVoiceBridge.MAX_RECONNECT_ATTEMPTS) {
|
||||
this.config.onClose?.("error");
|
||||
return;
|
||||
}
|
||||
this.reconnectAttempts += 1;
|
||||
const delay =
|
||||
OpenAIRealtimeVoiceBridge.BASE_RECONNECT_DELAY_MS * 2 ** (this.reconnectAttempts - 1);
|
||||
await new Promise((resolve) => setTimeout(resolve, delay));
|
||||
if (this.intentionallyClosed) {
|
||||
return;
|
||||
}
|
||||
try {
|
||||
await this.doConnect();
|
||||
} catch (error) {
|
||||
this.config.onError?.(error instanceof Error ? error : new Error(String(error)));
|
||||
await this.attemptReconnect();
|
||||
}
|
||||
}
|
||||
|
||||
private sendSessionUpdate(): void {
|
||||
const cfg = this.config;
|
||||
const sessionUpdate: RealtimeSessionUpdate = {
|
||||
type: "session.update",
|
||||
session: {
|
||||
modalities: ["text", "audio"],
|
||||
instructions: cfg.instructions,
|
||||
voice: cfg.voice ?? "alloy",
|
||||
input_audio_format: "g711_ulaw",
|
||||
output_audio_format: "g711_ulaw",
|
||||
input_audio_transcription: {
|
||||
model: "whisper-1",
|
||||
},
|
||||
turn_detection: {
|
||||
type: "server_vad",
|
||||
threshold: cfg.vadThreshold ?? 0.5,
|
||||
prefix_padding_ms: cfg.prefixPaddingMs ?? 300,
|
||||
silence_duration_ms: cfg.silenceDurationMs ?? 500,
|
||||
create_response: true,
|
||||
},
|
||||
temperature: cfg.temperature ?? 0.8,
|
||||
...(cfg.tools && cfg.tools.length > 0
|
||||
? {
|
||||
tools: cfg.tools,
|
||||
tool_choice: "auto",
|
||||
}
|
||||
: {}),
|
||||
},
|
||||
};
|
||||
this.sendEvent(sessionUpdate);
|
||||
}
|
||||
|
||||
/**
 * Dispatch a single realtime server event to the bridge callbacks.
 * Unrecognized event types are ignored.
 */
private handleEvent(event: RealtimeEvent): void {
  switch (event.type) {
    case "response.audio.delta": {
      if (!event.delta) {
        return;
      }
      this.config.onAudio(base64ToBuffer(event.delta));
      // The first audio chunk of a response anchors the elapsed-time math
      // used for truncation on barge-in.
      if (this.responseStartTimestamp === null) {
        this.responseStartTimestamp = this.latestMediaTimestamp;
      }
      if (event.item_id) {
        this.lastAssistantItemId = event.item_id;
      }
      this.sendMark();
      return;
    }

    case "input_audio_buffer.speech_started":
      // Caller started talking over the assistant.
      this.handleBargeIn();
      return;

    case "response.audio_transcript.delta":
      if (event.delta) {
        this.config.onTranscript?.("assistant", event.delta, false);
      }
      return;

    case "response.audio_transcript.done":
      if (event.transcript) {
        this.config.onTranscript?.("assistant", event.transcript, true);
      }
      return;

    case "conversation.item.input_audio_transcription.completed":
      if (event.transcript) {
        this.config.onTranscript?.("user", event.transcript, true);
      }
      return;

    case "conversation.item.input_audio_transcription.delta":
      if (event.delta) {
        this.config.onTranscript?.("user", event.delta, false);
      }
      return;

    case "response.function_call_arguments.delta": {
      // Accumulate streamed tool-call argument fragments per item id.
      const bufferKey = event.item_id ?? "unknown";
      const buffer = this.toolCallBuffers.get(bufferKey);
      if (buffer && event.delta) {
        buffer.args += event.delta;
      } else if (event.item_id) {
        this.toolCallBuffers.set(event.item_id, {
          name: event.name ?? "",
          callId: event.call_id ?? "",
          args: event.delta ?? "",
        });
      }
      return;
    }

    case "response.function_call_arguments.done": {
      const bufferKey = event.item_id ?? "unknown";
      const buffered = this.toolCallBuffers.get(bufferKey);
      if (this.config.onToolCall) {
        // Prefer the accumulated buffer; fall back to the event's own
        // `arguments` payload, then to an empty object.
        const rawArgs =
          buffered?.args ||
          ((event as unknown as Record<string, unknown>).arguments as string) ||
          "{}";
        let parsedArgs: unknown = {};
        try {
          parsedArgs = JSON.parse(rawArgs);
        } catch {
          // Malformed tool arguments degrade to an empty object.
        }
        this.config.onToolCall({
          itemId: bufferKey,
          callId: buffered?.callId || event.call_id || "",
          name: buffered?.name || event.name || "",
          args: parsedArgs,
        });
      }
      this.toolCallBuffers.delete(bufferKey);
      return;
    }

    case "error": {
      let detail = "Unknown error";
      if (event.error && typeof event.error === "object" && "message" in event.error) {
        detail = String((event.error as { message?: unknown }).message ?? "Unknown error");
      } else if (event.error) {
        detail = String(event.error);
      }
      this.config.onError?.(new Error(detail));
      return;
    }

    default:
      return;
  }
}
|
||||
|
||||
/**
 * Caller interrupted the assistant mid-playback. If assistant audio is in
 * flight (marks outstanding and a response start time recorded), truncate the
 * in-progress assistant item at the elapsed playback point, then clear the
 * downstream audio buffer and reset the playback bookkeeping. Otherwise just
 * clear the downstream buffer.
 */
private handleBargeIn(): void {
  if (this.markQueue.length === 0 || this.responseStartTimestamp === null) {
    this.config.onClearAudio();
    return;
  }
  const elapsedMs = this.latestMediaTimestamp - this.responseStartTimestamp;
  if (this.lastAssistantItemId) {
    this.sendEvent({
      type: "conversation.item.truncate",
      item_id: this.lastAssistantItemId,
      content_index: 0,
      // Clamp: media clocks can momentarily run behind the response start.
      audio_end_ms: Math.max(0, elapsedMs),
    });
  }
  this.config.onClearAudio();
  this.markQueue = [];
  this.lastAssistantItemId = null;
  this.responseStartTimestamp = null;
}
|
||||
|
||||
private sendMark(): void {
|
||||
const markName = `audio-${Date.now()}`;
|
||||
this.markQueue.push(markName);
|
||||
this.config.onMark?.(markName);
|
||||
}
|
||||
|
||||
private sendEvent(event: unknown): void {
|
||||
if (this.ws?.readyState === WebSocket.OPEN) {
|
||||
this.ws.send(JSON.stringify(event));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
export function buildOpenAIRealtimeVoiceProvider(): RealtimeVoiceProviderPlugin {
|
||||
return {
|
||||
id: "openai",
|
||||
label: "OpenAI Realtime Voice",
|
||||
autoSelectOrder: 10,
|
||||
resolveConfig: ({ rawConfig }) => normalizeProviderConfig(rawConfig),
|
||||
isConfigured: ({ providerConfig }) =>
|
||||
Boolean(readProviderConfig(providerConfig).apiKey || process.env.OPENAI_API_KEY),
|
||||
createBridge: (req) => {
|
||||
const config = readProviderConfig(req.providerConfig);
|
||||
const apiKey = config.apiKey || process.env.OPENAI_API_KEY;
|
||||
if (!apiKey) {
|
||||
throw new Error("OpenAI API key missing");
|
||||
}
|
||||
return new OpenAIRealtimeVoiceBridge({
|
||||
...req,
|
||||
apiKey,
|
||||
model: config.model,
|
||||
voice: config.voice,
|
||||
temperature: config.temperature,
|
||||
vadThreshold: config.vadThreshold,
|
||||
silenceDurationMs: config.silenceDurationMs,
|
||||
prefixPaddingMs: config.prefixPaddingMs,
|
||||
azureEndpoint: config.azureEndpoint,
|
||||
azureDeployment: config.azureDeployment,
|
||||
azureApiVersion: config.azureApiVersion,
|
||||
});
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
export type { OpenAIRealtimeVoiceProviderConfig };
|
||||
|
|
@ -4,4 +4,6 @@ export {
|
|||
openaiCodexMediaUnderstandingProvider,
|
||||
openaiMediaUnderstandingProvider,
|
||||
} from "./media-understanding-provider.js";
|
||||
export { buildOpenAIRealtimeTranscriptionProvider } from "./realtime-transcription-provider.js";
|
||||
export { buildOpenAIRealtimeVoiceProvider } from "./realtime-voice-provider.js";
|
||||
export { buildOpenAISpeechProvider } from "./speech-provider.js";
|
||||
|
|
|
|||
|
|
@ -0,0 +1 @@
|
|||
export { openrouterMediaUnderstandingProvider } from "./media-understanding-provider.js";
|
||||
|
|
@ -145,4 +145,4 @@ Actions:
|
|||
- While a Twilio stream is active, playback does not fall back to TwiML `<Say>`; stream-TTS failures fail the playback request.
|
||||
- Outbound conversation calls suppress barge-in only while the initial greeting is actively speaking, then re-enable normal interruption.
|
||||
- Twilio stream disconnect auto-end uses a short grace window so quick reconnects do not end the call.
|
||||
- Media streaming requires `ws` and OpenAI Realtime API key.
|
||||
- Media streaming requires `ws` plus a configured realtime-transcription provider. The bundled provider today is OpenAI.
|
||||
|
|
|
|||
|
|
@ -72,13 +72,25 @@ const voiceCallConfigSchema = {
|
|||
advanced: true,
|
||||
},
|
||||
"streaming.enabled": { label: "Enable Streaming", advanced: true },
|
||||
"streaming.openaiApiKey": {
|
||||
"streaming.provider": { label: "Streaming Provider", advanced: true },
|
||||
"streaming.providers.openai.apiKey": {
|
||||
label: "OpenAI Realtime API Key",
|
||||
sensitive: true,
|
||||
advanced: true,
|
||||
},
|
||||
"streaming.sttModel": { label: "Realtime STT Model", advanced: true },
|
||||
"streaming.providers.openai.model": { label: "Realtime STT Model", advanced: true },
|
||||
"streaming.streamPath": { label: "Media Stream Path", advanced: true },
|
||||
"realtime.enabled": { label: "Enable Realtime Voice", advanced: true },
|
||||
"realtime.provider": { label: "Realtime Voice Provider", advanced: true },
|
||||
"realtime.streamPath": { label: "Realtime Stream Path", advanced: true },
|
||||
"realtime.instructions": { label: "Realtime Instructions", advanced: true },
|
||||
"realtime.providers.openai.apiKey": {
|
||||
label: "OpenAI Realtime API Key",
|
||||
sensitive: true,
|
||||
advanced: true,
|
||||
},
|
||||
"realtime.providers.openai.model": { label: "OpenAI Realtime Model", advanced: true },
|
||||
"realtime.providers.openai.voice": { label: "OpenAI Realtime Voice", advanced: true },
|
||||
"tts.provider": {
|
||||
label: "TTS Provider Override",
|
||||
help: "Deep-merges with messages.tts (Microsoft is ignored for calls).",
|
||||
|
|
@ -181,6 +193,7 @@ export default definePluginEntry({
|
|||
runtimePromise = createVoiceCallRuntime({
|
||||
config,
|
||||
coreConfig: api.config as CoreConfig,
|
||||
fullConfig: api.config,
|
||||
agentRuntime: api.runtime.agent,
|
||||
ttsRuntime: api.runtime.tts,
|
||||
logger: api.logger,
|
||||
|
|
|
|||
|
|
@ -86,12 +86,16 @@
|
|||
"label": "Enable Streaming",
|
||||
"advanced": true
|
||||
},
|
||||
"streaming.openaiApiKey": {
|
||||
"streaming.provider": {
|
||||
"label": "Streaming Provider",
|
||||
"advanced": true
|
||||
},
|
||||
"streaming.providers.openai.apiKey": {
|
||||
"label": "OpenAI Realtime API Key",
|
||||
"sensitive": true,
|
||||
"advanced": true
|
||||
},
|
||||
"streaming.sttModel": {
|
||||
"streaming.providers.openai.model": {
|
||||
"label": "Realtime STT Model",
|
||||
"advanced": true
|
||||
},
|
||||
|
|
@ -345,9 +349,11 @@
|
|||
"enabled": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"provider": {
|
||||
"type": "string"
|
||||
},
|
||||
"sttProvider": {
|
||||
"type": "string",
|
||||
"enum": ["openai-realtime"]
|
||||
"type": "string"
|
||||
},
|
||||
"openaiApiKey": {
|
||||
"type": "string"
|
||||
|
|
@ -367,6 +373,13 @@
|
|||
"streamPath": {
|
||||
"type": "string"
|
||||
},
|
||||
"providers": {
|
||||
"type": "object",
|
||||
"additionalProperties": {
|
||||
"type": "object",
|
||||
"additionalProperties": true
|
||||
}
|
||||
},
|
||||
"preStartTimeoutMs": {
|
||||
"type": "integer",
|
||||
"minimum": 1
|
||||
|
|
@ -385,6 +398,72 @@
|
|||
}
|
||||
}
|
||||
},
|
||||
"realtime": {
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"enabled": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"provider": {
|
||||
"type": "string"
|
||||
},
|
||||
"streamPath": {
|
||||
"type": "string"
|
||||
},
|
||||
"instructions": {
|
||||
"type": "string"
|
||||
},
|
||||
"tools": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"type": {
|
||||
"type": "string",
|
||||
"enum": ["function"]
|
||||
},
|
||||
"name": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": {
|
||||
"type": "string"
|
||||
},
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"type": {
|
||||
"type": "string",
|
||||
"enum": ["object"]
|
||||
},
|
||||
"properties": {
|
||||
"type": "object",
|
||||
"additionalProperties": true
|
||||
},
|
||||
"required": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": ["type", "properties"]
|
||||
}
|
||||
},
|
||||
"required": ["type", "name", "description", "parameters"]
|
||||
}
|
||||
},
|
||||
"providers": {
|
||||
"type": "object",
|
||||
"additionalProperties": {
|
||||
"type": "object",
|
||||
"additionalProperties": true
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"publicUrl": {
|
||||
"type": "string"
|
||||
},
|
||||
|
|
|
|||
|
|
@ -179,6 +179,35 @@ describe("validateProviderConfig", () => {
|
|||
expect(result.errors).toEqual([]);
|
||||
});
|
||||
});
|
||||
|
||||
describe("realtime config", () => {
|
||||
it("rejects disabled inbound policy for realtime mode", () => {
|
||||
const config = createBaseConfig("twilio");
|
||||
config.realtime.enabled = true;
|
||||
config.inboundPolicy = "disabled";
|
||||
|
||||
const result = validateProviderConfig(config);
|
||||
|
||||
expect(result.valid).toBe(false);
|
||||
expect(result.errors).toContain(
|
||||
'plugins.entries.voice-call.config.inboundPolicy must not be "disabled" when realtime.enabled is true',
|
||||
);
|
||||
});
|
||||
|
||||
it("rejects enabling realtime and streaming together", () => {
|
||||
const config = createBaseConfig("twilio");
|
||||
config.realtime.enabled = true;
|
||||
config.streaming.enabled = true;
|
||||
config.inboundPolicy = "allowlist";
|
||||
|
||||
const result = validateProviderConfig(config);
|
||||
|
||||
expect(result.valid).toBe(false);
|
||||
expect(result.errors).toContain(
|
||||
"plugins.entries.voice-call.config.realtime.enabled and plugins.entries.voice-call.config.streaming.enabled cannot both be true",
|
||||
);
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
describe("normalizeVoiceCallConfig", () => {
|
||||
|
|
@ -194,11 +223,25 @@ describe("normalizeVoiceCallConfig", () => {
|
|||
|
||||
expect(normalized.serve.path).toBe("/voice/webhook");
|
||||
expect(normalized.streaming.streamPath).toBe("/custom-stream");
|
||||
expect(normalized.streaming.sttModel).toBe("gpt-4o-transcribe");
|
||||
expect(normalized.streaming.provider).toBe("openai");
|
||||
expect(normalized.streaming.providers.openai).toEqual({});
|
||||
expect(normalized.realtime.streamPath).toBe("/voice/stream/realtime");
|
||||
expect(normalized.tunnel.provider).toBe("none");
|
||||
expect(normalized.webhookSecurity.allowedHosts).toEqual([]);
|
||||
});
|
||||
|
||||
it("derives the realtime stream path from a custom webhook path", () => {
|
||||
const normalized = normalizeVoiceCallConfig({
|
||||
enabled: true,
|
||||
provider: "twilio",
|
||||
serve: {
|
||||
path: "/custom/webhook",
|
||||
},
|
||||
});
|
||||
|
||||
expect(normalized.realtime.streamPath).toBe("/custom/stream/realtime");
|
||||
});
|
||||
|
||||
it("accepts partial nested TTS overrides and preserves nested objects", () => {
|
||||
const normalized = normalizeVoiceCallConfig({
|
||||
tts: {
|
||||
|
|
|
|||
|
|
@ -70,7 +70,7 @@ export type PlivoConfig = z.infer<typeof PlivoConfigSchema>;
|
|||
|
||||
export const SttConfigSchema = z
|
||||
.object({
|
||||
/** STT provider (currently only OpenAI supported) */
|
||||
/** One-shot STT provider for non-streaming paths. */
|
||||
provider: z.literal("openai").default("openai"),
|
||||
/** Whisper model to use */
|
||||
model: z.string().min(1).default("whisper-1"),
|
||||
|
|
@ -196,25 +196,80 @@ export const OutboundConfigSchema = z
|
|||
export type OutboundConfig = z.infer<typeof OutboundConfigSchema>;
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
// Streaming Configuration (OpenAI Realtime STT)
|
||||
// Realtime Voice Configuration
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
export const RealtimeToolSchema = z
|
||||
.object({
|
||||
type: z.literal("function"),
|
||||
name: z.string().min(1),
|
||||
description: z.string(),
|
||||
parameters: z.object({
|
||||
type: z.literal("object"),
|
||||
properties: z.record(z.string(), z.unknown()),
|
||||
required: z.array(z.string()).optional(),
|
||||
}),
|
||||
})
|
||||
.strict();
|
||||
export type RealtimeToolConfig = z.infer<typeof RealtimeToolSchema>;
|
||||
|
||||
export const VoiceCallRealtimeProvidersConfigSchema = z
|
||||
.record(z.string(), z.record(z.string(), z.unknown()))
|
||||
.default({});
|
||||
export type VoiceCallRealtimeProvidersConfig = z.infer<
|
||||
typeof VoiceCallRealtimeProvidersConfigSchema
|
||||
>;
|
||||
|
||||
export const VoiceCallStreamingProvidersConfigSchema = z
|
||||
.record(z.string(), z.record(z.string(), z.unknown()))
|
||||
.default({});
|
||||
export type VoiceCallStreamingProvidersConfig = z.infer<
|
||||
typeof VoiceCallStreamingProvidersConfigSchema
|
||||
>;
|
||||
|
||||
export const VoiceCallRealtimeConfigSchema = z
|
||||
.object({
|
||||
/** Enable realtime voice-to-voice mode. */
|
||||
enabled: z.boolean().default(false),
|
||||
/** Provider id from registered realtime voice providers. */
|
||||
provider: z.string().min(1).optional(),
|
||||
/** Optional override for the local WebSocket route path. */
|
||||
streamPath: z.string().min(1).optional(),
|
||||
/** System instructions passed to the realtime provider. */
|
||||
instructions: z.string().optional(),
|
||||
/** Tool definitions exposed to the realtime provider. */
|
||||
tools: z.array(RealtimeToolSchema).default([]),
|
||||
/** Provider-owned raw config blobs keyed by provider id. */
|
||||
providers: VoiceCallRealtimeProvidersConfigSchema,
|
||||
})
|
||||
.strict()
|
||||
.default({ enabled: false, tools: [], providers: {} });
|
||||
export type VoiceCallRealtimeConfig = z.infer<typeof VoiceCallRealtimeConfigSchema>;
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
// Streaming Configuration (Realtime Transcription)
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
export const VoiceCallStreamingConfigSchema = z
|
||||
.object({
|
||||
/** Enable real-time audio streaming (requires WebSocket support) */
|
||||
enabled: z.boolean().default(false),
|
||||
/** STT provider for real-time transcription */
|
||||
sttProvider: z.enum(["openai-realtime"]).default("openai-realtime"),
|
||||
/** OpenAI API key for Realtime API (uses OPENAI_API_KEY env if not set) */
|
||||
/** Provider id from registered realtime transcription providers. */
|
||||
provider: z.string().min(1).default("openai"),
|
||||
/** @deprecated Legacy alias for provider. */
|
||||
sttProvider: z.string().min(1).optional(),
|
||||
/** @deprecated Legacy OpenAI-specific API key field. */
|
||||
openaiApiKey: z.string().min(1).optional(),
|
||||
/** OpenAI transcription model (default: gpt-4o-transcribe) */
|
||||
sttModel: z.string().min(1).default("gpt-4o-transcribe"),
|
||||
/** VAD silence duration in ms before considering speech ended */
|
||||
silenceDurationMs: z.number().int().positive().default(800),
|
||||
/** VAD threshold 0-1 (higher = less sensitive) */
|
||||
vadThreshold: z.number().min(0).max(1).default(0.5),
|
||||
/** @deprecated Legacy OpenAI-specific transcription model field. */
|
||||
sttModel: z.string().min(1).optional(),
|
||||
/** @deprecated Legacy OpenAI-specific VAD silence duration. */
|
||||
silenceDurationMs: z.number().int().positive().optional(),
|
||||
/** @deprecated Legacy OpenAI-specific VAD threshold. */
|
||||
vadThreshold: z.number().min(0).max(1).optional(),
|
||||
/** WebSocket path for media stream connections */
|
||||
streamPath: z.string().min(1).default("/voice/stream"),
|
||||
/** Provider-owned raw config blobs keyed by provider id. */
|
||||
providers: VoiceCallStreamingProvidersConfigSchema,
|
||||
/**
|
||||
* Close unauthenticated media stream sockets if no valid `start` frame arrives in time.
|
||||
* Protects against pre-auth idle connection hold attacks.
|
||||
|
|
@ -230,11 +285,9 @@ export const VoiceCallStreamingConfigSchema = z
|
|||
.strict()
|
||||
.default({
|
||||
enabled: false,
|
||||
sttProvider: "openai-realtime",
|
||||
sttModel: "gpt-4o-transcribe",
|
||||
silenceDurationMs: 800,
|
||||
vadThreshold: 0.5,
|
||||
provider: "openai",
|
||||
streamPath: "/voice/stream",
|
||||
providers: {},
|
||||
preStartTimeoutMs: 5000,
|
||||
maxPendingConnections: 32,
|
||||
maxPendingConnectionsPerIp: 4,
|
||||
|
|
@ -319,6 +372,9 @@ export const VoiceCallConfigSchema = z
|
|||
/** Real-time audio streaming configuration */
|
||||
streaming: VoiceCallStreamingConfigSchema,
|
||||
|
||||
/** Realtime voice-to-voice configuration */
|
||||
realtime: VoiceCallRealtimeConfigSchema,
|
||||
|
||||
/** Public webhook URL override (if set, bypasses tunnel auto-detection) */
|
||||
publicUrl: z.string().url().optional(),
|
||||
|
||||
|
|
@ -364,6 +420,29 @@ function cloneDefaultVoiceCallConfig(): VoiceCallConfig {
|
|||
return structuredClone(DEFAULT_VOICE_CALL_CONFIG);
|
||||
}
|
||||
|
||||
function normalizeWebhookLikePath(pathname: string): string {
|
||||
const trimmed = pathname.trim();
|
||||
if (!trimmed) {
|
||||
return "/";
|
||||
}
|
||||
const prefixed = trimmed.startsWith("/") ? trimmed : `/${trimmed}`;
|
||||
if (prefixed === "/") {
|
||||
return prefixed;
|
||||
}
|
||||
return prefixed.endsWith("/") ? prefixed.slice(0, -1) : prefixed;
|
||||
}
|
||||
|
||||
function defaultRealtimeStreamPathForServePath(servePath: string): string {
|
||||
const normalized = normalizeWebhookLikePath(servePath);
|
||||
if (normalized.endsWith("/webhook")) {
|
||||
return `${normalized.slice(0, -"/webhook".length)}/stream/realtime`;
|
||||
}
|
||||
if (normalized === "/") {
|
||||
return "/voice/stream/realtime";
|
||||
}
|
||||
return `${normalized}/stream/realtime`;
|
||||
}
|
||||
|
||||
function normalizeVoiceCallTtsConfig(
|
||||
defaults: VoiceCallTtsConfig,
|
||||
overrides: DeepPartial<NonNullable<VoiceCallTtsConfig>> | undefined,
|
||||
|
|
@ -375,14 +454,55 @@ function normalizeVoiceCallTtsConfig(
|
|||
return TtsConfigSchema.parse(deepMergeDefined(defaults ?? {}, overrides ?? {}));
|
||||
}
|
||||
|
||||
function sanitizeVoiceCallProviderConfigs(
|
||||
value: Record<string, Record<string, unknown> | undefined> | undefined,
|
||||
): Record<string, Record<string, unknown>> {
|
||||
if (!value) {
|
||||
return {};
|
||||
}
|
||||
return Object.fromEntries(
|
||||
Object.entries(value).filter(
|
||||
(entry): entry is [string, Record<string, unknown>] => entry[1] !== undefined,
|
||||
),
|
||||
);
|
||||
}
|
||||
|
||||
export function normalizeVoiceCallConfig(config: VoiceCallConfigInput): VoiceCallConfig {
|
||||
const defaults = cloneDefaultVoiceCallConfig();
|
||||
const serve = { ...defaults.serve, ...config.serve };
|
||||
const streamingProvider =
|
||||
config.streaming?.provider ??
|
||||
(typeof config.streaming?.sttProvider === "string"
|
||||
? config.streaming.sttProvider
|
||||
: undefined) ??
|
||||
defaults.streaming.provider;
|
||||
const streamingProviders = sanitizeVoiceCallProviderConfigs(
|
||||
config.streaming?.providers ?? defaults.streaming.providers,
|
||||
);
|
||||
if (
|
||||
typeof streamingProvider === "string" &&
|
||||
streamingProvider.trim() &&
|
||||
!(streamingProvider in streamingProviders)
|
||||
) {
|
||||
streamingProviders[streamingProvider] = {};
|
||||
}
|
||||
const realtimeProvider = config.realtime?.provider ?? defaults.realtime.provider;
|
||||
const realtimeProviders = sanitizeVoiceCallProviderConfigs(
|
||||
config.realtime?.providers ?? defaults.realtime.providers,
|
||||
);
|
||||
if (
|
||||
typeof realtimeProvider === "string" &&
|
||||
realtimeProvider.trim() &&
|
||||
!(realtimeProvider in realtimeProviders)
|
||||
) {
|
||||
realtimeProviders[realtimeProvider] = {};
|
||||
}
|
||||
return {
|
||||
...defaults,
|
||||
...config,
|
||||
allowFrom: config.allowFrom ?? defaults.allowFrom,
|
||||
outbound: { ...defaults.outbound, ...config.outbound },
|
||||
serve: { ...defaults.serve, ...config.serve },
|
||||
serve,
|
||||
tailscale: { ...defaults.tailscale, ...config.tailscale },
|
||||
tunnel: { ...defaults.tunnel, ...config.tunnel },
|
||||
webhookSecurity: {
|
||||
|
|
@ -392,7 +512,23 @@ export function normalizeVoiceCallConfig(config: VoiceCallConfigInput): VoiceCal
|
|||
trustedProxyIPs:
|
||||
config.webhookSecurity?.trustedProxyIPs ?? defaults.webhookSecurity.trustedProxyIPs,
|
||||
},
|
||||
streaming: { ...defaults.streaming, ...config.streaming },
|
||||
streaming: {
|
||||
...defaults.streaming,
|
||||
...config.streaming,
|
||||
provider: streamingProvider,
|
||||
providers: streamingProviders,
|
||||
},
|
||||
realtime: {
|
||||
...defaults.realtime,
|
||||
...config.realtime,
|
||||
provider: realtimeProvider,
|
||||
streamPath:
|
||||
config.realtime?.streamPath ??
|
||||
defaultRealtimeStreamPathForServePath(serve.path ?? defaults.serve.path),
|
||||
tools:
|
||||
(config.realtime?.tools as RealtimeToolConfig[] | undefined) ?? defaults.realtime.tools,
|
||||
providers: realtimeProviders,
|
||||
},
|
||||
stt: { ...defaults.stt, ...config.stt },
|
||||
tts: normalizeVoiceCallTtsConfig(defaults.tts, config.tts),
|
||||
};
|
||||
|
|
@ -448,6 +584,133 @@ export function resolveVoiceCallConfig(config: VoiceCallConfigInput): VoiceCallC
|
|||
resolved.webhookSecurity.trustForwardingHeaders ?? false;
|
||||
resolved.webhookSecurity.trustedProxyIPs = resolved.webhookSecurity.trustedProxyIPs ?? [];
|
||||
|
||||
resolved.streaming = {
|
||||
...resolved.streaming,
|
||||
providers: { ...(resolved.streaming.providers ?? {}) },
|
||||
};
|
||||
const legacyStreamingRaw = resolved.streaming as Record<string, unknown>;
|
||||
const openaiStreamingRaw =
|
||||
resolved.streaming.providers.openai && typeof resolved.streaming.providers.openai === "object"
|
||||
? { ...(resolved.streaming.providers.openai as Record<string, unknown>) }
|
||||
: {};
|
||||
if (
|
||||
typeof openaiStreamingRaw.apiKey !== "string" &&
|
||||
typeof legacyStreamingRaw.openaiApiKey === "string"
|
||||
) {
|
||||
openaiStreamingRaw.apiKey = legacyStreamingRaw.openaiApiKey;
|
||||
}
|
||||
if (
|
||||
typeof openaiStreamingRaw.model !== "string" &&
|
||||
typeof legacyStreamingRaw.sttModel === "string"
|
||||
) {
|
||||
openaiStreamingRaw.model = legacyStreamingRaw.sttModel;
|
||||
}
|
||||
if (
|
||||
openaiStreamingRaw.silenceDurationMs == null &&
|
||||
typeof legacyStreamingRaw.silenceDurationMs === "number"
|
||||
) {
|
||||
openaiStreamingRaw.silenceDurationMs = legacyStreamingRaw.silenceDurationMs;
|
||||
}
|
||||
if (
|
||||
openaiStreamingRaw.vadThreshold == null &&
|
||||
typeof legacyStreamingRaw.vadThreshold === "number"
|
||||
) {
|
||||
openaiStreamingRaw.vadThreshold = legacyStreamingRaw.vadThreshold;
|
||||
}
|
||||
if (typeof openaiStreamingRaw.apiKey !== "string" || !openaiStreamingRaw.apiKey.trim()) {
|
||||
if (process.env.OPENAI_API_KEY) {
|
||||
openaiStreamingRaw.apiKey = process.env.OPENAI_API_KEY;
|
||||
}
|
||||
}
|
||||
if (
|
||||
typeof openaiStreamingRaw.model !== "string" &&
|
||||
typeof process.env.REALTIME_TRANSCRIPTION_MODEL === "string"
|
||||
) {
|
||||
openaiStreamingRaw.model = process.env.REALTIME_TRANSCRIPTION_MODEL;
|
||||
}
|
||||
if (
|
||||
typeof openaiStreamingRaw.model !== "string" &&
|
||||
typeof process.env.STREAMING_STT_MODEL === "string"
|
||||
) {
|
||||
openaiStreamingRaw.model = process.env.STREAMING_STT_MODEL;
|
||||
}
|
||||
if (openaiStreamingRaw.vadThreshold == null && typeof process.env.VAD_THRESHOLD === "string") {
|
||||
openaiStreamingRaw.vadThreshold = Number.parseFloat(process.env.VAD_THRESHOLD);
|
||||
}
|
||||
if (
|
||||
openaiStreamingRaw.silenceDurationMs == null &&
|
||||
typeof process.env.SILENCE_DURATION_MS === "string"
|
||||
) {
|
||||
openaiStreamingRaw.silenceDurationMs = Number.parseInt(process.env.SILENCE_DURATION_MS, 10);
|
||||
}
|
||||
if (Object.keys(openaiStreamingRaw).length > 0) {
|
||||
resolved.streaming.providers.openai = openaiStreamingRaw;
|
||||
}
|
||||
if (
|
||||
typeof resolved.streaming.provider === "string" &&
|
||||
resolved.streaming.provider.trim() &&
|
||||
!(resolved.streaming.provider in resolved.streaming.providers)
|
||||
) {
|
||||
resolved.streaming.providers[resolved.streaming.provider] = {};
|
||||
}
|
||||
|
||||
resolved.realtime = {
|
||||
...resolved.realtime,
|
||||
providers: { ...(resolved.realtime.providers ?? {}) },
|
||||
};
|
||||
const openaiRealtimeRaw =
|
||||
resolved.realtime.providers.openai && typeof resolved.realtime.providers.openai === "object"
|
||||
? { ...(resolved.realtime.providers.openai as Record<string, unknown>) }
|
||||
: {};
|
||||
if (typeof openaiRealtimeRaw.apiKey !== "string" || !openaiRealtimeRaw.apiKey.trim()) {
|
||||
if (process.env.OPENAI_API_KEY) {
|
||||
openaiRealtimeRaw.apiKey = process.env.OPENAI_API_KEY;
|
||||
}
|
||||
}
|
||||
if (
|
||||
typeof openaiRealtimeRaw.model !== "string" &&
|
||||
typeof process.env.REALTIME_VOICE_MODEL === "string"
|
||||
) {
|
||||
openaiRealtimeRaw.model = process.env.REALTIME_VOICE_MODEL;
|
||||
}
|
||||
if (
|
||||
typeof openaiRealtimeRaw.voice !== "string" &&
|
||||
typeof process.env.REALTIME_VOICE_VOICE === "string"
|
||||
) {
|
||||
openaiRealtimeRaw.voice = process.env.REALTIME_VOICE_VOICE;
|
||||
}
|
||||
if (
|
||||
typeof resolved.realtime.instructions !== "string" &&
|
||||
typeof process.env.REALTIME_VOICE_INSTRUCTIONS === "string"
|
||||
) {
|
||||
resolved.realtime.instructions = process.env.REALTIME_VOICE_INSTRUCTIONS;
|
||||
}
|
||||
if (
|
||||
openaiRealtimeRaw.temperature == null &&
|
||||
typeof process.env.REALTIME_VOICE_TEMPERATURE === "string"
|
||||
) {
|
||||
openaiRealtimeRaw.temperature = Number.parseFloat(process.env.REALTIME_VOICE_TEMPERATURE);
|
||||
}
|
||||
if (openaiRealtimeRaw.vadThreshold == null && typeof process.env.VAD_THRESHOLD === "string") {
|
||||
openaiRealtimeRaw.vadThreshold = Number.parseFloat(process.env.VAD_THRESHOLD);
|
||||
}
|
||||
if (
|
||||
openaiRealtimeRaw.silenceDurationMs == null &&
|
||||
typeof process.env.SILENCE_DURATION_MS === "string"
|
||||
) {
|
||||
openaiRealtimeRaw.silenceDurationMs = Number.parseInt(process.env.SILENCE_DURATION_MS, 10);
|
||||
}
|
||||
if (Object.keys(openaiRealtimeRaw).length > 0) {
|
||||
resolved.realtime.providers.openai = openaiRealtimeRaw;
|
||||
}
|
||||
if (
|
||||
typeof resolved.realtime.provider === "string" &&
|
||||
resolved.realtime.provider.trim() &&
|
||||
!(resolved.realtime.provider in resolved.realtime.providers)
|
||||
) {
|
||||
resolved.realtime.providers[resolved.realtime.provider] = {};
|
||||
}
|
||||
|
||||
return normalizeVoiceCallConfig(resolved);
|
||||
}
|
||||
|
||||
|
|
@ -516,5 +779,23 @@ export function validateProviderConfig(config: VoiceCallConfig): {
|
|||
}
|
||||
}
|
||||
|
||||
if (config.realtime.enabled && config.inboundPolicy === "disabled") {
|
||||
errors.push(
|
||||
'plugins.entries.voice-call.config.inboundPolicy must not be "disabled" when realtime.enabled is true',
|
||||
);
|
||||
}
|
||||
|
||||
if (config.realtime.enabled && config.streaming.enabled) {
|
||||
errors.push(
|
||||
"plugins.entries.voice-call.config.realtime.enabled and plugins.entries.voice-call.config.streaming.enabled cannot both be true",
|
||||
);
|
||||
}
|
||||
|
||||
if (config.realtime.enabled && config.provider && config.provider !== "twilio") {
|
||||
errors.push(
|
||||
'plugins.entries.voice-call.config.provider must be "twilio" when realtime.enabled is true',
|
||||
);
|
||||
}
|
||||
|
||||
return { valid: errors.length === 0, errors };
|
||||
}
|
||||
|
|
|
|||
|
|
@ -125,7 +125,7 @@ describe("voice-call outbound helpers", () => {
|
|||
maxConcurrentCalls: 3,
|
||||
outbound: { defaultMode: "conversation" },
|
||||
fromNumber: "+14155550100",
|
||||
tts: { providers: { openai: { voice: "nova" } } },
|
||||
tts: { provider: "openai", providers: { openai: { voice: "nova" } } },
|
||||
},
|
||||
storePath: "/tmp/voice-call.json",
|
||||
webhookUrl: "https://example.com/webhook",
|
||||
|
|
@ -187,7 +187,7 @@ describe("voice-call outbound helpers", () => {
|
|||
activeCalls: new Map([["call-1", call]]),
|
||||
providerCallIdMap: new Map(),
|
||||
provider: { name: "twilio", playTts },
|
||||
config: { tts: { providers: { openai: { voice: "alloy" } } } },
|
||||
config: { tts: { provider: "openai", providers: { openai: { voice: "alloy" } } } },
|
||||
storePath: "/tmp/voice-call.json",
|
||||
};
|
||||
|
||||
|
|
|
|||
|
|
@ -100,11 +100,22 @@ function requireConnectedCall(ctx: ConnectedCallContext, callId: CallId): Connec
|
|||
};
|
||||
}
|
||||
|
||||
function resolveOpenAITtsVoice(config: SpeakContext["config"]): string | undefined {
|
||||
const providerConfig = config.tts?.providers?.openai;
|
||||
return providerConfig && typeof providerConfig === "object"
|
||||
? (providerConfig.voice as string | undefined)
|
||||
: undefined;
|
||||
function resolvePreferredTtsVoice(config: SpeakContext["config"]): string | undefined {
|
||||
const providerId = config.tts?.provider;
|
||||
if (!providerId) {
|
||||
return undefined;
|
||||
}
|
||||
const providerConfig = config.tts?.providers?.[providerId];
|
||||
if (!providerConfig || typeof providerConfig !== "object") {
|
||||
return undefined;
|
||||
}
|
||||
if (typeof providerConfig.voice === "string" && providerConfig.voice.trim()) {
|
||||
return providerConfig.voice;
|
||||
}
|
||||
if (typeof providerConfig.voiceId === "string" && providerConfig.voiceId.trim()) {
|
||||
return providerConfig.voiceId;
|
||||
}
|
||||
return undefined;
|
||||
}
|
||||
|
||||
export async function initiateCall(
|
||||
|
|
@ -164,7 +175,7 @@ export async function initiateCall(
|
|||
// For notify mode with a message, use inline TwiML with <Say>.
|
||||
let inlineTwiml: string | undefined;
|
||||
if (mode === "notify" && initialMessage) {
|
||||
const pollyVoice = mapVoiceToPolly(resolveOpenAITtsVoice(ctx.config));
|
||||
const pollyVoice = mapVoiceToPolly(resolvePreferredTtsVoice(ctx.config));
|
||||
inlineTwiml = generateNotifyTwiml(initialMessage, pollyVoice);
|
||||
console.log(`[voice-call] Using inline TwiML for notify mode (voice: ${pollyVoice})`);
|
||||
}
|
||||
|
|
@ -212,7 +223,7 @@ export async function speak(
|
|||
transitionState(call, "speaking");
|
||||
persistCallRecord(ctx.storePath, call);
|
||||
|
||||
const voice = provider.name === "twilio" ? resolveOpenAITtsVoice(ctx.config) : undefined;
|
||||
const voice = provider.name === "twilio" ? resolvePreferredTtsVoice(ctx.config) : undefined;
|
||||
await provider.playTts({
|
||||
callId,
|
||||
providerCallId,
|
||||
|
|
|
|||
|
|
@ -1,28 +1,27 @@
|
|||
import { once } from "node:events";
|
||||
import http from "node:http";
|
||||
import type {
|
||||
RealtimeTranscriptionProviderPlugin,
|
||||
RealtimeTranscriptionSession,
|
||||
} from "openclaw/plugin-sdk/realtime-transcription";
|
||||
import { describe, expect, it, vi } from "vitest";
|
||||
import { WebSocket } from "ws";
|
||||
import { MediaStreamHandler, sanitizeLogText } from "./media-stream.js";
|
||||
import type {
|
||||
OpenAIRealtimeSTTProvider,
|
||||
RealtimeSTTSession,
|
||||
} from "./providers/stt-openai-realtime.js";
|
||||
|
||||
const createStubSession = (): RealtimeSTTSession => ({
|
||||
const createStubSession = (): RealtimeTranscriptionSession => ({
|
||||
connect: async () => {},
|
||||
sendAudio: () => {},
|
||||
waitForTranscript: async () => "",
|
||||
onPartial: () => {},
|
||||
onTranscript: () => {},
|
||||
onSpeechStart: () => {},
|
||||
close: () => {},
|
||||
isConnected: () => true,
|
||||
});
|
||||
|
||||
const createStubSttProvider = (): OpenAIRealtimeSTTProvider =>
|
||||
const createStubSttProvider = (): RealtimeTranscriptionProviderPlugin =>
|
||||
({
|
||||
createSession: () => createStubSession(),
|
||||
}) as unknown as OpenAIRealtimeSTTProvider;
|
||||
id: "openai",
|
||||
label: "OpenAI",
|
||||
isConfigured: () => true,
|
||||
}) as unknown as RealtimeTranscriptionProviderPlugin;
|
||||
|
||||
const flush = async (): Promise<void> => {
|
||||
await new Promise((resolve) => setTimeout(resolve, 0));
|
||||
|
|
@ -104,7 +103,8 @@ const waitForClose = async (
|
|||
describe("MediaStreamHandler TTS queue", () => {
|
||||
it("serializes TTS playback and resolves in order", async () => {
|
||||
const handler = new MediaStreamHandler({
|
||||
sttProvider: createStubSttProvider(),
|
||||
transcriptionProvider: createStubSttProvider(),
|
||||
providerConfig: {},
|
||||
});
|
||||
const started: number[] = [];
|
||||
const finished: number[] = [];
|
||||
|
|
@ -137,7 +137,8 @@ describe("MediaStreamHandler TTS queue", () => {
|
|||
|
||||
it("cancels active playback and clears queued items", async () => {
|
||||
const handler = new MediaStreamHandler({
|
||||
sttProvider: createStubSttProvider(),
|
||||
transcriptionProvider: createStubSttProvider(),
|
||||
providerConfig: {},
|
||||
});
|
||||
|
||||
let queuedRan = false;
|
||||
|
|
@ -165,7 +166,8 @@ describe("MediaStreamHandler TTS queue", () => {
|
|||
describe("MediaStreamHandler security hardening", () => {
|
||||
it("fails sends and closes stream when buffered bytes already exceed the cap", () => {
|
||||
const handler = new MediaStreamHandler({
|
||||
sttProvider: createStubSttProvider(),
|
||||
transcriptionProvider: createStubSttProvider(),
|
||||
providerConfig: {},
|
||||
});
|
||||
const ws = {
|
||||
readyState: WebSocket.OPEN,
|
||||
|
|
@ -177,7 +179,12 @@ describe("MediaStreamHandler security hardening", () => {
|
|||
handler as unknown as {
|
||||
sessions: Map<
|
||||
string,
|
||||
{ callId: string; streamSid: string; ws: WebSocket; sttSession: RealtimeSTTSession }
|
||||
{
|
||||
callId: string;
|
||||
streamSid: string;
|
||||
ws: WebSocket;
|
||||
sttSession: RealtimeTranscriptionSession;
|
||||
}
|
||||
>;
|
||||
}
|
||||
).sessions.set("MZ-backpressure", {
|
||||
|
|
@ -196,7 +203,8 @@ describe("MediaStreamHandler security hardening", () => {
|
|||
|
||||
it("fails sends when buffered bytes exceed cap after enqueueing a frame", () => {
|
||||
const handler = new MediaStreamHandler({
|
||||
sttProvider: createStubSttProvider(),
|
||||
transcriptionProvider: createStubSttProvider(),
|
||||
providerConfig: {},
|
||||
});
|
||||
const ws = {
|
||||
readyState: WebSocket.OPEN,
|
||||
|
|
@ -214,7 +222,12 @@ describe("MediaStreamHandler security hardening", () => {
|
|||
handler as unknown as {
|
||||
sessions: Map<
|
||||
string,
|
||||
{ callId: string; streamSid: string; ws: WebSocket; sttSession: RealtimeSTTSession }
|
||||
{
|
||||
callId: string;
|
||||
streamSid: string;
|
||||
ws: WebSocket;
|
||||
sttSession: RealtimeTranscriptionSession;
|
||||
}
|
||||
>;
|
||||
}
|
||||
).sessions.set("MZ-overflow", {
|
||||
|
|
@ -243,7 +256,8 @@ describe("MediaStreamHandler security hardening", () => {
|
|||
const shouldAcceptStreamCalls: Array<{ callId: string; streamSid: string; token?: string }> =
|
||||
[];
|
||||
const handler = new MediaStreamHandler({
|
||||
sttProvider: createStubSttProvider(),
|
||||
transcriptionProvider: createStubSttProvider(),
|
||||
providerConfig: {},
|
||||
preStartTimeoutMs: 40,
|
||||
shouldAcceptStream: (params) => {
|
||||
shouldAcceptStreamCalls.push(params);
|
||||
|
|
@ -266,7 +280,8 @@ describe("MediaStreamHandler security hardening", () => {
|
|||
|
||||
it("enforces pending connection limits", async () => {
|
||||
const handler = new MediaStreamHandler({
|
||||
sttProvider: createStubSttProvider(),
|
||||
transcriptionProvider: createStubSttProvider(),
|
||||
providerConfig: {},
|
||||
preStartTimeoutMs: 5_000,
|
||||
maxPendingConnections: 1,
|
||||
maxPendingConnectionsPerIp: 1,
|
||||
|
|
@ -291,7 +306,8 @@ describe("MediaStreamHandler security hardening", () => {
|
|||
|
||||
it("rejects upgrades when max connection cap is reached", async () => {
|
||||
const handler = new MediaStreamHandler({
|
||||
sttProvider: createStubSttProvider(),
|
||||
transcriptionProvider: createStubSttProvider(),
|
||||
providerConfig: {},
|
||||
preStartTimeoutMs: 5_000,
|
||||
maxConnections: 1,
|
||||
maxPendingConnections: 10,
|
||||
|
|
@ -319,7 +335,8 @@ describe("MediaStreamHandler security hardening", () => {
|
|||
|
||||
it("clears pending state after valid start", async () => {
|
||||
const handler = new MediaStreamHandler({
|
||||
sttProvider: createStubSttProvider(),
|
||||
transcriptionProvider: createStubSttProvider(),
|
||||
providerConfig: {},
|
||||
preStartTimeoutMs: 40,
|
||||
shouldAcceptStream: () => true,
|
||||
});
|
||||
|
|
@ -349,7 +366,8 @@ describe("MediaStreamHandler security hardening", () => {
|
|||
const shouldAcceptStreamCalls: Array<{ callId: string; streamSid: string; token?: string }> =
|
||||
[];
|
||||
const handler = new MediaStreamHandler({
|
||||
sttProvider: createStubSttProvider(),
|
||||
transcriptionProvider: createStubSttProvider(),
|
||||
providerConfig: {},
|
||||
preStartTimeoutMs: 1_000,
|
||||
shouldAcceptStream: (params) => {
|
||||
shouldAcceptStreamCalls.push(params);
|
||||
|
|
|
|||
|
|
@ -3,24 +3,27 @@
|
|||
*
|
||||
* Handles bidirectional audio streaming between Twilio and the AI services.
|
||||
* - Receives mu-law audio from Twilio via WebSocket
|
||||
* - Forwards to OpenAI Realtime STT for transcription
|
||||
* - Forwards to the selected realtime transcription provider
|
||||
* - Sends TTS audio back to Twilio
|
||||
*/
|
||||
|
||||
import type { IncomingMessage } from "node:http";
|
||||
import type { Duplex } from "node:stream";
|
||||
import { type RawData, WebSocket, WebSocketServer } from "ws";
|
||||
import type {
|
||||
OpenAIRealtimeSTTProvider,
|
||||
RealtimeSTTSession,
|
||||
} from "./providers/stt-openai-realtime.js";
|
||||
RealtimeTranscriptionProviderConfig,
|
||||
RealtimeTranscriptionProviderPlugin,
|
||||
RealtimeTranscriptionSession,
|
||||
} from "openclaw/plugin-sdk/realtime-transcription";
|
||||
import { type RawData, WebSocket, WebSocketServer } from "ws";
|
||||
|
||||
/**
|
||||
* Configuration for the media stream handler.
|
||||
*/
|
||||
export interface MediaStreamConfig {
|
||||
/** STT provider for transcription */
|
||||
sttProvider: OpenAIRealtimeSTTProvider;
|
||||
/** Realtime transcription provider for streaming STT. */
|
||||
transcriptionProvider: RealtimeTranscriptionProviderPlugin;
|
||||
/** Provider-owned config blob passed into the transcription session. */
|
||||
providerConfig: RealtimeTranscriptionProviderConfig;
|
||||
/** Close sockets that never send a valid `start` frame within this window. */
|
||||
preStartTimeoutMs?: number;
|
||||
/** Max concurrent pre-start sockets. */
|
||||
|
|
@ -50,7 +53,7 @@ interface StreamSession {
|
|||
callId: string;
|
||||
streamSid: string;
|
||||
ws: WebSocket;
|
||||
sttSession: RealtimeSTTSession;
|
||||
sttSession: RealtimeTranscriptionSession;
|
||||
}
|
||||
|
||||
type TtsQueueEntry = {
|
||||
|
|
@ -254,20 +257,20 @@ export class MediaStreamHandler {
|
|||
return null;
|
||||
}
|
||||
|
||||
// Create STT session
|
||||
const sttSession = this.config.sttProvider.createSession();
|
||||
|
||||
// Set up transcript callbacks
|
||||
sttSession.onPartial((partial) => {
|
||||
this.config.onPartialTranscript?.(callSid, partial);
|
||||
});
|
||||
|
||||
sttSession.onTranscript((transcript) => {
|
||||
this.config.onTranscript?.(callSid, transcript);
|
||||
});
|
||||
|
||||
sttSession.onSpeechStart(() => {
|
||||
this.config.onSpeechStart?.(callSid);
|
||||
const sttSession = this.config.transcriptionProvider.createSession({
|
||||
providerConfig: this.config.providerConfig,
|
||||
onPartial: (partial) => {
|
||||
this.config.onPartialTranscript?.(callSid, partial);
|
||||
},
|
||||
onTranscript: (transcript) => {
|
||||
this.config.onTranscript?.(callSid, transcript);
|
||||
},
|
||||
onSpeechStart: () => {
|
||||
this.config.onSpeechStart?.(callSid);
|
||||
},
|
||||
onError: (error) => {
|
||||
console.warn("[MediaStream] Transcription session error:", error.message);
|
||||
},
|
||||
});
|
||||
|
||||
const session: StreamSession = {
|
||||
|
|
@ -282,7 +285,7 @@ export class MediaStreamHandler {
|
|||
// Notify connection BEFORE STT connect so TTS can work even if STT fails
|
||||
this.config.onConnect?.(callSid, streamSid);
|
||||
|
||||
// Connect to OpenAI STT (non-blocking, log errors but don't fail the call)
|
||||
// Connect to transcription service (non-blocking, log errors but don't fail the call)
|
||||
sttSession.connect().catch((err) => {
|
||||
console.warn(`[MediaStream] STT connection failed (TTS still works):`, err.message);
|
||||
});
|
||||
|
|
|
|||
|
|
@ -1,10 +1,5 @@
|
|||
export type { VoiceCallProvider } from "./base.js";
|
||||
export { MockProvider } from "./mock.js";
|
||||
export {
|
||||
OpenAIRealtimeSTTProvider,
|
||||
type RealtimeSTTConfig,
|
||||
type RealtimeSTTSession,
|
||||
} from "./stt-openai-realtime.js";
|
||||
export { TelnyxProvider } from "./telnyx.js";
|
||||
export { TwilioProvider } from "./twilio.js";
|
||||
export { PlivoProvider } from "./plivo.js";
|
||||
|
|
|
|||
|
|
@ -1,42 +0,0 @@
|
|||
import { describe, expect, it } from "vitest";
|
||||
import type { RealtimeSTTConfig } from "./stt-openai-realtime.js";
|
||||
import { OpenAIRealtimeSTTProvider } from "./stt-openai-realtime.js";
|
||||
|
||||
type ProviderInternals = {
|
||||
vadThreshold: number;
|
||||
silenceDurationMs: number;
|
||||
};
|
||||
|
||||
function readProviderInternals(config: RealtimeSTTConfig): ProviderInternals {
|
||||
const provider = new OpenAIRealtimeSTTProvider(config) as unknown as Record<string, unknown>;
|
||||
return {
|
||||
vadThreshold: provider["vadThreshold"] as number,
|
||||
silenceDurationMs: provider["silenceDurationMs"] as number,
|
||||
};
|
||||
}
|
||||
|
||||
describe("OpenAIRealtimeSTTProvider constructor defaults", () => {
|
||||
it("uses vadThreshold: 0 when explicitly configured (max sensitivity)", () => {
|
||||
const provider = readProviderInternals({
|
||||
apiKey: "sk-test", // pragma: allowlist secret
|
||||
vadThreshold: 0,
|
||||
});
|
||||
expect(provider.vadThreshold).toBe(0);
|
||||
});
|
||||
|
||||
it("uses silenceDurationMs: 0 when explicitly configured", () => {
|
||||
const provider = readProviderInternals({
|
||||
apiKey: "sk-test", // pragma: allowlist secret
|
||||
silenceDurationMs: 0,
|
||||
});
|
||||
expect(provider.silenceDurationMs).toBe(0);
|
||||
});
|
||||
|
||||
it("falls back to defaults when values are undefined", () => {
|
||||
const provider = readProviderInternals({
|
||||
apiKey: "sk-test", // pragma: allowlist secret
|
||||
});
|
||||
expect(provider.vadThreshold).toBe(0.5);
|
||||
expect(provider.silenceDurationMs).toBe(800);
|
||||
});
|
||||
});
|
||||
|
|
@ -1,321 +0,0 @@
|
|||
/**
|
||||
* OpenAI Realtime STT Provider
|
||||
*
|
||||
* Uses the OpenAI Realtime API for streaming transcription with:
|
||||
* - Direct mu-law audio support (no conversion needed)
|
||||
* - Built-in server-side VAD for turn detection
|
||||
* - Low-latency streaming transcription
|
||||
* - Partial transcript callbacks for real-time UI updates
|
||||
*/
|
||||
|
||||
import WebSocket from "ws";
|
||||
|
||||
/**
|
||||
* Configuration for OpenAI Realtime STT.
|
||||
*/
|
||||
export interface RealtimeSTTConfig {
|
||||
/** OpenAI API key */
|
||||
apiKey: string;
|
||||
/** Model to use (default: gpt-4o-transcribe) */
|
||||
model?: string;
|
||||
/** Silence duration in ms before considering speech ended (default: 800) */
|
||||
silenceDurationMs?: number;
|
||||
/** VAD threshold 0-1 (default: 0.5) */
|
||||
vadThreshold?: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* Session for streaming audio and receiving transcripts.
|
||||
*/
|
||||
export interface RealtimeSTTSession {
|
||||
/** Connect to the transcription service */
|
||||
connect(): Promise<void>;
|
||||
/** Send mu-law audio data (8kHz mono) */
|
||||
sendAudio(audio: Buffer): void;
|
||||
/** Wait for next complete transcript (after VAD detects end of speech) */
|
||||
waitForTranscript(timeoutMs?: number): Promise<string>;
|
||||
/** Set callback for partial transcripts (streaming) */
|
||||
onPartial(callback: (partial: string) => void): void;
|
||||
/** Set callback for final transcripts */
|
||||
onTranscript(callback: (transcript: string) => void): void;
|
||||
/** Set callback when speech starts (VAD) */
|
||||
onSpeechStart(callback: () => void): void;
|
||||
/** Close the session */
|
||||
close(): void;
|
||||
/** Check if session is connected */
|
||||
isConnected(): boolean;
|
||||
}
|
||||
|
||||
/**
|
||||
* Provider factory for OpenAI Realtime STT sessions.
|
||||
*/
|
||||
export class OpenAIRealtimeSTTProvider {
|
||||
readonly name = "openai-realtime";
|
||||
private apiKey: string;
|
||||
private model: string;
|
||||
private silenceDurationMs: number;
|
||||
private vadThreshold: number;
|
||||
|
||||
constructor(config: RealtimeSTTConfig) {
|
||||
if (!config.apiKey) {
|
||||
throw new Error("OpenAI API key required for Realtime STT");
|
||||
}
|
||||
this.apiKey = config.apiKey;
|
||||
this.model = config.model || "gpt-4o-transcribe";
|
||||
this.silenceDurationMs = config.silenceDurationMs ?? 800;
|
||||
this.vadThreshold = config.vadThreshold ?? 0.5;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new realtime transcription session.
|
||||
*/
|
||||
createSession(): RealtimeSTTSession {
|
||||
return new OpenAIRealtimeSTTSession(
|
||||
this.apiKey,
|
||||
this.model,
|
||||
this.silenceDurationMs,
|
||||
this.vadThreshold,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* WebSocket-based session for real-time speech-to-text.
|
||||
*/
|
||||
class OpenAIRealtimeSTTSession implements RealtimeSTTSession {
|
||||
private static readonly MAX_RECONNECT_ATTEMPTS = 5;
|
||||
private static readonly RECONNECT_DELAY_MS = 1000;
|
||||
|
||||
private ws: WebSocket | null = null;
|
||||
private connected = false;
|
||||
private closed = false;
|
||||
private connectTimeout: ReturnType<typeof setTimeout> | null = null;
|
||||
private reconnectAttempts = 0;
|
||||
private pendingTranscript = "";
|
||||
private onTranscriptCallback: ((transcript: string) => void) | null = null;
|
||||
private onPartialCallback: ((partial: string) => void) | null = null;
|
||||
private onSpeechStartCallback: (() => void) | null = null;
|
||||
|
||||
constructor(
|
||||
private readonly apiKey: string,
|
||||
private readonly model: string,
|
||||
private readonly silenceDurationMs: number,
|
||||
private readonly vadThreshold: number,
|
||||
) {}
|
||||
|
||||
async connect(): Promise<void> {
|
||||
this.closed = false;
|
||||
this.reconnectAttempts = 0;
|
||||
return this.doConnect();
|
||||
}
|
||||
|
||||
private async doConnect(): Promise<void> {
|
||||
return new Promise((resolve, reject) => {
|
||||
const url = "wss://api.openai.com/v1/realtime?intent=transcription";
|
||||
|
||||
this.ws = new WebSocket(url, {
|
||||
headers: {
|
||||
Authorization: `Bearer ${this.apiKey}`,
|
||||
"OpenAI-Beta": "realtime=v1",
|
||||
},
|
||||
});
|
||||
|
||||
this.ws.on("open", () => {
|
||||
console.log("[RealtimeSTT] WebSocket connected");
|
||||
this.connected = true;
|
||||
this.reconnectAttempts = 0;
|
||||
if (this.connectTimeout) {
|
||||
clearTimeout(this.connectTimeout);
|
||||
this.connectTimeout = null;
|
||||
}
|
||||
|
||||
// Configure the transcription session
|
||||
this.sendEvent({
|
||||
type: "transcription_session.update",
|
||||
session: {
|
||||
input_audio_format: "g711_ulaw",
|
||||
input_audio_transcription: {
|
||||
model: this.model,
|
||||
},
|
||||
turn_detection: {
|
||||
type: "server_vad",
|
||||
threshold: this.vadThreshold,
|
||||
prefix_padding_ms: 300,
|
||||
silence_duration_ms: this.silenceDurationMs,
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
resolve();
|
||||
});
|
||||
|
||||
this.ws.on("message", (data: Buffer) => {
|
||||
try {
|
||||
const event = JSON.parse(data.toString());
|
||||
this.handleEvent(event);
|
||||
} catch (e) {
|
||||
console.error("[RealtimeSTT] Failed to parse event:", e);
|
||||
}
|
||||
});
|
||||
|
||||
this.ws.on("error", (error) => {
|
||||
console.error("[RealtimeSTT] WebSocket error:", error);
|
||||
if (!this.connected) {
|
||||
reject(error);
|
||||
}
|
||||
});
|
||||
|
||||
this.ws.on("close", (code, reason) => {
|
||||
console.log(
|
||||
`[RealtimeSTT] WebSocket closed (code: ${code}, reason: ${reason?.toString() || "none"})`,
|
||||
);
|
||||
this.connected = false;
|
||||
|
||||
// Attempt reconnection if not intentionally closed
|
||||
if (!this.closed) {
|
||||
void this.attemptReconnect();
|
||||
}
|
||||
});
|
||||
|
||||
this.connectTimeout = setTimeout(() => {
|
||||
this.connectTimeout = null;
|
||||
if (!this.connected) {
|
||||
reject(new Error("Realtime STT connection timeout"));
|
||||
}
|
||||
}, 10000);
|
||||
});
|
||||
}
|
||||
|
||||
private async attemptReconnect(): Promise<void> {
|
||||
if (this.closed) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (this.reconnectAttempts >= OpenAIRealtimeSTTSession.MAX_RECONNECT_ATTEMPTS) {
|
||||
console.error(
|
||||
`[RealtimeSTT] Max reconnect attempts (${OpenAIRealtimeSTTSession.MAX_RECONNECT_ATTEMPTS}) reached`,
|
||||
);
|
||||
return;
|
||||
}
|
||||
|
||||
this.reconnectAttempts++;
|
||||
const delay = OpenAIRealtimeSTTSession.RECONNECT_DELAY_MS * 2 ** (this.reconnectAttempts - 1);
|
||||
console.log(
|
||||
`[RealtimeSTT] Reconnecting ${this.reconnectAttempts}/${OpenAIRealtimeSTTSession.MAX_RECONNECT_ATTEMPTS} in ${delay}ms...`,
|
||||
);
|
||||
|
||||
await new Promise((resolve) => setTimeout(resolve, delay));
|
||||
|
||||
if (this.closed) {
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
await this.doConnect();
|
||||
console.log("[RealtimeSTT] Reconnected successfully");
|
||||
} catch (error) {
|
||||
console.error("[RealtimeSTT] Reconnect failed:", error);
|
||||
}
|
||||
}
|
||||
|
||||
private handleEvent(event: {
|
||||
type: string;
|
||||
delta?: string;
|
||||
transcript?: string;
|
||||
error?: unknown;
|
||||
}): void {
|
||||
switch (event.type) {
|
||||
case "transcription_session.created":
|
||||
case "transcription_session.updated":
|
||||
case "input_audio_buffer.speech_stopped":
|
||||
case "input_audio_buffer.committed":
|
||||
console.log(`[RealtimeSTT] ${event.type}`);
|
||||
break;
|
||||
|
||||
case "conversation.item.input_audio_transcription.delta":
|
||||
if (event.delta) {
|
||||
this.pendingTranscript += event.delta;
|
||||
this.onPartialCallback?.(this.pendingTranscript);
|
||||
}
|
||||
break;
|
||||
|
||||
case "conversation.item.input_audio_transcription.completed":
|
||||
if (event.transcript) {
|
||||
console.log(`[RealtimeSTT] Transcript: ${event.transcript}`);
|
||||
this.onTranscriptCallback?.(event.transcript);
|
||||
}
|
||||
this.pendingTranscript = "";
|
||||
break;
|
||||
|
||||
case "input_audio_buffer.speech_started":
|
||||
console.log("[RealtimeSTT] Speech started");
|
||||
this.pendingTranscript = "";
|
||||
this.onSpeechStartCallback?.();
|
||||
break;
|
||||
|
||||
case "error":
|
||||
console.error("[RealtimeSTT] Error:", event.error);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
private sendEvent(event: unknown): void {
|
||||
if (this.ws?.readyState === WebSocket.OPEN) {
|
||||
this.ws.send(JSON.stringify(event));
|
||||
}
|
||||
}
|
||||
|
||||
sendAudio(muLawData: Buffer): void {
|
||||
if (!this.connected) {
|
||||
return;
|
||||
}
|
||||
this.sendEvent({
|
||||
type: "input_audio_buffer.append",
|
||||
audio: muLawData.toString("base64"),
|
||||
});
|
||||
}
|
||||
|
||||
onPartial(callback: (partial: string) => void): void {
|
||||
this.onPartialCallback = callback;
|
||||
}
|
||||
|
||||
onTranscript(callback: (transcript: string) => void): void {
|
||||
this.onTranscriptCallback = callback;
|
||||
}
|
||||
|
||||
onSpeechStart(callback: () => void): void {
|
||||
this.onSpeechStartCallback = callback;
|
||||
}
|
||||
|
||||
async waitForTranscript(timeoutMs = 30000): Promise<string> {
|
||||
return new Promise((resolve, reject) => {
|
||||
const timeout = setTimeout(() => {
|
||||
this.onTranscriptCallback = null;
|
||||
reject(new Error("Transcript timeout"));
|
||||
}, timeoutMs);
|
||||
|
||||
this.onTranscriptCallback = (transcript) => {
|
||||
clearTimeout(timeout);
|
||||
this.onTranscriptCallback = null;
|
||||
resolve(transcript);
|
||||
};
|
||||
});
|
||||
}
|
||||
|
||||
close(): void {
|
||||
this.closed = true;
|
||||
if (this.connectTimeout) {
|
||||
clearTimeout(this.connectTimeout);
|
||||
this.connectTimeout = null;
|
||||
}
|
||||
if (this.ws) {
|
||||
this.ws.close();
|
||||
this.ws = null;
|
||||
}
|
||||
this.connected = false;
|
||||
}
|
||||
|
||||
isConnected(): boolean {
|
||||
return this.connected;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,43 +0,0 @@
|
|||
import { describe, expect, it } from "vitest";
|
||||
import type { OpenAITTSConfig } from "./tts-openai.js";
|
||||
import { OpenAITTSProvider } from "./tts-openai.js";
|
||||
|
||||
type ProviderInternals = {
|
||||
model: string;
|
||||
voice: string;
|
||||
speed: number;
|
||||
};
|
||||
|
||||
function readProviderInternals(config: OpenAITTSConfig): ProviderInternals {
|
||||
return new OpenAITTSProvider(config) as unknown as ProviderInternals;
|
||||
}
|
||||
|
||||
describe("OpenAITTSProvider constructor defaults", () => {
|
||||
it("uses speed: 0 when explicitly configured", () => {
|
||||
const provider = readProviderInternals({
|
||||
apiKey: "sk-test", // pragma: allowlist secret
|
||||
speed: 0,
|
||||
});
|
||||
|
||||
expect(provider.speed).toBe(0);
|
||||
});
|
||||
|
||||
it("falls back to speed default when undefined", () => {
|
||||
const provider = readProviderInternals({
|
||||
apiKey: "sk-test", // pragma: allowlist secret
|
||||
});
|
||||
|
||||
expect(provider.speed).toBe(1.0);
|
||||
});
|
||||
|
||||
it("treats blank model and voice overrides as unset", () => {
|
||||
const provider = readProviderInternals({
|
||||
apiKey: "sk-test", // pragma: allowlist secret
|
||||
model: " ",
|
||||
voice: "",
|
||||
});
|
||||
|
||||
expect(provider.model).toBe("gpt-4o-mini-tts");
|
||||
expect(provider.voice).toBe("coral");
|
||||
});
|
||||
});
|
||||
|
|
@ -1,185 +0,0 @@
|
|||
import { convertPcmToMulaw8k } from "../telephony-audio.js";
|
||||
|
||||
/**
|
||||
* OpenAI TTS Provider
|
||||
*
|
||||
* Generates speech audio using OpenAI's text-to-speech API.
|
||||
* Handles audio format conversion for telephony (mu-law 8kHz).
|
||||
*
|
||||
* Best practices from OpenAI docs:
|
||||
* - Use gpt-4o-mini-tts for intelligent realtime applications (supports instructions)
|
||||
* - Use tts-1 for lower latency, tts-1-hd for higher quality
|
||||
* - Use marin or cedar voices for best quality
|
||||
* - Use pcm or wav format for fastest response times
|
||||
*
|
||||
* @see https://platform.openai.com/docs/guides/text-to-speech
|
||||
*/
|
||||
|
||||
/**
|
||||
* OpenAI TTS configuration.
|
||||
*/
|
||||
export interface OpenAITTSConfig {
|
||||
/** OpenAI API key (uses OPENAI_API_KEY env if not set) */
|
||||
apiKey?: string;
|
||||
/**
|
||||
* TTS model:
|
||||
* - gpt-4o-mini-tts: newest, supports instructions for tone/style control (recommended)
|
||||
* - tts-1: lower latency
|
||||
* - tts-1-hd: higher quality
|
||||
*/
|
||||
model?: string;
|
||||
/**
|
||||
* Voice to use. For best quality, use marin or cedar.
|
||||
* All 13 voices: alloy, ash, ballad, coral, echo, fable, nova, onyx, sage, shimmer, verse, marin, cedar
|
||||
* Note: tts-1/tts-1-hd only support: alloy, ash, coral, echo, fable, onyx, nova, sage, shimmer
|
||||
*/
|
||||
voice?: string;
|
||||
/** Speed multiplier (0.25 to 4.0) */
|
||||
speed?: number;
|
||||
/**
|
||||
* Instructions for speech style (only works with gpt-4o-mini-tts model).
|
||||
* Examples: "Speak in a cheerful tone", "Talk like a sympathetic customer service agent"
|
||||
*/
|
||||
instructions?: string;
|
||||
}
|
||||
|
||||
/**
|
||||
* Supported OpenAI TTS voices (all 13 built-in voices).
|
||||
* For best quality, use marin or cedar.
|
||||
* Note: tts-1 and tts-1-hd support a smaller set.
|
||||
*/
|
||||
export const OPENAI_TTS_VOICES = [
|
||||
"alloy",
|
||||
"ash",
|
||||
"ballad",
|
||||
"coral",
|
||||
"echo",
|
||||
"fable",
|
||||
"nova",
|
||||
"onyx",
|
||||
"sage",
|
||||
"shimmer",
|
||||
"verse",
|
||||
"marin",
|
||||
"cedar",
|
||||
] as const;
|
||||
|
||||
export type OpenAITTSVoice = (typeof OPENAI_TTS_VOICES)[number];
|
||||
|
||||
function trimToUndefined(value: string | undefined): string | undefined {
|
||||
const trimmed = value?.trim();
|
||||
return trimmed ? trimmed : undefined;
|
||||
}
|
||||
|
||||
function resolveOpenAITtsInstructions(model: string, instructions?: string): string | undefined {
|
||||
const next = trimToUndefined(instructions);
|
||||
return next && model.includes("gpt-4o-mini-tts") ? next : undefined;
|
||||
}
|
||||
|
||||
/**
|
||||
* OpenAI TTS Provider for generating speech audio.
|
||||
*/
|
||||
export class OpenAITTSProvider {
|
||||
private apiKey: string;
|
||||
private model: string;
|
||||
private voice: OpenAITTSVoice;
|
||||
private speed: number;
|
||||
private instructions?: string;
|
||||
|
||||
constructor(config: OpenAITTSConfig = {}) {
|
||||
this.apiKey =
|
||||
trimToUndefined(config.apiKey) ?? trimToUndefined(process.env.OPENAI_API_KEY) ?? "";
|
||||
// Default to gpt-4o-mini-tts for intelligent realtime applications
|
||||
this.model = trimToUndefined(config.model) ?? "gpt-4o-mini-tts";
|
||||
// Default to coral - good balance of quality and natural tone
|
||||
this.voice = (trimToUndefined(config.voice) as OpenAITTSVoice | undefined) ?? "coral";
|
||||
this.speed = config.speed ?? 1.0;
|
||||
this.instructions = trimToUndefined(config.instructions);
|
||||
|
||||
if (!this.apiKey) {
|
||||
throw new Error("OpenAI API key required (set OPENAI_API_KEY or pass apiKey)");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate speech audio from text.
|
||||
* Returns raw PCM audio data (24kHz, mono, 16-bit).
|
||||
*/
|
||||
async synthesize(text: string, instructions?: string): Promise<Buffer> {
|
||||
// Build request body
|
||||
const body: Record<string, unknown> = {
|
||||
model: this.model,
|
||||
input: text,
|
||||
voice: this.voice,
|
||||
response_format: "pcm", // Raw PCM audio (24kHz, mono, 16-bit signed LE)
|
||||
speed: this.speed,
|
||||
};
|
||||
|
||||
const effectiveInstructions = resolveOpenAITtsInstructions(
|
||||
this.model,
|
||||
trimToUndefined(instructions) ?? this.instructions,
|
||||
);
|
||||
if (effectiveInstructions) {
|
||||
body.instructions = effectiveInstructions;
|
||||
}
|
||||
|
||||
const response = await fetch("https://api.openai.com/v1/audio/speech", {
|
||||
method: "POST",
|
||||
headers: {
|
||||
Authorization: `Bearer ${this.apiKey}`,
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
body: JSON.stringify(body),
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
const error = await response.text();
|
||||
throw new Error(`OpenAI TTS failed: ${response.status} - ${error}`);
|
||||
}
|
||||
|
||||
const arrayBuffer = await response.arrayBuffer();
|
||||
return Buffer.from(arrayBuffer);
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate speech and convert to mu-law format for Twilio.
|
||||
* Twilio Media Streams expect 8kHz mono mu-law audio.
|
||||
*/
|
||||
async synthesizeForTwilio(text: string): Promise<Buffer> {
|
||||
// Get raw PCM from OpenAI (24kHz, 16-bit signed LE, mono)
|
||||
const pcm24k = await this.synthesize(text);
|
||||
|
||||
// Convert from 24kHz PCM to Twilio-compatible 8kHz mu-law
|
||||
return convertPcmToMulaw8k(pcm24k, 24000);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert 8-bit mu-law to 16-bit linear PCM.
|
||||
* Useful for decoding incoming audio.
|
||||
*/
|
||||
export function mulawToLinear(mulaw: number): number {
|
||||
// mu-law is transmitted inverted
|
||||
mulaw = ~mulaw & 0xff;
|
||||
|
||||
const sign = mulaw & 0x80;
|
||||
const exponent = (mulaw >> 4) & 0x07;
|
||||
const mantissa = mulaw & 0x0f;
|
||||
|
||||
let sample = ((mantissa << 3) + 132) << exponent;
|
||||
sample -= 132;
|
||||
|
||||
return sign ? -sample : sample;
|
||||
}
|
||||
|
||||
/**
|
||||
* Chunk audio buffer into 20ms frames for streaming.
|
||||
* At 8kHz mono, 20ms = 160 samples = 160 bytes (mu-law).
|
||||
*/
|
||||
export function chunkAudio(audio: Buffer, chunkSize = 160): Generator<Buffer, void, unknown> {
|
||||
return (function* () {
|
||||
for (let i = 0; i < audio.length; i += chunkSize) {
|
||||
yield audio.subarray(i, Math.min(i + chunkSize, audio.length));
|
||||
}
|
||||
})();
|
||||
}
|
||||
|
|
@ -0,0 +1,4 @@
|
|||
export {
|
||||
getRealtimeTranscriptionProvider,
|
||||
listRealtimeTranscriptionProviders,
|
||||
} from "openclaw/plugin-sdk/realtime-transcription";
|
||||
|
|
@ -0,0 +1,4 @@
|
|||
export {
|
||||
getRealtimeVoiceProvider,
|
||||
listRealtimeVoiceProviders,
|
||||
} from "openclaw/plugin-sdk/realtime-voice";
|
||||
|
|
@ -1,12 +1,14 @@
|
|||
import type { OpenClawConfig } from "openclaw/plugin-sdk/core";
|
||||
import type {
|
||||
RealtimeVoiceProviderConfig,
|
||||
RealtimeVoiceProviderPlugin,
|
||||
} from "openclaw/plugin-sdk/realtime-voice";
|
||||
import type { VoiceCallConfig } from "./config.js";
|
||||
import { resolveVoiceCallConfig, validateProviderConfig } from "./config.js";
|
||||
import type { CoreAgentDeps, CoreConfig } from "./core-bridge.js";
|
||||
import { CallManager } from "./manager.js";
|
||||
import type { VoiceCallProvider } from "./providers/base.js";
|
||||
import { MockProvider } from "./providers/mock.js";
|
||||
import { PlivoProvider } from "./providers/plivo.js";
|
||||
import { TelnyxProvider } from "./providers/telnyx.js";
|
||||
import { TwilioProvider } from "./providers/twilio.js";
|
||||
import type { TwilioProvider } from "./providers/twilio.js";
|
||||
import type { TelephonyTtsRuntime } from "./telephony-tts.js";
|
||||
import { createTelephonyTtsProvider } from "./telephony-tts.js";
|
||||
import { startTunnel, type TunnelResult } from "./tunnel.js";
|
||||
|
|
@ -30,6 +32,11 @@ type Logger = {
|
|||
debug?: (message: string) => void;
|
||||
};
|
||||
|
||||
type ResolvedRealtimeProvider = {
|
||||
provider: RealtimeVoiceProviderPlugin;
|
||||
providerConfig: RealtimeVoiceProviderConfig;
|
||||
};
|
||||
|
||||
function createRuntimeResourceLifecycle(params: {
|
||||
config: VoiceCallConfig;
|
||||
webhookServer: VoiceCallWebhookServer;
|
||||
|
|
@ -80,14 +87,15 @@ function isLoopbackBind(bind: string | undefined): boolean {
|
|||
return bind === "127.0.0.1" || bind === "::1" || bind === "localhost";
|
||||
}
|
||||
|
||||
function resolveProvider(config: VoiceCallConfig): VoiceCallProvider {
|
||||
async function resolveProvider(config: VoiceCallConfig): Promise<VoiceCallProvider> {
|
||||
const allowNgrokFreeTierLoopbackBypass =
|
||||
config.tunnel?.provider === "ngrok" &&
|
||||
isLoopbackBind(config.serve?.bind) &&
|
||||
(config.tunnel?.allowNgrokFreeTierLoopbackBypass ?? false);
|
||||
|
||||
switch (config.provider) {
|
||||
case "telnyx":
|
||||
case "telnyx": {
|
||||
const { TelnyxProvider } = await import("./providers/telnyx.js");
|
||||
return new TelnyxProvider(
|
||||
{
|
||||
apiKey: config.telnyx?.apiKey,
|
||||
|
|
@ -98,7 +106,9 @@ function resolveProvider(config: VoiceCallConfig): VoiceCallProvider {
|
|||
skipVerification: config.skipSignatureVerification,
|
||||
},
|
||||
);
|
||||
case "twilio":
|
||||
}
|
||||
case "twilio": {
|
||||
const { TwilioProvider } = await import("./providers/twilio.js");
|
||||
return new TwilioProvider(
|
||||
{
|
||||
accountSid: config.twilio?.accountSid,
|
||||
|
|
@ -112,7 +122,9 @@ function resolveProvider(config: VoiceCallConfig): VoiceCallProvider {
|
|||
webhookSecurity: config.webhookSecurity,
|
||||
},
|
||||
);
|
||||
case "plivo":
|
||||
}
|
||||
case "plivo": {
|
||||
const { PlivoProvider } = await import("./providers/plivo.js");
|
||||
return new PlivoProvider(
|
||||
{
|
||||
authId: config.plivo?.authId,
|
||||
|
|
@ -125,21 +137,66 @@ function resolveProvider(config: VoiceCallConfig): VoiceCallProvider {
|
|||
webhookSecurity: config.webhookSecurity,
|
||||
},
|
||||
);
|
||||
case "mock":
|
||||
}
|
||||
case "mock": {
|
||||
const { MockProvider } = await import("./providers/mock.js");
|
||||
return new MockProvider();
|
||||
}
|
||||
default:
|
||||
throw new Error(`Unsupported voice-call provider: ${String(config.provider)}`);
|
||||
}
|
||||
}
|
||||
|
||||
async function resolveRealtimeProvider(params: {
|
||||
config: VoiceCallConfig;
|
||||
fullConfig: OpenClawConfig;
|
||||
}): Promise<ResolvedRealtimeProvider> {
|
||||
const { getRealtimeVoiceProvider, listRealtimeVoiceProviders } =
|
||||
await import("./realtime-voice.runtime.js");
|
||||
const configuredProviderId = params.config.realtime.provider?.trim();
|
||||
const configuredProvider = getRealtimeVoiceProvider(configuredProviderId, params.fullConfig);
|
||||
if (configuredProviderId && !configuredProvider) {
|
||||
throw new Error(`Realtime voice provider "${configuredProviderId}" is not registered`);
|
||||
}
|
||||
const provider =
|
||||
configuredProvider ??
|
||||
[...listRealtimeVoiceProviders(params.fullConfig)].sort(
|
||||
(left, right) =>
|
||||
(left.autoSelectOrder ?? Number.MAX_SAFE_INTEGER) -
|
||||
(right.autoSelectOrder ?? Number.MAX_SAFE_INTEGER),
|
||||
)[0];
|
||||
if (!provider) {
|
||||
throw new Error("No realtime voice provider registered");
|
||||
}
|
||||
|
||||
const rawProviderConfig =
|
||||
(params.config.realtime.providers?.[provider.id] as RealtimeVoiceProviderConfig | undefined) ??
|
||||
{};
|
||||
const providerConfig =
|
||||
provider.resolveConfig?.({
|
||||
cfg: params.fullConfig,
|
||||
rawConfig: {
|
||||
providers: params.config.realtime.providers,
|
||||
[provider.id]: rawProviderConfig,
|
||||
},
|
||||
}) ?? rawProviderConfig;
|
||||
|
||||
if (!provider.isConfigured({ cfg: params.fullConfig, providerConfig })) {
|
||||
throw new Error(`Realtime voice provider "${provider.id}" is not configured`);
|
||||
}
|
||||
|
||||
return { provider, providerConfig };
|
||||
}
|
||||
|
||||
export async function createVoiceCallRuntime(params: {
|
||||
config: VoiceCallConfig;
|
||||
coreConfig: CoreConfig;
|
||||
fullConfig?: OpenClawConfig;
|
||||
agentRuntime: CoreAgentDeps;
|
||||
ttsRuntime?: TelephonyTtsRuntime;
|
||||
logger?: Logger;
|
||||
}): Promise<VoiceCallRuntime> {
|
||||
const { config: rawConfig, coreConfig, agentRuntime, ttsRuntime, logger } = params;
|
||||
const { config: rawConfig, coreConfig, fullConfig, agentRuntime, ttsRuntime, logger } = params;
|
||||
const log = logger ?? {
|
||||
info: console.log,
|
||||
warn: console.warn,
|
||||
|
|
@ -164,8 +221,14 @@ export async function createVoiceCallRuntime(params: {
|
|||
throw new Error(`Invalid voice-call config: ${validation.errors.join("; ")}`);
|
||||
}
|
||||
|
||||
const provider = resolveProvider(config);
|
||||
const provider = await resolveProvider(config);
|
||||
const manager = new CallManager(config);
|
||||
const realtimeProvider = config.realtime.enabled
|
||||
? await resolveRealtimeProvider({
|
||||
config,
|
||||
fullConfig: (fullConfig ?? (coreConfig as OpenClawConfig)) as OpenClawConfig,
|
||||
})
|
||||
: null;
|
||||
const webhookServer = new VoiceCallWebhookServer(
|
||||
config,
|
||||
manager,
|
||||
|
|
@ -173,6 +236,19 @@ export async function createVoiceCallRuntime(params: {
|
|||
coreConfig,
|
||||
agentRuntime,
|
||||
);
|
||||
if (realtimeProvider) {
|
||||
const { RealtimeCallHandler } = await import("./webhook/realtime-handler.js");
|
||||
webhookServer.setRealtimeHandler(
|
||||
new RealtimeCallHandler(
|
||||
config.realtime,
|
||||
manager,
|
||||
provider,
|
||||
realtimeProvider.provider,
|
||||
realtimeProvider.providerConfig,
|
||||
config.serve.path,
|
||||
),
|
||||
);
|
||||
}
|
||||
const lifecycle = createRuntimeResourceLifecycle({ config, webhookServer });
|
||||
|
||||
const localUrl = await webhookServer.start();
|
||||
|
|
@ -212,6 +288,9 @@ export async function createVoiceCallRuntime(params: {
|
|||
if (publicUrl && provider.name === "twilio") {
|
||||
(provider as TwilioProvider).setPublicUrl(publicUrl);
|
||||
}
|
||||
if (publicUrl && realtimeProvider) {
|
||||
webhookServer.getRealtimeHandler()?.setPublicUrl(publicUrl);
|
||||
}
|
||||
|
||||
if (provider.name === "twilio" && config.streaming?.enabled) {
|
||||
const twilioProvider = provider as TwilioProvider;
|
||||
|
|
@ -243,6 +322,10 @@ export async function createVoiceCallRuntime(params: {
|
|||
}
|
||||
}
|
||||
|
||||
if (realtimeProvider) {
|
||||
log.info(`[voice-call] Realtime voice provider: ${realtimeProvider.provider.id}`);
|
||||
}
|
||||
|
||||
await manager.initialize(provider, webhookUrl);
|
||||
|
||||
const stop = async () => await lifecycle.stop();
|
||||
|
|
|
|||
|
|
@ -30,16 +30,26 @@ export function createVoiceCallBaseConfig(params?: {
|
|||
},
|
||||
streaming: {
|
||||
enabled: false,
|
||||
sttProvider: "openai-realtime",
|
||||
sttModel: "gpt-4o-transcribe",
|
||||
silenceDurationMs: 800,
|
||||
vadThreshold: 0.5,
|
||||
provider: "openai",
|
||||
providers: {
|
||||
openai: {
|
||||
model: "gpt-4o-transcribe",
|
||||
silenceDurationMs: 800,
|
||||
vadThreshold: 0.5,
|
||||
},
|
||||
},
|
||||
streamPath: "/voice/stream",
|
||||
preStartTimeoutMs: 5000,
|
||||
maxPendingConnections: 32,
|
||||
maxPendingConnectionsPerIp: 4,
|
||||
maxConnections: 128,
|
||||
},
|
||||
realtime: {
|
||||
enabled: false,
|
||||
streamPath: "/voice/stream/realtime",
|
||||
tools: [],
|
||||
providers: {},
|
||||
},
|
||||
skipSignatureVerification: false,
|
||||
stt: { provider: "openai", model: "whisper-1" },
|
||||
tts: {
|
||||
|
|
|
|||
|
|
@ -1,10 +1,36 @@
|
|||
import { request } from "node:http";
|
||||
import type { RealtimeTranscriptionProviderPlugin } from "openclaw/plugin-sdk/realtime-transcription";
|
||||
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
|
||||
import { VoiceCallConfigSchema, type VoiceCallConfig } from "./config.js";
|
||||
import type { CallManager } from "./manager.js";
|
||||
import type { VoiceCallProvider } from "./providers/base.js";
|
||||
import type { CallRecord, NormalizedEvent } from "./types.js";
|
||||
import { VoiceCallWebhookServer } from "./webhook.js";
|
||||
import type { RealtimeCallHandler } from "./webhook/realtime-handler.js";
|
||||
|
||||
const mocks = vi.hoisted(() => {
|
||||
const realtimeTranscriptionProvider: RealtimeTranscriptionProviderPlugin = {
|
||||
id: "openai",
|
||||
label: "OpenAI",
|
||||
aliases: ["openai-realtime"],
|
||||
isConfigured: () => true,
|
||||
resolveConfig: ({ rawConfig }) => rawConfig,
|
||||
createSession: () => ({
|
||||
connect: async () => {},
|
||||
sendAudio: () => {},
|
||||
close: () => {},
|
||||
isConnected: () => true,
|
||||
}),
|
||||
};
|
||||
|
||||
return {
|
||||
getRealtimeTranscriptionProvider: vi.fn(() => realtimeTranscriptionProvider),
|
||||
};
|
||||
});
|
||||
|
||||
vi.mock("./realtime-transcription.runtime.js", () => ({
|
||||
getRealtimeTranscriptionProvider: mocks.getRealtimeTranscriptionProvider,
|
||||
}));
|
||||
|
||||
const provider: VoiceCallProvider = {
|
||||
name: "mock",
|
||||
|
|
@ -291,6 +317,56 @@ describe("VoiceCallWebhookServer replay handling", () => {
|
|||
}
|
||||
});
|
||||
|
||||
it("returns realtime TwiML for replayed inbound twilio webhooks", async () => {
|
||||
const parseWebhookEvent = vi.fn(() => ({ events: [], statusCode: 200 }));
|
||||
const twilioProvider: VoiceCallProvider = {
|
||||
...provider,
|
||||
name: "twilio",
|
||||
verifyWebhook: () => ({ ok: true, isReplay: true, verifiedRequestKey: "twilio:req:replay" }),
|
||||
parseWebhookEvent,
|
||||
};
|
||||
const { manager, processEvent } = createManager([]);
|
||||
const config = createConfig({
|
||||
provider: "twilio",
|
||||
inboundPolicy: "allowlist",
|
||||
realtime: {
|
||||
enabled: true,
|
||||
streamPath: "/voice/stream/realtime",
|
||||
tools: [],
|
||||
providers: {},
|
||||
},
|
||||
});
|
||||
const server = new VoiceCallWebhookServer(config, manager, twilioProvider);
|
||||
server.setRealtimeHandler({
|
||||
buildTwiMLPayload: () => ({
|
||||
statusCode: 200,
|
||||
headers: { "Content-Type": "text/xml" },
|
||||
body: '<Response><Connect><Stream url="wss://example.test/voice/stream/realtime/token" /></Connect></Response>',
|
||||
}),
|
||||
getStreamPathPattern: () => "/voice/stream/realtime",
|
||||
handleWebSocketUpgrade: () => {},
|
||||
registerToolHandler: () => {},
|
||||
setPublicUrl: () => {},
|
||||
} as unknown as RealtimeCallHandler);
|
||||
|
||||
try {
|
||||
const baseUrl = await server.start();
|
||||
const response = await postWebhookFormWithHeaders(
|
||||
server,
|
||||
baseUrl,
|
||||
"CallSid=CA123&Direction=inbound&CallStatus=ringing",
|
||||
{ "x-twilio-signature": "sig" },
|
||||
);
|
||||
|
||||
expect(response.status).toBe(200);
|
||||
expect(await response.text()).toContain("<Connect><Stream");
|
||||
expect(parseWebhookEvent).not.toHaveBeenCalled();
|
||||
expect(processEvent).not.toHaveBeenCalled();
|
||||
} finally {
|
||||
await server.stop();
|
||||
}
|
||||
});
|
||||
|
||||
it("passes verified request key from verifyWebhook into parseWebhookEvent", async () => {
|
||||
const parseWebhookEvent = vi.fn((_ctx: unknown, options?: { verifiedRequestKey?: string }) => ({
|
||||
events: [
|
||||
|
|
@ -625,6 +701,7 @@ describe("VoiceCallWebhookServer stream disconnect grace", () => {
|
|||
manager,
|
||||
twilioProvider as unknown as VoiceCallProvider,
|
||||
);
|
||||
await server.start();
|
||||
|
||||
const mediaHandler = server.getMediaStreamHandler() as unknown as {
|
||||
config: {
|
||||
|
|
@ -717,6 +794,7 @@ describe("VoiceCallWebhookServer barge-in suppression during initial message", (
|
|||
manager,
|
||||
createTwilioProvider(clearTtsQueue) as unknown as VoiceCallProvider,
|
||||
);
|
||||
await server.start();
|
||||
const handleInboundResponse = vi.fn(async () => {});
|
||||
(
|
||||
server as unknown as {
|
||||
|
|
@ -790,6 +868,7 @@ describe("VoiceCallWebhookServer barge-in suppression during initial message", (
|
|||
manager,
|
||||
createTwilioProvider(clearTtsQueue) as unknown as VoiceCallProvider,
|
||||
);
|
||||
await server.start();
|
||||
|
||||
try {
|
||||
const media = getMediaCallbacks(server);
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
import http from "node:http";
|
||||
import { URL } from "node:url";
|
||||
import type { OpenClawConfig } from "openclaw/plugin-sdk/core";
|
||||
import {
|
||||
createWebhookInFlightLimiter,
|
||||
WEBHOOK_BODY_READ_DEFAULTS,
|
||||
|
|
@ -16,9 +17,10 @@ import type { CallManager } from "./manager.js";
|
|||
import type { MediaStreamConfig } from "./media-stream.js";
|
||||
import { MediaStreamHandler } from "./media-stream.js";
|
||||
import type { VoiceCallProvider } from "./providers/base.js";
|
||||
import { OpenAIRealtimeSTTProvider } from "./providers/stt-openai-realtime.js";
|
||||
import { isProviderStatusTerminal } from "./providers/shared/call-status.js";
|
||||
import type { TwilioProvider } from "./providers/twilio.js";
|
||||
import type { CallRecord, NormalizedEvent, WebhookContext } from "./types.js";
|
||||
import type { RealtimeCallHandler } from "./webhook/realtime-handler.js";
|
||||
import { startStaleCallReaper } from "./webhook/stale-call-reaper.js";
|
||||
|
||||
const MAX_WEBHOOK_BODY_BYTES = WEBHOOK_BODY_READ_DEFAULTS.preAuth.maxBytes;
|
||||
|
|
@ -44,7 +46,7 @@ function sanitizeTranscriptForLog(value: string): string {
|
|||
return `${sanitized.slice(0, TRANSCRIPT_LOG_MAX_CHARS)}...`;
|
||||
}
|
||||
|
||||
type WebhookResponsePayload = {
|
||||
export type WebhookResponsePayload = {
|
||||
statusCode: number;
|
||||
body: string;
|
||||
headers?: Record<string, string>;
|
||||
|
|
@ -89,6 +91,8 @@ export class VoiceCallWebhookServer {
|
|||
private mediaStreamHandler: MediaStreamHandler | null = null;
|
||||
/** Delayed auto-hangup timers keyed by provider call ID after stream disconnect. */
|
||||
private pendingDisconnectHangups = new Map<string, ReturnType<typeof setTimeout>>();
|
||||
/** Realtime voice handler for duplex provider bridges. */
|
||||
private realtimeHandler: RealtimeCallHandler | null = null;
|
||||
|
||||
constructor(
|
||||
config: VoiceCallConfig,
|
||||
|
|
@ -102,11 +106,6 @@ export class VoiceCallWebhookServer {
|
|||
this.provider = provider;
|
||||
this.coreConfig = coreConfig ?? null;
|
||||
this.agentRuntime = agentRuntime ?? null;
|
||||
|
||||
// Initialize media stream handler if streaming is enabled
|
||||
if (this.config.streaming.enabled) {
|
||||
this.initializeMediaStreaming();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -116,6 +115,14 @@ export class VoiceCallWebhookServer {
|
|||
return this.mediaStreamHandler;
|
||||
}
|
||||
|
||||
getRealtimeHandler(): RealtimeCallHandler | null {
|
||||
return this.realtimeHandler;
|
||||
}
|
||||
|
||||
setRealtimeHandler(handler: RealtimeCallHandler): void {
|
||||
this.realtimeHandler = handler;
|
||||
}
|
||||
|
||||
private clearPendingDisconnectHangup(providerCallId: string): void {
|
||||
const existing = this.pendingDisconnectHangups.get(providerCallId);
|
||||
if (!existing) {
|
||||
|
|
@ -147,26 +154,50 @@ export class VoiceCallWebhookServer {
|
|||
}
|
||||
|
||||
/**
|
||||
* Initialize media streaming with OpenAI Realtime STT.
|
||||
* Initialize media streaming with the selected realtime transcription provider.
|
||||
*/
|
||||
private initializeMediaStreaming(): void {
|
||||
private async initializeMediaStreaming(): Promise<void> {
|
||||
const streaming = this.config.streaming;
|
||||
const apiKey = streaming.openaiApiKey ?? process.env.OPENAI_API_KEY;
|
||||
|
||||
if (!apiKey) {
|
||||
console.warn("[voice-call] Streaming enabled but no OpenAI API key found");
|
||||
const selectedProviderId = streaming.provider;
|
||||
const pluginConfig = this.coreConfig as unknown as OpenClawConfig | undefined;
|
||||
const { getRealtimeTranscriptionProvider } =
|
||||
await import("./realtime-transcription.runtime.js");
|
||||
const provider = getRealtimeTranscriptionProvider(selectedProviderId, pluginConfig);
|
||||
if (!provider) {
|
||||
console.warn(
|
||||
`[voice-call] Streaming enabled but realtime transcription provider "${selectedProviderId}" is not registered`,
|
||||
);
|
||||
return;
|
||||
}
|
||||
const selectedProviderConfig =
|
||||
streaming.providers[selectedProviderId] &&
|
||||
typeof streaming.providers[selectedProviderId] === "object"
|
||||
? (streaming.providers[selectedProviderId] as Record<string, unknown>)
|
||||
: undefined;
|
||||
const canonicalProviderConfig =
|
||||
streaming.providers[provider.id] && typeof streaming.providers[provider.id] === "object"
|
||||
? (streaming.providers[provider.id] as Record<string, unknown>)
|
||||
: undefined;
|
||||
const rawProviderConfig = {
|
||||
...(canonicalProviderConfig ?? {}),
|
||||
...(selectedProviderConfig ?? {}),
|
||||
};
|
||||
const providerConfig = provider.resolveConfig
|
||||
? provider.resolveConfig({
|
||||
cfg: pluginConfig ?? ({} as OpenClawConfig),
|
||||
rawConfig: rawProviderConfig,
|
||||
})
|
||||
: rawProviderConfig;
|
||||
if (!provider.isConfigured({ cfg: pluginConfig, providerConfig })) {
|
||||
console.warn(
|
||||
`[voice-call] Streaming enabled but provider "${provider.id}" is not configured`,
|
||||
);
|
||||
return;
|
||||
}
|
||||
|
||||
const sttProvider = new OpenAIRealtimeSTTProvider({
|
||||
apiKey,
|
||||
model: streaming.sttModel,
|
||||
silenceDurationMs: streaming.silenceDurationMs,
|
||||
vadThreshold: streaming.vadThreshold,
|
||||
});
|
||||
|
||||
const streamConfig: MediaStreamConfig = {
|
||||
sttProvider,
|
||||
transcriptionProvider: provider,
|
||||
providerConfig,
|
||||
preStartTimeoutMs: streaming.preStartTimeoutMs,
|
||||
maxPendingConnections: streaming.maxPendingConnections,
|
||||
maxPendingConnectionsPerIp: streaming.maxPendingConnectionsPerIp,
|
||||
|
|
@ -309,6 +340,10 @@ export class VoiceCallWebhookServer {
|
|||
return this.listeningUrl ?? this.resolveListeningUrl(bind, webhookPath);
|
||||
}
|
||||
|
||||
if (this.config.streaming.enabled && !this.mediaStreamHandler) {
|
||||
await this.initializeMediaStreaming();
|
||||
}
|
||||
|
||||
return new Promise((resolve, reject) => {
|
||||
this.server = http.createServer((req, res) => {
|
||||
this.handleRequest(req, res, webhookPath).catch((err) => {
|
||||
|
|
@ -318,12 +353,15 @@ export class VoiceCallWebhookServer {
|
|||
});
|
||||
});
|
||||
|
||||
// Handle WebSocket upgrades for media streams
|
||||
if (this.mediaStreamHandler) {
|
||||
// Handle WebSocket upgrades for realtime voice and media streams.
|
||||
if (this.realtimeHandler || this.mediaStreamHandler) {
|
||||
this.server.on("upgrade", (request, socket, head) => {
|
||||
if (this.realtimeHandler && this.isRealtimeWebSocketUpgrade(request)) {
|
||||
this.realtimeHandler.handleWebSocketUpgrade(request, socket, head);
|
||||
return;
|
||||
}
|
||||
const path = this.getUpgradePathname(request);
|
||||
if (path === streamPath) {
|
||||
console.log("[voice-call] WebSocket upgrade for media stream");
|
||||
if (path === streamPath && this.mediaStreamHandler) {
|
||||
this.mediaStreamHandler?.handleUpgrade(request, socket, head);
|
||||
} else {
|
||||
socket.destroy();
|
||||
|
|
@ -504,6 +542,10 @@ export class VoiceCallWebhookServer {
|
|||
return { statusCode: 401, body: "Unauthorized" };
|
||||
}
|
||||
|
||||
if (this.shouldShortCircuitToRealtimeTwiml(ctx)) {
|
||||
return this.realtimeHandler!.buildTwiMLPayload(req, new URLSearchParams(ctx.rawBody));
|
||||
}
|
||||
|
||||
const parsed = this.provider.parseWebhookEvent(ctx, {
|
||||
verifiedRequestKey: verification.verifiedRequestKey,
|
||||
});
|
||||
|
|
@ -555,6 +597,42 @@ export class VoiceCallWebhookServer {
|
|||
}
|
||||
}
|
||||
|
||||
private isRealtimeWebSocketUpgrade(req: http.IncomingMessage): boolean {
|
||||
try {
|
||||
const pathname = buildRequestUrl(req.url, req.headers.host).pathname;
|
||||
const pattern = this.realtimeHandler?.getStreamPathPattern();
|
||||
return Boolean(pattern && pathname.startsWith(pattern));
|
||||
} catch {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
private shouldShortCircuitToRealtimeTwiml(ctx: WebhookContext): boolean {
|
||||
if (!this.realtimeHandler || this.provider.name !== "twilio") {
|
||||
return false;
|
||||
}
|
||||
|
||||
const params = new URLSearchParams(ctx.rawBody);
|
||||
const direction = params.get("Direction");
|
||||
const isInbound = !direction || direction === "inbound";
|
||||
if (!isInbound) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (ctx.query?.type === "status") {
|
||||
return false;
|
||||
}
|
||||
|
||||
const callStatus = params.get("CallStatus");
|
||||
if (callStatus && isProviderStatusTerminal(callStatus)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Replays must return the same TwiML body so Twilio retries reconnect cleanly.
|
||||
// The one-time token still changes, but the behavior stays identical.
|
||||
return !params.get("SpeechResult") && !params.get("Digits");
|
||||
}
|
||||
|
||||
private processParsedEvents(events: NormalizedEvent[]): void {
|
||||
for (const event of events) {
|
||||
try {
|
||||
|
|
|
|||
|
|
@ -0,0 +1,92 @@
|
|||
import http from "node:http";
|
||||
import type {
|
||||
RealtimeVoiceBridge,
|
||||
RealtimeVoiceProviderPlugin,
|
||||
} from "openclaw/plugin-sdk/realtime-voice";
|
||||
import { describe, expect, it, vi } from "vitest";
|
||||
import type { VoiceCallRealtimeConfig } from "../config.js";
|
||||
import type { CallManager } from "../manager.js";
|
||||
import type { VoiceCallProvider } from "../providers/base.js";
|
||||
import { RealtimeCallHandler } from "./realtime-handler.js";
|
||||
|
||||
function makeRequest(url: string, host = "gateway.ts.net"): http.IncomingMessage {
|
||||
const req = new http.IncomingMessage(null as never);
|
||||
req.url = url;
|
||||
req.method = "POST";
|
||||
req.headers = host ? { host } : {};
|
||||
return req;
|
||||
}
|
||||
|
||||
function makeBridge(): RealtimeVoiceBridge {
|
||||
return {
|
||||
connect: async () => {},
|
||||
sendAudio: () => {},
|
||||
setMediaTimestamp: () => {},
|
||||
submitToolResult: () => {},
|
||||
acknowledgeMark: () => {},
|
||||
close: () => {},
|
||||
isConnected: () => true,
|
||||
triggerGreeting: () => {},
|
||||
};
|
||||
}
|
||||
|
||||
const realtimeProvider: RealtimeVoiceProviderPlugin = {
|
||||
id: "openai",
|
||||
label: "OpenAI",
|
||||
isConfigured: () => true,
|
||||
createBridge: () => makeBridge(),
|
||||
};
|
||||
|
||||
function makeHandler(overrides?: Partial<VoiceCallRealtimeConfig>) {
|
||||
return new RealtimeCallHandler(
|
||||
{
|
||||
enabled: true,
|
||||
streamPath: "/voice/stream/realtime",
|
||||
instructions: "Be helpful.",
|
||||
tools: [],
|
||||
providers: {},
|
||||
...overrides,
|
||||
},
|
||||
{
|
||||
processEvent: vi.fn(),
|
||||
getCallByProviderCallId: vi.fn(),
|
||||
} as unknown as CallManager,
|
||||
{
|
||||
name: "twilio",
|
||||
verifyWebhook: vi.fn(),
|
||||
parseWebhookEvent: vi.fn(),
|
||||
initiateCall: vi.fn(),
|
||||
hangupCall: vi.fn(),
|
||||
playTts: vi.fn(),
|
||||
startListening: vi.fn(),
|
||||
stopListening: vi.fn(),
|
||||
getCallStatus: vi.fn(),
|
||||
} as unknown as VoiceCallProvider,
|
||||
realtimeProvider,
|
||||
{ apiKey: "test-key" },
|
||||
"/voice/webhook",
|
||||
);
|
||||
}
|
||||
|
||||
describe("RealtimeCallHandler path routing", () => {
|
||||
it("uses the request host and stream path in TwiML", () => {
|
||||
const handler = makeHandler();
|
||||
const payload = handler.buildTwiMLPayload(makeRequest("/voice/webhook", "gateway.ts.net"));
|
||||
|
||||
expect(payload.statusCode).toBe(200);
|
||||
expect(payload.body).toMatch(
|
||||
/wss:\/\/gateway\.ts\.net\/voice\/stream\/realtime\/[0-9a-f-]{36}/,
|
||||
);
|
||||
});
|
||||
|
||||
it("preserves a public path prefix ahead of serve.path", () => {
|
||||
const handler = makeHandler({ streamPath: "/custom/stream/realtime" });
|
||||
handler.setPublicUrl("https://public.example/api/voice/webhook");
|
||||
const payload = handler.buildTwiMLPayload(makeRequest("/voice/webhook", "127.0.0.1:3334"));
|
||||
|
||||
expect(handler.getStreamPathPattern()).toBe("/api/custom/stream/realtime");
|
||||
expect(payload.body).toMatch(
|
||||
/wss:\/\/public\.example\/api\/custom\/stream\/realtime\/[0-9a-f-]{36}/,
|
||||
);
|
||||
});
|
||||
});
|
||||
|
|
@ -0,0 +1,413 @@
|
|||
import { randomUUID } from "node:crypto";
|
||||
import http from "node:http";
|
||||
import type { Duplex } from "node:stream";
|
||||
import type {
|
||||
RealtimeVoiceBridge,
|
||||
RealtimeVoiceProviderConfig,
|
||||
RealtimeVoiceProviderPlugin,
|
||||
} from "openclaw/plugin-sdk/realtime-voice";
|
||||
import WebSocket, { WebSocketServer } from "ws";
|
||||
import type { VoiceCallRealtimeConfig } from "../config.js";
|
||||
import type { CallManager } from "../manager.js";
|
||||
import type { VoiceCallProvider } from "../providers/base.js";
|
||||
import type { CallRecord, NormalizedEvent } from "../types.js";
|
||||
import type { WebhookResponsePayload } from "../webhook.js";
|
||||
|
||||
export type ToolHandlerFn = (args: unknown, callId: string) => Promise<unknown>;
|
||||
|
||||
const STREAM_TOKEN_TTL_MS = 30_000;
|
||||
const DEFAULT_HOST = "localhost:8443";
|
||||
|
||||
function normalizePath(pathname: string): string {
|
||||
const trimmed = pathname.trim();
|
||||
if (!trimmed) {
|
||||
return "/";
|
||||
}
|
||||
const prefixed = trimmed.startsWith("/") ? trimmed : `/${trimmed}`;
|
||||
if (prefixed === "/") {
|
||||
return prefixed;
|
||||
}
|
||||
return prefixed.endsWith("/") ? prefixed.slice(0, -1) : prefixed;
|
||||
}
|
||||
|
||||
function buildGreetingInstructions(
|
||||
baseInstructions: string | undefined,
|
||||
greeting: string | undefined,
|
||||
): string | undefined {
|
||||
const trimmedGreeting = greeting?.trim();
|
||||
if (!trimmedGreeting) {
|
||||
return baseInstructions;
|
||||
}
|
||||
const intro =
|
||||
"Start the call by greeting the caller naturally. Include this greeting in your first spoken reply:";
|
||||
return baseInstructions
|
||||
? `${baseInstructions}\n\n${intro} "${trimmedGreeting}"`
|
||||
: `${intro} "${trimmedGreeting}"`;
|
||||
}
|
||||
|
||||
// Caller metadata carried by a one-shot stream token while it awaits the
// WebSocket upgrade. `expiry` is an absolute epoch-ms deadline.
type PendingStreamToken = {
  expiry: number;
  from?: string;
  to?: string;
  direction?: "inbound" | "outbound";
};

// Result of registering a stream with the CallManager: the manager-assigned
// call id plus optional instructions that make the bridge speak an initial
// greeting once it is ready.
type CallRegistration = {
  callId: string;
  initialGreetingInstructions?: string;
};
|
||||
|
||||
/**
 * Bridges Twilio-style media-stream WebSockets to a realtime voice provider
 * plugin and keeps the CallManager's view of each call in sync.
 *
 * Responsibilities visible in this class:
 * - mint short-lived one-shot tokens that authenticate stream upgrades;
 * - answer provider webhooks with TwiML pointing at the stream endpoint;
 * - translate media-stream frames (start/media/mark/stop) into bridge calls;
 * - forward transcripts, tool calls, and lifecycle events to the manager.
 */
export class RealtimeCallHandler {
  // Tool name -> handler; populated via registerToolHandler().
  private readonly toolHandlers = new Map<string, ToolHandlerFn>();
  // One-shot upgrade tokens keyed by UUID; entries expire after STREAM_TOKEN_TTL_MS.
  private readonly pendingStreamTokens = new Map<string, PendingStreamToken>();
  // Host portion of the public URL (set via setPublicUrl), or null when unknown.
  private publicOrigin: string | null = null;
  // Path prefix mounted in front of servePath on the public URL (e.g. a reverse proxy).
  private publicPathPrefix = "";

  constructor(
    private readonly config: VoiceCallRealtimeConfig,
    private readonly manager: CallManager,
    private readonly provider: VoiceCallProvider,
    private readonly realtimeProvider: RealtimeVoiceProviderPlugin,
    private readonly providerConfig: RealtimeVoiceProviderConfig,
    private readonly servePath: string,
  ) {}

  /**
   * Record the externally reachable URL. Extracts the host and any path
   * prefix in front of `servePath`; an unparseable URL resets both so
   * buildTwiMLPayload falls back to the request's Host header.
   */
  setPublicUrl(url: string): void {
    try {
      const parsed = new URL(url);
      this.publicOrigin = parsed.host;
      const normalizedServePath = normalizePath(this.servePath);
      const normalizedPublicPath = normalizePath(parsed.pathname);
      const idx = normalizedPublicPath.indexOf(normalizedServePath);
      // idx > 0 means the serve path is mounted under a prefix; idx === 0 or
      // "not found" both yield an empty prefix.
      this.publicPathPrefix = idx > 0 ? normalizedPublicPath.slice(0, idx) : "";
    } catch {
      this.publicOrigin = null;
      this.publicPathPrefix = "";
    }
  }

  /** Public WS path: prefix + configured streamPath (default /voice/stream/realtime). */
  getStreamPathPattern(): string {
    return `${this.publicPathPrefix}${normalizePath(this.config.streamPath ?? "/voice/stream/realtime")}`;
  }

  /**
   * Answer the provider's call webhook with TwiML that connects the call's
   * media stream to our WebSocket endpoint, minting a one-shot token that
   * carries the caller metadata into the upgrade.
   */
  buildTwiMLPayload(req: http.IncomingMessage, params?: URLSearchParams): WebhookResponsePayload {
    const host = this.publicOrigin || req.headers.host || DEFAULT_HOST;
    const rawDirection = params?.get("Direction");
    const token = this.issueStreamToken({
      from: params?.get("From") ?? undefined,
      to: params?.get("To") ?? undefined,
      // Twilio reports provider-initiated calls as "outbound-api".
      direction: rawDirection === "outbound-api" ? "outbound" : "inbound",
    });
    const wsUrl = `wss://${host}${this.getStreamPathPattern()}/${token}`;
    const twiml = `<?xml version="1.0" encoding="UTF-8"?>
<Response>
  <Connect>
    <Stream url="${wsUrl}" />
  </Connect>
</Response>`;
    return {
      statusCode: 200,
      headers: { "Content-Type": "text/xml" },
      body: twiml,
    };
  }

  /**
   * Handle an HTTP upgrade on the stream path. The trailing path segment must
   * be a valid, unexpired one-shot token, otherwise the socket is rejected
   * with 401. Media-stream frames are then routed to the provider bridge.
   *
   * NOTE(review): a fresh WebSocketServer is created per upgrade (noServer
   * mode) and never closed — appears harmless for noServer, but confirm it
   * does not accumulate state.
   */
  handleWebSocketUpgrade(request: http.IncomingMessage, socket: Duplex, head: Buffer): void {
    // Base only used to satisfy the URL parser; we only read the pathname.
    const url = new URL(request.url ?? "/", "wss://localhost");
    const token = url.pathname.split("/").pop() ?? null;
    const callerMeta = token ? this.consumeStreamToken(token) : null;
    if (!callerMeta) {
      socket.write("HTTP/1.1 401 Unauthorized\r\n\r\n");
      socket.destroy();
      return;
    }

    const wss = new WebSocketServer({ noServer: true });
    wss.handleUpgrade(request, socket, head, (ws) => {
      let bridge: RealtimeVoiceBridge | null = null;
      let initialized = false;

      ws.on("message", (data: Buffer) => {
        try {
          const msg = JSON.parse(data.toString()) as Record<string, unknown>;
          // The first "start" frame carries the stream/call SIDs and creates
          // the bridge; everything before that (or non-start frames) is ignored.
          if (!initialized && msg.event === "start") {
            initialized = true;
            const startData =
              typeof msg.start === "object" && msg.start !== null
                ? (msg.start as Record<string, unknown>)
                : undefined;
            const streamSid =
              typeof startData?.streamSid === "string" ? startData.streamSid : "unknown";
            const callSid = typeof startData?.callSid === "string" ? startData.callSid : "unknown";
            bridge = this.handleCall(streamSid, callSid, ws, callerMeta);
            return;
          }
          if (!bridge) {
            return;
          }
          const mediaData =
            typeof msg.media === "object" && msg.media !== null
              ? (msg.media as Record<string, unknown>)
              : undefined;
          if (msg.event === "media" && typeof mediaData?.payload === "string") {
            // Inbound audio is base64-encoded in the frame payload.
            bridge.sendAudio(Buffer.from(mediaData.payload, "base64"));
            if (typeof mediaData.timestamp === "number") {
              bridge.setMediaTimestamp(mediaData.timestamp);
            } else if (typeof mediaData.timestamp === "string") {
              bridge.setMediaTimestamp(Number.parseInt(mediaData.timestamp, 10));
            }
            return;
          }
          if (msg.event === "mark") {
            bridge.acknowledgeMark();
            return;
          }
          if (msg.event === "stop") {
            bridge.close();
          }
        } catch (error) {
          // Malformed frames are logged and dropped; the stream keeps going.
          console.error("[voice-call] realtime WS parse failed:", error);
        }
      });

      ws.on("close", () => {
        bridge?.close();
      });
    });
  }

  /** Register a named tool handler invoked when the bridge reports a tool call. */
  registerToolHandler(name: string, fn: ToolHandlerFn): void {
    this.toolHandlers.set(name, fn);
  }

  /**
   * Mint a one-shot upgrade token carrying the caller metadata, and sweep any
   * expired tokens while we are here.
   */
  private issueStreamToken(meta: Omit<PendingStreamToken, "expiry"> = {}): string {
    const token = randomUUID();
    this.pendingStreamTokens.set(token, { expiry: Date.now() + STREAM_TOKEN_TTL_MS, ...meta });
    for (const [candidate, entry] of this.pendingStreamTokens) {
      if (Date.now() > entry.expiry) {
        this.pendingStreamTokens.delete(candidate);
      }
    }
    return token;
  }

  /**
   * Redeem a token exactly once. Returns the caller metadata, or null when
   * the token is unknown or expired (it is deleted either way).
   */
  private consumeStreamToken(token: string): Omit<PendingStreamToken, "expiry"> | null {
    const entry = this.pendingStreamTokens.get(token);
    if (!entry) {
      return null;
    }
    this.pendingStreamTokens.delete(token);
    if (Date.now() > entry.expiry) {
      return null;
    }
    return {
      from: entry.from,
      to: entry.to,
      direction: entry.direction,
    };
  }

  /**
   * Register the call with the manager and wire a provider bridge to the
   * WebSocket: audio/clear/mark frames out, transcripts and tool calls into
   * the manager. Returns null (and closes the socket with 1008) when the
   * manager rejects the caller.
   */
  private handleCall(
    streamSid: string,
    callSid: string,
    ws: WebSocket,
    callerMeta: Omit<PendingStreamToken, "expiry">,
  ): RealtimeVoiceBridge | null {
    const registration = this.registerCallInManager(callSid, callerMeta);
    if (!registration) {
      ws.close(1008, "Caller rejected by policy");
      return null;
    }

    const { callId, initialGreetingInstructions } = registration;
    let bridge: RealtimeVoiceBridge | null = null;
    let callEndEmitted = false;
    // Guard so "call.ended" fires at most once across the error paths below.
    const emitCallEnd = (reason: "completed" | "error") => {
      if (callEndEmitted) {
        return;
      }
      callEndEmitted = true;
      this.endCallInManager(callSid, callId, reason);
    };

    bridge = this.realtimeProvider.createBridge({
      providerConfig: this.providerConfig,
      instructions: this.config.instructions,
      tools: this.config.tools,
      // Provider audio -> Twilio media frame (base64 payload).
      onAudio: (muLaw) => {
        if (ws.readyState !== WebSocket.OPEN) {
          return;
        }
        ws.send(
          JSON.stringify({
            event: "media",
            streamSid,
            media: { payload: muLaw.toString("base64") },
          }),
        );
      },
      onClearAudio: () => {
        if (ws.readyState !== WebSocket.OPEN) {
          return;
        }
        ws.send(JSON.stringify({ event: "clear", streamSid }));
      },
      onMark: (markName) => {
        if (ws.readyState !== WebSocket.OPEN) {
          return;
        }
        ws.send(JSON.stringify({ event: "mark", streamSid, mark: { name: markName } }));
      },
      // Only final transcripts are forwarded; user speech becomes
      // "call.speech", assistant speech becomes "call.speaking".
      onTranscript: (role, text, isFinal) => {
        if (!isFinal) {
          return;
        }
        if (role === "user") {
          const event: NormalizedEvent = {
            id: `realtime-speech-${callSid}-${Date.now()}`,
            type: "call.speech",
            callId,
            providerCallId: callSid,
            timestamp: Date.now(),
            transcript: text,
            isFinal: true,
          };
          this.manager.processEvent(event);
          return;
        }
        this.manager.processEvent({
          id: `realtime-bot-${callSid}-${Date.now()}`,
          type: "call.speaking",
          callId,
          providerCallId: callSid,
          timestamp: Date.now(),
          text,
        });
      },
      onToolCall: (toolEvent) => {
        if (!bridge) {
          return;
        }
        // Fire-and-forget: the handler's result is submitted back to the
        // bridge inside executeToolCall.
        void this.executeToolCall(
          bridge,
          callId,
          toolEvent.callId || toolEvent.itemId,
          toolEvent.name,
          toolEvent.args,
        );
      },
      onReady: () => {
        // triggerGreeting is optional on the bridge contract.
        bridge?.triggerGreeting?.(initialGreetingInstructions);
      },
      onError: (error) => {
        console.error("[voice-call] realtime voice error:", error.message);
      },
      // Only error-closes are acted on here; normal closes are handled by the
      // WS "stop"/"close" paths.
      onClose: (reason) => {
        if (reason !== "error") {
          return;
        }
        emitCallEnd("error");
        if (ws.readyState === WebSocket.OPEN) {
          ws.close(1011, "Bridge disconnected");
        }
        void this.provider
          .hangupCall({ callId, providerCallId: callSid, reason: "error" })
          .catch((error: unknown) => {
            console.warn(
              `[voice-call] Failed to hang up realtime call ${callSid}: ${
                error instanceof Error ? error.message : String(error)
              }`,
            );
          });
      },
    });

    bridge.connect().catch((error: Error) => {
      console.error("[voice-call] Failed to connect realtime bridge:", error);
      bridge?.close();
      emitCallEnd("error");
      ws.close(1011, "Failed to connect");
    });

    return bridge;
  }

  /**
   * Emit "call.initiated"/"call.answered" to the manager and look up the
   * record it created. A missing record is treated as a policy rejection.
   * Also consumes any one-time `metadata.initialMessage` as the greeting.
   */
  private registerCallInManager(
    callSid: string,
    callerMeta: Omit<PendingStreamToken, "expiry"> = {},
  ): CallRegistration | null {
    const timestamp = Date.now();
    const baseFields = {
      providerCallId: callSid,
      timestamp,
      direction: (callerMeta.direction ?? "inbound") as "inbound" | "outbound",
      ...(callerMeta.from ? { from: callerMeta.from } : {}),
      ...(callerMeta.to ? { to: callerMeta.to } : {}),
    };

    // Use the provider SID as the event callId here; the manager assigns the
    // canonical callId, which we read back from the record below.
    this.manager.processEvent({
      id: `realtime-initiated-${callSid}`,
      callId: callSid,
      type: "call.initiated",
      ...baseFields,
    });

    const callRecord = this.manager.getCallByProviderCallId(callSid);
    if (!callRecord) {
      return null;
    }

    const initialGreeting = this.extractInitialGreeting(callRecord);
    // Consume the greeting so it is only spoken once per call.
    if (callRecord.metadata) {
      delete callRecord.metadata.initialMessage;
    }

    this.manager.processEvent({
      id: `realtime-answered-${callSid}`,
      callId: callSid,
      type: "call.answered",
      ...baseFields,
    });

    return {
      callId: callRecord.callId,
      initialGreetingInstructions: buildGreetingInstructions(
        this.config.instructions,
        initialGreeting,
      ),
    };
  }

  /** Read `metadata.initialMessage` when present as a string. */
  private extractInitialGreeting(call: CallRecord): string | undefined {
    return typeof call.metadata?.initialMessage === "string"
      ? call.metadata.initialMessage
      : undefined;
  }

  /** Emit a "call.ended" event for the given call with the given reason. */
  private endCallInManager(callSid: string, callId: string, reason: "completed" | "error"): void {
    this.manager.processEvent({
      id: `realtime-ended-${callSid}-${Date.now()}`,
      type: "call.ended",
      callId,
      providerCallId: callSid,
      timestamp: Date.now(),
      reason,
    });
  }

  /**
   * Run a registered tool handler and submit its result (or an error object
   * for unknown tools / thrown errors) back to the bridge. Never throws.
   */
  private async executeToolCall(
    bridge: RealtimeVoiceBridge,
    callId: string,
    bridgeCallId: string,
    name: string,
    args: unknown,
  ): Promise<void> {
    const handler = this.toolHandlers.get(name);
    const result = !handler
      ? { error: `Tool "${name}" not available` }
      : await handler(args, callId).catch((error: unknown) => ({
          error: error instanceof Error ? error.message : String(error),
        }));
    bridge.submitToolResult(bridgeCallId, result);
  }
}
|
||||
|
|
@ -0,0 +1 @@
|
|||
export { zaiMediaUnderstandingProvider } from "./media-understanding-provider.js";
|
||||
|
|
@ -551,6 +551,14 @@
|
|||
"types": "./dist/plugin-sdk/reply-history.d.ts",
|
||||
"default": "./dist/plugin-sdk/reply-history.js"
|
||||
},
|
||||
"./plugin-sdk/realtime-voice": {
|
||||
"types": "./dist/plugin-sdk/realtime-voice.d.ts",
|
||||
"default": "./dist/plugin-sdk/realtime-voice.js"
|
||||
},
|
||||
"./plugin-sdk/realtime-transcription": {
|
||||
"types": "./dist/plugin-sdk/realtime-transcription.d.ts",
|
||||
"default": "./dist/plugin-sdk/realtime-transcription.js"
|
||||
},
|
||||
"./plugin-sdk/media-understanding": {
|
||||
"types": "./dist/plugin-sdk/media-understanding.d.ts",
|
||||
"default": "./dist/plugin-sdk/media-understanding.js"
|
||||
|
|
|
|||
|
|
@ -127,6 +127,8 @@
|
|||
"kimi-coding",
|
||||
"kilocode",
|
||||
"reply-history",
|
||||
"realtime-transcription",
|
||||
"realtime-voice",
|
||||
"media-understanding",
|
||||
"request-url",
|
||||
"runtime-store",
|
||||
|
|
|
|||
|
|
@ -1,7 +1,8 @@
|
|||
import { spawnSync } from "node:child_process";
|
||||
import { mkdirSync, readdirSync, readFileSync, writeFileSync } from "node:fs";
|
||||
import path from "node:path";
|
||||
import { fileURLToPath } from "node:url";
|
||||
import { renderRootHelpText } from "../src/cli/program/root-help.ts";
|
||||
import { fileURLToPath, pathToFileURL } from "node:url";
|
||||
import { renderRootHelpText as renderSourceRootHelpText } from "../src/cli/program/root-help.ts";
|
||||
|
||||
function dedupe(values: string[]): string[] {
|
||||
const seen = new Set<string>();
|
||||
|
|
@ -82,7 +83,37 @@ export function readBundledChannelCatalogIds(
|
|||
export async function renderBundledRootHelpText(
|
||||
_distDirOverride: string = distDir,
|
||||
): Promise<string> {
|
||||
return await renderRootHelpText({ pluginDescriptors: [] });
|
||||
const bundleName = readdirSync(distDirOverride).find(
|
||||
(entry) => entry.startsWith("root-help-") && entry.endsWith(".js"),
|
||||
);
|
||||
if (!bundleName) {
|
||||
throw new Error("No root-help bundle found in dist; cannot write CLI startup metadata.");
|
||||
}
|
||||
const moduleUrl = pathToFileURL(path.join(distDirOverride, bundleName)).href;
|
||||
const inlineModule = [
|
||||
`const mod = await import(${JSON.stringify(moduleUrl)});`,
|
||||
"if (typeof mod.outputRootHelp !== 'function') {",
|
||||
` throw new Error(${JSON.stringify(`Bundle ${bundleName} does not export outputRootHelp.`)});`,
|
||||
"}",
|
||||
"await mod.outputRootHelp();",
|
||||
"process.exit(0);",
|
||||
].join("\n");
|
||||
const result = spawnSync(process.execPath, ["--input-type=module", "--eval", inlineModule], {
|
||||
cwd: distDirOverride,
|
||||
encoding: "utf8",
|
||||
timeout: 30_000,
|
||||
});
|
||||
if (result.error) {
|
||||
throw result.error;
|
||||
}
|
||||
if (result.status !== 0) {
|
||||
const stderr = result.stderr?.trim();
|
||||
throw new Error(
|
||||
`Failed to render bundled root help from ${bundleName}` +
|
||||
(stderr ? `: ${stderr}` : result.signal ? `: terminated by ${result.signal}` : ""),
|
||||
);
|
||||
}
|
||||
return result.stdout ?? "";
|
||||
}
|
||||
|
||||
export async function writeCliStartupMetadata(options?: {
|
||||
|
|
@ -95,7 +126,13 @@ export async function writeCliStartupMetadata(options?: {
|
|||
const resolvedExtensionsDir = options?.extensionsDir ?? extensionsDir;
|
||||
const catalog = readBundledChannelCatalogIds(resolvedExtensionsDir);
|
||||
const channelOptions = dedupe([...CORE_CHANNEL_ORDER, ...catalog]);
|
||||
const rootHelpText = await renderBundledRootHelpText(resolvedDistDir);
|
||||
const useSourceRootHelp =
|
||||
resolvedDistDir === distDir &&
|
||||
resolvedOutputPath === outputPath &&
|
||||
resolvedExtensionsDir === extensionsDir;
|
||||
const rootHelpText = useSourceRootHelp
|
||||
? await renderSourceRootHelpText({ pluginSdkResolution: "src" })
|
||||
: await renderBundledRootHelpText(resolvedDistDir);
|
||||
|
||||
mkdirSync(resolvedDistDir, { recursive: true });
|
||||
writeFileSync(
|
||||
|
|
@ -115,4 +152,5 @@ export async function writeCliStartupMetadata(options?: {
|
|||
|
||||
if (process.argv[1] && path.resolve(process.argv[1]) === scriptPath) {
|
||||
await writeCliStartupMetadata();
|
||||
process.exit(0);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,16 +1,14 @@
|
|||
import { Command } from "commander";
|
||||
import { getPluginCliCommandDescriptors } from "../../plugins/cli.js";
|
||||
import type { OpenClawPluginCliCommandDescriptor } from "../../plugins/types.js";
|
||||
import type { PluginLoadOptions } from "../../plugins/loader.js";
|
||||
import { VERSION } from "../../version.js";
|
||||
import { getCoreCliCommandDescriptors } from "./core-command-descriptors.js";
|
||||
import { configureProgramHelp } from "./help.js";
|
||||
import { getSubCliEntries } from "./subcli-descriptors.js";
|
||||
|
||||
type RootHelpRenderOptions = {
|
||||
pluginDescriptors?: OpenClawPluginCliCommandDescriptor[] | null;
|
||||
};
|
||||
type RootHelpLoaderOptions = Pick<PluginLoadOptions, "pluginSdkResolution">;
|
||||
|
||||
async function buildRootHelpProgram(options?: RootHelpRenderOptions): Promise<Command> {
|
||||
async function buildRootHelpProgram(loaderOptions?: RootHelpLoaderOptions): Promise<Command> {
|
||||
const program = new Command();
|
||||
configureProgramHelp(program, {
|
||||
programVersion: VERSION,
|
||||
|
|
@ -31,11 +29,7 @@ async function buildRootHelpProgram(options?: RootHelpRenderOptions): Promise<Co
|
|||
program.command(command.name).description(command.description);
|
||||
existingCommands.add(command.name);
|
||||
}
|
||||
const pluginDescriptors =
|
||||
options && "pluginDescriptors" in options
|
||||
? (options.pluginDescriptors ?? [])
|
||||
: await getPluginCliCommandDescriptors();
|
||||
for (const command of pluginDescriptors) {
|
||||
for (const command of await getPluginCliCommandDescriptors(undefined, undefined, loaderOptions)) {
|
||||
if (existingCommands.has(command.name)) {
|
||||
continue;
|
||||
}
|
||||
|
|
@ -46,8 +40,8 @@ async function buildRootHelpProgram(options?: RootHelpRenderOptions): Promise<Co
|
|||
return program;
|
||||
}
|
||||
|
||||
export async function renderRootHelpText(options?: RootHelpRenderOptions): Promise<string> {
|
||||
const program = await buildRootHelpProgram(options);
|
||||
export async function renderRootHelpText(loaderOptions?: RootHelpLoaderOptions): Promise<string> {
|
||||
const program = await buildRootHelpProgram(loaderOptions);
|
||||
let output = "";
|
||||
const originalWrite = process.stdout.write.bind(process.stdout);
|
||||
const captureWrite: typeof process.stdout.write = ((chunk: string | Uint8Array) => {
|
||||
|
|
@ -63,6 +57,6 @@ export async function renderRootHelpText(options?: RootHelpRenderOptions): Promi
|
|||
return output;
|
||||
}
|
||||
|
||||
export async function outputRootHelp(options?: RootHelpRenderOptions): Promise<void> {
|
||||
process.stdout.write(await renderRootHelpText(options));
|
||||
export async function outputRootHelp(loaderOptions?: RootHelpLoaderOptions): Promise<void> {
|
||||
process.stdout.write(await renderRootHelpText(loaderOptions));
|
||||
}
|
||||
|
|
|
|||
|
|
@ -69,6 +69,8 @@ const createRegistry = (diagnostics: PluginDiagnostic[]): PluginRegistry => ({
|
|||
commands: [],
|
||||
providers: [],
|
||||
speechProviders: [],
|
||||
realtimeTranscriptionProviders: [],
|
||||
realtimeVoiceProviders: [],
|
||||
mediaUnderstandingProviders: [],
|
||||
imageGenerationProviders: [],
|
||||
webFetchProviders: [],
|
||||
|
|
|
|||
|
|
@ -201,6 +201,8 @@ const createStubPluginRegistry = (): PluginRegistry => ({
|
|||
}),
|
||||
},
|
||||
],
|
||||
realtimeTranscriptionProviders: [],
|
||||
realtimeVoiceProviders: [],
|
||||
mediaUnderstandingProviders: [],
|
||||
imageGenerationProviders: [],
|
||||
webFetchProviders: [],
|
||||
|
|
|
|||
|
|
@ -66,6 +66,7 @@ export type {
|
|||
ProviderReplaySessionState,
|
||||
ProviderResolveDynamicModelContext,
|
||||
ProviderResolvedUsageAuth,
|
||||
RealtimeTranscriptionProviderPlugin,
|
||||
ProviderSanitizeReplayHistoryContext,
|
||||
ProviderToolSchemaDiagnostic,
|
||||
ProviderResolveUsageAuthContext,
|
||||
|
|
|
|||
|
|
@ -51,6 +51,7 @@ export type {
|
|||
ProviderAuthContext,
|
||||
ProviderAuthResult,
|
||||
ProviderRuntimeModel,
|
||||
RealtimeTranscriptionProviderPlugin,
|
||||
SpeechProviderPlugin,
|
||||
} from "../plugins/types.js";
|
||||
export type {
|
||||
|
|
|
|||
|
|
@ -46,6 +46,7 @@ import type {
|
|||
ProviderReplayPolicyContext,
|
||||
ProviderReplaySessionEntry,
|
||||
ProviderReplaySessionState,
|
||||
RealtimeTranscriptionProviderPlugin,
|
||||
ProviderResolvedUsageAuth,
|
||||
ProviderResolveDynamicModelContext,
|
||||
ProviderSanitizeReplayHistoryContext,
|
||||
|
|
@ -102,6 +103,7 @@ export type {
|
|||
ProviderResolveDynamicModelContext,
|
||||
ProviderNormalizeResolvedModelContext,
|
||||
ProviderRuntimeModel,
|
||||
RealtimeTranscriptionProviderPlugin,
|
||||
SpeechProviderPlugin,
|
||||
ProviderThinkingPolicyContext,
|
||||
ProviderValidateReplayTurnsContext,
|
||||
|
|
|
|||
|
|
@ -0,0 +1,16 @@
|
|||
export type { RealtimeTranscriptionProviderPlugin } from "../plugins/types.js";
|
||||
export type {
|
||||
RealtimeTranscriptionProviderConfig,
|
||||
RealtimeTranscriptionProviderConfiguredContext,
|
||||
RealtimeTranscriptionProviderId,
|
||||
RealtimeTranscriptionProviderResolveConfigContext,
|
||||
RealtimeTranscriptionSession,
|
||||
RealtimeTranscriptionSessionCallbacks,
|
||||
RealtimeTranscriptionSessionCreateRequest,
|
||||
} from "../realtime-transcription/provider-types.js";
|
||||
export {
|
||||
canonicalizeRealtimeTranscriptionProviderId,
|
||||
getRealtimeTranscriptionProvider,
|
||||
listRealtimeTranscriptionProviders,
|
||||
normalizeRealtimeTranscriptionProviderId,
|
||||
} from "../realtime-transcription/provider-registry.js";
|
||||
|
|
@ -0,0 +1,20 @@
|
|||
export type { RealtimeVoiceProviderPlugin } from "../plugins/types.js";
|
||||
export type {
|
||||
RealtimeVoiceBridge,
|
||||
RealtimeVoiceBridgeCallbacks,
|
||||
RealtimeVoiceBridgeCreateRequest,
|
||||
RealtimeVoiceCloseReason,
|
||||
RealtimeVoiceProviderConfig,
|
||||
RealtimeVoiceProviderConfiguredContext,
|
||||
RealtimeVoiceProviderId,
|
||||
RealtimeVoiceProviderResolveConfigContext,
|
||||
RealtimeVoiceRole,
|
||||
RealtimeVoiceTool,
|
||||
RealtimeVoiceToolCallEvent,
|
||||
} from "../realtime-voice/provider-types.js";
|
||||
export {
|
||||
canonicalizeRealtimeVoiceProviderId,
|
||||
getRealtimeVoiceProvider,
|
||||
listRealtimeVoiceProviders,
|
||||
normalizeRealtimeVoiceProviderId,
|
||||
} from "../realtime-voice/provider-registry.js";
|
||||
|
|
@ -1,7 +1,12 @@
|
|||
import { rmSync } from "node:fs";
|
||||
import type { OpenClawConfig } from "../config/config.js";
|
||||
import type { ResolvedTtsConfig } from "../tts/tts.js";
|
||||
|
||||
// Public speech helpers for bundled or third-party plugins.
|
||||
//
|
||||
// Keep this surface neutral. Provider plugins should not need to know about the
|
||||
// bundled `speech-core` plugin id just to consume shared speech types/helpers.
|
||||
// Keep this surface neutral and import-light. Provider builders commonly import
|
||||
// this module just to get types and a few validation helpers, so avoid pulling
|
||||
// in the heavy TTS runtime graph at module load time.
|
||||
|
||||
export type { SpeechProviderPlugin } from "../plugins/types.js";
|
||||
export type {
|
||||
|
|
@ -22,14 +27,6 @@ export type {
|
|||
TtsDirectiveParseResult,
|
||||
} from "../tts/provider-types.js";
|
||||
|
||||
export {
|
||||
scheduleCleanup,
|
||||
summarizeText,
|
||||
normalizeApplyTextNormalization,
|
||||
normalizeLanguageCode,
|
||||
normalizeSeed,
|
||||
requireInRange,
|
||||
} from "../tts/tts-core.js";
|
||||
export { parseTtsDirectives } from "../tts/directives.js";
|
||||
export {
|
||||
canonicalizeSpeechProviderId,
|
||||
|
|
@ -44,3 +41,71 @@ export {
|
|||
trimToUndefined,
|
||||
truncateErrorDetail,
|
||||
} from "../tts/provider-error-utils.js";
|
||||
|
||||
const TEMP_FILE_CLEANUP_DELAY_MS = 5 * 60 * 1000; // 5 minutes
|
||||
|
||||
export function requireInRange(value: number, min: number, max: number, label: string): void {
|
||||
if (!Number.isFinite(value) || value < min || value > max) {
|
||||
throw new Error(`${label} must be between ${min} and ${max}`);
|
||||
}
|
||||
}
|
||||
|
||||
export function normalizeLanguageCode(code?: string): string | undefined {
|
||||
const trimmed = code?.trim();
|
||||
if (!trimmed) {
|
||||
return undefined;
|
||||
}
|
||||
const normalized = trimmed.toLowerCase();
|
||||
if (!/^[a-z]{2}$/.test(normalized)) {
|
||||
throw new Error("languageCode must be a 2-letter ISO 639-1 code (e.g. en, de, fr)");
|
||||
}
|
||||
return normalized;
|
||||
}
|
||||
|
||||
export function normalizeApplyTextNormalization(mode?: string): "auto" | "on" | "off" | undefined {
|
||||
const trimmed = mode?.trim();
|
||||
if (!trimmed) {
|
||||
return undefined;
|
||||
}
|
||||
const normalized = trimmed.toLowerCase();
|
||||
if (normalized === "auto" || normalized === "on" || normalized === "off") {
|
||||
return normalized;
|
||||
}
|
||||
throw new Error("applyTextNormalization must be one of: auto, on, off");
|
||||
}
|
||||
|
||||
export function normalizeSeed(seed?: number): number | undefined {
|
||||
if (seed == null) {
|
||||
return undefined;
|
||||
}
|
||||
const next = Math.floor(seed);
|
||||
if (!Number.isFinite(next) || next < 0 || next > 4_294_967_295) {
|
||||
throw new Error("seed must be between 0 and 4294967295");
|
||||
}
|
||||
return next;
|
||||
}
|
||||
|
||||
export function scheduleCleanup(
|
||||
tempDir: string,
|
||||
delayMs: number = TEMP_FILE_CLEANUP_DELAY_MS,
|
||||
): void {
|
||||
const timer = setTimeout(() => {
|
||||
try {
|
||||
rmSync(tempDir, { recursive: true, force: true });
|
||||
} catch {
|
||||
// ignore cleanup errors
|
||||
}
|
||||
}, delayMs);
|
||||
timer.unref();
|
||||
}
|
||||
|
||||
export async function summarizeText(params: {
|
||||
text: string;
|
||||
targetLength: number;
|
||||
cfg: OpenClawConfig;
|
||||
config: ResolvedTtsConfig;
|
||||
timeoutMs: number;
|
||||
}) {
|
||||
const { summarizeText: summarizeTextRuntime } = await import("../tts/tts-core.js");
|
||||
return summarizeTextRuntime(params);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -28,6 +28,8 @@ export type BuildPluginApiParams = {
|
|||
| "registerCliBackend"
|
||||
| "registerProvider"
|
||||
| "registerSpeechProvider"
|
||||
| "registerRealtimeTranscriptionProvider"
|
||||
| "registerRealtimeVoiceProvider"
|
||||
| "registerMediaUnderstandingProvider"
|
||||
| "registerImageGenerationProvider"
|
||||
| "registerWebFetchProvider"
|
||||
|
|
@ -55,6 +57,10 @@ const noopRegisterService: OpenClawPluginApi["registerService"] = () => {};
|
|||
const noopRegisterCliBackend: OpenClawPluginApi["registerCliBackend"] = () => {};
|
||||
const noopRegisterProvider: OpenClawPluginApi["registerProvider"] = () => {};
|
||||
const noopRegisterSpeechProvider: OpenClawPluginApi["registerSpeechProvider"] = () => {};
|
||||
const noopRegisterRealtimeTranscriptionProvider: OpenClawPluginApi["registerRealtimeTranscriptionProvider"] =
|
||||
() => {};
|
||||
const noopRegisterRealtimeVoiceProvider: OpenClawPluginApi["registerRealtimeVoiceProvider"] =
|
||||
() => {};
|
||||
const noopRegisterMediaUnderstandingProvider: OpenClawPluginApi["registerMediaUnderstandingProvider"] =
|
||||
() => {};
|
||||
const noopRegisterImageGenerationProvider: OpenClawPluginApi["registerImageGenerationProvider"] =
|
||||
|
|
@ -97,6 +103,10 @@ export function buildPluginApi(params: BuildPluginApiParams): OpenClawPluginApi
|
|||
registerCliBackend: handlers.registerCliBackend ?? noopRegisterCliBackend,
|
||||
registerProvider: handlers.registerProvider ?? noopRegisterProvider,
|
||||
registerSpeechProvider: handlers.registerSpeechProvider ?? noopRegisterSpeechProvider,
|
||||
registerRealtimeTranscriptionProvider:
|
||||
handlers.registerRealtimeTranscriptionProvider ?? noopRegisterRealtimeTranscriptionProvider,
|
||||
registerRealtimeVoiceProvider:
|
||||
handlers.registerRealtimeVoiceProvider ?? noopRegisterRealtimeVoiceProvider,
|
||||
registerMediaUnderstandingProvider:
|
||||
handlers.registerMediaUnderstandingProvider ?? noopRegisterMediaUnderstandingProvider,
|
||||
registerImageGenerationProvider:
|
||||
|
|
|
|||
|
|
@ -28,6 +28,10 @@ describe("bundled capability metadata", () => {
|
|||
cliBackendIds: uniqueStrings(manifest.cliBackends),
|
||||
providerIds: uniqueStrings(manifest.providers),
|
||||
speechProviderIds: uniqueStrings(manifest.contracts?.speechProviders),
|
||||
realtimeTranscriptionProviderIds: uniqueStrings(
|
||||
manifest.contracts?.realtimeTranscriptionProviders,
|
||||
),
|
||||
realtimeVoiceProviderIds: uniqueStrings(manifest.contracts?.realtimeVoiceProviders),
|
||||
mediaUnderstandingProviderIds: uniqueStrings(
|
||||
manifest.contracts?.mediaUnderstandingProviders,
|
||||
),
|
||||
|
|
@ -41,6 +45,8 @@ describe("bundled capability metadata", () => {
|
|||
entry.cliBackendIds.length > 0 ||
|
||||
entry.providerIds.length > 0 ||
|
||||
entry.speechProviderIds.length > 0 ||
|
||||
entry.realtimeTranscriptionProviderIds.length > 0 ||
|
||||
entry.realtimeVoiceProviderIds.length > 0 ||
|
||||
entry.mediaUnderstandingProviderIds.length > 0 ||
|
||||
entry.imageGenerationProviderIds.length > 0 ||
|
||||
entry.webFetchProviderIds.length > 0 ||
|
||||
|
|
|
|||
|
|
@ -5,6 +5,8 @@ export type BundledPluginContractSnapshot = {
|
|||
cliBackendIds: string[];
|
||||
providerIds: string[];
|
||||
speechProviderIds: string[];
|
||||
realtimeTranscriptionProviderIds: string[];
|
||||
realtimeVoiceProviderIds: string[];
|
||||
mediaUnderstandingProviderIds: string[];
|
||||
imageGenerationProviderIds: string[];
|
||||
webFetchProviderIds: string[];
|
||||
|
|
@ -37,6 +39,10 @@ export const BUNDLED_PLUGIN_CONTRACT_SNAPSHOTS: readonly BundledPluginContractSn
|
|||
cliBackendIds: uniqueStrings(manifest.cliBackends),
|
||||
providerIds: uniqueStrings(manifest.providers),
|
||||
speechProviderIds: uniqueStrings(manifest.contracts?.speechProviders),
|
||||
realtimeTranscriptionProviderIds: uniqueStrings(
|
||||
manifest.contracts?.realtimeTranscriptionProviders,
|
||||
),
|
||||
realtimeVoiceProviderIds: uniqueStrings(manifest.contracts?.realtimeVoiceProviders),
|
||||
mediaUnderstandingProviderIds: uniqueStrings(manifest.contracts?.mediaUnderstandingProviders),
|
||||
imageGenerationProviderIds: uniqueStrings(manifest.contracts?.imageGenerationProviders),
|
||||
webFetchProviderIds: uniqueStrings(manifest.contracts?.webFetchProviders),
|
||||
|
|
@ -48,6 +54,8 @@ export const BUNDLED_PLUGIN_CONTRACT_SNAPSHOTS: readonly BundledPluginContractSn
|
|||
entry.cliBackendIds.length > 0 ||
|
||||
entry.providerIds.length > 0 ||
|
||||
entry.speechProviderIds.length > 0 ||
|
||||
entry.realtimeTranscriptionProviderIds.length > 0 ||
|
||||
entry.realtimeVoiceProviderIds.length > 0 ||
|
||||
entry.mediaUnderstandingProviderIds.length > 0 ||
|
||||
entry.imageGenerationProviderIds.length > 0 ||
|
||||
entry.webFetchProviderIds.length > 0 ||
|
||||
|
|
@ -68,6 +76,14 @@ export const BUNDLED_PROVIDER_PLUGIN_IDS = collectPluginIds((entry) => entry.pro
|
|||
|
||||
export const BUNDLED_SPEECH_PLUGIN_IDS = collectPluginIds((entry) => entry.speechProviderIds);
|
||||
|
||||
export const BUNDLED_REALTIME_TRANSCRIPTION_PLUGIN_IDS = collectPluginIds(
|
||||
(entry) => entry.realtimeTranscriptionProviderIds,
|
||||
);
|
||||
|
||||
export const BUNDLED_REALTIME_VOICE_PLUGIN_IDS = collectPluginIds(
|
||||
(entry) => entry.realtimeVoiceProviderIds,
|
||||
);
|
||||
|
||||
export const BUNDLED_MEDIA_UNDERSTANDING_PLUGIN_IDS = collectPluginIds(
|
||||
(entry) => entry.mediaUnderstandingProviderIds,
|
||||
);
|
||||
|
|
@ -84,6 +100,8 @@ export const BUNDLED_RUNTIME_CONTRACT_PLUGIN_IDS = [
|
|||
(entry) =>
|
||||
entry.providerIds.length > 0 ||
|
||||
entry.speechProviderIds.length > 0 ||
|
||||
entry.realtimeTranscriptionProviderIds.length > 0 ||
|
||||
entry.realtimeVoiceProviderIds.length > 0 ||
|
||||
entry.mediaUnderstandingProviderIds.length > 0 ||
|
||||
entry.imageGenerationProviderIds.length > 0 ||
|
||||
entry.webFetchProviderIds.length > 0 ||
|
||||
|
|
|
|||
|
|
@ -122,6 +122,8 @@ function createCapabilityPluginRecord(params: {
|
|||
cliBackendIds: [],
|
||||
providerIds: [],
|
||||
speechProviderIds: [],
|
||||
realtimeTranscriptionProviderIds: [],
|
||||
realtimeVoiceProviderIds: [],
|
||||
mediaUnderstandingProviderIds: [],
|
||||
imageGenerationProviderIds: [],
|
||||
webFetchProviderIds: [],
|
||||
|
|
@ -272,6 +274,12 @@ export function loadBundledCapabilityRuntimeRegistry(params: {
|
|||
record.cliBackendIds.push(...captured.cliBackends.map((entry) => entry.id));
|
||||
record.providerIds.push(...captured.providers.map((entry) => entry.id));
|
||||
record.speechProviderIds.push(...captured.speechProviders.map((entry) => entry.id));
|
||||
record.realtimeTranscriptionProviderIds.push(
|
||||
...captured.realtimeTranscriptionProviders.map((entry) => entry.id),
|
||||
);
|
||||
record.realtimeVoiceProviderIds.push(
|
||||
...captured.realtimeVoiceProviders.map((entry) => entry.id),
|
||||
);
|
||||
record.mediaUnderstandingProviderIds.push(
|
||||
...captured.mediaUnderstandingProviders.map((entry) => entry.id),
|
||||
);
|
||||
|
|
@ -309,6 +317,24 @@ export function loadBundledCapabilityRuntimeRegistry(params: {
|
|||
rootDir: record.rootDir,
|
||||
})),
|
||||
);
|
||||
registry.realtimeTranscriptionProviders.push(
|
||||
...captured.realtimeTranscriptionProviders.map((provider) => ({
|
||||
pluginId: record.id,
|
||||
pluginName: record.name,
|
||||
provider,
|
||||
source: record.source,
|
||||
rootDir: record.rootDir,
|
||||
})),
|
||||
);
|
||||
registry.realtimeVoiceProviders.push(
|
||||
...captured.realtimeVoiceProviders.map((provider) => ({
|
||||
pluginId: record.id,
|
||||
pluginName: record.name,
|
||||
provider,
|
||||
source: record.source,
|
||||
rootDir: record.rootDir,
|
||||
})),
|
||||
);
|
||||
registry.mediaUnderstandingProviders.push(
|
||||
...captured.mediaUnderstandingProviders.map((provider) => ({
|
||||
pluginId: record.id,
|
||||
|
|
|
|||
|
|
@ -102,7 +102,12 @@ function setBundledCapabilityFixture(contractKey: string) {
|
|||
}
|
||||
|
||||
function expectCompatChainApplied(params: {
|
||||
key: "speechProviders" | "mediaUnderstandingProviders" | "imageGenerationProviders";
|
||||
key:
|
||||
| "speechProviders"
|
||||
| "realtimeTranscriptionProviders"
|
||||
| "realtimeVoiceProviders"
|
||||
| "mediaUnderstandingProviders"
|
||||
| "imageGenerationProviders";
|
||||
contractKey: string;
|
||||
cfg: OpenClawConfig;
|
||||
enablementCompat: {
|
||||
|
|
@ -201,6 +206,8 @@ describe("resolvePluginCapabilityProviders", () => {
|
|||
|
||||
it.each([
|
||||
["speechProviders", "speechProviders"],
|
||||
["realtimeTranscriptionProviders", "realtimeTranscriptionProviders"],
|
||||
["realtimeVoiceProviders", "realtimeVoiceProviders"],
|
||||
["mediaUnderstandingProviders", "mediaUnderstandingProviders"],
|
||||
["imageGenerationProviders", "imageGenerationProviders"],
|
||||
] as const)("applies bundled compat before fallback loading for %s", (key, contractKey) => {
|
||||
|
|
|
|||
|
|
@ -9,11 +9,15 @@ import type { PluginRegistry } from "./registry.js";
|
|||
|
||||
type CapabilityProviderRegistryKey =
|
||||
| "speechProviders"
|
||||
| "realtimeTranscriptionProviders"
|
||||
| "realtimeVoiceProviders"
|
||||
| "mediaUnderstandingProviders"
|
||||
| "imageGenerationProviders";
|
||||
|
||||
type CapabilityContractKey =
|
||||
| "speechProviders"
|
||||
| "realtimeTranscriptionProviders"
|
||||
| "realtimeVoiceProviders"
|
||||
| "mediaUnderstandingProviders"
|
||||
| "imageGenerationProviders";
|
||||
|
||||
|
|
@ -22,6 +26,8 @@ type CapabilityProviderForKey<K extends CapabilityProviderRegistryKey> =
|
|||
|
||||
const CAPABILITY_CONTRACT_KEY: Record<CapabilityProviderRegistryKey, CapabilityContractKey> = {
|
||||
speechProviders: "speechProviders",
|
||||
realtimeTranscriptionProviders: "realtimeTranscriptionProviders",
|
||||
realtimeVoiceProviders: "realtimeVoiceProviders",
|
||||
mediaUnderstandingProviders: "mediaUnderstandingProviders",
|
||||
imageGenerationProviders: "imageGenerationProviders",
|
||||
};
|
||||
|
|
|
|||
|
|
@ -10,6 +10,8 @@ import type {
|
|||
OpenClawPluginCliCommandDescriptor,
|
||||
OpenClawPluginCliRegistrar,
|
||||
ProviderPlugin,
|
||||
RealtimeTranscriptionProviderPlugin,
|
||||
RealtimeVoiceProviderPlugin,
|
||||
SpeechProviderPlugin,
|
||||
WebFetchProviderPlugin,
|
||||
WebSearchProviderPlugin,
|
||||
|
|
@ -27,6 +29,8 @@ export type CapturedPluginRegistration = {
|
|||
cliRegistrars: CapturedPluginCliRegistration[];
|
||||
cliBackends: CliBackendPlugin[];
|
||||
speechProviders: SpeechProviderPlugin[];
|
||||
realtimeTranscriptionProviders: RealtimeTranscriptionProviderPlugin[];
|
||||
realtimeVoiceProviders: RealtimeVoiceProviderPlugin[];
|
||||
mediaUnderstandingProviders: MediaUnderstandingProviderPlugin[];
|
||||
imageGenerationProviders: ImageGenerationProviderPlugin[];
|
||||
webFetchProviders: WebFetchProviderPlugin[];
|
||||
|
|
@ -42,6 +46,8 @@ export function createCapturedPluginRegistration(params?: {
|
|||
const cliRegistrars: CapturedPluginCliRegistration[] = [];
|
||||
const cliBackends: CliBackendPlugin[] = [];
|
||||
const speechProviders: SpeechProviderPlugin[] = [];
|
||||
const realtimeTranscriptionProviders: RealtimeTranscriptionProviderPlugin[] = [];
|
||||
const realtimeVoiceProviders: RealtimeVoiceProviderPlugin[] = [];
|
||||
const mediaUnderstandingProviders: MediaUnderstandingProviderPlugin[] = [];
|
||||
const imageGenerationProviders: ImageGenerationProviderPlugin[] = [];
|
||||
const webFetchProviders: WebFetchProviderPlugin[] = [];
|
||||
|
|
@ -59,6 +65,8 @@ export function createCapturedPluginRegistration(params?: {
|
|||
cliRegistrars,
|
||||
cliBackends,
|
||||
speechProviders,
|
||||
realtimeTranscriptionProviders,
|
||||
realtimeVoiceProviders,
|
||||
mediaUnderstandingProviders,
|
||||
imageGenerationProviders,
|
||||
webFetchProviders,
|
||||
|
|
@ -106,6 +114,12 @@ export function createCapturedPluginRegistration(params?: {
|
|||
registerSpeechProvider(provider: SpeechProviderPlugin) {
|
||||
speechProviders.push(provider);
|
||||
},
|
||||
registerRealtimeTranscriptionProvider(provider: RealtimeTranscriptionProviderPlugin) {
|
||||
realtimeTranscriptionProviders.push(provider);
|
||||
},
|
||||
registerRealtimeVoiceProvider(provider: RealtimeVoiceProviderPlugin) {
|
||||
realtimeVoiceProviders.push(provider);
|
||||
},
|
||||
registerMediaUnderstandingProvider(provider: MediaUnderstandingProviderPlugin) {
|
||||
mediaUnderstandingProviders.push(provider);
|
||||
},
|
||||
|
|
|
|||
|
|
@ -155,9 +155,10 @@ async function loadPluginCliCommandRegistry(
|
|||
export async function getPluginCliCommandDescriptors(
|
||||
cfg?: OpenClawConfig,
|
||||
env?: NodeJS.ProcessEnv,
|
||||
loaderOptions?: Pick<PluginLoadOptions, "pluginSdkResolution">,
|
||||
): Promise<OpenClawPluginCliCommandDescriptor[]> {
|
||||
try {
|
||||
const { registry } = await loadPluginCliMetadataRegistry(cfg, env);
|
||||
const { registry } = await loadPluginCliMetadataRegistry(cfg, env, loaderOptions);
|
||||
const seen = new Set<string>();
|
||||
const descriptors: OpenClawPluginCliCommandDescriptor[] = [];
|
||||
for (const entry of registry.cliRegistrars) {
|
||||
|
|
|
|||
|
|
@ -8,6 +8,8 @@ import {
|
|||
pluginRegistrationContractRegistry,
|
||||
providerContractLoadError,
|
||||
providerContractPluginIds,
|
||||
realtimeTranscriptionProviderContractRegistry,
|
||||
realtimeVoiceProviderContractRegistry,
|
||||
resolveWebFetchProviderContractEntriesForPluginId,
|
||||
resolveWebSearchProviderContractEntriesForPluginId,
|
||||
speechProviderContractRegistry,
|
||||
|
|
@ -27,7 +29,11 @@ describe("plugin contract registry", () => {
|
|||
predicate: (plugin: {
|
||||
origin: string;
|
||||
providers: unknown[];
|
||||
contracts?: { speechProviders?: unknown[] };
|
||||
contracts?: {
|
||||
speechProviders?: unknown[];
|
||||
realtimeTranscriptionProviders?: unknown[];
|
||||
realtimeVoiceProviders?: unknown[];
|
||||
};
|
||||
}) => boolean;
|
||||
}) {
|
||||
expect(uniqueSortedStrings(params.actualPluginIds)).toEqual(
|
||||
|
|
@ -39,7 +45,11 @@ describe("plugin contract registry", () => {
|
|||
predicate: (plugin: {
|
||||
origin: string;
|
||||
providers: unknown[];
|
||||
contracts?: { speechProviders?: unknown[] };
|
||||
contracts?: {
|
||||
speechProviders?: unknown[];
|
||||
realtimeTranscriptionProviders?: unknown[];
|
||||
realtimeVoiceProviders?: unknown[];
|
||||
};
|
||||
}) => boolean,
|
||||
) {
|
||||
return loadPluginManifestRegistry({})
|
||||
|
|
@ -70,6 +80,14 @@ describe("plugin contract registry", () => {
|
|||
name: "does not duplicate bundled media provider ids",
|
||||
ids: () => mediaUnderstandingProviderContractRegistry.map((entry) => entry.provider.id),
|
||||
},
|
||||
{
|
||||
name: "does not duplicate bundled realtime transcription provider ids",
|
||||
ids: () => realtimeTranscriptionProviderContractRegistry.map((entry) => entry.provider.id),
|
||||
},
|
||||
{
|
||||
name: "does not duplicate bundled realtime voice provider ids",
|
||||
ids: () => realtimeVoiceProviderContractRegistry.map((entry) => entry.provider.id),
|
||||
},
|
||||
{
|
||||
name: "does not duplicate bundled image-generation provider ids",
|
||||
ids: () => imageGenerationProviderContractRegistry.map((entry) => entry.provider.id),
|
||||
|
|
@ -101,6 +119,23 @@ describe("plugin contract registry", () => {
|
|||
});
|
||||
});
|
||||
|
||||
it("covers every bundled realtime voice plugin discovered from manifests", () => {
|
||||
expectRegistryPluginIds({
|
||||
actualPluginIds: realtimeVoiceProviderContractRegistry.map((entry) => entry.pluginId),
|
||||
predicate: (plugin) =>
|
||||
plugin.origin === "bundled" && (plugin.contracts?.realtimeVoiceProviders?.length ?? 0) > 0,
|
||||
});
|
||||
});
|
||||
|
||||
it("covers every bundled realtime transcription plugin discovered from manifests", () => {
|
||||
expectRegistryPluginIds({
|
||||
actualPluginIds: realtimeTranscriptionProviderContractRegistry.map((entry) => entry.pluginId),
|
||||
predicate: (plugin) =>
|
||||
plugin.origin === "bundled" &&
|
||||
(plugin.contracts?.realtimeTranscriptionProviders?.length ?? 0) > 0,
|
||||
});
|
||||
});
|
||||
|
||||
it("covers every bundled web fetch plugin from the shared resolver", () => {
|
||||
const bundledWebFetchPluginIds = resolveBundledWebFetchPluginIds({});
|
||||
|
||||
|
|
|
|||
|
|
@ -3,6 +3,8 @@ import {
|
|||
BUNDLED_MEDIA_UNDERSTANDING_PLUGIN_IDS,
|
||||
BUNDLED_PLUGIN_CONTRACT_SNAPSHOTS,
|
||||
BUNDLED_PROVIDER_PLUGIN_IDS,
|
||||
BUNDLED_REALTIME_TRANSCRIPTION_PLUGIN_IDS,
|
||||
BUNDLED_REALTIME_VOICE_PLUGIN_IDS,
|
||||
BUNDLED_SPEECH_PLUGIN_IDS,
|
||||
BUNDLED_WEB_FETCH_PLUGIN_IDS,
|
||||
BUNDLED_WEB_SEARCH_PLUGIN_IDS,
|
||||
|
|
@ -12,6 +14,8 @@ import type {
|
|||
ImageGenerationProviderPlugin,
|
||||
MediaUnderstandingProviderPlugin,
|
||||
ProviderPlugin,
|
||||
RealtimeTranscriptionProviderPlugin,
|
||||
RealtimeVoiceProviderPlugin,
|
||||
SpeechProviderPlugin,
|
||||
WebFetchProviderPlugin,
|
||||
WebSearchProviderPlugin,
|
||||
|
|
@ -19,6 +23,8 @@ import type {
|
|||
import {
|
||||
loadVitestImageGenerationProviderContractRegistry,
|
||||
loadVitestMediaUnderstandingProviderContractRegistry,
|
||||
loadVitestRealtimeTranscriptionProviderContractRegistry,
|
||||
loadVitestRealtimeVoiceProviderContractRegistry,
|
||||
loadVitestSpeechProviderContractRegistry,
|
||||
} from "./speech-vitest-registry.js";
|
||||
|
||||
|
|
@ -38,6 +44,9 @@ type WebFetchProviderContractEntry = CapabilityContractEntry<WebFetchProviderPlu
|
|||
};
|
||||
|
||||
type SpeechProviderContractEntry = CapabilityContractEntry<SpeechProviderPlugin>;
|
||||
type RealtimeTranscriptionProviderContractEntry =
|
||||
CapabilityContractEntry<RealtimeTranscriptionProviderPlugin>;
|
||||
type RealtimeVoiceProviderContractEntry = CapabilityContractEntry<RealtimeVoiceProviderPlugin>;
|
||||
type MediaUnderstandingProviderContractEntry =
|
||||
CapabilityContractEntry<MediaUnderstandingProviderPlugin>;
|
||||
type ImageGenerationProviderContractEntry = CapabilityContractEntry<ImageGenerationProviderPlugin>;
|
||||
|
|
@ -47,6 +56,8 @@ type PluginRegistrationContractEntry = {
|
|||
cliBackendIds: string[];
|
||||
providerIds: string[];
|
||||
speechProviderIds: string[];
|
||||
realtimeTranscriptionProviderIds: string[];
|
||||
realtimeVoiceProviderIds: string[];
|
||||
mediaUnderstandingProviderIds: string[];
|
||||
imageGenerationProviderIds: string[];
|
||||
webFetchProviderIds: string[];
|
||||
|
|
@ -94,6 +105,10 @@ let webSearchProviderContractRegistryByPluginIdCache: Map<
|
|||
WebSearchProviderContractEntry[]
|
||||
> | null = null;
|
||||
let speechProviderContractRegistryCache: SpeechProviderContractEntry[] | null = null;
|
||||
let realtimeTranscriptionProviderContractRegistryCache:
|
||||
| RealtimeTranscriptionProviderContractEntry[]
|
||||
| null = null;
|
||||
let realtimeVoiceProviderContractRegistryCache: RealtimeVoiceProviderContractEntry[] | null = null;
|
||||
let mediaUnderstandingProviderContractRegistryCache:
|
||||
| MediaUnderstandingProviderContractEntry[]
|
||||
| null = null;
|
||||
|
|
@ -387,6 +402,36 @@ function loadSpeechProviderContractRegistry(): SpeechProviderContractEntry[] {
|
|||
return speechProviderContractRegistryCache;
|
||||
}
|
||||
|
||||
function loadRealtimeVoiceProviderContractRegistry(): RealtimeVoiceProviderContractEntry[] {
|
||||
if (!realtimeVoiceProviderContractRegistryCache) {
|
||||
realtimeVoiceProviderContractRegistryCache = process.env.VITEST
|
||||
? loadVitestRealtimeVoiceProviderContractRegistry()
|
||||
: loadBundledCapabilityRuntimeRegistry({
|
||||
pluginIds: BUNDLED_REALTIME_VOICE_PLUGIN_IDS,
|
||||
pluginSdkResolution: "dist",
|
||||
}).realtimeVoiceProviders.map((entry) => ({
|
||||
pluginId: entry.pluginId,
|
||||
provider: entry.provider,
|
||||
}));
|
||||
}
|
||||
return realtimeVoiceProviderContractRegistryCache;
|
||||
}
|
||||
|
||||
function loadRealtimeTranscriptionProviderContractRegistry(): RealtimeTranscriptionProviderContractEntry[] {
|
||||
if (!realtimeTranscriptionProviderContractRegistryCache) {
|
||||
realtimeTranscriptionProviderContractRegistryCache = process.env.VITEST
|
||||
? loadVitestRealtimeTranscriptionProviderContractRegistry()
|
||||
: loadBundledCapabilityRuntimeRegistry({
|
||||
pluginIds: BUNDLED_REALTIME_TRANSCRIPTION_PLUGIN_IDS,
|
||||
pluginSdkResolution: "dist",
|
||||
}).realtimeTranscriptionProviders.map((entry) => ({
|
||||
pluginId: entry.pluginId,
|
||||
provider: entry.provider,
|
||||
}));
|
||||
}
|
||||
return realtimeTranscriptionProviderContractRegistryCache;
|
||||
}
|
||||
|
||||
function loadMediaUnderstandingProviderContractRegistry(): MediaUnderstandingProviderContractEntry[] {
|
||||
if (!mediaUnderstandingProviderContractRegistryCache) {
|
||||
mediaUnderstandingProviderContractRegistryCache = process.env.VITEST
|
||||
|
|
@ -519,6 +564,12 @@ export const speechProviderContractRegistry: SpeechProviderContractEntry[] = cre
|
|||
loadSpeechProviderContractRegistry,
|
||||
);
|
||||
|
||||
export const realtimeTranscriptionProviderContractRegistry: RealtimeTranscriptionProviderContractEntry[] =
|
||||
createLazyArrayView(loadRealtimeTranscriptionProviderContractRegistry);
|
||||
|
||||
export const realtimeVoiceProviderContractRegistry: RealtimeVoiceProviderContractEntry[] =
|
||||
createLazyArrayView(loadRealtimeVoiceProviderContractRegistry);
|
||||
|
||||
export const mediaUnderstandingProviderContractRegistry: MediaUnderstandingProviderContractEntry[] =
|
||||
createLazyArrayView(loadMediaUnderstandingProviderContractRegistry);
|
||||
|
||||
|
|
@ -531,6 +582,8 @@ function loadPluginRegistrationContractRegistry(): PluginRegistrationContractEnt
|
|||
cliBackendIds: uniqueStrings(entry.cliBackendIds),
|
||||
providerIds: uniqueStrings(entry.providerIds),
|
||||
speechProviderIds: uniqueStrings(entry.speechProviderIds),
|
||||
realtimeTranscriptionProviderIds: uniqueStrings(entry.realtimeTranscriptionProviderIds),
|
||||
realtimeVoiceProviderIds: uniqueStrings(entry.realtimeVoiceProviderIds),
|
||||
mediaUnderstandingProviderIds: uniqueStrings(entry.mediaUnderstandingProviderIds),
|
||||
imageGenerationProviderIds: uniqueStrings(entry.imageGenerationProviderIds),
|
||||
webFetchProviderIds: uniqueStrings(entry.webFetchProviderIds),
|
||||
|
|
|
|||
|
|
@ -5,6 +5,8 @@ import { createJiti } from "jiti";
|
|||
import {
|
||||
BUNDLED_IMAGE_GENERATION_PLUGIN_IDS,
|
||||
BUNDLED_MEDIA_UNDERSTANDING_PLUGIN_IDS,
|
||||
BUNDLED_REALTIME_TRANSCRIPTION_PLUGIN_IDS,
|
||||
BUNDLED_REALTIME_VOICE_PLUGIN_IDS,
|
||||
BUNDLED_SPEECH_PLUGIN_IDS,
|
||||
} from "../bundled-capability-metadata.js";
|
||||
import { loadBundledCapabilityRuntimeRegistry } from "../bundled-capability-runtime.js";
|
||||
|
|
@ -13,6 +15,8 @@ import { buildPluginLoaderAliasMap, buildPluginLoaderJitiOptions } from "../sdk-
|
|||
import type {
|
||||
ImageGenerationProviderPlugin,
|
||||
MediaUnderstandingProviderPlugin,
|
||||
RealtimeTranscriptionProviderPlugin,
|
||||
RealtimeVoiceProviderPlugin,
|
||||
SpeechProviderPlugin,
|
||||
} from "../types.js";
|
||||
|
||||
|
|
@ -26,6 +30,16 @@ export type MediaUnderstandingProviderContractEntry = {
|
|||
provider: MediaUnderstandingProviderPlugin;
|
||||
};
|
||||
|
||||
export type RealtimeVoiceProviderContractEntry = {
|
||||
pluginId: string;
|
||||
provider: RealtimeVoiceProviderPlugin;
|
||||
};
|
||||
|
||||
export type RealtimeTranscriptionProviderContractEntry = {
|
||||
pluginId: string;
|
||||
provider: RealtimeTranscriptionProviderPlugin;
|
||||
};
|
||||
|
||||
export type ImageGenerationProviderContractEntry = {
|
||||
pluginId: string;
|
||||
provider: ImageGenerationProviderPlugin;
|
||||
|
|
@ -190,6 +204,96 @@ export function loadVitestMediaUnderstandingProviderContractRegistry(): MediaUnd
|
|||
return registrations;
|
||||
}
|
||||
|
||||
export function loadVitestRealtimeVoiceProviderContractRegistry(): RealtimeVoiceProviderContractEntry[] {
|
||||
const registrations: RealtimeVoiceProviderContractEntry[] = [];
|
||||
const { manifests, unresolvedPluginIds } = resolveTestApiModuleRecords(
|
||||
BUNDLED_REALTIME_VOICE_PLUGIN_IDS,
|
||||
);
|
||||
|
||||
for (const plugin of manifests) {
|
||||
if (!plugin.rootDir) {
|
||||
continue;
|
||||
}
|
||||
const testApiPath = path.join(plugin.rootDir, "test-api.ts");
|
||||
if (!fs.existsSync(testApiPath)) {
|
||||
continue;
|
||||
}
|
||||
const builder = resolveNamedBuilder<RealtimeVoiceProviderPlugin>(
|
||||
createVitestCapabilityLoader(testApiPath)(testApiPath),
|
||||
/^build.+RealtimeVoiceProvider$/u,
|
||||
);
|
||||
if (!builder) {
|
||||
continue;
|
||||
}
|
||||
registrations.push({
|
||||
pluginId: plugin.id,
|
||||
provider: builder(),
|
||||
});
|
||||
unresolvedPluginIds.delete(plugin.id);
|
||||
}
|
||||
|
||||
if (unresolvedPluginIds.size === 0) {
|
||||
return registrations;
|
||||
}
|
||||
|
||||
const runtimeRegistry = loadBundledCapabilityRuntimeRegistry({
|
||||
pluginIds: [...unresolvedPluginIds],
|
||||
pluginSdkResolution: "dist",
|
||||
});
|
||||
registrations.push(
|
||||
...runtimeRegistry.realtimeVoiceProviders.map((entry) => ({
|
||||
pluginId: entry.pluginId,
|
||||
provider: entry.provider,
|
||||
})),
|
||||
);
|
||||
return registrations;
|
||||
}
|
||||
|
||||
export function loadVitestRealtimeTranscriptionProviderContractRegistry(): RealtimeTranscriptionProviderContractEntry[] {
|
||||
const registrations: RealtimeTranscriptionProviderContractEntry[] = [];
|
||||
const { manifests, unresolvedPluginIds } = resolveTestApiModuleRecords(
|
||||
BUNDLED_REALTIME_TRANSCRIPTION_PLUGIN_IDS,
|
||||
);
|
||||
|
||||
for (const plugin of manifests) {
|
||||
if (!plugin.rootDir) {
|
||||
continue;
|
||||
}
|
||||
const testApiPath = path.join(plugin.rootDir, "test-api.ts");
|
||||
if (!fs.existsSync(testApiPath)) {
|
||||
continue;
|
||||
}
|
||||
const builder = resolveNamedBuilder<RealtimeTranscriptionProviderPlugin>(
|
||||
createVitestCapabilityLoader(testApiPath)(testApiPath),
|
||||
/^build.+RealtimeTranscriptionProvider$/u,
|
||||
);
|
||||
if (!builder) {
|
||||
continue;
|
||||
}
|
||||
registrations.push({
|
||||
pluginId: plugin.id,
|
||||
provider: builder(),
|
||||
});
|
||||
unresolvedPluginIds.delete(plugin.id);
|
||||
}
|
||||
|
||||
if (unresolvedPluginIds.size === 0) {
|
||||
return registrations;
|
||||
}
|
||||
|
||||
const runtimeRegistry = loadBundledCapabilityRuntimeRegistry({
|
||||
pluginIds: [...unresolvedPluginIds],
|
||||
pluginSdkResolution: "dist",
|
||||
});
|
||||
registrations.push(
|
||||
...runtimeRegistry.realtimeTranscriptionProviders.map((entry) => ({
|
||||
pluginId: entry.pluginId,
|
||||
provider: entry.provider,
|
||||
})),
|
||||
);
|
||||
return registrations;
|
||||
}
|
||||
|
||||
export function loadVitestImageGenerationProviderContractRegistry(): ImageGenerationProviderContractEntry[] {
|
||||
const registrations: ImageGenerationProviderContractEntry[] = [];
|
||||
const { manifests, unresolvedPluginIds } = resolveTestApiModuleRecords(
|
||||
|
|
|
|||
|
|
@ -590,6 +590,8 @@ function createPluginRecord(params: {
|
|||
cliBackendIds: [],
|
||||
providerIds: [],
|
||||
speechProviderIds: [],
|
||||
realtimeTranscriptionProviderIds: [],
|
||||
realtimeVoiceProviderIds: [],
|
||||
mediaUnderstandingProviderIds: [],
|
||||
imageGenerationProviderIds: [],
|
||||
webFetchProviderIds: [],
|
||||
|
|
|
|||
|
|
@ -52,6 +52,8 @@ export type PluginManifest = {
|
|||
|
||||
export type PluginManifestContracts = {
|
||||
speechProviders?: string[];
|
||||
realtimeTranscriptionProviders?: string[];
|
||||
realtimeVoiceProviders?: string[];
|
||||
mediaUnderstandingProviders?: string[];
|
||||
imageGenerationProviders?: string[];
|
||||
webFetchProviders?: string[];
|
||||
|
|
@ -125,6 +127,8 @@ function normalizeManifestContracts(value: unknown): PluginManifestContracts | u
|
|||
}
|
||||
|
||||
const speechProviders = normalizeStringList(value.speechProviders);
|
||||
const realtimeTranscriptionProviders = normalizeStringList(value.realtimeTranscriptionProviders);
|
||||
const realtimeVoiceProviders = normalizeStringList(value.realtimeVoiceProviders);
|
||||
const mediaUnderstandingProviders = normalizeStringList(value.mediaUnderstandingProviders);
|
||||
const imageGenerationProviders = normalizeStringList(value.imageGenerationProviders);
|
||||
const webFetchProviders = normalizeStringList(value.webFetchProviders);
|
||||
|
|
@ -132,6 +136,8 @@ function normalizeManifestContracts(value: unknown): PluginManifestContracts | u
|
|||
const tools = normalizeStringList(value.tools);
|
||||
const contracts = {
|
||||
...(speechProviders.length > 0 ? { speechProviders } : {}),
|
||||
...(realtimeTranscriptionProviders.length > 0 ? { realtimeTranscriptionProviders } : {}),
|
||||
...(realtimeVoiceProviders.length > 0 ? { realtimeVoiceProviders } : {}),
|
||||
...(mediaUnderstandingProviders.length > 0 ? { mediaUnderstandingProviders } : {}),
|
||||
...(imageGenerationProviders.length > 0 ? { imageGenerationProviders } : {}),
|
||||
...(webFetchProviders.length > 0 ? { webFetchProviders } : {}),
|
||||
|
|
|
|||
|
|
@ -11,6 +11,8 @@ export function createEmptyPluginRegistry(): PluginRegistry {
|
|||
providers: [],
|
||||
cliBackends: [],
|
||||
speechProviders: [],
|
||||
realtimeTranscriptionProviders: [],
|
||||
realtimeVoiceProviders: [],
|
||||
mediaUnderstandingProviders: [],
|
||||
imageGenerationProviders: [],
|
||||
webFetchProviders: [],
|
||||
|
|
|
|||
|
|
@ -38,7 +38,7 @@ import {
|
|||
import type {
|
||||
CliBackendPlugin,
|
||||
ImageGenerationProviderPlugin,
|
||||
WebFetchProviderPlugin,
|
||||
RealtimeTranscriptionProviderPlugin,
|
||||
OpenClawPluginApi,
|
||||
OpenClawPluginChannelRegistration,
|
||||
OpenClawPluginCliCommandDescriptor,
|
||||
|
|
@ -52,6 +52,7 @@ import type {
|
|||
OpenClawPluginHookOptions,
|
||||
MediaUnderstandingProviderPlugin,
|
||||
ProviderPlugin,
|
||||
RealtimeVoiceProviderPlugin,
|
||||
OpenClawPluginService,
|
||||
OpenClawPluginToolContext,
|
||||
OpenClawPluginToolFactory,
|
||||
|
|
@ -67,6 +68,7 @@ import type {
|
|||
PluginHookHandlerMap,
|
||||
PluginHookRegistration as TypedPluginHookRegistration,
|
||||
SpeechProviderPlugin,
|
||||
WebFetchProviderPlugin,
|
||||
WebSearchProviderPlugin,
|
||||
} from "./types.js";
|
||||
|
||||
|
|
@ -142,6 +144,10 @@ type PluginOwnedProviderRegistration<T extends { id: string }> = {
|
|||
|
||||
export type PluginSpeechProviderRegistration =
|
||||
PluginOwnedProviderRegistration<SpeechProviderPlugin>;
|
||||
export type PluginRealtimeTranscriptionProviderRegistration =
|
||||
PluginOwnedProviderRegistration<RealtimeTranscriptionProviderPlugin>;
|
||||
export type PluginRealtimeVoiceProviderRegistration =
|
||||
PluginOwnedProviderRegistration<RealtimeVoiceProviderPlugin>;
|
||||
export type PluginMediaUnderstandingProviderRegistration =
|
||||
PluginOwnedProviderRegistration<MediaUnderstandingProviderPlugin>;
|
||||
export type PluginImageGenerationProviderRegistration =
|
||||
|
|
@ -213,6 +219,8 @@ export type PluginRecord = {
|
|||
cliBackendIds: string[];
|
||||
providerIds: string[];
|
||||
speechProviderIds: string[];
|
||||
realtimeTranscriptionProviderIds: string[];
|
||||
realtimeVoiceProviderIds: string[];
|
||||
mediaUnderstandingProviderIds: string[];
|
||||
imageGenerationProviderIds: string[];
|
||||
webFetchProviderIds: string[];
|
||||
|
|
@ -239,6 +247,8 @@ export type PluginRegistry = {
|
|||
providers: PluginProviderRegistration[];
|
||||
cliBackends?: PluginCliBackendRegistration[];
|
||||
speechProviders: PluginSpeechProviderRegistration[];
|
||||
realtimeTranscriptionProviders: PluginRealtimeTranscriptionProviderRegistration[];
|
||||
realtimeVoiceProviders: PluginRealtimeVoiceProviderRegistration[];
|
||||
mediaUnderstandingProviders: PluginMediaUnderstandingProviderRegistration[];
|
||||
imageGenerationProviders: PluginImageGenerationProviderRegistration[];
|
||||
webFetchProviders: PluginWebFetchProviderRegistration[];
|
||||
|
|
@ -699,6 +709,32 @@ export function createPluginRegistry(registryParams: PluginRegistryParams) {
|
|||
});
|
||||
};
|
||||
|
||||
const registerRealtimeTranscriptionProvider = (
|
||||
record: PluginRecord,
|
||||
provider: RealtimeTranscriptionProviderPlugin,
|
||||
) => {
|
||||
registerUniqueProviderLike({
|
||||
record,
|
||||
provider,
|
||||
kindLabel: "realtime transcription provider",
|
||||
registrations: registry.realtimeTranscriptionProviders,
|
||||
ownedIds: record.realtimeTranscriptionProviderIds,
|
||||
});
|
||||
};
|
||||
|
||||
const registerRealtimeVoiceProvider = (
|
||||
record: PluginRecord,
|
||||
provider: RealtimeVoiceProviderPlugin,
|
||||
) => {
|
||||
registerUniqueProviderLike({
|
||||
record,
|
||||
provider,
|
||||
kindLabel: "realtime voice provider",
|
||||
registrations: registry.realtimeVoiceProviders,
|
||||
ownedIds: record.realtimeVoiceProviderIds,
|
||||
});
|
||||
};
|
||||
|
||||
const registerMediaUnderstandingProvider = (
|
||||
record: PluginRecord,
|
||||
provider: MediaUnderstandingProviderPlugin,
|
||||
|
|
@ -1009,6 +1045,10 @@ export function createPluginRegistry(registryParams: PluginRegistryParams) {
|
|||
registerHttpRoute: (routeParams) => registerHttpRoute(record, routeParams),
|
||||
registerProvider: (provider) => registerProvider(record, provider),
|
||||
registerSpeechProvider: (provider) => registerSpeechProvider(record, provider),
|
||||
registerRealtimeTranscriptionProvider: (provider) =>
|
||||
registerRealtimeTranscriptionProvider(record, provider),
|
||||
registerRealtimeVoiceProvider: (provider) =>
|
||||
registerRealtimeVoiceProvider(record, provider),
|
||||
registerMediaUnderstandingProvider: (provider) =>
|
||||
registerMediaUnderstandingProvider(record, provider),
|
||||
registerImageGenerationProvider: (provider) =>
|
||||
|
|
@ -1198,6 +1238,8 @@ export function createPluginRegistry(registryParams: PluginRegistryParams) {
|
|||
registerProvider,
|
||||
registerCliBackend,
|
||||
registerSpeechProvider,
|
||||
registerRealtimeTranscriptionProvider,
|
||||
registerRealtimeVoiceProvider,
|
||||
registerMediaUnderstandingProvider,
|
||||
registerImageGenerationProvider,
|
||||
registerWebSearchProvider,
|
||||
|
|
|
|||
|
|
@ -199,6 +199,8 @@ describe("setActivePluginRegistry", () => {
|
|||
cliBackendIds: [],
|
||||
providerIds: [],
|
||||
speechProviderIds: [],
|
||||
realtimeTranscriptionProviderIds: [],
|
||||
realtimeVoiceProviderIds: [],
|
||||
mediaUnderstandingProviderIds: [],
|
||||
imageGenerationProviderIds: [],
|
||||
webFetchProviderIds: [],
|
||||
|
|
@ -225,6 +227,8 @@ describe("setActivePluginRegistry", () => {
|
|||
cliBackendIds: [],
|
||||
providerIds: [],
|
||||
speechProviderIds: [],
|
||||
realtimeTranscriptionProviderIds: [],
|
||||
realtimeVoiceProviderIds: [],
|
||||
mediaUnderstandingProviderIds: [],
|
||||
imageGenerationProviderIds: [],
|
||||
webFetchProviderIds: [],
|
||||
|
|
|
|||
|
|
@ -51,6 +51,8 @@ export function createPluginRecord(
|
|||
cliBackendIds: [],
|
||||
providerIds: [],
|
||||
speechProviderIds: [],
|
||||
realtimeTranscriptionProviderIds: [],
|
||||
realtimeVoiceProviderIds: [],
|
||||
mediaUnderstandingProviderIds: [],
|
||||
imageGenerationProviderIds: [],
|
||||
webFetchProviderIds: [],
|
||||
|
|
@ -107,7 +109,7 @@ export function createCustomHook(params: {
|
|||
export function createPluginLoadResult(
|
||||
overrides: Partial<PluginLoadResult> & Pick<PluginLoadResult, "plugins"> = { plugins: [] },
|
||||
): PluginLoadResult {
|
||||
const { plugins, ...rest } = overrides;
|
||||
const { plugins, realtimeTranscriptionProviders, realtimeVoiceProviders, ...rest } = overrides;
|
||||
return {
|
||||
plugins,
|
||||
diagnostics: [],
|
||||
|
|
@ -129,6 +131,8 @@ export function createPluginLoadResult(
|
|||
commands: [],
|
||||
conversationBindingResolvedHandlers: [],
|
||||
...rest,
|
||||
realtimeTranscriptionProviders: realtimeTranscriptionProviders ?? [],
|
||||
realtimeVoiceProviders: realtimeVoiceProviders ?? [],
|
||||
};
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -28,6 +28,8 @@ export type PluginCapabilityKind =
|
|||
| "cli-backend"
|
||||
| "text-inference"
|
||||
| "speech"
|
||||
| "realtime-transcription"
|
||||
| "realtime-voice"
|
||||
| "media-understanding"
|
||||
| "image-generation"
|
||||
| "web-search"
|
||||
|
|
@ -233,6 +235,8 @@ function buildCapabilityEntries(plugin: PluginRegistry["plugins"][number]) {
|
|||
{ kind: "cli-backend" as const, ids: plugin.cliBackendIds ?? [] },
|
||||
{ kind: "text-inference" as const, ids: plugin.providerIds },
|
||||
{ kind: "speech" as const, ids: plugin.speechProviderIds },
|
||||
{ kind: "realtime-transcription" as const, ids: plugin.realtimeTranscriptionProviderIds },
|
||||
{ kind: "realtime-voice" as const, ids: plugin.realtimeVoiceProviderIds },
|
||||
{ kind: "media-understanding" as const, ids: plugin.mediaUnderstandingProviderIds },
|
||||
{ kind: "image-generation" as const, ids: plugin.imageGenerationProviderIds },
|
||||
{ kind: "web-search" as const, ids: plugin.webSearchProviderIds },
|
||||
|
|
|
|||
|
|
@ -30,6 +30,22 @@ import type { HookEntry } from "../hooks/types.js";
|
|||
import type { ImageGenerationProvider } from "../image-generation/types.js";
|
||||
import type { ProviderUsageSnapshot } from "../infra/provider-usage.types.js";
|
||||
import type { MediaUnderstandingProvider } from "../media-understanding/types.js";
|
||||
import type {
|
||||
RealtimeTranscriptionProviderConfig,
|
||||
RealtimeTranscriptionProviderConfiguredContext,
|
||||
RealtimeTranscriptionProviderId,
|
||||
RealtimeTranscriptionProviderResolveConfigContext,
|
||||
RealtimeTranscriptionSession,
|
||||
RealtimeTranscriptionSessionCreateRequest,
|
||||
} from "../realtime-transcription/provider-types.js";
|
||||
import type {
|
||||
RealtimeVoiceBridge,
|
||||
RealtimeVoiceBridgeCreateRequest,
|
||||
RealtimeVoiceProviderConfig,
|
||||
RealtimeVoiceProviderConfiguredContext,
|
||||
RealtimeVoiceProviderId,
|
||||
RealtimeVoiceProviderResolveConfigContext,
|
||||
} from "../realtime-voice/provider-types.js";
|
||||
import type { RuntimeEnv } from "../runtime.js";
|
||||
import type {
|
||||
RuntimeWebFetchMetadata,
|
||||
|
|
@ -1526,6 +1542,38 @@ export type PluginSpeechProviderEntry = SpeechProviderPlugin & {
|
|||
pluginId: string;
|
||||
};
|
||||
|
||||
/** Realtime transcription capability registered by a plugin. */
export type RealtimeTranscriptionProviderPlugin = {
  /** Canonical provider id used for registry lookup and selection. */
  id: RealtimeTranscriptionProviderId;
  /** Human-readable provider name. */
  label: string;
  /** Alternate ids that also resolve to this provider in registry lookups. */
  aliases?: string[];
  /** Relative ordering used when a provider is auto-selected — TODO confirm tie-break semantics. */
  autoSelectOrder?: number;
  /** Optional hook deriving the effective provider config from the raw config. */
  resolveConfig?: (
    ctx: RealtimeTranscriptionProviderResolveConfigContext,
  ) => RealtimeTranscriptionProviderConfig;
  /** Reports whether the provider has enough configuration to be usable. */
  isConfigured: (ctx: RealtimeTranscriptionProviderConfiguredContext) => boolean;
  /** Opens a streaming transcription session bound to the given callbacks/config. */
  createSession: (req: RealtimeTranscriptionSessionCreateRequest) => RealtimeTranscriptionSession;
};

/** Registry entry: the transcription capability plus its owning plugin id. */
export type PluginRealtimeTranscriptionProviderEntry = RealtimeTranscriptionProviderPlugin & {
  pluginId: string;
};

/** Realtime voice capability registered by a plugin. */
export type RealtimeVoiceProviderPlugin = {
  /** Canonical provider id used for registry lookup and selection. */
  id: RealtimeVoiceProviderId;
  /** Human-readable provider name. */
  label: string;
  /** Alternate ids that also resolve to this provider in registry lookups. */
  aliases?: string[];
  /** Relative ordering used when a provider is auto-selected — TODO confirm tie-break semantics. */
  autoSelectOrder?: number;
  /** Optional hook deriving the effective provider config from the raw config. */
  resolveConfig?: (ctx: RealtimeVoiceProviderResolveConfigContext) => RealtimeVoiceProviderConfig;
  /** Reports whether the provider has enough configuration to be usable. */
  isConfigured: (ctx: RealtimeVoiceProviderConfiguredContext) => boolean;
  /** Opens a duplex voice bridge bound to the given callbacks/config. */
  createBridge: (req: RealtimeVoiceBridgeCreateRequest) => RealtimeVoiceBridge;
};

/** Registry entry: the voice capability plus its owning plugin id. */
export type PluginRealtimeVoiceProviderEntry = RealtimeVoiceProviderPlugin & {
  pluginId: string;
};

// These capabilities reuse their provider interfaces directly (no extra plugin fields).
export type MediaUnderstandingProviderPlugin = MediaUnderstandingProvider;
export type ImageGenerationProviderPlugin = ImageGenerationProvider;
|
||||
|
||||
|
|
@ -1850,6 +1898,10 @@ export type OpenClawPluginApi = {
|
|||
registerProvider: (provider: ProviderPlugin) => void;
|
||||
/** Register a speech synthesis provider (speech capability). */
|
||||
registerSpeechProvider: (provider: SpeechProviderPlugin) => void;
|
||||
/** Register a realtime transcription provider (streaming STT capability). */
|
||||
registerRealtimeTranscriptionProvider: (provider: RealtimeTranscriptionProviderPlugin) => void;
|
||||
/** Register a realtime voice provider (duplex voice capability). */
|
||||
registerRealtimeVoiceProvider: (provider: RealtimeVoiceProviderPlugin) => void;
|
||||
/** Register a media understanding provider (media understanding capability). */
|
||||
registerMediaUnderstandingProvider: (provider: MediaUnderstandingProviderPlugin) => void;
|
||||
/** Register an image generation provider (image generation capability). */
|
||||
|
|
|
|||
|
|
@ -0,0 +1,80 @@
|
|||
import type { OpenClawConfig } from "../config/config.js";
|
||||
import { resolvePluginCapabilityProviders } from "../plugins/capability-provider-runtime.js";
|
||||
import type { RealtimeTranscriptionProviderPlugin } from "../plugins/types.js";
|
||||
import type { RealtimeTranscriptionProviderId } from "./provider-types.js";
|
||||
|
||||
function trimToUndefined(value: string | undefined): string | undefined {
|
||||
const trimmed = value?.trim().toLowerCase();
|
||||
return trimmed ? trimmed : undefined;
|
||||
}
|
||||
|
||||
export function normalizeRealtimeTranscriptionProviderId(
|
||||
providerId: string | undefined,
|
||||
): RealtimeTranscriptionProviderId | undefined {
|
||||
return trimToUndefined(providerId);
|
||||
}
|
||||
|
||||
function resolveRealtimeTranscriptionProviderEntries(
|
||||
cfg?: OpenClawConfig,
|
||||
): RealtimeTranscriptionProviderPlugin[] {
|
||||
return resolvePluginCapabilityProviders({
|
||||
key: "realtimeTranscriptionProviders",
|
||||
cfg,
|
||||
});
|
||||
}
|
||||
|
||||
function buildProviderMaps(cfg?: OpenClawConfig): {
|
||||
canonical: Map<string, RealtimeTranscriptionProviderPlugin>;
|
||||
aliases: Map<string, RealtimeTranscriptionProviderPlugin>;
|
||||
} {
|
||||
const canonical = new Map<string, RealtimeTranscriptionProviderPlugin>();
|
||||
const aliases = new Map<string, RealtimeTranscriptionProviderPlugin>();
|
||||
const register = (provider: RealtimeTranscriptionProviderPlugin) => {
|
||||
const id = normalizeRealtimeTranscriptionProviderId(provider.id);
|
||||
if (!id) {
|
||||
return;
|
||||
}
|
||||
canonical.set(id, provider);
|
||||
aliases.set(id, provider);
|
||||
for (const alias of provider.aliases ?? []) {
|
||||
const normalizedAlias = normalizeRealtimeTranscriptionProviderId(alias);
|
||||
if (normalizedAlias) {
|
||||
aliases.set(normalizedAlias, provider);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
for (const provider of resolveRealtimeTranscriptionProviderEntries(cfg)) {
|
||||
register(provider);
|
||||
}
|
||||
|
||||
return { canonical, aliases };
|
||||
}
|
||||
|
||||
export function listRealtimeTranscriptionProviders(
|
||||
cfg?: OpenClawConfig,
|
||||
): RealtimeTranscriptionProviderPlugin[] {
|
||||
return [...buildProviderMaps(cfg).canonical.values()];
|
||||
}
|
||||
|
||||
export function getRealtimeTranscriptionProvider(
|
||||
providerId: string | undefined,
|
||||
cfg?: OpenClawConfig,
|
||||
): RealtimeTranscriptionProviderPlugin | undefined {
|
||||
const normalized = normalizeRealtimeTranscriptionProviderId(providerId);
|
||||
if (!normalized) {
|
||||
return undefined;
|
||||
}
|
||||
return buildProviderMaps(cfg).aliases.get(normalized);
|
||||
}
|
||||
|
||||
export function canonicalizeRealtimeTranscriptionProviderId(
|
||||
providerId: string | undefined,
|
||||
cfg?: OpenClawConfig,
|
||||
): RealtimeTranscriptionProviderId | undefined {
|
||||
const normalized = normalizeRealtimeTranscriptionProviderId(providerId);
|
||||
if (!normalized) {
|
||||
return undefined;
|
||||
}
|
||||
return getRealtimeTranscriptionProvider(normalized, cfg)?.id ?? normalized;
|
||||
}
|
||||
|
|
@ -0,0 +1,33 @@
|
|||
import type { OpenClawConfig } from "../config/config.js";
|
||||
|
||||
/** Identifier for a realtime transcription provider. */
export type RealtimeTranscriptionProviderId = string;

/** Opaque, provider-specific configuration bag. */
export type RealtimeTranscriptionProviderConfig = Record<string, unknown>;

/** Input to a provider's optional `resolveConfig` hook. */
export type RealtimeTranscriptionProviderResolveConfigContext = {
  cfg: OpenClawConfig;
  /** Raw config as supplied, before provider-specific resolution. */
  rawConfig: RealtimeTranscriptionProviderConfig;
};

/** Input to a provider's `isConfigured` check. */
export type RealtimeTranscriptionProviderConfiguredContext = {
  cfg?: OpenClawConfig;
  providerConfig: RealtimeTranscriptionProviderConfig;
};

/** Callbacks a session fires as transcription results stream in. */
export type RealtimeTranscriptionSessionCallbacks = {
  // presumably interim (non-final) text; verify against provider implementations
  onPartial?: (partial: string) => void;
  // presumably finalized transcript text; verify against provider implementations
  onTranscript?: (transcript: string) => void;
  onSpeechStart?: () => void;
  onError?: (error: Error) => void;
};

/** Everything needed to open a session: callbacks plus the resolved provider config. */
export type RealtimeTranscriptionSessionCreateRequest = RealtimeTranscriptionSessionCallbacks & {
  providerConfig: RealtimeTranscriptionProviderConfig;
};

/** Lifecycle of a live transcription session. */
export type RealtimeTranscriptionSession = {
  connect(): Promise<void>;
  sendAudio(audio: Buffer): void;
  close(): void;
  isConnected(): boolean;
};
|
||||
|
|
@ -0,0 +1,76 @@
|
|||
import type { OpenClawConfig } from "../config/config.js";
|
||||
import { resolvePluginCapabilityProviders } from "../plugins/capability-provider-runtime.js";
|
||||
import type { RealtimeVoiceProviderPlugin } from "../plugins/types.js";
|
||||
import type { RealtimeVoiceProviderId } from "./provider-types.js";
|
||||
|
||||
function trimToUndefined(value: string | undefined): string | undefined {
|
||||
const trimmed = value?.trim().toLowerCase();
|
||||
return trimmed ? trimmed : undefined;
|
||||
}
|
||||
|
||||
export function normalizeRealtimeVoiceProviderId(
|
||||
providerId: string | undefined,
|
||||
): RealtimeVoiceProviderId | undefined {
|
||||
return trimToUndefined(providerId);
|
||||
}
|
||||
|
||||
function resolveRealtimeVoiceProviderEntries(cfg?: OpenClawConfig): RealtimeVoiceProviderPlugin[] {
|
||||
return resolvePluginCapabilityProviders({
|
||||
key: "realtimeVoiceProviders",
|
||||
cfg,
|
||||
});
|
||||
}
|
||||
|
||||
function buildProviderMaps(cfg?: OpenClawConfig): {
|
||||
canonical: Map<string, RealtimeVoiceProviderPlugin>;
|
||||
aliases: Map<string, RealtimeVoiceProviderPlugin>;
|
||||
} {
|
||||
const canonical = new Map<string, RealtimeVoiceProviderPlugin>();
|
||||
const aliases = new Map<string, RealtimeVoiceProviderPlugin>();
|
||||
const register = (provider: RealtimeVoiceProviderPlugin) => {
|
||||
const id = normalizeRealtimeVoiceProviderId(provider.id);
|
||||
if (!id) {
|
||||
return;
|
||||
}
|
||||
canonical.set(id, provider);
|
||||
aliases.set(id, provider);
|
||||
for (const alias of provider.aliases ?? []) {
|
||||
const normalizedAlias = normalizeRealtimeVoiceProviderId(alias);
|
||||
if (normalizedAlias) {
|
||||
aliases.set(normalizedAlias, provider);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
for (const provider of resolveRealtimeVoiceProviderEntries(cfg)) {
|
||||
register(provider);
|
||||
}
|
||||
|
||||
return { canonical, aliases };
|
||||
}
|
||||
|
||||
export function listRealtimeVoiceProviders(cfg?: OpenClawConfig): RealtimeVoiceProviderPlugin[] {
|
||||
return [...buildProviderMaps(cfg).canonical.values()];
|
||||
}
|
||||
|
||||
export function getRealtimeVoiceProvider(
|
||||
providerId: string | undefined,
|
||||
cfg?: OpenClawConfig,
|
||||
): RealtimeVoiceProviderPlugin | undefined {
|
||||
const normalized = normalizeRealtimeVoiceProviderId(providerId);
|
||||
if (!normalized) {
|
||||
return undefined;
|
||||
}
|
||||
return buildProviderMaps(cfg).aliases.get(normalized);
|
||||
}
|
||||
|
||||
export function canonicalizeRealtimeVoiceProviderId(
|
||||
providerId: string | undefined,
|
||||
cfg?: OpenClawConfig,
|
||||
): RealtimeVoiceProviderId | undefined {
|
||||
const normalized = normalizeRealtimeVoiceProviderId(providerId);
|
||||
if (!normalized) {
|
||||
return undefined;
|
||||
}
|
||||
return getRealtimeVoiceProvider(normalized, cfg)?.id ?? normalized;
|
||||
}
|
||||
|
|
@ -0,0 +1,66 @@
|
|||
import type { OpenClawConfig } from "../config/config.js";
|
||||
|
||||
/** Identifier for a realtime voice provider. */
export type RealtimeVoiceProviderId = string;

/** Speaker side of a transcript line. */
export type RealtimeVoiceRole = "user" | "assistant";

/** Why a bridge closed: normal completion or a failure. */
export type RealtimeVoiceCloseReason = "completed" | "error";

/** Function-tool definition exposed to the realtime voice model. */
export type RealtimeVoiceTool = {
  type: "function";
  name: string;
  description: string;
  /** JSON-Schema-shaped parameter description — TODO confirm exact dialect expected by providers. */
  parameters: {
    type: "object";
    properties: Record<string, unknown>;
    required?: string[];
  };
};

/** A tool invocation requested by the provider during a call. */
export type RealtimeVoiceToolCallEvent = {
  itemId: string;
  callId: string;
  name: string;
  /** Raw arguments; shape depends on the tool's declared parameters. */
  args: unknown;
};

/** Callbacks a bridge fires as the duplex voice session progresses. */
export type RealtimeVoiceBridgeCallbacks = {
  // Outbound audio; parameter name suggests mu-law encoding — confirm with provider implementations.
  onAudio: (muLaw: Buffer) => void;
  /** Request that any buffered outbound audio be discarded. */
  onClearAudio: () => void;
  onMark?: (markName: string) => void;
  /** Streaming transcript text for either side; `isFinal` distinguishes interim vs final. */
  onTranscript?: (role: RealtimeVoiceRole, text: string, isFinal: boolean) => void;
  onToolCall?: (event: RealtimeVoiceToolCallEvent) => void;
  onReady?: () => void;
  onError?: (error: Error) => void;
  onClose?: (reason: RealtimeVoiceCloseReason) => void;
};

/** Opaque, provider-specific configuration bag. */
export type RealtimeVoiceProviderConfig = Record<string, unknown>;

/** Input to a provider's optional `resolveConfig` hook. */
export type RealtimeVoiceProviderResolveConfigContext = {
  cfg: OpenClawConfig;
  /** Raw config as supplied, before provider-specific resolution. */
  rawConfig: RealtimeVoiceProviderConfig;
};

/** Input to a provider's `isConfigured` check. */
export type RealtimeVoiceProviderConfiguredContext = {
  cfg?: OpenClawConfig;
  providerConfig: RealtimeVoiceProviderConfig;
};

/** Everything needed to open a bridge: callbacks, resolved config, and optional session setup. */
export type RealtimeVoiceBridgeCreateRequest = RealtimeVoiceBridgeCallbacks & {
  providerConfig: RealtimeVoiceProviderConfig;
  instructions?: string;
  tools?: RealtimeVoiceTool[];
};

/** Lifecycle and control surface of a live duplex voice bridge. */
export type RealtimeVoiceBridge = {
  connect(): Promise<void>;
  sendAudio(audio: Buffer): void;
  setMediaTimestamp(ts: number): void;
  sendUserMessage?(text: string): void;
  triggerGreeting?(instructions?: string): void;
  /** Deliver a tool result back to the provider for the given call id. */
  submitToolResult(callId: string, result: unknown): void;
  acknowledgeMark(): void;
  close(): void;
  isConnected(): boolean;
};
|
||||
|
|
@ -27,6 +27,8 @@ export const createTestRegistry = (channels: TestChannelRegistration[] = []): Pl
|
|||
})),
|
||||
providers: [],
|
||||
speechProviders: [],
|
||||
realtimeTranscriptionProviders: [],
|
||||
realtimeVoiceProviders: [],
|
||||
mediaUnderstandingProviders: [],
|
||||
imageGenerationProviders: [],
|
||||
webFetchProviders: [],
|
||||
|
|
|
|||
|
|
@ -20,6 +20,8 @@ export function createTestPluginApi(api: TestPluginApiInput): OpenClawPluginApi
|
|||
registerCliBackend() {},
|
||||
registerProvider() {},
|
||||
registerSpeechProvider() {},
|
||||
registerRealtimeTranscriptionProvider() {},
|
||||
registerRealtimeVoiceProvider() {},
|
||||
registerMediaUnderstandingProvider() {},
|
||||
registerImageGenerationProvider() {},
|
||||
registerWebFetchProvider() {},
|
||||
|
|
|
|||
|
|
@ -92,6 +92,8 @@ export const pluginRegistrationContractCases = {
|
|||
pluginId: "openai",
|
||||
providerIds: ["openai", "openai-codex"],
|
||||
speechProviderIds: ["openai"],
|
||||
realtimeTranscriptionProviderIds: ["openai"],
|
||||
realtimeVoiceProviderIds: ["openai"],
|
||||
mediaUnderstandingProviderIds: ["openai", "openai-codex"],
|
||||
imageGenerationProviderIds: ["openai"],
|
||||
cliBackendIds: ["codex-cli"],
|
||||
|
|
|
|||
|
|
@ -13,6 +13,8 @@ type PluginRegistrationContractParams = {
|
|||
webFetchProviderIds?: string[];
|
||||
webSearchProviderIds?: string[];
|
||||
speechProviderIds?: string[];
|
||||
realtimeTranscriptionProviderIds?: string[];
|
||||
realtimeVoiceProviderIds?: string[];
|
||||
mediaUnderstandingProviderIds?: string[];
|
||||
imageGenerationProviderIds?: string[];
|
||||
cliBackendIds?: string[];
|
||||
|
|
@ -122,6 +124,22 @@ export function describePluginRegistrationContract(params: PluginRegistrationCon
|
|||
});
|
||||
}
|
||||
|
||||
if (params.realtimeTranscriptionProviderIds) {
|
||||
it("keeps bundled realtime-transcription ownership explicit", () => {
|
||||
expect(findRegistration(params.pluginId).realtimeTranscriptionProviderIds).toEqual(
|
||||
params.realtimeTranscriptionProviderIds,
|
||||
);
|
||||
});
|
||||
}
|
||||
|
||||
if (params.realtimeVoiceProviderIds) {
|
||||
it("keeps bundled realtime-voice ownership explicit", () => {
|
||||
expect(findRegistration(params.pluginId).realtimeVoiceProviderIds).toEqual(
|
||||
params.realtimeVoiceProviderIds,
|
||||
);
|
||||
});
|
||||
}
|
||||
|
||||
if (params.mediaUnderstandingProviderIds) {
|
||||
it("keeps bundled media-understanding ownership explicit", () => {
|
||||
expect(findRegistration(params.pluginId).mediaUnderstandingProviderIds).toEqual(
|
||||
|
|
|
|||
|
|
@ -110,6 +110,8 @@ function createTestRegistryForSetup(
|
|||
})),
|
||||
providers: [],
|
||||
speechProviders: [],
|
||||
realtimeTranscriptionProviders: [],
|
||||
realtimeVoiceProviders: [],
|
||||
mediaUnderstandingProviders: [],
|
||||
imageGenerationProviders: [],
|
||||
webFetchProviders: [],
|
||||
|
|
|
|||
|
|
@ -1,23 +1,23 @@
|
|||
import { createScopedVitestConfig } from "./vitest.scoped-config.ts";
|
||||
import { boundaryTestFiles } from "./vitest.unit-paths.mjs";
|
||||
import { defineConfig } from "vitest/config";
|
||||
import { sharedVitestConfig } from "./vitest.shared.config.ts";
|
||||
|
||||
export function createContractsVitestConfig(env?: Record<string, string | undefined>) {
|
||||
return createScopedVitestConfig(
|
||||
[
|
||||
"src/channels/plugins/contracts/**/*.test.ts",
|
||||
"src/config/doc-baseline.integration.test.ts",
|
||||
"src/config/schema.base.generated.test.ts",
|
||||
"src/config/schema.help.quality.test.ts",
|
||||
"src/plugins/contracts/**/*.test.ts",
|
||||
"test/**/*.test.ts",
|
||||
],
|
||||
{
|
||||
env,
|
||||
exclude: boundaryTestFiles,
|
||||
name: "contracts",
|
||||
const base = sharedVitestConfig as Record<string, unknown>;
|
||||
const baseTest = sharedVitestConfig.test ?? {};
|
||||
|
||||
export function createContractsVitestConfig() {
|
||||
return defineConfig({
|
||||
...base,
|
||||
test: {
|
||||
...baseTest,
|
||||
isolate: true,
|
||||
setupFiles: baseTest.setupFiles ?? [],
|
||||
include: [
|
||||
"src/channels/plugins/contracts/**/*.test.ts",
|
||||
"src/plugins/contracts/**/*.test.ts",
|
||||
],
|
||||
passWithNoTests: true,
|
||||
},
|
||||
);
|
||||
});
|
||||
}
|
||||
|
||||
export default createContractsVitestConfig();
|
||||
|
|
|
|||
Loading…
Reference in New Issue