From a23ab9b906dc6a4f6b24bb3f681f395eb792dbcd Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sat, 4 Apr 2026 12:04:37 +0900 Subject: [PATCH] refactor: move voice-call realtime providers into extensions --- CHANGELOG.md | 1 + docs/plugins/architecture.md | 6 +- docs/plugins/building-plugins.md | 1 + docs/plugins/manifest.md | 18 +- docs/plugins/sdk-overview.md | 20 +- docs/plugins/sdk-provider-plugins.md | 31 +- extensions/anthropic/test-api.ts | 1 + extensions/deepgram/test-api.ts | 1 + extensions/fal/test-api.ts | 1 + extensions/google/test-api.ts | 2 + extensions/groq/test-api.ts | 1 + extensions/lobster/src/lobster-tool.test.ts | 2 + extensions/mattermost/runtime-api.ts | 2 +- extensions/minimax/test-api.ts | 8 + extensions/mistral/test-api.ts | 1 + extensions/moonshot/test-api.ts | 1 + extensions/openai/api.ts | 2 + extensions/openai/index.ts | 4 + .../openai/openai-codex-provider.test.ts | 10 +- extensions/openai/openclaw.plugin.json | 2 + extensions/openai/package.json | 3 + .../realtime-transcription-provider.test.ts | 27 + .../openai/realtime-transcription-provider.ts | 267 +++++++++ extensions/openai/realtime-voice-provider.ts | 535 ++++++++++++++++++ extensions/openai/test-api.ts | 2 + extensions/openrouter/test-api.ts | 1 + extensions/voice-call/README.md | 2 +- extensions/voice-call/index.ts | 17 +- extensions/voice-call/openclaw.plugin.json | 87 ++- extensions/voice-call/src/config.test.ts | 45 +- extensions/voice-call/src/config.ts | 315 ++++++++++- .../voice-call/src/manager/outbound.test.ts | 4 +- extensions/voice-call/src/manager/outbound.ts | 25 +- .../voice-call/src/media-stream.test.ts | 62 +- extensions/voice-call/src/media-stream.ts | 49 +- extensions/voice-call/src/providers/index.ts | 5 - .../src/providers/stt-openai-realtime.test.ts | 42 -- .../src/providers/stt-openai-realtime.ts | 321 ----------- .../src/providers/tts-openai.test.ts | 43 -- .../voice-call/src/providers/tts-openai.ts | 185 ------ 
.../src/realtime-transcription.runtime.ts | 4 + .../voice-call/src/realtime-voice.runtime.ts | 4 + extensions/voice-call/src/runtime.ts | 105 +++- extensions/voice-call/src/test-fixtures.ts | 18 +- extensions/voice-call/src/webhook.test.ts | 79 +++ extensions/voice-call/src/webhook.ts | 128 ++++- .../src/webhook/realtime-handler.test.ts | 92 +++ .../src/webhook/realtime-handler.ts | 413 ++++++++++++++ extensions/zai/test-api.ts | 1 + package.json | 8 + scripts/lib/plugin-sdk-entrypoints.json | 2 + scripts/write-cli-startup-metadata.ts | 46 +- src/cli/program/root-help.ts | 22 +- src/gateway/server-plugins.test.ts | 2 + src/gateway/test-helpers.mocks.ts | 2 + src/plugin-sdk/core.ts | 1 + src/plugin-sdk/index.ts | 1 + src/plugin-sdk/plugin-entry.ts | 2 + src/plugin-sdk/realtime-transcription.ts | 16 + src/plugin-sdk/realtime-voice.ts | 20 + src/plugin-sdk/speech.ts | 85 ++- src/plugins/api-builder.ts | 10 + .../bundled-capability-metadata.test.ts | 6 + src/plugins/bundled-capability-metadata.ts | 18 + src/plugins/bundled-capability-runtime.ts | 26 + .../capability-provider-runtime.test.ts | 9 +- src/plugins/capability-provider-runtime.ts | 6 + src/plugins/captured-registration.ts | 14 + src/plugins/cli.ts | 3 +- .../contracts/registry.contract.test.ts | 39 +- src/plugins/contracts/registry.ts | 53 ++ .../contracts/speech-vitest-registry.ts | 104 ++++ src/plugins/loader.ts | 2 + src/plugins/manifest.ts | 6 + src/plugins/registry-empty.ts | 2 + src/plugins/registry.ts | 44 +- src/plugins/runtime.test.ts | 4 + src/plugins/status.test-helpers.ts | 6 +- src/plugins/status.ts | 4 + src/plugins/types.ts | 52 ++ .../provider-registry.ts | 80 +++ src/realtime-transcription/provider-types.ts | 33 ++ src/realtime-voice/provider-registry.ts | 76 +++ src/realtime-voice/provider-types.ts | 66 +++ src/test-utils/channel-plugins.ts | 2 + test/helpers/plugins/plugin-api.ts | 2 + .../plugin-registration-contract-cases.ts | 2 + .../plugins/plugin-registration-contract.ts | 18 + 
test/setup-openclaw-runtime.ts | 2 + vitest.contracts.config.ts | 34 +- 90 files changed, 3134 insertions(+), 792 deletions(-) create mode 100644 extensions/deepgram/test-api.ts create mode 100644 extensions/fal/test-api.ts create mode 100644 extensions/groq/test-api.ts create mode 100644 extensions/minimax/test-api.ts create mode 100644 extensions/mistral/test-api.ts create mode 100644 extensions/openai/realtime-transcription-provider.test.ts create mode 100644 extensions/openai/realtime-transcription-provider.ts create mode 100644 extensions/openai/realtime-voice-provider.ts create mode 100644 extensions/openrouter/test-api.ts delete mode 100644 extensions/voice-call/src/providers/stt-openai-realtime.test.ts delete mode 100644 extensions/voice-call/src/providers/stt-openai-realtime.ts delete mode 100644 extensions/voice-call/src/providers/tts-openai.test.ts delete mode 100644 extensions/voice-call/src/providers/tts-openai.ts create mode 100644 extensions/voice-call/src/realtime-transcription.runtime.ts create mode 100644 extensions/voice-call/src/realtime-voice.runtime.ts create mode 100644 extensions/voice-call/src/webhook/realtime-handler.test.ts create mode 100644 extensions/voice-call/src/webhook/realtime-handler.ts create mode 100644 extensions/zai/test-api.ts create mode 100644 src/plugin-sdk/realtime-transcription.ts create mode 100644 src/plugin-sdk/realtime-voice.ts create mode 100644 src/realtime-transcription/provider-registry.ts create mode 100644 src/realtime-transcription/provider-types.ts create mode 100644 src/realtime-voice/provider-registry.ts create mode 100644 src/realtime-voice/provider-types.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index a8e1fac5305..dbeb0470988 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -48,6 +48,7 @@ Docs: https://docs.openclaw.ai - Telegram/native commands: clean up metadata-driven progress placeholders when replies fall back, edits fail, or local exec approval prompts are suppressed. (#59300) Thanks @jalehman. 
- Matrix/backup reset: recreate secret storage during backup reset when stale SSSS state blocks durable backup-key reload, including no-backup repair paths. (#60599) thanks @emonty. - Matrix: allow secret-storage recreation during automatic repair bootstrap so clients that lose their recovery key can recover and persist new cross-signing keys. (#59846) Thanks @al3mart. +- Voice Call/OpenAI: move realtime voice and realtime transcription onto provider-owned plugin capabilities so `voice-call` uses generic provider selection while keeping realtime Twilio replay and custom webhook-path handling working. - Matrix/crypto persistence: capture and write the IndexedDB snapshot while holding the snapshot file lock so concurrent gateway and CLI persists cannot overwrite newer crypto state. (#59851) Thanks @al3mart. - Matrix/media: surface a dedicated `[matrix attachment too large]` marker for oversized inbound media instead of the generic unavailable marker, and classify size-limit failures with a typed Matrix error. (#60289) Thanks @efe-arv. - Matrix/Telegram exec approvals: recover stored same-channel account bindings even when session reply state drifted to another channel, so foreign-channel approvals route to the bound account instead of fanning out or being rejected as ambiguous. (#60417) thanks @gumadeiras. 
diff --git a/docs/plugins/architecture.md b/docs/plugins/architecture.md index 75695e2c718..c81d32239fd 100644 --- a/docs/plugins/architecture.md +++ b/docs/plugins/architecture.md @@ -32,6 +32,7 @@ native OpenClaw plugin registers against one or more capability types: | Text inference | `api.registerProvider(...)` | `openai`, `anthropic` | | CLI inference backend | `api.registerCliBackend(...)` | `openai`, `anthropic` | | Speech | `api.registerSpeechProvider(...)` | `elevenlabs`, `microsoft` | +| Realtime voice | `api.registerRealtimeVoiceProvider(...)` | `openai` | | Media understanding | `api.registerMediaUnderstandingProvider(...)` | `openai`, `google` | | Image generation | `api.registerImageGenerationProvider(...)` | `openai`, `google` | | Web search | `api.registerWebSearchProvider(...)` | `google` | @@ -239,8 +240,9 @@ Examples: - the bundled `minimax`, `mistral`, `moonshot`, and `zai` plugins own their media-understanding backends - the `voice-call` plugin is a feature plugin: it owns call transport, tools, - CLI, routes, and runtime, but it consumes core TTS/STT capability instead of - inventing a second speech stack + CLI, routes, and Twilio media-stream bridging, but it consumes shared speech + plus realtime-transcription and realtime-voice capabilities instead of + importing vendor plugins directly The intended end state is: diff --git a/docs/plugins/building-plugins.md b/docs/plugins/building-plugins.md index 155b90a108b..963fe43dac6 100644 --- a/docs/plugins/building-plugins.md +++ b/docs/plugins/building-plugins.md @@ -146,6 +146,7 @@ A single plugin can register any number of capabilities via the `api` object: | CLI inference backend | `api.registerCliBackend(...)` | [CLI Backends](/gateway/cli-backends) | | Channel / messaging | `api.registerChannel(...)` | [Channel Plugins](/plugins/sdk-channel-plugins) | | Speech (TTS/STT) | `api.registerSpeechProvider(...)` | [Provider Plugins](/plugins/sdk-provider-plugins#step-5-add-extra-capabilities) | +| 
Realtime voice | `api.registerRealtimeVoiceProvider(...)` | [Provider Plugins](/plugins/sdk-provider-plugins#step-5-add-extra-capabilities) | | Media understanding | `api.registerMediaUnderstandingProvider(...)` | [Provider Plugins](/plugins/sdk-provider-plugins#step-5-add-extra-capabilities) | | Image generation | `api.registerImageGenerationProvider(...)` | [Provider Plugins](/plugins/sdk-provider-plugins#step-5-add-extra-capabilities) | | Web search | `api.registerWebSearchProvider(...)` | [Provider Plugins](/plugins/sdk-provider-plugins#step-5-add-extra-capabilities) | diff --git a/docs/plugins/manifest.md b/docs/plugins/manifest.md index 256715f3dc4..881c9615075 100644 --- a/docs/plugins/manifest.md +++ b/docs/plugins/manifest.md @@ -196,6 +196,8 @@ read without importing the plugin runtime. { "contracts": { "speechProviders": ["openai"], + "realtimeTranscriptionProviders": ["openai"], + "realtimeVoiceProviders": ["openai"], "mediaUnderstandingProviders": ["openai", "openai-codex"], "imageGenerationProviders": ["openai"], "webSearchProviders": ["gemini"], @@ -206,13 +208,15 @@ read without importing the plugin runtime. Each list is optional: -| Field | Type | What it means | -| ----------------------------- | ---------- | -------------------------------------------------------------- | -| `speechProviders` | `string[]` | Speech provider ids this plugin owns. | -| `mediaUnderstandingProviders` | `string[]` | Media-understanding provider ids this plugin owns. | -| `imageGenerationProviders` | `string[]` | Image-generation provider ids this plugin owns. | -| `webSearchProviders` | `string[]` | Web-search provider ids this plugin owns. | -| `tools` | `string[]` | Agent tool names this plugin owns for bundled contract checks. | +| Field | Type | What it means | +| -------------------------------- | ---------- | -------------------------------------------------------------- | +| `speechProviders` | `string[]` | Speech provider ids this plugin owns. 
| +| `realtimeTranscriptionProviders` | `string[]` | Realtime-transcription provider ids this plugin owns. | +| `realtimeVoiceProviders` | `string[]` | Realtime-voice provider ids this plugin owns. | +| `mediaUnderstandingProviders` | `string[]` | Media-understanding provider ids this plugin owns. | +| `imageGenerationProviders` | `string[]` | Image-generation provider ids this plugin owns. | +| `webSearchProviders` | `string[]` | Web-search provider ids this plugin owns. | +| `tools` | `string[]` | Agent tool names this plugin owns for bundled contract checks. | Legacy top-level `speechProviders`, `mediaUnderstandingProviders`, and `imageGenerationProviders` are deprecated. Use `openclaw doctor --fix` to move diff --git a/docs/plugins/sdk-overview.md b/docs/plugins/sdk-overview.md index 840bb079857..52c90e793d4 100644 --- a/docs/plugins/sdk-overview.md +++ b/docs/plugins/sdk-overview.md @@ -128,15 +128,17 @@ methods: ### Capability registration -| Method | What it registers | -| --------------------------------------------- | ------------------------------ | -| `api.registerProvider(...)` | Text inference (LLM) | -| `api.registerCliBackend(...)` | Local CLI inference backend | -| `api.registerChannel(...)` | Messaging channel | -| `api.registerSpeechProvider(...)` | Text-to-speech / STT synthesis | -| `api.registerMediaUnderstandingProvider(...)` | Image/audio/video analysis | -| `api.registerImageGenerationProvider(...)` | Image generation | -| `api.registerWebSearchProvider(...)` | Web search | +| Method | What it registers | +| ------------------------------------------------ | -------------------------------- | +| `api.registerProvider(...)` | Text inference (LLM) | +| `api.registerCliBackend(...)` | Local CLI inference backend | +| `api.registerChannel(...)` | Messaging channel | +| `api.registerSpeechProvider(...)` | Text-to-speech / STT synthesis | +| `api.registerRealtimeTranscriptionProvider(...)` | Streaming realtime transcription | +| 
`api.registerRealtimeVoiceProvider(...)` | Duplex realtime voice sessions | +| `api.registerMediaUnderstandingProvider(...)` | Image/audio/video analysis | +| `api.registerImageGenerationProvider(...)` | Image generation | +| `api.registerWebSearchProvider(...)` | Web search | ### Tools and commands diff --git a/docs/plugins/sdk-provider-plugins.md b/docs/plugins/sdk-provider-plugins.md index afeddcab209..a1133c940c9 100644 --- a/docs/plugins/sdk-provider-plugins.md +++ b/docs/plugins/sdk-provider-plugins.md @@ -324,8 +324,8 @@ API key auth, and dynamic model resolution. - A provider plugin can register speech, media understanding, image - generation, and web search alongside text inference: + A provider plugin can register speech, realtime transcription, realtime voice, media + understanding, image generation, and web search alongside text inference: ```typescript register(api) { @@ -343,6 +343,33 @@ API key auth, and dynamic model resolution. }), }); + api.registerRealtimeTranscriptionProvider({ + id: "acme-ai", + label: "Acme Realtime Transcription", + isConfigured: () => true, + createSession: (req) => ({ + connect: async () => {}, + sendAudio: () => {}, + close: () => {}, + isConnected: () => true, + }), + }); + + api.registerRealtimeVoiceProvider({ + id: "acme-ai", + label: "Acme Realtime Voice", + isConfigured: ({ providerConfig }) => Boolean(providerConfig.apiKey), + createBridge: (req) => ({ + connect: async () => {}, + sendAudio: () => {}, + setMediaTimestamp: () => {}, + submitToolResult: () => {}, + acknowledgeMark: () => {}, + close: () => {}, + isConnected: () => true, + }), + }); + api.registerMediaUnderstandingProvider({ id: "acme-ai", capabilities: ["image", "audio"], diff --git a/extensions/anthropic/test-api.ts b/extensions/anthropic/test-api.ts index 7d467629cfb..3caea18a4c1 100644 --- a/extensions/anthropic/test-api.ts +++ b/extensions/anthropic/test-api.ts @@ -1 +1,2 @@ export { buildAnthropicCliBackend } from "./cli-backend.js"; +export { 
anthropicMediaUnderstandingProvider } from "./media-understanding-provider.js"; diff --git a/extensions/deepgram/test-api.ts b/extensions/deepgram/test-api.ts new file mode 100644 index 00000000000..89dff7f7255 --- /dev/null +++ b/extensions/deepgram/test-api.ts @@ -0,0 +1 @@ +export { deepgramMediaUnderstandingProvider } from "./media-understanding-provider.js"; diff --git a/extensions/fal/test-api.ts b/extensions/fal/test-api.ts new file mode 100644 index 00000000000..e9accc54e5b --- /dev/null +++ b/extensions/fal/test-api.ts @@ -0,0 +1 @@ +export { buildFalImageGenerationProvider } from "./image-generation-provider.js"; diff --git a/extensions/google/test-api.ts b/extensions/google/test-api.ts index 2848ab1f800..0d173de2c95 100644 --- a/extensions/google/test-api.ts +++ b/extensions/google/test-api.ts @@ -1 +1,3 @@ export { buildGoogleGeminiCliBackend } from "./cli-backend.js"; +export { buildGoogleImageGenerationProvider } from "./image-generation-provider.js"; +export { googleMediaUnderstandingProvider } from "./media-understanding-provider.js"; diff --git a/extensions/groq/test-api.ts b/extensions/groq/test-api.ts new file mode 100644 index 00000000000..24bc9ceb6a6 --- /dev/null +++ b/extensions/groq/test-api.ts @@ -0,0 +1 @@ +export { groqMediaUnderstandingProvider } from "./media-understanding-provider.js"; diff --git a/extensions/lobster/src/lobster-tool.test.ts b/extensions/lobster/src/lobster-tool.test.ts index 0f39acc5b7d..30621f54b19 100644 --- a/extensions/lobster/src/lobster-tool.test.ts +++ b/extensions/lobster/src/lobster-tool.test.ts @@ -47,6 +47,8 @@ function fakeApi(overrides: Partial = {}): OpenClawPluginApi registerCliBackend() {}, registerProvider() {}, registerSpeechProvider() {}, + registerRealtimeTranscriptionProvider() {}, + registerRealtimeVoiceProvider() {}, registerMediaUnderstandingProvider() {}, registerImageGenerationProvider() {}, registerWebFetchProvider() {}, diff --git a/extensions/mattermost/runtime-api.ts 
b/extensions/mattermost/runtime-api.ts index 8168a2e00b2..e77586cba1a 100644 --- a/extensions/mattermost/runtime-api.ts +++ b/extensions/mattermost/runtime-api.ts @@ -31,7 +31,7 @@ export { isTrustedProxyAddress, } from "openclaw/plugin-sdk/core"; export { buildComputedAccountStatusSnapshot } from "openclaw/plugin-sdk/channel-status"; -export { createAccountStatusSink } from "openclaw/plugin-sdk/compat"; +export { createAccountStatusSink } from "openclaw/plugin-sdk/channel-lifecycle"; export { buildAgentMediaPayload } from "openclaw/plugin-sdk/agent-media-payload"; export { buildModelsProviderData, diff --git a/extensions/minimax/test-api.ts b/extensions/minimax/test-api.ts new file mode 100644 index 00000000000..5e130df00c3 --- /dev/null +++ b/extensions/minimax/test-api.ts @@ -0,0 +1,8 @@ +export { + buildMinimaxImageGenerationProvider, + buildMinimaxPortalImageGenerationProvider, +} from "./image-generation-provider.js"; +export { + minimaxMediaUnderstandingProvider, + minimaxPortalMediaUnderstandingProvider, +} from "./media-understanding-provider.js"; diff --git a/extensions/mistral/test-api.ts b/extensions/mistral/test-api.ts new file mode 100644 index 00000000000..14e820308cf --- /dev/null +++ b/extensions/mistral/test-api.ts @@ -0,0 +1 @@ +export { mistralMediaUnderstandingProvider } from "./media-understanding-provider.js"; diff --git a/extensions/moonshot/test-api.ts b/extensions/moonshot/test-api.ts index 9168ea3be27..9974ca37872 100644 --- a/extensions/moonshot/test-api.ts +++ b/extensions/moonshot/test-api.ts @@ -1 +1,2 @@ export { __testing } from "./src/kimi-web-search-provider.js"; +export { moonshotMediaUnderstandingProvider } from "./media-understanding-provider.js"; diff --git a/extensions/openai/api.ts b/extensions/openai/api.ts index 8520db0c9b9..7f144f9aa11 100644 --- a/extensions/openai/api.ts +++ b/extensions/openai/api.ts @@ -11,3 +11,5 @@ export { } from "./default-models.js"; export { buildOpenAICodexProvider } from 
"./openai-codex-catalog.js"; export { buildOpenAIProvider } from "./openai-provider.js"; +export { buildOpenAIRealtimeTranscriptionProvider } from "./realtime-transcription-provider.js"; +export { buildOpenAIRealtimeVoiceProvider } from "./realtime-voice-provider.js"; diff --git a/extensions/openai/index.ts b/extensions/openai/index.ts index 0663f6779ea..6fe208dc049 100644 --- a/extensions/openai/index.ts +++ b/extensions/openai/index.ts @@ -12,6 +12,8 @@ import { resolveOpenAIPromptOverlayMode, shouldApplyOpenAIPromptOverlay, } from "./prompt-overlay.js"; +import { buildOpenAIRealtimeTranscriptionProvider } from "./realtime-transcription-provider.js"; +import { buildOpenAIRealtimeVoiceProvider } from "./realtime-voice-provider.js"; import { buildOpenAISpeechProvider } from "./speech-provider.js"; export default definePluginEntry({ @@ -24,6 +26,8 @@ export default definePluginEntry({ api.registerProvider(buildOpenAIProvider()); api.registerProvider(buildOpenAICodexProviderPlugin()); api.registerSpeechProvider(buildOpenAISpeechProvider()); + api.registerRealtimeTranscriptionProvider(buildOpenAIRealtimeTranscriptionProvider()); + api.registerRealtimeVoiceProvider(buildOpenAIRealtimeVoiceProvider()); api.registerMediaUnderstandingProvider(openaiMediaUnderstandingProvider); api.registerMediaUnderstandingProvider(openaiCodexMediaUnderstandingProvider); api.registerImageGenerationProvider(buildOpenAIImageGenerationProvider()); diff --git a/extensions/openai/openai-codex-provider.test.ts b/extensions/openai/openai-codex-provider.test.ts index e2768dd96d3..3f43317807b 100644 --- a/extensions/openai/openai-codex-provider.test.ts +++ b/extensions/openai/openai-codex-provider.test.ts @@ -103,16 +103,16 @@ describe("openai codex provider", () => { api: "openai-codex-responses", baseUrl: "https://chatgpt.com/backend-api", reasoning: true, - input: ["text", "image"], + input: ["text", "image"] as const, cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 }, contextWindow: 
272_000, maxTokens: 128_000, }; } - return null; - }, + return undefined; + }), } as never, - } as never); + }); expect(model).toMatchObject({ id: "gpt-5.4", @@ -173,7 +173,7 @@ describe("openai codex provider", () => { contextWindow: 272_000, }, ], - }); + } as never); expect(entries).toContainEqual( expect.objectContaining({ diff --git a/extensions/openai/openclaw.plugin.json b/extensions/openai/openclaw.plugin.json index 76cf35db0fb..17ad0e97879 100644 --- a/extensions/openai/openclaw.plugin.json +++ b/extensions/openai/openclaw.plugin.json @@ -34,6 +34,8 @@ ], "contracts": { "speechProviders": ["openai"], + "realtimeTranscriptionProviders": ["openai"], + "realtimeVoiceProviders": ["openai"], "mediaUnderstandingProviders": ["openai", "openai-codex"], "imageGenerationProviders": ["openai"] }, diff --git a/extensions/openai/package.json b/extensions/openai/package.json index 327c8a34760..2f5645ac179 100644 --- a/extensions/openai/package.json +++ b/extensions/openai/package.json @@ -4,6 +4,9 @@ "private": true, "description": "OpenClaw OpenAI provider plugins", "type": "module", + "dependencies": { + "ws": "^8.20.0" + }, "openclaw": { "extensions": [ "./index.ts" diff --git a/extensions/openai/realtime-transcription-provider.test.ts b/extensions/openai/realtime-transcription-provider.test.ts new file mode 100644 index 00000000000..214b4908cf5 --- /dev/null +++ b/extensions/openai/realtime-transcription-provider.test.ts @@ -0,0 +1,27 @@ +import { describe, expect, it } from "vitest"; +import { buildOpenAIRealtimeTranscriptionProvider } from "./realtime-transcription-provider.js"; + +describe("buildOpenAIRealtimeTranscriptionProvider", () => { + it("normalizes OpenAI config defaults", () => { + const provider = buildOpenAIRealtimeTranscriptionProvider(); + const resolved = provider.resolveConfig?.({ + cfg: {} as never, + rawConfig: { + providers: { + openai: { + apiKey: "sk-test", // pragma: allowlist secret + }, + }, + }, + }); + + expect(resolved).toEqual({ + 
apiKey: "sk-test", + }); + }); + + it("accepts the legacy openai-realtime alias", () => { + const provider = buildOpenAIRealtimeTranscriptionProvider(); + expect(provider.aliases).toContain("openai-realtime"); + }); +}); diff --git a/extensions/openai/realtime-transcription-provider.ts b/extensions/openai/realtime-transcription-provider.ts new file mode 100644 index 00000000000..d4fd8d09350 --- /dev/null +++ b/extensions/openai/realtime-transcription-provider.ts @@ -0,0 +1,267 @@ +import type { + RealtimeTranscriptionProviderConfig, + RealtimeTranscriptionProviderPlugin, + RealtimeTranscriptionSession, + RealtimeTranscriptionSessionCreateRequest, +} from "openclaw/plugin-sdk/realtime-transcription"; +import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input"; +import WebSocket from "ws"; + +type OpenAIRealtimeTranscriptionProviderConfig = { + apiKey?: string; + model?: string; + silenceDurationMs?: number; + vadThreshold?: number; +}; + +type OpenAIRealtimeTranscriptionSessionConfig = RealtimeTranscriptionSessionCreateRequest & { + apiKey: string; + model: string; + silenceDurationMs: number; + vadThreshold: number; +}; + +type RealtimeEvent = { + type: string; + delta?: string; + transcript?: string; + error?: unknown; +}; + +function trimToUndefined(value: unknown): string | undefined { + return typeof value === "string" && value.trim() ? value.trim() : undefined; +} + +function asNumber(value: unknown): number | undefined { + return typeof value === "number" && Number.isFinite(value) ? value : undefined; +} + +function asObject(value: unknown): Record | undefined { + return typeof value === "object" && value !== null && !Array.isArray(value) + ? (value as Record) + : undefined; +} + +function normalizeProviderConfig( + config: RealtimeTranscriptionProviderConfig, +): OpenAIRealtimeTranscriptionProviderConfig { + const providers = asObject(config.providers); + const raw = asObject(providers?.openai) ?? asObject(config.openai) ?? 
asObject(config); + return { + apiKey: + normalizeResolvedSecretInputString({ + value: raw?.apiKey, + path: "plugins.entries.voice-call.config.streaming.providers.openai.apiKey", + }) ?? + normalizeResolvedSecretInputString({ + value: raw?.openaiApiKey, + path: "plugins.entries.voice-call.config.streaming.openaiApiKey", + }), + model: trimToUndefined(raw?.model) ?? trimToUndefined(raw?.sttModel), + silenceDurationMs: asNumber(raw?.silenceDurationMs), + vadThreshold: asNumber(raw?.vadThreshold), + }; +} + +function readProviderConfig( + providerConfig: RealtimeTranscriptionProviderConfig, +): OpenAIRealtimeTranscriptionProviderConfig { + return normalizeProviderConfig(providerConfig); +} + +class OpenAIRealtimeTranscriptionSession implements RealtimeTranscriptionSession { + private static readonly MAX_RECONNECT_ATTEMPTS = 5; + private static readonly RECONNECT_DELAY_MS = 1000; + private static readonly CONNECT_TIMEOUT_MS = 10_000; + + private ws: WebSocket | null = null; + private connected = false; + private closed = false; + private reconnectAttempts = 0; + private pendingTranscript = ""; + + constructor(private readonly config: OpenAIRealtimeTranscriptionSessionConfig) {} + + async connect(): Promise { + this.closed = false; + this.reconnectAttempts = 0; + await this.doConnect(); + } + + sendAudio(audio: Buffer): void { + if (this.ws?.readyState !== WebSocket.OPEN) { + return; + } + this.sendEvent({ + type: "input_audio_buffer.append", + audio: audio.toString("base64"), + }); + } + + close(): void { + this.closed = true; + this.connected = false; + if (this.ws) { + this.ws.close(1000, "Transcription session closed"); + this.ws = null; + } + } + + isConnected(): boolean { + return this.connected; + } + + private async doConnect(): Promise { + await new Promise((resolve, reject) => { + this.ws = new WebSocket("wss://api.openai.com/v1/realtime?intent=transcription", { + headers: { + Authorization: `Bearer ${this.config.apiKey}`, + "OpenAI-Beta": "realtime=v1", + }, 
+ }); + + const connectTimeout = setTimeout(() => { + reject(new Error("OpenAI realtime transcription connection timeout")); + }, OpenAIRealtimeTranscriptionSession.CONNECT_TIMEOUT_MS); + + this.ws.on("open", () => { + clearTimeout(connectTimeout); + this.connected = true; + this.reconnectAttempts = 0; + this.sendEvent({ + type: "transcription_session.update", + session: { + input_audio_format: "g711_ulaw", + input_audio_transcription: { + model: this.config.model, + }, + turn_detection: { + type: "server_vad", + threshold: this.config.vadThreshold, + prefix_padding_ms: 300, + silence_duration_ms: this.config.silenceDurationMs, + }, + }, + }); + resolve(); + }); + + this.ws.on("message", (data: Buffer) => { + try { + this.handleEvent(JSON.parse(data.toString()) as RealtimeEvent); + } catch (error) { + this.config.onError?.(error instanceof Error ? error : new Error(String(error))); + } + }); + + this.ws.on("error", (error) => { + if (!this.connected) { + clearTimeout(connectTimeout); + reject(error); + return; + } + this.config.onError?.(error instanceof Error ? error : new Error(String(error))); + }); + + this.ws.on("close", () => { + this.connected = false; + if (this.closed) { + return; + } + void this.attemptReconnect(); + }); + }); + } + + private async attemptReconnect(): Promise { + if (this.closed) { + return; + } + if (this.reconnectAttempts >= OpenAIRealtimeTranscriptionSession.MAX_RECONNECT_ATTEMPTS) { + this.config.onError?.(new Error("OpenAI realtime transcription reconnect limit reached")); + return; + } + this.reconnectAttempts += 1; + const delay = + OpenAIRealtimeTranscriptionSession.RECONNECT_DELAY_MS * 2 ** (this.reconnectAttempts - 1); + await new Promise((resolve) => setTimeout(resolve, delay)); + if (this.closed) { + return; + } + try { + await this.doConnect(); + } catch (error) { + this.config.onError?.(error instanceof Error ? 
error : new Error(String(error))); + await this.attemptReconnect(); + } + } + + private handleEvent(event: RealtimeEvent): void { + switch (event.type) { + case "conversation.item.input_audio_transcription.delta": + if (event.delta) { + this.pendingTranscript += event.delta; + this.config.onPartial?.(this.pendingTranscript); + } + return; + + case "conversation.item.input_audio_transcription.completed": + if (event.transcript) { + this.config.onTranscript?.(event.transcript); + } + this.pendingTranscript = ""; + return; + + case "input_audio_buffer.speech_started": + this.pendingTranscript = ""; + this.config.onSpeechStart?.(); + return; + + case "error": { + const detail = + event.error && typeof event.error === "object" && "message" in event.error + ? String((event.error as { message?: unknown }).message ?? "Unknown error") + : event.error + ? String(event.error) + : "Unknown error"; + this.config.onError?.(new Error(detail)); + return; + } + + default: + return; + } + } + + private sendEvent(event: unknown): void { + if (this.ws?.readyState === WebSocket.OPEN) { + this.ws.send(JSON.stringify(event)); + } + } +} + +export function buildOpenAIRealtimeTranscriptionProvider(): RealtimeTranscriptionProviderPlugin { + return { + id: "openai", + label: "OpenAI Realtime Transcription", + aliases: ["openai-realtime"], + autoSelectOrder: 10, + resolveConfig: ({ rawConfig }) => normalizeProviderConfig(rawConfig), + isConfigured: ({ providerConfig }) => + Boolean(readProviderConfig(providerConfig).apiKey || process.env.OPENAI_API_KEY), + createSession: (req) => { + const config = readProviderConfig(req.providerConfig); + const apiKey = config.apiKey || process.env.OPENAI_API_KEY; + if (!apiKey) { + throw new Error("OpenAI API key missing"); + } + return new OpenAIRealtimeTranscriptionSession({ + ...req, + apiKey, + model: config.model ?? "gpt-4o-transcribe", + silenceDurationMs: config.silenceDurationMs ?? 800, + vadThreshold: config.vadThreshold ?? 
import type {
  RealtimeVoiceBridge,
  RealtimeVoiceBridgeCreateRequest,
  RealtimeVoiceCloseReason,
  RealtimeVoiceProviderConfig,
  RealtimeVoiceProviderPlugin,
  RealtimeVoiceTool,
} from "openclaw/plugin-sdk/realtime-voice";
import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input";
import WebSocket from "ws";

/** Voice ids this provider knows how to put into a `session.update` payload. */
export type OpenAIRealtimeVoice =
  | "alloy"
  | "ash"
  | "ballad"
  | "cedar"
  | "coral"
  | "echo"
  | "marin"
  | "sage"
  | "shimmer"
  | "verse";

/** Provider-owned settings, normalized from the raw provider config blob. */
type OpenAIRealtimeVoiceProviderConfig = {
  apiKey?: string;
  model?: string;
  voice?: OpenAIRealtimeVoice;
  temperature?: number;
  vadThreshold?: number;
  silenceDurationMs?: number;
  prefixPaddingMs?: number;
  azureEndpoint?: string;
  azureDeployment?: string;
  azureApiVersion?: string;
};

/** Bridge request plus the normalized provider settings; the API key is mandatory here. */
type OpenAIRealtimeVoiceBridgeConfig = RealtimeVoiceBridgeCreateRequest &
  OpenAIRealtimeVoiceProviderConfig & { apiKey: string };

/** Subset of server event fields that `handleEvent` reads. */
type RealtimeEvent = {
  type: string;
  delta?: string;
  transcript?: string;
  item_id?: string;
  call_id?: string;
  name?: string;
  /** Complete argument string carried by `response.function_call_arguments.done`. */
  arguments?: string;
  error?: unknown;
};

type RealtimeSessionUpdate = {
  type: "session.update";
  session: {
    modalities: string[];
    instructions?: string;
    voice: OpenAIRealtimeVoice;
    input_audio_format: string;
    output_audio_format: string;
    turn_detection: {
      type: "server_vad";
      threshold: number;
      prefix_padding_ms: number;
      silence_duration_ms: number;
      create_response: boolean;
    };
    temperature: number;
    input_audio_transcription?: { model: string };
    tools?: RealtimeVoiceTool[];
    tool_choice?: string;
  };
};

/** Accumulator for one streamed function call, keyed by item id. */
type ToolCallBuffer = { name: string; callId: string; args: string };

/** Upper bound on audio chunks held while the socket is not open. */
const MAX_PENDING_AUDIO_CHUNKS = 320;

/** Returns the trimmed string, or `undefined` for non-strings and blank strings. */
function trimToUndefined(value: unknown): string | undefined {
  return typeof value === "string" && value.trim() ? value.trim() : undefined;
}

/** Returns the value when it is a finite number, otherwise `undefined`. */
function asNumber(value: unknown): number | undefined {
  return typeof value === "number" && Number.isFinite(value) ? value : undefined;
}

/** Returns the value when it is a non-null, non-array object, otherwise `undefined`. */
function asObject(value: unknown): Record<string, unknown> | undefined {
  return typeof value === "object" && value !== null && !Array.isArray(value)
    ? (value as Record<string, unknown>)
    : undefined;
}

/**
 * Reads the OpenAI settings out of a raw provider config.
 *
 * Lookup order: `config.providers.openai`, then `config.openai`, then `config` itself.
 * Fields of the wrong type are dropped rather than passed through.
 */
function normalizeProviderConfig(
  config: RealtimeVoiceProviderConfig,
): OpenAIRealtimeVoiceProviderConfig {
  const providers = asObject(config.providers);
  const raw = asObject(providers?.openai) ?? asObject(config.openai) ?? asObject(config);
  return {
    apiKey: normalizeResolvedSecretInputString({
      value: raw?.apiKey,
      path: "plugins.entries.voice-call.config.realtime.providers.openai.apiKey",
    }),
    model: trimToUndefined(raw?.model),
    // Only a non-empty string may reach `session.update`; the voice id itself is not
    // checked against the union so that newer voice ids keep working.
    voice: trimToUndefined(raw?.voice) as OpenAIRealtimeVoice | undefined,
    temperature: asNumber(raw?.temperature),
    vadThreshold: asNumber(raw?.vadThreshold),
    silenceDurationMs: asNumber(raw?.silenceDurationMs),
    prefixPaddingMs: asNumber(raw?.prefixPaddingMs),
    azureEndpoint: trimToUndefined(raw?.azureEndpoint),
    azureDeployment: trimToUndefined(raw?.azureDeployment),
    azureApiVersion: trimToUndefined(raw?.azureApiVersion),
  };
}

/** Alias of {@link normalizeProviderConfig} used where an already-resolved config is read. */
function readProviderConfig(
  providerConfig: RealtimeVoiceProviderConfig,
): OpenAIRealtimeVoiceProviderConfig {
  return normalizeProviderConfig(providerConfig);
}

function base64ToBuffer(b64: string): Buffer {
  return Buffer.from(b64, "base64");
}

/**
 * WebSocket bridge between a call's media stream and the OpenAI realtime endpoint.
 *
 * Audio is exchanged as base64 `g711_ulaw`. A session that drops after it was
 * established is retried with exponential backoff up to `MAX_RECONNECT_ATTEMPTS`.
 */
class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge {
  private static readonly DEFAULT_MODEL = "gpt-realtime";
  private static readonly MAX_RECONNECT_ATTEMPTS = 5;
  private static readonly BASE_RECONNECT_DELAY_MS = 1000;
  private static readonly CONNECT_TIMEOUT_MS = 10_000;

  private ws: WebSocket | null = null;
  private connected = false;
  private intentionallyClosed = false;
  private reconnectAttempts = 0;
  private pendingAudio: Buffer[] = [];
  private markQueue: string[] = [];
  private responseStartTimestamp: number | null = null;
  private latestMediaTimestamp = 0;
  private lastAssistantItemId: string | null = null;
  private toolCallBuffers = new Map<string, ToolCallBuffer>();

  constructor(private readonly config: OpenAIRealtimeVoiceBridgeConfig) {}

  /**
   * Opens the session. Rejects when the first connection attempt fails or times out;
   * in that case no background retry is started.
   */
  async connect(): Promise<void> {
    this.intentionallyClosed = false;
    this.reconnectAttempts = 0;
    await this.doConnect();
  }

  /** Forwards caller audio, or queues it (bounded) while the socket is not open. */
  sendAudio(audio: Buffer): void {
    if (!this.connected || this.ws?.readyState !== WebSocket.OPEN) {
      if (this.pendingAudio.length < MAX_PENDING_AUDIO_CHUNKS) {
        this.pendingAudio.push(audio);
      }
      return;
    }
    this.sendEvent({
      type: "input_audio_buffer.append",
      audio: audio.toString("base64"),
    });
  }

  /** Records the latest media-stream timestamp; used to compute barge-in truncation. */
  setMediaTimestamp(ts: number): void {
    this.latestMediaTimestamp = ts;
  }

  /** Adds a user text item to the conversation and requests a response. */
  sendUserMessage(text: string): void {
    this.sendEvent({
      type: "conversation.item.create",
      item: {
        type: "message",
        role: "user",
        content: [{ type: "input_text", text }],
      },
    });
    this.sendEvent({ type: "response.create" });
  }

  /** Requests an opening response; a no-op until the socket is connected. */
  triggerGreeting(instructions?: string): void {
    if (!this.connected || !this.ws) {
      return;
    }
    this.sendEvent({
      type: "response.create",
      response: {
        instructions: instructions ?? this.config.instructions,
      },
    });
  }

  /** Sends a tool result for `callId` and requests the follow-up response. */
  submitToolResult(callId: string, result: unknown): void {
    this.sendEvent({
      type: "conversation.item.create",
      item: {
        type: "function_call_output",
        call_id: callId,
        // JSON.stringify(undefined) is undefined, which would drop `output` entirely.
        output: JSON.stringify(result) ?? "null",
      },
    });
    this.sendEvent({ type: "response.create" });
  }

  /** Pops the oldest outstanding mark; clears response tracking once none remain. */
  acknowledgeMark(): void {
    if (this.markQueue.length === 0) {
      return;
    }
    this.markQueue.shift();
    if (this.markQueue.length === 0) {
      this.responseStartTimestamp = null;
      this.lastAssistantItemId = null;
    }
  }

  /** Closes the socket and disables reconnection. */
  close(): void {
    this.intentionallyClosed = true;
    this.connected = false;
    if (this.ws) {
      this.ws.close(1000, "Bridge closed");
      this.ws = null;
    }
  }

  isConnected(): boolean {
    return this.connected;
  }

  /**
   * Opens one socket and settles once it is open, has failed, or has timed out.
   *
   * `opened` and `settled` are tracked per socket so that exactly one path reacts to
   * a failure: before the socket opened the rejected promise is the only signal, and
   * after it opened the `close` handler is the only place that schedules a reconnect.
   */
  private async doConnect(): Promise<void> {
    await new Promise<void>((resolve, reject) => {
      const { url, headers } = this.resolveConnectionParams();
      const socket = new WebSocket(url, { headers });
      this.ws = socket;

      let opened = false;
      let settled = false;

      const fail = (error: Error): void => {
        if (settled) {
          return;
        }
        settled = true;
        clearTimeout(connectTimeout);
        reject(error);
      };

      const connectTimeout = setTimeout(() => {
        fail(new Error("OpenAI realtime connection timeout"));
        // Tear the half-open socket down so it cannot connect later or leak.
        socket.terminate();
      }, OpenAIRealtimeVoiceBridge.CONNECT_TIMEOUT_MS);

      socket.on("open", () => {
        if (settled) {
          // The attempt was already reported as failed; do not revive it.
          socket.terminate();
          return;
        }
        settled = true;
        opened = true;
        clearTimeout(connectTimeout);
        this.connected = true;
        this.reconnectAttempts = 0;
        this.sendSessionUpdate();
        for (const chunk of this.pendingAudio.splice(0)) {
          this.sendAudio(chunk);
        }
        this.config.onReady?.();
        resolve();
      });

      socket.on("message", (data: Buffer) => {
        try {
          this.handleEvent(JSON.parse(data.toString()) as RealtimeEvent);
        } catch (error) {
          console.error("[openai] realtime event parse failed:", error);
        }
      });

      socket.on("error", (error) => {
        const normalized = error instanceof Error ? error : new Error(String(error));
        if (!opened) {
          fail(normalized);
        }
        this.config.onError?.(normalized);
      });

      socket.on("close", () => {
        clearTimeout(connectTimeout);
        this.connected = false;
        if (this.intentionallyClosed) {
          this.config.onClose?.("completed");
          return;
        }
        if (!opened) {
          // Never established: whoever awaits doConnect() decides about retrying.
          // Scheduling a reconnect here as well would start a second retry loop.
          fail(new Error("OpenAI realtime connection closed before it was established"));
          return;
        }
        void this.attemptReconnect();
      });
    });
  }

  /**
   * Builds the WebSocket URL and auth headers.
   *
   * - `azureEndpoint` + `azureDeployment`: Azure deployment URL with an `api-key` header.
   * - `azureEndpoint` only: `<endpoint>/v1/realtime` with a bearer token.
   * - otherwise: `wss://api.openai.com/v1/realtime` with a bearer token and beta header.
   */
  private resolveConnectionParams(): { url: string; headers: Record<string, string> } {
    const cfg = this.config;
    const model = encodeURIComponent(cfg.model ?? OpenAIRealtimeVoiceBridge.DEFAULT_MODEL);

    if (cfg.azureEndpoint) {
      // Drop one trailing slash and switch http(s) to ws(s).
      const base = cfg.azureEndpoint
        .replace(/\/$/, "")
        .replace(/^http(s?):/, (_, secure: string) => `ws${secure}:`);
      if (cfg.azureDeployment) {
        const apiVersion = encodeURIComponent(cfg.azureApiVersion ?? "2024-10-01-preview");
        const deployment = encodeURIComponent(cfg.azureDeployment);
        return {
          url: `${base}/openai/realtime?api-version=${apiVersion}&deployment=${deployment}`,
          headers: { "api-key": cfg.apiKey },
        };
      }
      return {
        url: `${base}/v1/realtime?model=${model}`,
        headers: { Authorization: `Bearer ${cfg.apiKey}` },
      };
    }

    return {
      url: `wss://api.openai.com/v1/realtime?model=${model}`,
      headers: {
        Authorization: `Bearer ${cfg.apiKey}`,
        "OpenAI-Beta": "realtime=v1",
      },
    };
  }

  /**
   * Retries the connection with exponential backoff (1s, 2s, 4s, ...).
   * Reports `onClose("error")` once the attempt budget is exhausted.
   */
  private async attemptReconnect(): Promise<void> {
    while (!this.intentionallyClosed) {
      if (this.reconnectAttempts >= OpenAIRealtimeVoiceBridge.MAX_RECONNECT_ATTEMPTS) {
        this.config.onClose?.("error");
        return;
      }
      this.reconnectAttempts += 1;
      const delay =
        OpenAIRealtimeVoiceBridge.BASE_RECONNECT_DELAY_MS * 2 ** (this.reconnectAttempts - 1);
      await new Promise<void>((resolve) => setTimeout(resolve, delay));
      if (this.intentionallyClosed) {
        return;
      }
      try {
        await this.doConnect();
        return;
      } catch (error) {
        this.config.onError?.(error instanceof Error ? error : new Error(String(error)));
      }
    }
  }

  /** Sends the session configuration; called each time a socket opens. */
  private sendSessionUpdate(): void {
    const cfg = this.config;
    const sessionUpdate: RealtimeSessionUpdate = {
      type: "session.update",
      session: {
        modalities: ["text", "audio"],
        instructions: cfg.instructions,
        voice: cfg.voice ?? "alloy",
        input_audio_format: "g711_ulaw",
        output_audio_format: "g711_ulaw",
        input_audio_transcription: {
          model: "whisper-1",
        },
        turn_detection: {
          type: "server_vad",
          threshold: cfg.vadThreshold ?? 0.5,
          prefix_padding_ms: cfg.prefixPaddingMs ?? 300,
          silence_duration_ms: cfg.silenceDurationMs ?? 500,
          create_response: true,
        },
        temperature: cfg.temperature ?? 0.8,
        ...(cfg.tools && cfg.tools.length > 0
          ? {
              tools: cfg.tools,
              tool_choice: "auto",
            }
          : {}),
      },
    };
    this.sendEvent(sessionUpdate);
  }

  /** Dispatches one parsed server event to the matching config callback. */
  private handleEvent(event: RealtimeEvent): void {
    switch (event.type) {
      case "response.audio.delta": {
        if (!event.delta) {
          return;
        }
        const audio = base64ToBuffer(event.delta);
        this.config.onAudio(audio);
        if (this.responseStartTimestamp === null) {
          this.responseStartTimestamp = this.latestMediaTimestamp;
        }
        if (event.item_id) {
          this.lastAssistantItemId = event.item_id;
        }
        this.sendMark();
        return;
      }

      case "input_audio_buffer.speech_started":
        this.handleBargeIn();
        return;

      case "response.audio_transcript.delta":
        if (event.delta) {
          this.config.onTranscript?.("assistant", event.delta, false);
        }
        return;

      case "response.audio_transcript.done":
        if (event.transcript) {
          this.config.onTranscript?.("assistant", event.transcript, true);
        }
        return;

      case "conversation.item.input_audio_transcription.completed":
        if (event.transcript) {
          this.config.onTranscript?.("user", event.transcript, true);
        }
        return;

      case "conversation.item.input_audio_transcription.delta":
        if (event.delta) {
          this.config.onTranscript?.("user", event.delta, false);
        }
        return;

      case "response.function_call_arguments.delta": {
        const key = event.item_id ?? "unknown";
        const existing = this.toolCallBuffers.get(key);
        if (existing) {
          // An empty delta must not replace what has been accumulated so far.
          if (event.delta) {
            existing.args += event.delta;
          }
        } else if (event.item_id) {
          this.toolCallBuffers.set(event.item_id, {
            name: event.name ?? "",
            callId: event.call_id ?? "",
            args: event.delta ?? "",
          });
        }
        return;
      }

      case "response.function_call_arguments.done": {
        const key = event.item_id ?? "unknown";
        const buffered = this.toolCallBuffers.get(key);
        // Release the buffer first so a throwing callback cannot leave it behind.
        this.toolCallBuffers.delete(key);
        if (this.config.onToolCall) {
          const rawArgs = buffered?.args || event.arguments || "{}";
          let args: unknown = {};
          try {
            args = JSON.parse(rawArgs);
          } catch {
            // Best effort: malformed arguments are delivered as an empty object.
          }
          this.config.onToolCall({
            itemId: key,
            callId: buffered?.callId || event.call_id || "",
            name: buffered?.name || event.name || "",
            args,
          });
        }
        return;
      }

      case "error": {
        const detail =
          event.error && typeof event.error === "object" && "message" in event.error
            ? String((event.error as { message?: unknown }).message ?? "Unknown error")
            : event.error
              ? String(event.error)
              : "Unknown error";
        this.config.onError?.(new Error(detail));
        return;
      }

      default:
        return;
    }
  }

  /**
   * Reacts to the caller starting to speak: always clears queued playback, and when a
   * response is still playing (marks outstanding) also asks the server to truncate the
   * assistant item at the elapsed playback time.
   */
  private handleBargeIn(): void {
    if (this.markQueue.length > 0 && this.responseStartTimestamp !== null) {
      const elapsedMs = this.latestMediaTimestamp - this.responseStartTimestamp;
      if (this.lastAssistantItemId) {
        this.sendEvent({
          type: "conversation.item.truncate",
          item_id: this.lastAssistantItemId,
          content_index: 0,
          audio_end_ms: Math.max(0, elapsedMs),
        });
      }
      this.config.onClearAudio();
      this.markQueue = [];
      this.lastAssistantItemId = null;
      this.responseStartTimestamp = null;
      return;
    }
    this.config.onClearAudio();
  }

  /** Queues a playback mark and hands its name to `onMark`. */
  private sendMark(): void {
    const markName = `audio-${Date.now()}`;
    this.markQueue.push(markName);
    this.config.onMark?.(markName);
  }

  /** Serializes and sends an event; silently dropped when the socket is not open. */
  private sendEvent(event: unknown): void {
    if (this.ws?.readyState === WebSocket.OPEN) {
      this.ws.send(JSON.stringify(event));
    }
  }
}

/**
 * Builds the OpenAI realtime voice provider plugin.
 *
 * The API key comes from the provider config, falling back to `OPENAI_API_KEY`.
 * `createBridge` throws when neither is set.
 */
export function buildOpenAIRealtimeVoiceProvider(): RealtimeVoiceProviderPlugin {
  return {
    id: "openai",
    label: "OpenAI Realtime Voice",
    autoSelectOrder: 10,
    resolveConfig: ({ rawConfig }) => normalizeProviderConfig(rawConfig),
    isConfigured: ({ providerConfig }) =>
      Boolean(readProviderConfig(providerConfig).apiKey || process.env.OPENAI_API_KEY),
    createBridge: (req) => {
      const config = readProviderConfig(req.providerConfig);
      const apiKey = config.apiKey || process.env.OPENAI_API_KEY;
      if (!apiKey) {
        throw new Error("OpenAI API key missing");
      }
      return new OpenAIRealtimeVoiceBridge({
        ...req,
        apiKey,
        model: config.model,
        voice: config.voice,
        temperature: config.temperature,
        vadThreshold: config.vadThreshold,
        silenceDurationMs: config.silenceDurationMs,
        prefixPaddingMs: config.prefixPaddingMs,
        azureEndpoint: config.azureEndpoint,
        azureDeployment: config.azureDeployment,
        azureApiVersion: config.azureApiVersion,
      });
    },
  };
}

export type { OpenAIRealtimeVoiceProviderConfig };
diff --git a/extensions/voice-call/index.ts b/extensions/voice-call/index.ts index f3386c3c042..fc667698586 100644 --- a/extensions/voice-call/index.ts +++ b/extensions/voice-call/index.ts @@ -72,13 +72,25 @@ const voiceCallConfigSchema = { advanced: true, }, "streaming.enabled": { label: "Enable Streaming", advanced: true }, - "streaming.openaiApiKey": { + "streaming.provider": { label: "Streaming Provider", advanced: true }, + "streaming.providers.openai.apiKey": { label: "OpenAI Realtime API Key", sensitive: true, advanced: true, }, - "streaming.sttModel": { label: "Realtime STT Model", advanced: true }, + "streaming.providers.openai.model": { label: "Realtime STT Model", advanced: true }, "streaming.streamPath": { label: "Media Stream Path", advanced: true }, + "realtime.enabled": { label: "Enable Realtime Voice", advanced: true }, + "realtime.provider": { label: "Realtime Voice Provider", advanced: true }, + "realtime.streamPath": { label: "Realtime Stream Path", advanced: true }, + "realtime.instructions": { label: "Realtime Instructions", advanced: true }, + "realtime.providers.openai.apiKey": { + label: "OpenAI Realtime API Key", + sensitive: true, + advanced: true, + }, + "realtime.providers.openai.model": { label: "OpenAI Realtime Model", advanced: true }, + "realtime.providers.openai.voice": { label: "OpenAI Realtime Voice", advanced: true }, "tts.provider": { label: "TTS Provider Override", help: "Deep-merges with messages.tts (Microsoft is ignored for calls).", @@ -181,6 +193,7 @@ export default definePluginEntry({ runtimePromise = createVoiceCallRuntime({ config, coreConfig: api.config as CoreConfig, + fullConfig: api.config, agentRuntime: api.runtime.agent, ttsRuntime: api.runtime.tts, logger: api.logger, diff --git a/extensions/voice-call/openclaw.plugin.json b/extensions/voice-call/openclaw.plugin.json index 0063979b2dc..f0700789bb3 100644 --- a/extensions/voice-call/openclaw.plugin.json +++ b/extensions/voice-call/openclaw.plugin.json @@ -86,12 
+86,16 @@ "label": "Enable Streaming", "advanced": true }, - "streaming.openaiApiKey": { + "streaming.provider": { + "label": "Streaming Provider", + "advanced": true + }, + "streaming.providers.openai.apiKey": { "label": "OpenAI Realtime API Key", "sensitive": true, "advanced": true }, - "streaming.sttModel": { + "streaming.providers.openai.model": { "label": "Realtime STT Model", "advanced": true }, @@ -345,9 +349,11 @@ "enabled": { "type": "boolean" }, + "provider": { + "type": "string" + }, "sttProvider": { - "type": "string", - "enum": ["openai-realtime"] + "type": "string" }, "openaiApiKey": { "type": "string" @@ -367,6 +373,13 @@ "streamPath": { "type": "string" }, + "providers": { + "type": "object", + "additionalProperties": { + "type": "object", + "additionalProperties": true + } + }, "preStartTimeoutMs": { "type": "integer", "minimum": 1 @@ -385,6 +398,72 @@ } } }, + "realtime": { + "type": "object", + "additionalProperties": false, + "properties": { + "enabled": { + "type": "boolean" + }, + "provider": { + "type": "string" + }, + "streamPath": { + "type": "string" + }, + "instructions": { + "type": "string" + }, + "tools": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": false, + "properties": { + "type": { + "type": "string", + "enum": ["function"] + }, + "name": { + "type": "string" + }, + "description": { + "type": "string" + }, + "parameters": { + "type": "object", + "additionalProperties": false, + "properties": { + "type": { + "type": "string", + "enum": ["object"] + }, + "properties": { + "type": "object", + "additionalProperties": true + }, + "required": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "required": ["type", "properties"] + } + }, + "required": ["type", "name", "description", "parameters"] + } + }, + "providers": { + "type": "object", + "additionalProperties": { + "type": "object", + "additionalProperties": true + } + } + } + }, "publicUrl": { "type": "string" }, diff --git 
a/extensions/voice-call/src/config.test.ts b/extensions/voice-call/src/config.test.ts index 19db6eb691b..ec268b1c3ca 100644 --- a/extensions/voice-call/src/config.test.ts +++ b/extensions/voice-call/src/config.test.ts @@ -179,6 +179,35 @@ describe("validateProviderConfig", () => { expect(result.errors).toEqual([]); }); }); + + describe("realtime config", () => { + it("rejects disabled inbound policy for realtime mode", () => { + const config = createBaseConfig("twilio"); + config.realtime.enabled = true; + config.inboundPolicy = "disabled"; + + const result = validateProviderConfig(config); + + expect(result.valid).toBe(false); + expect(result.errors).toContain( + 'plugins.entries.voice-call.config.inboundPolicy must not be "disabled" when realtime.enabled is true', + ); + }); + + it("rejects enabling realtime and streaming together", () => { + const config = createBaseConfig("twilio"); + config.realtime.enabled = true; + config.streaming.enabled = true; + config.inboundPolicy = "allowlist"; + + const result = validateProviderConfig(config); + + expect(result.valid).toBe(false); + expect(result.errors).toContain( + "plugins.entries.voice-call.config.realtime.enabled and plugins.entries.voice-call.config.streaming.enabled cannot both be true", + ); + }); + }); }); describe("normalizeVoiceCallConfig", () => { @@ -194,11 +223,25 @@ describe("normalizeVoiceCallConfig", () => { expect(normalized.serve.path).toBe("/voice/webhook"); expect(normalized.streaming.streamPath).toBe("/custom-stream"); - expect(normalized.streaming.sttModel).toBe("gpt-4o-transcribe"); + expect(normalized.streaming.provider).toBe("openai"); + expect(normalized.streaming.providers.openai).toEqual({}); + expect(normalized.realtime.streamPath).toBe("/voice/stream/realtime"); expect(normalized.tunnel.provider).toBe("none"); expect(normalized.webhookSecurity.allowedHosts).toEqual([]); }); + it("derives the realtime stream path from a custom webhook path", () => { + const normalized = 
// -----------------------------------------------------------------------------
// Realtime Voice Configuration
// -----------------------------------------------------------------------------

/** One function tool definition exposed to the realtime provider. */
export const RealtimeToolSchema = z
  .object({
    type: z.literal("function"),
    name: z.string().min(1),
    description: z.string(),
    parameters: z.object({
      type: z.literal("object"),
      properties: z.record(z.string(), z.unknown()),
      required: z.array(z.string()).optional(),
    }),
  })
  .strict();
export type RealtimeToolConfig = z.infer<typeof RealtimeToolSchema>;

/** Raw realtime-voice provider config blobs keyed by provider id; defaults to `{}`. */
export const VoiceCallRealtimeProvidersConfigSchema = z
  .record(z.string(), z.record(z.string(), z.unknown()))
  .default({});
export type VoiceCallRealtimeProvidersConfig = z.infer<
  typeof VoiceCallRealtimeProvidersConfigSchema
>;

/** Raw realtime-transcription provider config blobs keyed by provider id; defaults to `{}`. */
export const VoiceCallStreamingProvidersConfigSchema = z
  .record(z.string(), z.record(z.string(), z.unknown()))
  .default({});
export type VoiceCallStreamingProvidersConfig = z.infer<
  typeof VoiceCallStreamingProvidersConfigSchema
>;

/** Realtime voice-to-voice settings; unknown keys are rejected (`.strict()`). */
export const VoiceCallRealtimeConfigSchema = z
  .object({
    /** Enable realtime voice-to-voice mode. */
    enabled: z.boolean().default(false),
    /** Provider id from registered realtime voice providers. */
    provider: z.string().min(1).optional(),
    /** Optional override for the local WebSocket route path. */
    streamPath: z.string().min(1).optional(),
    /** System instructions passed to the realtime provider. */
    instructions: z.string().optional(),
    /** Tool definitions exposed to the realtime provider. */
    tools: z.array(RealtimeToolSchema).default([]),
    /** Provider-owned raw config blobs keyed by provider id. */
    providers: VoiceCallRealtimeProvidersConfigSchema,
  })
  .strict()
  .default({ enabled: false, tools: [], providers: {} });
export type VoiceCallRealtimeConfig = z.infer<typeof VoiceCallRealtimeConfigSchema>;
*/ openaiApiKey: z.string().min(1).optional(), - /** OpenAI transcription model (default: gpt-4o-transcribe) */ - sttModel: z.string().min(1).default("gpt-4o-transcribe"), - /** VAD silence duration in ms before considering speech ended */ - silenceDurationMs: z.number().int().positive().default(800), - /** VAD threshold 0-1 (higher = less sensitive) */ - vadThreshold: z.number().min(0).max(1).default(0.5), + /** @deprecated Legacy OpenAI-specific transcription model field. */ + sttModel: z.string().min(1).optional(), + /** @deprecated Legacy OpenAI-specific VAD silence duration. */ + silenceDurationMs: z.number().int().positive().optional(), + /** @deprecated Legacy OpenAI-specific VAD threshold. */ + vadThreshold: z.number().min(0).max(1).optional(), /** WebSocket path for media stream connections */ streamPath: z.string().min(1).default("/voice/stream"), + /** Provider-owned raw config blobs keyed by provider id. */ + providers: VoiceCallStreamingProvidersConfigSchema, /** * Close unauthenticated media stream sockets if no valid `start` frame arrives in time. * Protects against pre-auth idle connection hold attacks. 
/**
 * Normalizes a webhook-style path: trims whitespace, guarantees a leading `/`, and
 * removes every trailing `/` (the root path stays `/`).
 *
 * @param pathname Raw path as configured.
 * @returns `/` for blank input, otherwise the path without trailing slashes.
 */
function normalizeWebhookLikePath(pathname: string): string {
  const trimmed = pathname.trim();
  if (!trimmed) {
    return "/";
  }
  const prefixed = trimmed.startsWith("/") ? trimmed : `/${trimmed}`;
  // Strip all trailing slashes, not just one: "/voice/webhook//" must still be
  // recognised as ending in "/webhook" by the caller. Index 0 is the leading slash.
  let end = prefixed.length;
  while (end > 1 && prefixed[end - 1] === "/") {
    end -= 1;
  }
  return prefixed.slice(0, end);
}

/**
 * Derives the default realtime stream path from the webhook serve path.
 *
 * - A path ending in `/webhook` has that segment replaced by `/stream/realtime`.
 * - The root path maps to `/voice/stream/realtime`.
 * - Any other path gets `/stream/realtime` appended.
 *
 * @param servePath Configured webhook path.
 * @returns The realtime WebSocket route path.
 */
function defaultRealtimeStreamPathForServePath(servePath: string): string {
  const normalized = normalizeWebhookLikePath(servePath);
  if (normalized.endsWith("/webhook")) {
    return `${normalized.slice(0, -"/webhook".length)}/stream/realtime`;
  }
  if (normalized === "/") {
    return "/voice/stream/realtime";
  }
  return `${normalized}/stream/realtime`;
}
{})); } +function sanitizeVoiceCallProviderConfigs( + value: Record | undefined> | undefined, +): Record> { + if (!value) { + return {}; + } + return Object.fromEntries( + Object.entries(value).filter( + (entry): entry is [string, Record] => entry[1] !== undefined, + ), + ); +} + export function normalizeVoiceCallConfig(config: VoiceCallConfigInput): VoiceCallConfig { const defaults = cloneDefaultVoiceCallConfig(); + const serve = { ...defaults.serve, ...config.serve }; + const streamingProvider = + config.streaming?.provider ?? + (typeof config.streaming?.sttProvider === "string" + ? config.streaming.sttProvider + : undefined) ?? + defaults.streaming.provider; + const streamingProviders = sanitizeVoiceCallProviderConfigs( + config.streaming?.providers ?? defaults.streaming.providers, + ); + if ( + typeof streamingProvider === "string" && + streamingProvider.trim() && + !(streamingProvider in streamingProviders) + ) { + streamingProviders[streamingProvider] = {}; + } + const realtimeProvider = config.realtime?.provider ?? defaults.realtime.provider; + const realtimeProviders = sanitizeVoiceCallProviderConfigs( + config.realtime?.providers ?? defaults.realtime.providers, + ); + if ( + typeof realtimeProvider === "string" && + realtimeProvider.trim() && + !(realtimeProvider in realtimeProviders) + ) { + realtimeProviders[realtimeProvider] = {}; + } return { ...defaults, ...config, allowFrom: config.allowFrom ?? defaults.allowFrom, outbound: { ...defaults.outbound, ...config.outbound }, - serve: { ...defaults.serve, ...config.serve }, + serve, tailscale: { ...defaults.tailscale, ...config.tailscale }, tunnel: { ...defaults.tunnel, ...config.tunnel }, webhookSecurity: { @@ -392,7 +512,23 @@ export function normalizeVoiceCallConfig(config: VoiceCallConfigInput): VoiceCal trustedProxyIPs: config.webhookSecurity?.trustedProxyIPs ?? 
defaults.webhookSecurity.trustedProxyIPs, }, - streaming: { ...defaults.streaming, ...config.streaming }, + streaming: { + ...defaults.streaming, + ...config.streaming, + provider: streamingProvider, + providers: streamingProviders, + }, + realtime: { + ...defaults.realtime, + ...config.realtime, + provider: realtimeProvider, + streamPath: + config.realtime?.streamPath ?? + defaultRealtimeStreamPathForServePath(serve.path ?? defaults.serve.path), + tools: + (config.realtime?.tools as RealtimeToolConfig[] | undefined) ?? defaults.realtime.tools, + providers: realtimeProviders, + }, stt: { ...defaults.stt, ...config.stt }, tts: normalizeVoiceCallTtsConfig(defaults.tts, config.tts), }; @@ -448,6 +584,133 @@ export function resolveVoiceCallConfig(config: VoiceCallConfigInput): VoiceCallC resolved.webhookSecurity.trustForwardingHeaders ?? false; resolved.webhookSecurity.trustedProxyIPs = resolved.webhookSecurity.trustedProxyIPs ?? []; + resolved.streaming = { + ...resolved.streaming, + providers: { ...(resolved.streaming.providers ?? {}) }, + }; + const legacyStreamingRaw = resolved.streaming as Record; + const openaiStreamingRaw = + resolved.streaming.providers.openai && typeof resolved.streaming.providers.openai === "object" + ? 
{ ...(resolved.streaming.providers.openai as Record) } + : {}; + if ( + typeof openaiStreamingRaw.apiKey !== "string" && + typeof legacyStreamingRaw.openaiApiKey === "string" + ) { + openaiStreamingRaw.apiKey = legacyStreamingRaw.openaiApiKey; + } + if ( + typeof openaiStreamingRaw.model !== "string" && + typeof legacyStreamingRaw.sttModel === "string" + ) { + openaiStreamingRaw.model = legacyStreamingRaw.sttModel; + } + if ( + openaiStreamingRaw.silenceDurationMs == null && + typeof legacyStreamingRaw.silenceDurationMs === "number" + ) { + openaiStreamingRaw.silenceDurationMs = legacyStreamingRaw.silenceDurationMs; + } + if ( + openaiStreamingRaw.vadThreshold == null && + typeof legacyStreamingRaw.vadThreshold === "number" + ) { + openaiStreamingRaw.vadThreshold = legacyStreamingRaw.vadThreshold; + } + if (typeof openaiStreamingRaw.apiKey !== "string" || !openaiStreamingRaw.apiKey.trim()) { + if (process.env.OPENAI_API_KEY) { + openaiStreamingRaw.apiKey = process.env.OPENAI_API_KEY; + } + } + if ( + typeof openaiStreamingRaw.model !== "string" && + typeof process.env.REALTIME_TRANSCRIPTION_MODEL === "string" + ) { + openaiStreamingRaw.model = process.env.REALTIME_TRANSCRIPTION_MODEL; + } + if ( + typeof openaiStreamingRaw.model !== "string" && + typeof process.env.STREAMING_STT_MODEL === "string" + ) { + openaiStreamingRaw.model = process.env.STREAMING_STT_MODEL; + } + if (openaiStreamingRaw.vadThreshold == null && typeof process.env.VAD_THRESHOLD === "string") { + openaiStreamingRaw.vadThreshold = Number.parseFloat(process.env.VAD_THRESHOLD); + } + if ( + openaiStreamingRaw.silenceDurationMs == null && + typeof process.env.SILENCE_DURATION_MS === "string" + ) { + openaiStreamingRaw.silenceDurationMs = Number.parseInt(process.env.SILENCE_DURATION_MS, 10); + } + if (Object.keys(openaiStreamingRaw).length > 0) { + resolved.streaming.providers.openai = openaiStreamingRaw; + } + if ( + typeof resolved.streaming.provider === "string" && + 
resolved.streaming.provider.trim() && + !(resolved.streaming.provider in resolved.streaming.providers) + ) { + resolved.streaming.providers[resolved.streaming.provider] = {}; + } + + resolved.realtime = { + ...resolved.realtime, + providers: { ...(resolved.realtime.providers ?? {}) }, + }; + const openaiRealtimeRaw = + resolved.realtime.providers.openai && typeof resolved.realtime.providers.openai === "object" + ? { ...(resolved.realtime.providers.openai as Record) } + : {}; + if (typeof openaiRealtimeRaw.apiKey !== "string" || !openaiRealtimeRaw.apiKey.trim()) { + if (process.env.OPENAI_API_KEY) { + openaiRealtimeRaw.apiKey = process.env.OPENAI_API_KEY; + } + } + if ( + typeof openaiRealtimeRaw.model !== "string" && + typeof process.env.REALTIME_VOICE_MODEL === "string" + ) { + openaiRealtimeRaw.model = process.env.REALTIME_VOICE_MODEL; + } + if ( + typeof openaiRealtimeRaw.voice !== "string" && + typeof process.env.REALTIME_VOICE_VOICE === "string" + ) { + openaiRealtimeRaw.voice = process.env.REALTIME_VOICE_VOICE; + } + if ( + typeof resolved.realtime.instructions !== "string" && + typeof process.env.REALTIME_VOICE_INSTRUCTIONS === "string" + ) { + resolved.realtime.instructions = process.env.REALTIME_VOICE_INSTRUCTIONS; + } + if ( + openaiRealtimeRaw.temperature == null && + typeof process.env.REALTIME_VOICE_TEMPERATURE === "string" + ) { + openaiRealtimeRaw.temperature = Number.parseFloat(process.env.REALTIME_VOICE_TEMPERATURE); + } + if (openaiRealtimeRaw.vadThreshold == null && typeof process.env.VAD_THRESHOLD === "string") { + openaiRealtimeRaw.vadThreshold = Number.parseFloat(process.env.VAD_THRESHOLD); + } + if ( + openaiRealtimeRaw.silenceDurationMs == null && + typeof process.env.SILENCE_DURATION_MS === "string" + ) { + openaiRealtimeRaw.silenceDurationMs = Number.parseInt(process.env.SILENCE_DURATION_MS, 10); + } + if (Object.keys(openaiRealtimeRaw).length > 0) { + resolved.realtime.providers.openai = openaiRealtimeRaw; + } + if ( + typeof 
resolved.realtime.provider === "string" && + resolved.realtime.provider.trim() && + !(resolved.realtime.provider in resolved.realtime.providers) + ) { + resolved.realtime.providers[resolved.realtime.provider] = {}; + } + return normalizeVoiceCallConfig(resolved); } @@ -516,5 +779,23 @@ export function validateProviderConfig(config: VoiceCallConfig): { } } + if (config.realtime.enabled && config.inboundPolicy === "disabled") { + errors.push( + 'plugins.entries.voice-call.config.inboundPolicy must not be "disabled" when realtime.enabled is true', + ); + } + + if (config.realtime.enabled && config.streaming.enabled) { + errors.push( + "plugins.entries.voice-call.config.realtime.enabled and plugins.entries.voice-call.config.streaming.enabled cannot both be true", + ); + } + + if (config.realtime.enabled && config.provider && config.provider !== "twilio") { + errors.push( + 'plugins.entries.voice-call.config.provider must be "twilio" when realtime.enabled is true', + ); + } + return { valid: errors.length === 0, errors }; } diff --git a/extensions/voice-call/src/manager/outbound.test.ts b/extensions/voice-call/src/manager/outbound.test.ts index 70e5117d8f3..33d64fff565 100644 --- a/extensions/voice-call/src/manager/outbound.test.ts +++ b/extensions/voice-call/src/manager/outbound.test.ts @@ -125,7 +125,7 @@ describe("voice-call outbound helpers", () => { maxConcurrentCalls: 3, outbound: { defaultMode: "conversation" }, fromNumber: "+14155550100", - tts: { providers: { openai: { voice: "nova" } } }, + tts: { provider: "openai", providers: { openai: { voice: "nova" } } }, }, storePath: "/tmp/voice-call.json", webhookUrl: "https://example.com/webhook", @@ -187,7 +187,7 @@ describe("voice-call outbound helpers", () => { activeCalls: new Map([["call-1", call]]), providerCallIdMap: new Map(), provider: { name: "twilio", playTts }, - config: { tts: { providers: { openai: { voice: "alloy" } } } }, + config: { tts: { provider: "openai", providers: { openai: { voice: "alloy" } } 
} }, storePath: "/tmp/voice-call.json", }; diff --git a/extensions/voice-call/src/manager/outbound.ts b/extensions/voice-call/src/manager/outbound.ts index c1f82b0e569..a90e22701a3 100644 --- a/extensions/voice-call/src/manager/outbound.ts +++ b/extensions/voice-call/src/manager/outbound.ts @@ -100,11 +100,22 @@ function requireConnectedCall(ctx: ConnectedCallContext, callId: CallId): Connec }; } -function resolveOpenAITtsVoice(config: SpeakContext["config"]): string | undefined { - const providerConfig = config.tts?.providers?.openai; - return providerConfig && typeof providerConfig === "object" - ? (providerConfig.voice as string | undefined) - : undefined; +function resolvePreferredTtsVoice(config: SpeakContext["config"]): string | undefined { + const providerId = config.tts?.provider; + if (!providerId) { + return undefined; + } + const providerConfig = config.tts?.providers?.[providerId]; + if (!providerConfig || typeof providerConfig !== "object") { + return undefined; + } + if (typeof providerConfig.voice === "string" && providerConfig.voice.trim()) { + return providerConfig.voice; + } + if (typeof providerConfig.voiceId === "string" && providerConfig.voiceId.trim()) { + return providerConfig.voiceId; + } + return undefined; } export async function initiateCall( @@ -164,7 +175,7 @@ export async function initiateCall( // For notify mode with a message, use inline TwiML with . let inlineTwiml: string | undefined; if (mode === "notify" && initialMessage) { - const pollyVoice = mapVoiceToPolly(resolveOpenAITtsVoice(ctx.config)); + const pollyVoice = mapVoiceToPolly(resolvePreferredTtsVoice(ctx.config)); inlineTwiml = generateNotifyTwiml(initialMessage, pollyVoice); console.log(`[voice-call] Using inline TwiML for notify mode (voice: ${pollyVoice})`); } @@ -212,7 +223,7 @@ export async function speak( transitionState(call, "speaking"); persistCallRecord(ctx.storePath, call); - const voice = provider.name === "twilio" ? 
resolveOpenAITtsVoice(ctx.config) : undefined; + const voice = provider.name === "twilio" ? resolvePreferredTtsVoice(ctx.config) : undefined; await provider.playTts({ callId, providerCallId, diff --git a/extensions/voice-call/src/media-stream.test.ts b/extensions/voice-call/src/media-stream.test.ts index 8f6e16bc5e8..66e2f74c4e2 100644 --- a/extensions/voice-call/src/media-stream.test.ts +++ b/extensions/voice-call/src/media-stream.test.ts @@ -1,28 +1,27 @@ import { once } from "node:events"; import http from "node:http"; +import type { + RealtimeTranscriptionProviderPlugin, + RealtimeTranscriptionSession, +} from "openclaw/plugin-sdk/realtime-transcription"; import { describe, expect, it, vi } from "vitest"; import { WebSocket } from "ws"; import { MediaStreamHandler, sanitizeLogText } from "./media-stream.js"; -import type { - OpenAIRealtimeSTTProvider, - RealtimeSTTSession, -} from "./providers/stt-openai-realtime.js"; -const createStubSession = (): RealtimeSTTSession => ({ +const createStubSession = (): RealtimeTranscriptionSession => ({ connect: async () => {}, sendAudio: () => {}, - waitForTranscript: async () => "", - onPartial: () => {}, - onTranscript: () => {}, - onSpeechStart: () => {}, close: () => {}, isConnected: () => true, }); -const createStubSttProvider = (): OpenAIRealtimeSTTProvider => +const createStubSttProvider = (): RealtimeTranscriptionProviderPlugin => ({ createSession: () => createStubSession(), - }) as unknown as OpenAIRealtimeSTTProvider; + id: "openai", + label: "OpenAI", + isConfigured: () => true, + }) as unknown as RealtimeTranscriptionProviderPlugin; const flush = async (): Promise => { await new Promise((resolve) => setTimeout(resolve, 0)); @@ -104,7 +103,8 @@ const waitForClose = async ( describe("MediaStreamHandler TTS queue", () => { it("serializes TTS playback and resolves in order", async () => { const handler = new MediaStreamHandler({ - sttProvider: createStubSttProvider(), + transcriptionProvider: createStubSttProvider(), 
+ providerConfig: {}, }); const started: number[] = []; const finished: number[] = []; @@ -137,7 +137,8 @@ describe("MediaStreamHandler TTS queue", () => { it("cancels active playback and clears queued items", async () => { const handler = new MediaStreamHandler({ - sttProvider: createStubSttProvider(), + transcriptionProvider: createStubSttProvider(), + providerConfig: {}, }); let queuedRan = false; @@ -165,7 +166,8 @@ describe("MediaStreamHandler TTS queue", () => { describe("MediaStreamHandler security hardening", () => { it("fails sends and closes stream when buffered bytes already exceed the cap", () => { const handler = new MediaStreamHandler({ - sttProvider: createStubSttProvider(), + transcriptionProvider: createStubSttProvider(), + providerConfig: {}, }); const ws = { readyState: WebSocket.OPEN, @@ -177,7 +179,12 @@ describe("MediaStreamHandler security hardening", () => { handler as unknown as { sessions: Map< string, - { callId: string; streamSid: string; ws: WebSocket; sttSession: RealtimeSTTSession } + { + callId: string; + streamSid: string; + ws: WebSocket; + sttSession: RealtimeTranscriptionSession; + } >; } ).sessions.set("MZ-backpressure", { @@ -196,7 +203,8 @@ describe("MediaStreamHandler security hardening", () => { it("fails sends when buffered bytes exceed cap after enqueueing a frame", () => { const handler = new MediaStreamHandler({ - sttProvider: createStubSttProvider(), + transcriptionProvider: createStubSttProvider(), + providerConfig: {}, }); const ws = { readyState: WebSocket.OPEN, @@ -214,7 +222,12 @@ describe("MediaStreamHandler security hardening", () => { handler as unknown as { sessions: Map< string, - { callId: string; streamSid: string; ws: WebSocket; sttSession: RealtimeSTTSession } + { + callId: string; + streamSid: string; + ws: WebSocket; + sttSession: RealtimeTranscriptionSession; + } >; } ).sessions.set("MZ-overflow", { @@ -243,7 +256,8 @@ describe("MediaStreamHandler security hardening", () => { const 
shouldAcceptStreamCalls: Array<{ callId: string; streamSid: string; token?: string }> = []; const handler = new MediaStreamHandler({ - sttProvider: createStubSttProvider(), + transcriptionProvider: createStubSttProvider(), + providerConfig: {}, preStartTimeoutMs: 40, shouldAcceptStream: (params) => { shouldAcceptStreamCalls.push(params); @@ -266,7 +280,8 @@ describe("MediaStreamHandler security hardening", () => { it("enforces pending connection limits", async () => { const handler = new MediaStreamHandler({ - sttProvider: createStubSttProvider(), + transcriptionProvider: createStubSttProvider(), + providerConfig: {}, preStartTimeoutMs: 5_000, maxPendingConnections: 1, maxPendingConnectionsPerIp: 1, @@ -291,7 +306,8 @@ describe("MediaStreamHandler security hardening", () => { it("rejects upgrades when max connection cap is reached", async () => { const handler = new MediaStreamHandler({ - sttProvider: createStubSttProvider(), + transcriptionProvider: createStubSttProvider(), + providerConfig: {}, preStartTimeoutMs: 5_000, maxConnections: 1, maxPendingConnections: 10, @@ -319,7 +335,8 @@ describe("MediaStreamHandler security hardening", () => { it("clears pending state after valid start", async () => { const handler = new MediaStreamHandler({ - sttProvider: createStubSttProvider(), + transcriptionProvider: createStubSttProvider(), + providerConfig: {}, preStartTimeoutMs: 40, shouldAcceptStream: () => true, }); @@ -349,7 +366,8 @@ describe("MediaStreamHandler security hardening", () => { const shouldAcceptStreamCalls: Array<{ callId: string; streamSid: string; token?: string }> = []; const handler = new MediaStreamHandler({ - sttProvider: createStubSttProvider(), + transcriptionProvider: createStubSttProvider(), + providerConfig: {}, preStartTimeoutMs: 1_000, shouldAcceptStream: (params) => { shouldAcceptStreamCalls.push(params); diff --git a/extensions/voice-call/src/media-stream.ts b/extensions/voice-call/src/media-stream.ts index fb259d723b8..0a051a18d38 100644 
--- a/extensions/voice-call/src/media-stream.ts +++ b/extensions/voice-call/src/media-stream.ts @@ -3,24 +3,27 @@ * * Handles bidirectional audio streaming between Twilio and the AI services. * - Receives mu-law audio from Twilio via WebSocket - * - Forwards to OpenAI Realtime STT for transcription + * - Forwards to the selected realtime transcription provider * - Sends TTS audio back to Twilio */ import type { IncomingMessage } from "node:http"; import type { Duplex } from "node:stream"; -import { type RawData, WebSocket, WebSocketServer } from "ws"; import type { - OpenAIRealtimeSTTProvider, - RealtimeSTTSession, -} from "./providers/stt-openai-realtime.js"; + RealtimeTranscriptionProviderConfig, + RealtimeTranscriptionProviderPlugin, + RealtimeTranscriptionSession, +} from "openclaw/plugin-sdk/realtime-transcription"; +import { type RawData, WebSocket, WebSocketServer } from "ws"; /** * Configuration for the media stream handler. */ export interface MediaStreamConfig { - /** STT provider for transcription */ - sttProvider: OpenAIRealtimeSTTProvider; + /** Realtime transcription provider for streaming STT. */ + transcriptionProvider: RealtimeTranscriptionProviderPlugin; + /** Provider-owned config blob passed into the transcription session. */ + providerConfig: RealtimeTranscriptionProviderConfig; /** Close sockets that never send a valid `start` frame within this window. */ preStartTimeoutMs?: number; /** Max concurrent pre-start sockets. 
*/ @@ -50,7 +53,7 @@ interface StreamSession { callId: string; streamSid: string; ws: WebSocket; - sttSession: RealtimeSTTSession; + sttSession: RealtimeTranscriptionSession; } type TtsQueueEntry = { @@ -254,20 +257,20 @@ export class MediaStreamHandler { return null; } - // Create STT session - const sttSession = this.config.sttProvider.createSession(); - - // Set up transcript callbacks - sttSession.onPartial((partial) => { - this.config.onPartialTranscript?.(callSid, partial); - }); - - sttSession.onTranscript((transcript) => { - this.config.onTranscript?.(callSid, transcript); - }); - - sttSession.onSpeechStart(() => { - this.config.onSpeechStart?.(callSid); + const sttSession = this.config.transcriptionProvider.createSession({ + providerConfig: this.config.providerConfig, + onPartial: (partial) => { + this.config.onPartialTranscript?.(callSid, partial); + }, + onTranscript: (transcript) => { + this.config.onTranscript?.(callSid, transcript); + }, + onSpeechStart: () => { + this.config.onSpeechStart?.(callSid); + }, + onError: (error) => { + console.warn("[MediaStream] Transcription session error:", error.message); + }, }); const session: StreamSession = { @@ -282,7 +285,7 @@ export class MediaStreamHandler { // Notify connection BEFORE STT connect so TTS can work even if STT fails this.config.onConnect?.(callSid, streamSid); - // Connect to OpenAI STT (non-blocking, log errors but don't fail the call) + // Connect to transcription service (non-blocking, log errors but don't fail the call) sttSession.connect().catch((err) => { console.warn(`[MediaStream] STT connection failed (TTS still works):`, err.message); }); diff --git a/extensions/voice-call/src/providers/index.ts b/extensions/voice-call/src/providers/index.ts index c8183622e35..4b0c2e442d5 100644 --- a/extensions/voice-call/src/providers/index.ts +++ b/extensions/voice-call/src/providers/index.ts @@ -1,10 +1,5 @@ export type { VoiceCallProvider } from "./base.js"; export { MockProvider } from 
"./mock.js"; -export { - OpenAIRealtimeSTTProvider, - type RealtimeSTTConfig, - type RealtimeSTTSession, -} from "./stt-openai-realtime.js"; export { TelnyxProvider } from "./telnyx.js"; export { TwilioProvider } from "./twilio.js"; export { PlivoProvider } from "./plivo.js"; diff --git a/extensions/voice-call/src/providers/stt-openai-realtime.test.ts b/extensions/voice-call/src/providers/stt-openai-realtime.test.ts deleted file mode 100644 index 5788053db5c..00000000000 --- a/extensions/voice-call/src/providers/stt-openai-realtime.test.ts +++ /dev/null @@ -1,42 +0,0 @@ -import { describe, expect, it } from "vitest"; -import type { RealtimeSTTConfig } from "./stt-openai-realtime.js"; -import { OpenAIRealtimeSTTProvider } from "./stt-openai-realtime.js"; - -type ProviderInternals = { - vadThreshold: number; - silenceDurationMs: number; -}; - -function readProviderInternals(config: RealtimeSTTConfig): ProviderInternals { - const provider = new OpenAIRealtimeSTTProvider(config) as unknown as Record; - return { - vadThreshold: provider["vadThreshold"] as number, - silenceDurationMs: provider["silenceDurationMs"] as number, - }; -} - -describe("OpenAIRealtimeSTTProvider constructor defaults", () => { - it("uses vadThreshold: 0 when explicitly configured (max sensitivity)", () => { - const provider = readProviderInternals({ - apiKey: "sk-test", // pragma: allowlist secret - vadThreshold: 0, - }); - expect(provider.vadThreshold).toBe(0); - }); - - it("uses silenceDurationMs: 0 when explicitly configured", () => { - const provider = readProviderInternals({ - apiKey: "sk-test", // pragma: allowlist secret - silenceDurationMs: 0, - }); - expect(provider.silenceDurationMs).toBe(0); - }); - - it("falls back to defaults when values are undefined", () => { - const provider = readProviderInternals({ - apiKey: "sk-test", // pragma: allowlist secret - }); - expect(provider.vadThreshold).toBe(0.5); - expect(provider.silenceDurationMs).toBe(800); - }); -}); diff --git 
a/extensions/voice-call/src/providers/stt-openai-realtime.ts b/extensions/voice-call/src/providers/stt-openai-realtime.ts deleted file mode 100644 index eaced5eeef2..00000000000 --- a/extensions/voice-call/src/providers/stt-openai-realtime.ts +++ /dev/null @@ -1,321 +0,0 @@ -/** - * OpenAI Realtime STT Provider - * - * Uses the OpenAI Realtime API for streaming transcription with: - * - Direct mu-law audio support (no conversion needed) - * - Built-in server-side VAD for turn detection - * - Low-latency streaming transcription - * - Partial transcript callbacks for real-time UI updates - */ - -import WebSocket from "ws"; - -/** - * Configuration for OpenAI Realtime STT. - */ -export interface RealtimeSTTConfig { - /** OpenAI API key */ - apiKey: string; - /** Model to use (default: gpt-4o-transcribe) */ - model?: string; - /** Silence duration in ms before considering speech ended (default: 800) */ - silenceDurationMs?: number; - /** VAD threshold 0-1 (default: 0.5) */ - vadThreshold?: number; -} - -/** - * Session for streaming audio and receiving transcripts. - */ -export interface RealtimeSTTSession { - /** Connect to the transcription service */ - connect(): Promise; - /** Send mu-law audio data (8kHz mono) */ - sendAudio(audio: Buffer): void; - /** Wait for next complete transcript (after VAD detects end of speech) */ - waitForTranscript(timeoutMs?: number): Promise; - /** Set callback for partial transcripts (streaming) */ - onPartial(callback: (partial: string) => void): void; - /** Set callback for final transcripts */ - onTranscript(callback: (transcript: string) => void): void; - /** Set callback when speech starts (VAD) */ - onSpeechStart(callback: () => void): void; - /** Close the session */ - close(): void; - /** Check if session is connected */ - isConnected(): boolean; -} - -/** - * Provider factory for OpenAI Realtime STT sessions. 
- */ -export class OpenAIRealtimeSTTProvider { - readonly name = "openai-realtime"; - private apiKey: string; - private model: string; - private silenceDurationMs: number; - private vadThreshold: number; - - constructor(config: RealtimeSTTConfig) { - if (!config.apiKey) { - throw new Error("OpenAI API key required for Realtime STT"); - } - this.apiKey = config.apiKey; - this.model = config.model || "gpt-4o-transcribe"; - this.silenceDurationMs = config.silenceDurationMs ?? 800; - this.vadThreshold = config.vadThreshold ?? 0.5; - } - - /** - * Create a new realtime transcription session. - */ - createSession(): RealtimeSTTSession { - return new OpenAIRealtimeSTTSession( - this.apiKey, - this.model, - this.silenceDurationMs, - this.vadThreshold, - ); - } -} - -/** - * WebSocket-based session for real-time speech-to-text. - */ -class OpenAIRealtimeSTTSession implements RealtimeSTTSession { - private static readonly MAX_RECONNECT_ATTEMPTS = 5; - private static readonly RECONNECT_DELAY_MS = 1000; - - private ws: WebSocket | null = null; - private connected = false; - private closed = false; - private connectTimeout: ReturnType | null = null; - private reconnectAttempts = 0; - private pendingTranscript = ""; - private onTranscriptCallback: ((transcript: string) => void) | null = null; - private onPartialCallback: ((partial: string) => void) | null = null; - private onSpeechStartCallback: (() => void) | null = null; - - constructor( - private readonly apiKey: string, - private readonly model: string, - private readonly silenceDurationMs: number, - private readonly vadThreshold: number, - ) {} - - async connect(): Promise { - this.closed = false; - this.reconnectAttempts = 0; - return this.doConnect(); - } - - private async doConnect(): Promise { - return new Promise((resolve, reject) => { - const url = "wss://api.openai.com/v1/realtime?intent=transcription"; - - this.ws = new WebSocket(url, { - headers: { - Authorization: `Bearer ${this.apiKey}`, - "OpenAI-Beta": 
"realtime=v1", - }, - }); - - this.ws.on("open", () => { - console.log("[RealtimeSTT] WebSocket connected"); - this.connected = true; - this.reconnectAttempts = 0; - if (this.connectTimeout) { - clearTimeout(this.connectTimeout); - this.connectTimeout = null; - } - - // Configure the transcription session - this.sendEvent({ - type: "transcription_session.update", - session: { - input_audio_format: "g711_ulaw", - input_audio_transcription: { - model: this.model, - }, - turn_detection: { - type: "server_vad", - threshold: this.vadThreshold, - prefix_padding_ms: 300, - silence_duration_ms: this.silenceDurationMs, - }, - }, - }); - - resolve(); - }); - - this.ws.on("message", (data: Buffer) => { - try { - const event = JSON.parse(data.toString()); - this.handleEvent(event); - } catch (e) { - console.error("[RealtimeSTT] Failed to parse event:", e); - } - }); - - this.ws.on("error", (error) => { - console.error("[RealtimeSTT] WebSocket error:", error); - if (!this.connected) { - reject(error); - } - }); - - this.ws.on("close", (code, reason) => { - console.log( - `[RealtimeSTT] WebSocket closed (code: ${code}, reason: ${reason?.toString() || "none"})`, - ); - this.connected = false; - - // Attempt reconnection if not intentionally closed - if (!this.closed) { - void this.attemptReconnect(); - } - }); - - this.connectTimeout = setTimeout(() => { - this.connectTimeout = null; - if (!this.connected) { - reject(new Error("Realtime STT connection timeout")); - } - }, 10000); - }); - } - - private async attemptReconnect(): Promise { - if (this.closed) { - return; - } - - if (this.reconnectAttempts >= OpenAIRealtimeSTTSession.MAX_RECONNECT_ATTEMPTS) { - console.error( - `[RealtimeSTT] Max reconnect attempts (${OpenAIRealtimeSTTSession.MAX_RECONNECT_ATTEMPTS}) reached`, - ); - return; - } - - this.reconnectAttempts++; - const delay = OpenAIRealtimeSTTSession.RECONNECT_DELAY_MS * 2 ** (this.reconnectAttempts - 1); - console.log( - `[RealtimeSTT] Reconnecting 
${this.reconnectAttempts}/${OpenAIRealtimeSTTSession.MAX_RECONNECT_ATTEMPTS} in ${delay}ms...`, - ); - - await new Promise((resolve) => setTimeout(resolve, delay)); - - if (this.closed) { - return; - } - - try { - await this.doConnect(); - console.log("[RealtimeSTT] Reconnected successfully"); - } catch (error) { - console.error("[RealtimeSTT] Reconnect failed:", error); - } - } - - private handleEvent(event: { - type: string; - delta?: string; - transcript?: string; - error?: unknown; - }): void { - switch (event.type) { - case "transcription_session.created": - case "transcription_session.updated": - case "input_audio_buffer.speech_stopped": - case "input_audio_buffer.committed": - console.log(`[RealtimeSTT] ${event.type}`); - break; - - case "conversation.item.input_audio_transcription.delta": - if (event.delta) { - this.pendingTranscript += event.delta; - this.onPartialCallback?.(this.pendingTranscript); - } - break; - - case "conversation.item.input_audio_transcription.completed": - if (event.transcript) { - console.log(`[RealtimeSTT] Transcript: ${event.transcript}`); - this.onTranscriptCallback?.(event.transcript); - } - this.pendingTranscript = ""; - break; - - case "input_audio_buffer.speech_started": - console.log("[RealtimeSTT] Speech started"); - this.pendingTranscript = ""; - this.onSpeechStartCallback?.(); - break; - - case "error": - console.error("[RealtimeSTT] Error:", event.error); - break; - } - } - - private sendEvent(event: unknown): void { - if (this.ws?.readyState === WebSocket.OPEN) { - this.ws.send(JSON.stringify(event)); - } - } - - sendAudio(muLawData: Buffer): void { - if (!this.connected) { - return; - } - this.sendEvent({ - type: "input_audio_buffer.append", - audio: muLawData.toString("base64"), - }); - } - - onPartial(callback: (partial: string) => void): void { - this.onPartialCallback = callback; - } - - onTranscript(callback: (transcript: string) => void): void { - this.onTranscriptCallback = callback; - } - - 
onSpeechStart(callback: () => void): void { - this.onSpeechStartCallback = callback; - } - - async waitForTranscript(timeoutMs = 30000): Promise { - return new Promise((resolve, reject) => { - const timeout = setTimeout(() => { - this.onTranscriptCallback = null; - reject(new Error("Transcript timeout")); - }, timeoutMs); - - this.onTranscriptCallback = (transcript) => { - clearTimeout(timeout); - this.onTranscriptCallback = null; - resolve(transcript); - }; - }); - } - - close(): void { - this.closed = true; - if (this.connectTimeout) { - clearTimeout(this.connectTimeout); - this.connectTimeout = null; - } - if (this.ws) { - this.ws.close(); - this.ws = null; - } - this.connected = false; - } - - isConnected(): boolean { - return this.connected; - } -} diff --git a/extensions/voice-call/src/providers/tts-openai.test.ts b/extensions/voice-call/src/providers/tts-openai.test.ts deleted file mode 100644 index 79d4644b59f..00000000000 --- a/extensions/voice-call/src/providers/tts-openai.test.ts +++ /dev/null @@ -1,43 +0,0 @@ -import { describe, expect, it } from "vitest"; -import type { OpenAITTSConfig } from "./tts-openai.js"; -import { OpenAITTSProvider } from "./tts-openai.js"; - -type ProviderInternals = { - model: string; - voice: string; - speed: number; -}; - -function readProviderInternals(config: OpenAITTSConfig): ProviderInternals { - return new OpenAITTSProvider(config) as unknown as ProviderInternals; -} - -describe("OpenAITTSProvider constructor defaults", () => { - it("uses speed: 0 when explicitly configured", () => { - const provider = readProviderInternals({ - apiKey: "sk-test", // pragma: allowlist secret - speed: 0, - }); - - expect(provider.speed).toBe(0); - }); - - it("falls back to speed default when undefined", () => { - const provider = readProviderInternals({ - apiKey: "sk-test", // pragma: allowlist secret - }); - - expect(provider.speed).toBe(1.0); - }); - - it("treats blank model and voice overrides as unset", () => { - const provider = 
readProviderInternals({ - apiKey: "sk-test", // pragma: allowlist secret - model: " ", - voice: "", - }); - - expect(provider.model).toBe("gpt-4o-mini-tts"); - expect(provider.voice).toBe("coral"); - }); -}); diff --git a/extensions/voice-call/src/providers/tts-openai.ts b/extensions/voice-call/src/providers/tts-openai.ts deleted file mode 100644 index 1fdc7a147d3..00000000000 --- a/extensions/voice-call/src/providers/tts-openai.ts +++ /dev/null @@ -1,185 +0,0 @@ -import { convertPcmToMulaw8k } from "../telephony-audio.js"; - -/** - * OpenAI TTS Provider - * - * Generates speech audio using OpenAI's text-to-speech API. - * Handles audio format conversion for telephony (mu-law 8kHz). - * - * Best practices from OpenAI docs: - * - Use gpt-4o-mini-tts for intelligent realtime applications (supports instructions) - * - Use tts-1 for lower latency, tts-1-hd for higher quality - * - Use marin or cedar voices for best quality - * - Use pcm or wav format for fastest response times - * - * @see https://platform.openai.com/docs/guides/text-to-speech - */ - -/** - * OpenAI TTS configuration. - */ -export interface OpenAITTSConfig { - /** OpenAI API key (uses OPENAI_API_KEY env if not set) */ - apiKey?: string; - /** - * TTS model: - * - gpt-4o-mini-tts: newest, supports instructions for tone/style control (recommended) - * - tts-1: lower latency - * - tts-1-hd: higher quality - */ - model?: string; - /** - * Voice to use. For best quality, use marin or cedar. - * All 13 voices: alloy, ash, ballad, coral, echo, fable, nova, onyx, sage, shimmer, verse, marin, cedar - * Note: tts-1/tts-1-hd only support: alloy, ash, coral, echo, fable, onyx, nova, sage, shimmer - */ - voice?: string; - /** Speed multiplier (0.25 to 4.0) */ - speed?: number; - /** - * Instructions for speech style (only works with gpt-4o-mini-tts model). 
- * Examples: "Speak in a cheerful tone", "Talk like a sympathetic customer service agent" - */ - instructions?: string; -} - -/** - * Supported OpenAI TTS voices (all 13 built-in voices). - * For best quality, use marin or cedar. - * Note: tts-1 and tts-1-hd support a smaller set. - */ -export const OPENAI_TTS_VOICES = [ - "alloy", - "ash", - "ballad", - "coral", - "echo", - "fable", - "nova", - "onyx", - "sage", - "shimmer", - "verse", - "marin", - "cedar", -] as const; - -export type OpenAITTSVoice = (typeof OPENAI_TTS_VOICES)[number]; - -function trimToUndefined(value: string | undefined): string | undefined { - const trimmed = value?.trim(); - return trimmed ? trimmed : undefined; -} - -function resolveOpenAITtsInstructions(model: string, instructions?: string): string | undefined { - const next = trimToUndefined(instructions); - return next && model.includes("gpt-4o-mini-tts") ? next : undefined; -} - -/** - * OpenAI TTS Provider for generating speech audio. - */ -export class OpenAITTSProvider { - private apiKey: string; - private model: string; - private voice: OpenAITTSVoice; - private speed: number; - private instructions?: string; - - constructor(config: OpenAITTSConfig = {}) { - this.apiKey = - trimToUndefined(config.apiKey) ?? trimToUndefined(process.env.OPENAI_API_KEY) ?? ""; - // Default to gpt-4o-mini-tts for intelligent realtime applications - this.model = trimToUndefined(config.model) ?? "gpt-4o-mini-tts"; - // Default to coral - good balance of quality and natural tone - this.voice = (trimToUndefined(config.voice) as OpenAITTSVoice | undefined) ?? "coral"; - this.speed = config.speed ?? 1.0; - this.instructions = trimToUndefined(config.instructions); - - if (!this.apiKey) { - throw new Error("OpenAI API key required (set OPENAI_API_KEY or pass apiKey)"); - } - } - - /** - * Generate speech audio from text. - * Returns raw PCM audio data (24kHz, mono, 16-bit). 
- */ - async synthesize(text: string, instructions?: string): Promise { - // Build request body - const body: Record = { - model: this.model, - input: text, - voice: this.voice, - response_format: "pcm", // Raw PCM audio (24kHz, mono, 16-bit signed LE) - speed: this.speed, - }; - - const effectiveInstructions = resolveOpenAITtsInstructions( - this.model, - trimToUndefined(instructions) ?? this.instructions, - ); - if (effectiveInstructions) { - body.instructions = effectiveInstructions; - } - - const response = await fetch("https://api.openai.com/v1/audio/speech", { - method: "POST", - headers: { - Authorization: `Bearer ${this.apiKey}`, - "Content-Type": "application/json", - }, - body: JSON.stringify(body), - }); - - if (!response.ok) { - const error = await response.text(); - throw new Error(`OpenAI TTS failed: ${response.status} - ${error}`); - } - - const arrayBuffer = await response.arrayBuffer(); - return Buffer.from(arrayBuffer); - } - - /** - * Generate speech and convert to mu-law format for Twilio. - * Twilio Media Streams expect 8kHz mono mu-law audio. - */ - async synthesizeForTwilio(text: string): Promise { - // Get raw PCM from OpenAI (24kHz, 16-bit signed LE, mono) - const pcm24k = await this.synthesize(text); - - // Convert from 24kHz PCM to Twilio-compatible 8kHz mu-law - return convertPcmToMulaw8k(pcm24k, 24000); - } -} - -/** - * Convert 8-bit mu-law to 16-bit linear PCM. - * Useful for decoding incoming audio. - */ -export function mulawToLinear(mulaw: number): number { - // mu-law is transmitted inverted - mulaw = ~mulaw & 0xff; - - const sign = mulaw & 0x80; - const exponent = (mulaw >> 4) & 0x07; - const mantissa = mulaw & 0x0f; - - let sample = ((mantissa << 3) + 132) << exponent; - sample -= 132; - - return sign ? -sample : sample; -} - -/** - * Chunk audio buffer into 20ms frames for streaming. - * At 8kHz mono, 20ms = 160 samples = 160 bytes (mu-law). 
- */ -export function chunkAudio(audio: Buffer, chunkSize = 160): Generator { - return (function* () { - for (let i = 0; i < audio.length; i += chunkSize) { - yield audio.subarray(i, Math.min(i + chunkSize, audio.length)); - } - })(); -} diff --git a/extensions/voice-call/src/realtime-transcription.runtime.ts b/extensions/voice-call/src/realtime-transcription.runtime.ts new file mode 100644 index 00000000000..e532af72e13 --- /dev/null +++ b/extensions/voice-call/src/realtime-transcription.runtime.ts @@ -0,0 +1,4 @@ +export { + getRealtimeTranscriptionProvider, + listRealtimeTranscriptionProviders, +} from "openclaw/plugin-sdk/realtime-transcription"; diff --git a/extensions/voice-call/src/realtime-voice.runtime.ts b/extensions/voice-call/src/realtime-voice.runtime.ts new file mode 100644 index 00000000000..6f169676adf --- /dev/null +++ b/extensions/voice-call/src/realtime-voice.runtime.ts @@ -0,0 +1,4 @@ +export { + getRealtimeVoiceProvider, + listRealtimeVoiceProviders, +} from "openclaw/plugin-sdk/realtime-voice"; diff --git a/extensions/voice-call/src/runtime.ts b/extensions/voice-call/src/runtime.ts index 6059f2ce5b8..501c5549fb2 100644 --- a/extensions/voice-call/src/runtime.ts +++ b/extensions/voice-call/src/runtime.ts @@ -1,12 +1,14 @@ +import type { OpenClawConfig } from "openclaw/plugin-sdk/core"; +import type { + RealtimeVoiceProviderConfig, + RealtimeVoiceProviderPlugin, +} from "openclaw/plugin-sdk/realtime-voice"; import type { VoiceCallConfig } from "./config.js"; import { resolveVoiceCallConfig, validateProviderConfig } from "./config.js"; import type { CoreAgentDeps, CoreConfig } from "./core-bridge.js"; import { CallManager } from "./manager.js"; import type { VoiceCallProvider } from "./providers/base.js"; -import { MockProvider } from "./providers/mock.js"; -import { PlivoProvider } from "./providers/plivo.js"; -import { TelnyxProvider } from "./providers/telnyx.js"; -import { TwilioProvider } from "./providers/twilio.js"; +import type { 
TwilioProvider } from "./providers/twilio.js"; import type { TelephonyTtsRuntime } from "./telephony-tts.js"; import { createTelephonyTtsProvider } from "./telephony-tts.js"; import { startTunnel, type TunnelResult } from "./tunnel.js"; @@ -30,6 +32,11 @@ type Logger = { debug?: (message: string) => void; }; +type ResolvedRealtimeProvider = { + provider: RealtimeVoiceProviderPlugin; + providerConfig: RealtimeVoiceProviderConfig; +}; + function createRuntimeResourceLifecycle(params: { config: VoiceCallConfig; webhookServer: VoiceCallWebhookServer; @@ -80,14 +87,15 @@ function isLoopbackBind(bind: string | undefined): boolean { return bind === "127.0.0.1" || bind === "::1" || bind === "localhost"; } -function resolveProvider(config: VoiceCallConfig): VoiceCallProvider { +async function resolveProvider(config: VoiceCallConfig): Promise { const allowNgrokFreeTierLoopbackBypass = config.tunnel?.provider === "ngrok" && isLoopbackBind(config.serve?.bind) && (config.tunnel?.allowNgrokFreeTierLoopbackBypass ?? 
false); switch (config.provider) { - case "telnyx": + case "telnyx": { + const { TelnyxProvider } = await import("./providers/telnyx.js"); return new TelnyxProvider( { apiKey: config.telnyx?.apiKey, @@ -98,7 +106,9 @@ function resolveProvider(config: VoiceCallConfig): VoiceCallProvider { skipVerification: config.skipSignatureVerification, }, ); - case "twilio": + } + case "twilio": { + const { TwilioProvider } = await import("./providers/twilio.js"); return new TwilioProvider( { accountSid: config.twilio?.accountSid, @@ -112,7 +122,9 @@ function resolveProvider(config: VoiceCallConfig): VoiceCallProvider { webhookSecurity: config.webhookSecurity, }, ); - case "plivo": + } + case "plivo": { + const { PlivoProvider } = await import("./providers/plivo.js"); return new PlivoProvider( { authId: config.plivo?.authId, @@ -125,21 +137,66 @@ function resolveProvider(config: VoiceCallConfig): VoiceCallProvider { webhookSecurity: config.webhookSecurity, }, ); - case "mock": + } + case "mock": { + const { MockProvider } = await import("./providers/mock.js"); return new MockProvider(); + } default: throw new Error(`Unsupported voice-call provider: ${String(config.provider)}`); } } +async function resolveRealtimeProvider(params: { + config: VoiceCallConfig; + fullConfig: OpenClawConfig; +}): Promise { + const { getRealtimeVoiceProvider, listRealtimeVoiceProviders } = + await import("./realtime-voice.runtime.js"); + const configuredProviderId = params.config.realtime.provider?.trim(); + const configuredProvider = getRealtimeVoiceProvider(configuredProviderId, params.fullConfig); + if (configuredProviderId && !configuredProvider) { + throw new Error(`Realtime voice provider "${configuredProviderId}" is not registered`); + } + const provider = + configuredProvider ?? + [...listRealtimeVoiceProviders(params.fullConfig)].sort( + (left, right) => + (left.autoSelectOrder ?? Number.MAX_SAFE_INTEGER) - + (right.autoSelectOrder ?? 
Number.MAX_SAFE_INTEGER), + )[0]; + if (!provider) { + throw new Error("No realtime voice provider registered"); + } + + const rawProviderConfig = + (params.config.realtime.providers?.[provider.id] as RealtimeVoiceProviderConfig | undefined) ?? + {}; + const providerConfig = + provider.resolveConfig?.({ + cfg: params.fullConfig, + rawConfig: { + providers: params.config.realtime.providers, + [provider.id]: rawProviderConfig, + }, + }) ?? rawProviderConfig; + + if (!provider.isConfigured({ cfg: params.fullConfig, providerConfig })) { + throw new Error(`Realtime voice provider "${provider.id}" is not configured`); + } + + return { provider, providerConfig }; +} + export async function createVoiceCallRuntime(params: { config: VoiceCallConfig; coreConfig: CoreConfig; + fullConfig?: OpenClawConfig; agentRuntime: CoreAgentDeps; ttsRuntime?: TelephonyTtsRuntime; logger?: Logger; }): Promise { - const { config: rawConfig, coreConfig, agentRuntime, ttsRuntime, logger } = params; + const { config: rawConfig, coreConfig, fullConfig, agentRuntime, ttsRuntime, logger } = params; const log = logger ?? { info: console.log, warn: console.warn, @@ -164,8 +221,14 @@ export async function createVoiceCallRuntime(params: { throw new Error(`Invalid voice-call config: ${validation.errors.join("; ")}`); } - const provider = resolveProvider(config); + const provider = await resolveProvider(config); const manager = new CallManager(config); + const realtimeProvider = config.realtime.enabled + ? await resolveRealtimeProvider({ + config, + fullConfig: (fullConfig ?? 
(coreConfig as OpenClawConfig)) as OpenClawConfig, + }) + : null; const webhookServer = new VoiceCallWebhookServer( config, manager, @@ -173,6 +236,19 @@ export async function createVoiceCallRuntime(params: { coreConfig, agentRuntime, ); + if (realtimeProvider) { + const { RealtimeCallHandler } = await import("./webhook/realtime-handler.js"); + webhookServer.setRealtimeHandler( + new RealtimeCallHandler( + config.realtime, + manager, + provider, + realtimeProvider.provider, + realtimeProvider.providerConfig, + config.serve.path, + ), + ); + } const lifecycle = createRuntimeResourceLifecycle({ config, webhookServer }); const localUrl = await webhookServer.start(); @@ -212,6 +288,9 @@ export async function createVoiceCallRuntime(params: { if (publicUrl && provider.name === "twilio") { (provider as TwilioProvider).setPublicUrl(publicUrl); } + if (publicUrl && realtimeProvider) { + webhookServer.getRealtimeHandler()?.setPublicUrl(publicUrl); + } if (provider.name === "twilio" && config.streaming?.enabled) { const twilioProvider = provider as TwilioProvider; @@ -243,6 +322,10 @@ export async function createVoiceCallRuntime(params: { } } + if (realtimeProvider) { + log.info(`[voice-call] Realtime voice provider: ${realtimeProvider.provider.id}`); + } + await manager.initialize(provider, webhookUrl); const stop = async () => await lifecycle.stop(); diff --git a/extensions/voice-call/src/test-fixtures.ts b/extensions/voice-call/src/test-fixtures.ts index 4302143b7f0..bb05a6e4bc6 100644 --- a/extensions/voice-call/src/test-fixtures.ts +++ b/extensions/voice-call/src/test-fixtures.ts @@ -30,16 +30,26 @@ export function createVoiceCallBaseConfig(params?: { }, streaming: { enabled: false, - sttProvider: "openai-realtime", - sttModel: "gpt-4o-transcribe", - silenceDurationMs: 800, - vadThreshold: 0.5, + provider: "openai", + providers: { + openai: { + model: "gpt-4o-transcribe", + silenceDurationMs: 800, + vadThreshold: 0.5, + }, + }, streamPath: "/voice/stream", 
preStartTimeoutMs: 5000, maxPendingConnections: 32, maxPendingConnectionsPerIp: 4, maxConnections: 128, }, + realtime: { + enabled: false, + streamPath: "/voice/stream/realtime", + tools: [], + providers: {}, + }, skipSignatureVerification: false, stt: { provider: "openai", model: "whisper-1" }, tts: { diff --git a/extensions/voice-call/src/webhook.test.ts b/extensions/voice-call/src/webhook.test.ts index f87193c7cd4..cb15233f856 100644 --- a/extensions/voice-call/src/webhook.test.ts +++ b/extensions/voice-call/src/webhook.test.ts @@ -1,10 +1,36 @@ import { request } from "node:http"; +import type { RealtimeTranscriptionProviderPlugin } from "openclaw/plugin-sdk/realtime-transcription"; import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; import { VoiceCallConfigSchema, type VoiceCallConfig } from "./config.js"; import type { CallManager } from "./manager.js"; import type { VoiceCallProvider } from "./providers/base.js"; import type { CallRecord, NormalizedEvent } from "./types.js"; import { VoiceCallWebhookServer } from "./webhook.js"; +import type { RealtimeCallHandler } from "./webhook/realtime-handler.js"; + +const mocks = vi.hoisted(() => { + const realtimeTranscriptionProvider: RealtimeTranscriptionProviderPlugin = { + id: "openai", + label: "OpenAI", + aliases: ["openai-realtime"], + isConfigured: () => true, + resolveConfig: ({ rawConfig }) => rawConfig, + createSession: () => ({ + connect: async () => {}, + sendAudio: () => {}, + close: () => {}, + isConnected: () => true, + }), + }; + + return { + getRealtimeTranscriptionProvider: vi.fn(() => realtimeTranscriptionProvider), + }; +}); + +vi.mock("./realtime-transcription.runtime.js", () => ({ + getRealtimeTranscriptionProvider: mocks.getRealtimeTranscriptionProvider, +})); const provider: VoiceCallProvider = { name: "mock", @@ -291,6 +317,56 @@ describe("VoiceCallWebhookServer replay handling", () => { } }); + it("returns realtime TwiML for replayed inbound twilio webhooks", async () 
=> { + const parseWebhookEvent = vi.fn(() => ({ events: [], statusCode: 200 })); + const twilioProvider: VoiceCallProvider = { + ...provider, + name: "twilio", + verifyWebhook: () => ({ ok: true, isReplay: true, verifiedRequestKey: "twilio:req:replay" }), + parseWebhookEvent, + }; + const { manager, processEvent } = createManager([]); + const config = createConfig({ + provider: "twilio", + inboundPolicy: "allowlist", + realtime: { + enabled: true, + streamPath: "/voice/stream/realtime", + tools: [], + providers: {}, + }, + }); + const server = new VoiceCallWebhookServer(config, manager, twilioProvider); + server.setRealtimeHandler({ + buildTwiMLPayload: () => ({ + statusCode: 200, + headers: { "Content-Type": "text/xml" }, + body: '', + }), + getStreamPathPattern: () => "/voice/stream/realtime", + handleWebSocketUpgrade: () => {}, + registerToolHandler: () => {}, + setPublicUrl: () => {}, + } as unknown as RealtimeCallHandler); + + try { + const baseUrl = await server.start(); + const response = await postWebhookFormWithHeaders( + server, + baseUrl, + "CallSid=CA123&Direction=inbound&CallStatus=ringing", + { "x-twilio-signature": "sig" }, + ); + + expect(response.status).toBe(200); + expect(await response.text()).toContain(" { const parseWebhookEvent = vi.fn((_ctx: unknown, options?: { verifiedRequestKey?: string }) => ({ events: [ @@ -625,6 +701,7 @@ describe("VoiceCallWebhookServer stream disconnect grace", () => { manager, twilioProvider as unknown as VoiceCallProvider, ); + await server.start(); const mediaHandler = server.getMediaStreamHandler() as unknown as { config: { @@ -717,6 +794,7 @@ describe("VoiceCallWebhookServer barge-in suppression during initial message", ( manager, createTwilioProvider(clearTtsQueue) as unknown as VoiceCallProvider, ); + await server.start(); const handleInboundResponse = vi.fn(async () => {}); ( server as unknown as { @@ -790,6 +868,7 @@ describe("VoiceCallWebhookServer barge-in suppression during initial message", ( manager, 
createTwilioProvider(clearTtsQueue) as unknown as VoiceCallProvider, ); + await server.start(); try { const media = getMediaCallbacks(server); diff --git a/extensions/voice-call/src/webhook.ts b/extensions/voice-call/src/webhook.ts index 4e20a00f441..e9f36a61ad5 100644 --- a/extensions/voice-call/src/webhook.ts +++ b/extensions/voice-call/src/webhook.ts @@ -1,5 +1,6 @@ import http from "node:http"; import { URL } from "node:url"; +import type { OpenClawConfig } from "openclaw/plugin-sdk/core"; import { createWebhookInFlightLimiter, WEBHOOK_BODY_READ_DEFAULTS, @@ -16,9 +17,10 @@ import type { CallManager } from "./manager.js"; import type { MediaStreamConfig } from "./media-stream.js"; import { MediaStreamHandler } from "./media-stream.js"; import type { VoiceCallProvider } from "./providers/base.js"; -import { OpenAIRealtimeSTTProvider } from "./providers/stt-openai-realtime.js"; +import { isProviderStatusTerminal } from "./providers/shared/call-status.js"; import type { TwilioProvider } from "./providers/twilio.js"; import type { CallRecord, NormalizedEvent, WebhookContext } from "./types.js"; +import type { RealtimeCallHandler } from "./webhook/realtime-handler.js"; import { startStaleCallReaper } from "./webhook/stale-call-reaper.js"; const MAX_WEBHOOK_BODY_BYTES = WEBHOOK_BODY_READ_DEFAULTS.preAuth.maxBytes; @@ -44,7 +46,7 @@ function sanitizeTranscriptForLog(value: string): string { return `${sanitized.slice(0, TRANSCRIPT_LOG_MAX_CHARS)}...`; } -type WebhookResponsePayload = { +export type WebhookResponsePayload = { statusCode: number; body: string; headers?: Record; @@ -89,6 +91,8 @@ export class VoiceCallWebhookServer { private mediaStreamHandler: MediaStreamHandler | null = null; /** Delayed auto-hangup timers keyed by provider call ID after stream disconnect. */ private pendingDisconnectHangups = new Map>(); + /** Realtime voice handler for duplex provider bridges. 
*/ + private realtimeHandler: RealtimeCallHandler | null = null; constructor( config: VoiceCallConfig, @@ -102,11 +106,6 @@ export class VoiceCallWebhookServer { this.provider = provider; this.coreConfig = coreConfig ?? null; this.agentRuntime = agentRuntime ?? null; - - // Initialize media stream handler if streaming is enabled - if (this.config.streaming.enabled) { - this.initializeMediaStreaming(); - } } /** @@ -116,6 +115,14 @@ export class VoiceCallWebhookServer { return this.mediaStreamHandler; } + getRealtimeHandler(): RealtimeCallHandler | null { + return this.realtimeHandler; + } + + setRealtimeHandler(handler: RealtimeCallHandler): void { + this.realtimeHandler = handler; + } + private clearPendingDisconnectHangup(providerCallId: string): void { const existing = this.pendingDisconnectHangups.get(providerCallId); if (!existing) { @@ -147,26 +154,50 @@ export class VoiceCallWebhookServer { } /** - * Initialize media streaming with OpenAI Realtime STT. + * Initialize media streaming with the selected realtime transcription provider. */ - private initializeMediaStreaming(): void { + private async initializeMediaStreaming(): Promise { const streaming = this.config.streaming; - const apiKey = streaming.openaiApiKey ?? process.env.OPENAI_API_KEY; - - if (!apiKey) { - console.warn("[voice-call] Streaming enabled but no OpenAI API key found"); + const selectedProviderId = streaming.provider; + const pluginConfig = this.coreConfig as unknown as OpenClawConfig | undefined; + const { getRealtimeTranscriptionProvider } = + await import("./realtime-transcription.runtime.js"); + const provider = getRealtimeTranscriptionProvider(selectedProviderId, pluginConfig); + if (!provider) { + console.warn( + `[voice-call] Streaming enabled but realtime transcription provider "${selectedProviderId}" is not registered`, + ); + return; + } + const selectedProviderConfig = + streaming.providers[selectedProviderId] && + typeof streaming.providers[selectedProviderId] === "object" + ? 
(streaming.providers[selectedProviderId] as Record) + : undefined; + const canonicalProviderConfig = + streaming.providers[provider.id] && typeof streaming.providers[provider.id] === "object" + ? (streaming.providers[provider.id] as Record) + : undefined; + const rawProviderConfig = { + ...(canonicalProviderConfig ?? {}), + ...(selectedProviderConfig ?? {}), + }; + const providerConfig = provider.resolveConfig + ? provider.resolveConfig({ + cfg: pluginConfig ?? ({} as OpenClawConfig), + rawConfig: rawProviderConfig, + }) + : rawProviderConfig; + if (!provider.isConfigured({ cfg: pluginConfig, providerConfig })) { + console.warn( + `[voice-call] Streaming enabled but provider "${provider.id}" is not configured`, + ); return; } - const sttProvider = new OpenAIRealtimeSTTProvider({ - apiKey, - model: streaming.sttModel, - silenceDurationMs: streaming.silenceDurationMs, - vadThreshold: streaming.vadThreshold, - }); - const streamConfig: MediaStreamConfig = { - sttProvider, + transcriptionProvider: provider, + providerConfig, preStartTimeoutMs: streaming.preStartTimeoutMs, maxPendingConnections: streaming.maxPendingConnections, maxPendingConnectionsPerIp: streaming.maxPendingConnectionsPerIp, @@ -309,6 +340,10 @@ export class VoiceCallWebhookServer { return this.listeningUrl ?? this.resolveListeningUrl(bind, webhookPath); } + if (this.config.streaming.enabled && !this.mediaStreamHandler) { + await this.initializeMediaStreaming(); + } + return new Promise((resolve, reject) => { this.server = http.createServer((req, res) => { this.handleRequest(req, res, webhookPath).catch((err) => { @@ -318,12 +353,15 @@ export class VoiceCallWebhookServer { }); }); - // Handle WebSocket upgrades for media streams - if (this.mediaStreamHandler) { + // Handle WebSocket upgrades for realtime voice and media streams. 
+ if (this.realtimeHandler || this.mediaStreamHandler) { this.server.on("upgrade", (request, socket, head) => { + if (this.realtimeHandler && this.isRealtimeWebSocketUpgrade(request)) { + this.realtimeHandler.handleWebSocketUpgrade(request, socket, head); + return; + } const path = this.getUpgradePathname(request); - if (path === streamPath) { - console.log("[voice-call] WebSocket upgrade for media stream"); + if (path === streamPath && this.mediaStreamHandler) { this.mediaStreamHandler?.handleUpgrade(request, socket, head); } else { socket.destroy(); @@ -504,6 +542,10 @@ export class VoiceCallWebhookServer { return { statusCode: 401, body: "Unauthorized" }; } + if (this.shouldShortCircuitToRealtimeTwiml(ctx)) { + return this.realtimeHandler!.buildTwiMLPayload(req, new URLSearchParams(ctx.rawBody)); + } + const parsed = this.provider.parseWebhookEvent(ctx, { verifiedRequestKey: verification.verifiedRequestKey, }); @@ -555,6 +597,42 @@ export class VoiceCallWebhookServer { } } + private isRealtimeWebSocketUpgrade(req: http.IncomingMessage): boolean { + try { + const pathname = buildRequestUrl(req.url, req.headers.host).pathname; + const pattern = this.realtimeHandler?.getStreamPathPattern(); + return Boolean(pattern && pathname.startsWith(pattern)); + } catch { + return false; + } + } + + private shouldShortCircuitToRealtimeTwiml(ctx: WebhookContext): boolean { + if (!this.realtimeHandler || this.provider.name !== "twilio") { + return false; + } + + const params = new URLSearchParams(ctx.rawBody); + const direction = params.get("Direction"); + const isInbound = !direction || direction === "inbound"; + if (!isInbound) { + return false; + } + + if (ctx.query?.type === "status") { + return false; + } + + const callStatus = params.get("CallStatus"); + if (callStatus && isProviderStatusTerminal(callStatus)) { + return false; + } + + // Replays must return the same TwiML body so Twilio retries reconnect cleanly. 
+ // The one-time token still changes, but the behavior stays identical. + return !params.get("SpeechResult") && !params.get("Digits"); + } + private processParsedEvents(events: NormalizedEvent[]): void { for (const event of events) { try { diff --git a/extensions/voice-call/src/webhook/realtime-handler.test.ts b/extensions/voice-call/src/webhook/realtime-handler.test.ts new file mode 100644 index 00000000000..dd6489fd9b9 --- /dev/null +++ b/extensions/voice-call/src/webhook/realtime-handler.test.ts @@ -0,0 +1,92 @@ +import http from "node:http"; +import type { + RealtimeVoiceBridge, + RealtimeVoiceProviderPlugin, +} from "openclaw/plugin-sdk/realtime-voice"; +import { describe, expect, it, vi } from "vitest"; +import type { VoiceCallRealtimeConfig } from "../config.js"; +import type { CallManager } from "../manager.js"; +import type { VoiceCallProvider } from "../providers/base.js"; +import { RealtimeCallHandler } from "./realtime-handler.js"; + +function makeRequest(url: string, host = "gateway.ts.net"): http.IncomingMessage { + const req = new http.IncomingMessage(null as never); + req.url = url; + req.method = "POST"; + req.headers = host ? 
{ host } : {}; + return req; +} + +function makeBridge(): RealtimeVoiceBridge { + return { + connect: async () => {}, + sendAudio: () => {}, + setMediaTimestamp: () => {}, + submitToolResult: () => {}, + acknowledgeMark: () => {}, + close: () => {}, + isConnected: () => true, + triggerGreeting: () => {}, + }; +} + +const realtimeProvider: RealtimeVoiceProviderPlugin = { + id: "openai", + label: "OpenAI", + isConfigured: () => true, + createBridge: () => makeBridge(), +}; + +function makeHandler(overrides?: Partial) { + return new RealtimeCallHandler( + { + enabled: true, + streamPath: "/voice/stream/realtime", + instructions: "Be helpful.", + tools: [], + providers: {}, + ...overrides, + }, + { + processEvent: vi.fn(), + getCallByProviderCallId: vi.fn(), + } as unknown as CallManager, + { + name: "twilio", + verifyWebhook: vi.fn(), + parseWebhookEvent: vi.fn(), + initiateCall: vi.fn(), + hangupCall: vi.fn(), + playTts: vi.fn(), + startListening: vi.fn(), + stopListening: vi.fn(), + getCallStatus: vi.fn(), + } as unknown as VoiceCallProvider, + realtimeProvider, + { apiKey: "test-key" }, + "/voice/webhook", + ); +} + +describe("RealtimeCallHandler path routing", () => { + it("uses the request host and stream path in TwiML", () => { + const handler = makeHandler(); + const payload = handler.buildTwiMLPayload(makeRequest("/voice/webhook", "gateway.ts.net")); + + expect(payload.statusCode).toBe(200); + expect(payload.body).toMatch( + /wss:\/\/gateway\.ts\.net\/voice\/stream\/realtime\/[0-9a-f-]{36}/, + ); + }); + + it("preserves a public path prefix ahead of serve.path", () => { + const handler = makeHandler({ streamPath: "/custom/stream/realtime" }); + handler.setPublicUrl("https://public.example/api/voice/webhook"); + const payload = handler.buildTwiMLPayload(makeRequest("/voice/webhook", "127.0.0.1:3334")); + + expect(handler.getStreamPathPattern()).toBe("/api/custom/stream/realtime"); + expect(payload.body).toMatch( + 
/wss:\/\/public\.example\/api\/custom\/stream\/realtime\/[0-9a-f-]{36}/, + ); + }); +}); diff --git a/extensions/voice-call/src/webhook/realtime-handler.ts b/extensions/voice-call/src/webhook/realtime-handler.ts new file mode 100644 index 00000000000..dd6165bde83 --- /dev/null +++ b/extensions/voice-call/src/webhook/realtime-handler.ts @@ -0,0 +1,413 @@ +import { randomUUID } from "node:crypto"; +import http from "node:http"; +import type { Duplex } from "node:stream"; +import type { + RealtimeVoiceBridge, + RealtimeVoiceProviderConfig, + RealtimeVoiceProviderPlugin, +} from "openclaw/plugin-sdk/realtime-voice"; +import WebSocket, { WebSocketServer } from "ws"; +import type { VoiceCallRealtimeConfig } from "../config.js"; +import type { CallManager } from "../manager.js"; +import type { VoiceCallProvider } from "../providers/base.js"; +import type { CallRecord, NormalizedEvent } from "../types.js"; +import type { WebhookResponsePayload } from "../webhook.js"; + +export type ToolHandlerFn = (args: unknown, callId: string) => Promise; + +const STREAM_TOKEN_TTL_MS = 30_000; +const DEFAULT_HOST = "localhost:8443"; + +function normalizePath(pathname: string): string { + const trimmed = pathname.trim(); + if (!trimmed) { + return "/"; + } + const prefixed = trimmed.startsWith("/") ? trimmed : `/${trimmed}`; + if (prefixed === "/") { + return prefixed; + } + return prefixed.endsWith("/") ? prefixed.slice(0, -1) : prefixed; +} + +function buildGreetingInstructions( + baseInstructions: string | undefined, + greeting: string | undefined, +): string | undefined { + const trimmedGreeting = greeting?.trim(); + if (!trimmedGreeting) { + return baseInstructions; + } + const intro = + "Start the call by greeting the caller naturally. Include this greeting in your first spoken reply:"; + return baseInstructions + ? 
`${baseInstructions}\n\n${intro} "${trimmedGreeting}"` + : `${intro} "${trimmedGreeting}"`; +} + +type PendingStreamToken = { + expiry: number; + from?: string; + to?: string; + direction?: "inbound" | "outbound"; +}; + +type CallRegistration = { + callId: string; + initialGreetingInstructions?: string; +}; + +export class RealtimeCallHandler { + private readonly toolHandlers = new Map(); + private readonly pendingStreamTokens = new Map(); + private publicOrigin: string | null = null; + private publicPathPrefix = ""; + + constructor( + private readonly config: VoiceCallRealtimeConfig, + private readonly manager: CallManager, + private readonly provider: VoiceCallProvider, + private readonly realtimeProvider: RealtimeVoiceProviderPlugin, + private readonly providerConfig: RealtimeVoiceProviderConfig, + private readonly servePath: string, + ) {} + + setPublicUrl(url: string): void { + try { + const parsed = new URL(url); + this.publicOrigin = parsed.host; + const normalizedServePath = normalizePath(this.servePath); + const normalizedPublicPath = normalizePath(parsed.pathname); + const idx = normalizedPublicPath.indexOf(normalizedServePath); + this.publicPathPrefix = idx > 0 ? normalizedPublicPath.slice(0, idx) : ""; + } catch { + this.publicOrigin = null; + this.publicPathPrefix = ""; + } + } + + getStreamPathPattern(): string { + return `${this.publicPathPrefix}${normalizePath(this.config.streamPath ?? "/voice/stream/realtime")}`; + } + + buildTwiMLPayload(req: http.IncomingMessage, params?: URLSearchParams): WebhookResponsePayload { + const host = this.publicOrigin || req.headers.host || DEFAULT_HOST; + const rawDirection = params?.get("Direction"); + const token = this.issueStreamToken({ + from: params?.get("From") ?? undefined, + to: params?.get("To") ?? undefined, + direction: rawDirection === "outbound-api" ? 
"outbound" : "inbound", + }); + const wsUrl = `wss://${host}${this.getStreamPathPattern()}/${token}`; + const twiml = ` + + + + +`; + return { + statusCode: 200, + headers: { "Content-Type": "text/xml" }, + body: twiml, + }; + } + + handleWebSocketUpgrade(request: http.IncomingMessage, socket: Duplex, head: Buffer): void { + const url = new URL(request.url ?? "/", "wss://localhost"); + const token = url.pathname.split("/").pop() ?? null; + const callerMeta = token ? this.consumeStreamToken(token) : null; + if (!callerMeta) { + socket.write("HTTP/1.1 401 Unauthorized\r\n\r\n"); + socket.destroy(); + return; + } + + const wss = new WebSocketServer({ noServer: true }); + wss.handleUpgrade(request, socket, head, (ws) => { + let bridge: RealtimeVoiceBridge | null = null; + let initialized = false; + + ws.on("message", (data: Buffer) => { + try { + const msg = JSON.parse(data.toString()) as Record; + if (!initialized && msg.event === "start") { + initialized = true; + const startData = + typeof msg.start === "object" && msg.start !== null + ? (msg.start as Record) + : undefined; + const streamSid = + typeof startData?.streamSid === "string" ? startData.streamSid : "unknown"; + const callSid = typeof startData?.callSid === "string" ? startData.callSid : "unknown"; + bridge = this.handleCall(streamSid, callSid, ws, callerMeta); + return; + } + if (!bridge) { + return; + } + const mediaData = + typeof msg.media === "object" && msg.media !== null + ? 
(msg.media as Record) + : undefined; + if (msg.event === "media" && typeof mediaData?.payload === "string") { + bridge.sendAudio(Buffer.from(mediaData.payload, "base64")); + if (typeof mediaData.timestamp === "number") { + bridge.setMediaTimestamp(mediaData.timestamp); + } else if (typeof mediaData.timestamp === "string") { + bridge.setMediaTimestamp(Number.parseInt(mediaData.timestamp, 10)); + } + return; + } + if (msg.event === "mark") { + bridge.acknowledgeMark(); + return; + } + if (msg.event === "stop") { + bridge.close(); + } + } catch (error) { + console.error("[voice-call] realtime WS parse failed:", error); + } + }); + + ws.on("close", () => { + bridge?.close(); + }); + }); + } + + registerToolHandler(name: string, fn: ToolHandlerFn): void { + this.toolHandlers.set(name, fn); + } + + private issueStreamToken(meta: Omit = {}): string { + const token = randomUUID(); + this.pendingStreamTokens.set(token, { expiry: Date.now() + STREAM_TOKEN_TTL_MS, ...meta }); + for (const [candidate, entry] of this.pendingStreamTokens) { + if (Date.now() > entry.expiry) { + this.pendingStreamTokens.delete(candidate); + } + } + return token; + } + + private consumeStreamToken(token: string): Omit | null { + const entry = this.pendingStreamTokens.get(token); + if (!entry) { + return null; + } + this.pendingStreamTokens.delete(token); + if (Date.now() > entry.expiry) { + return null; + } + return { + from: entry.from, + to: entry.to, + direction: entry.direction, + }; + } + + private handleCall( + streamSid: string, + callSid: string, + ws: WebSocket, + callerMeta: Omit, + ): RealtimeVoiceBridge | null { + const registration = this.registerCallInManager(callSid, callerMeta); + if (!registration) { + ws.close(1008, "Caller rejected by policy"); + return null; + } + + const { callId, initialGreetingInstructions } = registration; + let bridge: RealtimeVoiceBridge | null = null; + let callEndEmitted = false; + const emitCallEnd = (reason: "completed" | "error") => { + if 
(callEndEmitted) { + return; + } + callEndEmitted = true; + this.endCallInManager(callSid, callId, reason); + }; + + bridge = this.realtimeProvider.createBridge({ + providerConfig: this.providerConfig, + instructions: this.config.instructions, + tools: this.config.tools, + onAudio: (muLaw) => { + if (ws.readyState !== WebSocket.OPEN) { + return; + } + ws.send( + JSON.stringify({ + event: "media", + streamSid, + media: { payload: muLaw.toString("base64") }, + }), + ); + }, + onClearAudio: () => { + if (ws.readyState !== WebSocket.OPEN) { + return; + } + ws.send(JSON.stringify({ event: "clear", streamSid })); + }, + onMark: (markName) => { + if (ws.readyState !== WebSocket.OPEN) { + return; + } + ws.send(JSON.stringify({ event: "mark", streamSid, mark: { name: markName } })); + }, + onTranscript: (role, text, isFinal) => { + if (!isFinal) { + return; + } + if (role === "user") { + const event: NormalizedEvent = { + id: `realtime-speech-${callSid}-${Date.now()}`, + type: "call.speech", + callId, + providerCallId: callSid, + timestamp: Date.now(), + transcript: text, + isFinal: true, + }; + this.manager.processEvent(event); + return; + } + this.manager.processEvent({ + id: `realtime-bot-${callSid}-${Date.now()}`, + type: "call.speaking", + callId, + providerCallId: callSid, + timestamp: Date.now(), + text, + }); + }, + onToolCall: (toolEvent) => { + if (!bridge) { + return; + } + void this.executeToolCall( + bridge, + callId, + toolEvent.callId || toolEvent.itemId, + toolEvent.name, + toolEvent.args, + ); + }, + onReady: () => { + bridge?.triggerGreeting?.(initialGreetingInstructions); + }, + onError: (error) => { + console.error("[voice-call] realtime voice error:", error.message); + }, + onClose: (reason) => { + if (reason !== "error") { + return; + } + emitCallEnd("error"); + if (ws.readyState === WebSocket.OPEN) { + ws.close(1011, "Bridge disconnected"); + } + void this.provider + .hangupCall({ callId, providerCallId: callSid, reason: "error" }) + .catch((error: 
unknown) => { + console.warn( + `[voice-call] Failed to hang up realtime call ${callSid}: ${ + error instanceof Error ? error.message : String(error) + }`, + ); + }); + }, + }); + + bridge.connect().catch((error: Error) => { + console.error("[voice-call] Failed to connect realtime bridge:", error); + bridge?.close(); + emitCallEnd("error"); + ws.close(1011, "Failed to connect"); + }); + + return bridge; + } + + private registerCallInManager( + callSid: string, + callerMeta: Omit = {}, + ): CallRegistration | null { + const timestamp = Date.now(); + const baseFields = { + providerCallId: callSid, + timestamp, + direction: (callerMeta.direction ?? "inbound") as "inbound" | "outbound", + ...(callerMeta.from ? { from: callerMeta.from } : {}), + ...(callerMeta.to ? { to: callerMeta.to } : {}), + }; + + this.manager.processEvent({ + id: `realtime-initiated-${callSid}`, + callId: callSid, + type: "call.initiated", + ...baseFields, + }); + + const callRecord = this.manager.getCallByProviderCallId(callSid); + if (!callRecord) { + return null; + } + + const initialGreeting = this.extractInitialGreeting(callRecord); + if (callRecord.metadata) { + delete callRecord.metadata.initialMessage; + } + + this.manager.processEvent({ + id: `realtime-answered-${callSid}`, + callId: callSid, + type: "call.answered", + ...baseFields, + }); + + return { + callId: callRecord.callId, + initialGreetingInstructions: buildGreetingInstructions( + this.config.instructions, + initialGreeting, + ), + }; + } + + private extractInitialGreeting(call: CallRecord): string | undefined { + return typeof call.metadata?.initialMessage === "string" + ? 
call.metadata.initialMessage + : undefined; + } + + private endCallInManager(callSid: string, callId: string, reason: "completed" | "error"): void { + this.manager.processEvent({ + id: `realtime-ended-${callSid}-${Date.now()}`, + type: "call.ended", + callId, + providerCallId: callSid, + timestamp: Date.now(), + reason, + }); + } + + private async executeToolCall( + bridge: RealtimeVoiceBridge, + callId: string, + bridgeCallId: string, + name: string, + args: unknown, + ): Promise { + const handler = this.toolHandlers.get(name); + const result = !handler + ? { error: `Tool "${name}" not available` } + : await handler(args, callId).catch((error: unknown) => ({ + error: error instanceof Error ? error.message : String(error), + })); + bridge.submitToolResult(bridgeCallId, result); + } +} diff --git a/extensions/zai/test-api.ts b/extensions/zai/test-api.ts new file mode 100644 index 00000000000..19ef1fbacf3 --- /dev/null +++ b/extensions/zai/test-api.ts @@ -0,0 +1 @@ +export { zaiMediaUnderstandingProvider } from "./media-understanding-provider.js"; diff --git a/package.json b/package.json index b2222d56b40..7ac3c149f23 100644 --- a/package.json +++ b/package.json @@ -551,6 +551,14 @@ "types": "./dist/plugin-sdk/reply-history.d.ts", "default": "./dist/plugin-sdk/reply-history.js" }, + "./plugin-sdk/realtime-voice": { + "types": "./dist/plugin-sdk/realtime-voice.d.ts", + "default": "./dist/plugin-sdk/realtime-voice.js" + }, + "./plugin-sdk/realtime-transcription": { + "types": "./dist/plugin-sdk/realtime-transcription.d.ts", + "default": "./dist/plugin-sdk/realtime-transcription.js" + }, "./plugin-sdk/media-understanding": { "types": "./dist/plugin-sdk/media-understanding.d.ts", "default": "./dist/plugin-sdk/media-understanding.js" diff --git a/scripts/lib/plugin-sdk-entrypoints.json b/scripts/lib/plugin-sdk-entrypoints.json index 13006c74a74..269bca270de 100644 --- a/scripts/lib/plugin-sdk-entrypoints.json +++ b/scripts/lib/plugin-sdk-entrypoints.json @@ -127,6 +127,8 
@@ "kimi-coding", "kilocode", "reply-history", + "realtime-transcription", + "realtime-voice", "media-understanding", "request-url", "runtime-store", diff --git a/scripts/write-cli-startup-metadata.ts b/scripts/write-cli-startup-metadata.ts index 402577ca8c0..4ede5e710ec 100644 --- a/scripts/write-cli-startup-metadata.ts +++ b/scripts/write-cli-startup-metadata.ts @@ -1,7 +1,8 @@ +import { spawnSync } from "node:child_process"; import { mkdirSync, readdirSync, readFileSync, writeFileSync } from "node:fs"; import path from "node:path"; -import { fileURLToPath } from "node:url"; -import { renderRootHelpText } from "../src/cli/program/root-help.ts"; +import { fileURLToPath, pathToFileURL } from "node:url"; +import { renderRootHelpText as renderSourceRootHelpText } from "../src/cli/program/root-help.ts"; function dedupe(values: string[]): string[] { const seen = new Set(); @@ -82,7 +83,37 @@ export function readBundledChannelCatalogIds( export async function renderBundledRootHelpText( _distDirOverride: string = distDir, ): Promise { - return await renderRootHelpText({ pluginDescriptors: [] }); + const bundleName = readdirSync(_distDirOverride).find( + (entry) => entry.startsWith("root-help-") && entry.endsWith(".js"), + ); + if (!bundleName) { + throw new Error("No root-help bundle found in dist; cannot write CLI startup metadata."); + } + const moduleUrl = pathToFileURL(path.join(_distDirOverride, bundleName)).href; + const inlineModule = [ + `const mod = await import(${JSON.stringify(moduleUrl)});`, + "if (typeof mod.outputRootHelp !== 'function') {", + ` throw new Error(${JSON.stringify(`Bundle ${bundleName} does not export outputRootHelp.`)});`, + "}", + "await mod.outputRootHelp();", + "process.exit(0);", + ].join("\n"); + const result = spawnSync(process.execPath, ["--input-type=module", "--eval", inlineModule], { + cwd: _distDirOverride, + encoding: "utf8", + timeout: 30_000, + }); + if (result.error) { + throw result.error; + } + if (result.status !== 0) { + const 
stderr = result.stderr?.trim(); + throw new Error( + `Failed to render bundled root help from ${bundleName}` + + (stderr ? `: ${stderr}` : result.signal ? `: terminated by ${result.signal}` : ""), + ); + } + return result.stdout ?? ""; } export async function writeCliStartupMetadata(options?: { @@ -95,7 +126,13 @@ export async function writeCliStartupMetadata(options?: { const resolvedExtensionsDir = options?.extensionsDir ?? extensionsDir; const catalog = readBundledChannelCatalogIds(resolvedExtensionsDir); const channelOptions = dedupe([...CORE_CHANNEL_ORDER, ...catalog]); - const rootHelpText = await renderBundledRootHelpText(resolvedDistDir); + const useSourceRootHelp = + resolvedDistDir === distDir && + resolvedOutputPath === outputPath && + resolvedExtensionsDir === extensionsDir; + const rootHelpText = useSourceRootHelp + ? await renderSourceRootHelpText({ pluginSdkResolution: "src" }) + : await renderBundledRootHelpText(resolvedDistDir); mkdirSync(resolvedDistDir, { recursive: true }); writeFileSync( @@ -115,4 +152,5 @@ export async function writeCliStartupMetadata(options?: { if (process.argv[1] && path.resolve(process.argv[1]) === scriptPath) { await writeCliStartupMetadata(); + process.exit(0); } diff --git a/src/cli/program/root-help.ts b/src/cli/program/root-help.ts index 9322d47c607..4328380f7ba 100644 --- a/src/cli/program/root-help.ts +++ b/src/cli/program/root-help.ts @@ -1,16 +1,14 @@ import { Command } from "commander"; import { getPluginCliCommandDescriptors } from "../../plugins/cli.js"; -import type { OpenClawPluginCliCommandDescriptor } from "../../plugins/types.js"; +import type { PluginLoadOptions } from "../../plugins/loader.js"; import { VERSION } from "../../version.js"; import { getCoreCliCommandDescriptors } from "./core-command-descriptors.js"; import { configureProgramHelp } from "./help.js"; import { getSubCliEntries } from "./subcli-descriptors.js"; -type RootHelpRenderOptions = { - pluginDescriptors?: 
OpenClawPluginCliCommandDescriptor[] | null; -}; +type RootHelpLoaderOptions = Pick; -async function buildRootHelpProgram(options?: RootHelpRenderOptions): Promise { +async function buildRootHelpProgram(loaderOptions?: RootHelpLoaderOptions): Promise { const program = new Command(); configureProgramHelp(program, { programVersion: VERSION, @@ -31,11 +29,7 @@ async function buildRootHelpProgram(options?: RootHelpRenderOptions): Promise { - const program = await buildRootHelpProgram(options); +export async function renderRootHelpText(loaderOptions?: RootHelpLoaderOptions): Promise { + const program = await buildRootHelpProgram(loaderOptions); let output = ""; const originalWrite = process.stdout.write.bind(process.stdout); const captureWrite: typeof process.stdout.write = ((chunk: string | Uint8Array) => { @@ -63,6 +57,6 @@ export async function renderRootHelpText(options?: RootHelpRenderOptions): Promi return output; } -export async function outputRootHelp(options?: RootHelpRenderOptions): Promise { - process.stdout.write(await renderRootHelpText(options)); +export async function outputRootHelp(loaderOptions?: RootHelpLoaderOptions): Promise { + process.stdout.write(await renderRootHelpText(loaderOptions)); } diff --git a/src/gateway/server-plugins.test.ts b/src/gateway/server-plugins.test.ts index 5d62168845d..0687a11983d 100644 --- a/src/gateway/server-plugins.test.ts +++ b/src/gateway/server-plugins.test.ts @@ -69,6 +69,8 @@ const createRegistry = (diagnostics: PluginDiagnostic[]): PluginRegistry => ({ commands: [], providers: [], speechProviders: [], + realtimeTranscriptionProviders: [], + realtimeVoiceProviders: [], mediaUnderstandingProviders: [], imageGenerationProviders: [], webFetchProviders: [], diff --git a/src/gateway/test-helpers.mocks.ts b/src/gateway/test-helpers.mocks.ts index 7bb3c184855..1b26cf62f23 100644 --- a/src/gateway/test-helpers.mocks.ts +++ b/src/gateway/test-helpers.mocks.ts @@ -201,6 +201,8 @@ const createStubPluginRegistry = (): 
PluginRegistry => ({ }), }, ], + realtimeTranscriptionProviders: [], + realtimeVoiceProviders: [], mediaUnderstandingProviders: [], imageGenerationProviders: [], webFetchProviders: [], diff --git a/src/plugin-sdk/core.ts b/src/plugin-sdk/core.ts index 937a43ecd7d..1f3e3172d0d 100644 --- a/src/plugin-sdk/core.ts +++ b/src/plugin-sdk/core.ts @@ -66,6 +66,7 @@ export type { ProviderReplaySessionState, ProviderResolveDynamicModelContext, ProviderResolvedUsageAuth, + RealtimeTranscriptionProviderPlugin, ProviderSanitizeReplayHistoryContext, ProviderToolSchemaDiagnostic, ProviderResolveUsageAuthContext, diff --git a/src/plugin-sdk/index.ts b/src/plugin-sdk/index.ts index 32e4e9a7ffe..ece2423d82d 100644 --- a/src/plugin-sdk/index.ts +++ b/src/plugin-sdk/index.ts @@ -51,6 +51,7 @@ export type { ProviderAuthContext, ProviderAuthResult, ProviderRuntimeModel, + RealtimeTranscriptionProviderPlugin, SpeechProviderPlugin, } from "../plugins/types.js"; export type { diff --git a/src/plugin-sdk/plugin-entry.ts b/src/plugin-sdk/plugin-entry.ts index bd5d7ad0843..f24e2be6a27 100644 --- a/src/plugin-sdk/plugin-entry.ts +++ b/src/plugin-sdk/plugin-entry.ts @@ -46,6 +46,7 @@ import type { ProviderReplayPolicyContext, ProviderReplaySessionEntry, ProviderReplaySessionState, + RealtimeTranscriptionProviderPlugin, ProviderResolvedUsageAuth, ProviderResolveDynamicModelContext, ProviderSanitizeReplayHistoryContext, @@ -102,6 +103,7 @@ export type { ProviderResolveDynamicModelContext, ProviderNormalizeResolvedModelContext, ProviderRuntimeModel, + RealtimeTranscriptionProviderPlugin, SpeechProviderPlugin, ProviderThinkingPolicyContext, ProviderValidateReplayTurnsContext, diff --git a/src/plugin-sdk/realtime-transcription.ts b/src/plugin-sdk/realtime-transcription.ts new file mode 100644 index 00000000000..e0f68005b07 --- /dev/null +++ b/src/plugin-sdk/realtime-transcription.ts @@ -0,0 +1,16 @@ +export type { RealtimeTranscriptionProviderPlugin } from "../plugins/types.js"; +export type { + 
RealtimeTranscriptionProviderConfig, + RealtimeTranscriptionProviderConfiguredContext, + RealtimeTranscriptionProviderId, + RealtimeTranscriptionProviderResolveConfigContext, + RealtimeTranscriptionSession, + RealtimeTranscriptionSessionCallbacks, + RealtimeTranscriptionSessionCreateRequest, +} from "../realtime-transcription/provider-types.js"; +export { + canonicalizeRealtimeTranscriptionProviderId, + getRealtimeTranscriptionProvider, + listRealtimeTranscriptionProviders, + normalizeRealtimeTranscriptionProviderId, +} from "../realtime-transcription/provider-registry.js"; diff --git a/src/plugin-sdk/realtime-voice.ts b/src/plugin-sdk/realtime-voice.ts new file mode 100644 index 00000000000..41e2ed77400 --- /dev/null +++ b/src/plugin-sdk/realtime-voice.ts @@ -0,0 +1,20 @@ +export type { RealtimeVoiceProviderPlugin } from "../plugins/types.js"; +export type { + RealtimeVoiceBridge, + RealtimeVoiceBridgeCallbacks, + RealtimeVoiceBridgeCreateRequest, + RealtimeVoiceCloseReason, + RealtimeVoiceProviderConfig, + RealtimeVoiceProviderConfiguredContext, + RealtimeVoiceProviderId, + RealtimeVoiceProviderResolveConfigContext, + RealtimeVoiceRole, + RealtimeVoiceTool, + RealtimeVoiceToolCallEvent, +} from "../realtime-voice/provider-types.js"; +export { + canonicalizeRealtimeVoiceProviderId, + getRealtimeVoiceProvider, + listRealtimeVoiceProviders, + normalizeRealtimeVoiceProviderId, +} from "../realtime-voice/provider-registry.js"; diff --git a/src/plugin-sdk/speech.ts b/src/plugin-sdk/speech.ts index 019f80e9ed6..55069df025f 100644 --- a/src/plugin-sdk/speech.ts +++ b/src/plugin-sdk/speech.ts @@ -1,7 +1,12 @@ +import { rmSync } from "node:fs"; +import type { OpenClawConfig } from "../config/config.js"; +import type { ResolvedTtsConfig } from "../tts/tts.js"; + // Public speech helpers for bundled or third-party plugins. // -// Keep this surface neutral. 
Provider plugins should not need to know about the -// bundled `speech-core` plugin id just to consume shared speech types/helpers. +// Keep this surface neutral and import-light. Provider builders commonly import +// this module just to get types and a few validation helpers, so avoid pulling +// in the heavy TTS runtime graph at module load time. export type { SpeechProviderPlugin } from "../plugins/types.js"; export type { @@ -22,14 +27,6 @@ export type { TtsDirectiveParseResult, } from "../tts/provider-types.js"; -export { - scheduleCleanup, - summarizeText, - normalizeApplyTextNormalization, - normalizeLanguageCode, - normalizeSeed, - requireInRange, -} from "../tts/tts-core.js"; export { parseTtsDirectives } from "../tts/directives.js"; export { canonicalizeSpeechProviderId, @@ -44,3 +41,71 @@ export { trimToUndefined, truncateErrorDetail, } from "../tts/provider-error-utils.js"; + +const TEMP_FILE_CLEANUP_DELAY_MS = 5 * 60 * 1000; // 5 minutes + +export function requireInRange(value: number, min: number, max: number, label: string): void { + if (!Number.isFinite(value) || value < min || value > max) { + throw new Error(`${label} must be between ${min} and ${max}`); + } +} + +export function normalizeLanguageCode(code?: string): string | undefined { + const trimmed = code?.trim(); + if (!trimmed) { + return undefined; + } + const normalized = trimmed.toLowerCase(); + if (!/^[a-z]{2}$/.test(normalized)) { + throw new Error("languageCode must be a 2-letter ISO 639-1 code (e.g. 
en, de, fr)"); + } + return normalized; +} + +export function normalizeApplyTextNormalization(mode?: string): "auto" | "on" | "off" | undefined { + const trimmed = mode?.trim(); + if (!trimmed) { + return undefined; + } + const normalized = trimmed.toLowerCase(); + if (normalized === "auto" || normalized === "on" || normalized === "off") { + return normalized; + } + throw new Error("applyTextNormalization must be one of: auto, on, off"); +} + +export function normalizeSeed(seed?: number): number | undefined { + if (seed == null) { + return undefined; + } + const next = Math.floor(seed); + if (!Number.isFinite(next) || next < 0 || next > 4_294_967_295) { + throw new Error("seed must be between 0 and 4294967295"); + } + return next; +} + +export function scheduleCleanup( + tempDir: string, + delayMs: number = TEMP_FILE_CLEANUP_DELAY_MS, +): void { + const timer = setTimeout(() => { + try { + rmSync(tempDir, { recursive: true, force: true }); + } catch { + // ignore cleanup errors + } + }, delayMs); + timer.unref(); +} + +export async function summarizeText(params: { + text: string; + targetLength: number; + cfg: OpenClawConfig; + config: ResolvedTtsConfig; + timeoutMs: number; +}) { + const { summarizeText: summarizeTextRuntime } = await import("../tts/tts-core.js"); + return summarizeTextRuntime(params); +} diff --git a/src/plugins/api-builder.ts b/src/plugins/api-builder.ts index ab8c66cec10..0c5906758c1 100644 --- a/src/plugins/api-builder.ts +++ b/src/plugins/api-builder.ts @@ -28,6 +28,8 @@ export type BuildPluginApiParams = { | "registerCliBackend" | "registerProvider" | "registerSpeechProvider" + | "registerRealtimeTranscriptionProvider" + | "registerRealtimeVoiceProvider" | "registerMediaUnderstandingProvider" | "registerImageGenerationProvider" | "registerWebFetchProvider" @@ -55,6 +57,10 @@ const noopRegisterService: OpenClawPluginApi["registerService"] = () => {}; const noopRegisterCliBackend: OpenClawPluginApi["registerCliBackend"] = () => {}; const 
noopRegisterProvider: OpenClawPluginApi["registerProvider"] = () => {}; const noopRegisterSpeechProvider: OpenClawPluginApi["registerSpeechProvider"] = () => {}; +const noopRegisterRealtimeTranscriptionProvider: OpenClawPluginApi["registerRealtimeTranscriptionProvider"] = + () => {}; +const noopRegisterRealtimeVoiceProvider: OpenClawPluginApi["registerRealtimeVoiceProvider"] = + () => {}; const noopRegisterMediaUnderstandingProvider: OpenClawPluginApi["registerMediaUnderstandingProvider"] = () => {}; const noopRegisterImageGenerationProvider: OpenClawPluginApi["registerImageGenerationProvider"] = @@ -97,6 +103,10 @@ export function buildPluginApi(params: BuildPluginApiParams): OpenClawPluginApi registerCliBackend: handlers.registerCliBackend ?? noopRegisterCliBackend, registerProvider: handlers.registerProvider ?? noopRegisterProvider, registerSpeechProvider: handlers.registerSpeechProvider ?? noopRegisterSpeechProvider, + registerRealtimeTranscriptionProvider: + handlers.registerRealtimeTranscriptionProvider ?? noopRegisterRealtimeTranscriptionProvider, + registerRealtimeVoiceProvider: + handlers.registerRealtimeVoiceProvider ?? noopRegisterRealtimeVoiceProvider, registerMediaUnderstandingProvider: handlers.registerMediaUnderstandingProvider ?? 
noopRegisterMediaUnderstandingProvider, registerImageGenerationProvider: diff --git a/src/plugins/bundled-capability-metadata.test.ts b/src/plugins/bundled-capability-metadata.test.ts index 5bcfd4c5872..0a34fb355b5 100644 --- a/src/plugins/bundled-capability-metadata.test.ts +++ b/src/plugins/bundled-capability-metadata.test.ts @@ -28,6 +28,10 @@ describe("bundled capability metadata", () => { cliBackendIds: uniqueStrings(manifest.cliBackends), providerIds: uniqueStrings(manifest.providers), speechProviderIds: uniqueStrings(manifest.contracts?.speechProviders), + realtimeTranscriptionProviderIds: uniqueStrings( + manifest.contracts?.realtimeTranscriptionProviders, + ), + realtimeVoiceProviderIds: uniqueStrings(manifest.contracts?.realtimeVoiceProviders), mediaUnderstandingProviderIds: uniqueStrings( manifest.contracts?.mediaUnderstandingProviders, ), @@ -41,6 +45,8 @@ describe("bundled capability metadata", () => { entry.cliBackendIds.length > 0 || entry.providerIds.length > 0 || entry.speechProviderIds.length > 0 || + entry.realtimeTranscriptionProviderIds.length > 0 || + entry.realtimeVoiceProviderIds.length > 0 || entry.mediaUnderstandingProviderIds.length > 0 || entry.imageGenerationProviderIds.length > 0 || entry.webFetchProviderIds.length > 0 || diff --git a/src/plugins/bundled-capability-metadata.ts b/src/plugins/bundled-capability-metadata.ts index ebde71de3a9..7a7186153b6 100644 --- a/src/plugins/bundled-capability-metadata.ts +++ b/src/plugins/bundled-capability-metadata.ts @@ -5,6 +5,8 @@ export type BundledPluginContractSnapshot = { cliBackendIds: string[]; providerIds: string[]; speechProviderIds: string[]; + realtimeTranscriptionProviderIds: string[]; + realtimeVoiceProviderIds: string[]; mediaUnderstandingProviderIds: string[]; imageGenerationProviderIds: string[]; webFetchProviderIds: string[]; @@ -37,6 +39,10 @@ export const BUNDLED_PLUGIN_CONTRACT_SNAPSHOTS: readonly BundledPluginContractSn cliBackendIds: uniqueStrings(manifest.cliBackends), 
providerIds: uniqueStrings(manifest.providers), speechProviderIds: uniqueStrings(manifest.contracts?.speechProviders), + realtimeTranscriptionProviderIds: uniqueStrings( + manifest.contracts?.realtimeTranscriptionProviders, + ), + realtimeVoiceProviderIds: uniqueStrings(manifest.contracts?.realtimeVoiceProviders), mediaUnderstandingProviderIds: uniqueStrings(manifest.contracts?.mediaUnderstandingProviders), imageGenerationProviderIds: uniqueStrings(manifest.contracts?.imageGenerationProviders), webFetchProviderIds: uniqueStrings(manifest.contracts?.webFetchProviders), @@ -48,6 +54,8 @@ export const BUNDLED_PLUGIN_CONTRACT_SNAPSHOTS: readonly BundledPluginContractSn entry.cliBackendIds.length > 0 || entry.providerIds.length > 0 || entry.speechProviderIds.length > 0 || + entry.realtimeTranscriptionProviderIds.length > 0 || + entry.realtimeVoiceProviderIds.length > 0 || entry.mediaUnderstandingProviderIds.length > 0 || entry.imageGenerationProviderIds.length > 0 || entry.webFetchProviderIds.length > 0 || @@ -68,6 +76,14 @@ export const BUNDLED_PROVIDER_PLUGIN_IDS = collectPluginIds((entry) => entry.pro export const BUNDLED_SPEECH_PLUGIN_IDS = collectPluginIds((entry) => entry.speechProviderIds); +export const BUNDLED_REALTIME_TRANSCRIPTION_PLUGIN_IDS = collectPluginIds( + (entry) => entry.realtimeTranscriptionProviderIds, +); + +export const BUNDLED_REALTIME_VOICE_PLUGIN_IDS = collectPluginIds( + (entry) => entry.realtimeVoiceProviderIds, +); + export const BUNDLED_MEDIA_UNDERSTANDING_PLUGIN_IDS = collectPluginIds( (entry) => entry.mediaUnderstandingProviderIds, ); @@ -84,6 +100,8 @@ export const BUNDLED_RUNTIME_CONTRACT_PLUGIN_IDS = [ (entry) => entry.providerIds.length > 0 || entry.speechProviderIds.length > 0 || + entry.realtimeTranscriptionProviderIds.length > 0 || + entry.realtimeVoiceProviderIds.length > 0 || entry.mediaUnderstandingProviderIds.length > 0 || entry.imageGenerationProviderIds.length > 0 || entry.webFetchProviderIds.length > 0 || diff --git 
a/src/plugins/bundled-capability-runtime.ts b/src/plugins/bundled-capability-runtime.ts index b7e67dd5aab..c44dd875e52 100644 --- a/src/plugins/bundled-capability-runtime.ts +++ b/src/plugins/bundled-capability-runtime.ts @@ -122,6 +122,8 @@ function createCapabilityPluginRecord(params: { cliBackendIds: [], providerIds: [], speechProviderIds: [], + realtimeTranscriptionProviderIds: [], + realtimeVoiceProviderIds: [], mediaUnderstandingProviderIds: [], imageGenerationProviderIds: [], webFetchProviderIds: [], @@ -272,6 +274,12 @@ export function loadBundledCapabilityRuntimeRegistry(params: { record.cliBackendIds.push(...captured.cliBackends.map((entry) => entry.id)); record.providerIds.push(...captured.providers.map((entry) => entry.id)); record.speechProviderIds.push(...captured.speechProviders.map((entry) => entry.id)); + record.realtimeTranscriptionProviderIds.push( + ...captured.realtimeTranscriptionProviders.map((entry) => entry.id), + ); + record.realtimeVoiceProviderIds.push( + ...captured.realtimeVoiceProviders.map((entry) => entry.id), + ); record.mediaUnderstandingProviderIds.push( ...captured.mediaUnderstandingProviders.map((entry) => entry.id), ); @@ -309,6 +317,24 @@ export function loadBundledCapabilityRuntimeRegistry(params: { rootDir: record.rootDir, })), ); + registry.realtimeTranscriptionProviders.push( + ...captured.realtimeTranscriptionProviders.map((provider) => ({ + pluginId: record.id, + pluginName: record.name, + provider, + source: record.source, + rootDir: record.rootDir, + })), + ); + registry.realtimeVoiceProviders.push( + ...captured.realtimeVoiceProviders.map((provider) => ({ + pluginId: record.id, + pluginName: record.name, + provider, + source: record.source, + rootDir: record.rootDir, + })), + ); registry.mediaUnderstandingProviders.push( ...captured.mediaUnderstandingProviders.map((provider) => ({ pluginId: record.id, diff --git a/src/plugins/capability-provider-runtime.test.ts b/src/plugins/capability-provider-runtime.test.ts index 
cb65392f08c..78b8a5e40c8 100644 --- a/src/plugins/capability-provider-runtime.test.ts +++ b/src/plugins/capability-provider-runtime.test.ts @@ -102,7 +102,12 @@ function setBundledCapabilityFixture(contractKey: string) { } function expectCompatChainApplied(params: { - key: "speechProviders" | "mediaUnderstandingProviders" | "imageGenerationProviders"; + key: + | "speechProviders" + | "realtimeTranscriptionProviders" + | "realtimeVoiceProviders" + | "mediaUnderstandingProviders" + | "imageGenerationProviders"; contractKey: string; cfg: OpenClawConfig; enablementCompat: { @@ -201,6 +206,8 @@ describe("resolvePluginCapabilityProviders", () => { it.each([ ["speechProviders", "speechProviders"], + ["realtimeTranscriptionProviders", "realtimeTranscriptionProviders"], + ["realtimeVoiceProviders", "realtimeVoiceProviders"], ["mediaUnderstandingProviders", "mediaUnderstandingProviders"], ["imageGenerationProviders", "imageGenerationProviders"], ] as const)("applies bundled compat before fallback loading for %s", (key, contractKey) => { diff --git a/src/plugins/capability-provider-runtime.ts b/src/plugins/capability-provider-runtime.ts index 195aa0ba5d3..4e41fce5f87 100644 --- a/src/plugins/capability-provider-runtime.ts +++ b/src/plugins/capability-provider-runtime.ts @@ -9,11 +9,15 @@ import type { PluginRegistry } from "./registry.js"; type CapabilityProviderRegistryKey = | "speechProviders" + | "realtimeTranscriptionProviders" + | "realtimeVoiceProviders" | "mediaUnderstandingProviders" | "imageGenerationProviders"; type CapabilityContractKey = | "speechProviders" + | "realtimeTranscriptionProviders" + | "realtimeVoiceProviders" | "mediaUnderstandingProviders" | "imageGenerationProviders"; @@ -22,6 +26,8 @@ type CapabilityProviderForKey = const CAPABILITY_CONTRACT_KEY: Record = { speechProviders: "speechProviders", + realtimeTranscriptionProviders: "realtimeTranscriptionProviders", + realtimeVoiceProviders: "realtimeVoiceProviders", mediaUnderstandingProviders: 
"mediaUnderstandingProviders", imageGenerationProviders: "imageGenerationProviders", }; diff --git a/src/plugins/captured-registration.ts b/src/plugins/captured-registration.ts index 8f20450e517..c816c2d1464 100644 --- a/src/plugins/captured-registration.ts +++ b/src/plugins/captured-registration.ts @@ -10,6 +10,8 @@ import type { OpenClawPluginCliCommandDescriptor, OpenClawPluginCliRegistrar, ProviderPlugin, + RealtimeTranscriptionProviderPlugin, + RealtimeVoiceProviderPlugin, SpeechProviderPlugin, WebFetchProviderPlugin, WebSearchProviderPlugin, @@ -27,6 +29,8 @@ export type CapturedPluginRegistration = { cliRegistrars: CapturedPluginCliRegistration[]; cliBackends: CliBackendPlugin[]; speechProviders: SpeechProviderPlugin[]; + realtimeTranscriptionProviders: RealtimeTranscriptionProviderPlugin[]; + realtimeVoiceProviders: RealtimeVoiceProviderPlugin[]; mediaUnderstandingProviders: MediaUnderstandingProviderPlugin[]; imageGenerationProviders: ImageGenerationProviderPlugin[]; webFetchProviders: WebFetchProviderPlugin[]; @@ -42,6 +46,8 @@ export function createCapturedPluginRegistration(params?: { const cliRegistrars: CapturedPluginCliRegistration[] = []; const cliBackends: CliBackendPlugin[] = []; const speechProviders: SpeechProviderPlugin[] = []; + const realtimeTranscriptionProviders: RealtimeTranscriptionProviderPlugin[] = []; + const realtimeVoiceProviders: RealtimeVoiceProviderPlugin[] = []; const mediaUnderstandingProviders: MediaUnderstandingProviderPlugin[] = []; const imageGenerationProviders: ImageGenerationProviderPlugin[] = []; const webFetchProviders: WebFetchProviderPlugin[] = []; @@ -59,6 +65,8 @@ export function createCapturedPluginRegistration(params?: { cliRegistrars, cliBackends, speechProviders, + realtimeTranscriptionProviders, + realtimeVoiceProviders, mediaUnderstandingProviders, imageGenerationProviders, webFetchProviders, @@ -106,6 +114,12 @@ export function createCapturedPluginRegistration(params?: { registerSpeechProvider(provider: 
SpeechProviderPlugin) { speechProviders.push(provider); }, + registerRealtimeTranscriptionProvider(provider: RealtimeTranscriptionProviderPlugin) { + realtimeTranscriptionProviders.push(provider); + }, + registerRealtimeVoiceProvider(provider: RealtimeVoiceProviderPlugin) { + realtimeVoiceProviders.push(provider); + }, registerMediaUnderstandingProvider(provider: MediaUnderstandingProviderPlugin) { mediaUnderstandingProviders.push(provider); }, diff --git a/src/plugins/cli.ts b/src/plugins/cli.ts index 3d8f639ac87..a495693b6e9 100644 --- a/src/plugins/cli.ts +++ b/src/plugins/cli.ts @@ -155,9 +155,10 @@ async function loadPluginCliCommandRegistry( export async function getPluginCliCommandDescriptors( cfg?: OpenClawConfig, env?: NodeJS.ProcessEnv, + loaderOptions?: Pick, ): Promise { try { - const { registry } = await loadPluginCliMetadataRegistry(cfg, env); + const { registry } = await loadPluginCliMetadataRegistry(cfg, env, loaderOptions); const seen = new Set(); const descriptors: OpenClawPluginCliCommandDescriptor[] = []; for (const entry of registry.cliRegistrars) { diff --git a/src/plugins/contracts/registry.contract.test.ts b/src/plugins/contracts/registry.contract.test.ts index a0891beb395..70bd8bad2a9 100644 --- a/src/plugins/contracts/registry.contract.test.ts +++ b/src/plugins/contracts/registry.contract.test.ts @@ -8,6 +8,8 @@ import { pluginRegistrationContractRegistry, providerContractLoadError, providerContractPluginIds, + realtimeTranscriptionProviderContractRegistry, + realtimeVoiceProviderContractRegistry, resolveWebFetchProviderContractEntriesForPluginId, resolveWebSearchProviderContractEntriesForPluginId, speechProviderContractRegistry, @@ -27,7 +29,11 @@ describe("plugin contract registry", () => { predicate: (plugin: { origin: string; providers: unknown[]; - contracts?: { speechProviders?: unknown[] }; + contracts?: { + speechProviders?: unknown[]; + realtimeTranscriptionProviders?: unknown[]; + realtimeVoiceProviders?: unknown[]; + }; }) => 
boolean; }) { expect(uniqueSortedStrings(params.actualPluginIds)).toEqual( @@ -39,7 +45,11 @@ describe("plugin contract registry", () => { predicate: (plugin: { origin: string; providers: unknown[]; - contracts?: { speechProviders?: unknown[] }; + contracts?: { + speechProviders?: unknown[]; + realtimeTranscriptionProviders?: unknown[]; + realtimeVoiceProviders?: unknown[]; + }; }) => boolean, ) { return loadPluginManifestRegistry({}) @@ -70,6 +80,14 @@ describe("plugin contract registry", () => { name: "does not duplicate bundled media provider ids", ids: () => mediaUnderstandingProviderContractRegistry.map((entry) => entry.provider.id), }, + { + name: "does not duplicate bundled realtime transcription provider ids", + ids: () => realtimeTranscriptionProviderContractRegistry.map((entry) => entry.provider.id), + }, + { + name: "does not duplicate bundled realtime voice provider ids", + ids: () => realtimeVoiceProviderContractRegistry.map((entry) => entry.provider.id), + }, { name: "does not duplicate bundled image-generation provider ids", ids: () => imageGenerationProviderContractRegistry.map((entry) => entry.provider.id), @@ -101,6 +119,23 @@ describe("plugin contract registry", () => { }); }); + it("covers every bundled realtime voice plugin discovered from manifests", () => { + expectRegistryPluginIds({ + actualPluginIds: realtimeVoiceProviderContractRegistry.map((entry) => entry.pluginId), + predicate: (plugin) => + plugin.origin === "bundled" && (plugin.contracts?.realtimeVoiceProviders?.length ?? 0) > 0, + }); + }); + + it("covers every bundled realtime transcription plugin discovered from manifests", () => { + expectRegistryPluginIds({ + actualPluginIds: realtimeTranscriptionProviderContractRegistry.map((entry) => entry.pluginId), + predicate: (plugin) => + plugin.origin === "bundled" && + (plugin.contracts?.realtimeTranscriptionProviders?.length ?? 
0) > 0, + }); + }); + it("covers every bundled web fetch plugin from the shared resolver", () => { const bundledWebFetchPluginIds = resolveBundledWebFetchPluginIds({}); diff --git a/src/plugins/contracts/registry.ts b/src/plugins/contracts/registry.ts index 2728de07bc1..dfe446209b5 100644 --- a/src/plugins/contracts/registry.ts +++ b/src/plugins/contracts/registry.ts @@ -3,6 +3,8 @@ import { BUNDLED_MEDIA_UNDERSTANDING_PLUGIN_IDS, BUNDLED_PLUGIN_CONTRACT_SNAPSHOTS, BUNDLED_PROVIDER_PLUGIN_IDS, + BUNDLED_REALTIME_TRANSCRIPTION_PLUGIN_IDS, + BUNDLED_REALTIME_VOICE_PLUGIN_IDS, BUNDLED_SPEECH_PLUGIN_IDS, BUNDLED_WEB_FETCH_PLUGIN_IDS, BUNDLED_WEB_SEARCH_PLUGIN_IDS, @@ -12,6 +14,8 @@ import type { ImageGenerationProviderPlugin, MediaUnderstandingProviderPlugin, ProviderPlugin, + RealtimeTranscriptionProviderPlugin, + RealtimeVoiceProviderPlugin, SpeechProviderPlugin, WebFetchProviderPlugin, WebSearchProviderPlugin, @@ -19,6 +23,8 @@ import type { import { loadVitestImageGenerationProviderContractRegistry, loadVitestMediaUnderstandingProviderContractRegistry, + loadVitestRealtimeTranscriptionProviderContractRegistry, + loadVitestRealtimeVoiceProviderContractRegistry, loadVitestSpeechProviderContractRegistry, } from "./speech-vitest-registry.js"; @@ -38,6 +44,9 @@ type WebFetchProviderContractEntry = CapabilityContractEntry; +type RealtimeTranscriptionProviderContractEntry = + CapabilityContractEntry; +type RealtimeVoiceProviderContractEntry = CapabilityContractEntry; type MediaUnderstandingProviderContractEntry = CapabilityContractEntry; type ImageGenerationProviderContractEntry = CapabilityContractEntry; @@ -47,6 +56,8 @@ type PluginRegistrationContractEntry = { cliBackendIds: string[]; providerIds: string[]; speechProviderIds: string[]; + realtimeTranscriptionProviderIds: string[]; + realtimeVoiceProviderIds: string[]; mediaUnderstandingProviderIds: string[]; imageGenerationProviderIds: string[]; webFetchProviderIds: string[]; @@ -94,6 +105,10 @@ let 
webSearchProviderContractRegistryByPluginIdCache: Map< WebSearchProviderContractEntry[] > | null = null; let speechProviderContractRegistryCache: SpeechProviderContractEntry[] | null = null; +let realtimeTranscriptionProviderContractRegistryCache: + | RealtimeTranscriptionProviderContractEntry[] + | null = null; +let realtimeVoiceProviderContractRegistryCache: RealtimeVoiceProviderContractEntry[] | null = null; let mediaUnderstandingProviderContractRegistryCache: | MediaUnderstandingProviderContractEntry[] | null = null; @@ -387,6 +402,36 @@ function loadSpeechProviderContractRegistry(): SpeechProviderContractEntry[] { return speechProviderContractRegistryCache; } +function loadRealtimeVoiceProviderContractRegistry(): RealtimeVoiceProviderContractEntry[] { + if (!realtimeVoiceProviderContractRegistryCache) { + realtimeVoiceProviderContractRegistryCache = process.env.VITEST + ? loadVitestRealtimeVoiceProviderContractRegistry() + : loadBundledCapabilityRuntimeRegistry({ + pluginIds: BUNDLED_REALTIME_VOICE_PLUGIN_IDS, + pluginSdkResolution: "dist", + }).realtimeVoiceProviders.map((entry) => ({ + pluginId: entry.pluginId, + provider: entry.provider, + })); + } + return realtimeVoiceProviderContractRegistryCache; +} + +function loadRealtimeTranscriptionProviderContractRegistry(): RealtimeTranscriptionProviderContractEntry[] { + if (!realtimeTranscriptionProviderContractRegistryCache) { + realtimeTranscriptionProviderContractRegistryCache = process.env.VITEST + ? 
loadVitestRealtimeTranscriptionProviderContractRegistry() + : loadBundledCapabilityRuntimeRegistry({ + pluginIds: BUNDLED_REALTIME_TRANSCRIPTION_PLUGIN_IDS, + pluginSdkResolution: "dist", + }).realtimeTranscriptionProviders.map((entry) => ({ + pluginId: entry.pluginId, + provider: entry.provider, + })); + } + return realtimeTranscriptionProviderContractRegistryCache; +} + function loadMediaUnderstandingProviderContractRegistry(): MediaUnderstandingProviderContractEntry[] { if (!mediaUnderstandingProviderContractRegistryCache) { mediaUnderstandingProviderContractRegistryCache = process.env.VITEST @@ -519,6 +564,12 @@ export const speechProviderContractRegistry: SpeechProviderContractEntry[] = cre loadSpeechProviderContractRegistry, ); +export const realtimeTranscriptionProviderContractRegistry: RealtimeTranscriptionProviderContractEntry[] = + createLazyArrayView(loadRealtimeTranscriptionProviderContractRegistry); + +export const realtimeVoiceProviderContractRegistry: RealtimeVoiceProviderContractEntry[] = + createLazyArrayView(loadRealtimeVoiceProviderContractRegistry); + export const mediaUnderstandingProviderContractRegistry: MediaUnderstandingProviderContractEntry[] = createLazyArrayView(loadMediaUnderstandingProviderContractRegistry); @@ -531,6 +582,8 @@ function loadPluginRegistrationContractRegistry(): PluginRegistrationContractEnt cliBackendIds: uniqueStrings(entry.cliBackendIds), providerIds: uniqueStrings(entry.providerIds), speechProviderIds: uniqueStrings(entry.speechProviderIds), + realtimeTranscriptionProviderIds: uniqueStrings(entry.realtimeTranscriptionProviderIds), + realtimeVoiceProviderIds: uniqueStrings(entry.realtimeVoiceProviderIds), mediaUnderstandingProviderIds: uniqueStrings(entry.mediaUnderstandingProviderIds), imageGenerationProviderIds: uniqueStrings(entry.imageGenerationProviderIds), webFetchProviderIds: uniqueStrings(entry.webFetchProviderIds), diff --git a/src/plugins/contracts/speech-vitest-registry.ts 
b/src/plugins/contracts/speech-vitest-registry.ts index f5865612dd5..9fb083b7804 100644 --- a/src/plugins/contracts/speech-vitest-registry.ts +++ b/src/plugins/contracts/speech-vitest-registry.ts @@ -5,6 +5,8 @@ import { createJiti } from "jiti"; import { BUNDLED_IMAGE_GENERATION_PLUGIN_IDS, BUNDLED_MEDIA_UNDERSTANDING_PLUGIN_IDS, + BUNDLED_REALTIME_TRANSCRIPTION_PLUGIN_IDS, + BUNDLED_REALTIME_VOICE_PLUGIN_IDS, BUNDLED_SPEECH_PLUGIN_IDS, } from "../bundled-capability-metadata.js"; import { loadBundledCapabilityRuntimeRegistry } from "../bundled-capability-runtime.js"; @@ -13,6 +15,8 @@ import { buildPluginLoaderAliasMap, buildPluginLoaderJitiOptions } from "../sdk- import type { ImageGenerationProviderPlugin, MediaUnderstandingProviderPlugin, + RealtimeTranscriptionProviderPlugin, + RealtimeVoiceProviderPlugin, SpeechProviderPlugin, } from "../types.js"; @@ -26,6 +30,16 @@ export type MediaUnderstandingProviderContractEntry = { provider: MediaUnderstandingProviderPlugin; }; +export type RealtimeVoiceProviderContractEntry = { + pluginId: string; + provider: RealtimeVoiceProviderPlugin; +}; + +export type RealtimeTranscriptionProviderContractEntry = { + pluginId: string; + provider: RealtimeTranscriptionProviderPlugin; +}; + export type ImageGenerationProviderContractEntry = { pluginId: string; provider: ImageGenerationProviderPlugin; @@ -190,6 +204,96 @@ export function loadVitestMediaUnderstandingProviderContractRegistry(): MediaUnd return registrations; } +export function loadVitestRealtimeVoiceProviderContractRegistry(): RealtimeVoiceProviderContractEntry[] { + const registrations: RealtimeVoiceProviderContractEntry[] = []; + const { manifests, unresolvedPluginIds } = resolveTestApiModuleRecords( + BUNDLED_REALTIME_VOICE_PLUGIN_IDS, + ); + + for (const plugin of manifests) { + if (!plugin.rootDir) { + continue; + } + const testApiPath = path.join(plugin.rootDir, "test-api.ts"); + if (!fs.existsSync(testApiPath)) { + continue; + } + const builder = 
resolveNamedBuilder( + createVitestCapabilityLoader(testApiPath)(testApiPath), + /^build.+RealtimeVoiceProvider$/u, + ); + if (!builder) { + continue; + } + registrations.push({ + pluginId: plugin.id, + provider: builder(), + }); + unresolvedPluginIds.delete(plugin.id); + } + + if (unresolvedPluginIds.size === 0) { + return registrations; + } + + const runtimeRegistry = loadBundledCapabilityRuntimeRegistry({ + pluginIds: [...unresolvedPluginIds], + pluginSdkResolution: "dist", + }); + registrations.push( + ...runtimeRegistry.realtimeVoiceProviders.map((entry) => ({ + pluginId: entry.pluginId, + provider: entry.provider, + })), + ); + return registrations; +} + +export function loadVitestRealtimeTranscriptionProviderContractRegistry(): RealtimeTranscriptionProviderContractEntry[] { + const registrations: RealtimeTranscriptionProviderContractEntry[] = []; + const { manifests, unresolvedPluginIds } = resolveTestApiModuleRecords( + BUNDLED_REALTIME_TRANSCRIPTION_PLUGIN_IDS, + ); + + for (const plugin of manifests) { + if (!plugin.rootDir) { + continue; + } + const testApiPath = path.join(plugin.rootDir, "test-api.ts"); + if (!fs.existsSync(testApiPath)) { + continue; + } + const builder = resolveNamedBuilder( + createVitestCapabilityLoader(testApiPath)(testApiPath), + /^build.+RealtimeTranscriptionProvider$/u, + ); + if (!builder) { + continue; + } + registrations.push({ + pluginId: plugin.id, + provider: builder(), + }); + unresolvedPluginIds.delete(plugin.id); + } + + if (unresolvedPluginIds.size === 0) { + return registrations; + } + + const runtimeRegistry = loadBundledCapabilityRuntimeRegistry({ + pluginIds: [...unresolvedPluginIds], + pluginSdkResolution: "dist", + }); + registrations.push( + ...runtimeRegistry.realtimeTranscriptionProviders.map((entry) => ({ + pluginId: entry.pluginId, + provider: entry.provider, + })), + ); + return registrations; +} + export function loadVitestImageGenerationProviderContractRegistry(): ImageGenerationProviderContractEntry[] { 
const registrations: ImageGenerationProviderContractEntry[] = []; const { manifests, unresolvedPluginIds } = resolveTestApiModuleRecords( diff --git a/src/plugins/loader.ts b/src/plugins/loader.ts index e1625766b06..c89b0ea0558 100644 --- a/src/plugins/loader.ts +++ b/src/plugins/loader.ts @@ -590,6 +590,8 @@ function createPluginRecord(params: { cliBackendIds: [], providerIds: [], speechProviderIds: [], + realtimeTranscriptionProviderIds: [], + realtimeVoiceProviderIds: [], mediaUnderstandingProviderIds: [], imageGenerationProviderIds: [], webFetchProviderIds: [], diff --git a/src/plugins/manifest.ts b/src/plugins/manifest.ts index cc55beb5c3a..7dd7e5967f8 100644 --- a/src/plugins/manifest.ts +++ b/src/plugins/manifest.ts @@ -52,6 +52,8 @@ export type PluginManifest = { export type PluginManifestContracts = { speechProviders?: string[]; + realtimeTranscriptionProviders?: string[]; + realtimeVoiceProviders?: string[]; mediaUnderstandingProviders?: string[]; imageGenerationProviders?: string[]; webFetchProviders?: string[]; @@ -125,6 +127,8 @@ function normalizeManifestContracts(value: unknown): PluginManifestContracts | u } const speechProviders = normalizeStringList(value.speechProviders); + const realtimeTranscriptionProviders = normalizeStringList(value.realtimeTranscriptionProviders); + const realtimeVoiceProviders = normalizeStringList(value.realtimeVoiceProviders); const mediaUnderstandingProviders = normalizeStringList(value.mediaUnderstandingProviders); const imageGenerationProviders = normalizeStringList(value.imageGenerationProviders); const webFetchProviders = normalizeStringList(value.webFetchProviders); @@ -132,6 +136,8 @@ function normalizeManifestContracts(value: unknown): PluginManifestContracts | u const tools = normalizeStringList(value.tools); const contracts = { ...(speechProviders.length > 0 ? { speechProviders } : {}), + ...(realtimeTranscriptionProviders.length > 0 ? 
{ realtimeTranscriptionProviders } : {}), + ...(realtimeVoiceProviders.length > 0 ? { realtimeVoiceProviders } : {}), ...(mediaUnderstandingProviders.length > 0 ? { mediaUnderstandingProviders } : {}), ...(imageGenerationProviders.length > 0 ? { imageGenerationProviders } : {}), ...(webFetchProviders.length > 0 ? { webFetchProviders } : {}), diff --git a/src/plugins/registry-empty.ts b/src/plugins/registry-empty.ts index 7e8698cfdd7..ee7183e6d66 100644 --- a/src/plugins/registry-empty.ts +++ b/src/plugins/registry-empty.ts @@ -11,6 +11,8 @@ export function createEmptyPluginRegistry(): PluginRegistry { providers: [], cliBackends: [], speechProviders: [], + realtimeTranscriptionProviders: [], + realtimeVoiceProviders: [], mediaUnderstandingProviders: [], imageGenerationProviders: [], webFetchProviders: [], diff --git a/src/plugins/registry.ts b/src/plugins/registry.ts index b221fde6aa3..a6d19f4f8c8 100644 --- a/src/plugins/registry.ts +++ b/src/plugins/registry.ts @@ -38,7 +38,7 @@ import { import type { CliBackendPlugin, ImageGenerationProviderPlugin, - WebFetchProviderPlugin, + RealtimeTranscriptionProviderPlugin, OpenClawPluginApi, OpenClawPluginChannelRegistration, OpenClawPluginCliCommandDescriptor, @@ -52,6 +52,7 @@ import type { OpenClawPluginHookOptions, MediaUnderstandingProviderPlugin, ProviderPlugin, + RealtimeVoiceProviderPlugin, OpenClawPluginService, OpenClawPluginToolContext, OpenClawPluginToolFactory, @@ -67,6 +68,7 @@ import type { PluginHookHandlerMap, PluginHookRegistration as TypedPluginHookRegistration, SpeechProviderPlugin, + WebFetchProviderPlugin, WebSearchProviderPlugin, } from "./types.js"; @@ -142,6 +144,10 @@ type PluginOwnedProviderRegistration = { export type PluginSpeechProviderRegistration = PluginOwnedProviderRegistration; +export type PluginRealtimeTranscriptionProviderRegistration = + PluginOwnedProviderRegistration; +export type PluginRealtimeVoiceProviderRegistration = + PluginOwnedProviderRegistration; export type 
PluginMediaUnderstandingProviderRegistration = PluginOwnedProviderRegistration; export type PluginImageGenerationProviderRegistration = @@ -213,6 +219,8 @@ export type PluginRecord = { cliBackendIds: string[]; providerIds: string[]; speechProviderIds: string[]; + realtimeTranscriptionProviderIds: string[]; + realtimeVoiceProviderIds: string[]; mediaUnderstandingProviderIds: string[]; imageGenerationProviderIds: string[]; webFetchProviderIds: string[]; @@ -239,6 +247,8 @@ export type PluginRegistry = { providers: PluginProviderRegistration[]; cliBackends?: PluginCliBackendRegistration[]; speechProviders: PluginSpeechProviderRegistration[]; + realtimeTranscriptionProviders: PluginRealtimeTranscriptionProviderRegistration[]; + realtimeVoiceProviders: PluginRealtimeVoiceProviderRegistration[]; mediaUnderstandingProviders: PluginMediaUnderstandingProviderRegistration[]; imageGenerationProviders: PluginImageGenerationProviderRegistration[]; webFetchProviders: PluginWebFetchProviderRegistration[]; @@ -699,6 +709,32 @@ export function createPluginRegistry(registryParams: PluginRegistryParams) { }); }; + const registerRealtimeTranscriptionProvider = ( + record: PluginRecord, + provider: RealtimeTranscriptionProviderPlugin, + ) => { + registerUniqueProviderLike({ + record, + provider, + kindLabel: "realtime transcription provider", + registrations: registry.realtimeTranscriptionProviders, + ownedIds: record.realtimeTranscriptionProviderIds, + }); + }; + + const registerRealtimeVoiceProvider = ( + record: PluginRecord, + provider: RealtimeVoiceProviderPlugin, + ) => { + registerUniqueProviderLike({ + record, + provider, + kindLabel: "realtime voice provider", + registrations: registry.realtimeVoiceProviders, + ownedIds: record.realtimeVoiceProviderIds, + }); + }; + const registerMediaUnderstandingProvider = ( record: PluginRecord, provider: MediaUnderstandingProviderPlugin, @@ -1009,6 +1045,10 @@ export function createPluginRegistry(registryParams: PluginRegistryParams) { 
registerHttpRoute: (routeParams) => registerHttpRoute(record, routeParams), registerProvider: (provider) => registerProvider(record, provider), registerSpeechProvider: (provider) => registerSpeechProvider(record, provider), + registerRealtimeTranscriptionProvider: (provider) => + registerRealtimeTranscriptionProvider(record, provider), + registerRealtimeVoiceProvider: (provider) => + registerRealtimeVoiceProvider(record, provider), registerMediaUnderstandingProvider: (provider) => registerMediaUnderstandingProvider(record, provider), registerImageGenerationProvider: (provider) => @@ -1198,6 +1238,8 @@ export function createPluginRegistry(registryParams: PluginRegistryParams) { registerProvider, registerCliBackend, registerSpeechProvider, + registerRealtimeTranscriptionProvider, + registerRealtimeVoiceProvider, registerMediaUnderstandingProvider, registerImageGenerationProvider, registerWebSearchProvider, diff --git a/src/plugins/runtime.test.ts b/src/plugins/runtime.test.ts index 47213675b25..d4393210c0c 100644 --- a/src/plugins/runtime.test.ts +++ b/src/plugins/runtime.test.ts @@ -199,6 +199,8 @@ describe("setActivePluginRegistry", () => { cliBackendIds: [], providerIds: [], speechProviderIds: [], + realtimeTranscriptionProviderIds: [], + realtimeVoiceProviderIds: [], mediaUnderstandingProviderIds: [], imageGenerationProviderIds: [], webFetchProviderIds: [], @@ -225,6 +227,8 @@ describe("setActivePluginRegistry", () => { cliBackendIds: [], providerIds: [], speechProviderIds: [], + realtimeTranscriptionProviderIds: [], + realtimeVoiceProviderIds: [], mediaUnderstandingProviderIds: [], imageGenerationProviderIds: [], webFetchProviderIds: [], diff --git a/src/plugins/status.test-helpers.ts b/src/plugins/status.test-helpers.ts index 29650140178..e8b4119f08a 100644 --- a/src/plugins/status.test-helpers.ts +++ b/src/plugins/status.test-helpers.ts @@ -51,6 +51,8 @@ export function createPluginRecord( cliBackendIds: [], providerIds: [], speechProviderIds: [], + 
realtimeTranscriptionProviderIds: [], + realtimeVoiceProviderIds: [], mediaUnderstandingProviderIds: [], imageGenerationProviderIds: [], webFetchProviderIds: [], @@ -107,7 +109,7 @@ export function createCustomHook(params: { export function createPluginLoadResult( overrides: Partial & Pick = { plugins: [] }, ): PluginLoadResult { - const { plugins, ...rest } = overrides; + const { plugins, realtimeTranscriptionProviders, realtimeVoiceProviders, ...rest } = overrides; return { plugins, diagnostics: [], @@ -129,6 +131,8 @@ export function createPluginLoadResult( commands: [], conversationBindingResolvedHandlers: [], ...rest, + realtimeTranscriptionProviders: realtimeTranscriptionProviders ?? [], + realtimeVoiceProviders: realtimeVoiceProviders ?? [], }; } diff --git a/src/plugins/status.ts b/src/plugins/status.ts index eeae8a6cdaa..32cc68a47a7 100644 --- a/src/plugins/status.ts +++ b/src/plugins/status.ts @@ -28,6 +28,8 @@ export type PluginCapabilityKind = | "cli-backend" | "text-inference" | "speech" + | "realtime-transcription" + | "realtime-voice" | "media-understanding" | "image-generation" | "web-search" @@ -233,6 +235,8 @@ function buildCapabilityEntries(plugin: PluginRegistry["plugins"][number]) { { kind: "cli-backend" as const, ids: plugin.cliBackendIds ?? 
[] }, { kind: "text-inference" as const, ids: plugin.providerIds }, { kind: "speech" as const, ids: plugin.speechProviderIds }, + { kind: "realtime-transcription" as const, ids: plugin.realtimeTranscriptionProviderIds }, + { kind: "realtime-voice" as const, ids: plugin.realtimeVoiceProviderIds }, { kind: "media-understanding" as const, ids: plugin.mediaUnderstandingProviderIds }, { kind: "image-generation" as const, ids: plugin.imageGenerationProviderIds }, { kind: "web-search" as const, ids: plugin.webSearchProviderIds }, diff --git a/src/plugins/types.ts b/src/plugins/types.ts index 2d88b77a984..a8d7561f3b2 100644 --- a/src/plugins/types.ts +++ b/src/plugins/types.ts @@ -30,6 +30,22 @@ import type { HookEntry } from "../hooks/types.js"; import type { ImageGenerationProvider } from "../image-generation/types.js"; import type { ProviderUsageSnapshot } from "../infra/provider-usage.types.js"; import type { MediaUnderstandingProvider } from "../media-understanding/types.js"; +import type { + RealtimeTranscriptionProviderConfig, + RealtimeTranscriptionProviderConfiguredContext, + RealtimeTranscriptionProviderId, + RealtimeTranscriptionProviderResolveConfigContext, + RealtimeTranscriptionSession, + RealtimeTranscriptionSessionCreateRequest, +} from "../realtime-transcription/provider-types.js"; +import type { + RealtimeVoiceBridge, + RealtimeVoiceBridgeCreateRequest, + RealtimeVoiceProviderConfig, + RealtimeVoiceProviderConfiguredContext, + RealtimeVoiceProviderId, + RealtimeVoiceProviderResolveConfigContext, +} from "../realtime-voice/provider-types.js"; import type { RuntimeEnv } from "../runtime.js"; import type { RuntimeWebFetchMetadata, @@ -1526,6 +1542,38 @@ export type PluginSpeechProviderEntry = SpeechProviderPlugin & { pluginId: string; }; +/** Realtime transcription capability registered by a plugin. 
*/ +export type RealtimeTranscriptionProviderPlugin = { + id: RealtimeTranscriptionProviderId; + label: string; + aliases?: string[]; + autoSelectOrder?: number; + resolveConfig?: ( + ctx: RealtimeTranscriptionProviderResolveConfigContext, + ) => RealtimeTranscriptionProviderConfig; + isConfigured: (ctx: RealtimeTranscriptionProviderConfiguredContext) => boolean; + createSession: (req: RealtimeTranscriptionSessionCreateRequest) => RealtimeTranscriptionSession; +}; + +export type PluginRealtimeTranscriptionProviderEntry = RealtimeTranscriptionProviderPlugin & { + pluginId: string; +}; + +/** Realtime voice capability registered by a plugin. */ +export type RealtimeVoiceProviderPlugin = { + id: RealtimeVoiceProviderId; + label: string; + aliases?: string[]; + autoSelectOrder?: number; + resolveConfig?: (ctx: RealtimeVoiceProviderResolveConfigContext) => RealtimeVoiceProviderConfig; + isConfigured: (ctx: RealtimeVoiceProviderConfiguredContext) => boolean; + createBridge: (req: RealtimeVoiceBridgeCreateRequest) => RealtimeVoiceBridge; +}; + +export type PluginRealtimeVoiceProviderEntry = RealtimeVoiceProviderPlugin & { + pluginId: string; +}; + export type MediaUnderstandingProviderPlugin = MediaUnderstandingProvider; export type ImageGenerationProviderPlugin = ImageGenerationProvider; @@ -1850,6 +1898,10 @@ export type OpenClawPluginApi = { registerProvider: (provider: ProviderPlugin) => void; /** Register a speech synthesis provider (speech capability). */ registerSpeechProvider: (provider: SpeechProviderPlugin) => void; + /** Register a realtime transcription provider (streaming STT capability). */ + registerRealtimeTranscriptionProvider: (provider: RealtimeTranscriptionProviderPlugin) => void; + /** Register a realtime voice provider (duplex voice capability). */ + registerRealtimeVoiceProvider: (provider: RealtimeVoiceProviderPlugin) => void; /** Register a media understanding provider (media understanding capability). 
*/ registerMediaUnderstandingProvider: (provider: MediaUnderstandingProviderPlugin) => void; /** Register an image generation provider (image generation capability). */ diff --git a/src/realtime-transcription/provider-registry.ts b/src/realtime-transcription/provider-registry.ts new file mode 100644 index 00000000000..28d2e3125ed --- /dev/null +++ b/src/realtime-transcription/provider-registry.ts @@ -0,0 +1,80 @@ +import type { OpenClawConfig } from "../config/config.js"; +import { resolvePluginCapabilityProviders } from "../plugins/capability-provider-runtime.js"; +import type { RealtimeTranscriptionProviderPlugin } from "../plugins/types.js"; +import type { RealtimeTranscriptionProviderId } from "./provider-types.js"; + +function trimToUndefined(value: string | undefined): string | undefined { + const trimmed = value?.trim().toLowerCase(); + return trimmed ? trimmed : undefined; +} + +export function normalizeRealtimeTranscriptionProviderId( + providerId: string | undefined, +): RealtimeTranscriptionProviderId | undefined { + return trimToUndefined(providerId); +} + +function resolveRealtimeTranscriptionProviderEntries( + cfg?: OpenClawConfig, +): RealtimeTranscriptionProviderPlugin[] { + return resolvePluginCapabilityProviders({ + key: "realtimeTranscriptionProviders", + cfg, + }); +} + +function buildProviderMaps(cfg?: OpenClawConfig): { + canonical: Map; + aliases: Map; +} { + const canonical = new Map(); + const aliases = new Map(); + const register = (provider: RealtimeTranscriptionProviderPlugin) => { + const id = normalizeRealtimeTranscriptionProviderId(provider.id); + if (!id) { + return; + } + canonical.set(id, provider); + aliases.set(id, provider); + for (const alias of provider.aliases ?? 
[]) { + const normalizedAlias = normalizeRealtimeTranscriptionProviderId(alias); + if (normalizedAlias) { + aliases.set(normalizedAlias, provider); + } + } + }; + + for (const provider of resolveRealtimeTranscriptionProviderEntries(cfg)) { + register(provider); + } + + return { canonical, aliases }; +} + +export function listRealtimeTranscriptionProviders( + cfg?: OpenClawConfig, +): RealtimeTranscriptionProviderPlugin[] { + return [...buildProviderMaps(cfg).canonical.values()]; +} + +export function getRealtimeTranscriptionProvider( + providerId: string | undefined, + cfg?: OpenClawConfig, +): RealtimeTranscriptionProviderPlugin | undefined { + const normalized = normalizeRealtimeTranscriptionProviderId(providerId); + if (!normalized) { + return undefined; + } + return buildProviderMaps(cfg).aliases.get(normalized); +} + +export function canonicalizeRealtimeTranscriptionProviderId( + providerId: string | undefined, + cfg?: OpenClawConfig, +): RealtimeTranscriptionProviderId | undefined { + const normalized = normalizeRealtimeTranscriptionProviderId(providerId); + if (!normalized) { + return undefined; + } + return getRealtimeTranscriptionProvider(normalized, cfg)?.id ?? 
normalized; +} diff --git a/src/realtime-transcription/provider-types.ts b/src/realtime-transcription/provider-types.ts new file mode 100644 index 00000000000..06d7678eec5 --- /dev/null +++ b/src/realtime-transcription/provider-types.ts @@ -0,0 +1,33 @@ +import type { OpenClawConfig } from "../config/config.js"; + +export type RealtimeTranscriptionProviderId = string; + +export type RealtimeTranscriptionProviderConfig = Record; + +export type RealtimeTranscriptionProviderResolveConfigContext = { + cfg: OpenClawConfig; + rawConfig: RealtimeTranscriptionProviderConfig; +}; + +export type RealtimeTranscriptionProviderConfiguredContext = { + cfg?: OpenClawConfig; + providerConfig: RealtimeTranscriptionProviderConfig; +}; + +export type RealtimeTranscriptionSessionCallbacks = { + onPartial?: (partial: string) => void; + onTranscript?: (transcript: string) => void; + onSpeechStart?: () => void; + onError?: (error: Error) => void; +}; + +export type RealtimeTranscriptionSessionCreateRequest = RealtimeTranscriptionSessionCallbacks & { + providerConfig: RealtimeTranscriptionProviderConfig; +}; + +export type RealtimeTranscriptionSession = { + connect(): Promise; + sendAudio(audio: Buffer): void; + close(): void; + isConnected(): boolean; +}; diff --git a/src/realtime-voice/provider-registry.ts b/src/realtime-voice/provider-registry.ts new file mode 100644 index 00000000000..b2de16385c1 --- /dev/null +++ b/src/realtime-voice/provider-registry.ts @@ -0,0 +1,76 @@ +import type { OpenClawConfig } from "../config/config.js"; +import { resolvePluginCapabilityProviders } from "../plugins/capability-provider-runtime.js"; +import type { RealtimeVoiceProviderPlugin } from "../plugins/types.js"; +import type { RealtimeVoiceProviderId } from "./provider-types.js"; + +function trimToUndefined(value: string | undefined): string | undefined { + const trimmed = value?.trim().toLowerCase(); + return trimmed ? 
trimmed : undefined; +} + +export function normalizeRealtimeVoiceProviderId( + providerId: string | undefined, +): RealtimeVoiceProviderId | undefined { + return trimToUndefined(providerId); +} + +function resolveRealtimeVoiceProviderEntries(cfg?: OpenClawConfig): RealtimeVoiceProviderPlugin[] { + return resolvePluginCapabilityProviders({ + key: "realtimeVoiceProviders", + cfg, + }); +} + +function buildProviderMaps(cfg?: OpenClawConfig): { + canonical: Map; + aliases: Map; +} { + const canonical = new Map(); + const aliases = new Map(); + const register = (provider: RealtimeVoiceProviderPlugin) => { + const id = normalizeRealtimeVoiceProviderId(provider.id); + if (!id) { + return; + } + canonical.set(id, provider); + aliases.set(id, provider); + for (const alias of provider.aliases ?? []) { + const normalizedAlias = normalizeRealtimeVoiceProviderId(alias); + if (normalizedAlias) { + aliases.set(normalizedAlias, provider); + } + } + }; + + for (const provider of resolveRealtimeVoiceProviderEntries(cfg)) { + register(provider); + } + + return { canonical, aliases }; +} + +export function listRealtimeVoiceProviders(cfg?: OpenClawConfig): RealtimeVoiceProviderPlugin[] { + return [...buildProviderMaps(cfg).canonical.values()]; +} + +export function getRealtimeVoiceProvider( + providerId: string | undefined, + cfg?: OpenClawConfig, +): RealtimeVoiceProviderPlugin | undefined { + const normalized = normalizeRealtimeVoiceProviderId(providerId); + if (!normalized) { + return undefined; + } + return buildProviderMaps(cfg).aliases.get(normalized); +} + +export function canonicalizeRealtimeVoiceProviderId( + providerId: string | undefined, + cfg?: OpenClawConfig, +): RealtimeVoiceProviderId | undefined { + const normalized = normalizeRealtimeVoiceProviderId(providerId); + if (!normalized) { + return undefined; + } + return getRealtimeVoiceProvider(normalized, cfg)?.id ?? 
normalized; +} diff --git a/src/realtime-voice/provider-types.ts b/src/realtime-voice/provider-types.ts new file mode 100644 index 00000000000..a494bd32cf5 --- /dev/null +++ b/src/realtime-voice/provider-types.ts @@ -0,0 +1,66 @@ +import type { OpenClawConfig } from "../config/config.js"; + +export type RealtimeVoiceProviderId = string; + +export type RealtimeVoiceRole = "user" | "assistant"; + +export type RealtimeVoiceCloseReason = "completed" | "error"; + +export type RealtimeVoiceTool = { + type: "function"; + name: string; + description: string; + parameters: { + type: "object"; + properties: Record; + required?: string[]; + }; +}; + +export type RealtimeVoiceToolCallEvent = { + itemId: string; + callId: string; + name: string; + args: unknown; +}; + +export type RealtimeVoiceBridgeCallbacks = { + onAudio: (muLaw: Buffer) => void; + onClearAudio: () => void; + onMark?: (markName: string) => void; + onTranscript?: (role: RealtimeVoiceRole, text: string, isFinal: boolean) => void; + onToolCall?: (event: RealtimeVoiceToolCallEvent) => void; + onReady?: () => void; + onError?: (error: Error) => void; + onClose?: (reason: RealtimeVoiceCloseReason) => void; +}; + +export type RealtimeVoiceProviderConfig = Record; + +export type RealtimeVoiceProviderResolveConfigContext = { + cfg: OpenClawConfig; + rawConfig: RealtimeVoiceProviderConfig; +}; + +export type RealtimeVoiceProviderConfiguredContext = { + cfg?: OpenClawConfig; + providerConfig: RealtimeVoiceProviderConfig; +}; + +export type RealtimeVoiceBridgeCreateRequest = RealtimeVoiceBridgeCallbacks & { + providerConfig: RealtimeVoiceProviderConfig; + instructions?: string; + tools?: RealtimeVoiceTool[]; +}; + +export type RealtimeVoiceBridge = { + connect(): Promise; + sendAudio(audio: Buffer): void; + setMediaTimestamp(ts: number): void; + sendUserMessage?(text: string): void; + triggerGreeting?(instructions?: string): void; + submitToolResult(callId: string, result: unknown): void; + acknowledgeMark(): void; + 
close(): void; + isConnected(): boolean; +}; diff --git a/src/test-utils/channel-plugins.ts b/src/test-utils/channel-plugins.ts index cfa1d40c672..0021ae122b8 100644 --- a/src/test-utils/channel-plugins.ts +++ b/src/test-utils/channel-plugins.ts @@ -27,6 +27,8 @@ export const createTestRegistry = (channels: TestChannelRegistration[] = []): Pl })), providers: [], speechProviders: [], + realtimeTranscriptionProviders: [], + realtimeVoiceProviders: [], mediaUnderstandingProviders: [], imageGenerationProviders: [], webFetchProviders: [], diff --git a/test/helpers/plugins/plugin-api.ts b/test/helpers/plugins/plugin-api.ts index 825d227ba17..ceaa92f68d8 100644 --- a/test/helpers/plugins/plugin-api.ts +++ b/test/helpers/plugins/plugin-api.ts @@ -20,6 +20,8 @@ export function createTestPluginApi(api: TestPluginApiInput): OpenClawPluginApi registerCliBackend() {}, registerProvider() {}, registerSpeechProvider() {}, + registerRealtimeTranscriptionProvider() {}, + registerRealtimeVoiceProvider() {}, registerMediaUnderstandingProvider() {}, registerImageGenerationProvider() {}, registerWebFetchProvider() {}, diff --git a/test/helpers/plugins/plugin-registration-contract-cases.ts b/test/helpers/plugins/plugin-registration-contract-cases.ts index 07005c44cf7..12e3e911d69 100644 --- a/test/helpers/plugins/plugin-registration-contract-cases.ts +++ b/test/helpers/plugins/plugin-registration-contract-cases.ts @@ -92,6 +92,8 @@ export const pluginRegistrationContractCases = { pluginId: "openai", providerIds: ["openai", "openai-codex"], speechProviderIds: ["openai"], + realtimeTranscriptionProviderIds: ["openai"], + realtimeVoiceProviderIds: ["openai"], mediaUnderstandingProviderIds: ["openai", "openai-codex"], imageGenerationProviderIds: ["openai"], cliBackendIds: ["codex-cli"], diff --git a/test/helpers/plugins/plugin-registration-contract.ts b/test/helpers/plugins/plugin-registration-contract.ts index d25f067ec6f..1b39bb1903d 100644 --- 
a/test/helpers/plugins/plugin-registration-contract.ts +++ b/test/helpers/plugins/plugin-registration-contract.ts @@ -13,6 +13,8 @@ type PluginRegistrationContractParams = { webFetchProviderIds?: string[]; webSearchProviderIds?: string[]; speechProviderIds?: string[]; + realtimeTranscriptionProviderIds?: string[]; + realtimeVoiceProviderIds?: string[]; mediaUnderstandingProviderIds?: string[]; imageGenerationProviderIds?: string[]; cliBackendIds?: string[]; @@ -122,6 +124,22 @@ export function describePluginRegistrationContract(params: PluginRegistrationCon }); } + if (params.realtimeTranscriptionProviderIds) { + it("keeps bundled realtime-transcription ownership explicit", () => { + expect(findRegistration(params.pluginId).realtimeTranscriptionProviderIds).toEqual( + params.realtimeTranscriptionProviderIds, + ); + }); + } + + if (params.realtimeVoiceProviderIds) { + it("keeps bundled realtime-voice ownership explicit", () => { + expect(findRegistration(params.pluginId).realtimeVoiceProviderIds).toEqual( + params.realtimeVoiceProviderIds, + ); + }); + } + if (params.mediaUnderstandingProviderIds) { it("keeps bundled media-understanding ownership explicit", () => { expect(findRegistration(params.pluginId).mediaUnderstandingProviderIds).toEqual( diff --git a/test/setup-openclaw-runtime.ts b/test/setup-openclaw-runtime.ts index e9f5137fc15..0aca0aceeab 100644 --- a/test/setup-openclaw-runtime.ts +++ b/test/setup-openclaw-runtime.ts @@ -110,6 +110,8 @@ function createTestRegistryForSetup( })), providers: [], speechProviders: [], + realtimeTranscriptionProviders: [], + realtimeVoiceProviders: [], mediaUnderstandingProviders: [], imageGenerationProviders: [], webFetchProviders: [], diff --git a/vitest.contracts.config.ts b/vitest.contracts.config.ts index 48507e3eecd..190b0527f4c 100644 --- a/vitest.contracts.config.ts +++ b/vitest.contracts.config.ts @@ -1,23 +1,23 @@ -import { createScopedVitestConfig } from "./vitest.scoped-config.ts"; -import { boundaryTestFiles } 
from "./vitest.unit-paths.mjs"; +import { defineConfig } from "vitest/config"; +import { sharedVitestConfig } from "./vitest.shared.config.ts"; -export function createContractsVitestConfig(env?: Record) { - return createScopedVitestConfig( - [ - "src/channels/plugins/contracts/**/*.test.ts", - "src/config/doc-baseline.integration.test.ts", - "src/config/schema.base.generated.test.ts", - "src/config/schema.help.quality.test.ts", - "src/plugins/contracts/**/*.test.ts", - "test/**/*.test.ts", - ], - { - env, - exclude: boundaryTestFiles, - name: "contracts", +const base = sharedVitestConfig as Record; +const baseTest = sharedVitestConfig.test ?? {}; + +export function createContractsVitestConfig() { + return defineConfig({ + ...base, + test: { + ...baseTest, + isolate: true, + setupFiles: baseTest.setupFiles ?? [], + include: [ + "src/channels/plugins/contracts/**/*.test.ts", + "src/plugins/contracts/**/*.test.ts", + ], passWithNoTests: true, }, - ); + }); } export default createContractsVitestConfig();