From a23ab9b906dc6a4f6b24bb3f681f395eb792dbcd Mon Sep 17 00:00:00 2001
From: Peter Steinberger <steipete@gmail.com>
Date: Sat, 4 Apr 2026 12:04:37 +0900
Subject: [PATCH] refactor: move voice-call realtime providers into extensions

---
 CHANGELOG.md                                  |   1 +
 docs/plugins/architecture.md                  |   6 +-
 docs/plugins/building-plugins.md              |   1 +
 docs/plugins/manifest.md                      |  18 +-
 docs/plugins/sdk-overview.md                  |  20 +-
 docs/plugins/sdk-provider-plugins.md          |  31 +-
 extensions/anthropic/test-api.ts              |   1 +
 extensions/deepgram/test-api.ts               |   1 +
 extensions/fal/test-api.ts                    |   1 +
 extensions/google/test-api.ts                 |   2 +
 extensions/groq/test-api.ts                   |   1 +
 extensions/lobster/src/lobster-tool.test.ts   |   2 +
 extensions/mattermost/runtime-api.ts          |   2 +-
 extensions/minimax/test-api.ts                |   8 +
 extensions/mistral/test-api.ts                |   1 +
 extensions/moonshot/test-api.ts               |   1 +
 extensions/openai/api.ts                      |   2 +
 extensions/openai/index.ts                    |   4 +
 .../openai/openai-codex-provider.test.ts      |  10 +-
 extensions/openai/openclaw.plugin.json        |   2 +
 extensions/openai/package.json                |   3 +
 .../realtime-transcription-provider.test.ts   |  27 +
 .../openai/realtime-transcription-provider.ts | 267 +++++++++
 extensions/openai/realtime-voice-provider.ts  | 535 ++++++++++++++++++
 extensions/openai/test-api.ts                 |   2 +
 extensions/openrouter/test-api.ts             |   1 +
 extensions/voice-call/README.md               |   2 +-
 extensions/voice-call/index.ts                |  17 +-
 extensions/voice-call/openclaw.plugin.json    |  87 ++-
 extensions/voice-call/src/config.test.ts      |  45 +-
 extensions/voice-call/src/config.ts           | 315 ++++++++++-
 .../voice-call/src/manager/outbound.test.ts   |   4 +-
 extensions/voice-call/src/manager/outbound.ts |  25 +-
 .../voice-call/src/media-stream.test.ts       |  62 +-
 extensions/voice-call/src/media-stream.ts     |  49 +-
 extensions/voice-call/src/providers/index.ts  |   5 -
 .../src/providers/stt-openai-realtime.test.ts |  42 --
 .../src/providers/stt-openai-realtime.ts      | 321 -----------
 .../src/providers/tts-openai.test.ts          |  43 --
 .../voice-call/src/providers/tts-openai.ts    | 185 ------
 .../src/realtime-transcription.runtime.ts     |   4 +
 .../voice-call/src/realtime-voice.runtime.ts  |   4 +
 extensions/voice-call/src/runtime.ts          | 105 +++-
 extensions/voice-call/src/test-fixtures.ts    |  18 +-
 extensions/voice-call/src/webhook.test.ts     |  79 +++
 extensions/voice-call/src/webhook.ts          | 128 ++++-
 .../src/webhook/realtime-handler.test.ts      |  92 +++
 .../src/webhook/realtime-handler.ts           | 413 ++++++++++++++
 extensions/zai/test-api.ts                    |   1 +
 package.json                                  |   8 +
 scripts/lib/plugin-sdk-entrypoints.json       |   2 +
 scripts/write-cli-startup-metadata.ts         |  46 +-
 src/cli/program/root-help.ts                  |  22 +-
 src/gateway/server-plugins.test.ts            |   2 +
 src/gateway/test-helpers.mocks.ts             |   2 +
 src/plugin-sdk/core.ts                        |   1 +
 src/plugin-sdk/index.ts                       |   1 +
 src/plugin-sdk/plugin-entry.ts                |   2 +
 src/plugin-sdk/realtime-transcription.ts      |  16 +
 src/plugin-sdk/realtime-voice.ts              |  20 +
 src/plugin-sdk/speech.ts                      |  85 ++-
 src/plugins/api-builder.ts                    |  10 +
 .../bundled-capability-metadata.test.ts       |   6 +
 src/plugins/bundled-capability-metadata.ts    |  18 +
 src/plugins/bundled-capability-runtime.ts     |  26 +
 .../capability-provider-runtime.test.ts       |   9 +-
 src/plugins/capability-provider-runtime.ts    |   6 +
 src/plugins/captured-registration.ts          |  14 +
 src/plugins/cli.ts                            |   3 +-
 .../contracts/registry.contract.test.ts       |  39 +-
 src/plugins/contracts/registry.ts             |  53 ++
 .../contracts/speech-vitest-registry.ts       | 104 ++++
 src/plugins/loader.ts                         |   2 +
 src/plugins/manifest.ts                       |   6 +
 src/plugins/registry-empty.ts                 |   2 +
 src/plugins/registry.ts                       |  44 +-
 src/plugins/runtime.test.ts                   |   4 +
 src/plugins/status.test-helpers.ts            |   6 +-
 src/plugins/status.ts                         |   4 +
 src/plugins/types.ts                          |  52 ++
 .../provider-registry.ts                      |  80 +++
 src/realtime-transcription/provider-types.ts  |  33 ++
 src/realtime-voice/provider-registry.ts       |  76 +++
 src/realtime-voice/provider-types.ts          |  66 +++
 src/test-utils/channel-plugins.ts             |   2 +
 test/helpers/plugins/plugin-api.ts            |   2 +
 .../plugin-registration-contract-cases.ts     |   2 +
 .../plugins/plugin-registration-contract.ts   |  18 +
 test/setup-openclaw-runtime.ts                |   2 +
 vitest.contracts.config.ts                    |  34 +-
 90 files changed, 3134 insertions(+), 792 deletions(-)
 create mode 100644 extensions/deepgram/test-api.ts
 create mode 100644 extensions/fal/test-api.ts
 create mode 100644 extensions/groq/test-api.ts
 create mode 100644 extensions/minimax/test-api.ts
 create mode 100644 extensions/mistral/test-api.ts
 create mode 100644 extensions/openai/realtime-transcription-provider.test.ts
 create mode 100644 extensions/openai/realtime-transcription-provider.ts
 create mode 100644 extensions/openai/realtime-voice-provider.ts
 create mode 100644 extensions/openrouter/test-api.ts
 delete mode 100644 extensions/voice-call/src/providers/stt-openai-realtime.test.ts
 delete mode 100644 extensions/voice-call/src/providers/stt-openai-realtime.ts
 delete mode 100644 extensions/voice-call/src/providers/tts-openai.test.ts
 delete mode 100644 extensions/voice-call/src/providers/tts-openai.ts
 create mode 100644 extensions/voice-call/src/realtime-transcription.runtime.ts
 create mode 100644 extensions/voice-call/src/realtime-voice.runtime.ts
 create mode 100644 extensions/voice-call/src/webhook/realtime-handler.test.ts
 create mode 100644 extensions/voice-call/src/webhook/realtime-handler.ts
 create mode 100644 extensions/zai/test-api.ts
 create mode 100644 src/plugin-sdk/realtime-transcription.ts
 create mode 100644 src/plugin-sdk/realtime-voice.ts
 create mode 100644 src/realtime-transcription/provider-registry.ts
 create mode 100644 src/realtime-transcription/provider-types.ts
 create mode 100644 src/realtime-voice/provider-registry.ts
 create mode 100644 src/realtime-voice/provider-types.ts

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a8e1fac5305..dbeb0470988 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -48,6 +48,7 @@ Docs: https://docs.openclaw.ai
 - Telegram/native commands: clean up metadata-driven progress placeholders when replies fall back, edits fail, or local exec approval prompts are suppressed. (#59300) Thanks @jalehman.
 - Matrix/backup reset: recreate secret storage during backup reset when stale SSSS state blocks durable backup-key reload, including no-backup repair paths. (#60599) thanks @emonty.
 - Matrix: allow secret-storage recreation during automatic repair bootstrap so clients that lose their recovery key can recover and persist new cross-signing keys. (#59846) Thanks @al3mart.
+- Voice Call/OpenAI: move realtime voice and realtime transcription onto provider-owned plugin capabilities so `voice-call` uses generic provider selection while keeping realtime Twilio replay and custom webhook-path handling working.
 - Matrix/crypto persistence: capture and write the IndexedDB snapshot while holding the snapshot file lock so concurrent gateway and CLI persists cannot overwrite newer crypto state. (#59851) Thanks @al3mart.
 - Matrix/media: surface a dedicated `[matrix <kind> attachment too large]` marker for oversized inbound media instead of the generic unavailable marker, and classify size-limit failures with a typed Matrix error. (#60289) Thanks @efe-arv.
 - Matrix/Telegram exec approvals: recover stored same-channel account bindings even when session reply state drifted to another channel, so foreign-channel approvals route to the bound account instead of fanning out or being rejected as ambiguous. (#60417) thanks @gumadeiras.
diff --git a/docs/plugins/architecture.md b/docs/plugins/architecture.md
index 75695e2c718..c81d32239fd 100644
--- a/docs/plugins/architecture.md
+++ b/docs/plugins/architecture.md
@@ -32,6 +32,7 @@ native OpenClaw plugin registers against one or more capability types:
 | Text inference        | `api.registerProvider(...)`                   | `openai`, `anthropic`     |
 | CLI inference backend | `api.registerCliBackend(...)`                 | `openai`, `anthropic`     |
 | Speech                | `api.registerSpeechProvider(...)`             | `elevenlabs`, `microsoft` |
+| Realtime voice        | `api.registerRealtimeVoiceProvider(...)`      | `openai`                  |
 | Media understanding   | `api.registerMediaUnderstandingProvider(...)` | `openai`, `google`        |
 | Image generation      | `api.registerImageGenerationProvider(...)`    | `openai`, `google`        |
 | Web search            | `api.registerWebSearchProvider(...)`          | `google`                  |
@@ -239,8 +240,9 @@ Examples:
 - the bundled `minimax`, `mistral`, `moonshot`, and `zai` plugins own their
   media-understanding backends
 - the `voice-call` plugin is a feature plugin: it owns call transport, tools,
-  CLI, routes, and runtime, but it consumes core TTS/STT capability instead of
-  inventing a second speech stack
+  CLI, routes, and Twilio media-stream bridging, but it consumes shared speech
+  plus realtime-transcription and realtime-voice capabilities instead of
+  importing vendor plugins directly
 
 The intended end state is:
 
diff --git a/docs/plugins/building-plugins.md b/docs/plugins/building-plugins.md
index 155b90a108b..963fe43dac6 100644
--- a/docs/plugins/building-plugins.md
+++ b/docs/plugins/building-plugins.md
@@ -146,6 +146,7 @@ A single plugin can register any number of capabilities via the `api` object:
 | CLI inference backend | `api.registerCliBackend(...)`                 | [CLI Backends](/gateway/cli-backends)                                           |
 | Channel / messaging   | `api.registerChannel(...)`                    | [Channel Plugins](/plugins/sdk-channel-plugins)                                 |
 | Speech (TTS/STT)      | `api.registerSpeechProvider(...)`             | [Provider Plugins](/plugins/sdk-provider-plugins#step-5-add-extra-capabilities) |
+| Realtime voice        | `api.registerRealtimeVoiceProvider(...)`      | [Provider Plugins](/plugins/sdk-provider-plugins#step-5-add-extra-capabilities) |
 | Media understanding   | `api.registerMediaUnderstandingProvider(...)` | [Provider Plugins](/plugins/sdk-provider-plugins#step-5-add-extra-capabilities) |
 | Image generation      | `api.registerImageGenerationProvider(...)`    | [Provider Plugins](/plugins/sdk-provider-plugins#step-5-add-extra-capabilities) |
 | Web search            | `api.registerWebSearchProvider(...)`          | [Provider Plugins](/plugins/sdk-provider-plugins#step-5-add-extra-capabilities) |
diff --git a/docs/plugins/manifest.md b/docs/plugins/manifest.md
index 256715f3dc4..881c9615075 100644
--- a/docs/plugins/manifest.md
+++ b/docs/plugins/manifest.md
@@ -196,6 +196,8 @@ read without importing the plugin runtime.
 {
   "contracts": {
     "speechProviders": ["openai"],
+    "realtimeTranscriptionProviders": ["openai"],
+    "realtimeVoiceProviders": ["openai"],
     "mediaUnderstandingProviders": ["openai", "openai-codex"],
     "imageGenerationProviders": ["openai"],
     "webSearchProviders": ["gemini"],
@@ -206,13 +208,15 @@ read without importing the plugin runtime.
 
 Each list is optional:
 
-| Field                         | Type       | What it means                                                  |
-| ----------------------------- | ---------- | -------------------------------------------------------------- |
-| `speechProviders`             | `string[]` | Speech provider ids this plugin owns.                          |
-| `mediaUnderstandingProviders` | `string[]` | Media-understanding provider ids this plugin owns.             |
-| `imageGenerationProviders`    | `string[]` | Image-generation provider ids this plugin owns.                |
-| `webSearchProviders`          | `string[]` | Web-search provider ids this plugin owns.                      |
-| `tools`                       | `string[]` | Agent tool names this plugin owns for bundled contract checks. |
+| Field                            | Type       | What it means                                                  |
+| -------------------------------- | ---------- | -------------------------------------------------------------- |
+| `speechProviders`                | `string[]` | Speech provider ids this plugin owns.                          |
+| `realtimeTranscriptionProviders` | `string[]` | Realtime-transcription provider ids this plugin owns.          |
+| `realtimeVoiceProviders`         | `string[]` | Realtime-voice provider ids this plugin owns.                  |
+| `mediaUnderstandingProviders`    | `string[]` | Media-understanding provider ids this plugin owns.             |
+| `imageGenerationProviders`       | `string[]` | Image-generation provider ids this plugin owns.                |
+| `webSearchProviders`             | `string[]` | Web-search provider ids this plugin owns.                      |
+| `tools`                          | `string[]` | Agent tool names this plugin owns for bundled contract checks. |
 
 Legacy top-level `speechProviders`, `mediaUnderstandingProviders`, and
 `imageGenerationProviders` are deprecated. Use `openclaw doctor --fix` to move
diff --git a/docs/plugins/sdk-overview.md b/docs/plugins/sdk-overview.md
index 840bb079857..52c90e793d4 100644
--- a/docs/plugins/sdk-overview.md
+++ b/docs/plugins/sdk-overview.md
@@ -128,15 +128,17 @@ methods:
 
 ### Capability registration
 
-| Method                                        | What it registers              |
-| --------------------------------------------- | ------------------------------ |
-| `api.registerProvider(...)`                   | Text inference (LLM)           |
-| `api.registerCliBackend(...)`                 | Local CLI inference backend    |
-| `api.registerChannel(...)`                    | Messaging channel              |
-| `api.registerSpeechProvider(...)`             | Text-to-speech / STT synthesis |
-| `api.registerMediaUnderstandingProvider(...)` | Image/audio/video analysis     |
-| `api.registerImageGenerationProvider(...)`    | Image generation               |
-| `api.registerWebSearchProvider(...)`          | Web search                     |
+| Method                                           | What it registers                |
+| ------------------------------------------------ | -------------------------------- |
+| `api.registerProvider(...)`                      | Text inference (LLM)             |
+| `api.registerCliBackend(...)`                    | Local CLI inference backend      |
+| `api.registerChannel(...)`                       | Messaging channel                |
+| `api.registerSpeechProvider(...)`                | Text-to-speech / speech-to-text  |
+| `api.registerRealtimeTranscriptionProvider(...)` | Streaming realtime transcription |
+| `api.registerRealtimeVoiceProvider(...)`         | Duplex realtime voice sessions   |
+| `api.registerMediaUnderstandingProvider(...)`    | Image/audio/video analysis       |
+| `api.registerImageGenerationProvider(...)`       | Image generation                 |
+| `api.registerWebSearchProvider(...)`             | Web search                       |
 
 ### Tools and commands
 
diff --git a/docs/plugins/sdk-provider-plugins.md b/docs/plugins/sdk-provider-plugins.md
index afeddcab209..a1133c940c9 100644
--- a/docs/plugins/sdk-provider-plugins.md
+++ b/docs/plugins/sdk-provider-plugins.md
@@ -324,8 +324,8 @@ API key auth, and dynamic model resolution.
 
   <Step title="Add extra capabilities (optional)">
     <a id="step-5-add-extra-capabilities"></a>
-    A provider plugin can register speech, media understanding, image
-    generation, and web search alongside text inference:
+    A provider plugin can register speech, realtime transcription, realtime voice, media
+    understanding, image generation, and web search alongside text inference:
 
     ```typescript
     register(api) {
@@ -343,6 +343,33 @@ API key auth, and dynamic model resolution.
         }),
       });
 
+      api.registerRealtimeTranscriptionProvider({
+        id: "acme-ai",
+        label: "Acme Realtime Transcription",
+        isConfigured: () => true,
+        createSession: (req) => ({
+          connect: async () => {},
+          sendAudio: () => {},
+          close: () => {},
+          isConnected: () => true,
+        }),
+      });
+
+      api.registerRealtimeVoiceProvider({
+        id: "acme-ai",
+        label: "Acme Realtime Voice",
+        isConfigured: ({ providerConfig }) => Boolean(providerConfig.apiKey),
+        createBridge: (req) => ({
+          connect: async () => {},
+          sendAudio: () => {},
+          setMediaTimestamp: () => {},
+          submitToolResult: () => {},
+          acknowledgeMark: () => {},
+          close: () => {},
+          isConnected: () => true,
+        }),
+      });
+
       api.registerMediaUnderstandingProvider({
         id: "acme-ai",
         capabilities: ["image", "audio"],
diff --git a/extensions/anthropic/test-api.ts b/extensions/anthropic/test-api.ts
index 7d467629cfb..3caea18a4c1 100644
--- a/extensions/anthropic/test-api.ts
+++ b/extensions/anthropic/test-api.ts
@@ -1 +1,2 @@
 export { buildAnthropicCliBackend } from "./cli-backend.js";
+export { anthropicMediaUnderstandingProvider } from "./media-understanding-provider.js";
diff --git a/extensions/deepgram/test-api.ts b/extensions/deepgram/test-api.ts
new file mode 100644
index 00000000000..89dff7f7255
--- /dev/null
+++ b/extensions/deepgram/test-api.ts
@@ -0,0 +1 @@
+export { deepgramMediaUnderstandingProvider } from "./media-understanding-provider.js";
diff --git a/extensions/fal/test-api.ts b/extensions/fal/test-api.ts
new file mode 100644
index 00000000000..e9accc54e5b
--- /dev/null
+++ b/extensions/fal/test-api.ts
@@ -0,0 +1 @@
+export { buildFalImageGenerationProvider } from "./image-generation-provider.js";
diff --git a/extensions/google/test-api.ts b/extensions/google/test-api.ts
index 2848ab1f800..0d173de2c95 100644
--- a/extensions/google/test-api.ts
+++ b/extensions/google/test-api.ts
@@ -1 +1,3 @@
 export { buildGoogleGeminiCliBackend } from "./cli-backend.js";
+export { buildGoogleImageGenerationProvider } from "./image-generation-provider.js";
+export { googleMediaUnderstandingProvider } from "./media-understanding-provider.js";
diff --git a/extensions/groq/test-api.ts b/extensions/groq/test-api.ts
new file mode 100644
index 00000000000..24bc9ceb6a6
--- /dev/null
+++ b/extensions/groq/test-api.ts
@@ -0,0 +1 @@
+export { groqMediaUnderstandingProvider } from "./media-understanding-provider.js";
diff --git a/extensions/lobster/src/lobster-tool.test.ts b/extensions/lobster/src/lobster-tool.test.ts
index 0f39acc5b7d..30621f54b19 100644
--- a/extensions/lobster/src/lobster-tool.test.ts
+++ b/extensions/lobster/src/lobster-tool.test.ts
@@ -47,6 +47,8 @@ function fakeApi(overrides: Partial<OpenClawPluginApi> = {}): OpenClawPluginApi
     registerCliBackend() {},
     registerProvider() {},
     registerSpeechProvider() {},
+    registerRealtimeTranscriptionProvider() {},
+    registerRealtimeVoiceProvider() {},
     registerMediaUnderstandingProvider() {},
     registerImageGenerationProvider() {},
     registerWebFetchProvider() {},
diff --git a/extensions/mattermost/runtime-api.ts b/extensions/mattermost/runtime-api.ts
index 8168a2e00b2..e77586cba1a 100644
--- a/extensions/mattermost/runtime-api.ts
+++ b/extensions/mattermost/runtime-api.ts
@@ -31,7 +31,7 @@ export {
   isTrustedProxyAddress,
 } from "openclaw/plugin-sdk/core";
 export { buildComputedAccountStatusSnapshot } from "openclaw/plugin-sdk/channel-status";
-export { createAccountStatusSink } from "openclaw/plugin-sdk/compat";
+export { createAccountStatusSink } from "openclaw/plugin-sdk/channel-lifecycle";
 export { buildAgentMediaPayload } from "openclaw/plugin-sdk/agent-media-payload";
 export {
   buildModelsProviderData,
diff --git a/extensions/minimax/test-api.ts b/extensions/minimax/test-api.ts
new file mode 100644
index 00000000000..5e130df00c3
--- /dev/null
+++ b/extensions/minimax/test-api.ts
@@ -0,0 +1,8 @@
+export {
+  buildMinimaxImageGenerationProvider,
+  buildMinimaxPortalImageGenerationProvider,
+} from "./image-generation-provider.js";
+export {
+  minimaxMediaUnderstandingProvider,
+  minimaxPortalMediaUnderstandingProvider,
+} from "./media-understanding-provider.js";
diff --git a/extensions/mistral/test-api.ts b/extensions/mistral/test-api.ts
new file mode 100644
index 00000000000..14e820308cf
--- /dev/null
+++ b/extensions/mistral/test-api.ts
@@ -0,0 +1 @@
+export { mistralMediaUnderstandingProvider } from "./media-understanding-provider.js";
diff --git a/extensions/moonshot/test-api.ts b/extensions/moonshot/test-api.ts
index 9168ea3be27..9974ca37872 100644
--- a/extensions/moonshot/test-api.ts
+++ b/extensions/moonshot/test-api.ts
@@ -1 +1,2 @@
 export { __testing } from "./src/kimi-web-search-provider.js";
+export { moonshotMediaUnderstandingProvider } from "./media-understanding-provider.js";
diff --git a/extensions/openai/api.ts b/extensions/openai/api.ts
index 8520db0c9b9..7f144f9aa11 100644
--- a/extensions/openai/api.ts
+++ b/extensions/openai/api.ts
@@ -11,3 +11,5 @@ export {
 } from "./default-models.js";
 export { buildOpenAICodexProvider } from "./openai-codex-catalog.js";
 export { buildOpenAIProvider } from "./openai-provider.js";
+export { buildOpenAIRealtimeTranscriptionProvider } from "./realtime-transcription-provider.js";
+export { buildOpenAIRealtimeVoiceProvider } from "./realtime-voice-provider.js";
diff --git a/extensions/openai/index.ts b/extensions/openai/index.ts
index 0663f6779ea..6fe208dc049 100644
--- a/extensions/openai/index.ts
+++ b/extensions/openai/index.ts
@@ -12,6 +12,8 @@ import {
   resolveOpenAIPromptOverlayMode,
   shouldApplyOpenAIPromptOverlay,
 } from "./prompt-overlay.js";
+import { buildOpenAIRealtimeTranscriptionProvider } from "./realtime-transcription-provider.js";
+import { buildOpenAIRealtimeVoiceProvider } from "./realtime-voice-provider.js";
 import { buildOpenAISpeechProvider } from "./speech-provider.js";
 
 export default definePluginEntry({
@@ -24,6 +26,8 @@ export default definePluginEntry({
     api.registerProvider(buildOpenAIProvider());
     api.registerProvider(buildOpenAICodexProviderPlugin());
     api.registerSpeechProvider(buildOpenAISpeechProvider());
+    api.registerRealtimeTranscriptionProvider(buildOpenAIRealtimeTranscriptionProvider());
+    api.registerRealtimeVoiceProvider(buildOpenAIRealtimeVoiceProvider());
     api.registerMediaUnderstandingProvider(openaiMediaUnderstandingProvider);
     api.registerMediaUnderstandingProvider(openaiCodexMediaUnderstandingProvider);
     api.registerImageGenerationProvider(buildOpenAIImageGenerationProvider());
diff --git a/extensions/openai/openai-codex-provider.test.ts b/extensions/openai/openai-codex-provider.test.ts
index e2768dd96d3..3f43317807b 100644
--- a/extensions/openai/openai-codex-provider.test.ts
+++ b/extensions/openai/openai-codex-provider.test.ts
@@ -103,16 +103,16 @@ describe("openai codex provider", () => {
               api: "openai-codex-responses",
               baseUrl: "https://chatgpt.com/backend-api",
               reasoning: true,
-              input: ["text", "image"],
+              input: ["text", "image"] as const,
               cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
               contextWindow: 272_000,
               maxTokens: 128_000,
             };
           }
-          return null;
-        },
+          return undefined;
+        }),
       } as never,
-    } as never);
+    });
 
     expect(model).toMatchObject({
       id: "gpt-5.4",
@@ -173,7 +173,7 @@ describe("openai codex provider", () => {
           contextWindow: 272_000,
         },
       ],
-    });
+    } as never);
 
     expect(entries).toContainEqual(
       expect.objectContaining({
diff --git a/extensions/openai/openclaw.plugin.json b/extensions/openai/openclaw.plugin.json
index 76cf35db0fb..17ad0e97879 100644
--- a/extensions/openai/openclaw.plugin.json
+++ b/extensions/openai/openclaw.plugin.json
@@ -34,6 +34,8 @@
   ],
   "contracts": {
     "speechProviders": ["openai"],
+    "realtimeTranscriptionProviders": ["openai"],
+    "realtimeVoiceProviders": ["openai"],
     "mediaUnderstandingProviders": ["openai", "openai-codex"],
     "imageGenerationProviders": ["openai"]
   },
diff --git a/extensions/openai/package.json b/extensions/openai/package.json
index 327c8a34760..2f5645ac179 100644
--- a/extensions/openai/package.json
+++ b/extensions/openai/package.json
@@ -4,6 +4,9 @@
   "private": true,
   "description": "OpenClaw OpenAI provider plugins",
   "type": "module",
+  "dependencies": {
+    "ws": "^8.20.0"
+  },
   "openclaw": {
     "extensions": [
       "./index.ts"
diff --git a/extensions/openai/realtime-transcription-provider.test.ts b/extensions/openai/realtime-transcription-provider.test.ts
new file mode 100644
index 00000000000..214b4908cf5
--- /dev/null
+++ b/extensions/openai/realtime-transcription-provider.test.ts
@@ -0,0 +1,27 @@
+import { describe, expect, it } from "vitest";
+import { buildOpenAIRealtimeTranscriptionProvider } from "./realtime-transcription-provider.js";
+
+describe("buildOpenAIRealtimeTranscriptionProvider", () => {
+  it("normalizes OpenAI config defaults", () => {
+    // Nested providers.openai settings should come back as a flat provider config.
+    const rawConfig = {
+      providers: {
+        openai: {
+          apiKey: "sk-test", // pragma: allowlist secret
+        },
+      },
+    };
+    const plugin = buildOpenAIRealtimeTranscriptionProvider();
+    const normalized = plugin.resolveConfig?.({ cfg: {} as never, rawConfig });
+
+    // toEqual treats undefined-valued keys as absent, so unset fields are not listed.
+    expect(normalized).toEqual({
+      apiKey: "sk-test",
+    });
+  });
+
+  it("accepts the legacy openai-realtime alias", () => {
+    const { aliases } = buildOpenAIRealtimeTranscriptionProvider();
+    expect(aliases).toContain("openai-realtime");
+  });
+});
diff --git a/extensions/openai/realtime-transcription-provider.ts b/extensions/openai/realtime-transcription-provider.ts
new file mode 100644
index 00000000000..d4fd8d09350
--- /dev/null
+++ b/extensions/openai/realtime-transcription-provider.ts
@@ -0,0 +1,267 @@
+import type {
+  RealtimeTranscriptionProviderConfig,
+  RealtimeTranscriptionProviderPlugin,
+  RealtimeTranscriptionSession,
+  RealtimeTranscriptionSessionCreateRequest,
+} from "openclaw/plugin-sdk/realtime-transcription";
+import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input";
+import WebSocket from "ws";
+
+type OpenAIRealtimeTranscriptionProviderConfig = { // shape returned by normalizeProviderConfig
+  apiKey?: string;
+  model?: string;
+  silenceDurationMs?: number;
+  vadThreshold?: number;
+};
+
+type OpenAIRealtimeTranscriptionSessionConfig = RealtimeTranscriptionSessionCreateRequest & { // request + required settings
+  apiKey: string;
+  model: string;
+  silenceDurationMs: number;
+  vadThreshold: number;
+};
+
+type RealtimeEvent = { // parsed websocket message; only `type` is always present
+  type: string;
+  delta?: string;
+  transcript?: string;
+  error?: unknown;
+};
+
+// Lenient coercions for untyped config values: anything unusable becomes undefined.
+function trimToUndefined(value: unknown): string | undefined {
+  return typeof value === "string" ? value.trim() || undefined : undefined;
+}
+
+function asNumber(value: unknown): number | undefined {
+  return typeof value !== "number" || !Number.isFinite(value) ? undefined : value;
+}
+
+function asObject(value: unknown): Record<string, unknown> | undefined {
+  const isRecordLike = typeof value === "object" && value !== null && !Array.isArray(value);
+  return isRecordLike ? (value as Record<string, unknown>) : undefined;
+}
+
+function normalizeProviderConfig(
+  config: RealtimeTranscriptionProviderConfig,
+): OpenAIRealtimeTranscriptionProviderConfig {
+  // Accept a nested providers.openai block, a top-level openai block, or a flat config.
+  const nested = asObject(asObject(config.providers)?.openai);
+  const source = nested ?? asObject(config.openai) ?? asObject(config);
+  // The openaiApiKey lookup only runs when apiKey yields nothing.
+  const apiKey =
+    normalizeResolvedSecretInputString({
+      value: source?.apiKey,
+      path: "plugins.entries.voice-call.config.streaming.providers.openai.apiKey",
+    }) ??
+    normalizeResolvedSecretInputString({
+      value: source?.openaiApiKey,
+      path: "plugins.entries.voice-call.config.streaming.openaiApiKey",
+    });
+  const model = trimToUndefined(source?.model) ?? trimToUndefined(source?.sttModel);
+  const silenceDurationMs = asNumber(source?.silenceDurationMs);
+  return { apiKey, model, silenceDurationMs, vadThreshold: asNumber(source?.vadThreshold) };
+}
+
+function readProviderConfig(
+  providerConfig: RealtimeTranscriptionProviderConfig,
+): OpenAIRealtimeTranscriptionProviderConfig {
+  return normalizeProviderConfig(providerConfig); // same normalization that resolveConfig applies
+}
+
+/** OpenAI realtime transcription session with bounded, backoff-based reconnects. */
+class OpenAIRealtimeTranscriptionSession implements RealtimeTranscriptionSession {
+  private static readonly MAX_RECONNECT_ATTEMPTS = 5;
+  private static readonly RECONNECT_DELAY_MS = 1000;
+  private static readonly CONNECT_TIMEOUT_MS = 10_000;
+
+  private ws: WebSocket | null = null;
+  private connected = false;
+  private closed = false;
+  private reconnectAttempts = 0;
+  private pendingTranscript = "";
+
+  constructor(private readonly config: OpenAIRealtimeTranscriptionSessionConfig) {}
+
+  /** Opens the socket; rejects when the first attempt errors, closes early, or times out. */
+  async connect(): Promise<void> {
+    this.closed = false;
+    this.reconnectAttempts = 0;
+    await this.doConnect();
+  }
+
+  /** Sends one audio chunk as base64; chunks are dropped while the socket is not open. */
+  sendAudio(audio: Buffer): void {
+    if (this.ws?.readyState === WebSocket.OPEN) {
+      this.sendEvent({ type: "input_audio_buffer.append", audio: audio.toString("base64") });
+    }
+  }
+
+  /** Ends the session for good: closes the socket and disables reconnects. */
+  close(): void {
+    this.closed = true;
+    this.connected = false;
+    this.ws?.close(1000, "Transcription session closed");
+    this.ws = null;
+  }
+
+  isConnected(): boolean {
+    return this.connected;
+  }
+
+  /** Runs one connection attempt; settles on open, pre-open error/close, or timeout. */
+  private async doConnect(): Promise<void> {
+    await new Promise<void>((resolve, reject) => {
+      const socket = new WebSocket("wss://api.openai.com/v1/realtime?intent=transcription", {
+        headers: {
+          Authorization: `Bearer ${this.config.apiKey}`,
+          "OpenAI-Beta": "realtime=v1",
+        },
+      });
+      this.ws = socket;
+      let opened = false; // per-socket; `this.connected` is shared across attempts
+
+      const connectTimeout = setTimeout(() => {
+        reject(new Error("OpenAI realtime transcription connection timeout"));
+        socket.terminate(); // a stalled socket must not open (and go live) later
+      }, OpenAIRealtimeTranscriptionSession.CONNECT_TIMEOUT_MS);
+
+      socket.on("open", () => {
+        clearTimeout(connectTimeout);
+        opened = true;
+        this.connected = true;
+        this.reconnectAttempts = 0;
+        this.sendEvent({
+          type: "transcription_session.update",
+          session: {
+            input_audio_format: "g711_ulaw",
+            input_audio_transcription: { model: this.config.model },
+            turn_detection: {
+              type: "server_vad",
+              threshold: this.config.vadThreshold,
+              prefix_padding_ms: 300,
+              silence_duration_ms: this.config.silenceDurationMs,
+            },
+          },
+        });
+        resolve();
+      });
+
+      socket.on("message", (data: Buffer) => {
+        try {
+          this.handleEvent(JSON.parse(data.toString()) as RealtimeEvent);
+        } catch (error) {
+          this.config.onError?.(error instanceof Error ? error : new Error(String(error)));
+        }
+      });
+
+      socket.on("error", (error) => {
+        if (!opened) {
+          clearTimeout(connectTimeout);
+          reject(error);
+          return;
+        }
+        this.config.onError?.(error instanceof Error ? error : new Error(String(error)));
+      });
+
+      socket.on("close", () => {
+        clearTimeout(connectTimeout);
+        if (!opened) {
+          // Failed attempt: the rejection (a no-op if already settled) drives any retry.
+          reject(new Error("OpenAI realtime transcription socket closed before open"));
+        } else if (!this.closed && this.ws === socket) {
+          this.connected = false;
+          void this.attemptReconnect();
+        }
+      });
+    });
+  }
+
+  /** Retries with exponential backoff until connected, closed, or out of attempts. */
+  private async attemptReconnect(): Promise<void> {
+    if (this.closed) {
+      return;
+    }
+    if (this.reconnectAttempts >= OpenAIRealtimeTranscriptionSession.MAX_RECONNECT_ATTEMPTS) {
+      this.config.onError?.(new Error("OpenAI realtime transcription reconnect limit reached"));
+      return;
+    }
+    this.reconnectAttempts += 1;
+    const delay =
+      OpenAIRealtimeTranscriptionSession.RECONNECT_DELAY_MS * 2 ** (this.reconnectAttempts - 1);
+    await new Promise((resolve) => setTimeout(resolve, delay));
+    if (this.closed) {
+      return;
+    }
+    try {
+      await this.doConnect();
+    } catch (error) {
+      this.config.onError?.(error instanceof Error ? error : new Error(String(error)));
+      await this.attemptReconnect();
+    }
+  }
+
+  /** Maps server events onto the request callbacks; unknown event types are ignored. */
+  private handleEvent(event: RealtimeEvent): void {
+    switch (event.type) {
+      case "conversation.item.input_audio_transcription.delta":
+        if (event.delta) {
+          this.pendingTranscript += event.delta;
+          this.config.onPartial?.(this.pendingTranscript);
+        }
+        return;
+      case "conversation.item.input_audio_transcription.completed":
+        if (event.transcript) {
+          this.config.onTranscript?.(event.transcript);
+        }
+        this.pendingTranscript = "";
+        return;
+      case "input_audio_buffer.speech_started":
+        this.pendingTranscript = "";
+        this.config.onSpeechStart?.();
+        return;
+      case "error": {
+        const detail =
+          event.error && typeof event.error === "object" && "message" in event.error
+            ? String((event.error as { message?: unknown }).message ?? "Unknown error")
+            : event.error
+              ? String(event.error)
+              : "Unknown error";
+        this.config.onError?.(new Error(detail));
+        return;
+      }
+    }
+  }
+
+  private sendEvent(event: unknown): void {
+    if (this.ws?.readyState === WebSocket.OPEN) {
+      this.ws.send(JSON.stringify(event));
+    }
+  }
+}
+
+/** Builds the OpenAI realtime-transcription provider plugin (legacy alias: openai-realtime). */
+export function buildOpenAIRealtimeTranscriptionProvider(): RealtimeTranscriptionProviderPlugin {
+  return {
+    id: "openai",
+    label: "OpenAI Realtime Transcription",
+    aliases: ["openai-realtime"],
+    autoSelectOrder: 10,
+    resolveConfig: ({ rawConfig }) => normalizeProviderConfig(rawConfig),
+    // A configured key takes precedence; OPENAI_API_KEY is the fallback.
+    isConfigured: ({ providerConfig }) =>
+      Boolean(readProviderConfig(providerConfig).apiKey || process.env.OPENAI_API_KEY),
+    createSession: (req) => {
+      const settings = readProviderConfig(req.providerConfig);
+      const apiKey = settings.apiKey || process.env.OPENAI_API_KEY;
+      if (!apiKey) {
+        throw new Error("OpenAI API key missing");
+      }
+      const model = settings.model ?? "gpt-4o-transcribe";
+      const silenceDurationMs = settings.silenceDurationMs ?? 800;
+      const vadThreshold = settings.vadThreshold ?? 0.5;
+      const sessionConfig = { ...req, apiKey, model, silenceDurationMs, vadThreshold };
+      return new OpenAIRealtimeTranscriptionSession(sessionConfig);
+    },
+  };
+}
diff --git a/extensions/openai/realtime-voice-provider.ts b/extensions/openai/realtime-voice-provider.ts
new file mode 100644
index 00000000000..2afb8dcfc40
--- /dev/null
+++ b/extensions/openai/realtime-voice-provider.ts
@@ -0,0 +1,535 @@
+import type {
+  RealtimeVoiceBridge,
+  RealtimeVoiceBridgeCreateRequest,
+  RealtimeVoiceCloseReason,
+  RealtimeVoiceProviderConfig,
+  RealtimeVoiceProviderPlugin,
+  RealtimeVoiceTool,
+} from "openclaw/plugin-sdk/realtime-voice";
+import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input";
+import WebSocket from "ws";
+
+/** Voice ids accepted for the `voice` provider setting. */
+export type OpenAIRealtimeVoice =
+  | "alloy"
+  | "ash"
+  | "ballad"
+  | "cedar"
+  | "coral"
+  | "echo"
+  | "marin"
+  | "sage"
+  | "shimmer"
+  | "verse";
+
+/**
+ * Normalized OpenAI provider settings; every field is optional at this stage.
+ */
+type OpenAIRealtimeVoiceProviderConfig = {
+  apiKey?: string;
+  model?: string;
+  voice?: OpenAIRealtimeVoice;
+  temperature?: number;
+  vadThreshold?: number;
+  silenceDurationMs?: number;
+  prefixPaddingMs?: number;
+  azureEndpoint?: string;
+  azureDeployment?: string;
+  azureApiVersion?: string;
+};
+
+/**
+ * Constructor input for the bridge: the generic create request combined with the provider
+ * settings, except that `apiKey` is required.
+ */
+type OpenAIRealtimeVoiceBridgeConfig = RealtimeVoiceBridgeCreateRequest &
+  Omit<OpenAIRealtimeVoiceProviderConfig, "apiKey"> & { apiKey: string };
+
+/** Loosely typed realtime server event; only the fields declared here are read without a cast. */
+type RealtimeEvent = {
+  type: string;
+  delta?: string;
+  transcript?: string;
+  item_id?: string;
+  call_id?: string;
+  name?: string;
+  error?: unknown;
+};
+
+/** Shape of the `session.update` client event built by `sendSessionUpdate`. */
+type RealtimeSessionUpdate = {
+  type: "session.update";
+  session: {
+    modalities: string[];
+    instructions?: string;
+    voice: OpenAIRealtimeVoice;
+    input_audio_format: string;
+    output_audio_format: string;
+    turn_detection: {
+      type: "server_vad";
+      threshold: number;
+      prefix_padding_ms: number;
+      silence_duration_ms: number;
+      create_response: boolean;
+    };
+    temperature: number;
+    input_audio_transcription?: { model: string };
+    tools?: RealtimeVoiceTool[];
+    tool_choice?: string;
+  };
+};
+
+function trimToUndefined(value: unknown): string | undefined {
+  return (typeof value === "string" && value.trim()) || undefined; // blank or non-string
+}
+
+function asNumber(value: unknown): number | undefined {
+  return Number.isFinite(value) ? (value as number) : undefined; // NaN, Infinity, non-numbers
+}
+
+/** Returns `value` when it is a non-null, non-array object; otherwise undefined. */
+function asObject(value: unknown): Record<string, unknown> | undefined {
+  const isRecord = typeof value === "object" && value !== null && !Array.isArray(value);
+  return isRecord ? (value as Record<string, unknown>) : undefined;
+}
+
+function normalizeProviderConfig(
+  config: RealtimeVoiceProviderConfig,
+): OpenAIRealtimeVoiceProviderConfig {
+  const providers = asObject(config.providers); // checked first, then config.openai, then root
+  const raw = asObject(providers?.openai) ?? asObject(config.openai) ?? asObject(config);
+  return {
+    apiKey: normalizeResolvedSecretInputString({
+      value: raw?.apiKey,
+      path: "plugins.entries.voice-call.config.realtime.providers.openai.apiKey",
+    }),
+    model: trimToUndefined(raw?.model),
+    voice: trimToUndefined(raw?.voice) as OpenAIRealtimeVoice | undefined, // non-strings dropped
+    temperature: asNumber(raw?.temperature),
+    vadThreshold: asNumber(raw?.vadThreshold),
+    silenceDurationMs: asNumber(raw?.silenceDurationMs),
+    prefixPaddingMs: asNumber(raw?.prefixPaddingMs),
+    azureEndpoint: trimToUndefined(raw?.azureEndpoint),
+    azureDeployment: trimToUndefined(raw?.azureDeployment),
+    azureApiVersion: trimToUndefined(raw?.azureApiVersion),
+  };
+}
+
+function readProviderConfig(
+  providerConfig: RealtimeVoiceProviderConfig,
+): OpenAIRealtimeVoiceProviderConfig {
+  return normalizeProviderConfig(providerConfig); // delegates unchanged to normalizeProviderConfig
+}
+
+function base64ToBuffer(b64: string): Buffer {
+  return Buffer.from(b64, "base64"); // decodes base64 text into raw bytes
+}
+
+/**
+ * Bridges call audio to the OpenAI (or Azure OpenAI) Realtime API over a WebSocket: streams g711
+ * u-law audio, relays transcripts and tool calls to callbacks, and reconnects with backoff.
+ */
+class OpenAIRealtimeVoiceBridge implements RealtimeVoiceBridge {
+  private static readonly DEFAULT_MODEL = "gpt-realtime";
+  private static readonly MAX_RECONNECT_ATTEMPTS = 5;
+  private static readonly BASE_RECONNECT_DELAY_MS = 1000;
+  private static readonly CONNECT_TIMEOUT_MS = 10_000;
+  /** Cap on audio chunks buffered while no socket is open. */
+  private static readonly MAX_PENDING_AUDIO_CHUNKS = 320;
+
+  private ws: WebSocket | null = null;
+  private connected = false;
+  private intentionallyClosed = false;
+  private reconnectAttempts = 0;
+  private pendingAudio: Buffer[] = [];
+  private markQueue: string[] = [];
+  private responseStartTimestamp: number | null = null;
+  private latestMediaTimestamp = 0;
+  private lastAssistantItemId: string | null = null;
+  private toolCallBuffers = new Map<string, { name: string; callId: string; args: string }>();
+
+  constructor(private readonly config: OpenAIRealtimeVoiceBridgeConfig) {}
+
+  async connect(): Promise<void> {
+    this.intentionallyClosed = false;
+    this.reconnectAttempts = 0;
+    await this.doConnect();
+  }
+
+  /** Sends one audio chunk, or buffers it (bounded) until the socket is open. */
+  sendAudio(audio: Buffer): void {
+    if (!this.connected || this.ws?.readyState !== WebSocket.OPEN) {
+      if (this.pendingAudio.length < OpenAIRealtimeVoiceBridge.MAX_PENDING_AUDIO_CHUNKS) {
+        this.pendingAudio.push(audio);
+      }
+      return;
+    }
+    this.sendEvent({ type: "input_audio_buffer.append", audio: audio.toString("base64") });
+  }
+
+  setMediaTimestamp(ts: number): void {
+    this.latestMediaTimestamp = ts;
+  }
+
+  /** Adds a user text message to the conversation and requests a response. */
+  sendUserMessage(text: string): void {
+    this.sendEvent({
+      type: "conversation.item.create",
+      item: { type: "message", role: "user", content: [{ type: "input_text", text }] },
+    });
+    this.sendEvent({ type: "response.create" });
+  }
+
+  /** Requests an opening response; does nothing while disconnected. */
+  triggerGreeting(instructions?: string): void {
+    if (!this.connected || !this.ws) {
+      return;
+    }
+    this.sendEvent({
+      type: "response.create",
+      response: { instructions: instructions ?? this.config.instructions },
+    });
+  }
+
+  /** Sends a tool result for `callId` and requests a follow-up response. */
+  submitToolResult(callId: string, result: unknown): void {
+    this.sendEvent({
+      type: "conversation.item.create",
+      item: { type: "function_call_output", call_id: callId, output: JSON.stringify(result) },
+    });
+    this.sendEvent({ type: "response.create" });
+  }
+
+  /** Drops the oldest pending mark and resets playback tracking once none remain. */
+  acknowledgeMark(): void {
+    if (this.markQueue.length === 0) {
+      return;
+    }
+    this.markQueue.shift();
+    if (this.markQueue.length === 0) {
+      this.responseStartTimestamp = null;
+      this.lastAssistantItemId = null;
+    }
+  }
+
+  /** Closes the socket on purpose, which also stops reconnect attempts. */
+  close(): void {
+    this.intentionallyClosed = true;
+    this.connected = false;
+    if (this.ws) {
+      this.ws.close(1000, "Bridge closed");
+      this.ws = null;
+    }
+  }
+
+  isConnected(): boolean {
+    return this.connected;
+  }
+
+  /**
+   * Opens one socket and resolves once it is open. A failure before open (error, early close,
+   * or timeout) rejects, is reported via `onError` once, and terminates the socket. Unless the
+   * bridge was closed on purpose, every socket close schedules the next reconnect attempt.
+   */
+  private async doConnect(): Promise<void> {
+    await new Promise<void>((resolve, reject) => {
+      const { url, headers } = this.resolveConnectionParams();
+      const socket = new WebSocket(url, { headers });
+      this.ws = socket;
+      let opened = false;
+      let settled = false;
+      // Fails the pending connect at most once and releases the half-open socket.
+      const fail = (error: Error): void => {
+        if (settled) {
+          return;
+        }
+        settled = true;
+        clearTimeout(connectTimeout);
+        reject(error);
+        socket.terminate();
+        this.config.onError?.(error);
+      };
+      const connectTimeout = setTimeout(() => {
+        fail(new Error("OpenAI realtime connection timeout"));
+      }, OpenAIRealtimeVoiceBridge.CONNECT_TIMEOUT_MS);
+
+      socket.on("open", () => {
+        clearTimeout(connectTimeout);
+        opened = true;
+        settled = true;
+        this.connected = true;
+        this.reconnectAttempts = 0;
+        this.sendSessionUpdate();
+        for (const chunk of this.pendingAudio.splice(0)) {
+          this.sendAudio(chunk);
+        }
+        this.config.onReady?.();
+        resolve();
+      });
+
+      socket.on("message", (data: Buffer) => {
+        try {
+          this.handleEvent(JSON.parse(data.toString()) as RealtimeEvent);
+        } catch (error) {
+          console.error("[openai] realtime event parse failed:", error);
+        }
+      });
+
+      socket.on("error", (error) => {
+        const normalized = error instanceof Error ? error : new Error(String(error));
+        if (opened) {
+          this.config.onError?.(normalized);
+        } else {
+          fail(normalized);
+        }
+      });
+
+      socket.on("close", () => {
+        this.connected = false;
+        if (this.intentionallyClosed) {
+          this.config.onClose?.("completed");
+          return;
+        }
+        // A close that arrives before "open" without an error event is still a failed connect.
+        fail(new Error("OpenAI realtime connection closed before open"));
+        void this.attemptReconnect();
+      });
+    });
+  }
+
+  /** Builds the socket URL and auth headers: Azure deployment, bare endpoint, or api.openai.com. */
+  private resolveConnectionParams(): { url: string; headers: Record<string, string> } {
+    const cfg = this.config;
+    const model = encodeURIComponent(cfg.model ?? OpenAIRealtimeVoiceBridge.DEFAULT_MODEL);
+    // Drop trailing slashes and map http(s) to ws(s) so the endpoint is usable as a socket URL.
+    const base = (cfg.azureEndpoint ?? "")
+      .replace(/\/+$/, "")
+      .replace(/^http(s?):/, (_, secure: string) => `ws${secure}:`);
+    if (cfg.azureEndpoint && cfg.azureDeployment) {
+      const apiVersion = encodeURIComponent(cfg.azureApiVersion ?? "2024-10-01-preview");
+      const deployment = encodeURIComponent(cfg.azureDeployment);
+      return {
+        url: `${base}/openai/realtime?api-version=${apiVersion}&deployment=${deployment}`,
+        headers: { "api-key": cfg.apiKey },
+      };
+    }
+    if (cfg.azureEndpoint) {
+      return {
+        url: `${base}/v1/realtime?model=${model}`,
+        headers: { Authorization: `Bearer ${cfg.apiKey}` },
+      };
+    }
+    return {
+      url: `wss://api.openai.com/v1/realtime?model=${model}`,
+      headers: { Authorization: `Bearer ${cfg.apiKey}`, "OpenAI-Beta": "realtime=v1" },
+    };
+  }
+
+  /** Makes one reconnect attempt after an exponential-backoff delay; gives up at the limit. */
+  private async attemptReconnect(): Promise<void> {
+    if (this.intentionallyClosed) {
+      return;
+    }
+    if (this.reconnectAttempts >= OpenAIRealtimeVoiceBridge.MAX_RECONNECT_ATTEMPTS) {
+      this.config.onClose?.("error");
+      return;
+    }
+    this.reconnectAttempts += 1;
+    const delay =
+      OpenAIRealtimeVoiceBridge.BASE_RECONNECT_DELAY_MS * 2 ** (this.reconnectAttempts - 1);
+    await new Promise((resolve) => setTimeout(resolve, delay));
+    if (this.intentionallyClosed) {
+      return;
+    }
+    try {
+      await this.doConnect();
+    } catch {
+      // doConnect reported it; the failed socket's "close" handler schedules the next attempt.
+    }
+  }
+
+  /** Sends the session.update event with audio formats, VAD settings, voice, and tools. */
+  private sendSessionUpdate(): void {
+    const cfg = this.config;
+    const sessionUpdate: RealtimeSessionUpdate = {
+      type: "session.update",
+      session: {
+        modalities: ["text", "audio"],
+        instructions: cfg.instructions,
+        voice: cfg.voice ?? "alloy",
+        input_audio_format: "g711_ulaw",
+        output_audio_format: "g711_ulaw",
+        input_audio_transcription: { model: "whisper-1" },
+        turn_detection: {
+          type: "server_vad",
+          threshold: cfg.vadThreshold ?? 0.5,
+          prefix_padding_ms: cfg.prefixPaddingMs ?? 300,
+          silence_duration_ms: cfg.silenceDurationMs ?? 500,
+          create_response: true,
+        },
+        temperature: cfg.temperature ?? 0.8,
+        ...(cfg.tools && cfg.tools.length > 0 ? { tools: cfg.tools, tool_choice: "auto" } : {}),
+      },
+    };
+    this.sendEvent(sessionUpdate);
+  }
+
+  /** Routes one server event to the matching callback; unknown event types are ignored. */
+  private handleEvent(event: RealtimeEvent): void {
+    switch (event.type) {
+      case "response.audio.delta": {
+        if (!event.delta) {
+          return;
+        }
+        this.config.onAudio(base64ToBuffer(event.delta));
+        if (this.responseStartTimestamp === null) {
+          this.responseStartTimestamp = this.latestMediaTimestamp;
+        }
+        if (event.item_id) {
+          this.lastAssistantItemId = event.item_id;
+        }
+        this.sendMark();
+        return;
+      }
+
+      case "input_audio_buffer.speech_started":
+        this.handleBargeIn();
+        return;
+
+      case "response.audio_transcript.delta":
+      case "conversation.item.input_audio_transcription.delta":
+        if (event.delta) {
+          const role = event.type === "response.audio_transcript.delta" ? "assistant" : "user";
+          this.config.onTranscript?.(role, event.delta, false);
+        }
+        return;
+
+      case "response.audio_transcript.done":
+      case "conversation.item.input_audio_transcription.completed":
+        if (event.transcript) {
+          const role = event.type === "response.audio_transcript.done" ? "assistant" : "user";
+          this.config.onTranscript?.(role, event.transcript, true);
+        }
+        return;
+
+      case "response.function_call_arguments.delta": {
+        const key = event.item_id ?? "unknown";
+        const existing = this.toolCallBuffers.get(key);
+        if (existing) {
+          // Append only: an empty delta must not replace the arguments buffered so far.
+          existing.args += event.delta ?? "";
+        } else if (event.item_id) {
+          this.toolCallBuffers.set(event.item_id, {
+            name: event.name ?? "",
+            callId: event.call_id ?? "",
+            args: event.delta ?? "",
+          });
+        }
+        return;
+      }
+
+      case "response.function_call_arguments.done": {
+        const key = event.item_id ?? "unknown";
+        const buffered = this.toolCallBuffers.get(key);
+        if (this.config.onToolCall) {
+          const finalArgs = (event as unknown as Record<string, unknown>).arguments;
+          const rawArgs = buffered?.args || (typeof finalArgs === "string" && finalArgs) || "{}";
+          let args: unknown = {};
+          try {
+            args = JSON.parse(rawArgs);
+          } catch {
+            // Best effort: malformed argument JSON is delivered as an empty object.
+          }
+          this.config.onToolCall({
+            itemId: key,
+            callId: buffered?.callId || event.call_id || "",
+            name: buffered?.name || event.name || "",
+            args,
+          });
+        }
+        this.toolCallBuffers.delete(key);
+        return;
+      }
+
+      case "error": {
+        const detail =
+          event.error && typeof event.error === "object" && "message" in event.error
+            ? String((event.error as { message?: unknown }).message ?? "Unknown error")
+            : event.error
+              ? String(event.error)
+              : "Unknown error";
+        this.config.onError?.(new Error(detail));
+        return;
+      }
+
+      default:
+        return;
+    }
+  }
+
+  /** On caller speech: truncates any in-flight assistant item and clears queued audio. */
+  private handleBargeIn(): void {
+    if (this.markQueue.length > 0 && this.responseStartTimestamp !== null) {
+      const elapsedMs = this.latestMediaTimestamp - this.responseStartTimestamp;
+      if (this.lastAssistantItemId) {
+        this.sendEvent({
+          type: "conversation.item.truncate",
+          item_id: this.lastAssistantItemId,
+          content_index: 0,
+          audio_end_ms: Math.max(0, elapsedMs),
+        });
+      }
+      this.config.onClearAudio();
+      this.markQueue = [];
+      this.lastAssistantItemId = null;
+      this.responseStartTimestamp = null;
+      return;
+    }
+    this.config.onClearAudio();
+  }
+
+  private sendMark(): void {
+    const markName = `audio-${Date.now()}`;
+    this.markQueue.push(markName);
+    this.config.onMark?.(markName);
+  }
+
+  private sendEvent(event: unknown): void {
+    if (this.ws?.readyState === WebSocket.OPEN) {
+      this.ws.send(JSON.stringify(event));
+    }
+  }
+}
+
+/**
+ * Builds the OpenAI realtime voice provider plugin.
+ *
+ * The API key comes from the provider config, falling back to the OPENAI_API_KEY environment
+ * variable; `createBridge` throws when neither source supplies one.
+ */
+export function buildOpenAIRealtimeVoiceProvider(): RealtimeVoiceProviderPlugin {
+  return {
+    id: "openai",
+    label: "OpenAI Realtime Voice",
+    autoSelectOrder: 10,
+    resolveConfig: ({ rawConfig }) => normalizeProviderConfig(rawConfig),
+    isConfigured: ({ providerConfig }) =>
+      Boolean(readProviderConfig(providerConfig).apiKey || process.env.OPENAI_API_KEY),
+    createBridge: (req) => {
+      const settings = readProviderConfig(req.providerConfig);
+      const apiKey = settings.apiKey || process.env.OPENAI_API_KEY;
+      if (!apiKey) {
+        throw new Error("OpenAI API key missing");
+      }
+      // The normalized settings carry exactly the bridge fields, so spreading them after the
+      // request overrides same-named request fields just like listing each one would.
+      return new OpenAIRealtimeVoiceBridge({
+        ...req,
+        ...settings,
+        apiKey,
+      });
+    },
+  };
+}
+
+export type { OpenAIRealtimeVoiceProviderConfig };
diff --git a/extensions/openai/test-api.ts b/extensions/openai/test-api.ts
index 570203d08a4..50b6e81e18a 100644
--- a/extensions/openai/test-api.ts
+++ b/extensions/openai/test-api.ts
@@ -4,4 +4,6 @@ export {
   openaiCodexMediaUnderstandingProvider,
   openaiMediaUnderstandingProvider,
 } from "./media-understanding-provider.js";
+export { buildOpenAIRealtimeTranscriptionProvider } from "./realtime-transcription-provider.js";
+export { buildOpenAIRealtimeVoiceProvider } from "./realtime-voice-provider.js";
 export { buildOpenAISpeechProvider } from "./speech-provider.js";
diff --git a/extensions/openrouter/test-api.ts b/extensions/openrouter/test-api.ts
new file mode 100644
index 00000000000..117d8547bb8
--- /dev/null
+++ b/extensions/openrouter/test-api.ts
@@ -0,0 +1 @@
+export { openrouterMediaUnderstandingProvider } from "./media-understanding-provider.js";
diff --git a/extensions/voice-call/README.md b/extensions/voice-call/README.md
index 1bffa9539cd..a2dd7eba40a 100644
--- a/extensions/voice-call/README.md
+++ b/extensions/voice-call/README.md
@@ -145,4 +145,4 @@ Actions:
 - While a Twilio stream is active, playback does not fall back to TwiML `<Say>`; stream-TTS failures fail the playback request.
 - Outbound conversation calls suppress barge-in only while the initial greeting is actively speaking, then re-enable normal interruption.
 - Twilio stream disconnect auto-end uses a short grace window so quick reconnects do not end the call.
-- Media streaming requires `ws` and OpenAI Realtime API key.
+- Media streaming requires `ws` plus a configured realtime-transcription provider. The bundled provider today is OpenAI.
diff --git a/extensions/voice-call/index.ts b/extensions/voice-call/index.ts
index f3386c3c042..fc667698586 100644
--- a/extensions/voice-call/index.ts
+++ b/extensions/voice-call/index.ts
@@ -72,13 +72,25 @@ const voiceCallConfigSchema = {
       advanced: true,
     },
     "streaming.enabled": { label: "Enable Streaming", advanced: true },
-    "streaming.openaiApiKey": {
+    "streaming.provider": { label: "Streaming Provider", advanced: true },
+    "streaming.providers.openai.apiKey": {
       label: "OpenAI Realtime API Key",
       sensitive: true,
       advanced: true,
     },
-    "streaming.sttModel": { label: "Realtime STT Model", advanced: true },
+    "streaming.providers.openai.model": { label: "Realtime STT Model", advanced: true },
     "streaming.streamPath": { label: "Media Stream Path", advanced: true },
+    "realtime.enabled": { label: "Enable Realtime Voice", advanced: true },
+    "realtime.provider": { label: "Realtime Voice Provider", advanced: true },
+    "realtime.streamPath": { label: "Realtime Stream Path", advanced: true },
+    "realtime.instructions": { label: "Realtime Instructions", advanced: true },
+    "realtime.providers.openai.apiKey": {
+      label: "OpenAI Realtime API Key",
+      sensitive: true,
+      advanced: true,
+    },
+    "realtime.providers.openai.model": { label: "OpenAI Realtime Model", advanced: true },
+    "realtime.providers.openai.voice": { label: "OpenAI Realtime Voice", advanced: true },
     "tts.provider": {
       label: "TTS Provider Override",
       help: "Deep-merges with messages.tts (Microsoft is ignored for calls).",
@@ -181,6 +193,7 @@ export default definePluginEntry({
         runtimePromise = createVoiceCallRuntime({
           config,
           coreConfig: api.config as CoreConfig,
+          fullConfig: api.config,
           agentRuntime: api.runtime.agent,
           ttsRuntime: api.runtime.tts,
           logger: api.logger,
diff --git a/extensions/voice-call/openclaw.plugin.json b/extensions/voice-call/openclaw.plugin.json
index 0063979b2dc..f0700789bb3 100644
--- a/extensions/voice-call/openclaw.plugin.json
+++ b/extensions/voice-call/openclaw.plugin.json
@@ -86,12 +86,16 @@
       "label": "Enable Streaming",
       "advanced": true
     },
-    "streaming.openaiApiKey": {
+    "streaming.provider": {
+      "label": "Streaming Provider",
+      "advanced": true
+    },
+    "streaming.providers.openai.apiKey": {
       "label": "OpenAI Realtime API Key",
       "sensitive": true,
       "advanced": true
     },
-    "streaming.sttModel": {
+    "streaming.providers.openai.model": {
       "label": "Realtime STT Model",
       "advanced": true
     },
@@ -345,9 +349,11 @@
           "enabled": {
             "type": "boolean"
           },
+          "provider": {
+            "type": "string"
+          },
           "sttProvider": {
-            "type": "string",
-            "enum": ["openai-realtime"]
+            "type": "string"
           },
           "openaiApiKey": {
             "type": "string"
@@ -367,6 +373,13 @@
           "streamPath": {
             "type": "string"
           },
+          "providers": {
+            "type": "object",
+            "additionalProperties": {
+              "type": "object",
+              "additionalProperties": true
+            }
+          },
           "preStartTimeoutMs": {
             "type": "integer",
             "minimum": 1
@@ -385,6 +398,72 @@
           }
         }
       },
+      "realtime": {
+        "type": "object",
+        "additionalProperties": false,
+        "properties": {
+          "enabled": {
+            "type": "boolean"
+          },
+          "provider": {
+            "type": "string"
+          },
+          "streamPath": {
+            "type": "string"
+          },
+          "instructions": {
+            "type": "string"
+          },
+          "tools": {
+            "type": "array",
+            "items": {
+              "type": "object",
+              "additionalProperties": false,
+              "properties": {
+                "type": {
+                  "type": "string",
+                  "enum": ["function"]
+                },
+                "name": {
+                  "type": "string"
+                },
+                "description": {
+                  "type": "string"
+                },
+                "parameters": {
+                  "type": "object",
+                  "additionalProperties": false,
+                  "properties": {
+                    "type": {
+                      "type": "string",
+                      "enum": ["object"]
+                    },
+                    "properties": {
+                      "type": "object",
+                      "additionalProperties": true
+                    },
+                    "required": {
+                      "type": "array",
+                      "items": {
+                        "type": "string"
+                      }
+                    }
+                  },
+                  "required": ["type", "properties"]
+                }
+              },
+              "required": ["type", "name", "description", "parameters"]
+            }
+          },
+          "providers": {
+            "type": "object",
+            "additionalProperties": {
+              "type": "object",
+              "additionalProperties": true
+            }
+          }
+        }
+      },
       "publicUrl": {
         "type": "string"
       },
diff --git a/extensions/voice-call/src/config.test.ts b/extensions/voice-call/src/config.test.ts
index 19db6eb691b..ec268b1c3ca 100644
--- a/extensions/voice-call/src/config.test.ts
+++ b/extensions/voice-call/src/config.test.ts
@@ -179,6 +179,35 @@ describe("validateProviderConfig", () => {
       expect(result.errors).toEqual([]);
     });
   });
+
+  describe("realtime config", () => {
+    it("rejects disabled inbound policy for realtime mode", () => {
+      const config = createBaseConfig("twilio");
+      config.realtime.enabled = true;
+      config.inboundPolicy = "disabled";
+
+      const result = validateProviderConfig(config);
+
+      expect(result.valid).toBe(false);
+      expect(result.errors).toContain(
+        'plugins.entries.voice-call.config.inboundPolicy must not be "disabled" when realtime.enabled is true',
+      );
+    });
+
+    it("rejects enabling realtime and streaming together", () => {
+      const config = createBaseConfig("twilio");
+      config.realtime.enabled = true;
+      config.streaming.enabled = true;
+      config.inboundPolicy = "allowlist";
+
+      const result = validateProviderConfig(config);
+
+      expect(result.valid).toBe(false);
+      expect(result.errors).toContain(
+        "plugins.entries.voice-call.config.realtime.enabled and plugins.entries.voice-call.config.streaming.enabled cannot both be true",
+      );
+    });
+  });
 });
 
 describe("normalizeVoiceCallConfig", () => {
@@ -194,11 +223,25 @@ describe("normalizeVoiceCallConfig", () => {
 
     expect(normalized.serve.path).toBe("/voice/webhook");
     expect(normalized.streaming.streamPath).toBe("/custom-stream");
-    expect(normalized.streaming.sttModel).toBe("gpt-4o-transcribe");
+    expect(normalized.streaming.provider).toBe("openai");
+    expect(normalized.streaming.providers.openai).toEqual({});
+    expect(normalized.realtime.streamPath).toBe("/voice/stream/realtime");
     expect(normalized.tunnel.provider).toBe("none");
     expect(normalized.webhookSecurity.allowedHosts).toEqual([]);
   });
 
+  it("derives the realtime stream path from a custom webhook path", () => {
+    const normalized = normalizeVoiceCallConfig({
+      enabled: true,
+      provider: "twilio",
+      serve: {
+        path: "/custom/webhook",
+      },
+    });
+
+    expect(normalized.realtime.streamPath).toBe("/custom/stream/realtime");
+  });
+
   it("accepts partial nested TTS overrides and preserves nested objects", () => {
     const normalized = normalizeVoiceCallConfig({
       tts: {
diff --git a/extensions/voice-call/src/config.ts b/extensions/voice-call/src/config.ts
index 74077faf0f1..7e2c519a7be 100644
--- a/extensions/voice-call/src/config.ts
+++ b/extensions/voice-call/src/config.ts
@@ -70,7 +70,7 @@ export type PlivoConfig = z.infer<typeof PlivoConfigSchema>;
 
 export const SttConfigSchema = z
   .object({
-    /** STT provider (currently only OpenAI supported) */
+    /** One-shot STT provider for non-streaming paths. */
     provider: z.literal("openai").default("openai"),
     /** Whisper model to use */
     model: z.string().min(1).default("whisper-1"),
@@ -196,25 +196,80 @@ export const OutboundConfigSchema = z
 export type OutboundConfig = z.infer<typeof OutboundConfigSchema>;
 
 // -----------------------------------------------------------------------------
-// Streaming Configuration (OpenAI Realtime STT)
+// Realtime Voice Configuration
+// -----------------------------------------------------------------------------
+
+export const RealtimeToolSchema = z
+  .object({
+    type: z.literal("function"),
+    name: z.string().min(1),
+    description: z.string(), // required, but an empty string is accepted (no min length)
+    parameters: z.object({
+      type: z.literal("object"),
+      properties: z.record(z.string(), z.unknown()),
+      required: z.array(z.string()).optional(),
+    }),
+  })
+  .strict(); // applies to the outer object only; `parameters` is a plain z.object
+export type RealtimeToolConfig = z.infer<typeof RealtimeToolSchema>;
+
+export const VoiceCallRealtimeProvidersConfigSchema = z
+  .record(z.string(), z.record(z.string(), z.unknown())) // outer key: provider id
+  .default({});
+export type VoiceCallRealtimeProvidersConfig = z.infer<
+  typeof VoiceCallRealtimeProvidersConfigSchema
+>;
+
+export const VoiceCallStreamingProvidersConfigSchema = z
+  .record(z.string(), z.record(z.string(), z.unknown())) // outer key: provider id
+  .default({});
+export type VoiceCallStreamingProvidersConfig = z.infer<
+  typeof VoiceCallStreamingProvidersConfigSchema
+>;
+
+export const VoiceCallRealtimeConfigSchema = z
+  .object({
+    /** Enable realtime voice-to-voice mode. */
+    enabled: z.boolean().default(false),
+    /** Provider id from registered realtime voice providers. */
+    provider: z.string().min(1).optional(),
+    /** Optional override for the local WebSocket route path. */
+    streamPath: z.string().min(1).optional(),
+    /** System instructions passed to the realtime provider. */
+    instructions: z.string().optional(),
+    /** Tool definitions exposed to the realtime provider. */
+    tools: z.array(RealtimeToolSchema).default([]),
+    /** Provider-owned raw config blobs keyed by provider id. */
+    providers: VoiceCallRealtimeProvidersConfigSchema,
+  })
+  .strict()
+  .default({ enabled: false, tools: [], providers: {} }); // same values as the field defaults
+export type VoiceCallRealtimeConfig = z.infer<typeof VoiceCallRealtimeConfigSchema>;
+
+// -----------------------------------------------------------------------------
+// Streaming Configuration (Realtime Transcription)
 // -----------------------------------------------------------------------------
 
 export const VoiceCallStreamingConfigSchema = z
   .object({
     /** Enable real-time audio streaming (requires WebSocket support) */
     enabled: z.boolean().default(false),
-    /** STT provider for real-time transcription */
-    sttProvider: z.enum(["openai-realtime"]).default("openai-realtime"),
-    /** OpenAI API key for Realtime API (uses OPENAI_API_KEY env if not set) */
+    /** Provider id from registered realtime transcription providers. */
+    provider: z.string().min(1).default("openai"),
+    /** @deprecated Legacy alias for provider. */
+    sttProvider: z.string().min(1).optional(),
+    /** @deprecated Legacy OpenAI-specific API key field. */
     openaiApiKey: z.string().min(1).optional(),
-    /** OpenAI transcription model (default: gpt-4o-transcribe) */
-    sttModel: z.string().min(1).default("gpt-4o-transcribe"),
-    /** VAD silence duration in ms before considering speech ended */
-    silenceDurationMs: z.number().int().positive().default(800),
-    /** VAD threshold 0-1 (higher = less sensitive) */
-    vadThreshold: z.number().min(0).max(1).default(0.5),
+    /** @deprecated Legacy OpenAI-specific transcription model field. */
+    sttModel: z.string().min(1).optional(),
+    /** @deprecated Legacy OpenAI-specific VAD silence duration. */
+    silenceDurationMs: z.number().int().positive().optional(),
+    /** @deprecated Legacy OpenAI-specific VAD threshold. */
+    vadThreshold: z.number().min(0).max(1).optional(),
     /** WebSocket path for media stream connections */
     streamPath: z.string().min(1).default("/voice/stream"),
+    /** Provider-owned raw config blobs keyed by provider id. */
+    providers: VoiceCallStreamingProvidersConfigSchema,
     /**
      * Close unauthenticated media stream sockets if no valid `start` frame arrives in time.
      * Protects against pre-auth idle connection hold attacks.
@@ -230,11 +285,9 @@ export const VoiceCallStreamingConfigSchema = z
   .strict()
   .default({
     enabled: false,
-    sttProvider: "openai-realtime",
-    sttModel: "gpt-4o-transcribe",
-    silenceDurationMs: 800,
-    vadThreshold: 0.5,
+    provider: "openai",
     streamPath: "/voice/stream",
+    providers: {},
     preStartTimeoutMs: 5000,
     maxPendingConnections: 32,
     maxPendingConnectionsPerIp: 4,
@@ -319,6 +372,9 @@ export const VoiceCallConfigSchema = z
     /** Real-time audio streaming configuration */
     streaming: VoiceCallStreamingConfigSchema,
 
+    /** Realtime voice-to-voice configuration */
+    realtime: VoiceCallRealtimeConfigSchema,
+
     /** Public webhook URL override (if set, bypasses tunnel auto-detection) */
     publicUrl: z.string().url().optional(),
 
@@ -364,6 +420,29 @@ function cloneDefaultVoiceCallConfig(): VoiceCallConfig {
   return structuredClone(DEFAULT_VOICE_CALL_CONFIG);
 }
 
+function normalizeWebhookLikePath(pathname: string): string {
+  const trimmed = pathname.trim();
+  if (!trimmed) {
+    return "/";
+  }
+  const prefixed = trimmed.startsWith("/") ? trimmed : `/${trimmed}`;
+  if (prefixed === "/") {
+    return prefixed;
+  }
+  return prefixed.endsWith("/") ? prefixed.slice(0, -1) : prefixed;
+}
+
+function defaultRealtimeStreamPathForServePath(servePath: string): string {
+  const normalized = normalizeWebhookLikePath(servePath);
+  if (normalized.endsWith("/webhook")) {
+    return `${normalized.slice(0, -"/webhook".length)}/stream/realtime`;
+  }
+  if (normalized === "/") {
+    return "/voice/stream/realtime";
+  }
+  return `${normalized}/stream/realtime`;
+}
+
 function normalizeVoiceCallTtsConfig(
   defaults: VoiceCallTtsConfig,
   overrides: DeepPartial<NonNullable<VoiceCallTtsConfig>> | undefined,
@@ -375,14 +454,55 @@ function normalizeVoiceCallTtsConfig(
   return TtsConfigSchema.parse(deepMergeDefined(defaults ?? {}, overrides ?? {}));
 }
 
+function sanitizeVoiceCallProviderConfigs(
+  value: Record<string, Record<string, unknown> | undefined> | undefined,
+): Record<string, Record<string, unknown>> {
+  if (!value) {
+    return {};
+  }
+  return Object.fromEntries(
+    Object.entries(value).filter(
+      (entry): entry is [string, Record<string, unknown>] => entry[1] !== undefined,
+    ),
+  );
+}
+
 export function normalizeVoiceCallConfig(config: VoiceCallConfigInput): VoiceCallConfig {
   const defaults = cloneDefaultVoiceCallConfig();
+  const serve = { ...defaults.serve, ...config.serve };
+  const streamingProvider =
+    config.streaming?.provider ??
+    (typeof config.streaming?.sttProvider === "string"
+      ? config.streaming.sttProvider
+      : undefined) ??
+    defaults.streaming.provider;
+  const streamingProviders = sanitizeVoiceCallProviderConfigs(
+    config.streaming?.providers ?? defaults.streaming.providers,
+  );
+  if (
+    typeof streamingProvider === "string" &&
+    streamingProvider.trim() &&
+    !(streamingProvider in streamingProviders)
+  ) {
+    streamingProviders[streamingProvider] = {};
+  }
+  const realtimeProvider = config.realtime?.provider ?? defaults.realtime.provider;
+  const realtimeProviders = sanitizeVoiceCallProviderConfigs(
+    config.realtime?.providers ?? defaults.realtime.providers,
+  );
+  if (
+    typeof realtimeProvider === "string" &&
+    realtimeProvider.trim() &&
+    !(realtimeProvider in realtimeProviders)
+  ) {
+    realtimeProviders[realtimeProvider] = {};
+  }
   return {
     ...defaults,
     ...config,
     allowFrom: config.allowFrom ?? defaults.allowFrom,
     outbound: { ...defaults.outbound, ...config.outbound },
-    serve: { ...defaults.serve, ...config.serve },
+    serve,
     tailscale: { ...defaults.tailscale, ...config.tailscale },
     tunnel: { ...defaults.tunnel, ...config.tunnel },
     webhookSecurity: {
@@ -392,7 +512,23 @@ export function normalizeVoiceCallConfig(config: VoiceCallConfigInput): VoiceCal
       trustedProxyIPs:
         config.webhookSecurity?.trustedProxyIPs ?? defaults.webhookSecurity.trustedProxyIPs,
     },
-    streaming: { ...defaults.streaming, ...config.streaming },
+    streaming: {
+      ...defaults.streaming,
+      ...config.streaming,
+      provider: streamingProvider,
+      providers: streamingProviders,
+    },
+    realtime: {
+      ...defaults.realtime,
+      ...config.realtime,
+      provider: realtimeProvider,
+      streamPath:
+        config.realtime?.streamPath ??
+        defaultRealtimeStreamPathForServePath(serve.path ?? defaults.serve.path),
+      tools:
+        (config.realtime?.tools as RealtimeToolConfig[] | undefined) ?? defaults.realtime.tools,
+      providers: realtimeProviders,
+    },
     stt: { ...defaults.stt, ...config.stt },
     tts: normalizeVoiceCallTtsConfig(defaults.tts, config.tts),
   };
@@ -448,6 +584,133 @@ export function resolveVoiceCallConfig(config: VoiceCallConfigInput): VoiceCallC
     resolved.webhookSecurity.trustForwardingHeaders ?? false;
   resolved.webhookSecurity.trustedProxyIPs = resolved.webhookSecurity.trustedProxyIPs ?? [];
 
+  resolved.streaming = {
+    ...resolved.streaming,
+    providers: { ...(resolved.streaming.providers ?? {}) },
+  };
+  const legacyStreamingRaw = resolved.streaming as Record<string, unknown>;
+  const openaiStreamingRaw =
+    resolved.streaming.providers.openai && typeof resolved.streaming.providers.openai === "object"
+      ? { ...(resolved.streaming.providers.openai as Record<string, unknown>) }
+      : {};
+  if (
+    typeof openaiStreamingRaw.apiKey !== "string" &&
+    typeof legacyStreamingRaw.openaiApiKey === "string"
+  ) {
+    openaiStreamingRaw.apiKey = legacyStreamingRaw.openaiApiKey;
+  }
+  if (
+    typeof openaiStreamingRaw.model !== "string" &&
+    typeof legacyStreamingRaw.sttModel === "string"
+  ) {
+    openaiStreamingRaw.model = legacyStreamingRaw.sttModel;
+  }
+  if (
+    openaiStreamingRaw.silenceDurationMs == null &&
+    typeof legacyStreamingRaw.silenceDurationMs === "number"
+  ) {
+    openaiStreamingRaw.silenceDurationMs = legacyStreamingRaw.silenceDurationMs;
+  }
+  if (
+    openaiStreamingRaw.vadThreshold == null &&
+    typeof legacyStreamingRaw.vadThreshold === "number"
+  ) {
+    openaiStreamingRaw.vadThreshold = legacyStreamingRaw.vadThreshold;
+  }
+  if (typeof openaiStreamingRaw.apiKey !== "string" || !openaiStreamingRaw.apiKey.trim()) {
+    if (process.env.OPENAI_API_KEY) {
+      openaiStreamingRaw.apiKey = process.env.OPENAI_API_KEY;
+    }
+  }
+  if (
+    typeof openaiStreamingRaw.model !== "string" &&
+    typeof process.env.REALTIME_TRANSCRIPTION_MODEL === "string"
+  ) {
+    openaiStreamingRaw.model = process.env.REALTIME_TRANSCRIPTION_MODEL;
+  }
+  if (
+    typeof openaiStreamingRaw.model !== "string" &&
+    typeof process.env.STREAMING_STT_MODEL === "string"
+  ) {
+    openaiStreamingRaw.model = process.env.STREAMING_STT_MODEL;
+  }
+  if (openaiStreamingRaw.vadThreshold == null && typeof process.env.VAD_THRESHOLD === "string") {
+    openaiStreamingRaw.vadThreshold = Number.parseFloat(process.env.VAD_THRESHOLD);
+  }
+  if (
+    openaiStreamingRaw.silenceDurationMs == null &&
+    typeof process.env.SILENCE_DURATION_MS === "string"
+  ) {
+    openaiStreamingRaw.silenceDurationMs = Number.parseInt(process.env.SILENCE_DURATION_MS, 10);
+  }
+  if (Object.keys(openaiStreamingRaw).length > 0) {
+    resolved.streaming.providers.openai = openaiStreamingRaw;
+  }
+  if (
+    typeof resolved.streaming.provider === "string" &&
+    resolved.streaming.provider.trim() &&
+    !(resolved.streaming.provider in resolved.streaming.providers)
+  ) {
+    resolved.streaming.providers[resolved.streaming.provider] = {};
+  }
+
+  resolved.realtime = {
+    ...resolved.realtime,
+    providers: { ...(resolved.realtime.providers ?? {}) },
+  };
+  const openaiRealtimeRaw =
+    resolved.realtime.providers.openai && typeof resolved.realtime.providers.openai === "object"
+      ? { ...(resolved.realtime.providers.openai as Record<string, unknown>) }
+      : {};
+  if (typeof openaiRealtimeRaw.apiKey !== "string" || !openaiRealtimeRaw.apiKey.trim()) {
+    if (process.env.OPENAI_API_KEY) {
+      openaiRealtimeRaw.apiKey = process.env.OPENAI_API_KEY;
+    }
+  }
+  if (
+    typeof openaiRealtimeRaw.model !== "string" &&
+    typeof process.env.REALTIME_VOICE_MODEL === "string"
+  ) {
+    openaiRealtimeRaw.model = process.env.REALTIME_VOICE_MODEL;
+  }
+  if (
+    typeof openaiRealtimeRaw.voice !== "string" &&
+    typeof process.env.REALTIME_VOICE_VOICE === "string"
+  ) {
+    openaiRealtimeRaw.voice = process.env.REALTIME_VOICE_VOICE;
+  }
+  if (
+    typeof resolved.realtime.instructions !== "string" &&
+    typeof process.env.REALTIME_VOICE_INSTRUCTIONS === "string"
+  ) {
+    resolved.realtime.instructions = process.env.REALTIME_VOICE_INSTRUCTIONS;
+  }
+  if (
+    openaiRealtimeRaw.temperature == null &&
+    typeof process.env.REALTIME_VOICE_TEMPERATURE === "string"
+  ) {
+    openaiRealtimeRaw.temperature = Number.parseFloat(process.env.REALTIME_VOICE_TEMPERATURE);
+  }
+  if (openaiRealtimeRaw.vadThreshold == null && typeof process.env.VAD_THRESHOLD === "string") {
+    openaiRealtimeRaw.vadThreshold = Number.parseFloat(process.env.VAD_THRESHOLD);
+  }
+  if (
+    openaiRealtimeRaw.silenceDurationMs == null &&
+    typeof process.env.SILENCE_DURATION_MS === "string"
+  ) {
+    openaiRealtimeRaw.silenceDurationMs = Number.parseInt(process.env.SILENCE_DURATION_MS, 10);
+  }
+  if (Object.keys(openaiRealtimeRaw).length > 0) {
+    resolved.realtime.providers.openai = openaiRealtimeRaw;
+  }
+  if (
+    typeof resolved.realtime.provider === "string" &&
+    resolved.realtime.provider.trim() &&
+    !(resolved.realtime.provider in resolved.realtime.providers)
+  ) {
+    resolved.realtime.providers[resolved.realtime.provider] = {};
+  }
+
   return normalizeVoiceCallConfig(resolved);
 }
 
@@ -516,5 +779,23 @@ export function validateProviderConfig(config: VoiceCallConfig): {
     }
   }
 
+  if (config.realtime.enabled && config.inboundPolicy === "disabled") {
+    errors.push(
+      'plugins.entries.voice-call.config.inboundPolicy must not be "disabled" when realtime.enabled is true',
+    );
+  }
+
+  if (config.realtime.enabled && config.streaming.enabled) {
+    errors.push(
+      "plugins.entries.voice-call.config.realtime.enabled and plugins.entries.voice-call.config.streaming.enabled cannot both be true",
+    );
+  }
+
+  if (config.realtime.enabled && config.provider && config.provider !== "twilio") {
+    errors.push(
+      'plugins.entries.voice-call.config.provider must be "twilio" when realtime.enabled is true',
+    );
+  }
+
   return { valid: errors.length === 0, errors };
 }
diff --git a/extensions/voice-call/src/manager/outbound.test.ts b/extensions/voice-call/src/manager/outbound.test.ts
index 70e5117d8f3..33d64fff565 100644
--- a/extensions/voice-call/src/manager/outbound.test.ts
+++ b/extensions/voice-call/src/manager/outbound.test.ts
@@ -125,7 +125,7 @@ describe("voice-call outbound helpers", () => {
         maxConcurrentCalls: 3,
         outbound: { defaultMode: "conversation" },
         fromNumber: "+14155550100",
-        tts: { providers: { openai: { voice: "nova" } } },
+        tts: { provider: "openai", providers: { openai: { voice: "nova" } } },
       },
       storePath: "/tmp/voice-call.json",
       webhookUrl: "https://example.com/webhook",
@@ -187,7 +187,7 @@ describe("voice-call outbound helpers", () => {
       activeCalls: new Map([["call-1", call]]),
       providerCallIdMap: new Map(),
       provider: { name: "twilio", playTts },
-      config: { tts: { providers: { openai: { voice: "alloy" } } } },
+      config: { tts: { provider: "openai", providers: { openai: { voice: "alloy" } } } },
       storePath: "/tmp/voice-call.json",
     };
 
diff --git a/extensions/voice-call/src/manager/outbound.ts b/extensions/voice-call/src/manager/outbound.ts
index c1f82b0e569..a90e22701a3 100644
--- a/extensions/voice-call/src/manager/outbound.ts
+++ b/extensions/voice-call/src/manager/outbound.ts
@@ -100,11 +100,22 @@ function requireConnectedCall(ctx: ConnectedCallContext, callId: CallId): Connec
   };
 }
 
-function resolveOpenAITtsVoice(config: SpeakContext["config"]): string | undefined {
-  const providerConfig = config.tts?.providers?.openai;
-  return providerConfig && typeof providerConfig === "object"
-    ? (providerConfig.voice as string | undefined)
-    : undefined;
+function resolvePreferredTtsVoice(config: SpeakContext["config"]): string | undefined {
+  const providerId = config.tts?.provider;
+  if (!providerId) {
+    return undefined;
+  }
+  const providerConfig = config.tts?.providers?.[providerId];
+  if (!providerConfig || typeof providerConfig !== "object") {
+    return undefined;
+  }
+  if (typeof providerConfig.voice === "string" && providerConfig.voice.trim()) {
+    return providerConfig.voice;
+  }
+  if (typeof providerConfig.voiceId === "string" && providerConfig.voiceId.trim()) {
+    return providerConfig.voiceId;
+  }
+  return undefined;
 }
 
 export async function initiateCall(
@@ -164,7 +175,7 @@ export async function initiateCall(
     // For notify mode with a message, use inline TwiML with <Say>.
     let inlineTwiml: string | undefined;
     if (mode === "notify" && initialMessage) {
-      const pollyVoice = mapVoiceToPolly(resolveOpenAITtsVoice(ctx.config));
+      const pollyVoice = mapVoiceToPolly(resolvePreferredTtsVoice(ctx.config));
       inlineTwiml = generateNotifyTwiml(initialMessage, pollyVoice);
       console.log(`[voice-call] Using inline TwiML for notify mode (voice: ${pollyVoice})`);
     }
@@ -212,7 +223,7 @@ export async function speak(
     transitionState(call, "speaking");
     persistCallRecord(ctx.storePath, call);
 
-    const voice = provider.name === "twilio" ? resolveOpenAITtsVoice(ctx.config) : undefined;
+    const voice = provider.name === "twilio" ? resolvePreferredTtsVoice(ctx.config) : undefined;
     await provider.playTts({
       callId,
       providerCallId,
diff --git a/extensions/voice-call/src/media-stream.test.ts b/extensions/voice-call/src/media-stream.test.ts
index 8f6e16bc5e8..66e2f74c4e2 100644
--- a/extensions/voice-call/src/media-stream.test.ts
+++ b/extensions/voice-call/src/media-stream.test.ts
@@ -1,28 +1,27 @@
 import { once } from "node:events";
 import http from "node:http";
+import type {
+  RealtimeTranscriptionProviderPlugin,
+  RealtimeTranscriptionSession,
+} from "openclaw/plugin-sdk/realtime-transcription";
 import { describe, expect, it, vi } from "vitest";
 import { WebSocket } from "ws";
 import { MediaStreamHandler, sanitizeLogText } from "./media-stream.js";
-import type {
-  OpenAIRealtimeSTTProvider,
-  RealtimeSTTSession,
-} from "./providers/stt-openai-realtime.js";
 
-const createStubSession = (): RealtimeSTTSession => ({
+const createStubSession = (): RealtimeTranscriptionSession => ({
   connect: async () => {},
   sendAudio: () => {},
-  waitForTranscript: async () => "",
-  onPartial: () => {},
-  onTranscript: () => {},
-  onSpeechStart: () => {},
   close: () => {},
   isConnected: () => true,
 });
 
-const createStubSttProvider = (): OpenAIRealtimeSTTProvider =>
+const createStubSttProvider = (): RealtimeTranscriptionProviderPlugin =>
   ({
     createSession: () => createStubSession(),
-  }) as unknown as OpenAIRealtimeSTTProvider;
+    id: "openai",
+    label: "OpenAI",
+    isConfigured: () => true,
+  }) as unknown as RealtimeTranscriptionProviderPlugin;
 
 const flush = async (): Promise<void> => {
   await new Promise((resolve) => setTimeout(resolve, 0));
@@ -104,7 +103,8 @@ const waitForClose = async (
 describe("MediaStreamHandler TTS queue", () => {
   it("serializes TTS playback and resolves in order", async () => {
     const handler = new MediaStreamHandler({
-      sttProvider: createStubSttProvider(),
+      transcriptionProvider: createStubSttProvider(),
+      providerConfig: {},
     });
     const started: number[] = [];
     const finished: number[] = [];
@@ -137,7 +137,8 @@ describe("MediaStreamHandler TTS queue", () => {
 
   it("cancels active playback and clears queued items", async () => {
     const handler = new MediaStreamHandler({
-      sttProvider: createStubSttProvider(),
+      transcriptionProvider: createStubSttProvider(),
+      providerConfig: {},
     });
 
     let queuedRan = false;
@@ -165,7 +166,8 @@ describe("MediaStreamHandler TTS queue", () => {
 describe("MediaStreamHandler security hardening", () => {
   it("fails sends and closes stream when buffered bytes already exceed the cap", () => {
     const handler = new MediaStreamHandler({
-      sttProvider: createStubSttProvider(),
+      transcriptionProvider: createStubSttProvider(),
+      providerConfig: {},
     });
     const ws = {
       readyState: WebSocket.OPEN,
@@ -177,7 +179,12 @@ describe("MediaStreamHandler security hardening", () => {
       handler as unknown as {
         sessions: Map<
           string,
-          { callId: string; streamSid: string; ws: WebSocket; sttSession: RealtimeSTTSession }
+          {
+            callId: string;
+            streamSid: string;
+            ws: WebSocket;
+            sttSession: RealtimeTranscriptionSession;
+          }
         >;
       }
     ).sessions.set("MZ-backpressure", {
@@ -196,7 +203,8 @@ describe("MediaStreamHandler security hardening", () => {
 
   it("fails sends when buffered bytes exceed cap after enqueueing a frame", () => {
     const handler = new MediaStreamHandler({
-      sttProvider: createStubSttProvider(),
+      transcriptionProvider: createStubSttProvider(),
+      providerConfig: {},
     });
     const ws = {
       readyState: WebSocket.OPEN,
@@ -214,7 +222,12 @@ describe("MediaStreamHandler security hardening", () => {
       handler as unknown as {
         sessions: Map<
           string,
-          { callId: string; streamSid: string; ws: WebSocket; sttSession: RealtimeSTTSession }
+          {
+            callId: string;
+            streamSid: string;
+            ws: WebSocket;
+            sttSession: RealtimeTranscriptionSession;
+          }
         >;
       }
     ).sessions.set("MZ-overflow", {
@@ -243,7 +256,8 @@ describe("MediaStreamHandler security hardening", () => {
     const shouldAcceptStreamCalls: Array<{ callId: string; streamSid: string; token?: string }> =
       [];
     const handler = new MediaStreamHandler({
-      sttProvider: createStubSttProvider(),
+      transcriptionProvider: createStubSttProvider(),
+      providerConfig: {},
       preStartTimeoutMs: 40,
       shouldAcceptStream: (params) => {
         shouldAcceptStreamCalls.push(params);
@@ -266,7 +280,8 @@ describe("MediaStreamHandler security hardening", () => {
 
   it("enforces pending connection limits", async () => {
     const handler = new MediaStreamHandler({
-      sttProvider: createStubSttProvider(),
+      transcriptionProvider: createStubSttProvider(),
+      providerConfig: {},
       preStartTimeoutMs: 5_000,
       maxPendingConnections: 1,
       maxPendingConnectionsPerIp: 1,
@@ -291,7 +306,8 @@ describe("MediaStreamHandler security hardening", () => {
 
   it("rejects upgrades when max connection cap is reached", async () => {
     const handler = new MediaStreamHandler({
-      sttProvider: createStubSttProvider(),
+      transcriptionProvider: createStubSttProvider(),
+      providerConfig: {},
       preStartTimeoutMs: 5_000,
       maxConnections: 1,
       maxPendingConnections: 10,
@@ -319,7 +335,8 @@ describe("MediaStreamHandler security hardening", () => {
 
   it("clears pending state after valid start", async () => {
     const handler = new MediaStreamHandler({
-      sttProvider: createStubSttProvider(),
+      transcriptionProvider: createStubSttProvider(),
+      providerConfig: {},
       preStartTimeoutMs: 40,
       shouldAcceptStream: () => true,
     });
@@ -349,7 +366,8 @@ describe("MediaStreamHandler security hardening", () => {
     const shouldAcceptStreamCalls: Array<{ callId: string; streamSid: string; token?: string }> =
       [];
     const handler = new MediaStreamHandler({
-      sttProvider: createStubSttProvider(),
+      transcriptionProvider: createStubSttProvider(),
+      providerConfig: {},
       preStartTimeoutMs: 1_000,
       shouldAcceptStream: (params) => {
         shouldAcceptStreamCalls.push(params);
diff --git a/extensions/voice-call/src/media-stream.ts b/extensions/voice-call/src/media-stream.ts
index fb259d723b8..0a051a18d38 100644
--- a/extensions/voice-call/src/media-stream.ts
+++ b/extensions/voice-call/src/media-stream.ts
@@ -3,24 +3,27 @@
  *
  * Handles bidirectional audio streaming between Twilio and the AI services.
  * - Receives mu-law audio from Twilio via WebSocket
- * - Forwards to OpenAI Realtime STT for transcription
+ * - Forwards to the selected realtime transcription provider
  * - Sends TTS audio back to Twilio
  */
 
 import type { IncomingMessage } from "node:http";
 import type { Duplex } from "node:stream";
-import { type RawData, WebSocket, WebSocketServer } from "ws";
 import type {
-  OpenAIRealtimeSTTProvider,
-  RealtimeSTTSession,
-} from "./providers/stt-openai-realtime.js";
+  RealtimeTranscriptionProviderConfig,
+  RealtimeTranscriptionProviderPlugin,
+  RealtimeTranscriptionSession,
+} from "openclaw/plugin-sdk/realtime-transcription";
+import { type RawData, WebSocket, WebSocketServer } from "ws";
 
 /**
  * Configuration for the media stream handler.
  */
 export interface MediaStreamConfig {
-  /** STT provider for transcription */
-  sttProvider: OpenAIRealtimeSTTProvider;
+  /** Realtime transcription provider for streaming STT. */
+  transcriptionProvider: RealtimeTranscriptionProviderPlugin;
+  /** Provider-owned config blob passed into the transcription session. */
+  providerConfig: RealtimeTranscriptionProviderConfig;
   /** Close sockets that never send a valid `start` frame within this window. */
   preStartTimeoutMs?: number;
   /** Max concurrent pre-start sockets. */
@@ -50,7 +53,7 @@ interface StreamSession {
   callId: string;
   streamSid: string;
   ws: WebSocket;
-  sttSession: RealtimeSTTSession;
+  sttSession: RealtimeTranscriptionSession;
 }
 
 type TtsQueueEntry = {
@@ -254,20 +257,20 @@ export class MediaStreamHandler {
       return null;
     }
 
-    // Create STT session
-    const sttSession = this.config.sttProvider.createSession();
-
-    // Set up transcript callbacks
-    sttSession.onPartial((partial) => {
-      this.config.onPartialTranscript?.(callSid, partial);
-    });
-
-    sttSession.onTranscript((transcript) => {
-      this.config.onTranscript?.(callSid, transcript);
-    });
-
-    sttSession.onSpeechStart(() => {
-      this.config.onSpeechStart?.(callSid);
+    const sttSession = this.config.transcriptionProvider.createSession({
+      providerConfig: this.config.providerConfig,
+      onPartial: (partial) => {
+        this.config.onPartialTranscript?.(callSid, partial);
+      },
+      onTranscript: (transcript) => {
+        this.config.onTranscript?.(callSid, transcript);
+      },
+      onSpeechStart: () => {
+        this.config.onSpeechStart?.(callSid);
+      },
+      onError: (error) => {
+        console.warn("[MediaStream] Transcription session error:", error.message);
+      },
     });
 
     const session: StreamSession = {
@@ -282,7 +285,7 @@ export class MediaStreamHandler {
     // Notify connection BEFORE STT connect so TTS can work even if STT fails
     this.config.onConnect?.(callSid, streamSid);
 
-    // Connect to OpenAI STT (non-blocking, log errors but don't fail the call)
+    // Connect to transcription service (non-blocking, log errors but don't fail the call)
     sttSession.connect().catch((err) => {
       console.warn(`[MediaStream] STT connection failed (TTS still works):`, err.message);
     });
diff --git a/extensions/voice-call/src/providers/index.ts b/extensions/voice-call/src/providers/index.ts
index c8183622e35..4b0c2e442d5 100644
--- a/extensions/voice-call/src/providers/index.ts
+++ b/extensions/voice-call/src/providers/index.ts
@@ -1,10 +1,5 @@
 export type { VoiceCallProvider } from "./base.js";
 export { MockProvider } from "./mock.js";
-export {
-  OpenAIRealtimeSTTProvider,
-  type RealtimeSTTConfig,
-  type RealtimeSTTSession,
-} from "./stt-openai-realtime.js";
 export { TelnyxProvider } from "./telnyx.js";
 export { TwilioProvider } from "./twilio.js";
 export { PlivoProvider } from "./plivo.js";
diff --git a/extensions/voice-call/src/providers/stt-openai-realtime.test.ts b/extensions/voice-call/src/providers/stt-openai-realtime.test.ts
deleted file mode 100644
index 5788053db5c..00000000000
--- a/extensions/voice-call/src/providers/stt-openai-realtime.test.ts
+++ /dev/null
@@ -1,42 +0,0 @@
-import { describe, expect, it } from "vitest";
-import type { RealtimeSTTConfig } from "./stt-openai-realtime.js";
-import { OpenAIRealtimeSTTProvider } from "./stt-openai-realtime.js";
-
-type ProviderInternals = {
-  vadThreshold: number;
-  silenceDurationMs: number;
-};
-
-function readProviderInternals(config: RealtimeSTTConfig): ProviderInternals {
-  const provider = new OpenAIRealtimeSTTProvider(config) as unknown as Record<string, unknown>;
-  return {
-    vadThreshold: provider["vadThreshold"] as number,
-    silenceDurationMs: provider["silenceDurationMs"] as number,
-  };
-}
-
-describe("OpenAIRealtimeSTTProvider constructor defaults", () => {
-  it("uses vadThreshold: 0 when explicitly configured (max sensitivity)", () => {
-    const provider = readProviderInternals({
-      apiKey: "sk-test", // pragma: allowlist secret
-      vadThreshold: 0,
-    });
-    expect(provider.vadThreshold).toBe(0);
-  });
-
-  it("uses silenceDurationMs: 0 when explicitly configured", () => {
-    const provider = readProviderInternals({
-      apiKey: "sk-test", // pragma: allowlist secret
-      silenceDurationMs: 0,
-    });
-    expect(provider.silenceDurationMs).toBe(0);
-  });
-
-  it("falls back to defaults when values are undefined", () => {
-    const provider = readProviderInternals({
-      apiKey: "sk-test", // pragma: allowlist secret
-    });
-    expect(provider.vadThreshold).toBe(0.5);
-    expect(provider.silenceDurationMs).toBe(800);
-  });
-});
diff --git a/extensions/voice-call/src/providers/stt-openai-realtime.ts b/extensions/voice-call/src/providers/stt-openai-realtime.ts
deleted file mode 100644
index eaced5eeef2..00000000000
--- a/extensions/voice-call/src/providers/stt-openai-realtime.ts
+++ /dev/null
@@ -1,321 +0,0 @@
-/**
- * OpenAI Realtime STT Provider
- *
- * Uses the OpenAI Realtime API for streaming transcription with:
- * - Direct mu-law audio support (no conversion needed)
- * - Built-in server-side VAD for turn detection
- * - Low-latency streaming transcription
- * - Partial transcript callbacks for real-time UI updates
- */
-
-import WebSocket from "ws";
-
-/**
- * Configuration for OpenAI Realtime STT.
- */
-export interface RealtimeSTTConfig {
-  /** OpenAI API key */
-  apiKey: string;
-  /** Model to use (default: gpt-4o-transcribe) */
-  model?: string;
-  /** Silence duration in ms before considering speech ended (default: 800) */
-  silenceDurationMs?: number;
-  /** VAD threshold 0-1 (default: 0.5) */
-  vadThreshold?: number;
-}
-
-/**
- * Session for streaming audio and receiving transcripts.
- */
-export interface RealtimeSTTSession {
-  /** Connect to the transcription service */
-  connect(): Promise<void>;
-  /** Send mu-law audio data (8kHz mono) */
-  sendAudio(audio: Buffer): void;
-  /** Wait for next complete transcript (after VAD detects end of speech) */
-  waitForTranscript(timeoutMs?: number): Promise<string>;
-  /** Set callback for partial transcripts (streaming) */
-  onPartial(callback: (partial: string) => void): void;
-  /** Set callback for final transcripts */
-  onTranscript(callback: (transcript: string) => void): void;
-  /** Set callback when speech starts (VAD) */
-  onSpeechStart(callback: () => void): void;
-  /** Close the session */
-  close(): void;
-  /** Check if session is connected */
-  isConnected(): boolean;
-}
-
-/**
- * Provider factory for OpenAI Realtime STT sessions.
- */
-export class OpenAIRealtimeSTTProvider {
-  readonly name = "openai-realtime";
-  private apiKey: string;
-  private model: string;
-  private silenceDurationMs: number;
-  private vadThreshold: number;
-
-  constructor(config: RealtimeSTTConfig) {
-    if (!config.apiKey) {
-      throw new Error("OpenAI API key required for Realtime STT");
-    }
-    this.apiKey = config.apiKey;
-    this.model = config.model || "gpt-4o-transcribe";
-    this.silenceDurationMs = config.silenceDurationMs ?? 800;
-    this.vadThreshold = config.vadThreshold ?? 0.5;
-  }
-
-  /**
-   * Create a new realtime transcription session.
-   */
-  createSession(): RealtimeSTTSession {
-    return new OpenAIRealtimeSTTSession(
-      this.apiKey,
-      this.model,
-      this.silenceDurationMs,
-      this.vadThreshold,
-    );
-  }
-}
-
-/**
- * WebSocket-based session for real-time speech-to-text.
- */
-class OpenAIRealtimeSTTSession implements RealtimeSTTSession {
-  private static readonly MAX_RECONNECT_ATTEMPTS = 5;
-  private static readonly RECONNECT_DELAY_MS = 1000;
-
-  private ws: WebSocket | null = null;
-  private connected = false;
-  private closed = false;
-  private connectTimeout: ReturnType<typeof setTimeout> | null = null;
-  private reconnectAttempts = 0;
-  private pendingTranscript = "";
-  private onTranscriptCallback: ((transcript: string) => void) | null = null;
-  private onPartialCallback: ((partial: string) => void) | null = null;
-  private onSpeechStartCallback: (() => void) | null = null;
-
-  constructor(
-    private readonly apiKey: string,
-    private readonly model: string,
-    private readonly silenceDurationMs: number,
-    private readonly vadThreshold: number,
-  ) {}
-
-  async connect(): Promise<void> {
-    this.closed = false;
-    this.reconnectAttempts = 0;
-    return this.doConnect();
-  }
-
-  private async doConnect(): Promise<void> {
-    return new Promise((resolve, reject) => {
-      const url = "wss://api.openai.com/v1/realtime?intent=transcription";
-
-      this.ws = new WebSocket(url, {
-        headers: {
-          Authorization: `Bearer ${this.apiKey}`,
-          "OpenAI-Beta": "realtime=v1",
-        },
-      });
-
-      this.ws.on("open", () => {
-        console.log("[RealtimeSTT] WebSocket connected");
-        this.connected = true;
-        this.reconnectAttempts = 0;
-        if (this.connectTimeout) {
-          clearTimeout(this.connectTimeout);
-          this.connectTimeout = null;
-        }
-
-        // Configure the transcription session
-        this.sendEvent({
-          type: "transcription_session.update",
-          session: {
-            input_audio_format: "g711_ulaw",
-            input_audio_transcription: {
-              model: this.model,
-            },
-            turn_detection: {
-              type: "server_vad",
-              threshold: this.vadThreshold,
-              prefix_padding_ms: 300,
-              silence_duration_ms: this.silenceDurationMs,
-            },
-          },
-        });
-
-        resolve();
-      });
-
-      this.ws.on("message", (data: Buffer) => {
-        try {
-          const event = JSON.parse(data.toString());
-          this.handleEvent(event);
-        } catch (e) {
-          console.error("[RealtimeSTT] Failed to parse event:", e);
-        }
-      });
-
-      this.ws.on("error", (error) => {
-        console.error("[RealtimeSTT] WebSocket error:", error);
-        if (!this.connected) {
-          reject(error);
-        }
-      });
-
-      this.ws.on("close", (code, reason) => {
-        console.log(
-          `[RealtimeSTT] WebSocket closed (code: ${code}, reason: ${reason?.toString() || "none"})`,
-        );
-        this.connected = false;
-
-        // Attempt reconnection if not intentionally closed
-        if (!this.closed) {
-          void this.attemptReconnect();
-        }
-      });
-
-      this.connectTimeout = setTimeout(() => {
-        this.connectTimeout = null;
-        if (!this.connected) {
-          reject(new Error("Realtime STT connection timeout"));
-        }
-      }, 10000);
-    });
-  }
-
-  private async attemptReconnect(): Promise<void> {
-    if (this.closed) {
-      return;
-    }
-
-    if (this.reconnectAttempts >= OpenAIRealtimeSTTSession.MAX_RECONNECT_ATTEMPTS) {
-      console.error(
-        `[RealtimeSTT] Max reconnect attempts (${OpenAIRealtimeSTTSession.MAX_RECONNECT_ATTEMPTS}) reached`,
-      );
-      return;
-    }
-
-    this.reconnectAttempts++;
-    const delay = OpenAIRealtimeSTTSession.RECONNECT_DELAY_MS * 2 ** (this.reconnectAttempts - 1);
-    console.log(
-      `[RealtimeSTT] Reconnecting ${this.reconnectAttempts}/${OpenAIRealtimeSTTSession.MAX_RECONNECT_ATTEMPTS} in ${delay}ms...`,
-    );
-
-    await new Promise((resolve) => setTimeout(resolve, delay));
-
-    if (this.closed) {
-      return;
-    }
-
-    try {
-      await this.doConnect();
-      console.log("[RealtimeSTT] Reconnected successfully");
-    } catch (error) {
-      console.error("[RealtimeSTT] Reconnect failed:", error);
-    }
-  }
-
-  private handleEvent(event: {
-    type: string;
-    delta?: string;
-    transcript?: string;
-    error?: unknown;
-  }): void {
-    switch (event.type) {
-      case "transcription_session.created":
-      case "transcription_session.updated":
-      case "input_audio_buffer.speech_stopped":
-      case "input_audio_buffer.committed":
-        console.log(`[RealtimeSTT] ${event.type}`);
-        break;
-
-      case "conversation.item.input_audio_transcription.delta":
-        if (event.delta) {
-          this.pendingTranscript += event.delta;
-          this.onPartialCallback?.(this.pendingTranscript);
-        }
-        break;
-
-      case "conversation.item.input_audio_transcription.completed":
-        if (event.transcript) {
-          console.log(`[RealtimeSTT] Transcript: ${event.transcript}`);
-          this.onTranscriptCallback?.(event.transcript);
-        }
-        this.pendingTranscript = "";
-        break;
-
-      case "input_audio_buffer.speech_started":
-        console.log("[RealtimeSTT] Speech started");
-        this.pendingTranscript = "";
-        this.onSpeechStartCallback?.();
-        break;
-
-      case "error":
-        console.error("[RealtimeSTT] Error:", event.error);
-        break;
-    }
-  }
-
-  private sendEvent(event: unknown): void {
-    if (this.ws?.readyState === WebSocket.OPEN) {
-      this.ws.send(JSON.stringify(event));
-    }
-  }
-
-  sendAudio(muLawData: Buffer): void {
-    if (!this.connected) {
-      return;
-    }
-    this.sendEvent({
-      type: "input_audio_buffer.append",
-      audio: muLawData.toString("base64"),
-    });
-  }
-
-  onPartial(callback: (partial: string) => void): void {
-    this.onPartialCallback = callback;
-  }
-
-  onTranscript(callback: (transcript: string) => void): void {
-    this.onTranscriptCallback = callback;
-  }
-
-  onSpeechStart(callback: () => void): void {
-    this.onSpeechStartCallback = callback;
-  }
-
-  async waitForTranscript(timeoutMs = 30000): Promise<string> {
-    return new Promise((resolve, reject) => {
-      const timeout = setTimeout(() => {
-        this.onTranscriptCallback = null;
-        reject(new Error("Transcript timeout"));
-      }, timeoutMs);
-
-      this.onTranscriptCallback = (transcript) => {
-        clearTimeout(timeout);
-        this.onTranscriptCallback = null;
-        resolve(transcript);
-      };
-    });
-  }
-
-  close(): void {
-    this.closed = true;
-    if (this.connectTimeout) {
-      clearTimeout(this.connectTimeout);
-      this.connectTimeout = null;
-    }
-    if (this.ws) {
-      this.ws.close();
-      this.ws = null;
-    }
-    this.connected = false;
-  }
-
-  isConnected(): boolean {
-    return this.connected;
-  }
-}
diff --git a/extensions/voice-call/src/providers/tts-openai.test.ts b/extensions/voice-call/src/providers/tts-openai.test.ts
deleted file mode 100644
index 79d4644b59f..00000000000
--- a/extensions/voice-call/src/providers/tts-openai.test.ts
+++ /dev/null
@@ -1,43 +0,0 @@
-import { describe, expect, it } from "vitest";
-import type { OpenAITTSConfig } from "./tts-openai.js";
-import { OpenAITTSProvider } from "./tts-openai.js";
-
-type ProviderInternals = {
-  model: string;
-  voice: string;
-  speed: number;
-};
-
-function readProviderInternals(config: OpenAITTSConfig): ProviderInternals {
-  return new OpenAITTSProvider(config) as unknown as ProviderInternals;
-}
-
-describe("OpenAITTSProvider constructor defaults", () => {
-  it("uses speed: 0 when explicitly configured", () => {
-    const provider = readProviderInternals({
-      apiKey: "sk-test", // pragma: allowlist secret
-      speed: 0,
-    });
-
-    expect(provider.speed).toBe(0);
-  });
-
-  it("falls back to speed default when undefined", () => {
-    const provider = readProviderInternals({
-      apiKey: "sk-test", // pragma: allowlist secret
-    });
-
-    expect(provider.speed).toBe(1.0);
-  });
-
-  it("treats blank model and voice overrides as unset", () => {
-    const provider = readProviderInternals({
-      apiKey: "sk-test", // pragma: allowlist secret
-      model: "   ",
-      voice: "",
-    });
-
-    expect(provider.model).toBe("gpt-4o-mini-tts");
-    expect(provider.voice).toBe("coral");
-  });
-});
diff --git a/extensions/voice-call/src/providers/tts-openai.ts b/extensions/voice-call/src/providers/tts-openai.ts
deleted file mode 100644
index 1fdc7a147d3..00000000000
--- a/extensions/voice-call/src/providers/tts-openai.ts
+++ /dev/null
@@ -1,185 +0,0 @@
-import { convertPcmToMulaw8k } from "../telephony-audio.js";
-
-/**
- * OpenAI TTS Provider
- *
- * Generates speech audio using OpenAI's text-to-speech API.
- * Handles audio format conversion for telephony (mu-law 8kHz).
- *
- * Best practices from OpenAI docs:
- * - Use gpt-4o-mini-tts for intelligent realtime applications (supports instructions)
- * - Use tts-1 for lower latency, tts-1-hd for higher quality
- * - Use marin or cedar voices for best quality
- * - Use pcm or wav format for fastest response times
- *
- * @see https://platform.openai.com/docs/guides/text-to-speech
- */
-
-/**
- * OpenAI TTS configuration.
- */
-export interface OpenAITTSConfig {
-  /** OpenAI API key (uses OPENAI_API_KEY env if not set) */
-  apiKey?: string;
-  /**
-   * TTS model:
-   * - gpt-4o-mini-tts: newest, supports instructions for tone/style control (recommended)
-   * - tts-1: lower latency
-   * - tts-1-hd: higher quality
-   */
-  model?: string;
-  /**
-   * Voice to use. For best quality, use marin or cedar.
-   * All 13 voices: alloy, ash, ballad, coral, echo, fable, nova, onyx, sage, shimmer, verse, marin, cedar
-   * Note: tts-1/tts-1-hd only support: alloy, ash, coral, echo, fable, onyx, nova, sage, shimmer
-   */
-  voice?: string;
-  /** Speed multiplier (0.25 to 4.0) */
-  speed?: number;
-  /**
-   * Instructions for speech style (only works with gpt-4o-mini-tts model).
-   * Examples: "Speak in a cheerful tone", "Talk like a sympathetic customer service agent"
-   */
-  instructions?: string;
-}
-
-/**
- * Supported OpenAI TTS voices (all 13 built-in voices).
- * For best quality, use marin or cedar.
- * Note: tts-1 and tts-1-hd support a smaller set.
- */
-export const OPENAI_TTS_VOICES = [
-  "alloy",
-  "ash",
-  "ballad",
-  "coral",
-  "echo",
-  "fable",
-  "nova",
-  "onyx",
-  "sage",
-  "shimmer",
-  "verse",
-  "marin",
-  "cedar",
-] as const;
-
-export type OpenAITTSVoice = (typeof OPENAI_TTS_VOICES)[number];
-
-function trimToUndefined(value: string | undefined): string | undefined {
-  const trimmed = value?.trim();
-  return trimmed ? trimmed : undefined;
-}
-
-function resolveOpenAITtsInstructions(model: string, instructions?: string): string | undefined {
-  const next = trimToUndefined(instructions);
-  return next && model.includes("gpt-4o-mini-tts") ? next : undefined;
-}
-
-/**
- * OpenAI TTS Provider for generating speech audio.
- */
-export class OpenAITTSProvider {
-  private apiKey: string;
-  private model: string;
-  private voice: OpenAITTSVoice;
-  private speed: number;
-  private instructions?: string;
-
-  constructor(config: OpenAITTSConfig = {}) {
-    this.apiKey =
-      trimToUndefined(config.apiKey) ?? trimToUndefined(process.env.OPENAI_API_KEY) ?? "";
-    // Default to gpt-4o-mini-tts for intelligent realtime applications
-    this.model = trimToUndefined(config.model) ?? "gpt-4o-mini-tts";
-    // Default to coral - good balance of quality and natural tone
-    this.voice = (trimToUndefined(config.voice) as OpenAITTSVoice | undefined) ?? "coral";
-    this.speed = config.speed ?? 1.0;
-    this.instructions = trimToUndefined(config.instructions);
-
-    if (!this.apiKey) {
-      throw new Error("OpenAI API key required (set OPENAI_API_KEY or pass apiKey)");
-    }
-  }
-
-  /**
-   * Generate speech audio from text.
-   * Returns raw PCM audio data (24kHz, mono, 16-bit).
-   */
-  async synthesize(text: string, instructions?: string): Promise<Buffer> {
-    // Build request body
-    const body: Record<string, unknown> = {
-      model: this.model,
-      input: text,
-      voice: this.voice,
-      response_format: "pcm", // Raw PCM audio (24kHz, mono, 16-bit signed LE)
-      speed: this.speed,
-    };
-
-    const effectiveInstructions = resolveOpenAITtsInstructions(
-      this.model,
-      trimToUndefined(instructions) ?? this.instructions,
-    );
-    if (effectiveInstructions) {
-      body.instructions = effectiveInstructions;
-    }
-
-    const response = await fetch("https://api.openai.com/v1/audio/speech", {
-      method: "POST",
-      headers: {
-        Authorization: `Bearer ${this.apiKey}`,
-        "Content-Type": "application/json",
-      },
-      body: JSON.stringify(body),
-    });
-
-    if (!response.ok) {
-      const error = await response.text();
-      throw new Error(`OpenAI TTS failed: ${response.status} - ${error}`);
-    }
-
-    const arrayBuffer = await response.arrayBuffer();
-    return Buffer.from(arrayBuffer);
-  }
-
-  /**
-   * Generate speech and convert to mu-law format for Twilio.
-   * Twilio Media Streams expect 8kHz mono mu-law audio.
-   */
-  async synthesizeForTwilio(text: string): Promise<Buffer> {
-    // Get raw PCM from OpenAI (24kHz, 16-bit signed LE, mono)
-    const pcm24k = await this.synthesize(text);
-
-    // Convert from 24kHz PCM to Twilio-compatible 8kHz mu-law
-    return convertPcmToMulaw8k(pcm24k, 24000);
-  }
-}
-
-/**
- * Convert 8-bit mu-law to 16-bit linear PCM.
- * Useful for decoding incoming audio.
- */
-export function mulawToLinear(mulaw: number): number {
-  // mu-law is transmitted inverted
-  mulaw = ~mulaw & 0xff;
-
-  const sign = mulaw & 0x80;
-  const exponent = (mulaw >> 4) & 0x07;
-  const mantissa = mulaw & 0x0f;
-
-  let sample = ((mantissa << 3) + 132) << exponent;
-  sample -= 132;
-
-  return sign ? -sample : sample;
-}
-
-/**
- * Chunk audio buffer into 20ms frames for streaming.
- * At 8kHz mono, 20ms = 160 samples = 160 bytes (mu-law).
- */
-export function chunkAudio(audio: Buffer, chunkSize = 160): Generator<Buffer, void, unknown> {
-  return (function* () {
-    for (let i = 0; i < audio.length; i += chunkSize) {
-      yield audio.subarray(i, Math.min(i + chunkSize, audio.length));
-    }
-  })();
-}
diff --git a/extensions/voice-call/src/realtime-transcription.runtime.ts b/extensions/voice-call/src/realtime-transcription.runtime.ts
new file mode 100644
index 00000000000..e532af72e13
--- /dev/null
+++ b/extensions/voice-call/src/realtime-transcription.runtime.ts
@@ -0,0 +1,4 @@
+export {
+  getRealtimeTranscriptionProvider,
+  listRealtimeTranscriptionProviders,
+} from "openclaw/plugin-sdk/realtime-transcription";
diff --git a/extensions/voice-call/src/realtime-voice.runtime.ts b/extensions/voice-call/src/realtime-voice.runtime.ts
new file mode 100644
index 00000000000..6f169676adf
--- /dev/null
+++ b/extensions/voice-call/src/realtime-voice.runtime.ts
@@ -0,0 +1,4 @@
+export {
+  getRealtimeVoiceProvider,
+  listRealtimeVoiceProviders,
+} from "openclaw/plugin-sdk/realtime-voice";
diff --git a/extensions/voice-call/src/runtime.ts b/extensions/voice-call/src/runtime.ts
index 6059f2ce5b8..501c5549fb2 100644
--- a/extensions/voice-call/src/runtime.ts
+++ b/extensions/voice-call/src/runtime.ts
@@ -1,12 +1,14 @@
+import type { OpenClawConfig } from "openclaw/plugin-sdk/core";
+import type {
+  RealtimeVoiceProviderConfig,
+  RealtimeVoiceProviderPlugin,
+} from "openclaw/plugin-sdk/realtime-voice";
 import type { VoiceCallConfig } from "./config.js";
 import { resolveVoiceCallConfig, validateProviderConfig } from "./config.js";
 import type { CoreAgentDeps, CoreConfig } from "./core-bridge.js";
 import { CallManager } from "./manager.js";
 import type { VoiceCallProvider } from "./providers/base.js";
-import { MockProvider } from "./providers/mock.js";
-import { PlivoProvider } from "./providers/plivo.js";
-import { TelnyxProvider } from "./providers/telnyx.js";
-import { TwilioProvider } from "./providers/twilio.js";
+import type { TwilioProvider } from "./providers/twilio.js";
 import type { TelephonyTtsRuntime } from "./telephony-tts.js";
 import { createTelephonyTtsProvider } from "./telephony-tts.js";
 import { startTunnel, type TunnelResult } from "./tunnel.js";
@@ -30,6 +32,11 @@ type Logger = {
   debug?: (message: string) => void;
 };
 
+type ResolvedRealtimeProvider = {
+  provider: RealtimeVoiceProviderPlugin;
+  providerConfig: RealtimeVoiceProviderConfig;
+};
+
 function createRuntimeResourceLifecycle(params: {
   config: VoiceCallConfig;
   webhookServer: VoiceCallWebhookServer;
@@ -80,14 +87,15 @@ function isLoopbackBind(bind: string | undefined): boolean {
   return bind === "127.0.0.1" || bind === "::1" || bind === "localhost";
 }
 
-function resolveProvider(config: VoiceCallConfig): VoiceCallProvider {
+async function resolveProvider(config: VoiceCallConfig): Promise<VoiceCallProvider> {
   const allowNgrokFreeTierLoopbackBypass =
     config.tunnel?.provider === "ngrok" &&
     isLoopbackBind(config.serve?.bind) &&
     (config.tunnel?.allowNgrokFreeTierLoopbackBypass ?? false);
 
   switch (config.provider) {
-    case "telnyx":
+    case "telnyx": {
+      const { TelnyxProvider } = await import("./providers/telnyx.js");
       return new TelnyxProvider(
         {
           apiKey: config.telnyx?.apiKey,
@@ -98,7 +106,9 @@ function resolveProvider(config: VoiceCallConfig): VoiceCallProvider {
           skipVerification: config.skipSignatureVerification,
         },
       );
-    case "twilio":
+    }
+    case "twilio": {
+      const { TwilioProvider } = await import("./providers/twilio.js");
       return new TwilioProvider(
         {
           accountSid: config.twilio?.accountSid,
@@ -112,7 +122,9 @@ function resolveProvider(config: VoiceCallConfig): VoiceCallProvider {
           webhookSecurity: config.webhookSecurity,
         },
       );
-    case "plivo":
+    }
+    case "plivo": {
+      const { PlivoProvider } = await import("./providers/plivo.js");
       return new PlivoProvider(
         {
           authId: config.plivo?.authId,
@@ -125,21 +137,66 @@ function resolveProvider(config: VoiceCallConfig): VoiceCallProvider {
           webhookSecurity: config.webhookSecurity,
         },
       );
-    case "mock":
+    }
+    case "mock": {
+      const { MockProvider } = await import("./providers/mock.js");
       return new MockProvider();
+    }
     default:
       throw new Error(`Unsupported voice-call provider: ${String(config.provider)}`);
   }
 }
 
+/**
+ * Selects the realtime voice provider (configured id, else lowest `autoSelectOrder`) and
+ * resolves its config; throws when it is unregistered, absent, or not configured.
+ */
+async function resolveRealtimeProvider(params: {
+  config: VoiceCallConfig;
+  fullConfig: OpenClawConfig;
+}): Promise<ResolvedRealtimeProvider> {
+  const { config, fullConfig } = params;
+  const { getRealtimeVoiceProvider, listRealtimeVoiceProviders } =
+    await import("./realtime-voice.runtime.js");
+  const requestedId = config.realtime.provider?.trim();
+  const requested = getRealtimeVoiceProvider(requestedId, fullConfig);
+  if (requestedId && !requested) {
+    throw new Error(`Realtime voice provider "${requestedId}" is not registered`);
+  }
+  const provider =
+    requested ??
+    [...listRealtimeVoiceProviders(fullConfig)].sort(
+      (a, b) =>
+        (a.autoSelectOrder ?? Number.MAX_SAFE_INTEGER) -
+        (b.autoSelectOrder ?? Number.MAX_SAFE_INTEGER),
+    )[0];
+  if (!provider) {
+    throw new Error("No realtime voice provider registered");
+  }
+
+  const providersConfig = config.realtime.providers;
+  const rawProviderConfig =
+    (providersConfig?.[provider.id] as RealtimeVoiceProviderConfig | undefined) ?? {};
+  const providerConfig =
+    provider.resolveConfig?.({
+      cfg: fullConfig,
+      rawConfig: { providers: providersConfig, [provider.id]: rawProviderConfig },
+    }) ?? rawProviderConfig;
+  if (!provider.isConfigured({ cfg: fullConfig, providerConfig })) {
+    throw new Error(`Realtime voice provider "${provider.id}" is not configured`);
+  }
+  return { provider, providerConfig };
+}
+
 export async function createVoiceCallRuntime(params: {
   config: VoiceCallConfig;
   coreConfig: CoreConfig;
+  fullConfig?: OpenClawConfig;
   agentRuntime: CoreAgentDeps;
   ttsRuntime?: TelephonyTtsRuntime;
   logger?: Logger;
 }): Promise<VoiceCallRuntime> {
-  const { config: rawConfig, coreConfig, agentRuntime, ttsRuntime, logger } = params;
+  const { config: rawConfig, coreConfig, fullConfig, agentRuntime, ttsRuntime, logger } = params;
   const log = logger ?? {
     info: console.log,
     warn: console.warn,
@@ -164,8 +221,14 @@ export async function createVoiceCallRuntime(params: {
     throw new Error(`Invalid voice-call config: ${validation.errors.join("; ")}`);
   }
 
-  const provider = resolveProvider(config);
+  const provider = await resolveProvider(config);
   const manager = new CallManager(config);
+  const realtimeProvider = config.realtime.enabled
+    ? await resolveRealtimeProvider({
+        config,
+        fullConfig: (fullConfig ?? (coreConfig as OpenClawConfig)) as OpenClawConfig,
+      })
+    : null;
   const webhookServer = new VoiceCallWebhookServer(
     config,
     manager,
@@ -173,6 +236,19 @@ export async function createVoiceCallRuntime(params: {
     coreConfig,
     agentRuntime,
   );
+  if (realtimeProvider) {
+    const { RealtimeCallHandler } = await import("./webhook/realtime-handler.js");
+    webhookServer.setRealtimeHandler(
+      new RealtimeCallHandler(
+        config.realtime,
+        manager,
+        provider,
+        realtimeProvider.provider,
+        realtimeProvider.providerConfig,
+        config.serve.path,
+      ),
+    );
+  }
   const lifecycle = createRuntimeResourceLifecycle({ config, webhookServer });
 
   const localUrl = await webhookServer.start();
@@ -212,6 +288,9 @@ export async function createVoiceCallRuntime(params: {
     if (publicUrl && provider.name === "twilio") {
       (provider as TwilioProvider).setPublicUrl(publicUrl);
     }
+    if (publicUrl && realtimeProvider) {
+      webhookServer.getRealtimeHandler()?.setPublicUrl(publicUrl);
+    }
 
     if (provider.name === "twilio" && config.streaming?.enabled) {
       const twilioProvider = provider as TwilioProvider;
@@ -243,6 +322,10 @@ export async function createVoiceCallRuntime(params: {
       }
     }
 
+    if (realtimeProvider) {
+      log.info(`[voice-call] Realtime voice provider: ${realtimeProvider.provider.id}`);
+    }
+
     await manager.initialize(provider, webhookUrl);
 
     const stop = async () => await lifecycle.stop();
diff --git a/extensions/voice-call/src/test-fixtures.ts b/extensions/voice-call/src/test-fixtures.ts
index 4302143b7f0..bb05a6e4bc6 100644
--- a/extensions/voice-call/src/test-fixtures.ts
+++ b/extensions/voice-call/src/test-fixtures.ts
@@ -30,16 +30,26 @@ export function createVoiceCallBaseConfig(params?: {
     },
     streaming: {
       enabled: false,
-      sttProvider: "openai-realtime",
-      sttModel: "gpt-4o-transcribe",
-      silenceDurationMs: 800,
-      vadThreshold: 0.5,
+      provider: "openai",
+      providers: {
+        openai: {
+          model: "gpt-4o-transcribe",
+          silenceDurationMs: 800,
+          vadThreshold: 0.5,
+        },
+      },
       streamPath: "/voice/stream",
       preStartTimeoutMs: 5000,
       maxPendingConnections: 32,
       maxPendingConnectionsPerIp: 4,
       maxConnections: 128,
     },
+    realtime: {
+      enabled: false,
+      streamPath: "/voice/stream/realtime",
+      tools: [],
+      providers: {},
+    },
     skipSignatureVerification: false,
     stt: { provider: "openai", model: "whisper-1" },
     tts: {
diff --git a/extensions/voice-call/src/webhook.test.ts b/extensions/voice-call/src/webhook.test.ts
index f87193c7cd4..cb15233f856 100644
--- a/extensions/voice-call/src/webhook.test.ts
+++ b/extensions/voice-call/src/webhook.test.ts
@@ -1,10 +1,36 @@
 import { request } from "node:http";
+import type { RealtimeTranscriptionProviderPlugin } from "openclaw/plugin-sdk/realtime-transcription";
 import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
 import { VoiceCallConfigSchema, type VoiceCallConfig } from "./config.js";
 import type { CallManager } from "./manager.js";
 import type { VoiceCallProvider } from "./providers/base.js";
 import type { CallRecord, NormalizedEvent } from "./types.js";
 import { VoiceCallWebhookServer } from "./webhook.js";
+import type { RealtimeCallHandler } from "./webhook/realtime-handler.js";
+
+const mocks = vi.hoisted(() => {
+  const realtimeTranscriptionProvider: RealtimeTranscriptionProviderPlugin = {
+    id: "openai",
+    label: "OpenAI",
+    aliases: ["openai-realtime"],
+    isConfigured: () => true,
+    resolveConfig: ({ rawConfig }) => rawConfig,
+    createSession: () => ({
+      connect: async () => {},
+      sendAudio: () => {},
+      close: () => {},
+      isConnected: () => true,
+    }),
+  };
+
+  return {
+    getRealtimeTranscriptionProvider: vi.fn(() => realtimeTranscriptionProvider),
+  };
+});
+
+vi.mock("./realtime-transcription.runtime.js", () => ({
+  getRealtimeTranscriptionProvider: mocks.getRealtimeTranscriptionProvider,
+}));
 
 const provider: VoiceCallProvider = {
   name: "mock",
@@ -291,6 +317,56 @@ describe("VoiceCallWebhookServer replay handling", () => {
     }
   });
 
+  it("returns realtime TwiML for replayed inbound twilio webhooks", async () => {
+    const parseWebhookEvent = vi.fn(() => ({ events: [], statusCode: 200 }));
+    const twilioProvider: VoiceCallProvider = {
+      ...provider,
+      name: "twilio",
+      verifyWebhook: () => ({ ok: true, isReplay: true, verifiedRequestKey: "twilio:req:replay" }),
+      parseWebhookEvent,
+    };
+    const { manager, processEvent } = createManager([]);
+    const config = createConfig({
+      provider: "twilio",
+      inboundPolicy: "allowlist",
+      realtime: {
+        enabled: true,
+        streamPath: "/voice/stream/realtime",
+        tools: [],
+        providers: {},
+      },
+    });
+    const server = new VoiceCallWebhookServer(config, manager, twilioProvider);
+    server.setRealtimeHandler({
+      buildTwiMLPayload: () => ({
+        statusCode: 200,
+        headers: { "Content-Type": "text/xml" },
+        body: '<Response><Connect><Stream url="wss://example.test/voice/stream/realtime/token" /></Connect></Response>',
+      }),
+      getStreamPathPattern: () => "/voice/stream/realtime",
+      handleWebSocketUpgrade: () => {},
+      registerToolHandler: () => {},
+      setPublicUrl: () => {},
+    } as unknown as RealtimeCallHandler);
+
+    try {
+      const baseUrl = await server.start();
+      const response = await postWebhookFormWithHeaders(
+        server,
+        baseUrl,
+        "CallSid=CA123&Direction=inbound&CallStatus=ringing",
+        { "x-twilio-signature": "sig" },
+      );
+
+      expect(response.status).toBe(200);
+      expect(await response.text()).toContain("<Connect><Stream");
+      expect(parseWebhookEvent).not.toHaveBeenCalled();
+      expect(processEvent).not.toHaveBeenCalled();
+    } finally {
+      await server.stop();
+    }
+  });
+
   it("passes verified request key from verifyWebhook into parseWebhookEvent", async () => {
     const parseWebhookEvent = vi.fn((_ctx: unknown, options?: { verifiedRequestKey?: string }) => ({
       events: [
@@ -625,6 +701,7 @@ describe("VoiceCallWebhookServer stream disconnect grace", () => {
       manager,
       twilioProvider as unknown as VoiceCallProvider,
     );
+    await server.start();
 
     const mediaHandler = server.getMediaStreamHandler() as unknown as {
       config: {
@@ -717,6 +794,7 @@ describe("VoiceCallWebhookServer barge-in suppression during initial message", (
       manager,
       createTwilioProvider(clearTtsQueue) as unknown as VoiceCallProvider,
     );
+    await server.start();
     const handleInboundResponse = vi.fn(async () => {});
     (
       server as unknown as {
@@ -790,6 +868,7 @@ describe("VoiceCallWebhookServer barge-in suppression during initial message", (
       manager,
       createTwilioProvider(clearTtsQueue) as unknown as VoiceCallProvider,
     );
+    await server.start();
 
     try {
       const media = getMediaCallbacks(server);
diff --git a/extensions/voice-call/src/webhook.ts b/extensions/voice-call/src/webhook.ts
index 4e20a00f441..e9f36a61ad5 100644
--- a/extensions/voice-call/src/webhook.ts
+++ b/extensions/voice-call/src/webhook.ts
@@ -1,5 +1,6 @@
 import http from "node:http";
 import { URL } from "node:url";
+import type { OpenClawConfig } from "openclaw/plugin-sdk/core";
 import {
   createWebhookInFlightLimiter,
   WEBHOOK_BODY_READ_DEFAULTS,
@@ -16,9 +17,10 @@ import type { CallManager } from "./manager.js";
 import type { MediaStreamConfig } from "./media-stream.js";
 import { MediaStreamHandler } from "./media-stream.js";
 import type { VoiceCallProvider } from "./providers/base.js";
-import { OpenAIRealtimeSTTProvider } from "./providers/stt-openai-realtime.js";
+import { isProviderStatusTerminal } from "./providers/shared/call-status.js";
 import type { TwilioProvider } from "./providers/twilio.js";
 import type { CallRecord, NormalizedEvent, WebhookContext } from "./types.js";
+import type { RealtimeCallHandler } from "./webhook/realtime-handler.js";
 import { startStaleCallReaper } from "./webhook/stale-call-reaper.js";
 
 const MAX_WEBHOOK_BODY_BYTES = WEBHOOK_BODY_READ_DEFAULTS.preAuth.maxBytes;
@@ -44,7 +46,7 @@ function sanitizeTranscriptForLog(value: string): string {
   return `${sanitized.slice(0, TRANSCRIPT_LOG_MAX_CHARS)}...`;
 }
 
-type WebhookResponsePayload = {
+export type WebhookResponsePayload = {
   statusCode: number;
   body: string;
   headers?: Record<string, string>;
@@ -89,6 +91,8 @@ export class VoiceCallWebhookServer {
   private mediaStreamHandler: MediaStreamHandler | null = null;
   /** Delayed auto-hangup timers keyed by provider call ID after stream disconnect. */
   private pendingDisconnectHangups = new Map<string, ReturnType<typeof setTimeout>>();
+  /** Realtime voice handler for duplex provider bridges. */
+  private realtimeHandler: RealtimeCallHandler | null = null;
 
   constructor(
     config: VoiceCallConfig,
@@ -102,11 +106,6 @@ export class VoiceCallWebhookServer {
     this.provider = provider;
     this.coreConfig = coreConfig ?? null;
     this.agentRuntime = agentRuntime ?? null;
-
-    // Initialize media stream handler if streaming is enabled
-    if (this.config.streaming.enabled) {
-      this.initializeMediaStreaming();
-    }
   }
 
   /**
@@ -116,6 +115,14 @@ export class VoiceCallWebhookServer {
     return this.mediaStreamHandler;
   }
 
+  getRealtimeHandler(): RealtimeCallHandler | null {
+    return this.realtimeHandler;
+  }
+
+  setRealtimeHandler(handler: RealtimeCallHandler): void {
+    this.realtimeHandler = handler;
+  }
+
   private clearPendingDisconnectHangup(providerCallId: string): void {
     const existing = this.pendingDisconnectHangups.get(providerCallId);
     if (!existing) {
@@ -147,26 +154,50 @@ export class VoiceCallWebhookServer {
   }
 
   /**
-   * Initialize media streaming with OpenAI Realtime STT.
+   * Initialize media streaming with the selected realtime transcription provider.
    */
-  private initializeMediaStreaming(): void {
+  private async initializeMediaStreaming(): Promise<void> {
     const streaming = this.config.streaming;
-    const apiKey = streaming.openaiApiKey ?? process.env.OPENAI_API_KEY;
-
-    if (!apiKey) {
-      console.warn("[voice-call] Streaming enabled but no OpenAI API key found");
+    const selectedProviderId = streaming.provider;
+    const pluginConfig = this.coreConfig as unknown as OpenClawConfig | undefined;
+    const { getRealtimeTranscriptionProvider } =
+      await import("./realtime-transcription.runtime.js");
+    const provider = getRealtimeTranscriptionProvider(selectedProviderId, pluginConfig);
+    if (!provider) {
+      console.warn(
+        `[voice-call] Streaming enabled but realtime transcription provider "${selectedProviderId}" is not registered`,
+      );
+      return;
+    }
+    const selectedProviderConfig =
+      streaming.providers[selectedProviderId] &&
+      typeof streaming.providers[selectedProviderId] === "object"
+        ? (streaming.providers[selectedProviderId] as Record<string, unknown>)
+        : undefined;
+    const canonicalProviderConfig =
+      streaming.providers[provider.id] && typeof streaming.providers[provider.id] === "object"
+        ? (streaming.providers[provider.id] as Record<string, unknown>)
+        : undefined;
+    const rawProviderConfig = {
+      ...(canonicalProviderConfig ?? {}),
+      ...(selectedProviderConfig ?? {}),
+    };
+    const providerConfig = provider.resolveConfig
+      ? provider.resolveConfig({
+          cfg: pluginConfig ?? ({} as OpenClawConfig),
+          rawConfig: rawProviderConfig,
+        })
+      : rawProviderConfig;
+    if (!provider.isConfigured({ cfg: pluginConfig, providerConfig })) {
+      console.warn(
+        `[voice-call] Streaming enabled but provider "${provider.id}" is not configured`,
+      );
       return;
     }
 
-    const sttProvider = new OpenAIRealtimeSTTProvider({
-      apiKey,
-      model: streaming.sttModel,
-      silenceDurationMs: streaming.silenceDurationMs,
-      vadThreshold: streaming.vadThreshold,
-    });
-
     const streamConfig: MediaStreamConfig = {
-      sttProvider,
+      transcriptionProvider: provider,
+      providerConfig,
       preStartTimeoutMs: streaming.preStartTimeoutMs,
       maxPendingConnections: streaming.maxPendingConnections,
       maxPendingConnectionsPerIp: streaming.maxPendingConnectionsPerIp,
@@ -309,6 +340,10 @@ export class VoiceCallWebhookServer {
       return this.listeningUrl ?? this.resolveListeningUrl(bind, webhookPath);
     }
 
+    if (this.config.streaming.enabled && !this.mediaStreamHandler) {
+      await this.initializeMediaStreaming();
+    }
+
     return new Promise((resolve, reject) => {
       this.server = http.createServer((req, res) => {
         this.handleRequest(req, res, webhookPath).catch((err) => {
@@ -318,12 +353,15 @@ export class VoiceCallWebhookServer {
         });
       });
 
-      // Handle WebSocket upgrades for media streams
-      if (this.mediaStreamHandler) {
+      // Handle WebSocket upgrades for realtime voice and media streams.
+      if (this.realtimeHandler || this.mediaStreamHandler) {
         this.server.on("upgrade", (request, socket, head) => {
+          if (this.realtimeHandler && this.isRealtimeWebSocketUpgrade(request)) {
+            this.realtimeHandler.handleWebSocketUpgrade(request, socket, head);
+            return;
+          }
           const path = this.getUpgradePathname(request);
-          if (path === streamPath) {
-            console.log("[voice-call] WebSocket upgrade for media stream");
+          if (path === streamPath && this.mediaStreamHandler) {
             this.mediaStreamHandler?.handleUpgrade(request, socket, head);
           } else {
             socket.destroy();
@@ -504,6 +542,10 @@ export class VoiceCallWebhookServer {
         return { statusCode: 401, body: "Unauthorized" };
       }
 
+      if (this.shouldShortCircuitToRealtimeTwiml(ctx)) {
+        return this.realtimeHandler!.buildTwiMLPayload(req, new URLSearchParams(ctx.rawBody));
+      }
+
       const parsed = this.provider.parseWebhookEvent(ctx, {
         verifiedRequestKey: verification.verifiedRequestKey,
       });
@@ -555,6 +597,42 @@ export class VoiceCallWebhookServer {
     }
   }
 
+  private isRealtimeWebSocketUpgrade(req: http.IncomingMessage): boolean {
+    // Matches by path prefix; any error while resolving the request path counts as "no".
+    try {
+      const prefix = this.realtimeHandler?.getStreamPathPattern();
+      return !!prefix && buildRequestUrl(req.url, req.headers.host).pathname.startsWith(prefix);
+    } catch {
+      return false;
+    }
+  }
+
+  /**
+   * Decides whether a webhook should be answered directly with realtime TwiML.
+   * Only applies to Twilio with a realtime handler attached, for inbound (or undirected)
+   * requests that are not status callbacks and do not carry a terminal call status.
+   */
+  private shouldShortCircuitToRealtimeTwiml(ctx: WebhookContext): boolean {
+    if (!(this.realtimeHandler && this.provider.name === "twilio")) {
+      return false;
+    }
+    const form = new URLSearchParams(ctx.rawBody);
+    const direction = form.get("Direction");
+    const outbound = Boolean(direction) && direction !== "inbound";
+    const statusCallback = ctx.query?.type === "status";
+    if (outbound || statusCallback) {
+      return false;
+    }
+    const callStatus = form.get("CallStatus");
+    if (callStatus && isProviderStatusTerminal(callStatus)) {
+      return false;
+    }
+    // Replays must yield the same TwiML behavior so Twilio retries reconnect cleanly
+    // (only the one-time token differs); SpeechResult/Digits requests use the normal path.
+    const gathered = form.get("SpeechResult") || form.get("Digits");
+    return !gathered;
+  }
+
   private processParsedEvents(events: NormalizedEvent[]): void {
     for (const event of events) {
       try {
diff --git a/extensions/voice-call/src/webhook/realtime-handler.test.ts b/extensions/voice-call/src/webhook/realtime-handler.test.ts
new file mode 100644
index 00000000000..dd6489fd9b9
--- /dev/null
+++ b/extensions/voice-call/src/webhook/realtime-handler.test.ts
@@ -0,0 +1,92 @@
+import http from "node:http";
+import type {
+  RealtimeVoiceBridge,
+  RealtimeVoiceProviderPlugin,
+} from "openclaw/plugin-sdk/realtime-voice";
+import { describe, expect, it, vi } from "vitest";
+import type { VoiceCallRealtimeConfig } from "../config.js";
+import type { CallManager } from "../manager.js";
+import type { VoiceCallProvider } from "../providers/base.js";
+import { RealtimeCallHandler } from "./realtime-handler.js";
+
+/** Builds a socket-less POST request for `url`; an empty `host` yields no headers at all. */
+function makeRequest(url: string, host = "gateway.ts.net"): http.IncomingMessage {
+  const request = new http.IncomingMessage(null as never);
+  const headers: http.IncomingHttpHeaders = host === "" ? {} : { host };
+  Object.assign(request, { url, method: "POST", headers });
+  return request;
+}
+
+function makeBridge(): RealtimeVoiceBridge {
+  return {
+    connect: async () => {},
+    sendAudio: () => {},
+    setMediaTimestamp: () => {},
+    submitToolResult: () => {},
+    acknowledgeMark: () => {},
+    close: () => {},
+    isConnected: () => true,
+    triggerGreeting: () => {},
+  };
+}
+
+const realtimeProvider: RealtimeVoiceProviderPlugin = {
+  id: "openai",
+  label: "OpenAI",
+  isConfigured: () => true,
+  createBridge: () => makeBridge(),
+};
+
+function makeHandler(overrides?: Partial<VoiceCallRealtimeConfig>) {
+  return new RealtimeCallHandler(
+    {
+      enabled: true,
+      streamPath: "/voice/stream/realtime",
+      instructions: "Be helpful.",
+      tools: [],
+      providers: {},
+      ...overrides,
+    },
+    {
+      processEvent: vi.fn(),
+      getCallByProviderCallId: vi.fn(),
+    } as unknown as CallManager,
+    {
+      name: "twilio",
+      verifyWebhook: vi.fn(),
+      parseWebhookEvent: vi.fn(),
+      initiateCall: vi.fn(),
+      hangupCall: vi.fn(),
+      playTts: vi.fn(),
+      startListening: vi.fn(),
+      stopListening: vi.fn(),
+      getCallStatus: vi.fn(),
+    } as unknown as VoiceCallProvider,
+    realtimeProvider,
+    { apiKey: "test-key" },
+    "/voice/webhook",
+  );
+}
+
+describe("RealtimeCallHandler path routing", () => {
+  it("uses the request host and stream path in TwiML", () => {
+    const handler = makeHandler();
+    const payload = handler.buildTwiMLPayload(makeRequest("/voice/webhook", "gateway.ts.net"));
+
+    expect(payload.statusCode).toBe(200);
+    expect(payload.body).toMatch(
+      /wss:\/\/gateway\.ts\.net\/voice\/stream\/realtime\/[0-9a-f-]{36}/,
+    );
+  });
+
+  it("preserves a public path prefix ahead of serve.path", () => {
+    const handler = makeHandler({ streamPath: "/custom/stream/realtime" });
+    handler.setPublicUrl("https://public.example/api/voice/webhook");
+    const payload = handler.buildTwiMLPayload(makeRequest("/voice/webhook", "127.0.0.1:3334"));
+
+    expect(handler.getStreamPathPattern()).toBe("/api/custom/stream/realtime");
+    expect(payload.body).toMatch(
+      /wss:\/\/public\.example\/api\/custom\/stream\/realtime\/[0-9a-f-]{36}/,
+    );
+  });
+});
diff --git a/extensions/voice-call/src/webhook/realtime-handler.ts b/extensions/voice-call/src/webhook/realtime-handler.ts
new file mode 100644
index 00000000000..dd6165bde83
--- /dev/null
+++ b/extensions/voice-call/src/webhook/realtime-handler.ts
@@ -0,0 +1,413 @@
+import { randomUUID } from "node:crypto";
+import http from "node:http";
+import type { Duplex } from "node:stream";
+import type {
+  RealtimeVoiceBridge,
+  RealtimeVoiceProviderConfig,
+  RealtimeVoiceProviderPlugin,
+} from "openclaw/plugin-sdk/realtime-voice";
+import WebSocket, { WebSocketServer } from "ws";
+import type { VoiceCallRealtimeConfig } from "../config.js";
+import type { CallManager } from "../manager.js";
+import type { VoiceCallProvider } from "../providers/base.js";
+import type { CallRecord, NormalizedEvent } from "../types.js";
+import type { WebhookResponsePayload } from "../webhook.js";
+
+export type ToolHandlerFn = (args: unknown, callId: string) => Promise<unknown>;
+
+const STREAM_TOKEN_TTL_MS = 30_000;
+const DEFAULT_HOST = "localhost:8443";
+
+/** Trims a path, forces a leading "/", and drops one trailing "/" (never from the bare root). */
+function normalizePath(pathname: string): string {
+  const raw = pathname.trim();
+  if (raw.length === 0) {
+    return "/";
+  }
+  const rooted = raw.startsWith("/") ? raw : `/${raw}`;
+  // A lone "/" is already canonical; anything longer loses a single trailing slash.
+  const hasTrailingSlash = rooted.length > 1 && rooted.endsWith("/");
+  return hasTrailingSlash ? rooted.slice(0, -1) : rooted;
+}
+
+/** Appends a "greet the caller" directive to the base prompt; a blank greeting changes nothing. */
+function buildGreetingInstructions(
+  baseInstructions: string | undefined,
+  greeting: string | undefined,
+): string | undefined {
+  const spokenGreeting = greeting?.trim();
+  if (spokenGreeting === undefined || spokenGreeting === "") {
+    return baseInstructions;
+  }
+  const intro =
+    "Start the call by greeting the caller naturally. Include this greeting in your first spoken reply:";
+  const directive = `${intro} "${spokenGreeting}"`;
+  return baseInstructions ? `${baseInstructions}\n\n${directive}` : directive;
+}
+
+type PendingStreamToken = {
+  expiry: number;
+  from?: string;
+  to?: string;
+  direction?: "inbound" | "outbound";
+};
+
+type CallRegistration = {
+  callId: string;
+  initialGreetingInstructions?: string;
+};
+
+export class RealtimeCallHandler {
+  private readonly toolHandlers = new Map<string, ToolHandlerFn>();
+  private readonly pendingStreamTokens = new Map<string, PendingStreamToken>();
+  private publicOrigin: string | null = null;
+  private publicPathPrefix = "";
+
+  constructor(
+    private readonly config: VoiceCallRealtimeConfig,
+    private readonly manager: CallManager,
+    private readonly provider: VoiceCallProvider,
+    private readonly realtimeProvider: RealtimeVoiceProviderPlugin,
+    private readonly providerConfig: RealtimeVoiceProviderConfig,
+    private readonly servePath: string,
+  ) {}
+
+  setPublicUrl(url: string): void {
+    try {
+      const parsed = new URL(url);
+      this.publicOrigin = parsed.host;
+      const normalizedServePath = normalizePath(this.servePath);
+      const normalizedPublicPath = normalizePath(parsed.pathname);
+      const idx = normalizedPublicPath.indexOf(normalizedServePath);
+      this.publicPathPrefix = idx > 0 ? normalizedPublicPath.slice(0, idx) : "";
+    } catch {
+      this.publicOrigin = null;
+      this.publicPathPrefix = "";
+    }
+  }
+
+  getStreamPathPattern(): string {
+    return `${this.publicPathPrefix}${normalizePath(this.config.streamPath ?? "/voice/stream/realtime")}`;
+  }
+
+  /** Builds TwiML that connects the call to a tokenized realtime stream WebSocket URL. */
+  buildTwiMLPayload(req: http.IncomingMessage, params?: URLSearchParams): WebhookResponsePayload {
+    const host = this.publicOrigin || req.headers.host || DEFAULT_HOST;
+    const token = this.issueStreamToken({
+      from: params?.get("From") ?? undefined,
+      to: params?.get("To") ?? undefined,
+      direction: params?.get("Direction") === "outbound-api" ? "outbound" : "inbound",
+    });
+    // host can come from the request's Host header, so escape XML metacharacters before embedding.
+    const wsUrl = `wss://${host}${this.getStreamPathPattern()}/${token}`.replace(
+      /[&<>"']/g,
+      (ch) => `&#${ch.charCodeAt(0)};`,
+    );
+    const twiml = `<?xml version="1.0" encoding="UTF-8"?>
+<Response>
+  <Connect>
+    <Stream url="${wsUrl}" />
+  </Connect>
+</Response>`;
+    return { statusCode: 200, headers: { "Content-Type": "text/xml" }, body: twiml };
+  }
+
+  handleWebSocketUpgrade(request: http.IncomingMessage, socket: Duplex, head: Buffer): void {
+    const url = new URL(request.url ?? "/", "wss://localhost");
+    const token = url.pathname.split("/").pop() ?? null;
+    const callerMeta = token ? this.consumeStreamToken(token) : null;
+    if (!callerMeta) {
+      socket.write("HTTP/1.1 401 Unauthorized\r\n\r\n");
+      socket.destroy();
+      return;
+    }
+
+    const wss = new WebSocketServer({ noServer: true });
+    wss.handleUpgrade(request, socket, head, (ws) => {
+      let bridge: RealtimeVoiceBridge | null = null;
+      let initialized = false;
+
+      ws.on("message", (data: Buffer) => {
+        try {
+          const msg = JSON.parse(data.toString()) as Record<string, unknown>;
+          if (!initialized && msg.event === "start") {
+            initialized = true;
+            const startData =
+              typeof msg.start === "object" && msg.start !== null
+                ? (msg.start as Record<string, unknown>)
+                : undefined;
+            const streamSid =
+              typeof startData?.streamSid === "string" ? startData.streamSid : "unknown";
+            const callSid = typeof startData?.callSid === "string" ? startData.callSid : "unknown";
+            bridge = this.handleCall(streamSid, callSid, ws, callerMeta);
+            return;
+          }
+          if (!bridge) {
+            return;
+          }
+          const mediaData =
+            typeof msg.media === "object" && msg.media !== null
+              ? (msg.media as Record<string, unknown>)
+              : undefined;
+          if (msg.event === "media" && typeof mediaData?.payload === "string") {
+            bridge.sendAudio(Buffer.from(mediaData.payload, "base64"));
+            if (typeof mediaData.timestamp === "number") {
+              bridge.setMediaTimestamp(mediaData.timestamp);
+            } else if (typeof mediaData.timestamp === "string") {
+              bridge.setMediaTimestamp(Number.parseInt(mediaData.timestamp, 10));
+            }
+            return;
+          }
+          if (msg.event === "mark") {
+            bridge.acknowledgeMark();
+            return;
+          }
+          if (msg.event === "stop") {
+            bridge.close();
+          }
+        } catch (error) {
+          console.error("[voice-call] realtime WS parse failed:", error);
+        }
+      });
+
+      ws.on("close", () => {
+        bridge?.close();
+      });
+    });
+  }
+
+  registerToolHandler(name: string, fn: ToolHandlerFn): void {
+    this.toolHandlers.set(name, fn);
+  }
+
+  private issueStreamToken(meta: Omit<PendingStreamToken, "expiry"> = {}): string {
+    const token = randomUUID();
+    this.pendingStreamTokens.set(token, { expiry: Date.now() + STREAM_TOKEN_TTL_MS, ...meta });
+    for (const [candidate, entry] of this.pendingStreamTokens) {
+      if (Date.now() > entry.expiry) {
+        this.pendingStreamTokens.delete(candidate);
+      }
+    }
+    return token;
+  }
+
+  /**
+   * Redeems a stream token. Each token works once: it is removed from the pending map on
+   * lookup, and an unknown or expired token yields null.
+   */
+  private consumeStreamToken(token: string): Omit<PendingStreamToken, "expiry"> | null {
+    const pending = this.pendingStreamTokens.get(token);
+    // Deleting an absent key is a no-op, so this is safe before the existence check.
+    this.pendingStreamTokens.delete(token);
+    if (!pending || Date.now() > pending.expiry) {
+      return null;
+    }
+    // Hand back only the caller metadata; the expiry stays internal.
+    const { from, to, direction } = pending;
+    return { from, to, direction };
+  }
+
+  private handleCall(
+    streamSid: string,
+    callSid: string,
+    ws: WebSocket,
+    callerMeta: Omit<PendingStreamToken, "expiry">,
+  ): RealtimeVoiceBridge | null {
+    const registration = this.registerCallInManager(callSid, callerMeta);
+    if (!registration) {
+      ws.close(1008, "Caller rejected by policy");
+      return null;
+    }
+
+    const { callId, initialGreetingInstructions } = registration;
+    let bridge: RealtimeVoiceBridge | null = null;
+    let callEndEmitted = false;
+    const emitCallEnd = (reason: "completed" | "error") => {
+      if (callEndEmitted) {
+        return;
+      }
+      callEndEmitted = true;
+      this.endCallInManager(callSid, callId, reason);
+    };
+
+    bridge = this.realtimeProvider.createBridge({
+      providerConfig: this.providerConfig,
+      instructions: this.config.instructions,
+      tools: this.config.tools,
+      onAudio: (muLaw) => {
+        if (ws.readyState !== WebSocket.OPEN) {
+          return;
+        }
+        ws.send(
+          JSON.stringify({
+            event: "media",
+            streamSid,
+            media: { payload: muLaw.toString("base64") },
+          }),
+        );
+      },
+      onClearAudio: () => {
+        if (ws.readyState !== WebSocket.OPEN) {
+          return;
+        }
+        ws.send(JSON.stringify({ event: "clear", streamSid }));
+      },
+      onMark: (markName) => {
+        if (ws.readyState !== WebSocket.OPEN) {
+          return;
+        }
+        ws.send(JSON.stringify({ event: "mark", streamSid, mark: { name: markName } }));
+      },
+      onTranscript: (role, text, isFinal) => {
+        if (!isFinal) {
+          return;
+        }
+        if (role === "user") {
+          const event: NormalizedEvent = {
+            id: `realtime-speech-${callSid}-${Date.now()}`,
+            type: "call.speech",
+            callId,
+            providerCallId: callSid,
+            timestamp: Date.now(),
+            transcript: text,
+            isFinal: true,
+          };
+          this.manager.processEvent(event);
+          return;
+        }
+        this.manager.processEvent({
+          id: `realtime-bot-${callSid}-${Date.now()}`,
+          type: "call.speaking",
+          callId,
+          providerCallId: callSid,
+          timestamp: Date.now(),
+          text,
+        });
+      },
+      onToolCall: (toolEvent) => {
+        if (!bridge) {
+          return;
+        }
+        void this.executeToolCall(
+          bridge,
+          callId,
+          toolEvent.callId || toolEvent.itemId,
+          toolEvent.name,
+          toolEvent.args,
+        );
+      },
+      onReady: () => {
+        bridge?.triggerGreeting?.(initialGreetingInstructions);
+      },
+      onError: (error) => {
+        console.error("[voice-call] realtime voice error:", error.message);
+      },
+      onClose: (reason) => {
+        if (reason !== "error") {
+          return;
+        }
+        emitCallEnd("error");
+        if (ws.readyState === WebSocket.OPEN) {
+          ws.close(1011, "Bridge disconnected");
+        }
+        void this.provider
+          .hangupCall({ callId, providerCallId: callSid, reason: "error" })
+          .catch((error: unknown) => {
+            console.warn(
+              `[voice-call] Failed to hang up realtime call ${callSid}: ${
+                error instanceof Error ? error.message : String(error)
+              }`,
+            );
+          });
+      },
+    });
+
+    bridge.connect().catch((error: Error) => {
+      console.error("[voice-call] Failed to connect realtime bridge:", error);
+      bridge?.close();
+      emitCallEnd("error");
+      ws.close(1011, "Failed to connect");
+    });
+
+    return bridge;
+  }
+
+  private registerCallInManager(
+    callSid: string,
+    callerMeta: Omit<PendingStreamToken, "expiry"> = {},
+  ): CallRegistration | null {
+    const timestamp = Date.now();
+    const baseFields = {
+      providerCallId: callSid,
+      timestamp,
+      direction: (callerMeta.direction ?? "inbound") as "inbound" | "outbound",
+      ...(callerMeta.from ? { from: callerMeta.from } : {}),
+      ...(callerMeta.to ? { to: callerMeta.to } : {}),
+    };
+
+    this.manager.processEvent({
+      id: `realtime-initiated-${callSid}`,
+      callId: callSid,
+      type: "call.initiated",
+      ...baseFields,
+    });
+
+    const callRecord = this.manager.getCallByProviderCallId(callSid);
+    if (!callRecord) {
+      return null;
+    }
+
+    const initialGreeting = this.extractInitialGreeting(callRecord);
+    if (callRecord.metadata) {
+      delete callRecord.metadata.initialMessage;
+    }
+
+    this.manager.processEvent({
+      id: `realtime-answered-${callSid}`,
+      callId: callSid,
+      type: "call.answered",
+      ...baseFields,
+    });
+
+    return {
+      callId: callRecord.callId,
+      initialGreetingInstructions: buildGreetingInstructions(
+        this.config.instructions,
+        initialGreeting,
+      ),
+    };
+  }
+
+  private extractInitialGreeting(call: CallRecord): string | undefined {
+    return typeof call.metadata?.initialMessage === "string"
+      ? call.metadata.initialMessage
+      : undefined;
+  }
+
+  private endCallInManager(callSid: string, callId: string, reason: "completed" | "error"): void {
+    this.manager.processEvent({
+      id: `realtime-ended-${callSid}-${Date.now()}`,
+      type: "call.ended",
+      callId,
+      providerCallId: callSid,
+      timestamp: Date.now(),
+      reason,
+    });
+  }
+
+  private async executeToolCall(
+    bridge: RealtimeVoiceBridge,
+    callId: string,
+    bridgeCallId: string,
+    name: string,
+    args: unknown,
+  ): Promise<void> {
+    const handler = this.toolHandlers.get(name);
+    const missing = { error: `Tool "${name}" not available` };
+    // Run the handler inside an async thunk so a synchronous throw is reported like a rejection.
+    const result = await (async () => (handler ? handler(args, callId) : missing))().catch(
+      (error: unknown) => ({ error: error instanceof Error ? error.message : String(error) }),
+    );
+    bridge.submitToolResult(bridgeCallId, result);
+  }
+}
diff --git a/extensions/zai/test-api.ts b/extensions/zai/test-api.ts
new file mode 100644
index 00000000000..19ef1fbacf3
--- /dev/null
+++ b/extensions/zai/test-api.ts
@@ -0,0 +1 @@
+export { zaiMediaUnderstandingProvider } from "./media-understanding-provider.js";
diff --git a/package.json b/package.json
index b2222d56b40..7ac3c149f23 100644
--- a/package.json
+++ b/package.json
@@ -551,6 +551,14 @@
       "types": "./dist/plugin-sdk/reply-history.d.ts",
       "default": "./dist/plugin-sdk/reply-history.js"
     },
+    "./plugin-sdk/realtime-voice": {
+      "types": "./dist/plugin-sdk/realtime-voice.d.ts",
+      "default": "./dist/plugin-sdk/realtime-voice.js"
+    },
+    "./plugin-sdk/realtime-transcription": {
+      "types": "./dist/plugin-sdk/realtime-transcription.d.ts",
+      "default": "./dist/plugin-sdk/realtime-transcription.js"
+    },
     "./plugin-sdk/media-understanding": {
       "types": "./dist/plugin-sdk/media-understanding.d.ts",
       "default": "./dist/plugin-sdk/media-understanding.js"
diff --git a/scripts/lib/plugin-sdk-entrypoints.json b/scripts/lib/plugin-sdk-entrypoints.json
index 13006c74a74..269bca270de 100644
--- a/scripts/lib/plugin-sdk-entrypoints.json
+++ b/scripts/lib/plugin-sdk-entrypoints.json
@@ -127,6 +127,8 @@
   "kimi-coding",
   "kilocode",
   "reply-history",
+  "realtime-transcription",
+  "realtime-voice",
   "media-understanding",
   "request-url",
   "runtime-store",
diff --git a/scripts/write-cli-startup-metadata.ts b/scripts/write-cli-startup-metadata.ts
index 402577ca8c0..4ede5e710ec 100644
--- a/scripts/write-cli-startup-metadata.ts
+++ b/scripts/write-cli-startup-metadata.ts
@@ -1,7 +1,8 @@
+import { spawnSync } from "node:child_process";
 import { mkdirSync, readdirSync, readFileSync, writeFileSync } from "node:fs";
 import path from "node:path";
-import { fileURLToPath } from "node:url";
-import { renderRootHelpText } from "../src/cli/program/root-help.ts";
+import { fileURLToPath, pathToFileURL } from "node:url";
+import { renderRootHelpText as renderSourceRootHelpText } from "../src/cli/program/root-help.ts";
 
 function dedupe(values: string[]): string[] {
   const seen = new Set<string>();
@@ -82,7 +83,37 @@ export function readBundledChannelCatalogIds(
 export async function renderBundledRootHelpText(
-  _distDirOverride: string = distDir,
+  distDirOverride: string = distDir, // directory scanned for the built root-help-*.js bundle
 ): Promise<string> {
-  return await renderRootHelpText({ pluginDescriptors: [] });
+  const bundleName = readdirSync(distDirOverride).find(
+    (entry) => entry.startsWith("root-help-") && entry.endsWith(".js"),
+  );
+  if (!bundleName) {
+    throw new Error("No root-help bundle found in dist; cannot write CLI startup metadata.");
+  }
+  const moduleUrl = pathToFileURL(path.join(distDirOverride, bundleName)).href;
+  const inlineModule = [
+    `const mod = await import(${JSON.stringify(moduleUrl)});`,
+    "if (typeof mod.outputRootHelp !== 'function') {",
+    `  throw new Error(${JSON.stringify(`Bundle ${bundleName} does not export outputRootHelp.`)});`,
+    "}",
+    "await mod.outputRootHelp();",
+    "process.exit(0);",
+  ].join("\n");
+  const result = spawnSync(process.execPath, ["--input-type=module", "--eval", inlineModule], {
+    cwd: distDirOverride,
+    encoding: "utf8",
+    timeout: 30_000,
+  });
+  if (result.error) {
+    throw result.error;
+  }
+  if (result.status !== 0) {
+    const stderr = result.stderr?.trim();
+    throw new Error(
+      `Failed to render bundled root help from ${bundleName}` +
+        (stderr ? `: ${stderr}` : result.signal ? `: terminated by ${result.signal}` : ""),
+    );
+  }
+  return result.stdout ?? "";
 }
 
 export async function writeCliStartupMetadata(options?: {
@@ -95,7 +126,13 @@ export async function writeCliStartupMetadata(options?: {
   const resolvedExtensionsDir = options?.extensionsDir ?? extensionsDir;
   const catalog = readBundledChannelCatalogIds(resolvedExtensionsDir);
   const channelOptions = dedupe([...CORE_CHANNEL_ORDER, ...catalog]);
-  const rootHelpText = await renderBundledRootHelpText(resolvedDistDir);
+  const useSourceRootHelp =
+    resolvedDistDir === distDir &&
+    resolvedOutputPath === outputPath &&
+    resolvedExtensionsDir === extensionsDir;
+  const rootHelpText = useSourceRootHelp
+    ? await renderSourceRootHelpText({ pluginSdkResolution: "src" })
+    : await renderBundledRootHelpText(resolvedDistDir);
 
   mkdirSync(resolvedDistDir, { recursive: true });
   writeFileSync(
@@ -115,4 +152,5 @@ export async function writeCliStartupMetadata(options?: {
 
 if (process.argv[1] && path.resolve(process.argv[1]) === scriptPath) {
   await writeCliStartupMetadata();
+  process.exit(0);
 }
diff --git a/src/cli/program/root-help.ts b/src/cli/program/root-help.ts
index 9322d47c607..4328380f7ba 100644
--- a/src/cli/program/root-help.ts
+++ b/src/cli/program/root-help.ts
@@ -1,16 +1,14 @@
 import { Command } from "commander";
 import { getPluginCliCommandDescriptors } from "../../plugins/cli.js";
-import type { OpenClawPluginCliCommandDescriptor } from "../../plugins/types.js";
+import type { PluginLoadOptions } from "../../plugins/loader.js";
 import { VERSION } from "../../version.js";
 import { getCoreCliCommandDescriptors } from "./core-command-descriptors.js";
 import { configureProgramHelp } from "./help.js";
 import { getSubCliEntries } from "./subcli-descriptors.js";
 
-type RootHelpRenderOptions = {
-  pluginDescriptors?: OpenClawPluginCliCommandDescriptor[] | null;
-};
+type RootHelpLoaderOptions = Pick<PluginLoadOptions, "pluginSdkResolution">;
 
-async function buildRootHelpProgram(options?: RootHelpRenderOptions): Promise<Command> {
+async function buildRootHelpProgram(loaderOptions?: RootHelpLoaderOptions): Promise<Command> {
   const program = new Command();
   configureProgramHelp(program, {
     programVersion: VERSION,
@@ -31,11 +29,7 @@ async function buildRootHelpProgram(options?: RootHelpRenderOptions): Promise<Co
     program.command(command.name).description(command.description);
     existingCommands.add(command.name);
   }
-  const pluginDescriptors =
-    options && "pluginDescriptors" in options
-      ? (options.pluginDescriptors ?? [])
-      : await getPluginCliCommandDescriptors();
-  for (const command of pluginDescriptors) {
+  for (const command of await getPluginCliCommandDescriptors(undefined, undefined, loaderOptions)) {
     if (existingCommands.has(command.name)) {
       continue;
     }
@@ -46,8 +40,8 @@ async function buildRootHelpProgram(options?: RootHelpRenderOptions): Promise<Co
   return program;
 }
 
-export async function renderRootHelpText(options?: RootHelpRenderOptions): Promise<string> {
-  const program = await buildRootHelpProgram(options);
+export async function renderRootHelpText(loaderOptions?: RootHelpLoaderOptions): Promise<string> {
+  const program = await buildRootHelpProgram(loaderOptions);
   let output = "";
   const originalWrite = process.stdout.write.bind(process.stdout);
   const captureWrite: typeof process.stdout.write = ((chunk: string | Uint8Array) => {
@@ -63,6 +57,6 @@ export async function renderRootHelpText(options?: RootHelpRenderOptions): Promi
   return output;
 }
 
-export async function outputRootHelp(options?: RootHelpRenderOptions): Promise<void> {
-  process.stdout.write(await renderRootHelpText(options));
+export async function outputRootHelp(loaderOptions?: RootHelpLoaderOptions): Promise<void> {
+  process.stdout.write(await renderRootHelpText(loaderOptions));
 }
diff --git a/src/gateway/server-plugins.test.ts b/src/gateway/server-plugins.test.ts
index 5d62168845d..0687a11983d 100644
--- a/src/gateway/server-plugins.test.ts
+++ b/src/gateway/server-plugins.test.ts
@@ -69,6 +69,8 @@ const createRegistry = (diagnostics: PluginDiagnostic[]): PluginRegistry => ({
   commands: [],
   providers: [],
   speechProviders: [],
+  realtimeTranscriptionProviders: [],
+  realtimeVoiceProviders: [],
   mediaUnderstandingProviders: [],
   imageGenerationProviders: [],
   webFetchProviders: [],
diff --git a/src/gateway/test-helpers.mocks.ts b/src/gateway/test-helpers.mocks.ts
index 7bb3c184855..1b26cf62f23 100644
--- a/src/gateway/test-helpers.mocks.ts
+++ b/src/gateway/test-helpers.mocks.ts
@@ -201,6 +201,8 @@ const createStubPluginRegistry = (): PluginRegistry => ({
       }),
     },
   ],
+  realtimeTranscriptionProviders: [],
+  realtimeVoiceProviders: [],
   mediaUnderstandingProviders: [],
   imageGenerationProviders: [],
   webFetchProviders: [],
diff --git a/src/plugin-sdk/core.ts b/src/plugin-sdk/core.ts
index 937a43ecd7d..1f3e3172d0d 100644
--- a/src/plugin-sdk/core.ts
+++ b/src/plugin-sdk/core.ts
@@ -66,6 +66,7 @@ export type {
   ProviderReplaySessionState,
   ProviderResolveDynamicModelContext,
   ProviderResolvedUsageAuth,
+  RealtimeTranscriptionProviderPlugin,
   ProviderSanitizeReplayHistoryContext,
   ProviderToolSchemaDiagnostic,
   ProviderResolveUsageAuthContext,
diff --git a/src/plugin-sdk/index.ts b/src/plugin-sdk/index.ts
index 32e4e9a7ffe..ece2423d82d 100644
--- a/src/plugin-sdk/index.ts
+++ b/src/plugin-sdk/index.ts
@@ -51,6 +51,7 @@ export type {
   ProviderAuthContext,
   ProviderAuthResult,
   ProviderRuntimeModel,
+  RealtimeTranscriptionProviderPlugin,
   SpeechProviderPlugin,
 } from "../plugins/types.js";
 export type {
diff --git a/src/plugin-sdk/plugin-entry.ts b/src/plugin-sdk/plugin-entry.ts
index bd5d7ad0843..f24e2be6a27 100644
--- a/src/plugin-sdk/plugin-entry.ts
+++ b/src/plugin-sdk/plugin-entry.ts
@@ -46,6 +46,7 @@ import type {
   ProviderReplayPolicyContext,
   ProviderReplaySessionEntry,
   ProviderReplaySessionState,
+  RealtimeTranscriptionProviderPlugin,
   ProviderResolvedUsageAuth,
   ProviderResolveDynamicModelContext,
   ProviderSanitizeReplayHistoryContext,
@@ -102,6 +103,7 @@ export type {
   ProviderResolveDynamicModelContext,
   ProviderNormalizeResolvedModelContext,
   ProviderRuntimeModel,
+  RealtimeTranscriptionProviderPlugin,
   SpeechProviderPlugin,
   ProviderThinkingPolicyContext,
   ProviderValidateReplayTurnsContext,
diff --git a/src/plugin-sdk/realtime-transcription.ts b/src/plugin-sdk/realtime-transcription.ts
new file mode 100644
index 00000000000..e0f68005b07
--- /dev/null
+++ b/src/plugin-sdk/realtime-transcription.ts
@@ -0,0 +1,16 @@
+export type { RealtimeTranscriptionProviderPlugin } from "../plugins/types.js";
+export type {
+  RealtimeTranscriptionProviderConfig,
+  RealtimeTranscriptionProviderConfiguredContext,
+  RealtimeTranscriptionProviderId,
+  RealtimeTranscriptionProviderResolveConfigContext,
+  RealtimeTranscriptionSession,
+  RealtimeTranscriptionSessionCallbacks,
+  RealtimeTranscriptionSessionCreateRequest,
+} from "../realtime-transcription/provider-types.js";
+export {
+  canonicalizeRealtimeTranscriptionProviderId,
+  getRealtimeTranscriptionProvider,
+  listRealtimeTranscriptionProviders,
+  normalizeRealtimeTranscriptionProviderId,
+} from "../realtime-transcription/provider-registry.js";
diff --git a/src/plugin-sdk/realtime-voice.ts b/src/plugin-sdk/realtime-voice.ts
new file mode 100644
index 00000000000..41e2ed77400
--- /dev/null
+++ b/src/plugin-sdk/realtime-voice.ts
@@ -0,0 +1,20 @@
+export type { RealtimeVoiceProviderPlugin } from "../plugins/types.js";
+export type {
+  RealtimeVoiceBridge,
+  RealtimeVoiceBridgeCallbacks,
+  RealtimeVoiceBridgeCreateRequest,
+  RealtimeVoiceCloseReason,
+  RealtimeVoiceProviderConfig,
+  RealtimeVoiceProviderConfiguredContext,
+  RealtimeVoiceProviderId,
+  RealtimeVoiceProviderResolveConfigContext,
+  RealtimeVoiceRole,
+  RealtimeVoiceTool,
+  RealtimeVoiceToolCallEvent,
+} from "../realtime-voice/provider-types.js";
+export {
+  canonicalizeRealtimeVoiceProviderId,
+  getRealtimeVoiceProvider,
+  listRealtimeVoiceProviders,
+  normalizeRealtimeVoiceProviderId,
+} from "../realtime-voice/provider-registry.js";
diff --git a/src/plugin-sdk/speech.ts b/src/plugin-sdk/speech.ts
index 019f80e9ed6..55069df025f 100644
--- a/src/plugin-sdk/speech.ts
+++ b/src/plugin-sdk/speech.ts
@@ -1,7 +1,12 @@
+import { rmSync } from "node:fs";
+import type { OpenClawConfig } from "../config/config.js";
+import type { ResolvedTtsConfig } from "../tts/tts.js";
+
 // Public speech helpers for bundled or third-party plugins.
 //
-// Keep this surface neutral. Provider plugins should not need to know about the
-// bundled `speech-core` plugin id just to consume shared speech types/helpers.
+// Keep this surface neutral and import-light. Provider builders commonly import
+// this module just to get types and a few validation helpers, so avoid pulling
+// in the heavy TTS runtime graph at module load time.
 
 export type { SpeechProviderPlugin } from "../plugins/types.js";
 export type {
@@ -22,14 +27,6 @@ export type {
   TtsDirectiveParseResult,
 } from "../tts/provider-types.js";
 
-export {
-  scheduleCleanup,
-  summarizeText,
-  normalizeApplyTextNormalization,
-  normalizeLanguageCode,
-  normalizeSeed,
-  requireInRange,
-} from "../tts/tts-core.js";
 export { parseTtsDirectives } from "../tts/directives.js";
 export {
   canonicalizeSpeechProviderId,
@@ -44,3 +41,71 @@ export {
   trimToUndefined,
   truncateErrorDetail,
 } from "../tts/provider-error-utils.js";
+
+const TEMP_FILE_CLEANUP_DELAY_MS = 5 * 60 * 1000; // 5 minutes
+
+export function requireInRange(value: number, min: number, max: number, label: string): void {
+  if (!Number.isFinite(value) || value < min || value > max) {
+    throw new Error(`${label} must be between ${min} and ${max}`);
+  }
+}
+
+/** Lowercases a two-letter language code; blank input yields undefined, anything else throws. */
+export function normalizeLanguageCode(code?: string): string | undefined {
+  const candidate = code?.trim().toLowerCase();
+  if (!candidate) {
+    return undefined;
+  }
+  if (/^[a-z]{2}$/.test(candidate)) {
+    return candidate;
+  }
+  throw new Error("languageCode must be a 2-letter ISO 639-1 code (e.g. en, de, fr)");
+}
+
+/** Validates the text-normalization mode case-insensitively; blank input yields undefined. */
+export function normalizeApplyTextNormalization(mode?: string): "auto" | "on" | "off" | undefined {
+  const candidate = mode?.trim().toLowerCase();
+  if (!candidate) {
+    return undefined;
+  }
+  if (candidate === "auto" || candidate === "on" || candidate === "off") {
+    return candidate;
+  }
+  throw new Error("applyTextNormalization must be one of: auto, on, off");
+}
+
+/** Floors the seed and requires it to fit in 32 unsigned bits; null/undefined yield undefined. */
+export function normalizeSeed(seed?: number): number | undefined {
+  const next = seed == null ? undefined : Math.floor(seed);
+  // 4_294_967_295 is 2^32 - 1, the largest unsigned 32-bit value.
+  const valid = next === undefined || (Number.isFinite(next) && next >= 0 && next <= 4_294_967_295);
+  if (!valid) {
+    throw new Error("seed must be between 0 and 4294967295");
+  }
+  return next;
+}
+
+export function scheduleCleanup(
+  tempDir: string,
+  delayMs: number = TEMP_FILE_CLEANUP_DELAY_MS,
+): void {
+  const removeTempDir = (): void => {
+    try {
+      rmSync(tempDir, { recursive: true, force: true });
+    } catch {
+      // Best-effort removal: a failure here is deliberately not surfaced.
+    }
+  };
+  setTimeout(removeTempDir, delayMs).unref();
+}
+
+export async function summarizeText(params: {
+  text: string;
+  targetLength: number;
+  cfg: OpenClawConfig;
+  config: ResolvedTtsConfig;
+  timeoutMs: number;
+}) {
+  // Imported on demand so merely loading this module does not drag in the TTS runtime graph.
+  return (await import("../tts/tts-core.js")).summarizeText(params);
+}
diff --git a/src/plugins/api-builder.ts b/src/plugins/api-builder.ts
index ab8c66cec10..0c5906758c1 100644
--- a/src/plugins/api-builder.ts
+++ b/src/plugins/api-builder.ts
@@ -28,6 +28,8 @@ export type BuildPluginApiParams = {
       | "registerCliBackend"
       | "registerProvider"
       | "registerSpeechProvider"
+      | "registerRealtimeTranscriptionProvider"
+      | "registerRealtimeVoiceProvider"
       | "registerMediaUnderstandingProvider"
       | "registerImageGenerationProvider"
       | "registerWebFetchProvider"
@@ -55,6 +57,10 @@ const noopRegisterService: OpenClawPluginApi["registerService"] = () => {};
 const noopRegisterCliBackend: OpenClawPluginApi["registerCliBackend"] = () => {};
 const noopRegisterProvider: OpenClawPluginApi["registerProvider"] = () => {};
 const noopRegisterSpeechProvider: OpenClawPluginApi["registerSpeechProvider"] = () => {};
+const noopRegisterRealtimeTranscriptionProvider: OpenClawPluginApi["registerRealtimeTranscriptionProvider"] =
+  () => {};
+const noopRegisterRealtimeVoiceProvider: OpenClawPluginApi["registerRealtimeVoiceProvider"] =
+  () => {};
 const noopRegisterMediaUnderstandingProvider: OpenClawPluginApi["registerMediaUnderstandingProvider"] =
   () => {};
 const noopRegisterImageGenerationProvider: OpenClawPluginApi["registerImageGenerationProvider"] =
@@ -97,6 +103,10 @@ export function buildPluginApi(params: BuildPluginApiParams): OpenClawPluginApi
     registerCliBackend: handlers.registerCliBackend ?? noopRegisterCliBackend,
     registerProvider: handlers.registerProvider ?? noopRegisterProvider,
     registerSpeechProvider: handlers.registerSpeechProvider ?? noopRegisterSpeechProvider,
+    registerRealtimeTranscriptionProvider:
+      handlers.registerRealtimeTranscriptionProvider ?? noopRegisterRealtimeTranscriptionProvider,
+    registerRealtimeVoiceProvider:
+      handlers.registerRealtimeVoiceProvider ?? noopRegisterRealtimeVoiceProvider,
     registerMediaUnderstandingProvider:
       handlers.registerMediaUnderstandingProvider ?? noopRegisterMediaUnderstandingProvider,
     registerImageGenerationProvider:
diff --git a/src/plugins/bundled-capability-metadata.test.ts b/src/plugins/bundled-capability-metadata.test.ts
index 5bcfd4c5872..0a34fb355b5 100644
--- a/src/plugins/bundled-capability-metadata.test.ts
+++ b/src/plugins/bundled-capability-metadata.test.ts
@@ -28,6 +28,10 @@ describe("bundled capability metadata", () => {
         cliBackendIds: uniqueStrings(manifest.cliBackends),
         providerIds: uniqueStrings(manifest.providers),
         speechProviderIds: uniqueStrings(manifest.contracts?.speechProviders),
+        realtimeTranscriptionProviderIds: uniqueStrings(
+          manifest.contracts?.realtimeTranscriptionProviders,
+        ),
+        realtimeVoiceProviderIds: uniqueStrings(manifest.contracts?.realtimeVoiceProviders),
         mediaUnderstandingProviderIds: uniqueStrings(
           manifest.contracts?.mediaUnderstandingProviders,
         ),
@@ -41,6 +45,8 @@ describe("bundled capability metadata", () => {
           entry.cliBackendIds.length > 0 ||
           entry.providerIds.length > 0 ||
           entry.speechProviderIds.length > 0 ||
+          entry.realtimeTranscriptionProviderIds.length > 0 ||
+          entry.realtimeVoiceProviderIds.length > 0 ||
           entry.mediaUnderstandingProviderIds.length > 0 ||
           entry.imageGenerationProviderIds.length > 0 ||
           entry.webFetchProviderIds.length > 0 ||
diff --git a/src/plugins/bundled-capability-metadata.ts b/src/plugins/bundled-capability-metadata.ts
index ebde71de3a9..7a7186153b6 100644
--- a/src/plugins/bundled-capability-metadata.ts
+++ b/src/plugins/bundled-capability-metadata.ts
@@ -5,6 +5,8 @@ export type BundledPluginContractSnapshot = {
   cliBackendIds: string[];
   providerIds: string[];
   speechProviderIds: string[];
+  realtimeTranscriptionProviderIds: string[];
+  realtimeVoiceProviderIds: string[];
   mediaUnderstandingProviderIds: string[];
   imageGenerationProviderIds: string[];
   webFetchProviderIds: string[];
@@ -37,6 +39,10 @@ export const BUNDLED_PLUGIN_CONTRACT_SNAPSHOTS: readonly BundledPluginContractSn
     cliBackendIds: uniqueStrings(manifest.cliBackends),
     providerIds: uniqueStrings(manifest.providers),
     speechProviderIds: uniqueStrings(manifest.contracts?.speechProviders),
+    realtimeTranscriptionProviderIds: uniqueStrings(
+      manifest.contracts?.realtimeTranscriptionProviders,
+    ),
+    realtimeVoiceProviderIds: uniqueStrings(manifest.contracts?.realtimeVoiceProviders),
     mediaUnderstandingProviderIds: uniqueStrings(manifest.contracts?.mediaUnderstandingProviders),
     imageGenerationProviderIds: uniqueStrings(manifest.contracts?.imageGenerationProviders),
     webFetchProviderIds: uniqueStrings(manifest.contracts?.webFetchProviders),
@@ -48,6 +54,8 @@ export const BUNDLED_PLUGIN_CONTRACT_SNAPSHOTS: readonly BundledPluginContractSn
         entry.cliBackendIds.length > 0 ||
         entry.providerIds.length > 0 ||
         entry.speechProviderIds.length > 0 ||
+        entry.realtimeTranscriptionProviderIds.length > 0 ||
+        entry.realtimeVoiceProviderIds.length > 0 ||
         entry.mediaUnderstandingProviderIds.length > 0 ||
         entry.imageGenerationProviderIds.length > 0 ||
         entry.webFetchProviderIds.length > 0 ||
@@ -68,6 +76,14 @@ export const BUNDLED_PROVIDER_PLUGIN_IDS = collectPluginIds((entry) => entry.pro
 
 export const BUNDLED_SPEECH_PLUGIN_IDS = collectPluginIds((entry) => entry.speechProviderIds);
 
+export const BUNDLED_REALTIME_TRANSCRIPTION_PLUGIN_IDS = collectPluginIds(
+  (entry) => entry.realtimeTranscriptionProviderIds,
+);
+
+export const BUNDLED_REALTIME_VOICE_PLUGIN_IDS = collectPluginIds(
+  (entry) => entry.realtimeVoiceProviderIds,
+);
+
 export const BUNDLED_MEDIA_UNDERSTANDING_PLUGIN_IDS = collectPluginIds(
   (entry) => entry.mediaUnderstandingProviderIds,
 );
@@ -84,6 +100,8 @@ export const BUNDLED_RUNTIME_CONTRACT_PLUGIN_IDS = [
       (entry) =>
         entry.providerIds.length > 0 ||
         entry.speechProviderIds.length > 0 ||
+        entry.realtimeTranscriptionProviderIds.length > 0 ||
+        entry.realtimeVoiceProviderIds.length > 0 ||
         entry.mediaUnderstandingProviderIds.length > 0 ||
         entry.imageGenerationProviderIds.length > 0 ||
         entry.webFetchProviderIds.length > 0 ||
diff --git a/src/plugins/bundled-capability-runtime.ts b/src/plugins/bundled-capability-runtime.ts
index b7e67dd5aab..c44dd875e52 100644
--- a/src/plugins/bundled-capability-runtime.ts
+++ b/src/plugins/bundled-capability-runtime.ts
@@ -122,6 +122,8 @@ function createCapabilityPluginRecord(params: {
     cliBackendIds: [],
     providerIds: [],
     speechProviderIds: [],
+    realtimeTranscriptionProviderIds: [],
+    realtimeVoiceProviderIds: [],
     mediaUnderstandingProviderIds: [],
     imageGenerationProviderIds: [],
     webFetchProviderIds: [],
@@ -272,6 +274,12 @@ export function loadBundledCapabilityRuntimeRegistry(params: {
       record.cliBackendIds.push(...captured.cliBackends.map((entry) => entry.id));
       record.providerIds.push(...captured.providers.map((entry) => entry.id));
       record.speechProviderIds.push(...captured.speechProviders.map((entry) => entry.id));
+      record.realtimeTranscriptionProviderIds.push(
+        ...captured.realtimeTranscriptionProviders.map((entry) => entry.id),
+      );
+      record.realtimeVoiceProviderIds.push(
+        ...captured.realtimeVoiceProviders.map((entry) => entry.id),
+      );
       record.mediaUnderstandingProviderIds.push(
         ...captured.mediaUnderstandingProviders.map((entry) => entry.id),
       );
@@ -309,6 +317,24 @@ export function loadBundledCapabilityRuntimeRegistry(params: {
           rootDir: record.rootDir,
         })),
       );
+      registry.realtimeTranscriptionProviders.push(
+        ...captured.realtimeTranscriptionProviders.map((provider) => ({
+          pluginId: record.id,
+          pluginName: record.name,
+          provider,
+          source: record.source,
+          rootDir: record.rootDir,
+        })),
+      );
+      registry.realtimeVoiceProviders.push(
+        ...captured.realtimeVoiceProviders.map((provider) => ({
+          pluginId: record.id,
+          pluginName: record.name,
+          provider,
+          source: record.source,
+          rootDir: record.rootDir,
+        })),
+      );
       registry.mediaUnderstandingProviders.push(
         ...captured.mediaUnderstandingProviders.map((provider) => ({
           pluginId: record.id,
diff --git a/src/plugins/capability-provider-runtime.test.ts b/src/plugins/capability-provider-runtime.test.ts
index cb65392f08c..78b8a5e40c8 100644
--- a/src/plugins/capability-provider-runtime.test.ts
+++ b/src/plugins/capability-provider-runtime.test.ts
@@ -102,7 +102,12 @@ function setBundledCapabilityFixture(contractKey: string) {
 }
 
 function expectCompatChainApplied(params: {
-  key: "speechProviders" | "mediaUnderstandingProviders" | "imageGenerationProviders";
+  key:
+    | "speechProviders"
+    | "realtimeTranscriptionProviders"
+    | "realtimeVoiceProviders"
+    | "mediaUnderstandingProviders"
+    | "imageGenerationProviders";
   contractKey: string;
   cfg: OpenClawConfig;
   enablementCompat: {
@@ -201,6 +206,8 @@ describe("resolvePluginCapabilityProviders", () => {
 
   it.each([
     ["speechProviders", "speechProviders"],
+    ["realtimeTranscriptionProviders", "realtimeTranscriptionProviders"],
+    ["realtimeVoiceProviders", "realtimeVoiceProviders"],
     ["mediaUnderstandingProviders", "mediaUnderstandingProviders"],
     ["imageGenerationProviders", "imageGenerationProviders"],
   ] as const)("applies bundled compat before fallback loading for %s", (key, contractKey) => {
diff --git a/src/plugins/capability-provider-runtime.ts b/src/plugins/capability-provider-runtime.ts
index 195aa0ba5d3..4e41fce5f87 100644
--- a/src/plugins/capability-provider-runtime.ts
+++ b/src/plugins/capability-provider-runtime.ts
@@ -9,11 +9,15 @@ import type { PluginRegistry } from "./registry.js";
 
 type CapabilityProviderRegistryKey =
   | "speechProviders"
+  | "realtimeTranscriptionProviders"
+  | "realtimeVoiceProviders"
   | "mediaUnderstandingProviders"
   | "imageGenerationProviders";
 
 type CapabilityContractKey =
   | "speechProviders"
+  | "realtimeTranscriptionProviders"
+  | "realtimeVoiceProviders"
   | "mediaUnderstandingProviders"
   | "imageGenerationProviders";
 
@@ -22,6 +26,8 @@ type CapabilityProviderForKey<K extends CapabilityProviderRegistryKey> =
 
 const CAPABILITY_CONTRACT_KEY: Record<CapabilityProviderRegistryKey, CapabilityContractKey> = {
   speechProviders: "speechProviders",
+  realtimeTranscriptionProviders: "realtimeTranscriptionProviders",
+  realtimeVoiceProviders: "realtimeVoiceProviders",
   mediaUnderstandingProviders: "mediaUnderstandingProviders",
   imageGenerationProviders: "imageGenerationProviders",
 };
diff --git a/src/plugins/captured-registration.ts b/src/plugins/captured-registration.ts
index 8f20450e517..c816c2d1464 100644
--- a/src/plugins/captured-registration.ts
+++ b/src/plugins/captured-registration.ts
@@ -10,6 +10,8 @@ import type {
   OpenClawPluginCliCommandDescriptor,
   OpenClawPluginCliRegistrar,
   ProviderPlugin,
+  RealtimeTranscriptionProviderPlugin,
+  RealtimeVoiceProviderPlugin,
   SpeechProviderPlugin,
   WebFetchProviderPlugin,
   WebSearchProviderPlugin,
@@ -27,6 +29,8 @@ export type CapturedPluginRegistration = {
   cliRegistrars: CapturedPluginCliRegistration[];
   cliBackends: CliBackendPlugin[];
   speechProviders: SpeechProviderPlugin[];
+  realtimeTranscriptionProviders: RealtimeTranscriptionProviderPlugin[];
+  realtimeVoiceProviders: RealtimeVoiceProviderPlugin[];
   mediaUnderstandingProviders: MediaUnderstandingProviderPlugin[];
   imageGenerationProviders: ImageGenerationProviderPlugin[];
   webFetchProviders: WebFetchProviderPlugin[];
@@ -42,6 +46,8 @@ export function createCapturedPluginRegistration(params?: {
   const cliRegistrars: CapturedPluginCliRegistration[] = [];
   const cliBackends: CliBackendPlugin[] = [];
   const speechProviders: SpeechProviderPlugin[] = [];
+  const realtimeTranscriptionProviders: RealtimeTranscriptionProviderPlugin[] = [];
+  const realtimeVoiceProviders: RealtimeVoiceProviderPlugin[] = [];
   const mediaUnderstandingProviders: MediaUnderstandingProviderPlugin[] = [];
   const imageGenerationProviders: ImageGenerationProviderPlugin[] = [];
   const webFetchProviders: WebFetchProviderPlugin[] = [];
@@ -59,6 +65,8 @@ export function createCapturedPluginRegistration(params?: {
     cliRegistrars,
     cliBackends,
     speechProviders,
+    realtimeTranscriptionProviders,
+    realtimeVoiceProviders,
     mediaUnderstandingProviders,
     imageGenerationProviders,
     webFetchProviders,
@@ -106,6 +114,12 @@ export function createCapturedPluginRegistration(params?: {
         registerSpeechProvider(provider: SpeechProviderPlugin) {
           speechProviders.push(provider);
         },
+        registerRealtimeTranscriptionProvider(provider: RealtimeTranscriptionProviderPlugin) {
+          realtimeTranscriptionProviders.push(provider);
+        },
+        registerRealtimeVoiceProvider(provider: RealtimeVoiceProviderPlugin) {
+          realtimeVoiceProviders.push(provider);
+        },
         registerMediaUnderstandingProvider(provider: MediaUnderstandingProviderPlugin) {
           mediaUnderstandingProviders.push(provider);
         },
diff --git a/src/plugins/cli.ts b/src/plugins/cli.ts
index 3d8f639ac87..a495693b6e9 100644
--- a/src/plugins/cli.ts
+++ b/src/plugins/cli.ts
@@ -155,9 +155,10 @@ async function loadPluginCliCommandRegistry(
 export async function getPluginCliCommandDescriptors(
   cfg?: OpenClawConfig,
   env?: NodeJS.ProcessEnv,
+  loaderOptions?: Pick<PluginLoadOptions, "pluginSdkResolution">,
 ): Promise<OpenClawPluginCliCommandDescriptor[]> {
   try {
-    const { registry } = await loadPluginCliMetadataRegistry(cfg, env);
+    const { registry } = await loadPluginCliMetadataRegistry(cfg, env, loaderOptions);
     const seen = new Set<string>();
     const descriptors: OpenClawPluginCliCommandDescriptor[] = [];
     for (const entry of registry.cliRegistrars) {
diff --git a/src/plugins/contracts/registry.contract.test.ts b/src/plugins/contracts/registry.contract.test.ts
index a0891beb395..70bd8bad2a9 100644
--- a/src/plugins/contracts/registry.contract.test.ts
+++ b/src/plugins/contracts/registry.contract.test.ts
@@ -8,6 +8,8 @@ import {
   pluginRegistrationContractRegistry,
   providerContractLoadError,
   providerContractPluginIds,
+  realtimeTranscriptionProviderContractRegistry,
+  realtimeVoiceProviderContractRegistry,
   resolveWebFetchProviderContractEntriesForPluginId,
   resolveWebSearchProviderContractEntriesForPluginId,
   speechProviderContractRegistry,
@@ -27,7 +29,11 @@ describe("plugin contract registry", () => {
     predicate: (plugin: {
       origin: string;
       providers: unknown[];
-      contracts?: { speechProviders?: unknown[] };
+      contracts?: {
+        speechProviders?: unknown[];
+        realtimeTranscriptionProviders?: unknown[];
+        realtimeVoiceProviders?: unknown[];
+      };
     }) => boolean;
   }) {
     expect(uniqueSortedStrings(params.actualPluginIds)).toEqual(
@@ -39,7 +45,11 @@ describe("plugin contract registry", () => {
     predicate: (plugin: {
       origin: string;
       providers: unknown[];
-      contracts?: { speechProviders?: unknown[] };
+      contracts?: {
+        speechProviders?: unknown[];
+        realtimeTranscriptionProviders?: unknown[];
+        realtimeVoiceProviders?: unknown[];
+      };
     }) => boolean,
   ) {
     return loadPluginManifestRegistry({})
@@ -70,6 +80,14 @@ describe("plugin contract registry", () => {
       name: "does not duplicate bundled media provider ids",
       ids: () => mediaUnderstandingProviderContractRegistry.map((entry) => entry.provider.id),
     },
+    {
+      name: "does not duplicate bundled realtime transcription provider ids",
+      ids: () => realtimeTranscriptionProviderContractRegistry.map((entry) => entry.provider.id),
+    },
+    {
+      name: "does not duplicate bundled realtime voice provider ids",
+      ids: () => realtimeVoiceProviderContractRegistry.map((entry) => entry.provider.id),
+    },
     {
       name: "does not duplicate bundled image-generation provider ids",
       ids: () => imageGenerationProviderContractRegistry.map((entry) => entry.provider.id),
@@ -101,6 +119,23 @@ describe("plugin contract registry", () => {
     });
   });
 
+  it("covers every bundled realtime voice plugin discovered from manifests", () => {
+    expectRegistryPluginIds({
+      actualPluginIds: realtimeVoiceProviderContractRegistry.map((entry) => entry.pluginId),
+      predicate: (plugin) =>
+        plugin.origin === "bundled" && (plugin.contracts?.realtimeVoiceProviders?.length ?? 0) > 0,
+    });
+  });
+
+  it("covers every bundled realtime transcription plugin discovered from manifests", () => {
+    expectRegistryPluginIds({
+      actualPluginIds: realtimeTranscriptionProviderContractRegistry.map((entry) => entry.pluginId),
+      predicate: (plugin) =>
+        plugin.origin === "bundled" &&
+        (plugin.contracts?.realtimeTranscriptionProviders?.length ?? 0) > 0,
+    });
+  });
+
   it("covers every bundled web fetch plugin from the shared resolver", () => {
     const bundledWebFetchPluginIds = resolveBundledWebFetchPluginIds({});
 
diff --git a/src/plugins/contracts/registry.ts b/src/plugins/contracts/registry.ts
index 2728de07bc1..dfe446209b5 100644
--- a/src/plugins/contracts/registry.ts
+++ b/src/plugins/contracts/registry.ts
@@ -3,6 +3,8 @@ import {
   BUNDLED_MEDIA_UNDERSTANDING_PLUGIN_IDS,
   BUNDLED_PLUGIN_CONTRACT_SNAPSHOTS,
   BUNDLED_PROVIDER_PLUGIN_IDS,
+  BUNDLED_REALTIME_TRANSCRIPTION_PLUGIN_IDS,
+  BUNDLED_REALTIME_VOICE_PLUGIN_IDS,
   BUNDLED_SPEECH_PLUGIN_IDS,
   BUNDLED_WEB_FETCH_PLUGIN_IDS,
   BUNDLED_WEB_SEARCH_PLUGIN_IDS,
@@ -12,6 +14,8 @@ import type {
   ImageGenerationProviderPlugin,
   MediaUnderstandingProviderPlugin,
   ProviderPlugin,
+  RealtimeTranscriptionProviderPlugin,
+  RealtimeVoiceProviderPlugin,
   SpeechProviderPlugin,
   WebFetchProviderPlugin,
   WebSearchProviderPlugin,
@@ -19,6 +23,8 @@ import type {
 import {
   loadVitestImageGenerationProviderContractRegistry,
   loadVitestMediaUnderstandingProviderContractRegistry,
+  loadVitestRealtimeTranscriptionProviderContractRegistry,
+  loadVitestRealtimeVoiceProviderContractRegistry,
   loadVitestSpeechProviderContractRegistry,
 } from "./speech-vitest-registry.js";
 
@@ -38,6 +44,9 @@ type WebFetchProviderContractEntry = CapabilityContractEntry<WebFetchProviderPlu
 };
 
 type SpeechProviderContractEntry = CapabilityContractEntry<SpeechProviderPlugin>;
+type RealtimeTranscriptionProviderContractEntry =
+  CapabilityContractEntry<RealtimeTranscriptionProviderPlugin>;
+type RealtimeVoiceProviderContractEntry = CapabilityContractEntry<RealtimeVoiceProviderPlugin>;
 type MediaUnderstandingProviderContractEntry =
   CapabilityContractEntry<MediaUnderstandingProviderPlugin>;
 type ImageGenerationProviderContractEntry = CapabilityContractEntry<ImageGenerationProviderPlugin>;
@@ -47,6 +56,8 @@ type PluginRegistrationContractEntry = {
   cliBackendIds: string[];
   providerIds: string[];
   speechProviderIds: string[];
+  realtimeTranscriptionProviderIds: string[];
+  realtimeVoiceProviderIds: string[];
   mediaUnderstandingProviderIds: string[];
   imageGenerationProviderIds: string[];
   webFetchProviderIds: string[];
@@ -94,6 +105,10 @@ let webSearchProviderContractRegistryByPluginIdCache: Map<
   WebSearchProviderContractEntry[]
 > | null = null;
 let speechProviderContractRegistryCache: SpeechProviderContractEntry[] | null = null;
+let realtimeTranscriptionProviderContractRegistryCache:
+  | RealtimeTranscriptionProviderContractEntry[]
+  | null = null;
+let realtimeVoiceProviderContractRegistryCache: RealtimeVoiceProviderContractEntry[] | null = null;
 let mediaUnderstandingProviderContractRegistryCache:
   | MediaUnderstandingProviderContractEntry[]
   | null = null;
@@ -387,6 +402,36 @@ function loadSpeechProviderContractRegistry(): SpeechProviderContractEntry[] {
   return speechProviderContractRegistryCache;
 }
 
+function loadRealtimeVoiceProviderContractRegistry(): RealtimeVoiceProviderContractEntry[] {
+  if (!realtimeVoiceProviderContractRegistryCache) {
+    realtimeVoiceProviderContractRegistryCache = process.env.VITEST
+      ? loadVitestRealtimeVoiceProviderContractRegistry()
+      : loadBundledCapabilityRuntimeRegistry({
+          pluginIds: BUNDLED_REALTIME_VOICE_PLUGIN_IDS,
+          pluginSdkResolution: "dist",
+        }).realtimeVoiceProviders.map((entry) => ({
+          pluginId: entry.pluginId,
+          provider: entry.provider,
+        }));
+  }
+  return realtimeVoiceProviderContractRegistryCache;
+}
+
+function loadRealtimeTranscriptionProviderContractRegistry(): RealtimeTranscriptionProviderContractEntry[] {
+  if (!realtimeTranscriptionProviderContractRegistryCache) {
+    realtimeTranscriptionProviderContractRegistryCache = process.env.VITEST
+      ? loadVitestRealtimeTranscriptionProviderContractRegistry()
+      : loadBundledCapabilityRuntimeRegistry({
+          pluginIds: BUNDLED_REALTIME_TRANSCRIPTION_PLUGIN_IDS,
+          pluginSdkResolution: "dist",
+        }).realtimeTranscriptionProviders.map((entry) => ({
+          pluginId: entry.pluginId,
+          provider: entry.provider,
+        }));
+  }
+  return realtimeTranscriptionProviderContractRegistryCache;
+}
+
 function loadMediaUnderstandingProviderContractRegistry(): MediaUnderstandingProviderContractEntry[] {
   if (!mediaUnderstandingProviderContractRegistryCache) {
     mediaUnderstandingProviderContractRegistryCache = process.env.VITEST
@@ -519,6 +564,12 @@ export const speechProviderContractRegistry: SpeechProviderContractEntry[] = cre
   loadSpeechProviderContractRegistry,
 );
 
+export const realtimeTranscriptionProviderContractRegistry: RealtimeTranscriptionProviderContractEntry[] =
+  createLazyArrayView(loadRealtimeTranscriptionProviderContractRegistry);
+
+export const realtimeVoiceProviderContractRegistry: RealtimeVoiceProviderContractEntry[] =
+  createLazyArrayView(loadRealtimeVoiceProviderContractRegistry);
+
 export const mediaUnderstandingProviderContractRegistry: MediaUnderstandingProviderContractEntry[] =
   createLazyArrayView(loadMediaUnderstandingProviderContractRegistry);
 
@@ -531,6 +582,8 @@ function loadPluginRegistrationContractRegistry(): PluginRegistrationContractEnt
     cliBackendIds: uniqueStrings(entry.cliBackendIds),
     providerIds: uniqueStrings(entry.providerIds),
     speechProviderIds: uniqueStrings(entry.speechProviderIds),
+    realtimeTranscriptionProviderIds: uniqueStrings(entry.realtimeTranscriptionProviderIds),
+    realtimeVoiceProviderIds: uniqueStrings(entry.realtimeVoiceProviderIds),
     mediaUnderstandingProviderIds: uniqueStrings(entry.mediaUnderstandingProviderIds),
     imageGenerationProviderIds: uniqueStrings(entry.imageGenerationProviderIds),
     webFetchProviderIds: uniqueStrings(entry.webFetchProviderIds),
diff --git a/src/plugins/contracts/speech-vitest-registry.ts b/src/plugins/contracts/speech-vitest-registry.ts
index f5865612dd5..9fb083b7804 100644
--- a/src/plugins/contracts/speech-vitest-registry.ts
+++ b/src/plugins/contracts/speech-vitest-registry.ts
@@ -5,6 +5,8 @@ import { createJiti } from "jiti";
 import {
   BUNDLED_IMAGE_GENERATION_PLUGIN_IDS,
   BUNDLED_MEDIA_UNDERSTANDING_PLUGIN_IDS,
+  BUNDLED_REALTIME_TRANSCRIPTION_PLUGIN_IDS,
+  BUNDLED_REALTIME_VOICE_PLUGIN_IDS,
   BUNDLED_SPEECH_PLUGIN_IDS,
 } from "../bundled-capability-metadata.js";
 import { loadBundledCapabilityRuntimeRegistry } from "../bundled-capability-runtime.js";
@@ -13,6 +15,8 @@ import { buildPluginLoaderAliasMap, buildPluginLoaderJitiOptions } from "../sdk-
 import type {
   ImageGenerationProviderPlugin,
   MediaUnderstandingProviderPlugin,
+  RealtimeTranscriptionProviderPlugin,
+  RealtimeVoiceProviderPlugin,
   SpeechProviderPlugin,
 } from "../types.js";
 
@@ -26,6 +30,16 @@ export type MediaUnderstandingProviderContractEntry = {
   provider: MediaUnderstandingProviderPlugin;
 };
 
+export type RealtimeVoiceProviderContractEntry = {
+  pluginId: string;
+  provider: RealtimeVoiceProviderPlugin;
+};
+
+export type RealtimeTranscriptionProviderContractEntry = {
+  pluginId: string;
+  provider: RealtimeTranscriptionProviderPlugin;
+};
+
 export type ImageGenerationProviderContractEntry = {
   pluginId: string;
   provider: ImageGenerationProviderPlugin;
@@ -190,6 +204,96 @@ export function loadVitestMediaUnderstandingProviderContractRegistry(): MediaUnd
   return registrations;
 }
 
+export function loadVitestRealtimeVoiceProviderContractRegistry(): RealtimeVoiceProviderContractEntry[] {
+  const registrations: RealtimeVoiceProviderContractEntry[] = [];
+  const { manifests, unresolvedPluginIds } = resolveTestApiModuleRecords(
+    BUNDLED_REALTIME_VOICE_PLUGIN_IDS,
+  );
+
+  for (const plugin of manifests) {
+    if (!plugin.rootDir) {
+      continue;
+    }
+    const testApiPath = path.join(plugin.rootDir, "test-api.ts");
+    if (!fs.existsSync(testApiPath)) {
+      continue;
+    }
+    const builder = resolveNamedBuilder<RealtimeVoiceProviderPlugin>(
+      createVitestCapabilityLoader(testApiPath)(testApiPath),
+      /^build.+RealtimeVoiceProvider$/u,
+    );
+    if (!builder) {
+      continue;
+    }
+    registrations.push({
+      pluginId: plugin.id,
+      provider: builder(),
+    });
+    unresolvedPluginIds.delete(plugin.id);
+  }
+
+  if (unresolvedPluginIds.size === 0) {
+    return registrations;
+  }
+
+  const runtimeRegistry = loadBundledCapabilityRuntimeRegistry({
+    pluginIds: [...unresolvedPluginIds],
+    pluginSdkResolution: "dist",
+  });
+  registrations.push(
+    ...runtimeRegistry.realtimeVoiceProviders.map((entry) => ({
+      pluginId: entry.pluginId,
+      provider: entry.provider,
+    })),
+  );
+  return registrations;
+}
+
+export function loadVitestRealtimeTranscriptionProviderContractRegistry(): RealtimeTranscriptionProviderContractEntry[] {
+  const registrations: RealtimeTranscriptionProviderContractEntry[] = [];
+  const { manifests, unresolvedPluginIds } = resolveTestApiModuleRecords(
+    BUNDLED_REALTIME_TRANSCRIPTION_PLUGIN_IDS,
+  );
+
+  for (const plugin of manifests) {
+    if (!plugin.rootDir) {
+      continue;
+    }
+    const testApiPath = path.join(plugin.rootDir, "test-api.ts");
+    if (!fs.existsSync(testApiPath)) {
+      continue;
+    }
+    const builder = resolveNamedBuilder<RealtimeTranscriptionProviderPlugin>(
+      createVitestCapabilityLoader(testApiPath)(testApiPath),
+      /^build.+RealtimeTranscriptionProvider$/u,
+    );
+    if (!builder) {
+      continue;
+    }
+    registrations.push({
+      pluginId: plugin.id,
+      provider: builder(),
+    });
+    unresolvedPluginIds.delete(plugin.id);
+  }
+
+  if (unresolvedPluginIds.size === 0) {
+    return registrations;
+  }
+
+  const runtimeRegistry = loadBundledCapabilityRuntimeRegistry({
+    pluginIds: [...unresolvedPluginIds],
+    pluginSdkResolution: "dist",
+  });
+  registrations.push(
+    ...runtimeRegistry.realtimeTranscriptionProviders.map((entry) => ({
+      pluginId: entry.pluginId,
+      provider: entry.provider,
+    })),
+  );
+  return registrations;
+}
+
 export function loadVitestImageGenerationProviderContractRegistry(): ImageGenerationProviderContractEntry[] {
   const registrations: ImageGenerationProviderContractEntry[] = [];
   const { manifests, unresolvedPluginIds } = resolveTestApiModuleRecords(
diff --git a/src/plugins/loader.ts b/src/plugins/loader.ts
index e1625766b06..c89b0ea0558 100644
--- a/src/plugins/loader.ts
+++ b/src/plugins/loader.ts
@@ -590,6 +590,8 @@ function createPluginRecord(params: {
     cliBackendIds: [],
     providerIds: [],
     speechProviderIds: [],
+    realtimeTranscriptionProviderIds: [],
+    realtimeVoiceProviderIds: [],
     mediaUnderstandingProviderIds: [],
     imageGenerationProviderIds: [],
     webFetchProviderIds: [],
diff --git a/src/plugins/manifest.ts b/src/plugins/manifest.ts
index cc55beb5c3a..7dd7e5967f8 100644
--- a/src/plugins/manifest.ts
+++ b/src/plugins/manifest.ts
@@ -52,6 +52,8 @@ export type PluginManifest = {
 
 export type PluginManifestContracts = {
   speechProviders?: string[];
+  realtimeTranscriptionProviders?: string[];
+  realtimeVoiceProviders?: string[];
   mediaUnderstandingProviders?: string[];
   imageGenerationProviders?: string[];
   webFetchProviders?: string[];
@@ -125,6 +127,8 @@ function normalizeManifestContracts(value: unknown): PluginManifestContracts | u
   }
 
   const speechProviders = normalizeStringList(value.speechProviders);
+  const realtimeTranscriptionProviders = normalizeStringList(value.realtimeTranscriptionProviders);
+  const realtimeVoiceProviders = normalizeStringList(value.realtimeVoiceProviders);
   const mediaUnderstandingProviders = normalizeStringList(value.mediaUnderstandingProviders);
   const imageGenerationProviders = normalizeStringList(value.imageGenerationProviders);
   const webFetchProviders = normalizeStringList(value.webFetchProviders);
@@ -132,6 +136,8 @@ function normalizeManifestContracts(value: unknown): PluginManifestContracts | u
   const tools = normalizeStringList(value.tools);
   const contracts = {
     ...(speechProviders.length > 0 ? { speechProviders } : {}),
+    ...(realtimeTranscriptionProviders.length > 0 ? { realtimeTranscriptionProviders } : {}),
+    ...(realtimeVoiceProviders.length > 0 ? { realtimeVoiceProviders } : {}),
     ...(mediaUnderstandingProviders.length > 0 ? { mediaUnderstandingProviders } : {}),
     ...(imageGenerationProviders.length > 0 ? { imageGenerationProviders } : {}),
     ...(webFetchProviders.length > 0 ? { webFetchProviders } : {}),
diff --git a/src/plugins/registry-empty.ts b/src/plugins/registry-empty.ts
index 7e8698cfdd7..ee7183e6d66 100644
--- a/src/plugins/registry-empty.ts
+++ b/src/plugins/registry-empty.ts
@@ -11,6 +11,8 @@ export function createEmptyPluginRegistry(): PluginRegistry {
     providers: [],
     cliBackends: [],
     speechProviders: [],
+    realtimeTranscriptionProviders: [],
+    realtimeVoiceProviders: [],
     mediaUnderstandingProviders: [],
     imageGenerationProviders: [],
     webFetchProviders: [],
diff --git a/src/plugins/registry.ts b/src/plugins/registry.ts
index b221fde6aa3..a6d19f4f8c8 100644
--- a/src/plugins/registry.ts
+++ b/src/plugins/registry.ts
@@ -38,7 +38,7 @@ import {
 import type {
   CliBackendPlugin,
   ImageGenerationProviderPlugin,
-  WebFetchProviderPlugin,
+  RealtimeTranscriptionProviderPlugin,
   OpenClawPluginApi,
   OpenClawPluginChannelRegistration,
   OpenClawPluginCliCommandDescriptor,
@@ -52,6 +52,7 @@ import type {
   OpenClawPluginHookOptions,
   MediaUnderstandingProviderPlugin,
   ProviderPlugin,
+  RealtimeVoiceProviderPlugin,
   OpenClawPluginService,
   OpenClawPluginToolContext,
   OpenClawPluginToolFactory,
@@ -67,6 +68,7 @@ import type {
   PluginHookHandlerMap,
   PluginHookRegistration as TypedPluginHookRegistration,
   SpeechProviderPlugin,
+  WebFetchProviderPlugin,
   WebSearchProviderPlugin,
 } from "./types.js";
 
@@ -142,6 +144,10 @@ type PluginOwnedProviderRegistration<T extends { id: string }> = {
 
 export type PluginSpeechProviderRegistration =
   PluginOwnedProviderRegistration<SpeechProviderPlugin>;
+export type PluginRealtimeTranscriptionProviderRegistration =
+  PluginOwnedProviderRegistration<RealtimeTranscriptionProviderPlugin>;
+export type PluginRealtimeVoiceProviderRegistration =
+  PluginOwnedProviderRegistration<RealtimeVoiceProviderPlugin>;
 export type PluginMediaUnderstandingProviderRegistration =
   PluginOwnedProviderRegistration<MediaUnderstandingProviderPlugin>;
 export type PluginImageGenerationProviderRegistration =
@@ -213,6 +219,8 @@ export type PluginRecord = {
   cliBackendIds: string[];
   providerIds: string[];
   speechProviderIds: string[];
+  realtimeTranscriptionProviderIds: string[];
+  realtimeVoiceProviderIds: string[];
   mediaUnderstandingProviderIds: string[];
   imageGenerationProviderIds: string[];
   webFetchProviderIds: string[];
@@ -239,6 +247,8 @@ export type PluginRegistry = {
   providers: PluginProviderRegistration[];
   cliBackends?: PluginCliBackendRegistration[];
   speechProviders: PluginSpeechProviderRegistration[];
+  realtimeTranscriptionProviders: PluginRealtimeTranscriptionProviderRegistration[];
+  realtimeVoiceProviders: PluginRealtimeVoiceProviderRegistration[];
   mediaUnderstandingProviders: PluginMediaUnderstandingProviderRegistration[];
   imageGenerationProviders: PluginImageGenerationProviderRegistration[];
   webFetchProviders: PluginWebFetchProviderRegistration[];
@@ -699,6 +709,32 @@ export function createPluginRegistry(registryParams: PluginRegistryParams) {
     });
   };
 
+  const registerRealtimeTranscriptionProvider = (
+    record: PluginRecord,
+    provider: RealtimeTranscriptionProviderPlugin,
+  ) => {
+    registerUniqueProviderLike({
+      record,
+      provider,
+      kindLabel: "realtime transcription provider",
+      registrations: registry.realtimeTranscriptionProviders,
+      ownedIds: record.realtimeTranscriptionProviderIds,
+    });
+  };
+
+  const registerRealtimeVoiceProvider = (
+    record: PluginRecord,
+    provider: RealtimeVoiceProviderPlugin,
+  ) => {
+    registerUniqueProviderLike({
+      record,
+      provider,
+      kindLabel: "realtime voice provider",
+      registrations: registry.realtimeVoiceProviders,
+      ownedIds: record.realtimeVoiceProviderIds,
+    });
+  };
+
   const registerMediaUnderstandingProvider = (
     record: PluginRecord,
     provider: MediaUnderstandingProviderPlugin,
@@ -1009,6 +1045,10 @@ export function createPluginRegistry(registryParams: PluginRegistryParams) {
               registerHttpRoute: (routeParams) => registerHttpRoute(record, routeParams),
               registerProvider: (provider) => registerProvider(record, provider),
               registerSpeechProvider: (provider) => registerSpeechProvider(record, provider),
+              registerRealtimeTranscriptionProvider: (provider) =>
+                registerRealtimeTranscriptionProvider(record, provider),
+              registerRealtimeVoiceProvider: (provider) =>
+                registerRealtimeVoiceProvider(record, provider),
               registerMediaUnderstandingProvider: (provider) =>
                 registerMediaUnderstandingProvider(record, provider),
               registerImageGenerationProvider: (provider) =>
@@ -1198,6 +1238,8 @@ export function createPluginRegistry(registryParams: PluginRegistryParams) {
     registerProvider,
     registerCliBackend,
     registerSpeechProvider,
+    registerRealtimeTranscriptionProvider,
+    registerRealtimeVoiceProvider,
     registerMediaUnderstandingProvider,
     registerImageGenerationProvider,
     registerWebSearchProvider,
diff --git a/src/plugins/runtime.test.ts b/src/plugins/runtime.test.ts
index 47213675b25..d4393210c0c 100644
--- a/src/plugins/runtime.test.ts
+++ b/src/plugins/runtime.test.ts
@@ -199,6 +199,8 @@ describe("setActivePluginRegistry", () => {
       cliBackendIds: [],
       providerIds: [],
       speechProviderIds: [],
+      realtimeTranscriptionProviderIds: [],
+      realtimeVoiceProviderIds: [],
       mediaUnderstandingProviderIds: [],
       imageGenerationProviderIds: [],
       webFetchProviderIds: [],
@@ -225,6 +227,8 @@ describe("setActivePluginRegistry", () => {
       cliBackendIds: [],
       providerIds: [],
       speechProviderIds: [],
+      realtimeTranscriptionProviderIds: [],
+      realtimeVoiceProviderIds: [],
       mediaUnderstandingProviderIds: [],
       imageGenerationProviderIds: [],
       webFetchProviderIds: [],
diff --git a/src/plugins/status.test-helpers.ts b/src/plugins/status.test-helpers.ts
index 29650140178..e8b4119f08a 100644
--- a/src/plugins/status.test-helpers.ts
+++ b/src/plugins/status.test-helpers.ts
@@ -51,6 +51,8 @@ export function createPluginRecord(
     cliBackendIds: [],
     providerIds: [],
     speechProviderIds: [],
+    realtimeTranscriptionProviderIds: [],
+    realtimeVoiceProviderIds: [],
     mediaUnderstandingProviderIds: [],
     imageGenerationProviderIds: [],
     webFetchProviderIds: [],
@@ -107,7 +109,7 @@ export function createCustomHook(params: {
 export function createPluginLoadResult(
   overrides: Partial<PluginLoadResult> & Pick<PluginLoadResult, "plugins"> = { plugins: [] },
 ): PluginLoadResult {
-  const { plugins, ...rest } = overrides;
+  const { plugins, realtimeTranscriptionProviders, realtimeVoiceProviders, ...rest } = overrides;
   return {
     plugins,
     diagnostics: [],
@@ -129,6 +131,8 @@ export function createPluginLoadResult(
     commands: [],
     conversationBindingResolvedHandlers: [],
     ...rest,
+    realtimeTranscriptionProviders: realtimeTranscriptionProviders ?? [],
+    realtimeVoiceProviders: realtimeVoiceProviders ?? [],
   };
 }
 
diff --git a/src/plugins/status.ts b/src/plugins/status.ts
index eeae8a6cdaa..32cc68a47a7 100644
--- a/src/plugins/status.ts
+++ b/src/plugins/status.ts
@@ -28,6 +28,8 @@ export type PluginCapabilityKind =
   | "cli-backend"
   | "text-inference"
   | "speech"
+  | "realtime-transcription"
+  | "realtime-voice"
   | "media-understanding"
   | "image-generation"
   | "web-search"
@@ -233,6 +235,8 @@ function buildCapabilityEntries(plugin: PluginRegistry["plugins"][number]) {
     { kind: "cli-backend" as const, ids: plugin.cliBackendIds ?? [] },
     { kind: "text-inference" as const, ids: plugin.providerIds },
     { kind: "speech" as const, ids: plugin.speechProviderIds },
+    { kind: "realtime-transcription" as const, ids: plugin.realtimeTranscriptionProviderIds },
+    { kind: "realtime-voice" as const, ids: plugin.realtimeVoiceProviderIds },
     { kind: "media-understanding" as const, ids: plugin.mediaUnderstandingProviderIds },
     { kind: "image-generation" as const, ids: plugin.imageGenerationProviderIds },
     { kind: "web-search" as const, ids: plugin.webSearchProviderIds },
diff --git a/src/plugins/types.ts b/src/plugins/types.ts
index 2d88b77a984..a8d7561f3b2 100644
--- a/src/plugins/types.ts
+++ b/src/plugins/types.ts
@@ -30,6 +30,22 @@ import type { HookEntry } from "../hooks/types.js";
 import type { ImageGenerationProvider } from "../image-generation/types.js";
 import type { ProviderUsageSnapshot } from "../infra/provider-usage.types.js";
 import type { MediaUnderstandingProvider } from "../media-understanding/types.js";
+import type {
+  RealtimeTranscriptionProviderConfig,
+  RealtimeTranscriptionProviderConfiguredContext,
+  RealtimeTranscriptionProviderId,
+  RealtimeTranscriptionProviderResolveConfigContext,
+  RealtimeTranscriptionSession,
+  RealtimeTranscriptionSessionCreateRequest,
+} from "../realtime-transcription/provider-types.js";
+import type {
+  RealtimeVoiceBridge,
+  RealtimeVoiceBridgeCreateRequest,
+  RealtimeVoiceProviderConfig,
+  RealtimeVoiceProviderConfiguredContext,
+  RealtimeVoiceProviderId,
+  RealtimeVoiceProviderResolveConfigContext,
+} from "../realtime-voice/provider-types.js";
 import type { RuntimeEnv } from "../runtime.js";
 import type {
   RuntimeWebFetchMetadata,
@@ -1526,6 +1542,38 @@ export type PluginSpeechProviderEntry = SpeechProviderPlugin & {
   pluginId: string;
 };
 
+/** Realtime transcription capability registered by a plugin. */
+export type RealtimeTranscriptionProviderPlugin = {
+  id: RealtimeTranscriptionProviderId; // registry key; matched trimmed + lower-cased
+  label: string;
+  aliases?: string[]; // extra ids the provider registry resolves to this provider
+  autoSelectOrder?: number; // NOTE(review): ordering direction is not visible here — confirm
+  resolveConfig?: (
+    ctx: RealtimeTranscriptionProviderResolveConfigContext,
+  ) => RealtimeTranscriptionProviderConfig;
+  isConfigured: (ctx: RealtimeTranscriptionProviderConfiguredContext) => boolean;
+  createSession: (req: RealtimeTranscriptionSessionCreateRequest) => RealtimeTranscriptionSession;
+};
+
+export type PluginRealtimeTranscriptionProviderEntry = RealtimeTranscriptionProviderPlugin & {
+  pluginId: string; // NOTE(review): presumably the id of the registering plugin — confirm
+};
+
+/** Realtime voice capability registered by a plugin. */
+export type RealtimeVoiceProviderPlugin = {
+  id: RealtimeVoiceProviderId; // registry key; matched trimmed + lower-cased
+  label: string;
+  aliases?: string[]; // extra ids the provider registry resolves to this provider
+  autoSelectOrder?: number; // NOTE(review): ordering direction is not visible here — confirm
+  resolveConfig?: (ctx: RealtimeVoiceProviderResolveConfigContext) => RealtimeVoiceProviderConfig;
+  isConfigured: (ctx: RealtimeVoiceProviderConfiguredContext) => boolean;
+  createBridge: (req: RealtimeVoiceBridgeCreateRequest) => RealtimeVoiceBridge;
+};
+
+export type PluginRealtimeVoiceProviderEntry = RealtimeVoiceProviderPlugin & {
+  pluginId: string; // NOTE(review): presumably the id of the registering plugin — confirm
+};
+
 export type MediaUnderstandingProviderPlugin = MediaUnderstandingProvider;
 export type ImageGenerationProviderPlugin = ImageGenerationProvider;
 
@@ -1850,6 +1898,10 @@ export type OpenClawPluginApi = {
   registerProvider: (provider: ProviderPlugin) => void;
   /** Register a speech synthesis provider (speech capability). */
   registerSpeechProvider: (provider: SpeechProviderPlugin) => void;
+  /** Register a realtime transcription provider (streaming STT capability). */
+  registerRealtimeTranscriptionProvider: (provider: RealtimeTranscriptionProviderPlugin) => void;
+  /** Register a realtime voice provider (duplex voice capability). */
+  registerRealtimeVoiceProvider: (provider: RealtimeVoiceProviderPlugin) => void;
   /** Register a media understanding provider (media understanding capability). */
   registerMediaUnderstandingProvider: (provider: MediaUnderstandingProviderPlugin) => void;
   /** Register an image generation provider (image generation capability). */
diff --git a/src/realtime-transcription/provider-registry.ts b/src/realtime-transcription/provider-registry.ts
new file mode 100644
index 00000000000..28d2e3125ed
--- /dev/null
+++ b/src/realtime-transcription/provider-registry.ts
@@ -0,0 +1,80 @@
+import type { OpenClawConfig } from "../config/config.js";
+import { resolvePluginCapabilityProviders } from "../plugins/capability-provider-runtime.js";
+import type { RealtimeTranscriptionProviderPlugin } from "../plugins/types.js";
+import type { RealtimeTranscriptionProviderId } from "./provider-types.js";
+
+/** Trim and lower-case `value`; a missing or blank string becomes `undefined`. */
+function trimToUndefined(value: string | undefined): string | undefined {
+  return value?.trim().toLowerCase() || undefined;
+}
+
+export function normalizeRealtimeTranscriptionProviderId(
+  providerId: string | undefined,
+): RealtimeTranscriptionProviderId | undefined {
+  return trimToUndefined(providerId); // trimmed + lower-cased; blank or missing -> undefined
+}
+
+/**
+ * Resolve the capability providers stored under the "realtimeTranscriptionProviders" key.
+ */
+function resolveRealtimeTranscriptionProviderEntries(
+  cfg?: OpenClawConfig,
+): RealtimeTranscriptionProviderPlugin[] {
+  return resolvePluginCapabilityProviders({ key: "realtimeTranscriptionProviders", cfg });
+}
+
+/**
+ * Index providers by normalized id (`canonical`) and by id or alias (`aliases`).
+ * A provider's own id outranks any other provider's alias; duplicate keys: last one wins.
+ */
+function buildProviderMaps(cfg?: OpenClawConfig): {
+  canonical: Map<string, RealtimeTranscriptionProviderPlugin>;
+  aliases: Map<string, RealtimeTranscriptionProviderPlugin>;
+} {
+  const canonical = new Map<string, RealtimeTranscriptionProviderPlugin>();
+  const aliases = new Map<string, RealtimeTranscriptionProviderPlugin>();
+  for (const provider of resolveRealtimeTranscriptionProviderEntries(cfg)) {
+    const id = normalizeRealtimeTranscriptionProviderId(provider.id);
+    if (!id) {
+      continue; // no usable id: the provider is dropped, aliases included
+    }
+    canonical.set(id, provider);
+    aliases.set(id, provider); // also reclaims the id from an earlier provider's alias
+    for (const alias of provider.aliases ?? []) {
+      const normalizedAlias = normalizeRealtimeTranscriptionProviderId(alias);
+      // Never let an alias shadow a registered id, or lookups would disagree with the listing.
+      if (normalizedAlias && !canonical.has(normalizedAlias)) {
+        aliases.set(normalizedAlias, provider);
+      }
+    }
+  }
+  return { canonical, aliases };
+}
+
+export function listRealtimeTranscriptionProviders(
+  cfg?: OpenClawConfig,
+): RealtimeTranscriptionProviderPlugin[] {
+  return Array.from(buildProviderMaps(cfg).canonical.values()); // one per normalized id
+}
+
+/**
+ * Look up a provider by id or alias (trimmed, case-insensitive); `undefined` when unmatched.
+ */
+export function getRealtimeTranscriptionProvider(
+  providerId: string | undefined,
+  cfg?: OpenClawConfig,
+): RealtimeTranscriptionProviderPlugin | undefined {
+  const lookupKey = normalizeRealtimeTranscriptionProviderId(providerId);
+  return lookupKey ? buildProviderMaps(cfg).aliases.get(lookupKey) : undefined;
+}
+
+/** Map an id or alias to the owning provider's declared `id`; blank input gives `undefined`. */
+export function canonicalizeRealtimeTranscriptionProviderId(
+  providerId: string | undefined,
+  cfg?: OpenClawConfig,
+): RealtimeTranscriptionProviderId | undefined {
+  const normalized = normalizeRealtimeTranscriptionProviderId(providerId);
+  const owner = normalized ? getRealtimeTranscriptionProvider(normalized, cfg) : undefined;
+  // Ids that no provider claims are still returned, in their normalized form.
+  return owner?.id ?? normalized;
+}
diff --git a/src/realtime-transcription/provider-types.ts b/src/realtime-transcription/provider-types.ts
new file mode 100644
index 00000000000..06d7678eec5
--- /dev/null
+++ b/src/realtime-transcription/provider-types.ts
@@ -0,0 +1,33 @@
+import type { OpenClawConfig } from "../config/config.js";
+
+export type RealtimeTranscriptionProviderId = string; // plain string alias, not a branded type
+
+export type RealtimeTranscriptionProviderConfig = Record<string, unknown>; // untyped key/value bag
+
+export type RealtimeTranscriptionProviderResolveConfigContext = {
+  cfg: OpenClawConfig; // required here, optional in the configured-context type below
+  rawConfig: RealtimeTranscriptionProviderConfig; // NOTE(review): presumably unresolved user config — confirm
+};
+
+export type RealtimeTranscriptionProviderConfiguredContext = {
+  cfg?: OpenClawConfig;
+  providerConfig: RealtimeTranscriptionProviderConfig; // NOTE(review): presumably resolveConfig output — confirm
+};
+
+export type RealtimeTranscriptionSessionCallbacks = {
+  onPartial?: (partial: string) => void; // NOTE(review): presumably interim text — confirm
+  onTranscript?: (transcript: string) => void; // NOTE(review): presumably finalized text — confirm
+  onSpeechStart?: () => void;
+  onError?: (error: Error) => void;
+};
+
+export type RealtimeTranscriptionSessionCreateRequest = RealtimeTranscriptionSessionCallbacks & {
+  providerConfig: RealtimeTranscriptionProviderConfig; // callbacks sit flat beside this (intersection)
+};
+
+export type RealtimeTranscriptionSession = {
+  connect(): Promise<void>; // the only member that returns a promise
+  sendAudio(audio: Buffer): void; // NOTE(review): audio encoding is not stated by this type — confirm
+  close(): void;
+  isConnected(): boolean;
+};
diff --git a/src/realtime-voice/provider-registry.ts b/src/realtime-voice/provider-registry.ts
new file mode 100644
index 00000000000..b2de16385c1
--- /dev/null
+++ b/src/realtime-voice/provider-registry.ts
@@ -0,0 +1,76 @@
+import type { OpenClawConfig } from "../config/config.js";
+import { resolvePluginCapabilityProviders } from "../plugins/capability-provider-runtime.js";
+import type { RealtimeVoiceProviderPlugin } from "../plugins/types.js";
+import type { RealtimeVoiceProviderId } from "./provider-types.js";
+
+/** Trim and lower-case `value`; a missing or blank string becomes `undefined`. */
+function trimToUndefined(value: string | undefined): string | undefined {
+  return value?.trim().toLowerCase() || undefined;
+}
+
+export function normalizeRealtimeVoiceProviderId(
+  providerId: string | undefined,
+): RealtimeVoiceProviderId | undefined {
+  return trimToUndefined(providerId); // trimmed + lower-cased; blank or missing -> undefined
+}
+
+/**
+ * Resolve the capability providers stored under the "realtimeVoiceProviders" key for `cfg`.
+ */
+function resolveRealtimeVoiceProviderEntries(cfg?: OpenClawConfig): RealtimeVoiceProviderPlugin[] {
+  return resolvePluginCapabilityProviders({ key: "realtimeVoiceProviders", cfg });
+}
+
+/**
+ * Index providers by normalized id (`canonical`) and by id or alias (`aliases`).
+ * A provider's own id outranks any other provider's alias; duplicate keys: last one wins.
+ */
+function buildProviderMaps(cfg?: OpenClawConfig): {
+  canonical: Map<string, RealtimeVoiceProviderPlugin>;
+  aliases: Map<string, RealtimeVoiceProviderPlugin>;
+} {
+  const canonical = new Map<string, RealtimeVoiceProviderPlugin>();
+  const aliases = new Map<string, RealtimeVoiceProviderPlugin>();
+  for (const provider of resolveRealtimeVoiceProviderEntries(cfg)) {
+    const id = normalizeRealtimeVoiceProviderId(provider.id);
+    if (!id) {
+      continue; // no usable id: the provider is dropped, aliases included
+    }
+    canonical.set(id, provider);
+    aliases.set(id, provider); // also reclaims the id from an earlier provider's alias
+    for (const alias of provider.aliases ?? []) {
+      const normalizedAlias = normalizeRealtimeVoiceProviderId(alias);
+      // Never let an alias shadow a registered id, or lookups would disagree with the listing.
+      if (normalizedAlias && !canonical.has(normalizedAlias)) {
+        aliases.set(normalizedAlias, provider);
+      }
+    }
+  }
+  return { canonical, aliases };
+}
+
+export function listRealtimeVoiceProviders(cfg?: OpenClawConfig): RealtimeVoiceProviderPlugin[] {
+  return Array.from(buildProviderMaps(cfg).canonical.values()); // one per normalized id
+}
+
+/**
+ * Look up a provider by id or alias (trimmed, case-insensitive); `undefined` when unmatched.
+ */
+export function getRealtimeVoiceProvider(
+  providerId: string | undefined,
+  cfg?: OpenClawConfig,
+): RealtimeVoiceProviderPlugin | undefined {
+  const lookupKey = normalizeRealtimeVoiceProviderId(providerId);
+  return lookupKey ? buildProviderMaps(cfg).aliases.get(lookupKey) : undefined;
+}
+
+/** Map an id or alias to the owning provider's declared `id`; blank input gives `undefined`. */
+export function canonicalizeRealtimeVoiceProviderId(
+  providerId: string | undefined,
+  cfg?: OpenClawConfig,
+): RealtimeVoiceProviderId | undefined {
+  const normalized = normalizeRealtimeVoiceProviderId(providerId);
+  const owner = normalized ? getRealtimeVoiceProvider(normalized, cfg) : undefined;
+  // Ids that no provider claims are still returned, in their normalized form.
+  return owner?.id ?? normalized;
+}
diff --git a/src/realtime-voice/provider-types.ts b/src/realtime-voice/provider-types.ts
new file mode 100644
index 00000000000..a494bd32cf5
--- /dev/null
+++ b/src/realtime-voice/provider-types.ts
@@ -0,0 +1,66 @@
+import type { OpenClawConfig } from "../config/config.js";
+
+export type RealtimeVoiceProviderId = string; // plain string alias, not a branded type
+
+export type RealtimeVoiceRole = "user" | "assistant"; // speaker tag passed to `onTranscript`
+
+export type RealtimeVoiceCloseReason = "completed" | "error"; // value handed to `onClose`
+
+export type RealtimeVoiceTool = {
+  type: "function"; // the only tool kind this type admits
+  name: string;
+  description: string;
+  parameters: {
+    type: "object"; // NOTE(review): looks like a JSON-Schema object node — confirm
+    properties: Record<string, unknown>;
+    required?: string[];
+  };
+};
+
+export type RealtimeVoiceToolCallEvent = {
+  itemId: string;
+  callId: string; // NOTE(review): presumably the id to echo to `submitToolResult` — confirm
+  name: string; // NOTE(review): presumably matches a `RealtimeVoiceTool.name` — confirm
+  args: unknown; // typed `unknown`: must be narrowed before use
+};
+
+export type RealtimeVoiceBridgeCallbacks = {
+  onAudio: (muLaw: Buffer) => void; // required; NOTE(review): name suggests mu-law audio — confirm
+  onClearAudio: () => void; // required; NOTE(review): presumably drops queued playback — confirm
+  onMark?: (markName: string) => void;
+  onTranscript?: (role: RealtimeVoiceRole, text: string, isFinal: boolean) => void;
+  onToolCall?: (event: RealtimeVoiceToolCallEvent) => void;
+  onReady?: () => void;
+  onError?: (error: Error) => void;
+  onClose?: (reason: RealtimeVoiceCloseReason) => void;
+};
+
+export type RealtimeVoiceProviderConfig = Record<string, unknown>; // untyped key/value bag
+
+export type RealtimeVoiceProviderResolveConfigContext = {
+  cfg: OpenClawConfig; // required here, optional in the configured-context type below
+  rawConfig: RealtimeVoiceProviderConfig; // NOTE(review): presumably unresolved user config — confirm
+};
+
+export type RealtimeVoiceProviderConfiguredContext = {
+  cfg?: OpenClawConfig;
+  providerConfig: RealtimeVoiceProviderConfig; // NOTE(review): presumably resolveConfig output — confirm
+};
+
+export type RealtimeVoiceBridgeCreateRequest = RealtimeVoiceBridgeCallbacks & {
+  providerConfig: RealtimeVoiceProviderConfig; // callbacks sit flat beside these fields (intersection)
+  instructions?: string;
+  tools?: RealtimeVoiceTool[];
+};
+
+export type RealtimeVoiceBridge = {
+  connect(): Promise<void>; // the only member that returns a promise
+  sendAudio(audio: Buffer): void;
+  setMediaTimestamp(ts: number): void; // NOTE(review): unit of `ts` is not stated here — confirm
+  sendUserMessage?(text: string): void; // optional member: feature-check before calling
+  triggerGreeting?(instructions?: string): void; // optional member: feature-check before calling
+  submitToolResult(callId: string, result: unknown): void;
+  acknowledgeMark(): void;
+  close(): void;
+  isConnected(): boolean;
+};
diff --git a/src/test-utils/channel-plugins.ts b/src/test-utils/channel-plugins.ts
index cfa1d40c672..0021ae122b8 100644
--- a/src/test-utils/channel-plugins.ts
+++ b/src/test-utils/channel-plugins.ts
@@ -27,6 +27,8 @@ export const createTestRegistry = (channels: TestChannelRegistration[] = []): Pl
   })),
   providers: [],
   speechProviders: [],
+  realtimeTranscriptionProviders: [],
+  realtimeVoiceProviders: [],
   mediaUnderstandingProviders: [],
   imageGenerationProviders: [],
   webFetchProviders: [],
diff --git a/test/helpers/plugins/plugin-api.ts b/test/helpers/plugins/plugin-api.ts
index 825d227ba17..ceaa92f68d8 100644
--- a/test/helpers/plugins/plugin-api.ts
+++ b/test/helpers/plugins/plugin-api.ts
@@ -20,6 +20,8 @@ export function createTestPluginApi(api: TestPluginApiInput): OpenClawPluginApi
     registerCliBackend() {},
     registerProvider() {},
     registerSpeechProvider() {},
+    registerRealtimeTranscriptionProvider() {},
+    registerRealtimeVoiceProvider() {},
     registerMediaUnderstandingProvider() {},
     registerImageGenerationProvider() {},
     registerWebFetchProvider() {},
diff --git a/test/helpers/plugins/plugin-registration-contract-cases.ts b/test/helpers/plugins/plugin-registration-contract-cases.ts
index 07005c44cf7..12e3e911d69 100644
--- a/test/helpers/plugins/plugin-registration-contract-cases.ts
+++ b/test/helpers/plugins/plugin-registration-contract-cases.ts
@@ -92,6 +92,8 @@ export const pluginRegistrationContractCases = {
     pluginId: "openai",
     providerIds: ["openai", "openai-codex"],
     speechProviderIds: ["openai"],
+    realtimeTranscriptionProviderIds: ["openai"],
+    realtimeVoiceProviderIds: ["openai"],
     mediaUnderstandingProviderIds: ["openai", "openai-codex"],
     imageGenerationProviderIds: ["openai"],
     cliBackendIds: ["codex-cli"],
diff --git a/test/helpers/plugins/plugin-registration-contract.ts b/test/helpers/plugins/plugin-registration-contract.ts
index d25f067ec6f..1b39bb1903d 100644
--- a/test/helpers/plugins/plugin-registration-contract.ts
+++ b/test/helpers/plugins/plugin-registration-contract.ts
@@ -13,6 +13,8 @@ type PluginRegistrationContractParams = {
   webFetchProviderIds?: string[];
   webSearchProviderIds?: string[];
   speechProviderIds?: string[];
+  realtimeTranscriptionProviderIds?: string[];
+  realtimeVoiceProviderIds?: string[];
   mediaUnderstandingProviderIds?: string[];
   imageGenerationProviderIds?: string[];
   cliBackendIds?: string[];
@@ -122,6 +124,22 @@ export function describePluginRegistrationContract(params: PluginRegistrationCon
       });
     }
 
+    if (params.realtimeTranscriptionProviderIds) {
+      it("keeps bundled realtime-transcription ownership explicit", () => {
+        expect(findRegistration(params.pluginId).realtimeTranscriptionProviderIds).toEqual(
+          params.realtimeTranscriptionProviderIds,
+        );
+      });
+    }
+
+    if (params.realtimeVoiceProviderIds) {
+      it("keeps bundled realtime-voice ownership explicit", () => {
+        expect(findRegistration(params.pluginId).realtimeVoiceProviderIds).toEqual(
+          params.realtimeVoiceProviderIds,
+        );
+      });
+    }
+
     if (params.mediaUnderstandingProviderIds) {
       it("keeps bundled media-understanding ownership explicit", () => {
         expect(findRegistration(params.pluginId).mediaUnderstandingProviderIds).toEqual(
diff --git a/test/setup-openclaw-runtime.ts b/test/setup-openclaw-runtime.ts
index e9f5137fc15..0aca0aceeab 100644
--- a/test/setup-openclaw-runtime.ts
+++ b/test/setup-openclaw-runtime.ts
@@ -110,6 +110,8 @@ function createTestRegistryForSetup(
     })),
     providers: [],
     speechProviders: [],
+    realtimeTranscriptionProviders: [],
+    realtimeVoiceProviders: [],
     mediaUnderstandingProviders: [],
     imageGenerationProviders: [],
     webFetchProviders: [],
diff --git a/vitest.contracts.config.ts b/vitest.contracts.config.ts
index 48507e3eecd..190b0527f4c 100644
--- a/vitest.contracts.config.ts
+++ b/vitest.contracts.config.ts
@@ -1,23 +1,23 @@
-import { createScopedVitestConfig } from "./vitest.scoped-config.ts";
-import { boundaryTestFiles } from "./vitest.unit-paths.mjs";
+import { defineConfig } from "vitest/config";
+import { sharedVitestConfig } from "./vitest.shared.config.ts";
 
-export function createContractsVitestConfig(env?: Record<string, string | undefined>) {
-  return createScopedVitestConfig(
-    [
-      "src/channels/plugins/contracts/**/*.test.ts",
-      "src/config/doc-baseline.integration.test.ts",
-      "src/config/schema.base.generated.test.ts",
-      "src/config/schema.help.quality.test.ts",
-      "src/plugins/contracts/**/*.test.ts",
-      "test/**/*.test.ts",
-    ],
-    {
-      env,
-      exclude: boundaryTestFiles,
-      name: "contracts",
+const base = sharedVitestConfig as Record<string, unknown>;
+const baseTest = sharedVitestConfig.test ?? {};
+
+export function createContractsVitestConfig() {
+  return defineConfig({
+    ...base,
+    test: {
+      ...baseTest,
+      isolate: true,
+      setupFiles: baseTest.setupFiles ?? [],
+      include: [
+        "src/channels/plugins/contracts/**/*.test.ts",
+        "src/plugins/contracts/**/*.test.ts",
+      ],
       passWithNoTests: true,
     },
-  );
+  });
 }
 
 export default createContractsVitestConfig();