TTS: extract runtime execution

This commit is contained in:
Gustavo Madeira Santana 2026-03-15 19:27:20 +00:00
parent 06e4354f7f
commit f36f8f9e2d
2 changed files with 340 additions and 297 deletions

View File

@ -0,0 +1,317 @@
import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from "node:fs";
import path from "node:path";
import type { TtsProvider } from "../config/types.tts.js";
import { logVerbose } from "../globals.js";
import { resolvePreferredOpenClawTmpDir } from "../infra/tmp-openclaw-dir.js";
import { isVoiceCompatibleAudio } from "../media/audio.js";
import {
edgeTTS,
elevenLabsTTS,
inferEdgeExtension,
openaiTTS,
scheduleCleanup,
} from "../tts/tts-core.js";
import type {
ResolvedTtsConfig,
TtsDirectiveOverrides,
TtsResult,
TtsTelephonyResult,
} from "../tts/tts.js";
import {
resolveExtensionHostTtsApiKey,
supportsExtensionHostTtsTelephony,
} from "./tts-runtime-registry.js";
// Output profile for voice-bubble channels (Telegram & co.): opus voice notes.
// The whole-object `as const` keeps every property at its literal type —
// without it `extension` widens to `string` and the const is no longer
// assignable to `ExtensionHostTtsOutputFormat` (its `extension` field is the
// union ".opus" | ".mp3"), which breaks resolveExtensionHostTtsOutputFormat.
const TELEGRAM_OUTPUT = {
  openai: "opus",
  // ElevenLabs output formats use codec_sample_rate_bitrate naming.
  // Opus @ 48kHz/64kbps is a good voice-note tradeoff for Telegram.
  elevenlabs: "opus_48000_64",
  extension: ".opus",
  voiceCompatible: true,
} as const;
// Default output profile (mp3) for channels without voice-bubble support.
// `as const` keeps `extension` at its literal ".mp3" — otherwise it widens to
// `string` and the const fails to satisfy `ExtensionHostTtsOutputFormat`
// (whose `extension` is ".opus" | ".mp3") when returned from
// resolveExtensionHostTtsOutputFormat.
const DEFAULT_OUTPUT = {
  openai: "mp3",
  elevenlabs: "mp3_44100_128",
  extension: ".mp3",
  voiceCompatible: false,
} as const;
// Telephony targets receive raw PCM buffers (no container) at a fixed sample
// rate, one profile per API provider.
const TELEPHONY_OUTPUT = {
  openai: { format: "pcm" as const, sampleRate: 24000 },
  elevenlabs: { format: "pcm_22050", sampleRate: 22050 },
};
// Stock Edge TTS output format, used when no explicit edge format is
// configured and as the retry fallback when a custom edge format fails.
const DEFAULT_EDGE_OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3";
// Channels that require opus audio and support voice-bubble playback.
const VOICE_BUBBLE_CHANNELS = new Set(["telegram", "feishu", "whatsapp"]);
// Per-channel output selection returned by resolveExtensionHostTtsOutputFormat:
// provider-specific format ids, file extension, and whether the result can be
// sent as a voice bubble.
type ExtensionHostTtsOutputFormat = {
  openai: "opus" | "mp3";
  elevenlabs: string;
  extension: ".opus" | ".mp3";
  voiceCompatible: boolean;
};
/**
 * True when the given channel id (case/whitespace-insensitive) is one of the
 * channels that supports voice-bubble playback.
 */
export function isExtensionHostTtsVoiceBubbleChannel(channel?: string | null): boolean {
  if (channel == null) {
    return false;
  }
  return VOICE_BUBBLE_CHANNELS.has(channel.trim().toLowerCase());
}
/**
 * Pick the audio output profile for a channel: opus voice-note settings for
 * voice-bubble channels, mp3 defaults for everything else.
 */
export function resolveExtensionHostTtsOutputFormat(
  channel?: string | null,
): ExtensionHostTtsOutputFormat {
  return isExtensionHostTtsVoiceBubbleChannel(channel) ? TELEGRAM_OUTPUT : DEFAULT_OUTPUT;
}
/**
 * Resolve the Edge TTS output format, falling back to the stock format when
 * none is configured (empty string counts as unconfigured).
 */
export function resolveExtensionHostEdgeOutputFormat(config: ResolvedTtsConfig): string {
  const configured = config.edge.outputFormat;
  return configured ? configured : DEFAULT_EDGE_OUTPUT_FORMAT;
}
/**
 * Format a provider failure as "provider: message". Non-Error values are
 * stringified; AbortError (our timeout signal) is reported as a timeout.
 */
export function formatExtensionHostTtsProviderError(provider: TtsProvider, err: unknown): string {
  const normalized = err instanceof Error ? err : new Error(String(err));
  const detail = normalized.name === "AbortError" ? "request timed out" : normalized.message;
  return `${provider}: ${detail}`;
}
/**
 * Aggregate per-provider error strings into the single failure result shape
 * returned when no provider succeeds. An empty (or all-empty) error list is
 * reported as "no providers available".
 */
export function buildExtensionHostTtsFailureResult(errors: string[]): {
  success: false;
  error: string;
} {
  const joined = errors.join("; ");
  const detail = joined === "" ? "no providers available" : joined;
  return { success: false, error: `TTS conversion failed: ${detail}` };
}
/**
 * Run text-to-speech through each provider in order and return the first
 * success as an audio file on disk.
 *
 * Per-provider failures are collected; only if every provider fails is an
 * aggregated failure result returned.
 *
 * @param params.text      Text to synthesize.
 * @param params.config    Resolved TTS configuration (per-provider settings and timeout).
 * @param params.providers Ordered provider preference list.
 * @param params.channel   Optional channel id; voice-bubble channels get opus output.
 * @param params.overrides Optional per-request overrides (voice, model, seed, …).
 * @returns Success with the written audio path and metadata, or failure with
 *          every provider error joined into one message.
 */
export async function executeExtensionHostTextToSpeech(params: {
  text: string;
  config: ResolvedTtsConfig;
  providers: TtsProvider[];
  channel?: string;
  overrides?: TtsDirectiveOverrides;
}): Promise<TtsResult> {
  const { config, providers } = params;
  // The channel determines codec/extension (opus voice notes vs. plain mp3).
  const output = resolveExtensionHostTtsOutputFormat(params.channel);
  const errors: string[] = [];
  for (const provider of providers) {
    const providerStart = Date.now();
    try {
      if (provider === "edge") {
        // Edge TTS writes straight to disk and needs no API key.
        if (!config.edge.enabled) {
          errors.push("edge: disabled");
          continue;
        }
        const tempRoot = resolvePreferredOpenClawTmpDir();
        // 0o700: temp audio must be readable only by the current user.
        mkdirSync(tempRoot, { recursive: true, mode: 0o700 });
        const tempDir = mkdtempSync(path.join(tempRoot, "tts-"));
        let edgeOutputFormat = resolveExtensionHostEdgeOutputFormat(config);
        // When a custom edge format is configured, keep the stock format as a
        // one-shot retry fallback.
        const fallbackEdgeOutputFormat =
          edgeOutputFormat !== DEFAULT_EDGE_OUTPUT_FORMAT ? DEFAULT_EDGE_OUTPUT_FORMAT : undefined;
        // One synthesis attempt for the given format; returns the file it wrote.
        const attemptEdgeTts = async (outputFormat: string) => {
          const extension = inferEdgeExtension(outputFormat);
          const audioPath = path.join(tempDir, `voice-${Date.now()}${extension}`);
          await edgeTTS({
            text: params.text,
            outputPath: audioPath,
            config: {
              ...config.edge,
              outputFormat,
            },
            timeoutMs: config.timeoutMs,
          });
          return { audioPath, outputFormat };
        };
        let edgeResult: { audioPath: string; outputFormat: string };
        try {
          edgeResult = await attemptEdgeTts(edgeOutputFormat);
        } catch (err) {
          if (fallbackEdgeOutputFormat && fallbackEdgeOutputFormat !== edgeOutputFormat) {
            logVerbose(
              `TTS: Edge output ${edgeOutputFormat} failed; retrying with ${fallbackEdgeOutputFormat}.`,
            );
            edgeOutputFormat = fallbackEdgeOutputFormat;
            try {
              edgeResult = await attemptEdgeTts(edgeOutputFormat);
            } catch (fallbackErr) {
              // Both attempts failed: remove the temp dir before surfacing
              // the error (outer catch turns it into an errors[] entry).
              try {
                rmSync(tempDir, { recursive: true, force: true });
              } catch {} // ignore cleanup errors
              throw fallbackErr;
            }
          } else {
            // No fallback applicable: clean up and surface the error.
            try {
              rmSync(tempDir, { recursive: true, force: true });
            } catch {} // ignore cleanup errors
            throw err;
          }
        }
        // Success: defer temp-dir removal so the caller can consume the file.
        scheduleCleanup(tempDir);
        const voiceCompatible = isVoiceCompatibleAudio({ fileName: edgeResult.audioPath });
        return {
          success: true,
          audioPath: edgeResult.audioPath,
          latencyMs: Date.now() - providerStart,
          provider,
          outputFormat: edgeResult.outputFormat,
          voiceCompatible,
        };
      }
      // Remaining providers (openai / elevenlabs) require an API key.
      const apiKey = resolveExtensionHostTtsApiKey(config, provider);
      if (!apiKey) {
        errors.push(`${provider}: no API key`);
        continue;
      }
      let audioBuffer: Buffer;
      if (provider === "elevenlabs") {
        // Request-level overrides win over resolved config; voiceSettings are
        // shallow-merged so partial overrides keep configured defaults.
        const voiceIdOverride = params.overrides?.elevenlabs?.voiceId;
        const modelIdOverride = params.overrides?.elevenlabs?.modelId;
        const voiceSettings = {
          ...config.elevenlabs.voiceSettings,
          ...params.overrides?.elevenlabs?.voiceSettings,
        };
        const seedOverride = params.overrides?.elevenlabs?.seed;
        const normalizationOverride = params.overrides?.elevenlabs?.applyTextNormalization;
        const languageOverride = params.overrides?.elevenlabs?.languageCode;
        audioBuffer = await elevenLabsTTS({
          text: params.text,
          apiKey,
          baseUrl: config.elevenlabs.baseUrl,
          voiceId: voiceIdOverride ?? config.elevenlabs.voiceId,
          modelId: modelIdOverride ?? config.elevenlabs.modelId,
          outputFormat: output.elevenlabs,
          seed: seedOverride ?? config.elevenlabs.seed,
          applyTextNormalization: normalizationOverride ?? config.elevenlabs.applyTextNormalization,
          languageCode: languageOverride ?? config.elevenlabs.languageCode,
          voiceSettings,
          timeoutMs: config.timeoutMs,
        });
      } else {
        const openaiModelOverride = params.overrides?.openai?.model;
        const openaiVoiceOverride = params.overrides?.openai?.voice;
        audioBuffer = await openaiTTS({
          text: params.text,
          apiKey,
          baseUrl: config.openai.baseUrl,
          model: openaiModelOverride ?? config.openai.model,
          voice: openaiVoiceOverride ?? config.openai.voice,
          speed: config.openai.speed,
          instructions: config.openai.instructions,
          responseFormat: output.openai,
          timeoutMs: config.timeoutMs,
        });
      }
      // Persist the buffer under a fresh private temp dir; cleanup is deferred
      // so the caller can read the file first.
      const tempRoot = resolvePreferredOpenClawTmpDir();
      mkdirSync(tempRoot, { recursive: true, mode: 0o700 });
      const tempDir = mkdtempSync(path.join(tempRoot, "tts-"));
      const audioPath = path.join(tempDir, `voice-${Date.now()}${output.extension}`);
      writeFileSync(audioPath, audioBuffer);
      scheduleCleanup(tempDir);
      return {
        success: true,
        audioPath,
        latencyMs: Date.now() - providerStart,
        provider,
        outputFormat: provider === "openai" ? output.openai : output.elevenlabs,
        voiceCompatible: output.voiceCompatible,
      };
    } catch (err) {
      // Record the failure and fall through to the next provider.
      errors.push(formatExtensionHostTtsProviderError(provider, err));
    }
  }
  // Every provider failed (or none were usable).
  return buildExtensionHostTtsFailureResult(errors);
}
/**
 * Synthesize speech for telephony use: returns a raw PCM audio buffer plus its
 * sample rate instead of writing a file.
 *
 * Providers are tried in order; providers without telephony support (per
 * supportsExtensionHostTtsTelephony) are skipped. Per-provider errors are
 * aggregated into one failure result when every provider fails.
 *
 * @param params.text      Text to synthesize.
 * @param params.config    Resolved TTS configuration.
 * @param params.providers Ordered provider preference list.
 */
export async function executeExtensionHostTextToSpeechTelephony(params: {
  text: string;
  config: ResolvedTtsConfig;
  providers: TtsProvider[];
}): Promise<TtsTelephonyResult> {
  const { config, providers } = params;
  const errors: string[] = [];
  for (const provider of providers) {
    const providerStart = Date.now();
    try {
      if (!supportsExtensionHostTtsTelephony(provider)) {
        // Name the actual provider — the previous message hard-coded "edge",
        // which would mislabel any other provider lacking telephony support.
        errors.push(`${provider}: unsupported for telephony`);
        continue;
      }
      const apiKey = resolveExtensionHostTtsApiKey(config, provider);
      if (!apiKey) {
        errors.push(`${provider}: no API key`);
        continue;
      }
      if (provider === "elevenlabs") {
        const output = TELEPHONY_OUTPUT.elevenlabs;
        const audioBuffer = await elevenLabsTTS({
          text: params.text,
          apiKey,
          baseUrl: config.elevenlabs.baseUrl,
          voiceId: config.elevenlabs.voiceId,
          modelId: config.elevenlabs.modelId,
          outputFormat: output.format,
          seed: config.elevenlabs.seed,
          applyTextNormalization: config.elevenlabs.applyTextNormalization,
          languageCode: config.elevenlabs.languageCode,
          voiceSettings: config.elevenlabs.voiceSettings,
          timeoutMs: config.timeoutMs,
        });
        return {
          success: true,
          audioBuffer,
          latencyMs: Date.now() - providerStart,
          provider,
          outputFormat: output.format,
          sampleRate: output.sampleRate,
        };
      }
      // Default API provider: OpenAI with raw PCM output.
      const output = TELEPHONY_OUTPUT.openai;
      const audioBuffer = await openaiTTS({
        text: params.text,
        apiKey,
        baseUrl: config.openai.baseUrl,
        model: config.openai.model,
        voice: config.openai.voice,
        speed: config.openai.speed,
        instructions: config.openai.instructions,
        responseFormat: output.format,
        timeoutMs: config.timeoutMs,
      });
      return {
        success: true,
        audioBuffer,
        latencyMs: Date.now() - providerStart,
        provider,
        outputFormat: output.format,
        sampleRate: output.sampleRate,
      };
    } catch (err) {
      // Record the failure and fall through to the next provider.
      errors.push(formatExtensionHostTtsProviderError(provider, err));
    }
  }
  return buildExtensionHostTtsFailureResult(errors);
}

View File

@ -1,18 +1,7 @@
import { randomBytes } from "node:crypto";
import {
existsSync,
mkdirSync,
readFileSync,
writeFileSync,
mkdtempSync,
rmSync,
renameSync,
unlinkSync,
} from "node:fs";
import { existsSync, readFileSync, writeFileSync, renameSync, unlinkSync } from "node:fs";
import path from "node:path";
import type { ReplyPayload } from "../auto-reply/types.js";
import { normalizeChannelId } from "../channels/plugins/index.js";
import type { ChannelId } from "../channels/plugins/types.js";
import type { OpenClawConfig } from "../config/config.js";
import { normalizeResolvedSecretInputString } from "../config/types.secrets.js";
import type {
@ -22,32 +11,31 @@ import type {
TtsProvider,
TtsModelOverrideConfig,
} from "../config/types.tts.js";
import {
executeExtensionHostTextToSpeech,
executeExtensionHostTextToSpeechTelephony,
isExtensionHostTtsVoiceBubbleChannel,
resolveExtensionHostEdgeOutputFormat,
resolveExtensionHostTtsOutputFormat,
} from "../extension-host/tts-runtime-execution.js";
import {
EXTENSION_HOST_TTS_PROVIDER_IDS,
isExtensionHostTtsProviderConfigured,
resolveExtensionHostTtsApiKey,
resolveExtensionHostTtsProviderOrder,
supportsExtensionHostTtsTelephony,
} from "../extension-host/tts-runtime-registry.js";
import { logVerbose } from "../globals.js";
import { resolvePreferredOpenClawTmpDir } from "../infra/tmp-openclaw-dir.js";
import { stripMarkdown } from "../line/markdown-to-line.js";
import { isVoiceCompatibleAudio } from "../media/audio.js";
import { CONFIG_DIR, resolveUserPath } from "../utils.js";
import {
DEFAULT_OPENAI_BASE_URL,
edgeTTS,
elevenLabsTTS,
inferEdgeExtension,
isValidOpenAIModel,
isValidOpenAIVoice,
isValidVoiceId,
OPENAI_TTS_MODELS,
OPENAI_TTS_VOICES,
resolveOpenAITtsInstructions,
openaiTTS,
parseTtsDirectives,
scheduleCleanup,
summarizeText,
} from "./tts-core.js";
export { OPENAI_TTS_MODELS, OPENAI_TTS_VOICES } from "./tts-core.js";
@ -74,27 +62,6 @@ const DEFAULT_ELEVENLABS_VOICE_SETTINGS = {
speed: 1.0,
};
const TELEGRAM_OUTPUT = {
openai: "opus" as const,
// ElevenLabs output formats use codec_sample_rate_bitrate naming.
// Opus @ 48kHz/64kbps is a good voice-note tradeoff for Telegram.
elevenlabs: "opus_48000_64",
extension: ".opus",
voiceCompatible: true,
};
const DEFAULT_OUTPUT = {
openai: "mp3" as const,
elevenlabs: "mp3_44100_128",
extension: ".mp3",
voiceCompatible: false,
};
const TELEPHONY_OUTPUT = {
openai: { format: "pcm" as const, sampleRate: 24000 },
elevenlabs: { format: "pcm_22050", sampleRate: 22050 },
};
const TTS_AUTO_MODES = new Set<TtsAutoMode>(["off", "always", "inbound", "tagged"]);
export type ResolvedTtsConfig = {
@ -507,24 +474,6 @@ export function setLastTtsAttempt(entry: TtsStatusEntry | undefined): void {
lastTtsAttempt = entry;
}
/** Channels that require opus audio and support voice-bubble playback */
const VOICE_BUBBLE_CHANNELS = new Set(["telegram", "feishu", "whatsapp"]);
function resolveOutputFormat(channelId?: string | null) {
if (channelId && VOICE_BUBBLE_CHANNELS.has(channelId)) {
return TELEGRAM_OUTPUT;
}
return DEFAULT_OUTPUT;
}
function resolveChannelId(channel: string | undefined): ChannelId | null {
return channel ? normalizeChannelId(channel) : null;
}
function resolveEdgeOutputFormat(config: ResolvedTtsConfig): string {
return config.edge.outputFormat;
}
export const TTS_PROVIDERS = EXTENSION_HOST_TTS_PROVIDER_IDS;
export const resolveTtsApiKey = resolveExtensionHostTtsApiKey;
@ -533,21 +482,6 @@ export const resolveTtsProviderOrder = resolveExtensionHostTtsProviderOrder;
export const isTtsProviderConfigured = isExtensionHostTtsProviderConfigured;
function formatTtsProviderError(provider: TtsProvider, err: unknown): string {
const error = err instanceof Error ? err : new Error(String(err));
if (error.name === "AbortError") {
return `${provider}: request timed out`;
}
return `${provider}: ${error.message}`;
}
function buildTtsFailureResult(errors: string[]): { success: false; error: string } {
return {
success: false,
error: `TTS conversion failed: ${errors.join("; ") || "no providers available"}`,
};
}
function resolveTtsRequestSetup(params: {
text: string;
cfg: OpenClawConfig;
@ -594,154 +528,13 @@ export async function textToSpeech(params: {
return { success: false, error: setup.error };
}
const { config, providers } = setup;
const channelId = resolveChannelId(params.channel);
const output = resolveOutputFormat(channelId);
const errors: string[] = [];
for (const provider of providers) {
const providerStart = Date.now();
try {
if (provider === "edge") {
if (!config.edge.enabled) {
errors.push("edge: disabled");
continue;
}
const tempRoot = resolvePreferredOpenClawTmpDir();
mkdirSync(tempRoot, { recursive: true, mode: 0o700 });
const tempDir = mkdtempSync(path.join(tempRoot, "tts-"));
let edgeOutputFormat = resolveEdgeOutputFormat(config);
const fallbackEdgeOutputFormat =
edgeOutputFormat !== DEFAULT_EDGE_OUTPUT_FORMAT ? DEFAULT_EDGE_OUTPUT_FORMAT : undefined;
const attemptEdgeTts = async (outputFormat: string) => {
const extension = inferEdgeExtension(outputFormat);
const audioPath = path.join(tempDir, `voice-${Date.now()}${extension}`);
await edgeTTS({
text: params.text,
outputPath: audioPath,
config: {
...config.edge,
outputFormat,
},
timeoutMs: config.timeoutMs,
});
return { audioPath, outputFormat };
};
let edgeResult: { audioPath: string; outputFormat: string };
try {
edgeResult = await attemptEdgeTts(edgeOutputFormat);
} catch (err) {
if (fallbackEdgeOutputFormat && fallbackEdgeOutputFormat !== edgeOutputFormat) {
logVerbose(
`TTS: Edge output ${edgeOutputFormat} failed; retrying with ${fallbackEdgeOutputFormat}.`,
);
edgeOutputFormat = fallbackEdgeOutputFormat;
try {
edgeResult = await attemptEdgeTts(edgeOutputFormat);
} catch (fallbackErr) {
try {
rmSync(tempDir, { recursive: true, force: true });
} catch {
// ignore cleanup errors
}
throw fallbackErr;
}
} else {
try {
rmSync(tempDir, { recursive: true, force: true });
} catch {
// ignore cleanup errors
}
throw err;
}
}
scheduleCleanup(tempDir);
const voiceCompatible = isVoiceCompatibleAudio({ fileName: edgeResult.audioPath });
return {
success: true,
audioPath: edgeResult.audioPath,
latencyMs: Date.now() - providerStart,
provider,
outputFormat: edgeResult.outputFormat,
voiceCompatible,
};
}
const apiKey = resolveExtensionHostTtsApiKey(config, provider);
if (!apiKey) {
errors.push(`${provider}: no API key`);
continue;
}
let audioBuffer: Buffer;
if (provider === "elevenlabs") {
const voiceIdOverride = params.overrides?.elevenlabs?.voiceId;
const modelIdOverride = params.overrides?.elevenlabs?.modelId;
const voiceSettings = {
...config.elevenlabs.voiceSettings,
...params.overrides?.elevenlabs?.voiceSettings,
};
const seedOverride = params.overrides?.elevenlabs?.seed;
const normalizationOverride = params.overrides?.elevenlabs?.applyTextNormalization;
const languageOverride = params.overrides?.elevenlabs?.languageCode;
audioBuffer = await elevenLabsTTS({
text: params.text,
apiKey,
baseUrl: config.elevenlabs.baseUrl,
voiceId: voiceIdOverride ?? config.elevenlabs.voiceId,
modelId: modelIdOverride ?? config.elevenlabs.modelId,
outputFormat: output.elevenlabs,
seed: seedOverride ?? config.elevenlabs.seed,
applyTextNormalization: normalizationOverride ?? config.elevenlabs.applyTextNormalization,
languageCode: languageOverride ?? config.elevenlabs.languageCode,
voiceSettings,
timeoutMs: config.timeoutMs,
});
} else {
const openaiModelOverride = params.overrides?.openai?.model;
const openaiVoiceOverride = params.overrides?.openai?.voice;
audioBuffer = await openaiTTS({
text: params.text,
apiKey,
baseUrl: config.openai.baseUrl,
model: openaiModelOverride ?? config.openai.model,
voice: openaiVoiceOverride ?? config.openai.voice,
speed: config.openai.speed,
instructions: config.openai.instructions,
responseFormat: output.openai,
timeoutMs: config.timeoutMs,
});
}
const latencyMs = Date.now() - providerStart;
const tempRoot = resolvePreferredOpenClawTmpDir();
mkdirSync(tempRoot, { recursive: true, mode: 0o700 });
const tempDir = mkdtempSync(path.join(tempRoot, "tts-"));
const audioPath = path.join(tempDir, `voice-${Date.now()}${output.extension}`);
writeFileSync(audioPath, audioBuffer);
scheduleCleanup(tempDir);
return {
success: true,
audioPath,
latencyMs,
provider,
outputFormat: provider === "openai" ? output.openai : output.elevenlabs,
voiceCompatible: output.voiceCompatible,
};
} catch (err) {
errors.push(formatTtsProviderError(provider, err));
}
}
return buildTtsFailureResult(errors);
return executeExtensionHostTextToSpeech({
text: params.text,
config: setup.config,
providers: setup.providers,
channel: params.channel,
overrides: params.overrides,
});
}
export async function textToSpeechTelephony(params: {
@ -758,77 +551,11 @@ export async function textToSpeechTelephony(params: {
return { success: false, error: setup.error };
}
const { config, providers } = setup;
const errors: string[] = [];
for (const provider of providers) {
const providerStart = Date.now();
try {
if (!supportsExtensionHostTtsTelephony(provider)) {
errors.push("edge: unsupported for telephony");
continue;
}
const apiKey = resolveExtensionHostTtsApiKey(config, provider);
if (!apiKey) {
errors.push(`${provider}: no API key`);
continue;
}
if (provider === "elevenlabs") {
const output = TELEPHONY_OUTPUT.elevenlabs;
const audioBuffer = await elevenLabsTTS({
text: params.text,
apiKey,
baseUrl: config.elevenlabs.baseUrl,
voiceId: config.elevenlabs.voiceId,
modelId: config.elevenlabs.modelId,
outputFormat: output.format,
seed: config.elevenlabs.seed,
applyTextNormalization: config.elevenlabs.applyTextNormalization,
languageCode: config.elevenlabs.languageCode,
voiceSettings: config.elevenlabs.voiceSettings,
timeoutMs: config.timeoutMs,
});
return {
success: true,
audioBuffer,
latencyMs: Date.now() - providerStart,
provider,
outputFormat: output.format,
sampleRate: output.sampleRate,
};
}
const output = TELEPHONY_OUTPUT.openai;
const audioBuffer = await openaiTTS({
text: params.text,
apiKey,
baseUrl: config.openai.baseUrl,
model: config.openai.model,
voice: config.openai.voice,
speed: config.openai.speed,
instructions: config.openai.instructions,
responseFormat: output.format,
timeoutMs: config.timeoutMs,
});
return {
success: true,
audioBuffer,
latencyMs: Date.now() - providerStart,
provider,
outputFormat: output.format,
sampleRate: output.sampleRate,
};
} catch (err) {
errors.push(formatTtsProviderError(provider, err));
}
}
return buildTtsFailureResult(errors);
return executeExtensionHostTextToSpeechTelephony({
text: params.text,
config: setup.config,
providers: setup.providers,
});
}
export async function maybeApplyTtsToPayload(params: {
@ -953,9 +680,8 @@ export async function maybeApplyTtsToPayload(params: {
latencyMs: result.latencyMs,
};
const channelId = resolveChannelId(params.channel);
const shouldVoice =
channelId !== null && VOICE_BUBBLE_CHANNELS.has(channelId) && result.voiceCompatible === true;
isExtensionHostTtsVoiceBubbleChannel(params.channel) && result.voiceCompatible === true;
const finalPayload = {
...nextPayload,
mediaUrl: result.audioPath,
@ -987,6 +713,6 @@ export const _test = {
parseTtsDirectives,
resolveModelOverridePolicy,
summarizeText,
resolveOutputFormat,
resolveEdgeOutputFormat,
resolveOutputFormat: resolveExtensionHostTtsOutputFormat,
resolveEdgeOutputFormat: resolveExtensionHostEdgeOutputFormat,
};