From 1d08ad4bac6a5d984ccf8e67f87e9c5fbccff68d Mon Sep 17 00:00:00 2001
From: Vincent Koc <vincentkoc@ieee.org>
Date: Sun, 22 Mar 2026 17:47:04 -0700
Subject: [PATCH] refactor(tts): remove legacy core speech builders

---
 src/tts/providers/elevenlabs.ts | 127 --------------------------------
 src/tts/providers/microsoft.ts  | 127 --------------------------------
 src/tts/providers/openai.ts     |  57 --------------
 3 files changed, 311 deletions(-)
 delete mode 100644 src/tts/providers/elevenlabs.ts
 delete mode 100644 src/tts/providers/microsoft.ts
 delete mode 100644 src/tts/providers/openai.ts

diff --git a/src/tts/providers/elevenlabs.ts b/src/tts/providers/elevenlabs.ts
deleted file mode 100644
index 99097fc42f3..00000000000
--- a/src/tts/providers/elevenlabs.ts
+++ /dev/null
@@ -1,127 +0,0 @@
-import type { SpeechProviderPlugin } from "../../plugins/types.js";
-import type { SpeechVoiceOption } from "../provider-types.js";
-import { elevenLabsTTS } from "../tts-core.js";
-
-const ELEVENLABS_TTS_MODELS = [
-  "eleven_multilingual_v2",
-  "eleven_turbo_v2_5",
-  "eleven_monolingual_v1",
-] as const;
-
-function normalizeElevenLabsBaseUrl(baseUrl: string | undefined): string {
-  const trimmed = baseUrl?.trim();
-  return trimmed?.replace(/\/+$/, "") || "https://api.elevenlabs.io";
-}
-
-export async function listElevenLabsVoices(params: {
-  apiKey: string;
-  baseUrl?: string;
-}): Promise<SpeechVoiceOption[]> {
-  const res = await fetch(`${normalizeElevenLabsBaseUrl(params.baseUrl)}/v1/voices`, {
-    headers: {
-      "xi-api-key": params.apiKey,
-    },
-  });
-  if (!res.ok) {
-    throw new Error(`ElevenLabs voices API error (${res.status})`);
-  }
-  const json = (await res.json()) as {
-    voices?: Array<{
-      voice_id?: string;
-      name?: string;
-      category?: string;
-      description?: string;
-    }>;
-  };
-  return Array.isArray(json.voices)
-    ? json.voices
-        .map((voice) => ({
-          id: voice.voice_id?.trim() ?? "",
-          name: voice.name?.trim() || undefined,
-          category: voice.category?.trim() || undefined,
-          description: voice.description?.trim() || undefined,
-        }))
-        .filter((voice) => voice.id.length > 0)
-    : [];
-}
-
-export function buildElevenLabsSpeechProvider(): SpeechProviderPlugin {
-  return {
-    id: "elevenlabs",
-    label: "ElevenLabs",
-    models: ELEVENLABS_TTS_MODELS,
-    listVoices: async (req) => {
-      const apiKey =
-        req.apiKey ||
-        req.config?.elevenlabs.apiKey ||
-        process.env.ELEVENLABS_API_KEY ||
-        process.env.XI_API_KEY;
-      if (!apiKey) {
-        throw new Error("ElevenLabs API key missing");
-      }
-      return listElevenLabsVoices({
-        apiKey,
-        baseUrl: req.baseUrl ?? req.config?.elevenlabs.baseUrl,
-      });
-    },
-    isConfigured: ({ config }) =>
-      Boolean(config.elevenlabs.apiKey || process.env.ELEVENLABS_API_KEY || process.env.XI_API_KEY),
-    synthesize: async (req) => {
-      const apiKey =
-        req.config.elevenlabs.apiKey || process.env.ELEVENLABS_API_KEY || process.env.XI_API_KEY;
-      if (!apiKey) {
-        throw new Error("ElevenLabs API key missing");
-      }
-      const outputFormat =
-        req.overrides?.elevenlabs?.outputFormat ??
-        (req.target === "voice-note" ? "opus_48000_64" : "mp3_44100_128");
-      const audioBuffer = await elevenLabsTTS({
-        text: req.text,
-        apiKey,
-        baseUrl: req.config.elevenlabs.baseUrl,
-        voiceId: req.overrides?.elevenlabs?.voiceId ?? req.config.elevenlabs.voiceId,
-        modelId: req.overrides?.elevenlabs?.modelId ?? req.config.elevenlabs.modelId,
-        outputFormat,
-        seed: req.overrides?.elevenlabs?.seed ?? req.config.elevenlabs.seed,
-        applyTextNormalization:
-          req.overrides?.elevenlabs?.applyTextNormalization ??
-          req.config.elevenlabs.applyTextNormalization,
-        languageCode: req.overrides?.elevenlabs?.languageCode ?? req.config.elevenlabs.languageCode,
-        voiceSettings: {
-          ...req.config.elevenlabs.voiceSettings,
-          ...req.overrides?.elevenlabs?.voiceSettings,
-        },
-        timeoutMs: req.config.timeoutMs,
-      });
-      return {
-        audioBuffer,
-        outputFormat,
-        fileExtension: req.target === "voice-note" ? ".opus" : ".mp3",
-        voiceCompatible: req.target === "voice-note",
-      };
-    },
-    synthesizeTelephony: async (req) => {
-      const apiKey =
-        req.config.elevenlabs.apiKey || process.env.ELEVENLABS_API_KEY || process.env.XI_API_KEY;
-      if (!apiKey) {
-        throw new Error("ElevenLabs API key missing");
-      }
-      const outputFormat = "pcm_22050";
-      const sampleRate = 22_050;
-      const audioBuffer = await elevenLabsTTS({
-        text: req.text,
-        apiKey,
-        baseUrl: req.config.elevenlabs.baseUrl,
-        voiceId: req.config.elevenlabs.voiceId,
-        modelId: req.config.elevenlabs.modelId,
-        outputFormat,
-        seed: req.config.elevenlabs.seed,
-        applyTextNormalization: req.config.elevenlabs.applyTextNormalization,
-        languageCode: req.config.elevenlabs.languageCode,
-        voiceSettings: req.config.elevenlabs.voiceSettings,
-        timeoutMs: req.config.timeoutMs,
-      });
-      return { audioBuffer, outputFormat, sampleRate };
-    },
-  };
-}
diff --git a/src/tts/providers/microsoft.ts b/src/tts/providers/microsoft.ts
deleted file mode 100644
index f6c5aa8c379..00000000000
--- a/src/tts/providers/microsoft.ts
+++ /dev/null
@@ -1,127 +0,0 @@
-import { mkdirSync, mkdtempSync, readFileSync, rmSync } from "node:fs";
-import path from "node:path";
-import {
-  CHROMIUM_FULL_VERSION,
-  TRUSTED_CLIENT_TOKEN,
-  generateSecMsGecToken,
-} from "node-edge-tts/dist/drm.js";
-import { resolvePreferredOpenClawTmpDir } from "../../infra/tmp-openclaw-dir.js";
-import { isVoiceCompatibleAudio } from "../../media/audio.js";
-import type { SpeechProviderPlugin } from "../../plugins/types.js";
-import type { SpeechVoiceOption } from "../provider-types.js";
-import { edgeTTS, inferEdgeExtension } from "../tts-core.js";
-
-const DEFAULT_EDGE_OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3";
-
-type MicrosoftVoiceListEntry = {
-  ShortName?: string;
-  FriendlyName?: string;
-  Locale?: string;
-  Gender?: string;
-  VoiceTag?: {
-    ContentCategories?: string[];
-    VoicePersonalities?: string[];
-  };
-};
-
-function buildMicrosoftVoiceHeaders(): Record<string, string> {
-  const major = CHROMIUM_FULL_VERSION.split(".")[0] || "0";
-  return {
-    Authority: "speech.platform.bing.com",
-    Origin: "chrome-extension://jdiccldimpdaibmpdkjnbmckianbfold",
-    Accept: "*/*",
-    "User-Agent":
-      `Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ` +
-      `(KHTML, like Gecko) Chrome/${major}.0.0.0 Safari/537.36 Edg/${major}.0.0.0`,
-    "Sec-MS-GEC": generateSecMsGecToken(),
-    "Sec-MS-GEC-Version": `1-${CHROMIUM_FULL_VERSION}`,
-  };
-}
-
-function formatMicrosoftVoiceDescription(entry: MicrosoftVoiceListEntry): string | undefined {
-  const personalities = entry.VoiceTag?.VoicePersonalities?.filter(Boolean) ?? [];
-  return personalities.length > 0 ? personalities.join(", ") : undefined;
-}
-
-export async function listMicrosoftVoices(): Promise<SpeechVoiceOption[]> {
-  const response = await fetch(
-    "https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list" +
-      `?trustedclienttoken=${TRUSTED_CLIENT_TOKEN}`,
-    {
-      headers: buildMicrosoftVoiceHeaders(),
-    },
-  );
-  if (!response.ok) {
-    throw new Error(`Microsoft voices API error (${response.status})`);
-  }
-  const voices = (await response.json()) as MicrosoftVoiceListEntry[];
-  return Array.isArray(voices)
-    ? voices
-        .map((voice) => ({
-          id: voice.ShortName?.trim() ?? "",
-          name: voice.FriendlyName?.trim() || voice.ShortName?.trim() || undefined,
-          category: voice.VoiceTag?.ContentCategories?.find((value) => value.trim().length > 0),
-          description: formatMicrosoftVoiceDescription(voice),
-          locale: voice.Locale?.trim() || undefined,
-          gender: voice.Gender?.trim() || undefined,
-          personalities: voice.VoiceTag?.VoicePersonalities?.filter(
-            (value): value is string => value.trim().length > 0,
-          ),
-        }))
-        .filter((voice) => voice.id.length > 0)
-    : [];
-}
-
-export function buildMicrosoftSpeechProvider(): SpeechProviderPlugin {
-  return {
-    id: "microsoft",
-    label: "Microsoft",
-    aliases: ["edge"],
-    listVoices: async () => await listMicrosoftVoices(),
-    isConfigured: ({ config }) => config.edge.enabled,
-    synthesize: async (req) => {
-      const tempRoot = resolvePreferredOpenClawTmpDir();
-      mkdirSync(tempRoot, { recursive: true, mode: 0o700 });
-      const tempDir = mkdtempSync(path.join(tempRoot, "tts-microsoft-"));
-      let outputFormat = req.overrides?.microsoft?.outputFormat ?? req.config.edge.outputFormat;
-      const fallbackOutputFormat =
-        outputFormat !== DEFAULT_EDGE_OUTPUT_FORMAT ? DEFAULT_EDGE_OUTPUT_FORMAT : undefined;
-
-      try {
-        const runEdge = async (format: string) => {
-          const fileExtension = inferEdgeExtension(format);
-          const outputPath = path.join(tempDir, `speech${fileExtension}`);
-          await edgeTTS({
-            text: req.text,
-            outputPath,
-            config: {
-              ...req.config.edge,
-              voice: req.overrides?.microsoft?.voice ?? req.config.edge.voice,
-              outputFormat: format,
-            },
-            timeoutMs: req.config.timeoutMs,
-          });
-          const audioBuffer = readFileSync(outputPath);
-          return {
-            audioBuffer,
-            outputFormat: format,
-            fileExtension,
-            voiceCompatible: isVoiceCompatibleAudio({ fileName: outputPath }),
-          };
-        };
-
-        try {
-          return await runEdge(outputFormat);
-        } catch (err) {
-          if (!fallbackOutputFormat || fallbackOutputFormat === outputFormat) {
-            throw err;
-          }
-          outputFormat = fallbackOutputFormat;
-          return await runEdge(outputFormat);
-        }
-      } finally {
-        rmSync(tempDir, { recursive: true, force: true });
-      }
-    },
-  };
-}
diff --git a/src/tts/providers/openai.ts b/src/tts/providers/openai.ts
deleted file mode 100644
index 01e5997e85c..00000000000
--- a/src/tts/providers/openai.ts
+++ /dev/null
@@ -1,57 +0,0 @@
-import type { SpeechProviderPlugin } from "../../plugins/types.js";
-import { OPENAI_TTS_MODELS, OPENAI_TTS_VOICES, openaiTTS } from "../tts-core.js";
-
-export function buildOpenAISpeechProvider(): SpeechProviderPlugin {
-  return {
-    id: "openai",
-    label: "OpenAI",
-    models: OPENAI_TTS_MODELS,
-    voices: OPENAI_TTS_VOICES,
-    listVoices: async () => OPENAI_TTS_VOICES.map((voice) => ({ id: voice, name: voice })),
-    isConfigured: ({ config }) => Boolean(config.openai.apiKey || process.env.OPENAI_API_KEY),
-    synthesize: async (req) => {
-      const apiKey = req.config.openai.apiKey || process.env.OPENAI_API_KEY;
-      if (!apiKey) {
-        throw new Error("OpenAI API key missing");
-      }
-      const responseFormat = req.target === "voice-note" ? "opus" : "mp3";
-      const audioBuffer = await openaiTTS({
-        text: req.text,
-        apiKey,
-        baseUrl: req.config.openai.baseUrl,
-        model: req.overrides?.openai?.model ?? req.config.openai.model,
-        voice: req.overrides?.openai?.voice ?? req.config.openai.voice,
-        speed: req.overrides?.openai?.speed ?? req.config.openai.speed,
-        instructions: req.config.openai.instructions,
-        responseFormat,
-        timeoutMs: req.config.timeoutMs,
-      });
-      return {
-        audioBuffer,
-        outputFormat: responseFormat,
-        fileExtension: responseFormat === "opus" ? ".opus" : ".mp3",
-        voiceCompatible: req.target === "voice-note",
-      };
-    },
-    synthesizeTelephony: async (req) => {
-      const apiKey = req.config.openai.apiKey || process.env.OPENAI_API_KEY;
-      if (!apiKey) {
-        throw new Error("OpenAI API key missing");
-      }
-      const outputFormat = "pcm";
-      const sampleRate = 24_000;
-      const audioBuffer = await openaiTTS({
-        text: req.text,
-        apiKey,
-        baseUrl: req.config.openai.baseUrl,
-        model: req.config.openai.model,
-        voice: req.config.openai.voice,
-        speed: req.config.openai.speed,
-        instructions: req.config.openai.instructions,
-        responseFormat: outputFormat,
-        timeoutMs: req.config.timeoutMs,
-      });
-      return { audioBuffer, outputFormat, sampleRate };
-    },
-  };
-}