From 1d08ad4bac6a5d984ccf8e67f87e9c5fbccff68d Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sun, 22 Mar 2026 17:47:04 -0700 Subject: [PATCH] refactor(tts): remove legacy core speech builders --- src/tts/providers/elevenlabs.ts | 127 -------------------------------- src/tts/providers/microsoft.ts | 127 -------------------------------- src/tts/providers/openai.ts | 57 -------------- 3 files changed, 311 deletions(-) delete mode 100644 src/tts/providers/elevenlabs.ts delete mode 100644 src/tts/providers/microsoft.ts delete mode 100644 src/tts/providers/openai.ts diff --git a/src/tts/providers/elevenlabs.ts b/src/tts/providers/elevenlabs.ts deleted file mode 100644 index 99097fc42f3..00000000000 --- a/src/tts/providers/elevenlabs.ts +++ /dev/null @@ -1,127 +0,0 @@ -import type { SpeechProviderPlugin } from "../../plugins/types.js"; -import type { SpeechVoiceOption } from "../provider-types.js"; -import { elevenLabsTTS } from "../tts-core.js"; - -const ELEVENLABS_TTS_MODELS = [ - "eleven_multilingual_v2", - "eleven_turbo_v2_5", - "eleven_monolingual_v1", -] as const; - -function normalizeElevenLabsBaseUrl(baseUrl: string | undefined): string { - const trimmed = baseUrl?.trim(); - return trimmed?.replace(/\/+$/, "") || "https://api.elevenlabs.io"; -} - -export async function listElevenLabsVoices(params: { - apiKey: string; - baseUrl?: string; -}): Promise { - const res = await fetch(`${normalizeElevenLabsBaseUrl(params.baseUrl)}/v1/voices`, { - headers: { - "xi-api-key": params.apiKey, - }, - }); - if (!res.ok) { - throw new Error(`ElevenLabs voices API error (${res.status})`); - } - const json = (await res.json()) as { - voices?: Array<{ - voice_id?: string; - name?: string; - category?: string; - description?: string; - }>; - }; - return Array.isArray(json.voices) - ? json.voices - .map((voice) => ({ - id: voice.voice_id?.trim() ?? "", - name: voice.name?.trim() || undefined, - category: voice.category?.trim() || undefined, - description: voice.description?.trim() || undefined, - })) - .filter((voice) => voice.id.length > 0) - : []; -} - -export function buildElevenLabsSpeechProvider(): SpeechProviderPlugin { - return { - id: "elevenlabs", - label: "ElevenLabs", - models: ELEVENLABS_TTS_MODELS, - listVoices: async (req) => { - const apiKey = - req.apiKey || - req.config?.elevenlabs.apiKey || - process.env.ELEVENLABS_API_KEY || - process.env.XI_API_KEY; - if (!apiKey) { - throw new Error("ElevenLabs API key missing"); - } - return listElevenLabsVoices({ - apiKey, - baseUrl: req.baseUrl ?? req.config?.elevenlabs.baseUrl, - }); - }, - isConfigured: ({ config }) => - Boolean(config.elevenlabs.apiKey || process.env.ELEVENLABS_API_KEY || process.env.XI_API_KEY), - synthesize: async (req) => { - const apiKey = - req.config.elevenlabs.apiKey || process.env.ELEVENLABS_API_KEY || process.env.XI_API_KEY; - if (!apiKey) { - throw new Error("ElevenLabs API key missing"); - } - const outputFormat = - req.overrides?.elevenlabs?.outputFormat ?? - (req.target === "voice-note" ? "opus_48000_64" : "mp3_44100_128"); - const audioBuffer = await elevenLabsTTS({ - text: req.text, - apiKey, - baseUrl: req.config.elevenlabs.baseUrl, - voiceId: req.overrides?.elevenlabs?.voiceId ?? req.config.elevenlabs.voiceId, - modelId: req.overrides?.elevenlabs?.modelId ?? req.config.elevenlabs.modelId, - outputFormat, - seed: req.overrides?.elevenlabs?.seed ?? req.config.elevenlabs.seed, - applyTextNormalization: - req.overrides?.elevenlabs?.applyTextNormalization ?? - req.config.elevenlabs.applyTextNormalization, - languageCode: req.overrides?.elevenlabs?.languageCode ?? req.config.elevenlabs.languageCode, - voiceSettings: { - ...req.config.elevenlabs.voiceSettings, - ...req.overrides?.elevenlabs?.voiceSettings, - }, - timeoutMs: req.config.timeoutMs, - }); - return { - audioBuffer, - outputFormat, - fileExtension: req.target === "voice-note" ? ".opus" : ".mp3", - voiceCompatible: req.target === "voice-note", - }; - }, - synthesizeTelephony: async (req) => { - const apiKey = - req.config.elevenlabs.apiKey || process.env.ELEVENLABS_API_KEY || process.env.XI_API_KEY; - if (!apiKey) { - throw new Error("ElevenLabs API key missing"); - } - const outputFormat = "pcm_22050"; - const sampleRate = 22_050; - const audioBuffer = await elevenLabsTTS({ - text: req.text, - apiKey, - baseUrl: req.config.elevenlabs.baseUrl, - voiceId: req.config.elevenlabs.voiceId, - modelId: req.config.elevenlabs.modelId, - outputFormat, - seed: req.config.elevenlabs.seed, - applyTextNormalization: req.config.elevenlabs.applyTextNormalization, - languageCode: req.config.elevenlabs.languageCode, - voiceSettings: req.config.elevenlabs.voiceSettings, - timeoutMs: req.config.timeoutMs, - }); - return { audioBuffer, outputFormat, sampleRate }; - }, - }; -} diff --git a/src/tts/providers/microsoft.ts b/src/tts/providers/microsoft.ts deleted file mode 100644 index f6c5aa8c379..00000000000 --- a/src/tts/providers/microsoft.ts +++ /dev/null @@ -1,127 +0,0 @@ -import { mkdirSync, mkdtempSync, readFileSync, rmSync } from "node:fs"; -import path from "node:path"; -import { - CHROMIUM_FULL_VERSION, - TRUSTED_CLIENT_TOKEN, - generateSecMsGecToken, -} from "node-edge-tts/dist/drm.js"; -import { resolvePreferredOpenClawTmpDir } from "../../infra/tmp-openclaw-dir.js"; -import { isVoiceCompatibleAudio } from "../../media/audio.js"; -import type { SpeechProviderPlugin } from "../../plugins/types.js"; -import type { SpeechVoiceOption } from "../provider-types.js"; -import { edgeTTS, inferEdgeExtension } from "../tts-core.js"; - -const DEFAULT_EDGE_OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3"; - -type MicrosoftVoiceListEntry = { - ShortName?: string; - FriendlyName?: string; - Locale?: string; - Gender?: string; - VoiceTag?: { - ContentCategories?: string[]; - VoicePersonalities?: string[]; - }; -}; - -function buildMicrosoftVoiceHeaders(): Record { - const major = CHROMIUM_FULL_VERSION.split(".")[0] || "0"; - return { - Authority: "speech.platform.bing.com", - Origin: "chrome-extension://jdiccldimpdaibmpdkjnbmckianbfold", - Accept: "*/*", - "User-Agent": - `Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ` + - `(KHTML, like Gecko) Chrome/${major}.0.0.0 Safari/537.36 Edg/${major}.0.0.0`, - "Sec-MS-GEC": generateSecMsGecToken(), - "Sec-MS-GEC-Version": `1-${CHROMIUM_FULL_VERSION}`, - }; -} - -function formatMicrosoftVoiceDescription(entry: MicrosoftVoiceListEntry): string | undefined { - const personalities = entry.VoiceTag?.VoicePersonalities?.filter(Boolean) ?? []; - return personalities.length > 0 ? personalities.join(", ") : undefined; -} - -export async function listMicrosoftVoices(): Promise { - const response = await fetch( - "https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list" + - `?trustedclienttoken=${TRUSTED_CLIENT_TOKEN}`, - { - headers: buildMicrosoftVoiceHeaders(), - }, - ); - if (!response.ok) { - throw new Error(`Microsoft voices API error (${response.status})`); - } - const voices = (await response.json()) as MicrosoftVoiceListEntry[]; - return Array.isArray(voices) - ? voices - .map((voice) => ({ - id: voice.ShortName?.trim() ?? "", - name: voice.FriendlyName?.trim() || voice.ShortName?.trim() || undefined, - category: voice.VoiceTag?.ContentCategories?.find((value) => value.trim().length > 0), - description: formatMicrosoftVoiceDescription(voice), - locale: voice.Locale?.trim() || undefined, - gender: voice.Gender?.trim() || undefined, - personalities: voice.VoiceTag?.VoicePersonalities?.filter( - (value): value is string => value.trim().length > 0, - ), - })) - .filter((voice) => voice.id.length > 0) - : []; -} - -export function buildMicrosoftSpeechProvider(): SpeechProviderPlugin { - return { - id: "microsoft", - label: "Microsoft", - aliases: ["edge"], - listVoices: async () => await listMicrosoftVoices(), - isConfigured: ({ config }) => config.edge.enabled, - synthesize: async (req) => { - const tempRoot = resolvePreferredOpenClawTmpDir(); - mkdirSync(tempRoot, { recursive: true, mode: 0o700 }); - const tempDir = mkdtempSync(path.join(tempRoot, "tts-microsoft-")); - let outputFormat = req.overrides?.microsoft?.outputFormat ?? req.config.edge.outputFormat; - const fallbackOutputFormat = - outputFormat !== DEFAULT_EDGE_OUTPUT_FORMAT ? DEFAULT_EDGE_OUTPUT_FORMAT : undefined; - - try { - const runEdge = async (format: string) => { - const fileExtension = inferEdgeExtension(format); - const outputPath = path.join(tempDir, `speech${fileExtension}`); - await edgeTTS({ - text: req.text, - outputPath, - config: { - ...req.config.edge, - voice: req.overrides?.microsoft?.voice ?? req.config.edge.voice, - outputFormat: format, - }, - timeoutMs: req.config.timeoutMs, - }); - const audioBuffer = readFileSync(outputPath); - return { - audioBuffer, - outputFormat: format, - fileExtension, - voiceCompatible: isVoiceCompatibleAudio({ fileName: outputPath }), - }; - }; - - try { - return await runEdge(outputFormat); - } catch (err) { - if (!fallbackOutputFormat || fallbackOutputFormat === outputFormat) { - throw err; - } - outputFormat = fallbackOutputFormat; - return await runEdge(outputFormat); - } - } finally { - rmSync(tempDir, { recursive: true, force: true }); - } - }, - }; -} diff --git a/src/tts/providers/openai.ts b/src/tts/providers/openai.ts deleted file mode 100644 index 01e5997e85c..00000000000 --- a/src/tts/providers/openai.ts +++ /dev/null @@ -1,57 +0,0 @@ -import type { SpeechProviderPlugin } from "../../plugins/types.js"; -import { OPENAI_TTS_MODELS, OPENAI_TTS_VOICES, openaiTTS } from "../tts-core.js"; - -export function buildOpenAISpeechProvider(): SpeechProviderPlugin { - return { - id: "openai", - label: "OpenAI", - models: OPENAI_TTS_MODELS, - voices: OPENAI_TTS_VOICES, - listVoices: async () => OPENAI_TTS_VOICES.map((voice) => ({ id: voice, name: voice })), - isConfigured: ({ config }) => Boolean(config.openai.apiKey || process.env.OPENAI_API_KEY), - synthesize: async (req) => { - const apiKey = req.config.openai.apiKey || process.env.OPENAI_API_KEY; - if (!apiKey) { - throw new Error("OpenAI API key missing"); - } - const responseFormat = req.target === "voice-note" ? "opus" : "mp3"; - const audioBuffer = await openaiTTS({ - text: req.text, - apiKey, - baseUrl: req.config.openai.baseUrl, - model: req.overrides?.openai?.model ?? req.config.openai.model, - voice: req.overrides?.openai?.voice ?? req.config.openai.voice, - speed: req.overrides?.openai?.speed ?? req.config.openai.speed, - instructions: req.config.openai.instructions, - responseFormat, - timeoutMs: req.config.timeoutMs, - }); - return { - audioBuffer, - outputFormat: responseFormat, - fileExtension: responseFormat === "opus" ? ".opus" : ".mp3", - voiceCompatible: req.target === "voice-note", - }; - }, - synthesizeTelephony: async (req) => { - const apiKey = req.config.openai.apiKey || process.env.OPENAI_API_KEY; - if (!apiKey) { - throw new Error("OpenAI API key missing"); - } - const outputFormat = "pcm"; - const sampleRate = 24_000; - const audioBuffer = await openaiTTS({ - text: req.text, - apiKey, - baseUrl: req.config.openai.baseUrl, - model: req.config.openai.model, - voice: req.config.openai.voice, - speed: req.config.openai.speed, - instructions: req.config.openai.instructions, - responseFormat: outputFormat, - timeoutMs: req.config.timeoutMs, - }); - return { audioBuffer, outputFormat, sampleRate }; - }, - }; -}