import { mkdirSync, mkdtempSync, readFileSync, rmSync } from "node:fs";
import path from "node:path";
import {
  CHROMIUM_FULL_VERSION,
  TRUSTED_CLIENT_TOKEN,
  generateSecMsGecToken,
} from "node-edge-tts/dist/drm.js";
import { resolvePreferredOpenClawTmpDir } from "openclaw/plugin-sdk/llm-task";
import { isVoiceCompatibleAudio } from "openclaw/plugin-sdk/media-runtime";
import type {
  SpeechProviderConfig,
  SpeechProviderPlugin,
  SpeechVoiceOption,
} from "openclaw/plugin-sdk/speech-core";
import { edgeTTS, inferEdgeExtension } from "./tts.js";

const DEFAULT_EDGE_VOICE = "en-US-MichelleNeural";
const DEFAULT_EDGE_LANG = "en-US";
const DEFAULT_EDGE_OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3";

/** Fully-resolved provider settings, with defaults applied to required fields. */
type MicrosoftProviderConfig = {
  enabled: boolean;
  voice: string;
  lang: string;
  outputFormat: string;
  /** True when `outputFormat` came from configuration rather than the default. */
  outputFormatConfigured: boolean;
  pitch?: string;
  rate?: string;
  volume?: string;
  saveSubtitles: boolean;
  proxy?: string;
  timeoutMs?: number;
};

/** Subset of the fields read from one entry of the voices-list response. */
type MicrosoftVoiceListEntry = {
  ShortName?: string;
  FriendlyName?: string;
  Locale?: string;
  Gender?: string;
  VoiceTag?: {
    ContentCategories?: string[];
    VoicePersonalities?: string[];
  };
};

/** Returns the trimmed string, or `undefined` for non-strings and blank strings. */
function trimToUndefined(value: unknown): string | undefined {
  if (typeof value !== "string") {
    return undefined;
  }
  const trimmed = value.trim();
  return trimmed ? trimmed : undefined;
}

/** Returns `value` only when it is a real boolean. */
function asBoolean(value: unknown): boolean | undefined {
  return typeof value === "boolean" ? value : undefined;
}

/** Returns `value` only when it is a finite number (rejects `NaN` / `Infinity`). */
function asNumber(value: unknown): number | undefined {
  return typeof value === "number" && Number.isFinite(value) ? value : undefined;
}

/** Returns `value` only when it is a non-null, non-array object. */
function asObject(value: unknown): Record<string, unknown> | undefined {
  return typeof value === "object" && value !== null && !Array.isArray(value)
    ? (value as Record<string, unknown>)
    : undefined;
}

/**
 * Builds a {@link MicrosoftProviderConfig} from raw, untyped configuration.
 *
 * Settings are merged from three optional objects; later sources override
 * earlier ones key by key: `rawConfig.edge`, then `rawConfig.microsoft`, then
 * `rawConfig.providers.microsoft`.
 *
 * @param rawConfig - Untyped configuration object.
 * @returns The normalized config with defaults filled in for required fields.
 */
function normalizeMicrosoftProviderConfig(
  rawConfig: Record<string, unknown>,
): MicrosoftProviderConfig {
  const providers = asObject(rawConfig.providers);
  const rawEdge = asObject(rawConfig.edge);
  const rawMicrosoft = asObject(rawConfig.microsoft);
  const rawProvider = asObject(providers?.microsoft);
  const raw = { ...(rawEdge ?? {}), ...(rawMicrosoft ?? {}), ...(rawProvider ?? {}) };
  const outputFormat = trimToUndefined(raw.outputFormat);
  return {
    enabled: asBoolean(raw.enabled) ?? true,
    voice: trimToUndefined(raw.voice) ?? DEFAULT_EDGE_VOICE,
    lang: trimToUndefined(raw.lang) ?? DEFAULT_EDGE_LANG,
    outputFormat: outputFormat ?? DEFAULT_EDGE_OUTPUT_FORMAT,
    outputFormatConfigured: Boolean(outputFormat),
    pitch: trimToUndefined(raw.pitch),
    rate: trimToUndefined(raw.rate),
    volume: trimToUndefined(raw.volume),
    saveSubtitles: asBoolean(raw.saveSubtitles) ?? false,
    proxy: trimToUndefined(raw.proxy),
    timeoutMs: asNumber(raw.timeoutMs),
  };
}

/**
 * Re-validates an already-resolved provider config field by field, falling
 * back to the built-in defaults for anything missing or of the wrong type.
 *
 * @param config - Provider config as handed back by the plugin host.
 * @returns A fully-populated {@link MicrosoftProviderConfig}.
 */
function readMicrosoftProviderConfig(config: SpeechProviderConfig): MicrosoftProviderConfig {
  const defaults = normalizeMicrosoftProviderConfig({});
  return {
    enabled: asBoolean(config.enabled) ?? defaults.enabled,
    voice: trimToUndefined(config.voice) ?? defaults.voice,
    lang: trimToUndefined(config.lang) ?? defaults.lang,
    outputFormat: trimToUndefined(config.outputFormat) ?? defaults.outputFormat,
    outputFormatConfigured:
      asBoolean(config.outputFormatConfigured) ?? defaults.outputFormatConfigured,
    pitch: trimToUndefined(config.pitch) ?? defaults.pitch,
    rate: trimToUndefined(config.rate) ?? defaults.rate,
    volume: trimToUndefined(config.volume) ?? defaults.volume,
    saveSubtitles: asBoolean(config.saveSubtitles) ?? defaults.saveSubtitles,
    proxy: trimToUndefined(config.proxy) ?? defaults.proxy,
    timeoutMs: asNumber(config.timeoutMs) ?? defaults.timeoutMs,
  };
}

/**
 * Builds the HTTP headers sent with the voices-list request. The Chromium
 * version and the `Sec-MS-GEC` token are supplied by `node-edge-tts`.
 */
function buildMicrosoftVoiceHeaders(): Record<string, string> {
  // Only the major version is used in the User-Agent; "0" if it cannot be parsed.
  const major = CHROMIUM_FULL_VERSION.split(".")[0] || "0";
  return {
    Authority: "speech.platform.bing.com",
    Origin: "chrome-extension://jdiccldimpdaibmpdkjnbmckianbfold",
    Accept: "*/*",
    "User-Agent":
      `Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ` +
      `(KHTML, like Gecko) Chrome/${major}.0.0.0 Safari/537.36 Edg/${major}.0.0.0`,
    "Sec-MS-GEC": generateSecMsGecToken(),
    "Sec-MS-GEC-Version": `1-${CHROMIUM_FULL_VERSION}`,
  };
}

/** Joins a voice's non-empty personalities with ", ", or returns `undefined` if none. */
function formatMicrosoftVoiceDescription(entry: MicrosoftVoiceListEntry): string | undefined {
  const personalities = entry.VoiceTag?.VoicePersonalities?.filter(Boolean) ?? [];
  return personalities.length > 0 ? personalities.join(", ") : undefined;
}

/**
 * Fetches the list of available voices from the Microsoft read-aloud endpoint.
 *
 * Entries without a usable `ShortName` are dropped. A non-array response body
 * yields an empty list.
 *
 * @returns The voices mapped to the plugin SDK's voice-option shape.
 * @throws Error when the HTTP response status is not OK.
 */
export async function listMicrosoftVoices(): Promise<SpeechVoiceOption[]> {
  const response = await fetch(
    "https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list" +
      `?trustedclienttoken=${TRUSTED_CLIENT_TOKEN}`,
    {
      headers: buildMicrosoftVoiceHeaders(),
    },
  );
  if (!response.ok) {
    throw new Error(`Microsoft voices API error (${response.status})`);
  }
  const voices = (await response.json()) as MicrosoftVoiceListEntry[];
  if (!Array.isArray(voices)) {
    return [];
  }
  // The payload is untrusted JSON: check that tag entries really are strings
  // before calling string methods on them.
  const isNonBlankString = (value: unknown): value is string =>
    typeof value === "string" && value.trim().length > 0;
  return voices
    .map((voice) => ({
      id: voice.ShortName?.trim() ?? "",
      name: voice.FriendlyName?.trim() || voice.ShortName?.trim() || undefined,
      category: voice.VoiceTag?.ContentCategories?.find(isNonBlankString),
      description: formatMicrosoftVoiceDescription(voice),
      locale: voice.Locale?.trim() || undefined,
      gender: voice.Gender?.trim() || undefined,
      personalities: voice.VoiceTag?.VoicePersonalities?.filter(isNonBlankString),
    }))
    .filter((voice) => voice.id.length > 0);
}

/**
 * Creates the "microsoft" speech provider plugin (alias "edge").
 *
 * @returns The plugin object with config resolution, voice listing and
 *   synthesis hooks.
 */
export function buildMicrosoftSpeechProvider(): SpeechProviderPlugin {
  return {
    id: "microsoft",
    label: "Microsoft",
    aliases: ["edge"],
    autoSelectOrder: 30,
    resolveConfig: ({ rawConfig }) => normalizeMicrosoftProviderConfig(rawConfig),
    resolveTalkConfig: ({ baseTtsConfig, talkProviderConfig }) => {
      const base = normalizeMicrosoftProviderConfig(baseTtsConfig);
      const voice = trimToUndefined(talkProviderConfig.voiceId);
      const lang = trimToUndefined(talkProviderConfig.languageCode);
      const outputFormat = trimToUndefined(talkProviderConfig.outputFormat);
      const pitch = trimToUndefined(talkProviderConfig.pitch);
      const rate = trimToUndefined(talkProviderConfig.rate);
      const volume = trimToUndefined(talkProviderConfig.volume);
      const proxy = trimToUndefined(talkProviderConfig.proxy);
      const timeoutMs = asNumber(talkProviderConfig.timeoutMs);
      // Only keys with a usable talk-level value are spread in, so unset
      // overrides never clobber the base TTS settings with `undefined`.
      return {
        ...base,
        enabled: true,
        ...(voice == null ? {} : { voice }),
        ...(lang == null ? {} : { lang }),
        ...(outputFormat == null ? {} : { outputFormat }),
        ...(pitch == null ? {} : { pitch }),
        ...(rate == null ? {} : { rate }),
        ...(volume == null ? {} : { volume }),
        ...(proxy == null ? {} : { proxy }),
        ...(timeoutMs == null ? {} : { timeoutMs }),
      };
    },
    resolveTalkOverrides: ({ params }) => {
      const voice = trimToUndefined(params.voiceId);
      const outputFormat = trimToUndefined(params.outputFormat);
      return {
        ...(voice == null ? {} : { voice }),
        ...(outputFormat == null ? {} : { outputFormat }),
      };
    },
    listVoices: async () => await listMicrosoftVoices(),
    isConfigured: ({ providerConfig }) => readMicrosoftProviderConfig(providerConfig).enabled,
    synthesize: async (req) => {
      const config = readMicrosoftProviderConfig(req.providerConfig);
      const tempRoot = resolvePreferredOpenClawTmpDir();
      mkdirSync(tempRoot, { recursive: true, mode: 0o700 });
      const tempDir = mkdtempSync(path.join(tempRoot, "tts-microsoft-"));
      const overrideVoice = trimToUndefined(req.providerOverrides?.voice);
      const requestedFormat =
        trimToUndefined(req.providerOverrides?.outputFormat) ?? config.outputFormat;
      // A fallback exists only when the requested format is not already the default.
      const fallbackFormat =
        requestedFormat !== DEFAULT_EDGE_OUTPUT_FORMAT ? DEFAULT_EDGE_OUTPUT_FORMAT : undefined;
      try {
        // Synthesizes into the temp dir and reads the result back into memory.
        const runEdge = async (format: string) => {
          const fileExtension = inferEdgeExtension(format);
          const outputPath = path.join(tempDir, `speech${fileExtension}`);
          await edgeTTS({
            text: req.text,
            outputPath,
            config: {
              ...config,
              voice: overrideVoice ?? config.voice,
              outputFormat: format,
            },
            timeoutMs: req.timeoutMs,
          });
          const audioBuffer = readFileSync(outputPath);
          return {
            audioBuffer,
            outputFormat: format,
            fileExtension,
            voiceCompatible: isVoiceCompatibleAudio({ fileName: outputPath }),
          };
        };
        try {
          return await runEdge(requestedFormat);
        } catch (error) {
          if (!fallbackFormat) {
            throw error;
          }
          // Retry once with the default output format.
          return await runEdge(fallbackFormat);
        }
      } finally {
        // Always remove the per-request temp dir, on success or failure.
        rmSync(tempDir, { recursive: true, force: true });
      }
    },
  };
}