diff --git a/src/tts/providers/microsoft.ts b/src/tts/providers/microsoft.ts index fef369740cb..ba2511e4de6 100644 --- a/src/tts/providers/microsoft.ts +++ b/src/tts/providers/microsoft.ts @@ -96,6 +96,7 @@ export function buildMicrosoftSpeechProvider(): SpeechProviderPlugin { outputPath, config: { ...req.config.edge, + voice: req.overrides?.microsoft?.voice ?? req.config.edge.voice, outputFormat: format, }, timeoutMs: req.config.timeoutMs, diff --git a/src/tts/providers/openai.ts b/src/tts/providers/openai.ts index 9f96e9ea6e9..01e5997e85c 100644 --- a/src/tts/providers/openai.ts +++ b/src/tts/providers/openai.ts @@ -21,7 +21,7 @@ export function buildOpenAISpeechProvider(): SpeechProviderPlugin { baseUrl: req.config.openai.baseUrl, model: req.overrides?.openai?.model ?? req.config.openai.model, voice: req.overrides?.openai?.voice ?? req.config.openai.voice, - speed: req.config.openai.speed, + speed: req.overrides?.openai?.speed ?? req.config.openai.speed, instructions: req.config.openai.instructions, responseFormat, timeoutMs: req.config.timeoutMs, diff --git a/src/tts/tts.ts b/src/tts/tts.ts index 0a5aa81126e..c64dda83909 100644 --- a/src/tts/tts.ts +++ b/src/tts/tts.ts @@ -162,6 +162,7 @@ export type TtsDirectiveOverrides = { openai?: { voice?: string; model?: string; + speed?: number; }; elevenlabs?: { voiceId?: string; @@ -171,6 +172,9 @@ export type TtsDirectiveOverrides = { languageCode?: string; voiceSettings?: Partial; }; + microsoft?: { + voice?: string; + }; }; export type TtsDirectiveParseResult = { @@ -191,6 +195,17 @@ export type TtsResult = { voiceCompatible?: boolean; }; +export type TtsSynthesisResult = { + success: boolean; + audioBuffer?: Buffer; + error?: string; + latencyMs?: number; + provider?: string; + outputFormat?: string; + voiceCompatible?: boolean; + fileExtension?: string; +}; + export type TtsTelephonyResult = { success: boolean; audioBuffer?: Buffer; @@ -601,6 +616,7 @@ function resolveTtsRequestSetup(params: { cfg: OpenClawConfig; prefsPath?: string; providerOverride?: TtsProvider; + disableFallback?: boolean; }): | { config: ResolvedTtsConfig; @@ -621,7 +637,7 @@ function resolveTtsRequestSetup(params: { const provider = normalizeSpeechProviderId(params.providerOverride) ?? userProvider; return { config, - providers: resolveTtsProviderOrder(provider, params.cfg), + providers: params.disableFallback ? [provider] : resolveTtsProviderOrder(provider, params.cfg), }; } @@ -631,12 +647,44 @@ export async function textToSpeech(params: { prefsPath?: string; channel?: string; overrides?: TtsDirectiveOverrides; + disableFallback?: boolean; }): Promise { + const synthesis = await synthesizeSpeech(params); + if (!synthesis.success || !synthesis.audioBuffer || !synthesis.fileExtension) { + return buildTtsFailureResult([synthesis.error ?? "TTS conversion failed"]); + } + + const tempRoot = resolvePreferredOpenClawTmpDir(); + mkdirSync(tempRoot, { recursive: true, mode: 0o700 }); + const tempDir = mkdtempSync(path.join(tempRoot, "tts-")); + const audioPath = path.join(tempDir, `voice-${Date.now()}${synthesis.fileExtension}`); + writeFileSync(audioPath, synthesis.audioBuffer); + scheduleCleanup(tempDir); + + return { + success: true, + audioPath, + latencyMs: synthesis.latencyMs, + provider: synthesis.provider, + outputFormat: synthesis.outputFormat, + voiceCompatible: synthesis.voiceCompatible, + }; +} + +export async function synthesizeSpeech(params: { + text: string; + cfg: OpenClawConfig; + prefsPath?: string; + channel?: string; + overrides?: TtsDirectiveOverrides; + disableFallback?: boolean; +}): Promise { const setup = resolveTtsRequestSetup({ text: params.text, cfg: params.cfg, prefsPath: params.prefsPath, providerOverride: params.overrides?.provider, + disableFallback: params.disableFallback, }); if ("error" in setup) { return { success: false, error: setup.error }; @@ -667,22 +715,14 @@ export async function textToSpeech(params: { target, overrides: params.overrides, }); - const latencyMs = Date.now() - providerStart; - - const tempRoot = resolvePreferredOpenClawTmpDir(); - mkdirSync(tempRoot, { recursive: true, mode: 0o700 }); - const tempDir = mkdtempSync(path.join(tempRoot, "tts-")); - const audioPath = path.join(tempDir, `voice-${Date.now()}${synthesis.fileExtension}`); - writeFileSync(audioPath, synthesis.audioBuffer); - scheduleCleanup(tempDir); - return { success: true, - audioPath, - latencyMs, + audioBuffer: synthesis.audioBuffer, + latencyMs: Date.now() - providerStart, provider, outputFormat: synthesis.outputFormat, voiceCompatible: synthesis.voiceCompatible, + fileExtension: synthesis.fileExtension, }; } catch (err) { errors.push(formatTtsProviderError(provider, err));