diff --git a/CHANGELOG.md b/CHANGELOG.md index bf0fed0a83f..acd74411bdb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,7 @@ Docs: https://docs.openclaw.ai - Matrix/history: add optional room history context for Matrix group triggers via `channels.matrix.historyLimit`, with per-agent watermarks and retry-safe snapshots so failed trigger retries do not drift into newer room messages. (#57022) thanks @chain710. - Diffs: skip unused viewer-versus-file SSR preload work so `diffs` view-only and file-only runs do less render work while keeping mode outputs aligned. (#57909) thanks @gumadeiras. - Matrix/threads: add per-DM `threadReplies` overrides and keep thread session isolation aligned with the effective room or DM thread policy from the triggering message onward. (#57995) thanks @teconomix. +- TTS: add structured provider diagnostics and fallback attempt analytics. (#57954) Thanks @joshavant. ### Fixes @@ -122,6 +123,7 @@ Docs: https://docs.openclaw.ai - Exec/env: block Python package index override variables from request-scoped host exec environment sanitization so package fetches cannot be redirected through a caller-supplied index. Thanks @nexrin and @vincentkoc. - Telegram/audio: transcode Telegram voice-note `.ogg` attachments before the local `whisper-cli` auto fallback runs, and keep mention-preflight transcription enabled in auto mode when `tools.media.audio` is unset. - Matrix/direct rooms: recover fresh auto-joined 1:1 DMs without eagerly persisting invite-only `m.direct` mappings, while keeping named, aliased, and explicitly configured rooms on the room path. (#58024) Thanks @gumadeiras. +- TTS: restore 2026.3.28 schema compatibility and fallback observability. (#57953) Thanks @joshavant. 
## 2026.3.28 diff --git a/docs/tools/tts.md b/docs/tools/tts.md index f84ec7efef8..f058c150117 100644 --- a/docs/tools/tts.md +++ b/docs/tools/tts.md @@ -395,6 +395,8 @@ Notes: - `/tts status` includes fallback visibility for the latest attempt: - success fallback: `Fallback: -> ` plus `Attempts: ...` - failure: `Error: ...` plus `Attempts: ...` + - detailed diagnostics: `Attempt details: provider:outcome(reasonCode) latency` for each attempt, comma-separated (a successful attempt shows `ok` as its reason code, and skipped attempts have no latency) +- OpenAI and ElevenLabs API failures now include the parsed provider error detail and request id (when returned by the provider), which are surfaced in TTS errors/logs. ## Agent tool diff --git a/docs/tts.md b/docs/tts.md index 334cd76c7f0..5fea8967ce2 100644 --- a/docs/tts.md +++ b/docs/tts.md @@ -395,6 +395,8 @@ Notes: - `/tts status` includes fallback visibility for the latest attempt: - success fallback: `Fallback: -> ` plus `Attempts: ...` - failure: `Error: ...` plus `Attempts: ...` + - detailed diagnostics: `Attempt details: provider:outcome(reasonCode) latency` for each attempt, comma-separated (a successful attempt shows `ok` as its reason code, and skipped attempts have no latency) +- OpenAI and ElevenLabs API failures now include the parsed provider error detail and request id (when returned by the provider), which are surfaced in TTS errors/logs. 
## Agent tool diff --git a/extensions/elevenlabs/tts.test.ts b/extensions/elevenlabs/tts.test.ts new file mode 100644 index 00000000000..93fe4331920 --- /dev/null +++ b/extensions/elevenlabs/tts.test.ts @@ -0,0 +1,133 @@ +import { afterEach, describe, expect, it, vi } from "vitest"; +import { elevenLabsTTS } from "./tts.js"; + +describe("elevenlabs tts diagnostics", () => { + const originalFetch = globalThis.fetch; + + function createStreamingErrorResponse(params: { + status: number; + chunkCount: number; + chunkSize: number; + byte: number; + }): { response: Response; getReadCount: () => number } { + let reads = 0; + const stream = new ReadableStream({ + pull(controller) { + if (reads >= params.chunkCount) { + controller.close(); + return; + } + reads += 1; + controller.enqueue(new Uint8Array(params.chunkSize).fill(params.byte)); + }, + }); + return { + response: new Response(stream, { status: params.status }), + getReadCount: () => reads, + }; + } + + afterEach(() => { + globalThis.fetch = originalFetch; + vi.restoreAllMocks(); + }); + + it("includes parsed provider detail and request id for JSON API errors", async () => { + const fetchMock = vi.fn( + async () => + new Response( + JSON.stringify({ + detail: { + message: "Quota exceeded", + status: "quota_exceeded", + }, + }), + { + status: 429, + headers: { + "Content-Type": "application/json", + "x-request-id": "el_req_456", + }, + }, + ), + ); + globalThis.fetch = fetchMock as unknown as typeof fetch; + + await expect( + elevenLabsTTS({ + text: "hello", + apiKey: "test-key", + baseUrl: "https://api.elevenlabs.io", + voiceId: "pMsXgVXv3BLzUgSXRplE", + modelId: "eleven_multilingual_v2", + outputFormat: "mp3_44100_128", + voiceSettings: { + stability: 0.5, + similarityBoost: 0.75, + style: 0, + useSpeakerBoost: true, + speed: 1.0, + }, + timeoutMs: 5_000, + }), + ).rejects.toThrow( + "ElevenLabs API error (429): Quota exceeded [code=quota_exceeded] [request_id=el_req_456]", + ); + }); + + it("falls back to raw 
body text when the error body is non-JSON", async () => { + const fetchMock = vi.fn(async () => new Response("service unavailable", { status: 503 })); + globalThis.fetch = fetchMock as unknown as typeof fetch; + + await expect( + elevenLabsTTS({ + text: "hello", + apiKey: "test-key", + baseUrl: "https://api.elevenlabs.io", + voiceId: "pMsXgVXv3BLzUgSXRplE", + modelId: "eleven_multilingual_v2", + outputFormat: "mp3_44100_128", + voiceSettings: { + stability: 0.5, + similarityBoost: 0.75, + style: 0, + useSpeakerBoost: true, + speed: 1.0, + }, + timeoutMs: 5_000, + }), + ).rejects.toThrow("ElevenLabs API error (503): service unavailable"); + }); + + it("caps streamed non-JSON error reads instead of consuming full response bodies", async () => { + const streamed = createStreamingErrorResponse({ + status: 503, + chunkCount: 200, + chunkSize: 1024, + byte: 121, + }); + const fetchMock = vi.fn(async () => streamed.response); + globalThis.fetch = fetchMock as unknown as typeof fetch; + + await expect( + elevenLabsTTS({ + text: "hello", + apiKey: "test-key", + baseUrl: "https://api.elevenlabs.io", + voiceId: "pMsXgVXv3BLzUgSXRplE", + modelId: "eleven_multilingual_v2", + outputFormat: "mp3_44100_128", + voiceSettings: { + stability: 0.5, + similarityBoost: 0.75, + style: 0, + useSpeakerBoost: true, + speed: 1.0, + }, + timeoutMs: 5_000, + }), + ).rejects.toThrow("ElevenLabs API error (503)"); + + expect(streamed.getReadCount()).toBeLessThan(200); + }); +}); diff --git a/extensions/elevenlabs/tts.ts b/extensions/elevenlabs/tts.ts index ef69c1c6587..bfa5b9c8430 100644 --- a/extensions/elevenlabs/tts.ts +++ b/extensions/elevenlabs/tts.ts @@ -1,8 +1,12 @@ import { + asObject, normalizeApplyTextNormalization, normalizeLanguageCode, normalizeSeed, + readResponseTextLimited, requireInRange, + trimToUndefined, + truncateErrorDetail, } from "openclaw/plugin-sdk/speech"; const DEFAULT_ELEVENLABS_BASE_URL = "https://api.elevenlabs.io"; @@ -19,6 +23,45 @@ function 
normalizeElevenLabsBaseUrl(baseUrl?: string): string { return trimmed.replace(/\/+$/, ""); } +function formatElevenLabsErrorPayload(payload: unknown): string | undefined { + const root = asObject(payload); + if (!root) { + return undefined; + } + const detailObject = asObject(root.detail); + const message = + trimToUndefined(root.message) ?? + trimToUndefined(detailObject?.message) ?? + trimToUndefined(detailObject?.detail) ?? + trimToUndefined(root.error); + const code = + trimToUndefined(root.code) ?? + trimToUndefined(detailObject?.code) ?? + trimToUndefined(detailObject?.status); + if (message && code) { + return `${truncateErrorDetail(message)} [code=${code}]`; + } + if (message) { + return truncateErrorDetail(message); + } + if (code) { + return `[code=${code}]`; + } + return undefined; +} + +async function extractElevenLabsErrorDetail(response: Response): Promise { + const rawBody = trimToUndefined(await readResponseTextLimited(response)); + if (!rawBody) { + return undefined; + } + try { + return formatElevenLabsErrorPayload(JSON.parse(rawBody)) ?? truncateErrorDetail(rawBody); + } catch { + return truncateErrorDetail(rawBody); + } +} + function assertElevenLabsVoiceSettings(settings: { stability: number; similarityBoost: number; @@ -106,7 +149,15 @@ export async function elevenLabsTTS(params: { }); if (!response.ok) { - throw new Error(`ElevenLabs API error (${response.status})`); + const detail = await extractElevenLabsErrorDetail(response); + const requestId = + trimToUndefined(response.headers.get("x-request-id")) ?? + trimToUndefined(response.headers.get("request-id")); + throw new Error( + `ElevenLabs API error (${response.status})` + + (detail ? `: ${detail}` : "") + + (requestId ? 
` [request_id=${requestId}]` : ""), + ); } return Buffer.from(await response.arrayBuffer()); diff --git a/extensions/openai/tts.test.ts b/extensions/openai/tts.test.ts index 4b5797f0eb7..4ad945cf133 100644 --- a/extensions/openai/tts.test.ts +++ b/extensions/openai/tts.test.ts @@ -1,13 +1,21 @@ -import { describe, expect, it } from "vitest"; +import { afterEach, describe, expect, it, vi } from "vitest"; import { isValidOpenAIModel, isValidOpenAIVoice, OPENAI_TTS_MODELS, OPENAI_TTS_VOICES, + openaiTTS, resolveOpenAITtsInstructions, } from "./tts.js"; describe("openai tts", () => { + const originalFetch = globalThis.fetch; + + afterEach(() => { + globalThis.fetch = originalFetch; + vi.restoreAllMocks(); + }); + describe("isValidOpenAIVoice", () => { it("accepts all valid OpenAI voices including newer additions", () => { for (const voice of OPENAI_TTS_VOICES) { @@ -70,4 +78,110 @@ describe("openai tts", () => { expect(resolveOpenAITtsInstructions("gpt-4o-mini-tts", " ")).toBeUndefined(); }); }); + + describe("openaiTTS diagnostics", () => { + function createStreamingErrorResponse(params: { + status: number; + chunkCount: number; + chunkSize: number; + byte: number; + }): { response: Response; getReadCount: () => number } { + let reads = 0; + const stream = new ReadableStream({ + pull(controller) { + if (reads >= params.chunkCount) { + controller.close(); + return; + } + reads += 1; + controller.enqueue(new Uint8Array(params.chunkSize).fill(params.byte)); + }, + }); + return { + response: new Response(stream, { status: params.status }), + getReadCount: () => reads, + }; + } + + it("includes parsed provider detail and request id for JSON API errors", async () => { + const fetchMock = vi.fn( + async () => + new Response( + JSON.stringify({ + error: { + message: "Invalid API key", + type: "invalid_request_error", + code: "invalid_api_key", + }, + }), + { + status: 401, + headers: { + "Content-Type": "application/json", + "x-request-id": "req_123", + }, + }, + ), + ); + 
globalThis.fetch = fetchMock as unknown as typeof fetch; + + await expect( + openaiTTS({ + text: "hello", + apiKey: "bad-key", + baseUrl: "https://api.openai.com/v1", + model: "gpt-4o-mini-tts", + voice: "alloy", + responseFormat: "mp3", + timeoutMs: 5_000, + }), + ).rejects.toThrow( + "OpenAI TTS API error (401): Invalid API key [type=invalid_request_error, code=invalid_api_key] [request_id=req_123]", + ); + }); + + it("falls back to raw body text when the error body is non-JSON", async () => { + const fetchMock = vi.fn( + async () => new Response("temporary upstream outage", { status: 503 }), + ); + globalThis.fetch = fetchMock as unknown as typeof fetch; + + await expect( + openaiTTS({ + text: "hello", + apiKey: "test-key", + baseUrl: "https://api.openai.com/v1", + model: "gpt-4o-mini-tts", + voice: "alloy", + responseFormat: "mp3", + timeoutMs: 5_000, + }), + ).rejects.toThrow("OpenAI TTS API error (503): temporary upstream outage"); + }); + + it("caps streamed non-JSON error reads instead of consuming full response bodies", async () => { + const streamed = createStreamingErrorResponse({ + status: 503, + chunkCount: 200, + chunkSize: 1024, + byte: 120, + }); + const fetchMock = vi.fn(async () => streamed.response); + globalThis.fetch = fetchMock as unknown as typeof fetch; + + await expect( + openaiTTS({ + text: "hello", + apiKey: "test-key", + baseUrl: "https://api.openai.com/v1", + model: "gpt-4o-mini-tts", + voice: "alloy", + responseFormat: "mp3", + timeoutMs: 5_000, + }), + ).rejects.toThrow("OpenAI TTS API error (503)"); + + expect(streamed.getReadCount()).toBeLessThan(200); + }); + }); }); diff --git a/extensions/openai/tts.ts b/extensions/openai/tts.ts index edc23baa674..405be74062e 100644 --- a/extensions/openai/tts.ts +++ b/extensions/openai/tts.ts @@ -1,3 +1,10 @@ +import { + asObject, + readResponseTextLimited, + trimToUndefined, + truncateErrorDetail, +} from "openclaw/plugin-sdk/speech"; + export const DEFAULT_OPENAI_BASE_URL = 
"https://api.openai.com/v1"; export const OPENAI_TTS_MODELS = ["gpt-4o-mini-tts", "tts-1", "tts-1-hd"] as const; @@ -58,6 +65,45 @@ export function resolveOpenAITtsInstructions( return next && model.includes("gpt-4o-mini-tts") ? next : undefined; } +function formatOpenAiErrorPayload(payload: unknown): string | undefined { + const root = asObject(payload); + const subject = asObject(root?.error) ?? root; + if (!subject) { + return undefined; + } + const message = + trimToUndefined(subject.message) ?? + trimToUndefined(subject.detail) ?? + trimToUndefined(root?.message); + const type = trimToUndefined(subject.type); + const code = trimToUndefined(subject.code); + const metadata = [type ? `type=${type}` : undefined, code ? `code=${code}` : undefined] + .filter((value): value is string => Boolean(value)) + .join(", "); + if (message && metadata) { + return `${truncateErrorDetail(message)} [${metadata}]`; + } + if (message) { + return truncateErrorDetail(message); + } + if (metadata) { + return `[${metadata}]`; + } + return undefined; +} + +async function extractOpenAiErrorDetail(response: Response): Promise { + const rawBody = trimToUndefined(await readResponseTextLimited(response)); + if (!rawBody) { + return undefined; + } + try { + return formatOpenAiErrorPayload(JSON.parse(rawBody)) ?? truncateErrorDetail(rawBody); + } catch { + return truncateErrorDetail(rawBody); + } +} + export async function openaiTTS(params: { text: string; apiKey: string; @@ -102,7 +148,15 @@ export async function openaiTTS(params: { }); if (!response.ok) { - throw new Error(`OpenAI TTS API error (${response.status})`); + const detail = await extractOpenAiErrorDetail(response); + const requestId = + trimToUndefined(response.headers.get("x-request-id")) ?? + trimToUndefined(response.headers.get("request-id")); + throw new Error( + `OpenAI TTS API error (${response.status})` + + (detail ? `: ${detail}` : "") + + (requestId ? 
` [request_id=${requestId}]` : ""), + ); } return Buffer.from(await response.arrayBuffer()); diff --git a/extensions/speech-core/src/tts.ts b/extensions/speech-core/src/tts.ts index c7e5741e926..047c2823754 100644 --- a/extensions/speech-core/src/tts.ts +++ b/extensions/speech-core/src/tts.ts @@ -74,6 +74,22 @@ type TtsUserPrefs = { export type ResolvedTtsModelOverrides = SpeechModelOverridePolicy; +export type TtsAttemptReasonCode = + | "success" + | "no_provider_registered" + | "not_configured" + | "unsupported_for_telephony" + | "timeout" + | "provider_error"; + +export type TtsProviderAttempt = { + provider: string; + outcome: "success" | "skipped" | "failed"; + reasonCode: TtsAttemptReasonCode; + latencyMs?: number; + error?: string; +}; + export type TtsResult = { success: boolean; audioPath?: string; @@ -82,6 +98,7 @@ export type TtsResult = { provider?: string; fallbackFrom?: string; attemptedProviders?: string[]; + attempts?: TtsProviderAttempt[]; outputFormat?: string; voiceCompatible?: boolean; }; @@ -94,6 +111,7 @@ export type TtsSynthesisResult = { provider?: string; fallbackFrom?: string; attemptedProviders?: string[]; + attempts?: TtsProviderAttempt[]; outputFormat?: string; voiceCompatible?: boolean; fileExtension?: string; @@ -107,6 +125,7 @@ export type TtsTelephonyResult = { provider?: string; fallbackFrom?: string; attemptedProviders?: string[]; + attempts?: TtsProviderAttempt[]; outputFormat?: string; sampleRate?: number; }; @@ -119,6 +138,7 @@ type TtsStatusEntry = { provider?: string; fallbackFrom?: string; attemptedProviders?: string[]; + attempts?: TtsProviderAttempt[]; latencyMs?: number; error?: string; }; @@ -556,25 +576,46 @@ function sanitizeTtsErrorForLog(err: unknown): string { function buildTtsFailureResult( errors: string[], attemptedProviders?: string[], -): { success: false; error: string; attemptedProviders?: string[] } { + attempts?: TtsProviderAttempt[], +): { + success: false; + error: string; + attemptedProviders?: string[]; 
+ attempts?: TtsProviderAttempt[]; +} { return { success: false, error: `TTS conversion failed: ${errors.join("; ") || "no providers available"}`, attemptedProviders, + attempts, }; } +type TtsProviderReadyResolution = + | { + kind: "ready"; + provider: NonNullable>; + providerConfig: SpeechProviderConfig; + } + | { + kind: "skip"; + reasonCode: "no_provider_registered" | "not_configured" | "unsupported_for_telephony"; + message: string; + }; + function resolveReadySpeechProvider(params: { provider: TtsProvider; cfg: OpenClawConfig; config: ResolvedTtsConfig; - errors: string[]; requireTelephony?: boolean; -}): NonNullable> | null { +}): TtsProviderReadyResolution { const resolvedProvider = getSpeechProvider(params.provider, params.cfg); if (!resolvedProvider) { - params.errors.push(`${params.provider}: no provider registered`); - return null; + return { + kind: "skip", + reasonCode: "no_provider_registered", + message: `${params.provider}: no provider registered`, + }; } const providerConfig = getResolvedSpeechProviderConfig( params.config, @@ -588,14 +629,24 @@ function resolveReadySpeechProvider(params: { timeoutMs: params.config.timeoutMs, }) ) { - params.errors.push(`${params.provider}: not configured`); - return null; + return { + kind: "skip", + reasonCode: "not_configured", + message: `${params.provider}: not configured`, + }; } if (params.requireTelephony && !resolvedProvider.synthesizeTelephony) { - params.errors.push(`${params.provider}: unsupported for telephony`); - return null; + return { + kind: "skip", + reasonCode: "unsupported_for_telephony", + message: `${params.provider}: unsupported for telephony`, + }; } - return resolvedProvider; + return { + kind: "ready", + provider: resolvedProvider, + providerConfig, + }; } function resolveTtsRequestSetup(params: { @@ -639,10 +690,12 @@ export async function textToSpeech(params: { }): Promise { const synthesis = await synthesizeSpeech(params); if (!synthesis.success || !synthesis.audioBuffer || 
!synthesis.fileExtension) { - return buildTtsFailureResult( - [synthesis.error ?? "TTS conversion failed"], - synthesis.attemptedProviders, - ); + return { + success: false, + error: synthesis.error ?? "TTS conversion failed", + attemptedProviders: synthesis.attemptedProviders, + attempts: synthesis.attempts, + }; } const tempRoot = resolvePreferredOpenClawTmpDir(); @@ -659,6 +712,7 @@ export async function textToSpeech(params: { provider: synthesis.provider, fallbackFrom: synthesis.fallbackFrom, attemptedProviders: synthesis.attemptedProviders, + attempts: synthesis.attempts, outputFormat: synthesis.outputFormat, voiceCompatible: synthesis.voiceCompatible, }; @@ -689,6 +743,7 @@ export async function synthesizeSpeech(params: { const errors: string[] = []; const attemptedProviders: string[] = []; + const attempts: TtsProviderAttempt[] = []; const primaryProvider = providers[0]; logVerbose( `TTS: starting with provider ${primaryProvider}, fallbacks: ${providers.slice(1).join(", ") || "none"}`, @@ -702,34 +757,57 @@ export async function synthesizeSpeech(params: { provider, cfg: params.cfg, config, - errors, }); - if (!resolvedProvider) { - logVerbose(`TTS: provider ${provider} skipped (${errors[errors.length - 1]})`); + if (resolvedProvider.kind === "skip") { + errors.push(resolvedProvider.message); + attempts.push({ + provider, + outcome: "skipped", + reasonCode: resolvedProvider.reasonCode, + error: resolvedProvider.message, + }); + logVerbose(`TTS: provider ${provider} skipped (${resolvedProvider.message})`); continue; } - const synthesis = await resolvedProvider.synthesize({ + const synthesis = await resolvedProvider.provider.synthesize({ text: params.text, cfg: params.cfg, - providerConfig: getResolvedSpeechProviderConfig(config, resolvedProvider.id, params.cfg), + providerConfig: resolvedProvider.providerConfig, target, - providerOverrides: params.overrides?.providerOverrides?.[resolvedProvider.id], + providerOverrides: 
params.overrides?.providerOverrides?.[resolvedProvider.provider.id], timeoutMs: config.timeoutMs, }); + const latencyMs = Date.now() - providerStart; + attempts.push({ + provider, + outcome: "success", + reasonCode: "success", + latencyMs, + }); return { success: true, audioBuffer: synthesis.audioBuffer, - latencyMs: Date.now() - providerStart, + latencyMs, provider, fallbackFrom: provider !== primaryProvider ? primaryProvider : undefined, attemptedProviders, + attempts, outputFormat: synthesis.outputFormat, voiceCompatible: synthesis.voiceCompatible, fileExtension: synthesis.fileExtension, }; } catch (err) { const errorMsg = formatTtsProviderError(provider, err); + const latencyMs = Date.now() - providerStart; errors.push(errorMsg); + attempts.push({ + provider, + outcome: "failed", + reasonCode: + err instanceof Error && err.name === "AbortError" ? "timeout" : "provider_error", + latencyMs, + error: errorMsg, + }); const rawError = sanitizeTtsErrorForLog(err); if (provider === primaryProvider) { const hasFallbacks = providers.length > 1; @@ -742,7 +820,7 @@ export async function synthesizeSpeech(params: { } } - return buildTtsFailureResult(errors, attemptedProviders); + return buildTtsFailureResult(errors, attemptedProviders, attempts); } export async function textToSpeechTelephony(params: { @@ -762,7 +840,11 @@ export async function textToSpeechTelephony(params: { const { config, providers } = setup; const errors: string[] = []; const attemptedProviders: string[] = []; + const attempts: TtsProviderAttempt[] = []; const primaryProvider = providers[0]; + logVerbose( + `TTS telephony: starting with provider ${primaryProvider}, fallbacks: ${providers.slice(1).join(", ") || "none"}`, + ); for (const provider of providers) { attemptedProviders.push(provider); @@ -772,35 +854,72 @@ export async function textToSpeechTelephony(params: { provider, cfg: params.cfg, config, - errors, requireTelephony: true, }); - if (!resolvedProvider?.synthesizeTelephony) { + if 
(resolvedProvider.kind === "skip") { + errors.push(resolvedProvider.message); + attempts.push({ + provider, + outcome: "skipped", + reasonCode: resolvedProvider.reasonCode, + error: resolvedProvider.message, + }); + logVerbose(`TTS telephony: provider ${provider} skipped (${resolvedProvider.message})`); continue; } - const synthesis = await resolvedProvider.synthesizeTelephony({ + const synthesizeTelephony = resolvedProvider.provider.synthesizeTelephony as NonNullable< + typeof resolvedProvider.provider.synthesizeTelephony + >; + const synthesis = await synthesizeTelephony({ text: params.text, cfg: params.cfg, - providerConfig: getResolvedSpeechProviderConfig(config, resolvedProvider.id, params.cfg), + providerConfig: resolvedProvider.providerConfig, timeoutMs: config.timeoutMs, }); + const latencyMs = Date.now() - providerStart; + attempts.push({ + provider, + outcome: "success", + reasonCode: "success", + latencyMs, + }); return { success: true, audioBuffer: synthesis.audioBuffer, - latencyMs: Date.now() - providerStart, + latencyMs, provider, fallbackFrom: provider !== primaryProvider ? primaryProvider : undefined, attemptedProviders, + attempts, outputFormat: synthesis.outputFormat, sampleRate: synthesis.sampleRate, }; } catch (err) { - errors.push(formatTtsProviderError(provider, err)); + const errorMsg = formatTtsProviderError(provider, err); + const latencyMs = Date.now() - providerStart; + errors.push(errorMsg); + attempts.push({ + provider, + outcome: "failed", + reasonCode: + err instanceof Error && err.name === "AbortError" ? "timeout" : "provider_error", + latencyMs, + error: errorMsg, + }); + const rawError = sanitizeTtsErrorForLog(err); + if (provider === primaryProvider) { + const hasFallbacks = providers.length > 1; + logVerbose( + `TTS telephony: primary provider ${provider} failed (${rawError})${hasFallbacks ? "; trying fallback providers." 
: "; no fallback providers configured."}`, + ); + } else { + logVerbose(`TTS telephony: ${provider} failed (${rawError}); trying next provider.`); + } } } - return buildTtsFailureResult(errors, attemptedProviders); + return buildTtsFailureResult(errors, attemptedProviders, attempts); } export async function listSpeechVoices(params: { @@ -969,6 +1088,7 @@ export async function maybeApplyTtsToPayload(params: { provider: result.provider, fallbackFrom: result.fallbackFrom, attemptedProviders: result.attemptedProviders, + attempts: result.attempts, latencyMs: result.latencyMs, }; @@ -988,6 +1108,7 @@ export async function maybeApplyTtsToPayload(params: { textLength: text.length, summarized: wasSummarized, attemptedProviders: result.attemptedProviders, + attempts: result.attempts, error: result.error, }; diff --git a/src/auto-reply/reply/commands-tts.test.ts b/src/auto-reply/reply/commands-tts.test.ts index 962db9ca942..cfd00cabce3 100644 --- a/src/auto-reply/reply/commands-tts.test.ts +++ b/src/auto-reply/reply/commands-tts.test.ts @@ -65,6 +65,20 @@ describe("handleTtsCommands status fallback reporting", () => { provider: "microsoft", fallbackFrom: "elevenlabs", attemptedProviders: ["elevenlabs", "microsoft"], + attempts: [ + { + provider: "elevenlabs", + outcome: "failed", + reasonCode: "provider_error", + latencyMs: 73, + }, + { + provider: "microsoft", + outcome: "success", + reasonCode: "success", + latencyMs: 420, + }, + ], latencyMs: 420, }); @@ -72,6 +86,9 @@ describe("handleTtsCommands status fallback reporting", () => { expect(result?.shouldContinue).toBe(false); expect(result?.reply?.text).toContain("Fallback: elevenlabs -> microsoft"); expect(result?.reply?.text).toContain("Attempts: elevenlabs -> microsoft"); + expect(result?.reply?.text).toContain( + "Attempt details: elevenlabs:failed(provider_error) 73ms, microsoft:success(ok) 420ms", + ); }); it("shows attempted provider chain for failed attempts", async () => { @@ -82,6 +99,14 @@ 
describe("handleTtsCommands status fallback reporting", () => { summarized: false, error: "TTS conversion failed", attemptedProviders: ["elevenlabs", "microsoft"], + attempts: [ + { + provider: "elevenlabs", + outcome: "failed", + reasonCode: "timeout", + latencyMs: 999, + }, + ], latencyMs: 420, }); @@ -89,6 +114,7 @@ describe("handleTtsCommands status fallback reporting", () => { expect(result?.shouldContinue).toBe(false); expect(result?.reply?.text).toContain("Error: TTS conversion failed"); expect(result?.reply?.text).toContain("Attempts: elevenlabs -> microsoft"); + expect(result?.reply?.text).toContain("Attempt details: elevenlabs:failed(timeout) 999ms"); }); it("persists fallback metadata from /tts audio and renders it in /tts status", async () => { @@ -103,6 +129,20 @@ describe("handleTtsCommands status fallback reporting", () => { provider: "microsoft", fallbackFrom: "elevenlabs", attemptedProviders: ["elevenlabs", "microsoft"], + attempts: [ + { + provider: "elevenlabs", + outcome: "failed", + reasonCode: "provider_error", + latencyMs: 65, + }, + { + provider: "microsoft", + outcome: "success", + reasonCode: "success", + latencyMs: 175, + }, + ], latencyMs: 175, voiceCompatible: true, }); @@ -116,5 +156,8 @@ describe("handleTtsCommands status fallback reporting", () => { expect(statusResult?.reply?.text).toContain("Provider: microsoft"); expect(statusResult?.reply?.text).toContain("Fallback: elevenlabs -> microsoft"); expect(statusResult?.reply?.text).toContain("Attempts: elevenlabs -> microsoft"); + expect(statusResult?.reply?.text).toContain( + "Attempt details: elevenlabs:failed(provider_error) 65ms, microsoft:success(ok) 175ms", + ); }); }); diff --git a/src/auto-reply/reply/commands-tts.ts b/src/auto-reply/reply/commands-tts.ts index de0f7e116f3..9d513ff1ac3 100644 --- a/src/auto-reply/reply/commands-tts.ts +++ b/src/auto-reply/reply/commands-tts.ts @@ -29,6 +29,10 @@ type ParsedTtsCommand = { args: string; }; +type TtsAttemptDetail = NonNullable< + 
NonNullable>["attempts"] +>[number]; + function parseTtsCommand(normalized: string): ParsedTtsCommand | null { // Accept `/tts` and `/tts [args]` as a single control surface. if (normalized === "/tts") { @@ -45,6 +49,19 @@ function parseTtsCommand(normalized: string): ParsedTtsCommand | null { return { action: action.toLowerCase(), args: tail.join(" ").trim() }; } +function formatAttemptDetails(attempts: TtsAttemptDetail[] | undefined): string | undefined { + if (!attempts || attempts.length === 0) { + return undefined; + } + return attempts + .map((attempt) => { + const reason = attempt.reasonCode === "success" ? "ok" : attempt.reasonCode; + const latency = Number.isFinite(attempt.latencyMs) ? ` ${attempt.latencyMs}ms` : ""; + return `${attempt.provider}:${attempt.outcome}(${reason})${latency}`; + }) + .join(", "); +} + function ttsUsage(): ReplyPayload { // Keep usage in one place so help/validation stays consistent. return { @@ -137,6 +154,7 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand provider: result.provider, fallbackFrom: result.fallbackFrom, attemptedProviders: result.attemptedProviders, + attempts: result.attempts, latencyMs: result.latencyMs, }); const payload: ReplyPayload = { @@ -153,6 +171,7 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand textLength: args.length, summarized: false, attemptedProviders: result.attemptedProviders, + attempts: result.attempts, error: result.error, latencyMs: Date.now() - start, }); @@ -294,12 +313,20 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand if (last.attemptedProviders && last.attemptedProviders.length > 1) { lines.push(`Attempts: ${last.attemptedProviders.join(" -> ")}`); } + const details = formatAttemptDetails(last.attempts); + if (details) { + lines.push(`Attempt details: ${details}`); + } lines.push(`Latency: ${last.latencyMs ?? 
0}ms`); } else if (last.error) { lines.push(`Error: ${last.error}`); if (last.attemptedProviders && last.attemptedProviders.length > 0) { lines.push(`Attempts: ${last.attemptedProviders.join(" -> ")}`); } + const details = formatAttemptDetails(last.attempts); + if (details) { + lines.push(`Attempt details: ${details}`); + } } } return { shouldContinue: false, reply: { text: lines.join("\n") } }; diff --git a/src/plugin-sdk/speech.ts b/src/plugin-sdk/speech.ts index 8628da81d16..019f80e9ed6 100644 --- a/src/plugin-sdk/speech.ts +++ b/src/plugin-sdk/speech.ts @@ -38,3 +38,9 @@ export { normalizeSpeechProviderId, } from "../tts/provider-registry.js"; export { normalizeTtsAutoMode, TTS_AUTO_MODES } from "../tts/tts-auto-mode.js"; +export { + asObject, + readResponseTextLimited, + trimToUndefined, + truncateErrorDetail, +} from "../tts/provider-error-utils.js"; diff --git a/src/plugins/contracts/tts.contract.test.ts b/src/plugins/contracts/tts.contract.test.ts index bb4bdb48f05..f021d39fa46 100644 --- a/src/plugins/contracts/tts.contract.test.ts +++ b/src/plugins/contracts/tts.contract.test.ts @@ -680,6 +680,182 @@ describe("tts", () => { }); }); + describe("fallback readiness errors", () => { + it("continues synthesize fallback when primary readiness checks throw", async () => { + const throwingPrimary: SpeechProviderPlugin = { + id: "openai", + label: "OpenAI", + autoSelectOrder: 10, + resolveConfig: () => ({}), + isConfigured: () => { + throw new Error("Authorization: Bearer sk-readiness-throw-token-1234567890\nboom"); + }, + synthesize: async () => { + throw new Error("unexpected synthesize call"); + }, + }; + const fallback: SpeechProviderPlugin = { + id: "microsoft", + label: "Microsoft", + autoSelectOrder: 20, + resolveConfig: () => ({}), + isConfigured: () => true, + synthesize: async () => ({ + audioBuffer: createAudioBuffer(2), + outputFormat: "mp3", + fileExtension: ".mp3", + voiceCompatible: true, + }), + }; + const registry = createEmptyPluginRegistry(); + 
registry.speechProviders = [ + { pluginId: "openai", provider: throwingPrimary, source: "test" }, + { pluginId: "microsoft", provider: fallback, source: "test" }, + ]; + const { cacheKey } = pluginLoaderTesting.resolvePluginLoadCacheContext({ config: {} }); + setActivePluginRegistry(registry, cacheKey); + + const result = await tts.synthesizeSpeech({ + text: "hello fallback", + cfg: { + messages: { + tts: { + provider: "openai", + }, + }, + }, + }); + + expect(result.success).toBe(true); + if (!result.success) { + throw new Error("expected fallback synthesis success"); + } + expect(result.provider).toBe("microsoft"); + expect(result.fallbackFrom).toBe("openai"); + expect(result.attemptedProviders).toEqual(["openai", "microsoft"]); + expect(result.attempts?.[0]).toMatchObject({ + provider: "openai", + outcome: "failed", + reasonCode: "provider_error", + }); + expect(result.attempts?.[1]).toMatchObject({ + provider: "microsoft", + outcome: "success", + reasonCode: "success", + }); + }); + + it("continues telephony fallback when primary readiness checks throw", async () => { + const throwingPrimary: SpeechProviderPlugin = { + id: "primary-throws", + label: "PrimaryThrows", + autoSelectOrder: 10, + resolveConfig: () => ({}), + isConfigured: () => { + throw new Error("Authorization: Bearer sk-telephony-throw-token-1234567890\tboom"); + }, + synthesize: async () => { + throw new Error("unexpected synthesize call"); + }, + }; + const fallback: SpeechProviderPlugin = { + id: "microsoft", + label: "Microsoft", + autoSelectOrder: 20, + resolveConfig: () => ({}), + isConfigured: () => true, + synthesize: async () => ({ + audioBuffer: createAudioBuffer(2), + outputFormat: "mp3", + fileExtension: ".mp3", + voiceCompatible: true, + }), + synthesizeTelephony: async () => ({ + audioBuffer: createAudioBuffer(2), + outputFormat: "mp3", + sampleRate: 24000, + }), + }; + const registry = createEmptyPluginRegistry(); + registry.speechProviders = [ + { pluginId: "primary-throws", 
provider: throwingPrimary, source: "test" }, + { pluginId: "microsoft", provider: fallback, source: "test" }, + ]; + const { cacheKey } = pluginLoaderTesting.resolvePluginLoadCacheContext({ config: {} }); + setActivePluginRegistry(registry, cacheKey); + + const result = await tts.textToSpeechTelephony({ + text: "hello telephony fallback", + cfg: { + messages: { + tts: { + provider: "primary-throws", + }, + }, + }, + }); + + expect(result.success).toBe(true); + if (!result.success) { + throw new Error("expected telephony fallback success"); + } + expect(result.provider).toBe("microsoft"); + expect(result.fallbackFrom).toBe("primary-throws"); + expect(result.attemptedProviders).toEqual(["primary-throws", "microsoft"]); + expect(result.attempts?.[0]).toMatchObject({ + provider: "primary-throws", + outcome: "failed", + reasonCode: "provider_error", + }); + expect(result.attempts?.[1]).toMatchObject({ + provider: "microsoft", + outcome: "success", + reasonCode: "success", + }); + }); + + it("does not double-prefix textToSpeech failure messages", async () => { + const failingProvider: SpeechProviderPlugin = { + id: "openai", + label: "OpenAI", + autoSelectOrder: 10, + resolveConfig: () => ({}), + isConfigured: () => true, + synthesize: async () => { + throw new Error("provider failed"); + }, + }; + const registry = createEmptyPluginRegistry(); + registry.speechProviders = [ + { pluginId: "openai", provider: failingProvider, source: "test" }, + ]; + const { cacheKey } = pluginLoaderTesting.resolvePluginLoadCacheContext({ config: {} }); + setActivePluginRegistry(registry, cacheKey); + + const result = await tts.textToSpeech({ + text: "hello", + cfg: { + messages: { + tts: { + provider: "openai", + }, + }, + }, + disableFallback: true, + }); + + expect(result.success).toBe(false); + if (result.success) { + throw new Error("expected synthesis failure"); + } + expect(result.error).toBeDefined(); + const errorMessage = result.error ?? 
""; + expect(errorMessage).toBe("TTS conversion failed: openai: provider failed"); + expect(errorMessage).not.toContain("TTS conversion failed: TTS conversion failed:"); + expect(errorMessage.match(/TTS conversion failed:/g)).toHaveLength(1); + }); + }); + describe("resolveTtsConfig – openai.baseUrl", () => { const baseCfg: OpenClawConfig = { agents: { defaults: { model: { primary: "openai/gpt-4o-mini" } } }, diff --git a/src/tts/provider-error-utils.ts b/src/tts/provider-error-utils.ts new file mode 100644 index 00000000000..a071ed69de3 --- /dev/null +++ b/src/tts/provider-error-utils.ts @@ -0,0 +1,62 @@ +export function trimToUndefined(value: unknown): string | undefined { + return typeof value === "string" && value.trim().length > 0 ? value.trim() : undefined; +} + +export function asObject(value: unknown): Record | undefined { + return typeof value === "object" && value !== null && !Array.isArray(value) + ? (value as Record) + : undefined; +} + +export function truncateErrorDetail(detail: string, limit = 220): string { + return detail.length <= limit ? detail : `${detail.slice(0, limit - 1)}…`; +} + +export async function readResponseTextLimited( + response: Response, + limitBytes = 16 * 1024, +): Promise { + if (limitBytes <= 0) { + return ""; + } + const reader = response.body?.getReader(); + if (!reader) { + return ""; + } + + const decoder = new TextDecoder(); + let total = 0; + let text = ""; + let reachedLimit = false; + + try { + while (true) { + const { value, done } = await reader.read(); + if (done) { + break; + } + if (!value || value.byteLength === 0) { + continue; + } + const remaining = limitBytes - total; + if (remaining <= 0) { + reachedLimit = true; + break; + } + const chunk = value.byteLength > remaining ? 
value.subarray(0, remaining) : value; + total += chunk.byteLength; + text += decoder.decode(chunk, { stream: true }); + if (total >= limitBytes) { + reachedLimit = true; + break; + } + } + text += decoder.decode(); + } finally { + if (reachedLimit) { + await reader.cancel().catch(() => {}); + } + } + + return text; +}