diff --git a/CHANGELOG.md b/CHANGELOG.md index dbef0a72951..ef0e4629b90 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,7 @@ Docs: https://docs.openclaw.ai - Agents/compaction: surface safeguard-specific cancel reasons and relabel benign manual `/compact` no-op cases as skipped instead of failed. (#51072) Thanks @afurm. - Plugins/CLI backends: move bundled Claude CLI, Codex CLI, and Gemini CLI inference defaults onto the plugin surface, add bundled Gemini CLI backend support, and replace `gateway run --claude-cli-logs` with generic `--cli-backend-logs` while keeping the old flag as a compatibility alias. - Plugins/startup: auto-load bundled provider and CLI-backend plugins from explicit config refs, so bundled Claude CLI, Codex CLI, and Gemini CLI message-provider setups no longer need manual `plugins.allow` entries. +- Config/TTS: auto-migrate legacy speech config on normal reads and secret resolution, keep legacy diagnostics for Doctor, and remove regular-mode runtime fallback for old bundled `tts.` API-key shapes. ### Fixes diff --git a/docs/tools/tts.md b/docs/tools/tts.md index f87c47eb725..27aea56026b 100644 --- a/docs/tools/tts.md +++ b/docs/tools/tts.md @@ -15,7 +15,7 @@ It works anywhere OpenClaw can send audio. ## Supported services - **ElevenLabs** (primary or fallback provider) -- **Microsoft** (primary or fallback provider; current bundled implementation uses `node-edge-tts`, default when no API keys) +- **Microsoft** (primary or fallback provider; current bundled implementation uses `node-edge-tts`) - **OpenAI** (primary or fallback provider; also used for summaries) ### Microsoft speech notes @@ -38,9 +38,7 @@ If you want OpenAI or ElevenLabs: - `ELEVENLABS_API_KEY` (or `XI_API_KEY`) - `OPENAI_API_KEY` -Microsoft speech does **not** require an API key. If no API keys are found, -OpenClaw defaults to Microsoft (unless disabled via -`messages.tts.microsoft.enabled=false` or `messages.tts.edge.enabled=false`). 
+Microsoft speech does **not** require an API key. If multiple providers are configured, the selected provider is used first and the others are fallback options. Auto-summary uses the configured `summaryModel` (or `agents.defaults.model.primary`), @@ -60,8 +58,8 @@ so that provider must also be authenticated if you enable summaries. No. Auto‑TTS is **off** by default. Enable it in config with `messages.tts.auto` or per session with `/tts always` (alias: `/tts on`). -Microsoft speech **is** enabled by default once TTS is on, and is used automatically -when no OpenAI or ElevenLabs API keys are available. +When `messages.tts.provider` is unset, OpenClaw picks the first configured +speech provider in registry auto-select order. ## Config @@ -93,26 +91,28 @@ Full schema is in [Gateway configuration](/gateway/configuration). modelOverrides: { enabled: true, }, - openai: { - apiKey: "openai_api_key", - baseUrl: "https://api.openai.com/v1", - model: "gpt-4o-mini-tts", - voice: "alloy", - }, - elevenlabs: { - apiKey: "elevenlabs_api_key", - baseUrl: "https://api.elevenlabs.io", - voiceId: "voice_id", - modelId: "eleven_multilingual_v2", - seed: 42, - applyTextNormalization: "auto", - languageCode: "en", - voiceSettings: { - stability: 0.5, - similarityBoost: 0.75, - style: 0.0, - useSpeakerBoost: true, - speed: 1.0, + providers: { + openai: { + apiKey: "openai_api_key", + baseUrl: "https://api.openai.com/v1", + model: "gpt-4o-mini-tts", + voice: "alloy", + }, + elevenlabs: { + apiKey: "elevenlabs_api_key", + baseUrl: "https://api.elevenlabs.io", + voiceId: "voice_id", + modelId: "eleven_multilingual_v2", + seed: 42, + applyTextNormalization: "auto", + languageCode: "en", + voiceSettings: { + stability: 0.5, + similarityBoost: 0.75, + style: 0.0, + useSpeakerBoost: true, + speed: 1.0, + }, }, }, }, @@ -128,13 +128,15 @@ Full schema is in [Gateway configuration](/gateway/configuration). 
tts: { auto: "always", provider: "microsoft", - microsoft: { - enabled: true, - voice: "en-US-MichelleNeural", - lang: "en-US", - outputFormat: "audio-24khz-48kbitrate-mono-mp3", - rate: "+10%", - pitch: "-5%", + providers: { + microsoft: { + enabled: true, + voice: "en-US-MichelleNeural", + lang: "en-US", + outputFormat: "audio-24khz-48kbitrate-mono-mp3", + rate: "+10%", + pitch: "-5%", + }, }, }, }, @@ -147,8 +149,10 @@ Full schema is in [Gateway configuration](/gateway/configuration). { messages: { tts: { - microsoft: { - enabled: false, + providers: { + microsoft: { + enabled: false, + }, }, }, }, @@ -208,37 +212,37 @@ Then run: - `enabled`: legacy toggle (doctor migrates this to `auto`). - `mode`: `"final"` (default) or `"all"` (includes tool/block replies). - `provider`: speech provider id such as `"elevenlabs"`, `"microsoft"`, or `"openai"` (fallback is automatic). -- If `provider` is **unset**, OpenClaw prefers `openai` (if key), then `elevenlabs` (if key), - otherwise `microsoft`. +- If `provider` is **unset**, OpenClaw uses the first configured speech provider in registry auto-select order. - Legacy `provider: "edge"` still works and is normalized to `microsoft`. - `summaryModel`: optional cheap model for auto-summary; defaults to `agents.defaults.model.primary`. - Accepts `provider/model` or a configured model alias. - `modelOverrides`: allow the model to emit TTS directives (on by default). - `allowProvider` defaults to `false` (provider switching is opt-in). +- `providers.<id>`: provider-owned settings keyed by speech provider id. - `maxTextLength`: hard cap for TTS input (chars). `/tts audio` fails if exceeded. - `timeoutMs`: request timeout (ms). - `prefsPath`: override the local prefs JSON path (provider/limit/summary). - `apiKey` values fall back to env vars (`ELEVENLABS_API_KEY`/`XI_API_KEY`, `OPENAI_API_KEY`). -- `elevenlabs.baseUrl`: override ElevenLabs API base URL. -- `openai.baseUrl`: override the OpenAI TTS endpoint. 
- - Resolution order: `messages.tts.openai.baseUrl` -> `OPENAI_TTS_BASE_URL` -> `https://api.openai.com/v1` +- `providers.elevenlabs.baseUrl`: override ElevenLabs API base URL. +- `providers.openai.baseUrl`: override the OpenAI TTS endpoint. + - Resolution order: `messages.tts.providers.openai.baseUrl` -> `OPENAI_TTS_BASE_URL` -> `https://api.openai.com/v1` - Non-default values are treated as OpenAI-compatible TTS endpoints, so custom model and voice names are accepted. -- `elevenlabs.voiceSettings`: +- `providers.elevenlabs.voiceSettings`: - `stability`, `similarityBoost`, `style`: `0..1` - `useSpeakerBoost`: `true|false` - `speed`: `0.5..2.0` (1.0 = normal) -- `elevenlabs.applyTextNormalization`: `auto|on|off` -- `elevenlabs.languageCode`: 2-letter ISO 639-1 (e.g. `en`, `de`) -- `elevenlabs.seed`: integer `0..4294967295` (best-effort determinism) -- `microsoft.enabled`: allow Microsoft speech usage (default `true`; no API key). -- `microsoft.voice`: Microsoft neural voice name (e.g. `en-US-MichelleNeural`). -- `microsoft.lang`: language code (e.g. `en-US`). -- `microsoft.outputFormat`: Microsoft output format (e.g. `audio-24khz-48kbitrate-mono-mp3`). +- `providers.elevenlabs.applyTextNormalization`: `auto|on|off` +- `providers.elevenlabs.languageCode`: 2-letter ISO 639-1 (e.g. `en`, `de`) +- `providers.elevenlabs.seed`: integer `0..4294967295` (best-effort determinism) +- `providers.microsoft.enabled`: allow Microsoft speech usage (default `true`; no API key). +- `providers.microsoft.voice`: Microsoft neural voice name (e.g. `en-US-MichelleNeural`). +- `providers.microsoft.lang`: language code (e.g. `en-US`). +- `providers.microsoft.outputFormat`: Microsoft output format (e.g. `audio-24khz-48kbitrate-mono-mp3`). - See Microsoft Speech output formats for valid values; not all formats are supported by the bundled Edge-backed transport. -- `microsoft.rate` / `microsoft.pitch` / `microsoft.volume`: percent strings (e.g. `+10%`, `-5%`). 
-- `microsoft.saveSubtitles`: write JSON subtitles alongside the audio file. -- `microsoft.proxy`: proxy URL for Microsoft speech requests. -- `microsoft.timeoutMs`: request timeout override (ms). +- `providers.microsoft.rate` / `providers.microsoft.pitch` / `providers.microsoft.volume`: percent strings (e.g. `+10%`, `-5%`). +- `providers.microsoft.saveSubtitles`: write JSON subtitles alongside the audio file. +- `providers.microsoft.proxy`: proxy URL for Microsoft speech requests. +- `providers.microsoft.timeoutMs`: request timeout override (ms). - `edge.*`: legacy alias for the same Microsoft settings. ## Model-driven overrides (default on) diff --git a/docs/tts.md b/docs/tts.md index ffd56f8b02e..6c3cea7ba05 100644 --- a/docs/tts.md +++ b/docs/tts.md @@ -15,7 +15,7 @@ It works anywhere OpenClaw can send audio. ## Supported services - **ElevenLabs** (primary or fallback provider) -- **Microsoft** (primary or fallback provider; current bundled implementation uses `node-edge-tts`, default when no API keys) +- **Microsoft** (primary or fallback provider; current bundled implementation uses `node-edge-tts`) - **OpenAI** (primary or fallback provider; also used for summaries) ### Microsoft speech notes @@ -38,9 +38,7 @@ If you want OpenAI or ElevenLabs: - `ELEVENLABS_API_KEY` (or `XI_API_KEY`) - `OPENAI_API_KEY` -Microsoft speech does **not** require an API key. If no API keys are found, -OpenClaw defaults to Microsoft (unless disabled via -`messages.tts.microsoft.enabled=false` or `messages.tts.edge.enabled=false`). +Microsoft speech does **not** require an API key. If multiple providers are configured, the selected provider is used first and the others are fallback options. Auto-summary uses the configured `summaryModel` (or `agents.defaults.model.primary`), @@ -60,8 +58,8 @@ so that provider must also be authenticated if you enable summaries. No. Auto‑TTS is **off** by default. 
Enable it in config with `messages.tts.auto` or per session with `/tts always` (alias: `/tts on`). -Microsoft speech **is** enabled by default once TTS is on, and is used automatically -when no OpenAI or ElevenLabs API keys are available. +When `messages.tts.provider` is unset, OpenClaw picks the first configured +speech provider in registry auto-select order. ## Config @@ -93,26 +91,28 @@ Full schema is in [Gateway configuration](/gateway/configuration). modelOverrides: { enabled: true, }, - openai: { - apiKey: "openai_api_key", - baseUrl: "https://api.openai.com/v1", - model: "gpt-4o-mini-tts", - voice: "alloy", - }, - elevenlabs: { - apiKey: "elevenlabs_api_key", - baseUrl: "https://api.elevenlabs.io", - voiceId: "voice_id", - modelId: "eleven_multilingual_v2", - seed: 42, - applyTextNormalization: "auto", - languageCode: "en", - voiceSettings: { - stability: 0.5, - similarityBoost: 0.75, - style: 0.0, - useSpeakerBoost: true, - speed: 1.0, + providers: { + openai: { + apiKey: "openai_api_key", + baseUrl: "https://api.openai.com/v1", + model: "gpt-4o-mini-tts", + voice: "alloy", + }, + elevenlabs: { + apiKey: "elevenlabs_api_key", + baseUrl: "https://api.elevenlabs.io", + voiceId: "voice_id", + modelId: "eleven_multilingual_v2", + seed: 42, + applyTextNormalization: "auto", + languageCode: "en", + voiceSettings: { + stability: 0.5, + similarityBoost: 0.75, + style: 0.0, + useSpeakerBoost: true, + speed: 1.0, + }, }, }, }, @@ -128,13 +128,15 @@ Full schema is in [Gateway configuration](/gateway/configuration). 
tts: { auto: "always", provider: "microsoft", - microsoft: { - enabled: true, - voice: "en-US-MichelleNeural", - lang: "en-US", - outputFormat: "audio-24khz-48kbitrate-mono-mp3", - rate: "+10%", - pitch: "-5%", + providers: { + microsoft: { + enabled: true, + voice: "en-US-MichelleNeural", + lang: "en-US", + outputFormat: "audio-24khz-48kbitrate-mono-mp3", + rate: "+10%", + pitch: "-5%", + }, }, }, }, @@ -147,8 +149,10 @@ Full schema is in [Gateway configuration](/gateway/configuration). { messages: { tts: { - microsoft: { - enabled: false, + providers: { + microsoft: { + enabled: false, + }, }, }, }, @@ -208,37 +212,37 @@ Then run: - `enabled`: legacy toggle (doctor migrates this to `auto`). - `mode`: `"final"` (default) or `"all"` (includes tool/block replies). - `provider`: speech provider id such as `"elevenlabs"`, `"microsoft"`, or `"openai"` (fallback is automatic). -- If `provider` is **unset**, OpenClaw prefers `openai` (if key), then `elevenlabs` (if key), - otherwise `microsoft`. +- If `provider` is **unset**, OpenClaw uses the first configured speech provider in registry auto-select order. - Legacy `provider: "edge"` still works and is normalized to `microsoft`. - `summaryModel`: optional cheap model for auto-summary; defaults to `agents.defaults.model.primary`. - Accepts `provider/model` or a configured model alias. - `modelOverrides`: allow the model to emit TTS directives (on by default). - `allowProvider` defaults to `false` (provider switching is opt-in). +- `providers.<id>`: provider-owned settings keyed by speech provider id. - `maxTextLength`: hard cap for TTS input (chars). `/tts audio` fails if exceeded. - `timeoutMs`: request timeout (ms). - `prefsPath`: override the local prefs JSON path (provider/limit/summary). - `apiKey` values fall back to env vars (`ELEVENLABS_API_KEY`/`XI_API_KEY`, `OPENAI_API_KEY`). -- `elevenlabs.baseUrl`: override ElevenLabs API base URL. -- `openai.baseUrl`: override the OpenAI TTS endpoint. 
- - Resolution order: `messages.tts.openai.baseUrl` -> `OPENAI_TTS_BASE_URL` -> `https://api.openai.com/v1` +- `providers.elevenlabs.baseUrl`: override ElevenLabs API base URL. +- `providers.openai.baseUrl`: override the OpenAI TTS endpoint. + - Resolution order: `messages.tts.providers.openai.baseUrl` -> `OPENAI_TTS_BASE_URL` -> `https://api.openai.com/v1` - Non-default values are treated as OpenAI-compatible TTS endpoints, so custom model and voice names are accepted. -- `elevenlabs.voiceSettings`: +- `providers.elevenlabs.voiceSettings`: - `stability`, `similarityBoost`, `style`: `0..1` - `useSpeakerBoost`: `true|false` - `speed`: `0.5..2.0` (1.0 = normal) -- `elevenlabs.applyTextNormalization`: `auto|on|off` -- `elevenlabs.languageCode`: 2-letter ISO 639-1 (e.g. `en`, `de`) -- `elevenlabs.seed`: integer `0..4294967295` (best-effort determinism) -- `microsoft.enabled`: allow Microsoft speech usage (default `true`; no API key). -- `microsoft.voice`: Microsoft neural voice name (e.g. `en-US-MichelleNeural`). -- `microsoft.lang`: language code (e.g. `en-US`). -- `microsoft.outputFormat`: Microsoft output format (e.g. `audio-24khz-48kbitrate-mono-mp3`). +- `providers.elevenlabs.applyTextNormalization`: `auto|on|off` +- `providers.elevenlabs.languageCode`: 2-letter ISO 639-1 (e.g. `en`, `de`) +- `providers.elevenlabs.seed`: integer `0..4294967295` (best-effort determinism) +- `providers.microsoft.enabled`: allow Microsoft speech usage (default `true`; no API key). +- `providers.microsoft.voice`: Microsoft neural voice name (e.g. `en-US-MichelleNeural`). +- `providers.microsoft.lang`: language code (e.g. `en-US`). +- `providers.microsoft.outputFormat`: Microsoft output format (e.g. `audio-24khz-48kbitrate-mono-mp3`). - See Microsoft Speech output formats for valid values; not all formats are supported by the bundled Edge-backed transport. -- `microsoft.rate` / `microsoft.pitch` / `microsoft.volume`: percent strings (e.g. `+10%`, `-5%`). 
-- `microsoft.saveSubtitles`: write JSON subtitles alongside the audio file. -- `microsoft.proxy`: proxy URL for Microsoft speech requests. -- `microsoft.timeoutMs`: request timeout override (ms). +- `providers.microsoft.rate` / `providers.microsoft.pitch` / `providers.microsoft.volume`: percent strings (e.g. `+10%`, `-5%`). +- `providers.microsoft.saveSubtitles`: write JSON subtitles alongside the audio file. +- `providers.microsoft.proxy`: proxy URL for Microsoft speech requests. +- `providers.microsoft.timeoutMs`: request timeout override (ms). - `edge.*`: legacy alias for the same Microsoft settings. ## Model-driven overrides (default on) diff --git a/src/commands/doctor-legacy-config.migrations.test.ts b/src/commands/doctor-legacy-config.migrations.test.ts index d4de96b8367..869b58f64a7 100644 --- a/src/commands/doctor-legacy-config.migrations.test.ts +++ b/src/commands/doctor-legacy-config.migrations.test.ts @@ -523,7 +523,6 @@ describe("normalizeCompatibilityConfigValues", () => { }); expect(res.config.talk).toEqual({ - provider: "elevenlabs", providers: { elevenlabs: { voiceId: "voice-123", @@ -545,9 +544,7 @@ describe("normalizeCompatibilityConfigValues", () => { interruptOnSpeech: false, silenceTimeoutMs: 1500, }); - expect(res.changes).toEqual([ - "Moved legacy talk flat fields → talk.provider/talk.providers.elevenlabs.", - ]); + expect(res.changes).toEqual(["Moved legacy talk flat fields → talk.providers.elevenlabs."]); }); it("normalizes talk provider ids without overriding explicit provider config", () => { diff --git a/src/commands/doctor-legacy-config.ts b/src/commands/doctor-legacy-config.ts index 99d8783c76a..d489d354be0 100644 --- a/src/commands/doctor-legacy-config.ts +++ b/src/commands/doctor-legacy-config.ts @@ -11,7 +11,7 @@ import { resolveTelegramPreviewStreamMode, } from "../config/discord-preview-streaming.js"; import { migrateLegacyWebSearchConfig } from "../config/legacy-web-search.js"; -import { DEFAULT_TALK_PROVIDER, 
normalizeTalkSection } from "../config/talk.js"; +import { LEGACY_TALK_PROVIDER_ID, normalizeTalkSection } from "../config/talk.js"; import { DEFAULT_GOOGLE_API_BASE_URL } from "../infra/google-api-base-url.js"; import { DEFAULT_ACCOUNT_ID } from "../routing/session-key.js"; @@ -651,9 +651,7 @@ export function normalizeCompatibilityConfigValues(cfg: OpenClawConfig): { return; } - changes.push( - `Moved legacy talk flat fields → talk.provider/talk.providers.${DEFAULT_TALK_PROVIDER}.`, - ); + changes.push(`Moved legacy talk flat fields → talk.providers.${LEGACY_TALK_PROVIDER_ID}.`); }; const normalizeLegacyCrossContextMessageConfig = () => { diff --git a/src/config/config-misc.test.ts b/src/config/config-misc.test.ts index f089e74015c..8de11cdb086 100644 --- a/src/config/config-misc.test.ts +++ b/src/config/config-misc.test.ts @@ -482,7 +482,7 @@ describe("config strict validation", () => { const snap = await readConfigFileSnapshot(); - expect(snap.valid).toBe(false); + expect(snap.valid).toBe(true); expect(snap.legacyIssues).not.toHaveLength(0); }); }); @@ -517,7 +517,7 @@ describe("config strict validation", () => { }); const snap = await readConfigFileSnapshot(); - expect(snap.valid).toBe(false); + expect(snap.valid).toBe(true); expect(snap.legacyIssues.some((issue) => issue.path === "gateway.bind")).toBe(true); }); }); diff --git a/src/config/config.legacy-config-detection.accepts-imessage-dmpolicy.test.ts b/src/config/config.legacy-config-detection.accepts-imessage-dmpolicy.test.ts index 89632bbc543..d21835ca910 100644 --- a/src/config/config.legacy-config-detection.accepts-imessage-dmpolicy.test.ts +++ b/src/config/config.legacy-config-detection.accepts-imessage-dmpolicy.test.ts @@ -68,7 +68,7 @@ function expectRoutingAllowFromLegacySnapshot( ctx: { snapshot: ConfigSnapshot; parsed: unknown }, expectedAllowFrom: string[], ) { - expect(ctx.snapshot.valid).toBe(false); + expect(ctx.snapshot.valid).toBe(true); expect(ctx.snapshot.legacyIssues.some((issue) => 
issue.path === "routing.allowFrom")).toBe(true); const parsed = ctx.parsed as { routing?: { allowFrom?: string[] }; @@ -269,7 +269,7 @@ describe("legacy config detection", () => { await withSnapshotForConfig( { memorySearch: { provider: "local", fallback: "none" } }, async (ctx) => { - expect(ctx.snapshot.valid).toBe(false); + expect(ctx.snapshot.valid).toBe(true); expect(ctx.snapshot.legacyIssues.some((issue) => issue.path === "memorySearch")).toBe(true); }, ); @@ -278,14 +278,14 @@ describe("legacy config detection", () => { await withSnapshotForConfig( { heartbeat: { model: "anthropic/claude-3-5-haiku-20241022", every: "30m" } }, async (ctx) => { - expect(ctx.snapshot.valid).toBe(false); + expect(ctx.snapshot.valid).toBe(true); expect(ctx.snapshot.legacyIssues.some((issue) => issue.path === "heartbeat")).toBe(true); }, ); }); it("flags legacy provider sections in snapshot", async () => { await withSnapshotForConfig({ whatsapp: { allowFrom: ["+1555"] } }, async (ctx) => { - expect(ctx.snapshot.valid).toBe(false); + expect(ctx.snapshot.valid).toBe(true); expect(ctx.snapshot.legacyIssues.some((issue) => issue.path === "whatsapp")).toBe(true); const parsed = ctx.parsed as { diff --git a/src/config/defaults.ts b/src/config/defaults.ts index dcc66044737..d535ed0bc9a 100644 --- a/src/config/defaults.ts +++ b/src/config/defaults.ts @@ -3,7 +3,7 @@ import { normalizeProviderId, parseModelRef } from "../agents/model-selection.js import { DEFAULT_AGENT_MAX_CONCURRENT, DEFAULT_SUBAGENT_MAX_CONCURRENT } from "./agent-limits.js"; import { resolveAgentModelPrimaryValue } from "./model-input.js"; import { - DEFAULT_TALK_PROVIDER, + LEGACY_TALK_PROVIDER_ID, normalizeTalkConfig, resolveActiveTalkProviderConfig, resolveTalkApiKey, @@ -204,7 +204,7 @@ export function applyTalkApiKey(config: OpenClawConfig): OpenClawConfig { const talk = normalized.talk; const active = resolveActiveTalkProviderConfig(talk); - if (active?.provider && active.provider !== DEFAULT_TALK_PROVIDER) { + if 
(!active || active.provider !== LEGACY_TALK_PROVIDER_ID) { return normalized; } @@ -214,7 +214,7 @@ export function applyTalkApiKey(config: OpenClawConfig): OpenClawConfig { return normalized; } - const providerId = active?.provider ?? DEFAULT_TALK_PROVIDER; + const providerId = active.provider; const providers = { ...talk?.providers }; const providerConfig = { ...providers[providerId], apiKey: resolved }; providers[providerId] = providerConfig; @@ -222,7 +222,6 @@ export function applyTalkApiKey(config: OpenClawConfig): OpenClawConfig { const nextTalk = { ...talk, apiKey: resolved, - provider: talk?.provider ?? providerId, providers, }; diff --git a/src/config/io.ts b/src/config/io.ts index 953ff0747c8..c68739188b8 100644 --- a/src/config/io.ts +++ b/src/config/io.ts @@ -41,6 +41,7 @@ import { readConfigIncludeFileWithGuards, resolveConfigIncludes, } from "./includes.js"; +import { migrateLegacyConfig } from "./legacy-migrate.js"; import { findLegacyConfigIssues } from "./legacy.js"; import { applyMergePatch } from "./merge-patch.js"; import { normalizeExecSafeBinProfilesInConfig } from "./normalize-exec-safe-bin.js"; @@ -1185,6 +1186,11 @@ type ConfigReadResolution = { envWarnings: EnvSubstitutionWarning[]; }; +type LegacyMigrationResolution = { + effectiveConfigRaw: unknown; + sourceLegacyIssues: LegacyConfigIssue[]; +}; + function resolveConfigIncludesForRead( parsed: unknown, configPath: string, @@ -1225,6 +1231,21 @@ function resolveConfigForRead( }; } +function resolveLegacyConfigForRead( + resolvedConfigRaw: unknown, + sourceRaw: unknown, +): LegacyMigrationResolution { + const sourceLegacyIssues = findLegacyConfigIssues(resolvedConfigRaw, sourceRaw); + if (sourceLegacyIssues.length === 0) { + return { effectiveConfigRaw: resolvedConfigRaw, sourceLegacyIssues }; + } + const migrated = migrateLegacyConfig(resolvedConfigRaw); + return { + effectiveConfigRaw: migrated.config ?? 
resolvedConfigRaw, + sourceLegacyIssues, + }; +} + type ReadConfigFileSnapshotInternalResult = { snapshot: ConfigFileSnapshot; envSnapshotForRestore?: Record; @@ -1275,13 +1296,15 @@ export function createConfigIO(overrides: ConfigIoDeps = {}) { deps.env, ); const resolvedConfig = readResolution.resolvedConfigRaw; + const legacyResolution = resolveLegacyConfigForRead(resolvedConfig, parsed); + const effectiveConfigRaw = legacyResolution.effectiveConfigRaw; for (const w of readResolution.envWarnings) { deps.logger.warn( `Config (${configPath}): missing env var "${w.varName}" at ${w.configPath} — feature using this value will be unavailable`, ); } - warnOnConfigMiskeys(resolvedConfig, deps.logger); - if (typeof resolvedConfig !== "object" || resolvedConfig === null) { + warnOnConfigMiskeys(effectiveConfigRaw, deps.logger); + if (typeof effectiveConfigRaw !== "object" || effectiveConfigRaw === null) { observeLoadConfigSnapshot({ path: configPath, exists: true, @@ -1293,31 +1316,31 @@ export function createConfigIO(overrides: ConfigIoDeps = {}) { hash, issues: [], warnings: [], - legacyIssues: [], + legacyIssues: legacyResolution.sourceLegacyIssues, }); return {}; } - const preValidationDuplicates = findDuplicateAgentDirs(resolvedConfig as OpenClawConfig, { + const preValidationDuplicates = findDuplicateAgentDirs(effectiveConfigRaw as OpenClawConfig, { env: deps.env, homedir: deps.homedir, }); if (preValidationDuplicates.length > 0) { throw new DuplicateAgentDirError(preValidationDuplicates); } - const validated = validateConfigObjectWithPlugins(resolvedConfig, { env: deps.env }); + const validated = validateConfigObjectWithPlugins(effectiveConfigRaw, { env: deps.env }); if (!validated.ok) { observeLoadConfigSnapshot({ path: configPath, exists: true, raw, parsed, - resolved: coerceConfig(resolvedConfig), + resolved: coerceConfig(effectiveConfigRaw), valid: false, - config: coerceConfig(resolvedConfig), + config: coerceConfig(effectiveConfigRaw), hash, issues: 
validated.issues, warnings: validated.warnings, - legacyIssues: findLegacyConfigIssues(resolvedConfig, parsed), + legacyIssues: legacyResolution.sourceLegacyIssues, }); const details = validated.issues .map( @@ -1362,13 +1385,13 @@ export function createConfigIO(overrides: ConfigIoDeps = {}) { exists: true, raw, parsed, - resolved: coerceConfig(resolvedConfig), + resolved: coerceConfig(effectiveConfigRaw), valid: true, config: cfg, hash, issues: [], warnings: validated.warnings, - legacyIssues: findLegacyConfigIssues(resolvedConfig, parsed), + legacyIssues: legacyResolution.sourceLegacyIssues, }); const duplicates = findDuplicateAgentDirs(cfg, { @@ -1536,11 +1559,10 @@ export function createConfigIO(overrides: ConfigIoDeps = {}) { })); const resolvedConfigRaw = readResolution.resolvedConfigRaw; - // Detect legacy keys on resolved config, but only mark source-literal legacy - // entries (for auto-migration) when they are present in the parsed source. - const legacyIssues = findLegacyConfigIssues(resolvedConfigRaw, parsedRes.parsed); + const legacyResolution = resolveLegacyConfigForRead(resolvedConfigRaw, parsedRes.parsed); + const effectiveConfigRaw = legacyResolution.effectiveConfigRaw; - const validated = validateConfigObjectWithPlugins(resolvedConfigRaw, { env: deps.env }); + const validated = validateConfigObjectWithPlugins(effectiveConfigRaw, { env: deps.env }); if (!validated.ok) { return await finalizeReadConfigSnapshotInternalResult(deps, { snapshot: { @@ -1548,13 +1570,13 @@ export function createConfigIO(overrides: ConfigIoDeps = {}) { exists: true, raw, parsed: parsedRes.parsed, - resolved: coerceConfig(resolvedConfigRaw), + resolved: coerceConfig(effectiveConfigRaw), valid: false, - config: coerceConfig(resolvedConfigRaw), + config: coerceConfig(effectiveConfigRaw), hash, issues: validated.issues, warnings: [...validated.warnings, ...envVarWarnings], - legacyIssues, + legacyIssues: legacyResolution.sourceLegacyIssues, }, }); } @@ -1580,13 +1602,13 @@ 
export function createConfigIO(overrides: ConfigIoDeps = {}) { parsed: parsedRes.parsed, // Use resolvedConfigRaw (after $include and ${ENV} substitution but BEFORE runtime defaults) // for config set/unset operations (issue #6070) - resolved: coerceConfig(resolvedConfigRaw), + resolved: coerceConfig(effectiveConfigRaw), valid: true, config: snapshotConfig, hash, issues: [], warnings: [...validated.warnings, ...envVarWarnings], - legacyIssues, + legacyIssues: legacyResolution.sourceLegacyIssues, }, envSnapshotForRestore: readResolution.envSnapshotForRestore, }); diff --git a/src/config/talk.normalize.test.ts b/src/config/talk.normalize.test.ts index f2b1ddff1a1..a4221e5223c 100644 --- a/src/config/talk.normalize.test.ts +++ b/src/config/talk.normalize.test.ts @@ -36,7 +36,6 @@ describe("talk normalization", () => { }); expect(normalized).toEqual({ - provider: "elevenlabs", providers: { elevenlabs: { voiceId: "voice-123", @@ -149,7 +148,7 @@ describe("talk normalization", () => { async (configPath) => { const io = createConfigIO({ configPath }); const snapshot = await io.readConfigFileSnapshot(); - expect(snapshot.config.talk?.provider).toBe("elevenlabs"); + expect(snapshot.config.talk?.provider).toBeUndefined(); expect(snapshot.config.talk?.providers?.elevenlabs?.voiceId).toBe("voice-123"); expect(snapshot.config.talk?.providers?.elevenlabs?.apiKey).toBe(elevenLabsApiKey); expect(snapshot.config.talk?.apiKey).toBe(elevenLabsApiKey); diff --git a/src/config/talk.ts b/src/config/talk.ts index 32c4255a7a4..517e00ac252 100644 --- a/src/config/talk.ts +++ b/src/config/talk.ts @@ -16,7 +16,7 @@ type TalkApiKeyDeps = { path?: typeof path; }; -export const DEFAULT_TALK_PROVIDER = "elevenlabs"; +export const LEGACY_TALK_PROVIDER_ID = "elevenlabs"; function isPlainObject(value: unknown): value is Record { return typeof value === "object" && value !== null && !Array.isArray(value); @@ -225,19 +225,13 @@ export function normalizeTalkSection(value: TalkConfig | undefined): 
TalkConfig } if (provider) { normalized.provider = provider; - } else if (providers) { - const ids = Object.keys(providers); - if (ids.length === 1) { - normalized.provider = ids[0]; - } } return Object.keys(normalized).length > 0 ? normalized : undefined; } const legacyProviderConfig = legacyProviderConfigFromTalk(source); if (legacyProviderConfig) { - normalized.provider = DEFAULT_TALK_PROVIDER; - normalized.providers = { [DEFAULT_TALK_PROVIDER]: legacyProviderConfig }; + normalized.providers = { [LEGACY_TALK_PROVIDER_ID]: legacyProviderConfig }; } return Object.keys(normalized).length > 0 ? normalized : undefined; } diff --git a/src/config/types.tts.ts b/src/config/types.tts.ts index 934594a7253..bec252c9900 100644 --- a/src/config/types.tts.ts +++ b/src/config/types.tts.ts @@ -27,22 +27,8 @@ export type TtsModelOverrideConfig = { export type TtsProviderConfigMap = Record>; -export type TtsConfig = { - /** Auto-TTS mode (preferred). */ - auto?: TtsAutoMode; - /** Legacy: enable auto-TTS when `auto` is not set. */ - enabled?: boolean; - /** Apply TTS to final replies only or to all replies (tool/block/final). */ - mode?: TtsMode; - /** Primary TTS provider (fallbacks are automatic). */ - provider?: TtsProvider; - /** Optional model override for TTS auto-summary (provider/model or alias). */ - summaryModel?: string; - /** Allow the model to override TTS parameters. */ - modelOverrides?: TtsModelOverrideConfig; - /** Provider-specific TTS settings keyed by speech provider id. */ - providers?: TtsProviderConfigMap; - /** ElevenLabs configuration. */ +export type LegacyTtsConfigCompat = { + /** Legacy ElevenLabs configuration. Prefer providers.elevenlabs. */ elevenlabs?: { apiKey?: SecretInput; baseUrl?: string; @@ -59,7 +45,7 @@ export type TtsConfig = { speed?: number; }; }; - /** OpenAI configuration. */ + /** Legacy OpenAI configuration. Prefer providers.openai. 
*/ openai?: { apiKey?: SecretInput; baseUrl?: string; @@ -70,7 +56,7 @@ export type TtsConfig = { /** System-level instructions for the TTS model (gpt-4o-mini-tts only). */ instructions?: string; }; - /** Legacy alias for Microsoft speech configuration. */ + /** Legacy alias for Microsoft speech configuration. Prefer providers.microsoft. */ edge?: { /** Explicitly allow Microsoft speech usage (no API key required). */ enabled?: boolean; @@ -84,7 +70,7 @@ export type TtsConfig = { proxy?: string; timeoutMs?: number; }; - /** Preferred alias for Microsoft speech configuration. */ + /** Legacy Microsoft speech configuration. Prefer providers.microsoft. */ microsoft?: { enabled?: boolean; voice?: string; @@ -97,6 +83,23 @@ export type TtsConfig = { proxy?: string; timeoutMs?: number; }; +}; + +export type TtsConfig = LegacyTtsConfigCompat & { + /** Auto-TTS mode (preferred). */ + auto?: TtsAutoMode; + /** Legacy: enable auto-TTS when `auto` is not set. */ + enabled?: boolean; + /** Apply TTS to final replies only or to all replies (tool/block/final). */ + mode?: TtsMode; + /** Primary TTS provider (fallbacks are automatic). */ + provider?: TtsProvider; + /** Optional model override for TTS auto-summary (provider/model or alias). */ + summaryModel?: string; + /** Allow the model to override TTS parameters. */ + modelOverrides?: TtsModelOverrideConfig; + /** Provider-specific TTS settings keyed by speech provider id. */ + providers?: TtsProviderConfigMap; /** Optional path for local TTS user preferences JSON. */ prefsPath?: string; /** Hard cap for text sent to TTS (chars). 
*/ diff --git a/src/secrets/runtime-config-collectors-tts.ts b/src/secrets/runtime-config-collectors-tts.ts index 1c93323a7f3..295ac9bf244 100644 --- a/src/secrets/runtime-config-collectors-tts.ts +++ b/src/secrets/runtime-config-collectors-tts.ts @@ -54,25 +54,4 @@ export function collectTtsApiKeyAssignments(params: { } return; } - - // Legacy compatibility until migrated configs have been rewritten on disk. - const legacyProviders = ["elevenlabs", "openai"] as const; - for (const providerId of legacyProviders) { - const providerConfig = params.tts[providerId]; - if (!isRecord(providerConfig)) { - continue; - } - collectSecretInputAssignment({ - value: providerConfig.apiKey, - path: `${params.pathPrefix}.${providerId}.apiKey`, - expected: "string", - defaults: params.defaults, - context: params.context, - active: params.active, - inactiveReason: params.inactiveReason, - apply: (value) => { - providerConfig.apiKey = value; - }, - }); - } } diff --git a/src/secrets/runtime.test.ts b/src/secrets/runtime.test.ts index a93c216f1a2..6f892f195cf 100644 --- a/src/secrets/runtime.test.ts +++ b/src/secrets/runtime.test.ts @@ -1949,20 +1949,22 @@ describe("secrets runtime snapshot", () => { loadAuthStore: () => ({ version: 1, profiles: {} }), }); - expect(snapshot.config.channels?.discord?.voice?.tts?.openai?.apiKey).toEqual({ + expect(snapshot.config.channels?.discord?.voice?.tts?.providers?.openai?.apiKey).toEqual({ source: "env", provider: "default", id: "MISSING_DISCORD_VOICE_TTS_OPENAI", }); - expect(snapshot.config.channels?.discord?.accounts?.work?.voice?.tts?.openai?.apiKey).toEqual({ + expect( + snapshot.config.channels?.discord?.accounts?.work?.voice?.tts?.providers?.openai?.apiKey, + ).toEqual({ source: "env", provider: "default", id: "MISSING_DISCORD_WORK_VOICE_TTS_OPENAI", }); expect(snapshot.warnings.map((warning) => warning.path)).toEqual( expect.arrayContaining([ - "channels.discord.voice.tts.openai.apiKey", - 
"channels.discord.accounts.work.voice.tts.openai.apiKey", + "channels.discord.voice.tts.providers.openai.apiKey", + "channels.discord.accounts.work.voice.tts.providers.openai.apiKey", ]), ); }); @@ -1974,8 +1976,10 @@ describe("secrets runtime snapshot", () => { discord: { voice: { tts: { - openai: { - apiKey: { source: "env", provider: "default", id: "DISCORD_BASE_TTS_OPENAI" }, + providers: { + openai: { + apiKey: { source: "env", provider: "default", id: "DISCORD_BASE_TTS_OPENAI" }, + }, }, }, }, @@ -1990,11 +1994,13 @@ describe("secrets runtime snapshot", () => { enabled: true, voice: { tts: { - openai: { - apiKey: { - source: "env", - provider: "default", - id: "DISCORD_ENABLED_OVERRIDE_TTS_OPENAI", + providers: { + openai: { + apiKey: { + source: "env", + provider: "default", + id: "DISCORD_ENABLED_OVERRIDE_TTS_OPENAI", + }, }, }, }, @@ -2004,11 +2010,13 @@ describe("secrets runtime snapshot", () => { enabled: false, voice: { tts: { - openai: { - apiKey: { - source: "env", - provider: "default", - id: "DISCORD_DISABLED_OVERRIDE_TTS_OPENAI", + providers: { + openai: { + apiKey: { + source: "env", + provider: "default", + id: "DISCORD_DISABLED_OVERRIDE_TTS_OPENAI", + }, }, }, }, @@ -2034,13 +2042,17 @@ describe("secrets runtime snapshot", () => { loadAuthStore: () => ({ version: 1, profiles: {} }), }); - expect(snapshot.config.channels?.discord?.voice?.tts?.openai?.apiKey).toBe("base-tts-openai"); + expect(snapshot.config.channels?.discord?.voice?.tts?.providers?.openai?.apiKey).toBe( + "base-tts-openai", + ); expect(snapshot.config.channels?.discord?.pluralkit?.token).toBe("base-pk-token"); expect( - snapshot.config.channels?.discord?.accounts?.enabledOverride?.voice?.tts?.openai?.apiKey, + snapshot.config.channels?.discord?.accounts?.enabledOverride?.voice?.tts?.providers?.openai + ?.apiKey, ).toBe("enabled-override-tts-openai"); expect( - snapshot.config.channels?.discord?.accounts?.disabledOverride?.voice?.tts?.openai?.apiKey, + 
snapshot.config.channels?.discord?.accounts?.disabledOverride?.voice?.tts?.providers?.openai + ?.apiKey, ).toEqual({ source: "env", provider: "default", @@ -2055,7 +2067,7 @@ describe("secrets runtime snapshot", () => { ); expect(snapshot.warnings.map((warning) => warning.path)).toEqual( expect.arrayContaining([ - "channels.discord.accounts.disabledOverride.voice.tts.openai.apiKey", + "channels.discord.accounts.disabledOverride.voice.tts.providers.openai.apiKey", "channels.discord.accounts.disabledOverride.pluralkit.token", ]), ); @@ -2068,11 +2080,13 @@ describe("secrets runtime snapshot", () => { discord: { voice: { tts: { - openai: { - apiKey: { - source: "env", - provider: "default", - id: "DISCORD_UNUSED_BASE_TTS_OPENAI", + providers: { + openai: { + apiKey: { + source: "env", + provider: "default", + id: "DISCORD_UNUSED_BASE_TTS_OPENAI", + }, }, }, }, @@ -2082,11 +2096,13 @@ describe("secrets runtime snapshot", () => { enabled: true, voice: { tts: { - openai: { - apiKey: { - source: "env", - provider: "default", - id: "DISCORD_ENABLED_ONLY_TTS_OPENAI", + providers: { + openai: { + apiKey: { + source: "env", + provider: "default", + id: "DISCORD_ENABLED_ONLY_TTS_OPENAI", + }, }, }, }, @@ -2107,15 +2123,16 @@ describe("secrets runtime snapshot", () => { }); expect( - snapshot.config.channels?.discord?.accounts?.enabledOverride?.voice?.tts?.openai?.apiKey, + snapshot.config.channels?.discord?.accounts?.enabledOverride?.voice?.tts?.providers?.openai + ?.apiKey, ).toBe("enabled-only-tts-openai"); - expect(snapshot.config.channels?.discord?.voice?.tts?.openai?.apiKey).toEqual({ + expect(snapshot.config.channels?.discord?.voice?.tts?.providers?.openai?.apiKey).toEqual({ source: "env", provider: "default", id: "DISCORD_UNUSED_BASE_TTS_OPENAI", }); expect(snapshot.warnings.map((warning) => warning.path)).toContain( - "channels.discord.voice.tts.openai.apiKey", + "channels.discord.voice.tts.providers.openai.apiKey", ); }); @@ -2127,8 +2144,10 @@ describe("secrets 
runtime snapshot", () => { discord: { voice: { tts: { - openai: { - apiKey: { source: "env", provider: "default", id: "DISCORD_BASE_TTS_OK" }, + providers: { + openai: { + apiKey: { source: "env", provider: "default", id: "DISCORD_BASE_TTS_OK" }, + }, }, }, }, @@ -2137,11 +2156,13 @@ describe("secrets runtime snapshot", () => { enabled: true, voice: { tts: { - openai: { - apiKey: { - source: "env", - provider: "default", - id: "DISCORD_ENABLED_OVERRIDE_TTS_MISSING", + providers: { + openai: { + apiKey: { + source: "env", + provider: "default", + id: "DISCORD_ENABLED_OVERRIDE_TTS_MISSING", + }, }, }, }, diff --git a/src/secrets/runtime.ts b/src/secrets/runtime.ts index 5b29f36233b..38f71446b41 100644 --- a/src/secrets/runtime.ts +++ b/src/secrets/runtime.ts @@ -12,6 +12,7 @@ import { setRuntimeConfigSnapshot, type OpenClawConfig, } from "../config/config.js"; +import { migrateLegacyConfig } from "../config/legacy-migrate.js"; import { resolveUserPath } from "../utils.js"; import { collectCommandSecretAssignmentsFromSnapshot, @@ -139,7 +140,9 @@ export async function prepareSecretsRuntimeSnapshot(params: { }): Promise { const runtimeEnv = mergeSecretsRuntimeEnv(params.env); const sourceConfig = structuredClone(params.config); - const resolvedConfig = structuredClone(params.config); + const resolvedConfig = structuredClone( + migrateLegacyConfig(params.config).config ?? 
params.config, + ); const context = createResolverContext({ sourceConfig, env: runtimeEnv, diff --git a/src/tts/directives.ts b/src/tts/directives.ts index df993ba1bed..ea1d7c76ef7 100644 --- a/src/tts/directives.ts +++ b/src/tts/directives.ts @@ -4,7 +4,6 @@ import { listSpeechProviders } from "./provider-registry.js"; import type { SpeechModelOverridePolicy, SpeechProviderConfig, - SpeechProviderOverrides, TtsDirectiveOverrides, TtsDirectiveParseResult, } from "./provider-types.js"; @@ -13,7 +12,6 @@ type ParseTtsDirectiveOptions = { cfg?: OpenClawConfig; providers?: readonly SpeechProviderPlugin[]; providerConfigs?: Record; - openaiBaseUrl?: string; }; function buildProviderOrder(left: SpeechProviderPlugin, right: SpeechProviderPlugin): number { @@ -36,49 +34,18 @@ function resolveDirectiveProviderConfig( provider: SpeechProviderPlugin, options?: ParseTtsDirectiveOptions, ): SpeechProviderConfig | undefined { - const explicit = options?.providerConfigs?.[provider.id]; - if (explicit) { - return explicit; - } - if (provider.id === "openai" && options?.openaiBaseUrl) { - return { baseUrl: options.openaiBaseUrl }; - } - return undefined; -} - -function mergeProviderOverrides( - target: TtsDirectiveOverrides, - providerId: string, - next: SpeechProviderOverrides, -): void { - target.providerOverrides = { - ...target.providerOverrides, - [providerId]: { - ...target.providerOverrides?.[providerId], - ...next, - }, - }; -} - -function resolveLegacyOptions( - optionsOrOpenaiBaseUrl?: ParseTtsDirectiveOptions | string, -): ParseTtsDirectiveOptions | undefined { - if (typeof optionsOrOpenaiBaseUrl === "string") { - return { openaiBaseUrl: optionsOrOpenaiBaseUrl }; - } - return optionsOrOpenaiBaseUrl; + return options?.providerConfigs?.[provider.id]; } export function parseTtsDirectives( text: string, policy: SpeechModelOverridePolicy, - optionsOrOpenaiBaseUrl?: ParseTtsDirectiveOptions | string, + options?: ParseTtsDirectiveOptions, ): TtsDirectiveParseResult { if 
(!policy.enabled) { return { cleanedText: text, overrides: {}, warnings: [], hasDirective: false }; } - const options = resolveLegacyOptions(optionsOrOpenaiBaseUrl); const providers = resolveDirectiveProviders(options); const overrides: TtsDirectiveOverrides = {}; const warnings: string[] = []; @@ -135,7 +102,13 @@ export function parseTtsDirectives( } handled = true; if (parsed.overrides) { - mergeProviderOverrides(overrides, provider.id, parsed.overrides); + overrides.providerOverrides = { + ...overrides.providerOverrides, + [provider.id]: { + ...overrides.providerOverrides?.[provider.id], + ...parsed.overrides, + }, + }; } if (parsed.warnings?.length) { warnings.push(...parsed.warnings); diff --git a/src/tts/tts.test.ts b/src/tts/tts.test.ts index d78e9494bca..a3ae013a779 100644 --- a/src/tts/tts.test.ts +++ b/src/tts/tts.test.ts @@ -86,7 +86,6 @@ const { parseTtsDirectives, resolveModelOverridePolicy, summarizeText, - resolveOutputFormat, getResolvedSpeechProviderConfig, } = _test; @@ -234,65 +233,6 @@ describe("tts", () => { }); }); - describe("resolveOutputFormat", () => { - it("selects opus for opus channels (telegram/feishu/whatsapp/matrix) and mp3 for others", () => { - const cases = [ - { - channel: "telegram", - expected: { - openai: "opus", - elevenlabs: "opus_48000_64", - extension: ".opus", - voiceCompatible: true, - }, - }, - { - channel: "feishu", - expected: { - openai: "opus", - elevenlabs: "opus_48000_64", - extension: ".opus", - voiceCompatible: true, - }, - }, - { - channel: "whatsapp", - expected: { - openai: "opus", - elevenlabs: "opus_48000_64", - extension: ".opus", - voiceCompatible: true, - }, - }, - { - channel: "matrix", - expected: { - openai: "opus", - elevenlabs: "opus_48000_64", - extension: ".opus", - voiceCompatible: true, - }, - }, - { - channel: "discord", - expected: { - openai: "mp3", - elevenlabs: "mp3_44100_128", - extension: ".mp3", - voiceCompatible: false, - }, - }, - ] as const; - for (const testCase of cases) { - const 
output = resolveOutputFormat(testCase.channel); - expect(output.openai, testCase.channel).toBe(testCase.expected.openai); - expect(output.elevenlabs, testCase.channel).toBe(testCase.expected.elevenlabs); - expect(output.extension, testCase.channel).toBe(testCase.expected.extension); - expect(output.voiceCompatible, testCase.channel).toBe(testCase.expected.voiceCompatible); - } - }); - }); - describe("resolveEdgeOutputFormat", () => { const baseCfg: OpenClawConfig = { agents: { defaults: { model: { primary: "openai/gpt-4o-mini" } } }, @@ -383,9 +323,11 @@ describe("tts", () => { it("accepts custom voices and models when openaiBaseUrl is a non-default endpoint", () => { const policy = resolveModelOverridePolicy({ enabled: true }); const input = "Hello [[tts:voice=kokoro-chinese model=kokoro-v1]] world"; - const customBaseUrl = "http://localhost:8880/v1"; - - const result = parseTtsDirectives(input, policy, customBaseUrl); + const result = parseTtsDirectives(input, policy, { + providerConfigs: { + openai: { baseUrl: "http://localhost:8880/v1" }, + }, + }); const openaiOverrides = result.overrides.providerOverrides?.openai as | { voice?: string; model?: string } | undefined; @@ -398,9 +340,11 @@ describe("tts", () => { it("rejects unknown voices and models when openaiBaseUrl is the default OpenAI endpoint", () => { const policy = resolveModelOverridePolicy({ enabled: true }); const input = "Hello [[tts:voice=kokoro-chinese model=kokoro-v1]] world"; - const defaultBaseUrl = "https://api.openai.com/v1"; - - const result = parseTtsDirectives(input, policy, defaultBaseUrl); + const result = parseTtsDirectives(input, policy, { + providerConfigs: { + openai: { baseUrl: "https://api.openai.com/v1" }, + }, + }); const openaiOverrides = result.overrides.providerOverrides?.openai as | { voice?: string } | undefined; diff --git a/src/tts/tts.ts b/src/tts/tts.ts index 479c2ecd129..f3f32b438db 100644 --- a/src/tts/tts.ts +++ b/src/tts/tts.ts @@ -48,22 +48,6 @@ const 
DEFAULT_TTS_MAX_LENGTH = 1500; const DEFAULT_TTS_SUMMARIZE = true; const DEFAULT_MAX_TEXT_LENGTH = 4096; -const OPUS_OUTPUT = { - openai: "opus" as const, - // ElevenLabs output formats use codec_sample_rate_bitrate naming. - // Opus @ 48kHz/64kbps is a good voice message tradeoff. - elevenlabs: "opus_48000_64", - extension: ".opus", - voiceCompatible: true, -}; - -const DEFAULT_OUTPUT = { - openai: "mp3" as const, - elevenlabs: "mp3_44100_128", - extension: ".mp3", - voiceCompatible: false, -}; - export type ResolvedTtsConfig = { auto: TtsAutoMode; mode: TtsMode; @@ -418,16 +402,9 @@ export function setLastTtsAttempt(entry: TtsStatusEntry | undefined): void { lastTtsAttempt = entry; } -/** Channels that require opus audio */ +/** Channels that require voice-note-compatible audio */ const OPUS_CHANNELS = new Set(["telegram", "feishu", "whatsapp", "matrix"]); -function resolveOutputFormat(channelId?: string | null) { - if (channelId && OPUS_CHANNELS.has(channelId)) { - return OPUS_OUTPUT; - } - return DEFAULT_OUTPUT; -} - function resolveChannelId(channel: string | undefined): ChannelId | null { return channel ? normalizeChannelId(channel) : null; } @@ -876,6 +853,5 @@ export const _test = { parseTtsDirectives, resolveModelOverridePolicy, summarizeText, - resolveOutputFormat, getResolvedSpeechProviderConfig, };