diff --git a/src/extension-host/tts-config.ts b/src/extension-host/tts-config.ts new file mode 100644 index 00000000000..55767aedad2 --- /dev/null +++ b/src/extension-host/tts-config.ts @@ -0,0 +1,193 @@ +import type { OpenClawConfig } from "../config/config.js"; +import { normalizeResolvedSecretInputString } from "../config/types.secrets.js"; +import type { + TtsAutoMode, + TtsConfig, + TtsMode, + TtsModelOverrideConfig, + TtsProvider, +} from "../config/types.tts.js"; +import { normalizeExtensionHostTtsAutoMode } from "./tts-preferences.js"; + +export const DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1"; + +const DEFAULT_TIMEOUT_MS = 30_000; +const DEFAULT_MAX_TEXT_LENGTH = 4096; +const DEFAULT_ELEVENLABS_BASE_URL = "https://api.elevenlabs.io"; +const DEFAULT_ELEVENLABS_VOICE_ID = "pMsXgVXv3BLzUgSXRplE"; +const DEFAULT_ELEVENLABS_MODEL_ID = "eleven_multilingual_v2"; +const DEFAULT_OPENAI_MODEL = "gpt-4o-mini-tts"; +const DEFAULT_OPENAI_VOICE = "alloy"; +const DEFAULT_EDGE_VOICE = "en-US-MichelleNeural"; +const DEFAULT_EDGE_LANG = "en-US"; +const DEFAULT_EDGE_OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3"; + +const DEFAULT_ELEVENLABS_VOICE_SETTINGS = { + stability: 0.5, + similarityBoost: 0.75, + style: 0.0, + useSpeakerBoost: true, + speed: 1.0, +}; + +export type ResolvedTtsConfig = { + auto: TtsAutoMode; + mode: TtsMode; + provider: TtsProvider; + providerSource: "config" | "default"; + summaryModel?: string; + modelOverrides: ResolvedTtsModelOverrides; + elevenlabs: { + apiKey?: string; + baseUrl: string; + voiceId: string; + modelId: string; + seed?: number; + applyTextNormalization?: "auto" | "on" | "off"; + languageCode?: string; + voiceSettings: { + stability: number; + similarityBoost: number; + style: number; + useSpeakerBoost: boolean; + speed: number; + }; + }; + openai: { + apiKey?: string; + baseUrl: string; + model: string; + voice: string; + speed?: number; + instructions?: string; + }; + edge: { + enabled: boolean; + voice: string; + lang: string; + outputFormat: string; + outputFormatConfigured: boolean; + pitch?: string; + rate?: string; + volume?: string; + saveSubtitles: boolean; + proxy?: string; + timeoutMs?: number; + }; + prefsPath?: string; + maxTextLength: number; + timeoutMs: number; +}; + +export type ResolvedTtsModelOverrides = { + enabled: boolean; + allowText: boolean; + allowProvider: boolean; + allowVoice: boolean; + allowModelId: boolean; + allowVoiceSettings: boolean; + allowNormalization: boolean; + allowSeed: boolean; +}; + +export const normalizeExtensionHostTtsConfigAutoMode = normalizeExtensionHostTtsAutoMode; + +export function resolveExtensionHostTtsModelOverridePolicy( + overrides: TtsModelOverrideConfig | undefined, +): ResolvedTtsModelOverrides { + const enabled = overrides?.enabled ?? true; + if (!enabled) { + return { + enabled: false, + allowText: false, + allowProvider: false, + allowVoice: false, + allowModelId: false, + allowVoiceSettings: false, + allowNormalization: false, + allowSeed: false, + }; + } + const allow = (value: boolean | undefined, defaultValue = true) => value ?? defaultValue; + return { + enabled: true, + allowText: allow(overrides?.allowText), + allowProvider: allow(overrides?.allowProvider, false), + allowVoice: allow(overrides?.allowVoice), + allowModelId: allow(overrides?.allowModelId), + allowVoiceSettings: allow(overrides?.allowVoiceSettings), + allowNormalization: allow(overrides?.allowNormalization), + allowSeed: allow(overrides?.allowSeed), + }; +} + +export function resolveExtensionHostTtsConfig(cfg: OpenClawConfig): ResolvedTtsConfig { + const raw: TtsConfig = cfg.messages?.tts ?? {}; + const providerSource = raw.provider ? "config" : "default"; + const edgeOutputFormat = raw.edge?.outputFormat?.trim(); + const auto = + normalizeExtensionHostTtsConfigAutoMode(raw.auto) ?? (raw.enabled ? "always" : "off"); + return { + auto, + mode: raw.mode ?? "final", + provider: raw.provider ?? "edge", + providerSource, + summaryModel: raw.summaryModel?.trim() || undefined, + modelOverrides: resolveExtensionHostTtsModelOverridePolicy(raw.modelOverrides), + elevenlabs: { + apiKey: normalizeResolvedSecretInputString({ + value: raw.elevenlabs?.apiKey, + path: "messages.tts.elevenlabs.apiKey", + }), + baseUrl: raw.elevenlabs?.baseUrl?.trim() || DEFAULT_ELEVENLABS_BASE_URL, + voiceId: raw.elevenlabs?.voiceId ?? DEFAULT_ELEVENLABS_VOICE_ID, + modelId: raw.elevenlabs?.modelId ?? DEFAULT_ELEVENLABS_MODEL_ID, + seed: raw.elevenlabs?.seed, + applyTextNormalization: raw.elevenlabs?.applyTextNormalization, + languageCode: raw.elevenlabs?.languageCode, + voiceSettings: { + stability: + raw.elevenlabs?.voiceSettings?.stability ?? DEFAULT_ELEVENLABS_VOICE_SETTINGS.stability, + similarityBoost: + raw.elevenlabs?.voiceSettings?.similarityBoost ?? + DEFAULT_ELEVENLABS_VOICE_SETTINGS.similarityBoost, + style: raw.elevenlabs?.voiceSettings?.style ?? DEFAULT_ELEVENLABS_VOICE_SETTINGS.style, + useSpeakerBoost: + raw.elevenlabs?.voiceSettings?.useSpeakerBoost ?? + DEFAULT_ELEVENLABS_VOICE_SETTINGS.useSpeakerBoost, + speed: raw.elevenlabs?.voiceSettings?.speed ?? DEFAULT_ELEVENLABS_VOICE_SETTINGS.speed, + }, + }, + openai: { + apiKey: normalizeResolvedSecretInputString({ + value: raw.openai?.apiKey, + path: "messages.tts.openai.apiKey", + }), + baseUrl: ( + raw.openai?.baseUrl?.trim() || + process.env.OPENAI_TTS_BASE_URL?.trim() || + DEFAULT_OPENAI_BASE_URL + ).replace(/\/+$/, ""), + model: raw.openai?.model ?? DEFAULT_OPENAI_MODEL, + voice: raw.openai?.voice ?? DEFAULT_OPENAI_VOICE, + speed: raw.openai?.speed, + instructions: raw.openai?.instructions?.trim() || undefined, + }, + edge: { + enabled: raw.edge?.enabled ?? true, + voice: raw.edge?.voice?.trim() || DEFAULT_EDGE_VOICE, + lang: raw.edge?.lang?.trim() || DEFAULT_EDGE_LANG, + outputFormat: edgeOutputFormat || DEFAULT_EDGE_OUTPUT_FORMAT, + outputFormatConfigured: Boolean(edgeOutputFormat), + pitch: raw.edge?.pitch?.trim() || undefined, + rate: raw.edge?.rate?.trim() || undefined, + volume: raw.edge?.volume?.trim() || undefined, + saveSubtitles: raw.edge?.saveSubtitles ?? false, + proxy: raw.edge?.proxy?.trim() || undefined, + timeoutMs: raw.edge?.timeoutMs, + }, + prefsPath: raw.prefsPath, + maxTextLength: raw.maxTextLength ?? DEFAULT_MAX_TEXT_LENGTH, + timeoutMs: raw.timeoutMs ?? DEFAULT_TIMEOUT_MS, + }; +} diff --git a/src/extension-host/tts-payload.ts b/src/extension-host/tts-payload.ts index 0bce003071b..3452291a9fb 100644 --- a/src/extension-host/tts-payload.ts +++ b/src/extension-host/tts-payload.ts @@ -3,7 +3,8 @@ import type { OpenClawConfig } from "../config/config.js"; import { logVerbose } from "../globals.js"; import { stripMarkdown } from "../line/markdown-to-line.js"; import { parseTtsDirectives, summarizeText } from "../tts/tts-core.js"; -import type { ResolvedTtsConfig, TtsDirectiveOverrides } from "../tts/tts.js"; +import type { TtsDirectiveOverrides } from "../tts/tts.js"; +import type { ResolvedTtsConfig } from "./tts-config.js"; import { getExtensionHostTtsMaxLength, isExtensionHostTtsSummarizationEnabled, diff --git a/src/extension-host/tts-preferences.test.ts b/src/extension-host/tts-preferences.test.ts index 2b56552d9d0..f2a9b8b4372 100644 --- a/src/extension-host/tts-preferences.test.ts +++ b/src/extension-host/tts-preferences.test.ts @@ -3,7 +3,7 @@ import os from "node:os"; import path from "node:path"; import { afterEach, describe, expect, it } from "vitest"; import { withEnv } from "../test-utils/env.js"; -import type { ResolvedTtsConfig } from "../tts/tts.js"; +import type { ResolvedTtsConfig } from "./tts-config.js"; import { getExtensionHostTtsMaxLength, isExtensionHostTtsEnabled, diff --git a/src/extension-host/tts-preferences.ts b/src/extension-host/tts-preferences.ts index 429abd56c41..cbb9eae9199 100644 --- a/src/extension-host/tts-preferences.ts +++ b/src/extension-host/tts-preferences.ts @@ -9,8 +9,8 @@ import { } from "node:fs"; import path from "node:path"; import type { TtsAutoMode, TtsProvider } from "../config/types.tts.js"; -import type { ResolvedTtsConfig } from "../tts/tts.js"; import { CONFIG_DIR, resolveUserPath } from "../utils.js"; +import type { ResolvedTtsConfig } from "./tts-config.js"; export const DEFAULT_EXTENSION_HOST_TTS_MAX_LENGTH = 1500; export const DEFAULT_EXTENSION_HOST_TTS_SUMMARIZE = true; diff --git a/src/extension-host/tts-runtime-execution.ts b/src/extension-host/tts-runtime-execution.ts index d1c1c1b1c16..1b60e39f99e 100644 --- a/src/extension-host/tts-runtime-execution.ts +++ b/src/extension-host/tts-runtime-execution.ts @@ -11,12 +11,8 @@ import { openaiTTS, scheduleCleanup, } from "../tts/tts-core.js"; -import type { - ResolvedTtsConfig, - TtsDirectiveOverrides, - TtsResult, - TtsTelephonyResult, -} from "../tts/tts.js"; +import type { TtsDirectiveOverrides, TtsResult, TtsTelephonyResult } from "../tts/tts.js"; +import type { ResolvedTtsConfig } from "./tts-config.js"; import { resolveExtensionHostTtsApiKey, supportsExtensionHostTtsTelephony, diff --git a/src/extension-host/tts-runtime-registry.ts b/src/extension-host/tts-runtime-registry.ts index 05c491a6d7b..331b97cebbe 100644 --- a/src/extension-host/tts-runtime-registry.ts +++ b/src/extension-host/tts-runtime-registry.ts @@ -1,5 +1,5 @@ import type { TtsProvider } from "../config/types.tts.js"; -import type { ResolvedTtsConfig } from "../tts/tts.js"; +import type { ResolvedTtsConfig } from "./tts-config.js"; export type ExtensionHostTtsRuntimeProvider = { id: TtsProvider; diff --git a/src/extension-host/tts-runtime-setup.test.ts b/src/extension-host/tts-runtime-setup.test.ts index 4f02fa60325..02755343c21 100644 --- a/src/extension-host/tts-runtime-setup.test.ts +++ b/src/extension-host/tts-runtime-setup.test.ts @@ -3,7 +3,7 @@ import os from "node:os"; import path from "node:path"; import { afterEach, describe, expect, it } from "vitest"; import { withEnv } from "../test-utils/env.js"; -import type { ResolvedTtsConfig } from "../tts/tts.js"; +import type { ResolvedTtsConfig } from "./tts-config.js"; import { resolveExtensionHostTtsProvider, resolveExtensionHostTtsRequestSetup, diff --git a/src/extension-host/tts-runtime-setup.ts b/src/extension-host/tts-runtime-setup.ts index f9521070dba..06e062ff3d7 100644 --- a/src/extension-host/tts-runtime-setup.ts +++ b/src/extension-host/tts-runtime-setup.ts @@ -1,6 +1,6 @@ import { existsSync, readFileSync } from "node:fs"; import type { TtsProvider } from "../config/types.tts.js"; -import type { ResolvedTtsConfig } from "../tts/tts.js"; +import type { ResolvedTtsConfig } from "./tts-config.js"; import { resolveExtensionHostTtsApiKey, resolveExtensionHostTtsProviderOrder, diff --git a/src/extension-host/tts-status.ts b/src/extension-host/tts-status.ts index 438177764fa..ec2d9c2ea38 100644 --- a/src/extension-host/tts-status.ts +++ b/src/extension-host/tts-status.ts @@ -1,5 +1,5 @@ import type { TtsProvider } from "../config/types.tts.js"; -import type { ResolvedTtsConfig } from "../tts/tts.js"; +import type { ResolvedTtsConfig } from "./tts-config.js"; import { getExtensionHostTtsMaxLength, isExtensionHostTtsEnabled, diff --git a/src/tts/tts-core.ts b/src/tts/tts-core.ts index 5d3000d7ad3..98e6dc1b3b3 100644 --- a/src/tts/tts-core.ts +++ b/src/tts/tts-core.ts @@ -12,15 +12,14 @@ import { import { createConfiguredOllamaStreamFn } from "../agents/ollama-stream.js"; import { resolveModelAsync } from "../agents/pi-embedded-runner/model.js"; import type { OpenClawConfig } from "../config/config.js"; -import type { - ResolvedTtsConfig, - ResolvedTtsModelOverrides, - TtsDirectiveOverrides, - TtsDirectiveParseResult, -} from "./tts.js"; +import { + DEFAULT_OPENAI_BASE_URL, + type ResolvedTtsConfig, + type ResolvedTtsModelOverrides, +} from "../extension-host/tts-config.js"; +import type { TtsDirectiveOverrides, TtsDirectiveParseResult } from "./tts.js"; const DEFAULT_ELEVENLABS_BASE_URL = "https://api.elevenlabs.io"; -export const DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1"; const TEMP_FILE_CLEANUP_DELAY_MS = 5 * 60 * 1000; // 5 minutes export function isValidVoiceId(voiceId: string): boolean { diff --git a/src/tts/tts.ts b/src/tts/tts.ts index 6737cb4ee04..e9fc59c31e3 100644 --- a/src/tts/tts.ts +++ b/src/tts/tts.ts @@ -1,19 +1,17 @@ import type { ReplyPayload } from "../auto-reply/types.js"; import type { OpenClawConfig } from "../config/config.js"; -import { normalizeResolvedSecretInputString } from "../config/types.secrets.js"; -import type { - TtsConfig, - TtsAutoMode, - TtsMode, - TtsProvider, - TtsModelOverrideConfig, -} from "../config/types.tts.js"; +import type { TtsProvider } from "../config/types.tts.js"; +import { + normalizeExtensionHostTtsConfigAutoMode, + resolveExtensionHostTtsConfig, + resolveExtensionHostTtsModelOverridePolicy, + type ResolvedTtsConfig, +} from "../extension-host/tts-config.js"; import { resolveExtensionHostTtsPayloadPlan } from "../extension-host/tts-payload.js"; import { getExtensionHostTtsMaxLength, isExtensionHostTtsEnabled, isExtensionHostTtsSummarizationEnabled, - normalizeExtensionHostTtsAutoMode, resolveExtensionHostTtsAutoMode, resolveExtensionHostTtsPrefsPath, setExtensionHostTtsAutoMode, @@ -46,7 +44,6 @@ import { } from "../extension-host/tts-status.js"; import { logVerbose } from "../globals.js"; import { - DEFAULT_OPENAI_BASE_URL, isValidOpenAIModel, isValidOpenAIVoice, isValidVoiceId, @@ -58,86 +55,6 @@ import { } from "./tts-core.js"; export { OPENAI_TTS_MODELS, OPENAI_TTS_VOICES } from "./tts-core.js"; -const DEFAULT_TIMEOUT_MS = 30_000; -const DEFAULT_MAX_TEXT_LENGTH = 4096; - -const DEFAULT_ELEVENLABS_BASE_URL = "https://api.elevenlabs.io"; -const DEFAULT_ELEVENLABS_VOICE_ID = "pMsXgVXv3BLzUgSXRplE"; -const DEFAULT_ELEVENLABS_MODEL_ID = "eleven_multilingual_v2"; -const DEFAULT_OPENAI_MODEL = "gpt-4o-mini-tts"; -const DEFAULT_OPENAI_VOICE = "alloy"; -const DEFAULT_EDGE_VOICE = "en-US-MichelleNeural"; -const DEFAULT_EDGE_LANG = "en-US"; -const DEFAULT_EDGE_OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3"; - -const DEFAULT_ELEVENLABS_VOICE_SETTINGS = { - stability: 0.5, - similarityBoost: 0.75, - style: 0.0, - useSpeakerBoost: true, - speed: 1.0, -}; - -export type ResolvedTtsConfig = { - auto: TtsAutoMode; - mode: TtsMode; - provider: TtsProvider; - providerSource: "config" | "default"; - summaryModel?: string; - modelOverrides: ResolvedTtsModelOverrides; - elevenlabs: { - apiKey?: string; - baseUrl: string; - voiceId: string; - modelId: string; - seed?: number; - applyTextNormalization?: "auto" | "on" | "off"; - languageCode?: string; - voiceSettings: { - stability: number; - similarityBoost: number; - style: number; - useSpeakerBoost: boolean; - speed: number; - }; - }; - openai: { - apiKey?: string; - baseUrl: string; - model: string; - voice: string; - speed?: number; - instructions?: string; - }; - edge: { - enabled: boolean; - voice: string; - lang: string; - outputFormat: string; - outputFormatConfigured: boolean; - pitch?: string; - rate?: string; - volume?: string; - saveSubtitles: boolean; - proxy?: string; - timeoutMs?: number; - }; - prefsPath?: string; - maxTextLength: number; - timeoutMs: number; -}; - -export type ResolvedTtsModelOverrides = { - enabled: boolean; - allowText: boolean; - allowProvider: boolean; - allowVoice: boolean; - allowModelId: boolean; - allowVoiceSettings: boolean; - allowNormalization: boolean; - allowSeed: boolean; -}; - export type TtsDirectiveOverrides = { ttsText?: string; provider?: TtsProvider; @@ -185,108 +102,9 @@ export type TtsTelephonyResult = { type TtsStatusEntry = ExtensionHostTtsStatusEntry; -export const normalizeTtsAutoMode = normalizeExtensionHostTtsAutoMode; +export const normalizeTtsAutoMode = normalizeExtensionHostTtsConfigAutoMode; -function resolveModelOverridePolicy( - overrides: TtsModelOverrideConfig | undefined, -): ResolvedTtsModelOverrides { - const enabled = overrides?.enabled ?? true; - if (!enabled) { - return { - enabled: false, - allowText: false, - allowProvider: false, - allowVoice: false, - allowModelId: false, - allowVoiceSettings: false, - allowNormalization: false, - allowSeed: false, - }; - } - const allow = (value: boolean | undefined, defaultValue = true) => value ?? defaultValue; - return { - enabled: true, - allowText: allow(overrides?.allowText), - // Provider switching is higher-impact than voice/style tweaks; keep opt-in. - allowProvider: allow(overrides?.allowProvider, false), - allowVoice: allow(overrides?.allowVoice), - allowModelId: allow(overrides?.allowModelId), - allowVoiceSettings: allow(overrides?.allowVoiceSettings), - allowNormalization: allow(overrides?.allowNormalization), - allowSeed: allow(overrides?.allowSeed), - }; -} - -export function resolveTtsConfig(cfg: OpenClawConfig): ResolvedTtsConfig { - const raw: TtsConfig = cfg.messages?.tts ?? {}; - const providerSource = raw.provider ? "config" : "default"; - const edgeOutputFormat = raw.edge?.outputFormat?.trim(); - const auto = normalizeTtsAutoMode(raw.auto) ?? (raw.enabled ? "always" : "off"); - return { - auto, - mode: raw.mode ?? "final", - provider: raw.provider ?? "edge", - providerSource, - summaryModel: raw.summaryModel?.trim() || undefined, - modelOverrides: resolveModelOverridePolicy(raw.modelOverrides), - elevenlabs: { - apiKey: normalizeResolvedSecretInputString({ - value: raw.elevenlabs?.apiKey, - path: "messages.tts.elevenlabs.apiKey", - }), - baseUrl: raw.elevenlabs?.baseUrl?.trim() || DEFAULT_ELEVENLABS_BASE_URL, - voiceId: raw.elevenlabs?.voiceId ?? DEFAULT_ELEVENLABS_VOICE_ID, - modelId: raw.elevenlabs?.modelId ?? DEFAULT_ELEVENLABS_MODEL_ID, - seed: raw.elevenlabs?.seed, - applyTextNormalization: raw.elevenlabs?.applyTextNormalization, - languageCode: raw.elevenlabs?.languageCode, - voiceSettings: { - stability: - raw.elevenlabs?.voiceSettings?.stability ?? DEFAULT_ELEVENLABS_VOICE_SETTINGS.stability, - similarityBoost: - raw.elevenlabs?.voiceSettings?.similarityBoost ?? - DEFAULT_ELEVENLABS_VOICE_SETTINGS.similarityBoost, - style: raw.elevenlabs?.voiceSettings?.style ?? DEFAULT_ELEVENLABS_VOICE_SETTINGS.style, - useSpeakerBoost: - raw.elevenlabs?.voiceSettings?.useSpeakerBoost ?? - DEFAULT_ELEVENLABS_VOICE_SETTINGS.useSpeakerBoost, - speed: raw.elevenlabs?.voiceSettings?.speed ?? DEFAULT_ELEVENLABS_VOICE_SETTINGS.speed, - }, - }, - openai: { - apiKey: normalizeResolvedSecretInputString({ - value: raw.openai?.apiKey, - path: "messages.tts.openai.apiKey", - }), - // Config > env var > default; strip trailing slashes for consistency. - baseUrl: ( - raw.openai?.baseUrl?.trim() || - process.env.OPENAI_TTS_BASE_URL?.trim() || - DEFAULT_OPENAI_BASE_URL - ).replace(/\/+$/, ""), - model: raw.openai?.model ?? DEFAULT_OPENAI_MODEL, - voice: raw.openai?.voice ?? DEFAULT_OPENAI_VOICE, - speed: raw.openai?.speed, - instructions: raw.openai?.instructions?.trim() || undefined, - }, - edge: { - enabled: raw.edge?.enabled ?? true, - voice: raw.edge?.voice?.trim() || DEFAULT_EDGE_VOICE, - lang: raw.edge?.lang?.trim() || DEFAULT_EDGE_LANG, - outputFormat: edgeOutputFormat || DEFAULT_EDGE_OUTPUT_FORMAT, - outputFormatConfigured: Boolean(edgeOutputFormat), - pitch: raw.edge?.pitch?.trim() || undefined, - rate: raw.edge?.rate?.trim() || undefined, - volume: raw.edge?.volume?.trim() || undefined, - saveSubtitles: raw.edge?.saveSubtitles ?? false, - proxy: raw.edge?.proxy?.trim() || undefined, - timeoutMs: raw.edge?.timeoutMs, - }, - prefsPath: raw.prefsPath, - maxTextLength: raw.maxTextLength ?? DEFAULT_MAX_TEXT_LENGTH, - timeoutMs: raw.timeoutMs ?? DEFAULT_TIMEOUT_MS, - }; -} +export const resolveTtsConfig = resolveExtensionHostTtsConfig; export const resolveTtsPrefsPath = resolveExtensionHostTtsPrefsPath; @@ -475,7 +293,7 @@ export const _test = { OPENAI_TTS_VOICES, resolveOpenAITtsInstructions, parseTtsDirectives, - resolveModelOverridePolicy, + resolveModelOverridePolicy: resolveExtensionHostTtsModelOverridePolicy, summarizeText, resolveOutputFormat: resolveExtensionHostTtsOutputFormat, resolveEdgeOutputFormat: resolveExtensionHostEdgeOutputFormat,