TTS: extract config surface

This commit is contained in:
Gustavo Madeira Santana 2026-03-15 20:21:26 +00:00
parent 60156bdfe9
commit ce5019e4ef
11 changed files with 219 additions and 212 deletions

View File

@ -0,0 +1,193 @@
import type { OpenClawConfig } from "../config/config.js";
import { normalizeResolvedSecretInputString } from "../config/types.secrets.js";
import type {
TtsAutoMode,
TtsConfig,
TtsMode,
TtsModelOverrideConfig,
TtsProvider,
} from "../config/types.tts.js";
import { normalizeExtensionHostTtsAutoMode } from "./tts-preferences.js";
/**
 * Built-in OpenAI base URL. `resolveExtensionHostTtsConfig` uses it as the last fallback for
 * `openai.baseUrl`, after the configured value and the `OPENAI_TTS_BASE_URL` environment variable.
 * Exported for use outside this module.
 */
export const DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1";
// Top-level fallbacks for `timeoutMs` and `maxTextLength` when the config omits them.
const DEFAULT_TIMEOUT_MS = 30_000;
const DEFAULT_MAX_TEXT_LENGTH = 4096;
// ElevenLabs fallbacks for base URL, voice id, and model id.
const DEFAULT_ELEVENLABS_BASE_URL = "https://api.elevenlabs.io";
const DEFAULT_ELEVENLABS_VOICE_ID = "pMsXgVXv3BLzUgSXRplE";
const DEFAULT_ELEVENLABS_MODEL_ID = "eleven_multilingual_v2";
// OpenAI fallbacks for model and voice.
const DEFAULT_OPENAI_MODEL = "gpt-4o-mini-tts";
const DEFAULT_OPENAI_VOICE = "alloy";
// Edge fallbacks for voice, language, and output format.
const DEFAULT_EDGE_VOICE = "en-US-MichelleNeural";
const DEFAULT_EDGE_LANG = "en-US";
const DEFAULT_EDGE_OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3";
// Fallbacks for ElevenLabs `voiceSettings`; the resolver applies each field independently,
// so a partially specified `voiceSettings` only overrides the fields it sets.
const DEFAULT_ELEVENLABS_VOICE_SETTINGS = {
stability: 0.5,
similarityBoost: 0.75,
style: 0.0,
useSpeakerBoost: true,
speed: 1.0,
};
/**
 * Fully resolved TTS settings, as returned by `resolveExtensionHostTtsConfig`.
 *
 * Fields the resolver backs with a default are required here; fields it passes through from the
 * raw config without a fallback remain optional.
 */
export type ResolvedTtsConfig = {
/**
 * Result of normalizing the configured `auto` value; when that yields nothing, "always" if the
 * config's `enabled` flag is truthy and "off" otherwise.
 */
auto: TtsAutoMode;
/** Configured mode; "final" when unset. */
mode: TtsMode;
/** Configured provider; "edge" when unset. */
provider: TtsProvider;
/** "config" when the config named a provider, "default" when the fallback was used. */
providerSource: "config" | "default";
/** Trimmed summary model; undefined when unset or blank. */
summaryModel?: string;
/** Effective override permissions derived from the config's `modelOverrides`. */
modelOverrides: ResolvedTtsModelOverrides;
elevenlabs: {
/** Normalized secret from `messages.tts.elevenlabs.apiKey`. */
apiKey?: string;
/** Trimmed configured base URL, or the built-in ElevenLabs default when unset or blank. */
baseUrl: string;
voiceId: string;
modelId: string;
seed?: number;
applyTextNormalization?: "auto" | "on" | "off";
languageCode?: string;
/** Each field falls back to its own built-in default when not configured. */
voiceSettings: {
stability: number;
similarityBoost: number;
style: number;
useSpeakerBoost: boolean;
speed: number;
};
};
openai: {
/** Normalized secret from `messages.tts.openai.apiKey`. */
apiKey?: string;
/**
 * First non-blank of: configured value, `OPENAI_TTS_BASE_URL` env var, built-in default.
 * Trailing slashes are stripped.
 */
baseUrl: string;
model: string;
voice: string;
speed?: number;
/** Trimmed instructions; undefined when unset or blank. */
instructions?: string;
};
edge: {
/** Defaults to true when unset. */
enabled: boolean;
voice: string;
lang: string;
/** Trimmed configured format, or the built-in default when unset or blank. */
outputFormat: string;
/** True only when the config supplied a non-blank `outputFormat`. */
outputFormatConfigured: boolean;
pitch?: string;
rate?: string;
volume?: string;
/** Defaults to false when unset. */
saveSubtitles: boolean;
proxy?: string;
/** Passed through from the config's `edge.timeoutMs`; no fallback is applied by the resolver. */
timeoutMs?: number;
};
/** Passed through unchanged from the config. */
prefsPath?: string;
maxTextLength: number;
timeoutMs: number;
};
/**
 * Effective override permissions, as returned by `resolveExtensionHostTtsModelOverridePolicy`.
 *
 * When `enabled` is false, every `allow*` flag is false as well. When `enabled` is true, each
 * `allow*` flag is the configured value, defaulting to true — except `allowProvider`, which
 * defaults to false.
 */
export type ResolvedTtsModelOverrides = {
enabled: boolean;
allowText: boolean;
/** Opt-in: false unless the config explicitly sets it to true. */
allowProvider: boolean;
allowVoice: boolean;
allowModelId: boolean;
allowVoiceSettings: boolean;
allowNormalization: boolean;
allowSeed: boolean;
};
/**
 * Alias of `normalizeExtensionHostTtsAutoMode` from `./tts-preferences.js`, re-exported under a
 * config-surface name; `resolveExtensionHostTtsConfig` uses it to normalize the configured `auto`.
 */
export const normalizeExtensionHostTtsConfigAutoMode = normalizeExtensionHostTtsAutoMode;
/**
 * Turns the optional `modelOverrides` config into a policy with every flag made explicit.
 *
 * @param overrides - Raw override settings from the config; may be undefined.
 * @returns A policy in which `enabled` defaults to true. When overrides are disabled, every
 * `allow*` flag is false regardless of its configured value. Otherwise each flag takes its
 * configured value, defaulting to true — except `allowProvider`, which defaults to false.
 */
export function resolveExtensionHostTtsModelOverridePolicy(
  overrides: TtsModelOverrideConfig | undefined,
): ResolvedTtsModelOverrides {
  const overridesActive = overrides?.enabled ?? true;

  if (overridesActive) {
    return {
      enabled: true,
      allowText: overrides?.allowText ?? true,
      // Provider switching is higher-impact than voice/style tweaks; keep opt-in.
      allowProvider: overrides?.allowProvider ?? false,
      allowVoice: overrides?.allowVoice ?? true,
      allowModelId: overrides?.allowModelId ?? true,
      allowVoiceSettings: overrides?.allowVoiceSettings ?? true,
      allowNormalization: overrides?.allowNormalization ?? true,
      allowSeed: overrides?.allowSeed ?? true,
    };
  }

  // Master switch is off: individual allow flags from the config are ignored.
  return {
    enabled: false,
    allowText: false,
    allowProvider: false,
    allowVoice: false,
    allowModelId: false,
    allowVoiceSettings: false,
    allowNormalization: false,
    allowSeed: false,
  };
}
/**
 * Builds the fully defaulted TTS configuration from `cfg.messages.tts`.
 *
 * Blank or whitespace-only strings are treated as unset wherever a value is trimmed. API keys go
 * through `normalizeResolvedSecretInputString`. The OpenAI base URL is resolved as
 * config > `OPENAI_TTS_BASE_URL` env var > built-in default, with trailing slashes stripped.
 *
 * @param cfg - Application config; a missing `messages.tts` section is treated as empty.
 * @returns The resolved configuration with defaults applied.
 */
export function resolveExtensionHostTtsConfig(cfg: OpenClawConfig): ResolvedTtsConfig {
  const source: TtsConfig = cfg.messages?.tts ?? {};
  const { elevenlabs, openai, edge } = source;

  // Collapse missing/blank strings to undefined so a fallback can be chained with `??`.
  const trimmed = (text: string | undefined): string | undefined => text?.trim() || undefined;

  const providerSource: ResolvedTtsConfig["providerSource"] = source.provider
    ? "config"
    : "default";
  const configuredEdgeFormat = trimmed(edge?.outputFormat);

  // An explicit `auto` wins; otherwise fall back to the boolean `enabled` flag.
  const auto =
    normalizeExtensionHostTtsConfigAutoMode(source.auto) ?? (source.enabled ? "always" : "off");
  const modelOverrides = resolveExtensionHostTtsModelOverridePolicy(source.modelOverrides);

  const voice = elevenlabs?.voiceSettings;
  const voiceDefaults = DEFAULT_ELEVENLABS_VOICE_SETTINGS;
  const elevenlabsSection: ResolvedTtsConfig["elevenlabs"] = {
    apiKey: normalizeResolvedSecretInputString({
      value: elevenlabs?.apiKey,
      path: "messages.tts.elevenlabs.apiKey",
    }),
    baseUrl: trimmed(elevenlabs?.baseUrl) ?? DEFAULT_ELEVENLABS_BASE_URL,
    voiceId: elevenlabs?.voiceId ?? DEFAULT_ELEVENLABS_VOICE_ID,
    modelId: elevenlabs?.modelId ?? DEFAULT_ELEVENLABS_MODEL_ID,
    seed: elevenlabs?.seed,
    applyTextNormalization: elevenlabs?.applyTextNormalization,
    languageCode: elevenlabs?.languageCode,
    voiceSettings: {
      stability: voice?.stability ?? voiceDefaults.stability,
      similarityBoost: voice?.similarityBoost ?? voiceDefaults.similarityBoost,
      style: voice?.style ?? voiceDefaults.style,
      useSpeakerBoost: voice?.useSpeakerBoost ?? voiceDefaults.useSpeakerBoost,
      speed: voice?.speed ?? voiceDefaults.speed,
    },
  };

  const openaiApiKey = normalizeResolvedSecretInputString({
    value: openai?.apiKey,
    path: "messages.tts.openai.apiKey",
  });
  // Config > env var > default; strip trailing slashes for consistency.
  const openaiBaseUrl =
    trimmed(openai?.baseUrl) ??
    trimmed(process.env.OPENAI_TTS_BASE_URL) ??
    DEFAULT_OPENAI_BASE_URL;
  const openaiSection: ResolvedTtsConfig["openai"] = {
    apiKey: openaiApiKey,
    baseUrl: openaiBaseUrl.replace(/\/+$/, ""),
    model: openai?.model ?? DEFAULT_OPENAI_MODEL,
    voice: openai?.voice ?? DEFAULT_OPENAI_VOICE,
    speed: openai?.speed,
    instructions: trimmed(openai?.instructions),
  };

  const edgeSection: ResolvedTtsConfig["edge"] = {
    enabled: edge?.enabled ?? true,
    voice: trimmed(edge?.voice) ?? DEFAULT_EDGE_VOICE,
    lang: trimmed(edge?.lang) ?? DEFAULT_EDGE_LANG,
    outputFormat: configuredEdgeFormat ?? DEFAULT_EDGE_OUTPUT_FORMAT,
    // Records whether the format came from the config rather than the fallback.
    outputFormatConfigured: configuredEdgeFormat !== undefined,
    pitch: trimmed(edge?.pitch),
    rate: trimmed(edge?.rate),
    volume: trimmed(edge?.volume),
    saveSubtitles: edge?.saveSubtitles ?? false,
    proxy: trimmed(edge?.proxy),
    timeoutMs: edge?.timeoutMs,
  };

  return {
    auto,
    mode: source.mode ?? "final",
    provider: source.provider ?? "edge",
    providerSource,
    summaryModel: trimmed(source.summaryModel),
    modelOverrides,
    elevenlabs: elevenlabsSection,
    openai: openaiSection,
    edge: edgeSection,
    prefsPath: source.prefsPath,
    maxTextLength: source.maxTextLength ?? DEFAULT_MAX_TEXT_LENGTH,
    timeoutMs: source.timeoutMs ?? DEFAULT_TIMEOUT_MS,
  };
}

View File

@ -3,7 +3,8 @@ import type { OpenClawConfig } from "../config/config.js";
import { logVerbose } from "../globals.js";
import { stripMarkdown } from "../line/markdown-to-line.js";
import { parseTtsDirectives, summarizeText } from "../tts/tts-core.js";
import type { ResolvedTtsConfig, TtsDirectiveOverrides } from "../tts/tts.js";
import type { TtsDirectiveOverrides } from "../tts/tts.js";
import type { ResolvedTtsConfig } from "./tts-config.js";
import {
getExtensionHostTtsMaxLength,
isExtensionHostTtsSummarizationEnabled,

View File

@ -3,7 +3,7 @@ import os from "node:os";
import path from "node:path";
import { afterEach, describe, expect, it } from "vitest";
import { withEnv } from "../test-utils/env.js";
import type { ResolvedTtsConfig } from "../tts/tts.js";
import type { ResolvedTtsConfig } from "./tts-config.js";
import {
getExtensionHostTtsMaxLength,
isExtensionHostTtsEnabled,

View File

@ -9,8 +9,8 @@ import {
} from "node:fs";
import path from "node:path";
import type { TtsAutoMode, TtsProvider } from "../config/types.tts.js";
import type { ResolvedTtsConfig } from "../tts/tts.js";
import { CONFIG_DIR, resolveUserPath } from "../utils.js";
import type { ResolvedTtsConfig } from "./tts-config.js";
export const DEFAULT_EXTENSION_HOST_TTS_MAX_LENGTH = 1500;
export const DEFAULT_EXTENSION_HOST_TTS_SUMMARIZE = true;

View File

@ -11,12 +11,8 @@ import {
openaiTTS,
scheduleCleanup,
} from "../tts/tts-core.js";
import type {
ResolvedTtsConfig,
TtsDirectiveOverrides,
TtsResult,
TtsTelephonyResult,
} from "../tts/tts.js";
import type { TtsDirectiveOverrides, TtsResult, TtsTelephonyResult } from "../tts/tts.js";
import type { ResolvedTtsConfig } from "./tts-config.js";
import {
resolveExtensionHostTtsApiKey,
supportsExtensionHostTtsTelephony,

View File

@ -1,5 +1,5 @@
import type { TtsProvider } from "../config/types.tts.js";
import type { ResolvedTtsConfig } from "../tts/tts.js";
import type { ResolvedTtsConfig } from "./tts-config.js";
export type ExtensionHostTtsRuntimeProvider = {
id: TtsProvider;

View File

@ -3,7 +3,7 @@ import os from "node:os";
import path from "node:path";
import { afterEach, describe, expect, it } from "vitest";
import { withEnv } from "../test-utils/env.js";
import type { ResolvedTtsConfig } from "../tts/tts.js";
import type { ResolvedTtsConfig } from "./tts-config.js";
import {
resolveExtensionHostTtsProvider,
resolveExtensionHostTtsRequestSetup,

View File

@ -1,6 +1,6 @@
import { existsSync, readFileSync } from "node:fs";
import type { TtsProvider } from "../config/types.tts.js";
import type { ResolvedTtsConfig } from "../tts/tts.js";
import type { ResolvedTtsConfig } from "./tts-config.js";
import {
resolveExtensionHostTtsApiKey,
resolveExtensionHostTtsProviderOrder,

View File

@ -1,5 +1,5 @@
import type { TtsProvider } from "../config/types.tts.js";
import type { ResolvedTtsConfig } from "../tts/tts.js";
import type { ResolvedTtsConfig } from "./tts-config.js";
import {
getExtensionHostTtsMaxLength,
isExtensionHostTtsEnabled,

View File

@ -12,15 +12,14 @@ import {
import { createConfiguredOllamaStreamFn } from "../agents/ollama-stream.js";
import { resolveModelAsync } from "../agents/pi-embedded-runner/model.js";
import type { OpenClawConfig } from "../config/config.js";
import type {
ResolvedTtsConfig,
ResolvedTtsModelOverrides,
TtsDirectiveOverrides,
TtsDirectiveParseResult,
} from "./tts.js";
import {
DEFAULT_OPENAI_BASE_URL,
type ResolvedTtsConfig,
type ResolvedTtsModelOverrides,
} from "../extension-host/tts-config.js";
import type { TtsDirectiveOverrides, TtsDirectiveParseResult } from "./tts.js";
const DEFAULT_ELEVENLABS_BASE_URL = "https://api.elevenlabs.io";
export const DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1";
const TEMP_FILE_CLEANUP_DELAY_MS = 5 * 60 * 1000; // 5 minutes
export function isValidVoiceId(voiceId: string): boolean {

View File

@ -1,19 +1,17 @@
import type { ReplyPayload } from "../auto-reply/types.js";
import type { OpenClawConfig } from "../config/config.js";
import { normalizeResolvedSecretInputString } from "../config/types.secrets.js";
import type {
TtsConfig,
TtsAutoMode,
TtsMode,
TtsProvider,
TtsModelOverrideConfig,
} from "../config/types.tts.js";
import type { TtsProvider } from "../config/types.tts.js";
import {
normalizeExtensionHostTtsConfigAutoMode,
resolveExtensionHostTtsConfig,
resolveExtensionHostTtsModelOverridePolicy,
type ResolvedTtsConfig,
} from "../extension-host/tts-config.js";
import { resolveExtensionHostTtsPayloadPlan } from "../extension-host/tts-payload.js";
import {
getExtensionHostTtsMaxLength,
isExtensionHostTtsEnabled,
isExtensionHostTtsSummarizationEnabled,
normalizeExtensionHostTtsAutoMode,
resolveExtensionHostTtsAutoMode,
resolveExtensionHostTtsPrefsPath,
setExtensionHostTtsAutoMode,
@ -46,7 +44,6 @@ import {
} from "../extension-host/tts-status.js";
import { logVerbose } from "../globals.js";
import {
DEFAULT_OPENAI_BASE_URL,
isValidOpenAIModel,
isValidOpenAIVoice,
isValidVoiceId,
@ -58,86 +55,6 @@ import {
} from "./tts-core.js";
export { OPENAI_TTS_MODELS, OPENAI_TTS_VOICES } from "./tts-core.js";
const DEFAULT_TIMEOUT_MS = 30_000;
const DEFAULT_MAX_TEXT_LENGTH = 4096;
const DEFAULT_ELEVENLABS_BASE_URL = "https://api.elevenlabs.io";
const DEFAULT_ELEVENLABS_VOICE_ID = "pMsXgVXv3BLzUgSXRplE";
const DEFAULT_ELEVENLABS_MODEL_ID = "eleven_multilingual_v2";
const DEFAULT_OPENAI_MODEL = "gpt-4o-mini-tts";
const DEFAULT_OPENAI_VOICE = "alloy";
const DEFAULT_EDGE_VOICE = "en-US-MichelleNeural";
const DEFAULT_EDGE_LANG = "en-US";
const DEFAULT_EDGE_OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3";
const DEFAULT_ELEVENLABS_VOICE_SETTINGS = {
stability: 0.5,
similarityBoost: 0.75,
style: 0.0,
useSpeakerBoost: true,
speed: 1.0,
};
export type ResolvedTtsConfig = {
auto: TtsAutoMode;
mode: TtsMode;
provider: TtsProvider;
providerSource: "config" | "default";
summaryModel?: string;
modelOverrides: ResolvedTtsModelOverrides;
elevenlabs: {
apiKey?: string;
baseUrl: string;
voiceId: string;
modelId: string;
seed?: number;
applyTextNormalization?: "auto" | "on" | "off";
languageCode?: string;
voiceSettings: {
stability: number;
similarityBoost: number;
style: number;
useSpeakerBoost: boolean;
speed: number;
};
};
openai: {
apiKey?: string;
baseUrl: string;
model: string;
voice: string;
speed?: number;
instructions?: string;
};
edge: {
enabled: boolean;
voice: string;
lang: string;
outputFormat: string;
outputFormatConfigured: boolean;
pitch?: string;
rate?: string;
volume?: string;
saveSubtitles: boolean;
proxy?: string;
timeoutMs?: number;
};
prefsPath?: string;
maxTextLength: number;
timeoutMs: number;
};
export type ResolvedTtsModelOverrides = {
enabled: boolean;
allowText: boolean;
allowProvider: boolean;
allowVoice: boolean;
allowModelId: boolean;
allowVoiceSettings: boolean;
allowNormalization: boolean;
allowSeed: boolean;
};
export type TtsDirectiveOverrides = {
ttsText?: string;
provider?: TtsProvider;
@ -185,108 +102,9 @@ export type TtsTelephonyResult = {
type TtsStatusEntry = ExtensionHostTtsStatusEntry;
export const normalizeTtsAutoMode = normalizeExtensionHostTtsAutoMode;
export const normalizeTtsAutoMode = normalizeExtensionHostTtsConfigAutoMode;
function resolveModelOverridePolicy(
overrides: TtsModelOverrideConfig | undefined,
): ResolvedTtsModelOverrides {
const enabled = overrides?.enabled ?? true;
if (!enabled) {
return {
enabled: false,
allowText: false,
allowProvider: false,
allowVoice: false,
allowModelId: false,
allowVoiceSettings: false,
allowNormalization: false,
allowSeed: false,
};
}
const allow = (value: boolean | undefined, defaultValue = true) => value ?? defaultValue;
return {
enabled: true,
allowText: allow(overrides?.allowText),
// Provider switching is higher-impact than voice/style tweaks; keep opt-in.
allowProvider: allow(overrides?.allowProvider, false),
allowVoice: allow(overrides?.allowVoice),
allowModelId: allow(overrides?.allowModelId),
allowVoiceSettings: allow(overrides?.allowVoiceSettings),
allowNormalization: allow(overrides?.allowNormalization),
allowSeed: allow(overrides?.allowSeed),
};
}
export function resolveTtsConfig(cfg: OpenClawConfig): ResolvedTtsConfig {
const raw: TtsConfig = cfg.messages?.tts ?? {};
const providerSource = raw.provider ? "config" : "default";
const edgeOutputFormat = raw.edge?.outputFormat?.trim();
const auto = normalizeTtsAutoMode(raw.auto) ?? (raw.enabled ? "always" : "off");
return {
auto,
mode: raw.mode ?? "final",
provider: raw.provider ?? "edge",
providerSource,
summaryModel: raw.summaryModel?.trim() || undefined,
modelOverrides: resolveModelOverridePolicy(raw.modelOverrides),
elevenlabs: {
apiKey: normalizeResolvedSecretInputString({
value: raw.elevenlabs?.apiKey,
path: "messages.tts.elevenlabs.apiKey",
}),
baseUrl: raw.elevenlabs?.baseUrl?.trim() || DEFAULT_ELEVENLABS_BASE_URL,
voiceId: raw.elevenlabs?.voiceId ?? DEFAULT_ELEVENLABS_VOICE_ID,
modelId: raw.elevenlabs?.modelId ?? DEFAULT_ELEVENLABS_MODEL_ID,
seed: raw.elevenlabs?.seed,
applyTextNormalization: raw.elevenlabs?.applyTextNormalization,
languageCode: raw.elevenlabs?.languageCode,
voiceSettings: {
stability:
raw.elevenlabs?.voiceSettings?.stability ?? DEFAULT_ELEVENLABS_VOICE_SETTINGS.stability,
similarityBoost:
raw.elevenlabs?.voiceSettings?.similarityBoost ??
DEFAULT_ELEVENLABS_VOICE_SETTINGS.similarityBoost,
style: raw.elevenlabs?.voiceSettings?.style ?? DEFAULT_ELEVENLABS_VOICE_SETTINGS.style,
useSpeakerBoost:
raw.elevenlabs?.voiceSettings?.useSpeakerBoost ??
DEFAULT_ELEVENLABS_VOICE_SETTINGS.useSpeakerBoost,
speed: raw.elevenlabs?.voiceSettings?.speed ?? DEFAULT_ELEVENLABS_VOICE_SETTINGS.speed,
},
},
openai: {
apiKey: normalizeResolvedSecretInputString({
value: raw.openai?.apiKey,
path: "messages.tts.openai.apiKey",
}),
// Config > env var > default; strip trailing slashes for consistency.
baseUrl: (
raw.openai?.baseUrl?.trim() ||
process.env.OPENAI_TTS_BASE_URL?.trim() ||
DEFAULT_OPENAI_BASE_URL
).replace(/\/+$/, ""),
model: raw.openai?.model ?? DEFAULT_OPENAI_MODEL,
voice: raw.openai?.voice ?? DEFAULT_OPENAI_VOICE,
speed: raw.openai?.speed,
instructions: raw.openai?.instructions?.trim() || undefined,
},
edge: {
enabled: raw.edge?.enabled ?? true,
voice: raw.edge?.voice?.trim() || DEFAULT_EDGE_VOICE,
lang: raw.edge?.lang?.trim() || DEFAULT_EDGE_LANG,
outputFormat: edgeOutputFormat || DEFAULT_EDGE_OUTPUT_FORMAT,
outputFormatConfigured: Boolean(edgeOutputFormat),
pitch: raw.edge?.pitch?.trim() || undefined,
rate: raw.edge?.rate?.trim() || undefined,
volume: raw.edge?.volume?.trim() || undefined,
saveSubtitles: raw.edge?.saveSubtitles ?? false,
proxy: raw.edge?.proxy?.trim() || undefined,
timeoutMs: raw.edge?.timeoutMs,
},
prefsPath: raw.prefsPath,
maxTextLength: raw.maxTextLength ?? DEFAULT_MAX_TEXT_LENGTH,
timeoutMs: raw.timeoutMs ?? DEFAULT_TIMEOUT_MS,
};
}
export const resolveTtsConfig = resolveExtensionHostTtsConfig;
export const resolveTtsPrefsPath = resolveExtensionHostTtsPrefsPath;
@ -475,7 +293,7 @@ export const _test = {
OPENAI_TTS_VOICES,
resolveOpenAITtsInstructions,
parseTtsDirectives,
resolveModelOverridePolicy,
resolveModelOverridePolicy: resolveExtensionHostTtsModelOverridePolicy,
summarizeText,
resolveOutputFormat: resolveExtensionHostTtsOutputFormat,
resolveEdgeOutputFormat: resolveExtensionHostEdgeOutputFormat,