openclaw/src/tts/tts.ts

938 lines
27 KiB
TypeScript

import { randomBytes } from "node:crypto";
import {
existsSync,
mkdirSync,
readFileSync,
writeFileSync,
mkdtempSync,
renameSync,
unlinkSync,
} from "node:fs";
import path from "node:path";
import type { ReplyPayload } from "../auto-reply/types.js";
import { normalizeChannelId } from "../channels/plugins/index.js";
import type { ChannelId } from "../channels/plugins/types.js";
import type { OpenClawConfig } from "../config/config.js";
import { normalizeResolvedSecretInputString } from "../config/types.secrets.js";
import type {
TtsConfig,
TtsAutoMode,
TtsMode,
TtsProvider,
TtsModelOverrideConfig,
} from "../config/types.tts.js";
import { logVerbose } from "../globals.js";
import { resolvePreferredOpenClawTmpDir } from "../infra/tmp-openclaw-dir.js";
import { stripMarkdown } from "../line/markdown-to-line.js";
import { resolveSendableOutboundReplyParts } from "../plugin-sdk/reply-payload.js";
import { CONFIG_DIR, resolveUserPath } from "../utils.js";
import {
getSpeechProvider,
listSpeechProviders,
normalizeSpeechProviderId,
} from "./provider-registry.js";
import type { SpeechVoiceOption } from "./provider-types.js";
import {
DEFAULT_OPENAI_BASE_URL,
isValidOpenAIModel,
isValidOpenAIVoice,
isValidVoiceId,
OPENAI_TTS_MODELS,
OPENAI_TTS_VOICES,
resolveOpenAITtsInstructions,
parseTtsDirectives,
scheduleCleanup,
summarizeText,
} from "./tts-core.js";
export { OPENAI_TTS_MODELS, OPENAI_TTS_VOICES } from "./tts-core.js";
const DEFAULT_TIMEOUT_MS = 30_000;
const DEFAULT_TTS_MAX_LENGTH = 1500;
const DEFAULT_TTS_SUMMARIZE = true;
const DEFAULT_MAX_TEXT_LENGTH = 4096;
const DEFAULT_ELEVENLABS_BASE_URL = "https://api.elevenlabs.io";
const DEFAULT_ELEVENLABS_VOICE_ID = "pMsXgVXv3BLzUgSXRplE";
const DEFAULT_ELEVENLABS_MODEL_ID = "eleven_multilingual_v2";
const DEFAULT_OPENAI_MODEL = "gpt-4o-mini-tts";
const DEFAULT_OPENAI_VOICE = "alloy";
const DEFAULT_EDGE_VOICE = "en-US-MichelleNeural";
const DEFAULT_EDGE_LANG = "en-US";
const DEFAULT_EDGE_OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3";
const DEFAULT_ELEVENLABS_VOICE_SETTINGS = {
stability: 0.5,
similarityBoost: 0.75,
style: 0.0,
useSpeakerBoost: true,
speed: 1.0,
};
const TELEGRAM_OUTPUT = {
openai: "opus" as const,
// ElevenLabs output formats use codec_sample_rate_bitrate naming.
// Opus @ 48kHz/64kbps is a good voice-note tradeoff for Telegram.
elevenlabs: "opus_48000_64",
extension: ".opus",
voiceCompatible: true,
};
const DEFAULT_OUTPUT = {
openai: "mp3" as const,
elevenlabs: "mp3_44100_128",
extension: ".mp3",
voiceCompatible: false,
};
const TTS_AUTO_MODES = new Set<TtsAutoMode>(["off", "always", "inbound", "tagged"]);
export type ResolvedTtsConfig = {
auto: TtsAutoMode;
mode: TtsMode;
provider: TtsProvider;
providerSource: "config" | "default";
summaryModel?: string;
modelOverrides: ResolvedTtsModelOverrides;
elevenlabs: {
apiKey?: string;
baseUrl: string;
voiceId: string;
modelId: string;
seed?: number;
applyTextNormalization?: "auto" | "on" | "off";
languageCode?: string;
voiceSettings: {
stability: number;
similarityBoost: number;
style: number;
useSpeakerBoost: boolean;
speed: number;
};
};
openai: {
apiKey?: string;
baseUrl: string;
model: string;
voice: string;
speed?: number;
instructions?: string;
};
edge: {
enabled: boolean;
voice: string;
lang: string;
outputFormat: string;
outputFormatConfigured: boolean;
pitch?: string;
rate?: string;
volume?: string;
saveSubtitles: boolean;
proxy?: string;
timeoutMs?: number;
};
prefsPath?: string;
maxTextLength: number;
timeoutMs: number;
};
type TtsUserPrefs = {
tts?: {
auto?: TtsAutoMode;
enabled?: boolean;
provider?: TtsProvider;
maxLength?: number;
summarize?: boolean;
};
};
export type ResolvedTtsModelOverrides = {
enabled: boolean;
allowText: boolean;
allowProvider: boolean;
allowVoice: boolean;
allowModelId: boolean;
allowVoiceSettings: boolean;
allowNormalization: boolean;
allowSeed: boolean;
};
export type TtsDirectiveOverrides = {
ttsText?: string;
provider?: TtsProvider;
openai?: {
voice?: string;
model?: string;
};
elevenlabs?: {
voiceId?: string;
modelId?: string;
seed?: number;
applyTextNormalization?: "auto" | "on" | "off";
languageCode?: string;
voiceSettings?: Partial<ResolvedTtsConfig["elevenlabs"]["voiceSettings"]>;
};
};
export type TtsDirectiveParseResult = {
cleanedText: string;
ttsText?: string;
hasDirective: boolean;
overrides: TtsDirectiveOverrides;
warnings: string[];
};
export type TtsResult = {
success: boolean;
audioPath?: string;
error?: string;
latencyMs?: number;
provider?: string;
outputFormat?: string;
voiceCompatible?: boolean;
};
export type TtsTelephonyResult = {
success: boolean;
audioBuffer?: Buffer;
error?: string;
latencyMs?: number;
provider?: string;
outputFormat?: string;
sampleRate?: number;
};
type TtsStatusEntry = {
timestamp: number;
success: boolean;
textLength: number;
summarized: boolean;
provider?: string;
latencyMs?: number;
error?: string;
};
let lastTtsAttempt: TtsStatusEntry | undefined;
export function normalizeTtsAutoMode(value: unknown): TtsAutoMode | undefined {
if (typeof value !== "string") {
return undefined;
}
const normalized = value.trim().toLowerCase();
if (TTS_AUTO_MODES.has(normalized as TtsAutoMode)) {
return normalized as TtsAutoMode;
}
return undefined;
}
function resolveModelOverridePolicy(
overrides: TtsModelOverrideConfig | undefined,
): ResolvedTtsModelOverrides {
const enabled = overrides?.enabled ?? true;
if (!enabled) {
return {
enabled: false,
allowText: false,
allowProvider: false,
allowVoice: false,
allowModelId: false,
allowVoiceSettings: false,
allowNormalization: false,
allowSeed: false,
};
}
const allow = (value: boolean | undefined, defaultValue = true) => value ?? defaultValue;
return {
enabled: true,
allowText: allow(overrides?.allowText),
// Provider switching is higher-impact than voice/style tweaks; keep opt-in.
allowProvider: allow(overrides?.allowProvider, false),
allowVoice: allow(overrides?.allowVoice),
allowModelId: allow(overrides?.allowModelId),
allowVoiceSettings: allow(overrides?.allowVoiceSettings),
allowNormalization: allow(overrides?.allowNormalization),
allowSeed: allow(overrides?.allowSeed),
};
}
export function resolveTtsConfig(cfg: OpenClawConfig): ResolvedTtsConfig {
const raw: TtsConfig = cfg.messages?.tts ?? {};
const providerSource = raw.provider ? "config" : "default";
const rawMicrosoft = { ...raw.edge, ...raw.microsoft };
const edgeOutputFormat = rawMicrosoft.outputFormat?.trim();
const auto = normalizeTtsAutoMode(raw.auto) ?? (raw.enabled ? "always" : "off");
return {
auto,
mode: raw.mode ?? "final",
provider: normalizeSpeechProviderId(raw.provider) ?? "microsoft",
providerSource,
summaryModel: raw.summaryModel?.trim() || undefined,
modelOverrides: resolveModelOverridePolicy(raw.modelOverrides),
elevenlabs: {
apiKey: normalizeResolvedSecretInputString({
value: raw.elevenlabs?.apiKey,
path: "messages.tts.elevenlabs.apiKey",
}),
baseUrl: raw.elevenlabs?.baseUrl?.trim() || DEFAULT_ELEVENLABS_BASE_URL,
voiceId: raw.elevenlabs?.voiceId ?? DEFAULT_ELEVENLABS_VOICE_ID,
modelId: raw.elevenlabs?.modelId ?? DEFAULT_ELEVENLABS_MODEL_ID,
seed: raw.elevenlabs?.seed,
applyTextNormalization: raw.elevenlabs?.applyTextNormalization,
languageCode: raw.elevenlabs?.languageCode,
voiceSettings: {
stability:
raw.elevenlabs?.voiceSettings?.stability ?? DEFAULT_ELEVENLABS_VOICE_SETTINGS.stability,
similarityBoost:
raw.elevenlabs?.voiceSettings?.similarityBoost ??
DEFAULT_ELEVENLABS_VOICE_SETTINGS.similarityBoost,
style: raw.elevenlabs?.voiceSettings?.style ?? DEFAULT_ELEVENLABS_VOICE_SETTINGS.style,
useSpeakerBoost:
raw.elevenlabs?.voiceSettings?.useSpeakerBoost ??
DEFAULT_ELEVENLABS_VOICE_SETTINGS.useSpeakerBoost,
speed: raw.elevenlabs?.voiceSettings?.speed ?? DEFAULT_ELEVENLABS_VOICE_SETTINGS.speed,
},
},
openai: {
apiKey: normalizeResolvedSecretInputString({
value: raw.openai?.apiKey,
path: "messages.tts.openai.apiKey",
}),
// Config > env var > default; strip trailing slashes for consistency.
baseUrl: (
raw.openai?.baseUrl?.trim() ||
process.env.OPENAI_TTS_BASE_URL?.trim() ||
DEFAULT_OPENAI_BASE_URL
).replace(/\/+$/, ""),
model: raw.openai?.model ?? DEFAULT_OPENAI_MODEL,
voice: raw.openai?.voice ?? DEFAULT_OPENAI_VOICE,
speed: raw.openai?.speed,
instructions: raw.openai?.instructions?.trim() || undefined,
},
edge: {
enabled: rawMicrosoft.enabled ?? true,
voice: rawMicrosoft.voice?.trim() || DEFAULT_EDGE_VOICE,
lang: rawMicrosoft.lang?.trim() || DEFAULT_EDGE_LANG,
outputFormat: edgeOutputFormat || DEFAULT_EDGE_OUTPUT_FORMAT,
outputFormatConfigured: Boolean(edgeOutputFormat),
pitch: rawMicrosoft.pitch?.trim() || undefined,
rate: rawMicrosoft.rate?.trim() || undefined,
volume: rawMicrosoft.volume?.trim() || undefined,
saveSubtitles: rawMicrosoft.saveSubtitles ?? false,
proxy: rawMicrosoft.proxy?.trim() || undefined,
timeoutMs: rawMicrosoft.timeoutMs,
},
prefsPath: raw.prefsPath,
maxTextLength: raw.maxTextLength ?? DEFAULT_MAX_TEXT_LENGTH,
timeoutMs: raw.timeoutMs ?? DEFAULT_TIMEOUT_MS,
};
}
export function resolveTtsPrefsPath(config: ResolvedTtsConfig): string {
if (config.prefsPath?.trim()) {
return resolveUserPath(config.prefsPath.trim());
}
const envPath = process.env.OPENCLAW_TTS_PREFS?.trim();
if (envPath) {
return resolveUserPath(envPath);
}
return path.join(CONFIG_DIR, "settings", "tts.json");
}
function resolveTtsAutoModeFromPrefs(prefs: TtsUserPrefs): TtsAutoMode | undefined {
const auto = normalizeTtsAutoMode(prefs.tts?.auto);
if (auto) {
return auto;
}
if (typeof prefs.tts?.enabled === "boolean") {
return prefs.tts.enabled ? "always" : "off";
}
return undefined;
}
export function resolveTtsAutoMode(params: {
config: ResolvedTtsConfig;
prefsPath: string;
sessionAuto?: string;
}): TtsAutoMode {
const sessionAuto = normalizeTtsAutoMode(params.sessionAuto);
if (sessionAuto) {
return sessionAuto;
}
const prefsAuto = resolveTtsAutoModeFromPrefs(readPrefs(params.prefsPath));
if (prefsAuto) {
return prefsAuto;
}
return params.config.auto;
}
export function buildTtsSystemPromptHint(cfg: OpenClawConfig): string | undefined {
const config = resolveTtsConfig(cfg);
const prefsPath = resolveTtsPrefsPath(config);
const autoMode = resolveTtsAutoMode({ config, prefsPath });
if (autoMode === "off") {
return undefined;
}
const maxLength = getTtsMaxLength(prefsPath);
const summarize = isSummarizationEnabled(prefsPath) ? "on" : "off";
const autoHint =
autoMode === "inbound"
? "Only use TTS when the user's last message includes audio/voice."
: autoMode === "tagged"
? "Only use TTS when you include [[tts]] or [[tts:text]] tags."
: undefined;
return [
"Voice (TTS) is enabled.",
autoHint,
`Keep spoken text ≤${maxLength} chars to avoid auto-summary (summary ${summarize}).`,
"Use [[tts:...]] and optional [[tts:text]]...[[/tts:text]] to control voice/expressiveness.",
]
.filter(Boolean)
.join("\n");
}
function readPrefs(prefsPath: string): TtsUserPrefs {
try {
if (!existsSync(prefsPath)) {
return {};
}
return JSON.parse(readFileSync(prefsPath, "utf8")) as TtsUserPrefs;
} catch {
return {};
}
}
function atomicWriteFileSync(filePath: string, content: string): void {
const tmpPath = `${filePath}.tmp.${Date.now()}.${randomBytes(8).toString("hex")}`;
writeFileSync(tmpPath, content, { mode: 0o600 });
try {
renameSync(tmpPath, filePath);
} catch (err) {
try {
unlinkSync(tmpPath);
} catch {
// ignore
}
throw err;
}
}
function updatePrefs(prefsPath: string, update: (prefs: TtsUserPrefs) => void): void {
const prefs = readPrefs(prefsPath);
update(prefs);
mkdirSync(path.dirname(prefsPath), { recursive: true });
atomicWriteFileSync(prefsPath, JSON.stringify(prefs, null, 2));
}
export function isTtsEnabled(
config: ResolvedTtsConfig,
prefsPath: string,
sessionAuto?: string,
): boolean {
return resolveTtsAutoMode({ config, prefsPath, sessionAuto }) !== "off";
}
export function setTtsAutoMode(prefsPath: string, mode: TtsAutoMode): void {
updatePrefs(prefsPath, (prefs) => {
const next = { ...prefs.tts };
delete next.enabled;
next.auto = mode;
prefs.tts = next;
});
}
export function setTtsEnabled(prefsPath: string, enabled: boolean): void {
setTtsAutoMode(prefsPath, enabled ? "always" : "off");
}
export function getTtsProvider(config: ResolvedTtsConfig, prefsPath: string): TtsProvider {
const prefs = readPrefs(prefsPath);
const prefsProvider = normalizeSpeechProviderId(prefs.tts?.provider);
if (prefsProvider) {
return prefsProvider;
}
if (config.providerSource === "config") {
return normalizeSpeechProviderId(config.provider) ?? config.provider;
}
if (resolveTtsApiKey(config, "openai")) {
return "openai";
}
if (resolveTtsApiKey(config, "elevenlabs")) {
return "elevenlabs";
}
return "microsoft";
}
export function setTtsProvider(prefsPath: string, provider: TtsProvider): void {
updatePrefs(prefsPath, (prefs) => {
prefs.tts = { ...prefs.tts, provider: normalizeSpeechProviderId(provider) ?? provider };
});
}
export function getTtsMaxLength(prefsPath: string): number {
const prefs = readPrefs(prefsPath);
return prefs.tts?.maxLength ?? DEFAULT_TTS_MAX_LENGTH;
}
export function setTtsMaxLength(prefsPath: string, maxLength: number): void {
updatePrefs(prefsPath, (prefs) => {
prefs.tts = { ...prefs.tts, maxLength };
});
}
export function isSummarizationEnabled(prefsPath: string): boolean {
const prefs = readPrefs(prefsPath);
return prefs.tts?.summarize ?? DEFAULT_TTS_SUMMARIZE;
}
export function setSummarizationEnabled(prefsPath: string, enabled: boolean): void {
updatePrefs(prefsPath, (prefs) => {
prefs.tts = { ...prefs.tts, summarize: enabled };
});
}
export function getLastTtsAttempt(): TtsStatusEntry | undefined {
return lastTtsAttempt;
}
export function setLastTtsAttempt(entry: TtsStatusEntry | undefined): void {
lastTtsAttempt = entry;
}
/** Channels that require opus audio and support voice-bubble playback */
const VOICE_BUBBLE_CHANNELS = new Set(["telegram", "feishu", "whatsapp"]);
function resolveOutputFormat(channelId?: string | null) {
if (channelId && VOICE_BUBBLE_CHANNELS.has(channelId)) {
return TELEGRAM_OUTPUT;
}
return DEFAULT_OUTPUT;
}
function resolveChannelId(channel: string | undefined): ChannelId | null {
return channel ? normalizeChannelId(channel) : null;
}
function resolveEdgeOutputFormat(config: ResolvedTtsConfig): string {
return config.edge.outputFormat;
}
export function resolveTtsApiKey(
config: ResolvedTtsConfig,
provider: TtsProvider,
): string | undefined {
const normalizedProvider = normalizeSpeechProviderId(provider);
if (normalizedProvider === "elevenlabs") {
return config.elevenlabs.apiKey || process.env.ELEVENLABS_API_KEY || process.env.XI_API_KEY;
}
if (normalizedProvider === "openai") {
return config.openai.apiKey || process.env.OPENAI_API_KEY;
}
return undefined;
}
export const TTS_PROVIDERS = ["openai", "elevenlabs", "microsoft"] as const;
export function resolveTtsProviderOrder(primary: TtsProvider, cfg?: OpenClawConfig): TtsProvider[] {
const normalizedPrimary = normalizeSpeechProviderId(primary) ?? primary;
const ordered = new Set<TtsProvider>([normalizedPrimary]);
for (const provider of TTS_PROVIDERS) {
if (provider !== normalizedPrimary) {
ordered.add(provider);
}
}
for (const provider of listSpeechProviders(cfg)) {
const normalized = normalizeSpeechProviderId(provider.id) ?? provider.id;
if (normalized !== normalizedPrimary) {
ordered.add(normalized);
}
}
return [...ordered];
}
export function isTtsProviderConfigured(
config: ResolvedTtsConfig,
provider: TtsProvider,
cfg?: OpenClawConfig,
): boolean {
const resolvedProvider = getSpeechProvider(provider, cfg);
return resolvedProvider?.isConfigured({ cfg, config }) ?? false;
}
function formatTtsProviderError(provider: TtsProvider, err: unknown): string {
const error = err instanceof Error ? err : new Error(String(err));
if (error.name === "AbortError") {
return `${provider}: request timed out`;
}
return `${provider}: ${error.message}`;
}
function buildTtsFailureResult(errors: string[]): { success: false; error: string } {
return {
success: false,
error: `TTS conversion failed: ${errors.join("; ") || "no providers available"}`,
};
}
function resolveReadySpeechProvider(params: {
provider: TtsProvider;
cfg: OpenClawConfig;
config: ResolvedTtsConfig;
errors: string[];
requireTelephony?: boolean;
}): NonNullable<ReturnType<typeof getSpeechProvider>> | null {
const resolvedProvider = getSpeechProvider(params.provider, params.cfg);
if (!resolvedProvider) {
params.errors.push(`${params.provider}: no provider registered`);
return null;
}
if (!resolvedProvider.isConfigured({ cfg: params.cfg, config: params.config })) {
params.errors.push(`${params.provider}: not configured`);
return null;
}
if (params.requireTelephony && !resolvedProvider.synthesizeTelephony) {
params.errors.push(`${params.provider}: unsupported for telephony`);
return null;
}
return resolvedProvider;
}
function resolveTtsRequestSetup(params: {
text: string;
cfg: OpenClawConfig;
prefsPath?: string;
providerOverride?: TtsProvider;
}):
| {
config: ResolvedTtsConfig;
providers: TtsProvider[];
}
| {
error: string;
} {
const config = resolveTtsConfig(params.cfg);
const prefsPath = params.prefsPath ?? resolveTtsPrefsPath(config);
if (params.text.length > config.maxTextLength) {
return {
error: `Text too long (${params.text.length} chars, max ${config.maxTextLength})`,
};
}
const userProvider = getTtsProvider(config, prefsPath);
const provider = normalizeSpeechProviderId(params.providerOverride) ?? userProvider;
return {
config,
providers: resolveTtsProviderOrder(provider, params.cfg),
};
}
export async function textToSpeech(params: {
text: string;
cfg: OpenClawConfig;
prefsPath?: string;
channel?: string;
overrides?: TtsDirectiveOverrides;
}): Promise<TtsResult> {
const setup = resolveTtsRequestSetup({
text: params.text,
cfg: params.cfg,
prefsPath: params.prefsPath,
providerOverride: params.overrides?.provider,
});
if ("error" in setup) {
return { success: false, error: setup.error };
}
const { config, providers } = setup;
const channelId = resolveChannelId(params.channel);
const target = channelId && VOICE_BUBBLE_CHANNELS.has(channelId) ? "voice-note" : "audio-file";
const errors: string[] = [];
for (const provider of providers) {
const providerStart = Date.now();
try {
const resolvedProvider = resolveReadySpeechProvider({
provider,
cfg: params.cfg,
config,
errors,
});
if (!resolvedProvider) {
continue;
}
const synthesis = await resolvedProvider.synthesize({
text: params.text,
cfg: params.cfg,
config,
target,
overrides: params.overrides,
});
const latencyMs = Date.now() - providerStart;
const tempRoot = resolvePreferredOpenClawTmpDir();
mkdirSync(tempRoot, { recursive: true, mode: 0o700 });
const tempDir = mkdtempSync(path.join(tempRoot, "tts-"));
const audioPath = path.join(tempDir, `voice-${Date.now()}${synthesis.fileExtension}`);
writeFileSync(audioPath, synthesis.audioBuffer);
scheduleCleanup(tempDir);
return {
success: true,
audioPath,
latencyMs,
provider,
outputFormat: synthesis.outputFormat,
voiceCompatible: synthesis.voiceCompatible,
};
} catch (err) {
errors.push(formatTtsProviderError(provider, err));
}
}
return buildTtsFailureResult(errors);
}
export async function textToSpeechTelephony(params: {
text: string;
cfg: OpenClawConfig;
prefsPath?: string;
}): Promise<TtsTelephonyResult> {
const setup = resolveTtsRequestSetup({
text: params.text,
cfg: params.cfg,
prefsPath: params.prefsPath,
});
if ("error" in setup) {
return { success: false, error: setup.error };
}
const { config, providers } = setup;
const errors: string[] = [];
for (const provider of providers) {
const providerStart = Date.now();
try {
const resolvedProvider = resolveReadySpeechProvider({
provider,
cfg: params.cfg,
config,
errors,
requireTelephony: true,
});
if (!resolvedProvider?.synthesizeTelephony) {
continue;
}
const synthesis = await resolvedProvider.synthesizeTelephony({
text: params.text,
cfg: params.cfg,
config,
});
return {
success: true,
audioBuffer: synthesis.audioBuffer,
latencyMs: Date.now() - providerStart,
provider,
outputFormat: synthesis.outputFormat,
sampleRate: synthesis.sampleRate,
};
} catch (err) {
errors.push(formatTtsProviderError(provider, err));
}
}
return buildTtsFailureResult(errors);
}
export async function listSpeechVoices(params: {
provider: string;
cfg?: OpenClawConfig;
config?: ResolvedTtsConfig;
apiKey?: string;
baseUrl?: string;
}): Promise<SpeechVoiceOption[]> {
const provider = normalizeSpeechProviderId(params.provider);
if (!provider) {
throw new Error("speech provider id is required");
}
const config = params.config ?? (params.cfg ? resolveTtsConfig(params.cfg) : undefined);
if (!config) {
throw new Error(`speech provider ${provider} requires cfg or resolved config`);
}
const resolvedProvider = getSpeechProvider(provider, params.cfg);
if (!resolvedProvider) {
throw new Error(`speech provider ${provider} is not registered`);
}
if (!resolvedProvider.listVoices) {
throw new Error(`speech provider ${provider} does not support voice listing`);
}
return await resolvedProvider.listVoices({
cfg: params.cfg,
config,
apiKey: params.apiKey,
baseUrl: params.baseUrl,
});
}
export async function maybeApplyTtsToPayload(params: {
payload: ReplyPayload;
cfg: OpenClawConfig;
channel?: string;
kind?: "tool" | "block" | "final";
inboundAudio?: boolean;
ttsAuto?: string;
}): Promise<ReplyPayload> {
const config = resolveTtsConfig(params.cfg);
const prefsPath = resolveTtsPrefsPath(config);
const autoMode = resolveTtsAutoMode({
config,
prefsPath,
sessionAuto: params.ttsAuto,
});
if (autoMode === "off") {
return params.payload;
}
const reply = resolveSendableOutboundReplyParts(params.payload);
const text = reply.text;
const directives = parseTtsDirectives(text, config.modelOverrides, config.openai.baseUrl);
if (directives.warnings.length > 0) {
logVerbose(`TTS: ignored directive overrides (${directives.warnings.join("; ")})`);
}
const cleanedText = directives.cleanedText;
const trimmedCleaned = cleanedText.trim();
const visibleText = trimmedCleaned.length > 0 ? trimmedCleaned : "";
const ttsText = directives.ttsText?.trim() || visibleText;
const nextPayload =
visibleText === text.trim()
? params.payload
: {
...params.payload,
text: visibleText.length > 0 ? visibleText : undefined,
};
if (autoMode === "tagged" && !directives.hasDirective) {
return nextPayload;
}
if (autoMode === "inbound" && params.inboundAudio !== true) {
return nextPayload;
}
const mode = config.mode ?? "final";
if (mode === "final" && params.kind && params.kind !== "final") {
return nextPayload;
}
if (!ttsText.trim()) {
return nextPayload;
}
if (reply.hasMedia) {
return nextPayload;
}
if (text.includes("MEDIA:")) {
return nextPayload;
}
if (ttsText.trim().length < 10) {
return nextPayload;
}
const maxLength = getTtsMaxLength(prefsPath);
let textForAudio = ttsText.trim();
let wasSummarized = false;
if (textForAudio.length > maxLength) {
if (!isSummarizationEnabled(prefsPath)) {
logVerbose(
`TTS: truncating long text (${textForAudio.length} > ${maxLength}), summarization disabled.`,
);
textForAudio = `${textForAudio.slice(0, maxLength - 3)}...`;
} else {
try {
const summary = await summarizeText({
text: textForAudio,
targetLength: maxLength,
cfg: params.cfg,
config,
timeoutMs: config.timeoutMs,
});
textForAudio = summary.summary;
wasSummarized = true;
if (textForAudio.length > config.maxTextLength) {
logVerbose(
`TTS: summary exceeded hard limit (${textForAudio.length} > ${config.maxTextLength}); truncating.`,
);
textForAudio = `${textForAudio.slice(0, config.maxTextLength - 3)}...`;
}
} catch (err) {
const error = err as Error;
logVerbose(`TTS: summarization failed, truncating instead: ${error.message}`);
textForAudio = `${textForAudio.slice(0, maxLength - 3)}...`;
}
}
}
textForAudio = stripMarkdown(textForAudio).trim(); // strip markdown for TTS (### → "hashtag" etc.)
if (textForAudio.length < 10) {
return nextPayload;
}
const ttsStart = Date.now();
const result = await textToSpeech({
text: textForAudio,
cfg: params.cfg,
prefsPath,
channel: params.channel,
overrides: directives.overrides,
});
if (result.success && result.audioPath) {
lastTtsAttempt = {
timestamp: Date.now(),
success: true,
textLength: text.length,
summarized: wasSummarized,
provider: result.provider,
latencyMs: result.latencyMs,
};
const channelId = resolveChannelId(params.channel);
const shouldVoice =
channelId !== null && VOICE_BUBBLE_CHANNELS.has(channelId) && result.voiceCompatible === true;
const finalPayload = {
...nextPayload,
mediaUrl: result.audioPath,
audioAsVoice: shouldVoice || params.payload.audioAsVoice,
};
return finalPayload;
}
lastTtsAttempt = {
timestamp: Date.now(),
success: false,
textLength: text.length,
summarized: wasSummarized,
error: result.error,
};
const latency = Date.now() - ttsStart;
logVerbose(`TTS: conversion failed after ${latency}ms (${result.error ?? "unknown"}).`);
return nextPayload;
}
export const _test = {
isValidVoiceId,
isValidOpenAIVoice,
isValidOpenAIModel,
OPENAI_TTS_MODELS,
OPENAI_TTS_VOICES,
resolveOpenAITtsInstructions,
parseTtsDirectives,
resolveModelOverridePolicy,
summarizeText,
resolveOutputFormat,
resolveEdgeOutputFormat,
};