TTS: extract payload planning

This commit is contained in:
Gustavo Madeira Santana 2026-03-15 19:37:15 +00:00
parent 306ff18671
commit ed5941ed7e
2 changed files with 157 additions and 98 deletions

View File

@ -0,0 +1,139 @@
import type { ReplyPayload } from "../auto-reply/types.js";
import type { OpenClawConfig } from "../config/config.js";
import { logVerbose } from "../globals.js";
import { stripMarkdown } from "../line/markdown-to-line.js";
import { parseTtsDirectives, summarizeText } from "../tts/tts-core.js";
import type { ResolvedTtsConfig, TtsDirectiveOverrides } from "../tts/tts.js";
import {
getExtensionHostTtsMaxLength,
isExtensionHostTtsSummarizationEnabled,
resolveExtensionHostTtsAutoMode,
} from "./tts-preferences.js";
/**
 * Plan variant produced when no audio should be synthesized for a reply.
 * `payload` is the reply to hand back as-is to the caller.
 */
type ExtensionHostTtsSkipPlan = {
  kind: "skip";
  payload: ReplyPayload;
};

/**
 * Plan variant produced when a reply qualifies for speech synthesis.
 */
type ExtensionHostTtsReadyPlan = {
  kind: "ready";
  /** Reply payload whose text has had TTS directives removed. */
  nextPayload: ReplyPayload;
  /** Final, markdown-stripped text to send to the speech provider. */
  textForAudio: string;
  /** True when `textForAudio` came from the summarizer rather than the raw text. */
  wasSummarized: boolean;
  /** Per-message overrides parsed from TTS directives in the reply text. */
  overrides: TtsDirectiveOverrides;
};

/**
 * Outcome of TTS payload planning, discriminated on `kind`.
 */
export type ExtensionHostTtsPayloadPlan = ExtensionHostTtsSkipPlan | ExtensionHostTtsReadyPlan;
/** Texts shorter than this (after trimming) are not worth synthesizing. */
const MIN_TTS_TEXT_LENGTH = 10;

/**
 * Cuts `text` so that, together with a trailing "...", it fits within `limit`.
 */
function truncateForTts(text: string, limit: number): string {
  // Clamp the end index: a limit below 3 would otherwise yield a negative
  // index, which `slice` interprets as an offset from the END of the string
  // and would return nearly the whole text instead of truncating it.
  return `${text.slice(0, Math.max(0, limit - 3))}...`;
}

/**
 * Decides whether a reply payload should be turned into speech and, if so,
 * prepares the text to synthesize.
 *
 * Steps, in order:
 *  1. Resolve the auto mode; `"off"` skips immediately with the untouched payload.
 *  2. Parse TTS directives out of the reply text and build `nextPayload`, the
 *     payload with directives removed (the original object is reused when the
 *     visible text is unchanged).
 *  3. Skip when the mode/kind/media/length gates are not satisfied.
 *  4. If the text exceeds the user's max length, summarize it (when enabled)
 *     or truncate it; a failed summarization falls back to truncation.
 *  5. Strip markdown and re-check the minimum length.
 *
 * @param params.payload Reply payload under consideration.
 * @param params.cfg Application config, forwarded to the summarizer.
 * @param params.config Resolved TTS config (mode, model overrides, limits, timeout).
 * @param params.prefsPath Path used to look up auto mode, max length and summarization prefs.
 * @param params.kind Kind of reply; non-"final" kinds are skipped when the TTS mode is "final".
 * @param params.inboundAudio Whether the inbound message carried audio (gates the "inbound" auto mode).
 * @param params.ttsAuto Session-level auto mode override.
 * @returns A `"skip"` plan carrying the payload to send as-is, or a `"ready"`
 *   plan carrying the cleaned payload and the text to synthesize.
 */
export async function resolveExtensionHostTtsPayloadPlan(params: {
  payload: ReplyPayload;
  cfg: OpenClawConfig;
  config: ResolvedTtsConfig;
  prefsPath: string;
  kind?: "tool" | "block" | "final";
  inboundAudio?: boolean;
  ttsAuto?: string;
}): Promise<ExtensionHostTtsPayloadPlan> {
  const autoMode = resolveExtensionHostTtsAutoMode({
    config: params.config,
    prefsPath: params.prefsPath,
    sessionAuto: params.ttsAuto,
  });
  if (autoMode === "off") {
    // Directives are intentionally left in place: nothing is parsed when TTS is off.
    return { kind: "skip", payload: params.payload };
  }

  const text = params.payload.text ?? "";
  const directives = parseTtsDirectives(
    text,
    params.config.modelOverrides,
    params.config.openai.baseUrl,
  );
  if (directives.warnings.length > 0) {
    logVerbose(`TTS: ignored directive overrides (${directives.warnings.join("; ")})`);
  }

  // Both values are already trimmed here, so no later re-trimming is needed.
  const visibleText = directives.cleanedText.trim();
  const ttsText = directives.ttsText?.trim() || visibleText;

  // Reuse the caller's object when stripping directives changed nothing.
  const nextPayload =
    visibleText === text.trim()
      ? params.payload
      : {
          ...params.payload,
          text: visibleText.length > 0 ? visibleText : undefined,
        };
  const skip = (): ExtensionHostTtsPayloadPlan => ({ kind: "skip", payload: nextPayload });

  if (autoMode === "tagged" && !directives.hasDirective) {
    return skip();
  }
  if (autoMode === "inbound" && params.inboundAudio !== true) {
    return skip();
  }
  const mode = params.config.mode ?? "final";
  if (mode === "final" && params.kind && params.kind !== "final") {
    return skip();
  }

  // Replies that already carry media (explicit URLs or an inline MEDIA: marker)
  // are never given a synthesized audio attachment.
  const hasMedia =
    Boolean(params.payload.mediaUrl) || (params.payload.mediaUrls?.length ?? 0) > 0;
  if (hasMedia || text.includes("MEDIA:")) {
    return skip();
  }
  // Also covers the empty-text case.
  if (ttsText.length < MIN_TTS_TEXT_LENGTH) {
    return skip();
  }

  const maxLength = getExtensionHostTtsMaxLength(params.prefsPath);
  let textForAudio = ttsText;
  let wasSummarized = false;
  if (textForAudio.length > maxLength) {
    if (!isExtensionHostTtsSummarizationEnabled(params.prefsPath)) {
      logVerbose(
        `TTS: truncating long text (${textForAudio.length} > ${maxLength}), summarization disabled.`,
      );
      textForAudio = truncateForTts(textForAudio, maxLength);
    } else {
      try {
        const summary = await summarizeText({
          text: textForAudio,
          targetLength: maxLength,
          cfg: params.cfg,
          config: params.config,
          timeoutMs: params.config.timeoutMs,
        });
        textForAudio = summary.summary;
        wasSummarized = true;
        // The summarizer only targets `maxLength`; enforce the configured hard cap.
        if (textForAudio.length > params.config.maxTextLength) {
          logVerbose(
            `TTS: summary exceeded hard limit (${textForAudio.length} > ${params.config.maxTextLength}); truncating.`,
          );
          textForAudio = truncateForTts(textForAudio, params.config.maxTextLength);
        }
      } catch (err) {
        // Thrown values are not guaranteed to be Error instances; narrow
        // instead of asserting so the log never prints "undefined".
        const message = err instanceof Error ? err.message : String(err);
        logVerbose(`TTS: summarization failed, truncating instead: ${message}`);
        textForAudio = truncateForTts(textForAudio, maxLength);
      }
    }
  }

  // Markdown is stripped last so the synthesizer does not receive raw markup.
  textForAudio = stripMarkdown(textForAudio).trim();
  if (textForAudio.length < MIN_TTS_TEXT_LENGTH) {
    return skip();
  }

  return {
    kind: "ready",
    nextPayload,
    textForAudio,
    wasSummarized,
    overrides: directives.overrides,
  };
}

View File

@ -8,6 +8,7 @@ import type {
TtsProvider,
TtsModelOverrideConfig,
} from "../config/types.tts.js";
import { resolveExtensionHostTtsPayloadPlan } from "../extension-host/tts-payload.js";
import {
getExtensionHostTtsMaxLength,
isExtensionHostTtsEnabled,
@ -39,7 +40,6 @@ import {
resolveExtensionHostTtsRequestSetup,
} from "../extension-host/tts-runtime-setup.js";
import { logVerbose } from "../globals.js";
import { stripMarkdown } from "../line/markdown-to-line.js";
import {
DEFAULT_OPENAI_BASE_URL,
isValidOpenAIModel,
@ -47,8 +47,8 @@ import {
isValidVoiceId,
OPENAI_TTS_MODELS,
OPENAI_TTS_VOICES,
resolveOpenAITtsInstructions,
parseTtsDirectives,
resolveOpenAITtsInstructions,
summarizeText,
} from "./tts-core.js";
export { OPENAI_TTS_MODELS, OPENAI_TTS_VOICES } from "./tts-core.js";
@ -305,7 +305,7 @@ export function buildTtsSystemPromptHint(cfg: OpenClawConfig): string | undefine
return undefined;
}
const maxLength = getExtensionHostTtsMaxLength(prefsPath);
const summarize = isExtensionHostTtsSummarizationEnabled(prefsPath) ? "on" : "off";
const summarize = isSummarizationEnabled(prefsPath) ? "on" : "off";
const autoHint =
autoMode === "inbound"
? "Only use TTS when the user's last message includes audio/voice."
@ -417,114 +417,34 @@ export async function maybeApplyTtsToPayload(params: {
}): Promise<ReplyPayload> {
const config = resolveTtsConfig(params.cfg);
const prefsPath = resolveTtsPrefsPath(config);
const autoMode = resolveTtsAutoMode({
const plan = await resolveExtensionHostTtsPayloadPlan({
payload: params.payload,
cfg: params.cfg,
config,
prefsPath,
sessionAuto: params.ttsAuto,
kind: params.kind,
inboundAudio: params.inboundAudio,
ttsAuto: params.ttsAuto,
});
if (autoMode === "off") {
return params.payload;
}
const text = params.payload.text ?? "";
const directives = parseTtsDirectives(text, config.modelOverrides, config.openai.baseUrl);
if (directives.warnings.length > 0) {
logVerbose(`TTS: ignored directive overrides (${directives.warnings.join("; ")})`);
}
const cleanedText = directives.cleanedText;
const trimmedCleaned = cleanedText.trim();
const visibleText = trimmedCleaned.length > 0 ? trimmedCleaned : "";
const ttsText = directives.ttsText?.trim() || visibleText;
const nextPayload =
visibleText === text.trim()
? params.payload
: {
...params.payload,
text: visibleText.length > 0 ? visibleText : undefined,
};
if (autoMode === "tagged" && !directives.hasDirective) {
return nextPayload;
}
if (autoMode === "inbound" && params.inboundAudio !== true) {
return nextPayload;
}
const mode = config.mode ?? "final";
if (mode === "final" && params.kind && params.kind !== "final") {
return nextPayload;
}
if (!ttsText.trim()) {
return nextPayload;
}
if (params.payload.mediaUrl || (params.payload.mediaUrls?.length ?? 0) > 0) {
return nextPayload;
}
if (text.includes("MEDIA:")) {
return nextPayload;
}
if (ttsText.trim().length < 10) {
return nextPayload;
}
const maxLength = getTtsMaxLength(prefsPath);
let textForAudio = ttsText.trim();
let wasSummarized = false;
if (textForAudio.length > maxLength) {
if (!isSummarizationEnabled(prefsPath)) {
logVerbose(
`TTS: truncating long text (${textForAudio.length} > ${maxLength}), summarization disabled.`,
);
textForAudio = `${textForAudio.slice(0, maxLength - 3)}...`;
} else {
try {
const summary = await summarizeText({
text: textForAudio,
targetLength: maxLength,
cfg: params.cfg,
config,
timeoutMs: config.timeoutMs,
});
textForAudio = summary.summary;
wasSummarized = true;
if (textForAudio.length > config.maxTextLength) {
logVerbose(
`TTS: summary exceeded hard limit (${textForAudio.length} > ${config.maxTextLength}); truncating.`,
);
textForAudio = `${textForAudio.slice(0, config.maxTextLength - 3)}...`;
}
} catch (err) {
const error = err as Error;
logVerbose(`TTS: summarization failed, truncating instead: ${error.message}`);
textForAudio = `${textForAudio.slice(0, maxLength - 3)}...`;
}
}
}
textForAudio = stripMarkdown(textForAudio).trim(); // strip markdown for TTS (### → "hashtag" etc.)
if (textForAudio.length < 10) {
return nextPayload;
if (plan.kind === "skip") {
return plan.payload;
}
const ttsStart = Date.now();
const result = await textToSpeech({
text: textForAudio,
text: plan.textForAudio,
cfg: params.cfg,
prefsPath,
channel: params.channel,
overrides: directives.overrides,
overrides: plan.overrides,
});
if (result.success && result.audioPath) {
lastTtsAttempt = {
timestamp: Date.now(),
success: true,
textLength: text.length,
summarized: wasSummarized,
textLength: (params.payload.text ?? "").length,
summarized: plan.wasSummarized,
provider: result.provider,
latencyMs: result.latencyMs,
};
@ -532,7 +452,7 @@ export async function maybeApplyTtsToPayload(params: {
const shouldVoice =
isExtensionHostTtsVoiceBubbleChannel(params.channel) && result.voiceCompatible === true;
const finalPayload = {
...nextPayload,
...plan.nextPayload,
mediaUrl: result.audioPath,
audioAsVoice: shouldVoice || params.payload.audioAsVoice,
};
@ -542,8 +462,8 @@ export async function maybeApplyTtsToPayload(params: {
lastTtsAttempt = {
timestamp: Date.now(),
success: false,
textLength: text.length,
summarized: wasSummarized,
textLength: (params.payload.text ?? "").length,
summarized: plan.wasSummarized,
error: result.error,
};