TTS: extract API composition

This commit is contained in:
Gustavo Madeira Santana 2026-03-15 20:47:12 +00:00
parent 820d8870e1
commit 454e44242f
3 changed files with 319 additions and 157 deletions

View File

@ -0,0 +1,135 @@
import { beforeEach, describe, expect, it, vi } from "vitest";
import {
applyExtensionHostTtsToPayload,
buildExtensionHostTtsSystemPromptHint,
runExtensionHostTextToSpeech,
} from "./tts-api.js";
// Replace the sibling ./tts-*.js modules with vi.fn() stubs so each test can
// script their return values and inspect the calls made through ./tts-api.js.
// Bare vi.fn() stubs return undefined until a test configures them.
vi.mock("./tts-config.js", () => ({
normalizeExtensionHostTtsConfigAutoMode: vi.fn(),
resolveExtensionHostTtsConfig: vi.fn(),
resolveExtensionHostTtsModelOverridePolicy: vi.fn(),
}));
// Preference readers: prefs path, auto mode, max length, and summarization flag.
vi.mock("./tts-preferences.js", () => ({
getExtensionHostTtsMaxLength: vi.fn(),
isExtensionHostTtsSummarizationEnabled: vi.fn(),
resolveExtensionHostTtsAutoMode: vi.fn(),
resolveExtensionHostTtsPrefsPath: vi.fn(),
}));
// Payload planning is stubbed so tests decide the plan handed to the API.
vi.mock("./tts-payload.js", () => ({
resolveExtensionHostTtsPayloadPlan: vi.fn(),
}));
// Request setup is stubbed so tests choose between an error result and a
// config/providers result.
vi.mock("./tts-runtime-setup.js", () => ({
resolveExtensionHostTtsRequestSetup: vi.fn(),
}));
// Execution layer: the two execute* stubs are scripted per test; the remaining
// helpers carry fixed defaults (never a voice-bubble channel, static mp3 formats).
vi.mock("./tts-runtime-execution.js", () => ({
executeExtensionHostTextToSpeech: vi.fn(),
executeExtensionHostTextToSpeechTelephony: vi.fn(),
isExtensionHostTtsVoiceBubbleChannel: vi.fn(() => false),
resolveExtensionHostEdgeOutputFormat: vi.fn(() => "audio-24khz-48kbitrate-mono-mp3"),
resolveExtensionHostTtsOutputFormat: vi.fn(() => ({
openai: "mp3",
elevenlabs: "mp3_44100_128",
extension: ".mp3",
voiceCompatible: false,
})),
}));
// Status tracking is stubbed so tests can assert on the recorded attempt.
vi.mock("./tts-status.js", () => ({
getExtensionHostLastTtsAttempt: vi.fn(),
setExtensionHostLastTtsAttempt: vi.fn(),
}));
// Tests for the TTS API functions exported from ./tts-api.js. All sibling
// ./tts-*.js modules are mocked, so these cases only check how the API wires
// its collaborators together.
describe("tts-api", () => {
beforeEach(() => {
// Clears recorded calls between cases.
// NOTE(review): return values set with mockReturnValue/mockResolvedValue in
// one case may carry over to later cases — confirm against the vitest
// version in use if a case starts depending on an unconfigured stub.
vi.clearAllMocks();
});
// Auto mode "inbound" with max length 900 and summarization disabled should
// yield the enabled banner, the inbound-only rule, and the length/summary line.
it("builds the remaining system prompt hint through host-owned preferences", async () => {
const configModule = await import("./tts-config.js");
const prefsModule = await import("./tts-preferences.js");
vi.mocked(configModule.resolveExtensionHostTtsConfig).mockReturnValue({} as never);
vi.mocked(prefsModule.resolveExtensionHostTtsPrefsPath).mockReturnValue("/tmp/tts.json");
vi.mocked(prefsModule.resolveExtensionHostTtsAutoMode).mockReturnValue("inbound");
vi.mocked(prefsModule.getExtensionHostTtsMaxLength).mockReturnValue(900);
vi.mocked(prefsModule.isExtensionHostTtsSummarizationEnabled).mockReturnValue(false);
const hint = buildExtensionHostTtsSystemPromptHint({} as never);
expect(hint).toContain("Voice (TTS) is enabled.");
expect(hint).toContain("Only use TTS when the user's last message includes audio/voice.");
expect(hint).toContain("Keep spoken text ≤900 chars");
expect(hint).toContain("summary off");
});
// When request setup reports an error, the API resolves to a failed result
// carrying that same error message.
it("returns setup validation errors through the host-owned TTS API", async () => {
const configModule = await import("./tts-config.js");
const prefsModule = await import("./tts-preferences.js");
const setupModule = await import("./tts-runtime-setup.js");
vi.mocked(configModule.resolveExtensionHostTtsConfig).mockReturnValue({} as never);
vi.mocked(prefsModule.resolveExtensionHostTtsPrefsPath).mockReturnValue("/tmp/tts.json");
vi.mocked(setupModule.resolveExtensionHostTtsRequestSetup).mockReturnValue({
error: "Text too long (5000 chars, max 4096)",
});
await expect(
runExtensionHostTextToSpeech({
text: "x".repeat(5000),
cfg: {} as never,
}),
).resolves.toEqual({
success: false,
error: "Text too long (5000 chars, max 4096)",
});
});
// Provider failure path: the plan's nextPayload is returned unchanged and a
// failed attempt is recorded.
it("returns the planned payload when TTS conversion fails", async () => {
const configModule = await import("./tts-config.js");
const prefsModule = await import("./tts-preferences.js");
const payloadModule = await import("./tts-payload.js");
const setupModule = await import("./tts-runtime-setup.js");
const executionModule = await import("./tts-runtime-execution.js");
const statusModule = await import("./tts-status.js");
vi.mocked(configModule.resolveExtensionHostTtsConfig).mockReturnValue({} as never);
vi.mocked(prefsModule.resolveExtensionHostTtsPrefsPath).mockReturnValue("/tmp/tts.json");
// A "ready" plan whose cleaned payload text differs from the original input.
vi.mocked(payloadModule.resolveExtensionHostTtsPayloadPlan).mockResolvedValue({
kind: "ready",
nextPayload: { text: "cleaned" },
textForAudio: "speak this",
wasSummarized: true,
overrides: {},
});
vi.mocked(setupModule.resolveExtensionHostTtsRequestSetup).mockReturnValue({
config: {} as never,
providers: ["openai"],
});
vi.mocked(executionModule.executeExtensionHostTextToSpeech).mockResolvedValue({
success: false,
error: "provider failed",
});
const result = await applyExtensionHostTtsToPayload({
payload: { text: "original" },
cfg: {} as never,
channel: "telegram",
kind: "final",
});
expect(result).toEqual({ text: "cleaned" });
// textLength is expected to reflect the original payload text, not the
// plan's textForAudio or cleaned text.
expect(statusModule.setExtensionHostLastTtsAttempt).toHaveBeenCalledWith(
expect.objectContaining({
success: false,
textLength: "original".length,
summarized: true,
error: "provider failed",
}),
);
});
});

View File

@ -0,0 +1,169 @@
import type { ReplyPayload } from "../auto-reply/types.js";
import type { OpenClawConfig } from "../config/config.js";
import { logVerbose } from "../globals.js";
import type { TtsDirectiveOverrides, TtsResult, TtsTelephonyResult } from "../tts/tts.js";
import {
resolveExtensionHostTtsConfig,
resolveExtensionHostTtsModelOverridePolicy,
} from "./tts-config.js";
import { resolveExtensionHostTtsPayloadPlan } from "./tts-payload.js";
import {
getExtensionHostTtsMaxLength,
isExtensionHostTtsSummarizationEnabled,
resolveExtensionHostTtsAutoMode,
resolveExtensionHostTtsPrefsPath,
} from "./tts-preferences.js";
import {
executeExtensionHostTextToSpeech,
executeExtensionHostTextToSpeechTelephony,
isExtensionHostTtsVoiceBubbleChannel,
resolveExtensionHostEdgeOutputFormat,
resolveExtensionHostTtsOutputFormat,
} from "./tts-runtime-execution.js";
import { resolveExtensionHostTtsRequestSetup } from "./tts-runtime-setup.js";
import { setExtensionHostLastTtsAttempt, type ExtensionHostTtsStatusEntry } from "./tts-status.js";
export type { ExtensionHostTtsStatusEntry };
export { resolveExtensionHostTtsModelOverridePolicy };
export { resolveExtensionHostTtsOutputFormat, resolveExtensionHostEdgeOutputFormat };
/**
 * Builds the system-prompt hint that tells the model how to use TTS.
 *
 * @param cfg - Application config from which the TTS config and prefs path are resolved.
 * @returns `undefined` when the resolved auto mode is `"off"`; otherwise the
 *   hint lines joined with newlines. A mode-specific rule line is included
 *   only for the `"inbound"` and `"tagged"` auto modes.
 */
export function buildExtensionHostTtsSystemPromptHint(cfg: OpenClawConfig): string | undefined {
  const config = resolveExtensionHostTtsConfig(cfg);
  const prefsPath = resolveExtensionHostTtsPrefsPath(config);
  const autoMode = resolveExtensionHostTtsAutoMode({ config, prefsPath });
  if (autoMode === "off") {
    return undefined;
  }

  const maxLength = getExtensionHostTtsMaxLength(prefsPath);
  const summaryState = isExtensionHostTtsSummarizationEnabled(prefsPath) ? "on" : "off";

  const hintLines: string[] = ["Voice (TTS) is enabled."];
  // Only the "inbound" and "tagged" modes add an extra usage rule.
  if (autoMode === "inbound") {
    hintLines.push("Only use TTS when the user's last message includes audio/voice.");
  } else if (autoMode === "tagged") {
    hintLines.push("Only use TTS when you include [[tts]] or [[tts:text]] tags.");
  }
  hintLines.push(
    `Keep spoken text ≤${maxLength} chars to avoid auto-summary (summary ${summaryState}).`,
  );
  hintLines.push(
    "Use [[tts:...]] and optional [[tts:text]]...[[/tts:text]] to control voice/expressiveness.",
  );
  return hintLines.join("\n");
}
/**
 * Resolves the TTS request setup for `params.text` and, when setup succeeds,
 * hands the request to the text-to-speech executor.
 *
 * @param params.text - Text to synthesize.
 * @param params.cfg - Application config used to resolve the TTS config.
 * @param params.prefsPath - Optional prefs path; resolved from the TTS config when omitted.
 * @param params.channel - Optional channel, forwarded to the executor.
 * @param params.overrides - Optional directive overrides; `overrides.provider`
 *   is passed to request setup as the provider override.
 * @returns The executor's result, or `{ success: false, error }` when request
 *   setup reports an error.
 */
export async function runExtensionHostTextToSpeech(params: {
  text: string;
  cfg: OpenClawConfig;
  prefsPath?: string;
  channel?: string;
  overrides?: TtsDirectiveOverrides;
}): Promise<TtsResult> {
  const { text, cfg, channel, overrides } = params;
  const config = resolveExtensionHostTtsConfig(cfg);
  const prefsPath = params.prefsPath ?? resolveExtensionHostTtsPrefsPath(config);
  const setupRequest = {
    text,
    config,
    prefsPath,
    providerOverride: overrides?.provider,
  };
  const setup = resolveExtensionHostTtsRequestSetup(setupRequest);

  if (!("error" in setup)) {
    return executeExtensionHostTextToSpeech({
      text,
      config: setup.config,
      providers: setup.providers,
      channel,
      overrides,
    });
  }
  // Setup rejected the request; surface its message as a failed result.
  return { success: false, error: setup.error };
}
/**
 * Telephony variant of text-to-speech: resolves the request setup for
 * `params.text` and, when setup succeeds, hands the request to the telephony
 * executor. No channel or directive overrides are involved.
 *
 * @param params.text - Text to synthesize.
 * @param params.cfg - Application config used to resolve the TTS config.
 * @param params.prefsPath - Optional prefs path; resolved from the TTS config when omitted.
 * @returns The telephony executor's result, or `{ success: false, error }`
 *   when request setup reports an error.
 */
export async function runExtensionHostTextToSpeechTelephony(params: {
  text: string;
  cfg: OpenClawConfig;
  prefsPath?: string;
}): Promise<TtsTelephonyResult> {
  const { text, cfg } = params;
  const config = resolveExtensionHostTtsConfig(cfg);
  const prefsPath = params.prefsPath ?? resolveExtensionHostTtsPrefsPath(config);
  const setup = resolveExtensionHostTtsRequestSetup({ text, config, prefsPath });

  if (!("error" in setup)) {
    return executeExtensionHostTextToSpeechTelephony({
      text,
      config: setup.config,
      providers: setup.providers,
    });
  }
  // Setup rejected the request; surface its message as a failed result.
  return { success: false, error: setup.error };
}
/**
 * Plans and, when the plan is not a skip, performs TTS for a reply payload.
 *
 * Flow: resolve the payload plan; if the plan says "skip", return the plan's
 * payload untouched. Otherwise synthesize `plan.textForAudio`, record the
 * attempt via the status module, and return the plan's next payload — with
 * `mediaUrl`/`audioAsVoice` attached on success, or as-is on failure.
 *
 * @param params.payload - Reply payload to augment; its `text` length is what
 *   gets recorded as `textLength`.
 * @param params.cfg - Application config used to resolve the TTS config.
 * @param params.channel - Optional channel; forwarded to synthesis and used for
 *   the voice-bubble check.
 * @param params.kind - Optional payload kind, forwarded to the planner.
 * @param params.inboundAudio - Optional flag, forwarded to the planner.
 * @param params.ttsAuto - Optional auto-mode value, forwarded to the planner.
 * @returns The resulting reply payload.
 */
export async function applyExtensionHostTtsToPayload(params: {
  payload: ReplyPayload;
  cfg: OpenClawConfig;
  channel?: string;
  kind?: "tool" | "block" | "final";
  inboundAudio?: boolean;
  ttsAuto?: string;
}): Promise<ReplyPayload> {
  const config = resolveExtensionHostTtsConfig(params.cfg);
  const prefsPath = resolveExtensionHostTtsPrefsPath(config);
  const plan = await resolveExtensionHostTtsPayloadPlan({
    payload: params.payload,
    cfg: params.cfg,
    config,
    prefsPath,
    kind: params.kind,
    inboundAudio: params.inboundAudio,
    ttsAuto: params.ttsAuto,
  });
  if (plan.kind === "skip") {
    return plan.payload;
  }

  const startedAt = Date.now();
  const result = await runExtensionHostTextToSpeech({
    text: plan.textForAudio,
    cfg: params.cfg,
    prefsPath,
    channel: params.channel,
    overrides: plan.overrides,
  });

  // Fields common to both the success and failure status entries.
  const textLength = (params.payload.text ?? "").length;
  const summarized = plan.wasSummarized;

  // A result without an audio path is treated the same as an explicit failure.
  if (!result.success || !result.audioPath) {
    setExtensionHostLastTtsAttempt({
      timestamp: Date.now(),
      success: false,
      textLength,
      summarized,
      error: result.error,
    });
    const elapsedMs = Date.now() - startedAt;
    logVerbose(`TTS: conversion failed after ${elapsedMs}ms (${result.error ?? "unknown"}).`);
    return plan.nextPayload;
  }

  setExtensionHostLastTtsAttempt({
    timestamp: Date.now(),
    success: true,
    textLength,
    summarized,
    provider: result.provider,
    latencyMs: result.latencyMs,
  });
  // Voice delivery requires both a voice-bubble channel and voice-compatible audio;
  // otherwise the caller's own audioAsVoice value is kept.
  const deliverAsVoice =
    isExtensionHostTtsVoiceBubbleChannel(params.channel) && result.voiceCompatible === true;
  return {
    ...plan.nextPayload,
    mediaUrl: result.audioPath,
    audioAsVoice: deliverAsVoice || params.payload.audioAsVoice,
  };
}

View File

@ -1,18 +1,23 @@
import type { ReplyPayload } from "../auto-reply/types.js";
import type { OpenClawConfig } from "../config/config.js";
import type { TtsProvider } from "../config/types.tts.js";
import {
applyExtensionHostTtsToPayload,
buildExtensionHostTtsSystemPromptHint,
resolveExtensionHostEdgeOutputFormat,
resolveExtensionHostTtsModelOverridePolicy,
resolveExtensionHostTtsOutputFormat,
runExtensionHostTextToSpeech,
runExtensionHostTextToSpeechTelephony,
type ExtensionHostTtsStatusEntry,
} from "../extension-host/tts-api.js";
import {
normalizeExtensionHostTtsConfigAutoMode,
resolveExtensionHostTtsConfig,
resolveExtensionHostTtsModelOverridePolicy,
type ResolvedTtsConfig,
} from "../extension-host/tts-config.js";
import { resolveExtensionHostTtsPayloadPlan } from "../extension-host/tts-payload.js";
import {
getExtensionHostTtsMaxLength,
isExtensionHostTtsEnabled,
isExtensionHostTtsSummarizationEnabled,
resolveExtensionHostTtsAutoMode,
resolveExtensionHostTtsPrefsPath,
setExtensionHostTtsAutoMode,
setExtensionHostTtsEnabled,
@ -20,29 +25,17 @@ import {
setExtensionHostTtsProvider,
setExtensionHostTtsSummarizationEnabled,
} from "../extension-host/tts-preferences.js";
import {
executeExtensionHostTextToSpeech,
executeExtensionHostTextToSpeechTelephony,
isExtensionHostTtsVoiceBubbleChannel,
resolveExtensionHostEdgeOutputFormat,
resolveExtensionHostTtsOutputFormat,
} from "../extension-host/tts-runtime-execution.js";
import {
EXTENSION_HOST_TTS_PROVIDER_IDS,
isExtensionHostTtsProviderConfigured,
resolveExtensionHostTtsApiKey,
resolveExtensionHostTtsProviderOrder,
} from "../extension-host/tts-runtime-registry.js";
import {
resolveExtensionHostTtsProvider,
resolveExtensionHostTtsRequestSetup,
} from "../extension-host/tts-runtime-setup.js";
import { resolveExtensionHostTtsProvider } from "../extension-host/tts-runtime-setup.js";
import {
getExtensionHostLastTtsAttempt,
setExtensionHostLastTtsAttempt,
type ExtensionHostTtsStatusEntry,
} from "../extension-host/tts-status.js";
import { logVerbose } from "../globals.js";
import {
isValidOpenAIModel,
isValidOpenAIVoice,
@ -108,32 +101,7 @@ export const resolveTtsConfig = resolveExtensionHostTtsConfig;
export const resolveTtsPrefsPath = resolveExtensionHostTtsPrefsPath;
export const resolveTtsAutoMode = resolveExtensionHostTtsAutoMode;
export function buildTtsSystemPromptHint(cfg: OpenClawConfig): string | undefined {
const config = resolveTtsConfig(cfg);
const prefsPath = resolveTtsPrefsPath(config);
const autoMode = resolveTtsAutoMode({ config, prefsPath });
if (autoMode === "off") {
return undefined;
}
const maxLength = getExtensionHostTtsMaxLength(prefsPath);
const summarize = isSummarizationEnabled(prefsPath) ? "on" : "off";
const autoHint =
autoMode === "inbound"
? "Only use TTS when the user's last message includes audio/voice."
: autoMode === "tagged"
? "Only use TTS when you include [[tts]] or [[tts:text]] tags."
: undefined;
return [
"Voice (TTS) is enabled.",
autoHint,
`Keep spoken text ≤${maxLength} chars to avoid auto-summary (summary ${summarize}).`,
"Use [[tts:...]] and optional [[tts:text]]...[[/tts:text]] to control voice/expressiveness.",
]
.filter(Boolean)
.join("\n");
}
export const buildTtsSystemPromptHint = buildExtensionHostTtsSystemPromptHint;
export const isTtsEnabled = isExtensionHostTtsEnabled;
@ -169,121 +137,11 @@ export const resolveTtsProviderOrder = resolveExtensionHostTtsProviderOrder;
export const isTtsProviderConfigured = isExtensionHostTtsProviderConfigured;
export async function textToSpeech(params: {
text: string;
cfg: OpenClawConfig;
prefsPath?: string;
channel?: string;
overrides?: TtsDirectiveOverrides;
}): Promise<TtsResult> {
const config = resolveTtsConfig(params.cfg);
const prefsPath = params.prefsPath ?? resolveTtsPrefsPath(config);
const setup = resolveExtensionHostTtsRequestSetup({
text: params.text,
config,
prefsPath,
providerOverride: params.overrides?.provider,
});
if ("error" in setup) {
return { success: false, error: setup.error };
}
export const textToSpeech = runExtensionHostTextToSpeech;
return executeExtensionHostTextToSpeech({
text: params.text,
config: setup.config,
providers: setup.providers,
channel: params.channel,
overrides: params.overrides,
});
}
export const textToSpeechTelephony = runExtensionHostTextToSpeechTelephony;
export async function textToSpeechTelephony(params: {
text: string;
cfg: OpenClawConfig;
prefsPath?: string;
}): Promise<TtsTelephonyResult> {
const config = resolveTtsConfig(params.cfg);
const prefsPath = params.prefsPath ?? resolveTtsPrefsPath(config);
const setup = resolveExtensionHostTtsRequestSetup({
text: params.text,
config,
prefsPath,
});
if ("error" in setup) {
return { success: false, error: setup.error };
}
return executeExtensionHostTextToSpeechTelephony({
text: params.text,
config: setup.config,
providers: setup.providers,
});
}
export async function maybeApplyTtsToPayload(params: {
payload: ReplyPayload;
cfg: OpenClawConfig;
channel?: string;
kind?: "tool" | "block" | "final";
inboundAudio?: boolean;
ttsAuto?: string;
}): Promise<ReplyPayload> {
const config = resolveTtsConfig(params.cfg);
const prefsPath = resolveTtsPrefsPath(config);
const plan = await resolveExtensionHostTtsPayloadPlan({
payload: params.payload,
cfg: params.cfg,
config,
prefsPath,
kind: params.kind,
inboundAudio: params.inboundAudio,
ttsAuto: params.ttsAuto,
});
if (plan.kind === "skip") {
return plan.payload;
}
const ttsStart = Date.now();
const result = await textToSpeech({
text: plan.textForAudio,
cfg: params.cfg,
prefsPath,
channel: params.channel,
overrides: plan.overrides,
});
if (result.success && result.audioPath) {
setExtensionHostLastTtsAttempt({
timestamp: Date.now(),
success: true,
textLength: (params.payload.text ?? "").length,
summarized: plan.wasSummarized,
provider: result.provider,
latencyMs: result.latencyMs,
});
const shouldVoice =
isExtensionHostTtsVoiceBubbleChannel(params.channel) && result.voiceCompatible === true;
const finalPayload = {
...plan.nextPayload,
mediaUrl: result.audioPath,
audioAsVoice: shouldVoice || params.payload.audioAsVoice,
};
return finalPayload;
}
setExtensionHostLastTtsAttempt({
timestamp: Date.now(),
success: false,
textLength: (params.payload.text ?? "").length,
summarized: plan.wasSummarized,
error: result.error,
});
const latency = Date.now() - ttsStart;
logVerbose(`TTS: conversion failed after ${latency}ms (${result.error ?? "unknown"}).`);
return nextPayload;
}
export const maybeApplyTtsToPayload = applyExtensionHostTtsToPayload;
export const _test = {
isValidVoiceId,