From 454e44242fd7d1e8424f725845f5e3ccd2d8c2fa Mon Sep 17 00:00:00 2001 From: Gustavo Madeira Santana Date: Sun, 15 Mar 2026 20:47:12 +0000 Subject: [PATCH] TTS: extract API composition --- src/extension-host/tts-api.test.ts | 135 ++++++++++++++++++++++ src/extension-host/tts-api.ts | 169 ++++++++++++++++++++++++++++ src/tts/tts.ts | 172 +++-------------------------- 3 files changed, 319 insertions(+), 157 deletions(-) create mode 100644 src/extension-host/tts-api.test.ts create mode 100644 src/extension-host/tts-api.ts diff --git a/src/extension-host/tts-api.test.ts b/src/extension-host/tts-api.test.ts new file mode 100644 index 00000000000..97306d3633d --- /dev/null +++ b/src/extension-host/tts-api.test.ts @@ -0,0 +1,135 @@ +import { beforeEach, describe, expect, it, vi } from "vitest"; +import { + applyExtensionHostTtsToPayload, + buildExtensionHostTtsSystemPromptHint, + runExtensionHostTextToSpeech, +} from "./tts-api.js"; + +vi.mock("./tts-config.js", () => ({ + normalizeExtensionHostTtsConfigAutoMode: vi.fn(), + resolveExtensionHostTtsConfig: vi.fn(), + resolveExtensionHostTtsModelOverridePolicy: vi.fn(), +})); + +vi.mock("./tts-preferences.js", () => ({ + getExtensionHostTtsMaxLength: vi.fn(), + isExtensionHostTtsSummarizationEnabled: vi.fn(), + resolveExtensionHostTtsAutoMode: vi.fn(), + resolveExtensionHostTtsPrefsPath: vi.fn(), +})); + +vi.mock("./tts-payload.js", () => ({ + resolveExtensionHostTtsPayloadPlan: vi.fn(), +})); + +vi.mock("./tts-runtime-setup.js", () => ({ + resolveExtensionHostTtsRequestSetup: vi.fn(), +})); + +vi.mock("./tts-runtime-execution.js", () => ({ + executeExtensionHostTextToSpeech: vi.fn(), + executeExtensionHostTextToSpeechTelephony: vi.fn(), + isExtensionHostTtsVoiceBubbleChannel: vi.fn(() => false), + resolveExtensionHostEdgeOutputFormat: vi.fn(() => "audio-24khz-48kbitrate-mono-mp3"), + resolveExtensionHostTtsOutputFormat: vi.fn(() => ({ + openai: "mp3", + elevenlabs: "mp3_44100_128", + extension: ".mp3", + voiceCompatible: false, + })), +})); + +vi.mock("./tts-status.js", () => ({ + getExtensionHostLastTtsAttempt: vi.fn(), + setExtensionHostLastTtsAttempt: vi.fn(), +})); + +describe("tts-api", () => { + beforeEach(() => { + vi.clearAllMocks(); + }); + + it("builds the remaining system prompt hint through host-owned preferences", async () => { + const configModule = await import("./tts-config.js"); + const prefsModule = await import("./tts-preferences.js"); + + vi.mocked(configModule.resolveExtensionHostTtsConfig).mockReturnValue({} as never); + vi.mocked(prefsModule.resolveExtensionHostTtsPrefsPath).mockReturnValue("/tmp/tts.json"); + vi.mocked(prefsModule.resolveExtensionHostTtsAutoMode).mockReturnValue("inbound"); + vi.mocked(prefsModule.getExtensionHostTtsMaxLength).mockReturnValue(900); + vi.mocked(prefsModule.isExtensionHostTtsSummarizationEnabled).mockReturnValue(false); + + const hint = buildExtensionHostTtsSystemPromptHint({} as never); + + expect(hint).toContain("Voice (TTS) is enabled."); + expect(hint).toContain("Only use TTS when the user's last message includes audio/voice."); + expect(hint).toContain("Keep spoken text ≤900 chars"); + expect(hint).toContain("summary off"); + }); + + it("returns setup validation errors through the host-owned TTS API", async () => { + const configModule = await import("./tts-config.js"); + const prefsModule = await import("./tts-preferences.js"); + const setupModule = await import("./tts-runtime-setup.js"); + + vi.mocked(configModule.resolveExtensionHostTtsConfig).mockReturnValue({} as never); + vi.mocked(prefsModule.resolveExtensionHostTtsPrefsPath).mockReturnValue("/tmp/tts.json"); + vi.mocked(setupModule.resolveExtensionHostTtsRequestSetup).mockReturnValue({ + error: "Text too long (5000 chars, max 4096)", + }); + + await expect( + runExtensionHostTextToSpeech({ + text: "x".repeat(5000), + cfg: {} as never, + }), + ).resolves.toEqual({ + success: false, + error: "Text too long (5000 chars, max 4096)", + }); + }); + + it("returns the planned payload when TTS conversion fails", async () => { + const configModule = await import("./tts-config.js"); + const prefsModule = await import("./tts-preferences.js"); + const payloadModule = await import("./tts-payload.js"); + const setupModule = await import("./tts-runtime-setup.js"); + const executionModule = await import("./tts-runtime-execution.js"); + const statusModule = await import("./tts-status.js"); + + vi.mocked(configModule.resolveExtensionHostTtsConfig).mockReturnValue({} as never); + vi.mocked(prefsModule.resolveExtensionHostTtsPrefsPath).mockReturnValue("/tmp/tts.json"); + vi.mocked(payloadModule.resolveExtensionHostTtsPayloadPlan).mockResolvedValue({ + kind: "ready", + nextPayload: { text: "cleaned" }, + textForAudio: "speak this", + wasSummarized: true, + overrides: {}, + }); + vi.mocked(setupModule.resolveExtensionHostTtsRequestSetup).mockReturnValue({ + config: {} as never, + providers: ["openai"], + }); + vi.mocked(executionModule.executeExtensionHostTextToSpeech).mockResolvedValue({ + success: false, + error: "provider failed", + }); + + const result = await applyExtensionHostTtsToPayload({ + payload: { text: "original" }, + cfg: {} as never, + channel: "telegram", + kind: "final", + }); + + expect(result).toEqual({ text: "cleaned" }); + expect(statusModule.setExtensionHostLastTtsAttempt).toHaveBeenCalledWith( + expect.objectContaining({ + success: false, + textLength: "original".length, + summarized: true, + error: "provider failed", + }), + ); + }); +}); diff --git a/src/extension-host/tts-api.ts b/src/extension-host/tts-api.ts new file mode 100644 index 00000000000..bf51da2f8ec --- /dev/null +++ b/src/extension-host/tts-api.ts @@ -0,0 +1,169 @@ +import type { ReplyPayload } from "../auto-reply/types.js"; +import type { OpenClawConfig } from "../config/config.js"; +import { logVerbose } from "../globals.js"; +import type { TtsDirectiveOverrides, TtsResult, TtsTelephonyResult } from "../tts/tts.js"; +import { + resolveExtensionHostTtsConfig, + resolveExtensionHostTtsModelOverridePolicy, +} from "./tts-config.js"; +import { resolveExtensionHostTtsPayloadPlan } from "./tts-payload.js"; +import { + getExtensionHostTtsMaxLength, + isExtensionHostTtsSummarizationEnabled, + resolveExtensionHostTtsAutoMode, + resolveExtensionHostTtsPrefsPath, +} from "./tts-preferences.js"; +import { + executeExtensionHostTextToSpeech, + executeExtensionHostTextToSpeechTelephony, + isExtensionHostTtsVoiceBubbleChannel, + resolveExtensionHostEdgeOutputFormat, + resolveExtensionHostTtsOutputFormat, +} from "./tts-runtime-execution.js"; +import { resolveExtensionHostTtsRequestSetup } from "./tts-runtime-setup.js"; +import { setExtensionHostLastTtsAttempt, type ExtensionHostTtsStatusEntry } from "./tts-status.js"; + +export type { ExtensionHostTtsStatusEntry }; + +export { resolveExtensionHostTtsModelOverridePolicy }; +export { resolveExtensionHostTtsOutputFormat, resolveExtensionHostEdgeOutputFormat }; + +export function buildExtensionHostTtsSystemPromptHint(cfg: OpenClawConfig): string | undefined { + const config = resolveExtensionHostTtsConfig(cfg); + const prefsPath = resolveExtensionHostTtsPrefsPath(config); + const autoMode = resolveExtensionHostTtsAutoMode({ config, prefsPath }); + if (autoMode === "off") { + return undefined; + } + const maxLength = getExtensionHostTtsMaxLength(prefsPath); + const summarize = isExtensionHostTtsSummarizationEnabled(prefsPath) ? "on" : "off"; + const autoHint = + autoMode === "inbound" + ? "Only use TTS when the user's last message includes audio/voice." + : autoMode === "tagged" + ? "Only use TTS when you include [[tts]] or [[tts:text]] tags." + : undefined; + return [ + "Voice (TTS) is enabled.", + autoHint, + `Keep spoken text ≤${maxLength} chars to avoid auto-summary (summary ${summarize}).`, + "Use [[tts:...]] and optional [[tts:text]]...[[/tts:text]] to control voice/expressiveness.", + ] + .filter(Boolean) + .join("\n"); +} + +export async function runExtensionHostTextToSpeech(params: { + text: string; + cfg: OpenClawConfig; + prefsPath?: string; + channel?: string; + overrides?: TtsDirectiveOverrides; +}): Promise { + const config = resolveExtensionHostTtsConfig(params.cfg); + const prefsPath = params.prefsPath ?? resolveExtensionHostTtsPrefsPath(config); + const setup = resolveExtensionHostTtsRequestSetup({ + text: params.text, + config, + prefsPath, + providerOverride: params.overrides?.provider, + }); + if ("error" in setup) { + return { success: false, error: setup.error }; + } + + return executeExtensionHostTextToSpeech({ + text: params.text, + config: setup.config, + providers: setup.providers, + channel: params.channel, + overrides: params.overrides, + }); +} + +export async function runExtensionHostTextToSpeechTelephony(params: { + text: string; + cfg: OpenClawConfig; + prefsPath?: string; +}): Promise { + const config = resolveExtensionHostTtsConfig(params.cfg); + const prefsPath = params.prefsPath ?? resolveExtensionHostTtsPrefsPath(config); + const setup = resolveExtensionHostTtsRequestSetup({ + text: params.text, + config, + prefsPath, + }); + if ("error" in setup) { + return { success: false, error: setup.error }; + } + + return executeExtensionHostTextToSpeechTelephony({ + text: params.text, + config: setup.config, + providers: setup.providers, + }); +} + +export async function applyExtensionHostTtsToPayload(params: { + payload: ReplyPayload; + cfg: OpenClawConfig; + channel?: string; + kind?: "tool" | "block" | "final"; + inboundAudio?: boolean; + ttsAuto?: string; +}): Promise { + const config = resolveExtensionHostTtsConfig(params.cfg); + const prefsPath = resolveExtensionHostTtsPrefsPath(config); + const plan = await resolveExtensionHostTtsPayloadPlan({ + payload: params.payload, + cfg: params.cfg, + config, + prefsPath, + kind: params.kind, + inboundAudio: params.inboundAudio, + ttsAuto: params.ttsAuto, + }); + if (plan.kind === "skip") { + return plan.payload; + } + + const ttsStart = Date.now(); + const result = await runExtensionHostTextToSpeech({ + text: plan.textForAudio, + cfg: params.cfg, + prefsPath, + channel: params.channel, + overrides: plan.overrides, + }); + + if (result.success && result.audioPath) { + setExtensionHostLastTtsAttempt({ + timestamp: Date.now(), + success: true, + textLength: (params.payload.text ?? "").length, + summarized: plan.wasSummarized, + provider: result.provider, + latencyMs: result.latencyMs, + }); + + const shouldVoice = + isExtensionHostTtsVoiceBubbleChannel(params.channel) && result.voiceCompatible === true; + return { + ...plan.nextPayload, + mediaUrl: result.audioPath, + audioAsVoice: shouldVoice || params.payload.audioAsVoice, + }; + } + + setExtensionHostLastTtsAttempt({ + timestamp: Date.now(), + success: false, + textLength: (params.payload.text ?? "").length, + summarized: plan.wasSummarized, + error: result.error, + }); + + const latency = Date.now() - ttsStart; + logVerbose(`TTS: conversion failed after ${latency}ms (${result.error ?? "unknown"}).`); + return plan.nextPayload; +} diff --git a/src/tts/tts.ts b/src/tts/tts.ts index e9fc59c31e3..1a527718298 100644 --- a/src/tts/tts.ts +++ b/src/tts/tts.ts @@ -1,18 +1,23 @@ -import type { ReplyPayload } from "../auto-reply/types.js"; -import type { OpenClawConfig } from "../config/config.js"; import type { TtsProvider } from "../config/types.tts.js"; +import { + applyExtensionHostTtsToPayload, + buildExtensionHostTtsSystemPromptHint, + resolveExtensionHostEdgeOutputFormat, + resolveExtensionHostTtsModelOverridePolicy, + resolveExtensionHostTtsOutputFormat, + runExtensionHostTextToSpeech, + runExtensionHostTextToSpeechTelephony, + type ExtensionHostTtsStatusEntry, +} from "../extension-host/tts-api.js"; import { normalizeExtensionHostTtsConfigAutoMode, resolveExtensionHostTtsConfig, - resolveExtensionHostTtsModelOverridePolicy, type ResolvedTtsConfig, } from "../extension-host/tts-config.js"; -import { resolveExtensionHostTtsPayloadPlan } from "../extension-host/tts-payload.js"; import { getExtensionHostTtsMaxLength, isExtensionHostTtsEnabled, isExtensionHostTtsSummarizationEnabled, - resolveExtensionHostTtsAutoMode, resolveExtensionHostTtsPrefsPath, setExtensionHostTtsAutoMode, setExtensionHostTtsEnabled, @@ -20,29 +25,17 @@ import { setExtensionHostTtsProvider, setExtensionHostTtsSummarizationEnabled, } from "../extension-host/tts-preferences.js"; -import { - executeExtensionHostTextToSpeech, - executeExtensionHostTextToSpeechTelephony, - isExtensionHostTtsVoiceBubbleChannel, - resolveExtensionHostEdgeOutputFormat, - resolveExtensionHostTtsOutputFormat, -} from "../extension-host/tts-runtime-execution.js"; import { EXTENSION_HOST_TTS_PROVIDER_IDS, isExtensionHostTtsProviderConfigured, resolveExtensionHostTtsApiKey, resolveExtensionHostTtsProviderOrder, } from "../extension-host/tts-runtime-registry.js"; -import { - resolveExtensionHostTtsProvider, - resolveExtensionHostTtsRequestSetup, -} from "../extension-host/tts-runtime-setup.js"; +import { resolveExtensionHostTtsProvider } from "../extension-host/tts-runtime-setup.js"; import { getExtensionHostLastTtsAttempt, setExtensionHostLastTtsAttempt, - type ExtensionHostTtsStatusEntry, } from "../extension-host/tts-status.js"; -import { logVerbose } from "../globals.js"; import { isValidOpenAIModel, isValidOpenAIVoice, @@ -108,32 +101,7 @@ export const resolveTtsConfig = resolveExtensionHostTtsConfig; export const resolveTtsPrefsPath = resolveExtensionHostTtsPrefsPath; -export const resolveTtsAutoMode = resolveExtensionHostTtsAutoMode; - -export function buildTtsSystemPromptHint(cfg: OpenClawConfig): string | undefined { - const config = resolveTtsConfig(cfg); - const prefsPath = resolveTtsPrefsPath(config); - const autoMode = resolveTtsAutoMode({ config, prefsPath }); - if (autoMode === "off") { - return undefined; - } - const maxLength = getExtensionHostTtsMaxLength(prefsPath); - const summarize = isSummarizationEnabled(prefsPath) ? "on" : "off"; - const autoHint = - autoMode === "inbound" - ? "Only use TTS when the user's last message includes audio/voice." - : autoMode === "tagged" - ? "Only use TTS when you include [[tts]] or [[tts:text]] tags." - : undefined; - return [ - "Voice (TTS) is enabled.", - autoHint, - `Keep spoken text ≤${maxLength} chars to avoid auto-summary (summary ${summarize}).`, - "Use [[tts:...]] and optional [[tts:text]]...[[/tts:text]] to control voice/expressiveness.", - ] - .filter(Boolean) - .join("\n"); -} +export const buildTtsSystemPromptHint = buildExtensionHostTtsSystemPromptHint; export const isTtsEnabled = isExtensionHostTtsEnabled; @@ -169,121 +137,11 @@ export const resolveTtsProviderOrder = resolveExtensionHostTtsProviderOrder; export const isTtsProviderConfigured = isExtensionHostTtsProviderConfigured; -export async function textToSpeech(params: { - text: string; - cfg: OpenClawConfig; - prefsPath?: string; - channel?: string; - overrides?: TtsDirectiveOverrides; -}): Promise { - const config = resolveTtsConfig(params.cfg); - const prefsPath = params.prefsPath ?? resolveTtsPrefsPath(config); - const setup = resolveExtensionHostTtsRequestSetup({ - text: params.text, - config, - prefsPath, - providerOverride: params.overrides?.provider, - }); - if ("error" in setup) { - return { success: false, error: setup.error }; - } +export const textToSpeech = runExtensionHostTextToSpeech; - return executeExtensionHostTextToSpeech({ - text: params.text, - config: setup.config, - providers: setup.providers, - channel: params.channel, - overrides: params.overrides, - }); -} +export const textToSpeechTelephony = runExtensionHostTextToSpeechTelephony; -export async function textToSpeechTelephony(params: { - text: string; - cfg: OpenClawConfig; - prefsPath?: string; -}): Promise { - const config = resolveTtsConfig(params.cfg); - const prefsPath = params.prefsPath ?? resolveTtsPrefsPath(config); - const setup = resolveExtensionHostTtsRequestSetup({ - text: params.text, - config, - prefsPath, - }); - if ("error" in setup) { - return { success: false, error: setup.error }; - } - - return executeExtensionHostTextToSpeechTelephony({ - text: params.text, - config: setup.config, - providers: setup.providers, - }); -} - -export async function maybeApplyTtsToPayload(params: { - payload: ReplyPayload; - cfg: OpenClawConfig; - channel?: string; - kind?: "tool" | "block" | "final"; - inboundAudio?: boolean; - ttsAuto?: string; -}): Promise { - const config = resolveTtsConfig(params.cfg); - const prefsPath = resolveTtsPrefsPath(config); - const plan = await resolveExtensionHostTtsPayloadPlan({ - payload: params.payload, - cfg: params.cfg, - config, - prefsPath, - kind: params.kind, - inboundAudio: params.inboundAudio, - ttsAuto: params.ttsAuto, - }); - if (plan.kind === "skip") { - return plan.payload; - } - - const ttsStart = Date.now(); - const result = await textToSpeech({ - text: plan.textForAudio, - cfg: params.cfg, - prefsPath, - channel: params.channel, - overrides: plan.overrides, - }); - - if (result.success && result.audioPath) { - setExtensionHostLastTtsAttempt({ - timestamp: Date.now(), - success: true, - textLength: (params.payload.text ?? "").length, - summarized: plan.wasSummarized, - provider: result.provider, - latencyMs: result.latencyMs, - }); - - const shouldVoice = - isExtensionHostTtsVoiceBubbleChannel(params.channel) && result.voiceCompatible === true; - const finalPayload = { - ...plan.nextPayload, - mediaUrl: result.audioPath, - audioAsVoice: shouldVoice || params.payload.audioAsVoice, - }; - return finalPayload; - } - - setExtensionHostLastTtsAttempt({ - timestamp: Date.now(), - success: false, - textLength: (params.payload.text ?? "").length, - summarized: plan.wasSummarized, - error: result.error, - }); - - const latency = Date.now() - ttsStart; - logVerbose(`TTS: conversion failed after ${latency}ms (${result.error ?? "unknown"}).`); - return nextPayload; -} +export const maybeApplyTtsToPayload = applyExtensionHostTtsToPayload; export const _test = { isValidVoiceId,