From 60882db51565ee178a5c8d5bf1162afad0a9c659 Mon Sep 17 00:00:00 2001 From: mariko-code-bot Date: Wed, 11 Mar 2026 22:15:25 +0000 Subject: [PATCH 1/2] feat(tts): add deliveryMode return for metadata-only output --- src/agents/tools/tts-tool.test.ts | 40 +++++++++++++++++++++++ src/agents/tools/tts-tool.ts | 54 +++++++++++++++++++++++++++++-- 2 files changed, 92 insertions(+), 2 deletions(-) diff --git a/src/agents/tools/tts-tool.test.ts b/src/agents/tools/tts-tool.test.ts index fe9a6c1def9..07284bf8105 100644 --- a/src/agents/tools/tts-tool.test.ts +++ b/src/agents/tools/tts-tool.test.ts @@ -4,6 +4,11 @@ vi.mock("../../auto-reply/tokens.js", () => ({ SILENT_REPLY_TOKEN: "QUIET_TOKEN", })); +const textToSpeechMock = vi.hoisted(() => vi.fn()); +vi.mock("../../tts/tts.js", () => ({ + textToSpeech: textToSpeechMock, +})); + const { createTtsTool } = await import("./tts-tool.js"); describe("createTtsTool", () => { @@ -13,4 +18,39 @@ describe("createTtsTool", () => { expect(tool.description).toContain("QUIET_TOKEN"); expect(tool.description).not.toContain("NO_REPLY"); }); + + it("returns metadata only in deliveryMode=return", async () => { + textToSpeechMock.mockResolvedValueOnce({ + success: true, + audioPath: "/tmp/openclaw/tts-test/voice.mp3", + provider: "openai", + voiceCompatible: false, + }); + const tool = createTtsTool(); + + const result = await tool.execute("call-1", { + text: "hello", + deliveryMode: "return", + }); + + expect(result.content[0]?.type).toBe("text"); + expect((result.content[0] as { text: string }).text).not.toContain("MEDIA:"); + expect(result.details).toMatchObject({ + ok: true, + deliveryMode: "return", + audioPath: "/tmp/openclaw/tts-test/voice.mp3", + sent: false, + }); + }); + + it("returns validation error for invalid deliveryMode", async () => { + const tool = createTtsTool(); + const result = await tool.execute("call-2", { + text: "hello", + deliveryMode: "invalid-mode", + }); + + expect((result.details as { error?: { code?: string } }).error?.code).toBe("VALIDATION_ERROR"); + expect((result.content[0] as { text: string }).text).toContain("deliveryMode must be one of"); + }); }); diff --git a/src/agents/tools/tts-tool.ts b/src/agents/tools/tts-tool.ts index 03ed3cd9a04..d313ed7140a 100644 --- a/src/agents/tools/tts-tool.ts +++ b/src/agents/tools/tts-tool.ts @@ -12,6 +12,12 @@ const TtsToolSchema = Type.Object({ channel: Type.Optional( Type.String({ description: "Optional channel id to pick output format (e.g. telegram)." }), ), + deliveryMode: Type.Optional( + Type.Union([Type.Literal("send"), Type.Literal("return")], { + description: + "Delivery mode: 'send' (default) returns MEDIA output for normal delivery; 'return' returns metadata only without MEDIA output.", + }), + ), }); export function createTtsTool(opts?: { @@ -27,6 +33,25 @@ export function createTtsTool(opts?: { const params = args as Record; const text = readStringParam(params, "text", { required: true }); const channel = readStringParam(params, "channel"); + const deliveryModeRaw = readStringParam(params, "deliveryMode"); + const deliveryMode = deliveryModeRaw == null || deliveryModeRaw === "" ? "send" : deliveryModeRaw; + if (deliveryMode !== "send" && deliveryMode !== "return") { + return { + content: [ + { + type: "text", + text: "deliveryMode must be one of: send, return", + }, + ], + details: { + ok: false, + error: { + code: "VALIDATION_ERROR", + message: "deliveryMode must be one of: send, return", + }, + }, + }; + } const cfg = opts?.config ?? loadConfig(); const result = await textToSpeech({ text, @@ -35,6 +60,25 @@ export function createTtsTool(opts?: { }); if (result.success && result.audioPath) { + if (deliveryMode === "return") { + return { + content: [ + { + type: "text", + text: "TTS audio generated (return mode).", + }, + ], + details: { + ok: true, + deliveryMode: "return", + audioPath: result.audioPath, + mimeType: "audio/mpeg", + sent: false, + provider: result.provider, + }, + }; + } + const lines: string[] = []; // Tag Telegram Opus output as a voice bubble instead of a file attachment. if (result.voiceCompatible) { @@ -43,7 +87,7 @@ export function createTtsTool(opts?: { lines.push(`MEDIA:${result.audioPath}`); return { content: [{ type: "text", text: lines.join("\n") }], - details: { audioPath: result.audioPath, provider: result.provider }, + details: { audioPath: result.audioPath, provider: result.provider, deliveryMode: "send" }, }; } @@ -54,7 +98,13 @@ export function createTtsTool(opts?: { text: result.error ?? "TTS conversion failed", }, ], - details: { error: result.error }, + details: { + ok: false, + error: { + code: "TTS_GENERATION_FAILED", + message: result.error ?? "TTS conversion failed", + }, + }, }; }, }; From 229817fdc79942dd1b307ac63a873fc267f135be Mon Sep 17 00:00:00 2001 From: mariko-code-bot Date: Wed, 11 Mar 2026 22:57:47 +0000 Subject: [PATCH 2/2] fix(tts): correct return mimeType and include ok in send details --- src/agents/tools/tts-tool.test.ts | 40 +++++++++++++++++++++++++++++++ src/agents/tools/tts-tool.ts | 9 +++++-- 2 files changed, 47 insertions(+), 2 deletions(-) diff --git a/src/agents/tools/tts-tool.test.ts b/src/agents/tools/tts-tool.test.ts index 07284bf8105..5b07da266ae 100644 --- a/src/agents/tools/tts-tool.test.ts +++ b/src/agents/tools/tts-tool.test.ts @@ -39,10 +39,50 @@ describe("createTtsTool", () => { ok: true, deliveryMode: "return", audioPath: "/tmp/openclaw/tts-test/voice.mp3", + mimeType: "audio/mpeg", sent: false, }); }); + it("uses audio/ogg mimeType for voice-compatible return output", async () => { + textToSpeechMock.mockResolvedValueOnce({ + success: true, + audioPath: "/tmp/openclaw/tts-test/voice.opus", + provider: "openai", + voiceCompatible: true, + }); + const tool = createTtsTool(); + + const result = await tool.execute("call-voice", { + text: "hello", + channel: "telegram", + deliveryMode: "return", + }); + + expect(result.details).toMatchObject({ + ok: true, + deliveryMode: "return", + mimeType: "audio/ogg", + }); + }); + + it("includes ok=true in send mode success details", async () => { + textToSpeechMock.mockResolvedValueOnce({ + success: true, + audioPath: "/tmp/openclaw/tts-test/voice.mp3", + provider: "openai", + voiceCompatible: false, + }); + const tool = createTtsTool(); + + const result = await tool.execute("call-send", { + text: "hello", + deliveryMode: "send", + }); + + expect(result.details).toMatchObject({ ok: true, deliveryMode: "send" }); + }); + it("returns validation error for invalid deliveryMode", async () => { const tool = createTtsTool(); const result = await tool.execute("call-2", { diff --git a/src/agents/tools/tts-tool.ts b/src/agents/tools/tts-tool.ts index d313ed7140a..2d54ab8499f 100644 --- a/src/agents/tools/tts-tool.ts +++ b/src/agents/tools/tts-tool.ts @@ -72,7 +72,7 @@ export function createTtsTool(opts?: { ok: true, deliveryMode: "return", audioPath: result.audioPath, - mimeType: "audio/mpeg", + mimeType: result.voiceCompatible ? "audio/ogg" : "audio/mpeg", sent: false, provider: result.provider, }, @@ -87,7 +87,12 @@ export function createTtsTool(opts?: { lines.push(`MEDIA:${result.audioPath}`); return { content: [{ type: "text", text: lines.join("\n") }], - details: { audioPath: result.audioPath, provider: result.provider, deliveryMode: "send" }, + details: { + ok: true, + audioPath: result.audioPath, + provider: result.provider, + deliveryMode: "send", + }, }; }