From 60882db51565ee178a5c8d5bf1162afad0a9c659 Mon Sep 17 00:00:00 2001
From: mariko-code-bot <mariko-code-bot@users.noreply.github.com>
Date: Wed, 11 Mar 2026 22:15:25 +0000
Subject: [PATCH 1/2] feat(tts): add deliveryMode return for metadata-only
 output

---
 src/agents/tools/tts-tool.test.ts | 40 +++++++++++++++++++++++
 src/agents/tools/tts-tool.ts      | 54 +++++++++++++++++++++++++++++--
 2 files changed, 92 insertions(+), 2 deletions(-)

diff --git a/src/agents/tools/tts-tool.test.ts b/src/agents/tools/tts-tool.test.ts
index fe9a6c1def9..07284bf8105 100644
--- a/src/agents/tools/tts-tool.test.ts
+++ b/src/agents/tools/tts-tool.test.ts
@@ -4,6 +4,11 @@ vi.mock("../../auto-reply/tokens.js", () => ({
   SILENT_REPLY_TOKEN: "QUIET_TOKEN",
 }));
 
+const textToSpeechMock = vi.hoisted(() => vi.fn()); // hoisted so the vi.mock factory below can reference it
+vi.mock("../../tts/tts.js", () => ({
+  textToSpeech: textToSpeechMock, // tests control the TTS result via mockResolvedValueOnce
+}));
+
 const { createTtsTool } = await import("./tts-tool.js");
 
 describe("createTtsTool", () => {
@@ -13,4 +18,39 @@ describe("createTtsTool", () => {
     expect(tool.description).toContain("QUIET_TOKEN");
     expect(tool.description).not.toContain("NO_REPLY");
   });
+
+  it("returns metadata only in deliveryMode=return", async () => {
+    textToSpeechMock.mockResolvedValueOnce({
+      success: true, // simulated successful synthesis
+      audioPath: "/tmp/openclaw/tts-test/voice.mp3",
+      provider: "openai",
+      voiceCompatible: false,
+    });
+    const tool = createTtsTool();
+
+    const result = await tool.execute("call-1", {
+      text: "hello",
+      deliveryMode: "return", // metadata-only mode: content must not carry a MEDIA: line
+    });
+
+    expect(result.content[0]?.type).toBe("text");
+    expect((result.content[0] as { text: string }).text).not.toContain("MEDIA:");
+    expect(result.details).toMatchObject({
+      ok: true,
+      deliveryMode: "return",
+      audioPath: "/tmp/openclaw/tts-test/voice.mp3",
+      sent: false,
+    });
+  });
+
+  it("returns validation error for invalid deliveryMode", async () => {
+    const tool = createTtsTool();
+    const result = await tool.execute("call-2", {
+      text: "hello",
+      deliveryMode: "invalid-mode", // neither "send" nor "return"
+    });
+
+    expect((result.details as { error?: { code?: string } }).error?.code).toBe("VALIDATION_ERROR");
+    expect((result.content[0] as { text: string }).text).toContain("deliveryMode must be one of");
+  });
 });
diff --git a/src/agents/tools/tts-tool.ts b/src/agents/tools/tts-tool.ts
index 03ed3cd9a04..d313ed7140a 100644
--- a/src/agents/tools/tts-tool.ts
+++ b/src/agents/tools/tts-tool.ts
@@ -12,6 +12,12 @@ const TtsToolSchema = Type.Object({
   channel: Type.Optional(
     Type.String({ description: "Optional channel id to pick output format (e.g. telegram)." }),
   ),
+  deliveryMode: Type.Optional(
+    Type.Union([Type.Literal("send"), Type.Literal("return")], {
+      description:
+        "Delivery mode: 'send' (default) returns MEDIA output for normal delivery; 'return' returns metadata only without MEDIA output.",
+    }),
+  ),
 });
 
 export function createTtsTool(opts?: {
@@ -27,6 +33,25 @@ export function createTtsTool(opts?: {
       const params = args as Record<string, unknown>;
       const text = readStringParam(params, "text", { required: true });
       const channel = readStringParam(params, "channel");
+      const deliveryModeRaw = readStringParam(params, "deliveryMode");
+      const deliveryMode = deliveryModeRaw == null || deliveryModeRaw === "" ? "send" : deliveryModeRaw;
+      if (deliveryMode !== "send" && deliveryMode !== "return") {
+        return {
+          content: [
+            {
+              type: "text",
+              text: "deliveryMode must be one of: send, return",
+            },
+          ],
+          details: {
+            ok: false,
+            error: {
+              code: "VALIDATION_ERROR",
+              message: "deliveryMode must be one of: send, return",
+            },
+          },
+        };
+      }
       const cfg = opts?.config ?? loadConfig();
       const result = await textToSpeech({
         text,
@@ -35,6 +60,25 @@ export function createTtsTool(opts?: {
       });
 
       if (result.success && result.audioPath) {
+        if (deliveryMode === "return") {
+          return {
+            content: [
+              {
+                type: "text",
+                text: "TTS audio generated (return mode).",
+              },
+            ],
+            details: {
+              ok: true,
+              deliveryMode: "return",
+              audioPath: result.audioPath,
+              mimeType: "audio/mpeg",
+              sent: false,
+              provider: result.provider,
+            },
+          };
+        }
+
         const lines: string[] = [];
         // Tag Telegram Opus output as a voice bubble instead of a file attachment.
         if (result.voiceCompatible) {
@@ -43,7 +87,7 @@ export function createTtsTool(opts?: {
         lines.push(`MEDIA:${result.audioPath}`);
         return {
           content: [{ type: "text", text: lines.join("\n") }],
-          details: { audioPath: result.audioPath, provider: result.provider },
+          details: { audioPath: result.audioPath, provider: result.provider, deliveryMode: "send" },
         };
       }
 
@@ -54,7 +98,13 @@ export function createTtsTool(opts?: {
             text: result.error ?? "TTS conversion failed",
           },
         ],
-        details: { error: result.error },
+        details: {
+          ok: false,
+          error: {
+            code: "TTS_GENERATION_FAILED",
+            message: result.error ?? "TTS conversion failed",
+          },
+        },
       };
     },
   };

From 229817fdc79942dd1b307ac63a873fc267f135be Mon Sep 17 00:00:00 2001
From: mariko-code-bot <mariko-code-bot@users.noreply.github.com>
Date: Wed, 11 Mar 2026 22:57:47 +0000
Subject: [PATCH 2/2] fix(tts): correct return mimeType and include ok in send
 details

---
 src/agents/tools/tts-tool.test.ts | 40 +++++++++++++++++++++++++++++++
 src/agents/tools/tts-tool.ts      |  9 +++++--
 2 files changed, 47 insertions(+), 2 deletions(-)

diff --git a/src/agents/tools/tts-tool.test.ts b/src/agents/tools/tts-tool.test.ts
index 07284bf8105..5b07da266ae 100644
--- a/src/agents/tools/tts-tool.test.ts
+++ b/src/agents/tools/tts-tool.test.ts
@@ -39,10 +39,50 @@ describe("createTtsTool", () => {
       ok: true,
       deliveryMode: "return",
       audioPath: "/tmp/openclaw/tts-test/voice.mp3",
+      mimeType: "audio/mpeg",
       sent: false,
     });
   });
 
+  it("uses audio/ogg mimeType for voice-compatible return output", async () => {
+    textToSpeechMock.mockResolvedValueOnce({
+      success: true,
+      audioPath: "/tmp/openclaw/tts-test/voice.opus",
+      provider: "openai",
+      voiceCompatible: true, // the flag the tool keys mimeType on in return mode
+    });
+    const tool = createTtsTool();
+
+    const result = await tool.execute("call-voice", {
+      text: "hello",
+      channel: "telegram", // textToSpeech is mocked, so this value does not change the result
+      deliveryMode: "return",
+    });
+
+    expect(result.details).toMatchObject({
+      ok: true,
+      deliveryMode: "return",
+      mimeType: "audio/ogg", // "audio/mpeg" is reported when voiceCompatible is false
+    });
+  });
+
+  it("includes ok=true in send mode success details", async () => {
+    textToSpeechMock.mockResolvedValueOnce({
+      success: true,
+      audioPath: "/tmp/openclaw/tts-test/voice.mp3",
+      provider: "openai",
+      voiceCompatible: false,
+    });
+    const tool = createTtsTool();
+
+    const result = await tool.execute("call-send", {
+      text: "hello",
+      deliveryMode: "send", // explicit here; an omitted deliveryMode also resolves to "send"
+    });
+
+    expect(result.details).toMatchObject({ ok: true, deliveryMode: "send" }); // success details carry ok like return mode
+  });
+
   it("returns validation error for invalid deliveryMode", async () => {
     const tool = createTtsTool();
     const result = await tool.execute("call-2", {
diff --git a/src/agents/tools/tts-tool.ts b/src/agents/tools/tts-tool.ts
index d313ed7140a..2d54ab8499f 100644
--- a/src/agents/tools/tts-tool.ts
+++ b/src/agents/tools/tts-tool.ts
@@ -72,7 +72,7 @@ export function createTtsTool(opts?: {
               ok: true,
               deliveryMode: "return",
               audioPath: result.audioPath,
-              mimeType: "audio/mpeg",
+              mimeType: result.voiceCompatible ? "audio/ogg" : "audio/mpeg",
               sent: false,
               provider: result.provider,
             },
@@ -87,7 +87,12 @@ export function createTtsTool(opts?: {
         lines.push(`MEDIA:${result.audioPath}`);
         return {
           content: [{ type: "text", text: lines.join("\n") }],
-          details: { audioPath: result.audioPath, provider: result.provider, deliveryMode: "send" },
+          details: {
+            ok: true,
+            audioPath: result.audioPath,
+            provider: result.provider,
+            deliveryMode: "send",
+          },
         };
       }