fix: deliver tool result media when verbose is off

Extract MEDIA: paths from tool result text content blocks and fall back to details.path when image content exists. Emit media via onToolResult when the verbose emitToolOutput path is off, skipping when shouldEmitToolOutput() is true to avoid duplicate delivery. Allow media-only payloads (no text) through normalizeStreamingText so tool result screenshots reach messaging channels. Ref #11735 Co-authored-by: strelov1 <strelov1@gmail.com>
2026-02-14 18:45:05 +00:00 · 2026-02-14 18:45:05 +00:00 · 131b28adef
parent 906c32da12
commit 131b28adef
5 changed files with 392 additions and 0 deletions
--- a/src/agents/pi-embedded-subscribe.handlers.tools.media.test.ts
+++ b/src/agents/pi-embedded-subscribe.handlers.tools.media.test.ts
@ -0,0 +1,174 @@
+import { describe, expect, it, vi } from "vitest";
+import type { EmbeddedPiSubscribeContext } from "./pi-embedded-subscribe.handlers.types.js";
+import { handleToolExecutionEnd } from "./pi-embedded-subscribe.handlers.tools.js";
+
+// Minimal mock context factory. Only the fields needed for the media emission path.
+function createMockContext(overrides?: {
+  shouldEmitToolOutput?: boolean;
+  onToolResult?: ReturnType<typeof vi.fn>;
+}): EmbeddedPiSubscribeContext {
+  const onToolResult = overrides?.onToolResult ?? vi.fn();
+  return {
+    params: {
+      runId: "test-run",
+      onToolResult,
+      onAgentEvent: vi.fn(),
+    },
+    state: {
+      toolMetaById: new Map(),
+      toolMetas: [],
+      toolSummaryById: new Set(),
+      pendingMessagingTexts: new Map(),
+      pendingMessagingTargets: new Map(),
+      messagingToolSentTexts: [],
+      messagingToolSentTextsNormalized: [],
+      messagingToolSentTargets: [],
+    },
+    log: { debug: vi.fn(), warn: vi.fn() },
+    shouldEmitToolResult: vi.fn(() => false),
+    shouldEmitToolOutput: vi.fn(() => overrides?.shouldEmitToolOutput ?? false),
+    emitToolSummary: vi.fn(),
+    emitToolOutput: vi.fn(),
+    trimMessagingToolSent: vi.fn(),
+    hookRunner: undefined,
+    // Fill in remaining required fields with no-ops.
+    blockChunker: null,
+    noteLastAssistant: vi.fn(),
+    stripBlockTags: vi.fn((t: string) => t),
+    emitBlockChunk: vi.fn(),
+    flushBlockReplyBuffer: vi.fn(),
+    emitReasoningStream: vi.fn(),
+    consumeReplyDirectives: vi.fn(() => null),
+    consumePartialReplyDirectives: vi.fn(() => null),
+    resetAssistantMessageState: vi.fn(),
+    resetForCompactionRetry: vi.fn(),
+    finalizeAssistantTexts: vi.fn(),
+    ensureCompactionPromise: vi.fn(),
+    noteCompactionRetry: vi.fn(),
+    resolveCompactionRetry: vi.fn(),
+    maybeResolveCompactionWait: vi.fn(),
+    recordAssistantUsage: vi.fn(),
+    incrementCompactionCount: vi.fn(),
+    getUsageTotals: vi.fn(() => undefined),
+    getCompactionCount: vi.fn(() => 0),
+  } as unknown as EmbeddedPiSubscribeContext;
+}
+
+describe("handleToolExecutionEnd media emission", () => {
+  it("emits media when verbose is off and tool result has MEDIA: path", async () => {
+    const onToolResult = vi.fn();
+    const ctx = createMockContext({ shouldEmitToolOutput: false, onToolResult });
+
+    await handleToolExecutionEnd(ctx, {
+      type: "tool_execution_end",
+      toolName: "browser",
+      toolCallId: "tc-1",
+      isError: false,
+      result: {
+        content: [
+          { type: "text", text: "MEDIA:/tmp/screenshot.png" },
+          { type: "image", data: "base64", mimeType: "image/png" },
+        ],
+        details: { path: "/tmp/screenshot.png" },
+      },
+    });
+
+    expect(onToolResult).toHaveBeenCalledWith({
+      mediaUrls: ["/tmp/screenshot.png"],
+    });
+  });
+
+  it("does NOT emit media when verbose is full (emitToolOutput handles it)", async () => {
+    const onToolResult = vi.fn();
+    const ctx = createMockContext({ shouldEmitToolOutput: true, onToolResult });
+
+    await handleToolExecutionEnd(ctx, {
+      type: "tool_execution_end",
+      toolName: "browser",
+      toolCallId: "tc-1",
+      isError: false,
+      result: {
+        content: [
+          { type: "text", text: "MEDIA:/tmp/screenshot.png" },
+          { type: "image", data: "base64", mimeType: "image/png" },
+        ],
+        details: { path: "/tmp/screenshot.png" },
+      },
+    });
+
+    // onToolResult should NOT be called by the new media path (emitToolOutput handles it).
+    // It may be called by emitToolOutput, but the new block should not fire.
+    // Verify emitToolOutput was called instead.
+    expect(ctx.emitToolOutput).toHaveBeenCalled();
+    // The direct media emission should not have been called with just mediaUrls.
+    const directMediaCalls = onToolResult.mock.calls.filter(
+      (call: unknown[]) =>
+        call[0] &&
+        typeof call[0] === "object" &&
+        "mediaUrls" in (call[0] as Record<string, unknown>) &&
+        !("text" in (call[0] as Record<string, unknown>)),
+    );
+    expect(directMediaCalls).toHaveLength(0);
+  });
+
+  it("does NOT emit media for error results", async () => {
+    const onToolResult = vi.fn();
+    const ctx = createMockContext({ shouldEmitToolOutput: false, onToolResult });
+
+    await handleToolExecutionEnd(ctx, {
+      type: "tool_execution_end",
+      toolName: "browser",
+      toolCallId: "tc-1",
+      isError: true,
+      result: {
+        content: [
+          { type: "text", text: "MEDIA:/tmp/screenshot.png" },
+          { type: "image", data: "base64", mimeType: "image/png" },
+        ],
+        details: { path: "/tmp/screenshot.png" },
+      },
+    });
+
+    expect(onToolResult).not.toHaveBeenCalled();
+  });
+
+  it("does NOT emit when tool result has no media", async () => {
+    const onToolResult = vi.fn();
+    const ctx = createMockContext({ shouldEmitToolOutput: false, onToolResult });
+
+    await handleToolExecutionEnd(ctx, {
+      type: "tool_execution_end",
+      toolName: "bash",
+      toolCallId: "tc-1",
+      isError: false,
+      result: {
+        content: [{ type: "text", text: "Command executed successfully" }],
+      },
+    });
+
+    expect(onToolResult).not.toHaveBeenCalled();
+  });
+
+  it("emits media from details.path fallback when no MEDIA: text", async () => {
+    const onToolResult = vi.fn();
+    const ctx = createMockContext({ shouldEmitToolOutput: false, onToolResult });
+
+    await handleToolExecutionEnd(ctx, {
+      type: "tool_execution_end",
+      toolName: "canvas",
+      toolCallId: "tc-1",
+      isError: false,
+      result: {
+        content: [
+          { type: "text", text: "Rendered canvas" },
+          { type: "image", data: "base64", mimeType: "image/png" },
+        ],
+        details: { path: "/tmp/canvas-output.png" },
+      },
+    });
+
+    expect(onToolResult).toHaveBeenCalledWith({
+      mediaUrls: ["/tmp/canvas-output.png"],
+    });
+  });
+});
--- a/src/agents/pi-embedded-subscribe.handlers.tools.ts
+++ b/src/agents/pi-embedded-subscribe.handlers.tools.ts
@ -10,6 +10,7 @@ import { normalizeTextForComparison } from "./pi-embedded-helpers.js";
 import { isMessagingTool, isMessagingToolSendAction } from "./pi-embedded-messaging.js";
 import {
  extractToolErrorMessage,
+  extractToolResultMediaPaths,
  extractToolResultText,
  extractMessagingToolSend,
  isToolResultError,
@ -266,6 +267,20 @@ export async function handleToolExecutionEnd(
    }
  }

+  // Deliver media from tool results when the verbose emitToolOutput path is off.
+  // When shouldEmitToolOutput() is true, emitToolOutput already delivers media
+  // via parseReplyDirectives (MEDIA: text extraction), so skip to avoid duplicates.
+  if (ctx.params.onToolResult && !isToolError && !ctx.shouldEmitToolOutput()) {
+    const mediaPaths = extractToolResultMediaPaths(result);
+    if (mediaPaths.length > 0) {
+      try {
+        void ctx.params.onToolResult({ mediaUrls: mediaPaths });
+      } catch {
+        // ignore delivery failures
+      }
+    }
+  }
+
  // Run after_tool_call plugin hook (fire-and-forget)
  const hookRunnerAfter = ctx.hookRunner ?? getGlobalHookRunner();
  if (hookRunnerAfter?.hasHooks("after_tool_call")) {
--- a/src/agents/pi-embedded-subscribe.tools.media.test.ts
+++ b/src/agents/pi-embedded-subscribe.tools.media.test.ts
@ -0,0 +1,132 @@
+import { describe, expect, it } from "vitest";
+import { extractToolResultMediaPaths } from "./pi-embedded-subscribe.tools.js";
+
+describe("extractToolResultMediaPaths", () => {
+  it("returns empty array for null/undefined", () => {
+    expect(extractToolResultMediaPaths(null)).toEqual([]);
+    expect(extractToolResultMediaPaths(undefined)).toEqual([]);
+  });
+
+  it("returns empty array for non-object", () => {
+    expect(extractToolResultMediaPaths("hello")).toEqual([]);
+    expect(extractToolResultMediaPaths(42)).toEqual([]);
+  });
+
+  it("returns empty array when content is missing", () => {
+    expect(extractToolResultMediaPaths({ details: { path: "/tmp/img.png" } })).toEqual([]);
+  });
+
+  it("returns empty array when content has no text or image blocks", () => {
+    expect(extractToolResultMediaPaths({ content: [{ type: "other" }] })).toEqual([]);
+  });
+
+  it("extracts MEDIA: path from text content block", () => {
+    const result = {
+      content: [
+        { type: "text", text: "MEDIA:/tmp/screenshot.png" },
+        { type: "image", data: "base64data", mimeType: "image/png" },
+      ],
+      details: { path: "/tmp/screenshot.png" },
+    };
+    expect(extractToolResultMediaPaths(result)).toEqual(["/tmp/screenshot.png"]);
+  });
+
+  it("extracts MEDIA: path with extra text in the block", () => {
+    const result = {
+      content: [{ type: "text", text: "Here is the image\nMEDIA:/tmp/output.jpg\nDone" }],
+    };
+    expect(extractToolResultMediaPaths(result)).toEqual(["/tmp/output.jpg"]);
+  });
+
+  it("extracts multiple MEDIA: paths from different text blocks", () => {
+    const result = {
+      content: [
+        { type: "text", text: "MEDIA:/tmp/page1.png" },
+        { type: "text", text: "MEDIA:/tmp/page2.png" },
+      ],
+    };
+    expect(extractToolResultMediaPaths(result)).toEqual(["/tmp/page1.png", "/tmp/page2.png"]);
+  });
+
+  it("falls back to details.path when image content exists but no MEDIA: text", () => {
+    // Pi SDK read tool doesn't include MEDIA: but OpenClaw imageResult
+    // sets details.path as fallback.
+    const result = {
+      content: [
+        { type: "text", text: "Read image file [image/png]" },
+        { type: "image", data: "base64data", mimeType: "image/png" },
+      ],
+      details: { path: "/tmp/generated.png" },
+    };
+    expect(extractToolResultMediaPaths(result)).toEqual(["/tmp/generated.png"]);
+  });
+
+  it("returns empty array when image content exists but no MEDIA: and no details.path", () => {
+    // Pi SDK read tool: has image content but no path anywhere in the result.
+    const result = {
+      content: [
+        { type: "text", text: "Read image file [image/png]" },
+        { type: "image", data: "base64data", mimeType: "image/png" },
+      ],
+    };
+    expect(extractToolResultMediaPaths(result)).toEqual([]);
+  });
+
+  it("does not fall back to details.path when MEDIA: paths are found", () => {
+    const result = {
+      content: [
+        { type: "text", text: "MEDIA:/tmp/from-text.png" },
+        { type: "image", data: "base64data", mimeType: "image/png" },
+      ],
+      details: { path: "/tmp/from-details.png" },
+    };
+    // MEDIA: text takes priority; details.path is NOT also included.
+    expect(extractToolResultMediaPaths(result)).toEqual(["/tmp/from-text.png"]);
+  });
+
+  it("handles backtick-wrapped MEDIA: paths", () => {
+    const result = {
+      content: [{ type: "text", text: "MEDIA: `/tmp/screenshot.png`" }],
+    };
+    expect(extractToolResultMediaPaths(result)).toEqual(["/tmp/screenshot.png"]);
+  });
+
+  it("ignores null/undefined items in content array", () => {
+    const result = {
+      content: [null, undefined, { type: "text", text: "MEDIA:/tmp/ok.png" }],
+    };
+    expect(extractToolResultMediaPaths(result)).toEqual(["/tmp/ok.png"]);
+  });
+
+  it("returns empty array for text-only results without MEDIA:", () => {
+    const result = {
+      content: [{ type: "text", text: "Command executed successfully" }],
+    };
+    expect(extractToolResultMediaPaths(result)).toEqual([]);
+  });
+
+  it("ignores details.path when no image content exists", () => {
+    // details.path without image content is not media.
+    const result = {
+      content: [{ type: "text", text: "File saved" }],
+      details: { path: "/tmp/data.json" },
+    };
+    expect(extractToolResultMediaPaths(result)).toEqual([]);
+  });
+
+  it("handles details.path with whitespace", () => {
+    const result = {
+      content: [{ type: "image", data: "base64", mimeType: "image/png" }],
+      details: { path: "  /tmp/image.png  " },
+    };
+    expect(extractToolResultMediaPaths(result)).toEqual(["/tmp/image.png"]);
+  });
+
+  it("skips empty details.path", () => {
+    const result = {
+      content: [{ type: "image", data: "base64", mimeType: "image/png" }],
+      details: { path: "   " },
+    };
+    expect(extractToolResultMediaPaths(result)).toEqual([]);
+  });
+});
--- a/src/agents/pi-embedded-subscribe.tools.ts
+++ b/src/agents/pi-embedded-subscribe.tools.ts
@ -1,5 +1,6 @@
 import { getChannelPlugin, normalizeChannelId } from "../channels/plugins/index.js";
 import { normalizeTargetForProvider } from "../infra/outbound/target-normalization.js";
+import { MEDIA_TOKEN_RE } from "../media/parse.js";
 import { truncateUtf16Safe } from "../utils.js";
 import { type MessagingToolSend } from "./pi-embedded-messaging.js";

@ -118,6 +119,72 @@ export function extractToolResultText(result: unknown): string | undefined {
  return texts.join("\n");
 }

+/**
+ * Extract media file paths from a tool result.
+ *
+ * Strategy (first match wins):
+ * 1. Parse `MEDIA:` tokens from text content blocks (all OpenClaw tools).
+ * 2. Fall back to `details.path` when image content exists (OpenClaw imageResult).
+ *
+ * Returns an empty array when no media is found (e.g. Pi SDK `read` tool
+ * returns base64 image data but no file path; those need a different delivery
+ * path like saving to a temp file).
+ */
+export function extractToolResultMediaPaths(result: unknown): string[] {
+  if (!result || typeof result !== "object") {
+    return [];
+  }
+  const record = result as Record<string, unknown>;
+  const content = Array.isArray(record.content) ? record.content : null;
+  if (!content) {
+    return [];
+  }
+
+  // Extract MEDIA: paths from text content blocks.
+  const paths: string[] = [];
+  let hasImageContent = false;
+  for (const item of content) {
+    if (!item || typeof item !== "object") {
+      continue;
+    }
+    const entry = item as Record<string, unknown>;
+    if (entry.type === "image") {
+      hasImageContent = true;
+      continue;
+    }
+    if (entry.type === "text" && typeof entry.text === "string") {
+      // Reset lastIndex since MEDIA_TOKEN_RE is global.
+      MEDIA_TOKEN_RE.lastIndex = 0;
+      let match: RegExpExecArray | null;
+      while ((match = MEDIA_TOKEN_RE.exec(entry.text)) !== null) {
+        // Strip surrounding quotes/backticks and whitespace (mirrors cleanCandidate in media/parse).
+        const p = match[1]
+          ?.replace(/^[`"'[{(]+/, "")
+          .replace(/[`"'\]})\\,]+$/, "")
+          .trim();
+        if (p && p.length <= 4096) {
+          paths.push(p);
+        }
+      }
+    }
+  }
+
+  if (paths.length > 0) {
+    return paths;
+  }
+
+  // Fall back to details.path when image content exists but no MEDIA: text.
+  if (hasImageContent) {
+    const details = record.details as Record<string, unknown> | undefined;
+    const p = typeof details?.path === "string" ? details.path.trim() : "";
+    if (p) {
+      return [p];
+    }
+  }
+
+  return [];
+}
+
 export function isToolResultError(result: unknown): boolean {
  if (!result || typeof result !== "object") {
    return false;
--- a/src/auto-reply/reply/agent-runner-execution.ts
+++ b/src/auto-reply/reply/agent-runner-execution.ts
@ -128,6 +128,10 @@ export async function runAgentTurnWithFallback(params: {
          return { skip: true };
        }
        if (!text) {
+          // Allow media-only payloads (e.g. tool result screenshots) through.
+          if ((payload.mediaUrls?.length ?? 0) > 0) {
+            return { text: undefined, skip: false };
+          }
          return { skip: true };
        }
        const sanitized = sanitizeUserFacingText(text, {