diff --git a/CHANGELOG.md b/CHANGELOG.md
index fd47051f45a..f03662be132 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -16,6 +16,7 @@ Docs: https://docs.openclaw.ai
 
 ### Fixes
 
+- Agents/Nodes media outputs: add dedicated `photos_latest` action handling, block media-returning `nodes invoke` commands, keep metadata-only `camera.list` invoke allowed, and normalize empty `photos_latest` results to a consistent response shape to prevent base64 context bloat. (#34332) Thanks @obviyus.
 - TUI/session-key canonicalization: normalize `openclaw tui --session` values to lowercase so uppercase session names no longer drop real-time streaming updates due to gateway/TUI key mismatches. (#33866, #34013) thanks @lynnzc.
 - Outbound/send config threading: pass resolved SecretRef config through outbound adapters and helper send paths so send flows do not reload unresolved runtime config. (#33987) Thanks @joshavant.
 - Sessions/subagent attachments: remove `attachments[].content.maxLength` from `sessions_spawn` schema to avoid llama.cpp GBNF repetition overflow, and preflight UTF-8 byte size before buffer allocation while keeping runtime file-size enforcement unchanged. (#33648) Thanks @anisoptera.
diff --git a/src/agents/openclaw-tools.camera.test.ts b/src/agents/openclaw-tools.camera.test.ts
index 5fc01d07a82..9621c55c10b 100644
--- a/src/agents/openclaw-tools.camera.test.ts
+++ b/src/agents/openclaw-tools.camera.test.ts
@@ -32,16 +32,21 @@ function unexpectedGatewayMethod(method: unknown): never {
   throw new Error(`unexpected method: ${String(method)}`);
 }
 
-function getNodesTool() {
-  const tool = createOpenClawTools().find((candidate) => candidate.name === "nodes");
+function getNodesTool(options?: { modelHasVision?: boolean }) {
+  const tool = createOpenClawTools(
+    options?.modelHasVision !== undefined ? { modelHasVision: options.modelHasVision } : {},
+  ).find((candidate) => candidate.name === "nodes");
   if (!tool) {
     throw new Error("missing nodes tool");
   }
   return tool;
 }
 
-async function executeNodes(input: Record<string, unknown>) {
-  return getNodesTool().execute("call1", input as never);
+async function executeNodes(
+  input: Record<string, unknown>,
+  options?: { modelHasVision?: boolean },
+) {
+  return getNodesTool(options).execute("call1", input as never);
 }
 
 type NodesToolResult = Awaited<ReturnType<typeof executeNodes>>;
@@ -67,6 +72,11 @@ function expectSingleImage(result: NodesToolResult, params?: { mimeType?: string
   }
 }
 
+function expectNoImages(result: NodesToolResult) {
+  const images = (result.content ?? []).filter((block) => block.type === "image");
+  expect(images).toHaveLength(0);
+}
+
 function expectFirstTextContains(result: NodesToolResult, expectedText: string) {
   expect(result.content?.[0]).toMatchObject({
     type: "text",
@@ -156,10 +166,13 @@ describe("nodes camera_snap", () => {
       },
     });
 
-    const result = await executeNodes({
-      action: "camera_snap",
-      node: NODE_ID,
-    });
+    const result = await executeNodes(
+      {
+        action: "camera_snap",
+        node: NODE_ID,
+      },
+      { modelHasVision: true },
+    );
 
     expectSingleImage(result);
   });
@@ -169,15 +182,39 @@ describe("nodes camera_snap", () => {
       invokePayload: JPG_PAYLOAD,
     });
 
-    const result = await executeNodes({
-      action: "camera_snap",
-      node: NODE_ID,
-      facing: "front",
-    });
+    const result = await executeNodes(
+      {
+        action: "camera_snap",
+        node: NODE_ID,
+        facing: "front",
+      },
+      { modelHasVision: true },
+    );
 
     expectSingleImage(result, { mimeType: "image/jpeg" });
   });
 
+  it("omits inline base64 image blocks when model has no vision", async () => {
+    setupNodeInvokeMock({
+      invokePayload: JPG_PAYLOAD,
+    });
+
+    const result = await executeNodes(
+      {
+        action: "camera_snap",
+        node: NODE_ID,
+        facing: "front",
+      },
+      { modelHasVision: false },
+    );
+
+    expectNoImages(result);
+    expect(result.content?.[0]).toMatchObject({
+      type: "text",
+      text: expect.stringMatching(/^MEDIA:/),
+    });
+  });
+
   it("passes deviceId when provided", async () => {
     setupNodeInvokeMock({
       onInvoke: (invokeParams) => {
@@ -299,6 +336,130 @@ describe("nodes camera_clip", () => {
   });
 });
 
+describe("nodes photos_latest", () => {
+  it("returns empty content/details when no photos are available", async () => {
+    setupNodeInvokeMock({
+      onInvoke: (invokeParams) => {
+        expect(invokeParams).toMatchObject({
+          command: "photos.latest",
+          params: {
+            limit: 1,
+            maxWidth: 1600,
+            quality: 0.85,
+          },
+        });
+        return {
+          payload: {
+            photos: [],
+          },
+        };
+      },
+    });
+
+    const result = await executeNodes(
+      {
+        action: "photos_latest",
+        node: NODE_ID,
+      },
+      { modelHasVision: false },
+    );
+
+    expect(result.content ?? []).toEqual([]);
+    expect(result.details).toEqual([]);
+  });
+
+  it("returns MEDIA paths and no inline images when model has no vision", async () => {
+    setupNodeInvokeMock({
+      remoteIp: "198.51.100.42",
+      onInvoke: (invokeParams) => {
+        expect(invokeParams).toMatchObject({
+          command: "photos.latest",
+          params: {
+            limit: 1,
+            maxWidth: 1600,
+            quality: 0.85,
+          },
+        });
+        return {
+          payload: {
+            photos: [
+              {
+                format: "jpeg",
+                base64: "aGVsbG8=",
+                width: 1,
+                height: 1,
+                createdAt: "2026-03-04T00:00:00Z",
+              },
+            ],
+          },
+        };
+      },
+    });
+
+    const result = await executeNodes(
+      {
+        action: "photos_latest",
+        node: NODE_ID,
+      },
+      { modelHasVision: false },
+    );
+
+    expectNoImages(result);
+    expect(result.content?.[0]).toMatchObject({
+      type: "text",
+      text: expect.stringMatching(/^MEDIA:/),
+    });
+    const details = Array.isArray(result.details) ? result.details : [];
+    expect(details[0]).toMatchObject({
+      width: 1,
+      height: 1,
+      createdAt: "2026-03-04T00:00:00Z",
+    });
+  });
+
+  it("includes inline image blocks when model has vision", async () => {
+    setupNodeInvokeMock({
+      onInvoke: (invokeParams) => {
+        expect(invokeParams).toMatchObject({
+          command: "photos.latest",
+          params: {
+            limit: 1,
+            maxWidth: 1600,
+            quality: 0.85,
+          },
+        });
+        return {
+          payload: {
+            photos: [
+              {
+                format: "jpeg",
+                base64: "aGVsbG8=",
+                width: 1,
+                height: 1,
+                createdAt: "2026-03-04T00:00:00Z",
+              },
+            ],
+          },
+        };
+      },
+    });
+
+    const result = await executeNodes(
+      {
+        action: "photos_latest",
+        node: NODE_ID,
+      },
+      { modelHasVision: true },
+    );
+
+    expect(result.content?.[0]).toMatchObject({
+      type: "text",
+      text: expect.stringMatching(/^MEDIA:/),
+    });
+    expectSingleImage(result, { mimeType: "image/jpeg" });
+  });
+});
+
 describe("nodes notifications_list", () => {
   it("invokes notifications.list and returns payload", async () => {
     setupNodeInvokeMock({
@@ -576,3 +737,44 @@ describe("nodes run", () => {
     );
   });
 });
+
+describe("nodes invoke", () => {
+  it("allows metadata-only camera.list via generic invoke", async () => {
+    setupNodeInvokeMock({
+      onInvoke: (invokeParams) => {
+        expect(invokeParams).toMatchObject({
+          command: "camera.list",
+          params: {},
+        });
+        return {
+          payload: {
+            devices: [{ id: "cam-back", name: "Back Camera" }],
+          },
+        };
+      },
+    });
+
+    const result = await executeNodes({
+      action: "invoke",
+      node: NODE_ID,
+      invokeCommand: "camera.list",
+    });
+
+    expect(result.details).toMatchObject({
+      payload: {
+        devices: [{ id: "cam-back", name: "Back Camera" }],
+      },
+    });
+  });
+
+  it("blocks media invoke commands to avoid base64 context bloat", async () => {
+    await expect(
+      executeNodes({
+        action: "invoke",
+        node: NODE_ID,
+        invokeCommand: "photos.latest",
+        invokeParamsJson: '{"limit":1}',
+      }),
+    ).rejects.toThrow(/use action="photos_latest"/i);
+  });
+});
diff --git a/src/agents/openclaw-tools.ts b/src/agents/openclaw-tools.ts
index cbd9b7b4140..b09f7821208 100644
--- a/src/agents/openclaw-tools.ts
+++ b/src/agents/openclaw-tools.ts
@@ -136,6 +136,7 @@ export function createOpenClawTools(options?: {
       currentChannelId: options?.currentChannelId,
       currentThreadTs: options?.currentThreadTs,
       config: options?.config,
+      modelHasVision: options?.modelHasVision,
     }),
     createCronTool({
       agentSessionKey: options?.agentSessionKey,
diff --git a/src/agents/tools/nodes-tool.ts b/src/agents/tools/nodes-tool.ts
index 769fe28e0d9..6572ea41205 100644
--- a/src/agents/tools/nodes-tool.ts
+++ b/src/agents/tools/nodes-tool.ts
@@ -39,6 +39,7 @@ const NODES_TOOL_ACTIONS = [
   "camera_snap",
   "camera_list",
   "camera_clip",
+  "photos_latest",
   "screen_record",
   "location_get",
   "notifications_list",
@@ -56,6 +57,12 @@ const NOTIFY_DELIVERIES = ["system", "overlay", "auto"] as const;
 const NOTIFICATIONS_ACTIONS = ["open", "dismiss", "reply"] as const;
 const CAMERA_FACING = ["front", "back", "both"] as const;
 const LOCATION_ACCURACY = ["coarse", "balanced", "precise"] as const;
+const MEDIA_INVOKE_ACTIONS = {
+  "camera.snap": "camera_snap",
+  "camera.clip": "camera_clip",
+  "photos.latest": "photos_latest",
+  "screen.record": "screen_record",
+} as const;
 const NODE_READ_ACTION_COMMANDS = {
   camera_list: "camera.list",
   notifications_list: "notifications.list",
@@ -118,6 +125,7 @@ const NodesToolSchema = Type.Object({
   quality: Type.Optional(Type.Number()),
   delayMs: Type.Optional(Type.Number()),
   deviceId: Type.Optional(Type.String()),
+  limit: Type.Optional(Type.Number()),
   duration: Type.Optional(Type.String()),
   durationMs: Type.Optional(Type.Number({ maximum: 300_000 })),
   includeAudio: Type.Optional(Type.Boolean()),
@@ -152,6 +160,7 @@ export function createNodesTool(options?: {
   currentChannelId?: string;
   currentThreadTs?: string | number;
   config?: OpenClawConfig;
+  modelHasVision?: boolean;
 }): AnyAgentTool {
   const sessionKey = options?.agentSessionKey?.trim() || undefined;
   const turnSourceChannel = options?.agentChannel?.trim() || undefined;
@@ -167,7 +176,7 @@ export function createNodesTool(options?: {
     label: "Nodes",
     name: "nodes",
     description:
-      "Discover and control paired nodes (status/describe/pairing/notify/camera/screen/location/notifications/run/invoke).",
+      "Discover and control paired nodes (status/describe/pairing/notify/camera/photos/screen/location/notifications/run/invoke).",
     parameters: NodesToolSchema,
     execute: async (_toolCallId, args) => {
       const params = args as Record<string, unknown>;
@@ -301,7 +310,7 @@ export function createNodesTool(options?: {
               invalidPayloadMessage: "invalid camera.snap payload",
             });
             content.push({ type: "text", text: `MEDIA:${filePath}` });
-            if (payload.base64) {
+            if (options?.modelHasVision && payload.base64) {
               content.push({
                 type: "image",
                 data: payload.base64,
@@ -320,6 +329,103 @@ export function createNodesTool(options?: {
           const result: AgentToolResult<unknown> = { content, details };
           return await sanitizeToolResultImages(result, "nodes:camera_snap", imageSanitization);
         }
+        case "photos_latest": {
+          const node = readStringParam(params, "node", { required: true });
+          const resolvedNode = await resolveNode(gatewayOpts, node);
+          const nodeId = resolvedNode.nodeId;
+          const limitRaw =
+            typeof params.limit === "number" && Number.isFinite(params.limit)
+              ? Math.floor(params.limit)
+              : DEFAULT_PHOTOS_LIMIT;
+          const limit = Math.max(1, Math.min(limitRaw, MAX_PHOTOS_LIMIT));
+          const maxWidth =
+            typeof params.maxWidth === "number" && Number.isFinite(params.maxWidth)
+              ? params.maxWidth
+              : DEFAULT_PHOTOS_MAX_WIDTH;
+          const quality =
+            typeof params.quality === "number" && Number.isFinite(params.quality)
+              ? params.quality
+              : DEFAULT_PHOTOS_QUALITY;
+          const raw = await callGatewayTool<{ payload: unknown }>("node.invoke", gatewayOpts, {
+            nodeId,
+            command: "photos.latest",
+            params: {
+              limit,
+              maxWidth,
+              quality,
+            },
+            idempotencyKey: crypto.randomUUID(),
+          });
+          const payload =
+            raw?.payload && typeof raw.payload === "object" && !Array.isArray(raw.payload)
+              ? (raw.payload as Record<string, unknown>)
+              : {};
+          const photos = Array.isArray(payload.photos) ? payload.photos : [];
+
+          if (photos.length === 0) {
+            const result: AgentToolResult<unknown> = {
+              content: [],
+              details: [],
+            };
+            return await sanitizeToolResultImages(
+              result,
+              "nodes:photos_latest",
+              imageSanitization,
+            );
+          }
+
+          const content: AgentToolResult<unknown>["content"] = [];
+          const details: Array<Record<string, unknown>> = [];
+
+          for (const [index, photoRaw] of photos.entries()) {
+            const photo = parseCameraSnapPayload(photoRaw);
+            const normalizedFormat = photo.format.toLowerCase();
+            if (
+              normalizedFormat !== "jpg" &&
+              normalizedFormat !== "jpeg" &&
+              normalizedFormat !== "png"
+            ) {
+              throw new Error(`unsupported photos.latest format: ${photo.format}`);
+            }
+            const isJpeg = normalizedFormat === "jpg" || normalizedFormat === "jpeg";
+            const filePath = cameraTempPath({
+              kind: "snap",
+              ext: isJpeg ? "jpg" : "png",
+              id: crypto.randomUUID(),
+            });
+            await writeCameraPayloadToFile({
+              filePath,
+              payload: photo,
+              expectedHost: resolvedNode.remoteIp,
+              invalidPayloadMessage: "invalid photos.latest payload",
+            });
+
+            content.push({ type: "text", text: `MEDIA:${filePath}` });
+            if (options?.modelHasVision && photo.base64) {
+              content.push({
+                type: "image",
+                data: photo.base64,
+                mimeType:
+                  imageMimeFromFormat(photo.format) ?? (isJpeg ? "image/jpeg" : "image/png"),
+              });
+            }
+
+            const createdAt =
+              photoRaw && typeof photoRaw === "object" && !Array.isArray(photoRaw)
+                ? (photoRaw as Record<string, unknown>).createdAt
+                : undefined;
+            details.push({
+              index,
+              path: filePath,
+              width: photo.width,
+              height: photo.height,
+              ...(typeof createdAt === "string" ? { createdAt } : {}),
+            });
+          }
+
+          const result: AgentToolResult<unknown> = { content, details };
+          return await sanitizeToolResultImages(result, "nodes:photos_latest", imageSanitization);
+        }
         case "camera_list":
         case "notifications_list":
         case "device_status":
@@ -645,6 +751,16 @@ export function createNodesTool(options?: {
           const node = readStringParam(params, "node", { required: true });
           const nodeId = await resolveNodeId(gatewayOpts, node);
           const invokeCommand = readStringParam(params, "invokeCommand", { required: true });
+          const invokeCommandNormalized = invokeCommand.trim().toLowerCase();
+          // Own-key check so inherited keys ("constructor", "__proto__") are not
+          // mistaken for media commands by the lookup below.
+          if (Object.prototype.hasOwnProperty.call(MEDIA_INVOKE_ACTIONS, invokeCommandNormalized)) {
+            const dedicatedAction =
+              MEDIA_INVOKE_ACTIONS[invokeCommandNormalized as keyof typeof MEDIA_INVOKE_ACTIONS];
+            throw new Error(
+              `invokeCommand "${invokeCommand}" returns media payloads and is blocked to prevent base64 context bloat; use action="${dedicatedAction}"`,
+            );
+          }
           const invokeParamsJson =
             typeof params.invokeParamsJson === "string" ? params.invokeParamsJson.trim() : "";
           let invokeParams: unknown = {};
@@ -695,3 +811,10 @@ export function createNodesTool(options?: {
     },
   };
 }
+
+// Defaults and upper bound applied by the `photos_latest` action to its
+// `limit`, `maxWidth`, and `quality` parameters.
+const DEFAULT_PHOTOS_LIMIT = 1;
+const MAX_PHOTOS_LIMIT = 20;
+const DEFAULT_PHOTOS_MAX_WIDTH = 1600;
+const DEFAULT_PHOTOS_QUALITY = 0.85;