From 54c69414ad522a5d56a6bdcdc85e33272d271c6f Mon Sep 17 00:00:00 2001 From: Neerav Makwana Date: Tue, 31 Mar 2026 00:40:51 -0400 Subject: [PATCH] fix: normalize xai tool result image replay (#58017) (thanks @neeravmakwana) * fix(xai): normalize image tool results for responses * fix(xai): handle reviewed tool result payload cases * fix: normalize xai tool result image replay (#58017) (thanks @neeravmakwana) --------- Co-authored-by: Ayaan Zaidi --- CHANGELOG.md | 1 + extensions/xai/stream.test.ts | 235 ++++++++++++++++++++++++++++++++++ extensions/xai/stream.ts | 117 +++++++++++++++++ 3 files changed, 353 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index d578e359b6f..b62f51a41cf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -118,6 +118,7 @@ Docs: https://docs.openclaw.ai - Diffs: fall back to plain text when `lang` hints are invalid during diff render and viewer hydration, so bad or stale language values no longer break the diff viewer. (#57902) Thanks @gumadeiras. - Doctor/plugins: skip false Matrix legacy-helper warnings when no migration plans exist, and keep bundled `enabledByDefault` plugins in the gateway startup set. (#57931) Thanks @dinakars777. - Matrix/CLI send: start one-off Matrix send clients before outbound delivery so `openclaw message send --channel matrix` restores E2EE in encrypted rooms instead of sending plain events. (#57936) Thanks @gumadeiras. +- xAI/Responses: normalize image-bearing tool results for xAI responses payloads, including OpenResponses-style `input_image.source` parts, so image tool replays no longer 422 on the follow-up turn. (#58017) Thanks @neeravmakwana. - Cron/isolated sessions: carry the full live-session provider, model, and auth-profile selection across retry restarts so cron jobs with model overrides no longer fail or loop on mid-run model-switch requests. (#57972) Thanks @issaba1. - Matrix/direct rooms: stop trusting remote `is_direct`, honor explicit local `is_direct: false` for discovered DM candidates, and avoid extra member-state lookups for shared rooms so DM routing and repair stay aligned. (#57124) Thanks @w-sss. - Agents/sandbox: make remote FS bridge reads pin the parent path and open the file atomically in the helper so read access cannot race path resolution. Thanks @AntAISecurityLab and @vincentkoc. diff --git a/extensions/xai/stream.test.ts b/extensions/xai/stream.test.ts index 552111d18eb..d6a3c7e399e 100644 --- a/extensions/xai/stream.test.ts +++ b/extensions/xai/stream.test.ts @@ -117,4 +117,239 @@ describe("xai stream wrappers", () => { expect(payload).not.toHaveProperty("reasoningEffort"); expect(payload).not.toHaveProperty("reasoning_effort"); }); + + it("moves image-bearing tool results out of function_call_output payloads", () => { + const payload: Record = { + input: [ + { + type: "function_call_output", + call_id: "call_1", + output: [ + { type: "input_text", text: "Read image" }, + { + type: "input_image", + detail: "auto", + image_url: "data:image/png;base64,QUJDRA==", + }, + ], + }, + ], + }; + const baseStreamFn: StreamFn = (_model, _context, options) => { + options?.onPayload?.(payload, {} as Model<"openai-responses">); + return {} as ReturnType; + }; + const wrapped = createXaiToolPayloadCompatibilityWrapper(baseStreamFn); + + void wrapped( + { + api: "openai-responses", + provider: "xai", + id: "grok-4-fast", + input: ["text", "image"], + } as Model<"openai-responses">, + { messages: [] } as Context, + {}, + ); + + expect(payload.input).toEqual([ + { + type: "function_call_output", + call_id: "call_1", + output: "Read image", + }, + { + type: "message", + role: "user", + content: [ + { type: "input_text", text: "Attached image(s) from tool result:" }, + { + type: "input_image", + detail: "auto", + image_url: "data:image/png;base64,QUJDRA==", + }, + ], + }, + ]); + }); + + it("replays source-based input_image parts from tool results", () => { + const payload: Record = { + input: [ + { + type: "function_call_output", + call_id: "call_1", + output: [ + { type: "input_text", text: "Read image" }, + { + type: "input_image", + source: { + type: "base64", + media_type: "image/png", + data: "QUJDRA==", + }, + }, + ], + }, + ], + }; + const baseStreamFn: StreamFn = (_model, _context, options) => { + options?.onPayload?.(payload, {} as Model<"openai-responses">); + return {} as ReturnType; + }; + const wrapped = createXaiToolPayloadCompatibilityWrapper(baseStreamFn); + + void wrapped( + { + api: "openai-responses", + provider: "xai", + id: "grok-4-fast", + input: ["text", "image"], + } as Model<"openai-responses">, + { messages: [] } as Context, + {}, + ); + + expect(payload.input).toEqual([ + { + type: "function_call_output", + call_id: "call_1", + output: "Read image", + }, + { + type: "message", + role: "user", + content: [ + { type: "input_text", text: "Attached image(s) from tool result:" }, + { + type: "input_image", + source: { + type: "base64", + media_type: "image/png", + data: "QUJDRA==", + }, + }, + ], + }, + ]); + }); + + it("keeps multiple tool outputs contiguous before replaying collected images", () => { + const payload: Record = { + input: [ + { + type: "function_call_output", + call_id: "call_1", + output: [ + { type: "input_text", text: "first" }, + { + type: "input_image", + detail: "auto", + image_url: "data:image/png;base64,QUFBQQ==", + }, + ], + }, + { + type: "function_call_output", + call_id: "call_2", + output: [ + { type: "input_text", text: "second" }, + { + type: "input_image", + detail: "auto", + image_url: "data:image/png;base64,QkJCQg==", + }, + ], + }, + ], + }; + const baseStreamFn: StreamFn = (_model, _context, options) => { + options?.onPayload?.(payload, {} as Model<"openai-responses">); + return {} as ReturnType; + }; + const wrapped = createXaiToolPayloadCompatibilityWrapper(baseStreamFn); + + void wrapped( + { + api: "openai-responses", + provider: "xai", + id: "grok-4-fast", + input: ["text", "image"], + } as Model<"openai-responses">, + { messages: [] } as Context, + {}, + ); + + expect(payload.input).toEqual([ + { + type: "function_call_output", + call_id: "call_1", + output: "first", + }, + { + type: "function_call_output", + call_id: "call_2", + output: "second", + }, + { + type: "message", + role: "user", + content: [ + { type: "input_text", text: "Attached image(s) from tool result:" }, + { + type: "input_image", + detail: "auto", + image_url: "data:image/png;base64,QUFBQQ==", + }, + { + type: "input_image", + detail: "auto", + image_url: "data:image/png;base64,QkJCQg==", + }, + ], + }, + ]); + }); + + it("drops image blocks and uses fallback text for models without image input", () => { + const payload: Record = { + input: [ + { + type: "function_call_output", + call_id: "call_1", + output: [ + { + type: "input_image", + detail: "auto", + image_url: "data:image/png;base64,QUJDRA==", + }, + ], + }, + ], + }; + const baseStreamFn: StreamFn = (_model, _context, options) => { + options?.onPayload?.(payload, {} as Model<"openai-responses">); + return {} as ReturnType; + }; + const wrapped = createXaiToolPayloadCompatibilityWrapper(baseStreamFn); + + void wrapped( + { + api: "openai-responses", + provider: "xai", + id: "grok-4-fast", + input: ["text"], + } as Model<"openai-responses">, + { messages: [] } as Context, + {}, + ); + + expect(payload.input).toEqual([ + { + type: "function_call_output", + call_id: "call_1", + output: "(see attached image)", + }, + ]); + }); }); diff --git a/extensions/xai/stream.ts b/extensions/xai/stream.ts index 956fc6cad64..7af4efd1060 100644 --- a/extensions/xai/stream.ts +++ b/extensions/xai/stream.ts @@ -33,6 +33,122 @@ function stripUnsupportedStrictFlag(tool: unknown): unknown { return { ...toolObj, function: nextFunction }; } +function supportsExplicitImageInput(model: { input?: unknown }): boolean { + return Array.isArray(model.input) && model.input.includes("image"); +} + +const TOOL_RESULT_IMAGE_REPLAY_TEXT = "Attached image(s) from tool result:"; + +type ReplayableInputImagePart = + | { + type: "input_image"; + source: { type: "url"; url: string } | { type: "base64"; media_type: string; data: string }; + } + | { type: "input_image"; image_url: string; detail?: string }; + +type NormalizedFunctionCallOutput = { + normalizedItem: unknown; + imageParts: Array>; +}; + +function isReplayableInputImagePart( + part: Record, +): part is ReplayableInputImagePart { + if (part.type !== "input_image") { + return false; + } + if (typeof part.image_url === "string") { + return true; + } + if (!part.source || typeof part.source !== "object") { + return false; + } + const source = part.source as { + type?: unknown; + url?: unknown; + media_type?: unknown; + data?: unknown; + }; + if (source.type === "url") { + return typeof source.url === "string"; + } + return ( + source.type === "base64" && + typeof source.media_type === "string" && + typeof source.data === "string" + ); +} + +function normalizeXaiResponsesFunctionCallOutput( + item: unknown, + includeImages: boolean, +): NormalizedFunctionCallOutput { + if (!item || typeof item !== "object") { + return { normalizedItem: item, imageParts: [] }; + } + + const itemObj = item as Record; + if (itemObj.type !== "function_call_output" || !Array.isArray(itemObj.output)) { + return { normalizedItem: itemObj, imageParts: [] }; + } + + const outputParts = itemObj.output as Array>; + const textOutput = outputParts + .filter( + (part): part is { type: "input_text"; text: string } => + part.type === "input_text" && typeof part.text === "string", + ) + .map((part) => part.text) + .join(""); + + const imageParts = includeImages + ? outputParts.filter((part): part is ReplayableInputImagePart => + isReplayableInputImagePart(part), + ) + : []; + const hadNonTextParts = outputParts.some((part) => part.type !== "input_text"); + + return { + normalizedItem: { + ...itemObj, + output: textOutput || (hadNonTextParts ? "(see attached image)" : ""), + }, + imageParts, + }; +} + +function normalizeXaiResponsesToolResultPayload( + payloadObj: Record, + model: { api?: unknown; input?: unknown }, +): void { + if (model.api !== "openai-responses" || !Array.isArray(payloadObj.input)) { + return; + } + + const includeImages = supportsExplicitImageInput(model); + const normalizedInput: unknown[] = []; + const collectedImageParts: Array> = []; + + for (const item of payloadObj.input) { + const normalized = normalizeXaiResponsesFunctionCallOutput(item, includeImages); + normalizedInput.push(normalized.normalizedItem); + collectedImageParts.push(...normalized.imageParts); + } + + if (collectedImageParts.length > 0) { + normalizedInput.push({ + type: "message", + role: "user", + content: [ + { type: "input_text", text: TOOL_RESULT_IMAGE_REPLAY_TEXT }, + ...collectedImageParts, + ], + }); + } + + payloadObj.input = normalizedInput; +} + export function createXaiToolPayloadCompatibilityWrapper( baseStreamFn: StreamFn | undefined, ): StreamFn { @@ -47,6 +163,7 @@ export function createXaiToolPayloadCompatibilityWrapper( if (Array.isArray(payloadObj.tools)) { payloadObj.tools = payloadObj.tools.map((tool) => stripUnsupportedStrictFlag(tool)); } + normalizeXaiResponsesToolResultPayload(payloadObj, model); delete payloadObj.reasoning; delete payloadObj.reasoningEffort; delete payloadObj.reasoning_effort;