From 54c69414ad522a5d56a6bdcdc85e33272d271c6f Mon Sep 17 00:00:00 2001
From: Neerav Makwana <neeravmakwana@gmail.com>
Date: Tue, 31 Mar 2026 00:40:51 -0400
Subject: [PATCH] fix: normalize xai tool result image replay (#58017) (thanks
 @neeravmakwana)

* fix(xai): normalize image tool results for responses

* fix(xai): handle reviewed tool result payload cases

* fix: normalize xai tool result image replay (#58017) (thanks @neeravmakwana)

---------

Co-authored-by: Ayaan Zaidi <hi@obviy.us>
---
 CHANGELOG.md                  |   1 +
 extensions/xai/stream.test.ts | 235 ++++++++++++++++++++++++++++++++++
 extensions/xai/stream.ts      | 117 +++++++++++++++++
 3 files changed, 353 insertions(+)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index d578e359b6f..b62f51a41cf 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -118,6 +118,7 @@ Docs: https://docs.openclaw.ai
 - Diffs: fall back to plain text when `lang` hints are invalid during diff render and viewer hydration, so bad or stale language values no longer break the diff viewer. (#57902) Thanks @gumadeiras.
 - Doctor/plugins: skip false Matrix legacy-helper warnings when no migration plans exist, and keep bundled `enabledByDefault` plugins in the gateway startup set. (#57931) Thanks @dinakars777.
 - Matrix/CLI send: start one-off Matrix send clients before outbound delivery so `openclaw message send --channel matrix` restores E2EE in encrypted rooms instead of sending plain events. (#57936) Thanks @gumadeiras.
+- xAI/Responses: normalize image-bearing tool results for xAI responses payloads, including OpenResponses-style `input_image.source` parts, so image tool replays no longer 422 on the follow-up turn. (#58017) Thanks @neeravmakwana.
 - Cron/isolated sessions: carry the full live-session provider, model, and auth-profile selection across retry restarts so cron jobs with model overrides no longer fail or loop on mid-run model-switch requests. (#57972) Thanks @issaba1.
 - Matrix/direct rooms: stop trusting remote `is_direct`, honor explicit local `is_direct: false` for discovered DM candidates, and avoid extra member-state lookups for shared rooms so DM routing and repair stay aligned. (#57124) Thanks @w-sss.
 - Agents/sandbox: make remote FS bridge reads pin the parent path and open the file atomically in the helper so read access cannot race path resolution. Thanks @AntAISecurityLab and @vincentkoc.
diff --git a/extensions/xai/stream.test.ts b/extensions/xai/stream.test.ts
index 552111d18eb..d6a3c7e399e 100644
--- a/extensions/xai/stream.test.ts
+++ b/extensions/xai/stream.test.ts
@@ -117,4 +117,239 @@ describe("xai stream wrappers", () => {
     expect(payload).not.toHaveProperty("reasoningEffort");
     expect(payload).not.toHaveProperty("reasoning_effort");
   });
+
+  it("moves image-bearing tool results out of function_call_output payloads", () => {
+    const payload: Record<string, unknown> = {
+      input: [
+        {
+          type: "function_call_output",
+          call_id: "call_1",
+          output: [
+            { type: "input_text", text: "Read image" },
+            {
+              type: "input_image",
+              detail: "auto",
+              image_url: "data:image/png;base64,QUJDRA==",
+            },
+          ],
+        },
+      ],
+    };
+    const baseStreamFn: StreamFn = (_model, _context, options) => {
+      options?.onPayload?.(payload, {} as Model<"openai-responses">);
+      return {} as ReturnType<StreamFn>;
+    };
+    const wrapped = createXaiToolPayloadCompatibilityWrapper(baseStreamFn);
+    // Model lists "image" in `input`, so image parts should be replayed, not dropped.
+    void wrapped(
+      {
+        api: "openai-responses",
+        provider: "xai",
+        id: "grok-4-fast",
+        input: ["text", "image"],
+      } as Model<"openai-responses">,
+      { messages: [] } as Context,
+      {},
+    );
+    // The stub handed `payload` to onPayload, so the rewrite shows up on this same object.
+    expect(payload.input).toEqual([
+      {
+        type: "function_call_output",
+        call_id: "call_1",
+        output: "Read image",
+      },
+      {
+        type: "message",
+        role: "user",
+        content: [
+          { type: "input_text", text: "Attached image(s) from tool result:" },
+          {
+            type: "input_image",
+            detail: "auto",
+            image_url: "data:image/png;base64,QUJDRA==",
+          },
+        ],
+      },
+    ]);
+  });
+
+  it("replays source-based input_image parts from tool results", () => {
+    const payload: Record<string, unknown> = {
+      input: [
+        {
+          type: "function_call_output",
+          call_id: "call_1",
+          output: [
+            { type: "input_text", text: "Read image" },
+            {
+              type: "input_image",
+              source: {
+                type: "base64",
+                media_type: "image/png",
+                data: "QUJDRA==",
+              },
+            },
+          ],
+        },
+      ],
+    };
+    const baseStreamFn: StreamFn = (_model, _context, options) => {
+      options?.onPayload?.(payload, {} as Model<"openai-responses">);
+      return {} as ReturnType<StreamFn>;
+    };
+    const wrapped = createXaiToolPayloadCompatibilityWrapper(baseStreamFn);
+    // Here the image is a base64 `source` object instead of an `image_url` string.
+    void wrapped(
+      {
+        api: "openai-responses",
+        provider: "xai",
+        id: "grok-4-fast",
+        input: ["text", "image"],
+      } as Model<"openai-responses">,
+      { messages: [] } as Context,
+      {},
+    );
+    // The `source` object is expected in the replay message unchanged.
+    expect(payload.input).toEqual([
+      {
+        type: "function_call_output",
+        call_id: "call_1",
+        output: "Read image",
+      },
+      {
+        type: "message",
+        role: "user",
+        content: [
+          { type: "input_text", text: "Attached image(s) from tool result:" },
+          {
+            type: "input_image",
+            source: {
+              type: "base64",
+              media_type: "image/png",
+              data: "QUJDRA==",
+            },
+          },
+        ],
+      },
+    ]);
+  });
+
+  it("keeps multiple tool outputs contiguous before replaying collected images", () => {
+    const payload: Record<string, unknown> = {
+      input: [
+        {
+          type: "function_call_output",
+          call_id: "call_1",
+          output: [
+            { type: "input_text", text: "first" },
+            {
+              type: "input_image",
+              detail: "auto",
+              image_url: "data:image/png;base64,QUFBQQ==",
+            },
+          ],
+        },
+        {
+          type: "function_call_output",
+          call_id: "call_2",
+          output: [
+            { type: "input_text", text: "second" },
+            {
+              type: "input_image",
+              detail: "auto",
+              image_url: "data:image/png;base64,QkJCQg==",
+            },
+          ],
+        },
+      ],
+    };
+    const baseStreamFn: StreamFn = (_model, _context, options) => {
+      options?.onPayload?.(payload, {} as Model<"openai-responses">);
+      return {} as ReturnType<StreamFn>;
+    };
+    const wrapped = createXaiToolPayloadCompatibilityWrapper(baseStreamFn);
+    // The payload holds two consecutive tool outputs, each carrying one image.
+    void wrapped(
+      {
+        api: "openai-responses",
+        provider: "xai",
+        id: "grok-4-fast",
+        input: ["text", "image"],
+      } as Model<"openai-responses">,
+      { messages: [] } as Context,
+      {},
+    );
+    // Both outputs stay adjacent; a single user message then carries both images in order.
+    expect(payload.input).toEqual([
+      {
+        type: "function_call_output",
+        call_id: "call_1",
+        output: "first",
+      },
+      {
+        type: "function_call_output",
+        call_id: "call_2",
+        output: "second",
+      },
+      {
+        type: "message",
+        role: "user",
+        content: [
+          { type: "input_text", text: "Attached image(s) from tool result:" },
+          {
+            type: "input_image",
+            detail: "auto",
+            image_url: "data:image/png;base64,QUFBQQ==",
+          },
+          {
+            type: "input_image",
+            detail: "auto",
+            image_url: "data:image/png;base64,QkJCQg==",
+          },
+        ],
+      },
+    ]);
+  });
+
+  it("drops image blocks and uses fallback text for models without image input", () => {
+    const payload: Record<string, unknown> = {
+      input: [
+        {
+          type: "function_call_output",
+          call_id: "call_1",
+          output: [
+            {
+              type: "input_image",
+              detail: "auto",
+              image_url: "data:image/png;base64,QUJDRA==",
+            },
+          ],
+        },
+      ],
+    };
+    const baseStreamFn: StreamFn = (_model, _context, options) => {
+      options?.onPayload?.(payload, {} as Model<"openai-responses">);
+      return {} as ReturnType<StreamFn>;
+    };
+    const wrapped = createXaiToolPayloadCompatibilityWrapper(baseStreamFn);
+    // Model input is text-only here, so image parts must not be replayed.
+    void wrapped(
+      {
+        api: "openai-responses",
+        provider: "xai",
+        id: "grok-4-fast",
+        input: ["text"],
+      } as Model<"openai-responses">,
+      { messages: [] } as Context,
+      {},
+    );
+    // With no text parts in the output, the placeholder string is expected instead.
+    expect(payload.input).toEqual([
+      {
+        type: "function_call_output",
+        call_id: "call_1",
+        output: "(see attached image)",
+      },
+    ]);
+  });
 });
diff --git a/extensions/xai/stream.ts b/extensions/xai/stream.ts
index 956fc6cad64..7af4efd1060 100644
--- a/extensions/xai/stream.ts
+++ b/extensions/xai/stream.ts
@@ -33,6 +33,122 @@ function stripUnsupportedStrictFlag(tool: unknown): unknown {
   return { ...toolObj, function: nextFunction };
 }
 
+/** Reports whether the model's `input` list explicitly includes "image". */
+const supportsExplicitImageInput = (model: { input?: unknown }): boolean =>
+  Array.isArray(model.input) && model.input.includes("image");
+
+const TOOL_RESULT_IMAGE_REPLAY_TEXT = "Attached image(s) from tool result:";
+/** `input_image` part shapes accepted for replay: `source`-based or `image_url`-based. */
+type ReplayableInputImagePart =
+  | {
+      type: "input_image";
+      source: { type: "url"; url: string } | { type: "base64"; media_type: string; data: string };
+    }
+  | { type: "input_image"; image_url: string; detail?: string };
+/** One normalized `input` item plus the image parts collected from it for replay. */
+type NormalizedFunctionCallOutput = {
+  normalizedItem: unknown;
+  imageParts: Array<Record<string, unknown>>;
+};
+
+/**
+ * Type guard for `input_image` parts that can be replayed as-is. Accepts a string
+ * `image_url`, a `source` of type "url" with a string `url`, or a `source` of type
+ * "base64" with string `media_type` and `data`; everything else is rejected.
+ */
+function isReplayableInputImagePart(
+  part: Record<string, unknown>,
+): part is ReplayableInputImagePart {
+  if (part.type !== "input_image") {
+    return false;
+  }
+  if (typeof part.image_url === "string") {
+    return true;
+  }
+  if (!part.source || typeof part.source !== "object") {
+    return false;
+  }
+  const { type: kind, url, media_type: mediaType, data } = part.source as Record<string, unknown>;
+  switch (kind) {
+    case "url":
+      return typeof url === "string";
+    case "base64":
+      return typeof mediaType === "string" && typeof data === "string";
+    default:
+      return false;
+  }
+}
+
+/**
+ * Flattens an array-valued `function_call_output` item to a string `output` and
+ * returns its replayable image parts separately (none unless `includeImages`).
+ * Any other `item` is passed through untouched. Null or primitive entries in the
+ * output array are tolerated instead of throwing on property access.
+ */
+function normalizeXaiResponsesFunctionCallOutput(
+  item: unknown,
+  includeImages: boolean,
+): NormalizedFunctionCallOutput {
+  if (!item || typeof item !== "object") {
+    return { normalizedItem: item, imageParts: [] };
+  }
+  const itemObj = item as Record<string, unknown>;
+  if (itemObj.type !== "function_call_output" || !Array.isArray(itemObj.output)) {
+    return { normalizedItem: itemObj, imageParts: [] };
+  }
+  let textOutput = "";
+  let hadNonTextParts = false;
+  const imageParts: Array<Record<string, unknown>> = [];
+  for (const rawPart of itemObj.output as unknown[]) {
+    // A null/primitive entry becomes `{}`: it counts as non-text, nothing is read from it.
+    const isObjectPart = !!rawPart && typeof rawPart === "object";
+    const part = (isObjectPart ? rawPart : {}) as Record<string, unknown>;
+    if (part.type !== "input_text") {
+      hadNonTextParts = true;
+    } else if (typeof part.text === "string") {
+      textOutput += part.text;
+    }
+    if (includeImages && isReplayableInputImagePart(part)) {
+      imageParts.push(part);
+    }
+  }
+  // Placeholder is used when no text was collected but non-text parts were present.
+  const output = textOutput || (hadNonTextParts ? "(see attached image)" : "");
+  return { normalizedItem: { ...itemObj, output }, imageParts };
+}
+
+/**
+ * Rewrites `payloadObj.input` in place when `model.api` is "openai-responses":
+ * array-valued tool outputs are flattened to strings and their images replayed in
+ * a synthetic user message placed right after the contiguous run of tool outputs
+ * they came from, so earlier images are not moved to the end of the conversation.
+ */
+function normalizeXaiResponsesToolResultPayload(
+  payloadObj: Record<string, unknown>,
+  model: { api?: unknown; input?: unknown },
+): void {
+  if (model.api !== "openai-responses" || !Array.isArray(payloadObj.input)) {
+    return;
+  }
+  const items: unknown[] = payloadObj.input;
+  const includeImages = supportsExplicitImageInput(model);
+  const rebuilt: unknown[] = [];
+  const pending: Array<Record<string, unknown>> = [];
+  items.forEach((item, index) => {
+    const normalized = normalizeXaiResponsesFunctionCallOutput(item, includeImages);
+    rebuilt.push(normalized.normalizedItem);
+    pending.push(...normalized.imageParts);
+    // Replay only once the run of tool outputs ends, so the outputs stay contiguous.
+    const next = items[index + 1] as { type?: unknown } | null | undefined;
+    if (pending.length > 0 && next?.type !== "function_call_output") {
+      const lead = { type: "input_text", text: TOOL_RESULT_IMAGE_REPLAY_TEXT };
+      // `splice(0)` hands over the collected parts and empties `pending` for the next run.
+      rebuilt.push({ type: "message", role: "user", content: [lead, ...pending.splice(0)] });
+    }
+  });
+  payloadObj.input = rebuilt;
+}
+
 export function createXaiToolPayloadCompatibilityWrapper(
   baseStreamFn: StreamFn | undefined,
 ): StreamFn {
@@ -47,6 +163,7 @@ export function createXaiToolPayloadCompatibilityWrapper(
           if (Array.isArray(payloadObj.tools)) {
             payloadObj.tools = payloadObj.tools.map((tool) => stripUnsupportedStrictFlag(tool));
           }
+          normalizeXaiResponsesToolResultPayload(payloadObj, model);
           delete payloadObj.reasoning;
           delete payloadObj.reasoningEffort;
           delete payloadObj.reasoning_effort;