fix: normalize xai tool result image replay (#58017) (thanks @neeravmakwana)

* fix(xai): normalize image tool results for responses

* fix(xai): handle reviewed tool result payload cases

* fix: normalize xai tool result image replay (#58017) (thanks @neeravmakwana)

---------

Co-authored-by: Ayaan Zaidi <hi@obviy.us>
This commit is contained in:
Neerav Makwana 2026-03-31 00:40:51 -04:00 committed by GitHub
parent 763d5cea44
commit 54c69414ad
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 353 additions and 0 deletions

View File

@ -118,6 +118,7 @@ Docs: https://docs.openclaw.ai
- Diffs: fall back to plain text when `lang` hints are invalid during diff render and viewer hydration, so bad or stale language values no longer break the diff viewer. (#57902) Thanks @gumadeiras.
- Doctor/plugins: skip false Matrix legacy-helper warnings when no migration plans exist, and keep bundled `enabledByDefault` plugins in the gateway startup set. (#57931) Thanks @dinakars777.
- Matrix/CLI send: start one-off Matrix send clients before outbound delivery so `openclaw message send --channel matrix` restores E2EE in encrypted rooms instead of sending plain events. (#57936) Thanks @gumadeiras.
- xAI/Responses: normalize image-bearing tool results for xAI responses payloads, including OpenResponses-style `input_image.source` parts, so image tool replays no longer 422 on the follow-up turn. (#58017) Thanks @neeravmakwana.
- Cron/isolated sessions: carry the full live-session provider, model, and auth-profile selection across retry restarts so cron jobs with model overrides no longer fail or loop on mid-run model-switch requests. (#57972) Thanks @issaba1.
- Matrix/direct rooms: stop trusting remote `is_direct`, honor explicit local `is_direct: false` for discovered DM candidates, and avoid extra member-state lookups for shared rooms so DM routing and repair stay aligned. (#57124) Thanks @w-sss.
- Agents/sandbox: make remote FS bridge reads pin the parent path and open the file atomically in the helper so read access cannot race path resolution. Thanks @AntAISecurityLab and @vincentkoc.

View File

@ -117,4 +117,239 @@ describe("xai stream wrappers", () => {
expect(payload).not.toHaveProperty("reasoningEffort");
expect(payload).not.toHaveProperty("reasoning_effort");
});
it("moves image-bearing tool results out of function_call_output payloads", () => {
  // Payload captured by reference so the wrapper's in-place mutation is observable.
  const requestPayload: Record<string, unknown> = {
    input: [
      {
        type: "function_call_output",
        call_id: "call_1",
        output: [
          { type: "input_text", text: "Read image" },
          { type: "input_image", detail: "auto", image_url: "data:image/png;base64,QUJDRA==" },
        ],
      },
    ],
  };
  // Stub stream fn that immediately fires onPayload with the captured payload.
  const base: StreamFn = (_model, _context, options) => {
    options?.onPayload?.(requestPayload, {} as Model<"openai-responses">);
    return {} as ReturnType<StreamFn>;
  };
  const imageCapableModel = {
    api: "openai-responses",
    provider: "xai",
    id: "grok-4-fast",
    input: ["text", "image"],
  } as Model<"openai-responses">;
  void createXaiToolPayloadCompatibilityWrapper(base)(
    imageCapableModel,
    { messages: [] } as Context,
    {},
  );
  // Tool output collapses to text; the image is replayed in a user message.
  expect(requestPayload.input).toEqual([
    { type: "function_call_output", call_id: "call_1", output: "Read image" },
    {
      type: "message",
      role: "user",
      content: [
        { type: "input_text", text: "Attached image(s) from tool result:" },
        { type: "input_image", detail: "auto", image_url: "data:image/png;base64,QUJDRA==" },
      ],
    },
  ]);
});
it("replays source-based input_image parts from tool results", () => {
  // OpenResponses-style image part: `source` object instead of `image_url`.
  const requestPayload: Record<string, unknown> = {
    input: [
      {
        type: "function_call_output",
        call_id: "call_1",
        output: [
          { type: "input_text", text: "Read image" },
          {
            type: "input_image",
            source: { type: "base64", media_type: "image/png", data: "QUJDRA==" },
          },
        ],
      },
    ],
  };
  const base: StreamFn = (_model, _context, options) => {
    options?.onPayload?.(requestPayload, {} as Model<"openai-responses">);
    return {} as ReturnType<StreamFn>;
  };
  const imageCapableModel = {
    api: "openai-responses",
    provider: "xai",
    id: "grok-4-fast",
    input: ["text", "image"],
  } as Model<"openai-responses">;
  void createXaiToolPayloadCompatibilityWrapper(base)(
    imageCapableModel,
    { messages: [] } as Context,
    {},
  );
  // The source-based part must survive the replay untouched.
  expect(requestPayload.input).toEqual([
    { type: "function_call_output", call_id: "call_1", output: "Read image" },
    {
      type: "message",
      role: "user",
      content: [
        { type: "input_text", text: "Attached image(s) from tool result:" },
        {
          type: "input_image",
          source: { type: "base64", media_type: "image/png", data: "QUJDRA==" },
        },
      ],
    },
  ]);
});
it("keeps multiple tool outputs contiguous before replaying collected images", () => {
  // Two tool results, each carrying its own image part.
  const requestPayload: Record<string, unknown> = {
    input: [
      {
        type: "function_call_output",
        call_id: "call_1",
        output: [
          { type: "input_text", text: "first" },
          { type: "input_image", detail: "auto", image_url: "data:image/png;base64,QUFBQQ==" },
        ],
      },
      {
        type: "function_call_output",
        call_id: "call_2",
        output: [
          { type: "input_text", text: "second" },
          { type: "input_image", detail: "auto", image_url: "data:image/png;base64,QkJCQg==" },
        ],
      },
    ],
  };
  const base: StreamFn = (_model, _context, options) => {
    options?.onPayload?.(requestPayload, {} as Model<"openai-responses">);
    return {} as ReturnType<StreamFn>;
  };
  const imageCapableModel = {
    api: "openai-responses",
    provider: "xai",
    id: "grok-4-fast",
    input: ["text", "image"],
  } as Model<"openai-responses">;
  void createXaiToolPayloadCompatibilityWrapper(base)(
    imageCapableModel,
    { messages: [] } as Context,
    {},
  );
  // Both function_call_output items stay adjacent; images are appended once,
  // after all tool results, in their original order.
  expect(requestPayload.input).toEqual([
    { type: "function_call_output", call_id: "call_1", output: "first" },
    { type: "function_call_output", call_id: "call_2", output: "second" },
    {
      type: "message",
      role: "user",
      content: [
        { type: "input_text", text: "Attached image(s) from tool result:" },
        { type: "input_image", detail: "auto", image_url: "data:image/png;base64,QUFBQQ==" },
        { type: "input_image", detail: "auto", image_url: "data:image/png;base64,QkJCQg==" },
      ],
    },
  ]);
});
it("drops image blocks and uses fallback text for models without image input", () => {
  // Tool result whose only content is an image.
  const requestPayload: Record<string, unknown> = {
    input: [
      {
        type: "function_call_output",
        call_id: "call_1",
        output: [
          { type: "input_image", detail: "auto", image_url: "data:image/png;base64,QUJDRA==" },
        ],
      },
    ],
  };
  const base: StreamFn = (_model, _context, options) => {
    options?.onPayload?.(requestPayload, {} as Model<"openai-responses">);
    return {} as ReturnType<StreamFn>;
  };
  // Model advertises text input only, so the image cannot be replayed.
  const textOnlyModel = {
    api: "openai-responses",
    provider: "xai",
    id: "grok-4-fast",
    input: ["text"],
  } as Model<"openai-responses">;
  void createXaiToolPayloadCompatibilityWrapper(base)(
    textOnlyModel,
    { messages: [] } as Context,
    {},
  );
  // No replay message is appended; the output becomes a textual placeholder.
  expect(requestPayload.input).toEqual([
    { type: "function_call_output", call_id: "call_1", output: "(see attached image)" },
  ]);
});
});

View File

@ -33,6 +33,122 @@ function stripUnsupportedStrictFlag(tool: unknown): unknown {
return { ...toolObj, function: nextFunction };
}
/** True when the model's declared `input` modalities explicitly include "image". */
function supportsExplicitImageInput(model: { input?: unknown }): boolean {
  const { input } = model;
  if (!Array.isArray(input)) {
    return false;
  }
  return input.includes("image");
}
// Lead-in text for the synthetic user message that carries images extracted
// from tool results (xAI rejects image parts inside function_call_output).
const TOOL_RESULT_IMAGE_REPLAY_TEXT = "Attached image(s) from tool result:";
// The two `input_image` shapes this wrapper knows how to replay:
// OpenResponses-style `source` objects (url or base64 payload), and the
// OpenAI-style `image_url` string variant with optional `detail`.
type ReplayableInputImagePart =
  | {
      type: "input_image";
      source: { type: "url"; url: string } | { type: "base64"; media_type: string; data: string };
    }
  | { type: "input_image"; image_url: string; detail?: string };
// Result of normalizing one input item: the text-only replacement item plus
// any image parts pulled out of it for later replay.
type NormalizedFunctionCallOutput = {
  normalizedItem: unknown;
  imageParts: Array<Record<string, unknown>>;
};
/**
 * Type guard for image parts this wrapper can replay in a user message:
 * either an `image_url` string variant or a well-formed `source` variant
 * (url, or base64 with media type and data).
 */
function isReplayableInputImagePart(
  part: Record<string, unknown>,
): part is ReplayableInputImagePart {
  if (part.type !== "input_image") {
    return false;
  }
  // OpenAI-style variant: a direct data/remote URL string.
  if (typeof part.image_url === "string") {
    return true;
  }
  // OpenResponses-style variant: a structured `source` object.
  const source = part.source;
  if (typeof source !== "object" || source === null) {
    return false;
  }
  const candidate = source as { type?: unknown; url?: unknown; media_type?: unknown; data?: unknown };
  switch (candidate.type) {
    case "url":
      return typeof candidate.url === "string";
    case "base64":
      return typeof candidate.media_type === "string" && typeof candidate.data === "string";
    default:
      return false;
  }
}
/**
 * Normalizes one payload input item for xAI: a `function_call_output` whose
 * `output` is a structured part array is collapsed to plain text, and any
 * replayable image parts are extracted so the caller can re-send them in a
 * separate user message. Non-matching items pass through unchanged.
 *
 * When `includeImages` is false (model lacks image input), image parts are
 * dropped and only the textual placeholder survives.
 *
 * Fix: the original indexed `.type` on every raw array entry, so a `null` or
 * primitive part in a malformed serialized payload threw a TypeError. Parts
 * are now filtered to plain objects before inspection.
 */
function normalizeXaiResponsesFunctionCallOutput(
  item: unknown,
  includeImages: boolean,
): NormalizedFunctionCallOutput {
  if (!item || typeof item !== "object") {
    return { normalizedItem: item, imageParts: [] };
  }
  const itemObj = item as Record<string, unknown>;
  if (itemObj.type !== "function_call_output" || !Array.isArray(itemObj.output)) {
    return { normalizedItem: itemObj, imageParts: [] };
  }
  // Guard against malformed entries (null / non-object parts) so one bad part
  // cannot crash the whole normalization pass.
  const outputParts = (itemObj.output as unknown[]).filter(
    (part): part is Record<string, unknown> => !!part && typeof part === "object",
  );
  const textOutput = outputParts
    .filter(
      (part): part is { type: "input_text"; text: string } =>
        part.type === "input_text" && typeof part.text === "string",
    )
    .map((part) => part.text)
    .join("");
  const imageParts = includeImages
    ? outputParts.filter((part): part is ReplayableInputImagePart =>
        isReplayableInputImagePart(part),
      )
    : [];
  // If anything other than plain text was present, emit a placeholder so the
  // model still learns an attachment existed even when text is empty.
  const hadNonTextParts = outputParts.some((part) => part.type !== "input_text");
  return {
    normalizedItem: {
      ...itemObj,
      output: textOutput || (hadNonTextParts ? "(see attached image)" : ""),
    },
    imageParts,
  };
}
/**
 * Rewrites a Responses-API payload in place for xAI: every function_call_output
 * is reduced to text, and all extracted image parts are replayed together as a
 * single trailing user message (only when the model accepts image input).
 * Non-Responses payloads and payloads without an `input` array are untouched.
 */
function normalizeXaiResponsesToolResultPayload(
  payloadObj: Record<string, unknown>,
  model: { api?: unknown; input?: unknown },
): void {
  const items = payloadObj.input;
  if (model.api !== "openai-responses" || !Array.isArray(items)) {
    return;
  }
  const includeImages = supportsExplicitImageInput(model);
  const rewrittenInput: unknown[] = [];
  const replayImages: Array<Record<string, unknown>> = [];
  for (const item of items) {
    const { normalizedItem, imageParts } = normalizeXaiResponsesFunctionCallOutput(
      item,
      includeImages,
    );
    rewrittenInput.push(normalizedItem);
    replayImages.push(...imageParts);
  }
  // Appending after all tool outputs keeps function_call/function_call_output
  // pairs contiguous, which the Responses API requires.
  if (replayImages.length > 0) {
    rewrittenInput.push({
      type: "message",
      role: "user",
      content: [{ type: "input_text", text: TOOL_RESULT_IMAGE_REPLAY_TEXT }, ...replayImages],
    });
  }
  payloadObj.input = rewrittenInput;
}
export function createXaiToolPayloadCompatibilityWrapper(
baseStreamFn: StreamFn | undefined,
): StreamFn {
@ -47,6 +163,7 @@ export function createXaiToolPayloadCompatibilityWrapper(
if (Array.isArray(payloadObj.tools)) {
payloadObj.tools = payloadObj.tools.map((tool) => stripUnsupportedStrictFlag(tool));
}
normalizeXaiResponsesToolResultPayload(payloadObj, model);
delete payloadObj.reasoning;
delete payloadObj.reasoningEffort;
delete payloadObj.reasoning_effort;