mirror of https://github.com/openclaw/openclaw.git
fix: normalize xai tool result image replay (#58017) (thanks @neeravmakwana)
* fix(xai): normalize image tool results for responses * fix(xai): handle reviewed tool result payload cases * fix: normalize xai tool result image replay (#58017) (thanks @neeravmakwana) --------- Co-authored-by: Ayaan Zaidi <hi@obviy.us>
This commit is contained in:
parent
763d5cea44
commit
54c69414ad
|
|
@ -118,6 +118,7 @@ Docs: https://docs.openclaw.ai
|
|||
- Diffs: fall back to plain text when `lang` hints are invalid during diff render and viewer hydration, so bad or stale language values no longer break the diff viewer. (#57902) Thanks @gumadeiras.
|
||||
- Doctor/plugins: skip false Matrix legacy-helper warnings when no migration plans exist, and keep bundled `enabledByDefault` plugins in the gateway startup set. (#57931) Thanks @dinakars777.
|
||||
- Matrix/CLI send: start one-off Matrix send clients before outbound delivery so `openclaw message send --channel matrix` restores E2EE in encrypted rooms instead of sending plain events. (#57936) Thanks @gumadeiras.
|
||||
- xAI/Responses: normalize image-bearing tool results for xAI responses payloads, including OpenResponses-style `input_image.source` parts, so image tool replays no longer 422 on the follow-up turn. (#58017) Thanks @neeravmakwana.
|
||||
- Cron/isolated sessions: carry the full live-session provider, model, and auth-profile selection across retry restarts so cron jobs with model overrides no longer fail or loop on mid-run model-switch requests. (#57972) Thanks @issaba1.
|
||||
- Matrix/direct rooms: stop trusting remote `is_direct`, honor explicit local `is_direct: false` for discovered DM candidates, and avoid extra member-state lookups for shared rooms so DM routing and repair stay aligned. (#57124) Thanks @w-sss.
|
||||
- Agents/sandbox: make remote FS bridge reads pin the parent path and open the file atomically in the helper so read access cannot race path resolution. Thanks @AntAISecurityLab and @vincentkoc.
|
||||
|
|
|
|||
|
|
@ -117,4 +117,239 @@ describe("xai stream wrappers", () => {
|
|||
expect(payload).not.toHaveProperty("reasoningEffort");
|
||||
expect(payload).not.toHaveProperty("reasoning_effort");
|
||||
});
|
||||
|
||||
// Image parts inside a function_call_output must be lifted into a trailing user
// message, leaving only the concatenated text as the tool output.
it("moves image-bearing tool results out of function_call_output payloads", () => {
  const payload: Record<string, unknown> = {
    input: [
      {
        type: "function_call_output",
        call_id: "call_1",
        output: [
          { type: "input_text", text: "Read image" },
          {
            type: "input_image",
            detail: "auto",
            image_url: "data:image/png;base64,QUJDRA==",
          },
        ],
      },
    ],
  };
  // Stub stream function: hands the payload to onPayload so the wrapper can rewrite it.
  const baseStreamFn: StreamFn = (_model, _context, options) => {
    options?.onPayload?.(payload, {} as Model<"openai-responses">);
    return {} as ReturnType<StreamFn>;
  };
  const wrapped = createXaiToolPayloadCompatibilityWrapper(baseStreamFn);

  // The model advertises "image" input, so images are expected to be replayed.
  void wrapped(
    {
      api: "openai-responses",
      provider: "xai",
      id: "grok-4-fast",
      input: ["text", "image"],
    } as Model<"openai-responses">,
    { messages: [] } as Context,
    {},
  );

  expect(payload.input).toEqual([
    {
      type: "function_call_output",
      call_id: "call_1",
      output: "Read image",
    },
    {
      type: "message",
      role: "user",
      content: [
        { type: "input_text", text: "Attached image(s) from tool result:" },
        {
          type: "input_image",
          detail: "auto",
          image_url: "data:image/png;base64,QUJDRA==",
        },
      ],
    },
  ]);
});
|
||||
|
||||
// Same as the image_url case, but the image is expressed through a nested
// `source` object (base64 + media_type); the part must be replayed unchanged.
it("replays source-based input_image parts from tool results", () => {
  const payload: Record<string, unknown> = {
    input: [
      {
        type: "function_call_output",
        call_id: "call_1",
        output: [
          { type: "input_text", text: "Read image" },
          {
            type: "input_image",
            source: {
              type: "base64",
              media_type: "image/png",
              data: "QUJDRA==",
            },
          },
        ],
      },
    ],
  };
  // Stub stream function: hands the payload to onPayload so the wrapper can rewrite it.
  const baseStreamFn: StreamFn = (_model, _context, options) => {
    options?.onPayload?.(payload, {} as Model<"openai-responses">);
    return {} as ReturnType<StreamFn>;
  };
  const wrapped = createXaiToolPayloadCompatibilityWrapper(baseStreamFn);

  void wrapped(
    {
      api: "openai-responses",
      provider: "xai",
      id: "grok-4-fast",
      input: ["text", "image"],
    } as Model<"openai-responses">,
    { messages: [] } as Context,
    {},
  );

  expect(payload.input).toEqual([
    {
      type: "function_call_output",
      call_id: "call_1",
      output: "Read image",
    },
    {
      type: "message",
      role: "user",
      content: [
        { type: "input_text", text: "Attached image(s) from tool result:" },
        {
          type: "input_image",
          source: {
            type: "base64",
            media_type: "image/png",
            data: "QUJDRA==",
          },
        },
      ],
    },
  ]);
});
|
||||
|
||||
// Two consecutive tool outputs each carry an image: both outputs must stay
// adjacent, and a single user message after them must hold both images in order.
it("keeps multiple tool outputs contiguous before replaying collected images", () => {
  const payload: Record<string, unknown> = {
    input: [
      {
        type: "function_call_output",
        call_id: "call_1",
        output: [
          { type: "input_text", text: "first" },
          {
            type: "input_image",
            detail: "auto",
            image_url: "data:image/png;base64,QUFBQQ==",
          },
        ],
      },
      {
        type: "function_call_output",
        call_id: "call_2",
        output: [
          { type: "input_text", text: "second" },
          {
            type: "input_image",
            detail: "auto",
            image_url: "data:image/png;base64,QkJCQg==",
          },
        ],
      },
    ],
  };
  // Stub stream function: hands the payload to onPayload so the wrapper can rewrite it.
  const baseStreamFn: StreamFn = (_model, _context, options) => {
    options?.onPayload?.(payload, {} as Model<"openai-responses">);
    return {} as ReturnType<StreamFn>;
  };
  const wrapped = createXaiToolPayloadCompatibilityWrapper(baseStreamFn);

  void wrapped(
    {
      api: "openai-responses",
      provider: "xai",
      id: "grok-4-fast",
      input: ["text", "image"],
    } as Model<"openai-responses">,
    { messages: [] } as Context,
    {},
  );

  expect(payload.input).toEqual([
    {
      type: "function_call_output",
      call_id: "call_1",
      output: "first",
    },
    {
      type: "function_call_output",
      call_id: "call_2",
      output: "second",
    },
    {
      type: "message",
      role: "user",
      content: [
        { type: "input_text", text: "Attached image(s) from tool result:" },
        {
          type: "input_image",
          detail: "auto",
          image_url: "data:image/png;base64,QUFBQQ==",
        },
        {
          type: "input_image",
          detail: "auto",
          image_url: "data:image/png;base64,QkJCQg==",
        },
      ],
    },
  ]);
});
|
||||
|
||||
// The model only lists "text" input: the image must be dropped (no replay
// message appended) and the tool output replaced by the fallback placeholder.
it("drops image blocks and uses fallback text for models without image input", () => {
  const payload: Record<string, unknown> = {
    input: [
      {
        type: "function_call_output",
        call_id: "call_1",
        output: [
          {
            type: "input_image",
            detail: "auto",
            image_url: "data:image/png;base64,QUJDRA==",
          },
        ],
      },
    ],
  };
  // Stub stream function: hands the payload to onPayload so the wrapper can rewrite it.
  const baseStreamFn: StreamFn = (_model, _context, options) => {
    options?.onPayload?.(payload, {} as Model<"openai-responses">);
    return {} as ReturnType<StreamFn>;
  };
  const wrapped = createXaiToolPayloadCompatibilityWrapper(baseStreamFn);

  void wrapped(
    {
      api: "openai-responses",
      provider: "xai",
      id: "grok-4-fast",
      input: ["text"],
    } as Model<"openai-responses">,
    { messages: [] } as Context,
    {},
  );

  expect(payload.input).toEqual([
    {
      type: "function_call_output",
      call_id: "call_1",
      output: "(see attached image)",
    },
  ]);
});
|
||||
});
|
||||
|
|
|
|||
|
|
@ -33,6 +33,122 @@ function stripUnsupportedStrictFlag(tool: unknown): unknown {
|
|||
return { ...toolObj, function: nextFunction };
|
||||
}
|
||||
|
||||
/**
 * Reports whether a model descriptor explicitly lists `"image"` among its inputs.
 *
 * @param model - Object whose optional `input` field is inspected.
 * @returns `true` only when `model.input` is an array that contains the string
 *   `"image"`; a missing or non-array `input` yields `false`.
 */
function supportsExplicitImageInput(model: { input?: unknown }): boolean {
  return Array.isArray(model.input) && model.input.includes("image");
}
|
||||
|
||||
// Lead-in text placed before the image parts in the user message that replays
// images lifted out of tool results.
const TOOL_RESULT_IMAGE_REPLAY_TEXT = "Attached image(s) from tool result:";
|
||||
|
||||
/**
 * An `input_image` content part in one of the two shapes accepted for replay:
 * - a nested `source` object, either `{ type: "url", url }` or
 *   `{ type: "base64", media_type, data }`;
 * - a flat `image_url` string with an optional `detail` hint.
 */
type ReplayableInputImagePart =
  | {
      type: "input_image";
      source: { type: "url"; url: string } | { type: "base64"; media_type: string; data: string };
    }
  | { type: "input_image"; image_url: string; detail?: string };
|
||||
|
||||
/**
 * Result of normalizing a single payload input item.
 *
 * - `normalizedItem`: the item to place back into the payload (unchanged when the
 *   item needed no rewriting).
 * - `imageParts`: image parts extracted from the item, to be replayed separately.
 */
type NormalizedFunctionCallOutput = {
  normalizedItem: unknown;
  imageParts: Array<Record<string, unknown>>;
};
|
||||
|
||||
/**
 * Type guard for `input_image` parts that carry enough data to be replayed.
 *
 * A part qualifies when `type` is `"input_image"` and either:
 * - `image_url` is a string, or
 * - `source` is an object of type `"url"` with a string `url`, or of type
 *   `"base64"` with string `media_type` and `data`.
 *
 * @param part - Candidate content part.
 * @returns `true` when the part matches one of the replayable shapes.
 */
function isReplayableInputImagePart(
  part: Record<string, unknown>,
): part is ReplayableInputImagePart {
  if (part.type !== "input_image") {
    return false;
  }
  // Flat form: a string image_url is sufficient on its own.
  if (typeof part.image_url === "string") {
    return true;
  }
  // Otherwise a nested source object is required (this also rejects null).
  if (!part.source || typeof part.source !== "object") {
    return false;
  }
  const source = part.source as {
    type?: unknown;
    url?: unknown;
    media_type?: unknown;
    data?: unknown;
  };
  if (source.type === "url") {
    return typeof source.url === "string";
  }
  return (
    source.type === "base64" &&
    typeof source.media_type === "string" &&
    typeof source.data === "string"
  );
}
|
||||
|
||||
/**
 * Flattens a `function_call_output` item whose `output` is an array of content
 * parts into a plain-string output, and extracts its replayable image parts.
 *
 * Items that are not objects, are not `function_call_output`, or whose `output`
 * is not an array are returned untouched with no image parts.
 *
 * @param item - One entry of the payload `input` array.
 * @param includeImages - When `false`, image parts are dropped instead of collected.
 * @returns The rewritten item plus the image parts extracted from it. The string
 *   output is the concatenation of all `input_text` texts; if that is empty and
 *   the array held any non-text entry, the placeholder `"(see attached image)"`
 *   is used, otherwise the empty string.
 */
function normalizeXaiResponsesFunctionCallOutput(
  item: unknown,
  includeImages: boolean,
): NormalizedFunctionCallOutput {
  if (!item || typeof item !== "object") {
    return { normalizedItem: item, imageParts: [] };
  }

  const itemObj = item as Record<string, unknown>;
  const rawOutput = itemObj.output;
  if (itemObj.type !== "function_call_output" || !Array.isArray(rawOutput)) {
    return { normalizedItem: itemObj, imageParts: [] };
  }

  // The payload is untyped: entries may be null/undefined or primitives. Keep only
  // real objects so property access below cannot throw.
  const rawParts: unknown[] = rawOutput;
  const outputParts = rawParts.filter(
    (part): part is Record<string, unknown> => typeof part === "object" && part !== null,
  );

  const textOutput = outputParts
    .filter(
      (part): part is { type: "input_text"; text: string } =>
        part.type === "input_text" && typeof part.text === "string",
    )
    .map((part) => part.text)
    .join("");

  const imageParts = includeImages
    ? outputParts.filter((part): part is ReplayableInputImagePart =>
        isReplayableInputImagePart(part),
      )
    : [];

  // Discarded non-object entries still count as "non-text" so the placeholder is
  // emitted rather than an empty output.
  const hadNonTextParts =
    outputParts.length < rawParts.length ||
    outputParts.some((part) => part.type !== "input_text");

  return {
    normalizedItem: {
      ...itemObj,
      output: textOutput || (hadNonTextParts ? "(see attached image)" : ""),
    },
    imageParts,
  };
}
|
||||
|
||||
/**
 * Rewrites `payloadObj.input` in place so that tool results carry plain-string
 * outputs, with any extracted images replayed in one trailing user message.
 *
 * Does nothing unless `model.api` is `"openai-responses"` and `payloadObj.input`
 * is an array.
 *
 * @param payloadObj - Request payload; its `input` property is replaced.
 * @param model - Model descriptor; `api` gates the rewrite and `input` decides
 *   whether images are replayed or dropped.
 */
function normalizeXaiResponsesToolResultPayload(
  payloadObj: Record<string, unknown>,
  model: { api?: unknown; input?: unknown },
): void {
  if (model.api !== "openai-responses" || !Array.isArray(payloadObj.input)) {
    return;
  }

  const includeImages = supportsExplicitImageInput(model);
  const normalizedInput: unknown[] = [];
  const collectedImageParts: Array<Record<string, unknown>> = [];

  // Normalize every item in order; images from all tool outputs are pooled so the
  // tool outputs themselves stay adjacent to each other.
  for (const item of payloadObj.input) {
    const normalized = normalizeXaiResponsesFunctionCallOutput(item, includeImages);
    normalizedInput.push(normalized.normalizedItem);
    collectedImageParts.push(...normalized.imageParts);
  }

  // NOTE(review): the replay message is appended after ALL input items, not right
  // after the tool outputs it came from — confirm this is intended when later
  // items follow the tool results.
  if (collectedImageParts.length > 0) {
    normalizedInput.push({
      type: "message",
      role: "user",
      content: [
        { type: "input_text", text: TOOL_RESULT_IMAGE_REPLAY_TEXT },
        ...collectedImageParts,
      ],
    });
  }

  payloadObj.input = normalizedInput;
}
|
||||
|
||||
export function createXaiToolPayloadCompatibilityWrapper(
|
||||
baseStreamFn: StreamFn | undefined,
|
||||
): StreamFn {
|
||||
|
|
@ -47,6 +163,7 @@ export function createXaiToolPayloadCompatibilityWrapper(
|
|||
if (Array.isArray(payloadObj.tools)) {
|
||||
payloadObj.tools = payloadObj.tools.map((tool) => stripUnsupportedStrictFlag(tool));
|
||||
}
|
||||
normalizeXaiResponsesToolResultPayload(payloadObj, model);
|
||||
delete payloadObj.reasoning;
|
||||
delete payloadObj.reasoningEffort;
|
||||
delete payloadObj.reasoning_effort;
|
||||
|
|
|
|||
Loading…
Reference in New Issue