fix: support OpenAI Codex media understanding (#54829) (thanks @neeravmakwana)

* OpenAI: register Codex media understanding provider

* fix: route codex image prompts through system instructions

* fix: add changelog for codex image tool fix (#54829) (thanks @neeravmakwana)

* fix: remove any from provider registration tests (#54829) (thanks @neeravmakwana)

---------

Co-authored-by: Ayaan Zaidi <hi@obviy.us>
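
A condensed sketch of the two changes, assembled from the diff below (`MediaUnderstandingProvider` and `Context` are the plugin-sdk types the diff references; `describeImageWithModel`/`describeImagesWithModel` are the existing helpers reused for the new provider):

```ts
// 1) Register a second media-understanding provider for the Codex backend.
//    It is image-only; audio transcription stays on the plain "openai" provider.
export const openaiCodexMediaUnderstandingProvider: MediaUnderstandingProvider = {
  id: "openai-codex",
  capabilities: ["image"],
  describeImage: describeImageWithModel,
  describeImages: describeImagesWithModel,
};

// 2) Carry the image prompt as system instructions rather than a text content
//    part, so the Codex responses API receives it as `instructions` instead of
//    failing on a missing field.
function buildImageContext(
  prompt: string,
  images: Array<{ buffer: Buffer; mime?: string }>,
): Context {
  return {
    systemPrompt: prompt, // was `{ type: "text", text: prompt }` inside `content`
    messages: [
      {
        role: "user",
        content: images.map((image) => ({
          type: "image" as const,
          data: image.buffer.toString("base64"),
          mimeType: image.mime ?? "image/jpeg",
        })),
        timestamp: Date.now(),
      },
    ],
  };
}
```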
Neerav Makwana 2026-03-26 00:40:11 -04:00 committed by GitHub
parent 76ff0d9298
commit 6fd9d2ff38
9 changed files with 154 additions and 113 deletions

View File

@@ -17,6 +17,7 @@ Docs: https://docs.openclaw.ai
 ### Fixes
+- OpenAI Codex/image tools: register Codex for media understanding and route image prompts through Codex instructions so image analysis no longer fails on missing provider registration or missing `instructions`. (#54829) Thanks @neeravmakwana.
 - Telegram: deliver verbose tool summaries inside forum topic sessions again, so threaded topic chats now match DM verbose behavior. (#43236) Thanks @frankbuild.
 - Agents/sandbox: honor `tools.sandbox.tools.alsoAllow`, let explicit sandbox re-allows remove matching built-in default-deny tools, and keep sandbox explain/error guidance aligned with the effective sandbox tool policy. (#54492) Thanks @ngutman.
 - Agents/sandbox: make blocked-tool guidance glob-aware again, redact/sanitize session-specific explain hints for safer copy-paste, and avoid leaking control-character session keys in those hints. (#54684) Thanks @ngutman.

View File

@@ -1,6 +1,8 @@
 import fs from "node:fs/promises";
 import os from "node:os";
 import path from "node:path";
+import { getModel } from "@mariozechner/pi-ai";
+import { AuthStorage, ModelRegistry } from "@mariozechner/pi-coding-agent";
 import OpenAI from "openai";
 import * as providerAuth from "openclaw/plugin-sdk/provider-auth";
 import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
@@ -38,52 +40,42 @@ const liveEnabled = OPENAI_API_KEY.trim().length > 0 && process.env.OPENCLAW_LIV
 const describeLive = liveEnabled ? describe : describe.skip;
 const EMPTY_AUTH_STORE = { version: 1, profiles: {} } as const;
 
-function createTemplateModel(modelId: string) {
+function resolveTemplateModelId(modelId: string) {
   switch (modelId) {
     case "gpt-5.4":
-      return {
-        id: "gpt-5.2",
-        name: "GPT-5.2",
-        provider: "openai",
-        api: "openai-completions",
-        baseUrl: "https://api.openai.com/v1",
-        reasoning: true,
-        input: ["text", "image"],
-        cost: { input: 1, output: 2, cacheRead: 0, cacheWrite: 0 },
-        contextWindow: 400_000,
-        maxTokens: 128_000,
-      };
+      return "gpt-5.2";
     case "gpt-5.4-mini":
-      return {
-        id: "gpt-5-mini",
-        name: "GPT-5 mini",
-        provider: "openai",
-        api: "openai-completions",
-        baseUrl: "https://api.openai.com/v1",
-        reasoning: true,
-        input: ["text", "image"],
-        cost: { input: 1, output: 2, cacheRead: 0, cacheWrite: 0 },
-        contextWindow: 400_000,
-        maxTokens: 128_000,
-      };
+      return "gpt-5-mini";
     case "gpt-5.4-nano":
-      return {
-        id: "gpt-5-nano",
-        name: "GPT-5 nano",
-        provider: "openai",
-        api: "openai-completions",
-        baseUrl: "https://api.openai.com/v1",
-        reasoning: true,
-        input: ["text", "image"],
-        cost: { input: 0.5, output: 1, cacheRead: 0, cacheWrite: 0 },
-        contextWindow: 200_000,
-        maxTokens: 64_000,
-      };
+      return "gpt-5-nano";
     default:
       throw new Error(`Unsupported live OpenAI plugin model: ${modelId}`);
   }
 }
 
+function createTemplateModelRegistry(modelId: string): ModelRegistry {
+  const registry = new ModelRegistry(AuthStorage.inMemory());
+  const template = getModel("openai", resolveTemplateModelId(modelId));
+  registry.registerProvider("openai", {
+    apiKey: "test",
+    baseUrl: template.baseUrl,
+    models: [
+      {
+        id: template.id,
+        name: template.name,
+        api: template.api,
+        reasoning: template.reasoning,
+        input: template.input,
+        cost: template.cost,
+        contextWindow: template.contextWindow,
+        maxTokens: template.maxTokens,
+        ...(template.compat ? { compat: template.compat } : {}),
+      },
+    ],
+  });
+  return registry;
+}
+
 const registerOpenAIPlugin = () =>
   registerProviderPlugin({
     plugin,
@@ -193,15 +185,15 @@ describe("openai plugin", () => {
     const { providers, speechProviders, mediaProviders, imageProviders } = registerOpenAIPlugin();
     expect(providers).toHaveLength(2);
-    expect(
-      providers.map(
-        (provider) =>
-          // oxlint-disable-next-line typescript/no-explicit-any
-          (provider as any).id,
-      ),
-    ).toEqual(["openai", "openai-codex"]);
+    expect(providers.map((provider) => provider.id)).toEqual(["openai", "openai-codex"]);
     expect(speechProviders).toHaveLength(1);
-    expect(mediaProviders).toHaveLength(1);
+    expect(mediaProviders.map((provider) => provider.id)).toEqual(["openai", "openai-codex"]);
+    const codexMediaProvider = requireRegisteredProvider(
+      mediaProviders,
+      "openai-codex",
+      "media provider",
+    );
+    expect(codexMediaProvider.capabilities).toEqual(["image"]);
     expect(imageProviders).toHaveLength(1);
   });
@@ -314,27 +306,17 @@ describeLive("openai plugin live", () => {
     const { providers } = registerOpenAIPlugin();
     const provider = requireRegisteredProvider(providers, "openai");
-    // oxlint-disable-next-line typescript/no-explicit-any
-    const resolved = (provider as any).resolveDynamicModel?.({
+    const resolved = provider.resolveDynamicModel?.({
       provider: "openai",
       modelId: LIVE_MODEL_ID,
-      modelRegistry: {
-        find(providerId: string, id: string) {
-          if (providerId !== "openai") {
-            return null;
-          }
-          const template = createTemplateModel(LIVE_MODEL_ID);
-          return id === template.id ? template : null;
-        },
-      },
+      modelRegistry: createTemplateModelRegistry(LIVE_MODEL_ID),
     });
     if (!resolved) {
       throw new Error("openai provider did not resolve the live model");
     }
-    // oxlint-disable-next-line typescript/no-explicit-any
-    const normalized = (provider as any).normalizeResolvedModel?.({
+    const normalized = provider.normalizeResolvedModel?.({
       provider: "openai",
       modelId: resolved.id,
       model: resolved,
@@ -364,8 +346,7 @@ describeLive("openai plugin live", () => {
     const { speechProviders } = registerOpenAIPlugin();
     const speechProvider = requireRegisteredProvider(speechProviders, "openai");
-    // oxlint-disable-next-line typescript/no-explicit-any
-    const voices = await (speechProvider as any).listVoices?.({});
+    const voices = await speechProvider.listVoices?.({});
     if (!voices) {
       throw new Error("openai speech provider did not return voices");
     }
@@ -374,8 +355,7 @@ describeLive("openai plugin live", () => {
     const cfg = createLiveConfig();
     const ttsConfig = createLiveTtsConfig();
-    // oxlint-disable-next-line typescript/no-explicit-any
-    const audioFile = await (speechProvider as any).synthesize({
+    const audioFile = await speechProvider.synthesize({
       text: "OpenClaw integration test OK.",
       cfg,
       config: ttsConfig,
@@ -385,8 +365,7 @@ describeLive("openai plugin live", () => {
     expect(audioFile.fileExtension).toBe(".mp3");
     expect(audioFile.audioBuffer.byteLength).toBeGreaterThan(512);
-    // oxlint-disable-next-line typescript/no-explicit-any
-    const telephony = await (speechProvider as any).synthesizeTelephony?.({
+    const telephony = await speechProvider.synthesizeTelephony?.({
       text: "Telephony check OK.",
       cfg,
       config: ttsConfig,
@@ -404,16 +383,14 @@ describeLive("openai plugin live", () => {
     const cfg = createLiveConfig();
     const ttsConfig = createLiveTtsConfig();
-    // oxlint-disable-next-line typescript/no-explicit-any
-    const synthesized = await (speechProvider as any).synthesize({
+    const synthesized = await speechProvider.synthesize({
       text: "OpenClaw integration test OK.",
       cfg,
       config: ttsConfig,
       target: "audio-file",
     });
-    // oxlint-disable-next-line typescript/no-explicit-any
-    const transcription = await (mediaProvider as any).transcribeAudio?.({
+    const transcription = await mediaProvider.transcribeAudio?.({
       buffer: synthesized.audioBuffer,
       fileName: "openai-plugin-live.mp3",
       mime: "audio/mpeg",
@@ -435,8 +412,7 @@ describeLive("openai plugin live", () => {
     const agentDir = await createTempAgentDir();
     try {
-      // oxlint-disable-next-line typescript/no-explicit-any
-      const generated = await (imageProvider as any).generateImage({
+      const generated = await imageProvider.generateImage({
         provider: "openai",
         model: LIVE_IMAGE_MODEL,
         prompt: "Create a minimal flat orange square centered on a white background.",
@@ -464,8 +440,7 @@ describeLive("openai plugin live", () => {
     const agentDir = await createTempAgentDir();
     try {
-      // oxlint-disable-next-line typescript/no-explicit-any
-      const description = await (mediaProvider as any).describeImage?.({
+      const description = await mediaProvider.describeImage?.({
         buffer: createReferencePng(),
         fileName: "reference.png",
         mime: "image/png",

View File

@@ -1,6 +1,9 @@
 import { definePluginEntry } from "openclaw/plugin-sdk/plugin-entry";
 import { buildOpenAIImageGenerationProvider } from "./image-generation-provider.js";
-import { openaiMediaUnderstandingProvider } from "./media-understanding-provider.js";
+import {
+  openaiCodexMediaUnderstandingProvider,
+  openaiMediaUnderstandingProvider,
+} from "./media-understanding-provider.js";
 import { buildOpenAICodexProviderPlugin } from "./openai-codex-provider.js";
 import { buildOpenAIProvider } from "./openai-provider.js";
 import { buildOpenAISpeechProvider } from "./speech-provider.js";
@@ -14,6 +17,7 @@ export default definePluginEntry({
     api.registerProvider(buildOpenAICodexProviderPlugin());
     api.registerSpeechProvider(buildOpenAISpeechProvider());
     api.registerMediaUnderstandingProvider(openaiMediaUnderstandingProvider);
+    api.registerMediaUnderstandingProvider(openaiCodexMediaUnderstandingProvider);
     api.registerImageGenerationProvider(buildOpenAIImageGenerationProvider());
   },
 });

View File

@@ -24,3 +24,10 @@ export const openaiMediaUnderstandingProvider: MediaUnderstandingProvider = {
   describeImages: describeImagesWithModel,
   transcribeAudio: transcribeOpenAiAudio,
 };
+
+export const openaiCodexMediaUnderstandingProvider: MediaUnderstandingProvider = {
+  id: "openai-codex",
+  capabilities: ["image"],
+  describeImage: describeImageWithModel,
+  describeImages: describeImagesWithModel,
+};

View File

@@ -1,3 +1,4 @@
+import { AuthStorage, ModelRegistry } from "@mariozechner/pi-coding-agent";
 import OpenAI from "openai";
 import { describe, expect, it } from "vitest";
 import {
@@ -25,13 +26,7 @@ describe("openrouter plugin", () => {
     registerOpenRouterPlugin();
     expect(providers).toHaveLength(1);
-    expect(
-      providers.map(
-        (provider) =>
-          // oxlint-disable-next-line typescript/no-explicit-any
-          (provider as any).id,
-      ),
-    ).toEqual(["openrouter"]);
+    expect(providers.map((provider) => provider.id)).toEqual(["openrouter"]);
     expect(speechProviders).toHaveLength(0);
     expect(mediaProviders).toHaveLength(0);
     expect(imageProviders).toHaveLength(0);
@@ -43,15 +38,10 @@ describeLive("openrouter plugin live", () => {
     const { providers } = registerOpenRouterPlugin();
     const provider = requireRegisteredProvider(providers, "openrouter");
-    // oxlint-disable-next-line typescript/no-explicit-any
-    const resolved = (provider as any).resolveDynamicModel?.({
+    const resolved = provider.resolveDynamicModel?.({
       provider: "openrouter",
       modelId: LIVE_MODEL_ID,
-      modelRegistry: {
-        find() {
-          return null;
-        },
-      },
+      modelRegistry: new ModelRegistry(AuthStorage.inMemory()),
     });
     if (!resolved) {
       throw new Error(`openrouter provider did not resolve ${LIVE_MODEL_ID}`);

View File

@@ -177,6 +177,67 @@ describe("describeImageWithModel", () => {
     expect(minimaxUnderstandImageMock).not.toHaveBeenCalled();
   });
 
+  it("passes image prompt as system instructions for codex image requests", async () => {
+    discoverModelsMock.mockReturnValue({
+      find: vi.fn(() => ({
+        provider: "openai-codex",
+        id: "gpt-5.4",
+        input: ["text", "image"],
+        baseUrl: "https://chatgpt.com/backend-api",
+      })),
+    });
+    completeMock.mockResolvedValue({
+      role: "assistant",
+      api: "openai-codex-responses",
+      provider: "openai-codex",
+      model: "gpt-5.4",
+      stopReason: "stop",
+      timestamp: Date.now(),
+      content: [{ type: "text", text: "codex ok" }],
+    });
+
+    const result = await describeImageWithModel({
+      cfg: {},
+      agentDir: "/tmp/openclaw-agent",
+      provider: "openai-codex",
+      model: "gpt-5.4",
+      buffer: Buffer.from("png-bytes"),
+      fileName: "image.png",
+      mime: "image/png",
+      prompt: "Describe the image.",
+      timeoutMs: 1000,
+    });
+
+    expect(result).toEqual({
+      text: "codex ok",
+      model: "gpt-5.4",
+    });
+    expect(completeMock).toHaveBeenCalledOnce();
+    expect(completeMock).toHaveBeenCalledWith(
+      expect.objectContaining({
+        provider: "openai-codex",
+        id: "gpt-5.4",
+      }),
+      expect.objectContaining({
+        systemPrompt: "Describe the image.",
+        messages: [
+          expect.objectContaining({
+            role: "user",
+            content: [
+              expect.objectContaining({
+                type: "image",
+                mimeType: "image/png",
+              }),
+            ],
+          }),
+        ],
+      }),
+      expect.any(Object),
+    );
+    const [, context] = completeMock.mock.calls[0] ?? [];
+    expect(context?.messages?.[0]?.content).toHaveLength(1);
+  });
+
   it("normalizes deprecated google flash ids before lookup and keeps profile auth selection", async () => {
     const findMock = vi.fn((provider: string, modelId: string) => {
       expect(provider).toBe("google");

View File

@@ -73,17 +73,15 @@ function buildImageContext(
   images: Array<{ buffer: Buffer; mime?: string }>,
 ): Context {
   return {
+    systemPrompt: prompt,
     messages: [
       {
         role: "user",
-        content: [
-          { type: "text", text: prompt },
-          ...images.map((image) => ({
-            type: "image" as const,
-            data: image.buffer.toString("base64"),
-            mimeType: image.mime ?? "image/jpeg",
-          })),
-        ],
+        content: images.map((image) => ({
+          type: "image" as const,
+          data: image.buffer.toString("base64"),
+          mimeType: image.mime ?? "image/jpeg",
+        })),
         timestamp: Date.now(),
       },
     ],

View File

@@ -184,7 +184,10 @@ describe("plugin contract registry", () => {
     ]);
     expect(findMediaUnderstandingProviderIdsForPlugin("mistral")).toEqual(["mistral"]);
     expect(findMediaUnderstandingProviderIdsForPlugin("moonshot")).toEqual(["moonshot"]);
-    expect(findMediaUnderstandingProviderIdsForPlugin("openai")).toEqual(["openai"]);
+    expect(findMediaUnderstandingProviderIdsForPlugin("openai")).toEqual([
+      "openai",
+      "openai-codex",
+    ]);
     expect(findMediaUnderstandingProviderIdsForPlugin("zai")).toEqual(["zai"]);
   });
@@ -244,7 +247,7 @@ describe("plugin contract registry", () => {
     expect(findRegistrationForPlugin("openai")).toMatchObject({
       providerIds: ["openai", "openai-codex"],
       speechProviderIds: ["openai"],
-      mediaUnderstandingProviderIds: ["openai"],
+      mediaUnderstandingProviderIds: ["openai", "openai-codex"],
       imageGenerationProviderIds: ["openai"],
       videoGenerationProviderIds: [],
     });

View File

@@ -1,10 +1,16 @@
+import type {
+  ImageGenerationProviderPlugin,
+  MediaUnderstandingProviderPlugin,
+  ProviderPlugin,
+  SpeechProviderPlugin,
+} from "../../../src/plugins/types.js";
 import { createTestPluginApi } from "./plugin-api.js";
 
 type RegisteredProviderCollections = {
-  providers: unknown[];
-  speechProviders: unknown[];
-  mediaProviders: unknown[];
-  imageProviders: unknown[];
+  providers: ProviderPlugin[];
+  speechProviders: SpeechProviderPlugin[];
+  mediaProviders: MediaUnderstandingProviderPlugin[];
+  imageProviders: ImageGenerationProviderPlugin[];
 };
 
 type ProviderPluginModule = {
@@ -16,10 +22,10 @@ export function registerProviderPlugin(params: {
   id: string;
   name: string;
 }): RegisteredProviderCollections {
-  const providers: unknown[] = [];
-  const speechProviders: unknown[] = [];
-  const mediaProviders: unknown[] = [];
-  const imageProviders: unknown[] = [];
+  const providers: ProviderPlugin[] = [];
+  const speechProviders: SpeechProviderPlugin[] = [];
+  const mediaProviders: MediaUnderstandingProviderPlugin[] = [];
+  const imageProviders: ImageGenerationProviderPlugin[] = [];
 
   params.plugin.register(
     createTestPluginApi({
@@ -46,18 +52,14 @@ export function registerProviderPlugin(params: {
   return { providers, speechProviders, mediaProviders, imageProviders };
 }
 
-export function requireRegisteredProvider<T = unknown>(
-  entries: unknown[],
+export function requireRegisteredProvider<T extends { id: string }>(
+  entries: T[],
   id: string,
   label = "provider",
 ): T {
-  const entry = entries.find(
-    (candidate) =>
-      // oxlint-disable-next-line typescript/no-explicit-any
-      (candidate as any).id === id,
-  );
+  const entry = entries.find((candidate) => candidate.id === id);
   if (!entry) {
     throw new Error(`${label} ${id} was not registered`);
   }
-  return entry as T;
+  return entry;
 }
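
A note on the helper change above: constraining `requireRegisteredProvider` to `T extends { id: string }` lets call sites keep full inference and drop every `as any` cast, as in the registration test earlier in this diff:

```ts
const { mediaProviders } = registerOpenAIPlugin();
const codexMediaProvider = requireRegisteredProvider(
  mediaProviders, // MediaUnderstandingProviderPlugin[]
  "openai-codex",
  "media provider",
);
// Inferred as MediaUnderstandingProviderPlugin, so `capabilities` needs no cast.
expect(codexMediaProvider.capabilities).toEqual(["image"]);
```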