mirror of https://github.com/openclaw/openclaw.git
fix: support OpenAI Codex media understanding (#54829) (thanks @neeravmakwana)
* OpenAI: register Codex media understanding provider
* fix: route codex image prompts through system instructions
* fix: add changelog for codex image tool fix (#54829) (thanks @neeravmakwana)
* fix: remove `any` from provider registration tests (#54829) (thanks @neeravmakwana)

---------

Co-authored-by: Ayaan Zaidi <hi@obviy.us>
This commit is contained in:
parent
76ff0d9298
commit
6fd9d2ff38
|
|
@ -17,6 +17,7 @@ Docs: https://docs.openclaw.ai
|
|||
|
||||
### Fixes
|
||||
|
||||
- OpenAI Codex/image tools: register Codex for media understanding and route image prompts through Codex instructions so image analysis no longer fails on missing provider registration or missing `instructions`. (#54829) Thanks @neeravmakwana.
|
||||
- Telegram: deliver verbose tool summaries inside forum topic sessions again, so threaded topic chats now match DM verbose behavior. (#43236) Thanks @frankbuild.
|
||||
- Agents/sandbox: honor `tools.sandbox.tools.alsoAllow`, let explicit sandbox re-allows remove matching built-in default-deny tools, and keep sandbox explain/error guidance aligned with the effective sandbox tool policy. (#54492) Thanks @ngutman.
|
||||
- Agents/sandbox: make blocked-tool guidance glob-aware again, redact/sanitize session-specific explain hints for safer copy-paste, and avoid leaking control-character session keys in those hints. (#54684) Thanks @ngutman.
|
||||
|
|
|
|||
|
|
@ -1,6 +1,8 @@
|
|||
import fs from "node:fs/promises";
|
||||
import os from "node:os";
|
||||
import path from "node:path";
|
||||
import { getModel } from "@mariozechner/pi-ai";
|
||||
import { AuthStorage, ModelRegistry } from "@mariozechner/pi-coding-agent";
|
||||
import OpenAI from "openai";
|
||||
import * as providerAuth from "openclaw/plugin-sdk/provider-auth";
|
||||
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
|
||||
|
|
@ -38,52 +40,42 @@ const liveEnabled = OPENAI_API_KEY.trim().length > 0 && process.env.OPENCLAW_LIV
|
|||
const describeLive = liveEnabled ? describe : describe.skip;
|
||||
const EMPTY_AUTH_STORE = { version: 1, profiles: {} } as const;
|
||||
|
||||
function createTemplateModel(modelId: string) {
|
||||
function resolveTemplateModelId(modelId: string) {
|
||||
switch (modelId) {
|
||||
case "gpt-5.4":
|
||||
return {
|
||||
id: "gpt-5.2",
|
||||
name: "GPT-5.2",
|
||||
provider: "openai",
|
||||
api: "openai-completions",
|
||||
baseUrl: "https://api.openai.com/v1",
|
||||
reasoning: true,
|
||||
input: ["text", "image"],
|
||||
cost: { input: 1, output: 2, cacheRead: 0, cacheWrite: 0 },
|
||||
contextWindow: 400_000,
|
||||
maxTokens: 128_000,
|
||||
};
|
||||
return "gpt-5.2";
|
||||
case "gpt-5.4-mini":
|
||||
return {
|
||||
id: "gpt-5-mini",
|
||||
name: "GPT-5 mini",
|
||||
provider: "openai",
|
||||
api: "openai-completions",
|
||||
baseUrl: "https://api.openai.com/v1",
|
||||
reasoning: true,
|
||||
input: ["text", "image"],
|
||||
cost: { input: 1, output: 2, cacheRead: 0, cacheWrite: 0 },
|
||||
contextWindow: 400_000,
|
||||
maxTokens: 128_000,
|
||||
};
|
||||
return "gpt-5-mini";
|
||||
case "gpt-5.4-nano":
|
||||
return {
|
||||
id: "gpt-5-nano",
|
||||
name: "GPT-5 nano",
|
||||
provider: "openai",
|
||||
api: "openai-completions",
|
||||
baseUrl: "https://api.openai.com/v1",
|
||||
reasoning: true,
|
||||
input: ["text", "image"],
|
||||
cost: { input: 0.5, output: 1, cacheRead: 0, cacheWrite: 0 },
|
||||
contextWindow: 200_000,
|
||||
maxTokens: 64_000,
|
||||
};
|
||||
return "gpt-5-nano";
|
||||
default:
|
||||
throw new Error(`Unsupported live OpenAI plugin model: ${modelId}`);
|
||||
}
|
||||
}
|
||||
|
||||
function createTemplateModelRegistry(modelId: string): ModelRegistry {
|
||||
const registry = new ModelRegistry(AuthStorage.inMemory());
|
||||
const template = getModel("openai", resolveTemplateModelId(modelId));
|
||||
registry.registerProvider("openai", {
|
||||
apiKey: "test",
|
||||
baseUrl: template.baseUrl,
|
||||
models: [
|
||||
{
|
||||
id: template.id,
|
||||
name: template.name,
|
||||
api: template.api,
|
||||
reasoning: template.reasoning,
|
||||
input: template.input,
|
||||
cost: template.cost,
|
||||
contextWindow: template.contextWindow,
|
||||
maxTokens: template.maxTokens,
|
||||
...(template.compat ? { compat: template.compat } : {}),
|
||||
},
|
||||
],
|
||||
});
|
||||
return registry;
|
||||
}
|
||||
|
||||
const registerOpenAIPlugin = () =>
|
||||
registerProviderPlugin({
|
||||
plugin,
|
||||
|
|
@ -193,15 +185,15 @@ describe("openai plugin", () => {
|
|||
const { providers, speechProviders, mediaProviders, imageProviders } = registerOpenAIPlugin();
|
||||
|
||||
expect(providers).toHaveLength(2);
|
||||
expect(
|
||||
providers.map(
|
||||
(provider) =>
|
||||
// oxlint-disable-next-line typescript/no-explicit-any
|
||||
(provider as any).id,
|
||||
),
|
||||
).toEqual(["openai", "openai-codex"]);
|
||||
expect(providers.map((provider) => provider.id)).toEqual(["openai", "openai-codex"]);
|
||||
expect(speechProviders).toHaveLength(1);
|
||||
expect(mediaProviders).toHaveLength(1);
|
||||
expect(mediaProviders.map((provider) => provider.id)).toEqual(["openai", "openai-codex"]);
|
||||
const codexMediaProvider = requireRegisteredProvider(
|
||||
mediaProviders,
|
||||
"openai-codex",
|
||||
"media provider",
|
||||
);
|
||||
expect(codexMediaProvider.capabilities).toEqual(["image"]);
|
||||
expect(imageProviders).toHaveLength(1);
|
||||
});
|
||||
|
||||
|
|
@ -314,27 +306,17 @@ describeLive("openai plugin live", () => {
|
|||
const { providers } = registerOpenAIPlugin();
|
||||
const provider = requireRegisteredProvider(providers, "openai");
|
||||
|
||||
// oxlint-disable-next-line typescript/no-explicit-any
|
||||
const resolved = (provider as any).resolveDynamicModel?.({
|
||||
const resolved = provider.resolveDynamicModel?.({
|
||||
provider: "openai",
|
||||
modelId: LIVE_MODEL_ID,
|
||||
modelRegistry: {
|
||||
find(providerId: string, id: string) {
|
||||
if (providerId !== "openai") {
|
||||
return null;
|
||||
}
|
||||
const template = createTemplateModel(LIVE_MODEL_ID);
|
||||
return id === template.id ? template : null;
|
||||
},
|
||||
},
|
||||
modelRegistry: createTemplateModelRegistry(LIVE_MODEL_ID),
|
||||
});
|
||||
|
||||
if (!resolved) {
|
||||
throw new Error("openai provider did not resolve the live model");
|
||||
}
|
||||
|
||||
// oxlint-disable-next-line typescript/no-explicit-any
|
||||
const normalized = (provider as any).normalizeResolvedModel?.({
|
||||
const normalized = provider.normalizeResolvedModel?.({
|
||||
provider: "openai",
|
||||
modelId: resolved.id,
|
||||
model: resolved,
|
||||
|
|
@ -364,8 +346,7 @@ describeLive("openai plugin live", () => {
|
|||
const { speechProviders } = registerOpenAIPlugin();
|
||||
const speechProvider = requireRegisteredProvider(speechProviders, "openai");
|
||||
|
||||
// oxlint-disable-next-line typescript/no-explicit-any
|
||||
const voices = await (speechProvider as any).listVoices?.({});
|
||||
const voices = await speechProvider.listVoices?.({});
|
||||
if (!voices) {
|
||||
throw new Error("openai speech provider did not return voices");
|
||||
}
|
||||
|
|
@ -374,8 +355,7 @@ describeLive("openai plugin live", () => {
|
|||
const cfg = createLiveConfig();
|
||||
const ttsConfig = createLiveTtsConfig();
|
||||
|
||||
// oxlint-disable-next-line typescript/no-explicit-any
|
||||
const audioFile = await (speechProvider as any).synthesize({
|
||||
const audioFile = await speechProvider.synthesize({
|
||||
text: "OpenClaw integration test OK.",
|
||||
cfg,
|
||||
config: ttsConfig,
|
||||
|
|
@ -385,8 +365,7 @@ describeLive("openai plugin live", () => {
|
|||
expect(audioFile.fileExtension).toBe(".mp3");
|
||||
expect(audioFile.audioBuffer.byteLength).toBeGreaterThan(512);
|
||||
|
||||
// oxlint-disable-next-line typescript/no-explicit-any
|
||||
const telephony = await (speechProvider as any).synthesizeTelephony?.({
|
||||
const telephony = await speechProvider.synthesizeTelephony?.({
|
||||
text: "Telephony check OK.",
|
||||
cfg,
|
||||
config: ttsConfig,
|
||||
|
|
@ -404,16 +383,14 @@ describeLive("openai plugin live", () => {
|
|||
const cfg = createLiveConfig();
|
||||
const ttsConfig = createLiveTtsConfig();
|
||||
|
||||
// oxlint-disable-next-line typescript/no-explicit-any
|
||||
const synthesized = await (speechProvider as any).synthesize({
|
||||
const synthesized = await speechProvider.synthesize({
|
||||
text: "OpenClaw integration test OK.",
|
||||
cfg,
|
||||
config: ttsConfig,
|
||||
target: "audio-file",
|
||||
});
|
||||
|
||||
// oxlint-disable-next-line typescript/no-explicit-any
|
||||
const transcription = await (mediaProvider as any).transcribeAudio?.({
|
||||
const transcription = await mediaProvider.transcribeAudio?.({
|
||||
buffer: synthesized.audioBuffer,
|
||||
fileName: "openai-plugin-live.mp3",
|
||||
mime: "audio/mpeg",
|
||||
|
|
@ -435,8 +412,7 @@ describeLive("openai plugin live", () => {
|
|||
const agentDir = await createTempAgentDir();
|
||||
|
||||
try {
|
||||
// oxlint-disable-next-line typescript/no-explicit-any
|
||||
const generated = await (imageProvider as any).generateImage({
|
||||
const generated = await imageProvider.generateImage({
|
||||
provider: "openai",
|
||||
model: LIVE_IMAGE_MODEL,
|
||||
prompt: "Create a minimal flat orange square centered on a white background.",
|
||||
|
|
@ -464,8 +440,7 @@ describeLive("openai plugin live", () => {
|
|||
const agentDir = await createTempAgentDir();
|
||||
|
||||
try {
|
||||
// oxlint-disable-next-line typescript/no-explicit-any
|
||||
const description = await (mediaProvider as any).describeImage?.({
|
||||
const description = await mediaProvider.describeImage?.({
|
||||
buffer: createReferencePng(),
|
||||
fileName: "reference.png",
|
||||
mime: "image/png",
|
||||
|
|
|
|||
|
|
@ -1,6 +1,9 @@
|
|||
import { definePluginEntry } from "openclaw/plugin-sdk/plugin-entry";
|
||||
import { buildOpenAIImageGenerationProvider } from "./image-generation-provider.js";
|
||||
import { openaiMediaUnderstandingProvider } from "./media-understanding-provider.js";
|
||||
import {
|
||||
openaiCodexMediaUnderstandingProvider,
|
||||
openaiMediaUnderstandingProvider,
|
||||
} from "./media-understanding-provider.js";
|
||||
import { buildOpenAICodexProviderPlugin } from "./openai-codex-provider.js";
|
||||
import { buildOpenAIProvider } from "./openai-provider.js";
|
||||
import { buildOpenAISpeechProvider } from "./speech-provider.js";
|
||||
|
|
@ -14,6 +17,7 @@ export default definePluginEntry({
|
|||
api.registerProvider(buildOpenAICodexProviderPlugin());
|
||||
api.registerSpeechProvider(buildOpenAISpeechProvider());
|
||||
api.registerMediaUnderstandingProvider(openaiMediaUnderstandingProvider);
|
||||
api.registerMediaUnderstandingProvider(openaiCodexMediaUnderstandingProvider);
|
||||
api.registerImageGenerationProvider(buildOpenAIImageGenerationProvider());
|
||||
},
|
||||
});
|
||||
|
|
|
|||
|
|
@ -24,3 +24,10 @@ export const openaiMediaUnderstandingProvider: MediaUnderstandingProvider = {
|
|||
describeImages: describeImagesWithModel,
|
||||
transcribeAudio: transcribeOpenAiAudio,
|
||||
};
|
||||
|
||||
export const openaiCodexMediaUnderstandingProvider: MediaUnderstandingProvider = {
|
||||
id: "openai-codex",
|
||||
capabilities: ["image"],
|
||||
describeImage: describeImageWithModel,
|
||||
describeImages: describeImagesWithModel,
|
||||
};
|
||||
|
|
|
|||
|
|
@ -1,3 +1,4 @@
|
|||
import { AuthStorage, ModelRegistry } from "@mariozechner/pi-coding-agent";
|
||||
import OpenAI from "openai";
|
||||
import { describe, expect, it } from "vitest";
|
||||
import {
|
||||
|
|
@ -25,13 +26,7 @@ describe("openrouter plugin", () => {
|
|||
registerOpenRouterPlugin();
|
||||
|
||||
expect(providers).toHaveLength(1);
|
||||
expect(
|
||||
providers.map(
|
||||
(provider) =>
|
||||
// oxlint-disable-next-line typescript/no-explicit-any
|
||||
(provider as any).id,
|
||||
),
|
||||
).toEqual(["openrouter"]);
|
||||
expect(providers.map((provider) => provider.id)).toEqual(["openrouter"]);
|
||||
expect(speechProviders).toHaveLength(0);
|
||||
expect(mediaProviders).toHaveLength(0);
|
||||
expect(imageProviders).toHaveLength(0);
|
||||
|
|
@ -43,15 +38,10 @@ describeLive("openrouter plugin live", () => {
|
|||
const { providers } = registerOpenRouterPlugin();
|
||||
const provider = requireRegisteredProvider(providers, "openrouter");
|
||||
|
||||
// oxlint-disable-next-line typescript/no-explicit-any
|
||||
const resolved = (provider as any).resolveDynamicModel?.({
|
||||
const resolved = provider.resolveDynamicModel?.({
|
||||
provider: "openrouter",
|
||||
modelId: LIVE_MODEL_ID,
|
||||
modelRegistry: {
|
||||
find() {
|
||||
return null;
|
||||
},
|
||||
},
|
||||
modelRegistry: new ModelRegistry(AuthStorage.inMemory()),
|
||||
});
|
||||
if (!resolved) {
|
||||
throw new Error(`openrouter provider did not resolve ${LIVE_MODEL_ID}`);
|
||||
|
|
|
|||
|
|
@ -177,6 +177,67 @@ describe("describeImageWithModel", () => {
|
|||
expect(minimaxUnderstandImageMock).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it("passes image prompt as system instructions for codex image requests", async () => {
|
||||
discoverModelsMock.mockReturnValue({
|
||||
find: vi.fn(() => ({
|
||||
provider: "openai-codex",
|
||||
id: "gpt-5.4",
|
||||
input: ["text", "image"],
|
||||
baseUrl: "https://chatgpt.com/backend-api",
|
||||
})),
|
||||
});
|
||||
completeMock.mockResolvedValue({
|
||||
role: "assistant",
|
||||
api: "openai-codex-responses",
|
||||
provider: "openai-codex",
|
||||
model: "gpt-5.4",
|
||||
stopReason: "stop",
|
||||
timestamp: Date.now(),
|
||||
content: [{ type: "text", text: "codex ok" }],
|
||||
});
|
||||
|
||||
const result = await describeImageWithModel({
|
||||
cfg: {},
|
||||
agentDir: "/tmp/openclaw-agent",
|
||||
provider: "openai-codex",
|
||||
model: "gpt-5.4",
|
||||
buffer: Buffer.from("png-bytes"),
|
||||
fileName: "image.png",
|
||||
mime: "image/png",
|
||||
prompt: "Describe the image.",
|
||||
timeoutMs: 1000,
|
||||
});
|
||||
|
||||
expect(result).toEqual({
|
||||
text: "codex ok",
|
||||
model: "gpt-5.4",
|
||||
});
|
||||
expect(completeMock).toHaveBeenCalledOnce();
|
||||
expect(completeMock).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
provider: "openai-codex",
|
||||
id: "gpt-5.4",
|
||||
}),
|
||||
expect.objectContaining({
|
||||
systemPrompt: "Describe the image.",
|
||||
messages: [
|
||||
expect.objectContaining({
|
||||
role: "user",
|
||||
content: [
|
||||
expect.objectContaining({
|
||||
type: "image",
|
||||
mimeType: "image/png",
|
||||
}),
|
||||
],
|
||||
}),
|
||||
],
|
||||
}),
|
||||
expect.any(Object),
|
||||
);
|
||||
const [, context] = completeMock.mock.calls[0] ?? [];
|
||||
expect(context?.messages?.[0]?.content).toHaveLength(1);
|
||||
});
|
||||
|
||||
it("normalizes deprecated google flash ids before lookup and keeps profile auth selection", async () => {
|
||||
const findMock = vi.fn((provider: string, modelId: string) => {
|
||||
expect(provider).toBe("google");
|
||||
|
|
|
|||
|
|
@ -73,17 +73,15 @@ function buildImageContext(
|
|||
images: Array<{ buffer: Buffer; mime?: string }>,
|
||||
): Context {
|
||||
return {
|
||||
systemPrompt: prompt,
|
||||
messages: [
|
||||
{
|
||||
role: "user",
|
||||
content: [
|
||||
{ type: "text", text: prompt },
|
||||
...images.map((image) => ({
|
||||
type: "image" as const,
|
||||
data: image.buffer.toString("base64"),
|
||||
mimeType: image.mime ?? "image/jpeg",
|
||||
})),
|
||||
],
|
||||
content: images.map((image) => ({
|
||||
type: "image" as const,
|
||||
data: image.buffer.toString("base64"),
|
||||
mimeType: image.mime ?? "image/jpeg",
|
||||
})),
|
||||
timestamp: Date.now(),
|
||||
},
|
||||
],
|
||||
|
|
|
|||
|
|
@ -184,7 +184,10 @@ describe("plugin contract registry", () => {
|
|||
]);
|
||||
expect(findMediaUnderstandingProviderIdsForPlugin("mistral")).toEqual(["mistral"]);
|
||||
expect(findMediaUnderstandingProviderIdsForPlugin("moonshot")).toEqual(["moonshot"]);
|
||||
expect(findMediaUnderstandingProviderIdsForPlugin("openai")).toEqual(["openai"]);
|
||||
expect(findMediaUnderstandingProviderIdsForPlugin("openai")).toEqual([
|
||||
"openai",
|
||||
"openai-codex",
|
||||
]);
|
||||
expect(findMediaUnderstandingProviderIdsForPlugin("zai")).toEqual(["zai"]);
|
||||
});
|
||||
|
||||
|
|
@ -244,7 +247,7 @@ describe("plugin contract registry", () => {
|
|||
expect(findRegistrationForPlugin("openai")).toMatchObject({
|
||||
providerIds: ["openai", "openai-codex"],
|
||||
speechProviderIds: ["openai"],
|
||||
mediaUnderstandingProviderIds: ["openai"],
|
||||
mediaUnderstandingProviderIds: ["openai", "openai-codex"],
|
||||
imageGenerationProviderIds: ["openai"],
|
||||
videoGenerationProviderIds: [],
|
||||
});
|
||||
|
|
|
|||
|
|
@ -1,10 +1,16 @@
|
|||
import type {
|
||||
ImageGenerationProviderPlugin,
|
||||
MediaUnderstandingProviderPlugin,
|
||||
ProviderPlugin,
|
||||
SpeechProviderPlugin,
|
||||
} from "../../../src/plugins/types.js";
|
||||
import { createTestPluginApi } from "./plugin-api.js";
|
||||
|
||||
type RegisteredProviderCollections = {
|
||||
providers: unknown[];
|
||||
speechProviders: unknown[];
|
||||
mediaProviders: unknown[];
|
||||
imageProviders: unknown[];
|
||||
providers: ProviderPlugin[];
|
||||
speechProviders: SpeechProviderPlugin[];
|
||||
mediaProviders: MediaUnderstandingProviderPlugin[];
|
||||
imageProviders: ImageGenerationProviderPlugin[];
|
||||
};
|
||||
|
||||
type ProviderPluginModule = {
|
||||
|
|
@ -16,10 +22,10 @@ export function registerProviderPlugin(params: {
|
|||
id: string;
|
||||
name: string;
|
||||
}): RegisteredProviderCollections {
|
||||
const providers: unknown[] = [];
|
||||
const speechProviders: unknown[] = [];
|
||||
const mediaProviders: unknown[] = [];
|
||||
const imageProviders: unknown[] = [];
|
||||
const providers: ProviderPlugin[] = [];
|
||||
const speechProviders: SpeechProviderPlugin[] = [];
|
||||
const mediaProviders: MediaUnderstandingProviderPlugin[] = [];
|
||||
const imageProviders: ImageGenerationProviderPlugin[] = [];
|
||||
|
||||
params.plugin.register(
|
||||
createTestPluginApi({
|
||||
|
|
@ -46,18 +52,14 @@ export function registerProviderPlugin(params: {
|
|||
return { providers, speechProviders, mediaProviders, imageProviders };
|
||||
}
|
||||
|
||||
export function requireRegisteredProvider<T = unknown>(
|
||||
entries: unknown[],
|
||||
export function requireRegisteredProvider<T extends { id: string }>(
|
||||
entries: T[],
|
||||
id: string,
|
||||
label = "provider",
|
||||
): T {
|
||||
const entry = entries.find(
|
||||
(candidate) =>
|
||||
// oxlint-disable-next-line typescript/no-explicit-any
|
||||
(candidate as any).id === id,
|
||||
);
|
||||
const entry = entries.find((candidate) => candidate.id === id);
|
||||
if (!entry) {
|
||||
throw new Error(`${label} ${id} was not registered`);
|
||||
}
|
||||
return entry as T;
|
||||
return entry;
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue