diff --git a/CHANGELOG.md b/CHANGELOG.md index ebd2f6b9639..f8753f65170 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,7 @@ Docs: https://docs.openclaw.ai ### Fixes +- OpenAI Codex/image tools: register Codex for media understanding and route image prompts through Codex instructions so image analysis no longer fails on missing provider registration or missing `instructions`. (#54829) Thanks @neeravmakwana. - Telegram: deliver verbose tool summaries inside forum topic sessions again, so threaded topic chats now match DM verbose behavior. (#43236) Thanks @frankbuild. - Agents/sandbox: honor `tools.sandbox.tools.alsoAllow`, let explicit sandbox re-allows remove matching built-in default-deny tools, and keep sandbox explain/error guidance aligned with the effective sandbox tool policy. (#54492) Thanks @ngutman. - Agents/sandbox: make blocked-tool guidance glob-aware again, redact/sanitize session-specific explain hints for safer copy-paste, and avoid leaking control-character session keys in those hints. (#54684) Thanks @ngutman. diff --git a/extensions/openai/index.test.ts b/extensions/openai/index.test.ts index 0b1d6f4bfb2..fec3b9416c2 100644 --- a/extensions/openai/index.test.ts +++ b/extensions/openai/index.test.ts @@ -1,6 +1,8 @@ import fs from "node:fs/promises"; import os from "node:os"; import path from "node:path"; +import { getModel } from "@mariozechner/pi-ai"; +import { AuthStorage, ModelRegistry } from "@mariozechner/pi-coding-agent"; import OpenAI from "openai"; import * as providerAuth from "openclaw/plugin-sdk/provider-auth"; import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; @@ -38,52 +40,42 @@ const liveEnabled = OPENAI_API_KEY.trim().length > 0 && process.env.OPENCLAW_LIV const describeLive = liveEnabled ? describe : describe.skip; const EMPTY_AUTH_STORE = { version: 1, profiles: {} } as const; -function createTemplateModel(modelId: string) { +function resolveTemplateModelId(modelId: string) { switch (modelId) { case "gpt-5.4": - return { - id: "gpt-5.2", - name: "GPT-5.2", - provider: "openai", - api: "openai-completions", - baseUrl: "https://api.openai.com/v1", - reasoning: true, - input: ["text", "image"], - cost: { input: 1, output: 2, cacheRead: 0, cacheWrite: 0 }, - contextWindow: 400_000, - maxTokens: 128_000, - }; + return "gpt-5.2"; case "gpt-5.4-mini": - return { - id: "gpt-5-mini", - name: "GPT-5 mini", - provider: "openai", - api: "openai-completions", - baseUrl: "https://api.openai.com/v1", - reasoning: true, - input: ["text", "image"], - cost: { input: 1, output: 2, cacheRead: 0, cacheWrite: 0 }, - contextWindow: 400_000, - maxTokens: 128_000, - }; + return "gpt-5-mini"; case "gpt-5.4-nano": - return { - id: "gpt-5-nano", - name: "GPT-5 nano", - provider: "openai", - api: "openai-completions", - baseUrl: "https://api.openai.com/v1", - reasoning: true, - input: ["text", "image"], - cost: { input: 0.5, output: 1, cacheRead: 0, cacheWrite: 0 }, - contextWindow: 200_000, - maxTokens: 64_000, - }; + return "gpt-5-nano"; default: throw new Error(`Unsupported live OpenAI plugin model: ${modelId}`); } } +function createTemplateModelRegistry(modelId: string): ModelRegistry { + const registry = new ModelRegistry(AuthStorage.inMemory()); + const template = getModel("openai", resolveTemplateModelId(modelId)); + registry.registerProvider("openai", { + apiKey: "test", + baseUrl: template.baseUrl, + models: [ + { + id: template.id, + name: template.name, + api: template.api, + reasoning: template.reasoning, + input: template.input, + cost: template.cost, + contextWindow: template.contextWindow, + maxTokens: template.maxTokens, + ...(template.compat ? { compat: template.compat } : {}), + }, + ], + }); + return registry; +} + const registerOpenAIPlugin = () => registerProviderPlugin({ plugin, @@ -193,15 +185,15 @@ describe("openai plugin", () => { const { providers, speechProviders, mediaProviders, imageProviders } = registerOpenAIPlugin(); expect(providers).toHaveLength(2); - expect( - providers.map( - (provider) => - // oxlint-disable-next-line typescript/no-explicit-any - (provider as any).id, - ), - ).toEqual(["openai", "openai-codex"]); + expect(providers.map((provider) => provider.id)).toEqual(["openai", "openai-codex"]); expect(speechProviders).toHaveLength(1); - expect(mediaProviders).toHaveLength(1); + expect(mediaProviders.map((provider) => provider.id)).toEqual(["openai", "openai-codex"]); + const codexMediaProvider = requireRegisteredProvider( + mediaProviders, + "openai-codex", + "media provider", + ); + expect(codexMediaProvider.capabilities).toEqual(["image"]); expect(imageProviders).toHaveLength(1); }); @@ -314,27 +306,17 @@ describeLive("openai plugin live", () => { const { providers } = registerOpenAIPlugin(); const provider = requireRegisteredProvider(providers, "openai"); - // oxlint-disable-next-line typescript/no-explicit-any - const resolved = (provider as any).resolveDynamicModel?.({ + const resolved = provider.resolveDynamicModel?.({ provider: "openai", modelId: LIVE_MODEL_ID, - modelRegistry: { - find(providerId: string, id: string) { - if (providerId !== "openai") { - return null; - } - const template = createTemplateModel(LIVE_MODEL_ID); - return id === template.id ? template : null; - }, - }, + modelRegistry: createTemplateModelRegistry(LIVE_MODEL_ID), }); if (!resolved) { throw new Error("openai provider did not resolve the live model"); } - // oxlint-disable-next-line typescript/no-explicit-any - const normalized = (provider as any).normalizeResolvedModel?.({ + const normalized = provider.normalizeResolvedModel?.({ provider: "openai", modelId: resolved.id, model: resolved, @@ -364,8 +346,7 @@ describeLive("openai plugin live", () => { const { speechProviders } = registerOpenAIPlugin(); const speechProvider = requireRegisteredProvider(speechProviders, "openai"); - // oxlint-disable-next-line typescript/no-explicit-any - const voices = await (speechProvider as any).listVoices?.({}); + const voices = await speechProvider.listVoices?.({}); if (!voices) { throw new Error("openai speech provider did not return voices"); } @@ -374,8 +355,7 @@ describeLive("openai plugin live", () => { const cfg = createLiveConfig(); const ttsConfig = createLiveTtsConfig(); - // oxlint-disable-next-line typescript/no-explicit-any - const audioFile = await (speechProvider as any).synthesize({ + const audioFile = await speechProvider.synthesize({ text: "OpenClaw integration test OK.", cfg, config: ttsConfig, @@ -385,8 +365,7 @@ describeLive("openai plugin live", () => { expect(audioFile.fileExtension).toBe(".mp3"); expect(audioFile.audioBuffer.byteLength).toBeGreaterThan(512); - // oxlint-disable-next-line typescript/no-explicit-any - const telephony = await (speechProvider as any).synthesizeTelephony?.({ + const telephony = await speechProvider.synthesizeTelephony?.({ text: "Telephony check OK.", cfg, config: ttsConfig, @@ -404,16 +383,14 @@ describeLive("openai plugin live", () => { const cfg = createLiveConfig(); const ttsConfig = createLiveTtsConfig(); - // oxlint-disable-next-line typescript/no-explicit-any - const synthesized = await (speechProvider as any).synthesize({ + const synthesized = await speechProvider.synthesize({ text: "OpenClaw integration test OK.", cfg, config: ttsConfig, target: "audio-file", }); - // oxlint-disable-next-line typescript/no-explicit-any - const transcription = await (mediaProvider as any).transcribeAudio?.({ + const transcription = await mediaProvider.transcribeAudio?.({ buffer: synthesized.audioBuffer, fileName: "openai-plugin-live.mp3", mime: "audio/mpeg", @@ -435,8 +412,7 @@ describeLive("openai plugin live", () => { const agentDir = await createTempAgentDir(); try { - // oxlint-disable-next-line typescript/no-explicit-any - const generated = await (imageProvider as any).generateImage({ + const generated = await imageProvider.generateImage({ provider: "openai", model: LIVE_IMAGE_MODEL, prompt: "Create a minimal flat orange square centered on a white background.", @@ -464,8 +440,7 @@ describeLive("openai plugin live", () => { const agentDir = await createTempAgentDir(); try { - // oxlint-disable-next-line typescript/no-explicit-any - const description = await (mediaProvider as any).describeImage?.({ + const description = await mediaProvider.describeImage?.({ buffer: createReferencePng(), fileName: "reference.png", mime: "image/png", diff --git a/extensions/openai/index.ts b/extensions/openai/index.ts index 51485252fc9..fb048d29243 100644 --- a/extensions/openai/index.ts +++ b/extensions/openai/index.ts @@ -1,6 +1,9 @@ import { definePluginEntry } from "openclaw/plugin-sdk/plugin-entry"; import { buildOpenAIImageGenerationProvider } from "./image-generation-provider.js"; -import { openaiMediaUnderstandingProvider } from "./media-understanding-provider.js"; +import { + openaiCodexMediaUnderstandingProvider, + openaiMediaUnderstandingProvider, +} from "./media-understanding-provider.js"; import { buildOpenAICodexProviderPlugin } from "./openai-codex-provider.js"; import { buildOpenAIProvider } from "./openai-provider.js"; import { buildOpenAISpeechProvider } from "./speech-provider.js"; @@ -14,6 +17,7 @@ export default definePluginEntry({ api.registerProvider(buildOpenAICodexProviderPlugin()); api.registerSpeechProvider(buildOpenAISpeechProvider()); api.registerMediaUnderstandingProvider(openaiMediaUnderstandingProvider); + api.registerMediaUnderstandingProvider(openaiCodexMediaUnderstandingProvider); api.registerImageGenerationProvider(buildOpenAIImageGenerationProvider()); }, }); diff --git a/extensions/openai/media-understanding-provider.ts b/extensions/openai/media-understanding-provider.ts index 9f7c7001935..9b9cd416749 100644 --- a/extensions/openai/media-understanding-provider.ts +++ b/extensions/openai/media-understanding-provider.ts @@ -24,3 +24,10 @@ export const openaiMediaUnderstandingProvider: MediaUnderstandingProvider = { describeImages: describeImagesWithModel, transcribeAudio: transcribeOpenAiAudio, }; + +export const openaiCodexMediaUnderstandingProvider: MediaUnderstandingProvider = { + id: "openai-codex", + capabilities: ["image"], + describeImage: describeImageWithModel, + describeImages: describeImagesWithModel, +}; diff --git a/extensions/openrouter/index.test.ts b/extensions/openrouter/index.test.ts index e39e175cfd6..a7bc542241a 100644 --- a/extensions/openrouter/index.test.ts +++ b/extensions/openrouter/index.test.ts @@ -1,3 +1,4 @@ +import { AuthStorage, ModelRegistry } from "@mariozechner/pi-coding-agent"; import OpenAI from "openai"; import { describe, expect, it } from "vitest"; import { @@ -25,13 +26,7 @@ describe("openrouter plugin", () => { registerOpenRouterPlugin(); expect(providers).toHaveLength(1); - expect( - providers.map( - (provider) => - // oxlint-disable-next-line typescript/no-explicit-any - (provider as any).id, - ), - ).toEqual(["openrouter"]); + expect(providers.map((provider) => provider.id)).toEqual(["openrouter"]); expect(speechProviders).toHaveLength(0); expect(mediaProviders).toHaveLength(0); expect(imageProviders).toHaveLength(0); @@ -43,15 +38,10 @@ describeLive("openrouter plugin live", () => { const { providers } = registerOpenRouterPlugin(); const provider = requireRegisteredProvider(providers, "openrouter"); - // oxlint-disable-next-line typescript/no-explicit-any - const resolved = (provider as any).resolveDynamicModel?.({ + const resolved = provider.resolveDynamicModel?.({ provider: "openrouter", modelId: LIVE_MODEL_ID, - modelRegistry: { - find() { - return null; - }, - }, + modelRegistry: new ModelRegistry(AuthStorage.inMemory()), }); if (!resolved) { throw new Error(`openrouter provider did not resolve ${LIVE_MODEL_ID}`); diff --git a/src/media-understanding/image.test.ts b/src/media-understanding/image.test.ts index 5a93e7b59cf..83f8dba6ee2 100644 --- a/src/media-understanding/image.test.ts +++ b/src/media-understanding/image.test.ts @@ -177,6 +177,67 @@ describe("describeImageWithModel", () => { expect(minimaxUnderstandImageMock).not.toHaveBeenCalled(); }); + it("passes image prompt as system instructions for codex image requests", async () => { + discoverModelsMock.mockReturnValue({ + find: vi.fn(() => ({ + provider: "openai-codex", + id: "gpt-5.4", + input: ["text", "image"], + baseUrl: "https://chatgpt.com/backend-api", + })), + }); + completeMock.mockResolvedValue({ + role: "assistant", + api: "openai-codex-responses", + provider: "openai-codex", + model: "gpt-5.4", + stopReason: "stop", + timestamp: Date.now(), + content: [{ type: "text", text: "codex ok" }], + }); + + const result = await describeImageWithModel({ + cfg: {}, + agentDir: "/tmp/openclaw-agent", + provider: "openai-codex", + model: "gpt-5.4", + buffer: Buffer.from("png-bytes"), + fileName: "image.png", + mime: "image/png", + prompt: "Describe the image.", + timeoutMs: 1000, + }); + + expect(result).toEqual({ + text: "codex ok", + model: "gpt-5.4", + }); + expect(completeMock).toHaveBeenCalledOnce(); + expect(completeMock).toHaveBeenCalledWith( + expect.objectContaining({ + provider: "openai-codex", + id: "gpt-5.4", + }), + expect.objectContaining({ + systemPrompt: "Describe the image.", + messages: [ + expect.objectContaining({ + role: "user", + content: [ + expect.objectContaining({ + type: "image", + mimeType: "image/png", + }), + ], + }), + ], + }), + expect.any(Object), + ); + const [, context] = completeMock.mock.calls[0] ?? []; + expect(context?.messages?.[0]?.content).toHaveLength(1); + }); + it("normalizes deprecated google flash ids before lookup and keeps profile auth selection", async () => { const findMock = vi.fn((provider: string, modelId: string) => { expect(provider).toBe("google"); diff --git a/src/media-understanding/image.ts b/src/media-understanding/image.ts index e79c99f359a..21ea1bd5fd3 100644 --- a/src/media-understanding/image.ts +++ b/src/media-understanding/image.ts @@ -73,17 +73,15 @@ function buildImageContext( images: Array<{ buffer: Buffer; mime?: string }>, ): Context { return { + systemPrompt: prompt, messages: [ { role: "user", - content: [ - { type: "text", text: prompt }, - ...images.map((image) => ({ - type: "image" as const, - data: image.buffer.toString("base64"), - mimeType: image.mime ?? "image/jpeg", - })), - ], + content: images.map((image) => ({ + type: "image" as const, + data: image.buffer.toString("base64"), + mimeType: image.mime ?? "image/jpeg", + })), timestamp: Date.now(), }, ], diff --git a/src/plugins/contracts/registry.contract.test.ts b/src/plugins/contracts/registry.contract.test.ts index 3d8e8ef3ca3..da9fa49580d 100644 --- a/src/plugins/contracts/registry.contract.test.ts +++ b/src/plugins/contracts/registry.contract.test.ts @@ -184,7 +184,10 @@ describe("plugin contract registry", () => { ]); expect(findMediaUnderstandingProviderIdsForPlugin("mistral")).toEqual(["mistral"]); expect(findMediaUnderstandingProviderIdsForPlugin("moonshot")).toEqual(["moonshot"]); - expect(findMediaUnderstandingProviderIdsForPlugin("openai")).toEqual(["openai"]); + expect(findMediaUnderstandingProviderIdsForPlugin("openai")).toEqual([ + "openai", + "openai-codex", + ]); expect(findMediaUnderstandingProviderIdsForPlugin("zai")).toEqual(["zai"]); }); @@ -244,7 +247,7 @@ describe("plugin contract registry", () => { expect(findRegistrationForPlugin("openai")).toMatchObject({ providerIds: ["openai", "openai-codex"], speechProviderIds: ["openai"], - mediaUnderstandingProviderIds: ["openai"], + mediaUnderstandingProviderIds: ["openai", "openai-codex"], imageGenerationProviderIds: ["openai"], videoGenerationProviderIds: [], }); diff --git a/test/helpers/extensions/provider-registration.ts b/test/helpers/extensions/provider-registration.ts index b03cbe54041..37c267d6e9a 100644 --- a/test/helpers/extensions/provider-registration.ts +++ b/test/helpers/extensions/provider-registration.ts @@ -1,10 +1,16 @@ +import type { + ImageGenerationProviderPlugin, + MediaUnderstandingProviderPlugin, + ProviderPlugin, + SpeechProviderPlugin, +} from "../../../src/plugins/types.js"; import { createTestPluginApi } from "./plugin-api.js"; type RegisteredProviderCollections = { - providers: unknown[]; - speechProviders: unknown[]; - mediaProviders: unknown[]; - imageProviders: unknown[]; + providers: ProviderPlugin[]; + speechProviders: SpeechProviderPlugin[]; + mediaProviders: MediaUnderstandingProviderPlugin[]; + imageProviders: ImageGenerationProviderPlugin[]; }; type ProviderPluginModule = { @@ -16,10 +22,10 @@ export function registerProviderPlugin(params: { id: string; name: string; }): RegisteredProviderCollections { - const providers: unknown[] = []; - const speechProviders: unknown[] = []; - const mediaProviders: unknown[] = []; - const imageProviders: unknown[] = []; + const providers: ProviderPlugin[] = []; + const speechProviders: SpeechProviderPlugin[] = []; + const mediaProviders: MediaUnderstandingProviderPlugin[] = []; + const imageProviders: ImageGenerationProviderPlugin[] = []; params.plugin.register( createTestPluginApi({ @@ -46,18 +52,14 @@ export function registerProviderPlugin(params: { return { providers, speechProviders, mediaProviders, imageProviders }; } -export function requireRegisteredProvider( - entries: unknown[], +export function requireRegisteredProvider( + entries: T[], id: string, label = "provider", ): T { - const entry = entries.find( - (candidate) => - // oxlint-disable-next-line typescript/no-explicit-any - (candidate as any).id === id, - ); + const entry = entries.find((candidate) => candidate.id === id); if (!entry) { throw new Error(`${label} ${id} was not registered`); } - return entry as T; + return entry; }