diff --git a/CHANGELOG.md b/CHANGELOG.md index 3bcc0b955bb..ebd2f6b9639 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,7 @@ Docs: https://docs.openclaw.ai - Telegram/pairing: ignore self-authored DM `message` updates so bot-pinned status cards and similar service updates do not trigger bogus pairing requests or re-enter inbound dispatch. (#54530) thanks @huntharo - iMessage: stop leaking inline `[[reply_to:...]]` tags into delivered text by sending `reply_to` as RPC metadata and stripping stray directive tags from outbound messages. (#39512) Thanks @mvanhorn. - Agents/embedded replies: surface mid-turn 429 and overload failures when embedded runs end without a user-visible reply, while preserving successful media-only replies that still use legacy `mediaUrl`. (#50930) Thanks @infichen. +- Agents/image tool: restore the generic image-runtime fallback when no provider-specific media-understanding provider is registered, so image analysis works again for providers like `openrouter` and `minimax-portal`. (#54858) Thanks @MonkeyLeeT. - Agents/compaction: trigger timeout recovery compaction before retrying high-context LLM timeouts so embedded runs stop repeating oversized requests. (#46417) thanks @joeykrug. - Microsoft Teams/config: accept the existing `welcomeCard`, `groupWelcomeCard`, `promptStarters`, and feedback/reflection keys in strict config validation so already-supported Teams runtime settings stop failing schema checks. (#54679) Thanks @gumclaw. - CLI/plugins: make routed commands use the same auto-enabled bundled-channel snapshot as gateway startup, so configured bundled channels like Slack load without requiring a prior config rewrite. (#54809) Thanks @neeravmakwana. diff --git a/src/agents/tools/image-tool.test.ts b/src/agents/tools/image-tool.test.ts index 9486a8b87e6..47791dc9626 100644 --- a/src/agents/tools/image-tool.test.ts +++ b/src/agents/tools/image-tool.test.ts @@ -222,45 +222,46 @@ function stubMinimaxFetch(baseResp: { status_code: number; status_msg: string }, } function stubOpenAiCompletionsOkFetch(text = "ok") { - const fetch = vi.fn().mockResolvedValue( - new Response( - new ReadableStream({ - start(controller) { - const encoder = new TextEncoder(); - const chunks = [ - `data: ${JSON.stringify({ - id: "chatcmpl-moonshot-test", - object: "chat.completion.chunk", - created: Math.floor(Date.now() / 1000), - model: "kimi-k2.5", - choices: [ - { - index: 0, - delta: { role: "assistant", content: text }, - finish_reason: null, - }, - ], - })}\n\n`, - `data: ${JSON.stringify({ - id: "chatcmpl-moonshot-test", - object: "chat.completion.chunk", - created: Math.floor(Date.now() / 1000), - model: "kimi-k2.5", - choices: [{ index: 0, delta: {}, finish_reason: "stop" }], - })}\n\n`, - "data: [DONE]\n\n", - ]; - for (const chunk of chunks) { - controller.enqueue(encoder.encode(chunk)); - } - controller.close(); + const fetch = vi.fn().mockImplementation( + async () => + new Response( + new ReadableStream({ + start(controller) { + const encoder = new TextEncoder(); + const chunks = [ + `data: ${JSON.stringify({ + id: "chatcmpl-moonshot-test", + object: "chat.completion.chunk", + created: Math.floor(Date.now() / 1000), + model: "kimi-k2.5", + choices: [ + { + index: 0, + delta: { role: "assistant", content: text }, + finish_reason: null, + }, + ], + })}\n\n`, + `data: ${JSON.stringify({ + id: "chatcmpl-moonshot-test", + object: "chat.completion.chunk", + created: Math.floor(Date.now() / 1000), + model: "kimi-k2.5", + choices: [{ index: 0, delta: {}, finish_reason: "stop" }], + })}\n\n`, + "data: [DONE]\n\n", + ]; + for (const chunk of chunks) { + controller.enqueue(encoder.encode(chunk)); + } + controller.close(); + }, + }), + { + status: 200, + headers: { "content-type": "text/event-stream" }, }, - }), - { - status: 200, - headers: { "content-type": "text/event-stream" }, - }, - ), + ), ); global.fetch = withFetchPreconnect(fetch); return fetch; @@ -705,6 +706,110 @@ describe("image tool implicit imageModel config", () => { }); }); + it("falls back to the generic image runtime when openrouter has no media provider registration", async () => { + await withTempAgentDir(async (agentDir) => { + const fetch = stubOpenAiCompletionsOkFetch("ok openrouter"); + const cfg: OpenClawConfig = { + agents: { + defaults: { + model: { primary: "openrouter/google/gemini-2.5-flash-lite" }, + imageModel: { primary: "openrouter/google/gemini-2.5-flash-lite" }, + }, + }, + models: { + providers: { + openrouter: { + api: "openai-completions", + baseUrl: "https://openrouter.ai/api/v1", + apiKey: "openrouter-test", + models: [makeModelDefinition("google/gemini-2.5-flash-lite", ["text", "image"])], + }, + }, + }, + }; + + const tool = requireImageTool(createImageTool({ config: cfg, agentDir })); + const result = await tool.execute("t1", { + prompt: "Describe the image.", + image: `data:image/png;base64,${ONE_PIXEL_PNG_B64}`, + }); + + expect(fetch).toHaveBeenCalledTimes(1); + expect(result.content).toEqual( + expect.arrayContaining([expect.objectContaining({ type: "text", text: "ok openrouter" })]), + ); + }); + }); + + it("falls back to the generic multi-image runtime when openrouter has no media provider registration", async () => { + await withTempAgentDir(async (agentDir) => { + const fetch = stubOpenAiCompletionsOkFetch("ok multi"); + const cfg: OpenClawConfig = { + agents: { + defaults: { + model: { primary: "openrouter/google/gemini-2.5-flash-lite" }, + imageModel: { primary: "openrouter/google/gemini-2.5-flash-lite" }, + }, + }, + models: { + providers: { + openrouter: { + api: "openai-completions", + baseUrl: "https://openrouter.ai/api/v1", + apiKey: "openrouter-test", + models: [makeModelDefinition("google/gemini-2.5-flash-lite", ["text", "image"])], + }, + }, + }, + }; + + const tool = requireImageTool(createImageTool({ config: cfg, agentDir })); + const result = await tool.execute("t1", { + prompt: "Describe the images.", + images: [ + `data:image/png;base64,${ONE_PIXEL_PNG_B64}`, + `data:image/png;base64,${ONE_PIXEL_PNG_B64}`, + ], + }); + + expect(fetch).toHaveBeenCalledTimes(1); + expect(result.content).toEqual( + expect.arrayContaining([expect.objectContaining({ type: "text", text: "ok multi" })]), + ); + }); + }); + + it("falls back to the generic image runtime when minimax-portal has no media provider registration", async () => { + await withTempAgentDir(async (agentDir) => { + installImageUnderstandingProviderStubs(); + await writeAuthProfiles(agentDir, { + version: 1, + profiles: { + "minimax-portal:default": { + type: "oauth", + provider: "minimax-portal", + access: "oauth-test", + refresh: "refresh-test", + expires: Date.now() + 60_000, + }, + }, + }); + const fetch = stubMinimaxOkFetch(); + const cfg: OpenClawConfig = { + agents: { + defaults: { + model: { primary: "minimax-portal/MiniMax-M2.7" }, + imageModel: { primary: "minimax-portal/MiniMax-VL-01" }, + }, + }, + }; + + const tool = requireImageTool(createImageTool({ config: cfg, agentDir })); + await expectImageToolExecOk(tool, `data:image/png;base64,${ONE_PIXEL_PNG_B64}`); + expect(fetch).toHaveBeenCalledTimes(1); + }); + }); + it("exposes an Anthropic-safe image schema without union keywords", async () => { await withMinimaxImageToolFromTempAgentDir(async (tool) => { const violations = findSchemaUnionKeywords(tool.parameters, "image.parameters"); diff --git a/src/agents/tools/image-tool.ts b/src/agents/tools/image-tool.ts index d0a76e1ac07..98870dc6e82 100644 --- a/src/agents/tools/image-tool.ts +++ b/src/agents/tools/image-tool.ts @@ -3,7 +3,11 @@ import type { OpenClawConfig } from "../../config/config.js"; import { getMediaUnderstandingProvider } from "../../media-understanding/provider-registry.js"; import { buildProviderRegistry } from "../../media-understanding/runner.js"; import { loadWebMedia } from "../../media/web-media.js"; -import type { MediaUnderstandingProvider } from "../../plugin-sdk/media-understanding.js"; +import { + describeImageWithModel, + describeImagesWithModel, + type MediaUnderstandingProvider, +} from "../../plugin-sdk/media-understanding.js"; import { resolveUserPath } from "../../utils.js"; import { isMinimaxVlmProvider } from "../minimax-vlm.js"; import { @@ -164,11 +168,12 @@ async function runImagePrompt(params: { provider, providerRegistry as Map, ); - if (!imageProvider) { - throw new Error(`No media-understanding provider registered for ${provider}`); - } - if (params.images.length > 1 && imageProvider.describeImages) { - const described = await imageProvider.describeImages({ + if ( + params.images.length > 1 && + (imageProvider?.describeImages || !imageProvider?.describeImage) + ) { + const describeImages = imageProvider?.describeImages ?? describeImagesWithModel; + const described = await describeImages({ images: params.images.map((image, index) => ({ buffer: image.buffer, fileName: `image-${index + 1}`, @@ -184,12 +189,10 @@ async function runImagePrompt(params: { }); return { text: described.text, provider, model: described.model ?? modelId }; } - if (!imageProvider.describeImage) { - throw new Error(`Provider does not support image analysis: ${provider}`); - } + const describeImage = imageProvider?.describeImage ?? describeImageWithModel; if (params.images.length === 1) { const image = params.images[0]; - const described = await imageProvider.describeImage({ + const described = await describeImage({ buffer: image.buffer, fileName: "image-1", mime: image.mimeType, @@ -206,7 +209,7 @@ async function runImagePrompt(params: { const parts: string[] = []; for (const [index, image] of params.images.entries()) { - const described = await imageProvider.describeImage({ + const described = await describeImage({ buffer: image.buffer, fileName: `image-${index + 1}`, mime: image.mimeType,