diff --git a/extensions/anthropic/media-understanding-provider.ts b/extensions/anthropic/media-understanding-provider.ts index 68a95c93546..34732e2c7e3 100644 --- a/extensions/anthropic/media-understanding-provider.ts +++ b/extensions/anthropic/media-understanding-provider.ts @@ -7,6 +7,9 @@ import { export const anthropicMediaUnderstandingProvider: MediaUnderstandingProvider = { id: "anthropic", capabilities: ["image"], + defaultModels: { image: "claude-opus-4-6" }, + autoPriority: { image: 20 }, + nativeDocumentInputs: ["pdf"], describeImage: describeImageWithModel, describeImages: describeImagesWithModel, }; diff --git a/extensions/deepgram/media-understanding-provider.ts b/extensions/deepgram/media-understanding-provider.ts index 6bb4a8792be..144d059fac7 100644 --- a/extensions/deepgram/media-understanding-provider.ts +++ b/extensions/deepgram/media-understanding-provider.ts @@ -4,5 +4,7 @@ import { transcribeDeepgramAudio } from "./audio.js"; export const deepgramMediaUnderstandingProvider: MediaUnderstandingProvider = { id: "deepgram", capabilities: ["audio"], + defaultModels: { audio: "nova-3" }, + autoPriority: { audio: 30 }, transcribeAudio: transcribeDeepgramAudio, }; diff --git a/extensions/google/media-understanding-provider.ts b/extensions/google/media-understanding-provider.ts index d3a7f1c2ee1..20ca3353fb8 100644 --- a/extensions/google/media-understanding-provider.ts +++ b/extensions/google/media-understanding-provider.ts @@ -155,6 +155,13 @@ export async function describeGeminiVideo( export const googleMediaUnderstandingProvider: MediaUnderstandingProvider = { id: "google", capabilities: ["image", "audio", "video"], + defaultModels: { + image: DEFAULT_GOOGLE_VIDEO_MODEL, + audio: DEFAULT_GOOGLE_AUDIO_MODEL, + video: DEFAULT_GOOGLE_VIDEO_MODEL, + }, + autoPriority: { image: 30, audio: 40, video: 10 }, + nativeDocumentInputs: ["pdf"], describeImage: describeImageWithModel, describeImages: describeImagesWithModel, transcribeAudio: transcribeGeminiAudio, diff --git a/extensions/groq/media-understanding-provider.ts b/extensions/groq/media-understanding-provider.ts index b8cc0164329..1a8efb2d95f 100644 --- a/extensions/groq/media-understanding-provider.ts +++ b/extensions/groq/media-understanding-provider.ts @@ -9,6 +9,8 @@ const DEFAULT_GROQ_AUDIO_MODEL = "whisper-large-v3-turbo"; export const groqMediaUnderstandingProvider: MediaUnderstandingProvider = { id: "groq", capabilities: ["audio"], + defaultModels: { audio: DEFAULT_GROQ_AUDIO_MODEL }, + autoPriority: { audio: 20 }, transcribeAudio: (req) => transcribeOpenAiCompatibleAudio({ ...req, diff --git a/extensions/minimax/media-understanding-provider.ts b/extensions/minimax/media-understanding-provider.ts index 4501a96dee9..92e726e536b 100644 --- a/extensions/minimax/media-understanding-provider.ts +++ b/extensions/minimax/media-understanding-provider.ts @@ -7,6 +7,8 @@ import { export const minimaxMediaUnderstandingProvider: MediaUnderstandingProvider = { id: "minimax", capabilities: ["image"], + defaultModels: { image: "MiniMax-VL-01" }, + autoPriority: { image: 40 }, describeImage: describeImageWithModel, describeImages: describeImagesWithModel, }; @@ -14,6 +16,8 @@ export const minimaxMediaUnderstandingProvider: MediaUnderstandingProvider = { export const minimaxPortalMediaUnderstandingProvider: MediaUnderstandingProvider = { id: "minimax-portal", capabilities: ["image"], + defaultModels: { image: "MiniMax-VL-01" }, + autoPriority: { image: 50 }, describeImage: describeImageWithModel, describeImages: describeImagesWithModel, }; diff --git a/extensions/mistral/media-understanding-provider.ts b/extensions/mistral/media-understanding-provider.ts index f6ee0f167de..5ac1d29ff5c 100644 --- a/extensions/mistral/media-understanding-provider.ts +++ b/extensions/mistral/media-understanding-provider.ts @@ -9,6 +9,8 @@ const DEFAULT_MISTRAL_AUDIO_MODEL = "voxtral-mini-latest"; export const mistralMediaUnderstandingProvider: MediaUnderstandingProvider = { id: "mistral", capabilities: ["audio"], + defaultModels: { audio: DEFAULT_MISTRAL_AUDIO_MODEL }, + autoPriority: { audio: 50 }, transcribeAudio: async (req) => await transcribeOpenAiCompatibleAudio({ ...req, diff --git a/extensions/moonshot/media-understanding-provider.ts b/extensions/moonshot/media-understanding-provider.ts index 8c9ff66b116..5579c55e31e 100644 --- a/extensions/moonshot/media-understanding-provider.ts +++ b/extensions/moonshot/media-understanding-provider.ts @@ -126,6 +126,8 @@ export async function describeMoonshotVideo( export const moonshotMediaUnderstandingProvider: MediaUnderstandingProvider = { id: "moonshot", capabilities: ["image", "video"], + defaultModels: { image: "kimi-k2.5", video: DEFAULT_MOONSHOT_VIDEO_MODEL }, + autoPriority: { video: 20 }, describeImage: describeImageWithModel, describeImages: describeImagesWithModel, describeVideo: describeMoonshotVideo, diff --git a/extensions/openai/media-understanding-provider.ts b/extensions/openai/media-understanding-provider.ts index 0a8cede64ba..05ac7906f1e 100644 --- a/extensions/openai/media-understanding-provider.ts +++ b/extensions/openai/media-understanding-provider.ts @@ -21,6 +21,11 @@ export async function transcribeOpenAiAudio(params: AudioTranscriptionRequest) { export const openaiMediaUnderstandingProvider: MediaUnderstandingProvider = { id: "openai", capabilities: ["image", "audio"], + defaultModels: { + image: "gpt-5.4-mini", + audio: OPENAI_DEFAULT_AUDIO_TRANSCRIPTION_MODEL, + }, + autoPriority: { image: 10, audio: 10 }, describeImage: describeImageWithModel, describeImages: describeImagesWithModel, transcribeAudio: transcribeOpenAiAudio, @@ -29,6 +34,7 @@ export const openaiMediaUnderstandingProvider: MediaUnderstandingProvider = { export const openaiCodexMediaUnderstandingProvider: MediaUnderstandingProvider = { id: "openai-codex", capabilities: ["image"], + defaultModels: { image: "gpt-5.4" }, describeImage: describeImageWithModel, describeImages: describeImagesWithModel, }; diff --git a/extensions/openrouter/media-understanding-provider.ts b/extensions/openrouter/media-understanding-provider.ts index 0a40be25120..fad91996219 100644 --- a/extensions/openrouter/media-understanding-provider.ts +++ b/extensions/openrouter/media-understanding-provider.ts @@ -7,6 +7,7 @@ import { export const openrouterMediaUnderstandingProvider: MediaUnderstandingProvider = { id: "openrouter", capabilities: ["image"], + defaultModels: { image: "auto" }, describeImage: describeImageWithModel, describeImages: describeImagesWithModel, }; diff --git a/extensions/telegram/src/sticker-cache.test.ts b/extensions/telegram/src/sticker-cache.test.ts index 75a1db8725d..1ad4a22bf60 100644 --- a/extensions/telegram/src/sticker-cache.test.ts +++ b/extensions/telegram/src/sticker-cache.test.ts @@ -11,9 +11,9 @@ vi.mock("openclaw/plugin-sdk/agent-runtime", () => ({ })); vi.mock("openclaw/plugin-sdk/media-runtime", () => ({ - AUTO_IMAGE_KEY_PROVIDERS: ["openai"], - DEFAULT_IMAGE_MODELS: { openai: "gpt-4.1-mini" }, resolveAutoImageModel: vi.fn(async () => null), + resolveAutoMediaKeyProviders: vi.fn(() => ["openai"]), + resolveDefaultMediaModel: vi.fn(() => "gpt-4.1-mini"), })); vi.mock("openclaw/plugin-sdk/media-understanding-runtime", () => ({ diff --git a/extensions/telegram/src/sticker-cache.ts b/extensions/telegram/src/sticker-cache.ts index e6fd3398f16..7f70fc6024f 100644 --- a/extensions/telegram/src/sticker-cache.ts +++ b/extensions/telegram/src/sticker-cache.ts @@ -9,8 +9,11 @@ import { import { resolveDefaultModelForAgent } from "openclaw/plugin-sdk/agent-runtime"; import type { OpenClawConfig } from "openclaw/plugin-sdk/config-runtime"; import { loadJsonFile, saveJsonFile } from "openclaw/plugin-sdk/json-store"; -import { AUTO_IMAGE_KEY_PROVIDERS, DEFAULT_IMAGE_MODELS } from "openclaw/plugin-sdk/media-runtime"; import { resolveAutoImageModel } from "openclaw/plugin-sdk/media-runtime"; +import { + resolveAutoMediaKeyProviders, + resolveDefaultMediaModel, +} from "openclaw/plugin-sdk/media-runtime"; import { describeImageFileWithModel } from "openclaw/plugin-sdk/media-understanding-runtime"; import { logVerbose } from "openclaw/plugin-sdk/runtime-env"; import { STATE_DIR } from "openclaw/plugin-sdk/state-paths"; @@ -182,6 +185,11 @@ export async function describeStickerImage(params: DescribeStickerParams): Promi } }; + const autoProviders = resolveAutoMediaKeyProviders({ + cfg, + capability: "image", + }); + const selectCatalogModel = (provider: string) => { const entries = catalog.filter( (entry) => @@ -190,7 +198,11 @@ export async function describeStickerImage(params: DescribeStickerParams): Promi if (entries.length === 0) { return undefined; } - const defaultId = DEFAULT_IMAGE_MODELS[provider]; + const defaultId = resolveDefaultMediaModel({ + cfg, + providerId: provider, + capability: "image", + }); const preferred = entries.find((entry) => entry.id === defaultId); return preferred ?? entries[0]; }; @@ -198,16 +210,14 @@ export async function describeStickerImage(params: DescribeStickerParams): Promi let resolved = null as { provider: string; model?: string } | null; if ( activeModel && - AUTO_IMAGE_KEY_PROVIDERS.includes( - activeModel.provider as (typeof AUTO_IMAGE_KEY_PROVIDERS)[number], - ) && + autoProviders.includes(activeModel.provider) && (await hasProviderKey(activeModel.provider)) ) { resolved = activeModel; } if (!resolved) { - for (const provider of AUTO_IMAGE_KEY_PROVIDERS) { + for (const provider of autoProviders) { if (!(await hasProviderKey(provider))) { continue; } diff --git a/extensions/zai/media-understanding-provider.ts b/extensions/zai/media-understanding-provider.ts index bd571230b2d..10b41f69362 100644 --- a/extensions/zai/media-understanding-provider.ts +++ b/extensions/zai/media-understanding-provider.ts @@ -7,6 +7,8 @@ import { export const zaiMediaUnderstandingProvider: MediaUnderstandingProvider = { id: "zai", capabilities: ["image"], + defaultModels: { image: "glm-4.6v" }, + autoPriority: { image: 60 }, describeImage: describeImageWithModel, describeImages: describeImagesWithModel, }; diff --git a/src/agents/tools/image-generate-tool.ts b/src/agents/tools/image-generate-tool.ts index 1ce40887585..d84d6207a42 100644 --- a/src/agents/tools/image-generate-tool.ts +++ b/src/agents/tools/image-generate-tool.ts @@ -126,11 +126,12 @@ function resolveImageGenerationModelCandidates( providerDefaults.set(providerId, `${providerId}/${modelId}`); } + const primaryProvider = resolveDefaultModelRef(cfg).provider; const orderedProviders = [ - resolveDefaultModelRef(cfg).provider, - "openai", - "google", - ...providerDefaults.keys(), + primaryProvider, + ...[...providerDefaults.keys()] + .filter((providerId) => providerId !== primaryProvider) + .toSorted(), ]; const orderedRefs: string[] = []; const seen = new Set(); diff --git a/src/agents/tools/image-tool.test.ts b/src/agents/tools/image-tool.test.ts index 897216fe24f..72c9c18994b 100644 --- a/src/agents/tools/image-tool.test.ts +++ b/src/agents/tools/image-tool.test.ts @@ -274,7 +274,7 @@ function createMinimaxImageConfig(): OpenClawConfig { function createDefaultImageFallbackExpectation(primary: string) { return { primary, - fallbacks: ["openai/gpt-5-mini", "anthropic/claude-opus-4-5"], + fallbacks: ["openai/gpt-5.4-mini", "anthropic/claude-opus-4-6"], }; } @@ -618,12 +618,12 @@ describe("image tool implicit imageModel config", () => { agents: { defaults: { model: { primary: "minimax/MiniMax-M2.7" }, - imageModel: { primary: "openai/gpt-5-mini" }, + imageModel: { primary: "openai/gpt-5.4-mini" }, }, }, }; expect(resolveImageModelConfigForTool({ cfg, agentDir })).toEqual({ - primary: "openai/gpt-5-mini", + primary: "openai/gpt-5.4-mini", }); }); }); @@ -638,7 +638,7 @@ describe("image tool implicit imageModel config", () => { agents: { defaults: { model: { primary: "acme/vision-1" }, - imageModel: { primary: "openai/gpt-5-mini" }, + imageModel: { primary: "openai/gpt-5.4-mini" }, }, }, models: { @@ -652,7 +652,7 @@ describe("image tool implicit imageModel config", () => { }; // Tool should still be available for explicit image analysis requests expect(resolveImageModelConfigForTool({ cfg, agentDir })).toEqual({ - primary: "openai/gpt-5-mini", + primary: "openai/gpt-5.4-mini", }); const tool = createImageTool({ config: cfg, agentDir, modelHasVision: true }); expect(tool).not.toBeNull(); @@ -1229,7 +1229,7 @@ describe("image tool response validation", () => { role: "assistant", api: "openai-responses", provider: "openai", - model: "gpt-5-mini", + model: "gpt-5.4-mini", stopReason: "stop", timestamp: Date.now(), usage: makeZeroUsageSnapshot(), @@ -1278,7 +1278,7 @@ describe("image tool response validation", () => { expect(() => __testing.coerceImageAssistantText({ provider: "openai", - model: "gpt-5-mini", + model: "gpt-5.4-mini", message, }), ).toThrow(expectedError); diff --git a/src/agents/tools/image-tool.ts b/src/agents/tools/image-tool.ts index 97b88801d34..c0a906b7ea9 100644 --- a/src/agents/tools/image-tool.ts +++ b/src/agents/tools/image-tool.ts @@ -1,6 +1,10 @@ import { resolve, isAbsolute } from "node:path"; import { Type } from "@sinclair/typebox"; import type { OpenClawConfig } from "../../config/config.js"; +import { + resolveAutoMediaKeyProviders, + resolveDefaultMediaModel, +} from "../../media-understanding/defaults.js"; import { getMediaUnderstandingProvider } from "../../media-understanding/provider-registry.js"; import { buildProviderRegistry } from "../../media-understanding/runner.js"; import { loadWebMedia } from "../../media/web-media.js"; @@ -40,8 +44,6 @@ import { } from "./tool-runtime.helpers.js"; const DEFAULT_PROMPT = "Describe the image."; -const ANTHROPIC_IMAGE_PRIMARY = "anthropic/claude-opus-4-6"; -const ANTHROPIC_IMAGE_FALLBACK = "anthropic/claude-opus-4-5"; const DEFAULT_MAX_IMAGES = 20; const imageToolProviderDeps = { @@ -103,28 +105,39 @@ export function resolveImageModelConfigForTool(params: { provider: primary.provider, }); const primaryCandidates = (() => { - if (isMinimaxVlmProvider(primary.provider)) { - return [`${primary.provider}/MiniMax-VL-01`]; - } if (providerVisionFromConfig) { return [providerVisionFromConfig]; } - if (primary.provider === "zai") { - return ["zai/glm-4.6v"]; + const providerDefault = resolveDefaultMediaModel({ + cfg: params.cfg, + providerId: primary.provider, + capability: "image", + }); + if (providerDefault) { + return [`${primary.provider}/${providerDefault}`]; } - if (primary.provider === "openai") { - return ["openai/gpt-5-mini"]; - } - if (primary.provider === "anthropic") { - return [ANTHROPIC_IMAGE_PRIMARY]; + if (isMinimaxVlmProvider(primary.provider)) { + return [`${primary.provider}/MiniMax-VL-01`]; } return []; })(); + const autoCandidates = resolveAutoMediaKeyProviders({ + cfg: params.cfg, + capability: "image", + }).map((providerId) => { + const modelId = resolveDefaultMediaModel({ + cfg: params.cfg, + providerId, + capability: "image", + }); + return modelId ? `${providerId}/${modelId}` : null; + }); + return buildToolModelConfigFromCandidates({ explicit, agentDir: params.agentDir, - candidates: [...primaryCandidates, "openai/gpt-5-mini", ANTHROPIC_IMAGE_FALLBACK], + candidates: [...primaryCandidates, ...autoCandidates], }); } diff --git a/src/agents/tools/pdf-tool.helpers.ts b/src/agents/tools/pdf-tool.helpers.ts index 9e207c6add1..e259b79e11e 100644 --- a/src/agents/tools/pdf-tool.helpers.ts +++ b/src/agents/tools/pdf-tool.helpers.ts @@ -4,22 +4,16 @@ import { resolveAgentModelFallbackValues, resolveAgentModelPrimaryValue, } from "../../config/model-input.js"; +import { providerSupportsNativePdfDocument } from "../../media-understanding/defaults.js"; import { extractAssistantText } from "../pi-embedded-utils.js"; export type PdfModelConfig = { primary?: string; fallbacks?: string[] }; -/** - * Providers known to support native PDF document input. - * When the model's provider is in this set, the tool sends raw PDF bytes - * via provider-specific API calls instead of extracting text/images first. - */ -export const NATIVE_PDF_PROVIDERS = new Set(["anthropic", "google"]); - /** * Check whether a provider supports native PDF document input. */ export function providerSupportsNativePdf(provider: string): boolean { - return NATIVE_PDF_PROVIDERS.has(provider.toLowerCase().trim()); + return providerSupportsNativePdfDocument({ providerId: provider }); } /** diff --git a/src/agents/tools/pdf-tool.test.ts b/src/agents/tools/pdf-tool.test.ts index 1cc82a30913..202b605dbfe 100644 --- a/src/agents/tools/pdf-tool.test.ts +++ b/src/agents/tools/pdf-tool.test.ts @@ -46,7 +46,7 @@ async function withTempAgentDir(run: (agentDir: string) => Promise): Promi } const ANTHROPIC_PDF_MODEL = "anthropic/claude-opus-4-6"; -const OPENAI_PDF_MODEL = "openai/gpt-5-mini"; +const OPENAI_PDF_MODEL = "openai/gpt-5.4-mini"; const TEST_PDF_INPUT = { base64: "dGVzdA==", filename: "doc.pdf" } as const; const FAKE_PDF_MEDIA = { kind: "document", @@ -295,12 +295,12 @@ describe("resolvePdfModelConfigForTool", () => { agents: { defaults: { model: { primary: "openai/gpt-5.4" }, - imageModel: { primary: "openai/gpt-5-mini" }, + imageModel: { primary: "openai/gpt-5.4-mini" }, }, }, }; expect(resolvePdfModelConfigForTool({ cfg, agentDir })).toEqual({ - primary: "openai/gpt-5-mini", + primary: "openai/gpt-5.4-mini", }); }); }); diff --git a/src/agents/tools/pdf-tool.ts b/src/agents/tools/pdf-tool.ts index 9cd7fb2afce..4b2166d5eb7 100644 --- a/src/agents/tools/pdf-tool.ts +++ b/src/agents/tools/pdf-tool.ts @@ -1,6 +1,11 @@ import { type Context, complete } from "@mariozechner/pi-ai"; import { Type } from "@sinclair/typebox"; import type { OpenClawConfig } from "../../config/config.js"; +import { + providerSupportsNativePdfDocument, + resolveAutoMediaKeyProviders, + resolveDefaultMediaModel, +} from "../../media-understanding/defaults.js"; import { extractPdfContent, type PdfExtractedContent } from "../../media/pdf-extract.js"; import { loadWebMediaRaw } from "../../media/web-media.js"; import { resolveUserPath } from "../../utils.js"; @@ -43,8 +48,6 @@ const DEFAULT_PROMPT = "Analyze this PDF document."; const DEFAULT_MAX_PDFS = 10; const DEFAULT_MAX_BYTES_MB = 10; const DEFAULT_MAX_PAGES = 20; -const ANTHROPIC_PDF_PRIMARY = "anthropic/claude-opus-4-6"; -const ANTHROPIC_PDF_FALLBACK = "anthropic/claude-opus-4-5"; const PDF_MIN_TEXT_CHARS = 200; const PDF_MAX_PIXELS = 4_000_000; @@ -75,9 +78,7 @@ export function resolvePdfModelConfigForTool(params: { // Auto-detect from available providers const primary = resolveDefaultModelRef(params.cfg); - const anthropicOk = hasAuthForProvider({ provider: "anthropic", agentDir: params.agentDir }); const googleOk = hasAuthForProvider({ provider: "google", agentDir: params.agentDir }); - const openaiOk = hasAuthForProvider({ provider: "openai", agentDir: params.agentDir }); const fallbacks: string[] = []; const addFallback = (ref: string) => { @@ -95,30 +96,54 @@ export function resolvePdfModelConfigForTool(params: { cfg: params.cfg, provider: primary.provider, }); + const providerDefault = resolveDefaultMediaModel({ + cfg: params.cfg, + providerId: primary.provider, + capability: "image", + }); + const nativePdfCandidates = resolveAutoMediaKeyProviders({ + cfg: params.cfg, + capability: "image", + }) + .filter((providerId) => providerSupportsNativePdfDocument({ cfg: params.cfg, providerId })) + .filter((providerId) => hasAuthForProvider({ provider: providerId, agentDir: params.agentDir })) + .map((providerId) => { + const modelId = resolveDefaultMediaModel({ + cfg: params.cfg, + providerId, + capability: "image", + }); + return modelId ? `${providerId}/${modelId}` : null; + }) + .filter((value): value is string => Boolean(value)); + const genericImageCandidates = resolveAutoMediaKeyProviders({ + cfg: params.cfg, + capability: "image", + }) + .filter((providerId) => hasAuthForProvider({ provider: providerId, agentDir: params.agentDir })) + .map((providerId) => { + const modelId = resolveDefaultMediaModel({ + cfg: params.cfg, + providerId, + capability: "image", + }); + return modelId ? `${providerId}/${modelId}` : null; + }) + .filter((value): value is string => Boolean(value)); - if (primary.provider === "anthropic" && anthropicOk) { - preferred = ANTHROPIC_PDF_PRIMARY; - } else if (primary.provider === "google" && googleOk && providerVision) { + if (primary.provider === "google" && googleOk && providerVision) { preferred = providerVision; - } else if (providerOk && providerVision) { - preferred = providerVision; - } else if (anthropicOk) { - preferred = ANTHROPIC_PDF_PRIMARY; - } else if (googleOk) { - preferred = "google/gemini-2.5-pro"; - } else if (openaiOk) { - preferred = "openai/gpt-5-mini"; + } else if (providerOk && (providerVision || providerDefault)) { + preferred = providerVision ?? `${primary.provider}/${providerDefault}`; + } else { + preferred = nativePdfCandidates[0] ?? genericImageCandidates[0] ?? null; } if (preferred?.trim()) { - if (anthropicOk && preferred !== ANTHROPIC_PDF_PRIMARY) { - addFallback(ANTHROPIC_PDF_PRIMARY); - } - if (anthropicOk) { - addFallback(ANTHROPIC_PDF_FALLBACK); - } - if (openaiOk) { - addFallback("openai/gpt-5-mini"); + for (const candidate of [...nativePdfCandidates, ...genericImageCandidates]) { + if (candidate !== preferred) { + addFallback(candidate); + } } const pruned = fallbacks.filter((ref) => ref !== preferred); return { primary: preferred, ...(pruned.length > 0 ? { fallbacks: pruned } : {}) }; diff --git a/src/config/config.schema-regressions.test.ts b/src/config/config.schema-regressions.test.ts index 712fc9c3553..0b3f556b5aa 100644 --- a/src/config/config.schema-regressions.test.ts +++ b/src/config/config.schema-regressions.test.ts @@ -178,7 +178,7 @@ describe("config schema regressions", () => { defaults: { pdfModel: { primary: "anthropic/claude-opus-4-6", - fallbacks: ["openai/gpt-5-mini"], + fallbacks: ["openai/gpt-5.4-mini"], }, pdfMaxBytesMb: 12, pdfMaxPages: 25, @@ -193,7 +193,7 @@ describe("config schema regressions", () => { const res = validateConfigObject({ agents: { defaults: { - pdfModel: { primary: "openai/gpt-5-mini" }, + pdfModel: { primary: "openai/gpt-5.4-mini" }, pdfMaxBytesMb: 0, pdfMaxPages: 0, }, diff --git a/src/config/defaults.ts b/src/config/defaults.ts index 01718186cd6..21ad26a2156 100644 --- a/src/config/defaults.ts +++ b/src/config/defaults.ts @@ -25,7 +25,8 @@ const DEFAULT_MODEL_ALIASES: Readonly> = { // OpenAI gpt: "openai/gpt-5.4", - "gpt-mini": "openai/gpt-5-mini", + "gpt-mini": "openai/gpt-5.4-mini", + "gpt-nano": "openai/gpt-5.4-nano", // Google Gemini (3.x are preview ids in the catalog) gemini: "google/gemini-3.1-pro-preview", diff --git a/src/media-understanding/defaults.test.ts b/src/media-understanding/defaults.test.ts index f7ccedfb85e..03da17225e5 100644 --- a/src/media-understanding/defaults.test.ts +++ b/src/media-understanding/defaults.test.ts @@ -1,44 +1,64 @@ import { describe, expect, it } from "vitest"; import { - AUTO_AUDIO_KEY_PROVIDERS, - AUTO_IMAGE_KEY_PROVIDERS, - AUTO_VIDEO_KEY_PROVIDERS, - DEFAULT_AUDIO_MODELS, - DEFAULT_IMAGE_MODELS, + providerSupportsNativePdfDocument, + resolveAutoMediaKeyProviders, + resolveDefaultMediaModel, } from "./defaults.js"; -describe("DEFAULT_AUDIO_MODELS", () => { - it("includes Mistral Voxtral default", () => { - expect(DEFAULT_AUDIO_MODELS.mistral).toBe("voxtral-mini-latest"); +describe("resolveDefaultMediaModel", () => { + it("resolves bundled audio defaults from provider metadata", () => { + expect(resolveDefaultMediaModel({ providerId: "mistral", capability: "audio" })).toBe( + "voxtral-mini-latest", + ); + }); + + it("resolves bundled image defaults beyond the historical core set", () => { + expect(resolveDefaultMediaModel({ providerId: "minimax-portal", capability: "image" })).toBe( + "MiniMax-VL-01", + ); + expect(resolveDefaultMediaModel({ providerId: "openai-codex", capability: "image" })).toBe( + "gpt-5.4", + ); + expect(resolveDefaultMediaModel({ providerId: "moonshot", capability: "image" })).toBe( + "kimi-k2.5", + ); + expect(resolveDefaultMediaModel({ providerId: "openrouter", capability: "image" })).toBe( + "auto", + ); }); }); -describe("AUTO_AUDIO_KEY_PROVIDERS", () => { - it("includes mistral auto key resolution", () => { - expect(AUTO_AUDIO_KEY_PROVIDERS).toContain("mistral"); +describe("resolveAutoMediaKeyProviders", () => { + it("keeps the bundled audio fallback order", () => { + expect(resolveAutoMediaKeyProviders({ capability: "audio" })).toEqual([ + "openai", + "groq", + "deepgram", + "google", + "mistral", + ]); + }); + + it("keeps the bundled image fallback order", () => { + expect(resolveAutoMediaKeyProviders({ capability: "image" })).toEqual([ + "openai", + "anthropic", + "google", + "minimax", + "minimax-portal", + "zai", + ]); + }); + + it("keeps the bundled video fallback order", () => { + expect(resolveAutoMediaKeyProviders({ capability: "video" })).toEqual(["google", "moonshot"]); }); }); -describe("AUTO_VIDEO_KEY_PROVIDERS", () => { - it("includes moonshot auto key resolution", () => { - expect(AUTO_VIDEO_KEY_PROVIDERS).toContain("moonshot"); - }); -}); - -describe("AUTO_IMAGE_KEY_PROVIDERS", () => { - it("includes minimax-portal auto key resolution", () => { - expect(AUTO_IMAGE_KEY_PROVIDERS).toContain("minimax-portal"); - }); -}); - -describe("DEFAULT_IMAGE_MODELS", () => { - it("includes the MiniMax portal vision default", () => { - expect(DEFAULT_IMAGE_MODELS["minimax-portal"]).toBe("MiniMax-VL-01"); - }); - - it("includes bundled image-provider defaults beyond the core provider set", () => { - expect(DEFAULT_IMAGE_MODELS["openai-codex"]).toBe("gpt-5.4"); - expect(DEFAULT_IMAGE_MODELS.moonshot).toBe("kimi-k2.5"); - expect(DEFAULT_IMAGE_MODELS.openrouter).toBe("auto"); +describe("providerSupportsNativePdfDocument", () => { + it("reads native PDF support from provider metadata", () => { + expect(providerSupportsNativePdfDocument({ providerId: "anthropic" })).toBe(true); + expect(providerSupportsNativePdfDocument({ providerId: "google" })).toBe(true); + expect(providerSupportsNativePdfDocument({ providerId: "openai" })).toBe(false); }); }); diff --git a/src/media-understanding/defaults.ts b/src/media-understanding/defaults.ts index b92dba7ed1c..723d545a44d 100644 --- a/src/media-understanding/defaults.ts +++ b/src/media-understanding/defaults.ts @@ -1,4 +1,6 @@ -import type { MediaUnderstandingCapability } from "./types.js"; +import type { OpenClawConfig } from "../config/config.js"; +import { buildMediaUnderstandingRegistry, normalizeMediaProviderId } from "./provider-registry.js"; +import type { MediaUnderstandingCapability, MediaUnderstandingProvider } from "./types.js"; const MB = 1024 * 1024; @@ -27,43 +29,79 @@ export const DEFAULT_PROMPT: Record = { video: "Describe the video.", }; export const DEFAULT_VIDEO_MAX_BASE64_BYTES = 70 * MB; -export const DEFAULT_AUDIO_MODELS: Record = { - groq: "whisper-large-v3-turbo", - openai: "gpt-4o-mini-transcribe", - deepgram: "nova-3", - mistral: "voxtral-mini-latest", -}; - -export const AUTO_AUDIO_KEY_PROVIDERS = [ - "openai", - "groq", - "deepgram", - "google", - "mistral", -] as const; -export const AUTO_IMAGE_KEY_PROVIDERS = [ - "openai", - "anthropic", - "google", - "minimax", - "minimax-portal", - "zai", -] as const; -export const AUTO_VIDEO_KEY_PROVIDERS = ["google", "moonshot"] as const; -export const DEFAULT_IMAGE_MODELS: Record = { - openai: "gpt-5-mini", - "openai-codex": "gpt-5.4", - anthropic: "claude-opus-4-6", - google: "gemini-3-flash-preview", - minimax: "MiniMax-VL-01", - "minimax-portal": "MiniMax-VL-01", - moonshot: "kimi-k2.5", - openrouter: "auto", - zai: "glm-4.6v", -}; export const CLI_OUTPUT_MAX_BUFFER = 5 * MB; export const DEFAULT_MEDIA_CONCURRENCY = 2; +function providerSupportsCapability( + provider: MediaUnderstandingProvider | undefined, + capability: MediaUnderstandingCapability, +): boolean { + if (!provider) { + return false; + } + if (capability === "audio") { + return Boolean(provider.transcribeAudio); + } + if (capability === "image") { + return Boolean(provider.describeImage); + } + return Boolean(provider.describeVideo); +} + +function resolveDefaultRegistry(cfg?: OpenClawConfig) { + return buildMediaUnderstandingRegistry(undefined, cfg ?? ({} as OpenClawConfig)); +} + +export function resolveDefaultMediaModel(params: { + providerId: string; + capability: MediaUnderstandingCapability; + cfg?: OpenClawConfig; + providerRegistry?: Map; +}): string | undefined { + const registry = params.providerRegistry ?? resolveDefaultRegistry(params.cfg); + const provider = registry.get(normalizeMediaProviderId(params.providerId)); + return provider?.defaultModels?.[params.capability]?.trim() || undefined; +} + +export function resolveAutoMediaKeyProviders(params: { + capability: MediaUnderstandingCapability; + cfg?: OpenClawConfig; + providerRegistry?: Map; +}): string[] { + const registry = params.providerRegistry ?? resolveDefaultRegistry(params.cfg); + type AutoProviderEntry = { + provider: MediaUnderstandingProvider; + priority: number; + }; + return [...registry.values()] + .filter((provider) => providerSupportsCapability(provider, params.capability)) + .map((provider): AutoProviderEntry | null => { + const priority = provider.autoPriority?.[params.capability]; + return typeof priority === "number" && Number.isFinite(priority) + ? { provider, priority } + : null; + }) + .filter((entry): entry is AutoProviderEntry => entry !== null) + .toSorted((left, right) => { + if (left.priority !== right.priority) { + return left.priority - right.priority; + } + return left.provider.id.localeCompare(right.provider.id); + }) + .map((entry) => normalizeMediaProviderId(entry.provider.id)) + .filter(Boolean); +} + +export function providerSupportsNativePdfDocument(params: { + providerId: string; + cfg?: OpenClawConfig; + providerRegistry?: Map; +}): boolean { + const registry = params.providerRegistry ?? resolveDefaultRegistry(params.cfg); + const provider = registry.get(normalizeMediaProviderId(params.providerId)); + return provider?.nativeDocumentInputs?.includes("pdf") ?? false; +} + /** * Minimum audio file size in bytes below which transcription is skipped. * Files smaller than this threshold are almost certainly empty or corrupt diff --git a/src/media-understanding/provider-registry.ts b/src/media-understanding/provider-registry.ts index b56faf97bfd..9714e24e0ca 100644 --- a/src/media-understanding/provider-registry.ts +++ b/src/media-understanding/provider-registry.ts @@ -14,6 +14,9 @@ function mergeProviderIntoRegistry( ...existing, ...provider, capabilities: provider.capabilities ?? existing.capabilities, + defaultModels: provider.defaultModels ?? existing.defaultModels, + autoPriority: provider.autoPriority ?? existing.autoPriority, + nativeDocumentInputs: provider.nativeDocumentInputs ?? existing.nativeDocumentInputs, } : provider; registry.set(normalizedKey, merged); @@ -41,6 +44,9 @@ export function buildMediaUnderstandingRegistry( ...existing, ...provider, capabilities: provider.capabilities ?? existing.capabilities, + defaultModels: provider.defaultModels ?? existing.defaultModels, + autoPriority: provider.autoPriority ?? existing.autoPriority, + nativeDocumentInputs: provider.nativeDocumentInputs ?? existing.nativeDocumentInputs, } : provider; registry.set(normalizedKey, merged); diff --git a/src/media-understanding/runner.entries.ts b/src/media-understanding/runner.entries.ts index d2793f6f906..ff702a2eeed 100644 --- a/src/media-understanding/runner.entries.ts +++ b/src/media-understanding/runner.entries.ts @@ -24,9 +24,9 @@ import { runExec } from "../process/exec.js"; import { MediaAttachmentCache } from "./attachments.js"; import { CLI_OUTPUT_MAX_BUFFER, - DEFAULT_AUDIO_MODELS, DEFAULT_TIMEOUT_SECONDS, MIN_AUDIO_FILE_BYTES, + resolveDefaultMediaModel, } from "./defaults.js"; import { MediaUnderstandingSkipError } from "./errors.js"; import { fileExists } from "./fs.js"; @@ -548,7 +548,14 @@ export async function runProviderEntry(params: { config: params.config, entry, }); - const model = entry.model?.trim() || DEFAULT_AUDIO_MODELS[providerId] || entry.model; + const model = + entry.model?.trim() || + resolveDefaultMediaModel({ + cfg, + providerId, + capability: "audio", + }) || + entry.model; const result = await executeWithApiKeyRotation({ provider: providerId, apiKeys, diff --git a/src/media-understanding/runner.ts b/src/media-understanding/runner.ts index 0d168d17d97..55b3f8228a2 100644 --- a/src/media-understanding/runner.ts +++ b/src/media-understanding/runner.ts @@ -25,12 +25,7 @@ import { mergeInboundPathRoots } from "../media/inbound-path-policy.js"; import { getDefaultMediaLocalRoots } from "../media/local-roots.js"; import { runExec } from "../process/exec.js"; import { MediaAttachmentCache, selectAttachments } from "./attachments.js"; -import { - AUTO_AUDIO_KEY_PROVIDERS, - AUTO_IMAGE_KEY_PROVIDERS, - AUTO_VIDEO_KEY_PROVIDERS, - DEFAULT_IMAGE_MODELS, -} from "./defaults.js"; +import { resolveAutoMediaKeyProviders, resolveDefaultMediaModel } from "./defaults.js"; import { isMediaUnderstandingSkipError } from "./errors.js"; import { fileExists } from "./fs.js"; import { extractGeminiResponse } from "./output-extract.js"; @@ -152,7 +147,11 @@ async function resolveAutoImageModelId(params: { if (configuredModel) { return configuredModel; } - const defaultModel = DEFAULT_IMAGE_MODELS[params.providerId]; + const defaultModel = resolveDefaultMediaModel({ + cfg: params.cfg, + providerId: params.providerId, + capability: "image", + }); if (defaultModel) { return defaultModel; } @@ -470,7 +469,11 @@ async function resolveKeyEntry(params: { cfg, providerRegistry, capability, - fallbackProviders: AUTO_IMAGE_KEY_PROVIDERS, + fallbackProviders: resolveAutoMediaKeyProviders({ + cfg, + capability, + providerRegistry, + }), })) { const entry = await checkProvider(providerId); if (entry) { @@ -492,7 +495,11 @@ async function resolveKeyEntry(params: { cfg, providerRegistry, capability, - fallbackProviders: AUTO_VIDEO_KEY_PROVIDERS, + fallbackProviders: resolveAutoMediaKeyProviders({ + cfg, + capability, + providerRegistry, + }), })) { const entry = await checkProvider(providerId, undefined); if (entry) { @@ -513,7 +520,11 @@ async function resolveKeyEntry(params: { cfg, providerRegistry, capability, - fallbackProviders: AUTO_AUDIO_KEY_PROVIDERS, + fallbackProviders: resolveAutoMediaKeyProviders({ + cfg, + capability, + providerRegistry, + }), })) { const entry = await checkProvider(providerId, undefined); if (entry) { diff --git a/src/media-understanding/types.ts b/src/media-understanding/types.ts index 10056c8ae02..4916ca8107c 100644 --- a/src/media-understanding/types.ts +++ b/src/media-understanding/types.ts @@ -137,6 +137,9 @@ export type ImagesDescriptionResult = { export type MediaUnderstandingProvider = { id: string; capabilities?: MediaUnderstandingCapability[]; + defaultModels?: Partial>; + autoPriority?: Partial>; + nativeDocumentInputs?: Array<"pdf">; transcribeAudio?: (req: AudioTranscriptionRequest) => Promise; describeVideo?: (req: VideoDescriptionRequest) => Promise; describeImage?: (req: ImageDescriptionRequest) => Promise;